forked from FFmpeg/FFmpeg
![]() I spotted an interesting pattern that I hadn't seen before, which makes the implementation faster. The bit-shifting table I was using before is no longer needed, and I was able to remove quite a few lines. I also added use of FMA in the AVX2 version. Benchmarks: f32 1920x1080 1 thread with prelut: c impl: 1434012700 UNITS in lut3d->interp, 1 runs, 0 skips 1434035335 UNITS in lut3d->interp, 2 runs, 0 skips 1423615347 UNITS in lut3d->interp, 4 runs, 0 skips 1426268863 UNITS in lut3d->interp, 8 runs, 0 skips sse2: 905484420 UNITS in lut3d->interp, 1 runs, 0 skips 905659010 UNITS in lut3d->interp, 2 runs, 0 skips 915167140 UNITS in lut3d->interp, 4 runs, 0 skips 915834222 UNITS in lut3d->interp, 8 runs, 0 skips avx: 574794860 UNITS in lut3d->interp, 1 runs, 0 skips 581035090 UNITS in lut3d->interp, 2 runs, 0 skips 584116720 UNITS in lut3d->interp, 4 runs, 0 skips 581460290 UNITS in lut3d->interp, 8 runs, 0 skips avx2: 301698880 UNITS in lut3d->interp, 1 runs, 0 skips 301982880 UNITS in lut3d->interp, 2 runs, 0 skips 306962430 UNITS in lut3d->interp, 4 runs, 0 skips 305472025 UNITS in lut3d->interp, 8 runs, 0 skips gbrap16 1920x1080 1 thread with prelut: c impl: 1480894840 UNITS in lut3d->interp, 1 runs, 0 skips 1502922990 UNITS in lut3d->interp, 2 runs, 0 skips 1496114307 UNITS in lut3d->interp, 4 runs, 0 skips 1492554551 UNITS in lut3d->interp, 8 runs, 0 skips sse2: 980777180 UNITS in lut3d->interp, 1 runs, 0 skips 986121520 UNITS in lut3d->interp, 2 runs, 0 skips 986489840 UNITS in lut3d->interp, 4 runs, 0 skips 998832248 UNITS in lut3d->interp, 8 runs, 0 skips avx: 622212360 UNITS in lut3d->interp, 1 runs, 0 skips 622981160 UNITS in lut3d->interp, 2 runs, 0 skips 645396315 UNITS in lut3d->interp, 4 runs, 0 skips 641057075 UNITS in lut3d->interp, 8 runs, 0 skips avx2: 321336400 UNITS in lut3d->interp, 1 runs, 0 skips 321268920 UNITS in lut3d->interp, 2 runs, 0 skips 323459895 UNITS in lut3d->interp, 4 runs, 0 skips 324949967 UNITS in lut3d->interp, 8 runs, 0 skips |
||
---|---|---|
.. | ||
af_afir.asm | ||
af_afir_init.c | ||
af_anlmdn.asm | ||
af_anlmdn_init.c | ||
af_volume.asm | ||
af_volume_init.c | ||
avf_showcqt.asm | ||
avf_showcqt_init.c | ||
colorspacedsp.asm | ||
colorspacedsp_init.c | ||
Makefile | ||
scene_sad.asm | ||
scene_sad_init.c | ||
vf_atadenoise.asm | ||
vf_atadenoise_init.c | ||
vf_blend.asm | ||
vf_blend_init.c | ||
vf_bwdif.asm | ||
vf_bwdif_init.c | ||
vf_convolution.asm | ||
vf_convolution_init.c | ||
vf_eq.asm | ||
vf_eq_init.c | ||
vf_framerate.asm | ||
vf_framerate_init.c | ||
vf_fspp.asm | ||
vf_fspp_init.c | ||
vf_gblur.asm | ||
vf_gblur_init.c | ||
vf_gradfun.asm | ||
vf_gradfun_init.c | ||
vf_hflip.asm | ||
vf_hflip_init.c | ||
vf_hqdn3d.asm | ||
vf_hqdn3d_init.c | ||
vf_idet.asm | ||
vf_idet_init.c | ||
vf_interlace.asm | ||
vf_limiter.asm | ||
vf_limiter_init.c | ||
vf_lut3d.asm | ||
vf_lut3d_init.c | ||
vf_maskedclamp.asm | ||
vf_maskedclamp_init.c | ||
vf_maskedmerge.asm | ||
vf_maskedmerge_init.c | ||
vf_noise.c | ||
vf_overlay.asm | ||
vf_overlay_init.c | ||
vf_pp7.asm | ||
vf_pp7_init.c | ||
vf_psnr.asm | ||
vf_psnr_init.c | ||
vf_pullup.asm | ||
vf_pullup_init.c | ||
vf_removegrain.asm | ||
vf_removegrain_init.c | ||
vf_spp.c | ||
vf_ssim.asm | ||
vf_ssim_init.c | ||
vf_stereo3d.asm | ||
vf_stereo3d_init.c | ||
vf_threshold.asm | ||
vf_threshold_init.c | ||
vf_tinterlace_init.c | ||
vf_transpose.asm | ||
vf_transpose_init.c | ||
vf_v360.asm | ||
vf_v360_init.c | ||
vf_w3fdif.asm | ||
vf_w3fdif_init.c | ||
vf_yadif.asm | ||
vf_yadif_init.c | ||
yadif-10.asm | ||
yadif-16.asm |