author | John Koleszar <jkoleszar@google.com> | 2013-04-12 15:33:04 -0700
---|---|---
committer | John Koleszar <jkoleszar@google.com> | 2013-04-16 06:49:46 -0700
commit | 7f7d1357a2732e0a1c36f3baded7dd14f449e535 (patch)
tree | 6bee68dd36c842cd700ee8f670d1380e37acd77d /vp9/common/x86
parent | 282c963923eb969c146d63e934bbece433a95282 (diff)
parent | 868ecb55a1528ca3f19286e7d1551572bf89b642 (diff)
Merge branch 'experimental' into master
VP9 preview bitstream 2, commit '868ecb55a1528ca3f19286e7d1551572bf89b642'
Conflicts:
vp9/vp9_common.mk
Change-Id: I3f0f6e692c987ff24f98ceafbb86cb9cf64ad8d3
Diffstat (limited to 'vp9/common/x86')
-rw-r--r-- | vp9/common/x86/vp9_asm_stubs.c | 818
-rw-r--r-- | vp9/common/x86/vp9_filter_sse2.c | 290
-rw-r--r-- | vp9/common/x86/vp9_filter_sse4.c | 362
-rw-r--r-- | vp9/common/x86/vp9_idct_sse2.asm (renamed from vp9/common/x86/vp9_idctllm_sse2.asm) | 0
-rw-r--r-- | vp9/common/x86/vp9_idct_x86.c | 1975
-rw-r--r-- | vp9/common/x86/vp9_idct_x86.h | 13
-rw-r--r-- | vp9/common/x86/vp9_idctllm_mmx.asm | 241
-rw-r--r-- | vp9/common/x86/vp9_loopfilter_intrin_sse2.c | 600
-rw-r--r-- | vp9/common/x86/vp9_postproc_mmx.asm | 4
-rw-r--r-- | vp9/common/x86/vp9_postproc_sse2.asm | 4
-rw-r--r-- | vp9/common/x86/vp9_subpixel_8t_ssse3.asm | 729
-rw-r--r-- | vp9/common/x86/vp9_subpixel_mmx.asm | 268
-rw-r--r-- | vp9/common/x86/vp9_subpixel_sse2.asm | 1372
-rw-r--r-- | vp9/common/x86/vp9_subpixel_ssse3.asm | 1515
-rw-r--r-- | vp9/common/x86/vp9_subpixel_x86.h | 109
15 files changed, 3161 insertions, 5139 deletions
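The largest piece of the diff below replaces the per-size six-tap predictors in vp9_asm_stubs.c with wrappers around a unified 8-tap convolve8 interface. The key geometry: the vertical 8-tap filter reads 3 rows above and 4 rows below each output row, so the two-pass path must run the horizontal filter over h + 7 rows starting 3 source rows early, and the fixed 16x23 scratch buffer is what forces the patch's new w <= 16 / h <= 16 asserts. A minimal scalar sketch of that shape (filter_h8, filter_v8, convolve8_sketch, clip8, and FILTER_BITS are illustrative names, not the library's; only the tap geometry and the Q7 rounding mirror the patch):

```c
#include <assert.h>
#include <stdint.h>

enum { FILTER_BITS = 7 };  /* VP9 subpel taps are Q7 and sum to 128 */

static uint8_t clip8(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

/* Hypothetical scalar stand-in for the SSSE3 horizontal kernels: 8 taps
 * at x-3..x+4, so the source frame is assumed to have border padding. */
static void filter_h8(const uint8_t *src, int src_stride, uint8_t *dst,
                      int dst_stride, int w, int h, const int16_t *f) {
  int x, y, k, sum;
  for (y = 0; y < h; ++y)
    for (x = 0; x < w; ++x) {
      for (sum = 0, k = 0; k < 8; ++k)
        sum += f[k] * src[y * src_stride + x - 3 + k];
      dst[y * dst_stride + x] = clip8((sum + 64) >> FILTER_BITS);
    }
}

/* Vertical stand-in; as in the diff, src points at the topmost tap row. */
static void filter_v8(const uint8_t *src, int src_stride, uint8_t *dst,
                      int dst_stride, int w, int h, const int16_t *f) {
  int x, y, k, sum;
  for (y = 0; y < h; ++y)
    for (x = 0; x < w; ++x) {
      for (sum = 0, k = 0; k < 8; ++k)
        sum += f[k] * src[(y + k) * src_stride + x];
      dst[y * dst_stride + x] = clip8((sum + 64) >> FILTER_BITS);
    }
}

/* Two-pass shape of the new vp9_convolve8_ssse3: the horizontal pass
 * emits h + 7 intermediate rows starting 3 source rows early so the
 * vertical 8-tap pass has its 3-above/4-below support; the fixed
 * 16x23 scratch buffer caps the block size at 16x16. */
static void convolve8_sketch(const uint8_t *src, int src_stride,
                             uint8_t *dst, int dst_stride,
                             const int16_t *fx, const int16_t *fy,
                             int w, int h) {
  uint8_t fdata2[16 * 23];
  assert(w <= 16 && h <= 16);
  filter_h8(src - 3 * src_stride, src_stride, fdata2, 16, w, h + 7, fx);
  filter_v8(fdata2, 16, dst, dst_stride, w, h, fy);
}
```

On top of this, the real SSSE3 wrappers strip-mine the width in 16-, 8-, and 4-pixel columns and leave everything else to the C routines: any remainder width, a fractional step (x_step_q4 or y_step_q4 != 16), or a middle tap of 128, which the 8-tap kernels explicitly exclude.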
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c index f09e2d78b..6d3bb021a 100644 --- a/vp9/common/x86/vp9_asm_stubs.c +++ b/vp9/common/x86/vp9_asm_stubs.c @@ -8,91 +8,11 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <assert.h> #include "./vpx_config.h" +#include "./vp9_rtcd.h" #include "vpx_ports/mem.h" -#include "vp9/common/vp9_subpixel.h" - -extern const short vp9_six_tap_mmx[8][6 * 8]; - -extern void vp9_filter_block1d_h6_mmx(unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_filter_block1dc_v6_mmx(unsigned short *src_ptr, - unsigned char *output_ptr, - int output_pitch, - unsigned int pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_filter_block1d8_h6_sse2(unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_filter_block1d16_h6_sse2(unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_filter_block1d8_v6_sse2(unsigned short *src_ptr, - unsigned char *output_ptr, - int dst_ptich, - unsigned int pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_filter_block1d16_v6_sse2(unsigned short *src_ptr, - unsigned char *output_ptr, - int dst_ptich, - unsigned int pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_unpack_block1d16_h6_sse2(unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int output_height, - unsigned int output_width); - -extern void vp9_filter_block1d8_h6_only_sse2(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - int dst_pitch, - unsigned int output_height, - const short *vp9_filter); - -extern void vp9_filter_block1d16_h6_only_sse2(unsigned char *src_ptr, - unsigned int src_pixels_per_lin, - unsigned char *output_ptr, - int dst_pitch, - unsigned int output_height, - const short *vp9_filter); - -extern void vp9_filter_block1d8_v6_only_sse2(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - int dst_pitch, - unsigned int output_height, - const short *vp9_filter); - /////////////////////////////////////////////////////////////////////////// // the mmx function that does the bilinear filtering and var calculation // // int one pass // @@ -116,389 +36,7 @@ DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = { { 8, 8, 8, 8, 120, 120, 120, 120 } }; -#if HAVE_MMX -void vp9_sixtap_predict4x4_mmx(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict4x4_mmx\n"); -#endif - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 16 * 16); - const short *hfilter, *vfilter; - hfilter = vp9_six_tap_mmx[xoffset]; - 
vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), fdata2, - src_pixels_per_line, 1, 9, 8, hfilter); - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1dc_v6_mmx(fdata2 + 8, dst_ptr, dst_pitch, - 8, 4, 4, 4, vfilter); -} - -void vp9_sixtap_predict16x16_mmx(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict16x16_mmx\n"); -#endif - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24); - const short *hfilter, *vfilter; - - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), - fdata2, src_pixels_per_line, 1, 21, 32, - hfilter); - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, - fdata2 + 4, src_pixels_per_line, 1, 21, 32, - hfilter); - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8, - fdata2 + 8, src_pixels_per_line, 1, 21, 32, - hfilter); - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12, - fdata2 + 12, src_pixels_per_line, 1, 21, 32, - hfilter); - - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1dc_v6_mmx(fdata2 + 32, dst_ptr, dst_pitch, - 32, 16, 16, 16, vfilter); - vp9_filter_block1dc_v6_mmx(fdata2 + 36, dst_ptr + 4, dst_pitch, - 32, 16, 16, 16, vfilter); - vp9_filter_block1dc_v6_mmx(fdata2 + 40, dst_ptr + 8, dst_pitch, - 32, 16, 16, 16, vfilter); - vp9_filter_block1dc_v6_mmx(fdata2 + 44, dst_ptr + 12, dst_pitch, - 32, 16, 16, 16, vfilter); -} - -void vp9_sixtap_predict8x8_mmx(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x8_mmx\n"); -#endif - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256); - const short *hfilter, *vfilter; - - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), - fdata2, src_pixels_per_line, 1, 13, 16, - hfilter); - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, - fdata2 + 4, src_pixels_per_line, 1, 13, 16, - hfilter); - - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr, dst_pitch, - 16, 8, 8, 8, vfilter); - vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch, - 16, 8, 8, 8, vfilter); -} - -void vp9_sixtap_predict8x4_mmx(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x4_mmx\n"); -#endif - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256); - const short *hfilter, *vfilter; - - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), - fdata2, src_pixels_per_line, 1, 9, 16, hfilter); - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, - fdata2 + 4, src_pixels_per_line, 1, 9, 16, hfilter); - - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr, dst_pitch, - 16, 8, 4, 8, vfilter); - vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch, - 16, 8, 4, 8, vfilter); -} -#endif - -#if HAVE_SSE2 -void vp9_sixtap_predict16x16_sse2(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - /* Temp data bufffer used in filtering 
*/ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24); - const short *hfilter, *vfilter; -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict16x16_sse2\n"); -#endif - - if (xoffset) { - if (yoffset) { - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2, - src_pixels_per_line, 1, 21, 32, hfilter); - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch, - 32, 16, 16, dst_pitch, vfilter); - } else { - /* First-pass only */ - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 16, hfilter); - } - } else { - /* Second-pass only */ - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2, - src_pixels_per_line, 21, 32); - vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch, - 32, 16, 16, dst_pitch, vfilter); - } -} - -void vp9_sixtap_predict8x8_sse2(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256); - const short *hfilter, *vfilter; -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x8_sse2\n"); -#endif - - if (xoffset) { - if (yoffset) { - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2, - src_pixels_per_line, 1, 13, 16, hfilter); - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch, - 16, 8, 8, dst_pitch, vfilter); - } else { - /* First-pass only */ - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 8, hfilter); - } - } else { - /* Second-pass only */ - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 8, vfilter); - } -} - -void vp9_sixtap_predict8x4_sse2(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256); - const short *hfilter, *vfilter; -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x4_sse2\n"); -#endif - - if (xoffset) { - if (yoffset) { - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2, - src_pixels_per_line, 1, 9, 16, hfilter); - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch, - 16, 8, 4, dst_pitch, vfilter); - } else { - /* First-pass only */ - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 4, hfilter); - } - } else { - /* Second-pass only */ - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 4, vfilter); - } -} -#endif - #if HAVE_SSSE3 -extern void vp9_filter_block1d8_h6_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -extern void vp9_filter_block1d16_h6_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int 
output_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -extern void vp9_filter_block1d16_v6_ssse3(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -extern void vp9_filter_block1d8_v6_ssse3(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -extern void vp9_filter_block1d4_h6_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -extern void vp9_filter_block1d4_v6_ssse3(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -void vp9_sixtap_predict16x16_ssse3(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 24 * 24); -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict16x16_ssse3\n"); -#endif - - if (xoffset) { - if (yoffset) { - vp9_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - fdata2, 16, 21, xoffset); - vp9_filter_block1d16_v6_ssse3(fdata2, 16, dst_ptr, dst_pitch, - 16, yoffset); - } else { - /* First-pass only */ - vp9_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 16, xoffset); - } - } else { - /* Second-pass only */ - vp9_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 16, yoffset); - } -} - -void vp9_sixtap_predict8x8_ssse3(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256); -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x8_ssse3\n"); -#endif - - if (xoffset) { - if (yoffset) { - vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, fdata2, 8, 13, xoffset); - vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 8, yoffset); - } else { - vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 8, xoffset); - } - } else { - /* Second-pass only */ - vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 8, yoffset); - } -} - -void vp9_sixtap_predict8x4_ssse3(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256); -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x4_ssse3\n"); -#endif - - if (xoffset) { - if (yoffset) { - vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, fdata2, 8, 9, xoffset); - vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 4, yoffset); - } else { - /* First-pass only */ - vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 4, xoffset); - } - } else { - /* Second-pass only */ - vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 4, yoffset); - } -} - -void vp9_sixtap_predict4x4_ssse3(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char 
*dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 4 * 9); -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict4x4_ssse3\n"); -#endif - - if (xoffset) { - if (yoffset) { - vp9_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, fdata2, 4, 9, xoffset); - vp9_filter_block1d4_v6_ssse3(fdata2, 4, dst_ptr, dst_pitch, 4, yoffset); - } else { - vp9_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 4, xoffset); - } - } else { - vp9_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 4, yoffset); - } -} - void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr, const unsigned int src_pitch, unsigned char *output_ptr, @@ -513,30 +51,6 @@ void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr, unsigned int output_height, const short *filter); -void vp9_filter_block2d_16x16_8_ssse3(const unsigned char *src_ptr, - const unsigned int src_stride, - const short *hfilter_aligned16, - const short *vfilter_aligned16, - unsigned char *dst_ptr, - unsigned int dst_stride) { - if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16); - - vp9_filter_block1d16_h8_ssse3(src_ptr - (3 * src_stride), src_stride, - fdata2, 16, 23, hfilter_aligned16); - vp9_filter_block1d16_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 16, - vfilter_aligned16); - } else { - if (hfilter_aligned16[3] != 128) { - vp9_filter_block1d16_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, - 16, hfilter_aligned16); - } else { - vp9_filter_block1d16_v8_ssse3(src_ptr - (3 * src_stride), src_stride, - dst_ptr, dst_stride, 16, vfilter_aligned16); - } - } -} - void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr, const unsigned int src_pitch, unsigned char *output_ptr, @@ -551,51 +65,303 @@ void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr, unsigned int output_height, const short *filter); -void vp9_filter_block2d_8x8_8_ssse3(const unsigned char *src_ptr, - const unsigned int src_stride, - const short *hfilter_aligned16, - const short *vfilter_aligned16, - unsigned char *dst_ptr, - unsigned int dst_stride) { - if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16); +void vp9_filter_block1d4_v8_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d4_h8_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); - vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride, - fdata2, 16, 15, hfilter_aligned16); - vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 8, - vfilter_aligned16); - } else { - if (hfilter_aligned16[3] != 128) { - vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 8, - hfilter_aligned16); - } else { - vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride, - dst_ptr, dst_stride, 8, vfilter_aligned16); +void vp9_filter_block1d16_v8_avg_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d16_h8_avg_ssse3(const unsigned char *src_ptr, + const unsigned int 
src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d8_v8_avg_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d8_h8_avg_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d4_v8_avg_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d4_h8_avg_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if (x_step_q4 == 16 && filter_x[3] != 128) { + while (w >= 16) { + vp9_filter_block1d16_h8_ssse3(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 16; + dst += 16; + w -= 16; } + while (w >= 8) { + vp9_filter_block1d8_h8_ssse3(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 8; + dst += 8; + w -= 8; + } + while (w >= 4) { + vp9_filter_block1d4_h8_ssse3(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 4; + dst += 4; + w -= 4; + } + } + if (w) { + vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + } +} + +void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if (y_step_q4 == 16 && filter_y[3] != 128) { + while (w >= 16) { + vp9_filter_block1d16_v8_ssse3(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 16; + dst += 16; + w -= 16; + } + while (w >= 8) { + vp9_filter_block1d8_v8_ssse3(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 8; + dst += 8; + w -= 8; + } + while (w >= 4) { + vp9_filter_block1d4_v8_ssse3(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 4; + dst += 4; + w -= 4; + } + } + if (w) { + vp9_convolve8_vert_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); } } -void vp9_filter_block2d_8x4_8_ssse3(const unsigned char *src_ptr, - const unsigned int src_stride, - const short *hfilter_aligned16, - const short *vfilter_aligned16, - unsigned char *dst_ptr, - unsigned int dst_stride) { - if (hfilter_aligned16[3] !=128 && vfilter_aligned16[3] != 128) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16); +void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if (x_step_q4 == 16 && filter_x[3] != 128) { + while (w >= 16) { + vp9_filter_block1d16_h8_avg_ssse3(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 16; + dst += 16; + w -= 16; + } + while (w >= 8) { + vp9_filter_block1d8_h8_avg_ssse3(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 8; + dst += 8; + w -= 8; + } + while (w >= 4) { + 
vp9_filter_block1d4_h8_avg_ssse3(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 4; + dst += 4; + w -= 4; + } + } + if (w) { + vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + } +} - vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride, - fdata2, 16, 11, hfilter_aligned16); - vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 4, - vfilter_aligned16); - } else { - if (hfilter_aligned16[3] != 128) { - vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 4, - hfilter_aligned16); - } else { - vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride, - dst_ptr, dst_stride, 4, vfilter_aligned16); +void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if (y_step_q4 == 16 && filter_y[3] != 128) { + while (w >= 16) { + vp9_filter_block1d16_v8_avg_ssse3(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 16; + dst += 16; + w -= 16; + } + while (w >= 8) { + vp9_filter_block1d8_v8_avg_ssse3(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 8; + dst += 8; + w -= 8; + } + while (w >= 4) { + vp9_filter_block1d4_v8_avg_ssse3(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 4; + dst += 4; + w -= 4; + } + } + if (w) { + vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + } +} + +void vp9_convolve8_ssse3(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*23); + + // check w/h due to fixed size fdata2 array + assert(w <= 16); + assert(h <= 16); + + if (x_step_q4 == 16 && y_step_q4 == 16 && + filter_x[3] != 128 && filter_y[3] != 128) { + if (w == 16) { + vp9_filter_block1d16_h8_ssse3(src - 3 * src_stride, src_stride, + fdata2, 16, + h + 7, filter_x); + vp9_filter_block1d16_v8_ssse3(fdata2, 16, + dst, dst_stride, + h, filter_y); + return; + } + if (w == 8) { + vp9_filter_block1d8_h8_ssse3(src - 3 * src_stride, src_stride, + fdata2, 16, + h + 7, filter_x); + vp9_filter_block1d8_v8_ssse3(fdata2, 16, + dst, dst_stride, + h, filter_y); + return; + } + if (w == 4) { + vp9_filter_block1d4_h8_ssse3(src - 3 * src_stride, src_stride, + fdata2, 16, + h + 7, filter_x); + vp9_filter_block1d4_v8_ssse3(fdata2, 16, + dst, dst_stride, + h, filter_y); + return; + } + } + vp9_convolve8_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); +} + +void vp9_convolve8_avg_ssse3(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*23); + + // check w/h due to fixed size fdata2 array + assert(w <= 16); + assert(h <= 16); + + if (x_step_q4 == 16 && y_step_q4 == 16 && + filter_x[3] != 128 && filter_y[3] != 128) { + if (w == 16) { + vp9_filter_block1d16_h8_ssse3(src - 3 * src_stride, src_stride, + fdata2, 16, + h + 7, filter_x); + vp9_filter_block1d16_v8_avg_ssse3(fdata2, 16, + dst, dst_stride, + h, filter_y); + return; + } + if (w == 8) { + vp9_filter_block1d8_h8_ssse3(src - 3 * src_stride, src_stride, + fdata2, 16, + h + 7, filter_x); + 
vp9_filter_block1d8_v8_avg_ssse3(fdata2, 16, + dst, dst_stride, + h, filter_y); + return; + } + if (w == 4) { + vp9_filter_block1d4_h8_ssse3(src - 3 * src_stride, src_stride, + fdata2, 16, + h + 7, filter_x); + vp9_filter_block1d4_v8_avg_ssse3(fdata2, 16, + dst, dst_stride, + h, filter_y); + return; } } + vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); } #endif diff --git a/vp9/common/x86/vp9_filter_sse2.c b/vp9/common/x86/vp9_filter_sse2.c deleted file mode 100644 index 8e02ac197..000000000 --- a/vp9/common/x86/vp9_filter_sse2.c +++ /dev/null @@ -1,290 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <assert.h> // for alignment checks -#include <emmintrin.h> // SSE2 -#include "vp9/common/vp9_filter.h" -#include "vpx_ports/emmintrin_compat.h" -#include "vpx_ports/mem.h" // for DECLARE_ALIGNED -#include "vp9_rtcd.h" - -// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is -// just a quick partial snapshot so that other can already use some -// speedup. -// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap -// filtering. -// TODO(cd): Add some comments, better variable naming. -// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coeficients (no sum -// of positive above 128), or have higher precision filter -// coefficients. - -DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = { - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, -}; - -// Creating a macro to do more than four pixels at once to hide instruction -// latency is actually slower :-( -#define DO_FOUR_PIXELS(result, src_ptr, offset) \ - { \ - /* Do shifted load to achieve require shuffles through unpacking */ \ - const __m128i src0 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 0)); \ - const __m128i src1 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 1)); \ - const __m128i src2 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 2)); \ - const __m128i src3 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 3)); \ - const __m128i src01 = _mm_unpacklo_epi8(src0, src1); \ - const __m128i src01_16 = _mm_unpacklo_epi8(src01, zero); \ - const __m128i src23 = _mm_unpacklo_epi8(src2, src3); \ - const __m128i src23_16 = _mm_unpacklo_epi8(src23, zero); \ - /* Shit by 4 bytes through suffle to get additional shifted loads */ \ - const __m128i src4 = _mm_shuffle_epi32(src0, _MM_SHUFFLE(3, 3, 2, 1)); \ - const __m128i src5 = _mm_shuffle_epi32(src1, _MM_SHUFFLE(3, 3, 2, 1)); \ - const __m128i src6 = _mm_shuffle_epi32(src2, _MM_SHUFFLE(3, 3, 2, 1)); \ - const __m128i src7 = _mm_shuffle_epi32(src3, _MM_SHUFFLE(3, 3, 2, 1)); \ - const __m128i src45 = _mm_unpacklo_epi8(src4, src5); \ - const __m128i src45_16 = _mm_unpacklo_epi8(src45, zero); \ - const __m128i src67 = _mm_unpacklo_epi8(src6, src7); \ - const __m128i src67_16 = _mm_unpacklo_epi8(src67, zero); \ - /* multiply accumulate them */ \ - const __m128i mad01 = _mm_madd_epi16(src01_16, fil01); \ - const __m128i mad23 = _mm_madd_epi16(src23_16, fil23); \ - const __m128i mad45 = _mm_madd_epi16(src45_16, fil45); \ - 
const __m128i mad67 = _mm_madd_epi16(src67_16, fil67); \ - const __m128i mad0123 = _mm_add_epi32(mad01, mad23); \ - const __m128i mad4567 = _mm_add_epi32(mad45, mad67); \ - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); \ - mad_all = _mm_add_epi32(mad_all, rounding); \ - result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); \ - } - -void vp9_filter_block2d_4x4_8_sse2 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - __m128i intermediateA, intermediateB, intermediateC; - - const int kInterp_Extend = 4; - - const __m128i zero = _mm_set1_epi16(0); - const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c); - - // check alignment - assert(0 == ((long)HFilter_aligned16)%16); - assert(0 == ((long)VFilter_aligned16)%16); - - { - __m128i transpose3_0; - __m128i transpose3_1; - __m128i transpose3_2; - __m128i transpose3_3; - - // Horizontal pass (src -> intermediate). - { - const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16); - // get first two columns filter coefficients - __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3)); - src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1); - - { - __m128i mad_all0; - __m128i mad_all1; - __m128i mad_all2; - __m128i mad_all3; - DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride) - DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all3); - intermediateA = _mm_packus_epi16(mad_all0, mad_all2); - // -- - src_ptr += src_stride*4; - // -- - DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride) - DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all3); - intermediateB = _mm_packus_epi16(mad_all0, mad_all2); - // -- - src_ptr += src_stride*4; - // -- - DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all2); - intermediateC = _mm_packus_epi16(mad_all0, mad_all2); - } - } - - // Transpose result (intermediate -> transpose3_x) - { - // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33 - // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73 - // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx - const __m128i transpose0_0 = _mm_unpacklo_epi8(intermediateA, intermediateB); - const __m128i transpose0_1 = _mm_unpackhi_epi8(intermediateA, intermediateB); - const __m128i transpose0_2 = _mm_unpacklo_epi8(intermediateC, intermediateC); - const __m128i transpose0_3 = _mm_unpackhi_epi8(intermediateC, intermediateC); - // 00 40 01 41 02 42 03 43 10 50 11 51 12 52 13 53 - // 20 60 21 61 22 62 23 63 30 70 31 71 32 72 33 73 - // 80 xx 81 xx 82 xx 83 xx 90 xx 91 xx 92 xx 93 xx - // A0 xx A1 xx A2 xx A3 xx xx xx xx xx xx xx xx xx - const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1); - const __m128i 
transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1); - const __m128i transpose1_2 = _mm_unpacklo_epi8(transpose0_2, transpose0_3); - const __m128i transpose1_3 = _mm_unpackhi_epi8(transpose0_2, transpose0_3); - // 00 20 40 60 01 21 41 61 02 22 42 62 03 23 43 63 - // 10 30 50 70 11 31 51 71 12 32 52 72 13 33 53 73 - // 80 A0 xx xx 81 A1 xx xx 82 A2 xx xx 83 A3 xx xx - // 90 xx xx xx 91 xx xx xx 92 xx xx xx 93 xx xx xx - const __m128i transpose2_0 = _mm_unpacklo_epi8(transpose1_0, transpose1_1); - const __m128i transpose2_1 = _mm_unpackhi_epi8(transpose1_0, transpose1_1); - const __m128i transpose2_2 = _mm_unpacklo_epi8(transpose1_2, transpose1_3); - const __m128i transpose2_3 = _mm_unpackhi_epi8(transpose1_2, transpose1_3); - // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - // 80 90 A0 xx xx xx xx xx 81 91 A1 xx xx xx xx xx - // 82 92 A2 xx xx xx xx xx 83 93 A3 xx xx xx xx xx - transpose3_0 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0), - _mm_castsi128_ps(transpose2_2), - _MM_SHUFFLE(1, 0, 1, 0))); - transpose3_1 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0), - _mm_castsi128_ps(transpose2_2), - _MM_SHUFFLE(3, 2, 3, 2))); - transpose3_2 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1), - _mm_castsi128_ps(transpose2_3), - _MM_SHUFFLE(1, 0, 1, 0))); - transpose3_3 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1), - _mm_castsi128_ps(transpose2_3), - _MM_SHUFFLE(3, 2, 3, 2))); - // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx - // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx - // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx - // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx - } - - // Vertical pass (transpose3_x -> dst). 
- { - const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16); - // get first two columns filter coefficients - __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3)); - __m128i col0, col1, col2, col3; - DECLARE_ALIGNED(16, unsigned char, temp[32]); - { - _mm_store_si128((__m128i *)temp, transpose3_0); - DO_FOUR_PIXELS(col0, temp, 0); - } - { - _mm_store_si128((__m128i *)temp, transpose3_1); - DO_FOUR_PIXELS(col1, temp, 0); - } - { - _mm_store_si128((__m128i *)temp, transpose3_2); - DO_FOUR_PIXELS(col2, temp, 0); - } - { - _mm_store_si128((__m128i *)temp, transpose3_3); - DO_FOUR_PIXELS(col3, temp, 0); - } - // transpose - { - __m128i T0 = _mm_unpacklo_epi32(col0, col1); - __m128i T1 = _mm_unpacklo_epi32(col2, col3); - __m128i T2 = _mm_unpackhi_epi32(col0, col1); - __m128i T3 = _mm_unpackhi_epi32(col2, col3); - col0 = _mm_unpacklo_epi64(T0, T1); - col1 = _mm_unpackhi_epi64(T0, T1); - col2 = _mm_unpacklo_epi64(T2, T3); - col3 = _mm_unpackhi_epi64(T2, T3); - } - // saturate to 8 bit - { - col0 = _mm_packs_epi32(col0, col0); - col0 = _mm_packus_epi16(col0, col0); - col1 = _mm_packs_epi32(col1, col1); - col1 = _mm_packus_epi16(col1, col1); - col2 = _mm_packs_epi32 (col2, col2); - col2 = _mm_packus_epi16(col2, col2); - col3 = _mm_packs_epi32 (col3, col3); - col3 = _mm_packus_epi16(col3, col3); - } - // store - { - *((unsigned int *)&dst_ptr[dst_stride * 0]) = _mm_cvtsi128_si32(col0); - *((unsigned int *)&dst_ptr[dst_stride * 1]) = _mm_cvtsi128_si32(col1); - *((unsigned int *)&dst_ptr[dst_stride * 2]) = _mm_cvtsi128_si32(col2); - *((unsigned int *)&dst_ptr[dst_stride * 3]) = _mm_cvtsi128_si32(col3); - } - } - } -} - -void vp9_filter_block2d_8x4_8_sse2 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int j; - for (j=0; j<8; j+=4) { - vp9_filter_block2d_4x4_8_sse2(src_ptr + j, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j, dst_stride); - } -} - -void vp9_filter_block2d_8x8_8_sse2 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int i, j; - for (i=0; i<8; i+=4) { - for (j=0; j<8; j+=4) { - vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j + i*dst_stride, dst_stride); - } - } -} - -void vp9_filter_block2d_16x16_8_sse2 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int i, j; - for (i=0; i<16; i+=4) { - for (j=0; j<16; j+=4) { - vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j + i*dst_stride, dst_stride); - } - } -} diff --git a/vp9/common/x86/vp9_filter_sse4.c b/vp9/common/x86/vp9_filter_sse4.c deleted file mode 100644 index 52c35b296..000000000 --- a/vp9/common/x86/vp9_filter_sse4.c +++ /dev/null @@ -1,362 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <assert.h> // for alignment checks -#include <smmintrin.h> // SSE4.1 -#include "vp9/common/vp9_filter.h" -#include "vpx_ports/mem.h" // for DECLARE_ALIGNED -#include "vp9_rtcd.h" - -// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is -// just a quick partial snapshot so that other can already use some -// speedup. -// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap -// filtering. -// TODO(cd): Reduce source size by using macros instead of current code -// duplication. -// TODO(cd): Add some comments, better variable naming. -// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coeficients (no sum -// of positive above 128), or have higher precision filter -// coefficients. - -DECLARE_ALIGNED(16, static const unsigned char, mask0123_c[16]) = { - 0x00, 0x01, - 0x01, 0x02, - 0x02, 0x03, - 0x03, 0x04, - 0x02, 0x03, - 0x03, 0x04, - 0x04, 0x05, - 0x05, 0x06, -}; -DECLARE_ALIGNED(16, static const unsigned char, mask4567_c[16]) = { - 0x04, 0x05, - 0x05, 0x06, - 0x06, 0x07, - 0x07, 0x08, - 0x06, 0x07, - 0x07, 0x08, - 0x08, 0x09, - 0x09, 0x0A, -}; -DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = { - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, -}; -DECLARE_ALIGNED(16, static const unsigned char, transpose_c[16]) = { - 0, 4, 8, 12, - 1, 5, 9, 13, - 2, 6, 10, 14, - 3, 7, 11, 15 -}; - -// Creating a macro to do more than four pixels at once to hide instruction -// latency is actually slower :-( -#define DO_FOUR_PIXELS(result, offset) \ - { \ - /*load pixels*/ \ - __m128i src = _mm_loadu_si128((const __m128i *)(src_ptr + offset)); \ - /* extract the ones used for first column */ \ - __m128i src0123 = _mm_shuffle_epi8(src, mask0123); \ - __m128i src4567 = _mm_shuffle_epi8(src, mask4567); \ - __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); \ - __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); \ - __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); \ - __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); \ - /* multiply accumulate them */ \ - __m128i mad01 = _mm_madd_epi16(src01_16, fil01); \ - __m128i mad23 = _mm_madd_epi16(src23_16, fil23); \ - __m128i mad45 = _mm_madd_epi16(src45_16, fil45); \ - __m128i mad67 = _mm_madd_epi16(src67_16, fil67); \ - __m128i mad0123 = _mm_add_epi32(mad01, mad23); \ - __m128i mad4567 = _mm_add_epi32(mad45, mad67); \ - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); \ - mad_all = _mm_add_epi32(mad_all, rounding); \ - result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); \ - } - -void vp9_filter_block2d_4x4_8_sse4_1 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - __m128i intermediateA, intermediateB, intermediateC; - - const int kInterp_Extend = 4; - - const __m128i zero = _mm_set1_epi16(0); - const __m128i mask0123 = _mm_load_si128((const __m128i *)mask0123_c); - const __m128i mask4567 = _mm_load_si128((const __m128i *)mask4567_c); - const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c); - const __m128i transpose = _mm_load_si128((const 
__m128i *)transpose_c); - - // check alignment - assert(0 == ((long)HFilter_aligned16)%16); - assert(0 == ((long)VFilter_aligned16)%16); - - { - __m128i transpose3_0; - __m128i transpose3_1; - __m128i transpose3_2; - __m128i transpose3_3; - - // Horizontal pass (src -> intermediate). - { - const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16); - // get first two columns filter coefficients - __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3)); - src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1); - - { - __m128i mad_all0; - __m128i mad_all1; - __m128i mad_all2; - __m128i mad_all3; - DO_FOUR_PIXELS(mad_all0, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, 2*src_stride) - DO_FOUR_PIXELS(mad_all3, 3*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all3); - intermediateA = _mm_packus_epi16(mad_all0, mad_all2); - // -- - src_ptr += src_stride*4; - // -- - DO_FOUR_PIXELS(mad_all0, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, 2*src_stride) - DO_FOUR_PIXELS(mad_all3, 3*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all3); - intermediateB = _mm_packus_epi16(mad_all0, mad_all2); - // -- - src_ptr += src_stride*4; - // -- - DO_FOUR_PIXELS(mad_all0, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, 2*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all2); - intermediateC = _mm_packus_epi16(mad_all0, mad_all2); - } - } - - // Transpose result (intermediate -> transpose3_x) - { - // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33 - // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73 - // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx - const __m128i transpose1_0 = _mm_shuffle_epi8(intermediateA, transpose); - const __m128i transpose1_1 = _mm_shuffle_epi8(intermediateB, transpose); - const __m128i transpose1_2 = _mm_shuffle_epi8(intermediateC, transpose); - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 - // 80 90 A0 xx 81 91 A1 xx 82 92 A2 xx 83 93 A3 xx - const __m128i transpose2_0 = _mm_unpacklo_epi32(transpose1_0, transpose1_1); - const __m128i transpose2_1 = _mm_unpackhi_epi32(transpose1_0, transpose1_1); - // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - transpose3_0 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0), - _mm_castsi128_ps(transpose1_2), - _MM_SHUFFLE(0, 0, 1, 0))); - transpose3_1 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0), - _mm_castsi128_ps(transpose1_2), - _MM_SHUFFLE(1, 1, 3, 2))); - transpose3_2 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1), - _mm_castsi128_ps(transpose1_2), - _MM_SHUFFLE(2, 2, 1, 0))); - transpose3_3 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1), - _mm_castsi128_ps(transpose1_2), - _MM_SHUFFLE(3, 3, 3, 2))); - // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx - // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx - // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx - // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx - } - - // Vertical 
pass (transpose3_x -> dst). - { - const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16); - // get first two columns filter coefficients - __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3)); - __m128i col0, col1, col2, col3; - { - //load pixels - __m128i src = transpose3_0; - // extract the ones used for first column - __m128i src0123 = _mm_shuffle_epi8(src, mask0123); - __m128i src4567 = _mm_shuffle_epi8(src, mask4567); - __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); - __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); - __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); - __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); - // multiply accumulate them - __m128i mad01 = _mm_madd_epi16(src01_16, fil01); - __m128i mad23 = _mm_madd_epi16(src23_16, fil23); - __m128i mad45 = _mm_madd_epi16(src45_16, fil45); - __m128i mad67 = _mm_madd_epi16(src67_16, fil67); - __m128i mad0123 = _mm_add_epi32(mad01, mad23); - __m128i mad4567 = _mm_add_epi32(mad45, mad67); - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); - mad_all = _mm_add_epi32(mad_all, rounding); - mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); - mad_all = _mm_packs_epi32(mad_all, mad_all); - col0 = _mm_packus_epi16(mad_all, mad_all); - } - { - //load pixels - __m128i src = transpose3_1; - // extract the ones used for first column - __m128i src0123 = _mm_shuffle_epi8(src, mask0123); - __m128i src4567 = _mm_shuffle_epi8(src, mask4567); - __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); - __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); - __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); - __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); - // multiply accumulate them - __m128i mad01 = _mm_madd_epi16(src01_16, fil01); - __m128i mad23 = _mm_madd_epi16(src23_16, fil23); - __m128i mad45 = _mm_madd_epi16(src45_16, fil45); - __m128i mad67 = _mm_madd_epi16(src67_16, fil67); - __m128i mad0123 = _mm_add_epi32(mad01, mad23); - __m128i mad4567 = _mm_add_epi32(mad45, mad67); - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); - mad_all = _mm_add_epi32(mad_all, rounding); - mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); - mad_all = _mm_packs_epi32(mad_all, mad_all); - col1 = _mm_packus_epi16(mad_all, mad_all); - } - { - //load pixels - __m128i src = transpose3_2; - // extract the ones used for first column - __m128i src0123 = _mm_shuffle_epi8(src, mask0123); - __m128i src4567 = _mm_shuffle_epi8(src, mask4567); - __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); - __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); - __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); - __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); - // multiply accumulate them - __m128i mad01 = _mm_madd_epi16(src01_16, fil01); - __m128i mad23 = _mm_madd_epi16(src23_16, fil23); - __m128i mad45 = _mm_madd_epi16(src45_16, fil45); - __m128i mad67 = _mm_madd_epi16(src67_16, fil67); - __m128i mad0123 = _mm_add_epi32(mad01, mad23); - __m128i mad4567 = _mm_add_epi32(mad45, mad67); - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); - mad_all = _mm_add_epi32(mad_all, rounding); - mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); - mad_all = _mm_packs_epi32(mad_all, mad_all); - col2 = _mm_packus_epi16(mad_all, mad_all); - } - { - //load pixels - __m128i src = transpose3_3; - // extract the ones 
used for first column - __m128i src0123 = _mm_shuffle_epi8(src, mask0123); - __m128i src4567 = _mm_shuffle_epi8(src, mask4567); - __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); - __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); - __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); - __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); - // multiply accumulate them - __m128i mad01 = _mm_madd_epi16(src01_16, fil01); - __m128i mad23 = _mm_madd_epi16(src23_16, fil23); - __m128i mad45 = _mm_madd_epi16(src45_16, fil45); - __m128i mad67 = _mm_madd_epi16(src67_16, fil67); - __m128i mad0123 = _mm_add_epi32(mad01, mad23); - __m128i mad4567 = _mm_add_epi32(mad45, mad67); - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); - mad_all = _mm_add_epi32(mad_all, rounding); - mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); - mad_all = _mm_packs_epi32(mad_all, mad_all); - col3 = _mm_packus_epi16(mad_all, mad_all); - } - { - __m128i col01 = _mm_unpacklo_epi8(col0, col1); - __m128i col23 = _mm_unpacklo_epi8(col2, col3); - __m128i col0123 = _mm_unpacklo_epi16(col01, col23); - //TODO(cd): look into Ronald's comment: - // Future suggestion: I believe here, too, you can merge the - // packs_epi32() and pacus_epi16() for the 4 cols above, so that - // you get the data in a single register, and then use pshufb - // (shuffle_epi8()) instead of the unpacks here. Should be - // 2+3+2 instructions faster. - *((unsigned int *)&dst_ptr[dst_stride * 0]) = - _mm_extract_epi32(col0123, 0); - *((unsigned int *)&dst_ptr[dst_stride * 1]) = - _mm_extract_epi32(col0123, 1); - *((unsigned int *)&dst_ptr[dst_stride * 2]) = - _mm_extract_epi32(col0123, 2); - *((unsigned int *)&dst_ptr[dst_stride * 3]) = - _mm_extract_epi32(col0123, 3); - } - } - } -} - -void vp9_filter_block2d_8x4_8_sse4_1 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int j; - for (j=0; j<8; j+=4) { - vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j, dst_stride); - } -} - -void vp9_filter_block2d_8x8_8_sse4_1 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int i, j; - for (i=0; i<8; i+=4) { - for (j=0; j<8; j+=4) { - vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j + i*dst_stride, dst_stride); - } - } -} - -void vp9_filter_block2d_16x16_8_sse4_1 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int i, j; - for (i=0; i<16; i+=4) { - for (j=0; j<16; j+=4) { - vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j + i*dst_stride, dst_stride); - } - } -} diff --git a/vp9/common/x86/vp9_idctllm_sse2.asm b/vp9/common/x86/vp9_idct_sse2.asm index 8f3c6dfc3..8f3c6dfc3 100644 --- a/vp9/common/x86/vp9_idctllm_sse2.asm +++ b/vp9/common/x86/vp9_idct_sse2.asm diff --git a/vp9/common/x86/vp9_idct_x86.c b/vp9/common/x86/vp9_idct_x86.c new file mode 100644 index 000000000..811ed9899 --- /dev/null +++ b/vp9/common/x86/vp9_idct_x86.c @@ -0,0 +1,1975 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <emmintrin.h> // SSE2 +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_idct.h" + +#if HAVE_SSE2 +// In order to improve performance, clip absolute diff values to [0, 255], +// which allows to keep the additions/subtractions in 8 bits. +void vp9_dc_only_idct_add_sse2(int input_dc, uint8_t *pred_ptr, + uint8_t *dst_ptr, int pitch, int stride) { + int a1; + int16_t out; + uint8_t abs_diff; + __m128i p0, p1, p2, p3; + unsigned int extended_diff; + __m128i diff; + + out = dct_const_round_shift(input_dc * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 4); + + // Read prediction data. + p0 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 0 * pitch)); + p1 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 1 * pitch)); + p2 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 2 * pitch)); + p3 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 3 * pitch)); + + // Unpack prediction data, and store 4x4 array in 1 XMM register. + p0 = _mm_unpacklo_epi32(p0, p1); + p2 = _mm_unpacklo_epi32(p2, p3); + p0 = _mm_unpacklo_epi64(p0, p2); + + // Clip dc value to [0, 255] range. Then, do addition or subtraction + // according to its sign. + if (a1 >= 0) { + abs_diff = (a1 > 255) ? 255 : a1; + extended_diff = abs_diff * 0x01010101u; + diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0); + + p1 = _mm_adds_epu8(p0, diff); + } else { + abs_diff = (a1 < -255) ? 255 : -a1; + extended_diff = abs_diff * 0x01010101u; + diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0); + + p1 = _mm_subs_epu8(p0, diff); + } + + // Store results to dst. 
+ *(int *)dst_ptr = _mm_cvtsi128_si32(p1); + dst_ptr += stride; + + p1 = _mm_srli_si128(p1, 4); + *(int *)dst_ptr = _mm_cvtsi128_si32(p1); + dst_ptr += stride; + + p1 = _mm_srli_si128(p1, 4); + *(int *)dst_ptr = _mm_cvtsi128_si32(p1); + dst_ptr += stride; + + p1 = _mm_srli_si128(p1, 4); + *(int *)dst_ptr = _mm_cvtsi128_si32(p1); +} + +void vp9_short_idct4x4_sse2(int16_t *input, int16_t *output, int pitch) { + const __m128i zero = _mm_setzero_si128(); + const __m128i eight = _mm_set1_epi16(8); + const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64, + (int16_t)cospi_16_64, (int16_t)-cospi_16_64, + (int16_t)cospi_24_64, (int16_t)-cospi_8_64, + (int16_t)cospi_8_64, (int16_t)cospi_24_64); + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const int half_pitch = pitch >> 1; + __m128i input0, input1, input2, input3; + + // Rows + input0 = _mm_loadl_epi64((__m128i *)input); + input1 = _mm_loadl_epi64((__m128i *)(input + 4)); + input2 = _mm_loadl_epi64((__m128i *)(input + 8)); + input3 = _mm_loadl_epi64((__m128i *)(input + 12)); + + // Construct i3, i1, i3, i1, i2, i0, i2, i0 + input0 = _mm_shufflelo_epi16(input0, 0xd8); + input1 = _mm_shufflelo_epi16(input1, 0xd8); + input2 = _mm_shufflelo_epi16(input2, 0xd8); + input3 = _mm_shufflelo_epi16(input3, 0xd8); + + input0 = _mm_unpacklo_epi32(input0, input0); + input1 = _mm_unpacklo_epi32(input1, input1); + input2 = _mm_unpacklo_epi32(input2, input2); + input3 = _mm_unpacklo_epi32(input3, input3); + + // Stage 1 + input0 = _mm_madd_epi16(input0, cst); + input1 = _mm_madd_epi16(input1, cst); + input2 = _mm_madd_epi16(input2, cst); + input3 = _mm_madd_epi16(input3, cst); + + input0 = _mm_add_epi32(input0, rounding); + input1 = _mm_add_epi32(input1, rounding); + input2 = _mm_add_epi32(input2, rounding); + input3 = _mm_add_epi32(input3, rounding); + + input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); + input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); + input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); + input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); + + // Stage 2 + input0 = _mm_packs_epi32(input0, zero); + input1 = _mm_packs_epi32(input1, zero); + input2 = _mm_packs_epi32(input2, zero); + input3 = _mm_packs_epi32(input3, zero); + + // Transpose + input1 = _mm_unpacklo_epi16(input0, input1); + input3 = _mm_unpacklo_epi16(input2, input3); + input0 = _mm_unpacklo_epi32(input1, input3); + input1 = _mm_unpackhi_epi32(input1, input3); + + // Switch column2, column 3, and then, we got: + // input2: column1, column 0; input3: column2, column 3. 
+ input1 = _mm_shuffle_epi32(input1, 0x4e); + input2 = _mm_add_epi16(input0, input1); + input3 = _mm_sub_epi16(input0, input1); + + // Columns + // Construct i3, i1, i3, i1, i2, i0, i2, i0 + input0 = _mm_shufflelo_epi16(input2, 0xd8); + input1 = _mm_shufflehi_epi16(input2, 0xd8); + input2 = _mm_shufflehi_epi16(input3, 0xd8); + input3 = _mm_shufflelo_epi16(input3, 0xd8); + + input0 = _mm_unpacklo_epi32(input0, input0); + input1 = _mm_unpackhi_epi32(input1, input1); + input2 = _mm_unpackhi_epi32(input2, input2); + input3 = _mm_unpacklo_epi32(input3, input3); + + // Stage 1 + input0 = _mm_madd_epi16(input0, cst); + input1 = _mm_madd_epi16(input1, cst); + input2 = _mm_madd_epi16(input2, cst); + input3 = _mm_madd_epi16(input3, cst); + + input0 = _mm_add_epi32(input0, rounding); + input1 = _mm_add_epi32(input1, rounding); + input2 = _mm_add_epi32(input2, rounding); + input3 = _mm_add_epi32(input3, rounding); + + input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); + input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); + input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); + input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); + + // Stage 2 + input0 = _mm_packs_epi32(input0, zero); + input1 = _mm_packs_epi32(input1, zero); + input2 = _mm_packs_epi32(input2, zero); + input3 = _mm_packs_epi32(input3, zero); + + // Transpose + input1 = _mm_unpacklo_epi16(input0, input1); + input3 = _mm_unpacklo_epi16(input2, input3); + input0 = _mm_unpacklo_epi32(input1, input3); + input1 = _mm_unpackhi_epi32(input1, input3); + + // Switch column2, column 3, and then, we got: + // input2: column1, column 0; input3: column2, column 3. + input1 = _mm_shuffle_epi32(input1, 0x4e); + input2 = _mm_add_epi16(input0, input1); + input3 = _mm_sub_epi16(input0, input1); + + // Final round and shift + input2 = _mm_add_epi16(input2, eight); + input3 = _mm_add_epi16(input3, eight); + + input2 = _mm_srai_epi16(input2, 4); + input3 = _mm_srai_epi16(input3, 4); + + // Store results + _mm_storel_epi64((__m128i *)output, input2); + input2 = _mm_srli_si128(input2, 8); + _mm_storel_epi64((__m128i *)(output + half_pitch), input2); + + _mm_storel_epi64((__m128i *)(output + 3 * half_pitch), input3); + input3 = _mm_srli_si128(input3, 8); + _mm_storel_epi64((__m128i *)(output + 2 * half_pitch), input3); +} + +void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) { + const __m128i zero = _mm_setzero_si128(); + const __m128i c1 = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64, + (int16_t)cospi_16_64, (int16_t)-cospi_16_64, + (int16_t)cospi_24_64, (int16_t)-cospi_8_64, + (int16_t)cospi_8_64, (int16_t)cospi_24_64); + const __m128i c2 = _mm_setr_epi16(1, 1, 1, 1, 1, -1, 1, -1); + + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i in, temp; + + // Load input data. 
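+  // Commentary: assumed scalar equivalent of this 1-D routine, matching the
+  // C reference 4-point idct:
+  //   step[0] = dct_const_round_shift((in[0] + in[2]) * cospi_16_64);
+  //   step[1] = dct_const_round_shift((in[0] - in[2]) * cospi_16_64);
+  //   step[2] = dct_const_round_shift(in[1] * cospi_24_64 - in[3] * cospi_8_64);
+  //   step[3] = dct_const_round_shift(in[1] * cospi_8_64 + in[3] * cospi_24_64);
+  //   out[0] = step[0] + step[3];  out[1] = step[1] + step[2];
+  //   out[2] = step[1] - step[2];  out[3] = step[0] - step[3];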
+ in = _mm_loadl_epi64((__m128i *)input); + + // Construct i3, i1, i3, i1, i2, i0, i2, i0 + in = _mm_shufflelo_epi16(in, 0xd8); + in = _mm_unpacklo_epi32(in, in); + + // Stage 1 + in = _mm_madd_epi16(in, c1); + in = _mm_add_epi32(in, rounding); + in = _mm_srai_epi32(in, DCT_CONST_BITS); + in = _mm_packs_epi32(in, zero); + + // Stage 2 + temp = _mm_shufflelo_epi16(in, 0x9c); + in = _mm_shufflelo_epi16(in, 0xc9); + in = _mm_unpacklo_epi64(temp, in); + in = _mm_madd_epi16(in, c2); + in = _mm_packs_epi32(in, zero); + + // Store results + _mm_storel_epi64((__m128i *)output, in); +} + +#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ + const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ + const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ + const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ + const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \ + const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \ + \ + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ + \ + out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ + out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ + out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ + out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ + out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ + out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ + out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ + out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ + } + +#define TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ + const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ + \ + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ + \ + out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ + out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ + out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ + out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ + out4 = out5 = out6 = out7 = zero; \ + } + +#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ + const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ + \ + in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ + in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ + in2 = _mm_unpacklo_epi32(tr0_2, tr0_3); /* i5 i4 */ \ + in3 = _mm_unpackhi_epi32(tr0_2, tr0_3); /* i7 i6 */ \ + } + +// Define Macro for multiplying elements by constants and adding them together. 
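+// Per 32-bit lane, each madd/add/srai triple in the macro computes
+//   res = ROUND_POWER_OF_TWO(a * c0 + b * c1, DCT_CONST_BITS)
+// i.e. dct_const_round_shift() on a two-term dot product: the unpack
+// interleaves the two 16-bit inputs and pair_set_epi16() packs the two
+// constants.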
+#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \ + cst0, cst1, cst2, cst3, res0, res1, res2, res3) \ + { \ + tmp0 = _mm_madd_epi16(lo_0, cst0); \ + tmp1 = _mm_madd_epi16(hi_0, cst0); \ + tmp2 = _mm_madd_epi16(lo_0, cst1); \ + tmp3 = _mm_madd_epi16(hi_0, cst1); \ + tmp4 = _mm_madd_epi16(lo_1, cst2); \ + tmp5 = _mm_madd_epi16(hi_1, cst2); \ + tmp6 = _mm_madd_epi16(lo_1, cst3); \ + tmp7 = _mm_madd_epi16(hi_1, cst3); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + tmp4 = _mm_add_epi32(tmp4, rounding); \ + tmp5 = _mm_add_epi32(tmp5, rounding); \ + tmp6 = _mm_add_epi32(tmp6, rounding); \ + tmp7 = _mm_add_epi32(tmp7, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ + tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ + \ + res0 = _mm_packs_epi32(tmp0, tmp1); \ + res1 = _mm_packs_epi32(tmp2, tmp3); \ + res2 = _mm_packs_epi32(tmp4, tmp5); \ + res3 = _mm_packs_epi32(tmp6, tmp7); \ + } + +#define IDCT8x8_1D \ + /* Stage1 */ \ + { \ + const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ + const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ + const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ + const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ + \ + MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \ + stg1_1, stg1_2, stg1_3, stp1_4, \ + stp1_7, stp1_5, stp1_6) \ + } \ + \ + /* Stage2 */ \ + { \ + const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ + const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \ + const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ + const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \ + \ + MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \ + stg2_1, stg2_2, stg2_3, stp2_0, \ + stp2_1, stp2_2, stp2_3) \ + \ + stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + \ + stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \ + \ + tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ + tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ + tmp2 = _mm_madd_epi16(lo_56, stg2_0); \ + tmp3 = _mm_madd_epi16(hi_56, stg2_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + } \ + \ + /* Stage4 */ \ + in0 = _mm_adds_epi16(stp1_0, stp2_7); \ + in1 = _mm_adds_epi16(stp1_1, stp1_6); \ + in2 = _mm_adds_epi16(stp1_2, stp1_5); \ + in3 = _mm_adds_epi16(stp1_3, stp2_4); \ + in4 = _mm_subs_epi16(stp1_3, stp2_4); \ + in5 = _mm_subs_epi16(stp1_2, 
stp1_5); \ + in6 = _mm_subs_epi16(stp1_1, stp1_6); \ + in7 = _mm_subs_epi16(stp1_0, stp2_7); + +void vp9_short_idct8x8_sse2(int16_t *input, int16_t *output, int pitch) { + const int half_pitch = pitch >> 1; + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1<<4); + const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i; + + // Load input data. + in0 = _mm_load_si128((__m128i *)input); + in1 = _mm_load_si128((__m128i *)(input + 8 * 1)); + in2 = _mm_load_si128((__m128i *)(input + 8 * 2)); + in3 = _mm_load_si128((__m128i *)(input + 8 * 3)); + in4 = _mm_load_si128((__m128i *)(input + 8 * 4)); + in5 = _mm_load_si128((__m128i *)(input + 8 * 5)); + in6 = _mm_load_si128((__m128i *)(input + 8 * 6)); + in7 = _mm_load_si128((__m128i *)(input + 8 * 7)); + + // 2-D + for (i = 0; i < 2; i++) { + // 8x8 Transpose is copied from vp9_short_fdct8x8_sse2() + TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + + // 4-stage 1D idct8x8 + IDCT8x8_1D + } + + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + + in0 = _mm_srai_epi16(in0, 5); + in1 = _mm_srai_epi16(in1, 5); + in2 = _mm_srai_epi16(in2, 5); + in3 = _mm_srai_epi16(in3, 5); + in4 = _mm_srai_epi16(in4, 5); + in5 = _mm_srai_epi16(in5, 5); + in6 = _mm_srai_epi16(in6, 5); + in7 = _mm_srai_epi16(in7, 5); + + // Store results + _mm_store_si128((__m128i *)output, in0); + _mm_store_si128((__m128i *)(output + half_pitch * 1), in1); + _mm_store_si128((__m128i *)(output + half_pitch * 2), in2); + _mm_store_si128((__m128i *)(output + half_pitch * 3), in3); + _mm_store_si128((__m128i *)(output + half_pitch * 4), in4); + _mm_store_si128((__m128i *)(output + half_pitch * 5), in5); + _mm_store_si128((__m128i *)(output + half_pitch * 6), in6); + _mm_store_si128((__m128i *)(output + half_pitch * 7), in7); +} + +void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) { + const int half_pitch = pitch >> 1; + const __m128i zero = _mm_setzero_si128(); + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1<<4); + const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg2_0 = pair_set_epi16(cospi_16_64, 
cospi_16_64); + const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + // Rows. Load 4-row input data. + in0 = _mm_load_si128((__m128i *)input); + in1 = _mm_load_si128((__m128i *)(input + 8 * 1)); + in2 = _mm_load_si128((__m128i *)(input + 8 * 2)); + in3 = _mm_load_si128((__m128i *)(input + 8 * 3)); + + // 8x4 Transpose + TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3) + + // Stage1 + { + const __m128i lo_17 = _mm_unpackhi_epi16(in0, in3); + const __m128i lo_35 = _mm_unpackhi_epi16(in1, in2); + + tmp0 = _mm_madd_epi16(lo_17, stg1_0); + tmp2 = _mm_madd_epi16(lo_17, stg1_1); + tmp4 = _mm_madd_epi16(lo_35, stg1_2); + tmp6 = _mm_madd_epi16(lo_35, stg1_3); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + + stp1_4 = _mm_packs_epi32(tmp0, zero); + stp1_7 = _mm_packs_epi32(tmp2, zero); + stp1_5 = _mm_packs_epi32(tmp4, zero); + stp1_6 = _mm_packs_epi32(tmp6, zero); + } + + // Stage2 + { + const __m128i lo_04 = _mm_unpacklo_epi16(in0, in2); + const __m128i lo_26 = _mm_unpacklo_epi16(in1, in3); + + tmp0 = _mm_madd_epi16(lo_04, stg2_0); + tmp2 = _mm_madd_epi16(lo_04, stg2_1); + tmp4 = _mm_madd_epi16(lo_26, stg2_2); + tmp6 = _mm_madd_epi16(lo_26, stg2_3); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + + stp2_0 = _mm_packs_epi32(tmp0, zero); + stp2_1 = _mm_packs_epi32(tmp2, zero); + stp2_2 = _mm_packs_epi32(tmp4, zero); + stp2_3 = _mm_packs_epi32(tmp6, zero); + + stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); + stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); + stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); + stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); + } + + // Stage3 + { + const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); + stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); + stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); + stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); + stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); + + tmp0 = _mm_madd_epi16(lo_56, stg3_0); + tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + + stp1_5 = _mm_packs_epi32(tmp0, zero); + stp1_6 = _mm_packs_epi32(tmp2, zero); + } + + // Stage4 + in0 = _mm_adds_epi16(stp1_0, stp2_7); + in1 = _mm_adds_epi16(stp1_1, stp1_6); + in2 = _mm_adds_epi16(stp1_2, stp1_5); + in3 = _mm_adds_epi16(stp1_3, stp2_4); + in4 = _mm_subs_epi16(stp1_3, stp2_4); + in5 = _mm_subs_epi16(stp1_2, stp1_5); + in6 = _mm_subs_epi16(stp1_1, stp1_6); + in7 = 
_mm_subs_epi16(stp1_0, stp2_7); + + // Columns. 4x8 Transpose + TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7) + + // 1D idct8x8 + IDCT8x8_1D + + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + + in0 = _mm_srai_epi16(in0, 5); + in1 = _mm_srai_epi16(in1, 5); + in2 = _mm_srai_epi16(in2, 5); + in3 = _mm_srai_epi16(in3, 5); + in4 = _mm_srai_epi16(in4, 5); + in5 = _mm_srai_epi16(in5, 5); + in6 = _mm_srai_epi16(in6, 5); + in7 = _mm_srai_epi16(in7, 5); + + // Store results + _mm_store_si128((__m128i *)output, in0); + _mm_store_si128((__m128i *)(output + half_pitch * 1), in1); + _mm_store_si128((__m128i *)(output + half_pitch * 2), in2); + _mm_store_si128((__m128i *)(output + half_pitch * 3), in3); + _mm_store_si128((__m128i *)(output + half_pitch * 4), in4); + _mm_store_si128((__m128i *)(output + half_pitch * 5), in5); + _mm_store_si128((__m128i *)(output + half_pitch * 6), in6); + _mm_store_si128((__m128i *)(output + half_pitch * 7), in7); +} + +#define IDCT16x16_1D \ + /* Stage2 */ \ + { \ + const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \ + const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \ + const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7); \ + const __m128i hi_9_7 = _mm_unpackhi_epi16(in9, in7); \ + const __m128i lo_5_11 = _mm_unpacklo_epi16(in5, in11); \ + const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \ + const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \ + const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \ + \ + MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \ + stg2_0, stg2_1, stg2_2, stg2_3, \ + stp2_8, stp2_15, stp2_9, stp2_14) \ + \ + MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \ + stg2_4, stg2_5, stg2_6, stg2_7, \ + stp2_10, stp2_13, stp2_11, stp2_12) \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_2_14 = _mm_unpacklo_epi16(in2, in14); \ + const __m128i hi_2_14 = _mm_unpackhi_epi16(in2, in14); \ + const __m128i lo_10_6 = _mm_unpacklo_epi16(in10, in6); \ + const __m128i hi_10_6 = _mm_unpackhi_epi16(in10, in6); \ + \ + MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \ + stg3_0, stg3_1, stg3_2, stg3_3, \ + stp1_4, stp1_7, stp1_5, stp1_6) \ + \ + stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ + \ + stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ + } \ + \ + /* Stage4 */ \ + { \ + const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); \ + const __m128i hi_0_8 = _mm_unpackhi_epi16(in0, in8); \ + const __m128i lo_4_12 = _mm_unpacklo_epi16(in4, in12); \ + const __m128i hi_4_12 = _mm_unpackhi_epi16(in4, in12); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + \ + MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \ + stg4_0, stg4_1, stg4_2, 
stg4_3, \ + stp2_0, stp2_1, stp2_2, stp2_3) \ + \ + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \ + stg4_4, stg4_5, stg4_6, stg4_7, \ + stp2_9, stp2_14, stp2_10, stp2_13) \ + } \ + \ + /* Stage5 */ \ + { \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ + \ + stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ + } \ + \ + /* Stage6 */ \ + { \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ + stg6_0, stg4_0, stg6_0, stg4_0, \ + stp2_10, stp2_13, stp2_11, stp2_12) \ + } + +void vp9_short_idct16x16_sse2(int16_t *input, int16_t *output, int pitch) { + const int half_pitch = pitch >> 1; + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1<<5); + const __m128i zero = _mm_setzero_si128(); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, 
-cospi_20_64); + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero, + in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero, + in10 = zero, in11 = zero, in12 = zero, in13 = zero, + in14 = zero, in15 = zero; + __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero, + l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero, + l12 = zero, l13 = zero, l14 = zero, l15 = zero; + __m128i r0 = zero, r1 = zero, r2 = zero, r3 = zero, r4 = zero, r5 = zero, + r6 = zero, r7 = zero, r8 = zero, r9 = zero, r10 = zero, r11 = zero, + r12 = zero, r13 = zero, r14 = zero, r15 = zero; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_8_0, stp1_12_0; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i; + + // We work on a 8x16 block each time, and loop 4 times for 2-D 16x16 idct. + for (i = 0; i < 4; i++) { + // 1-D idct + if (i < 2) { + if (i == 1) input += 128; + + // Load input data. 
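+      // Commentary: input + 8 * k walks half-rows, so in0..in7 receive the
+      // left 8 columns of rows 0..7 and in8..in15 the right 8 columns; the
+      // two 8x8 transposes below then hand IDCT16x16_1D one transposed
+      // 8x16 strip.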
+ in0 = _mm_load_si128((__m128i *)input); + in8 = _mm_load_si128((__m128i *)(input + 8 * 1)); + in1 = _mm_load_si128((__m128i *)(input + 8 * 2)); + in9 = _mm_load_si128((__m128i *)(input + 8 * 3)); + in2 = _mm_load_si128((__m128i *)(input + 8 * 4)); + in10 = _mm_load_si128((__m128i *)(input + 8 * 5)); + in3 = _mm_load_si128((__m128i *)(input + 8 * 6)); + in11 = _mm_load_si128((__m128i *)(input + 8 * 7)); + in4 = _mm_load_si128((__m128i *)(input + 8 * 8)); + in12 = _mm_load_si128((__m128i *)(input + 8 * 9)); + in5 = _mm_load_si128((__m128i *)(input + 8 * 10)); + in13 = _mm_load_si128((__m128i *)(input + 8 * 11)); + in6 = _mm_load_si128((__m128i *)(input + 8 * 12)); + in14 = _mm_load_si128((__m128i *)(input + 8 * 13)); + in7 = _mm_load_si128((__m128i *)(input + 8 * 14)); + in15 = _mm_load_si128((__m128i *)(input + 8 * 15)); + + TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + } + + if (i == 2) { + TRANSPOSE_8X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4, + in5, in6, in7); + TRANSPOSE_8X8(r0, r1, r2, r3, r4, r5, r6, r7, in8, in9, in10, in11, in12, + in13, in14, in15); + } + + if (i == 3) { + TRANSPOSE_8X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE_8X8(r8, r9, r10, r11, r12, r13, r14, r15, in8, in9, in10, in11, + in12, in13, in14, in15); + } + + IDCT16x16_1D + + // Stage7 + if (i == 0) { + // Left 8x16 + l0 = _mm_add_epi16(stp2_0, stp1_15); + l1 = _mm_add_epi16(stp2_1, stp1_14); + l2 = _mm_add_epi16(stp2_2, stp2_13); + l3 = _mm_add_epi16(stp2_3, stp2_12); + l4 = _mm_add_epi16(stp2_4, stp2_11); + l5 = _mm_add_epi16(stp2_5, stp2_10); + l6 = _mm_add_epi16(stp2_6, stp1_9); + l7 = _mm_add_epi16(stp2_7, stp1_8); + l8 = _mm_sub_epi16(stp2_7, stp1_8); + l9 = _mm_sub_epi16(stp2_6, stp1_9); + l10 = _mm_sub_epi16(stp2_5, stp2_10); + l11 = _mm_sub_epi16(stp2_4, stp2_11); + l12 = _mm_sub_epi16(stp2_3, stp2_12); + l13 = _mm_sub_epi16(stp2_2, stp2_13); + l14 = _mm_sub_epi16(stp2_1, stp1_14); + l15 = _mm_sub_epi16(stp2_0, stp1_15); + } else if (i == 1) { + // Right 8x16 + r0 = _mm_add_epi16(stp2_0, stp1_15); + r1 = _mm_add_epi16(stp2_1, stp1_14); + r2 = _mm_add_epi16(stp2_2, stp2_13); + r3 = _mm_add_epi16(stp2_3, stp2_12); + r4 = _mm_add_epi16(stp2_4, stp2_11); + r5 = _mm_add_epi16(stp2_5, stp2_10); + r6 = _mm_add_epi16(stp2_6, stp1_9); + r7 = _mm_add_epi16(stp2_7, stp1_8); + r8 = _mm_sub_epi16(stp2_7, stp1_8); + r9 = _mm_sub_epi16(stp2_6, stp1_9); + r10 = _mm_sub_epi16(stp2_5, stp2_10); + r11 = _mm_sub_epi16(stp2_4, stp2_11); + r12 = _mm_sub_epi16(stp2_3, stp2_12); + r13 = _mm_sub_epi16(stp2_2, stp2_13); + r14 = _mm_sub_epi16(stp2_1, stp1_14); + r15 = _mm_sub_epi16(stp2_0, stp1_15); + } else { + // 2-D + in0 = _mm_add_epi16(stp2_0, stp1_15); + in1 = _mm_add_epi16(stp2_1, stp1_14); + in2 = _mm_add_epi16(stp2_2, stp2_13); + in3 = _mm_add_epi16(stp2_3, stp2_12); + in4 = _mm_add_epi16(stp2_4, stp2_11); + in5 = _mm_add_epi16(stp2_5, stp2_10); + in6 = _mm_add_epi16(stp2_6, stp1_9); + in7 = _mm_add_epi16(stp2_7, stp1_8); + in8 = _mm_sub_epi16(stp2_7, stp1_8); + in9 = _mm_sub_epi16(stp2_6, stp1_9); + in10 = _mm_sub_epi16(stp2_5, stp2_10); + in11 = _mm_sub_epi16(stp2_4, stp2_11); + in12 = _mm_sub_epi16(stp2_3, stp2_12); + in13 = _mm_sub_epi16(stp2_2, stp2_13); + in14 = _mm_sub_epi16(stp2_1, stp1_14); + in15 = _mm_sub_epi16(stp2_0, stp1_15); + + // Final rounding and shift + in0 = 
_mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + in8 = _mm_adds_epi16(in8, final_rounding); + in9 = _mm_adds_epi16(in9, final_rounding); + in10 = _mm_adds_epi16(in10, final_rounding); + in11 = _mm_adds_epi16(in11, final_rounding); + in12 = _mm_adds_epi16(in12, final_rounding); + in13 = _mm_adds_epi16(in13, final_rounding); + in14 = _mm_adds_epi16(in14, final_rounding); + in15 = _mm_adds_epi16(in15, final_rounding); + + in0 = _mm_srai_epi16(in0, 6); + in1 = _mm_srai_epi16(in1, 6); + in2 = _mm_srai_epi16(in2, 6); + in3 = _mm_srai_epi16(in3, 6); + in4 = _mm_srai_epi16(in4, 6); + in5 = _mm_srai_epi16(in5, 6); + in6 = _mm_srai_epi16(in6, 6); + in7 = _mm_srai_epi16(in7, 6); + in8 = _mm_srai_epi16(in8, 6); + in9 = _mm_srai_epi16(in9, 6); + in10 = _mm_srai_epi16(in10, 6); + in11 = _mm_srai_epi16(in11, 6); + in12 = _mm_srai_epi16(in12, 6); + in13 = _mm_srai_epi16(in13, 6); + in14 = _mm_srai_epi16(in14, 6); + in15 = _mm_srai_epi16(in15, 6); + + // Store results + _mm_store_si128((__m128i *)output, in0); + _mm_store_si128((__m128i *)(output + half_pitch * 1), in1); + _mm_store_si128((__m128i *)(output + half_pitch * 2), in2); + _mm_store_si128((__m128i *)(output + half_pitch * 3), in3); + _mm_store_si128((__m128i *)(output + half_pitch * 4), in4); + _mm_store_si128((__m128i *)(output + half_pitch * 5), in5); + _mm_store_si128((__m128i *)(output + half_pitch * 6), in6); + _mm_store_si128((__m128i *)(output + half_pitch * 7), in7); + _mm_store_si128((__m128i *)(output + half_pitch * 8), in8); + _mm_store_si128((__m128i *)(output + half_pitch * 9), in9); + _mm_store_si128((__m128i *)(output + half_pitch * 10), in10); + _mm_store_si128((__m128i *)(output + half_pitch * 11), in11); + _mm_store_si128((__m128i *)(output + half_pitch * 12), in12); + _mm_store_si128((__m128i *)(output + half_pitch * 13), in13); + _mm_store_si128((__m128i *)(output + half_pitch * 14), in14); + _mm_store_si128((__m128i *)(output + half_pitch * 15), in15); + + output += 8; + } + } +} + +void vp9_short_idct10_16x16_sse2(int16_t *input, int16_t *output, int pitch) { + const int half_pitch = pitch >> 1; + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1<<5); + const __m128i zero = _mm_setzero_si128(); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_2 = 
pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero, + in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero, + in10 = zero, in11 = zero, in12 = zero, in13 = zero, + in14 = zero, in15 = zero; + __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero, + l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero, + l12 = zero, l13 = zero, l14 = zero, l15 = zero; + + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_8_0, stp1_12_0; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i; + + // 1-D idct. Load input data. + in0 = _mm_load_si128((__m128i *)input); + in8 = _mm_load_si128((__m128i *)(input + 8 * 1)); + in1 = _mm_load_si128((__m128i *)(input + 8 * 2)); + in9 = _mm_load_si128((__m128i *)(input + 8 * 3)); + in2 = _mm_load_si128((__m128i *)(input + 8 * 4)); + in10 = _mm_load_si128((__m128i *)(input + 8 * 5)); + in3 = _mm_load_si128((__m128i *)(input + 8 * 6)); + in11 = _mm_load_si128((__m128i *)(input + 8 * 7)); + + TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11); + + // Stage2 + { + const __m128i lo_1_15 = _mm_unpackhi_epi16(in0, in11); + const __m128i lo_9_7 = _mm_unpackhi_epi16(in8, in3); + const __m128i lo_5_11 = _mm_unpackhi_epi16(in2, in9); + const __m128i lo_13_3 = _mm_unpackhi_epi16(in10, in1); + + tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); + tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); + tmp4 = _mm_madd_epi16(lo_9_7, stg2_2); + tmp6 = _mm_madd_epi16(lo_9_7, stg2_3); + tmp1 = _mm_madd_epi16(lo_5_11, stg2_4); + tmp3 = _mm_madd_epi16(lo_5_11, stg2_5); + tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); + tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp5 = _mm_add_epi32(tmp5, rounding); + tmp7 = _mm_add_epi32(tmp7, rounding); + + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); + tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); + + stp2_8 = _mm_packs_epi32(tmp0, zero); + stp2_15 = _mm_packs_epi32(tmp2, zero); + stp2_9 = _mm_packs_epi32(tmp4, zero); + stp2_14 = _mm_packs_epi32(tmp6, zero); + + stp2_10 = _mm_packs_epi32(tmp1, zero); + stp2_13 = _mm_packs_epi32(tmp3, zero); + stp2_11 = _mm_packs_epi32(tmp5, zero); + stp2_12 = _mm_packs_epi32(tmp7, zero); + } + + // Stage3 + { + const __m128i lo_2_14 = _mm_unpacklo_epi16(in1, in11); + const __m128i lo_10_6 = _mm_unpacklo_epi16(in9, in3); + + 
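+    // Only the low-half unpacks are needed in this idct10 path: with the
+    // nonzero coefficients confined to the top-left of the block, the upper
+    // lanes are zero, so each stage runs at half width and packs against
+    // `zero`.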
tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); + tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); + tmp4 = _mm_madd_epi16(lo_10_6, stg3_2); + tmp6 = _mm_madd_epi16(lo_10_6, stg3_3); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + + stp1_4 = _mm_packs_epi32(tmp0, zero); + stp1_7 = _mm_packs_epi32(tmp2, zero); + stp1_5 = _mm_packs_epi32(tmp4, zero); + stp1_6 = _mm_packs_epi32(tmp6, zero); + + stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); + + stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); + } + + // Stage4 + { + const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); + const __m128i lo_4_12 = _mm_unpacklo_epi16(in2, in10); + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + + tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); + tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); + tmp4 = _mm_madd_epi16(lo_4_12, stg4_2); + tmp6 = _mm_madd_epi16(lo_4_12, stg4_3); + tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); + tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); + tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); + tmp7 = _mm_madd_epi16(lo_10_13, stg4_7); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp5 = _mm_add_epi32(tmp5, rounding); + tmp7 = _mm_add_epi32(tmp7, rounding); + + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); + tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); + + stp2_0 = _mm_packs_epi32(tmp0, zero); + stp2_1 = _mm_packs_epi32(tmp2, zero); + stp2_2 = _mm_packs_epi32(tmp4, zero); + stp2_3 = _mm_packs_epi32(tmp6, zero); + stp2_9 = _mm_packs_epi32(tmp1, zero); + stp2_14 = _mm_packs_epi32(tmp3, zero); + stp2_10 = _mm_packs_epi32(tmp5, zero); + stp2_13 = _mm_packs_epi32(tmp7, zero); + + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); + } + + // Stage5 and Stage6 + { + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); + + stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); + stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); + + stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); + stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); + } + + // Stage6 + { + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + 
const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); + + tmp1 = _mm_madd_epi16(lo_6_5, stg4_1); + tmp3 = _mm_madd_epi16(lo_6_5, stg4_0); + tmp0 = _mm_madd_epi16(lo_10_13, stg6_0); + tmp2 = _mm_madd_epi16(lo_10_13, stg4_0); + tmp4 = _mm_madd_epi16(lo_11_12, stg6_0); + tmp6 = _mm_madd_epi16(lo_11_12, stg4_0); + + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + + stp1_5 = _mm_packs_epi32(tmp1, zero); + stp1_6 = _mm_packs_epi32(tmp3, zero); + stp2_10 = _mm_packs_epi32(tmp0, zero); + stp2_13 = _mm_packs_epi32(tmp2, zero); + stp2_11 = _mm_packs_epi32(tmp4, zero); + stp2_12 = _mm_packs_epi32(tmp6, zero); + + stp2_0 = _mm_add_epi16(stp1_0, stp2_7); + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); + stp2_3 = _mm_add_epi16(stp1_3, stp2_4); + stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); + stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); + } + + // Stage7. Left 8x16 only. + l0 = _mm_add_epi16(stp2_0, stp1_15); + l1 = _mm_add_epi16(stp2_1, stp1_14); + l2 = _mm_add_epi16(stp2_2, stp2_13); + l3 = _mm_add_epi16(stp2_3, stp2_12); + l4 = _mm_add_epi16(stp2_4, stp2_11); + l5 = _mm_add_epi16(stp2_5, stp2_10); + l6 = _mm_add_epi16(stp2_6, stp1_9); + l7 = _mm_add_epi16(stp2_7, stp1_8); + l8 = _mm_sub_epi16(stp2_7, stp1_8); + l9 = _mm_sub_epi16(stp2_6, stp1_9); + l10 = _mm_sub_epi16(stp2_5, stp2_10); + l11 = _mm_sub_epi16(stp2_4, stp2_11); + l12 = _mm_sub_epi16(stp2_3, stp2_12); + l13 = _mm_sub_epi16(stp2_2, stp2_13); + l14 = _mm_sub_epi16(stp2_1, stp1_14); + l15 = _mm_sub_epi16(stp2_0, stp1_15); + + // 2-D idct. We do 2 8x16 blocks. 
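+  // Commentary: only l0..l15 carry data after the row pass (the idct10 case
+  // has its nonzero coefficients in the top-left 4x4 only), so each column
+  // pass below zeroes in8..in15 and reuses the full IDCT16x16_1D macro.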
+ for (i = 0; i < 2; i++) { + if (i == 0) + TRANSPOSE_4X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4, + in5, in6, in7); + + if (i == 1) + TRANSPOSE_4X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3, + in4, in5, in6, in7); + + in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero; + + IDCT16x16_1D + + // Stage7 + in0 = _mm_add_epi16(stp2_0, stp1_15); + in1 = _mm_add_epi16(stp2_1, stp1_14); + in2 = _mm_add_epi16(stp2_2, stp2_13); + in3 = _mm_add_epi16(stp2_3, stp2_12); + in4 = _mm_add_epi16(stp2_4, stp2_11); + in5 = _mm_add_epi16(stp2_5, stp2_10); + in6 = _mm_add_epi16(stp2_6, stp1_9); + in7 = _mm_add_epi16(stp2_7, stp1_8); + in8 = _mm_sub_epi16(stp2_7, stp1_8); + in9 = _mm_sub_epi16(stp2_6, stp1_9); + in10 = _mm_sub_epi16(stp2_5, stp2_10); + in11 = _mm_sub_epi16(stp2_4, stp2_11); + in12 = _mm_sub_epi16(stp2_3, stp2_12); + in13 = _mm_sub_epi16(stp2_2, stp2_13); + in14 = _mm_sub_epi16(stp2_1, stp1_14); + in15 = _mm_sub_epi16(stp2_0, stp1_15); + + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + in8 = _mm_adds_epi16(in8, final_rounding); + in9 = _mm_adds_epi16(in9, final_rounding); + in10 = _mm_adds_epi16(in10, final_rounding); + in11 = _mm_adds_epi16(in11, final_rounding); + in12 = _mm_adds_epi16(in12, final_rounding); + in13 = _mm_adds_epi16(in13, final_rounding); + in14 = _mm_adds_epi16(in14, final_rounding); + in15 = _mm_adds_epi16(in15, final_rounding); + + in0 = _mm_srai_epi16(in0, 6); + in1 = _mm_srai_epi16(in1, 6); + in2 = _mm_srai_epi16(in2, 6); + in3 = _mm_srai_epi16(in3, 6); + in4 = _mm_srai_epi16(in4, 6); + in5 = _mm_srai_epi16(in5, 6); + in6 = _mm_srai_epi16(in6, 6); + in7 = _mm_srai_epi16(in7, 6); + in8 = _mm_srai_epi16(in8, 6); + in9 = _mm_srai_epi16(in9, 6); + in10 = _mm_srai_epi16(in10, 6); + in11 = _mm_srai_epi16(in11, 6); + in12 = _mm_srai_epi16(in12, 6); + in13 = _mm_srai_epi16(in13, 6); + in14 = _mm_srai_epi16(in14, 6); + in15 = _mm_srai_epi16(in15, 6); + + // Store results + _mm_store_si128((__m128i *)output, in0); + _mm_store_si128((__m128i *)(output + half_pitch * 1), in1); + _mm_store_si128((__m128i *)(output + half_pitch * 2), in2); + _mm_store_si128((__m128i *)(output + half_pitch * 3), in3); + _mm_store_si128((__m128i *)(output + half_pitch * 4), in4); + _mm_store_si128((__m128i *)(output + half_pitch * 5), in5); + _mm_store_si128((__m128i *)(output + half_pitch * 6), in6); + _mm_store_si128((__m128i *)(output + half_pitch * 7), in7); + _mm_store_si128((__m128i *)(output + half_pitch * 8), in8); + _mm_store_si128((__m128i *)(output + half_pitch * 9), in9); + _mm_store_si128((__m128i *)(output + half_pitch * 10), in10); + _mm_store_si128((__m128i *)(output + half_pitch * 11), in11); + _mm_store_si128((__m128i *)(output + half_pitch * 12), in12); + _mm_store_si128((__m128i *)(output + half_pitch * 13), in13); + _mm_store_si128((__m128i *)(output + half_pitch * 14), in14); + _mm_store_si128((__m128i *)(output + half_pitch * 15), in15); + output += 8; + } +} + +void vp9_short_idct32x32_sse2(int16_t *input, int16_t *output, int pitch) { + const int half_pitch = pitch >> 1; + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1<<5); + + // idct 
constants for each stage + const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); + const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); + const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); + const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); + const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); + const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); + const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); + const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); + const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, + in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23, + in24, in25, in26, in27, in28, in29, in30, in31; + __m128i col[128]; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, + stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, + stp1_30, stp1_31; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, + stp2_16, stp2_17, stp2_18, stp2_19, 
stp2_20, stp2_21, stp2_22, + stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, + stp2_30, stp2_31; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i, j; + + // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct. + for (i = 0; i < 8; i++) { + if (i < 4) { + // First 1-D idct + // Load input data. + in0 = _mm_load_si128((__m128i *)input); + in8 = _mm_load_si128((__m128i *)(input + 8 * 1)); + in16 = _mm_load_si128((__m128i *)(input + 8 * 2)); + in24 = _mm_load_si128((__m128i *)(input + 8 * 3)); + in1 = _mm_load_si128((__m128i *)(input + 8 * 4)); + in9 = _mm_load_si128((__m128i *)(input + 8 * 5)); + in17 = _mm_load_si128((__m128i *)(input + 8 * 6)); + in25 = _mm_load_si128((__m128i *)(input + 8 * 7)); + in2 = _mm_load_si128((__m128i *)(input + 8 * 8)); + in10 = _mm_load_si128((__m128i *)(input + 8 * 9)); + in18 = _mm_load_si128((__m128i *)(input + 8 * 10)); + in26 = _mm_load_si128((__m128i *)(input + 8 * 11)); + in3 = _mm_load_si128((__m128i *)(input + 8 * 12)); + in11 = _mm_load_si128((__m128i *)(input + 8 * 13)); + in19 = _mm_load_si128((__m128i *)(input + 8 * 14)); + in27 = _mm_load_si128((__m128i *)(input + 8 * 15)); + + in4 = _mm_load_si128((__m128i *)(input + 8 * 16)); + in12 = _mm_load_si128((__m128i *)(input + 8 * 17)); + in20 = _mm_load_si128((__m128i *)(input + 8 * 18)); + in28 = _mm_load_si128((__m128i *)(input + 8 * 19)); + in5 = _mm_load_si128((__m128i *)(input + 8 * 20)); + in13 = _mm_load_si128((__m128i *)(input + 8 * 21)); + in21 = _mm_load_si128((__m128i *)(input + 8 * 22)); + in29 = _mm_load_si128((__m128i *)(input + 8 * 23)); + in6 = _mm_load_si128((__m128i *)(input + 8 * 24)); + in14 = _mm_load_si128((__m128i *)(input + 8 * 25)); + in22 = _mm_load_si128((__m128i *)(input + 8 * 26)); + in30 = _mm_load_si128((__m128i *)(input + 8 * 27)); + in7 = _mm_load_si128((__m128i *)(input + 8 * 28)); + in15 = _mm_load_si128((__m128i *)(input + 8 * 29)); + in23 = _mm_load_si128((__m128i *)(input + 8 * 30)); + in31 = _mm_load_si128((__m128i *)(input + 8 * 31)); + + input += 256; + + // Transpose 32x8 block to 8x32 block + TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17, + in18, in19, in20, in21, in22, in23); + TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25, + in26, in27, in28, in29, in30, in31); + } else { + // Second 1-D idct + j = i - 4; + + // Transpose 32x8 block to 8x32 block + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], + col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4, + in5, in6, in7); + j += 4; + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], + col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10, + in11, in12, in13, in14, in15); + j += 4; + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], + col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18, + in19, in20, in21, in22, in23); + j += 4; + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], + col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27, + in28, in29, in30, in31); + } + + // Stage1 + { + const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); + const __m128i hi_1_31 = 
_mm_unpackhi_epi16(in1, in31); + const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); + const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); + + const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); + const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); + const __m128i lo_25_7= _mm_unpacklo_epi16(in25, in7); + const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); + + const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); + const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); + const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); + const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); + + const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); + const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); + const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); + const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); + + MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, + stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, + stp1_17, stp1_30) + MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, + stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, + stp1_19, stp1_28) + MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, + stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, + stp1_21, stp1_26) + MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, + stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, + stp1_23, stp1_24) + } + + // Stage2 + { + const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); + const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); + const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); + const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); + + const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); + const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); + const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); + const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); + + MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, + stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, + stp2_14) + MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, + stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, + stp2_11, stp2_12) + + stp2_16 = _mm_add_epi16(stp1_16, stp1_17); + stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); + stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); + stp2_19 = _mm_add_epi16(stp1_19, stp1_18); + + stp2_20 = _mm_add_epi16(stp1_20, stp1_21); + stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); + stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); + stp2_23 = _mm_add_epi16(stp1_23, stp1_22); + + stp2_24 = _mm_add_epi16(stp1_24, stp1_25); + stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); + stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); + stp2_27 = _mm_add_epi16(stp1_27, stp1_26); + + stp2_28 = _mm_add_epi16(stp1_28, stp1_29); + stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); + stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); + stp2_31 = _mm_add_epi16(stp1_31, stp1_30); + } + + // Stage3 + { + const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); + const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); + const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); + const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); + + const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); + const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); + + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); + const __m128i lo_22_25 = 
_mm_unpacklo_epi16(stp2_22, stp2_25); + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); + + MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, + stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, + stp1_6) + + stp1_8 = _mm_add_epi16(stp2_8, stp2_9); + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); + stp1_12 = _mm_add_epi16(stp2_12, stp2_13); + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); + + MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, + stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, + stp1_18, stp1_29) + MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, + stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, + stp1_22, stp1_25) + + stp1_16 = stp2_16; + stp1_31 = stp2_31; + stp1_19 = stp2_19; + stp1_20 = stp2_20; + stp1_23 = stp2_23; + stp1_24 = stp2_24; + stp1_27 = stp2_27; + stp1_28 = stp2_28; + } + + // Stage4 + { + const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); + const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); + const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); + const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); + + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); + + MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, + stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, + stp2_2, stp2_3) + + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); + + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, + stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, + stp2_10, stp2_13) + + stp2_8 = stp1_8; + stp2_15 = stp1_15; + stp2_11 = stp1_11; + stp2_12 = stp1_12; + + stp2_16 = _mm_add_epi16(stp1_16, stp1_19); + stp2_17 = _mm_add_epi16(stp1_17, stp1_18); + stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); + stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); + stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); + stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); + stp2_22 = _mm_add_epi16(stp1_22, stp1_21); + stp2_23 = _mm_add_epi16(stp1_23, stp1_20); + + stp2_24 = _mm_add_epi16(stp1_24, stp1_27); + stp2_25 = _mm_add_epi16(stp1_25, stp1_26); + stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); + stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); + stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); + stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); + stp2_30 = _mm_add_epi16(stp1_29, stp1_30); + stp2_31 = _mm_add_epi16(stp1_28, stp1_31); + } + + // Stage5 + { + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); + + const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); + const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); + + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); + + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); + stp1_2 
= _mm_sub_epi16(stp2_1, stp2_2); + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); + + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + + stp1_5 = _mm_packs_epi32(tmp0, tmp1); + stp1_6 = _mm_packs_epi32(tmp2, tmp3); + + stp1_4 = stp2_4; + stp1_7 = stp2_7; + + stp1_8 = _mm_add_epi16(stp2_8, stp2_11); + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); + stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); + stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); + stp1_15 = _mm_add_epi16(stp2_15, stp2_12); + + stp1_16 = stp2_16; + stp1_17 = stp2_17; + + MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, + stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, + stp1_19, stp1_28) + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, + stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, + stp1_21, stp1_26) + + stp1_22 = stp2_22; + stp1_23 = stp2_23; + stp1_24 = stp2_24; + stp1_25 = stp2_25; + stp1_30 = stp2_30; + stp1_31 = stp2_31; + } + + // Stage6 + { + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); + + stp2_0 = _mm_add_epi16(stp1_0, stp1_7); + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); + stp2_3 = _mm_add_epi16(stp1_3, stp1_4); + stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); + stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); + + stp2_8 = stp1_8; + stp2_9 = stp1_9; + stp2_14 = stp1_14; + stp2_15 = stp1_15; + + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, + stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, + stp2_13, stp2_11, stp2_12) + + stp2_16 = _mm_add_epi16(stp1_16, stp1_23); + stp2_17 = _mm_add_epi16(stp1_17, stp1_22); + stp2_18 = _mm_add_epi16(stp1_18, stp1_21); + stp2_19 = _mm_add_epi16(stp1_19, stp1_20); + stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); + stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); + stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); + stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); + + stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); + stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); + stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); + stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); + stp2_28 = _mm_add_epi16(stp1_27, stp1_28); + stp2_29 = _mm_add_epi16(stp1_26, stp1_29); + stp2_30 = _mm_add_epi16(stp1_25, stp1_30); + stp2_31 = _mm_add_epi16(stp1_24, stp1_31); + } + + // Stage7 + { + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); + + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); + const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, 
stp2_24); + const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); + + stp1_0 = _mm_add_epi16(stp2_0, stp2_15); + stp1_1 = _mm_add_epi16(stp2_1, stp2_14); + stp1_2 = _mm_add_epi16(stp2_2, stp2_13); + stp1_3 = _mm_add_epi16(stp2_3, stp2_12); + stp1_4 = _mm_add_epi16(stp2_4, stp2_11); + stp1_5 = _mm_add_epi16(stp2_5, stp2_10); + stp1_6 = _mm_add_epi16(stp2_6, stp2_9); + stp1_7 = _mm_add_epi16(stp2_7, stp2_8); + stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); + stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); + stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); + stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); + stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); + stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); + stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); + stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); + + stp1_16 = stp2_16; + stp1_17 = stp2_17; + stp1_18 = stp2_18; + stp1_19 = stp2_19; + + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, + stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, + stp1_21, stp1_26) + MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, + stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, + stp1_23, stp1_24) + + stp1_28 = stp2_28; + stp1_29 = stp2_29; + stp1_30 = stp2_30; + stp1_31 = stp2_31; + } + + // final stage + if (i < 4) { + // 1_D: Store 32 intermediate results for each 8x32 block. + col[i * 32 + 0] = _mm_add_epi16(stp1_0, stp1_31); + col[i * 32 + 1] = _mm_add_epi16(stp1_1, stp1_30); + col[i * 32 + 2] = _mm_add_epi16(stp1_2, stp1_29); + col[i * 32 + 3] = _mm_add_epi16(stp1_3, stp1_28); + col[i * 32 + 4] = _mm_add_epi16(stp1_4, stp1_27); + col[i * 32 + 5] = _mm_add_epi16(stp1_5, stp1_26); + col[i * 32 + 6] = _mm_add_epi16(stp1_6, stp1_25); + col[i * 32 + 7] = _mm_add_epi16(stp1_7, stp1_24); + col[i * 32 + 8] = _mm_add_epi16(stp1_8, stp1_23); + col[i * 32 + 9] = _mm_add_epi16(stp1_9, stp1_22); + col[i * 32 + 10] = _mm_add_epi16(stp1_10, stp1_21); + col[i * 32 + 11] = _mm_add_epi16(stp1_11, stp1_20); + col[i * 32 + 12] = _mm_add_epi16(stp1_12, stp1_19); + col[i * 32 + 13] = _mm_add_epi16(stp1_13, stp1_18); + col[i * 32 + 14] = _mm_add_epi16(stp1_14, stp1_17); + col[i * 32 + 15] = _mm_add_epi16(stp1_15, stp1_16); + col[i * 32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); + col[i * 32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); + col[i * 32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); + col[i * 32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); + col[i * 32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); + col[i * 32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); + col[i * 32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); + col[i * 32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); + col[i * 32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); + col[i * 32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); + col[i * 32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); + col[i * 32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); + col[i * 32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); + col[i * 32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); + col[i * 32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); + col[i * 32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); + } else { + // 2_D: Calculate the results and store them to destination. 
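      // [Annotation, not part of this commit: in this 2-D pass the code
      // first forms the 32 output rows from the stage-7 butterflies, then
      // applies the "final rounding and shift" before storing — per 16-bit
      // lane this is ROUND_POWER_OF_TWO(x, 6), assuming final_rounding was
      // initialized earlier in the function as _mm_set1_epi16(1 << 5). A
      // scalar sketch of that step, with round_shift6 a hypothetical
      // helper name (int16_t from <stdint.h>):
      //
      //   static int16_t round_shift6(int x) {
      //     return (int16_t)((x + (1 << 5)) >> 6);  /* round(x / 64) */
      //   }
      //
      // One caveat: _mm_adds_epi16 below is a saturating add, so lanes
      // near INT16_MAX clamp rather than wrap; the scalar sketch ignores
      // that edge case.]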
+ in0 = _mm_add_epi16(stp1_0, stp1_31); + in1 = _mm_add_epi16(stp1_1, stp1_30); + in2 = _mm_add_epi16(stp1_2, stp1_29); + in3 = _mm_add_epi16(stp1_3, stp1_28); + in4 = _mm_add_epi16(stp1_4, stp1_27); + in5 = _mm_add_epi16(stp1_5, stp1_26); + in6 = _mm_add_epi16(stp1_6, stp1_25); + in7 = _mm_add_epi16(stp1_7, stp1_24); + in8 = _mm_add_epi16(stp1_8, stp1_23); + in9 = _mm_add_epi16(stp1_9, stp1_22); + in10 = _mm_add_epi16(stp1_10, stp1_21); + in11 = _mm_add_epi16(stp1_11, stp1_20); + in12 = _mm_add_epi16(stp1_12, stp1_19); + in13 = _mm_add_epi16(stp1_13, stp1_18); + in14 = _mm_add_epi16(stp1_14, stp1_17); + in15 = _mm_add_epi16(stp1_15, stp1_16); + in16 = _mm_sub_epi16(stp1_15, stp1_16); + in17 = _mm_sub_epi16(stp1_14, stp1_17); + in18 = _mm_sub_epi16(stp1_13, stp1_18); + in19 = _mm_sub_epi16(stp1_12, stp1_19); + in20 = _mm_sub_epi16(stp1_11, stp1_20); + in21 = _mm_sub_epi16(stp1_10, stp1_21); + in22 = _mm_sub_epi16(stp1_9, stp1_22); + in23 = _mm_sub_epi16(stp1_8, stp1_23); + in24 = _mm_sub_epi16(stp1_7, stp1_24); + in25 = _mm_sub_epi16(stp1_6, stp1_25); + in26 = _mm_sub_epi16(stp1_5, stp1_26); + in27 = _mm_sub_epi16(stp1_4, stp1_27); + in28 = _mm_sub_epi16(stp1_3, stp1_28); + in29 = _mm_sub_epi16(stp1_2, stp1_29); + in30 = _mm_sub_epi16(stp1_1, stp1_30); + in31 = _mm_sub_epi16(stp1_0, stp1_31); + + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + in8 = _mm_adds_epi16(in8, final_rounding); + in9 = _mm_adds_epi16(in9, final_rounding); + in10 = _mm_adds_epi16(in10, final_rounding); + in11 = _mm_adds_epi16(in11, final_rounding); + in12 = _mm_adds_epi16(in12, final_rounding); + in13 = _mm_adds_epi16(in13, final_rounding); + in14 = _mm_adds_epi16(in14, final_rounding); + in15 = _mm_adds_epi16(in15, final_rounding); + in16 = _mm_adds_epi16(in16, final_rounding); + in17 = _mm_adds_epi16(in17, final_rounding); + in18 = _mm_adds_epi16(in18, final_rounding); + in19 = _mm_adds_epi16(in19, final_rounding); + in20 = _mm_adds_epi16(in20, final_rounding); + in21 = _mm_adds_epi16(in21, final_rounding); + in22 = _mm_adds_epi16(in22, final_rounding); + in23 = _mm_adds_epi16(in23, final_rounding); + in24 = _mm_adds_epi16(in24, final_rounding); + in25 = _mm_adds_epi16(in25, final_rounding); + in26 = _mm_adds_epi16(in26, final_rounding); + in27 = _mm_adds_epi16(in27, final_rounding); + in28 = _mm_adds_epi16(in28, final_rounding); + in29 = _mm_adds_epi16(in29, final_rounding); + in30 = _mm_adds_epi16(in30, final_rounding); + in31 = _mm_adds_epi16(in31, final_rounding); + + in0 = _mm_srai_epi16(in0, 6); + in1 = _mm_srai_epi16(in1, 6); + in2 = _mm_srai_epi16(in2, 6); + in3 = _mm_srai_epi16(in3, 6); + in4 = _mm_srai_epi16(in4, 6); + in5 = _mm_srai_epi16(in5, 6); + in6 = _mm_srai_epi16(in6, 6); + in7 = _mm_srai_epi16(in7, 6); + in8 = _mm_srai_epi16(in8, 6); + in9 = _mm_srai_epi16(in9, 6); + in10 = _mm_srai_epi16(in10, 6); + in11 = _mm_srai_epi16(in11, 6); + in12 = _mm_srai_epi16(in12, 6); + in13 = _mm_srai_epi16(in13, 6); + in14 = _mm_srai_epi16(in14, 6); + in15 = _mm_srai_epi16(in15, 6); + in16 = _mm_srai_epi16(in16, 6); + in17 = _mm_srai_epi16(in17, 6); + in18 = _mm_srai_epi16(in18, 6); + in19 = _mm_srai_epi16(in19, 6); + in20 = _mm_srai_epi16(in20, 6); + in21 = 
_mm_srai_epi16(in21, 6); + in22 = _mm_srai_epi16(in22, 6); + in23 = _mm_srai_epi16(in23, 6); + in24 = _mm_srai_epi16(in24, 6); + in25 = _mm_srai_epi16(in25, 6); + in26 = _mm_srai_epi16(in26, 6); + in27 = _mm_srai_epi16(in27, 6); + in28 = _mm_srai_epi16(in28, 6); + in29 = _mm_srai_epi16(in29, 6); + in30 = _mm_srai_epi16(in30, 6); + in31 = _mm_srai_epi16(in31, 6); + + // Store results + _mm_store_si128((__m128i *)output, in0); + _mm_store_si128((__m128i *)(output + half_pitch * 1), in1); + _mm_store_si128((__m128i *)(output + half_pitch * 2), in2); + _mm_store_si128((__m128i *)(output + half_pitch * 3), in3); + _mm_store_si128((__m128i *)(output + half_pitch * 4), in4); + _mm_store_si128((__m128i *)(output + half_pitch * 5), in5); + _mm_store_si128((__m128i *)(output + half_pitch * 6), in6); + _mm_store_si128((__m128i *)(output + half_pitch * 7), in7); + _mm_store_si128((__m128i *)(output + half_pitch * 8), in8); + _mm_store_si128((__m128i *)(output + half_pitch * 9), in9); + _mm_store_si128((__m128i *)(output + half_pitch * 10), in10); + _mm_store_si128((__m128i *)(output + half_pitch * 11), in11); + _mm_store_si128((__m128i *)(output + half_pitch * 12), in12); + _mm_store_si128((__m128i *)(output + half_pitch * 13), in13); + _mm_store_si128((__m128i *)(output + half_pitch * 14), in14); + _mm_store_si128((__m128i *)(output + half_pitch * 15), in15); + _mm_store_si128((__m128i *)(output + half_pitch * 16), in16); + _mm_store_si128((__m128i *)(output + half_pitch * 17), in17); + _mm_store_si128((__m128i *)(output + half_pitch * 18), in18); + _mm_store_si128((__m128i *)(output + half_pitch * 19), in19); + _mm_store_si128((__m128i *)(output + half_pitch * 20), in20); + _mm_store_si128((__m128i *)(output + half_pitch * 21), in21); + _mm_store_si128((__m128i *)(output + half_pitch * 22), in22); + _mm_store_si128((__m128i *)(output + half_pitch * 23), in23); + _mm_store_si128((__m128i *)(output + half_pitch * 24), in24); + _mm_store_si128((__m128i *)(output + half_pitch * 25), in25); + _mm_store_si128((__m128i *)(output + half_pitch * 26), in26); + _mm_store_si128((__m128i *)(output + half_pitch * 27), in27); + _mm_store_si128((__m128i *)(output + half_pitch * 28), in28); + _mm_store_si128((__m128i *)(output + half_pitch * 29), in29); + _mm_store_si128((__m128i *)(output + half_pitch * 30), in30); + _mm_store_si128((__m128i *)(output + half_pitch * 31), in31); + + output += 8; + } + } +} +#endif diff --git a/vp9/common/x86/vp9_idct_x86.h b/vp9/common/x86/vp9_idct_x86.h index 8320cf87d..bd66d8c72 100644 --- a/vp9/common/x86/vp9_idct_x86.h +++ b/vp9/common/x86/vp9_idct_x86.h @@ -20,23 +20,10 @@ */ #if HAVE_MMX -extern prototype_idct(vp9_short_idct4x4llm_1_mmx); -extern prototype_idct(vp9_short_idct4x4llm_mmx); -extern prototype_idct_scalar_add(vp9_dc_only_idct_add_mmx); - extern prototype_second_order(vp9_short_inv_walsh4x4_mmx); extern prototype_second_order(vp9_short_inv_walsh4x4_1_mmx); #if !CONFIG_RUNTIME_CPU_DETECT -#undef vp9_idct_idct1 -#define vp9_idct_idct1 vp9_short_idct4x4llm_1_mmx - -#undef vp9_idct_idct16 -#define vp9_idct_idct16 vp9_short_idct4x4llm_mmx - -#undef vp9_idct_idct1_scalar_add -#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_mmx - #undef vp9_idct_iwalsh16 #define vp9_idct_iwalsh16 vp9_short_inv_walsh4x4_mmx diff --git a/vp9/common/x86/vp9_idctllm_mmx.asm b/vp9/common/x86/vp9_idctllm_mmx.asm deleted file mode 100644 index 15e81addb..000000000 --- a/vp9/common/x86/vp9_idctllm_mmx.asm +++ /dev/null @@ -1,241 +0,0 @@ -; -; Copyright (c) 2012 The WebM project authors. 
All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -align 16 -x_s1sqr2: times 4 dw 0x8A8C -align 16 -x_c1sqr2less1: times 4 dw 0x4E7B -align 16 -pw_16: times 4 dw 16 - -SECTION .text - - -; /**************************************************************************** -; * Notes: -; * -; * This implementation makes use of 16 bit fixed point version of two multiply -; * constants: -; * 1. sqrt(2) * cos (pi/8) -; * 2. sqrt(2) * sin (pi/8) -; * Because the first constant is bigger than 1, to maintain the same 16 bit -; * fixed point precision as the second one, we use a trick of -; * x * a = x + x*(a-1) -; * so -; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1). -; * -; * For the second constant, because of the 16bit version is 35468, which -; * is bigger than 32768, in signed 16 bit multiply, it becomes a negative -; * number. -; * (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x -; * -; **************************************************************************/ - -INIT_MMX - -;void short_idct4x4llm_mmx(short *input, short *output, int pitch) -cglobal short_idct4x4llm_mmx, 3,3,0, inp, out, pit - mova m0, [inpq +0] - mova m1, [inpq +8] - - mova m2, [inpq+16] - mova m3, [inpq+24] - - psubw m0, m2 ; b1= 0-2 - paddw m2, m2 ; - - mova m5, m1 - paddw m2, m0 ; a1 =0+2 - - pmulhw m5, [x_s1sqr2] ; - paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2) - - mova m7, m3 ; - pmulhw m7, [x_c1sqr2less1] ; - - paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2) - psubw m7, m5 ; c1 - - mova m5, m1 - mova m4, m3 - - pmulhw m5, [x_c1sqr2less1] - paddw m5, m1 - - pmulhw m3, [x_s1sqr2] - paddw m3, m4 - - paddw m3, m5 ; d1 - mova m6, m2 ; a1 - - mova m4, m0 ; b1 - paddw m2, m3 ;0 - - paddw m4, m7 ;1 - psubw m0, m7 ;2 - - psubw m6, m3 ;3 - - mova m1, m2 ; 03 02 01 00 - mova m3, m4 ; 23 22 21 20 - - punpcklwd m1, m0 ; 11 01 10 00 - punpckhwd m2, m0 ; 13 03 12 02 - - punpcklwd m3, m6 ; 31 21 30 20 - punpckhwd m4, m6 ; 33 23 32 22 - - mova m0, m1 ; 11 01 10 00 - mova m5, m2 ; 13 03 12 02 - - punpckldq m0, m3 ; 30 20 10 00 - punpckhdq m1, m3 ; 31 21 11 01 - - punpckldq m2, m4 ; 32 22 12 02 - punpckhdq m5, m4 ; 33 23 13 03 - - mova m3, m5 ; 33 23 13 03 - - psubw m0, m2 ; b1= 0-2 - paddw m2, m2 ; - - mova m5, m1 - paddw m2, m0 ; a1 =0+2 - - pmulhw m5, [x_s1sqr2] ; - paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2) - - mova m7, m3 ; - pmulhw m7, [x_c1sqr2less1] ; - - paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2) - psubw m7, m5 ; c1 - - mova m5, m1 - mova m4, m3 - - pmulhw m5, [x_c1sqr2less1] - paddw m5, m1 - - pmulhw m3, [x_s1sqr2] - paddw m3, m4 - - paddw m3, m5 ; d1 - paddw m0, [pw_16] - - paddw m2, [pw_16] - mova m6, m2 ; a1 - - mova m4, m0 ; b1 - paddw m2, m3 ;0 - - paddw m4, m7 ;1 - psubw m0, m7 ;2 - - psubw m6, m3 ;3 - psraw m2, 5 - - psraw m0, 5 - psraw m4, 5 - - psraw m6, 5 - - mova m1, m2 ; 03 02 01 00 - mova m3, m4 ; 23 22 21 20 - - punpcklwd m1, m0 ; 11 01 10 00 - punpckhwd m2, m0 ; 13 03 12 02 - - punpcklwd m3, m6 ; 31 21 30 20 - punpckhwd m4, m6 ; 33 23 32 22 - - mova m0, m1 ; 11 01 10 00 - mova m5, m2 ; 13 03 12 02 - - punpckldq m0, m3 ; 30 20 10 00 - punpckhdq m1, m3 ; 31 21 11 01 - - punpckldq m2, m4 ; 32 22 12 02 - punpckhdq m5, m4 ; 33 23 13 03 - - mova [outq], 
m0 - - mova [outq+r2], m1 - mova [outq+pitq*2], m2 - - add outq, pitq - mova [outq+pitq*2], m5 - RET - -;void short_idct4x4llm_1_mmx(short *input, short *output, int pitch) -cglobal short_idct4x4llm_1_mmx,3,3,0,inp,out,pit - movh m0, [inpq] - paddw m0, [pw_16] - psraw m0, 5 - punpcklwd m0, m0 - punpckldq m0, m0 - - mova [outq], m0 - mova [outq+pitq], m0 - - mova [outq+pitq*2], m0 - add r1, r2 - - mova [outq+pitq*2], m0 - RET - - -;void dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride) -cglobal dc_only_idct_add_mmx, 4,5,0,in_dc,pred,dst,pit,stride -%if ARCH_X86_64 - movsxd strideq, dword stridem -%else - mov strideq, stridem -%endif - pxor m0, m0 - - movh m5, in_dcq ; dc - paddw m5, [pw_16] - - psraw m5, 5 - - punpcklwd m5, m5 - punpckldq m5, m5 - - movh m1, [predq] - punpcklbw m1, m0 - paddsw m1, m5 - packuswb m1, m0 ; pack and unpack to saturate - movh [dstq], m1 - - movh m2, [predq+pitq] - punpcklbw m2, m0 - paddsw m2, m5 - packuswb m2, m0 ; pack and unpack to saturate - movh [dstq+strideq], m2 - - movh m3, [predq+2*pitq] - punpcklbw m3, m0 - paddsw m3, m5 - packuswb m3, m0 ; pack and unpack to saturate - movh [dstq+2*strideq], m3 - - add dstq, strideq - add predq, pitq - movh m4, [predq+2*pitq] - punpcklbw m4, m0 - paddsw m4, m5 - packuswb m4, m0 ; pack and unpack to saturate - movh [dstq+2*strideq], m4 - RET - diff --git a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c index d319bf2d5..08447a62d 100644 --- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c +++ b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c @@ -26,14 +26,16 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, DECLARE_ALIGNED(16, unsigned char, flat2_op[7][16]); DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][16]); - DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); + DECLARE_ALIGNED(16, unsigned char, flat_op[3][16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq[3][16]); + + DECLARE_ALIGNED(16, unsigned char, ap[8][16]); + DECLARE_ALIGNED(16, unsigned char, aq[8][16]); + + __m128i mask, hev, flat, flat2; const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); __m128i p7, p6, p5; __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; __m128i q5, q6, q7; @@ -58,12 +60,24 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); + + _mm_store_si128((__m128i *)ap[4], p4); + _mm_store_si128((__m128i *)ap[3], p3); + _mm_store_si128((__m128i *)ap[2], p2); + _mm_store_si128((__m128i *)ap[1], p1); + _mm_store_si128((__m128i *)ap[0], p0); + _mm_store_si128((__m128i *)aq[4], q4); + _mm_store_si128((__m128i *)aq[3], q3); + _mm_store_si128((__m128i *)aq[2], q2); + _mm_store_si128((__m128i *)aq[1], q1); + _mm_store_si128((__m128i *)aq[0], q0); + + { const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); - const __m128i one = _mm_set1_epi8(1); const __m128i fe = _mm_set1_epi8(0xfe); const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), @@ -95,246 +109,8 @@ 
void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, mask = _mm_max_epu8(work, mask); mask = _mm_subs_epu8(mask, limit); mask = _mm_cmpeq_epi8(mask, zero); - - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0), - _mm_subs_epu8(p0, p2)), - _mm_or_si128(_mm_subs_epu8(q2, q0), - _mm_subs_epu8(q0, q2))); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0), - _mm_subs_epu8(p0, p3)), - _mm_or_si128(_mm_subs_epu8(q3, q0), - _mm_subs_epu8(q0, q3))); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0), - _mm_subs_epu8(p0, p4)), - _mm_or_si128(_mm_subs_epu8(q4, q0), - _mm_subs_epu8(q0, q4))); - flat = _mm_max_epu8(work, flat); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); } - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // calculate flat2 - p4 = _mm_loadu_si128((__m128i *)(s - 8 * p)); - p3 = _mm_loadu_si128((__m128i *)(s - 7 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 6 * p)); - p1 = _mm_loadu_si128((__m128i *)(s - 5 * p)); -// p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); -// q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 4 * p)); - q2 = _mm_loadu_si128((__m128i *)(s + 5 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 6 * p)); - q4 = _mm_loadu_si128((__m128i *)(s + 7 * p)); - - { - const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), - _mm_subs_epu8(p0, p1)); - const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), - _mm_subs_epu8(q0, q1)); - const __m128i one = _mm_set1_epi8(1); - __m128i work; - flat2 = _mm_max_epu8(abs_p1p0, abs_q1q0); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0), - _mm_subs_epu8(p0, p2)), - _mm_or_si128(_mm_subs_epu8(q2, q0), - _mm_subs_epu8(q0, q2))); - flat2 = _mm_max_epu8(work, flat2); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0), - _mm_subs_epu8(p0, p3)), - _mm_or_si128(_mm_subs_epu8(q3, q0), - _mm_subs_epu8(q0, q3))); - flat2 = _mm_max_epu8(work, flat2); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0), - _mm_subs_epu8(p0, p4)), - _mm_or_si128(_mm_subs_epu8(q4, q0), - _mm_subs_epu8(q0, q4))); - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - } - // calculate flat2 - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - { - const __m128i four = _mm_set1_epi16(4); - unsigned char *src = s; - i = 0; - do { - __m128i workp_a, workp_b, workp_shft; - p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero); - p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); - p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); - p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); - p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); - q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); - q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); - q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); - q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); - q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero); - - workp_a = _mm_add_epi16(_mm_add_epi16(p4, p3), _mm_add_epi16(p2, p1)); - workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); - workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p4); - workp_shft = 
_mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op2[i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op1[i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q2); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op0[i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq0[i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q4); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq1[i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q4); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq2[i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - src += 8; - } while (++i < 2); - } - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // wide flat - // TODO(slavarnway): interleave with the flat pixel calculations (see above) - { - const __m128i eight = _mm_set1_epi16(8); - unsigned char *src = s; - int i = 0; - do { - __m128i workp_a, workp_b, workp_shft; - p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 8 * p)), zero); - p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 7 * p)), zero); - p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 6 * p)), zero); - p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero); - p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); - p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); - p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); - p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); - q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); - q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); - q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); - q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); - q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero); - q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 5 * p)), zero); - q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 6 * p)), zero); - q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 7 * p)), zero); - - - workp_a = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7); // p7 * 7 - workp_a = _mm_add_epi16(_mm_slli_epi16(p6, 1), workp_a); - workp_b = _mm_add_epi16(_mm_add_epi16(p5, p4), _mm_add_epi16(p3, p2)); - workp_a = _mm_add_epi16(_mm_add_epi16(p1, p0), workp_a); - workp_b = _mm_add_epi16(_mm_add_epi16(q0, eight), workp_b); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4); - _mm_storel_epi64((__m128i *)&flat2_op[6][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p5); - workp_b = 
_mm_add_epi16(_mm_sub_epi16(workp_b, p6), q1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4); - _mm_storel_epi64((__m128i *)&flat2_op[5][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p4); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p5), q2); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4); - _mm_storel_epi64((__m128i *)&flat2_op[4][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p4), q3); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4); - _mm_storel_epi64((__m128i *)&flat2_op[3][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p2); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p3), q4); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4); - _mm_storel_epi64((__m128i *)&flat2_op[2][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p1); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p2), q5); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4); - _mm_storel_epi64((__m128i *)&flat2_op[1][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p0); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), q6); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4); - _mm_storel_epi64((__m128i *)&flat2_op[0][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), q0); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q7); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p6), q1); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q7); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p5), q2); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q7); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q2), q7); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q4); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q3), q7); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q5); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q4), q7); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q6); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q5), q7); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8], - 
_mm_packus_epi16(workp_shft, workp_shft)); - - src += 8; - } while (++i < 2); - } - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // lp filter { const __m128i t4 = _mm_set1_epi8(4); @@ -345,14 +121,10 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, const __m128i t1 = _mm_set1_epi8(0x1); const __m128i t7f = _mm_set1_epi8(0x7f); - __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), - t80); - __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), - t80); - __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), - t80); - __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), - t80); + __m128i ps1 = _mm_xor_si128(p1, t80); + __m128i ps0 = _mm_xor_si128(p0, t80); + __m128i qs0 = _mm_xor_si128(q0, t80); + __m128i qs1 = _mm_xor_si128(q1, t80); __m128i filt; __m128i work_a; __m128i filter1, filter2; @@ -374,6 +146,7 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, work_a = _mm_and_si128(work_a, te0); filter1 = _mm_and_si128(filter1, t1f); filter1 = _mm_or_si128(filter1, work_a); + qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); /* Filter2 >> 3 */ work_a = _mm_cmpgt_epi8(zero, filter2); @@ -381,6 +154,7 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, work_a = _mm_and_si128(work_a, te0); filter2 = _mm_and_si128(filter2, t1f); filter2 = _mm_or_si128(filter2, work_a); + ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); /* filt >> 1 */ filt = _mm_adds_epi8(filter1, t1); @@ -389,20 +163,265 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, work_a = _mm_and_si128(work_a, t80); filt = _mm_and_si128(filt, t7f); filt = _mm_or_si128(filt, work_a); - filt = _mm_andnot_si128(hev, filt); - - ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + // loopfilter done + + { + __m128i work; + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0), + _mm_subs_epu8(p0, p2)), + _mm_or_si128(_mm_subs_epu8(q2, q0), + _mm_subs_epu8(q0, q2))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0), + _mm_subs_epu8(p0, p3)), + _mm_or_si128(_mm_subs_epu8(q3, q0), + _mm_subs_epu8(q0, q3))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0), + _mm_subs_epu8(p0, p4)), + _mm_or_si128(_mm_subs_epu8(q4, q0), + _mm_subs_epu8(q0, q4))); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + p5 = _mm_loadu_si128((__m128i *)(s - 6 * p)); + q5 = _mm_loadu_si128((__m128i *)(s + 5 * p)); + flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0), + _mm_subs_epu8(p0, p5)), + _mm_or_si128(_mm_subs_epu8(q5, q0), + _mm_subs_epu8(q0, q5))); + _mm_store_si128((__m128i *)ap[5], p5); + _mm_store_si128((__m128i *)aq[5], q5); + flat2 = _mm_max_epu8(work, flat2); + p6 = _mm_loadu_si128((__m128i *)(s - 7 * p)); + q6 = _mm_loadu_si128((__m128i *)(s + 6 * p)); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0), + _mm_subs_epu8(p0, p6)), + _mm_or_si128(_mm_subs_epu8(q6, q0), + _mm_subs_epu8(q0, q6))); + _mm_store_si128((__m128i *)ap[6], p6); + _mm_store_si128((__m128i *)aq[6], q6); + flat2 = _mm_max_epu8(work, flat2); + + p7 = _mm_loadu_si128((__m128i *)(s - 8 * p)); + q7 = _mm_loadu_si128((__m128i *)(s + 7 * p)); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0), + _mm_subs_epu8(p0, p7)), + 
_mm_or_si128(_mm_subs_epu8(q7, q0), + _mm_subs_epu8(q0, q7))); + _mm_store_si128((__m128i *)ap[7], p7); + _mm_store_si128((__m128i *)aq[7], q7); + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + { + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i temp_flat2 = flat2; + unsigned char *src = s; + int i = 0; + do { + __m128i workp_shft; + __m128i a, b, c; + + unsigned int off = i * 8; + p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[7] + off)), zero); + p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6] + off)), zero); + p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5] + off)), zero); + p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4] + off)), zero); + p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3] + off)), zero); + p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2] + off)), zero); + p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1] + off)), zero); + p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0] + off)), zero); + q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0] + off)), zero); + q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1] + off)), zero); + q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2] + off)), zero); + q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3] + off)), zero); + q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4] + off)), zero); + q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5] + off)), zero); + q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6] + off)), zero); + q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7] + off)), zero); + + c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7); // p7 * 7 + c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c)); + + b = _mm_add_epi16(_mm_add_epi16(p3, four), _mm_add_epi16(p3, p2)); + a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1)); + a = _mm_add_epi16(_mm_add_epi16(p0, q0), a); + + _mm_storel_epi64((__m128i *)&flat_op[2][i*8], + _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) + , b)); + + c = _mm_add_epi16(_mm_add_epi16(p5, eight), c); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[6][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q1, a); + b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1); + _mm_storel_epi64((__m128i *)&flat_op[1][i*8], + _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) + , b)); + + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[5][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q2, a); + b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0); + _mm_storel_epi64((__m128i *)&flat_op[0][i*8], + _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) + , b)); + + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[4][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q3, a); + b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0); + _mm_storel_epi64((__m128i *)&flat_oq[0][i*8], + _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) + , b)); + + c = 
_mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[3][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + b = _mm_add_epi16(q3, b); + b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1); + _mm_storel_epi64((__m128i *)&flat_oq[1][i*8], + _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) + , b)); + + c = _mm_add_epi16(q4, c); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[2][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + b = _mm_add_epi16(q3, b); + b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2); + _mm_storel_epi64((__m128i *)&flat_oq[2][i*8], + _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) + , b)); + a = _mm_add_epi16(q5, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[1][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q6, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[0][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + temp_flat2 = _mm_srli_si128(temp_flat2, 8); + src += 8; + } while (++i < 2); + } + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + work_a = _mm_load_si128((__m128i *)ap[2]); + p2 = _mm_load_si128((__m128i *)flat_op[2]); + work_a = _mm_andnot_si128(flat, work_a); + p2 = _mm_and_si128(flat, p2); + p2 = _mm_or_si128(work_a, p2); + _mm_store_si128((__m128i *)flat_op[2], p2); + + p1 = _mm_load_si128((__m128i *)flat_op[1]); + work_a = 
_mm_andnot_si128(flat, ps1); + p1 = _mm_and_si128(flat, p1); + p1 = _mm_or_si128(work_a, p1); + _mm_store_si128((__m128i *)flat_op[1], p1); + + p0 = _mm_load_si128((__m128i *)flat_op[0]); + work_a = _mm_andnot_si128(flat, ps0); + p0 = _mm_and_si128(flat, p0); + p0 = _mm_or_si128(work_a, p0); + _mm_store_si128((__m128i *)flat_op[0], p0); + + q0 = _mm_load_si128((__m128i *)flat_oq[0]); + work_a = _mm_andnot_si128(flat, qs0); + q0 = _mm_and_si128(flat, q0); + q0 = _mm_or_si128(work_a, q0); + _mm_store_si128((__m128i *)flat_oq[0], q0); + + q1 = _mm_load_si128((__m128i *)flat_oq[1]); + work_a = _mm_andnot_si128(flat, qs1); + q1 = _mm_and_si128(flat, q1); + q1 = _mm_or_si128(work_a, q1); + _mm_store_si128((__m128i *)flat_oq[1], q1); + + work_a = _mm_load_si128((__m128i *)aq[2]); + q2 = _mm_load_si128((__m128i *)flat_oq[2]); + work_a = _mm_andnot_si128(flat, work_a); + q2 = _mm_and_si128(flat, q2); + q2 = _mm_or_si128(work_a, q2); + _mm_store_si128((__m128i *)flat_oq[2], q2); // write out op6 - op3 { unsigned char *dst = (s - 7 * p); for (i = 6; i > 2; i--) { __m128i flat2_output; - work_a = _mm_loadu_si128((__m128i *)dst); + work_a = _mm_load_si128((__m128i *)ap[i]); flat2_output = _mm_load_si128((__m128i *)flat2_op[i]); work_a = _mm_andnot_si128(flat2, work_a); flat2_output = _mm_and_si128(flat2, flat2_output); @@ -412,62 +431,42 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, } } - work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p2 = _mm_load_si128((__m128i *)flat_op2); - work_a = _mm_andnot_si128(flat, work_a); - p2 = _mm_and_si128(flat, p2); - work_a = _mm_or_si128(work_a, p2); + work_a = _mm_load_si128((__m128i *)flat_op[2]); p2 = _mm_load_si128((__m128i *)flat2_op[2]); work_a = _mm_andnot_si128(flat2, work_a); p2 = _mm_and_si128(flat2, p2); p2 = _mm_or_si128(work_a, p2); _mm_storeu_si128((__m128i *)(s - 3 * p), p2); - p1 = _mm_load_si128((__m128i *)flat_op1); - work_a = _mm_andnot_si128(flat, ps1); - p1 = _mm_and_si128(flat, p1); - work_a = _mm_or_si128(work_a, p1); + work_a = _mm_load_si128((__m128i *)flat_op[1]); p1 = _mm_load_si128((__m128i *)flat2_op[1]); work_a = _mm_andnot_si128(flat2, work_a); p1 = _mm_and_si128(flat2, p1); p1 = _mm_or_si128(work_a, p1); _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - p0 = _mm_load_si128((__m128i *)flat_op0); - work_a = _mm_andnot_si128(flat, ps0); - p0 = _mm_and_si128(flat, p0); - work_a = _mm_or_si128(work_a, p0); + work_a = _mm_load_si128((__m128i *)flat_op[0]); p0 = _mm_load_si128((__m128i *)flat2_op[0]); work_a = _mm_andnot_si128(flat2, work_a); p0 = _mm_and_si128(flat2, p0); p0 = _mm_or_si128(work_a, p0); _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - q0 = _mm_load_si128((__m128i *)flat_oq0); - work_a = _mm_andnot_si128(flat, qs0); - q0 = _mm_and_si128(flat, q0); - work_a = _mm_or_si128(work_a, q0); + work_a = _mm_load_si128((__m128i *)flat_oq[0]); q0 = _mm_load_si128((__m128i *)flat2_oq[0]); work_a = _mm_andnot_si128(flat2, work_a); q0 = _mm_and_si128(flat2, q0); q0 = _mm_or_si128(work_a, q0); _mm_storeu_si128((__m128i *)(s - 0 * p), q0); - q1 = _mm_load_si128((__m128i *)flat_oq1); - work_a = _mm_andnot_si128(flat, qs1); - q1 = _mm_and_si128(flat, q1); - work_a = _mm_or_si128(work_a, q1); + work_a = _mm_load_si128((__m128i *)flat_oq[1]); q1 = _mm_load_si128((__m128i *)flat2_oq[1]); work_a = _mm_andnot_si128(flat2, work_a); q1 = _mm_and_si128(flat2, q1); q1 = _mm_or_si128(work_a, q1); _mm_storeu_si128((__m128i *)(s + 1 * p), q1); - work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q2 = _mm_load_si128((__m128i *)flat_oq2); - 
work_a = _mm_andnot_si128(flat, work_a); - q2 = _mm_and_si128(flat, q2); - work_a = _mm_or_si128(work_a, q2); + work_a = _mm_load_si128((__m128i *)flat_oq[2]); q2 = _mm_load_si128((__m128i *)flat2_oq[2]); work_a = _mm_andnot_si128(flat2, work_a); q2 = _mm_and_si128(flat2, q2); @@ -479,7 +478,7 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, unsigned char *dst = (s + 3 * p); for (i = 3; i < 7; i++) { __m128i flat2_output; - work_a = _mm_loadu_si128((__m128i *)dst); + work_a = _mm_load_si128((__m128i *)aq[i]); flat2_output = _mm_load_si128((__m128i *)flat2_oq[i]); work_a = _mm_andnot_si128(flat2, work_a); flat2_output = _mm_and_si128(flat2, flat2_output); @@ -504,7 +503,7 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); __m128i mask, hev, flat; const __m128i zero = _mm_set1_epi16(0); - __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; + __m128i p3, p2, p1, p0, q0, q1, q2, q3; const unsigned int extended_thresh = _thresh[0] * 0x01010101u; const unsigned int extended_limit = _limit[0] * 0x01010101u; const unsigned int extended_blimit = _blimit[0] * 0x01010101u; @@ -515,7 +514,6 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, const __m128i blimit = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0); - p4 = _mm_loadu_si128((__m128i *)(s - 5 * p)); p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); @@ -524,7 +522,6 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); - q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); { const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); @@ -573,11 +570,6 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3))); flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0), - _mm_subs_epu8(p0, p4)), - _mm_or_si128(_mm_subs_epu8(q4, q0), - _mm_subs_epu8(q0, q4))); - flat = _mm_max_epu8(work, flat); flat = _mm_subs_epu8(flat, one); flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); @@ -588,7 +580,6 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, int i = 0; do { __m128i workp_a, workp_b, workp_shft; - p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero); p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); @@ -597,11 +588,10 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); - q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero); - workp_a = _mm_add_epi16(_mm_add_epi16(p4, p3), _mm_add_epi16(p2, p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); - workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p4); + workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); _mm_storel_epi64((__m128i *)&flat_op2[i*8], 
_mm_packus_epi16(workp_shft, workp_shft)); @@ -611,7 +601,7 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, _mm_storel_epi64((__m128i *)&flat_op1[i*8], _mm_packus_epi16(workp_shft, workp_shft)); - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q2); + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); _mm_storel_epi64((__m128i *)&flat_op0[i*8], @@ -623,13 +613,13 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, _mm_storel_epi64((__m128i *)&flat_oq0[i*8], _mm_packus_epi16(workp_shft, workp_shft)); - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q4); + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); _mm_storel_epi64((__m128i *)&flat_oq1[i*8], _mm_packus_epi16(workp_shft, workp_shft)); - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q4); + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); _mm_storel_epi64((__m128i *)&flat_oq2[i*8], @@ -813,8 +803,8 @@ void vp9_mbloop_filter_horizontal_edge_uv_sse2(unsigned char *u, _mm_loadl_epi64((__m128i *)(src + 120))); } -static __inline void transpose8x16(unsigned char *in0, unsigned char *in1, - int in_p, unsigned char *out, int out_p) { +static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1, + int in_p, unsigned char *out, int out_p) { __m128i x0, x1, x2, x3, x4, x5, x6, x7; __m128i x8, x9, x10, x11, x12, x13, x14, x15; @@ -879,9 +869,9 @@ static __inline void transpose8x16(unsigned char *in0, unsigned char *in1, _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15)); } -static __inline void transpose(unsigned char *src[], int in_p, - unsigned char *dst[], int out_p, - int num_8x8_to_transpose) { +static INLINE void transpose(unsigned char *src[], int in_p, + unsigned char *dst[], int out_p, + int num_8x8_to_transpose) { int idx8x8 = 0; __m128i x0, x1, x2, x3, x4, x5, x6, x7; do { diff --git a/vp9/common/x86/vp9_postproc_mmx.asm b/vp9/common/x86/vp9_postproc_mmx.asm index 5f06f0ea0..c2118dbb7 100644 --- a/vp9/common/x86/vp9_postproc_mmx.asm +++ b/vp9/common/x86/vp9_postproc_mmx.asm @@ -459,11 +459,11 @@ sym(vp9_mbpost_proc_down_mmx): %undef flimit2 -;void vp9_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise, +;void vp9_plane_add_noise_mmx (unsigned char *start, unsigned char *noise, ; unsigned char blackclamp[16], ; unsigned char whiteclamp[16], ; unsigned char bothclamp[16], -; unsigned int Width, unsigned int Height, int Pitch) +; unsigned int width, unsigned int height, int pitch) extern sym(rand) global sym(vp9_plane_add_noise_mmx) PRIVATE sym(vp9_plane_add_noise_mmx): diff --git a/vp9/common/x86/vp9_postproc_sse2.asm b/vp9/common/x86/vp9_postproc_sse2.asm index 8bbb3794b..858fc99b6 100644 --- a/vp9/common/x86/vp9_postproc_sse2.asm +++ b/vp9/common/x86/vp9_postproc_sse2.asm @@ -624,11 +624,11 @@ sym(vp9_mbpost_proc_across_ip_xmm): %undef flimit4 -;void vp9_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise, +;void vp9_plane_add_noise_wmt (unsigned char *start, unsigned char *noise, ; unsigned char blackclamp[16], ; unsigned char whiteclamp[16], ; unsigned char bothclamp[16], -; unsigned int Width, unsigned int Height, int Pitch) +; unsigned int width, 
unsigned int height, int pitch) extern sym(rand) global sym(vp9_plane_add_noise_wmt) PRIVATE sym(vp9_plane_add_noise_wmt): diff --git a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm index b644da64c..32f00e289 100644 --- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm +++ b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm @@ -21,34 +21,92 @@ ; ;*************************************************************************************/ -;void vp9_filter_block1d8_v8_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE -sym(vp9_filter_block1d8_v8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] +%macro VERTx4 1 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm4, [rdx] ;load filters + movd xmm5, rcx + packsswb xmm4, xmm4 + pshuflw xmm0, xmm4, 0b ;k0_k1 + pshuflw xmm1, xmm4, 01010101b ;k2_k3 + pshuflw xmm2, xmm4, 10101010b ;k4_k5 + pshuflw xmm3, xmm4, 11111111b ;k6_k7 + + punpcklqdq xmm0, xmm0 + punpcklqdq xmm1, xmm1 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + + movdqa k0k1, xmm0 + movdqa k2k3, xmm1 + pshufd xmm5, xmm5, 0 + movdqa k4k5, xmm2 + movdqa k6k7, xmm3 + movdqa krd, xmm5 + + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ;out_pitch +%endif + mov rax, rsi + movsxd rcx, DWORD PTR arg(4) ;output_height + add rax, rdx + + lea rbx, [rdx + rdx*4] + add rbx, rdx ;pitch * 6 + +.loop: + movd xmm0, [rsi] ;A + movd xmm1, [rsi + rdx] ;B + movd xmm2, [rsi + rdx * 2] ;C + movd xmm3, [rax + rdx * 2] ;D + movd xmm4, [rsi + rdx * 4] ;E + movd xmm5, [rax + rdx * 4] ;F + + punpcklbw xmm0, xmm1 ;A B + punpcklbw xmm2, xmm3 ;C D + punpcklbw xmm4, xmm5 ;E F + + movd xmm6, [rsi + rbx] ;G + movd xmm7, [rax + rbx] ;H + + pmaddubsw xmm0, k0k1 + pmaddubsw xmm2, k2k3 + punpcklbw xmm6, xmm7 ;G H + pmaddubsw xmm4, k4k5 + pmaddubsw xmm6, k6k7 + + paddsw xmm0, xmm2 + paddsw xmm0, krd + paddsw xmm4, xmm6 + paddsw xmm0, xmm4 + psraw xmm0, 7 + packuswb xmm0, xmm0 + + add rsi, rdx + add rax, rdx +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movd [rdi], xmm0 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;out_pitch +%else + add rdi, r8 +%endif + dec rcx + jnz .loop +%endm + +%macro VERTx8 1 mov rdx, arg(5) ;filter ptr mov rsi, arg(0) ;src_ptr mov rdi, arg(2) ;output_ptr @@ -86,7 +144,7 @@ sym(vp9_filter_block1d8_v8_ssse3): lea rbx, [rdx + rdx*4] add rbx, rdx ;pitch * 6 -.vp9_filter_block1d8_v8_ssse3_loop: +.loop: movq xmm0, [rsi] ;A movq xmm1, [rsi + rdx] ;B movq xmm2, [rsi + rdx * 2] ;C @@ -117,7 +175,10 @@ sym(vp9_filter_block1d8_v8_ssse3): add rsi, rdx add rax, rdx - +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif movq [rdi], xmm0 %if ABI_IS_32BIT @@ -126,47 +187,11 @@ sym(vp9_filter_block1d8_v8_ssse3): add rdi, r8 %endif dec rcx - jnz .vp9_filter_block1d8_v8_ssse3_loop - - add rsp, 16*5 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_filter_block1d16_v8_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; 
unsigned int output_height, -; short *filter -;) -global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE -sym(vp9_filter_block1d16_v8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog + jnz .loop +%endm - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] +%macro VERTx16 1 mov rdx, arg(5) ;filter ptr mov rsi, arg(0) ;src_ptr mov rdi, arg(2) ;output_ptr @@ -204,7 +229,7 @@ sym(vp9_filter_block1d16_v8_ssse3): lea rbx, [rdx + rdx*4] add rbx, rdx ;pitch * 6 -.vp9_filter_block1d16_v8_ssse3_loop: +.loop: movq xmm0, [rsi] ;A movq xmm1, [rsi + rdx] ;B movq xmm2, [rsi + rdx * 2] ;C @@ -232,7 +257,10 @@ sym(vp9_filter_block1d16_v8_ssse3): psraw xmm0, 7 packuswb xmm0, xmm0 - +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif movq [rdi], xmm0 movq xmm0, [rsi + 8] ;A @@ -267,6 +295,10 @@ sym(vp9_filter_block1d16_v8_ssse3): add rsi, rdx add rax, rdx +%if %1 + movq xmm1, [rdi+8] + pavgb xmm0, xmm1 +%endif movq [rdi+8], xmm0 @@ -276,7 +308,38 @@ sym(vp9_filter_block1d16_v8_ssse3): add rdi, r8 %endif dec rcx - jnz .vp9_filter_block1d16_v8_ssse3_loop + jnz .loop +%endm + +;void vp9_filter_block1d8_v8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE +sym(vp9_filter_block1d4_v8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + VERTx4 0 add rsp, 16*5 pop rsp @@ -289,24 +352,65 @@ sym(vp9_filter_block1d16_v8_ssse3): pop rbp ret -;void vp9_filter_block1d8_h8_ssse3 +;void vp9_filter_block1d8_v8_ssse3 ;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, ; short *filter ;) -global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE -sym(vp9_filter_block1d8_h8_ssse3): +global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE +sym(vp9_filter_block1d8_v8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + VERTx8 0 + + add rsp, 16*5 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d16_v8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE +sym(vp9_filter_block1d16_v8_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 SAVE_XMM 7 - GET_GOT rbx push rsi push rdi + push rbx ; end prolog ALIGN_STACK 16, rax @@ -317,6 +421,121 @@ sym(vp9_filter_block1d8_h8_ssse3): %define k6k7 [rsp + 16*3] %define krd [rsp + 16*4] + VERTx16 0 + + add rsp, 16*5 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop 
rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +global sym(vp9_filter_block1d4_v8_avg_ssse3) PRIVATE +sym(vp9_filter_block1d4_v8_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + VERTx4 1 + + add rsp, 16*5 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_v8_avg_ssse3) PRIVATE +sym(vp9_filter_block1d8_v8_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + VERTx8 1 + + add rsp, 16*5 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_v8_avg_ssse3) PRIVATE +sym(vp9_filter_block1d16_v8_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + VERTx16 1 + + add rsp, 16*5 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +%macro HORIZx4 1 mov rdx, arg(5) ;filter ptr mov rsi, arg(0) ;src_ptr mov rdi, arg(2) ;output_ptr @@ -340,19 +559,16 @@ sym(vp9_filter_block1d8_h8_ssse3): pshufd xmm5, xmm5, 0 movdqa k4k5, xmm2 movdqa k6k7, xmm3 -; movdqa krd, xmm5 + movdqa krd, xmm5 movsxd rax, dword ptr arg(1) ;src_pixels_per_line movsxd rdx, dword ptr arg(3) ;output_pitch movsxd rcx, dword ptr arg(4) ;output_height -.filter_block1d8_h8_rowloop_ssse3: +.loop: movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 -; movq xmm3, [rsi + 4] ; 4 5 6 7 8 9 10 11 movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 -;note: if we create a k0_k7 filter, we can save a pshufb -; punpcklbw xmm0, xmm3 ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11 punpcklqdq xmm0, xmm3 movdqa xmm1, xmm0 @@ -371,59 +587,94 @@ sym(vp9_filter_block1d8_h8_ssse3): pmaddubsw xmm4, k6k7 paddsw xmm0, xmm1 - paddsw xmm0, xmm2 - paddsw xmm0, xmm5 paddsw xmm0, xmm4 + paddsw xmm0, xmm2 + paddsw xmm0, krd psraw xmm0, 7 packuswb xmm0, xmm0 - +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif lea rsi, [rsi + rax] - movq [rdi], xmm0 + movd [rdi], xmm0 lea rdi, [rdi + rdx] dec rcx - jnz .filter_block1d8_h8_rowloop_ssse3 + jnz .loop +%endm - add rsp, 16*5 - pop rsp +%macro HORIZx8 1 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret + movdqa xmm4, [rdx] ;load filters + movd xmm5, rcx + packsswb xmm4, xmm4 + pshuflw xmm0, xmm4, 0b ;k0_k1 + pshuflw xmm1, xmm4, 01010101b ;k2_k3 + pshuflw xmm2, xmm4, 10101010b ;k4_k5 + pshuflw xmm3, xmm4, 11111111b ;k6_k7 -;void vp9_filter_block1d16_h8_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int 
output_height, -; short *filter -;) -global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE -sym(vp9_filter_block1d16_h8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog + punpcklqdq xmm0, xmm0 + punpcklqdq xmm1, xmm1 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] + movdqa k0k1, xmm0 + movdqa k2k3, xmm1 + pshufd xmm5, xmm5, 0 + movdqa k4k5, xmm2 + movdqa k6k7, xmm3 + movdqa krd, xmm5 + + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rdx, dword ptr arg(3) ;output_pitch + movsxd rcx, dword ptr arg(4) ;output_height +.loop: + movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 + + movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 + punpcklqdq xmm0, xmm3 + + movdqa xmm1, xmm0 + pshufb xmm0, [GLOBAL(shuf_t0t1)] + pmaddubsw xmm0, k0k1 + + movdqa xmm2, xmm1 + pshufb xmm1, [GLOBAL(shuf_t2t3)] + pmaddubsw xmm1, k2k3 + + movdqa xmm4, xmm2 + pshufb xmm2, [GLOBAL(shuf_t4t5)] + pmaddubsw xmm2, k4k5 + + pshufb xmm4, [GLOBAL(shuf_t6t7)] + pmaddubsw xmm4, k6k7 + + paddsw xmm0, xmm1 + paddsw xmm0, xmm4 + paddsw xmm0, xmm2 + paddsw xmm0, krd + psraw xmm0, 7 + packuswb xmm0, xmm0 +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + + lea rsi, [rsi + rax] + movq [rdi], xmm0 + + lea rdi, [rdi + rdx] + dec rcx + jnz .loop +%endm + +%macro HORIZx16 1 mov rdx, arg(5) ;filter ptr mov rsi, arg(0) ;src_ptr mov rdi, arg(2) ;output_ptr @@ -453,13 +704,10 @@ sym(vp9_filter_block1d16_h8_ssse3): movsxd rdx, dword ptr arg(3) ;output_pitch movsxd rcx, dword ptr arg(4) ;output_height -.filter_block1d16_h8_rowloop_ssse3: +.loop: movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 -; movq xmm3, [rsi + 4] ; 4 5 6 7 8 9 10 11 movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 -;note: if we create a k0_k7 filter, we can save a pshufb -; punpcklbw xmm0, xmm3 ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11 punpcklqdq xmm0, xmm3 movdqa xmm1, xmm0 @@ -486,10 +734,7 @@ sym(vp9_filter_block1d16_h8_ssse3): movq xmm3, [rsi + 5] -; movq xmm7, [rsi + 12] movq xmm7, [rsi + 13] -;note: same as above -; punpcklbw xmm3, xmm7 punpcklqdq xmm3, xmm7 movdqa xmm1, xmm3 @@ -508,19 +753,54 @@ sym(vp9_filter_block1d16_h8_ssse3): pmaddubsw xmm4, k6k7 paddsw xmm3, xmm1 + paddsw xmm3, xmm4 paddsw xmm3, xmm2 paddsw xmm3, krd - paddsw xmm3, xmm4 psraw xmm3, 7 packuswb xmm3, xmm3 punpcklqdq xmm0, xmm3 +%if %1 + movdqa xmm1, [rdi] + pavgb xmm0, xmm1 +%endif lea rsi, [rsi + rax] movdqa [rdi], xmm0 lea rdi, [rdi + rdx] dec rcx - jnz .filter_block1d16_h8_rowloop_ssse3 + jnz .loop +%endm + +;void vp9_filter_block1d4_h8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE +sym(vp9_filter_block1d4_h8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + HORIZx4 0 add rsp, 16*5 pop rsp @@ -534,7 +814,188 @@ sym(vp9_filter_block1d16_h8_ssse3): pop rbp ret +;void vp9_filter_block1d8_h8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; 
unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE +sym(vp9_filter_block1d8_h8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + HORIZx8 0 + + add rsp, 16*5 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d16_h8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE +sym(vp9_filter_block1d16_h8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + HORIZx16 0 + add rsp, 16*5 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_h8_avg_ssse3) PRIVATE +sym(vp9_filter_block1d4_h8_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + HORIZx4 1 + + add rsp, 16*5 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE +sym(vp9_filter_block1d8_h8_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + HORIZx8 1 + + add rsp, 16*5 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_h8_avg_ssse3) PRIVATE +sym(vp9_filter_block1d16_h8_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + HORIZx16 1 + + add rsp, 16*5 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret SECTION_RODATA align 16 shuf_t0t1: diff --git a/vp9/common/x86/vp9_subpixel_mmx.asm b/vp9/common/x86/vp9_subpixel_mmx.asm deleted file mode 100644 index dee29b8fb..000000000 --- a/vp9/common/x86/vp9_subpixel_mmx.asm +++ /dev/null @@ -1,268 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
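[Editor's note] The rewritten SSSE3 path above folds each block width into a VERTx*/HORIZx* macro whose single parameter selects a plain store (0) or a pavgb-averaged store (1) for the new *_avg entry points; every kernel rounds with krd = 64 (the 0x00400040 constant broadcast by pshufd) and shifts right by 7, since the 8-tap coefficients sum to 128. A minimal scalar sketch of what one output pixel of the vertical macros computes; names are illustrative, and the exact saturating-add grouping of the pmaddubsw/paddsw chain is not reproduced:

    #include <stdint.h>

    static int16_t sat16(int32_t v) {            /* paddsw-style saturation */
      return v > 32767 ? 32767 : v < -32768 ? -32768 : (int16_t)v;
    }

    static uint8_t clamp8(int v) {               /* packuswb-style clamp */
      return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }

    /* One column of VERTx*: src points at the topmost of 8 input rows. */
    static uint8_t vert8_ref(const uint8_t *src, int pitch,
                             const int16_t filter[8],
                             uint8_t dst_old, int avg) {
      int16_t sum = 0;
      uint8_t out;
      int k;
      for (k = 0; k < 8; ++k)
        sum = sat16(sum + src[k * pitch] * filter[k]);
      sum = sat16(sum + 64);                     /* krd */
      out = clamp8(sum >> 7);                    /* psraw 7 + packuswb */
      return avg ? (uint8_t)((out + dst_old + 1) >> 1)  /* pavgb rounding */
                 : out;
    }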
-; - - -%include "vpx_ports/x86_abi_support.asm" - - -%define BLOCK_HEIGHT_WIDTH 4 -%define vp9_filter_weight 128 -%define VP9_FILTER_SHIFT 7 - - -;void vp9_filter_block1d_h6_mmx -;( -; unsigned char *src_ptr, -; unsigned short *output_ptr, -; unsigned int src_pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; short * vp9_filter -;) -global sym(vp9_filter_block1d_h6_mmx) PRIVATE -sym(vp9_filter_block1d_h6_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdx, arg(6) ;vp9_filter - - movq mm1, [rdx + 16] ; do both the negative taps first!!! - movq mm2, [rdx + 32] ; - movq mm6, [rdx + 48] ; - movq mm7, [rdx + 64] ; - - mov rdi, arg(1) ;output_ptr - mov rsi, arg(0) ;src_ptr - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rax, dword ptr arg(5) ;output_width ; destination pitch? - pxor mm0, mm0 ; mm0 = 00000000 - -.nextrow: - movq mm3, [rsi-2] ; mm3 = p-2..p5 - movq mm4, mm3 ; mm4 = p-2..p5 - psrlq mm3, 8 ; mm3 = p-1..p5 - punpcklbw mm3, mm0 ; mm3 = p-1..p2 - pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers. - - movq mm5, mm4 ; mm5 = p-2..p5 - punpckhbw mm4, mm0 ; mm5 = p2..p5 - pmullw mm4, mm7 ; mm5 *= kernel 4 modifiers - paddsw mm3, mm4 ; mm3 += mm5 - - movq mm4, mm5 ; mm4 = p-2..p5; - psrlq mm5, 16 ; mm5 = p0..p5; - punpcklbw mm5, mm0 ; mm5 = p0..p3 - pmullw mm5, mm2 ; mm5 *= kernel 2 modifiers - paddsw mm3, mm5 ; mm3 += mm5 - - movq mm5, mm4 ; mm5 = p-2..p5 - psrlq mm4, 24 ; mm4 = p1..p5 - punpcklbw mm4, mm0 ; mm4 = p1..p4 - pmullw mm4, mm6 ; mm5 *= kernel 3 modifiers - paddsw mm3, mm4 ; mm3 += mm5 - - ; do outer positive taps - movd mm4, [rsi+3] - punpcklbw mm4, mm0 ; mm5 = p3..p6 - pmullw mm4, [rdx+80] ; mm5 *= kernel 0 modifiers - paddsw mm3, mm4 ; mm3 += mm5 - - punpcklbw mm5, mm0 ; mm5 = p-2..p1 - pmullw mm5, [rdx] ; mm5 *= kernel 5 modifiers - paddsw mm3, mm5 ; mm3 += mm5 - - paddsw mm3, [GLOBAL(rd)] ; mm3 += round value - psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128 - packuswb mm3, mm0 ; pack and unpack to saturate - punpcklbw mm3, mm0 ; - - movq [rdi], mm3 ; store the results in the destination - -%if ABI_IS_32BIT - add rsi, dword ptr arg(2) ;src_pixels_per_line ; next line - add rdi, rax; -%else - movsxd r8, dword ptr arg(2) ;src_pixels_per_line - add rdi, rax; - - add rsi, r8 ; next line -%endif - - dec rcx ; decrement count - jnz .nextrow ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1dc_v6_mmx -;( -; short *src_ptr, -; unsigned char *output_ptr, -; int output_pitch, -; unsigned int pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; short * vp9_filter -;) -global sym(vp9_filter_block1dc_v6_mmx) PRIVATE -sym(vp9_filter_block1dc_v6_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movq mm5, [GLOBAL(rd)] - push rbx - mov rbx, arg(7) ;vp9_filter - movq mm1, [rbx + 16] ; do both the negative taps first!!! - movq mm2, [rbx + 32] ; - movq mm6, [rbx + 48] ; - movq mm7, [rbx + 64] ; - - movsxd rdx, dword ptr arg(3) ;pixels_per_line - mov rdi, arg(1) ;output_ptr - mov rsi, arg(0) ;src_ptr - sub rsi, rdx - sub rsi, rdx - movsxd rcx, DWORD PTR arg(5) ;output_height - movsxd rax, DWORD PTR arg(2) ;output_pitch ; destination pitch? - pxor mm0, mm0 ; mm0 = 00000000 - - -.nextrow_cv: - movq mm3, [rsi+rdx] ; mm3 = p0..p8 = row -1 - pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers. 
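[Editor's note] For reference, the two deleted MMX routines in this file form a classic two-pass pipeline: the h6 kernel above filters horizontally into a widened 16-bit buffer, and the v6 loop here filters that buffer vertically, advancing rsi by one pitch mid-loop so rows 1 and 3 can be addressed without a 3*pitch operand. A scalar sketch of the pair, ignoring the saturating 16-bit arithmetic of the MMX code:

    #include <stdint.h>

    #define VP9_FILTER_SHIFT 7   /* the six taps sum to 128 */

    static uint8_t clamp8(int v) {
      return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }

    /* First pass: horizontal 6-tap into a widened 16-bit buffer, as
     * vp9_filter_block1d_h6_mmx does (it packs then unpacks, so the
     * stored values are already clamped to 0..255). */
    static void h6_ref(const uint8_t *src, int src_pitch,
                       uint16_t *mid, int mid_pitch,
                       int w, int h, const int16_t taps[6]) {
      int x, y, k;
      for (y = 0; y < h; ++y)
        for (x = 0; x < w; ++x) {
          int sum = 64;                          /* rounding */
          for (k = 0; k < 6; ++k)
            sum += src[y * src_pitch + x + k - 2] * taps[k];
          mid[y * mid_pitch + x] = clamp8(sum >> VP9_FILTER_SHIFT);
        }
    }

    /* Second pass: vertical 6-tap over the intermediate, as the v6 loop
     * here does; mid points two rows above the first output row. */
    static void v6_ref(const uint16_t *mid, int mid_pitch,
                       uint8_t *dst, int dst_pitch,
                       int w, int h, const int16_t taps[6]) {
      int x, y, k;
      for (y = 0; y < h; ++y)
        for (x = 0; x < w; ++x) {
          int sum = 64;
          for (k = 0; k < 6; ++k)
            sum += mid[(y + k) * mid_pitch + x] * taps[k];
          dst[y * dst_pitch + x] = clamp8(sum >> VP9_FILTER_SHIFT);
        }
    }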
- - - movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2 - pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers. - paddsw mm3, mm4 ; mm3 += mm4 - - movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0 - pmullw mm4, mm2 ; mm4 *= kernel 2 modifiers. - paddsw mm3, mm4 ; mm3 += mm4 - - movq mm4, [rsi] ; mm4 = p0..p3 = row -2 - pmullw mm4, [rbx] ; mm4 *= kernel 0 modifiers. - paddsw mm3, mm4 ; mm3 += mm4 - - - add rsi, rdx ; move source forward 1 line to avoid 3 * pitch - movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1 - pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers. - paddsw mm3, mm4 ; mm3 += mm4 - - movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3 - pmullw mm4, [rbx +80] ; mm4 *= kernel 3 modifiers. - paddsw mm3, mm4 ; mm3 += mm4 - - - paddsw mm3, mm5 ; mm3 += round value - psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128 - packuswb mm3, mm0 ; pack and saturate - - movd [rdi],mm3 ; store the results in the destination - ; the subsequent iterations repeat 3 out of 4 of these reads. Since the - ; recon block should be in cache this shouldn't cost much. Its obviously - ; avoidable!!!. - lea rdi, [rdi+rax] ; - dec rcx ; decrement count - jnz .nextrow_cv ; next row - - pop rbx - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -rd: - times 4 dw 0x40 - -align 16 -global HIDDEN_DATA(sym(vp9_six_tap_mmx)) -sym(vp9_six_tap_mmx): - times 8 dw 0 - times 8 dw 0 - times 8 dw 128 - times 8 dw 0 - times 8 dw 0 - times 8 dw 0 - - times 8 dw 0 - times 8 dw -6 - times 8 dw 123 - times 8 dw 12 - times 8 dw -1 - times 8 dw 0 - - times 8 dw 2 - times 8 dw -11 - times 8 dw 108 - times 8 dw 36 - times 8 dw -8 - times 8 dw 1 - - times 8 dw 0 - times 8 dw -9 - times 8 dw 93 - times 8 dw 50 - times 8 dw -6 - times 8 dw 0 - - times 8 dw 3 - times 8 dw -16 - times 8 dw 77 - times 8 dw 77 - times 8 dw -16 - times 8 dw 3 - - times 8 dw 0 - times 8 dw -6 - times 8 dw 50 - times 8 dw 93 - times 8 dw -9 - times 8 dw 0 - - times 8 dw 1 - times 8 dw -8 - times 8 dw 36 - times 8 dw 108 - times 8 dw -11 - times 8 dw 2 - - times 8 dw 0 - times 8 dw -1 - times 8 dw 12 - times 8 dw 123 - times 8 dw -6 - times 8 dw 0 - diff --git a/vp9/common/x86/vp9_subpixel_sse2.asm b/vp9/common/x86/vp9_subpixel_sse2.asm deleted file mode 100644 index b0c4f1282..000000000 --- a/vp9/common/x86/vp9_subpixel_sse2.asm +++ /dev/null @@ -1,1372 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -%define BLOCK_HEIGHT_WIDTH 4 -%define VP9_FILTER_WEIGHT 128 -%define VP9_FILTER_SHIFT 7 - - -;/************************************************************************************ -; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The -; input pixel array has output_height rows. This routine assumes that output_height is an -; even number. This function handles 8 pixels in horizontal direction, calculating ONE -; rows each iteration to take advantage of the 128 bits operations. 
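[Editor's note] The vp9_six_tap_mmx table deleted above stores every coefficient repeated eight times for SIMD use. Collapsed to one copy per tap, all eight subpel rows sum to 128, i.e. unity gain after the >> 7 in the kernels; a quick check (hypothetical test code, not from the tree):

    #include <assert.h>

    static const short six_tap[8][6] = {
      { 0,   0, 128,   0,   0,  0 },   /* full-pel: pass-through */
      { 0,  -6, 123,  12,  -1,  0 },
      { 2, -11, 108,  36,  -8,  1 },
      { 0,  -9,  93,  50,  -6,  0 },
      { 3, -16,  77,  77, -16,  3 },   /* half-pel: symmetric */
      { 0,  -6,  50,  93,  -9,  0 },
      { 1,  -8,  36, 108, -11,  2 },
      { 0,  -1,  12, 123,  -6,  0 }
    };

    int main(void) {
      int i, k, sum;
      for (i = 0; i < 8; ++i) {
        for (sum = 0, k = 0; k < 6; ++k)
          sum += six_tap[i][k];
        assert(sum == 128);
      }
      return 0;
    }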
-;*************************************************************************************/ -;void vp9_filter_block1d8_h6_sse2 -;( -; unsigned char *src_ptr, -; unsigned short *output_ptr, -; unsigned int src_pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; short *vp9_filter -;) -global sym(vp9_filter_block1d8_h6_sse2) PRIVATE -sym(vp9_filter_block1d8_h6_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdx, arg(6) ;vp9_filter - mov rsi, arg(0) ;src_ptr - - mov rdi, arg(1) ;output_ptr - - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(5) ;output_width -%endif - pxor xmm0, xmm0 ; clear xmm0 for unpack - -.filter_block1d8_h6_rowloop: - movq xmm3, MMWORD PTR [rsi - 2] - movq xmm1, MMWORD PTR [rsi + 6] - - prefetcht2 [rsi+rax-2] - - pslldq xmm1, 8 - por xmm1, xmm3 - - movdqa xmm4, xmm1 - movdqa xmm5, xmm1 - - movdqa xmm6, xmm1 - movdqa xmm7, xmm1 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm1 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 - punpcklbw xmm4, xmm0 - - movdqa XMMWORD Ptr [rdi], xmm4 - lea rsi, [rsi + rax] - -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(5) ;[output_width] -%else - add rdi, r8 -%endif - dec rcx - - jnz .filter_block1d8_h6_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1d16_h6_sse2 -;( -; unsigned char *src_ptr, -; unsigned short *output_ptr, -; unsigned int src_pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; short *vp9_filter -;) -;/************************************************************************************ -; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The -; input pixel array has output_height rows. This routine assumes that output_height is an -; even number. This function handles 8 pixels in horizontal direction, calculating ONE -; rows each iteration to take advantage of the 128 bits operations. 
-;*************************************************************************************/ -global sym(vp9_filter_block1d16_h6_sse2) PRIVATE -sym(vp9_filter_block1d16_h6_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdx, arg(6) ;vp9_filter - mov rsi, arg(0) ;src_ptr - - mov rdi, arg(1) ;output_ptr - - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(5) ;output_width -%endif - - pxor xmm0, xmm0 ; clear xmm0 for unpack - -.filter_block1d16_h6_sse2_rowloop: - movq xmm3, MMWORD PTR [rsi - 2] - movq xmm1, MMWORD PTR [rsi + 6] - - movq xmm2, MMWORD PTR [rsi +14] - pslldq xmm2, 8 - - por xmm2, xmm1 - prefetcht2 [rsi+rax-2] - - pslldq xmm1, 8 - por xmm1, xmm3 - - movdqa xmm4, xmm1 - movdqa xmm5, xmm1 - - movdqa xmm6, xmm1 - movdqa xmm7, xmm1 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm1 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 - punpcklbw xmm4, xmm0 - - movdqa XMMWORD Ptr [rdi], xmm4 - - movdqa xmm3, xmm2 - movdqa xmm4, xmm2 - - movdqa xmm5, xmm2 - movdqa xmm6, xmm2 - - movdqa xmm7, xmm2 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm2 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb 
xmm4, xmm0 - punpcklbw xmm4, xmm0 - - movdqa XMMWORD Ptr [rdi+16], xmm4 - - lea rsi, [rsi + rax] -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(5) ;[output_width] -%else - add rdi, r8 -%endif - - dec rcx - jnz .filter_block1d16_h6_sse2_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1d8_v6_sse2 -;( -; short *src_ptr, -; unsigned char *output_ptr, -; int dst_ptich, -; unsigned int pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; short * vp9_filter -;) -;/************************************************************************************ -; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The -; input pixel array has output_height rows. -;*************************************************************************************/ -global sym(vp9_filter_block1d8_v6_sse2) PRIVATE -sym(vp9_filter_block1d8_v6_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rax, arg(7) ;vp9_filter - movsxd rdx, dword ptr arg(3) ;pixels_per_line - - mov rdi, arg(1) ;output_ptr - mov rsi, arg(0) ;src_ptr - - sub rsi, rdx - sub rsi, rdx - - movsxd rcx, DWORD PTR arg(5) ;[output_height] - pxor xmm0, xmm0 ; clear xmm0 - - movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(2) ; dst_ptich -%endif - -.vp9_filter_block1d8_v6_sse2_loop: - movdqa xmm1, XMMWORD PTR [rsi] - pmullw xmm1, [rax] - - movdqa xmm2, XMMWORD PTR [rsi + rdx] - pmullw xmm2, [rax + 16] - - movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] - pmullw xmm3, [rax + 32] - - movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] - pmullw xmm5, [rax + 64] - - add rsi, rdx - movdqa xmm4, XMMWORD PTR [rsi + rdx * 2] - - pmullw xmm4, [rax + 48] - movdqa xmm6, XMMWORD PTR [rsi + rdx * 4] - - pmullw xmm6, [rax + 80] - - paddsw xmm2, xmm5 - paddsw xmm2, xmm3 - - paddsw xmm2, xmm1 - paddsw xmm2, xmm4 - - paddsw xmm2, xmm6 - paddsw xmm2, xmm7 - - psraw xmm2, 7 - packuswb xmm2, xmm0 ; pack and saturate - - movq QWORD PTR [rdi], xmm2 ; store the results in the destination -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(2) ;[dst_ptich] -%else - add rdi, r8 -%endif - dec rcx ; decrement count - jnz .vp9_filter_block1d8_v6_sse2_loop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1d16_v6_sse2 -;( -; unsigned short *src_ptr, -; unsigned char *output_ptr, -; int dst_ptich, -; unsigned int pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; const short *vp9_filter -;) -;/************************************************************************************ -; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The -; input pixel array has output_height rows. 
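[Editor's note] The kernel below reads its rows in tap order 2, 5, 3, 1, 4, 6 (per its own comment), presumably so the saturating paddsw chain never clips a partial sum: with the symmetric half-pel filter {3, -16, 77, 77, -16, 3} and all-255 input, plain 1..6 order would overflow int16 after the fourth tap (16320 + 19635 = 35955), while this order peaks at 32640, exactly the final value. A sketch with explicit saturation (illustrative helper, not from the tree):

    #include <stdint.h>

    static int16_t sat16(int32_t v) {
      return v > 32767 ? 32767 : v < -32768 ? -32768 : (int16_t)v;
    }

    /* Accumulate one sample in the 2,5,3,1,4,6 order used by the kernel.
     * row[k] is the pixel under tap k and taps[k] its coefficient, both
     * 1-based for readability (index 0 is unused). */
    static int16_t acc_253146(const int16_t row[7], const int16_t taps[7]) {
      static const int order[6] = { 2, 5, 3, 1, 4, 6 };
      int16_t sum = 0;
      int i;
      for (i = 0; i < 6; ++i)
        sum = sat16(sum + row[order[i]] * taps[order[i]]);
      return sum;
    }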
-;*************************************************************************************/ -global sym(vp9_filter_block1d16_v6_sse2) PRIVATE -sym(vp9_filter_block1d16_v6_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rax, arg(7) ;vp9_filter - movsxd rdx, dword ptr arg(3) ;pixels_per_line - - mov rdi, arg(1) ;output_ptr - mov rsi, arg(0) ;src_ptr - - sub rsi, rdx - sub rsi, rdx - - movsxd rcx, DWORD PTR arg(5) ;[output_height] -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(2) ; dst_ptich -%endif - -.vp9_filter_block1d16_v6_sse2_loop: -; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order. - movdqa xmm1, XMMWORD PTR [rsi + rdx] ; line 2 - movdqa xmm2, XMMWORD PTR [rsi + rdx + 16] - pmullw xmm1, [rax + 16] - pmullw xmm2, [rax + 16] - - movdqa xmm3, XMMWORD PTR [rsi + rdx * 4] ; line 5 - movdqa xmm4, XMMWORD PTR [rsi + rdx * 4 + 16] - pmullw xmm3, [rax + 64] - pmullw xmm4, [rax + 64] - - movdqa xmm5, XMMWORD PTR [rsi + rdx * 2] ; line 3 - movdqa xmm6, XMMWORD PTR [rsi + rdx * 2 + 16] - pmullw xmm5, [rax + 32] - pmullw xmm6, [rax + 32] - - movdqa xmm7, XMMWORD PTR [rsi] ; line 1 - movdqa xmm0, XMMWORD PTR [rsi + 16] - pmullw xmm7, [rax] - pmullw xmm0, [rax] - - paddsw xmm1, xmm3 - paddsw xmm2, xmm4 - paddsw xmm1, xmm5 - paddsw xmm2, xmm6 - paddsw xmm1, xmm7 - paddsw xmm2, xmm0 - - add rsi, rdx - - movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] ; line 4 - movdqa xmm4, XMMWORD PTR [rsi + rdx * 2 + 16] - pmullw xmm3, [rax + 48] - pmullw xmm4, [rax + 48] - - movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] ; line 6 - movdqa xmm6, XMMWORD PTR [rsi + rdx * 4 + 16] - pmullw xmm5, [rax + 80] - pmullw xmm6, [rax + 80] - - movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] - pxor xmm0, xmm0 ; clear xmm0 - - paddsw xmm1, xmm3 - paddsw xmm2, xmm4 - paddsw xmm1, xmm5 - paddsw xmm2, xmm6 - - paddsw xmm1, xmm7 - paddsw xmm2, xmm7 - - psraw xmm1, 7 - psraw xmm2, 7 - - packuswb xmm1, xmm2 ; pack and saturate - movdqa XMMWORD PTR [rdi], xmm1 ; store the results in the destination -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(2) ;[dst_ptich] -%else - add rdi, r8 -%endif - dec rcx ; decrement count - jnz .vp9_filter_block1d16_v6_sse2_loop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1d8_h6_only_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; int dst_ptich, -; unsigned int output_height, -; const short *vp9_filter -;) -; First-pass filter only when yoffset==0 -global sym(vp9_filter_block1d8_h6_only_sse2) PRIVATE -sym(vp9_filter_block1d8_h6_only_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdx, arg(5) ;vp9_filter - mov rsi, arg(0) ;src_ptr - - mov rdi, arg(2) ;output_ptr - - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(3) ;dst_ptich -%endif - pxor xmm0, xmm0 ; clear xmm0 for unpack - -.filter_block1d8_h6_only_rowloop: - movq xmm3, MMWORD PTR [rsi - 2] - movq xmm1, MMWORD PTR [rsi + 6] - - prefetcht2 [rsi+rax-2] - - pslldq xmm1, 8 - por xmm1, xmm3 - - movdqa xmm4, xmm1 - movdqa xmm5, xmm1 - - movdqa xmm6, xmm1 - movdqa xmm7, xmm1 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] 
* H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm1 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 - - movq QWORD PTR [rdi], xmm4 ; store the results in the destination - lea rsi, [rsi + rax] - -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(3) ;dst_ptich -%else - add rdi, r8 -%endif - dec rcx - - jnz .filter_block1d8_h6_only_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1d16_h6_only_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; int dst_ptich, -; unsigned int output_height, -; const short *vp9_filter -;) -; First-pass filter only when yoffset==0 -global sym(vp9_filter_block1d16_h6_only_sse2) PRIVATE -sym(vp9_filter_block1d16_h6_only_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdx, arg(5) ;vp9_filter - mov rsi, arg(0) ;src_ptr - - mov rdi, arg(2) ;output_ptr - - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(3) ;dst_ptich -%endif - - pxor xmm0, xmm0 ; clear xmm0 for unpack - -.filter_block1d16_h6_only_sse2_rowloop: - movq xmm3, MMWORD PTR [rsi - 2] - movq xmm1, MMWORD PTR [rsi + 6] - - movq xmm2, MMWORD PTR [rsi +14] - pslldq xmm2, 8 - - por xmm2, xmm1 - prefetcht2 [rsi+rax-2] - - pslldq xmm1, 8 - por xmm1, xmm3 - - movdqa xmm4, xmm1 - movdqa xmm5, xmm1 - - movdqa xmm6, xmm1 - movdqa xmm7, xmm1 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm1, [rdx+80] ; 
x[ 3] * h[ 3] ; Tap 6 - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm1 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 ; lower 8 bytes - - movq QWORD Ptr [rdi], xmm4 ; store the results in the destination - - movdqa xmm3, xmm2 - movdqa xmm4, xmm2 - - movdqa xmm5, xmm2 - movdqa xmm6, xmm2 - - movdqa xmm7, xmm2 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm2 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 ; higher 8 bytes - - movq QWORD Ptr [rdi+8], xmm4 ; store the results in the destination - - lea rsi, [rsi + rax] -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(3) ;dst_ptich -%else - add rdi, r8 -%endif - - dec rcx - jnz .filter_block1d16_h6_only_sse2_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1d8_v6_only_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; int dst_ptich, -; unsigned int output_height, -; const short *vp9_filter -;) -; Second-pass filter only when xoffset==0 -global sym(vp9_filter_block1d8_v6_only_sse2) PRIVATE -sym(vp9_filter_block1d8_v6_only_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - mov rax, arg(5) ;vp9_filter - - pxor xmm0, xmm0 ; clear xmm0 - - movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(3) ; dst_ptich -%endif - -.vp9_filter_block1d8_v6_only_sse2_loop: - movq xmm1, MMWORD PTR [rsi] - movq xmm2, MMWORD PTR [rsi + rdx] - movq xmm3, MMWORD PTR [rsi + rdx * 2] - movq xmm5, MMWORD PTR [rsi + rdx * 4] - add rsi, rdx - movq xmm4, MMWORD PTR [rsi + rdx * 2] - movq xmm6, MMWORD PTR [rsi + rdx * 4] - - punpcklbw xmm1, xmm0 - pmullw xmm1, [rax] - - punpcklbw xmm2, xmm0 - pmullw xmm2, [rax + 16] - - punpcklbw xmm3, xmm0 - pmullw xmm3, [rax + 32] - - punpcklbw xmm5, xmm0 - pmullw xmm5, [rax + 64] - - punpcklbw xmm4, xmm0 - pmullw xmm4, [rax + 48] - - punpcklbw xmm6, xmm0 - pmullw xmm6, [rax + 80] - - paddsw xmm2, xmm5 - paddsw xmm2, xmm3 - - paddsw xmm2, xmm1 - paddsw xmm2, xmm4 - - paddsw xmm2, xmm6 - paddsw xmm2, xmm7 - - psraw xmm2, 7 - packuswb xmm2, xmm0 ; pack 
and saturate - - movq QWORD PTR [rdi], xmm2 ; store the results in the destination -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;[dst_ptich] -%else - add rdi, r8 -%endif - dec rcx ; decrement count - jnz .vp9_filter_block1d8_v6_only_sse2_loop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_unpack_block1d16_h6_sse2 -;( -; unsigned char *src_ptr, -; unsigned short *output_ptr, -; unsigned int src_pixels_per_line, -; unsigned int output_height, -; unsigned int output_width -;) -global sym(vp9_unpack_block1d16_h6_sse2) PRIVATE -sym(vp9_unpack_block1d16_h6_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(1) ;output_ptr - - movsxd rcx, dword ptr arg(3) ;output_height - movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source - - pxor xmm0, xmm0 ; clear xmm0 for unpack -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(4) ;output_width ; Pitch for Source -%endif - -.unpack_block1d16_h6_sse2_rowloop: - movq xmm1, MMWORD PTR [rsi] ; 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2 - movq xmm3, MMWORD PTR [rsi+8] ; make copy of xmm1 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - punpcklbw xmm1, xmm0 - - movdqa XMMWORD Ptr [rdi], xmm1 - movdqa XMMWORD Ptr [rdi + 16], xmm3 - - lea rsi, [rsi + rax] -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(4) ;[output_width] -%else - add rdi, r8 -%endif - dec rcx - jnz .unpack_block1d16_h6_sse2_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_bilinear_predict16x16_sse2 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -extern sym(vp9_bilinear_filters_mmx) -global sym(vp9_bilinear_predict16x16_sse2) PRIVATE -sym(vp9_bilinear_predict16x16_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ;const short *HFilter = bilinear_filters_mmx[xoffset] - ;const short *VFilter = bilinear_filters_mmx[yoffset] - - lea rcx, [GLOBAL(sym(vp9_bilinear_filters_mmx))] - movsxd rax, dword ptr arg(2) ;xoffset - - cmp rax, 0 ;skip first_pass filter if xoffset=0 - je .b16x16_sp_only - - shl rax, 5 - add rax, rcx ;HFilter - - mov rdi, arg(4) ;dst_ptr - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(5) ;dst_pitch - - movdqa xmm1, [rax] - movdqa xmm2, [rax+16] - - movsxd rax, dword ptr arg(3) ;yoffset - - cmp rax, 0 ;skip second_pass filter if yoffset=0 - je .b16x16_fp_only - - shl rax, 5 - add rax, rcx ;VFilter - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - pxor xmm0, xmm0 - -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(5) ;dst_pitch -%endif - ; get the first horizontal line done - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - movdqa xmm7, xmm3 - packuswb xmm7, xmm4 - - add rsi, rdx 
; next line -.next_row: - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - movdqa xmm5, xmm7 - movdqa xmm6, xmm7 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, [rax] - pmullw xmm6, [rax] - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - movdqa xmm7, xmm3 - packuswb xmm7, xmm4 - - pmullw xmm3, [rax+16] - pmullw xmm4, [rax+16] - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add rsi, rdx ; next line -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(5) ;dst_pitch -%else - add rdi, r8 -%endif - - cmp rdi, rcx - jne .next_row - - jmp .done - -.b16x16_sp_only: - movsxd rax, dword ptr arg(3) ;yoffset - shl rax, 5 - add rax, rcx ;VFilter - - mov rdi, arg(4) ;dst_ptr - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(5) ;dst_pitch - - movdqa xmm1, [rax] - movdqa xmm2, [rax+16] - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - - pxor xmm0, xmm0 - - ; get the first horizontal line done - movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - - add rsi, rax ; next line -.next_row_spo: - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - - movdqa xmm5, xmm7 - movdqa xmm6, xmm7 - - movdqa xmm4, xmm3 ; make a copy of current line - movdqa xmm7, xmm3 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm5, xmm1 - pmullw xmm6, xmm1 - pmullw xmm3, xmm2 - pmullw xmm4, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add rsi, rax ; next line - add rdi, rdx ;dst_pitch - cmp rdi, rcx - jne .next_row_spo - - jmp .done - -.b16x16_fp_only: - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - pxor xmm0, xmm0 - -.next_row_fpo: - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add rsi, rax ; next line - add rdi, rdx ; dst_pitch - cmp rdi, rcx - jne .next_row_fpo - -.done: - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - 
ret - - -;void vp9_bilinear_predict8x8_sse2 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -extern sym(vp9_bilinear_filters_mmx) -global sym(vp9_bilinear_predict8x8_sse2) PRIVATE -sym(vp9_bilinear_predict8x8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 144 ; reserve 144 bytes - - ;const short *HFilter = bilinear_filters_mmx[xoffset] - ;const short *VFilter = bilinear_filters_mmx[yoffset] - lea rcx, [GLOBAL(sym(vp9_bilinear_filters_mmx))] - - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - ;Read 9-line unaligned data in and put them on stack. This gives a big - ;performance boost. - movdqu xmm0, [rsi] - lea rax, [rdx + rdx*2] - movdqu xmm1, [rsi+rdx] - movdqu xmm2, [rsi+rdx*2] - add rsi, rax - movdqu xmm3, [rsi] - movdqu xmm4, [rsi+rdx] - movdqu xmm5, [rsi+rdx*2] - add rsi, rax - movdqu xmm6, [rsi] - movdqu xmm7, [rsi+rdx] - - movdqa XMMWORD PTR [rsp], xmm0 - - movdqu xmm0, [rsi+rdx*2] - - movdqa XMMWORD PTR [rsp+16], xmm1 - movdqa XMMWORD PTR [rsp+32], xmm2 - movdqa XMMWORD PTR [rsp+48], xmm3 - movdqa XMMWORD PTR [rsp+64], xmm4 - movdqa XMMWORD PTR [rsp+80], xmm5 - movdqa XMMWORD PTR [rsp+96], xmm6 - movdqa XMMWORD PTR [rsp+112], xmm7 - movdqa XMMWORD PTR [rsp+128], xmm0 - - movsxd rax, dword ptr arg(2) ;xoffset - shl rax, 5 - add rax, rcx ;HFilter - - mov rdi, arg(4) ;dst_ptr - movsxd rdx, dword ptr arg(5) ;dst_pitch - - movdqa xmm1, [rax] - movdqa xmm2, [rax+16] - - movsxd rax, dword ptr arg(3) ;yoffset - shl rax, 5 - add rax, rcx ;VFilter - - lea rcx, [rdi+rdx*8] - - movdqa xmm5, [rax] - movdqa xmm6, [rax+16] - - pxor xmm0, xmm0 - - ; get the first horizontal line done - movdqa xmm3, XMMWORD PTR [rsp] - movdqa xmm4, xmm3 ; make a copy of current line - psrldq xmm4, 1 - - punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 - punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm2 - - paddw xmm3, xmm4 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - movdqa xmm7, xmm3 - add rsp, 16 ; next line -.next_row8x8: - movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - movdqa xmm4, xmm3 ; make a copy of current line - psrldq xmm4, 1 - - punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 - punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm2 - - paddw xmm3, xmm4 - pmullw xmm7, xmm5 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - movdqa xmm4, xmm3 - - pmullw xmm3, xmm6 - paddw xmm3, xmm7 - - movdqa xmm7, xmm4 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - packuswb xmm3, xmm0 - movq [rdi], xmm3 ; store the results in the destination - - add rsp, 16 ; next line - add rdi, rdx - - cmp rdi, rcx - jne .next_row8x8 - - ;add rsp, 144 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -SECTION_RODATA -align 16 -rd: - times 8 dw 0x40 diff --git a/vp9/common/x86/vp9_subpixel_ssse3.asm b/vp9/common/x86/vp9_subpixel_ssse3.asm deleted file mode 100644 index b260480e0..000000000 --- a/vp9/common/x86/vp9_subpixel_ssse3.asm +++ /dev/null @@ -1,1515 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
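[Editor's note] The two bilinear SSE2 predictors deleted above apply a two-tap pair from vp9_bilinear_filters_mmx in each direction, skipping the horizontal pass entirely when xoffset == 0 and the vertical pass when yoffset == 0. A scalar sketch of the generic two-pass path; the helper and buffer sizing are illustrative, the taps of each pair sum to 128, and the byte-repacking between passes in the SIMD version is not reproduced:

    #include <stdint.h>

    #define VP9_FILTER_SHIFT 7

    /* Generic path: horizontal 2-tap into a temp that keeps one extra
     * row, then vertical 2-tap into dst. hf/vf are the selected filter
     * pairs. Results fit in 8 bits, so no clamp is needed. */
    static void bilinear_ref(const uint8_t *src, int src_pitch,
                             uint8_t *dst, int dst_pitch, int w, int h,
                             const int16_t hf[2], const int16_t vf[2]) {
      uint16_t tmp[17 * 16];                 /* up to 16x16 plus one row */
      int x, y;
      for (y = 0; y < h + 1; ++y)            /* first pass: horizontal */
        for (x = 0; x < w; ++x) {
          int sum = src[y * src_pitch + x] * hf[0] +
                    src[y * src_pitch + x + 1] * hf[1] + 64;
          tmp[y * w + x] = (uint16_t)(sum >> VP9_FILTER_SHIFT);
        }
      for (y = 0; y < h; ++y)                /* second pass: vertical */
        for (x = 0; x < w; ++x) {
          int sum = tmp[y * w + x] * vf[0] +
                    tmp[(y + 1) * w + x] * vf[1] + 64;
          dst[y * dst_pitch + x] = (uint8_t)(sum >> VP9_FILTER_SHIFT);
        }
    }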
-; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -%define BLOCK_HEIGHT_WIDTH 4 -%define VP9_FILTER_WEIGHT 128 -%define VP9_FILTER_SHIFT 7 - - -;/************************************************************************************ -; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The -; input pixel array has output_height rows. This routine assumes that output_height is an -; even number. This function handles 8 pixels in horizontal direction, calculating ONE -; rows each iteration to take advantage of the 128 bits operations. -; -; This is an implementation of some of the SSE optimizations first seen in ffvp8 -; -;*************************************************************************************/ -;void vp9_filter_block1d8_h6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d8_h6_ssse3) PRIVATE -sym(vp9_filter_block1d8_h6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 - - movdqa xmm7, [GLOBAL(rd)] - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - mov rdi, arg(2) ;output_ptr - - cmp esi, DWORD PTR [rax] - je vp9_filter_block1d8_h4_ssse3 - - movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - - movsxd rdx, dword ptr arg(3) ;output_pitch - - sub rdi, rdx -;xmm3 free -.filter_block1d8_h6_rowloop_ssse3: - movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 - - movq xmm2, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 - - punpcklbw xmm0, xmm2 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 - - movdqa xmm1, xmm0 - pmaddubsw xmm0, xmm4 - - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf2bfrom1)] - - pshufb xmm2, [GLOBAL(shuf3bfrom1)] - pmaddubsw xmm1, xmm5 - - lea rdi, [rdi + rdx] - pmaddubsw xmm2, xmm6 - - lea rsi, [rsi + rax] - dec rcx - - paddsw xmm0, xmm1 - paddsw xmm2, xmm7 - - paddsw xmm0, xmm2 - - psraw xmm0, 7 - - packuswb xmm0, xmm0 - - movq MMWORD Ptr [rdi], xmm0 - jnz .filter_block1d8_h6_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -vp9_filter_block1d8_h4_ssse3: - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - - movdqa xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)] - movdqa xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)] - - mov rsi, arg(0) ;src_ptr - - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - - movsxd rdx, dword ptr arg(3) ;output_pitch - - sub rdi, rdx - -.filter_block1d8_h4_rowloop_ssse3: - movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 - - movq xmm1, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 - - punpcklbw xmm0, xmm1 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 - - movdqa xmm2, xmm0 - pshufb xmm0, xmm3 - - pshufb xmm2, xmm4 - pmaddubsw xmm0, xmm5 - - lea rdi, [rdi + rdx] - 
pmaddubsw xmm2, xmm6 - - lea rsi, [rsi + rax] - dec rcx - - paddsw xmm0, xmm7 - - paddsw xmm0, xmm2 - - psraw xmm0, 7 - - packuswb xmm0, xmm0 - - movq MMWORD Ptr [rdi], xmm0 - - jnz .filter_block1d8_h4_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret -;void vp9_filter_block1d16_h6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d16_h6_ssse3) PRIVATE -sym(vp9_filter_block1d16_h6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - - mov rdi, arg(2) ;output_ptr - - mov rsi, arg(0) ;src_ptr - - movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rdx, dword ptr arg(3) ;output_pitch - -.filter_block1d16_h6_rowloop_ssse3: - movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 - - movq xmm3, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 - - punpcklbw xmm0, xmm3 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 - - movdqa xmm1, xmm0 - pmaddubsw xmm0, xmm4 - - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf2bfrom1)] - - pshufb xmm2, [GLOBAL(shuf3bfrom1)] - movq xmm3, MMWORD PTR [rsi + 6] - - pmaddubsw xmm1, xmm5 - movq xmm7, MMWORD PTR [rsi + 11] - - pmaddubsw xmm2, xmm6 - punpcklbw xmm3, xmm7 - - paddsw xmm0, xmm1 - movdqa xmm1, xmm3 - - pmaddubsw xmm3, xmm4 - paddsw xmm0, xmm2 - - movdqa xmm2, xmm1 - paddsw xmm0, [GLOBAL(rd)] - - pshufb xmm1, [GLOBAL(shuf2bfrom1)] - pshufb xmm2, [GLOBAL(shuf3bfrom1)] - - psraw xmm0, 7 - pmaddubsw xmm1, xmm5 - - pmaddubsw xmm2, xmm6 - packuswb xmm0, xmm0 - - lea rsi, [rsi + rax] - paddsw xmm3, xmm1 - - paddsw xmm3, xmm2 - - paddsw xmm3, [GLOBAL(rd)] - - psraw xmm3, 7 - - packuswb xmm3, xmm3 - - punpcklqdq xmm0, xmm3 - - movdqa XMMWORD Ptr [rdi], xmm0 - - lea rdi, [rdi + rdx] - dec rcx - jnz .filter_block1d16_h6_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_filter_block1d4_h6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d4_h6_ssse3) PRIVATE -sym(vp9_filter_block1d4_h6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - movdqa xmm7, [GLOBAL(rd)] - - cmp esi, DWORD PTR [rax] - je .vp9_filter_block1d4_h4_ssse3 - - movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - - movsxd rdx, dword ptr arg(3) ;output_pitch - -;xmm3 free -.filter_block1d4_h6_rowloop_ssse3: - movdqu xmm0, XMMWORD PTR [rsi - 2] - - movdqa xmm1, xmm0 - pshufb xmm0, [GLOBAL(shuf1b)] - - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf2b)] 
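[Editor's note] All of the SSSE3 kernels in this file, like the new 8-tap macros above, lean on pmaddubsw, which multiplies unsigned bytes from one operand by signed bytes from the other and saturating-adds adjacent products into 16-bit lanes; that is why the taps are pre-packed into signed-byte pairs (k0_k5, k1_k3, k2_k4 here; k0k1..k6k7 via packsswb in the 8-tap code) and the pixels interleaved with pshufb. A lane-level emulation for reference (hypothetical helper, not from the tree):

    #include <stdint.h>

    static int16_t sat16(int32_t v) {
      return v > 32767 ? 32767 : v < -32768 ? -32768 : (int16_t)v;
    }

    /* Emulate one 128-bit pmaddubsw:
     * dst[i] = sat16(a[2i]*b[2i] + a[2i+1]*b[2i+1]),
     * with a treated as unsigned bytes and b as signed bytes. */
    static void pmaddubsw_ref(const uint8_t a[16], const int8_t b[16],
                              int16_t dst[8]) {
      int i;
      for (i = 0; i < 8; ++i)
        dst[i] = sat16((int32_t)a[2 * i] * b[2 * i] +
                       (int32_t)a[2 * i + 1] * b[2 * i + 1]);
    }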
- pmaddubsw xmm0, xmm4 - pshufb xmm2, [GLOBAL(shuf3b)] - pmaddubsw xmm1, xmm5 - -;-- - pmaddubsw xmm2, xmm6 - - lea rsi, [rsi + rax] -;-- - paddsw xmm0, xmm1 - paddsw xmm0, xmm7 - pxor xmm1, xmm1 - paddsw xmm0, xmm2 - psraw xmm0, 7 - packuswb xmm0, xmm0 - - movd DWORD PTR [rdi], xmm0 - - add rdi, rdx - dec rcx - jnz .filter_block1d4_h6_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -.vp9_filter_block1d4_h4_ssse3: - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - movdqa xmm0, XMMWORD PTR [GLOBAL(shuf2b)] - movdqa xmm3, XMMWORD PTR [GLOBAL(shuf3b)] - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - - movsxd rdx, dword ptr arg(3) ;output_pitch - -.filter_block1d4_h4_rowloop_ssse3: - movdqu xmm1, XMMWORD PTR [rsi - 2] - - movdqa xmm2, xmm1 - pshufb xmm1, xmm0 ;;[GLOBAL(shuf2b)] - pshufb xmm2, xmm3 ;;[GLOBAL(shuf3b)] - pmaddubsw xmm1, xmm5 - -;-- - pmaddubsw xmm2, xmm6 - - lea rsi, [rsi + rax] -;-- - paddsw xmm1, xmm7 - paddsw xmm1, xmm2 - psraw xmm1, 7 - packuswb xmm1, xmm1 - - movd DWORD PTR [rdi], xmm1 - - add rdi, rdx - dec rcx - jnz .filter_block1d4_h4_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - - -;void vp9_filter_block1d16_v6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d16_v6_ssse3) PRIVATE -sym(vp9_filter_block1d16_v6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - - cmp esi, DWORD PTR [rax] - je .vp9_filter_block1d16_v4_ssse3 - - movdqa xmm5, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - mov rdi, arg(2) ;output_ptr - -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ;out_pitch -%endif - mov rax, rsi - movsxd rcx, DWORD PTR arg(4) ;output_height - add rax, rdx - - -.vp9_filter_block1d16_v6_ssse3_loop: - movq xmm1, MMWORD PTR [rsi] ;A - movq xmm2, MMWORD PTR [rsi + rdx] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - movq xmm0, MMWORD PTR [rax + rdx * 4] ;F - - pmaddubsw xmm3, xmm6 - punpcklbw xmm1, xmm0 ;A F - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm5 - - paddsw xmm2, xmm3 - paddsw xmm2, xmm1 - paddsw xmm2, [GLOBAL(rd)] - psraw xmm2, 7 - packuswb xmm2, xmm2 - - movq MMWORD PTR [rdi], xmm2 ;store the results - - movq xmm1, MMWORD PTR [rsi + 8] ;A - movq xmm2, MMWORD PTR [rsi + rdx + 8] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2 + 8] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - movq xmm0, MMWORD PTR [rax + rdx * 4 + 8] ;F - pmaddubsw xmm3, xmm6 - punpcklbw xmm1, xmm0 ;A F - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm5 - - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw xmm2, xmm3 - paddsw xmm2, xmm1 - paddsw xmm2, [GLOBAL(rd)] - psraw xmm2, 7 - packuswb 
xmm2, xmm2 - - movq MMWORD PTR [rdi+8], xmm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;out_pitch -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d16_v6_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -.vp9_filter_block1d16_v4_ssse3: - movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - mov rdi, arg(2) ;output_ptr - -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ;out_pitch -%endif - mov rax, rsi - movsxd rcx, DWORD PTR arg(4) ;output_height - add rax, rdx - -.vp9_filter_block1d16_v4_ssse3_loop: - movq xmm2, MMWORD PTR [rsi + rdx] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - pmaddubsw xmm3, xmm6 - pmaddubsw xmm2, xmm7 - movq xmm5, MMWORD PTR [rsi + rdx + 8] ;B - movq xmm1, MMWORD PTR [rsi + rdx * 2 + 8] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E - - paddsw xmm2, [GLOBAL(rd)] - paddsw xmm2, xmm3 - psraw xmm2, 7 - packuswb xmm2, xmm2 - - punpcklbw xmm5, xmm4 ;B D - punpcklbw xmm1, xmm0 ;C E - - pmaddubsw xmm1, xmm6 - pmaddubsw xmm5, xmm7 - - movdqa xmm4, [GLOBAL(rd)] - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw xmm5, xmm1 - paddsw xmm5, xmm4 - psraw xmm5, 7 - packuswb xmm5, xmm5 - - punpcklqdq xmm2, xmm5 - - movdqa XMMWORD PTR [rdi], xmm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;out_pitch -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d16_v4_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_filter_block1d8_v6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d8_v6_ssse3) PRIVATE -sym(vp9_filter_block1d8_v6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - mov rdi, arg(2) ;output_ptr -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ; out_pitch -%endif - movsxd rcx, DWORD PTR arg(4) ;[output_height] - - cmp esi, DWORD PTR [rax] - je .vp9_filter_block1d8_v4_ssse3 - - movdqa xmm5, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - - mov rax, rsi - add rax, rdx - -.vp9_filter_block1d8_v6_ssse3_loop: - movq xmm1, MMWORD PTR [rsi] ;A - movq xmm2, MMWORD PTR [rsi + rdx] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - movq xmm0, MMWORD PTR [rax + rdx * 4] ;F - movdqa xmm4, [GLOBAL(rd)] - - pmaddubsw xmm3, xmm6 - punpcklbw xmm1, xmm0 ;A F - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm5 - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw xmm2, xmm3 - paddsw xmm2, xmm1 - paddsw xmm2, xmm4 - psraw xmm2, 7 - packuswb xmm2, xmm2 - - movq MMWORD PTR [rdi], xmm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;[out_pitch] -%else - add rdi, r8 -%endif - dec rcx - jnz 
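
The vertical loops above read six source rows (labelled A through F in the comments) and feed pmaddubsw three interleaved pairs: (A,F) against k0_k5, (B,D) against k1_k3, and (C,E) against k2_k4, so each lane accumulates the full 6-tap sum; the second base pointer (rax = rsi + pitch) lets rows D and F be addressed without a times-three or times-five scale. The same output row in scalar C, with illustrative names:

    #include <stdint.h>

    static uint8_t clamp255(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

    /* One output row of the v6 loops: rows A..F sit at consecutive
     * multiples of the source pitch. */
    static void filter_v6_ref(const uint8_t *src, int pitch, uint8_t *dst,
                              int width, const int16_t t[6]) {
      for (int x = 0; x < width; ++x) {
        const int a = src[x],             b = src[x + pitch];
        const int c = src[x + 2 * pitch], d = src[x + 3 * pitch];
        const int e = src[x + 4 * pitch], f = src[x + 5 * pitch];
        int sum = (a * t[0] + f * t[5])   /* pmaddubsw vs k0_k5 */
                + (b * t[1] + d * t[3])   /* pmaddubsw vs k1_k3 */
                + (c * t[2] + e * t[4])   /* pmaddubsw vs k2_k4 */
                + 64;                     /* rd */
        dst[x] = clamp255(sum >> 7);
      }
    }
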
.vp9_filter_block1d8_v6_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -.vp9_filter_block1d8_v4_ssse3: - movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 - movdqa xmm5, [GLOBAL(rd)] - - mov rsi, arg(0) ;src_ptr - - mov rax, rsi - add rax, rdx - -.vp9_filter_block1d8_v4_ssse3_loop: - movq xmm2, MMWORD PTR [rsi + rdx] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - pmaddubsw xmm3, xmm6 - pmaddubsw xmm2, xmm7 - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw xmm2, xmm3 - paddsw xmm2, xmm5 - psraw xmm2, 7 - packuswb xmm2, xmm2 - - movq MMWORD PTR [rdi], xmm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;[out_pitch] -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d8_v4_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret -;void vp9_filter_block1d4_v6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d4_v6_ssse3) PRIVATE -sym(vp9_filter_block1d4_v6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - mov rdi, arg(2) ;output_ptr -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ; out_pitch -%endif - movsxd rcx, DWORD PTR arg(4) ;[output_height] - - cmp esi, DWORD PTR [rax] - je .vp9_filter_block1d4_v4_ssse3 - - movq mm5, MMWORD PTR [rax] ;k0_k5 - movq mm6, MMWORD PTR [rax+256] ;k2_k4 - movq mm7, MMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - - mov rax, rsi - add rax, rdx - -.vp9_filter_block1d4_v6_ssse3_loop: - movd mm1, DWORD PTR [rsi] ;A - movd mm2, DWORD PTR [rsi + rdx] ;B - movd mm3, DWORD PTR [rsi + rdx * 2] ;C - movd mm4, DWORD PTR [rax + rdx * 2] ;D - movd mm0, DWORD PTR [rsi + rdx * 4] ;E - - punpcklbw mm2, mm4 ;B D - punpcklbw mm3, mm0 ;C E - - movd mm0, DWORD PTR [rax + rdx * 4] ;F - - movq mm4, [GLOBAL(rd)] - - pmaddubsw mm3, mm6 - punpcklbw mm1, mm0 ;A F - pmaddubsw mm2, mm7 - pmaddubsw mm1, mm5 - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw mm2, mm3 - paddsw mm2, mm1 - paddsw mm2, mm4 - psraw mm2, 7 - packuswb mm2, mm2 - - movd DWORD PTR [rdi], mm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;[out_pitch] -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d4_v6_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -.vp9_filter_block1d4_v4_ssse3: - movq mm6, MMWORD PTR [rax+256] ;k2_k4 - movq mm7, MMWORD PTR [rax+128] ;k1_k3 - movq mm5, MMWORD PTR [GLOBAL(rd)] - - mov rsi, arg(0) ;src_ptr - - mov rax, rsi - add rax, rdx - -.vp9_filter_block1d4_v4_ssse3_loop: - movd mm2, DWORD PTR [rsi + rdx] ;B - movd mm3, DWORD PTR [rsi + rdx * 2] ;C - movd mm4, DWORD PTR [rax + rdx * 2] ;D - movd mm0, DWORD PTR [rsi + rdx * 4] ;E - - punpcklbw mm2, mm4 ;B D - punpcklbw mm3, mm0 ;C E - - pmaddubsw mm3, mm6 - pmaddubsw mm2, mm7 - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw mm2, mm3 - paddsw mm2, mm5 - psraw mm2, 7 - packuswb mm2, mm2 - - movd DWORD PTR [rdi], mm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) 
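
All of these loops accumulate with paddsw, a signed 16-bit saturating add, and the pairing of taps is what keeps each pmaddubsw partial product in range: row 1 of the tables pairs 123 with -1 (worst case 255 * 123 = 31365, inside int16), whereas pairing 123 with 12 could reach 255 * 135 = 34425 and saturate inside the multiply-add, corrupting the sum. A one-function model of the paddsw lane semantics (intrinsics code would use _mm_adds_epi16):

    #include <stdint.h>

    /* 16-bit signed saturating add, as paddsw performs per lane. */
    static int16_t adds_epi16(int16_t a, int16_t b) {
      const int32_t s = (int32_t)a + b;
      if (s > INT16_MAX) return INT16_MAX;
      if (s < INT16_MIN) return INT16_MIN;
      return (int16_t)s;
    }
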
;[out_pitch] -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d4_v4_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_bilinear_predict16x16_ssse3 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp9_bilinear_predict16x16_ssse3) PRIVATE -sym(vp9_bilinear_predict16x16_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - lea rcx, [GLOBAL(bilinear_filters_ssse3)] - movsxd rax, dword ptr arg(2) ; xoffset - - cmp rax, 0 ; skip first_pass filter if xoffset=0 - je .b16x16_sp_only - - shl rax, 4 - lea rax, [rax + rcx] ; HFilter - - mov rdi, arg(4) ; dst_ptr - mov rsi, arg(0) ; src_ptr - movsxd rdx, dword ptr arg(5) ; dst_pitch - - movdqa xmm1, [rax] - - movsxd rax, dword ptr arg(3) ; yoffset - - cmp rax, 0 ; skip second_pass filter if yoffset=0 - je .b16x16_fp_only - - shl rax, 4 - lea rax, [rax + rcx] ; VFilter - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rdx, dword ptr arg(1) ; src_pixels_per_line - - movdqa xmm2, [rax] - -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(5) ; dst_pitch -%endif - movq xmm3, [rsi] ; 00 01 02 03 04 05 06 07 - movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08 - - punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08 - movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15 - - movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16 - - lea rsi, [rsi + rdx] ; next line - - pmaddubsw xmm3, xmm1 ; 00 02 04 06 08 10 12 14 - - punpcklbw xmm4, xmm5 ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16 - pmaddubsw xmm4, xmm1 ; 01 03 05 07 09 11 13 15 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value - psraw xmm4, VP9_FILTER_SHIFT ; xmm4 /= 128 - - movdqa xmm7, xmm3 - packuswb xmm7, xmm4 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - -.next_row: - movq xmm6, [rsi] ; 00 01 02 03 04 05 06 07 - movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08 - - punpcklbw xmm6, xmm5 - movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15 - - movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16 - lea rsi, [rsi + rdx] ; next line - - pmaddubsw xmm6, xmm1 - - punpcklbw xmm4, xmm5 - pmaddubsw xmm4, xmm1 - - paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value - psraw xmm6, VP9_FILTER_SHIFT ; xmm6 /= 128 - - paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value - psraw xmm4, VP9_FILTER_SHIFT ; xmm4 /= 128 - - packuswb xmm6, xmm4 - movdqa xmm5, xmm7 - - punpcklbw xmm5, xmm6 - pmaddubsw xmm5, xmm2 - - punpckhbw xmm7, xmm6 - pmaddubsw xmm7, xmm2 - - paddw xmm5, [GLOBAL(rd)] ; xmm5 += round value - psraw xmm5, VP9_FILTER_SHIFT ; xmm5 /= 128 - - paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value - psraw xmm7, VP9_FILTER_SHIFT ; xmm7 /= 128 - - packuswb xmm5, xmm7 - movdqa xmm7, xmm6 - - movdqa [rdi], xmm5 ; store the results in the destination -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(5) ; dst_pitch -%else - add rdi, r8 -%endif - - cmp rdi, rcx - jne .next_row - - jmp .done - -.b16x16_sp_only: - movsxd rax, dword ptr arg(3) ; yoffset - shl rax, 4 - lea rax, [rax + rcx] ; VFilter - - mov rdi, arg(4) ; dst_ptr - mov rsi, arg(0) ; src_ptr - movsxd rdx, dword ptr arg(5) ; dst_pitch - - movdqa xmm1, [rax] ; VFilter - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ; src_pixels_per_line - - ; get the first horizontal line done - movq xmm4, [rsi] ; load row 0 - movq xmm2, [rsi + 
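
The bilinear path above is a classic two-pass interpolation: each first-pass row blends horizontally neighbouring pixels with one row of the bilinear table (two weights summing to 128), and the second pass blends consecutive first-pass rows with the yoffset row of the same table, re-rounding after each pass. A scalar sketch under those assumptions, with w limited to 16 to keep the scratch rows small:

    #include <stdint.h>

    #define FILTER_SHIFT 7                    /* VP9_FILTER_SHIFT */
    #define ROUND (1 << (FILTER_SHIFT - 1))   /* the rd constant, 64 */

    /* Two-pass bilinear: horizontal blend of (x, x+1), then vertical
     * blend of consecutive first-pass rows. hf/vf each sum to 128, so
     * no final clamp is needed. Requires w <= 16. */
    static void bilinear_ref(const uint8_t *src, int src_stride,
                             uint8_t *dst, int dst_stride, int w, int h,
                             const uint8_t hf[2], const uint8_t vf[2]) {
      uint8_t rows[2][16];
      uint8_t *prev = rows[0], *cur = rows[1];
      for (int x = 0; x < w; ++x)             /* prime with row 0 */
        prev[x] = (src[x] * hf[0] + src[x + 1] * hf[1] + ROUND) >> FILTER_SHIFT;
      src += src_stride;
      for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x)
          cur[x] = (src[x] * hf[0] + src[x + 1] * hf[1] + ROUND) >> FILTER_SHIFT;
        for (int x = 0; x < w; ++x)
          dst[x] = (prev[x] * vf[0] + cur[x] * vf[1] + ROUND) >> FILTER_SHIFT;
        uint8_t *t = prev; prev = cur; cur = t; /* roll the row pair */
        src += src_stride;
        dst += dst_stride;
      }
    }
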
8] ; load row 0 - - lea rsi, [rsi + rax] ; next line -.next_row_sp: - movq xmm3, [rsi] ; load row + 1 - movq xmm5, [rsi + 8] ; load row + 1 - - punpcklbw xmm4, xmm3 - punpcklbw xmm2, xmm5 - - pmaddubsw xmm4, xmm1 - movq xmm7, [rsi + rax] ; load row + 2 - - pmaddubsw xmm2, xmm1 - movq xmm6, [rsi + rax + 8] ; load row + 2 - - punpcklbw xmm3, xmm7 - punpcklbw xmm5, xmm6 - - pmaddubsw xmm3, xmm1 - paddw xmm4, [GLOBAL(rd)] - - pmaddubsw xmm5, xmm1 - paddw xmm2, [GLOBAL(rd)] - - psraw xmm4, VP9_FILTER_SHIFT - psraw xmm2, VP9_FILTER_SHIFT - - packuswb xmm4, xmm2 - paddw xmm3, [GLOBAL(rd)] - - movdqa [rdi], xmm4 ; store row 0 - paddw xmm5, [GLOBAL(rd)] - - psraw xmm3, VP9_FILTER_SHIFT - psraw xmm5, VP9_FILTER_SHIFT - - packuswb xmm3, xmm5 - movdqa xmm4, xmm7 - - movdqa [rdi + rdx],xmm3 ; store row 1 - lea rsi, [rsi + 2*rax] - - movdqa xmm2, xmm6 - lea rdi, [rdi + 2*rdx] - - cmp rdi, rcx - jne .next_row_sp - - jmp .done - -.b16x16_fp_only: - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ; src_pixels_per_line - -.next_row_fp: - movq xmm2, [rsi] ; 00 01 02 03 04 05 06 07 - movq xmm4, [rsi+1] ; 01 02 03 04 05 06 07 08 - - punpcklbw xmm2, xmm4 - movq xmm3, [rsi+8] ; 08 09 10 11 12 13 14 15 - - pmaddubsw xmm2, xmm1 - movq xmm4, [rsi+9] ; 09 10 11 12 13 14 15 16 - - lea rsi, [rsi + rax] ; next line - punpcklbw xmm3, xmm4 - - pmaddubsw xmm3, xmm1 - movq xmm5, [rsi] - - paddw xmm2, [GLOBAL(rd)] - movq xmm7, [rsi+1] - - movq xmm6, [rsi+8] - psraw xmm2, VP9_FILTER_SHIFT - - punpcklbw xmm5, xmm7 - movq xmm7, [rsi+9] - - paddw xmm3, [GLOBAL(rd)] - pmaddubsw xmm5, xmm1 - - psraw xmm3, VP9_FILTER_SHIFT - punpcklbw xmm6, xmm7 - - packuswb xmm2, xmm3 - pmaddubsw xmm6, xmm1 - - movdqa [rdi], xmm2 ; store the results in the destination - paddw xmm5, [GLOBAL(rd)] - - lea rdi, [rdi + rdx] ; dst_pitch - psraw xmm5, VP9_FILTER_SHIFT - - paddw xmm6, [GLOBAL(rd)] - psraw xmm6, VP9_FILTER_SHIFT - - packuswb xmm5, xmm6 - lea rsi, [rsi + rax] ; next line - - movdqa [rdi], xmm5 ; store the results in the destination - lea rdi, [rdi + rdx] ; dst_pitch - - cmp rdi, rcx - - jne .next_row_fp - -.done: - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_bilinear_predict8x8_ssse3 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp9_bilinear_predict8x8_ssse3) PRIVATE -sym(vp9_bilinear_predict8x8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 144 ; reserve 144 bytes - - lea rcx, [GLOBAL(bilinear_filters_ssse3)] - - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - ;Read 9-line unaligned data in and put them on stack. This gives a big - ;performance boost. 
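
The .b16x16_sp_only and .b16x16_fp_only labels above handle the degenerate offsets: xoffset == 0 skips the horizontal pass entirely and yoffset == 0 skips the vertical one, saving a full pass over the block. In outline (stand-in names, not symbols from this file):

    static void two_pass(void) { /* horizontal, then vertical */ }
    static void h_only(void)   { /* .b16x16_fp_only */ }
    static void v_only(void)   { /* .b16x16_sp_only */ }

    static void bilinear_predict(int xoffset, int yoffset) {
      if (xoffset == 0)
        v_only();    /* also covers xoffset == yoffset == 0, since filter
                        row 0 is {128, 0}: a straight weighted copy */
      else if (yoffset == 0)
        h_only();
      else
        two_pass();
    }
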
- movdqu xmm0, [rsi] - lea rax, [rdx + rdx*2] - movdqu xmm1, [rsi+rdx] - movdqu xmm2, [rsi+rdx*2] - add rsi, rax - movdqu xmm3, [rsi] - movdqu xmm4, [rsi+rdx] - movdqu xmm5, [rsi+rdx*2] - add rsi, rax - movdqu xmm6, [rsi] - movdqu xmm7, [rsi+rdx] - - movdqa XMMWORD PTR [rsp], xmm0 - - movdqu xmm0, [rsi+rdx*2] - - movdqa XMMWORD PTR [rsp+16], xmm1 - movdqa XMMWORD PTR [rsp+32], xmm2 - movdqa XMMWORD PTR [rsp+48], xmm3 - movdqa XMMWORD PTR [rsp+64], xmm4 - movdqa XMMWORD PTR [rsp+80], xmm5 - movdqa XMMWORD PTR [rsp+96], xmm6 - movdqa XMMWORD PTR [rsp+112], xmm7 - movdqa XMMWORD PTR [rsp+128], xmm0 - - movsxd rax, dword ptr arg(2) ; xoffset - cmp rax, 0 ; skip first_pass filter if xoffset=0 - je .b8x8_sp_only - - shl rax, 4 - add rax, rcx ; HFilter - - mov rdi, arg(4) ; dst_ptr - movsxd rdx, dword ptr arg(5) ; dst_pitch - - movdqa xmm0, [rax] - - movsxd rax, dword ptr arg(3) ; yoffset - cmp rax, 0 ; skip second_pass filter if yoffset=0 - je .b8x8_fp_only - - shl rax, 4 - lea rax, [rax + rcx] ; VFilter - - lea rcx, [rdi+rdx*8] - - movdqa xmm1, [rax] - - ; get the first horizontal line done - movdqa xmm3, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - movdqa xmm5, xmm3 ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx - - psrldq xmm5, 1 - lea rsp, [rsp + 16] ; next line - - punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08 - pmaddubsw xmm3, xmm0 ; 00 02 04 06 08 10 12 14 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - movdqa xmm7, xmm3 - packuswb xmm7, xmm7 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - -.next_row: - movdqa xmm6, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - lea rsp, [rsp + 16] ; next line - - movdqa xmm5, xmm6 - - psrldq xmm5, 1 - - punpcklbw xmm6, xmm5 - pmaddubsw xmm6, xmm0 - - paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value - psraw xmm6, VP9_FILTER_SHIFT ; xmm6 /= 128 - - packuswb xmm6, xmm6 - - punpcklbw xmm7, xmm6 - pmaddubsw xmm7, xmm1 - - paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value - psraw xmm7, VP9_FILTER_SHIFT ; xmm7 /= 128 - - packuswb xmm7, xmm7 - - movq [rdi], xmm7 ; store the results in the destination - lea rdi, [rdi + rdx] - - movdqa xmm7, xmm6 - - cmp rdi, rcx - jne .next_row - - jmp .done8x8 - -.b8x8_sp_only: - movsxd rax, dword ptr arg(3) ; yoffset - shl rax, 4 - lea rax, [rax + rcx] ; VFilter - - mov rdi, arg(4) ;dst_ptr - movsxd rdx, dword ptr arg(5) ; dst_pitch - - movdqa xmm0, [rax] ; VFilter - - movq xmm1, XMMWORD PTR [rsp] - movq xmm2, XMMWORD PTR [rsp+16] - - movq xmm3, XMMWORD PTR [rsp+32] - punpcklbw xmm1, xmm2 - - movq xmm4, XMMWORD PTR [rsp+48] - punpcklbw xmm2, xmm3 - - movq xmm5, XMMWORD PTR [rsp+64] - punpcklbw xmm3, xmm4 - - movq xmm6, XMMWORD PTR [rsp+80] - punpcklbw xmm4, xmm5 - - movq xmm7, XMMWORD PTR [rsp+96] - punpcklbw xmm5, xmm6 - - pmaddubsw xmm1, xmm0 - pmaddubsw xmm2, xmm0 - - pmaddubsw xmm3, xmm0 - pmaddubsw xmm4, xmm0 - - pmaddubsw xmm5, xmm0 - punpcklbw xmm6, xmm7 - - pmaddubsw xmm6, xmm0 - paddw xmm1, [GLOBAL(rd)] - - paddw xmm2, [GLOBAL(rd)] - psraw xmm1, VP9_FILTER_SHIFT - - paddw xmm3, [GLOBAL(rd)] - psraw xmm2, VP9_FILTER_SHIFT - - paddw xmm4, [GLOBAL(rd)] - psraw xmm3, VP9_FILTER_SHIFT - - paddw xmm5, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - paddw xmm6, [GLOBAL(rd)] - psraw xmm5, VP9_FILTER_SHIFT - - psraw xmm6, VP9_FILTER_SHIFT - packuswb xmm1, xmm1 - - packuswb xmm2, xmm2 - movq [rdi], xmm1 - - packuswb xmm3, xmm3 - movq [rdi+rdx], xmm2 - - packuswb xmm4, xmm4 - movq xmm1, XMMWORD PTR [rsp+112] - - lea rdi, [rdi + 
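
The movdqu burst above implements the comment that precedes it: nine unaligned source rows (eight outputs plus one extra for the vertical blend) are staged once into an aligned stack area, so everything afterwards can use aligned movdqa accesses and simple rsp-relative addressing. The same staging idea in C, with an illustrative helper:

    #include <stdint.h>
    #include <string.h>

    /* Stage 9 source rows into aligned scratch; 16 bytes per row matches
     * the movdqa stores above. One unaligned pass in, aligned reads out. */
    static void stage_rows(const uint8_t *src, int src_stride,
                           uint8_t scratch[9][16]) {
      for (int r = 0; r < 9; ++r)
        memcpy(scratch[r], src + r * src_stride, 16);
    }
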
2*rdx] - movq xmm2, XMMWORD PTR [rsp+128] - - packuswb xmm5, xmm5 - movq [rdi], xmm3 - - packuswb xmm6, xmm6 - movq [rdi+rdx], xmm4 - - lea rdi, [rdi + 2*rdx] - punpcklbw xmm7, xmm1 - - movq [rdi], xmm5 - pmaddubsw xmm7, xmm0 - - movq [rdi+rdx], xmm6 - punpcklbw xmm1, xmm2 - - pmaddubsw xmm1, xmm0 - paddw xmm7, [GLOBAL(rd)] - - psraw xmm7, VP9_FILTER_SHIFT - paddw xmm1, [GLOBAL(rd)] - - psraw xmm1, VP9_FILTER_SHIFT - packuswb xmm7, xmm7 - - packuswb xmm1, xmm1 - lea rdi, [rdi + 2*rdx] - - movq [rdi], xmm7 - - movq [rdi+rdx], xmm1 - lea rsp, [rsp + 144] - - jmp .done8x8 - -.b8x8_fp_only: - lea rcx, [rdi+rdx*8] - -.next_row_fp: - movdqa xmm1, XMMWORD PTR [rsp] - movdqa xmm3, XMMWORD PTR [rsp+16] - - movdqa xmm2, xmm1 - movdqa xmm5, XMMWORD PTR [rsp+32] - - psrldq xmm2, 1 - movdqa xmm7, XMMWORD PTR [rsp+48] - - movdqa xmm4, xmm3 - psrldq xmm4, 1 - - movdqa xmm6, xmm5 - psrldq xmm6, 1 - - punpcklbw xmm1, xmm2 - pmaddubsw xmm1, xmm0 - - punpcklbw xmm3, xmm4 - pmaddubsw xmm3, xmm0 - - punpcklbw xmm5, xmm6 - pmaddubsw xmm5, xmm0 - - movdqa xmm2, xmm7 - psrldq xmm2, 1 - - punpcklbw xmm7, xmm2 - pmaddubsw xmm7, xmm0 - - paddw xmm1, [GLOBAL(rd)] - psraw xmm1, VP9_FILTER_SHIFT - - paddw xmm3, [GLOBAL(rd)] - psraw xmm3, VP9_FILTER_SHIFT - - paddw xmm5, [GLOBAL(rd)] - psraw xmm5, VP9_FILTER_SHIFT - - paddw xmm7, [GLOBAL(rd)] - psraw xmm7, VP9_FILTER_SHIFT - - packuswb xmm1, xmm1 - packuswb xmm3, xmm3 - - packuswb xmm5, xmm5 - movq [rdi], xmm1 - - packuswb xmm7, xmm7 - movq [rdi+rdx], xmm3 - - lea rdi, [rdi + 2*rdx] - movq [rdi], xmm5 - - lea rsp, [rsp + 4*16] - movq [rdi+rdx], xmm7 - - lea rdi, [rdi + 2*rdx] - cmp rdi, rcx - - jne .next_row_fp - - lea rsp, [rsp + 16] - -.done8x8: - ;add rsp, 144 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -shuf1b: - db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 -shuf2b: - db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11 -shuf3b: - db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10 - -align 16 -shuf2bfrom1: - db 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13 -align 16 -shuf3bfrom1: - db 2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11 - -align 16 -rd: - times 8 dw 0x40 - -align 16 -k0_k5: - times 8 db 0, 0 ;placeholder - times 8 db 0, 0 - times 8 db 2, 1 - times 8 db 0, 0 - times 8 db 3, 3 - times 8 db 0, 0 - times 8 db 1, 2 - times 8 db 0, 0 -k1_k3: - times 8 db 0, 0 ;placeholder - times 8 db -6, 12 - times 8 db -11, 36 - times 8 db -9, 50 - times 8 db -16, 77 - times 8 db -6, 93 - times 8 db -8, 108 - times 8 db -1, 123 -k2_k4: - times 8 db 128, 0 ;placeholder - times 8 db 123, -1 - times 8 db 108, -8 - times 8 db 93, -6 - times 8 db 77, -16 - times 8 db 50, -9 - times 8 db 36, -11 - times 8 db 12, -6 -align 16 -bilinear_filters_ssse3: - times 8 db 128, 0 - times 8 db 120, 8 - times 8 db 112, 16 - times 8 db 104, 24 - times 8 db 96, 32 - times 8 db 88, 40 - times 8 db 80, 48 - times 8 db 72, 56 - times 8 db 64, 64 - times 8 db 56, 72 - times 8 db 48, 80 - times 8 db 40, 88 - times 8 db 32, 96 - times 8 db 24, 104 - times 8 db 16, 112 - times 8 db 8, 120 - diff --git a/vp9/common/x86/vp9_subpixel_x86.h b/vp9/common/x86/vp9_subpixel_x86.h deleted file mode 100644 index 25bc26d9b..000000000 --- a/vp9/common/x86/vp9_subpixel_x86.h +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
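
The k0_k5 / k1_k3 / k2_k4 tables above are the usual 6-tap subpel coefficients re-packed into the byte pairs pmaddubsw consumes; row 1, for example, encodes the taps {0, -6, 123, 12, -1, 0}, and every row sums to 128, which is why adding rd (64) and shifting right by 7 gives unity gain. A small self-check in C, assuming that row:

    #include <assert.h>
    #include <stdint.h>

    /* Row 1 of the packed tables, unpacked back into 6-tap order. */
    static const int8_t taps1[6] = { 0, -6, 123, 12, -1, 0 };

    static void check_row1(void) {
      const int8_t k0_k5[2] = { taps1[0], taps1[5] };  /* {   0,  0 } */
      const int8_t k1_k3[2] = { taps1[1], taps1[3] };  /* {  -6, 12 } */
      const int8_t k2_k4[2] = { taps1[2], taps1[4] };  /* { 123, -1 } */
      int sum = 0;
      for (int i = 0; i < 6; ++i)
        sum += taps1[i];
      assert(sum == 128);           /* unity gain after +64 and >> 7 */
      (void)k0_k5; (void)k1_k3; (void)k2_k4;
    }
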
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP9_COMMON_X86_VP9_SUBPIXEL_X86_H_
-#define VP9_COMMON_X86_VP9_SUBPIXEL_X86_H_
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code
- */
-
-#if HAVE_MMX
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_mmx);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_mmx);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_mmx);
-extern prototype_subpixel_predict(vp9_sixtap_predict4x4_mmx);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_mmx);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_mmx
-
-#undef vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_mmx
-
-#undef vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_mmx
-
-#undef vp9_subpix_sixtap4x4
-#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_mmx
-
-#undef vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_mmx
-
-#endif
-#endif
-
-
-#if HAVE_SSE2
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_sse2);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_sse2);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_sse2);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_sse2);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_sse2);
-
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_sse2
-
-#undef vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_sse2
-
-#undef vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_sse2
-
-#undef vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_sse2
-
-#undef vp9_subpix_bilinear8x8
-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_sse2
-
-#endif
-#endif
-
-#if HAVE_SSSE3
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_ssse3);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_ssse3);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_ssse3);
-extern prototype_subpixel_predict(vp9_sixtap_predict4x4_ssse3);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_ssse3);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_ssse3);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_ssse3
-
-#undef vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_ssse3
-
-#undef vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_ssse3
-
-#undef vp9_subpix_sixtap4x4
-#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_ssse3
-
-
-#undef vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_ssse3
-
-#undef vp9_subpix_bilinear8x8
-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_ssse3
-
-#endif
-#endif
-
-
-
-#endif
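
The deleted header above gave static (non-RTCD) builds their dispatch purely at compile time: each section re-#undefs and re-#defines the vp9_subpix_* names, so the newest extension compiled in wins, and its note reminds maintainers to keep the runtime-detection function-pointer initialization in sync. For one symbol, the net effect is equivalent to this sketch:

    /* Equivalent shape of the header's ladder for a single symbol when
     * runtime CPU detection is compiled out. */
    #if !CONFIG_RUNTIME_CPU_DETECT
    # if HAVE_SSSE3
    #  define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_ssse3
    # elif HAVE_SSE2
    #  define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_sse2
    # elif HAVE_MMX
    #  define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_mmx
    # endif
    #endif
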