author | John Koleszar <jkoleszar@google.com> | 2013-04-12 15:33:04 -0700
---|---|---
committer | John Koleszar <jkoleszar@google.com> | 2013-04-16 06:49:46 -0700
commit | 7f7d1357a2732e0a1c36f3baded7dd14f449e535 (patch)
tree | 6bee68dd36c842cd700ee8f670d1380e37acd77d /vp9/common/x86
parent | 282c963923eb969c146d63e934bbece433a95282 (diff)
parent | 868ecb55a1528ca3f19286e7d1551572bf89b642 (diff)
Merge branch 'experimental' into master
VP9 preview bitstream 2, commit '868ecb55a1528ca3f19286e7d1551572bf89b642'
Conflicts:
vp9/vp9_common.mk
Change-Id: I3f0f6e692c987ff24f98ceafbb86cb9cf64ad8d3
Diffstat (limited to 'vp9/common/x86')
-rw-r--r-- | vp9/common/x86/vp9_asm_stubs.c | 818
-rw-r--r-- | vp9/common/x86/vp9_filter_sse2.c | 290
-rw-r--r-- | vp9/common/x86/vp9_filter_sse4.c | 362
-rw-r--r-- | vp9/common/x86/vp9_idct_sse2.asm (renamed from vp9/common/x86/vp9_idctllm_sse2.asm) | 0
-rw-r--r-- | vp9/common/x86/vp9_idct_x86.c | 1975
-rw-r--r-- | vp9/common/x86/vp9_idct_x86.h | 13
-rw-r--r-- | vp9/common/x86/vp9_idctllm_mmx.asm | 241
-rw-r--r-- | vp9/common/x86/vp9_loopfilter_intrin_sse2.c | 600
-rw-r--r-- | vp9/common/x86/vp9_postproc_mmx.asm | 4
-rw-r--r-- | vp9/common/x86/vp9_postproc_sse2.asm | 4
-rw-r--r-- | vp9/common/x86/vp9_subpixel_8t_ssse3.asm | 729
-rw-r--r-- | vp9/common/x86/vp9_subpixel_mmx.asm | 268
-rw-r--r-- | vp9/common/x86/vp9_subpixel_sse2.asm | 1372
-rw-r--r-- | vp9/common/x86/vp9_subpixel_ssse3.asm | 1515
-rw-r--r-- | vp9/common/x86/vp9_subpixel_x86.h | 109
15 files changed, 3161 insertions, 5139 deletions
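The largest piece of the diff below replaces the per-size six-tap predictors in vp9_asm_stubs.c with wrappers around a unified 8-tap convolve8 interface. The key geometry: the vertical 8-tap filter reads 3 rows above and 4 rows below each output row, so the two-pass path must run the horizontal filter over h + 7 rows starting 3 source rows early, and the fixed 16x23 scratch buffer is what forces the patch's new w <= 16 / h <= 16 asserts. A minimal scalar sketch of that shape (filter_h8, filter_v8, convolve8_sketch, clip8, and FILTER_BITS are illustrative names, not the library's; only the tap geometry and the Q7 rounding mirror the patch):

```c
#include <assert.h>
#include <stdint.h>

enum { FILTER_BITS = 7 };  /* VP9 subpel taps are Q7 and sum to 128 */

static uint8_t clip8(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

/* Hypothetical scalar stand-in for the SSSE3 horizontal kernels: 8 taps
 * at x-3..x+4, so the source frame is assumed to have border padding. */
static void filter_h8(const uint8_t *src, int src_stride, uint8_t *dst,
                      int dst_stride, int w, int h, const int16_t *f) {
  int x, y, k, sum;
  for (y = 0; y < h; ++y)
    for (x = 0; x < w; ++x) {
      for (sum = 0, k = 0; k < 8; ++k)
        sum += f[k] * src[y * src_stride + x - 3 + k];
      dst[y * dst_stride + x] = clip8((sum + 64) >> FILTER_BITS);
    }
}

/* Vertical stand-in; as in the diff, src points at the topmost tap row. */
static void filter_v8(const uint8_t *src, int src_stride, uint8_t *dst,
                      int dst_stride, int w, int h, const int16_t *f) {
  int x, y, k, sum;
  for (y = 0; y < h; ++y)
    for (x = 0; x < w; ++x) {
      for (sum = 0, k = 0; k < 8; ++k)
        sum += f[k] * src[(y + k) * src_stride + x];
      dst[y * dst_stride + x] = clip8((sum + 64) >> FILTER_BITS);
    }
}

/* Two-pass shape of the new vp9_convolve8_ssse3: the horizontal pass
 * emits h + 7 intermediate rows starting 3 source rows early so the
 * vertical 8-tap pass has its 3-above/4-below support; the fixed
 * 16x23 scratch buffer caps the block size at 16x16. */
static void convolve8_sketch(const uint8_t *src, int src_stride,
                             uint8_t *dst, int dst_stride,
                             const int16_t *fx, const int16_t *fy,
                             int w, int h) {
  uint8_t fdata2[16 * 23];
  assert(w <= 16 && h <= 16);
  filter_h8(src - 3 * src_stride, src_stride, fdata2, 16, w, h + 7, fx);
  filter_v8(fdata2, 16, dst, dst_stride, w, h, fy);
}
```

On top of this, the real SSSE3 wrappers strip-mine the width in 16-, 8-, and 4-pixel columns and leave everything else to the C routines: any remainder width, a fractional step (x_step_q4 or y_step_q4 != 16), or a middle tap of 128, which the 8-tap kernels explicitly exclude.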
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c index f09e2d78b..6d3bb021a 100644 --- a/vp9/common/x86/vp9_asm_stubs.c +++ b/vp9/common/x86/vp9_asm_stubs.c @@ -8,91 +8,11 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <assert.h> #include "./vpx_config.h" +#include "./vp9_rtcd.h" #include "vpx_ports/mem.h" -#include "vp9/common/vp9_subpixel.h" - -extern const short vp9_six_tap_mmx[8][6 * 8]; - -extern void vp9_filter_block1d_h6_mmx(unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_filter_block1dc_v6_mmx(unsigned short *src_ptr, - unsigned char *output_ptr, - int output_pitch, - unsigned int pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_filter_block1d8_h6_sse2(unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_filter_block1d16_h6_sse2(unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_filter_block1d8_v6_sse2(unsigned short *src_ptr, - unsigned char *output_ptr, - int dst_ptich, - unsigned int pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_filter_block1d16_v6_sse2(unsigned short *src_ptr, - unsigned char *output_ptr, - int dst_ptich, - unsigned int pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_unpack_block1d16_h6_sse2(unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int output_height, - unsigned int output_width); - -extern void vp9_filter_block1d8_h6_only_sse2(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - int dst_pitch, - unsigned int output_height, - const short *vp9_filter); - -extern void vp9_filter_block1d16_h6_only_sse2(unsigned char *src_ptr, - unsigned int src_pixels_per_lin, - unsigned char *output_ptr, - int dst_pitch, - unsigned int output_height, - const short *vp9_filter); - -extern void vp9_filter_block1d8_v6_only_sse2(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - int dst_pitch, - unsigned int output_height, - const short *vp9_filter); - /////////////////////////////////////////////////////////////////////////// // the mmx function that does the bilinear filtering and var calculation // // int one pass // @@ -116,389 +36,7 @@ DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = { { 8, 8, 8, 8, 120, 120, 120, 120 } }; -#if HAVE_MMX -void vp9_sixtap_predict4x4_mmx(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict4x4_mmx\n"); -#endif - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 16 * 16); - const short *hfilter, *vfilter; - hfilter = vp9_six_tap_mmx[xoffset]; - 
vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), fdata2, - src_pixels_per_line, 1, 9, 8, hfilter); - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1dc_v6_mmx(fdata2 + 8, dst_ptr, dst_pitch, - 8, 4, 4, 4, vfilter); -} - -void vp9_sixtap_predict16x16_mmx(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict16x16_mmx\n"); -#endif - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24); - const short *hfilter, *vfilter; - - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), - fdata2, src_pixels_per_line, 1, 21, 32, - hfilter); - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, - fdata2 + 4, src_pixels_per_line, 1, 21, 32, - hfilter); - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8, - fdata2 + 8, src_pixels_per_line, 1, 21, 32, - hfilter); - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12, - fdata2 + 12, src_pixels_per_line, 1, 21, 32, - hfilter); - - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1dc_v6_mmx(fdata2 + 32, dst_ptr, dst_pitch, - 32, 16, 16, 16, vfilter); - vp9_filter_block1dc_v6_mmx(fdata2 + 36, dst_ptr + 4, dst_pitch, - 32, 16, 16, 16, vfilter); - vp9_filter_block1dc_v6_mmx(fdata2 + 40, dst_ptr + 8, dst_pitch, - 32, 16, 16, 16, vfilter); - vp9_filter_block1dc_v6_mmx(fdata2 + 44, dst_ptr + 12, dst_pitch, - 32, 16, 16, 16, vfilter); -} - -void vp9_sixtap_predict8x8_mmx(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x8_mmx\n"); -#endif - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256); - const short *hfilter, *vfilter; - - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), - fdata2, src_pixels_per_line, 1, 13, 16, - hfilter); - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, - fdata2 + 4, src_pixels_per_line, 1, 13, 16, - hfilter); - - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr, dst_pitch, - 16, 8, 8, 8, vfilter); - vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch, - 16, 8, 8, 8, vfilter); -} - -void vp9_sixtap_predict8x4_mmx(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x4_mmx\n"); -#endif - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256); - const short *hfilter, *vfilter; - - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), - fdata2, src_pixels_per_line, 1, 9, 16, hfilter); - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, - fdata2 + 4, src_pixels_per_line, 1, 9, 16, hfilter); - - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr, dst_pitch, - 16, 8, 4, 8, vfilter); - vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch, - 16, 8, 4, 8, vfilter); -} -#endif - -#if HAVE_SSE2 -void vp9_sixtap_predict16x16_sse2(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - /* Temp data bufffer used in filtering 
*/ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24); - const short *hfilter, *vfilter; -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict16x16_sse2\n"); -#endif - - if (xoffset) { - if (yoffset) { - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2, - src_pixels_per_line, 1, 21, 32, hfilter); - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch, - 32, 16, 16, dst_pitch, vfilter); - } else { - /* First-pass only */ - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 16, hfilter); - } - } else { - /* Second-pass only */ - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2, - src_pixels_per_line, 21, 32); - vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch, - 32, 16, 16, dst_pitch, vfilter); - } -} - -void vp9_sixtap_predict8x8_sse2(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256); - const short *hfilter, *vfilter; -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x8_sse2\n"); -#endif - - if (xoffset) { - if (yoffset) { - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2, - src_pixels_per_line, 1, 13, 16, hfilter); - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch, - 16, 8, 8, dst_pitch, vfilter); - } else { - /* First-pass only */ - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 8, hfilter); - } - } else { - /* Second-pass only */ - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 8, vfilter); - } -} - -void vp9_sixtap_predict8x4_sse2(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256); - const short *hfilter, *vfilter; -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x4_sse2\n"); -#endif - - if (xoffset) { - if (yoffset) { - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2, - src_pixels_per_line, 1, 9, 16, hfilter); - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch, - 16, 8, 4, dst_pitch, vfilter); - } else { - /* First-pass only */ - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 4, hfilter); - } - } else { - /* Second-pass only */ - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 4, vfilter); - } -} -#endif - #if HAVE_SSSE3 -extern void vp9_filter_block1d8_h6_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -extern void vp9_filter_block1d16_h6_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int 
output_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -extern void vp9_filter_block1d16_v6_ssse3(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -extern void vp9_filter_block1d8_v6_ssse3(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -extern void vp9_filter_block1d4_h6_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -extern void vp9_filter_block1d4_v6_ssse3(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -void vp9_sixtap_predict16x16_ssse3(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 24 * 24); -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict16x16_ssse3\n"); -#endif - - if (xoffset) { - if (yoffset) { - vp9_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - fdata2, 16, 21, xoffset); - vp9_filter_block1d16_v6_ssse3(fdata2, 16, dst_ptr, dst_pitch, - 16, yoffset); - } else { - /* First-pass only */ - vp9_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 16, xoffset); - } - } else { - /* Second-pass only */ - vp9_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 16, yoffset); - } -} - -void vp9_sixtap_predict8x8_ssse3(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256); -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x8_ssse3\n"); -#endif - - if (xoffset) { - if (yoffset) { - vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, fdata2, 8, 13, xoffset); - vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 8, yoffset); - } else { - vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 8, xoffset); - } - } else { - /* Second-pass only */ - vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 8, yoffset); - } -} - -void vp9_sixtap_predict8x4_ssse3(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256); -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x4_ssse3\n"); -#endif - - if (xoffset) { - if (yoffset) { - vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, fdata2, 8, 9, xoffset); - vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 4, yoffset); - } else { - /* First-pass only */ - vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 4, xoffset); - } - } else { - /* Second-pass only */ - vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 4, yoffset); - } -} - -void vp9_sixtap_predict4x4_ssse3(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char 
*dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 4 * 9); -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict4x4_ssse3\n"); -#endif - - if (xoffset) { - if (yoffset) { - vp9_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, fdata2, 4, 9, xoffset); - vp9_filter_block1d4_v6_ssse3(fdata2, 4, dst_ptr, dst_pitch, 4, yoffset); - } else { - vp9_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 4, xoffset); - } - } else { - vp9_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 4, yoffset); - } -} - void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr, const unsigned int src_pitch, unsigned char *output_ptr, @@ -513,30 +51,6 @@ void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr, unsigned int output_height, const short *filter); -void vp9_filter_block2d_16x16_8_ssse3(const unsigned char *src_ptr, - const unsigned int src_stride, - const short *hfilter_aligned16, - const short *vfilter_aligned16, - unsigned char *dst_ptr, - unsigned int dst_stride) { - if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16); - - vp9_filter_block1d16_h8_ssse3(src_ptr - (3 * src_stride), src_stride, - fdata2, 16, 23, hfilter_aligned16); - vp9_filter_block1d16_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 16, - vfilter_aligned16); - } else { - if (hfilter_aligned16[3] != 128) { - vp9_filter_block1d16_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, - 16, hfilter_aligned16); - } else { - vp9_filter_block1d16_v8_ssse3(src_ptr - (3 * src_stride), src_stride, - dst_ptr, dst_stride, 16, vfilter_aligned16); - } - } -} - void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr, const unsigned int src_pitch, unsigned char *output_ptr, @@ -551,51 +65,303 @@ void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr, unsigned int output_height, const short *filter); -void vp9_filter_block2d_8x8_8_ssse3(const unsigned char *src_ptr, - const unsigned int src_stride, - const short *hfilter_aligned16, - const short *vfilter_aligned16, - unsigned char *dst_ptr, - unsigned int dst_stride) { - if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16); +void vp9_filter_block1d4_v8_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d4_h8_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); - vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride, - fdata2, 16, 15, hfilter_aligned16); - vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 8, - vfilter_aligned16); - } else { - if (hfilter_aligned16[3] != 128) { - vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 8, - hfilter_aligned16); - } else { - vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride, - dst_ptr, dst_stride, 8, vfilter_aligned16); +void vp9_filter_block1d16_v8_avg_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d16_h8_avg_ssse3(const unsigned char *src_ptr, + const unsigned int 
src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d8_v8_avg_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d8_h8_avg_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d4_v8_avg_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d4_h8_avg_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if (x_step_q4 == 16 && filter_x[3] != 128) { + while (w >= 16) { + vp9_filter_block1d16_h8_ssse3(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 16; + dst += 16; + w -= 16; } + while (w >= 8) { + vp9_filter_block1d8_h8_ssse3(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 8; + dst += 8; + w -= 8; + } + while (w >= 4) { + vp9_filter_block1d4_h8_ssse3(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 4; + dst += 4; + w -= 4; + } + } + if (w) { + vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + } +} + +void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if (y_step_q4 == 16 && filter_y[3] != 128) { + while (w >= 16) { + vp9_filter_block1d16_v8_ssse3(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 16; + dst += 16; + w -= 16; + } + while (w >= 8) { + vp9_filter_block1d8_v8_ssse3(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 8; + dst += 8; + w -= 8; + } + while (w >= 4) { + vp9_filter_block1d4_v8_ssse3(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 4; + dst += 4; + w -= 4; + } + } + if (w) { + vp9_convolve8_vert_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); } } -void vp9_filter_block2d_8x4_8_ssse3(const unsigned char *src_ptr, - const unsigned int src_stride, - const short *hfilter_aligned16, - const short *vfilter_aligned16, - unsigned char *dst_ptr, - unsigned int dst_stride) { - if (hfilter_aligned16[3] !=128 && vfilter_aligned16[3] != 128) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16); +void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if (x_step_q4 == 16 && filter_x[3] != 128) { + while (w >= 16) { + vp9_filter_block1d16_h8_avg_ssse3(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 16; + dst += 16; + w -= 16; + } + while (w >= 8) { + vp9_filter_block1d8_h8_avg_ssse3(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 8; + dst += 8; + w -= 8; + } + while (w >= 4) { + 
vp9_filter_block1d4_h8_avg_ssse3(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 4; + dst += 4; + w -= 4; + } + } + if (w) { + vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + } +} - vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride, - fdata2, 16, 11, hfilter_aligned16); - vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 4, - vfilter_aligned16); - } else { - if (hfilter_aligned16[3] != 128) { - vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 4, - hfilter_aligned16); - } else { - vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride, - dst_ptr, dst_stride, 4, vfilter_aligned16); +void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if (y_step_q4 == 16 && filter_y[3] != 128) { + while (w >= 16) { + vp9_filter_block1d16_v8_avg_ssse3(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 16; + dst += 16; + w -= 16; + } + while (w >= 8) { + vp9_filter_block1d8_v8_avg_ssse3(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 8; + dst += 8; + w -= 8; + } + while (w >= 4) { + vp9_filter_block1d4_v8_avg_ssse3(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 4; + dst += 4; + w -= 4; + } + } + if (w) { + vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + } +} + +void vp9_convolve8_ssse3(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*23); + + // check w/h due to fixed size fdata2 array + assert(w <= 16); + assert(h <= 16); + + if (x_step_q4 == 16 && y_step_q4 == 16 && + filter_x[3] != 128 && filter_y[3] != 128) { + if (w == 16) { + vp9_filter_block1d16_h8_ssse3(src - 3 * src_stride, src_stride, + fdata2, 16, + h + 7, filter_x); + vp9_filter_block1d16_v8_ssse3(fdata2, 16, + dst, dst_stride, + h, filter_y); + return; + } + if (w == 8) { + vp9_filter_block1d8_h8_ssse3(src - 3 * src_stride, src_stride, + fdata2, 16, + h + 7, filter_x); + vp9_filter_block1d8_v8_ssse3(fdata2, 16, + dst, dst_stride, + h, filter_y); + return; + } + if (w == 4) { + vp9_filter_block1d4_h8_ssse3(src - 3 * src_stride, src_stride, + fdata2, 16, + h + 7, filter_x); + vp9_filter_block1d4_v8_ssse3(fdata2, 16, + dst, dst_stride, + h, filter_y); + return; + } + } + vp9_convolve8_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); +} + +void vp9_convolve8_avg_ssse3(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*23); + + // check w/h due to fixed size fdata2 array + assert(w <= 16); + assert(h <= 16); + + if (x_step_q4 == 16 && y_step_q4 == 16 && + filter_x[3] != 128 && filter_y[3] != 128) { + if (w == 16) { + vp9_filter_block1d16_h8_ssse3(src - 3 * src_stride, src_stride, + fdata2, 16, + h + 7, filter_x); + vp9_filter_block1d16_v8_avg_ssse3(fdata2, 16, + dst, dst_stride, + h, filter_y); + return; + } + if (w == 8) { + vp9_filter_block1d8_h8_ssse3(src - 3 * src_stride, src_stride, + fdata2, 16, + h + 7, filter_x); + 
vp9_filter_block1d8_v8_avg_ssse3(fdata2, 16, + dst, dst_stride, + h, filter_y); + return; + } + if (w == 4) { + vp9_filter_block1d4_h8_ssse3(src - 3 * src_stride, src_stride, + fdata2, 16, + h + 7, filter_x); + vp9_filter_block1d4_v8_avg_ssse3(fdata2, 16, + dst, dst_stride, + h, filter_y); + return; } } + vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); } #endif diff --git a/vp9/common/x86/vp9_filter_sse2.c b/vp9/common/x86/vp9_filter_sse2.c deleted file mode 100644 index 8e02ac197..000000000 --- a/vp9/common/x86/vp9_filter_sse2.c +++ /dev/null @@ -1,290 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <assert.h> // for alignment checks -#include <emmintrin.h> // SSE2 -#include "vp9/common/vp9_filter.h" -#include "vpx_ports/emmintrin_compat.h" -#include "vpx_ports/mem.h" // for DECLARE_ALIGNED -#include "vp9_rtcd.h" - -// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is -// just a quick partial snapshot so that other can already use some -// speedup. -// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap -// filtering. -// TODO(cd): Add some comments, better variable naming. -// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coeficients (no sum -// of positive above 128), or have higher precision filter -// coefficients. - -DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = { - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, -}; - -// Creating a macro to do more than four pixels at once to hide instruction -// latency is actually slower :-( -#define DO_FOUR_PIXELS(result, src_ptr, offset) \ - { \ - /* Do shifted load to achieve require shuffles through unpacking */ \ - const __m128i src0 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 0)); \ - const __m128i src1 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 1)); \ - const __m128i src2 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 2)); \ - const __m128i src3 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 3)); \ - const __m128i src01 = _mm_unpacklo_epi8(src0, src1); \ - const __m128i src01_16 = _mm_unpacklo_epi8(src01, zero); \ - const __m128i src23 = _mm_unpacklo_epi8(src2, src3); \ - const __m128i src23_16 = _mm_unpacklo_epi8(src23, zero); \ - /* Shit by 4 bytes through suffle to get additional shifted loads */ \ - const __m128i src4 = _mm_shuffle_epi32(src0, _MM_SHUFFLE(3, 3, 2, 1)); \ - const __m128i src5 = _mm_shuffle_epi32(src1, _MM_SHUFFLE(3, 3, 2, 1)); \ - const __m128i src6 = _mm_shuffle_epi32(src2, _MM_SHUFFLE(3, 3, 2, 1)); \ - const __m128i src7 = _mm_shuffle_epi32(src3, _MM_SHUFFLE(3, 3, 2, 1)); \ - const __m128i src45 = _mm_unpacklo_epi8(src4, src5); \ - const __m128i src45_16 = _mm_unpacklo_epi8(src45, zero); \ - const __m128i src67 = _mm_unpacklo_epi8(src6, src7); \ - const __m128i src67_16 = _mm_unpacklo_epi8(src67, zero); \ - /* multiply accumulate them */ \ - const __m128i mad01 = _mm_madd_epi16(src01_16, fil01); \ - const __m128i mad23 = _mm_madd_epi16(src23_16, fil23); \ - const __m128i mad45 = _mm_madd_epi16(src45_16, fil45); \ - 
const __m128i mad67 = _mm_madd_epi16(src67_16, fil67); \ - const __m128i mad0123 = _mm_add_epi32(mad01, mad23); \ - const __m128i mad4567 = _mm_add_epi32(mad45, mad67); \ - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); \ - mad_all = _mm_add_epi32(mad_all, rounding); \ - result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); \ - } - -void vp9_filter_block2d_4x4_8_sse2 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - __m128i intermediateA, intermediateB, intermediateC; - - const int kInterp_Extend = 4; - - const __m128i zero = _mm_set1_epi16(0); - const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c); - - // check alignment - assert(0 == ((long)HFilter_aligned16)%16); - assert(0 == ((long)VFilter_aligned16)%16); - - { - __m128i transpose3_0; - __m128i transpose3_1; - __m128i transpose3_2; - __m128i transpose3_3; - - // Horizontal pass (src -> intermediate). - { - const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16); - // get first two columns filter coefficients - __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3)); - src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1); - - { - __m128i mad_all0; - __m128i mad_all1; - __m128i mad_all2; - __m128i mad_all3; - DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride) - DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all3); - intermediateA = _mm_packus_epi16(mad_all0, mad_all2); - // -- - src_ptr += src_stride*4; - // -- - DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride) - DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all3); - intermediateB = _mm_packus_epi16(mad_all0, mad_all2); - // -- - src_ptr += src_stride*4; - // -- - DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all2); - intermediateC = _mm_packus_epi16(mad_all0, mad_all2); - } - } - - // Transpose result (intermediate -> transpose3_x) - { - // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33 - // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73 - // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx - const __m128i transpose0_0 = _mm_unpacklo_epi8(intermediateA, intermediateB); - const __m128i transpose0_1 = _mm_unpackhi_epi8(intermediateA, intermediateB); - const __m128i transpose0_2 = _mm_unpacklo_epi8(intermediateC, intermediateC); - const __m128i transpose0_3 = _mm_unpackhi_epi8(intermediateC, intermediateC); - // 00 40 01 41 02 42 03 43 10 50 11 51 12 52 13 53 - // 20 60 21 61 22 62 23 63 30 70 31 71 32 72 33 73 - // 80 xx 81 xx 82 xx 83 xx 90 xx 91 xx 92 xx 93 xx - // A0 xx A1 xx A2 xx A3 xx xx xx xx xx xx xx xx xx - const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1); - const __m128i 
transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1); - const __m128i transpose1_2 = _mm_unpacklo_epi8(transpose0_2, transpose0_3); - const __m128i transpose1_3 = _mm_unpackhi_epi8(transpose0_2, transpose0_3); - // 00 20 40 60 01 21 41 61 02 22 42 62 03 23 43 63 - // 10 30 50 70 11 31 51 71 12 32 52 72 13 33 53 73 - // 80 A0 xx xx 81 A1 xx xx 82 A2 xx xx 83 A3 xx xx - // 90 xx xx xx 91 xx xx xx 92 xx xx xx 93 xx xx xx - const __m128i transpose2_0 = _mm_unpacklo_epi8(transpose1_0, transpose1_1); - const __m128i transpose2_1 = _mm_unpackhi_epi8(transpose1_0, transpose1_1); - const __m128i transpose2_2 = _mm_unpacklo_epi8(transpose1_2, transpose1_3); - const __m128i transpose2_3 = _mm_unpackhi_epi8(transpose1_2, transpose1_3); - // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - // 80 90 A0 xx xx xx xx xx 81 91 A1 xx xx xx xx xx - // 82 92 A2 xx xx xx xx xx 83 93 A3 xx xx xx xx xx - transpose3_0 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0), - _mm_castsi128_ps(transpose2_2), - _MM_SHUFFLE(1, 0, 1, 0))); - transpose3_1 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0), - _mm_castsi128_ps(transpose2_2), - _MM_SHUFFLE(3, 2, 3, 2))); - transpose3_2 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1), - _mm_castsi128_ps(transpose2_3), - _MM_SHUFFLE(1, 0, 1, 0))); - transpose3_3 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1), - _mm_castsi128_ps(transpose2_3), - _MM_SHUFFLE(3, 2, 3, 2))); - // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx - // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx - // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx - // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx - } - - // Vertical pass (transpose3_x -> dst). 
- { - const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16); - // get first two columns filter coefficients - __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3)); - __m128i col0, col1, col2, col3; - DECLARE_ALIGNED(16, unsigned char, temp[32]); - { - _mm_store_si128((__m128i *)temp, transpose3_0); - DO_FOUR_PIXELS(col0, temp, 0); - } - { - _mm_store_si128((__m128i *)temp, transpose3_1); - DO_FOUR_PIXELS(col1, temp, 0); - } - { - _mm_store_si128((__m128i *)temp, transpose3_2); - DO_FOUR_PIXELS(col2, temp, 0); - } - { - _mm_store_si128((__m128i *)temp, transpose3_3); - DO_FOUR_PIXELS(col3, temp, 0); - } - // transpose - { - __m128i T0 = _mm_unpacklo_epi32(col0, col1); - __m128i T1 = _mm_unpacklo_epi32(col2, col3); - __m128i T2 = _mm_unpackhi_epi32(col0, col1); - __m128i T3 = _mm_unpackhi_epi32(col2, col3); - col0 = _mm_unpacklo_epi64(T0, T1); - col1 = _mm_unpackhi_epi64(T0, T1); - col2 = _mm_unpacklo_epi64(T2, T3); - col3 = _mm_unpackhi_epi64(T2, T3); - } - // saturate to 8 bit - { - col0 = _mm_packs_epi32(col0, col0); - col0 = _mm_packus_epi16(col0, col0); - col1 = _mm_packs_epi32(col1, col1); - col1 = _mm_packus_epi16(col1, col1); - col2 = _mm_packs_epi32 (col2, col2); - col2 = _mm_packus_epi16(col2, col2); - col3 = _mm_packs_epi32 (col3, col3); - col3 = _mm_packus_epi16(col3, col3); - } - // store - { - *((unsigned int *)&dst_ptr[dst_stride * 0]) = _mm_cvtsi128_si32(col0); - *((unsigned int *)&dst_ptr[dst_stride * 1]) = _mm_cvtsi128_si32(col1); - *((unsigned int *)&dst_ptr[dst_stride * 2]) = _mm_cvtsi128_si32(col2); - *((unsigned int *)&dst_ptr[dst_stride * 3]) = _mm_cvtsi128_si32(col3); - } - } - } -} - -void vp9_filter_block2d_8x4_8_sse2 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int j; - for (j=0; j<8; j+=4) { - vp9_filter_block2d_4x4_8_sse2(src_ptr + j, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j, dst_stride); - } -} - -void vp9_filter_block2d_8x8_8_sse2 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int i, j; - for (i=0; i<8; i+=4) { - for (j=0; j<8; j+=4) { - vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j + i*dst_stride, dst_stride); - } - } -} - -void vp9_filter_block2d_16x16_8_sse2 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int i, j; - for (i=0; i<16; i+=4) { - for (j=0; j<16; j+=4) { - vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j + i*dst_stride, dst_stride); - } - } -} diff --git a/vp9/common/x86/vp9_filter_sse4.c b/vp9/common/x86/vp9_filter_sse4.c deleted file mode 100644 index 52c35b296..000000000 --- a/vp9/common/x86/vp9_filter_sse4.c +++ /dev/null @@ -1,362 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <assert.h> // for alignment checks -#include <smmintrin.h> // SSE4.1 -#include "vp9/common/vp9_filter.h" -#include "vpx_ports/mem.h" // for DECLARE_ALIGNED -#include "vp9_rtcd.h" - -// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is -// just a quick partial snapshot so that other can already use some -// speedup. -// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap -// filtering. -// TODO(cd): Reduce source size by using macros instead of current code -// duplication. -// TODO(cd): Add some comments, better variable naming. -// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coeficients (no sum -// of positive above 128), or have higher precision filter -// coefficients. - -DECLARE_ALIGNED(16, static const unsigned char, mask0123_c[16]) = { - 0x00, 0x01, - 0x01, 0x02, - 0x02, 0x03, - 0x03, 0x04, - 0x02, 0x03, - 0x03, 0x04, - 0x04, 0x05, - 0x05, 0x06, -}; -DECLARE_ALIGNED(16, static const unsigned char, mask4567_c[16]) = { - 0x04, 0x05, - 0x05, 0x06, - 0x06, 0x07, - 0x07, 0x08, - 0x06, 0x07, - 0x07, 0x08, - 0x08, 0x09, - 0x09, 0x0A, -}; -DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = { - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, -}; -DECLARE_ALIGNED(16, static const unsigned char, transpose_c[16]) = { - 0, 4, 8, 12, - 1, 5, 9, 13, - 2, 6, 10, 14, - 3, 7, 11, 15 -}; - -// Creating a macro to do more than four pixels at once to hide instruction -// latency is actually slower :-( -#define DO_FOUR_PIXELS(result, offset) \ - { \ - /*load pixels*/ \ - __m128i src = _mm_loadu_si128((const __m128i *)(src_ptr + offset)); \ - /* extract the ones used for first column */ \ - __m128i src0123 = _mm_shuffle_epi8(src, mask0123); \ - __m128i src4567 = _mm_shuffle_epi8(src, mask4567); \ - __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); \ - __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); \ - __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); \ - __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); \ - /* multiply accumulate them */ \ - __m128i mad01 = _mm_madd_epi16(src01_16, fil01); \ - __m128i mad23 = _mm_madd_epi16(src23_16, fil23); \ - __m128i mad45 = _mm_madd_epi16(src45_16, fil45); \ - __m128i mad67 = _mm_madd_epi16(src67_16, fil67); \ - __m128i mad0123 = _mm_add_epi32(mad01, mad23); \ - __m128i mad4567 = _mm_add_epi32(mad45, mad67); \ - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); \ - mad_all = _mm_add_epi32(mad_all, rounding); \ - result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); \ - } - -void vp9_filter_block2d_4x4_8_sse4_1 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - __m128i intermediateA, intermediateB, intermediateC; - - const int kInterp_Extend = 4; - - const __m128i zero = _mm_set1_epi16(0); - const __m128i mask0123 = _mm_load_si128((const __m128i *)mask0123_c); - const __m128i mask4567 = _mm_load_si128((const __m128i *)mask4567_c); - const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c); - const __m128i transpose = _mm_load_si128((const 
__m128i *)transpose_c); - - // check alignment - assert(0 == ((long)HFilter_aligned16)%16); - assert(0 == ((long)VFilter_aligned16)%16); - - { - __m128i transpose3_0; - __m128i transpose3_1; - __m128i transpose3_2; - __m128i transpose3_3; - - // Horizontal pass (src -> intermediate). - { - const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16); - // get first two columns filter coefficients - __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3)); - src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1); - - { - __m128i mad_all0; - __m128i mad_all1; - __m128i mad_all2; - __m128i mad_all3; - DO_FOUR_PIXELS(mad_all0, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, 2*src_stride) - DO_FOUR_PIXELS(mad_all3, 3*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all3); - intermediateA = _mm_packus_epi16(mad_all0, mad_all2); - // -- - src_ptr += src_stride*4; - // -- - DO_FOUR_PIXELS(mad_all0, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, 2*src_stride) - DO_FOUR_PIXELS(mad_all3, 3*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all3); - intermediateB = _mm_packus_epi16(mad_all0, mad_all2); - // -- - src_ptr += src_stride*4; - // -- - DO_FOUR_PIXELS(mad_all0, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, 2*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all2); - intermediateC = _mm_packus_epi16(mad_all0, mad_all2); - } - } - - // Transpose result (intermediate -> transpose3_x) - { - // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33 - // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73 - // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx - const __m128i transpose1_0 = _mm_shuffle_epi8(intermediateA, transpose); - const __m128i transpose1_1 = _mm_shuffle_epi8(intermediateB, transpose); - const __m128i transpose1_2 = _mm_shuffle_epi8(intermediateC, transpose); - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 - // 80 90 A0 xx 81 91 A1 xx 82 92 A2 xx 83 93 A3 xx - const __m128i transpose2_0 = _mm_unpacklo_epi32(transpose1_0, transpose1_1); - const __m128i transpose2_1 = _mm_unpackhi_epi32(transpose1_0, transpose1_1); - // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - transpose3_0 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0), - _mm_castsi128_ps(transpose1_2), - _MM_SHUFFLE(0, 0, 1, 0))); - transpose3_1 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0), - _mm_castsi128_ps(transpose1_2), - _MM_SHUFFLE(1, 1, 3, 2))); - transpose3_2 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1), - _mm_castsi128_ps(transpose1_2), - _MM_SHUFFLE(2, 2, 1, 0))); - transpose3_3 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1), - _mm_castsi128_ps(transpose1_2), - _MM_SHUFFLE(3, 3, 3, 2))); - // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx - // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx - // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx - // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx - } - - // Vertical 
pass (transpose3_x -> dst). - { - const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16); - // get first two columns filter coefficients - __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3)); - __m128i col0, col1, col2, col3; - { - //load pixels - __m128i src = transpose3_0; - // extract the ones used for first column - __m128i src0123 = _mm_shuffle_epi8(src, mask0123); - __m128i src4567 = _mm_shuffle_epi8(src, mask4567); - __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); - __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); - __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); - __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); - // multiply accumulate them - __m128i mad01 = _mm_madd_epi16(src01_16, fil01); - __m128i mad23 = _mm_madd_epi16(src23_16, fil23); - __m128i mad45 = _mm_madd_epi16(src45_16, fil45); - __m128i mad67 = _mm_madd_epi16(src67_16, fil67); - __m128i mad0123 = _mm_add_epi32(mad01, mad23); - __m128i mad4567 = _mm_add_epi32(mad45, mad67); - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); - mad_all = _mm_add_epi32(mad_all, rounding); - mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); - mad_all = _mm_packs_epi32(mad_all, mad_all); - col0 = _mm_packus_epi16(mad_all, mad_all); - } - { - //load pixels - __m128i src = transpose3_1; - // extract the ones used for first column - __m128i src0123 = _mm_shuffle_epi8(src, mask0123); - __m128i src4567 = _mm_shuffle_epi8(src, mask4567); - __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); - __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); - __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); - __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); - // multiply accumulate them - __m128i mad01 = _mm_madd_epi16(src01_16, fil01); - __m128i mad23 = _mm_madd_epi16(src23_16, fil23); - __m128i mad45 = _mm_madd_epi16(src45_16, fil45); - __m128i mad67 = _mm_madd_epi16(src67_16, fil67); - __m128i mad0123 = _mm_add_epi32(mad01, mad23); - __m128i mad4567 = _mm_add_epi32(mad45, mad67); - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); - mad_all = _mm_add_epi32(mad_all, rounding); - mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); - mad_all = _mm_packs_epi32(mad_all, mad_all); - col1 = _mm_packus_epi16(mad_all, mad_all); - } - { - //load pixels - __m128i src = transpose3_2; - // extract the ones used for first column - __m128i src0123 = _mm_shuffle_epi8(src, mask0123); - __m128i src4567 = _mm_shuffle_epi8(src, mask4567); - __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); - __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); - __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); - __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); - // multiply accumulate them - __m128i mad01 = _mm_madd_epi16(src01_16, fil01); - __m128i mad23 = _mm_madd_epi16(src23_16, fil23); - __m128i mad45 = _mm_madd_epi16(src45_16, fil45); - __m128i mad67 = _mm_madd_epi16(src67_16, fil67); - __m128i mad0123 = _mm_add_epi32(mad01, mad23); - __m128i mad4567 = _mm_add_epi32(mad45, mad67); - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); - mad_all = _mm_add_epi32(mad_all, rounding); - mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); - mad_all = _mm_packs_epi32(mad_all, mad_all); - col2 = _mm_packus_epi16(mad_all, mad_all); - } - { - //load pixels - __m128i src = transpose3_3; - // extract the ones 
used for first column - __m128i src0123 = _mm_shuffle_epi8(src, mask0123); - __m128i src4567 = _mm_shuffle_epi8(src, mask4567); - __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); - __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); - __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); - __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); - // multiply accumulate them - __m128i mad01 = _mm_madd_epi16(src01_16, fil01); - __m128i mad23 = _mm_madd_epi16(src23_16, fil23); - __m128i mad45 = _mm_madd_epi16(src45_16, fil45); - __m128i mad67 = _mm_madd_epi16(src67_16, fil67); - __m128i mad0123 = _mm_add_epi32(mad01, mad23); - __m128i mad4567 = _mm_add_epi32(mad45, mad67); - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); - mad_all = _mm_add_epi32(mad_all, rounding); - mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); - mad_all = _mm_packs_epi32(mad_all, mad_all); - col3 = _mm_packus_epi16(mad_all, mad_all); - } - { - __m128i col01 = _mm_unpacklo_epi8(col0, col1); - __m128i col23 = _mm_unpacklo_epi8(col2, col3); - __m128i col0123 = _mm_unpacklo_epi16(col01, col23); - //TODO(cd): look into Ronald's comment: - // Future suggestion: I believe here, too, you can merge the - // packs_epi32() and pacus_epi16() for the 4 cols above, so that - // you get the data in a single register, and then use pshufb - // (shuffle_epi8()) instead of the unpacks here. Should be - // 2+3+2 instructions faster. - *((unsigned int *)&dst_ptr[dst_stride * 0]) = - _mm_extract_epi32(col0123, 0); - *((unsigned int *)&dst_ptr[dst_stride * 1]) = - _mm_extract_epi32(col0123, 1); - *((unsigned int *)&dst_ptr[dst_stride * 2]) = - _mm_extract_epi32(col0123, 2); - *((unsigned int *)&dst_ptr[dst_stride * 3]) = - _mm_extract_epi32(col0123, 3); - } - } - } -} - -void vp9_filter_block2d_8x4_8_sse4_1 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int j; - for (j=0; j<8; j+=4) { - vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j, dst_stride); - } -} - -void vp9_filter_block2d_8x8_8_sse4_1 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int i, j; - for (i=0; i<8; i+=4) { - for (j=0; j<8; j+=4) { - vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j + i*dst_stride, dst_stride); - } - } -} - -void vp9_filter_block2d_16x16_8_sse4_1 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int i, j; - for (i=0; i<16; i+=4) { - for (j=0; j<16; j+=4) { - vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j + i*dst_stride, dst_stride); - } - } -} diff --git a/vp9/common/x86/vp9_idctllm_sse2.asm b/vp9/common/x86/vp9_idct_sse2.asm index 8f3c6dfc3..8f3c6dfc3 100644 --- a/vp9/common/x86/vp9_idctllm_sse2.asm +++ b/vp9/common/x86/vp9_idct_sse2.asm diff --git a/vp9/common/x86/vp9_idct_x86.c b/vp9/common/x86/vp9_idct_x86.c new file mode 100644 index 000000000..811ed9899 --- /dev/null +++ b/vp9/common/x86/vp9_idct_x86.c @@ -0,0 +1,1975 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <emmintrin.h> // SSE2 +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_idct.h" + +#if HAVE_SSE2 +// In order to improve performance, clip absolute diff values to [0, 255], +// which allows to keep the additions/subtractions in 8 bits. +void vp9_dc_only_idct_add_sse2(int input_dc, uint8_t *pred_ptr, + uint8_t *dst_ptr, int pitch, int stride) { + int a1; + int16_t out; + uint8_t abs_diff; + __m128i p0, p1, p2, p3; + unsigned int extended_diff; + __m128i diff; + + out = dct_const_round_shift(input_dc * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 4); + + // Read prediction data. + p0 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 0 * pitch)); + p1 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 1 * pitch)); + p2 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 2 * pitch)); + p3 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 3 * pitch)); + + // Unpack prediction data, and store 4x4 array in 1 XMM register. + p0 = _mm_unpacklo_epi32(p0, p1); + p2 = _mm_unpacklo_epi32(p2, p3); + p0 = _mm_unpacklo_epi64(p0, p2); + + // Clip dc value to [0, 255] range. Then, do addition or subtraction + // according to its sign. + if (a1 >= 0) { + abs_diff = (a1 > 255) ? 255 : a1; + extended_diff = abs_diff * 0x01010101u; + diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0); + + p1 = _mm_adds_epu8(p0, diff); + } else { + abs_diff = (a1 < -255) ? 255 : -a1; + extended_diff = abs_diff * 0x01010101u; + diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0); + + p1 = _mm_subs_epu8(p0, diff); + } + + // Store results to dst. 
+ *(int *)dst_ptr = _mm_cvtsi128_si32(p1); + dst_ptr += stride; + + p1 = _mm_srli_si128(p1, 4); + *(int *)dst_ptr = _mm_cvtsi128_si32(p1); + dst_ptr += stride; + + p1 = _mm_srli_si128(p1, 4); + *(int *)dst_ptr = _mm_cvtsi128_si32(p1); + dst_ptr += stride; + + p1 = _mm_srli_si128(p1, 4); + *(int *)dst_ptr = _mm_cvtsi128_si32(p1); +} + +void vp9_short_idct4x4_sse2(int16_t *input, int16_t *output, int pitch) { + const __m128i zero = _mm_setzero_si128(); + const __m128i eight = _mm_set1_epi16(8); + const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64, + (int16_t)cospi_16_64, (int16_t)-cospi_16_64, + (int16_t)cospi_24_64, (int16_t)-cospi_8_64, + (int16_t)cospi_8_64, (int16_t)cospi_24_64); + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const int half_pitch = pitch >> 1; + __m128i input0, input1, input2, input3; + + // Rows + input0 = _mm_loadl_epi64((__m128i *)input); + input1 = _mm_loadl_epi64((__m128i *)(input + 4)); + input2 = _mm_loadl_epi64((__m128i *)(input + 8)); + input3 = _mm_loadl_epi64((__m128i *)(input + 12)); + + // Construct i3, i1, i3, i1, i2, i0, i2, i0 + input0 = _mm_shufflelo_epi16(input0, 0xd8); + input1 = _mm_shufflelo_epi16(input1, 0xd8); + input2 = _mm_shufflelo_epi16(input2, 0xd8); + input3 = _mm_shufflelo_epi16(input3, 0xd8); + + input0 = _mm_unpacklo_epi32(input0, input0); + input1 = _mm_unpacklo_epi32(input1, input1); + input2 = _mm_unpacklo_epi32(input2, input2); + input3 = _mm_unpacklo_epi32(input3, input3); + + // Stage 1 + input0 = _mm_madd_epi16(input0, cst); + input1 = _mm_madd_epi16(input1, cst); + input2 = _mm_madd_epi16(input2, cst); + input3 = _mm_madd_epi16(input3, cst); + + input0 = _mm_add_epi32(input0, rounding); + input1 = _mm_add_epi32(input1, rounding); + input2 = _mm_add_epi32(input2, rounding); + input3 = _mm_add_epi32(input3, rounding); + + input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); + input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); + input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); + input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); + + // Stage 2 + input0 = _mm_packs_epi32(input0, zero); + input1 = _mm_packs_epi32(input1, zero); + input2 = _mm_packs_epi32(input2, zero); + input3 = _mm_packs_epi32(input3, zero); + + // Transpose + input1 = _mm_unpacklo_epi16(input0, input1); + input3 = _mm_unpacklo_epi16(input2, input3); + input0 = _mm_unpacklo_epi32(input1, input3); + input1 = _mm_unpackhi_epi32(input1, input3); + + // Switch column2, column 3, and then, we got: + // input2: column1, column 0; input3: column2, column 3. 
+ input1 = _mm_shuffle_epi32(input1, 0x4e); + input2 = _mm_add_epi16(input0, input1); + input3 = _mm_sub_epi16(input0, input1); + + // Columns + // Construct i3, i1, i3, i1, i2, i0, i2, i0 + input0 = _mm_shufflelo_epi16(input2, 0xd8); + input1 = _mm_shufflehi_epi16(input2, 0xd8); + input2 = _mm_shufflehi_epi16(input3, 0xd8); + input3 = _mm_shufflelo_epi16(input3, 0xd8); + + input0 = _mm_unpacklo_epi32(input0, input0); + input1 = _mm_unpackhi_epi32(input1, input1); + input2 = _mm_unpackhi_epi32(input2, input2); + input3 = _mm_unpacklo_epi32(input3, input3); + + // Stage 1 + input0 = _mm_madd_epi16(input0, cst); + input1 = _mm_madd_epi16(input1, cst); + input2 = _mm_madd_epi16(input2, cst); + input3 = _mm_madd_epi16(input3, cst); + + input0 = _mm_add_epi32(input0, rounding); + input1 = _mm_add_epi32(input1, rounding); + input2 = _mm_add_epi32(input2, rounding); + input3 = _mm_add_epi32(input3, rounding); + + input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); + input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); + input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); + input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); + + // Stage 2 + input0 = _mm_packs_epi32(input0, zero); + input1 = _mm_packs_epi32(input1, zero); + input2 = _mm_packs_epi32(input2, zero); + input3 = _mm_packs_epi32(input3, zero); + + // Transpose + input1 = _mm_unpacklo_epi16(input0, input1); + input3 = _mm_unpacklo_epi16(input2, input3); + input0 = _mm_unpacklo_epi32(input1, input3); + input1 = _mm_unpackhi_epi32(input1, input3); + + // Switch column2, column 3, and then, we got: + // input2: column1, column 0; input3: column2, column 3. + input1 = _mm_shuffle_epi32(input1, 0x4e); + input2 = _mm_add_epi16(input0, input1); + input3 = _mm_sub_epi16(input0, input1); + + // Final round and shift + input2 = _mm_add_epi16(input2, eight); + input3 = _mm_add_epi16(input3, eight); + + input2 = _mm_srai_epi16(input2, 4); + input3 = _mm_srai_epi16(input3, 4); + + // Store results + _mm_storel_epi64((__m128i *)output, input2); + input2 = _mm_srli_si128(input2, 8); + _mm_storel_epi64((__m128i *)(output + half_pitch), input2); + + _mm_storel_epi64((__m128i *)(output + 3 * half_pitch), input3); + input3 = _mm_srli_si128(input3, 8); + _mm_storel_epi64((__m128i *)(output + 2 * half_pitch), input3); +} + +void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) { + const __m128i zero = _mm_setzero_si128(); + const __m128i c1 = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64, + (int16_t)cospi_16_64, (int16_t)-cospi_16_64, + (int16_t)cospi_24_64, (int16_t)-cospi_8_64, + (int16_t)cospi_8_64, (int16_t)cospi_24_64); + const __m128i c2 = _mm_setr_epi16(1, 1, 1, 1, 1, -1, 1, -1); + + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i in, temp; + + // Load input data. 
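+  // Commentary: assumed scalar equivalent of this 1-D routine, matching the
+  // C reference 4-point idct:
+  //   step[0] = dct_const_round_shift((in[0] + in[2]) * cospi_16_64);
+  //   step[1] = dct_const_round_shift((in[0] - in[2]) * cospi_16_64);
+  //   step[2] = dct_const_round_shift(in[1] * cospi_24_64 - in[3] * cospi_8_64);
+  //   step[3] = dct_const_round_shift(in[1] * cospi_8_64 + in[3] * cospi_24_64);
+  //   out[0] = step[0] + step[3];  out[1] = step[1] + step[2];
+  //   out[2] = step[1] - step[2];  out[3] = step[0] - step[3];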
+ in = _mm_loadl_epi64((__m128i *)input); + + // Construct i3, i1, i3, i1, i2, i0, i2, i0 + in = _mm_shufflelo_epi16(in, 0xd8); + in = _mm_unpacklo_epi32(in, in); + + // Stage 1 + in = _mm_madd_epi16(in, c1); + in = _mm_add_epi32(in, rounding); + in = _mm_srai_epi32(in, DCT_CONST_BITS); + in = _mm_packs_epi32(in, zero); + + // Stage 2 + temp = _mm_shufflelo_epi16(in, 0x9c); + in = _mm_shufflelo_epi16(in, 0xc9); + in = _mm_unpacklo_epi64(temp, in); + in = _mm_madd_epi16(in, c2); + in = _mm_packs_epi32(in, zero); + + // Store results + _mm_storel_epi64((__m128i *)output, in); +} + +#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ + const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ + const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ + const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ + const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \ + const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \ + \ + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ + \ + out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ + out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ + out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ + out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ + out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ + out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ + out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ + out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ + } + +#define TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ + const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ + \ + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ + \ + out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ + out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ + out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ + out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ + out4 = out5 = out6 = out7 = zero; \ + } + +#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ + const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ + \ + in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ + in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ + in2 = _mm_unpacklo_epi32(tr0_2, tr0_3); /* i5 i4 */ \ + in3 = _mm_unpackhi_epi32(tr0_2, tr0_3); /* i7 i6 */ \ + } + +// Define Macro for multiplying elements by constants and adding them together. 
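+// Per 32-bit lane, each madd/add/srai triple in the macro computes
+//   res = ROUND_POWER_OF_TWO(a * c0 + b * c1, DCT_CONST_BITS)
+// i.e. dct_const_round_shift() on a two-term dot product: the unpack
+// interleaves the two 16-bit inputs and pair_set_epi16() packs the two
+// constants.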
+#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \ + cst0, cst1, cst2, cst3, res0, res1, res2, res3) \ + { \ + tmp0 = _mm_madd_epi16(lo_0, cst0); \ + tmp1 = _mm_madd_epi16(hi_0, cst0); \ + tmp2 = _mm_madd_epi16(lo_0, cst1); \ + tmp3 = _mm_madd_epi16(hi_0, cst1); \ + tmp4 = _mm_madd_epi16(lo_1, cst2); \ + tmp5 = _mm_madd_epi16(hi_1, cst2); \ + tmp6 = _mm_madd_epi16(lo_1, cst3); \ + tmp7 = _mm_madd_epi16(hi_1, cst3); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + tmp4 = _mm_add_epi32(tmp4, rounding); \ + tmp5 = _mm_add_epi32(tmp5, rounding); \ + tmp6 = _mm_add_epi32(tmp6, rounding); \ + tmp7 = _mm_add_epi32(tmp7, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ + tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ + \ + res0 = _mm_packs_epi32(tmp0, tmp1); \ + res1 = _mm_packs_epi32(tmp2, tmp3); \ + res2 = _mm_packs_epi32(tmp4, tmp5); \ + res3 = _mm_packs_epi32(tmp6, tmp7); \ + } + +#define IDCT8x8_1D \ + /* Stage1 */ \ + { \ + const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ + const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ + const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ + const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ + \ + MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \ + stg1_1, stg1_2, stg1_3, stp1_4, \ + stp1_7, stp1_5, stp1_6) \ + } \ + \ + /* Stage2 */ \ + { \ + const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ + const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \ + const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ + const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \ + \ + MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \ + stg2_1, stg2_2, stg2_3, stp2_0, \ + stp2_1, stp2_2, stp2_3) \ + \ + stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + \ + stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \ + \ + tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ + tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ + tmp2 = _mm_madd_epi16(lo_56, stg2_0); \ + tmp3 = _mm_madd_epi16(hi_56, stg2_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + } \ + \ + /* Stage4 */ \ + in0 = _mm_adds_epi16(stp1_0, stp2_7); \ + in1 = _mm_adds_epi16(stp1_1, stp1_6); \ + in2 = _mm_adds_epi16(stp1_2, stp1_5); \ + in3 = _mm_adds_epi16(stp1_3, stp2_4); \ + in4 = _mm_subs_epi16(stp1_3, stp2_4); \ + in5 = _mm_subs_epi16(stp1_2, 
stp1_5); \ + in6 = _mm_subs_epi16(stp1_1, stp1_6); \ + in7 = _mm_subs_epi16(stp1_0, stp2_7); + +void vp9_short_idct8x8_sse2(int16_t *input, int16_t *output, int pitch) { + const int half_pitch = pitch >> 1; + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1<<4); + const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i; + + // Load input data. + in0 = _mm_load_si128((__m128i *)input); + in1 = _mm_load_si128((__m128i *)(input + 8 * 1)); + in2 = _mm_load_si128((__m128i *)(input + 8 * 2)); + in3 = _mm_load_si128((__m128i *)(input + 8 * 3)); + in4 = _mm_load_si128((__m128i *)(input + 8 * 4)); + in5 = _mm_load_si128((__m128i *)(input + 8 * 5)); + in6 = _mm_load_si128((__m128i *)(input + 8 * 6)); + in7 = _mm_load_si128((__m128i *)(input + 8 * 7)); + + // 2-D + for (i = 0; i < 2; i++) { + // 8x8 Transpose is copied from vp9_short_fdct8x8_sse2() + TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + + // 4-stage 1D idct8x8 + IDCT8x8_1D + } + + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + + in0 = _mm_srai_epi16(in0, 5); + in1 = _mm_srai_epi16(in1, 5); + in2 = _mm_srai_epi16(in2, 5); + in3 = _mm_srai_epi16(in3, 5); + in4 = _mm_srai_epi16(in4, 5); + in5 = _mm_srai_epi16(in5, 5); + in6 = _mm_srai_epi16(in6, 5); + in7 = _mm_srai_epi16(in7, 5); + + // Store results + _mm_store_si128((__m128i *)output, in0); + _mm_store_si128((__m128i *)(output + half_pitch * 1), in1); + _mm_store_si128((__m128i *)(output + half_pitch * 2), in2); + _mm_store_si128((__m128i *)(output + half_pitch * 3), in3); + _mm_store_si128((__m128i *)(output + half_pitch * 4), in4); + _mm_store_si128((__m128i *)(output + half_pitch * 5), in5); + _mm_store_si128((__m128i *)(output + half_pitch * 6), in6); + _mm_store_si128((__m128i *)(output + half_pitch * 7), in7); +} + +void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) { + const int half_pitch = pitch >> 1; + const __m128i zero = _mm_setzero_si128(); + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1<<4); + const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg2_0 = pair_set_epi16(cospi_16_64, 
cospi_16_64); + const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + // Rows. Load 4-row input data. + in0 = _mm_load_si128((__m128i *)input); + in1 = _mm_load_si128((__m128i *)(input + 8 * 1)); + in2 = _mm_load_si128((__m128i *)(input + 8 * 2)); + in3 = _mm_load_si128((__m128i *)(input + 8 * 3)); + + // 8x4 Transpose + TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3) + + // Stage1 + { + const __m128i lo_17 = _mm_unpackhi_epi16(in0, in3); + const __m128i lo_35 = _mm_unpackhi_epi16(in1, in2); + + tmp0 = _mm_madd_epi16(lo_17, stg1_0); + tmp2 = _mm_madd_epi16(lo_17, stg1_1); + tmp4 = _mm_madd_epi16(lo_35, stg1_2); + tmp6 = _mm_madd_epi16(lo_35, stg1_3); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + + stp1_4 = _mm_packs_epi32(tmp0, zero); + stp1_7 = _mm_packs_epi32(tmp2, zero); + stp1_5 = _mm_packs_epi32(tmp4, zero); + stp1_6 = _mm_packs_epi32(tmp6, zero); + } + + // Stage2 + { + const __m128i lo_04 = _mm_unpacklo_epi16(in0, in2); + const __m128i lo_26 = _mm_unpacklo_epi16(in1, in3); + + tmp0 = _mm_madd_epi16(lo_04, stg2_0); + tmp2 = _mm_madd_epi16(lo_04, stg2_1); + tmp4 = _mm_madd_epi16(lo_26, stg2_2); + tmp6 = _mm_madd_epi16(lo_26, stg2_3); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + + stp2_0 = _mm_packs_epi32(tmp0, zero); + stp2_1 = _mm_packs_epi32(tmp2, zero); + stp2_2 = _mm_packs_epi32(tmp4, zero); + stp2_3 = _mm_packs_epi32(tmp6, zero); + + stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); + stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); + stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); + stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); + } + + // Stage3 + { + const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); + stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); + stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); + stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); + stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); + + tmp0 = _mm_madd_epi16(lo_56, stg3_0); + tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + + stp1_5 = _mm_packs_epi32(tmp0, zero); + stp1_6 = _mm_packs_epi32(tmp2, zero); + } + + // Stage4 + in0 = _mm_adds_epi16(stp1_0, stp2_7); + in1 = _mm_adds_epi16(stp1_1, stp1_6); + in2 = _mm_adds_epi16(stp1_2, stp1_5); + in3 = _mm_adds_epi16(stp1_3, stp2_4); + in4 = _mm_subs_epi16(stp1_3, stp2_4); + in5 = _mm_subs_epi16(stp1_2, stp1_5); + in6 = _mm_subs_epi16(stp1_1, stp1_6); + in7 = 
_mm_subs_epi16(stp1_0, stp2_7); + + // Columns. 4x8 Transpose + TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7) + + // 1D idct8x8 + IDCT8x8_1D + + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + + in0 = _mm_srai_epi16(in0, 5); + in1 = _mm_srai_epi16(in1, 5); + in2 = _mm_srai_epi16(in2, 5); + in3 = _mm_srai_epi16(in3, 5); + in4 = _mm_srai_epi16(in4, 5); + in5 = _mm_srai_epi16(in5, 5); + in6 = _mm_srai_epi16(in6, 5); + in7 = _mm_srai_epi16(in7, 5); + + // Store results + _mm_store_si128((__m128i *)output, in0); + _mm_store_si128((__m128i *)(output + half_pitch * 1), in1); + _mm_store_si128((__m128i *)(output + half_pitch * 2), in2); + _mm_store_si128((__m128i *)(output + half_pitch * 3), in3); + _mm_store_si128((__m128i *)(output + half_pitch * 4), in4); + _mm_store_si128((__m128i *)(output + half_pitch * 5), in5); + _mm_store_si128((__m128i *)(output + half_pitch * 6), in6); + _mm_store_si128((__m128i *)(output + half_pitch * 7), in7); +} + +#define IDCT16x16_1D \ + /* Stage2 */ \ + { \ + const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \ + const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \ + const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7); \ + const __m128i hi_9_7 = _mm_unpackhi_epi16(in9, in7); \ + const __m128i lo_5_11 = _mm_unpacklo_epi16(in5, in11); \ + const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \ + const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \ + const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \ + \ + MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \ + stg2_0, stg2_1, stg2_2, stg2_3, \ + stp2_8, stp2_15, stp2_9, stp2_14) \ + \ + MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \ + stg2_4, stg2_5, stg2_6, stg2_7, \ + stp2_10, stp2_13, stp2_11, stp2_12) \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_2_14 = _mm_unpacklo_epi16(in2, in14); \ + const __m128i hi_2_14 = _mm_unpackhi_epi16(in2, in14); \ + const __m128i lo_10_6 = _mm_unpacklo_epi16(in10, in6); \ + const __m128i hi_10_6 = _mm_unpackhi_epi16(in10, in6); \ + \ + MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \ + stg3_0, stg3_1, stg3_2, stg3_3, \ + stp1_4, stp1_7, stp1_5, stp1_6) \ + \ + stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ + \ + stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ + } \ + \ + /* Stage4 */ \ + { \ + const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); \ + const __m128i hi_0_8 = _mm_unpackhi_epi16(in0, in8); \ + const __m128i lo_4_12 = _mm_unpacklo_epi16(in4, in12); \ + const __m128i hi_4_12 = _mm_unpackhi_epi16(in4, in12); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + \ + MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \ + stg4_0, stg4_1, stg4_2, 
stg4_3, \ + stp2_0, stp2_1, stp2_2, stp2_3) \ + \ + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \ + stg4_4, stg4_5, stg4_6, stg4_7, \ + stp2_9, stp2_14, stp2_10, stp2_13) \ + } \ + \ + /* Stage5 */ \ + { \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ + \ + stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ + } \ + \ + /* Stage6 */ \ + { \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ + stg6_0, stg4_0, stg6_0, stg4_0, \ + stp2_10, stp2_13, stp2_11, stp2_12) \ + } + +void vp9_short_idct16x16_sse2(int16_t *input, int16_t *output, int pitch) { + const int half_pitch = pitch >> 1; + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1<<5); + const __m128i zero = _mm_setzero_si128(); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, 
-cospi_20_64); + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero, + in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero, + in10 = zero, in11 = zero, in12 = zero, in13 = zero, + in14 = zero, in15 = zero; + __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero, + l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero, + l12 = zero, l13 = zero, l14 = zero, l15 = zero; + __m128i r0 = zero, r1 = zero, r2 = zero, r3 = zero, r4 = zero, r5 = zero, + r6 = zero, r7 = zero, r8 = zero, r9 = zero, r10 = zero, r11 = zero, + r12 = zero, r13 = zero, r14 = zero, r15 = zero; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_8_0, stp1_12_0; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i; + + // We work on a 8x16 block each time, and loop 4 times for 2-D 16x16 idct. + for (i = 0; i < 4; i++) { + // 1-D idct + if (i < 2) { + if (i == 1) input += 128; + + // Load input data. 
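+      // Commentary: input + 8 * k walks half-rows, so in0..in7 receive the
+      // left 8 columns of rows 0..7 and in8..in15 the right 8 columns; the
+      // two 8x8 transposes below then hand IDCT16x16_1D one transposed
+      // 8x16 strip.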
+ in0 = _mm_load_si128((__m128i *)input); + in8 = _mm_load_si128((__m128i *)(input + 8 * 1)); + in1 = _mm_load_si128((__m128i *)(input + 8 * 2)); + in9 = _mm_load_si128((__m128i *)(input + 8 * 3)); + in2 = _mm_load_si128((__m128i *)(input + 8 * 4)); + in10 = _mm_load_si128((__m128i *)(input + 8 * 5)); + in3 = _mm_load_si128((__m128i *)(input + 8 * 6)); + in11 = _mm_load_si128((__m128i *)(input + 8 * 7)); + in4 = _mm_load_si128((__m128i *)(input + 8 * 8)); + in12 = _mm_load_si128((__m128i *)(input + 8 * 9)); + in5 = _mm_load_si128((__m128i *)(input + 8 * 10)); + in13 = _mm_load_si128((__m128i *)(input + 8 * 11)); + in6 = _mm_load_si128((__m128i *)(input + 8 * 12)); + in14 = _mm_load_si128((__m128i *)(input + 8 * 13)); + in7 = _mm_load_si128((__m128i *)(input + 8 * 14)); + in15 = _mm_load_si128((__m128i *)(input + 8 * 15)); + + TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + } + + if (i == 2) { + TRANSPOSE_8X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4, + in5, in6, in7); + TRANSPOSE_8X8(r0, r1, r2, r3, r4, r5, r6, r7, in8, in9, in10, in11, in12, + in13, in14, in15); + } + + if (i == 3) { + TRANSPOSE_8X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE_8X8(r8, r9, r10, r11, r12, r13, r14, r15, in8, in9, in10, in11, + in12, in13, in14, in15); + } + + IDCT16x16_1D + + // Stage7 + if (i == 0) { + // Left 8x16 + l0 = _mm_add_epi16(stp2_0, stp1_15); + l1 = _mm_add_epi16(stp2_1, stp1_14); + l2 = _mm_add_epi16(stp2_2, stp2_13); + l3 = _mm_add_epi16(stp2_3, stp2_12); + l4 = _mm_add_epi16(stp2_4, stp2_11); + l5 = _mm_add_epi16(stp2_5, stp2_10); + l6 = _mm_add_epi16(stp2_6, stp1_9); + l7 = _mm_add_epi16(stp2_7, stp1_8); + l8 = _mm_sub_epi16(stp2_7, stp1_8); + l9 = _mm_sub_epi16(stp2_6, stp1_9); + l10 = _mm_sub_epi16(stp2_5, stp2_10); + l11 = _mm_sub_epi16(stp2_4, stp2_11); + l12 = _mm_sub_epi16(stp2_3, stp2_12); + l13 = _mm_sub_epi16(stp2_2, stp2_13); + l14 = _mm_sub_epi16(stp2_1, stp1_14); + l15 = _mm_sub_epi16(stp2_0, stp1_15); + } else if (i == 1) { + // Right 8x16 + r0 = _mm_add_epi16(stp2_0, stp1_15); + r1 = _mm_add_epi16(stp2_1, stp1_14); + r2 = _mm_add_epi16(stp2_2, stp2_13); + r3 = _mm_add_epi16(stp2_3, stp2_12); + r4 = _mm_add_epi16(stp2_4, stp2_11); + r5 = _mm_add_epi16(stp2_5, stp2_10); + r6 = _mm_add_epi16(stp2_6, stp1_9); + r7 = _mm_add_epi16(stp2_7, stp1_8); + r8 = _mm_sub_epi16(stp2_7, stp1_8); + r9 = _mm_sub_epi16(stp2_6, stp1_9); + r10 = _mm_sub_epi16(stp2_5, stp2_10); + r11 = _mm_sub_epi16(stp2_4, stp2_11); + r12 = _mm_sub_epi16(stp2_3, stp2_12); + r13 = _mm_sub_epi16(stp2_2, stp2_13); + r14 = _mm_sub_epi16(stp2_1, stp1_14); + r15 = _mm_sub_epi16(stp2_0, stp1_15); + } else { + // 2-D + in0 = _mm_add_epi16(stp2_0, stp1_15); + in1 = _mm_add_epi16(stp2_1, stp1_14); + in2 = _mm_add_epi16(stp2_2, stp2_13); + in3 = _mm_add_epi16(stp2_3, stp2_12); + in4 = _mm_add_epi16(stp2_4, stp2_11); + in5 = _mm_add_epi16(stp2_5, stp2_10); + in6 = _mm_add_epi16(stp2_6, stp1_9); + in7 = _mm_add_epi16(stp2_7, stp1_8); + in8 = _mm_sub_epi16(stp2_7, stp1_8); + in9 = _mm_sub_epi16(stp2_6, stp1_9); + in10 = _mm_sub_epi16(stp2_5, stp2_10); + in11 = _mm_sub_epi16(stp2_4, stp2_11); + in12 = _mm_sub_epi16(stp2_3, stp2_12); + in13 = _mm_sub_epi16(stp2_2, stp2_13); + in14 = _mm_sub_epi16(stp2_1, stp1_14); + in15 = _mm_sub_epi16(stp2_0, stp1_15); + + // Final rounding and shift + in0 = 
_mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + in8 = _mm_adds_epi16(in8, final_rounding); + in9 = _mm_adds_epi16(in9, final_rounding); + in10 = _mm_adds_epi16(in10, final_rounding); + in11 = _mm_adds_epi16(in11, final_rounding); + in12 = _mm_adds_epi16(in12, final_rounding); + in13 = _mm_adds_epi16(in13, final_rounding); + in14 = _mm_adds_epi16(in14, final_rounding); + in15 = _mm_adds_epi16(in15, final_rounding); + + in0 = _mm_srai_epi16(in0, 6); + in1 = _mm_srai_epi16(in1, 6); + in2 = _mm_srai_epi16(in2, 6); + in3 = _mm_srai_epi16(in3, 6); + in4 = _mm_srai_epi16(in4, 6); + in5 = _mm_srai_epi16(in5, 6); + in6 = _mm_srai_epi16(in6, 6); + in7 = _mm_srai_epi16(in7, 6); + in8 = _mm_srai_epi16(in8, 6); + in9 = _mm_srai_epi16(in9, 6); + in10 = _mm_srai_epi16(in10, 6); + in11 = _mm_srai_epi16(in11, 6); + in12 = _mm_srai_epi16(in12, 6); + in13 = _mm_srai_epi16(in13, 6); + in14 = _mm_srai_epi16(in14, 6); + in15 = _mm_srai_epi16(in15, 6); + + // Store results + _mm_store_si128((__m128i *)output, in0); + _mm_store_si128((__m128i *)(output + half_pitch * 1), in1); + _mm_store_si128((__m128i *)(output + half_pitch * 2), in2); + _mm_store_si128((__m128i *)(output + half_pitch * 3), in3); + _mm_store_si128((__m128i *)(output + half_pitch * 4), in4); + _mm_store_si128((__m128i *)(output + half_pitch * 5), in5); + _mm_store_si128((__m128i *)(output + half_pitch * 6), in6); + _mm_store_si128((__m128i *)(output + half_pitch * 7), in7); + _mm_store_si128((__m128i *)(output + half_pitch * 8), in8); + _mm_store_si128((__m128i *)(output + half_pitch * 9), in9); + _mm_store_si128((__m128i *)(output + half_pitch * 10), in10); + _mm_store_si128((__m128i *)(output + half_pitch * 11), in11); + _mm_store_si128((__m128i *)(output + half_pitch * 12), in12); + _mm_store_si128((__m128i *)(output + half_pitch * 13), in13); + _mm_store_si128((__m128i *)(output + half_pitch * 14), in14); + _mm_store_si128((__m128i *)(output + half_pitch * 15), in15); + + output += 8; + } + } +} + +void vp9_short_idct10_16x16_sse2(int16_t *input, int16_t *output, int pitch) { + const int half_pitch = pitch >> 1; + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1<<5); + const __m128i zero = _mm_setzero_si128(); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_2 = 
pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero, + in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero, + in10 = zero, in11 = zero, in12 = zero, in13 = zero, + in14 = zero, in15 = zero; + __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero, + l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero, + l12 = zero, l13 = zero, l14 = zero, l15 = zero; + + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_8_0, stp1_12_0; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i; + + // 1-D idct. Load input data. + in0 = _mm_load_si128((__m128i *)input); + in8 = _mm_load_si128((__m128i *)(input + 8 * 1)); + in1 = _mm_load_si128((__m128i *)(input + 8 * 2)); + in9 = _mm_load_si128((__m128i *)(input + 8 * 3)); + in2 = _mm_load_si128((__m128i *)(input + 8 * 4)); + in10 = _mm_load_si128((__m128i *)(input + 8 * 5)); + in3 = _mm_load_si128((__m128i *)(input + 8 * 6)); + in11 = _mm_load_si128((__m128i *)(input + 8 * 7)); + + TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11); + + // Stage2 + { + const __m128i lo_1_15 = _mm_unpackhi_epi16(in0, in11); + const __m128i lo_9_7 = _mm_unpackhi_epi16(in8, in3); + const __m128i lo_5_11 = _mm_unpackhi_epi16(in2, in9); + const __m128i lo_13_3 = _mm_unpackhi_epi16(in10, in1); + + tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); + tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); + tmp4 = _mm_madd_epi16(lo_9_7, stg2_2); + tmp6 = _mm_madd_epi16(lo_9_7, stg2_3); + tmp1 = _mm_madd_epi16(lo_5_11, stg2_4); + tmp3 = _mm_madd_epi16(lo_5_11, stg2_5); + tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); + tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp5 = _mm_add_epi32(tmp5, rounding); + tmp7 = _mm_add_epi32(tmp7, rounding); + + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); + tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); + + stp2_8 = _mm_packs_epi32(tmp0, zero); + stp2_15 = _mm_packs_epi32(tmp2, zero); + stp2_9 = _mm_packs_epi32(tmp4, zero); + stp2_14 = _mm_packs_epi32(tmp6, zero); + + stp2_10 = _mm_packs_epi32(tmp1, zero); + stp2_13 = _mm_packs_epi32(tmp3, zero); + stp2_11 = _mm_packs_epi32(tmp5, zero); + stp2_12 = _mm_packs_epi32(tmp7, zero); + } + + // Stage3 + { + const __m128i lo_2_14 = _mm_unpacklo_epi16(in1, in11); + const __m128i lo_10_6 = _mm_unpacklo_epi16(in9, in3); + + 
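+    // Only the low-half unpacks are needed in this idct10 path: with the
+    // nonzero coefficients confined to the top-left of the block, the upper
+    // lanes are zero, so each stage runs at half width and packs against
+    // `zero`.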
tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); + tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); + tmp4 = _mm_madd_epi16(lo_10_6, stg3_2); + tmp6 = _mm_madd_epi16(lo_10_6, stg3_3); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + + stp1_4 = _mm_packs_epi32(tmp0, zero); + stp1_7 = _mm_packs_epi32(tmp2, zero); + stp1_5 = _mm_packs_epi32(tmp4, zero); + stp1_6 = _mm_packs_epi32(tmp6, zero); + + stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); + + stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); + } + + // Stage4 + { + const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); + const __m128i lo_4_12 = _mm_unpacklo_epi16(in2, in10); + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + + tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); + tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); + tmp4 = _mm_madd_epi16(lo_4_12, stg4_2); + tmp6 = _mm_madd_epi16(lo_4_12, stg4_3); + tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); + tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); + tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); + tmp7 = _mm_madd_epi16(lo_10_13, stg4_7); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp5 = _mm_add_epi32(tmp5, rounding); + tmp7 = _mm_add_epi32(tmp7, rounding); + + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); + tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); + + stp2_0 = _mm_packs_epi32(tmp0, zero); + stp2_1 = _mm_packs_epi32(tmp2, zero); + stp2_2 = _mm_packs_epi32(tmp4, zero); + stp2_3 = _mm_packs_epi32(tmp6, zero); + stp2_9 = _mm_packs_epi32(tmp1, zero); + stp2_14 = _mm_packs_epi32(tmp3, zero); + stp2_10 = _mm_packs_epi32(tmp5, zero); + stp2_13 = _mm_packs_epi32(tmp7, zero); + + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); + } + + // Stage5 and Stage6 + { + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); + + stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); + stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); + + stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); + stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); + } + + // Stage6 + { + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + 
const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); + + tmp1 = _mm_madd_epi16(lo_6_5, stg4_1); + tmp3 = _mm_madd_epi16(lo_6_5, stg4_0); + tmp0 = _mm_madd_epi16(lo_10_13, stg6_0); + tmp2 = _mm_madd_epi16(lo_10_13, stg4_0); + tmp4 = _mm_madd_epi16(lo_11_12, stg6_0); + tmp6 = _mm_madd_epi16(lo_11_12, stg4_0); + + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + + stp1_5 = _mm_packs_epi32(tmp1, zero); + stp1_6 = _mm_packs_epi32(tmp3, zero); + stp2_10 = _mm_packs_epi32(tmp0, zero); + stp2_13 = _mm_packs_epi32(tmp2, zero); + stp2_11 = _mm_packs_epi32(tmp4, zero); + stp2_12 = _mm_packs_epi32(tmp6, zero); + + stp2_0 = _mm_add_epi16(stp1_0, stp2_7); + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); + stp2_3 = _mm_add_epi16(stp1_3, stp2_4); + stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); + stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); + } + + // Stage7. Left 8x16 only. + l0 = _mm_add_epi16(stp2_0, stp1_15); + l1 = _mm_add_epi16(stp2_1, stp1_14); + l2 = _mm_add_epi16(stp2_2, stp2_13); + l3 = _mm_add_epi16(stp2_3, stp2_12); + l4 = _mm_add_epi16(stp2_4, stp2_11); + l5 = _mm_add_epi16(stp2_5, stp2_10); + l6 = _mm_add_epi16(stp2_6, stp1_9); + l7 = _mm_add_epi16(stp2_7, stp1_8); + l8 = _mm_sub_epi16(stp2_7, stp1_8); + l9 = _mm_sub_epi16(stp2_6, stp1_9); + l10 = _mm_sub_epi16(stp2_5, stp2_10); + l11 = _mm_sub_epi16(stp2_4, stp2_11); + l12 = _mm_sub_epi16(stp2_3, stp2_12); + l13 = _mm_sub_epi16(stp2_2, stp2_13); + l14 = _mm_sub_epi16(stp2_1, stp1_14); + l15 = _mm_sub_epi16(stp2_0, stp1_15); + + // 2-D idct. We do 2 8x16 blocks. 
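+  // Commentary: only l0..l15 carry data after the row pass (the idct10 case
+  // has its nonzero coefficients in the top-left 4x4 only), so each column
+  // pass below zeroes in8..in15 and reuses the full IDCT16x16_1D macro.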
+ for (i = 0; i < 2; i++) { + if (i == 0) + TRANSPOSE_4X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4, + in5, in6, in7); + + if (i == 1) + TRANSPOSE_4X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3, + in4, in5, in6, in7); + + in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero; + + IDCT16x16_1D + + // Stage7 + in0 = _mm_add_epi16(stp2_0, stp1_15); + in1 = _mm_add_epi16(stp2_1, stp1_14); + in2 = _mm_add_epi16(stp2_2, stp2_13); + in3 = _mm_add_epi16(stp2_3, stp2_12); + in4 = _mm_add_epi16(stp2_4, stp2_11); + in5 = _mm_add_epi16(stp2_5, stp2_10); + in6 = _mm_add_epi16(stp2_6, stp1_9); + in7 = _mm_add_epi16(stp2_7, stp1_8); + in8 = _mm_sub_epi16(stp2_7, stp1_8); + in9 = _mm_sub_epi16(stp2_6, stp1_9); + in10 = _mm_sub_epi16(stp2_5, stp2_10); + in11 = _mm_sub_epi16(stp2_4, stp2_11); + in12 = _mm_sub_epi16(stp2_3, stp2_12); + in13 = _mm_sub_epi16(stp2_2, stp2_13); + in14 = _mm_sub_epi16(stp2_1, stp1_14); + in15 = _mm_sub_epi16(stp2_0, stp1_15); + + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + in8 = _mm_adds_epi16(in8, final_rounding); + in9 = _mm_adds_epi16(in9, final_rounding); + in10 = _mm_adds_epi16(in10, final_rounding); + in11 = _mm_adds_epi16(in11, final_rounding); + in12 = _mm_adds_epi16(in12, final_rounding); + in13 = _mm_adds_epi16(in13, final_rounding); + in14 = _mm_adds_epi16(in14, final_rounding); + in15 = _mm_adds_epi16(in15, final_rounding); + + in0 = _mm_srai_epi16(in0, 6); + in1 = _mm_srai_epi16(in1, 6); + in2 = _mm_srai_epi16(in2, 6); + in3 = _mm_srai_epi16(in3, 6); + in4 = _mm_srai_epi16(in4, 6); + in5 = _mm_srai_epi16(in5, 6); + in6 = _mm_srai_epi16(in6, 6); + in7 = _mm_srai_epi16(in7, 6); + in8 = _mm_srai_epi16(in8, 6); + in9 = _mm_srai_epi16(in9, 6); + in10 = _mm_srai_epi16(in10, 6); + in11 = _mm_srai_epi16(in11, 6); + in12 = _mm_srai_epi16(in12, 6); + in13 = _mm_srai_epi16(in13, 6); + in14 = _mm_srai_epi16(in14, 6); + in15 = _mm_srai_epi16(in15, 6); + + // Store results + _mm_store_si128((__m128i *)output, in0); + _mm_store_si128((__m128i *)(output + half_pitch * 1), in1); + _mm_store_si128((__m128i *)(output + half_pitch * 2), in2); + _mm_store_si128((__m128i *)(output + half_pitch * 3), in3); + _mm_store_si128((__m128i *)(output + half_pitch * 4), in4); + _mm_store_si128((__m128i *)(output + half_pitch * 5), in5); + _mm_store_si128((__m128i *)(output + half_pitch * 6), in6); + _mm_store_si128((__m128i *)(output + half_pitch * 7), in7); + _mm_store_si128((__m128i *)(output + half_pitch * 8), in8); + _mm_store_si128((__m128i *)(output + half_pitch * 9), in9); + _mm_store_si128((__m128i *)(output + half_pitch * 10), in10); + _mm_store_si128((__m128i *)(output + half_pitch * 11), in11); + _mm_store_si128((__m128i *)(output + half_pitch * 12), in12); + _mm_store_si128((__m128i *)(output + half_pitch * 13), in13); + _mm_store_si128((__m128i *)(output + half_pitch * 14), in14); + _mm_store_si128((__m128i *)(output + half_pitch * 15), in15); + output += 8; + } +} + +void vp9_short_idct32x32_sse2(int16_t *input, int16_t *output, int pitch) { + const int half_pitch = pitch >> 1; + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1<<5); + + // idct 
constants for each stage + const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); + const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); + const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); + const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); + const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); + const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); + const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); + const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); + const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, + in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23, + in24, in25, in26, in27, in28, in29, in30, in31; + __m128i col[128]; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, + stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, + stp1_30, stp1_31; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, + stp2_16, stp2_17, stp2_18, stp2_19, 
stp2_20, stp2_21, stp2_22, + stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, + stp2_30, stp2_31; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i, j; + + // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct. + for (i = 0; i < 8; i++) { + if (i < 4) { + // First 1-D idct + // Load input data. + in0 = _mm_load_si128((__m128i *)input); + in8 = _mm_load_si128((__m128i *)(input + 8 * 1)); + in16 = _mm_load_si128((__m128i *)(input + 8 * 2)); + in24 = _mm_load_si128((__m128i *)(input + 8 * 3)); + in1 = _mm_load_si128((__m128i *)(input + 8 * 4)); + in9 = _mm_load_si128((__m128i *)(input + 8 * 5)); + in17 = _mm_load_si128((__m128i *)(input + 8 * 6)); + in25 = _mm_load_si128((__m128i *)(input + 8 * 7)); + in2 = _mm_load_si128((__m128i *)(input + 8 * 8)); + in10 = _mm_load_si128((__m128i *)(input + 8 * 9)); + in18 = _mm_load_si128((__m128i *)(input + 8 * 10)); + in26 = _mm_load_si128((__m128i *)(input + 8 * 11)); + in3 = _mm_load_si128((__m128i *)(input + 8 * 12)); + in11 = _mm_load_si128((__m128i *)(input + 8 * 13)); + in19 = _mm_load_si128((__m128i *)(input + 8 * 14)); + in27 = _mm_load_si128((__m128i *)(input + 8 * 15)); + + in4 = _mm_load_si128((__m128i *)(input + 8 * 16)); + in12 = _mm_load_si128((__m128i *)(input + 8 * 17)); + in20 = _mm_load_si128((__m128i *)(input + 8 * 18)); + in28 = _mm_load_si128((__m128i *)(input + 8 * 19)); + in5 = _mm_load_si128((__m128i *)(input + 8 * 20)); + in13 = _mm_load_si128((__m128i *)(input + 8 * 21)); + in21 = _mm_load_si128((__m128i *)(input + 8 * 22)); + in29 = _mm_load_si128((__m128i *)(input + 8 * 23)); + in6 = _mm_load_si128((__m128i *)(input + 8 * 24)); + in14 = _mm_load_si128((__m128i *)(input + 8 * 25)); + in22 = _mm_load_si128((__m128i *)(input + 8 * 26)); + in30 = _mm_load_si128((__m128i *)(input + 8 * 27)); + in7 = _mm_load_si128((__m128i *)(input + 8 * 28)); + in15 = _mm_load_si128((__m128i *)(input + 8 * 29)); + in23 = _mm_load_si128((__m128i *)(input + 8 * 30)); + in31 = _mm_load_si128((__m128i *)(input + 8 * 31)); + + input += 256; + + // Transpose 32x8 block to 8x32 block + TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17, + in18, in19, in20, in21, in22, in23); + TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25, + in26, in27, in28, in29, in30, in31); + } else { + // Second 1-D idct + j = i - 4; + + // Transpose 32x8 block to 8x32 block + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], + col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4, + in5, in6, in7); + j += 4; + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], + col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10, + in11, in12, in13, in14, in15); + j += 4; + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], + col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18, + in19, in20, in21, in22, in23); + j += 4; + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], + col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27, + in28, in29, in30, in31); + } + + // Stage1 + { + const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); + const __m128i hi_1_31 = 
_mm_unpackhi_epi16(in1, in31); + const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); + const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); + + const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); + const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); + const __m128i lo_25_7= _mm_unpacklo_epi16(in25, in7); + const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); + + const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); + const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); + const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); + const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); + + const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); + const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); + const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); + const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); + + MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, + stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, + stp1_17, stp1_30) + MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, + stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, + stp1_19, stp1_28) + MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, + stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, + stp1_21, stp1_26) + MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, + stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, + stp1_23, stp1_24) + } + + // Stage2 + { + const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); + const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); + const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); + const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); + + const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); + const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); + const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); + const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); + + MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, + stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, + stp2_14) + MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, + stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, + stp2_11, stp2_12) + + stp2_16 = _mm_add_epi16(stp1_16, stp1_17); + stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); + stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); + stp2_19 = _mm_add_epi16(stp1_19, stp1_18); + + stp2_20 = _mm_add_epi16(stp1_20, stp1_21); + stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); + stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); + stp2_23 = _mm_add_epi16(stp1_23, stp1_22); + + stp2_24 = _mm_add_epi16(stp1_24, stp1_25); + stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); + stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); + stp2_27 = _mm_add_epi16(stp1_27, stp1_26); + + stp2_28 = _mm_add_epi16(stp1_28, stp1_29); + stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); + stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); + stp2_31 = _mm_add_epi16(stp1_31, stp1_30); + } + + // Stage3 + { + const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); + const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); + const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); + const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); + + const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); + const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); + + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); + const __m128i lo_22_25 = 
_mm_unpacklo_epi16(stp2_22, stp2_25); + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); + + MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, + stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, + stp1_6) + + stp1_8 = _mm_add_epi16(stp2_8, stp2_9); + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); + stp1_12 = _mm_add_epi16(stp2_12, stp2_13); + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); + + MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, + stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, + stp1_18, stp1_29) + MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, + stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, + stp1_22, stp1_25) + + stp1_16 = stp2_16; + stp1_31 = stp2_31; + stp1_19 = stp2_19; + stp1_20 = stp2_20; + stp1_23 = stp2_23; + stp1_24 = stp2_24; + stp1_27 = stp2_27; + stp1_28 = stp2_28; + } + + // Stage4 + { + const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); + const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); + const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); + const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); + + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); + + MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, + stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, + stp2_2, stp2_3) + + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); + + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, + stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, + stp2_10, stp2_13) + + stp2_8 = stp1_8; + stp2_15 = stp1_15; + stp2_11 = stp1_11; + stp2_12 = stp1_12; + + stp2_16 = _mm_add_epi16(stp1_16, stp1_19); + stp2_17 = _mm_add_epi16(stp1_17, stp1_18); + stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); + stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); + stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); + stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); + stp2_22 = _mm_add_epi16(stp1_22, stp1_21); + stp2_23 = _mm_add_epi16(stp1_23, stp1_20); + + stp2_24 = _mm_add_epi16(stp1_24, stp1_27); + stp2_25 = _mm_add_epi16(stp1_25, stp1_26); + stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); + stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); + stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); + stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); + stp2_30 = _mm_add_epi16(stp1_29, stp1_30); + stp2_31 = _mm_add_epi16(stp1_28, stp1_31); + } + + // Stage5 + { + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); + + const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); + const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); + + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); + + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); + stp1_2 
= _mm_sub_epi16(stp2_1, stp2_2); + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); + + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + + stp1_5 = _mm_packs_epi32(tmp0, tmp1); + stp1_6 = _mm_packs_epi32(tmp2, tmp3); + + stp1_4 = stp2_4; + stp1_7 = stp2_7; + + stp1_8 = _mm_add_epi16(stp2_8, stp2_11); + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); + stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); + stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); + stp1_15 = _mm_add_epi16(stp2_15, stp2_12); + + stp1_16 = stp2_16; + stp1_17 = stp2_17; + + MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, + stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, + stp1_19, stp1_28) + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, + stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, + stp1_21, stp1_26) + + stp1_22 = stp2_22; + stp1_23 = stp2_23; + stp1_24 = stp2_24; + stp1_25 = stp2_25; + stp1_30 = stp2_30; + stp1_31 = stp2_31; + } + + // Stage6 + { + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); + + stp2_0 = _mm_add_epi16(stp1_0, stp1_7); + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); + stp2_3 = _mm_add_epi16(stp1_3, stp1_4); + stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); + stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); + + stp2_8 = stp1_8; + stp2_9 = stp1_9; + stp2_14 = stp1_14; + stp2_15 = stp1_15; + + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, + stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, + stp2_13, stp2_11, stp2_12) + + stp2_16 = _mm_add_epi16(stp1_16, stp1_23); + stp2_17 = _mm_add_epi16(stp1_17, stp1_22); + stp2_18 = _mm_add_epi16(stp1_18, stp1_21); + stp2_19 = _mm_add_epi16(stp1_19, stp1_20); + stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); + stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); + stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); + stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); + + stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); + stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); + stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); + stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); + stp2_28 = _mm_add_epi16(stp1_27, stp1_28); + stp2_29 = _mm_add_epi16(stp1_26, stp1_29); + stp2_30 = _mm_add_epi16(stp1_25, stp1_30); + stp2_31 = _mm_add_epi16(stp1_24, stp1_31); + } + + // Stage7 + { + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); + + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); + const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, 
stp2_24); + const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); + + stp1_0 = _mm_add_epi16(stp2_0, stp2_15); + stp1_1 = _mm_add_epi16(stp2_1, stp2_14); + stp1_2 = _mm_add_epi16(stp2_2, stp2_13); + stp1_3 = _mm_add_epi16(stp2_3, stp2_12); + stp1_4 = _mm_add_epi16(stp2_4, stp2_11); + stp1_5 = _mm_add_epi16(stp2_5, stp2_10); + stp1_6 = _mm_add_epi16(stp2_6, stp2_9); + stp1_7 = _mm_add_epi16(stp2_7, stp2_8); + stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); + stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); + stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); + stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); + stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); + stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); + stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); + stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); + + stp1_16 = stp2_16; + stp1_17 = stp2_17; + stp1_18 = stp2_18; + stp1_19 = stp2_19; + + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, + stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, + stp1_21, stp1_26) + MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, + stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, + stp1_23, stp1_24) + + stp1_28 = stp2_28; + stp1_29 = stp2_29; + stp1_30 = stp2_30; + stp1_31 = stp2_31; + } + + // final stage + if (i < 4) { + // 1_D: Store 32 intermediate results for each 8x32 block. + col[i * 32 + 0] = _mm_add_epi16(stp1_0, stp1_31); + col[i * 32 + 1] = _mm_add_epi16(stp1_1, stp1_30); + col[i * 32 + 2] = _mm_add_epi16(stp1_2, stp1_29); + col[i * 32 + 3] = _mm_add_epi16(stp1_3, stp1_28); + col[i * 32 + 4] = _mm_add_epi16(stp1_4, stp1_27); + col[i * 32 + 5] = _mm_add_epi16(stp1_5, stp1_26); + col[i * 32 + 6] = _mm_add_epi16(stp1_6, stp1_25); + col[i * 32 + 7] = _mm_add_epi16(stp1_7, stp1_24); + col[i * 32 + 8] = _mm_add_epi16(stp1_8, stp1_23); + col[i * 32 + 9] = _mm_add_epi16(stp1_9, stp1_22); + col[i * 32 + 10] = _mm_add_epi16(stp1_10, stp1_21); + col[i * 32 + 11] = _mm_add_epi16(stp1_11, stp1_20); + col[i * 32 + 12] = _mm_add_epi16(stp1_12, stp1_19); + col[i * 32 + 13] = _mm_add_epi16(stp1_13, stp1_18); + col[i * 32 + 14] = _mm_add_epi16(stp1_14, stp1_17); + col[i * 32 + 15] = _mm_add_epi16(stp1_15, stp1_16); + col[i * 32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); + col[i * 32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); + col[i * 32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); + col[i * 32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); + col[i * 32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); + col[i * 32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); + col[i * 32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); + col[i * 32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); + col[i * 32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); + col[i * 32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); + col[i * 32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); + col[i * 32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); + col[i * 32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); + col[i * 32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); + col[i * 32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); + col[i * 32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); + } else { + // 2_D: Calculate the results and store them to destination. 
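      // [Annotation, not part of this commit: in this 2-D pass the code
      // first forms the 32 output rows from the stage-7 butterflies, then
      // applies the "final rounding and shift" before storing — per 16-bit
      // lane this is ROUND_POWER_OF_TWO(x, 6), assuming final_rounding was
      // initialized earlier in the function as _mm_set1_epi16(1 << 5). A
      // scalar sketch of that step, with round_shift6 a hypothetical
      // helper name (int16_t from <stdint.h>):
      //
      //   static int16_t round_shift6(int x) {
      //     return (int16_t)((x + (1 << 5)) >> 6);  /* round(x / 64) */
      //   }
      //
      // One caveat: _mm_adds_epi16 below is a saturating add, so lanes
      // near INT16_MAX clamp rather than wrap; the scalar sketch ignores
      // that edge case.]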
+ in0 = _mm_add_epi16(stp1_0, stp1_31); + in1 = _mm_add_epi16(stp1_1, stp1_30); + in2 = _mm_add_epi16(stp1_2, stp1_29); + in3 = _mm_add_epi16(stp1_3, stp1_28); + in4 = _mm_add_epi16(stp1_4, stp1_27); + in5 = _mm_add_epi16(stp1_5, stp1_26); + in6 = _mm_add_epi16(stp1_6, stp1_25); + in7 = _mm_add_epi16(stp1_7, stp1_24); + in8 = _mm_add_epi16(stp1_8, stp1_23); + in9 = _mm_add_epi16(stp1_9, stp1_22); + in10 = _mm_add_epi16(stp1_10, stp1_21); + in11 = _mm_add_epi16(stp1_11, stp1_20); + in12 = _mm_add_epi16(stp1_12, stp1_19); + in13 = _mm_add_epi16(stp1_13, stp1_18); + in14 = _mm_add_epi16(stp1_14, stp1_17); + in15 = _mm_add_epi16(stp1_15, stp1_16); + in16 = _mm_sub_epi16(stp1_15, stp1_16); + in17 = _mm_sub_epi16(stp1_14, stp1_17); + in18 = _mm_sub_epi16(stp1_13, stp1_18); + in19 = _mm_sub_epi16(stp1_12, stp1_19); + in20 = _mm_sub_epi16(stp1_11, stp1_20); + in21 = _mm_sub_epi16(stp1_10, stp1_21); + in22 = _mm_sub_epi16(stp1_9, stp1_22); + in23 = _mm_sub_epi16(stp1_8, stp1_23); + in24 = _mm_sub_epi16(stp1_7, stp1_24); + in25 = _mm_sub_epi16(stp1_6, stp1_25); + in26 = _mm_sub_epi16(stp1_5, stp1_26); + in27 = _mm_sub_epi16(stp1_4, stp1_27); + in28 = _mm_sub_epi16(stp1_3, stp1_28); + in29 = _mm_sub_epi16(stp1_2, stp1_29); + in30 = _mm_sub_epi16(stp1_1, stp1_30); + in31 = _mm_sub_epi16(stp1_0, stp1_31); + + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + in8 = _mm_adds_epi16(in8, final_rounding); + in9 = _mm_adds_epi16(in9, final_rounding); + in10 = _mm_adds_epi16(in10, final_rounding); + in11 = _mm_adds_epi16(in11, final_rounding); + in12 = _mm_adds_epi16(in12, final_rounding); + in13 = _mm_adds_epi16(in13, final_rounding); + in14 = _mm_adds_epi16(in14, final_rounding); + in15 = _mm_adds_epi16(in15, final_rounding); + in16 = _mm_adds_epi16(in16, final_rounding); + in17 = _mm_adds_epi16(in17, final_rounding); + in18 = _mm_adds_epi16(in18, final_rounding); + in19 = _mm_adds_epi16(in19, final_rounding); + in20 = _mm_adds_epi16(in20, final_rounding); + in21 = _mm_adds_epi16(in21, final_rounding); + in22 = _mm_adds_epi16(in22, final_rounding); + in23 = _mm_adds_epi16(in23, final_rounding); + in24 = _mm_adds_epi16(in24, final_rounding); + in25 = _mm_adds_epi16(in25, final_rounding); + in26 = _mm_adds_epi16(in26, final_rounding); + in27 = _mm_adds_epi16(in27, final_rounding); + in28 = _mm_adds_epi16(in28, final_rounding); + in29 = _mm_adds_epi16(in29, final_rounding); + in30 = _mm_adds_epi16(in30, final_rounding); + in31 = _mm_adds_epi16(in31, final_rounding); + + in0 = _mm_srai_epi16(in0, 6); + in1 = _mm_srai_epi16(in1, 6); + in2 = _mm_srai_epi16(in2, 6); + in3 = _mm_srai_epi16(in3, 6); + in4 = _mm_srai_epi16(in4, 6); + in5 = _mm_srai_epi16(in5, 6); + in6 = _mm_srai_epi16(in6, 6); + in7 = _mm_srai_epi16(in7, 6); + in8 = _mm_srai_epi16(in8, 6); + in9 = _mm_srai_epi16(in9, 6); + in10 = _mm_srai_epi16(in10, 6); + in11 = _mm_srai_epi16(in11, 6); + in12 = _mm_srai_epi16(in12, 6); + in13 = _mm_srai_epi16(in13, 6); + in14 = _mm_srai_epi16(in14, 6); + in15 = _mm_srai_epi16(in15, 6); + in16 = _mm_srai_epi16(in16, 6); + in17 = _mm_srai_epi16(in17, 6); + in18 = _mm_srai_epi16(in18, 6); + in19 = _mm_srai_epi16(in19, 6); + in20 = _mm_srai_epi16(in20, 6); + in21 = 
_mm_srai_epi16(in21, 6); + in22 = _mm_srai_epi16(in22, 6); + in23 = _mm_srai_epi16(in23, 6); + in24 = _mm_srai_epi16(in24, 6); + in25 = _mm_srai_epi16(in25, 6); + in26 = _mm_srai_epi16(in26, 6); + in27 = _mm_srai_epi16(in27, 6); + in28 = _mm_srai_epi16(in28, 6); + in29 = _mm_srai_epi16(in29, 6); + in30 = _mm_srai_epi16(in30, 6); + in31 = _mm_srai_epi16(in31, 6); + + // Store results + _mm_store_si128((__m128i *)output, in0); + _mm_store_si128((__m128i *)(output + half_pitch * 1), in1); + _mm_store_si128((__m128i *)(output + half_pitch * 2), in2); + _mm_store_si128((__m128i *)(output + half_pitch * 3), in3); + _mm_store_si128((__m128i *)(output + half_pitch * 4), in4); + _mm_store_si128((__m128i *)(output + half_pitch * 5), in5); + _mm_store_si128((__m128i *)(output + half_pitch * 6), in6); + _mm_store_si128((__m128i *)(output + half_pitch * 7), in7); + _mm_store_si128((__m128i *)(output + half_pitch * 8), in8); + _mm_store_si128((__m128i *)(output + half_pitch * 9), in9); + _mm_store_si128((__m128i *)(output + half_pitch * 10), in10); + _mm_store_si128((__m128i *)(output + half_pitch * 11), in11); + _mm_store_si128((__m128i *)(output + half_pitch * 12), in12); + _mm_store_si128((__m128i *)(output + half_pitch * 13), in13); + _mm_store_si128((__m128i *)(output + half_pitch * 14), in14); + _mm_store_si128((__m128i *)(output + half_pitch * 15), in15); + _mm_store_si128((__m128i *)(output + half_pitch * 16), in16); + _mm_store_si128((__m128i *)(output + half_pitch * 17), in17); + _mm_store_si128((__m128i *)(output + half_pitch * 18), in18); + _mm_store_si128((__m128i *)(output + half_pitch * 19), in19); + _mm_store_si128((__m128i *)(output + half_pitch * 20), in20); + _mm_store_si128((__m128i *)(output + half_pitch * 21), in21); + _mm_store_si128((__m128i *)(output + half_pitch * 22), in22); + _mm_store_si128((__m128i *)(output + half_pitch * 23), in23); + _mm_store_si128((__m128i *)(output + half_pitch * 24), in24); + _mm_store_si128((__m128i *)(output + half_pitch * 25), in25); + _mm_store_si128((__m128i *)(output + half_pitch * 26), in26); + _mm_store_si128((__m128i *)(output + half_pitch * 27), in27); + _mm_store_si128((__m128i *)(output + half_pitch * 28), in28); + _mm_store_si128((__m128i *)(output + half_pitch * 29), in29); + _mm_store_si128((__m128i *)(output + half_pitch * 30), in30); + _mm_store_si128((__m128i *)(output + half_pitch * 31), in31); + + output += 8; + } + } +} +#endif diff --git a/vp9/common/x86/vp9_idct_x86.h b/vp9/common/x86/vp9_idct_x86.h index 8320cf87d..bd66d8c72 100644 --- a/vp9/common/x86/vp9_idct_x86.h +++ b/vp9/common/x86/vp9_idct_x86.h @@ -20,23 +20,10 @@ */ #if HAVE_MMX -extern prototype_idct(vp9_short_idct4x4llm_1_mmx); -extern prototype_idct(vp9_short_idct4x4llm_mmx); -extern prototype_idct_scalar_add(vp9_dc_only_idct_add_mmx); - extern prototype_second_order(vp9_short_inv_walsh4x4_mmx); extern prototype_second_order(vp9_short_inv_walsh4x4_1_mmx); #if !CONFIG_RUNTIME_CPU_DETECT -#undef vp9_idct_idct1 -#define vp9_idct_idct1 vp9_short_idct4x4llm_1_mmx - -#undef vp9_idct_idct16 -#define vp9_idct_idct16 vp9_short_idct4x4llm_mmx - -#undef vp9_idct_idct1_scalar_add -#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_mmx - #undef vp9_idct_iwalsh16 #define vp9_idct_iwalsh16 vp9_short_inv_walsh4x4_mmx diff --git a/vp9/common/x86/vp9_idctllm_mmx.asm b/vp9/common/x86/vp9_idctllm_mmx.asm deleted file mode 100644 index 15e81addb..000000000 --- a/vp9/common/x86/vp9_idctllm_mmx.asm +++ /dev/null @@ -1,241 +0,0 @@ -; -; Copyright (c) 2012 The WebM project authors. 
All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -align 16 -x_s1sqr2: times 4 dw 0x8A8C -align 16 -x_c1sqr2less1: times 4 dw 0x4E7B -align 16 -pw_16: times 4 dw 16 - -SECTION .text - - -; /**************************************************************************** -; * Notes: -; * -; * This implementation makes use of 16 bit fixed point version of two multiply -; * constants: -; * 1. sqrt(2) * cos (pi/8) -; * 2. sqrt(2) * sin (pi/8) -; * Because the first constant is bigger than 1, to maintain the same 16 bit -; * fixed point precision as the second one, we use a trick of -; * x * a = x + x*(a-1) -; * so -; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1). -; * -; * For the second constant, because of the 16bit version is 35468, which -; * is bigger than 32768, in signed 16 bit multiply, it becomes a negative -; * number. -; * (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x -; * -; **************************************************************************/ - -INIT_MMX - -;void short_idct4x4llm_mmx(short *input, short *output, int pitch) -cglobal short_idct4x4llm_mmx, 3,3,0, inp, out, pit - mova m0, [inpq +0] - mova m1, [inpq +8] - - mova m2, [inpq+16] - mova m3, [inpq+24] - - psubw m0, m2 ; b1= 0-2 - paddw m2, m2 ; - - mova m5, m1 - paddw m2, m0 ; a1 =0+2 - - pmulhw m5, [x_s1sqr2] ; - paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2) - - mova m7, m3 ; - pmulhw m7, [x_c1sqr2less1] ; - - paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2) - psubw m7, m5 ; c1 - - mova m5, m1 - mova m4, m3 - - pmulhw m5, [x_c1sqr2less1] - paddw m5, m1 - - pmulhw m3, [x_s1sqr2] - paddw m3, m4 - - paddw m3, m5 ; d1 - mova m6, m2 ; a1 - - mova m4, m0 ; b1 - paddw m2, m3 ;0 - - paddw m4, m7 ;1 - psubw m0, m7 ;2 - - psubw m6, m3 ;3 - - mova m1, m2 ; 03 02 01 00 - mova m3, m4 ; 23 22 21 20 - - punpcklwd m1, m0 ; 11 01 10 00 - punpckhwd m2, m0 ; 13 03 12 02 - - punpcklwd m3, m6 ; 31 21 30 20 - punpckhwd m4, m6 ; 33 23 32 22 - - mova m0, m1 ; 11 01 10 00 - mova m5, m2 ; 13 03 12 02 - - punpckldq m0, m3 ; 30 20 10 00 - punpckhdq m1, m3 ; 31 21 11 01 - - punpckldq m2, m4 ; 32 22 12 02 - punpckhdq m5, m4 ; 33 23 13 03 - - mova m3, m5 ; 33 23 13 03 - - psubw m0, m2 ; b1= 0-2 - paddw m2, m2 ; - - mova m5, m1 - paddw m2, m0 ; a1 =0+2 - - pmulhw m5, [x_s1sqr2] ; - paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2) - - mova m7, m3 ; - pmulhw m7, [x_c1sqr2less1] ; - - paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2) - psubw m7, m5 ; c1 - - mova m5, m1 - mova m4, m3 - - pmulhw m5, [x_c1sqr2less1] - paddw m5, m1 - - pmulhw m3, [x_s1sqr2] - paddw m3, m4 - - paddw m3, m5 ; d1 - paddw m0, [pw_16] - - paddw m2, [pw_16] - mova m6, m2 ; a1 - - mova m4, m0 ; b1 - paddw m2, m3 ;0 - - paddw m4, m7 ;1 - psubw m0, m7 ;2 - - psubw m6, m3 ;3 - psraw m2, 5 - - psraw m0, 5 - psraw m4, 5 - - psraw m6, 5 - - mova m1, m2 ; 03 02 01 00 - mova m3, m4 ; 23 22 21 20 - - punpcklwd m1, m0 ; 11 01 10 00 - punpckhwd m2, m0 ; 13 03 12 02 - - punpcklwd m3, m6 ; 31 21 30 20 - punpckhwd m4, m6 ; 33 23 32 22 - - mova m0, m1 ; 11 01 10 00 - mova m5, m2 ; 13 03 12 02 - - punpckldq m0, m3 ; 30 20 10 00 - punpckhdq m1, m3 ; 31 21 11 01 - - punpckldq m2, m4 ; 32 22 12 02 - punpckhdq m5, m4 ; 33 23 13 03 - - mova [outq], 
m0 - - mova [outq+r2], m1 - mova [outq+pitq*2], m2 - - add outq, pitq - mova [outq+pitq*2], m5 - RET - -;void short_idct4x4llm_1_mmx(short *input, short *output, int pitch) -cglobal short_idct4x4llm_1_mmx,3,3,0,inp,out,pit - movh m0, [inpq] - paddw m0, [pw_16] - psraw m0, 5 - punpcklwd m0, m0 - punpckldq m0, m0 - - mova [outq], m0 - mova [outq+pitq], m0 - - mova [outq+pitq*2], m0 - add r1, r2 - - mova [outq+pitq*2], m0 - RET - - -;void dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride) -cglobal dc_only_idct_add_mmx, 4,5,0,in_dc,pred,dst,pit,stride -%if ARCH_X86_64 - movsxd strideq, dword stridem -%else - mov strideq, stridem -%endif - pxor m0, m0 - - movh m5, in_dcq ; dc - paddw m5, [pw_16] - - psraw m5, 5 - - punpcklwd m5, m5 - punpckldq m5, m5 - - movh m1, [predq] - punpcklbw m1, m0 - paddsw m1, m5 - packuswb m1, m0 ; pack and unpack to saturate - movh [dstq], m1 - - movh m2, [predq+pitq] - punpcklbw m2, m0 - paddsw m2, m5 - packuswb m2, m0 ; pack and unpack to saturate - movh [dstq+strideq], m2 - - movh m3, [predq+2*pitq] - punpcklbw m3, m0 - paddsw m3, m5 - packuswb m3, m0 ; pack and unpack to saturate - movh [dstq+2*strideq], m3 - - add dstq, strideq - add predq, pitq - movh m4, [predq+2*pitq] - punpcklbw m4, m0 - paddsw m4, m5 - packuswb m4, m0 ; pack and unpack to saturate - movh [dstq+2*strideq], m4 - RET - diff --git a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c index d319bf2d5..08447a62d 100644 --- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c +++ b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c @@ -26,14 +26,16 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, DECLARE_ALIGNED(16, unsigned char, flat2_op[7][16]); DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][16]); - DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); + DECLARE_ALIGNED(16, unsigned char, flat_op[3][16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq[3][16]); + + DECLARE_ALIGNED(16, unsigned char, ap[8][16]); + DECLARE_ALIGNED(16, unsigned char, aq[8][16]); + + __m128i mask, hev, flat, flat2; const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); __m128i p7, p6, p5; __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; __m128i q5, q6, q7; @@ -58,12 +60,24 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); + + _mm_store_si128((__m128i *)ap[4], p4); + _mm_store_si128((__m128i *)ap[3], p3); + _mm_store_si128((__m128i *)ap[2], p2); + _mm_store_si128((__m128i *)ap[1], p1); + _mm_store_si128((__m128i *)ap[0], p0); + _mm_store_si128((__m128i *)aq[4], q4); + _mm_store_si128((__m128i *)aq[3], q3); + _mm_store_si128((__m128i *)aq[2], q2); + _mm_store_si128((__m128i *)aq[1], q1); + _mm_store_si128((__m128i *)aq[0], q0); + + { const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); - const __m128i one = _mm_set1_epi8(1); const __m128i fe = _mm_set1_epi8(0xfe); const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), @@ -95,246 +109,8 @@ 
void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, mask = _mm_max_epu8(work, mask); mask = _mm_subs_epu8(mask, limit); mask = _mm_cmpeq_epi8(mask, zero); - - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0), - _mm_subs_epu8(p0, p2)), - _mm_or_si128(_mm_subs_epu8(q2, q0), - _mm_subs_epu8(q0, q2))); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0), - _mm_subs_epu8(p0, p3)), - _mm_or_si128(_mm_subs_epu8(q3, q0), - _mm_subs_epu8(q0, q3))); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0), - _mm_subs_epu8(p0, p4)), - _mm_or_si128(_mm_subs_epu8(q4, q0), - _mm_subs_epu8(q0, q4))); - flat = _mm_max_epu8(work, flat); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); } - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // calculate flat2 - p4 = _mm_loadu_si128((__m128i *)(s - 8 * p)); - p3 = _mm_loadu_si128((__m128i *)(s - 7 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 6 * p)); - p1 = _mm_loadu_si128((__m128i *)(s - 5 * p)); -// p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); -// q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 4 * p)); - q2 = _mm_loadu_si128((__m128i *)(s + 5 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 6 * p)); - q4 = _mm_loadu_si128((__m128i *)(s + 7 * p)); - - { - const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), - _mm_subs_epu8(p0, p1)); - const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), - _mm_subs_epu8(q0, q1)); - const __m128i one = _mm_set1_epi8(1); - __m128i work; - flat2 = _mm_max_epu8(abs_p1p0, abs_q1q0); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0), - _mm_subs_epu8(p0, p2)), - _mm_or_si128(_mm_subs_epu8(q2, q0), - _mm_subs_epu8(q0, q2))); - flat2 = _mm_max_epu8(work, flat2); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0), - _mm_subs_epu8(p0, p3)), - _mm_or_si128(_mm_subs_epu8(q3, q0), - _mm_subs_epu8(q0, q3))); - flat2 = _mm_max_epu8(work, flat2); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0), - _mm_subs_epu8(p0, p4)), - _mm_or_si128(_mm_subs_epu8(q4, q0), - _mm_subs_epu8(q0, q4))); - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - } - // calculate flat2 - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - { - const __m128i four = _mm_set1_epi16(4); - unsigned char *src = s; - i = 0; - do { - __m128i workp_a, workp_b, workp_shft; - p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero); - p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); - p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); - p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); - p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); - q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); - q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); - q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); - q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); - q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero); - - workp_a = _mm_add_epi16(_mm_add_epi16(p4, p3), _mm_add_epi16(p2, p1)); - workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); - workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p4); - workp_shft = 
_mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op2[i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op1[i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q2); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op0[i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq0[i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q4); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq1[i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q4); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq2[i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - src += 8; - } while (++i < 2); - } - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // wide flat - // TODO(slavarnway): interleave with the flat pixel calculations (see above) - { - const __m128i eight = _mm_set1_epi16(8); - unsigned char *src = s; - int i = 0; - do { - __m128i workp_a, workp_b, workp_shft; - p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 8 * p)), zero); - p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 7 * p)), zero); - p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 6 * p)), zero); - p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero); - p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); - p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); - p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); - p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); - q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); - q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); - q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); - q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); - q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero); - q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 5 * p)), zero); - q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 6 * p)), zero); - q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 7 * p)), zero); - - - workp_a = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7); // p7 * 7 - workp_a = _mm_add_epi16(_mm_slli_epi16(p6, 1), workp_a); - workp_b = _mm_add_epi16(_mm_add_epi16(p5, p4), _mm_add_epi16(p3, p2)); - workp_a = _mm_add_epi16(_mm_add_epi16(p1, p0), workp_a); - workp_b = _mm_add_epi16(_mm_add_epi16(q0, eight), workp_b); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4); - _mm_storel_epi64((__m128i *)&flat2_op[6][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p5); - workp_b = 
_mm_add_epi16(_mm_sub_epi16(workp_b, p6), q1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4); - _mm_storel_epi64((__m128i *)&flat2_op[5][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p4); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p5), q2); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4); - _mm_storel_epi64((__m128i *)&flat2_op[4][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p4), q3); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4); - _mm_storel_epi64((__m128i *)&flat2_op[3][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p2); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p3), q4); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4); - _mm_storel_epi64((__m128i *)&flat2_op[2][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p1); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p2), q5); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4); - _mm_storel_epi64((__m128i *)&flat2_op[1][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p0); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), q6); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4); - _mm_storel_epi64((__m128i *)&flat2_op[0][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), q0); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q7); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p6), q1); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q7); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p5), q2); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q7); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q2), q7); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q4); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q3), q7); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q5); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q4), q7); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q6); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q5), q7); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8], - 
_mm_packus_epi16(workp_shft, workp_shft)); - - src += 8; - } while (++i < 2); - } - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // lp filter { const __m128i t4 = _mm_set1_epi8(4); @@ -345,14 +121,10 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, const __m128i t1 = _mm_set1_epi8(0x1); const __m128i t7f = _mm_set1_epi8(0x7f); - __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), - t80); - __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), - t80); - __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), - t80); - __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), - t80); + __m128i ps1 = _mm_xor_si128(p1, t80); + __m128i ps0 = _mm_xor_si128(p0, t80); + __m128i qs0 = _mm_xor_si128(q0, t80); + __m128i qs1 = _mm_xor_si128(q1, t80); __m128i filt; __m128i work_a; __m128i filter1, filter2; @@ -374,6 +146,7 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, work_a = _mm_and_si128(work_a, te0); filter1 = _mm_and_si128(filter1, t1f); filter1 = _mm_or_si128(filter1, work_a); + qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); /* Filter2 >> 3 */ work_a = _mm_cmpgt_epi8(zero, filter2); @@ -381,6 +154,7 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, work_a = _mm_and_si128(work_a, te0); filter2 = _mm_and_si128(filter2, t1f); filter2 = _mm_or_si128(filter2, work_a); + ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); /* filt >> 1 */ filt = _mm_adds_epi8(filter1, t1); @@ -389,20 +163,265 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, work_a = _mm_and_si128(work_a, t80); filt = _mm_and_si128(filt, t7f); filt = _mm_or_si128(filt, work_a); - filt = _mm_andnot_si128(hev, filt); - - ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + // loopfilter done + + { + __m128i work; + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0), + _mm_subs_epu8(p0, p2)), + _mm_or_si128(_mm_subs_epu8(q2, q0), + _mm_subs_epu8(q0, q2))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0), + _mm_subs_epu8(p0, p3)), + _mm_or_si128(_mm_subs_epu8(q3, q0), + _mm_subs_epu8(q0, q3))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0), + _mm_subs_epu8(p0, p4)), + _mm_or_si128(_mm_subs_epu8(q4, q0), + _mm_subs_epu8(q0, q4))); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + p5 = _mm_loadu_si128((__m128i *)(s - 6 * p)); + q5 = _mm_loadu_si128((__m128i *)(s + 5 * p)); + flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0), + _mm_subs_epu8(p0, p5)), + _mm_or_si128(_mm_subs_epu8(q5, q0), + _mm_subs_epu8(q0, q5))); + _mm_store_si128((__m128i *)ap[5], p5); + _mm_store_si128((__m128i *)aq[5], q5); + flat2 = _mm_max_epu8(work, flat2); + p6 = _mm_loadu_si128((__m128i *)(s - 7 * p)); + q6 = _mm_loadu_si128((__m128i *)(s + 6 * p)); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0), + _mm_subs_epu8(p0, p6)), + _mm_or_si128(_mm_subs_epu8(q6, q0), + _mm_subs_epu8(q0, q6))); + _mm_store_si128((__m128i *)ap[6], p6); + _mm_store_si128((__m128i *)aq[6], q6); + flat2 = _mm_max_epu8(work, flat2); + + p7 = _mm_loadu_si128((__m128i *)(s - 8 * p)); + q7 = _mm_loadu_si128((__m128i *)(s + 7 * p)); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0), + _mm_subs_epu8(p0, p7)), + 
_mm_or_si128(_mm_subs_epu8(q7, q0), + _mm_subs_epu8(q0, q7))); + _mm_store_si128((__m128i *)ap[7], p7); + _mm_store_si128((__m128i *)aq[7], q7); + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + { + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i temp_flat2 = flat2; + unsigned char *src = s; + int i = 0; + do { + __m128i workp_shft; + __m128i a, b, c; + + unsigned int off = i * 8; + p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[7] + off)), zero); + p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6] + off)), zero); + p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5] + off)), zero); + p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4] + off)), zero); + p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3] + off)), zero); + p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2] + off)), zero); + p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1] + off)), zero); + p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0] + off)), zero); + q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0] + off)), zero); + q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1] + off)), zero); + q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2] + off)), zero); + q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3] + off)), zero); + q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4] + off)), zero); + q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5] + off)), zero); + q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6] + off)), zero); + q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7] + off)), zero); + + c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7); // p7 * 7 + c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c)); + + b = _mm_add_epi16(_mm_add_epi16(p3, four), _mm_add_epi16(p3, p2)); + a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1)); + a = _mm_add_epi16(_mm_add_epi16(p0, q0), a); + + _mm_storel_epi64((__m128i *)&flat_op[2][i*8], + _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) + , b)); + + c = _mm_add_epi16(_mm_add_epi16(p5, eight), c); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[6][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q1, a); + b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1); + _mm_storel_epi64((__m128i *)&flat_op[1][i*8], + _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) + , b)); + + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[5][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q2, a); + b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0); + _mm_storel_epi64((__m128i *)&flat_op[0][i*8], + _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) + , b)); + + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[4][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q3, a); + b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0); + _mm_storel_epi64((__m128i *)&flat_oq[0][i*8], + _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) + , b)); + + c = 
_mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[3][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + b = _mm_add_epi16(q3, b); + b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1); + _mm_storel_epi64((__m128i *)&flat_oq[1][i*8], + _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) + , b)); + + c = _mm_add_epi16(q4, c); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[2][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + b = _mm_add_epi16(q3, b); + b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2); + _mm_storel_epi64((__m128i *)&flat_oq[2][i*8], + _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) + , b)); + a = _mm_add_epi16(q5, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[1][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q6, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[0][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + temp_flat2 = _mm_srli_si128(temp_flat2, 8); + src += 8; + } while (++i < 2); + } + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + work_a = _mm_load_si128((__m128i *)ap[2]); + p2 = _mm_load_si128((__m128i *)flat_op[2]); + work_a = _mm_andnot_si128(flat, work_a); + p2 = _mm_and_si128(flat, p2); + p2 = _mm_or_si128(work_a, p2); + _mm_store_si128((__m128i *)flat_op[2], p2); + + p1 = _mm_load_si128((__m128i *)flat_op[1]); + work_a = 
_mm_andnot_si128(flat, ps1); + p1 = _mm_and_si128(flat, p1); + p1 = _mm_or_si128(work_a, p1); + _mm_store_si128((__m128i *)flat_op[1], p1); + + p0 = _mm_load_si128((__m128i *)flat_op[0]); + work_a = _mm_andnot_si128(flat, ps0); + p0 = _mm_and_si128(flat, p0); + p0 = _mm_or_si128(work_a, p0); + _mm_store_si128((__m128i *)flat_op[0], p0); + + q0 = _mm_load_si128((__m128i *)flat_oq[0]); + work_a = _mm_andnot_si128(flat, qs0); + q0 = _mm_and_si128(flat, q0); + q0 = _mm_or_si128(work_a, q0); + _mm_store_si128((__m128i *)flat_oq[0], q0); + + q1 = _mm_load_si128((__m128i *)flat_oq[1]); + work_a = _mm_andnot_si128(flat, qs1); + q1 = _mm_and_si128(flat, q1); + q1 = _mm_or_si128(work_a, q1); + _mm_store_si128((__m128i *)flat_oq[1], q1); + + work_a = _mm_load_si128((__m128i *)aq[2]); + q2 = _mm_load_si128((__m128i *)flat_oq[2]); + work_a = _mm_andnot_si128(flat, work_a); + q2 = _mm_and_si128(flat, q2); + q2 = _mm_or_si128(work_a, q2); + _mm_store_si128((__m128i *)flat_oq[2], q2); // write out op6 - op3 { unsigned char *dst = (s - 7 * p); for (i = 6; i > 2; i--) { __m128i flat2_output; - work_a = _mm_loadu_si128((__m128i *)dst); + work_a = _mm_load_si128((__m128i *)ap[i]); flat2_output = _mm_load_si128((__m128i *)flat2_op[i]); work_a = _mm_andnot_si128(flat2, work_a); flat2_output = _mm_and_si128(flat2, flat2_output); @@ -412,62 +431,42 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, } } - work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p2 = _mm_load_si128((__m128i *)flat_op2); - work_a = _mm_andnot_si128(flat, work_a); - p2 = _mm_and_si128(flat, p2); - work_a = _mm_or_si128(work_a, p2); + work_a = _mm_load_si128((__m128i *)flat_op[2]); p2 = _mm_load_si128((__m128i *)flat2_op[2]); work_a = _mm_andnot_si128(flat2, work_a); p2 = _mm_and_si128(flat2, p2); p2 = _mm_or_si128(work_a, p2); _mm_storeu_si128((__m128i *)(s - 3 * p), p2); - p1 = _mm_load_si128((__m128i *)flat_op1); - work_a = _mm_andnot_si128(flat, ps1); - p1 = _mm_and_si128(flat, p1); - work_a = _mm_or_si128(work_a, p1); + work_a = _mm_load_si128((__m128i *)flat_op[1]); p1 = _mm_load_si128((__m128i *)flat2_op[1]); work_a = _mm_andnot_si128(flat2, work_a); p1 = _mm_and_si128(flat2, p1); p1 = _mm_or_si128(work_a, p1); _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - p0 = _mm_load_si128((__m128i *)flat_op0); - work_a = _mm_andnot_si128(flat, ps0); - p0 = _mm_and_si128(flat, p0); - work_a = _mm_or_si128(work_a, p0); + work_a = _mm_load_si128((__m128i *)flat_op[0]); p0 = _mm_load_si128((__m128i *)flat2_op[0]); work_a = _mm_andnot_si128(flat2, work_a); p0 = _mm_and_si128(flat2, p0); p0 = _mm_or_si128(work_a, p0); _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - q0 = _mm_load_si128((__m128i *)flat_oq0); - work_a = _mm_andnot_si128(flat, qs0); - q0 = _mm_and_si128(flat, q0); - work_a = _mm_or_si128(work_a, q0); + work_a = _mm_load_si128((__m128i *)flat_oq[0]); q0 = _mm_load_si128((__m128i *)flat2_oq[0]); work_a = _mm_andnot_si128(flat2, work_a); q0 = _mm_and_si128(flat2, q0); q0 = _mm_or_si128(work_a, q0); _mm_storeu_si128((__m128i *)(s - 0 * p), q0); - q1 = _mm_load_si128((__m128i *)flat_oq1); - work_a = _mm_andnot_si128(flat, qs1); - q1 = _mm_and_si128(flat, q1); - work_a = _mm_or_si128(work_a, q1); + work_a = _mm_load_si128((__m128i *)flat_oq[1]); q1 = _mm_load_si128((__m128i *)flat2_oq[1]); work_a = _mm_andnot_si128(flat2, work_a); q1 = _mm_and_si128(flat2, q1); q1 = _mm_or_si128(work_a, q1); _mm_storeu_si128((__m128i *)(s + 1 * p), q1); - work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q2 = _mm_load_si128((__m128i *)flat_oq2); - 
work_a = _mm_andnot_si128(flat, work_a); - q2 = _mm_and_si128(flat, q2); - work_a = _mm_or_si128(work_a, q2); + work_a = _mm_load_si128((__m128i *)flat_oq[2]); q2 = _mm_load_si128((__m128i *)flat2_oq[2]); work_a = _mm_andnot_si128(flat2, work_a); q2 = _mm_and_si128(flat2, q2); @@ -479,7 +478,7 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, unsigned char *dst = (s + 3 * p); for (i = 3; i < 7; i++) { __m128i flat2_output; - work_a = _mm_loadu_si128((__m128i *)dst); + work_a = _mm_load_si128((__m128i *)aq[i]); flat2_output = _mm_load_si128((__m128i *)flat2_oq[i]); work_a = _mm_andnot_si128(flat2, work_a); flat2_output = _mm_and_si128(flat2, flat2_output); @@ -504,7 +503,7 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); __m128i mask, hev, flat; const __m128i zero = _mm_set1_epi16(0); - __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; + __m128i p3, p2, p1, p0, q0, q1, q2, q3; const unsigned int extended_thresh = _thresh[0] * 0x01010101u; const unsigned int extended_limit = _limit[0] * 0x01010101u; const unsigned int extended_blimit = _blimit[0] * 0x01010101u; @@ -515,7 +514,6 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, const __m128i blimit = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0); - p4 = _mm_loadu_si128((__m128i *)(s - 5 * p)); p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); @@ -524,7 +522,6 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); - q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); { const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); @@ -573,11 +570,6 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3))); flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0), - _mm_subs_epu8(p0, p4)), - _mm_or_si128(_mm_subs_epu8(q4, q0), - _mm_subs_epu8(q0, q4))); - flat = _mm_max_epu8(work, flat); flat = _mm_subs_epu8(flat, one); flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); @@ -588,7 +580,6 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, int i = 0; do { __m128i workp_a, workp_b, workp_shft; - p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero); p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); @@ -597,11 +588,10 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); - q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero); - workp_a = _mm_add_epi16(_mm_add_epi16(p4, p3), _mm_add_epi16(p2, p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); - workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p4); + workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); _mm_storel_epi64((__m128i *)&flat_op2[i*8], 
_mm_packus_epi16(workp_shft, workp_shft)); @@ -611,7 +601,7 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, _mm_storel_epi64((__m128i *)&flat_op1[i*8], _mm_packus_epi16(workp_shft, workp_shft)); - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q2); + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); _mm_storel_epi64((__m128i *)&flat_op0[i*8], @@ -623,13 +613,13 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, _mm_storel_epi64((__m128i *)&flat_oq0[i*8], _mm_packus_epi16(workp_shft, workp_shft)); - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q4); + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); _mm_storel_epi64((__m128i *)&flat_oq1[i*8], _mm_packus_epi16(workp_shft, workp_shft)); - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q4); + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); _mm_storel_epi64((__m128i *)&flat_oq2[i*8], @@ -813,8 +803,8 @@ void vp9_mbloop_filter_horizontal_edge_uv_sse2(unsigned char *u, _mm_loadl_epi64((__m128i *)(src + 120))); } -static __inline void transpose8x16(unsigned char *in0, unsigned char *in1, - int in_p, unsigned char *out, int out_p) { +static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1, + int in_p, unsigned char *out, int out_p) { __m128i x0, x1, x2, x3, x4, x5, x6, x7; __m128i x8, x9, x10, x11, x12, x13, x14, x15; @@ -879,9 +869,9 @@ static __inline void transpose8x16(unsigned char *in0, unsigned char *in1, _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15)); } -static __inline void transpose(unsigned char *src[], int in_p, - unsigned char *dst[], int out_p, - int num_8x8_to_transpose) { +static INLINE void transpose(unsigned char *src[], int in_p, + unsigned char *dst[], int out_p, + int num_8x8_to_transpose) { int idx8x8 = 0; __m128i x0, x1, x2, x3, x4, x5, x6, x7; do { diff --git a/vp9/common/x86/vp9_postproc_mmx.asm b/vp9/common/x86/vp9_postproc_mmx.asm index 5f06f0ea0..c2118dbb7 100644 --- a/vp9/common/x86/vp9_postproc_mmx.asm +++ b/vp9/common/x86/vp9_postproc_mmx.asm @@ -459,11 +459,11 @@ sym(vp9_mbpost_proc_down_mmx): %undef flimit2 -;void vp9_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise, +;void vp9_plane_add_noise_mmx (unsigned char *start, unsigned char *noise, ; unsigned char blackclamp[16], ; unsigned char whiteclamp[16], ; unsigned char bothclamp[16], -; unsigned int Width, unsigned int Height, int Pitch) +; unsigned int width, unsigned int height, int pitch) extern sym(rand) global sym(vp9_plane_add_noise_mmx) PRIVATE sym(vp9_plane_add_noise_mmx): diff --git a/vp9/common/x86/vp9_postproc_sse2.asm b/vp9/common/x86/vp9_postproc_sse2.asm index 8bbb3794b..858fc99b6 100644 --- a/vp9/common/x86/vp9_postproc_sse2.asm +++ b/vp9/common/x86/vp9_postproc_sse2.asm @@ -624,11 +624,11 @@ sym(vp9_mbpost_proc_across_ip_xmm): %undef flimit4 -;void vp9_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise, +;void vp9_plane_add_noise_wmt (unsigned char *start, unsigned char *noise, ; unsigned char blackclamp[16], ; unsigned char whiteclamp[16], ; unsigned char bothclamp[16], -; unsigned int Width, unsigned int Height, int Pitch) +; unsigned int width, 
unsigned int height, int pitch) extern sym(rand) global sym(vp9_plane_add_noise_wmt) PRIVATE sym(vp9_plane_add_noise_wmt): diff --git a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm index b644da64c..32f00e289 100644 --- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm +++ b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm @@ -21,34 +21,92 @@ ; ;*************************************************************************************/ -;void vp9_filter_block1d8_v8_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE -sym(vp9_filter_block1d8_v8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] +%macro VERTx4 1 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm4, [rdx] ;load filters + movd xmm5, rcx + packsswb xmm4, xmm4 + pshuflw xmm0, xmm4, 0b ;k0_k1 + pshuflw xmm1, xmm4, 01010101b ;k2_k3 + pshuflw xmm2, xmm4, 10101010b ;k4_k5 + pshuflw xmm3, xmm4, 11111111b ;k6_k7 + + punpcklqdq xmm0, xmm0 + punpcklqdq xmm1, xmm1 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + + movdqa k0k1, xmm0 + movdqa k2k3, xmm1 + pshufd xmm5, xmm5, 0 + movdqa k4k5, xmm2 + movdqa k6k7, xmm3 + movdqa krd, xmm5 + + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ;out_pitch +%endif + mov rax, rsi + movsxd rcx, DWORD PTR arg(4) ;output_height + add rax, rdx + + lea rbx, [rdx + rdx*4] + add rbx, rdx ;pitch * 6 + +.loop: + movd xmm0, [rsi] ;A + movd xmm1, [rsi + rdx] ;B + movd xmm2, [rsi + rdx * 2] ;C + movd xmm3, [rax + rdx * 2] ;D + movd xmm4, [rsi + rdx * 4] ;E + movd xmm5, [rax + rdx * 4] ;F + + punpcklbw xmm0, xmm1 ;A B + punpcklbw xmm2, xmm3 ;C D + punpcklbw xmm4, xmm5 ;E F + + movd xmm6, [rsi + rbx] ;G + movd xmm7, [rax + rbx] ;H + + pmaddubsw xmm0, k0k1 + pmaddubsw xmm2, k2k3 + punpcklbw xmm6, xmm7 ;G H + pmaddubsw xmm4, k4k5 + pmaddubsw xmm6, k6k7 + + paddsw xmm0, xmm2 + paddsw xmm0, krd + paddsw xmm4, xmm6 + paddsw xmm0, xmm4 + psraw xmm0, 7 + packuswb xmm0, xmm0 + + add rsi, rdx + add rax, rdx +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movd [rdi], xmm0 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;out_pitch +%else + add rdi, r8 +%endif + dec rcx + jnz .loop +%endm + +%macro VERTx8 1 mov rdx, arg(5) ;filter ptr mov rsi, arg(0) ;src_ptr mov rdi, arg(2) ;output_ptr @@ -86,7 +144,7 @@ sym(vp9_filter_block1d8_v8_ssse3): lea rbx, [rdx + rdx*4] add rbx, rdx ;pitch * 6 -.vp9_filter_block1d8_v8_ssse3_loop: +.loop: movq xmm0, [rsi] ;A movq xmm1, [rsi + rdx] ;B movq xmm2, [rsi + rdx * 2] ;C @@ -117,7 +175,10 @@ sym(vp9_filter_block1d8_v8_ssse3): add rsi, rdx add rax, rdx - +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif movq [rdi], xmm0 %if ABI_IS_32BIT @@ -126,47 +187,11 @@ sym(vp9_filter_block1d8_v8_ssse3): add rdi, r8 %endif dec rcx - jnz .vp9_filter_block1d8_v8_ssse3_loop - - add rsp, 16*5 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_filter_block1d16_v8_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; 
unsigned int output_height, -; short *filter -;) -global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE -sym(vp9_filter_block1d16_v8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog + jnz .loop +%endm - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] +%macro VERTx16 1 mov rdx, arg(5) ;filter ptr mov rsi, arg(0) ;src_ptr mov rdi, arg(2) ;output_ptr @@ -204,7 +229,7 @@ sym(vp9_filter_block1d16_v8_ssse3): lea rbx, [rdx + rdx*4] add rbx, rdx ;pitch * 6 -.vp9_filter_block1d16_v8_ssse3_loop: +.loop: movq xmm0, [rsi] ;A movq xmm1, [rsi + rdx] ;B movq xmm2, [rsi + rdx * 2] ;C @@ -232,7 +257,10 @@ sym(vp9_filter_block1d16_v8_ssse3): psraw xmm0, 7 packuswb xmm0, xmm0 - +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif movq [rdi], xmm0 movq xmm0, [rsi + 8] ;A @@ -267,6 +295,10 @@ sym(vp9_filter_block1d16_v8_ssse3): add rsi, rdx add rax, rdx +%if %1 + movq xmm1, [rdi+8] + pavgb xmm0, xmm1 +%endif movq [rdi+8], xmm0 @@ -276,7 +308,38 @@ sym(vp9_filter_block1d16_v8_ssse3): add rdi, r8 %endif dec rcx - jnz .vp9_filter_block1d16_v8_ssse3_loop + jnz .loop +%endm + +;void vp9_filter_block1d8_v8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE +sym(vp9_filter_block1d4_v8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + VERTx4 0 add rsp, 16*5 pop rsp @@ -289,24 +352,65 @@ sym(vp9_filter_block1d16_v8_ssse3): pop rbp ret -;void vp9_filter_block1d8_h8_ssse3 +;void vp9_filter_block1d8_v8_ssse3 ;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, ; short *filter ;) -global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE -sym(vp9_filter_block1d8_h8_ssse3): +global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE +sym(vp9_filter_block1d8_v8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + VERTx8 0 + + add rsp, 16*5 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d16_v8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE +sym(vp9_filter_block1d16_v8_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 SAVE_XMM 7 - GET_GOT rbx push rsi push rdi + push rbx ; end prolog ALIGN_STACK 16, rax @@ -317,6 +421,121 @@ sym(vp9_filter_block1d8_h8_ssse3): %define k6k7 [rsp + 16*3] %define krd [rsp + 16*4] + VERTx16 0 + + add rsp, 16*5 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop 
rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +global sym(vp9_filter_block1d4_v8_avg_ssse3) PRIVATE +sym(vp9_filter_block1d4_v8_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + VERTx4 1 + + add rsp, 16*5 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_v8_avg_ssse3) PRIVATE +sym(vp9_filter_block1d8_v8_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + VERTx8 1 + + add rsp, 16*5 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_v8_avg_ssse3) PRIVATE +sym(vp9_filter_block1d16_v8_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + VERTx16 1 + + add rsp, 16*5 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +%macro HORIZx4 1 mov rdx, arg(5) ;filter ptr mov rsi, arg(0) ;src_ptr mov rdi, arg(2) ;output_ptr @@ -340,19 +559,16 @@ sym(vp9_filter_block1d8_h8_ssse3): pshufd xmm5, xmm5, 0 movdqa k4k5, xmm2 movdqa k6k7, xmm3 -; movdqa krd, xmm5 + movdqa krd, xmm5 movsxd rax, dword ptr arg(1) ;src_pixels_per_line movsxd rdx, dword ptr arg(3) ;output_pitch movsxd rcx, dword ptr arg(4) ;output_height -.filter_block1d8_h8_rowloop_ssse3: +.loop: movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 -; movq xmm3, [rsi + 4] ; 4 5 6 7 8 9 10 11 movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 -;note: if we create a k0_k7 filter, we can save a pshufb -; punpcklbw xmm0, xmm3 ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11 punpcklqdq xmm0, xmm3 movdqa xmm1, xmm0 @@ -371,59 +587,94 @@ sym(vp9_filter_block1d8_h8_ssse3): pmaddubsw xmm4, k6k7 paddsw xmm0, xmm1 - paddsw xmm0, xmm2 - paddsw xmm0, xmm5 paddsw xmm0, xmm4 + paddsw xmm0, xmm2 + paddsw xmm0, krd psraw xmm0, 7 packuswb xmm0, xmm0 - +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif lea rsi, [rsi + rax] - movq [rdi], xmm0 + movd [rdi], xmm0 lea rdi, [rdi + rdx] dec rcx - jnz .filter_block1d8_h8_rowloop_ssse3 + jnz .loop +%endm - add rsp, 16*5 - pop rsp +%macro HORIZx8 1 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret + movdqa xmm4, [rdx] ;load filters + movd xmm5, rcx + packsswb xmm4, xmm4 + pshuflw xmm0, xmm4, 0b ;k0_k1 + pshuflw xmm1, xmm4, 01010101b ;k2_k3 + pshuflw xmm2, xmm4, 10101010b ;k4_k5 + pshuflw xmm3, xmm4, 11111111b ;k6_k7 -;void vp9_filter_block1d16_h8_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int 
output_height, -; short *filter -;) -global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE -sym(vp9_filter_block1d16_h8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog + punpcklqdq xmm0, xmm0 + punpcklqdq xmm1, xmm1 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] + movdqa k0k1, xmm0 + movdqa k2k3, xmm1 + pshufd xmm5, xmm5, 0 + movdqa k4k5, xmm2 + movdqa k6k7, xmm3 + movdqa krd, xmm5 + + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rdx, dword ptr arg(3) ;output_pitch + movsxd rcx, dword ptr arg(4) ;output_height +.loop: + movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 + + movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 + punpcklqdq xmm0, xmm3 + + movdqa xmm1, xmm0 + pshufb xmm0, [GLOBAL(shuf_t0t1)] + pmaddubsw xmm0, k0k1 + + movdqa xmm2, xmm1 + pshufb xmm1, [GLOBAL(shuf_t2t3)] + pmaddubsw xmm1, k2k3 + + movdqa xmm4, xmm2 + pshufb xmm2, [GLOBAL(shuf_t4t5)] + pmaddubsw xmm2, k4k5 + + pshufb xmm4, [GLOBAL(shuf_t6t7)] + pmaddubsw xmm4, k6k7 + + paddsw xmm0, xmm1 + paddsw xmm0, xmm4 + paddsw xmm0, xmm2 + paddsw xmm0, krd + psraw xmm0, 7 + packuswb xmm0, xmm0 +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + + lea rsi, [rsi + rax] + movq [rdi], xmm0 + + lea rdi, [rdi + rdx] + dec rcx + jnz .loop +%endm + +%macro HORIZx16 1 mov rdx, arg(5) ;filter ptr mov rsi, arg(0) ;src_ptr mov rdi, arg(2) ;output_ptr @@ -453,13 +704,10 @@ sym(vp9_filter_block1d16_h8_ssse3): movsxd rdx, dword ptr arg(3) ;output_pitch movsxd rcx, dword ptr arg(4) ;output_height -.filter_block1d16_h8_rowloop_ssse3: +.loop: movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 -; movq xmm3, [rsi + 4] ; 4 5 6 7 8 9 10 11 movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 -;note: if we create a k0_k7 filter, we can save a pshufb -; punpcklbw xmm0, xmm3 ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11 punpcklqdq xmm0, xmm3 movdqa xmm1, xmm0 @@ -486,10 +734,7 @@ sym(vp9_filter_block1d16_h8_ssse3): movq xmm3, [rsi + 5] -; movq xmm7, [rsi + 12] movq xmm7, [rsi + 13] -;note: same as above -; punpcklbw xmm3, xmm7 punpcklqdq xmm3, xmm7 movdqa xmm1, xmm3 @@ -508,19 +753,54 @@ sym(vp9_filter_block1d16_h8_ssse3): pmaddubsw xmm4, k6k7 paddsw xmm3, xmm1 + paddsw xmm3, xmm4 paddsw xmm3, xmm2 paddsw xmm3, krd - paddsw xmm3, xmm4 psraw xmm3, 7 packuswb xmm3, xmm3 punpcklqdq xmm0, xmm3 +%if %1 + movdqa xmm1, [rdi] + pavgb xmm0, xmm1 +%endif lea rsi, [rsi + rax] movdqa [rdi], xmm0 lea rdi, [rdi + rdx] dec rcx - jnz .filter_block1d16_h8_rowloop_ssse3 + jnz .loop +%endm + +;void vp9_filter_block1d4_h8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE +sym(vp9_filter_block1d4_h8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + HORIZx4 0 add rsp, 16*5 pop rsp @@ -534,7 +814,188 @@ sym(vp9_filter_block1d16_h8_ssse3): pop rbp ret +;void vp9_filter_block1d8_h8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; 
unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE +sym(vp9_filter_block1d8_h8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + HORIZx8 0 + + add rsp, 16*5 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d16_h8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE +sym(vp9_filter_block1d16_h8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + HORIZx16 0 + add rsp, 16*5 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_h8_avg_ssse3) PRIVATE +sym(vp9_filter_block1d4_h8_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + HORIZx4 1 + + add rsp, 16*5 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE +sym(vp9_filter_block1d8_h8_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + HORIZx8 1 + + add rsp, 16*5 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_h8_avg_ssse3) PRIVATE +sym(vp9_filter_block1d16_h8_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + HORIZx16 1 + + add rsp, 16*5 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret SECTION_RODATA align 16 shuf_t0t1: diff --git a/vp9/common/x86/vp9_subpixel_mmx.asm b/vp9/common/x86/vp9_subpixel_mmx.asm deleted file mode 100644 index dee29b8fb..000000000 --- a/vp9/common/x86/vp9_subpixel_mmx.asm +++ /dev/null @@ -1,268 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
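[Editor's note] The rewritten SSSE3 path above folds each block width into a VERTx*/HORIZx* macro whose single parameter selects a plain store (0) or a pavgb-averaged store (1) for the new *_avg entry points; every kernel rounds with krd = 64 (the 0x00400040 constant broadcast by pshufd) and shifts right by 7, since the 8-tap coefficients sum to 128. A minimal scalar sketch of what one output pixel of the vertical macros computes; names are illustrative, and the exact saturating-add grouping of the pmaddubsw/paddsw chain is not reproduced:

    #include <stdint.h>

    static int16_t sat16(int32_t v) {            /* paddsw-style saturation */
      return v > 32767 ? 32767 : v < -32768 ? -32768 : (int16_t)v;
    }

    static uint8_t clamp8(int v) {               /* packuswb-style clamp */
      return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }

    /* One column of VERTx*: src points at the topmost of 8 input rows. */
    static uint8_t vert8_ref(const uint8_t *src, int pitch,
                             const int16_t filter[8],
                             uint8_t dst_old, int avg) {
      int16_t sum = 0;
      uint8_t out;
      int k;
      for (k = 0; k < 8; ++k)
        sum = sat16(sum + src[k * pitch] * filter[k]);
      sum = sat16(sum + 64);                     /* krd */
      out = clamp8(sum >> 7);                    /* psraw 7 + packuswb */
      return avg ? (uint8_t)((out + dst_old + 1) >> 1)  /* pavgb rounding */
                 : out;
    }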
-; - - -%include "vpx_ports/x86_abi_support.asm" - - -%define BLOCK_HEIGHT_WIDTH 4 -%define vp9_filter_weight 128 -%define VP9_FILTER_SHIFT 7 - - -;void vp9_filter_block1d_h6_mmx -;( -; unsigned char *src_ptr, -; unsigned short *output_ptr, -; unsigned int src_pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; short * vp9_filter -;) -global sym(vp9_filter_block1d_h6_mmx) PRIVATE -sym(vp9_filter_block1d_h6_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdx, arg(6) ;vp9_filter - - movq mm1, [rdx + 16] ; do both the negative taps first!!! - movq mm2, [rdx + 32] ; - movq mm6, [rdx + 48] ; - movq mm7, [rdx + 64] ; - - mov rdi, arg(1) ;output_ptr - mov rsi, arg(0) ;src_ptr - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rax, dword ptr arg(5) ;output_width ; destination pitch? - pxor mm0, mm0 ; mm0 = 00000000 - -.nextrow: - movq mm3, [rsi-2] ; mm3 = p-2..p5 - movq mm4, mm3 ; mm4 = p-2..p5 - psrlq mm3, 8 ; mm3 = p-1..p5 - punpcklbw mm3, mm0 ; mm3 = p-1..p2 - pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers. - - movq mm5, mm4 ; mm5 = p-2..p5 - punpckhbw mm4, mm0 ; mm5 = p2..p5 - pmullw mm4, mm7 ; mm5 *= kernel 4 modifiers - paddsw mm3, mm4 ; mm3 += mm5 - - movq mm4, mm5 ; mm4 = p-2..p5; - psrlq mm5, 16 ; mm5 = p0..p5; - punpcklbw mm5, mm0 ; mm5 = p0..p3 - pmullw mm5, mm2 ; mm5 *= kernel 2 modifiers - paddsw mm3, mm5 ; mm3 += mm5 - - movq mm5, mm4 ; mm5 = p-2..p5 - psrlq mm4, 24 ; mm4 = p1..p5 - punpcklbw mm4, mm0 ; mm4 = p1..p4 - pmullw mm4, mm6 ; mm5 *= kernel 3 modifiers - paddsw mm3, mm4 ; mm3 += mm5 - - ; do outer positive taps - movd mm4, [rsi+3] - punpcklbw mm4, mm0 ; mm5 = p3..p6 - pmullw mm4, [rdx+80] ; mm5 *= kernel 0 modifiers - paddsw mm3, mm4 ; mm3 += mm5 - - punpcklbw mm5, mm0 ; mm5 = p-2..p1 - pmullw mm5, [rdx] ; mm5 *= kernel 5 modifiers - paddsw mm3, mm5 ; mm3 += mm5 - - paddsw mm3, [GLOBAL(rd)] ; mm3 += round value - psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128 - packuswb mm3, mm0 ; pack and unpack to saturate - punpcklbw mm3, mm0 ; - - movq [rdi], mm3 ; store the results in the destination - -%if ABI_IS_32BIT - add rsi, dword ptr arg(2) ;src_pixels_per_line ; next line - add rdi, rax; -%else - movsxd r8, dword ptr arg(2) ;src_pixels_per_line - add rdi, rax; - - add rsi, r8 ; next line -%endif - - dec rcx ; decrement count - jnz .nextrow ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1dc_v6_mmx -;( -; short *src_ptr, -; unsigned char *output_ptr, -; int output_pitch, -; unsigned int pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; short * vp9_filter -;) -global sym(vp9_filter_block1dc_v6_mmx) PRIVATE -sym(vp9_filter_block1dc_v6_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movq mm5, [GLOBAL(rd)] - push rbx - mov rbx, arg(7) ;vp9_filter - movq mm1, [rbx + 16] ; do both the negative taps first!!! - movq mm2, [rbx + 32] ; - movq mm6, [rbx + 48] ; - movq mm7, [rbx + 64] ; - - movsxd rdx, dword ptr arg(3) ;pixels_per_line - mov rdi, arg(1) ;output_ptr - mov rsi, arg(0) ;src_ptr - sub rsi, rdx - sub rsi, rdx - movsxd rcx, DWORD PTR arg(5) ;output_height - movsxd rax, DWORD PTR arg(2) ;output_pitch ; destination pitch? - pxor mm0, mm0 ; mm0 = 00000000 - - -.nextrow_cv: - movq mm3, [rsi+rdx] ; mm3 = p0..p8 = row -1 - pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers. 
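[Editor's note] For reference, the two deleted MMX routines in this file form a classic two-pass pipeline: the h6 kernel above filters horizontally into a widened 16-bit buffer, and the v6 loop here filters that buffer vertically, advancing rsi by one pitch mid-loop so rows 1 and 3 can be addressed without a 3*pitch operand. A scalar sketch of the pair, ignoring the saturating 16-bit arithmetic of the MMX code:

    #include <stdint.h>

    #define VP9_FILTER_SHIFT 7   /* the six taps sum to 128 */

    static uint8_t clamp8(int v) {
      return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }

    /* First pass: horizontal 6-tap into a widened 16-bit buffer, as
     * vp9_filter_block1d_h6_mmx does (it packs then unpacks, so the
     * stored values are already clamped to 0..255). */
    static void h6_ref(const uint8_t *src, int src_pitch,
                       uint16_t *mid, int mid_pitch,
                       int w, int h, const int16_t taps[6]) {
      int x, y, k;
      for (y = 0; y < h; ++y)
        for (x = 0; x < w; ++x) {
          int sum = 64;                          /* rounding */
          for (k = 0; k < 6; ++k)
            sum += src[y * src_pitch + x + k - 2] * taps[k];
          mid[y * mid_pitch + x] = clamp8(sum >> VP9_FILTER_SHIFT);
        }
    }

    /* Second pass: vertical 6-tap over the intermediate, as the v6 loop
     * here does; mid points two rows above the first output row. */
    static void v6_ref(const uint16_t *mid, int mid_pitch,
                       uint8_t *dst, int dst_pitch,
                       int w, int h, const int16_t taps[6]) {
      int x, y, k;
      for (y = 0; y < h; ++y)
        for (x = 0; x < w; ++x) {
          int sum = 64;
          for (k = 0; k < 6; ++k)
            sum += mid[(y + k) * mid_pitch + x] * taps[k];
          dst[y * dst_pitch + x] = clamp8(sum >> VP9_FILTER_SHIFT);
        }
    }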
- - - movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2 - pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers. - paddsw mm3, mm4 ; mm3 += mm4 - - movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0 - pmullw mm4, mm2 ; mm4 *= kernel 2 modifiers. - paddsw mm3, mm4 ; mm3 += mm4 - - movq mm4, [rsi] ; mm4 = p0..p3 = row -2 - pmullw mm4, [rbx] ; mm4 *= kernel 0 modifiers. - paddsw mm3, mm4 ; mm3 += mm4 - - - add rsi, rdx ; move source forward 1 line to avoid 3 * pitch - movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1 - pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers. - paddsw mm3, mm4 ; mm3 += mm4 - - movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3 - pmullw mm4, [rbx +80] ; mm4 *= kernel 3 modifiers. - paddsw mm3, mm4 ; mm3 += mm4 - - - paddsw mm3, mm5 ; mm3 += round value - psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128 - packuswb mm3, mm0 ; pack and saturate - - movd [rdi],mm3 ; store the results in the destination - ; the subsequent iterations repeat 3 out of 4 of these reads. Since the - ; recon block should be in cache this shouldn't cost much. Its obviously - ; avoidable!!!. - lea rdi, [rdi+rax] ; - dec rcx ; decrement count - jnz .nextrow_cv ; next row - - pop rbx - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -rd: - times 4 dw 0x40 - -align 16 -global HIDDEN_DATA(sym(vp9_six_tap_mmx)) -sym(vp9_six_tap_mmx): - times 8 dw 0 - times 8 dw 0 - times 8 dw 128 - times 8 dw 0 - times 8 dw 0 - times 8 dw 0 - - times 8 dw 0 - times 8 dw -6 - times 8 dw 123 - times 8 dw 12 - times 8 dw -1 - times 8 dw 0 - - times 8 dw 2 - times 8 dw -11 - times 8 dw 108 - times 8 dw 36 - times 8 dw -8 - times 8 dw 1 - - times 8 dw 0 - times 8 dw -9 - times 8 dw 93 - times 8 dw 50 - times 8 dw -6 - times 8 dw 0 - - times 8 dw 3 - times 8 dw -16 - times 8 dw 77 - times 8 dw 77 - times 8 dw -16 - times 8 dw 3 - - times 8 dw 0 - times 8 dw -6 - times 8 dw 50 - times 8 dw 93 - times 8 dw -9 - times 8 dw 0 - - times 8 dw 1 - times 8 dw -8 - times 8 dw 36 - times 8 dw 108 - times 8 dw -11 - times 8 dw 2 - - times 8 dw 0 - times 8 dw -1 - times 8 dw 12 - times 8 dw 123 - times 8 dw -6 - times 8 dw 0 - diff --git a/vp9/common/x86/vp9_subpixel_sse2.asm b/vp9/common/x86/vp9_subpixel_sse2.asm deleted file mode 100644 index b0c4f1282..000000000 --- a/vp9/common/x86/vp9_subpixel_sse2.asm +++ /dev/null @@ -1,1372 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -%define BLOCK_HEIGHT_WIDTH 4 -%define VP9_FILTER_WEIGHT 128 -%define VP9_FILTER_SHIFT 7 - - -;/************************************************************************************ -; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The -; input pixel array has output_height rows. This routine assumes that output_height is an -; even number. This function handles 8 pixels in horizontal direction, calculating ONE -; rows each iteration to take advantage of the 128 bits operations. 
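[Editor's note] The vp9_six_tap_mmx table deleted above stores every coefficient repeated eight times for SIMD use. Collapsed to one copy per tap, all eight subpel rows sum to 128, i.e. unity gain after the >> 7 in the kernels; a quick check (hypothetical test code, not from the tree):

    #include <assert.h>

    static const short six_tap[8][6] = {
      { 0,   0, 128,   0,   0,  0 },   /* full-pel: pass-through */
      { 0,  -6, 123,  12,  -1,  0 },
      { 2, -11, 108,  36,  -8,  1 },
      { 0,  -9,  93,  50,  -6,  0 },
      { 3, -16,  77,  77, -16,  3 },   /* half-pel: symmetric */
      { 0,  -6,  50,  93,  -9,  0 },
      { 1,  -8,  36, 108, -11,  2 },
      { 0,  -1,  12, 123,  -6,  0 }
    };

    int main(void) {
      int i, k, sum;
      for (i = 0; i < 8; ++i) {
        for (sum = 0, k = 0; k < 6; ++k)
          sum += six_tap[i][k];
        assert(sum == 128);
      }
      return 0;
    }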
-;*************************************************************************************/ -;void vp9_filter_block1d8_h6_sse2 -;( -; unsigned char *src_ptr, -; unsigned short *output_ptr, -; unsigned int src_pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; short *vp9_filter -;) -global sym(vp9_filter_block1d8_h6_sse2) PRIVATE -sym(vp9_filter_block1d8_h6_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdx, arg(6) ;vp9_filter - mov rsi, arg(0) ;src_ptr - - mov rdi, arg(1) ;output_ptr - - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(5) ;output_width -%endif - pxor xmm0, xmm0 ; clear xmm0 for unpack - -.filter_block1d8_h6_rowloop: - movq xmm3, MMWORD PTR [rsi - 2] - movq xmm1, MMWORD PTR [rsi + 6] - - prefetcht2 [rsi+rax-2] - - pslldq xmm1, 8 - por xmm1, xmm3 - - movdqa xmm4, xmm1 - movdqa xmm5, xmm1 - - movdqa xmm6, xmm1 - movdqa xmm7, xmm1 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm1 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 - punpcklbw xmm4, xmm0 - - movdqa XMMWORD Ptr [rdi], xmm4 - lea rsi, [rsi + rax] - -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(5) ;[output_width] -%else - add rdi, r8 -%endif - dec rcx - - jnz .filter_block1d8_h6_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1d16_h6_sse2 -;( -; unsigned char *src_ptr, -; unsigned short *output_ptr, -; unsigned int src_pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; short *vp9_filter -;) -;/************************************************************************************ -; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The -; input pixel array has output_height rows. This routine assumes that output_height is an -; even number. This function handles 8 pixels in horizontal direction, calculating ONE -; rows each iteration to take advantage of the 128 bits operations. 
-;*************************************************************************************/ -global sym(vp9_filter_block1d16_h6_sse2) PRIVATE -sym(vp9_filter_block1d16_h6_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdx, arg(6) ;vp9_filter - mov rsi, arg(0) ;src_ptr - - mov rdi, arg(1) ;output_ptr - - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(5) ;output_width -%endif - - pxor xmm0, xmm0 ; clear xmm0 for unpack - -.filter_block1d16_h6_sse2_rowloop: - movq xmm3, MMWORD PTR [rsi - 2] - movq xmm1, MMWORD PTR [rsi + 6] - - movq xmm2, MMWORD PTR [rsi +14] - pslldq xmm2, 8 - - por xmm2, xmm1 - prefetcht2 [rsi+rax-2] - - pslldq xmm1, 8 - por xmm1, xmm3 - - movdqa xmm4, xmm1 - movdqa xmm5, xmm1 - - movdqa xmm6, xmm1 - movdqa xmm7, xmm1 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm1 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 - punpcklbw xmm4, xmm0 - - movdqa XMMWORD Ptr [rdi], xmm4 - - movdqa xmm3, xmm2 - movdqa xmm4, xmm2 - - movdqa xmm5, xmm2 - movdqa xmm6, xmm2 - - movdqa xmm7, xmm2 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm2 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb 
xmm4, xmm0 - punpcklbw xmm4, xmm0 - - movdqa XMMWORD Ptr [rdi+16], xmm4 - - lea rsi, [rsi + rax] -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(5) ;[output_width] -%else - add rdi, r8 -%endif - - dec rcx - jnz .filter_block1d16_h6_sse2_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1d8_v6_sse2 -;( -; short *src_ptr, -; unsigned char *output_ptr, -; int dst_ptich, -; unsigned int pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; short * vp9_filter -;) -;/************************************************************************************ -; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The -; input pixel array has output_height rows. -;*************************************************************************************/ -global sym(vp9_filter_block1d8_v6_sse2) PRIVATE -sym(vp9_filter_block1d8_v6_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rax, arg(7) ;vp9_filter - movsxd rdx, dword ptr arg(3) ;pixels_per_line - - mov rdi, arg(1) ;output_ptr - mov rsi, arg(0) ;src_ptr - - sub rsi, rdx - sub rsi, rdx - - movsxd rcx, DWORD PTR arg(5) ;[output_height] - pxor xmm0, xmm0 ; clear xmm0 - - movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(2) ; dst_ptich -%endif - -.vp9_filter_block1d8_v6_sse2_loop: - movdqa xmm1, XMMWORD PTR [rsi] - pmullw xmm1, [rax] - - movdqa xmm2, XMMWORD PTR [rsi + rdx] - pmullw xmm2, [rax + 16] - - movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] - pmullw xmm3, [rax + 32] - - movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] - pmullw xmm5, [rax + 64] - - add rsi, rdx - movdqa xmm4, XMMWORD PTR [rsi + rdx * 2] - - pmullw xmm4, [rax + 48] - movdqa xmm6, XMMWORD PTR [rsi + rdx * 4] - - pmullw xmm6, [rax + 80] - - paddsw xmm2, xmm5 - paddsw xmm2, xmm3 - - paddsw xmm2, xmm1 - paddsw xmm2, xmm4 - - paddsw xmm2, xmm6 - paddsw xmm2, xmm7 - - psraw xmm2, 7 - packuswb xmm2, xmm0 ; pack and saturate - - movq QWORD PTR [rdi], xmm2 ; store the results in the destination -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(2) ;[dst_ptich] -%else - add rdi, r8 -%endif - dec rcx ; decrement count - jnz .vp9_filter_block1d8_v6_sse2_loop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1d16_v6_sse2 -;( -; unsigned short *src_ptr, -; unsigned char *output_ptr, -; int dst_ptich, -; unsigned int pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; const short *vp9_filter -;) -;/************************************************************************************ -; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The -; input pixel array has output_height rows. 
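[Editor's note] The kernel below reads its rows in tap order 2, 5, 3, 1, 4, 6 (per its own comment), presumably so the saturating paddsw chain never clips a partial sum: with the symmetric half-pel filter {3, -16, 77, 77, -16, 3} and all-255 input, plain 1..6 order would overflow int16 after the fourth tap (16320 + 19635 = 35955), while this order peaks at 32640, exactly the final value. A sketch with explicit saturation (illustrative helper, not from the tree):

    #include <stdint.h>

    static int16_t sat16(int32_t v) {
      return v > 32767 ? 32767 : v < -32768 ? -32768 : (int16_t)v;
    }

    /* Accumulate one sample in the 2,5,3,1,4,6 order used by the kernel.
     * row[k] is the pixel under tap k and taps[k] its coefficient, both
     * 1-based for readability (index 0 is unused). */
    static int16_t acc_253146(const int16_t row[7], const int16_t taps[7]) {
      static const int order[6] = { 2, 5, 3, 1, 4, 6 };
      int16_t sum = 0;
      int i;
      for (i = 0; i < 6; ++i)
        sum = sat16(sum + row[order[i]] * taps[order[i]]);
      return sum;
    }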
-;*************************************************************************************/ -global sym(vp9_filter_block1d16_v6_sse2) PRIVATE -sym(vp9_filter_block1d16_v6_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rax, arg(7) ;vp9_filter - movsxd rdx, dword ptr arg(3) ;pixels_per_line - - mov rdi, arg(1) ;output_ptr - mov rsi, arg(0) ;src_ptr - - sub rsi, rdx - sub rsi, rdx - - movsxd rcx, DWORD PTR arg(5) ;[output_height] -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(2) ; dst_ptich -%endif - -.vp9_filter_block1d16_v6_sse2_loop: -; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order. - movdqa xmm1, XMMWORD PTR [rsi + rdx] ; line 2 - movdqa xmm2, XMMWORD PTR [rsi + rdx + 16] - pmullw xmm1, [rax + 16] - pmullw xmm2, [rax + 16] - - movdqa xmm3, XMMWORD PTR [rsi + rdx * 4] ; line 5 - movdqa xmm4, XMMWORD PTR [rsi + rdx * 4 + 16] - pmullw xmm3, [rax + 64] - pmullw xmm4, [rax + 64] - - movdqa xmm5, XMMWORD PTR [rsi + rdx * 2] ; line 3 - movdqa xmm6, XMMWORD PTR [rsi + rdx * 2 + 16] - pmullw xmm5, [rax + 32] - pmullw xmm6, [rax + 32] - - movdqa xmm7, XMMWORD PTR [rsi] ; line 1 - movdqa xmm0, XMMWORD PTR [rsi + 16] - pmullw xmm7, [rax] - pmullw xmm0, [rax] - - paddsw xmm1, xmm3 - paddsw xmm2, xmm4 - paddsw xmm1, xmm5 - paddsw xmm2, xmm6 - paddsw xmm1, xmm7 - paddsw xmm2, xmm0 - - add rsi, rdx - - movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] ; line 4 - movdqa xmm4, XMMWORD PTR [rsi + rdx * 2 + 16] - pmullw xmm3, [rax + 48] - pmullw xmm4, [rax + 48] - - movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] ; line 6 - movdqa xmm6, XMMWORD PTR [rsi + rdx * 4 + 16] - pmullw xmm5, [rax + 80] - pmullw xmm6, [rax + 80] - - movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] - pxor xmm0, xmm0 ; clear xmm0 - - paddsw xmm1, xmm3 - paddsw xmm2, xmm4 - paddsw xmm1, xmm5 - paddsw xmm2, xmm6 - - paddsw xmm1, xmm7 - paddsw xmm2, xmm7 - - psraw xmm1, 7 - psraw xmm2, 7 - - packuswb xmm1, xmm2 ; pack and saturate - movdqa XMMWORD PTR [rdi], xmm1 ; store the results in the destination -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(2) ;[dst_ptich] -%else - add rdi, r8 -%endif - dec rcx ; decrement count - jnz .vp9_filter_block1d16_v6_sse2_loop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1d8_h6_only_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; int dst_ptich, -; unsigned int output_height, -; const short *vp9_filter -;) -; First-pass filter only when yoffset==0 -global sym(vp9_filter_block1d8_h6_only_sse2) PRIVATE -sym(vp9_filter_block1d8_h6_only_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdx, arg(5) ;vp9_filter - mov rsi, arg(0) ;src_ptr - - mov rdi, arg(2) ;output_ptr - - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(3) ;dst_ptich -%endif - pxor xmm0, xmm0 ; clear xmm0 for unpack - -.filter_block1d8_h6_only_rowloop: - movq xmm3, MMWORD PTR [rsi - 2] - movq xmm1, MMWORD PTR [rsi + 6] - - prefetcht2 [rsi+rax-2] - - pslldq xmm1, 8 - por xmm1, xmm3 - - movdqa xmm4, xmm1 - movdqa xmm5, xmm1 - - movdqa xmm6, xmm1 - movdqa xmm7, xmm1 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] 
* H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm1 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 - - movq QWORD PTR [rdi], xmm4 ; store the results in the destination - lea rsi, [rsi + rax] - -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(3) ;dst_ptich -%else - add rdi, r8 -%endif - dec rcx - - jnz .filter_block1d8_h6_only_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1d16_h6_only_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; int dst_ptich, -; unsigned int output_height, -; const short *vp9_filter -;) -; First-pass filter only when yoffset==0 -global sym(vp9_filter_block1d16_h6_only_sse2) PRIVATE -sym(vp9_filter_block1d16_h6_only_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdx, arg(5) ;vp9_filter - mov rsi, arg(0) ;src_ptr - - mov rdi, arg(2) ;output_ptr - - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(3) ;dst_ptich -%endif - - pxor xmm0, xmm0 ; clear xmm0 for unpack - -.filter_block1d16_h6_only_sse2_rowloop: - movq xmm3, MMWORD PTR [rsi - 2] - movq xmm1, MMWORD PTR [rsi + 6] - - movq xmm2, MMWORD PTR [rsi +14] - pslldq xmm2, 8 - - por xmm2, xmm1 - prefetcht2 [rsi+rax-2] - - pslldq xmm1, 8 - por xmm1, xmm3 - - movdqa xmm4, xmm1 - movdqa xmm5, xmm1 - - movdqa xmm6, xmm1 - movdqa xmm7, xmm1 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm1, [rdx+80] ; 
x[ 3] * h[ 3] ; Tap 6 - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm1 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 ; lower 8 bytes - - movq QWORD Ptr [rdi], xmm4 ; store the results in the destination - - movdqa xmm3, xmm2 - movdqa xmm4, xmm2 - - movdqa xmm5, xmm2 - movdqa xmm6, xmm2 - - movdqa xmm7, xmm2 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm2 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 ; higher 8 bytes - - movq QWORD Ptr [rdi+8], xmm4 ; store the results in the destination - - lea rsi, [rsi + rax] -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(3) ;dst_ptich -%else - add rdi, r8 -%endif - - dec rcx - jnz .filter_block1d16_h6_only_sse2_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1d8_v6_only_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; int dst_ptich, -; unsigned int output_height, -; const short *vp9_filter -;) -; Second-pass filter only when xoffset==0 -global sym(vp9_filter_block1d8_v6_only_sse2) PRIVATE -sym(vp9_filter_block1d8_v6_only_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - mov rax, arg(5) ;vp9_filter - - pxor xmm0, xmm0 ; clear xmm0 - - movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(3) ; dst_ptich -%endif - -.vp9_filter_block1d8_v6_only_sse2_loop: - movq xmm1, MMWORD PTR [rsi] - movq xmm2, MMWORD PTR [rsi + rdx] - movq xmm3, MMWORD PTR [rsi + rdx * 2] - movq xmm5, MMWORD PTR [rsi + rdx * 4] - add rsi, rdx - movq xmm4, MMWORD PTR [rsi + rdx * 2] - movq xmm6, MMWORD PTR [rsi + rdx * 4] - - punpcklbw xmm1, xmm0 - pmullw xmm1, [rax] - - punpcklbw xmm2, xmm0 - pmullw xmm2, [rax + 16] - - punpcklbw xmm3, xmm0 - pmullw xmm3, [rax + 32] - - punpcklbw xmm5, xmm0 - pmullw xmm5, [rax + 64] - - punpcklbw xmm4, xmm0 - pmullw xmm4, [rax + 48] - - punpcklbw xmm6, xmm0 - pmullw xmm6, [rax + 80] - - paddsw xmm2, xmm5 - paddsw xmm2, xmm3 - - paddsw xmm2, xmm1 - paddsw xmm2, xmm4 - - paddsw xmm2, xmm6 - paddsw xmm2, xmm7 - - psraw xmm2, 7 - packuswb xmm2, xmm0 ; pack 
and saturate - - movq QWORD PTR [rdi], xmm2 ; store the results in the destination -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;[dst_ptich] -%else - add rdi, r8 -%endif - dec rcx ; decrement count - jnz .vp9_filter_block1d8_v6_only_sse2_loop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_unpack_block1d16_h6_sse2 -;( -; unsigned char *src_ptr, -; unsigned short *output_ptr, -; unsigned int src_pixels_per_line, -; unsigned int output_height, -; unsigned int output_width -;) -global sym(vp9_unpack_block1d16_h6_sse2) PRIVATE -sym(vp9_unpack_block1d16_h6_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(1) ;output_ptr - - movsxd rcx, dword ptr arg(3) ;output_height - movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source - - pxor xmm0, xmm0 ; clear xmm0 for unpack -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(4) ;output_width ; Pitch for Source -%endif - -.unpack_block1d16_h6_sse2_rowloop: - movq xmm1, MMWORD PTR [rsi] ; 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2 - movq xmm3, MMWORD PTR [rsi+8] ; make copy of xmm1 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - punpcklbw xmm1, xmm0 - - movdqa XMMWORD Ptr [rdi], xmm1 - movdqa XMMWORD Ptr [rdi + 16], xmm3 - - lea rsi, [rsi + rax] -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(4) ;[output_width] -%else - add rdi, r8 -%endif - dec rcx - jnz .unpack_block1d16_h6_sse2_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_bilinear_predict16x16_sse2 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -extern sym(vp9_bilinear_filters_mmx) -global sym(vp9_bilinear_predict16x16_sse2) PRIVATE -sym(vp9_bilinear_predict16x16_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ;const short *HFilter = bilinear_filters_mmx[xoffset] - ;const short *VFilter = bilinear_filters_mmx[yoffset] - - lea rcx, [GLOBAL(sym(vp9_bilinear_filters_mmx))] - movsxd rax, dword ptr arg(2) ;xoffset - - cmp rax, 0 ;skip first_pass filter if xoffset=0 - je .b16x16_sp_only - - shl rax, 5 - add rax, rcx ;HFilter - - mov rdi, arg(4) ;dst_ptr - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(5) ;dst_pitch - - movdqa xmm1, [rax] - movdqa xmm2, [rax+16] - - movsxd rax, dword ptr arg(3) ;yoffset - - cmp rax, 0 ;skip second_pass filter if yoffset=0 - je .b16x16_fp_only - - shl rax, 5 - add rax, rcx ;VFilter - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - pxor xmm0, xmm0 - -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(5) ;dst_pitch -%endif - ; get the first horizontal line done - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - movdqa xmm7, xmm3 - packuswb xmm7, xmm4 - - add rsi, rdx 
; next line -.next_row: - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - movdqa xmm5, xmm7 - movdqa xmm6, xmm7 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, [rax] - pmullw xmm6, [rax] - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - movdqa xmm7, xmm3 - packuswb xmm7, xmm4 - - pmullw xmm3, [rax+16] - pmullw xmm4, [rax+16] - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add rsi, rdx ; next line -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(5) ;dst_pitch -%else - add rdi, r8 -%endif - - cmp rdi, rcx - jne .next_row - - jmp .done - -.b16x16_sp_only: - movsxd rax, dword ptr arg(3) ;yoffset - shl rax, 5 - add rax, rcx ;VFilter - - mov rdi, arg(4) ;dst_ptr - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(5) ;dst_pitch - - movdqa xmm1, [rax] - movdqa xmm2, [rax+16] - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - - pxor xmm0, xmm0 - - ; get the first horizontal line done - movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - - add rsi, rax ; next line -.next_row_spo: - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - - movdqa xmm5, xmm7 - movdqa xmm6, xmm7 - - movdqa xmm4, xmm3 ; make a copy of current line - movdqa xmm7, xmm3 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm5, xmm1 - pmullw xmm6, xmm1 - pmullw xmm3, xmm2 - pmullw xmm4, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add rsi, rax ; next line - add rdi, rdx ;dst_pitch - cmp rdi, rcx - jne .next_row_spo - - jmp .done - -.b16x16_fp_only: - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - pxor xmm0, xmm0 - -.next_row_fpo: - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add rsi, rax ; next line - add rdi, rdx ; dst_pitch - cmp rdi, rcx - jne .next_row_fpo - -.done: - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - 
ret - - -;void vp9_bilinear_predict8x8_sse2 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -extern sym(vp9_bilinear_filters_mmx) -global sym(vp9_bilinear_predict8x8_sse2) PRIVATE -sym(vp9_bilinear_predict8x8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 144 ; reserve 144 bytes - - ;const short *HFilter = bilinear_filters_mmx[xoffset] - ;const short *VFilter = bilinear_filters_mmx[yoffset] - lea rcx, [GLOBAL(sym(vp9_bilinear_filters_mmx))] - - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - ;Read 9-line unaligned data in and put them on stack. This gives a big - ;performance boost. - movdqu xmm0, [rsi] - lea rax, [rdx + rdx*2] - movdqu xmm1, [rsi+rdx] - movdqu xmm2, [rsi+rdx*2] - add rsi, rax - movdqu xmm3, [rsi] - movdqu xmm4, [rsi+rdx] - movdqu xmm5, [rsi+rdx*2] - add rsi, rax - movdqu xmm6, [rsi] - movdqu xmm7, [rsi+rdx] - - movdqa XMMWORD PTR [rsp], xmm0 - - movdqu xmm0, [rsi+rdx*2] - - movdqa XMMWORD PTR [rsp+16], xmm1 - movdqa XMMWORD PTR [rsp+32], xmm2 - movdqa XMMWORD PTR [rsp+48], xmm3 - movdqa XMMWORD PTR [rsp+64], xmm4 - movdqa XMMWORD PTR [rsp+80], xmm5 - movdqa XMMWORD PTR [rsp+96], xmm6 - movdqa XMMWORD PTR [rsp+112], xmm7 - movdqa XMMWORD PTR [rsp+128], xmm0 - - movsxd rax, dword ptr arg(2) ;xoffset - shl rax, 5 - add rax, rcx ;HFilter - - mov rdi, arg(4) ;dst_ptr - movsxd rdx, dword ptr arg(5) ;dst_pitch - - movdqa xmm1, [rax] - movdqa xmm2, [rax+16] - - movsxd rax, dword ptr arg(3) ;yoffset - shl rax, 5 - add rax, rcx ;VFilter - - lea rcx, [rdi+rdx*8] - - movdqa xmm5, [rax] - movdqa xmm6, [rax+16] - - pxor xmm0, xmm0 - - ; get the first horizontal line done - movdqa xmm3, XMMWORD PTR [rsp] - movdqa xmm4, xmm3 ; make a copy of current line - psrldq xmm4, 1 - - punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 - punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm2 - - paddw xmm3, xmm4 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - movdqa xmm7, xmm3 - add rsp, 16 ; next line -.next_row8x8: - movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - movdqa xmm4, xmm3 ; make a copy of current line - psrldq xmm4, 1 - - punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 - punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm2 - - paddw xmm3, xmm4 - pmullw xmm7, xmm5 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - movdqa xmm4, xmm3 - - pmullw xmm3, xmm6 - paddw xmm3, xmm7 - - movdqa xmm7, xmm4 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - packuswb xmm3, xmm0 - movq [rdi], xmm3 ; store the results in the destination - - add rsp, 16 ; next line - add rdi, rdx - - cmp rdi, rcx - jne .next_row8x8 - - ;add rsp, 144 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -SECTION_RODATA -align 16 -rd: - times 8 dw 0x40 diff --git a/vp9/common/x86/vp9_subpixel_ssse3.asm b/vp9/common/x86/vp9_subpixel_ssse3.asm deleted file mode 100644 index b260480e0..000000000 --- a/vp9/common/x86/vp9_subpixel_ssse3.asm +++ /dev/null @@ -1,1515 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
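[Editor's note] The two bilinear SSE2 predictors deleted above apply a two-tap pair from vp9_bilinear_filters_mmx in each direction, skipping the horizontal pass entirely when xoffset == 0 and the vertical pass when yoffset == 0. A scalar sketch of the generic two-pass path; the helper and buffer sizing are illustrative, the taps of each pair sum to 128, and the byte-repacking between passes in the SIMD version is not reproduced:

    #include <stdint.h>

    #define VP9_FILTER_SHIFT 7

    /* Generic path: horizontal 2-tap into a temp that keeps one extra
     * row, then vertical 2-tap into dst. hf/vf are the selected filter
     * pairs. Results fit in 8 bits, so no clamp is needed. */
    static void bilinear_ref(const uint8_t *src, int src_pitch,
                             uint8_t *dst, int dst_pitch, int w, int h,
                             const int16_t hf[2], const int16_t vf[2]) {
      uint16_t tmp[17 * 16];                 /* up to 16x16 plus one row */
      int x, y;
      for (y = 0; y < h + 1; ++y)            /* first pass: horizontal */
        for (x = 0; x < w; ++x) {
          int sum = src[y * src_pitch + x] * hf[0] +
                    src[y * src_pitch + x + 1] * hf[1] + 64;
          tmp[y * w + x] = (uint16_t)(sum >> VP9_FILTER_SHIFT);
        }
      for (y = 0; y < h; ++y)                /* second pass: vertical */
        for (x = 0; x < w; ++x) {
          int sum = tmp[y * w + x] * vf[0] +
                    tmp[(y + 1) * w + x] * vf[1] + 64;
          dst[y * dst_pitch + x] = (uint8_t)(sum >> VP9_FILTER_SHIFT);
        }
    }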
-; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -%define BLOCK_HEIGHT_WIDTH 4 -%define VP9_FILTER_WEIGHT 128 -%define VP9_FILTER_SHIFT 7 - - -;/************************************************************************************ -; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The -; input pixel array has output_height rows. This routine assumes that output_height is an -; even number. This function handles 8 pixels in horizontal direction, calculating ONE -; rows each iteration to take advantage of the 128 bits operations. -; -; This is an implementation of some of the SSE optimizations first seen in ffvp8 -; -;*************************************************************************************/ -;void vp9_filter_block1d8_h6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d8_h6_ssse3) PRIVATE -sym(vp9_filter_block1d8_h6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 - - movdqa xmm7, [GLOBAL(rd)] - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - mov rdi, arg(2) ;output_ptr - - cmp esi, DWORD PTR [rax] - je vp9_filter_block1d8_h4_ssse3 - - movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - - movsxd rdx, dword ptr arg(3) ;output_pitch - - sub rdi, rdx -;xmm3 free -.filter_block1d8_h6_rowloop_ssse3: - movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 - - movq xmm2, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 - - punpcklbw xmm0, xmm2 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 - - movdqa xmm1, xmm0 - pmaddubsw xmm0, xmm4 - - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf2bfrom1)] - - pshufb xmm2, [GLOBAL(shuf3bfrom1)] - pmaddubsw xmm1, xmm5 - - lea rdi, [rdi + rdx] - pmaddubsw xmm2, xmm6 - - lea rsi, [rsi + rax] - dec rcx - - paddsw xmm0, xmm1 - paddsw xmm2, xmm7 - - paddsw xmm0, xmm2 - - psraw xmm0, 7 - - packuswb xmm0, xmm0 - - movq MMWORD Ptr [rdi], xmm0 - jnz .filter_block1d8_h6_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -vp9_filter_block1d8_h4_ssse3: - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - - movdqa xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)] - movdqa xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)] - - mov rsi, arg(0) ;src_ptr - - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - - movsxd rdx, dword ptr arg(3) ;output_pitch - - sub rdi, rdx - -.filter_block1d8_h4_rowloop_ssse3: - movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 - - movq xmm1, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 - - punpcklbw xmm0, xmm1 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 - - movdqa xmm2, xmm0 - pshufb xmm0, xmm3 - - pshufb xmm2, xmm4 - pmaddubsw xmm0, xmm5 - - lea rdi, [rdi + rdx] - 
pmaddubsw xmm2, xmm6 - - lea rsi, [rsi + rax] - dec rcx - - paddsw xmm0, xmm7 - - paddsw xmm0, xmm2 - - psraw xmm0, 7 - - packuswb xmm0, xmm0 - - movq MMWORD Ptr [rdi], xmm0 - - jnz .filter_block1d8_h4_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret -;void vp9_filter_block1d16_h6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d16_h6_ssse3) PRIVATE -sym(vp9_filter_block1d16_h6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - - mov rdi, arg(2) ;output_ptr - - mov rsi, arg(0) ;src_ptr - - movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rdx, dword ptr arg(3) ;output_pitch - -.filter_block1d16_h6_rowloop_ssse3: - movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 - - movq xmm3, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 - - punpcklbw xmm0, xmm3 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 - - movdqa xmm1, xmm0 - pmaddubsw xmm0, xmm4 - - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf2bfrom1)] - - pshufb xmm2, [GLOBAL(shuf3bfrom1)] - movq xmm3, MMWORD PTR [rsi + 6] - - pmaddubsw xmm1, xmm5 - movq xmm7, MMWORD PTR [rsi + 11] - - pmaddubsw xmm2, xmm6 - punpcklbw xmm3, xmm7 - - paddsw xmm0, xmm1 - movdqa xmm1, xmm3 - - pmaddubsw xmm3, xmm4 - paddsw xmm0, xmm2 - - movdqa xmm2, xmm1 - paddsw xmm0, [GLOBAL(rd)] - - pshufb xmm1, [GLOBAL(shuf2bfrom1)] - pshufb xmm2, [GLOBAL(shuf3bfrom1)] - - psraw xmm0, 7 - pmaddubsw xmm1, xmm5 - - pmaddubsw xmm2, xmm6 - packuswb xmm0, xmm0 - - lea rsi, [rsi + rax] - paddsw xmm3, xmm1 - - paddsw xmm3, xmm2 - - paddsw xmm3, [GLOBAL(rd)] - - psraw xmm3, 7 - - packuswb xmm3, xmm3 - - punpcklqdq xmm0, xmm3 - - movdqa XMMWORD Ptr [rdi], xmm0 - - lea rdi, [rdi + rdx] - dec rcx - jnz .filter_block1d16_h6_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_filter_block1d4_h6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d4_h6_ssse3) PRIVATE -sym(vp9_filter_block1d4_h6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - movdqa xmm7, [GLOBAL(rd)] - - cmp esi, DWORD PTR [rax] - je .vp9_filter_block1d4_h4_ssse3 - - movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - - movsxd rdx, dword ptr arg(3) ;output_pitch - -;xmm3 free -.filter_block1d4_h6_rowloop_ssse3: - movdqu xmm0, XMMWORD PTR [rsi - 2] - - movdqa xmm1, xmm0 - pshufb xmm0, [GLOBAL(shuf1b)] - - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf2b)] 
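[Editor's note] All of the SSSE3 kernels in this file, like the new 8-tap macros above, lean on pmaddubsw, which multiplies unsigned bytes from one operand by signed bytes from the other and saturating-adds adjacent products into 16-bit lanes; that is why the taps are pre-packed into signed-byte pairs (k0_k5, k1_k3, k2_k4 here; k0k1..k6k7 via packsswb in the 8-tap code) and the pixels interleaved with pshufb. A lane-level emulation for reference (hypothetical helper, not from the tree):

    #include <stdint.h>

    static int16_t sat16(int32_t v) {
      return v > 32767 ? 32767 : v < -32768 ? -32768 : (int16_t)v;
    }

    /* Emulate one 128-bit pmaddubsw:
     * dst[i] = sat16(a[2i]*b[2i] + a[2i+1]*b[2i+1]),
     * with a treated as unsigned bytes and b as signed bytes. */
    static void pmaddubsw_ref(const uint8_t a[16], const int8_t b[16],
                              int16_t dst[8]) {
      int i;
      for (i = 0; i < 8; ++i)
        dst[i] = sat16((int32_t)a[2 * i] * b[2 * i] +
                       (int32_t)a[2 * i + 1] * b[2 * i + 1]);
    }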
- pmaddubsw xmm0, xmm4 - pshufb xmm2, [GLOBAL(shuf3b)] - pmaddubsw xmm1, xmm5 - -;-- - pmaddubsw xmm2, xmm6 - - lea rsi, [rsi + rax] -;-- - paddsw xmm0, xmm1 - paddsw xmm0, xmm7 - pxor xmm1, xmm1 - paddsw xmm0, xmm2 - psraw xmm0, 7 - packuswb xmm0, xmm0 - - movd DWORD PTR [rdi], xmm0 - - add rdi, rdx - dec rcx - jnz .filter_block1d4_h6_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -.vp9_filter_block1d4_h4_ssse3: - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - movdqa xmm0, XMMWORD PTR [GLOBAL(shuf2b)] - movdqa xmm3, XMMWORD PTR [GLOBAL(shuf3b)] - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - - movsxd rdx, dword ptr arg(3) ;output_pitch - -.filter_block1d4_h4_rowloop_ssse3: - movdqu xmm1, XMMWORD PTR [rsi - 2] - - movdqa xmm2, xmm1 - pshufb xmm1, xmm0 ;;[GLOBAL(shuf2b)] - pshufb xmm2, xmm3 ;;[GLOBAL(shuf3b)] - pmaddubsw xmm1, xmm5 - -;-- - pmaddubsw xmm2, xmm6 - - lea rsi, [rsi + rax] -;-- - paddsw xmm1, xmm7 - paddsw xmm1, xmm2 - psraw xmm1, 7 - packuswb xmm1, xmm1 - - movd DWORD PTR [rdi], xmm1 - - add rdi, rdx - dec rcx - jnz .filter_block1d4_h4_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - - -;void vp9_filter_block1d16_v6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d16_v6_ssse3) PRIVATE -sym(vp9_filter_block1d16_v6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - - cmp esi, DWORD PTR [rax] - je .vp9_filter_block1d16_v4_ssse3 - - movdqa xmm5, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - mov rdi, arg(2) ;output_ptr - -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ;out_pitch -%endif - mov rax, rsi - movsxd rcx, DWORD PTR arg(4) ;output_height - add rax, rdx - - -.vp9_filter_block1d16_v6_ssse3_loop: - movq xmm1, MMWORD PTR [rsi] ;A - movq xmm2, MMWORD PTR [rsi + rdx] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - movq xmm0, MMWORD PTR [rax + rdx * 4] ;F - - pmaddubsw xmm3, xmm6 - punpcklbw xmm1, xmm0 ;A F - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm5 - - paddsw xmm2, xmm3 - paddsw xmm2, xmm1 - paddsw xmm2, [GLOBAL(rd)] - psraw xmm2, 7 - packuswb xmm2, xmm2 - - movq MMWORD PTR [rdi], xmm2 ;store the results - - movq xmm1, MMWORD PTR [rsi + 8] ;A - movq xmm2, MMWORD PTR [rsi + rdx + 8] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2 + 8] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - movq xmm0, MMWORD PTR [rax + rdx * 4 + 8] ;F - pmaddubsw xmm3, xmm6 - punpcklbw xmm1, xmm0 ;A F - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm5 - - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw xmm2, xmm3 - paddsw xmm2, xmm1 - paddsw xmm2, [GLOBAL(rd)] - psraw xmm2, 7 - packuswb 
xmm2, xmm2 - - movq MMWORD PTR [rdi+8], xmm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;out_pitch -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d16_v6_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -.vp9_filter_block1d16_v4_ssse3: - movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - mov rdi, arg(2) ;output_ptr - -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ;out_pitch -%endif - mov rax, rsi - movsxd rcx, DWORD PTR arg(4) ;output_height - add rax, rdx - -.vp9_filter_block1d16_v4_ssse3_loop: - movq xmm2, MMWORD PTR [rsi + rdx] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - pmaddubsw xmm3, xmm6 - pmaddubsw xmm2, xmm7 - movq xmm5, MMWORD PTR [rsi + rdx + 8] ;B - movq xmm1, MMWORD PTR [rsi + rdx * 2 + 8] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E - - paddsw xmm2, [GLOBAL(rd)] - paddsw xmm2, xmm3 - psraw xmm2, 7 - packuswb xmm2, xmm2 - - punpcklbw xmm5, xmm4 ;B D - punpcklbw xmm1, xmm0 ;C E - - pmaddubsw xmm1, xmm6 - pmaddubsw xmm5, xmm7 - - movdqa xmm4, [GLOBAL(rd)] - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw xmm5, xmm1 - paddsw xmm5, xmm4 - psraw xmm5, 7 - packuswb xmm5, xmm5 - - punpcklqdq xmm2, xmm5 - - movdqa XMMWORD PTR [rdi], xmm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;out_pitch -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d16_v4_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_filter_block1d8_v6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d8_v6_ssse3) PRIVATE -sym(vp9_filter_block1d8_v6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - mov rdi, arg(2) ;output_ptr -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ; out_pitch -%endif - movsxd rcx, DWORD PTR arg(4) ;[output_height] - - cmp esi, DWORD PTR [rax] - je .vp9_filter_block1d8_v4_ssse3 - - movdqa xmm5, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - - mov rax, rsi - add rax, rdx - -.vp9_filter_block1d8_v6_ssse3_loop: - movq xmm1, MMWORD PTR [rsi] ;A - movq xmm2, MMWORD PTR [rsi + rdx] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - movq xmm0, MMWORD PTR [rax + rdx * 4] ;F - movdqa xmm4, [GLOBAL(rd)] - - pmaddubsw xmm3, xmm6 - punpcklbw xmm1, xmm0 ;A F - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm5 - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw xmm2, xmm3 - paddsw xmm2, xmm1 - paddsw xmm2, xmm4 - psraw xmm2, 7 - packuswb xmm2, xmm2 - - movq MMWORD PTR [rdi], xmm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;[out_pitch] -%else - add rdi, r8 -%endif - dec rcx - jnz 
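
The vertical loops above read six source rows (labelled A through F in the comments) and feed pmaddubsw three interleaved pairs: (A,F) against k0_k5, (B,D) against k1_k3, and (C,E) against k2_k4, so each lane accumulates the full 6-tap sum; the second base pointer (rax = rsi + pitch) lets rows D and F be addressed without a times-three or times-five scale. The same output row in scalar C, with illustrative names:

    #include <stdint.h>

    static uint8_t clamp255(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

    /* One output row of the v6 loops: rows A..F sit at consecutive
     * multiples of the source pitch. */
    static void filter_v6_ref(const uint8_t *src, int pitch, uint8_t *dst,
                              int width, const int16_t t[6]) {
      for (int x = 0; x < width; ++x) {
        const int a = src[x],             b = src[x + pitch];
        const int c = src[x + 2 * pitch], d = src[x + 3 * pitch];
        const int e = src[x + 4 * pitch], f = src[x + 5 * pitch];
        int sum = (a * t[0] + f * t[5])   /* pmaddubsw vs k0_k5 */
                + (b * t[1] + d * t[3])   /* pmaddubsw vs k1_k3 */
                + (c * t[2] + e * t[4])   /* pmaddubsw vs k2_k4 */
                + 64;                     /* rd */
        dst[x] = clamp255(sum >> 7);
      }
    }
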
.vp9_filter_block1d8_v6_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -.vp9_filter_block1d8_v4_ssse3: - movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 - movdqa xmm5, [GLOBAL(rd)] - - mov rsi, arg(0) ;src_ptr - - mov rax, rsi - add rax, rdx - -.vp9_filter_block1d8_v4_ssse3_loop: - movq xmm2, MMWORD PTR [rsi + rdx] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - pmaddubsw xmm3, xmm6 - pmaddubsw xmm2, xmm7 - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw xmm2, xmm3 - paddsw xmm2, xmm5 - psraw xmm2, 7 - packuswb xmm2, xmm2 - - movq MMWORD PTR [rdi], xmm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;[out_pitch] -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d8_v4_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret -;void vp9_filter_block1d4_v6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d4_v6_ssse3) PRIVATE -sym(vp9_filter_block1d4_v6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - mov rdi, arg(2) ;output_ptr -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ; out_pitch -%endif - movsxd rcx, DWORD PTR arg(4) ;[output_height] - - cmp esi, DWORD PTR [rax] - je .vp9_filter_block1d4_v4_ssse3 - - movq mm5, MMWORD PTR [rax] ;k0_k5 - movq mm6, MMWORD PTR [rax+256] ;k2_k4 - movq mm7, MMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - - mov rax, rsi - add rax, rdx - -.vp9_filter_block1d4_v6_ssse3_loop: - movd mm1, DWORD PTR [rsi] ;A - movd mm2, DWORD PTR [rsi + rdx] ;B - movd mm3, DWORD PTR [rsi + rdx * 2] ;C - movd mm4, DWORD PTR [rax + rdx * 2] ;D - movd mm0, DWORD PTR [rsi + rdx * 4] ;E - - punpcklbw mm2, mm4 ;B D - punpcklbw mm3, mm0 ;C E - - movd mm0, DWORD PTR [rax + rdx * 4] ;F - - movq mm4, [GLOBAL(rd)] - - pmaddubsw mm3, mm6 - punpcklbw mm1, mm0 ;A F - pmaddubsw mm2, mm7 - pmaddubsw mm1, mm5 - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw mm2, mm3 - paddsw mm2, mm1 - paddsw mm2, mm4 - psraw mm2, 7 - packuswb mm2, mm2 - - movd DWORD PTR [rdi], mm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;[out_pitch] -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d4_v6_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -.vp9_filter_block1d4_v4_ssse3: - movq mm6, MMWORD PTR [rax+256] ;k2_k4 - movq mm7, MMWORD PTR [rax+128] ;k1_k3 - movq mm5, MMWORD PTR [GLOBAL(rd)] - - mov rsi, arg(0) ;src_ptr - - mov rax, rsi - add rax, rdx - -.vp9_filter_block1d4_v4_ssse3_loop: - movd mm2, DWORD PTR [rsi + rdx] ;B - movd mm3, DWORD PTR [rsi + rdx * 2] ;C - movd mm4, DWORD PTR [rax + rdx * 2] ;D - movd mm0, DWORD PTR [rsi + rdx * 4] ;E - - punpcklbw mm2, mm4 ;B D - punpcklbw mm3, mm0 ;C E - - pmaddubsw mm3, mm6 - pmaddubsw mm2, mm7 - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw mm2, mm3 - paddsw mm2, mm5 - psraw mm2, 7 - packuswb mm2, mm2 - - movd DWORD PTR [rdi], mm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) 
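
All of these loops accumulate with paddsw, a signed 16-bit saturating add, and the pairing of taps is what keeps each pmaddubsw partial product in range: row 1 of the tables pairs 123 with -1 (worst case 255 * 123 = 31365, inside int16), whereas pairing 123 with 12 could reach 255 * 135 = 34425 and saturate inside the multiply-add, corrupting the sum. A one-function model of the paddsw lane semantics (intrinsics code would use _mm_adds_epi16):

    #include <stdint.h>

    /* 16-bit signed saturating add, as paddsw performs per lane. */
    static int16_t adds_epi16(int16_t a, int16_t b) {
      const int32_t s = (int32_t)a + b;
      if (s > INT16_MAX) return INT16_MAX;
      if (s < INT16_MIN) return INT16_MIN;
      return (int16_t)s;
    }
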
;[out_pitch] -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d4_v4_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_bilinear_predict16x16_ssse3 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp9_bilinear_predict16x16_ssse3) PRIVATE -sym(vp9_bilinear_predict16x16_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - lea rcx, [GLOBAL(bilinear_filters_ssse3)] - movsxd rax, dword ptr arg(2) ; xoffset - - cmp rax, 0 ; skip first_pass filter if xoffset=0 - je .b16x16_sp_only - - shl rax, 4 - lea rax, [rax + rcx] ; HFilter - - mov rdi, arg(4) ; dst_ptr - mov rsi, arg(0) ; src_ptr - movsxd rdx, dword ptr arg(5) ; dst_pitch - - movdqa xmm1, [rax] - - movsxd rax, dword ptr arg(3) ; yoffset - - cmp rax, 0 ; skip second_pass filter if yoffset=0 - je .b16x16_fp_only - - shl rax, 4 - lea rax, [rax + rcx] ; VFilter - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rdx, dword ptr arg(1) ; src_pixels_per_line - - movdqa xmm2, [rax] - -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(5) ; dst_pitch -%endif - movq xmm3, [rsi] ; 00 01 02 03 04 05 06 07 - movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08 - - punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08 - movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15 - - movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16 - - lea rsi, [rsi + rdx] ; next line - - pmaddubsw xmm3, xmm1 ; 00 02 04 06 08 10 12 14 - - punpcklbw xmm4, xmm5 ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16 - pmaddubsw xmm4, xmm1 ; 01 03 05 07 09 11 13 15 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value - psraw xmm4, VP9_FILTER_SHIFT ; xmm4 /= 128 - - movdqa xmm7, xmm3 - packuswb xmm7, xmm4 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - -.next_row: - movq xmm6, [rsi] ; 00 01 02 03 04 05 06 07 - movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08 - - punpcklbw xmm6, xmm5 - movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15 - - movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16 - lea rsi, [rsi + rdx] ; next line - - pmaddubsw xmm6, xmm1 - - punpcklbw xmm4, xmm5 - pmaddubsw xmm4, xmm1 - - paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value - psraw xmm6, VP9_FILTER_SHIFT ; xmm6 /= 128 - - paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value - psraw xmm4, VP9_FILTER_SHIFT ; xmm4 /= 128 - - packuswb xmm6, xmm4 - movdqa xmm5, xmm7 - - punpcklbw xmm5, xmm6 - pmaddubsw xmm5, xmm2 - - punpckhbw xmm7, xmm6 - pmaddubsw xmm7, xmm2 - - paddw xmm5, [GLOBAL(rd)] ; xmm5 += round value - psraw xmm5, VP9_FILTER_SHIFT ; xmm5 /= 128 - - paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value - psraw xmm7, VP9_FILTER_SHIFT ; xmm7 /= 128 - - packuswb xmm5, xmm7 - movdqa xmm7, xmm6 - - movdqa [rdi], xmm5 ; store the results in the destination -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(5) ; dst_pitch -%else - add rdi, r8 -%endif - - cmp rdi, rcx - jne .next_row - - jmp .done - -.b16x16_sp_only: - movsxd rax, dword ptr arg(3) ; yoffset - shl rax, 4 - lea rax, [rax + rcx] ; VFilter - - mov rdi, arg(4) ; dst_ptr - mov rsi, arg(0) ; src_ptr - movsxd rdx, dword ptr arg(5) ; dst_pitch - - movdqa xmm1, [rax] ; VFilter - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ; src_pixels_per_line - - ; get the first horizontal line done - movq xmm4, [rsi] ; load row 0 - movq xmm2, [rsi + 
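
The bilinear path above is a classic two-pass interpolation: each first-pass row blends horizontally neighbouring pixels with one row of the bilinear table (two weights summing to 128), and the second pass blends consecutive first-pass rows with the yoffset row of the same table, re-rounding after each pass. A scalar sketch under those assumptions, with w limited to 16 to keep the scratch rows small:

    #include <stdint.h>

    #define FILTER_SHIFT 7                    /* VP9_FILTER_SHIFT */
    #define ROUND (1 << (FILTER_SHIFT - 1))   /* the rd constant, 64 */

    /* Two-pass bilinear: horizontal blend of (x, x+1), then vertical
     * blend of consecutive first-pass rows. hf/vf each sum to 128, so
     * no final clamp is needed. Requires w <= 16. */
    static void bilinear_ref(const uint8_t *src, int src_stride,
                             uint8_t *dst, int dst_stride, int w, int h,
                             const uint8_t hf[2], const uint8_t vf[2]) {
      uint8_t rows[2][16];
      uint8_t *prev = rows[0], *cur = rows[1];
      for (int x = 0; x < w; ++x)             /* prime with row 0 */
        prev[x] = (src[x] * hf[0] + src[x + 1] * hf[1] + ROUND) >> FILTER_SHIFT;
      src += src_stride;
      for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x)
          cur[x] = (src[x] * hf[0] + src[x + 1] * hf[1] + ROUND) >> FILTER_SHIFT;
        for (int x = 0; x < w; ++x)
          dst[x] = (prev[x] * vf[0] + cur[x] * vf[1] + ROUND) >> FILTER_SHIFT;
        uint8_t *t = prev; prev = cur; cur = t; /* roll the row pair */
        src += src_stride;
        dst += dst_stride;
      }
    }
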
8] ; load row 0 - - lea rsi, [rsi + rax] ; next line -.next_row_sp: - movq xmm3, [rsi] ; load row + 1 - movq xmm5, [rsi + 8] ; load row + 1 - - punpcklbw xmm4, xmm3 - punpcklbw xmm2, xmm5 - - pmaddubsw xmm4, xmm1 - movq xmm7, [rsi + rax] ; load row + 2 - - pmaddubsw xmm2, xmm1 - movq xmm6, [rsi + rax + 8] ; load row + 2 - - punpcklbw xmm3, xmm7 - punpcklbw xmm5, xmm6 - - pmaddubsw xmm3, xmm1 - paddw xmm4, [GLOBAL(rd)] - - pmaddubsw xmm5, xmm1 - paddw xmm2, [GLOBAL(rd)] - - psraw xmm4, VP9_FILTER_SHIFT - psraw xmm2, VP9_FILTER_SHIFT - - packuswb xmm4, xmm2 - paddw xmm3, [GLOBAL(rd)] - - movdqa [rdi], xmm4 ; store row 0 - paddw xmm5, [GLOBAL(rd)] - - psraw xmm3, VP9_FILTER_SHIFT - psraw xmm5, VP9_FILTER_SHIFT - - packuswb xmm3, xmm5 - movdqa xmm4, xmm7 - - movdqa [rdi + rdx],xmm3 ; store row 1 - lea rsi, [rsi + 2*rax] - - movdqa xmm2, xmm6 - lea rdi, [rdi + 2*rdx] - - cmp rdi, rcx - jne .next_row_sp - - jmp .done - -.b16x16_fp_only: - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ; src_pixels_per_line - -.next_row_fp: - movq xmm2, [rsi] ; 00 01 02 03 04 05 06 07 - movq xmm4, [rsi+1] ; 01 02 03 04 05 06 07 08 - - punpcklbw xmm2, xmm4 - movq xmm3, [rsi+8] ; 08 09 10 11 12 13 14 15 - - pmaddubsw xmm2, xmm1 - movq xmm4, [rsi+9] ; 09 10 11 12 13 14 15 16 - - lea rsi, [rsi + rax] ; next line - punpcklbw xmm3, xmm4 - - pmaddubsw xmm3, xmm1 - movq xmm5, [rsi] - - paddw xmm2, [GLOBAL(rd)] - movq xmm7, [rsi+1] - - movq xmm6, [rsi+8] - psraw xmm2, VP9_FILTER_SHIFT - - punpcklbw xmm5, xmm7 - movq xmm7, [rsi+9] - - paddw xmm3, [GLOBAL(rd)] - pmaddubsw xmm5, xmm1 - - psraw xmm3, VP9_FILTER_SHIFT - punpcklbw xmm6, xmm7 - - packuswb xmm2, xmm3 - pmaddubsw xmm6, xmm1 - - movdqa [rdi], xmm2 ; store the results in the destination - paddw xmm5, [GLOBAL(rd)] - - lea rdi, [rdi + rdx] ; dst_pitch - psraw xmm5, VP9_FILTER_SHIFT - - paddw xmm6, [GLOBAL(rd)] - psraw xmm6, VP9_FILTER_SHIFT - - packuswb xmm5, xmm6 - lea rsi, [rsi + rax] ; next line - - movdqa [rdi], xmm5 ; store the results in the destination - lea rdi, [rdi + rdx] ; dst_pitch - - cmp rdi, rcx - - jne .next_row_fp - -.done: - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_bilinear_predict8x8_ssse3 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp9_bilinear_predict8x8_ssse3) PRIVATE -sym(vp9_bilinear_predict8x8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 144 ; reserve 144 bytes - - lea rcx, [GLOBAL(bilinear_filters_ssse3)] - - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - ;Read 9-line unaligned data in and put them on stack. This gives a big - ;performance boost. 
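
The .b16x16_sp_only and .b16x16_fp_only labels above handle the degenerate offsets: xoffset == 0 skips the horizontal pass entirely and yoffset == 0 skips the vertical one, saving a full pass over the block. In outline (stand-in names, not symbols from this file):

    static void two_pass(void) { /* horizontal, then vertical */ }
    static void h_only(void)   { /* .b16x16_fp_only */ }
    static void v_only(void)   { /* .b16x16_sp_only */ }

    static void bilinear_predict(int xoffset, int yoffset) {
      if (xoffset == 0)
        v_only();    /* also covers xoffset == yoffset == 0, since filter
                        row 0 is {128, 0}: a straight weighted copy */
      else if (yoffset == 0)
        h_only();
      else
        two_pass();
    }
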
- movdqu xmm0, [rsi] - lea rax, [rdx + rdx*2] - movdqu xmm1, [rsi+rdx] - movdqu xmm2, [rsi+rdx*2] - add rsi, rax - movdqu xmm3, [rsi] - movdqu xmm4, [rsi+rdx] - movdqu xmm5, [rsi+rdx*2] - add rsi, rax - movdqu xmm6, [rsi] - movdqu xmm7, [rsi+rdx] - - movdqa XMMWORD PTR [rsp], xmm0 - - movdqu xmm0, [rsi+rdx*2] - - movdqa XMMWORD PTR [rsp+16], xmm1 - movdqa XMMWORD PTR [rsp+32], xmm2 - movdqa XMMWORD PTR [rsp+48], xmm3 - movdqa XMMWORD PTR [rsp+64], xmm4 - movdqa XMMWORD PTR [rsp+80], xmm5 - movdqa XMMWORD PTR [rsp+96], xmm6 - movdqa XMMWORD PTR [rsp+112], xmm7 - movdqa XMMWORD PTR [rsp+128], xmm0 - - movsxd rax, dword ptr arg(2) ; xoffset - cmp rax, 0 ; skip first_pass filter if xoffset=0 - je .b8x8_sp_only - - shl rax, 4 - add rax, rcx ; HFilter - - mov rdi, arg(4) ; dst_ptr - movsxd rdx, dword ptr arg(5) ; dst_pitch - - movdqa xmm0, [rax] - - movsxd rax, dword ptr arg(3) ; yoffset - cmp rax, 0 ; skip second_pass filter if yoffset=0 - je .b8x8_fp_only - - shl rax, 4 - lea rax, [rax + rcx] ; VFilter - - lea rcx, [rdi+rdx*8] - - movdqa xmm1, [rax] - - ; get the first horizontal line done - movdqa xmm3, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - movdqa xmm5, xmm3 ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx - - psrldq xmm5, 1 - lea rsp, [rsp + 16] ; next line - - punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08 - pmaddubsw xmm3, xmm0 ; 00 02 04 06 08 10 12 14 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - movdqa xmm7, xmm3 - packuswb xmm7, xmm7 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - -.next_row: - movdqa xmm6, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - lea rsp, [rsp + 16] ; next line - - movdqa xmm5, xmm6 - - psrldq xmm5, 1 - - punpcklbw xmm6, xmm5 - pmaddubsw xmm6, xmm0 - - paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value - psraw xmm6, VP9_FILTER_SHIFT ; xmm6 /= 128 - - packuswb xmm6, xmm6 - - punpcklbw xmm7, xmm6 - pmaddubsw xmm7, xmm1 - - paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value - psraw xmm7, VP9_FILTER_SHIFT ; xmm7 /= 128 - - packuswb xmm7, xmm7 - - movq [rdi], xmm7 ; store the results in the destination - lea rdi, [rdi + rdx] - - movdqa xmm7, xmm6 - - cmp rdi, rcx - jne .next_row - - jmp .done8x8 - -.b8x8_sp_only: - movsxd rax, dword ptr arg(3) ; yoffset - shl rax, 4 - lea rax, [rax + rcx] ; VFilter - - mov rdi, arg(4) ;dst_ptr - movsxd rdx, dword ptr arg(5) ; dst_pitch - - movdqa xmm0, [rax] ; VFilter - - movq xmm1, XMMWORD PTR [rsp] - movq xmm2, XMMWORD PTR [rsp+16] - - movq xmm3, XMMWORD PTR [rsp+32] - punpcklbw xmm1, xmm2 - - movq xmm4, XMMWORD PTR [rsp+48] - punpcklbw xmm2, xmm3 - - movq xmm5, XMMWORD PTR [rsp+64] - punpcklbw xmm3, xmm4 - - movq xmm6, XMMWORD PTR [rsp+80] - punpcklbw xmm4, xmm5 - - movq xmm7, XMMWORD PTR [rsp+96] - punpcklbw xmm5, xmm6 - - pmaddubsw xmm1, xmm0 - pmaddubsw xmm2, xmm0 - - pmaddubsw xmm3, xmm0 - pmaddubsw xmm4, xmm0 - - pmaddubsw xmm5, xmm0 - punpcklbw xmm6, xmm7 - - pmaddubsw xmm6, xmm0 - paddw xmm1, [GLOBAL(rd)] - - paddw xmm2, [GLOBAL(rd)] - psraw xmm1, VP9_FILTER_SHIFT - - paddw xmm3, [GLOBAL(rd)] - psraw xmm2, VP9_FILTER_SHIFT - - paddw xmm4, [GLOBAL(rd)] - psraw xmm3, VP9_FILTER_SHIFT - - paddw xmm5, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - paddw xmm6, [GLOBAL(rd)] - psraw xmm5, VP9_FILTER_SHIFT - - psraw xmm6, VP9_FILTER_SHIFT - packuswb xmm1, xmm1 - - packuswb xmm2, xmm2 - movq [rdi], xmm1 - - packuswb xmm3, xmm3 - movq [rdi+rdx], xmm2 - - packuswb xmm4, xmm4 - movq xmm1, XMMWORD PTR [rsp+112] - - lea rdi, [rdi + 
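
The movdqu burst above implements the comment that precedes it: nine unaligned source rows (eight outputs plus one extra for the vertical blend) are staged once into an aligned stack area, so everything afterwards can use aligned movdqa accesses and simple rsp-relative addressing. The same staging idea in C, with an illustrative helper:

    #include <stdint.h>
    #include <string.h>

    /* Stage 9 source rows into aligned scratch; 16 bytes per row matches
     * the movdqa stores above. One unaligned pass in, aligned reads out. */
    static void stage_rows(const uint8_t *src, int src_stride,
                           uint8_t scratch[9][16]) {
      for (int r = 0; r < 9; ++r)
        memcpy(scratch[r], src + r * src_stride, 16);
    }
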
2*rdx] - movq xmm2, XMMWORD PTR [rsp+128] - - packuswb xmm5, xmm5 - movq [rdi], xmm3 - - packuswb xmm6, xmm6 - movq [rdi+rdx], xmm4 - - lea rdi, [rdi + 2*rdx] - punpcklbw xmm7, xmm1 - - movq [rdi], xmm5 - pmaddubsw xmm7, xmm0 - - movq [rdi+rdx], xmm6 - punpcklbw xmm1, xmm2 - - pmaddubsw xmm1, xmm0 - paddw xmm7, [GLOBAL(rd)] - - psraw xmm7, VP9_FILTER_SHIFT - paddw xmm1, [GLOBAL(rd)] - - psraw xmm1, VP9_FILTER_SHIFT - packuswb xmm7, xmm7 - - packuswb xmm1, xmm1 - lea rdi, [rdi + 2*rdx] - - movq [rdi], xmm7 - - movq [rdi+rdx], xmm1 - lea rsp, [rsp + 144] - - jmp .done8x8 - -.b8x8_fp_only: - lea rcx, [rdi+rdx*8] - -.next_row_fp: - movdqa xmm1, XMMWORD PTR [rsp] - movdqa xmm3, XMMWORD PTR [rsp+16] - - movdqa xmm2, xmm1 - movdqa xmm5, XMMWORD PTR [rsp+32] - - psrldq xmm2, 1 - movdqa xmm7, XMMWORD PTR [rsp+48] - - movdqa xmm4, xmm3 - psrldq xmm4, 1 - - movdqa xmm6, xmm5 - psrldq xmm6, 1 - - punpcklbw xmm1, xmm2 - pmaddubsw xmm1, xmm0 - - punpcklbw xmm3, xmm4 - pmaddubsw xmm3, xmm0 - - punpcklbw xmm5, xmm6 - pmaddubsw xmm5, xmm0 - - movdqa xmm2, xmm7 - psrldq xmm2, 1 - - punpcklbw xmm7, xmm2 - pmaddubsw xmm7, xmm0 - - paddw xmm1, [GLOBAL(rd)] - psraw xmm1, VP9_FILTER_SHIFT - - paddw xmm3, [GLOBAL(rd)] - psraw xmm3, VP9_FILTER_SHIFT - - paddw xmm5, [GLOBAL(rd)] - psraw xmm5, VP9_FILTER_SHIFT - - paddw xmm7, [GLOBAL(rd)] - psraw xmm7, VP9_FILTER_SHIFT - - packuswb xmm1, xmm1 - packuswb xmm3, xmm3 - - packuswb xmm5, xmm5 - movq [rdi], xmm1 - - packuswb xmm7, xmm7 - movq [rdi+rdx], xmm3 - - lea rdi, [rdi + 2*rdx] - movq [rdi], xmm5 - - lea rsp, [rsp + 4*16] - movq [rdi+rdx], xmm7 - - lea rdi, [rdi + 2*rdx] - cmp rdi, rcx - - jne .next_row_fp - - lea rsp, [rsp + 16] - -.done8x8: - ;add rsp, 144 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -shuf1b: - db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 -shuf2b: - db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11 -shuf3b: - db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10 - -align 16 -shuf2bfrom1: - db 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13 -align 16 -shuf3bfrom1: - db 2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11 - -align 16 -rd: - times 8 dw 0x40 - -align 16 -k0_k5: - times 8 db 0, 0 ;placeholder - times 8 db 0, 0 - times 8 db 2, 1 - times 8 db 0, 0 - times 8 db 3, 3 - times 8 db 0, 0 - times 8 db 1, 2 - times 8 db 0, 0 -k1_k3: - times 8 db 0, 0 ;placeholder - times 8 db -6, 12 - times 8 db -11, 36 - times 8 db -9, 50 - times 8 db -16, 77 - times 8 db -6, 93 - times 8 db -8, 108 - times 8 db -1, 123 -k2_k4: - times 8 db 128, 0 ;placeholder - times 8 db 123, -1 - times 8 db 108, -8 - times 8 db 93, -6 - times 8 db 77, -16 - times 8 db 50, -9 - times 8 db 36, -11 - times 8 db 12, -6 -align 16 -bilinear_filters_ssse3: - times 8 db 128, 0 - times 8 db 120, 8 - times 8 db 112, 16 - times 8 db 104, 24 - times 8 db 96, 32 - times 8 db 88, 40 - times 8 db 80, 48 - times 8 db 72, 56 - times 8 db 64, 64 - times 8 db 56, 72 - times 8 db 48, 80 - times 8 db 40, 88 - times 8 db 32, 96 - times 8 db 24, 104 - times 8 db 16, 112 - times 8 db 8, 120 - diff --git a/vp9/common/x86/vp9_subpixel_x86.h b/vp9/common/x86/vp9_subpixel_x86.h deleted file mode 100644 index 25bc26d9b..000000000 --- a/vp9/common/x86/vp9_subpixel_x86.h +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
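
The k0_k5 / k1_k3 / k2_k4 tables above are the usual 6-tap subpel coefficients re-packed into the byte pairs pmaddubsw consumes; row 1, for example, encodes the taps {0, -6, 123, 12, -1, 0}, and every row sums to 128, which is why adding rd (64) and shifting right by 7 gives unity gain. A small self-check in C, assuming that row:

    #include <assert.h>
    #include <stdint.h>

    /* Row 1 of the packed tables, unpacked back into 6-tap order. */
    static const int8_t taps1[6] = { 0, -6, 123, 12, -1, 0 };

    static void check_row1(void) {
      const int8_t k0_k5[2] = { taps1[0], taps1[5] };  /* {   0,  0 } */
      const int8_t k1_k3[2] = { taps1[1], taps1[3] };  /* {  -6, 12 } */
      const int8_t k2_k4[2] = { taps1[2], taps1[4] };  /* { 123, -1 } */
      int sum = 0;
      for (int i = 0; i < 6; ++i)
        sum += taps1[i];
      assert(sum == 128);           /* unity gain after +64 and >> 7 */
      (void)k0_k5; (void)k1_k3; (void)k2_k4;
    }
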
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP9_COMMON_X86_VP9_SUBPIXEL_X86_H_
-#define VP9_COMMON_X86_VP9_SUBPIXEL_X86_H_
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code
- */
-
-#if HAVE_MMX
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_mmx);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_mmx);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_mmx);
-extern prototype_subpixel_predict(vp9_sixtap_predict4x4_mmx);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_mmx);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_mmx
-
-#undef vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_mmx
-
-#undef vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_mmx
-
-#undef vp9_subpix_sixtap4x4
-#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_mmx
-
-#undef vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_mmx
-
-#endif
-#endif
-
-
-#if HAVE_SSE2
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_sse2);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_sse2);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_sse2);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_sse2);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_sse2);
-
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_sse2
-
-#undef vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_sse2
-
-#undef vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_sse2
-
-#undef vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_sse2
-
-#undef vp9_subpix_bilinear8x8
-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_sse2
-
-#endif
-#endif
-
-#if HAVE_SSSE3
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_ssse3);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_ssse3);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_ssse3);
-extern prototype_subpixel_predict(vp9_sixtap_predict4x4_ssse3);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_ssse3);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_ssse3);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_ssse3
-
-#undef vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_ssse3
-
-#undef vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_ssse3
-
-#undef vp9_subpix_sixtap4x4
-#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_ssse3
-
-
-#undef vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_ssse3
-
-#undef vp9_subpix_bilinear8x8
-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_ssse3
-
-#endif
-#endif
-
-
-
-#endif
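
The deleted header above gave static (non-RTCD) builds their dispatch purely at compile time: each section re-#undefs and re-#defines the vp9_subpix_* names, so the newest extension compiled in wins, and its note reminds maintainers to keep the runtime-detection function-pointer initialization in sync. For one symbol, the net effect is equivalent to this sketch:

    /* Equivalent shape of the header's ladder for a single symbol when
     * runtime CPU detection is compiled out. */
    #if !CONFIG_RUNTIME_CPU_DETECT
    # if HAVE_SSSE3
    #  define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_ssse3
    # elif HAVE_SSE2
    #  define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_sse2
    # elif HAVE_MMX
    #  define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_mmx
    # endif
    #endif
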