path: root/vp8/encoder/x86
Diffstat (limited to 'vp8/encoder/x86')
-rw-r--r--  vp8/encoder/x86/csystemdependent.c      289
-rw-r--r--  vp8/encoder/x86/dct_mmx.asm             846
-rw-r--r--  vp8/encoder/x86/dct_sse2.asm            260
-rw-r--r--  vp8/encoder/x86/dct_x86.h                73
-rw-r--r--  vp8/encoder/x86/encodemb_x86.h           73
-rw-r--r--  vp8/encoder/x86/encodeopt.asm           393
-rw-r--r--  vp8/encoder/x86/fwalsh_sse2.asm         117
-rw-r--r--  vp8/encoder/x86/mcomp_x86.h              27
-rw-r--r--  vp8/encoder/x86/preproc_mmx.c           297
-rw-r--r--  vp8/encoder/x86/quantize_mmx.asm        438
-rw-r--r--  vp8/encoder/x86/sad_mmx.asm             428
-rw-r--r--  vp8/encoder/x86/sad_sse2.asm            329
-rw-r--r--  vp8/encoder/x86/sad_sse3.asm            939
-rw-r--r--  vp8/encoder/x86/sad_ssse3.asm           367
-rw-r--r--  vp8/encoder/x86/subtract_mmx.asm        431
-rw-r--r--  vp8/encoder/x86/variance_impl_mmx.asm   980
-rw-r--r--  vp8/encoder/x86/variance_impl_sse2.asm  975
-rw-r--r--  vp8/encoder/x86/variance_mmx.c          596
-rw-r--r--  vp8/encoder/x86/variance_sse2.c         514
-rw-r--r--  vp8/encoder/x86/variance_x86.h          275
-rw-r--r--  vp8/encoder/x86/x86_csystemdependent.c  287
21 files changed, 8934 insertions, 0 deletions
diff --git a/vp8/encoder/x86/csystemdependent.c b/vp8/encoder/x86/csystemdependent.c
new file mode 100644
index 000000000..186ee6856
--- /dev/null
+++ b/vp8/encoder/x86/csystemdependent.c
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "variance.h"
+#include "onyx_int.h"
+
+SADFunction *vp8_sad16x16;
+SADFunction *vp8_sad16x8;
+SADFunction *vp8_sad8x16;
+SADFunction *vp8_sad8x8;
+SADFunction *vp8_sad4x4;
+
+variance_function *vp8_variance4x4;
+variance_function *vp8_variance8x8;
+variance_function *vp8_variance8x16;
+variance_function *vp8_variance16x8;
+variance_function *vp8_variance16x16;
+
+
+variance_function *vp8_mse16x16;
+
+sub_pixel_variance_function *vp8_sub_pixel_variance4x4;
+sub_pixel_variance_function *vp8_sub_pixel_variance8x8;
+sub_pixel_variance_function *vp8_sub_pixel_variance8x16;
+sub_pixel_variance_function *vp8_sub_pixel_variance16x8;
+sub_pixel_variance_function *vp8_sub_pixel_variance16x16;
+
+int (*vp8_block_error)(short *, short *);
+int (*vp8_mbblock_error)(MACROBLOCK *mb, int dc);
+void (*vp8_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, int stride);
+
+extern void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride);
+extern void vp8_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride);
+
+extern int vp8_block_error_c(short *, short *);
+extern int vp8_mbblock_error_c(MACROBLOCK *x, int dc);
+
+extern int vp8_block_error_mmx(short *, short *);
+extern int vp8_mbblock_error_mmx(MACROBLOCK *x, int dc);
+
+extern int vp8_block_error_xmm(short *, short *);
+extern int vp8_mbblock_error_xmm(MACROBLOCK *x, int dc);
+
+
+
+int (*vp8_mbuverror)(MACROBLOCK *mb);
+unsigned int (*vp8_get_mb_ss)(short *);
+void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
+void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
+void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
+void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);
+
+void (*vp8_subtract_b)(BLOCK *be, BLOCKD *bd, int pitch);
+void (*vp8_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
+unsigned int (*vp8_get16x16pred_error)(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
+unsigned int (*vp8_get8x8var)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+unsigned int (*vp8_get16x16var)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
+
+// c imports
+extern int vp8_mbuverror_c(MACROBLOCK *mb);
+extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+extern void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
+extern void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
+extern void vp8_fast_fdct4x4_c(short *input, short *output, int pitch);
+extern void vp8_fast_fdct8x4_c(short *input, short *output, int pitch);
+
+
+extern void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch);
+extern void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
+
+extern SADFunction vp8_sad16x16_c;
+extern SADFunction vp8_sad16x8_c;
+extern SADFunction vp8_sad8x16_c;
+extern SADFunction vp8_sad8x8_c;
+extern SADFunction vp8_sad4x4_c;
+
+extern SADFunction vp8_sad16x16_wmt;
+extern SADFunction vp8_sad16x8_wmt;
+extern SADFunction vp8_sad8x16_wmt;
+extern SADFunction vp8_sad8x8_wmt;
+extern SADFunction vp8_sad4x4_wmt;
+
+extern SADFunction vp8_sad16x16_mmx;
+extern SADFunction vp8_sad16x8_mmx;
+extern SADFunction vp8_sad8x16_mmx;
+extern SADFunction vp8_sad8x8_mmx;
+extern SADFunction vp8_sad4x4_mmx;
+
+extern variance_function vp8_variance16x16_c;
+extern variance_function vp8_variance8x16_c;
+extern variance_function vp8_variance16x8_c;
+extern variance_function vp8_variance8x8_c;
+extern variance_function vp8_variance4x4_c;
+extern variance_function vp8_mse16x16_c;
+
+extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_c;
+extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_c;
+extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_c;
+extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_c;
+extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_c;
+
+extern unsigned int vp8_get_mb_ss_c(short *);
+extern unsigned int vp8_get16x16pred_error_c(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
+extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+extern unsigned int vp8_get16x16var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+extern unsigned int vp8_get4x4sse_cs_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
+
+// mmx imports
+extern int vp8_mbuverror_mmx(MACROBLOCK *mb);
+extern void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d);
+extern void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch);
+extern void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+extern void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch);
+extern void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch);
+extern void vp8_fast_fdct8x4_mmx(short *input, short *output, int pitch);
+extern void vp8_fast_fdct4x4_mmx(short *input, short *output, int pitch);
+extern variance_function vp8_variance4x4_mmx;
+extern variance_function vp8_variance8x8_mmx;
+extern variance_function vp8_variance8x16_mmx;
+extern variance_function vp8_variance16x8_mmx;
+extern variance_function vp8_variance16x16_mmx;
+
+extern variance_function vp8_mse16x16_mmx;
+extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_mmx;
+extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_mmx;
+extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_mmx;
+extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_mmx;
+extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_mmx;
+
+extern unsigned int vp8_get16x16pred_error_mmx(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
+extern unsigned int vp8_get_mb_ss_mmx(short *);
+extern unsigned int vp8_get8x8var_mmx(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+extern unsigned int vp8_get16x16var_mmx(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+extern unsigned int vp8_get4x4sse_cs_mmx(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
+
+
+// wmt imports
+extern int vp8_mbuverror_xmm(MACROBLOCK *mb);
+extern void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d);
+extern void vp8_fast_fdct8x4_wmt(short *input, short *output, int pitch);
+extern variance_function vp8_variance4x4_wmt;
+extern variance_function vp8_variance8x8_wmt;
+extern variance_function vp8_variance8x16_wmt;
+extern variance_function vp8_variance16x8_wmt;
+extern variance_function vp8_variance16x16_wmt;
+
+extern variance_function vp8_mse16x16_wmt;
+extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_wmt;
+extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_wmt;
+extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_wmt;
+extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_wmt;
+extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_wmt;
+extern unsigned int vp8_get16x16pred_error_sse2(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
+extern unsigned int vp8_get_mb_ss_sse2(short *src_ptr);
+extern unsigned int vp8_get8x8var_sse2(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+extern unsigned int vp8_get16x16var_sse2(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+
+extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled);
+
+void vp8_cmachine_specific_config(void)
+{
+ int mmx_enabled;
+ int xmm_enabled;
+ int wmt_enabled;
+
+ vpx_get_processor_flags(&mmx_enabled, &xmm_enabled, &wmt_enabled);
+
+ if (wmt_enabled) // Willamette
+ {
+ // Willamette instruction set available:
+ vp8_mbuverror = vp8_mbuverror_xmm;
+ vp8_fast_quantize_b = vp8_fast_quantize_b_sse;
+ vp8_short_fdct4x4 = vp8_short_fdct4x4_mmx;
+ vp8_short_fdct8x4 = vp8_short_fdct8x4_mmx;
+ vp8_fast_fdct4x4 = vp8_fast_fdct4x4_mmx;
+ vp8_fast_fdct8x4 = vp8_fast_fdct8x4_wmt;
+ vp8_subtract_b = vp8_subtract_b_mmx;
+ vp8_subtract_mbuv = vp8_subtract_mbuv_mmx;
+ vp8_variance4x4 = vp8_variance4x4_mmx;
+ vp8_variance8x8 = vp8_variance8x8_mmx;
+ vp8_variance8x16 = vp8_variance8x16_wmt;
+ vp8_variance16x8 = vp8_variance16x8_wmt;
+ vp8_variance16x16 = vp8_variance16x16_wmt;
+ vp8_mse16x16 = vp8_mse16x16_wmt;
+ vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_wmt;
+ vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_wmt;
+ vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_wmt;
+ vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_wmt;
+ vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_wmt;
+ vp8_get_mb_ss = vp8_get_mb_ss_sse2;
+ vp8_get16x16pred_error = vp8_get16x16pred_error_sse2;
+ vp8_get8x8var = vp8_get8x8var_sse2;
+ vp8_get16x16var = vp8_get16x16var_sse2;
+ vp8_get4x4sse_cs = vp8_get4x4sse_cs_mmx;
+ vp8_sad16x16 = vp8_sad16x16_wmt;
+ vp8_sad16x8 = vp8_sad16x8_wmt;
+ vp8_sad8x16 = vp8_sad8x16_wmt;
+ vp8_sad8x8 = vp8_sad8x8_wmt;
+ vp8_sad4x4 = vp8_sad4x4_wmt;
+ vp8_block_error = vp8_block_error_xmm;
+ vp8_mbblock_error = vp8_mbblock_error_xmm;
+ vp8_subtract_mby = vp8_subtract_mby_mmx;
+
+ }
+ else if (mmx_enabled)
+ {
+ // MMX instruction set available:
+ vp8_mbuverror = vp8_mbuverror_mmx;
+ vp8_fast_quantize_b = vp8_fast_quantize_b_mmx;
+ vp8_short_fdct4x4 = vp8_short_fdct4x4_mmx;
+ vp8_short_fdct8x4 = vp8_short_fdct8x4_mmx;
+ vp8_fast_fdct4x4 = vp8_fast_fdct4x4_mmx;
+ vp8_fast_fdct8x4 = vp8_fast_fdct8x4_mmx;
+ vp8_subtract_b = vp8_subtract_b_mmx;
+ vp8_subtract_mbuv = vp8_subtract_mbuv_mmx;
+ vp8_variance4x4 = vp8_variance4x4_mmx;
+ vp8_variance8x8 = vp8_variance8x8_mmx;
+ vp8_variance8x16 = vp8_variance8x16_mmx;
+ vp8_variance16x8 = vp8_variance16x8_mmx;
+ vp8_variance16x16 = vp8_variance16x16_mmx;
+ vp8_mse16x16 = vp8_mse16x16_mmx;
+ vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_mmx;
+ vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_mmx;
+ vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_mmx;
+ vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_mmx;
+ vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_mmx;
+ vp8_get_mb_ss = vp8_get_mb_ss_mmx;
+ vp8_get16x16pred_error = vp8_get16x16pred_error_mmx;
+ vp8_get8x8var = vp8_get8x8var_mmx;
+ vp8_get16x16var = vp8_get16x16var_mmx;
+ vp8_get4x4sse_cs = vp8_get4x4sse_cs_mmx;
+ vp8_sad16x16 = vp8_sad16x16_mmx;
+ vp8_sad16x8 = vp8_sad16x8_mmx;
+ vp8_sad8x16 = vp8_sad8x16_mmx;
+ vp8_sad8x8 = vp8_sad8x8_mmx;
+ vp8_sad4x4 = vp8_sad4x4_mmx;
+ vp8_block_error = vp8_block_error_mmx;
+ vp8_mbblock_error = vp8_mbblock_error_mmx;
+ vp8_subtract_mby = vp8_subtract_mby_mmx;
+
+ }
+ else
+ {
+ // Pure C:
+ vp8_mbuverror = vp8_mbuverror_c;
+ vp8_fast_quantize_b = vp8_fast_quantize_b_c;
+ vp8_short_fdct4x4 = vp8_short_fdct4x4_c;
+ vp8_short_fdct8x4 = vp8_short_fdct8x4_c;
+ vp8_fast_fdct4x4 = vp8_fast_fdct4x4_c;
+ vp8_fast_fdct8x4 = vp8_fast_fdct8x4_c;
+ vp8_subtract_b = vp8_subtract_b_c;
+ vp8_subtract_mbuv = vp8_subtract_mbuv_c;
+ vp8_variance4x4 = vp8_variance4x4_c;
+ vp8_variance8x8 = vp8_variance8x8_c;
+ vp8_variance8x16 = vp8_variance8x16_c;
+ vp8_variance16x8 = vp8_variance16x8_c;
+ vp8_variance16x16 = vp8_variance16x16_c;
+ vp8_mse16x16 = vp8_mse16x16_c;
+ vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_c;
+ vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_c;
+ vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_c;
+ vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_c;
+ vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_c;
+ vp8_get_mb_ss = vp8_get_mb_ss_c;
+ vp8_get16x16pred_error = vp8_get16x16pred_error_c;
+ vp8_get8x8var = vp8_get8x8var_c;
+ vp8_get16x16var = vp8_get16x16var_c;
+ vp8_get4x4sse_cs = vp8_get4x4sse_cs_c;
+ vp8_sad16x16 = vp8_sad16x16_c;
+ vp8_sad16x8 = vp8_sad16x8_c;
+ vp8_sad8x16 = vp8_sad8x16_c;
+ vp8_sad8x8 = vp8_sad8x8_c;
+ vp8_sad4x4 = vp8_sad4x4_c;
+ vp8_block_error = vp8_block_error_c;
+ vp8_mbblock_error = vp8_mbblock_error_c;
+ vp8_subtract_mby = vp8_subtract_mby_c;
+ }
+
+}
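
csystemdependent.c is the runtime-dispatch glue for the encoder: every hot primitive is a global function pointer, and vp8_cmachine_specific_config() points each one at the best implementation the CPU flags allow (note that even the Willamette/SSE2 tier keeps the MMX fdct, subtract and 4x4 variance routines, since no wider versions existed yet). A minimal C sketch of the same pattern, with illustrative names rather than the libvpx API:

typedef unsigned int (*my_sad_fn)(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride);

/* Portable reference version, equivalent in spirit to vp8_sad16x16_c. */
static unsigned int sad16x16_c(const unsigned char *src, int src_stride,
                               const unsigned char *ref, int ref_stride)
{
    unsigned int sad = 0;
    int r, c;

    for (r = 0; r < 16; r++)
        for (c = 0; c < 16; c++)
        {
            int d = src[r * src_stride + c] - ref[r * ref_stride + c];
            sad += (d < 0) ? -d : d;
        }

    return sad;
}

/* Global pointer, defaulted to the C version... */
static my_sad_fn my_sad16x16 = sad16x16_c;

/* ...and re-pointed once at init time, as vp8_cmachine_specific_config()
 * does for the real tables. */
static void pick_sad_16x16(int have_simd)
{
    if (have_simd)
    {
        /* my_sad16x16 = sad16x16_simd;  (hand-written assembly version) */
    }
    else
    {
        my_sad16x16 = sad16x16_c;
    }
}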
diff --git a/vp8/encoder/x86/dct_mmx.asm b/vp8/encoder/x86/dct_mmx.asm
new file mode 100644
index 000000000..e13423796
--- /dev/null
+++ b/vp8/encoder/x86/dct_mmx.asm
@@ -0,0 +1,846 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+section .text
+ global sym(vp8_short_fdct4x4_mmx)
+ global sym(vp8_fast_fdct4x4_mmx)
+ global sym(vp8_fast_fdct8x4_wmt)
+
+
+%define DCTCONSTANTSBITS (16)
+%define DCTROUNDINGVALUE (1<< (DCTCONSTANTSBITS-1))
+%define x_c1 (60547) ; cos(pi /8) * (1<<16)
+%define x_c2 (46341) ; cos(pi*2/8) * (1<<16)
+%define x_c3 (25080) ; cos(pi*3/8) * (1<<16)
+
+
+%define _1STSTAGESHIFT 14
+%define _2NDSTAGESHIFT 16
+
+; using matrix multiply; the source and destination buffers have a pitch
+;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
+sym(vp8_short_fdct4x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;input
+ mov rdi, arg(1) ;output
+
+ movsxd rax, dword ptr arg(2) ;pitch
+ lea rdx, [dct_matrix GLOBAL]
+
+ movq mm0, [rsi ]
+ movq mm1, [rsi + rax]
+
+ movq mm2, [rsi + rax*2]
+ lea rsi, [rsi + rax*2]
+
+ movq mm3, [rsi + rax]
+
+ ; first column
+ movq mm4, mm0
+ movq mm7, [rdx]
+
+ pmaddwd mm4, mm7
+ movq mm5, mm1
+
+ pmaddwd mm5, mm7
+ movq mm6, mm4
+
+ punpckldq mm4, mm5
+ punpckhdq mm6, mm5
+
+ paddd mm4, mm6
+ movq mm5, mm2
+
+
+ pmaddwd mm5, mm7
+ movq mm6, mm3
+
+ pmaddwd mm6, mm7
+ movq mm7, mm5
+
+ punpckldq mm5, mm6
+ punpckhdq mm7, mm6
+
+ paddd mm5, mm7
+ movq mm6, [dct1st_stage_rounding_mmx GLOBAL]
+
+ paddd mm4, mm6
+ paddd mm5, mm6
+
+ psrad mm4, _1STSTAGESHIFT
+ psrad mm5, _1STSTAGESHIFT
+
+ packssdw mm4, mm5
+ movq [rdi], mm4
+
+ ;second column
+ movq mm4, mm0
+
+ pmaddwd mm4, [rdx+8]
+ movq mm5, mm1
+
+ pmaddwd mm5, [rdx+8]
+ movq mm6, mm4
+
+ punpckldq mm4, mm5
+ punpckhdq mm6, mm5
+
+ paddd mm4, mm6
+ movq mm5, mm2
+
+ pmaddwd mm5, [rdx+8]
+ movq mm6, mm3
+
+ pmaddwd mm6, [rdx+8]
+ movq mm7, mm5
+
+ punpckldq mm5, mm6
+ punpckhdq mm7, mm6
+
+ paddd mm5, mm7
+ movq mm6, [dct1st_stage_rounding_mmx GLOBAL]
+
+ paddd mm4, mm6
+ paddd mm5, mm6
+
+ psrad mm4, _1STSTAGESHIFT
+ psrad mm5, _1STSTAGESHIFT
+
+ packssdw mm4, mm5
+ movq [rdi+8], mm4
+
+
+ ;third column
+ movq mm4, mm0
+
+ pmaddwd mm4, [rdx+16]
+ movq mm5, mm1
+
+ pmaddwd mm5, [rdx+16]
+ movq mm6, mm4
+
+ punpckldq mm4, mm5
+ punpckhdq mm6, mm5
+
+ paddd mm4, mm6
+ movq mm5, mm2
+
+ pmaddwd mm5, [rdx+16]
+ movq mm6, mm3
+
+ pmaddwd mm6, [rdx+16]
+ movq mm7, mm5
+
+ punpckldq mm5, mm6
+ punpckhdq mm7, mm6
+
+ paddd mm5, mm7
+ movq mm6, [dct1st_stage_rounding_mmx GLOBAL]
+
+ paddd mm4, mm6
+ paddd mm5, mm6
+
+ psrad mm4, _1STSTAGESHIFT
+ psrad mm5, _1STSTAGESHIFT
+
+ packssdw mm4, mm5
+ movq [rdi+16], mm4
+
+        ;fourth column (this is the last column, so we do not have to save the source any more)
+
+ pmaddwd mm0, [rdx+24]
+
+ pmaddwd mm1, [rdx+24]
+ movq mm6, mm0
+
+ punpckldq mm0, mm1
+ punpckhdq mm6, mm1
+
+ paddd mm0, mm6
+
+ pmaddwd mm2, [rdx+24]
+
+ pmaddwd mm3, [rdx+24]
+ movq mm7, mm2
+
+ punpckldq mm2, mm3
+ punpckhdq mm7, mm3
+
+ paddd mm2, mm7
+ movq mm6, [dct1st_stage_rounding_mmx GLOBAL]
+
+ paddd mm0, mm6
+ paddd mm2, mm6
+
+ psrad mm0, _1STSTAGESHIFT
+ psrad mm2, _1STSTAGESHIFT
+
+ packssdw mm0, mm2
+
+ movq mm3, mm0
+
+ ; done with one pass
+ ; now start second pass
+ movq mm0, [rdi ]
+ movq mm1, [rdi+ 8]
+ movq mm2, [rdi+ 16]
+
+ movq mm4, mm0
+
+ pmaddwd mm4, [rdx]
+ movq mm5, mm1
+
+ pmaddwd mm5, [rdx]
+ movq mm6, mm4
+
+ punpckldq mm4, mm5
+ punpckhdq mm6, mm5
+
+ paddd mm4, mm6
+ movq mm5, mm2
+
+ pmaddwd mm5, [rdx]
+ movq mm6, mm3
+
+ pmaddwd mm6, [rdx]
+ movq mm7, mm5
+
+ punpckldq mm5, mm6
+ punpckhdq mm7, mm6
+
+ paddd mm5, mm7
+ movq mm6, [dct2nd_stage_rounding_mmx GLOBAL]
+
+ paddd mm4, mm6
+ paddd mm5, mm6
+
+ psrad mm4, _2NDSTAGESHIFT
+ psrad mm5, _2NDSTAGESHIFT
+
+ packssdw mm4, mm5
+ movq [rdi], mm4
+
+ ;second column
+ movq mm4, mm0
+
+ pmaddwd mm4, [rdx+8]
+ movq mm5, mm1
+
+ pmaddwd mm5, [rdx+8]
+ movq mm6, mm4
+
+ punpckldq mm4, mm5
+ punpckhdq mm6, mm5
+
+ paddd mm4, mm6
+ movq mm5, mm2
+
+ pmaddwd mm5, [rdx+8]
+ movq mm6, mm3
+
+ pmaddwd mm6, [rdx+8]
+ movq mm7, mm5
+
+ punpckldq mm5, mm6
+ punpckhdq mm7, mm6
+
+ paddd mm5, mm7
+ movq mm6, [dct2nd_stage_rounding_mmx GLOBAL]
+
+ paddd mm4, mm6
+ paddd mm5, mm6
+
+ psrad mm4, _2NDSTAGESHIFT
+ psrad mm5, _2NDSTAGESHIFT
+
+ packssdw mm4, mm5
+ movq [rdi+8], mm4
+
+
+ ;third column
+ movq mm4, mm0
+
+ pmaddwd mm4, [rdx+16]
+ movq mm5, mm1
+
+ pmaddwd mm5, [rdx+16]
+ movq mm6, mm4
+
+ punpckldq mm4, mm5
+ punpckhdq mm6, mm5
+
+ paddd mm4, mm6
+ movq mm5, mm2
+
+ pmaddwd mm5, [rdx+16]
+ movq mm6, mm3
+
+ pmaddwd mm6, [rdx+16]
+ movq mm7, mm5
+
+ punpckldq mm5, mm6
+ punpckhdq mm7, mm6
+
+ paddd mm5, mm7
+ movq mm6, [dct2nd_stage_rounding_mmx GLOBAL]
+
+ paddd mm4, mm6
+ paddd mm5, mm6
+
+ psrad mm4, _2NDSTAGESHIFT
+ psrad mm5, _2NDSTAGESHIFT
+
+ packssdw mm4, mm5
+ movq [rdi+16], mm4
+
+ ;fourth column
+ movq mm4, mm0
+
+ pmaddwd mm4, [rdx+24]
+ movq mm5, mm1
+
+ pmaddwd mm5, [rdx+24]
+ movq mm6, mm4
+
+ punpckldq mm4, mm5
+ punpckhdq mm6, mm5
+
+ paddd mm4, mm6
+ movq mm5, mm2
+
+ pmaddwd mm5, [rdx+24]
+ movq mm6, mm3
+
+ pmaddwd mm6, [rdx+24]
+ movq mm7, mm5
+
+ punpckldq mm5, mm6
+ punpckhdq mm7, mm6
+
+ paddd mm5, mm7
+ movq mm6, [dct2nd_stage_rounding_mmx GLOBAL]
+
+ paddd mm4, mm6
+ paddd mm5, mm6
+
+ psrad mm4, _2NDSTAGESHIFT
+ psrad mm5, _2NDSTAGESHIFT
+
+ packssdw mm4, mm5
+ movq [rdi+24], mm4
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_fast_fdct4x4_mmx(short *input, short *output, int pitch)
+sym(vp8_fast_fdct4x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+ mov rsi, arg(0) ;input
+ mov rdi, arg(1) ;output
+
+ lea rdx, [dct_const_mmx GLOBAL]
+ movsxd rax, dword ptr arg(2) ;pitch
+
+ lea rcx, [rsi + rax*2]
+ ; read the input data
+ movq mm0, [rsi]
+ movq mm1, [rsi + rax ]
+
+ movq mm2, [rcx]
+ movq mm3, [rcx + rax]
+ ; get the constants
+        ; shift left by 1 for precision
+ paddw mm0, mm0
+ paddw mm1, mm1
+
+ psllw mm2, 1
+ psllw mm3, 1
+
+ ; transpose for the second stage
+ movq mm4, mm0 ; 00 01 02 03
+        movq        mm5, mm2            ; 20 21 22 23
+
+ punpcklwd mm0, mm1 ; 00 10 01 11
+ punpckhwd mm4, mm1 ; 02 12 03 13
+
+ punpcklwd mm2, mm3 ; 20 30 21 31
+ punpckhwd mm5, mm3 ; 22 32 23 33
+
+
+ movq mm1, mm0 ; 00 10 01 11
+ punpckldq mm0, mm2 ; 00 10 20 30
+
+ punpckhdq mm1, mm2 ; 01 11 21 31
+
+ movq mm2, mm4 ; 02 12 03 13
+ punpckldq mm2, mm5 ; 02 12 22 32
+
+ punpckhdq mm4, mm5 ; 03 13 23 33
+ movq mm3, mm4
+
+
+ ; first stage
+ movq mm5, mm0
+ movq mm4, mm1
+
+ paddw mm0, mm3 ; a = 0 + 3
+ paddw mm1, mm2 ; b = 1 + 2
+
+ psubw mm4, mm2 ; c = 1 - 2
+ psubw mm5, mm3 ; d = 0 - 3
+
+
+ ; output 0 and 2
+ movq mm6, [rdx + 16] ; c2
+ movq mm2, mm0 ; a
+
+ paddw mm0, mm1 ; a + b
+ psubw mm2, mm1 ; a - b
+
+ movq mm1, mm0 ; a + b
+ pmulhw mm0, mm6 ; 00 01 02 03
+
+ paddw mm0, mm1 ; output 00 01 02 03
+ pmulhw mm6, mm2 ; 20 21 22 23
+
+ paddw mm2, mm6 ; output 20 21 22 23
+
+ ; output 1 and 3
+ movq mm6, [rdx + 8] ; c1
+ movq mm7, [rdx + 24] ; c3
+
+ movq mm1, mm4 ; c
+ movq mm3, mm5 ; d
+
+ pmulhw mm1, mm7 ; c * c3
+ pmulhw mm3, mm6 ; d * c1
+
+ paddw mm3, mm5 ; d * c1 rounded
+ paddw mm1, mm3 ; output 10 11 12 13
+
+ movq mm3, mm4 ; c
+ pmulhw mm5, mm7 ; d * c3
+
+ pmulhw mm4, mm6 ; c * c1
+ paddw mm3, mm4 ; round c* c1
+
+ psubw mm5, mm3 ; output 30 31 32 33
+ movq mm3, mm5
+
+
+ ; done with vertical
+ ; transpose for the second stage
+ movq mm4, mm0 ; 00 01 02 03
+        movq        mm5, mm2            ; 20 21 22 23
+
+ punpcklwd mm0, mm1 ; 00 10 01 11
+ punpckhwd mm4, mm1 ; 02 12 03 13
+
+ punpcklwd mm2, mm3 ; 20 30 21 31
+ punpckhwd mm5, mm3 ; 22 32 23 33
+
+
+ movq mm1, mm0 ; 00 10 01 11
+ punpckldq mm0, mm2 ; 00 10 20 30
+
+ punpckhdq mm1, mm2 ; 01 11 21 31
+
+ movq mm2, mm4 ; 02 12 03 13
+ punpckldq mm2, mm5 ; 02 12 22 32
+
+ punpckhdq mm4, mm5 ; 03 13 23 33
+ movq mm3, mm4
+
+
+ ; first stage
+ movq mm5, mm0
+ movq mm4, mm1
+
+ paddw mm0, mm3 ; a = 0 + 3
+ paddw mm1, mm2 ; b = 1 + 2
+
+ psubw mm4, mm2 ; c = 1 - 2
+ psubw mm5, mm3 ; d = 0 - 3
+
+
+ ; output 0 and 2
+ movq mm6, [rdx + 16] ; c2
+ movq mm2, mm0 ; a
+ paddw mm0, mm1 ; a + b
+
+ psubw mm2, mm1 ; a - b
+
+ movq mm1, mm0 ; a + b
+ pmulhw mm0, mm6 ; 00 01 02 03
+
+ paddw mm0, mm1 ; output 00 01 02 03
+ pmulhw mm6, mm2 ; 20 21 22 23
+
+ paddw mm2, mm6 ; output 20 21 22 23
+
+
+ ; output 1 and 3
+ movq mm6, [rdx + 8] ; c1
+ movq mm7, [rdx + 24] ; c3
+
+ movq mm1, mm4 ; c
+ movq mm3, mm5 ; d
+
+ pmulhw mm1, mm7 ; c * c3
+ pmulhw mm3, mm6 ; d * c1
+
+ paddw mm3, mm5 ; d * c1 rounded
+ paddw mm1, mm3 ; output 10 11 12 13
+
+ movq mm3, mm4 ; c
+ pmulhw mm5, mm7 ; d * c3
+
+ pmulhw mm4, mm6 ; c * c1
+ paddw mm3, mm4 ; round c* c1
+
+ psubw mm5, mm3 ; output 30 31 32 33
+ movq mm3, mm5
+ ; done with vertical
+
+ pcmpeqw mm4, mm4
+ pcmpeqw mm5, mm5
+ psrlw mm4, 15
+ psrlw mm5, 15
+
+ paddw mm0, mm4
+ paddw mm1, mm5
+ paddw mm2, mm4
+ paddw mm3, mm5
+
+ psraw mm0, 1
+ psraw mm1, 1
+ psraw mm2, 1
+ psraw mm3, 1
+
+ movq [rdi ], mm0
+ movq [rdi+ 8], mm1
+ movq [rdi+16], mm2
+ movq [rdi+24], mm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_fast_fdct8x4_wmt(short *input, short *output, int pitch)
+sym(vp8_fast_fdct8x4_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+ mov rsi, arg(0) ;input
+ mov rdi, arg(1) ;output
+
+ lea rdx, [dct_const_xmm GLOBAL]
+ movsxd rax, dword ptr arg(2) ;pitch
+
+ lea rcx, [rsi + rax*2]
+ ; read the input data
+ movdqa xmm0, [rsi]
+ movdqa xmm2, [rsi + rax]
+
+ movdqa xmm4, [rcx]
+ movdqa xmm3, [rcx + rax]
+ ; get the constants
+        ; shift left by 1 for precision
+ psllw xmm0, 1
+ psllw xmm2, 1
+
+ psllw xmm4, 1
+ psllw xmm3, 1
+
+ ; transpose for the second stage
+ movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07
+ movdqa xmm5, xmm4 ; 20 21 22 23 24 25 26 27
+
+ punpcklwd xmm0, xmm2 ; 00 10 01 11 02 12 03 13
+ punpckhwd xmm1, xmm2 ; 04 14 05 15 06 16 07 17
+
+ punpcklwd xmm4, xmm3 ; 20 30 21 31 22 32 23 33
+ punpckhwd xmm5, xmm3 ; 24 34 25 35 26 36 27 37
+
+ movdqa xmm2, xmm0 ; 00 10 01 11 02 12 03 13
+ punpckldq xmm0, xmm4 ; 00 10 20 30 01 11 21 31
+
+ punpckhdq xmm2, xmm4 ; 02 12 22 32 03 13 23 33
+
+
+ movdqa xmm4, xmm1 ; 04 14 05 15 06 16 07 17
+ punpckldq xmm4, xmm5 ; 04 14 24 34 05 15 25 35
+
+ punpckhdq xmm1, xmm5 ; 06 16 26 36 07 17 27 37
+ movdqa xmm3, xmm2 ; 02 12 22 32 03 13 23 33
+
+ punpckhqdq xmm3, xmm1 ; 03 13 23 33 07 17 27 37
+ punpcklqdq xmm2, xmm1 ; 02 12 22 32 06 16 26 36
+
+ movdqa xmm1, xmm0 ; 00 10 20 30 01 11 21 31
+ punpcklqdq xmm0, xmm4 ; 00 10 20 30 04 14 24 34
+
+        punpckhqdq  xmm1, xmm4          ; 01 11 21 31 05 15 25 35
+
+ ; xmm0 0
+ ; xmm1 1
+ ; xmm2 2
+ ; xmm3 3
+
+ ; first stage
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm1
+
+ paddw xmm0, xmm3 ; a = 0 + 3
+ paddw xmm1, xmm2 ; b = 1 + 2
+
+ psubw xmm4, xmm2 ; c = 1 - 2
+ psubw xmm5, xmm3 ; d = 0 - 3
+
+
+ ; output 0 and 2
+ movdqa xmm6, [rdx + 32] ; c2
+ movdqa xmm2, xmm0 ; a
+
+ paddw xmm0, xmm1 ; a + b
+ psubw xmm2, xmm1 ; a - b
+
+ movdqa xmm1, xmm0 ; a + b
+ pmulhw xmm0, xmm6 ; 00 01 02 03
+
+ paddw xmm0, xmm1 ; output 00 01 02 03
+ pmulhw xmm6, xmm2 ; 20 21 22 23
+
+ paddw xmm2, xmm6 ; output 20 21 22 23
+
+ ; output 1 and 3
+ movdqa xmm6, [rdx + 16] ; c1
+ movdqa xmm7, [rdx + 48] ; c3
+
+ movdqa xmm1, xmm4 ; c
+ movdqa xmm3, xmm5 ; d
+
+ pmulhw xmm1, xmm7 ; c * c3
+ pmulhw xmm3, xmm6 ; d * c1
+
+ paddw xmm3, xmm5 ; d * c1 rounded
+ paddw xmm1, xmm3 ; output 10 11 12 13
+
+ movdqa xmm3, xmm4 ; c
+ pmulhw xmm5, xmm7 ; d * c3
+
+ pmulhw xmm4, xmm6 ; c * c1
+ paddw xmm3, xmm4 ; round c* c1
+
+ psubw xmm5, xmm3 ; output 30 31 32 33
+ movdqa xmm3, xmm5
+
+
+ ; done with vertical
+ ; transpose for the second stage
+ movdqa xmm4, xmm2 ; 02 12 22 32 06 16 26 36
+ movdqa xmm2, xmm1 ; 01 11 21 31 05 15 25 35
+
+ movdqa xmm1, xmm0 ; 00 10 20 30 04 14 24 34
+ movdqa xmm5, xmm4 ; 02 12 22 32 06 16 26 36
+
+ punpcklwd xmm0, xmm2 ; 00 01 10 11 20 21 30 31
+ punpckhwd xmm1, xmm2 ; 04 05 14 15 24 25 34 35
+
+ punpcklwd xmm4, xmm3 ; 02 03 12 13 22 23 32 33
+ punpckhwd xmm5, xmm3 ; 06 07 16 17 26 27 36 37
+
+ movdqa xmm2, xmm0 ; 00 01 10 11 20 21 30 31
+ punpckldq xmm0, xmm4 ; 00 01 02 03 10 11 12 13
+
+ punpckhdq xmm2, xmm4 ; 20 21 22 23 30 31 32 33
+
+
+ movdqa xmm4, xmm1 ; 04 05 14 15 24 25 34 35
+ punpckldq xmm4, xmm5 ; 04 05 06 07 14 15 16 17
+
+ punpckhdq xmm1, xmm5 ; 24 25 26 27 34 35 36 37
+ movdqa xmm3, xmm2 ; 20 21 22 23 30 31 32 33
+
+ punpckhqdq xmm3, xmm1 ; 30 31 32 33 34 35 36 37
+ punpcklqdq xmm2, xmm1 ; 20 21 22 23 24 25 26 27
+
+ movdqa xmm1, xmm0 ; 00 01 02 03 10 11 12 13
+ punpcklqdq xmm0, xmm4 ; 00 01 02 03 04 05 06 07
+
+ punpckhqdq xmm1, xmm4 ; 10 11 12 13 14 15 16 17
+
+ ; first stage
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm1
+
+ paddw xmm0, xmm3 ; a = 0 + 3
+ paddw xmm1, xmm2 ; b = 1 + 2
+
+ psubw xmm4, xmm2 ; c = 1 - 2
+ psubw xmm5, xmm3 ; d = 0 - 3
+
+
+ ; output 0 and 2
+ movdqa xmm6, [rdx + 32] ; c2
+ movdqa xmm2, xmm0 ; a
+
+ paddw xmm0, xmm1 ; a + b
+ psubw xmm2, xmm1 ; a - b
+
+ movdqa xmm1, xmm0 ; a + b
+ pmulhw xmm0, xmm6 ; 00 01 02 03
+
+ paddw xmm0, xmm1 ; output 00 01 02 03
+ pmulhw xmm6, xmm2 ; 20 21 22 23
+
+ paddw xmm2, xmm6 ; output 20 21 22 23
+
+ ; output 1 and 3
+ movdqa xmm6, [rdx + 16] ; c1
+ movdqa xmm7, [rdx + 48] ; c3
+
+ movdqa xmm1, xmm4 ; c
+ movdqa xmm3, xmm5 ; d
+
+ pmulhw xmm1, xmm7 ; c * c3
+ pmulhw xmm3, xmm6 ; d * c1
+
+ paddw xmm3, xmm5 ; d * c1 rounded
+ paddw xmm1, xmm3 ; output 10 11 12 13
+
+ movdqa xmm3, xmm4 ; c
+ pmulhw xmm5, xmm7 ; d * c3
+
+ pmulhw xmm4, xmm6 ; c * c1
+ paddw xmm3, xmm4 ; round c* c1
+
+ psubw xmm5, xmm3 ; output 30 31 32 33
+ movdqa xmm3, xmm5
+ ; done with vertical
+
+
+ pcmpeqw xmm4, xmm4
+ pcmpeqw xmm5, xmm5;
+ psrlw xmm4, 15
+ psrlw xmm5, 15
+
+ paddw xmm0, xmm4
+ paddw xmm1, xmm5
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+
+ psraw xmm0, 1
+ psraw xmm1, 1
+ psraw xmm2, 1
+ psraw xmm3, 1
+
+ movq QWORD PTR[rdi ], xmm0
+ movq QWORD PTR[rdi+ 8], xmm1
+ movq QWORD PTR[rdi+16], xmm2
+ movq QWORD PTR[rdi+24], xmm3
+
+ psrldq xmm0, 8
+ psrldq xmm1, 8
+ psrldq xmm2, 8
+ psrldq xmm3, 8
+
+ movq QWORD PTR[rdi+32], xmm0
+ movq QWORD PTR[rdi+40], xmm1
+ movq QWORD PTR[rdi+48], xmm2
+ movq QWORD PTR[rdi+56], xmm3
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+;static const unsigned int dct1st_stage_rounding_mmx[2] =
+align 16
+dct1st_stage_rounding_mmx:
+ times 2 dd 8192
+
+
+;static const unsigned int dct2nd_stage_rounding_mmx[2] =
+align 16
+dct2nd_stage_rounding_mmx:
+ times 2 dd 32768
+
+
+;static const short dct_matrix[4][4]=
+align 16
+dct_matrix:
+ times 4 dw 23170
+
+ dw 30274
+ dw 12540
+ dw -12540
+ dw -30274
+
+ dw 23170
+ times 2 dw -23170
+ dw 23170
+
+ dw 12540
+ dw -30274
+ dw 30274
+ dw -12540
+
+
+;static const unsigned short dct_const_mmx[4 * 4]=
+align 16
+dct_const_mmx:
+ times 4 dw 0
+ times 4 dw 60547
+ times 4 dw 46341
+ times 4 dw 25080
+
+
+;static const unsigned short dct_const_xmm[8 * 4]=
+align 16
+dct_const_xmm:
+ times 8 dw 0
+ times 8 dw 60547
+ times 8 dw 46341
+ times 8 dw 25080
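
The fast fdct paths above keep their constants scaled by 2^16: 60547, 46341 and 25080 are round(cos(pi/8) * 65536), round(cos(pi/4) * 65536) and round(cos(3*pi/8) * 65536). pmulhw is a signed multiply returning the high 16 bits, so a constant at or above 32768 wraps to c - 65536; the paddw of the original operand that follows each such pmulhw restores the intended product. The final pcmpeqw/psrlw/paddw/psraw block then adds 1 and shifts right by one, rounding away the extra bit of precision gained by the initial doubling. A small C check of the multiply identity (my reading of the code, not libvpx source):

#include <stdio.h>

/* What pmulhw does per 16-bit lane: signed multiply, keep the high half
 * (assuming the compiler's right shift of a negative int is arithmetic,
 * which is what the hardware instruction effectively does). */
static short mulhi(short a, short b)
{
    return (short)(((int)a * (int)b) >> 16);
}

int main(void)
{
    const int c2 = 46341;                    /* round(cos(pi/4) * 65536)          */
    short x = 2 * 100;                       /* inputs are pre-doubled in the asm */
    short wrapped = (short)(c2 - 65536);     /* the 16-bit word the table holds   */

    /* pmulhw with the wrapped constant, then paddw of the original value,
     * approximates (x * c2) >> 16. */
    short approx = (short)(mulhi(x, wrapped) + x);

    printf("approx=%d exact=%d\n", approx, (x * c2) >> 16);
    return 0;
}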
diff --git a/vp8/encoder/x86/dct_sse2.asm b/vp8/encoder/x86/dct_sse2.asm
new file mode 100644
index 000000000..3e5e9a70c
--- /dev/null
+++ b/vp8/encoder/x86/dct_sse2.asm
@@ -0,0 +1,260 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+global sym(vp8_short_fdct4x4_wmt)
+
+%define DCTCONSTANTSBITS (16)
+%define DCTROUNDINGVALUE (1<< (DCTCONSTANTSBITS-1))
+%define x_c1 (60547) ; cos(pi /8) * (1<<16)
+%define x_c2 (46341) ; cos(pi*2/8) * (1<<16)
+%define x_c3 (25080) ; cos(pi*3/8) * (1<<16)
+
+%define _1STSTAGESHIFT 14
+%define _2NDSTAGESHIFT 16
+
+
+;; using matrix multiply
+;void vp8_short_fdct4x4_wmt(short *input, short *output)
+sym(vp8_short_fdct4x4_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ GET_GOT rbx
+ ; end prolog
+
+ mov rax, arg(0) ;input
+ mov rcx, arg(1) ;output
+
+ lea rdx, [dct_matrix_sse2 GLOBAL]
+
+ movdqu xmm0, [rax ]
+ movdqu xmm1, [rax+16]
+
+ ; first column
+ movdqa xmm2, xmm0
+ movdqa xmm7, [rdx]
+
+ pmaddwd xmm2, xmm7
+ movdqa xmm3, xmm1
+
+ pmaddwd xmm3, xmm7
+ movdqa xmm4, xmm2
+
+ punpckldq xmm2, xmm3
+ punpckhdq xmm4, xmm3
+
+ movdqa xmm3, xmm2
+ punpckldq xmm2, xmm4
+
+ punpckhdq xmm3, xmm4
+ paddd xmm2, xmm3
+
+
+ paddd xmm2, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
+ psrad xmm2, _1STSTAGESHIFT
+ ;second column
+ movdqa xmm3, xmm0
+ pmaddwd xmm3, [rdx+16]
+
+ movdqa xmm4, xmm1
+ pmaddwd xmm4, [rdx+16]
+
+ movdqa xmm5, xmm3
+ punpckldq xmm3, xmm4
+
+ punpckhdq xmm5, xmm4
+ movdqa xmm4, xmm3
+
+ punpckldq xmm3, xmm5
+ punpckhdq xmm4, xmm5
+
+ paddd xmm3, xmm4
+ paddd xmm3, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
+
+
+ psrad xmm3, _1STSTAGESHIFT
+ packssdw xmm2, xmm3
+
+ ;third column
+ movdqa xmm3, xmm0
+ pmaddwd xmm3, [rdx+32]
+
+ movdqa xmm4, xmm1
+ pmaddwd xmm4, [rdx+32]
+
+ movdqa xmm5, xmm3
+ punpckldq xmm3, xmm4
+
+ punpckhdq xmm5, xmm4
+ movdqa xmm4, xmm3
+
+ punpckldq xmm3, xmm5
+ punpckhdq xmm4, xmm5
+
+ paddd xmm3, xmm4
+ paddd xmm3, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
+
+ psrad xmm3, _1STSTAGESHIFT
+
+    ;fourth column (this is the last column, so we do not have to save the source any more)
+ pmaddwd xmm0, [rdx+48]
+ pmaddwd xmm1, [rdx+48]
+
+ movdqa xmm4, xmm0
+ punpckldq xmm0, xmm1
+
+ punpckhdq xmm4, xmm1
+ movdqa xmm1, xmm0
+
+ punpckldq xmm0, xmm4
+ punpckhdq xmm1, xmm4
+
+ paddd xmm0, xmm1
+ paddd xmm0, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
+
+
+ psrad xmm0, _1STSTAGESHIFT
+ packssdw xmm3, xmm0
+ ; done with one pass
+ ; now start second pass
+ movdqa xmm0, xmm2
+ movdqa xmm1, xmm3
+
+ pmaddwd xmm2, xmm7
+ pmaddwd xmm3, xmm7
+
+ movdqa xmm4, xmm2
+ punpckldq xmm2, xmm3
+
+ punpckhdq xmm4, xmm3
+ movdqa xmm3, xmm2
+
+ punpckldq xmm2, xmm4
+ punpckhdq xmm3, xmm4
+
+ paddd xmm2, xmm3
+ paddd xmm2, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
+
+ psrad xmm2, _2NDSTAGESHIFT
+
+ ;second column
+ movdqa xmm3, xmm0
+ pmaddwd xmm3, [rdx+16]
+
+ movdqa xmm4, xmm1
+ pmaddwd xmm4, [rdx+16]
+
+ movdqa xmm5, xmm3
+ punpckldq xmm3, xmm4
+
+ punpckhdq xmm5, xmm4
+ movdqa xmm4, xmm3
+
+ punpckldq xmm3, xmm5
+ punpckhdq xmm4, xmm5
+
+ paddd xmm3, xmm4
+ paddd xmm3, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
+
+ psrad xmm3, _2NDSTAGESHIFT
+ packssdw xmm2, xmm3
+
+ movdqu [rcx], xmm2
+ ;third column
+ movdqa xmm3, xmm0
+ pmaddwd xmm3, [rdx+32]
+
+ movdqa xmm4, xmm1
+ pmaddwd xmm4, [rdx+32]
+
+ movdqa xmm5, xmm3
+ punpckldq xmm3, xmm4
+
+ punpckhdq xmm5, xmm4
+ movdqa xmm4, xmm3
+
+ punpckldq xmm3, xmm5
+ punpckhdq xmm4, xmm5
+
+ paddd xmm3, xmm4
+ paddd xmm3, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
+
+ psrad xmm3, _2NDSTAGESHIFT
+ ;fourth column
+ pmaddwd xmm0, [rdx+48]
+ pmaddwd xmm1, [rdx+48]
+
+ movdqa xmm4, xmm0
+ punpckldq xmm0, xmm1
+
+ punpckhdq xmm4, xmm1
+ movdqa xmm1, xmm0
+
+ punpckldq xmm0, xmm4
+ punpckhdq xmm1, xmm4
+
+ paddd xmm0, xmm1
+ paddd xmm0, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
+
+ psrad xmm0, _2NDSTAGESHIFT
+ packssdw xmm3, xmm0
+
+ movdqu [rcx+16], xmm3
+
+ mov rsp, rbp
+ ; begin epilog
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+;static unsigned int dct1st_stage_rounding_sse2[4] =
+align 16
+dct1st_stage_rounding_sse2:
+ times 4 dd 8192
+
+
+;static unsigned int dct2nd_stage_rounding_sse2[4] =
+align 16
+dct2nd_stage_rounding_sse2:
+ times 4 dd 32768
+
+;static short dct_matrix_sse2[4][8]=
+align 16
+dct_matrix_sse2:
+ times 8 dw 23170
+
+ dw 30274
+ dw 12540
+ dw -12540
+ dw -30274
+ dw 30274
+ dw 12540
+ dw -12540
+ dw -30274
+
+ dw 23170
+ times 2 dw -23170
+ times 2 dw 23170
+ times 2 dw -23170
+ dw 23170
+
+ dw 12540
+ dw -30274
+ dw 30274
+ dw -12540
+ dw 12540
+ dw -30274
+ dw 30274
+ dw -12540
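
vp8_short_fdct4x4_mmx and vp8_short_fdct4x4_wmt instead take the matrix-multiply route: each output coefficient is a pmaddwd dot product of an input row with one of the basis vectors in dct_matrix / dct_matrix_sse2, rounded with 8192 and shifted by 14 in the first pass, then rounded with 32768 and shifted by 16 in the second (the first pass stores its results in transposed order, so the second pass can reuse the same basis vectors). A scalar paraphrase of one pass, offered as a reference sketch rather than as the libvpx C code:

/* Basis vectors as laid out in the dct_matrix table above
 * (2^15-scaled DCT-II rows). */
static const short dct_basis[4][4] =
{
    { 23170,  23170,  23170,  23170 },
    { 30274,  12540, -12540, -30274 },
    { 23170, -23170, -23170,  23170 },
    { 12540, -30274,  30274, -12540 },
};

/* One pass: every output is a dot product plus rounding and a shift.
 * Pass 1 uses round_val = 8192, shift = 14; pass 2 uses 32768 and 16.
 * Between passes the intermediate must be transposed, which the assembly
 * does implicitly by the way it stores the pass-1 results. */
static void fdct4x4_pass(const short in[4][4], short out[4][4],
                         int round_val, int shift)
{
    int r, j, k;

    for (r = 0; r < 4; r++)
        for (j = 0; j < 4; j++)
        {
            int sum = round_val;
            for (k = 0; k < 4; k++)
                sum += in[r][k] * dct_basis[j][k];
            out[r][j] = (short)(sum >> shift);
        }
}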
diff --git a/vp8/encoder/x86/dct_x86.h b/vp8/encoder/x86/dct_x86.h
new file mode 100644
index 000000000..bc80e64ef
--- /dev/null
+++ b/vp8/encoder/x86/dct_x86.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef DCT_X86_H
+#define DCT_X86_H
+
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code
+ */
+#if HAVE_MMX
+extern prototype_fdct(vp8_short_fdct4x4_mmx);
+extern prototype_fdct(vp8_short_fdct8x4_mmx);
+extern prototype_fdct(vp8_fast_fdct4x4_mmx);
+extern prototype_fdct(vp8_fast_fdct8x4_mmx);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_fdct_short4x4
+#define vp8_fdct_short4x4 vp8_short_fdct4x4_mmx
+
+#undef vp8_fdct_short8x4
+#define vp8_fdct_short8x4 vp8_short_fdct8x4_mmx
+
+#undef vp8_fdct_fast4x4
+#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_mmx
+
+#undef vp8_fdct_fast8x4
+#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_mmx
+
+#endif
+#endif
+
+
+#if HAVE_SSE2
+extern prototype_fdct(vp8_short_fdct4x4_wmt);
+extern prototype_fdct(vp8_short_fdct8x4_wmt);
+extern prototype_fdct(vp8_fast_fdct8x4_wmt);
+
+extern prototype_fdct(vp8_short_walsh4x4_sse2);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#if 0
+/* short SSE2 DCT currently disabled, does not match the MMX version */
+#undef vp8_fdct_short4x4
+#define vp8_fdct_short4x4 vp8_short_fdct4x4_wmt
+
+#undef vp8_fdct_short8x4
+#define vp8_fdct_short8x4 vp8_short_fdct8x4_wmt
+#endif
+
+#undef vp8_fdct_fast8x4
+#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_wmt
+
+#undef vp8_fdct_walsh_short4x4
+#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_sse2
+
+#endif
+
+
+#endif
+
+#endif
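
These headers are the static half of the dispatch story: when CONFIG_RUNTIME_CPU_DETECT is off, the #undef/#define pairs re-point the generic RTCD hooks at the best compiled-in symbol, and the function-pointer tables in csystemdependent.c are bypassed entirely. The shape of the pattern, using illustrative names rather than the real vp8 hook macros:

/* Illustrative only: MY_HAVE_SIMD, my_fdct4x4 and friends are not vp8 names. */
#define MY_HAVE_SIMD 1
#define MY_RUNTIME_CPU_DETECT 0

void my_fdct4x4_c(short *input, short *output, int pitch);
void my_fdct4x4_simd(short *input, short *output, int pitch);

/* Generic default used by portable builds. */
#define my_fdct4x4 my_fdct4x4_c

#if MY_HAVE_SIMD && !MY_RUNTIME_CPU_DETECT
/* Static builds simply rebind the hook; runtime-detect builds keep the
 * default and switch function pointers at init time instead. */
#undef  my_fdct4x4
#define my_fdct4x4 my_fdct4x4_simd
#endif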
diff --git a/vp8/encoder/x86/encodemb_x86.h b/vp8/encoder/x86/encodemb_x86.h
new file mode 100644
index 000000000..9397a6cca
--- /dev/null
+++ b/vp8/encoder/x86/encodemb_x86.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef ENCODEMB_X86_H
+#define ENCODEMB_X86_H
+
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code
+ */
+#if HAVE_MMX
+extern prototype_berr(vp8_block_error_mmx);
+extern prototype_mberr(vp8_mbblock_error_mmx);
+extern prototype_mbuverr(vp8_mbuverror_mmx);
+extern prototype_subb(vp8_subtract_b_mmx);
+extern prototype_submby(vp8_subtract_mby_mmx);
+extern prototype_submbuv(vp8_subtract_mbuv_mmx);
+
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_encodemb_berr
+#define vp8_encodemb_berr vp8_block_error_mmx
+
+#undef vp8_encodemb_mberr
+#define vp8_encodemb_mberr vp8_mbblock_error_mmx
+
+#undef vp8_encodemb_mbuverr
+#define vp8_encodemb_mbuverr vp8_mbuverror_mmx
+
+#undef vp8_encodemb_subb
+#define vp8_encodemb_subb vp8_subtract_b_mmx
+
+#undef vp8_encodemb_submby
+#define vp8_encodemb_submby vp8_subtract_mby_mmx
+
+#undef vp8_encodemb_submbuv
+#define vp8_encodemb_submbuv vp8_subtract_mbuv_mmx
+
+#endif
+#endif
+
+
+#if HAVE_SSE2
+extern prototype_berr(vp8_block_error_xmm);
+extern prototype_mberr(vp8_mbblock_error_xmm);
+extern prototype_mbuverr(vp8_mbuverror_xmm);
+
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_encodemb_berr
+#define vp8_encodemb_berr vp8_block_error_xmm
+
+#undef vp8_encodemb_mberr
+#define vp8_encodemb_mberr vp8_mbblock_error_xmm
+
+#undef vp8_encodemb_mbuverr
+#define vp8_encodemb_mbuverr vp8_mbuverror_xmm
+
+#endif
+#endif
+
+
+#endif
diff --git a/vp8/encoder/x86/encodeopt.asm b/vp8/encoder/x86/encodeopt.asm
new file mode 100644
index 000000000..194047155
--- /dev/null
+++ b/vp8/encoder/x86/encodeopt.asm
@@ -0,0 +1,393 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;int vp8_block_error_xmm(short *coeff_ptr, short *dcoef_ptr)
+global sym(vp8_block_error_xmm)
+sym(vp8_block_error_xmm):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;coeff_ptr
+ pxor xmm7, xmm7
+
+ mov rdi, arg(1) ;dcoef_ptr
+ movdqa xmm3, [rsi]
+
+ movdqa xmm4, [rdi]
+ movdqa xmm5, [rsi+16]
+
+ movdqa xmm6, [rdi+16]
+ pxor xmm1, xmm1 ; from movd xmm1, dc; dc=0
+
+ movdqa xmm2, xmm7
+ psubw xmm5, xmm6
+
+ por xmm1, xmm2
+ pmaddwd xmm5, xmm5
+
+ pcmpeqw xmm1, xmm7
+ psubw xmm3, xmm4
+
+ pand xmm1, xmm3
+ pmaddwd xmm1, xmm1
+
+ paddd xmm1, xmm5
+ movdqa xmm0, xmm1
+
+ punpckldq xmm0, xmm7
+ punpckhdq xmm1, xmm7
+
+ paddd xmm0, xmm1
+ movdqa xmm1, xmm0
+
+ psrldq xmm0, 8
+ paddd xmm0, xmm1
+
+ movd rax, xmm0
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;int vp8_block_error_mmx(short *coeff_ptr, short *dcoef_ptr)
+global sym(vp8_block_error_mmx)
+sym(vp8_block_error_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;coeff_ptr
+ pxor mm7, mm7
+
+ mov rdi, arg(1) ;dcoef_ptr
+ movq mm3, [rsi]
+
+ movq mm4, [rdi]
+ movq mm5, [rsi+8]
+
+ movq mm6, [rdi+8]
+ pxor mm1, mm1 ; from movd mm1, dc ; dc =0
+
+ movq mm2, mm7
+ psubw mm5, mm6
+
+ por mm1, mm2
+ pmaddwd mm5, mm5
+
+ pcmpeqw mm1, mm7
+ psubw mm3, mm4
+
+ pand mm1, mm3
+ pmaddwd mm1, mm1
+
+ paddd mm1, mm5
+ movq mm3, [rsi+16]
+
+ movq mm4, [rdi+16]
+ movq mm5, [rsi+24]
+
+ movq mm6, [rdi+24]
+ psubw mm5, mm6
+
+ pmaddwd mm5, mm5
+ psubw mm3, mm4
+
+ pmaddwd mm3, mm3
+ paddd mm3, mm5
+
+ paddd mm1, mm3
+ movq mm0, mm1
+
+ psrlq mm1, 32
+ paddd mm0, mm1
+
+ movd rax, mm0
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+global sym(vp8_mbblock_error_mmx_impl)
+sym(vp8_mbblock_error_mmx_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;coeff_ptr
+ pxor mm7, mm7
+
+ mov rdi, arg(1) ;dcoef_ptr
+ pxor mm2, mm2
+
+ movd mm1, dword ptr arg(2) ;dc
+ por mm1, mm2
+
+ pcmpeqw mm1, mm7
+ mov rcx, 16
+
+mberror_loop_mmx:
+ movq mm3, [rsi]
+ movq mm4, [rdi]
+
+ movq mm5, [rsi+8]
+ movq mm6, [rdi+8]
+
+
+ psubw mm5, mm6
+ pmaddwd mm5, mm5
+
+ psubw mm3, mm4
+ pand mm3, mm1
+
+ pmaddwd mm3, mm3
+ paddd mm2, mm5
+
+ paddd mm2, mm3
+ movq mm3, [rsi+16]
+
+ movq mm4, [rdi+16]
+ movq mm5, [rsi+24]
+
+ movq mm6, [rdi+24]
+ psubw mm5, mm6
+
+ pmaddwd mm5, mm5
+ psubw mm3, mm4
+
+ pmaddwd mm3, mm3
+ paddd mm2, mm5
+
+ paddd mm2, mm3
+ add rsi, 32
+
+ add rdi, 32
+ sub rcx, 1
+
+ jnz mberror_loop_mmx
+
+ movq mm0, mm2
+ psrlq mm2, 32
+
+ paddd mm0, mm2
+ movd rax, mm0
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+global sym(vp8_mbblock_error_xmm_impl)
+sym(vp8_mbblock_error_xmm_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;coeff_ptr
+ pxor xmm7, xmm7
+
+ mov rdi, arg(1) ;dcoef_ptr
+ pxor xmm2, xmm2
+
+ movd xmm1, dword ptr arg(2) ;dc
+ por xmm1, xmm2
+
+ pcmpeqw xmm1, xmm7
+ mov rcx, 16
+
+mberror_loop:
+ movdqa xmm3, [rsi]
+ movdqa xmm4, [rdi]
+
+ movdqa xmm5, [rsi+16]
+ movdqa xmm6, [rdi+16]
+
+
+ psubw xmm5, xmm6
+ pmaddwd xmm5, xmm5
+
+ psubw xmm3, xmm4
+ pand xmm3, xmm1
+
+ pmaddwd xmm3, xmm3
+ add rsi, 32
+
+ add rdi, 32
+
+ sub rcx, 1
+ paddd xmm2, xmm5
+
+ paddd xmm2, xmm3
+ jnz mberror_loop
+
+ movdqa xmm0, xmm2
+ punpckldq xmm0, xmm7
+
+ punpckhdq xmm2, xmm7
+ paddd xmm0, xmm2
+
+ movdqa xmm1, xmm0
+ psrldq xmm0, 8
+
+ paddd xmm0, xmm1
+ movd rax, xmm0
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
+global sym(vp8_mbuverror_mmx_impl)
+sym(vp8_mbuverror_mmx_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;s_ptr
+ mov rdi, arg(1) ;d_ptr
+
+ mov rcx, 16
+ pxor mm7, mm7
+
+mbuverror_loop_mmx:
+
+ movq mm1, [rsi]
+ movq mm2, [rdi]
+
+ psubw mm1, mm2
+ pmaddwd mm1, mm1
+
+
+ movq mm3, [rsi+8]
+ movq mm4, [rdi+8]
+
+ psubw mm3, mm4
+ pmaddwd mm3, mm3
+
+
+ paddd mm7, mm1
+ paddd mm7, mm3
+
+
+ add rsi, 16
+ add rdi, 16
+
+ dec rcx
+ jnz mbuverror_loop_mmx
+
+ movq mm0, mm7
+ psrlq mm7, 32
+
+ paddd mm0, mm7
+ movd rax, mm0
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
+global sym(vp8_mbuverror_xmm_impl)
+sym(vp8_mbuverror_xmm_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;s_ptr
+ mov rdi, arg(1) ;d_ptr
+
+ mov rcx, 16
+ pxor xmm7, xmm7
+
+mbuverror_loop:
+
+ movdqa xmm1, [rsi]
+ movdqa xmm2, [rdi]
+
+ psubw xmm1, xmm2
+ pmaddwd xmm1, xmm1
+
+ paddd xmm7, xmm1
+
+ add rsi, 16
+ add rdi, 16
+
+ dec rcx
+ jnz mbuverror_loop
+
+ pxor xmm0, xmm0
+ movdqa xmm1, xmm7
+
+ movdqa xmm2, xmm1
+ punpckldq xmm1, xmm0
+
+ punpckhdq xmm2, xmm0
+ paddd xmm1, xmm2
+
+ movdqa xmm2, xmm1
+
+ psrldq xmm1, 8
+ paddd xmm1, xmm2
+
+ movd rax, xmm1
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
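
Every routine in this file computes a sum of squared differences between original and dequantized transform coefficients; in the *_impl variants the por/pcmpeqw sequence turns the dc argument into a word mask, so that a nonzero dc drops coefficient 0 of each 4x4 block from the sum. A plain-C paraphrase of that behaviour as I read the masking (a sketch, not the libvpx reference code):

/* coeff/dqcoeff hold ncoeffs 16-bit transform coefficients.  With dc != 0
 * the DC term (index 0) is excluded, mirroring the pand mask in the asm. */
static int block_error_ref(const short *coeff, const short *dqcoeff,
                           int ncoeffs, int dc)
{
    int i, err = 0;

    for (i = 0; i < ncoeffs; i++)
    {
        int d;

        if (i == 0 && dc)
            continue;

        d = coeff[i] - dqcoeff[i];
        err += d * d;
    }

    return err;
}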
diff --git a/vp8/encoder/x86/fwalsh_sse2.asm b/vp8/encoder/x86/fwalsh_sse2.asm
new file mode 100644
index 000000000..7d8620178
--- /dev/null
+++ b/vp8/encoder/x86/fwalsh_sse2.asm
@@ -0,0 +1,117 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch)
+global sym(vp8_short_walsh4x4_sse2)
+sym(vp8_short_walsh4x4_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0)
+ mov rdi, arg(1)
+
+ movdqu xmm4, [rsi + 0] ;ip[4] ip[0]
+ movdqu xmm0, [rsi + 16] ;ip[12] ip[8]
+
+ pxor xmm7, xmm7
+ ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ ; 13 12 11 10 03 02 01 00
+ ;
+ ; 33 32 31 30 23 22 21 20
+ ;
+ movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00
+ punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00
+ punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10
+ movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00
+ punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00
+ punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02
+ ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
+ movdqa xmm3, xmm4 ;ip[4] ip[0]
+
+ paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+ psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+
+ movdqa xmm5, xmm4
+ punpcklqdq xmm4, xmm3 ;d1 a1
+ punpckhqdq xmm5, xmm3 ;c1 b1
+
+ movdqa xmm1, xmm5 ;c1 b1
+    paddw       xmm5, xmm4          ;d1+c1 a1+b1 aka op[4] op[0]
+ psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
+ ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ ; 13 12 11 10 03 02 01 00
+ ;
+ ; 33 32 31 30 23 22 21 20
+ ;
+ movdqa xmm0, xmm5 ; 13 12 11 10 03 02 01 00
+ punpcklwd xmm5, xmm4 ; 23 03 22 02 21 01 20 00
+ punpckhwd xmm0, xmm4 ; 33 13 32 12 31 11 30 10
+ movdqa xmm1, xmm5 ; 23 03 22 02 21 01 20 00
+ punpcklwd xmm5, xmm0 ; 31 21 11 01 30 20 10 00
+ punpckhwd xmm1, xmm0 ; 33 23 13 03 32 22 12 02
+ ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
+ movdqa xmm3, xmm5 ;ip[4] ip[0]
+
+ paddw xmm5, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+ psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+
+ movdqa xmm6, xmm5
+ punpcklqdq xmm5, xmm3 ;d1 a1
+ punpckhqdq xmm6, xmm3 ;c1 b1
+
+ movdqa xmm1, xmm6 ;c1 b1
+    paddw       xmm6, xmm5          ;d1+c1 a1+b1 aka op[4] op[0]
+ psubw xmm5, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
+
+ movdqa xmm0, xmm6 ;aka b2 a2
+ movdqa xmm1, xmm5 ;aka d2 c2
+
+ pcmpgtw xmm0, xmm7
+ pcmpgtw xmm1, xmm7
+
+ psrlw xmm0, 15
+ psrlw xmm1, 15
+
+ paddw xmm6, xmm0
+ paddw xmm5, xmm1
+
+ psraw xmm6, 1
+ psraw xmm5, 1
+
+ ; a2 = a1 + b1;
+ ; b2 = c1 + d1;
+ ; c2 = a1 - b1;
+ ; d2 = d1 - c1;
+ ; a2 += (a2>0);
+ ; b2 += (b2>0);
+ ; c2 += (c2>0);
+ ; d2 += (d2>0);
+ ; op[0] = (a2)>>1;
+ ; op[4] = (b2)>>1;
+ ; op[8] = (c2)>>1;
+ ; op[12]= (d2)>>1;
+
+ movdqu [rdi + 0], xmm6
+ movdqu [rdi + 16], xmm5
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
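
The commented block near the end spells out what the last few packed instructions do: the pcmpgtw/psrlw pair turns the sign of each result into a 0/1 bias, so the arithmetic shift by one rounds symmetrically instead of always toward minus infinity. The same second-stage butterfly and rounding in C, as a reference sketch:

/* a1..d1 are the first-stage sums/differences named in the asm comments. */
static void walsh_stage2(int a1, int b1, int c1, int d1, short op[4])
{
    int a2 = a1 + b1;
    int b2 = c1 + d1;
    int c2 = a1 - b1;
    int d2 = d1 - c1;

    a2 += (a2 > 0);             /* bias positives by one before the shift */
    b2 += (b2 > 0);
    c2 += (c2 > 0);
    d2 += (d2 > 0);

    op[0] = (short)(a2 >> 1);   /* written to op[0], op[4], op[8], op[12] */
    op[1] = (short)(b2 >> 1);   /* in the real stride-4 output layout     */
    op[2] = (short)(c2 >> 1);
    op[3] = (short)(d2 >> 1);
}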
diff --git a/vp8/encoder/x86/mcomp_x86.h b/vp8/encoder/x86/mcomp_x86.h
new file mode 100644
index 000000000..5661491ad
--- /dev/null
+++ b/vp8/encoder/x86/mcomp_x86.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef MCOMP_X86_H
+#define MCOMP_X86_H
+
+#if HAVE_SSE3
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef vp8_search_full_search
+#define vp8_search_full_search vp8_full_search_sadx3
+
+#undef vp8_search_diamond_search
+#define vp8_search_diamond_search vp8_diamond_search_sadx4
+
+#endif
+#endif
+
+#endif
+
diff --git a/vp8/encoder/x86/preproc_mmx.c b/vp8/encoder/x86/preproc_mmx.c
new file mode 100644
index 000000000..69617ca47
--- /dev/null
+++ b/vp8/encoder/x86/preproc_mmx.c
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "memory.h"
+#include "preproc.h"
+#include "pragmas.h"
+
+/****************************************************************************
+* Macros
+****************************************************************************/
+#define FRAMECOUNT 7
+#define ROUNDUP32(X) ( ( ( (unsigned long) X ) + 31 )&( 0xFFFFFFE0 ) )
+
+/****************************************************************************
+* Imports
+****************************************************************************/
+extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled);
+
+/****************************************************************************
+* Exported Global Variables
+****************************************************************************/
+void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength);
+
+/****************************************************************************
+ *
+ * ROUTINE : temp_filter_wmt
+ *
+ * INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance.
+ * unsigned char *s : Pointer to source frame.
+ * unsigned char *d : Pointer to destination frame.
+ * int bytes : Number of bytes to filter.
+ * int strength : Strength of filter to apply.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ *  FUNCTION      : Performs a closeness-adjusted temporal blur
+ *
+ * SPECIAL NOTES : Destination frame can be same as source frame.
+ *
+ ****************************************************************************/
+void temp_filter_wmt
+(
+ pre_proc_instance *ppi,
+ unsigned char *s,
+ unsigned char *d,
+ int bytes,
+ int strength
+)
+{
+ int byte = 0;
+ unsigned char *frameptr = ppi->frame_buffer;
+
+ __declspec(align(16)) unsigned short threes[] = { 3, 3, 3, 3, 3, 3, 3, 3};
+ __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16, 16, 16, 16, 16};
+
+ if (ppi->frame == 0)
+ {
+ do
+ {
+ int i;
+ int frame = 0;
+
+ do
+ {
+ for (i = 0; i < 8; i++)
+ {
+ *frameptr = s[byte+i];
+ ++frameptr;
+ }
+
+ ++frame;
+ }
+ while (frame < FRAMECOUNT);
+
+ for (i = 0; i < 8; i++)
+ d[byte+i] = s[byte+i];
+
+ byte += 8;
+
+ }
+ while (byte < bytes);
+ }
+ else
+ {
+ int i;
+ int offset2 = (ppi->frame % FRAMECOUNT);
+
+ do
+ {
+ __declspec(align(16)) unsigned short counts[8];
+ __declspec(align(16)) unsigned short sums[8];
+ __asm
+ {
+ mov eax, offset2
+ mov edi, s // source pixels
+ pxor xmm1, xmm1 // accumulator
+
+ pxor xmm7, xmm7
+
+ mov esi, frameptr // accumulator
+ pxor xmm2, xmm2 // count
+
+ movq xmm3, QWORD PTR [edi]
+
+ movq QWORD PTR [esi+8*eax], xmm3
+
+ punpcklbw xmm3, xmm2 // xmm3 source pixels
+ mov ecx, FRAMECOUNT
+
+ next_frame:
+ movq xmm4, QWORD PTR [esi] // get frame buffer values
+ punpcklbw xmm4, xmm7 // xmm4 frame buffer pixels
+ movdqa xmm6, xmm4 // save the pixel values
+ psubsw xmm4, xmm3 // subtracted pixel values
+ pmullw xmm4, xmm4 // square xmm4
+ movd xmm5, strength
+ psrlw xmm4, xmm5 // should be strength
+ pmullw xmm4, threes // 3 * modifier
+ movdqa xmm5, sixteens // 16s
+ psubusw xmm5, xmm4 // 16 - modifiers
+ movdqa xmm4, xmm5 // save the modifiers
+ pmullw xmm4, xmm6 // multiplier values
+ paddusw xmm1, xmm4 // accumulator
+ paddusw xmm2, xmm5 // count
+ add esi, 8 // next frame
+ dec ecx // next set of eight pixels
+ jnz next_frame
+
+ movdqa counts, xmm2
+ psrlw xmm2, 1 // divide count by 2 for rounding
+ paddusw xmm1, xmm2 // rounding added in
+
+ mov frameptr, esi
+
+ movdqa sums, xmm1
+ }
+
+ for (i = 0; i < 8; i++)
+ {
+ int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];
+ blurvalue >>= 16;
+ d[i] = blurvalue;
+ }
+
+ s += 8;
+ d += 8;
+ byte += 8;
+ }
+ while (byte < bytes);
+ }
+
+ ++ppi->frame;
+ __asm emms
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : temp_filter_mmx
+ *
+ * INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance.
+ * unsigned char *s : Pointer to source frame.
+ * unsigned char *d : Pointer to destination frame.
+ * int bytes : Number of bytes to filter.
+ * int strength : Strength of filter to apply.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ *  FUNCTION      : Performs a closeness-adjusted temporal blur
+ *
+ * SPECIAL NOTES : Destination frame can be same as source frame.
+ *
+ ****************************************************************************/
+void temp_filter_mmx
+(
+ pre_proc_instance *ppi,
+ unsigned char *s,
+ unsigned char *d,
+ int bytes,
+ int strength
+)
+{
+ int byte = 0;
+ unsigned char *frameptr = ppi->frame_buffer;
+
+ __declspec(align(16)) unsigned short threes[] = { 3, 3, 3, 3};
+ __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16};
+
+ if (ppi->frame == 0)
+ {
+ do
+ {
+ int i;
+ int frame = 0;
+
+ do
+ {
+ for (i = 0; i < 4; i++)
+ {
+ *frameptr = s[byte+i];
+ ++frameptr;
+ }
+
+ ++frame;
+ }
+ while (frame < FRAMECOUNT);
+
+ for (i = 0; i < 4; i++)
+ d[byte+i] = s[byte+i];
+
+ byte += 4;
+
+ }
+ while (byte < bytes);
+ }
+ else
+ {
+ int i;
+ int offset2 = (ppi->frame % FRAMECOUNT);
+
+ do
+ {
+ __declspec(align(16)) unsigned short counts[8];
+ __declspec(align(16)) unsigned short sums[8];
+ __asm
+ {
+
+ mov eax, offset2
+ mov edi, s // source pixels
+ pxor mm1, mm1 // accumulator
+ pxor mm7, mm7
+
+ mov esi, frameptr // accumulator
+ pxor mm2, mm2 // count
+
+ movd mm3, DWORD PTR [edi]
+ movd DWORD PTR [esi+4*eax], mm3
+
+ punpcklbw mm3, mm2 // mm3 source pixels
+ mov ecx, FRAMECOUNT
+
+ next_frame:
+ movd mm4, DWORD PTR [esi] // get frame buffer values
+ punpcklbw mm4, mm7 // mm4 frame buffer pixels
+ movq mm6, mm4 // save the pixel values
+ psubsw mm4, mm3 // subtracted pixel values
+ pmullw mm4, mm4 // square mm4
+ movd mm5, strength
+ psrlw mm4, mm5 // should be strength
+ pmullw mm4, threes // 3 * modifier
+ movq mm5, sixteens // 16s
+ psubusw mm5, mm4 // 16 - modifiers
+ movq mm4, mm5 // save the modifiers
+ pmullw mm4, mm6 // multiplier values
+ paddusw mm1, mm4 // accumulator
+ paddusw mm2, mm5 // count
+ add esi, 4 // next frame
+            dec         ecx             // next set of four pixels
+ jnz next_frame
+
+ movq counts, mm2
+ psrlw mm2, 1 // divide count by 2 for rounding
+ paddusw mm1, mm2 // rounding added in
+
+ mov frameptr, esi
+
+ movq sums, mm1
+
+ }
+
+ for (i = 0; i < 4; i++)
+ {
+ int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];
+ blurvalue >>= 16;
+ d[i] = blurvalue;
+ }
+
+ s += 4;
+ d += 4;
+ byte += 4;
+ }
+ while (byte < bytes);
+ }
+
+ ++ppi->frame;
+ __asm emms
+}
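
Both filters apply the same per-pixel weighting across the FRAMECOUNT stored frames: square the difference from the current pixel, shift it down by strength, multiply by 3 and subtract the result from 16 with saturation to get a weight; weighted pixel values and weights are accumulated, half the weight total is added for rounding, and the division is done with the ppi->fixed_divide reciprocal table (assumed here to hold 65536/count entries). A portable-C paraphrase of that inner loop, ignoring the 16-bit packed-arithmetic wraparound details:

static unsigned char temp_filter_pixel(const unsigned char *history, int frames,
                                       unsigned char src, int strength,
                                       const unsigned int *fixed_divide)
{
    unsigned int sum = 0, count = 0;
    int f;

    for (f = 0; f < frames; f++)
    {
        int diff     = history[f] - src;
        int modifier = 3 * ((diff * diff) >> strength);
        int weight   = 16 - modifier;            /* psubusw saturates at 0 */

        if (weight < 0)
            weight = 0;

        sum   += (unsigned int)weight * history[f];
        count += (unsigned int)weight;
    }

    sum += count >> 1;                           /* rounding */
    return (unsigned char)((sum * fixed_divide[count]) >> 16);
}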
diff --git a/vp8/encoder/x86/quantize_mmx.asm b/vp8/encoder/x86/quantize_mmx.asm
new file mode 100644
index 000000000..847fc6e37
--- /dev/null
+++ b/vp8/encoder/x86/quantize_mmx.asm
@@ -0,0 +1,438 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
+; short *qcoeff_ptr,short *dequant_ptr,
+; short *scan_mask, short *round_ptr,
+; short *quant_ptr, short *dqcoeff_ptr);
+global sym(vp8_fast_quantize_b_impl_mmx)
+sym(vp8_fast_quantize_b_impl_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;coeff_ptr
+ movq mm0, [rsi]
+
+ mov rax, arg(1) ;zbin_ptr
+ movq mm1, [rax]
+
+ movq mm3, mm0
+ psraw mm0, 15
+
+ pxor mm3, mm0
+ psubw mm3, mm0 ; abs
+
+ movq mm2, mm3
+ pcmpgtw mm1, mm2
+
+ pandn mm1, mm2
+ movq mm3, mm1
+
+ mov rdx, arg(6) ;quant_ptr
+ movq mm1, [rdx]
+
+ mov rcx, arg(5) ;round_ptr
+ movq mm2, [rcx]
+
+ paddw mm3, mm2
+ pmulhuw mm3, mm1
+
+ pxor mm3, mm0
+ psubw mm3, mm0 ;gain the sign back
+
+ mov rdi, arg(2) ;qcoeff_ptr
+ movq mm0, mm3
+
+ movq [rdi], mm3
+
+ mov rax, arg(3) ;dequant_ptr
+ movq mm2, [rax]
+
+ pmullw mm3, mm2
+ mov rax, arg(7) ;dqcoeff_ptr
+
+ movq [rax], mm3
+
+ ; next 8
+ movq mm4, [rsi+8]
+
+ mov rax, arg(1) ;zbin_ptr
+ movq mm5, [rax+8]
+
+ movq mm7, mm4
+ psraw mm4, 15
+
+ pxor mm7, mm4
+ psubw mm7, mm4 ; abs
+
+ movq mm6, mm7
+ pcmpgtw mm5, mm6
+
+ pandn mm5, mm6
+ movq mm7, mm5
+
+ movq mm5, [rdx+8]
+ movq mm6, [rcx+8]
+
+ paddw mm7, mm6
+ pmulhuw mm7, mm5
+
+ pxor mm7, mm4
+ psubw mm7, mm4;gain the sign back
+
+ mov rdi, arg(2) ;qcoeff_ptr
+
+ movq mm1, mm7
+ movq [rdi+8], mm7
+
+ mov rax, arg(3) ;dequant_ptr
+ movq mm6, [rax+8]
+
+ pmullw mm7, mm6
+ mov rax, arg(7) ;dqcoeff_ptr
+
+ movq [rax+8], mm7
+
+
+ ; next 8
+ movq mm4, [rsi+16]
+
+ mov rax, arg(1) ;zbin_ptr
+ movq mm5, [rax+16]
+
+ movq mm7, mm4
+ psraw mm4, 15
+
+ pxor mm7, mm4
+ psubw mm7, mm4 ; abs
+
+ movq mm6, mm7
+ pcmpgtw mm5, mm6
+
+ pandn mm5, mm6
+ movq mm7, mm5
+
+ movq mm5, [rdx+16]
+ movq mm6, [rcx+16]
+
+ paddw mm7, mm6
+ pmulhuw mm7, mm5
+
+ pxor mm7, mm4
+ psubw mm7, mm4;gain the sign back
+
+ mov rdi, arg(2) ;qcoeff_ptr
+
+ movq mm1, mm7
+ movq [rdi+16], mm7
+
+ mov rax, arg(3) ;dequant_ptr
+ movq mm6, [rax+16]
+
+ pmullw mm7, mm6
+ mov rax, arg(7) ;dqcoeff_ptr
+
+ movq [rax+16], mm7
+
+
+ ; next 8
+ movq mm4, [rsi+24]
+
+ mov rax, arg(1) ;zbin_ptr
+ movq mm5, [rax+24]
+
+ movq mm7, mm4
+ psraw mm4, 15
+
+ pxor mm7, mm4
+ psubw mm7, mm4 ; abs
+
+ movq mm6, mm7
+ pcmpgtw mm5, mm6
+
+ pandn mm5, mm6
+ movq mm7, mm5
+
+ movq mm5, [rdx+24]
+ movq mm6, [rcx+24]
+
+ paddw mm7, mm6
+ pmulhuw mm7, mm5
+
+ pxor mm7, mm4
+ psubw mm7, mm4;gain the sign back
+
+ mov rdi, arg(2) ;qcoeff_ptr
+
+ movq mm1, mm7
+ movq [rdi+24], mm7
+
+ mov rax, arg(3) ;dequant_ptr
+ movq mm6, [rax+24]
+
+ pmullw mm7, mm6
+ mov rax, arg(7) ;dqcoeff_ptr
+
+ movq [rax+24], mm7
+
+
+
+ mov rdi, arg(4) ;scan_mask
+ mov rsi, arg(2) ;qcoeff_ptr
+
+ pxor mm5, mm5
+ pxor mm7, mm7
+
+ movq mm0, [rsi]
+ movq mm1, [rsi+8]
+
+ movq mm2, [rdi]
+ movq mm3, [rdi+8];
+
+ pcmpeqw mm0, mm7
+ pcmpeqw mm1, mm7
+
+ pcmpeqw mm6, mm6
+ pxor mm0, mm6
+
+ pxor mm1, mm6
+ psrlw mm0, 15
+
+ psrlw mm1, 15
+ pmaddwd mm0, mm2
+
+ pmaddwd mm1, mm3
+ movq mm5, mm0
+
+ paddd mm5, mm1
+
+ movq mm0, [rsi+16]
+ movq mm1, [rsi+24]
+
+ movq mm2, [rdi+16]
+ movq mm3, [rdi+24];
+
+ pcmpeqw mm0, mm7
+ pcmpeqw mm1, mm7
+
+ pcmpeqw mm6, mm6
+ pxor mm0, mm6
+
+ pxor mm1, mm6
+ psrlw mm0, 15
+
+ psrlw mm1, 15
+ pmaddwd mm0, mm2
+
+ pmaddwd mm1, mm3
+ paddd mm5, mm0
+
+ paddd mm5, mm1
+ movq mm0, mm5
+
+ psrlq mm5, 32
+ paddd mm0, mm5
+
+ ; eob adjustment begins here
+ movd rcx, mm0
+ and rcx, 0xffff
+
+ xor rdx, rdx
+ sub rdx, rcx ; rdx=-rcx
+
+ bsr rax, rcx
+ inc rax
+
+ sar rdx, 31
+ and rax, rdx
+    ; This pure-assembly eob computation replaces the old mixed assembly/C
+    ; version; the original C logic is kept below as reference
+ ; movd rcx, mm0
+ ; bsr rax, rcx
+ ;
+ ; mov eob, rax
+ ; mov eee, rcx
+ ;
+ ;if(eee==0)
+ ;{
+ ; eob=-1;
+ ;}
+ ;else if(eee<0)
+ ;{
+ ; eob=15;
+ ;}
+ ;d->eob = eob+1;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr,
+; short *qcoeff_ptr,short *dequant_ptr,
+; short *scan_mask, short *round_ptr,
+; short *quant_ptr, short *dqcoeff_ptr);
+global sym(vp8_fast_quantize_b_impl_sse)
+sym(vp8_fast_quantize_b_impl_sse):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;coeff_ptr
+ movdqa xmm0, [rsi]
+
+ mov rax, arg(1) ;zbin_ptr
+ movdqa xmm1, [rax]
+
+ movdqa xmm3, xmm0
+ psraw xmm0, 15
+
+ pxor xmm3, xmm0
+ psubw xmm3, xmm0 ; abs
+
+ movdqa xmm2, xmm3
+ pcmpgtw xmm1, xmm2
+
+ pandn xmm1, xmm2
+ movdqa xmm3, xmm1
+
+ mov rdx, arg(6) ; quant_ptr
+ movdqa xmm1, [rdx]
+
+ mov rcx, arg(5) ; round_ptr
+ movdqa xmm2, [rcx]
+
+ paddw xmm3, xmm2
+ pmulhuw xmm3, xmm1
+
+ pxor xmm3, xmm0
+ psubw xmm3, xmm0 ;gain the sign back
+
+ mov rdi, arg(2) ;qcoeff_ptr
+ movdqa xmm0, xmm3
+
+ movdqa [rdi], xmm3
+
+ mov rax, arg(3) ;dequant_ptr
+ movdqa xmm2, [rax]
+
+ pmullw xmm3, xmm2
+ mov rax, arg(7) ;dqcoeff_ptr
+
+ movdqa [rax], xmm3
+
+ ; next 8
+ movdqa xmm4, [rsi+16]
+
+ mov rax, arg(1) ;zbin_ptr
+ movdqa xmm5, [rax+16]
+
+ movdqa xmm7, xmm4
+ psraw xmm4, 15
+
+ pxor xmm7, xmm4
+ psubw xmm7, xmm4 ; abs
+
+ movdqa xmm6, xmm7
+ pcmpgtw xmm5, xmm6
+
+ pandn xmm5, xmm6
+ movdqa xmm7, xmm5
+
+ movdqa xmm5, [rdx+16]
+ movdqa xmm6, [rcx+16]
+
+
+ paddw xmm7, xmm6
+ pmulhuw xmm7, xmm5
+
+ pxor xmm7, xmm4
+ psubw xmm7, xmm4;gain the sign back
+
+ mov rdi, arg(2) ;qcoeff_ptr
+
+ movdqa xmm1, xmm7
+ movdqa [rdi+16], xmm7
+
+ mov rax, arg(3) ;dequant_ptr
+ movdqa xmm6, [rax+16]
+
+ pmullw xmm7, xmm6
+ mov rax, arg(7) ;dqcoeff_ptr
+
+ movdqa [rax+16], xmm7
+ mov rdi, arg(4) ;scan_mask
+
+ pxor xmm7, xmm7
+ movdqa xmm2, [rdi]
+
+ movdqa xmm3, [rdi+16];
+ pcmpeqw xmm0, xmm7
+
+ pcmpeqw xmm1, xmm7
+ pcmpeqw xmm6, xmm6
+
+ pxor xmm0, xmm6
+ pxor xmm1, xmm6
+
+ psrlw xmm0, 15
+ psrlw xmm1, 15
+
+ pmaddwd xmm0, xmm2
+ pmaddwd xmm1, xmm3
+
+ movq xmm2, xmm0
+ movq xmm3, xmm1
+
+ psrldq xmm0, 8
+ psrldq xmm1, 8
+
+ paddd xmm0, xmm1
+ paddd xmm2, xmm3
+
+ paddd xmm0, xmm2
+ movq xmm1, xmm0
+
+ psrldq xmm0, 4
+ paddd xmm1, xmm0
+
+ movd rcx, xmm1
+ and rcx, 0xffff
+
+ xor rdx, rdx
+ sub rdx, rcx
+
+ bsr rax, rcx
+ inc rax
+
+ sar rdx, 31
+ and rax, rdx
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
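
Both routines in this file implement the same per-coefficient logic, the MMX version in 4-coefficient chunks and the SSE2 version in 8-coefficient chunks. A rough scalar rendering is given below; it is an illustrative sketch with hypothetical names rather than the encoder's C reference, and it returns the eob value that the assembly computes branchlessly with bsr / sar / and.

/* Illustrative scalar rendering of the fast quantizer above. */
static int fast_quantize_b_sketch(const short *coeff, const short *zbin,
                                  short *qcoeff, const short *dequant,
                                  const short *scan_mask, const short *round,
                                  const short *quant, short *dqcoeff)
{
    unsigned int mask = 0;
    int eob = 0;
    int i;

    for (i = 0; i < 16; i++)
    {
        int sign = coeff[i] >> 15;                       /* 0 or -1, as psraw 15 */
        int x    = (coeff[i] ^ sign) - sign;             /* abs(coeff[i])        */

        x = (x < zbin[i]) ? 0 : x;                       /* zero bin: pcmpgtw / pandn */
        x = (int)(((unsigned short)(x + round[i]) *
                   (unsigned int)(unsigned short)quant[i]) >> 16);  /* pmulhuw       */
        x = (x ^ sign) - sign;                           /* gain the sign back        */

        qcoeff[i]  = (short)x;
        dqcoeff[i] = (short)(x * dequant[i]);

        if (x)
            mask += (unsigned short)scan_mask[i];        /* pmaddwd over the scan mask */
    }

    mask &= 0xffff;
    while (mask > 1)                                     /* bsr: index of the highest set bit */
    {
        mask >>= 1;
        eob++;
    }
    return mask ? eob + 1 : 0;                           /* 0 when every coefficient is zero  */
}
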
diff --git a/vp8/encoder/x86/sad_mmx.asm b/vp8/encoder/x86/sad_mmx.asm
new file mode 100644
index 000000000..a825698e7
--- /dev/null
+++ b/vp8/encoder/x86/sad_mmx.asm
@@ -0,0 +1,428 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+global sym(vp8_sad16x16_mmx)
+global sym(vp8_sad8x16_mmx)
+global sym(vp8_sad8x8_mmx)
+global sym(vp8_sad4x4_mmx)
+global sym(vp8_sad16x8_mmx)
+
+%idefine QWORD
+
+;unsigned int vp8_sad16x16_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+sym(vp8_sad16x16_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+
+ lea rcx, [rcx+rax*8]
+ pxor mm7, mm7
+
+ pxor mm6, mm6
+
+x16x16sad_mmx_loop:
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm2, QWORD PTR [rsi+8]
+
+ movq mm1, QWORD PTR [rdi]
+ movq mm3, QWORD PTR [rdi+8]
+
+ movq mm4, mm0
+ movq mm5, mm2
+
+ psubusb mm0, mm1
+ psubusb mm1, mm4
+
+ psubusb mm2, mm3
+ psubusb mm3, mm5
+
+ por mm0, mm1
+ por mm2, mm3
+
+ movq mm1, mm0
+ movq mm3, mm2
+
+ punpcklbw mm0, mm6
+ punpcklbw mm2, mm6
+
+ punpckhbw mm1, mm6
+ punpckhbw mm3, mm6
+
+ paddw mm0, mm2
+ paddw mm1, mm3
+
+
+ lea rsi, [rsi+rax]
+ add rdi, rdx
+
+ paddw mm7, mm0
+ paddw mm7, mm1
+
+ cmp rsi, rcx
+ jne x16x16sad_mmx_loop
+
+
+ movq mm0, mm7
+
+ punpcklwd mm0, mm6
+ punpckhwd mm7, mm6
+
+ paddw mm0, mm7
+ movq mm7, mm0
+
+
+ psrlq mm0, 32
+ paddw mm7, mm0
+
+ movd rax, mm7
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad8x16_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+sym(vp8_sad8x16_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+
+ lea rcx, [rcx+rax*8]
+ pxor mm7, mm7
+
+ pxor mm6, mm6
+
+x8x16sad_mmx_loop:
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm1, QWORD PTR [rdi]
+
+ movq mm2, mm0
+ psubusb mm0, mm1
+
+ psubusb mm1, mm2
+ por mm0, mm1
+
+ movq mm2, mm0
+ punpcklbw mm0, mm6
+
+ punpckhbw mm2, mm6
+ lea rsi, [rsi+rax]
+
+ add rdi, rdx
+ paddw mm7, mm0
+
+ paddw mm7, mm2
+ cmp rsi, rcx
+
+ jne x8x16sad_mmx_loop
+
+ movq mm0, mm7
+ punpcklwd mm0, mm6
+
+ punpckhwd mm7, mm6
+ paddw mm0, mm7
+
+ movq mm7, mm0
+ psrlq mm0, 32
+
+ paddw mm7, mm0
+ movd rax, mm7
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad8x8_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+sym(vp8_sad8x8_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+ pxor mm7, mm7
+
+ pxor mm6, mm6
+
+x8x8sad_mmx_loop:
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm1, QWORD PTR [rdi]
+
+ movq mm2, mm0
+ psubusb mm0, mm1
+
+ psubusb mm1, mm2
+ por mm0, mm1
+
+ movq mm2, mm0
+ punpcklbw mm0, mm6
+
+ punpckhbw mm2, mm6
+ paddw mm0, mm2
+
+ lea rsi, [rsi+rax]
+ add rdi, rdx
+
+ paddw mm7, mm0
+ cmp rsi, rcx
+
+ jne x8x8sad_mmx_loop
+
+ movq mm0, mm7
+ punpcklwd mm0, mm6
+
+ punpckhwd mm7, mm6
+ paddw mm0, mm7
+
+ movq mm7, mm0
+ psrlq mm0, 32
+
+ paddw mm7, mm0
+ movd rax, mm7
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad4x4_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+sym(vp8_sad4x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ movd mm0, QWORD PTR [rsi]
+ movd mm1, QWORD PTR [rdi]
+
+ movd mm2, QWORD PTR [rsi+rax]
+ movd mm3, QWORD PTR [rdi+rdx]
+
+ punpcklbw mm0, mm2
+ punpcklbw mm1, mm3
+
+ movq mm2, mm0
+ psubusb mm0, mm1
+
+ psubusb mm1, mm2
+ por mm0, mm1
+
+ movq mm2, mm0
+ pxor mm3, mm3
+
+ punpcklbw mm0, mm3
+ punpckhbw mm2, mm3
+
+ paddw mm0, mm2
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ movd mm4, QWORD PTR [rsi]
+ movd mm5, QWORD PTR [rdi]
+
+ movd mm6, QWORD PTR [rsi+rax]
+ movd mm7, QWORD PTR [rdi+rdx]
+
+ punpcklbw mm4, mm6
+ punpcklbw mm5, mm7
+
+ movq mm6, mm4
+ psubusb mm4, mm5
+
+ psubusb mm5, mm6
+ por mm4, mm5
+
+ movq mm5, mm4
+ punpcklbw mm4, mm3
+
+ punpckhbw mm5, mm3
+ paddw mm4, mm5
+
+ paddw mm0, mm4
+ movq mm1, mm0
+
+ punpcklwd mm0, mm3
+ punpckhwd mm1, mm3
+
+ paddw mm0, mm1
+ movq mm1, mm0
+
+ psrlq mm0, 32
+ paddw mm0, mm1
+
+ movd rax, mm0
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad16x8_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+sym(vp8_sad16x8_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+ pxor mm7, mm7
+
+ pxor mm6, mm6
+
+x16x8sad_mmx_loop:
+
+ movq mm0, [rsi]
+ movq mm1, [rdi]
+
+ movq mm2, [rsi+8]
+ movq mm3, [rdi+8]
+
+ movq mm4, mm0
+ movq mm5, mm2
+
+ psubusb mm0, mm1
+ psubusb mm1, mm4
+
+ psubusb mm2, mm3
+ psubusb mm3, mm5
+
+ por mm0, mm1
+ por mm2, mm3
+
+ movq mm1, mm0
+ movq mm3, mm2
+
+ punpcklbw mm0, mm6
+ punpckhbw mm1, mm6
+
+ punpcklbw mm2, mm6
+ punpckhbw mm3, mm6
+
+
+ paddw mm0, mm2
+ paddw mm1, mm3
+
+ paddw mm0, mm1
+ lea rsi, [rsi+rax]
+
+ add rdi, rdx
+ paddw mm7, mm0
+
+ cmp rsi, rcx
+ jne x16x8sad_mmx_loop
+
+ movq mm0, mm7
+ punpcklwd mm0, mm6
+
+ punpckhwd mm7, mm6
+ paddw mm0, mm7
+
+ movq mm7, mm0
+ psrlq mm0, 32
+
+ paddw mm7, mm0
+ movd rax, mm7
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
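
Each routine in this file computes a plain sum of absolute differences for the block size named in its symbol, getting the absolute value from two saturating byte subtractions (psubusb in both directions, then por) so no branch is needed. The scalar equivalent is roughly the sketch below, with illustrative names rather than the project's own C reference.

/* Illustrative scalar SAD over a width x height block; the routines above
 * unroll and vectorize this loop for their fixed block sizes. */
unsigned int sad_mxn_sketch(const unsigned char *src_ptr, int src_stride,
                            const unsigned char *ref_ptr, int ref_stride,
                            int width, int height)
{
    unsigned int sad = 0;
    int r, c;

    for (r = 0; r < height; r++)
    {
        for (c = 0; c < width; c++)
        {
            int diff = src_ptr[c] - ref_ptr[c];
            sad += (unsigned int)(diff < 0 ? -diff : diff);
        }
        src_ptr += src_stride;
        ref_ptr += ref_stride;
    }
    return sad;
}
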
diff --git a/vp8/encoder/x86/sad_sse2.asm b/vp8/encoder/x86/sad_sse2.asm
new file mode 100644
index 000000000..53240bbf1
--- /dev/null
+++ b/vp8/encoder/x86/sad_sse2.asm
@@ -0,0 +1,329 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%idefine QWORD
+
+;unsigned int vp8_sad16x16_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+global sym(vp8_sad16x16_wmt)
+sym(vp8_sad16x16_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+
+ lea rcx, [rcx+rax*8]
+ pxor xmm7, xmm7
+
+x16x16sad_wmt_loop:
+
+ movq xmm0, QWORD PTR [rsi]
+ movq xmm2, QWORD PTR [rsi+8]
+
+ movq xmm1, QWORD PTR [rdi]
+ movq xmm3, QWORD PTR [rdi+8]
+
+ movq xmm4, QWORD PTR [rsi+rax]
+ movq xmm5, QWORD PTR [rdi+rdx]
+
+
+ punpcklbw xmm0, xmm2
+ punpcklbw xmm1, xmm3
+
+ psadbw xmm0, xmm1
+ movq xmm6, QWORD PTR [rsi+rax+8]
+
+ movq xmm3, QWORD PTR [rdi+rdx+8]
+ lea rsi, [rsi+rax*2]
+
+ lea rdi, [rdi+rdx*2]
+ punpcklbw xmm4, xmm6
+
+ punpcklbw xmm5, xmm3
+ psadbw xmm4, xmm5
+
+ paddw xmm7, xmm0
+ paddw xmm7, xmm4
+
+ cmp rsi, rcx
+ jne x16x16sad_wmt_loop
+
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd rax, xmm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;unsigned int vp8_sad8x16_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int max_err)
+global sym(vp8_sad8x16_wmt)
+sym(vp8_sad8x16_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rbx*8]
+
+ lea rcx, [rcx+rbx*8]
+ pxor mm7, mm7
+
+x8x16sad_wmt_loop:
+
+ movd rax, mm7
+ cmp rax, arg(4)
+ jg x8x16sad_wmt_early_exit
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm1, QWORD PTR [rdi]
+
+ movq mm2, QWORD PTR [rsi+rbx]
+ movq mm3, QWORD PTR [rdi+rdx]
+
+ psadbw mm0, mm1
+ psadbw mm2, mm3
+
+ lea rsi, [rsi+rbx*2]
+ lea rdi, [rdi+rdx*2]
+
+ paddw mm7, mm0
+ paddw mm7, mm2
+
+ cmp rsi, rcx
+ jne x8x16sad_wmt_loop
+
+ movd rax, mm7
+
+x8x16sad_wmt_early_exit:
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad8x8_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+global sym(vp8_sad8x8_wmt)
+sym(vp8_sad8x8_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rbx*8]
+ pxor mm7, mm7
+
+x8x8sad_wmt_loop:
+
+ movd rax, mm7
+ cmp rax, arg(4)
+ jg x8x8sad_wmt_early_exit
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm1, QWORD PTR [rdi]
+
+ psadbw mm0, mm1
+ lea rsi, [rsi+rbx]
+
+ add rdi, rdx
+ paddw mm7, mm0
+
+ cmp rsi, rcx
+ jne x8x8sad_wmt_loop
+
+ movd rax, mm7
+x8x8sad_wmt_early_exit:
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;unsigned int vp8_sad4x4_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+global sym(vp8_sad4x4_wmt)
+sym(vp8_sad4x4_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ movd mm0, QWORD PTR [rsi]
+ movd mm1, QWORD PTR [rdi]
+
+ movd mm2, QWORD PTR [rsi+rax]
+ movd mm3, QWORD PTR [rdi+rdx]
+
+ punpcklbw mm0, mm2
+ punpcklbw mm1, mm3
+
+ psadbw mm0, mm1
+ lea rsi, [rsi+rax*2]
+
+ lea rdi, [rdi+rdx*2]
+ movd mm4, QWORD PTR [rsi]
+
+ movd mm5, QWORD PTR [rdi]
+ movd mm6, QWORD PTR [rsi+rax]
+
+ movd mm7, QWORD PTR [rdi+rdx]
+ punpcklbw mm4, mm6
+
+ punpcklbw mm5, mm7
+ psadbw mm4, mm5
+
+ paddw mm0, mm4
+ movd rax, mm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad16x8_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+global sym(vp8_sad16x8_wmt)
+sym(vp8_sad16x8_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rbx*8]
+ pxor mm7, mm7
+
+x16x8sad_wmt_loop:
+
+ movd rax, mm7
+ cmp rax, arg(4)
+ jg x16x8sad_wmt_early_exit
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm2, QWORD PTR [rsi+8]
+
+ movq mm1, QWORD PTR [rdi]
+ movq mm3, QWORD PTR [rdi+8]
+
+ movq mm4, QWORD PTR [rsi+rbx]
+ movq mm5, QWORD PTR [rdi+rdx]
+
+ psadbw mm0, mm1
+ psadbw mm2, mm3
+
+ movq mm1, QWORD PTR [rsi+rbx+8]
+ movq mm3, QWORD PTR [rdi+rdx+8]
+
+ psadbw mm4, mm5
+ psadbw mm1, mm3
+
+ lea rsi, [rsi+rbx*2]
+ lea rdi, [rdi+rdx*2]
+
+ paddw mm0, mm2
+ paddw mm4, mm1
+
+ paddw mm7, mm0
+ paddw mm7, mm4
+
+ cmp rsi, rcx
+ jne x16x8sad_wmt_loop
+
+ movd rax, mm7
+
+x16x8sad_wmt_early_exit:
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ UNSHADOW_ARGS
+ pop rbp
+ ret
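
The 16x16 routine here uses psadbw on full rows, while the 8x16, 8x8 and 16x8 routines additionally take a fifth max_err argument and compare the running SAD against it at the top of each loop pass, returning the partial sum as soon as the bound is exceeded. In scalar terms the early exit looks like the following sketch (illustrative names only).

/* Sketch of the early-exit SAD used by the *_wmt routines that take max_err.
 * Once the partial SAD exceeds the bound, the exact total no longer matters
 * to the caller, so the routine returns immediately. */
static unsigned int sad_early_exit_sketch(const unsigned char *src_ptr, int src_stride,
                                          const unsigned char *ref_ptr, int ref_stride,
                                          int width, int height, int max_err)
{
    unsigned int sad = 0;
    int r, c;

    for (r = 0; r < height; r++)
    {
        if ((int)sad > max_err)
            return sad;                       /* partial value, already too large */

        for (c = 0; c < width; c++)
        {
            int diff = src_ptr[c] - ref_ptr[c];
            sad += (unsigned int)(diff < 0 ? -diff : diff);
        }
        src_ptr += src_stride;
        ref_ptr += ref_stride;
    }
    return sad;
}
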
diff --git a/vp8/encoder/x86/sad_sse3.asm b/vp8/encoder/x86/sad_sse3.asm
new file mode 100644
index 000000000..38cc02957
--- /dev/null
+++ b/vp8/encoder/x86/sad_sse3.asm
@@ -0,0 +1,939 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%idefine QWORD
+
+%macro PROCESS_16X2X3 1
+%if %1
+ movdqa xmm0, [rsi]
+ lddqu xmm5, [rdi]
+ lddqu xmm6, [rdi+1]
+ lddqu xmm7, [rdi+2]
+
+ psadbw xmm5, xmm0
+ psadbw xmm6, xmm0
+ psadbw xmm7, xmm0
+%else
+ movdqa xmm0, [rsi]
+ lddqu xmm1, [rdi]
+ lddqu xmm2, [rdi+1]
+ lddqu xmm3, [rdi+2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endif
+ movdqa xmm0, QWORD PTR [rsi+rax]
+ lddqu xmm1, QWORD PTR [rdi+rdx]
+ lddqu xmm2, QWORD PTR [rdi+rdx+1]
+ lddqu xmm3, QWORD PTR [rdi+rdx+2]
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endmacro
+
+%macro PROCESS_8X2X3 1
+%if %1
+ movq mm0, [rsi]
+ movq mm5, [rdi]
+ movq mm6, [rdi+1]
+ movq mm7, [rdi+2]
+
+ psadbw mm5, mm0
+ psadbw mm6, mm0
+ psadbw mm7, mm0
+%else
+ movq mm0, [rsi]
+ movq mm1, [rdi]
+ movq mm2, [rdi+1]
+ movq mm3, [rdi+2]
+
+ psadbw mm1, mm0
+ psadbw mm2, mm0
+ psadbw mm3, mm0
+
+ paddw mm5, mm1
+ paddw mm6, mm2
+ paddw mm7, mm3
+%endif
+ movq mm0, QWORD PTR [rsi+rax]
+ movq mm1, QWORD PTR [rdi+rdx]
+ movq mm2, QWORD PTR [rdi+rdx+1]
+ movq mm3, QWORD PTR [rdi+rdx+2]
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ psadbw mm1, mm0
+ psadbw mm2, mm0
+ psadbw mm3, mm0
+
+ paddw mm5, mm1
+ paddw mm6, mm2
+ paddw mm7, mm3
+%endmacro
+
+%macro LOAD_X4_ADDRESSES 5
+ mov %2, [%1+REG_SZ_BYTES*0]
+ mov %3, [%1+REG_SZ_BYTES*1]
+
+ mov %4, [%1+REG_SZ_BYTES*2]
+ mov %5, [%1+REG_SZ_BYTES*3]
+%endmacro
+
+%macro PROCESS_16X2X4 1
+%if %1
+ movdqa xmm0, [rsi]
+ lddqu xmm4, [rcx]
+ lddqu xmm5, [rdx]
+ lddqu xmm6, [rbx]
+ lddqu xmm7, [rdi]
+
+ psadbw xmm4, xmm0
+ psadbw xmm5, xmm0
+ psadbw xmm6, xmm0
+ psadbw xmm7, xmm0
+%else
+ movdqa xmm0, [rsi]
+ lddqu xmm1, [rcx]
+ lddqu xmm2, [rdx]
+ lddqu xmm3, [rbx]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm4, xmm1
+ lddqu xmm1, [rdi]
+ paddw xmm5, xmm2
+ paddw xmm6, xmm3
+
+ psadbw xmm1, xmm0
+ paddw xmm7, xmm1
+%endif
+ movdqa xmm0, QWORD PTR [rsi+rax]
+ lddqu xmm1, QWORD PTR [rcx+rbp]
+ lddqu xmm2, QWORD PTR [rdx+rbp]
+ lddqu xmm3, QWORD PTR [rbx+rbp]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm4, xmm1
+ lddqu xmm1, QWORD PTR [rdi+rbp]
+ paddw xmm5, xmm2
+ paddw xmm6, xmm3
+
+ lea rsi, [rsi+rax*2]
+ lea rcx, [rcx+rbp*2]
+
+ lea rdx, [rdx+rbp*2]
+ lea rbx, [rbx+rbp*2]
+
+ lea rdi, [rdi+rbp*2]
+
+ psadbw xmm1, xmm0
+ paddw xmm7, xmm1
+
+%endmacro
+
+%macro PROCESS_8X2X4 1
+%if %1
+ movq mm0, [rsi]
+ movq mm4, [rcx]
+ movq mm5, [rdx]
+ movq mm6, [rbx]
+ movq mm7, [rdi]
+
+ psadbw mm4, mm0
+ psadbw mm5, mm0
+ psadbw mm6, mm0
+ psadbw mm7, mm0
+%else
+ movq mm0, [rsi]
+ movq mm1, [rcx]
+ movq mm2, [rdx]
+ movq mm3, [rbx]
+
+ psadbw mm1, mm0
+ psadbw mm2, mm0
+ psadbw mm3, mm0
+
+ paddw mm4, mm1
+ movq mm1, [rdi]
+ paddw mm5, mm2
+ paddw mm6, mm3
+
+ psadbw mm1, mm0
+ paddw mm7, mm1
+%endif
+ movq mm0, QWORD PTR [rsi+rax]
+ movq mm1, QWORD PTR [rcx+rbp]
+ movq mm2, QWORD PTR [rdx+rbp]
+ movq mm3, QWORD PTR [rbx+rbp]
+
+ psadbw mm1, mm0
+ psadbw mm2, mm0
+ psadbw mm3, mm0
+
+ paddw mm4, mm1
+ movq mm1, QWORD PTR [rdi+rbp]
+ paddw mm5, mm2
+ paddw mm6, mm3
+
+ lea rsi, [rsi+rax*2]
+ lea rcx, [rcx+rbp*2]
+
+ lea rdx, [rdx+rbp*2]
+ lea rbx, [rbx+rbp*2]
+
+ lea rdi, [rdi+rbp*2]
+
+ psadbw mm1, mm0
+ paddw mm7, mm1
+
+%endmacro
+
+;void vp8_sad16x16x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad16x16x3_sse3)
+sym(vp8_sad16x16x3_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_16X2X3 1
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+
+ mov rdi, arg(4) ;Results
+
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rdi], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rdi+4], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rdi+8], xmm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_sad16x8x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad16x8x3_sse3)
+sym(vp8_sad16x8x3_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_16X2X3 1
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+
+ mov rdi, arg(4) ;Results
+
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rdi], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rdi+4], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rdi+8], xmm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_sad8x16x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad8x16x3_sse3)
+sym(vp8_sad8x16x3_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_8X2X3 1
+ PROCESS_8X2X3 0
+ PROCESS_8X2X3 0
+ PROCESS_8X2X3 0
+ PROCESS_8X2X3 0
+ PROCESS_8X2X3 0
+ PROCESS_8X2X3 0
+ PROCESS_8X2X3 0
+
+ mov rdi, arg(4) ;Results
+
+ movd [rdi], mm5
+ movd [rdi+4], mm6
+ movd [rdi+8], mm7
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_sad8x8x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad8x8x3_sse3)
+sym(vp8_sad8x8x3_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_8X2X3 1
+ PROCESS_8X2X3 0
+ PROCESS_8X2X3 0
+ PROCESS_8X2X3 0
+
+ mov rdi, arg(4) ;Results
+
+ movd [rdi], mm5
+ movd [rdi+4], mm6
+ movd [rdi+8], mm7
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_sad4x4x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad4x4x3_sse3)
+sym(vp8_sad4x4x3_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ movd mm0, QWORD PTR [rsi]
+ movd mm1, QWORD PTR [rdi]
+
+ movd mm2, QWORD PTR [rsi+rax]
+ movd mm3, QWORD PTR [rdi+rdx]
+
+ punpcklbw mm0, mm2
+ punpcklbw mm1, mm3
+
+ movd mm4, QWORD PTR [rdi+1]
+ movd mm5, QWORD PTR [rdi+2]
+
+ movd mm2, QWORD PTR [rdi+rdx+1]
+ movd mm3, QWORD PTR [rdi+rdx+2]
+
+ psadbw mm1, mm0
+
+ punpcklbw mm4, mm2
+ punpcklbw mm5, mm3
+
+ psadbw mm4, mm0
+ psadbw mm5, mm0
+
+
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ movd mm0, QWORD PTR [rsi]
+ movd mm2, QWORD PTR [rdi]
+
+ movd mm3, QWORD PTR [rsi+rax]
+ movd mm6, QWORD PTR [rdi+rdx]
+
+ punpcklbw mm0, mm3
+ punpcklbw mm2, mm6
+
+ movd mm3, QWORD PTR [rdi+1]
+ movd mm7, QWORD PTR [rdi+2]
+
+ psadbw mm2, mm0
+
+ paddw mm1, mm2
+
+ movd mm2, QWORD PTR [rdi+rdx+1]
+ movd mm6, QWORD PTR [rdi+rdx+2]
+
+ punpcklbw mm3, mm2
+ punpcklbw mm7, mm6
+
+ psadbw mm3, mm0
+ psadbw mm7, mm0
+
+ paddw mm3, mm4
+ paddw mm7, mm5
+
+ mov rdi, arg(4) ;Results
+ movd [rdi], mm1
+
+ movd [rdi+4], mm3
+ movd [rdi+8], mm7
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;unsigned int vp8_sad16x16_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int max_err)
+;%define lddqu movdqu
+global sym(vp8_sad16x16_sse3)
+sym(vp8_sad16x16_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rbx*8]
+
+ lea rcx, [rcx+rbx*8]
+ pxor mm7, mm7
+
+vp8_sad16x16_sse3_loop:
+
+ movd rax, mm7
+ cmp rax, arg(4)
+ jg vp8_sad16x16_early_exit
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm2, QWORD PTR [rsi+8]
+
+ movq mm1, QWORD PTR [rdi]
+ movq mm3, QWORD PTR [rdi+8]
+
+ movq mm4, QWORD PTR [rsi+rbx]
+ movq mm5, QWORD PTR [rdi+rdx]
+
+ psadbw mm0, mm1
+ psadbw mm2, mm3
+
+ movq mm1, QWORD PTR [rsi+rbx+8]
+ movq mm3, QWORD PTR [rdi+rdx+8]
+
+ psadbw mm4, mm5
+ psadbw mm1, mm3
+
+ lea rsi, [rsi+rbx*2]
+ lea rdi, [rdi+rdx*2]
+
+ paddw mm0, mm2
+ paddw mm4, mm1
+
+ paddw mm7, mm0
+ paddw mm7, mm4
+
+ cmp rsi, rcx
+ jne vp8_sad16x16_sse3_loop
+
+ movd rax, mm7
+
+vp8_sad16x16_early_exit:
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_sad16x16x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr_base,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad16x16x4d_sse3)
+sym(vp8_sad16x16x4d_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ push rbp
+ mov rdi, arg(2) ; ref_ptr_base
+
+ LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
+
+ mov rsi, arg(0) ;src_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rbp, dword ptr arg(3) ;ref_stride
+
+ xchg rbx, rax
+
+ PROCESS_16X2X4 1
+ PROCESS_16X2X4 0
+ PROCESS_16X2X4 0
+ PROCESS_16X2X4 0
+ PROCESS_16X2X4 0
+ PROCESS_16X2X4 0
+ PROCESS_16X2X4 0
+ PROCESS_16X2X4 0
+
+ pop rbp
+ mov rdi, arg(4) ;Results
+
+ movq xmm0, xmm4
+ psrldq xmm4, 8
+
+ paddw xmm0, xmm4
+ movd [rdi], xmm0
+;-
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rdi+4], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rdi+8], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rdi+12], xmm0
+
+ ; begin epilog
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_sad16x8x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr_base,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad16x8x4d_sse3)
+sym(vp8_sad16x8x4d_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ push rbp
+ mov rdi, arg(2) ; ref_ptr_base
+
+ LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
+
+ mov rsi, arg(0) ;src_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rbp, dword ptr arg(3) ;ref_stride
+
+ xchg rbx, rax
+
+ PROCESS_16X2X4 1
+ PROCESS_16X2X4 0
+ PROCESS_16X2X4 0
+ PROCESS_16X2X4 0
+
+ pop rbp
+ mov rdi, arg(4) ;Results
+
+ movq xmm0, xmm4
+ psrldq xmm4, 8
+
+ paddw xmm0, xmm4
+ movd [rdi], xmm0
+;-
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rdi+4], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rdi+8], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rdi+12], xmm0
+
+ ; begin epilog
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_sad8x16x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad8x16x4d_sse3)
+sym(vp8_sad8x16x4d_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ push rbp
+ mov rdi, arg(2) ; ref_ptr_base
+
+ LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
+
+ mov rsi, arg(0) ;src_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rbp, dword ptr arg(3) ;ref_stride
+
+ xchg rbx, rax
+
+ PROCESS_8X2X4 1
+ PROCESS_8X2X4 0
+ PROCESS_8X2X4 0
+ PROCESS_8X2X4 0
+ PROCESS_8X2X4 0
+ PROCESS_8X2X4 0
+ PROCESS_8X2X4 0
+ PROCESS_8X2X4 0
+
+ pop rbp
+ mov rdi, arg(4) ;Results
+
+ movd [rdi], mm4
+ movd [rdi+4], mm5
+ movd [rdi+8], mm6
+ movd [rdi+12], mm7
+
+ ; begin epilog
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_sad8x8x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad8x8x4d_sse3)
+sym(vp8_sad8x8x4d_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ push rbp
+ mov rdi, arg(2) ; ref_ptr_base
+
+ LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
+
+ mov rsi, arg(0) ;src_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rbp, dword ptr arg(3) ;ref_stride
+
+ xchg rbx, rax
+
+ PROCESS_8X2X4 1
+ PROCESS_8X2X4 0
+ PROCESS_8X2X4 0
+ PROCESS_8X2X4 0
+
+ pop rbp
+ mov rdi, arg(4) ;Results
+
+ movd [rdi], mm4
+ movd [rdi+4], mm5
+ movd [rdi+8], mm6
+ movd [rdi+12], mm7
+
+ ; begin epilog
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_sad4x4x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad4x4x4d_sse3)
+sym(vp8_sad4x4x4d_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ push rbp
+ mov rdi, arg(2) ; ref_ptr_base
+
+ LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
+
+ mov rsi, arg(0) ;src_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rbp, dword ptr arg(3) ;ref_stride
+
+ xchg rbx, rax
+
+ movd mm0, QWORD PTR [rsi]
+ movd mm1, QWORD PTR [rcx]
+
+ movd mm2, QWORD PTR [rsi+rax]
+ movd mm3, QWORD PTR [rcx+rbp]
+
+ punpcklbw mm0, mm2
+ punpcklbw mm1, mm3
+
+ movd mm4, QWORD PTR [rdx]
+ movd mm5, QWORD PTR [rbx]
+
+ movd mm6, QWORD PTR [rdi]
+ movd mm2, QWORD PTR [rdx+rbp]
+
+ movd mm3, QWORD PTR [rbx+rbp]
+ movd mm7, QWORD PTR [rdi+rbp]
+
+ psadbw mm1, mm0
+
+ punpcklbw mm4, mm2
+ punpcklbw mm5, mm3
+
+ punpcklbw mm6, mm7
+ psadbw mm4, mm0
+
+ psadbw mm5, mm0
+ psadbw mm6, mm0
+
+
+
+ lea rsi, [rsi+rax*2]
+ lea rcx, [rcx+rbp*2]
+
+ lea rdx, [rdx+rbp*2]
+ lea rbx, [rbx+rbp*2]
+
+ lea rdi, [rdi+rbp*2]
+
+ movd mm0, QWORD PTR [rsi]
+ movd mm2, QWORD PTR [rcx]
+
+ movd mm3, QWORD PTR [rsi+rax]
+ movd mm7, QWORD PTR [rcx+rbp]
+
+ punpcklbw mm0, mm3
+ punpcklbw mm2, mm7
+
+ movd mm3, QWORD PTR [rdx]
+ movd mm7, QWORD PTR [rbx]
+
+ psadbw mm2, mm0
+ mov rax, rbp
+
+ pop rbp
+ mov rsi, arg(4) ;Results
+
+ paddw mm1, mm2
+ movd [rsi], mm1
+
+ movd mm2, QWORD PTR [rdx+rax]
+ movd mm1, QWORD PTR [rbx+rax]
+
+ punpcklbw mm3, mm2
+ punpcklbw mm7, mm1
+
+ psadbw mm3, mm0
+ psadbw mm7, mm0
+
+ movd mm2, QWORD PTR [rdi]
+ movd mm1, QWORD PTR [rdi+rax]
+
+ paddw mm3, mm4
+ paddw mm7, mm5
+
+ movd [rsi+4], mm3
+ punpcklbw mm2, mm1
+
+ movd [rsi+8], mm7
+ psadbw mm2, mm0
+
+ paddw mm2, mm6
+ movd [rsi+12], mm2
+
+
+ ; begin epilog
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
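
The x3 entry points in this file score three horizontally adjacent reference candidates (ref_ptr, ref_ptr + 1, ref_ptr + 2) in one pass, and the x4d entry points score four independent candidates whose pointers are read from ref_ptr_base. Their behaviour matches the sketch below, which reuses the sad_mxn_sketch routine shown after sad_mmx.asm; all names are illustrative.

/* Defined in the earlier sketch after sad_mmx.asm. */
unsigned int sad_mxn_sketch(const unsigned char *src_ptr, int src_stride,
                            const unsigned char *ref_ptr, int ref_stride,
                            int width, int height);

/* Three neighbouring candidates in the same row: ref, ref+1, ref+2. */
static void sad_x3_sketch(const unsigned char *src_ptr, int src_stride,
                          const unsigned char *ref_ptr, int ref_stride,
                          int width, int height, int *results)
{
    int i;
    for (i = 0; i < 3; i++)
        results[i] = (int)sad_mxn_sketch(src_ptr, src_stride,
                                         ref_ptr + i, ref_stride, width, height);
}

/* Four independent candidate pointers, as loaded by LOAD_X4_ADDRESSES. */
static void sad_x4d_sketch(const unsigned char *src_ptr, int src_stride,
                           unsigned char *const ref_ptr_base[4], int ref_stride,
                           int width, int height, int *results)
{
    int i;
    for (i = 0; i < 4; i++)
        results[i] = (int)sad_mxn_sketch(src_ptr, src_stride,
                                         ref_ptr_base[i], ref_stride, width, height);
}
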
diff --git a/vp8/encoder/x86/sad_ssse3.asm b/vp8/encoder/x86/sad_ssse3.asm
new file mode 100644
index 000000000..1bb956121
--- /dev/null
+++ b/vp8/encoder/x86/sad_ssse3.asm
@@ -0,0 +1,367 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%idefine QWORD
+
+%macro PROCESS_16X2X3 1
+%if %1
+ movdqa xmm0, [rsi]
+ lddqu xmm5, [rdi]
+ lddqu xmm6, [rdi+1]
+ lddqu xmm7, [rdi+2]
+
+ psadbw xmm5, xmm0
+ psadbw xmm6, xmm0
+ psadbw xmm7, xmm0
+%else
+ movdqa xmm0, [rsi]
+ lddqu xmm1, [rdi]
+ lddqu xmm2, [rdi+1]
+ lddqu xmm3, [rdi+2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endif
+ movdqa xmm0, QWORD PTR [rsi+rax]
+ lddqu xmm1, QWORD PTR [rdi+rdx]
+ lddqu xmm2, QWORD PTR [rdi+rdx+1]
+ lddqu xmm3, QWORD PTR [rdi+rdx+2]
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endmacro
+
+%macro PROCESS_16X2X3_OFFSET 2
+%if %1
+ movdqa xmm0, [rsi]
+ movdqa xmm4, [rdi]
+ movdqa xmm7, [rdi+16]
+
+ movdqa xmm5, xmm7
+ palignr xmm5, xmm4, %2
+
+ movdqa xmm6, xmm7
+ palignr xmm6, xmm4, (%2+1)
+
+ palignr xmm7, xmm4, (%2+2)
+
+ psadbw xmm5, xmm0
+ psadbw xmm6, xmm0
+ psadbw xmm7, xmm0
+%else
+ movdqa xmm0, [rsi]
+ movdqa xmm4, [rdi]
+ movdqa xmm3, [rdi+16]
+
+ movdqa xmm1, xmm3
+ palignr xmm1, xmm4, %2
+
+ movdqa xmm2, xmm3
+ palignr xmm2, xmm4, (%2+1)
+
+ palignr xmm3, xmm4, (%2+2)
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endif
+ movdqa xmm0, QWORD PTR [rsi+rax]
+ movdqa xmm4, QWORD PTR [rdi+rdx]
+ movdqa xmm3, QWORD PTR [rdi+rdx+16]
+
+ movdqa xmm1, xmm3
+ palignr xmm1, xmm4, %2
+
+ movdqa xmm2, xmm3
+ palignr xmm2, xmm4, (%2+1)
+
+ palignr xmm3, xmm4, (%2+2)
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endmacro
+
+%macro PROCESS_16X16X3_OFFSET 2
+%2_aligned_by_%1:
+
+ sub rdi, %1
+
+ PROCESS_16X2X3_OFFSET 1, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+
+ jmp %2_store_off
+
+%endmacro
+
+%macro PROCESS_16X8X3_OFFSET 2
+%2_aligned_by_%1:
+
+ sub rdi, %1
+
+ PROCESS_16X2X3_OFFSET 1, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+
+ jmp %2_store_off
+
+%endmacro
+
+;void vp8_sad16x16x3_ssse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad16x16x3_ssse3)
+sym(vp8_sad16x16x3_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ push rcx
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ mov rdx, 0xf
+ and rdx, rdi
+
+ jmp vp8_sad16x16x3_ssse3_skiptable
+vp8_sad16x16x3_ssse3_jumptable:
+ dd vp8_sad16x16x3_ssse3_aligned_by_0 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_1 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_2 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_3 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_4 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_5 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_6 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_7 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_8 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_9 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_10 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_11 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_12 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_13 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_14 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_15 - vp8_sad16x16x3_ssse3_do_jump
+vp8_sad16x16x3_ssse3_skiptable:
+
+ call vp8_sad16x16x3_ssse3_do_jump
+vp8_sad16x16x3_ssse3_do_jump:
+ pop rcx ; get the address of do_jump
+ mov rax, vp8_sad16x16x3_ssse3_jumptable - vp8_sad16x16x3_ssse3_do_jump
+ add rax, rcx ; get the absolute address of vp8_sad16x16x3_ssse3_jumptable
+
+ movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
+ add rcx, rax
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ jmp rcx
+
+ PROCESS_16X16X3_OFFSET 0, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 1, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 2, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 3, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 4, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 5, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 6, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 7, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 8, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 9, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 10, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 11, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 12, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 13, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 14, vp8_sad16x16x3_ssse3
+
+vp8_sad16x16x3_ssse3_aligned_by_15:
+ PROCESS_16X2X3 1
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+
+vp8_sad16x16x3_ssse3_store_off:
+ mov rdi, arg(4) ;Results
+
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rdi], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rdi+4], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rdi+8], xmm0
+
+ ; begin epilog
+ pop rcx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_sad16x8x3_ssse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad16x8x3_ssse3)
+sym(vp8_sad16x8x3_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ push rcx
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ mov rdx, 0xf
+ and rdx, rdi
+
+ jmp vp8_sad16x8x3_ssse3_skiptable
+vp8_sad16x8x3_ssse3_jumptable:
+ dd vp8_sad16x8x3_ssse3_aligned_by_0 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_1 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_2 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_3 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_4 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_5 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_6 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_7 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_8 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_9 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_10 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_11 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_12 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_13 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_14 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_15 - vp8_sad16x8x3_ssse3_do_jump
+vp8_sad16x8x3_ssse3_skiptable:
+
+ call vp8_sad16x8x3_ssse3_do_jump
+vp8_sad16x8x3_ssse3_do_jump:
+ pop rcx ; get the address of do_jump
+ mov rax, vp8_sad16x8x3_ssse3_jumptable - vp8_sad16x8x3_ssse3_do_jump
+ add rax, rcx ; get the absolute address of vp8_sad16x8x3_ssse3_jumptable
+
+ movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
+ add rcx, rax
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ jmp rcx
+
+ PROCESS_16X8X3_OFFSET 0, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 1, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 2, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 3, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 4, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 5, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 6, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 7, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 8, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 9, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 10, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 11, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 12, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 13, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 14, vp8_sad16x8x3_ssse3
+
+vp8_sad16x8x3_ssse3_aligned_by_15:
+
+ PROCESS_16X2X3 1
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+
+vp8_sad16x8x3_ssse3_store_off:
+ mov rdi, arg(4) ;Results
+
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rdi], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rdi+4], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rdi+8], xmm0
+
+ ; begin epilog
+ pop rcx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
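
These SSSE3 versions avoid unaligned lddqu loads: the entry code measures ref_ptr & 15 and, through a position-independent jump table, branches to a variant built around two aligned loads plus palignr with the matching shift. Conceptually the dispatch behaves like the sketch below, where aligned_by[] is a hypothetical table of per-alignment workers standing in for the aligned_by_0 .. aligned_by_15 labels generated by the macros above.

#include <stdint.h>     /* uintptr_t */

typedef void (*sad16x3_fn)(const unsigned char *src_ptr, int src_stride,
                           const unsigned char *ref_ptr, int ref_stride,
                           int *results);

/* Pick a worker from the low four bits of the reference pointer, the same
 * quantity "mov rdx, 0xf / and rdx, rdi" computes in the assembly. */
static void sad16x16x3_dispatch_sketch(const unsigned char *src_ptr, int src_stride,
                                       const unsigned char *ref_ptr, int ref_stride,
                                       int *results,
                                       const sad16x3_fn aligned_by[16])
{
    aligned_by[(uintptr_t)ref_ptr & 15](src_ptr, src_stride,
                                        ref_ptr, ref_stride, results);
}
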
diff --git a/vp8/encoder/x86/subtract_mmx.asm b/vp8/encoder/x86/subtract_mmx.asm
new file mode 100644
index 000000000..ce3e61066
--- /dev/null
+++ b/vp8/encoder/x86/subtract_mmx.asm
@@ -0,0 +1,431 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
+; unsigned short *diff, unsigned char *Predictor,
+; int pitch);
+global sym(vp8_subtract_b_mmx_impl)
+sym(vp8_subtract_b_mmx_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rdi, arg(2) ;diff
+ mov rax, arg(3) ;Predictor
+ mov rsi, arg(0) ;z
+ movsxd rdx, dword ptr arg(1);src_stride;
+ movsxd rcx, dword ptr arg(4);pitch
+ pxor mm7, mm7
+
+ movd mm0, [rsi]
+ movd mm1, [rax]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq [rdi], mm0
+
+
+ movd mm0, [rsi+rdx]
+ movd mm1, [rax+rcx]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq [rdi+rcx*2],mm0
+
+
+ movd mm0, [rsi+rdx*2]
+ movd mm1, [rax+rcx*2]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq [rdi+rcx*4], mm0
+
+ lea rsi, [rsi+rdx*2]
+ lea rcx, [rcx+rcx*2]
+
+
+
+ movd mm0, [rsi+rdx]
+ movd mm1, [rax+rcx]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq [rdi+rcx*2], mm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride)
+global sym(vp8_subtract_mby_mmx)
+sym(vp8_subtract_mby_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(1) ;src
+ mov rdi, arg(0) ;diff
+
+ mov rax, arg(2) ;pred
+ movsxd rdx, dword ptr arg(3) ;stride
+
+ mov rcx, 16
+ pxor mm0, mm0
+
+submby_loop:
+
+ movq mm1, [rsi]
+ movq mm3, [rax]
+
+ movq mm2, mm1
+ movq mm4, mm3
+
+ punpcklbw mm1, mm0
+ punpcklbw mm3, mm0
+
+ punpckhbw mm2, mm0
+ punpckhbw mm4, mm0
+
+ psubw mm1, mm3
+ psubw mm2, mm4
+
+ movq [rdi], mm1
+ movq [rdi+8], mm2
+
+
+ movq mm1, [rsi+8]
+ movq mm3, [rax+8]
+
+ movq mm2, mm1
+ movq mm4, mm3
+
+ punpcklbw mm1, mm0
+ punpcklbw mm3, mm0
+
+ punpckhbw mm2, mm0
+ punpckhbw mm4, mm0
+
+ psubw mm1, mm3
+ psubw mm2, mm4
+
+ movq [rdi+16], mm1
+ movq [rdi+24], mm2
+
+
+ add rdi, 32
+ add rax, 16
+
+ lea rsi, [rsi+rdx]
+
+ sub rcx, 1
+ jnz submby_loop
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
+global sym(vp8_subtract_mbuv_mmx)
+sym(vp8_subtract_mbuv_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;short *udiff = diff + 256;
+ ;short *vdiff = diff + 320;
+ ;unsigned char *upred = pred + 256;
+ ;unsigned char *vpred = pred + 320;
+
+ ;unsigned char *z = usrc;
+ ;unsigned short *diff = udiff;
+ ;unsigned char *Predictor= upred;
+
+ mov rdi, arg(0) ;diff
+ mov rax, arg(3) ;pred
+ mov rsi, arg(1) ;z = usrc
+ add rdi, 256*2 ;diff = diff + 256 (shorts)
+ add rax, 256 ;Predictor = pred + 256
+ movsxd rdx, dword ptr arg(4) ;stride;
+ pxor mm7, mm7
+
+ movq mm0, [rsi]
+ movq mm1, [rax]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+
+ movq mm0, [rsi+rdx]
+ movq mm1, [rax+8]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi+16], mm0
+ movq [rdi+24], mm3
+
+ movq mm0, [rsi+rdx*2]
+ movq mm1, [rax+16]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi+32], mm0
+ movq [rdi+40], mm3
+ lea rsi, [rsi+rdx*2]
+
+
+ movq mm0, [rsi+rdx]
+ movq mm1, [rax+24]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+
+ movq [rdi+48], mm0
+ movq [rdi+56], mm3
+
+
+ add rdi, 64
+ add rax, 32
+ lea rsi, [rsi+rdx*2]
+
+
+ movq mm0, [rsi]
+ movq mm1, [rax]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+
+ movq mm0, [rsi+rdx]
+ movq mm1, [rax+8]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi+16], mm0
+ movq [rdi+24], mm3
+
+ movq mm0, [rsi+rdx*2]
+ movq mm1, [rax+16]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi+32], mm0
+ movq [rdi+40], mm3
+ lea rsi, [rsi+rdx*2]
+
+
+ movq mm0, [rsi+rdx]
+ movq mm1, [rax+24]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+
+ movq [rdi+48], mm0
+ movq [rdi+56], mm3
+
+ ;unsigned char *z = vsrc;
+ ;unsigned short *diff = vdiff;
+ ;unsigned char *Predictor= vpred;
+
+ mov rdi, arg(0) ;diff
+ mov rax, arg(3) ;pred
+        mov     rsi,        arg(2) ;z = vsrc
+ add rdi, 320*2 ;diff = diff + 320 (shorts)
+ add rax, 320 ;Predictor = pred + 320
+ movsxd rdx, dword ptr arg(4) ;stride;
+ pxor mm7, mm7
+
+ movq mm0, [rsi]
+ movq mm1, [rax]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+
+ movq mm0, [rsi+rdx]
+ movq mm1, [rax+8]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi+16], mm0
+ movq [rdi+24], mm3
+
+ movq mm0, [rsi+rdx*2]
+ movq mm1, [rax+16]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi+32], mm0
+ movq [rdi+40], mm3
+ lea rsi, [rsi+rdx*2]
+
+
+ movq mm0, [rsi+rdx]
+ movq mm1, [rax+24]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+
+ movq [rdi+48], mm0
+ movq [rdi+56], mm3
+
+
+ add rdi, 64
+ add rax, 32
+ lea rsi, [rsi+rdx*2]
+
+
+ movq mm0, [rsi]
+ movq mm1, [rax]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+
+ movq mm0, [rsi+rdx]
+ movq mm1, [rax+8]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi+16], mm0
+ movq [rdi+24], mm3
+
+ movq mm0, [rsi+rdx*2]
+ movq mm1, [rax+16]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi+32], mm0
+ movq [rdi+40], mm3
+ lea rsi, [rsi+rdx*2]
+
+
+ movq mm0, [rsi+rdx]
+ movq mm1, [rax+24]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+
+ movq [rdi+48], mm0
+ movq [rdi+56], mm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
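
All three routines in this file produce a 16-bit residual for every pixel, differing only in block size and in how the prediction buffer is laid out (the chroma predictors sit at offsets 256 and 320, as the commented-out C fragments note). The per-pixel operation is simply the following sketch, shown with a generic prediction stride and illustrative names.

/* Illustrative scalar subtraction: diff = source - prediction, widened to
 * 16 bits per sample. */
static void subtract_block_sketch(short *diff, const unsigned char *src, int src_stride,
                                  const unsigned char *pred, int pred_stride,
                                  int width, int height)
{
    int r, c;

    for (r = 0; r < height; r++)
    {
        for (c = 0; c < width; c++)
            diff[c] = (short)(src[c] - pred[c]);

        diff += width;
        src  += src_stride;
        pred += pred_stride;
    }
}
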
diff --git a/vp8/encoder/x86/variance_impl_mmx.asm b/vp8/encoder/x86/variance_impl_mmx.asm
new file mode 100644
index 000000000..d0da82ad4
--- /dev/null
+++ b/vp8/encoder/x86/variance_impl_mmx.asm
@@ -0,0 +1,980 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
+global sym(vp8_get_mb_ss_mmx)
+sym(vp8_get_mb_ss_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 8
+ ; end prolog
+
+ mov rax, arg(0) ;src_ptr
+ mov rcx, 16
+ pxor mm4, mm4
+
+NEXTROW:
+ movq mm0, [rax]
+ movq mm1, [rax+8]
+ movq mm2, [rax+16]
+ movq mm3, [rax+24]
+ pmaddwd mm0, mm0
+ pmaddwd mm1, mm1
+ pmaddwd mm2, mm2
+ pmaddwd mm3, mm3
+
+ paddd mm4, mm0
+ paddd mm4, mm1
+ paddd mm4, mm2
+ paddd mm4, mm3
+
+ add rax, 32
+ dec rcx
+ ja NEXTROW
+ movq QWORD PTR [rsp], mm4
+
+ ;return sum[0]+sum[1];
+ movsxd rax, dword ptr [rsp]
+ movsxd rcx, dword ptr [rsp+4]
+ add rax, rcx
+
+
+ ; begin epilog
+ add rsp, 8
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_get8x8var_mmx
+;(
+; unsigned char *src_ptr,
+; int source_stride,
+; unsigned char *ref_ptr,
+; int recon_stride,
+; unsigned int *SSE,
+; int *Sum
+;)
+global sym(vp8_get8x8var_mmx)
+sym(vp8_get8x8var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ push rbx
+ sub rsp, 16
+ ; end prolog
+
+
+        pxor        mm5, mm5                    ; Blank mm5
+        pxor        mm6, mm6                    ; Blank mm6
+        pxor        mm7, mm7                    ; Blank mm7
+
+ mov rax, arg(0) ;[src_ptr] ; Load base addresses
+ mov rbx, arg(2) ;[ref_ptr]
+ movsxd rcx, dword ptr arg(1) ;[source_stride]
+ movsxd rdx, dword ptr arg(3) ;[recon_stride]
+
+ ; Row 1
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+        punpcklbw       mm1, mm6
+        punpckhbw       mm2, mm6                ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+
+ ; Row 2
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+        punpcklbw       mm1, mm6
+        punpckhbw       mm2, mm6                ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 3
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+        punpcklbw       mm1, mm6
+        punpckhbw       mm2, mm6                ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 4
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+        punpcklbw       mm1, mm6
+        punpckhbw       mm2, mm6                ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 5
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+        punpcklbw       mm1, mm6
+        punpckhbw       mm2, mm6                ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ ; movq mm4, [rbx + rdx]
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 6
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+            punpcklbw       mm0, mm6            ; unpack to higher precision
+            punpcklbw       mm1, mm6
+            punpckhbw       mm2, mm6            ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 7
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+            punpcklbw       mm0, mm6            ; unpack to higher precision
+            punpcklbw       mm1, mm6
+            punpckhbw       mm2, mm6            ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 8
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+            punpcklbw       mm0, mm6            ; unpack to higher precision
+            punpcklbw       mm1, mm6
+            punpckhbw       mm2, mm6            ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Now accumulate the final results.
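+        ; mm5 holds four signed 16-bit partial sums of the differences and mm7
+        ; holds two 32-bit partial sums of squares; spill both to the stack and
+        ; fold the pieces together with scalar adds below.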
+ movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
+ movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
+ movsx rdx, WORD PTR [rsp+8]
+ movsx rcx, WORD PTR [rsp+10]
+ movsx rbx, WORD PTR [rsp+12]
+ movsx rax, WORD PTR [rsp+14]
+ add rdx, rcx
+ add rbx, rax
+ add rdx, rbx ;XSum
+ movsxd rax, DWORD PTR [rsp]
+ movsxd rcx, DWORD PTR [rsp+4]
+ add rax, rcx ;XXSum
+ mov rsi, arg(4) ;SSE
+ mov rdi, arg(5) ;Sum
+ mov dword ptr [rsi], eax
+ mov dword ptr [rdi], edx
+ xor rax, rax ; return 0
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+;unsigned int
+;vp8_get4x4var_mmx
+;(
+; unsigned char *src_ptr,
+; int source_stride,
+; unsigned char *ref_ptr,
+; int recon_stride,
+; unsigned int *SSE,
+; int *Sum
+;)
+global sym(vp8_get4x4var_mmx)
+sym(vp8_get4x4var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ push rbx
+ sub rsp, 16
+ ; end prolog
+
+
+        pxor            mm5, mm5                ; Clear mm5 (difference accumulator)
+        pxor            mm6, mm6                ; Clear mm6 (zero for unpacking)
+        pxor            mm7, mm7                ; Clear mm7 (SSE accumulator)
+
+ mov rax, arg(0) ;[src_ptr] ; Load base addresses
+ mov rbx, arg(2) ;[ref_ptr]
+ movsxd rcx, dword ptr arg(1) ;[source_stride]
+ movsxd rdx, dword ptr arg(3) ;[recon_stride]
+
+ ; Row 1
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+            punpcklbw       mm0, mm6            ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ paddw mm5, mm0 ; accumulate differences in mm5
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+
+ ; Row 2
+ movq mm0, [rax] ; Copy eight bytes to mm0
+            punpcklbw       mm0, mm6            ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ paddw mm5, mm0 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 3
+ movq mm0, [rax] ; Copy eight bytes to mm0
+            punpcklbw       mm0, mm6            ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ paddw mm5, mm0 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 4
+ movq mm0, [rax] ; Copy eight bytes to mm0
+
+            punpcklbw       mm0, mm6            ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ paddd mm7, mm0 ; accumulate in mm7
+
+
+ ; Now accumulate the final results.
+ movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
+ movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
+ movsx rdx, WORD PTR [rsp+8]
+ movsx rcx, WORD PTR [rsp+10]
+ movsx rbx, WORD PTR [rsp+12]
+ movsx rax, WORD PTR [rsp+14]
+ add rdx, rcx
+ add rbx, rax
+ add rdx, rbx ;XSum
+ movsxd rax, DWORD PTR [rsp]
+ movsxd rcx, DWORD PTR [rsp+4]
+ add rax, rcx ;XXSum
+ mov rsi, arg(4) ;SSE
+ mov rdi, arg(5) ;Sum
+ mov dword ptr [rsi], eax
+ mov dword ptr [rdi], edx
+ xor rax, rax ; return 0
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+;unsigned int
+;vp8_get4x4sse_cs_mmx
+;(
+; unsigned char *src_ptr,
+; int source_stride,
+; unsigned char *ref_ptr,
+; int recon_stride
+;)
+global sym(vp8_get4x4sse_cs_mmx)
+sym(vp8_get4x4sse_cs_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+
+        pxor            mm6, mm6                ; Clear mm6 (zero for unpacking)
+        pxor            mm7, mm7                ; Clear mm7 (SSE accumulator)
+
+ mov rax, arg(0) ;[src_ptr] ; Load base addresses
+ mov rbx, arg(2) ;[ref_ptr]
+ movsxd rcx, dword ptr arg(1) ;[source_stride]
+ movsxd rdx, dword ptr arg(3) ;[recon_stride]
+ ; Row 1
+        movd            mm0, [rax]              ; Copy four bytes to mm0
+        movd            mm1, [rbx]              ; Copy four bytes to mm1
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+        movd            mm1, [rbx]              ; Copy four bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 2
+        movd            mm0, [rax]              ; Copy four bytes to mm0
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+        movd            mm1, [rbx]              ; Copy four bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 3
+        movd            mm0, [rax]              ; Copy four bytes to mm0
+        punpcklbw       mm1, mm6
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+        movd            mm1, [rbx]              ; Copy four bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 4
+        movd            mm0, [rax]              ; Copy four bytes to mm0
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ pmaddwd mm0, mm0 ; square and accumulate
+ paddd mm7, mm0 ; accumulate in mm7
+
+ movq mm0, mm7 ;
+ psrlq mm7, 32
+
+ paddd mm0, mm7
+ movd rax, mm0
+
+
+ ; begin epilog
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%define mmx_filter_shift 7
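+; The bilinear tap pairs used with these routines sum to 128 (see
+; vp8_vp7_bilinear_filters_mmx in variance_mmx.c), so each filtered pixel is
+; effectively (a*F0 + b*F1 + 64) >> 7, where 64 = 1 << (mmx_filter_shift - 1)
+; is the rounding constant mmx_bi_rd defined at the end of this file.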
+
+;void vp8_filter_block2d_bil4x4_var_mmx
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned short *HFilter,
+; unsigned short *VFilter,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_filter_block2d_bil4x4_var_mmx)
+sym(vp8_filter_block2d_bil4x4_var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+
+ pxor mm6, mm6 ;
+ pxor mm7, mm7 ;
+
+ mov rax, arg(4) ;HFilter ;
+ mov rdx, arg(5) ;VFilter ;
+
+ mov rsi, arg(0) ;ref_ptr ;
+ mov rdi, arg(2) ;src_ptr ;
+
+ mov rcx, 4 ;
+ pxor mm0, mm0 ;
+
+ movd mm1, [rsi] ;
+ movd mm3, [rsi+1] ;
+
+ punpcklbw mm1, mm0 ;
+ pmullw mm1, [rax] ;
+
+ punpcklbw mm3, mm0 ;
+ pmullw mm3, [rax+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm1, [mmx_bi_rd GLOBAL] ;
+
+ psraw mm1, mmx_filter_shift ;
+ movq mm5, mm1
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rsi, r8
+%endif
+
+filter_block2d_bil4x4_var_mmx_loop:
+
+ movd mm1, [rsi] ;
+ movd mm3, [rsi+1] ;
+
+ punpcklbw mm1, mm0 ;
+ pmullw mm1, [rax] ;
+
+ punpcklbw mm3, mm0 ;
+ pmullw mm3, [rax+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm1, [mmx_bi_rd GLOBAL] ;
+
+ psraw mm1, mmx_filter_shift ;
+ movq mm3, mm5 ;
+
+ movq mm5, mm1 ;
+ pmullw mm3, [rdx] ;
+
+ pmullw mm1, [rdx+8] ;
+ paddw mm1, mm3 ;
+
+
+ paddw mm1, [mmx_bi_rd GLOBAL] ;
+ psraw mm1, mmx_filter_shift ;
+
+ movd mm3, [rdi] ;
+ punpcklbw mm3, mm0 ;
+
+ psubw mm1, mm3 ;
+ paddw mm6, mm1 ;
+
+ pmaddwd mm1, mm1 ;
+ paddd mm7, mm1 ;
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rdi, dword ptr arg(3) ;src_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+ add rsi, r8
+ add rdi, r9
+%endif
+ sub rcx, 1 ;
+ jnz filter_block2d_bil4x4_var_mmx_loop ;
+
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rdi, arg(6) ;sum
+ mov rsi, arg(7) ;sumsquared
+
+ movd dword ptr [rdi], mm2 ;
+ movd dword ptr [rsi], mm4 ;
+
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+
+;void vp8_filter_block2d_bil_var_mmx
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; unsigned short *HFilter,
+; unsigned short *VFilter,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_filter_block2d_bil_var_mmx)
+sym(vp8_filter_block2d_bil_var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+ pxor mm6, mm6 ;
+ pxor mm7, mm7 ;
+ mov rax, arg(5) ;HFilter ;
+
+ mov rdx, arg(6) ;VFilter ;
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+
+ pxor mm0, mm0 ;
+ movq mm1, [rsi] ;
+
+ movq mm3, [rsi+1] ;
+ movq mm2, mm1 ;
+
+ movq mm4, mm3 ;
+ punpcklbw mm1, mm0 ;
+
+ punpckhbw mm2, mm0 ;
+ pmullw mm1, [rax] ;
+
+ pmullw mm2, [rax] ;
+ punpcklbw mm3, mm0 ;
+
+ punpckhbw mm4, mm0 ;
+ pmullw mm3, [rax+8] ;
+
+ pmullw mm4, [rax+8] ;
+ paddw mm1, mm3 ;
+
+ paddw mm2, mm4 ;
+ paddw mm1, [mmx_bi_rd GLOBAL] ;
+
+ psraw mm1, mmx_filter_shift ;
+ paddw mm2, [mmx_bi_rd GLOBAL] ;
+
+ psraw mm2, mmx_filter_shift ;
+ movq mm5, mm1
+
+ packuswb mm5, mm2 ;
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ add rsi, r8
+%endif
+
+filter_block2d_bil_var_mmx_loop:
+
+ movq mm1, [rsi] ;
+ movq mm3, [rsi+1] ;
+
+ movq mm2, mm1 ;
+ movq mm4, mm3 ;
+
+ punpcklbw mm1, mm0 ;
+ punpckhbw mm2, mm0 ;
+
+ pmullw mm1, [rax] ;
+ pmullw mm2, [rax] ;
+
+ punpcklbw mm3, mm0 ;
+ punpckhbw mm4, mm0 ;
+
+ pmullw mm3, [rax+8] ;
+ pmullw mm4, [rax+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm2, mm4 ;
+
+ paddw mm1, [mmx_bi_rd GLOBAL] ;
+ psraw mm1, mmx_filter_shift ;
+
+ paddw mm2, [mmx_bi_rd GLOBAL] ;
+ psraw mm2, mmx_filter_shift ;
+
+ movq mm3, mm5 ;
+ movq mm4, mm5 ;
+
+ punpcklbw mm3, mm0 ;
+ punpckhbw mm4, mm0 ;
+
+ movq mm5, mm1 ;
+ packuswb mm5, mm2 ;
+
+ pmullw mm3, [rdx] ;
+ pmullw mm4, [rdx] ;
+
+ pmullw mm1, [rdx+8] ;
+ pmullw mm2, [rdx+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm2, mm4 ;
+
+ paddw mm1, [mmx_bi_rd GLOBAL] ;
+ paddw mm2, [mmx_bi_rd GLOBAL] ;
+
+ psraw mm1, mmx_filter_shift ;
+ psraw mm2, mmx_filter_shift ;
+
+ movq mm3, [rdi] ;
+ movq mm4, mm3 ;
+
+ punpcklbw mm3, mm0 ;
+ punpckhbw mm4, mm0 ;
+
+ psubw mm1, mm3 ;
+ psubw mm2, mm4 ;
+
+ paddw mm6, mm1 ;
+ pmaddwd mm1, mm1 ;
+
+ paddw mm6, mm2 ;
+ pmaddwd mm2, mm2 ;
+
+ paddd mm7, mm1 ;
+ paddd mm7, mm2 ;
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rdi, dword ptr arg(3) ;src_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
+ add rsi, r8
+ add rdi, r9
+%endif
+ sub rcx, 1 ;
+ jnz filter_block2d_bil_var_mmx_loop ;
+
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rdi, arg(7) ;sum
+ mov rsi, arg(8) ;sumsquared
+
+ movd dword ptr [rdi], mm2 ;
+ movd dword ptr [rsi], mm4 ;
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;unsigned int vp8_get16x16pred_error_mmx
+;(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride
+;)
+global sym(vp8_get16x16pred_error_mmx)
+sym(vp8_get16x16pred_error_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+ mov rsi, arg(0) ;DWORD PTR [src_ptr]
+ mov rdi, arg(2) ;DWORD PTR [ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[src_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[ref_stride]
+
+        pxor        mm0, mm0                    ; clear mm0 for unpack
+        pxor        mm7, mm7                    ; clear mm7 for accumulating diffs
+
+        pxor        mm6, mm6                    ; clear mm6 for accumulating sse
+ mov rcx, 16
+
+var16loop:
+
+ movq mm1, [rsi]
+ movq mm2, [rdi]
+
+ movq mm3, mm1
+ movq mm4, mm2
+
+ punpcklbw mm1, mm0
+ punpckhbw mm3, mm0
+
+ punpcklbw mm2, mm0
+ punpckhbw mm4, mm0
+
+ psubw mm1, mm2
+ psubw mm3, mm4
+
+ paddw mm7, mm1
+ pmaddwd mm1, mm1
+
+ paddw mm7, mm3
+ pmaddwd mm3, mm3
+
+ paddd mm6, mm1
+ paddd mm6, mm3
+
+
+ movq mm1, [rsi+8]
+ movq mm2, [rdi+8]
+
+ movq mm3, mm1
+ movq mm4, mm2
+
+ punpcklbw mm1, mm0
+ punpckhbw mm3, mm0
+
+ punpcklbw mm2, mm0
+ punpckhbw mm4, mm0
+
+ psubw mm1, mm2
+ psubw mm3, mm4
+
+ paddw mm7, mm1
+ pmaddwd mm1, mm1
+
+ paddw mm7, mm3
+ pmaddwd mm3, mm3
+
+ paddd mm6, mm1
+ paddd mm6, mm3
+
+ add rsi, rax
+ add rdi, rdx
+
+ sub rcx, 1
+ jnz var16loop
+
+
+ movq mm1, mm6
+ pxor mm6, mm6
+
+ pxor mm5, mm5
+ punpcklwd mm6, mm7
+
+ punpckhwd mm5, mm7
+ psrad mm5, 16
+
+ psrad mm6, 16
+ paddd mm6, mm5
+
+ movq mm2, mm1
+ psrlq mm1, 32
+
+ paddd mm2, mm1
+ movq mm7, mm6
+
+ psrlq mm6, 32
+ paddd mm6, mm7
+
+ movd DWORD PTR [rsp], mm6 ;Sum
+ movd DWORD PTR [rsp+4], mm2 ;SSE
+
+ ; return (SSE-((Sum*Sum)>>8));
+ movsxd rdx, dword ptr [rsp]
+ imul rdx, rdx
+ sar rdx, 8
+ movsxd rax, dword ptr [rsp + 4]
+ sub rax, rdx
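+    ; (the shift by 8 divides Sum*Sum by 256, the pixel count of the 16x16 block)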
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+SECTION_RODATA
+;short mmx_bi_rd[4] = { 64, 64, 64, 64};
+align 16
+mmx_bi_rd:
+ times 4 dw 64
diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm
new file mode 100644
index 000000000..7e5ee284b
--- /dev/null
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -0,0 +1,975 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define xmm_filter_shift 7
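+; As in the MMX version, the bilinear tap pairs sum to 128 and results are
+; rounded with xmm_bi_rd (64 = 1 << (xmm_filter_shift - 1)) before the shift by 7.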
+
+;unsigned int vp8_get_mb_ss_sse2
+;(
+; short *src_ptr
+;)
+global sym(vp8_get_mb_ss_sse2)
+sym(vp8_get_mb_ss_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 1
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+
+ mov rax, arg(0) ;[src_ptr]
+ mov rcx, 8
+ pxor xmm4, xmm4
+
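+    ; each iteration squares and accumulates 32 coefficients (64 bytes), so 8
+    ; iterations cover the 256 short coefficients of a macroblock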
+NEXTROW:
+ movdqa xmm0, [rax]
+ movdqa xmm1, [rax+16]
+ movdqa xmm2, [rax+32]
+ movdqa xmm3, [rax+48]
+ pmaddwd xmm0, xmm0
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ pmaddwd xmm3, xmm3
+
+ paddd xmm0, xmm1
+ paddd xmm2, xmm3
+ paddd xmm4, xmm0
+ paddd xmm4, xmm2
+
+ add rax, 0x40
+ dec rcx
+ ja NEXTROW
+
+ movdqa xmm3,xmm4
+ psrldq xmm4,8
+ paddd xmm4,xmm3
+ movdqa xmm3,xmm4
+ psrldq xmm4,4
+ paddd xmm4,xmm3
+ movd rax,xmm4
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_get16x16var_sse2
+;(
+; unsigned char * src_ptr,
+; int source_stride,
+; unsigned char * ref_ptr,
+; int recon_stride,
+; unsigned int * SSE,
+; int * Sum
+;)
+global sym(vp8_get16x16var_sse2)
+sym(vp8_get16x16var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[source_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
+ mov rcx, 16
+
+var16loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rdi]
+
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
+
+
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm3, xmm0
+
+ punpcklbw xmm2, xmm0
+ punpckhbw xmm4, xmm0
+
+
+ psubw xmm1, xmm2
+ psubw xmm3, xmm4
+
+ paddw xmm7, xmm1
+ pmaddwd xmm1, xmm1
+
+ paddw xmm7, xmm3
+ pmaddwd xmm3, xmm3
+
+ paddd xmm6, xmm1
+ paddd xmm6, xmm3
+
+ add rsi, rax
+ add rdi, rdx
+
+ sub rcx, 1
+ jnz var16loop
+
+
+ movdqa xmm1, xmm6
+ pxor xmm6, xmm6
+
+ pxor xmm5, xmm5
+ punpcklwd xmm6, xmm7
+
+ punpckhwd xmm5, xmm7
+ psrad xmm5, 16
+
+ psrad xmm6, 16
+ paddd xmm6, xmm5
+
+ movdqa xmm2, xmm1
+ punpckldq xmm1, xmm0
+
+ punpckhdq xmm2, xmm0
+ movdqa xmm7, xmm6
+
+ paddd xmm1, xmm2
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm7, xmm0
+ paddd xmm6, xmm7
+
+ movdqa xmm2, xmm1
+ movdqa xmm7, xmm6
+
+ psrldq xmm1, 8
+ psrldq xmm6, 8
+
+ paddd xmm7, xmm6
+ paddd xmm1, xmm2
+
+ mov rax, arg(5) ;[Sum]
+ mov rdi, arg(4) ;[SSE]
+
+ movd DWORD PTR [rax], xmm7
+ movd DWORD PTR [rdi], xmm1
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_get16x16pred_error_sse2
+;(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride
+;)
+global sym(vp8_get16x16pred_error_sse2)
+sym(vp8_get16x16pred_error_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[src_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[ref_stride]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
+ mov rcx, 16
+
+var16peloop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rdi]
+
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
+
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm3, xmm0
+
+ punpcklbw xmm2, xmm0
+ punpckhbw xmm4, xmm0
+
+ psubw xmm1, xmm2
+ psubw xmm3, xmm4
+
+ paddw xmm7, xmm1
+ pmaddwd xmm1, xmm1
+
+ paddw xmm7, xmm3
+ pmaddwd xmm3, xmm3
+
+ paddd xmm6, xmm1
+ paddd xmm6, xmm3
+
+ add rsi, rax
+ add rdi, rdx
+
+ sub rcx, 1
+ jnz var16peloop
+
+
+ movdqa xmm1, xmm6
+ pxor xmm6, xmm6
+
+ pxor xmm5, xmm5
+ punpcklwd xmm6, xmm7
+
+ punpckhwd xmm5, xmm7
+ psrad xmm5, 16
+
+ psrad xmm6, 16
+ paddd xmm6, xmm5
+
+ movdqa xmm2, xmm1
+ punpckldq xmm1, xmm0
+
+ punpckhdq xmm2, xmm0
+ movdqa xmm7, xmm6
+
+ paddd xmm1, xmm2
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm7, xmm0
+ paddd xmm6, xmm7
+
+ movdqa xmm2, xmm1
+ movdqa xmm7, xmm6
+
+ psrldq xmm1, 8
+ psrldq xmm6, 8
+
+ paddd xmm7, xmm6
+ paddd xmm1, xmm2
+
+ movd DWORD PTR [rsp], xmm7 ;Sum
+ movd DWORD PTR [rsp+4], xmm1 ;SSE
+
+ ; return (SSE-((Sum*Sum)>>8));
+ movsxd rdx, dword ptr [rsp]
+ imul rdx, rdx
+ sar rdx, 8
+ movsxd rax, dword ptr [rsp + 4]
+ sub rax, rdx
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+;unsigned int vp8_get8x8var_sse2
+;(
+; unsigned char * src_ptr,
+; int source_stride,
+; unsigned char * ref_ptr,
+; int recon_stride,
+; unsigned int * SSE,
+; int * Sum
+;)
+global sym(vp8_get8x8var_sse2)
+sym(vp8_get8x8var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[source_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ movq xmm1, QWORD PTR [rsi]
+ movq xmm2, QWORD PTR [rdi]
+
+ punpcklbw xmm1, xmm0
+ punpcklbw xmm2, xmm0
+
+ psubsw xmm1, xmm2
+ paddw xmm7, xmm1
+
+ pmaddwd xmm1, xmm1
+
+ movq xmm2, QWORD PTR[rsi + rax]
+ movq xmm3, QWORD PTR[rdi + rdx]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ movq xmm2, QWORD PTR[rsi + rax * 2]
+ movq xmm3, QWORD PTR[rdi + rdx * 2]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ lea rsi, [rsi + rax * 2]
+ lea rdi, [rdi + rdx * 2]
+ movq xmm2, QWORD PTR[rsi + rax]
+ movq xmm3, QWORD PTR[rdi + rdx]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+ movq xmm2, QWORD PTR[rsi + rax *2]
+ movq xmm3, QWORD PTR[rdi + rdx *2]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ lea rsi, [rsi + rax * 2]
+ lea rdi, [rdi + rdx * 2]
+
+
+ movq xmm2, QWORD PTR[rsi + rax]
+ movq xmm3, QWORD PTR[rdi + rdx]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+ movq xmm2, QWORD PTR[rsi + rax *2]
+ movq xmm3, QWORD PTR[rdi + rdx *2]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ lea rsi, [rsi + rax * 2]
+ lea rdi, [rdi + rdx * 2]
+
+ movq xmm2, QWORD PTR[rsi + rax]
+ movq xmm3, QWORD PTR[rdi + rdx]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ movdqa xmm6, xmm7
+ punpcklwd xmm6, xmm0
+
+ punpckhwd xmm7, xmm0
+ movdqa xmm2, xmm1
+
+ paddw xmm6, xmm7
+ punpckldq xmm1, xmm0
+
+ punpckhdq xmm2, xmm0
+ movdqa xmm7, xmm6
+
+ paddd xmm1, xmm2
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm7, xmm0
+ paddw xmm6, xmm7
+
+ movdqa xmm2, xmm1
+ movdqa xmm7, xmm6
+
+ psrldq xmm1, 8
+ psrldq xmm6, 8
+
+ paddw xmm7, xmm6
+ paddd xmm1, xmm2
+
+ mov rax, arg(5) ;[Sum]
+ mov rdi, arg(4) ;[SSE]
+
+ movd rdx, xmm7
+ movsx rcx, dx
+
+ mov dword ptr [rax], ecx
+ movd DWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_filter_block2d_bil_var_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; unsigned short *HFilter,
+; unsigned short *VFilter,
+; int *sum,
+;    unsigned int *sumsquared
+;)
+global sym(vp8_filter_block2d_bil_var_sse2)
+sym(vp8_filter_block2d_bil_var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+ pxor xmm6, xmm6 ;
+ pxor xmm7, xmm7 ;
+ mov rax, arg(5) ;HFilter ;
+
+ mov rdx, arg(6) ;VFilter ;
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+
+ pxor xmm0, xmm0 ;
+ movq xmm1, QWORD PTR [rsi] ;
+
+ movq xmm3, QWORD PTR [rsi+1] ;
+ punpcklbw xmm1, xmm0 ;
+
+ pmullw xmm1, [rax] ;
+        punpcklbw   xmm3, xmm0                  ;
+        pmullw      xmm3, [rax+16]              ;
+ paddw xmm1, xmm3 ;
+
+ paddw xmm1, [xmm_bi_rd GLOBAL] ;
+ psraw xmm1, xmm_filter_shift ;
+
+ movdqa xmm5, xmm1
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rsi, r8
+%endif
+filter_block2d_bil_var_sse2_loop:
+
+ movq xmm1, QWORD PTR [rsi] ;
+ movq xmm3, QWORD PTR [rsi+1] ;
+
+ punpcklbw xmm1, xmm0 ;
+ pmullw xmm1, [rax] ;
+
+ punpcklbw xmm3, xmm0 ;
+ pmullw xmm3, [rax+16] ;
+
+ paddw xmm1, xmm3 ;
+ paddw xmm1, [xmm_bi_rd GLOBAL] ;
+
+ psraw xmm1, xmm_filter_shift ;
+ movdqa xmm3, xmm5 ;
+
+ movdqa xmm5, xmm1 ;
+ pmullw xmm3, [rdx] ;
+
+ pmullw xmm1, [rdx+16] ;
+ paddw xmm1, xmm3 ;
+
+ paddw xmm1, [xmm_bi_rd GLOBAL] ;
+ psraw xmm1, xmm_filter_shift ;
+
+ movq xmm3, QWORD PTR [rdi] ;
+ punpcklbw xmm3, xmm0 ;
+
+ psubw xmm1, xmm3 ;
+ paddw xmm6, xmm1 ;
+
+ pmaddwd xmm1, xmm1 ;
+ paddd xmm7, xmm1 ;
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rdi, dword ptr arg(3) ;src_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
+ add rsi, r8
+ add rdi, r9
+%endif
+
+ sub rcx, 1 ;
+ jnz filter_block2d_bil_var_sse2_loop ;
+
+
+ movdq2q mm6, xmm6 ;
+ movdq2q mm7, xmm7 ;
+
+ psrldq xmm6, 8
+ psrldq xmm7, 8
+
+ movdq2q mm2, xmm6
+ movdq2q mm3, xmm7
+
+ paddw mm6, mm2
+ paddd mm7, mm3
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rsi, arg(7) ; sum
+ mov rdi, arg(8) ; sumsquared
+
+ movd [rsi], mm2 ; xsum
+ movd [rdi], mm4 ; xxsum
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_half_horiz_vert_variance16x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_horiz_vert_variance16x_h_sse2)
+sym(vp8_half_horiz_vert_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+ pxor xmm6, xmm6 ; error accumulator
+        pxor            xmm7, xmm7                  ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+
+ pxor xmm0, xmm0 ;
+
+ movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
+ movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9
+        pavgb           xmm5, xmm3              ; xmm5 = avg(xmm5,xmm3) horizontal line 1
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ; next source
+%else
+ add rsi, r8
+%endif
+
+vp8_half_horiz_vert_variance16x_h_1:
+
+ movq xmm1, QWORD PTR [rsi] ;
+ movq xmm2, QWORD PTR [rsi+1] ;
+        pavgb           xmm1, xmm2              ; xmm1 = avg(xmm1,xmm2) horizontal line i+1
+
+        pavgb           xmm5, xmm1              ; xmm5 = vertical average of the above
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+
+ movdqa xmm5, xmm1 ; save xmm1 for use on the next row
+
+%if ABI_IS_32BIT
+ add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
+ add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
+%else
+ add rsi, r8
+ add rdi, r9
+%endif
+
+ sub rcx, 1 ;
+ jnz vp8_half_horiz_vert_variance16x_h_1 ;
+
+ movdq2q mm6, xmm6 ;
+ movdq2q mm7, xmm7 ;
+
+ psrldq xmm6, 8
+ psrldq xmm7, 8
+
+ movdq2q mm2, xmm6
+ movdq2q mm3, xmm7
+
+ paddw mm6, mm2
+ paddd mm7, mm3
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rsi, arg(5) ; sum
+ mov rdi, arg(6) ; sumsquared
+
+ movd [rsi], mm2 ;
+ movd [rdi], mm4 ;
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_half_vert_variance16x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_vert_variance16x_h_sse2)
+sym(vp8_half_vert_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+ pxor xmm6, xmm6 ; error accumulator
+        pxor            xmm7, xmm7                  ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+
+ pxor xmm0, xmm0 ;
+vp8_half_vert_variance16x_h_1:
+        movq            xmm5, QWORD PTR [rsi]       ; xmm5 = s0,s1,s2..s8 of the current row
+        movq            xmm3, QWORD PTR [rsi+rax]   ; xmm3 = s0,s1,s2..s8 of the row below
+
+        pavgb           xmm5, xmm3              ; xmm5 = avg(xmm5,xmm3)
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+
+%if ABI_IS_32BIT
+ add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
+ add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
+%else
+ add rsi, r8
+ add rdi, r9
+%endif
+
+ sub rcx, 1 ;
+ jnz vp8_half_vert_variance16x_h_1 ;
+
+ movdq2q mm6, xmm6 ;
+ movdq2q mm7, xmm7 ;
+
+ psrldq xmm6, 8
+ psrldq xmm7, 8
+
+ movdq2q mm2, xmm6
+ movdq2q mm3, xmm7
+
+ paddw mm6, mm2
+ paddd mm7, mm3
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rsi, arg(5) ; sum
+ mov rdi, arg(6) ; sumsquared
+
+ movd [rsi], mm2 ;
+ movd [rdi], mm4 ;
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_half_horiz_variance16x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_horiz_variance16x_h_sse2)
+sym(vp8_half_horiz_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+ pxor xmm6, xmm6 ; error accumulator
+        pxor            xmm7, xmm7                  ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+
+ pxor xmm0, xmm0 ;
+vp8_half_horiz_variance16x16_1:
+ movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
+ movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9
+
+        pavgb           xmm5, xmm3              ; xmm5 = avg(xmm5,xmm3)
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+
+%if ABI_IS_32BIT
+ add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
+ add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
+%else
+ add rsi, r8
+ add rdi, r9
+%endif
+ sub rcx, 1 ;
+ jnz vp8_half_horiz_variance16x16_1 ;
+
+ movdq2q mm6, xmm6 ;
+ movdq2q mm7, xmm7 ;
+
+ psrldq xmm6, 8
+ psrldq xmm7, 8
+
+ movdq2q mm2, xmm6
+ movdq2q mm3, xmm7
+
+ paddw mm6, mm2
+ paddd mm7, mm3
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rsi, arg(5) ; sum
+ mov rdi, arg(6) ; sumsquared
+
+ movd [rsi], mm2 ;
+ movd [rdi], mm4 ;
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+; short xmm_bi_rd[8] = { 64, 64, 64, 64, 64, 64, 64, 64 };
+align 16
+xmm_bi_rd:
+ times 8 dw 64
diff --git a/vp8/encoder/x86/variance_mmx.c b/vp8/encoder/x86/variance_mmx.c
new file mode 100644
index 000000000..4a5b25b0d
--- /dev/null
+++ b/vp8/encoder/x86/variance_mmx.c
@@ -0,0 +1,596 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "variance.h"
+#include "pragmas.h"
+#include "vpx_ports/mem.h"
+
+extern void filter_block1d_h6_mmx
+(
+ unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ short *vp7_filter
+);
+extern void filter_block1d_v6_mmx
+(
+ short *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ short *vp7_filter
+);
+
+extern unsigned int vp8_get_mb_ss_mmx(short *src_ptr);
+extern unsigned int vp8_get8x8var_mmx
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+extern unsigned int vp8_get4x4var_mmx
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+extern unsigned int vp8_get4x4sse_cs_mmx
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride
+);
+extern void vp8_filter_block2d_bil4x4_var_mmx
+(
+ unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ const short *HFilter,
+ const short *VFilter,
+ int *sum,
+ unsigned int *sumsquared
+);
+extern void vp8_filter_block2d_bil_var_mmx
+(
+ unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ const short *HFilter,
+ const short *VFilter,
+ int *sum,
+ unsigned int *sumsquared
+);
+extern unsigned int vp8_get16x16pred_error_mmx
+(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride
+);
+
+
+void vp8_test_get_mb_ss(void)
+{
+ short zz[] =
+ {
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -2, -2, -2, -2, 2, 2, 2, 2, -2, -2, -2, -2, 2, 2, 2, 2,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -3, -3, -3, -3, 3, 3, 3, 3, -3, -3, -3, -3, 3, 3, 3, 3,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ };
+ int s = 0, x = vp8_get_mb_ss_mmx(zz);
+ {
+ int y;
+
+ for (y = 0; y < 256; y++)
+ s += (zz[y] * zz[y]);
+ }
+
+ x += 0;
+}
+
+
+unsigned int vp8_get16x16var_mmx(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned *SSE,
+ unsigned *SUM
+)
+{
+ unsigned int sse0, sse1, sse2, sse3, var;
+ int sum0, sum1, sum2, sum3, avg;
+
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+ vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
+ vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+
+ var = sse0 + sse1 + sse2 + sse3;
+ avg = sum0 + sum1 + sum2 + sum3;
+
+ *SSE = var;
+ *SUM = avg;
+ return (var - ((avg * avg) >> 8));
+
+}
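+
+/* The (avg * avg) >> N term above is Sum*Sum / pixel_count: N is 8 for a
+ * 16x16 block (256 pixels), 7 for 16x8 and 8x16 (128 pixels), 6 for 8x8
+ * (64 pixels) and 4 for 4x4 (16 pixels) in the helpers below.
+ */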
+
+
+
+
+
+unsigned int vp8_variance4x4_mmx(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int var;
+ int avg;
+
+ vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
+ *sse = var;
+ return (var - ((avg * avg) >> 4));
+
+}
+
+unsigned int vp8_variance8x8_mmx(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int var;
+ int avg;
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
+ *sse = var;
+
+ return (var - ((avg * avg) >> 6));
+
+}
+
+unsigned int vp8_mse16x16_mmx(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, sse2, sse3, var;
+ int sum0, sum1, sum2, sum3;
+
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+ vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
+ vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+
+ var = sse0 + sse1 + sse2 + sse3;
+ *sse = var;
+ return var;
+}
+
+
+unsigned int vp8_variance16x16_mmx(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ int *sse)
+{
+ unsigned int sse0, sse1, sse2, sse3, var;
+ int sum0, sum1, sum2, sum3, avg;
+
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+ vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
+ vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+
+ var = sse0 + sse1 + sse2 + sse3;
+ avg = sum0 + sum1 + sum2 + sum3;
+ *sse = var;
+ return (var - ((avg * avg) >> 8));
+}
+
+unsigned int vp8_variance16x8_mmx(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+ *sse = var;
+ return (var - ((avg * avg) >> 7));
+
+}
+
+
+unsigned int vp8_variance8x16_mmx(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+ *sse = var;
+
+ return (var - ((avg * avg) >> 7));
+
+}
+
+
+
+
+///////////////////////////////////////////////////////////////////////////
+// the mmx function that does the bilinear filtering and var calculation //
+// in one pass                                                           //
+///////////////////////////////////////////////////////////////////////////
+DECLARE_ALIGNED(16, const short, vp8_vp7_bilinear_filters_mmx[8][8]) =
+{
+ { 128, 128, 128, 128, 0, 0, 0, 0 },
+ { 112, 112, 112, 112, 16, 16, 16, 16 },
+ { 96, 96, 96, 96, 32, 32, 32, 32 },
+ { 80, 80, 80, 80, 48, 48, 48, 48 },
+ { 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 48, 48, 48, 48, 80, 80, 80, 80 },
+ { 32, 32, 32, 32, 96, 96, 96, 96 },
+ { 16, 16, 16, 16, 112, 112, 112, 112 }
+};
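+
+/* Illustrative sketch, not part of this change: each row stores the first
+ * bilinear tap in elements 0-3 and the second tap in elements 4-7, and the
+ * two taps always sum to 128.  A scalar model of what the assembly computes
+ * per pixel (a and b being the two neighbouring source samples) would be:
+ *
+ *     filtered = (a * filter[0] + b * filter[4] + 64) >> 7;
+ *
+ * xoffset selects the horizontal filter row and yoffset the vertical one.
+ */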
+
+unsigned int vp8_sub_pixel_variance4x4_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse)
+
+{
+ int xsum;
+ unsigned int xxsum;
+ vp8_filter_block2d_bil4x4_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum, &xxsum
+ );
+ *sse = xxsum;
+ return (xxsum - ((xsum * xsum) >> 4));
+}
+
+
+unsigned int vp8_sub_pixel_variance8x8_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+
+ int xsum;
+ unsigned int xxsum;
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum, &xxsum
+ );
+ *sse = xxsum;
+ return (xxsum - ((xsum * xsum) >> 6));
+}
+
+unsigned int vp8_sub_pixel_variance16x16_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum0, &xxsum0
+ );
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum1, &xxsum1
+ );
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 8));
+
+
+}
+
+unsigned int vp8_sub_pixel_mse16x16_mmx(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ vp8_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+ return *sse;
+}
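+
+/* MSE is the raw sum of squared errors, so the sub-pixel MSE above simply
+ * reuses the sub-pixel variance helper and returns the SSE it stored in *sse,
+ * ignoring the mean-corrected return value.
+ */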
+
+unsigned int vp8_sub_pixel_variance16x8_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum0, &xxsum0
+ );
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 8,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum1, &xxsum1
+ );
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 7));
+}
+
+unsigned int vp8_sub_pixel_variance8x16_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ int *sse
+)
+{
+ int xsum;
+ unsigned int xxsum;
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum, &xxsum
+ );
+ *sse = xxsum;
+ return (xxsum - ((xsum * xsum) >> 7));
+}
+
+unsigned int vp8_i_variance16x16_mmx(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, sse2, sse3, var;
+ int sum0, sum1, sum2, sum3, avg;
+
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+ vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2) ;
+ vp8_get8x8var_mmx(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3);
+
+ var = sse0 + sse1 + sse2 + sse3;
+ avg = sum0 + sum1 + sum2 + sum3;
+ *sse = var;
+ return (var - ((avg * avg) >> 8));
+
+}
+
+unsigned int vp8_i_variance8x16_mmx(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1) ;
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+
+ *sse = var;
+ return (var - ((avg * avg) >> 7));
+
+}
+
+unsigned int vp8_i_sub_pixel_variance16x16_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+ int f2soffset = (src_pixels_per_line >> 1);
+ int f2doffset = (dst_pixels_per_line >> 1);
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum0, &xxsum0
+ );
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 8,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum1, &xxsum1
+ );
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr + f2soffset, src_pixels_per_line,
+ dst_ptr + f2doffset, dst_pixels_per_line, 8,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum1, &xxsum1
+ );
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr + f2soffset + 8, src_pixels_per_line,
+ dst_ptr + f2doffset + 8, dst_pixels_per_line, 8,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum1, &xxsum1
+ );
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
+
+
+unsigned int vp8_i_sub_pixel_variance8x16_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+ int f2soffset = (src_pixels_per_line >> 1);
+ int f2doffset = (dst_pixels_per_line >> 1);
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum0, &xxsum0
+ );
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr + f2soffset, src_pixels_per_line,
+ dst_ptr + f2doffset, dst_pixels_per_line, 8,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum1, &xxsum1
+ );
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 7));
+}
diff --git a/vp8/encoder/x86/variance_sse2.c b/vp8/encoder/x86/variance_sse2.c
new file mode 100644
index 000000000..ea80753bd
--- /dev/null
+++ b/vp8/encoder/x86/variance_sse2.c
@@ -0,0 +1,514 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "variance.h"
+#include "pragmas.h"
+#include "vpx_ports/mem.h"
+
+extern void filter_block1d_h6_mmx(unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d_v6_mmx(short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d8_h6_sse2(unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d8_v6_sse2(short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+
+extern void vp8_filter_block2d_bil4x4_var_mmx
+(
+ unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ const short *HFilter,
+ const short *VFilter,
+ int *sum,
+ unsigned int *sumsquared
+);
+
+extern unsigned int vp8_get4x4var_mmx
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+
+unsigned int vp8_get_mb_ss_sse2
+(
+ short *src_ptr
+);
+unsigned int vp8_get16x16var_sse2
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+unsigned int vp8_get16x16pred_error_sse2
+(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride
+);
+unsigned int vp8_get8x8var_sse2
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+void vp8_filter_block2d_bil_var_sse2
+(
+ unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ const short *HFilter,
+ const short *VFilter,
+ int *sum,
+ unsigned int *sumsquared
+);
+void vp8_half_horiz_vert_variance16x_h_sse2
+(
+ unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+void vp8_half_horiz_variance16x_h_sse2
+(
+ unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+void vp8_half_vert_variance16x_h_sse2
+(
+ unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+
+DECLARE_ALIGNED(16, extern short, vp8_vp7_bilinear_filters_mmx[8][8]);
+
+unsigned int vp8_variance4x4_wmt(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride)
+{
+ unsigned int var;
+ int avg;
+
+ vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
+ return (var - ((avg * avg) >> 4));
+
+}
+
+
+
+unsigned int vp8_variance8x8_wmt
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride)
+{
+ unsigned int var;
+ int avg;
+
+ vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
+
+ return (var - ((avg * avg) >> 6));
+
+}
+
+
+unsigned int vp8_variance16x16_wmt
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0;
+ int sum0;
+
+
+ vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ *sse = sse0;
+ return (sse0 - ((sum0 * sum0) >> 8));
+}
+unsigned int vp8_mse16x16_wmt(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+
+ unsigned int sse0;
+ int sum0;
+ vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ *sse = sse0;
+ return sse0;
+
+}
+
+
+unsigned int vp8_variance16x8_wmt
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+
+ vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+ *sse = var;
+ return (var - ((avg * avg) >> 7));
+
+}
+
+unsigned int vp8_variance8x16_wmt
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+
+ vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+ *sse = var;
+ return (var - ((avg * avg) >> 7));
+
+}
+
+///////////////////////////////////////////////////////////////////////////
+// the sse2 functions that do the bilinear filtering and var calculation  //
+// in one pass                                                            //
+///////////////////////////////////////////////////////////////////////////
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_xmm[8][16]) =
+{
+ { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 },
+ { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
+ { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 },
+ { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 },
+ { 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 }
+};
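+/* Same coefficients as vp8_vp7_bilinear_filters_mmx, but each tap is repeated
+ * eight times so that one row fills the two full XMM loads ([x] and [x+16])
+ * used by vp8_filter_block2d_bil_var_sse2.
+ */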
+unsigned int vp8_sub_pixel_variance4x4_wmt
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum;
+ unsigned int xxsum;
+ vp8_filter_block2d_bil4x4_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum, &xxsum
+ );
+ *sse = xxsum;
+ return (xxsum - ((xsum * xsum) >> 4));
+}
+
+
+unsigned int vp8_sub_pixel_variance8x8_wmt
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+
+ int xsum;
+ unsigned int xxsum;
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+ &xsum, &xxsum
+ );
+
+ *sse = xxsum;
+ return (xxsum - ((xsum * xsum) >> 6));
+}
+
+unsigned int vp8_sub_pixel_variance16x16_wmt
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+
+ // note we could avoid these if statements if the calling function
+ // just called the appropriate functions inside.
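+    // xoffset/yoffset == 4 selects the {64, 64} half-pixel filter, which pavgb
+    // computes exactly, hence the cheaper vp8_half_* helpers for those cases.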
+ if (xoffset == 4 && yoffset == 0)
+ {
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ &xsum1, &xxsum1);
+ }
+ else if (xoffset == 0 && yoffset == 4)
+ {
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ &xsum1, &xxsum1);
+ }
+ else if (xoffset == 4 && yoffset == 4)
+ {
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ &xsum1, &xxsum1);
+ }
+ else
+ {
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+ &xsum0, &xxsum0
+ );
+
+
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+ &xsum1, &xxsum1
+ );
+ }
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
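+
+/* An offset of 4 selects the { 64, 64 } filter row, i.e. an exact half-pixel
+ * average, so those cases are routed to the dedicated horizontal / vertical /
+ * diagonal averaging kernels; every other offset pair takes the general
+ * two-tap filter path.
+ */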
+
+unsigned int vp8_sub_pixel_mse16x16_wmt(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+ return *sse;
+}
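+
+/* The sub-pixel MSE reuses the 16x16 sub-pixel variance routine for its SSE
+ * side effect and discards the mean-removed return value.
+ */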
+
+unsigned int vp8_sub_pixel_variance16x8_wmt
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+
+)
+{
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+ &xsum0, &xxsum0
+ );
+
+
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 8,
+ vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+ &xsum1, &xxsum1
+ );
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 7));
+}
+
+unsigned int vp8_sub_pixel_variance8x16_wmt
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum;
+ unsigned int xxsum;
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+ &xsum, &xxsum
+ );
+
+ *sse = xxsum;
+ return (xxsum - ((xsum * xsum) >> 7));
+}
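+
+/* The vp8_i_* variants below appear to address a half-stride (field style)
+ * layout: judging from the offsets, the second 8x8 block is taken half a
+ * stride into the row rather than eight rows down, and the sub-pixel
+ * wrappers forward a halved stride. This reading is inferred from the code
+ * here, not from a separate spec.
+ */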
+
+unsigned int vp8_i_variance16x16_wmt(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, sse2, sse3, var;
+ int sum0, sum1, sum2, sum3, avg;
+
+
+ vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
+ vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+ vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2);
+ vp8_get8x8var_sse2(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3);
+
+ var = sse0 + sse1 + sse2 + sse3;
+ avg = sum0 + sum1 + sum2 + sum3;
+
+ *sse = var;
+ return (var - ((avg * avg) >> 8));
+
+}
+
+unsigned int vp8_i_variance8x16_wmt(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+ vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
+ vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1);
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+
+ *sse = var;
+ return (var - ((avg * avg) >> 7));
+
+}
+
+
+unsigned int vp8_i_sub_pixel_variance16x16_wmt
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ return vp8_sub_pixel_variance16x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse);
+}
+
+
+unsigned int vp8_i_sub_pixel_variance8x16_wmt
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+
+ return vp8_sub_pixel_variance8x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse);
+}
diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h
new file mode 100644
index 000000000..35fc90c48
--- /dev/null
+++ b/vp8/encoder/x86/variance_x86.h
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef VARIANCE_X86_H
+#define VARIANCE_X86_H
+
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code (x86_csystemdependent.c).
+ */
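+
+/* Illustrative sketch only: with CONFIG_RUNTIME_CPU_DETECT disabled and
+ * HAVE_SSE2 set, a call written against the generic dispatch macro from
+ * variance.h (assumed here to be VARIANCE_INVOKE), e.g.
+ *
+ *   unsigned int sse;
+ *   unsigned int var = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)
+ *                          (src, src_stride, ref, ref_stride, &sse);
+ *
+ * resolves at compile time to vp8_variance16x16_wmt() through the remappings
+ * below. With runtime detection enabled the same call goes through the
+ * function pointers filled in by vp8_arch_x86_encoder_init().
+ */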
+#if HAVE_MMX
+extern prototype_sad(vp8_sad4x4_mmx);
+extern prototype_sad(vp8_sad8x8_mmx);
+extern prototype_sad(vp8_sad8x16_mmx);
+extern prototype_sad(vp8_sad16x8_mmx);
+extern prototype_sad(vp8_sad16x16_mmx);
+extern prototype_variance(vp8_variance4x4_mmx);
+extern prototype_variance(vp8_variance8x8_mmx);
+extern prototype_variance(vp8_variance8x16_mmx);
+extern prototype_variance(vp8_variance16x8_mmx);
+extern prototype_variance(vp8_variance16x16_mmx);
+extern prototype_subpixvariance(vp8_sub_pixel_variance4x4_mmx);
+extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_mmx);
+extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_mmx);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_mmx);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_mmx);
+extern prototype_subpixvariance(vp8_sub_pixel_mse16x16_mmx);
+extern prototype_getmbss(vp8_get_mb_ss_mmx);
+extern prototype_variance(vp8_mse16x16_mmx);
+extern prototype_sad(vp8_get16x16pred_error_mmx);
+extern prototype_variance2(vp8_get8x8var_mmx);
+extern prototype_variance2(vp8_get16x16var_mmx);
+extern prototype_sad(vp8_get4x4sse_cs_mmx);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_variance_sad4x4
+#define vp8_variance_sad4x4 vp8_sad4x4_mmx
+
+#undef vp8_variance_sad8x8
+#define vp8_variance_sad8x8 vp8_sad8x8_mmx
+
+#undef vp8_variance_sad8x16
+#define vp8_variance_sad8x16 vp8_sad8x16_mmx
+
+#undef vp8_variance_sad16x8
+#define vp8_variance_sad16x8 vp8_sad16x8_mmx
+
+#undef vp8_variance_sad16x16
+#define vp8_variance_sad16x16 vp8_sad16x16_mmx
+
+#undef vp8_variance_var4x4
+#define vp8_variance_var4x4 vp8_variance4x4_mmx
+
+#undef vp8_variance_var8x8
+#define vp8_variance_var8x8 vp8_variance8x8_mmx
+
+#undef vp8_variance_var8x16
+#define vp8_variance_var8x16 vp8_variance8x16_mmx
+
+#undef vp8_variance_var16x8
+#define vp8_variance_var16x8 vp8_variance16x8_mmx
+
+#undef vp8_variance_var16x16
+#define vp8_variance_var16x16 vp8_variance16x16_mmx
+
+#undef vp8_variance_subpixvar4x4
+#define vp8_variance_subpixvar4x4 vp8_sub_pixel_variance4x4_mmx
+
+#undef vp8_variance_subpixvar8x8
+#define vp8_variance_subpixvar8x8 vp8_sub_pixel_variance8x8_mmx
+
+#undef vp8_variance_subpixvar8x16
+#define vp8_variance_subpixvar8x16 vp8_sub_pixel_variance8x16_mmx
+
+#undef vp8_variance_subpixvar16x8
+#define vp8_variance_subpixvar16x8 vp8_sub_pixel_variance16x8_mmx
+
+#undef vp8_variance_subpixvar16x16
+#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_mmx
+
+#undef vp8_variance_subpixmse16x16
+#define vp8_variance_subpixmse16x16 vp8_sub_pixel_mse16x16_mmx
+
+#undef vp8_variance_getmbss
+#define vp8_variance_getmbss vp8_get_mb_ss_mmx
+
+#undef vp8_variance_mse16x16
+#define vp8_variance_mse16x16 vp8_mse16x16_mmx
+
+#undef vp8_variance_get16x16prederror
+#define vp8_variance_get16x16prederror vp8_get16x16pred_error_mmx
+
+#undef vp8_variance_get8x8var
+#define vp8_variance_get8x8var vp8_get8x8var_mmx
+
+#undef vp8_variance_get16x16var
+#define vp8_variance_get16x16var vp8_get16x16var_mmx
+
+#undef vp8_variance_get4x4sse_cs
+#define vp8_variance_get4x4sse_cs vp8_get4x4sse_cs_mmx
+
+#endif
+#endif
+
+
+#if HAVE_SSE2
+extern prototype_sad(vp8_sad4x4_wmt);
+extern prototype_sad(vp8_sad8x8_wmt);
+extern prototype_sad(vp8_sad8x16_wmt);
+extern prototype_sad(vp8_sad16x8_wmt);
+extern prototype_sad(vp8_sad16x16_wmt);
+extern prototype_variance(vp8_variance4x4_wmt);
+extern prototype_variance(vp8_variance8x8_wmt);
+extern prototype_variance(vp8_variance8x16_wmt);
+extern prototype_variance(vp8_variance16x8_wmt);
+extern prototype_variance(vp8_variance16x16_wmt);
+extern prototype_subpixvariance(vp8_sub_pixel_variance4x4_wmt);
+extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_wmt);
+extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_wmt);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_wmt);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_wmt);
+extern prototype_subpixvariance(vp8_sub_pixel_mse16x16_wmt);
+extern prototype_getmbss(vp8_get_mb_ss_sse2);
+extern prototype_variance(vp8_mse16x16_wmt);
+extern prototype_sad(vp8_get16x16pred_error_sse2);
+extern prototype_variance2(vp8_get8x8var_sse2);
+extern prototype_variance2(vp8_get16x16var_sse2);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_variance_sad4x4
+#define vp8_variance_sad4x4 vp8_sad4x4_wmt
+
+#undef vp8_variance_sad8x8
+#define vp8_variance_sad8x8 vp8_sad8x8_wmt
+
+#undef vp8_variance_sad8x16
+#define vp8_variance_sad8x16 vp8_sad8x16_wmt
+
+#undef vp8_variance_sad16x8
+#define vp8_variance_sad16x8 vp8_sad16x8_wmt
+
+#undef vp8_variance_sad16x16
+#define vp8_variance_sad16x16 vp8_sad16x16_wmt
+
+#undef vp8_variance_var4x4
+#define vp8_variance_var4x4 vp8_variance4x4_wmt
+
+#undef vp8_variance_var8x8
+#define vp8_variance_var8x8 vp8_variance8x8_wmt
+
+#undef vp8_variance_var8x16
+#define vp8_variance_var8x16 vp8_variance8x16_wmt
+
+#undef vp8_variance_var16x8
+#define vp8_variance_var16x8 vp8_variance16x8_wmt
+
+#undef vp8_variance_var16x16
+#define vp8_variance_var16x16 vp8_variance16x16_wmt
+
+#undef vp8_variance_subpixvar4x4
+#define vp8_variance_subpixvar4x4 vp8_sub_pixel_variance4x4_wmt
+
+#undef vp8_variance_subpixvar8x8
+#define vp8_variance_subpixvar8x8 vp8_sub_pixel_variance8x8_wmt
+
+#undef vp8_variance_subpixvar8x16
+#define vp8_variance_subpixvar8x16 vp8_sub_pixel_variance8x16_wmt
+
+#undef vp8_variance_subpixvar16x8
+#define vp8_variance_subpixvar16x8 vp8_sub_pixel_variance16x8_wmt
+
+#undef vp8_variance_subpixvar16x16
+#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_wmt
+
+#undef vp8_variance_subpixmse16x16
+#define vp8_variance_subpixmse16x16 vp8_sub_pixel_mse16x16_wmt
+
+#undef vp8_variance_getmbss
+#define vp8_variance_getmbss vp8_get_mb_ss_sse2
+
+#undef vp8_variance_mse16x16
+#define vp8_variance_mse16x16 vp8_mse16x16_wmt
+
+#undef vp8_variance_get16x16prederror
+#define vp8_variance_get16x16prederror vp8_get16x16pred_error_sse2
+
+#undef vp8_variance_get8x8var
+#define vp8_variance_get8x8var vp8_get8x8var_sse2
+
+#undef vp8_variance_get16x16var
+#define vp8_variance_get16x16var vp8_get16x16var_sse2
+
+#endif
+#endif
+
+
+#if HAVE_SSE3
+extern prototype_sad(vp8_sad16x16_sse3);
+extern prototype_sad(vp8_sad16x8_sse3);
+extern prototype_sad_multi_same_address(vp8_sad16x16x3_sse3);
+extern prototype_sad_multi_same_address(vp8_sad16x8x3_sse3);
+extern prototype_sad_multi_same_address(vp8_sad8x16x3_sse3);
+extern prototype_sad_multi_same_address(vp8_sad8x8x3_sse3);
+extern prototype_sad_multi_same_address(vp8_sad4x4x3_sse3);
+
+extern prototype_sad_multi_dif_address(vp8_sad16x16x4d_sse3);
+extern prototype_sad_multi_dif_address(vp8_sad16x8x4d_sse3);
+extern prototype_sad_multi_dif_address(vp8_sad8x16x4d_sse3);
+extern prototype_sad_multi_dif_address(vp8_sad8x8x4d_sse3);
+extern prototype_sad_multi_dif_address(vp8_sad4x4x4d_sse3);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef vp8_variance_sad16x16
+#define vp8_variance_sad16x16 vp8_sad16x16_sse3
+
+#undef vp8_variance_sad16x16x3
+#define vp8_variance_sad16x16x3 vp8_sad16x16x3_sse3
+
+#undef vp8_variance_sad16x8x3
+#define vp8_variance_sad16x8x3 vp8_sad16x8x3_sse3
+
+#undef vp8_variance_sad8x16x3
+#define vp8_variance_sad8x16x3 vp8_sad8x16x3_sse3
+
+#undef vp8_variance_sad8x8x3
+#define vp8_variance_sad8x8x3 vp8_sad8x8x3_sse3
+
+#undef vp8_variance_sad4x4x3
+#define vp8_variance_sad4x4x3 vp8_sad4x4x3_sse3
+
+#undef vp8_variance_sad16x16x4d
+#define vp8_variance_sad16x16x4d vp8_sad16x16x4d_sse3
+
+#undef vp8_variance_sad16x8x4d
+#define vp8_variance_sad16x8x4d vp8_sad16x8x4d_sse3
+
+#undef vp8_variance_sad8x16x4d
+#define vp8_variance_sad8x16x4d vp8_sad8x16x4d_sse3
+
+#undef vp8_variance_sad8x8x4d
+#define vp8_variance_sad8x8x4d vp8_sad8x8x4d_sse3
+
+#undef vp8_variance_sad4x4x4d
+#define vp8_variance_sad4x4x4d vp8_sad4x4x4d_sse3
+
+#endif
+#endif
+
+
+#if HAVE_SSSE3
+extern prototype_sad_multi_same_address(vp8_sad16x16x3_ssse3);
+extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_variance_sad16x16x3
+#define vp8_variance_sad16x16x3 vp8_sad16x16x3_ssse3
+
+#undef vp8_variance_sad16x8x3
+#define vp8_variance_sad16x8x3 vp8_sad16x8x3_ssse3
+
+#endif
+#endif
+
+#endif
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
new file mode 100644
index 000000000..f1391ba8c
--- /dev/null
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/x86.h"
+#include "variance.h"
+#include "onyx_int.h"
+
+
+#if HAVE_MMX
+void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch)
+{
+ vp8_short_fdct4x4_mmx(input, output, pitch);
+ vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch);
+}
+
+void vp8_fast_fdct8x4_mmx(short *input, short *output, int pitch)
+{
+ vp8_fast_fdct4x4_mmx(input, output, pitch);
+ vp8_fast_fdct4x4_mmx(input + 4, output + 16, pitch);
+}
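+
+/* Both 8x4 forward transforms are assembled from two 4x4 MMX transforms: the
+ * second call starts four columns into the input block and writes its 16
+ * coefficients immediately after the first block's output.
+ */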
+
+int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
+ short *qcoeff_ptr, short *dequant_ptr,
+ short *scan_mask, short *round_ptr,
+ short *quant_ptr, short *dqcoeff_ptr);
+void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d)
+{
+ short *scan_mask = vp8_default_zig_zag_mask; /* d->scan_order_mask_ptr */
+ short *coeff_ptr = &b->coeff[0];
+ short *zbin_ptr = &b->zbin[0][0];
+ short *round_ptr = &b->round[0][0];
+ short *quant_ptr = &b->quant[0][0];
+ short *qcoeff_ptr = d->qcoeff;
+ short *dqcoeff_ptr = d->dqcoeff;
+ short *dequant_ptr = &d->dequant[0][0];
+
+ d->eob = vp8_fast_quantize_b_impl_mmx(
+ coeff_ptr,
+ zbin_ptr,
+ qcoeff_ptr,
+ dequant_ptr,
+ scan_mask,
+
+ round_ptr,
+ quant_ptr,
+ dqcoeff_ptr
+ );
+}
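+
+/* Wrapper pattern used throughout this file: the C function unpacks the
+ * encoder's BLOCK / BLOCKD / MACROBLOCK structures into raw pointers and
+ * forwards them to the assembly *_impl routine; here the returned value is
+ * the end-of-block index stored in d->eob.
+ */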
+
+int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+int vp8_mbblock_error_mmx(MACROBLOCK *mb, int dc)
+{
+ short *coeff_ptr = mb->block[0].coeff;
+ short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
+ return vp8_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr, dc);
+}
+
+int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
+int vp8_mbuverror_mmx(MACROBLOCK *mb)
+{
+ short *s_ptr = &mb->coeff[256];
+ short *d_ptr = &mb->e_mbd.dqcoeff[256];
+ return vp8_mbuverror_mmx_impl(s_ptr, d_ptr);
+}
+
+void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
+ short *diff, unsigned char *predictor,
+ int pitch);
+void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
+{
+ unsigned char *z = *(be->base_src) + be->src;
+ unsigned int src_stride = be->src_stride;
+ short *diff = &be->src_diff[0];
+ unsigned char *predictor = &bd->predictor[0];
+ vp8_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch);
+}
+
+#endif
+
+#if HAVE_SSE2
+void vp8_short_fdct8x4_wmt(short *input, short *output, int pitch)
+{
+ vp8_short_fdct4x4_wmt(input, output, pitch);
+ vp8_short_fdct4x4_wmt(input + 4, output + 16, pitch);
+}
+
+int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr,
+ short *qcoeff_ptr, short *dequant_ptr,
+ short *scan_mask, short *round_ptr,
+ short *quant_ptr, short *dqcoeff_ptr);
+void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d)
+{
+ short *scan_mask = vp8_default_zig_zag_mask; /* d->scan_order_mask_ptr */
+ short *coeff_ptr = &b->coeff[0];
+ short *zbin_ptr = &b->zbin[0][0];
+ short *round_ptr = &b->round[0][0];
+ short *quant_ptr = &b->quant[0][0];
+ short *qcoeff_ptr = d->qcoeff;
+ short *dqcoeff_ptr = d->dqcoeff;
+ short *dequant_ptr = &d->dequant[0][0];
+
+ d->eob = vp8_fast_quantize_b_impl_sse(
+ coeff_ptr,
+ zbin_ptr,
+ qcoeff_ptr,
+ dequant_ptr,
+ scan_mask,
+
+ round_ptr,
+ quant_ptr,
+ dqcoeff_ptr
+ );
+}
+
+int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+int vp8_mbblock_error_xmm(MACROBLOCK *mb, int dc)
+{
+ short *coeff_ptr = mb->block[0].coeff;
+ short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
+ return vp8_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr, dc);
+}
+
+int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
+int vp8_mbuverror_xmm(MACROBLOCK *mb)
+{
+ short *s_ptr = &mb->coeff[256];
+ short *d_ptr = &mb->e_mbd.dqcoeff[256];
+ return vp8_mbuverror_xmm_impl(s_ptr, d_ptr);
+}
+
+#endif
+
+void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
+{
+#if CONFIG_RUNTIME_CPU_DETECT
+ int flags = x86_simd_caps();
+ int mmx_enabled = flags & HAS_MMX;
+ int xmm_enabled = flags & HAS_SSE;
+ int wmt_enabled = flags & HAS_SSE2;
+ int SSE3Enabled = flags & HAS_SSE3;
+ int SSSE3Enabled = flags & HAS_SSSE3;
+
+ /* Note:
+ *
+ * This platform can be built without runtime CPU detection as well. If
+ * you modify any of the function mappings present in this file, be sure
+ * to also update them in the static mappings (<arch>/filename_<arch>.h).
+ */
+
+ /* Override default functions with fastest ones for this CPU. */
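+ /* The blocks below run in order of increasing capability (MMX, then SSE2,
+  * SSE3, SSSE3), so a later block overrides any pointer already set by an
+  * earlier one and the fastest supported extension wins.
+  */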
+#if HAVE_MMX
+
+ if (mmx_enabled)
+ {
+ cpi->rtcd.variance.sad16x16 = vp8_sad16x16_mmx;
+ cpi->rtcd.variance.sad16x8 = vp8_sad16x8_mmx;
+ cpi->rtcd.variance.sad8x16 = vp8_sad8x16_mmx;
+ cpi->rtcd.variance.sad8x8 = vp8_sad8x8_mmx;
+ cpi->rtcd.variance.sad4x4 = vp8_sad4x4_mmx;
+
+ cpi->rtcd.variance.var4x4 = vp8_variance4x4_mmx;
+ cpi->rtcd.variance.var8x8 = vp8_variance8x8_mmx;
+ cpi->rtcd.variance.var8x16 = vp8_variance8x16_mmx;
+ cpi->rtcd.variance.var16x8 = vp8_variance16x8_mmx;
+ cpi->rtcd.variance.var16x16 = vp8_variance16x16_mmx;
+
+ cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_mmx;
+ cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_mmx;
+ cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_mmx;
+ cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_mmx;
+ cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_mmx;
+ cpi->rtcd.variance.subpixmse16x16 = vp8_sub_pixel_mse16x16_mmx;
+
+ cpi->rtcd.variance.mse16x16 = vp8_mse16x16_mmx;
+ cpi->rtcd.variance.getmbss = vp8_get_mb_ss_mmx;
+
+ cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_mmx;
+ cpi->rtcd.variance.get8x8var = vp8_get8x8var_mmx;
+ cpi->rtcd.variance.get16x16var = vp8_get16x16var_mmx;
+ cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_mmx;
+
+ cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_mmx;
+ cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_mmx;
+ cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_mmx;
+ cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_mmx;
+ cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c;
+
+ cpi->rtcd.encodemb.berr = vp8_block_error_mmx;
+ cpi->rtcd.encodemb.mberr = vp8_mbblock_error_mmx;
+ cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_mmx;
+ cpi->rtcd.encodemb.subb = vp8_subtract_b_mmx;
+ cpi->rtcd.encodemb.submby = vp8_subtract_mby_mmx;
+ cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_mmx;
+
+ cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_mmx;
+ }
+
+#endif
+#if HAVE_SSE2
+
+ if (wmt_enabled)
+ {
+ cpi->rtcd.variance.sad16x16 = vp8_sad16x16_wmt;
+ cpi->rtcd.variance.sad16x8 = vp8_sad16x8_wmt;
+ cpi->rtcd.variance.sad8x16 = vp8_sad8x16_wmt;
+ cpi->rtcd.variance.sad8x8 = vp8_sad8x8_wmt;
+ cpi->rtcd.variance.sad4x4 = vp8_sad4x4_wmt;
+
+ cpi->rtcd.variance.var4x4 = vp8_variance4x4_wmt;
+ cpi->rtcd.variance.var8x8 = vp8_variance8x8_wmt;
+ cpi->rtcd.variance.var8x16 = vp8_variance8x16_wmt;
+ cpi->rtcd.variance.var16x8 = vp8_variance16x8_wmt;
+ cpi->rtcd.variance.var16x16 = vp8_variance16x16_wmt;
+
+ cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_wmt;
+ cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_wmt;
+ cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_wmt;
+ cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_wmt;
+ cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_wmt;
+ cpi->rtcd.variance.subpixmse16x16 = vp8_sub_pixel_mse16x16_wmt;
+
+ cpi->rtcd.variance.mse16x16 = vp8_mse16x16_wmt;
+ cpi->rtcd.variance.getmbss = vp8_get_mb_ss_sse2;
+
+ cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_sse2;
+ cpi->rtcd.variance.get8x8var = vp8_get8x8var_sse2;
+ cpi->rtcd.variance.get16x16var = vp8_get16x16var_sse2;
+ /* cpi->rtcd.variance.get4x4sse_cs not implemented for wmt */
+
+#if 0
+ /* short SSE2 DCT currently disabled, does not match the MMX version */
+ cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_wmt;
+ cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_wmt;
+#endif
+ /* cpi->rtcd.fdct.fast4x4 not implemented for wmt */
+ cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_wmt;
+ cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_sse2;
+
+ cpi->rtcd.encodemb.berr = vp8_block_error_xmm;
+ cpi->rtcd.encodemb.mberr = vp8_mbblock_error_xmm;
+ cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_xmm;
+ /* cpi->rtcd.encodemb.sub* not implemented for wmt */
+
+ cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse;
+ }
+
+#endif
+#if HAVE_SSE3
+
+ if (SSE3Enabled)
+ {
+ cpi->rtcd.variance.sad16x16 = vp8_sad16x16_sse3;
+ cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_sse3;
+ cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_sse3;
+ cpi->rtcd.variance.sad8x16x3 = vp8_sad8x16x3_sse3;
+ cpi->rtcd.variance.sad8x8x3 = vp8_sad8x8x3_sse3;
+ cpi->rtcd.variance.sad4x4x3 = vp8_sad4x4x3_sse3;
+ cpi->rtcd.search.full_search = vp8_full_search_sadx3;
+
+ cpi->rtcd.variance.sad16x16x4d = vp8_sad16x16x4d_sse3;
+ cpi->rtcd.variance.sad16x8x4d = vp8_sad16x8x4d_sse3;
+ cpi->rtcd.variance.sad8x16x4d = vp8_sad8x16x4d_sse3;
+ cpi->rtcd.variance.sad8x8x4d = vp8_sad8x8x4d_sse3;
+ cpi->rtcd.variance.sad4x4x4d = vp8_sad4x4x4d_sse3;
+ cpi->rtcd.search.diamond_search = vp8_diamond_search_sadx4;
+ }
+
+#endif
+#if HAVE_SSSE3
+
+ if (SSSE3Enabled)
+ {
+ cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_ssse3;
+ cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_ssse3;
+ }
+
+#endif
+#endif
+}