26 files changed, 2313 insertions, 1584 deletions
diff --git a/vp8/encoder/x86/csystemdependent.c b/vp8/encoder/x86/csystemdependent.c
deleted file mode 100644
index 186ee6856..000000000
--- a/vp8/encoder/x86/csystemdependent.c
+++ /dev/null
@@ -1,289 +0,0 @@
-/*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
- */
-
-
-#include "variance.h"
-#include "onyx_int.h"
-
-SADFunction *vp8_sad16x16;
-SADFunction *vp8_sad16x8;
-SADFunction *vp8_sad8x16;
-SADFunction *vp8_sad8x8;
-SADFunction *vp8_sad4x4;
-
-variance_function *vp8_variance4x4;
-variance_function *vp8_variance8x8;
-variance_function *vp8_variance8x16;
-variance_function *vp8_variance16x8;
-variance_function *vp8_variance16x16;
-
-
-variance_function *vp8_mse16x16;
-
-sub_pixel_variance_function *vp8_sub_pixel_variance4x4;
-sub_pixel_variance_function *vp8_sub_pixel_variance8x8;
-sub_pixel_variance_function *vp8_sub_pixel_variance8x16;
-sub_pixel_variance_function *vp8_sub_pixel_variance16x8;
-sub_pixel_variance_function *vp8_sub_pixel_variance16x16;
-
-int (*vp8_block_error)(short *, short *);
-int (*vp8_mbblock_error)(MACROBLOCK *mb, int dc);
-void (*vp8_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, int stride);
-
-extern void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride);
-extern void vp8_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride);
-
-extern int vp8_block_error_c(short *, short *);
-extern int vp8_mbblock_error_c(MACROBLOCK *x, int dc);
-
-extern int vp8_block_error_mmx(short *, short *);
-extern int vp8_mbblock_error_mmx(MACROBLOCK *x, int dc);
-
-extern int vp8_block_error_xmm(short *, short *);
-extern int vp8_mbblock_error_xmm(MACROBLOCK *x, int dc);
-
-
-
-int (*vp8_mbuverror)(MACROBLOCK *mb);
-unsigned int (*vp8_get_mb_ss)(short *);
-void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
-void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
-void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
-void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);
-
-void (*vp8_subtract_b)(BLOCK *be, BLOCKD *bd, int pitch);
-void (*vp8_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
-unsigned int (*vp8_get16x16pred_error)(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
-unsigned int (*vp8_get8x8var)(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
-unsigned int (*vp8_get16x16var)(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
-unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride);
-
-// c imports
-extern int vp8_mbuverror_c(MACROBLOCK *mb);
-extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
-extern void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
-extern void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
-extern void vp8_fast_fdct4x4_c(short *input, short *output, int pitch);
-extern void vp8_fast_fdct8x4_c(short *input, short *output, int pitch);
-
-
-extern void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch);
-extern void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
-
-extern SADFunction vp8_sad16x16_c;
-extern SADFunction vp8_sad16x8_c;
-extern SADFunction vp8_sad8x16_c;
-extern SADFunction vp8_sad8x8_c;
-extern SADFunction vp8_sad4x4_c;
-
-extern SADFunction vp8_sad16x16_wmt;
-extern SADFunction vp8_sad16x8_wmt;
-extern SADFunction vp8_sad8x16_wmt;
-extern SADFunction vp8_sad8x8_wmt;
-extern SADFunction vp8_sad4x4_wmt;
-
-extern SADFunction vp8_sad16x16_mmx;
-extern SADFunction vp8_sad16x8_mmx;
-extern SADFunction vp8_sad8x16_mmx;
-extern SADFunction vp8_sad8x8_mmx;
-extern SADFunction vp8_sad4x4_mmx;
-
-extern variance_function vp8_variance16x16_c;
-extern variance_function vp8_variance8x16_c;
-extern variance_function vp8_variance16x8_c;
-extern variance_function vp8_variance8x8_c;
-extern variance_function vp8_variance4x4_c;
-extern variance_function vp8_mse16x16_c;
-
-extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_c;
-extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_c;
-extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_c;
-extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_c;
-extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_c;
-
-extern unsigned int vp8_get_mb_ss_c(short *);
-extern unsigned int vp8_get16x16pred_error_c(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
-extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
-extern unsigned int vp8_get16x16var_c(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
-extern unsigned int vp8_get4x4sse_cs_c(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride);
-
-// mmx imports
-extern int vp8_mbuverror_mmx(MACROBLOCK *mb);
-extern void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d);
-extern void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch);
-extern void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-extern void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch);
-extern void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch);
-extern void vp8_fast_fdct8x4_mmx(short *input, short *output, int pitch);
-extern void vp8_fast_fdct4x4_mmx(short *input, short *output, int pitch);
-extern variance_function vp8_variance4x4_mmx;
-extern variance_function vp8_variance8x8_mmx;
-extern variance_function vp8_variance8x16_mmx;
-extern variance_function vp8_variance16x8_mmx;
-extern variance_function vp8_variance16x16_mmx;
-
-extern variance_function vp8_mse16x16_mmx;
-extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_mmx;
-extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_mmx;
-extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_mmx;
-extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_mmx;
-extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_mmx;
-
-extern unsigned int vp8_get16x16pred_error_mmx(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
-extern unsigned int vp8_get_mb_ss_mmx(short *);
-extern unsigned int vp8_get8x8var_mmx(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
-extern unsigned int vp8_get16x16var_mmx(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
-extern unsigned int vp8_get4x4sse_cs_mmx(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride);
-
-
-// wmt imports
-extern int vp8_mbuverror_xmm(MACROBLOCK *mb);
-extern void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d);
-extern void vp8_fast_fdct8x4_wmt(short *input, short *output, int pitch);
-extern variance_function vp8_variance4x4_wmt;
-extern variance_function vp8_variance8x8_wmt;
-extern variance_function vp8_variance8x16_wmt;
-extern variance_function vp8_variance16x8_wmt;
-extern variance_function vp8_variance16x16_wmt;
-
-extern variance_function vp8_mse16x16_wmt;
-extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_wmt;
-extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_wmt;
-extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_wmt;
-extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_wmt;
-extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_wmt;
-extern unsigned int vp8_get16x16pred_error_sse2(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
-extern unsigned int vp8_get_mb_ss_sse2(short *src_ptr);
-extern unsigned int vp8_get8x8var_sse2(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
-extern unsigned int vp8_get16x16var_sse2(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
-
-extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled);
-
-void vp8_cmachine_specific_config(void)
-{
-    int mmx_enabled;
-    int xmm_enabled;
-    int wmt_enabled;
-
-    vpx_get_processor_flags(&mmx_enabled, &xmm_enabled, &wmt_enabled);
-
-    if (wmt_enabled)         // Willamette
-    {
-        // Willamette instruction set available:
-        vp8_mbuverror                = vp8_mbuverror_xmm;
-        vp8_fast_quantize_b            = vp8_fast_quantize_b_sse;
-        vp8_short_fdct4x4             = vp8_short_fdct4x4_mmx;
-        vp8_short_fdct8x4             = vp8_short_fdct8x4_mmx;
-        vp8_fast_fdct4x4              = vp8_fast_fdct4x4_mmx;
-        vp8_fast_fdct8x4              = vp8_fast_fdct8x4_wmt;
-        vp8_subtract_b                = vp8_subtract_b_mmx;
-        vp8_subtract_mbuv             = vp8_subtract_mbuv_mmx;
-        vp8_variance4x4              = vp8_variance4x4_mmx;
-        vp8_variance8x8              = vp8_variance8x8_mmx;
-        vp8_variance8x16             = vp8_variance8x16_wmt;
-        vp8_variance16x8             = vp8_variance16x8_wmt;
-        vp8_variance16x16            = vp8_variance16x16_wmt;
-        vp8_mse16x16                 = vp8_mse16x16_wmt;
-        vp8_sub_pixel_variance4x4      = vp8_sub_pixel_variance4x4_wmt;
-        vp8_sub_pixel_variance8x8      = vp8_sub_pixel_variance8x8_wmt;
-        vp8_sub_pixel_variance8x16     = vp8_sub_pixel_variance8x16_wmt;
-        vp8_sub_pixel_variance16x8     = vp8_sub_pixel_variance16x8_wmt;
-        vp8_sub_pixel_variance16x16    = vp8_sub_pixel_variance16x16_wmt;
-        vp8_get_mb_ss                  = vp8_get_mb_ss_sse2;
-        vp8_get16x16pred_error        = vp8_get16x16pred_error_sse2;
-        vp8_get8x8var                = vp8_get8x8var_sse2;
-        vp8_get16x16var              = vp8_get16x16var_sse2;
-        vp8_get4x4sse_cs             = vp8_get4x4sse_cs_mmx;
-        vp8_sad16x16                 = vp8_sad16x16_wmt;
-        vp8_sad16x8                  = vp8_sad16x8_wmt;
-        vp8_sad8x16                  = vp8_sad8x16_wmt;
-        vp8_sad8x8                   = vp8_sad8x8_wmt;
-        vp8_sad4x4                   = vp8_sad4x4_wmt;
-        vp8_block_error               = vp8_block_error_xmm;
-        vp8_mbblock_error             = vp8_mbblock_error_xmm;
-        vp8_subtract_mby              = vp8_subtract_mby_mmx;
-
-    }
-    else if (mmx_enabled)
-    {
-        // MMX instruction set available:
-        vp8_mbuverror                = vp8_mbuverror_mmx;
-        vp8_fast_quantize_b            = vp8_fast_quantize_b_mmx;
-        vp8_short_fdct4x4             = vp8_short_fdct4x4_mmx;
-        vp8_short_fdct8x4             = vp8_short_fdct8x4_mmx;
-        vp8_fast_fdct4x4              = vp8_fast_fdct4x4_mmx;
-        vp8_fast_fdct8x4              = vp8_fast_fdct8x4_mmx;
-        vp8_subtract_b                = vp8_subtract_b_mmx;
-        vp8_subtract_mbuv             = vp8_subtract_mbuv_mmx;
-        vp8_variance4x4              = vp8_variance4x4_mmx;
-        vp8_variance8x8              = vp8_variance8x8_mmx;
-        vp8_variance8x16             = vp8_variance8x16_mmx;
-        vp8_variance16x8             = vp8_variance16x8_mmx;
-        vp8_variance16x16            = vp8_variance16x16_mmx;
-        vp8_mse16x16                 = vp8_mse16x16_mmx;
-        vp8_sub_pixel_variance4x4      = vp8_sub_pixel_variance4x4_mmx;
-        vp8_sub_pixel_variance8x8      = vp8_sub_pixel_variance8x8_mmx;
-        vp8_sub_pixel_variance8x16     = vp8_sub_pixel_variance8x16_mmx;
-        vp8_sub_pixel_variance16x8     = vp8_sub_pixel_variance16x8_mmx;
-        vp8_sub_pixel_variance16x16    = vp8_sub_pixel_variance16x16_mmx;
-        vp8_get_mb_ss                  = vp8_get_mb_ss_mmx;
-        vp8_get16x16pred_error        = vp8_get16x16pred_error_mmx;
-        vp8_get8x8var                = vp8_get8x8var_mmx;
-        vp8_get16x16var              = vp8_get16x16var_mmx;
-        vp8_get4x4sse_cs             = vp8_get4x4sse_cs_mmx;
-        vp8_sad16x16                 = vp8_sad16x16_mmx;
-        vp8_sad16x8                  = vp8_sad16x8_mmx;
-        vp8_sad8x16                  = vp8_sad8x16_mmx;
-        vp8_sad8x8                   = vp8_sad8x8_mmx;
-        vp8_sad4x4                   = vp8_sad4x4_mmx;
-        vp8_block_error               = vp8_block_error_mmx;
-        vp8_mbblock_error             = vp8_mbblock_error_mmx;
-        vp8_subtract_mby              = vp8_subtract_mby_mmx;
-
-    }
-    else
-    {
-        // Pure C:
-        vp8_mbuverror                = vp8_mbuverror_c;
-        vp8_fast_quantize_b            = vp8_fast_quantize_b_c;
-        vp8_short_fdct4x4             = vp8_short_fdct4x4_c;
-        vp8_short_fdct8x4             = vp8_short_fdct8x4_c;
-        vp8_fast_fdct4x4              = vp8_fast_fdct4x4_c;
-        vp8_fast_fdct8x4              = vp8_fast_fdct8x4_c;
-        vp8_subtract_b                = vp8_subtract_b_c;
-        vp8_subtract_mbuv             = vp8_subtract_mbuv_c;
-        vp8_variance4x4              = vp8_variance4x4_c;
-        vp8_variance8x8              = vp8_variance8x8_c;
-        vp8_variance8x16             = vp8_variance8x16_c;
-        vp8_variance16x8             = vp8_variance16x8_c;
-        vp8_variance16x16            = vp8_variance16x16_c;
-        vp8_mse16x16                 = vp8_mse16x16_c;
-        vp8_sub_pixel_variance4x4      = vp8_sub_pixel_variance4x4_c;
-        vp8_sub_pixel_variance8x8      = vp8_sub_pixel_variance8x8_c;
-        vp8_sub_pixel_variance8x16     = vp8_sub_pixel_variance8x16_c;
-        vp8_sub_pixel_variance16x8     = vp8_sub_pixel_variance16x8_c;
-        vp8_sub_pixel_variance16x16    = vp8_sub_pixel_variance16x16_c;
-        vp8_get_mb_ss                  = vp8_get_mb_ss_c;
-        vp8_get16x16pred_error        = vp8_get16x16pred_error_c;
-        vp8_get8x8var                = vp8_get8x8var_c;
-        vp8_get16x16var              = vp8_get16x16var_c;
-        vp8_get4x4sse_cs             = vp8_get4x4sse_cs_c;
-        vp8_sad16x16                 = vp8_sad16x16_c;
-        vp8_sad16x8                  = vp8_sad16x8_c;
-        vp8_sad8x16                  = vp8_sad8x16_c;
-        vp8_sad8x8                   = vp8_sad8x8_c;
-        vp8_sad4x4                   = vp8_sad4x4_c;
-        vp8_block_error               = vp8_block_error_c;
-        vp8_mbblock_error             = vp8_mbblock_error_c;
-        vp8_subtract_mby              = vp8_subtract_mby_c;
-    }
-
-}
diff --git a/vp8/encoder/x86/dct_mmx.asm b/vp8/encoder/x86/dct_mmx.asm
index e13423796..5acaca875 100644
--- a/vp8/encoder/x86/dct_mmx.asm
+++ b/vp8/encoder/x86/dct_mmx.asm
@@ -1,10 +1,11 @@
 ;
-;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
 ;
 
 
@@ -12,8 +13,7 @@
 
 section .text
     global sym(vp8_short_fdct4x4_mmx)
-    global sym(vp8_fast_fdct4x4_mmx)
-    global sym(vp8_fast_fdct8x4_wmt)
+    global sym(vp8_short_fdct8x4_wmt)
 
 
 %define         DCTCONSTANTSBITS         (16)
@@ -23,10 +23,6 @@ section .text
 %define         x_c3                      (25080)          ; cos(pi*3/8) * (1<<15)
 
 
-%define _1STSTAGESHIFT           14
-%define _2NDSTAGESHIFT           16
-
-; using matrix multiply with source and destbuffer has a pitch
 ;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
 sym(vp8_short_fdct4x4_mmx):
     push        rbp
@@ -36,337 +32,10 @@ sym(vp8_short_fdct4x4_mmx):
     push rsi
     push rdi
     ; end prolog
-
-        mov         rsi,    arg(0) ;input
-        mov         rdi,    arg(1) ;output
-
-        movsxd      rax,    dword ptr arg(2) ;pitch
-        lea         rdx,    [dct_matrix GLOBAL]
-
-        movq        mm0,    [rsi   ]
-        movq        mm1,    [rsi + rax]
-
-        movq        mm2,    [rsi + rax*2]
-        lea         rsi,    [rsi + rax*2]
-
-        movq        mm3,    [rsi + rax]
-
-        ; first column
-        movq        mm4,    mm0
-        movq        mm7,    [rdx]
-
-        pmaddwd     mm4,    mm7
-        movq        mm5,    mm1
-
-        pmaddwd     mm5,    mm7
-        movq        mm6,    mm4
-
-        punpckldq   mm4,    mm5
-        punpckhdq   mm6,    mm5
-
-        paddd       mm4,    mm6
-        movq        mm5,    mm2
-
-
-        pmaddwd     mm5,    mm7
-        movq        mm6,    mm3
-
-        pmaddwd     mm6,    mm7
-        movq        mm7,    mm5
-
-        punpckldq   mm5,    mm6
-        punpckhdq   mm7,    mm6
-
-        paddd       mm5,    mm7
-        movq        mm6,    [dct1st_stage_rounding_mmx GLOBAL]
-
-        paddd       mm4,    mm6
-        paddd       mm5,    mm6
-
-        psrad       mm4,    _1STSTAGESHIFT
-        psrad       mm5,    _1STSTAGESHIFT
-
-        packssdw    mm4,    mm5
-        movq        [rdi],  mm4
-
-        ;second column
-        movq        mm4,    mm0
-
-        pmaddwd     mm4,    [rdx+8]
-        movq        mm5,    mm1
-
-        pmaddwd     mm5,    [rdx+8]
-        movq        mm6,    mm4
-
-        punpckldq   mm4,    mm5
-        punpckhdq   mm6,    mm5
-
-        paddd       mm4,    mm6
-        movq        mm5,    mm2
-
-        pmaddwd     mm5,    [rdx+8]
-        movq        mm6,    mm3
-
-        pmaddwd     mm6,    [rdx+8]
-        movq        mm7,    mm5
-
-        punpckldq   mm5,    mm6
-        punpckhdq   mm7,    mm6
-
-        paddd       mm5,    mm7
-        movq        mm6,    [dct1st_stage_rounding_mmx GLOBAL]
-
-        paddd       mm4,    mm6
-        paddd       mm5,    mm6
-
-        psrad       mm4,    _1STSTAGESHIFT
-        psrad       mm5,    _1STSTAGESHIFT
-
-        packssdw    mm4,    mm5
-        movq        [rdi+8],  mm4
-
-
-        ;third column
-        movq        mm4,    mm0
-
-        pmaddwd     mm4,    [rdx+16]
-        movq        mm5,    mm1
-
-        pmaddwd     mm5,    [rdx+16]
-        movq        mm6,    mm4
-
-        punpckldq   mm4,    mm5
-        punpckhdq   mm6,    mm5
-
-        paddd       mm4,    mm6
-        movq        mm5,    mm2
-
-        pmaddwd     mm5,    [rdx+16]
-        movq        mm6,    mm3
-
-        pmaddwd     mm6,    [rdx+16]
-        movq        mm7,    mm5
-
-        punpckldq   mm5,    mm6
-        punpckhdq   mm7,    mm6
-
-        paddd       mm5,    mm7
-        movq        mm6,    [dct1st_stage_rounding_mmx GLOBAL]
-
-        paddd       mm4,    mm6
-        paddd       mm5,    mm6
-
-        psrad       mm4,    _1STSTAGESHIFT
-        psrad       mm5,    _1STSTAGESHIFT
-
-        packssdw    mm4,    mm5
-        movq        [rdi+16],  mm4
-
-        ;fourth column (this is the last column, so we do not have save the source any more)
-
-        pmaddwd     mm0,    [rdx+24]
-
-        pmaddwd     mm1,    [rdx+24]
-        movq        mm6,    mm0
-
-        punpckldq   mm0,    mm1
-        punpckhdq   mm6,    mm1
-
-        paddd       mm0,    mm6
-
-        pmaddwd     mm2,    [rdx+24]
-
-        pmaddwd     mm3,    [rdx+24]
-        movq        mm7,    mm2
-
-        punpckldq   mm2,    mm3
-        punpckhdq   mm7,    mm3
-
-        paddd       mm2,    mm7
-        movq        mm6,    [dct1st_stage_rounding_mmx GLOBAL]
-
-        paddd       mm0,    mm6
-        paddd       mm2,    mm6
-
-        psrad       mm0,    _1STSTAGESHIFT
-        psrad       mm2,    _1STSTAGESHIFT
-
-        packssdw    mm0,    mm2
-
-        movq        mm3,    mm0
-
-        ; done with one pass
-        ; now start second pass
-        movq        mm0,    [rdi   ]
-        movq        mm1,    [rdi+ 8]
-        movq        mm2,    [rdi+ 16]
-
-        movq        mm4,    mm0
-
-        pmaddwd     mm4,    [rdx]
-        movq        mm5,    mm1
-
-        pmaddwd     mm5,    [rdx]
-        movq        mm6,    mm4
-
-        punpckldq   mm4,    mm5
-        punpckhdq   mm6,    mm5
-
-        paddd       mm4,    mm6
-        movq        mm5,    mm2
-
-        pmaddwd     mm5,    [rdx]
-        movq        mm6,    mm3
-
-        pmaddwd     mm6,    [rdx]
-        movq        mm7,    mm5
-
-        punpckldq   mm5,    mm6
-        punpckhdq   mm7,    mm6
-
-        paddd       mm5,    mm7
-        movq        mm6,    [dct2nd_stage_rounding_mmx GLOBAL]
-
-        paddd       mm4,    mm6
-        paddd       mm5,    mm6
-
-        psrad       mm4,    _2NDSTAGESHIFT
-        psrad       mm5,    _2NDSTAGESHIFT
-
-        packssdw    mm4,    mm5
-        movq        [rdi],  mm4
-
-        ;second column
-        movq        mm4,    mm0
-
-        pmaddwd     mm4,    [rdx+8]
-        movq        mm5,    mm1
-
-        pmaddwd     mm5,    [rdx+8]
-        movq        mm6,    mm4
-
-        punpckldq   mm4,    mm5
-        punpckhdq   mm6,    mm5
-
-        paddd       mm4,    mm6
-        movq        mm5,    mm2
-
-        pmaddwd     mm5,    [rdx+8]
-        movq        mm6,    mm3
-
-        pmaddwd     mm6,    [rdx+8]
-        movq        mm7,    mm5
-
-        punpckldq   mm5,    mm6
-        punpckhdq   mm7,    mm6
-
-        paddd       mm5,    mm7
-        movq        mm6,    [dct2nd_stage_rounding_mmx GLOBAL]
-
-        paddd       mm4,    mm6
-        paddd       mm5,    mm6
-
-        psrad       mm4,    _2NDSTAGESHIFT
-        psrad       mm5,    _2NDSTAGESHIFT
-
-        packssdw    mm4,    mm5
-        movq        [rdi+8],  mm4
-
-
-        ;third column
-        movq        mm4,    mm0
-
-        pmaddwd     mm4,    [rdx+16]
-        movq        mm5,    mm1
-
-        pmaddwd     mm5,    [rdx+16]
-        movq        mm6,    mm4
-
-        punpckldq   mm4,    mm5
-        punpckhdq   mm6,    mm5
-
-        paddd       mm4,    mm6
-        movq        mm5,    mm2
-
-        pmaddwd     mm5,    [rdx+16]
-        movq        mm6,    mm3
-
-        pmaddwd     mm6,    [rdx+16]
-        movq        mm7,    mm5
-
-        punpckldq   mm5,    mm6
-        punpckhdq   mm7,    mm6
-
-        paddd       mm5,    mm7
-        movq        mm6,    [dct2nd_stage_rounding_mmx GLOBAL]
-
-        paddd       mm4,    mm6
-        paddd       mm5,    mm6
-
-        psrad       mm4,    _2NDSTAGESHIFT
-        psrad       mm5,    _2NDSTAGESHIFT
-
-        packssdw    mm4,    mm5
-        movq        [rdi+16],  mm4
-
-        ;fourth column
-        movq        mm4,    mm0
-
-        pmaddwd     mm4,    [rdx+24]
-        movq        mm5,    mm1
-
-        pmaddwd     mm5,    [rdx+24]
-        movq        mm6,    mm4
-
-        punpckldq   mm4,    mm5
-        punpckhdq   mm6,    mm5
-
-        paddd       mm4,    mm6
-        movq        mm5,    mm2
-
-        pmaddwd     mm5,    [rdx+24]
-        movq        mm6,    mm3
-
-        pmaddwd     mm6,    [rdx+24]
-        movq        mm7,    mm5
-
-        punpckldq   mm5,    mm6
-        punpckhdq   mm7,    mm6
-
-        paddd       mm5,    mm7
-        movq        mm6,    [dct2nd_stage_rounding_mmx GLOBAL]
-
-        paddd       mm4,    mm6
-        paddd       mm5,    mm6
-
-        psrad       mm4,    _2NDSTAGESHIFT
-        psrad       mm5,    _2NDSTAGESHIFT
-
-        packssdw    mm4,    mm5
-        movq        [rdi+24],  mm4
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp8_fast_fdct4x4_mmx(short *input, short *output, int pitch)
-sym(vp8_fast_fdct4x4_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
         mov     rsi,    arg(0) ;input
         mov     rdi,    arg(1) ;output
 
-        lea     rdx,    [dct_const_mmx GLOBAL]
+        lea     rdx,    [GLOBAL(dct_const_mmx)]
         movsxd  rax,    dword ptr arg(2) ;pitch
 
         lea     rcx,    [rsi + rax*2]
@@ -378,11 +47,11 @@ sym(vp8_fast_fdct4x4_mmx):
         movq    mm3,    [rcx + rax]
         ; get the constants
-        ;shift to left by 1 for prescision
+        ;shift to left by 3 for precision
-        paddw   mm0,    mm0
-        paddw   mm1,    mm1
+        psllw   mm0,    3
+        psllw   mm1,    3
 
-        psllw   mm2,    1
-        psllw   mm3,    1
+        psllw   mm2,    3
+        psllw   mm3,    3
 
         ; transpose for the second stage
         movq    mm4,    mm0         ; 00 01 02 03
@@ -530,20 +199,23 @@ sym(vp8_fast_fdct4x4_mmx):
         movq    mm3,    mm5
         ; done with vertical
 
-		pcmpeqw	mm4,	mm4
-		pcmpeqw	mm5,	mm5
-		psrlw	mm4,	15
-		psrlw	mm5,	15
+        pcmpeqw mm4,    mm4
+        pcmpeqw mm5,    mm5
+        psrlw   mm4,    15
+        psrlw   mm5,    15
+
+        psllw   mm4,    2
+        psllw   mm5,    2
 
         paddw   mm0,    mm4
         paddw   mm1,    mm5
         paddw   mm2,    mm4
         paddw   mm3,    mm5
 
-        psraw   mm0, 1
-        psraw   mm1, 1
-        psraw   mm2, 1
-        psraw   mm3, 1
+        psraw   mm0, 3
+        psraw   mm1, 3
+        psraw   mm2, 3
+        psraw   mm3, 3
 
         movq        [rdi   ],   mm0
         movq        [rdi+ 8],   mm1
@@ -559,8 +231,8 @@ sym(vp8_fast_fdct4x4_mmx):
     ret
 
 
-;void vp8_fast_fdct8x4_wmt(short *input, short *output, int pitch)
-sym(vp8_fast_fdct8x4_wmt):
+;void vp8_short_fdct8x4_wmt(short *input, short *output, int pitch)
+sym(vp8_short_fdct8x4_wmt):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 3
@@ -571,7 +243,7 @@ sym(vp8_fast_fdct8x4_wmt):
         mov         rsi,    arg(0) ;input
         mov         rdi,    arg(1) ;output
 
-        lea         rdx,    [dct_const_xmm GLOBAL]
+        lea         rdx,    [GLOBAL(dct_const_xmm)]
         movsxd      rax,    dword ptr arg(2) ;pitch
 
         lea         rcx,    [rsi + rax*2]
@@ -583,11 +255,11 @@ sym(vp8_fast_fdct8x4_wmt):
         movdqa      xmm3,       [rcx + rax]
         ; get the constants
-        ;shift to left by 1 for prescision
+        ;shift to left by 3 for precision
-        psllw       xmm0,        1
-        psllw       xmm2,        1
+        psllw       xmm0,        3
+        psllw       xmm2,        3
 
-        psllw       xmm4,        1
-        psllw       xmm3,        1
+        psllw       xmm4,        3
+        psllw       xmm3,        3
 
         ; transpose for the second stage
         movdqa      xmm1,       xmm0         ; 00 01 02 03 04 05 06 07
@@ -757,20 +429,23 @@ sym(vp8_fast_fdct8x4_wmt):
         ; done with vertical
 
 
-        pcmpeqw		xmm4,		xmm4
-        pcmpeqw		xmm5,		xmm5;
-        psrlw		xmm4,		15
-        psrlw		xmm5,		15
+        pcmpeqw     xmm4,       xmm4
+        pcmpeqw     xmm5,       xmm5
+        psrlw       xmm4,       15
+        psrlw       xmm5,       15
+
+        psllw       xmm4,       2
+        psllw       xmm5,       2
 
         paddw       xmm0,       xmm4
         paddw       xmm1,       xmm5
         paddw       xmm2,       xmm4
         paddw       xmm3,       xmm5
 
-        psraw       xmm0,       1
-        psraw       xmm1,       1
-        psraw       xmm2,       1
-        psraw       xmm3,       1
+        psraw       xmm0,       3
+        psraw       xmm1,       3
+        psraw       xmm2,       3
+        psraw       xmm3,       3
 
         movq        QWORD PTR[rdi   ],   xmm0
         movq        QWORD PTR[rdi+ 8],   xmm1
diff --git a/vp8/encoder/x86/dct_sse2.asm b/vp8/encoder/x86/dct_sse2.asm
index 3e5e9a70c..723a78d76 100644
--- a/vp8/encoder/x86/dct_sse2.asm
+++ b/vp8/encoder/x86/dct_sse2.asm
@@ -1,260 +1,189 @@
 ;
-;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
 ;
 
 
 %include "vpx_ports/x86_abi_support.asm"
 
-global sym(vp8_short_fdct4x4_wmt)
-
-%define         DCTCONSTANTSBITS         (16)
-%define         DCTROUNDINGVALUE         (1<< (DCTCONSTANTSBITS-1))
-%define         x_c1                      (60547)          ; cos(pi  /8) * (1<<15)
-%define         x_c2                      (46341)          ; cos(pi*2/8) * (1<<15)
-%define         x_c3                      (25080)          ; cos(pi*3/8) * (1<<15)
-
-%define _1STSTAGESHIFT           14
-%define _2NDSTAGESHIFT           16
-
-
-;; using matrix multiply
-;void vp8_short_fdct4x4_wmt(short *input, short *output)
-sym(vp8_short_fdct4x4_wmt):
+;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
+global sym(vp8_short_fdct4x4_sse2)
+sym(vp8_short_fdct4x4_sse2):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 2
+    SHADOW_ARGS_TO_STACK 3
+;;    SAVE_XMM
     GET_GOT     rbx
+    push        rsi
+    push        rdi
     ; end prolog
 
-        mov         rax,        arg(0) ;input
-        mov         rcx,        arg(1) ;output
-
-        lea         rdx,        [dct_matrix_sse2 GLOBAL]
-
-        movdqu      xmm0,       [rax   ]
-        movdqu      xmm1,       [rax+16]
-
-        ; first column
-        movdqa      xmm2,       xmm0
-        movdqa      xmm7,       [rdx]
-
-        pmaddwd     xmm2,       xmm7
-        movdqa      xmm3,       xmm1
-
-        pmaddwd     xmm3,       xmm7
-        movdqa      xmm4,       xmm2
-
-        punpckldq   xmm2,       xmm3
-        punpckhdq   xmm4,       xmm3
-
-        movdqa      xmm3,       xmm2
-        punpckldq   xmm2,       xmm4
-
-        punpckhdq   xmm3,       xmm4
-        paddd       xmm2,       xmm3
-
-
-        paddd       xmm2,       XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
-        psrad       xmm2,       _1STSTAGESHIFT
-        ;second column
-        movdqa      xmm3,       xmm0
-        pmaddwd     xmm3,       [rdx+16]
-
-        movdqa      xmm4,       xmm1
-        pmaddwd     xmm4,       [rdx+16]
-
-        movdqa      xmm5,       xmm3
-        punpckldq   xmm3,       xmm4
-
-        punpckhdq   xmm5,       xmm4
-        movdqa      xmm4,       xmm3
-
-        punpckldq   xmm3,       xmm5
-        punpckhdq   xmm4,       xmm5
-
-        paddd       xmm3,       xmm4
-        paddd       xmm3,       XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
-
-
-        psrad       xmm3,       _1STSTAGESHIFT
-        packssdw    xmm2,       xmm3
-
-        ;third column
-        movdqa      xmm3,       xmm0
-        pmaddwd     xmm3,       [rdx+32]
-
-        movdqa      xmm4,       xmm1
-        pmaddwd     xmm4,       [rdx+32]
-
-        movdqa      xmm5,       xmm3
-        punpckldq   xmm3,       xmm4
-
-        punpckhdq   xmm5,       xmm4
-        movdqa      xmm4,       xmm3
-
-        punpckldq   xmm3,       xmm5
-        punpckhdq   xmm4,       xmm5
-
-        paddd       xmm3,       xmm4
-        paddd       xmm3,       XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
-
-        psrad       xmm3,       _1STSTAGESHIFT
-
-        ;fourth column (this is the last column, so we do not have save the source any more)
-        pmaddwd     xmm0,       [rdx+48]
-        pmaddwd     xmm1,       [rdx+48]
-
-        movdqa      xmm4,       xmm0
-        punpckldq   xmm0,       xmm1
-
-        punpckhdq   xmm4,       xmm1
-        movdqa      xmm1,       xmm0
-
-        punpckldq   xmm0,       xmm4
-        punpckhdq   xmm1,       xmm4
-
-        paddd       xmm0,       xmm1
-        paddd       xmm0,       XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
-
-
-        psrad       xmm0,       _1STSTAGESHIFT
-        packssdw    xmm3,       xmm0
-        ; done with one pass
-        ; now start second pass
-        movdqa      xmm0,       xmm2
-        movdqa      xmm1,       xmm3
-
-        pmaddwd     xmm2,       xmm7
-        pmaddwd     xmm3,       xmm7
-
-        movdqa      xmm4,       xmm2
-        punpckldq   xmm2,       xmm3
+    mov         rsi, arg(0)
+    movsxd      rax, DWORD PTR arg(2)
+    lea         rdi, [rsi + rax*2]
+
+    movq        xmm0, MMWORD PTR[rsi   ]        ;03 02 01 00
+    movq        xmm2, MMWORD PTR[rsi + rax]     ;13 12 11 10
+    movq        xmm1, MMWORD PTR[rsi + rax*2]   ;23 22 21 20
+    movq        xmm3, MMWORD PTR[rdi + rax]     ;33 32 31 30
+
+    punpcklqdq  xmm0, xmm2                      ;13 12 11 10 03 02 01 00
+    punpcklqdq  xmm1, xmm3                      ;33 32 31 30 23 22 21 20
+
+    mov         rdi, arg(1)
+
+    movdqa      xmm2, xmm0
+    punpckldq   xmm0, xmm1                      ;23 22 03 02 21 20 01 00
+    punpckhdq   xmm2, xmm1                      ;33 32 13 12 31 30 11 10
+    movdqa      xmm1, xmm0
+    punpckldq   xmm0, xmm2                      ;31 30 21 20 11 10 01 00
+    pshufhw     xmm1, xmm1, 0b1h                ;22 23 02 03 xx xx xx xx
+    pshufhw     xmm2, xmm2, 0b1h                ;32 33 12 13 xx xx xx xx
+
+    punpckhdq   xmm1, xmm2                      ;32 33 22 23 12 13 02 03
+    movdqa      xmm3, xmm0
+    paddw       xmm0, xmm1                      ;b1 a1 b1 a1 b1 a1 b1 a1
+    psubw       xmm3, xmm1                      ;c1 d1 c1 d1 c1 d1 c1 d1
+    psllw       xmm0, 3                         ;b1 <<= 3 a1 <<= 3
+    psllw       xmm3, 3                         ;c1 <<= 3 d1 <<= 3
+    movdqa      xmm1, xmm0
+    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]    ;a1 + b1
+    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]    ;a1 - b1
+    movdqa      xmm4, xmm3
+    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]   ;c1*2217 + d1*5352
+    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)];d1*2217 - c1*5352
+
+    paddd       xmm3, XMMWORD PTR[GLOBAL(_14500)]
+    paddd       xmm4, XMMWORD PTR[GLOBAL(_7500)]
+    psrad       xmm3, 12            ;(c1 * 2217 + d1 * 5352 +  14500)>>12
+    psrad       xmm4, 12            ;(d1 * 2217 - c1 * 5352 +   7500)>>12
+
+    packssdw    xmm0, xmm1                      ;op[2] op[0]
+    packssdw    xmm3, xmm4                      ;op[3] op[1]
+    ; 23 22 21 20 03 02 01 00
+    ;
+    ; 33 32 31 30 13 12 11 10
+    ;
+    movdqa      xmm2, xmm0
+    punpcklqdq  xmm0, xmm3                      ;13 12 11 10 03 02 01 00
+    punpckhqdq  xmm2, xmm3                      ;23 22 21 20 33 32 31 30
+
+    movdqa      xmm3, xmm0
+    punpcklwd   xmm0, xmm2                      ;32 30 22 20 12 10 02 00
+    punpckhwd   xmm3, xmm2                      ;33 31 23 21 13 11 03 01
+    movdqa      xmm2, xmm0
+    punpcklwd   xmm0, xmm3                      ;13 12 11 10 03 02 01 00
+    punpckhwd   xmm2, xmm3                      ;33 32 31 30 23 22 21 20
+
+    movdqa      xmm5, XMMWORD PTR[GLOBAL(_7)]
+    pshufd      xmm2, xmm2, 04eh
+    movdqa      xmm3, xmm0
+    paddw       xmm0, xmm2                      ;b1 b1 b1 b1 a1 a1 a1 a1
+    psubw       xmm3, xmm2                      ;c1 c1 c1 c1 d1 d1 d1 d1
+
+    pshufd      xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 b1 a1 a1
+    movdqa      xmm2, xmm3                      ;save d1 for compare
+    pshufd      xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 c1 d1 d1
+    pshuflw     xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 a1 b1 a1
+    pshuflw     xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 d1 c1 d1
+    pshufhw     xmm0, xmm0, 0d8h                ;b1 a1 b1 a1 b1 a1 b1 a1
+    pshufhw     xmm3, xmm3, 0d8h                ;c1 d1 c1 d1 c1 d1 c1 d1
+    movdqa      xmm1, xmm0
+    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
+    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
+
+    pxor        xmm4, xmm4                      ;zero out for compare
+    paddd       xmm0, xmm5
+    paddd       xmm1, xmm5
+    pcmpeqw     xmm2, xmm4
+    psrad       xmm0, 4                         ;(a1 + b1 + 7)>>4
+    psrad       xmm1, 4                         ;(a1 - b1 + 7)>>4
+    pandn       xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper,
+                                                     ;and keep bit 0 of lower
+
+    movdqa      xmm4, xmm3
+    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]    ;c1*2217 + d1*5352
+    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352
+    paddd       xmm3, XMMWORD PTR[GLOBAL(_12000)]
+    paddd       xmm4, XMMWORD PTR[GLOBAL(_51000)]
+    packssdw    xmm0, xmm1                      ;op[8] op[0]
+    psrad       xmm3, 16                ;(c1 * 2217 + d1 * 5352 +  12000)>>16
+    psrad       xmm4, 16                ;(d1 * 2217 - c1 * 5352 +  51000)>>16
+
+    packssdw    xmm3, xmm4                      ;op[12] op[4]
+    movdqa      xmm1, xmm0
+    paddw       xmm3, xmm2                      ;op[4] += (d1!=0)
+    punpcklqdq  xmm0, xmm3                      ;op[4] op[0]
+    punpckhqdq  xmm1, xmm3                      ;op[12] op[8]
+
+    movdqa      XMMWORD PTR[rdi + 0], xmm0
+    movdqa      XMMWORD PTR[rdi + 16], xmm1
 
-        punpckhdq   xmm4,       xmm3
-        movdqa      xmm3,       xmm2
-
-        punpckldq   xmm2,       xmm4
-        punpckhdq   xmm3,       xmm4
-
-        paddd       xmm2,       xmm3
-        paddd       xmm2,       XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
-
-        psrad       xmm2,       _2NDSTAGESHIFT
-
-        ;second column
-        movdqa      xmm3,       xmm0
-        pmaddwd     xmm3,       [rdx+16]
-
-        movdqa      xmm4,       xmm1
-        pmaddwd     xmm4,       [rdx+16]
-
-        movdqa      xmm5,       xmm3
-        punpckldq   xmm3,       xmm4
-
-        punpckhdq   xmm5,       xmm4
-        movdqa      xmm4,       xmm3
-
-        punpckldq   xmm3,       xmm5
-        punpckhdq   xmm4,       xmm5
-
-        paddd       xmm3,       xmm4
-        paddd       xmm3,       XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
-
-        psrad       xmm3,       _2NDSTAGESHIFT
-        packssdw    xmm2,       xmm3
-
-        movdqu      [rcx],      xmm2
-        ;third column
-        movdqa      xmm3,       xmm0
-        pmaddwd     xmm3,       [rdx+32]
-
-        movdqa      xmm4,       xmm1
-        pmaddwd     xmm4,       [rdx+32]
-
-        movdqa      xmm5,       xmm3
-        punpckldq   xmm3,       xmm4
-
-        punpckhdq   xmm5,       xmm4
-        movdqa      xmm4,       xmm3
-
-        punpckldq   xmm3,       xmm5
-        punpckhdq   xmm4,       xmm5
-
-        paddd       xmm3,       xmm4
-        paddd       xmm3,       XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
-
-        psrad       xmm3,       _2NDSTAGESHIFT
-        ;fourth column
-        pmaddwd     xmm0,       [rdx+48]
-        pmaddwd     xmm1,       [rdx+48]
-
-        movdqa      xmm4,       xmm0
-        punpckldq   xmm0,       xmm1
-
-        punpckhdq   xmm4,       xmm1
-        movdqa      xmm1,       xmm0
-
-        punpckldq   xmm0,       xmm4
-        punpckhdq   xmm1,       xmm4
-
-        paddd       xmm0,       xmm1
-        paddd       xmm0,       XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
-
-        psrad       xmm0,       _2NDSTAGESHIFT
-        packssdw    xmm3,       xmm0
-
-        movdqu     [rcx+16],   xmm3
-
-    mov rsp, rbp
     ; begin epilog
+    pop rdi
+    pop rsi
     RESTORE_GOT
+;;    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
 
-
 SECTION_RODATA
-;static unsigned int dct1st_stage_rounding_sse2[4] =
 align 16
-dct1st_stage_rounding_sse2:
-    times 4 dd 8192
-
-
-;static unsigned int dct2nd_stage_rounding_sse2[4] =
+_5352_2217:
+    dw 5352
+    dw 2217
+    dw 5352
+    dw 2217
+    dw 5352
+    dw 2217
+    dw 5352
+    dw 2217
 align 16
-dct2nd_stage_rounding_sse2:
-    times 4 dd 32768
-
-;static short dct_matrix_sse2[4][8]=
+_2217_neg5352:
+    dw 2217
+    dw -5352
+    dw 2217
+    dw -5352
+    dw 2217
+    dw -5352
+    dw 2217
+    dw -5352
 align 16
-dct_matrix_sse2:
-    times 8 dw 23170
-
-    dw  30274
-    dw  12540
-    dw -12540
-    dw -30274
-    dw  30274
-    dw  12540
-    dw -12540
-    dw -30274
-
-    dw  23170
-    times 2 dw -23170
-    times 2 dw  23170
-    times 2 dw -23170
-    dw  23170
+_mult_add:
+    times 8 dw 1
+align 16
+_cmp_mask:
+    times 4 dw 1
+    times 4 dw 0
 
-    dw  12540
-    dw -30274
-    dw  30274
-    dw -12540
-    dw  12540
-    dw -30274
-    dw  30274
-    dw -12540
+align 16
+_mult_sub:
+    dw 1
+    dw -1
+    dw 1
+    dw -1
+    dw 1
+    dw -1
+    dw 1
+    dw -1
+align 16
+_7:
+    times 4 dd 7
+align 16
+_14500:
+    times 4 dd 14500
+align 16
+_7500:
+    times 4 dd 7500
+align 16
+_12000:
+    times 4 dd 12000
+align 16
+_51000:
+    times 4 dd 51000
diff --git a/vp8/encoder/x86/dct_x86.h b/vp8/encoder/x86/dct_x86.h
index bc80e64ef..05824c684 100644
--- a/vp8/encoder/x86/dct_x86.h
+++ b/vp8/encoder/x86/dct_x86.h
@@ -1,10 +1,11 @@
 /*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
@@ -21,46 +22,42 @@
 #if HAVE_MMX
 extern prototype_fdct(vp8_short_fdct4x4_mmx);
 extern prototype_fdct(vp8_short_fdct8x4_mmx);
-extern prototype_fdct(vp8_fast_fdct4x4_mmx);
-extern prototype_fdct(vp8_fast_fdct8x4_mmx);
 
 #if !CONFIG_RUNTIME_CPU_DETECT
+#if 0
 #undef  vp8_fdct_short4x4
 #define vp8_fdct_short4x4 vp8_short_fdct4x4_mmx
 
 #undef  vp8_fdct_short8x4
 #define vp8_fdct_short8x4 vp8_short_fdct8x4_mmx
-
-#undef  vp8_fdct_fast4x4
-#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_mmx
-
-#undef  vp8_fdct_fast8x4
-#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_mmx
+#endif
 
 #endif
 #endif
 
 
 #if HAVE_SSE2
-extern prototype_fdct(vp8_short_fdct4x4_wmt);
 extern prototype_fdct(vp8_short_fdct8x4_wmt);
-extern prototype_fdct(vp8_fast_fdct8x4_wmt);
-
 extern prototype_fdct(vp8_short_walsh4x4_sse2);
 
-#if !CONFIG_RUNTIME_CPU_DETECT
+extern prototype_fdct(vp8_short_fdct4x4_sse2);
+extern prototype_fdct(vp8_short_fdct8x4_sse2);
 
-#if 0
+#if !CONFIG_RUNTIME_CPU_DETECT
+#if 1
-/* short SSE2 DCT currently disabled, does not match the MMX version */
+/* short SSE2 DCT enabled: the _sse2 4x4 transform replaces the old _wmt one */
 #undef  vp8_fdct_short4x4
-#define vp8_fdct_short4x4 vp8_short_fdct4x4_wmt
+#define vp8_fdct_short4x4 vp8_short_fdct4x4_sse2
 
 #undef  vp8_fdct_short8x4
-#define vp8_fdct_short8x4 vp8_short_fdct8x4_wmt
+#define vp8_fdct_short8x4 vp8_short_fdct8x4_sse2
 #endif
 
+#undef  vp8_fdct_fast4x4
+#define vp8_fdct_fast4x4 vp8_short_fdct4x4_sse2
+
 #undef  vp8_fdct_fast8x4
-#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_wmt
+#define vp8_fdct_fast8x4 vp8_short_fdct8x4_sse2
 
 #undef vp8_fdct_walsh_short4x4
 #define vp8_fdct_walsh_short4x4  vp8_short_walsh4x4_sse2
diff --git a/vp8/encoder/x86/encodemb_x86.h b/vp8/encoder/x86/encodemb_x86.h
index 9397a6cca..69b3edd66 100644
--- a/vp8/encoder/x86/encodemb_x86.h
+++ b/vp8/encoder/x86/encodemb_x86.h
@@ -1,10 +1,11 @@
 /*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
@@ -54,7 +55,9 @@ extern prototype_submbuv(vp8_subtract_mbuv_mmx);
 extern prototype_berr(vp8_block_error_xmm);
 extern prototype_mberr(vp8_mbblock_error_xmm);
 extern prototype_mbuverr(vp8_mbuverror_xmm);
-
+extern prototype_subb(vp8_subtract_b_sse2);
+extern prototype_submby(vp8_subtract_mby_sse2);
+extern prototype_submbuv(vp8_subtract_mbuv_sse2);
 
 #if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_encodemb_berr
@@ -66,6 +69,15 @@ extern prototype_mbuverr(vp8_mbuverror_xmm);
 #undef  vp8_encodemb_mbuverr
 #define vp8_encodemb_mbuverr vp8_mbuverror_xmm
 
+#undef  vp8_encodemb_subb
+#define vp8_encodemb_subb vp8_subtract_b_sse2
+
+#undef  vp8_encodemb_submby
+#define vp8_encodemb_submby vp8_subtract_mby_sse2
+
+#undef  vp8_encodemb_submbuv
+#define vp8_encodemb_submbuv vp8_subtract_mbuv_sse2
+
 #endif
 #endif
 
diff --git a/vp8/encoder/x86/encodeopt.asm b/vp8/encoder/x86/encodeopt.asm
index 194047155..c0f06bbbb 100644
--- a/vp8/encoder/x86/encodeopt.asm
+++ b/vp8/encoder/x86/encodeopt.asm
@@ -1,16 +1,16 @@
 ;
-;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
 ;
 
 
 %include "vpx_ports/x86_abi_support.asm"
 
-
 ;int vp8_block_error_xmm(short *coeff_ptr,  short *dcoef_ptr)
 global sym(vp8_block_error_xmm)
 sym(vp8_block_error_xmm):
@@ -19,11 +19,9 @@ sym(vp8_block_error_xmm):
     SHADOW_ARGS_TO_STACK 2
     push rsi
     push rdi
-    ; end prolog
-
+    ; end prologue
 
         mov         rsi,        arg(0) ;coeff_ptr
-        pxor        xmm7,       xmm7
 
         mov         rdi,        arg(1) ;dcoef_ptr
         movdqa      xmm3,       [rsi]
@@ -32,33 +30,27 @@ sym(vp8_block_error_xmm):
         movdqa      xmm5,       [rsi+16]
 
         movdqa      xmm6,       [rdi+16]
-        pxor        xmm1,       xmm1    ; from movd xmm1, dc; dc=0
+        psubw       xmm3,       xmm4
 
-        movdqa      xmm2,       xmm7
         psubw       xmm5,       xmm6
-
-        por         xmm1,       xmm2
+        pmaddwd     xmm3,       xmm3
         pmaddwd     xmm5,       xmm5
 
-        pcmpeqw     xmm1,       xmm7
-        psubw       xmm3,       xmm4
+        paddd       xmm3,       xmm5
 
-        pand        xmm1,       xmm3
-        pmaddwd     xmm1,       xmm1
-
-        paddd       xmm1,       xmm5
-        movdqa      xmm0,       xmm1
+        pxor        xmm7,       xmm7
+        movdqa      xmm0,       xmm3
 
         punpckldq   xmm0,       xmm7
-        punpckhdq   xmm1,       xmm7
+        punpckhdq   xmm3,       xmm7
 
-        paddd       xmm0,       xmm1
-        movdqa      xmm1,       xmm0
+        paddd       xmm0,       xmm3
+        movdqa      xmm3,       xmm0
 
         psrldq      xmm0,       8
-        paddd       xmm0,       xmm1
+        paddd       xmm0,       xmm3
 
-        movd        rax,        xmm0
+        movq        rax,        xmm0
 
     pop rdi
     pop rsi
@@ -67,7 +59,6 @@ sym(vp8_block_error_xmm):
     pop         rbp
     ret
 
-
 ;int vp8_block_error_mmx(short *coeff_ptr,  short *dcoef_ptr)
 global sym(vp8_block_error_mmx)
 sym(vp8_block_error_mmx):
@@ -124,7 +115,7 @@ sym(vp8_block_error_mmx):
         psrlq       mm1,        32
         paddd       mm0,        mm1
 
-        movd        rax,        mm0
+        movq        rax,        mm0
 
     pop rdi
     pop rsi
@@ -201,7 +192,7 @@ mberror_loop_mmx:
         psrlq       mm2,        32
 
         paddd       mm0,        mm2
-        movd        rax,        mm0
+        movq        rax,        mm0
 
     pop rdi
     pop rsi
@@ -269,7 +260,7 @@ mberror_loop:
         psrldq      xmm0,       8
 
         paddd       xmm0,       xmm1
-        movd        rax,        xmm0
+        movq        rax,        xmm0
 
     pop rdi
     pop rsi
@@ -326,7 +317,7 @@ mbuverror_loop_mmx:
         psrlq           mm7,        32
 
         paddd           mm0,        mm7
-        movd            rax,        mm0
+        movq            rax,        mm0
 
     pop rdi
     pop rsi
@@ -383,7 +374,7 @@ mbuverror_loop:
         psrldq      xmm1,           8
         paddd       xmm1,           xmm2
 
-        movd            rax,            xmm1
+        movq            rax,            xmm1
 
     pop rdi
     pop rsi
diff --git a/vp8/encoder/x86/fwalsh_sse2.asm b/vp8/encoder/x86/fwalsh_sse2.asm
index 7d8620178..39439f0d8 100644
--- a/vp8/encoder/x86/fwalsh_sse2.asm
+++ b/vp8/encoder/x86/fwalsh_sse2.asm
@@ -1,10 +1,11 @@
 ;
-;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
 ;
 
 
@@ -16,102 +17,148 @@ sym(vp8_short_walsh4x4_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 3
+    SAVE_XMM
+    GET_GOT     rbx
     push        rsi
     push        rdi
     ; end prolog
 
-    mov     rsi, arg(0)
-    mov     rdi, arg(1)
-
-    movdqu    xmm4, [rsi + 0]       ;ip[4] ip[0]
-    movdqu    xmm0, [rsi + 16]      ;ip[12] ip[8]
-
-    pxor  xmm7, xmm7
-    ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    ; 13 12 11 10 03 02 01 00
-    ;
-    ; 33 32 31 30 23 22 21 20
-    ;
-    movdqa    xmm3, xmm4          ; 13 12 11 10 03 02 01 00
-    punpcklwd xmm4, xmm0          ; 23 03 22 02 21 01 20 00
-    punpckhwd xmm3, xmm0          ; 33 13 32 12 31 11 30 10
-    movdqa    xmm1, xmm4          ; 23 03 22 02 21 01 20 00
-    punpcklwd xmm4, xmm3          ; 31 21 11 01 30 20 10 00
-    punpckhwd xmm1, xmm3          ; 33 23 13 03 32 22 12 02
-    ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    pshufd    xmm2, xmm1, 4eh       ;ip[8] ip[12]
-    movdqa    xmm3, xmm4          ;ip[4] ip[0]
-
-    paddw   xmm4, xmm2          ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
-    psubw   xmm3, xmm2          ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
-
+    mov     rsi, arg(0)           ; input
+    mov     rdi, arg(1)           ; output
+    movsxd  rdx, dword ptr arg(2) ; pitch
+
+    ; first for loop
+    movq    xmm0, MMWORD PTR [rsi]           ; load input
+    movq    xmm1, MMWORD PTR [rsi + rdx]
+    lea     rsi,  [rsi + rdx*2]
+    movq    xmm2, MMWORD PTR [rsi]
+    movq    xmm3, MMWORD PTR [rsi + rdx]
+
+    punpcklwd xmm0,  xmm1
+    punpcklwd xmm2,  xmm3
+
+    movdqa    xmm1, xmm0
+    punpckldq xmm0, xmm2           ; ip[1] ip[0]
+    punpckhdq xmm1, xmm2           ; ip[3] ip[2]
+
+    movdqa    xmm2, xmm0
+    paddw     xmm0, xmm1
+    psubw     xmm2, xmm1
+
+    psllw     xmm0, 2              ; d1  a1
+    psllw     xmm2, 2              ; c1  b1
+
+    movdqa    xmm1, xmm0
+    punpcklqdq xmm0, xmm2          ; b1  a1
+    punpckhqdq xmm1, xmm2          ; c1  d1
+
+    pxor      xmm6, xmm6
+    movq      xmm6, xmm0
+    pxor      xmm7, xmm7
+    pcmpeqw   xmm7, xmm6
+    paddw     xmm7, [GLOBAL(c1)]
+
+    movdqa    xmm2, xmm0
+    paddw     xmm0, xmm1           ; b1+c1  a1+d1
+    psubw     xmm2, xmm1           ; b1-c1  a1-d1
+    paddw     xmm0, xmm7           ; b1+c1  a1+d1+(a1!=0)
+
+    ; second for loop
+    ; input: 13  9  5  1 12  8  4  0 (xmm0)
+    ;        14 10  6  2 15 11  7  3 (xmm2)
+    ; after shuffle:
+    ;        13  5  9  1 12  4  8  0 (xmm0)
+    ;        14  6 10  2 15  7 11  3 (xmm1)
+    pshuflw   xmm3, xmm0, 0xd8
+    pshufhw   xmm0, xmm3, 0xd8
+    pshuflw   xmm3, xmm2, 0xd8
+    pshufhw   xmm1, xmm3, 0xd8
+
+    movdqa    xmm2, xmm0
+    pmaddwd   xmm0, [GLOBAL(c1)]    ; d11 a11 d10 a10
+    pmaddwd   xmm2, [GLOBAL(cn1)]   ; c11 b11 c10 b10
+    movdqa    xmm3, xmm1
+    pmaddwd   xmm1, [GLOBAL(c1)]    ; d12 a12 d13 a13
+    pmaddwd   xmm3, [GLOBAL(cn1)]   ; c12 b12 c13 b13
+
+    pshufd    xmm4, xmm0, 0xd8      ; d11 d10 a11 a10
+    pshufd    xmm5, xmm2, 0xd8      ; c11 c10 b11 b10
+    pshufd    xmm6, xmm1, 0x72      ; d13 d12 a13 a12
+    pshufd    xmm7, xmm3, 0x72      ; c13 c12 b13 b12
+
+    movdqa    xmm0, xmm4
+    punpcklqdq xmm0, xmm5           ; b11 b10 a11 a10
+    punpckhqdq xmm4, xmm5           ; c11 c10 d11 d10
+    movdqa    xmm1, xmm6
+    punpcklqdq xmm1, xmm7           ; b13 b12 a13 a12
+    punpckhqdq xmm6, xmm7           ; c13 c12 d13 d12
+
+    movdqa    xmm2, xmm0
+    paddd     xmm0, xmm4            ; b21 b20 a21 a20
+    psubd     xmm2, xmm4            ; c21 c20 d21 d20
+    movdqa    xmm3, xmm1
+    paddd     xmm1, xmm6            ; b23 b22 a23 a22
+    psubd     xmm3, xmm6            ; c23 c22 d23 d22
+
+    pxor      xmm4, xmm4
     movdqa    xmm5, xmm4
-    punpcklqdq  xmm4, xmm3          ;d1 a1
-    punpckhqdq  xmm5, xmm3          ;c1 b1
-
-    movdqa    xmm1, xmm5          ;c1 b1
-    paddw   xmm5, xmm4          ;dl+cl a1+b1 aka op[4] op[0]
-    psubw   xmm4, xmm1          ;d1-c1 a1-b1 aka op[12] op[8]
-    ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    ; 13 12 11 10 03 02 01 00
-    ;
-    ; 33 32 31 30 23 22 21 20
-    ;
-    movdqa    xmm0, xmm5          ; 13 12 11 10 03 02 01 00
-    punpcklwd xmm5, xmm4          ; 23 03 22 02 21 01 20 00
-    punpckhwd xmm0, xmm4          ; 33 13 32 12 31 11 30 10
-    movdqa    xmm1, xmm5          ; 23 03 22 02 21 01 20 00
-    punpcklwd xmm5, xmm0          ; 31 21 11 01 30 20 10 00
-    punpckhwd xmm1, xmm0          ; 33 23 13 03 32 22 12 02
-    ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    pshufd    xmm2, xmm1, 4eh       ;ip[8] ip[12]
-    movdqa    xmm3, xmm5          ;ip[4] ip[0]
-
-    paddw   xmm5, xmm2          ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
-    psubw   xmm3, xmm2          ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
-
-    movdqa    xmm6, xmm5
-    punpcklqdq  xmm5, xmm3          ;d1 a1
-    punpckhqdq  xmm6, xmm3          ;c1 b1
-
-    movdqa    xmm1, xmm6          ;c1 b1
-    paddw   xmm6, xmm5          ;dl+cl a1+b1 aka op[4] op[0]
-    psubw   xmm5, xmm1          ;d1-c1 a1-b1 aka op[12] op[8]
-
-    movdqa    xmm0, xmm6          ;aka b2 a2
-    movdqa    xmm1, xmm5          ;aka d2 c2
-
-    pcmpgtw   xmm0, xmm7
-    pcmpgtw   xmm1, xmm7
-
-    psrlw   xmm0, 15
-    psrlw   xmm1, 15
-
-    paddw   xmm6, xmm0
-    paddw   xmm5, xmm1
-
-    psraw   xmm6, 1
-    psraw   xmm5, 1
-
-    ;   a2 = a1 + b1;
-    ;   b2 = c1 + d1;
-    ;   c2 = a1 - b1;
-    ;   d2 = d1 - c1;
-    ;        a2 += (a2>0);
-    ;        b2 += (b2>0);
-    ;        c2 += (c2>0);
-    ;        d2 += (d2>0);
-    ;   op[0] = (a2)>>1;
-    ;   op[4] = (b2)>>1;
-    ;   op[8] = (c2)>>1;
-    ;   op[12]= (d2)>>1;
-
-    movdqu  [rdi + 0], xmm6
-    movdqu  [rdi + 16], xmm5
+    pcmpgtd   xmm4, xmm0
+    pcmpgtd   xmm5, xmm2
+    pand      xmm4, [GLOBAL(cd1)]
+    pand      xmm5, [GLOBAL(cd1)]
+
+    pxor      xmm6, xmm6
+    movdqa    xmm7, xmm6
+    pcmpgtd   xmm6, xmm1
+    pcmpgtd   xmm7, xmm3
+    pand      xmm6, [GLOBAL(cd1)]
+    pand      xmm7, [GLOBAL(cd1)]
+
+    paddd     xmm0, xmm4
+    paddd     xmm2, xmm5
+    paddd     xmm0, [GLOBAL(cd3)]
+    paddd     xmm2, [GLOBAL(cd3)]
+    paddd     xmm1, xmm6
+    paddd     xmm3, xmm7
+    paddd     xmm1, [GLOBAL(cd3)]
+    paddd     xmm3, [GLOBAL(cd3)]
+
+    psrad     xmm0, 3
+    psrad     xmm1, 3
+    psrad     xmm2, 3
+    psrad     xmm3, 3
+    movdqa    xmm4, xmm0
+    punpcklqdq xmm0, xmm1           ; a23 a22 a21 a20
+    punpckhqdq xmm4, xmm1           ; b23 b22 b21 b20
+    movdqa    xmm5, xmm2
+    punpckhqdq xmm2, xmm3           ; c23 c22 c21 c20
+    punpcklqdq xmm5, xmm3           ; d23 d22 d21 d20
+
+    packssdw  xmm0, xmm4            ; b23 b22 b21 b20 a23 a22 a21 a20
+    packssdw  xmm2, xmm5            ; d23 d22 d21 d20 c23 c22 c21 c20
+
+    movdqa  XMMWORD PTR [rdi], xmm0
+    movdqa  XMMWORD PTR [rdi + 16], xmm2
 
     ; begin epilog
     pop rdi
     pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
+
+SECTION_RODATA
+align 16
+c1:
+    dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001
+align 16
+cn1:
+    dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff
+align 16
+cd1:
+    dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
+align 16
+cd3:
+    dd 0x00000003, 0x00000003, 0x00000003, 0x00000003
diff --git a/vp8/encoder/x86/mcomp_x86.h b/vp8/encoder/x86/mcomp_x86.h
index 5661491ad..3b7b29c21 100644
--- a/vp8/encoder/x86/mcomp_x86.h
+++ b/vp8/encoder/x86/mcomp_x86.h
@@ -1,10 +1,11 @@
 /*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
@@ -23,5 +24,14 @@
 #endif
 #endif
 
+#if HAVE_SSE4_1
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef  vp8_search_full_search
+#define vp8_search_full_search vp8_full_search_sadx8
+
+#endif
+#endif
+
 #endif
 
diff --git a/vp8/encoder/x86/preproc_mmx.c b/vp8/encoder/x86/preproc_mmx.c
index 69617ca47..a182c8856 100644
--- a/vp8/encoder/x86/preproc_mmx.c
+++ b/vp8/encoder/x86/preproc_mmx.c
@@ -1,10 +1,11 @@
 /*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
diff --git a/vp8/encoder/x86/quantize_mmx.asm b/vp8/encoder/x86/quantize_mmx.asm
index 847fc6e37..f29a54ecd 100644
--- a/vp8/encoder/x86/quantize_mmx.asm
+++ b/vp8/encoder/x86/quantize_mmx.asm
@@ -1,10 +1,11 @@
 ;
-;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
 ;
 
 
@@ -248,7 +249,7 @@ sym(vp8_fast_quantize_b_impl_mmx):
         paddd           mm0,        mm5
 
         ; eob adjustment begins here
-        movd            rcx,        mm0
+        movq            rcx,        mm0
         and             rcx,        0xffff
 
         xor             rdx,        rdx
@@ -261,7 +262,7 @@ sym(vp8_fast_quantize_b_impl_mmx):
         and             rax,        rdx
         ; Substitute the sse assembly for the old mmx mixed assembly/C. The
         ; following is kept as reference
-        ;    movd            rcx,        mm0
+        ;    movq            rcx,        mm0
         ;    bsr             rax,        rcx
         ;
         ;    mov             eob,        rax
@@ -283,156 +284,3 @@ sym(vp8_fast_quantize_b_impl_mmx):
     UNSHADOW_ARGS
     pop         rbp
     ret
-
-
-;int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr,
-;                           short *qcoeff_ptr,short *dequant_ptr,
-;                           short *scan_mask, short *round_ptr,
-;                           short *quant_ptr, short *dqcoeff_ptr);
-global sym(vp8_fast_quantize_b_impl_sse)
-sym(vp8_fast_quantize_b_impl_sse):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov             rsi,        arg(0) ;coeff_ptr
-        movdqa          xmm0,       [rsi]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movdqa          xmm1,       [rax]
-
-        movdqa          xmm3,       xmm0
-        psraw           xmm0,       15
-
-        pxor            xmm3,       xmm0
-        psubw           xmm3,       xmm0            ; abs
-
-        movdqa          xmm2,       xmm3
-        pcmpgtw         xmm1,       xmm2
-
-        pandn           xmm1,       xmm2
-        movdqa          xmm3,       xmm1
-
-        mov             rdx,        arg(6) ; quant_ptr
-        movdqa          xmm1,       [rdx]
-
-        mov             rcx,        arg(5) ; round_ptr
-        movdqa          xmm2,       [rcx]
-
-        paddw           xmm3,       xmm2
-        pmulhuw         xmm3,       xmm1
-
-        pxor            xmm3,       xmm0
-        psubw           xmm3,       xmm0        ;gain the sign back
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-        movdqa          xmm0,       xmm3
-
-        movdqa          [rdi],      xmm3
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movdqa          xmm2,       [rax]
-
-        pmullw          xmm3,       xmm2
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movdqa          [rax],      xmm3
-
-        ; next 8
-        movdqa          xmm4,       [rsi+16]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movdqa          xmm5,       [rax+16]
-
-        movdqa          xmm7,       xmm4
-        psraw           xmm4,       15
-
-        pxor            xmm7,       xmm4
-        psubw           xmm7,       xmm4            ; abs
-
-        movdqa          xmm6,       xmm7
-        pcmpgtw         xmm5,       xmm6
-
-        pandn           xmm5,       xmm6
-        movdqa          xmm7,       xmm5
-
-        movdqa          xmm5,       [rdx+16]
-        movdqa          xmm6,       [rcx+16]
-
-
-        paddw           xmm7,       xmm6
-        pmulhuw         xmm7,       xmm5
-
-        pxor            xmm7,       xmm4
-        psubw           xmm7,       xmm4;gain the sign back
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-
-        movdqa          xmm1,       xmm7
-        movdqa          [rdi+16],   xmm7
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movdqa          xmm6,       [rax+16]
-
-        pmullw          xmm7,       xmm6
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movdqa          [rax+16],   xmm7
-        mov             rdi,        arg(4) ;scan_mask
-
-        pxor            xmm7,       xmm7
-        movdqa          xmm2,       [rdi]
-
-        movdqa          xmm3,       [rdi+16];
-        pcmpeqw         xmm0,       xmm7
-
-        pcmpeqw         xmm1,       xmm7
-        pcmpeqw         xmm6,       xmm6
-
-        pxor            xmm0,       xmm6
-        pxor            xmm1,       xmm6
-
-        psrlw           xmm0,       15
-        psrlw           xmm1,       15
-
-        pmaddwd         xmm0,       xmm2
-        pmaddwd         xmm1,       xmm3
-
-        movq            xmm2,       xmm0
-        movq            xmm3,       xmm1
-
-        psrldq          xmm0,       8
-        psrldq          xmm1,       8
-
-        paddd           xmm0,       xmm1
-        paddd           xmm2,       xmm3
-
-        paddd           xmm0,       xmm2
-        movq            xmm1,       xmm0
-
-        psrldq          xmm0,       4
-        paddd           xmm1,       xmm0
-
-        movd            rcx,        xmm1
-        and             rcx,        0xffff
-
-        xor             rdx,        rdx
-        sub             rdx,        rcx
-
-        bsr             rax,        rcx
-        inc             rax
-
-        sar             rdx,        31
-        and             rax,        rdx
-
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm
new file mode 100644
index 000000000..1e0bd5c48
--- /dev/null
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -0,0 +1,389 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr,
+;               short *qcoeff_ptr,short *dequant_ptr,
+;               const int *default_zig_zag, short *round_ptr,
+;               short *quant_ptr, short *dqcoeff_ptr,
+;               unsigned short zbin_oq_value,
+;               short *zbin_boost_ptr);
+; Returns (last zig-zag index i stored into qcoeff_ptr) + 1, or 0 if none was stored.
+global sym(vp8_regular_quantize_b_impl_sse2)
+sym(vp8_regular_quantize_b_impl_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 10
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax                     ;scratch area below is accessed with movdqa
+
+    %define abs_minus_zbin_lo 0
+    %define abs_minus_zbin_hi 16
+    %define temp_qcoeff_lo 32
+    %define temp_qcoeff_hi 48
+    %define save_xmm6 64
+    %define save_xmm7 80
+    %define eob 96
+
+    %define vp8_regularquantizeb_stack_size eob + 16
+
+    sub         rsp, vp8_regularquantizeb_stack_size ;reserve the scratch area
+
+    movdqa      OWORD PTR[rsp + save_xmm6], xmm6 ;saved here, restored before return
+    movdqa      OWORD PTR[rsp + save_xmm7], xmm7 ;saved here, restored before return
+
+    mov         rdx, arg(0)                 ;coeff_ptr
+    mov         eax, arg(8)                 ;zbin_oq_value
+
+    mov         rcx, arg(1)                 ;zbin_ptr
+    movd        xmm7, eax                   ;zbin_oq_value into the low dword
+
+    movdqa      xmm0, OWORD PTR[rdx]        ;z, coefficients 0..7
+    movdqa      xmm4, OWORD PTR[rdx + 16]   ;z, coefficients 8..15
+
+    movdqa      xmm1, xmm0
+    movdqa      xmm5, xmm4
+
+    psraw       xmm0, 15                    ;sign of z (aka sz)
+    psraw       xmm4, 15                    ;sign of z (aka sz)
+
+    pxor        xmm1, xmm0                  ;z ^ sz
+    pxor        xmm5, xmm4                  ;z ^ sz
+
+    movdqa      xmm2, OWORD PTR[rcx]        ;load zbin_ptr
+    movdqa      xmm3, OWORD PTR[rcx + 16]   ;load zbin_ptr
+
+    pshuflw     xmm7, xmm7, 0               ;copy word 0 to the low 4 words
+    psubw       xmm1, xmm0                  ;x = abs(z)
+
+    punpcklwd   xmm7, xmm7                  ;duplicated zbin_oq_value
+    psubw       xmm5, xmm4                  ;x = abs(z)
+
+    paddw       xmm2, xmm7                  ;zbin_ptr + zbin_oq_value
+    paddw       xmm3, xmm7                  ;zbin_ptr + zbin_oq_value
+
+    psubw       xmm1, xmm2                  ;sub (zbin_ptr + zbin_oq_value)
+    psubw       xmm5, xmm3                  ;sub (zbin_ptr + zbin_oq_value)
+
+    mov         rdi, arg(5)                 ;round_ptr
+    mov         rsi, arg(6)                 ;quant_ptr
+
+    movdqa      OWORD PTR[rsp + abs_minus_zbin_lo], xmm1 ;kept for the scalar loop
+    movdqa      OWORD PTR[rsp + abs_minus_zbin_hi], xmm5 ;kept for the scalar loop
+
+    paddw       xmm1, xmm2                  ;add (zbin_ptr + zbin_oq_value) back
+    paddw       xmm5, xmm3                  ;add (zbin_ptr + zbin_oq_value) back
+
+    movdqa      xmm2, OWORD PTR[rdi]        ;round lo
+    movdqa      xmm3, OWORD PTR[rsi]        ;quant lo
+
+    movdqa      xmm6, OWORD PTR[rdi + 16]   ;round hi
+    movdqa      xmm7, OWORD PTR[rsi + 16]   ;quant hi
+
+    paddw       xmm1, xmm2                  ;x + round
+    paddw       xmm5, xmm6                  ;x + round
+
+    pmulhw      xmm1, xmm3                  ;y = high word of (x + round) * quant (signed)
+    pmulhw      xmm5, xmm7                  ;y = high word of (x + round) * quant (signed)
+
+    mov         rsi, arg(2)                 ;qcoeff_ptr
+    pxor        xmm6, xmm6                  ;zero, used to clear qcoeff
+
+    pxor        xmm1, xmm0                  ;y ^ sz
+    pxor        xmm5, xmm4                  ;y ^ sz
+
+    psubw       xmm1, xmm0                  ;(y ^ sz) - sz, sign of z applied
+    psubw       xmm5, xmm4                  ;(y ^ sz) - sz, sign of z applied
+
+    movdqa      OWORD PTR[rsp + temp_qcoeff_lo], xmm1 ;kept for the scalar loop
+    movdqa      OWORD PTR[rsp + temp_qcoeff_hi], xmm5 ;kept for the scalar loop
+
+    movdqa      OWORD PTR[rsi], xmm6        ;zero qcoeff
+    movdqa      OWORD PTR[rsi + 16], xmm6   ;zero qcoeff
+
+    xor         rax, rax                    ;i = 0
+    mov         rcx, -1
+
+    mov         [rsp + eob], rcx            ;eob = -1
+    mov         rsi, arg(9)                 ;zbin_boost_ptr
+
+    mov         rbx, arg(4)                 ;default_zig_zag
+
+rq_zigzag_loop:
+    movsxd      rcx, DWORD PTR[rbx + rax*4] ;now we have rc
+    movsx       edi, WORD PTR [rsi]         ;*zbin_boost_ptr aka zbin
+    lea         rsi, [rsi + 2]              ;zbin_boost_ptr++
+
+    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2] ;lo/hi are adjacent, rc indexes both
+
+    sub         edx, edi                    ;x - zbin
+    jl          rq_zigzag_1                 ;below the threshold, leave qcoeff[rc] at 0
+
+    mov         rdi, arg(2)                 ;qcoeff_ptr
+
+    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2] ;quantized value for rc
+
+    cmp         edx, 0
+    je          rq_zigzag_1                 ;quantized to zero, nothing to store
+
+    mov         WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]
+
+    mov         rsi, arg(9)                 ;reset zbin_boost_ptr to its start
+    mov         [rsp + eob], rax            ;eob = i
+
+rq_zigzag_1:
+    movsxd      rcx, DWORD PTR[rbx + rax*4 + 4] ;rc for i + 1
+    movsx       edi, WORD PTR [rsi]         ;*zbin_boost_ptr aka zbin
+    lea         rsi, [rsi + 2]              ;zbin_boost_ptr++
+
+    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
+    lea         rax, [rax + 1]              ;i++
+
+    sub         edx, edi                    ;x - zbin
+    jl          rq_zigzag_1a                ;below the threshold, leave qcoeff[rc] at 0
+
+    mov         rdi, arg(2)                 ;qcoeff_ptr
+
+    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2] ;quantized value for rc
+
+    cmp         edx, 0
+    je          rq_zigzag_1a                ;quantized to zero, nothing to store
+
+    mov         WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]
+
+    mov         rsi, arg(9)                 ;reset zbin_boost_ptr to its start
+    mov         [rsp + eob], rax            ;eob = i
+
+rq_zigzag_1a:
+    movsxd      rcx, DWORD PTR[rbx + rax*4 + 4] ;rc for i + 1
+    movsx       edi, WORD PTR [rsi]         ;*zbin_boost_ptr aka zbin
+    lea         rsi, [rsi + 2]              ;zbin_boost_ptr++
+
+    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
+    lea         rax, [rax + 1]              ;i++
+
+    sub         edx, edi                    ;x - zbin
+    jl          rq_zigzag_1b                ;below the threshold, leave qcoeff[rc] at 0
+
+    mov         rdi, arg(2)                 ;qcoeff_ptr
+
+    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2] ;quantized value for rc
+
+    cmp         edx, 0
+    je          rq_zigzag_1b                ;quantized to zero, nothing to store
+
+    mov         WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]
+
+    mov         rsi, arg(9)                 ;reset zbin_boost_ptr to its start
+    mov         [rsp + eob], rax            ;eob = i
+
+rq_zigzag_1b:
+    movsxd      rcx, DWORD PTR[rbx + rax*4 + 4] ;rc for i + 1
+    movsx       edi, WORD PTR [rsi]         ;*zbin_boost_ptr aka zbin
+    lea         rsi, [rsi + 2]              ;zbin_boost_ptr++
+
+    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
+    lea         rax, [rax + 1]              ;i++
+
+    sub         edx, edi                    ;x - zbin
+    jl          rq_zigzag_1c                ;below the threshold, leave qcoeff[rc] at 0
+
+    mov         rdi, arg(2)                 ;qcoeff_ptr
+
+    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2] ;quantized value for rc
+
+    cmp         edx, 0
+    je          rq_zigzag_1c                ;quantized to zero, nothing to store
+
+    mov         WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]
+
+    mov         rsi, arg(9)                 ;reset zbin_boost_ptr to its start
+    mov         [rsp + eob], rax            ;eob = i
+
+rq_zigzag_1c:
+    lea         rax, [rax + 1]              ;i++
+
+    cmp         rax, 16
+    jl          rq_zigzag_loop              ;4 positions per pass, 16 in total
+
+    mov         rdi, arg(2)                 ;qcoeff_ptr
+    mov         rcx, arg(3)                 ;dequant_ptr
+    mov         rsi, arg(7)                 ;dqcoeff_ptr
+
+    movdqa      xmm2, OWORD PTR[rdi]        ;final qcoeff lo
+    movdqa      xmm3, OWORD PTR[rdi + 16]   ;final qcoeff hi
+
+    movdqa      xmm0, OWORD PTR[rcx]        ;dequant lo
+    movdqa      xmm1, OWORD PTR[rcx + 16]   ;dequant hi
+
+    pmullw      xmm0, xmm2                  ;dqcoeff = qcoeff * dequant (low 16 bits)
+    pmullw      xmm1, xmm3                  ;dqcoeff = qcoeff * dequant (low 16 bits)
+
+    movdqa      OWORD PTR[rsi], xmm0        ;store dqcoeff
+    movdqa      OWORD PTR[rsi + 16], xmm1   ;store dqcoeff
+
+    mov         rax, [rsp + eob]            ;last stored index, or -1
+
+    movdqa      xmm6, OWORD PTR[rsp + save_xmm6]
+    movdqa      xmm7, OWORD PTR[rsp + save_xmm7]
+
+    add         rax, 1                      ;return eob + 1
+
+    add         rsp, vp8_regularquantizeb_stack_size
+    pop         rsp                         ;presumably the rsp saved by ALIGN_STACK (macro not shown here)
+
+    ; begin epilog
+    pop         rbx
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
+;                           short *qcoeff_ptr,short *dequant_ptr,
+;                           short *scan_mask, short *round_ptr,
+;                           short *quant_ptr, short *dqcoeff_ptr);
+global sym(vp8_fast_quantize_b_impl_sse2)
+sym(vp8_fast_quantize_b_impl_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    push        rsi
+    push        rdi
+    push        rbx                         ;NOTE(review): rbx is not referenced by the instructions below
+    ; end prolog
+
+    ALIGN_STACK 16, rax                     ;scratch area below is accessed with movdqa
+
+    %define save_xmm6  0
+    %define save_xmm7 16
+
+    %define vp8_fastquantizeb_stack_size save_xmm7 + 16
+
+    sub         rsp, vp8_fastquantizeb_stack_size ;reserve the scratch area
+
+    movdqa      XMMWORD PTR[rsp + save_xmm6], xmm6 ;saved here, restored before return
+    movdqa      XMMWORD PTR[rsp + save_xmm7], xmm7 ;saved here, restored before return
+
+    mov         rdx, arg(0)                 ;coeff_ptr
+    mov         rcx, arg(2)                 ;dequant_ptr
+    mov         rax, arg(3)                 ;scan_mask
+    mov         rdi, arg(4)                 ;round_ptr
+    mov         rsi, arg(5)                 ;quant_ptr
+
+    movdqa      xmm0, XMMWORD PTR[rdx]      ;z, coefficients 0..7
+    movdqa      xmm4, XMMWORD PTR[rdx + 16] ;z, coefficients 8..15
+
+    movdqa      xmm6, XMMWORD PTR[rdi]      ;round lo
+    movdqa      xmm7, XMMWORD PTR[rdi + 16] ;round hi
+
+    movdqa      xmm1, xmm0
+    movdqa      xmm5, xmm4
+
+    psraw       xmm0, 15                    ;sign of z (aka sz)
+    psraw       xmm4, 15                    ;sign of z (aka sz)
+
+    pxor        xmm1, xmm0                  ;z ^ sz
+    pxor        xmm5, xmm4                  ;z ^ sz
+    psubw       xmm1, xmm0                  ;x = abs(z)
+    psubw       xmm5, xmm4                  ;x = abs(z)
+
+    paddw       xmm1, xmm6                  ;x + round
+    paddw       xmm5, xmm7                  ;x + round
+
+    pmulhw      xmm1, XMMWORD PTR[rsi]      ;y = high word of (x + round) * quant (signed)
+    pmulhw      xmm5, XMMWORD PTR[rsi + 16] ;y = high word of (x + round) * quant (signed)
+
+    mov         rdi, arg(1)                 ;qcoeff_ptr
+    mov         rsi, arg(6)                 ;dqcoeff_ptr
+
+    movdqa      xmm6, XMMWORD PTR[rcx]      ;dequant lo
+    movdqa      xmm7, XMMWORD PTR[rcx + 16] ;dequant hi
+
+    pxor        xmm1, xmm0                  ;y ^ sz
+    pxor        xmm5, xmm4                  ;y ^ sz
+    psubw       xmm1, xmm0                  ;(y ^ sz) - sz, sign of z applied
+    psubw       xmm5, xmm4                  ;(y ^ sz) - sz, sign of z applied
+
+    movdqa      XMMWORD PTR[rdi], xmm1      ;store qcoeff
+    movdqa      XMMWORD PTR[rdi + 16], xmm5 ;store qcoeff
+
+    pmullw      xmm6, xmm1                  ;dqcoeff = qcoeff * dequant (low 16 bits)
+    pmullw      xmm7, xmm5                  ;dqcoeff = qcoeff * dequant (low 16 bits)
+
+    movdqa      xmm2, XMMWORD PTR[rax]      ;scan_mask lo
+    movdqa      xmm3, XMMWORD PTR[rax+16]; scan_mask hi
+
+    pxor        xmm4, xmm4            ;clear all bits
+    pcmpeqw     xmm1, xmm4            ;0xffff where qcoeff == 0
+    pcmpeqw     xmm5, xmm4            ;0xffff where qcoeff == 0
+
+    pcmpeqw     xmm4, xmm4            ;set all bits
+    pxor        xmm1, xmm4            ;invert: 0xffff where qcoeff != 0
+    pxor        xmm5, xmm4            ;invert: 0xffff where qcoeff != 0
+
+    psrlw       xmm1, 15              ;1 where qcoeff != 0, else 0
+    psrlw       xmm5, 15              ;1 where qcoeff != 0, else 0
+
+    pmaddwd     xmm1, xmm2            ;multiply by scan_mask, add adjacent pairs into dwords
+    pmaddwd     xmm5, xmm3            ;multiply by scan_mask, add adjacent pairs into dwords
+
+    movq        xmm2, xmm1            ;copy of the low qword
+    movq        xmm3, xmm5            ;copy of the low qword
+
+    psrldq      xmm1, 8               ;high qword moved down
+    psrldq      xmm5, 8               ;high qword moved down
+
+    paddd       xmm1, xmm5
+    paddd       xmm2, xmm3
+
+    paddd       xmm1, xmm2            ;dwords 0 and 1 now hold the partial sums
+    movq        xmm5, xmm1
+
+    psrldq      xmm1, 4
+    paddd       xmm5, xmm1            ;dword 0 = sum of all eight products' dwords
+
+    movq        rcx,  xmm5
+    and         rcx,  0xffff          ;keep the low 16 bits of the sum
+
+    xor         rdx,  rdx
+    sub         rdx,  rcx             ;rdx = -rcx
+
+    bsr         rax,  rcx             ;index of the highest set bit (undefined if rcx == 0)
+    inc         rax
+
+    sar         rdx,  31              ;all ones if rcx != 0, else 0
+    and         rax,  rdx             ;return value: 0 when no bit was set
+
+    movdqa      XMMWORD PTR[rsi], xmm6        ;store dqcoeff
+    movdqa      XMMWORD PTR[rsi + 16], xmm7   ;store dqcoeff
+
+    movdqa      xmm6, XMMWORD PTR[rsp + save_xmm6]
+    movdqa      xmm7, XMMWORD PTR[rsp + save_xmm7]
+
+    add         rsp, vp8_fastquantizeb_stack_size
+    pop         rsp                         ;presumably the rsp saved by ALIGN_STACK (macro not shown here)
+
+    ; begin epilog
+    pop         rbx
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/vp8/encoder/x86/quantize_ssse3.asm b/vp8/encoder/x86/quantize_ssse3.asm
new file mode 100755
index 000000000..2f33199e5
--- /dev/null
+++ b/vp8/encoder/x86/quantize_ssse3.asm
@@ -0,0 +1,115 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;int vp8_fast_quantize_b_impl_ssse3(short *coeff_ptr,
+;               short *qcoeff_ptr,short *dequant_ptr,
+;               short *round_ptr,
+;               short *quant_ptr, short *dqcoeff_ptr);
+; Returns (highest i with qcoeff[zz_shuf[i]] != 0) + 1, or 0 if every qcoeff is zero.
+global sym(vp8_fast_quantize_b_impl_ssse3)
+sym(vp8_fast_quantize_b_impl_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov         rdx, arg(0)                 ;coeff_ptr
+    mov         rdi, arg(3)                 ;round_ptr
+    mov         rsi, arg(4)                 ;quant_ptr
+
+    movdqa      xmm0, [rdx]                 ;z, coefficients 0..7
+    movdqa      xmm4, [rdx + 16]            ;z, coefficients 8..15
+
+    movdqa      xmm2, [rdi]                 ;round lo
+    movdqa      xmm3, [rdi + 16]            ;round hi
+
+    movdqa      xmm1, xmm0
+    movdqa      xmm5, xmm4
+
+    psraw       xmm0, 15                    ;sign of z (aka sz)
+    psraw       xmm4, 15                    ;sign of z (aka sz)
+
+    pabsw       xmm1, xmm1                  ;x = abs(z)
+    pabsw       xmm5, xmm5                  ;x = abs(z)
+
+    paddw       xmm1, xmm2                  ;x + round
+    paddw       xmm5, xmm3                  ;x + round
+
+    pmulhw      xmm1, [rsi]                 ;y = high word of (x + round) * quant (signed)
+    pmulhw      xmm5, [rsi + 16]            ;y = high word of (x + round) * quant (signed)
+
+    mov         rdi, arg(1)                 ;qcoeff_ptr
+    mov         rcx, arg(2)                 ;dequant_ptr
+    mov         rsi, arg(5)                 ;dqcoeff_ptr
+
+    pxor        xmm1, xmm0                  ;y ^ sz
+    pxor        xmm5, xmm4                  ;y ^ sz
+    psubw       xmm1, xmm0                  ;(y ^ sz) - sz, sign of z applied
+    psubw       xmm5, xmm4                  ;(y ^ sz) - sz, sign of z applied
+
+    movdqa      [rdi], xmm1                 ;store qcoeff
+    movdqa      [rdi + 16], xmm5            ;store qcoeff
+
+    movdqa      xmm2, [rcx]                 ;dequant lo
+    movdqa      xmm3, [rcx + 16]            ;dequant hi
+
+    pxor        xmm4, xmm4                  ;zero, for the compares below
+    pmullw      xmm2, xmm1                  ;dqcoeff = qcoeff * dequant (low 16 bits)
+    pmullw      xmm3, xmm5                  ;dqcoeff = qcoeff * dequant (low 16 bits)
+
+    pcmpeqw     xmm1, xmm4                  ;zero mask: 0xffff where qcoeff == 0
+    pcmpeqw     xmm5, xmm4                  ;zero mask: 0xffff where qcoeff == 0
+    packsswb    xmm1, xmm5                  ;16 word masks packed to 16 bytes (0xff or 0x00)
+    pshufb      xmm1, [ GLOBAL(zz_shuf)]    ;byte i = mask byte zz_shuf[i]
+
+    pmovmskb    edx, xmm1                   ;bit i set when qcoeff[zz_shuf[i]] == 0
+
+;    xor         ecx, ecx
+;    mov         eax, -1
+;find_eob_loop:
+;    shr         edx, 1
+;    jc          fq_skip
+;    mov         eax, ecx
+;fq_skip:
+;    inc         ecx
+;    cmp         ecx, 16
+;    jne         find_eob_loop
+    xor         rdi, rdi                    ;edi = 0
+    mov         eax, -1
+    xor         dx, ax                      ;flip the bits for bsr
+    bsr         eax, edx                    ;index of the highest set bit (undefined if edx == 0)
+
+    movdqa      [rsi], xmm2                 ;store dqcoeff
+    movdqa      [rsi + 16], xmm3            ;store dqcoeff
+
+    sub         edi, edx                    ;check for all zeros in bit mask
+    sar         edi, 31                     ;0 or -1
+    add         eax, 1
+    and         eax, edi                    ;if the bit mask was all zero,
+                                            ;then eob = 0
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+zz_shuf:
+    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
diff --git a/vp8/encoder/x86/quantize_x86.h b/vp8/encoder/x86/quantize_x86.h
new file mode 100644
index 000000000..b5b22c022
--- /dev/null
+++ b/vp8/encoder/x86/quantize_x86.h
@@ -0,0 +1,42 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef QUANTIZE_X86_H
+#define QUANTIZE_X86_H
+
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code.
+ */
+#if HAVE_MMX
+
+#endif /* HAVE_MMX */
+
+
+#if HAVE_SSE2
+extern prototype_quantize_block(vp8_regular_quantize_b_sse2);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+/* The sse2 quantizer has not been updated to match the new exact
+ * quantizer introduced in commit e04e2935, so this mapping stays disabled:
+ *#undef vp8_quantize_quantb
+ *#define vp8_quantize_quantb vp8_regular_quantize_b_sse2
+ */
+
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
+
+#endif /* HAVE_SSE2 */
+
+
+#endif /* QUANTIZE_X86_H */
diff --git a/vp8/encoder/x86/sad_mmx.asm b/vp8/encoder/x86/sad_mmx.asm
index a825698e7..85cb023a4 100644
--- a/vp8/encoder/x86/sad_mmx.asm
+++ b/vp8/encoder/x86/sad_mmx.asm
@@ -1,10 +1,11 @@
 ;
-;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
 ;
 
 
@@ -16,8 +17,6 @@ global sym(vp8_sad8x8_mmx)
 global sym(vp8_sad4x4_mmx)
 global sym(vp8_sad16x8_mmx)
 
-%idefine QWORD
-
 ;unsigned int vp8_sad16x16_mmx(
 ;    unsigned char *src_ptr,
 ;    int  src_stride,
@@ -99,7 +98,7 @@ x16x16sad_mmx_loop:
         psrlq           mm0,        32
         paddw           mm7,        mm0
 
-        movd            rax,        mm7
+        movq            rax,        mm7
 
     pop rdi
     pop rsi
@@ -171,7 +170,7 @@ x8x16sad_mmx_loop:
         psrlq           mm0,        32
 
         paddw           mm7,        mm0
-        movd            rax,        mm7
+        movq            rax,        mm7
 
     pop rdi
     pop rsi
@@ -241,7 +240,7 @@ x8x8sad_mmx_loop:
         psrlq           mm0,        32
 
         paddw           mm7,        mm0
-        movd            rax,        mm7
+        movq            rax,        mm7
 
     pop rdi
     pop rsi
@@ -271,11 +270,11 @@ sym(vp8_sad4x4_mmx):
         movsxd          rax,        dword ptr arg(1) ;src_stride
         movsxd          rdx,        dword ptr arg(3) ;ref_stride
 
-        movd            mm0,       QWORD PTR [rsi]
-        movd            mm1,       QWORD PTR [rdi]
+        movd            mm0,        DWORD PTR [rsi]
+        movd            mm1,        DWORD PTR [rdi]
 
-        movd            mm2,       QWORD PTR [rsi+rax]
-        movd            mm3,       QWORD PTR [rdi+rdx]
+        movd            mm2,        DWORD PTR [rsi+rax]
+        movd            mm3,        DWORD PTR [rdi+rdx]
 
         punpcklbw       mm0,        mm2
         punpcklbw       mm1,        mm3
@@ -297,11 +296,11 @@ sym(vp8_sad4x4_mmx):
         lea             rsi,        [rsi+rax*2]
         lea             rdi,        [rdi+rdx*2]
 
-        movd            mm4,       QWORD PTR [rsi]
-        movd            mm5,       QWORD PTR [rdi]
+        movd            mm4,        DWORD PTR [rsi]
+        movd            mm5,        DWORD PTR [rdi]
 
-        movd            mm6,       QWORD PTR [rsi+rax]
-        movd            mm7,       QWORD PTR [rdi+rdx]
+        movd            mm6,        DWORD PTR [rsi+rax]
+        movd            mm7,        DWORD PTR [rdi+rdx]
 
         punpcklbw       mm4,        mm6
         punpcklbw       mm5,        mm7
@@ -330,7 +329,7 @@ sym(vp8_sad4x4_mmx):
         psrlq           mm0,        32
         paddw           mm0,        mm1
 
-        movd            rax,        mm0
+        movq            rax,        mm0
 
     pop rdi
     pop rsi
@@ -417,7 +416,7 @@ x16x8sad_mmx_loop:
         psrlq           mm0,        32
 
         paddw           mm7,        mm0
-        movd            rax,        mm7
+        movq            rax,        mm7
 
     pop rdi
     pop rsi
diff --git a/vp8/encoder/x86/sad_sse2.asm b/vp8/encoder/x86/sad_sse2.asm
index 53240bbf1..39ed79604 100644
--- a/vp8/encoder/x86/sad_sse2.asm
+++ b/vp8/encoder/x86/sad_sse2.asm
@@ -1,17 +1,16 @@
 ;
-;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
 ;
 
 
 %include "vpx_ports/x86_abi_support.asm"
 
-%idefine QWORD
-
 ;unsigned int vp8_sad16x16_wmt(
 ;    unsigned char *src_ptr,
 ;    int  src_stride,
@@ -74,7 +73,7 @@ x16x16sad_wmt_loop:
         psrldq          xmm7,       8
 
         paddw           xmm0,       xmm7
-        movd            rax,        xmm0
+        movq            rax,        xmm0
 
     ; begin epilog
     pop rdi
@@ -112,7 +111,7 @@ sym(vp8_sad8x16_wmt):
 
 x8x16sad_wmt_loop:
 
-        movd            rax,        mm7
+        movq            rax,        mm7
         cmp             rax,        arg(4)
         jg              x8x16sad_wmt_early_exit
 
@@ -134,7 +133,7 @@ x8x16sad_wmt_loop:
         cmp             rsi,        rcx
         jne             x8x16sad_wmt_loop
 
-        movd            rax,        mm7
+        movq            rax,        mm7
 
 x8x16sad_wmt_early_exit:
 
@@ -173,7 +172,7 @@ sym(vp8_sad8x8_wmt):
 
 x8x8sad_wmt_loop:
 
-        movd            rax,        mm7
+        movq            rax,        mm7
         cmp             rax,        arg(4)
         jg              x8x8sad_wmt_early_exit
 
@@ -189,7 +188,7 @@ x8x8sad_wmt_loop:
         cmp             rsi,        rcx
         jne             x8x8sad_wmt_loop
 
-        movd            rax,        mm7
+        movq            rax,        mm7
 x8x8sad_wmt_early_exit:
 
     ; begin epilog
@@ -220,11 +219,11 @@ sym(vp8_sad4x4_wmt):
         movsxd          rax,        dword ptr arg(1) ;src_stride
         movsxd          rdx,        dword ptr arg(3) ;ref_stride
 
-        movd            mm0,       QWORD PTR [rsi]
-        movd            mm1,       QWORD PTR [rdi]
+        movd            mm0,        DWORD PTR [rsi]
+        movd            mm1,        DWORD PTR [rdi]
 
-        movd            mm2,       QWORD PTR [rsi+rax]
-        movd            mm3,       QWORD PTR [rdi+rdx]
+        movd            mm2,        DWORD PTR [rsi+rax]
+        movd            mm3,        DWORD PTR [rdi+rdx]
 
         punpcklbw       mm0,        mm2
         punpcklbw       mm1,        mm3
@@ -233,19 +232,19 @@ sym(vp8_sad4x4_wmt):
         lea             rsi,        [rsi+rax*2]
 
         lea             rdi,        [rdi+rdx*2]
-        movd            mm4,       QWORD PTR [rsi]
+        movd            mm4,        DWORD PTR [rsi]
 
-        movd            mm5,       QWORD PTR [rdi]
-        movd            mm6,       QWORD PTR [rsi+rax]
+        movd            mm5,        DWORD PTR [rdi]
+        movd            mm6,        DWORD PTR [rsi+rax]
 
-        movd            mm7,       QWORD PTR [rdi+rdx]
+        movd            mm7,        DWORD PTR [rdi+rdx]
         punpcklbw       mm4,        mm6
 
         punpcklbw       mm5,        mm7
         psadbw          mm4,        mm5
 
         paddw           mm0,        mm4
-        movd            rax,        mm0
+        movq            rax,        mm0
 
     ; begin epilog
     pop rdi
@@ -282,7 +281,7 @@ sym(vp8_sad16x8_wmt):
 
 x16x8sad_wmt_loop:
 
-        movd            rax,        mm7
+        movq            rax,        mm7
         cmp             rax,        arg(4)
         jg              x16x8sad_wmt_early_exit
 
@@ -316,7 +315,7 @@ x16x8sad_wmt_loop:
         cmp             rsi,        rcx
         jne             x16x8sad_wmt_loop
 
-        movd            rax,        mm7
+        movq            rax,        mm7
 
 x16x8sad_wmt_early_exit:
 
diff --git a/vp8/encoder/x86/sad_sse3.asm b/vp8/encoder/x86/sad_sse3.asm
index 38cc02957..1b7293c20 100644
--- a/vp8/encoder/x86/sad_sse3.asm
+++ b/vp8/encoder/x86/sad_sse3.asm
@@ -1,32 +1,31 @@
 ;
-;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
 ;
 
 
 %include "vpx_ports/x86_abi_support.asm"
 
-%idefine QWORD
-
 %macro PROCESS_16X2X3 1
 %if %1
-        movdqa          xmm0,       [rsi]
-        lddqu           xmm5,       [rdi]
-        lddqu           xmm6,       [rdi+1]
-        lddqu           xmm7,       [rdi+2]
+        movdqa          xmm0,       XMMWORD PTR [rsi]
+        lddqu           xmm5,       XMMWORD PTR [rdi]
+        lddqu           xmm6,       XMMWORD PTR [rdi+1]
+        lddqu           xmm7,       XMMWORD PTR [rdi+2]
 
         psadbw          xmm5,       xmm0
         psadbw          xmm6,       xmm0
         psadbw          xmm7,       xmm0
 %else
-        movdqa          xmm0,       [rsi]
-        lddqu           xmm1,       [rdi]
-        lddqu           xmm2,       [rdi+1]
-        lddqu           xmm3,       [rdi+2]
+        movdqa          xmm0,       XMMWORD PTR [rsi]
+        lddqu           xmm1,       XMMWORD PTR [rdi]
+        lddqu           xmm2,       XMMWORD PTR [rdi+1]
+        lddqu           xmm3,       XMMWORD PTR [rdi+2]
 
         psadbw          xmm1,       xmm0
         psadbw          xmm2,       xmm0
@@ -36,10 +35,10 @@
         paddw           xmm6,       xmm2
         paddw           xmm7,       xmm3
 %endif
-        movdqa          xmm0,       QWORD PTR [rsi+rax]
-        lddqu           xmm1,       QWORD PTR [rdi+rdx]
-        lddqu           xmm2,       QWORD PTR [rdi+rdx+1]
-        lddqu           xmm3,       QWORD PTR [rdi+rdx+2]
+        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
+        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]
+        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
+        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]
 
         lea             rsi,        [rsi+rax*2]
         lea             rdi,        [rdi+rdx*2]
@@ -55,19 +54,19 @@
 
 %macro PROCESS_8X2X3 1
 %if %1
-        movq            mm0,       [rsi]
-        movq            mm5,       [rdi]
-        movq            mm6,       [rdi+1]
-        movq            mm7,       [rdi+2]
+        movq            mm0,       QWORD PTR [rsi]
+        movq            mm5,       QWORD PTR [rdi]
+        movq            mm6,       QWORD PTR [rdi+1]
+        movq            mm7,       QWORD PTR [rdi+2]
 
         psadbw          mm5,       mm0
         psadbw          mm6,       mm0
         psadbw          mm7,       mm0
 %else
-        movq            mm0,       [rsi]
-        movq            mm1,       [rdi]
-        movq            mm2,       [rdi+1]
-        movq            mm3,       [rdi+2]
+        movq            mm0,       QWORD PTR [rsi]
+        movq            mm1,       QWORD PTR [rdi]
+        movq            mm2,       QWORD PTR [rdi+1]
+        movq            mm3,       QWORD PTR [rdi+2]
 
         psadbw          mm1,       mm0
         psadbw          mm2,       mm0
@@ -104,45 +103,45 @@
 
 %macro PROCESS_16X2X4 1
 %if %1
-        movdqa          xmm0,       [rsi]
-        lddqu           xmm4,       [rcx]
-        lddqu           xmm5,       [rdx]
-        lddqu           xmm6,       [rbx]
-        lddqu           xmm7,       [rdi]
+        movdqa          xmm0,       XMMWORD PTR [rsi]
+        lddqu           xmm4,       XMMWORD PTR [rcx]
+        lddqu           xmm5,       XMMWORD PTR [rdx]
+        lddqu           xmm6,       XMMWORD PTR [rbx]
+        lddqu           xmm7,       XMMWORD PTR [rdi]
 
         psadbw          xmm4,       xmm0
         psadbw          xmm5,       xmm0
         psadbw          xmm6,       xmm0
         psadbw          xmm7,       xmm0
 %else
-        movdqa          xmm0,       [rsi]
-        lddqu           xmm1,       [rcx]
-        lddqu           xmm2,       [rdx]
-        lddqu           xmm3,       [rbx]
+        movdqa          xmm0,       XMMWORD PTR [rsi]
+        lddqu           xmm1,       XMMWORD PTR [rcx]
+        lddqu           xmm2,       XMMWORD PTR [rdx]
+        lddqu           xmm3,       XMMWORD PTR [rbx]
 
         psadbw          xmm1,       xmm0
         psadbw          xmm2,       xmm0
         psadbw          xmm3,       xmm0
 
         paddw           xmm4,       xmm1
-        lddqu           xmm1,       [rdi]
+        lddqu           xmm1,       XMMWORD PTR [rdi]
         paddw           xmm5,       xmm2
         paddw           xmm6,       xmm3
 
         psadbw          xmm1,       xmm0
         paddw           xmm7,       xmm1
 %endif
-        movdqa          xmm0,       QWORD PTR [rsi+rax]
-        lddqu           xmm1,       QWORD PTR [rcx+rbp]
-        lddqu           xmm2,       QWORD PTR [rdx+rbp]
-        lddqu           xmm3,       QWORD PTR [rbx+rbp]
+        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
+        lddqu           xmm1,       XMMWORD PTR [rcx+rbp]
+        lddqu           xmm2,       XMMWORD PTR [rdx+rbp]
+        lddqu           xmm3,       XMMWORD PTR [rbx+rbp]
 
         psadbw          xmm1,       xmm0
         psadbw          xmm2,       xmm0
         psadbw          xmm3,       xmm0
 
         paddw           xmm4,       xmm1
-        lddqu           xmm1,       QWORD PTR [rdi+rbp]
+        lddqu           xmm1,       XMMWORD PTR [rdi+rbp]
         paddw           xmm5,       xmm2
         paddw           xmm6,       xmm3
 
@@ -161,28 +160,28 @@
 
 %macro PROCESS_8X2X4 1
 %if %1
-        movq            mm0,        [rsi]
-        movq            mm4,        [rcx]
-        movq            mm5,        [rdx]
-        movq            mm6,        [rbx]
-        movq            mm7,        [rdi]
+        movq            mm0,        QWORD PTR [rsi]
+        movq            mm4,        QWORD PTR [rcx]
+        movq            mm5,        QWORD PTR [rdx]
+        movq            mm6,        QWORD PTR [rbx]
+        movq            mm7,        QWORD PTR [rdi]
 
         psadbw          mm4,        mm0
         psadbw          mm5,        mm0
         psadbw          mm6,        mm0
         psadbw          mm7,        mm0
 %else
-        movq            mm0,        [rsi]
-        movq            mm1,        [rcx]
-        movq            mm2,        [rdx]
-        movq            mm3,        [rbx]
+        movq            mm0,        QWORD PTR [rsi]
+        movq            mm1,        QWORD PTR [rcx]
+        movq            mm2,        QWORD PTR [rdx]
+        movq            mm3,        QWORD PTR [rbx]
 
         psadbw          mm1,        mm0
         psadbw          mm2,        mm0
         psadbw          mm3,        mm0
 
         paddw           mm4,        mm1
-        movq            mm1,        [rdi]
+        movq            mm1,        QWORD PTR [rdi]
         paddw           mm5,        mm2
         paddw           mm6,        mm3
 
@@ -429,20 +428,20 @@ sym(vp8_sad4x4x3_sse3):
         movsxd          rax,        dword ptr arg(1) ;src_stride
         movsxd          rdx,        dword ptr arg(3) ;ref_stride
 
-        movd            mm0,        QWORD PTR [rsi]
-        movd            mm1,        QWORD PTR [rdi]
+        movd            mm0,        DWORD PTR [rsi]
+        movd            mm1,        DWORD PTR [rdi]
 
-        movd            mm2,        QWORD PTR [rsi+rax]
-        movd            mm3,        QWORD PTR [rdi+rdx]
+        movd            mm2,        DWORD PTR [rsi+rax]
+        movd            mm3,        DWORD PTR [rdi+rdx]
 
         punpcklbw       mm0,        mm2
         punpcklbw       mm1,        mm3
 
-        movd            mm4,        QWORD PTR [rdi+1]
-        movd            mm5,        QWORD PTR [rdi+2]
+        movd            mm4,        DWORD PTR [rdi+1]
+        movd            mm5,        DWORD PTR [rdi+2]
 
-        movd            mm2,        QWORD PTR [rdi+rdx+1]
-        movd            mm3,        QWORD PTR [rdi+rdx+2]
+        movd            mm2,        DWORD PTR [rdi+rdx+1]
+        movd            mm3,        DWORD PTR [rdi+rdx+2]
 
         psadbw          mm1,        mm0
 
@@ -457,24 +456,24 @@ sym(vp8_sad4x4x3_sse3):
         lea             rsi,        [rsi+rax*2]
         lea             rdi,        [rdi+rdx*2]
 
-        movd            mm0,        QWORD PTR [rsi]
-        movd            mm2,        QWORD PTR [rdi]
+        movd            mm0,        DWORD PTR [rsi]
+        movd            mm2,        DWORD PTR [rdi]
 
-        movd            mm3,        QWORD PTR [rsi+rax]
-        movd            mm6,        QWORD PTR [rdi+rdx]
+        movd            mm3,        DWORD PTR [rsi+rax]
+        movd            mm6,        DWORD PTR [rdi+rdx]
 
         punpcklbw       mm0,        mm3
         punpcklbw       mm2,        mm6
 
-        movd            mm3,        QWORD PTR [rdi+1]
-        movd            mm7,        QWORD PTR [rdi+2]
+        movd            mm3,        DWORD PTR [rdi+1]
+        movd            mm7,        DWORD PTR [rdi+2]
 
         psadbw          mm2,        mm0
 
         paddw           mm1,        mm2
 
-        movd            mm2,        QWORD PTR [rdi+rdx+1]
-        movd            mm6,        QWORD PTR [rdi+rdx+2]
+        movd            mm2,        DWORD PTR [rdi+rdx+1]
+        movd            mm6,        DWORD PTR [rdi+rdx+2]
 
         punpcklbw       mm3,        mm2
         punpcklbw       mm7,        mm6
@@ -529,7 +528,7 @@ sym(vp8_sad16x16_sse3):
 
 vp8_sad16x16_sse3_loop:
 
-        movd            rax,        mm7
+        movq            rax,        mm7
         cmp             rax,        arg(4)
         jg              vp8_sad16x16_early_exit
 
@@ -563,7 +562,7 @@ vp8_sad16x16_sse3_loop:
         cmp             rsi,        rcx
         jne             vp8_sad16x16_sse3_loop
 
-        movd            rax,        mm7
+        movq            rax,        mm7
 
 vp8_sad16x16_early_exit:
 
@@ -845,23 +844,23 @@ sym(vp8_sad4x4x4d_sse3):
 
         xchg            rbx,        rax
 
-        movd            mm0,        QWORD PTR [rsi]
-        movd            mm1,        QWORD PTR [rcx]
+        movd            mm0,        DWORD PTR [rsi]
+        movd            mm1,        DWORD PTR [rcx]
 
-        movd            mm2,        QWORD PTR [rsi+rax]
-        movd            mm3,        QWORD PTR [rcx+rbp]
+        movd            mm2,        DWORD PTR [rsi+rax]
+        movd            mm3,        DWORD PTR [rcx+rbp]
 
         punpcklbw       mm0,        mm2
         punpcklbw       mm1,        mm3
 
-        movd            mm4,        QWORD PTR [rdx]
-        movd            mm5,        QWORD PTR [rbx]
+        movd            mm4,        DWORD PTR [rdx]
+        movd            mm5,        DWORD PTR [rbx]
 
-        movd            mm6,        QWORD PTR [rdi]
-        movd            mm2,        QWORD PTR [rdx+rbp]
+        movd            mm6,        DWORD PTR [rdi]
+        movd            mm2,        DWORD PTR [rdx+rbp]
 
-        movd            mm3,        QWORD PTR [rbx+rbp]
-        movd            mm7,        QWORD PTR [rdi+rbp]
+        movd            mm3,        DWORD PTR [rbx+rbp]
+        movd            mm7,        DWORD PTR [rdi+rbp]
 
         psadbw          mm1,        mm0
 
@@ -884,17 +883,17 @@ sym(vp8_sad4x4x4d_sse3):
 
         lea             rdi,        [rdi+rbp*2]
 
-        movd            mm0,        QWORD PTR [rsi]
-        movd            mm2,        QWORD PTR [rcx]
+        movd            mm0,        DWORD PTR [rsi]
+        movd            mm2,        DWORD PTR [rcx]
 
-        movd            mm3,        QWORD PTR [rsi+rax]
-        movd            mm7,        QWORD PTR [rcx+rbp]
+        movd            mm3,        DWORD PTR [rsi+rax]
+        movd            mm7,        DWORD PTR [rcx+rbp]
 
         punpcklbw       mm0,        mm3
         punpcklbw       mm2,        mm7
 
-        movd            mm3,        QWORD PTR [rdx]
-        movd            mm7,        QWORD PTR [rbx]
+        movd            mm3,        DWORD PTR [rdx]
+        movd            mm7,        DWORD PTR [rbx]
 
         psadbw          mm2,        mm0
         mov             rax,        rbp
@@ -905,8 +904,8 @@ sym(vp8_sad4x4x4d_sse3):
         paddw           mm1,        mm2
         movd            [rsi],      mm1
 
-        movd            mm2,        QWORD PTR [rdx+rax]
-        movd            mm1,        QWORD PTR [rbx+rax]
+        movd            mm2,        DWORD PTR [rdx+rax]
+        movd            mm1,        DWORD PTR [rbx+rax]
 
         punpcklbw       mm3,        mm2
         punpcklbw       mm7,        mm1
@@ -914,8 +913,8 @@ sym(vp8_sad4x4x4d_sse3):
         psadbw          mm3,        mm0
         psadbw          mm7,        mm0
 
-        movd            mm2,        QWORD PTR [rdi]
-        movd            mm1,        QWORD PTR [rdi+rax]
+        movd            mm2,        DWORD PTR [rdi]
+        movd            mm1,        DWORD PTR [rdi+rax]
 
         paddw           mm3,        mm4
         paddw           mm7,        mm5
diff --git a/vp8/encoder/x86/sad_sse4.asm b/vp8/encoder/x86/sad_sse4.asm
new file mode 100644
index 000000000..21e2e5007
--- /dev/null
+++ b/vp8/encoder/x86/sad_sse4.asm
@@ -0,0 +1,353 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+; PROCESS_16X2X8: SAD of two 16-pixel rows at 8 consecutive ref offsets, summed into the 8 words of xmm1 (%1 != 0 starts a new sum).
+%macro PROCESS_16X2X8 1
+%if %1
+        movdqa          xmm0,       XMMWORD PTR [rsi]       ; 16 source pixels (aligned load)
+        movq            xmm1,       MMWORD PTR [rdi]        ; ref bytes 0-7
+        movq            xmm3,       MMWORD PTR [rdi+8]      ; ref bytes 8-15
+        movq            xmm2,       MMWORD PTR [rdi+16]     ; ref bytes 16-23
+        punpcklqdq      xmm1,       xmm3                    ; xmm1 = ref bytes 0-15
+        punpcklqdq      xmm3,       xmm2                    ; xmm3 = ref bytes 8-23
+
+        movdqa          xmm2,       xmm1                    ; second copy: mpsadbw overwrites its destination
+        mpsadbw         xmm1,       xmm0,  0x0              ; src bytes 0-3 vs ref windows starting at bytes 0..7
+        mpsadbw         xmm2,       xmm0,  0x5              ; src bytes 4-7 vs ref windows starting at bytes 4..11
+
+        psrldq          xmm0,       8                       ; move src bytes 8-15 into the low half
+
+        movdqa          xmm4,       xmm3
+        mpsadbw         xmm3,       xmm0,  0x0              ; src bytes 8-11 vs ref windows starting at bytes 8..15
+        mpsadbw         xmm4,       xmm0,  0x5              ; src bytes 12-15 vs ref windows starting at bytes 12..19
+
+        paddw           xmm1,       xmm2                    ; first row: xmm1 is written, not accumulated
+        paddw           xmm1,       xmm3
+        paddw           xmm1,       xmm4
+%else
+        movdqa          xmm0,       XMMWORD PTR [rsi]       ; 16 source pixels (aligned load)
+        movq            xmm5,       MMWORD PTR [rdi]        ; same as above, but built in xmm5 to keep xmm1 intact
+        movq            xmm3,       MMWORD PTR [rdi+8]
+        movq            xmm2,       MMWORD PTR [rdi+16]
+        punpcklqdq      xmm5,       xmm3                    ; xmm5 = ref bytes 0-15
+        punpcklqdq      xmm3,       xmm2                    ; xmm3 = ref bytes 8-23
+
+        movdqa          xmm2,       xmm5
+        mpsadbw         xmm5,       xmm0,  0x0
+        mpsadbw         xmm2,       xmm0,  0x5
+
+        psrldq          xmm0,       8
+
+        movdqa          xmm4,       xmm3
+        mpsadbw         xmm3,       xmm0,  0x0
+        mpsadbw         xmm4,       xmm0,  0x5
+
+        paddw           xmm5,       xmm2
+        paddw           xmm5,       xmm3
+        paddw           xmm5,       xmm4
+
+        paddw           xmm1,       xmm5                    ; accumulate this row into the running sums
+%endif
+        movdqa          xmm0,       XMMWORD PTR [rsi + rax] ; second row: src + src_stride
+        movq            xmm5,       MMWORD PTR [rdi+ rdx]   ; second row: ref + ref_stride
+        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
+        movq            xmm2,       MMWORD PTR [rdi+ rdx+16]
+        punpcklqdq      xmm5,       xmm3
+        punpcklqdq      xmm3,       xmm2
+
+        lea             rsi,        [rsi+rax*2]             ; advance src by two rows for the next invocation
+        lea             rdi,        [rdi+rdx*2]             ; advance ref by two rows for the next invocation
+
+        movdqa          xmm2,       xmm5
+        mpsadbw         xmm5,       xmm0,  0x0
+        mpsadbw         xmm2,       xmm0,  0x5
+
+        psrldq          xmm0,       8
+        movdqa          xmm4,       xmm3
+        mpsadbw         xmm3,       xmm0,  0x0
+        mpsadbw         xmm4,       xmm0,  0x5
+
+        paddw           xmm5,       xmm2
+        paddw           xmm5,       xmm3
+        paddw           xmm5,       xmm4
+
+        paddw           xmm1,       xmm5                    ; xmm1 word i = running SAD for ref offset i
+%endmacro
+; PROCESS_8X2X8: SAD of two 8-pixel rows at 8 consecutive ref offsets, summed into the 8 words of xmm1 (%1 != 0 starts a new sum).
+%macro PROCESS_8X2X8 1
+%if %1
+        movq            xmm0,       MMWORD PTR [rsi]        ; 8 source pixels
+        movq            xmm1,       MMWORD PTR [rdi]        ; ref bytes 0-7
+        movq            xmm3,       MMWORD PTR [rdi+8]      ; ref bytes 8-15
+        punpcklqdq      xmm1,       xmm3                    ; xmm1 = ref bytes 0-15
+
+        movdqa          xmm2,       xmm1                    ; second copy: mpsadbw overwrites its destination
+        mpsadbw         xmm1,       xmm0,  0x0              ; src bytes 0-3 vs ref windows starting at bytes 0..7
+        mpsadbw         xmm2,       xmm0,  0x5              ; src bytes 4-7 vs ref windows starting at bytes 4..11
+        paddw           xmm1,       xmm2                    ; first row: xmm1 is written, not accumulated
+%else
+        movq            xmm0,       MMWORD PTR [rsi]        ; 8 source pixels
+        movq            xmm5,       MMWORD PTR [rdi]        ; same as above, but built in xmm5 to keep xmm1 intact
+        movq            xmm3,       MMWORD PTR [rdi+8]
+        punpcklqdq      xmm5,       xmm3
+
+        movdqa          xmm2,       xmm5
+        mpsadbw         xmm5,       xmm0,  0x0
+        mpsadbw         xmm2,       xmm0,  0x5
+        paddw           xmm5,       xmm2
+
+        paddw           xmm1,       xmm5                    ; accumulate this row into the running sums
+%endif
+        movq            xmm0,       MMWORD PTR [rsi + rax]  ; second row: src + src_stride
+        movq            xmm5,       MMWORD PTR [rdi+ rdx]   ; second row: ref + ref_stride
+        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
+        punpcklqdq      xmm5,       xmm3
+
+        lea             rsi,        [rsi+rax*2]             ; advance src by two rows for the next invocation
+        lea             rdi,        [rdi+rdx*2]             ; advance ref by two rows for the next invocation
+
+        movdqa          xmm2,       xmm5
+        mpsadbw         xmm5,       xmm0,  0x0
+        mpsadbw         xmm2,       xmm0,  0x5
+        paddw           xmm5,       xmm2
+
+        paddw           xmm1,       xmm5                    ; xmm1 word i = running SAD for ref offset i
+%endmacro
+; PROCESS_4X2X8: SAD of two 4-pixel rows at 8 consecutive ref offsets, summed into the 8 words of xmm1 (%1 != 0 starts a new sum).
+%macro PROCESS_4X2X8 1
+%if %1
+        movd            xmm0,       [rsi]                   ; 4 source pixels (dword load)
+        movq            xmm1,       MMWORD PTR [rdi]        ; ref bytes 0-7
+        movq            xmm3,       MMWORD PTR [rdi+8]      ; ref bytes 8-15
+        punpcklqdq      xmm1,       xmm3                    ; xmm1 = ref bytes 0-15
+
+        mpsadbw         xmm1,       xmm0,  0x0              ; src bytes 0-3 vs ref windows starting at bytes 0..7
+%else
+        movd            xmm0,       [rsi]                   ; 4 source pixels (dword load)
+        movq            xmm5,       MMWORD PTR [rdi]        ; same as above, but built in xmm5 to keep xmm1 intact
+        movq            xmm3,       MMWORD PTR [rdi+8]
+        punpcklqdq      xmm5,       xmm3
+
+        mpsadbw         xmm5,       xmm0,  0x0
+
+        paddw           xmm1,       xmm5                    ; accumulate this row into the running sums
+%endif
+        movd            xmm0,       [rsi + rax]             ; second row: src + src_stride
+        movq            xmm5,       MMWORD PTR [rdi+ rdx]   ; second row: ref + ref_stride
+        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
+        punpcklqdq      xmm5,       xmm3
+
+        lea             rsi,        [rsi+rax*2]             ; advance src by two rows for the next invocation
+        lea             rdi,        [rdi+rdx*2]             ; advance ref by two rows for the next invocation
+
+        mpsadbw         xmm5,       xmm0,  0x0
+
+        paddw           xmm1,       xmm5                    ; xmm1 word i = running SAD for ref offset i
+%endmacro
+
+
+;void vp8_sad16x16x8_sse4(
+;    const unsigned char *src_ptr,
+;    int  src_stride,
+;    const unsigned char *ref_ptr,
+;    int  ref_stride,
+;    unsigned short *sad_array);
+global sym(vp8_sad16x16x8_sse4) ; 16x16 SAD at 8 consecutive ref offsets -> 8 words stored at sad_array
+sym(vp8_sad16x16x8_sse4):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5 ; macro from x86_abi_support.asm; presumably makes the 5 args reachable via arg(n)
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0)           ;src_ptr
+        mov             rdi,        arg(2)           ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        PROCESS_16X2X8 1 ; rows 0-1, starts the sums in xmm1
+        PROCESS_16X2X8 0 ; rows 2-3
+        PROCESS_16X2X8 0 ; rows 4-5
+        PROCESS_16X2X8 0 ; rows 6-7
+        PROCESS_16X2X8 0 ; rows 8-9
+        PROCESS_16X2X8 0 ; rows 10-11
+        PROCESS_16X2X8 0 ; rows 12-13
+        PROCESS_16X2X8 0 ; rows 14-15
+
+        mov             rdi,        arg(4)           ;Results
+        movdqu          XMMWORD PTR [rdi],    xmm1 ; unaligned store of the eight 16-bit sums
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_sad16x8x8_sse4(
+;    const unsigned char *src_ptr,
+;    int  src_stride,
+;    const unsigned char *ref_ptr,
+;    int  ref_stride,
+;    unsigned short *sad_array
+;);
+global sym(vp8_sad16x8x8_sse4) ; 16x8 SAD at 8 consecutive ref offsets -> 8 words stored at sad_array
+sym(vp8_sad16x8x8_sse4):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5 ; macro from x86_abi_support.asm; presumably makes the 5 args reachable via arg(n)
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0)           ;src_ptr
+        mov             rdi,        arg(2)           ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        PROCESS_16X2X8 1 ; rows 0-1, starts the sums in xmm1
+        PROCESS_16X2X8 0 ; rows 2-3
+        PROCESS_16X2X8 0 ; rows 4-5
+        PROCESS_16X2X8 0 ; rows 6-7
+
+        mov             rdi,        arg(4)           ;Results
+        movdqu          XMMWORD PTR [rdi],    xmm1 ; unaligned store of the eight 16-bit sums
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_sad8x8x8_sse4(
+;    const unsigned char *src_ptr,
+;    int  src_stride,
+;    const unsigned char *ref_ptr,
+;    int  ref_stride,
+;    unsigned short *sad_array
+;);
+global sym(vp8_sad8x8x8_sse4) ; 8x8 SAD at 8 consecutive ref offsets -> 8 words stored at sad_array
+sym(vp8_sad8x8x8_sse4):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5 ; macro from x86_abi_support.asm; presumably makes the 5 args reachable via arg(n)
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0)           ;src_ptr
+        mov             rdi,        arg(2)           ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        PROCESS_8X2X8 1 ; rows 0-1, starts the sums in xmm1
+        PROCESS_8X2X8 0 ; rows 2-3
+        PROCESS_8X2X8 0 ; rows 4-5
+        PROCESS_8X2X8 0 ; rows 6-7
+
+        mov             rdi,        arg(4)           ;Results
+        movdqu          XMMWORD PTR [rdi],    xmm1 ; unaligned store of the eight 16-bit sums
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_sad8x16x8_sse4(
+;    const unsigned char *src_ptr,
+;    int  src_stride,
+;    const unsigned char *ref_ptr,
+;    int  ref_stride,
+;    unsigned short *sad_array
+;);
+global sym(vp8_sad8x16x8_sse4) ; 8x16 SAD at 8 consecutive ref offsets -> 8 words stored at sad_array
+sym(vp8_sad8x16x8_sse4):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5 ; macro from x86_abi_support.asm; presumably makes the 5 args reachable via arg(n)
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0)           ;src_ptr
+        mov             rdi,        arg(2)           ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        PROCESS_8X2X8 1 ; rows 0-1, starts the sums in xmm1
+        PROCESS_8X2X8 0 ; rows 2-3
+        PROCESS_8X2X8 0 ; rows 4-5
+        PROCESS_8X2X8 0 ; rows 6-7
+        PROCESS_8X2X8 0 ; rows 8-9
+        PROCESS_8X2X8 0 ; rows 10-11
+        PROCESS_8X2X8 0 ; rows 12-13
+        PROCESS_8X2X8 0 ; rows 14-15
+        mov             rdi,        arg(4)           ;Results
+        movdqu          XMMWORD PTR [rdi],    xmm1 ; unaligned store of the eight 16-bit sums
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_sad4x4x8_sse4(
+;    const unsigned char *src_ptr,
+;    int  src_stride,
+;    const unsigned char *ref_ptr,
+;    int  ref_stride,
+;    unsigned short *sad_array
+;);
+global sym(vp8_sad4x4x8_sse4) ; 4x4 SAD at 8 consecutive ref offsets -> 8 words stored at sad_array
+sym(vp8_sad4x4x8_sse4):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5 ; macro from x86_abi_support.asm; presumably makes the 5 args reachable via arg(n)
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0)           ;src_ptr
+        mov             rdi,        arg(2)           ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        PROCESS_4X2X8 1 ; rows 0-1, starts the sums in xmm1
+        PROCESS_4X2X8 0 ; rows 2-3
+
+        mov             rdi,        arg(4)           ;Results
+        movdqu          XMMWORD PTR [rdi],    xmm1 ; unaligned store of the eight 16-bit sums
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+
+
diff --git a/vp8/encoder/x86/sad_ssse3.asm b/vp8/encoder/x86/sad_ssse3.asm
index 1bb956121..69c5eaedc 100644
--- a/vp8/encoder/x86/sad_ssse3.asm
+++ b/vp8/encoder/x86/sad_ssse3.asm
@@ -1,32 +1,31 @@
 ;
-;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
 ;
 
 
 %include "vpx_ports/x86_abi_support.asm"
 
-%idefine QWORD
-
 %macro PROCESS_16X2X3 1
 %if %1
-        movdqa          xmm0,       [rsi]
-        lddqu           xmm5,       [rdi]
-        lddqu           xmm6,       [rdi+1]
-        lddqu           xmm7,       [rdi+2]
+        movdqa          xmm0,       XMMWORD PTR [rsi]
+        lddqu           xmm5,       XMMWORD PTR [rdi]
+        lddqu           xmm6,       XMMWORD PTR [rdi+1]
+        lddqu           xmm7,       XMMWORD PTR [rdi+2]
 
         psadbw          xmm5,       xmm0
         psadbw          xmm6,       xmm0
         psadbw          xmm7,       xmm0
 %else
-        movdqa          xmm0,       [rsi]
-        lddqu           xmm1,       [rdi]
-        lddqu           xmm2,       [rdi+1]
-        lddqu           xmm3,       [rdi+2]
+        movdqa          xmm0,       XMMWORD PTR [rsi]
+        lddqu           xmm1,       XMMWORD PTR [rdi]
+        lddqu           xmm2,       XMMWORD PTR [rdi+1]
+        lddqu           xmm3,       XMMWORD PTR [rdi+2]
 
         psadbw          xmm1,       xmm0
         psadbw          xmm2,       xmm0
@@ -36,10 +35,10 @@
         paddw           xmm6,       xmm2
         paddw           xmm7,       xmm3
 %endif
-        movdqa          xmm0,       QWORD PTR [rsi+rax]
-        lddqu           xmm1,       QWORD PTR [rdi+rdx]
-        lddqu           xmm2,       QWORD PTR [rdi+rdx+1]
-        lddqu           xmm3,       QWORD PTR [rdi+rdx+2]
+        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
+        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]
+        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
+        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]
 
         lea             rsi,        [rsi+rax*2]
         lea             rdi,        [rdi+rdx*2]
@@ -55,9 +54,9 @@
 
 %macro PROCESS_16X2X3_OFFSET 2
 %if %1
-        movdqa          xmm0,       [rsi]
-        movdqa          xmm4,       [rdi]
-        movdqa          xmm7,       [rdi+16]
+        movdqa          xmm0,       XMMWORD PTR [rsi]
+        movdqa          xmm4,       XMMWORD PTR [rdi]
+        movdqa          xmm7,       XMMWORD PTR [rdi+16]
 
         movdqa          xmm5,       xmm7
         palignr         xmm5,       xmm4,       %2
@@ -71,9 +70,9 @@
         psadbw          xmm6,       xmm0
         psadbw          xmm7,       xmm0
 %else
-        movdqa          xmm0,       [rsi]
-        movdqa          xmm4,       [rdi]
-        movdqa          xmm3,       [rdi+16]
+        movdqa          xmm0,       XMMWORD PTR [rsi]
+        movdqa          xmm4,       XMMWORD PTR [rdi]
+        movdqa          xmm3,       XMMWORD PTR [rdi+16]
 
         movdqa          xmm1,       xmm3
         palignr         xmm1,       xmm4,       %2
@@ -91,9 +90,9 @@
         paddw           xmm6,       xmm2
         paddw           xmm7,       xmm3
 %endif
-        movdqa          xmm0,       QWORD PTR [rsi+rax]
-        movdqa          xmm4,       QWORD PTR [rdi+rdx]
-        movdqa          xmm3,       QWORD PTR [rdi+rdx+16]
+        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
+        movdqa          xmm4,       XMMWORD PTR [rdi+rdx]
+        movdqa          xmm3,       XMMWORD PTR [rdi+rdx+16]
 
         movdqa          xmm1,       xmm3
         palignr         xmm1,       xmm4,       %2
diff --git a/vp8/encoder/x86/subtract_mmx.asm b/vp8/encoder/x86/subtract_mmx.asm
index ce3e61066..a47e1f0d6 100644
--- a/vp8/encoder/x86/subtract_mmx.asm
+++ b/vp8/encoder/x86/subtract_mmx.asm
@@ -1,20 +1,21 @@
 ;
-;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
 ;
 
 
 %include "vpx_ports/x86_abi_support.asm"
 
 ;void vp8_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
-;                            unsigned short *diff, unsigned char *Predictor,
+;                            short *diff, unsigned char *Predictor,
 ;                            int pitch);
 global sym(vp8_subtract_b_mmx_impl)
-sym(vp8_subtract_b_mmx_impl)
+sym(vp8_subtract_b_mmx_impl):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 5
@@ -150,7 +151,7 @@ submby_loop:
 
 ;void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
 global sym(vp8_subtract_mbuv_mmx)
-sym(vp8_subtract_mbuv_mmx)
+sym(vp8_subtract_mbuv_mmx):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 5
diff --git a/vp8/encoder/x86/subtract_sse2.asm b/vp8/encoder/x86/subtract_sse2.asm
new file mode 100644
index 000000000..3fb23d097
--- /dev/null
+++ b/vp8/encoder/x86/subtract_sse2.asm
@@ -0,0 +1,356 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
+;                            short *diff, unsigned char *Predictor,
+;                            int pitch);
+global sym(vp8_subtract_b_sse2_impl)
+sym(vp8_subtract_b_sse2_impl):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+    ; 4 rows x 4 pixels: diff[row] = z[row] - Predictor[row], stored as 16-bit words
+        mov     rdi,        arg(2) ;diff
+        mov     rax,        arg(3) ;Predictor
+        mov     rsi,        arg(0) ;z
+        movsxd  rdx,        dword ptr arg(1);src_stride;
+        movsxd  rcx,        dword ptr arg(4);pitch (Predictor row step in bytes, diff row step in shorts)
+        pxor    mm7,        mm7             ; mm7 = 0 for byte->word zero-extension (MMX regs only; no emms in this routine)
+        ; row 0
+        movd    mm0,        [rsi]           ; 4 source bytes
+        movd    mm1,        [rax]           ; 4 predictor bytes
+        punpcklbw   mm0,    mm7
+        punpcklbw   mm1,    mm7
+        psubw   mm0,        mm1
+        movq    MMWORD PTR [rdi],      mm0
+        ; row 1
+        movd    mm0,        [rsi+rdx]
+        movd    mm1,        [rax+rcx]
+        punpcklbw   mm0,    mm7
+        punpcklbw   mm1,    mm7
+        psubw   mm0,        mm1
+        movq    MMWORD PTR [rdi+rcx*2], mm0 ; byte offset pitch*2 = one row of shorts
+        ; row 2
+        movd    mm0,        [rsi+rdx*2]
+        movd    mm1,        [rax+rcx*2]
+        punpcklbw   mm0,    mm7
+        punpcklbw   mm1,    mm7
+        psubw   mm0,        mm1
+        movq    MMWORD PTR [rdi+rcx*4], mm0
+        ; x3 is not an addressing scale, so rebase: rsi += 2*src_stride, rcx = 3*pitch
+        lea     rsi,        [rsi+rdx*2]
+        lea     rcx,        [rcx+rcx*2]
+        ; row 3
+        movd    mm0,        [rsi+rdx]       ; z + 3*src_stride
+        movd    mm1,        [rax+rcx]       ; Predictor + 3*pitch
+        punpcklbw   mm0,    mm7
+        punpcklbw   mm1,    mm7
+        psubw   mm0,        mm1
+        movq    MMWORD PTR [rdi+rcx*2], mm0 ; diff + 3*pitch shorts
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, unsigned char *pred, int stride)
+global sym(vp8_subtract_mby_sse2)
+sym(vp8_subtract_mby_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    SAVE_XMM
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+    ; 16x16 block: diff = src - pred as 16-bit words; all accesses are movdqa (16-byte aligned)
+            mov         rsi,            arg(1) ;src
+            mov         rdi,            arg(0) ;diff
+
+            mov         rax,            arg(2) ;pred
+            movsxd      rdx,            dword ptr arg(3) ;stride (src row step; pred rows are read 16 bytes apart)
+
+            mov         rcx,            8      ; 8 iterations, two 16-pixel lines each
+
+submby_loop:
+            movdqa      xmm0,           XMMWORD PTR [rsi]   ; src
+            movdqa      xmm1,           XMMWORD PTR [rax]   ; pred
+
+            movdqa      xmm2,           xmm0
+            psubb       xmm0,           xmm1            ; low 8 bits of src - pred (sign lost)
+
+            pxor        xmm1,           [GLOBAL(t80)]   ;convert to signed values
+            pxor        xmm2,           [GLOBAL(t80)]
+            pcmpgtb     xmm1,           xmm2            ; 0xFF where pred > src (negative difference), else 0
+
+            movdqa      xmm2,    xmm0
+            movdqa      xmm3,    xmm1
+            punpcklbw   xmm0,    xmm1            ; low 8 diffs: mask becomes the high byte (sign extension)
+            punpckhbw   xmm2,    xmm3            ; high 8 diffs, same sign extension
+
+            movdqa      XMMWORD PTR [rdi],   xmm0
+            movdqa      XMMWORD PTR [rdi +16], xmm2
+            ; second line of the pair, same sequence on xmm4-xmm7
+            movdqa      xmm4,           XMMWORD PTR [rsi + rdx]
+            movdqa      xmm5,           XMMWORD PTR [rax + 16]
+
+            movdqa      xmm6,           xmm4
+            psubb       xmm4,           xmm5
+
+            pxor        xmm5,           [GLOBAL(t80)]   ;convert to signed values
+            pxor        xmm6,           [GLOBAL(t80)]
+            pcmpgtb     xmm5,           xmm6            ; obtain sign information
+
+            movdqa      xmm6,    xmm4
+            movdqa      xmm7,    xmm5
+            punpcklbw   xmm4,    xmm5            ; put sign back to subtraction
+            punpckhbw   xmm6,    xmm7            ; put sign back to subtraction
+
+            movdqa      XMMWORD PTR [rdi +32], xmm4
+            movdqa      XMMWORD PTR [rdi +48], xmm6
+
+            add         rdi,            64              ; 2 lines x 16 shorts of output
+            add         rax,            32              ; 2 lines x 16 bytes of pred
+            lea         rsi,            [rsi+rdx*2]     ; 2 source lines
+
+            sub         rcx,            1
+            jnz         submby_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
+global sym(vp8_subtract_mbuv_sse2)
+sym(vp8_subtract_mbuv_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+    ; Two 8x8 blocks (U then V): diff = src - pred as 16-bit words, two 8-pixel lines per step
+            mov     rdi,        arg(0) ;diff
+            mov     rax,        arg(3) ;pred
+            mov     rsi,        arg(1) ;z = usrc
+            add     rdi,        256*2  ;diff = diff + 256 (shorts)
+            add     rax,        256    ;Predictor = pred + 256
+            movsxd  rdx,        dword ptr arg(4) ;stride;
+            lea     rcx,        [rdx + rdx*2]    ; rcx = 3*stride, addresses the odd line of pairs 2/3 and 6/7
+            ; each 16-byte pred load covers two 8-byte lines; src lines are gathered with movq + punpcklqdq
+            ;u
+            ;line 0 1
+            movq       xmm0,    MMWORD PTR [rsi]  ; src
+            movq       xmm2,    MMWORD PTR [rsi+rdx]
+            movdqa     xmm1,    XMMWORD PTR [rax]  ; pred
+            punpcklqdq xmm0,    xmm2
+
+            movdqa     xmm2,    xmm0
+            psubb      xmm0,    xmm1            ; subtraction with sign missed
+
+            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
+            pxor       xmm2,    [GLOBAL(t80)]
+            pcmpgtb    xmm1,    xmm2            ; obtain sign information
+
+            movdqa     xmm2,    xmm0
+            movdqa     xmm3,    xmm1
+            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
+            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
+
+            movdqa     XMMWORD PTR [rdi],   xmm0
+            movdqa     XMMWORD PTR [rdi +16],   xmm2
+
+            ;line 2 3
+            movq       xmm0,    MMWORD PTR [rsi+rdx*2]  ; src
+            movq       xmm2,    MMWORD PTR [rsi+rcx]
+            movdqa     xmm1,    XMMWORD PTR [rax+16]  ; pred
+            punpcklqdq xmm0,    xmm2
+
+            movdqa     xmm2,    xmm0
+            psubb      xmm0,    xmm1            ; subtraction with sign missed
+
+            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
+            pxor       xmm2,    [GLOBAL(t80)]
+            pcmpgtb    xmm1,    xmm2            ; obtain sign information
+
+            movdqa     xmm2,    xmm0
+            movdqa     xmm3,    xmm1
+            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
+            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
+
+            movdqa     XMMWORD PTR [rdi + 32],   xmm0
+            movdqa     XMMWORD PTR [rdi + 48],   xmm2
+
+            ;line 4 5
+            lea        rsi,     [rsi + rdx*4]   ; advance src by four lines
+
+            movq       xmm0,    MMWORD PTR [rsi]  ; src
+            movq       xmm2,    MMWORD PTR [rsi+rdx]
+            movdqa     xmm1,    XMMWORD PTR [rax + 32]  ; pred
+            punpcklqdq xmm0,    xmm2
+
+            movdqa     xmm2,    xmm0
+            psubb      xmm0,    xmm1            ; subtraction with sign missed
+
+            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
+            pxor       xmm2,    [GLOBAL(t80)]
+            pcmpgtb    xmm1,    xmm2            ; obtain sign information
+
+            movdqa     xmm2,    xmm0
+            movdqa     xmm3,    xmm1
+            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
+            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
+
+            movdqa     XMMWORD PTR [rdi + 64],   xmm0
+            movdqa     XMMWORD PTR [rdi + 80],   xmm2
+
+            ;line 6 7
+            movq       xmm0,    MMWORD PTR [rsi+rdx*2]  ; src
+            movq       xmm2,    MMWORD PTR [rsi+rcx]
+            movdqa     xmm1,    XMMWORD PTR [rax+ 48]  ; pred
+            punpcklqdq xmm0,    xmm2
+
+            movdqa     xmm2,    xmm0
+            psubb      xmm0,    xmm1            ; subtraction with sign missed
+
+            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
+            pxor       xmm2,    [GLOBAL(t80)]
+            pcmpgtb    xmm1,    xmm2            ; obtain sign information
+
+            movdqa     xmm2,    xmm0
+            movdqa     xmm3,    xmm1
+            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
+            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
+
+            movdqa     XMMWORD PTR [rdi + 96],   xmm0
+            movdqa     XMMWORD PTR [rdi + 112],  xmm2
+
+            ;v
+            mov     rsi,        arg(2) ;z = vsrc
+            add     rdi,        64*2  ;diff = diff + 320 (shorts)
+            add     rax,        64    ;Predictor = pred + 320
+
+            ;line 0 1
+            movq       xmm0,    MMWORD PTR [rsi]  ; src
+            movq       xmm2,    MMWORD PTR [rsi+rdx]
+            movdqa     xmm1,    XMMWORD PTR [rax]  ; pred
+            punpcklqdq xmm0,    xmm2
+
+            movdqa     xmm2,    xmm0
+            psubb      xmm0,    xmm1            ; subtraction with sign missed
+
+            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
+            pxor       xmm2,    [GLOBAL(t80)]
+            pcmpgtb    xmm1,    xmm2            ; obtain sign information
+
+            movdqa     xmm2,    xmm0
+            movdqa     xmm3,    xmm1
+            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
+            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
+
+            movdqa     XMMWORD PTR [rdi],   xmm0
+            movdqa     XMMWORD PTR [rdi +16],   xmm2
+
+            ;line 2 3
+            movq       xmm0,    MMWORD PTR [rsi+rdx*2]  ; src
+            movq       xmm2,    MMWORD PTR [rsi+rcx]
+            movdqa     xmm1,    XMMWORD PTR [rax+16]  ; pred
+            punpcklqdq xmm0,    xmm2
+
+            movdqa     xmm2,    xmm0
+            psubb      xmm0,    xmm1            ; subtraction with sign missed
+
+            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
+            pxor       xmm2,    [GLOBAL(t80)]
+            pcmpgtb    xmm1,    xmm2            ; obtain sign information
+
+            movdqa     xmm2,    xmm0
+            movdqa     xmm3,    xmm1
+            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
+            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
+
+            movdqa     XMMWORD PTR [rdi + 32],   xmm0
+            movdqa     XMMWORD PTR [rdi + 48],   xmm2
+
+            ;line 4 5
+            lea        rsi,     [rsi + rdx*4]   ; advance src by four lines
+
+            movq       xmm0,    MMWORD PTR [rsi]  ; src
+            movq       xmm2,    MMWORD PTR [rsi+rdx]
+            movdqa     xmm1,    XMMWORD PTR [rax + 32]  ; pred
+            punpcklqdq xmm0,    xmm2
+
+            movdqa     xmm2,    xmm0
+            psubb      xmm0,    xmm1            ; subtraction with sign missed
+
+            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
+            pxor       xmm2,    [GLOBAL(t80)]
+            pcmpgtb    xmm1,    xmm2            ; obtain sign information
+
+            movdqa     xmm2,    xmm0
+            movdqa     xmm3,    xmm1
+            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
+            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
+
+            movdqa     XMMWORD PTR [rdi + 64],   xmm0
+            movdqa     XMMWORD PTR [rdi + 80],   xmm2
+
+            ;line 6 7
+            movq       xmm0,    MMWORD PTR [rsi+rdx*2]  ; src
+            movq       xmm2,    MMWORD PTR [rsi+rcx]
+            movdqa     xmm1,    XMMWORD PTR [rax+ 48]  ; pred
+            punpcklqdq xmm0,    xmm2
+
+            movdqa     xmm2,    xmm0
+            psubb      xmm0,    xmm1            ; subtraction with sign missed
+
+            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
+            pxor       xmm2,    [GLOBAL(t80)]
+            pcmpgtb    xmm1,    xmm2            ; obtain sign information
+
+            movdqa     xmm2,    xmm0
+            movdqa     xmm3,    xmm1
+            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
+            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
+
+            movdqa     XMMWORD PTR [rdi + 96],   xmm0
+            movdqa     XMMWORD PTR [rdi + 112],  xmm2
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+t80:                        ; XOR mask: flips the top bit of each byte so pcmpgtb (signed) orders unsigned pixels
+    times 16 db 0x80
diff --git a/vp8/encoder/x86/variance_impl_mmx.asm b/vp8/encoder/x86/variance_impl_mmx.asm
index d0da82ad4..67a9b4d3e 100644
--- a/vp8/encoder/x86/variance_impl_mmx.asm
+++ b/vp8/encoder/x86/variance_impl_mmx.asm
@@ -1,10 +1,11 @@
 ;
-;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
 ;
 
 
@@ -497,7 +498,7 @@ sym(vp8_get4x4sse_cs_mmx):
         psrlq       mm7,    32
 
         paddd       mm0,    mm7
-        movd        rax,    mm0
+        movq        rax,    mm0
 
 
     ; begin epilog
@@ -555,7 +556,7 @@ sym(vp8_filter_block2d_bil4x4_var_mmx):
         pmullw          mm3,            [rax+8]             ;
 
         paddw           mm1,            mm3                 ;
-        paddw           mm1,            [mmx_bi_rd GLOBAL]  ;
+        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
 
         psraw           mm1,            mmx_filter_shift    ;
         movq            mm5,            mm1
@@ -579,7 +580,7 @@ filter_block2d_bil4x4_var_mmx_loop:
         pmullw          mm3,            [rax+8]             ;
 
         paddw           mm1,            mm3                 ;
-        paddw           mm1,            [mmx_bi_rd GLOBAL]  ;
+        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
 
         psraw           mm1,            mmx_filter_shift    ;
         movq            mm3,            mm5                 ;
@@ -591,7 +592,7 @@ filter_block2d_bil4x4_var_mmx_loop:
         paddw           mm1,            mm3                 ;
 
 
-        paddw           mm1,            [mmx_bi_rd GLOBAL]  ;
+        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
         psraw           mm1,            mmx_filter_shift    ;
 
         movd            mm3,            [rdi]               ;
@@ -709,10 +710,10 @@ sym(vp8_filter_block2d_bil_var_mmx):
         paddw           mm1,            mm3                 ;
 
         paddw           mm2,            mm4                 ;
-        paddw           mm1,            [mmx_bi_rd GLOBAL]  ;
+        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
 
         psraw           mm1,            mmx_filter_shift    ;
-        paddw           mm2,            [mmx_bi_rd GLOBAL]  ;
+        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
 
         psraw           mm2,            mmx_filter_shift    ;
         movq            mm5,            mm1
@@ -748,10 +749,10 @@ filter_block2d_bil_var_mmx_loop:
         paddw           mm1,            mm3                 ;
         paddw           mm2,            mm4                 ;
 
-        paddw           mm1,            [mmx_bi_rd GLOBAL]  ;
+        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
         psraw           mm1,            mmx_filter_shift    ;
 
-        paddw           mm2,            [mmx_bi_rd GLOBAL]  ;
+        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
         psraw           mm2,            mmx_filter_shift    ;
 
         movq            mm3,            mm5                 ;
@@ -772,8 +773,8 @@ filter_block2d_bil_var_mmx_loop:
         paddw           mm1,            mm3                 ;
         paddw           mm2,            mm4                 ;
 
-        paddw           mm1,            [mmx_bi_rd GLOBAL]  ;
-        paddw           mm2,            [mmx_bi_rd GLOBAL]  ;
+        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
+        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
 
         psraw           mm1,            mmx_filter_shift    ;
         psraw           mm2,            mmx_filter_shift    ;
diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm
index 7e5ee284b..cefa0a956 100644
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -1,10 +1,11 @@
 ;
-;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
 ;
 
 
@@ -57,7 +58,7 @@ NEXTROW:
         movdqa      xmm3,xmm4
         psrldq      xmm4,4
         paddd       xmm4,xmm3
-        movd        rax,xmm4
+        movq        rax,xmm4
 
 
     ; begin epilog
@@ -470,7 +471,7 @@ sym(vp8_get8x8var_sse2):
         mov         rax,            arg(5) ;[Sum]
         mov         rdi,            arg(4) ;[SSE]
 
-        movd        rdx,            xmm7
+        movq        rdx,            xmm7
         movsx       rcx,            dx
 
         mov  dword ptr [rax],       ecx
@@ -531,7 +532,7 @@ sym(vp8_filter_block2d_bil_var_sse2):
         pmullw          xmm3,           [rax+16]             ;
         paddw           xmm1,           xmm3                 ;
 
-        paddw           xmm1,           [xmm_bi_rd GLOBAL]   ;
+        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]  ;
         psraw           xmm1,           xmm_filter_shift    ;
 
         movdqa          xmm5,           xmm1
@@ -553,7 +554,7 @@ filter_block2d_bil_var_sse2_loop:
         pmullw          xmm3,           [rax+16]             ;
 
         paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           [xmm_bi_rd GLOBAL]   ;
+        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]  ;
 
         psraw           xmm1,           xmm_filter_shift    ;
         movdqa          xmm3,           xmm5                 ;
@@ -564,7 +565,7 @@ filter_block2d_bil_var_sse2_loop:
         pmullw          xmm1,           [rdx+16]             ;
         paddw           xmm1,           xmm3                 ;
 
-        paddw           xmm1,           [xmm_bi_rd GLOBAL]   ;
+        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]  ;
         psraw           xmm1,           xmm_filter_shift    ;
 
         movq            xmm3,           QWORD PTR [rdi]               ;
diff --git a/vp8/encoder/x86/variance_mmx.c b/vp8/encoder/x86/variance_mmx.c
index 4a5b25b0d..2df73a635 100644
--- a/vp8/encoder/x86/variance_mmx.c
+++ b/vp8/encoder/x86/variance_mmx.c
@@ -1,10 +1,11 @@
 /*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
@@ -14,7 +15,7 @@
 
 extern void filter_block1d_h6_mmx
 (
-    unsigned char *src_ptr,
+    const unsigned char *src_ptr,
     unsigned short *output_ptr,
     unsigned int src_pixels_per_line,
     unsigned int pixel_step,
@@ -24,7 +25,7 @@ extern void filter_block1d_h6_mmx
 );
 extern void filter_block1d_v6_mmx
 (
-    short *src_ptr,
+    const short *src_ptr,
     unsigned char *output_ptr,
     unsigned int pixels_per_line,
     unsigned int pixel_step,
@@ -36,34 +37,34 @@ extern void filter_block1d_v6_mmx
 extern unsigned int vp8_get_mb_ss_mmx(short *src_ptr);
 extern unsigned int vp8_get8x8var_mmx
 (
-    unsigned char *src_ptr,
+    const unsigned char *src_ptr,
     int  source_stride,
-    unsigned char *ref_ptr,
+    const unsigned char *ref_ptr,
     int  recon_stride,
     unsigned int *SSE,
     int *Sum
 );
 extern unsigned int vp8_get4x4var_mmx
 (
-    unsigned char *src_ptr,
+    const unsigned char *src_ptr,
     int  source_stride,
-    unsigned char *ref_ptr,
+    const unsigned char *ref_ptr,
     int  recon_stride,
     unsigned int *SSE,
     int *Sum
 );
 extern unsigned int vp8_get4x4sse_cs_mmx
 (
-    unsigned char *src_ptr,
+    const unsigned char *src_ptr,
     int  source_stride,
-    unsigned char *ref_ptr,
+    const unsigned char *ref_ptr,
     int  recon_stride
 );
 extern void vp8_filter_block2d_bil4x4_var_mmx
 (
-    unsigned char *ref_ptr,
+    const unsigned char *ref_ptr,
     int ref_pixels_per_line,
-    unsigned char *src_ptr,
+    const unsigned char *src_ptr,
     int src_pixels_per_line,
     const short *HFilter,
     const short *VFilter,
@@ -72,9 +73,9 @@ extern void vp8_filter_block2d_bil4x4_var_mmx
 );
 extern void vp8_filter_block2d_bil_var_mmx
 (
-    unsigned char *ref_ptr,
+    const unsigned char *ref_ptr,
     int ref_pixels_per_line,
-    unsigned char *src_ptr,
+    const unsigned char *src_ptr,
     int src_pixels_per_line,
     unsigned int Height,
     const short *HFilter,
@@ -125,9 +126,9 @@ void vp8_test_get_mb_ss(void)
 
 
 unsigned int vp8_get16x16var_mmx(
-    unsigned char *src_ptr,
+    const unsigned char *src_ptr,
     int  source_stride,
-    unsigned char *ref_ptr,
+    const unsigned char *ref_ptr,
     int  recon_stride,
     unsigned *SSE,
     unsigned *SUM
@@ -156,9 +157,9 @@ unsigned int vp8_get16x16var_mmx(
 
 
 unsigned int vp8_variance4x4_mmx(
-    unsigned char *src_ptr,
+    const unsigned char *src_ptr,
     int  source_stride,
-    unsigned char *ref_ptr,
+    const unsigned char *ref_ptr,
     int  recon_stride,
     unsigned int *sse)
 {
@@ -172,9 +173,9 @@ unsigned int vp8_variance4x4_mmx(
 }
 
 unsigned int vp8_variance8x8_mmx(
-    unsigned char *src_ptr,
+    const unsigned char *src_ptr,
     int  source_stride,
-    unsigned char *ref_ptr,
+    const unsigned char *ref_ptr,
     int  recon_stride,
     unsigned int *sse)
 {
@@ -189,9 +190,9 @@ unsigned int vp8_variance8x8_mmx(
 }
 
 unsigned int vp8_mse16x16_mmx(
-    unsigned char *src_ptr,
+    const unsigned char *src_ptr,
     int  source_stride,
-    unsigned char *ref_ptr,
+    const unsigned char *ref_ptr,
     int  recon_stride,
     unsigned int *sse)
 {
@@ -211,9 +212,9 @@ unsigned int vp8_mse16x16_mmx(
 
 
 unsigned int vp8_variance16x16_mmx(
-    unsigned char *src_ptr,
+    const unsigned char *src_ptr,
     int  source_stride,
-    unsigned char *ref_ptr,
+    const unsigned char *ref_ptr,
     int  recon_stride,
     int *sse)
 {
@@ -233,9 +234,9 @@ unsigned int vp8_variance16x16_mmx(
 }
 
 unsigned int vp8_variance16x8_mmx(
-    unsigned char *src_ptr,
+    const unsigned char *src_ptr,
     int  source_stride,
-    unsigned char *ref_ptr,
+    const unsigned char *ref_ptr,
     int  recon_stride,
     unsigned int *sse)
 {
@@ -254,9 +255,9 @@ unsigned int vp8_variance16x8_mmx(
 
 
 unsigned int vp8_variance8x16_mmx(
-    unsigned char *src_ptr,
+    const unsigned char *src_ptr,
     int  source_stride,
-    unsigned char *ref_ptr,
+    const unsigned char *ref_ptr,
     int  recon_stride,
     unsigned int *sse)
 {
@@ -295,11 +296,11 @@ DECLARE_ALIGNED(16, const short, vp8_vp7_bilinear_filters_mmx[8][8]) =
 
 unsigned int vp8_sub_pixel_variance4x4_mmx
 (
-    unsigned char  *src_ptr,
+    const unsigned char  *src_ptr,
     int  src_pixels_per_line,
     int  xoffset,
     int  yoffset,
-    unsigned char *dst_ptr,
+    const unsigned char *dst_ptr,
     int dst_pixels_per_line,
     unsigned int *sse)
 
@@ -319,11 +320,11 @@ unsigned int vp8_sub_pixel_variance4x4_mmx
 
 unsigned int vp8_sub_pixel_variance8x8_mmx
 (
-    unsigned char  *src_ptr,
+    const unsigned char  *src_ptr,
     int  src_pixels_per_line,
     int  xoffset,
     int  yoffset,
-    unsigned char *dst_ptr,
+    const unsigned char *dst_ptr,
     int dst_pixels_per_line,
     unsigned int *sse
 )
@@ -343,11 +344,11 @@ unsigned int vp8_sub_pixel_variance8x8_mmx
 
 unsigned int vp8_sub_pixel_variance16x16_mmx
 (
-    unsigned char  *src_ptr,
+    const unsigned char  *src_ptr,
     int  src_pixels_per_line,
     int  xoffset,
     int  yoffset,
-    unsigned char *dst_ptr,
+    const unsigned char *dst_ptr,
     int dst_pixels_per_line,
     unsigned int *sse
 )
@@ -382,11 +383,11 @@ unsigned int vp8_sub_pixel_variance16x16_mmx
 }
 
 unsigned int vp8_sub_pixel_mse16x16_mmx(
-    unsigned char  *src_ptr,
+    const unsigned char  *src_ptr,
     int  src_pixels_per_line,
     int  xoffset,
     int  yoffset,
-    unsigned char *dst_ptr,
+    const unsigned char *dst_ptr,
     int dst_pixels_per_line,
     unsigned int *sse
 )
@@ -397,11 +398,11 @@ unsigned int vp8_sub_pixel_mse16x16_mmx(
 
 unsigned int vp8_sub_pixel_variance16x8_mmx
 (
-    unsigned char  *src_ptr,
+    const unsigned char  *src_ptr,
     int  src_pixels_per_line,
     int  xoffset,
     int  yoffset,
-    unsigned char *dst_ptr,
+    const unsigned char *dst_ptr,
     int dst_pixels_per_line,
     unsigned int *sse
 )
@@ -434,11 +435,11 @@ unsigned int vp8_sub_pixel_variance16x8_mmx
 
 unsigned int vp8_sub_pixel_variance8x16_mmx
 (
-    unsigned char  *src_ptr,
+    const unsigned char  *src_ptr,
     int  src_pixels_per_line,
     int  xoffset,
     int  yoffset,
-    unsigned char *dst_ptr,
+    const unsigned char *dst_ptr,
     int dst_pixels_per_line,
     int *sse
 )
@@ -456,9 +457,9 @@ unsigned int vp8_sub_pixel_variance8x16_mmx
 }
 
 unsigned int vp8_i_variance16x16_mmx(
-    unsigned char *src_ptr,
+    const unsigned char *src_ptr,
     int  source_stride,
-    unsigned char *ref_ptr,
+    const unsigned char *ref_ptr,
     int  recon_stride,
     unsigned int *sse)
 {
@@ -479,9 +480,9 @@ unsigned int vp8_i_variance16x16_mmx(
 }
 
 unsigned int vp8_i_variance8x16_mmx(
-    unsigned char *src_ptr,
+    const unsigned char *src_ptr,
     int  source_stride,
-    unsigned char *ref_ptr,
+    const unsigned char *ref_ptr,
     int  recon_stride,
     unsigned int *sse)
 {
@@ -500,11 +501,11 @@ unsigned int vp8_i_variance8x16_mmx(
 
 unsigned int vp8_i_sub_pixel_variance16x16_mmx
 (
-    unsigned char  *src_ptr,
+    const unsigned char  *src_ptr,
     int  src_pixels_per_line,
     int  xoffset,
     int  yoffset,
-    unsigned char *dst_ptr,
+    const unsigned char *dst_ptr,
     int dst_pixels_per_line,
     unsigned int *sse
 )
@@ -559,11 +560,11 @@ unsigned int vp8_i_sub_pixel_variance16x16_mmx
 
 unsigned int vp8_i_sub_pixel_variance8x16_mmx
 (
-    unsigned char  *src_ptr,
+    const unsigned char  *src_ptr,
     int  src_pixels_per_line,
     int  xoffset,
     int  yoffset,
-    unsigned char *dst_ptr,
+    const unsigned char *dst_ptr,
     int dst_pixels_per_line,
     unsigned int *sse
 )
@@ -594,3 +595,39 @@ unsigned int vp8_i_sub_pixel_variance8x16_mmx
     *sse = xxsum0;
     return (xxsum0 - ((xsum0 * xsum0) >> 7));
 }
+
+
+unsigned int vp8_variance_halfpixvar16x16_h_mmx(
+    const unsigned char *src_ptr,
+    int  source_stride,
+    const unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *sse)
+{   /* Thin wrapper: 16x16 sub-pixel variance with xoffset = 4, yoffset = 0. */
+    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 0, /* 4 is presumably the half-pel step of 8 - confirm */
+                                           ref_ptr, recon_stride, sse);
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_v_mmx(
+    const unsigned char *src_ptr,
+    int  source_stride,
+    const unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *sse)
+{   /* Thin wrapper: 16x16 sub-pixel variance with xoffset = 0, yoffset = 4. */
+    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 4, /* 4 is presumably the half-pel step of 8 - confirm */
+                                           ref_ptr, recon_stride, sse);
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_hv_mmx(
+    const unsigned char *src_ptr,
+    int  source_stride,
+    const unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *sse)
+{   /* Thin wrapper: 16x16 sub-pixel variance with xoffset = 4, yoffset = 4. */
+    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 4, /* 4 is presumably the half-pel step of 8 - confirm */
+                                           ref_ptr, recon_stride, sse);
+}
diff --git a/vp8/encoder/x86/variance_sse2.c b/vp8/encoder/x86/variance_sse2.c
index ea80753bd..006e0a24a 100644
--- a/vp8/encoder/x86/variance_sse2.c
+++ b/vp8/encoder/x86/variance_sse2.c
@@ -1,10 +1,11 @@
 /*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
@@ -12,16 +13,16 @@
 #include "pragmas.h"
 #include "vpx_ports/mem.h"
 
-extern void filter_block1d_h6_mmx(unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d_v6_mmx(short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d8_h6_sse2(unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d8_v6_sse2(short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
 
 extern void vp8_filter_block2d_bil4x4_var_mmx
 (
-    unsigned char *ref_ptr,
+    const unsigned char *ref_ptr,
     int ref_pixels_per_line,
-    unsigned char *src_ptr,
+    const unsigned char *src_ptr,
     int src_pixels_per_line,
     const short *HFilter,
     const short *VFilter,
@@ -31,9 +32,9 @@ extern void vp8_filter_block2d_bil4x4_var_mmx
 
 extern unsigned int vp8_get4x4var_mmx
 (
-    unsigned char *src_ptr,
+    const unsigned char *src_ptr,
     int  source_stride,
-    unsigned char *ref_ptr,
+    const unsigned char *ref_ptr,
     int  recon_stride,
     unsigned int *SSE,
     int *Sum
@@ -41,38 +42,38 @@ extern unsigned int vp8_get4x4var_mmx
 
 unsigned int vp8_get_mb_ss_sse2
 (
-    short *src_ptr
+    const short *src_ptr
 );
 unsigned int vp8_get16x16var_sse2
 (
-    unsigned char     *src_ptr,
-    int             source_stride,
-    unsigned char     *ref_ptr,
-    int             recon_stride,
-    unsigned int      *SSE,
-    int               *Sum
+    const unsigned char *src_ptr,
+    int source_stride,
+    const unsigned char *ref_ptr,
+    int recon_stride,
+    unsigned int *SSE,
+    int *Sum
 );
 unsigned int vp8_get16x16pred_error_sse2
 (
-    unsigned char *src_ptr,
+    const unsigned char *src_ptr,
     int src_stride,
-    unsigned char *ref_ptr,
+    const unsigned char *ref_ptr,
     int ref_stride
 );
 unsigned int vp8_get8x8var_sse2
 (
-    unsigned char     *src_ptr,
-    int             source_stride,
-    unsigned char     *ref_ptr,
-    int             recon_stride,
-    unsigned int      *SSE,
-    int               *Sum
+    const unsigned char *src_ptr,
+    int source_stride,
+    const unsigned char *ref_ptr,
+    int recon_stride,
+    unsigned int *SSE,
+    int *Sum
 );
 void vp8_filter_block2d_bil_var_sse2
 (
-    unsigned char *ref_ptr,
+    const unsigned char *ref_ptr,
     int ref_pixels_per_line,
-    unsigned char *src_ptr,
+    const unsigned char *src_ptr,
     int src_pixels_per_line,
     unsigned int Height,
     const short *HFilter,
@@ -82,9 +83,9 @@ void vp8_filter_block2d_bil_var_sse2
 );
 void vp8_half_horiz_vert_variance16x_h_sse2
 (
-    unsigned char *ref_ptr,
+    const unsigned char *ref_ptr,
     int ref_pixels_per_line,
-    unsigned char *src_ptr,
+    const unsigned char *src_ptr,
     int src_pixels_per_line,
     unsigned int Height,
     int *sum,
@@ -92,9 +93,9 @@ void vp8_half_horiz_vert_variance16x_h_sse2
 );
 void vp8_half_horiz_variance16x_h_sse2
 (
-    unsigned char *ref_ptr,
+    const unsigned char *ref_ptr,
     int ref_pixels_per_line,
-    unsigned char *src_ptr,
+    const unsigned char *src_ptr,
     int src_pixels_per_line,
     unsigned int Height,
     int *sum,
@@ -102,9 +103,9 @@ void vp8_half_horiz_variance16x_h_sse2
 );
 void vp8_half_vert_variance16x_h_sse2
 (
-    unsigned char *ref_ptr,
+    const unsigned char *ref_ptr,
     int ref_pixels_per_line,
-    unsigned char *src_ptr,
+    const unsigned char *src_ptr,
     int src_pixels_per_line,
     unsigned int Height,
     int *sum,
@@ -114,9 +115,9 @@ void vp8_half_vert_variance16x_h_sse2
 DECLARE_ALIGNED(16, extern short, vp8_vp7_bilinear_filters_mmx[8][8]);
 
 unsigned int vp8_variance4x4_wmt(
-    unsigned char *src_ptr,
+    const unsigned char *src_ptr,
     int  source_stride,
-    unsigned char *ref_ptr,
+    const unsigned char *ref_ptr,
     int  recon_stride)
 {
     unsigned int var;
@@ -131,9 +132,9 @@ unsigned int vp8_variance4x4_wmt(
 
 unsigned int vp8_variance8x8_wmt
 (
-    unsigned char *src_ptr,
+    const unsigned char *src_ptr,
     int  source_stride,
-    unsigned char *ref_ptr,
+    const unsigned char *ref_ptr,
     int  recon_stride)
 {
     unsigned int var;
@@ -148,9 +149,9 @@ unsigned int vp8_variance8x8_wmt
 
 unsigned int vp8_variance16x16_wmt
 (
-    unsigned char *src_ptr,
+    const unsigned char *src_ptr,
     int  source_stride,
-    unsigned char *ref_ptr,
+    const unsigned char *ref_ptr,
     int  recon_stride,
     unsigned int *sse)
 {
@@ -163,9 +164,9 @@ unsigned int vp8_variance16x16_wmt
     return (sse0 - ((sum0 * sum0) >> 8));
 }
 unsigned int vp8_mse16x16_wmt(
-    unsigned char *src_ptr,
+    const unsigned char *src_ptr,
     int  source_stride,
-    unsigned char *ref_ptr,
+    const unsigned char *ref_ptr,
     int  recon_stride,
     unsigned int *sse)
 {
@@ -181,9 +182,9 @@ unsigned int vp8_mse16x16_wmt(
 
 unsigned int vp8_variance16x8_wmt
 (
-    unsigned char *src_ptr,
+    const unsigned char *src_ptr,
     int  source_stride,
-    unsigned char *ref_ptr,
+    const unsigned char *ref_ptr,
     int  recon_stride,
     unsigned int *sse)
 {
@@ -202,9 +203,9 @@ unsigned int vp8_variance16x8_wmt
 
 unsigned int vp8_variance8x16_wmt
 (
-    unsigned char *src_ptr,
+    const unsigned char *src_ptr,
     int  source_stride,
-    unsigned char *ref_ptr,
+    const unsigned char *ref_ptr,
     int  recon_stride,
     unsigned int *sse)
 {
@@ -238,11 +239,11 @@ DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_xmm[8][16]) =
 };
 unsigned int vp8_sub_pixel_variance4x4_wmt
 (
-    unsigned char  *src_ptr,
+    const unsigned char  *src_ptr,
     int  src_pixels_per_line,
     int  xoffset,
     int  yoffset,
-    unsigned char *dst_ptr,
+    const unsigned char *dst_ptr,
     int dst_pixels_per_line,
     unsigned int *sse
 )
@@ -262,11 +263,11 @@ unsigned int vp8_sub_pixel_variance4x4_wmt
 
 unsigned int vp8_sub_pixel_variance8x8_wmt
 (
-    unsigned char  *src_ptr,
+    const unsigned char  *src_ptr,
     int  src_pixels_per_line,
     int  xoffset,
     int  yoffset,
-    unsigned char *dst_ptr,
+    const unsigned char *dst_ptr,
     int dst_pixels_per_line,
     unsigned int *sse
 )
@@ -287,11 +288,11 @@ unsigned int vp8_sub_pixel_variance8x8_wmt
 
 unsigned int vp8_sub_pixel_variance16x16_wmt
 (
-    unsigned char  *src_ptr,
+    const unsigned char  *src_ptr,
     int  src_pixels_per_line,
     int  xoffset,
     int  yoffset,
-    unsigned char *dst_ptr,
+    const unsigned char *dst_ptr,
     int dst_pixels_per_line,
     unsigned int *sse
 )
@@ -363,11 +364,11 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
 }
 
 unsigned int vp8_sub_pixel_mse16x16_wmt(
-    unsigned char  *src_ptr,
+    const unsigned char  *src_ptr,
     int  src_pixels_per_line,
     int  xoffset,
     int  yoffset,
-    unsigned char *dst_ptr,
+    const unsigned char *dst_ptr,
     int dst_pixels_per_line,
     unsigned int *sse
 )
@@ -378,11 +379,11 @@ unsigned int vp8_sub_pixel_mse16x16_wmt(
 
 unsigned int vp8_sub_pixel_variance16x8_wmt
 (
-    unsigned char  *src_ptr,
+    const unsigned char  *src_ptr,
     int  src_pixels_per_line,
     int  xoffset,
     int  yoffset,
-    unsigned char *dst_ptr,
+    const unsigned char *dst_ptr,
     int dst_pixels_per_line,
     unsigned int *sse
 
@@ -416,11 +417,11 @@ unsigned int vp8_sub_pixel_variance16x8_wmt
 
 unsigned int vp8_sub_pixel_variance8x16_wmt
 (
-    unsigned char  *src_ptr,
+    const unsigned char  *src_ptr,
     int  src_pixels_per_line,
     int  xoffset,
     int  yoffset,
-    unsigned char *dst_ptr,
+    const unsigned char *dst_ptr,
     int dst_pixels_per_line,
     unsigned int *sse
 )
@@ -439,9 +440,9 @@ unsigned int vp8_sub_pixel_variance8x16_wmt
 }
 
 unsigned int vp8_i_variance16x16_wmt(
-    unsigned char *src_ptr,
+    const unsigned char *src_ptr,
     int  source_stride,
-    unsigned char *ref_ptr,
+    const unsigned char *ref_ptr,
     int  recon_stride,
     unsigned int *sse)
 {
@@ -463,9 +464,9 @@ unsigned int vp8_i_variance16x16_wmt(
 }
 
 unsigned int vp8_i_variance8x16_wmt(
-    unsigned char *src_ptr,
+    const unsigned char *src_ptr,
     int  source_stride,
-    unsigned char *ref_ptr,
+    const unsigned char *ref_ptr,
     int  recon_stride,
     unsigned int *sse)
 {
@@ -485,11 +486,11 @@ unsigned int vp8_i_variance8x16_wmt(
 
 unsigned int vp8_i_sub_pixel_variance16x16_wmt
 (
-    unsigned char  *src_ptr,
+    const unsigned char  *src_ptr,
     int  src_pixels_per_line,
     int  xoffset,
     int  yoffset,
-    unsigned char *dst_ptr,
+    const unsigned char *dst_ptr,
     int dst_pixels_per_line,
     unsigned int *sse
 )
@@ -500,11 +501,11 @@ unsigned int vp8_i_sub_pixel_variance16x16_wmt
 
 unsigned int vp8_i_sub_pixel_variance8x16_wmt
 (
-    unsigned char  *src_ptr,
+    const unsigned char  *src_ptr,
     int  src_pixels_per_line,
     int  xoffset,
     int  yoffset,
-    unsigned char *dst_ptr,
+    const unsigned char *dst_ptr,
     int dst_pixels_per_line,
     unsigned int *sse
 )
@@ -512,3 +513,84 @@ unsigned int vp8_i_sub_pixel_variance8x16_wmt
 
     return vp8_sub_pixel_variance8x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse);
 }
+
+
+/* 16x16 variance using vp8_half_horiz_variance16x_h_sse2: the kernel is run at byte offsets 0 and 8
+ * of both buffers (Height 16); the summed SSE is stored in *sse and SSE - (sum*sum >> 8) returned. */
+unsigned int vp8_variance_halfpixvar16x16_h_wmt(
+    const unsigned char *src_ptr,
+    int  src_pixels_per_line,
+    const unsigned char *dst_ptr,
+    int  dst_pixels_per_line,
+    unsigned int *sse)
+{
+    int xsum0, xsum1;
+    unsigned int xxsum0, xxsum1;
+
+    vp8_half_horiz_variance16x_h_sse2(src_ptr, src_pixels_per_line,
+                                      dst_ptr, dst_pixels_per_line, 16,
+                                      &xsum0, &xxsum0);
+    vp8_half_horiz_variance16x_h_sse2(src_ptr + 8, src_pixels_per_line,
+                                      dst_ptr + 8, dst_pixels_per_line, 16,
+                                      &xsum1, &xxsum1);
+
+    xsum0 += xsum1;
+    xxsum0 += xxsum1;
+    *sse = xxsum0;
+    /* Square in 64 bits: xsum0 * xsum0 overflows int (undefined behaviour) once |xsum0| > 46340. */
+    return (xxsum0 - (unsigned int)(((long long)xsum0 * xsum0) >> 8));
+}
+
+
+/* 16x16 variance using vp8_half_vert_variance16x_h_sse2: the kernel is run at byte offsets 0 and 8
+ * of both buffers (Height 16); the summed SSE is stored in *sse and SSE - (sum*sum >> 8) returned. */
+unsigned int vp8_variance_halfpixvar16x16_v_wmt(
+    const unsigned char *src_ptr,
+    int  src_pixels_per_line,
+    const unsigned char *dst_ptr,
+    int  dst_pixels_per_line,
+    unsigned int *sse)
+{
+    int xsum0, xsum1;
+    unsigned int xxsum0, xxsum1;
+
+    vp8_half_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
+                                     dst_ptr, dst_pixels_per_line, 16,
+                                     &xsum0, &xxsum0);
+    vp8_half_vert_variance16x_h_sse2(src_ptr + 8, src_pixels_per_line,
+                                     dst_ptr + 8, dst_pixels_per_line, 16,
+                                     &xsum1, &xxsum1);
+
+    xsum0 += xsum1;
+    xxsum0 += xxsum1;
+    *sse = xxsum0;
+    /* Square in 64 bits: xsum0 * xsum0 overflows int (undefined behaviour) once |xsum0| > 46340. */
+    return (xxsum0 - (unsigned int)(((long long)xsum0 * xsum0) >> 8));
+}
+
+
+/* 16x16 variance using vp8_half_horiz_vert_variance16x_h_sse2: the kernel is run at byte offsets 0
+ * and 8 of both buffers (Height 16); the summed SSE goes to *sse, SSE - (sum*sum >> 8) is returned. */
+unsigned int vp8_variance_halfpixvar16x16_hv_wmt(
+    const unsigned char *src_ptr,
+    int  src_pixels_per_line,
+    const unsigned char *dst_ptr,
+    int  dst_pixels_per_line,
+    unsigned int *sse)
+{
+    int xsum0, xsum1;
+    unsigned int xxsum0, xxsum1;
+
+    vp8_half_horiz_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
+                                           dst_ptr, dst_pixels_per_line, 16,
+                                           &xsum0, &xxsum0);
+    vp8_half_horiz_vert_variance16x_h_sse2(src_ptr + 8, src_pixels_per_line,
+                                           dst_ptr + 8, dst_pixels_per_line, 16,
+                                           &xsum1, &xxsum1);
+
+    xsum0 += xsum1;
+    xxsum0 += xxsum1;
+    *sse = xxsum0;
+    /* Square in 64 bits: xsum0 * xsum0 overflows int (undefined behaviour) once |xsum0| > 46340. */
+    return (xxsum0 - (unsigned int)(((long long)xsum0 * xsum0) >> 8));
+}
diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h
index 35fc90c48..6bea15ebc 100644
--- a/vp8/encoder/x86/variance_x86.h
+++ b/vp8/encoder/x86/variance_x86.h
@@ -1,10 +1,11 @@
 /*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
@@ -34,6 +35,9 @@ extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_mmx);
 extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_mmx);
 extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_mmx);
 extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_mmx);
+extern prototype_variance(vp8_variance_halfpixvar16x16_h_mmx);
+extern prototype_variance(vp8_variance_halfpixvar16x16_v_mmx);
+extern prototype_variance(vp8_variance_halfpixvar16x16_hv_mmx);
 extern prototype_subpixvariance(vp8_sub_pixel_mse16x16_mmx);
 extern prototype_getmbss(vp8_get_mb_ss_mmx);
 extern prototype_variance(vp8_mse16x16_mmx);
@@ -88,6 +92,15 @@ extern prototype_sad(vp8_get4x4sse_cs_mmx);
 #undef  vp8_variance_subpixvar16x16
 #define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_mmx
 
+#undef  vp8_variance_halfpixvar16x16_h
+#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_mmx
+
+#undef  vp8_variance_halfpixvar16x16_v
+#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_mmx
+
+#undef  vp8_variance_halfpixvar16x16_hv
+#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_mmx
+
 #undef  vp8_variance_subpixmse16x16
 #define vp8_variance_subpixmse16x16 vp8_sub_pixel_mse16x16_mmx
 
@@ -129,6 +142,9 @@ extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_wmt);
 extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_wmt);
 extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_wmt);
 extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_wmt);
+extern prototype_variance(vp8_variance_halfpixvar16x16_h_wmt);
+extern prototype_variance(vp8_variance_halfpixvar16x16_v_wmt);
+extern prototype_variance(vp8_variance_halfpixvar16x16_hv_wmt);
 extern prototype_subpixvariance(vp8_sub_pixel_mse16x16_wmt);
 extern prototype_getmbss(vp8_get_mb_ss_sse2);
 extern prototype_variance(vp8_mse16x16_wmt);
@@ -182,6 +198,15 @@ extern prototype_variance2(vp8_get16x16var_sse2);
 #undef  vp8_variance_subpixvar16x16
 #define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_wmt
 
+#undef  vp8_variance_halfpixvar16x16_h
+#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_wmt
+
+#undef  vp8_variance_halfpixvar16x16_v
+#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_wmt
+
+#undef  vp8_variance_halfpixvar16x16_hv
+#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_wmt
+
 #undef  vp8_variance_subpixmse16x16
 #define vp8_variance_subpixmse16x16 vp8_sub_pixel_mse16x16_wmt
 
@@ -240,7 +265,7 @@ extern prototype_sad_multi_dif_address(vp8_sad4x4x4d_sse3);
 #define vp8_variance_sad4x4x3 vp8_sad4x4x3_sse3
 
 #undef  vp8_variance_sad16x16x4d
-#define vp8_variance_sad16x16x4 vp8_sad16x16x4d_sse3
+#define vp8_variance_sad16x16x4d vp8_sad16x16x4d_sse3
 
 #undef  vp8_variance_sad16x8x4d
 #define vp8_variance_sad16x8x4d vp8_sad16x8x4d_sse3
@@ -272,4 +297,31 @@ extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3);
 #endif
 #endif
 
+
+#if HAVE_SSE4_1 /* prototypes for the x8 SAD functions named *_sse4 */
+extern prototype_sad_multi_same_address_1(vp8_sad16x16x8_sse4);
+extern prototype_sad_multi_same_address_1(vp8_sad16x8x8_sse4);
+extern prototype_sad_multi_same_address_1(vp8_sad8x16x8_sse4);
+extern prototype_sad_multi_same_address_1(vp8_sad8x8x8_sse4);
+extern prototype_sad_multi_same_address_1(vp8_sad4x4x8_sse4);
+
+#if !CONFIG_RUNTIME_CPU_DETECT /* no runtime detection: bind the generic names to the _sse4 versions */
+#undef  vp8_variance_sad16x16x8
+#define vp8_variance_sad16x16x8 vp8_sad16x16x8_sse4
+
+#undef  vp8_variance_sad16x8x8
+#define vp8_variance_sad16x8x8 vp8_sad16x8x8_sse4
+
+#undef  vp8_variance_sad8x16x8
+#define vp8_variance_sad8x16x8 vp8_sad8x16x8_sse4
+
+#undef  vp8_variance_sad8x8x8
+#define vp8_variance_sad8x8x8 vp8_sad8x8x8_sse4
+
+#undef  vp8_variance_sad4x4x8
+#define vp8_variance_sad4x4x8 vp8_sad4x4x8_sse4
+
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
+#endif /* HAVE_SSE4_1 */
+
 #endif
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index f1391ba8c..fb1b37ccb 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -1,10 +1,11 @@
 /*
- *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
@@ -17,15 +18,10 @@
 #if HAVE_MMX
 void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch)
 {
-    vp8_short_fdct4x4_mmx(input,   output,    pitch);
-    vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch);
+    vp8_short_fdct4x4_c(input,   output,    pitch); /* NOTE(review): _mmx wrapper now calls the C 4x4 fdct; confirm intended */
+    vp8_short_fdct4x4_c(input + 4, output + 16, pitch);
 }
 
-void vp8_fast_fdct8x4_mmx(short *input, short *output, int pitch)
-{
-    vp8_fast_fdct4x4_mmx(input,   output   , pitch);
-    vp8_fast_fdct4x4_mmx(input + 4, output + 16, pitch);
-}
 
 int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
                                  short *qcoeff_ptr, short *dequant_ptr,
@@ -33,14 +29,14 @@ int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
                                  short *quant_ptr, short *dqcoeff_ptr);
 void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d)
 {
-    short *scan_mask    = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;
-    short *coeff_ptr  = &b->coeff[0];
-    short *zbin_ptr   = &b->zbin[0][0];
-    short *round_ptr  = &b->round[0][0];
-    short *quant_ptr  = &b->quant[0][0];
-    short *qcoeff_ptr = d->qcoeff;
+    short *scan_mask   = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;
+    short *coeff_ptr   = b->coeff;
+    short *zbin_ptr    = b->zbin;
+    short *round_ptr   = b->round;
+    short *quant_ptr   = b->quant;
+    short *qcoeff_ptr  = d->qcoeff;
     short *dqcoeff_ptr = d->dqcoeff;
-    short *dequant_ptr = &d->dequant[0][0];
+    short *dequant_ptr = d->dequant;
 
     d->eob = vp8_fast_quantize_b_impl_mmx(
                  coeff_ptr,
@@ -86,30 +82,28 @@ void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
 #endif
 
 #if HAVE_SSE2
-void vp8_short_fdct8x4_wmt(short *input, short *output, int pitch)
+void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
 {
-    vp8_short_fdct4x4_wmt(input,   output,    pitch);
-    vp8_short_fdct4x4_wmt(input + 4, output + 16, pitch);
+    vp8_short_fdct4x4_sse2(input,   output,    pitch);
+    vp8_short_fdct4x4_sse2(input + 4, output + 16, pitch);
 }
 
-int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr,
+int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
                                  short *qcoeff_ptr, short *dequant_ptr,
                                  short *scan_mask, short *round_ptr,
                                  short *quant_ptr, short *dqcoeff_ptr);
-void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d)
+void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
 {
-    short *scan_mask    = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;
-    short *coeff_ptr  = &b->coeff[0];
-    short *zbin_ptr   = &b->zbin[0][0];
-    short *round_ptr  = &b->round[0][0];
-    short *quant_ptr  = &b->quant[0][0];
-    short *qcoeff_ptr = d->qcoeff;
+    short *scan_mask   = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;
+    short *coeff_ptr   = b->coeff;
+    short *round_ptr   = b->round;
+    short *quant_ptr   = b->quant;
+    short *qcoeff_ptr  = d->qcoeff;
     short *dqcoeff_ptr = d->dqcoeff;
-    short *dequant_ptr = &d->dequant[0][0];
+    short *dequant_ptr = d->dequant;
 
-    d->eob = vp8_fast_quantize_b_impl_sse(
+    d->eob = vp8_fast_quantize_b_impl_sse2(
                  coeff_ptr,
-                 zbin_ptr,
                  qcoeff_ptr,
                  dequant_ptr,
                  scan_mask,
@@ -120,6 +114,41 @@ void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d)
              );
 }
 
+
+int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr,
+                               short *qcoeff_ptr,short *dequant_ptr,
+                               const int *default_zig_zag, short *round_ptr,
+                               short *quant_ptr, short *dqcoeff_ptr,
+                               unsigned short zbin_oq_value,
+                               short *zbin_boost_ptr);
+
+/*
+ * SSE2 wrapper for the regular quantizer.
+ *
+ * Collects the coefficient, zbin, round, quant and zero-run boost tables from
+ * BLOCK and the qcoeff, dqcoeff and dequant buffers from BLOCKD, passes them to
+ * vp8_regular_quantize_b_impl_sse2 together with vp8_default_zig_zag1d, and
+ * stores the value the implementation returns in d->eob.
+ */
+void vp8_regular_quantize_b_sse2(BLOCK *b,BLOCKD *d)
+{
+    /* zbin_extra is narrowed to short here; the callee declares the matching
+     * parameter as unsigned short. */
+    short zbin_oq_value = b->zbin_extra;
+
+    d->eob = vp8_regular_quantize_b_impl_sse2(
+                 b->coeff,                /* coeff_ptr */
+                 b->zbin,                 /* zbin_ptr */
+                 d->qcoeff,               /* qcoeff_ptr */
+                 d->dequant,              /* dequant_ptr */
+                 vp8_default_zig_zag1d,   /* default_zig_zag */
+                 b->round,                /* round_ptr */
+                 b->quant,                /* quant_ptr */
+                 d->dqcoeff,              /* dqcoeff_ptr */
+                 zbin_oq_value,
+                 b->zrun_zbin_boost       /* zbin_boost_ptr */
+             );
+}
+
 int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
 int vp8_mbblock_error_xmm(MACROBLOCK *mb, int dc)
 {
@@ -136,8 +165,39 @@ int vp8_mbuverror_xmm(MACROBLOCK *mb)
     return vp8_mbuverror_xmm_impl(s_ptr, d_ptr);
 }
 
+void vp8_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
+                             short *diff, unsigned char *predictor,
+                             int pitch);
+/* Unpacks source pointer, stride, diff buffer and predictor from be/bd and forwards them to the SSE2 kernel. */
+void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)
+{
+    unsigned char *z = *(be->base_src) + be->src;
+    int src_stride = be->src_stride;   /* int, matching the _impl prototype (no unsigned round trip) */
+
+    vp8_subtract_b_sse2_impl(z, src_stride, be->src_diff, bd->predictor, pitch);
+}
+
+#endif
+
+#if HAVE_SSSE3
+int vp8_fast_quantize_b_impl_ssse3(short *coeff_ptr,
+                                 short *qcoeff_ptr, short *dequant_ptr,
+                                 short *round_ptr,
+                                 short *quant_ptr, short *dqcoeff_ptr);
+/* SSSE3 wrapper for the fast quantizer: hands the block's coeff/round/quant
+ * tables and the BLOCKD qcoeff/dequant/dqcoeff buffers to the implementation
+ * and records its return value in d->eob. */
+void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d)
+{
+    /* Argument order follows the _impl prototype declared just above. */
+    const int eob = vp8_fast_quantize_b_impl_ssse3(b->coeff, d->qcoeff, d->dequant,
+                                                   b->round, b->quant, d->dqcoeff);
+
+    d->eob = eob;
+}
 #endif
 
+
 void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
 {
 #if CONFIG_RUNTIME_CPU_DETECT
@@ -147,6 +207,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
     int wmt_enabled = flags & HAS_SSE2;
     int SSE3Enabled = flags & HAS_SSE3;
     int SSSE3Enabled = flags & HAS_SSSE3;
+    int SSE4_1Enabled = flags & HAS_SSE4_1;
 
     /* Note:
      *
@@ -157,7 +218,6 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
 
     /* Override default functions with fastest ones for this CPU. */
 #if HAVE_MMX
-
     if (mmx_enabled)
     {
         cpi->rtcd.variance.sad16x16              = vp8_sad16x16_mmx;
@@ -177,6 +237,9 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
         cpi->rtcd.variance.subpixvar8x16         = vp8_sub_pixel_variance8x16_mmx;
         cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_mmx;
         cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_mmx;
+        cpi->rtcd.variance.halfpixvar16x16_h     = vp8_variance_halfpixvar16x16_h_mmx;
+        cpi->rtcd.variance.halfpixvar16x16_v     = vp8_variance_halfpixvar16x16_v_mmx;
+        cpi->rtcd.variance.halfpixvar16x16_hv    = vp8_variance_halfpixvar16x16_hv_mmx;
         cpi->rtcd.variance.subpixmse16x16        = vp8_sub_pixel_mse16x16_mmx;
 
         cpi->rtcd.variance.mse16x16              = vp8_mse16x16_mmx;
@@ -186,11 +249,19 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
         cpi->rtcd.variance.get8x8var             = vp8_get8x8var_mmx;
         cpi->rtcd.variance.get16x16var           = vp8_get16x16var_mmx;
         cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_mmx;
-
+#if 0 // new fdct
         cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_mmx;
         cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_mmx;
-        cpi->rtcd.fdct.fast4x4                   = vp8_fast_fdct4x4_mmx;
-        cpi->rtcd.fdct.fast8x4                   = vp8_fast_fdct8x4_mmx;
+        cpi->rtcd.fdct.fast4x4                   = vp8_short_fdct4x4_mmx;
+        cpi->rtcd.fdct.fast8x4                   = vp8_short_fdct8x4_mmx;
+#else
+        cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_c;
+        cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_c;
+        cpi->rtcd.fdct.fast4x4                   = vp8_short_fdct4x4_c;
+        cpi->rtcd.fdct.fast8x4                   = vp8_short_fdct8x4_c;
+
+#endif
+
         cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_c;
 
         cpi->rtcd.encodemb.berr                  = vp8_block_error_mmx;
@@ -200,12 +271,11 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
         cpi->rtcd.encodemb.submby                = vp8_subtract_mby_mmx;
         cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_mmx;
 
-        cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_mmx;
+        /*cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_mmx;*/
     }
-
 #endif
-#if HAVE_SSE2
 
+#if HAVE_SSE2
     if (wmt_enabled)
     {
         cpi->rtcd.variance.sad16x16              = vp8_sad16x16_wmt;
@@ -225,6 +295,9 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
         cpi->rtcd.variance.subpixvar8x16         = vp8_sub_pixel_variance8x16_wmt;
         cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_wmt;
         cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_wmt;
+        cpi->rtcd.variance.halfpixvar16x16_h     = vp8_variance_halfpixvar16x16_h_wmt;
+        cpi->rtcd.variance.halfpixvar16x16_v     = vp8_variance_halfpixvar16x16_v_wmt;
+        cpi->rtcd.variance.halfpixvar16x16_hv    = vp8_variance_halfpixvar16x16_hv_wmt;
         cpi->rtcd.variance.subpixmse16x16        = vp8_sub_pixel_mse16x16_wmt;
 
         cpi->rtcd.variance.mse16x16              = vp8_mse16x16_wmt;
@@ -235,26 +308,26 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
         cpi->rtcd.variance.get16x16var           = vp8_get16x16var_sse2;
         /* cpi->rtcd.variance.get4x4sse_cs  not implemented for wmt */;
 
-#if 0
-        /* short SSE2 DCT currently disabled, does not match the MMX version */
-        cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_wmt;
-        cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_wmt;
-#endif
-        /* cpi->rtcd.fdct.fast4x4  not implemented for wmt */;
-        cpi->rtcd.fdct.fast8x4                   = vp8_fast_fdct8x4_wmt;
-        cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_sse2;
+        cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_sse2;
+        cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_sse2;
+        cpi->rtcd.fdct.fast4x4                   = vp8_short_fdct4x4_sse2;
+        cpi->rtcd.fdct.fast8x4                   = vp8_short_fdct8x4_sse2;
+
+        cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_sse2 ;
 
         cpi->rtcd.encodemb.berr                  = vp8_block_error_xmm;
         cpi->rtcd.encodemb.mberr                 = vp8_mbblock_error_xmm;
         cpi->rtcd.encodemb.mbuverr               = vp8_mbuverror_xmm;
-        /* cpi->rtcd.encodemb.sub* not implemented for wmt */
+        cpi->rtcd.encodemb.subb                  = vp8_subtract_b_sse2;
+        cpi->rtcd.encodemb.submby                = vp8_subtract_mby_sse2;
+        cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_sse2;
 
-        cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_sse;
+        /*cpi->rtcd.quantize.quantb            = vp8_regular_quantize_b_sse2;*/
+        cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_sse2;
     }
-
 #endif
-#if HAVE_SSE3
 
+#if HAVE_SSE3
     if (SSE3Enabled)
     {
         cpi->rtcd.variance.sad16x16              = vp8_sad16x16_sse3;
@@ -272,16 +345,30 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
         cpi->rtcd.variance.sad4x4x4d             = vp8_sad4x4x4d_sse3;
         cpi->rtcd.search.diamond_search          = vp8_diamond_search_sadx4;
     }
-
 #endif
-#if HAVE_SSSE3
 
+#if HAVE_SSSE3
     if (SSSE3Enabled)
     {
         cpi->rtcd.variance.sad16x16x3            = vp8_sad16x16x3_ssse3;
         cpi->rtcd.variance.sad16x8x3             = vp8_sad16x8x3_ssse3;
+
+        cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_ssse3;
+
     }
+#endif
 
+#if HAVE_SSE4_1
+    if (SSE4_1Enabled)
+    {
+        cpi->rtcd.variance.sad16x16x8            = vp8_sad16x16x8_sse4;
+        cpi->rtcd.variance.sad16x8x8             = vp8_sad16x8x8_sse4;
+        cpi->rtcd.variance.sad8x16x8             = vp8_sad8x16x8_sse4;
+        cpi->rtcd.variance.sad8x8x8              = vp8_sad8x8x8_sse4;
+        cpi->rtcd.variance.sad4x4x8              = vp8_sad4x4x8_sse4;
+        cpi->rtcd.search.full_search             = vp8_full_search_sadx8;
+    }
 #endif
+
 #endif
 }