84 files changed, 2303 insertions, 2728 deletions
diff --git a/vp8/common/alloccommon.c b/vp8/common/alloccommon.c
index edef36094..b0c7363a7 100644
--- a/vp8/common/alloccommon.c
+++ b/vp8/common/alloccommon.c
@@ -129,32 +129,32 @@ void vp8_setup_version(VP8_COMMON *cm)
     {
     case 0:
         cm->no_lpf = 0;
-        cm->simpler_lpf = 0;
+        cm->filter_type = NORMAL_LOOPFILTER;
         cm->use_bilinear_mc_filter = 0;
         cm->full_pixel = 0;
         break;
     case 1:
         cm->no_lpf = 0;
-        cm->simpler_lpf = 1;
+        cm->filter_type = SIMPLE_LOOPFILTER;
         cm->use_bilinear_mc_filter = 1;
         cm->full_pixel = 0;
         break;
     case 2:
         cm->no_lpf = 1;
-        cm->simpler_lpf = 0;
+        cm->filter_type = NORMAL_LOOPFILTER;
         cm->use_bilinear_mc_filter = 1;
         cm->full_pixel = 0;
         break;
     case 3:
         cm->no_lpf = 1;
-        cm->simpler_lpf = 1;
+        cm->filter_type = SIMPLE_LOOPFILTER;
         cm->use_bilinear_mc_filter = 1;
         cm->full_pixel = 1;
         break;
     default:
         /*4,5,6,7 are reserved for future use*/
         cm->no_lpf = 0;
-        cm->simpler_lpf = 0;
+        cm->filter_type = NORMAL_LOOPFILTER;
         cm->use_bilinear_mc_filter = 0;
         cm->full_pixel = 0;
         break;
@@ -169,7 +169,7 @@ void vp8_create_common(VP8_COMMON *oci)
 
     oci->mb_no_coeff_skip = 1;
     oci->no_lpf = 0;
-    oci->simpler_lpf = 0;
+    oci->filter_type = NORMAL_LOOPFILTER;
     oci->use_bilinear_mc_filter = 0;
     oci->full_pixel = 0;
     oci->multi_token_partition = ONE_PARTITION;
diff --git a/vp8/common/arm/loopfilter_arm.c b/vp8/common/arm/loopfilter_arm.c
index 3532a0356..6d1caa485 100644
--- a/vp8/common/arm/loopfilter_arm.c
+++ b/vp8/common/arm/loopfilter_arm.c
@@ -38,9 +38,8 @@ extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_neon;
 /*ARMV6 loopfilter functions*/
 /* Horizontal MB filtering */
 void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                               int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 
     if (u_ptr)
@@ -51,20 +50,18 @@ void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig
 }
 
 void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                                int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                                int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }
 
 /* Vertical MB Filtering */
 void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                               int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 
     if (u_ptr)
@@ -75,20 +72,18 @@ void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig
 }
 
 void vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                                int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                                int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }
 
 /* Horizontal B Filtering */
 void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -101,12 +96,11 @@ void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsign
 }
 
 void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                               int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -114,9 +108,8 @@ void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig
 
 /* Vertical B Filtering */
 void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -129,12 +122,11 @@ void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsign
 }
 
 void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                               int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -145,9 +137,8 @@ void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig
 /* NEON loopfilter functions */
 /* Horizontal MB filtering */
 void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 
     if (u_ptr)
@@ -155,20 +146,18 @@ void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsign
 }
 
 void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                               int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }
 
 /* Vertical MB Filtering */
 void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 
     if (u_ptr)
@@ -176,20 +165,18 @@ void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsign
 }
 
 void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                               int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }
 
 /* Horizontal B Filtering */
 void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -199,12 +186,11 @@ void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
 }
 
 void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -212,9 +198,8 @@ void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsign
 
 /* Vertical B Filtering */
 void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -224,12 +209,11 @@ void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
 }
 
 void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
diff --git a/vp8/common/arm/neon/loopfilter_neon.asm b/vp8/common/arm/neon/loopfilter_neon.asm
index d3a79f640..e73dd6401 100644
--- a/vp8/common/arm/neon/loopfilter_neon.asm
+++ b/vp8/common/arm/neon/loopfilter_neon.asm
@@ -308,7 +308,6 @@
 ; q9    q2
 ; q10   q3
 |vp8_loop_filter_neon| PROC
-    ldr         r12, _lf_coeff_
 
     ; vp8_filter_mask
     vabd.u8     q11, q3, q4                 ; abs(p3 - p2)
@@ -339,7 +338,7 @@
     vqadd.u8    q9, q9, q2                  ; a = b + a
     vcge.u8     q9, q0, q9                  ; (a > flimit * 2 + limit) * -1
 
-    vld1.u8     {q0}, [r12]!
+    vmov.u8     q0, #0x80                   ; 0x80
 
     ; vp8_filter() function
     ; convert to signed
@@ -348,7 +347,7 @@
     veor        q5, q5, q0                  ; ps1
     veor        q8, q8, q0                  ; qs1
 
-    vld1.u8     {q10}, [r12]!
+    vmov.u8     q10, #3                     ; #3
 
     vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
     vsubl.s8    q11, d15, d13
@@ -367,7 +366,7 @@
     vaddw.s8    q2, q2, d2
     vaddw.s8    q11, q11, d3
 
-    vld1.u8     {q9}, [r12]!
+    vmov.u8     q9, #4                      ; #4
 
     ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
     vqmovn.s16  d2, q2
@@ -399,12 +398,4 @@
 
 ;-----------------
 
-_lf_coeff_
-    DCD     lf_coeff
-lf_coeff
-    DCD     0x80808080, 0x80808080, 0x80808080, 0x80808080
-    DCD     0x03030303, 0x03030303, 0x03030303, 0x03030303
-    DCD     0x04040404, 0x04040404, 0x04040404, 0x04040404
-    DCD     0x01010101, 0x01010101, 0x01010101, 0x01010101
-
     END
diff --git a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
index 5fe7e7e6d..7c5ea3644 100644
--- a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
+++ b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
@@ -22,20 +22,19 @@
 ; r1    int p, //pitch
 ; r2    const signed char *flimit,
 ; r3    const signed char *limit,
-; stack(r4) const signed char *thresh,
+; stack(r4) const signed char *thresh (unused)
 ; //stack(r5)   int count --unused
 
 |vp8_loop_filter_simple_horizontal_edge_neon| PROC
     sub         r0, r0, r1, lsl #1          ; move src pointer down by 2 lines
 
-    ldr         r12, _lfhy_coeff_
     vld1.u8     {q5}, [r0], r1              ; p1
     vld1.s8     {d2[], d3[]}, [r2]          ; flimit
     vld1.s8     {d26[], d27[]}, [r3]        ; limit -> q13
     vld1.u8     {q6}, [r0], r1              ; p0
-    vld1.u8     {q0}, [r12]!                ; 0x80
+    vmov.u8     q0, #0x80                   ; 0x80
     vld1.u8     {q7}, [r0], r1              ; q0
-    vld1.u8     {q10}, [r12]!               ; 0x03
+    vmov.u8     q10, #0x03                  ; 0x03
     vld1.u8     {q8}, [r0]                  ; q1
 
     ;vp8_filter_mask() function
@@ -66,7 +65,7 @@
     vadd.s16    q11, q2, q2                 ;  3 * ( qs0 - ps0)
     vadd.s16    q12, q3, q3
 
-    vld1.u8     {q9}, [r12]!                ; 0x04
+    vmov.u8     q9, #0x04                   ; 0x04
 
     vadd.s16    q2, q2, q11
     vadd.s16    q3, q3, q12
@@ -105,11 +104,4 @@
 
 ;-----------------
 
-_lfhy_coeff_
-    DCD     lfhy_coeff
-lfhy_coeff
-    DCD     0x80808080, 0x80808080, 0x80808080, 0x80808080
-    DCD     0x03030303, 0x03030303, 0x03030303, 0x03030303
-    DCD     0x04040404, 0x04040404, 0x04040404, 0x04040404
-
     END
diff --git a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
index c30378b9c..a7f7b690e 100644
--- a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
+++ b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
@@ -22,7 +22,7 @@
 ; r1    int p, //pitch
 ; r2    const signed char *flimit,
 ; r3    const signed char *limit,
-; stack(r4) const signed char *thresh,
+; stack(r4) const signed char *thresh (unused)
 ; //stack(r5)   int count --unused
 
 |vp8_loop_filter_simple_vertical_edge_neon| PROC
@@ -32,7 +32,6 @@
     vld1.s8     {d2[], d3[]}, [r2]          ; flimit
     vld1.s8     {d26[], d27[]}, [r3]        ; limit -> q13
     vld4.8      {d6[1], d7[1], d8[1], d9[1]}, [r0], r1
-    ldr         r12, _vlfy_coeff_
     vld4.8      {d6[2], d7[2], d8[2], d9[2]}, [r0], r1
     vld4.8      {d6[3], d7[3], d8[3], d9[3]}, [r0], r1
     vld4.8      {d6[4], d7[4], d8[4], d9[4]}, [r0], r1
@@ -41,11 +40,11 @@
     vld4.8      {d6[7], d7[7], d8[7], d9[7]}, [r0], r1
 
     vld4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
-    vld1.u8     {q0}, [r12]!                ; 0x80
+    vmov.u8     q0, #0x80                ; 0x80
     vld4.8      {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
-    vld1.u8     {q11}, [r12]!               ; 0x03
+    vmov.u8     q11, #0x03              ; 0x03
     vld4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
-    vld1.u8     {q12}, [r12]!               ; 0x04
+    vmov.u8     q12, #0x04               ; 0x04
     vld4.8      {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
     vld4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
     vld4.8      {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
@@ -146,11 +145,4 @@
 
 ;-----------------
 
-_vlfy_coeff_
-    DCD     vlfy_coeff
-vlfy_coeff
-    DCD     0x80808080, 0x80808080, 0x80808080, 0x80808080
-    DCD     0x03030303, 0x03030303, 0x03030303, 0x03030303
-    DCD     0x04040404, 0x04040404, 0x04040404, 0x04040404
-
     END
diff --git a/vp8/common/arm/neon/mbloopfilter_neon.asm b/vp8/common/arm/neon/mbloopfilter_neon.asm
index 981adffd1..72f0f9271 100644
--- a/vp8/common/arm/neon/mbloopfilter_neon.asm
+++ b/vp8/common/arm/neon/mbloopfilter_neon.asm
@@ -372,7 +372,6 @@
 ; q10   q3
 
 |vp8_mbloop_filter_neon| PROC
-    ldr         r12, _mblf_coeff_
 
     ; vp8_filter_mask
     vabd.u8     q11, q3, q4                 ; abs(p3 - p2)
@@ -396,7 +395,7 @@
 
     vld1.s8     {d4[], d5[]}, [r2]          ; flimit
 
-    vld1.u8     {q0}, [r12]!
+    vmov.u8     q0, #0x80                   ; 0x80
 
     vadd.u8     q2, q2, q2                  ; flimit * 2
     vadd.u8     q2, q2, q1                  ; flimit * 2 +  limit
@@ -431,12 +430,12 @@
     vadd.s16    q2, q2, q10
     vadd.s16    q13, q13, q11
 
-    vld1.u8     {q12}, [r12]!               ; #3
+    vmov.u8     q12, #3                     ; #3
 
     vaddw.s8    q2, q2, d2                  ; vp8_filter + 3 * ( qs0 - ps0)
     vaddw.s8    q13, q13, d3
 
-    vld1.u8     {q11}, [r12]!               ; #4
+    vmov.u8     q11, #4                     ; #4
 
     ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
     vqmovn.s16  d2, q2
@@ -444,16 +443,16 @@
 
     vand        q1, q1, q15                 ; vp8_filter &= mask
 
-    vld1.u8     {q15}, [r12]!               ; #63
-    ;
+    vmov.u16    q15, #63                    ; #63
+
     vand        q13, q1, q14                ; Filter2 &= hev
 
-    vld1.u8     {d7}, [r12]!                ; #9
+    vmov.u8     d7, #9                      ; #9
 
     vqadd.s8    q2, q13, q11                ; Filter1 = clamp(Filter2+4)
     vqadd.s8    q13, q13, q12               ; Filter2 = clamp(Filter2+3)
 
-    vld1.u8     {d6}, [r12]!                ; #18
+    vmov.u8     d6, #18                     ; #18
 
     vshr.s8     q2, q2, #3                  ; Filter1 >>= 3
     vshr.s8     q13, q13, #3                ; Filter2 >>= 3
@@ -463,7 +462,7 @@
 
     vqsub.s8    q7, q7, q2                  ; qs0 = clamp(qs0 - Filter1)
 
-    vld1.u8     {d5}, [r12]!                ; #27
+    vmov.u8     d5, #27                     ; #27
 
     vqadd.s8    q6, q6, q13                 ; ps0 = clamp(ps0 + Filter2)
 
@@ -507,14 +506,4 @@
 
 ;-----------------
 
-_mblf_coeff_
-    DCD     mblf_coeff
-mblf_coeff
-    DCD     0x80808080, 0x80808080, 0x80808080, 0x80808080
-    DCD     0x03030303, 0x03030303, 0x03030303, 0x03030303
-    DCD     0x04040404, 0x04040404, 0x04040404, 0x04040404
-    DCD     0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f
-    DCD     0x09090909, 0x09090909, 0x12121212, 0x12121212
-    DCD     0x1b1b1b1b, 0x1b1b1b1b
-
     END
diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h
index fc8e0722c..aef692744 100644
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -169,12 +169,8 @@ typedef struct
 
     unsigned char partitioning;
     unsigned char mb_skip_coeff;                                /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */
-    unsigned char dc_diff;
     unsigned char need_to_clamp_mvs;
-
     unsigned char segment_id;                  /* Which set of segmentation parameters should be used for this MB */
-
-    unsigned char force_no_skip; /* encoder only */
 } MB_MODE_INFO;
 
 
diff --git a/vp8/common/extend.c b/vp8/common/extend.c
index 47207fa79..036bafc5d 100644
--- a/vp8/common/extend.c
+++ b/vp8/common/extend.c
@@ -13,10 +13,12 @@
 #include "vpx_mem/vpx_mem.h"
 
 
-static void extend_plane_borders
+static void copy_and_extend_plane
 (
     unsigned char *s, /* source */
-    int sp,           /* pitch */
+    int sp,           /* source pitch */
+    unsigned char *d, /* destination */
+    int dp,           /* destination pitch */
     int h,            /* height */
     int w,            /* width */
     int et,           /* extend top border */
@@ -25,7 +27,6 @@ static void extend_plane_borders
     int er            /* extend right border */
 )
 {
-
     int i;
     unsigned char *src_ptr1, *src_ptr2;
     unsigned char *dest_ptr1, *dest_ptr2;
@@ -34,68 +35,73 @@ static void extend_plane_borders
     /* copy the left and right most columns out */
     src_ptr1 = s;
     src_ptr2 = s + w - 1;
-    dest_ptr1 = s - el;
-    dest_ptr2 = s + w;
+    dest_ptr1 = d - el;
+    dest_ptr2 = d + w;
 
-    for (i = 0; i < h - 0 + 1; i++)
+    for (i = 0; i < h; i++)
     {
-        /* Some linkers will complain if we call vpx_memset with el set to a
-         * constant 0.
-         */
-        if (el)
-            vpx_memset(dest_ptr1, src_ptr1[0], el);
+        vpx_memset(dest_ptr1, src_ptr1[0], el);
+        vpx_memcpy(dest_ptr1 + el, src_ptr1, w);
         vpx_memset(dest_ptr2, src_ptr2[0], er);
         src_ptr1  += sp;
         src_ptr2  += sp;
-        dest_ptr1 += sp;
-        dest_ptr2 += sp;
+        dest_ptr1 += dp;
+        dest_ptr2 += dp;
     }
 
-    /* Now copy the top and bottom source lines into each line of the respective borders */
-    src_ptr1 = s - el;
-    src_ptr2 = s + sp * (h - 1) - el;
-    dest_ptr1 = s + sp * (-et) - el;
-    dest_ptr2 = s + sp * (h) - el;
-    linesize = el + er + w + 1;
+    /* Now copy the top and bottom lines into each line of the respective
+     * borders
+     */
+    src_ptr1 = d - el;
+    src_ptr2 = d + dp * (h - 1) - el;
+    dest_ptr1 = d + dp * (-et) - el;
+    dest_ptr2 = d + dp * (h) - el;
+    linesize = el + er + w;
 
-    for (i = 0; i < (int)et; i++)
+    for (i = 0; i < et; i++)
     {
         vpx_memcpy(dest_ptr1, src_ptr1, linesize);
-        dest_ptr1 += sp;
+        dest_ptr1 += dp;
     }
 
-    for (i = 0; i < (int)eb; i++)
+    for (i = 0; i < eb; i++)
     {
         vpx_memcpy(dest_ptr2, src_ptr2, linesize);
-        dest_ptr2 += sp;
+        dest_ptr2 += dp;
     }
 }
 
 
-void vp8_extend_to_multiple_of16(YV12_BUFFER_CONFIG *ybf, int width, int height)
+void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
+                               YV12_BUFFER_CONFIG *dst)
 {
-    int er = 0xf & (16 - (width & 0xf));
-    int eb = 0xf & (16 - (height & 0xf));
-
-    /* check for non multiples of 16 */
-    if (er != 0 || eb != 0)
-    {
-        extend_plane_borders(ybf->y_buffer, ybf->y_stride, height, width, 0, 0, eb, er);
-
-        /* adjust for uv */
-        height = (height + 1) >> 1;
-        width  = (width  + 1) >> 1;
-        er = 0x7 & (8 - (width  & 0x7));
-        eb = 0x7 & (8 - (height & 0x7));
-
-        if (er || eb)
-        {
-            extend_plane_borders(ybf->u_buffer, ybf->uv_stride, height, width, 0, 0, eb, er);
-            extend_plane_borders(ybf->v_buffer, ybf->uv_stride, height, width, 0, 0, eb, er);
-        }
-    }
+    int et = dst->border;
+    int el = dst->border;
+    int eb = dst->border + dst->y_height - src->y_height;
+    int er = dst->border + dst->y_width - src->y_width;
+
+    copy_and_extend_plane(src->y_buffer, src->y_stride,
+                          dst->y_buffer, dst->y_stride,
+                          src->y_height, src->y_width,
+                          et, el, eb, er);
+
+    et = (et + 1) >> 1;
+    el = (el + 1) >> 1;
+    eb = (eb + 1) >> 1;
+    er = (er + 1) >> 1;
+
+    copy_and_extend_plane(src->u_buffer, src->uv_stride,
+                          dst->u_buffer, dst->uv_stride,
+                          src->uv_height, src->uv_width,
+                          et, el, eb, er);
+
+    copy_and_extend_plane(src->v_buffer, src->uv_stride,
+                          dst->v_buffer, dst->uv_stride,
+                          src->uv_height, src->uv_width,
+                          et, el, eb, er);
 }
 
+
 /* note the extension is only for the last row, for intra prediction purpose */
 void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, unsigned char *UPtr, unsigned char *VPtr)
 {
diff --git a/vp8/common/extend.h b/vp8/common/extend.h
index fd0a608e5..9e0be4e06 100644
--- a/vp8/common/extend.h
+++ b/vp8/common/extend.h
@@ -14,8 +14,8 @@
 
 #include "vpx_scale/yv12config.h"
 
-void Extend(YV12_BUFFER_CONFIG *ybf);
 void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, unsigned char *UPtr, unsigned char *VPtr);
-void vp8_extend_to_multiple_of16(YV12_BUFFER_CONFIG *ybf, int width, int height);
+void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
+                               YV12_BUFFER_CONFIG *dst);
 
 #endif
diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c
index 5c6464772..d981f3496 100644
--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c
@@ -17,9 +17,54 @@
 #include "vp8/common/idct.h"
 #include "vp8/common/onyxc_int.h"
 
+#if CONFIG_MULTITHREAD
+#if HAVE_UNISTD_H
+#include <unistd.h>
+#elif defined(_WIN32)
+#include <windows.h>
+typedef void (WINAPI *PGNSI)(LPSYSTEM_INFO);
+#endif
+#endif
+
 extern void vp8_arch_x86_common_init(VP8_COMMON *ctx);
 extern void vp8_arch_arm_common_init(VP8_COMMON *ctx);
 
+#if CONFIG_MULTITHREAD
+static int get_cpu_count()
+{
+    int core_count = 16;
+
+#if HAVE_UNISTD_H
+#if defined(_SC_NPROCESSORS_ONLN)
+    core_count = sysconf(_SC_NPROCESSORS_ONLN);
+#elif defined(_SC_NPROC_ONLN)
+    core_count = sysconf(_SC_NPROC_ONLN);
+#endif
+#elif defined(_WIN32)
+    {
+        PGNSI pGNSI;
+        SYSTEM_INFO sysinfo;
+
+        /* Call GetNativeSystemInfo if supported or
+         * GetSystemInfo otherwise. */
+
+        pGNSI = (PGNSI) GetProcAddress(
+                GetModuleHandle(TEXT("kernel32.dll")), "GetNativeSystemInfo");
+        if (pGNSI != NULL)
+            pGNSI(&sysinfo);
+        else
+            GetSystemInfo(&sysinfo);
+
+        core_count = sysinfo.dwNumberOfProcessors;
+    }
+#else
+    /* other platforms */
+#endif
+
+    return core_count > 0 ? core_count : 1;
+}
+#endif
+
 void vp8_machine_specific_config(VP8_COMMON *ctx)
 {
 #if CONFIG_RUNTIME_CPU_DETECT
@@ -43,6 +88,12 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
         vp8_build_intra_predictors_mby;
     rtcd->recon.build_intra_predictors_mby_s =
         vp8_build_intra_predictors_mby_s;
+    rtcd->recon.build_intra_predictors_mbuv =
+        vp8_build_intra_predictors_mbuv;
+    rtcd->recon.build_intra_predictors_mbuv_s =
+        vp8_build_intra_predictors_mbuv_s;
+    rtcd->recon.intra4x4_predict =
+        vp8_intra4x4_predict;
 
     rtcd->subpix.sixtap16x16   = vp8_sixtap_predict16x16_c;
     rtcd->subpix.sixtap8x8     = vp8_sixtap_predict8x8_c;
@@ -82,4 +133,7 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
     vp8_arch_arm_common_init(ctx);
 #endif
 
+#if CONFIG_MULTITHREAD
+    ctx->processor_core_count = get_cpu_count();
+#endif /* CONFIG_MULTITHREAD */
 }
diff --git a/vp8/common/loopfilter.c b/vp8/common/loopfilter.c
index 37c5b7740..a3242716f 100644
--- a/vp8/common/loopfilter.c
+++ b/vp8/common/loopfilter.c
@@ -25,9 +25,8 @@ prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_c);
 
 /* Horizontal MB filtering */
 void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                           int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                           int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 
     if (u_ptr)
@@ -38,20 +37,18 @@ void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned
 }
 
 void vp8_loop_filter_mbhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                            int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                            int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }
 
 /* Vertical MB Filtering */
 void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                           int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                           int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 
     if (u_ptr)
@@ -62,20 +59,18 @@ void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned
 }
 
 void vp8_loop_filter_mbvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                            int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                            int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }
 
 /* Horizontal B Filtering */
 void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                          int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                          int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -88,12 +83,11 @@ void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned c
 }
 
 void vp8_loop_filter_bhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                           int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                           int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -101,9 +95,8 @@ void vp8_loop_filter_bhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned
 
 /* Vertical B Filtering */
 void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                          int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                          int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -116,12 +109,11 @@ void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned c
 }
 
 void vp8_loop_filter_bvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                           int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                           int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -353,6 +345,9 @@ void vp8_loop_filter_frame
         for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
         {
             int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
+            int skip_lf = (mbd->mode_info_context->mbmi.mode != B_PRED &&
+                            mbd->mode_info_context->mbmi.mode != SPLITMV &&
+                            mbd->mode_info_context->mbmi.mb_skip_coeff);
 
             filter_level = baseline_filter_level[Segment];
 
@@ -365,17 +360,17 @@ void vp8_loop_filter_frame
             if (filter_level)
             {
                 if (mb_col > 0)
-                    cm->lf_mbv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+                    cm->lf_mbv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level]);
 
-                if (mbd->mode_info_context->mbmi.dc_diff > 0)
-                    cm->lf_bv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+                if (!skip_lf)
+                    cm->lf_bv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level]);
 
                 /* don't apply across umv border */
                 if (mb_row > 0)
-                    cm->lf_mbh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+                    cm->lf_mbh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level]);
 
-                if (mbd->mode_info_context->mbmi.dc_diff > 0)
-                    cm->lf_bh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+                if (!skip_lf)
+                    cm->lf_bh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level]);
             }
 
             y_ptr += 16;
@@ -457,6 +452,10 @@ void vp8_loop_filter_frame_yonly
         for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
         {
             int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
+            int skip_lf = (mbd->mode_info_context->mbmi.mode != B_PRED &&
+                            mbd->mode_info_context->mbmi.mode != SPLITMV &&
+                            mbd->mode_info_context->mbmi.mb_skip_coeff);
+
             filter_level = baseline_filter_level[Segment];
 
             /* Apply any context driven MB level adjustment */
@@ -465,17 +464,17 @@ void vp8_loop_filter_frame_yonly
             if (filter_level)
             {
                 if (mb_col > 0)
-                    cm->lf_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
+                    cm->lf_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]);
 
-                if (mbd->mode_info_context->mbmi.dc_diff > 0)
-                    cm->lf_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
+                if (!skip_lf)
+                    cm->lf_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]);
 
                 /* don't apply across umv border */
                 if (mb_row > 0)
-                    cm->lf_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
+                    cm->lf_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]);
 
-                if (mbd->mode_info_context->mbmi.dc_diff > 0)
-                    cm->lf_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
+                if (!skip_lf)
+                    cm->lf_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]);
             }
 
             y_ptr += 16;
@@ -565,20 +564,24 @@ void vp8_loop_filter_partial_frame
         for (mb_col = 0; mb_col < mb_cols; mb_col++)
         {
             int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
+            int skip_lf = (mbd->mode_info_context->mbmi.mode != B_PRED &&
+                            mbd->mode_info_context->mbmi.mode != SPLITMV &&
+                            mbd->mode_info_context->mbmi.mb_skip_coeff);
+
             filter_level = baseline_filter_level[Segment];
 
             if (filter_level)
             {
                 if (mb_col > 0)
-                    cm->lf_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
+                    cm->lf_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]);
 
-                if (mbd->mode_info_context->mbmi.dc_diff > 0)
-                    cm->lf_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
+                if (!skip_lf)
+                    cm->lf_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]);
 
-                cm->lf_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
+                cm->lf_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]);
 
-                if (mbd->mode_info_context->mbmi.dc_diff > 0)
-                    cm->lf_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
+                if (!skip_lf)
+                    cm->lf_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]);
             }
 
             y_ptr += 16;
diff --git a/vp8/common/loopfilter.h b/vp8/common/loopfilter.h
index 2e5997c73..ca136b3a4 100644
--- a/vp8/common/loopfilter.h
+++ b/vp8/common/loopfilter.h
@@ -41,7 +41,7 @@ typedef struct
 
 #define prototype_loopfilter_block(sym) \
     void sym(unsigned char *y, unsigned char *u, unsigned char *v,\
-             int ystride, int uv_stride, loop_filter_info *lfi, int simpler)
+             int ystride, int uv_stride, loop_filter_info *lfi)
 
 #if ARCH_X86 || ARCH_X86_64
 #include "x86/loopfilter_x86.h"
diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h
index 426b8fc2b..a05951933 100644
--- a/vp8/common/onyx.h
+++ b/vp8/common/onyx.h
@@ -109,6 +109,7 @@ extern "C"
         int noise_sensitivity;   // parameter used for applying pre processing blur: recommendation 0
         int Sharpness;          // parameter used for sharpening output: recommendation 0:
         int cpu_used;
+        unsigned int rc_max_intra_bitrate_pct;
 
         // mode ->
         //(0)=Realtime/Live Encoding. This mode is optimized for realtim encoding (for example, capturing
@@ -139,8 +140,9 @@ extern "C"
 
         int end_usage; // vbr or cbr
 
-        // shoot to keep buffer full at all times by undershooting a bit 95 recommended
+        // buffer targeting aggressiveness
         int under_shoot_pct;
+        int over_shoot_pct;
 
         // buffering parameters
         int starting_buffer_level;  // in seconds
diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h
index c8c227787..cf29d03df 100644
--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h
@@ -119,7 +119,6 @@ typedef struct VP8Common
     /* profile settings */
     int mb_no_coeff_skip;
     int no_lpf;
-    int simpler_lpf;
     int use_bilinear_mc_filter;
     int full_pixel;
 
@@ -196,6 +195,9 @@ typedef struct VP8Common
 #if CONFIG_RUNTIME_CPU_DETECT
     VP8_COMMON_RTCD rtcd;
 #endif
+#if CONFIG_MULTITHREAD
+    int processor_core_count;
+#endif
     struct postproc_state  postproc_state;
 } VP8_COMMON;
 
diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c
index 5bfc7d6fb..7dbe96649 100644
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -804,11 +804,14 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
             for (j = 0; j < mb_cols; j++)
             {
                 char zz[4];
+                int dc_diff = !(mi[mb_index].mbmi.mode != B_PRED &&
+                              mi[mb_index].mbmi.mode != SPLITMV &&
+                              mi[mb_index].mbmi.mb_skip_coeff);
 
                 if (oci->frame_type == KEY_FRAME)
                     sprintf(zz, "a");
                 else
-                    sprintf(zz, "%c", mi[mb_index].mbmi.dc_diff + '0');
+                    sprintf(zz, "%c", dc_diff + '0');
 
                 vp8_blit_text(zz, y_ptr, post->y_stride);
                 mb_index ++;
diff --git a/vp8/common/ppc/loopfilter_altivec.c b/vp8/common/ppc/loopfilter_altivec.c
index e602feedc..71bf6e2d7 100644
--- a/vp8/common/ppc/loopfilter_altivec.c
+++ b/vp8/common/ppc/loopfilter_altivec.c
@@ -53,9 +53,8 @@ loop_filter_function_s_ppc loop_filter_simple_vertical_edge_ppc;
 
 // Horizontal MB filtering
 void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                         int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void)simpler_lpf;
     mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
 
     if (u_ptr)
@@ -63,9 +62,8 @@ void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned ch
 }
 
 void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                          int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                          int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void)simpler_lpf;
     (void)u_ptr;
     (void)v_ptr;
     (void)uv_stride;
@@ -74,9 +72,8 @@ void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned c
 
 // Vertical MB Filtering
 void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                         int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void)simpler_lpf;
     mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
 
     if (u_ptr)
@@ -84,9 +81,8 @@ void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned ch
 }
 
 void loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                          int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                          int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void)simpler_lpf;
     (void)u_ptr;
     (void)v_ptr;
     (void)uv_stride;
@@ -95,9 +91,8 @@ void loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned c
 
 // Horizontal B Filtering
 void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                        int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                        int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void)simpler_lpf;
     // These should all be done at once with one call, instead of 3
     loop_filter_horizontal_edge_y_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
     loop_filter_horizontal_edge_y_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
@@ -108,9 +103,8 @@ void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned cha
 }
 
 void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                         int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void)simpler_lpf;
     (void)u_ptr;
     (void)v_ptr;
     (void)uv_stride;
@@ -121,9 +115,8 @@ void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned ch
 
 // Vertical B Filtering
 void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                        int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                        int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void)simpler_lpf;
     loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr);
 
     if (u_ptr)
@@ -131,9 +124,8 @@ void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned cha
 }
 
 void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                         int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void)simpler_lpf;
     (void)u_ptr;
     (void)v_ptr;
     (void)uv_stride;
diff --git a/vp8/common/recon.h b/vp8/common/recon.h
index e608f218c..7cfc779cd 100644
--- a/vp8/common/recon.h
+++ b/vp8/common/recon.h
@@ -26,6 +26,9 @@
 #define prototype_build_intra_predictors(sym) \
     void sym(MACROBLOCKD *x)
 
+#define prototype_intra4x4_predict(sym) \
+    void sym(BLOCKD *x, int b_mode, unsigned char *predictor)
+
 struct vp8_recon_rtcd_vtable;
 
 #if ARCH_X86 || ARCH_X86_64
@@ -88,11 +91,30 @@ extern prototype_build_intra_predictors\
 extern prototype_build_intra_predictors\
     (vp8_recon_build_intra_predictors_mby_s);
 
+#ifndef vp8_recon_build_intra_predictors_mbuv
+#define vp8_recon_build_intra_predictors_mbuv vp8_build_intra_predictors_mbuv
+#endif
+extern prototype_build_intra_predictors\
+    (vp8_recon_build_intra_predictors_mbuv);
+
+#ifndef vp8_recon_build_intra_predictors_mbuv_s
+#define vp8_recon_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s
+#endif
+extern prototype_build_intra_predictors\
+    (vp8_recon_build_intra_predictors_mbuv_s);
+
+#ifndef vp8_recon_intra4x4_predict
+#define vp8_recon_intra4x4_predict vp8_intra4x4_predict
+#endif
+extern prototype_intra4x4_predict\
+    (vp8_recon_intra4x4_predict);
+
 
 typedef prototype_copy_block((*vp8_copy_block_fn_t));
 typedef prototype_recon_block((*vp8_recon_fn_t));
 typedef prototype_recon_macroblock((*vp8_recon_mb_fn_t));
 typedef prototype_build_intra_predictors((*vp8_build_intra_pred_fn_t));
+typedef prototype_intra4x4_predict((*vp8_intra4x4_pred_fn_t));
 typedef struct vp8_recon_rtcd_vtable
 {
     vp8_copy_block_fn_t  copy16x16;
@@ -105,6 +127,9 @@ typedef struct vp8_recon_rtcd_vtable
     vp8_recon_mb_fn_t    recon_mby;
     vp8_build_intra_pred_fn_t  build_intra_predictors_mby_s;
     vp8_build_intra_pred_fn_t  build_intra_predictors_mby;
+    vp8_build_intra_pred_fn_t  build_intra_predictors_mbuv_s;
+    vp8_build_intra_pred_fn_t  build_intra_predictors_mbuv;
+    vp8_intra4x4_pred_fn_t intra4x4_predict;
 } vp8_recon_rtcd_vtable_t;
 
 #if CONFIG_RUNTIME_CPU_DETECT
diff --git a/vp8/common/reconinter.c b/vp8/common/reconinter.c
index 7cfab4140..3b0405ca1 100644
--- a/vp8/common/reconinter.c
+++ b/vp8/common/reconinter.c
@@ -207,12 +207,12 @@ static void build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, int pitch)
 }
 
 
+/*encoder only*/
 void vp8_build_inter_predictors_mbuv(MACROBLOCKD *x)
 {
     int i;
 
-    if (x->mode_info_context->mbmi.ref_frame != INTRA_FRAME &&
-        x->mode_info_context->mbmi.mode != SPLITMV)
+    if (x->mode_info_context->mbmi.mode != SPLITMV)
     {
         unsigned char *uptr, *vptr;
         unsigned char *upred_ptr = &x->predictor[256];
@@ -257,159 +257,133 @@ void vp8_build_inter_predictors_mbuv(MACROBLOCKD *x)
 }
 
 /*encoder only*/
-void vp8_build_inter_predictors_mby(MACROBLOCKD *x)
+void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x)
 {
+    unsigned char *ptr_base;
+    unsigned char *ptr;
+    unsigned char *pred_ptr = x->predictor;
+    int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
+    int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
+    int pre_stride = x->block[0].pre_stride;
 
-  if (x->mode_info_context->mbmi.ref_frame != INTRA_FRAME &&
-      x->mode_info_context->mbmi.mode != SPLITMV)
-    {
-        unsigned char *ptr_base;
-        unsigned char *ptr;
-        unsigned char *pred_ptr = x->predictor;
-        int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
-        int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
-        int pre_stride = x->block[0].pre_stride;
-
-        ptr_base = x->pre.y_buffer;
-        ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
+    ptr_base = x->pre.y_buffer;
+    ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
 
-        if ((mv_row | mv_col) & 7)
-        {
-            x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, pred_ptr, 16);
-        }
-        else
-        {
-            RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, pred_ptr, 16);
-        }
+    if ((mv_row | mv_col) & 7)
+    {
+        x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, pred_ptr, 16);
     }
     else
     {
-        int i;
-
-        if (x->mode_info_context->mbmi.partitioning < 3)
-        {
-            for (i = 0; i < 4; i++)
-            {
-                BLOCKD *d = &x->block[bbb[i]];
-                build_inter_predictors4b(x, d, 16);
-            }
-
-        }
-        else
-        {
-            for (i = 0; i < 16; i += 2)
-            {
-                BLOCKD *d0 = &x->block[i];
-                BLOCKD *d1 = &x->block[i+1];
-
-                if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
-                    build_inter_predictors2b(x, d0, 16);
-                else
-                {
-                    vp8_build_inter_predictors_b(d0, 16, x->subpixel_predict);
-                    vp8_build_inter_predictors_b(d1, 16, x->subpixel_predict);
-                }
-
-            }
-        }
+        RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, pred_ptr, 16);
     }
 }
 
-void vp8_build_inter_predictors_mb(MACROBLOCKD *x)
+void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x,
+                                        unsigned char *dst_y,
+                                        unsigned char *dst_u,
+                                        unsigned char *dst_v,
+                                        int dst_ystride,
+                                        int dst_uvstride)
 {
+    int offset;
+    unsigned char *ptr;
+    unsigned char *uptr, *vptr;
 
-    if (x->mode_info_context->mbmi.ref_frame != INTRA_FRAME &&
-        x->mode_info_context->mbmi.mode != SPLITMV)
-    {
-        int offset;
-        unsigned char *ptr_base;
-        unsigned char *ptr;
-        unsigned char *uptr, *vptr;
-        unsigned char *pred_ptr = x->predictor;
-        unsigned char *upred_ptr = &x->predictor[256];
-        unsigned char *vpred_ptr = &x->predictor[320];
+    int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
+    int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
 
-        int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
-        int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
-        int pre_stride = x->block[0].pre_stride;
+    unsigned char *ptr_base = x->pre.y_buffer;
+    int pre_stride = x->block[0].pre_stride;
 
-        ptr_base = x->pre.y_buffer;
-        ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
+    ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
 
-        if ((mv_row | mv_col) & 7)
-        {
-            x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, pred_ptr, 16);
-        }
-        else
-        {
-            RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, pred_ptr, 16);
-        }
+    if ((mv_row | mv_col) & 7)
+    {
+        x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, dst_y, dst_ystride);
+    }
+    else
+    {
+        RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, dst_y, dst_ystride);
+    }
 
-        mv_row = x->block[16].bmi.mv.as_mv.row;
-        mv_col = x->block[16].bmi.mv.as_mv.col;
-        pre_stride >>= 1;
-        offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
-        uptr = x->pre.u_buffer + offset;
-        vptr = x->pre.v_buffer + offset;
+    mv_row = x->block[16].bmi.mv.as_mv.row;
+    mv_col = x->block[16].bmi.mv.as_mv.col;
+    pre_stride >>= 1;
+    offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+    uptr = x->pre.u_buffer + offset;
+    vptr = x->pre.v_buffer + offset;
 
-        if ((mv_row | mv_col) & 7)
-        {
-            x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, 8);
-            x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, 8);
-        }
-        else
-        {
-            RECON_INVOKE(&x->rtcd->recon, copy8x8)(uptr, pre_stride, upred_ptr, 8);
-            RECON_INVOKE(&x->rtcd->recon, copy8x8)(vptr, pre_stride, vpred_ptr, 8);
-        }
+    if ((mv_row | mv_col) & 7)
+    {
+        x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, dst_u, dst_uvstride);
+        x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, dst_v, dst_uvstride);
     }
     else
     {
-        int i;
-
-        if (x->mode_info_context->mbmi.partitioning < 3)
-        {
-            for (i = 0; i < 4; i++)
-            {
-                BLOCKD *d = &x->block[bbb[i]];
-                build_inter_predictors4b(x, d, 16);
-            }
-        }
-        else
-        {
-            for (i = 0; i < 16; i += 2)
-            {
-                BLOCKD *d0 = &x->block[i];
-                BLOCKD *d1 = &x->block[i+1];
+        RECON_INVOKE(&x->rtcd->recon, copy8x8)(uptr, pre_stride, dst_u, dst_uvstride);
+        RECON_INVOKE(&x->rtcd->recon, copy8x8)(vptr, pre_stride, dst_v, dst_uvstride);
+    }
 
-                if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
-                    build_inter_predictors2b(x, d0, 16);
-                else
-                {
-                    vp8_build_inter_predictors_b(d0, 16, x->subpixel_predict);
-                    vp8_build_inter_predictors_b(d1, 16, x->subpixel_predict);
-                }
+}
 
-            }
+void vp8_build_inter4x4_predictors_mb(MACROBLOCKD *x)
+{
+    int i;
 
+    if (x->mode_info_context->mbmi.partitioning < 3)
+    {
+        for (i = 0; i < 4; i++)
+        {
+            BLOCKD *d = &x->block[bbb[i]];
+            build_inter_predictors4b(x, d, 16);
         }
-
-        for (i = 16; i < 24; i += 2)
+    }
+    else
+    {
+        for (i = 0; i < 16; i += 2)
         {
             BLOCKD *d0 = &x->block[i];
             BLOCKD *d1 = &x->block[i+1];
 
             if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
-                build_inter_predictors2b(x, d0, 8);
+                build_inter_predictors2b(x, d0, 16);
             else
             {
-                vp8_build_inter_predictors_b(d0, 8, x->subpixel_predict);
-                vp8_build_inter_predictors_b(d1, 8, x->subpixel_predict);
+                vp8_build_inter_predictors_b(d0, 16, x->subpixel_predict);
+                vp8_build_inter_predictors_b(d1, 16, x->subpixel_predict);
             }
 
         }
 
     }
+
+    for (i = 16; i < 24; i += 2)
+    {
+        BLOCKD *d0 = &x->block[i];
+        BLOCKD *d1 = &x->block[i+1];
+
+        if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+            build_inter_predictors2b(x, d0, 8);
+        else
+        {
+            vp8_build_inter_predictors_b(d0, 8, x->subpixel_predict);
+            vp8_build_inter_predictors_b(d1, 8, x->subpixel_predict);
+        }
+    }
+}
+
+void vp8_build_inter_predictors_mb(MACROBLOCKD *x)
+{
+    if (x->mode_info_context->mbmi.mode != SPLITMV)
+    {
+        vp8_build_inter16x16_predictors_mb(x, x->predictor, &x->predictor[256],
+                                           &x->predictor[320], 16, 8);
+    }
+    else
+    {
+        vp8_build_inter4x4_predictors_mb(x);
+    }
 }
 
 void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel)
@@ -492,202 +466,5 @@ void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel)
 }
 
 
-/* The following functions are wriiten for skip_recon_mb() to call. Since there is no recon in this
- * situation, we can write the result directly to dst buffer instead of writing it to predictor
- * buffer and then copying it to dst buffer.
- */
-static void vp8_build_inter_predictors_b_s(BLOCKD *d, unsigned char *dst_ptr, vp8_subpix_fn_t sppf)
-{
-    int r;
-    unsigned char *ptr_base;
-    unsigned char *ptr;
-    /*unsigned char *pred_ptr = d->predictor;*/
-    int dst_stride = d->dst_stride;
-    int pre_stride = d->pre_stride;
-
-    ptr_base = *(d->base_pre);
-
-    if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
-    {
-        ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
-        sppf(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst_ptr, dst_stride);
-    }
-    else
-    {
-        ptr_base += d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
-        ptr = ptr_base;
-
-        for (r = 0; r < 4; r++)
-        {
-#ifdef MUST_BE_ALIGNED
-            dst_ptr[0]   = ptr[0];
-            dst_ptr[1]   = ptr[1];
-            dst_ptr[2]   = ptr[2];
-            dst_ptr[3]   = ptr[3];
-#else
-            *(int *)dst_ptr = *(int *)ptr ;
-#endif
-            dst_ptr      += dst_stride;
-            ptr         += pre_stride;
-        }
-    }
-}
-
-
-
-void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x)
-{
-    /*unsigned char *pred_ptr = x->block[0].predictor;
-    unsigned char *dst_ptr = *(x->block[0].base_dst) + x->block[0].dst;*/
-    unsigned char *pred_ptr = x->predictor;
-    unsigned char *dst_ptr = x->dst.y_buffer;
-
-    if (x->mode_info_context->mbmi.mode != SPLITMV)
-    {
-        int offset;
-        unsigned char *ptr_base;
-        unsigned char *ptr;
-        unsigned char *uptr, *vptr;
-        /*unsigned char *pred_ptr = x->predictor;
-        unsigned char *upred_ptr = &x->predictor[256];
-        unsigned char *vpred_ptr = &x->predictor[320];*/
-        unsigned char *udst_ptr = x->dst.u_buffer;
-        unsigned char *vdst_ptr = x->dst.v_buffer;
-
-        int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
-        int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
-        int pre_stride = x->dst.y_stride; /*x->block[0].pre_stride;*/
-
-        ptr_base = x->pre.y_buffer;
-        ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
-
-        if ((mv_row | mv_col) & 7)
-        {
-            x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/
-        }
-        else
-        {
-            RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/
-        }
 
-        mv_row = x->block[16].bmi.mv.as_mv.row;
-        mv_col = x->block[16].bmi.mv.as_mv.col;
-        pre_stride >>= 1;
-        offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
-        uptr = x->pre.u_buffer + offset;
-        vptr = x->pre.v_buffer + offset;
 
-        if ((mv_row | mv_col) & 7)
-        {
-            x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, udst_ptr, x->dst.uv_stride);
-            x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vdst_ptr, x->dst.uv_stride);
-        }
-        else
-        {
-            RECON_INVOKE(&x->rtcd->recon, copy8x8)(uptr, pre_stride, udst_ptr, x->dst.uv_stride);
-            RECON_INVOKE(&x->rtcd->recon, copy8x8)(vptr, pre_stride, vdst_ptr, x->dst.uv_stride);
-        }
-    }
-    else
-    {
-        /* note: this whole ELSE part is not executed at all. So, no way to test the correctness of my modification. Later,
-         * if sth is wrong, go back to what it is in build_inter_predictors_mb.
-         */
-        int i;
-
-        if (x->mode_info_context->mbmi.partitioning < 3)
-        {
-            for (i = 0; i < 4; i++)
-            {
-                BLOCKD *d = &x->block[bbb[i]];
-                /*build_inter_predictors4b(x, d, 16);*/
-
-                {
-                    unsigned char *ptr_base;
-                    unsigned char *ptr;
-                    unsigned char *pred_ptr = d->predictor;
-
-                    ptr_base = *(d->base_pre);
-                    ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
-
-                    if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
-                    {
-                        x->subpixel_predict8x8(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/
-                    }
-                    else
-                    {
-                        RECON_INVOKE(&x->rtcd->recon, copy8x8)(ptr, d->pre_stride, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/
-                    }
-                }
-            }
-        }
-        else
-        {
-            for (i = 0; i < 16; i += 2)
-            {
-                BLOCKD *d0 = &x->block[i];
-                BLOCKD *d1 = &x->block[i+1];
-
-                if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
-                {
-                    /*build_inter_predictors2b(x, d0, 16);*/
-                    unsigned char *ptr_base;
-                    unsigned char *ptr;
-                    unsigned char *pred_ptr = d0->predictor;
-
-                    ptr_base = *(d0->base_pre);
-                    ptr = ptr_base + d0->pre + (d0->bmi.mv.as_mv.row >> 3) * d0->pre_stride + (d0->bmi.mv.as_mv.col >> 3);
-
-                    if (d0->bmi.mv.as_mv.row & 7 || d0->bmi.mv.as_mv.col & 7)
-                    {
-                        x->subpixel_predict8x4(ptr, d0->pre_stride, d0->bmi.mv.as_mv.col & 7, d0->bmi.mv.as_mv.row & 7, dst_ptr, x->dst.y_stride);
-                    }
-                    else
-                    {
-                        RECON_INVOKE(&x->rtcd->recon, copy8x4)(ptr, d0->pre_stride, dst_ptr, x->dst.y_stride);
-                    }
-                }
-                else
-                {
-                    vp8_build_inter_predictors_b_s(d0, dst_ptr, x->subpixel_predict);
-                    vp8_build_inter_predictors_b_s(d1, dst_ptr, x->subpixel_predict);
-                }
-            }
-        }
-
-        for (i = 16; i < 24; i += 2)
-        {
-            BLOCKD *d0 = &x->block[i];
-            BLOCKD *d1 = &x->block[i+1];
-
-            if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
-            {
-                /*build_inter_predictors2b(x, d0, 8);*/
-                unsigned char *ptr_base;
-                unsigned char *ptr;
-                unsigned char *pred_ptr = d0->predictor;
-
-                ptr_base = *(d0->base_pre);
-                ptr = ptr_base + d0->pre + (d0->bmi.mv.as_mv.row >> 3) * d0->pre_stride + (d0->bmi.mv.as_mv.col >> 3);
-
-                if (d0->bmi.mv.as_mv.row & 7 || d0->bmi.mv.as_mv.col & 7)
-                {
-                    x->subpixel_predict8x4(ptr, d0->pre_stride,
-                        d0->bmi.mv.as_mv.col & 7,
-                        d0->bmi.mv.as_mv.row & 7,
-                        dst_ptr, x->dst.uv_stride);
-                }
-                else
-                {
-                    RECON_INVOKE(&x->rtcd->recon, copy8x4)(ptr,
-                        d0->pre_stride, dst_ptr, x->dst.uv_stride);
-                }
-            }
-            else
-            {
-                vp8_build_inter_predictors_b_s(d0, dst_ptr, x->subpixel_predict);
-                vp8_build_inter_predictors_b_s(d1, dst_ptr, x->subpixel_predict);
-            }
-        }
-    }
-}
diff --git a/vp8/common/reconinter.h b/vp8/common/reconinter.h
index 7c1dee431..a68e4aaba 100644
--- a/vp8/common/reconinter.h
+++ b/vp8/common/reconinter.h
@@ -13,9 +13,15 @@
 #define __INC_RECONINTER_H
 
 extern void vp8_build_inter_predictors_mb(MACROBLOCKD *x);
-extern void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x);
+extern void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x,
+                                               unsigned char *dst_y,
+                                               unsigned char *dst_u,
+                                               unsigned char *dst_v,
+                                               int dst_ystride,
+                                               int dst_uvstride);
 
-extern void vp8_build_inter_predictors_mby(MACROBLOCKD *x);
+
+extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x);
 extern void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel);
 extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf);
 extern void vp8_build_inter_predictors_mbuv(MACROBLOCKD *x);
diff --git a/vp8/common/reconintra.h b/vp8/common/reconintra.h
index 4025a5307..47e479285 100644
--- a/vp8/common/reconintra.h
+++ b/vp8/common/reconintra.h
@@ -14,9 +14,4 @@
 
 extern void init_intra_left_above_pixels(MACROBLOCKD *x);
 
-extern void vp8_build_intra_predictors_mbuv(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x);
-
-extern void vp8_predict_intra4x4(BLOCKD *x, int b_mode, unsigned char *Predictor);
-
 #endif
diff --git a/vp8/common/reconintra4x4.c b/vp8/common/reconintra4x4.c
index cd70dca73..18c514541 100644
--- a/vp8/common/reconintra4x4.c
+++ b/vp8/common/reconintra4x4.c
@@ -14,7 +14,7 @@
 #include "vpx_mem/vpx_mem.h"
 #include "reconintra.h"
 
-void vp8_predict_intra4x4(BLOCKD *x,
+void vp8_intra4x4_predict(BLOCKD *x,
                           int b_mode,
                           unsigned char *predictor)
 {
diff --git a/vp8/common/threading.h b/vp8/common/threading.h
index 44eaf0800..5927cb165 100644
--- a/vp8/common/threading.h
+++ b/vp8/common/threading.h
@@ -12,8 +12,6 @@
 #ifndef _PTHREAD_EMULATION
 #define _PTHREAD_EMULATION
 
-#define VPXINFINITE 10000       /* 10second. */
-
 #if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD
 
 /* Thread management macros */
@@ -28,7 +26,7 @@
 #define pthread_t HANDLE
 #define pthread_attr_t DWORD
 #define pthread_create(thhandle,attr,thfunc,tharg) (int)((*thhandle=(HANDLE)_beginthreadex(NULL,0,(unsigned int (__stdcall *)(void *))thfunc,tharg,0,NULL))==NULL)
-#define pthread_join(thread, result) ((WaitForSingleObject((thread),VPXINFINITE)!=WAIT_OBJECT_0) || !CloseHandle(thread))
+#define pthread_join(thread, result) ((WaitForSingleObject((thread),INFINITE)!=WAIT_OBJECT_0) || !CloseHandle(thread))
 #define pthread_detach(thread) if(thread!=NULL)CloseHandle(thread)
 #define thread_sleep(nms) Sleep(nms)
 #define pthread_cancel(thread) terminate_thread(thread,0)
@@ -61,9 +59,9 @@
 #ifdef _WIN32
 #define sem_t HANDLE
 #define pause(voidpara) __asm PAUSE
-#define sem_init(sem, sem_attr1, sem_init_value) (int)((*sem = CreateEvent(NULL,FALSE,FALSE,NULL))==NULL)
-#define sem_wait(sem) (int)(WAIT_OBJECT_0 != WaitForSingleObject(*sem,VPXINFINITE))
-#define sem_post(sem) SetEvent(*sem)
+#define sem_init(sem, sem_attr1, sem_init_value) (int)((*sem = CreateSemaphore(NULL,0,32768,NULL))==NULL)
+#define sem_wait(sem) (int)(WAIT_OBJECT_0 != WaitForSingleObject(*sem,INFINITE))
+#define sem_post(sem) ReleaseSemaphore(*sem,1,NULL)
 #define sem_destroy(sem) if(*sem)((int)(CloseHandle(*sem))==TRUE)
 #define thread_sleep(nms) Sleep(nms)
 
diff --git a/vp8/common/x86/boolcoder.cxx b/vp8/common/x86/boolcoder.cxx
deleted file mode 100644
index faddf1f42..000000000
--- a/vp8/common/x86/boolcoder.cxx
+++ /dev/null
@@ -1,494 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-
-/* Arithmetic bool coder with largish probability range.
-   Timothy S Murphy  6 August 2004 */
-
-#include <assert.h>
-#include <math.h>
-
-#include "bool_coder.h"
-
-#if tim_vp8
-    extern "C" {
-#       include "VP8cx/treewriter.h"
-    }
-#endif
-
-int_types::~int_types() {}
-
-void bool_coder_spec::check_prec() const {
-    assert( w  &&  (r==Up || w > 1)  &&  w < 24  &&  (ebias || w < 17));
-}
-
-bool bool_coder_spec::float_init( uint Ebits, uint Mbits) {
-    uint b = (ebits = Ebits) + (mbits = Mbits);
-    if( b) {
-        assert( ebits < 6  &&  w + mbits < 31);
-        assert( ebits + mbits  <  sizeof(Index) * 8);
-        ebias = (1 << ebits) + 1 + mbits;
-        mmask = (1 << mbits) - 1;
-        max_index = ( ( half_index = 1 << b ) << 1) - 1;
-    } else {
-        ebias = 0;
-        max_index = 255;
-        half_index = 128;
-    }
-    check_prec();
-    return b? 1:0;
-}
-
-void bool_coder_spec::cost_init()
-{
-    static cdouble c = -(1 << 20)/log( 2.);
-
-    FILE *f = fopen( "costs.txt", "w");
-    assert( f);
-
-    assert( sizeof(int) >= 4);  /* for C interface */
-    assert( max_index <= 255);   /* size of Ctbl */
-    uint i = 0;  do {
-        cdouble p = ( *this)( (Index) i);
-        Ctbl[i] = (uint32) ( log( p) * c);
-        fprintf(
-            f, "cost( %d -> %10.7f) = %10d = %12.5f bits\n",
-            i, p, Ctbl[i], (double) Ctbl[i] / (1<<20)
-        );
-    } while( ++i <= max_index);
-    fclose( f);
-}
-
-bool_coder_spec_explicit_table::bool_coder_spec_explicit_table(
-    cuint16 tbl[256], Rounding rr, uint prec
-)
-  : bool_coder_spec( prec, rr)
-{
-    check_prec();
-    uint i = 0;
-    if( tbl)
-        do { Ptbl[i] = tbl[i];}  while( ++i < 256);
-    else
-        do { Ptbl[i] = i << 8;}  while( ++i < 256);
-    cost_init();
-}
-
-
-bool_coder_spec_exponential_table::bool_coder_spec_exponential_table(
-    uint x, Rounding rr, uint prec
-)
-  : bool_coder_spec( prec, rr)
-{
-    assert( x > 1  &&  x <= 16);
-    check_prec();
-    Ptbl[128] = 32768u;
-    Ptbl[0] = (uint16) pow( 2., 16. - x);
-    --x;
-    int i=1;  do {
-        cdouble d = pow( .5, 1. + (1. - i/128.)*x) * 65536.;
-        uint16 v = (uint16) d;
-        if( v < i)
-            v = i;
-        Ptbl[256-i] = (uint16) ( 65536U - (Ptbl[i] = v));
-    } while( ++i < 128);
-    cost_init();
-}
-
-bool_coder_spec::bool_coder_spec( FILE *fp) {
-    fscanf( fp, "%d", &w);
-    int v;
-    fscanf( fp, "%d", &v);
-    assert( 0 <= v  &&  v <= 2);
-    r = (Rounding) v;
-    fscanf( fp, "%d", &ebits);
-    fscanf( fp, "%d", &mbits);
-    if( float_init( ebits, mbits))
-        return;
-    int i=0;  do {
-        uint v;
-        fscanf( fp, "%d", &v);
-        assert( 0 <=v  &&  v <= 65535U);
-        Ptbl[i] = v;
-    } while( ++i < 256);
-    cost_init();
-}
-
-void bool_coder_spec::dump( FILE *fp) const {
-    fprintf( fp, "%d %d %d %d\n", w, (int) r, ebits, mbits);
-    if( ebits  ||  mbits)
-        return;
-    int i=0;  do { fprintf( fp, "%d\n", Ptbl[i]);}  while( ++i < 256);
-}
-
-vp8bc_index_t bool_coder_spec::operator()( double p) const
-{
-    if( p <= 0.)
-        return 0;
-    if( p >= 1.)
-        return max_index;
-    if( ebias) {
-        if( p > .5)
-            return max_index - ( *this)( 1. - p);
-        int e;
-        uint m = (uint) ldexp( frexp( p, &e), mbits + 2);
-        uint x = 1 << (mbits + 1);
-        assert( x <= m  &&  m < x<<1);
-        if( (m = (m >> 1) + (m & 1)) >= x) {
-            m = x >> 1;
-            ++e;
-        }
-        int y = 1 << ebits;
-        if( (e += y) >= y)
-            return half_index - 1;
-        if( e < 0)
-            return 0;
-        return (Index) ( (e << mbits) + (m & mmask));
-    }
-
-    cuint16 v = (uint16) (p * 65536.);
-    int i = 128;
-    int j = 128;
-    uint16 w;
-    while( w = Ptbl[i], j >>= 1) {
-        if( w < v)
-            i += j;
-        else if( w == v)
-            return (uchar) i;
-        else
-            i -= j;
-    }
-    if( w > v) {
-        cuint16 x = Ptbl[i-1];
-        if( v <= x  ||  w - v > v - x)
-            --i;
-    } else if( w < v  &&  i < 255) {
-        cuint16 x = Ptbl[i+1];
-        if( x <= v  ||  x - v < v - w)
-            ++i;
-    }
-    return (Index) i;
-}
-
-double bool_coder_spec::operator()( Index i) const {
-    if( !ebias)
-        return Ptbl[i]/65536.;
-    if( i >= half_index)
-        return 1. - ( *this)( (Index) (max_index - i));
-    return ldexp( (double)mantissa( i), - (int) exponent( i));
-}
-
-
-
-void bool_writer::carry() {
-    uchar *p = B;
-    assert( p > Bstart);
-    while( *--p == 255) { assert( p > Bstart);  *p = 0;}
-    ++*p;
-}
-
-
-bool_writer::bool_writer( c_spec& s, uchar *Dest, size_t Len)
-  : bool_coder( s),
-    Bstart( Dest),
-    Bend( Len? Dest+Len : 0),
-    B( Dest)
-{
-    assert( Dest);
-    reset();
-}
-
-bool_writer::~bool_writer() { flush();}
-
-#if 1
-    extern "C" { int bc_v = 0;}
-#else
-#   define bc_v 0
-#endif
-
-
-void bool_writer::raw( bool value, uint32 s) {
-    uint32 L = Low;
-
-    assert( Range >= min_range  &&  Range <= spec.max_range());
-    assert( !is_toast  &&  s  &&  s < Range);
-
-    if( bc_v) printf(
-        "Writing a %d, B %x  Low %x  Range %x  s %x   blag %d ...\n",
-        value? 1:0, B-Bstart, Low, Range, s, bit_lag
-    );
-    if( value) {
-        L += s;
-        s = Range - s;
-    } else
-        s -= rinc;
-    if( s < min_range) {
-        int ct = bit_lag;  do {
-            if( !--ct) {
-                ct = 8;
-                if( L & (1 << 31))
-                    carry();
-                assert( !Bend  ||  B < Bend);
-                *B++ = (uchar) (L >> 23);
-                L &= (1<<23) - 1;
-            }
-        } while( L += L, (s += s + rinc) < min_range);
-        bit_lag = ct;
-    }
-    Low = L;
-    Range = s;
-    if( bc_v)
-        printf(
-            "...done, B %x  Low %x  Range %x  blag %d \n",
-                B-Bstart, Low, Range, bit_lag
-        );
-}
-
-bool_writer& bool_writer::flush() {
-    if( is_toast)
-        return *this;
-    int b = bit_lag;
-    uint32 L = Low;
-    assert( b);
-    if( L & (1 << (32 - b)))
-        carry();
-    L <<= b & 7;
-    b >>= 3;
-    while( --b >= 0)
-        L <<= 8;
-    b = 4;
-    assert( !Bend  ||  B + 4 <= Bend);
-    do {
-        *B++ = (uchar) (L >> 24);
-        L <<= 8;
-    } while( --b);
-    is_toast = 1;
-    return *this;
-}
-
-
-bool_reader::bool_reader( c_spec& s, cuchar *src, size_t Len)
-  : bool_coder( s),
-    Bstart( src),
-    B( src),
-    Bend( Len? src+Len : 0),
-    shf( 32 - s.w),
-    bct( 8)
-{
-    int i = 4;  do { Low <<= 8;  Low |= *B++;}  while( --i);
-}
-
-
-bool bool_reader::raw( uint32 s) {
-
-    bool val = 0;
-    uint32 L = Low;
-    cuint32 S = s << shf;
-
-    assert( Range >= min_range  &&  Range <= spec.max_range());
-    assert( s  &&  s < Range  &&  (L >> shf) < Range);
-
-    if( bc_v)
-        printf(
-            "Reading, B %x  Low %x  Range %x  s %x  bct %d ...\n",
-            B-Bstart, Low, Range, s, bct
-        );
-
-    if( L >= S) {
-        L -= S;
-        s = Range - s;
-        assert( L < (s << shf));
-        val = 1;
-    } else
-        s -= rinc;
-    if( s < min_range) {
-        int ct = bct;
-        do {
-            assert( ~L & (1 << 31));
-            L += L;
-            if( !--ct) {
-                ct = 8;
-                if( !Bend  ||  B < Bend)
-                    L |= *B++;
-            }
-        } while( (s += s + rinc) < min_range);
-        bct = ct;
-    }
-    Low = L;
-    Range = s;
-    if( bc_v)
-        printf(
-            "...done, val %d  B %x  Low %x  Range %x  bct %d\n",
-            val? 1:0, B-Bstart, Low, Range, bct
-        );
-    return val;
-}
-
-
-/* C interfaces */
-
-// spec interface
-
-struct NS : bool_coder_namespace {
-    static Rounding r( vp8bc_c_prec *p, Rounding rr =down_full) {
-        return p? (Rounding) p->r : rr;
-    }
-};
-
-bool_coder_spec *vp8bc_vp6spec() {
-    return new bool_coder_spec_explicit_table( 0, bool_coder_namespace::Down, 8);
-}
-bool_coder_spec *vp8bc_float_spec(
-    unsigned int Ebits, unsigned int Mbits, vp8bc_c_prec *p
-) {
-    return new bool_coder_spec_float( Ebits, Mbits, NS::r( p), p? p->prec : 12);
-}
-bool_coder_spec *vp8bc_literal_spec(
-    const unsigned short m[256], vp8bc_c_prec *p
-) {
-    return new bool_coder_spec_explicit_table( m, NS::r( p), p? p->prec : 16);
-}
-bool_coder_spec *vp8bc_exponential_spec( unsigned int x, vp8bc_c_prec *p)
-{
-    return new bool_coder_spec_exponential_table( x, NS::r( p), p? p->prec : 16);
-}
-bool_coder_spec *vp8bc_spec_from_file( FILE *fp) {
-    return new bool_coder_spec( fp);
-}
-void vp8bc_destroy_spec( c_bool_coder_spec *p) { delete p;}
-
-void vp8bc_spec_to_file( c_bool_coder_spec *p, FILE *fp) { p->dump( fp);}
-
-vp8bc_index_t vp8bc_index( c_bool_coder_spec *p, double x) {
-    return ( *p)( x);
-}
-
-vp8bc_index_t vp8bc_index_from_counts(
-    c_bool_coder_spec *p, unsigned int L, unsigned int R
-) {
-    return ( *p)( (R += L)? (double) L/R : .5);
-}
-
-double vp8bc_probability( c_bool_coder_spec *p, vp8bc_index_t i) {
-    return ( *p)( i);
-}
-
-vp8bc_index_t vp8bc_complement( c_bool_coder_spec *p, vp8bc_index_t i) {
-    return p->complement( i);
-}
-unsigned int vp8bc_cost_zero( c_bool_coder_spec *p, vp8bc_index_t i) {
-    return p->cost_zero( i);
-}
-unsigned int vp8bc_cost_one( c_bool_coder_spec *p, vp8bc_index_t i) {
-    return p->cost_one( i);
-}
-unsigned int vp8bc_cost_bit( c_bool_coder_spec *p, vp8bc_index_t i, int v) {
-    return p->cost_bit( i, v);
-}
-
-#if tim_vp8
-    extern "C" int tok_verbose;
-
-#   define dbg_l 1000000
-
-    static vp8bc_index_t dbg_i [dbg_l];
-    static char dbg_v [dbg_l];
-    static size_t dbg_w = 0, dbg_r = 0;
-#endif
-
-// writer interface
-
-bool_writer *vp8bc_create_writer(
-    c_bool_coder_spec *p, unsigned char *D, size_t L
-) {
-    return new bool_writer( *p, D, L);
-}
-
-size_t vp8bc_destroy_writer( bool_writer *p) {
-    const size_t s = p->flush().bytes_written();
-    delete p;
-    return s;
-}
-
-void vp8bc_write_bool( bool_writer *p, int v, vp8bc_index_t i)
-{
-#   if tim_vp8
-        // bc_v = dbg_w < 10;
-        if( bc_v = tok_verbose)
-            printf( " writing %d at prob %d\n", v? 1:0, i);
-        accum_entropy_bc( &p->Spec(), i, v);
-
-        ( *p)( i, (bool) v);
-
-        if( dbg_w < dbg_l) {
-            dbg_i [dbg_w] = i;
-            dbg_v [dbg_w++] = v? 1:0;
-        }
-#   else
-        ( *p)( i, (bool) v);
-#   endif
-}
-
-void vp8bc_write_bits( bool_writer *p, unsigned int v, int n)
-{
-#   if tim_vp8
-        {
-            c_bool_coder_spec * const s = & p->Spec();
-            const vp8bc_index_t i = s->half_index();
-            int m = n;
-            while( --m >= 0)
-                accum_entropy_bc( s, i, (v>>m) & 1);
-        }
-#   endif
-
-    p->write_bits( n, v);
-}
-
-c_bool_coder_spec *vp8bc_writer_spec( c_bool_writer *w) { return & w->Spec();}
-
-// reader interface
-
-bool_reader *vp8bc_create_reader(
-    c_bool_coder_spec *p, const unsigned char *S, size_t L
-) {
-    return new bool_reader( *p, S, L);
-}
-
-void vp8bc_destroy_reader( bool_reader * p) { delete p;}
-
-int vp8bc_read_bool( bool_reader *p, vp8bc_index_t i)
-{
-#   if tim_vp8
-        // bc_v = dbg_r < 10;
-        bc_v = tok_verbose;
-        const int v = ( *p)( i)? 1:0;
-        if( tok_verbose)
-            printf( " reading %d at prob %d\n", v, i);
-        if( dbg_r < dbg_l) {
-            assert( dbg_r <= dbg_w);
-            if( i != dbg_i[dbg_r]  ||  v != dbg_v[dbg_r]) {
-                printf(
-        "Position %d: INCORRECTLY READING %d  prob %d, wrote %d  prob %d\n",
-                    dbg_r, v, i, dbg_v[dbg_r], dbg_i[dbg_r]
-                );
-            }
-            ++dbg_r;
-        }
-        return v;
-#   else
-        return ( *p)( i)? 1:0;
-#   endif
-}
-
-unsigned int vp8bc_read_bits( bool_reader *p, int n) { return p->read_bits( n);}
-
-c_bool_coder_spec *vp8bc_reader_spec( c_bool_reader *r) { return & r->Spec();}
-
-#undef bc_v
diff --git a/vp8/common/x86/idctllm_mmx.asm b/vp8/common/x86/idctllm_mmx.asm
index 43735bc4b..465626b8f 100644
--- a/vp8/common/x86/idctllm_mmx.asm
+++ b/vp8/common/x86/idctllm_mmx.asm
@@ -14,18 +14,18 @@
 ; /****************************************************************************
 ; * Notes:
 ; *
-; * This implementation makes use of 16 bit fixed point verio of two multiply
+; * This implementation makes use of 16 bit fixed point version of two multiply
 ; * constants:
 ; *        1.   sqrt(2) * cos (pi/8)
-; *         2.   sqrt(2) * sin (pi/8)
-; * Becuase the first constant is bigger than 1, to maintain the same 16 bit
-; * fixed point prrcision as the second one, we use a trick of
+; *        2.   sqrt(2) * sin (pi/8)
+; * Because the first constant is bigger than 1, to maintain the same 16 bit
+; * fixed point precision as the second one, we use a trick of
 ; *        x * a = x + x*(a-1)
 ; * so
 ; *        x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
 ; *
-; * For     the second constant, becuase of the 16bit version is 35468, which
-; * is bigger than 32768, in signed 16 bit multiply, it become a negative
+; * For the second constant, because of the 16bit version is 35468, which
+; * is bigger than 32768, in signed 16 bit multiply, it becomes a negative
 ; * number.
 ; *        (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x
 ; *
diff --git a/vp8/common/x86/idctllm_sse2.asm b/vp8/common/x86/idctllm_sse2.asm
index edee1578e..34a7e18ae 100644
--- a/vp8/common/x86/idctllm_sse2.asm
+++ b/vp8/common/x86/idctllm_sse2.asm
@@ -32,9 +32,6 @@ sym(idct_dequant_0_2x_sse2):
         mov         rdx,            arg(1) ; dequant
         mov         rax,            arg(0) ; qcoeff
 
-    ; Zero out xmm7, for use unpacking
-        pxor        xmm7,           xmm7
-
         movd        xmm4,           [rax]
         movd        xmm5,           [rdx]
 
@@ -43,9 +40,12 @@ sym(idct_dequant_0_2x_sse2):
 
         pmullw      xmm4,           xmm5
 
+    ; Zero out xmm5, for use unpacking
+        pxor        xmm5,           xmm5
+
     ; clear coeffs
-        movd        [rax],          xmm7
-        movd        [rax+32],       xmm7
+        movd        [rax],          xmm5
+        movd        [rax+32],       xmm5
 ;pshufb
         pshuflw     xmm4,           xmm4,       00000000b
         pshufhw     xmm4,           xmm4,       00000000b
@@ -62,10 +62,10 @@ sym(idct_dequant_0_2x_sse2):
         lea         rcx,            [3*rcx]
         movq        xmm3,           [rax+rcx]
 
-        punpcklbw   xmm0,           xmm7
-        punpcklbw   xmm1,           xmm7
-        punpcklbw   xmm2,           xmm7
-        punpcklbw   xmm3,           xmm7
+        punpcklbw   xmm0,           xmm5
+        punpcklbw   xmm1,           xmm5
+        punpcklbw   xmm2,           xmm5
+        punpcklbw   xmm3,           xmm5
 
         mov         rax,            arg(3) ; dst
         movsxd      rdx,            dword ptr arg(4) ; dst_stride
@@ -77,10 +77,10 @@ sym(idct_dequant_0_2x_sse2):
         paddw       xmm3,           xmm4
 
     ; pack up before storing
-        packuswb    xmm0,           xmm7
-        packuswb    xmm1,           xmm7
-        packuswb    xmm2,           xmm7
-        packuswb    xmm3,           xmm7
+        packuswb    xmm0,           xmm5
+        packuswb    xmm1,           xmm5
+        packuswb    xmm2,           xmm5
+        packuswb    xmm3,           xmm5
 
     ; store blocks back out
         movq        [rax],          xmm0
@@ -102,6 +102,7 @@ sym(idct_dequant_full_2x_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -347,6 +348,7 @@ sym(idct_dequant_full_2x_sse2):
     pop         rdi
     pop         rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -377,8 +379,8 @@ sym(idct_dequant_dc_0_2x_sse2):
         mov         rdi,            arg(3) ; dst
         mov         rdx,            arg(5) ; dc
 
-    ; Zero out xmm7, for use unpacking
-        pxor        xmm7,           xmm7
+    ; Zero out xmm5, for use unpacking
+        pxor        xmm5,           xmm5
 
     ; load up 2 dc words here == 2*16 = doubleword
         movd        xmm4,           [rdx]
@@ -398,10 +400,10 @@ sym(idct_dequant_dc_0_2x_sse2):
         psraw       xmm4,           3
 
     ; Predict buffer needs to be expanded from bytes to words
-        punpcklbw   xmm0,           xmm7
-        punpcklbw   xmm1,           xmm7
-        punpcklbw   xmm2,           xmm7
-        punpcklbw   xmm3,           xmm7
+        punpcklbw   xmm0,           xmm5
+        punpcklbw   xmm1,           xmm5
+        punpcklbw   xmm2,           xmm5
+        punpcklbw   xmm3,           xmm5
 
     ; Add to predict buffer
         paddw       xmm0,           xmm4
@@ -410,10 +412,10 @@ sym(idct_dequant_dc_0_2x_sse2):
         paddw       xmm3,           xmm4
 
     ; pack up before storing
-        packuswb    xmm0,           xmm7
-        packuswb    xmm1,           xmm7
-        packuswb    xmm2,           xmm7
-        packuswb    xmm3,           xmm7
+        packuswb    xmm0,           xmm5
+        packuswb    xmm1,           xmm5
+        packuswb    xmm2,           xmm5
+        packuswb    xmm3,           xmm5
 
     ; Load destination stride before writing out,
     ;   doesn't need to persist
@@ -441,6 +443,7 @@ sym(idct_dequant_dc_full_2x_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -692,6 +695,7 @@ sym(idct_dequant_dc_full_2x_sse2):
     pop         rdi
     pop         rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
diff --git a/vp8/common/x86/iwalsh_sse2.asm b/vp8/common/x86/iwalsh_sse2.asm
index 83c97df7d..1da4fd8da 100644
--- a/vp8/common/x86/iwalsh_sse2.asm
+++ b/vp8/common/x86/iwalsh_sse2.asm
@@ -17,7 +17,7 @@ sym(vp8_short_inv_walsh4x4_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 2
-    SAVE_XMM
+    SAVE_XMM 6
     push        rsi
     push        rdi
     ; end prolog
@@ -41,7 +41,7 @@ sym(vp8_short_inv_walsh4x4_sse2):
     movdqa    xmm4, xmm0
     punpcklqdq  xmm0, xmm3          ;d1 a1
     punpckhqdq  xmm4, xmm3          ;c1 b1
-    movd    xmm7, eax
+    movd    xmm6, eax
 
     movdqa    xmm1, xmm4          ;c1 b1
     paddw   xmm4, xmm0          ;dl+cl a1+b1 aka op[4] op[0]
@@ -66,7 +66,7 @@ sym(vp8_short_inv_walsh4x4_sse2):
     pshufd    xmm2, xmm1, 4eh       ;ip[8] ip[12]
     movdqa    xmm3, xmm4          ;ip[4] ip[0]
 
-    pshufd    xmm7, xmm7, 0       ;03 03 03 03 03 03 03 03
+    pshufd    xmm6, xmm6, 0       ;03 03 03 03 03 03 03 03
 
     paddw   xmm4, xmm2          ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
     psubw   xmm3, xmm2          ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
@@ -90,8 +90,8 @@ sym(vp8_short_inv_walsh4x4_sse2):
     punpcklwd xmm5, xmm0          ; 31 21 11 01 30 20 10 00
     punpckhwd xmm1, xmm0          ; 33 23 13 03 32 22 12 02
 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    paddw   xmm5, xmm7
-    paddw   xmm1, xmm7
+    paddw   xmm5, xmm6
+    paddw   xmm1, xmm6
 
     psraw   xmm5, 3
     psraw   xmm1, 3
diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm
index 849133dc4..c2ce1a106 100644
--- a/vp8/common/x86/loopfilter_sse2.asm
+++ b/vp8/common/x86/loopfilter_sse2.asm
@@ -288,7 +288,7 @@ sym(vp8_loop_filter_horizontal_edge_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -338,7 +338,7 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -584,7 +584,7 @@ sym(vp8_mbloop_filter_horizontal_edge_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -634,7 +634,7 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -1024,7 +1024,7 @@ sym(vp8_loop_filter_vertical_edge_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -1091,7 +1091,7 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -1249,7 +1249,7 @@ sym(vp8_mbloop_filter_vertical_edge_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -1318,7 +1318,7 @@ sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -1386,7 +1386,7 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -1503,7 +1503,7 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
     push        rbp         ; save old base pointer value.
     mov         rbp, rsp    ; set new base pointer value.
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx         ; save callee-saved reg
     push        rsi
     push        rdi
diff --git a/vp8/common/x86/loopfilter_x86.c b/vp8/common/x86/loopfilter_x86.c
index 5837bc0dc..a52420c98 100644
--- a/vp8/common/x86/loopfilter_x86.c
+++ b/vp8/common/x86/loopfilter_x86.c
@@ -42,9 +42,8 @@ extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_sse2;
 #if HAVE_MMX
 /* Horizontal MB filtering */
 void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 
     if (u_ptr)
@@ -56,21 +55,19 @@ void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
 
 
 void vp8_loop_filter_mbhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }
 
 
 /* Vertical MB Filtering */
 void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 
     if (u_ptr)
@@ -82,21 +79,19 @@ void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
 
 
 void vp8_loop_filter_mbvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }
 
 
 /* Horizontal B Filtering */
 void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                            int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                            int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_loop_filter_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -110,12 +105,11 @@ void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned
 
 
 void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -124,9 +118,8 @@ void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
 
 /* Vertical B Filtering */
 void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                            int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                            int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -140,12 +133,11 @@ void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned
 
 
 void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -156,9 +148,8 @@ void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
 /* Horizontal MB filtering */
 #if HAVE_SSE2
 void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 
     if (u_ptr)
@@ -167,21 +158,19 @@ void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsign
 
 
 void vp8_loop_filter_mbhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                               int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }
 
 
 /* Vertical MB Filtering */
 void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 
     if (u_ptr)
@@ -190,21 +179,19 @@ void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsign
 
 
 void vp8_loop_filter_mbvs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                               int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }
 
 
 /* Horizontal B Filtering */
 void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -215,12 +202,11 @@ void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
 
 
 void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -229,9 +215,8 @@ void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsign
 
 /* Vertical B Filtering */
 void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
     vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
@@ -242,12 +227,11 @@ void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
 
 
 void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
     (void) u_ptr;
     (void) v_ptr;
     (void) uv_stride;
-    (void) simpler_lpf;
     vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
diff --git a/vp8/common/x86/postproc_sse2.asm b/vp8/common/x86/postproc_sse2.asm
index 30b4bf53a..06d51ec6f 100644
--- a/vp8/common/x86/postproc_sse2.asm
+++ b/vp8/common/x86/postproc_sse2.asm
@@ -26,7 +26,7 @@ sym(vp8_post_proc_down_and_across_xmm):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -256,7 +256,7 @@ sym(vp8_mbpost_proc_down_xmm):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -456,7 +456,7 @@ sym(vp8_mbpost_proc_across_ip_xmm):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
diff --git a/vp8/common/x86/recon_sse2.asm b/vp8/common/x86/recon_sse2.asm
index 4ad3973ec..97dc4f686 100644
--- a/vp8/common/x86/recon_sse2.asm
+++ b/vp8/common/x86/recon_sse2.asm
@@ -67,7 +67,7 @@ sym(vp8_recon4b_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 4
-    SAVE_XMM
+    SAVE_XMM 7
     push        rsi
     push        rdi
     ; end prolog
@@ -229,3 +229,397 @@ sym(vp8_copy_mem16x16_sse2):
     UNSHADOW_ARGS
     pop         rbp
     ret
+
+
+;void vp8_intra_pred_uv_dc_mmx2(
+;    unsigned char *dst,
+;    int dst_stride
+;    unsigned char *src,
+;    int src_stride,
+;    )
+global sym(vp8_intra_pred_uv_dc_mmx2)
+sym(vp8_intra_pred_uv_dc_mmx2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; from top
+    mov         rsi,        arg(2) ;src;
+    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    sub         rsi,        rax
+    pxor        mm0,        mm0
+    movq        mm1,        [rsi]
+    psadbw      mm1,        mm0
+
+    ; from left
+    dec         rsi
+    lea         rdi,        [rax*3]
+    movzx       ecx,        byte [rsi+rax]
+    movzx       edx,        byte [rsi+rax*2]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rdi]
+    add         ecx,        edx
+    lea         rsi,        [rsi+rax*4]
+    movzx       edx,        byte [rsi]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax*2]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rdi]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax*4]
+    add         ecx,        edx
+
+    ; add up
+    pextrw      edx,        mm1, 0x0
+    lea         edx,        [edx+ecx+8]
+    sar         edx,        4
+    movd        mm1,        edx
+    pshufw      mm1,        mm1, 0x0
+    packuswb    mm1,        mm1
+
+    ; write out
+    mov         rdi,        arg(0) ;dst;
+    movsxd      rcx,        dword ptr arg(1) ;dst_stride
+    lea         rax,        [rcx*3]
+
+    movq [rdi      ],       mm1
+    movq [rdi+rcx  ],       mm1
+    movq [rdi+rcx*2],       mm1
+    movq [rdi+rax  ],       mm1
+    lea         rdi,        [rdi+rcx*4]
+    movq [rdi      ],       mm1
+    movq [rdi+rcx  ],       mm1
+    movq [rdi+rcx*2],       mm1
+    movq [rdi+rax  ],       mm1
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_intra_pred_uv_dctop_mmx2(
+;    unsigned char *dst,
+;    int dst_stride
+;    unsigned char *src,
+;    int src_stride,
+;    )
+global sym(vp8_intra_pred_uv_dctop_mmx2)
+sym(vp8_intra_pred_uv_dctop_mmx2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; from top
+    mov         rsi,        arg(2) ;src;
+    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    sub         rsi,        rax
+    pxor        mm0,        mm0
+    movq        mm1,        [rsi]
+    psadbw      mm1,        mm0
+
+    ; add up
+    paddw       mm1,        [GLOBAL(dc_4)]
+    psraw       mm1,        3
+    pshufw      mm1,        mm1, 0x0
+    packuswb    mm1,        mm1
+
+    ; write out
+    mov         rdi,        arg(0) ;dst;
+    movsxd      rcx,        dword ptr arg(1) ;dst_stride
+    lea         rax,        [rcx*3]
+
+    movq [rdi      ],       mm1
+    movq [rdi+rcx  ],       mm1
+    movq [rdi+rcx*2],       mm1
+    movq [rdi+rax  ],       mm1
+    lea         rdi,        [rdi+rcx*4]
+    movq [rdi      ],       mm1
+    movq [rdi+rcx  ],       mm1
+    movq [rdi+rcx*2],       mm1
+    movq [rdi+rax  ],       mm1
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_intra_pred_uv_dcleft_mmx2(
+;    unsigned char *dst,
+;    int dst_stride
+;    unsigned char *src,
+;    int src_stride,
+;    )
+global sym(vp8_intra_pred_uv_dcleft_mmx2)
+sym(vp8_intra_pred_uv_dcleft_mmx2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; from left
+    mov         rsi,        arg(2) ;src;
+    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    dec         rsi
+    lea         rdi,        [rax*3]
+    movzx       ecx,        byte [rsi]
+    movzx       edx,        byte [rsi+rax]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax*2]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rdi]
+    add         ecx,        edx
+    lea         rsi,        [rsi+rax*4]
+    movzx       edx,        byte [rsi]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax*2]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rdi]
+    lea         edx,        [ecx+edx+4]
+
+    ; add up
+    shr         edx,        3
+    movd        mm1,        edx
+    pshufw      mm1,        mm1, 0x0
+    packuswb    mm1,        mm1
+
+    ; write out
+    mov         rdi,        arg(0) ;dst;
+    movsxd      rcx,        dword ptr arg(1) ;dst_stride
+    lea         rax,        [rcx*3]
+
+    movq [rdi      ],       mm1
+    movq [rdi+rcx  ],       mm1
+    movq [rdi+rcx*2],       mm1
+    movq [rdi+rax  ],       mm1
+    lea         rdi,        [rdi+rcx*4]
+    movq [rdi      ],       mm1
+    movq [rdi+rcx  ],       mm1
+    movq [rdi+rcx*2],       mm1
+    movq [rdi+rax  ],       mm1
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_intra_pred_uv_dc128_mmx(
+;    unsigned char *dst,
+;    int dst_stride
+;    unsigned char *src,
+;    int src_stride,
+;    )
+global sym(vp8_intra_pred_uv_dc128_mmx)
+sym(vp8_intra_pred_uv_dc128_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    ; end prolog
+
+    ; write out
+    movq        mm1,        [GLOBAL(dc_128)]
+    mov         rax,        arg(0) ;dst;
+    movsxd      rdx,        dword ptr arg(1) ;dst_stride
+    lea         rcx,        [rdx*3]
+
+    movq [rax      ],       mm1
+    movq [rax+rdx  ],       mm1
+    movq [rax+rdx*2],       mm1
+    movq [rax+rcx  ],       mm1
+    lea         rax,        [rax+rdx*4]
+    movq [rax      ],       mm1
+    movq [rax+rdx  ],       mm1
+    movq [rax+rdx*2],       mm1
+    movq [rax+rcx  ],       mm1
+
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_intra_pred_uv_tm_sse2(
+;    unsigned char *dst,
+;    int dst_stride
+;    unsigned char *src,
+;    int src_stride,
+;    )
+%macro vp8_intra_pred_uv_tm 1
+global sym(vp8_intra_pred_uv_tm_%1)
+sym(vp8_intra_pred_uv_tm_%1):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; read top row
+    mov         edx,        4
+    mov         rsi,        arg(2) ;src;
+    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    sub         rsi,        rax
+    pxor        xmm0,       xmm0
+%ifidn %1, ssse3
+    movdqa      xmm2,       [GLOBAL(dc_1024)]
+%endif
+    movq        xmm1,       [rsi]
+    punpcklbw   xmm1,       xmm0
+
+    ; set up left ptrs ans subtract topleft
+    movd        xmm3,       [rsi-1]
+    lea         rsi,        [rsi+rax-1]
+%ifidn %1, sse2
+    punpcklbw   xmm3,       xmm0
+    pshuflw     xmm3,       xmm3, 0x0
+    punpcklqdq  xmm3,       xmm3
+%else
+    pshufb      xmm3,       xmm2
+%endif
+    psubw       xmm1,       xmm3
+
+    ; set up dest ptrs
+    mov         rdi,        arg(0) ;dst;
+    movsxd      rcx,        dword ptr arg(1) ;dst_stride
+
+vp8_intra_pred_uv_tm_%1_loop:
+    movd        xmm3,       [rsi]
+    movd        xmm5,       [rsi+rax]
+%ifidn %1, sse2
+    punpcklbw   xmm3,       xmm0
+    punpcklbw   xmm5,       xmm0
+    pshuflw     xmm3,       xmm3, 0x0
+    pshuflw     xmm5,       xmm5, 0x0
+    punpcklqdq  xmm3,       xmm3
+    punpcklqdq  xmm5,       xmm5
+%else
+    pshufb      xmm3,       xmm2
+    pshufb      xmm5,       xmm2
+%endif
+    paddw       xmm3,       xmm1
+    paddw       xmm5,       xmm1
+    packuswb    xmm3,       xmm5
+    movq  [rdi    ],        xmm3
+    movhps[rdi+rcx],        xmm3
+    lea         rsi,        [rsi+rax*2]
+    lea         rdi,        [rdi+rcx*2]
+    dec         edx
+    jnz vp8_intra_pred_uv_tm_%1_loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%endmacro
+
+vp8_intra_pred_uv_tm sse2
+vp8_intra_pred_uv_tm ssse3
+
+;void vp8_intra_pred_uv_ve_mmx(
+;    unsigned char *dst,
+;    int dst_stride
+;    unsigned char *src,
+;    int src_stride,
+;    )
+global sym(vp8_intra_pred_uv_ve_mmx)
+sym(vp8_intra_pred_uv_ve_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    ; end prolog
+
+    ; read from top
+    mov         rax,        arg(2) ;src;
+    movsxd      rdx,        dword ptr arg(3) ;src_stride;
+    sub         rax,        rdx
+    movq        mm1,        [rax]
+
+    ; write out
+    mov         rax,        arg(0) ;dst;
+    movsxd      rdx,        dword ptr arg(1) ;dst_stride
+    lea         rcx,        [rdx*3]
+
+    movq [rax      ],       mm1
+    movq [rax+rdx  ],       mm1
+    movq [rax+rdx*2],       mm1
+    movq [rax+rcx  ],       mm1
+    lea         rax,        [rax+rdx*4]
+    movq [rax      ],       mm1
+    movq [rax+rdx  ],       mm1
+    movq [rax+rdx*2],       mm1
+    movq [rax+rcx  ],       mm1
+
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_intra_pred_uv_ho_mmx2(
+;    unsigned char *dst,
+;    int dst_stride
+;    unsigned char *src,
+;    int src_stride,
+;    )
+global sym(vp8_intra_pred_uv_ho_mmx2)
+sym(vp8_intra_pred_uv_ho_mmx2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; read from left and write out
+    mov         edx,        4
+    mov         rsi,        arg(2) ;src;
+    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    mov         rdi,        arg(0) ;dst;
+    movsxd      rcx,        dword ptr arg(1) ;dst_stride
+    dec         rsi
+vp8_intra_pred_uv_ho_mmx2_loop:
+    movd        mm0,        [rsi]
+    movd        mm1,        [rsi+rax]
+    punpcklbw   mm0,        mm0
+    punpcklbw   mm1,        mm1
+    pshufw      mm0,        mm0, 0x0
+    pshufw      mm1,        mm1, 0x0
+    movq  [rdi    ],        mm0
+    movq  [rdi+rcx],        mm1
+    lea         rsi,        [rsi+rax*2]
+    lea         rdi,        [rdi+rcx*2]
+    dec         edx
+    jnz vp8_intra_pred_uv_ho_mmx2_loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+dc_128:
+    times 8 db 128
+dc_4:
+    times 4 dw 4
+align 16
+dc_1024:
+    times 8 dw 0x400
diff --git a/vp8/common/x86/recon_wrapper_sse2.c b/vp8/common/x86/recon_wrapper_sse2.c
new file mode 100644
index 000000000..86b4da2c2
--- /dev/null
+++ b/vp8/common/x86/recon_wrapper_sse2.c
@@ -0,0 +1,90 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "vp8/common/recon.h"
+#include "recon_x86.h"
+#include "vpx_mem/vpx_mem.h"
+
+#define build_intra_predictors_mbuv_prototype(sym) \
+    void sym(unsigned char *dst, int dst_stride, \
+             const unsigned char *src, int src_stride)
+typedef build_intra_predictors_mbuv_prototype((*build_intra_predictors_mbuv_fn_t));
+
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dctop_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dcleft_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc128_mmx);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ho_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ve_mmx);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_sse2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_ssse3);
+
+static void vp8_build_intra_predictors_mbuv_x86(MACROBLOCKD *x,
+                                                unsigned char *dst_u,
+                                                unsigned char *dst_v,
+                                                int dst_stride,
+                                                build_intra_predictors_mbuv_fn_t tm_func)
+{
+    int mode = x->mode_info_context->mbmi.uv_mode;
+    build_intra_predictors_mbuv_fn_t fn;
+    int src_stride = x->dst.uv_stride;
+
+    switch (mode) {
+        case  V_PRED: fn = vp8_intra_pred_uv_ve_mmx; break;
+        case  H_PRED: fn = vp8_intra_pred_uv_ho_mmx2; break;
+        case TM_PRED: fn = tm_func; break;
+        case DC_PRED:
+            if (x->up_available) {
+                if (x->left_available) {
+                    fn = vp8_intra_pred_uv_dc_mmx2; break;
+                } else {
+                    fn = vp8_intra_pred_uv_dctop_mmx2; break;
+                }
+            } else if (x->left_available) {
+                fn = vp8_intra_pred_uv_dcleft_mmx2; break;
+            } else {
+                fn = vp8_intra_pred_uv_dc128_mmx; break;
+            }
+            break;
+        default: return;
+    }
+
+    fn(dst_u, dst_stride, x->dst.u_buffer, src_stride);
+    fn(dst_v, dst_stride, x->dst.v_buffer, src_stride);
+}
+
+void vp8_build_intra_predictors_mbuv_sse2(MACROBLOCKD *x)
+{
+    vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256],
+                                        &x->predictor[320], 8,
+                                        vp8_intra_pred_uv_tm_sse2);
+}
+
+void vp8_build_intra_predictors_mbuv_ssse3(MACROBLOCKD *x)
+{
+    vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256],
+                                        &x->predictor[320], 8,
+                                        vp8_intra_pred_uv_tm_ssse3);
+}
+
+void vp8_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *x)
+{
+    vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer,
+                                        x->dst.v_buffer, x->dst.uv_stride,
+                                        vp8_intra_pred_uv_tm_sse2);
+}
+
+void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x)
+{
+    vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer,
+                                        x->dst.v_buffer, x->dst.uv_stride,
+                                        vp8_intra_pred_uv_tm_ssse3);
+}
diff --git a/vp8/common/x86/recon_x86.h b/vp8/common/x86/recon_x86.h
index 40ee65a12..fe0f8f0bc 100644
--- a/vp8/common/x86/recon_x86.h
+++ b/vp8/common/x86/recon_x86.h
@@ -46,6 +46,8 @@ extern prototype_copy_block(vp8_copy_mem16x16_mmx);
 extern prototype_recon_block(vp8_recon2b_sse2);
 extern prototype_recon_block(vp8_recon4b_sse2);
 extern prototype_copy_block(vp8_copy_mem16x16_sse2);
+extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_sse2);
+extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_s_sse2);
 
 #if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_recon_recon2
@@ -57,6 +59,26 @@ extern prototype_copy_block(vp8_copy_mem16x16_sse2);
 #undef  vp8_recon_copy16x16
 #define vp8_recon_copy16x16 vp8_copy_mem16x16_sse2
 
+#undef  vp8_recon_build_intra_predictors_mbuv
+#define vp8_recon_build_intra_predictors_mbuv vp8_build_intra_predictors_mbuv_sse2
+
+#undef  vp8_recon_build_intra_predictors_mbuv_s
+#define vp8_recon_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_sse2
+
+#endif
+#endif
+
+#if HAVE_SSSE3
+extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_ssse3);
+extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_s_ssse3);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_recon_build_intra_predictors_mbuv
+#define vp8_recon_build_intra_predictors_mbuv vp8_build_intra_predictors_mbuv_ssse3
+
+#undef  vp8_recon_build_intra_predictors_mbuv_s
+#define vp8_recon_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_ssse3
+
 #endif
 #endif
 #endif
diff --git a/vp8/common/x86/subpixel_sse2.asm b/vp8/common/x86/subpixel_sse2.asm
index b87cad259..83e3b1479 100644
--- a/vp8/common/x86/subpixel_sse2.asm
+++ b/vp8/common/x86/subpixel_sse2.asm
@@ -37,7 +37,7 @@ sym(vp8_filter_block1d8_h6_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -157,7 +157,7 @@ sym(vp8_filter_block1d16_h6_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -333,7 +333,7 @@ sym(vp8_filter_block1d8_v6_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 8
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -428,7 +428,7 @@ sym(vp8_filter_block1d16_v6_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 8
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -538,7 +538,7 @@ sym(vp8_filter_block1d8_h6_only_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -651,7 +651,7 @@ sym(vp8_filter_block1d16_h6_only_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -816,7 +816,7 @@ sym(vp8_filter_block1d8_v6_only_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -908,7 +908,6 @@ sym(vp8_unpack_block1d16_h6_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 5
-    ;SAVE_XMM                          ;xmm6, xmm7 are not used here.
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -948,7 +947,6 @@ unpack_block1d16_h6_sse2_rowloop:
     pop rdi
     pop rsi
     RESTORE_GOT
-    ;RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -969,7 +967,7 @@ sym(vp8_bilinear_predict16x16_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -1238,7 +1236,7 @@ sym(vp8_bilinear_predict8x8_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
diff --git a/vp8/common/x86/subpixel_ssse3.asm b/vp8/common/x86/subpixel_ssse3.asm
index 7f6fd93e4..1ddbc54bd 100644
--- a/vp8/common/x86/subpixel_ssse3.asm
+++ b/vp8/common/x86/subpixel_ssse3.asm
@@ -39,6 +39,7 @@ sym(vp8_filter_block1d8_h6_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -107,6 +108,7 @@ filter_block1d8_h6_rowloop_ssse3:
     pop rdi
     pop rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -162,6 +164,7 @@ filter_block1d8_h4_rowloop_ssse3:
     pop rdi
     pop rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -179,7 +182,7 @@ sym(vp8_filter_block1d16_h6_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -194,10 +197,6 @@ sym(vp8_filter_block1d16_h6_ssse3):
 
     mov         rdi, arg(2)                     ;output_ptr
 
-;;
-;;    cmp         esi, DWORD PTR [rax]
-;;    je          vp8_filter_block1d16_h4_ssse3
-
     mov         rsi, arg(0)                     ;src_ptr
 
     movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
@@ -271,61 +270,7 @@ filter_block1d16_h6_rowloop_ssse3:
     pop rdi
     pop rsi
     RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-vp8_filter_block1d16_h4_ssse3:
-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
-
-    mov         rsi, arg(0)             ;src_ptr
-    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
-    movsxd      rcx, dword ptr arg(4)   ;output_height
-    movsxd      rdx, dword ptr arg(3)   ;output_pitch
-
-filter_block1d16_h4_rowloop_ssse3:
-    movdqu      xmm1,   XMMWORD PTR [rsi - 2]
-
-    movdqa      xmm2, xmm1
-    pshufb      xmm1, [GLOBAL(shuf2b)]
-    pshufb      xmm2, [GLOBAL(shuf3b)]
-    pmaddubsw   xmm1, xmm5
-
-    movdqu      xmm3,   XMMWORD PTR [rsi + 6]
-
-    pmaddubsw   xmm2, xmm6
-    movdqa      xmm0, xmm3
-    pshufb      xmm3, [GLOBAL(shuf3b)]
-    pshufb      xmm0, [GLOBAL(shuf2b)]
-
-    paddsw      xmm1, [GLOBAL(rd)]
-    paddsw      xmm1, xmm2
-
-    pmaddubsw   xmm0, xmm5
-    pmaddubsw   xmm3, xmm6
-
-    psraw       xmm1, 7
-    packuswb    xmm1, xmm1
-    lea         rsi,    [rsi + rax]
-    paddsw      xmm3, xmm0
-    paddsw      xmm3, [GLOBAL(rd)]
-    psraw       xmm3, 7
-    packuswb    xmm3, xmm3
-
-    punpcklqdq  xmm1, xmm3
-
-    movdqa      XMMWORD Ptr [rdi], xmm1
-
-    add         rdi, rdx
-    dec         rcx
-    jnz         filter_block1d16_h4_rowloop_ssse3
-
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -344,6 +289,7 @@ sym(vp8_filter_block1d4_h6_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -451,6 +397,7 @@ filter_block1d4_h4_rowloop_ssse3:
     pop rdi
     pop rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -471,6 +418,7 @@ sym(vp8_filter_block1d16_v6_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -566,6 +514,7 @@ vp8_filter_block1d16_v6_ssse3_loop:
     pop rdi
     pop rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -638,6 +587,7 @@ vp8_filter_block1d16_v4_ssse3_loop:
     pop rdi
     pop rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -656,6 +606,7 @@ sym(vp8_filter_block1d8_v6_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -728,6 +679,7 @@ vp8_filter_block1d8_v6_ssse3_loop:
     pop rdi
     pop rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -776,6 +728,7 @@ vp8_filter_block1d8_v4_ssse3_loop:
     pop rdi
     pop rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -932,7 +885,7 @@ sym(vp8_bilinear_predict16x16_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -1195,7 +1148,7 @@ sym(vp8_bilinear_predict8x8_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
diff --git a/vp8/common/x86/x86_systemdependent.c b/vp8/common/x86/x86_systemdependent.c
index e89c07a4f..17667330a 100644
--- a/vp8/common/x86/x86_systemdependent.c
+++ b/vp8/common/x86/x86_systemdependent.c
@@ -88,6 +88,10 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
         rtcd->recon.recon2      = vp8_recon2b_sse2;
         rtcd->recon.recon4      = vp8_recon4b_sse2;
         rtcd->recon.copy16x16   = vp8_copy_mem16x16_sse2;
+        rtcd->recon.build_intra_predictors_mbuv =
+            vp8_build_intra_predictors_mbuv_sse2;
+        rtcd->recon.build_intra_predictors_mbuv_s =
+            vp8_build_intra_predictors_mbuv_s_sse2;
 
         rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_sse2;
 
@@ -126,6 +130,11 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
         rtcd->subpix.sixtap4x4     = vp8_sixtap_predict4x4_ssse3;
         rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_ssse3;
         rtcd->subpix.bilinear8x8   = vp8_bilinear_predict8x8_ssse3;
+
+        rtcd->recon.build_intra_predictors_mbuv =
+            vp8_build_intra_predictors_mbuv_ssse3;
+        rtcd->recon.build_intra_predictors_mbuv_s =
+            vp8_build_intra_predictors_mbuv_s_ssse3;
     }
 #endif
 
diff --git a/vp8/decoder/dboolhuff.h b/vp8/decoder/dboolhuff.h
index a83e3f012..5f6b211ea 100644
--- a/vp8/decoder/dboolhuff.h
+++ b/vp8/decoder/dboolhuff.h
@@ -51,19 +51,26 @@ void vp8dx_bool_decoder_fill(BOOL_DECODER *br);
 #define VP8DX_BOOL_DECODER_FILL(_count,_value,_bufptr,_bufend) \
     do \
     { \
-        int shift; \
-        for(shift = VP8_BD_VALUE_SIZE - 8 - ((_count) + 8); shift >= 0; ) \
+        int shift = VP8_BD_VALUE_SIZE - 8 - ((_count) + 8); \
+        int loop_end, x; \
+        size_t bits_left = ((_bufend)-(_bufptr))*CHAR_BIT; \
+        \
+        x = shift + CHAR_BIT - bits_left; \
+        loop_end = 0; \
+        if(x >= 0) \
         { \
-            if((_bufptr) >= (_bufend)) { \
-                (_count) = VP8_LOTS_OF_BITS; \
-                break; \
-            } \
-            (_count) += 8; \
+            (_count) += VP8_LOTS_OF_BITS; \
+            loop_end = x; \
+            if(!bits_left) break; \
+        } \
+        while(shift >= loop_end) \
+        { \
+            (_count) += CHAR_BIT; \
             (_value) |= (VP8_BD_VALUE)*(_bufptr)++ << shift; \
-            shift -= 8; \
+            shift -= CHAR_BIT; \
         } \
     } \
-    while(0)
+    while(0) \
 
 
 static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) {
@@ -119,18 +126,19 @@ static int vp8_decode_value(BOOL_DECODER *br, int bits)
 
 static int vp8dx_bool_error(BOOL_DECODER *br)
 {
-  /* Check if we have reached the end of the buffer.
-   *
-   * Variable 'count' stores the number of bits in the 'value' buffer,
-   * minus 8. So if count == 8, there are 16 bits available to be read.
-   * Normally, count is filled with 8 and one byte is filled into the
-   * value buffer. When we reach the end of the buffer, count is instead
-   * filled with VP8_LOTS_OF_BITS, 8 of which represent the last 8 real
-   * bits from the bitstream. So the last bit in the bitstream will be
-   * represented by count == VP8_LOTS_OF_BITS - 16.
-   */
-    if ((br->count > VP8_BD_VALUE_SIZE)
-        && (br->count <= VP8_LOTS_OF_BITS - 16))
+    /* Check if we have reached the end of the buffer.
+     *
+     * Variable 'count' stores the number of bits in the 'value' buffer, minus
+     * 8. The top byte is part of the algorithm, and the remainder is buffered
+     * to be shifted into it. So if count == 8, the top 16 bits of 'value' are
+     * occupied, 8 for the algorithm and 8 in the buffer.
+     *
+     * When reading a byte from the user's buffer, count is filled with 8 and
+     * one byte is filled into the value buffer. When we reach the end of the
+     * data, count is additionally filled with VP8_LOTS_OF_BITS. So when
+     * count == VP8_LOTS_OF_BITS - 1, the user's data has been exhausted.
+     */
+    if ((br->count > VP8_BD_VALUE_SIZE) && (br->count < VP8_LOTS_OF_BITS))
     {
        /* We have tried to decode bits after the end of
         * stream was encountered.
diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index 82841e8b8..a585f774c 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -111,16 +111,17 @@ void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd)
  */
 static void skip_recon_mb(VP8D_COMP *pbi, MACROBLOCKD *xd)
 {
-    if (xd->frame_type == KEY_FRAME  ||  xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+    if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
     {
-
-        vp8_build_intra_predictors_mbuv_s(xd);
+        RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_mbuv_s)(xd);
         RECON_INVOKE(&pbi->common.rtcd.recon,
                      build_intra_predictors_mby_s)(xd);
     }
     else
     {
-        vp8_build_inter_predictors_mb_s(xd);
+        vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
+                                           xd->dst.u_buffer, xd->dst.v_buffer,
+                                           xd->dst.y_stride, xd->dst.uv_stride);
     }
 }
 
@@ -195,11 +196,15 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
         clamp_mvs(xd);
     }
 
-    xd->mode_info_context->mbmi.dc_diff = 1;
-
-    if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV && eobtotal == 0)
+    eobtotal |= (xd->mode_info_context->mbmi.mode == B_PRED ||
+                  xd->mode_info_context->mbmi.mode == SPLITMV);
+    if (!eobtotal)
     {
-        xd->mode_info_context->mbmi.dc_diff = 0;
+        /* Special case:  Force the loopfilter to skip when eobtotal and
+         * mb_skip_coeff are zero.
+         * */
+        xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+
         skip_recon_mb(pbi, xd);
         return;
     }
@@ -208,9 +213,9 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
         mb_init_dequantizer(pbi, xd);
 
     /* do prediction */
-    if (xd->frame_type == KEY_FRAME  ||  xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+    if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
     {
-        vp8_build_intra_predictors_mbuv(xd);
+        RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_mbuv)(xd);
 
         if (xd->mode_info_context->mbmi.mode != B_PRED)
         {
@@ -218,6 +223,9 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
                          build_intra_predictors_mby)(xd);
         } else {
             vp8_intra_prediction_down_copy(xd);
+
+
+
         }
     }
     else
@@ -229,6 +237,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
     if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV)
     {
         BLOCKD *b = &xd->block[24];
+
         DEQUANT_INVOKE(&pbi->dequant, block)(b);
 
         /* do 2nd order transform on the dc block */
@@ -255,13 +264,14 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
                          xd->predictor, xd->dst.y_buffer,
                          xd->dst.y_stride, xd->eobs, xd->block[24].diff);
     }
-    else if ((xd->frame_type == KEY_FRAME  ||  xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) && xd->mode_info_context->mbmi.mode == B_PRED)
+    else if (xd->mode_info_context->mbmi.mode == B_PRED)
     {
         for (i = 0; i < 16; i++)
         {
 
             BLOCKD *b = &xd->block[i];
-            vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+            RECON_INVOKE(RTCD_VTABLE(recon), intra4x4_predict)
+                         (b, b->bmi.mode, b->predictor);
 
             if (xd->eobs[i] > 1)
             {
diff --git a/vp8/decoder/detokenize.c b/vp8/decoder/detokenize.c
index c22e0f28c..c22e0f28c 100755..100644
--- a/vp8/decoder/detokenize.c
+++ b/vp8/decoder/detokenize.c
diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c
index ef2e00d61..1e83ab542 100644
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -76,7 +76,6 @@ VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
     pbi->common.current_video_frame = 0;
     pbi->ready_for_new_data = 1;
 
-    pbi->CPUFreq = 0; /*vp8_get_processor_freq();*/
 #if CONFIG_MULTITHREAD
     pbi->max_threads = oxcf->max_threads;
     vp8_decoder_create_threads(pbi);
@@ -252,7 +251,6 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
     VP8D_COMP *pbi = (VP8D_COMP *) ptr;
     VP8_COMMON *cm = &pbi->common;
     int retcode = 0;
-    struct vpx_usec_timer timer;
 
     /*if(pbi->ready_for_new_data == 0)
         return -1;*/
@@ -317,8 +315,6 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
 
     pbi->common.error.setjmp = 1;
 
-    vpx_usec_timer_start(&timer);
-
     /*cm->current_video_frame++;*/
     pbi->Source = source;
     pbi->source_sz = size;
@@ -379,15 +375,9 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
 
         if(pbi->common.filter_level)
         {
-            struct vpx_usec_timer lpftimer;
-            vpx_usec_timer_start(&lpftimer);
             /* Apply the loop filter if appropriate. */
-
             vp8_loop_filter_frame(cm, &pbi->mb, cm->filter_level);
 
-            vpx_usec_timer_mark(&lpftimer);
-            pbi->time_loop_filtering += vpx_usec_timer_elapsed(&lpftimer);
-
             cm->last_frame_type = cm->frame_type;
             cm->last_filter_type = cm->filter_type;
             cm->last_sharpness_level = cm->sharpness_level;
@@ -398,11 +388,6 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
 
     vp8_clear_system_state();
 
-    vpx_usec_timer_mark(&timer);
-    pbi->decode_microseconds = vpx_usec_timer_elapsed(&timer);
-
-    pbi->time_decoding += pbi->decode_microseconds;
-
     /*vp8_print_modes_and_motion_vectors( cm->mi, cm->mb_rows,cm->mb_cols, cm->current_video_frame);*/
 
     if (cm->show_frame)
diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h
index ac1e332e8..9b9175628 100644
--- a/vp8/decoder/onyxd_int.h
+++ b/vp8/decoder/onyxd_int.h
@@ -81,12 +81,6 @@ typedef struct VP8Decompressor
     const unsigned char *Source;
     unsigned int   source_sz;
 
-
-    unsigned int CPUFreq;
-    unsigned int decode_microseconds;
-    unsigned int time_decoding;
-    unsigned int time_loop_filtering;
-
 #if CONFIG_MULTITHREAD
     /* variable for threading */
 
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index 3d9d428ef..9ef85e9cd 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -108,21 +108,26 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
         clamp_mvs(xd);
     }
 
-    xd->mode_info_context->mbmi.dc_diff = 1;
-
-    if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV && eobtotal == 0)
+    eobtotal |= (xd->mode_info_context->mbmi.mode == B_PRED ||
+                  xd->mode_info_context->mbmi.mode == SPLITMV);
+    if (!eobtotal)
     {
-        xd->mode_info_context->mbmi.dc_diff = 0;
+        /* Special case:  Force the loopfilter to skip when eobtotal and
+         * mb_skip_coeff are zero.
+         * */
+        xd->mode_info_context->mbmi.mb_skip_coeff = 1;
 
         /*mt_skip_recon_mb(pbi, xd, mb_row, mb_col);*/
-        if (xd->frame_type == KEY_FRAME  ||  xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+        if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
         {
             vp8mt_build_intra_predictors_mbuv_s(pbi, xd, mb_row, mb_col);
             vp8mt_build_intra_predictors_mby_s(pbi, xd, mb_row, mb_col);
         }
         else
         {
-            vp8_build_inter_predictors_mb_s(xd);
+            vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
+                                               xd->dst.u_buffer, xd->dst.v_buffer,
+                                               xd->dst.y_stride, xd->dst.uv_stride);
         }
         return;
     }
@@ -322,6 +327,7 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data)
 
                         if (pbi->common.filter_level)
                         {
+                            int skip_lf;
                             if( mb_row != pc->mb_rows-1 )
                             {
                                 /* Save decoded MB last row data for next-row decoding */
@@ -349,6 +355,10 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data)
 
                             /* update loopfilter info */
                             Segment = (alt_flt_enabled) ? xd->mode_info_context->mbmi.segment_id : 0;
+                            skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED &&
+                                            xd->mode_info_context->mbmi.mode != SPLITMV &&
+                                            xd->mode_info_context->mbmi.mb_skip_coeff);
+
                             filter_level = pbi->mt_baseline_filter_level[Segment];
                             /* Distance of Mb to the various image edges.
                              * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
@@ -360,17 +370,17 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data)
                             if (filter_level)
                             {
                                 if (mb_col > 0)
-                                    pc->lf_mbv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf);
+                                    pc->lf_mbv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]);
 
-                                if (xd->mode_info_context->mbmi.dc_diff > 0)
-                                    pc->lf_bv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf);
+                                if (!skip_lf)
+                                    pc->lf_bv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]);
 
                                 /* don't apply across umv border */
                                 if (mb_row > 0)
-                                    pc->lf_mbh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf);
+                                    pc->lf_mbh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]);
 
-                                if (xd->mode_info_context->mbmi.dc_diff > 0)
-                                    pc->lf_bh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf);
+                                if (!skip_lf)
+                                    pc->lf_bh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]);
                             }
                         }
 
@@ -429,12 +439,18 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi)
 
     pbi->b_multithreaded_rd = 0;
     pbi->allocated_decoding_thread_count = 0;
-    core_count = (pbi->max_threads > 16) ? 16 : pbi->max_threads;
+
+    /* limit decoding threads to the max number of token partitions */
+    core_count = (pbi->max_threads > 8) ? 8 : pbi->max_threads;
+
+    /* limit decoding threads to the available cores */
+    if (core_count > pbi->common.processor_core_count)
+        core_count = pbi->common.processor_core_count;
 
     if (core_count > 1)
     {
         pbi->b_multithreaded_rd = 1;
-        pbi->decoding_thread_count = core_count -1;
+        pbi->decoding_thread_count = core_count - 1;
 
         CHECK_MEM_ERROR(pbi->h_decoding_thread, vpx_malloc(sizeof(pthread_t) * pbi->decoding_thread_count));
         CHECK_MEM_ERROR(pbi->h_event_start_decoding, vpx_malloc(sizeof(sem_t) * pbi->decoding_thread_count));
@@ -810,6 +826,7 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
 
                 if (pbi->common.filter_level)
                 {
+                    int skip_lf;
                     /* Save decoded MB last row data for next-row decoding */
                     if(mb_row != pc->mb_rows-1)
                     {
@@ -837,6 +854,9 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
 
                     /* update loopfilter info */
                     Segment = (alt_flt_enabled) ? xd->mode_info_context->mbmi.segment_id : 0;
+                    skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED &&
+                                    xd->mode_info_context->mbmi.mode != SPLITMV &&
+                                    xd->mode_info_context->mbmi.mb_skip_coeff);
                     filter_level = pbi->mt_baseline_filter_level[Segment];
                     /* Distance of Mb to the various image edges.
                      * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
@@ -848,17 +868,17 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
                     if (filter_level)
                     {
                         if (mb_col > 0)
-                            pc->lf_mbv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf);
+                            pc->lf_mbv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]);
 
-                        if (xd->mode_info_context->mbmi.dc_diff > 0)
-                            pc->lf_bv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf);
+                        if (!skip_lf)
+                            pc->lf_bv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]);
 
                         /* don't apply across umv border */
                         if (mb_row > 0)
-                            pc->lf_mbh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf);
+                            pc->lf_mbh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]);
 
-                        if (xd->mode_info_context->mbmi.dc_diff > 0)
-                            pc->lf_bh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf);
+                        if (!skip_lf)
+                            pc->lf_bh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]);
                     }
                 }
 
diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h
index 5a2568dde..6de4c8517 100644
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -34,7 +34,7 @@ typedef struct
     // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
     short *quant;
     short *quant_fast;
-    short *quant_shift;
+    unsigned char *quant_shift;
     short *zbin;
     short *zrun_zbin_boost;
     short *round;
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index ab4071d35..4a936ec4a 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -147,7 +147,7 @@ static const int qzbin_factors_y2[129] =
 #define EXACT_QUANT
 #ifdef EXACT_QUANT
 static void vp8cx_invert_quant(int improved_quant, short *quant,
-                               short *shift, short d)
+                               unsigned char *shift, short d)
 {
     if(improved_quant)
     {
@@ -1157,7 +1157,6 @@ int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
 
     x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
 
-#if !(CONFIG_REALTIME_ONLY)
     if (cpi->sf.RD && cpi->compressor_speed != 2)
     {
         vp8_rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv);
@@ -1170,7 +1169,6 @@ int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
         rate += (Error4x4 < Error16x16) ? rate4x4 : rate16x16;
     }
     else
-#endif
     {
         int rate2, best_distortion;
         MB_PREDICTION_MODE mode, best_mode = DC_PRED;
@@ -1188,7 +1186,7 @@ int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
                 (&x->e_mbd);
             distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, 0x7fffffff);
             rate2  = x->mbmode_cost[x->e_mbd.frame_type][mode];
-            this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2);
+            this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
 
             if (Error16x16 > this_rd)
             {
@@ -1242,8 +1240,6 @@ int vp8cx_encode_inter_macroblock
     else
         x->encode_breakout = cpi->oxcf.encode_breakout;
 
-#if !(CONFIG_REALTIME_ONLY)
-
     if (cpi->sf.RD)
     {
         int zbin_mode_boost_enabled = cpi->zbin_mode_boost_enabled;
@@ -1270,7 +1266,6 @@ int vp8cx_encode_inter_macroblock
 
     }
     else
-#endif
         vp8_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, &distortion, &intra_error);
 
     cpi->prediction_error += distortion;
@@ -1386,7 +1381,7 @@ int vp8cx_encode_inter_macroblock
             cpi->MVcount[1][mv_max+((xd->block[0].bmi.mv.as_mv.col - best_ref_mv.col) >> 1)]++;
         }
 
-        if (!x->skip && !x->e_mbd.mode_info_context->mbmi.force_no_skip)
+        if (!x->skip)
         {
             vp8_encode_inter16x16(IF_RTCD(&cpi->rtcd), x);
 
@@ -1396,7 +1391,10 @@ int vp8cx_encode_inter_macroblock
 
         }
         else
-            vp8_stuff_inter16x16(x);
+            vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
+                                           xd->dst.u_buffer, xd->dst.v_buffer,
+                                           xd->dst.y_stride, xd->dst.uv_stride);
+
     }
 
     if (!x->skip)
@@ -1405,11 +1403,6 @@ int vp8cx_encode_inter_macroblock
     {
         if (cpi->common.mb_no_coeff_skip)
         {
-            if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV)
-                xd->mode_info_context->mbmi.dc_diff = 0;
-            else
-                xd->mode_info_context->mbmi.dc_diff = 1;
-
             xd->mode_info_context->mbmi.mb_skip_coeff = 1;
             cpi->skip_true_count ++;
             vp8_fix_contexts(xd);
diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c
index 44000063c..9517a1d89 100644
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -32,7 +32,8 @@
 #endif
 void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode)
 {
-    vp8_predict_intra4x4(b, best_mode, b->predictor);
+    RECON_INVOKE(&rtcd->common->recon, intra4x4_predict)
+                 (b, best_mode, b->predictor);
 
     ENCODEMB_INVOKE(&rtcd->encodemb, subb)(be, b, 16);
 
@@ -75,14 +76,9 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
 
     vp8_quantize_mby(x);
 
-#if !(CONFIG_REALTIME_ONLY)
-#if 1
     if (x->optimize)
         vp8_optimize_mby(x, rtcd);
 
-#endif
-#endif
-
     vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
 
     RECON_INVOKE(&rtcd->common->recon, recon_mby)
@@ -118,7 +114,7 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
 
 void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
 {
-    vp8_build_intra_predictors_mbuv(&x->e_mbd);
+    RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mbuv)(&x->e_mbd);
 
     ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
 
@@ -126,15 +122,9 @@ void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
 
     vp8_quantize_mbuv(x);
 
-#if !(CONFIG_REALTIME_ONLY)
-#if 1
-
     if (x->optimize==2 ||(x->optimize && x->rddiv > 1))
         vp8_optimize_mbuv(x, rtcd);
 
-#endif
-#endif
-
     vp8_inverse_transform_mbuv(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
 
     vp8_recon_intra_mbuv(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c
index 463dbcaa9..2509e0698 100644
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -19,6 +19,7 @@
 #include "vp8/common/reconintra.h"
 #include "dct.h"
 #include "vpx_mem/vpx_mem.h"
+#include "rdopt.h"
 
 #if CONFIG_RUNTIME_CPU_DETECT
 #define IF_RTCD(x) (x)
@@ -195,42 +196,7 @@ static void transform_mby(MACROBLOCK *x)
 }
 
 
-void vp8_stuff_inter16x16(MACROBLOCK *x)
-{
-    vp8_build_inter_predictors_mb_s(&x->e_mbd);
-    /*
-        // recon = copy from predictors to destination
-        {
-            BLOCKD *b = &x->e_mbd.block[0];
-            unsigned char *pred_ptr = b->predictor;
-            unsigned char *dst_ptr = *(b->base_dst) + b->dst;
-            int stride = b->dst_stride;
-
-            int i;
-            for(i=0;i<16;i++)
-                vpx_memcpy(dst_ptr+i*stride,pred_ptr+16*i,16);
-
-            b = &x->e_mbd.block[16];
-            pred_ptr = b->predictor;
-            dst_ptr = *(b->base_dst) + b->dst;
-            stride = b->dst_stride;
-
-            for(i=0;i<8;i++)
-                vpx_memcpy(dst_ptr+i*stride,pred_ptr+8*i,8);
-
-            b = &x->e_mbd.block[20];
-            pred_ptr = b->predictor;
-            dst_ptr = *(b->base_dst) + b->dst;
-            stride = b->dst_stride;
-
-            for(i=0;i<8;i++)
-                vpx_memcpy(dst_ptr+i*stride,pred_ptr+8*i,8);
-        }
-    */
-}
 
-#if !(CONFIG_REALTIME_ONLY)
-#define RDCOST(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )
 #define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
 
 typedef struct vp8_token_state vp8_token_state;
@@ -608,7 +574,6 @@ void vp8_optimize_mbuv(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
             ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
     }
 }
-#endif
 
 void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
 {
@@ -620,10 +585,8 @@ void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
 
     vp8_quantize_mb(x);
 
-#if !(CONFIG_REALTIME_ONLY)
     if (x->optimize)
         optimize_mb(x, rtcd);
-#endif
 
     vp8_inverse_transform_mb(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
 
@@ -635,7 +598,7 @@ void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
 /* this funciton is used by first pass only */
 void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
 {
-    vp8_build_inter_predictors_mby(&x->e_mbd);
+    vp8_build_inter16x16_predictors_mby(&x->e_mbd);
 
     ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride);
 
diff --git a/vp8/encoder/encodemb.h b/vp8/encoder/encodemb.h
index 8c93aa180..47fc72dad 100644
--- a/vp8/encoder/encodemb.h
+++ b/vp8/encoder/encodemb.h
@@ -95,8 +95,6 @@ typedef struct
 struct VP8_ENCODER_RTCD;
 void vp8_encode_inter16x16(const struct VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x);
 
-extern void vp8_stuff_inter16x16(MACROBLOCK *x);
-
 void vp8_build_dcblock(MACROBLOCK *b);
 void vp8_transform_mb(MACROBLOCK *mb);
 void vp8_transform_mbuv(MACROBLOCK *x);
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index f5006ddab..c00494dcf 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -459,15 +459,15 @@ void vp8cx_create_encoder_threads(VP8_COMP *cpi)
 
     cpi->b_multi_threaded = 0;
     cpi->encoding_thread_count = 0;
-    cpi->processor_core_count = 32; //vp8_get_proc_core_count();
 
-    if (cpi->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1)
+    if (cm->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1)
     {
         int ithread;
         int th_count = cpi->oxcf.multi_threaded - 1;
 
-        if (cpi->oxcf.multi_threaded > cpi->processor_core_count)
-            th_count = cpi->processor_core_count - 1;
+        /* don't allocate more threads than cores available */
+        if (cpi->oxcf.multi_threaded > cm->processor_core_count)
+            th_count = cm->processor_core_count - 1;
 
         /* we have th_count + 1 (main) threads processing one row each */
         /* no point to have more threads than the sync range allows */
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index 8f24a11a0..6f330991b 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -786,7 +786,8 @@ void vp8_first_pass(VP8_COMP *cpi)
 
         // TODO:  handle the case when duration is set to 0, or something less
         // than the full time between subsequent cpi->source_time_stamp s  .
-        fps.duration = cpi->source_end_time_stamp - cpi->source_time_stamp;
+        fps.duration = cpi->source->ts_end
+                       - cpi->source->ts_start;
 
         // don't want to do output stats with a stack variable!
         memcpy(cpi->this_frame_stats,
diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c
index 1d672bef9..d48c95bf7 100644
--- a/vp8/encoder/generic/csystemdependent.c
+++ b/vp8/encoder/generic/csystemdependent.c
@@ -89,9 +89,7 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)
 
     cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;
     cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_c;
-#if !(CONFIG_REALTIME_ONLY)
     cpi->rtcd.search.full_search             = vp8_full_search_sad;
-#endif
     cpi->rtcd.search.diamond_search          = vp8_diamond_search_sad;
 #if !(CONFIG_REALTIME_ONLY)
     cpi->rtcd.temporal.apply                 = vp8_temporal_filter_apply_c;
diff --git a/vp8/encoder/lookahead.c b/vp8/encoder/lookahead.c
new file mode 100644
index 000000000..3b86d4094
--- /dev/null
+++ b/vp8/encoder/lookahead.c
@@ -0,0 +1,157 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <assert.h>
+#include <stdlib.h>
+#include "vpx_config.h"
+#include "lookahead.h"
+#include "vp8/common/extend.h"
+
+#define MAX_LAG_BUFFERS (CONFIG_REALTIME_ONLY? 1 : 25)
+
+struct lookahead_ctx
+{
+    unsigned int max_sz;         /* Absolute size of the queue */
+    unsigned int sz;             /* Number of buffers currently in the queue */
+    unsigned int read_idx;       /* Read index */
+    unsigned int write_idx;      /* Write index */
+    struct lookahead_entry *buf; /* Buffer list */
+};
+
+
+/* Return the buffer at the given absolute index and increment the index */
+static struct lookahead_entry *
+pop(struct lookahead_ctx *ctx,
+    unsigned int         *idx)
+{
+    unsigned int            index = *idx;
+    struct lookahead_entry *buf = ctx->buf + index;
+
+    assert(index < ctx->max_sz);
+    if(++index >= ctx->max_sz)
+        index -= ctx->max_sz;
+    *idx = index;
+    return buf;
+}
+
+
+void
+vp8_lookahead_destroy(struct lookahead_ctx *ctx)
+{
+    if(ctx)
+    {
+        if(ctx->buf)
+        {
+            int i;
+
+            for(i = 0; i < ctx->max_sz; i++)
+                vp8_yv12_de_alloc_frame_buffer(&ctx->buf[i].img);
+            free(ctx->buf);
+        }
+        free(ctx);
+    }
+}
+
+
+struct lookahead_ctx*
+vp8_lookahead_init(unsigned int width,
+                   unsigned int height,
+                   unsigned int depth)
+{
+    struct lookahead_ctx *ctx = NULL;
+    int i;
+
+    /* Clamp the lookahead queue depth */
+    if(depth < 1)
+        depth = 1;
+    else if(depth > MAX_LAG_BUFFERS)
+        depth = MAX_LAG_BUFFERS;
+
+    /* Align the buffer dimensions */
+    width = (width + 15) & ~15;
+    height = (height + 15) & ~15;
+
+    /* Allocate the lookahead structures */
+    ctx = calloc(1, sizeof(*ctx));
+    if(ctx)
+    {
+        ctx->max_sz = depth;
+        ctx->buf = calloc(depth, sizeof(*ctx->buf));
+        if(!ctx->buf)
+            goto bail;
+        for(i=0; i<depth; i++)
+            if (vp8_yv12_alloc_frame_buffer(&ctx->buf[i].img, width, height, 16))
+                goto bail;
+    }
+    return ctx;
+bail:
+    vp8_lookahead_destroy(ctx);
+    return NULL;
+}
+
+
+int
+vp8_lookahead_push(struct lookahead_ctx *ctx,
+                   YV12_BUFFER_CONFIG   *src,
+                   int64_t               ts_start,
+                   int64_t               ts_end,
+                   unsigned int          flags)
+{
+    struct lookahead_entry* buf;
+
+    if(ctx->sz + 1 > ctx->max_sz)
+        return 1;
+    ctx->sz++;
+    buf = pop(ctx, &ctx->write_idx);
+    vp8_copy_and_extend_frame(src, &buf->img);
+    buf->ts_start = ts_start;
+    buf->ts_end = ts_end;
+    buf->flags = flags;
+    return 0;
+}
+
+
+struct lookahead_entry*
+vp8_lookahead_pop(struct lookahead_ctx *ctx,
+                  int                   drain)
+{
+    struct lookahead_entry* buf = NULL;
+
+    if(ctx->sz && (drain || ctx->sz == ctx->max_sz))
+    {
+        buf = pop(ctx, &ctx->read_idx);
+        ctx->sz--;
+    }
+    return buf;
+}
+
+
+struct lookahead_entry*
+vp8_lookahead_peek(struct lookahead_ctx *ctx,
+                   int                   index)
+{
+    struct lookahead_entry* buf = NULL;
+
+    assert(index < ctx->max_sz);
+    if(index < ctx->sz)
+    {
+        index += ctx->read_idx;
+        if(index >= ctx->max_sz)
+            index -= ctx->max_sz;
+        buf = ctx->buf + index;
+    }
+    return buf;
+}
+
+
+unsigned int
+vp8_lookahead_depth(struct lookahead_ctx *ctx)
+{
+    return ctx->sz;
+}
diff --git a/vp8/encoder/lookahead.h b/vp8/encoder/lookahead.h
new file mode 100644
index 000000000..a483d7e0b
--- /dev/null
+++ b/vp8/encoder/lookahead.h
@@ -0,0 +1,101 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef LOOKAHEAD_H
+#define LOOKAHEAD_H
+#include "vpx_scale/yv12config.h"
+#include "vpx/vpx_integer.h"
+
+struct lookahead_entry
+{
+    YV12_BUFFER_CONFIG  img;
+    int64_t             ts_start;
+    int64_t             ts_end;
+    unsigned int        flags;
+};
+
+
+struct lookahead_ctx;
+
+/**\brief Initializes the lookahead stage
+ *
+ * The lookahead stage is a queue of frame buffers on which some analysis
+ * may be done when buffers are enqueued.
+ *
+ *
+ */
+struct lookahead_ctx* vp8_lookahead_init(unsigned int width,
+                                         unsigned int height,
+                                         unsigned int depth
+                                         );
+
+
+/**\brief Destroys the lookahead stage
+ *
+ */
+void vp8_lookahead_destroy(struct lookahead_ctx *ctx);
+
+
+/**\brief Enqueue a source buffer
+ *
+ * This function will copy the source image into a new framebuffer with
+ * the expected stride/border.
+ *
+ * \param[in] ctx       Pointer to the lookahead context
+ * \param[in] src       Pointer to the image to enqueue
+ * \param[in] ts_start  Timestamp for the start of this frame
+ * \param[in] ts_end    Timestamp for the end of this frame
+ * \param[in] flags     Flags set on this frame
+ */
+int
+vp8_lookahead_push(struct lookahead_ctx *ctx,
+                   YV12_BUFFER_CONFIG   *src,
+                   int64_t               ts_start,
+                   int64_t               ts_end,
+                   unsigned int          flags);
+
+
+/**\brief Get the next source buffer to encode
+ *
+ *
+ * \param[in] ctx       Pointer to the lookahead context
+ * \param[in] drain     Flag indicating the buffer should be drained
+ *                      (return a buffer regardless of the current queue depth)
+ *
+ * \retval NULL, if drain set and queue is empty
+ * \retval NULL, if drain not set and queue not of the configured depth
+ *
+ */
+struct lookahead_entry*
+vp8_lookahead_pop(struct lookahead_ctx *ctx,
+                  int                   drain);
+
+
+/**\brief Get a future source buffer to encode
+ *
+ * \param[in] ctx       Pointer to the lookahead context
+ * \param[in] index     Index of the frame to be returned, 0 == next frame
+ *
+ * \retval NULL, if no buffer exists at the specified index
+ *
+ */
+struct lookahead_entry*
+vp8_lookahead_peek(struct lookahead_ctx *ctx,
+                   int                   index);
+
+
+/**\brief Get the number of frames currently in the lookahead queue
+ *
+ * \param[in] ctx       Pointer to the lookahead context
+ */
+unsigned int
+vp8_lookahead_depth(struct lookahead_ctx *ctx);
+
+
+#endif
diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c
index 716f514af..9d447b210 100644
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -194,13 +194,13 @@ void vp8_init3smotion_compensation(MACROBLOCK *x, int stride)
 #define DIST(r,c) vfp->svf( PRE(r,c), d->pre_stride, SP(c),SP(r), z,b->src_stride,&sse) // returns subpixel variance error function.
 #define IFMVCV(r,c,s,e) if ( c >= minc && c <= maxc && r >= minr && r <= maxr) s else e;
 #define ERR(r,c) (MVC(r,c)+DIST(r,c)) // returns distortion + motion vector cost
-#define CHECK_BETTER(v,r,c) IFMVCV(r,c,{if((v = ERR(r,c)) < besterr) { besterr = v; br=r; bc=c; }}, v=INT_MAX;)// checks if (r,c) has better score than previous best
+#define CHECK_BETTER(v,r,c) IFMVCV(r,c,{thismse = DIST(r,c); if((v = (MVC(r,c)+thismse)) < besterr) { besterr = v; br=r; bc=c; *distortion = thismse; *sse1 = sse; }}, v=INT_MAX;)// checks if (r,c) has better score than previous best
 #define MIN(x,y) (((x)<(y))?(x):(y))
 #define MAX(x,y) (((x)>(y))?(x):(y))
 
 //#define CHECK_BETTER(v,r,c) if((v = ERR(r,c)) < besterr) { besterr = v; br=r; bc=c; }
 
-int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2])
+int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion, unsigned int *sse1)
 {
     unsigned char *y = *(d->base_pre) + d->pre + (bestmv->row) * d->pre_stride + bestmv->col;
     unsigned char *z = (*(b->base_src) + b->src);
@@ -214,6 +214,7 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
     unsigned int whichdir;
     unsigned int halfiters = 4;
     unsigned int quarteriters = 4;
+    int thismse;
 
     int minc = MAX(x->mv_col_min << 2, (ref_mv->col >> 1) - ((1 << mvlong_width) - 1));
     int maxc = MIN(x->mv_col_max << 2, (ref_mv->col >> 1) + ((1 << mvlong_width) - 1));
@@ -225,7 +226,8 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
     bestmv->col <<= 3;
 
     // calculate central point error
-    besterr = vfp->vf(y, d->pre_stride, z, b->src_stride, &sse);
+    besterr = vfp->vf(y, d->pre_stride, z, b->src_stride, sse1);
+    *distortion = besterr;
     besterr += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
 
     // TODO: Each subsequent iteration checks at least one point in common with the last iteration could be 2 ( if diag selected)
@@ -314,7 +316,7 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
 #undef CHECK_BETTER
 #undef MIN
 #undef MAX
-int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2])
+int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion, unsigned int *sse1)
 {
     int bestmse = INT_MAX;
     MV startmv;
@@ -325,6 +327,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
     int left, right, up, down, diag;
     unsigned int sse;
     int whichdir ;
+    int thismse;
 
 
     // Trap uncodable vectors
@@ -332,6 +335,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
     {
         bestmv->row <<= 3;
         bestmv->col <<= 3;
+        *distortion = INT_MAX;
         return INT_MAX;
     }
 
@@ -341,51 +345,60 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
     startmv = *bestmv;
 
     // calculate central point error
-    bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, &sse);
+    bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, sse1);
+    *distortion = bestmse;
     bestmse += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
 
     // go left then right and check error
     this_mv.row = startmv.row;
     this_mv.col = ((startmv.col - 8) | 4);
-    left = vfp->svf_halfpix_h(y - 1, d->pre_stride, z, b->src_stride, &sse);
-    left += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    thismse = vfp->svf_halfpix_h(y - 1, d->pre_stride, z, b->src_stride, &sse);
+    left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (left < bestmse)
     {
         *bestmv = this_mv;
         bestmse = left;
+        *distortion = thismse;
+        *sse1 = sse;
     }
 
     this_mv.col += 8;
-    right = vfp->svf_halfpix_h(y, d->pre_stride, z, b->src_stride, &sse);
-    right += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    thismse = vfp->svf_halfpix_h(y, d->pre_stride, z, b->src_stride, &sse);
+    right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (right < bestmse)
     {
         *bestmv = this_mv;
         bestmse = right;
+        *distortion = thismse;
+        *sse1 = sse;
     }
 
     // go up then down and check error
     this_mv.col = startmv.col;
     this_mv.row = ((startmv.row - 8) | 4);
-    up = vfp->svf_halfpix_v(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
-    up += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    thismse =  vfp->svf_halfpix_v(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
+    up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (up < bestmse)
     {
         *bestmv = this_mv;
         bestmse = up;
+        *distortion = thismse;
+        *sse1 = sse;
     }
 
     this_mv.row += 8;
-    down = vfp->svf_halfpix_v(y, d->pre_stride, z, b->src_stride, &sse);
-    down += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    thismse = vfp->svf_halfpix_v(y, d->pre_stride, z, b->src_stride, &sse);
+    down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (down < bestmse)
     {
         *bestmv = this_mv;
         bestmse = down;
+        *distortion = thismse;
+        *sse1 = sse;
     }
 
 
@@ -400,32 +413,34 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
     case 0:
         this_mv.col = (this_mv.col - 8) | 4;
         this_mv.row = (this_mv.row - 8) | 4;
-        diag = vfp->svf_halfpix_hv(y - 1 - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
+        thismse = vfp->svf_halfpix_hv(y - 1 - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
         break;
     case 1:
         this_mv.col += 4;
         this_mv.row = (this_mv.row - 8) | 4;
-        diag = vfp->svf_halfpix_hv(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
+        thismse = vfp->svf_halfpix_hv(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
         break;
     case 2:
         this_mv.col = (this_mv.col - 8) | 4;
         this_mv.row += 4;
-        diag = vfp->svf_halfpix_hv(y - 1, d->pre_stride, z, b->src_stride, &sse);
+        thismse = vfp->svf_halfpix_hv(y - 1, d->pre_stride, z, b->src_stride, &sse);
         break;
     case 3:
     default:
         this_mv.col += 4;
         this_mv.row += 4;
-        diag = vfp->svf_halfpix_hv(y, d->pre_stride, z, b->src_stride, &sse);
+        thismse = vfp->svf_halfpix_hv(y, d->pre_stride, z, b->src_stride, &sse);
         break;
     }
 
-    diag += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (diag < bestmse)
     {
         *bestmv = this_mv;
         bestmse = diag;
+        *distortion = thismse;
+        *sse1 = sse;
     }
 
 //  }
@@ -448,30 +463,34 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
     if (startmv.col & 7)
     {
         this_mv.col = startmv.col - 2;
-        left = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+        thismse = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
     }
     else
     {
         this_mv.col = (startmv.col - 8) | 6;
-        left = vfp->svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);
+        thismse = vfp->svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);
     }
 
-    left += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (left < bestmse)
     {
         *bestmv = this_mv;
         bestmse = left;
+        *distortion = thismse;
+        *sse1 = sse;
     }
 
     this_mv.col += 4;
-    right = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
-    right += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    thismse = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+    right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (right < bestmse)
     {
         *bestmv = this_mv;
         bestmse = right;
+        *distortion = thismse;
+        *sse1 = sse;
     }
 
     // go up then down and check error
@@ -480,30 +499,34 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
     if (startmv.row & 7)
     {
         this_mv.row = startmv.row - 2;
-        up = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+        thismse = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
     }
     else
     {
         this_mv.row = (startmv.row - 8) | 6;
-        up = vfp->svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
+        thismse = vfp->svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
     }
 
-    up += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (up < bestmse)
     {
         *bestmv = this_mv;
         bestmse = up;
+        *distortion = thismse;
+        *sse1 = sse;
     }
 
     this_mv.row += 4;
-    down = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
-    down += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    thismse = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+    down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (down < bestmse)
     {
         *bestmv = this_mv;
         bestmse = down;
+        *distortion = thismse;
+        *sse1 = sse;
     }
 
 
@@ -525,12 +548,12 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
             if (startmv.col & 7)
             {
                 this_mv.col -= 2;
-                diag = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+                thismse = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
             }
             else
             {
                 this_mv.col = (startmv.col - 8) | 6;
-                diag = vfp->svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);;
+                thismse = vfp->svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);;
             }
         }
         else
@@ -540,12 +563,12 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
             if (startmv.col & 7)
             {
                 this_mv.col -= 2;
-                diag = vfp->svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
+                thismse = vfp->svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
             }
             else
             {
                 this_mv.col = (startmv.col - 8) | 6;
-                diag = vfp->svf(y - d->pre_stride - 1, d->pre_stride, 6, 6, z, b->src_stride, &sse);
+                thismse = vfp->svf(y - d->pre_stride - 1, d->pre_stride, 6, 6, z, b->src_stride, &sse);
             }
         }
 
@@ -556,12 +579,12 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
         if (startmv.row & 7)
         {
             this_mv.row -= 2;
-            diag = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+            thismse = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
         }
         else
         {
             this_mv.row = (startmv.row - 8) | 6;
-            diag = vfp->svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
+            thismse = vfp->svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
         }
 
         break;
@@ -571,36 +594,36 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
         if (startmv.col & 7)
         {
             this_mv.col -= 2;
-            diag = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+            thismse = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
         }
         else
         {
             this_mv.col = (startmv.col - 8) | 6;
-            diag = vfp->svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);;
+            thismse = vfp->svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);;
         }
 
         break;
     case 3:
         this_mv.col += 2;
         this_mv.row += 2;
-        diag = vfp->svf(y, d->pre_stride,  this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+        thismse = vfp->svf(y, d->pre_stride,  this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
         break;
     }
 
-    diag += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (diag < bestmse)
     {
         *bestmv = this_mv;
         bestmse = diag;
+        *distortion = thismse;
+        *sse1 = sse;
     }
 
-//  }
-
     return bestmse;
 }
 
-int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2])
+int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion, unsigned int *sse1)
 {
     int bestmse = INT_MAX;
     MV startmv;
@@ -610,12 +633,14 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm
     unsigned char *z = (*(b->base_src) + b->src);
     int left, right, up, down, diag;
     unsigned int sse;
+    int thismse;
 
     // Trap uncodable vectors
     if ((abs((bestmv->col << 3) - ref_mv->col) > MAX_FULL_PEL_VAL) || (abs((bestmv->row << 3) - ref_mv->row) > MAX_FULL_PEL_VAL))
     {
         bestmv->row <<= 3;
         bestmv->col <<= 3;
+        *distortion = INT_MAX;
         return INT_MAX;
     }
 
@@ -625,51 +650,60 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm
     startmv = *bestmv;
 
     // calculate central point error
-    bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, &sse);
+    bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, sse1);
+    *distortion = bestmse;
     bestmse += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
 
     // go left then right and check error
     this_mv.row = startmv.row;
     this_mv.col = ((startmv.col - 8) | 4);
-    left = vfp->svf_halfpix_h(y - 1, d->pre_stride, z, b->src_stride, &sse);
-    left += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    thismse = vfp->svf_halfpix_h(y - 1, d->pre_stride, z, b->src_stride, &sse);
+    left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (left < bestmse)
     {
         *bestmv = this_mv;
         bestmse = left;
+        *distortion = thismse;
+        *sse1 = sse;
     }
 
     this_mv.col += 8;
-    right = vfp->svf_halfpix_h(y, d->pre_stride, z, b->src_stride, &sse);
-    right += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    thismse = vfp->svf_halfpix_h(y, d->pre_stride, z, b->src_stride, &sse);
+    right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (right < bestmse)
     {
         *bestmv = this_mv;
         bestmse = right;
+        *distortion = thismse;
+        *sse1 = sse;
     }
 
     // go up then down and check error
     this_mv.col = startmv.col;
     this_mv.row = ((startmv.row - 8) | 4);
-    up = vfp->svf_halfpix_v(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
-    up += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    thismse = vfp->svf_halfpix_v(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
+    up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (up < bestmse)
     {
         *bestmv = this_mv;
         bestmse = up;
+        *distortion = thismse;
+        *sse1 = sse;
     }
 
     this_mv.row += 8;
-    down = vfp->svf_halfpix_v(y, d->pre_stride, z, b->src_stride, &sse);
-    down += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    thismse = vfp->svf_halfpix_v(y, d->pre_stride, z, b->src_stride, &sse);
+    down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (down < bestmse)
     {
         *bestmv = this_mv;
         bestmse = down;
+        *distortion = thismse;
+        *sse1 = sse;
     }
 
     // somewhat strangely not doing all the diagonals for half pel is slower than doing them.
@@ -713,44 +747,52 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm
 #else
     this_mv.col = (this_mv.col - 8) | 4;
     this_mv.row = (this_mv.row - 8) | 4;
-    diag = vfp->svf_halfpix_hv(y - 1 - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
-    diag += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    thismse = vfp->svf_halfpix_hv(y - 1 - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
+    diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (diag < bestmse)
     {
         *bestmv = this_mv;
         bestmse = diag;
+        *distortion = thismse;
+        *sse1 = sse;
     }
 
     this_mv.col += 8;
-    diag = vfp->svf_halfpix_hv(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
-    diag += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    thismse = vfp->svf_halfpix_hv(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
+    diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (diag < bestmse)
     {
         *bestmv = this_mv;
         bestmse = diag;
+        *distortion = thismse;
+        *sse1 = sse;
     }
 
     this_mv.col = (this_mv.col - 8) | 4;
     this_mv.row = startmv.row + 4;
-    diag = vfp->svf_halfpix_hv(y - 1, d->pre_stride, z, b->src_stride, &sse);
-    diag += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    thismse = vfp->svf_halfpix_hv(y - 1, d->pre_stride, z, b->src_stride, &sse);
+    diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (diag < bestmse)
     {
         *bestmv = this_mv;
         bestmse = diag;
+        *distortion = thismse;
+        *sse1 = sse;
     }
 
     this_mv.col += 8;
-    diag = vfp->svf_halfpix_hv(y, d->pre_stride, z, b->src_stride, &sse);
-    diag += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    thismse = vfp->svf_halfpix_hv(y, d->pre_stride, z, b->src_stride, &sse);
+    diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (diag < bestmse)
     {
         *bestmv = this_mv;
         bestmse = diag;
+        *distortion = thismse;
+        *sse1 = sse;
     }
 
 #endif
@@ -789,7 +831,9 @@ int vp8_hex_search
 )
 {
     MV hex[6] = { { -1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0} } ;
-    MV neighbors[8] = { { -1, -1}, {0, -1}, {1, -1}, { -1, 0}, {1, 0}, { -1, 1}, {0, 1}, {1, 1} } ;
+    //MV neighbors[8] = { { -1, -1}, {0, -1}, {1, -1}, { -1, 0}, {1, 0}, { -1, 1}, {0, 1}, {1, 1} } ;
+    MV neighbors[4] = {{0, -1}, { -1, 0}, {1, 0}, {0, 1}} ;
+
     int i, j;
     unsigned char *src = (*(b->base_src) + b->src);
     int src_stride = b->src_stride;
@@ -876,24 +920,31 @@ int vp8_hex_search
             break;
     }
 
-    // check 8 1 away neighbors
+    // check 4 1-away neighbors
 cal_neighbors:
-    tr = br;
-    tc = bc;
 
-    for (i = 0; i < 8; i++)
+    for (j = 0; j < 32; j++)
     {
-        int nr = tr + neighbors[i].row, nc = tc + neighbors[i].col;
+        tr = br;
+        tc = bc;
 
-        if (nc < x->mv_col_min) continue;
+        for (i = 0; i < 4; i++)
+        {
+            int nr = tr + neighbors[i].row, nc = tc + neighbors[i].col;
 
-        if (nc > x->mv_col_max) continue;
+            if (nc < x->mv_col_min) continue;
 
-        if (nr < x->mv_row_min) continue;
+            if (nc > x->mv_col_max) continue;
 
-        if (nr > x->mv_row_max) continue;
+            if (nr < x->mv_row_min) continue;
 
-        CHECK_BETTER(thiserr, nr, nc);
+            if (nr > x->mv_row_max) continue;
+
+            CHECK_BETTER(thiserr, nr, nc);
+        }
+
+        if (tr == br && tc == bc)
+            break;
     }
 
     best_mv->row = br;
@@ -1190,8 +1241,6 @@ int vp8_diamond_search_sadx4
     + mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
 }
 
-
-#if !(CONFIG_REALTIME_ONLY)
 int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], MV *center_mv)
 {
     unsigned char *what = (*(b->base_src) + b->src);
@@ -1571,7 +1620,6 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
     else
         return INT_MAX;
 }
-#endif /* !(CONFIG_REALTIME_ONLY) */
 
 #ifdef ENTROPY_STATS
 void print_mode_context(void)
diff --git a/vp8/encoder/mcomp.h b/vp8/encoder/mcomp.h
index 5efcec296..b14cbcbc8 100644
--- a/vp8/encoder/mcomp.h
+++ b/vp8/encoder/mcomp.h
@@ -49,7 +49,7 @@ extern int vp8_hex_search
 
 typedef int (fractional_mv_step_fp)
     (MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv,
-     int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2]);
+     int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion, unsigned int *sse);
 extern fractional_mv_step_fp vp8_find_best_sub_pixel_step_iteratively;
 extern fractional_mv_step_fp vp8_find_best_sub_pixel_step;
 extern fractional_mv_step_fp vp8_find_best_half_pixel_step;
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index a18447d51..1738e5699 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -56,7 +56,6 @@ extern void vp8_loop_filter_frame(VP8_COMMON *cm,    MACROBLOCKD *mbd,  int filt
 extern void vp8_loop_filter_frame_yonly(VP8_COMMON *cm,    MACROBLOCKD *mbd,  int filt_val, int sharpness_lvl);
 extern void vp8_dmachine_specific_config(VP8_COMP *cpi);
 extern void vp8_cmachine_specific_config(VP8_COMP *cpi);
-extern void vp8_calc_auto_iframe_target_size(VP8_COMP *cpi);
 extern void vp8_deblock_frame(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post, int filt_lvl, int low_var_thresh, int flag);
 extern void print_parms(VP8_CONFIG *ocf, char *filenam);
 extern unsigned int vp8_get_processor_freq();
@@ -71,7 +70,7 @@ extern void vp8_yv12_copy_src_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_
 int vp8_estimate_entropy_savings(VP8_COMP *cpi);
 int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd);
 
-extern void vp8_temporal_filter_prepare_c(VP8_COMP *cpi);
+extern void vp8_temporal_filter_prepare_c(VP8_COMP *cpi, int distance);
 
 static void set_default_lf_deltas(VP8_COMP *cpi);
 
@@ -96,7 +95,8 @@ extern double vp8_calc_ssimg
     YV12_BUFFER_CONFIG *dest,
     double *ssim_y,
     double *ssim_u,
-    double *ssim_v
+    double *ssim_v,
+    const vp8_variance_rtcd_vtable_t *rtcd
 );
 
 
@@ -287,16 +287,9 @@ static void dealloc_compressor_data(VP8_COMP *cpi)
     vp8_yv12_de_alloc_frame_buffer(&cpi->last_frame_uf);
     vp8_yv12_de_alloc_frame_buffer(&cpi->scaled_source);
 #if VP8_TEMPORAL_ALT_REF
-    vp8_yv12_de_alloc_frame_buffer(&cpi->alt_ref_buffer.source_buffer);
+    vp8_yv12_de_alloc_frame_buffer(&cpi->alt_ref_buffer);
 #endif
-    {
-        int i;
-
-        for (i = 0; i < MAX_LAG_BUFFERS; i++)
-            vp8_yv12_de_alloc_frame_buffer(&cpi->src_buffer[i].source_buffer);
-
-        cpi->source_buffer_count = 0;
-    }
+    vp8_lookahead_destroy(cpi->lookahead);
 
     vpx_free(cpi->tok);
     cpi->tok = 0;
@@ -1252,35 +1245,23 @@ void vp8_set_speed_features(VP8_COMP *cpi)
 }
 static void alloc_raw_frame_buffers(VP8_COMP *cpi)
 {
-    int i, buffers;
-    /* allocate source_buffer to be multiples of 16 */
     int width = (cpi->oxcf.Width + 15) & ~15;
+    int height = (cpi->oxcf.Height + 15) & ~15;
 
-    buffers = cpi->oxcf.lag_in_frames;
-
-    if (buffers > MAX_LAG_BUFFERS)
-        buffers = MAX_LAG_BUFFERS;
-
-    if (buffers < 1)
-        buffers = 1;
-
-    for (i = 0; i < buffers; i++)
-        if (vp8_yv12_alloc_frame_buffer(&cpi->src_buffer[i].source_buffer,
-                                        width, cpi->oxcf.Height,
-                                        16))
-            vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
-                               "Failed to allocate lag buffer");
+    cpi->lookahead = vp8_lookahead_init(cpi->oxcf.Width, cpi->oxcf.Height,
+                                        cpi->oxcf.lag_in_frames);
+    if(!cpi->lookahead)
+        vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                           "Failed to allocate lag buffers");
 
 #if VP8_TEMPORAL_ALT_REF
 
-    if (vp8_yv12_alloc_frame_buffer(&cpi->alt_ref_buffer.source_buffer,
-                                    width, cpi->oxcf.Height, 16))
+    if (vp8_yv12_alloc_frame_buffer(&cpi->alt_ref_buffer,
+                                    width, height, 16))
         vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                            "Failed to allocate altref buffer");
 
 #endif
-
-    cpi->source_buffer_count = 0;
 }
 
 static int vp8_alloc_partition_data(VP8_COMP *cpi)
@@ -1461,10 +1442,7 @@ static void init_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
 
     cpi->auto_gold = 1;
     cpi->auto_adjust_gold_quantizer = 1;
-    cpi->goldquantizer = 1;
     cpi->goldfreq = 7;
-    cpi->auto_adjust_key_quantizer = 1;
-    cpi->keyquantizer = 1;
 
     cm->version = oxcf->Version;
     vp8_setup_version(cm);
@@ -1478,10 +1456,6 @@ static void init_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
     cpi->avg_frame_qindex             = cpi->oxcf.worst_allowed_q;
 
     // Initialise the starting buffer levels
-    cpi->oxcf.starting_buffer_level =
-        rescale(cpi->oxcf.starting_buffer_level,
-                cpi->oxcf.target_bandwidth, 1000);
-
     cpi->buffer_level                 = cpi->oxcf.starting_buffer_level;
     cpi->bits_off_target              = cpi->oxcf.starting_buffer_level;
 
@@ -1542,7 +1516,6 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
 
         break;
 
-#if !(CONFIG_REALTIME_ONLY)
     case MODE_GOODQUALITY:
         cpi->pass = 0;
         cpi->compressor_speed = 1;
@@ -1583,7 +1556,6 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
         cpi->pass = 2;
         cpi->compressor_speed = 0;
         break;
-#endif
     }
 
     if (cpi->pass == 0)
@@ -1656,6 +1628,10 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
     // Convert target bandwidth from Kbit/s to Bit/s
     cpi->oxcf.target_bandwidth       *= 1000;
 
+    cpi->oxcf.starting_buffer_level =
+        rescale(cpi->oxcf.starting_buffer_level,
+                cpi->oxcf.target_bandwidth, 1000);
+
     // Set or reset optimal and maximum buffer levels.
     if (cpi->oxcf.optimal_buffer_level == 0)
         cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
@@ -1705,8 +1681,6 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
     // Only allow dropped frames in buffered mode
     cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode;
 
-    cm->filter_type          = (LOOPFILTERTYPE) cpi->filter_type;
-
     if (!cm->use_bilinear_mc_filter)
         cm->mcomp_filter_type = SIXTAP;
     else
@@ -1720,9 +1694,6 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
     cm->horiz_scale  = cpi->horiz_scale;
     cm->vert_scale   = cpi->vert_scale ;
 
-    // As per VP8
-    cpi->intra_frame_target = (4 * (cm->Width + cm->Height) / 15) * 1000;
-
     // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs)
     if (cpi->oxcf.Sharpness > 7)
         cpi->oxcf.Sharpness = 7;
@@ -1752,10 +1723,6 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
         vp8_alloc_compressor_data(cpi);
     }
 
-    // Clamp KF frame size to quarter of data rate
-    if (cpi->intra_frame_target > cpi->target_bandwidth >> 2)
-        cpi->intra_frame_target = cpi->target_bandwidth >> 2;
-
     if (cpi->oxcf.fixed_q >= 0)
     {
         cpi->last_q[0] = cpi->oxcf.fixed_q;
@@ -1774,7 +1741,7 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
         cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS;
 
     // YX Temp
-    cpi->last_alt_ref_sei    = -1;
+    cpi->alt_ref_source = NULL;
     cpi->is_src_frame_alt_ref = 0;
     cpi->is_next_src_alt_ref = 0;
 
@@ -1981,7 +1948,6 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
 
     cpi->frames_till_gf_update_due      = 0;
     cpi->key_frame_count              = 1;
-    cpi->tot_key_frame_bits            = 0;
 
     cpi->ni_av_qi                     = cpi->oxcf.worst_allowed_q;
     cpi->ni_tot_qi                    = 0;
@@ -2007,7 +1973,6 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
 
     for (i = 0; i < KEY_FRAME_CONTEXT; i++)
     {
-        cpi->prior_key_frame_size[i]     = cpi->intra_frame_target;
         cpi->prior_key_frame_distance[i] = (int)cpi->output_frame_rate;
     }
 
@@ -2117,15 +2082,11 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
     cpi->fn_ptr[BLOCK_4X4].sdx8f          = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x8);
     cpi->fn_ptr[BLOCK_4X4].sdx4df         = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x4d);
 
-#if !(CONFIG_REALTIME_ONLY)
     cpi->full_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, full_search);
-#endif
     cpi->diamond_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, diamond_search);
 
     cpi->ready_for_new_frame = 1;
 
-    cpi->source_encode_index = 0;
-
     // make sure frame 1 is okay
     cpi->error_bins[0] = cpi->common.MBs;
 
@@ -2173,7 +2134,8 @@ void vp8_remove_compressor(VP8_PTR *ptr)
         if (cpi->pass != 1)
         {
             FILE *f = fopen("opsnr.stt", "a");
-            double time_encoded = (cpi->source_end_time_stamp - cpi->first_time_stamp_ever) / 10000000.000;
+            double time_encoded = (cpi->last_end_time_stamp_seen
+                                   - cpi->first_time_stamp_ever) / 10000000.000;
             double total_encode_time = (cpi->time_receive_data + cpi->time_compress_data)   / 1000.000;
             double dr = (double)cpi->bytes * (double) 8 / (double)1000  / time_encoded;
 
@@ -2186,7 +2148,7 @@ void vp8_remove_compressor(VP8_PTR *ptr)
                 double total_ssim = 100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0);
 
                 fprintf(f, "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\tVPXSSIM\t  Time(us)\n");
-                fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f %8.0f\n",
+                fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f\n",
                         dr, cpi->total / cpi->count, total_psnr, cpi->totalp / cpi->count, total_psnr2, total_ssim,
                         total_encode_time);
             }
@@ -2628,37 +2590,13 @@ static void scale_and_extend_source(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
         vp8_scale_frame(sd, &cpi->scaled_source, cm->temp_scale_frame.y_buffer,
                         tmp_height, hs, hr, vs, vr, 0);
 
+        vp8_yv12_extend_frame_borders(&cpi->scaled_source);
         cpi->Source = &cpi->scaled_source;
 #endif
     }
-    // we may need to copy to a buffer so we can extend the image...
-    else if (cm->Width != cm->yv12_fb[cm->lst_fb_idx].y_width ||
-             cm->Height != cm->yv12_fb[cm->lst_fb_idx].y_height)
-    {
-        //vp8_yv12_copy_frame_ptr(sd, &cpi->scaled_source);
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-        if (cm->rtcd.flags & HAS_NEON)
-#endif
-        {
-            vp8_yv12_copy_src_frame_func_neon(sd, &cpi->scaled_source);
-        }
-#if CONFIG_RUNTIME_CPU_DETECT
-        else
-#endif
-#endif
-#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
-        {
-            vp8_yv12_copy_frame_ptr(sd, &cpi->scaled_source);
-        }
-#endif
-
-        cpi->Source = &cpi->scaled_source;
-    }
+}
 
-    vp8_extend_to_multiple_of16(cpi->Source, cm->Width, cm->Height);
 
-}
 static void resize_key_frame(VP8_COMP *cpi)
 {
 #if CONFIG_SPATIAL_RESAMPLING
@@ -2705,64 +2643,7 @@ static void resize_key_frame(VP8_COMP *cpi)
 
 #endif
 }
-// return of 0 means drop frame
-static int pick_frame_size(VP8_COMP *cpi)
-{
-    VP8_COMMON *cm = &cpi->common;
-
-    // First Frame is a special case
-    if (cm->current_video_frame == 0)
-    {
-#if !(CONFIG_REALTIME_ONLY)
 
-        if (cpi->pass == 2)
-            vp8_calc_auto_iframe_target_size(cpi);
-
-        // 1 Pass there is no information on which to base size so use bandwidth per second * fixed fraction
-        else
-#endif
-            cpi->this_frame_target = cpi->oxcf.target_bandwidth / 2;
-
-        // in error resilient mode the first frame is bigger since it likely contains
-        // all the static background
-        if (cpi->oxcf.error_resilient_mode == 1 || (cpi->compressor_speed == 2))
-        {
-            cpi->this_frame_target *= 3;      // 5;
-        }
-
-        // Key frame from VFW/auto-keyframe/first frame
-        cm->frame_type = KEY_FRAME;
-
-    }
-    // Special case for forced key frames
-    // The frame sizing here is still far from ideal for 2 pass.
-    else if (cm->frame_flags & FRAMEFLAGS_KEY)
-    {
-        cm->frame_type = KEY_FRAME;
-        resize_key_frame(cpi);
-        vp8_calc_iframe_target_size(cpi);
-    }
-    else if (cm->frame_type == KEY_FRAME)
-    {
-        vp8_calc_auto_iframe_target_size(cpi);
-    }
-    else
-    {
-        // INTER frame: compute target frame size
-        cm->frame_type = INTER_FRAME;
-        vp8_calc_pframe_target_size(cpi);
-
-        // Check if we're dropping the frame:
-        if (cpi->drop_frame)
-        {
-            cpi->drop_frame = FALSE;
-            cpi->drop_count++;
-            return 0;
-        }
-    }
-
-    return 1;
-}
 
 static void set_quantizer(VP8_COMP *cpi, int Q)
 {
@@ -3551,7 +3432,7 @@ static void encode_frame_to_data_rate
     }
 
     // Decide how big to make the frame
-    if (!pick_frame_size(cpi))
+    if (!vp8_pick_frame_size(cpi))
     {
         cm->current_video_frame++;
         cpi->frames_since_key++;
@@ -3571,7 +3452,6 @@ static void encode_frame_to_data_rate
         if (Adjustment)
         {
             int buff_lvl_step;
-            int tmp_lvl = cpi->buffer_level;
 
             if (cpi->buffer_level < cpi->oxcf.maximum_buffer_size)
             {
@@ -3880,7 +3760,10 @@ static void encode_frame_to_data_rate
         }
 
         if (cm->frame_type == KEY_FRAME)
+        {
+            resize_key_frame(cpi);
             vp8_setup_key_frame(cpi);
+        }
 
         // transform / motion compensation build reconstruction frame
         vp8_encode_frame(cpi);
@@ -3904,11 +3787,11 @@ static void encode_frame_to_data_rate
 #else
             if (decide_key_frame(cpi))
             {
-                vp8_calc_auto_iframe_target_size(cpi);
-
                 // Reset all our sizing numbers and recode
                 cm->frame_type = KEY_FRAME;
 
+                vp8_pick_frame_size(cpi);
+
                 // Clear the Alt reference frame active flag when we have a key frame
                 cpi->source_alt_ref_active = FALSE;
 
@@ -3937,7 +3820,6 @@ static void encode_frame_to_data_rate
                 loop_count++;
                 Loop = TRUE;
 
-                resize_key_frame(cpi);
                 continue;
             }
 #endif
@@ -4419,9 +4301,9 @@ static void encode_frame_to_data_rate
         vp8_clear_system_state();  //__asm emms;
 
         if (cpi->total_coded_error_left != 0.0)
-            fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %6ld %6ld"
-                       "%6ld %6ld %6ld %5ld %5ld %5ld %8ld %8.2f %10d %10.3f"
-                       "%10.3f %8ld\n",
+            fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %6d %6d"
+                       "%6d %6d %6d %5d %5d %5d %8d %8.2f %10d %10.3f"
+                       "%10.3f %8d\n",
                        cpi->common.current_video_frame, cpi->this_frame_target,
                        cpi->projected_frame_size,
                        (cpi->projected_frame_size - cpi->this_frame_target),
@@ -4438,9 +4320,9 @@ static void encode_frame_to_data_rate
                        (double)cpi->bits_left / cpi->total_coded_error_left,
                        cpi->tot_recode_hits);
         else
-            fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %6ld %6ld"
-                       "%6ld %6ld %6ld %5ld %5ld %5ld %8ld %8.2f %10d %10.3f"
-                       "%8ld\n",
+            fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %6d %6d"
+                       "%6d %6d %6d %5d %5d %5d %8d %8.2f %10d %10.3f"
+                       "%8d\n",
                        cpi->common.current_video_frame,
                        cpi->this_frame_target, cpi->projected_frame_size,
                        (cpi->projected_frame_size - cpi->this_frame_target),
@@ -4672,17 +4554,17 @@ static void Pass2Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest,
 extern void vp8_push_neon(INT64 *store);
 extern void vp8_pop_neon(INT64 *store);
 #endif
+
+
 int vp8_receive_raw_frame(VP8_PTR ptr, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, INT64 time_stamp, INT64 end_time)
 {
 #if HAVE_ARMV7
     INT64 store_reg[8];
 #endif
-    VP8_COMP *cpi = (VP8_COMP *) ptr;
-    VP8_COMMON *cm = &cpi->common;
+    VP8_COMP              *cpi = (VP8_COMP *) ptr;
+    VP8_COMMON            *cm = &cpi->common;
     struct vpx_usec_timer  timer;
-
-    if (!cpi)
-        return -1;
+    int                    res = 0;
 
 #if HAVE_ARMV7
 #if CONFIG_RUNTIME_CPU_DETECT
@@ -4694,75 +4576,10 @@ int vp8_receive_raw_frame(VP8_PTR ptr, unsigned int frame_flags, YV12_BUFFER_CON
 #endif
 
     vpx_usec_timer_start(&timer);
-
-    // no more room for frames;
-    if (cpi->source_buffer_count != 0 && cpi->source_buffer_count >= cpi->oxcf.lag_in_frames)
-    {
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-        if (cm->rtcd.flags & HAS_NEON)
-#endif
-        {
-            vp8_pop_neon(store_reg);
-        }
-#endif
-        return -1;
-    }
-
-    //printf("in-cpi->source_buffer_count: %d\n", cpi->source_buffer_count);
-
+    if(vp8_lookahead_push(cpi->lookahead, sd, time_stamp, end_time,
+                          frame_flags))
+        res = -1;
     cm->clr_type = sd->clrtype;
-
-    // make a copy of the frame for use later...
-#if !(CONFIG_REALTIME_ONLY)
-
-    if (cpi->oxcf.allow_lag)
-    {
-        int which_buffer =  cpi->source_encode_index - 1;
-        SOURCE_SAMPLE *s;
-
-        if (which_buffer == -1)
-            which_buffer = cpi->oxcf.lag_in_frames - 1;
-
-        if (cpi->source_buffer_count < cpi->oxcf.lag_in_frames - 1)
-            which_buffer = cpi->source_buffer_count;
-
-        s = &cpi->src_buffer[which_buffer];
-
-        s->source_time_stamp = time_stamp;
-        s->source_end_time_stamp = end_time;
-        s->source_frame_flags = frame_flags;
-        vp8_yv12_copy_frame_ptr(sd, &s->source_buffer);
-
-        cpi->source_buffer_count ++;
-    }
-    else
-#endif
-    {
-        SOURCE_SAMPLE *s;
-        s = &cpi->src_buffer[0];
-        s->source_end_time_stamp = end_time;
-        s->source_time_stamp = time_stamp;
-        s->source_frame_flags = frame_flags;
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-        if (cm->rtcd.flags & HAS_NEON)
-#endif
-        {
-            vp8_yv12_copy_src_frame_func_neon(sd, &s->source_buffer);
-        }
-#if CONFIG_RUNTIME_CPU_DETECT
-        else
-#endif
-#endif
-#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
-        {
-            vp8_yv12_copy_frame_ptr(sd, &s->source_buffer);
-        }
-#endif
-        cpi->source_buffer_count = 1;
-    }
-
     vpx_usec_timer_mark(&timer);
     cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
 
@@ -4775,8 +4592,10 @@ int vp8_receive_raw_frame(VP8_PTR ptr, unsigned int frame_flags, YV12_BUFFER_CON
     }
 #endif
 
-    return 0;
+    return res;
 }
+
+
 int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, INT64 *time_stamp, INT64 *time_end, int flush)
 {
 #if HAVE_ARMV7
@@ -4787,6 +4606,7 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
     struct vpx_usec_timer  tsctimer;
     struct vpx_usec_timer  ticktimer;
     struct vpx_usec_timer  cmptimer;
+    YV12_BUFFER_CONFIG    *force_src_buffer = NULL;
 
     if (!cpi)
         return -1;
@@ -4802,95 +4622,24 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
 
     vpx_usec_timer_start(&cmptimer);
 
-
-    // flush variable tells us that even though we have less than 10 frames
-    // in our buffer we need to start producing compressed frames.
-    // Probably because we are at the end of a file....
-    if ((cpi->source_buffer_count == cpi->oxcf.lag_in_frames && cpi->oxcf.lag_in_frames > 0)
-        || (!cpi->oxcf.allow_lag && cpi->source_buffer_count > 0)
-        || (flush && cpi->source_buffer_count > 0))
-    {
-
-        SOURCE_SAMPLE *s;
-
-        s = &cpi->src_buffer[cpi->source_encode_index];
-        cpi->source_time_stamp = s->source_time_stamp;
-        cpi->source_end_time_stamp = s->source_end_time_stamp;
+    cpi->source = NULL;
 
 #if !(CONFIG_REALTIME_ONLY)
-
-        // Should we code an alternate reference frame
-        if (cpi->oxcf.error_resilient_mode == 0 &&
-            cpi->oxcf.play_alternate &&
-            cpi->source_alt_ref_pending  &&
-            (cpi->frames_till_gf_update_due < cpi->source_buffer_count) &&
-            cpi->oxcf.lag_in_frames != 0)
+    // Should we code an alternate reference frame
+    if (cpi->oxcf.error_resilient_mode == 0 &&
+        cpi->oxcf.play_alternate &&
+        cpi->source_alt_ref_pending)
+    {
+        if ((cpi->source = vp8_lookahead_peek(cpi->lookahead,
+                                              cpi->frames_till_gf_update_due)))
         {
-            cpi->last_alt_ref_sei = (cpi->source_encode_index + cpi->frames_till_gf_update_due) % cpi->oxcf.lag_in_frames;
-
-#if VP8_TEMPORAL_ALT_REF
-
+            cpi->alt_ref_source = cpi->source;
             if (cpi->oxcf.arnr_max_frames > 0)
             {
-#if 0
-                // my attempt at a loop that tests the results of strength filter.
-                int start_frame = cpi->last_alt_ref_sei - 3;
-
-                int i, besti = -1, pastin = cpi->oxcf.arnr_strength;
-
-                int besterr;
-
-                if (start_frame < 0)
-                    start_frame += cpi->oxcf.lag_in_frames;
-
-                besterr = calc_low_ss_err(&cpi->src_buffer[cpi->last_alt_ref_sei].source_buffer,
-                                              &cpi->src_buffer[start_frame].source_buffer, IF_RTCD(&cpi->rtcd.variance));
-
-                for (i = 0; i < 7; i++)
-                {
-                    int thiserr;
-                    cpi->oxcf.arnr_strength = i;
-                    vp8_temporal_filter_prepare_c(cpi);
-
-                    thiserr = calc_low_ss_err(&cpi->alt_ref_buffer.source_buffer,
-                                                  &cpi->src_buffer[start_frame].source_buffer, IF_RTCD(&cpi->rtcd.variance));
-
-                    if (10 * thiserr < besterr * 8)
-                    {
-                        besterr = thiserr;
-                        besti = i;
-                    }
-                }
-
-                if (besti != -1)
-                {
-                    cpi->oxcf.arnr_strength = besti;
-                    vp8_temporal_filter_prepare_c(cpi);
-                    s = &cpi->alt_ref_buffer;
-
-                    // FWG not sure if I need to copy this data for the Alt Ref frame
-                    s->source_time_stamp = cpi->src_buffer[cpi->last_alt_ref_sei].source_time_stamp;
-                    s->source_end_time_stamp = cpi->src_buffer[cpi->last_alt_ref_sei].source_end_time_stamp;
-                    s->source_frame_flags = cpi->src_buffer[cpi->last_alt_ref_sei].source_frame_flags;
-                }
-                else
-                    s = &cpi->src_buffer[cpi->last_alt_ref_sei];
-
-#else
-                vp8_temporal_filter_prepare_c(cpi);
-                s = &cpi->alt_ref_buffer;
-
-                // FWG not sure if I need to copy this data for the Alt Ref frame
-                s->source_time_stamp = cpi->src_buffer[cpi->last_alt_ref_sei].source_time_stamp;
-                s->source_end_time_stamp = cpi->src_buffer[cpi->last_alt_ref_sei].source_end_time_stamp;
-                s->source_frame_flags = cpi->src_buffer[cpi->last_alt_ref_sei].source_frame_flags;
-
-#endif
+                vp8_temporal_filter_prepare_c(cpi,
+                                              cpi->frames_till_gf_update_due);
+                force_src_buffer = &cpi->alt_ref_buffer;
             }
-            else
-#endif
-                s = &cpi->src_buffer[cpi->last_alt_ref_sei];
-
             cm->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due;
             cm->refresh_alt_ref_frame = 1;
             cm->refresh_golden_frame = 0;
@@ -4900,40 +4649,33 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
             cpi->is_src_frame_alt_ref = 0;
             cpi->is_next_src_alt_ref = 0;
         }
-        else
+    }
 #endif
+
+    if (!cpi->source)
+    {
+        if ((cpi->source = vp8_lookahead_pop(cpi->lookahead, flush)))
         {
             cm->show_frame = 1;
-#if !(CONFIG_REALTIME_ONLY)
 
-            if (cpi->oxcf.allow_lag)
-            {
-                if (cpi->source_encode_index ==  cpi->last_alt_ref_sei)
-                {
-                    cpi->is_src_frame_alt_ref = 1;
-                    cpi->last_alt_ref_sei    = -1;
-                }
-                else
-                    cpi->is_src_frame_alt_ref = 0;
-
-                cpi->source_encode_index = (cpi->source_encode_index + 1) % cpi->oxcf.lag_in_frames;
+            cpi->is_src_frame_alt_ref = cpi->alt_ref_source
+                                        && (cpi->source == cpi->alt_ref_source);
 
-                if(cpi->source_encode_index == cpi->last_alt_ref_sei)
-                    cpi->is_next_src_alt_ref = 1;
-                else
-                    cpi->is_next_src_alt_ref = 0;
-            }
-
-#endif
-            cpi->source_buffer_count--;
+            cpi->is_next_src_alt_ref = cpi->alt_ref_source
+                                        && (vp8_lookahead_peek(cpi->lookahead, 0)
+                                            == cpi->alt_ref_source);
+            if(cpi->is_src_frame_alt_ref)
+                cpi->alt_ref_source = NULL;
         }
+    }
 
-        cpi->un_scaled_source = &s->source_buffer;
-        cpi->Source = &s->source_buffer;
-        cpi->source_frame_flags = s->source_frame_flags;
-
-        *time_stamp = cpi->source_time_stamp;
-        *time_end = cpi->source_end_time_stamp;
+    if (cpi->source)
+    {
+        cpi->un_scaled_source =
+        cpi->Source = force_src_buffer ? force_src_buffer : &cpi->source->img;
+        *time_stamp = cpi->source->ts_start;
+        *time_end = cpi->source->ts_end;
+        *frame_flags = cpi->source->flags;
     }
     else
     {
@@ -4959,26 +4701,24 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
         return -1;
     }
 
-    *frame_flags = cpi->source_frame_flags;
-
-    if (cpi->source_time_stamp < cpi->first_time_stamp_ever)
+    if (cpi->source->ts_start < cpi->first_time_stamp_ever)
     {
-        cpi->first_time_stamp_ever = cpi->source_time_stamp;
-        cpi->last_end_time_stamp_seen = cpi->source_time_stamp;
+        cpi->first_time_stamp_ever = cpi->source->ts_start;
+        cpi->last_end_time_stamp_seen = cpi->source->ts_start;
     }
 
     // adjust frame rates based on timestamps given
     if (!cm->refresh_alt_ref_frame)
     {
-        if (cpi->source_time_stamp == cpi->first_time_stamp_ever)
+        if (cpi->source->ts_start == cpi->first_time_stamp_ever)
         {
-            double this_fps = 10000000.000 / (cpi->source_end_time_stamp - cpi->source_time_stamp);
+            double this_fps = 10000000.000 / (cpi->source->ts_end - cpi->source->ts_start);
 
             vp8_new_frame_rate(cpi, this_fps);
         }
         else
         {
-            long long nanosecs = cpi->source_end_time_stamp
+            long long nanosecs = cpi->source->ts_end
                 - cpi->last_end_time_stamp_seen;
 
             if (nanosecs > 0)
@@ -4989,8 +4729,8 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
 
         }
 
-        cpi->last_time_stamp_seen = cpi->source_time_stamp;
-        cpi->last_end_time_stamp_seen = cpi->source_end_time_stamp;
+        cpi->last_time_stamp_seen = cpi->source->ts_start;
+        cpi->last_end_time_stamp_seen = cpi->source->ts_end;
     }
 
     if (cpi->compressor_speed == 2)
@@ -5111,7 +4851,6 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
 
             if (cpi->b_calculate_psnr)
             {
-                double y, u, v;
                 double ye,ue,ve;
                 double frame_psnr;
                 YV12_BUFFER_CONFIG      *orig = cpi->Source;
@@ -5144,7 +4883,7 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
                 cpi->total_sq_error += sq_error;
                 cpi->total  += frame_psnr;
                 {
-                    double y2, u2, v2, frame_psnr2, frame_ssim2 = 0;
+                    double frame_psnr2, frame_ssim2 = 0;
                     double weight = 0;
 
                     vp8_deblock(cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0, IF_RTCD(&cm->rtcd.postproc));
@@ -5185,7 +4924,8 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
             if (cpi->b_calculate_ssimg)
             {
                 double y, u, v, frame_all;
-                frame_all =  vp8_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u, &v);
+                frame_all =  vp8_calc_ssimg(cpi->Source, cm->frame_to_show,
+                    &y, &u, &v, IF_RTCD(&cpi->rtcd.variance));
                 cpi->total_ssimg_y += y;
                 cpi->total_ssimg_u += u;
                 cpi->total_ssimg_v += v;
@@ -5376,35 +5116,6 @@ int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const
 }
 
 
-static int calc_low_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd)
-{
-    int i, j;
-    int Total = 0;
-
-    unsigned char *src = source->y_buffer;
-    unsigned char *dst = dest->y_buffer;
-    (void)rtcd;
-
-    // Loop through the Y plane raw and reconstruction data summing (square differences)
-    for (i = 0; i < source->y_height; i += 16)
-    {
-        for (j = 0; j < source->y_width; j += 16)
-        {
-            unsigned int sse;
-            VARIANCE_INVOKE(rtcd, mse16x16)(src + j, source->y_stride, dst + j, dest->y_stride, &sse);
-
-            if (sse < 8096)
-                Total += sse;
-        }
-
-        src += 16 * source->y_stride;
-        dst += 16 * dest->y_stride;
-    }
-
-    return Total;
-}
-
-
 int vp8_get_quantizer(VP8_PTR c)
 {
     VP8_COMP   *cpi = (VP8_COMP *) c;
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 0e53f6803..e2e6b367c 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -29,6 +29,7 @@
 #include "mcomp.h"
 #include "temporal_filter.h"
 #include "vp8/common/findnearmv.h"
+#include "lookahead.h"
 
 //#define SPEEDSTATS 1
 #define MIN_GF_INTERVAL             4
@@ -217,14 +218,6 @@ typedef struct
     void *ptr1;
 } LPFTHREAD_DATA;
 
-typedef struct
-{
-    INT64  source_time_stamp;
-    INT64  source_end_time_stamp;
-
-    DECLARE_ALIGNED(16, YV12_BUFFER_CONFIG, source_buffer);
-    unsigned int source_frame_flags;
-} SOURCE_SAMPLE;
 
 typedef struct VP8_ENCODER_RTCD
 {
@@ -251,17 +244,17 @@ typedef struct
 {
 
     DECLARE_ALIGNED(16, short, Y1quant[QINDEX_RANGE][16]);
-    DECLARE_ALIGNED(16, short, Y1quant_shift[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, unsigned char, Y1quant_shift[QINDEX_RANGE][16]);
     DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][16]);
     DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][16]);
 
     DECLARE_ALIGNED(16, short, Y2quant[QINDEX_RANGE][16]);
-    DECLARE_ALIGNED(16, short, Y2quant_shift[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, unsigned char, Y2quant_shift[QINDEX_RANGE][16]);
     DECLARE_ALIGNED(16, short, Y2zbin[QINDEX_RANGE][16]);
     DECLARE_ALIGNED(16, short, Y2round[QINDEX_RANGE][16]);
 
     DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][16]);
-    DECLARE_ALIGNED(16, short, UVquant_shift[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, unsigned char, UVquant_shift[QINDEX_RANGE][16]);
     DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][16]);
     DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][16]);
 
@@ -280,19 +273,17 @@ typedef struct
 
     VP8_CONFIG oxcf;
 
+    struct lookahead_ctx    *lookahead;
+    struct lookahead_entry  *source;
+    struct lookahead_entry  *alt_ref_source;
+
     YV12_BUFFER_CONFIG *Source;
     YV12_BUFFER_CONFIG *un_scaled_source;
-    INT64 source_time_stamp;
-    INT64 source_end_time_stamp;
-    unsigned int source_frame_flags;
     YV12_BUFFER_CONFIG scaled_source;
 
-    int source_buffer_count;    // number of src_buffers in use for lagged encoding
-    int source_encode_index;    // index of buffer in src_buffer to encode
     int source_alt_ref_pending; // frame in src_buffers has been identified to be encoded as an alt ref
     int source_alt_ref_active;  // an alt ref frame has been encoded and is usable
 
-    int last_alt_ref_sei;       // index into src_buffers of frame used as alt reference
     int is_src_frame_alt_ref;   // source of frame to encode is an exact copy of an alt ref frame
     int is_next_src_alt_ref;    // source of next frame to encode is an exact copy of an alt ref frame
 
@@ -301,8 +292,6 @@ typedef struct
     int gold_is_alt;  // don't do both alt and gold search ( just do gold).
 
     //int refresh_alt_ref_frame;
-    SOURCE_SAMPLE src_buffer[MAX_LAG_BUFFERS];
-
     YV12_BUFFER_CONFIG last_frame_uf;
 
     TOKENEXTRA *tok;
@@ -396,14 +385,11 @@ typedef struct
     int active_arnr_frames;           // <= cpi->oxcf.arnr_max_frames
 
     INT64 key_frame_count;
-    INT64 tot_key_frame_bits;
-    int prior_key_frame_size[KEY_FRAME_CONTEXT];
     int prior_key_frame_distance[KEY_FRAME_CONTEXT];
     int per_frame_bandwidth;          // Current section per frame bandwidth target
     int av_per_frame_bandwidth;        // Average frame size target for clip
     int min_frame_bandwidth;          // Minimum allocation that should be used for any frame
     int last_key_frame_size;
-    int intra_frame_target;
     int inter_frame_target;
     double output_frame_rate;
     long long last_time_stamp_seen;
@@ -515,12 +501,8 @@ typedef struct
     int interquantizer;
     int auto_gold;
     int auto_adjust_gold_quantizer;
-    int goldquantizer;
     int goldfreq;
-    int auto_adjust_key_quantizer;
-    int keyquantizer;
     int auto_worst_q;
-    int filter_type;
     int cpu_used;
     int chroma_boost;
     int horiz_scale;
@@ -594,7 +576,6 @@ typedef struct
     // multithread data
     int * mt_current_mb_col;
     int mt_sync_range;
-    int processor_core_count;
     int b_multi_threaded;
     int encoding_thread_count;
 
@@ -638,7 +619,7 @@ typedef struct
     VP8_ENCODER_RTCD            rtcd;
 #endif
 #if VP8_TEMPORAL_ALT_REF
-    SOURCE_SAMPLE alt_ref_buffer;
+    YV12_BUFFER_CONFIG alt_ref_buffer;
     YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
     int fixed_divide[512];
 #endif
diff --git a/vp8/encoder/parms.cpp b/vp8/encoder/parms.cpp
deleted file mode 100644
index 2a39b2ca3..000000000
--- a/vp8/encoder/parms.cpp
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#if 0
-
-#include <map>
-#include <string>
-#include <fstream>
-extern "C"
-{
-    #include "vp8/common/onyx.h"
-}
-
-
-using namespace std;
-
-typedef map<string,int> Parms;
-
-#define ALLPARMS(O,DOTHIS) \
-    DOTHIS(O,  interquantizer             )\
-    DOTHIS(O,  auto_gold                   )\
-    DOTHIS(O,  auto_adjust_gold_quantizer    )\
-    DOTHIS(O,  goldquantizer              )\
-    DOTHIS(O,  goldfreq                   )\
-    DOTHIS(O,  auto_key                    )\
-    DOTHIS(O,  auto_adjust_key_quantizer     )\
-    DOTHIS(O,  keyquantizer               )\
-    DOTHIS(O,  keyfreq                    )\
-    DOTHIS(O,  pass                       )\
-    DOTHIS(O,  fixed_q                     )\
-    DOTHIS(O,  target_bandwidth            )\
-    DOTHIS(O,  auto_worst_q                 )\
-    DOTHIS(O,  worst_quality               )\
-    DOTHIS(O,  best_allowed_q               )\
-    DOTHIS(O,  end_usage                   )\
-    DOTHIS(O,  starting_buffer_level        )\
-    DOTHIS(O,  optimal_buffer_level         )\
-    DOTHIS(O,  maximum_buffer_size          )\
-    DOTHIS(O,  under_shoot_pct              )\
-    DOTHIS(O,  allow_df                    )\
-    DOTHIS(O,  drop_frames_water_mark        )\
-    DOTHIS(O,  max_allowed_datarate         )\
-    DOTHIS(O,  two_pass_vbrbias             )\
-    DOTHIS(O,  two_pass_vbrmin_section       )\
-    DOTHIS(O,  two_pass_vbrmax_section       )\
-    DOTHIS(O,  filter_type                 )\
-    DOTHIS(O,  compressor_speed            )\
-    DOTHIS(O,  mbpitch_feature             )\
-    DOTHIS(O,  allow_spatial_resampling     )\
-    DOTHIS(O,  resample_down_water_mark      )\
-    DOTHIS(O,  resample_up_water_mark        )\
-    DOTHIS(O,  noise_sensitivity           )\
-    DOTHIS(O,  horiz_scale                 )\
-    DOTHIS(O,  vert_scale                  )
-
-
-#define GET(O,V) O->V = x[#V];
-#define PUT(O,V) x[#V] = O->V;
-
-
-extern "C" void get_parms(VP8_CONFIG *ocf,char *filename)
-{
-
-    Parms x;
-    int value;
-    string variable;
-    string equal;
-
-    ifstream config_file(filename);
-
-    ALLPARMS(ocf, PUT);
-
-    // store all the parms in a map (really simple parsing)
-    while(!config_file.eof() && config_file.is_open())
-    {
-        config_file >> variable;
-        config_file >> equal;
-
-        if(equal != "=")
-            continue;
-
-        config_file >> value;
-
-        x[variable] = value;
-    }
-
-    ALLPARMS(ocf, GET);
-
-}
-
-#define PRINT(O,V) debug_file<<#V <<" = " << O->V <<"\n";
-extern "C" void print_parms(VP8_CONFIG *ocf,char *filename)
-{
-    ofstream debug_file(filename,ios_base::app);
-    ALLPARMS(ocf, PRINT);
-    debug_file << "=============================================="<<"\n";
-}
-
-#endif
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 0759e2d5b..ea4f01fad 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -50,7 +50,7 @@ extern int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]);
 extern void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, MV *mv);
 
 
-int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2])
+int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion, unsigned int *sse)
 {
     (void) b;
     (void) d;
@@ -58,6 +58,8 @@ int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv,
     (void) error_per_bit;
     (void) vfp;
     (void) mvcost;
+    (void) distortion;
+    (void) sse;
     bestmv->row <<= 3;
     bestmv->col <<= 3;
     return 0;
@@ -192,9 +194,10 @@ static int pick_intra4x4block(
         int this_rd;
 
         rate = mode_costs[mode];
-        vp8_predict_intra4x4(b, mode, b->predictor);
+        RECON_INVOKE(&rtcd->common->recon, intra4x4_predict)
+                     (b, mode, b->predictor);
         distortion = get_prediction_error(be, b, &rtcd->variance);
-        this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate, distortion);
+        this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
 
         if (this_rd < best_rd)
         {
@@ -212,7 +215,13 @@ static int pick_intra4x4block(
 }
 
 
-int vp8_pick_intra4x4mby_modes(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb, int *Rate, int *best_dist)
+int vp8_pick_intra4x4mby_modes
+(
+    const VP8_ENCODER_RTCD *rtcd,
+    MACROBLOCK *mb,
+    int *Rate,
+    int *best_dist
+)
 {
     MACROBLOCKD *const xd = &mb->e_mbd;
     int i;
@@ -239,20 +248,18 @@ int vp8_pick_intra4x4mby_modes(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb, int
 
         mic->bmi[i].mode = xd->block[i].bmi.mode = best_mode;
 
-        // Break out case where we have already exceeded best so far value that was bassed in
+        // Break out case where we have already exceeded best so far value
+        // that was passed in
         if (distortion > *best_dist)
             break;
     }
 
-    for (i = 0; i < 16; i++)
-        xd->block[i].bmi.mv.as_int = 0;
-
     *Rate = cost;
 
     if (i == 16)
     {
         *best_dist = distortion;
-        error = RD_ESTIMATE(mb->rdmult, mb->rddiv, cost, distortion);
+        error = RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
     }
     else
     {
@@ -260,6 +267,9 @@ int vp8_pick_intra4x4mby_modes(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb, int
         error = INT_MAX;
     }
 
+    for (i = 0; i < 16; i++)
+        xd->block[i].bmi.mv.as_int = 0;
+
     return error;
 }
 
@@ -435,7 +445,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int re
     int bestsme;
     //int all_rds[MAX_MODES];         // Experimental debug code.
     int best_mode_index = 0;
-    int sse = INT_MAX;
+    unsigned int sse = INT_MAX;
 
     MV mvp;
     int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
@@ -452,6 +462,8 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int re
 
     int skip_mode[4] = {0, 0, 0, 0};
 
+    int have_subp_search = cpi->sf.half_pixel_search;  /* In real-time mode, when Speed >= 15, no sub-pixel search. */
+
     vpx_memset(mode_mv, 0, sizeof(mode_mv));
     vpx_memset(nearest_mv, 0, sizeof(nearest_mv));
     vpx_memset(near_mv, 0, sizeof(near_mv));
@@ -632,10 +644,10 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int re
         switch (this_mode)
         {
         case B_PRED:
-            distortion2 = *returndistortion;                    // Best so far passed in as breakout value to vp8_pick_intra4x4mby_modes
-            vp8_pick_intra4x4mby_modes(IF_RTCD(&cpi->rtcd), x, &rate, &distortion2);
-            rate2 += rate;
-            distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, 0x7fffffff);
+            // Pass best so far to vp8_pick_intra4x4mby_modes to use as breakout
+            distortion2 = *returndistortion;
+            vp8_pick_intra4x4mby_modes(IF_RTCD(&cpi->rtcd), x,
+                                         &rate, &distortion2);
 
             if (distortion2 == INT_MAX)
             {
@@ -643,7 +655,12 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int re
             }
             else
             {
-                this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2);
+                rate2 += rate;
+                distortion2 = VARIANCE_INVOKE
+                                (&cpi->rtcd.variance, get16x16prederror)(
+                                    x->src.y_buffer, x->src.y_stride,
+                                    x->e_mbd.predictor, 16, 0x7fffffff);
+                this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
 
                 if (this_rd < best_intra_rd)
                 {
@@ -667,7 +684,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int re
                 (&x->e_mbd);
             distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, 0x7fffffff);
             rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];
-            this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2);
+            this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
 
             if (this_rd < best_intra_rd)
             {
@@ -781,7 +798,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int re
             }
 
             if (bestsme < INT_MAX)
-                cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], cpi->mb.mvcost);
+                cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], cpi->mb.mvcost, &distortion2, &sse);
 
             mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
             mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
@@ -811,9 +828,10 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int re
             x->e_mbd.block[0].bmi.mode = this_mode;
             x->e_mbd.block[0].bmi.mv.as_int = x->e_mbd.mode_info_context->mbmi.mv.as_int;
 
-            distortion2 = get_inter_mbpred_error(x, &cpi->fn_ptr[BLOCK_16X16], (unsigned int *)(&sse));
+            if((this_mode != NEWMV) || !(have_subp_search))
+                distortion2 = get_inter_mbpred_error(x, &cpi->fn_ptr[BLOCK_16X16], &sse);
 
-            this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2);
+            this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
 
             if (cpi->active_map_enabled && x->active_ptr[0] == 0)
             {
@@ -921,7 +939,6 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int re
         best_mbmode.uv_mode = 0;
         best_mbmode.mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0;
         best_mbmode.partitioning = 0;
-        best_mbmode.dc_diff = 0;
 
         vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
         vpx_memcpy(x->partition_info, &best_partition, sizeof(PARTITION_INFO));
@@ -932,6 +949,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int re
         }
 
         x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
+        return;
     }
 
 
diff --git a/vp8/encoder/pickinter.h b/vp8/encoder/pickinter.h
index 8fea98397..f96fc5376 100644
--- a/vp8/encoder/pickinter.h
+++ b/vp8/encoder/pickinter.h
@@ -14,7 +14,6 @@
 #include "vpx_ports/config.h"
 #include "vp8/common/onyxc_int.h"
 
-#define RD_ESTIMATE(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )
 extern int vp8_pick_intra4x4mby_modes(const VP8_ENCODER_RTCD *, MACROBLOCK *mb, int *Rate, int *Distortion);
 extern void vp8_pick_intra_mbuv_mode(MACROBLOCK *mb);
 extern void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra);
diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c
index 803e3a51d..86ed267fb 100644
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@@ -27,7 +27,7 @@ void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
     short *zbin_ptr        = b->zbin;
     short *round_ptr       = b->round;
     short *quant_ptr       = b->quant_fast;
-    short *quant_shift_ptr = b->quant_shift;
+    unsigned char *quant_shift_ptr = b->quant_shift;
     short *qcoeff_ptr      = d->qcoeff;
     short *dqcoeff_ptr     = d->dqcoeff;
     short *dequant_ptr     = d->dequant;
@@ -112,7 +112,7 @@ void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d)
     short *zbin_ptr        = b->zbin;
     short *round_ptr       = b->round;
     short *quant_ptr       = b->quant;
-    short *quant_shift_ptr = b->quant_shift;
+    unsigned char *quant_shift_ptr = b->quant_shift;
     short *qcoeff_ptr      = d->qcoeff;
     short *dqcoeff_ptr     = d->dqcoeff;
     short *dequant_ptr     = d->dequant;
@@ -166,7 +166,7 @@ void vp8_strict_quantize_b(BLOCK *b, BLOCKD *d)
     int sz;
     short *coeff_ptr;
     short *quant_ptr;
-    short *quant_shift_ptr;
+    unsigned char *quant_shift_ptr;
     short *qcoeff_ptr;
     short *dqcoeff_ptr;
     short *dequant_ptr;
diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
index 9821d2990..d87591cb9 100644
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -329,62 +329,96 @@ void vp8_setup_key_frame(VP8_COMP *cpi)
     cpi->common.refresh_alt_ref_frame = TRUE;
 }
 
-void vp8_calc_auto_iframe_target_size(VP8_COMP *cpi)
+
+static int estimate_bits_at_q(int frame_kind, int Q, int MBs,
+                              double correction_factor)
+{
+    int Bpm = (int)(.5 + correction_factor * vp8_bits_per_mb[frame_kind][Q]);
+
+    /* Attempt to retain reasonable accuracy without overflow. The cutoff is
+     * chosen such that the maximum product of Bpm and MBs fits 31 bits. The
+     * largest Bpm takes 20 bits.
+     */
+    if (MBs > (1 << 11))
+        return (Bpm >> BPER_MB_NORMBITS) * MBs;
+    else
+        return (Bpm * MBs) >> BPER_MB_NORMBITS;
+}
+
+
+static void calc_iframe_target_size(VP8_COMP *cpi)
 {
     // boost defaults to half second
     int kf_boost;
+    int target;
 
     // Clear down mmx registers to allow floating point in what follows
     vp8_clear_system_state();  //__asm emms;
 
     if (cpi->oxcf.fixed_q >= 0)
     {
-        vp8_calc_iframe_target_size(cpi);
-        return;
-    }
+        int Q = cpi->oxcf.key_q;
 
-    if (cpi->pass == 2)
+        target = estimate_bits_at_q(INTRA_FRAME, Q, cpi->common.MBs,
+                                    cpi->key_frame_rate_correction_factor);
+    }
+    else if (cpi->pass == 2)
+    {
+        // New Two pass RC
+        target = cpi->per_frame_bandwidth;
+    }
+    // First Frame is a special case
+    else if (cpi->common.current_video_frame == 0)
     {
-        cpi->this_frame_target = cpi->per_frame_bandwidth;      // New Two pass RC
+        /* 1 Pass there is no information on which to base size so use
+         * bandwidth per second * fraction of the initial buffer
+         * level
+         */
+        target = cpi->oxcf.starting_buffer_level / 2;
+
+        if(target > cpi->oxcf.target_bandwidth * 3 / 2)
+            target = cpi->oxcf.target_bandwidth * 3 / 2;
     }
     else
     {
+        // if this keyframe was forced, use a more recent Q estimate
+        int Q = (cpi->common.frame_flags & FRAMEFLAGS_KEY)
+                ? cpi->avg_frame_qindex : cpi->ni_av_qi;
+
         // Boost depends somewhat on frame rate
         kf_boost = (int)(2 * cpi->output_frame_rate - 16);
 
         // adjustment up based on q
-        kf_boost = kf_boost * kf_boost_qadjustment[cpi->ni_av_qi] / 100;
+        kf_boost = kf_boost * kf_boost_qadjustment[Q] / 100;
 
         // frame separation adjustment ( down)
         if (cpi->frames_since_key  < cpi->output_frame_rate / 2)
-            kf_boost = (int)(kf_boost * cpi->frames_since_key / (cpi->output_frame_rate / 2));
+            kf_boost = (int)(kf_boost
+                       * cpi->frames_since_key / (cpi->output_frame_rate / 2));
 
         if (kf_boost < 16)
             kf_boost = 16;
 
-        // Reset the active worst quality to the baseline value for key frames.
-        cpi->active_worst_quality = cpi->worst_quality;
-
-        cpi->this_frame_target = ((16 + kf_boost)  * cpi->per_frame_bandwidth) >> 4;
+        target = ((16 + kf_boost) * cpi->per_frame_bandwidth) >> 4;
     }
 
 
-    // Should the next frame be an altref frame
-    if (cpi->pass != 2)
+    if (cpi->oxcf.rc_max_intra_bitrate_pct)
     {
-        // For now Alt ref is not allowed except in 2 pass modes.
-        cpi->source_alt_ref_pending = FALSE;
+        unsigned int max_rate = cpi->per_frame_bandwidth
+                                * cpi->oxcf.rc_max_intra_bitrate_pct / 100;
 
-        /*if ( cpi->oxcf.fixed_q == -1)
-        {
-            if ( cpi->oxcf.play_alternate && ( (cpi->last_boost/2) > (100+(AF_THRESH*cpi->frames_till_gf_update_due)) ) )
-                cpi->source_alt_ref_pending = TRUE;
-            else
-                cpi->source_alt_ref_pending = FALSE;
-        }*/
+        if (target > max_rate)
+            target = max_rate;
     }
 
-    if (0)
+    cpi->this_frame_target = target;
+
+    // TODO: if we separate rate targeting from Q targetting, move this.
+    // Reset the active worst quality to the baseline value for key frames.
+    cpi->active_worst_quality = cpi->worst_quality;
+
+#if 0
     {
         FILE *f;
 
@@ -397,8 +431,10 @@ void vp8_calc_auto_iframe_target_size(VP8_COMP *cpi)
 
         fclose(f);
     }
+#endif
 }
 
+
 //  Do the best we can to define the parameteres for the next GF based on what information we have available.
 static void calc_gf_params(VP8_COMP *cpi)
 {
@@ -564,106 +600,13 @@ static void calc_gf_params(VP8_COMP *cpi)
         }*/
     }
 }
-/* This is equvialent to estimate_bits_at_q without the rate_correction_factor. */
-static int baseline_bits_at_q(int frame_kind, int Q, int MBs)
-{
-    int Bpm = vp8_bits_per_mb[frame_kind][Q];
 
-    /* Attempt to retain reasonable accuracy without overflow. The cutoff is
-     * chosen such that the maximum product of Bpm and MBs fits 31 bits. The
-     * largest Bpm takes 20 bits.
-     */
-    if (MBs > (1 << 11))
-        return (Bpm >> BPER_MB_NORMBITS) * MBs;
-    else
-        return (Bpm * MBs) >> BPER_MB_NORMBITS;
-}
 
-void vp8_calc_iframe_target_size(VP8_COMP *cpi)
-{
-    int Q;
-    int Boost = 100;
-
-    Q = (cpi->oxcf.fixed_q >= 0) ? cpi->oxcf.fixed_q : cpi->avg_frame_qindex;
-
-    if (cpi->auto_adjust_key_quantizer == 1)
-    {
-        // If (auto_adjust_key_quantizer==1) then a lower Q is selected for key-frames.
-        // The enhanced Q is calculated so as to boost the key frame size by a factor
-        // specified in kf_boost_qadjustment. Also, can adjust based on distance
-        // between key frames.
-
-        // Adjust boost based upon ambient Q
-        Boost = kf_boost_qadjustment[Q];
-
-        // Make the Key frame boost less if the seperation from the previous key frame is small
-        if (cpi->frames_since_key < 16)
-            Boost = Boost * kf_boost_seperation_adjustment[cpi->frames_since_key] / 100;
-        else
-            Boost = Boost * kf_boost_seperation_adjustment[15] / 100;
-
-        // Apply limits on boost
-        if (Boost > kf_gf_boost_qlimits[Q])
-            Boost = kf_gf_boost_qlimits[Q];
-        else if (Boost < 120)
-            Boost = 120;
-    }
-
-    // Keep a record of the boost that was used
-    cpi->last_boost = Boost;
-
-    // Should the next frame be an altref frame
-    if (cpi->pass != 2)
-    {
-        // For now Alt ref is not allowed except in 2 pass modes.
-        cpi->source_alt_ref_pending = FALSE;
-
-        /*if ( cpi->oxcf.fixed_q == -1)
-        {
-            if ( cpi->oxcf.play_alternate && ( (cpi->last_boost/2) > (100+(AF_THRESH*cpi->frames_till_gf_update_due)) ) )
-                cpi->source_alt_ref_pending = TRUE;
-            else
-                cpi->source_alt_ref_pending = FALSE;
-        }*/
-    }
-
-    if (cpi->oxcf.fixed_q >= 0)
-    {
-        cpi->this_frame_target = (baseline_bits_at_q(0, Q, cpi->common.MBs) * Boost) / 100;
-    }
-    else
-    {
-
-        int bits_per_mb_at_this_q ;
-
-        if (cpi->oxcf.error_resilient_mode == 1)
-        {
-            cpi->this_frame_target = 2 * cpi->av_per_frame_bandwidth;
-            return;
-        }
-
-        // Rate targetted scenario:
-        // Be careful of 32-bit OVERFLOW if restructuring the caluclation of cpi->this_frame_target
-        bits_per_mb_at_this_q = (int)(.5 +
-                                      cpi->key_frame_rate_correction_factor * vp8_bits_per_mb[0][Q]);
-
-        cpi->this_frame_target = (((bits_per_mb_at_this_q * cpi->common.MBs) >> BPER_MB_NORMBITS) * Boost) / 100;
-
-        // Reset the active worst quality to the baseline value for key frames.
-        if (cpi->pass < 2)
-            cpi->active_worst_quality = cpi->worst_quality;
-    }
-}
-
-
-
-void vp8_calc_pframe_target_size(VP8_COMP *cpi)
+static void calc_pframe_target_size(VP8_COMP *cpi)
 {
     int min_frame_target;
     int Adjustment;
 
-    // Set the min frame bandwidth.
-    //min_frame_target = estimate_min_frame_size( cpi );
     min_frame_target = 0;
 
     if (cpi->pass == 2)
@@ -817,11 +760,6 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi)
         }
     }
 
-    // Set a reduced data rate target for our initial Q calculation.
-    // This should help to save bits during earier sections.
-    if ((cpi->oxcf.under_shoot_pct > 0) && (cpi->oxcf.under_shoot_pct <= 100))
-        cpi->this_frame_target = (cpi->this_frame_target * cpi->oxcf.under_shoot_pct) / 100;
-
     // Sanity check that the total sum of adjustments is not above the maximum allowed
     // That is that having allowed for KF and GF penalties we have not pushed the
     // current interframe target to low. If the adjustment we apply here is not capable of recovering
@@ -858,11 +796,6 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi)
                     percent_low =
                         (cpi->oxcf.optimal_buffer_level - cpi->buffer_level) /
                         one_percent_bits;
-
-                    if (percent_low > 100)
-                        percent_low = 100;
-                    else if (percent_low < 0)
-                        percent_low = 0;
                 }
                 // Are we overshooting the long term clip data rate...
                 else if (cpi->bits_off_target < 0)
@@ -870,16 +803,16 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi)
                     // Adjust per frame data target downwards to compensate.
                     percent_low = (int)(100 * -cpi->bits_off_target /
                                        (cpi->total_byte_count * 8));
-
-                    if (percent_low > 100)
-                        percent_low = 100;
-                    else if (percent_low < 0)
-                        percent_low = 0;
                 }
 
+                if (percent_low > cpi->oxcf.under_shoot_pct)
+                    percent_low = cpi->oxcf.under_shoot_pct;
+                else if (percent_low < 0)
+                    percent_low = 0;
+
                 // lower the target bandwidth for this frame.
-                cpi->this_frame_target =
-                    (cpi->this_frame_target * (100 - (percent_low / 2))) / 100;
+                cpi->this_frame_target -= (cpi->this_frame_target * percent_low)
+                                          / 200;
 
                 // Are we using allowing control of active_worst_allowed_q
                 // according to buffer level.
@@ -950,20 +883,29 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi)
             }
             else
             {
-                int percent_high;
+                int percent_high = 0;
 
-                if (cpi->bits_off_target > cpi->oxcf.optimal_buffer_level)
+                if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+                     && (cpi->buffer_level > cpi->oxcf.optimal_buffer_level))
+                {
+                    percent_high = (cpi->buffer_level
+                                    - cpi->oxcf.optimal_buffer_level)
+                                   / one_percent_bits;
+                }
+                else if (cpi->bits_off_target > cpi->oxcf.optimal_buffer_level)
                 {
-                    percent_high = (int)(100 * (cpi->bits_off_target - cpi->oxcf.optimal_buffer_level) / (cpi->total_byte_count * 8));
+                    percent_high = (int)((100 * cpi->bits_off_target)
+                                         / (cpi->total_byte_count * 8));
+                }
 
-                    if (percent_high > 100)
-                        percent_high = 100;
-                    else if (percent_high < 0)
-                        percent_high = 0;
+                if (percent_high > cpi->oxcf.over_shoot_pct)
+                    percent_high = cpi->oxcf.over_shoot_pct;
+                else if (percent_high < 0)
+                    percent_high = 0;
 
-                    cpi->this_frame_target = (cpi->this_frame_target * (100 + (percent_high / 2))) / 100;
+                cpi->this_frame_target += (cpi->this_frame_target *
+                                           percent_high) / 200;
 
-                }
 
                 // Are we allowing control of active_worst_allowed_q according to bufferl level.
                 if (cpi->auto_worst_q)
@@ -1152,7 +1094,9 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi)
                     }
                 }
                 else
-                    cpi->this_frame_target = (baseline_bits_at_q(1, Q, cpi->common.MBs) * cpi->last_boost) / 100;
+                    cpi->this_frame_target =
+                        (estimate_bits_at_q(1, Q, cpi->common.MBs, 1.0)
+                         * cpi->last_boost) / 100;
 
             }
             // If there is an active ARF at this location use the minimum
@@ -1274,21 +1218,6 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var)
     }
 }
 
-static int estimate_bits_at_q(VP8_COMP *cpi, int Q)
-{
-    int Bpm = (int)(.5 + cpi->rate_correction_factor * vp8_bits_per_mb[INTER_FRAME][Q]);
-
-    /* Attempt to retain reasonable accuracy without overflow. The cutoff is
-     * chosen such that the maximum product of Bpm and MBs fits 31 bits. The
-     * largest Bpm takes 20 bits.
-     */
-    if (cpi->common.MBs > (1 << 11))
-        return (Bpm >> BPER_MB_NORMBITS) * cpi->common.MBs;
-    else
-        return (Bpm * cpi->common.MBs) >> BPER_MB_NORMBITS;
-
-}
-
 
 int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame)
 {
@@ -1419,119 +1348,85 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame)
     return Q;
 }
 
-static int estimate_min_frame_size(VP8_COMP *cpi)
-{
-    double correction_factor;
-    int bits_per_mb_at_max_q;
-
-    // This funtion returns a default value for the first few frames untill the correction factor has had time to adapt.
-    if (cpi->common.current_video_frame < 10)
-    {
-        if (cpi->pass == 2)
-            return (cpi->min_frame_bandwidth);
-        else
-            return cpi->per_frame_bandwidth / 3;
-    }
-
-    /*  // Select the appropriate correction factor based upon type of frame.
-        if ( cpi->common.frame_type == KEY_FRAME )
-            correction_factor = cpi->key_frame_rate_correction_factor;
-        else
-        {
-            if ( cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame )
-                correction_factor = cpi->gf_rate_correction_factor;
-            else
-                correction_factor = cpi->rate_correction_factor;
-        }*/
-
-    // We estimate at half the value we get from vp8_bits_per_mb
-    correction_factor = cpi->rate_correction_factor / 2.0;
 
-    bits_per_mb_at_max_q = (int)(.5 + correction_factor * vp8_bits_per_mb[cpi->common.frame_type][MAXQ]);
-
-    return (bits_per_mb_at_max_q * cpi->common.MBs) >> BPER_MB_NORMBITS;
-}
-
-void vp8_adjust_key_frame_context(VP8_COMP *cpi)
+static int estimate_keyframe_frequency(VP8_COMP *cpi)
 {
     int i;
-    int av_key_frames_per_second;
-
-    // Average key frame frequency and size
-    unsigned int total_weight = 0;
-    unsigned int av_key_frame_frequency = 0;
-    unsigned int av_key_frame_bits = 0;
 
-    unsigned int output_frame_rate = (unsigned int)(100 * cpi->output_frame_rate);
-    unsigned int target_bandwidth = (unsigned int)(100 * cpi->target_bandwidth);
+    // Average key frame frequency
+    int av_key_frame_frequency = 0;
 
-    // Clear down mmx registers to allow floating point in what follows
-    vp8_clear_system_state();  //__asm emms;
-
-    // Update the count of total key frame bits
-    cpi->tot_key_frame_bits += cpi->projected_frame_size;
-
-    // First key frame at start of sequence is a special case. We have no frequency data.
+    /* First key frame at start of sequence is a special case. We have no
+     * frequency data.
+     */
     if (cpi->key_frame_count == 1)
     {
-        av_key_frame_frequency = (int)cpi->output_frame_rate * 2;            // Assume a default of 1 kf every 2 seconds
-        av_key_frame_bits = cpi->projected_frame_size;
-        av_key_frames_per_second  = output_frame_rate / av_key_frame_frequency;  // Note output_frame_rate not cpi->output_frame_rate
+        /* Assume a default of 1 kf every 2 seconds, or the max kf interval,
+         * whichever is smaller.
+         */
+        int key_freq = cpi->oxcf.key_freq>0 ? cpi->oxcf.key_freq : 1;
+        av_key_frame_frequency = (int)cpi->output_frame_rate * 2;
+
+        if (cpi->oxcf.auto_key && av_key_frame_frequency > key_freq)
+            av_key_frame_frequency = cpi->oxcf.key_freq;
+
+        cpi->prior_key_frame_distance[KEY_FRAME_CONTEXT - 1]
+            = av_key_frame_frequency;
     }
     else
     {
+        unsigned int total_weight = 0;
         int last_kf_interval =
                 (cpi->frames_since_key > 0) ? cpi->frames_since_key : 1;
 
-        // reset keyframe context and calculate weighted average of last KEY_FRAME_CONTEXT keyframes
+        /* reset keyframe context and calculate weighted average of last
+         * KEY_FRAME_CONTEXT keyframes
+         */
         for (i = 0; i < KEY_FRAME_CONTEXT; i++)
         {
             if (i < KEY_FRAME_CONTEXT - 1)
-            {
-                cpi->prior_key_frame_size[i]     = cpi->prior_key_frame_size[i+1];
-                cpi->prior_key_frame_distance[i] = cpi->prior_key_frame_distance[i+1];
-            }
+                cpi->prior_key_frame_distance[i]
+                    = cpi->prior_key_frame_distance[i+1];
             else
-            {
-                cpi->prior_key_frame_size[i]     = cpi->projected_frame_size;
                 cpi->prior_key_frame_distance[i] = last_kf_interval;
-            }
 
-            av_key_frame_bits      += prior_key_frame_weight[i] * cpi->prior_key_frame_size[i];
-            av_key_frame_frequency += prior_key_frame_weight[i] * cpi->prior_key_frame_distance[i];
-            total_weight         += prior_key_frame_weight[i];
+            av_key_frame_frequency += prior_key_frame_weight[i]
+                                      * cpi->prior_key_frame_distance[i];
+            total_weight += prior_key_frame_weight[i];
         }
 
-        av_key_frame_bits       /= total_weight;
         av_key_frame_frequency  /= total_weight;
-        av_key_frames_per_second  = output_frame_rate / av_key_frame_frequency;
 
     }
+    return av_key_frame_frequency;
+}
+
+
+void vp8_adjust_key_frame_context(VP8_COMP *cpi)
+{
+    // Clear down mmx registers to allow floating point in what follows
+    vp8_clear_system_state();
 
     // Do we have any key frame overspend to recover?
-    if ((cpi->pass != 2) && (cpi->projected_frame_size > cpi->per_frame_bandwidth))
+    // Two-pass overspend handled elsewhere.
+    if ((cpi->pass != 2)
+         && (cpi->projected_frame_size > cpi->per_frame_bandwidth))
     {
-        // Update the count of key frame overspend to be recovered in subsequent frames
-        // A portion of the KF overspend is treated as gf overspend (and hence recovered more quickly)
-        // as the kf is also a gf. Otherwise the few frames following each kf tend to get more bits
-        // allocated than those following other gfs.
-        cpi->kf_overspend_bits += (cpi->projected_frame_size - cpi->per_frame_bandwidth) * 7 / 8;
-        cpi->gf_overspend_bits += (cpi->projected_frame_size - cpi->per_frame_bandwidth) * 1 / 8;
-        if(!av_key_frame_frequency)
-            av_key_frame_frequency = 60;
-
-        // Work out how much to try and recover per frame.
-        // For one pass we estimate the number of frames to spread it over based upon past history.
-        // For two pass we know how many frames there will be till the next kf.
-        if (cpi->pass == 2)
-        {
-            if (cpi->frames_to_key > 16)
-                cpi->kf_bitrate_adjustment = cpi->kf_overspend_bits / (int)cpi->frames_to_key;
-            else
-                cpi->kf_bitrate_adjustment = cpi->kf_overspend_bits / 16;
-        }
-        else
-            cpi->kf_bitrate_adjustment = cpi->kf_overspend_bits / (int)av_key_frame_frequency;
+        int overspend;
+
+        /* Update the count of key frame overspend to be recovered in
+         * subsequent frames. A portion of the KF overspend is treated as gf
+         * overspend (and hence recovered more quickly) as the kf is also a
+         * gf. Otherwise the few frames following each kf tend to get more
+         * bits allocated than those following other gfs.
+         */
+        overspend = (cpi->projected_frame_size - cpi->per_frame_bandwidth);
+        cpi->kf_overspend_bits += overspend * 7 / 8;
+        cpi->gf_overspend_bits += overspend * 1 / 8;
+
+        /* Work out how much to try and recover per frame. */
+        cpi->kf_bitrate_adjustment = cpi->kf_overspend_bits
+                                     / estimate_keyframe_frequency(cpi);
     }
 
     cpi->frames_since_key = 0;
@@ -1539,6 +1434,7 @@ void vp8_adjust_key_frame_context(VP8_COMP *cpi)
     cpi->key_frame_count++;
 }
 
+
 void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit, int *frame_over_shoot_limit)
 {
     // Set-up bounds on acceptable frame size:
@@ -1605,3 +1501,26 @@ void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit,
         }
     }
 }
+
+
+// return of 0 means drop frame
+int vp8_pick_frame_size(VP8_COMP *cpi)
+{
+    VP8_COMMON *cm = &cpi->common;
+
+    if (cm->frame_type == KEY_FRAME)
+        calc_iframe_target_size(cpi);
+    else
+    {
+        calc_pframe_target_size(cpi);
+
+        // Check if we're dropping the frame:
+        if (cpi->drop_frame)
+        {
+            cpi->drop_frame = FALSE;
+            cpi->drop_count++;
+            return 0;
+        }
+    }
+    return 1;
+}
diff --git a/vp8/encoder/ratectrl.h b/vp8/encoder/ratectrl.h
index 766dfdfce..d4f779677 100644
--- a/vp8/encoder/ratectrl.h
+++ b/vp8/encoder/ratectrl.h
@@ -17,11 +17,12 @@ extern void vp8_save_coding_context(VP8_COMP *cpi);
 extern void vp8_restore_coding_context(VP8_COMP *cpi);
 
 extern void vp8_setup_key_frame(VP8_COMP *cpi);
-extern void vp8_calc_iframe_target_size(VP8_COMP *cpi);
-extern void vp8_calc_pframe_target_size(VP8_COMP *cpi);
 extern void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var);
 extern int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame);
 extern void vp8_adjust_key_frame_context(VP8_COMP *cpi);
 extern void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit, int *frame_over_shoot_limit);
 
+// return of 0 means drop frame
+extern int vp8_pick_frame_size(VP8_COMP *cpi);
+
 #endif
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 863b6d419..2789cffbb 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -31,7 +31,7 @@
 #include "vp8/common/g_common.h"
 #include "variance.h"
 #include "mcomp.h"
-
+#include "rdopt.h"
 #include "vpx_mem/vpx_mem.h"
 #include "dct.h"
 #include "vp8/common/systemdependent.h"
@@ -46,13 +46,8 @@
 extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x);
 extern void vp8_update_zbin_extra(VP8_COMP *cpi, MACROBLOCK *x);
 
-
-#define RDCOST(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )
-
 #define MAXF(a,b)            (((a) > (b)) ? (a) : (b))
 
-
-
 static const int auto_speed_thresh[17] =
 {
     1000,
@@ -248,9 +243,6 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue)
 
     vp8_set_speed_features(cpi);
 
-    if (cpi->common.simpler_lpf)
-        cpi->common.filter_type = SIMPLE_LOOPFILTER;
-
     q = (int)pow(Qvalue, 1.25);
 
     if (q < 8)
@@ -480,7 +472,6 @@ int VP8_UVSSE(MACROBLOCK *x, const vp8_variance_rtcd_vtable_t *rtcd)
 
 }
 
-#if !(CONFIG_REALTIME_ONLY)
 static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, int type, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l)
 {
     int c = !type;              /* start at coef 0, unless Y with Y2 */
@@ -629,7 +620,8 @@ static int rd_pick_intra4x4block(
 
         rate = bmode_costs[mode];
 
-        vp8_predict_intra4x4(b, mode, b->predictor);
+        RECON_INVOKE(&cpi->rtcd.common->recon, intra4x4_predict)
+                     (b, mode, b->predictor);
         ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), subb)(be, b, 16);
         x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
         x->quantize_b(be, b);
@@ -818,7 +810,8 @@ void vp8_rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *r
         int this_rd;
 
         x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
-        vp8_build_intra_predictors_mbuv(&x->e_mbd);
+        RECON_INVOKE(&cpi->rtcd.common->recon, build_intra_predictors_mbuv)
+                     (&x->e_mbd);
         ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), submbuv)(x->src_diff,
                       x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor,
                       x->src.uv_stride);
@@ -847,7 +840,6 @@ void vp8_rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *r
 
     x->e_mbd.mode_info_context->mbmi.uv_mode = mode_selected;
 }
-#endif
 
 int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4])
 {
@@ -875,7 +867,6 @@ void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, MV *mv)
     }
 }
 
-#if !(CONFIG_REALTIME_ONLY)
 static int labels2mode(
     MACROBLOCK *x,
     int const *labelings, int which_label,
@@ -1213,12 +1204,15 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x,
 
                 if (bestsme < INT_MAX)
                 {
+                    int distortion;
+                    unsigned int sse;
+
                     if (!cpi->common.full_pixel)
                         cpi->find_fractional_mv_step(x, c, e, &mode_mv[NEW4X4],
-                                                     bsi->ref_mv, x->errorperbit / 2, v_fn_ptr, x->mvcost);
+                                                     bsi->ref_mv, x->errorperbit / 2, v_fn_ptr, x->mvcost, &distortion, &sse);
                     else
                         vp8_skip_fractional_mv_step(x, c, e, &mode_mv[NEW4X4],
-                                                    bsi->ref_mv, x->errorperbit, v_fn_ptr, x->mvcost);
+                                                    bsi->ref_mv, x->errorperbit, v_fn_ptr, x->mvcost, &distortion, &sse);
                 }
             } /* NEW4X4 */
 
@@ -1437,88 +1431,56 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x,
 
     return bsi.segment_rd;
 }
-#endif
 
-static void swap(int *x,int *y)
+static void insertsortmv(int arr[], int len)
 {
-   int tmp;
+    int i, j, k;
 
-   tmp = *x;
-   *x = *y;
-   *y = tmp;
-}
+    for ( i = 1 ; i <= len-1 ; i++ )
+    {
+        for ( j = 0 ; j < i ; j++ )
+        {
+            if ( arr[j] > arr[i] )
+            {
+                int temp;
 
-static void quicksortmv(int arr[],int left, int right)
-{
-   int lidx,ridx,pivot;
-
-   lidx = left;
-   ridx = right;
-
-   if( left < right)
-   {
-      pivot = (left + right)/2;
-
-      while(lidx <=pivot && ridx >=pivot)
-      {
-          while(arr[lidx] < arr[pivot] && lidx <= pivot)
-              lidx++;
-          while(arr[ridx] > arr[pivot] && ridx >= pivot)
-              ridx--;
-          swap(&arr[lidx], &arr[ridx]);
-          lidx++;
-          ridx--;
-          if(lidx-1 == pivot)
-          {
-              ridx++;
-              pivot = ridx;
-          }
-          else if(ridx+1 == pivot)
-          {
-              lidx--;
-              pivot = lidx;
-          }
-      }
-      quicksortmv(arr, left, pivot - 1);
-      quicksortmv(arr, pivot + 1, right);
-   }
+                temp = arr[i];
+
+                for ( k = i; k >j; k--)
+                    arr[k] = arr[k - 1] ;
+
+                arr[j] = temp ;
+            }
+        }
+    }
 }
 
-static void quicksortsad(int arr[],int idx[], int left, int right)
+static void insertsortsad(int arr[],int idx[], int len)
 {
-   int lidx,ridx,pivot;
-
-   lidx = left;
-   ridx = right;
-
-   if( left < right)
-   {
-      pivot = (left + right)/2;
-
-      while(lidx <=pivot && ridx >=pivot)
-      {
-          while(arr[lidx] < arr[pivot] && lidx <= pivot)
-              lidx++;
-          while(arr[ridx] > arr[pivot] && ridx >= pivot)
-              ridx--;
-          swap(&arr[lidx], &arr[ridx]);
-          swap(&idx[lidx], &idx[ridx]);
-          lidx++;
-          ridx--;
-          if(lidx-1 == pivot)
-          {
-              ridx++;
-              pivot = ridx;
-          }
-          else if(ridx+1 == pivot)
-          {
-              lidx--;
-              pivot = lidx;
-          }
-      }
-      quicksortsad(arr, idx, left, pivot - 1);
-      quicksortsad(arr, idx, pivot + 1, right);
-   }
+    int i, j, k;
+
+    for ( i = 1 ; i <= len-1 ; i++ )
+    {
+        for ( j = 0 ; j < i ; j++ )
+        {
+            if ( arr[j] > arr[i] )
+            {
+                int temp, tempi;
+
+                temp = arr[i];
+                tempi = idx[i];
+
+                for ( k = i; k >j; k--)
+                {
+                    arr[k] = arr[k - 1] ;
+                    idx[k] = idx[k - 1];
+                }
+
+                arr[j] = temp ;
+                idx[j] = tempi;
+            }
+        }
+    }
 }
 
 //The improved MV prediction
@@ -1654,8 +1616,8 @@ void vp8_mv_pred
                 mvy[i] = near_mvs[i].as_mv.col;
             }
 
-            quicksortmv (mvx, 0, vcnt-1);
-            quicksortmv (mvy, 0, vcnt-1);
+            insertsortmv(mvx, vcnt);
+            insertsortmv(mvy, vcnt);
             mv.as_mv.row = mvx[vcnt/2];
             mv.as_mv.col = mvy[vcnt/2];
 
@@ -1718,14 +1680,13 @@ void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffse
 
     if(cpi->common.last_frame_type != KEY_FRAME)
     {
-        quicksortsad(near_sad, near_sadidx, 0, 7);
+        insertsortsad(near_sad, near_sadidx, 8);
     }else
     {
-        quicksortsad(near_sad, near_sadidx, 0, 2);
+        insertsortsad(near_sad, near_sadidx, 3);
     }
 }
 
-#if !(CONFIG_REALTIME_ONLY)
 void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra)
 {
     BLOCK *b = &x->block[0];
@@ -1760,9 +1721,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
     //int intermodecost[MAX_MODES];
 
     MB_PREDICTION_MODE uv_intra_mode;
-
-    int force_no_skip = 0;
-
     MV mvp;
     int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
     int saddone=0;
@@ -1865,8 +1823,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
         int disable_skip = 0;
         int other_cost = 0;
 
-        force_no_skip = 0;
-
         // Experimental debug code.
         // Record of rd values recorded for this MB. -1 indicates not measured
         //all_rds[mode_index] = -1;
@@ -2198,8 +2154,11 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
                 x->mv_row_max = tmp_row_max;
 
                 if (bestsme < INT_MAX)
-                    // cpi->find_fractional_mv_step(x,b,d,&d->bmi.mv.as_mv,&best_ref_mv,x->errorperbit/2,cpi->fn_ptr.svf,cpi->fn_ptr.vf,x->mvcost);  // normal mvc=11
-                    cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit / 4, &cpi->fn_ptr[BLOCK_16X16], x->mvcost);
+                {
+                    int dis; /* TODO: use dis in distortion calculation later. */
+                    unsigned int sse;
+                    cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit / 4, &cpi->fn_ptr[BLOCK_16X16], x->mvcost, &dis, &sse);
+                }
 
                 mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
                 mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
@@ -2230,8 +2189,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
 
         case ZEROMV:
 
-        mv_selected:
-
             // Trap vectors that reach beyond the UMV borders
             // Note that ALL New MV, Nearest MV Near MV and Zero MV code drops through to this point
             // because of the lack of break statements in the previous two cases.
@@ -2240,14 +2197,15 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
                 continue;
 
             vp8_set_mbmode_and_mvs(x, this_mode, &mode_mv[this_mode]);
-            vp8_build_inter_predictors_mby(&x->e_mbd);
+            vp8_build_inter16x16_predictors_mby(&x->e_mbd);
 
             if (cpi->active_map_enabled && x->active_ptr[0] == 0) {
                 x->skip = 1;
             }
             else if (x->encode_breakout)
             {
-                int sum, sse;
+                int sum;
+                unsigned int sse;
                 int threshold = (xd->block[0].dequant[1]
                             * xd->block[0].dequant[1] >>4);
 
@@ -2256,7 +2214,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
 
                 VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16var)
                     (x->src.y_buffer, x->src.y_stride,
-                     x->e_mbd.predictor, 16, (unsigned int *)(&sse), &sum);
+                     x->e_mbd.predictor, 16, &sse, &sum);
 
                 if (sse < threshold)
                 {
@@ -2280,8 +2238,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
                             distortion_uv = sse2;
 
                             disable_skip = 1;
-                            this_rd = RDCOST(x->rdmult, x->rddiv, rate2,
-                                             distortion2);
+                            this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
 
                             break;
                         }
@@ -2376,7 +2333,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
         {
             // Note index of best mode so far
             best_mode_index = mode_index;
-            x->e_mbd.mode_info_context->mbmi.force_no_skip = force_no_skip;
 
             if (this_mode <= B_PRED)
             {
@@ -2473,7 +2429,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
         best_mbmode.uv_mode = 0;
         best_mbmode.mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0;
         best_mbmode.partitioning = 0;
-        best_mbmode.dc_diff = 0;
 
         vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
         vpx_memcpy(x->partition_info, &best_partition, sizeof(PARTITION_INFO));
@@ -2484,6 +2439,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
         }
 
         x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
+        return;
     }
 
 
@@ -2507,4 +2463,3 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
 
     x->e_mbd.mode_info_context->mbmi.mv.as_mv = x->e_mbd.block[15].bmi.mv.as_mv;
 }
-#endif
diff --git a/vp8/encoder/rdopt.h b/vp8/encoder/rdopt.h
index 1d1be11a4..1d5f9a3a8 100644
--- a/vp8/encoder/rdopt.h
+++ b/vp8/encoder/rdopt.h
@@ -11,6 +11,9 @@
 
 #ifndef __INC_RDOPT_H
 #define __INC_RDOPT_H
+
+#define RDCOST(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )
+
 extern void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue);
 extern int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *rate, int *rate_to, int *distortion, int best_rd);
 extern int vp8_rd_pick_intra16x16mby_mode(VP8_COMP *cpi, MACROBLOCK *x, int *returnrate, int *rate_to, int *returndistortion);
diff --git a/vp8/encoder/ssim.c b/vp8/encoder/ssim.c
index 64d67c6dd..8646b5fdb 100644
--- a/vp8/encoder/ssim.c
+++ b/vp8/encoder/ssim.c
@@ -18,223 +18,6 @@
 #else
 #define IF_RTCD(x)  NULL
 #endif
-// Google version of SSIM
-// SSIM
-#define KERNEL 3
-#define KERNEL_SIZE  (2 * KERNEL + 1)
-
-typedef unsigned char uint8;
-typedef unsigned int uint32;
-
-static const int K[KERNEL_SIZE] =
-{
-    1, 4, 11, 16, 11, 4, 1    // 16 * exp(-0.3 * i * i)
-};
-static const double ki_w = 1. / 2304.;  // 1 / sum(i:0..6, j..6) K[i]*K[j]
-double get_ssimg(const uint8 *org, const uint8 *rec,
-                 int xo, int yo, int W, int H,
-                 const int stride1, const int stride2
-                )
-{
-    // TODO(skal): use summed tables
-    int y, x;
-
-    const int ymin = (yo - KERNEL < 0) ? 0 : yo - KERNEL;
-    const int ymax = (yo + KERNEL > H - 1) ? H - 1 : yo + KERNEL;
-    const int xmin = (xo - KERNEL < 0) ? 0 : xo - KERNEL;
-    const int xmax = (xo + KERNEL > W - 1) ? W - 1 : xo + KERNEL;
-    // worst case of accumulation is a weight of 48 = 16 + 2 * (11 + 4 + 1)
-    // with a diff of 255, squares. That would a max error of 0x8ee0900,
-    // which fits into 32 bits integers.
-    uint32 w = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
-    org += ymin * stride1;
-    rec += ymin * stride2;
-
-    for (y = ymin; y <= ymax; ++y, org += stride1, rec += stride2)
-    {
-        const int Wy = K[KERNEL + y - yo];
-
-        for (x = xmin; x <= xmax; ++x)
-        {
-            const  int Wxy = Wy * K[KERNEL + x - xo];
-            // TODO(skal): inlined assembly
-            w   += Wxy;
-            xm  += Wxy * org[x];
-            ym  += Wxy * rec[x];
-            xxm += Wxy * org[x] * org[x];
-            xym += Wxy * org[x] * rec[x];
-            yym += Wxy * rec[x] * rec[x];
-        }
-    }
-
-    {
-        const double iw = 1. / w;
-        const double iwx = xm * iw;
-        const double iwy = ym * iw;
-        double sxx = xxm * iw - iwx * iwx;
-        double syy = yym * iw - iwy * iwy;
-
-        // small errors are possible, due to rounding. Clamp to zero.
-        if (sxx < 0.) sxx = 0.;
-
-        if (syy < 0.) syy = 0.;
-
-        {
-            const double sxsy = sqrt(sxx * syy);
-            const double sxy = xym * iw - iwx * iwy;
-            static const double C11 = (0.01 * 0.01) * (255 * 255);
-            static const double C22 = (0.03 * 0.03) * (255 * 255);
-            static const double C33 = (0.015 * 0.015) * (255 * 255);
-            const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11);
-            const double c = (2. * sxsy      + C22) / (sxx + syy + C22);
-
-            const double s = (sxy + C33) / (sxsy + C33);
-            return l * c * s;
-
-        }
-    }
-
-}
-
-double get_ssimfull_kernelg(const uint8 *org, const uint8 *rec,
-                            int xo, int yo, int W, int H,
-                            const int stride1, const int stride2)
-{
-    // TODO(skal): use summed tables
-    // worst case of accumulation is a weight of 48 = 16 + 2 * (11 + 4 + 1)
-    // with a diff of 255, squares. That would a max error of 0x8ee0900,
-    // which fits into 32 bits integers.
-    int y_, x_;
-    uint32 xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
-    org += (yo - KERNEL) * stride1;
-    org += (xo - KERNEL);
-    rec += (yo - KERNEL) * stride2;
-    rec += (xo - KERNEL);
-
-    for (y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride1, rec += stride2)
-    {
-        const int Wy = K[y_];
-
-        for (x_ = 0; x_ < KERNEL_SIZE; ++x_)
-        {
-            const int Wxy = Wy * K[x_];
-            // TODO(skal): inlined assembly
-            const int org_x = org[x_];
-            const int rec_x = rec[x_];
-            xm  += Wxy * org_x;
-            ym  += Wxy * rec_x;
-            xxm += Wxy * org_x * org_x;
-            xym += Wxy * org_x * rec_x;
-            yym += Wxy * rec_x * rec_x;
-        }
-    }
-
-    {
-        const double iw = ki_w;
-        const double iwx = xm * iw;
-        const double iwy = ym * iw;
-        double sxx = xxm * iw - iwx * iwx;
-        double syy = yym * iw - iwy * iwy;
-
-        // small errors are possible, due to rounding. Clamp to zero.
-        if (sxx < 0.) sxx = 0.;
-
-        if (syy < 0.) syy = 0.;
-
-        {
-            const double sxsy = sqrt(sxx * syy);
-            const double sxy = xym * iw - iwx * iwy;
-            static const double C11 = (0.01 * 0.01) * (255 * 255);
-            static const double C22 = (0.03 * 0.03) * (255 * 255);
-            static const double C33 = (0.015 * 0.015) * (255 * 255);
-            const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11);
-            const double c = (2. * sxsy      + C22) / (sxx + syy + C22);
-            const double s = (sxy + C33) / (sxsy + C33);
-            return l * c * s;
-        }
-    }
-}
-
-double calc_ssimg(const uint8 *org, const uint8 *rec,
-                  const int image_width, const int image_height,
-                  const int stride1, const int stride2
-                 )
-{
-    int j, i;
-    double SSIM = 0.;
-
-    for (j = 0; j < KERNEL; ++j)
-    {
-        for (i = 0; i < image_width; ++i)
-        {
-            SSIM += get_ssimg(org, rec, i, j, image_width, image_height, stride1, stride2);
-        }
-    }
-
-    for (j = KERNEL; j < image_height - KERNEL; ++j)
-    {
-        for (i = 0; i < KERNEL; ++i)
-        {
-            SSIM += get_ssimg(org, rec, i, j, image_width, image_height, stride1, stride2);
-        }
-
-        for (i = KERNEL; i < image_width - KERNEL; ++i)
-        {
-            SSIM += get_ssimfull_kernelg(org, rec, i, j,
-                                         image_width, image_height, stride1, stride2);
-        }
-
-        for (i = image_width - KERNEL; i < image_width; ++i)
-        {
-            SSIM += get_ssimg(org, rec, i, j, image_width, image_height, stride1, stride2);
-        }
-    }
-
-    for (j = image_height - KERNEL; j < image_height; ++j)
-    {
-        for (i = 0; i < image_width; ++i)
-        {
-            SSIM += get_ssimg(org, rec, i, j, image_width, image_height, stride1, stride2);
-        }
-    }
-
-    return SSIM;
-}
-
-
-double vp8_calc_ssimg
-(
-    YV12_BUFFER_CONFIG *source,
-    YV12_BUFFER_CONFIG *dest,
-    double *ssim_y,
-    double *ssim_u,
-    double *ssim_v
-)
-{
-    double ssim_all = 0;
-    int ysize  = source->y_width * source->y_height;
-    int uvsize = ysize / 4;
-
-    *ssim_y = calc_ssimg(source->y_buffer, dest->y_buffer,
-                         source->y_width, source->y_height,
-                         source->y_stride, dest->y_stride);
-
-
-    *ssim_u = calc_ssimg(source->u_buffer, dest->u_buffer,
-                         source->uv_width, source->uv_height,
-                         source->uv_stride, dest->uv_stride);
-
-
-    *ssim_v = calc_ssimg(source->v_buffer, dest->v_buffer,
-                         source->uv_width, source->uv_height,
-                         source->uv_stride, dest->uv_stride);
-
-    ssim_all = (*ssim_y + *ssim_u + *ssim_v) / (ysize + uvsize + uvsize);
-    *ssim_y /= ysize;
-    *ssim_u /= uvsize;
-    *ssim_v /= uvsize;
-    return ssim_all;
-}
 
 
 void ssim_parms_c
@@ -290,8 +73,8 @@ void ssim_parms_8x8_c
      }
 }
 
-const static long long c1 =  426148; // (256^2*(.01*255)^2
-const static long long c2 = 3835331; //(256^2*(.03*255)^2
+const static long long cc1 =  26634; // (64^2*(.01*255)^2
+const static long long cc2 = 239708; // (64^2*(.03*255)^2
 
 static double similarity
 (
@@ -303,10 +86,19 @@ static double similarity
     int count
 )
 {
-    long long ssim_n = (2*sum_s*sum_r+ c1)*(2*count*sum_sxr-2*sum_s*sum_r+c2);
+    long long ssim_n, ssim_d;
+    long long c1, c2;
 
-    long long ssim_d = (sum_s*sum_s +sum_r*sum_r+c1)*
-            (count*sum_sq_s-sum_s*sum_s + count*sum_sq_r-sum_r*sum_r +c2) ;
+    //scale the constants by number of pixels
+    c1 = (cc1*count*count)>>12;
+    c2 = (cc2*count*count)>>12;
+
+    ssim_n = (2*sum_s*sum_r+ c1)*((long long) 2*count*sum_sxr-
+          (long long) 2*sum_s*sum_r+c2);
+
+    ssim_d = (sum_s*sum_s +sum_r*sum_r+c1)*
+        ((long long)count*sum_sq_s-(long long)sum_s*sum_s +
+        (long long)count*sum_sq_r-(long long) sum_r*sum_r +c2) ;
 
     return ssim_n * 1.0 / ssim_d;
 }
@@ -332,23 +124,38 @@ long dssim(unsigned char *s,int sp, unsigned char *r,int rp,
            const vp8_variance_rtcd_vtable_t *rtcd)
 {
     unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
-    double ssim3;
-    long long ssim_n;
-    long long ssim_d;
+    long long ssim3;
+    long long ssim_n,ssim_n1,ssim_n2;
+    long long ssim_d,ssim_d1,ssim_d2;
+    long long ssim_t1,ssim_t2;
+    long long c1, c2;
+
+    // normalize by 256/64
+    c1 = cc1*16;
+    c2 = cc2*16;
 
     rtcd->ssimpf(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
-    ssim_n = (2*sum_s*sum_r+ c1)*(2*256*sum_sxr-2*sum_s*sum_r+c2);
+    ssim_n1 = (2*sum_s*sum_r+ c1);
 
-    ssim_d = (sum_s*sum_s +sum_r*sum_r+c1)*
-            (256*sum_sq_s-sum_s*sum_s + 256*sum_sq_r-sum_r*sum_r +c2) ;
+    ssim_n2 =((long long) 2*256*sum_sxr-(long long) 2*sum_s*sum_r+c2);
+
+    ssim_d1 =((long long)sum_s*sum_s +(long long)sum_r*sum_r+c1);
+
+    ssim_d2 = (256 * (long long) sum_sq_s-(long long) sum_s*sum_s +
+                    (long long) 256*sum_sq_r-(long long) sum_r*sum_r +c2) ;
 
-    ssim3 = 256 * (ssim_d-ssim_n) / ssim_d;
-    return (long)( 256*ssim3 * ssim3 );
+    ssim_t1 = 256 - 256 * ssim_n1 / ssim_d1;
+    ssim_t2 = 256 - 256 * ssim_n2 / ssim_d2;
+
+    ssim3 = 256 *ssim_t1 * ssim_t2;
+    if(ssim3 <0 )
+        ssim3=0;
+    return (long)( ssim3  );
 }
-// TODO: (jbb) this 8x8 window might be too big + we may want to pick pixels
-// such that the window regions overlap block boundaries to penalize blocking
-// artifacts.
 
+// We are using a 8x8 moving window with starting location of each 8x8 window
+// on the 4x4 pixel grid. Such arrangement allows the windows to overlap
+// block boundaries to penalize blocking artifacts.
 double vp8_ssim2
 (
     unsigned char *img1,
@@ -361,20 +168,21 @@ double vp8_ssim2
 )
 {
     int i,j;
-
+    int samples =0;
     double ssim_total=0;
 
-    // we can sample points as frequently as we like start with 1 per 8x8
-    for(i=0; i < height; i+=8, img1 += stride_img1*8, img2 += stride_img2*8)
+    // sample point start with each 4x4 location
+    for(i=0; i < height-8; i+=4, img1 += stride_img1*4, img2 += stride_img2*4)
     {
-        for(j=0; j < width; j+=8 )
+        for(j=0; j < width-8; j+=4 )
         {
-            ssim_total += ssim_8x8(img1, stride_img1, img2, stride_img2, rtcd);
+            double v = ssim_8x8(img1+j, stride_img1, img2+j, stride_img2, rtcd);
+            ssim_total += v;
+            samples++;
         }
     }
-    ssim_total /= (width/8 * height /8);
+    ssim_total /= samples;
     return ssim_total;
-
 }
 double vp8_calc_ssim
 (
@@ -406,3 +214,35 @@ double vp8_calc_ssim
 
     return ssimv;
 }
+
+double vp8_calc_ssimg
+(
+    YV12_BUFFER_CONFIG *source,
+    YV12_BUFFER_CONFIG *dest,
+    double *ssim_y,
+    double *ssim_u,
+    double *ssim_v,
+    const vp8_variance_rtcd_vtable_t *rtcd
+)
+{
+    double ssim_all = 0;
+    double a, b, c;
+
+    a = vp8_ssim2(source->y_buffer, dest->y_buffer,
+                 source->y_stride, dest->y_stride, source->y_width,
+                 source->y_height, rtcd);
+
+    b = vp8_ssim2(source->u_buffer, dest->u_buffer,
+                 source->uv_stride, dest->uv_stride, source->uv_width,
+                 source->uv_height, rtcd);
+
+    c = vp8_ssim2(source->v_buffer, dest->v_buffer,
+                 source->uv_stride, dest->uv_stride, source->uv_width,
+                 source->uv_height, rtcd);
+    *ssim_y = a;
+    *ssim_u = b;
+    *ssim_v = c;
+    ssim_all = (a * 4 + b + c) /6;
+
+    return ssim_all;
+}
+\ No newline at end of file
diff --git a/vp8/encoder/temporal_filter.c b/vp8/encoder/temporal_filter.c
index cec951897..b77195511 100644
--- a/vp8/encoder/temporal_filter.c
+++ b/vp8/encoder/temporal_filter.c
@@ -208,10 +208,12 @@ static int vp8_temporal_filter_find_matching_mb_c
     // Try sub-pixel MC?
     //if (bestsme > error_thresh && bestsme < INT_MAX)
     {
+        int distortion;
+        unsigned int sse;
         bestsme = cpi->find_fractional_mv_step(x, b, d,
                     &d->bmi.mv.as_mv, &best_ref_mv1,
                     x->errorperbit, &cpi->fn_ptr[BLOCK_16X16],
-                    mvcost);
+                    mvcost, &distortion, &sse);
     }
 #endif
 
@@ -357,8 +359,8 @@ static void vp8_temporal_filter_iterate_c
             }
 
             // Normalize filter output to produce AltRef frame
-            dst1 = cpi->alt_ref_buffer.source_buffer.y_buffer;
-            stride = cpi->alt_ref_buffer.source_buffer.y_stride;
+            dst1 = cpi->alt_ref_buffer.y_buffer;
+            stride = cpi->alt_ref_buffer.y_stride;
             byte = mb_y_offset;
             for (i = 0,k = 0; i < 16; i++)
             {
@@ -377,9 +379,9 @@ static void vp8_temporal_filter_iterate_c
                 byte += stride - 16;
             }
 
-            dst1 = cpi->alt_ref_buffer.source_buffer.u_buffer;
-            dst2 = cpi->alt_ref_buffer.source_buffer.v_buffer;
-            stride = cpi->alt_ref_buffer.source_buffer.uv_stride;
+            dst1 = cpi->alt_ref_buffer.u_buffer;
+            dst2 = cpi->alt_ref_buffer.v_buffer;
+            stride = cpi->alt_ref_buffer.uv_stride;
             byte = mb_uv_offset;
             for (i = 0,k = 256; i < 8; i++)
             {
@@ -422,7 +424,8 @@ static void vp8_temporal_filter_iterate_c
 
 void vp8_temporal_filter_prepare_c
 (
-    VP8_COMP *cpi
+    VP8_COMP *cpi,
+    int distance
 )
 {
     int frame = 0;
@@ -441,12 +444,9 @@ void vp8_temporal_filter_prepare_c
 
     int max_frames = cpi->active_arnr_frames;
 
-    num_frames_backward = cpi->last_alt_ref_sei - cpi->source_encode_index;
-
-    if (num_frames_backward < 0)
-        num_frames_backward += cpi->oxcf.lag_in_frames;
-
-    num_frames_forward = cpi->oxcf.lag_in_frames - (num_frames_backward + 1);
+    num_frames_backward = distance;
+    num_frames_forward = vp8_lookahead_depth(cpi->lookahead)
+                         - (num_frames_backward + 1);
 
     switch (blur_type)
     {
@@ -498,8 +498,7 @@ void vp8_temporal_filter_prepare_c
         break;
     }
 
-    start_frame = (cpi->last_alt_ref_sei
-                    + frames_to_blur_forward) % cpi->oxcf.lag_in_frames;
+    start_frame = distance + frames_to_blur_forward;
 
 #ifdef DEBUGFWG
     // DEBUG FWG
@@ -520,12 +519,9 @@ void vp8_temporal_filter_prepare_c
     for (frame = 0; frame < frames_to_blur; frame++)
     {
         int which_buffer =  start_frame - frame;
-
-        if (which_buffer < 0)
-            which_buffer += cpi->oxcf.lag_in_frames;
-
-        cpi->frames[frames_to_blur-1-frame]
-                = &cpi->src_buffer[which_buffer].source_buffer;
+        struct lookahead_entry* buf = vp8_lookahead_peek(cpi->lookahead,
+                                                         which_buffer);
+        cpi->frames[frames_to_blur-1-frame] = &buf->img;
     }
 
     vp8_temporal_filter_iterate_c (
diff --git a/vp8/encoder/tokenize.c b/vp8/encoder/tokenize.c
index e3f423f8a..1c5923813 100644
--- a/vp8/encoder/tokenize.c
+++ b/vp8/encoder/tokenize.c
@@ -224,28 +224,9 @@ void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t)
     int plane_type;
     int b;
 
-    TOKENEXTRA *start = *t;
-    TOKENEXTRA *tp = *t;
-
-    x->mode_info_context->mbmi.dc_diff = 1;
-
-#if 0
-
-    if (x->mbmi.force_no_skip)
-    {
-        x->mbmi.mb_skip_coeff = 1;
-        //reset for next_mb.
-        x->mbmi.force_no_skip = 0;
-    }
-
-#endif
-
-#if 1
-
     x->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable(x);
     if (x->mode_info_context->mbmi.mb_skip_coeff)
     {
-
         cpi->skip_true_count++;
 
         if (!cpi->common.mb_no_coeff_skip)
@@ -255,17 +236,11 @@ void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t)
             vp8_fix_contexts(x);
         }
 
-        if (x->mode_info_context->mbmi.mode != B_PRED && x->mode_info_context->mbmi.mode != SPLITMV)
-            x->mode_info_context->mbmi.dc_diff = 0;
-        else
-            x->mode_info_context->mbmi.dc_diff = 1;
-
-
         return;
     }
 
     cpi->skip_false_count++;
-#endif
+
 #if 0
     vpx_memcpy(cpi->coef_counts_backup, cpi->coef_counts, sizeof(cpi->coef_counts));
 #endif
@@ -292,42 +267,6 @@ void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t)
                             A + vp8_block2above[b],
                             L + vp8_block2left[b], cpi);
 
-#if 0
-
-    if (cpi->common.mb_no_coeff_skip)
-    {
-        int skip = 1;
-
-        while ((tp != *t) && skip)
-        {
-            skip = (skip && (tp->Token == DCT_EOB_TOKEN));
-            tp ++;
-        }
-
-        if (skip != x->mbmi.mb_skip_coeff)
-            skip += 0;
-
-        x->mbmi.mb_skip_coeff = skip;
-
-        if (x->mbmi.mb_skip_coeff == 1)
-        {
-            x->mbmi.dc_diff = 0;
-            //redo the coutnts
-            vpx_memcpy(cpi->coef_counts, cpi->coef_counts_backup, sizeof(cpi->coef_counts));
-
-            *t = start;
-            cpi->skip_true_count++;
-            //skip_true_count++;
-        }
-        else
-        {
-
-            cpi->skip_false_count++;
-            //skip_false_count++;
-        }
-    }
-
-#endif
 }
 
 
@@ -510,13 +449,6 @@ void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t)
                      A + vp8_block2above[24], L + vp8_block2left[24], cpi);
     plane_type = 0;
 
-
-    if (x->mode_info_context->mbmi.mode != B_PRED && x->mode_info_context->mbmi.mode != SPLITMV)
-        x->mode_info_context->mbmi.dc_diff = 0;
-    else
-        x->mode_info_context->mbmi.dc_diff = 1;
-
-
     for (b = 0; b < 16; b++)
         stuff1st_order_b(x->block + b, t, plane_type, x->frame_type,
                          A + vp8_block2above[b],
diff --git a/vp8/encoder/x86/dct_sse2.asm b/vp8/encoder/x86/dct_sse2.asm
index 652dd9804..3d52a5d54 100644
--- a/vp8/encoder/x86/dct_sse2.asm
+++ b/vp8/encoder/x86/dct_sse2.asm
@@ -33,6 +33,7 @@
     %define     input       rcx
     %define     output      rdx
     %define     pitch       r8
+    SAVE_XMM 7, u
   %else
     %define     input       rdi
     %define     output      rsi
@@ -53,6 +54,7 @@
     pop         rbp
 %else
   %ifidn __OUTPUT_FORMAT__,x64
+    RESTORE_XMM
   %endif
 %endif
     ret
diff --git a/vp8/encoder/x86/encodeopt.asm b/vp8/encoder/x86/encodeopt.asm
index c0f06bbbb..994629499 100644
--- a/vp8/encoder/x86/encodeopt.asm
+++ b/vp8/encoder/x86/encodeopt.asm
@@ -22,33 +22,33 @@ sym(vp8_block_error_xmm):
     ; end prologue
 
         mov         rsi,        arg(0) ;coeff_ptr
-
         mov         rdi,        arg(1) ;dcoef_ptr
-        movdqa      xmm3,       [rsi]
 
-        movdqa      xmm4,       [rdi]
-        movdqa      xmm5,       [rsi+16]
+        movdqa      xmm0,       [rsi]
+        movdqa      xmm1,       [rdi]
+
+        movdqa      xmm2,       [rsi+16]
+        movdqa      xmm3,       [rdi+16]
 
-        movdqa      xmm6,       [rdi+16]
-        psubw       xmm3,       xmm4
+        psubw       xmm0,       xmm1
+        psubw       xmm2,       xmm3
 
-        psubw       xmm5,       xmm6
-        pmaddwd     xmm3,       xmm3
-        pmaddwd     xmm5,       xmm5
+        pmaddwd     xmm0,       xmm0
+        pmaddwd     xmm2,       xmm2
 
-        paddd       xmm3,       xmm5
+        paddd       xmm0,       xmm2
 
-        pxor        xmm7,       xmm7
-        movdqa      xmm0,       xmm3
+        pxor        xmm5,       xmm5
+        movdqa      xmm1,       xmm0
 
-        punpckldq   xmm0,       xmm7
-        punpckhdq   xmm3,       xmm7
+        punpckldq   xmm0,       xmm5
+        punpckhdq   xmm1,       xmm5
 
-        paddd       xmm0,       xmm3
-        movdqa      xmm3,       xmm0
+        paddd       xmm0,       xmm1
+        movdqa      xmm1,       xmm0
 
         psrldq      xmm0,       8
-        paddd       xmm0,       xmm3
+        paddd       xmm0,       xmm1
 
         movq        rax,        xmm0
 
@@ -208,53 +208,54 @@ sym(vp8_mbblock_error_xmm_impl):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 3
+    SAVE_XMM 6
     push rsi
     push rdi
     ; end prolog
 
 
         mov         rsi,        arg(0) ;coeff_ptr
-        pxor        xmm7,       xmm7
+        pxor        xmm6,       xmm6
 
         mov         rdi,        arg(1) ;dcoef_ptr
-        pxor        xmm2,       xmm2
+        pxor        xmm4,       xmm4
 
-        movd        xmm1,       dword ptr arg(2) ;dc
-        por         xmm1,       xmm2
+        movd        xmm5,       dword ptr arg(2) ;dc
+        por         xmm5,       xmm4
 
-        pcmpeqw     xmm1,       xmm7
+        pcmpeqw     xmm5,       xmm6
         mov         rcx,        16
 
 mberror_loop:
-        movdqa      xmm3,       [rsi]
-        movdqa      xmm4,       [rdi]
+        movdqa      xmm0,       [rsi]
+        movdqa      xmm1,       [rdi]
 
-        movdqa      xmm5,       [rsi+16]
-        movdqa      xmm6,       [rdi+16]
+        movdqa      xmm2,       [rsi+16]
+        movdqa      xmm3,       [rdi+16]
 
 
-        psubw       xmm5,       xmm6
-        pmaddwd     xmm5,       xmm5
+        psubw       xmm2,       xmm3
+        pmaddwd     xmm2,       xmm2
 
-        psubw       xmm3,       xmm4
-        pand        xmm3,       xmm1
+        psubw       xmm0,       xmm1
+        pand        xmm0,       xmm5
 
-        pmaddwd     xmm3,       xmm3
+        pmaddwd     xmm0,       xmm0
         add         rsi,        32
 
         add         rdi,        32
 
         sub         rcx,        1
-        paddd       xmm2,       xmm5
+        paddd       xmm4,       xmm2
 
-        paddd       xmm2,       xmm3
+        paddd       xmm4,       xmm0
         jnz         mberror_loop
 
-        movdqa      xmm0,       xmm2
-        punpckldq   xmm0,       xmm7
+        movdqa      xmm0,       xmm4
+        punpckldq   xmm0,       xmm6
 
-        punpckhdq   xmm2,       xmm7
-        paddd       xmm0,       xmm2
+        punpckhdq   xmm4,       xmm6
+        paddd       xmm0,       xmm4
 
         movdqa      xmm1,       xmm0
         psrldq      xmm0,       8
@@ -265,6 +266,7 @@ mberror_loop:
     pop rdi
     pop rsi
     ; begin epilog
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -342,7 +344,7 @@ sym(vp8_mbuverror_xmm_impl):
         mov             rdi,        arg(1) ;d_ptr
 
         mov             rcx,        16
-        pxor            xmm7,       xmm7
+        pxor            xmm3,       xmm3
 
 mbuverror_loop:
 
@@ -352,7 +354,7 @@ mbuverror_loop:
         psubw           xmm1,       xmm2
         pmaddwd         xmm1,       xmm1
 
-        paddd           xmm7,       xmm1
+        paddd           xmm3,       xmm1
 
         add             rsi,        16
         add             rdi,        16
@@ -361,7 +363,7 @@ mbuverror_loop:
         jnz             mbuverror_loop
 
         pxor        xmm0,           xmm0
-        movdqa      xmm1,           xmm7
+        movdqa      xmm1,           xmm3
 
         movdqa      xmm2,           xmm1
         punpckldq   xmm1,           xmm0
diff --git a/vp8/encoder/x86/fwalsh_sse2.asm b/vp8/encoder/x86/fwalsh_sse2.asm
index 39439f0d8..71efd5613 100644
--- a/vp8/encoder/x86/fwalsh_sse2.asm
+++ b/vp8/encoder/x86/fwalsh_sse2.asm
@@ -17,7 +17,7 @@ sym(vp8_short_walsh4x4_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 3
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm
index 5e40dc7de..056b64c39 100644
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -20,7 +20,7 @@ global sym(vp8_regular_quantize_b_sse2)
 sym(vp8_regular_quantize_b_sse2):
     push        rbp
     mov         rbp, rsp
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
 
 %if ABI_IS_32BIT
@@ -142,7 +142,7 @@ sym(vp8_regular_quantize_b_sse2):
     movsx       edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]
 
     ; downshift by quant_shift[rc]
-    movsx       ecx, WORD PTR[rax + %1 * 2] ; quant_shift_ptr[rc]
+    movsx       cx, BYTE PTR[rax + %1]      ; quant_shift_ptr[rc]
     sar         edi, cl                     ; also sets Z bit
     je          rq_zigzag_loop_%1           ; !y
     mov         WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
diff --git a/vp8/encoder/x86/quantize_sse4.asm b/vp8/encoder/x86/quantize_sse4.asm
new file mode 100644
index 000000000..258899eed
--- /dev/null
+++ b/vp8/encoder/x86/quantize_sse4.asm
@@ -0,0 +1,254 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+%include "asm_enc_offsets.asm"
+
+
+; void vp8_regular_quantize_b_sse4 | arg
+;  (BLOCK  *b,                     |  0
+;   BLOCKD *d)                     |  1
+
+global sym(vp8_regular_quantize_b_sse4)
+sym(vp8_regular_quantize_b_sse4):
+
+%if ABI_IS_32BIT
+    push        rbp
+    mov         rbp, rsp
+    GET_GOT     rbx
+    push        rdi
+    push        rsi
+
+    ALIGN_STACK 16, rax
+    %define qcoeff      0 ; 32
+    %define stack_size 32
+    sub         rsp, stack_size
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    SAVE_XMM 8, u
+    push        rdi
+    push        rsi
+  %endif
+%endif
+    ; end prolog
+
+%if ABI_IS_32BIT
+    mov         rdi, arg(0)                 ; BLOCK *b
+    mov         rsi, arg(1)                 ; BLOCKD *d
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    mov         rdi, rcx                    ; BLOCK *b
+    mov         rsi, rdx                    ; BLOCKD *d
+  %else
+    ;mov         rdi, rdi                    ; BLOCK *b
+    ;mov         rsi, rsi                    ; BLOCKD *d
+  %endif
+%endif
+
+    mov         rax, [rdi + vp8_block_coeff]
+    mov         rcx, [rdi + vp8_block_zbin]
+    mov         rdx, [rdi + vp8_block_round]
+    movd        xmm7, [rdi + vp8_block_zbin_extra]
+
+    ; z
+    movdqa      xmm0, [rax]
+    movdqa      xmm1, [rax + 16]
+
+    ; duplicate zbin_oq_value
+    pshuflw     xmm7, xmm7, 0
+    punpcklwd   xmm7, xmm7
+
+    movdqa      xmm2, xmm0
+    movdqa      xmm3, xmm1
+
+    ; sz
+    psraw       xmm0, 15
+    psraw       xmm1, 15
+
+    ; (z ^ sz)
+    pxor        xmm2, xmm0
+    pxor        xmm3, xmm1
+
+    ; x = abs(z)
+    psubw       xmm2, xmm0
+    psubw       xmm3, xmm1
+
+    ; zbin
+    movdqa      xmm4, [rcx]
+    movdqa      xmm5, [rcx + 16]
+
+    ; *zbin_ptr + zbin_oq_value
+    paddw       xmm4, xmm7
+    paddw       xmm5, xmm7
+
+    movdqa      xmm6, xmm2
+    movdqa      xmm7, xmm3
+
+    ; x - (*zbin_ptr + zbin_oq_value)
+    psubw       xmm6, xmm4
+    psubw       xmm7, xmm5
+
+    ; round
+    movdqa      xmm4, [rdx]
+    movdqa      xmm5, [rdx + 16]
+
+    mov         rax, [rdi + vp8_block_quant_shift]
+    mov         rcx, [rdi + vp8_block_quant]
+    mov         rdx, [rdi + vp8_block_zrun_zbin_boost]
+
+    ; x + round
+    paddw       xmm2, xmm4
+    paddw       xmm3, xmm5
+
+    ; quant
+    movdqa      xmm4, [rcx]
+    movdqa      xmm5, [rcx + 16]
+
+    ; y = x * quant_ptr >> 16
+    pmulhw      xmm4, xmm2
+    pmulhw      xmm5, xmm3
+
+    ; y += x
+    paddw       xmm2, xmm4
+    paddw       xmm3, xmm5
+
+    pxor        xmm4, xmm4
+%if ABI_IS_32BIT
+    movdqa      [rsp + qcoeff], xmm4
+    movdqa      [rsp + qcoeff + 16], xmm4
+%else
+    pxor        xmm8, xmm8
+%endif
+
+    ; quant_shift
+    movdqa      xmm5, [rax]
+
+    ; zrun_zbin_boost
+    mov         rax, rdx
+
+%macro ZIGZAG_LOOP 5
+    ; x
+    pextrw      ecx, %4, %2
+
+    ; if (x >= zbin)
+    sub         cx, WORD PTR[rdx]           ; x - zbin
+    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
+    jl          rq_zigzag_loop_%1           ; x < zbin
+
+    pextrw      edi, %3, %2                 ; y
+
+    ; downshift by quant_shift[rc]
+    pextrb      ecx, xmm5, %1               ; quant_shift[rc]
+    sar         edi, cl                     ; also sets Z bit
+    je          rq_zigzag_loop_%1           ; !y
+%if ABI_IS_32BIT
+    mov         WORD PTR[rsp + qcoeff + %1 *2], di
+%else
+    pinsrw      %5, edi, %2                 ; qcoeff[rc]
+%endif
+    mov         rdx, rax                    ; reset to b->zrun_zbin_boost
+rq_zigzag_loop_%1:
+%endmacro
+; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
+ZIGZAG_LOOP  0, 0, xmm2, xmm6, xmm4
+ZIGZAG_LOOP  1, 1, xmm2, xmm6, xmm4
+ZIGZAG_LOOP  4, 4, xmm2, xmm6, xmm4
+ZIGZAG_LOOP  8, 0, xmm3, xmm7, xmm8
+ZIGZAG_LOOP  5, 5, xmm2, xmm6, xmm4
+ZIGZAG_LOOP  2, 2, xmm2, xmm6, xmm4
+ZIGZAG_LOOP  3, 3, xmm2, xmm6, xmm4
+ZIGZAG_LOOP  6, 6, xmm2, xmm6, xmm4
+ZIGZAG_LOOP  9, 1, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 12, 4, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 13, 5, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 10, 2, xmm3, xmm7, xmm8
+ZIGZAG_LOOP  7, 7, xmm2, xmm6, xmm4
+ZIGZAG_LOOP 11, 3, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 14, 6, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8
+
+    mov         rcx, [rsi + vp8_blockd_dequant]
+    mov         rdi, [rsi + vp8_blockd_dqcoeff]
+
+%if ABI_IS_32BIT
+    movdqa      xmm4, [rsp + qcoeff]
+    movdqa      xmm5, [rsp + qcoeff + 16]
+%else
+    %define     xmm5 xmm8
+%endif
+
+    ; y ^ sz
+    pxor        xmm4, xmm0
+    pxor        xmm5, xmm1
+    ; x = (y ^ sz) - sz
+    psubw       xmm4, xmm0
+    psubw       xmm5, xmm1
+
+    ; dequant
+    movdqa      xmm0, [rcx]
+    movdqa      xmm1, [rcx + 16]
+
+    mov         rcx, [rsi + vp8_blockd_qcoeff]
+
+    pmullw      xmm0, xmm4
+    pmullw      xmm1, xmm5
+
+    ; store qcoeff
+    movdqa      [rcx], xmm4
+    movdqa      [rcx + 16], xmm5
+
+    ; store dqcoeff
+    movdqa      [rdi], xmm0
+    movdqa      [rdi + 16], xmm1
+
+    ; select the last value (in zig_zag order) for EOB
+    pxor        xmm6, xmm6
+    pcmpeqw     xmm4, xmm6
+    pcmpeqw     xmm5, xmm6
+
+    packsswb    xmm4, xmm5
+    pshufb      xmm4, [GLOBAL(zig_zag1d)]
+    pmovmskb    edx, xmm4
+    xor         rdi, rdi
+    mov         eax, -1
+    xor         dx, ax
+    bsr         eax, edx
+    sub         edi, edx
+    sar         edi, 31
+    add         eax, 1
+    and         eax, edi
+
+    mov         [rsi + vp8_blockd_eob], eax
+
+    ; begin epilog
+%if ABI_IS_32BIT
+    add         rsp, stack_size
+    pop         rsp
+
+    pop         rsi
+    pop         rdi
+    RESTORE_GOT
+    pop         rbp
+%else
+  %undef xmm5
+  %ifidn __OUTPUT_FORMAT__,x64
+    pop         rsi
+    pop         rdi
+    RESTORE_XMM
+  %endif
+%endif
+
+    ret
+
+SECTION_RODATA
+align 16
+; vp8/common/entropy.c: vp8_default_zig_zag1d
+zig_zag1d:
+    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
diff --git a/vp8/encoder/x86/quantize_x86.h b/vp8/encoder/x86/quantize_x86.h
index f09358061..bbe475f8c 100644
--- a/vp8/encoder/x86/quantize_x86.h
+++ b/vp8/encoder/x86/quantize_x86.h
@@ -51,4 +51,17 @@ extern prototype_quantize_block(vp8_fast_quantize_b_ssse3);
 
 #endif /* HAVE_SSSE3 */
 
+
+#if HAVE_SSE4_1
+extern prototype_quantize_block(vp8_regular_quantize_b_sse4);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef vp8_quantize_quantb
+#define vp8_quantize_quantb vp8_regular_quantize_b_sse4
+
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
+
+#endif /* HAVE_SSE4_1 */
+
 #endif /* QUANTIZE_X86_H */
diff --git a/vp8/encoder/x86/sad_sse2.asm b/vp8/encoder/x86/sad_sse2.asm
index cc6bc3cd9..04ee72f72 100644
--- a/vp8/encoder/x86/sad_sse2.asm
+++ b/vp8/encoder/x86/sad_sse2.asm
@@ -21,6 +21,7 @@ sym(vp8_sad16x16_wmt):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 4
+    SAVE_XMM 6
     push        rsi
     push        rdi
     ; end prolog
@@ -34,7 +35,7 @@ sym(vp8_sad16x16_wmt):
         lea             rcx,        [rsi+rax*8]
 
         lea             rcx,        [rcx+rax*8]
-        pxor            xmm7,       xmm7
+        pxor            xmm6,       xmm6
 
 x16x16sad_wmt_loop:
 
@@ -52,32 +53,33 @@ x16x16sad_wmt_loop:
         punpcklbw       xmm1,       xmm3
 
         psadbw          xmm0,       xmm1
-        movq            xmm6,       QWORD PTR [rsi+rax+8]
+        movq            xmm2,       QWORD PTR [rsi+rax+8]
 
         movq            xmm3,       QWORD PTR [rdi+rdx+8]
         lea             rsi,        [rsi+rax*2]
 
         lea             rdi,        [rdi+rdx*2]
-        punpcklbw       xmm4,       xmm6
+        punpcklbw       xmm4,       xmm2
 
         punpcklbw       xmm5,       xmm3
         psadbw          xmm4,       xmm5
 
-        paddw           xmm7,       xmm0
-        paddw           xmm7,       xmm4
+        paddw           xmm6,       xmm0
+        paddw           xmm6,       xmm4
 
         cmp             rsi,        rcx
         jne             x16x16sad_wmt_loop
 
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
+        movq            xmm0,       xmm6
+        psrldq          xmm6,       8
 
-        paddw           xmm0,       xmm7
+        paddw           xmm0,       xmm6
         movq            rax,        xmm0
 
     ; begin epilog
     pop rdi
     pop rsi
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
diff --git a/vp8/encoder/x86/sad_sse3.asm b/vp8/encoder/x86/sad_sse3.asm
index f0336ab17..2dbcc7dc9 100644
--- a/vp8/encoder/x86/sad_sse3.asm
+++ b/vp8/encoder/x86/sad_sse3.asm
@@ -33,14 +33,15 @@
     movsxd      rdx,        dword ptr arg(3)    ; ref_stride
 %else
   %ifidn __OUTPUT_FORMAT__,x64
+    SAVE_XMM 7, u
     %define     src_ptr     rcx
     %define     src_stride  rdx
     %define     ref_ptr     r8
     %define     ref_stride  r9
     %define     end_ptr     r10
     %define     ret_var     r11
-    %define     result_ptr  [rsp+8+4*8]
-    %define     max_err     [rsp+8+4*8]
+    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
+    %define     max_err     [rsp+xmm_stack_space+8+4*8]
   %else
     %define     src_ptr     rdi
     %define     src_stride  rsi
@@ -72,6 +73,7 @@
     pop         rbp
 %else
   %ifidn __OUTPUT_FORMAT__,x64
+    RESTORE_XMM
   %endif
 %endif
     ret
@@ -106,6 +108,7 @@
     xchg        rbx,        rax
 %else
   %ifidn __OUTPUT_FORMAT__,x64
+    SAVE_XMM 7, u
     %define     src_ptr     rcx
     %define     src_stride  rdx
     %define     r0_ptr      rsi
@@ -113,7 +116,7 @@
     %define     r2_ptr      r11
     %define     r3_ptr      r8
     %define     ref_stride  r9
-    %define     result_ptr  [rsp+16+4*8]
+    %define     result_ptr  [rsp+xmm_stack_space+16+4*8]
     push        rsi
 
     LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
@@ -151,6 +154,7 @@
 %else
   %ifidn __OUTPUT_FORMAT__,x64
     pop         rsi
+    RESTORE_XMM
   %endif
 %endif
     ret
diff --git a/vp8/encoder/x86/sad_ssse3.asm b/vp8/encoder/x86/sad_ssse3.asm
index 69c5eaedc..6ecf08184 100644
--- a/vp8/encoder/x86/sad_ssse3.asm
+++ b/vp8/encoder/x86/sad_ssse3.asm
@@ -157,6 +157,7 @@ sym(vp8_sad16x16x3_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
     push        rsi
     push        rdi
     push        rcx
@@ -253,6 +254,7 @@ vp8_sad16x16x3_ssse3_store_off:
     pop         rcx
     pop         rdi
     pop         rsi
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -268,6 +270,7 @@ sym(vp8_sad16x8x3_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
     push        rsi
     push        rdi
     push        rcx
@@ -361,6 +364,7 @@ vp8_sad16x8x3_ssse3_store_off:
     pop         rcx
     pop         rdi
     pop         rsi
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
diff --git a/vp8/encoder/x86/ssim_opt.asm b/vp8/encoder/x86/ssim_opt.asm
index c267cdb54..d5d267a69 100644
--- a/vp8/encoder/x86/ssim_opt.asm
+++ b/vp8/encoder/x86/ssim_opt.asm
@@ -16,12 +16,12 @@
         paddusw         xmm14, xmm4  ; sum_r
         movdqa          xmm1, xmm3
         pmaddwd         xmm1, xmm1
-        paddq           xmm13, xmm1 ; sum_sq_s
+        paddd           xmm13, xmm1 ; sum_sq_s
         movdqa          xmm2, xmm4
         pmaddwd         xmm2, xmm2
-        paddq           xmm12, xmm2 ; sum_sq_r
+        paddd           xmm12, xmm2 ; sum_sq_r
         pmaddwd         xmm3, xmm4
-        paddq           xmm11, xmm3  ; sum_sxr
+        paddd           xmm11, xmm3  ; sum_sxr
 %endmacro
 
 ; Sum across the register %1 starting with q words
@@ -66,6 +66,7 @@ sym(vp8_ssim_parms_16x16_sse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM 15
     push        rsi
     push        rdi
     ; end prolog
@@ -115,19 +116,20 @@ NextRow:
     SUM_ACROSS_Q    xmm11
 
     mov             rdi,arg(4)
-    movq            [rdi], xmm15;
+    movd            [rdi], xmm15;
     mov             rdi,arg(5)
-    movq            [rdi], xmm14;
+    movd            [rdi], xmm14;
     mov             rdi,arg(6)
-    movq            [rdi], xmm13;
+    movd            [rdi], xmm13;
     mov             rdi,arg(7)
-    movq            [rdi], xmm12;
+    movd            [rdi], xmm12;
     mov             rdi,arg(8)
-    movq            [rdi], xmm11;
+    movd            [rdi], xmm11;
 
     ; begin epilog
     pop         rdi
     pop         rsi
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -154,6 +156,7 @@ sym(vp8_ssim_parms_8x8_sse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM 15
     push        rsi
     push        rdi
     ; end prolog
@@ -174,11 +177,8 @@ sym(vp8_ssim_parms_8x8_sse3):
 NextRow2:
 
     ;grab source and reference pixels
-    movq            xmm5, [rsi]
-    movq            xmm6, [rdi]
-
-    movdqa          xmm3, xmm5
-    movdqa          xmm4, xmm6
+    movq            xmm3, [rsi]
+    movq            xmm4, [rdi]
     punpcklbw       xmm3, xmm0 ; low_s
     punpcklbw       xmm4, xmm0 ; low_r
 
@@ -197,19 +197,20 @@ NextRow2:
     SUM_ACROSS_Q    xmm11
 
     mov             rdi,arg(4)
-    movq            [rdi], xmm15;
+    movd            [rdi], xmm15;
     mov             rdi,arg(5)
-    movq            [rdi], xmm14;
+    movd            [rdi], xmm14;
     mov             rdi,arg(6)
-    movq            [rdi], xmm13;
+    movd            [rdi], xmm13;
     mov             rdi,arg(7)
-    movq            [rdi], xmm12;
+    movd            [rdi], xmm12;
     mov             rdi,arg(8)
-    movq            [rdi], xmm11;
+    movd            [rdi], xmm11;
 
     ; begin epilog
     pop         rdi
     pop         rsi
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
diff --git a/vp8/encoder/x86/subtract_sse2.asm b/vp8/encoder/x86/subtract_sse2.asm
index 3fb23d097..95888f6be 100644
--- a/vp8/encoder/x86/subtract_sse2.asm
+++ b/vp8/encoder/x86/subtract_sse2.asm
@@ -77,7 +77,7 @@ sym(vp8_subtract_mby_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 4
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push rsi
     push rdi
diff --git a/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
index 0127b012e..b777ef566 100644
--- a/vp8/encoder/x86/temporal_filter_apply_sse2.asm
+++ b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
@@ -26,7 +26,7 @@ sym(vp8_temporal_filter_apply_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 8
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -164,10 +164,10 @@ temporal_filter_apply_load_finished:
         movdqa      xmm6,           [rdi+32]
         movdqa      xmm7,           [rdi+48]
         ; += modifier
-        paddw       xmm4,           xmm0
-        paddw       xmm5,           xmm2
-        paddw       xmm6,           xmm1
-        paddw       xmm7,           xmm3
+        paddd       xmm4,           xmm0
+        paddd       xmm5,           xmm2
+        paddd       xmm6,           xmm1
+        paddd       xmm7,           xmm3
         ; write back
         movdqa      [rdi],          xmm4
         movdqa      [rdi+16],       xmm5
diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm
index c2c30deb2..5becc7344 100644
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -85,6 +85,7 @@ sym(vp8_get16x16var_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
     push rbx
     push rsi
     push rdi
@@ -206,6 +207,7 @@ var16loop:
     pop rdi
     pop rsi
     pop rbx
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -223,6 +225,7 @@ sym(vp8_get16x16pred_error_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 4
+    SAVE_XMM 7
     GET_GOT     rbx
     push rsi
     push rdi
@@ -321,6 +324,7 @@ var16peloop:
     pop rdi
     pop rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -341,6 +345,7 @@ sym(vp8_get8x8var_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
     GET_GOT     rbx
     push rsi
     push rdi
@@ -506,6 +511,7 @@ sym(vp8_get8x8var_sse2):
     pop rdi
     pop rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -528,7 +534,7 @@ sym(vp8_filter_block2d_bil_var_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push rsi
     push rdi
@@ -805,6 +811,7 @@ sym(vp8_half_horiz_vert_variance8x_h_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
     GET_GOT     rbx
     push rsi
     push rdi
@@ -906,6 +913,7 @@ vp8_half_horiz_vert_variance8x_h_1:
     pop rdi
     pop rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -925,7 +933,7 @@ sym(vp8_half_horiz_vert_variance16x_h_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push rsi
     push rdi
@@ -1041,6 +1049,7 @@ sym(vp8_half_vert_variance8x_h_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
     GET_GOT     rbx
     push rsi
     push rdi
@@ -1127,6 +1136,7 @@ vp8_half_vert_variance8x_h_1:
     pop rdi
     pop rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -1146,7 +1156,7 @@ sym(vp8_half_vert_variance16x_h_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push rsi
     push rdi
@@ -1254,6 +1264,7 @@ sym(vp8_half_horiz_variance8x_h_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
     GET_GOT     rbx
     push rsi
     push rdi
@@ -1338,6 +1349,7 @@ vp8_half_horiz_variance8x_h_1:
     pop rdi
     pop rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -1357,7 +1369,7 @@ sym(vp8_half_horiz_variance16x_h_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push rsi
     push rdi
diff --git a/vp8/encoder/x86/variance_impl_ssse3.asm b/vp8/encoder/x86/variance_impl_ssse3.asm
index 3c0fef9b5..a582f8dc5 100644
--- a/vp8/encoder/x86/variance_impl_ssse3.asm
+++ b/vp8/encoder/x86/variance_impl_ssse3.asm
@@ -34,7 +34,7 @@ sym(vp8_filter_block2d_bil_var_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push rsi
     push rdi
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index 8f2774b7a..b01319fa4 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -271,9 +271,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
         cpi->rtcd.variance.sad8x16x3             = vp8_sad8x16x3_sse3;
         cpi->rtcd.variance.sad8x8x3              = vp8_sad8x8x3_sse3;
         cpi->rtcd.variance.sad4x4x3              = vp8_sad4x4x3_sse3;
-#if !(CONFIG_REALTIME_ONLY)
         cpi->rtcd.search.full_search             = vp8_full_search_sadx3;
-#endif
         cpi->rtcd.variance.sad16x16x4d           = vp8_sad16x16x4d_sse3;
         cpi->rtcd.variance.sad16x8x4d            = vp8_sad16x8x4d_sse3;
         cpi->rtcd.variance.sad8x16x4d            = vp8_sad8x16x4d_sse3;
@@ -314,9 +312,9 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
         cpi->rtcd.variance.sad8x16x8             = vp8_sad8x16x8_sse4;
         cpi->rtcd.variance.sad8x8x8              = vp8_sad8x8x8_sse4;
         cpi->rtcd.variance.sad4x4x8              = vp8_sad4x4x8_sse4;
-#if !(CONFIG_REALTIME_ONLY)
         cpi->rtcd.search.full_search             = vp8_full_search_sadx8;
-#endif
+
+        cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b_sse4;
     }
 #endif
 
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index 2a2f0cfad..5f5ba3a35 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -89,6 +89,7 @@ VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_wrapper_sse2.c
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index 2622738ec..db60bfe4f 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -137,8 +137,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
                                        const vpx_codec_enc_cfg_t *cfg,
                                        const struct vp8_extracfg *vp8_cfg)
 {
-    RANGE_CHECK(cfg, g_w,                   1, 16384);
-    RANGE_CHECK(cfg, g_h,                   1, 16384);
+    RANGE_CHECK(cfg, g_w,                   1, 16383); /* 14 bits available */
+    RANGE_CHECK(cfg, g_h,                   1, 16383); /* 14 bits available */
     RANGE_CHECK(cfg, g_timebase.den,        1, 1000000000);
     RANGE_CHECK(cfg, g_timebase.num,        1, cfg->g_timebase.den);
     RANGE_CHECK_HI(cfg, g_profile,          3);
@@ -151,7 +151,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
     RANGE_CHECK_HI(cfg, g_lag_in_frames,    0);
 #endif
     RANGE_CHECK(cfg, rc_end_usage,          VPX_VBR, VPX_CQ);
-    RANGE_CHECK_HI(cfg, rc_undershoot_pct,  100);
+    RANGE_CHECK_HI(cfg, rc_undershoot_pct,  1000);
+    RANGE_CHECK_HI(cfg, rc_overshoot_pct,   1000);
     RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
     RANGE_CHECK(cfg, kf_mode,               VPX_KF_DISABLED, VPX_KF_AUTO);
     //RANGE_CHECK_BOOL(cfg,                 g_delete_firstpassfile);
@@ -174,16 +175,13 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
               "or kf_max_dist instead.");
 
     RANGE_CHECK_BOOL(vp8_cfg,               enable_auto_alt_ref);
+    RANGE_CHECK(vp8_cfg, cpu_used,           -16, 16);
+
 #if !(CONFIG_REALTIME_ONLY)
     RANGE_CHECK(vp8_cfg, encoding_mode,      VP8_BEST_QUALITY_ENCODING, VP8_REAL_TIME_ENCODING);
-    RANGE_CHECK(vp8_cfg, cpu_used,           -16, 16);
     RANGE_CHECK_HI(vp8_cfg, noise_sensitivity,  6);
 #else
     RANGE_CHECK(vp8_cfg, encoding_mode,      VP8_REAL_TIME_ENCODING, VP8_REAL_TIME_ENCODING);
-
-    if (!((vp8_cfg->cpu_used >= -16 && vp8_cfg->cpu_used <= -4) || (vp8_cfg->cpu_used >= 4 && vp8_cfg->cpu_used <= 16)))
-        ERROR("cpu_used out of range [-16..-4] or [4..16]");
-
     RANGE_CHECK(vp8_cfg, noise_sensitivity,  0, 0);
 #endif
 
@@ -197,8 +195,6 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
 #if !(CONFIG_REALTIME_ONLY)
     if (cfg->g_pass == VPX_RC_LAST_PASS)
     {
-        int              mb_r = (cfg->g_h + 15) / 16;
-        int              mb_c = (cfg->g_w + 15) / 16;
         size_t           packet_sz = sizeof(FIRSTPASS_STATS);
         int              n_packets = cfg->rc_twopass_stats_in.sz / packet_sz;
         FIRSTPASS_STATS *stats;
@@ -309,6 +305,7 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
     }
 
     oxcf->target_bandwidth       = cfg.rc_target_bitrate;
+    oxcf->rc_max_intra_bitrate_pct = cfg.rc_max_intra_bitrate_pct;
 
     oxcf->best_allowed_q          = cfg.rc_min_quantizer;
     oxcf->worst_allowed_q         = cfg.rc_max_quantizer;
@@ -316,7 +313,7 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
     oxcf->fixed_q = -1;
 
     oxcf->under_shoot_pct         = cfg.rc_undershoot_pct;
-    //oxcf->over_shoot_pct        = cfg.rc_overshoot_pct;
+    oxcf->over_shoot_pct          = cfg.rc_overshoot_pct;
 
     oxcf->maximum_buffer_size     = cfg.rc_buf_sz;
     oxcf->starting_buffer_level   = cfg.rc_buf_initial_sz;
@@ -362,6 +359,7 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
         printf("key_freq: %d\n", oxcf->key_freq);
         printf("end_usage: %d\n", oxcf->end_usage);
         printf("under_shoot_pct: %d\n", oxcf->under_shoot_pct);
+        printf("over_shoot_pct: %d\n", oxcf->over_shoot_pct);
         printf("starting_buffer_level: %d\n", oxcf->starting_buffer_level);
         printf("optimal_buffer_level: %d\n",  oxcf->optimal_buffer_level);
         printf("maximum_buffer_size: %d\n", oxcf->maximum_buffer_size);
@@ -1088,11 +1086,11 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] =
         {0},                /* rc_twopass_stats_in */
 #endif
         256,                /* rc_target_bandwidth */
-
+        0,                  /* rc_max_intra_bitrate_pct */
         4,                  /* rc_min_quantizer */
         63,                 /* rc_max_quantizer */
-        95,                 /* rc_undershoot_pct */
-        200,                /* rc_overshoot_pct */
+        100,                /* rc_undershoot_pct */
+        100,                /* rc_overshoot_pct */
 
         6000,               /* rc_max_buffer_size */
         4000,               /* rc_buffer_initial_size; */
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index 8037f9adb..c17837164 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -52,6 +52,8 @@ VP8_CX_SRCS-yes += encoder/encodeintra.h
 VP8_CX_SRCS-yes += encoder/encodemb.h
 VP8_CX_SRCS-yes += encoder/encodemv.h
 VP8_CX_SRCS-yes += encoder/firstpass.h
+VP8_CX_SRCS-yes += encoder/lookahead.c
+VP8_CX_SRCS-yes += encoder/lookahead.h
 VP8_CX_SRCS-yes += encoder/mcomp.h
 VP8_CX_SRCS-yes += encoder/modecosts.h
 VP8_CX_SRCS-yes += encoder/onyx_int.h
@@ -115,6 +117,7 @@ VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_ssse3.c
 VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_impl_ssse3.asm
 VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm
 VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm
+VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
 VP8_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt.asm