32 files changed, 3057 insertions, 3814 deletions
diff --git a/vp9/common/vp9_debugmodes.c b/vp9/common/vp9_debugmodes.c
index 5841f8091..370ebe8f8 100644
--- a/vp9/common/vp9_debugmodes.c
+++ b/vp9/common/vp9_debugmodes.c
@@ -11,126 +11,68 @@
 #include <stdio.h>
 
 #include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_onyxc_int.h"
 
-void vp9_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols,
-                                        int frame, char *file) {
+static void log_frame_info(VP9_COMMON *cm, const char *str, FILE *f) {
+  fprintf(f, "%s", str);
+  fprintf(f, "(Frame %d, Show:%d, Q:%d): \n", cm->current_video_frame,
+          cm->show_frame, cm->base_qindex);
+}
+/* This function dereferences a pointer to the mbmi structure
+ * and uses the passed in member offset to print out the value of an integer
+ * for each mbmi member value in the mi structure.
+ */
+static void print_mi_data(VP9_COMMON *common, FILE *file, char *descriptor,
+                          size_t member_offset) {
   int mi_row;
   int mi_col;
   int mi_index = 0;
-  FILE *mvs = fopen(file, "a");
-
-  // Print out the macroblock Y modes
-  fprintf(mvs, "SB Types for Frame %d\n", frame);
-
-  for (mi_row = 0; mi_row < rows; mi_row++) {
-    for (mi_col = 0; mi_col < cols; mi_col++) {
-      fprintf(mvs, "%2d ", mi[mi_index].mbmi.sb_type);
-
-      mi_index++;
-    }
-
-    fprintf(mvs, "\n");
-    mi_index += 8;
-  }
+  MODE_INFO *mi = common->mi;
+  int rows = common->mi_rows;
+  int cols = common->mi_cols;
+  char prefix = descriptor[0];
 
-  // Print out the macroblock Y modes
-  fprintf(mvs, "Mb Modes for Frame %d\n", frame);
+  log_frame_info(common, descriptor, file);
   mi_index = 0;
   for (mi_row = 0; mi_row < rows; mi_row++) {
+    fprintf(file, "%c ", prefix);
     for (mi_col = 0; mi_col < cols; mi_col++) {
-      fprintf(mvs, "%2d ", mi[mi_index].mbmi.mode);
-
+      fprintf(file, "%2d ",
+              *((int*) ((char *) (&mi[mi_index].mbmi) + member_offset)));
       mi_index++;
     }
-
-    fprintf(mvs, "\n");
+    fprintf(file, "\n");
     mi_index += 8;
   }
-
-  fprintf(mvs, "\n");
-
-  mi_index = 0;
-  fprintf(mvs, "Mb mv ref for Frame %d\n", frame);
-
-  for (mi_row = 0; mi_row < rows; mi_row++) {
-    for (mi_col = 0; mi_col < cols; mi_col++) {
-      fprintf(mvs, "%2d ", mi[mi_index].mbmi.ref_frame[0]);
-
-      mi_index++;
-    }
-
-    fprintf(mvs, "\n");
-    mi_index += 8;
-  }
-  fprintf(mvs, "\n");
-
-  mi_index = 0;
-  fprintf(mvs, "Mb mv ref for Frame %d\n", frame);
-
+  fprintf(file, "\n");
+}
+void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, char *file) {
+  int mi_row;
+  int mi_col;
+  int mi_index = 0;
+  FILE *mvs = fopen(file, "a");
+  MODE_INFO *mi = cm->mi;
+  int rows = cm->mi_rows;
+  int cols = cm->mi_cols;
+
+  print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type));
+  print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode));
+  print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, mb_skip_coeff));
+  print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0]));
+  print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, txfm_size));
+  print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode));
+
+  log_frame_info(cm, "Vectors ",mvs);
   for (mi_row = 0; mi_row < rows; mi_row++) {
+    fprintf(mvs,"V ");
     for (mi_col = 0; mi_col < cols; mi_col++) {
       fprintf(mvs, "%4d:%4d ", mi[mi_index].mbmi.mv[0].as_mv.row,
               mi[mi_index].mbmi.mv[0].as_mv.col);
-
       mi_index++;
     }
-
     fprintf(mvs, "\n");
     mi_index += 8;
   }
-
-  fprintf(mvs, "\n");
-
-  /* print out the macroblock txform sizes */
-  mi_index = 0;
-  fprintf(mvs, "TXFM size for Frame %d\n", frame);
-
-  for (mi_row = 0; mi_row < rows; mi_row++) {
-    for (mi_col = 0; mi_col < cols; mi_col++) {
-      fprintf(mvs, "%2d ", mi[mi_index].mbmi.txfm_size);
-
-      mi_index++;
-    }
-
-    mi_index += 8;
-    fprintf(mvs, "\n");
-  }
-
-  fprintf(mvs, "\n");
-
-  /* print out the macroblock UV modes */
-  mi_index = 0;
-  fprintf(mvs, "UV Modes for Frame %d\n", frame);
-
-  for (mi_row = 0; mi_row < rows; mi_row++) {
-    for (mi_col = 0; mi_col < cols; mi_col++) {
-      fprintf(mvs, "%2d ", mi[mi_index].mbmi.uv_mode);
-
-      mi_index++;
-    }
-
-    mi_index += 8;
-    fprintf(mvs, "\n");
-  }
-
-  fprintf(mvs, "\n");
-
-  /* print out the macroblock mvs */
-  mi_index = 0;
-  fprintf(mvs, "MVs for Frame %d\n", frame);
-
-  for (mi_row = 0; mi_row < rows; mi_row++) {
-    for (mi_col = 0; mi_col < cols; mi_col++) {
-      fprintf(mvs, "%5d:%-5d", mi[mi_index].mbmi.mv[0].as_mv.row / 2,
-              mi[mi_index].mbmi.mv[0].as_mv.col / 2);
-
-      mi_index++;
-    }
-
-    mi_index += 8;
-    fprintf(mvs, "\n");
-  }
-
   fprintf(mvs, "\n");
 
   fclose(mvs);
diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c
index e07e43c8b..000e284ee 100644
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c
@@ -114,7 +114,7 @@ MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) {
   return c;
 }
 
-int vp9_use_nmv_hp(const MV *ref) {
+int vp9_use_mv_hp(const MV *ref) {
   return (abs(ref->row) >> 3) < COMPANDED_MVREF_THRESH &&
          (abs(ref->col) >> 3) < COMPANDED_MVREF_THRESH;
 }
@@ -123,54 +123,50 @@ int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) {
   return mv_class_base(c) + offset;
 }
 
-static void increment_nmv_component_count(int v,
-                                          nmv_component_counts *mvcomp,
-                                          int incr,
-                                          int usehp) {
-  assert (v != 0);            /* should not be zero */
-  mvcomp->mvcount[MV_MAX + v] += incr;
+static void inc_mv_component_count(int v, nmv_component_counts *comp_counts,
+                                   int incr) {
+  assert (v != 0);
+  comp_counts->mvcount[MV_MAX + v] += incr;
 }
 
-static void increment_nmv_component(int v,
-                                    nmv_component_counts *mvcomp,
-                                    int incr,
-                                    int usehp) {
+static void inc_mv_component(int v, nmv_component_counts *comp_counts,
+                             int incr, int usehp) {
   int s, z, c, o, d, e, f;
   if (!incr)
     return;
   assert (v != 0);            /* should not be zero */
   s = v < 0;
-  mvcomp->sign[s] += incr;
+  comp_counts->sign[s] += incr;
   z = (s ? -v : v) - 1;       /* magnitude - 1 */
 
   c = vp9_get_mv_class(z, &o);
-  mvcomp->classes[c] += incr;
+  comp_counts->classes[c] += incr;
 
   d = (o >> 3);               /* int mv data */
   f = (o >> 1) & 3;           /* fractional pel mv data */
   e = (o & 1);                /* high precision mv data */
   if (c == MV_CLASS_0) {
-    mvcomp->class0[d] += incr;
+    comp_counts->class0[d] += incr;
   } else {
     int i;
     int b = c + CLASS0_BITS - 1;  // number of bits
     for (i = 0; i < b; ++i)
-      mvcomp->bits[i][((d >> i) & 1)] += incr;
+      comp_counts->bits[i][((d >> i) & 1)] += incr;
   }
 
   /* Code the fractional pel bits */
   if (c == MV_CLASS_0) {
-    mvcomp->class0_fp[d][f] += incr;
+    comp_counts->class0_fp[d][f] += incr;
   } else {
-    mvcomp->fp[f] += incr;
+    comp_counts->fp[f] += incr;
   }
 
   /* Code the high precision bit */
   if (usehp) {
     if (c == MV_CLASS_0) {
-      mvcomp->class0_hp[e] += incr;
+      comp_counts->class0_hp[e] += incr;
     } else {
-      mvcomp->hp[e] += incr;
+      comp_counts->hp[e] += incr;
     }
   }
 }
@@ -197,8 +193,8 @@ static void counts_to_context(nmv_component_counts *mvcomp, int usehp) {
   int v;
   vpx_memset(mvcomp->sign, 0, sizeof(nmv_component_counts) - sizeof(mvcomp->mvcount));
   for (v = 1; v <= MV_MAX; v++) {
-    increment_nmv_component(-v, mvcomp, mvcomp->mvcount[MV_MAX - v], usehp);
-    increment_nmv_component( v, mvcomp, mvcomp->mvcount[MV_MAX + v], usehp);
+    inc_mv_component(-v, mvcomp, mvcomp->mvcount[MV_MAX - v], usehp);
+    inc_mv_component( v, mvcomp, mvcomp->mvcount[MV_MAX + v], usehp);
   }
 }
 
@@ -206,12 +202,12 @@ void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx,
                        int usehp) {
   const MV_JOINT_TYPE j = vp9_get_mv_joint(mv);
   mvctx->joints[j]++;
-  usehp = usehp && vp9_use_nmv_hp(ref);
+  usehp = usehp && vp9_use_mv_hp(ref);
   if (mv_joint_vertical(j))
-    increment_nmv_component_count(mv->row, &mvctx->comps[0], 1, usehp);
+    inc_mv_component_count(mv->row, &mvctx->comps[0], 1);
 
   if (mv_joint_horizontal(j))
-    increment_nmv_component_count(mv->col, &mvctx->comps[1], 1, usehp);
+    inc_mv_component_count(mv->col, &mvctx->comps[1], 1);
 }
 
 static void adapt_prob(vp9_prob *dest, vp9_prob prep, unsigned int ct[2]) {
@@ -332,7 +328,7 @@ static unsigned int adapt_probs(unsigned int i,
 }
 
 
-void vp9_adapt_nmv_probs(VP9_COMMON *cm, int usehp) {
+void vp9_adapt_mv_probs(VP9_COMMON *cm, int usehp) {
   int i, j;
 #ifdef MV_COUNT_TESTING
   printf("joints count: ");
diff --git a/vp9/common/vp9_entropymv.h b/vp9/common/vp9_entropymv.h
index 15994a6ae..0fc20dbfc 100644
--- a/vp9/common/vp9_entropymv.h
+++ b/vp9/common/vp9_entropymv.h
@@ -21,8 +21,8 @@ struct VP9Common;
 void vp9_entropy_mv_init();
 void vp9_init_mv_probs(struct VP9Common *cm);
 
-void vp9_adapt_nmv_probs(struct VP9Common *cm, int usehp);
-int vp9_use_nmv_hp(const MV *ref);
+void vp9_adapt_mv_probs(struct VP9Common *cm, int usehp);
+int vp9_use_mv_hp(const MV *ref);
 
 #define VP9_NMV_UPDATE_PROB  252
 
diff --git a/vp9/common/vp9_findnearmv.c b/vp9/common/vp9_findnearmv.c
index a6922715e..643b229a6 100644
--- a/vp9/common/vp9_findnearmv.c
+++ b/vp9/common/vp9_findnearmv.c
@@ -15,7 +15,7 @@
 #include "vp9/common/vp9_sadmxn.h"
 
 static void lower_mv_precision(int_mv *mv, int usehp) {
-  if (!usehp || !vp9_use_nmv_hp(&mv->as_mv)) {
+  if (!usehp || !vp9_use_mv_hp(&mv->as_mv)) {
     if (mv->as_mv.row & 1)
       mv->as_mv.row += (mv->as_mv.row > 0 ? -1 : 1);
     if (mv->as_mv.col & 1)
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index a405aab8d..892ad5615 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -266,88 +266,84 @@ prototype unsigned int vp9_variance4x4 "const uint8_t *src_ptr, int source_strid
 specialize vp9_variance4x4 mmx sse2
 
 prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance64x64 sse2
+specialize vp9_sub_pixel_variance64x64 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance64x64
+specialize vp9_sub_pixel_avg_variance64x64 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance32x64
+specialize vp9_sub_pixel_variance32x64 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance32x64
+specialize vp9_sub_pixel_avg_variance32x64 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance64x32
+specialize vp9_sub_pixel_variance64x32 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance64x32
+specialize vp9_sub_pixel_avg_variance64x32 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance32x16
+specialize vp9_sub_pixel_variance32x16 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance32x16
+specialize vp9_sub_pixel_avg_variance32x16 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance16x32
+specialize vp9_sub_pixel_variance16x32 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance16x32
+specialize vp9_sub_pixel_avg_variance16x32 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance32x32 sse2
+specialize vp9_sub_pixel_variance32x32 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance32x32
+specialize vp9_sub_pixel_avg_variance32x32 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3
+specialize vp9_sub_pixel_variance16x16 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance16x16
+specialize vp9_sub_pixel_avg_variance16x16 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance8x16 sse2 mmx
-vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt
+specialize vp9_sub_pixel_variance8x16 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance8x16
+specialize vp9_sub_pixel_avg_variance8x16 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3
-vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_ssse3;
-vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt
+specialize vp9_sub_pixel_variance16x8 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance16x8
+specialize vp9_sub_pixel_avg_variance16x8 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance8x8 sse2 mmx
-vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt
+specialize vp9_sub_pixel_variance8x8 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance8x8
+specialize vp9_sub_pixel_avg_variance8x8 sse2 ssse3
 
 # TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form
 prototype unsigned int vp9_sub_pixel_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance8x4
+specialize vp9_sub_pixel_variance8x4 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_avg_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance8x4
+specialize vp9_sub_pixel_avg_variance8x4 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance4x8
+specialize vp9_sub_pixel_variance4x8 sse ssse3
 
 prototype unsigned int vp9_sub_pixel_avg_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance4x8
+specialize vp9_sub_pixel_avg_variance4x8 sse ssse3
 
 prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance4x4 sse2 mmx
-vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
+specialize vp9_sub_pixel_variance4x4 sse ssse3
+#vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
 
 prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance4x4
+specialize vp9_sub_pixel_avg_variance4x4 sse ssse3
 
 prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
 specialize vp9_sad64x64 sse2
@@ -390,15 +386,15 @@ prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, co
 specialize vp9_sad4x4 mmx sse
 
 prototype unsigned int vp9_variance_halfpixvar16x16_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar16x16_h mmx sse2
+specialize vp9_variance_halfpixvar16x16_h sse2
 vp9_variance_halfpixvar16x16_h_sse2=vp9_variance_halfpixvar16x16_h_wmt
 
 prototype unsigned int vp9_variance_halfpixvar16x16_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar16x16_v mmx sse2
+specialize vp9_variance_halfpixvar16x16_v sse2
 vp9_variance_halfpixvar16x16_v_sse2=vp9_variance_halfpixvar16x16_v_wmt
 
 prototype unsigned int vp9_variance_halfpixvar16x16_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar16x16_hv mmx sse2
+specialize vp9_variance_halfpixvar16x16_hv sse2
 vp9_variance_halfpixvar16x16_hv_sse2=vp9_variance_halfpixvar16x16_hv_wmt
 
 prototype unsigned int vp9_variance_halfpixvar64x64_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
@@ -507,8 +503,8 @@ specialize vp9_sad4x8x4d sse
 prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad4x4x4d sse
 
-prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
-specialize vp9_sub_pixel_mse16x16 sse2 mmx
+#prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
+#specialize vp9_sub_pixel_mse16x16 sse2 mmx
 
 prototype unsigned int vp9_mse16x16 "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse"
 specialize vp9_mse16x16 mmx sse2
@@ -533,9 +529,11 @@ prototype unsigned int vp9_get_mb_ss "const int16_t *"
 specialize vp9_get_mb_ss mmx sse2
 # ENCODEMB INVOKE
 
-prototype int vp9_block_error "int16_t *coeff, int16_t *dqcoeff, int block_size"
-specialize vp9_block_error mmx sse2
-vp9_block_error_sse2=vp9_block_error_xmm
+prototype int64_t vp9_block_error "int16_t *coeff, int16_t *dqcoeff, intptr_t block_size"
+specialize vp9_block_error sse2
+
+prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"
+specialize vp9_subtract_block sse2
 
 #
 # Structured Similarity (SSIM)
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index b3d41bed7..d8836c962 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -8,20 +8,21 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
-#include "vp9/decoder/vp9_treereader.h"
-#include "vp9/common/vp9_entropymv.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_entropymode.h"
-#include "vp9/common/vp9_reconinter.h"
-#include "vp9/decoder/vp9_onyxd_int.h"
+#include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_findnearmv.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_mvref_common.h"
 #include "vp9/common/vp9_pred_common.h"
-#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_seg_common.h"
+
 #include "vp9/decoder/vp9_decodemv.h"
 #include "vp9/decoder/vp9_decodframe.h"
-#include "vp9/common/vp9_mvref_common.h"
+#include "vp9/decoder/vp9_onyxd_int.h"
+#include "vp9/decoder/vp9_treereader.h"
+
 #if CONFIG_DEBUG
 #include <assert.h>
 #endif
@@ -37,14 +38,52 @@ extern int dec_debug;
 #endif
 
 static MB_PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) {
-  MB_PREDICTION_MODE m = treed_read(r, vp9_intra_mode_tree, p);
-  return m;
+  return treed_read(r, vp9_intra_mode_tree, p);
 }
 
 static int read_mb_segid(vp9_reader *r, MACROBLOCKD *xd) {
   return treed_read(r, vp9_segment_tree, xd->mb_segment_tree_probs);
 }
 
+static TX_SIZE select_txfm_size(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                vp9_reader *r, BLOCK_SIZE_TYPE bsize) {
+  const int context = vp9_get_pred_context(cm, xd, PRED_TX_SIZE);
+  const vp9_prob *tx_probs = vp9_get_pred_probs(cm, xd, PRED_TX_SIZE);
+  TX_SIZE txfm_size = vp9_read(r, tx_probs[0]);
+  if (txfm_size != TX_4X4 && bsize >= BLOCK_SIZE_MB16X16) {
+    txfm_size += vp9_read(r, tx_probs[1]);
+    if (txfm_size != TX_8X8 && bsize >= BLOCK_SIZE_SB32X32)
+      txfm_size += vp9_read(r, tx_probs[2]);
+  }
+
+  if (bsize >= BLOCK_SIZE_SB32X32)
+    cm->fc.tx_count_32x32p[context][txfm_size]++;
+  else if (bsize >= BLOCK_SIZE_MB16X16)
+    cm->fc.tx_count_16x16p[context][txfm_size]++;
+  else
+    cm->fc.tx_count_8x8p[context][txfm_size]++;
+
+  return txfm_size;
+}
+
+static TX_SIZE get_txfm_size(VP9D_COMP *pbi, TXFM_MODE txfm_mode,
+                             BLOCK_SIZE_TYPE bsize, int select_cond,
+                             vp9_reader *r) {
+  VP9_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &pbi->mb;
+
+  if (txfm_mode == TX_MODE_SELECT && bsize >= BLOCK_SIZE_SB8X8 && select_cond)
+    return select_txfm_size(cm, xd, r, bsize);
+  else if (txfm_mode >= ALLOW_32X32 && bsize >= BLOCK_SIZE_SB32X32)
+    return TX_32X32;
+  else if (txfm_mode >= ALLOW_16X16 && bsize >= BLOCK_SIZE_MB16X16)
+    return TX_16X16;
+  else if (txfm_mode >= ALLOW_8X8 && bsize >= BLOCK_SIZE_SB8X8)
+    return TX_8X8;
+  else
+    return TX_4X4;
+}
+
 static void set_segment_id(VP9_COMMON *cm, MB_MODE_INFO *mbmi,
                            int mi_row, int mi_col, int segment_id) {
   const int mi_index = mi_row * cm->mi_cols + mi_col;
@@ -63,27 +102,6 @@ static void set_segment_id(VP9_COMMON *cm, MB_MODE_INFO *mbmi,
   }
 }
 
-static TX_SIZE select_txfm_size(VP9_COMMON *cm, MACROBLOCKD *xd,
-                                vp9_reader *r, BLOCK_SIZE_TYPE bsize) {
-  const int context = vp9_get_pred_context(cm, xd, PRED_TX_SIZE);
-  const vp9_prob *tx_probs = vp9_get_pred_probs(cm, xd, PRED_TX_SIZE);
-  TX_SIZE txfm_size = vp9_read(r, tx_probs[0]);
-  if (txfm_size != TX_4X4 && bsize >= BLOCK_SIZE_MB16X16) {
-    txfm_size += vp9_read(r, tx_probs[1]);
-    if (txfm_size != TX_8X8 && bsize >= BLOCK_SIZE_SB32X32)
-      txfm_size += vp9_read(r, tx_probs[2]);
-  }
-  if (bsize >= BLOCK_SIZE_SB32X32) {
-    cm->fc.tx_count_32x32p[context][txfm_size]++;
-  } else if (bsize >= BLOCK_SIZE_MB16X16) {
-    cm->fc.tx_count_16x16p[context][txfm_size]++;
-  } else {
-    cm->fc.tx_count_8x8p[context][txfm_size]++;
-  }
-  return txfm_size;
-}
-
-
 static void kfread_modes(VP9D_COMP *pbi, MODE_INFO *m,
                          int mi_row, int mi_col,
                          vp9_reader *r) {
@@ -106,21 +124,8 @@ static void kfread_modes(VP9D_COMP *pbi, MODE_INFO *m,
                        [m->mbmi.mb_skip_coeff]++;
   }
 
-  if (cm->txfm_mode == TX_MODE_SELECT &&
-      m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) {
-    m->mbmi.txfm_size = select_txfm_size(cm, xd, r, m->mbmi.sb_type);
-  } else if (cm->txfm_mode >= ALLOW_32X32 &&
-             m->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {
-    m->mbmi.txfm_size = TX_32X32;
-  } else if (cm->txfm_mode >= ALLOW_16X16 &&
-             m->mbmi.sb_type >= BLOCK_SIZE_MB16X16) {
-    m->mbmi.txfm_size = TX_16X16;
-  } else if (cm->txfm_mode >= ALLOW_8X8 &&
-             m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) {
-    m->mbmi.txfm_size = TX_8X8;
-  } else {
-    m->mbmi.txfm_size = TX_4X4;
-  }
+  m->mbmi.txfm_size = get_txfm_size(pbi, cm->txfm_mode, m->mbmi.sb_type,
+                                    1, r);
 
   // luma mode
   m->mbmi.ref_frame[0] = INTRA_FRAME;
@@ -303,28 +308,22 @@ unsigned int vp9_mv_cont_count[5][4] = {
 };
 #endif
 
-static void read_switchable_interp_probs(VP9_COMMON* const cm, vp9_reader *r) {
+static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
   int i, j;
-  for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j)
-    for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) {
-      if (vp9_read(r, VP9_MODE_UPDATE_PROB)) {
-        cm->fc.switchable_interp_prob[j][i] =
-            // vp9_read_prob(r);
-            vp9_read_prob_diff_update(r, cm->fc.switchable_interp_prob[j][i]);
-      }
-    }
+  for (j = 0; j < VP9_SWITCHABLE_FILTERS + 1; ++j)
+    for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i)
+      if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+        fc->switchable_interp_prob[j][i] = vp9_read_prob_diff_update(r,
+                                             fc->switchable_interp_prob[j][i]);
 }
 
-static void read_inter_mode_probs(VP9_COMMON *const cm, vp9_reader *r) {
+static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
   int i, j;
   for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
-    for (j = 0; j < VP9_INTER_MODES - 1; ++j) {
-      if (vp9_read(r, VP9_MODE_UPDATE_PROB)) {
-        // cm->fc.inter_mode_probs[i][j] = vp9_read_prob(r);
-        cm->fc.inter_mode_probs[i][j] =
-            vp9_read_prob_diff_update(r, cm->fc.inter_mode_probs[i][j]);
-      }
-    }
+    for (j = 0; j < VP9_INTER_MODES - 1; ++j)
+      if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+        fc->inter_mode_probs[i][j] = vp9_read_prob_diff_update(r,
+                                       fc->inter_mode_probs[i][j]);
 }
 
 static INLINE COMPPREDMODE_TYPE read_comp_pred_mode(vp9_reader *r) {
@@ -337,21 +336,20 @@ static INLINE COMPPREDMODE_TYPE read_comp_pred_mode(vp9_reader *r) {
 static void mb_mode_mv_init(VP9D_COMP *pbi, vp9_reader *r) {
   VP9_COMMON *const cm = &pbi->common;
 
-  if ((cm->frame_type != KEY_FRAME) && (!cm->intra_only)) {
+  if (cm->frame_type != KEY_FRAME && !cm->intra_only) {
     nmv_context *const nmvc = &pbi->common.fc.nmvc;
     MACROBLOCKD *const xd = &pbi->mb;
     int i, j;
 
-    read_inter_mode_probs(cm, r);
+    read_inter_mode_probs(&cm->fc, r);
 
     if (cm->mcomp_filter_type == SWITCHABLE)
-      read_switchable_interp_probs(cm, r);
+      read_switchable_interp_probs(&cm->fc, r);
 
-    for (i = 0; i < INTRA_INTER_CONTEXTS; i++) {
+    for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
       if (vp9_read(r, VP9_MODE_UPDATE_PROB))
         cm->fc.intra_inter_prob[i] =
             vp9_read_prob_diff_update(r, cm->fc.intra_inter_prob[i]);
-    }
 
     if (cm->allow_comp_inter_inter) {
       cm->comp_pred_mode = read_comp_pred_mode(r);
@@ -461,7 +459,7 @@ static INLINE void decode_mv(vp9_reader *r, MV *mv, const MV *ref,
   const MV_JOINT_TYPE j = treed_read(r, vp9_mv_joint_tree, ctx->joints);
   MV diff = {0, 0};
 
-  usehp = usehp && vp9_use_nmv_hp(ref);
+  usehp = usehp && vp9_use_mv_hp(ref);
   if (mv_joint_vertical(j))
     diff.row = read_mv_component(r, &ctx->comps[0], usehp);
 
@@ -476,27 +474,80 @@ static INLINE void decode_mv(vp9_reader *r, MV *mv, const MV *ref,
 
 static INLINE INTERPOLATIONFILTERTYPE read_switchable_filter_type(
     VP9D_COMP *pbi, vp9_reader *r) {
-  const int index = treed_read(r, vp9_switchable_interp_tree,
-                               vp9_get_pred_probs(&pbi->common, &pbi->mb,
-                                                  PRED_SWITCHABLE_INTERP));
-  ++pbi->common.fc.switchable_interp_count
-                [vp9_get_pred_context(
-                    &pbi->common, &pbi->mb, PRED_SWITCHABLE_INTERP)][index];
+  VP9_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &pbi->mb;
+  const vp9_prob *probs = vp9_get_pred_probs(cm, xd, PRED_SWITCHABLE_INTERP);
+  const int index = treed_read(r, vp9_switchable_interp_tree, probs);
+  const int ctx = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);
+  ++cm->fc.switchable_interp_count[ctx][index];
   return vp9_switchable_interp[index];
 }
 
+static void read_intra_block_modes(VP9D_COMP *pbi, MODE_INFO *mi,
+                                   MB_MODE_INFO *mbmi, vp9_reader *r) {
+  VP9_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &pbi->mb;
+  const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type;
+  const int bw = 1 << b_width_log2(bsize);
+  const int bh = 1 << b_height_log2(bsize);
+
+  if (bsize >= BLOCK_SIZE_SB8X8) {
+    const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
+    const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
+    const int bsl = MIN(bwl, bhl);
+    mbmi->mode = read_intra_mode(r, cm->fc.y_mode_prob[MIN(3, bsl)]);
+    cm->fc.y_mode_counts[MIN(3, bsl)][mbmi->mode]++;
+  } else {
+     int idx, idy;
+     for (idy = 0; idy < 2; idy += bh) {
+       for (idx = 0; idx < 2; idx += bw) {
+         int ib = idy * 2 + idx, k;
+         int m = read_intra_mode(r, cm->fc.y_mode_prob[0]);
+         mi->bmi[ib].as_mode.first = m;
+         cm->fc.y_mode_counts[0][m]++;
+         for (k = 1; k < bh; ++k)
+           mi->bmi[ib + k * 2].as_mode.first = m;
+         for (k = 1; k < bw; ++k)
+           mi->bmi[ib + k].as_mode.first = m;
+      }
+    }
+    mbmi->mode = mi->bmi[3].as_mode.first;
+  }
+
+  mbmi->uv_mode = read_intra_mode(r, cm->fc.uv_mode_prob[mbmi->mode]);
+  cm->fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++;
+}
+
+static MV_REFERENCE_FRAME read_reference_frame(VP9D_COMP *pbi, int segment_id,
+                                               vp9_reader *r) {
+  VP9_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &pbi->mb;
+
+  MV_REFERENCE_FRAME ref;
+  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME)) {
+    const int ctx = vp9_get_pred_context(cm, xd, PRED_INTRA_INTER);
+    ref = (MV_REFERENCE_FRAME)
+              vp9_read(r, vp9_get_pred_prob(cm, xd, PRED_INTRA_INTER));
+    cm->fc.intra_inter_count[ctx][ref != INTRA_FRAME]++;
+  } else {
+    ref = (MV_REFERENCE_FRAME)
+              vp9_get_segdata(xd, segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME;
+  }
+  return ref;
+}
+
 static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
                              int mi_row, int mi_col,
                              vp9_reader *r) {
   VP9_COMMON *const cm = &pbi->common;
-  nmv_context *const nmvc = &cm->fc.nmvc;
   MACROBLOCKD *const xd = &pbi->mb;
+  nmv_context *const nmvc = &cm->fc.nmvc;
 
   int_mv *const mv0 = &mbmi->mv[0];
   int_mv *const mv1 = &mbmi->mv[1];
-  BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type;
-  int bw = 1 << b_width_log2(bsize);
-  int bh = 1 << b_height_log2(bsize);
+  const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type;
+  const int bw = 1 << b_width_log2(bsize);
+  const int bh = 1 << b_height_log2(bsize);
 
   int mb_to_left_edge, mb_to_right_edge, mb_to_top_edge, mb_to_bottom_edge;
   int j, idx, idy;
@@ -529,32 +580,9 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
                        [mbmi->mb_skip_coeff]++;
   }
 
-  // Read the reference frame
-  if (!vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_REF_FRAME)) {
-    mbmi->ref_frame[0] =
-        vp9_read(r, vp9_get_pred_prob(cm, xd, PRED_INTRA_INTER));
-    cm->fc.intra_inter_count[vp9_get_pred_context(cm, xd, PRED_INTRA_INTER)]
-                            [mbmi->ref_frame[0] != INTRA_FRAME]++;
-  } else {
-    mbmi->ref_frame[0] =
-        vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME;
-  }
-
-  if (cm->txfm_mode == TX_MODE_SELECT &&
-      (mbmi->mb_skip_coeff == 0 || mbmi->ref_frame[0] == INTRA_FRAME) &&
-      bsize >= BLOCK_SIZE_SB8X8) {
-    mbmi->txfm_size = select_txfm_size(cm, xd, r, bsize);
-  } else if (bsize >= BLOCK_SIZE_SB32X32 &&
-             cm->txfm_mode >= ALLOW_32X32) {
-    mbmi->txfm_size = TX_32X32;
-  } else if (cm->txfm_mode >= ALLOW_16X16 &&
-             bsize >= BLOCK_SIZE_MB16X16) {
-    mbmi->txfm_size = TX_16X16;
-  } else if (cm->txfm_mode >= ALLOW_8X8 && (bsize >= BLOCK_SIZE_SB8X8)) {
-    mbmi->txfm_size = TX_8X8;
-  } else {
-    mbmi->txfm_size = TX_4X4;
-  }
+  mbmi->ref_frame[0] = read_reference_frame(pbi, mbmi->segment_id, r);
+  mbmi->txfm_size = get_txfm_size(pbi, cm->txfm_mode, bsize,
+     (mbmi->mb_skip_coeff == 0 || mbmi->ref_frame[0] == INTRA_FRAME), r);
 
   // If reference frame is an Inter frame
   if (mbmi->ref_frame[0] != INTRA_FRAME) {
@@ -745,34 +773,8 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
       }
     }
   } else {
-    // required for left and above block mv
-    mv0->as_int = 0;
-
-    if (bsize >= BLOCK_SIZE_SB8X8) {
-      const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
-      const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
-      const int bsl = MIN(bwl, bhl);
-      mbmi->mode = read_intra_mode(r, cm->fc.y_mode_prob[MIN(3, bsl)]);
-      cm->fc.y_mode_counts[MIN(3, bsl)][mbmi->mode]++;
-    } else {
-      int idx, idy;
-      for (idy = 0; idy < 2; idy += bh) {
-        for (idx = 0; idx < 2; idx += bw) {
-          int ib = idy * 2 + idx, k;
-          int m = read_intra_mode(r, cm->fc.y_mode_prob[0]);
-          mi->bmi[ib].as_mode.first = m;
-          cm->fc.y_mode_counts[0][m]++;
-          for (k = 1; k < bh; ++k)
-            mi->bmi[ib + k * 2].as_mode.first = m;
-          for (k = 1; k < bw; ++k)
-            mi->bmi[ib + k].as_mode.first = m;
-        }
-      }
-      mbmi->mode = mi->bmi[3].as_mode.first;
-    }
-
-    mbmi->uv_mode = read_intra_mode(r, cm->fc.uv_mode_prob[mbmi->mode]);
-    cm->fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++;
+    mv0->as_int = 0;  // required for left and above block mv
+    read_intra_block_modes(pbi, mi, mbmi, r);
   }
 }
 
@@ -782,13 +784,10 @@ void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, vp9_reader *r) {
 
   // TODO(jkoleszar): does this clear more than MBSKIP_CONTEXTS? Maybe remove.
   // vpx_memset(cm->fc.mbskip_probs, 0, sizeof(cm->fc.mbskip_probs));
-  for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
-    if (vp9_read(r, VP9_MODE_UPDATE_PROB)) {
+  for (k = 0; k < MBSKIP_CONTEXTS; ++k)
+    if (vp9_read(r, VP9_MODE_UPDATE_PROB))
       cm->fc.mbskip_probs[k] =
           vp9_read_prob_diff_update(r, cm->fc.mbskip_probs[k]);
-    }
-    // cm->fc.mbskip_probs[k] = vp9_read_prob(r);
-  }
 
   mb_mode_mv_init(pbi, r);
 }
@@ -802,7 +801,7 @@ void vp9_decode_mb_mode_mv(VP9D_COMP* const pbi,
   MODE_INFO *mi = xd->mode_info_context;
   MB_MODE_INFO *const mbmi = &mi->mbmi;
 
-  if ((cm->frame_type == KEY_FRAME) || cm->intra_only) {
+  if (cm->frame_type == KEY_FRAME || cm->intra_only) {
     kfread_modes(pbi, mi, mi_row, mi_col, r);
   } else {
     read_mb_modes_mv(pbi, mi, &mi->mbmi, mi_row, mi_col, r);
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 49b181d69..078d09b0d 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -276,9 +276,7 @@ static void decode_atom(VP9D_COMP *pbi, MACROBLOCKD *xd,
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
 
   assert(mbmi->ref_frame[0] != INTRA_FRAME);
-
-  if ((pbi->common.frame_type != KEY_FRAME) && (!pbi->common.intra_only))
-    vp9_setup_interp_filters(xd, mbmi->interp_filter, &pbi->common);
+  vp9_setup_interp_filters(xd, mbmi->interp_filter, &pbi->common);
 
   // prediction
   vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
@@ -327,8 +325,7 @@ static void decode_sb(VP9D_COMP *pbi, MACROBLOCKD *xd, int mi_row, int mi_col,
   assert(mbmi->sb_type == bsize);
   assert(mbmi->ref_frame[0] != INTRA_FRAME);
 
-  if (pbi->common.frame_type != KEY_FRAME)
-    vp9_setup_interp_filters(xd, mbmi->interp_filter, pc);
+  vp9_setup_interp_filters(xd, mbmi->interp_filter, pc);
 
   // generate prediction
   vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
@@ -392,26 +389,24 @@ static void set_refs(VP9D_COMP *pbi, int mi_row, int mi_col) {
   MACROBLOCKD *const xd = &pbi->mb;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
 
-  if (mbmi->ref_frame[0] > INTRA_FRAME) {
+  // Select the appropriate reference frame for this MB
+  const int fb_idx = cm->active_ref_idx[mbmi->ref_frame[0] - 1];
+  const YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[fb_idx];
+  xd->scale_factor[0] = cm->active_ref_scale[mbmi->ref_frame[0] - 1];
+  xd->scale_factor_uv[0] = cm->active_ref_scale[mbmi->ref_frame[0] - 1];
+  setup_pre_planes(xd, cfg, NULL, mi_row, mi_col, xd->scale_factor,
+                   xd->scale_factor_uv);
+  xd->corrupted |= cfg->corrupted;
+
+  if (mbmi->ref_frame[1] > INTRA_FRAME) {
     // Select the appropriate reference frame for this MB
-    const int fb_idx = cm->active_ref_idx[mbmi->ref_frame[0] - 1];
-    const YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[fb_idx];
-    xd->scale_factor[0]    = cm->active_ref_scale[mbmi->ref_frame[0] - 1];
-    xd->scale_factor_uv[0] = cm->active_ref_scale[mbmi->ref_frame[0] - 1];
-    setup_pre_planes(xd, cfg, NULL, mi_row, mi_col,
-                     xd->scale_factor, xd->scale_factor_uv);
-    xd->corrupted |= cfg->corrupted;
-
-    if (mbmi->ref_frame[1] > INTRA_FRAME) {
-      // Select the appropriate reference frame for this MB
-      const int second_fb_idx = cm->active_ref_idx[mbmi->ref_frame[1] - 1];
-      const YV12_BUFFER_CONFIG *second_cfg = &cm->yv12_fb[second_fb_idx];
-      xd->scale_factor[1]    = cm->active_ref_scale[mbmi->ref_frame[1] - 1];
-      xd->scale_factor_uv[1] = cm->active_ref_scale[mbmi->ref_frame[1] - 1];
-      setup_pre_planes(xd, NULL, second_cfg, mi_row, mi_col,
-                       xd->scale_factor, xd->scale_factor_uv);
-      xd->corrupted |= second_cfg->corrupted;
-    }
+    const int second_fb_idx = cm->active_ref_idx[mbmi->ref_frame[1] - 1];
+    const YV12_BUFFER_CONFIG *second_cfg = &cm->yv12_fb[second_fb_idx];
+    xd->scale_factor[1] = cm->active_ref_scale[mbmi->ref_frame[1] - 1];
+    xd->scale_factor_uv[1] = cm->active_ref_scale[mbmi->ref_frame[1] - 1];
+    setup_pre_planes(xd, NULL, second_cfg, mi_row, mi_col, xd->scale_factor,
+                     xd->scale_factor_uv);
+    xd->corrupted |= second_cfg->corrupted;
   }
 }
 
@@ -424,16 +419,17 @@ static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col,
       return;
   set_offsets(pbi, bsize, mi_row, mi_col);
   vp9_decode_mb_mode_mv(pbi, xd, mi_row, mi_col, r);
-  set_refs(pbi, mi_row, mi_col);
 
-  if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME)
+  if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) {
     decode_sb_intra(pbi, xd, mi_row, mi_col, r, (bsize < BLOCK_SIZE_SB8X8) ?
                                      BLOCK_SIZE_SB8X8 : bsize);
-  else if (bsize < BLOCK_SIZE_SB8X8)
-    decode_atom(pbi, xd, mi_row, mi_col, r, BLOCK_SIZE_SB8X8);
-  else
-    decode_sb(pbi, xd, mi_row, mi_col, r, bsize);
-
+  } else {
+    set_refs(pbi, mi_row, mi_col);
+    if (bsize < BLOCK_SIZE_SB8X8)
+      decode_atom(pbi, xd, mi_row, mi_col, r, BLOCK_SIZE_SB8X8);
+    else
+      decode_sb(pbi, xd, mi_row, mi_col, r, bsize);
+  }
   xd->corrupted |= vp9_reader_has_error(r);
 }
 
@@ -1187,7 +1183,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
     if ((!keyframe) && (!pc->intra_only)) {
       vp9_adapt_mode_probs(pc);
       vp9_adapt_mode_context(pc);
-      vp9_adapt_nmv_probs(pc, xd->allow_high_precision_mv);
+      vp9_adapt_mv_probs(pc, xd->allow_high_precision_mv);
     }
   }
 
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 54b6e2440..f655d456b 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -8,7 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #include "./vpx_config.h"
 #include "./vp9_rtcd.h"
 #include "vp9/encoder/vp9_encodeframe.h"
@@ -46,9 +45,8 @@ int enc_debug = 0;
 
 void vp9_select_interp_filter_type(VP9_COMP *cpi);
 
-static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
-                              int output_enabled, int mi_row, int mi_col,
-                              BLOCK_SIZE_TYPE bsize);
+static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
+                              int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize);
 
 static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x);
 
@@ -64,10 +62,8 @@ static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x);
  * Eventually this should be replaced by custom no-reference routines,
  *  which will be faster.
  */
-static const uint8_t VP9_VAR_OFFS[16] = {
-  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128
-};
-
+static const uint8_t VP9_VAR_OFFS[16] = {128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128};
 
 // Original activity measure from Tim T's code.
 static unsigned int tt_activity_measure(VP9_COMP *cpi, MACROBLOCK *x) {
@@ -92,13 +88,11 @@ static unsigned int tt_activity_measure(VP9_COMP *cpi, MACROBLOCK *x) {
 }
 
 // Stub for alternative experimental activity measures.
-static unsigned int alt_activity_measure(VP9_COMP *cpi,
-                                         MACROBLOCK *x, int use_dc_pred) {
+static unsigned int alt_activity_measure(VP9_COMP *cpi, MACROBLOCK *x,
+                                         int use_dc_pred) {
   return vp9_encode_intra(cpi, x, use_dc_pred);
 }
-
-DECLARE_ALIGNED(16, static const uint8_t, vp9_64x64_zeros[64*64]) = { 0 };
-
+DECLARE_ALIGNED(16, static const uint8_t, vp9_64x64_zeros[64*64]) = {0};
 
 // Measure the activity of the current macroblock
 // What we measure here is TBD so abstracted to this function
@@ -136,13 +130,12 @@ static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) {
 
     // Create a list to sort to
     CHECK_MEM_ERROR(sortlist,
-    vpx_calloc(sizeof(unsigned int),
-    cpi->common.MBs));
+        vpx_calloc(sizeof(unsigned int),
+            cpi->common.MBs));
 
     // Copy map to sort list
     vpx_memcpy(sortlist, cpi->mb_activity_map,
-    sizeof(unsigned int) * cpi->common.MBs);
-
+        sizeof(unsigned int) * cpi->common.MBs);
 
     // Ripple each value down to its correct position
     for (i = 1; i < cpi->common.MBs; i ++) {
@@ -153,13 +146,13 @@ static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) {
           sortlist[j - 1] = sortlist[j];
           sortlist[j] = tmp;
         } else
-          break;
+        break;
       }
     }
 
     // Even number MBs so estimate median as mean of two either side.
     median = (1 + sortlist[cpi->common.MBs >> 1] +
-              sortlist[(cpi->common.MBs >> 1) + 1]) >> 1;
+        sortlist[(cpi->common.MBs >> 1) + 1]) >> 1;
 
     cpi->activity_avg = median;
 
@@ -167,7 +160,7 @@ static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) {
   }
 #else
   // Simple mean for now
-  cpi->activity_avg = (unsigned int)(activity_sum / cpi->common.MBs);
+  cpi->activity_avg = (unsigned int) (activity_sum / cpi->common.MBs);
 #endif
 
   if (cpi->activity_avg < VP9_ACTIVITY_AVG_MIN)
@@ -211,9 +204,9 @@ static void calc_activity_index(VP9_COMP *cpi, MACROBLOCK *x) {
       b = 4 * act + cpi->activity_avg;
 
       if (b >= a)
-        *(x->activity_ptr) = (int)((b + (a >> 1)) / a) - 1;
+      *(x->activity_ptr) = (int)((b + (a >> 1)) / a) - 1;
       else
-        *(x->activity_ptr) = 1 - (int)((a + (b >> 1)) / b);
+      *(x->activity_ptr) = 1 - (int)((a + (b >> 1)) / b);
 
 #if OUTPUT_NORM_ACT_STATS
       fprintf(f, " %6d", *(x->mb_activity_ptr));
@@ -238,9 +231,9 @@ static void calc_activity_index(VP9_COMP *cpi, MACROBLOCK *x) {
 // Loop through all MBs. Note activity of each, average activity and
 // calculate a normalized activity for each
 static void build_activity_map(VP9_COMP *cpi) {
-  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCK * const x = &cpi->mb;
   MACROBLOCKD *xd = &x->e_mbd;
-  VP9_COMMON *const cm = &cpi->common;
+  VP9_COMMON * const cm = &cpi->common;
 
 #if ALT_ACT_MEASURE
   YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
@@ -285,7 +278,6 @@ static void build_activity_map(VP9_COMP *cpi) {
       x->plane[0].src.buf += 16;
     }
 
-
     // adjust to the next row of mbs
     x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols;
   }
@@ -315,7 +307,7 @@ void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x) {
   a = act + (2 * cpi->activity_avg);
   b = (2 * act) + cpi->activity_avg;
 
-  x->rdmult = (unsigned int)(((int64_t)x->rdmult * b + (a >> 1)) / a);
+  x->rdmult = (unsigned int) (((int64_t) x->rdmult * b + (a >> 1)) / a);
   x->errorperbit = x->rdmult * 100 / (110 * x->rddiv);
   x->errorperbit += (x->errorperbit == 0);
 #endif
@@ -324,15 +316,13 @@ void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x) {
   adjust_act_zbin(cpi, x);
 }
 
-static void update_state(VP9_COMP *cpi,
-                         PICK_MODE_CONTEXT *ctx,
-                         BLOCK_SIZE_TYPE bsize,
-                         int output_enabled) {
+static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
+                         BLOCK_SIZE_TYPE bsize, int output_enabled) {
   int i, x_idx, y;
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
+  MACROBLOCK * const x = &cpi->mb;
+  MACROBLOCKD * const xd = &x->e_mbd;
   MODE_INFO *mi = &ctx->mic;
-  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+  MB_MODE_INFO * const mbmi = &xd->mode_info_context->mbmi;
 #if CONFIG_DEBUG || CONFIG_INTERNAL_STATS
   MB_PREDICTION_MODE mb_mode = mi->mbmi.mode;
 #endif
@@ -352,8 +342,8 @@ static void update_state(VP9_COMP *cpi,
   // when the mode was picked for it
   for (y = 0; y < bh; y++) {
     for (x_idx = 0; x_idx < bw; x_idx++) {
-      if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + bw > x_idx &&
-          (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + bh > y) {
+      if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + bw > x_idx
+          && (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + bh > y) {
         MODE_INFO *mi_addr = xd->mode_info_context + x_idx + y * mis;
         *mi_addr = *mi;
       }
@@ -408,27 +398,27 @@ static void update_state(VP9_COMP *cpi,
 #endif
   } else {
     /*
-            // Reduce the activation RD thresholds for the best choice mode
-            if ((cpi->rd_baseline_thresh[mb_mode_index] > 0) &&
-                (cpi->rd_baseline_thresh[mb_mode_index] < (INT_MAX >> 2)))
-            {
-                int best_adjustment = (cpi->rd_thresh_mult[mb_mode_index] >> 2);
-
-                cpi->rd_thresh_mult[mb_mode_index] =
-                        (cpi->rd_thresh_mult[mb_mode_index]
-                         >= (MIN_THRESHMULT + best_adjustment)) ?
-                                cpi->rd_thresh_mult[mb_mode_index] - best_adjustment :
-                                MIN_THRESHMULT;
-                cpi->rd_threshes[mb_mode_index] =
-                        (cpi->rd_baseline_thresh[mb_mode_index] >> 7)
-                        * cpi->rd_thresh_mult[mb_mode_index];
-
-            }
-    */
+     // Reduce the activation RD thresholds for the best choice mode
+     if ((cpi->rd_baseline_thresh[mb_mode_index] > 0) &&
+     (cpi->rd_baseline_thresh[mb_mode_index] < (INT_MAX >> 2)))
+     {
+     int best_adjustment = (cpi->rd_thresh_mult[mb_mode_index] >> 2);
+
+     cpi->rd_thresh_mult[mb_mode_index] =
+     (cpi->rd_thresh_mult[mb_mode_index]
+     >= (MIN_THRESHMULT + best_adjustment)) ?
+     cpi->rd_thresh_mult[mb_mode_index] - best_adjustment :
+     MIN_THRESHMULT;
+     cpi->rd_threshes[mb_mode_index] =
+     (cpi->rd_baseline_thresh[mb_mode_index] >> 7)
+     * cpi->rd_thresh_mult[mb_mode_index];
+
+     }
+     */
     // Note how often each mode chosen as best
     cpi->mode_chosen_counts[mb_mode_index]++;
-    if (mbmi->ref_frame[0] != INTRA_FRAME &&
-        (mbmi->sb_type < BLOCK_SIZE_SB8X8 || mbmi->mode == NEWMV)) {
+    if (mbmi->ref_frame[0] != INTRA_FRAME
+        && (mbmi->sb_type < BLOCK_SIZE_SB8X8 || mbmi->mode == NEWMV)) {
       int_mv best_mv, best_second_mv;
       const MV_REFERENCE_FRAME rf1 = mbmi->ref_frame[0];
       const MV_REFERENCE_FRAME rf2 = mbmi->ref_frame[1];
@@ -447,21 +437,21 @@ static void update_state(VP9_COMP *cpi,
       int i, j;
       for (j = 0; j < bh; ++j)
         for (i = 0; i < bw; ++i)
-          if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + bw > i &&
-              (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + bh > j)
+          if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + bw > i
+              && (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + bh > j)
             xd->mode_info_context[mis * j + i].mbmi = *mbmi;
     }
 
-    if (cpi->common.mcomp_filter_type == SWITCHABLE &&
-        is_inter_mode(mbmi->mode)) {
-      ++cpi->common.fc.switchable_interp_count
-          [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)]
-          [vp9_switchable_interp_map[mbmi->interp_filter]];
+    if (cpi->common.mcomp_filter_type == SWITCHABLE
+        && is_inter_mode(mbmi->mode)) {
+      ++cpi->common.fc.switchable_interp_count[vp9_get_pred_context(
+          &cpi->common, xd, PRED_SWITCHABLE_INTERP)][vp9_switchable_interp_map[mbmi
+          ->interp_filter]];
     }
 
     cpi->rd_comp_pred_diff[SINGLE_PREDICTION_ONLY] += ctx->single_pred_diff;
-    cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY]   += ctx->comp_pred_diff;
-    cpi->rd_comp_pred_diff[HYBRID_PREDICTION]      += ctx->hybrid_pred_diff;
+    cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff;
+    cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff;
   }
 }
 
@@ -484,29 +474,26 @@ static unsigned find_seg_id(VP9_COMMON *cm, uint8_t *buf, BLOCK_SIZE_TYPE bsize,
   return seg_id;
 }
 
-void vp9_setup_src_planes(MACROBLOCK *x,
-                          const YV12_BUFFER_CONFIG *src,
+void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
                           int mb_row, int mb_col) {
-  uint8_t *buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
-                         src->alpha_buffer};
-  int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
-                    src->alpha_stride};
+  uint8_t *buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, src
+      ->alpha_buffer};
+  int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, src
+      ->alpha_stride};
   int i;
 
   for (i = 0; i < MAX_MB_PLANE; i++) {
-    setup_pred_plane(&x->plane[i].src,
-                     buffers[i], strides[i],
-                     mb_row, mb_col, NULL,
-                     x->e_mbd.plane[i].subsampling_x,
+    setup_pred_plane(&x->plane[i].src, buffers[i], strides[i], mb_row, mb_col,
+                     NULL, x->e_mbd.plane[i].subsampling_x,
                      x->e_mbd.plane[i].subsampling_y);
   }
 }
 
-static void set_offsets(VP9_COMP *cpi,
-                        int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCK *const x = &cpi->mb;
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
+static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
+                        BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCK * const x = &cpi->mb;
+  VP9_COMMON * const cm = &cpi->common;
+  MACROBLOCKD * const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi;
   const int dst_fb_idx = cm->new_fb_idx;
   const int idx_str = xd->mode_info_stride * mi_row + mi_col;
@@ -518,10 +505,10 @@ static void set_offsets(VP9_COMP *cpi,
 
   // entropy context structures
   for (i = 0; i < MAX_MB_PLANE; i++) {
-    xd->plane[i].above_context = cm->above_context[i] +
-        (mi_col * 2 >>  xd->plane[i].subsampling_x);
-    xd->plane[i].left_context = cm->left_context[i] +
-        (((mi_row * 2) & 15) >> xd->plane[i].subsampling_y);
+    xd->plane[i].above_context = cm->above_context[i]
+        + (mi_col * 2 >> xd->plane[i].subsampling_x);
+    xd->plane[i].left_context = cm->left_context[i]
+        + (((mi_row * 2) & 15) >> xd->plane[i].subsampling_y);
   }
 
   // partition contexts
@@ -532,25 +519,24 @@ static void set_offsets(VP9_COMP *cpi,
   x->active_ptr = cpi->active_map + idx_map;
 
   /* pointers to mode info contexts */
-  x->partition_info          = x->pi + idx_str;
-  xd->mode_info_context      = cm->mi + idx_str;
+  x->partition_info = x->pi + idx_str;
+  xd->mode_info_context = cm->mi + idx_str;
   mbmi = &xd->mode_info_context->mbmi;
   // Special case: if prev_mi is NULL, the previous mode info context
   // cannot be used.
-  xd->prev_mode_info_context = cm->prev_mi ?
-                                 cm->prev_mi + idx_str : NULL;
+  xd->prev_mode_info_context = cm->prev_mi ? cm->prev_mi + idx_str : NULL;
 
   // Set up destination pointers
   setup_dst_planes(xd, &cm->yv12_fb[dst_fb_idx], mi_row, mi_col);
 
   /* Set up limit values for MV components to prevent them from
    * extending beyond the UMV borders assuming 16x16 block size */
-  x->mv_row_min = -((mi_row * MI_SIZE) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
-  x->mv_col_min = -((mi_col * MI_SIZE) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
-  x->mv_row_max = ((cm->mi_rows - mi_row) * MI_SIZE +
-                   (VP9BORDERINPIXELS - MI_SIZE * bh - VP9_INTERP_EXTEND));
-  x->mv_col_max = ((cm->mi_cols - mi_col) * MI_SIZE +
-                   (VP9BORDERINPIXELS - MI_SIZE * bw - VP9_INTERP_EXTEND));
+  x->mv_row_min = -((mi_row * MI_SIZE)+ VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
+  x->mv_col_min = -((mi_col * MI_SIZE)+ VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
+  x->mv_row_max = ((cm->mi_rows - mi_row) * MI_SIZE
+      + (VP9BORDERINPIXELS - MI_SIZE * bh - VP9_INTERP_EXTEND));
+  x->mv_col_max = ((cm->mi_cols - mi_col) * MI_SIZE
+      + (VP9BORDERINPIXELS - MI_SIZE * bw - VP9_INTERP_EXTEND));
 
   // Set up distance of MB to edge of frame in 1/8th pel units
   assert(!(mi_col & (bw - 1)) && !(mi_row & (bh - 1)));
@@ -565,30 +551,30 @@ static void set_offsets(VP9_COMP *cpi,
 
   /* segment ID */
   if (xd->segmentation_enabled) {
-    uint8_t *map = xd->update_mb_segmentation_map ? cpi->segmentation_map
-                                                  : cm->last_frame_seg_map;
-    mbmi->segment_id = find_seg_id(cm, map, bsize, mi_row,
-                                   cm->mi_rows, mi_col, cm->mi_cols);
+    uint8_t *map =
+        xd->update_mb_segmentation_map ?
+            cpi->segmentation_map : cm->last_frame_seg_map;
+    mbmi->segment_id = find_seg_id(cm, map, bsize, mi_row, cm->mi_rows, mi_col,
+                                   cm->mi_cols);
 
     assert(mbmi->segment_id <= (MAX_MB_SEGMENTS-1));
     vp9_mb_init_quantizer(cpi, x);
 
-    if (xd->segmentation_enabled && cpi->seg0_cnt > 0 &&
-        !vp9_segfeature_active(xd, 0, SEG_LVL_REF_FRAME) &&
-        vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME)) {
+    if (xd->segmentation_enabled && cpi->seg0_cnt > 0
+        && !vp9_segfeature_active(xd, 0, SEG_LVL_REF_FRAME)
+        && vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME)) {
       cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt;
     } else {
       const int y = mb_row & ~3;
       const int x = mb_col & ~3;
-      const int p16 = ((mb_row & 1) << 1) +  (mb_col & 1);
+      const int p16 = ((mb_row & 1) << 1) + (mb_col & 1);
       const int p32 = ((mb_row & 2) << 2) + ((mb_col & 2) << 1);
-      const int tile_progress =
-          cm->cur_tile_mi_col_start * cm->mb_rows >> 1;
-      const int mb_cols =
-          (cm->cur_tile_mi_col_end - cm->cur_tile_mi_col_start) >> 1;
+      const int tile_progress = cm->cur_tile_mi_col_start * cm->mb_rows >> 1;
+      const int mb_cols = (cm->cur_tile_mi_col_end - cm->cur_tile_mi_col_start)
+          >> 1;
 
-      cpi->seg0_progress =
-          ((y * mb_cols + x * 4 + p32 + p16 + tile_progress) << 16) / cm->MBs;
+      cpi->seg0_progress = ((y * mb_cols + x * 4 + p32 + p16 + tile_progress)
+          << 16) / cm->MBs;
     }
   } else {
     mbmi->segment_id = 0;
@@ -596,11 +582,11 @@ static void set_offsets(VP9_COMP *cpi,
 }
 
 static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col,
-                          TOKENEXTRA **tp, int *totalrate, int *totaldist,
+                          TOKENEXTRA **tp, int *totalrate, int64_t *totaldist,
                           BLOCK_SIZE_TYPE bsize, PICK_MODE_CONTEXT *ctx) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
+  VP9_COMMON * const cm = &cpi->common;
+  MACROBLOCK * const x = &cpi->mb;
+  MACROBLOCKD * const xd = &x->e_mbd;
 
   x->rd_search = 1;
 
@@ -624,22 +610,21 @@ static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col,
 }
 
 static void update_stats(VP9_COMP *cpi, int mi_row, int mi_col) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
+  VP9_COMMON * const cm = &cpi->common;
+  MACROBLOCK * const x = &cpi->mb;
+  MACROBLOCKD * const xd = &x->e_mbd;
   MODE_INFO *mi = xd->mode_info_context;
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
+  MB_MODE_INFO * const mbmi = &mi->mbmi;
 
   if (cm->frame_type != KEY_FRAME) {
     int segment_id, seg_ref_active;
 
     segment_id = mbmi->segment_id;
-    seg_ref_active = vp9_segfeature_active(xd, segment_id,
-                                           SEG_LVL_REF_FRAME);
+    seg_ref_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME);
 
     if (!seg_ref_active)
-      cpi->intra_inter_count[vp9_get_pred_context(cm, xd, PRED_INTRA_INTER)]
-                            [mbmi->ref_frame[0] > INTRA_FRAME]++;
+      cpi->intra_inter_count[vp9_get_pred_context(cm, xd, PRED_INTRA_INTER)][mbmi
+          ->ref_frame[0] > INTRA_FRAME]++;
 
     // If the segment reference feature is enabled we have only a single
     // reference frame allowed for the segment so exclude it from
@@ -647,19 +632,18 @@ static void update_stats(VP9_COMP *cpi, int mi_row, int mi_col) {
     if ((mbmi->ref_frame[0] > INTRA_FRAME) && !seg_ref_active) {
       if (cm->comp_pred_mode == HYBRID_PREDICTION)
         cpi->comp_inter_count[vp9_get_pred_context(cm, xd,
-                                                   PRED_COMP_INTER_INTER)]
-                             [mbmi->ref_frame[1] > INTRA_FRAME]++;
+                                                   PRED_COMP_INTER_INTER)][mbmi
+            ->ref_frame[1] > INTRA_FRAME]++;
 
       if (mbmi->ref_frame[1] > INTRA_FRAME) {
-        cpi->comp_ref_count[vp9_get_pred_context(cm, xd, PRED_COMP_REF_P)]
-                           [mbmi->ref_frame[0] == GOLDEN_FRAME]++;
+        cpi->comp_ref_count[vp9_get_pred_context(cm, xd, PRED_COMP_REF_P)][mbmi
+            ->ref_frame[0] == GOLDEN_FRAME]++;
       } else {
-        cpi->single_ref_count[vp9_get_pred_context(cm, xd, PRED_SINGLE_REF_P1)]
-                             [0][mbmi->ref_frame[0] != LAST_FRAME]++;
+        cpi->single_ref_count[vp9_get_pred_context(cm, xd, PRED_SINGLE_REF_P1)][0][mbmi
+            ->ref_frame[0] != LAST_FRAME]++;
         if (mbmi->ref_frame[0] != LAST_FRAME)
-          cpi->single_ref_count[vp9_get_pred_context(cm, xd,
-                                                     PRED_SINGLE_REF_P2)]
-                               [1][mbmi->ref_frame[0] != GOLDEN_FRAME]++;
+          cpi->single_ref_count[vp9_get_pred_context(cm, xd, PRED_SINGLE_REF_P2)][1][mbmi
+              ->ref_frame[0] != GOLDEN_FRAME]++;
       }
     }
     // Count of last ref frame 0,0 usage
@@ -673,7 +657,7 @@ static void update_stats(VP9_COMP *cpi, int mi_row, int mi_col) {
 // partition down to 4x4 block size is enabled.
 static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x,
                                             BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
+  MACROBLOCKD * const xd = &x->e_mbd;
 
   switch (bsize) {
     case BLOCK_SIZE_SB64X64:
@@ -704,7 +688,7 @@ static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x,
       return &x->ab4x4_context[xd->sb_index][xd->mb_index][xd->b_index];
     default:
       assert(0);
-      return NULL;
+      return NULL ;
   }
 }
 
@@ -722,48 +706,45 @@ static BLOCK_SIZE_TYPE *get_sb_partitioning(MACROBLOCK *x,
       return &x->b_partitioning[xd->sb_index][xd->mb_index][xd->b_index];
     default:
       assert(0);
-      return NULL;
+      return NULL ;
   }
 }
 
 static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col,
                             ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
                             ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
-                            PARTITION_CONTEXT sa[8],
-                            PARTITION_CONTEXT sl[8],
+                            PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
                             BLOCK_SIZE_TYPE bsize) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
+  VP9_COMMON * const cm = &cpi->common;
+  MACROBLOCK * const x = &cpi->mb;
+  MACROBLOCKD * const xd = &x->e_mbd;
   int p;
   int bwl = b_width_log2(bsize), bw = 1 << bwl;
   int bhl = b_height_log2(bsize), bh = 1 << bhl;
   int mwl = mi_width_log2(bsize), mw = 1 << mwl;
   int mhl = mi_height_log2(bsize), mh = 1 << mhl;
   for (p = 0; p < MAX_MB_PLANE; p++) {
-    vpx_memcpy(cm->above_context[p] +
-               ((mi_col * 2) >> xd->plane[p].subsampling_x),
-               a + bw * p,
-               sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x);
-    vpx_memcpy(cm->left_context[p] +
-               ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
-               l + bh * p,
-               sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y);
-  }
+    vpx_memcpy(
+        cm->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x),
+        a + bw * p, sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x);
+    vpx_memcpy(
+        cm->left_context[p]
+            + ((mi_row & MI_MASK)* 2 >> xd->plane[p].subsampling_y),l + bh * p,
+            sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y);
+          }
   vpx_memcpy(cm->above_seg_context + mi_col, sa,
              sizeof(PARTITION_CONTEXT) * mw);
   vpx_memcpy(cm->left_seg_context + (mi_row & MI_MASK), sl,
-             sizeof(PARTITION_CONTEXT) * mh);
-}
+  sizeof(PARTITION_CONTEXT) * mh)
+             ;}
 static void save_context(VP9_COMP *cpi, int mi_row, int mi_col,
-                          ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
-                          ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
-                          PARTITION_CONTEXT sa[8],
-                          PARTITION_CONTEXT sl[8],
-                          BLOCK_SIZE_TYPE bsize) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
+                         ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
+                         ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
+                         PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
+                         BLOCK_SIZE_TYPE bsize) {
+  VP9_COMMON * const cm = &cpi->common;
+  MACROBLOCK * const x = &cpi->mb;
+  MACROBLOCKD * const xd = &x->e_mbd;
   int p;
   int bwl = b_width_log2(bsize), bw = 1 << bwl;
   int bhl = b_height_log2(bsize), bh = 1 << bhl;
@@ -772,25 +753,26 @@ static void save_context(VP9_COMP *cpi, int mi_row, int mi_col,
 
   // buffer the above/left context information of the block in search.
   for (p = 0; p < MAX_MB_PLANE; ++p) {
-    vpx_memcpy(a + bw * p, cm->above_context[p] +
-               (mi_col * 2 >> xd->plane[p].subsampling_x),
-               sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x);
-    vpx_memcpy(l + bh * p, cm->left_context[p] +
-               ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
-               sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y);
-  }
+    vpx_memcpy(
+        a + bw * p,
+        cm->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x),
+        sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x);
+    vpx_memcpy(
+        l + bh * p,
+        cm->left_context[p]
+            + ((mi_row & MI_MASK)* 2 >> xd->plane[p].subsampling_y),sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y);
+          }
   vpx_memcpy(sa, cm->above_seg_context + mi_col,
              sizeof(PARTITION_CONTEXT) * mw);
   vpx_memcpy(sl, cm->left_seg_context + (mi_row & MI_MASK),
-             sizeof(PARTITION_CONTEXT) * mh);
-}
+  sizeof(PARTITION_CONTEXT) * mh)
+             ;}
 
-static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp,
-                     int mi_row, int mi_col, int output_enabled,
-                     BLOCK_SIZE_TYPE bsize, int sub_index) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
+static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
+                     int output_enabled, BLOCK_SIZE_TYPE bsize, int sub_index) {
+  VP9_COMMON * const cm = &cpi->common;
+  MACROBLOCK * const x = &cpi->mb;
+  MACROBLOCKD * const xd = &x->e_mbd;
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
@@ -813,12 +795,11 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp,
   }
 }
 
-static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp,
-                      int mi_row, int mi_col, int output_enabled,
-                      BLOCK_SIZE_TYPE bsize) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
+static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
+                      int output_enabled, BLOCK_SIZE_TYPE bsize) {
+  VP9_COMMON * const cm = &cpi->common;
+  MACROBLOCK * const x = &cpi->mb;
+  MACROBLOCKD * const xd = &x->e_mbd;
   BLOCK_SIZE_TYPE c1 = BLOCK_SIZE_SB8X8;
   const int bsl = b_width_log2(bsize), bs = (1 << bsl) / 4;
   int bwl, bhl;
@@ -838,17 +819,17 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp,
 
   if (bsl == bwl && bsl == bhl) {
     if (output_enabled && bsize >= BLOCK_SIZE_SB8X8)
-        cpi->partition_count[pl][PARTITION_NONE]++;
+      cpi->partition_count[pl][PARTITION_NONE]++;
     encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1);
   } else if (bsl == bhl && bsl > bwl) {
     if (output_enabled)
       cpi->partition_count[pl][PARTITION_VERT]++;
-    encode_b(cpi, tp, mi_row, mi_col,      output_enabled, c1, 0);
+    encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0);
     encode_b(cpi, tp, mi_row, mi_col + bs, output_enabled, c1, 1);
   } else if (bsl == bwl && bsl > bhl) {
     if (output_enabled)
       cpi->partition_count[pl][PARTITION_HORZ]++;
-    encode_b(cpi, tp, mi_row,      mi_col, output_enabled, c1, 0);
+    encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0);
     encode_b(cpi, tp, mi_row + bs, mi_col, output_enabled, c1, 1);
   } else {
     BLOCK_SIZE_TYPE subsize;
@@ -869,8 +850,8 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp,
     }
   }
 
-  if (bsize >= BLOCK_SIZE_SB8X8 &&
-      (bsize == BLOCK_SIZE_SB8X8 || bsl == bwl || bsl == bhl)) {
+  if (bsize >= BLOCK_SIZE_SB8X8
+      && (bsize == BLOCK_SIZE_SB8X8 || bsl == bwl || bsl == bhl)) {
     set_partition_seg_context(cm, xd, mi_row, mi_col);
     update_partition_context(xd, c1, bsize);
   }
@@ -880,26 +861,28 @@ static void set_partitioning(VP9_COMP *cpi, MODE_INFO *m,
                              BLOCK_SIZE_TYPE bsize) {
   VP9_COMMON *const cm = &cpi->common;
   const int mis = cm->mode_info_stride;
-  int bsl = b_width_log2(bsize);
-  int bs = (1 << bsl) / 2;  //
   int block_row, block_col;
-  int row, col;
-
-  // this test function sets the entire macroblock to the same bsize
-  for (block_row = 0; block_row < 8; block_row += bs) {
-    for (block_col = 0; block_col < 8; block_col += bs) {
-      for (row = 0; row < bs; row++) {
-        for (col = 0; col < bs; col++) {
-          m[(block_row+row)*mis + block_col+col].mbmi.sb_type = bsize;
-        }
-      }
+  for (block_row = 0; block_row < 8; ++block_row) {
+    for (block_col = 0; block_col < 8; ++block_col) {
+      m[block_row * mis + block_col].mbmi.sb_type = bsize;
+    }
+  }
+}
+static void copy_partitioning(VP9_COMP *cpi, MODE_INFO *m, MODE_INFO *p) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int mis = cm->mode_info_stride;
+  int block_row, block_col;
+  for (block_row = 0; block_row < 8; ++block_row) {
+    for (block_col = 0; block_col < 8; ++block_col) {
+      m[block_row * mis + block_col].mbmi.sb_type =
+          p[block_row * mis + block_col].mbmi.sb_type;
     }
   }
 }
 
-static void set_block_size(VP9_COMMON *const cm,
-                           MODE_INFO *m, BLOCK_SIZE_TYPE bsize, int mis,
-                           int mi_row, int mi_col) {
+static void set_block_size(VP9_COMMON * const cm, MODE_INFO *m,
+                           BLOCK_SIZE_TYPE bsize, int mis, int mi_row,
+                           int mi_col) {
   int row, col;
   int bwl = b_width_log2(bsize);
   int bhl = b_height_log2(bsize);
@@ -911,10 +894,11 @@ static void set_block_size(VP9_COMMON *const cm,
     for (col = 0; col < bs; col++) {
       if (mi_row + row >= cm->mi_rows || mi_col + col >= cm->mi_cols)
         continue;
-      m2[row*mis+col].mbmi.sb_type = bsize;
+      m2[row * mis + col].mbmi.sb_type = bsize;
     }
   }
 }
+
 typedef struct {
   int64_t sum_square_error;
   int64_t sum_error;
@@ -922,11 +906,15 @@ typedef struct {
   int variance;
 } var;
 
+typedef struct {
+  var none;
+  var horz[2];
+  var vert[2];
+} partition_variance;
+
 #define VT(TYPE, BLOCKSIZE) \
   typedef struct { \
-    var none; \
-    var horz[2]; \
-    var vert[2]; \
+    partition_variance vt; \
     BLOCKSIZE split[4]; } TYPE;
 
 VT(v8x8, var)
@@ -934,20 +922,67 @@ VT(v16x16, v8x8)
 VT(v32x32, v16x16)
 VT(v64x64, v32x32)
 
+typedef struct {
+  partition_variance *vt;
+  var *split[4];
+} vt_node;
+
 typedef enum {
   V16X16,
   V32X32,
   V64X64,
 } TREE_LEVEL;
 
+static void tree_to_node(void *data, BLOCK_SIZE_TYPE block_size, vt_node *node) {
+  int i;
+  switch (block_size) {
+    case BLOCK_SIZE_SB64X64: {
+      v64x64 *vt = (v64x64 *) data;
+      node->vt = &vt->vt;
+      for (i = 0; i < 4; i++)
+        node->split[i] = &vt->split[i].vt.none;
+      break;
+    }
+    case BLOCK_SIZE_SB32X32: {
+      v32x32 *vt = (v32x32 *) data;
+      node->vt = &vt->vt;
+      for (i = 0; i < 4; i++)
+        node->split[i] = &vt->split[i].vt.none;
+      break;
+    }
+    case BLOCK_SIZE_MB16X16: {
+      v16x16 *vt = (v16x16 *) data;
+      node->vt = &vt->vt;
+      for (i = 0; i < 4; i++)
+        node->split[i] = &vt->split[i].vt.none;
+      break;
+    }
+    case BLOCK_SIZE_SB8X8: {
+      v8x8 *vt = (v8x8 *) data;
+      node->vt = &vt->vt;
+      for (i = 0; i < 4; i++)
+        node->split[i] = &vt->split[i];
+      break;
+    }
+    default:
+      node->vt = 0;
+      for (i = 0; i < 4; i++)
+        node->split[i] = 0;
+      assert(-1);
+  }
+}
+
 // Set variance values given sum square error, sum error, count.
 static void fill_variance(var *v, int64_t s2, int64_t s, int c) {
   v->sum_square_error = s2;
   v->sum_error = s;
   v->count = c;
-  v->variance = 256
-      * (v->sum_square_error - v->sum_error * v->sum_error / v->count)
-      / v->count;
+  if (c > 0)
+    v->variance = 256
+        * (v->sum_square_error - v->sum_error * v->sum_error / v->count)
+        / v->count;
+  else
+    v->variance = 0;
 }
 
 // Combine 2 variance structures by summing the sum_error, sum_square_error,
@@ -956,31 +991,95 @@ void sum_2_variances(var *r, var *a, var*b) {
   fill_variance(r, a->sum_square_error + b->sum_square_error,
                 a->sum_error + b->sum_error, a->count + b->count);
 }
-// Fill one level of our variance tree,  by summing the split sums into each of
-// the horizontal, vertical and none from split and recalculating variance.
-#define fill_variance_tree(VT) \
-  sum_2_variances(VT.horz[0], VT.split[0].none, VT.split[1].none); \
-  sum_2_variances(VT.horz[1], VT.split[2].none, VT.split[3].none); \
-  sum_2_variances(VT.vert[0], VT.split[0].none, VT.split[2].none); \
-  sum_2_variances(VT.vert[1], VT.split[1].none, VT.split[3].none); \
-  sum_2_variances(VT.none, VT.vert[0], VT.vert[1]);
-
-// Set the blocksize in the macroblock info structure if the variance is less
-// than our threshold to one of none, horz, vert.
-#define set_vt_size(VT, BLOCKSIZE, R, C, ACTION) \
-  if (VT.none.variance < threshold) { \
-    set_block_size(cm, m, BLOCKSIZE, mis, R, C); \
-    ACTION; \
-  } \
-  if (VT.horz[0].variance < threshold && VT.horz[1].variance < threshold ) { \
-    set_block_size(cm, m, get_subsize(BLOCKSIZE, PARTITION_HORZ), mis, R, C); \
-    ACTION; \
-  } \
-  if (VT.vert[0].variance < threshold && VT.vert[1].variance < threshold ) { \
-    set_block_size(cm, m, get_subsize(BLOCKSIZE, PARTITION_VERT), mis, R, C); \
-    ACTION; \
+
+static void fill_variance_tree(void *data, BLOCK_SIZE_TYPE block_size) {
+  vt_node node;
+  tree_to_node(data, block_size, &node);
+  sum_2_variances(&node.vt->horz[0], node.split[0], node.split[1]);
+  sum_2_variances(&node.vt->horz[1], node.split[2], node.split[3]);
+  sum_2_variances(&node.vt->vert[0], node.split[0], node.split[2]);
+  sum_2_variances(&node.vt->vert[1], node.split[1], node.split[3]);
+  sum_2_variances(&node.vt->none, &node.vt->vert[0], &node.vt->vert[1]);
+}
+
+#if PERFORM_RANDOM_PARTITIONING
+static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO *m,
+    BLOCK_SIZE_TYPE block_size, int mi_row,
+    int mi_col, int mi_size) {
+  VP9_COMMON * const cm = &cpi->common;
+  vt_node vt;
+  const int mis = cm->mode_info_stride;
+  int64_t threshold = 4 * cpi->common.base_qindex * cpi->common.base_qindex;
+
+  tree_to_node(data, block_size, &vt);
+
+  // split none is available only if we have more than half a block size
+  // in width and height inside the visible image
+  if (mi_col + mi_size < cm->mi_cols && mi_row + mi_size < cm->mi_rows &&
+      (rand() & 3) < 1) {
+    set_block_size(cm, m, block_size, mis, mi_row, mi_col);
+    return 1;
+  }
+
+  // vertical split is available on all but the bottom border
+  if (mi_row + mi_size < cm->mi_rows && vt.vt->vert[0].variance < threshold
+      && (rand() & 3) < 1) {
+    set_block_size(cm, m, get_subsize(block_size, PARTITION_VERT), mis, mi_row,
+        mi_col);
+    return 1;
   }
 
+  // horizontal split is available on all but the right border
+  if (mi_col + mi_size < cm->mi_cols && vt.vt->horz[0].variance < threshold
+      && (rand() & 3) < 1) {
+    set_block_size(cm, m, get_subsize(block_size, PARTITION_HORZ), mis, mi_row,
+        mi_col);
+    return 1;
+  }
+
+  return 0;
+}
+
+#else
+
+static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO *m,
+                               BLOCK_SIZE_TYPE block_size, int mi_row,
+                               int mi_col, int mi_size) {
+  VP9_COMMON * const cm = &cpi->common;
+  vt_node vt;
+  const int mis = cm->mode_info_stride;
+  int64_t threshold = 50 * cpi->common.base_qindex;
+
+  tree_to_node(data, block_size, &vt);
+
+  // split none is available only if we have more than half a block size
+  // in width and height inside the visible image
+  if (mi_col + mi_size < cm->mi_cols && mi_row + mi_size < cm->mi_rows
+      && vt.vt->none.variance < threshold) {
+    set_block_size(cm, m, block_size, mis, mi_row, mi_col);
+    return 1;
+  }
+
+  // vertical split is available on all but the bottom border
+  if (mi_row + mi_size < cm->mi_rows && vt.vt->vert[0].variance < threshold
+      && vt.vt->vert[1].variance < threshold) {
+    set_block_size(cm, m, get_subsize(block_size, PARTITION_VERT), mis, mi_row,
+                   mi_col);
+    return 1;
+  }
+
+  // horizontal split is available on all but the right border
+  if (mi_col + mi_size < cm->mi_cols && vt.vt->horz[0].variance < threshold
+      && vt.vt->horz[1].variance < threshold) {
+    set_block_size(cm, m, get_subsize(block_size, PARTITION_HORZ), mis, mi_row,
+                   mi_col);
+    return 1;
+  }
+
+  return 0;
+}
+#endif
+
 static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row,
                                 int mi_col) {
   VP9_COMMON * const cm = &cpi->common;
@@ -993,8 +1092,8 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row,
   v64x64 vt;
   unsigned char * s;
   int sp;
-  const unsigned char * d = xd->plane[0].pre->buf;
-  int dp = xd->plane[0].pre->stride;
+  const unsigned char * d;
+  int dp;
   int pixels_wide = 64, pixels_high = 64;
 
   vpx_memset(&vt, 0, sizeof(vt));
@@ -1014,81 +1113,89 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row,
   // but this needs more experimentation.
   threshold = threshold * cpi->common.base_qindex * cpi->common.base_qindex;
 
-  // if ( cm->frame_type == KEY_FRAME ) {
   d = vp9_64x64_zeros;
   dp = 64;
-  // }
+  if (cm->frame_type != KEY_FRAME) {
+    int_mv nearest_mv, near_mv;
+    YV12_BUFFER_CONFIG *ref_fb = &cm->yv12_fb[0];
+    YV12_BUFFER_CONFIG *second_ref_fb = NULL;
+
+    setup_pre_planes(xd, ref_fb, second_ref_fb, mi_row, mi_col,
+                     xd->scale_factor, xd->scale_factor_uv);
+    xd->mode_info_context->mbmi.ref_frame[0] = LAST_FRAME;
+    xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB64X64;
+    vp9_find_best_ref_mvs(xd, m->mbmi.ref_mvs[m->mbmi.ref_frame[0]],
+                          &nearest_mv, &near_mv);
+
+    xd->mode_info_context->mbmi.mv[0] = nearest_mv;
+    vp9_build_inter_predictors_sby(xd, mi_row, mi_col, BLOCK_SIZE_SB64X64);
+    d = xd->plane[0].dst.buf;
+    dp = xd->plane[0].dst.stride;
+
+  }
 
   // Fill in the entire tree of 8x8 variances for splits.
   for (i = 0; i < 4; i++) {
     const int x32_idx = ((i & 1) << 5);
     const int y32_idx = ((i >> 1) << 5);
     for (j = 0; j < 4; j++) {
-      const int x_idx = x32_idx + ((j & 1) << 4);
-      const int y_idx = y32_idx + ((j >> 1) << 4);
-      const uint8_t *st = s + y_idx * sp + x_idx;
-      const uint8_t *dt = d + y_idx * dp + x_idx;
-      unsigned int sse = 0;
-      int sum = 0;
+      const int x16_idx = x32_idx + ((j & 1) << 4);
+      const int y16_idx = y32_idx + ((j >> 1) << 4);
       v16x16 *vst = &vt.split[i].split[j];
-      sse = sum = 0;
-      if (x_idx < pixels_wide && y_idx < pixels_high)
-        vp9_get_sse_sum_8x8(st, sp, dt, dp, &sse, &sum);
-      fill_variance(&vst->split[0].none, sse, sum, 64);
-      sse = sum = 0;
-      if (x_idx + 8 < pixels_wide && y_idx < pixels_high)
-        vp9_get_sse_sum_8x8(st + 8, sp, dt + 8, dp, &sse, &sum);
-      fill_variance(&vst->split[1].none, sse, sum, 64);
-      sse = sum = 0;
-      if (x_idx < pixels_wide && y_idx + 8 < pixels_high)
-        vp9_get_sse_sum_8x8(st + 8 * sp, sp, dt + 8 * dp, dp, &sse, &sum);
-      fill_variance(&vst->split[2].none, sse, sum, 64);
-      sse = sum = 0;
-      if (x_idx + 8 < pixels_wide && y_idx + 8 < pixels_high)
-        vp9_get_sse_sum_8x8(st + 8 * sp + 8, sp, dt + 8 + 8 * dp, dp, &sse,
-                            &sum);
-      fill_variance(&vst->split[3].none, sse, sum, 64);
+      for (k = 0; k < 4; k++) {
+        int x_idx = x16_idx + ((k & 1) << 3);
+        int y_idx = y16_idx + ((k >> 1) << 3);
+        unsigned int sse = 0;
+        int sum = 0;
+        if (x_idx < pixels_wide && y_idx < pixels_high)
+          vp9_get_sse_sum_8x8(s + y_idx * sp + x_idx, sp,
+                              d + y_idx * dp + x_idx, dp, &sse, &sum);
+        fill_variance(&vst->split[k].vt.none, sse, sum, 64);
+      }
     }
   }
   // Fill the rest of the variance tree by summing the split partition
   // values.
   for (i = 0; i < 4; i++) {
     for (j = 0; j < 4; j++) {
-      fill_variance_tree(&vt.split[i].split[j])
+      fill_variance_tree(&vt.split[i].split[j], BLOCK_SIZE_MB16X16);
     }
-    fill_variance_tree(&vt.split[i])
+    fill_variance_tree(&vt.split[i], BLOCK_SIZE_SB32X32);
   }
-  fill_variance_tree(&vt)
-
-  // Now go through the entire structure,  splitting every blocksize until
+  fill_variance_tree(&vt, BLOCK_SIZE_SB64X64);
+  // Now go through the entire structure,  splitting every block size until
   // we get to one that's got a variance lower than our threshold,  or we
   // hit 8x8.
-  set_vt_size( vt, BLOCK_SIZE_SB64X64, mi_row, mi_col, return);
-  for (i = 0; i < 4; ++i) {
-    const int x32_idx = ((i & 1) << 2);
-    const int y32_idx = ((i >> 1) << 2);
-    set_vt_size(vt, BLOCK_SIZE_SB32X32, mi_row + y32_idx, mi_col + x32_idx,
-                continue);
-
-    for (j = 0; j < 4; ++j) {
-      const int x16_idx = ((j & 1) << 1);
-      const int y16_idx = ((j >> 1) << 1);
-      set_vt_size(vt, BLOCK_SIZE_MB16X16, mi_row + y32_idx + y16_idx,
-                  mi_col+x32_idx+x16_idx, continue);
-
-      for (k = 0; k < 4; ++k) {
-        const int x8_idx = (k & 1);
-        const int y8_idx = (k >> 1);
-        set_block_size(cm, m, BLOCK_SIZE_SB8X8, mis,
-                       mi_row + y32_idx + y16_idx + y8_idx,
-                       mi_col + x32_idx + x16_idx + x8_idx);
+  if (!set_vt_partitioning(cpi, &vt, m, BLOCK_SIZE_SB64X64, mi_row, mi_col,
+                           4)) {
+    for (i = 0; i < 4; ++i) {
+      const int x32_idx = ((i & 1) << 2);
+      const int y32_idx = ((i >> 1) << 2);
+      if (!set_vt_partitioning(cpi, &vt.split[i], m, BLOCK_SIZE_SB32X32,
+                               (mi_row + y32_idx), (mi_col + x32_idx), 2)) {
+        for (j = 0; j < 4; ++j) {
+          const int x16_idx = ((j & 1) << 1);
+          const int y16_idx = ((j >> 1) << 1);
+          if (!set_vt_partitioning(cpi, &vt.split[i].split[j], m,
+                                   BLOCK_SIZE_MB16X16,
+                                   (mi_row + y32_idx + y16_idx),
+                                   (mi_col + x32_idx + x16_idx), 1)) {
+            for (k = 0; k < 4; ++k) {
+              const int x8_idx = (k & 1);
+              const int y8_idx = (k >> 1);
+              set_block_size(cm, m, BLOCK_SIZE_SB8X8, mis,
+                             (mi_row + y32_idx + y16_idx + y8_idx),
+                             (mi_col + x32_idx + x16_idx + x8_idx));
+            }
+          }
+        }
       }
     }
   }
 }
 static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
                              int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize,
-                             int *rate, int *dist) {
+                             int *rate, int64_t *dist) {
   VP9_COMMON * const cm = &cpi->common;
   MACROBLOCK * const x = &cpi->mb;
   MACROBLOCKD *xd = &cpi->mb.e_mbd;
@@ -1098,18 +1205,18 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
   int bsl = b_width_log2(bsize);
   int bh = (1 << bhl);
   int bs = (1 << bsl);
-  int bss = (1 << bsl)/4;
+  int bss = (1 << bsl) / 4;
   int i, pl;
   PARTITION_TYPE partition;
   BLOCK_SIZE_TYPE subsize;
   ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
   PARTITION_CONTEXT sl[8], sa[8];
-  int r = 0, d = 0;
+  int r = 0;
+  int64_t d = 0;
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-
   // parse the partition type
   if ((bwl == bsl) && (bhl == bsl))
     partition = PARTITION_NONE;
@@ -1124,18 +1231,15 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
 
   subsize = get_subsize(bsize, partition);
 
-  // TODO(JBB): this restriction is here because pick_sb_modes can return
-  // r's that are INT_MAX meaning we can't select a mode / mv for this block.
-  // when the code is made to work for less than sb8x8 we need to come up with
-  // a solution to this problem.
-  assert(subsize >= BLOCK_SIZE_SB8X8);
-
-  if (bsize >= BLOCK_SIZE_SB8X8) {
-    xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK);
-    xd->above_seg_context = cm->above_seg_context + mi_col;
+  if (bsize < BLOCK_SIZE_SB8X8) {
+    if (xd->ab_index != 0) {
+      *rate = 0;
+      *dist = 0;
+      return;
+    }
+  } else {
     *(get_sb_partitioning(x, bsize)) = subsize;
   }
-
   pl = partition_plane_context(xd, bsize);
   save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
   switch (partition) {
@@ -1149,7 +1253,8 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
       pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, subsize,
                     get_block_context(x, subsize));
       if (mi_row + (bh >> 1) <= cm->mi_rows) {
-        int rt, dt;
+        int rt;
+        int64_t dt;
         update_state(cpi, get_block_context(x, subsize), subsize, 0);
         encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
         *(get_sb_index(xd, subsize)) = 1;
@@ -1167,7 +1272,8 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
       pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, subsize,
                     get_block_context(x, subsize));
       if (mi_col + (bs >> 1) <= cm->mi_cols) {
-        int rt, dt;
+        int rt;
+        int64_t dt;
         update_state(cpi, get_block_context(x, subsize), subsize, 0);
         encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
         *(get_sb_index(xd, subsize)) = 1;
@@ -1186,7 +1292,8 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
         int x_idx = (i & 1) * (bs >> 2);
         int y_idx = (i >> 1) * (bs >> 2);
         int jj = i >> 1, ii = i & 0x01;
-        int rt, dt;
+        int rt;
+        int64_t dt;
 
         if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
           continue;
@@ -1206,17 +1313,6 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
       assert(0);
   }
 
-  // update partition context
-#if CONFIG_AB4X4
-  if (bsize >= BLOCK_SIZE_SB8X8 &&
-      (bsize == BLOCK_SIZE_SB8X8 || partition != PARTITION_SPLIT)) {
-#else
-  if (bsize > BLOCK_SIZE_SB8X8
-      && (bsize == BLOCK_SIZE_MB16X16 || partition != PARTITION_SPLIT)) {
-#endif
-    set_partition_seg_context(cm, xd, mi_row, mi_col);
-    update_partition_context(xd, subsize, bsize);
-  }
   restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
 
   if (r < INT_MAX && d < INT_MAX)
@@ -1229,21 +1325,21 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
 // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
 // unlikely to be selected depending on previously rate-distortion optimization
 // results, for encoding speed-up.
-static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp,
-                              int mi_row, int mi_col,
-                              BLOCK_SIZE_TYPE bsize,
-                              int *rate, int *dist) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
+static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
+                              int mi_col, BLOCK_SIZE_TYPE bsize, int *rate,
+                              int64_t *dist) {
+  VP9_COMMON * const cm = &cpi->common;
+  MACROBLOCK * const x = &cpi->mb;
+  MACROBLOCKD * const xd = &x->e_mbd;
   int bsl = b_width_log2(bsize), bs = 1 << bsl;
   int ms = bs / 2;
-  ENTROPY_CONTEXT   l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
+  ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
   PARTITION_CONTEXT sl[8], sa[8];
   TOKENEXTRA *tp_orig = *tp;
   int i, pl;
   BLOCK_SIZE_TYPE subsize;
-  int srate = INT_MAX, sdist = INT_MAX;
+  int srate = INT_MAX;
+  int64_t sdist = INT_MAX;
 
   if (bsize < BLOCK_SIZE_SB8X8)
     if (xd->ab_index != 0) {
@@ -1256,121 +1352,132 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp,
   save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
 
   // PARTITION_SPLIT
-  if (bsize >= BLOCK_SIZE_SB8X8) {
-    int r4 = 0, d4 = 0;
-    subsize = get_subsize(bsize, PARTITION_SPLIT);
-    *(get_sb_partitioning(x, bsize)) = subsize;
+  if (!cpi->sf.use_partitions_greater_than
+      || (cpi->sf.use_partitions_greater_than
+          && bsize > cpi->sf.greater_than_block_size)) {
+    if (bsize >= BLOCK_SIZE_SB8X8) {
+      int r4 = 0;
+      int64_t d4 = 0;
+      subsize = get_subsize(bsize, PARTITION_SPLIT);
+      *(get_sb_partitioning(x, bsize)) = subsize;
 
-    for (i = 0; i < 4; ++i) {
-      int x_idx = (i & 1) * (ms >> 1);
-      int y_idx = (i >> 1) * (ms >> 1);
-      int r = 0, d = 0;
+      for (i = 0; i < 4; ++i) {
+        int x_idx = (i & 1) * (ms >> 1);
+        int y_idx = (i >> 1) * (ms >> 1);
+        int r = 0;
+        int64_t d = 0;
 
-      if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
-        continue;
+        if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
+          continue;
 
-      *(get_sb_index(xd, subsize)) = i;
-      rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize,
-                        &r, &d);
+        *(get_sb_index(xd, subsize)) = i;
+        rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize, &r,
+                          &d);
 
-      r4 += r;
-      d4 += d;
+        r4 += r;
+        d4 += d;
+      }
+      set_partition_seg_context(cm, xd, mi_row, mi_col);
+      pl = partition_plane_context(xd, bsize);
+      if (r4 < INT_MAX)
+        r4 += x->partition_cost[pl][PARTITION_SPLIT];
+      assert(r4 >= 0);
+      assert(d4 >= 0);
+      srate = r4;
+      sdist = d4;
+      restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
     }
-    set_partition_seg_context(cm, xd, mi_row, mi_col);
-    pl = partition_plane_context(xd, bsize);
-    if (r4 < INT_MAX)
-      r4 += x->partition_cost[pl][PARTITION_SPLIT];
-    assert(r4 >= 0);
-    assert(d4 >= 0);
-    srate = r4;
-    sdist = d4;
-    restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
   }
-
-  // PARTITION_HORZ
-  if (bsize >= BLOCK_SIZE_SB8X8 && mi_col + (ms >> 1) < cm->mi_cols) {
-    int r2, d2;
-    int r = 0, d = 0;
-    subsize = get_subsize(bsize, PARTITION_HORZ);
-    *(get_sb_index(xd, subsize)) = 0;
-    pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,
-                  get_block_context(x, subsize));
-
-    if (mi_row + (ms >> 1) < cm->mi_rows) {
-      update_state(cpi, get_block_context(x, subsize), subsize, 0);
-      encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
-
-      *(get_sb_index(xd, subsize)) = 1;
-      pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, tp, &r, &d, subsize,
+  if (!cpi->sf.use_partitions_less_than
+      || (cpi->sf.use_partitions_less_than
+          && bsize <= cpi->sf.less_than_block_size)) {
+    // PARTITION_HORZ
+    if (bsize >= BLOCK_SIZE_SB8X8 && mi_col + (ms >> 1) < cm->mi_cols) {
+      int r2, r = 0;
+      int64_t d2, d = 0;
+      subsize = get_subsize(bsize, PARTITION_HORZ);
+      *(get_sb_index(xd, subsize)) = 0;
+      pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,
                     get_block_context(x, subsize));
-      r2 += r;
-      d2 += d;
-    }
-    set_partition_seg_context(cm, xd, mi_row, mi_col);
-    pl = partition_plane_context(xd, bsize);
-    if (r2 < INT_MAX)
-      r2 += x->partition_cost[pl][PARTITION_HORZ];
-    if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
-        RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
-      srate = r2;
-      sdist = d2;
-      *(get_sb_partitioning(x, bsize)) = subsize;
+
+      if (mi_row + (ms >> 1) < cm->mi_rows) {
+        update_state(cpi, get_block_context(x, subsize), subsize, 0);
+        encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+
+        *(get_sb_index(xd, subsize)) = 1;
+        pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, tp, &r, &d, subsize,
+                      get_block_context(x, subsize));
+        r2 += r;
+        d2 += d;
+      }
+      set_partition_seg_context(cm, xd, mi_row, mi_col);
+      pl = partition_plane_context(xd, bsize);
+      if (r2 < INT_MAX)
+        r2 += x->partition_cost[pl][PARTITION_HORZ];
+      if (RDCOST(x->rdmult, x->rddiv, r2, d2)
+          < RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
+        srate = r2;
+        sdist = d2;
+        *(get_sb_partitioning(x, bsize)) = subsize;
+      }
+      restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
     }
-    restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
-  }
 
-  // PARTITION_VERT
-  if (bsize >= BLOCK_SIZE_SB8X8 && mi_row + (ms >> 1) < cm->mi_rows) {
-    int r2, d2;
-    subsize = get_subsize(bsize, PARTITION_VERT);
-    *(get_sb_index(xd, subsize)) = 0;
-    pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,
-                  get_block_context(x, subsize));
-    if (mi_col + (ms >> 1) < cm->mi_cols) {
-      int r = 0, d = 0;
-      update_state(cpi, get_block_context(x, subsize), subsize, 0);
-      encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
-
-      *(get_sb_index(xd, subsize)) = 1;
-      pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), tp, &r, &d, subsize,
+    // PARTITION_VERT
+    if (bsize >= BLOCK_SIZE_SB8X8 && mi_row + (ms >> 1) < cm->mi_rows) {
+      int r2;
+      int64_t d2;
+      subsize = get_subsize(bsize, PARTITION_VERT);
+      *(get_sb_index(xd, subsize)) = 0;
+      pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,
                     get_block_context(x, subsize));
-      r2 += r;
-      d2 += d;
-    }
-    set_partition_seg_context(cm, xd, mi_row, mi_col);
-    pl = partition_plane_context(xd, bsize);
-    if (r2 < INT_MAX)
-      r2 += x->partition_cost[pl][PARTITION_VERT];
-    if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
-        RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
-      srate = r2;
-      sdist = d2;
-      *(get_sb_partitioning(x, bsize)) = subsize;
-    }
-    restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
-  }
+      if (mi_col + (ms >> 1) < cm->mi_cols) {
+        int r = 0;
+        int64_t d = 0;
+        update_state(cpi, get_block_context(x, subsize), subsize, 0);
+        encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
 
-  // PARTITION_NONE
-  if ((mi_row + (ms >> 1) < cm->mi_rows) &&
-      (mi_col + (ms >> 1) < cm->mi_cols)) {
-    int r, d;
-    pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, bsize,
-                  get_block_context(x, bsize));
-    if (bsize >= BLOCK_SIZE_SB8X8) {
+        *(get_sb_index(xd, subsize)) = 1;
+        pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), tp, &r, &d, subsize,
+                      get_block_context(x, subsize));
+        r2 += r;
+        d2 += d;
+      }
       set_partition_seg_context(cm, xd, mi_row, mi_col);
       pl = partition_plane_context(xd, bsize);
-      r += x->partition_cost[pl][PARTITION_NONE];
+      if (r2 < INT_MAX)
+        r2 += x->partition_cost[pl][PARTITION_VERT];
+      if (RDCOST(x->rdmult, x->rddiv, r2, d2)
+          < RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
+        srate = r2;
+        sdist = d2;
+        *(get_sb_partitioning(x, bsize)) = subsize;
+      }
+      restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
     }
 
-    if (RDCOST(x->rdmult, x->rddiv, r, d) <
-        RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
-      srate = r;
-      sdist = d;
-      if (bsize >= BLOCK_SIZE_SB8X8)
-        *(get_sb_partitioning(x, bsize)) = bsize;
+    // PARTITION_NONE
+    if ((mi_row + (ms >> 1) < cm->mi_rows) &&
+        (mi_col + (ms >> 1) < cm->mi_cols)) {
+      int r;
+      int64_t d;
+      pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, bsize,
+                    get_block_context(x, bsize));
+      if (bsize >= BLOCK_SIZE_SB8X8) {
+        set_partition_seg_context(cm, xd, mi_row, mi_col);
+        pl = partition_plane_context(xd, bsize);
+        r += x->partition_cost[pl][PARTITION_NONE];
+      }
+
+      if (RDCOST(x->rdmult, x->rddiv, r, d)
+          < RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
+        srate = r;
+        sdist = d;
+        if (bsize >= BLOCK_SIZE_SB8X8)
+          *(get_sb_partitioning(x, bsize)) = bsize;
+      }
     }
   }
-
   *rate = srate;
   *dist = sdist;
 
@@ -1388,9 +1495,9 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp,
   }
 }
 
-static void encode_sb_row(VP9_COMP *cpi, int mi_row,
-                       TOKENEXTRA **tp, int *totalrate) {
-  VP9_COMMON *const cm = &cpi->common;
+static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp,
+                          int *totalrate) {
+  VP9_COMMON * const cm = &cpi->common;
   int mi_col;
 
   // Initialize the left context for the new SB row
@@ -1398,27 +1505,49 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row,
   vpx_memset(cm->left_seg_context, 0, sizeof(cm->left_seg_context));
 
   // Code each SB in the row
-  for (mi_col = cm->cur_tile_mi_col_start;
-       mi_col < cm->cur_tile_mi_col_end; mi_col += 64 / MI_SIZE) {
-    int dummy_rate, dummy_dist;
-    if (cpi->speed < 5) {
-      rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
-                        &dummy_rate, &dummy_dist);
-    } else {
+  for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end;
+      mi_col += 64 / MI_SIZE) {
+    int dummy_rate;
+    int64_t dummy_dist;
+    if (cpi->sf.partition_by_variance || cpi->sf.use_lastframe_partitioning ||
+        cpi->sf.use_one_partition_size_always ) {
       const int idx_str = cm->mode_info_stride * mi_row + mi_col;
       MODE_INFO *m = cm->mi + idx_str;
-      // set_partitioning(cpi, m, BLOCK_SIZE_SB64X64);
-      choose_partitioning(cpi, cm->mi, mi_row, mi_col);
-      rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
-                       &dummy_rate, &dummy_dist);
+      MODE_INFO *p = cm->prev_mi + idx_str;
+
+      if (cpi->sf.use_one_partition_size_always) {
+        set_offsets(cpi, mi_row, mi_col, BLOCK_SIZE_SB64X64);
+        set_partitioning(cpi, m, cpi->sf.always_this_block_size);
+        rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+                         &dummy_rate, &dummy_dist);
+      } else if (cpi->sf.partition_by_variance) {
+        choose_partitioning(cpi, cm->mi, mi_row, mi_col);
+        rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+                         &dummy_rate, &dummy_dist);
+      } else {
+        if ((cpi->common.current_video_frame & 1) == 0 || cm->prev_mi == 0
+            || cpi->common.show_frame == 0
+            || cpi->common.frame_type == KEY_FRAME
+            || cpi->is_src_frame_alt_ref) {
+          rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+                            &dummy_rate, &dummy_dist);
+        } else {
+          copy_partitioning(cpi, m, p);
+          rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+                           &dummy_rate, &dummy_dist);
+        }
+      }
+    } else {
+      rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+                        &dummy_rate, &dummy_dist);
     }
   }
 }
 
 static void init_encode_frame_mb_context(VP9_COMP *cpi) {
-  MACROBLOCK *const x = &cpi->mb;
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
+  MACROBLOCK * const x = &cpi->mb;
+  VP9_COMMON * const cm = &cpi->common;
+  MACROBLOCKD * const xd = &x->e_mbd;
 
   x->act_zbin_adj = 0;
   cpi->seg0_idx = 0;
@@ -1438,7 +1567,7 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
 
   // TODO(jkoleszar): are these initializations required?
   setup_pre_planes(xd, &cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]], NULL,
-                   0, 0, NULL, NULL);
+                   0, 0, NULL, NULL );
   setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], 0, 0);
 
   vp9_build_block_offsets(x);
@@ -1463,36 +1592,36 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
 
   // Note: this memset assumes above_context[0], [1] and [2]
   // are allocated as part of the same buffer.
-  vpx_memset(cm->above_context[0], 0, sizeof(ENTROPY_CONTEXT) * 2 *
-                                      MAX_MB_PLANE * mi_cols_aligned_to_sb(cm));
-  vpx_memset(cm->above_seg_context, 0, sizeof(PARTITION_CONTEXT) *
-                                       mi_cols_aligned_to_sb(cm));
+  vpx_memset(
+      cm->above_context[0], 0,
+      sizeof(ENTROPY_CONTEXT) * 2 * MAX_MB_PLANE * mi_cols_aligned_to_sb(cm));
+  vpx_memset(cm->above_seg_context, 0,
+             sizeof(PARTITION_CONTEXT) * mi_cols_aligned_to_sb(cm));
 }
 
 static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
   if (lossless) {
-    cpi->mb.fwd_txm8x4            = vp9_short_walsh8x4;
-    cpi->mb.fwd_txm4x4            = vp9_short_walsh4x4;
-    cpi->mb.e_mbd.inv_txm4x4_1_add    = vp9_short_iwalsh4x4_1_add;
-    cpi->mb.e_mbd.inv_txm4x4_add      = vp9_short_iwalsh4x4_add;
-    cpi->mb.optimize              = 0;
-    cpi->common.filter_level      = 0;
-    cpi->zbin_mode_boost_enabled  = 0;
-    cpi->common.txfm_mode         = ONLY_4X4;
+    cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4;
+    cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4;
+    cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add;
+    cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add;
+    cpi->mb.optimize = 0;
+    cpi->common.filter_level = 0;
+    cpi->zbin_mode_boost_enabled = 0;
+    cpi->common.txfm_mode = ONLY_4X4;
   } else {
-    cpi->mb.fwd_txm8x4            = vp9_short_fdct8x4;
-    cpi->mb.fwd_txm4x4            = vp9_short_fdct4x4;
-    cpi->mb.e_mbd.inv_txm4x4_1_add    = vp9_short_idct4x4_1_add;
-    cpi->mb.e_mbd.inv_txm4x4_add      = vp9_short_idct4x4_add;
+    cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4;
+    cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4;
+    cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add;
+    cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_idct4x4_add;
   }
 }
 
-
 static void encode_frame_internal(VP9_COMP *cpi) {
   int mi_row;
-  MACROBLOCK *const x = &cpi->mb;
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
+  MACROBLOCK * const x = &cpi->mb;
+  VP9_COMMON * const cm = &cpi->common;
+  MACROBLOCKD * const xd = &x->e_mbd;
   int totalrate;
 
 //  fprintf(stderr, "encode_frame_internal frame %d (%d) type %d\n",
@@ -1524,10 +1653,8 @@ static void encode_frame_internal(VP9_COMP *cpi) {
   vp9_zero(cpi->coef_counts);
   vp9_zero(cm->fc.eob_branch_counts);
 
-  cpi->mb.e_mbd.lossless = cm->base_qindex == 0 &&
-                           cm->y_dc_delta_q == 0 &&
-                           cm->uv_dc_delta_q == 0 &&
-                           cm->uv_ac_delta_q == 0;
+  cpi->mb.e_mbd.lossless = cm->base_qindex == 0 && cm->y_dc_delta_q == 0
+      && cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0;
   switch_lossless_mode(cpi, cpi->mb.e_mbd.lossless);
 
   vp9_frame_init_quantizer(cpi);
@@ -1553,7 +1680,7 @@ static void encode_frame_internal(VP9_COMP *cpi) {
   set_prev_mi(cm);
 
   {
-    struct vpx_usec_timer  emr_timer;
+    struct vpx_usec_timer emr_timer;
     vpx_usec_timer_start(&emr_timer);
 
     {
@@ -1570,9 +1697,9 @@ static void encode_frame_internal(VP9_COMP *cpi) {
           // For each row of SBs in the frame
           vp9_get_tile_col_offsets(cm, tile_col);
           for (mi_row = cm->cur_tile_mi_row_start;
-               mi_row < cm->cur_tile_mi_row_end;
-               mi_row += 8)
+              mi_row < cm->cur_tile_mi_row_end; mi_row += 8)
             encode_sb_row(cpi, mi_row, &tp, &totalrate);
+
           cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old);
           assert(tp - cpi->tok <=
                  get_token_alloc(cm->mb_rows, cm->mb_cols));
@@ -1602,9 +1729,8 @@ static int check_dual_ref_flags(VP9_COMP *cpi) {
   if (vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME)) {
     return 0;
   } else {
-    return (!!(ref_flags & VP9_GOLD_FLAG) +
-            !!(ref_flags & VP9_LAST_FLAG) +
-            !!(ref_flags & VP9_ALT_FLAG)) >= 2;
+    return (!!(ref_flags & VP9_GOLD_FLAG) + !!(ref_flags & VP9_LAST_FLAG)
+        + !!(ref_flags & VP9_ALT_FLAG)) >= 2;
   }
 }
 
@@ -1631,35 +1757,33 @@ static void set_txfm_flag(MODE_INFO *mi, int mis, int ymbs, int xmbs,
   }
 }
 
-static void reset_skip_txfm_size_b(VP9_COMP *cpi, MODE_INFO *mi,
-                                   int mis, TX_SIZE txfm_max,
-                                   int bw, int bh, int mi_row, int mi_col,
-                                   BLOCK_SIZE_TYPE bsize) {
-  VP9_COMMON *const cm = &cpi->common;
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
+static void reset_skip_txfm_size_b(VP9_COMP *cpi, MODE_INFO *mi, int mis,
+                                   TX_SIZE txfm_max, int bw, int bh, int mi_row,
+                                   int mi_col, BLOCK_SIZE_TYPE bsize) {
+  VP9_COMMON * const cm = &cpi->common;
+  MB_MODE_INFO * const mbmi = &mi->mbmi;
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
   if (mbmi->txfm_size > txfm_max) {
-    MACROBLOCK *const x = &cpi->mb;
-    MACROBLOCKD *const xd = &x->e_mbd;
+    MACROBLOCK * const x = &cpi->mb;
+    MACROBLOCKD * const xd = &x->e_mbd;
     const int segment_id = mbmi->segment_id;
     const int ymbs = MIN(bh, cm->mi_rows - mi_row);
     const int xmbs = MIN(bw, cm->mi_cols - mi_col);
 
     xd->mode_info_context = mi;
-    assert(vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) ||
-           get_skip_flag(mi, mis, ymbs, xmbs));
+    assert(
+        vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) || get_skip_flag(mi, mis, ymbs, xmbs));
     set_txfm_flag(mi, mis, ymbs, xmbs, txfm_max);
   }
 }
 
 static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO *mi,
-                                    TX_SIZE txfm_max,
-                                    int mi_row, int mi_col,
+                                    TX_SIZE txfm_max, int mi_row, int mi_col,
                                     BLOCK_SIZE_TYPE bsize) {
-  VP9_COMMON *const cm = &cpi->common;
+  VP9_COMMON * const cm = &cpi->common;
   const int mis = cm->mode_info_stride;
   int bwl, bhl;
   const int bsl = mi_width_log2(bsize), bs = 1 << (bsl - 1);
@@ -1671,18 +1795,18 @@ static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO *mi,
   bhl = mi_height_log2(mi->mbmi.sb_type);
 
   if (bwl == bsl && bhl == bsl) {
-    reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, 1 << bsl,
-                           mi_row, mi_col, bsize);
+    reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, 1 << bsl, mi_row,
+                           mi_col, bsize);
   } else if (bwl == bsl && bhl < bsl) {
-    reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, bs,
-                           mi_row, mi_col, bsize);
+    reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, bs, mi_row, mi_col,
+                           bsize);
     reset_skip_txfm_size_b(cpi, mi + bs * mis, mis, txfm_max, 1 << bsl, bs,
                            mi_row + bs, mi_col, bsize);
   } else if (bwl < bsl && bhl == bsl) {
-    reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, bs, 1 << bsl,
-                           mi_row, mi_col, bsize);
-    reset_skip_txfm_size_b(cpi, mi + bs, mis, txfm_max, bs, 1 << bsl,
-                           mi_row, mi_col + bs, bsize);
+    reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, bs, 1 << bsl, mi_row, mi_col,
+                           bsize);
+    reset_skip_txfm_size_b(cpi, mi + bs, mis, txfm_max, bs, 1 << bsl, mi_row,
+                           mi_col + bs, bsize);
   } else {
     BLOCK_SIZE_TYPE subsize;
     int n;
@@ -1700,32 +1824,30 @@ static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO *mi,
     for (n = 0; n < 4; n++) {
       const int y_idx = n >> 1, x_idx = n & 0x01;
 
-      reset_skip_txfm_size_sb(cpi, mi + y_idx * bs * mis + x_idx * bs,
-                              txfm_max, mi_row + y_idx * bs,
-                              mi_col + x_idx * bs, subsize);
+      reset_skip_txfm_size_sb(cpi, mi + y_idx * bs * mis + x_idx * bs, txfm_max,
+                              mi_row + y_idx * bs, mi_col + x_idx * bs,
+                              subsize);
     }
   }
 }
 
 static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) {
-  VP9_COMMON *const cm = &cpi->common;
+  VP9_COMMON * const cm = &cpi->common;
   int mi_row, mi_col;
   const int mis = cm->mode_info_stride;
   MODE_INFO *mi, *mi_ptr = cm->mi;
 
-  for (mi_row = 0; mi_row < cm->mi_rows;
-       mi_row += 8, mi_ptr += 8 * mis) {
+  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 8, mi_ptr += 8 * mis) {
     mi = mi_ptr;
-    for (mi_col = 0; mi_col < cm->mi_cols;
-         mi_col += 8, mi += 8) {
-      reset_skip_txfm_size_sb(cpi, mi, txfm_max,
-                              mi_row, mi_col, BLOCK_SIZE_SB64X64);
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 8, mi += 8) {
+      reset_skip_txfm_size_sb(cpi, mi, txfm_max, mi_row, mi_col,
+                              BLOCK_SIZE_SB64X64);
     }
   }
 }
 
 void vp9_encode_frame(VP9_COMP *cpi) {
-  VP9_COMMON *const cm = &cpi->common;
+  VP9_COMMON * const cm = &cpi->common;
 
   // In the longer term the encoder should be generalized to match the
   // decoder such that we allow compound where one of the 3 buffers has a
@@ -1733,10 +1855,10 @@ void vp9_encode_frame(VP9_COMP *cpi) {
   // requires further work in the rd loop. For now the only supported encoder
   // side behaviour is where the ALT ref buffer has oppositie sign bias to
   // the other two.
-  if ((cm->ref_frame_sign_bias[ALTREF_FRAME] ==
-       cm->ref_frame_sign_bias[GOLDEN_FRAME]) ||
-      (cm->ref_frame_sign_bias[ALTREF_FRAME] ==
-       cm->ref_frame_sign_bias[LAST_FRAME])) {
+  if ((cm->ref_frame_sign_bias[ALTREF_FRAME]
+      == cm->ref_frame_sign_bias[GOLDEN_FRAME])
+      || (cm->ref_frame_sign_bias[ALTREF_FRAME]
+          == cm->ref_frame_sign_bias[LAST_FRAME])) {
     cm->allow_comp_inter_inter = 0;
   } else {
     cm->allow_comp_inter_inter = 1;
@@ -1770,14 +1892,14 @@ void vp9_encode_frame(VP9_COMP *cpi) {
     /* prediction (compound, single or hybrid) mode selection */
     if (frame_type == 3 || !cm->allow_comp_inter_inter)
       pred_type = SINGLE_PREDICTION_ONLY;
-    else if (cpi->rd_prediction_type_threshes[frame_type][1] >
-                 cpi->rd_prediction_type_threshes[frame_type][0] &&
-             cpi->rd_prediction_type_threshes[frame_type][1] >
-                 cpi->rd_prediction_type_threshes[frame_type][2] &&
-             check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100)
+    else if (cpi->rd_prediction_type_threshes[frame_type][1]
+        > cpi->rd_prediction_type_threshes[frame_type][0]
+        && cpi->rd_prediction_type_threshes[frame_type][1]
+            > cpi->rd_prediction_type_threshes[frame_type][2]
+        && check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100)
       pred_type = COMP_PREDICTION_ONLY;
-    else if (cpi->rd_prediction_type_threshes[frame_type][0] >
-                 cpi->rd_prediction_type_threshes[frame_type][2])
+    else if (cpi->rd_prediction_type_threshes[frame_type][0]
+        > cpi->rd_prediction_type_threshes[frame_type][2])
       pred_type = SINGLE_PREDICTION_ONLY;
     else
       pred_type = HYBRID_PREDICTION;
@@ -1790,43 +1912,44 @@ void vp9_encode_frame(VP9_COMP *cpi) {
       cpi->mb.e_mbd.lossless = 1;
     } else
 #if 0
-    /* FIXME (rbultje): this code is disabled until we support cost updates
-     * while a frame is being encoded; the problem is that each time we
-     * "revert" to 4x4 only (or even 8x8 only), the coefficient probabilities
-     * for 16x16 (and 8x8) start lagging behind, thus leading to them lagging
-     * further behind and not being chosen for subsequent frames either. This
-     * is essentially a local minimum problem that we can probably fix by
-     * estimating real costs more closely within a frame, perhaps by re-
-     * calculating costs on-the-fly as frame encoding progresses. */
-    if (cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
-            cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] &&
-        cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
-            cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] &&
-        cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
-            cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
-      txfm_type = TX_MODE_SELECT;
-    } else if (cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
-                  cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]
-            && cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
-                  cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16]
-               ) {
-      txfm_type = ONLY_4X4;
-    } else if (cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >=
-                  cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
-      txfm_type = ALLOW_16X16;
-    } else
+      /* FIXME (rbultje): this code is disabled until we support cost updates
+       * while a frame is being encoded; the problem is that each time we
+       * "revert" to 4x4 only (or even 8x8 only), the coefficient probabilities
+       * for 16x16 (and 8x8) start lagging behind, thus leading to them lagging
+       * further behind and not being chosen for subsequent frames either. This
+       * is essentially a local minimum problem that we can probably fix by
+       * estimating real costs more closely within a frame, perhaps by re-
+       * calculating costs on-the-fly as frame encoding progresses. */
+      if (cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
+          cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] &&
+          cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
+          cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] &&
+          cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
+          cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
+        txfm_type = TX_MODE_SELECT;
+      } else if (cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
+          cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]
+          && cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
+          cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16]
+      ) {
+        txfm_type = ONLY_4X4;
+      } else if (cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >=
+          cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
+        txfm_type = ALLOW_16X16;
+      } else
       txfm_type = ALLOW_8X8;
 #else
-    txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32] >
-                  cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ?
-                    ALLOW_32X32 : TX_MODE_SELECT;
+      txfm_type =
+          cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32]
+              > cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ?
+              ALLOW_32X32 : TX_MODE_SELECT;
 #endif
     cpi->common.txfm_mode = txfm_type;
     cpi->common.comp_pred_mode = pred_type;
     encode_frame_internal(cpi);
 
     for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
-      const int diff = (int)(cpi->rd_comp_pred_diff[i] / cpi->common.MBs);
+      const int diff = (int) (cpi->rd_comp_pred_diff[i] / cpi->common.MBs);
       cpi->rd_prediction_type_threshes[frame_type][i] += diff;
       cpi->rd_prediction_type_threshes[frame_type][i] >>= 1;
     }
@@ -1836,8 +1959,8 @@ void vp9_encode_frame(VP9_COMP *cpi) {
       int diff;
       if (i == TX_MODE_SELECT)
         pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv,
-                     2048 * (TX_SIZE_MAX_SB - 1), 0);
-      diff = (int)(pd / cpi->common.MBs);
+            2048 * (TX_SIZE_MAX_SB - 1), 0);
+      diff = (int) (pd / cpi->common.MBs);
       cpi->rd_tx_select_threshes[frame_type][i] += diff;
       cpi->rd_tx_select_threshes[frame_type][i] /= 2;
     }
@@ -1890,12 +2013,12 @@ void vp9_encode_frame(VP9_COMP *cpi) {
       for (i = 0; i < TX_SIZE_CONTEXTS; i++)
         count32x32 += cm->fc.tx_count_32x32p[i][TX_32X32];
 
-      if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
-          count32x32 == 0) {
+      if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0
+          && count32x32 == 0) {
         cpi->common.txfm_mode = ALLOW_8X8;
         reset_skip_txfm_size(cpi, TX_8X8);
-      } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 &&
-                 count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) {
+      } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0
+          && count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) {
         cpi->common.txfm_mode = ONLY_4X4;
         reset_skip_txfm_size(cpi, TX_4X4);
       } else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) {
@@ -1957,18 +2080,17 @@ static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x) {
   b = 4 * act + cpi->activity_avg;
 
   if (act > cpi->activity_avg)
-    x->act_zbin_adj = (int)(((int64_t)b + (a >> 1)) / a) - 1;
+    x->act_zbin_adj = (int) (((int64_t) b + (a >> 1)) / a) - 1;
   else
-    x->act_zbin_adj = 1 - (int)(((int64_t)a + (b >> 1)) / b);
+    x->act_zbin_adj = 1 - (int) (((int64_t) a + (b >> 1)) / b);
 #endif
 }
 
-static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
-                              int output_enabled, int mi_row, int mi_col,
-                              BLOCK_SIZE_TYPE bsize) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
+static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
+                              int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize) {
+  VP9_COMMON * const cm = &cpi->common;
+  MACROBLOCK * const x = &cpi->mb;
+  MACROBLOCKD * const xd = &x->e_mbd;
   int n;
   MODE_INFO *mi = xd->mode_info_context;
   MB_MODE_INFO *mbmi = &mi->mbmi;
@@ -2015,10 +2137,10 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
   }
 
   if (mbmi->ref_frame[0] == INTRA_FRAME) {
-    vp9_encode_intra_block_y(cm, x, (bsize < BLOCK_SIZE_SB8X8) ?
-                                    BLOCK_SIZE_SB8X8 : bsize);
-    vp9_encode_intra_block_uv(cm, x, (bsize < BLOCK_SIZE_SB8X8) ?
-                                     BLOCK_SIZE_SB8X8 : bsize);
+    vp9_encode_intra_block_y(
+        cm, x, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
+    vp9_encode_intra_block_uv(
+        cm, x, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
     if (output_enabled)
       sum_intra_stats(cpi, x);
   } else {
@@ -2032,12 +2154,12 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
 
     assert(cm->frame_type != KEY_FRAME);
 
-    setup_pre_planes(xd, ref_fb, second_ref_fb,
-                     mi_row, mi_col, xd->scale_factor, xd->scale_factor_uv);
+    setup_pre_planes(xd, ref_fb, second_ref_fb, mi_row, mi_col,
+                     xd->scale_factor, xd->scale_factor_uv);
 
-    vp9_build_inter_predictors_sb(xd, mi_row, mi_col,
-                                  bsize < BLOCK_SIZE_SB8X8 ? BLOCK_SIZE_SB8X8
-                                                           : bsize);
+    vp9_build_inter_predictors_sb(
+        xd, mi_row, mi_col,
+        bsize < BLOCK_SIZE_SB8X8 ? BLOCK_SIZE_SB8X8 : bsize);
   }
 
   if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) {
@@ -2049,14 +2171,14 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
                     (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
   } else {
     // FIXME(rbultje): not tile-aware (mi - 1)
-    int mb_skip_context =
-        (mi - 1)->mbmi.mb_skip_coeff + (mi - mis)->mbmi.mb_skip_coeff;
+    int mb_skip_context = (mi - 1)->mbmi.mb_skip_coeff
+        + (mi - mis)->mbmi.mb_skip_coeff;
 
     mbmi->mb_skip_coeff = 1;
     if (output_enabled)
       cm->fc.mbskip_count[mb_skip_context][1]++;
-    vp9_reset_sb_tokens_context(xd,
-                 (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
+    vp9_reset_sb_tokens_context(
+        xd, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
   }
 
   // copy skip flag on all mb_mode_info contexts in this SB
@@ -2068,10 +2190,10 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
   }
 
   if (output_enabled) {
-    if (cm->txfm_mode == TX_MODE_SELECT &&
-        mbmi->sb_type >= BLOCK_SIZE_SB8X8 &&
-        !(mbmi->ref_frame[0] != INTRA_FRAME && (mbmi->mb_skip_coeff ||
-          vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {
+    if (cm->txfm_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_SIZE_SB8X8
+        && !(mbmi->ref_frame[0] != INTRA_FRAME
+            && (mbmi->mb_skip_coeff
+                || vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {
       const int context = vp9_get_pred_context(cm, xd, PRED_TX_SIZE);
       if (bsize >= BLOCK_SIZE_SB32X32) {
         cm->fc.tx_count_32x32p[context][mbmi->txfm_size]++;
@@ -2083,7 +2205,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
     } else {
       int x, y;
       TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ? TX_32X32 : cm->txfm_mode;
-       // The new intra coding scheme requires no change of transform size
+      // The new intra coding scheme requires no change of transform size
       if (mi->mbmi.ref_frame[0] != INTRA_FRAME) {
         if (sz == TX_32X32 && bsize < BLOCK_SIZE_SB32X32)
           sz = TX_16X16;
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 4f45496df..2f133ccbc 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -22,10 +22,10 @@
 DECLARE_ALIGNED(16, extern const uint8_t,
                 vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
 
-void vp9_subtract_block(int rows, int cols,
-                        int16_t *diff_ptr, int diff_stride,
-                        const uint8_t *src_ptr, int src_stride,
-                        const uint8_t *pred_ptr, int pred_stride) {
+void vp9_subtract_block_c(int rows, int cols,
+                          int16_t *diff_ptr, ptrdiff_t diff_stride,
+                          const uint8_t *src_ptr, ptrdiff_t src_stride,
+                          const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
   int r, c;
 
   for (r = 0; r < rows; r++) {
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index 579690346..3042c9f7f 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -42,10 +42,6 @@ void vp9_encode_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 void vp9_xform_quant_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 void vp9_xform_quant_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 
-void vp9_subtract_block(int rows, int cols,
-                        int16_t *diff_ptr, int diff_stride,
-                        const uint8_t *src_ptr, int src_stride,
-                        const uint8_t *pred_ptr, int pred_stride);
 void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 void vp9_subtract_sb(MACROBLOCK *xd, BLOCK_SIZE_TYPE bsize);
diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index a582d183d..ea6aa296a 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -541,7 +541,7 @@ void vp9_encode_mv(vp9_writer* w, const MV* mv, const MV* ref,
   const MV diff = {mv->row - ref->row,
                    mv->col - ref->col};
   const MV_JOINT_TYPE j = vp9_get_mv_joint(&diff);
-  usehp = usehp && vp9_use_nmv_hp(ref);
+  usehp = usehp && vp9_use_mv_hp(ref);
 
   write_token(w, vp9_mv_joint_tree, mvctx->joints, &vp9_mv_joint_encodings[j]);
   if (mv_joint_vertical(j))
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 5e26cd82a..522f89982 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -986,9 +986,11 @@ static int estimate_max_q(VP9_COMP *cpi,
 
   // Corrections for higher compression speed settings
   // (reduced compression expected)
+  // FIXME(jimbankoski): Once we settle on vp9 speed features we need to
+  // change this code.
   if (cpi->compressor_speed == 1)
     speed_correction = cpi->oxcf.cpu_used <= 5 ?
-                          1.04 + (cpi->oxcf.cpu_used * 0.04) :
+                          1.04 + (/*cpi->oxcf.cpu_used*/0 * 0.04) :
                           1.25;
 
   // Try and pick a max Q that will be high enough to encode the
@@ -1051,7 +1053,7 @@ static int estimate_cq(VP9_COMP *cpi,
   // (reduced compression expected)
   if (cpi->compressor_speed == 1) {
     if (cpi->oxcf.cpu_used <= 5)
-      speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
+      speed_correction = 1.04 + (/*cpi->oxcf.cpu_used*/ 0 * 0.04);
     else
       speed_correction = 1.25;
   }
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 2e99736ce..0f1062313 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -366,7 +366,7 @@ int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x,
   }
 
   if (xd->allow_high_precision_mv) {
-    usehp = vp9_use_nmv_hp(&ref_mv->as_mv);
+    usehp = vp9_use_mv_hp(&ref_mv->as_mv);
   } else {
     usehp = 0;
   }
@@ -556,7 +556,7 @@ int vp9_find_best_sub_pixel_comp(MACROBLOCK *x,
   }
 
   if (xd->allow_high_precision_mv) {
-    usehp = vp9_use_nmv_hp(&ref_mv->as_mv);
+    usehp = vp9_use_mv_hp(&ref_mv->as_mv);
   } else {
     usehp = 0;
   }
@@ -930,7 +930,7 @@ int vp9_find_best_sub_pixel_step(MACROBLOCK *x,
   }
 
   if (x->e_mbd.allow_high_precision_mv) {
-    usehp = vp9_use_nmv_hp(&ref_mv->as_mv);
+    usehp = vp9_use_mv_hp(&ref_mv->as_mv);
   } else {
     usehp = 0;
   }
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 6a14df471..e02e73232 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -591,22 +591,25 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode, int speed) {
   sf->thresh_mult[THR_COMP_SPLITLA  ] += speed_multiplier * 4500;
   sf->thresh_mult[THR_COMP_SPLITGA  ] += speed_multiplier * 4500;
 
-  if (speed > 4) {
+  if (cpi->sf.skip_lots_of_modes) {
     for (i = 0; i < MAX_MODES; ++i)
       sf->thresh_mult[i] = INT_MAX;
 
-    sf->thresh_mult[THR_DC       ] = 0;
-    sf->thresh_mult[THR_TM       ] = 0;
-    sf->thresh_mult[THR_NEWMV    ] = 4000;
-    sf->thresh_mult[THR_NEWG     ] = 4000;
-    sf->thresh_mult[THR_NEWA     ] = 4000;
+    sf->thresh_mult[THR_DC] = 0;
+    sf->thresh_mult[THR_TM] = 0;
+    sf->thresh_mult[THR_NEWMV] = 4000;
+    sf->thresh_mult[THR_NEWG] = 4000;
+    sf->thresh_mult[THR_NEWA] = 4000;
     sf->thresh_mult[THR_NEARESTMV] = 0;
-    sf->thresh_mult[THR_NEARESTG ] = 0;
-    sf->thresh_mult[THR_NEARESTA ] = 0;
-    sf->thresh_mult[THR_NEARMV   ] = 2000;
-    sf->thresh_mult[THR_NEARG    ] = 2000;
-    sf->thresh_mult[THR_NEARA    ] = 2000;
+    sf->thresh_mult[THR_NEARESTG] = 0;
+    sf->thresh_mult[THR_NEARESTA] = 0;
+    sf->thresh_mult[THR_NEARMV] = 2000;
+    sf->thresh_mult[THR_NEARG] = 2000;
+    sf->thresh_mult[THR_NEARA] = 2000;
     sf->thresh_mult[THR_COMP_NEARESTLA] = 2000;
+    sf->thresh_mult[THR_SPLITMV] = 2500;
+    sf->thresh_mult[THR_SPLITG] = 2500;
+    sf->thresh_mult[THR_SPLITA] = 2500;
     sf->recode_loop = 0;
   }
 
@@ -681,6 +684,18 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
   sf->comp_inter_joint_search_thresh = BLOCK_SIZE_AB4X4;
   sf->adpative_rd_thresh = 0;
+  sf->use_lastframe_partitioning = 0;
+  sf->use_largest_txform = 0;
+  sf->use_8tap_always = 0;
+  sf->use_avoid_tested_higherror = 0;
+  sf->skip_lots_of_modes = 0;
+  sf->adjust_thresholds_by_speed = 0;
+  sf->partition_by_variance = 0;
+  sf->use_one_partition_size_always = 0;
+  sf->use_partitions_less_than = 0;
+  sf->less_than_block_size = BLOCK_SIZE_MB16X16;
+  sf->use_partitions_greater_than = 0;
+  sf->greater_than_block_size = BLOCK_SIZE_SB8X8;
 
 #if CONFIG_MULTIPLE_ARF
   // Switch segmentation off.
@@ -703,17 +718,51 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
 #endif
       sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8;
       sf->adpative_rd_thresh = 1;
-      if (speed > 0) {
+      if (speed == 1) {
         sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
         sf->optimize_coefficients = 0;
         sf->first_step = 1;
+        sf->use_avoid_tested_higherror = 1;
+        sf->adjust_thresholds_by_speed = 1;
       }
-      break;
+      if (speed == 2) {
+        sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8;
+        sf->use_lastframe_partitioning = 1;
+        sf->first_step = 0;
+      }
+      if (speed == 3) {
+        sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8;
+        sf->partition_by_variance = 1;
+        sf->first_step = 0;
+      }
+      if (speed == 4) {
+        sf->first_step = 0;
+        sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8;
+        sf->use_one_partition_size_always = 1;
+        sf->always_this_block_size = BLOCK_SIZE_MB16X16;
+      }
+      if (speed == 2) {
+        sf->first_step = 0;
+        sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8;
+        sf->use_partitions_less_than = 1;
+        sf->less_than_block_size = BLOCK_SIZE_MB16X16;
+      }
+      if (speed == 3) {
+        sf->first_step = 0;
+        sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8;
+        sf->use_partitions_greater_than = 1;
+        sf->greater_than_block_size = BLOCK_SIZE_SB8X8;
+      }
+
+     break;
 
   }; /* switch */
 
   // Set rd thresholds based on mode and speed setting
-  set_rd_speed_thresholds(cpi, mode, speed);
+  if(cpi->sf.adjust_thresholds_by_speed)
+    set_rd_speed_thresholds(cpi, mode, speed);
+  else
+    set_rd_speed_thresholds(cpi, mode, 0);
 
   // Slow quant, dct and trellis not worthwhile for first pass
   // so make sure they are always turned off.
@@ -2993,7 +3042,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
         !cpi->common.frame_parallel_decoding_mode) {
       vp9_adapt_mode_probs(&cpi->common);
       vp9_adapt_mode_context(&cpi->common);
-      vp9_adapt_nmv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv);
+      vp9_adapt_mv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv);
     }
   }
 
@@ -3327,7 +3376,7 @@ static void Pass2Encode(VP9_COMP *cpi, unsigned long *size,
     vp9_second_pass(cpi);
 
   encode_frame_to_data_rate(cpi, size, dest, frame_flags);
-
+  //vp9_print_modes_and_motion_vectors(&cpi->common, "encode.stt");
 #ifdef DISABLE_RC_LONG_TERM_MEM
   cpi->twopass.bits_left -=  cpi->this_frame_target;
 #else
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index f5f1c0772..0811976d0 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -216,6 +216,19 @@ typedef struct {
   int static_segmentation;
   int comp_inter_joint_search_thresh;
   int adpative_rd_thresh;
+  int use_lastframe_partitioning;
+  int use_largest_txform;
+  int use_8tap_always;
+  int use_avoid_tested_higherror;
+  int skip_lots_of_modes;
+  int adjust_thresholds_by_speed;
+  int partition_by_variance;
+  int use_one_partition_size_always;
+  BLOCK_SIZE_TYPE always_this_block_size;
+  int use_partitions_greater_than;
+  BLOCK_SIZE_TYPE greater_than_block_size;
+  int use_partitions_less_than;
+  BLOCK_SIZE_TYPE less_than_block_size;
 } SPEED_FEATURES;
 
 enum BlockSize {
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 53d8be775..ccbb624b0 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -35,6 +35,153 @@ static void quantize(int16_t *zbin_boost_orig_ptr,
                      uint16_t *eob_ptr,
                      const int *scan, int mul) {
   int i, rc, eob;
+  int zbins[2], nzbins[2], zbin;
+  int x, y, z, sz;
+  int zero_run = 0;
+  int16_t *zbin_boost_ptr = zbin_boost_orig_ptr;
+  int zero_flag = n_coeffs;
+
+  vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
+  vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
+
+  eob = -1;
+
+  // Base ZBIN
+  zbins[0] = zbin_ptr[0] + zbin_oq_value;
+  zbins[1] = zbin_ptr[1] + zbin_oq_value;
+  nzbins[0] = zbins[0] * -1;
+  nzbins[1] = zbins[1] * -1;
+
+  if (!skip_block) {
+    // Pre-scan pass
+    for (i = n_coeffs - 1; i >= 0; i--) {
+      rc = scan[i];
+      z = coeff_ptr[rc] * mul;
+
+      if (z < zbins[rc != 0] && z > nzbins[rc != 0]) {
+        zero_flag--;
+      } else {
+        break;
+      }
+    }
+
+    // Quantization pass: All coefficients with index >= zero_flag are
+    // skippable. Note: zero_flag can be zero.
+    for (i = 0; i < zero_flag; i++) {
+      rc = scan[i];
+      z  = coeff_ptr[rc] * mul;
+
+      zbin = (zbins[rc != 0] + zbin_boost_ptr[zero_run]);
+      zero_run += (zero_run < 15);
+
+      sz = (z >> 31);                               // sign of z
+      x  = (z ^ sz) - sz;
+
+      if (x >= zbin) {
+        x += (round_ptr[rc != 0]);
+        y  = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x))
+            >> quant_shift_ptr[rc != 0];            // quantize (x)
+        x  = (y ^ sz) - sz;                         // get the sign back
+        qcoeff_ptr[rc]  = x;                        // write to destination
+        dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / mul;  // dequantized value
+
+        if (y) {
+          eob = i;                                  // last nonzero coeffs
+          zero_run = 0;                             // set zero_run
+        }
+      }
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
+// This function works well for large transform size.
+static void quantize_sparse(int16_t *zbin_boost_orig_ptr,
+                            int16_t *coeff_ptr, int n_coeffs, int skip_block,
+                            int16_t *zbin_ptr, int16_t *round_ptr,
+                            int16_t *quant_ptr, uint8_t *quant_shift_ptr,
+                            int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                            int16_t *dequant_ptr, int zbin_oq_value,
+                            uint16_t *eob_ptr, const int *scan, int mul,
+                            int *idx_arr) {
+  int i, rc, eob;
+  int zbins[2], pzbins[2], nzbins[2], zbin;
+  int x, y, z, sz;
+  int zero_run = 0;
+  int16_t *zbin_boost_ptr = zbin_boost_orig_ptr;
+  int idx = 0;
+  int pre_idx = 0;
+
+  vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
+  vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
+
+  eob = -1;
+
+  // Base ZBIN
+  zbins[0] = zbin_ptr[0] + zbin_oq_value;
+  zbins[1] = zbin_ptr[1] + zbin_oq_value;
+  // Positive and negative ZBIN
+  pzbins[0] = zbins[0]/mul;
+  pzbins[1] = zbins[1]/mul;
+  nzbins[0] = pzbins[0] * -1;
+  nzbins[1] = pzbins[1] * -1;
+
+  if (!skip_block) {
+    // Pre-scan pass
+    for (i = 0; i < n_coeffs; i++) {
+      rc = scan[i];
+      z = coeff_ptr[rc];
+
+      // If the coefficient is out of the base ZBIN range, keep it for
+      // quantization.
+      if (z >= pzbins[rc != 0] || z <= nzbins[rc != 0])
+        idx_arr[idx++] = i;
+    }
+
+    // Quantization pass: only process the coefficients selected in
+    // pre-scan pass. Note: idx can be zero.
+    for (i = 0; i < idx; i++) {
+      rc = scan[idx_arr[i]];
+
+      // Calculate ZBIN
+      zero_run += idx_arr[i] - pre_idx;
+      if(zero_run > 15) zero_run = 15;
+      zbin = (zbins[rc != 0] + zbin_boost_ptr[zero_run]);
+
+      pre_idx = idx_arr[i];
+      z = coeff_ptr[rc] * mul;
+      sz = (z >> 31);                               // sign of z
+      x  = (z ^ sz) - sz;                           // x = abs(z)
+
+      if (x >= zbin) {
+        x += (round_ptr[rc != 0]);
+        y  = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x))
+            >> quant_shift_ptr[rc != 0];            // quantize (x)
+
+        x  = (y ^ sz) - sz;                         // get the sign back
+        qcoeff_ptr[rc]  = x;                        // write to destination
+        dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / mul;  // dequantized value
+
+        if (y) {
+          eob = idx_arr[i];                         // last nonzero coeffs
+          zero_run = -1;                            // set zero_run
+        }
+      }
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+#if 0
+// Original quantize function
+static void quantize(int16_t *zbin_boost_orig_ptr,
+                     int16_t *coeff_ptr, int n_coeffs, int skip_block,
+                     int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr,
+                     uint8_t *quant_shift_ptr,
+                     int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                     int16_t *dequant_ptr, int zbin_oq_value,
+                     uint16_t *eob_ptr,
+                     const int *scan, int mul) {
+  int i, rc, eob;
   int zbin;
   int x, y, z, sz;
   int zero_run = 0;
@@ -74,6 +221,7 @@ static void quantize(int16_t *zbin_boost_orig_ptr,
 
   *eob_ptr = eob + 1;
 }
+#endif
 
 void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
                   TX_TYPE tx_type) {
@@ -97,19 +245,40 @@ void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
       break;
   }
 
-  quantize(mb->plane[plane].zrun_zbin_boost,
-           BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
-           n_coeffs, mb->skip_block,
-           mb->plane[plane].zbin,
-           mb->plane[plane].round,
-           mb->plane[plane].quant,
-           mb->plane[plane].quant_shift,
-           BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
-           BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
-           xd->plane[plane].dequant,
-           mb->plane[plane].zbin_extra,
-           &xd->plane[plane].eobs[block],
-           scan, mul);
+  // Call different quantization for different transform size.
+  if (n_coeffs >= 1024) {
+    // Save index of picked coefficient in pre-scan pass.
+    int idx_arr[1024];
+
+    quantize_sparse(mb->plane[plane].zrun_zbin_boost,
+                    BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
+                    n_coeffs, mb->skip_block,
+                    mb->plane[plane].zbin,
+                    mb->plane[plane].round,
+                    mb->plane[plane].quant,
+                    mb->plane[plane].quant_shift,
+                    BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
+                    BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+                    xd->plane[plane].dequant,
+                    mb->plane[plane].zbin_extra,
+                    &xd->plane[plane].eobs[block],
+                    scan, mul, idx_arr);
+  }
+  else {
+    quantize(mb->plane[plane].zrun_zbin_boost,
+             BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
+             n_coeffs, mb->skip_block,
+             mb->plane[plane].zbin,
+             mb->plane[plane].round,
+             mb->plane[plane].quant,
+             mb->plane[plane].quant_shift,
+             BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
+             BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+             xd->plane[plane].dequant,
+             mb->plane[plane].zbin_extra,
+             &xd->plane[plane].eobs[block],
+             scan, mul);
+  }
 }
 
 void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 9cb7ab0e1..a48e7dbb3 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -274,12 +274,14 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
   }
 }
 
-int vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, int block_size) {
-  int i, error = 0;
+int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff,
+                          intptr_t block_size) {
+  int i;
+  int64_t error = 0;
 
   for (i = 0; i < block_size; i++) {
     int this_diff = coeff[i] - dqcoeff[i];
-    error += this_diff * this_diff;
+    error += (unsigned)this_diff * this_diff;
   }
 
   return error;
@@ -417,7 +419,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
 
 static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
                                      int (*r)[2], int *rate,
-                                     int *d, int *distortion,
+                                     int64_t *d, int64_t *distortion,
                                      int *s, int *skip,
                                      int64_t txfm_cache[NB_TXFM_MODES],
                                      TX_SIZE max_txfm_size) {
@@ -496,27 +498,15 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
                                  rd[TX_4X4][1] : rd[TX_8X8][1];
 }
 
-static int block_error(int16_t *coeff, int16_t *dqcoeff,
-                       int block_size, int shift) {
-  int i;
-  int64_t error = 0;
-
-  for (i = 0; i < block_size; i++) {
-    int this_diff = coeff[i] - dqcoeff[i];
-    error += (unsigned)this_diff * this_diff;
-  }
-  error >>= shift;
-
-  return error > INT_MAX ? INT_MAX : (int)error;
-}
-
-static int block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) {
+static int64_t block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
+                               int shift) {
   const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
-  return block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
-                     16 << (bwl + bhl), shift);
+  return vp9_block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
+                         16 << (bwl + bhl)) >> shift;
 }
 
-static int block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) {
+static int64_t block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
+                                int shift) {
   const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
   int64_t sum = 0;
   int plane;
@@ -524,11 +514,10 @@ static int block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) {
   for (plane = 1; plane < MAX_MB_PLANE; plane++) {
     const int subsampling = x->e_mbd.plane[plane].subsampling_x +
                             x->e_mbd.plane[plane].subsampling_y;
-    sum += block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff,
-                       16 << (bwl + bhl - subsampling), 0);
+    sum += vp9_block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff,
+                           16 << (bwl + bhl - subsampling));
   }
-  sum >>= shift;
-  return sum > INT_MAX ? INT_MAX : (int)sum;
+  return sum >> shift;
 }
 
 struct rdcost_block_args {
@@ -586,7 +575,8 @@ static int rdcost_uv(VP9_COMMON *const cm, MACROBLOCK *x,
 }
 
 static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
-                                     int *rate, int *distortion, int *skippable,
+                                     int *rate, int64_t *distortion,
+                                     int *skippable,
                                      BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
   xd->mode_info_context->mbmi.txfm_size = tx_size;
@@ -602,11 +592,12 @@ static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
 }
 
 static void super_block_yrd(VP9_COMP *cpi,
-                            MACROBLOCK *x, int *rate, int *distortion,
+                            MACROBLOCK *x, int *rate, int64_t *distortion,
                             int *skip, BLOCK_SIZE_TYPE bs,
                             int64_t txfm_cache[NB_TXFM_MODES]) {
   VP9_COMMON *const cm = &cpi->common;
-  int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB];
+  int r[TX_SIZE_MAX_SB][2], s[TX_SIZE_MAX_SB];
+  int64_t d[TX_SIZE_MAX_SB];
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
 
@@ -614,7 +605,7 @@ static void super_block_yrd(VP9_COMP *cpi,
   if (mbmi->ref_frame[0] > INTRA_FRAME)
     vp9_subtract_sby(x, bs);
 
-  if (cpi->speed > 4) {
+  if (cpi->sf.use_largest_txform) {
     if (bs >= BLOCK_SIZE_SB32X32) {
       mbmi->txfm_size = TX_32X32;
     } else if (bs >= BLOCK_SIZE_MB16X16) {
@@ -651,13 +642,13 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
                                      int *bmode_costs,
                                      ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
                                      int *bestrate, int *bestratey,
-                                     int *bestdistortion,
+                                     int64_t *bestdistortion,
                                      BLOCK_SIZE_TYPE bsize) {
   MB_PREDICTION_MODE mode;
   MACROBLOCKD *xd = &x->e_mbd;
   int64_t best_rd = INT64_MAX;
   int rate = 0;
-  int distortion;
+  int64_t distortion;
   VP9_COMMON *const cm = &cpi->common;
   const int src_stride = x->plane[0].src.stride;
   uint8_t *src, *dst;
@@ -777,7 +768,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
 
 static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
                                          int *Rate, int *rate_y,
-                                         int *Distortion, int64_t best_rd) {
+                                         int64_t *Distortion, int64_t best_rd) {
   int i, j;
   MACROBLOCKD *const xd = &mb->e_mbd;
   BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
@@ -785,7 +776,7 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
   int bh = 1 << b_height_log2(bsize);
   int idx, idy;
   int cost = 0;
-  int distortion = 0;
+  int64_t distortion = 0;
   int tot_rate_y = 0;
   int64_t total_rd = 0;
   ENTROPY_CONTEXT t_above[4], t_left[4];
@@ -802,7 +793,7 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
       const int mis = xd->mode_info_stride;
       MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
       int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry);
-      int UNINITIALIZED_IS_SAFE(d);
+      int64_t UNINITIALIZED_IS_SAFE(d);
       i = idy * 2 + idx;
 
       if (xd->frame_type == KEY_FRAME) {
@@ -844,14 +835,14 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
 
 static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                       int *rate, int *rate_tokenonly,
-                                      int *distortion, int *skippable,
+                                      int64_t *distortion, int *skippable,
                                       BLOCK_SIZE_TYPE bsize,
                                       int64_t txfm_cache[NB_TXFM_MODES]) {
   MB_PREDICTION_MODE mode;
   MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
   MACROBLOCKD *const xd = &x->e_mbd;
-  int this_rate, this_rate_tokenonly;
-  int this_distortion, s;
+  int this_rate, this_rate_tokenonly, s;
+  int64_t this_distortion;
   int64_t best_rd = INT64_MAX, this_rd;
   TX_SIZE UNINITIALIZED_IS_SAFE(best_tx);
   int i;
@@ -912,7 +903,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
 }
 
 static void super_block_uvrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
-                                      int *rate, int *distortion,
+                                      int *rate, int64_t *distortion,
                                       int *skippable, BLOCK_SIZE_TYPE bsize,
                                       TX_SIZE uv_tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -927,7 +918,7 @@ static void super_block_uvrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
 }
 
 static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
-                             int *rate, int *distortion, int *skippable,
+                             int *rate, int64_t *distortion, int *skippable,
                              BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
@@ -952,13 +943,13 @@ static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
 
 static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                        int *rate, int *rate_tokenonly,
-                                       int *distortion, int *skippable,
+                                       int64_t *distortion, int *skippable,
                                        BLOCK_SIZE_TYPE bsize) {
   MB_PREDICTION_MODE mode;
   MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
   int64_t best_rd = INT64_MAX, this_rd;
-  int this_rate_tokenonly, this_rate;
-  int this_distortion, s;
+  int this_rate_tokenonly, this_rate, s;
+  int64_t this_distortion;
 
   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
     x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
@@ -1101,7 +1092,7 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
                                        MACROBLOCK *x,
                                        int i,
                                        int *labelyrate,
-                                       int *distortion,
+                                       int64_t *distortion,
                                        ENTROPY_CONTEXT *ta,
                                        ENTROPY_CONTEXT *tl) {
   int k;
@@ -1126,7 +1117,7 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
   raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
                             xd->plane[0].dst.buf,
                             xd->plane[0].dst.stride);
-  int thisdistortion = 0;
+  int64_t thisdistortion = 0;
   int thisrate = 0;
 
   *labelyrate = 0;
@@ -1189,7 +1180,7 @@ typedef struct {
 
   int64_t segment_rd;
   int r;
-  int d;
+  int64_t d;
   int segment_yrate;
   MB_PREDICTION_MODE modes[4];
   int_mv mvs[4], second_mvs[4];
@@ -1281,21 +1272,18 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
                                     BEST_SEG_INFO *bsi,
                                     int_mv seg_mvs[4][MAX_REF_FRAMES],
                                     int mi_row, int mi_col) {
-  int i, j;
-  int br = 0, bd = 0;
+  int i, j, br = 0, rate = 0, sbr = 0, idx, idy;
+  int64_t bd = 0, sbd = 0;
   MB_PREDICTION_MODE this_mode;
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
   const int label_count = 4;
   int64_t this_segment_rd = 0, other_segment_rd;
   int label_mv_thresh;
-  int rate = 0;
-  int sbr = 0, sbd = 0;
   int segmentyrate = 0;
   int best_eobs[4] = { 0 };
   BLOCK_SIZE_TYPE bsize = mbmi->sb_type;
   int bwl = b_width_log2(bsize), bw = 1 << bwl;
   int bhl = b_height_log2(bsize), bh = 1 << bhl;
-  int idx, idy;
   vp9_variance_fn_ptr_t *v_fn_ptr;
   ENTROPY_CONTEXT t_above[4], t_left[4];
   ENTROPY_CONTEXT t_above_b[4], t_left_b[4];
@@ -1340,7 +1328,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
       // search for the best motion vector on this segment
       for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
         int64_t this_rd;
-        int distortion;
+        int64_t distortion;
         int labelyrate;
         ENTROPY_CONTEXT t_above_s[4], t_left_s[4];
         const struct buf_2d orig_src = x->plane[0].src;
@@ -1527,7 +1515,7 @@ static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
                                        int64_t best_rd,
                                        int *returntotrate,
                                        int *returnyrate,
-                                       int *returndistortion,
+                                       int64_t *returndistortion,
                                        int *skippable, int mvthresh,
                                        int_mv seg_mvs[4][MAX_REF_FRAMES],
                                        int mi_row, int mi_col) {
@@ -1800,18 +1788,133 @@ static YV12_BUFFER_CONFIG *get_scaled_ref_frame(VP9_COMP *cpi, int ref_frame) {
   return scaled_ref_frame;
 }
 
-static void model_rd_from_var_lapndz(int var, int n, int qstep,
-                                     int *rate, int *dist) {
-  // This function models the rate and distortion for a Laplacian
+static double linear_interpolate(double x, int ntab, double step,
+                                 const double *tab) {
+  double y = x / step;
+  int d = (int) y;
+  double a = y - d;
+  if (d >= ntab - 1)
+    return tab[ntab - 1];
+  else
+    return tab[d] * (1 - a) + tab[d + 1] * a;
+}
+
+static double model_rate_norm(double x) {
+  // Normalized rate
+  // This function models the rate for a Laplacian source
   // source with given variance when quantized with a uniform quantizer
   // with given stepsize. The closed form expressions are in:
   // Hang and Chen, "Source Model for transform video coder and its
   // application - Part I: Fundamental Theory", IEEE Trans. Circ.
   // Sys. for Video Tech., April 1997.
-  // The function is implemented as piecewise approximation to the
-  // exact computation.
-  // TODO(debargha): Implement the functions by interpolating from a
-  // look-up table
+  static const double rate_tab_step = 0.125;
+  static const double rate_tab[] = {
+    256.0000, 4.944453, 3.949276, 3.371593,
+    2.965771, 2.654550, 2.403348, 2.193612,
+    2.014208, 1.857921, 1.719813, 1.596364,
+    1.484979, 1.383702, 1.291025, 1.205767,
+    1.126990, 1.053937, 0.985991, 0.922644,
+    0.863472, 0.808114, 0.756265, 0.707661,
+    0.662070, 0.619287, 0.579129, 0.541431,
+    0.506043, 0.472828, 0.441656, 0.412411,
+    0.384980, 0.359260, 0.335152, 0.312563,
+    0.291407, 0.271600, 0.253064, 0.235723,
+    0.219508, 0.204351, 0.190189, 0.176961,
+    0.164611, 0.153083, 0.142329, 0.132298,
+    0.122945, 0.114228, 0.106106, 0.098541,
+    0.091496, 0.084937, 0.078833, 0.073154,
+    0.067872, 0.062959, 0.058392, 0.054147,
+    0.050202, 0.046537, 0.043133, 0.039971,
+    0.037036, 0.034312, 0.031783, 0.029436,
+    0.027259, 0.025240, 0.023367, 0.021631,
+    0.020021, 0.018528, 0.017145, 0.015863,
+    0.014676, 0.013575, 0.012556, 0.011612,
+    0.010738, 0.009929, 0.009180, 0.008487,
+    0.007845, 0.007251, 0.006701, 0.006193,
+    0.005722, 0.005287, 0.004884, 0.004512,
+    0.004168, 0.003850, 0.003556, 0.003284,
+    0.003032, 0.002800, 0.002585, 0.002386,
+    0.002203, 0.002034, 0.001877, 0.001732,
+    0.001599, 0.001476, 0.001362, 0.001256,
+    0.001159, 0.001069, 0.000987, 0.000910,
+    0.000840, 0.000774, 0.000714, 0.000659,
+    0.000608, 0.000560, 0.000517, 0.000476,
+    0.000439, 0.000405, 0.000373, 0.000344,
+    0.000317, 0.000292, 0.000270, 0.000248,
+    0.000229, 0.000211, 0.000195, 0.000179,
+    0.000165, 0.000152, 0.000140, 0.000129,
+    0.000119, 0.000110, 0.000101, 0.000093,
+    0.000086, 0.000079, 0.000073, 0.000067,
+    0.000062, 0.000057, 0.000052, 0.000048,
+    0.000044, 0.000041, 0.000038, 0.000035,
+    0.000032, 0.000029, 0.000027, 0.000025,
+    0.000023, 0.000021, 0.000019, 0.000018,
+    0.000016, 0.000015, 0.000014, 0.000013,
+    0.000012, 0.000011, 0.000010, 0.000009,
+    0.000008, 0.000008, 0.000007, 0.000007,
+    0.000006, 0.000006, 0.000005, 0.000005,
+    0.000004, 0.000004, 0.000004, 0.000003,
+    0.000003, 0.000003, 0.000003, 0.000002,
+    0.000002, 0.000002, 0.000002, 0.000002,
+    0.000002, 0.000001, 0.000001, 0.000001,
+    0.000001, 0.000001, 0.000001, 0.000001,
+    0.000001, 0.000001, 0.000001, 0.000001,
+    0.000001, 0.000001, 0.000000, 0.000000,
+  };
+  const int rate_tab_num = sizeof(rate_tab)/sizeof(rate_tab[0]);
+  assert(x >= 0.0);
+  return linear_interpolate(x, rate_tab_num, rate_tab_step, rate_tab);
+}
+
+static double model_dist_norm(double x) {
+  // Normalized distortion
+  // This function models the normalized distortion for a Laplacian source
+  // source with given variance when quantized with a uniform quantizer
+  // with given stepsize. The closed form expression is:
+  // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
+  // where x = qpstep / sqrt(variance)
+  // Note the actual distortion is Dn * variance.
+  static const double dist_tab_step = 0.25;
+  static const double dist_tab[] = {
+    0.000000, 0.005189, 0.020533, 0.045381,
+    0.078716, 0.119246, 0.165508, 0.215979,
+    0.269166, 0.323686, 0.378318, 0.432034,
+    0.484006, 0.533607, 0.580389, 0.624063,
+    0.664475, 0.701581, 0.735418, 0.766092,
+    0.793751, 0.818575, 0.840761, 0.860515,
+    0.878045, 0.893554, 0.907238, 0.919281,
+    0.929857, 0.939124, 0.947229, 0.954306,
+    0.960475, 0.965845, 0.970512, 0.974563,
+    0.978076, 0.981118, 0.983750, 0.986024,
+    0.987989, 0.989683, 0.991144, 0.992402,
+    0.993485, 0.994417, 0.995218, 0.995905,
+    0.996496, 0.997002, 0.997437, 0.997809,
+    0.998128, 0.998401, 0.998635, 0.998835,
+    0.999006, 0.999152, 0.999277, 0.999384,
+    0.999475, 0.999553, 0.999619, 0.999676,
+    0.999724, 0.999765, 0.999800, 0.999830,
+    0.999855, 0.999877, 0.999895, 0.999911,
+    0.999924, 0.999936, 0.999945, 0.999954,
+    0.999961, 0.999967, 0.999972, 0.999976,
+    0.999980, 0.999983, 0.999985, 0.999988,
+    0.999989, 0.999991, 0.999992, 0.999994,
+    0.999995, 0.999995, 0.999996, 0.999997,
+    0.999997, 0.999998, 0.999998, 0.999998,
+    0.999999, 0.999999, 0.999999, 0.999999,
+    0.999999, 0.999999, 0.999999, 1.000000,
+  };
+  const int dist_tab_num = sizeof(dist_tab)/sizeof(dist_tab[0]);
+  assert(x >= 0.0);
+  return linear_interpolate(x, dist_tab_num, dist_tab_step, dist_tab);
+}
+
+static void model_rd_from_var_lapndz(int var, int n, int qstep,
+                                     int *rate, int64_t *dist) {
+  // This function models the rate and distortion for a Laplacian
+  // source with given variance when quantized with a uniform quantizer
+  // with given stepsize. The closed form expression is:
+  // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
+  // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance)
   vp9_clear_system_state();
   if (var == 0 || n == 0) {
     *rate = 0;
@@ -1819,29 +1922,18 @@ static void model_rd_from_var_lapndz(int var, int n, int qstep,
   } else {
     double D, R;
     double s2 = (double) var / n;
-    double s = sqrt(s2);
-    double x = qstep / s;
-    if (x > 1.0) {
-      double y = exp(-x / 2);
-      double y2 = y * y;
-      D = 2.069981728764738 * y2 - 2.764286806516079 * y + 1.003956960819275;
-      R = 0.924056758535089 * y2 + 2.738636469814024 * y - 0.005169662030017;
-    } else {
-      double x2 = x * x;
-      D = 0.075303187668830 * x2 + 0.004296954321112 * x - 0.000413209252807;
-      if (x > 0.125)
-        R = 1 / (-0.03459733614226 * x2 + 0.36561675733603 * x +
-                 0.1626989668625);
-      else
-        R = -1.442252874826093 * log(x) + 1.944647760719664;
-    }
+    double x = qstep / sqrt(s2);
+    // TODO(debargha): Make the modeling functions take (qstep^2 / s2)
+    // as argument rather than qstep / sqrt(s2) to obviate the need for
+    // the sqrt() operation.
+    D = model_dist_norm(x);
+    R = model_rate_norm(x);
     if (R < 0) {
-      *rate = 0;
-      *dist = var;
-    } else {
-      *rate = (n * R * 256 + 0.5);
-      *dist = (n * D * s2 + 0.5);
+      R = 0;
+      D = var;
     }
+    *rate = (n * R * 256 + 0.5);
+    *dist = (n * D * s2 + 0.5);
   }
   vp9_clear_system_state();
 }
@@ -1854,12 +1946,13 @@ static enum BlockSize get_plane_block_size(BLOCK_SIZE_TYPE bsize,
 
 static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
                             MACROBLOCK *x, MACROBLOCKD *xd,
-                            int *out_rate_sum, int *out_dist_sum) {
+                            int *out_rate_sum, int64_t *out_dist_sum) {
   // Note our transform coeffs are 8 times an orthogonal transform.
   // Hence quantizer step is also 8 times. To get effective quantizer
   // we need to divide by 8 before sending to modeling function.
-  unsigned int sse, var;
-  int i, rate_sum = 0, dist_sum = 0;
+  unsigned int sse;
+  int i, rate_sum = 0;
+  int64_t dist_sum = 0;
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     struct macroblock_plane *const p = &x->plane[i];
@@ -1869,17 +1962,18 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
     const int bw = plane_block_width(bsize, pd);
     const int bh = plane_block_height(bsize, pd);
     const enum BlockSize bs = get_block_size(bw, bh);
-    int rate, dist;
-    var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
-                             pd->dst.buf, pd->dst.stride, &sse);
-    model_rd_from_var_lapndz(var, bw * bh, pd->dequant[1] >> 3, &rate, &dist);
+    int rate;
+    int64_t dist;
+    cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
+                       pd->dst.buf, pd->dst.stride, &sse);
+    model_rd_from_var_lapndz(sse, bw * bh, pd->dequant[1] >> 3, &rate, &dist);
 
     rate_sum += rate;
     dist_sum += dist;
   }
 
   *out_rate_sum = rate_sum;
-  *out_dist_sum = dist_sum;
+  *out_dist_sum = dist_sum << 4;
 }
 
 static INLINE int get_switchable_rate(VP9_COMMON *cm, MACROBLOCK *x) {
@@ -2134,9 +2228,10 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
 static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                  BLOCK_SIZE_TYPE bsize,
                                  int64_t txfm_cache[],
-                                 int *rate2, int *distortion, int *skippable,
-                                 int *rate_y, int *distortion_y,
-                                 int *rate_uv, int *distortion_uv,
+                                 int *rate2, int64_t *distortion,
+                                 int *skippable,
+                                 int *rate_y, int64_t *distortion_y,
+                                 int *rate_uv, int64_t *distortion_uv,
                                  int *mode_excluded, int *disable_skip,
                                  INTERPOLATIONFILTERTYPE *best_filter,
                                  int_mv *frame_mv,
@@ -2236,11 +2331,12 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         (mbmi->mv[1].as_mv.col & 15) == 0;
   // Search for best switchable filter by checking the variance of
   // pred error irrespective of whether the filter will be used
-  if (cpi->speed > 4) {
+  if (cpi->sf.use_8tap_always) {
     *best_filter = EIGHTTAP;
   } else {
     int i, newbest;
-    int tmp_rate_sum = 0, tmp_dist_sum = 0;
+    int tmp_rate_sum = 0;
+    int64_t tmp_dist_sum = 0;
     for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
       int rs = 0;
       const INTERPOLATIONFILTERTYPE filter = vp9_switchable_interp[i];
@@ -2255,7 +2351,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       if (interpolating_intpel_seen && is_intpel_interp) {
         rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate_sum, tmp_dist_sum);
       } else {
-        int rate_sum = 0, dist_sum = 0;
+        int rate_sum = 0;
+        int64_t dist_sum = 0;
         vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
         model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum);
         rd = RDCOST(x->rdmult, x->rddiv, rs + rate_sum, dist_sum);
@@ -2399,19 +2496,20 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 }
 
 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
-                               int *returnrate, int *returndist,
+                               int *returnrate, int64_t *returndist,
                                BLOCK_SIZE_TYPE bsize,
                                PICK_MODE_CONTEXT *ctx) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
-  int rate_y = 0, rate_uv;
-  int rate_y_tokenonly = 0, rate_uv_tokenonly;
-  int dist_y = 0, dist_uv;
-  int y_skip = 0, uv_skip;
+  int rate_y = 0, rate_uv = 0;
+  int rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
+  int64_t dist_y = 0, dist_uv = 0;
+  int y_skip = 0, uv_skip = 0;
   int64_t txfm_cache[NB_TXFM_MODES], err;
   MB_PREDICTION_MODE mode;
   TX_SIZE txfm_size;
-  int rate4x4_y, rate4x4_y_tokenonly, dist4x4_y;
+  int rate4x4_y, rate4x4_y_tokenonly;
+  int64_t dist4x4_y;
   int64_t err4x4 = INT64_MAX;
   int i;
 
@@ -2462,7 +2560,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                   int mi_row, int mi_col,
                                   int *returnrate,
-                                  int *returndistortion,
+                                  int64_t *returndistortion,
                                   BLOCK_SIZE_TYPE bsize,
                                   PICK_MODE_CONTEXT *ctx) {
   VP9_COMMON *cm = &cpi->common;
@@ -2497,7 +2595,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE;
   INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE;
   int rate_uv_intra[TX_SIZE_MAX_SB], rate_uv_tokenonly[TX_SIZE_MAX_SB];
-  int dist_uv[TX_SIZE_MAX_SB], skip_uv[TX_SIZE_MAX_SB];
+  int64_t dist_uv[TX_SIZE_MAX_SB];
+  int skip_uv[TX_SIZE_MAX_SB];
   MB_PREDICTION_MODE mode_uv[TX_SIZE_MAX_SB];
   struct scale_factors scale_factor[4];
   unsigned int ref_frame_mask = 0;
@@ -2536,7 +2635,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     best_txfm_rd[i] = INT64_MAX;
 
   // Create a mask set to 1 for each frame used by a smaller resolution.
-  if (cpi->speed > 0) {
+  if (cpi->sf.use_avoid_tested_higherror) {
     switch (block_size) {
       case BLOCK_64X64:
         for (i = 0; i < 4; i++) {
@@ -2576,8 +2675,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
     frame_mv[ZEROMV][ref_frame].as_int = 0;
   }
-  if (cpi->speed == 0
-      || (cpi->speed > 0 && (ref_frame_mask & (1 << INTRA_FRAME)))) {
+  if (!cpi->sf.use_avoid_tested_higherror
+      || (cpi->sf.use_avoid_tested_higherror
+          && (ref_frame_mask & (1 << INTRA_FRAME)))) {
     mbmi->mode = DC_PRED;
     mbmi->ref_frame[0] = INTRA_FRAME;
     for (i = 0; i <= (bsize < BLOCK_SIZE_MB16X16 ? TX_4X4 :
@@ -2599,7 +2699,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     int disable_skip = 0;
     int compmode_cost = 0;
     int rate2 = 0, rate_y = 0, rate_uv = 0;
-    int distortion2 = 0, distortion_y = 0, distortion_uv = 0;
+    int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
     int skippable;
     int64_t txfm_cache[NB_TXFM_MODES];
     int i;
@@ -2623,7 +2723,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     this_mode = vp9_mode_order[mode_index].mode;
     ref_frame = vp9_mode_order[mode_index].ref_frame;
 
-    if (cpi->speed > 0 && bsize >= BLOCK_SIZE_SB8X8) {
+    if (cpi->sf.use_avoid_tested_higherror && bsize >= BLOCK_SIZE_SB8X8) {
       if (!(ref_frame_mask & (1 << ref_frame))) {
         continue;
       }
@@ -2786,11 +2886,13 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       distortion2 = distortion_y + distortion_uv;
     } else if (this_mode == SPLITMV) {
       const int is_comp_pred = mbmi->ref_frame[1] > 0;
-      int rate, distortion;
+      int rate;
+      int64_t distortion;
       int64_t this_rd_thresh;
       int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;
       int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;
-      int tmp_best_distortion = INT_MAX, tmp_best_skippable = 0;
+      int64_t tmp_best_distortion = INT_MAX;
+      int tmp_best_skippable = 0;
       int switchable_filter_index;
       int_mv *second_ref = is_comp_pred ?
           &mbmi->ref_mvs[mbmi->ref_frame[1]][0] : NULL;
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index dcf5d00e9..67ef73db7 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -20,12 +20,12 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex);
 void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex);
 
 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
-                               int *r, int *d, BLOCK_SIZE_TYPE bsize,
+                               int *r, int64_t *d, BLOCK_SIZE_TYPE bsize,
                                PICK_MODE_CONTEXT *ctx);
 
 int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                   int mi_row, int mi_col,
-                                  int *r, int *d, BLOCK_SIZE_TYPE bsize,
+                                  int *r, int64_t *d, BLOCK_SIZE_TYPE bsize,
                                   PICK_MODE_CONTEXT *ctx);
 
 void vp9_init_me_luts();
diff --git a/vp9/encoder/x86/vp9_encodeopt.asm b/vp9/encoder/x86/vp9_encodeopt.asm
deleted file mode 100644
index 734cb61ca..000000000
--- a/vp9/encoder/x86/vp9_encodeopt.asm
+++ /dev/null
@@ -1,125 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;int vp9_block_error_xmm(short *coeff_ptr,  short *dcoef_ptr)
-global sym(vp9_block_error_xmm) PRIVATE
-sym(vp9_block_error_xmm):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 2
-    push rsi
-    push rdi
-    ; end prologue
-
-        mov         rsi,        arg(0) ;coeff_ptr
-        mov         rdi,        arg(1) ;dcoef_ptr
-
-        movdqa      xmm0,       [rsi]
-        movdqa      xmm1,       [rdi]
-
-        movdqa      xmm2,       [rsi+16]
-        movdqa      xmm3,       [rdi+16]
-
-        psubw       xmm0,       xmm1
-        psubw       xmm2,       xmm3
-
-        pmaddwd     xmm0,       xmm0
-        pmaddwd     xmm2,       xmm2
-
-        paddd       xmm0,       xmm2
-
-        pxor        xmm5,       xmm5
-        movdqa      xmm1,       xmm0
-
-        punpckldq   xmm0,       xmm5
-        punpckhdq   xmm1,       xmm5
-
-        paddd       xmm0,       xmm1
-        movdqa      xmm1,       xmm0
-
-        psrldq      xmm0,       8
-        paddd       xmm0,       xmm1
-
-        movq        rax,        xmm0
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;int vp9_block_error_mmx(short *coeff_ptr,  short *dcoef_ptr)
-global sym(vp9_block_error_mmx) PRIVATE
-sym(vp9_block_error_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 2
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov         rsi,        arg(0) ;coeff_ptr
-        pxor        mm7,        mm7
-
-        mov         rdi,        arg(1) ;dcoef_ptr
-        movq        mm3,        [rsi]
-
-        movq        mm4,        [rdi]
-        movq        mm5,        [rsi+8]
-
-        movq        mm6,        [rdi+8]
-        pxor        mm1,        mm1 ; from movd mm1, dc ; dc =0
-
-        movq        mm2,        mm7
-        psubw       mm5,        mm6
-
-        por         mm1,        mm2
-        pmaddwd     mm5,        mm5
-
-        pcmpeqw     mm1,        mm7
-        psubw       mm3,        mm4
-
-        pand        mm1,        mm3
-        pmaddwd     mm1,        mm1
-
-        paddd       mm1,        mm5
-        movq        mm3,        [rsi+16]
-
-        movq        mm4,        [rdi+16]
-        movq        mm5,        [rsi+24]
-
-        movq        mm6,        [rdi+24]
-        psubw       mm5,        mm6
-
-        pmaddwd     mm5,        mm5
-        psubw       mm3,        mm4
-
-        pmaddwd     mm3,        mm3
-        paddd       mm3,        mm5
-
-        paddd       mm1,        mm3
-        movq        mm0,        mm1
-
-        psrlq       mm1,        32
-        paddd       mm0,        mm1
-
-        movq        rax,        mm0
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/vp9/encoder/x86/vp9_error_sse2.asm b/vp9/encoder/x86/vp9_error_sse2.asm
new file mode 100644
index 000000000..bb1ea71b9
--- /dev/null
+++ b/vp9/encoder/x86/vp9_error_sse2.asm
@@ -0,0 +1,57 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; void vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size)
+
+INIT_XMM sse2
+cglobal block_error, 3, 3, 6, uqc, dqc, size
+  pxor      m4, m4                 ; accumulator
+  pxor      m5, m5                 ; dedicated zero register
+  lea     uqcq, [uqcq+sizeq*2]
+  lea     dqcq, [dqcq+sizeq*2]
+  neg    sizeq
+.loop:
+  mova      m0, [uqcq+sizeq*2]
+  mova      m2, [dqcq+sizeq*2]
+  mova      m1, [uqcq+sizeq*2+mmsize]
+  mova      m3, [dqcq+sizeq*2+mmsize]
+  psubw     m0, m2
+  psubw     m1, m3
+  ; individual errors are max. 15bit+sign, so squares are 30bit, and
+  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+  pmaddwd   m0, m0
+  pmaddwd   m1, m1
+  ; accumulate in 64bit
+  punpckldq m2, m0, m5
+  punpckhdq m0, m5
+  punpckldq m3, m1, m5
+  punpckhdq m1, m5
+  paddq     m4, m2
+  paddq     m4, m0
+  paddq     m4, m3
+  paddq     m4, m1
+  add    sizeq, mmsize
+  jl .loop
+
+  ; accumulate horizontally and store in return value
+  movhlps   m5, m4
+  paddq     m4, m5
+%if ARCH_X86_64
+  movq    rax, m4
+%else
+  pshufd   m5, m4, 0x1
+  movd    eax, m4
+  movd    edx, m5
+%endif
+  RET
diff --git a/vp9/encoder/x86/vp9_subpel_variance.asm b/vp9/encoder/x86/vp9_subpel_variance.asm
new file mode 100644
index 000000000..19e2feb57
--- /dev/null
+++ b/vp9/encoder/x86/vp9_subpel_variance.asm
@@ -0,0 +1,1288 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_8: times  8 dw  8
+bilin_filter_m_sse2: times  8 dw 16
+                     times  8 dw  0
+                     times  8 dw 15
+                     times  8 dw  1
+                     times  8 dw 14
+                     times  8 dw  2
+                     times  8 dw 13
+                     times  8 dw  3
+                     times  8 dw 12
+                     times  8 dw  4
+                     times  8 dw 11
+                     times  8 dw  5
+                     times  8 dw 10
+                     times  8 dw  6
+                     times  8 dw  9
+                     times  8 dw  7
+                     times 16 dw  8
+                     times  8 dw  7
+                     times  8 dw  9
+                     times  8 dw  6
+                     times  8 dw 10
+                     times  8 dw  5
+                     times  8 dw 11
+                     times  8 dw  4
+                     times  8 dw 12
+                     times  8 dw  3
+                     times  8 dw 13
+                     times  8 dw  2
+                     times  8 dw 14
+                     times  8 dw  1
+                     times  8 dw 15
+
+bilin_filter_m_ssse3: times  8 db 16,  0
+                      times  8 db 15,  1
+                      times  8 db 14,  2
+                      times  8 db 13,  3
+                      times  8 db 12,  4
+                      times  8 db 11,  5
+                      times  8 db 10,  6
+                      times  8 db  9,  7
+                      times 16 db  8
+                      times  8 db  7,  9
+                      times  8 db  6, 10
+                      times  8 db  5, 11
+                      times  8 db  4, 12
+                      times  8 db  3, 13
+                      times  8 db  2, 14
+                      times  8 db  1, 15
+
+SECTION .text
+
+; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
+;                               int x_offset, int y_offset,
+;                               const uint8_t *dst, ptrdiff_t dst_stride,
+;                               int height, unsigned int *sse);
+;
+; This function returns the SE and stores SSE in the given pointer.
+
+%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
+  psubw                %3, %4
+  psubw                %1, %2
+  paddw                %5, %3
+  pmaddwd              %3, %3
+  paddw                %5, %1
+  pmaddwd              %1, %1
+  paddd                %6, %3
+  paddd                %6, %1
+%endmacro
+
+%macro STORE_AND_RET 0
+%if mmsize == 16
+  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
+  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
+  ; We have to sign-extend it before adding the words within the register
+  ; and outputing to a dword.
+  pcmpgtw              m5, m6           ; mask for 0 > x
+  movhlps              m3, m7
+  punpcklwd            m4, m6, m5
+  punpckhwd            m6, m5           ; sign-extend m6 word->dword
+  paddd                m7, m3
+  paddd                m6, m4
+  pshufd               m3, m7, 0x1
+  movhlps              m4, m6
+  paddd                m7, m3
+  paddd                m6, m4
+  mov                  r1, ssem         ; r1 = unsigned int *sse
+  pshufd               m4, m6, 0x1
+  movd               [r1], m7           ; store sse
+  paddd                m6, m4
+  movd                rax, m6           ; store sum as return value
+%else ; mmsize == 8
+  pshufw               m4, m6, 0xe
+  pshufw               m3, m7, 0xe
+  paddw                m6, m4
+  paddd                m7, m3
+  pcmpgtw              m5, m6           ; mask for 0 > x
+  mov                  r1, ssem         ; r1 = unsigned int *sse
+  punpcklwd            m6, m5           ; sign-extend m6 word->dword
+  movd               [r1], m7           ; store sse
+  pshufw               m4, m6, 0xe
+  paddd                m6, m4
+  movd                rax, m6           ; store sum as return value
+%endif
+  RET
+%endmacro
+
+%macro SUBPEL_VARIANCE 1-2 0 ; W
+%if cpuflag(ssse3)
+%define bilin_filter_m bilin_filter_m_ssse3
+%define filter_idx_shift 4
+%else
+%define bilin_filter_m bilin_filter_m_sse2
+%define filter_idx_shift 5
+%endif
+; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
+; 11, not 13, if the registers are ordered correctly. May make a minor speed
+; difference on Win64
+%ifdef PIC
+%if %2 == 1 ; avg
+cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+                                              x_offset, y_offset, \
+                                              dst, dst_stride, \
+                                              sec, sec_stride, height, sse
+%define sec_str sec_strideq
+%else
+cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, y_offset, \
+                                          dst, dst_stride, height, sse
+%endif
+%define h heightd
+%define bilin_filter sseq
+%else
+%if %2 == 1 ; avg
+cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
+                                    7 + 2 * ARCH_X86_64, 13, src, src_stride, \
+                                                         x_offset, y_offset, \
+                                                         dst, dst_stride, \
+                                                         sec, sec_stride, \
+                                                         height, sse
+%if ARCH_X86_64
+%define h heightd
+%define sec_str sec_strideq
+%else
+%define h dword heightm
+%define sec_str sec_stridemp
+%endif
+%else
+cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
+                                          dst, dst_stride, height, sse
+%define h heightd
+%endif
+%define bilin_filter bilin_filter_m
+%endif
+  ASSERT               %1 <= 16         ; m6 overflows if w > 16
+  pxor                 m6, m6           ; sum
+  pxor                 m7, m7           ; sse
+  ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
+  ; could perhaps use it for something more productive then
+  pxor                 m5, m5           ; dedicated zero register
+%if %1 < 16
+  sar                   h, 1
+%if %2 == 1 ; avg
+  shl             sec_str, 1
+%endif
+%endif
+
+  ; FIXME(rbultje) replace by jumptable?
+  test          x_offsetd, x_offsetd
+  jnz .x_nonzero
+  ; x_offset == 0
+  test          y_offsetd, y_offsetd
+  jnz .x_zero_y_nonzero
+
+  ; x_offset == 0 && y_offset == 0
+.x_zero_y_zero_loop:
+%if %1 == 16
+  movu                 m0, [srcq]
+  mova                 m1, [dstq]
+%if %2 == 1 ; avg
+  pavgb                m0, [secq]
+  punpckhbw            m3, m1, m5
+  punpcklbw            m1, m5
+%endif
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%if %2 == 0 ; !avg
+  punpckhbw            m3, m1, m5
+  punpcklbw            m1, m5
+%endif
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+
+  add                srcq, src_strideq
+  add                dstq, dst_strideq
+%else ; %1 < 16
+  movh                 m0, [srcq]
+%if %2 == 1 ; avg
+%if mmsize == 16
+  movhps               m0, [srcq+src_strideq]
+%else ; mmsize == 8
+  punpckldq            m0, [srcq+src_strideq]
+%endif
+%else ; !avg
+  movh                 m2, [srcq+src_strideq]
+%endif
+  movh                 m1, [dstq]
+  movh                 m3, [dstq+dst_strideq]
+%if %2 == 1 ; avg
+  pavgb                m0, [secq]
+  punpcklbw            m3, m5
+  punpcklbw            m1, m5
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%else ; !avg
+  punpcklbw            m0, m5
+  punpcklbw            m2, m5
+  punpcklbw            m3, m5
+  punpcklbw            m1, m5
+%endif
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+
+  lea                srcq, [srcq+src_strideq*2]
+  lea                dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+  dec                   h
+  jg .x_zero_y_zero_loop
+  STORE_AND_RET
+
+.x_zero_y_nonzero:
+  cmp           y_offsetd, 8
+  jne .x_zero_y_nonhalf
+
+  ; x_offset == 0 && y_offset == 0.5
+.x_zero_y_half_loop:
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m4, [srcq+src_strideq]
+  mova                 m1, [dstq]
+  pavgb                m0, m4
+  punpckhbw            m3, m1, m5
+%if %2 == 1 ; avg
+  pavgb                m0, [secq]
+%endif
+  punpcklbw            m1, m5
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+
+  add                srcq, src_strideq
+  add                dstq, dst_strideq
+%else ; %1 < 16
+  movh                 m0, [srcq]
+  movh                 m2, [srcq+src_strideq]
+%if %2 == 1 ; avg
+%if mmsize == 16
+  movhps               m2, [srcq+src_strideq*2]
+%else ; mmsize == 8
+  punpckldq            m2, [srcq+src_strideq*2]
+%endif
+  movh                 m1, [dstq]
+%if mmsize == 16
+  movlhps              m0, m2
+%else ; mmsize == 8
+  punpckldq            m0, m2
+%endif
+  movh                 m3, [dstq+dst_strideq]
+  pavgb                m0, m2
+  punpcklbw            m1, m5
+  pavgb                m0, [secq]
+  punpcklbw            m3, m5
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%else ; !avg
+  movh                 m4, [srcq+src_strideq*2]
+  movh                 m1, [dstq]
+  pavgb                m0, m2
+  movh                 m3, [dstq+dst_strideq]
+  pavgb                m2, m4
+  punpcklbw            m0, m5
+  punpcklbw            m2, m5
+  punpcklbw            m3, m5
+  punpcklbw            m1, m5
+%endif
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+
+  lea                srcq, [srcq+src_strideq*2]
+  lea                dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+  dec                   h
+  jg .x_zero_y_half_loop
+  STORE_AND_RET
+
+.x_zero_y_nonhalf:
+  ; x_offset == 0 && y_offset == bilin interpolation
+%ifdef PIC
+  lea        bilin_filter, [bilin_filter_m]
+%endif
+  shl           y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova                 m8, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+  mova                 m9, [bilin_filter+y_offsetq+16]
+%endif
+  mova                m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32 or mmx
+  add           y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+.x_zero_y_other_loop:
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m4, [srcq+src_strideq]
+  mova                 m1, [dstq]
+%if cpuflag(ssse3)
+  punpckhbw            m2, m0, m4
+  punpcklbw            m0, m4
+  pmaddubsw            m2, filter_y_a
+  pmaddubsw            m0, filter_y_a
+  paddw                m2, filter_rnd
+  paddw                m0, filter_rnd
+%else
+  punpckhbw            m2, m0, m5
+  punpckhbw            m3, m4, m5
+  punpcklbw            m0, m5
+  punpcklbw            m4, m5
+  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
+  ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
+  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
+  ; slightly faster because of pmullw latency. It would also cut our rodata
+  ; tables in half for this function, and save 1-2 registers on x86-64.
+  pmullw               m2, filter_y_a
+  pmullw               m3, filter_y_b
+  paddw                m2, filter_rnd
+  pmullw               m0, filter_y_a
+  pmullw               m4, filter_y_b
+  paddw                m0, filter_rnd
+  paddw                m2, m3
+  paddw                m0, m4
+%endif
+  psraw                m2, 4
+  psraw                m0, 4
+%if %2 == 1 ; avg
+  ; FIXME(rbultje) pipeline
+  packuswb             m0, m2
+  pavgb                m0, [secq]
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%endif
+  punpckhbw            m3, m1, m5
+  punpcklbw            m1, m5
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+
+  add                srcq, src_strideq
+  add                dstq, dst_strideq
+%else ; %1 < 16
+  movh                 m0, [srcq]
+  movh                 m2, [srcq+src_strideq]
+  movh                 m4, [srcq+src_strideq*2]
+  movh                 m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+  movh                 m1, [dstq]
+  punpcklbw            m0, m2
+  punpcklbw            m2, m4
+  pmaddubsw            m0, filter_y_a
+  pmaddubsw            m2, filter_y_a
+  punpcklbw            m3, m5
+  paddw                m2, filter_rnd
+  paddw                m0, filter_rnd
+%else
+  punpcklbw            m0, m5
+  punpcklbw            m2, m5
+  punpcklbw            m4, m5
+  pmullw               m0, filter_y_a
+  pmullw               m1, m2, filter_y_b
+  punpcklbw            m3, m5
+  paddw                m0, filter_rnd
+  pmullw               m2, filter_y_a
+  pmullw               m4, filter_y_b
+  paddw                m0, m1
+  paddw                m2, filter_rnd
+  movh                 m1, [dstq]
+  paddw                m2, m4
+%endif
+  psraw                m0, 4
+  psraw                m2, 4
+%if %2 == 1 ; avg
+  ; FIXME(rbultje) pipeline
+  packuswb             m0, m2
+  pavgb                m0, [secq]
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%endif
+  punpcklbw            m1, m5
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+
+  lea                srcq, [srcq+src_strideq*2]
+  lea                dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+  dec                   h
+  jg .x_zero_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+  STORE_AND_RET
+
+.x_nonzero:
+  cmp           x_offsetd, 8
+  jne .x_nonhalf
+  ; x_offset == 0.5
+  test          y_offsetd, y_offsetd
+  jnz .x_half_y_nonzero
+
+  ; x_offset == 0.5 && y_offset == 0
+.x_half_y_zero_loop:
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m4, [srcq+1]
+  mova                 m1, [dstq]
+  pavgb                m0, m4
+  punpckhbw            m3, m1, m5
+%if %2 == 1 ; avg
+  pavgb                m0, [secq]
+%endif
+  punpcklbw            m1, m5
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+
+  add                srcq, src_strideq
+  add                dstq, dst_strideq
+%else ; %1 < 16
+  movh                 m0, [srcq]
+  movh                 m4, [srcq+1]
+%if %2 == 1 ; avg
+%if mmsize == 16
+  movhps               m0, [srcq+src_strideq]
+  movhps               m4, [srcq+src_strideq+1]
+%else ; mmsize == 8
+  punpckldq            m0, [srcq+src_strideq]
+  punpckldq            m4, [srcq+src_strideq+1]
+%endif
+  movh                 m1, [dstq]
+  movh                 m3, [dstq+dst_strideq]
+  pavgb                m0, m4
+  punpcklbw            m3, m5
+  pavgb                m0, [secq]
+  punpcklbw            m1, m5
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%else ; !avg
+  movh                 m2, [srcq+src_strideq]
+  movh                 m1, [dstq]
+  pavgb                m0, m4
+  movh                 m4, [srcq+src_strideq+1]
+  movh                 m3, [dstq+dst_strideq]
+  pavgb                m2, m4
+  punpcklbw            m0, m5
+  punpcklbw            m2, m5
+  punpcklbw            m3, m5
+  punpcklbw            m1, m5
+%endif
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+
+  lea                srcq, [srcq+src_strideq*2]
+  lea                dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+  dec                   h
+  jg .x_half_y_zero_loop
+  STORE_AND_RET
+
+.x_half_y_nonzero:
+  cmp           y_offsetd, 8
+  jne .x_half_y_nonhalf
+
+  ; x_offset == 0.5 && y_offset == 0.5
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m3, [srcq+1]
+  add                srcq, src_strideq
+  pavgb                m0, m3
+.x_half_y_half_loop:
+  movu                 m4, [srcq]
+  movu                 m3, [srcq+1]
+  mova                 m1, [dstq]
+  pavgb                m4, m3
+  punpckhbw            m3, m1, m5
+  pavgb                m0, m4
+%if %2 == 1 ; avg
+  punpcklbw            m1, m5
+  pavgb                m0, [secq]
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%else
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+  punpcklbw            m1, m5
+%endif
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+  mova                 m0, m4
+
+  add                srcq, src_strideq
+  add                dstq, dst_strideq
+%else ; %1 < 16
+  movh                 m0, [srcq]
+  movh                 m3, [srcq+1]
+  add                srcq, src_strideq
+  pavgb                m0, m3
+.x_half_y_half_loop:
+  movh                 m2, [srcq]
+  movh                 m3, [srcq+1]
+%if %2 == 1 ; avg
+%if mmsize == 16
+  movhps               m2, [srcq+src_strideq]
+  movhps               m3, [srcq+src_strideq+1]
+%else
+  punpckldq            m2, [srcq+src_strideq]
+  punpckldq            m3, [srcq+src_strideq+1]
+%endif
+  pavgb                m2, m3
+%if mmsize == 16
+  movlhps              m0, m2
+  movhlps              m4, m2
+%else ; mmsize == 8
+  punpckldq            m0, m2
+  pshufw               m4, m2, 0xe
+%endif
+  movh                 m1, [dstq]
+  pavgb                m0, m2
+  movh                 m3, [dstq+dst_strideq]
+  pavgb                m0, [secq]
+  punpcklbw            m3, m5
+  punpcklbw            m1, m5
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%else ; !avg
+  movh                 m4, [srcq+src_strideq]
+  movh                 m1, [srcq+src_strideq+1]
+  pavgb                m2, m3
+  pavgb                m4, m1
+  pavgb                m0, m2
+  pavgb                m2, m4
+  movh                 m1, [dstq]
+  movh                 m3, [dstq+dst_strideq]
+  punpcklbw            m0, m5
+  punpcklbw            m2, m5
+  punpcklbw            m3, m5
+  punpcklbw            m1, m5
+%endif
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+  mova                 m0, m4
+
+  lea                srcq, [srcq+src_strideq*2]
+  lea                dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+  dec                   h
+  jg .x_half_y_half_loop
+  STORE_AND_RET
+
+.x_half_y_nonhalf:
+  ; x_offset == 0.5 && y_offset == bilin interpolation
+%ifdef PIC
+  lea        bilin_filter, [bilin_filter_m]
+%endif
+  shl           y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova                 m8, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+  mova                 m9, [bilin_filter+y_offsetq+16]
+%endif
+  mova                m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else
+  add           y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m3, [srcq+1]
+  add                srcq, src_strideq
+  pavgb                m0, m3
+.x_half_y_other_loop:
+  movu                 m4, [srcq]
+  movu                 m2, [srcq+1]
+  mova                 m1, [dstq]
+  pavgb                m4, m2
+%if cpuflag(ssse3)
+  punpckhbw            m2, m0, m4
+  punpcklbw            m0, m4
+  pmaddubsw            m2, filter_y_a
+  pmaddubsw            m0, filter_y_a
+  paddw                m2, filter_rnd
+  paddw                m0, filter_rnd
+  psraw                m2, 4
+%else
+  punpckhbw            m2, m0, m5
+  punpckhbw            m3, m4, m5
+  pmullw               m2, filter_y_a
+  pmullw               m3, filter_y_b
+  paddw                m2, filter_rnd
+  punpcklbw            m0, m5
+  paddw                m2, m3
+  punpcklbw            m3, m4, m5
+  pmullw               m0, filter_y_a
+  pmullw               m3, filter_y_b
+  paddw                m0, filter_rnd
+  psraw                m2, 4
+  paddw                m0, m3
+%endif
+  punpckhbw            m3, m1, m5
+  psraw                m0, 4
+%if %2 == 1 ; avg
+  ; FIXME(rbultje) pipeline
+  packuswb             m0, m2
+  pavgb                m0, [secq]
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%endif
+  punpcklbw            m1, m5
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+  mova                 m0, m4
+
+  add                srcq, src_strideq
+  add                dstq, dst_strideq
+%else ; %1 < 16
+  movh                 m0, [srcq]
+  movh                 m3, [srcq+1]
+  add                srcq, src_strideq
+  pavgb                m0, m3
+%if notcpuflag(ssse3)
+  punpcklbw            m0, m5
+%endif
+.x_half_y_other_loop:
+  movh                 m2, [srcq]
+  movh                 m1, [srcq+1]
+  movh                 m4, [srcq+src_strideq]
+  movh                 m3, [srcq+src_strideq+1]
+  pavgb                m2, m1
+  pavgb                m4, m3
+  movh                 m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+  movh                 m1, [dstq]
+  punpcklbw            m0, m2
+  punpcklbw            m2, m4
+  pmaddubsw            m0, filter_y_a
+  pmaddubsw            m2, filter_y_a
+  punpcklbw            m3, m5
+  paddw                m0, filter_rnd
+  paddw                m2, filter_rnd
+%else
+  punpcklbw            m2, m5
+  punpcklbw            m4, m5
+  pmullw               m0, filter_y_a
+  pmullw               m1, m2, filter_y_b
+  punpcklbw            m3, m5
+  paddw                m0, filter_rnd
+  pmullw               m2, filter_y_a
+  paddw                m0, m1
+  pmullw               m1, m4, filter_y_b
+  paddw                m2, filter_rnd
+  paddw                m2, m1
+  movh                 m1, [dstq]
+%endif
+  psraw                m0, 4
+  psraw                m2, 4
+%if %2 == 1 ; avg
+  ; FIXME(rbultje) pipeline
+  packuswb             m0, m2
+  pavgb                m0, [secq]
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%endif
+  punpcklbw            m1, m5
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+  mova                 m0, m4
+
+  lea                srcq, [srcq+src_strideq*2]
+  lea                dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+  dec                   h
+  jg .x_half_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+  STORE_AND_RET
+
+.x_nonhalf:
+  test          y_offsetd, y_offsetd
+  jnz .x_nonhalf_y_nonzero
+
+  ; x_offset == bilin interpolation && y_offset == 0
+%ifdef PIC
+  lea        bilin_filter, [bilin_filter_m]
+%endif
+  shl           x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova                 m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+  mova                 m9, [bilin_filter+x_offsetq+16]
+%endif
+  mova                m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else
+  add           x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+.x_other_y_zero_loop:
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m4, [srcq+1]
+  mova                 m1, [dstq]
+%if cpuflag(ssse3)
+  punpckhbw            m2, m0, m4
+  punpcklbw            m0, m4
+  pmaddubsw            m2, filter_x_a
+  pmaddubsw            m0, filter_x_a
+  paddw                m2, filter_rnd
+  paddw                m0, filter_rnd
+%else
+  punpckhbw            m2, m0, m5
+  punpckhbw            m3, m4, m5
+  punpcklbw            m0, m5
+  punpcklbw            m4, m5
+  pmullw               m2, filter_x_a
+  pmullw               m3, filter_x_b
+  paddw                m2, filter_rnd
+  pmullw               m0, filter_x_a
+  pmullw               m4, filter_x_b
+  paddw                m0, filter_rnd
+  paddw                m2, m3
+  paddw                m0, m4
+%endif
+  psraw                m2, 4
+  psraw                m0, 4
+%if %2 == 1 ; avg
+  ; FIXME(rbultje) pipeline
+  packuswb             m0, m2
+  pavgb                m0, [secq]
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%endif
+  punpckhbw            m3, m1, m5
+  punpcklbw            m1, m5
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+
+  add                srcq, src_strideq
+  add                dstq, dst_strideq
+%else ; %1 < 16
+  movh                 m0, [srcq]
+  movh                 m1, [srcq+1]
+  movh                 m2, [srcq+src_strideq]
+  movh                 m4, [srcq+src_strideq+1]
+  movh                 m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+  punpcklbw            m0, m1
+  movh                 m1, [dstq]
+  punpcklbw            m2, m4
+  pmaddubsw            m0, filter_x_a
+  pmaddubsw            m2, filter_x_a
+  punpcklbw            m3, m5
+  paddw                m0, filter_rnd
+  paddw                m2, filter_rnd
+%else
+  punpcklbw            m0, m5
+  punpcklbw            m1, m5
+  punpcklbw            m2, m5
+  punpcklbw            m4, m5
+  pmullw               m0, filter_x_a
+  pmullw               m1, filter_x_b
+  punpcklbw            m3, m5
+  paddw                m0, filter_rnd
+  pmullw               m2, filter_x_a
+  pmullw               m4, filter_x_b
+  paddw                m0, m1
+  paddw                m2, filter_rnd
+  movh                 m1, [dstq]
+  paddw                m2, m4
+%endif
+  psraw                m0, 4
+  psraw                m2, 4
+%if %2 == 1 ; avg
+  ; FIXME(rbultje) pipeline
+  packuswb             m0, m2
+  pavgb                m0, [secq]
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%endif
+  punpcklbw            m1, m5
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+
+  lea                srcq, [srcq+src_strideq*2]
+  lea                dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+  dec                   h
+  jg .x_other_y_zero_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+  STORE_AND_RET
+
+.x_nonhalf_y_nonzero:
+  cmp           y_offsetd, 8
+  jne .x_nonhalf_y_nonhalf
+
+  ; x_offset == bilin interpolation && y_offset == 0.5
+%ifdef PIC
+  lea        bilin_filter, [bilin_filter_m]
+%endif
+  shl           x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova                 m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+  mova                 m9, [bilin_filter+x_offsetq+16]
+%endif
+  mova                m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else
+  add           x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+1]
+%if cpuflag(ssse3)
+  punpckhbw            m2, m0, m1
+  punpcklbw            m0, m1
+  pmaddubsw            m2, filter_x_a
+  pmaddubsw            m0, filter_x_a
+  paddw                m2, filter_rnd
+  paddw                m0, filter_rnd
+%else
+  punpckhbw            m2, m0, m5
+  punpckhbw            m3, m1, m5
+  punpcklbw            m0, m5
+  punpcklbw            m1, m5
+  pmullw               m0, filter_x_a
+  pmullw               m1, filter_x_b
+  paddw                m0, filter_rnd
+  pmullw               m2, filter_x_a
+  pmullw               m3, filter_x_b
+  paddw                m2, filter_rnd
+  paddw                m0, m1
+  paddw                m2, m3
+%endif
+  psraw                m0, 4
+  psraw                m2, 4
+  add                srcq, src_strideq
+  packuswb             m0, m2
+.x_other_y_half_loop:
+  movu                 m4, [srcq]
+  movu                 m3, [srcq+1]
+%if cpuflag(ssse3)
+  mova                 m1, [dstq]
+  punpckhbw            m2, m4, m3
+  punpcklbw            m4, m3
+  pmaddubsw            m2, filter_x_a
+  pmaddubsw            m4, filter_x_a
+  paddw                m2, filter_rnd
+  paddw                m4, filter_rnd
+  psraw                m2, 4
+  psraw                m4, 4
+  packuswb             m4, m2
+  pavgb                m0, m4
+  punpckhbw            m3, m1, m5
+  punpcklbw            m1, m5
+%else
+  punpckhbw            m2, m4, m5
+  punpckhbw            m1, m3, m5
+  punpcklbw            m4, m5
+  punpcklbw            m3, m5
+  pmullw               m4, filter_x_a
+  pmullw               m3, filter_x_b
+  paddw                m4, filter_rnd
+  pmullw               m2, filter_x_a
+  pmullw               m1, filter_x_b
+  paddw                m2, filter_rnd
+  paddw                m4, m3
+  paddw                m2, m1
+  mova                 m1, [dstq]
+  psraw                m4, 4
+  psraw                m2, 4
+  punpckhbw            m3, m1, m5
+  ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
+  ; have a 1-register shortage to be able to store the backup of the bilin
+  ; filtered second line as words as cache for the next line. Packing into
+  ; a byte costs 1 pack and 2 unpacks, but saves a register.
+  packuswb             m4, m2
+  punpcklbw            m1, m5
+  pavgb                m0, m4
+%endif
+%if %2 == 1 ; avg
+  ; FIXME(rbultje) pipeline
+  pavgb                m0, [secq]
+%endif
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+  mova                 m0, m4
+
+  add                srcq, src_strideq
+  add                dstq, dst_strideq
+%else ; %1 < 16
+  movh                 m0, [srcq]
+  movh                 m1, [srcq+1]
+%if cpuflag(ssse3)
+  punpcklbw            m0, m1
+  pmaddubsw            m0, filter_x_a
+  paddw                m0, filter_rnd
+%else
+  punpcklbw            m0, m5
+  punpcklbw            m1, m5
+  pmullw               m0, filter_x_a
+  pmullw               m1, filter_x_b
+  paddw                m0, filter_rnd
+  paddw                m0, m1
+%endif
+  add                srcq, src_strideq
+  psraw                m0, 4
+.x_other_y_half_loop:
+  movh                 m2, [srcq]
+  movh                 m1, [srcq+1]
+  movh                 m4, [srcq+src_strideq]
+  movh                 m3, [srcq+src_strideq+1]
+%if cpuflag(ssse3)
+  punpcklbw            m2, m1
+  punpcklbw            m4, m3
+  pmaddubsw            m2, filter_x_a
+  pmaddubsw            m4, filter_x_a
+  movh                 m1, [dstq]
+  movh                 m3, [dstq+dst_strideq]
+  paddw                m2, filter_rnd
+  paddw                m4, filter_rnd
+%else
+  punpcklbw            m2, m5
+  punpcklbw            m1, m5
+  punpcklbw            m4, m5
+  punpcklbw            m3, m5
+  pmullw               m2, filter_x_a
+  pmullw               m1, filter_x_b
+  paddw                m2, filter_rnd
+  pmullw               m4, filter_x_a
+  pmullw               m3, filter_x_b
+  paddw                m4, filter_rnd
+  paddw                m2, m1
+  movh                 m1, [dstq]
+  paddw                m4, m3
+  movh                 m3, [dstq+dst_strideq]
+%endif
+  psraw                m2, 4
+  psraw                m4, 4
+  pavgw                m0, m2
+  pavgw                m2, m4
+%if %2 == 1 ; avg
+  ; FIXME(rbultje) pipeline - also consider going to bytes here
+  packuswb             m0, m2
+  pavgb                m0, [secq]
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%endif
+  punpcklbw            m3, m5
+  punpcklbw            m1, m5
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+  mova                 m0, m4
+
+  lea                srcq, [srcq+src_strideq*2]
+  lea                dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+  dec                   h
+  jg .x_other_y_half_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+  STORE_AND_RET
+
+.x_nonhalf_y_nonhalf:
+%ifdef PIC
+  lea        bilin_filter, [bilin_filter_m]
+%endif
+  shl           x_offsetd, filter_idx_shift
+  shl           y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova                 m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+  mova                 m9, [bilin_filter+x_offsetq+16]
+%endif
+  mova                m10, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+  mova                m11, [bilin_filter+y_offsetq+16]
+%endif
+  mova                m12, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_y_a m10
+%define filter_y_b m11
+%define filter_rnd m12
+%else
+  add           x_offsetq, bilin_filter
+  add           y_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+  ; x_offset == bilin interpolation && y_offset == bilin interpolation
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+1]
+%if cpuflag(ssse3)
+  punpckhbw            m2, m0, m1
+  punpcklbw            m0, m1
+  pmaddubsw            m2, filter_x_a
+  pmaddubsw            m0, filter_x_a
+  paddw                m2, filter_rnd
+  paddw                m0, filter_rnd
+%else
+  punpckhbw            m2, m0, m5
+  punpckhbw            m3, m1, m5
+  punpcklbw            m0, m5
+  punpcklbw            m1, m5
+  pmullw               m0, filter_x_a
+  pmullw               m1, filter_x_b
+  paddw                m0, filter_rnd
+  pmullw               m2, filter_x_a
+  pmullw               m3, filter_x_b
+  paddw                m2, filter_rnd
+  paddw                m0, m1
+  paddw                m2, m3
+%endif
+  psraw                m0, 4
+  psraw                m2, 4
+  add                srcq, src_strideq
+  packuswb             m0, m2
+.x_other_y_other_loop:
+%if cpuflag(ssse3)
+  movu                 m4, [srcq]
+  movu                 m3, [srcq+1]
+  mova                 m1, [dstq]
+  punpckhbw            m2, m4, m3
+  punpcklbw            m4, m3
+  pmaddubsw            m2, filter_x_a
+  pmaddubsw            m4, filter_x_a
+  punpckhbw            m3, m1, m5
+  paddw                m2, filter_rnd
+  paddw                m4, filter_rnd
+  psraw                m2, 4
+  psraw                m4, 4
+  packuswb             m4, m2
+  punpckhbw            m2, m0, m4
+  punpcklbw            m0, m4
+  pmaddubsw            m2, filter_y_a
+  pmaddubsw            m0, filter_y_a
+  punpcklbw            m1, m5
+  paddw                m2, filter_rnd
+  paddw                m0, filter_rnd
+  psraw                m2, 4
+  psraw                m0, 4
+%else
+  movu                 m3, [srcq]
+  movu                 m4, [srcq+1]
+  punpckhbw            m1, m3, m5
+  punpckhbw            m2, m4, m5
+  punpcklbw            m3, m5
+  punpcklbw            m4, m5
+  pmullw               m3, filter_x_a
+  pmullw               m4, filter_x_b
+  paddw                m3, filter_rnd
+  pmullw               m1, filter_x_a
+  pmullw               m2, filter_x_b
+  paddw                m1, filter_rnd
+  paddw                m3, m4
+  paddw                m1, m2
+  psraw                m3, 4
+  psraw                m1, 4
+  packuswb             m4, m3, m1
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+  pmullw               m2, filter_y_a
+  pmullw               m1, filter_y_b
+  paddw                m2, filter_rnd
+  pmullw               m0, filter_y_a
+  pmullw               m3, filter_y_b
+  paddw                m2, m1
+  mova                 m1, [dstq]
+  paddw                m0, filter_rnd
+  psraw                m2, 4
+  paddw                m0, m3
+  punpckhbw            m3, m1, m5
+  psraw                m0, 4
+  punpcklbw            m1, m5
+%endif
+%if %2 == 1 ; avg
+  ; FIXME(rbultje) pipeline
+  packuswb             m0, m2
+  pavgb                m0, [secq]
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%endif
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+  mova                 m0, m4
+
+  add                srcq, src_strideq
+  add                dstq, dst_strideq
+%else ; %1 < 16
+  movh                 m0, [srcq]
+  movh                 m1, [srcq+1]
+%if cpuflag(ssse3)
+  punpcklbw            m0, m1
+  pmaddubsw            m0, filter_x_a
+  paddw                m0, filter_rnd
+%else
+  punpcklbw            m0, m5
+  punpcklbw            m1, m5
+  pmullw               m0, filter_x_a
+  pmullw               m1, filter_x_b
+  paddw                m0, filter_rnd
+  paddw                m0, m1
+%endif
+  psraw                m0, 4
+%if cpuflag(ssse3)
+  packuswb             m0, m0
+%endif
+  add                srcq, src_strideq
+.x_other_y_other_loop:
+  movh                 m2, [srcq]
+  movh                 m1, [srcq+1]
+  movh                 m4, [srcq+src_strideq]
+  movh                 m3, [srcq+src_strideq+1]
+%if cpuflag(ssse3)
+  punpcklbw            m2, m1
+  punpcklbw            m4, m3
+  pmaddubsw            m2, filter_x_a
+  pmaddubsw            m4, filter_x_a
+  movh                 m3, [dstq+dst_strideq]
+  movh                 m1, [dstq]
+  paddw                m2, filter_rnd
+  paddw                m4, filter_rnd
+  psraw                m2, 4
+  psraw                m4, 4
+  packuswb             m2, m2
+  packuswb             m4, m4
+  punpcklbw            m0, m2
+  punpcklbw            m2, m4
+  pmaddubsw            m0, filter_y_a
+  pmaddubsw            m2, filter_y_a
+  punpcklbw            m3, m5
+  paddw                m0, filter_rnd
+  paddw                m2, filter_rnd
+  psraw                m0, 4
+  psraw                m2, 4
+  punpcklbw            m1, m5
+%else
+  punpcklbw            m2, m5
+  punpcklbw            m1, m5
+  punpcklbw            m4, m5
+  punpcklbw            m3, m5
+  pmullw               m2, filter_x_a
+  pmullw               m1, filter_x_b
+  paddw                m2, filter_rnd
+  pmullw               m4, filter_x_a
+  pmullw               m3, filter_x_b
+  paddw                m4, filter_rnd
+  paddw                m2, m1
+  paddw                m4, m3
+  psraw                m2, 4
+  psraw                m4, 4
+  pmullw               m0, filter_y_a
+  pmullw               m3, m2, filter_y_b
+  paddw                m0, filter_rnd
+  pmullw               m2, filter_y_a
+  pmullw               m1, m4, filter_y_b
+  paddw                m2, filter_rnd
+  paddw                m0, m3
+  movh                 m3, [dstq+dst_strideq]
+  paddw                m2, m1
+  movh                 m1, [dstq]
+  psraw                m0, 4
+  psraw                m2, 4
+  punpcklbw            m3, m5
+  punpcklbw            m1, m5
+%endif
+%if %2 == 1 ; avg
+  ; FIXME(rbultje) pipeline
+  packuswb             m0, m2
+  pavgb                m0, [secq]
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%endif
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+  mova                 m0, m4
+
+  lea                srcq, [srcq+src_strideq*2]
+  lea                dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+  dec                   h
+  jg .x_other_y_other_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+  STORE_AND_RET
+%endmacro
+
+; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
+; between the ssse3 and non-ssse3 version. It may make sense to merge their
+; code in the sense that the ssse3 version would jump to the appropriate
+; location in the sse/2 version, rather than duplicating that code in the
+; binary.
+
+INIT_MMX sse
+SUBPEL_VARIANCE  4
+INIT_XMM sse2
+SUBPEL_VARIANCE  8
+SUBPEL_VARIANCE 16
+
+INIT_MMX ssse3
+SUBPEL_VARIANCE  4
+INIT_XMM ssse3
+SUBPEL_VARIANCE  8
+SUBPEL_VARIANCE 16
+
+INIT_MMX sse
+SUBPEL_VARIANCE  4, 1
+INIT_XMM sse2
+SUBPEL_VARIANCE  8, 1
+SUBPEL_VARIANCE 16, 1
+
+INIT_MMX ssse3
+SUBPEL_VARIANCE  4, 1
+INIT_XMM ssse3
+SUBPEL_VARIANCE  8, 1
+SUBPEL_VARIANCE 16, 1
diff --git a/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm b/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm
index 8a2a471f5..2ecc23e55 100644
--- a/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm
+++ b/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm
@@ -8,292 +8,8 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-
 %include "vpx_ports/x86_abi_support.asm"
 
-%define xmm_filter_shift            7
-
-;void vp9_filter_block2d_bil_var_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int  xoffset,
-;    int  yoffset,
-;    int *sum,
-;    unsigned int *sumsquared;;
-;
-;)
-global sym(vp9_filter_block2d_bil_var_sse2) PRIVATE
-sym(vp9_filter_block2d_bil_var_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    push rbx
-    ; end prolog
-
-        pxor            xmm6,           xmm6                 ;
-        pxor            xmm7,           xmm7                 ;
-
-        lea             rsi,            [GLOBAL(xmm_bi_rd)]  ; rounding
-        movdqa          xmm4,           XMMWORD PTR [rsi]
-
-        lea             rcx,            [GLOBAL(bilinear_filters_sse2)]
-        movsxd          rax,            dword ptr arg(5)     ; xoffset
-
-        cmp             rax,            0                    ; skip first_pass filter if xoffset=0
-        je              filter_block2d_bil_var_sse2_sp_only
-
-        shl             rax,            5                    ; point to filter coeff with xoffset
-        lea             rax,            [rax + rcx]          ; HFilter
-
-        movsxd          rdx,            dword ptr arg(6)     ; yoffset
-
-        cmp             rdx,            0                    ; skip second_pass filter if yoffset=0
-        je              filter_block2d_bil_var_sse2_fp_only
-
-        shl             rdx,            5
-        lea             rdx,            [rdx + rcx]          ; VFilter
-
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-
-        pxor            xmm0,           xmm0                 ;
-        movq            xmm1,           QWORD PTR [rsi]      ;
-        movq            xmm3,           QWORD PTR [rsi+1]    ;
-
-        punpcklbw       xmm1,           xmm0                 ;
-        pmullw          xmm1,           [rax]                ;
-        punpcklbw       xmm3,           xmm0
-        pmullw          xmm3,           [rax+16]             ;
-
-        paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           xmm4                 ;
-        psraw           xmm1,           xmm_filter_shift     ;
-        movdqa          xmm5,           xmm1
-
-        movsxd          rbx,            dword ptr arg(1) ;ref_pixels_per_line
-        lea             rsi,            [rsi + rbx]
-%if ABI_IS_32BIT=0
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-filter_block2d_bil_var_sse2_loop:
-        movq            xmm1,           QWORD PTR [rsi]               ;
-        movq            xmm3,           QWORD PTR [rsi+1]             ;
-
-        punpcklbw       xmm1,           xmm0                 ;
-        pmullw          xmm1,           [rax]               ;
-        punpcklbw       xmm3,           xmm0                 ;
-        pmullw          xmm3,           [rax+16]             ;
-
-        paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           xmm4               ;
-        psraw           xmm1,           xmm_filter_shift    ;
-
-        movdqa          xmm3,           xmm5                 ;
-        movdqa          xmm5,           xmm1                 ;
-
-        pmullw          xmm3,           [rdx]               ;
-        pmullw          xmm1,           [rdx+16]             ;
-        paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           xmm4                 ;
-        psraw           xmm1,           xmm_filter_shift    ;
-
-        movq            xmm3,           QWORD PTR [rdi]               ;
-        punpcklbw       xmm3,           xmm0                 ;
-
-        psubw           xmm1,           xmm3                 ;
-        paddw           xmm6,           xmm1                 ;
-
-        pmaddwd         xmm1,           xmm1                 ;
-        paddd           xmm7,           xmm1                 ;
-
-        lea             rsi,            [rsi + rbx]          ;ref_pixels_per_line
-%if ABI_IS_32BIT
-        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
-%else
-        lea             rdi,            [rdi + r9]
-%endif
-
-        sub             rcx,            1                   ;
-        jnz             filter_block2d_bil_var_sse2_loop       ;
-
-        jmp             filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_sp_only:
-        movsxd          rdx,            dword ptr arg(6)     ; yoffset
-
-        cmp             rdx,            0                    ; skip all if both xoffset=0 and yoffset=0
-        je              filter_block2d_bil_var_sse2_full_pixel
-
-        shl             rdx,            5
-        lea             rdx,            [rdx + rcx]          ; VFilter
-
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
-
-        pxor            xmm0,           xmm0                 ;
-        movq            xmm1,           QWORD PTR [rsi]      ;
-        punpcklbw       xmm1,           xmm0                 ;
-
-        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
-        lea             rsi,            [rsi + rax]
-
-filter_block2d_bil_sp_only_loop:
-        movq            xmm3,           QWORD PTR [rsi]             ;
-        punpcklbw       xmm3,           xmm0                 ;
-        movdqa          xmm5,           xmm3
-
-        pmullw          xmm1,           [rdx]               ;
-        pmullw          xmm3,           [rdx+16]             ;
-        paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           xmm4                 ;
-        psraw           xmm1,           xmm_filter_shift    ;
-
-        movq            xmm3,           QWORD PTR [rdi]               ;
-        punpcklbw       xmm3,           xmm0                 ;
-
-        psubw           xmm1,           xmm3                 ;
-        paddw           xmm6,           xmm1                 ;
-
-        pmaddwd         xmm1,           xmm1                 ;
-        paddd           xmm7,           xmm1                 ;
-
-        movdqa          xmm1,           xmm5                 ;
-        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
-        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
-
-        sub             rcx,            1                   ;
-        jnz             filter_block2d_bil_sp_only_loop       ;
-
-        jmp             filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_full_pixel:
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
-        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
-        pxor            xmm0,           xmm0                 ;
-
-filter_block2d_bil_full_pixel_loop:
-        movq            xmm1,           QWORD PTR [rsi]               ;
-        punpcklbw       xmm1,           xmm0                 ;
-
-        movq            xmm2,           QWORD PTR [rdi]               ;
-        punpcklbw       xmm2,           xmm0                 ;
-
-        psubw           xmm1,           xmm2                 ;
-        paddw           xmm6,           xmm1                 ;
-
-        pmaddwd         xmm1,           xmm1                 ;
-        paddd           xmm7,           xmm1                 ;
-
-        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
-        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
-
-        sub             rcx,            1                   ;
-        jnz             filter_block2d_bil_full_pixel_loop       ;
-
-        jmp             filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_fp_only:
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line
-
-        pxor            xmm0,           xmm0                 ;
-        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
-
-filter_block2d_bil_fp_only_loop:
-        movq            xmm1,           QWORD PTR [rsi]       ;
-        movq            xmm3,           QWORD PTR [rsi+1]     ;
-
-        punpcklbw       xmm1,           xmm0                 ;
-        pmullw          xmm1,           [rax]               ;
-        punpcklbw       xmm3,           xmm0                 ;
-        pmullw          xmm3,           [rax+16]             ;
-
-        paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           xmm4  ;
-        psraw           xmm1,           xmm_filter_shift    ;
-
-        movq            xmm3,           QWORD PTR [rdi]     ;
-        punpcklbw       xmm3,           xmm0                 ;
-
-        psubw           xmm1,           xmm3                 ;
-        paddw           xmm6,           xmm1                 ;
-
-        pmaddwd         xmm1,           xmm1                 ;
-        paddd           xmm7,           xmm1                 ;
-        lea             rsi,            [rsi + rdx]
-        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
-
-        sub             rcx,            1                   ;
-        jnz             filter_block2d_bil_fp_only_loop       ;
-
-        jmp             filter_block2d_bil_variance
-
-filter_block2d_bil_variance:
-        movdq2q         mm6,            xmm6                ;
-        movdq2q         mm7,            xmm7                ;
-
-        psrldq          xmm6,           8
-        psrldq          xmm7,           8
-
-        movdq2q         mm2,            xmm6
-        movdq2q         mm3,            xmm7
-
-        paddw           mm6,            mm2
-        paddd           mm7,            mm3
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rsi,            arg(7) ; sum
-        mov             rdi,            arg(8) ; sumsquared
-
-        movd            [rsi],          mm2    ; xsum
-        movd            [rdi],          mm4    ; xxsum
-
-    ; begin epilog
-    pop rbx
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
 ;void vp9_half_horiz_vert_variance16x_h_sse2
 ;(
 ;    unsigned char *ref_ptr,
@@ -619,27 +335,3 @@ sym(vp9_half_horiz_variance16x_h_sse2):
     UNSHADOW_ARGS
     pop         rbp
     ret
-
-SECTION_RODATA
-;    short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
-align 16
-xmm_bi_rd:
-    times 8 dw 64
-align 16
-bilinear_filters_sse2:
-    dw 128, 128, 128, 128, 128, 128, 128, 128,  0,  0,  0,  0,  0,  0,  0,  0
-    dw 120, 120, 120, 120, 120, 120, 120, 120,  8,  8,  8,  8,  8,  8,  8,  8
-    dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
-    dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24
-    dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
-    dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40
-    dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
-    dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56
-    dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-    dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72
-    dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
-    dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88
-    dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
-    dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104
-    dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
-    dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120
diff --git a/vp9/encoder/x86/vp9_subtract_mmx.asm b/vp9/encoder/x86/vp9_subtract_mmx.asm
deleted file mode 100644
index e9eda4fed..000000000
--- a/vp9/encoder/x86/vp9_subtract_mmx.asm
+++ /dev/null
@@ -1,432 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
-;                            short *diff, unsigned char *Predictor,
-;                            int pitch);
-global sym(vp9_subtract_b_mmx_impl) PRIVATE
-sym(vp9_subtract_b_mmx_impl):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov     rdi,        arg(2) ;diff
-        mov     rax,        arg(3) ;Predictor
-        mov     rsi,        arg(0) ;z
-        movsxd  rdx,        dword ptr arg(1);src_stride;
-        movsxd  rcx,        dword ptr arg(4);pitch
-        pxor    mm7,        mm7
-
-        movd    mm0,        [rsi]
-        movd    mm1,        [rax]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    [rdi],      mm0
-
-
-        movd    mm0,        [rsi+rdx]
-        movd    mm1,        [rax+rcx]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    [rdi+rcx*2],mm0
-
-
-        movd    mm0,        [rsi+rdx*2]
-        movd    mm1,        [rax+rcx*2]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    [rdi+rcx*4],        mm0
-
-        lea     rsi,        [rsi+rdx*2]
-        lea     rcx,        [rcx+rcx*2]
-
-
-
-        movd    mm0,        [rsi+rdx]
-        movd    mm1,        [rax+rcx]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    [rdi+rcx*2],        mm0
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride)
-global sym(vp9_subtract_mby_mmx) PRIVATE
-sym(vp9_subtract_mby_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-
-            mov         rsi,            arg(1) ;src
-            mov         rdi,            arg(0) ;diff
-
-            mov         rax,            arg(2) ;pred
-            movsxd      rdx,            dword ptr arg(3) ;stride
-
-            mov         rcx,            16
-            pxor        mm0,            mm0
-
-.submby_loop:
-
-            movq        mm1,            [rsi]
-            movq        mm3,            [rax]
-
-            movq        mm2,            mm1
-            movq        mm4,            mm3
-
-            punpcklbw   mm1,            mm0
-            punpcklbw   mm3,            mm0
-
-            punpckhbw   mm2,            mm0
-            punpckhbw   mm4,            mm0
-
-            psubw       mm1,            mm3
-            psubw       mm2,            mm4
-
-            movq        [rdi],          mm1
-            movq        [rdi+8],        mm2
-
-
-            movq        mm1,            [rsi+8]
-            movq        mm3,            [rax+8]
-
-            movq        mm2,            mm1
-            movq        mm4,            mm3
-
-            punpcklbw   mm1,            mm0
-            punpcklbw   mm3,            mm0
-
-            punpckhbw   mm2,            mm0
-            punpckhbw   mm4,            mm0
-
-            psubw       mm1,            mm3
-            psubw       mm2,            mm4
-
-            movq        [rdi+16],       mm1
-            movq        [rdi+24],       mm2
-
-
-            add         rdi,            32
-            add         rax,            16
-
-            lea         rsi,            [rsi+rdx]
-
-            sub         rcx,            1
-            jnz         .submby_loop
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
-global sym(vp9_subtract_mbuv_mmx) PRIVATE
-sym(vp9_subtract_mbuv_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push rsi
-    push rdi
-    ; end prolog
-
-    ;short *udiff = diff + 256;
-    ;short *vdiff = diff + 320;
-    ;unsigned char *upred = pred + 256;
-    ;unsigned char *vpred = pred + 320;
-
-        ;unsigned char  *z    = usrc;
-        ;unsigned short *diff = udiff;
-        ;unsigned char  *Predictor= upred;
-
-            mov     rdi,        arg(0) ;diff
-            mov     rax,        arg(3) ;pred
-            mov     rsi,        arg(1) ;z = usrc
-            add     rdi,        256*2  ;diff = diff + 256 (shorts)
-            add     rax,        256    ;Predictor = pred + 256
-            movsxd  rdx,        dword ptr arg(4) ;stride;
-            pxor    mm7,        mm7
-
-            movq    mm0,        [rsi]
-            movq    mm1,        [rax]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi],      mm0
-            movq    [rdi+8],    mm3
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+8]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+16],   mm0
-            movq    [rdi+24],   mm3
-
-            movq    mm0,        [rsi+rdx*2]
-            movq    mm1,        [rax+16]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+32],   mm0
-            movq    [rdi+40],   mm3
-            lea     rsi,        [rsi+rdx*2]
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+24]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-
-            movq    [rdi+48],   mm0
-            movq    [rdi+56],   mm3
-
-
-            add     rdi,        64
-            add     rax,        32
-            lea     rsi,        [rsi+rdx*2]
-
-
-            movq    mm0,        [rsi]
-            movq    mm1,        [rax]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi],      mm0
-            movq    [rdi+8],    mm3
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+8]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+16],   mm0
-            movq    [rdi+24],   mm3
-
-            movq    mm0,        [rsi+rdx*2]
-            movq    mm1,        [rax+16]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+32],   mm0
-            movq    [rdi+40],   mm3
-            lea     rsi,        [rsi+rdx*2]
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+24]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-
-            movq    [rdi+48],   mm0
-            movq    [rdi+56],   mm3
-
-        ;unsigned char  *z    = vsrc;
-        ;unsigned short *diff = vdiff;
-        ;unsigned char  *Predictor= vpred;
-
-            mov     rdi,        arg(0) ;diff
-            mov     rax,        arg(3) ;pred
-            mov     rsi,        arg(2) ;z = usrc
-            add     rdi,        320*2  ;diff = diff + 320 (shorts)
-            add     rax,        320    ;Predictor = pred + 320
-            movsxd  rdx,        dword ptr arg(4) ;stride;
-            pxor    mm7,        mm7
-
-            movq    mm0,        [rsi]
-            movq    mm1,        [rax]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi],      mm0
-            movq    [rdi+8],    mm3
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+8]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+16],   mm0
-            movq    [rdi+24],   mm3
-
-            movq    mm0,        [rsi+rdx*2]
-            movq    mm1,        [rax+16]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+32],   mm0
-            movq    [rdi+40],   mm3
-            lea     rsi,        [rsi+rdx*2]
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+24]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-
-            movq    [rdi+48],   mm0
-            movq    [rdi+56],   mm3
-
-
-            add     rdi,        64
-            add     rax,        32
-            lea     rsi,        [rsi+rdx*2]
-
-
-            movq    mm0,        [rsi]
-            movq    mm1,        [rax]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi],      mm0
-            movq    [rdi+8],    mm3
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+8]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+16],   mm0
-            movq    [rdi+24],   mm3
-
-            movq    mm0,        [rsi+rdx*2]
-            movq    mm1,        [rax+16]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+32],   mm0
-            movq    [rdi+40],   mm3
-            lea     rsi,        [rsi+rdx*2]
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+24]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-
-            movq    [rdi+48],   mm0
-            movq    [rdi+56],   mm3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/vp9/encoder/x86/vp9_subtract_sse2.asm b/vp9/encoder/x86/vp9_subtract_sse2.asm
index 739d9487e..982408083 100644
--- a/vp9/encoder/x86/vp9_subtract_sse2.asm
+++ b/vp9/encoder/x86/vp9_subtract_sse2.asm
@@ -8,349 +8,120 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
-;                            short *diff, unsigned char *Predictor,
-;                            int pitch);
-global sym(vp9_subtract_b_sse2_impl) PRIVATE
-sym(vp9_subtract_b_sse2_impl):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov     rdi,        arg(2) ;diff
-        mov     rax,        arg(3) ;Predictor
-        mov     rsi,        arg(0) ;z
-        movsxd  rdx,        dword ptr arg(1);src_stride;
-        movsxd  rcx,        dword ptr arg(4);pitch
-        pxor    mm7,        mm7
-
-        movd    mm0,        [rsi]
-        movd    mm1,        [rax]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    MMWORD PTR [rdi],      mm0
-
-        movd    mm0,        [rsi+rdx]
-        movd    mm1,        [rax+rcx]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    MMWORD PTR [rdi+rcx*2], mm0
-
-        movd    mm0,        [rsi+rdx*2]
-        movd    mm1,        [rax+rcx*2]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    MMWORD PTR [rdi+rcx*4], mm0
-
-        lea     rsi,        [rsi+rdx*2]
-        lea     rcx,        [rcx+rcx*2]
-
-        movd    mm0,        [rsi+rdx]
-        movd    mm1,        [rax+rcx]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    MMWORD PTR [rdi+rcx*2], mm0
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_subtract_mby_sse2(short *diff, unsigned char *src, unsigned char *pred, int stride)
-global sym(vp9_subtract_mby_sse2) PRIVATE
-sym(vp9_subtract_mby_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-            mov         rsi,            arg(1) ;src
-            mov         rdi,            arg(0) ;diff
-
-            mov         rax,            arg(2) ;pred
-            movsxd      rdx,            dword ptr arg(3) ;stride
-
-            mov         rcx,            8      ; do two lines at one time
-
-.submby_loop:
-            movdqa      xmm0,           XMMWORD PTR [rsi]   ; src
-            movdqa      xmm1,           XMMWORD PTR [rax]   ; pred
-
-            movdqa      xmm2,           xmm0
-            psubb       xmm0,           xmm1
-
-            pxor        xmm1,           [GLOBAL(t80)]   ;convert to signed values
-            pxor        xmm2,           [GLOBAL(t80)]
-            pcmpgtb     xmm1,           xmm2            ; obtain sign information
-
-            movdqa      xmm2,    xmm0
-            movdqa      xmm3,    xmm1
-            punpcklbw   xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw   xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa      XMMWORD PTR [rdi],   xmm0
-            movdqa      XMMWORD PTR [rdi +16], xmm2
-
-            movdqa      xmm4,           XMMWORD PTR [rsi + rdx]
-            movdqa      xmm5,           XMMWORD PTR [rax + 16]
-
-            movdqa      xmm6,           xmm4
-            psubb       xmm4,           xmm5
-
-            pxor        xmm5,           [GLOBAL(t80)]   ;convert to signed values
-            pxor        xmm6,           [GLOBAL(t80)]
-            pcmpgtb     xmm5,           xmm6            ; obtain sign information
-
-            movdqa      xmm6,    xmm4
-            movdqa      xmm7,    xmm5
-            punpcklbw   xmm4,    xmm5            ; put sign back to subtraction
-            punpckhbw   xmm6,    xmm7            ; put sign back to subtraction
-
-            movdqa      XMMWORD PTR [rdi +32], xmm4
-            movdqa      XMMWORD PTR [rdi +48], xmm6
-
-            add         rdi,            64
-            add         rax,            32
-            lea         rsi,            [rsi+rdx*2]
-
-            sub         rcx,            1
-            jnz         .submby_loop
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
-global sym(vp9_subtract_mbuv_sse2) PRIVATE
-sym(vp9_subtract_mbuv_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-            mov     rdi,        arg(0) ;diff
-            mov     rax,        arg(3) ;pred
-            mov     rsi,        arg(1) ;z = usrc
-            add     rdi,        256*2  ;diff = diff + 256 (shorts)
-            add     rax,        256    ;Predictor = pred + 256
-            movsxd  rdx,        dword ptr arg(4) ;stride;
-            lea     rcx,        [rdx + rdx*2]
-
-            ;u
-            ;line 0 1
-            movq       xmm0,    MMWORD PTR [rsi]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rdx]
-            movdqa     xmm1,    XMMWORD PTR [rax]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi],   xmm0
-            movdqa     XMMWORD PTR [rdi +16],   xmm2
-
-            ;line 2 3
-            movq       xmm0,    MMWORD PTR [rsi+rdx*2]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rcx]
-            movdqa     xmm1,    XMMWORD PTR [rax+16]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi + 32],   xmm0
-            movdqa     XMMWORD PTR [rdi + 48],   xmm2
-
-            ;line 4 5
-            lea        rsi,     [rsi + rdx*4]
-
-            movq       xmm0,    MMWORD PTR [rsi]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rdx]
-            movdqa     xmm1,    XMMWORD PTR [rax + 32]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi + 64],   xmm0
-            movdqa     XMMWORD PTR [rdi + 80],   xmm2
-
-            ;line 6 7
-            movq       xmm0,    MMWORD PTR [rsi+rdx*2]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rcx]
-            movdqa     xmm1,    XMMWORD PTR [rax+ 48]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi + 96],   xmm0
-            movdqa     XMMWORD PTR [rdi + 112],  xmm2
-
-            ;v
-            mov     rsi,        arg(2) ;z = vsrc
-            add     rdi,        64*2  ;diff = diff + 320 (shorts)
-            add     rax,        64    ;Predictor = pred + 320
-
-            ;line 0 1
-            movq       xmm0,    MMWORD PTR [rsi]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rdx]
-            movdqa     xmm1,    XMMWORD PTR [rax]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi],   xmm0
-            movdqa     XMMWORD PTR [rdi +16],   xmm2
-
-            ;line 2 3
-            movq       xmm0,    MMWORD PTR [rsi+rdx*2]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rcx]
-            movdqa     xmm1,    XMMWORD PTR [rax+16]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi + 32],   xmm0
-            movdqa     XMMWORD PTR [rdi + 48],   xmm2
-
-            ;line 4 5
-            lea        rsi,     [rsi + rdx*4]
-
-            movq       xmm0,    MMWORD PTR [rsi]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rdx]
-            movdqa     xmm1,    XMMWORD PTR [rax + 32]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi + 64],   xmm0
-            movdqa     XMMWORD PTR [rdi + 80],   xmm2
-
-            ;line 6 7
-            movq       xmm0,    MMWORD PTR [rsi+rdx*2]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rcx]
-            movdqa     xmm1,    XMMWORD PTR [rax+ 48]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi + 96],   xmm0
-            movdqa     XMMWORD PTR [rdi + 112],  xmm2
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-t80:
-    times 16 db 0x80
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; void vp9_subtract_block(int rows, int cols,
+;                         int16_t *diff, ptrdiff_t diff_stride,
+;                         const uint8_t *src, ptrdiff_t src_stride,
+;                         const uint8_t *pred, ptrdiff_t pred_stride)
+
+INIT_XMM sse2
+cglobal subtract_block, 7, 7, 8, \
+                        rows, cols, diff, diff_stride, src, src_stride, \
+                        pred, pred_stride
+%define pred_str colsq
+  pxor                  m7, m7         ; dedicated zero register
+  cmp                colsd, 4
+  je .case_4
+  cmp                colsd, 8
+  je .case_8
+  cmp                colsd, 16
+  je .case_16
+  cmp                colsd, 32
+  je .case_32
+
+%macro loop16 6
+  mova                  m0, [srcq+%1]
+  mova                  m4, [srcq+%2]
+  mova                  m1, [predq+%3]
+  mova                  m5, [predq+%4]
+  punpckhbw             m2, m0, m7
+  punpckhbw             m3, m1, m7
+  punpcklbw             m0, m7
+  punpcklbw             m1, m7
+  psubw                 m2, m3
+  psubw                 m0, m1
+  punpckhbw             m1, m4, m7
+  punpckhbw             m3, m5, m7
+  punpcklbw             m4, m7
+  punpcklbw             m5, m7
+  psubw                 m1, m3
+  psubw                 m4, m5
+  mova [diffq+mmsize*0+%5], m0
+  mova [diffq+mmsize*1+%5], m2
+  mova [diffq+mmsize*0+%6], m4
+  mova [diffq+mmsize*1+%6], m1
+%endmacro
+
+  mov             pred_str, pred_stridemp
+.loop_64:
+  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
+  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
+  lea                diffq, [diffq+diff_strideq*2]
+  add                predq, pred_str
+  add                 srcq, src_strideq
+  dec                rowsd
+  jg .loop_64
+  RET
+
+.case_32:
+  mov             pred_str, pred_stridemp
+.loop_32:
+  loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
+  lea                diffq, [diffq+diff_strideq*2]
+  add                predq, pred_str
+  add                 srcq, src_strideq
+  dec                rowsd
+  jg .loop_32
+  RET
+
+.case_16:
+  mov             pred_str, pred_stridemp
+.loop_16:
+  loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
+  lea                diffq, [diffq+diff_strideq*4]
+  lea                predq, [predq+pred_str*2]
+  lea                 srcq, [srcq+src_strideq*2]
+  sub                rowsd, 2
+  jg .loop_16
+  RET
+
+%macro loop_h 0
+  movh                  m0, [srcq]
+  movh                  m2, [srcq+src_strideq]
+  movh                  m1, [predq]
+  movh                  m3, [predq+pred_str]
+  punpcklbw             m0, m7
+  punpcklbw             m1, m7
+  punpcklbw             m2, m7
+  punpcklbw             m3, m7
+  psubw                 m0, m1
+  psubw                 m2, m3
+  mova             [diffq], m0
+  mova [diffq+diff_strideq*2], m2
+%endmacro
+
+.case_8:
+  mov             pred_str, pred_stridemp
+.loop_8:
+  loop_h
+  lea                diffq, [diffq+diff_strideq*4]
+  lea                 srcq, [srcq+src_strideq*2]
+  lea                predq, [predq+pred_str*2]
+  sub                rowsd, 2
+  jg .loop_8
+  RET
+
+INIT_MMX
+.case_4:
+  mov             pred_str, pred_stridemp
+.loop_4:
+  loop_h
+  lea                diffq, [diffq+diff_strideq*4]
+  lea                 srcq, [srcq+src_strideq*2]
+  lea                predq, [predq+pred_str*2]
+  sub                rowsd, 2
+  jg .loop_4
+  RET
diff --git a/vp9/encoder/x86/vp9_variance_impl_mmx.asm b/vp9/encoder/x86/vp9_variance_impl_mmx.asm
index 9f140c96b..d3dbefed8 100644
--- a/vp9/encoder/x86/vp9_variance_impl_mmx.asm
+++ b/vp9/encoder/x86/vp9_variance_impl_mmx.asm
@@ -508,344 +508,3 @@ sym(vp9_get4x4sse_cs_mmx):
     UNSHADOW_ARGS
     pop         rbp
     ret
-
-%define mmx_filter_shift            7
-
-;void vp9_filter_block2d_bil4x4_var_mmx
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned short *HFilter,
-;    unsigned short *VFilter,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_filter_block2d_bil4x4_var_mmx) PRIVATE
-sym(vp9_filter_block2d_bil4x4_var_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 16
-    ; end prolog
-
-
-        pxor            mm6,            mm6                 ;
-        pxor            mm7,            mm7                 ;
-
-        mov             rax,            arg(4) ;HFilter             ;
-        mov             rdx,            arg(5) ;VFilter             ;
-
-        mov             rsi,            arg(0) ;ref_ptr              ;
-        mov             rdi,            arg(2) ;src_ptr              ;
-
-        mov             rcx,            4                   ;
-        pxor            mm0,            mm0                 ;
-
-        movd            mm1,            [rsi]               ;
-        movd            mm3,            [rsi+1]             ;
-
-        punpcklbw       mm1,            mm0                 ;
-        pmullw          mm1,            [rax]               ;
-
-        punpcklbw       mm3,            mm0                 ;
-        pmullw          mm3,            [rax+8]             ;
-
-        paddw           mm1,            mm3                 ;
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm1,            mmx_filter_shift    ;
-        movq            mm5,            mm1
-
-%if ABI_IS_32BIT
-        add             rsi, dword ptr  arg(1) ;ref_pixels_per_line    ;
-%else
-        movsxd          r8, dword ptr  arg(1) ;ref_pixels_per_line    ;
-        add             rsi, r8
-%endif
-
-.filter_block2d_bil4x4_var_mmx_loop:
-
-        movd            mm1,            [rsi]               ;
-        movd            mm3,            [rsi+1]             ;
-
-        punpcklbw       mm1,            mm0                 ;
-        pmullw          mm1,            [rax]               ;
-
-        punpcklbw       mm3,            mm0                 ;
-        pmullw          mm3,            [rax+8]             ;
-
-        paddw           mm1,            mm3                 ;
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm1,            mmx_filter_shift    ;
-        movq            mm3,            mm5                 ;
-
-        movq            mm5,            mm1                 ;
-        pmullw          mm3,            [rdx]               ;
-
-        pmullw          mm1,            [rdx+8]             ;
-        paddw           mm1,            mm3                 ;
-
-
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-        psraw           mm1,            mmx_filter_shift    ;
-
-        movd            mm3,            [rdi]               ;
-        punpcklbw       mm3,            mm0                 ;
-
-        psubw           mm1,            mm3                 ;
-        paddw           mm6,            mm1                 ;
-
-        pmaddwd         mm1,            mm1                 ;
-        paddd           mm7,            mm1                 ;
-
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
-        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
-%else
-        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
-        add             rsi,            r8
-        add             rdi,            r9
-%endif
-        sub             rcx,            1                   ;
-        jnz             .filter_block2d_bil4x4_var_mmx_loop       ;
-
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rdi,            arg(6) ;sum
-        mov             rsi,            arg(7) ;sumsquared
-
-        movd            dword ptr [rdi],          mm2                 ;
-        movd            dword ptr [rsi],          mm4                 ;
-
-
-
-    ; begin epilog
-    add rsp, 16
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-
-;void vp9_filter_block2d_bil_var_mmx
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    unsigned short *HFilter,
-;    unsigned short *VFilter,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_filter_block2d_bil_var_mmx) PRIVATE
-sym(vp9_filter_block2d_bil_var_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 16
-    ; end prolog
-
-        pxor            mm6,            mm6                 ;
-        pxor            mm7,            mm7                 ;
-        mov             rax,            arg(5) ;HFilter             ;
-
-        mov             rdx,            arg(6) ;VFilter             ;
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-
-        pxor            mm0,            mm0                 ;
-        movq            mm1,            [rsi]               ;
-
-        movq            mm3,            [rsi+1]             ;
-        movq            mm2,            mm1                 ;
-
-        movq            mm4,            mm3                 ;
-        punpcklbw       mm1,            mm0                 ;
-
-        punpckhbw       mm2,            mm0                 ;
-        pmullw          mm1,            [rax]               ;
-
-        pmullw          mm2,            [rax]               ;
-        punpcklbw       mm3,            mm0                 ;
-
-        punpckhbw       mm4,            mm0                 ;
-        pmullw          mm3,            [rax+8]             ;
-
-        pmullw          mm4,            [rax+8]             ;
-        paddw           mm1,            mm3                 ;
-
-        paddw           mm2,            mm4                 ;
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm1,            mmx_filter_shift    ;
-        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm2,            mmx_filter_shift    ;
-        movq            mm5,            mm1
-
-        packuswb        mm5,            mm2                 ;
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
-%else
-        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
-        add             rsi,            r8
-%endif
-
-.filter_block2d_bil_var_mmx_loop:
-
-        movq            mm1,            [rsi]               ;
-        movq            mm3,            [rsi+1]             ;
-
-        movq            mm2,            mm1                 ;
-        movq            mm4,            mm3                 ;
-
-        punpcklbw       mm1,            mm0                 ;
-        punpckhbw       mm2,            mm0                 ;
-
-        pmullw          mm1,            [rax]               ;
-        pmullw          mm2,            [rax]               ;
-
-        punpcklbw       mm3,            mm0                 ;
-        punpckhbw       mm4,            mm0                 ;
-
-        pmullw          mm3,            [rax+8]             ;
-        pmullw          mm4,            [rax+8]             ;
-
-        paddw           mm1,            mm3                 ;
-        paddw           mm2,            mm4                 ;
-
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-        psraw           mm1,            mmx_filter_shift    ;
-
-        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
-        psraw           mm2,            mmx_filter_shift    ;
-
-        movq            mm3,            mm5                 ;
-        movq            mm4,            mm5                 ;
-
-        punpcklbw       mm3,            mm0                 ;
-        punpckhbw       mm4,            mm0                 ;
-
-        movq            mm5,            mm1                 ;
-        packuswb        mm5,            mm2                 ;
-
-        pmullw          mm3,            [rdx]               ;
-        pmullw          mm4,            [rdx]               ;
-
-        pmullw          mm1,            [rdx+8]             ;
-        pmullw          mm2,            [rdx+8]             ;
-
-        paddw           mm1,            mm3                 ;
-        paddw           mm2,            mm4                 ;
-
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm1,            mmx_filter_shift    ;
-        psraw           mm2,            mmx_filter_shift    ;
-
-        movq            mm3,            [rdi]               ;
-        movq            mm4,            mm3                 ;
-
-        punpcklbw       mm3,            mm0                 ;
-        punpckhbw       mm4,            mm0                 ;
-
-        psubw           mm1,            mm3                 ;
-        psubw           mm2,            mm4                 ;
-
-        paddw           mm6,            mm1                 ;
-        pmaddwd         mm1,            mm1                 ;
-
-        paddw           mm6,            mm2                 ;
-        pmaddwd         mm2,            mm2                 ;
-
-        paddd           mm7,            mm1                 ;
-        paddd           mm7,            mm2                 ;
-
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
-        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
-%else
-        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line    ;
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line    ;
-        add             rsi,            r8
-        add             rdi,            r9
-%endif
-        sub             rcx,            1                   ;
-        jnz             .filter_block2d_bil_var_mmx_loop       ;
-
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rdi,            arg(7) ;sum
-        mov             rsi,            arg(8) ;sumsquared
-
-        movd            dword ptr [rdi],          mm2                 ;
-        movd            dword ptr [rsi],          mm4                 ;
-
-    ; begin epilog
-    add rsp, 16
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-SECTION_RODATA
-;short mmx_bi_rd[4] = { 64, 64, 64, 64};
-align 16
-mmx_bi_rd:
-    times 4 dw 64
diff --git a/vp9/encoder/x86/vp9_variance_impl_sse2.asm b/vp9/encoder/x86/vp9_variance_impl_sse2.asm
index 896dd185d..2c5088134 100644
--- a/vp9/encoder/x86/vp9_variance_impl_sse2.asm
+++ b/vp9/encoder/x86/vp9_variance_impl_sse2.asm
@@ -11,8 +11,6 @@
 
 %include "vpx_ports/x86_abi_support.asm"
 
-%define xmm_filter_shift            7
-
 ;unsigned int vp9_get_mb_ss_sse2
 ;(
 ;    short *src_ptr
@@ -734,28 +732,3 @@ sym(vp9_half_horiz_variance8x_h_sse2):
     UNSHADOW_ARGS
     pop         rbp
     ret
-
-
-SECTION_RODATA
-;    short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
-align 16
-xmm_bi_rd:
-    times 8 dw 64
-align 16
-bilinear_filters_sse2:
-    dw 128, 128, 128, 128, 128, 128, 128, 128,  0,  0,  0,  0,  0,  0,  0,  0
-    dw 120, 120, 120, 120, 120, 120, 120, 120,  8,  8,  8,  8,  8,  8,  8,  8
-    dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
-    dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24
-    dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
-    dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40
-    dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
-    dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56
-    dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-    dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72
-    dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
-    dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88
-    dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
-    dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104
-    dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
-    dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120
diff --git a/vp9/encoder/x86/vp9_variance_impl_ssse3.asm b/vp9/encoder/x86/vp9_variance_impl_ssse3.asm
deleted file mode 100644
index 98a4a16f6..000000000
--- a/vp9/encoder/x86/vp9_variance_impl_ssse3.asm
+++ /dev/null
@@ -1,372 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define xmm_filter_shift            7
-
-
-;void vp9_filter_block2d_bil_var_ssse3
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int  xoffset,
-;    int  yoffset,
-;    int *sum,
-;    unsigned int *sumsquared;;
-;
-;)
-;Note: The filter coefficient at offset=0 is 128. Since the second register
-;for Pmaddubsw is signed bytes, we must calculate zero offset seperately.
-global sym(vp9_filter_block2d_bil_var_ssse3) PRIVATE
-sym(vp9_filter_block2d_bil_var_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        pxor            xmm6,           xmm6
-        pxor            xmm7,           xmm7
-
-        lea             rcx,            [GLOBAL(bilinear_filters_ssse3)]
-        movsxd          rax,            dword ptr arg(5)     ; xoffset
-
-        cmp             rax,            0                    ; skip first_pass filter if xoffset=0
-        je              .filter_block2d_bil_var_ssse3_sp_only
-
-        shl             rax,            4                    ; point to filter coeff with xoffset
-        lea             rax,            [rax + rcx]          ; HFilter
-
-        movsxd          rdx,            dword ptr arg(6)     ; yoffset
-
-        cmp             rdx,            0                    ; skip second_pass filter if yoffset=0
-        je              .filter_block2d_bil_var_ssse3_fp_only
-
-        shl             rdx,            4
-        lea             rdx,            [rdx + rcx]          ; VFilter
-
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-
-        movdqu          xmm0,           XMMWORD PTR [rsi]
-        movdqu          xmm1,           XMMWORD PTR [rsi+1]
-        movdqa          xmm2,           xmm0
-
-        punpcklbw       xmm0,           xmm1
-        punpckhbw       xmm2,           xmm1
-        pmaddubsw       xmm0,           [rax]
-        pmaddubsw       xmm2,           [rax]
-
-        paddw           xmm0,           [GLOBAL(xmm_bi_rd)]
-        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
-        psraw           xmm0,           xmm_filter_shift
-        psraw           xmm2,           xmm_filter_shift
-
-        packuswb        xmm0,           xmm2
-
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
-%else
-        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
-        lea             rsi,            [rsi + r8]
-%endif
-
-.filter_block2d_bil_var_ssse3_loop:
-        movdqu          xmm1,           XMMWORD PTR [rsi]
-        movdqu          xmm2,           XMMWORD PTR [rsi+1]
-        movdqa          xmm3,           xmm1
-
-        punpcklbw       xmm1,           xmm2
-        punpckhbw       xmm3,           xmm2
-        pmaddubsw       xmm1,           [rax]
-        pmaddubsw       xmm3,           [rax]
-
-        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
-        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
-        psraw           xmm1,           xmm_filter_shift
-        psraw           xmm3,           xmm_filter_shift
-        packuswb        xmm1,           xmm3
-
-        movdqa          xmm2,           xmm0
-        movdqa          xmm0,           xmm1
-        movdqa          xmm3,           xmm2
-
-        punpcklbw       xmm2,           xmm1
-        punpckhbw       xmm3,           xmm1
-        pmaddubsw       xmm2,           [rdx]
-        pmaddubsw       xmm3,           [rdx]
-
-        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
-        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
-        psraw           xmm2,           xmm_filter_shift
-        psraw           xmm3,           xmm_filter_shift
-
-        movq            xmm1,           QWORD PTR [rdi]
-        pxor            xmm4,           xmm4
-        punpcklbw       xmm1,           xmm4
-        movq            xmm5,           QWORD PTR [rdi+8]
-        punpcklbw       xmm5,           xmm4
-
-        psubw           xmm2,           xmm1
-        psubw           xmm3,           xmm5
-        paddw           xmm6,           xmm2
-        paddw           xmm6,           xmm3
-        pmaddwd         xmm2,           xmm2
-        pmaddwd         xmm3,           xmm3
-        paddd           xmm7,           xmm2
-        paddd           xmm7,           xmm3
-
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1)     ;ref_pixels_per_line
-        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
-%else
-        lea             rsi,            [rsi + r8]
-        lea             rdi,            [rdi + r9]
-%endif
-
-        sub             rcx,            1
-        jnz             .filter_block2d_bil_var_ssse3_loop
-
-        jmp             .filter_block2d_bil_variance
-
-.filter_block2d_bil_var_ssse3_sp_only:
-        movsxd          rdx,            dword ptr arg(6)     ; yoffset
-
-        cmp             rdx,            0                    ; Both xoffset =0 and yoffset=0
-        je              .filter_block2d_bil_var_ssse3_full_pixel
-
-        shl             rdx,            4
-        lea             rdx,            [rdx + rcx]          ; VFilter
-
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
-
-        movdqu          xmm1,           XMMWORD PTR [rsi]
-        movdqa          xmm0,           xmm1
-
-%if ABI_IS_32BIT=0
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-        lea             rsi,            [rsi + rax]
-
-.filter_block2d_bil_sp_only_loop:
-        movdqu          xmm3,           XMMWORD PTR [rsi]
-        movdqa          xmm2,           xmm1
-        movdqa          xmm0,           xmm3
-
-        punpcklbw       xmm1,           xmm3
-        punpckhbw       xmm2,           xmm3
-        pmaddubsw       xmm1,           [rdx]
-        pmaddubsw       xmm2,           [rdx]
-
-        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
-        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
-        psraw           xmm1,           xmm_filter_shift
-        psraw           xmm2,           xmm_filter_shift
-
-        movq            xmm3,           QWORD PTR [rdi]
-        pxor            xmm4,           xmm4
-        punpcklbw       xmm3,           xmm4
-        movq            xmm5,           QWORD PTR [rdi+8]
-        punpcklbw       xmm5,           xmm4
-
-        psubw           xmm1,           xmm3
-        psubw           xmm2,           xmm5
-        paddw           xmm6,           xmm1
-        paddw           xmm6,           xmm2
-        pmaddwd         xmm1,           xmm1
-        pmaddwd         xmm2,           xmm2
-        paddd           xmm7,           xmm1
-        paddd           xmm7,           xmm2
-
-        movdqa          xmm1,           xmm0
-        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
-
-%if ABI_IS_32BIT
-        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
-%else
-        lea             rdi,            [rdi + r9]
-%endif
-
-        sub             rcx,            1
-        jnz             .filter_block2d_bil_sp_only_loop
-
-        jmp             .filter_block2d_bil_variance
-
-.filter_block2d_bil_var_ssse3_full_pixel:
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
-        movsxd          rdx,            dword ptr arg(3)     ;src_pixels_per_line
-        pxor            xmm0,           xmm0
-
-.filter_block2d_bil_full_pixel_loop:
-        movq            xmm1,           QWORD PTR [rsi]
-        punpcklbw       xmm1,           xmm0
-        movq            xmm2,           QWORD PTR [rsi+8]
-        punpcklbw       xmm2,           xmm0
-
-        movq            xmm3,           QWORD PTR [rdi]
-        punpcklbw       xmm3,           xmm0
-        movq            xmm4,           QWORD PTR [rdi+8]
-        punpcklbw       xmm4,           xmm0
-
-        psubw           xmm1,           xmm3
-        psubw           xmm2,           xmm4
-        paddw           xmm6,           xmm1
-        paddw           xmm6,           xmm2
-        pmaddwd         xmm1,           xmm1
-        pmaddwd         xmm2,           xmm2
-        paddd           xmm7,           xmm1
-        paddd           xmm7,           xmm2
-
-        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
-        lea             rdi,            [rdi + rdx]          ;src_pixels_per_line
-        sub             rcx,            1
-        jnz             .filter_block2d_bil_full_pixel_loop
-
-        jmp             .filter_block2d_bil_variance
-
-.filter_block2d_bil_var_ssse3_fp_only:
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line
-
-        pxor            xmm0,           xmm0
-
-%if ABI_IS_32BIT=0
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-.filter_block2d_bil_fp_only_loop:
-        movdqu          xmm1,           XMMWORD PTR [rsi]
-        movdqu          xmm2,           XMMWORD PTR [rsi+1]
-        movdqa          xmm3,           xmm1
-
-        punpcklbw       xmm1,           xmm2
-        punpckhbw       xmm3,           xmm2
-        pmaddubsw       xmm1,           [rax]
-        pmaddubsw       xmm3,           [rax]
-
-        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
-        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
-        psraw           xmm1,           xmm_filter_shift
-        psraw           xmm3,           xmm_filter_shift
-
-        movq            xmm2,           XMMWORD PTR [rdi]
-        pxor            xmm4,           xmm4
-        punpcklbw       xmm2,           xmm4
-        movq            xmm5,           QWORD PTR [rdi+8]
-        punpcklbw       xmm5,           xmm4
-
-        psubw           xmm1,           xmm2
-        psubw           xmm3,           xmm5
-        paddw           xmm6,           xmm1
-        paddw           xmm6,           xmm3
-        pmaddwd         xmm1,           xmm1
-        pmaddwd         xmm3,           xmm3
-        paddd           xmm7,           xmm1
-        paddd           xmm7,           xmm3
-
-        lea             rsi,            [rsi + rdx]
-%if ABI_IS_32BIT
-        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
-%else
-        lea             rdi,            [rdi + r9]
-%endif
-
-        sub             rcx,            1
-        jnz             .filter_block2d_bil_fp_only_loop
-
-        jmp             .filter_block2d_bil_variance
-
-.filter_block2d_bil_variance:
-        pxor        xmm0,           xmm0
-        pxor        xmm1,           xmm1
-        pxor        xmm5,           xmm5
-
-        punpcklwd   xmm0,           xmm6
-        punpckhwd   xmm1,           xmm6
-        psrad       xmm0,           16
-        psrad       xmm1,           16
-        paddd       xmm0,           xmm1
-        movdqa      xmm1,           xmm0
-
-        movdqa      xmm6,           xmm7
-        punpckldq   xmm6,           xmm5
-        punpckhdq   xmm7,           xmm5
-        paddd       xmm6,           xmm7
-
-        punpckldq   xmm0,           xmm5
-        punpckhdq   xmm1,           xmm5
-        paddd       xmm0,           xmm1
-
-        movdqa      xmm7,           xmm6
-        movdqa      xmm1,           xmm0
-
-        psrldq      xmm7,           8
-        psrldq      xmm1,           8
-
-        paddd       xmm6,           xmm7
-        paddd       xmm0,           xmm1
-
-        mov         rsi,            arg(7) ;[Sum]
-        mov         rdi,            arg(8) ;[SSE]
-
-        movd        [rsi],       xmm0
-        movd        [rdi],       xmm6
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-SECTION_RODATA
-align 16
-xmm_bi_rd:
-    times 8 dw 64
-align 16
-bilinear_filters_ssse3:
-    times 8 db 128, 0
-    times 8 db 120, 8
-    times 8 db 112, 16
-    times 8 db 104, 24
-    times 8 db  96, 32
-    times 8 db  88, 40
-    times 8 db  80, 48
-    times 8 db  72, 56
-    times 8 db  64, 64
-    times 8 db  56, 72
-    times 8 db  48, 80
-    times 8 db  40, 88
-    times 8 db  32, 96
-    times 8 db  24, 104
-    times 8 db  16, 112
-    times 8 db   8, 120
diff --git a/vp9/encoder/x86/vp9_variance_mmx.c b/vp9/encoder/x86/vp9_variance_mmx.c
index bad1cfa74..d1415606e 100644
--- a/vp9/encoder/x86/vp9_variance_mmx.c
+++ b/vp9/encoder/x86/vp9_variance_mmx.c
@@ -13,27 +13,6 @@
 #include "vp9/common/vp9_pragmas.h"
 #include "vpx_ports/mem.h"
 
-extern void filter_block1d_h6_mmx
-(
-  const unsigned char *src_ptr,
-  unsigned short *output_ptr,
-  unsigned int src_pixels_per_line,
-  unsigned int pixel_step,
-  unsigned int output_height,
-  unsigned int output_width,
-  short *vp7_filter
-);
-extern void filter_block1d_v6_mmx
-(
-  const short *src_ptr,
-  unsigned char *output_ptr,
-  unsigned int pixels_per_line,
-  unsigned int pixel_step,
-  unsigned int output_height,
-  unsigned int output_width,
-  short *vp7_filter
-);
-
 extern unsigned int vp9_get_mb_ss_mmx(const short *src_ptr);
 extern unsigned int vp9_get8x8var_mmx
 (
@@ -53,30 +32,6 @@ extern unsigned int vp9_get4x4var_mmx
   unsigned int *SSE,
   int *Sum
 );
-extern void vp9_filter_block2d_bil4x4_var_mmx
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  const short *HFilter,
-  const short *VFilter,
-  int *sum,
-  unsigned int *sumsquared
-);
-extern void vp9_filter_block2d_bil_var_mmx
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  const short *HFilter,
-  const short *VFilter,
-  int *sum,
-  unsigned int *sumsquared
-);
-
 
 unsigned int vp9_variance4x4_mmx(
   const unsigned char *src_ptr,
@@ -190,193 +145,3 @@ unsigned int vp9_variance8x16_mmx(
   return (var - (((unsigned int)avg * avg) >> 7));
 
 }
-
-DECLARE_ALIGNED(16, extern const short, vp9_bilinear_filters_mmx[16][8]);
-
-unsigned int vp9_sub_pixel_variance4x4_mmx
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse)
-
-{
-  int xsum;
-  unsigned int xxsum;
-  vp9_filter_block2d_bil4x4_var_mmx(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line,
-    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
-    &xsum, &xxsum
-  );
-  *sse = xxsum;
-  return (xxsum - (((unsigned int)xsum * xsum) >> 4));
-}
-
-
-unsigned int vp9_sub_pixel_variance8x8_mmx
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-
-  int xsum;
-  unsigned int xxsum;
-  vp9_filter_block2d_bil_var_mmx(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line, 8,
-    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
-    &xsum, &xxsum
-  );
-  *sse = xxsum;
-  return (xxsum - (((unsigned int)xsum * xsum) >> 6));
-}
-
-unsigned int vp9_sub_pixel_variance16x16_mmx
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-
-  int xsum0, xsum1;
-  unsigned int xxsum0, xxsum1;
-
-  vp9_filter_block2d_bil_var_mmx(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line, 16,
-    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
-    &xsum0, &xxsum0
-  );
-
-  vp9_filter_block2d_bil_var_mmx(
-    src_ptr + 8, src_pixels_per_line,
-    dst_ptr + 8, dst_pixels_per_line, 16,
-    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
-    &xsum1, &xxsum1
-  );
-
-  xsum0 += xsum1;
-  xxsum0 += xxsum1;
-
-  *sse = xxsum0;
-  return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-
-
-}
-
-unsigned int vp9_sub_pixel_mse16x16_mmx(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  vp9_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
-  return *sse;
-}
-
-unsigned int vp9_sub_pixel_variance16x8_mmx
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  int xsum0, xsum1;
-  unsigned int xxsum0, xxsum1;
-
-
-  vp9_filter_block2d_bil_var_mmx(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line, 8,
-    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
-    &xsum0, &xxsum0
-  );
-
-
-  vp9_filter_block2d_bil_var_mmx(
-    src_ptr + 8, src_pixels_per_line,
-    dst_ptr + 8, dst_pixels_per_line, 8,
-    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
-    &xsum1, &xxsum1
-  );
-
-  xsum0 += xsum1;
-  xxsum0 += xxsum1;
-
-  *sse = xxsum0;
-  return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
-}
-
-unsigned int vp9_sub_pixel_variance8x16_mmx
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  int xsum;
-  unsigned int xxsum;
-  vp9_filter_block2d_bil_var_mmx(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line, 16,
-    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
-    &xsum, &xxsum
-  );
-  *sse = xxsum;
-  return (xxsum - (((unsigned int)xsum * xsum) >> 7));
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_h_mmx(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 8, 0,
-                                         ref_ptr, recon_stride, sse);
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_v_mmx(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 8,
-                                         ref_ptr, recon_stride, sse);
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_hv_mmx(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 8, 8,
-                                         ref_ptr, recon_stride, sse);
-}
diff --git a/vp9/encoder/x86/vp9_variance_sse2.c b/vp9/encoder/x86/vp9_variance_sse2.c
index 67ca9257c..b4ff8509c 100644
--- a/vp9/encoder/x86/vp9_variance_sse2.c
+++ b/vp9/encoder/x86/vp9_variance_sse2.c
@@ -9,29 +9,11 @@
  */
 
 #include "vpx_config.h"
+
 #include "vp9/encoder/vp9_variance.h"
 #include "vp9/common/vp9_pragmas.h"
 #include "vpx_ports/mem.h"
 
-#define HALFNDX 8
-
-extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-
-extern void vp9_filter_block2d_bil4x4_var_mmx
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  const short *HFilter,
-  const short *VFilter,
-  int *sum,
-  unsigned int *sumsquared
-);
-
 extern unsigned int vp9_get4x4var_mmx
 (
   const unsigned char *src_ptr,
@@ -64,18 +46,6 @@ unsigned int vp9_get8x8var_sse2
   unsigned int *SSE,
   int *Sum
 );
-void vp9_filter_block2d_bil_var_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int  xoffset,
-  int  yoffset,
-  int *sum,
-  unsigned int *sumsquared
-);
 void vp9_half_horiz_vert_variance8x_h_sse2
 (
   const unsigned char *ref_ptr,
@@ -137,8 +107,6 @@ void vp9_half_vert_variance16x_h_sse2
   unsigned int *sumsquared
 );
 
-DECLARE_ALIGNED(16, extern const short, vp9_bilinear_filters_mmx[16][8]);
-
 typedef unsigned int (*get_var_sse2) (
   const unsigned char *src_ptr,
   int source_stride,
@@ -375,347 +343,162 @@ unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr,
   return (var - (((int64_t)avg * avg) >> 11));
 }
 
-unsigned int vp9_sub_pixel_variance4x4_wmt
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  int xsum;
-  unsigned int xxsum;
-  vp9_filter_block2d_bil4x4_var_mmx(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line,
-    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
-    &xsum, &xxsum
-  );
-  *sse = xxsum;
-  return (xxsum - (((unsigned int)xsum * xsum) >> 4));
+#define DECL(w, opt) \
+int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
+                                        ptrdiff_t src_stride, \
+                                        int x_offset, int y_offset, \
+                                        const uint8_t *dst, \
+                                        ptrdiff_t dst_stride, \
+                                        int height, unsigned int *sse)
+#define DECLS(opt1, opt2) \
+DECL(4, opt2); \
+DECL(8, opt1); \
+DECL(16, opt1)
+
+DECLS(sse2, sse);
+DECLS(ssse3, ssse3);
+#undef DECLS
+#undef DECL
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
+                                                     int src_stride, \
+                                                     int x_offset, \
+                                                     int y_offset, \
+                                                     const uint8_t *dst, \
+                                                     int dst_stride, \
+                                                     unsigned int *sse_ptr) { \
+  unsigned int sse; \
+  int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
+                                                y_offset, dst, dst_stride, \
+                                                h, &sse); \
+  if (w > wf) { \
+    unsigned int sse2; \
+    int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
+                                                   x_offset, y_offset, \
+                                                   dst + 16, dst_stride, \
+                                                   h, &sse2); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { \
+      se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
+                                                 x_offset, y_offset, \
+                                                 dst + 32, dst_stride, \
+                                                 h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
+                                                 x_offset, y_offset, \
+                                                 dst + 48, dst_stride, \
+                                                 h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  *sse_ptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
 }
 
-
-unsigned int vp9_sub_pixel_variance8x8_wmt
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  int xsum;
-  unsigned int xxsum;
-
-  if (xoffset == HALFNDX && yoffset == 0) {
-    vp9_half_horiz_variance8x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      &xsum, &xxsum);
-  } else if (xoffset == 0 && yoffset == HALFNDX) {
-    vp9_half_vert_variance8x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      &xsum, &xxsum);
-  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
-    vp9_half_horiz_vert_variance8x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      &xsum, &xxsum);
-  } else {
-    vp9_filter_block2d_bil_var_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      xoffset, yoffset,
-      &xsum, &xxsum);
-  }
-
-  *sse = xxsum;
-  return (xxsum - (((unsigned int)xsum * xsum) >> 6));
-}
-
-static void sub_pixel_variance16x16_sse2(const uint8_t *src_ptr,
-                                         int src_pixels_per_line,
-                                         int xoffset,
-                                         int yoffset,
-                                         const uint8_t *dst_ptr,
-                                         int dst_pixels_per_line,
-                                         unsigned int *sse, int *avg) {
-  int xsum0, xsum1;
-  unsigned int xxsum0, xxsum1;
-
-  // note we could avoid these if statements if the calling function
-  // just called the appropriate functions inside.
-  if (xoffset == HALFNDX && yoffset == 0) {
-    vp9_half_horiz_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      &xsum0, &xxsum0);
-  } else if (xoffset == 0 && yoffset == HALFNDX) {
-    vp9_half_vert_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      &xsum0, &xxsum0);
-  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
-    vp9_half_horiz_vert_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      &xsum0, &xxsum0);
-  } else {
-    vp9_filter_block2d_bil_var_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      xoffset, yoffset,
-      &xsum0, &xxsum0
-    );
-
-    vp9_filter_block2d_bil_var_sse2(
-      src_ptr + 8, src_pixels_per_line,
-      dst_ptr + 8, dst_pixels_per_line, 16,
-      xoffset, yoffset,
-      &xsum1, &xxsum1
-    );
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-  }
-
-  *sse = xxsum0;
-  *avg = xsum0;
-}
-
-unsigned int vp9_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr,
-                                              int src_pixels_per_line,
-                                              int xoffset,
-                                              int yoffset,
-                                              const uint8_t *dst_ptr,
-                                              int dst_pixels_per_line,
-                                              unsigned int *sse_ptr) {
-  int avg;
-  unsigned int sse;
-
-  sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
-                               yoffset, dst_ptr, dst_pixels_per_line,
-                               &sse, &avg);
-  *sse_ptr = sse;
-
-  return (sse - (((unsigned int) avg * avg) >> 8));
-}
-
-unsigned int vp9_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr,
-                                              int src_pixels_per_line,
-                                              int xoffset,
-                                              int yoffset,
-                                              const uint8_t *dst_ptr,
-                                              int dst_pixels_per_line,
-                                              unsigned int *sse_ptr) {
-  int avg0, avg1, avg2, avg3;
-  unsigned int sse0, sse1, sse2, sse3;
-
-  sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
-                               yoffset, dst_ptr, dst_pixels_per_line,
-                               &sse0, &avg0);
-  sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
-                               yoffset, dst_ptr + 16, dst_pixels_per_line,
-                               &sse1, &avg1);
-  src_ptr += 16 * src_pixels_per_line;
-  dst_ptr += 16 * dst_pixels_per_line;
-  sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
-                               yoffset, dst_ptr, dst_pixels_per_line,
-                               &sse2, &avg2);
-  sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
-                               yoffset, dst_ptr + 16, dst_pixels_per_line,
-                               &sse3, &avg3);
-  sse0 += sse1 + sse2 + sse3;
-  avg0 += avg1 + avg2 + avg3;
-  *sse_ptr = sse0;
-
-  return (sse0 - (((unsigned int) avg0 * avg0) >> 10));
-}
-
-unsigned int vp9_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr,
-                                              int src_pixels_per_line,
-                                              int xoffset,
-                                              int yoffset,
-                                              const uint8_t *dst_ptr,
-                                              int dst_pixels_per_line,
-                                              unsigned int *sse_ptr) {
-  int avg0, avg1, avg2, avg3, avg4;
-  unsigned int sse0, sse1, sse2, sse3, sse4;
-
-  sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
-                               yoffset, dst_ptr, dst_pixels_per_line,
-                               &sse0, &avg0);
-  sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
-                               yoffset, dst_ptr + 16, dst_pixels_per_line,
-                               &sse1, &avg1);
-  sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset,
-                               yoffset, dst_ptr + 32, dst_pixels_per_line,
-                               &sse2, &avg2);
-  sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset,
-                               yoffset, dst_ptr + 48, dst_pixels_per_line,
-                               &sse3, &avg3);
-  src_ptr += 16 * src_pixels_per_line;
-  dst_ptr += 16 * dst_pixels_per_line;
-  avg0 += avg1 + avg2 + avg3;
-  sse0 += sse1 + sse2 + sse3;
-  sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
-                               yoffset, dst_ptr, dst_pixels_per_line,
-                               &sse1, &avg1);
-  sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
-                               yoffset, dst_ptr + 16, dst_pixels_per_line,
-                               &sse2, &avg2);
-  sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset,
-                               yoffset, dst_ptr + 32, dst_pixels_per_line,
-                               &sse3, &avg3);
-  sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset,
-                               yoffset, dst_ptr + 48, dst_pixels_per_line,
-                               &sse4, &avg4);
-  src_ptr += 16 * src_pixels_per_line;
-  dst_ptr += 16 * dst_pixels_per_line;
-  avg0 += avg1 + avg2 + avg3 + avg4;
-  sse0 += sse1 + sse2 + sse3 + sse4;
-  sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
-                               yoffset, dst_ptr, dst_pixels_per_line,
-                               &sse1, &avg1);
-  sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
-                               yoffset, dst_ptr + 16, dst_pixels_per_line,
-                               &sse2, &avg2);
-  sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset,
-                               yoffset, dst_ptr + 32, dst_pixels_per_line,
-                               &sse3, &avg3);
-  sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset,
-                               yoffset, dst_ptr + 48, dst_pixels_per_line,
-                               &sse4, &avg4);
-  src_ptr += 16 * src_pixels_per_line;
-  dst_ptr += 16 * dst_pixels_per_line;
-  avg0 += avg1 + avg2 + avg3 + avg4;
-  sse0 += sse1 + sse2 + sse3 + sse4;
-  sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
-                               yoffset, dst_ptr, dst_pixels_per_line,
-                               &sse1, &avg1);
-  sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
-                               yoffset, dst_ptr + 16, dst_pixels_per_line,
-                               &sse2, &avg2);
-  sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset,
-                               yoffset, dst_ptr + 32, dst_pixels_per_line,
-                               &sse3, &avg3);
-  sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset,
-                               yoffset, dst_ptr + 48, dst_pixels_per_line,
-                               &sse4, &avg4);
-  avg0 += avg1 + avg2 + avg3 + avg4;
-  sse0 += sse1 + sse2 + sse3 + sse4;
-  *sse_ptr = sse0;
-
-  return (sse0 - (((unsigned int) avg0 * avg0) >> 12));
-}
-
-unsigned int vp9_sub_pixel_mse16x16_sse2(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  vp9_sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
-                                   yoffset, dst_ptr, dst_pixels_per_line, sse);
-  return *sse;
-}
-
-unsigned int vp9_sub_pixel_variance16x8_wmt
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-
-) {
-  int xsum0, xsum1;
-  unsigned int xxsum0, xxsum1;
-
-  if (xoffset == HALFNDX && yoffset == 0) {
-    vp9_half_horiz_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      &xsum0, &xxsum0);
-  } else if (xoffset == 0 && yoffset == HALFNDX) {
-    vp9_half_vert_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      &xsum0, &xxsum0);
-  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
-    vp9_half_horiz_vert_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      &xsum0, &xxsum0);
-  } else {
-    vp9_filter_block2d_bil_var_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      xoffset, yoffset,
-      &xsum0, &xxsum0);
-
-    vp9_filter_block2d_bil_var_sse2(
-      src_ptr + 8, src_pixels_per_line,
-      dst_ptr + 8, dst_pixels_per_line, 8,
-      xoffset, yoffset,
-      &xsum1, &xxsum1);
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-  }
-
-  *sse = xxsum0;
-  return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
-}
-
-unsigned int vp9_sub_pixel_variance8x16_wmt
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  int xsum;
-  unsigned int xxsum;
-
-  if (xoffset == HALFNDX && yoffset == 0) {
-    vp9_half_horiz_variance8x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      &xsum, &xxsum);
-  } else if (xoffset == 0 && yoffset == HALFNDX) {
-    vp9_half_vert_variance8x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      &xsum, &xxsum);
-  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
-    vp9_half_horiz_vert_variance8x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      &xsum, &xxsum);
-  } else {
-    vp9_filter_block2d_bil_var_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      xoffset, yoffset,
-      &xsum, &xxsum);
-  }
-
-  *sse = xxsum;
-  return (xxsum - (((unsigned int)xsum * xsum) >> 7));
+#define FNS(opt1, opt2) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
+FN(16,  8, 16, 4, 3, opt1,); \
+FN(8,  16,  8, 3, 4, opt1,); \
+FN(8,   8,  8, 3, 3, opt1,); \
+FN(8,   4,  8, 3, 2, opt1,); \
+FN(4,   8,  4, 2, 3, opt2,); \
+FN(4,   4,  4, 2, 2, opt2,)
+
+FNS(sse2, sse);
+FNS(ssse3, ssse3);
+
+#undef FNS
+#undef FN
+
+#define DECL(w, opt) \
+int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
+                                            ptrdiff_t src_stride, \
+                                            int x_offset, int y_offset, \
+                                            const uint8_t *dst, \
+                                            ptrdiff_t dst_stride, \
+                                            const uint8_t *sec, \
+                                            ptrdiff_t sec_stride, \
+                                            int height, unsigned int *sse)
+#define DECLS(opt1, opt2) \
+DECL(4, opt2); \
+DECL(8, opt1); \
+DECL(16, opt1)
+
+DECLS(sse2, sse);
+DECLS(ssse3, ssse3);
+#undef DECL
+#undef DECLS
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
+                                                         int src_stride, \
+                                                         int x_offset, \
+                                                         int y_offset, \
+                                                         const uint8_t *dst, \
+                                                         int dst_stride, \
+                                                         unsigned int *sseptr, \
+                                                         const uint8_t *sec) { \
+  unsigned int sse; \
+  int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
+                                                    y_offset, dst, dst_stride, \
+                                                    sec, w, h, &sse); \
+  if (w > wf) { \
+    unsigned int sse2; \
+    int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
+                                                       x_offset, y_offset, \
+                                                       dst + 16, dst_stride, \
+                                                       sec + 16, w, h, &sse2); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { \
+      se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
+                                                     x_offset, y_offset, \
+                                                     dst + 32, dst_stride, \
+                                                     sec + 32, w, h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
+                                                     x_offset, y_offset, \
+                                                     dst + 48, dst_stride, \
+                                                     sec + 48, w, h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  *sseptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
 }
 
+#define FNS(opt1, opt2) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
+FN(16,  8, 16, 4, 3, opt1,); \
+FN(8,  16,  8, 3, 4, opt1,); \
+FN(8,   8,  8, 3, 3, opt1,); \
+FN(8,   4,  8, 3, 2, opt1,); \
+FN(4,   8,  4, 2, 3, opt2,); \
+FN(4,   4,  4, 2, 2, opt2,)
+
+FNS(sse2, sse);
+FNS(ssse3, ssse3);
+
+#undef FNS
+#undef FN
 
 unsigned int vp9_variance_halfpixvar16x16_h_wmt(
   const unsigned char *src_ptr,
diff --git a/vp9/encoder/x86/vp9_variance_ssse3.c b/vp9/encoder/x86/vp9_variance_ssse3.c
deleted file mode 100644
index 882acad78..000000000
--- a/vp9/encoder/x86/vp9_variance_ssse3.c
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vp9/encoder/vp9_variance.h"
-#include "vp9/common/vp9_pragmas.h"
-#include "vpx_ports/mem.h"
-
-#define HALFNDX 8
-
-extern void vp9_half_horiz_vert_variance16x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-extern void vp9_half_horiz_variance16x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-extern void vp9_half_vert_variance16x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-extern void vp9_filter_block2d_bil_var_ssse3
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int  xoffset,
-  int  yoffset,
-  int *sum,
-  unsigned int *sumsquared
-);
-
-unsigned int vp9_sub_pixel_variance16x16_ssse3
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  int xsum0;
-  unsigned int xxsum0;
-
-  // note we could avoid these if statements if the calling function
-  // just called the appropriate functions inside.
-  if (xoffset == HALFNDX && yoffset == 0) {
-    vp9_half_horiz_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      &xsum0, &xxsum0);
-  } else if (xoffset == 0 && yoffset == HALFNDX) {
-    vp9_half_vert_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      &xsum0, &xxsum0);
-  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
-    vp9_half_horiz_vert_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      &xsum0, &xxsum0);
-  } else {
-    vp9_filter_block2d_bil_var_ssse3(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      xoffset, yoffset,
-      &xsum0, &xxsum0);
-  }
-
-  *sse = xxsum0;
-  return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
-
-unsigned int vp9_sub_pixel_variance16x8_ssse3
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-
-) {
-  int xsum0;
-  unsigned int xxsum0;
-
-  if (xoffset == HALFNDX && yoffset == 0) {
-    vp9_half_horiz_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      &xsum0, &xxsum0);
-  } else if (xoffset == 0 && yoffset == HALFNDX) {
-    vp9_half_vert_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      &xsum0, &xxsum0);
-  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
-    vp9_half_horiz_vert_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      &xsum0, &xxsum0);
-  } else {
-    vp9_filter_block2d_bil_var_ssse3(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      xoffset, yoffset,
-      &xsum0, &xxsum0);
-  }
-
-  *sse = xxsum0;
-  return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
-}
diff --git a/vp9/encoder/x86/vp9_x86_csystemdependent.c b/vp9/encoder/x86/vp9_x86_csystemdependent.c
deleted file mode 100644
index 6016e14eb..000000000
--- a/vp9/encoder/x86/vp9_x86_csystemdependent.c
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "./vpx_config.h"
-#include "vpx_ports/x86.h"
-#include "vp9/encoder/vp9_variance.h"
-#include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/encoder/x86/vp9_dct_mmx.h"
-
-// TODO(jimbankoski) Consider rewriting the c to take the same values rather
-// than going through these pointer conversions
-#if 0 && HAVE_MMX
-void vp9_short_fdct8x4_mmx(short *input, short *output, int pitch) {
-  vp9_short_fdct4x4_mmx(input,   output,    pitch);
-  vp9_short_fdct4x4_mmx(input + 4, output + 16, pitch);
-}
-
-void vp9_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
-                             short *diff, unsigned char *predictor,
-                             int pitch);
-void vp9_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) {
-  unsigned char *z = *(be->base_src) + be->src;
-  unsigned int  src_stride = be->src_stride;
-  short *diff = &be->src_diff[0];
-  unsigned char *predictor = *(bd->base_dst) + bd->dst;
-  // TODO(jingning): The prototype function in c has been changed. Need to
-  // modify the mmx and sse versions.
-  vp9_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch);
-}
-
-#endif
-
-#if 0 && HAVE_SSE2
-void vp9_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
-                              short *diff, unsigned char *predictor,
-                              int pitch);
-void vp9_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch) {
-  unsigned char *z = *(be->base_src) + be->src;
-  unsigned int  src_stride = be->src_stride;
-  short *diff = &be->src_diff[0];
-  unsigned char *predictor = *(bd->base_dst) + bd->dst;
-  // TODO(jingning): The prototype function in c has been changed. Need to
-  // modify the mmx and sse versions.
-  vp9_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch);
-}
-
-#endif
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 4bed6c0d7..a1e93753d 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -73,27 +73,24 @@ VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
 
 
 VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_mcomp_x86.h
-VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_x86_csystemdependent.c
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_mmx.c
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_impl_mmx.asm
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_sad_mmx.asm
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.h
-VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_subtract_mmx.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_fwalsh_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance_impl_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm
-VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_ssse3.c
-VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_impl_ssse3.asm
 VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm
-VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_encodeopt.asm
 VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm
 
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c