31 files changed, 566 insertions, 683 deletions
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index a09f33ed9..0f197e330 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -338,6 +338,7 @@ typedef struct macroblockd {
   signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS];
   /* 0 = Intra, Last, GF, ARF */
   signed char ref_lf_deltas[MAX_REF_LF_DELTAS];
+
   /* 0 = ZERO_MV, MV */
   signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
   /* 0 = ZERO_MV, MV */
@@ -404,34 +405,15 @@ static INLINE void update_partition_context(MACROBLOCKD *xd,
   int bwl = b_width_log2(sb_type);
   int bhl = b_height_log2(sb_type);
   int boffset = b_width_log2(BLOCK_SIZE_SB64X64) - bsl;
-  int i;
+  char pcvalue[2] = {~(0xe << boffset), ~(0xf <<boffset)};
+
+  assert(MAX(bwl, bhl) <= bsl);
 
   // update the partition context at the end notes. set partition bits
   // of block sizes larger than the current one to be one, and partition
   // bits of smaller block sizes to be zero.
-  if ((bwl == bsl) && (bhl == bsl)) {
-    for (i = 0; i < bs; i++)
-      xd->left_seg_context[i] = ~(0xf << boffset);
-    for (i = 0; i < bs; i++)
-      xd->above_seg_context[i] = ~(0xf << boffset);
-  } else if ((bwl == bsl) && (bhl < bsl)) {
-    for (i = 0; i < bs; i++)
-      xd->left_seg_context[i] = ~(0xe << boffset);
-    for (i = 0; i < bs; i++)
-      xd->above_seg_context[i] = ~(0xf << boffset);
-  }  else if ((bwl < bsl) && (bhl == bsl)) {
-    for (i = 0; i < bs; i++)
-      xd->left_seg_context[i] = ~(0xf << boffset);
-    for (i = 0; i < bs; i++)
-      xd->above_seg_context[i] = ~(0xe << boffset);
-  } else if ((bwl < bsl) && (bhl < bsl)) {
-    for (i = 0; i < bs; i++)
-      xd->left_seg_context[i] = ~(0xe << boffset);
-    for (i = 0; i < bs; i++)
-      xd->above_seg_context[i] = ~(0xe << boffset);
-  } else {
-    assert(0);
-  }
+  vpx_memset(xd->above_seg_context, pcvalue[bwl == bsl], bs);
+  vpx_memset(xd->left_seg_context, pcvalue[bhl == bsl], bs);
 }
 
 static INLINE int partition_plane_context(MACROBLOCKD *xd,
@@ -504,53 +486,25 @@ static BLOCK_SIZE_TYPE get_subsize(BLOCK_SIZE_TYPE bsize,
   return subsize;
 }
 
-// transform mapping
-static TX_TYPE txfm_map(MB_PREDICTION_MODE bmode) {
-  switch (bmode) {
-    case TM_PRED :
-    case D135_PRED :
-      return ADST_ADST;
-
-    case V_PRED :
-    case D117_PRED :
-    case D63_PRED:
-      return ADST_DCT;
-
-    case H_PRED :
-    case D153_PRED :
-    case D27_PRED :
-      return DCT_ADST;
+extern const TX_TYPE mode2txfm_map[MB_MODE_COUNT];
 
-    default:
-      return DCT_DCT;
-  }
-}
-
-static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, int ib) {
+static INLINE TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, int ib) {
   MODE_INFO *const mi = xd->mode_info_context;
   MB_MODE_INFO *const mbmi = &mi->mbmi;
 
   if (xd->lossless || mbmi->ref_frame[0] != INTRA_FRAME)
     return DCT_DCT;
 
-  if (mbmi->sb_type < BLOCK_SIZE_SB8X8) {
-    return txfm_map(mi->bmi[ib].as_mode.first);
-  } else {
-    assert(mbmi->mode <= TM_PRED);
-    return txfm_map(mbmi->mode);
-  }
+  return mode2txfm_map[mbmi->sb_type < BLOCK_SIZE_SB8X8 ?
+                       mi->bmi[ib].as_mode.first : mbmi->mode];
 }
 
-static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd) {
-  return xd->mode_info_context->mbmi.mode <= TM_PRED
-             ? txfm_map(xd->mode_info_context->mbmi.mode)
-             : DCT_DCT;
+static INLINE TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd) {
+  return mode2txfm_map[xd->mode_info_context->mbmi.mode];
 }
 
-static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd) {
-  return xd->mode_info_context->mbmi.mode <= TM_PRED
-             ? txfm_map(xd->mode_info_context->mbmi.mode)
-             : DCT_DCT;
+static INLINE TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd) {
+  return  mode2txfm_map[xd->mode_info_context->mbmi.mode];
 }
 
 void vp9_setup_block_dptrs(MACROBLOCKD *xd,
diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h
index 0d7babf97..9a14ab149 100644
--- a/vp9/common/vp9_common.h
+++ b/vp9/common/vp9_common.h
@@ -60,6 +60,18 @@ static INLINE int multiple8(int value) {
   return (value + 7) & ~7;
 }
 
+// Returns the bit length of (num_values - 1), i.e. the number of bits needed
+// to represent any value in [0, num_values - 1]; 0 when num_values <= 1.
+// INLINE: defined in a header, so plain static copies would go unused.
+static INLINE int get_unsigned_bits(unsigned int num_values) {
+  int cat = 0;
+  if (num_values <= 1)
+    return 0;
+  for (num_values--; num_values > 0; num_values >>= 1)
+    cat++;
+  return cat;
+}
+
 #define SYNC_CODE_0 0x49
 #define SYNC_CODE_1 0x83
 #define SYNC_CODE_2 0x42
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index 7ab1114bd..ec61f990f 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -641,31 +641,6 @@ void vp9_coef_tree_initialize() {
 #define COEF_COUNT_SAT_AFTER_KEY 24
 #define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128
 
-void vp9_full_to_model_count(unsigned int *model_count,
-                             unsigned int *full_count) {
-  int n;
-  model_count[ZERO_TOKEN] = full_count[ZERO_TOKEN];
-  model_count[ONE_TOKEN] = full_count[ONE_TOKEN];
-  model_count[TWO_TOKEN] = full_count[TWO_TOKEN];
-  for (n = THREE_TOKEN; n < DCT_EOB_TOKEN; ++n)
-    model_count[TWO_TOKEN] += full_count[n];
-  model_count[DCT_EOB_MODEL_TOKEN] = full_count[DCT_EOB_TOKEN];
-}
-
-void vp9_full_to_model_counts(
-    vp9_coeff_count_model *model_count, vp9_coeff_count *full_count) {
-  int i, j, k, l;
-  for (i = 0; i < BLOCK_TYPES; ++i)
-    for (j = 0; j < REF_TYPES; ++j)
-      for (k = 0; k < COEF_BANDS; ++k)
-        for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
-          if (l >= 3 && k == 0)
-            continue;
-          vp9_full_to_model_count(model_count[i][j][k][l],
-                                  full_count[i][j][k][l]);
-        }
-}
-
 static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE txfm_size,
                              int count_sat, int update_factor) {
   vp9_coeff_probs_model *dst_coef_probs = cm->fc.coef_probs[txfm_size];
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 149ed8b42..e9a47daed 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -174,10 +174,6 @@ typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS]
 typedef unsigned int vp9_coeff_stats_model[REF_TYPES][COEF_BANDS]
                                           [PREV_COEF_CONTEXTS]
                                           [UNCONSTRAINED_NODES][2];
-extern void vp9_full_to_model_count(unsigned int *model_count,
-                                    unsigned int *full_count);
-extern void vp9_full_to_model_counts(
-    vp9_coeff_count_model *model_count, vp9_coeff_count *full_count);
 
 void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full);
 
diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c
index 000e284ee..e8696fefa 100644
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c
@@ -198,7 +198,7 @@ static void counts_to_context(nmv_component_counts *mvcomp, int usehp) {
   }
 }
 
-void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx,
+void vp9_inc_mv(const MV *mv, const MV *ref, nmv_context_counts *mvctx,
                        int usehp) {
   const MV_JOINT_TYPE j = vp9_get_mv_joint(mv);
   mvctx->joints[j]++;
@@ -226,79 +226,6 @@ void vp9_counts_process(nmv_context_counts *nmv_count, int usehp) {
   counts_to_context(&nmv_count->comps[1], usehp);
 }
 
-void vp9_counts_to_nmv_context(
-    nmv_context_counts *nmv_count,
-    nmv_context *prob,
-    int usehp,
-    unsigned int (*branch_ct_joint)[2],
-    unsigned int (*branch_ct_sign)[2],
-    unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2],
-    unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2],
-    unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2],
-    unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2],
-    unsigned int (*branch_ct_fp)[4 - 1][2],
-    unsigned int (*branch_ct_class0_hp)[2],
-    unsigned int (*branch_ct_hp)[2]) {
-  int i, j, k;
-  vp9_counts_process(nmv_count, usehp);
-  vp9_tree_probs_from_distribution(vp9_mv_joint_tree,
-                                   prob->joints,
-                                   branch_ct_joint,
-                                   nmv_count->joints, 0);
-  for (i = 0; i < 2; ++i) {
-    const uint32_t s0 = nmv_count->comps[i].sign[0];
-    const uint32_t s1 = nmv_count->comps[i].sign[1];
-
-    prob->comps[i].sign = get_binary_prob(s0, s1);
-    branch_ct_sign[i][0] = s0;
-    branch_ct_sign[i][1] = s1;
-    vp9_tree_probs_from_distribution(vp9_mv_class_tree,
-                                     prob->comps[i].classes,
-                                     branch_ct_classes[i],
-                                     nmv_count->comps[i].classes, 0);
-    vp9_tree_probs_from_distribution(vp9_mv_class0_tree,
-                                     prob->comps[i].class0,
-                                     branch_ct_class0[i],
-                                     nmv_count->comps[i].class0, 0);
-    for (j = 0; j < MV_OFFSET_BITS; ++j) {
-      const uint32_t b0 = nmv_count->comps[i].bits[j][0];
-      const uint32_t b1 = nmv_count->comps[i].bits[j][1];
-
-      prob->comps[i].bits[j] = get_binary_prob(b0, b1);
-      branch_ct_bits[i][j][0] = b0;
-      branch_ct_bits[i][j][1] = b1;
-    }
-  }
-  for (i = 0; i < 2; ++i) {
-    for (k = 0; k < CLASS0_SIZE; ++k) {
-      vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
-                                       prob->comps[i].class0_fp[k],
-                                       branch_ct_class0_fp[i][k],
-                                       nmv_count->comps[i].class0_fp[k], 0);
-    }
-    vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
-                                     prob->comps[i].fp,
-                                     branch_ct_fp[i],
-                                     nmv_count->comps[i].fp, 0);
-  }
-  if (usehp) {
-    for (i = 0; i < 2; ++i) {
-      const uint32_t c0_hp0 = nmv_count->comps[i].class0_hp[0];
-      const uint32_t c0_hp1 = nmv_count->comps[i].class0_hp[1];
-      const uint32_t hp0 = nmv_count->comps[i].hp[0];
-      const uint32_t hp1 = nmv_count->comps[i].hp[1];
-
-      prob->comps[i].class0_hp = get_binary_prob(c0_hp0, c0_hp1);
-      branch_ct_class0_hp[i][0] = c0_hp0;
-      branch_ct_class0_hp[i][1] = c0_hp1;
-
-      prob->comps[i].hp = get_binary_prob(hp0, hp1);
-      branch_ct_hp[i][0] = hp0;
-      branch_ct_hp[i][1] = hp1;
-    }
-  }
-}
-
 static unsigned int adapt_probs(unsigned int i,
                                 vp9_tree tree,
                                 vp9_prob this_probs[],
diff --git a/vp9/common/vp9_entropymv.h b/vp9/common/vp9_entropymv.h
index 0fc20dbfc..de18dfb7a 100644
--- a/vp9/common/vp9_entropymv.h
+++ b/vp9/common/vp9_entropymv.h
@@ -121,22 +121,10 @@ typedef struct {
   nmv_component_counts comps[2];
 } nmv_context_counts;
 
-void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx,
-                       int usehp);
+void vp9_inc_mv(const MV *mv, const MV *ref, nmv_context_counts *mvctx,
+                int usehp);
 extern const nmv_context vp9_default_nmv_context;
-void vp9_counts_to_nmv_context(
-    nmv_context_counts *NMVcount,
-    nmv_context *prob,
-    int usehp,
-    unsigned int (*branch_ct_joint)[2],
-    unsigned int (*branch_ct_sign)[2],
-    unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2],
-    unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2],
-    unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2],
-    unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2],
-    unsigned int (*branch_ct_fp)[4 - 1][2],
-    unsigned int (*branch_ct_class0_hp)[2],
-    unsigned int (*branch_ct_hp)[2]);
+
 void vp9_counts_process(nmv_context_counts *NMVcount, int usehp);
 
 #endif  // VP9_COMMON_VP9_ENTROPYMV_H_
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index a11c1bae0..2989b9ccc 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -16,6 +16,24 @@
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vpx_mem/vpx_mem.h"
 
+// Transform type for each prediction mode, indexed by MB_PREDICTION_MODE.
+const TX_TYPE mode2txfm_map[MB_MODE_COUNT] = {
+    DCT_DCT,    // DC
+    ADST_DCT,   // V
+    DCT_ADST,   // H
+    DCT_DCT,    // D45
+    ADST_ADST,  // D135
+    ADST_DCT,   // D117
+    DCT_ADST,   // D153
+    DCT_ADST,   // D27
+    ADST_DCT,   // D63
+    ADST_ADST,  // TM
+    DCT_DCT,    // NEARESTMV
+    DCT_DCT,    // NEARMV
+    DCT_DCT,    // ZEROMV
+    DCT_DCT     // NEWMV
+};
+
 static void d27_predictor(uint8_t *ypred_ptr, int y_stride,
                           int bw, int bh,
                           uint8_t *yabove_row, uint8_t *yleft_col) {
@@ -300,6 +318,7 @@ void vp9_predict_intra_block(MACROBLOCKD *xd,
                             int bwl_in,
                             TX_SIZE tx_size,
                             int mode,
+                            uint8_t *reference, int ref_stride,
                             uint8_t *predictor, int pre_stride) {
   const int bwl = bwl_in - tx_size;
   const int wmask = (1 << bwl) - 1;
@@ -309,7 +328,7 @@ void vp9_predict_intra_block(MACROBLOCKD *xd,
   const int txfm_block_size = 4 << tx_size;
 
   assert(bwl >= 0);
-  vp9_build_intra_predictors(predictor, pre_stride,
+  vp9_build_intra_predictors(reference, ref_stride,
                              predictor, pre_stride,
                              mode,
                              txfm_block_size,
@@ -317,12 +336,3 @@ void vp9_predict_intra_block(MACROBLOCKD *xd,
                              have_top, have_left,
                              have_right);
 }
-
-void vp9_intra4x4_predict(MACROBLOCKD *xd,
-                          int block_idx,
-                          BLOCK_SIZE_TYPE bsize,
-                          int mode,
-                          uint8_t *predictor, int pre_stride) {
-  vp9_predict_intra_block(xd, block_idx, b_width_log2(bsize), TX_4X4,
-                          mode, predictor, pre_stride);
-}
diff --git a/vp9/common/vp9_reconintra.h b/vp9/common/vp9_reconintra.h
index f5f5f42c4..e369a7192 100644
--- a/vp9/common/vp9_reconintra.h
+++ b/vp9/common/vp9_reconintra.h
@@ -25,6 +25,6 @@ void vp9_predict_intra_block(MACROBLOCKD *xd,
                             int block_idx,
                             int bwl_in,
                             TX_SIZE tx_size,
-                            int mode,
+                            int mode, uint8_t *ref, int ref_stride,
                             uint8_t *predictor, int pre_stride);
 #endif  // VP9_COMMON_VP9_RECONINTRA_H_
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index d71def519..9daf908d5 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -57,9 +57,6 @@ specialize vp9_copy_mem8x4 mmx
 prototype void vp9_build_intra_predictors "uint8_t *src, int src_stride, uint8_t *pred, int y_stride, int mode, int bw, int bh, int up_available, int left_available, int right_available"
 specialize void vp9_build_intra_predictors
 
-prototype void vp9_intra4x4_predict "struct macroblockd *xd, int block, enum BLOCK_SIZE_TYPE bsize, int b_mode, uint8_t *predictor, int pre_stride"
-specialize vp9_intra4x4_predict;
-
 if [ "$CONFIG_VP9_DECODER" = "yes" ]; then
 prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"
 specialize vp9_add_constant_residual_8x8 sse2
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 4f076dc90..020e9c6cc 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -21,8 +21,10 @@
 #include "vp9/decoder/vp9_decodemv.h"
 #include "vp9/decoder/vp9_decodframe.h"
 #include "vp9/decoder/vp9_onyxd_int.h"
+#include "vp9/decoder/vp9_dsubexp.h"
 #include "vp9/decoder/vp9_treereader.h"
 
+
 #if CONFIG_DEBUG
 #include <assert.h>
 #endif
@@ -201,7 +203,7 @@ static int read_mv_component(vp9_reader *r,
   return sign ? -mag : mag;
 }
 
-static void update_nmv(vp9_reader *r, vp9_prob *const p,
+static void update_mv(vp9_reader *r, vp9_prob *const p,
                        const vp9_prob upd_p) {
   if (vp9_read(r, upd_p)) {
 #ifdef LOW_PRECISION_MV_UPDATE
@@ -212,8 +214,7 @@ static void update_nmv(vp9_reader *r, vp9_prob *const p,
   }
 }
 
-static void read_nmvprobs(vp9_reader *r, nmv_context *mvctx,
-                          int usehp) {
+static void read_mv_probs(vp9_reader *r, nmv_context *mvc, int usehp) {
   int i, j, k;
 
 #ifdef MV_GROUP_UPDATE
@@ -221,33 +222,33 @@ static void read_nmvprobs(vp9_reader *r, nmv_context *mvctx,
     return;
 #endif
   for (j = 0; j < MV_JOINTS - 1; ++j)
-    update_nmv(r, &mvctx->joints[j], VP9_NMV_UPDATE_PROB);
+    update_mv(r, &mvc->joints[j], VP9_NMV_UPDATE_PROB);
 
   for (i = 0; i < 2; ++i) {
-    update_nmv(r, &mvctx->comps[i].sign, VP9_NMV_UPDATE_PROB);
+    update_mv(r, &mvc->comps[i].sign, VP9_NMV_UPDATE_PROB);
     for (j = 0; j < MV_CLASSES - 1; ++j)
-      update_nmv(r, &mvctx->comps[i].classes[j], VP9_NMV_UPDATE_PROB);
+      update_mv(r, &mvc->comps[i].classes[j], VP9_NMV_UPDATE_PROB);
 
     for (j = 0; j < CLASS0_SIZE - 1; ++j)
-      update_nmv(r, &mvctx->comps[i].class0[j], VP9_NMV_UPDATE_PROB);
+      update_mv(r, &mvc->comps[i].class0[j], VP9_NMV_UPDATE_PROB);
 
     for (j = 0; j < MV_OFFSET_BITS; ++j)
-      update_nmv(r, &mvctx->comps[i].bits[j], VP9_NMV_UPDATE_PROB);
+      update_mv(r, &mvc->comps[i].bits[j], VP9_NMV_UPDATE_PROB);
   }
 
   for (i = 0; i < 2; ++i) {
     for (j = 0; j < CLASS0_SIZE; ++j)
       for (k = 0; k < 3; ++k)
-        update_nmv(r, &mvctx->comps[i].class0_fp[j][k], VP9_NMV_UPDATE_PROB);
+        update_mv(r, &mvc->comps[i].class0_fp[j][k], VP9_NMV_UPDATE_PROB);
 
     for (j = 0; j < 3; ++j)
-      update_nmv(r, &mvctx->comps[i].fp[j], VP9_NMV_UPDATE_PROB);
+      update_mv(r, &mvc->comps[i].fp[j], VP9_NMV_UPDATE_PROB);
   }
 
   if (usehp) {
     for (i = 0; i < 2; ++i) {
-      update_nmv(r, &mvctx->comps[i].class0_hp, VP9_NMV_UPDATE_PROB);
-      update_nmv(r, &mvctx->comps[i].hp, VP9_NMV_UPDATE_PROB);
+      update_mv(r, &mvc->comps[i].class0_hp, VP9_NMV_UPDATE_PROB);
+      update_mv(r, &mvc->comps[i].hp, VP9_NMV_UPDATE_PROB);
     }
   }
 }
@@ -320,8 +321,7 @@ static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
   for (j = 0; j < VP9_SWITCHABLE_FILTERS + 1; ++j)
     for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i)
       if (vp9_read(r, VP9_MODE_UPDATE_PROB))
-        fc->switchable_interp_prob[j][i] = vp9_read_prob_diff_update(r,
-                                             fc->switchable_interp_prob[j][i]);
+        vp9_diff_update_prob(r, &fc->switchable_interp_prob[j][i]);
 }
 
 static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
@@ -329,8 +329,7 @@ static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
   for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
     for (j = 0; j < VP9_INTER_MODES - 1; ++j)
       if (vp9_read(r, VP9_MODE_UPDATE_PROB))
-        fc->inter_mode_probs[i][j] = vp9_read_prob_diff_update(r,
-                                       fc->inter_mode_probs[i][j]);
+        vp9_diff_update_prob(r, &fc->inter_mode_probs[i][j]);
 }
 
 static INLINE COMPPREDMODE_TYPE read_comp_pred_mode(vp9_reader *r) {
@@ -355,16 +354,14 @@ static void mb_mode_mv_init(VP9D_COMP *pbi, vp9_reader *r) {
 
     for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
       if (vp9_read(r, VP9_MODE_UPDATE_PROB))
-        cm->fc.intra_inter_prob[i] =
-            vp9_read_prob_diff_update(r, cm->fc.intra_inter_prob[i]);
+        vp9_diff_update_prob(r, &cm->fc.intra_inter_prob[i]);
 
     if (cm->allow_comp_inter_inter) {
       cm->comp_pred_mode = read_comp_pred_mode(r);
       if (cm->comp_pred_mode == HYBRID_PREDICTION)
         for (i = 0; i < COMP_INTER_CONTEXTS; i++)
           if (vp9_read(r, VP9_MODE_UPDATE_PROB))
-            cm->fc.comp_inter_prob[i] =
-                vp9_read_prob_diff_update(r, cm->fc.comp_inter_prob[i]);
+            vp9_diff_update_prob(r, &cm->fc.comp_inter_prob[i]);
     } else {
       cm->comp_pred_mode = SINGLE_PREDICTION_ONLY;
     }
@@ -372,39 +369,29 @@ static void mb_mode_mv_init(VP9D_COMP *pbi, vp9_reader *r) {
     if (cm->comp_pred_mode != COMP_PREDICTION_ONLY)
       for (i = 0; i < REF_CONTEXTS; i++) {
         if (vp9_read(r, VP9_MODE_UPDATE_PROB))
-          cm->fc.single_ref_prob[i][0] =
-              vp9_read_prob_diff_update(r, cm->fc.single_ref_prob[i][0]);
+          vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][0]);
+
         if (vp9_read(r, VP9_MODE_UPDATE_PROB))
-          cm->fc.single_ref_prob[i][1] =
-              vp9_read_prob_diff_update(r, cm->fc.single_ref_prob[i][1]);
+          vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][1]);
       }
 
     if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
       for (i = 0; i < REF_CONTEXTS; i++)
         if (vp9_read(r, VP9_MODE_UPDATE_PROB))
-          cm->fc.comp_ref_prob[i] =
-              vp9_read_prob_diff_update(r, cm->fc.comp_ref_prob[i]);
+          vp9_diff_update_prob(r, &cm->fc.comp_ref_prob[i]);
 
     // VP9_INTRA_MODES
-    for (j = 0; j < BLOCK_SIZE_GROUPS; j++) {
-      for (i = 0; i < VP9_INTRA_MODES - 1; ++i) {
-        if (vp9_read(r, VP9_MODE_UPDATE_PROB)) {
-          cm->fc.y_mode_prob[j][i] =
-              vp9_read_prob_diff_update(r, cm->fc.y_mode_prob[j][i]);
-        }
-      }
-    }
-    for (j = 0; j < NUM_PARTITION_CONTEXTS; ++j) {
-      for (i = 0; i < PARTITION_TYPES - 1; ++i) {
-        if (vp9_read(r, VP9_MODE_UPDATE_PROB)) {
-          cm->fc.partition_prob[INTER_FRAME][j][i] =
-              vp9_read_prob_diff_update(r,
-                  cm->fc.partition_prob[INTER_FRAME][j][i]);
-        }
-      }
-    }
+    for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
+      for (i = 0; i < VP9_INTRA_MODES - 1; ++i)
+        if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+          vp9_diff_update_prob(r, &cm->fc.y_mode_prob[j][i]);
+
+    for (j = 0; j < NUM_PARTITION_CONTEXTS; ++j)
+      for (i = 0; i < PARTITION_TYPES - 1; ++i)
+        if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+          vp9_diff_update_prob(r, &cm->fc.partition_prob[INTER_FRAME][j][i]);
 
-    read_nmvprobs(r, nmvc, xd->allow_high_precision_mv);
+    read_mv_probs(r, nmvc, xd->allow_high_precision_mv);
   }
 }
 
@@ -461,7 +448,7 @@ static INLINE void decode_mv(vp9_reader *r, MV *mv, const MV *ref,
   if (mv_joint_horizontal(j))
     diff.col = read_mv_component(r, &ctx->comps[1], usehp);
 
-  vp9_increment_nmv(&diff, ref, counts, usehp);
+  vp9_inc_mv(&diff, ref, counts, usehp);
 
   mv->row = diff.row + ref->row;
   mv->col = diff.col + ref->col;
@@ -781,8 +768,7 @@ void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, vp9_reader *r) {
   // vpx_memset(cm->fc.mbskip_probs, 0, sizeof(cm->fc.mbskip_probs));
   for (k = 0; k < MBSKIP_CONTEXTS; ++k)
     if (vp9_read(r, VP9_MODE_UPDATE_PROB))
-      cm->fc.mbskip_probs[k] =
-          vp9_read_prob_diff_update(r, cm->fc.mbskip_probs[k]);
+      vp9_diff_update_prob(r, &cm->fc.mbskip_probs[k]);
 
   mb_mode_mv_init(pbi, r);
 }
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index a87cfd3c5..08acbf31d 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -30,6 +30,7 @@
 #include "vp9/decoder/vp9_decodframe.h"
 #include "vp9/decoder/vp9_detokenize.h"
 #include "vp9/decoder/vp9_decodemv.h"
+#include "vp9/decoder/vp9_dsubexp.h"
 #include "vp9/decoder/vp9_onyxd_int.h"
 #include "vp9/decoder/vp9_read_bit_buffer.h"
 
@@ -49,6 +50,11 @@ static int read_is_valid(const uint8_t *start, size_t len,
   return start + len > start && start + len <= end;
 }
 
+static int decode_unsigned_max(struct vp9_read_bit_buffer *rb, int max) {
+  const int data = vp9_rb_read_literal(rb, get_unsigned_bits(max));
+  return data > max ? max : data;  // clamp: the literal read can exceed max
+}
+
 static void setup_txfm_mode(VP9_COMMON *pc, int lossless, vp9_reader *r) {
   if (lossless) {
     pc->txfm_mode = ONLY_4X4;
@@ -56,133 +62,25 @@ static void setup_txfm_mode(VP9_COMMON *pc, int lossless, vp9_reader *r) {
     pc->txfm_mode = vp9_read_literal(r, 2);
     if (pc->txfm_mode == ALLOW_32X32)
       pc->txfm_mode += vp9_read_bit(r);
+
     if (pc->txfm_mode == TX_MODE_SELECT) {
       int i, j;
-      for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
-        for (j = 0; j < TX_SIZE_MAX_SB - 3; ++j) {
-          if (vp9_read(r, VP9_MODE_UPDATE_PROB))
-            pc->fc.tx_probs_8x8p[i][j] =
-                vp9_read_prob_diff_update(r, pc->fc.tx_probs_8x8p[i][j]);
-        }
-      }
-      for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
-        for (j = 0; j < TX_SIZE_MAX_SB - 2; ++j) {
-          if (vp9_read(r, VP9_MODE_UPDATE_PROB))
-            pc->fc.tx_probs_16x16p[i][j] =
-                vp9_read_prob_diff_update(r, pc->fc.tx_probs_16x16p[i][j]);
-        }
-      }
-      for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
-        for (j = 0; j < TX_SIZE_MAX_SB - 1; ++j) {
+      for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
+        for (j = 0; j < TX_SIZE_MAX_SB - 3; ++j)
           if (vp9_read(r, VP9_MODE_UPDATE_PROB))
-            pc->fc.tx_probs_32x32p[i][j] =
-                vp9_read_prob_diff_update(r, pc->fc.tx_probs_32x32p[i][j]);
-        }
-      }
-    }
-  }
-}
-
-static int get_unsigned_bits(unsigned int num_values) {
-  int cat = 0;
-  if (num_values <= 1)
-    return 0;
-  num_values--;
-  while (num_values > 0) {
-    cat++;
-    num_values >>= 1;
-  }
-  return cat;
-}
-
-static int inv_recenter_nonneg(int v, int m) {
-  if (v > 2 * m)
-    return v;
-
-  return v % 2 ? m - (v + 1) / 2 : m + v / 2;
-}
-
-static int decode_uniform(vp9_reader *r, int n) {
-  int v;
-  const int l = get_unsigned_bits(n);
-  const int m = (1 << l) - n;
-  if (!l)
-    return 0;
+            vp9_diff_update_prob(r, &pc->fc.tx_probs_8x8p[i][j]);
 
-  v = vp9_read_literal(r, l - 1);
-  return v < m ?  v : (v << 1) - m + vp9_read_bit(r);
-}
+      for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
+        for (j = 0; j < TX_SIZE_MAX_SB - 2; ++j)
+          if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+            vp9_diff_update_prob(r, &pc->fc.tx_probs_16x16p[i][j]);
 
-static int decode_term_subexp(vp9_reader *r, int k, int num_syms) {
-  int i = 0, mk = 0, word;
-  while (1) {
-    const int b = i ? k + i - 1 : k;
-    const int a = 1 << b;
-    if (num_syms <= mk + 3 * a) {
-      word = decode_uniform(r, num_syms - mk) + mk;
-      break;
-    } else {
-      if (vp9_read_bit(r)) {
-        i++;
-        mk += a;
-      } else {
-        word = vp9_read_literal(r, b) + mk;
-        break;
-      }
+      for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
+        for (j = 0; j < TX_SIZE_MAX_SB - 1; ++j)
+          if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+            vp9_diff_update_prob(r, &pc->fc.tx_probs_32x32p[i][j]);
     }
   }
-  return word;
-}
-
-static int decode_unsigned_max(struct vp9_read_bit_buffer *rb, int max) {
-  const int data = vp9_rb_read_literal(rb, get_unsigned_bits(max));
-  return data > max ? max : data;
-}
-
-static int merge_index(int v, int n, int modulus) {
-  int max1 = (n - 1 - modulus / 2) / modulus + 1;
-  if (v < max1) {
-    v = v * modulus + modulus / 2;
-  } else {
-    int w;
-    v -= max1;
-    w = v;
-    v += (v + modulus - modulus / 2) / modulus;
-    while (v % modulus == modulus / 2 ||
-           w != v - (v + modulus - modulus / 2) / modulus) v++;
-  }
-  return v;
-}
-
-static int inv_remap_prob(int v, int m) {
-  const int n = 255;
-
-  v = merge_index(v, n - 1, MODULUS_PARAM);
-  m--;
-  if ((m << 1) <= n) {
-    return 1 + inv_recenter_nonneg(v + 1, m);
-  } else {
-    return n - inv_recenter_nonneg(v + 1, n - 1 - m);
-  }
-}
-
-vp9_prob vp9_read_prob_diff_update(vp9_reader *r, int oldp) {
-  int delp = decode_term_subexp(r, SUBEXP_PARAM, 255);
-  return (vp9_prob)inv_remap_prob(delp, oldp);
-}
-
-void vp9_init_dequantizer(VP9_COMMON *pc) {
-  int q;
-
-  for (q = 0; q < QINDEX_RANGE; q++) {
-    // DC value
-    pc->y_dequant[q][0] = vp9_dc_quant(q, pc->y_dc_delta_q);
-    pc->uv_dequant[q][0] = vp9_dc_quant(q, pc->uv_dc_delta_q);
-
-    // AC values
-    pc->y_dequant[q][1] = vp9_ac_quant(q, 0);
-    pc->uv_dequant[q][1] = vp9_ac_quant(q, pc->uv_ac_delta_q);
-  }
 }
 
 static void mb_init_dequantizer(VP9_COMMON *pc, MACROBLOCKD *xd) {
@@ -261,6 +159,7 @@ static void decode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
 
   plane_b_size = b_width_log2(bsize) - pd->subsampling_x;
   vp9_predict_intra_block(xd, tx_ib, plane_b_size, tx_size, b_mode,
+                          dst, pd->dst.stride,
                           dst, pd->dst.stride);
 
   // Early exit if there are no coefficients
@@ -540,7 +439,7 @@ static void read_coef_probs_common(FRAME_CONTEXT *fc, TX_SIZE tx_size,
               vp9_prob *const p = coef_probs[i][j][k][l] + m;
 
               if (vp9_read(r, VP9_COEF_UPDATE_PROB))
-                *p = vp9_read_prob_diff_update(r, *p);
+                vp9_diff_update_prob(r, p);
             }
           }
         }
@@ -678,11 +577,9 @@ static void setup_quantization(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) {
                  cm->y_dc_delta_q == 0 &&
                  cm->uv_dc_delta_q == 0 &&
                  cm->uv_ac_delta_q == 0;
-  if (xd->lossless) {
-    xd->itxm_add          = vp9_idct_add_lossless_c;
-  } else {
-    xd->itxm_add          = vp9_idct_add;
-  }
+
+  xd->itxm_add = xd->lossless ? vp9_idct_add_lossless_c
+                              : vp9_idct_add;
 }
 
 static INTERPOLATIONFILTERTYPE read_interp_filter_type(
@@ -1088,6 +985,20 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi,
   return vp9_rb_read_literal(rb, 16);
 }
 
+// Fills pc->y_dequant and pc->uv_dequant for every q in [0, QINDEX_RANGE).
+void vp9_init_dequantizer(VP9_COMMON *pc) {
+  int q;
+  for (q = 0; q < QINDEX_RANGE; q++) {
+    // Entry [0] is the DC value, derived with the y / uv DC delta.
+    pc->y_dequant[q][0] = vp9_dc_quant(q, pc->y_dc_delta_q);
+    pc->uv_dequant[q][0] = vp9_dc_quant(q, pc->uv_dc_delta_q);
+
+    // Entry [1] is the AC value; y uses a delta of 0, uv its own AC delta.
+    pc->y_dequant[q][1] = vp9_ac_quant(q, 0);
+    pc->uv_dequant[q][1] = vp9_ac_quant(q, pc->uv_ac_delta_q);
+  }
+}
+
 int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
   int i;
   vp9_reader header_bc, residual_bc;
diff --git a/vp9/decoder/vp9_decodframe.h b/vp9/decoder/vp9_decodframe.h
index 66e951d10..00b6d674d 100644
--- a/vp9/decoder/vp9_decodframe.h
+++ b/vp9/decoder/vp9_decodframe.h
@@ -17,6 +17,5 @@ struct VP9Decompressor;
 
 void vp9_init_dequantizer(struct VP9Common *pc);
 int vp9_decode_frame(struct VP9Decompressor *cpi, const uint8_t **p_data_end);
-vp9_prob vp9_read_prob_diff_update(vp9_reader *r, int oldp);
 
 #endif  // VP9_DECODER_VP9_DECODFRAME_H_
diff --git a/vp9/decoder/vp9_dsubexp.c b/vp9/decoder/vp9_dsubexp.c
new file mode 100644
index 000000000..0f204dfd2
--- /dev/null
+++ b/vp9/decoder/vp9_dsubexp.c
@@ -0,0 +1,85 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/vp9_entropy.h"
+
+#include "vp9/decoder/vp9_dsubexp.h"
+
+// Maps v to a value around m; presumably inverts recenter_nonneg() (confirm).
+static int inv_recenter_nonneg(int v, int m) {
+  if (v > 2 * m)
+    return v;  // v > 2 * m is returned unchanged
+  return v % 2 ? m - (v + 1) / 2 : m + v / 2;  // v >= 0: odd < m, even >= m
+}
+
+// Reads a value in [0, n): m values use l - 1 bits, the others l bits.
+static int decode_uniform(vp9_reader *r, int n) {
+  const int l = get_unsigned_bits(n);
+  const int m = (1 << l) - n;
+  int v;
+  if (l == 0)
+    return 0;  // n <= 1: nothing is read from r
+  v = vp9_read_literal(r, l - 1);
+  return v >= m ? (v << 1) - m + vp9_read_bit(r) : v;
+}
+
+// Index-to-value map: values with v % modulus == modulus / 2 come first.
+static int merge_index(int v, int n, int modulus) {
+  int max1 = (n - 1 - modulus / 2) / modulus + 1;
+  if (v < max1) {
+    v = v * modulus + modulus / 2;
+  } else {
+    int w;
+    v -= max1;
+    w = v;  // rank of the result among values not congruent to modulus / 2
+    v += (v + modulus - modulus / 2) / modulus;
+    while (v % modulus == modulus / 2 ||
+           w != v - (v + modulus - modulus / 2) / modulus) v++;
+  }
+  return v;
+}
+
+// Maps a decoded delta index v and the previous probability m to the new
+// probability value.
+static int inv_remap_prob(int v, int m) {
+  const int n = 255;
+  const int idx = merge_index(v, n - 1, MODULUS_PARAM) + 1;
+  const int prev = m - 1;
+
+  if (2 * prev > n)
+    return n - inv_recenter_nonneg(idx, n - 1 - prev);
+  return 1 + inv_recenter_nonneg(idx, prev);
+}
+
+// Reads a variable-length code for a symbol in [0, num_syms).
+// Each pass covers a bucket of a = 1 << b symbols starting at mk: a set
+// escape bit skips to the next bucket, a clear bit is followed by b literal
+// bits giving the offset inside the bucket. Once the remaining range fits
+// within 3 * a symbols it is coded with decode_uniform() instead.
+static int decode_term_subexp(vp9_reader *r, int k, int num_syms) {
+  int i = 0, mk = 0;
+
+  for (;;) {
+    const int b = i ? k + i - 1 : k;
+    const int a = 1 << b;
+
+    if (num_syms <= mk + 3 * a)
+      return decode_uniform(r, num_syms - mk) + mk;
+    if (!vp9_read_bit(r))
+      return vp9_read_literal(r, b) + mk;
+    i++;
+    mk += a;
+  }
+}
+
+// Reads a coded delta from r and replaces *p with the remapped probability.
+void vp9_diff_update_prob(vp9_reader *r, vp9_prob *p) {
+  *p = (vp9_prob)inv_remap_prob(decode_term_subexp(r, SUBEXP_PARAM, 255), *p);
+}
diff --git a/vp9/decoder/vp9_dsubexp.h b/vp9/decoder/vp9_dsubexp.h
new file mode 100644
index 000000000..aeb9399d0
--- /dev/null
+++ b/vp9/decoder/vp9_dsubexp.h
@@ -0,0 +1,19 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_DECODER_VP9_DSUBEXP_H_
+#define VP9_DECODER_VP9_DSUBEXP_H_
+
+#include "vp9/decoder/vp9_dboolhuff.h"
+
+void vp9_diff_update_prob(vp9_reader *r, vp9_prob* p);  // rewrites *p
+
+#endif  // VP9_DECODER_VP9_DSUBEXP_H_
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index d1122c2b1..decd5a2f3 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -175,16 +175,6 @@ int recenter_nonneg(int v, int m) {
     return ((m - v) << 1) - 1;
 }
 
-static int get_unsigned_bits(unsigned num_values) {
-  int cat = 0;
-  if ((num_values--) <= 1) return 0;
-  while (num_values > 0) {
-    cat++;
-    num_values >>= 1;
-  }
-  return cat;
-}
-
 void vp9_encode_unsigned_max(struct vp9_write_bit_buffer *wb,
                              int data, int max) {
   vp9_wb_write_literal(wb, data, get_unsigned_bits(max));
@@ -752,11 +742,11 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
 #ifdef ENTROPY_STATS
             active_section = 11;
 #endif
-            vp9_encode_mv(bc, &blockmv.as_mv, &mi->best_mv.as_mv,
+            vp9_encode_mv(cpi, bc, &blockmv.as_mv, &mi->best_mv.as_mv,
                           nmvc, xd->allow_high_precision_mv);
 
             if (mi->ref_frame[1] > INTRA_FRAME)
-              vp9_encode_mv(bc,
+              vp9_encode_mv(cpi, bc,
                             &cpi->mb.partition_info->bmi[j].second_mv.as_mv,
                             &mi->best_second_mv.as_mv,
                             nmvc, xd->allow_high_precision_mv);
@@ -767,12 +757,12 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
 #ifdef ENTROPY_STATS
       active_section = 5;
 #endif
-      vp9_encode_mv(bc,
+      vp9_encode_mv(cpi, bc,
                     &mi->mv[0].as_mv, &mi->best_mv.as_mv,
                     nmvc, xd->allow_high_precision_mv);
 
       if (mi->ref_frame[1] > INTRA_FRAME)
-        vp9_encode_mv(bc,
+        vp9_encode_mv(cpi, bc,
                       &mi->mv[1].as_mv, &mi->best_second_mv.as_mv,
                       nmvc, xd->allow_high_precision_mv);
     }
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 00fa6704c..168b46012 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -397,24 +397,6 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
     cpi->mode_chosen_counts[kf_mode_index[mb_mode]]++;
 #endif
   } else {
-    /*
-     // Reduce the activation RD thresholds for the best choice mode
-     if ((cpi->rd_baseline_thresh[mb_mode_index] > 0) &&
-     (cpi->rd_baseline_thresh[mb_mode_index] < (INT_MAX >> 2)))
-     {
-     int best_adjustment = (cpi->rd_thresh_mult[mb_mode_index] >> 2);
-
-     cpi->rd_thresh_mult[mb_mode_index] =
-     (cpi->rd_thresh_mult[mb_mode_index]
-     >= (MIN_THRESHMULT + best_adjustment)) ?
-     cpi->rd_thresh_mult[mb_mode_index] - best_adjustment :
-     MIN_THRESHMULT;
-     cpi->rd_threshes[mb_mode_index] =
-     (cpi->rd_baseline_thresh[mb_mode_index] >> 7)
-     * cpi->rd_thresh_mult[mb_mode_index];
-
-     }
-     */
     // Note how often each mode chosen as best
     cpi->mode_chosen_counts[mb_mode_index]++;
     if (mbmi->ref_frame[0] != INTRA_FRAME
@@ -1548,8 +1530,6 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
                    0, 0, NULL, NULL );
   setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], 0, 0);
 
-  vp9_build_block_offsets(x);
-
   vp9_setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
 
   xd->mode_info_context->mbmi.mode = DC_PRED;
@@ -2023,9 +2003,6 @@ void vp9_encode_frame(VP9_COMP *cpi) {
 
 }
 
-void vp9_build_block_offsets(MACROBLOCK *x) {
-}
-
 static void sum_intra_stats(VP9_COMP *cpi, MACROBLOCK *x) {
   const MACROBLOCKD *xd = &x->e_mbd;
   const MB_PREDICTION_MODE m = xd->mode_info_context->mbmi.mode;
diff --git a/vp9/encoder/vp9_encodeframe.h b/vp9/encoder/vp9_encodeframe.h
index d37bdca36..399196927 100644
--- a/vp9/encoder/vp9_encodeframe.h
+++ b/vp9/encoder/vp9_encodeframe.h
@@ -15,8 +15,6 @@
 struct macroblock;
 struct yv12_buffer_config;
 
-void vp9_build_block_offsets(struct macroblock *x);
-
 void vp9_setup_src_planes(struct macroblock *x,
                           const struct yv12_buffer_config *src,
                           int mb_row, int mb_col);
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index ccd84b39c..e13ffbdcd 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -78,7 +78,6 @@ void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
 
 
 #define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
-#define RDTRUNC_8x8(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
 typedef struct vp9_token_state vp9_token_state;
 
 struct vp9_token_state {
@@ -643,6 +642,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
 
   plane_b_size = b_width_log2(bsize) - pd->subsampling_x;
   vp9_predict_intra_block(xd, tx_ib, plane_b_size, tx_size, b_mode,
+                          dst, pd->dst.stride,
                           dst, pd->dst.stride);
   vp9_subtract_block(txfm_b_size, txfm_b_size, src_diff, bw,
                      src, p->src.stride, dst, pd->dst.stride);
diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index ea6aa296a..08ef6910d 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -152,12 +152,8 @@ static int update_nmv_savings(const unsigned int ct[2],
   }
 }
 
-static int update_nmv(
-  vp9_writer *const bc,
-  const unsigned int ct[2],
-  vp9_prob *const cur_p,
-  const vp9_prob new_p,
-  const vp9_prob upd_p) {
+static int update_mv(vp9_writer *bc, const unsigned int ct[2],
+                     vp9_prob *cur_p, vp9_prob new_p, vp9_prob upd_p) {
 
 #ifdef LOW_PRECISION_MV_UPDATE
   vp9_prob mod_p = new_p | 1;
@@ -188,6 +184,80 @@ static int update_nmv(
   }
 }
 
+static void counts_to_nmv_context(  // derives probs and branch counts
+    nmv_context_counts *nmv_count,
+    nmv_context *prob,
+    int usehp,
+    unsigned int (*branch_ct_joint)[2],
+    unsigned int (*branch_ct_sign)[2],
+    unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2],
+    unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2],
+    unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2],
+    unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2],
+    unsigned int (*branch_ct_fp)[4 - 1][2],
+    unsigned int (*branch_ct_class0_hp)[2],
+    unsigned int (*branch_ct_hp)[2]) {
+  int i, j, k;
+  vp9_counts_process(nmv_count, usehp);  // helper defined elsewhere
+  vp9_tree_probs_from_distribution(vp9_mv_joint_tree,  // joints
+                                   prob->joints,
+                                   branch_ct_joint,
+                                   nmv_count->joints, 0);
+  for (i = 0; i < 2; ++i) {  // one pass per comps[] entry
+    const uint32_t s0 = nmv_count->comps[i].sign[0];
+    const uint32_t s1 = nmv_count->comps[i].sign[1];
+
+    prob->comps[i].sign = get_binary_prob(s0, s1);
+    branch_ct_sign[i][0] = s0;  // sign counts are copied through as-is
+    branch_ct_sign[i][1] = s1;
+    vp9_tree_probs_from_distribution(vp9_mv_class_tree,  // classes
+                                     prob->comps[i].classes,
+                                     branch_ct_classes[i],
+                                     nmv_count->comps[i].classes, 0);
+    vp9_tree_probs_from_distribution(vp9_mv_class0_tree,  // class0
+                                     prob->comps[i].class0,
+                                     branch_ct_class0[i],
+                                     nmv_count->comps[i].class0, 0);
+    for (j = 0; j < MV_OFFSET_BITS; ++j) {  // one binary prob per bit
+      const uint32_t b0 = nmv_count->comps[i].bits[j][0];
+      const uint32_t b1 = nmv_count->comps[i].bits[j][1];
+
+      prob->comps[i].bits[j] = get_binary_prob(b0, b1);
+      branch_ct_bits[i][j][0] = b0;
+      branch_ct_bits[i][j][1] = b1;
+    }
+  }
+  for (i = 0; i < 2; ++i) {  // class0_fp and fp trees
+    for (k = 0; k < CLASS0_SIZE; ++k) {
+      vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
+                                       prob->comps[i].class0_fp[k],
+                                       branch_ct_class0_fp[i][k],
+                                       nmv_count->comps[i].class0_fp[k], 0);
+    }
+    vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
+                                     prob->comps[i].fp,
+                                     branch_ct_fp[i],
+                                     nmv_count->comps[i].fp, 0);
+  }
+  if (usehp) {  // class0_hp / hp are only written when usehp is set
+    for (i = 0; i < 2; ++i) {
+      const uint32_t c0_hp0 = nmv_count->comps[i].class0_hp[0];
+      const uint32_t c0_hp1 = nmv_count->comps[i].class0_hp[1];
+      const uint32_t hp0 = nmv_count->comps[i].hp[0];
+      const uint32_t hp1 = nmv_count->comps[i].hp[1];
+
+      prob->comps[i].class0_hp = get_binary_prob(c0_hp0, c0_hp1);
+      branch_ct_class0_hp[i][0] = c0_hp0;
+      branch_ct_class0_hp[i][1] = c0_hp1;
+
+      prob->comps[i].hp = get_binary_prob(hp0, hp1);
+      branch_ct_hp[i][0] = hp0;
+      branch_ct_hp[i][1] = hp1;
+    }
+  }
+}
+
+
 void print_nmvcounts(nmv_context_counts tnmvcounts) {
   int i, j, k;
   printf("\nCounts =\n  { ");
@@ -253,11 +323,11 @@ void print_nmvstats() {
   unsigned int branch_ct_class0_hp[2][2];
   unsigned int branch_ct_hp[2][2];
   int i, j, k;
-  vp9_counts_to_nmv_context(&tnmvcounts, &prob, 1,
-                            branch_ct_joint, branch_ct_sign, branch_ct_classes,
-                            branch_ct_class0, branch_ct_bits,
-                            branch_ct_class0_fp, branch_ct_fp,
-                            branch_ct_class0_hp, branch_ct_hp);
+  counts_to_nmv_context(&tnmvcounts, &prob, 1,
+                        branch_ct_joint, branch_ct_sign, branch_ct_classes,
+                        branch_ct_class0, branch_ct_bits,
+                        branch_ct_class0_fp, branch_ct_fp,
+                        branch_ct_class0_hp, branch_ct_hp);
 
   printf("\nCounts =\n  { ");
   for (j = 0; j < MV_JOINTS; ++j)
@@ -394,6 +464,8 @@ void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) {
   unsigned int branch_ct_fp[2][4 - 1][2];
   unsigned int branch_ct_class0_hp[2][2];
   unsigned int branch_ct_hp[2][2];
+  nmv_context *mvc = &cpi->common.fc.nmvc;
+
 #ifdef MV_GROUP_UPDATE
   int savings = 0;
 #endif
@@ -402,11 +474,11 @@ void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) {
   if (!cpi->dummy_packing)
     add_nmvcount(&tnmvcounts, &cpi->NMVcount);
 #endif
-  vp9_counts_to_nmv_context(&cpi->NMVcount, &prob, usehp,
-                            branch_ct_joint, branch_ct_sign, branch_ct_classes,
-                            branch_ct_class0, branch_ct_bits,
-                            branch_ct_class0_fp, branch_ct_fp,
-                            branch_ct_class0_hp, branch_ct_hp);
+  counts_to_nmv_context(&cpi->NMVcount, &prob, usehp,
+                        branch_ct_joint, branch_ct_sign, branch_ct_classes,
+                        branch_ct_class0, branch_ct_bits,
+                        branch_ct_class0_fp, branch_ct_fp,
+                        branch_ct_class0_hp, branch_ct_hp);
   /* write updates if they help */
 #ifdef MV_GROUP_UPDATE
   for (j = 0; j < MV_JOINTS - 1; ++j) {
@@ -475,68 +547,52 @@ void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) {
   vp9_write_bit(bc, 1);
 #endif
 
-  for (j = 0; j < MV_JOINTS - 1; ++j) {
-    update_nmv(bc, branch_ct_joint[j],
-               &cpi->common.fc.nmvc.joints[j],
-               prob.joints[j],
-               VP9_NMV_UPDATE_PROB);
-  }
+  for (j = 0; j < MV_JOINTS - 1; ++j)
+    update_mv(bc, branch_ct_joint[j], &mvc->joints[j], prob.joints[j],
+              VP9_NMV_UPDATE_PROB);
+
   for (i = 0; i < 2; ++i) {
-    update_nmv(bc, branch_ct_sign[i],
-               &cpi->common.fc.nmvc.comps[i].sign,
-               prob.comps[i].sign,
-               VP9_NMV_UPDATE_PROB);
-    for (j = 0; j < MV_CLASSES - 1; ++j) {
-      update_nmv(bc, branch_ct_classes[i][j],
-                 &cpi->common.fc.nmvc.comps[i].classes[j],
-                 prob.comps[i].classes[j],
-                 VP9_NMV_UPDATE_PROB);
-    }
-    for (j = 0; j < CLASS0_SIZE - 1; ++j) {
-      update_nmv(bc, branch_ct_class0[i][j],
-                 &cpi->common.fc.nmvc.comps[i].class0[j],
-                 prob.comps[i].class0[j],
-                 VP9_NMV_UPDATE_PROB);
-    }
-    for (j = 0; j < MV_OFFSET_BITS; ++j) {
-      update_nmv(bc, branch_ct_bits[i][j],
-                 &cpi->common.fc.nmvc.comps[i].bits[j],
-                 prob.comps[i].bits[j],
-                 VP9_NMV_UPDATE_PROB);
-    }
+    update_mv(bc, branch_ct_sign[i], &mvc->comps[i].sign,
+              prob.comps[i].sign, VP9_NMV_UPDATE_PROB);
+    for (j = 0; j < MV_CLASSES - 1; ++j)
+      update_mv(bc, branch_ct_classes[i][j], &mvc->comps[i].classes[j],
+                prob.comps[i].classes[j], VP9_NMV_UPDATE_PROB);
+
+    for (j = 0; j < CLASS0_SIZE - 1; ++j)
+      update_mv(bc, branch_ct_class0[i][j], &mvc->comps[i].class0[j],
+                prob.comps[i].class0[j], VP9_NMV_UPDATE_PROB);
+
+    for (j = 0; j < MV_OFFSET_BITS; ++j)
+      update_mv(bc, branch_ct_bits[i][j], &mvc->comps[i].bits[j],
+                prob.comps[i].bits[j], VP9_NMV_UPDATE_PROB);
   }
+
   for (i = 0; i < 2; ++i) {
     for (j = 0; j < CLASS0_SIZE; ++j) {
       int k;
-      for (k = 0; k < 3; ++k) {
-        update_nmv(bc, branch_ct_class0_fp[i][j][k],
-                   &cpi->common.fc.nmvc.comps[i].class0_fp[j][k],
-                   prob.comps[i].class0_fp[j][k],
-                   VP9_NMV_UPDATE_PROB);
-      }
-    }
-    for (j = 0; j < 3; ++j) {
-      update_nmv(bc, branch_ct_fp[i][j],
-                 &cpi->common.fc.nmvc.comps[i].fp[j],
-                 prob.comps[i].fp[j],
-                 VP9_NMV_UPDATE_PROB);
+      for (k = 0; k < 3; ++k)
+        update_mv(bc, branch_ct_class0_fp[i][j][k],
+                  &mvc->comps[i].class0_fp[j][k],
+                  prob.comps[i].class0_fp[j][k], VP9_NMV_UPDATE_PROB);
     }
+
+    for (j = 0; j < 3; ++j)
+      update_mv(bc, branch_ct_fp[i][j], &mvc->comps[i].fp[j],
+                prob.comps[i].fp[j], VP9_NMV_UPDATE_PROB);
   }
+
   if (usehp) {
     for (i = 0; i < 2; ++i) {
-      update_nmv(bc, branch_ct_class0_hp[i],
-                 &cpi->common.fc.nmvc.comps[i].class0_hp,
-                 prob.comps[i].class0_hp,
-                 VP9_NMV_UPDATE_PROB);
-      update_nmv(bc, branch_ct_hp[i],
-                 &cpi->common.fc.nmvc.comps[i].hp,
-                 prob.comps[i].hp,
-                 VP9_NMV_UPDATE_PROB);
+      update_mv(bc, branch_ct_class0_hp[i], &mvc->comps[i].class0_hp,
+                prob.comps[i].class0_hp, VP9_NMV_UPDATE_PROB);
+      update_mv(bc, branch_ct_hp[i], &mvc->comps[i].hp,
+                prob.comps[i].hp, VP9_NMV_UPDATE_PROB);
     }
   }
 }
 
-void vp9_encode_mv(vp9_writer* w, const MV* mv, const MV* ref,
+void vp9_encode_mv(VP9_COMP* cpi, vp9_writer* w,
+                   const MV* mv, const MV* ref,
                    const nmv_context* mvctx, int usehp) {
   const MV diff = {mv->row - ref->row,
                    mv->col - ref->col};
@@ -549,6 +605,13 @@ void vp9_encode_mv(vp9_writer* w, const MV* mv, const MV* ref,
 
   if (mv_joint_horizontal(j))
     encode_mv_component(w, diff.col, &mvctx->comps[1], usehp);
+
+  // If auto_mv_step_size is enabled and it is an arf/non shown frame
+  // then keep track of the largest motion vector component used.
+  if (cpi->sf.auto_mv_step_size && !cpi->common.show_frame) {
+    cpi->max_mv_magnitude = MAX((MAX(abs(mv->row), abs(mv->col)) >> 3),
+                                cpi->max_mv_magnitude);
+  }
 }
 
 void vp9_build_nmv_cost_table(int *mvjoint,
@@ -582,29 +645,29 @@ void vp9_update_nmv_count(VP9_COMP *cpi, MACROBLOCK *x,
         if (pi->bmi[i].mode == NEWMV) {
           mv.row = (pi->bmi[i].mv.as_mv.row - best_ref_mv->as_mv.row);
           mv.col = (pi->bmi[i].mv.as_mv.col - best_ref_mv->as_mv.col);
-          vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount,
-                            x->e_mbd.allow_high_precision_mv);
+          vp9_inc_mv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount,
+                     x->e_mbd.allow_high_precision_mv);
           if (x->e_mbd.mode_info_context->mbmi.ref_frame[1] > INTRA_FRAME) {
             mv.row = pi->bmi[i].second_mv.as_mv.row -
                          second_best_ref_mv->as_mv.row;
             mv.col = pi->bmi[i].second_mv.as_mv.col -
                          second_best_ref_mv->as_mv.col;
-            vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount,
-                              x->e_mbd.allow_high_precision_mv);
+            vp9_inc_mv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount,
+                       x->e_mbd.allow_high_precision_mv);
           }
         }
       }
     }
   } else if (mbmi->mode == NEWMV) {
-    mv.row = (mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row);
-    mv.col = (mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col);
-    vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount,
+    mv.row = mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row;
+    mv.col = mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col;
+    vp9_inc_mv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount,
                       x->e_mbd.allow_high_precision_mv);
     if (mbmi->ref_frame[1] > INTRA_FRAME) {
-      mv.row = (mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row);
-      mv.col = (mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col);
-      vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount,
-                        x->e_mbd.allow_high_precision_mv);
+      mv.row = mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row;
+      mv.col = mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col;
+      vp9_inc_mv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount,
+                 x->e_mbd.allow_high_precision_mv);
     }
   }
 }
diff --git a/vp9/encoder/vp9_encodemv.h b/vp9/encoder/vp9_encodemv.h
index cb25d85ee..56aaeee8d 100644
--- a/vp9/encoder/vp9_encodemv.h
+++ b/vp9/encoder/vp9_encodemv.h
@@ -16,7 +16,7 @@
 
 void vp9_write_nmv_probs(VP9_COMP* const, int usehp, vp9_writer* const);
 
-void vp9_encode_mv(vp9_writer* w, const MV* mv, const MV* ref,
+void vp9_encode_mv(VP9_COMP *cpi, vp9_writer* w, const MV* mv, const MV* ref,
                    const nmv_context* mvctx, int usehp);
 
 void vp9_build_nmv_cost_table(int *mvjoint,
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 522f89982..d25d78178 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -521,8 +521,6 @@ void vp9_first_pass(VP9_COMP *cpi) {
 
   xd->mode_info_context = cm->mi;
 
-  vp9_build_block_offsets(x);
-
   vp9_setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
 
   vp9_frame_init_quantizer(cpi);
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index 65fdcbe50..1b8f87318 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -35,8 +35,9 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
   int_mv ref_full;
 
   // Further step/diamond searches as necessary
-  int step_param = cpi->sf.first_step +
+  int step_param = cpi->sf.reduce_first_step_size +
       (cpi->speed < 8 ? (cpi->speed > 5 ? 1 : 0) : 2);
+  step_param = MIN(step_param, (cpi->sf.max_step_search_steps - 2));
 
   vp9_clamp_mv_min_max(x, ref_mv);
 
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 212dce3b8..5b7bed463 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -38,16 +38,20 @@ void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv) {
     x->mv_row_max = row_max;
 }
 
-int vp9_init_search_range(int width, int height) {
+int vp9_init_search_range(VP9_COMP *cpi, int size) {
   int sr = 0;
-  int frm = MIN(width, height);
 
-  while ((frm << sr) < MAX_FULL_PEL_VAL)
+  // Enforce a minimum search size regardless of the value passed in.
+  size = MAX(16, size);
+
+  while ((size << sr) < MAX_FULL_PEL_VAL)
     sr++;
 
   if (sr)
     sr--;
 
+  sr += cpi->sf.reduce_first_step_size;
+  sr = MIN(sr, (cpi->sf.max_step_search_steps - 2));
   return sr;
 }
 
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index 28b2efd28..c13ea7597 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -24,15 +24,15 @@
 #define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1))
 
 void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv);
-int vp9_init_search_range(int width, int height);
-
 int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost,
                            int *mvcost[2], int weight, int ishp);
 void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride);
 void vp9_init3smotion_compensation(MACROBLOCK *x,  int stride);
 
-// Runs sequence of diamond searches in smaller steps for RD
 struct VP9_COMP;
+int vp9_init_search_range(struct VP9_COMP *cpi, int size);
+
+// Runs sequence of diamond searches in smaller steps for RD
 int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x,
                            int_mv *mvp_full, int step_param,
                            int sadpb, int further_steps, int do_refine,
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index ab7a41e28..3b09b9f11 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -671,6 +671,12 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
     cpi->mode_chosen_counts[i] = 0;
   }
 
+  // Initialize cpi->max_mv_magnitude if appropriate.
+  if ((cpi->common.frame_type == KEY_FRAME) || cpi->common.intra_only ||
+    (cpi->common.show_frame == 0)) {
+    cpi->max_mv_magnitude = 0;
+  }
+
   // best quality defaults
   sf->RD = 1;
   sf->search_method = NSTEP;
@@ -680,7 +686,8 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   sf->half_pixel_search = 1;
   sf->iterative_sub_pixel = 1;
   sf->optimize_coefficients = !cpi->oxcf.lossless;
-  sf->first_step = 0;
+  sf->reduce_first_step_size = 0;
+  sf->auto_mv_step_size = 0;
   sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
   sf->comp_inter_joint_search_thresh = BLOCK_SIZE_AB4X4;
   sf->adpative_rd_thresh = 0;
@@ -716,46 +723,47 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
 #else
       sf->static_segmentation = 0;
 #endif
-      sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8;
+      sf->comp_inter_joint_search_thresh = BLOCK_SIZE_MB16X16;
+      sf->auto_mv_step_size = 1;
+      sf->use_avoid_tested_higherror = 1;
       sf->adpative_rd_thresh = 1;
+
       if (speed == 1) {
         sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
-        sf->optimize_coefficients = 0;
-        sf->first_step = 1;
-        sf->use_avoid_tested_higherror = 1;
-        sf->adjust_thresholds_by_speed = 1;
         sf->use_largest_txform = !(cpi->common.frame_type == KEY_FRAME ||
                                    cpi->common.intra_only ||
                                    cpi->common.show_frame == 0);
       }
       if (speed == 2) {
-        sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8;
+        sf->adjust_thresholds_by_speed = 1;
+        sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
+        sf->reduce_first_step_size = 1;
+        sf->optimize_coefficients = 0;
         sf->use_lastframe_partitioning = 1;
-        sf->first_step = 0;
       }
       if (speed == 3) {
-        sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8;
+        sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
         sf->partition_by_variance = 1;
-        sf->first_step = 0;
+        sf->reduce_first_step_size = 1;
       }
       if (speed == 4) {
-        sf->first_step = 0;
-        sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8;
+        sf->reduce_first_step_size = 1;
+        sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
         sf->use_one_partition_size_always = 1;
         sf->always_this_block_size = BLOCK_SIZE_MB16X16;
       }
-      if (speed == 2) {
-        sf->first_step = 0;
+/*      if (speed == 2) {
+        sf->reduce_first_step_size = 0;
         sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8;
         sf->use_partitions_less_than = 1;
         sf->less_than_block_size = BLOCK_SIZE_MB16X16;
       }
       if (speed == 3) {
-        sf->first_step = 0;
+        sf->reduce_first_step_size = 0;
         sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8;
         sf->use_partitions_greater_than = 1;
         sf->greater_than_block_size = BLOCK_SIZE_SB8X8;
-      }
+      }*/
 
      break;
 
@@ -2393,6 +2401,32 @@ static void release_scaled_references(VP9_COMP *cpi) {
     cm->fb_idx_ref_cnt[cpi->scaled_ref_idx[i]]--;
 }
 
+static void full_to_model_count(unsigned int *model_count,  // output counts
+                                unsigned int *full_count) {  // input counts
+  int n;
+  model_count[ZERO_TOKEN] = full_count[ZERO_TOKEN];
+  model_count[ONE_TOKEN] = full_count[ONE_TOKEN];
+  model_count[TWO_TOKEN] = full_count[TWO_TOKEN];
+  for (n = THREE_TOKEN; n < DCT_EOB_TOKEN; ++n)  // fold into the TWO_TOKEN bin
+    model_count[TWO_TOKEN] += full_count[n];
+  model_count[DCT_EOB_MODEL_TOKEN] = full_count[DCT_EOB_TOKEN];  // EOB slot
+}
+
+static void full_to_model_counts(  // applies full_to_model_count per entry
+    vp9_coeff_count_model *model_count, vp9_coeff_count *full_count) {
+  int i, j, k, l;
+  for (i = 0; i < BLOCK_TYPES; ++i)
+    for (j = 0; j < REF_TYPES; ++j)
+      for (k = 0; k < COEF_BANDS; ++k)
+        for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
+          if (l >= 3 && k == 0)  // band 0 entries with l >= 3 are skipped
+            continue;
+          full_to_model_count(model_count[i][j][k][l],
+                              full_count[i][j][k][l]);
+        }
+}
+
+
 static void encode_frame_to_data_rate(VP9_COMP *cpi,
                                       unsigned long *size,
                                       unsigned char *dest,
@@ -3040,8 +3074,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   update_reference_frames(cpi);
 
   for (t = TX_4X4; t <= TX_32X32; t++)
-    vp9_full_to_model_counts(cpi->common.fc.coef_counts[t],
-                             cpi->coef_counts[t]);
+    full_to_model_counts(cpi->common.fc.coef_counts[t],
+                         cpi->coef_counts[t]);
   if (!cpi->common.error_resilient_mode &&
       !cpi->common.frame_parallel_decoding_mode) {
     vp9_adapt_coef_probs(&cpi->common);
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 0811976d0..63b015549 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -210,12 +210,14 @@ typedef struct {
   int quarter_pixel_search;
   int thresh_mult[MAX_MODES];
   int max_step_search_steps;
-  int first_step;
+  int reduce_first_step_size;
+  int auto_mv_step_size;
   int optimize_coefficients;
   int search_best_filter;
   int static_segmentation;
   int comp_inter_joint_search_thresh;
   int adpative_rd_thresh;
+  int skip_encode_sb;
   int use_lastframe_partitioning;
   int use_largest_txform;
   int use_8tap_always;
@@ -471,6 +473,8 @@ typedef struct VP9_COMP {
   SPEED_FEATURES sf;
   int error_bins[1024];
 
+  unsigned int max_mv_magnitude;
+
   // Data used for real time conferencing mode to help determine if it would be good to update the gf
   int inter_zz_count;
   int gf_bad_count;
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index ccbb624b0..8deeea13d 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -21,19 +21,13 @@
 extern int enc_debug;
 #endif
 
-static INLINE int plane_idx(int plane) {
-  return plane == 0 ? 0 :
-         plane == 1 ? 16 : 20;
-}
-
 static void quantize(int16_t *zbin_boost_orig_ptr,
                      int16_t *coeff_ptr, int n_coeffs, int skip_block,
                      int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr,
                      uint8_t *quant_shift_ptr,
                      int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                      int16_t *dequant_ptr, int zbin_oq_value,
-                     uint16_t *eob_ptr,
-                     const int *scan, int mul) {
+                     uint16_t *eob_ptr, const int *scan) {
   int i, rc, eob;
   int zbins[2], nzbins[2], zbin;
   int x, y, z, sz;
@@ -56,7 +50,7 @@ static void quantize(int16_t *zbin_boost_orig_ptr,
     // Pre-scan pass
     for (i = n_coeffs - 1; i >= 0; i--) {
       rc = scan[i];
-      z = coeff_ptr[rc] * mul;
+      z = coeff_ptr[rc];
 
       if (z < zbins[rc != 0] && z > nzbins[rc != 0]) {
         zero_flag--;
@@ -69,7 +63,7 @@ static void quantize(int16_t *zbin_boost_orig_ptr,
     // skippable. Note: zero_flag can be zero.
     for (i = 0; i < zero_flag; i++) {
       rc = scan[i];
-      z  = coeff_ptr[rc] * mul;
+      z  = coeff_ptr[rc];
 
       zbin = (zbins[rc != 0] + zbin_boost_ptr[zero_run]);
       zero_run += (zero_run < 15);
@@ -83,7 +77,7 @@ static void quantize(int16_t *zbin_boost_orig_ptr,
             >> quant_shift_ptr[rc != 0];            // quantize (x)
         x  = (y ^ sz) - sz;                         // get the sign back
         qcoeff_ptr[rc]  = x;                        // write to destination
-        dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / mul;  // dequantized value
+        dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0];  // dequantized value
 
         if (y) {
           eob = i;                                  // last nonzero coeffs
@@ -102,10 +96,10 @@ static void quantize_sparse(int16_t *zbin_boost_orig_ptr,
                             int16_t *quant_ptr, uint8_t *quant_shift_ptr,
                             int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                             int16_t *dequant_ptr, int zbin_oq_value,
-                            uint16_t *eob_ptr, const int *scan, int mul,
+                            uint16_t *eob_ptr, const int *scan,
                             int *idx_arr) {
   int i, rc, eob;
-  int zbins[2], pzbins[2], nzbins[2], zbin;
+  int zbins[2], nzbins[2], zbin;
   int x, y, z, sz;
   int zero_run = 0;
   int16_t *zbin_boost_ptr = zbin_boost_orig_ptr;
@@ -120,21 +114,18 @@ static void quantize_sparse(int16_t *zbin_boost_orig_ptr,
   // Base ZBIN
   zbins[0] = zbin_ptr[0] + zbin_oq_value;
   zbins[1] = zbin_ptr[1] + zbin_oq_value;
-  // Positive and negative ZBIN
-  pzbins[0] = zbins[0]/mul;
-  pzbins[1] = zbins[1]/mul;
-  nzbins[0] = pzbins[0] * -1;
-  nzbins[1] = pzbins[1] * -1;
+  nzbins[0] = zbins[0] * -1;
+  nzbins[1] = zbins[1] * -1;
 
   if (!skip_block) {
     // Pre-scan pass
     for (i = 0; i < n_coeffs; i++) {
       rc = scan[i];
-      z = coeff_ptr[rc];
+      z = coeff_ptr[rc] * 2;
 
       // If the coefficient is out of the base ZBIN range, keep it for
       // quantization.
-      if (z >= pzbins[rc != 0] || z <= nzbins[rc != 0])
+      if (z >= zbins[rc != 0] || z <= nzbins[rc != 0])
         idx_arr[idx++] = i;
     }
 
@@ -149,7 +140,7 @@ static void quantize_sparse(int16_t *zbin_boost_orig_ptr,
       zbin = (zbins[rc != 0] + zbin_boost_ptr[zero_run]);
 
       pre_idx = idx_arr[i];
-      z = coeff_ptr[rc] * mul;
+      z = coeff_ptr[rc] * 2;
       sz = (z >> 31);                               // sign of z
       x  = (z ^ sz) - sz;                           // x = abs(z)
 
@@ -160,7 +151,7 @@ static void quantize_sparse(int16_t *zbin_boost_orig_ptr,
 
         x  = (y ^ sz) - sz;                         // get the sign back
         qcoeff_ptr[rc]  = x;                        // write to destination
-        dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / mul;  // dequantized value
+        dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / 2;  // dequantized value
 
         if (y) {
           eob = idx_arr[i];                         // last nonzero coeffs
@@ -171,62 +162,10 @@ static void quantize_sparse(int16_t *zbin_boost_orig_ptr,
   }
   *eob_ptr = eob + 1;
 }
-#if 0
-// Original quantize function
-static void quantize(int16_t *zbin_boost_orig_ptr,
-                     int16_t *coeff_ptr, int n_coeffs, int skip_block,
-                     int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr,
-                     uint8_t *quant_shift_ptr,
-                     int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
-                     int16_t *dequant_ptr, int zbin_oq_value,
-                     uint16_t *eob_ptr,
-                     const int *scan, int mul) {
-  int i, rc, eob;
-  int zbin;
-  int x, y, z, sz;
-  int zero_run = 0;
-  int16_t *zbin_boost_ptr = zbin_boost_orig_ptr;
-
-  vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
-  vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
-
-  eob = -1;
-
-  if (!skip_block) {
-    for (i = 0; i < n_coeffs; i++) {
-      rc   = scan[i];
-      z    = coeff_ptr[rc] * mul;
-
-      zbin = (zbin_ptr[rc != 0] + zbin_boost_ptr[zero_run] + zbin_oq_value);
-      zero_run += (zero_run < 15);
-
-      sz = (z >> 31);                               // sign of z
-      x  = (z ^ sz) - sz;                           // x = abs(z)
-
-      if (x >= zbin) {
-        x += (round_ptr[rc != 0]);
-        y  = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x))
-            >> quant_shift_ptr[rc != 0];            // quantize (x)
-        x  = (y ^ sz) - sz;                         // get the sign back
-        qcoeff_ptr[rc]  = x;                        // write to destination
-        dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / mul;  // dequantized value
-
-        if (y) {
-          eob = i;                                  // last nonzero coeffs
-          zero_run = 0;
-        }
-      }
-    }
-  }
-
-  *eob_ptr = eob + 1;
-}
-#endif
 
 void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
                   TX_TYPE tx_type) {
   MACROBLOCKD *const xd = &mb->e_mbd;
-  const int mul = n_coeffs == 1024 ? 2 : 1;
   const int *scan;
 
   // These contexts may be available in the caller
@@ -262,7 +201,7 @@ void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
                     xd->plane[plane].dequant,
                     mb->plane[plane].zbin_extra,
                     &xd->plane[plane].eobs[block],
-                    scan, mul, idx_arr);
+                    scan, idx_arr);
   }
   else {
     quantize(mb->plane[plane].zrun_zbin_boost,
@@ -277,7 +216,7 @@ void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
              xd->plane[plane].dequant,
              mb->plane[plane].zbin_extra,
              &xd->plane[plane].eobs[block],
-             scan, mul);
+             scan);
   }
 }
 
@@ -299,7 +238,7 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
            xd->plane[pb_idx.plane].dequant,
            mb->plane[pb_idx.plane].zbin_extra,
            &xd->plane[pb_idx.plane].eobs[pb_idx.block],
-           pt_scan, 1);
+           pt_scan);
 }
 
 static void invert_quant(int16_t *quant, uint8_t *shift, int d) {
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index b2a19c61d..87f72894c 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -227,7 +227,11 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
           cpi->rd_threshes[bsize][i] = INT_MAX;
         }
         cpi->rd_baseline_thresh[bsize][i] = cpi->rd_threshes[bsize][i];
-        cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT;
+
+        if (cpi->sf.adpative_rd_thresh)
+          cpi->rd_thresh_freq_fact[bsize][i] = MAX_RD_THRESH_FREQ_FACT;
+        else
+          cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT;
       }
     }
   } else {
@@ -247,7 +251,11 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
           cpi->rd_threshes[bsize][i] = INT_MAX;
         }
         cpi->rd_baseline_thresh[bsize][i] = cpi->rd_threshes[bsize][i];
-        cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT;
+
+        if (cpi->sf.adpative_rd_thresh)
+          cpi->rd_thresh_freq_fact[bsize][i] = MAX_RD_THRESH_FREQ_FACT;
+        else
+          cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT;
       }
     }
   }
@@ -494,23 +502,26 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
 
 static int64_t block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
                                int shift) {
-  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
+  struct macroblockd_plane *p = &x->e_mbd.plane[0];
+  const int bw = plane_block_width(bsize, p);
+  const int bh = plane_block_height(bsize, p);
   return vp9_block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
-                         16 << (bwl + bhl)) >> shift;
+                         bw * bh) >> shift;
 }
 
 static int64_t block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
                                 int shift) {
-  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
   int64_t sum = 0;
   int plane;
 
   for (plane = 1; plane < MAX_MB_PLANE; plane++) {
-    const int subsampling = x->e_mbd.plane[plane].subsampling_x +
-                            x->e_mbd.plane[plane].subsampling_y;
+    struct macroblockd_plane *p = &x->e_mbd.plane[plane];
+    const int bw = plane_block_width(bsize, p);
+    const int bh = plane_block_height(bsize, p);
     sum += vp9_block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff,
-                           16 << (bwl + bhl - subsampling));
+                           bw * bh);
   }
+
   return sum >> shift;
 }
 
@@ -644,7 +655,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
   int rate = 0;
   int64_t distortion;
   VP9_COMMON *const cm = &cpi->common;
-  const int src_stride = x->plane[0].src.stride;
+  struct macroblock_plane *p = &x->plane[0];
+  struct macroblockd_plane *pd = &xd->plane[0];
+  const int src_stride = p->src.stride;
   uint8_t *src, *dst;
   int16_t *src_diff, *coeff;
 
@@ -678,18 +691,20 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
         block = ib + idy * 2 + idx;
         xd->mode_info_context->bmi[block].as_mode.first = mode;
         src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
-                                        x->plane[0].src.buf, src_stride);
+                                        p->src.buf, src_stride);
         src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, block,
-                                             x->plane[0].src_diff);
+                                             p->src_diff);
         coeff = BLOCK_OFFSET(x->plane[0].coeff, block, 16);
         dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
-                                        xd->plane[0].dst.buf,
-                                        xd->plane[0].dst.stride);
-        vp9_intra4x4_predict(xd, block, BLOCK_SIZE_SB8X8, mode,
-                             dst, xd->plane[0].dst.stride);
+                                        pd->dst.buf,
+                                        pd->dst.stride);
+        vp9_predict_intra_block(xd, block, b_width_log2(BLOCK_SIZE_SB8X8),
+                                TX_4X4, mode,
+                                dst, pd->dst.stride,
+                                dst, pd->dst.stride);
         vp9_subtract_block(4, 4, src_diff, 8,
                            src, src_stride,
-                           dst, xd->plane[0].dst.stride);
+                           dst, pd->dst.stride);
 
         tx_type = get_tx_type_4x4(xd, block);
         if (tx_type != DCT_DCT) {
@@ -702,15 +717,15 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
 
         ratey += cost_coeffs(cm, x, 0, block, PLANE_TYPE_Y_WITH_DC,
                              tempa + idx, templ + idy, TX_4X4, 16);
-        distortion += vp9_block_error(coeff, BLOCK_OFFSET(xd->plane[0].dqcoeff,
-                                                         block, 16), 16) >> 2;
+        distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff,
+                                                          block, 16), 16) >> 2;
 
         if (best_tx_type != DCT_DCT)
-          vp9_short_iht4x4_add(BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16),
-                               dst, xd->plane[0].dst.stride, best_tx_type);
+          vp9_short_iht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block, 16),
+                               dst, pd->dst.stride, best_tx_type);
         else
-          xd->inv_txm4x4_add(BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16),
-                             dst, xd->plane[0].dst.stride);
+          xd->inv_txm4x4_add(BLOCK_OFFSET(pd->dqcoeff, block, 16),
+                             dst, pd->dst.stride);
       }
     }
 
@@ -730,7 +745,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
         for (idx = 0; idx < bw; ++idx) {
           block = ib + idy * 2 + idx;
           vpx_memcpy(best_dqcoeff[idy * 2 + idx],
-                     BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16),
+                     BLOCK_OFFSET(pd->dqcoeff, block, 16),
                      sizeof(best_dqcoeff[0]));
         }
       }
@@ -742,18 +757,19 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
       block = ib + idy * 2 + idx;
       xd->mode_info_context->bmi[block].as_mode.first = *best_mode;
       dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
-                                      xd->plane[0].dst.buf,
-                                      xd->plane[0].dst.stride);
+                                      pd->dst.buf,
+                                      pd->dst.stride);
 
-      vp9_intra4x4_predict(xd, block, BLOCK_SIZE_SB8X8, *best_mode,
-                           dst, xd->plane[0].dst.stride);
+      vp9_predict_intra_block(xd, block, b_width_log2(BLOCK_SIZE_SB8X8), TX_4X4,
+                              *best_mode, dst, pd->dst.stride,
+                              dst, pd->dst.stride);
       // inverse transform
       if (best_tx_type != DCT_DCT)
         vp9_short_iht4x4_add(best_dqcoeff[idy * 2 + idx], dst,
-                             xd->plane[0].dst.stride, best_tx_type);
+                            pd->dst.stride, best_tx_type);
       else
         xd->inv_txm4x4_add(best_dqcoeff[idy * 2 + idx], dst,
-                           xd->plane[0].dst.stride);
+                           pd->dst.stride);
     }
   }
 
@@ -1092,25 +1108,22 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
   int k;
   MACROBLOCKD *xd = &x->e_mbd;
   BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
-  int bwl = b_width_log2(bsize), bw = 1 << bwl;
-  int bhl = b_height_log2(bsize), bh = 1 << bhl;
+  const int bw = plane_block_width(bsize, &xd->plane[0]);
+  const int bh = plane_block_height(bsize, &xd->plane[0]);
   int idx, idy;
   const int src_stride = x->plane[0].src.stride;
-  uint8_t* const src =
-  raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
-                            x->plane[0].src.buf, src_stride);
-  int16_t* src_diff =
-  raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, i,
-                            x->plane[0].src_diff);
+  uint8_t* const src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                                                 x->plane[0].src.buf,
+                                                 src_stride);
+  int16_t* src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, i,
+                                                x->plane[0].src_diff);
   int16_t* coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, i);
-  uint8_t* const pre =
-  raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
-                            xd->plane[0].pre[0].buf,
-                            xd->plane[0].pre[0].stride);
-  uint8_t* const dst =
-  raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
-                            xd->plane[0].dst.buf,
-                            xd->plane[0].dst.stride);
+  uint8_t* const pre = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                                                 xd->plane[0].pre[0].buf,
+                                                 xd->plane[0].pre[0].stride);
+  uint8_t* const dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                                                 xd->plane[0].dst.buf,
+                                                 xd->plane[0].dst.stride);
   int64_t thisdistortion = 0;
   int thisrate = 0;
 
@@ -1123,7 +1136,7 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
                             xd->plane[0].dst.stride,
                             &xd->mode_info_context->bmi[i].as_mv[0],
                             &xd->scale_factor[0],
-                            4 * bw, 4 * bh, 0 /* no avg */, &xd->subpix,
+                            bw, bh, 0 /* no avg */, &xd->subpix,
                             MV_PRECISION_Q3);
 
   // TODO(debargha): Make this work properly with the
@@ -1137,17 +1150,17 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
     vp9_build_inter_predictor(second_pre, xd->plane[0].pre[1].stride,
                               dst, xd->plane[0].dst.stride,
                               &xd->mode_info_context->bmi[i].as_mv[1],
-                              &xd->scale_factor[1], 4 * bw, 4 * bh, 1,
+                              &xd->scale_factor[1], bw, bh, 1,
                               &xd->subpix, MV_PRECISION_Q3);
   }
 
-  vp9_subtract_block(4 * bh, 4 * bw, src_diff, 8,
+  vp9_subtract_block(bh, bw, src_diff, 8,
                      src, src_stride,
                      dst, xd->plane[0].dst.stride);
 
   k = i;
-  for (idy = 0; idy < bh; ++idy) {
-    for (idx = 0; idx < bw; ++idx) {
+  for (idy = 0; idy < bh / 4; ++idy) {
+    for (idx = 0; idx < bw / 4; ++idx) {
       k += (idy * 2 + idx);
       src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, k,
                                            x->plane[0].src_diff);
@@ -1989,12 +2002,11 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
   struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
   int bestsme = INT_MAX;
-  int further_steps, step_param = cpi->sf.first_step;
+  int further_steps, step_param;
   int sadpb = x->sadperbit16;
   int_mv mvp_full;
   int ref = mbmi->ref_frame[0];
   int_mv ref_mv = mbmi->ref_mvs[ref][0];
-  int sr = 0;
   const enum BlockSize block_size = get_plane_block_size(bsize, &xd->plane[0]);
 
   int tmp_col_min = x->mv_col_min;
@@ -2018,7 +2030,14 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
 
   vp9_clamp_mv_min_max(x, &ref_mv);
 
-  sr = vp9_init_search_range(cpi->common.width, cpi->common.height);
+  // Work out the size of the first step in the mv step search.
+  // 0 here is maximum length first step. 1 is MAX >> 1 etc.
+  if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) {
+    step_param = vp9_init_search_range(cpi, cpi->max_mv_magnitude);
+  } else {
+    step_param = vp9_init_search_range(
+                   cpi, MIN(cpi->common.width, cpi->common.height));
+  }
 
   // mvp_full.as_int = ref_mv[0].as_int;
   mvp_full.as_int =
@@ -2027,9 +2046,6 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   mvp_full.as_mv.col >>= 3;
   mvp_full.as_mv.row >>= 3;
 
-  // adjust search range according to sr from mv prediction
-  step_param = MAX(step_param, sr);
-
   // Further step/diamond searches as necessary
   further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
 
@@ -2233,13 +2249,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                  int_mv *frame_mv,
                                  int mi_row, int mi_col,
                                  int_mv single_newmv[MAX_REF_FRAMES]) {
-  const int bw = 1 << mi_width_log2(bsize), bh = 1 << mi_height_log2(bsize);
-
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
-  const enum BlockSize block_size = get_plane_block_size(bsize, &xd->plane[0]);
-  const enum BlockSize uv_block_size = get_plane_block_size(bsize,
-                                                            &xd->plane[1]);
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
   const int is_comp_pred = (mbmi->ref_frame[1] > 0);
   const int num_refs = is_comp_pred ? 2 : 1;
@@ -2370,13 +2381,14 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         int p;
 
         for (p = 0; p < MAX_MB_PLANE; p++) {
-          const int y = (MI_SIZE * bh) >> xd->plane[p].subsampling_y;
-          const int x = (MI_SIZE * bw) >> xd->plane[p].subsampling_x;
+          struct macroblockd_plane *pd = &xd->plane[p];
+          const int bw = plane_block_width(bsize, pd);
+          const int bh = plane_block_height(bsize, pd);
           int i;
 
-          for (i = 0; i < y; i++)
-            vpx_memcpy(&tmp_buf[p][64 * i],
-                       xd->plane[p].dst.buf + i * xd->plane[p].dst.stride, x);
+          for (i = 0; i < bh; i++)
+            vpx_memcpy(&tmp_buf[p][64 * i], pd->dst.buf + i * pd->dst.stride,
+                                   bw);
         }
         pred_exists = 1;
       }
@@ -2394,13 +2406,13 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     int p;
 
     for (p = 0; p < MAX_MB_PLANE; p++) {
-      const int y = (MI_SIZE * bh) >> xd->plane[p].subsampling_y;
-      const int x = (MI_SIZE * bw) >> xd->plane[p].subsampling_x;
+      struct macroblockd_plane *pd = &xd->plane[p];
+      const int bw = plane_block_width(bsize, pd);
+      const int bh = plane_block_height(bsize, pd);
       int i;
 
-      for (i = 0; i < y; i++)
-        vpx_memcpy(xd->plane[p].dst.buf + i * xd->plane[p].dst.stride,
-                   &tmp_buf[p][64 * i], x);
+      for (i = 0; i < bh; i++)
+        vpx_memcpy(pd->dst.buf + i * pd->dst.stride, &tmp_buf[p][64 * i], bw);
     }
   } else {
     // Handles the special case when a filter that is not in the
@@ -2414,36 +2426,37 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   if (cpi->active_map_enabled && x->active_ptr[0] == 0)
     x->skip = 1;
   else if (x->encode_breakout) {
+    const enum BlockSize y_size = get_plane_block_size(bsize, &xd->plane[0]);
+    const enum BlockSize uv_size = get_plane_block_size(bsize, &xd->plane[1]);
+
     unsigned int var, sse;
-    int threshold = (xd->plane[0].dequant[1]
-                     * xd->plane[0].dequant[1] >> 4);
+    int threshold = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1] >> 4);
+
 
     if (threshold < x->encode_breakout)
       threshold = x->encode_breakout;
 
-    var = cpi->fn_ptr[block_size].vf(x->plane[0].src.buf,
-                                     x->plane[0].src.stride,
-                                     xd->plane[0].dst.buf,
-                                     xd->plane[0].dst.stride,
-                                     &sse);
+    var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride,
+                                 xd->plane[0].dst.buf, xd->plane[0].dst.stride,
+                                 &sse);
 
     if ((int)sse < threshold) {
       unsigned int q2dc = xd->plane[0].dequant[0];
-      /* If there is no codeable 2nd order dc
-         or a very small uniform pixel change change */
+      // If there is no codeable 2nd order dc
+      // or a very small uniform pixel change
       if ((sse - var < q2dc * q2dc >> 4) ||
           (sse / 2 > var && sse - var < 64)) {
         // Check u and v to make sure skip is ok
         int sse2;
         unsigned int sse2u, sse2v;
-        var = cpi->fn_ptr[uv_block_size].vf(x->plane[1].src.buf,
-                                            x->plane[1].src.stride,
-                                            xd->plane[1].dst.buf,
-                                            xd->plane[1].dst.stride, &sse2u);
-        var = cpi->fn_ptr[uv_block_size].vf(x->plane[2].src.buf,
-                                            x->plane[1].src.stride,
-                                            xd->plane[2].dst.buf,
-                                            xd->plane[1].dst.stride, &sse2v);
+        var = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf,
+                                      x->plane[1].src.stride,
+                                      xd->plane[1].dst.buf,
+                                      xd->plane[1].dst.stride, &sse2u);
+        var = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf,
+                                      x->plane[2].src.stride,
+                                      xd->plane[2].dst.buf,
+                                      xd->plane[2].dst.stride, &sse2v);
         sse2 = sse2u + sse2v;
 
         if (sse2 * 2 < threshold) {
@@ -2451,7 +2464,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
           *distortion = sse + sse2;
           *rate2 = 500;
 
-          /* for best_yrd calculation */
+          // for best_yrd calculation
           *rate_uv = 0;
           *distortion_uv = sse2;
 
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 872bf267a..c1bd93bf5 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -148,9 +148,10 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
 
   // Further step/diamond searches as necessary
   if (cpi->speed < 8)
-    step_param = cpi->sf.first_step + ((cpi->speed > 5) ? 1 : 0);
+    step_param = cpi->sf.reduce_first_step_size + ((cpi->speed > 5) ? 1 : 0);
   else
-    step_param = cpi->sf.first_step + 2;
+    step_param = cpi->sf.reduce_first_step_size + 2;
+  step_param = MIN(step_param, (cpi->sf.max_step_search_steps - 2));
 
   /*cpi->sf.search_method == HEX*/
   // TODO Check that the 16x16 vf & sdf are selected here
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index 484afce73..cc7d45243 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@@ -375,7 +375,7 @@ void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
 }
 
 // load 8x8 array
-static INLINE void load_buffer_8x8(int16_t *input, __m128i in[8], int stride) {
+static INLINE void load_buffer_8x8(int16_t *input, __m128i *in, int stride) {
   in[0]  = _mm_load_si128((__m128i *)(input + 0 * stride));
   in[1]  = _mm_load_si128((__m128i *)(input + 1 * stride));
   in[2]  = _mm_load_si128((__m128i *)(input + 2 * stride));
@@ -396,7 +396,7 @@ static INLINE void load_buffer_8x8(int16_t *input, __m128i in[8], int stride) {
 }
 
 // write 8x8 array
-static INLINE void write_buffer_8x8(int16_t *output, __m128i res[8]) {
+static INLINE void write_buffer_8x8(int16_t *output, __m128i *res) {
   __m128i sign0 = _mm_srai_epi16(res[0], 15);
   __m128i sign1 = _mm_srai_epi16(res[1], 15);
   __m128i sign2 = _mm_srai_epi16(res[2], 15);
@@ -435,7 +435,7 @@ static INLINE void write_buffer_8x8(int16_t *output, __m128i res[8]) {
 }
 
 // perform in-place transpose
-static INLINE void array_transpose_8x8(__m128i res[8]) {
+static INLINE void array_transpose_8x8(__m128i *res) {
   const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
   const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]);
   const __m128i tr0_2 = _mm_unpackhi_epi16(res[0], res[1]);
@@ -486,7 +486,7 @@ static INLINE void array_transpose_8x8(__m128i res[8]) {
   // 07 17 27 37 47 57 67 77
 }
 
-void fdct8_1d_sse2(__m128i in[8]) {
+void fdct8_1d_sse2(__m128i *in) {
   // constants
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
@@ -626,7 +626,7 @@ void fdct8_1d_sse2(__m128i in[8]) {
   array_transpose_8x8(in);
 }
 
-void fadst8_1d_sse2(__m128i in[8]) {
+void fadst8_1d_sse2(__m128i *in) {
   // Constants
   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
diff --git a/vp9/vp9dx.mk b/vp9/vp9dx.mk
index 7ae3219ca..7ede9c20d 100644
--- a/vp9/vp9dx.mk
+++ b/vp9/vp9dx.mk
@@ -33,6 +33,8 @@ VP9_DX_SRCS-yes += decoder/vp9_treereader.h
 VP9_DX_SRCS-yes += decoder/vp9_onyxd_if.c
 VP9_DX_SRCS-yes += decoder/vp9_idct_blk.c
 VP9_DX_SRCS-yes += decoder/vp9_idct_blk.h
+VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c
+VP9_DX_SRCS-yes += decoder/vp9_dsubexp.h
 
 VP9_DX_SRCS-yes := $(filter-out $(VP9_DX_SRCS_REMOVE-yes),$(VP9_DX_SRCS-yes))