Diffstat (limited to 'vp8')
48 files changed, 1834 insertions, 2809 deletions
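The first and largest change below, in vp8/common/blockd.h, stops storing the hybrid-transform type per block (bmi.as_mode.tx_type) and instead derives it on demand from the prediction mode and the quantizer index through the new get_tx_type_4x4()/get_tx_type_8x8()/get_tx_type_16x16() helpers. A minimal sketch of that selection rule, assuming the thresholds and enum order shown in the patch (model_tx_type() and the collapsed "intra" flag are hypothetical simplifications for illustration, not codec API):

#include <stdio.h>

/* Simplified stand-ins for the codec's types; order matches the tree. */
typedef enum { DCT_DCT, ADST_DCT, DCT_ADST, ADST_ADST } TX_TYPE;
typedef enum { TX_4X4, TX_8X8, TX_16X16 } TX_SIZE;

/* Quantizer-index thresholds from the patch: hybrid transforms are only
 * enabled at low quantizer step sizes, where the extra coding gain matters. */
#define ACTIVE_HT   110
#define ACTIVE_HT8  300
#define ACTIVE_HT16 300

/* Hypothetical helper mirroring the patch's dispatch: the mapped type
 * (what txfm_map() would return for the prediction mode) is honored only
 * when the block is intra-coded at the given size and the quantizer index
 * is below the per-size threshold; otherwise fall back to DCT_DCT. */
static TX_TYPE model_tx_type(TX_SIZE sz, int intra, int q_index,
                             TX_TYPE mapped) {
  switch (sz) {
    case TX_16X16: return (intra && q_index < ACTIVE_HT16) ? mapped : DCT_DCT;
    case TX_8X8:   return (intra && q_index < ACTIVE_HT8)  ? mapped : DCT_DCT;
    default:       return (intra && q_index < ACTIVE_HT)   ? mapped : DCT_DCT;
  }
}

int main(void) {
  /* A 16x16 intra block at q_index 40 keeps its ADST_DCT mapping... */
  printf("%d\n", model_tx_type(TX_16X16, 1, 40, ADST_DCT));  /* 1 = ADST_DCT */
  /* ...but at q_index 350 (above ACTIVE_HT16) it reverts to DCT_DCT. */
  printf("%d\n", model_tx_type(TX_16X16, 1, 350, ADST_DCT)); /* 0 = DCT_DCT */
  return 0;
}

Because the type is recomputed wherever it is needed, the decoder no longer depends on transform state written during mode parsing, which is why decode_macroblock() in decodframe.c below can drop its per-mode txfm_map() loops.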
diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h index 092d9ff21..3ab4cc3a9 100644 --- a/vp8/common/blockd.h +++ b/vp8/common/blockd.h @@ -151,14 +151,6 @@ typedef enum { #define VP8_MVREFS (1 + SPLITMV - NEARESTMV) -#if CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM16X16 -#define ACTIVE_HT 110 // quantization stepsize threshold -#endif - -#if CONFIG_HYBRIDTRANSFORM16X16 -#define ACTIVE_HT16 300 -#endif - typedef enum { B_DC_PRED, /* average of above and left pixels */ B_TM_PRED, @@ -182,50 +174,6 @@ typedef enum { B_MODE_COUNT } B_PREDICTION_MODE; -#if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM16X16 -// convert MB_PREDICTION_MODE to B_PREDICTION_MODE -static B_PREDICTION_MODE pred_mode_conv(MB_PREDICTION_MODE mode) { - B_PREDICTION_MODE b_mode; - switch (mode) { - case DC_PRED: - b_mode = B_DC_PRED; - break; - case V_PRED: - b_mode = B_VE_PRED; - break; - case H_PRED: - b_mode = B_HE_PRED; - break; - case TM_PRED: - b_mode = B_TM_PRED; - break; - case D45_PRED: - b_mode = B_LD_PRED; - break; - case D135_PRED: - b_mode = B_RD_PRED; - break; - case D117_PRED: - b_mode = B_VR_PRED; - break; - case D153_PRED: - b_mode = B_HD_PRED; - break; - case D27_PRED: - b_mode = B_VL_PRED; - break; - case D63_PRED: - b_mode = B_HU_PRED; - break; - default : - // for debug purpose, to be removed after full testing - assert(0); - break; - } - return b_mode; -} -#endif - #define VP8_BINTRAMODES (B_HU_PRED + 1) /* 10 */ #define VP8_SUBMVREFS (1 + NEW4X4 - LEFT4X4) @@ -416,8 +364,6 @@ typedef struct MacroBlockD { vp8_subpix_fn_t subpixel_predict_avg16x16; int allow_high_precision_mv; - void *current_bc; - int corrupted; #if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64) @@ -438,68 +384,148 @@ typedef struct MacroBlockD { int_mv ref_mv[MAX_MV_REFS]; #endif -#if CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM16X16 int q_index; -#endif } MACROBLOCKD; +#if CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM16X16 +#define ACTIVE_HT 110 // quantization stepsize threshold +#endif + +#if CONFIG_HYBRIDTRANSFORM8X8 +#define ACTIVE_HT8 300 +#endif + +#if CONFIG_HYBRIDTRANSFORM16X16 +#define ACTIVE_HT16 300 +#endif + +// convert MB_PREDICTION_MODE to B_PREDICTION_MODE +static B_PREDICTION_MODE pred_mode_conv(MB_PREDICTION_MODE mode) { + B_PREDICTION_MODE b_mode; + switch (mode) { + case DC_PRED: + b_mode = B_DC_PRED; + break; + case V_PRED: + b_mode = B_VE_PRED; + break; + case H_PRED: + b_mode = B_HE_PRED; + break; + case TM_PRED: + b_mode = B_TM_PRED; + break; + case D45_PRED: + b_mode = B_LD_PRED; + break; + case D135_PRED: + b_mode = B_RD_PRED; + break; + case D117_PRED: + b_mode = B_VR_PRED; + break; + case D153_PRED: + b_mode = B_HD_PRED; + break; + case D27_PRED: + b_mode = B_HU_PRED; + break; + case D63_PRED: + b_mode = B_VL_PRED; + break; + default : + // for debug purpose, to be removed after full testing + assert(0); + break; + } + return b_mode; +} + #if CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM16X16 // transform mapping -static void txfm_map(BLOCKD *b, B_PREDICTION_MODE bmode) { +static TX_TYPE txfm_map(B_PREDICTION_MODE bmode) { // map transform type + TX_TYPE tx_type; switch (bmode) { case B_TM_PRED : case B_RD_PRED : - b->bmi.as_mode.tx_type = ADST_ADST; + tx_type = ADST_ADST; break; case B_VE_PRED : case B_VR_PRED : - b->bmi.as_mode.tx_type = ADST_DCT; + tx_type = ADST_DCT; break; case B_HE_PRED : case B_HD_PRED : case B_HU_PRED : - b->bmi.as_mode.tx_type = DCT_ADST; + tx_type = DCT_ADST; break; default : - 
b->bmi.as_mode.tx_type = DCT_DCT; + tx_type = DCT_DCT; break; } + return tx_type; } +#endif -static TX_TYPE get_tx_type(MACROBLOCKD *xd, const BLOCKD *b) { +#if CONFIG_HYBRIDTRANSFORM +static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, const BLOCKD *b) { + TX_TYPE tx_type = DCT_DCT; + if (xd->mode_info_context->mbmi.mode == B_PRED && + xd->q_index < ACTIVE_HT) { + tx_type = txfm_map(b->bmi.as_mode.first); + } + return tx_type; +} +#endif + +#if CONFIG_HYBRIDTRANSFORM8X8 +static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, const BLOCKD *b) { + TX_TYPE tx_type = DCT_DCT; + if (xd->mode_info_context->mbmi.mode == I8X8_PRED && + xd->q_index < ACTIVE_HT8) { + tx_type = txfm_map(pred_mode_conv(b->bmi.as_mode.first)); + } + return tx_type; +} +#endif + +#if CONFIG_HYBRIDTRANSFORM16X16 +static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd, const BLOCKD *b) { + TX_TYPE tx_type = DCT_DCT; + if (xd->mode_info_context->mbmi.mode < I8X8_PRED && + xd->q_index < ACTIVE_HT16) { + tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode)); + } + return tx_type; +} +#endif + +#if CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM8X8 || \ + CONFIG_HYBRIDTRANSFORM16X16 +static TX_TYPE get_tx_type(const MACROBLOCKD *xd, const BLOCKD *b) { TX_TYPE tx_type = DCT_DCT; int ib = (b - xd->block); - if (ib >= 16) return tx_type; + if (ib >= 16) + return tx_type; #if CONFIG_HYBRIDTRANSFORM16X16 if (xd->mode_info_context->mbmi.txfm_size == TX_16X16) { - if (xd->mode_info_context->mbmi.mode < I8X8_PRED && - xd->q_index < ACTIVE_HT16) - tx_type = b->bmi.as_mode.tx_type; - return tx_type; + tx_type = get_tx_type_16x16(xd, b); } #endif #if CONFIG_HYBRIDTRANSFORM8X8 if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) { - BLOCKD *bb; ib = (ib & 8) + ((ib & 4) >> 1); - bb = xd->block + ib; - if (xd->mode_info_context->mbmi.mode == I8X8_PRED) - tx_type = bb->bmi.as_mode.tx_type; - return tx_type; + tx_type = get_tx_type_8x8(xd, &xd->block[ib]); } #endif #if CONFIG_HYBRIDTRANSFORM if (xd->mode_info_context->mbmi.txfm_size == TX_4X4) { - if (xd->mode_info_context->mbmi.mode == B_PRED && - xd->q_index < ACTIVE_HT) { - tx_type = b->bmi.as_mode.tx_type; - } - return tx_type; + tx_type = get_tx_type_4x4(xd, b); } #endif return tx_type; diff --git a/vp8/common/entropy.h b/vp8/common/entropy.h index 4af3ecf15..b9dfb344f 100644 --- a/vp8/common/entropy.h +++ b/vp8/common/entropy.h @@ -37,9 +37,9 @@ extern const int vp8_i8x8_block[4]; #define DCT_VAL_CATEGORY5 9 /* 35-66 Extra Bits 5+1 */ #define DCT_VAL_CATEGORY6 10 /* 67+ Extra Bits 13+1 */ #define DCT_EOB_TOKEN 11 /* EOB Extra Bits 0+0 */ - #define MAX_ENTROPY_TOKENS 12 #define ENTROPY_NODES 11 +#define EOSB_TOKEN 127 /* Not signalled, encoder only */ extern const vp8_tree_index vp8_coef_tree[]; diff --git a/vp8/common/findnearmv.h b/vp8/common/findnearmv.h index cd7b87adf..345a7c1c0 100644 --- a/vp8/common/findnearmv.h +++ b/vp8/common/findnearmv.h @@ -159,45 +159,34 @@ static B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) { if (!(b & 3)) { /* On L edge, get from MB to left of us */ --cur_mb; - switch (cur_mb->mbmi.mode) { - case DC_PRED: - return B_DC_PRED; - case V_PRED: - return B_VE_PRED; - case H_PRED: - return B_HE_PRED; - case TM_PRED: - return B_TM_PRED; - case I8X8_PRED: - case B_PRED: - return (cur_mb->bmi + b + 3)->as_mode.first; - default: - return B_DC_PRED; + + if (cur_mb->mbmi.mode < I8X8_PRED) { + return pred_mode_conv(cur_mb->mbmi.mode); + } else if (cur_mb->mbmi.mode == I8X8_PRED) { + return pred_mode_conv((cur_mb->bmi + 3 + 
b)->as_mode.first); + } else if (cur_mb->mbmi.mode == B_PRED) { + return ((cur_mb->bmi + 3 + b)->as_mode.first); + } else { + return B_DC_PRED; } } return (cur_mb->bmi + b - 1)->as_mode.first; } -static B_PREDICTION_MODE above_block_mode(const MODE_INFO - *cur_mb, int b, int mi_stride) { +static B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, + int b, int mi_stride) { if (!(b >> 2)) { /* On top edge, get from MB above us */ cur_mb -= mi_stride; - switch (cur_mb->mbmi.mode) { - case DC_PRED: - return B_DC_PRED; - case V_PRED: - return B_VE_PRED; - case H_PRED: - return B_HE_PRED; - case TM_PRED: - return B_TM_PRED; - case I8X8_PRED: - case B_PRED: - return (cur_mb->bmi + b + 12)->as_mode.first; - default: - return B_DC_PRED; + if (cur_mb->mbmi.mode < I8X8_PRED) { + return pred_mode_conv(cur_mb->mbmi.mode); + } else if (cur_mb->mbmi.mode == I8X8_PRED) { + return pred_mode_conv((cur_mb->bmi + 12 + b)->as_mode.first); + } else if (cur_mb->mbmi.mode == B_PRED) { + return ((cur_mb->bmi + 12 + b)->as_mode.first); + } else { + return B_DC_PRED; } } diff --git a/vp8/common/idctllm.c b/vp8/common/idctllm.c index 7b3ac36a5..d705fec32 100644 --- a/vp8/common/idctllm.c +++ b/vp8/common/idctllm.c @@ -189,6 +189,7 @@ void vp8_ihtllm_c(short *input, short *output, int pitch, // pointers to vertical and horizontal transforms float *ptv, *pth; + assert(tx_type != DCT_DCT); // load and convert residual array into floating-point for(j = 0; j < tx_dim; j++) { for(i = 0; i < tx_dim; i++) { diff --git a/vp8/common/pred_common.c b/vp8/common/pred_common.c index cb80a0f7e..a32389433 100644 --- a/vp8/common/pred_common.c +++ b/vp8/common/pred_common.c @@ -15,8 +15,8 @@ // TBD prediction functions for various bitstream signals // Returns a context number for the given MB prediction signal -unsigned char get_pred_context(VP8_COMMON *const cm, - MACROBLOCKD *const xd, +unsigned char get_pred_context(const VP8_COMMON *const cm, + const MACROBLOCKD *const xd, PRED_ID pred_id) { int pred_context; MODE_INFO *m = xd->mode_info_context; @@ -106,8 +106,8 @@ unsigned char get_pred_context(VP8_COMMON *const cm, // This function returns a context probability for coding a given // prediction signal -vp8_prob get_pred_prob(VP8_COMMON *const cm, - MACROBLOCKD *const xd, +vp8_prob get_pred_prob(const VP8_COMMON *const cm, + const MACROBLOCKD *const xd, PRED_ID pred_id) { vp8_prob pred_probability; int pred_context; @@ -146,10 +146,10 @@ vp8_prob get_pred_prob(VP8_COMMON *const cm, // This function returns a context probability ptr for coding a given // prediction signal -vp8_prob *get_pred_probs(VP8_COMMON *const cm, - MACROBLOCKD *const xd, +const vp8_prob *get_pred_probs(const VP8_COMMON *const cm, + const MACROBLOCKD *const xd, PRED_ID pred_id) { - vp8_prob *pred_probability; + const vp8_prob *pred_probability; int pred_context; // Get the appropriate prediction context @@ -191,7 +191,7 @@ vp8_prob *get_pred_probs(VP8_COMMON *const cm, // This function returns the status of the given prediction signal. // I.e. is the predicted value for the given signal correct. -unsigned char get_pred_flag(MACROBLOCKD *const xd, +unsigned char get_pred_flag(const MACROBLOCKD *const xd, PRED_ID pred_id) { unsigned char pred_flag = 0; @@ -260,14 +260,14 @@ void set_pred_flag(MACROBLOCKD *const xd, // predict various bitstream signals. 
// Macroblock segment id prediction function -unsigned char get_pred_mb_segid(VP8_COMMON *const cm, int MbIndex) { +unsigned char get_pred_mb_segid(const VP8_COMMON *const cm, int MbIndex) { // Currently the prediction for the macroblock segment ID is // the value stored for this macroblock in the previous frame. return cm->last_frame_seg_map[MbIndex]; } -MV_REFERENCE_FRAME get_pred_ref(VP8_COMMON *const cm, - MACROBLOCKD *const xd) { +MV_REFERENCE_FRAME get_pred_ref(const VP8_COMMON *const cm, + const MACROBLOCKD *const xd) { MODE_INFO *m = xd->mode_info_context; MV_REFERENCE_FRAME left; diff --git a/vp8/common/pred_common.h b/vp8/common/pred_common.h index f4992f555..402e0235f 100644 --- a/vp8/common/pred_common.h +++ b/vp8/common/pred_common.h @@ -28,19 +28,19 @@ typedef enum { } PRED_ID; -extern unsigned char get_pred_context(VP8_COMMON *const cm, - MACROBLOCKD *const xd, +extern unsigned char get_pred_context(const VP8_COMMON *const cm, + const MACROBLOCKD *const xd, PRED_ID pred_id); -extern vp8_prob get_pred_prob(VP8_COMMON *const cm, - MACROBLOCKD *const xd, +extern vp8_prob get_pred_prob(const VP8_COMMON *const cm, + const MACROBLOCKD *const xd, PRED_ID pred_id); -extern vp8_prob *get_pred_probs(VP8_COMMON *const cm, - MACROBLOCKD *const xd, +extern const vp8_prob *get_pred_probs(const VP8_COMMON *const cm, + const MACROBLOCKD *const xd, PRED_ID pred_id); -extern unsigned char get_pred_flag(MACROBLOCKD *const xd, +extern unsigned char get_pred_flag(const MACROBLOCKD *const xd, PRED_ID pred_id); extern void set_pred_flag(MACROBLOCKD *const xd, @@ -48,10 +48,10 @@ extern void set_pred_flag(MACROBLOCKD *const xd, unsigned char pred_flag); -extern unsigned char get_pred_mb_segid(VP8_COMMON *const cm, int MbIndex); +extern unsigned char get_pred_mb_segid(const VP8_COMMON *const cm, int MbIndex); -extern MV_REFERENCE_FRAME get_pred_ref(VP8_COMMON *const cm, - MACROBLOCKD *const xd); +extern MV_REFERENCE_FRAME get_pred_ref(const VP8_COMMON *const cm, + const MACROBLOCKD *const xd); extern void compute_mod_refprobs(VP8_COMMON *const cm); #endif /* __INC_PRED_COMMON_H__ */ diff --git a/vp8/common/reconinter.c b/vp8/common/reconinter.c index 1c4cdb386..a41d233ab 100644 --- a/vp8/common/reconinter.c +++ b/vp8/common/reconinter.c @@ -294,7 +294,7 @@ void vp8_build_2nd_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf } } -static void build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch) { +void vp8_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch) { unsigned char *ptr_base; unsigned char *ptr; unsigned char *pred_ptr = d->predictor; @@ -319,8 +319,8 @@ static void build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch) { * come from an earlier call to build_inter_predictors_4b()) with the * predictor of the second reference frame / motion vector. 
*/ -static void build_2nd_inter_predictors4b(MACROBLOCKD *xd, - BLOCKD *d, int pitch) { +void vp8_build_2nd_inter_predictors4b(MACROBLOCKD *xd, + BLOCKD *d, int pitch) { unsigned char *ptr_base; unsigned char *ptr; unsigned char *pred_ptr = d->predictor; @@ -985,16 +985,16 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) { } - build_inter_predictors4b(xd, &blockd[ 0], 16); - build_inter_predictors4b(xd, &blockd[ 2], 16); - build_inter_predictors4b(xd, &blockd[ 8], 16); - build_inter_predictors4b(xd, &blockd[10], 16); + vp8_build_inter_predictors4b(xd, &blockd[ 0], 16); + vp8_build_inter_predictors4b(xd, &blockd[ 2], 16); + vp8_build_inter_predictors4b(xd, &blockd[ 8], 16); + vp8_build_inter_predictors4b(xd, &blockd[10], 16); if (mbmi->second_ref_frame) { - build_2nd_inter_predictors4b(xd, &blockd[ 0], 16); - build_2nd_inter_predictors4b(xd, &blockd[ 2], 16); - build_2nd_inter_predictors4b(xd, &blockd[ 8], 16); - build_2nd_inter_predictors4b(xd, &blockd[10], 16); + vp8_build_2nd_inter_predictors4b(xd, &blockd[ 0], 16); + vp8_build_2nd_inter_predictors4b(xd, &blockd[ 2], 16); + vp8_build_2nd_inter_predictors4b(xd, &blockd[ 8], 16); + vp8_build_2nd_inter_predictors4b(xd, &blockd[10], 16); } } else { for (i = 0; i < 16; i += 2) { diff --git a/vp8/common/reconinter.h b/vp8/common/reconinter.h index 55044e5aa..c78611920 100644 --- a/vp8/common/reconinter.h +++ b/vp8/common/reconinter.h @@ -59,6 +59,11 @@ extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, extern void vp8_build_2nd_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf); +extern void vp8_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, + int pitch); +extern void vp8_build_2nd_inter_predictors4b(MACROBLOCKD *xd, + BLOCKD *d, int pitch); + extern void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd); extern void vp8_setup_interp_filters(MACROBLOCKD *xd, INTERPOLATIONFILTERTYPE filter, diff --git a/vp8/common/reconintra4x4.c b/vp8/common/reconintra4x4.c index 0ba0a2cff..dfbaf137b 100644 --- a/vp8/common/reconintra4x4.c +++ b/vp8/common/reconintra4x4.c @@ -298,18 +298,19 @@ void vp8_comp_intra4x4_predict_c(BLOCKD *x, void vp8_intra_prediction_down_copy(MACROBLOCKD *xd) { unsigned char *above_right = *(xd->block[0].base_dst) + xd->block[0].dst - xd->block[0].dst_stride + 16; + unsigned int *src_ptr = (unsigned int *) + (above_right - (xd->mb_index == 3 ? 16 * xd->block[0].dst_stride : 0)); - unsigned int *src_ptr = (unsigned int *)above_right; - unsigned int *dst_ptr0 = - (unsigned int *)(above_right + 4 * xd->block[0].dst_stride); + unsigned int *dst_ptr0 = (unsigned int *)above_right; unsigned int *dst_ptr1 = - (unsigned int *)(above_right + 8 * xd->block[0].dst_stride); + (unsigned int *)(above_right + 4 * xd->block[0].dst_stride); unsigned int *dst_ptr2 = + (unsigned int *)(above_right + 8 * xd->block[0].dst_stride); + unsigned int *dst_ptr3 = (unsigned int *)(above_right + 12 * xd->block[0].dst_stride); *dst_ptr0 = *src_ptr; *dst_ptr1 = *src_ptr; *dst_ptr2 = *src_ptr; + *dst_ptr3 = *src_ptr; } - - diff --git a/vp8/common/rtcd.c b/vp8/common/rtcd.c index a7bb92ce4..01dad4691 100644 --- a/vp8/common/rtcd.c +++ b/vp8/common/rtcd.c @@ -11,20 +11,95 @@ #define RTCD_C #include "vpx_rtcd.h" +#if CONFIG_MULTITHREAD && defined(_WIN32) +#include <windows.h> +#include <stdlib.h> +static void once(void (*func)(void)) +{ + static CRITICAL_SECTION *lock; + static LONG waiters; + static int done; + void *lock_ptr = &lock; + + /* If the initialization is complete, return early. 
This isn't just an + * optimization, it prevents races on the destruction of the global + * lock. + */ + if(done) + return; + + InterlockedIncrement(&waiters); + + /* Get a lock. We create one and try to make it the one-true-lock, + * throwing it away if we lost the race. + */ + + { + /* Scope to protect access to new_lock */ + CRITICAL_SECTION *new_lock = malloc(sizeof(CRITICAL_SECTION)); + InitializeCriticalSection(new_lock); + if (InterlockedCompareExchangePointer(lock_ptr, new_lock, NULL) != NULL) + { + DeleteCriticalSection(new_lock); + free(new_lock); + } + } + + /* At this point, we have a lock that can be synchronized on. We don't + * care which thread actually performed the allocation. + */ + + EnterCriticalSection(lock); + + if (!done) + { + func(); + done = 1; + } + + LeaveCriticalSection(lock); + + /* Last one out should free resources. The destructed objects are + * protected by checking if(done) above. + */ + if(!InterlockedDecrement(&waiters)) + { + DeleteCriticalSection(lock); + free(lock); + lock = NULL; + } +} + + +#elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H +#include <pthread.h> +static void once(void (*func)(void)) +{ + static pthread_once_t lock = PTHREAD_ONCE_INIT; + pthread_once(&lock, func); +} + + +#else /* No-op version that performs no synchronization. vpx_rtcd() is idempotent, * so as long as your platform provides atomic loads/stores of pointers * no synchronization is strictly necessary. */ -static void once(void (*func)(void)) { - static int done; +static void once(void (*func)(void)) +{ + static int done; - if(!done) { - func(); - done = 1; - } + if(!done) + { + func(); + done = 1; + } } +#endif + -void vpx_rtcd() { - once(setup_rtcd_internal); +void vpx_rtcd() +{ + once(setup_rtcd_internal); } diff --git a/vp8/common/rtcd_defs.sh b/vp8/common/rtcd_defs.sh index 8d7318007..4e7debaf4 100644 --- a/vp8/common/rtcd_defs.sh +++ b/vp8/common/rtcd_defs.sh @@ -174,3 +174,201 @@ vp8_loop_filter_simple_bh_sse2=vp8_loop_filter_bhs_sse2 vp8_loop_filter_simple_bh_media=vp8_loop_filter_bhs_armv6 vp8_loop_filter_simple_bh_neon=vp8_loop_filter_bhs_neon +# +# Encoder functions below this point. 
+# +if [ "$CONFIG_VP8_ENCODER" = "yes" ]; then + + +# variance +[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 + +prototype unsigned int vp8_variance32x32 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp8_variance32x32 + +prototype unsigned int vp8_variance16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp8_variance16x16 mmx sse2 +vp8_variance16x16_sse2=vp8_variance16x16_wmt +vp8_variance16x16_mmx=vp8_variance16x16_mmx + +prototype unsigned int vp8_variance16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp8_variance16x8 mmx sse2 +vp8_variance16x8_sse2=vp8_variance16x8_wmt +vp8_variance16x8_mmx=vp8_variance16x8_mmx + +prototype unsigned int vp8_variance8x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp8_variance8x16 mmx sse2 +vp8_variance8x16_sse2=vp8_variance8x16_wmt +vp8_variance8x16_mmx=vp8_variance8x16_mmx + +prototype unsigned int vp8_variance8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp8_variance8x8 mmx sse2 +vp8_variance8x8_sse2=vp8_variance8x8_wmt +vp8_variance8x8_mmx=vp8_variance8x8_mmx + +prototype unsigned int vp8_variance4x4 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp8_variance4x4 mmx sse2 +vp8_variance4x4_sse2=vp8_variance4x4_wmt +vp8_variance4x4_mmx=vp8_variance4x4_mmx + +prototype unsigned int vp8_sub_pixel_variance32x32 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse" +specialize vp8_sub_pixel_variance32x32 + +prototype unsigned int vp8_sub_pixel_variance16x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse" +specialize vp8_sub_pixel_variance16x16 sse2 mmx ssse3 +vp8_sub_pixel_variance16x16_sse2=vp8_sub_pixel_variance16x16_wmt + +prototype unsigned int vp8_sub_pixel_variance8x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse" +specialize vp8_sub_pixel_variance8x16 sse2 mmx +vp8_sub_pixel_variance8x16_sse2=vp8_sub_pixel_variance8x16_wmt + +prototype unsigned int vp8_sub_pixel_variance16x8 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse" +specialize vp8_sub_pixel_variance16x8 sse2 mmx ssse3 +vp8_sub_pixel_variance16x8_sse2=vp8_sub_pixel_variance16x8_ssse3; +vp8_sub_pixel_variance16x8_sse2=vp8_sub_pixel_variance16x8_wmt + +prototype unsigned int vp8_sub_pixel_variance8x8 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse" +specialize vp8_sub_pixel_variance8x8 sse2 mmx +vp8_sub_pixel_variance8x8_sse2=vp8_sub_pixel_variance8x8_wmt + +prototype unsigned int vp8_sub_pixel_variance4x4 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse" +specialize vp8_sub_pixel_variance4x4 sse2 mmx +vp8_sub_pixel_variance4x4_sse2=vp8_sub_pixel_variance4x4_wmt 
+ +prototype unsigned int vp8_sad32x32 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad" +specialize vp8_sad32x32 + +prototype unsigned int vp8_sad16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad" +specialize vp8_sad16x16 mmx sse2 sse3 +vp8_sad16x16_sse2=vp8_sad16x16_wmt + +prototype unsigned int vp8_sad16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad" +specialize vp8_sad16x8 mmx sse2 +vp8_sad16x8_sse2=vp8_sad16x8_wmt + +prototype unsigned int vp8_sad8x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad" +specialize vp8_sad8x16 mmx sse2 +vp8_sad8x16_sse2=vp8_sad8x16_wmt + +prototype unsigned int vp8_sad8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad" +specialize vp8_sad8x8 mmx sse2 +vp8_sad8x8_sse2=vp8_sad8x8_wmt + +prototype unsigned int vp8_sad4x4 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad" +specialize vp8_sad4x4 mmx sse2 +vp8_sad4x4_sse2=vp8_sad4x4_wmt + +prototype unsigned int vp8_variance_halfpixvar16x16_h "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp8_variance_halfpixvar16x16_h mmx sse2 +vp8_variance_halfpixvar16x16_h_sse2=vp8_variance_halfpixvar16x16_h_wmt + +prototype unsigned int vp8_variance_halfpixvar16x16_v "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp8_variance_halfpixvar16x16_v mmx sse2 +vp8_variance_halfpixvar16x16_v_sse2=vp8_variance_halfpixvar16x16_v_wmt + +prototype unsigned int vp8_variance_halfpixvar16x16_hv "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp8_variance_halfpixvar16x16_hv mmx sse2 +vp8_variance_halfpixvar16x16_hv_sse2=vp8_variance_halfpixvar16x16_hv_wmt + +prototype unsigned int vp8_variance_halfpixvar32x32_h "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp8_variance_halfpixvar32x32_h + +prototype unsigned int vp8_variance_halfpixvar32x32_v "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp8_variance_halfpixvar32x32_v + +prototype unsigned int vp8_variance_halfpixvar32x32_hv "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp8_variance_halfpixvar32x32_hv + +prototype void vp8_sad32x32x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array" +specialize vp8_sad32x32x3 + +prototype void vp8_sad16x16x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array" +specialize vp8_sad16x16x3 sse3 ssse3 + +prototype void vp8_sad16x8x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array" +specialize vp8_sad16x8x3 sse3 ssse3 + +prototype void vp8_sad8x16x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array" 
+specialize vp8_sad8x16x3 sse3 + +prototype void vp8_sad8x8x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array" +specialize vp8_sad8x8x3 sse3 + +prototype void vp8_sad4x4x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array" +specialize vp8_sad4x4x3 sse3 + +prototype void vp8_sad32x32x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array" +specialize vp8_sad32x32x8 + +prototype void vp8_sad16x16x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array" +specialize vp8_sad16x16x8 sse4 + +prototype void vp8_sad16x8x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array" +specialize vp8_sad16x8x8 sse4 + +prototype void vp8_sad8x16x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array" +specialize vp8_sad8x16x8 sse4 + +prototype void vp8_sad8x8x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array" +specialize vp8_sad8x8x8 sse4 + +prototype void vp8_sad4x4x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array" +specialize vp8_sad4x4x8 sse4 + +prototype void vp8_sad32x32x4d "const unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr[], int ref_stride, unsigned int *sad_array" +specialize vp8_sad32x32x4d + +prototype void vp8_sad16x16x4d "const unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr[], int ref_stride, unsigned int *sad_array" +specialize vp8_sad16x16x4d sse3 + +prototype void vp8_sad16x8x4d "const unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr[], int ref_stride, unsigned int *sad_array" +specialize vp8_sad16x8x4d sse3 + +prototype void vp8_sad8x16x4d "const unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr[], int ref_stride, unsigned int *sad_array" +specialize vp8_sad8x16x4d sse3 + +prototype void vp8_sad8x8x4d "const unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr[], int ref_stride, unsigned int *sad_array" +specialize vp8_sad8x8x4d sse3 + +prototype void vp8_sad4x4x4d "const unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr[], int ref_stride, unsigned int *sad_array" +specialize vp8_sad4x4x4d sse3 + +# +# Block copy +# +case $arch in + x86*) + prototype void vp8_copy32xn "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n" + specialize vp8_copy32xn sse2 sse3 + ;; +esac + +prototype unsigned int vp8_sub_pixel_mse16x16 "const unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const unsigned char *dst_ptr, int dst_pixels_per_line, unsigned int *sse" +specialize vp8_sub_pixel_mse16x16 sse2 mmx +vp8_sub_pixel_mse16x16_sse2=vp8_sub_pixel_mse16x16_wmt + +prototype unsigned int vp8_mse16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, unsigned int *sse" +specialize vp8_mse16x16 mmx sse2 +vp8_mse16x16_sse2=vp8_mse16x16_wmt + +prototype unsigned int vp8_sub_pixel_mse32x32 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse" +specialize vp8_sub_pixel_mse32x32 + +prototype 
unsigned int vp8_get_mb_ss "const short *" +specialize vp8_get_mb_ss mmx sse2 + +# +# Structured Similarity (SSIM) +# +if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then + [ $arch = "x86_64" ] && sse2_on_x86_64=sse2 + + prototype void vp8_ssim_parms_8x8 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr" + specialize vp8_ssim_parms_8x8 $sse2_on_x86_64 + + prototype void vp8_ssim_parms_16x16 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr" + specialize vp8_ssim_parms_16x16 $sse2_on_x86_64 +fi + +fi +# end encoder functions diff --git a/vp8/common/seg_common.c b/vp8/common/seg_common.c index b616391ba..a11fe87e9 100644 --- a/vp8/common/seg_common.c +++ b/vp8/common/seg_common.c @@ -19,7 +19,7 @@ const int vp8_seg_feature_data_bits[SEG_LVL_MAX] = // the coding mechanism is still subject to change so these provide a // convenient single point of change. -int segfeature_active(MACROBLOCKD *xd, +int segfeature_active(const MACROBLOCKD *xd, int segment_id, SEG_LVL_FEATURES feature_id) { // Return true if mask bit set and segmentation enabled. @@ -66,7 +66,7 @@ void set_segdata(MACROBLOCKD *xd, xd->segment_feature_data[segment_id][feature_id] = seg_data; } -int get_segdata(MACROBLOCKD *xd, +int get_segdata(const MACROBLOCKD *xd, int segment_id, SEG_LVL_FEATURES feature_id) { return xd->segment_feature_data[segment_id][feature_id]; @@ -126,7 +126,7 @@ void set_segref(MACROBLOCKD *xd, (1 << ref_frame); } -int check_segref(MACROBLOCKD *xd, +int check_segref(const MACROBLOCKD *xd, int segment_id, MV_REFERENCE_FRAME ref_frame) { return (xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] & diff --git a/vp8/common/seg_common.h b/vp8/common/seg_common.h index 74131926f..59f40d112 100644 --- a/vp8/common/seg_common.h +++ b/vp8/common/seg_common.h @@ -15,7 +15,7 @@ #ifndef __INC_SEG_COMMON_H__ #define __INC_SEG_COMMON_H__ 1 -int segfeature_active(MACROBLOCKD *xd, +int segfeature_active(const MACROBLOCKD *xd, int segment_id, SEG_LVL_FEATURES feature_id); @@ -42,7 +42,7 @@ void set_segdata(MACROBLOCKD *xd, SEG_LVL_FEATURES feature_id, int seg_data); -int get_segdata(MACROBLOCKD *xd, +int get_segdata(const MACROBLOCKD *xd, int segment_id, SEG_LVL_FEATURES feature_id); @@ -73,7 +73,7 @@ void set_segref(MACROBLOCKD *xd, int segment_id, MV_REFERENCE_FRAME ref_frame); -int check_segref(MACROBLOCKD *xd, +int check_segref(const MACROBLOCKD *xd, int segment_id, MV_REFERENCE_FRAME ref_frame); diff --git a/vp8/decoder/asm_dec_offsets.c b/vp8/decoder/asm_dec_offsets.c index 1a6090b2b..8551bab10 100644 --- a/vp8/decoder/asm_dec_offsets.c +++ b/vp8/decoder/asm_dec_offsets.c @@ -24,7 +24,6 @@ DEFINE(detok_A, offsetof(DETOK, A)); DEFINE(detok_L, offsetof(DETOK, L)); DEFINE(detok_qcoeff_start_ptr, offsetof(DETOK, qcoeff_start_ptr)); -DEFINE(detok_current_bc, offsetof(DETOK, current_bc)); DEFINE(detok_coef_probs, offsetof(DETOK, coef_probs)); DEFINE(detok_eob, offsetof(DETOK, eob)); diff --git a/vp8/decoder/decodemv.c b/vp8/decoder/decodemv.c index fa15af9e9..72ff126f2 100644 --- a/vp8/decoder/decodemv.c +++ b/vp8/decoder/decodemv.c @@ -18,7 +18,7 @@ #include "vp8/common/seg_common.h" #include "vp8/common/pred_common.h" #include "vp8/common/entropy.h" - +#include "vp8/decoder/decodemv.h" #if CONFIG_DEBUG #include <assert.h> #endif @@ -73,12 +73,12 @@ static void 
vp8_read_mb_segid(vp8_reader *r, MB_MODE_INFO *mi, } extern const int vp8_i8x8_block[4]; -static void vp8_kfread_modes(VP8D_COMP *pbi, - MODE_INFO *m, - int mb_row, - int mb_col) { - VP8_COMMON *const cm = & pbi->common; - vp8_reader *const bc = & pbi->bc; +static void kfread_modes(VP8D_COMP *pbi, + MODE_INFO *m, + int mb_row, + int mb_col, + BOOL_DECODER* const bc) { + VP8_COMMON *const cm = &pbi->common; const int mis = pbi->common.mode_info_stride; int map_index = mb_row * pbi->common.mb_cols + mb_col; MB_PREDICTION_MODE y_mode; @@ -97,7 +97,7 @@ static void vp8_kfread_modes(VP8D_COMP *pbi, m->mbmi.segment_id, SEG_LVL_EOB) || (get_segdata(&pbi->mb, m->mbmi.segment_id, SEG_LVL_EOB) != 0))) { - MACROBLOCKD *const xd = & pbi->mb; + MACROBLOCKD *const xd = &pbi->mb; m->mbmi.mb_skip_coeff = vp8_read(bc, get_pred_prob(cm, xd, PRED_MBSKIP)); } else { if (segfeature_active(&pbi->mb, @@ -458,7 +458,7 @@ static MV_REFERENCE_FRAME read_ref_frame(VP8D_COMP *pbi, int seg_ref_active; int seg_ref_count = 0; - VP8_COMMON *const cm = & pbi->common; + VP8_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; seg_ref_active = segfeature_active(xd, @@ -597,9 +597,9 @@ static const unsigned char mbsplit_fill_offset[4][16] = { }; #if CONFIG_SWITCHABLE_INTERP -static void read_switchable_interp_probs(VP8D_COMP *pbi) { - VP8_COMMON *const cm = & pbi->common; - vp8_reader *const bc = & pbi->bc; +static void read_switchable_interp_probs(VP8D_COMP* const pbi, + BOOL_DECODER* const bc) { + VP8_COMMON *const cm = &pbi->common; int i, j; for (j = 0; j <= VP8_SWITCHABLE_FILTERS; ++j) { //for (j = 0; j <= 0; ++j) { @@ -612,25 +612,20 @@ static void read_switchable_interp_probs(VP8D_COMP *pbi) { } #endif -static void mb_mode_mv_init(VP8D_COMP *pbi) { - VP8_COMMON *const cm = & pbi->common; - vp8_reader *const bc = & pbi->bc; +static void mb_mode_mv_init(VP8D_COMP *pbi, vp8_reader *bc) { + VP8_COMMON *const cm = &pbi->common; #if CONFIG_NEWMVENTROPY nmv_context *const nmvc = &pbi->common.fc.nmvc; #else MV_CONTEXT *const mvc = pbi->common.fc.mvc; MV_CONTEXT_HP *const mvc_hp = pbi->common.fc.mvc_hp; #endif - MACROBLOCKD *const xd = & pbi->mb; - - vpx_memset(cm->mbskip_pred_probs, 0, sizeof(cm->mbskip_pred_probs)); - if (pbi->common.mb_no_coeff_skip) { - int k; - for (k = 0; k < MBSKIP_CONTEXTS; ++k) - cm->mbskip_pred_probs[k] = (vp8_prob)vp8_read_literal(bc, 8); - } + MACROBLOCKD *const xd = &pbi->mb; - if (cm->frame_type != KEY_FRAME) { + if (cm->frame_type == KEY_FRAME) { + if (!cm->kf_ymode_probs_update) + cm->kf_ymode_probs_index = vp8_read_literal(bc, 3); + } else { #if CONFIG_PRED_FILTER cm->pred_filter_mode = (vp8_prob)vp8_read_literal(bc, 2); @@ -639,7 +634,7 @@ static void mb_mode_mv_init(VP8D_COMP *pbi) { #endif #if CONFIG_SWITCHABLE_INTERP if (cm->mcomp_filter_type == SWITCHABLE) - read_switchable_interp_probs(pbi); + read_switchable_interp_probs(pbi, bc); #endif // Decode the baseline probabilities for decoding reference frame cm->prob_intra_coded = (vp8_prob)vp8_read_literal(bc, 8); @@ -681,10 +676,10 @@ static void mb_mode_mv_init(VP8D_COMP *pbi) { // the bitstream or if the value is temporally predicted asserts the predicted // value static void read_mb_segment_id(VP8D_COMP *pbi, - int mb_row, int mb_col) { - vp8_reader *const bc = & pbi->bc; - VP8_COMMON *const cm = & pbi->common; - MACROBLOCKD *const xd = & pbi->mb; + int mb_row, int mb_col, + BOOL_DECODER* const bc) { + VP8_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; MODE_INFO *mi = xd->mode_info_context; 
MB_MODE_INFO *mbmi = &mi->mbmi; int index = mb_row * pbi->common.mb_cols + mb_col; @@ -753,9 +748,9 @@ static void read_mb_segment_id(VP8D_COMP *pbi, static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, MODE_INFO *prev_mi, - int mb_row, int mb_col) { - VP8_COMMON *const cm = & pbi->common; - vp8_reader *const bc = & pbi->bc; + int mb_row, int mb_col, + BOOL_DECODER* const bc) { + VP8_COMMON *const cm = &pbi->common; #if CONFIG_NEWMVENTROPY nmv_context *const nmvc = &pbi->common.fc.nmvc; #else @@ -763,9 +758,9 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, MV_CONTEXT_HP *const mvc_hp = pbi->common.fc.mvc_hp; #endif const int mis = pbi->common.mode_info_stride; - MACROBLOCKD *const xd = & pbi->mb; + MACROBLOCKD *const xd = &pbi->mb; - int_mv *const mv = & mbmi->mv; + int_mv *const mv = &mbmi->mv; int mb_to_left_edge; int mb_to_right_edge; int mb_to_top_edge; @@ -795,7 +790,7 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, xd->prev_mode_info_context = prev_mi; // Read the macroblock segment id. - read_mb_segment_id(pbi, mb_row, mb_col); + read_mb_segment_id(pbi, mb_row, mb_col, bc); if (pbi->common.mb_no_coeff_skip && (!segfeature_active(xd, @@ -1239,6 +1234,14 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, #endif do { mi->bmi[j].as_mode.first = (B_PREDICTION_MODE)vp8_read_bmode(bc, pbi->common.fc.bmode_prob); + /* + { + int p; + for (p = 0; p < VP8_BINTRAMODES - 1; ++p) + printf(" %d", pbi->common.fc.bmode_prob[p]); + printf("\nbmode[%d][%d]: %d\n", pbi->common.current_video_frame, j, mi->bmi[j].as_mode.first); + } + */ pbi->common.fc.bmode_counts[mi->bmi[j].as_mode.first]++; #if CONFIG_COMP_INTRA_PRED if (use_comp_pred) { @@ -1302,106 +1305,28 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, } } -void vp8_decode_mode_mvs(VP8D_COMP *pbi) { - int i; +void vpx_decode_mode_mvs_init(VP8D_COMP *pbi, BOOL_DECODER* const bc) { VP8_COMMON *cm = &pbi->common; - MODE_INFO *mi = cm->mi; - MACROBLOCKD *const xd = &pbi->mb; - int sb_row, sb_col; - int sb_rows = (cm->mb_rows + 1) >> 1; - int sb_cols = (cm->mb_cols + 1) >> 1; - int row_delta[4] = { 0, +1, 0, -1}; - int col_delta[4] = { +1, -1, +1, +1}; - - MODE_INFO *prev_mi = cm->prev_mi; - - mb_mode_mv_init(pbi); - if (cm->frame_type == KEY_FRAME && !cm->kf_ymode_probs_update) { - cm->kf_ymode_probs_index = vp8_read_literal(&pbi->bc, 3); - } - - for (sb_row = 0; sb_row < sb_rows; sb_row++) { - int mb_col = 0; - int mb_row = (sb_row << 1); - - for (sb_col = 0; sb_col < sb_cols; sb_col++) { -#if CONFIG_SUPERBLOCKS - mi->mbmi.encoded_as_sb = vp8_read(&pbi->bc, cm->sb_coded); -#endif - for (i = 0; i < 4; i++) { - - int dy = row_delta[i]; - int dx = col_delta[i]; - int offset_extended = dy * cm->mode_info_stride + dx; - - if ((mb_row >= cm->mb_rows) || (mb_col >= cm->mb_cols)) { - /* next macroblock */ - mb_row += dy; - mb_col += dx; - mi += offset_extended; - prev_mi += offset_extended; - continue; - } -#if CONFIG_SUPERBLOCKS - if (i) - mi->mbmi.encoded_as_sb = 0; -#endif - - // Make sure the MacroBlockD mode info pointer is set correctly - xd->mode_info_context = mi; - xd->prev_mode_info_context = prev_mi; - - pbi->mb.mb_to_top_edge = -((mb_row * 16)) << 3; - pbi->mb.mb_to_bottom_edge = - ((pbi->common.mb_rows - 1 - mb_row) * 16) << 3; - - if (cm->frame_type == KEY_FRAME) - vp8_kfread_modes(pbi, mi, mb_row, mb_col); - else - read_mb_modes_mv(pbi, mi, &mi->mbmi, prev_mi, mb_row, - mb_col); - 
-#if CONFIG_SUPERBLOCKS - if (mi->mbmi.encoded_as_sb) { - assert(!i); - mb_col += 2; - mi[1] = mi[cm->mode_info_stride] = - mi[cm->mode_info_stride + 1] = mi[0]; - mi += 2; - prev_mi += 2; - break; - } -#endif - - /* next macroblock */ - mb_row += dy; - mb_col += dx; - mi += offset_extended; - prev_mi += offset_extended; - } - } - - mi += cm->mode_info_stride + (1 - (cm->mb_cols & 0x1)); - prev_mi += cm->mode_info_stride + (1 - (cm->mb_cols & 0x1)); + vpx_memset(cm->mbskip_pred_probs, 0, sizeof(cm->mbskip_pred_probs)); + if (pbi->common.mb_no_coeff_skip) { + int k; + for (k = 0; k < MBSKIP_CONTEXTS; ++k) + cm->mbskip_pred_probs[k] = (vp8_prob)vp8_read_literal(bc, 8); } -} -void vpx_decode_mode_mvs_init(VP8D_COMP *pbi){ - VP8_COMMON *cm = &pbi->common; - mb_mode_mv_init(pbi); - if (cm->frame_type == KEY_FRAME && !cm->kf_ymode_probs_update) - cm->kf_ymode_probs_index = vp8_read_literal(&pbi->bc, 3); + mb_mode_mv_init(pbi, bc); } void vpx_decode_mb_mode_mv(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, - int mb_col){ + int mb_col, + BOOL_DECODER* const bc) { MODE_INFO *mi = xd->mode_info_context; MODE_INFO *prev_mi = xd->prev_mode_info_context; if (pbi->common.frame_type == KEY_FRAME) - vp8_kfread_modes(pbi, mi, mb_row, mb_col); + kfread_modes(pbi, mi, mb_row, mb_col, bc); else - read_mb_modes_mv(pbi, mi, &mi->mbmi, prev_mi, mb_row, mb_col); + read_mb_modes_mv(pbi, mi, &mi->mbmi, prev_mi, mb_row, mb_col, bc); } diff --git a/vp8/decoder/decodemv.h b/vp8/decoder/decodemv.h index 17bbb5b67..bfb815f6f 100644 --- a/vp8/decoder/decodemv.h +++ b/vp8/decoder/decodemv.h @@ -11,9 +11,9 @@ #include "onyxd_int.h" -void vp8_decode_mode_mvs(VP8D_COMP *); -void vpx_decode_mb_mode_mv(VP8D_COMP *pbi, - MACROBLOCKD *xd, +void vpx_decode_mb_mode_mv(VP8D_COMP* const pbi, + MACROBLOCKD* const xd, int mb_row, - int mb_col); -void vpx_decode_mode_mvs_init(VP8D_COMP *pbi); + int mb_col, + BOOL_DECODER* const bc); +void vpx_decode_mode_mvs_init(VP8D_COMP* const pbi, BOOL_DECODER* const bc); diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c index cd9b7d247..01739c0db 100644 --- a/vp8/decoder/decodframe.c +++ b/vp8/decoder/decodframe.c @@ -77,7 +77,7 @@ static vp8_prob read_prob_diff_update(vp8_reader *const bc, int oldp) { void vp8cx_init_de_quantizer(VP8D_COMP *pbi) { int i; int Q; - VP8_COMMON *const pc = & pbi->common; + VP8_COMMON *const pc = &pbi->common; for (Q = 0; Q < QINDEX_RANGE; Q++) { pc->Y1dequant[Q][0] = (short)vp8_dc_quant(Q, pc->y1dc_delta_q); @@ -98,7 +98,7 @@ void vp8cx_init_de_quantizer(VP8D_COMP *pbi) { void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd) { int i; int QIndex; - VP8_COMMON *const pc = & pbi->common; + VP8_COMMON *const pc = &pbi->common; int segment_id = xd->mode_info_context->mbmi.segment_id; // Set the Q baseline allowing for any segment level adjustment @@ -115,17 +115,13 @@ void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd) { } } else QIndex = pc->base_qindex; + xd->q_index = QIndex; /* Set up the block level dequant pointers */ for (i = 0; i < 16; i++) { xd->block[i].dequant = pc->Y1dequant[QIndex]; } -#if CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM16X16 - xd->q_index = QIndex; -#endif - - #if CONFIG_LOSSLESS if (!QIndex) { pbi->common.rtcd.idct.idct1 = vp8_short_inv_walsh4x4_1_x8_c; @@ -206,25 +202,21 @@ static void skip_recon_mb(VP8D_COMP *pbi, MACROBLOCKD *xd) { } static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, - unsigned int mb_col) { + unsigned int mb_col, + BOOL_DECODER* const bc) { int eobtotal = 
0; MB_PREDICTION_MODE mode; int i; - int tx_type; + int tx_size; +#if CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM8X8 || \ + CONFIG_HYBRIDTRANSFORM16X16 + TX_TYPE tx_type; +#endif #if CONFIG_SUPERBLOCKS VP8_COMMON *pc = &pbi->common; int orig_skip_flag = xd->mode_info_context->mbmi.mb_skip_coeff; #endif -#if CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM16X16 - int QIndex; - int active_ht; -#endif - -#if CONFIG_HYBRIDTRANSFORM16X16 - int active_ht16; -#endif - // re-initialize macroblock dequantizer before detokenization if (xd->segmentation_enabled) mb_init_dequantizer(pbi, xd); @@ -235,43 +227,9 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, } #endif - tx_type = xd->mode_info_context->mbmi.txfm_size; + tx_size = xd->mode_info_context->mbmi.txfm_size; mode = xd->mode_info_context->mbmi.mode; -#if CONFIG_HYBRIDTRANSFORM - // parse transform types for intra 4x4 mode - QIndex = xd->q_index; - active_ht = (QIndex < ACTIVE_HT); - if (mode == B_PRED) { - for (i = 0; i < 16; i++) { - BLOCKD *b = &xd->block[i]; - int b_mode = xd->mode_info_context->bmi[i].as_mode.first; - if(active_ht) - txfm_map(b, b_mode); - } // loop over 4x4 blocks - } -#endif - -#if CONFIG_HYBRIDTRANSFORM8X8 - if (mode == I8X8_PRED) { - for (i = 0; i < 4; i++) { - int ib = vp8_i8x8_block[i]; - BLOCKD *b = &xd->block[ib]; - int i8x8mode = b->bmi.as_mode.first; - txfm_map(b, pred_mode_conv(i8x8mode)); - } - } -#endif - -#if CONFIG_HYBRIDTRANSFORM16X16 - active_ht16 = (QIndex < ACTIVE_HT16); - if (mode < I8X8_PRED) { - BLOCKD *b = &xd->block[0]; - if(active_ht16) - txfm_map(b, pred_mode_conv(mode)); - } -#endif - if (xd->mode_info_context->mbmi.mb_skip_coeff) { vp8_reset_mb_tokens_context(xd); #if CONFIG_SUPERBLOCKS @@ -283,17 +241,18 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, xd->left_context--; } #endif - } else if (!vp8dx_bool_error(xd->current_bc)) { + } else if (!vp8dx_bool_error(bc)) { for (i = 0; i < 25; i++) { xd->block[i].eob = 0; xd->eobs[i] = 0; } - if (tx_type == TX_16X16) - eobtotal = vp8_decode_mb_tokens_16x16(pbi, xd); - else if (tx_type == TX_8X8) - eobtotal = vp8_decode_mb_tokens_8x8(pbi, xd); - else - eobtotal = vp8_decode_mb_tokens(pbi, xd); + if (tx_size == TX_16X16) { + eobtotal = vp8_decode_mb_tokens_16x16(pbi, xd, bc); + } else if (tx_size == TX_8X8) { + eobtotal = vp8_decode_mb_tokens_8x8(pbi, xd, bc); + } else { + eobtotal = vp8_decode_mb_tokens(pbi, xd, bc); + } } //mode = xd->mode_info_context->mbmi.mode; @@ -305,8 +264,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, if (eobtotal == 0 && mode != B_PRED && mode != SPLITMV && mode != I8X8_PRED - && !vp8dx_bool_error(xd->current_bc) - ) { + && !vp8dx_bool_error(bc)) { /* Special case: Force the loopfilter to skip when eobtotal and * mb_skip_coeff are zero. * */ @@ -338,13 +296,6 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, if (mode != B_PRED) { vp8_build_intra_predictors_mby(xd); } -#if 0 - // Intra-modes requiring recon data from top-right - // MB have been temporarily disabled. 
- else { - vp8_intra_prediction_down_copy(xd); - } -#endif } } else { #if CONFIG_SUPERBLOCKS @@ -380,13 +331,17 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) { #if CONFIG_HYBRIDTRANSFORM8X8 - vp8_ht_dequant_idct_add_8x8_c(b->bmi.as_mode.tx_type, - q, dq, pre, dst, 16, stride); - q += 64; + tx_type = get_tx_type(xd, &xd->block[idx]); + if (tx_type != DCT_DCT) { + vp8_ht_dequant_idct_add_8x8_c(tx_type, + q, dq, pre, dst, 16, stride); + } else { + vp8_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride); + } #else vp8_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride); - q += 64; #endif + q += 64; } else { for (j = 0; j < 4; j++) { b = &xd->block[ib + iblock[j]]; @@ -396,7 +351,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, } b = &xd->block[16 + i]; - vp8_intra_uv4x4_predict(b, i8x8mode, b->predictor); + vp8_intra_uv4x4_predict(b, i8x8mode, b->predictor); DEQUANT_INVOKE(&pbi->dequant, idct_add)(b->qcoeff, b->dequant, b->predictor, *(b->base_dst) + b->dst, 8, @@ -409,6 +364,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, b->dst_stride); } } else if (mode == B_PRED) { + vp8_intra_prediction_down_copy(xd); for (i = 0; i < 16; i++) { BLOCKD *b = &xd->block[i]; int b_mode = xd->mode_info_context->bmi[i].as_mode.first; @@ -425,16 +381,17 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, #endif #if CONFIG_HYBRIDTRANSFORM - if(active_ht) - vp8_ht_dequant_idct_add_c( (TX_TYPE)b->bmi.as_mode.tx_type, b->qcoeff, - b->dequant, b->predictor, - *(b->base_dst) + b->dst, 16, b->dst_stride); - else + tx_type = get_tx_type(xd, b); + if (tx_type != DCT_DCT) { + vp8_ht_dequant_idct_add_c(tx_type, b->qcoeff, + b->dequant, b->predictor, + *(b->base_dst) + b->dst, 16, b->dst_stride); + } else { vp8_dequant_idct_add_c(b->qcoeff, b->dequant, b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride); + } #else - if (xd->eobs[i] > 1) - { + if (xd->eobs[i] > 1) { DEQUANT_INVOKE(&pbi->dequant, idct_add) (b->qcoeff, b->dequant, b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride); @@ -454,15 +411,12 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, } else { BLOCKD *b = &xd->block[24]; - if (tx_type == TX_16X16) { + if (tx_size == TX_16X16) { #if CONFIG_HYBRIDTRANSFORM16X16 - if (mode < I8X8_PRED && active_ht16) { - BLOCKD *bd = &xd->block[0]; - TX_TYPE txfm; - txfm_map(bd, pred_mode_conv(mode)); - txfm = bd->bmi.as_mode.tx_type; - - vp8_ht_dequant_idct_add_16x16_c(txfm, xd->qcoeff, + BLOCKD *bd = &xd->block[0]; + tx_type = get_tx_type(xd, bd); + if (tx_type != DCT_DCT) { + vp8_ht_dequant_idct_add_16x16_c(tx_type, xd->qcoeff, xd->block[0].dequant, xd->predictor, xd->dst.y_buffer, 16, xd->dst.y_stride); } else { @@ -475,8 +429,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, xd->predictor, xd->dst.y_buffer, 16, xd->dst.y_stride); #endif - } - else if (tx_type == TX_8X8) { + } else if (tx_size == TX_8X8) { #if CONFIG_SUPERBLOCKS void *orig = xd->mode_info_context; int n, num = xd->mode_info_context->mbmi.encoded_as_sb ? 
4 : 1; @@ -492,7 +445,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, xd->mode_info_context += (n & 1); xd->mode_info_context += (n >> 1) * pc->mode_info_stride; if (!orig_skip_flag) { - eobtotal = vp8_decode_mb_tokens_8x8(pbi, xd); + eobtotal = vp8_decode_mb_tokens_8x8(pbi, xd, bc); if (eobtotal == 0) // skip loopfilter xd->mode_info_context->mbmi.mb_skip_coeff = 1; } else { @@ -561,10 +514,9 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, #if CONFIG_SUPERBLOCKS if (!xd->mode_info_context->mbmi.encoded_as_sb) { #endif - if ((tx_type == TX_8X8 && + if ((tx_size == TX_8X8 && xd->mode_info_context->mbmi.mode != I8X8_PRED) - || tx_type == TX_16X16 - ) + || tx_size == TX_16X16) DEQUANT_INVOKE(&pbi->dequant, idct_add_uv_block_8x8) // (xd->qcoeff + 16 * 16, xd->block[16].dequant, xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer, @@ -604,7 +556,8 @@ FILE *vpxlog = 0; /* Decode a row of Superblocks (2x2 region of MBs) */ static void -decode_sb_row(VP8D_COMP *pbi, VP8_COMMON *pc, int mbrow, MACROBLOCKD *xd) { +decode_sb_row(VP8D_COMP *pbi, VP8_COMMON *pc, int mbrow, MACROBLOCKD *xd, + BOOL_DECODER* const bc) { int i; int sb_col; int mb_row, mb_col; @@ -627,8 +580,7 @@ decode_sb_row(VP8D_COMP *pbi, VP8_COMMON *pc, int mbrow, MACROBLOCKD *xd) { MODE_INFO *mi = xd->mode_info_context; #if CONFIG_SUPERBLOCKS - if (pbi->interleaved_decoding) - mi->mbmi.encoded_as_sb = vp8_read(&pbi->bc, pc->sb_coded); + mi->mbmi.encoded_as_sb = vp8_read(bc, pc->sb_coded); #endif // Process the 4 MBs within the SB in the order: @@ -638,6 +590,8 @@ decode_sb_row(VP8D_COMP *pbi, VP8_COMMON *pc, int mbrow, MACROBLOCKD *xd) { int dx = col_delta[i]; int offset_extended = dy * xd->mode_info_stride + dx; + xd->mb_index = i; + mi = xd->mode_info_context; if ((mb_row >= pc->mb_rows) || (mb_col >= pc->mb_cols)) { // MB lies outside frame, skip on to next @@ -677,8 +631,7 @@ decode_sb_row(VP8D_COMP *pbi, VP8_COMMON *pc, int mbrow, MACROBLOCKD *xd) { if (i) mi->mbmi.encoded_as_sb = 0; #endif - if(pbi->interleaved_decoding) - vpx_decode_mb_mode_mv(pbi, xd, mb_row, mb_col); + vpx_decode_mb_mode_mv(pbi, xd, mb_row, mb_col, bc); update_blockd_bmi(xd); @@ -726,7 +679,7 @@ decode_sb_row(VP8D_COMP *pbi, VP8_COMMON *pc, int mbrow, MACROBLOCKD *xd) { mi[pc->mode_info_stride + 1] = mi[0]; } #endif - decode_macroblock(pbi, xd, mb_col); + decode_macroblock(pbi, xd, mb_col, bc); #if CONFIG_SUPERBLOCKS if (xd->mode_info_context->mbmi.encoded_as_sb) { mi[1].mbmi.txfm_size = mi[0].mbmi.txfm_size; @@ -736,7 +689,7 @@ decode_sb_row(VP8D_COMP *pbi, VP8_COMMON *pc, int mbrow, MACROBLOCKD *xd) { #endif /* check if the boolean decoder has suffered an error */ - xd->corrupted |= vp8dx_bool_error(xd->current_bc); + xd->corrupted |= vp8dx_bool_error(bc); #if CONFIG_SUPERBLOCKS if (mi->mbmi.encoded_as_sb) { @@ -775,21 +728,17 @@ static int read_is_valid(const unsigned char *start, static void setup_token_decoder(VP8D_COMP *pbi, - const unsigned char *cx_data) { + const unsigned char *cx_data, + BOOL_DECODER* const bool_decoder) { VP8_COMMON *pc = &pbi->common; const unsigned char *user_data_end = pbi->Source + pbi->source_sz; - vp8_reader *bool_decoder; const unsigned char *partition; ptrdiff_t partition_size; ptrdiff_t bytes_left; - // Dummy read for now - vp8_read_literal(&pbi->bc, 2); - // Set up pointers to token partition partition = cx_data; - bool_decoder = &pbi->bc2; bytes_left = user_data_end - partition; partition_size = bytes_left; @@ -809,8 +758,8 @@ static void setup_token_decoder(VP8D_COMP *pbi, } 
static void init_frame(VP8D_COMP *pbi) { - VP8_COMMON *const pc = & pbi->common; - MACROBLOCKD *const xd = & pbi->mb; + VP8_COMMON *const pc = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; if (pc->frame_type == KEY_FRAME) { /* Various keyframe initializations */ @@ -887,8 +836,8 @@ static void init_frame(VP8D_COMP *pbi) { static void read_coef_probs2(VP8D_COMP *pbi) { const vp8_prob grpupd = 192; int i, j, k, l; - vp8_reader *const bc = & pbi->bc; - VP8_COMMON *const pc = & pbi->common; + vp8_reader *const bc = &pbi->bc; + VP8_COMMON *const pc = &pbi->common; for (l = 0; l < ENTROPY_NODES; l++) { if (vp8_read(bc, grpupd)) { // printf("Decoding %d\n", l); @@ -928,10 +877,9 @@ static void read_coef_probs2(VP8D_COMP *pbi) { } #endif -static void read_coef_probs(VP8D_COMP *pbi) { +static void read_coef_probs(VP8D_COMP *pbi, BOOL_DECODER* const bc) { int i, j, k, l; - vp8_reader *const bc = & pbi->bc; - VP8_COMMON *const pc = & pbi->common; + VP8_COMMON *const pc = &pbi->common; { if (vp8_read_bit(bc)) { @@ -1055,9 +1003,9 @@ static void read_coef_probs(VP8D_COMP *pbi) { } int vp8_decode_frame(VP8D_COMP *pbi) { - vp8_reader *const bc = & pbi->bc; - VP8_COMMON *const pc = & pbi->common; - MACROBLOCKD *const xd = & pbi->mb; + BOOL_DECODER header_bc, residual_bc; + VP8_COMMON *const pc = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; const unsigned char *data = (const unsigned char *)pbi->Source; const unsigned char *data_end = data + pbi->source_sz; ptrdiff_t first_partition_length_in_bytes = 0; @@ -1143,33 +1091,51 @@ int vp8_decode_frame(VP8D_COMP *pbi) { init_frame(pbi); - if (vp8dx_start_decode(bc, data, data_end - data)) + if (vp8dx_start_decode(&header_bc, data, first_partition_length_in_bytes)) vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR, "Failed to allocate bool decoder 0"); if (pc->frame_type == KEY_FRAME) { - pc->clr_type = (YUV_TYPE)vp8_read_bit(bc); - pc->clamp_type = (CLAMP_TYPE)vp8_read_bit(bc); + pc->clr_type = (YUV_TYPE)vp8_read_bit(&header_bc); + pc->clamp_type = (CLAMP_TYPE)vp8_read_bit(&header_bc); } /* Is segmentation enabled */ - xd->segmentation_enabled = (unsigned char)vp8_read_bit(bc); + xd->segmentation_enabled = (unsigned char)vp8_read_bit(&header_bc); if (xd->segmentation_enabled) { // Read whether or not the segmentation map is being explicitly // updated this frame. - xd->update_mb_segmentation_map = (unsigned char)vp8_read_bit(bc); + xd->update_mb_segmentation_map = (unsigned char)vp8_read_bit(&header_bc); // If so what method will be used. - if (xd->update_mb_segmentation_map) - pc->temporal_update = (unsigned char)vp8_read_bit(bc); + if (xd->update_mb_segmentation_map) { + // Which macro block level features are enabled + // Read the probs used to decode the segment id for each macro + // block. + for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) { + xd->mb_segment_tree_probs[i] = vp8_read_bit(&header_bc) ? + (vp8_prob)vp8_read_literal(&header_bc, 8) : 255; + } + + // Read the prediction probs needed to decode the segment id + pc->temporal_update = (unsigned char)vp8_read_bit(&header_bc); + for (i = 0; i < PREDICTION_PROBS; i++) { + if (pc->temporal_update) { + pc->segment_pred_probs[i] = vp8_read_bit(&header_bc) ? 
+ (vp8_prob)vp8_read_literal(&header_bc, 8) : 255; + } else { + pc->segment_pred_probs[i] = 255; + } + } + } // Is the segment data being updated - xd->update_mb_segmentation_data = (unsigned char)vp8_read_bit(bc); + xd->update_mb_segmentation_data = (unsigned char)vp8_read_bit(&header_bc); if (xd->update_mb_segmentation_data) { int data; - xd->mb_segment_abs_delta = (unsigned char)vp8_read_bit(bc); + xd->mb_segment_abs_delta = (unsigned char)vp8_read_bit(&header_bc); clearall_segfeatures(xd); @@ -1180,11 +1146,11 @@ int vp8_decode_frame(VP8D_COMP *pbi) { #if CONFIG_FEATUREUPDATES // feature updated? - if (vp8_read_bit(bc)) { + if (vp8_read_bit(&header_bc)) { int active = 1; if (segfeature_active(xd, i, j)) - active = vp8_read_bit(bc); + active = vp8_read_bit(&header_bc); // Is the feature enabled if (active) { @@ -1192,11 +1158,11 @@ int vp8_decode_frame(VP8D_COMP *pbi) { enable_segfeature(xd, i, j); data = (signed char)vp8_read_literal( - bc, seg_feature_data_bits(j)); + &header_bc, seg_feature_data_bits(j)); // Is the segment data signed.. if (is_segfeature_signed(j)) { - if (vp8_read_bit(bc)) + if (vp8_read_bit(&header_bc)) data = - data; } } else @@ -1207,16 +1173,16 @@ int vp8_decode_frame(VP8D_COMP *pbi) { #else // Is the feature enabled - if (vp8_read_bit(bc)) { + if (vp8_read_bit(&header_bc)) { // Update the feature data and mask enable_segfeature(xd, i, j); data = (signed char)vp8_read_literal( - bc, seg_feature_data_bits(j)); + &header_bc, seg_feature_data_bits(j)); // Is the segment data signed.. if (is_segfeature_signed(j)) { - if (vp8_read_bit(bc)) + if (vp8_read_bit(&header_bc)) data = - data; } } else @@ -1227,38 +1193,6 @@ int vp8_decode_frame(VP8D_COMP *pbi) { } } } - - if (xd->update_mb_segmentation_map) { - // Which macro block level features are enabled - vpx_memset(xd->mb_segment_tree_probs, 255, - sizeof(xd->mb_segment_tree_probs)); - vpx_memset(pc->segment_pred_probs, 255, - sizeof(pc->segment_pred_probs)); - - // Read the probs used to decode the segment id for each macro - // block. - for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) { - // If not explicitly set value is defaulted to 255 by - // memset above - if (vp8_read_bit(bc)) - xd->mb_segment_tree_probs[i] = - (vp8_prob)vp8_read_literal(bc, 8); - } - - // If predictive coding of segment map is enabled read the - // prediction probabilities. 
- if (pc->temporal_update) { - // Read the prediction probs needed to decode the segment id - // when predictive coding enabled - for (i = 0; i < PREDICTION_PROBS; i++) { - // If not explicitly set value is defaulted to 255 by - // memset above - if (vp8_read_bit(bc)) - pc->segment_pred_probs[i] = - (vp8_prob)vp8_read_literal(bc, 8); - } - } - } } // Read common prediction model status flag probability updates for the @@ -1270,81 +1204,84 @@ int vp8_decode_frame(VP8D_COMP *pbi) { pc->ref_pred_probs[2] = 40; } else { for (i = 0; i < PREDICTION_PROBS; i++) { - if (vp8_read_bit(bc)) - pc->ref_pred_probs[i] = (vp8_prob)vp8_read_literal(bc, 8); + if (vp8_read_bit(&header_bc)) + pc->ref_pred_probs[i] = (vp8_prob)vp8_read_literal(&header_bc, 8); } } #if CONFIG_SUPERBLOCKS - pc->sb_coded = vp8_read_literal(bc, 8); + pc->sb_coded = vp8_read_literal(&header_bc, 8); #endif /* Read the loop filter level and type */ #if CONFIG_TX_SELECT - pc->txfm_mode = vp8_read_literal(bc, 2); + pc->txfm_mode = vp8_read_literal(&header_bc, 2); if (pc->txfm_mode == TX_MODE_SELECT) { - pc->prob_tx[0] = vp8_read_literal(bc, 8); - pc->prob_tx[1] = vp8_read_literal(bc, 8); + pc->prob_tx[0] = vp8_read_literal(&header_bc, 8); + pc->prob_tx[1] = vp8_read_literal(&header_bc, 8); } #else - pc->txfm_mode = (TXFM_MODE) vp8_read_bit(bc); + pc->txfm_mode = (TXFM_MODE) vp8_read_bit(&header_bc); if (pc->txfm_mode == ALLOW_8X8) pc->txfm_mode = ALLOW_16X16; #endif - pc->filter_type = (LOOPFILTERTYPE) vp8_read_bit(bc); - pc->filter_level = vp8_read_literal(bc, 6); - pc->sharpness_level = vp8_read_literal(bc, 3); + pc->filter_type = (LOOPFILTERTYPE) vp8_read_bit(&header_bc); + pc->filter_level = vp8_read_literal(&header_bc, 6); + pc->sharpness_level = vp8_read_literal(&header_bc, 3); /* Read in loop filter deltas applied at the MB level based on mode or ref frame. */ xd->mode_ref_lf_delta_update = 0; - xd->mode_ref_lf_delta_enabled = (unsigned char)vp8_read_bit(bc); + xd->mode_ref_lf_delta_enabled = (unsigned char)vp8_read_bit(&header_bc); if (xd->mode_ref_lf_delta_enabled) { /* Do the deltas need to be updated */ - xd->mode_ref_lf_delta_update = (unsigned char)vp8_read_bit(bc); + xd->mode_ref_lf_delta_update = (unsigned char)vp8_read_bit(&header_bc); if (xd->mode_ref_lf_delta_update) { /* Send update */ for (i = 0; i < MAX_REF_LF_DELTAS; i++) { - if (vp8_read_bit(bc)) { - /*sign = vp8_read_bit( bc );*/ - xd->ref_lf_deltas[i] = (signed char)vp8_read_literal(bc, 6); + if (vp8_read_bit(&header_bc)) { + /*sign = vp8_read_bit( &header_bc );*/ + xd->ref_lf_deltas[i] = (signed char)vp8_read_literal(&header_bc, 6); - if (vp8_read_bit(bc)) /* Apply sign */ + if (vp8_read_bit(&header_bc)) /* Apply sign */ xd->ref_lf_deltas[i] = xd->ref_lf_deltas[i] * -1; } } /* Send update */ for (i = 0; i < MAX_MODE_LF_DELTAS; i++) { - if (vp8_read_bit(bc)) { - /*sign = vp8_read_bit( bc );*/ - xd->mode_lf_deltas[i] = (signed char)vp8_read_literal(bc, 6); + if (vp8_read_bit(&header_bc)) { + /*sign = vp8_read_bit( &header_bc );*/ + xd->mode_lf_deltas[i] = (signed char)vp8_read_literal(&header_bc, 6); - if (vp8_read_bit(bc)) /* Apply sign */ + if (vp8_read_bit(&header_bc)) /* Apply sign */ xd->mode_lf_deltas[i] = xd->mode_lf_deltas[i] * -1; } } } } - setup_token_decoder(pbi, data + first_partition_length_in_bytes); + // Dummy read for now + vp8_read_literal(&header_bc, 2); - xd->current_bc = &pbi->bc2; + setup_token_decoder(pbi, data + first_partition_length_in_bytes, + &residual_bc); /* Read the default quantizers. 
*/ { int Q, q_update; - Q = vp8_read_literal(bc, QINDEX_BITS); /* AC 1st order Q = default */ + Q = vp8_read_literal(&header_bc, QINDEX_BITS); pc->base_qindex = Q; q_update = 0; - pc->y1dc_delta_q = get_delta_q(bc, pc->y1dc_delta_q, &q_update); - pc->y2dc_delta_q = get_delta_q(bc, pc->y2dc_delta_q, &q_update); - pc->y2ac_delta_q = get_delta_q(bc, pc->y2ac_delta_q, &q_update); - pc->uvdc_delta_q = get_delta_q(bc, pc->uvdc_delta_q, &q_update); - pc->uvac_delta_q = get_delta_q(bc, pc->uvac_delta_q, &q_update); + /* AC 1st order Q = default */ + pc->y1dc_delta_q = get_delta_q(&header_bc, pc->y1dc_delta_q, &q_update); + pc->y2dc_delta_q = get_delta_q(&header_bc, pc->y2dc_delta_q, &q_update); + pc->y2ac_delta_q = get_delta_q(&header_bc, pc->y2ac_delta_q, &q_update); + pc->uvdc_delta_q = get_delta_q(&header_bc, pc->uvdc_delta_q, &q_update); + pc->uvac_delta_q = get_delta_q(&header_bc, pc->uvac_delta_q, &q_update); if (q_update) vp8cx_init_de_quantizer(pbi); @@ -1359,8 +1296,8 @@ int vp8_decode_frame(VP8D_COMP *pbi) { */ if (pc->frame_type != KEY_FRAME) { /* Should the GF or ARF be updated from the current frame */ - pc->refresh_golden_frame = vp8_read_bit(bc); - pc->refresh_alt_ref_frame = vp8_read_bit(bc); + pc->refresh_golden_frame = vp8_read_bit(&header_bc); + pc->refresh_alt_ref_frame = vp8_read_bit(&header_bc); if (pc->refresh_alt_ref_frame) { vpx_memcpy(&pc->fc, &pc->lfc_a, sizeof(pc->fc)); @@ -1378,37 +1315,38 @@ int vp8_decode_frame(VP8D_COMP *pbi) { pc->copy_buffer_to_gf = 0; if (!pc->refresh_golden_frame) - pc->copy_buffer_to_gf = vp8_read_literal(bc, 2); + pc->copy_buffer_to_gf = vp8_read_literal(&header_bc, 2); pc->copy_buffer_to_arf = 0; if (!pc->refresh_alt_ref_frame) - pc->copy_buffer_to_arf = vp8_read_literal(bc, 2); + pc->copy_buffer_to_arf = vp8_read_literal(&header_bc, 2); - pc->ref_frame_sign_bias[GOLDEN_FRAME] = vp8_read_bit(bc); - pc->ref_frame_sign_bias[ALTREF_FRAME] = vp8_read_bit(bc); + pc->ref_frame_sign_bias[GOLDEN_FRAME] = vp8_read_bit(&header_bc); + pc->ref_frame_sign_bias[ALTREF_FRAME] = vp8_read_bit(&header_bc); /* Is high precision mv allowed */ - xd->allow_high_precision_mv = (unsigned char)vp8_read_bit(bc); + xd->allow_high_precision_mv = (unsigned char)vp8_read_bit(&header_bc); // Read the type of subpel filter to use #if CONFIG_SWITCHABLE_INTERP - if (vp8_read_bit(bc)) { + if (vp8_read_bit(&header_bc)) { pc->mcomp_filter_type = SWITCHABLE; } else #endif { - pc->mcomp_filter_type = vp8_read_literal(bc, 2); + pc->mcomp_filter_type = vp8_read_literal(&header_bc, 2); } /* To enable choice of different interploation filters */ vp8_setup_interp_filters(xd, pc->mcomp_filter_type, pc); } - pc->refresh_entropy_probs = vp8_read_bit(bc); + pc->refresh_entropy_probs = vp8_read_bit(&header_bc); if (pc->refresh_entropy_probs == 0) { vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc)); } - pc->refresh_last_frame = pc->frame_type == KEY_FRAME || vp8_read_bit(bc); + pc->refresh_last_frame = (pc->frame_type == KEY_FRAME) + || vp8_read_bit(&header_bc); if (0) { FILE *z = fopen("decodestats.stt", "a"); @@ -1479,7 +1417,7 @@ int vp8_decode_frame(VP8D_COMP *pbi) { vp8_zero(pbi->common.fc.mv_ref_ct); vp8_zero(pbi->common.fc.mv_ref_ct_a); - read_coef_probs(pbi); + read_coef_probs(pbi, &header_bc); vpx_memcpy(&xd->pre, &pc->yv12_fb[pc->lst_fb_idx], sizeof(YV12_BUFFER_CONFIG)); vpx_memcpy(&xd->dst, &pc->yv12_fb[pc->new_fb_idx], sizeof(YV12_BUFFER_CONFIG)); @@ -1500,12 +1438,9 @@ int vp8_decode_frame(VP8D_COMP *pbi) { vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff)); /* Read the mb_no_coeff_skip 
flag */ - pc->mb_no_coeff_skip = (int)vp8_read_bit(bc); + pc->mb_no_coeff_skip = (int)vp8_read_bit(&header_bc); - if(pbi->interleaved_decoding) - vpx_decode_mode_mvs_init(pbi); - else - vp8_decode_mode_mvs(pbi); + vpx_decode_mode_mvs_init(pbi, &header_bc); vpx_memset(pc->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols); @@ -1515,13 +1450,13 @@ int vp8_decode_frame(VP8D_COMP *pbi) { /* Decode a row of superblocks */ for (mb_row = 0; mb_row < pc->mb_rows; mb_row += 2) { - decode_sb_row(pbi, pc, mb_row, xd); + decode_sb_row(pbi, pc, mb_row, xd, &residual_bc); } corrupt_tokens |= xd->corrupted; /* Collect information about decoder corruption. */ /* 1. Check first boolean decoder for errors. */ - pc->yv12_fb[pc->new_fb_idx].corrupted = vp8dx_bool_error(bc); + pc->yv12_fb[pc->new_fb_idx].corrupted = vp8dx_bool_error(&header_bc); /* 2. Check the macroblock information */ pc->yv12_fb[pc->new_fb_idx].corrupted |= corrupt_tokens; @@ -1534,7 +1469,6 @@ int vp8_decode_frame(VP8D_COMP *pbi) { "A stream must start with a complete key frame"); } - /* vpx_log("Decoder: Frame Decoded, Size Roughly:%d bytes \n",bc->pos+pbi->bc2.pos); */ vp8_adapt_coef_probs(pc); if (pc->frame_type != KEY_FRAME) { vp8_adapt_mode_probs(pc); @@ -1561,7 +1495,7 @@ int vp8_decode_frame(VP8D_COMP *pbi) { #ifdef PACKET_TESTING { FILE *f = fopen("decompressor.VP8", "ab"); - unsigned int size = pbi->bc2.pos + pbi->bc.pos + 8; + unsigned int size = residual_bc.pos + header_bc.pos + 8; fwrite((void *) &size, 4, 1, f); fwrite((void *) pbi->Source, size, 1, f); fclose(f); diff --git a/vp8/decoder/detokenize.c b/vp8/decoder/detokenize.c index 5b5ec7e2a..85f213470 100644 --- a/vp8/decoder/detokenize.c +++ b/vp8/decoder/detokenize.c @@ -302,17 +302,17 @@ static int vp8_get_signed(BOOL_DECODER *br, int value_to_sign) { val += (UINT16)(1 << bits_count);\ } while (0); -static int vp8_decode_coefs(VP8D_COMP *dx, const MACROBLOCKD *xd, - ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, - PLANE_TYPE type, +static int decode_coefs(VP8D_COMP *dx, const MACROBLOCKD *xd, + BOOL_DECODER* const br, + ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, + PLANE_TYPE type, #if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM16X16 - TX_TYPE tx_type, + TX_TYPE tx_type, #endif - int seg_eob, INT16 *qcoeff_ptr, int i, - const int *const scan, int block_type, - const int *coef_bands) { + int seg_eob, INT16 *qcoeff_ptr, int i, + const int *const scan, int block_type, + const int *coef_bands) { FRAME_CONTEXT *const fc = &dx->common.fc; - BOOL_DECODER *br = xd->current_bc; int tmp, c = (type == PLANE_TYPE_Y_NO_DC); const vp8_prob *prob, *coef_probs; @@ -446,7 +446,8 @@ SKIP_START: return c; } -int vp8_decode_mb_tokens_16x16(VP8D_COMP *pbi, MACROBLOCKD *xd) { +int vp8_decode_mb_tokens_16x16(VP8D_COMP *pbi, MACROBLOCKD *xd, + BOOL_DECODER* const bc) { ENTROPY_CONTEXT* const A = (ENTROPY_CONTEXT *)xd->above_context; ENTROPY_CONTEXT* const L = (ENTROPY_CONTEXT *)xd->left_context; @@ -473,12 +474,12 @@ int vp8_decode_mb_tokens_16x16(VP8D_COMP *pbi, MACROBLOCKD *xd) { // Luma block { const int* const scan = vp8_default_zig_zag1d_16x16; - c = vp8_decode_coefs(pbi, xd, A, L, type, + c = decode_coefs(pbi, xd, bc, A, L, type, #if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM16X16 - tx_type, + tx_type, #endif - seg_eob, qcoeff_ptr, - 0, scan, TX_16X16, coef_bands_x_16x16); + seg_eob, qcoeff_ptr, + 0, scan, TX_16X16, coef_bands_x_16x16); eobs[0] = c; *A = *L = (c != !type); for (i = 1; i < 16; i++) { @@ -503,12 +504,12 
@@ int vp8_decode_mb_tokens_16x16(VP8D_COMP *pbi, MACROBLOCKD *xd) { ENTROPY_CONTEXT* const l = L + vp8_block2left_8x8[i]; const int* const scan = vp8_default_zig_zag1d_8x8; - c = vp8_decode_coefs(pbi, xd, a, l, type, + c = decode_coefs(pbi, xd, bc, a, l, type, #if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM16X16 - tx_type, + tx_type, #endif - seg_eob, qcoeff_ptr, - i, scan, TX_8X8, coef_bands_x_8x8); + seg_eob, qcoeff_ptr, + i, scan, TX_8X8, coef_bands_x_8x8); a[0] = l[0] = ((eobs[i] = c) != !type); a[1] = a[0]; l[1] = l[0]; @@ -521,7 +522,8 @@ int vp8_decode_mb_tokens_16x16(VP8D_COMP *pbi, MACROBLOCKD *xd) { return eobtotal; } -int vp8_decode_mb_tokens_8x8(VP8D_COMP *pbi, MACROBLOCKD *xd) { +int vp8_decode_mb_tokens_8x8(VP8D_COMP *pbi, MACROBLOCKD *xd, + BOOL_DECODER* const bc) { ENTROPY_CONTEXT *const A = (ENTROPY_CONTEXT *)xd->above_context; ENTROPY_CONTEXT *const L = (ENTROPY_CONTEXT *)xd->left_context; @@ -548,12 +550,12 @@ int vp8_decode_mb_tokens_8x8(VP8D_COMP *pbi, MACROBLOCKD *xd) { seg_eob = get_segdata(xd, segment_id, SEG_LVL_EOB); else seg_eob = 4; - c = vp8_decode_coefs(pbi, xd, a, l, type, + c = decode_coefs(pbi, xd, bc, a, l, type, #if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM16X16 - tx_type, + tx_type, #endif - seg_eob, qcoeff_ptr + 24 * 16, - 24, scan, TX_8X8, coef_bands_x); + seg_eob, qcoeff_ptr + 24 * 16, + 24, scan, TX_8X8, coef_bands_x); a[0] = l[0] = ((eobs[24] = c) != !type); eobtotal += c - 4; @@ -578,18 +580,17 @@ int vp8_decode_mb_tokens_8x8(VP8D_COMP *pbi, MACROBLOCKD *xd) { if (i == 16) type = PLANE_TYPE_UV; #if CONFIG_HYBRIDTRANSFORM8X8 - if (type == PLANE_TYPE_Y_WITH_DC && - xd->mode_info_context->mbmi.mode == I8X8_PRED) { + if (type == PLANE_TYPE_Y_WITH_DC) { tx_type = get_tx_type(xd, xd->block + i); } #endif - c = vp8_decode_coefs(pbi, xd, a, l, type, + c = decode_coefs(pbi, xd, bc, a, l, type, #if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM16X16 - tx_type, + tx_type, #endif - seg_eob, qcoeff_ptr, - i, scan, TX_8X8, coef_bands_x_8x8); + seg_eob, qcoeff_ptr, + i, scan, TX_8X8, coef_bands_x_8x8); a[0] = l[0] = ((eobs[i] = c) != !type); a[1] = a[0]; l[1] = l[0]; @@ -611,12 +612,12 @@ int vp8_decode_mb_tokens_8x8(VP8D_COMP *pbi, MACROBLOCKD *xd) { ENTROPY_CONTEXT *const l = L + vp8_block2left[i]; const int *scan = vp8_default_zig_zag1d; - c = vp8_decode_coefs(pbi, xd, a, l, type, + c = decode_coefs(pbi, xd, bc, a, l, type, #if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM16X16 - tx_type, + tx_type, #endif - seg_eob, qcoeff_ptr, - i, scan, TX_4X4, coef_bands_x); + seg_eob, qcoeff_ptr, + i, scan, TX_4X4, coef_bands_x); a[0] = l[0] = ((eobs[i] = c) != !type); eobtotal += c; @@ -628,7 +629,8 @@ int vp8_decode_mb_tokens_8x8(VP8D_COMP *pbi, MACROBLOCKD *xd) { } -int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *xd) { +int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *xd, + BOOL_DECODER* const bc) { ENTROPY_CONTEXT *const A = (ENTROPY_CONTEXT *)xd->above_context; ENTROPY_CONTEXT *const L = (ENTROPY_CONTEXT *)xd->left_context; @@ -649,12 +651,12 @@ int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *xd) { ENTROPY_CONTEXT *const l = L + vp8_block2left[24]; type = PLANE_TYPE_Y2; - c = vp8_decode_coefs(dx, xd, a, l, type, + c = decode_coefs(dx, xd, bc, a, l, type, #if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM16X16 - DCT_DCT, + DCT_DCT, #endif - seg_eob, qcoeff_ptr + 24 * 16, 24, - scan, TX_4X4, 
coef_bands_x); + seg_eob, qcoeff_ptr + 24 * 16, 24, + scan, TX_4X4, coef_bands_x); a[0] = l[0] = ((eobs[24] = c) != !type); eobtotal += c - 16; @@ -673,10 +675,7 @@ int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *xd) { type = PLANE_TYPE_UV; #if CONFIG_HYBRIDTRANSFORM - if (type == PLANE_TYPE_Y_WITH_DC) - tx_type = get_tx_type(xd, &xd->block[i]); -#endif -#if CONFIG_HYBRIDTRANSFORM + tx_type = get_tx_type(xd, &xd->block[i]); switch(tx_type) { case ADST_DCT : scan = vp8_row_scan; @@ -692,12 +691,12 @@ int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *xd) { } #endif - c = vp8_decode_coefs(dx, xd, a, l, type, + c = decode_coefs(dx, xd, bc, a, l, type, #if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM16X16 - tx_type, + tx_type, #endif - seg_eob, qcoeff_ptr, - i, scan, TX_4X4, coef_bands_x); + seg_eob, qcoeff_ptr, + i, scan, TX_4X4, coef_bands_x); a[0] = l[0] = ((eobs[i] = c) != !type); eobtotal += c; diff --git a/vp8/decoder/detokenize.h b/vp8/decoder/detokenize.h index d02d4cae1..df36efed5 100644 --- a/vp8/decoder/detokenize.h +++ b/vp8/decoder/detokenize.h @@ -14,9 +14,12 @@ #include "onyxd_int.h" -void vp8_reset_mb_tokens_context(MACROBLOCKD *xd); -int vp8_decode_mb_tokens(VP8D_COMP *, MACROBLOCKD *); -int vp8_decode_mb_tokens_8x8(VP8D_COMP *, MACROBLOCKD *); -int vp8_decode_mb_tokens_16x16(VP8D_COMP *, MACROBLOCKD *); +void vp8_reset_mb_tokens_context(MACROBLOCKD* const); +int vp8_decode_mb_tokens(VP8D_COMP* const, MACROBLOCKD* const, + BOOL_DECODER* const); +int vp8_decode_mb_tokens_8x8(VP8D_COMP* const, MACROBLOCKD* const, + BOOL_DECODER* const); +int vp8_decode_mb_tokens_16x16(VP8D_COMP* const, MACROBLOCKD* const, + BOOL_DECODER* const); #endif /* DETOKENIZE_H */ diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c index aa3ef81b3..2e7751325 100644 --- a/vp8/decoder/onyxd_if.c +++ b/vp8/decoder/onyxd_if.c @@ -149,8 +149,6 @@ VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf) { pbi->decoded_key_frame = 0; - pbi->interleaved_decoding = CONFIG_NEWBESTREFMV || CONFIG_SUPERBLOCKS; - return (VP8D_PTR) pbi; } diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h index 53350b819..0a84256e2 100644 --- a/vp8/decoder/onyxd_int.h +++ b/vp8/decoder/onyxd_int.h @@ -45,7 +45,6 @@ typedef struct { ENTROPY_CONTEXT_PLANES *L; INT16 *qcoeff_start_ptr; - BOOL_DECODER *current_bc; vp8_prob const *coef_probs[BLOCK_TYPES]; vp8_prob const *coef_probs_8x8[BLOCK_TYPES_8X8]; @@ -60,8 +59,6 @@ typedef struct VP8Decompressor { DECLARE_ALIGNED(16, VP8_COMMON, common); - vp8_reader bc, bc2; - VP8D_CONFIG oxcf; @@ -82,8 +79,6 @@ typedef struct VP8Decompressor { int decoded_key_frame; - int interleaved_decoding; - } VP8D_COMP; int vp8_decode_frame(VP8D_COMP *cpi); diff --git a/vp8/encoder/asm_enc_offsets.c b/vp8/encoder/asm_enc_offsets.c index c79e915f8..8e74901b3 100644 --- a/vp8/encoder/asm_enc_offsets.c +++ b/vp8/encoder/asm_enc_offsets.c @@ -68,7 +68,6 @@ DEFINE(vp8_extra_bit_struct_base_val, offsetof(vp8_extra_bit_struct, b DEFINE(vp8_comp_tplist, offsetof(VP8_COMP, tplist)); DEFINE(vp8_comp_common, offsetof(VP8_COMP, common)); -DEFINE(vp8_comp_bc2, offsetof(VP8_COMP, bc2)); DEFINE(tokenlist_start, offsetof(TOKENLIST, start)); DEFINE(tokenlist_stop, offsetof(TOKENLIST, stop)); diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c index 812565915..f74f85fad 100644 --- a/vp8/encoder/bitstream.c +++ b/vp8/encoder/bitstream.c @@ -111,10 +111,10 @@ static int remap_prob(int v, int m) { return i; } -static void write_prob_diff_update(vp8_writer *const 
w, +static void write_prob_diff_update(vp8_writer *const bc, vp8_prob newp, vp8_prob oldp) { int delp = remap_prob(newp, oldp); - vp8_encode_term_subexp(w, delp, SUBEXP_PARAM, 255); + vp8_encode_term_subexp(bc, delp, SUBEXP_PARAM, 255); } static int prob_diff_update_cost(vp8_prob newp, vp8_prob oldp) { @@ -186,7 +186,7 @@ unsigned int pick_best_mv_ref( MACROBLOCK *x, #endif static void update_mode( - vp8_writer *const w, + vp8_writer *const bc, int n, vp8_token tok [/* n */], vp8_tree tree, @@ -212,34 +212,33 @@ static void update_mode( if (new_b + (n << 8) < old_b) { int i = 0; - vp8_write_bit(w, 1); + vp8_write_bit(bc, 1); do { const vp8_prob p = Pnew[i]; - vp8_write_literal(w, Pcur[i] = p ? p : 1, 8); + vp8_write_literal(bc, Pcur[i] = p ? p : 1, 8); } while (++i < n); } else - vp8_write_bit(w, 0); + vp8_write_bit(bc, 0); } -static void update_mbintra_mode_probs(VP8_COMP *cpi) { - VP8_COMMON *const cm = & cpi->common; - - vp8_writer *const w = & cpi->bc; +static void update_mbintra_mode_probs(VP8_COMP* const cpi, + vp8_writer* const bc) { + VP8_COMMON *const cm = &cpi->common; { vp8_prob Pnew [VP8_YMODES - 1]; unsigned int bct [VP8_YMODES - 1] [2]; update_mode( - w, VP8_YMODES, vp8_ymode_encodings, vp8_ymode_tree, + bc, VP8_YMODES, vp8_ymode_encodings, vp8_ymode_tree, Pnew, cm->fc.ymode_prob, bct, (unsigned int *)cpi->ymode_count ); } } -static __inline int get_prob(int num, int den) { +static int get_prob(int num, int den) { int p; if (den <= 0) return 128; @@ -251,33 +250,24 @@ static __inline int get_prob(int num, int den) { return p; } +static int get_binary_prob(int n0, int n1) { + return get_prob(n0, n0 + n1); +} + void update_skip_probs(VP8_COMP *cpi) { - VP8_COMMON *const pc = & cpi->common; + VP8_COMMON *const pc = &cpi->common; int prob_skip_false[3] = {0, 0, 0}; int k; for (k = 0; k < MBSKIP_CONTEXTS; ++k) { - if ((cpi->skip_false_count[k] + cpi->skip_true_count[k])) { - prob_skip_false[k] = - cpi->skip_false_count[k] * 256 / - (cpi->skip_false_count[k] + cpi->skip_true_count[k]); - - if (prob_skip_false[k] <= 1) - prob_skip_false[k] = 1; - - if (prob_skip_false[k] > 255) - prob_skip_false[k] = 255; - } else - prob_skip_false[k] = 128; - - pc->mbskip_pred_probs[k] = prob_skip_false[k]; + pc->mbskip_pred_probs[k] = get_binary_prob(cpi->skip_false_count[k], + cpi->skip_true_count[k]); } } #if CONFIG_SWITCHABLE_INTERP -void update_switchable_interp_probs(VP8_COMP *cpi) { - VP8_COMMON *const pc = & cpi->common; - vp8_writer *const w = & cpi->bc; +void update_switchable_interp_probs(VP8_COMP *cpi, vp8_writer* const bc) { + VP8_COMMON *const pc = &cpi->common; unsigned int branch_ct[32][2]; int i, j; for (j = 0; j <= VP8_SWITCHABLE_FILTERS; ++j) { @@ -301,7 +291,7 @@ void update_switchable_interp_probs(VP8_COMP *cpi) { for (i = 0; i < VP8_SWITCHABLE_FILTERS - 1; ++i) { if (pc->fc.switchable_interp_prob[j][i] < 1) pc->fc.switchable_interp_prob[j][i] = 1; - vp8_write_literal(w, pc->fc.switchable_interp_prob[j][i], 8); + vp8_write_literal(bc, pc->fc.switchable_interp_prob[j][i], 8); /* if (!cpi->dummy_packing) #if VP8_SWITCHABLE_FILTERS == 3 @@ -329,7 +319,7 @@ void update_switchable_interp_probs(VP8_COMP *cpi) { // This function updates the reference frame prediction stats static void update_refpred_stats(VP8_COMP *cpi) { - VP8_COMMON *const cm = & cpi->common; + VP8_COMMON *const cm = &cpi->common; int i; int tot_count; vp8_prob new_pred_probs[PREDICTION_PROBS]; @@ -347,15 +337,8 @@ static void update_refpred_stats(VP8_COMP *cpi) { } else { // From the prediction counts set the 
probabilities for each context for (i = 0; i < PREDICTION_PROBS; i++) { - tot_count = cpi->ref_pred_count[i][0] + cpi->ref_pred_count[i][1]; - if (tot_count) { - new_pred_probs[i] = - (cpi->ref_pred_count[i][0] * 255 + (tot_count >> 1)) / tot_count; - - // Clamp to minimum allowed value - new_pred_probs[i] += !new_pred_probs[i]; - } else - new_pred_probs[i] = 128; + new_pred_probs[i] = get_binary_prob(cpi->ref_pred_count[i][0], + cpi->ref_pred_count[i][1]); // Decide whether or not to update the reference frame probs. // Returned costs are in 1/256 bit units. @@ -454,13 +437,15 @@ static int prob_diff_update_savings_search(const unsigned int *ct, return bestsavings; } -static void pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount) { - const TOKENEXTRA *const stop = p + xcount; +static void pack_mb_tokens(vp8_writer* const bc, + TOKENEXTRA **tp, + const TOKENEXTRA *const stop) { unsigned int split; unsigned int shift; - int count = w->count; - unsigned int range = w->range; - unsigned int lowvalue = w->lowvalue; + int count = bc->count; + unsigned int range = bc->range; + unsigned int lowvalue = bc->lowvalue; + TOKENEXTRA *p = *tp; while (p < stop) { const int t = p->Token; @@ -471,6 +456,12 @@ static void pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount) { int v = a->value; int n = a->Len; + if (t == EOSB_TOKEN) + { + ++p; + break; + } + /* skip one or two nodes */ if (p->skip_eob_node) { n -= p->skip_eob_node; @@ -497,17 +488,17 @@ static void pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount) { int offset = shift - count; if ((lowvalue << (offset - 1)) & 0x80000000) { - int x = w->pos - 1; + int x = bc->pos - 1; - while (x >= 0 && w->buffer[x] == 0xff) { - w->buffer[x] = (unsigned char)0; + while (x >= 0 && bc->buffer[x] == 0xff) { + bc->buffer[x] = (unsigned char)0; x--; } - w->buffer[x] += 1; + bc->buffer[x] += 1; } - w->buffer[w->pos++] = (lowvalue >> (24 - offset)); + bc->buffer[bc->pos++] = (lowvalue >> (24 - offset)); lowvalue <<= offset; shift = count; lowvalue &= 0xffffff; @@ -547,17 +538,17 @@ static void pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount) { int offset = shift - count; if ((lowvalue << (offset - 1)) & 0x80000000) { - int x = w->pos - 1; + int x = bc->pos - 1; - while (x >= 0 && w->buffer[x] == 0xff) { - w->buffer[x] = (unsigned char)0; + while (x >= 0 && bc->buffer[x] == 0xff) { + bc->buffer[x] = (unsigned char)0; x--; } - w->buffer[x] += 1; + bc->buffer[x] += 1; } - w->buffer[w->pos++] = (lowvalue >> (24 - offset)); + bc->buffer[bc->pos++] = (lowvalue >> (24 - offset)); lowvalue <<= offset; shift = count; lowvalue &= 0xffffff; @@ -583,14 +574,14 @@ static void pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount) { range <<= 1; if ((lowvalue & 0x80000000)) { - int x = w->pos - 1; + int x = bc->pos - 1; - while (x >= 0 && w->buffer[x] == 0xff) { - w->buffer[x] = (unsigned char)0; + while (x >= 0 && bc->buffer[x] == 0xff) { + bc->buffer[x] = (unsigned char)0; x--; } - w->buffer[x] += 1; + bc->buffer[x] += 1; } @@ -598,20 +589,19 @@ static void pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount) { if (!++count) { count = -8; - w->buffer[w->pos++] = (lowvalue >> 24); + bc->buffer[bc->pos++] = (lowvalue >> 24); lowvalue &= 0xffffff; } } } - ++p; } - w->count = count; - w->lowvalue = lowvalue; - w->range = range; - + bc->count = count; + bc->lowvalue = lowvalue; + bc->range = range; + *tp = p; } static void write_partition_size(unsigned char *cx_data, int size) { @@ -628,107 +618,108 @@ static void 
write_partition_size(unsigned char *cx_data, int size) { static void write_mv_ref ( - vp8_writer *w, MB_PREDICTION_MODE m, const vp8_prob *p + vp8_writer *bc, MB_PREDICTION_MODE m, const vp8_prob *p ) { #if CONFIG_DEBUG assert(NEARESTMV <= m && m <= SPLITMV); #endif - vp8_write_token(w, vp8_mv_ref_tree, p, + vp8_write_token(bc, vp8_mv_ref_tree, p, vp8_mv_ref_encoding_array - NEARESTMV + m); } #if CONFIG_SUPERBLOCKS -static void write_sb_mv_ref(vp8_writer *w, MB_PREDICTION_MODE m, const vp8_prob *p) { +static void write_sb_mv_ref(vp8_writer *bc, MB_PREDICTION_MODE m, + const vp8_prob *p) { #if CONFIG_DEBUG assert(NEARESTMV <= m && m < SPLITMV); #endif - vp8_write_token(w, vp8_sb_mv_ref_tree, p, + vp8_write_token(bc, vp8_sb_mv_ref_tree, p, vp8_sb_mv_ref_encoding_array - NEARESTMV + m); } #endif static void write_sub_mv_ref ( - vp8_writer *w, B_PREDICTION_MODE m, const vp8_prob *p + vp8_writer *bc, B_PREDICTION_MODE m, const vp8_prob *p ) { #if CONFIG_DEBUG assert(LEFT4X4 <= m && m <= NEW4X4); #endif - vp8_write_token(w, vp8_sub_mv_ref_tree, p, + vp8_write_token(bc, vp8_sub_mv_ref_tree, p, vp8_sub_mv_ref_encoding_array - LEFT4X4 + m); } #if CONFIG_NEWMVENTROPY -static void write_nmv (vp8_writer *w, const MV *mv, const int_mv *ref, - const nmv_context *nmvc, int usehp) { +static void write_nmv(vp8_writer *bc, const MV *mv, const int_mv *ref, + const nmv_context *nmvc, int usehp) { MV e; e.row = mv->row - ref->as_mv.row; e.col = mv->col - ref->as_mv.col; - vp8_encode_nmv(w, &e, &ref->as_mv, nmvc); - vp8_encode_nmv_fp(w, &e, &ref->as_mv, nmvc, usehp); + vp8_encode_nmv(bc, &e, &ref->as_mv, nmvc); + vp8_encode_nmv_fp(bc, &e, &ref->as_mv, nmvc, usehp); } #else static void write_mv ( - vp8_writer *w, const MV *mv, const int_mv *ref, const MV_CONTEXT *mvc + vp8_writer *bc, const MV *mv, const int_mv *ref, const MV_CONTEXT *mvc ) { MV e; e.row = mv->row - ref->as_mv.row; e.col = mv->col - ref->as_mv.col; - vp8_encode_motion_vector(w, &e, mvc); + vp8_encode_motion_vector(bc, &e, mvc); } static void write_mv_hp ( - vp8_writer *w, const MV *mv, const int_mv *ref, const MV_CONTEXT_HP *mvc + vp8_writer *bc, const MV *mv, const int_mv *ref, const MV_CONTEXT_HP *mvc ) { MV e; e.row = mv->row - ref->as_mv.row; e.col = mv->col - ref->as_mv.col; - vp8_encode_motion_vector_hp(w, &e, mvc); + vp8_encode_motion_vector_hp(bc, &e, mvc); } #endif /* CONFIG_NEWMVENTROPY */ // This function writes the current macro block's segnment id to the bitstream // It should only be called if a segment map update is indicated. -static void write_mb_segid(vp8_writer *w, +static void write_mb_segid(vp8_writer *bc, const MB_MODE_INFO *mi, const MACROBLOCKD *xd) { // Encode the MB segment id. 
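/* Aside (editor's sketch, not part of the patch): the four segment ids are
 * coded with a fixed two-level binary tree over mb_segment_tree_probs[]:
 *
 *              p[0]
 *             /    \
 *          p[1]    p[2]
 *          /  \    /  \
 *         0    1  2    3
 *
 * The first bit (prob p[0]) picks the {0,1} vs {2,3} subtree; the second
 * bit uses p[1] on the left and p[2] on the right, so e.g. segment id 2
 * is coded as vp8_write(bc, 1, p[0]); vp8_write(bc, 0, p[2]); -- exactly
 * what the switch below spells out case by case.  These are the same
 * probabilities the decoder now reads inline in vp8_decode_frame().
 */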
if (xd->segmentation_enabled && xd->update_mb_segmentation_map) { switch (mi->segment_id) { case 0: - vp8_write(w, 0, xd->mb_segment_tree_probs[0]); - vp8_write(w, 0, xd->mb_segment_tree_probs[1]); + vp8_write(bc, 0, xd->mb_segment_tree_probs[0]); + vp8_write(bc, 0, xd->mb_segment_tree_probs[1]); break; case 1: - vp8_write(w, 0, xd->mb_segment_tree_probs[0]); - vp8_write(w, 1, xd->mb_segment_tree_probs[1]); + vp8_write(bc, 0, xd->mb_segment_tree_probs[0]); + vp8_write(bc, 1, xd->mb_segment_tree_probs[1]); break; case 2: - vp8_write(w, 1, xd->mb_segment_tree_probs[0]); - vp8_write(w, 0, xd->mb_segment_tree_probs[2]); + vp8_write(bc, 1, xd->mb_segment_tree_probs[0]); + vp8_write(bc, 0, xd->mb_segment_tree_probs[2]); break; case 3: - vp8_write(w, 1, xd->mb_segment_tree_probs[0]); - vp8_write(w, 1, xd->mb_segment_tree_probs[2]); + vp8_write(bc, 1, xd->mb_segment_tree_probs[0]); + vp8_write(bc, 1, xd->mb_segment_tree_probs[2]); break; // TRAP.. This should not happen default: - vp8_write(w, 0, xd->mb_segment_tree_probs[0]); - vp8_write(w, 0, xd->mb_segment_tree_probs[1]); + vp8_write(bc, 0, xd->mb_segment_tree_probs[0]); + vp8_write(bc, 0, xd->mb_segment_tree_probs[1]); break; } } } // This function encodes the reference frame -static void encode_ref_frame(vp8_writer *const w, +static void encode_ref_frame(vp8_writer *const bc, VP8_COMMON *const cm, MACROBLOCKD *xd, int segment_id, @@ -765,7 +756,7 @@ static void encode_ref_frame(vp8_writer *const w, (xd->mode_info_context->mbmi.ref_frame == pred_rf); set_pred_flag(xd, PRED_REF, prediction_flag); - vp8_write(w, prediction_flag, pred_prob); + vp8_write(bc, prediction_flag, pred_prob); // If not predicted correctly then code value explicitly if (!prediction_flag) { @@ -787,18 +778,18 @@ static void encode_ref_frame(vp8_writer *const w, } if (mod_refprobs[0]) { - vp8_write(w, (rf != INTRA_FRAME), mod_refprobs[0]); + vp8_write(bc, (rf != INTRA_FRAME), mod_refprobs[0]); } // Inter coded if (rf != INTRA_FRAME) { if (mod_refprobs[1]) { - vp8_write(w, (rf != LAST_FRAME), mod_refprobs[1]); + vp8_write(bc, (rf != LAST_FRAME), mod_refprobs[1]); } if (rf != LAST_FRAME) { if (mod_refprobs[2]) { - vp8_write(w, (rf != GOLDEN_FRAME), mod_refprobs[2]); + vp8_write(bc, (rf != GOLDEN_FRAME), mod_refprobs[2]); } } } @@ -811,40 +802,25 @@ static void encode_ref_frame(vp8_writer *const w, // Update the probabilities used to encode reference frame data static void update_ref_probs(VP8_COMP *const cpi) { - VP8_COMMON *const cm = & cpi->common; + VP8_COMMON *const cm = &cpi->common; const int *const rfct = cpi->count_mb_ref_frame_usage; const int rf_intra = rfct[INTRA_FRAME]; const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]; - cm->prob_intra_coded = (rf_intra + rf_inter) - ? rf_intra * 255 / (rf_intra + rf_inter) : 1; - - if (!cm->prob_intra_coded) - cm->prob_intra_coded = 1; - - cm->prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128; - - if (!cm->prob_last_coded) - cm->prob_last_coded = 1; - - cm->prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) - ? 
(rfct[GOLDEN_FRAME] * 255) / - (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128; - - if (!cm->prob_gf_coded) - cm->prob_gf_coded = 1; + cm->prob_intra_coded = get_binary_prob(rf_intra, rf_inter); + cm->prob_last_coded = get_prob(rfct[LAST_FRAME], rf_inter); + cm->prob_gf_coded = get_binary_prob(rfct[GOLDEN_FRAME], rfct[ALTREF_FRAME]); // Compute a modified set of probabilities to use when prediction of the // reference frame fails compute_mod_refprobs(cm); } -static void pack_inter_mode_mvs(VP8_COMP *const cpi) { +static void pack_inter_mode_mvs(VP8_COMP *const cpi, vp8_writer *const bc) { int i; - VP8_COMMON *const pc = & cpi->common; - vp8_writer *const w = & cpi->bc; + VP8_COMMON *const pc = &cpi->common; #if CONFIG_NEWMVENTROPY const nmv_context *nmvc = &pc->fc.nmvc; #else @@ -855,6 +831,8 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { MACROBLOCKD *xd = &cpi->mb.e_mbd; MODE_INFO *m; MODE_INFO *prev_m; + TOKENEXTRA *tok = cpi->tok; + TOKENEXTRA *tok_end = tok + cpi->tok_count; const int mis = pc->mode_info_stride; int mb_row, mb_col; @@ -871,73 +849,6 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { cpi->mb.partition_info = cpi->mb.pi; - // Update the probabilities used to encode reference frame data - update_ref_probs(cpi); - -#ifdef ENTROPY_STATS - active_section = 1; -#endif - - if (pc->mb_no_coeff_skip) { - int k; - - update_skip_probs(cpi); - for (k = 0; k < MBSKIP_CONTEXTS; ++k) - vp8_write_literal(w, pc->mbskip_pred_probs[k], 8); - } - -#if CONFIG_PRED_FILTER - // Write the prediction filter mode used for this frame - vp8_write_literal(w, pc->pred_filter_mode, 2); - - // Write prediction filter on/off probability if signaling at MB level - if (pc->pred_filter_mode == 2) - vp8_write_literal(w, pc->prob_pred_filter_off, 8); - - // printf("pred_filter_mode:%d prob_pred_filter_off:%d\n", - // pc->pred_filter_mode, pc->prob_pred_filter_off); -#endif -#if CONFIG_SWITCHABLE_INTERP - if (pc->mcomp_filter_type == SWITCHABLE) - update_switchable_interp_probs(cpi); -#endif - - vp8_write_literal(w, pc->prob_intra_coded, 8); - vp8_write_literal(w, pc->prob_last_coded, 8); - vp8_write_literal(w, pc->prob_gf_coded, 8); - - if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) { - vp8_write(w, 1, 128); - vp8_write(w, 1, 128); - for (i = 0; i < COMP_PRED_CONTEXTS; i++) { - if (cpi->single_pred_count[i] + cpi->comp_pred_count[i]) { - pc->prob_comppred[i] = cpi->single_pred_count[i] * 255 / - (cpi->single_pred_count[i] + cpi->comp_pred_count[i]); - if (pc->prob_comppred[i] < 1) - pc->prob_comppred[i] = 1; - } else { - pc->prob_comppred[i] = 128; - } - vp8_write_literal(w, pc->prob_comppred[i], 8); - } - } else if (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY) { - vp8_write(w, 0, 128); - } else { /* compound prediction only */ - vp8_write(w, 1, 128); - vp8_write(w, 0, 128); - } - - update_mbintra_mode_probs(cpi); - -#if CONFIG_NEWMVENTROPY - vp8_write_nmvprobs(cpi, xd->allow_high_precision_mv); -#else - if (xd->allow_high_precision_mv) - vp8_write_mvprobs_hp(cpi); - else - vp8_write_mvprobs(cpi); -#endif - mb_row = 0; for (row = 0; row < pc->mb_rows; row += 2) { m = pc->mi + row * mis; @@ -950,7 +861,7 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { // Process the 4 MBs in the order: // top-left, top-right, bottom-left, bottom-right #if CONFIG_SUPERBLOCKS - vp8_write(w, m->mbmi.encoded_as_sb, pc->sb_coded); + vp8_write(bc, m->mbmi.encoded_as_sb, pc->sb_coded); #endif for (i = 0; i < 4; i++) { MB_MODE_INFO *mi; @@ -972,7 +883,7 @@ static void 
pack_inter_mode_mvs(VP8_COMP *const cpi) { continue; } - mi = & m->mbmi; + mi = &m->mbmi; rf = mi->ref_frame; mode = mi->mode; segment_id = mi->segment_id; @@ -1000,14 +911,14 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { pred_prob = get_pred_prob(pc, xd, PRED_SEG_ID); // Code the segment id prediction flag for this mb - vp8_write(w, prediction_flag, pred_prob); + vp8_write(bc, prediction_flag, pred_prob); // If the mb segment id wasn't predicted code explicitly if (!prediction_flag) - write_mb_segid(w, mi, &cpi->mb.e_mbd); + write_mb_segid(bc, mi, &cpi->mb.e_mbd); } else { // Normal unpredicted coding - write_mb_segid(w, mi, &cpi->mb.e_mbd); + write_mb_segid(bc, mi, &cpi->mb.e_mbd); } } @@ -1022,12 +933,12 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { skip_coeff &= m[mis + 1].mbmi.mb_skip_coeff; } #endif - vp8_encode_bool(w, skip_coeff, + vp8_encode_bool(bc, skip_coeff, get_pred_prob(pc, xd, PRED_MBSKIP)); } // Encode the reference frame. - encode_ref_frame(w, pc, xd, segment_id, rf); + encode_ref_frame(bc, pc, xd, segment_id, rf); if (rf == INTRA_FRAME) { #ifdef ENTROPY_STATS @@ -1037,7 +948,7 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { // TODO(rbultje) write using SB tree structure if (!segfeature_active(xd, segment_id, SEG_LVL_MODE)) { - write_ymode(w, mode, pc->fc.ymode_prob); + write_ymode(bc, mode, pc->fc.ymode_prob); } if (mode == B_PRED) { @@ -1046,32 +957,40 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { int uses_second = m->bmi[0].as_mode.second != (B_PREDICTION_MODE)(B_DC_PRED - 1); - vp8_write(w, uses_second, 128); + vp8_write(bc, uses_second, 128); #endif do { #if CONFIG_COMP_INTRA_PRED B_PREDICTION_MODE mode2 = m->bmi[j].as_mode.second; #endif - write_bmode(w, m->bmi[j].as_mode.first, + write_bmode(bc, m->bmi[j].as_mode.first, pc->fc.bmode_prob); + /* + if (!cpi->dummy_packing) { + int p; + for (p = 0; p < VP8_BINTRAMODES - 1; ++p) + printf(" %d", pc->fc.bmode_prob[p]); + printf("\nbmode[%d][%d]: %d\n", pc->current_video_frame, j, m->bmi[j].as_mode.first); + } + */ #if CONFIG_COMP_INTRA_PRED if (uses_second) { - write_bmode(w, mode2, pc->fc.bmode_prob); + write_bmode(bc, mode2, pc->fc.bmode_prob); } #endif } while (++j < 16); } if (mode == I8X8_PRED) { - write_i8x8_mode(w, m->bmi[0].as_mode.first, + write_i8x8_mode(bc, m->bmi[0].as_mode.first, pc->fc.i8x8_mode_prob); - write_i8x8_mode(w, m->bmi[2].as_mode.first, + write_i8x8_mode(bc, m->bmi[2].as_mode.first, pc->fc.i8x8_mode_prob); - write_i8x8_mode(w, m->bmi[8].as_mode.first, + write_i8x8_mode(bc, m->bmi[8].as_mode.first, pc->fc.i8x8_mode_prob); - write_i8x8_mode(w, m->bmi[10].as_mode.first, + write_i8x8_mode(bc, m->bmi[10].as_mode.first, pc->fc.i8x8_mode_prob); } else { - write_uv_mode(w, mi->uv_mode, + write_uv_mode(bc, mi->uv_mode, pc->fc.uv_mode_prob[mode]); } } else { @@ -1103,11 +1022,11 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { if (!segfeature_active(xd, segment_id, SEG_LVL_MODE)) { #if CONFIG_SUPERBLOCKS if (mi->encoded_as_sb) { - write_sb_mv_ref(w, mode, mv_ref_p); + write_sb_mv_ref(bc, mode, mv_ref_p); } else #endif { - write_mv_ref(w, mode, mv_ref_p); + write_mv_ref(bc, mode, mv_ref_p); } vp8_accum_mv_refs(&cpi->common, mode, ct); } @@ -1116,7 +1035,7 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { // Is the prediction filter enabled if (mode >= NEARESTMV && mode < SPLITMV) { if (cpi->common.pred_filter_mode == 2) - vp8_write(w, mi->pred_filter_enabled, + vp8_write(bc, mi->pred_filter_enabled, pc->prob_pred_filter_off); else 
assert(mi->pred_filter_enabled == @@ -1127,7 +1046,7 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { if (mode >= NEARESTMV && mode <= SPLITMV) { if (cpi->common.mcomp_filter_type == SWITCHABLE) { - vp8_write_token(w, vp8_switchable_interp_tree, + vp8_write_token(bc, vp8_switchable_interp_tree, get_pred_probs(&cpi->common, xd, PRED_SWITCHABLE_INTERP), vp8_switchable_interp_encodings + vp8_switchable_interp_map[mi->interp_filter]); @@ -1155,7 +1074,7 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { // does the feature use compound prediction or not // (if not specified at the frame/segment level) if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) { - vp8_write(w, mi->second_ref_frame != INTRA_FRAME, + vp8_write(bc, mi->second_ref_frame != INTRA_FRAME, get_pred_prob(pc, xd, PRED_COMP)); } @@ -1181,14 +1100,14 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { } #endif #if CONFIG_NEWMVENTROPY - write_nmv(w, &mi->mv[0].as_mv, &best_mv, + write_nmv(bc, &mi->mv[0].as_mv, &best_mv, (const nmv_context*) nmvc, xd->allow_high_precision_mv); #else if (xd->allow_high_precision_mv) { - write_mv_hp(w, &mi->mv[0].as_mv, &best_mv, mvc_hp); + write_mv_hp(bc, &mi->mv[0].as_mv, &best_mv, mvc_hp); } else { - write_mv(w, &mi->mv[0].as_mv, &best_mv, mvc); + write_mv(bc, &mi->mv[0].as_mv, &best_mv, mvc); } #endif @@ -1208,14 +1127,14 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { cpi->best_ref_index_counts[best_index]++; #endif #if CONFIG_NEWMVENTROPY - write_nmv(w, &mi->mv[1].as_mv, &best_second_mv, + write_nmv(bc, &mi->mv[1].as_mv, &best_second_mv, (const nmv_context*) nmvc, xd->allow_high_precision_mv); #else if (xd->allow_high_precision_mv) { - write_mv_hp(w, &mi->mv[1].as_mv, &best_second_mv, mvc_hp); + write_mv_hp(bc, &mi->mv[1].as_mv, &best_second_mv, mvc_hp); } else { - write_mv(w, &mi->mv[1].as_mv, &best_second_mv, mvc); + write_mv(bc, &mi->mv[1].as_mv, &best_second_mv, mvc); } #endif } @@ -1227,7 +1146,7 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { ++count_mb_seg [mi->partitioning]; #endif - write_split(w, mi->partitioning, cpi->common.fc.mbsplit_prob); + write_split(bc, mi->partitioning, cpi->common.fc.mbsplit_prob); cpi->mbsplit_count[mi->partitioning]++; do { @@ -1252,7 +1171,7 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { abovemv.as_int = above_block_mv(m, k, mis); mv_contz = vp8_mv_cont(&leftmv, &abovemv); - write_sub_mv_ref(w, blockmode, + write_sub_mv_ref(bc, blockmode, cpi->common.fc.sub_mv_ref_prob [mv_contz]); cpi->sub_mv_ref_count[mv_contz][blockmode - LEFT4X4]++; if (blockmode == NEW4X4) { @@ -1260,33 +1179,37 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { active_section = 11; #endif #if CONFIG_NEWMVENTROPY - write_nmv(w, &blockmv.as_mv, &best_mv, + write_nmv(bc, &blockmv.as_mv, &best_mv, (const nmv_context*) nmvc, xd->allow_high_precision_mv); #else if (xd->allow_high_precision_mv) { - write_mv_hp(w, &blockmv.as_mv, &best_mv, + write_mv_hp(bc, &blockmv.as_mv, &best_mv, (const MV_CONTEXT_HP *) mvc_hp); } else { - write_mv(w, &blockmv.as_mv, &best_mv, + write_mv(bc, &blockmv.as_mv, &best_mv, (const MV_CONTEXT *) mvc); } #endif if (mi->second_ref_frame) { #if CONFIG_NEWMVENTROPY - write_nmv(w, + write_nmv(bc, &cpi->mb.partition_info->bmi[j].second_mv.as_mv, &best_second_mv, (const nmv_context*) nmvc, xd->allow_high_precision_mv); #else if (xd->allow_high_precision_mv) { - write_mv_hp(w, &cpi->mb.partition_info->bmi[j].second_mv.as_mv, - &best_second_mv, (const MV_CONTEXT_HP *) mvc_hp); + write_mv_hp( + bc, + 
&cpi->mb.partition_info->bmi[j].second_mv.as_mv, + &best_second_mv, (const MV_CONTEXT_HP *)mvc_hp); } else { - write_mv(w, &cpi->mb.partition_info->bmi[j].second_mv.as_mv, - &best_second_mv, (const MV_CONTEXT *) mvc); + write_mv( + bc, + &cpi->mb.partition_info->bmi[j].second_mv.as_mv, + &best_second_mv, (const MV_CONTEXT *) mvc); } #endif } @@ -1309,12 +1232,18 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) { TX_SIZE sz = mi->txfm_size; // FIXME(rbultje) code ternary symbol once all experiments are merged - vp8_write(w, sz != TX_4X4, pc->prob_tx[0]); + vp8_write(bc, sz != TX_4X4, pc->prob_tx[0]); if (sz != TX_4X4 && mode != I8X8_PRED) - vp8_write(w, sz != TX_8X8, pc->prob_tx[1]); + vp8_write(bc, sz != TX_8X8, pc->prob_tx[1]); } #endif +#ifdef ENTROPY_STATS + active_section = 1; +#endif + assert(tok < tok_end); + pack_mb_tokens(bc, &tok, tok_end); + #if CONFIG_SUPERBLOCKS if (m->mbmi.encoded_as_sb) { assert(!i); @@ -1348,9 +1277,110 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { } -static void write_kfmodes(VP8_COMP *cpi) { - vp8_writer *const bc = & cpi->bc; - VP8_COMMON *const c = & cpi->common; +static void write_mb_modes_kf(const VP8_COMMON *c, + const MACROBLOCKD *xd, + const MODE_INFO *m, + int mode_info_stride, + vp8_writer *const bc) { + const int mis = mode_info_stride; + int ym; + int segment_id; + + ym = m->mbmi.mode; + segment_id = m->mbmi.segment_id; + + if (xd->update_mb_segmentation_map) { + write_mb_segid(bc, &m->mbmi, xd); + } + + if (c->mb_no_coeff_skip && + (!segfeature_active(xd, segment_id, SEG_LVL_EOB) || + (get_segdata(xd, segment_id, SEG_LVL_EOB) != 0))) { + int skip_coeff = m->mbmi.mb_skip_coeff; +#if CONFIG_SUPERBLOCKS + if (m->mbmi.encoded_as_sb) { + skip_coeff &= m[1].mbmi.mb_skip_coeff; + skip_coeff &= m[mis].mbmi.mb_skip_coeff; + skip_coeff &= m[mis + 1].mbmi.mb_skip_coeff; + } +#endif + vp8_encode_bool(bc, skip_coeff, + get_pred_prob(c, xd, PRED_MBSKIP)); + } + +#if CONFIG_SUPERBLOCKS + if (m->mbmi.encoded_as_sb) { + sb_kfwrite_ymode(bc, ym, + c->sb_kf_ymode_prob[c->kf_ymode_probs_index]); + } else +#endif + { + kfwrite_ymode(bc, ym, + c->kf_ymode_prob[c->kf_ymode_probs_index]); + } + + if (ym == B_PRED) { + const int mis = c->mode_info_stride; + int i = 0; +#if CONFIG_COMP_INTRA_PRED + int uses_second = + m->bmi[0].as_mode.second != + (B_PREDICTION_MODE)(B_DC_PRED - 1); + vp8_write(bc, uses_second, 128); +#endif + do { + const B_PREDICTION_MODE A = above_block_mode(m, i, mis); + const B_PREDICTION_MODE L = left_block_mode(m, i); + const int bm = m->bmi[i].as_mode.first; +#if CONFIG_COMP_INTRA_PRED + const int bm2 = m->bmi[i].as_mode.second; +#endif + +#ifdef ENTROPY_STATS + ++intra_mode_stats [A] [L] [bm]; +#endif + + write_bmode(bc, bm, c->kf_bmode_prob [A] [L]); + // printf(" mode: %d\n", bm); +#if CONFIG_COMP_INTRA_PRED + if (uses_second) { + write_bmode(bc, bm2, c->kf_bmode_prob [A] [L]); + } +#endif + } while (++i < 16); + } + if (ym == I8X8_PRED) { + write_i8x8_mode(bc, m->bmi[0].as_mode.first, + c->fc.i8x8_mode_prob); + // printf(" mode: %d\n", m->bmi[0].as_mode.first); fflush(stdout); + write_i8x8_mode(bc, m->bmi[2].as_mode.first, + c->fc.i8x8_mode_prob); + // printf(" mode: %d\n", m->bmi[2].as_mode.first); fflush(stdout); + write_i8x8_mode(bc, m->bmi[8].as_mode.first, + c->fc.i8x8_mode_prob); + // printf(" mode: %d\n", m->bmi[8].as_mode.first); fflush(stdout); + write_i8x8_mode(bc, m->bmi[10].as_mode.first, + c->fc.i8x8_mode_prob); + // printf(" mode: %d\n", 
m->bmi[10].as_mode.first); fflush(stdout); + } else + write_uv_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]); + +#if CONFIG_TX_SELECT + if (ym <= I8X8_PRED && c->txfm_mode == TX_MODE_SELECT && + !((c->mb_no_coeff_skip && m->mbmi.mb_skip_coeff) || + (segfeature_active(xd, segment_id, SEG_LVL_EOB) && + get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) { + TX_SIZE sz = m->mbmi.txfm_size; + // FIXME(rbultje) code ternary symbol once all experiments are merged + vp8_write(bc, sz != TX_4X4, c->prob_tx[0]); + if (sz != TX_4X4 && ym <= TM_PRED) + vp8_write(bc, sz != TX_8X8, c->prob_tx[1]); + } +#endif +} + +static void write_kfmodes(VP8_COMP* const cpi, vp8_writer* const bc) { + VP8_COMMON *const c = &cpi->common; const int mis = c->mode_info_stride; MACROBLOCKD *xd = &cpi->mb.e_mbd; MODE_INFO *m; @@ -1359,16 +1389,8 @@ static void write_kfmodes(VP8_COMP *cpi) { int mb_row, mb_col; int row_delta[4] = { 0, +1, 0, -1}; int col_delta[4] = { +1, -1, +1, +1}; - - if (c->mb_no_coeff_skip) { - update_skip_probs(cpi); - for (i = 0; i < MBSKIP_CONTEXTS; ++i) - vp8_write_literal(bc, c->mbskip_pred_probs[i], 8); - } - - if (!c->kf_ymode_probs_update) { - vp8_write_literal(bc, c->kf_ymode_probs_index, 3); - } + TOKENEXTRA *tok = cpi->tok; + TOKENEXTRA *tok_end = tok + cpi->tok_count; mb_row = 0; for (row = 0; row < c->mb_rows; row += 2) { @@ -1382,8 +1404,6 @@ static void write_kfmodes(VP8_COMP *cpi) { // Process the 4 MBs in the order: // top-left, top-right, bottom-left, bottom-right for (i = 0; i < 4; i++) { - int ym; - int segment_id; int dy = row_delta[i]; int dx = col_delta[i]; int offset_extended = dy * mis + dx; @@ -1399,97 +1419,12 @@ static void write_kfmodes(VP8_COMP *cpi) { // Make sure the MacroBlockD mode info pointer is set correctly xd->mode_info_context = m; - ym = m->mbmi.mode; - segment_id = m->mbmi.segment_id; - - if (cpi->mb.e_mbd.update_mb_segmentation_map) { - write_mb_segid(bc, &m->mbmi, &cpi->mb.e_mbd); - } - - if (c->mb_no_coeff_skip && - (!segfeature_active(xd, segment_id, SEG_LVL_EOB) || - (get_segdata(xd, segment_id, SEG_LVL_EOB) != 0))) { - int skip_coeff = m->mbmi.mb_skip_coeff; -#if CONFIG_SUPERBLOCKS - if (m->mbmi.encoded_as_sb) { - skip_coeff &= m[1].mbmi.mb_skip_coeff; - skip_coeff &= m[mis].mbmi.mb_skip_coeff; - skip_coeff &= m[mis + 1].mbmi.mb_skip_coeff; - } -#endif - vp8_encode_bool(bc, skip_coeff, - get_pred_prob(c, xd, PRED_MBSKIP)); - } - -#if CONFIG_SUPERBLOCKS - if (m->mbmi.encoded_as_sb) { - sb_kfwrite_ymode(bc, ym, - c->sb_kf_ymode_prob[c->kf_ymode_probs_index]); - } else -#endif - { - kfwrite_ymode(bc, ym, - c->kf_ymode_prob[c->kf_ymode_probs_index]); - } - - if (ym == B_PRED) { - const int mis = c->mode_info_stride; - int i = 0; -#if CONFIG_COMP_INTRA_PRED - int uses_second = - m->bmi[0].as_mode.second != - (B_PREDICTION_MODE)(B_DC_PRED - 1); - vp8_write(bc, uses_second, 128); -#endif - do { - const B_PREDICTION_MODE A = above_block_mode(m, i, mis); - const B_PREDICTION_MODE L = left_block_mode(m, i); - const int bm = m->bmi[i].as_mode.first; -#if CONFIG_COMP_INTRA_PRED - const int bm2 = m->bmi[i].as_mode.second; -#endif - + write_mb_modes_kf(c, xd, m, mis, bc); #ifdef ENTROPY_STATS - ++intra_mode_stats [A] [L] [bm]; -#endif - - write_bmode(bc, bm, c->kf_bmode_prob [A] [L]); - // printf(" mode: %d\n", bm); -#if CONFIG_COMP_INTRA_PRED - if (uses_second) { - write_bmode(bc, bm2, c->kf_bmode_prob [A] [L]); - } -#endif - } while (++i < 16); - } - if (ym == I8X8_PRED) { - write_i8x8_mode(bc, m->bmi[0].as_mode.first, - c->fc.i8x8_mode_prob); - // printf(" mode: 
%d\n", m->bmi[0].as_mode.first); fflush(stdout); - write_i8x8_mode(bc, m->bmi[2].as_mode.first, - c->fc.i8x8_mode_prob); - // printf(" mode: %d\n", m->bmi[2].as_mode.first); fflush(stdout); - write_i8x8_mode(bc, m->bmi[8].as_mode.first, - c->fc.i8x8_mode_prob); - // printf(" mode: %d\n", m->bmi[8].as_mode.first); fflush(stdout); - write_i8x8_mode(bc, m->bmi[10].as_mode.first, - c->fc.i8x8_mode_prob); - // printf(" mode: %d\n", m->bmi[10].as_mode.first); fflush(stdout); - } else - write_uv_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]); - -#if CONFIG_TX_SELECT - if (ym <= I8X8_PRED && c->txfm_mode == TX_MODE_SELECT && - !((c->mb_no_coeff_skip && m->mbmi.mb_skip_coeff) || - (segfeature_active(xd, segment_id, SEG_LVL_EOB) && - get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) { - TX_SIZE sz = m->mbmi.txfm_size; - // FIXME(rbultje) code ternary symbol once all experiments are merged - vp8_write(bc, sz != TX_4X4, c->prob_tx[0]); - if (sz != TX_4X4 && ym <= TM_PRED) - vp8_write(bc, sz != TX_8X8, c->prob_tx[1]); - } + active_section = 8; #endif + assert(tok < tok_end); + pack_mb_tokens(bc, &tok, tok_end); #if CONFIG_SUPERBLOCKS if (m->mbmi.encoded_as_sb) { @@ -1686,7 +1621,7 @@ void build_coeff_contexts(VP8_COMP *cpi) { static void update_coef_probs2(VP8_COMP *cpi) { const vp8_prob grpupd = 192; int i, j, k, t; - vp8_writer *const w = & cpi->bc; + vp8_writer *const w = &cpi->bc; int update[2]; int savings; @@ -1851,9 +1786,8 @@ static void update_coef_probs2(VP8_COMP *cpi) { } #endif -static void update_coef_probs(VP8_COMP *cpi) { +static void update_coef_probs(VP8_COMP* const cpi, vp8_writer* const bc) { int i, j, k, t; - vp8_writer *const w = & cpi->bc; int update[2] = {0, 0}; int savings; @@ -1906,10 +1840,10 @@ static void update_coef_probs(VP8_COMP *cpi) { // printf("Update %d %d, savings %d\n", update[0], update[1], savings); /* Is coef updated at all */ - if (update[1] == 0 || savings < 0) - vp8_write_bit(w, 0); - else { - vp8_write_bit(w, 1); + if (update[1] == 0 || savings < 0) { + vp8_write_bit(bc, 0); + } else { + vp8_write_bit(bc, 1); for (i = 0; i < BLOCK_TYPES; ++i) { for (j = !i; j < COEF_BANDS; ++j) { int prev_coef_savings[ENTROPY_NODES] = {0}; @@ -1937,14 +1871,14 @@ static void update_coef_probs(VP8_COMP *cpi) { if (s > 0) u = 1; #endif - vp8_write(w, u, upd); + vp8_write(bc, u, upd); #ifdef ENTROPY_STATS if (!cpi->dummy_packing) ++ tree_update_hist [i][j][k][t] [u]; #endif if (u) { /* send/use new probability */ - write_prob_diff_update(w, newp, *Pold); + write_prob_diff_update(bc, newp, *Pold); *Pold = newp; } } @@ -1996,10 +1930,10 @@ static void update_coef_probs(VP8_COMP *cpi) { // printf("Update %d %d, savings %d\n", update[0], update[1], savings); /* Is coef updated at all */ - if (update[1] == 0 || savings < 0) - vp8_write_bit(w, 0); - else { - vp8_write_bit(w, 1); + if (update[1] == 0 || savings < 0) { + vp8_write_bit(bc, 0); + } else { + vp8_write_bit(bc, 1); for (i = 0; i < BLOCK_TYPES; ++i) { for (j = !i; j < COEF_BANDS; ++j) { int prev_coef_savings[ENTROPY_NODES] = {0}; @@ -2027,14 +1961,14 @@ static void update_coef_probs(VP8_COMP *cpi) { if (s > 0) u = 1; #endif - vp8_write(w, u, upd); + vp8_write(bc, u, upd); #ifdef ENTROPY_STATS if (!cpi->dummy_packing) ++ hybrid_tree_update_hist [i][j][k][t] [u]; #endif if (u) { /* send/use new probability */ - write_prob_diff_update(w, newp, *Pold); + write_prob_diff_update(bc, newp, *Pold); *Pold = newp; } } @@ -2081,10 +2015,10 @@ static void update_coef_probs(VP8_COMP *cpi) { } } - if (update[1] == 0 || savings < 0) - 
vp8_write_bit(w, 0); - else { - vp8_write_bit(w, 1); + if (update[1] == 0 || savings < 0) { + vp8_write_bit(bc, 0); + } else { + vp8_write_bit(bc, 1); for (i = 0; i < BLOCK_TYPES_8X8; ++i) { for (j = !i; j < COEF_BANDS; ++j) { for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { @@ -2105,14 +2039,14 @@ static void update_coef_probs(VP8_COMP *cpi) { s = prob_update_savings(ct, oldp, newp, upd); u = s > 0 ? 1 : 0; #endif - vp8_write(w, u, upd); + vp8_write(bc, u, upd); #ifdef ENTROPY_STATS if (!cpi->dummy_packing) ++ tree_update_hist_8x8 [i][j][k][t] [u]; #endif if (u) { /* send/use new probability */ - write_prob_diff_update(w, newp, oldp); + write_prob_diff_update(bc, newp, oldp); *Pold = newp; } } @@ -2155,10 +2089,10 @@ static void update_coef_probs(VP8_COMP *cpi) { } } - if (update[1] == 0 || savings < 0) - vp8_write_bit(w, 0); - else { - vp8_write_bit(w, 1); + if (update[1] == 0 || savings < 0) { + vp8_write_bit(bc, 0); + } else { + vp8_write_bit(bc, 1); for (i = 0; i < BLOCK_TYPES_8X8; ++i) { for (j = !i; j < COEF_BANDS; ++j) { for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { @@ -2179,14 +2113,14 @@ static void update_coef_probs(VP8_COMP *cpi) { s = prob_update_savings(ct, oldp, newp, upd); u = s > 0 ? 1 : 0; #endif - vp8_write(w, u, upd); + vp8_write(bc, u, upd); #ifdef ENTROPY_STATS if (!cpi->dummy_packing) ++ hybrid_tree_update_hist_8x8 [i][j][k][t] [u]; #endif if (u) { /* send/use new probability */ - write_prob_diff_update(w, newp, oldp); + write_prob_diff_update(bc, newp, oldp); *Pold = newp; } } @@ -2233,10 +2167,10 @@ static void update_coef_probs(VP8_COMP *cpi) { } } - if (update[1] == 0 || savings < 0) - vp8_write_bit(w, 0); - else { - vp8_write_bit(w, 1); + if (update[1] == 0 || savings < 0) { + vp8_write_bit(bc, 0); + } else { + vp8_write_bit(bc, 1); for (i = 0; i < BLOCK_TYPES_16X16; ++i) { for (j = !i; j < COEF_BANDS; ++j) { for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { @@ -2257,14 +2191,14 @@ static void update_coef_probs(VP8_COMP *cpi) { s = prob_update_savings(ct, oldp, newp, upd); u = s > 0 ? 1 : 0; #endif - vp8_write(w, u, upd); + vp8_write(bc, u, upd); #ifdef ENTROPY_STATS if (!cpi->dummy_packing) ++tree_update_hist_16x16[i][j][k][t][u]; #endif if (u) { /* send/use new probability */ - write_prob_diff_update(w, newp, oldp); + write_prob_diff_update(bc, newp, oldp); *Pold = newp; } } @@ -2307,10 +2241,10 @@ static void update_coef_probs(VP8_COMP *cpi) { } } - if (update[1] == 0 || savings < 0) - vp8_write_bit(w, 0); - else { - vp8_write_bit(w, 1); + if (update[1] == 0 || savings < 0) { + vp8_write_bit(bc, 0); + } else { + vp8_write_bit(bc, 1); for (i = 0; i < BLOCK_TYPES_16X16; ++i) { for (j = !i; j < COEF_BANDS; ++j) { for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { @@ -2331,14 +2265,14 @@ static void update_coef_probs(VP8_COMP *cpi) { s = prob_update_savings(ct, oldp, newp, upd); u = s > 0 ? 
1 : 0; #endif - vp8_write(w, u, upd); + vp8_write(bc, u, upd); #ifdef ENTROPY_STATS if (!cpi->dummy_packing) ++hybrid_tree_update_hist_16x16[i][j][k][t][u]; #endif if (u) { /* send/use new probability */ - write_prob_diff_update(w, newp, oldp); + write_prob_diff_update(bc, newp, oldp); *Pold = newp; } } @@ -2402,7 +2336,7 @@ static void segment_reference_frames(VP8_COMP *cpi) { int ref[MAX_MB_SEGMENTS] = {0}; int i, j; int mb_index = 0; - MACROBLOCKD *const xd = & cpi->mb.e_mbd; + MACROBLOCKD *const xd = &cpi->mb.e_mbd; for (i = 0; i < oci->mb_rows; i++) { for (j = 0; j < oci->mb_cols; j++, mb_index++) { @@ -2419,9 +2353,9 @@ static void segment_reference_frames(VP8_COMP *cpi) { void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size) { int i, j; VP8_HEADER oh; - VP8_COMMON *const pc = & cpi->common; - vp8_writer *const bc = & cpi->bc; - MACROBLOCKD *const xd = & cpi->mb.e_mbd; + VP8_COMMON *const pc = &cpi->common; + vp8_writer header_bc, residual_bc; + MACROBLOCKD *const xd = &cpi->mb.e_mbd; int extra_bytes_packed = 0; unsigned char *cx_data = dest; @@ -2464,40 +2398,65 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size) extra_bytes_packed = 7; cx_data += extra_bytes_packed; - vp8_start_encode(bc, cx_data); + vp8_start_encode(&header_bc, cx_data); // signal clr type - vp8_write_bit(bc, pc->clr_type); - vp8_write_bit(bc, pc->clamp_type); + vp8_write_bit(&header_bc, pc->clr_type); + vp8_write_bit(&header_bc, pc->clamp_type); - } else - vp8_start_encode(bc, cx_data); + } else { + vp8_start_encode(&header_bc, cx_data); + } // Signal whether or not Segmentation is enabled - vp8_write_bit(bc, (xd->segmentation_enabled) ? 1 : 0); + vp8_write_bit(&header_bc, (xd->segmentation_enabled) ? 1 : 0); // Indicate which features are enabled if (xd->segmentation_enabled) { // Indicate whether or not the segmentation map is being updated. - vp8_write_bit(bc, (xd->update_mb_segmentation_map) ? 1 : 0); + vp8_write_bit(&header_bc, (xd->update_mb_segmentation_map) ? 1 : 0); // If it is, then indicate the method that will be used. if (xd->update_mb_segmentation_map) { // Select the coding strategy (temporal or spatial) choose_segmap_coding_method(cpi); + // Send the tree probabilities used to decode unpredicted + // macro-block segments + for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) { + int data = xd->mb_segment_tree_probs[i]; + + if (data != 255) { + vp8_write_bit(&header_bc, 1); + vp8_write_literal(&header_bc, data, 8); + } else { + vp8_write_bit(&header_bc, 0); + } + } // Write out the chosen coding method. - vp8_write_bit(bc, (pc->temporal_update) ? 1 : 0); + vp8_write_bit(&header_bc, (pc->temporal_update) ? 1 : 0); + if (pc->temporal_update) { + for (i = 0; i < PREDICTION_PROBS; i++) { + int data = pc->segment_pred_probs[i]; + + if (data != 255) { + vp8_write_bit(&header_bc, 1); + vp8_write_literal(&header_bc, data, 8); + } else { + vp8_write_bit(&header_bc, 0); + } + } + } } - vp8_write_bit(bc, (xd->update_mb_segmentation_data) ? 1 : 0); + vp8_write_bit(&header_bc, (xd->update_mb_segmentation_data) ? 1 : 0); // segment_reference_frames(cpi); if (xd->update_mb_segmentation_data) { signed char Data; - vp8_write_bit(bc, (xd->mb_segment_abs_delta) ? 1 : 0); + vp8_write_bit(&header_bc, (xd->mb_segment_abs_delta) ? 1 : 0); // For each segments id... 
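/* Aside (editor's sketch, not part of the patch): several deleted blocks in
 * this file open-coded the same count-to-probability conversion; the patch
 * funnels them through get_prob() and the new get_binary_prob() (see
 * update_skip_probs and update_ref_probs above; pc->sb_coded a few hunks
 * below calls get_prob directly).  The middle of get_prob's body is elided
 * by the hunk context; from the open-coded versions it replaces, a
 * plausible reconstruction is:
 *
 *   static int get_prob(int num, int den) {
 *     int p;
 *     if (den <= 0) return 128;            // no samples: flat prior
 *     p = (num * 255 + (den >> 1)) / den;  // rounded scale to 8 bits
 *     return p < 1 ? 1 : p;                // probability 0 is not codable
 *   }
 *
 * get_binary_prob(n0, n1) is then just get_prob(n0, n0 + n1); e.g. counts
 * (3, 1) give (3 * 255 + 2) / 4 = 191.
 */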
for (i = 0; i < MAX_MB_SEGMENTS; i++) { @@ -2510,67 +2469,67 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size) // check if there's an update if (segfeature_changed(xd, i, j)) { - vp8_write_bit(bc, 1); + vp8_write_bit(&header_bc, 1); if (segfeature_active(xd, i, j)) { // this bit is to say we are still // active/ if we were inactive // this is unnecessary if (old_segfeature_active(xd, i, j)) { - vp8_write_bit(bc, 1); + vp8_write_bit(&header_bc, 1); } // Is the segment data signed.. if (is_segfeature_signed(j)) { // Encode the relevant feature data if (Data < 0) { Data = - Data; - vp8_write_literal(bc, Data, + vp8_write_literal(&header_bc, Data, seg_feature_data_bits(j)); - vp8_write_bit(bc, 1); + vp8_write_bit(&header_bc, 1); } else { - vp8_write_literal(bc, Data, + vp8_write_literal(&header_bc, Data, seg_feature_data_bits(j)); - vp8_write_bit(bc, 0); + vp8_write_bit(&header_bc, 0); } } // Unsigned data element so no sign bit needed else - vp8_write_literal(bc, Data, + vp8_write_literal(&header_bc, Data, seg_feature_data_bits(j)); } // feature is inactive now else if (old_segfeature_active(xd, i, j)) { - vp8_write_bit(bc, 0); + vp8_write_bit(&header_bc, 0); } } else { - vp8_write_bit(bc, 0); + vp8_write_bit(&header_bc, 0); } #else // If the feature is enabled... if (segfeature_active(xd, i, j)) { - vp8_write_bit(bc, 1); + vp8_write_bit(&header_bc, 1); // Is the segment data signed.. if (is_segfeature_signed(j)) { // Encode the relevant feature data if (Data < 0) { Data = - Data; - vp8_write_literal(bc, Data, + vp8_write_literal(&header_bc, Data, seg_feature_data_bits(j)); - vp8_write_bit(bc, 1); + vp8_write_bit(&header_bc, 1); } else { - vp8_write_literal(bc, Data, + vp8_write_literal(&header_bc, Data, seg_feature_data_bits(j)); - vp8_write_bit(bc, 0); + vp8_write_bit(&header_bc, 0); } } // Unsigned data element so no sign bit needed else - vp8_write_literal(bc, Data, + vp8_write_literal(&header_bc, Data, seg_feature_data_bits(j)); } else - vp8_write_bit(bc, 0); + vp8_write_bit(&header_bc, 0); #endif } } @@ -2581,33 +2540,6 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size) save_segment_info(xd); #endif - if (xd->update_mb_segmentation_map) { - // Send the tree probabilities used to decode unpredicted - // macro-block segments - for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) { - int Data = xd->mb_segment_tree_probs[i]; - - if (Data != 255) { - vp8_write_bit(bc, 1); - vp8_write_literal(bc, Data, 8); - } else - vp8_write_bit(bc, 0); - } - - // If predictive coding of segment map is enabled send the - // prediction probabilities. 
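Both the loop being removed here and its relocated copy earlier in the function use the same optional-update layout: 255 is a sentinel for "no explicit probability", and anything else is sent as a flag bit followed by an 8-bit literal. Sketched as a standalone helper under the same writer API (the function name is ours, not the patch's):

/* Hypothetical helper: optional 8-bit probability, 255 = keep default. */
static void write_optional_prob(vp8_writer *bc, int prob) {
  if (prob != 255) {
    vp8_write_bit(bc, 1);            /* an update follows */
    vp8_write_literal(bc, prob, 8);  /* the probability itself */
  } else {
    vp8_write_bit(bc, 0);            /* keep the default */
  }
}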
- if (pc->temporal_update) { - for (i = 0; i < PREDICTION_PROBS; i++) { - int Data = pc->segment_pred_probs[i]; - - if (Data != 255) { - vp8_write_bit(bc, 1); - vp8_write_literal(bc, Data, 8); - } else - vp8_write_bit(bc, 0); - } - } - } } // Encode the common prediction model status flag probability updates for @@ -2616,23 +2548,21 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size) if (pc->frame_type != KEY_FRAME) { for (i = 0; i < PREDICTION_PROBS; i++) { if (cpi->ref_pred_probs_update[i]) { - vp8_write_bit(bc, 1); - vp8_write_literal(bc, pc->ref_pred_probs[i], 8); - } else - vp8_write_bit(bc, 0); + vp8_write_bit(&header_bc, 1); + vp8_write_literal(&header_bc, pc->ref_pred_probs[i], 8); + } else { + vp8_write_bit(&header_bc, 0); + } } } #if CONFIG_SUPERBLOCKS { /* sb mode probability */ - int sb_coded = 256 - (cpi->sb_count << 8) / (((pc->mb_rows + 1) >> 1) * ((pc->mb_cols + 1) >> 1)); - if (sb_coded <= 0) - sb_coded = 1; - else if (sb_coded >= 256) - sb_coded = 255; - pc->sb_coded = sb_coded; - vp8_write_literal(bc, pc->sb_coded, 8); + const int sb_max = (((pc->mb_rows + 1) >> 1) * ((pc->mb_cols + 1) >> 1)); + + pc->sb_coded = get_prob(sb_max - cpi->sb_count, sb_max); + vp8_write_literal(&header_bc, pc->sb_coded, 8); } #endif @@ -2647,29 +2577,29 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size) pc->prob_tx[0] = 128; pc->prob_tx[1] = 128; } - vp8_write_literal(bc, pc->txfm_mode, 2); + vp8_write_literal(&header_bc, pc->txfm_mode, 2); if (pc->txfm_mode == TX_MODE_SELECT) { - vp8_write_literal(bc, pc->prob_tx[0], 8); - vp8_write_literal(bc, pc->prob_tx[1], 8); + vp8_write_literal(&header_bc, pc->prob_tx[0], 8); + vp8_write_literal(&header_bc, pc->prob_tx[1], 8); } } #else - vp8_write_bit(bc, !!pc->txfm_mode); + vp8_write_bit(&header_bc, !!pc->txfm_mode); #endif // Encode the loop filter level and type - vp8_write_bit(bc, pc->filter_type); - vp8_write_literal(bc, pc->filter_level, 6); - vp8_write_literal(bc, pc->sharpness_level, 3); + vp8_write_bit(&header_bc, pc->filter_type); + vp8_write_literal(&header_bc, pc->filter_level, 6); + vp8_write_literal(&header_bc, pc->sharpness_level, 3); // Write out loop filter deltas applied at the MB level based on mode or ref frame (if they are enabled). - vp8_write_bit(bc, (xd->mode_ref_lf_delta_enabled) ? 1 : 0); + vp8_write_bit(&header_bc, (xd->mode_ref_lf_delta_enabled) ? 
1 : 0); if (xd->mode_ref_lf_delta_enabled) { // Do the deltas need to be updated int send_update = xd->mode_ref_lf_delta_update; - vp8_write_bit(bc, send_update); + vp8_write_bit(&header_bc, send_update); if (send_update) { int Data; @@ -2680,18 +2610,19 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size) // Frame level data if (xd->ref_lf_deltas[i] != xd->last_ref_lf_deltas[i]) { xd->last_ref_lf_deltas[i] = xd->ref_lf_deltas[i]; - vp8_write_bit(bc, 1); + vp8_write_bit(&header_bc, 1); if (Data > 0) { - vp8_write_literal(bc, (Data & 0x3F), 6); - vp8_write_bit(bc, 0); // sign + vp8_write_literal(&header_bc, (Data & 0x3F), 6); + vp8_write_bit(&header_bc, 0); // sign } else { Data = -Data; - vp8_write_literal(bc, (Data & 0x3F), 6); - vp8_write_bit(bc, 1); // sign + vp8_write_literal(&header_bc, (Data & 0x3F), 6); + vp8_write_bit(&header_bc, 1); // sign } - } else - vp8_write_bit(bc, 0); + } else { + vp8_write_bit(&header_bc, 0); + } } // Send update @@ -2700,41 +2631,42 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size) if (xd->mode_lf_deltas[i] != xd->last_mode_lf_deltas[i]) { xd->last_mode_lf_deltas[i] = xd->mode_lf_deltas[i]; - vp8_write_bit(bc, 1); + vp8_write_bit(&header_bc, 1); if (Data > 0) { - vp8_write_literal(bc, (Data & 0x3F), 6); - vp8_write_bit(bc, 0); // sign + vp8_write_literal(&header_bc, (Data & 0x3F), 6); + vp8_write_bit(&header_bc, 0); // sign } else { Data = -Data; - vp8_write_literal(bc, (Data & 0x3F), 6); - vp8_write_bit(bc, 1); // sign + vp8_write_literal(&header_bc, (Data & 0x3F), 6); + vp8_write_bit(&header_bc, 1); // sign } - } else - vp8_write_bit(bc, 0); + } else { + vp8_write_bit(&header_bc, 0); + } } } } // signal here is multi token partition is enabled - // vp8_write_literal(bc, pc->multi_token_partition, 2); - vp8_write_literal(bc, 0, 2); + // vp8_write_literal(&header_bc, pc->multi_token_partition, 2); + vp8_write_literal(&header_bc, 0, 2); // Frame Q baseline quantizer index - vp8_write_literal(bc, pc->base_qindex, QINDEX_BITS); + vp8_write_literal(&header_bc, pc->base_qindex, QINDEX_BITS); // Transmit Dc, Second order and Uv quantizer delta information - put_delta_q(bc, pc->y1dc_delta_q); - put_delta_q(bc, pc->y2dc_delta_q); - put_delta_q(bc, pc->y2ac_delta_q); - put_delta_q(bc, pc->uvdc_delta_q); - put_delta_q(bc, pc->uvac_delta_q); + put_delta_q(&header_bc, pc->y1dc_delta_q); + put_delta_q(&header_bc, pc->y2dc_delta_q); + put_delta_q(&header_bc, pc->y2ac_delta_q); + put_delta_q(&header_bc, pc->uvdc_delta_q); + put_delta_q(&header_bc, pc->uvac_delta_q); // When there is a key frame all reference buffers are updated using the new key frame if (pc->frame_type != KEY_FRAME) { // Should the GF or ARF be updated using the transmitted frame or buffer - vp8_write_bit(bc, pc->refresh_golden_frame); - vp8_write_bit(bc, pc->refresh_alt_ref_frame); + vp8_write_bit(&header_bc, pc->refresh_golden_frame); + vp8_write_bit(&header_bc, pc->refresh_alt_ref_frame); // For inter frames the current default behavior is that when // cm->refresh_golden_frame is set we copy the old GF over to @@ -2744,17 +2676,17 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size) // If not being updated from current frame should either GF or ARF be updated from another buffer if (!pc->refresh_golden_frame) - vp8_write_literal(bc, pc->copy_buffer_to_gf, 2); + vp8_write_literal(&header_bc, pc->copy_buffer_to_gf, 2); if (!pc->refresh_alt_ref_frame) - vp8_write_literal(bc, pc->copy_buffer_to_arf, 2); + 
vp8_write_literal(&header_bc, pc->copy_buffer_to_arf, 2); // Indicate reference frame sign bias for Golden and ARF frames (always 0 for last frame buffer) - vp8_write_bit(bc, pc->ref_frame_sign_bias[GOLDEN_FRAME]); - vp8_write_bit(bc, pc->ref_frame_sign_bias[ALTREF_FRAME]); + vp8_write_bit(&header_bc, pc->ref_frame_sign_bias[GOLDEN_FRAME]); + vp8_write_bit(&header_bc, pc->ref_frame_sign_bias[ALTREF_FRAME]); // Signal whether to allow high MV precision - vp8_write_bit(bc, (xd->allow_high_precision_mv) ? 1 : 0); + vp8_write_bit(&header_bc, (xd->allow_high_precision_mv) ? 1 : 0); #if CONFIG_SWITCHABLE_INTERP if (pc->mcomp_filter_type == SWITCHABLE) { /* Check to see if only one of the filters is actually used */ @@ -2778,16 +2710,16 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size) } } // Signal the type of subpel filter to use - vp8_write_bit(bc, (pc->mcomp_filter_type == SWITCHABLE)); + vp8_write_bit(&header_bc, (pc->mcomp_filter_type == SWITCHABLE)); if (pc->mcomp_filter_type != SWITCHABLE) #endif /* CONFIG_SWITCHABLE_INTERP */ - vp8_write_literal(bc, (pc->mcomp_filter_type), 2); + vp8_write_literal(&header_bc, (pc->mcomp_filter_type), 2); } - vp8_write_bit(bc, pc->refresh_entropy_probs); + vp8_write_bit(&header_bc, pc->refresh_entropy_probs); if (pc->frame_type != KEY_FRAME) - vp8_write_bit(bc, pc->refresh_last_frame); + vp8_write_bit(&header_bc, pc->refresh_last_frame); #ifdef ENTROPY_STATS if (pc->frame_type == INTER_FRAME) @@ -2827,34 +2759,86 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size) vp8_zero(cpi->common.fc.mv_ref_ct) vp8_zero(cpi->common.fc.mv_ref_ct_a) - update_coef_probs(cpi); + update_coef_probs(cpi, &header_bc); #ifdef ENTROPY_STATS active_section = 2; #endif // Write out the mb_no_coeff_skip flag - vp8_write_bit(bc, pc->mb_no_coeff_skip); + vp8_write_bit(&header_bc, pc->mb_no_coeff_skip); + if (pc->mb_no_coeff_skip) { + int k; - if (pc->frame_type == KEY_FRAME) { - decide_kf_ymode_entropy(cpi); - write_kfmodes(cpi); + update_skip_probs(cpi); + for (k = 0; k < MBSKIP_CONTEXTS; ++k) + vp8_write_literal(&header_bc, pc->mbskip_pred_probs[k], 8); + } -#ifdef ENTROPY_STATS - active_section = 8; -#endif + if (pc->frame_type == KEY_FRAME) { + if (!pc->kf_ymode_probs_update) { + vp8_write_literal(&header_bc, pc->kf_ymode_probs_index, 3); + } } else { - pack_inter_mode_mvs(cpi); - vp8_update_mode_context(&cpi->common); + // Update the probabilities used to encode reference frame data + update_ref_probs(cpi); #ifdef ENTROPY_STATS active_section = 1; #endif + +#if CONFIG_PRED_FILTER + // Write the prediction filter mode used for this frame + vp8_write_literal(&header_bc, pc->pred_filter_mode, 2); + + // Write prediction filter on/off probability if signaling at MB level + if (pc->pred_filter_mode == 2) + vp8_write_literal(&header_bc, pc->prob_pred_filter_off, 8); + +#endif +#if CONFIG_SWITCHABLE_INTERP + if (pc->mcomp_filter_type == SWITCHABLE) + update_switchable_interp_probs(cpi, &header_bc); +#endif + + vp8_write_literal(&header_bc, pc->prob_intra_coded, 8); + vp8_write_literal(&header_bc, pc->prob_last_coded, 8); + vp8_write_literal(&header_bc, pc->prob_gf_coded, 8); + + { + const int comp_pred_mode = cpi->common.comp_pred_mode; + const int use_compound_pred = (comp_pred_mode != SINGLE_PREDICTION_ONLY); + const int use_hybrid_pred = (comp_pred_mode == HYBRID_PREDICTION); + + vp8_write(&header_bc, use_compound_pred, 128); + if (use_compound_pred) { + vp8_write(&header_bc, use_hybrid_pred, 128); + if 
(use_hybrid_pred) { + for (i = 0; i < COMP_PRED_CONTEXTS; i++) { + pc->prob_comppred[i] = get_binary_prob(cpi->single_pred_count[i], + cpi->comp_pred_count[i]); + vp8_write_literal(&header_bc, pc->prob_comppred[i], 8); + } + } + } + } + + update_mbintra_mode_probs(cpi, &header_bc); + +#if CONFIG_NEWMVENTROPY + vp8_write_nmvprobs(cpi, xd->allow_high_precision_mv, &header_bc); +#else + if (xd->allow_high_precision_mv) { + vp8_write_mvprobs_hp(cpi, &header_bc); + } else { + vp8_write_mvprobs(cpi, &header_bc); + } +#endif } - vp8_stop_encode(bc); + vp8_stop_encode(&header_bc); - oh.first_partition_length_in_bytes = cpi->bc.pos; + oh.first_partition_length_in_bytes = header_bc.pos; /* update frame tag */ { @@ -2868,15 +2852,21 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size) dest[2] = v >> 16; } - *size = VP8_HEADER_SIZE + extra_bytes_packed + cpi->bc.pos; + *size = VP8_HEADER_SIZE + extra_bytes_packed + header_bc.pos; + vp8_start_encode(&residual_bc, cx_data + header_bc.pos); - vp8_start_encode(&cpi->bc2, cx_data + bc->pos); + if (pc->frame_type == KEY_FRAME) { + decide_kf_ymode_entropy(cpi); + write_kfmodes(cpi, &residual_bc); + } else { + pack_inter_mode_mvs(cpi, &residual_bc); + vp8_update_mode_context(&cpi->common); + } - pack_tokens(&cpi->bc2, cpi->tok, cpi->tok_count); - vp8_stop_encode(&cpi->bc2); + vp8_stop_encode(&residual_bc); - *size += cpi->bc2.pos; + *size += residual_bc.pos; } @@ -2899,14 +2889,9 @@ void print_tree_update_probs() { for (k = 0; k < PREV_COEF_CONTEXTS; k++) { fprintf(f, " {"); for (l = 0; l < ENTROPY_NODES; l++) { - Sum = tree_update_hist[i][j][k][l][0] + tree_update_hist[i][j][k][l][1]; - if (Sum > 0) { - if (((tree_update_hist[i][j][k][l][0] * 255) / Sum) > 0) - fprintf(f, "%3ld, ", (tree_update_hist[i][j][k][l][0] * 255) / Sum); - else - fprintf(f, "%3ld, ", 1); - } else - fprintf(f, "%3ld, ", 128); + fprintf(f, "%3ld, ", + get_binary_prob(tree_update_hist[i][j][k][l][0], + tree_update_hist[i][j][k][l][1])); } fprintf(f, "},\n"); } @@ -2928,14 +2913,9 @@ void print_tree_update_probs() { for (k = 0; k < PREV_COEF_CONTEXTS; k++) { fprintf(f, " {"); for (l = 0; l < MAX_ENTROPY_TOKENS - 1; l++) { - Sum = tree_update_hist_8x8[i][j][k][l][0] + tree_update_hist_8x8[i][j][k][l][1]; - if (Sum > 0) { - if (((tree_update_hist_8x8[i][j][k][l][0] * 255) / Sum) > 0) - fprintf(f, "%3ld, ", (tree_update_hist_8x8[i][j][k][l][0] * 255) / Sum); - else - fprintf(f, "%3ld, ", 1); - } else - fprintf(f, "%3ld, ", 128); + fprintf(f, "%3ld, ", + get_binary_prob(tree_update_hist_8x8[i][j][k][l][0], + tree_update_hist_8x8[i][j][k][l][1])); } fprintf(f, "},\n"); } @@ -2956,14 +2936,9 @@ void print_tree_update_probs() { for (k = 0; k < PREV_COEF_CONTEXTS; k++) { fprintf(f, " {"); for (l = 0; l < MAX_ENTROPY_TOKENS - 1; l++) { - Sum = tree_update_hist_16x16[i][j][k][l][0] + tree_update_hist_16x16[i][j][k][l][1]; - if (Sum > 0) { - if (((tree_update_hist_16x16[i][j][k][l][0] * 255) / Sum) > 0) - fprintf(f, "%3ld, ", (tree_update_hist_16x16[i][j][k][l][0] * 255) / Sum); - else - fprintf(f, "%3ld, ", 1); - } else - fprintf(f, "%3ld, ", 128); + fprintf(f, "%3ld, ", + get_binary_prob(tree_update_hist_16x16[i][j][k][l][0], + tree_update_hist_16x16[i][j][k][l][1])); } fprintf(f, "},\n"); } diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h index 861700409..80f9b75b8 100644 --- a/vp8/encoder/block.h +++ b/vp8/encoder/block.h @@ -170,7 +170,6 @@ typedef struct { #endif int optimize; - int q_index; // Structure to hold context for each of the 4 MBs within a SB: 
// when encoded as 4 independent MBs: diff --git a/vp8/encoder/dct.c b/vp8/encoder/dct.c index d81a547d2..cd13fec7c 100644 --- a/vp8/encoder/dct.c +++ b/vp8/encoder/dct.c @@ -419,6 +419,7 @@ void vp8_fht_c(short *input, short *output, int pitch, // pointers to vertical and horizontal transforms float *ptv, *pth; + assert(tx_type != DCT_DCT); // load and convert residual array into floating-point for(j = 0; j < tx_dim; j++) { for(i = 0; i < tx_dim; i++) { diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index f784cf434..adfbfc79b 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -120,8 +120,8 @@ static unsigned int tt_activity_measure(VP8_COMP *cpi, MACROBLOCK *x) { * lambda using a non-linear combination (e.g., the smallest, or second * smallest, etc.). */ - act = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)(x->src.y_buffer, - x->src.y_stride, VP8_VAR_OFFS, 0, &sse); + act = vp8_variance16x16(x->src.y_buffer, x->src.y_stride, VP8_VAR_OFFS, 0, + &sse); act = act << 4; /* If the region is flat, lower the activity some more. */ @@ -222,7 +222,7 @@ static void calc_av_activity(VP8_COMP *cpi, int64_t activity_sum) { #if USE_ACT_INDEX // Calculate and activity index for each mb static void calc_activity_index(VP8_COMP *cpi, MACROBLOCK *x) { - VP8_COMMON *const cm = & cpi->common; + VP8_COMMON *const cm = &cpi->common; int mb_row, mb_col; int64_t act; @@ -276,9 +276,9 @@ static void calc_activity_index(VP8_COMP *cpi, MACROBLOCK *x) { // Loop through all MBs. Note activity of each, average activity and // calculate a normalized activity for each static void build_activity_map(VP8_COMP *cpi) { - MACROBLOCK *const x = & cpi->mb; + MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *xd = &x->e_mbd; - VP8_COMMON *const cm = & cpi->common; + VP8_COMMON *const cm = &cpi->common; #if ALT_ACT_MEASURE YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx]; @@ -1051,9 +1051,6 @@ static void encode_sb(VP8_COMP *cpi, cpi->inter_zz_count++; } - // TODO Partitioning is broken! 
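The assert added to vp8_fht_c above makes the new contract explicit: DCT_DCT never reaches the hybrid transform. Callers ask get_tx_type() for the block's transform type and fall back to the ordinary DCT/IDCT pair when it reports DCT_DCT. Condensed from the encodeintra.c hunks below (8x8 case, with the #if guards dropped for readability):

/* Dispatch shape the encode paths converge on in this patch. */
tx_type = get_tx_type(xd, xd->block + idx);
if (tx_type != DCT_DCT) {
  /* ADST/DCT hybrid path: forward transform, quantize, inverse */
  vp8_fht_c(be->src_diff, (x->block + idx)->coeff, 32, tx_type, 8);
  x->quantize_b_8x8(x->block + idx, xd->block + idx);
  vp8_ihtllm_c(xd->block[idx].dqcoeff, xd->block[ib].diff, 32, tx_type, 8);
} else {
  /* plain DCT path; vp8_fht_c would assert on DCT_DCT */
  x->vp8_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
  x->quantize_b_8x8(x->block + idx, xd->block + idx);
  vp8_idct_idct8(xd->block[idx].dqcoeff, xd->block[ib].diff, 32);
}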
- cpi->tplist[mb_row].stop = *tp; - #if CONFIG_SUPERBLOCKS if (xd->mode_info_context->mbmi.encoded_as_sb) { x->src.y_buffer += 32; @@ -1064,7 +1061,10 @@ static void encode_sb(VP8_COMP *cpi, x->partition_info += 2; xd->mode_info_context += 2; xd->prev_mode_info_context += 2; - + + (*tp)->Token = EOSB_TOKEN; + (*tp)++; + if (mb_row < cm->mb_rows) cpi->tplist[mb_row].stop = *tp; break; } #endif @@ -1086,6 +1086,9 @@ static void encode_sb(VP8_COMP *cpi, assert((xd->prev_mode_info_context - cpi->common.prev_mip) == (xd->mode_info_context - cpi->common.mip)); #endif + (*tp)->Token = EOSB_TOKEN; + (*tp)++; + if (mb_row < cm->mb_rows) cpi->tplist[mb_row].stop = *tp; } // debug output @@ -1216,9 +1219,9 @@ void encode_sb_row(VP8_COMP *cpi, } void init_encode_frame_mb_context(VP8_COMP *cpi) { - MACROBLOCK *const x = & cpi->mb; - VP8_COMMON *const cm = & cpi->common; - MACROBLOCKD *const xd = & x->e_mbd; + MACROBLOCK *const x = &cpi->mb; + VP8_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; // GF active flags data structure x->gf_active_ptr = (signed char *)cpi->gf_active_flags; @@ -1287,9 +1290,9 @@ void init_encode_frame_mb_context(VP8_COMP *cpi) { static void encode_frame_internal(VP8_COMP *cpi) { int mb_row; - MACROBLOCK *const x = & cpi->mb; - VP8_COMMON *const cm = & cpi->common; - MACROBLOCKD *const xd = & x->e_mbd; + MACROBLOCK *const x = &cpi->mb; + VP8_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; TOKENEXTRA *tp = cpi->tok; int totalrate; @@ -1719,7 +1722,7 @@ void vp8_build_block_offsets(MACROBLOCK *x) { } static void sum_intra_stats(VP8_COMP *cpi, MACROBLOCK *x) { - const MACROBLOCKD *xd = & x->e_mbd; + const MACROBLOCKD *xd = &x->e_mbd; const MB_PREDICTION_MODE m = xd->mode_info_context->mbmi.mode; const MB_PREDICTION_MODE uvm = xd->mode_info_context->mbmi.uv_mode; @@ -1928,7 +1931,7 @@ void vp8cx_encode_intra_super_block(VP8_COMP *cpi, update_sb_skip_coeff_state(cpi, x, ta, tl, tp, t, skip); } } -#endif +#endif /* CONFIG_SUPERBLOCKS */ void vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, @@ -1939,14 +1942,15 @@ void vp8cx_encode_intra_macro_block(VP8_COMP *cpi, adjust_act_zbin(cpi, x); vp8_update_zbin_extra(cpi, x); } - if (mbmi->mode == I8X8_PRED) { vp8_encode_intra8x8mby(IF_RTCD(&cpi->rtcd), x); vp8_encode_intra8x8mbuv(IF_RTCD(&cpi->rtcd), x); - } else if (mbmi->mode == B_PRED) + } else if (mbmi->mode == B_PRED) { + vp8_intra_prediction_down_copy(&x->e_mbd); vp8_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x); - else + } else { vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x); + } if (mbmi->mode != I8X8_PRED) { vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x); @@ -2047,6 +2051,7 @@ void vp8cx_encode_inter_macroblock (VP8_COMP *cpi, MACROBLOCK *x, if (mbmi->ref_frame == INTRA_FRAME) { if (mbmi->mode == B_PRED) { + vp8_intra_prediction_down_copy(xd); vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x); vp8_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x); } else if (mbmi->mode == I8X8_PRED) { diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c index 703a1015e..ff5395f2d 100644 --- a/vp8/encoder/encodeintra.c +++ b/vp8/encoder/encodeintra.c @@ -48,7 +48,7 @@ int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) { } } - intra_pred_var = VARIANCE_INVOKE(&cpi->rtcd.variance, getmbss)(x->src_diff); + intra_pred_var = vp8_get_mb_ss(x->src_diff); return intra_pred_var; } @@ -57,6 +57,9 @@ void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x, int ib) { BLOCKD *b = &x->e_mbd.block[ib]; BLOCK *be 
= &x->block[ib]; +#if CONFIG_HYBRIDTRANSFORM + TX_TYPE tx_type; +#endif #if CONFIG_COMP_INTRA_PRED if (b->bmi.as_mode.second == (B_PREDICTION_MODE)(B_DC_PRED - 1)) { @@ -72,11 +75,11 @@ void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd, ENCODEMB_INVOKE(&rtcd->encodemb, subb)(be, b, 16); #if CONFIG_HYBRIDTRANSFORM - if (x->q_index < ACTIVE_HT) { - txfm_map(b, b->bmi.as_mode.first); - vp8_fht_c(be->src_diff, be->coeff, 32, b->bmi.as_mode.tx_type, 4); - vp8_ht_quantize_b_4x4(be, b); - vp8_ihtllm_c(b->dqcoeff, b->diff, 32, b->bmi.as_mode.tx_type, 4); + tx_type = get_tx_type(&x->e_mbd, b); + if (tx_type != DCT_DCT) { + vp8_fht_c(be->src_diff, be->coeff, 32, tx_type, 4); + vp8_ht_quantize_b_4x4(be, b, tx_type); + vp8_ihtllm_c(b->dqcoeff, b->diff, 32, tx_type, 4); } else #endif { @@ -91,12 +94,6 @@ void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd, void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb) { int i; -#if 0 - MACROBLOCKD *xd = &mb->e_mbd; - // Intra modes requiring top-right MB reconstructed data have been disabled - vp8_intra_prediction_down_copy(xd); -#endif - for (i = 0; i < 16; i++) vp8_encode_intra4x4block(rtcd, mb, i); return; @@ -107,7 +104,7 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) { BLOCK *b = &x->block[0]; TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size; #if CONFIG_HYBRIDTRANSFORM16X16 - TX_TYPE txfm_type = xd->mode_info_context->bmi[0].as_mode.tx_type; + TX_TYPE tx_type; #endif #if CONFIG_COMP_INTRA_PRED @@ -124,16 +121,14 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) { if (tx_size == TX_16X16) { #if CONFIG_HYBRIDTRANSFORM16X16 - if ((xd->mode_info_context->mbmi.mode < I8X8_PRED) && - (x->q_index < ACTIVE_HT16)) { - BLOCKD *bd = &xd->block[0]; - txfm_map(bd, pred_mode_conv(xd->mode_info_context->mbmi.mode)); - txfm_type = bd->bmi.as_mode.tx_type; - vp8_fht_c(b->src_diff, b->coeff, 32, txfm_type, 16); + BLOCKD *bd = &xd->block[0]; + tx_type = get_tx_type(xd, bd); + if (tx_type != DCT_DCT) { + vp8_fht_c(b->src_diff, b->coeff, 32, tx_type, 16); vp8_quantize_mby_16x16(x); if (x->optimize) vp8_optimize_mby_16x16(x, rtcd); - vp8_ihtllm_c(bd->dqcoeff, bd->diff, 32, txfm_type, 16); + vp8_ihtllm_c(bd->dqcoeff, bd->diff, 32, tx_type, 16); } else #endif { @@ -201,6 +196,9 @@ void vp8_encode_intra8x8(const VP8_ENCODER_RTCD *rtcd, BLOCK *be = &x->block[ib]; const int iblock[4] = {0, 1, 4, 5}; int i; +#if CONFIG_HYBRIDTRANSFORM8X8 + TX_TYPE tx_type; +#endif #if CONFIG_COMP_INTRA_PRED if (b->bmi.as_mode.second == (MB_PREDICTION_MODE)(DC_PRED - 1)) { @@ -220,16 +218,20 @@ void vp8_encode_intra8x8(const VP8_ENCODER_RTCD *rtcd, vp8_subtract_4b_c(be, b, 16); #if CONFIG_HYBRIDTRANSFORM8X8 - txfm_map(b, pred_mode_conv(b->bmi.as_mode.first)); - vp8_fht_c(be->src_diff, (x->block + idx)->coeff, 32, - b->bmi.as_mode.tx_type, 8); - x->quantize_b_8x8(x->block + idx, xd->block + idx); - vp8_ihtllm_c(xd->block[idx].dqcoeff, xd->block[ib].diff, 32, - b->bmi.as_mode.tx_type, 8); -#else - x->vp8_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32); - x->quantize_b_8x8(x->block + idx, xd->block + idx); - vp8_idct_idct8(xd->block[idx].dqcoeff, xd->block[ib].diff, 32); + tx_type = get_tx_type(xd, xd->block + idx); + if (tx_type != DCT_DCT) { + vp8_fht_c(be->src_diff, (x->block + idx)->coeff, 32, + tx_type, 8); + x->quantize_b_8x8(x->block + idx, xd->block + idx); + vp8_ihtllm_c(xd->block[idx].dqcoeff, xd->block[ib].diff, 32, + tx_type, 8); + } else { +#endif + 
x->vp8_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32); + x->quantize_b_8x8(x->block + idx, xd->block + idx); + vp8_idct_idct8(xd->block[idx].dqcoeff, xd->block[ib].diff, 32); +#if CONFIG_HYBRIDTRANSFORM8X8 + } #endif } else { for (i = 0; i < 4; i++) { diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c index 50de2f2c0..dc54d05a2 100644 --- a/vp8/encoder/encodemb.c +++ b/vp8/encoder/encodemb.c @@ -267,7 +267,7 @@ static const int plane_rd_mult[4] = { void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, - const VP8_ENCODER_RTCD *rtcd, int tx_type) { + const VP8_ENCODER_RTCD *rtcd, int tx_size) { BLOCK *b; BLOCKD *d; vp8_token_state tokens[65][2]; @@ -298,7 +298,7 @@ void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type, b = &mb->block[i]; d = &mb->e_mbd.block[i]; - switch (tx_type) { + switch (tx_size) { default: case TX_4X4: scan = vp8_default_zig_zag1d; @@ -308,11 +308,9 @@ void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type, // TODO: this isn't called (for intra4x4 modes), but will be left in // since it could be used later { - int active_ht = (mb->q_index < ACTIVE_HT) && - (mb->e_mbd.mode_info_context->mbmi.mode == B_PRED); - - if((type == PLANE_TYPE_Y_WITH_DC) && active_ht) { - switch (d->bmi.as_mode.tx_type) { + TX_TYPE tx_type = get_tx_type(&mb->e_mbd, d); + if (tx_type != DCT_DCT) { + switch (tx_type) { case ADST_DCT: scan = vp8_row_scan; break; @@ -325,9 +323,9 @@ void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type, scan = vp8_default_zig_zag1d; break; } - - } else + } else { scan = vp8_default_zig_zag1d; + } } #endif break; @@ -380,9 +378,9 @@ void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type, band = bands[i + 1]; pt = vp8_prev_token_class[t0]; rate0 += - mb->token_costs[tx_type][type][band][pt][tokens[next][0].token]; + mb->token_costs[tx_size][type][band][pt][tokens[next][0].token]; rate1 += - mb->token_costs[tx_type][type][band][pt][tokens[next][1].token]; + mb->token_costs[tx_size][type][band][pt][tokens[next][1].token]; } UPDATE_RD_COST(); /* And pick the best. */ @@ -427,12 +425,12 @@ void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type, band = bands[i + 1]; if (t0 != DCT_EOB_TOKEN) { pt = vp8_prev_token_class[t0]; - rate0 += mb->token_costs[tx_type][type][band][pt][ + rate0 += mb->token_costs[tx_size][type][band][pt][ tokens[next][0].token]; } if (t1 != DCT_EOB_TOKEN) { pt = vp8_prev_token_class[t1]; - rate1 += mb->token_costs[tx_type][type][band][pt][ + rate1 += mb->token_costs[tx_size][type][band][pt][ tokens[next][1].token]; } } @@ -464,11 +462,11 @@ void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type, t1 = tokens[next][1].token; /* Update the cost of each path if we're past the EOB token. */ if (t0 != DCT_EOB_TOKEN) { - tokens[next][0].rate += mb->token_costs[tx_type][type][band][0][t0]; + tokens[next][0].rate += mb->token_costs[tx_size][type][band][0][t0]; tokens[next][0].token = ZERO_TOKEN; } if (t1 != DCT_EOB_TOKEN) { - tokens[next][1].rate += mb->token_costs[tx_type][type][band][0][t1]; + tokens[next][1].rate += mb->token_costs[tx_size][type][band][0][t1]; tokens[next][1].token = ZERO_TOKEN; } /* Don't update next, because we didn't add a new node. 
*/ @@ -484,8 +482,8 @@ void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type, error1 = tokens[next][1].error; t0 = tokens[next][0].token; t1 = tokens[next][1].token; - rate0 += mb->token_costs[tx_type][type][band][pt][t0]; - rate1 += mb->token_costs[tx_type][type][band][pt][t1]; + rate0 += mb->token_costs[tx_size][type][band][pt][t0]; + rate1 += mb->token_costs[tx_size][type][band][pt][t1]; UPDATE_RD_COST(); best = rd_cost1 < rd_cost0; final_eob = i0 - 1; @@ -652,7 +650,7 @@ void vp8_optimize_mby_8x8(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd) { type = PLANE_TYPE_Y_NO_DC; for (b = 0; b < 16; b += 4) { optimize_b(x, b, type, - ta + vp8_block2above[b], tl + vp8_block2left[b], + ta + vp8_block2above_8x8[b], tl + vp8_block2left_8x8[b], rtcd, TX_8X8); *(ta + vp8_block2above_8x8[b] + 1) = *(ta + vp8_block2above_8x8[b]); *(tl + vp8_block2left_8x8[b] + 1) = *(tl + vp8_block2left_8x8[b]); diff --git a/vp8/encoder/encodemv.c b/vp8/encoder/encodemv.c index 1289d89bb..d520d995a 100644 --- a/vp8/encoder/encodemv.c +++ b/vp8/encoder/encodemv.c @@ -28,38 +28,38 @@ extern unsigned int active_section; nmv_context_counts tnmvcounts; #endif -static void encode_nmv_component(vp8_writer *w, +static void encode_nmv_component(vp8_writer* const bc, int v, int r, - const nmv_component *mvcomp) { + const nmv_component* const mvcomp) { int s, z, c, o, d; assert (v != 0); /* should not be zero */ s = v < 0; - vp8_write(w, s, mvcomp->sign); + vp8_write(bc, s, mvcomp->sign); z = (s ? -v : v) - 1; /* magnitude - 1 */ c = vp8_get_mv_class(z, &o); - vp8_write_token(w, vp8_mv_class_tree, mvcomp->classes, + vp8_write_token(bc, vp8_mv_class_tree, mvcomp->classes, vp8_mv_class_encodings + c); d = (o >> 3); /* int mv data */ if (c == MV_CLASS_0) { - vp8_write_token(w, vp8_mv_class0_tree, mvcomp->class0, + vp8_write_token(bc, vp8_mv_class0_tree, mvcomp->class0, vp8_mv_class0_encodings + d); } else { int i, b; b = c + CLASS0_BITS - 1; /* number of bits */ for (i = 0; i < b; ++i) - vp8_write(w, ((d >> i) & 1), mvcomp->bits[i]); + vp8_write(bc, ((d >> i) & 1), mvcomp->bits[i]); } } -static void encode_nmv_component_fp(vp8_writer *w, +static void encode_nmv_component_fp(vp8_writer *bc, int v, int r, - const nmv_component *mvcomp, + const nmv_component* const mvcomp, int usehp) { int s, z, c, o, d, f, e; assert (v != 0); /* should not be zero */ @@ -74,24 +74,24 @@ static void encode_nmv_component_fp(vp8_writer *w, /* Code the fractional pel bits */ if (c == MV_CLASS_0) { - vp8_write_token(w, vp8_mv_fp_tree, mvcomp->class0_fp[d], + vp8_write_token(bc, vp8_mv_fp_tree, mvcomp->class0_fp[d], vp8_mv_fp_encodings + f); } else { - vp8_write_token(w, vp8_mv_fp_tree, mvcomp->fp, + vp8_write_token(bc, vp8_mv_fp_tree, mvcomp->fp, vp8_mv_fp_encodings + f); } /* Code the high precision bit */ if (usehp) { if (c == MV_CLASS_0) { - vp8_write(w, e, mvcomp->class0_hp); + vp8_write(bc, e, mvcomp->class0_hp); } else { - vp8_write(w, e, mvcomp->hp); + vp8_write(bc, e, mvcomp->hp); } } } static void build_nmv_component_cost_table(int *mvcost, - const nmv_component *mvcomp, + const nmv_component* const mvcomp, int usehp) { int i, v; int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE]; @@ -177,7 +177,7 @@ static int update_nmv_savings(const unsigned int ct[2], } static int update_nmv( - vp8_writer *const w, + vp8_writer *const bc, const unsigned int ct[2], vp8_prob *const cur_p, const vp8_prob new_p, @@ -199,15 +199,15 @@ static int update_nmv( if (cur_b - mod_b > cost) { *cur_p = mod_p; - vp8_write(w, 1, upd_p); + vp8_write(bc, 1, 
upd_p); #ifdef LOW_PRECISION_MV_UPDATE - vp8_write_literal(w, mod_p >> 1, 7); + vp8_write_literal(bc, mod_p >> 1, 7); #else - vp8_write_literal(w, mod_p, 8); + vp8_write_literal(bc, mod_p, 8); #endif return 1; } else { - vp8_write(w, 0, upd_p); + vp8_write(bc, 0, upd_p); return 0; } } @@ -318,7 +318,8 @@ void print_nmvstats() { } } -static void add_nmvcount(nmv_context_counts *dst, nmv_context_counts *src) { +static void add_nmvcount(nmv_context_counts* const dst, + const nmv_context_counts* const src) { int i, j, k; for (j = 0; j < MV_JOINTS; ++j) { dst->joints[j] += src->joints[j]; @@ -357,8 +358,7 @@ static void add_nmvcount(nmv_context_counts *dst, nmv_context_counts *src) { } #endif -void vp8_write_nmvprobs(VP8_COMP * cpi, int usehp) { - vp8_writer *const w = & cpi->bc; +void vp8_write_nmvprobs(VP8_COMP* const cpi, int usehp, vp8_writer* const bc) { int i, j; nmv_context prob; unsigned int branch_ct_joint[MV_JOINTS - 1][2]; @@ -443,37 +443,37 @@ void vp8_write_nmvprobs(VP8_COMP * cpi, int usehp) { } } if (savings <= 0) { - vp8_write_bit(w, 0); + vp8_write_bit(bc, 0); return; } - vp8_write_bit(w, 1); + vp8_write_bit(bc, 1); #endif for (j = 0; j < MV_JOINTS - 1; ++j) { - update_nmv(w, branch_ct_joint[j], + update_nmv(bc, branch_ct_joint[j], &cpi->common.fc.nmvc.joints[j], prob.joints[j], VP8_NMV_UPDATE_PROB); } for (i = 0; i < 2; ++i) { - update_nmv(w, branch_ct_sign[i], + update_nmv(bc, branch_ct_sign[i], &cpi->common.fc.nmvc.comps[i].sign, prob.comps[i].sign, VP8_NMV_UPDATE_PROB); for (j = 0; j < MV_CLASSES - 1; ++j) { - update_nmv(w, branch_ct_classes[i][j], + update_nmv(bc, branch_ct_classes[i][j], &cpi->common.fc.nmvc.comps[i].classes[j], prob.comps[i].classes[j], VP8_NMV_UPDATE_PROB); } for (j = 0; j < CLASS0_SIZE - 1; ++j) { - update_nmv(w, branch_ct_class0[i][j], + update_nmv(bc, branch_ct_class0[i][j], &cpi->common.fc.nmvc.comps[i].class0[j], prob.comps[i].class0[j], VP8_NMV_UPDATE_PROB); } for (j = 0; j < MV_OFFSET_BITS; ++j) { - update_nmv(w, branch_ct_bits[i][j], + update_nmv(bc, branch_ct_bits[i][j], &cpi->common.fc.nmvc.comps[i].bits[j], prob.comps[i].bits[j], VP8_NMV_UPDATE_PROB); @@ -483,14 +483,14 @@ void vp8_write_nmvprobs(VP8_COMP * cpi, int usehp) { for (j = 0; j < CLASS0_SIZE; ++j) { int k; for (k = 0; k < 3; ++k) { - update_nmv(w, branch_ct_class0_fp[i][j][k], + update_nmv(bc, branch_ct_class0_fp[i][j][k], &cpi->common.fc.nmvc.comps[i].class0_fp[j][k], prob.comps[i].class0_fp[j][k], VP8_NMV_UPDATE_PROB); } } for (j = 0; j < 3; ++j) { - update_nmv(w, branch_ct_fp[i][j], + update_nmv(bc, branch_ct_fp[i][j], &cpi->common.fc.nmvc.comps[i].fp[j], prob.comps[i].fp[j], VP8_NMV_UPDATE_PROB); @@ -498,11 +498,11 @@ void vp8_write_nmvprobs(VP8_COMP * cpi, int usehp) { } if (usehp) { for (i = 0; i < 2; ++i) { - update_nmv(w, branch_ct_class0_hp[i], + update_nmv(bc, branch_ct_class0_hp[i], &cpi->common.fc.nmvc.comps[i].class0_hp, prob.comps[i].class0_hp, VP8_NMV_UPDATE_PROB); - update_nmv(w, branch_ct_hp[i], + update_nmv(bc, branch_ct_hp[i], &cpi->common.fc.nmvc.comps[i].hp, prob.comps[i].hp, VP8_NMV_UPDATE_PROB); @@ -510,34 +510,35 @@ void vp8_write_nmvprobs(VP8_COMP * cpi, int usehp) { } } -void vp8_encode_nmv(vp8_writer *w, const MV *mv, const MV *ref, - const nmv_context *mvctx) { +void vp8_encode_nmv(vp8_writer* const bc, const MV* const mv, + const MV* const ref, const nmv_context* const mvctx) { MV_JOINT_TYPE j = vp8_get_mv_joint(*mv); - vp8_write_token(w, vp8_mv_joint_tree, mvctx->joints, + vp8_write_token(bc, vp8_mv_joint_tree, mvctx->joints, vp8_mv_joint_encodings + 
j); if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) { - encode_nmv_component(w, mv->row, ref->col, &mvctx->comps[0]); + encode_nmv_component(bc, mv->row, ref->col, &mvctx->comps[0]); } if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) { - encode_nmv_component(w, mv->col, ref->col, &mvctx->comps[1]); + encode_nmv_component(bc, mv->col, ref->col, &mvctx->comps[1]); } } -void vp8_encode_nmv_fp(vp8_writer *w, const MV *mv, const MV *ref, - const nmv_context *mvctx, int usehp) { +void vp8_encode_nmv_fp(vp8_writer* const bc, const MV* const mv, + const MV* const ref, const nmv_context* const mvctx, + int usehp) { MV_JOINT_TYPE j = vp8_get_mv_joint(*mv); usehp = usehp && vp8_use_nmv_hp(ref); if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) { - encode_nmv_component_fp(w, mv->row, ref->row, &mvctx->comps[0], usehp); + encode_nmv_component_fp(bc, mv->row, ref->row, &mvctx->comps[0], usehp); } if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) { - encode_nmv_component_fp(w, mv->col, ref->col, &mvctx->comps[1], usehp); + encode_nmv_component_fp(bc, mv->col, ref->col, &mvctx->comps[1], usehp); } } void vp8_build_nmv_cost_table(int *mvjoint, int *mvcost[2], - const nmv_context *mvctx, + const nmv_context* const mvctx, int usehp, int mvc_flag_v, int mvc_flag_h) { @@ -552,7 +553,7 @@ void vp8_build_nmv_cost_table(int *mvjoint, #else /* CONFIG_NEWMVENTROPY */ static void encode_mvcomponent( - vp8_writer *const w, + vp8_writer *const bc, const int v, const struct mv_context *mvc ) { @@ -560,41 +561,44 @@ static void encode_mvcomponent( const int x = v < 0 ? -v : v; if (x < mvnum_short) { // Small - vp8_write(w, 0, p [mvpis_short]); - vp8_treed_write(w, vp8_small_mvtree, p + MVPshort, x, mvnum_short_bits); + vp8_write(bc, 0, p[mvpis_short]); + vp8_treed_write(bc, vp8_small_mvtree, p + MVPshort, x, mvnum_short_bits); if (!x) return; // no sign bit } else { // Large int i = 0; - vp8_write(w, 1, p [mvpis_short]); + vp8_write(bc, 1, p[mvpis_short]); do - vp8_write(w, (x >> i) & 1, p [MVPbits + i]); + vp8_write(bc, (x >> i) & 1, p[MVPbits + i]); while (++i < mvnum_short_bits); i = mvlong_width - 1; /* Skip bit 3, which is sometimes implicit */ do - vp8_write(w, (x >> i) & 1, p [MVPbits + i]); + vp8_write(bc, (x >> i) & 1, p[MVPbits + i]); while (--i > mvnum_short_bits); if (x & ~((2 << mvnum_short_bits) - 1)) - vp8_write(w, (x >> mvnum_short_bits) & 1, p [MVPbits + mvnum_short_bits]); + vp8_write(bc, (x >> mvnum_short_bits) & 1, p[MVPbits + mvnum_short_bits]); } - vp8_write(w, v < 0, p [MVPsign]); + vp8_write(bc, v < 0, p[MVPsign]); } -void vp8_encode_motion_vector(vp8_writer *w, const MV *mv, const MV_CONTEXT *mvc) { - encode_mvcomponent(w, mv->row >> 1, &mvc[0]); - encode_mvcomponent(w, mv->col >> 1, &mvc[1]); +void vp8_encode_motion_vector(vp8_writer* const bc, + const MV* const mv, + const MV_CONTEXT* const mvc) { + encode_mvcomponent(bc, mv->row >> 1, &mvc[0]); + encode_mvcomponent(bc, mv->col >> 1, &mvc[1]); } -static unsigned int cost_mvcomponent(const int v, const struct mv_context *mvc) { +static unsigned int cost_mvcomponent(const int v, + const struct mv_context* const mvc) { const vp8_prob *p = mvc->prob; const int x = v; // v<0? 
-v:v; unsigned int cost; @@ -628,7 +632,8 @@ static unsigned int cost_mvcomponent(const int v, const struct mv_context *mvc) return cost; // + vp8_cost_bit( p [MVPsign], v < 0); } -void vp8_build_component_cost_table(int *mvcost[2], const MV_CONTEXT *mvc, int mvc_flag[2]) { +void vp8_build_component_cost_table(int *mvcost[2], const MV_CONTEXT *mvc, + const int mvc_flag[2]) { int i = 1; // -mv_max; unsigned int cost0 = 0; unsigned int cost1 = 0; @@ -682,7 +687,7 @@ __inline static void calc_prob(vp8_prob *p, const unsigned int ct[2]) { } static void update( - vp8_writer *const w, + vp8_writer *const bc, const unsigned int ct[2], vp8_prob *const cur_p, const vp8_prob new_p, @@ -695,16 +700,16 @@ static void update( if (cur_b - new_b > cost) { *cur_p = new_p; - vp8_write(w, 1, update_p); - vp8_write_literal(w, new_p >> 1, 7); + vp8_write(bc, 1, update_p); + vp8_write_literal(bc, new_p >> 1, 7); *updated = 1; } else - vp8_write(w, 0, update_p); + vp8_write(bc, 0, update_p); } static void write_component_probs( - vp8_writer *const w, + vp8_writer *const bc, struct mv_context *cur_mvc, const struct mv_context *default_mvc_, const struct mv_context *update_mvc, @@ -800,9 +805,11 @@ static void write_component_probs( while (++j < mvlong_width); } - update(w, is_short_ct, Pcur + mvpis_short, Pnew[mvpis_short], *Pupdate++, updated); + update(bc, is_short_ct, Pcur + mvpis_short, Pnew[mvpis_short], + *Pupdate++, updated); - update(w, sign_ct, Pcur + MVPsign, Pnew[MVPsign], *Pupdate++, updated); + update(bc, sign_ct, Pcur + MVPsign, Pnew[MVPsign], + *Pupdate++, updated); { const vp8_prob *const new_p = Pnew + MVPshort; @@ -812,7 +819,7 @@ static void write_component_probs( do - update(w, short_bct[j], cur_p + j, new_p[j], *Pupdate++, updated); + update(bc, short_bct[j], cur_p + j, new_p[j], *Pupdate++, updated); while (++j < mvnum_short - 1); } @@ -825,25 +832,25 @@ static void write_component_probs( do - update(w, bit_ct[j], cur_p + j, new_p[j], *Pupdate++, updated); + update(bc, bit_ct[j], cur_p + j, new_p[j], *Pupdate++, updated); while (++j < mvlong_width); } } -void vp8_write_mvprobs(VP8_COMP *cpi) { - vp8_writer *const w = & cpi->bc; +void vp8_write_mvprobs(VP8_COMP* const cpi, vp8_writer* const bc) { MV_CONTEXT *mvc = cpi->common.fc.mvc; int flags[2] = {0, 0}; #ifdef ENTROPY_STATS active_section = 4; #endif write_component_probs( - w, &mvc[0], &vp8_default_mv_context[0], &vp8_mv_update_probs[0], cpi->MVcount[0], 0, &flags[0] - ); + bc, &mvc[0], &vp8_default_mv_context[0], &vp8_mv_update_probs[0], + cpi->MVcount[0], 0, &flags[0]); + write_component_probs( - w, &mvc[1], &vp8_default_mv_context[1], &vp8_mv_update_probs[1], cpi->MVcount[1], 1, &flags[1] - ); + bc, &mvc[1], &vp8_default_mv_context[1], &vp8_mv_update_probs[1], + cpi->MVcount[1], 1, &flags[1]); if (flags[0] || flags[1]) vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cpi->common.fc.mvc, flags); @@ -855,7 +862,7 @@ void vp8_write_mvprobs(VP8_COMP *cpi) { static void encode_mvcomponent_hp( - vp8_writer *const w, + vp8_writer *const bc, const int v, const struct mv_context_hp *mvc ) { @@ -863,41 +870,41 @@ static void encode_mvcomponent_hp( const int x = v < 0 ? 
-v : v; if (x < mvnum_short_hp) { // Small - vp8_write(w, 0, p [mvpis_short_hp]); - vp8_treed_write(w, vp8_small_mvtree_hp, p + MVPshort_hp, x, + vp8_write(bc, 0, p[mvpis_short_hp]); + vp8_treed_write(bc, vp8_small_mvtree_hp, p + MVPshort_hp, x, mvnum_short_bits_hp); if (!x) return; // no sign bit } else { // Large int i = 0; - vp8_write(w, 1, p [mvpis_short_hp]); + vp8_write(bc, 1, p[mvpis_short_hp]); do - vp8_write(w, (x >> i) & 1, p [MVPbits_hp + i]); + vp8_write(bc, (x >> i) & 1, p[MVPbits_hp + i]); while (++i < mvnum_short_bits_hp); i = mvlong_width_hp - 1; /* Skip bit 3, which is sometimes implicit */ do - vp8_write(w, (x >> i) & 1, p [MVPbits_hp + i]); + vp8_write(bc, (x >> i) & 1, p[MVPbits_hp + i]); while (--i > mvnum_short_bits_hp); if (x & ~((2 << mvnum_short_bits_hp) - 1)) - vp8_write(w, (x >> mvnum_short_bits_hp) & 1, - p [MVPbits_hp + mvnum_short_bits_hp]); + vp8_write(bc, (x >> mvnum_short_bits_hp) & 1, + p[MVPbits_hp + mvnum_short_bits_hp]); } - vp8_write(w, v < 0, p [MVPsign_hp]); + vp8_write(bc, v < 0, p[MVPsign_hp]); } -void vp8_encode_motion_vector_hp(vp8_writer *w, const MV *mv, +void vp8_encode_motion_vector_hp(vp8_writer *bc, const MV *mv, const MV_CONTEXT_HP *mvc) { - encode_mvcomponent_hp(w, mv->row, &mvc[0]); - encode_mvcomponent_hp(w, mv->col, &mvc[1]); + encode_mvcomponent_hp(bc, mv->row, &mvc[0]); + encode_mvcomponent_hp(bc, mv->col, &mvc[1]); } @@ -940,7 +947,7 @@ static unsigned int cost_mvcomponent_hp(const int v, void vp8_build_component_cost_table_hp(int *mvcost[2], const MV_CONTEXT_HP *mvc, - int mvc_flag[2]) { + const int mvc_flag[2]) { int i = 1; // -mv_max; unsigned int cost0 = 0; unsigned int cost1 = 0; @@ -978,7 +985,7 @@ void vp8_build_component_cost_table_hp(int *mvcost[2], static void write_component_probs_hp( - vp8_writer *const w, + vp8_writer *const bc, struct mv_context_hp *cur_mvc, const struct mv_context_hp *default_mvc_, const struct mv_context_hp *update_mvc, @@ -1074,10 +1081,10 @@ static void write_component_probs_hp( while (++j < mvlong_width_hp); } - update(w, is_short_ct, Pcur + mvpis_short_hp, Pnew[mvpis_short_hp], + update(bc, is_short_ct, Pcur + mvpis_short_hp, Pnew[mvpis_short_hp], *Pupdate++, updated); - update(w, sign_ct, Pcur + MVPsign_hp, Pnew[MVPsign_hp], *Pupdate++, + update(bc, sign_ct, Pcur + MVPsign_hp, Pnew[MVPsign_hp], *Pupdate++, updated); { @@ -1088,7 +1095,7 @@ static void write_component_probs_hp( do - update(w, short_bct[j], cur_p + j, new_p[j], *Pupdate++, updated); + update(bc, short_bct[j], cur_p + j, new_p[j], *Pupdate++, updated); while (++j < mvnum_short_hp - 1); } @@ -1101,25 +1108,24 @@ static void write_component_probs_hp( do - update(w, bit_ct[j], cur_p + j, new_p[j], *Pupdate++, updated); + update(bc, bit_ct[j], cur_p + j, new_p[j], *Pupdate++, updated); while (++j < mvlong_width_hp); } } -void vp8_write_mvprobs_hp(VP8_COMP *cpi) { - vp8_writer *const w = & cpi->bc; +void vp8_write_mvprobs_hp(VP8_COMP* const cpi, vp8_writer* const bc) { MV_CONTEXT_HP *mvc = cpi->common.fc.mvc_hp; int flags[2] = {0, 0}; #ifdef ENTROPY_STATS active_section = 4; #endif write_component_probs_hp( - w, &mvc[0], &vp8_default_mv_context_hp[0], &vp8_mv_update_probs_hp[0], + bc, &mvc[0], &vp8_default_mv_context_hp[0], &vp8_mv_update_probs_hp[0], cpi->MVcount_hp[0], 0, &flags[0] ); write_component_probs_hp( - w, &mvc[1], &vp8_default_mv_context_hp[1], &vp8_mv_update_probs_hp[1], + bc, &mvc[1], &vp8_default_mv_context_hp[1], &vp8_mv_update_probs_hp[1], cpi->MVcount_hp[1], 1, &flags[1] ); diff --git a/vp8/encoder/encodemv.h 
b/vp8/encoder/encodemv.h index e675fe058..c06831cb2 100644 --- a/vp8/encoder/encodemv.h +++ b/vp8/encoder/encodemv.h @@ -15,11 +15,12 @@ #include "onyx_int.h" #if CONFIG_NEWMVENTROPY -void vp8_write_nmvprobs(VP8_COMP *, int usehp); -void vp8_encode_nmv(vp8_writer *w, const MV *mv, const MV *ref, - const nmv_context *mvctx); -void vp8_encode_nmv_fp(vp8_writer *w, const MV *mv, const MV *ref, - const nmv_context *mvctx, int usehp); +void vp8_write_nmvprobs(VP8_COMP* const, int usehp, vp8_writer* const); +void vp8_encode_nmv(vp8_writer* const w, const MV* const mv, + const MV* const ref, const nmv_context* const mvctx); +void vp8_encode_nmv_fp(vp8_writer* const w, const MV* const mv, + const MV* const ref, const nmv_context *mvctx, + int usehp); void vp8_build_nmv_cost_table(int *mvjoint, int *mvcost[2], const nmv_context *mvctx, @@ -27,18 +28,18 @@ void vp8_build_nmv_cost_table(int *mvjoint, int mvc_flag_v, int mvc_flag_h); #else /* CONFIG_NEWMVENTROPY */ -void vp8_write_mvprobs(VP8_COMP *); -void vp8_encode_motion_vector(vp8_writer *, const MV *, - const MV_CONTEXT *); +void vp8_write_mvprobs(VP8_COMP* const, vp8_writer* const); +void vp8_encode_motion_vector(vp8_writer* const, const MV* const, + const MV_CONTEXT* const); void vp8_build_component_cost_table(int *mvcost[2], - const MV_CONTEXT *mvc, - int mvc_flag[2]); -void vp8_write_mvprobs_hp(VP8_COMP *); -void vp8_encode_motion_vector_hp(vp8_writer *, const MV *, - const MV_CONTEXT_HP *); + const MV_CONTEXT*, + const int mvc_flag[2]); +void vp8_write_mvprobs_hp(VP8_COMP* const, vp8_writer* const); +void vp8_encode_motion_vector_hp(vp8_writer* const, const MV* const, + const MV_CONTEXT_HP* const); void vp8_build_component_cost_table_hp(int *mvcost[2], - const MV_CONTEXT_HP *mvc, - int mvc_flag[2]); + const MV_CONTEXT_HP*, + const int mvc_flag[2]); #endif /* CONFIG_NEWMVENTROPY */ #endif diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index bad50b606..1e88454a1 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -350,7 +350,7 @@ void vp8_end_first_pass(VP8_COMP *cpi) { } static void zz_motion_search(VP8_COMP *cpi, MACROBLOCK *x, YV12_BUFFER_CONFIG *recon_buffer, int *best_motion_err, int recon_yoffset) { - MACROBLOCKD *const xd = & x->e_mbd; + MACROBLOCKD *const xd = &x->e_mbd; BLOCK *b = &x->block[0]; BLOCKD *d = &x->e_mbd.block[0]; @@ -364,14 +364,15 @@ static void zz_motion_search(VP8_COMP *cpi, MACROBLOCK *x, YV12_BUFFER_CONFIG *r ref_ptr = (unsigned char *)(*(d->base_pre) + d->pre); - VARIANCE_INVOKE(IF_RTCD(&cpi->rtcd.variance), mse16x16)(src_ptr, src_stride, ref_ptr, ref_stride, (unsigned int *)(best_motion_err)); + vp8_mse16x16(src_ptr, src_stride, ref_ptr, ref_stride, + (unsigned int *)(best_motion_err)); } static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, int_mv *ref_mv, MV *best_mv, YV12_BUFFER_CONFIG *recon_buffer, int *best_motion_err, int recon_yoffset) { - MACROBLOCKD *const xd = & x->e_mbd; + MACROBLOCKD *const xd = &x->e_mbd; BLOCK *b = &x->block[0]; BLOCKD *d = &x->e_mbd.block[0]; int num00; @@ -387,7 +388,7 @@ static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, int new_mv_mode_penalty = 256; // override the default variance function to use MSE - v_fn_ptr.vf = VARIANCE_INVOKE(IF_RTCD(&cpi->rtcd.variance), mse16x16); + v_fn_ptr.vf = vp8_mse16x16; // Set up pointers for this macro block recon buffer xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset; @@ -436,9 +437,9 @@ static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, void 
vp8_first_pass(VP8_COMP *cpi) { int mb_row, mb_col; - MACROBLOCK *const x = & cpi->mb; - VP8_COMMON *const cm = & cpi->common; - MACROBLOCKD *const xd = & x->e_mbd; + MACROBLOCK *const x = &cpi->mb; + VP8_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; int recon_yoffset, recon_uvoffset; YV12_BUFFER_CONFIG *lst_yv12 = &cm->yv12_fb[cm->lst_fb_idx]; diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c index 356e32c3f..44e83fdc7 100644 --- a/vp8/encoder/generic/csystemdependent.c +++ b/vp8/encoder/generic/csystemdependent.c @@ -23,80 +23,6 @@ extern void vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER void vp8_cmachine_specific_config(VP8_COMP *cpi) { #if CONFIG_RUNTIME_CPU_DETECT cpi->rtcd.common = &cpi->common.rtcd; -#if CONFIG_SUPERBLOCKS - cpi->rtcd.variance.sad32x32 = vp8_sad32x32_c; -#endif - cpi->rtcd.variance.sad16x16 = vp8_sad16x16_c; - cpi->rtcd.variance.sad16x8 = vp8_sad16x8_c; - cpi->rtcd.variance.sad8x16 = vp8_sad8x16_c; - cpi->rtcd.variance.sad8x8 = vp8_sad8x8_c; - cpi->rtcd.variance.sad4x4 = vp8_sad4x4_c; - -#if CONFIG_SUPERBLOCKS - cpi->rtcd.variance.sad32x32x3 = vp8_sad32x32x3_c; -#endif - cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_c; - cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_c; - cpi->rtcd.variance.sad8x16x3 = vp8_sad8x16x3_c; - cpi->rtcd.variance.sad8x8x3 = vp8_sad8x8x3_c; - cpi->rtcd.variance.sad4x4x3 = vp8_sad4x4x3_c; - -#if CONFIG_SUPERBLOCKS - cpi->rtcd.variance.sad32x32x8 = vp8_sad32x32x8_c; -#endif - cpi->rtcd.variance.sad16x16x8 = vp8_sad16x16x8_c; - cpi->rtcd.variance.sad16x8x8 = vp8_sad16x8x8_c; - cpi->rtcd.variance.sad8x16x8 = vp8_sad8x16x8_c; - cpi->rtcd.variance.sad8x8x8 = vp8_sad8x8x8_c; - cpi->rtcd.variance.sad4x4x8 = vp8_sad4x4x8_c; - -#if CONFIG_SUPERBLOCKS - cpi->rtcd.variance.sad32x32x4d = vp8_sad32x32x4d_c; -#endif - cpi->rtcd.variance.sad16x16x4d = vp8_sad16x16x4d_c; - cpi->rtcd.variance.sad16x8x4d = vp8_sad16x8x4d_c; - cpi->rtcd.variance.sad8x16x4d = vp8_sad8x16x4d_c; - cpi->rtcd.variance.sad8x8x4d = vp8_sad8x8x4d_c; - cpi->rtcd.variance.sad4x4x4d = vp8_sad4x4x4d_c; -#if ARCH_X86 || ARCH_X86_64 - cpi->rtcd.variance.copy32xn = vp8_copy32xn_c; -#endif - cpi->rtcd.variance.var4x4 = vp8_variance4x4_c; - cpi->rtcd.variance.var8x8 = vp8_variance8x8_c; - cpi->rtcd.variance.var8x16 = vp8_variance8x16_c; - cpi->rtcd.variance.var16x8 = vp8_variance16x8_c; - cpi->rtcd.variance.var16x16 = vp8_variance16x16_c; -#if CONFIG_SUPERBLOCKS - cpi->rtcd.variance.var32x32 = vp8_variance32x32_c; -#endif - - cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_c; - cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_c; - cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_c; - cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_c; - cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_c; -#if CONFIG_SUPERBLOCKS - cpi->rtcd.variance.subpixvar32x32 = vp8_sub_pixel_variance32x32_c; -#endif - cpi->rtcd.variance.halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_c; -#if CONFIG_SUPERBLOCKS - cpi->rtcd.variance.halfpixvar32x32_h = vp8_variance_halfpixvar32x32_h_c; -#endif - cpi->rtcd.variance.halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_c; -#if CONFIG_SUPERBLOCKS - cpi->rtcd.variance.halfpixvar32x32_v = vp8_variance_halfpixvar32x32_v_c; -#endif - cpi->rtcd.variance.halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_c; -#if CONFIG_SUPERBLOCKS - cpi->rtcd.variance.halfpixvar32x32_hv = vp8_variance_halfpixvar32x32_hv_c; -#endif - 
cpi->rtcd.variance.subpixmse16x16 = vp8_sub_pixel_mse16x16_c; -#if CONFIG_SUPERBLOCKS - cpi->rtcd.variance.subpixmse32x32 = vp8_sub_pixel_mse32x32_c; -#endif - - cpi->rtcd.variance.mse16x16 = vp8_mse16x16_c; - cpi->rtcd.variance.getmbss = vp8_get_mb_ss_c; cpi->rtcd.fdct.short8x8 = vp8_short_fdct8x8_c; cpi->rtcd.fdct.short16x16 = vp8_short_fdct16x16_c; @@ -118,16 +44,11 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi) { cpi->rtcd.search.refining_search = vp8_refining_search_sad; cpi->rtcd.search.diamond_search = vp8_diamond_search_sad; cpi->rtcd.temporal.apply = vp8_temporal_filter_apply_c; - cpi->rtcd.variance.satd16x16 = vp8_satd16x16_c; cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c; cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c; cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_c; cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_c; cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c; -#if CONFIG_INTERNAL_STATS - cpi->rtcd.variance.ssimpf_8x8 = vp8_ssim_parms_8x8_c; - cpi->rtcd.variance.ssimpf_16x16 = vp8_ssim_parms_16x16_c; -#endif #endif vp8_yv12_copy_partial_frame_ptr = vp8_yv12_copy_partial_frame; diff --git a/vp8/encoder/mbgraph.c b/vp8/encoder/mbgraph.c index 180ee5870..2eecfcdad 100644 --- a/vp8/encoder/mbgraph.c +++ b/vp8/encoder/mbgraph.c @@ -83,10 +83,8 @@ static unsigned int do_16x16_motion_iteration vp8_set_mbmode_and_mvs(x, NEWMV, dst_mv); vp8_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0); - // VARIANCE_INVOKE(&cpi->rtcd.variance, satd16x16) - best_err = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16) - (xd->dst.y_buffer, xd->dst.y_stride, - xd->predictor, 16, INT_MAX); + best_err = vp8_sad16x16(xd->dst.y_buffer, xd->dst.y_stride, + xd->predictor, 16, INT_MAX); /* restore UMV window */ x->mv_col_min = tmp_col_min; @@ -130,11 +128,8 @@ static int do_16x16_motion_search // FIXME should really use something like near/nearest MV and/or MV prediction xd->pre.y_buffer = ref->y_buffer + mb_y_offset; xd->pre.y_stride = ref->y_stride; - // VARIANCE_INVOKE(&cpi->rtcd.variance, satd16x16) - err = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16) - (ref->y_buffer + mb_y_offset, - ref->y_stride, xd->dst.y_buffer, - xd->dst.y_stride, INT_MAX); + err = vp8_sad16x16(ref->y_buffer + mb_y_offset, ref->y_stride, + xd->dst.y_buffer, xd->dst.y_stride, INT_MAX); dst_mv->as_int = 0; // Test last reference frame using the previous best mv as the @@ -193,10 +188,8 @@ static int do_16x16_zerozero_search xd->pre.y_buffer = ref->y_buffer + mb_y_offset; xd->pre.y_stride = ref->y_stride; // VARIANCE_INVOKE(&cpi->rtcd.variance, satd16x16) - err = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16) - (ref->y_buffer + mb_y_offset, - ref->y_stride, xd->dst.y_buffer, - xd->dst.y_stride, INT_MAX); + err = vp8_sad16x16(ref->y_buffer + mb_y_offset, ref->y_stride, + xd->dst.y_buffer, xd->dst.y_stride, INT_MAX); dst_mv->as_int = 0; @@ -221,11 +214,8 @@ static int find_best_16x16_intra xd->mode_info_context->mbmi.mode = mode; vp8_build_intra_predictors_mby(xd); - // VARIANCE_INVOKE(&cpi->rtcd.variance, satd16x16) - err = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16) - (xd->predictor, 16, - buf->y_buffer + mb_y_offset, - buf->y_stride, best_err); + err = vp8_sad16x16(xd->predictor, 16, buf->y_buffer + mb_y_offset, + buf->y_stride, best_err); // find best if (err < best_err) { best_err = err; diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c index a6cf2f18b..85f5f289e 100644 --- a/vp8/encoder/mcomp.c +++ b/vp8/encoder/mcomp.c @@ -1479,7 +1479,8 @@ int vp8_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, 
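The csystemdependent.c, firstpass.c and mbgraph.c changes above all apply one transformation: the hand-maintained cpi->rtcd.variance function table is dropped, and call sites name the resolved symbols directly. The before/after shape, taken from the mbgraph.c hunks:

/* before: indirected through the per-codec variance vtable */
best_err = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16)
               (xd->dst.y_buffer, xd->dst.y_stride, xd->predictor, 16, INT_MAX);

/* after: direct call to the resolved function */
best_err = vp8_sad16x16(xd->dst.y_buffer, xd->dst.y_stride,
                        xd->predictor, 16, INT_MAX);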
for (t = 0; t < 4; t++) block_offset[t] = ss[i + t].offset + best_address; - fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, sad_array); + fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, + sad_array); for (t = 0; t < 4; t++, i++) { if (sad_array[t] < bestsad) { diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 717fe96ee..0e46071a9 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -77,7 +77,7 @@ extern void vp8_yv12_copy_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFF extern void vp8_yv12_copy_src_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); #endif -int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd); +int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest); extern void vp8_temporal_filter_prepare_c(VP8_COMP *cpi, int distance); @@ -101,25 +101,14 @@ extern const int vp8_gf_interval_table[101]; #if CONFIG_INTERNAL_STATS #include "math.h" -extern double vp8_calc_ssim -( - YV12_BUFFER_CONFIG *source, - YV12_BUFFER_CONFIG *dest, - int lumamask, - double *weight, - const vp8_variance_rtcd_vtable_t *rtcd -); +extern double vp8_calc_ssim(YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *dest, int lumamask, + double *weight); -extern double vp8_calc_ssimg -( - YV12_BUFFER_CONFIG *source, - YV12_BUFFER_CONFIG *dest, - double *ssim_y, - double *ssim_u, - double *ssim_v, - const vp8_variance_rtcd_vtable_t *rtcd -); +extern double vp8_calc_ssimg(YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *dest, double *ssim_y, + double *ssim_u, double *ssim_v); #endif @@ -608,7 +597,7 @@ static void init_seg_features(VP8_COMP *cpi) { // DEBUG: Print out the segment id of each MB in the current frame. 
static void print_seg_map(VP8_COMP *cpi) { - VP8_COMMON *cm = & cpi->common; + VP8_COMMON *cm = &cpi->common; int row, col; int map_index = 0; FILE *statsfile; @@ -1282,7 +1271,7 @@ static int vp8_alloc_partition_data(VP8_COMP *cpi) { } void vp8_alloc_compressor_data(VP8_COMP *cpi) { - VP8_COMMON *cm = & cpi->common; + VP8_COMMON *cm = &cpi->common; int width = cm->Width; int height = cm->Height; @@ -1363,7 +1352,8 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) { vpx_free(cpi->tplist); - CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cpi->common.mb_rows)); + CHECK_MEM_ERROR(cpi->tplist, + vpx_malloc(sizeof(TOKENLIST) * (cpi->common.mb_rows))); } @@ -2026,74 +2016,48 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) { init_mv_ref_counts(); #endif +#define BFP(BT, SDF, VF, SVF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF) \ + cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].vf = VF; \ + cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svf_halfpix_h = SVFHH; \ + cpi->fn_ptr[BT].svf_halfpix_v = SVFHV; \ + cpi->fn_ptr[BT].svf_halfpix_hv = SVFHHV; \ + cpi->fn_ptr[BT].sdx3f = SDX3F; \ + cpi->fn_ptr[BT].sdx8f = SDX8F; \ + cpi->fn_ptr[BT].sdx4df = SDX4DF; + + #if CONFIG_SUPERBLOCKS - cpi->fn_ptr[BLOCK_32X32].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad32x32); - cpi->fn_ptr[BLOCK_32X32].vf = VARIANCE_INVOKE(&cpi->rtcd.variance, var32x32); - cpi->fn_ptr[BLOCK_32X32].svf = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar32x32); - cpi->fn_ptr[BLOCK_32X32].svf_halfpix_h = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar32x32_h); - cpi->fn_ptr[BLOCK_32X32].svf_halfpix_v = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar32x32_v); - cpi->fn_ptr[BLOCK_32X32].svf_halfpix_hv = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar32x32_hv); - cpi->fn_ptr[BLOCK_32X32].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad32x32x3); - cpi->fn_ptr[BLOCK_32X32].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad32x32x8); - cpi->fn_ptr[BLOCK_32X32].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad32x32x4d); + BFP(BLOCK_32X32, vp8_sad32x32, vp8_variance32x32, vp8_sub_pixel_variance32x32, + vp8_variance_halfpixvar32x32_h, vp8_variance_halfpixvar32x32_v, + vp8_variance_halfpixvar32x32_hv, vp8_sad32x32x3, vp8_sad32x32x8, + vp8_sad32x32x4d) #endif - cpi->fn_ptr[BLOCK_16X16].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16); - cpi->fn_ptr[BLOCK_16X16].vf = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16); - cpi->fn_ptr[BLOCK_16X16].svf = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar16x16); - cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar16x16_h); - cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar16x16_v); - cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar16x16_hv); - cpi->fn_ptr[BLOCK_16X16].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x3); - cpi->fn_ptr[BLOCK_16X16].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x8); - cpi->fn_ptr[BLOCK_16X16].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x4d); - - cpi->fn_ptr[BLOCK_16X8].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8); - cpi->fn_ptr[BLOCK_16X8].vf = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x8); - cpi->fn_ptr[BLOCK_16X8].svf = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar16x8); - cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h = NULL; - cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v = NULL; - cpi->fn_ptr[BLOCK_16X8].svf_halfpix_hv = NULL; - cpi->fn_ptr[BLOCK_16X8].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x3); - 
cpi->fn_ptr[BLOCK_16X8].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x8); - cpi->fn_ptr[BLOCK_16X8].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x4d); - - cpi->fn_ptr[BLOCK_8X16].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16); - cpi->fn_ptr[BLOCK_8X16].vf = VARIANCE_INVOKE(&cpi->rtcd.variance, var8x16); - cpi->fn_ptr[BLOCK_8X16].svf = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar8x16); - cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h = NULL; - cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v = NULL; - cpi->fn_ptr[BLOCK_8X16].svf_halfpix_hv = NULL; - cpi->fn_ptr[BLOCK_8X16].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x3); - cpi->fn_ptr[BLOCK_8X16].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x8); - cpi->fn_ptr[BLOCK_8X16].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x4d); - - cpi->fn_ptr[BLOCK_8X8].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8); - cpi->fn_ptr[BLOCK_8X8].vf = VARIANCE_INVOKE(&cpi->rtcd.variance, var8x8); - cpi->fn_ptr[BLOCK_8X8].svf = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar8x8); - cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h = NULL; - cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v = NULL; - cpi->fn_ptr[BLOCK_8X8].svf_halfpix_hv = NULL; - cpi->fn_ptr[BLOCK_8X8].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x3); - cpi->fn_ptr[BLOCK_8X8].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x8); - cpi->fn_ptr[BLOCK_8X8].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x4d); - - cpi->fn_ptr[BLOCK_4X4].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4); - cpi->fn_ptr[BLOCK_4X4].vf = VARIANCE_INVOKE(&cpi->rtcd.variance, var4x4); - cpi->fn_ptr[BLOCK_4X4].svf = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar4x4); - cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h = NULL; - cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v = NULL; - cpi->fn_ptr[BLOCK_4X4].svf_halfpix_hv = NULL; - cpi->fn_ptr[BLOCK_4X4].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x3); - cpi->fn_ptr[BLOCK_4X4].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x8); - cpi->fn_ptr[BLOCK_4X4].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x4d); + BFP(BLOCK_16X16, vp8_sad16x16, vp8_variance16x16, vp8_sub_pixel_variance16x16, + vp8_variance_halfpixvar16x16_h, vp8_variance_halfpixvar16x16_v, + vp8_variance_halfpixvar16x16_hv, vp8_sad16x16x3, vp8_sad16x16x8, + vp8_sad16x16x4d) + + BFP(BLOCK_16X8, vp8_sad16x8, vp8_variance16x8, vp8_sub_pixel_variance16x8, + NULL, NULL, NULL, vp8_sad16x8x3, vp8_sad16x8x8, vp8_sad16x8x4d) + + BFP(BLOCK_8X16, vp8_sad8x16, vp8_variance8x16, vp8_sub_pixel_variance8x16, + NULL, NULL, NULL, vp8_sad8x16x3, vp8_sad8x16x8, vp8_sad8x16x4d) + + BFP(BLOCK_8X8, vp8_sad8x8, vp8_variance8x8, vp8_sub_pixel_variance8x8, + NULL, NULL, NULL, vp8_sad8x8x3, vp8_sad8x8x8, vp8_sad8x8x4d) + + BFP(BLOCK_4X4, vp8_sad4x4, vp8_variance4x4, vp8_sub_pixel_variance4x4, + NULL, NULL, NULL, vp8_sad4x4x3, vp8_sad4x4x8, vp8_sad4x4x4d) #if ARCH_X86 || ARCH_X86_64 - cpi->fn_ptr[BLOCK_16X16].copymem = VARIANCE_INVOKE(&cpi->rtcd.variance, copy32xn); - cpi->fn_ptr[BLOCK_16X8].copymem = VARIANCE_INVOKE(&cpi->rtcd.variance, copy32xn); - cpi->fn_ptr[BLOCK_8X16].copymem = VARIANCE_INVOKE(&cpi->rtcd.variance, copy32xn); - cpi->fn_ptr[BLOCK_8X8].copymem = VARIANCE_INVOKE(&cpi->rtcd.variance, copy32xn); - cpi->fn_ptr[BLOCK_4X4].copymem = VARIANCE_INVOKE(&cpi->rtcd.variance, copy32xn); + cpi->fn_ptr[BLOCK_16X16].copymem = vp8_copy32xn; + cpi->fn_ptr[BLOCK_16X8].copymem = vp8_copy32xn; + cpi->fn_ptr[BLOCK_8X16].copymem = vp8_copy32xn; + cpi->fn_ptr[BLOCK_8X8].copymem = vp8_copy32xn; + cpi->fn_ptr[BLOCK_4X4].copymem = vp8_copy32xn; 
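For reference, the BFP() helper introduced above is a straight textual expansion: one invocation fills all nine function pointers for a block size, reproducing exactly the per-field assignments the old code spelled out by hand. Expanding the BLOCK_4X4 invocation from this hunk gives:

/* Hand expansion of BFP(BLOCK_4X4, vp8_sad4x4, ...) from above. */
cpi->fn_ptr[BLOCK_4X4].sdf            = vp8_sad4x4;
cpi->fn_ptr[BLOCK_4X4].vf             = vp8_variance4x4;
cpi->fn_ptr[BLOCK_4X4].svf            = vp8_sub_pixel_variance4x4;
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h  = NULL;  /* no half-pel variants */
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v  = NULL;  /* below 16x16 */
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_hv = NULL;
cpi->fn_ptr[BLOCK_4X4].sdx3f          = vp8_sad4x4x3;
cpi->fn_ptr[BLOCK_4X4].sdx8f          = vp8_sad4x4x8;
cpi->fn_ptr[BLOCK_4X4].sdx4df         = vp8_sad4x4x4d;
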
#endif cpi->full_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, full_search); @@ -2369,8 +2333,7 @@ void vp8_remove_compressor(VP8_PTR *ptr) { static uint64_t calc_plane_error(unsigned char *orig, int orig_stride, unsigned char *recon, int recon_stride, - unsigned int cols, unsigned int rows, - vp8_variance_rtcd_vtable_t *rtcd) { + unsigned int cols, unsigned int rows) { unsigned int row, col; uint64_t total_sse = 0; int diff; @@ -2379,9 +2342,7 @@ static uint64_t calc_plane_error(unsigned char *orig, int orig_stride, for (col = 0; col + 16 <= cols; col += 16) { unsigned int sse; - VARIANCE_INVOKE(rtcd, mse16x16)(orig + col, orig_stride, - recon + col, recon_stride, - &sse); + vp8_mse16x16(orig + col, orig_stride, recon + col, recon_stride, &sse); total_sse += sse; } @@ -2433,8 +2394,7 @@ static void generate_psnr_packet(VP8_COMP *cpi) { pkt.kind = VPX_CODEC_PSNR_PKT; sse = calc_plane_error(orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride, - width, height, - IF_RTCD(&cpi->rtcd.variance)); + width, height); pkt.data.psnr.sse[0] = sse; pkt.data.psnr.sse[1] = sse; pkt.data.psnr.samples[0] = width * height; @@ -2445,8 +2405,7 @@ static void generate_psnr_packet(VP8_COMP *cpi) { sse = calc_plane_error(orig->u_buffer, orig->uv_stride, recon->u_buffer, recon->uv_stride, - width, height, - IF_RTCD(&cpi->rtcd.variance)); + width, height); pkt.data.psnr.sse[0] += sse; pkt.data.psnr.sse[2] = sse; pkt.data.psnr.samples[0] += width * height; @@ -2454,8 +2413,7 @@ static void generate_psnr_packet(VP8_COMP *cpi) { sse = calc_plane_error(orig->v_buffer, orig->uv_stride, recon->v_buffer, recon->uv_stride, - width, height, - IF_RTCD(&cpi->rtcd.variance)); + width, height); pkt.data.psnr.sse[0] += sse; pkt.data.psnr.sse[3] = sse; pkt.data.psnr.samples[0] += width * height; @@ -3427,8 +3385,7 @@ static void encode_frame_to_data_rate if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) { int last_q = Q; int kf_err = vp8_calc_ss_err(cpi->Source, - &cm->yv12_fb[cm->new_fb_idx], - IF_RTCD(&cpi->rtcd.variance)); + &cm->yv12_fb[cm->new_fb_idx]); int high_err_target = cpi->ambient_err; int low_err_target = (cpi->ambient_err >> 1); @@ -3620,8 +3577,7 @@ static void encode_frame_to_data_rate if (Loop == FALSE && cm->frame_type != KEY_FRAME && sf->search_best_filter) { if (mcomp_filter_index < mcomp_filters) { INT64 err = vp8_calc_ss_err(cpi->Source, - &cm->yv12_fb[cm->new_fb_idx], - IF_RTCD(&cpi->rtcd.variance)); + &cm->yv12_fb[cm->new_fb_idx]); INT64 rate = cpi->projected_frame_size << 8; mcomp_filter_cost[mcomp_filter_index] = (RDCOST(cpi->RDMULT, cpi->RDDIV, rate, err)); @@ -3683,8 +3639,7 @@ static void encode_frame_to_data_rate // the force key frame if (cpi->next_key_frame_forced && (cpi->twopass.frames_to_key == 0)) { cpi->ambient_err = vp8_calc_ss_err(cpi->Source, - &cm->yv12_fb[cm->new_fb_idx], - IF_RTCD(&cpi->rtcd.variance)); + &cm->yv12_fb[cm->new_fb_idx]); } // This frame's MVs are saved and will be used in next frame's MV @@ -3902,8 +3857,7 @@ static void encode_frame_to_data_rate vp8_clear_system_state(); // __asm emms; recon_err = vp8_calc_ss_err(cpi->Source, - &cm->yv12_fb[cm->new_fb_idx], - IF_RTCD(&cpi->rtcd.variance)); + &cm->yv12_fb[cm->new_fb_idx]); if (cpi->twopass.total_left_stats->coded_error != 0.0) fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d" @@ -4389,16 +4343,16 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon int64_t sq_error; ye = calc_plane_error(orig->y_buffer, orig->y_stride, - recon->y_buffer, recon->y_stride, 
orig->y_width, orig->y_height, - IF_RTCD(&cpi->rtcd.variance)); + recon->y_buffer, recon->y_stride, orig->y_width, + orig->y_height); ue = calc_plane_error(orig->u_buffer, orig->uv_stride, - recon->u_buffer, recon->uv_stride, orig->uv_width, orig->uv_height, - IF_RTCD(&cpi->rtcd.variance)); + recon->u_buffer, recon->uv_stride, orig->uv_width, + orig->uv_height); ve = calc_plane_error(orig->v_buffer, orig->uv_stride, - recon->v_buffer, recon->uv_stride, orig->uv_width, orig->uv_height, - IF_RTCD(&cpi->rtcd.variance)); + recon->v_buffer, recon->uv_stride, orig->uv_width, + orig->uv_height); sq_error = ye + ue + ve; @@ -4418,16 +4372,16 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon vp8_clear_system_state(); ye = calc_plane_error(orig->y_buffer, orig->y_stride, - pp->y_buffer, pp->y_stride, orig->y_width, orig->y_height, - IF_RTCD(&cpi->rtcd.variance)); + pp->y_buffer, pp->y_stride, orig->y_width, + orig->y_height); ue = calc_plane_error(orig->u_buffer, orig->uv_stride, - pp->u_buffer, pp->uv_stride, orig->uv_width, orig->uv_height, - IF_RTCD(&cpi->rtcd.variance)); + pp->u_buffer, pp->uv_stride, orig->uv_width, + orig->uv_height); ve = calc_plane_error(orig->v_buffer, orig->uv_stride, - pp->v_buffer, pp->uv_stride, orig->uv_width, orig->uv_height, - IF_RTCD(&cpi->rtcd.variance)); + pp->v_buffer, pp->uv_stride, orig->uv_width, + orig->uv_height); sq_error = ye + ue + ve; @@ -4440,8 +4394,7 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon cpi->totalp += frame_psnr2; frame_ssim2 = vp8_calc_ssim(cpi->Source, - &cm->post_proc_buffer, 1, &weight, - IF_RTCD(&cpi->rtcd.variance)); + &cm->post_proc_buffer, 1, &weight); cpi->summed_quality += frame_ssim2 * weight; cpi->summed_weights += weight; @@ -4460,7 +4413,7 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon if (cpi->b_calculate_ssimg) { double y, u, v, frame_all; frame_all = vp8_calc_ssimg(cpi->Source, cm->frame_to_show, - &y, &u, &v, IF_RTCD(&cpi->rtcd.variance)); + &y, &u, &v); cpi->total_ssimg_y += y; cpi->total_ssimg_u += u; cpi->total_ssimg_v += v; @@ -4603,19 +4556,19 @@ int vp8_set_internal_size(VP8_PTR comp, VPX_SCALING horiz_mode, VPX_SCALING vert -int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd) { +int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest) { int i, j; int Total = 0; unsigned char *src = source->y_buffer; unsigned char *dst = dest->y_buffer; - (void)rtcd; // Loop through the Y plane raw and reconstruction data summing (square differences) for (i = 0; i < source->y_height; i += 16) { for (j = 0; j < source->y_width; j += 16) { unsigned int sse; - Total += VARIANCE_INVOKE(rtcd, mse16x16)(src + j, source->y_stride, dst + j, dest->y_stride, &sse); + Total += vp8_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride, + &sse); } src += 16 * source->y_stride; diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index 1e7494039..79287e5fa 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -365,7 +365,6 @@ typedef struct { typedef struct VP8_ENCODER_RTCD { VP8_COMMON_RTCD *common; - vp8_variance_rtcd_vtable_t variance; vp8_fdct_rtcd_vtable_t fdct; vp8_encodemb_rtcd_vtable_t encodemb; vp8_search_rtcd_vtable_t search; @@ -420,9 +419,6 @@ typedef struct VP8_COMP { MACROBLOCK mb; VP8_COMMON common; - vp8_writer bc, bc2; - // bool_writer *bc2; - VP8_CONFIG oxcf; struct lookahead_ctx *lookahead; diff --git 
a/vp8/encoder/picklpf.c b/vp8/encoder/picklpf.c index 954997889..57bd41468 100644 --- a/vp8/encoder/picklpf.c +++ b/vp8/encoder/picklpf.c @@ -21,7 +21,8 @@ #include "vpx_ports/arm.h" #endif -extern int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd); +extern int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *dest); #if HAVE_ARMV7 extern void vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); #endif @@ -71,7 +72,8 @@ vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst vpx_memcpy(dst_y, src_y, ystride * (linestocopy + 16)); } -static int vp8_calc_partial_ssl_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, int Fraction, const vp8_variance_rtcd_vtable_t *rtcd) { +static int vp8_calc_partial_ssl_err(YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *dest, int Fraction) { int i, j; int Total = 0; int srcoffset, dstoffset; @@ -79,7 +81,6 @@ static int vp8_calc_partial_ssl_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONF unsigned char *dst = dest->y_buffer; int linestocopy = (source->y_height >> (Fraction + 4)); - (void)rtcd; if (linestocopy < 1) linestocopy = 1; @@ -97,7 +98,8 @@ static int vp8_calc_partial_ssl_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONF for (i = 0; i < linestocopy; i += 16) { for (j = 0; j < source->y_width; j += 16) { unsigned int sse; - Total += VARIANCE_INVOKE(rtcd, mse16x16)(src + j, source->y_stride, dst + j, dest->y_stride, &sse); + Total += vp8_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride, + &sse); } src += 16 * source->y_stride; @@ -179,7 +181,7 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) { // Get the err using the previous frame's filter value. 
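Both vp8_calc_ss_err() and vp8_calc_partial_ssl_err() now share the same tiled-SSE pattern: walk the Y plane in 16x16 tiles and accumulate vp8_mse16x16() results. A minimal sketch of that pattern (the dst stride advance mirrors the src advance shown in the hunk above, which this excerpt truncates):

/* Sketch: sum 16x16 MSE tiles over a Y plane pair. */
static int y_plane_sse(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest) {
  int i, j, Total = 0;
  unsigned char *src = source->y_buffer;
  unsigned char *dst = dest->y_buffer;

  for (i = 0; i < source->y_height; i += 16) {
    for (j = 0; j < source->y_width; j += 16) {
      unsigned int sse;
      Total += vp8_mse16x16(src + j, source->y_stride,
                            dst + j, dest->y_stride, &sse);
    }
    src += 16 * source->y_stride;
    dst += 16 * dest->y_stride;
  }
  return Total;
}
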
vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val); - best_err = vp8_calc_partial_ssl_err(sd, cm->frame_to_show, 3, IF_RTCD(&cpi->rtcd.variance)); + best_err = vp8_calc_partial_ssl_err(sd, cm->frame_to_show, 3); // Re-instate the unfiltered frame vp8_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show, 3); @@ -192,7 +194,7 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) { vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val); // Get the err for filtered frame - filt_err = vp8_calc_partial_ssl_err(sd, cm->frame_to_show, 3, IF_RTCD(&cpi->rtcd.variance)); + filt_err = vp8_calc_partial_ssl_err(sd, cm->frame_to_show, 3); // Re-instate the unfiltered frame vp8_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show, 3); @@ -221,7 +223,7 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) { vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val); // Get the err for filtered frame - filt_err = vp8_calc_partial_ssl_err(sd, cm->frame_to_show, 3, IF_RTCD(&cpi->rtcd.variance)); + filt_err = vp8_calc_partial_ssl_err(sd, cm->frame_to_show, 3); // Re-instate the unfiltered frame vp8_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show, 3); @@ -308,7 +310,7 @@ void vp8cx_pick_filter_level_sg(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi, int segme vp8cx_set_alt_lf_level(cpi, filt_mid); vp8_loop_filter_frame_segment(cm, &cpi->mb.e_mbd, filt_mid, segment); - best_err = vp8_calc_ss_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance)); + best_err = vp8_calc_ss_err(sd, cm->frame_to_show); filt_best = filt_mid; // Re-instate the unfiltered frame @@ -348,7 +350,7 @@ void vp8cx_pick_filter_level_sg(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi, int segme vp8cx_set_alt_lf_level(cpi, filt_low); vp8_loop_filter_frame_segment(cm, &cpi->mb.e_mbd, filt_low, segment); - filt_err = vp8_calc_ss_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance)); + filt_err = vp8_calc_ss_err(sd, cm->frame_to_show); // Re-instate the unfiltered frame #if HAVE_ARMV7 @@ -383,7 +385,7 @@ void vp8cx_pick_filter_level_sg(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi, int segme vp8cx_set_alt_lf_level(cpi, filt_high); vp8_loop_filter_frame_segment(cm, &cpi->mb.e_mbd, filt_high, segment); - filt_err = vp8_calc_ss_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance)); + filt_err = vp8_calc_ss_err(sd, cm->frame_to_show); // Re-instate the unfiltered frame #if HAVE_ARMV7 @@ -517,7 +519,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) { vp8cx_set_alt_lf_level(cpi, filt_mid); vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_mid); - best_err = vp8_calc_ss_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance)); + best_err = vp8_calc_ss_err(sd, cm->frame_to_show); filt_best = filt_mid; // Re-instate the unfiltered frame @@ -557,7 +559,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) { vp8cx_set_alt_lf_level(cpi, filt_low); vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low); - filt_err = vp8_calc_ss_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance)); + filt_err = vp8_calc_ss_err(sd, cm->frame_to_show); // Re-instate the unfiltered frame #if HAVE_ARMV7 @@ -592,7 +594,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) { vp8cx_set_alt_lf_level(cpi, filt_high); vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_high); - filt_err = vp8_calc_ss_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance)); + filt_err = vp8_calc_ss_err(sd, cm->frame_to_show); // Re-instate the 
unfiltered frame #if HAVE_ARMV7 diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c index 8ae3029ee..b6a1f27f8 100644 --- a/vp8/encoder/quantize.c +++ b/vp8/encoder/quantize.c @@ -22,7 +22,7 @@ extern int enc_debug; #endif #if CONFIG_HYBRIDTRANSFORM -void vp8_ht_quantize_b_4x4(BLOCK *b, BLOCKD *d) { +void vp8_ht_quantize_b_4x4(BLOCK *b, BLOCKD *d, TX_TYPE tx_type) { int i, rc, eob; int zbin; int x, y, z, sz; @@ -39,7 +39,7 @@ void vp8_ht_quantize_b_4x4(BLOCK *b, BLOCKD *d) { int const *pt_scan ; - switch(d->bmi.as_mode.tx_type) { + switch (tx_type) { case ADST_DCT : pt_scan = vp8_row_scan; break; @@ -653,12 +653,12 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x) { } /* save this macroblock QIndex for vp8_update_zbin_extra() */ - x->q_index = QIndex; + x->e_mbd.q_index = QIndex; } void vp8_update_zbin_extra(VP8_COMP *cpi, MACROBLOCK *x) { int i; - int QIndex = x->q_index; + int QIndex = x->e_mbd.q_index; int zbin_extra; // Y diff --git a/vp8/encoder/quantize.h b/vp8/encoder/quantize.h index ad3a3fc0e..1375ed0b0 100644 --- a/vp8/encoder/quantize.h +++ b/vp8/encoder/quantize.h @@ -31,7 +31,9 @@ #endif #if CONFIG_HYBRIDTRANSFORM -extern prototype_quantize_block(vp8_ht_quantize_b_4x4); +#define prototype_quantize_block_type(sym) \ + void (sym)(BLOCK *b, BLOCKD *d, TX_TYPE type) +extern prototype_quantize_block_type(vp8_ht_quantize_b_4x4); #endif #ifndef vp8_quantize_quantb_4x4 diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c index 2b5f699b6..570bedfe9 100644 --- a/vp8/encoder/ratectrl.c +++ b/vp8/encoder/ratectrl.c @@ -123,7 +123,7 @@ int vp8_bits_per_mb(FRAME_TYPE frame_type, int qindex) { void vp8_save_coding_context(VP8_COMP *cpi) { - CODING_CONTEXT *const cc = & cpi->coding_context; + CODING_CONTEXT *const cc = &cpi->coding_context; VP8_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &cpi->mb.e_mbd; @@ -195,7 +195,7 @@ void vp8_save_coding_context(VP8_COMP *cpi) { } void vp8_restore_coding_context(VP8_COMP *cpi) { - CODING_CONTEXT *const cc = & cpi->coding_context; + CODING_CONTEXT *const cc = &cpi->coding_context; VP8_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &cpi->mb.e_mbd; diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index c82a87d69..e3f989acd 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -520,7 +520,7 @@ int vp8_mbuverror_c(MACROBLOCK *mb) { return error; } -int VP8_UVSSE(MACROBLOCK *x, const vp8_variance_rtcd_vtable_t *rtcd) { +int vp8_uvsse(MACROBLOCK *x) { unsigned char *uptr, *vptr; unsigned char *upred_ptr = (*(x->block[16].base_src) + x->block[16].src); unsigned char *vpred_ptr = (*(x->block[20].base_src) + x->block[20].src); @@ -551,16 +551,14 @@ int VP8_UVSSE(MACROBLOCK *x, const vp8_variance_rtcd_vtable_t *rtcd) { vptr = x->e_mbd.pre.v_buffer + offset; if ((mv_row | mv_col) & 7) { - VARIANCE_INVOKE(rtcd, subpixvar8x8)(uptr, pre_stride, - (mv_col & 7) << 1, (mv_row & 7) << 1, upred_ptr, uv_stride, &sse2); - VARIANCE_INVOKE(rtcd, subpixvar8x8)(vptr, pre_stride, - (mv_col & 7) << 1, (mv_row & 7) << 1, vpred_ptr, uv_stride, &sse1); + vp8_sub_pixel_variance8x8(uptr, pre_stride, (mv_col & 7) << 1, + (mv_row & 7) << 1, upred_ptr, uv_stride, &sse2); + vp8_sub_pixel_variance8x8(vptr, pre_stride, (mv_col & 7) << 1, + (mv_row & 7) << 1, vpred_ptr, uv_stride, &sse1); sse2 += sse1; } else { - VARIANCE_INVOKE(rtcd, var8x8)(uptr, pre_stride, - upred_ptr, uv_stride, &sse2); - VARIANCE_INVOKE(rtcd, var8x8)(vptr, pre_stride, - vpred_ptr, uv_stride, &sse1); + vp8_variance8x8(uptr, pre_stride, upred_ptr, uv_stride, &sse2); + 
vp8_variance8x8(vptr, pre_stride, vpred_ptr, uv_stride, &sse1); sse2 += sse1; } return sse2; @@ -618,40 +616,39 @@ static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, PLANE_TYPE type, band = vp8_coef_bands; default_eob = 16; #if CONFIG_HYBRIDTRANSFORM - if (type == PLANE_TYPE_Y_WITH_DC && - mb->q_index < ACTIVE_HT && - mbmi->mode == B_PRED) { - tx_type = b->bmi.as_mode.tx_type; - switch (tx_type) { - case ADST_DCT: - scan = vp8_row_scan; - break; - - case DCT_ADST: - scan = vp8_col_scan; - break; - - default: - scan = vp8_default_zig_zag1d; - break; - } + if (type == PLANE_TYPE_Y_WITH_DC) { + tx_type = get_tx_type_4x4(xd, b); + if (tx_type != DCT_DCT) { + switch (tx_type) { + case ADST_DCT: + scan = vp8_row_scan; + break; + + case DCT_ADST: + scan = vp8_col_scan; + break; + default: + scan = vp8_default_zig_zag1d; + break; + } + } } #endif + break; case TX_8X8: scan = vp8_default_zig_zag1d_8x8; band = vp8_coef_bands_8x8; default_eob = 64; #if CONFIG_HYBRIDTRANSFORM8X8 - { + if (type == PLANE_TYPE_Y_WITH_DC) { BLOCKD *bb; int ib = (b - xd->block); if (ib < 16) { ib = (ib & 8) + ((ib & 4) >> 1); bb = xd->block + ib; - if (mbmi->mode == I8X8_PRED) - tx_type = bb->bmi.as_mode.tx_type; + tx_type = get_tx_type_8x8(xd, bb); } } #endif @@ -661,10 +658,9 @@ static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, PLANE_TYPE type, band = vp8_coef_bands_16x16; default_eob = 256; #if CONFIG_HYBRIDTRANSFORM16X16 - if (type == PLANE_TYPE_Y_WITH_DC && - mbmi->mode < I8X8_PRED && - mb->q_index < ACTIVE_HT16) - tx_type = b->bmi.as_mode.tx_type; + if (type == PLANE_TYPE_Y_WITH_DC) { + tx_type = get_tx_type_16x16(xd, b); + } #endif break; default: @@ -675,8 +671,6 @@ static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, PLANE_TYPE type, else seg_eob = default_eob; - //mbmi->mode = mode; - VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l); #if CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM16X16 @@ -871,6 +865,12 @@ static int vp8_rdcost_mby_16x16(MACROBLOCK *mb) { static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion, const VP8_ENCODER_RTCD *rtcd, int *skippable) { int d; + MACROBLOCKD *xd = &mb->e_mbd; + BLOCKD *b = &mb->e_mbd.block[0]; + BLOCK *be = &mb->block[0]; +#if CONFIG_HYBRIDTRANSFORM16X16 + TX_TYPE tx_type; +#endif ENCODEMB_INVOKE(&rtcd->encodemb, submby)( mb->src_diff, @@ -879,12 +879,9 @@ static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion, mb->block[0].src_stride); #if CONFIG_HYBRIDTRANSFORM16X16 - if ((mb->e_mbd.mode_info_context->mbmi.mode < I8X8_PRED) && - (mb->q_index < ACTIVE_HT16)) { - BLOCKD *b = &mb->e_mbd.block[0]; - BLOCK *be = &mb->block[0]; - txfm_map(b, pred_mode_conv(mb->e_mbd.mode_info_context->mbmi.mode)); - vp8_fht_c(be->src_diff, be->coeff, 32, b->bmi.as_mode.tx_type, 16); + tx_type = get_tx_type_16x16(xd, b); + if (tx_type != DCT_DCT) { + vp8_fht_c(be->src_diff, be->coeff, 32, tx_type, 16); } else vp8_transform_mby_16x16(mb); #else @@ -1145,12 +1142,7 @@ static int64_t rd_pick_intra4x4block(VP8_COMP *cpi, MACROBLOCK *x, BLOCK *be, int *bestrate, int *bestratey, int *bestdistortion) { B_PREDICTION_MODE mode; - -#if CONFIG_HYBRIDTRANSFORM - int QIndex = x->q_index; - int active_ht = (QIndex < ACTIVE_HT); - TX_TYPE best_tx_type; -#endif + MACROBLOCKD *xd = &x->e_mbd; #if CONFIG_COMP_INTRA_PRED B_PREDICTION_MODE mode2; @@ -1161,6 +1153,10 @@ static int64_t rd_pick_intra4x4block(VP8_COMP *cpi, MACROBLOCK *x, BLOCK *be, ENTROPY_CONTEXT ta = *a, tempa = *a; ENTROPY_CONTEXT tl = *l, templ = *l; +#if CONFIG_HYBRIDTRANSFORM + TX_TYPE 
tx_type = DCT_DCT; + TX_TYPE best_tx_type = DCT_DCT; +#endif /* * The predictor buffer is a 2d buffer with a stride of 16. Create * a temp buffer that meets the stride requirements, but we are only @@ -1177,11 +1173,6 @@ static int64_t rd_pick_intra4x4block(VP8_COMP *cpi, MACROBLOCK *x, BLOCK *be, int64_t this_rd; int ratey; - // TODO Temporarily ignore modes that need the above-right data. SB - // encoding means this data is not available for the bottom right MB - // Do we need to do this for mode2 also? - if (mode == B_LD_PRED || mode == B_VL_PRED) - continue; b->bmi.as_mode.first = mode; rate = bmode_costs[mode]; @@ -1197,48 +1188,49 @@ static int64_t rd_pick_intra4x4block(VP8_COMP *cpi, MACROBLOCK *x, BLOCK *be, #endif ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), subb)(be, b, 16); + b->bmi.as_mode.first = mode; #if CONFIG_HYBRIDTRANSFORM - if (active_ht) { - txfm_map(b, mode); - vp8_fht_c(be->src_diff, be->coeff, 32, b->bmi.as_mode.tx_type, 4); - vp8_ht_quantize_b_4x4(be, b); + tx_type = get_tx_type_4x4(xd, b); + if (tx_type != DCT_DCT) { + vp8_fht_c(be->src_diff, be->coeff, 32, tx_type, 4); + vp8_ht_quantize_b_4x4(be, b, tx_type); } else { x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32); x->quantize_b_4x4(be, b); } #else - x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32); - x->quantize_b_4x4(be, b); + x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32); + x->quantize_b_4x4(be, b); #endif - tempa = ta; - templ = tl; + tempa = ta; + templ = tl; - ratey = cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC, &tempa, &templ, TX_4X4); - rate += ratey; - distortion = ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), berr)( - be->coeff, b->dqcoeff, 16) >> 2; + ratey = cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC, &tempa, &templ, TX_4X4); + rate += ratey; + distortion = ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), berr)( + be->coeff, b->dqcoeff, 16) >> 2; - this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion); + this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion); - if (this_rd < best_rd) { - *bestrate = rate; - *bestratey = ratey; - *bestdistortion = distortion; - best_rd = this_rd; - *best_mode = mode; + if (this_rd < best_rd) { + *bestrate = rate; + *bestratey = ratey; + *bestdistortion = distortion; + best_rd = this_rd; + *best_mode = mode; #if CONFIG_HYBRIDTRANSFORM - best_tx_type = b->bmi.as_mode.tx_type ; + best_tx_type = tx_type; #endif #if CONFIG_COMP_INTRA_PRED - *best_second_mode = mode2; + *best_second_mode = mode2; #endif - *a = tempa; - *l = templ; - copy_predictor(best_predictor, b->predictor); - vpx_memcpy(best_dqcoeff, b->dqcoeff, 32); - } + *a = tempa; + *l = templ; + copy_predictor(best_predictor, b->predictor); + vpx_memcpy(best_dqcoeff, b->dqcoeff, 32); + } #if CONFIG_COMP_INTRA_PRED } #endif @@ -1249,16 +1241,15 @@ static int64_t rd_pick_intra4x4block(VP8_COMP *cpi, MACROBLOCK *x, BLOCK *be, #endif #if CONFIG_HYBRIDTRANSFORM - b->bmi.as_mode.tx_type = best_tx_type; - // inverse transform - if (active_ht) - vp8_ihtllm_c(best_dqcoeff, b->diff, 32, b->bmi.as_mode.tx_type, 4); + if (best_tx_type != DCT_DCT) + vp8_ihtllm_c(best_dqcoeff, b->diff, 32, best_tx_type, 4); else - IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)(best_dqcoeff, - b->diff, 32); + IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)( + best_dqcoeff, b->diff, 32); #else - IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)(best_dqcoeff, b->diff, 32); + IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)( + best_dqcoeff, b->diff, 32); #endif vp8_recon_b(best_predictor, b->diff, *(b->base_dst) + b->dst, 
b->dst_stride); @@ -1295,12 +1286,11 @@ static int64_t rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rat tl = (ENTROPY_CONTEXT *)&t_left; } - // TODO(agrange) - // vp8_intra_prediction_down_copy(xd); - xd->mode_info_context->mbmi.mode = B_PRED; bmode_costs = mb->inter_bmode_costs; + vp8_intra_prediction_down_copy(xd); + for (i = 0; i < 16; i++) { MODE_INFO *const mic = xd->mode_info_context; const int mis = xd->mode_info_stride; @@ -1413,9 +1403,6 @@ static int64_t rd_pick_intra16x16mby_mode(VP8_COMP *cpi, int64_t this_rd; MACROBLOCKD *xd = &x->e_mbd; -#if CONFIG_HYBRIDTRANSFORM16X16 - int best_txtype, rd_txtype; -#endif #if CONFIG_TX_SELECT int i; for (i = 0; i < NB_TXFM_MODES; i++) @@ -1449,9 +1436,6 @@ static int64_t rd_pick_intra16x16mby_mode(VP8_COMP *cpi, this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion); -#if CONFIG_HYBRIDTRANSFORM16X16 - rd_txtype = x->e_mbd.block[0].bmi.as_mode.tx_type; -#endif if (this_rd < best_rd) { mode_selected = mode; @@ -1463,9 +1447,6 @@ static int64_t rd_pick_intra16x16mby_mode(VP8_COMP *cpi, *Rate = rate; *rate_y = ratey; *Distortion = distortion; -#if CONFIG_HYBRIDTRANSFORM16X16 - best_txtype = rd_txtype; -#endif *skippable = skip; } @@ -1486,9 +1467,6 @@ static int64_t rd_pick_intra16x16mby_mode(VP8_COMP *cpi, mbmi->txfm_size = txfm_size; mbmi->mode = mode_selected; -#if CONFIG_HYBRIDTRANSFORM16X16 - x->e_mbd.block[0].bmi.as_mode.tx_type = best_txtype; -#endif #if CONFIG_COMP_INTRA_PRED mbmi->second_mode = mode2_selected; @@ -1539,6 +1517,7 @@ static int64_t rd_pick_intra8x8block(VP8_COMP *cpi, MACROBLOCK *x, int ib, // FIXME rate for compound mode and second intrapred mode rate = mode_costs[mode]; + b->bmi.as_mode.first = mode; #if CONFIG_COMP_INTRA_PRED if (mode2 == (MB_PREDICTION_MODE)(DC_PRED - 1)) { @@ -1555,10 +1534,11 @@ static int64_t rd_pick_intra8x8block(VP8_COMP *cpi, MACROBLOCK *x, int ib, if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) { #if CONFIG_HYBRIDTRANSFORM8X8 - txfm_map(b, pred_mode_conv(mode)); - vp8_fht_c(be->src_diff, (x->block + idx)->coeff, 32, - b->bmi.as_mode.tx_type, 8); - + TX_TYPE tx_type = get_tx_type_8x8(xd, b); + if (tx_type != DCT_DCT) + vp8_fht_c(be->src_diff, (x->block + idx)->coeff, 32, tx_type, 8); + else + x->vp8_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32); #else x->vp8_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32); #endif @@ -1596,8 +1576,8 @@ static int64_t rd_pick_intra8x8block(VP8_COMP *cpi, MACROBLOCK *x, int ib, ta0 = *(a + vp8_block2above[ib]); ta1 = *(a + vp8_block2above[ib + 1]); - tl0 = *(l + vp8_block2above[ib]); - tl1 = *(l + vp8_block2above[ib + 4]); + tl0 = *(l + vp8_block2left[ib]); + tl1 = *(l + vp8_block2left[ib + 4]); rate_t = cost_coeffs(x, xd->block + ib, PLANE_TYPE_Y_WITH_DC, &ta0, &tl0, TX_4X4); rate_t += cost_coeffs(x, xd->block + ib + 1, PLANE_TYPE_Y_WITH_DC, @@ -1839,7 +1819,7 @@ static int64_t rd_inter16x16_uv_8x8(VP8_COMP *cpi, MACROBLOCK *x, int *rate, static int64_t rd_inter4x4_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate, - int *distortion, int fullpixel) { + int *distortion, int *skippable, int fullpixel) { vp8_build_inter4x4_predictors_mbuv(&x->e_mbd); ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride); @@ -1849,6 +1829,7 @@ static int64_t rd_inter4x4_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate, *rate = rd_cost_mbuv(x); *distortion = ENCODEMB_INVOKE(&cpi->rtcd.encodemb, mbuverr)(x) / 4; + *skippable = mbuv_is_skippable_4x4(&x->e_mbd); return 
RDCOST(x->rdmult, x->rddiv, *rate, *distortion); } @@ -2105,7 +2086,7 @@ static int labels2mode( int_mv *best_ref_mv, int_mv *second_best_ref_mv, DEC_MVCOSTS) { - MACROBLOCKD *const xd = & x->e_mbd; + MACROBLOCKD *const xd = &x->e_mbd; MODE_INFO *const mic = xd->mode_info_context; MB_MODE_INFO * mbmi = &mic->mbmi; const int mis = xd->mode_info_stride; @@ -2199,30 +2180,19 @@ static int labels2mode( return cost; } -static int rdcost_mbsegment_y(MACROBLOCK *mb, const int *labels, - int which_label, ENTROPY_CONTEXT *ta, - ENTROPY_CONTEXT *tl) { - int b, cost = 0; - MACROBLOCKD *xd = &mb->e_mbd; - - for (b = 0; b < 16; b++) - if (labels[ b] == which_label) - cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_Y_WITH_DC, - ta + vp8_block2above[b], - tl + vp8_block2left[b], TX_4X4); - - return cost; - -} - -static unsigned int vp8_encode_inter_mb_segment(MACROBLOCK *x, - int const *labels, - int which_label, - const VP8_ENCODER_RTCD *rtcd) { +static int64_t encode_inter_mb_segment(MACROBLOCK *x, + int const *labels, + int which_label, + int *labelyrate, + int *distortion, + ENTROPY_CONTEXT *ta, + ENTROPY_CONTEXT *tl, + const VP8_ENCODER_RTCD *rtcd) { int i; - unsigned int distortion = 0; MACROBLOCKD *xd = &x->e_mbd; + *labelyrate = 0; + *distortion = 0; for (i = 0; i < 16; i++) { if (labels[i] == which_label) { BLOCKD *bd = &x->e_mbd.block[i]; @@ -2234,18 +2204,65 @@ static unsigned int vp8_encode_inter_mb_segment(MACROBLOCK *x, vp8_build_2nd_inter_predictors_b(bd, 16, xd->subpixel_predict_avg); ENCODEMB_INVOKE(&rtcd->encodemb, subb)(be, bd, 16); x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32); - - // set to 0 no way to account for 2nd order DC so discount - // be->coeff[0] = 0; x->quantize_b_4x4(be, bd); - thisdistortion = ENCODEMB_INVOKE(&rtcd->encodemb, berr)( - be->coeff, bd->dqcoeff, 16) / 4; - distortion += thisdistortion; + thisdistortion = vp8_block_error_c(be->coeff, bd->dqcoeff, 16); + *distortion += thisdistortion; + *labelyrate += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC, + ta + vp8_block2above[i], + tl + vp8_block2left[i], TX_4X4); } } - return distortion; + *distortion >>= 2; + return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion); } +static int64_t encode_inter_mb_segment_8x8(MACROBLOCK *x, + int const *labels, + int which_label, + int *labelyrate, + int *distortion, + ENTROPY_CONTEXT *ta, + ENTROPY_CONTEXT *tl, + const VP8_ENCODER_RTCD *rtcd) { + int i, j; + MACROBLOCKD *xd = &x->e_mbd; + const int iblock[4] = { 0, 1, 4, 5 }; + + *distortion = 0; + *labelyrate = 0; + for (i = 0; i < 4; i++) { + int ib = vp8_i8x8_block[i]; + + if (labels[ib] == which_label) { + BLOCKD *bd = &xd->block[ib]; + BLOCK *be = &x->block[ib]; + int thisdistortion; + + vp8_build_inter_predictors4b(xd, bd, 16); + if (xd->mode_info_context->mbmi.second_ref_frame) + vp8_build_2nd_inter_predictors4b(xd, bd, 16); + vp8_subtract_4b_c(be, bd, 16); + + for (j = 0; j < 4; j += 2) { + bd = &xd->block[ib + iblock[j]]; + be = &x->block[ib + iblock[j]]; + x->vp8_short_fdct8x4(be->src_diff, be->coeff, 32); + x->quantize_b_4x4_pair(be, be + 1, bd, bd + 1); + thisdistortion = vp8_block_error_c(be->coeff, bd->dqcoeff, 32); + *distortion += thisdistortion; + *labelyrate += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC, + ta + vp8_block2above[ib + iblock[j]], + tl + vp8_block2left[ib + iblock[j]], TX_4X4); + *labelyrate += cost_coeffs(x, bd + 1, PLANE_TYPE_Y_WITH_DC, + ta + vp8_block2above[ib + iblock[j] + 1], + tl + vp8_block2left[ib + iblock[j]], + TX_4X4); + } + } + } + *distortion >>= 2; + return RDCOST(x->rdmult, 
x->rddiv, *labelyrate, *distortion); +} static const unsigned int segmentation_to_sseshift[4] = {3, 3, 2, 0}; @@ -2284,7 +2301,7 @@ int mv_check_bounds(MACROBLOCK *x, int_mv *mv) { static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi, unsigned int segmentation, int_mv seg_mvs[16 /* n_blocks */][MAX_REF_FRAMES - 1]) { - int i; + int i, j; int const *labels; int br = 0, bd = 0; B_PREDICTION_MODE this_mode; @@ -2296,6 +2313,7 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, int rate = 0; int sbr = 0, sbd = 0; int segmentyrate = 0; + uint8_t best_eobs[16]; vp8_variance_fn_ptr_t *v_fn_ptr; @@ -2454,21 +2472,27 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, mv_check_bounds(x, &second_mode_mv[this_mode])) continue; - distortion = vp8_encode_inter_mb_segment( - x, labels, i, - IF_RTCD(&cpi->rtcd)); - - labelyrate = rdcost_mbsegment_y(x, labels, i, ta_s, tl_s); + if (segmentation == BLOCK_4X4) { + this_rd = encode_inter_mb_segment(x, labels, i, &labelyrate, + &distortion, + ta_s, tl_s, IF_RTCD(&cpi->rtcd)); + } else { + this_rd = encode_inter_mb_segment_8x8(x, labels, i, &labelyrate, + &distortion, ta_s, tl_s, + IF_RTCD(&cpi->rtcd)); + } + this_rd += RDCOST(x->rdmult, x->rddiv, rate, 0); rate += labelyrate; - this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion); - if (this_rd < best_label_rd) { sbr = rate; sbd = distortion; bestlabelyrate = labelyrate; mode_selected = this_mode; best_label_rd = this_rd; + for (j = 0; j < 16; j++) + if (labels[j] == i) + best_eobs[j] = x->e_mbd.block[j].eob; vpx_memcpy(ta_b, ta_s, sizeof(ENTROPY_CONTEXT_PLANES)); vpx_memcpy(tl_b, tl_s, sizeof(ENTROPY_CONTEXT_PLANES)); @@ -2509,7 +2533,7 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, if (mbmi->second_ref_frame) bsi->second_mvs[i].as_mv = x->partition_info->bmi[i].second_mv.as_mv; bsi->modes[i] = x->partition_info->bmi[i].mode; - bsi->eobs[i] = bd->eob; + bsi->eobs[i] = best_eobs[i]; } } } @@ -2531,7 +2555,7 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv, int_mv *second_best_ref_mv, int64_t best_rd, int *mdcounts, int *returntotrate, int *returnyrate, int *returndistortion, - int mvthresh, + int *skippable, int mvthresh, int_mv seg_mvs[BLOCK_MAX_SEGMENTS - 1][16 /* n_blocks */][MAX_REF_FRAMES - 1]) { int i; BEST_SEG_INFO bsi; @@ -2627,6 +2651,7 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x, *returntotrate = bsi.r; *returndistortion = bsi.d; *returnyrate = bsi.segment_yrate; + *skippable = mby_is_skippable_4x4(&x->e_mbd, 0); /* save partitions */ mbmi->partitioning = bsi.segment_num; @@ -3319,10 +3344,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int unsigned int ref_costs[MAX_REF_FRAMES]; int_mv seg_mvs[BLOCK_MAX_SEGMENTS - 1][16 /* n_blocks */][MAX_REF_FRAMES - 1]; -#if CONFIG_HYBRIDTRANSFORM16X16 - int best_txtype, rd_txtype; -#endif - vpx_memset(mode8x8, 0, sizeof(mode8x8)); vpx_memset(&frame_mv, 0, sizeof(frame_mv)); vpx_memset(&best_mbmode, 0, sizeof(best_mbmode)); @@ -3546,9 +3567,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int // FIXME compound intra prediction vp8_build_intra_predictors_mby(&x->e_mbd); macro_block_yrd(cpi, x, &rate_y, &distortion, &skippable, txfm_cache); -#if CONFIG_HYBRIDTRANSFORM16X16 - rd_txtype = x->e_mbd.block[0].bmi.as_mode.tx_type; -#endif rate2 += rate_y; distortion2 += distortion; rate2 += x->mbmode_cost[x->e_mbd.frame_type][mbmi->mode]; @@ -3709,6 +3727,7 @@ void 
vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int tmp_rd = vp8_rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv, second_ref, best_yrd, mdcounts, &rate, &rate_y, &distortion, + &skippable, this_rd_thresh, seg_mvs); rate2 += rate; distortion2 += distortion; @@ -3722,9 +3741,13 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int // If even the 'Y' rd value of split is higher than best so far // then dont bother looking at UV if (tmp_rd < best_yrd) { - rd_inter4x4_uv(cpi, x, &rate_uv, &distortion_uv, cpi->common.full_pixel); + int uv_skippable; + + rd_inter4x4_uv(cpi, x, &rate_uv, &distortion_uv, &uv_skippable, + cpi->common.full_pixel); rate2 += rate_uv; distortion2 += distortion_uv; + skippable = skippable && uv_skippable; } else { this_rd = INT64_MAX; disable_skip = 1; @@ -3883,8 +3906,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int if (threshold < x->encode_breakout) threshold = x->encode_breakout; - var = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16) - (*(b->base_src), b->src_stride, + var = vp8_variance16x16(*(b->base_src), b->src_stride, x->e_mbd.predictor, 16, &sse); if (sse < threshold) { @@ -3894,7 +3916,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int if ((sse - var < q2dc *q2dc >> 4) || (sse / 2 > var && sse - var < 64)) { // Check u and v to make sure skip is ok - int sse2 = VP8_UVSSE(x, IF_RTCD(&cpi->rtcd.variance)); + int sse2 = vp8_uvsse(x); if (sse2 * 2 < threshold) { x->skip = 1; distortion2 = sse + sse2; @@ -3906,22 +3928,22 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int disable_skip = 1; this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); - - break; } } } } - vp8_build_1st_inter16x16_predictors_mbuv(&x->e_mbd, &xd->predictor[256], - &xd->predictor[320], 8); - if (is_comp_pred) - vp8_build_2nd_inter16x16_predictors_mbuv(&x->e_mbd, - &xd->predictor[256], + if (!x->skip) { + vp8_build_1st_inter16x16_predictors_mbuv(&x->e_mbd, &xd->predictor[256], &xd->predictor[320], 8); - inter_mode_cost(cpi, x, this_mode, &rate2, &distortion2, - &rate_y, &distortion, &rate_uv, &distortion_uv, - &skippable, txfm_cache); + if (is_comp_pred) + vp8_build_2nd_inter16x16_predictors_mbuv(&x->e_mbd, + &xd->predictor[256], + &xd->predictor[320], 8); + inter_mode_cost(cpi, x, this_mode, &rate2, &distortion2, + &rate_y, &distortion, &rate_uv, &distortion_uv, + &skippable, txfm_cache); + } if (is_comp_pred) mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY; else @@ -4019,10 +4041,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int // Note index of best mode so far best_mode_index = mode_index; -#if CONFIG_HYBRIDTRANSFORM16X16 - best_txtype = rd_txtype; -#endif - if (this_mode <= B_PRED) { if (mbmi->txfm_size != TX_4X4 && this_mode != B_PRED @@ -4195,11 +4213,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int } } -#if CONFIG_HYBRIDTRANSFORM16X16 - if (best_mbmode.mode < I8X8_PRED) - xd->mode_info_context->bmi[0].as_mode.tx_type = best_txtype; -#endif - if (best_mbmode.mode == I8X8_PRED) set_i8x8_block_modes(x, mode8x8); @@ -4304,10 +4317,6 @@ void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, TX_SIZE txfm_size_16x16; int i; -#if CONFIG_HYBRIDTRANSFORM16X16 - int best_txtype; -#endif - mbmi->ref_frame = INTRA_FRAME; rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv, &uv_intra_skippable); @@ -4329,10 +4338,6 @@ void 
vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, &rate16x16_tokenonly, &dist16x16, &y_intra16x16_skippable, txfm_cache); mode16x16 = mbmi->mode; -#if CONFIG_HYBRIDTRANSFORM16X16 - best_txtype = xd->block[0].bmi.as_mode.tx_type; - xd->mode_info_context->bmi[0].as_mode.tx_type = best_txtype; -#endif txfm_size_16x16 = mbmi->txfm_size; // FIXME(rbultje) support transform-size selection @@ -4402,10 +4407,6 @@ void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, mbmi->mode = mode16x16; rate = rate16x16 + rateuv8x8; dist = dist16x16 + (distuv8x8 >> 2); -#if CONFIG_HYBRIDTRANSFORM16X16 - // save this into supermacroblock coding decision buffer - xd->mode_info_context->bmi[0].as_mode.tx_type = best_txtype; -#endif #if CONFIG_TX_SELECT for (i = 0; i < NB_TXFM_MODES; i++) { x->mb_context[xd->mb_index].txfm_rd_diff[i] = error16x16 - txfm_cache[i]; @@ -4801,8 +4802,8 @@ int64_t vp8_rd_pick_inter_mode_sb(VP8_COMP *cpi, MACROBLOCK *x, if (threshold < x->encode_breakout) threshold = x->encode_breakout; - var = VARIANCE_INVOKE(&cpi->rtcd.variance, var32x32)(*(b->base_src), - b->src_stride, xd->dst.y_buffer, xd->dst.y_stride, &sse); + var = vp8_variance32x32(*(b->base_src), b->src_stride, + xd->dst.y_buffer, xd->dst.y_stride, &sse); if (sse < threshold) { unsigned int q2dc = xd->block[24].dequant[0]; @@ -4812,11 +4813,9 @@ int64_t vp8_rd_pick_inter_mode_sb(VP8_COMP *cpi, MACROBLOCK *x, (sse / 2 > var && sse - var < 64)) { // Check u and v to make sure skip is ok unsigned int sse2, sse3; - var += VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16) - (x->src.u_buffer, x->src.uv_stride, + var += vp8_variance16x16(x->src.u_buffer, x->src.uv_stride, xd->dst.u_buffer, xd->dst.uv_stride, &sse2); - var += VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16) - (x->src.v_buffer, x->src.uv_stride, + var += vp8_variance16x16(x->src.v_buffer, x->src.uv_stride, xd->dst.v_buffer, xd->dst.uv_stride, &sse3); sse2 += sse3; if (sse2 * 2 < threshold) { diff --git a/vp8/encoder/segmentation.c b/vp8/encoder/segmentation.c index e88b80d34..e85bb45ce 100644 --- a/vp8/encoder/segmentation.c +++ b/vp8/encoder/segmentation.c @@ -161,8 +161,8 @@ static int cost_segmap(MACROBLOCKD *xd, } void choose_segmap_coding_method(VP8_COMP *cpi) { - VP8_COMMON *const cm = & cpi->common; - MACROBLOCKD *const xd = & cpi->mb.e_mbd; + VP8_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->mb.e_mbd; int i; int tot_count; diff --git a/vp8/encoder/ssim.c b/vp8/encoder/ssim.c index d3d9711dc..865496ae2 100644 --- a/vp8/encoder/ssim.c +++ b/vp8/encoder/ssim.c @@ -11,18 +11,10 @@ #include "onyx_int.h" -void vp8_ssim_parms_16x16_c -( - unsigned char *s, - int sp, - unsigned char *r, - int rp, - unsigned long *sum_s, - unsigned long *sum_r, - unsigned long *sum_sq_s, - unsigned long *sum_sq_r, - unsigned long *sum_sxr -) { +void vp8_ssim_parms_16x16_c(unsigned char *s, int sp, unsigned char *r, + int rp, unsigned long *sum_s, unsigned long *sum_r, + unsigned long *sum_sq_s, unsigned long *sum_sq_r, + unsigned long *sum_sxr) { int i, j; for (i = 0; i < 16; i++, s += sp, r += rp) { for (j = 0; j < 16; j++) { @@ -34,18 +26,10 @@ void vp8_ssim_parms_16x16_c } } } -void vp8_ssim_parms_8x8_c -( - unsigned char *s, - int sp, - unsigned char *r, - int rp, - unsigned long *sum_s, - unsigned long *sum_r, - unsigned long *sum_sq_s, - unsigned long *sum_sq_r, - unsigned long *sum_sxr -) { +void vp8_ssim_parms_8x8_c(unsigned char *s, int sp, unsigned char *r, int rp, + unsigned long *sum_s, unsigned long *sum_r, + unsigned long *sum_sq_s, unsigned long 
*sum_sq_r, + unsigned long *sum_sxr) { int i, j; for (i = 0; i < 8; i++, s += sp, r += rp) { for (j = 0; j < 8; j++) { @@ -61,15 +45,9 @@ void vp8_ssim_parms_8x8_c const static int64_t cc1 = 26634; // (64^2*(.01*255)^2 const static int64_t cc2 = 239708; // (64^2*(.03*255)^2 -static double similarity -( - unsigned long sum_s, - unsigned long sum_r, - unsigned long sum_sq_s, - unsigned long sum_sq_r, - unsigned long sum_sxr, - int count -) { +static double similarity(unsigned long sum_s, unsigned long sum_r, + unsigned long sum_sq_s, unsigned long sum_sq_r, + unsigned long sum_sxr, int count) { int64_t ssim_n, ssim_d; int64_t c1, c2; @@ -87,23 +65,22 @@ static double similarity return ssim_n * 1.0 / ssim_d; } -static double ssim_16x16(unsigned char *s, int sp, unsigned char *r, int rp, - const vp8_variance_rtcd_vtable_t *rtcd) { +static double ssim_16x16(unsigned char *s, int sp, unsigned char *r, int rp) { unsigned long sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; - SSIMPF_INVOKE(rtcd, 16x16)(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); + vp8_ssim_parms_16x16(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, + &sum_sxr); return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 256); } -static double ssim_8x8(unsigned char *s, int sp, unsigned char *r, int rp, - const vp8_variance_rtcd_vtable_t *rtcd) { +static double ssim_8x8(unsigned char *s, int sp, unsigned char *r, int rp) { unsigned long sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; - SSIMPF_INVOKE(rtcd, 8x8)(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); + vp8_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, + &sum_sxr); return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64); } // TODO: (jbb) tried to scale this function such that we may be able to use it // for distortion metric in mode selection code ( provided we do a reconstruction) -long dssim(unsigned char *s, int sp, unsigned char *r, int rp, - const vp8_variance_rtcd_vtable_t *rtcd) { +long dssim(unsigned char *s, int sp, unsigned char *r, int rp) { unsigned long sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; int64_t ssim3; int64_t ssim_n1, ssim_n2; @@ -115,7 +92,8 @@ long dssim(unsigned char *s, int sp, unsigned char *r, int rp, c1 = cc1 * 16; c2 = cc2 * 16; - SSIMPF_INVOKE(rtcd, 16x16)(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); + vp8_ssim_parms_16x16(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, + &sum_sxr); ssim_n1 = (2 * sum_s * sum_r + c1); ssim_n2 = ((int64_t) 2 * 256 * sum_sxr - (int64_t) 2 * sum_s * sum_r + c2); @@ -137,16 +115,8 @@ long dssim(unsigned char *s, int sp, unsigned char *r, int rp, // We are using a 8x8 moving window with starting location of each 8x8 window // on the 4x4 pixel grid. Such arrangement allows the windows to overlap // block boundaries to penalize blocking artifacts. 
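A sketch of what one window of that moving scan computes after the RTCD removal, using the plain C parms routine defined in this file (s/r point at co-located 8x8 blocks in the two frames, sp/rp are their strides, and count is 64 for an 8x8 window):

/* One 8x8 SSIM window: gather the five running sums, then normalize. */
static double ssim_one_window(unsigned char *s, int sp,
                              unsigned char *r, int rp) {
  unsigned long sum_s = 0, sum_r = 0, sum_sq_s = 0,
                sum_sq_r = 0, sum_sxr = 0;

  vp8_ssim_parms_8x8_c(s, sp, r, rp, &sum_s, &sum_r,
                       &sum_sq_s, &sum_sq_r, &sum_sxr);
  return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
}
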
-double vp8_ssim2 -( - unsigned char *img1, - unsigned char *img2, - int stride_img1, - int stride_img2, - int width, - int height, - const vp8_variance_rtcd_vtable_t *rtcd -) { +double vp8_ssim2(unsigned char *img1, unsigned char *img2, int stride_img1, + int stride_img2, int width, int height) { int i, j; int samples = 0; double ssim_total = 0; @@ -154,7 +124,7 @@ double vp8_ssim2 // sample point start with each 4x4 location for (i = 0; i < height - 8; i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { for (j = 0; j < width - 8; j += 4) { - double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2, rtcd); + double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2); ssim_total += v; samples++; } @@ -162,28 +132,22 @@ double vp8_ssim2 ssim_total /= samples; return ssim_total; } -double vp8_calc_ssim -( - YV12_BUFFER_CONFIG *source, - YV12_BUFFER_CONFIG *dest, - int lumamask, - double *weight, - const vp8_variance_rtcd_vtable_t *rtcd -) { +double vp8_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, + int lumamask, double *weight) { double a, b, c; double ssimv; a = vp8_ssim2(source->y_buffer, dest->y_buffer, source->y_stride, dest->y_stride, source->y_width, - source->y_height, rtcd); + source->y_height); b = vp8_ssim2(source->u_buffer, dest->u_buffer, source->uv_stride, dest->uv_stride, source->uv_width, - source->uv_height, rtcd); + source->uv_height); c = vp8_ssim2(source->v_buffer, dest->v_buffer, source->uv_stride, dest->uv_stride, source->uv_width, - source->uv_height, rtcd); + source->uv_height); ssimv = a * .8 + .1 * (b + c); @@ -192,29 +156,22 @@ double vp8_calc_ssim return ssimv; } -double vp8_calc_ssimg -( - YV12_BUFFER_CONFIG *source, - YV12_BUFFER_CONFIG *dest, - double *ssim_y, - double *ssim_u, - double *ssim_v, - const vp8_variance_rtcd_vtable_t *rtcd -) { +double vp8_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, + double *ssim_y, double *ssim_u, double *ssim_v) { double ssim_all = 0; double a, b, c; a = vp8_ssim2(source->y_buffer, dest->y_buffer, source->y_stride, dest->y_stride, source->y_width, - source->y_height, rtcd); + source->y_height); b = vp8_ssim2(source->u_buffer, dest->u_buffer, source->uv_stride, dest->uv_stride, source->uv_width, - source->uv_height, rtcd); + source->uv_height); c = vp8_ssim2(source->v_buffer, dest->v_buffer, source->uv_stride, dest->uv_stride, source->uv_width, - source->uv_height, rtcd); + source->uv_height); *ssim_y = a; *ssim_u = b; *ssim_v = c; diff --git a/vp8/encoder/tokenize.c b/vp8/encoder/tokenize.c index c72c1e7e7..d46637a3e 100644 --- a/vp8/encoder/tokenize.c +++ b/vp8/encoder/tokenize.c @@ -171,6 +171,7 @@ static void tokenize1st_order_b_16x16(MACROBLOCKD *xd, t->skip_eob_node = pt == 0 && ((band > 0 && type != PLANE_TYPE_Y_NO_DC) || (band > 1 && type == PLANE_TYPE_Y_NO_DC)); + assert(vp8_coef_encodings[t->Token].Len - t->skip_eob_node > 0); if (!dry_run) { #if CONFIG_HYBRIDTRANSFORM16X16 if (tx_type != DCT_DCT) @@ -310,8 +311,7 @@ static void tokenize1st_order_b_8x8(MACROBLOCKD *xd, TOKENEXTRA *t = *tp; /* store tokens starting here */ const short *qcoeff_ptr = b->qcoeff; #if CONFIG_HYBRIDTRANSFORM8X8 - TX_TYPE tx_type = xd->mode_info_context->mbmi.mode == I8X8_PRED ? 
- get_tx_type(xd, b) : DCT_DCT; + TX_TYPE tx_type = get_tx_type(xd, b); #endif const int eob = b->eob; int seg_eob = 64; @@ -427,103 +427,6 @@ static void tokenize1st_order_chroma_4x4(MACROBLOCKD *xd, } } -#if CONFIG_HYBRIDTRANSFORM -static void tokenize1st_order_ht_4x4(MACROBLOCKD *xd, - TOKENEXTRA **tp, - PLANE_TYPE type, - VP8_COMP *cpi, - int dry_run) { - unsigned int block; - const BLOCKD *b = xd->block; - int pt; /* near block/prev token context index */ - TOKENEXTRA *t = *tp;/* store tokens starting here */ - ENTROPY_CONTEXT * a; - ENTROPY_CONTEXT * l; - int const *pt_scan ; - int seg_eob = 16; - int segment_id = xd->mode_info_context->mbmi.segment_id; - - if ( segfeature_active( xd, segment_id, SEG_LVL_EOB ) ) { - seg_eob = get_segdata( xd, segment_id, SEG_LVL_EOB ); - } - - /* Luma */ - for (block = 0; block < 16; block++, b++) { - const int eob = b->eob; - TX_TYPE tx_type = DCT_DCT; - const int tmp1 = vp8_block2above[block]; - const int tmp2 = vp8_block2left[block]; - const int16_t *qcoeff_ptr = b->qcoeff; - int c = (type == PLANE_TYPE_Y_NO_DC) ? 1 : 0; - - a = (ENTROPY_CONTEXT *)xd->above_context + tmp1; - l = (ENTROPY_CONTEXT *)xd->left_context + tmp2; - VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l); - - if( xd->mode_info_context->mbmi.mode == B_PRED ) { - tx_type = get_tx_type(xd, b); - } - - // assign scanning order for luma components coded in intra4x4 mode - if ((xd->mode_info_context->mbmi.mode == B_PRED) && - (type == PLANE_TYPE_Y_WITH_DC)) { - switch (tx_type) { - case ADST_DCT: - pt_scan = vp8_row_scan; - break; - case DCT_ADST: - pt_scan = vp8_col_scan; - break; - default : - pt_scan = vp8_default_zig_zag1d; - break; - } - } else { - pt_scan = vp8_default_zig_zag1d; - } - - do { - const int band = vp8_coef_bands[c]; - int token; - - if (c < eob) { - const int rc = pt_scan[c]; - const int v = qcoeff_ptr[rc]; - - t->Extra = vp8_dct_value_tokens_ptr[v].Extra; - token = vp8_dct_value_tokens_ptr[v].Token; - } else - token = DCT_EOB_TOKEN; - - t->Token = token; - if (tx_type != DCT_DCT) - t->context_tree = cpi->common.fc.hybrid_coef_probs[type][band][pt]; - else - t->context_tree = cpi->common.fc.coef_probs[type][band][pt]; - - t->skip_eob_node = pt == 0 && ((band > 0 && type != PLANE_TYPE_Y_NO_DC) || - (band > 1 && type == PLANE_TYPE_Y_NO_DC)); - assert(vp8_coef_encodings[t->Token].Len - t->skip_eob_node > 0); - - if (!dry_run) { - if (tx_type != DCT_DCT) - ++cpi->hybrid_coef_counts[type][band][pt][token]; - else - ++cpi->coef_counts [type][band][pt][token]; - } - pt = vp8_prev_token_class[token]; - ++t; - } while (c < eob && ++c < seg_eob); - - *tp = t; - pt = (c != !type); /* 0 <-> all coeff data is zero */ - *a = *l = pt; - } - - tokenize1st_order_chroma_4x4(xd, tp, cpi, dry_run); -} -#endif - static void tokenize1st_order_b_4x4(MACROBLOCKD *xd, TOKENEXTRA **tp, PLANE_TYPE type, @@ -536,6 +439,7 @@ static void tokenize1st_order_b_4x4(MACROBLOCKD *xd, ENTROPY_CONTEXT *a, *l; int seg_eob = 16; int segment_id = xd->mode_info_context->mbmi.segment_id; + int const *pt_scan = vp8_default_zig_zag1d; if (segfeature_active(xd, segment_id, SEG_LVL_EOB)) { seg_eob = get_segdata(xd, segment_id, SEG_LVL_EOB); @@ -547,6 +451,20 @@ static void tokenize1st_order_b_4x4(MACROBLOCKD *xd, const int16_t *qcoeff_ptr = b->qcoeff; int c = (type == PLANE_TYPE_Y_NO_DC) ? 
1 : 0; +#if CONFIG_HYBRIDTRANSFORM + TX_TYPE tx_type = get_tx_type(xd, &xd->block[block]); + switch (tx_type) { + case ADST_DCT: + pt_scan = vp8_row_scan; + break; + case DCT_ADST: + pt_scan = vp8_col_scan; + break; + default : + pt_scan = vp8_default_zig_zag1d; + break; + } +#endif a = (ENTROPY_CONTEXT *)xd->above_context + vp8_block2above[block]; l = (ENTROPY_CONTEXT *)xd->left_context + vp8_block2left[block]; VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l); @@ -558,7 +476,7 @@ static void tokenize1st_order_b_4x4(MACROBLOCKD *xd, int token; if (c < eob) { - const int rc = vp8_default_zig_zag1d[c]; + const int rc = pt_scan[c]; const int v = qcoeff_ptr[rc]; t->Extra = vp8_dct_value_tokens_ptr[v].Extra; @@ -567,13 +485,24 @@ static void tokenize1st_order_b_4x4(MACROBLOCKD *xd, token = DCT_EOB_TOKEN; t->Token = token; - t->context_tree = cpi->common.fc.coef_probs[type][band][pt]; +#if CONFIG_HYBRIDTRANSFORM + if (tx_type != DCT_DCT) + t->context_tree = cpi->common.fc.hybrid_coef_probs[type][band][pt]; + else +#endif + t->context_tree = cpi->common.fc.coef_probs[type][band][pt]; t->skip_eob_node = pt == 0 && ((band > 0 && type != PLANE_TYPE_Y_NO_DC) || (band > 1 && type == PLANE_TYPE_Y_NO_DC)); assert(vp8_coef_encodings[t->Token].Len - t->skip_eob_node > 0); - if (!dry_run) - ++cpi->coef_counts[type][band][pt][token]; + if (!dry_run) { +#if CONFIG_HYBRIDTRANSFORM + if (tx_type != DCT_DCT) + ++cpi->hybrid_coef_counts[type][band][pt][token]; + else +#endif + ++cpi->coef_counts[type][band][pt][token]; + } pt = vp8_prev_token_class[token]; ++t; } while (c < eob && ++c < seg_eob); @@ -674,12 +603,6 @@ void vp8_tokenize_mb(VP8_COMP *cpi, int skip_inc; int segment_id = xd->mode_info_context->mbmi.segment_id; -#if CONFIG_HYBRIDTRANSFORM - int QIndex = cpi->mb.q_index; - int active_ht = (QIndex < ACTIVE_HT) && - (xd->mode_info_context->mbmi.mode == B_PRED); -#endif - if (!segfeature_active(xd, segment_id, SEG_LVL_EOB) || (get_segdata(xd, segment_id, SEG_LVL_EOB) != 0)) { skip_inc = 1; @@ -784,12 +707,7 @@ void vp8_tokenize_mb(VP8_COMP *cpi, } } } else { -#if CONFIG_HYBRIDTRANSFORM - if (active_ht) - tokenize1st_order_ht_4x4(xd, t, plane_type, cpi, dry_run); - else -#endif - tokenize1st_order_b_4x4(xd, t, plane_type, cpi, dry_run); + tokenize1st_order_b_4x4(xd, t, plane_type, cpi, dry_run); } if (dry_run) *t = t_backup; @@ -1078,8 +996,7 @@ static __inline void stuff1st_order_b_8x8(MACROBLOCKD *xd, int pt; /* near block/prev token context index */ TOKENEXTRA *t = *tp; /* store tokens starting here */ #if CONFIG_HYBRIDTRANSFORM8X8 - TX_TYPE tx_type = xd->mode_info_context->mbmi.mode == I8X8_PRED ? - get_tx_type(xd, b) : DCT_DCT; + TX_TYPE tx_type = get_tx_type(xd, b); #endif const int band = vp8_coef_bands_8x8[(type == PLANE_TYPE_Y_NO_DC) ? 
1 : 0]; VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l); diff --git a/vp8/encoder/variance.h b/vp8/encoder/variance.h index a2fadfc4c..cdeb390c3 100644 --- a/vp8/encoder/variance.h +++ b/vp8/encoder/variance.h @@ -12,507 +12,73 @@ #ifndef VARIANCE_H #define VARIANCE_H -#include "vpx_config.h" - -#define prototype_sad(sym)\ - unsigned int (sym)\ - (\ - const unsigned char *src_ptr, \ - int source_stride, \ - const unsigned char *ref_ptr, \ - int ref_stride, \ - int max_sad\ - ) - -#define prototype_sad_multi_same_address(sym)\ - void (sym)\ - (\ - const unsigned char *src_ptr, \ - int source_stride, \ - const unsigned char *ref_ptr, \ - int ref_stride, \ - unsigned int *sad_array\ - ) - -#define prototype_sad_multi_same_address_1(sym)\ - void (sym)\ - (\ - const unsigned char *src_ptr, \ - int source_stride, \ - const unsigned char *ref_ptr, \ - int ref_stride, \ - unsigned short *sad_array\ - ) - -#define prototype_sad_multi_dif_address(sym)\ - void (sym)\ - (\ - const unsigned char *src_ptr, \ - int source_stride, \ - unsigned char *ref_ptr[4], \ - int ref_stride, \ - unsigned int *sad_array\ - ) - -#define prototype_variance(sym) \ - unsigned int (sym) \ - (\ - const unsigned char *src_ptr, \ - int source_stride, \ - const unsigned char *ref_ptr, \ - int ref_stride, \ - unsigned int *sse\ - ) - -#define prototype_variance2(sym) \ - unsigned int (sym) \ - (\ - const unsigned char *src_ptr, \ - int source_stride, \ - const unsigned char *ref_ptr, \ - int ref_stride, \ - unsigned int *sse,\ - int *sum\ - ) - -#define prototype_subpixvariance(sym) \ - unsigned int (sym) \ - ( \ - const unsigned char *src_ptr, \ - int source_stride, \ - int xoffset, \ - int yoffset, \ - const unsigned char *ref_ptr, \ - int Refstride, \ - unsigned int *sse \ - ); - -#define prototype_ssimpf(sym) \ - void (sym) \ - ( \ - unsigned char *s, \ - int sp, \ - unsigned char *r, \ - int rp, \ - unsigned long *sum_s, \ - unsigned long *sum_r, \ - unsigned long *sum_sq_s, \ - unsigned long *sum_sq_r, \ - unsigned long *sum_sxr \ - ); - -#define prototype_getmbss(sym) unsigned int (sym)(const short *) - -#define prototype_get16x16prederror(sym)\ - unsigned int (sym)\ - (\ - const unsigned char *src_ptr, \ - int source_stride, \ - const unsigned char *ref_ptr, \ - int ref_stride \ - ) - -#if ARCH_X86 || ARCH_X86_64 -#include "x86/variance_x86.h" -#endif - -#if ARCH_ARM -#include "arm/variance_arm.h" -#endif - -#ifndef vp8_variance_sad4x4 -#define vp8_variance_sad4x4 vp8_sad4x4_c -#endif -extern prototype_sad(vp8_variance_sad4x4); - -#ifndef vp8_variance_sad8x8 -#define vp8_variance_sad8x8 vp8_sad8x8_c -#endif -extern prototype_sad(vp8_variance_sad8x8); - -#ifndef vp8_variance_sad8x16 -#define vp8_variance_sad8x16 vp8_sad8x16_c -#endif -extern prototype_sad(vp8_variance_sad8x16); - -#ifndef vp8_variance_sad16x8 -#define vp8_variance_sad16x8 vp8_sad16x8_c -#endif -extern prototype_sad(vp8_variance_sad16x8); - -#ifndef vp8_variance_sad16x16 -#define vp8_variance_sad16x16 vp8_sad16x16_c -#endif -extern prototype_sad(vp8_variance_sad16x16); - -#ifndef vp8_variance_sad32x32 -#define vp8_variance_sad32x32 vp8_sad32x32_c -#endif -extern prototype_sad(vp8_variance_sad32x32); - -// -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - -#ifndef vp8_variance_sad32x32x3 -#define vp8_variance_sad32x32x3 vp8_sad32x32x3_c -#endif -extern prototype_sad_multi_same_address(vp8_variance_sad32x32x3); - -#ifndef vp8_variance_sad16x16x3 -#define vp8_variance_sad16x16x3 vp8_sad16x16x3_c -#endif -extern 
prototype_sad_multi_same_address(vp8_variance_sad16x16x3); - -#ifndef vp8_variance_sad16x8x3 -#define vp8_variance_sad16x8x3 vp8_sad16x8x3_c -#endif -extern prototype_sad_multi_same_address(vp8_variance_sad16x8x3); - -#ifndef vp8_variance_sad8x8x3 -#define vp8_variance_sad8x8x3 vp8_sad8x8x3_c -#endif -extern prototype_sad_multi_same_address(vp8_variance_sad8x8x3); - -#ifndef vp8_variance_sad8x16x3 -#define vp8_variance_sad8x16x3 vp8_sad8x16x3_c -#endif -extern prototype_sad_multi_same_address(vp8_variance_sad8x16x3); - -#ifndef vp8_variance_sad4x4x3 -#define vp8_variance_sad4x4x3 vp8_sad4x4x3_c -#endif -extern prototype_sad_multi_same_address(vp8_variance_sad4x4x3); - -#ifndef vp8_variance_sad32x32x8 -#define vp8_variance_sad32x32x8 vp8_sad32x32x8_c -#endif -extern prototype_sad_multi_same_address_1(vp8_variance_sad32x32x8); - -#ifndef vp8_variance_sad16x16x8 -#define vp8_variance_sad16x16x8 vp8_sad16x16x8_c -#endif -extern prototype_sad_multi_same_address_1(vp8_variance_sad16x16x8); - -#ifndef vp8_variance_sad16x8x8 -#define vp8_variance_sad16x8x8 vp8_sad16x8x8_c -#endif -extern prototype_sad_multi_same_address_1(vp8_variance_sad16x8x8); - -#ifndef vp8_variance_sad8x8x8 -#define vp8_variance_sad8x8x8 vp8_sad8x8x8_c -#endif -extern prototype_sad_multi_same_address_1(vp8_variance_sad8x8x8); - -#ifndef vp8_variance_sad8x16x8 -#define vp8_variance_sad8x16x8 vp8_sad8x16x8_c -#endif -extern prototype_sad_multi_same_address_1(vp8_variance_sad8x16x8); - -#ifndef vp8_variance_sad4x4x8 -#define vp8_variance_sad4x4x8 vp8_sad4x4x8_c -#endif -extern prototype_sad_multi_same_address_1(vp8_variance_sad4x4x8); - -// -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - -#ifndef vp8_variance_sad32x32x4d -#define vp8_variance_sad32x32x4d vp8_sad32x32x4d_c -#endif -extern prototype_sad_multi_dif_address(vp8_variance_sad32x32x4d); - -#ifndef vp8_variance_sad16x16x4d -#define vp8_variance_sad16x16x4d vp8_sad16x16x4d_c -#endif -extern prototype_sad_multi_dif_address(vp8_variance_sad16x16x4d); - -#ifndef vp8_variance_sad16x8x4d -#define vp8_variance_sad16x8x4d vp8_sad16x8x4d_c -#endif -extern prototype_sad_multi_dif_address(vp8_variance_sad16x8x4d); - -#ifndef vp8_variance_sad8x8x4d -#define vp8_variance_sad8x8x4d vp8_sad8x8x4d_c -#endif -extern prototype_sad_multi_dif_address(vp8_variance_sad8x8x4d); - -#ifndef vp8_variance_sad8x16x4d -#define vp8_variance_sad8x16x4d vp8_sad8x16x4d_c -#endif -extern prototype_sad_multi_dif_address(vp8_variance_sad8x16x4d); - -#ifndef vp8_variance_sad4x4x4d -#define vp8_variance_sad4x4x4d vp8_sad4x4x4d_c -#endif -extern prototype_sad_multi_dif_address(vp8_variance_sad4x4x4d); - -#if ARCH_X86 || ARCH_X86_64 -#ifndef vp8_variance_copy32xn -#define vp8_variance_copy32xn vp8_copy32xn_c -#endif -extern prototype_sad(vp8_variance_copy32xn); -#endif - -// -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - -#ifndef vp8_variance_var4x4 -#define vp8_variance_var4x4 vp8_variance4x4_c -#endif -extern prototype_variance(vp8_variance_var4x4); - -#ifndef vp8_variance_var8x8 -#define vp8_variance_var8x8 vp8_variance8x8_c -#endif -extern prototype_variance(vp8_variance_var8x8); - -#ifndef vp8_variance_var8x16 -#define vp8_variance_var8x16 vp8_variance8x16_c -#endif -extern prototype_variance(vp8_variance_var8x16); - -#ifndef vp8_variance_var16x8 -#define vp8_variance_var16x8 vp8_variance16x8_c -#endif -extern prototype_variance(vp8_variance_var16x8); - -#ifndef vp8_variance_var16x16 -#define vp8_variance_var16x16 vp8_variance16x16_c -#endif -extern prototype_variance(vp8_variance_var16x16); - 
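
NOTE: for readers following this deletion, the prototype_* macros above stamp out full C declarations, and each #ifndef/#define pair binds a generic vp8_variance_* name to its C fallback unless a platform header overrides it. A hand-expanded sketch of one such pair, using the prototype_variance macro shown earlier (illustration only, not part of the patch; assumes the surrounding vp8 headers):

/* "extern prototype_variance(vp8_variance_var16x16);" with the default
 * "#define vp8_variance_var16x16 vp8_variance16x16_c" in effect
 * expands to an ordinary declaration of the C reference kernel: */
extern unsigned int vp8_variance16x16_c(const unsigned char *src_ptr,
                                        int source_stride,
                                        const unsigned char *ref_ptr,
                                        int ref_stride,
                                        unsigned int *sse);

Every call site spelled vp8_variance_var16x16(...) therefore resolved at compile time; the typedefs added further down move that choice into a function-pointer table instead.
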
-#ifndef vp8_variance_var32x32 -#define vp8_variance_var32x32 vp8_variance32x32_c -#endif -extern prototype_variance(vp8_variance_var32x32); - -// -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - -#ifndef vp8_variance_subpixvar4x4 -#define vp8_variance_subpixvar4x4 vp8_sub_pixel_variance4x4_c -#endif -extern prototype_subpixvariance(vp8_variance_subpixvar4x4); - -#ifndef vp8_variance_subpixvar8x8 -#define vp8_variance_subpixvar8x8 vp8_sub_pixel_variance8x8_c -#endif -extern prototype_subpixvariance(vp8_variance_subpixvar8x8); - -#ifndef vp8_variance_subpixvar8x16 -#define vp8_variance_subpixvar8x16 vp8_sub_pixel_variance8x16_c -#endif -extern prototype_subpixvariance(vp8_variance_subpixvar8x16); - -#ifndef vp8_variance_subpixvar16x8 -#define vp8_variance_subpixvar16x8 vp8_sub_pixel_variance16x8_c -#endif -extern prototype_subpixvariance(vp8_variance_subpixvar16x8); - -#ifndef vp8_variance_subpixvar16x16 -#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_c -#endif -extern prototype_subpixvariance(vp8_variance_subpixvar16x16); - -#ifndef vp8_variance_subpixvar32x32 -#define vp8_variance_subpixvar32x32 vp8_sub_pixel_variance32x32_c -#endif -extern prototype_subpixvariance(vp8_variance_subpixvar32x32); - -#ifndef vp8_variance_halfpixvar16x16_h -#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_c -#endif -extern prototype_variance(vp8_variance_halfpixvar16x16_h); - -#ifndef vp8_variance_halfpixvar32x32_h -#define vp8_variance_halfpixvar32x32_h vp8_variance_halfpixvar32x32_h_c -#endif -extern prototype_variance(vp8_variance_halfpixvar32x32_h); - -#ifndef vp8_variance_halfpixvar16x16_v -#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_c -#endif -extern prototype_variance(vp8_variance_halfpixvar16x16_v); - -#ifndef vp8_variance_halfpixvar32x32_v -#define vp8_variance_halfpixvar32x32_v vp8_variance_halfpixvar32x32_v_c -#endif -extern prototype_variance(vp8_variance_halfpixvar32x32_v); - -#ifndef vp8_variance_halfpixvar16x16_hv -#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_c -#endif -extern prototype_variance(vp8_variance_halfpixvar16x16_hv); - -#ifndef vp8_variance_halfpixvar32x32_hv -#define vp8_variance_halfpixvar32x32_hv vp8_variance_halfpixvar32x32_hv_c -#endif -extern prototype_variance(vp8_variance_halfpixvar32x32_hv); - -#ifndef vp8_variance_subpixmse16x16 -#define vp8_variance_subpixmse16x16 vp8_sub_pixel_mse16x16_c -#endif -extern prototype_subpixvariance(vp8_variance_subpixmse16x16); - -#ifndef vp8_variance_subpixmse32x32 -#define vp8_variance_subpixmse32x32 vp8_sub_pixel_mse32x32_c -#endif -extern prototype_subpixvariance(vp8_variance_subpixmse32x32); - -// -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - -#ifndef vp8_variance_getmbss -#define vp8_variance_getmbss vp8_get_mb_ss_c -#endif -extern prototype_getmbss(vp8_variance_getmbss); - -#ifndef vp8_variance_mse16x16 -#define vp8_variance_mse16x16 vp8_mse16x16_c -#endif -extern prototype_variance(vp8_variance_mse16x16); - -#ifndef vp8_ssimpf_8x8 -#define vp8_ssimpf_8x8 vp8_ssim_parms_8x8_c -#endif -extern prototype_ssimpf(vp8_ssimpf_8x8) - -#ifndef vp8_ssimpf_16x16 -#define vp8_ssimpf_16x16 vp8_ssim_parms_16x16_c -#endif -extern prototype_ssimpf(vp8_ssimpf_16x16) - -#ifndef vp8_variance_satd16x16 -#define vp8_variance_satd16x16 vp8_satd16x16_c -#endif -extern prototype_variance(vp8_variance_satd16x16); - -typedef prototype_sad(*vp8_sad_fn_t); -typedef prototype_sad_multi_same_address(*vp8_sad_multi_fn_t); -typedef 
prototype_sad_multi_same_address_1(*vp8_sad_multi1_fn_t); -typedef prototype_sad_multi_dif_address(*vp8_sad_multi_d_fn_t); -typedef prototype_variance(*vp8_variance_fn_t); -typedef prototype_variance2(*vp8_variance2_fn_t); -typedef prototype_subpixvariance(*vp8_subpixvariance_fn_t); -typedef prototype_getmbss(*vp8_getmbss_fn_t); -typedef prototype_ssimpf(*vp8_ssimpf_fn_t); -typedef prototype_get16x16prederror(*vp8_get16x16prederror_fn_t); - -typedef struct { - vp8_sad_fn_t sad4x4; - vp8_sad_fn_t sad8x8; - vp8_sad_fn_t sad8x16; - vp8_sad_fn_t sad16x8; - vp8_sad_fn_t sad16x16; -#if CONFIG_SUPERBLOCKS - vp8_sad_fn_t sad32x32; -#endif - - vp8_variance_fn_t var4x4; - vp8_variance_fn_t var8x8; - vp8_variance_fn_t var8x16; - vp8_variance_fn_t var16x8; - vp8_variance_fn_t var16x16; -#if CONFIG_SUPERBLOCKS - vp8_variance_fn_t var32x32; -#endif - - vp8_subpixvariance_fn_t subpixvar4x4; - vp8_subpixvariance_fn_t subpixvar8x8; - vp8_subpixvariance_fn_t subpixvar8x16; - vp8_subpixvariance_fn_t subpixvar16x8; - vp8_subpixvariance_fn_t subpixvar16x16; -#if CONFIG_SUPERBLOCKS - vp8_subpixvariance_fn_t subpixvar32x32; -#endif - vp8_variance_fn_t halfpixvar16x16_h; - vp8_variance_fn_t halfpixvar32x32_h; - vp8_variance_fn_t halfpixvar16x16_v; -#if CONFIG_SUPERBLOCKS - vp8_variance_fn_t halfpixvar32x32_v; -#endif - vp8_variance_fn_t halfpixvar16x16_hv; -#if CONFIG_SUPERBLOCKS - vp8_variance_fn_t halfpixvar32x32_hv; -#endif - vp8_subpixvariance_fn_t subpixmse16x16; -#if CONFIG_SUPERBLOCKS - vp8_subpixvariance_fn_t subpixmse32x32; -#endif - - vp8_getmbss_fn_t getmbss; - vp8_variance_fn_t mse16x16; - -#if CONFIG_SUPERBLOCKS - vp8_sad_multi_fn_t sad32x32x3; -#endif - vp8_sad_multi_fn_t sad16x16x3; - vp8_sad_multi_fn_t sad16x8x3; - vp8_sad_multi_fn_t sad8x16x3; - vp8_sad_multi_fn_t sad8x8x3; - vp8_sad_multi_fn_t sad4x4x3; - -#if CONFIG_SUPERBLOCKS - vp8_sad_multi1_fn_t sad32x32x8; -#endif - vp8_sad_multi1_fn_t sad16x16x8; - vp8_sad_multi1_fn_t sad16x8x8; - vp8_sad_multi1_fn_t sad8x16x8; - vp8_sad_multi1_fn_t sad8x8x8; - vp8_sad_multi1_fn_t sad4x4x8; - -#if CONFIG_SUPERBLOCKS - vp8_sad_multi_d_fn_t sad32x32x4d; -#endif - vp8_sad_multi_d_fn_t sad16x16x4d; - vp8_sad_multi_d_fn_t sad16x8x4d; - vp8_sad_multi_d_fn_t sad8x16x4d; - vp8_sad_multi_d_fn_t sad8x8x4d; - vp8_sad_multi_d_fn_t sad4x4x4d; - -#if ARCH_X86 || ARCH_X86_64 - vp8_sad_fn_t copy32xn; -#endif - -#if CONFIG_INTERNAL_STATS - vp8_ssimpf_fn_t ssimpf_8x8; - vp8_ssimpf_fn_t ssimpf_16x16; -#endif - - vp8_variance_fn_t satd16x16; -} vp8_variance_rtcd_vtable_t; - -typedef struct { - vp8_sad_fn_t sdf; - vp8_variance_fn_t vf; - vp8_subpixvariance_fn_t svf; - vp8_variance_fn_t svf_halfpix_h; - vp8_variance_fn_t svf_halfpix_v; - vp8_variance_fn_t svf_halfpix_hv; - vp8_sad_multi_fn_t sdx3f; - vp8_sad_multi1_fn_t sdx8f; - vp8_sad_multi_d_fn_t sdx4df; -#if ARCH_X86 || ARCH_X86_64 - vp8_sad_fn_t copymem; -#endif +typedef unsigned int(*vp8_sad_fn_t)(const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int ref_stride, + unsigned int max_sad); + +typedef void (*vp8_copy32xn_fn_t)(const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int ref_stride, + int n); + +typedef void (*vp8_sad_multi_fn_t)(const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int ref_stride, + unsigned int *sad_array); + +typedef void (*vp8_sad_multi1_fn_t)(const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int ref_stride, + unsigned short *sad_array); + +typedef void 
(*vp8_sad_multi_d_fn_t)(const unsigned char *src_ptr, + int source_stride, + const unsigned char * const ref_ptr[], + int ref_stride, unsigned int *sad_array); + +typedef unsigned int (*vp8_variance_fn_t)(const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int ref_stride, + unsigned int *sse); + +typedef unsigned int (*vp8_subpixvariance_fn_t)(const unsigned char *src_ptr, + int source_stride, + int xoffset, + int yoffset, + const unsigned char *ref_ptr, + int Refstride, + unsigned int *sse); + +typedef void (*vp8_ssimpf_fn_t)(unsigned char *s, int sp, unsigned char *r, + int rp, unsigned long *sum_s, + unsigned long *sum_r, unsigned long *sum_sq_s, + unsigned long *sum_sq_r, + unsigned long *sum_sxr); + +typedef unsigned int (*vp8_getmbss_fn_t)(const short *); + +typedef unsigned int (*vp8_get16x16prederror_fn_t)(const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int ref_stride); + +typedef struct variance_vtable { + vp8_sad_fn_t sdf; + vp8_variance_fn_t vf; + vp8_subpixvariance_fn_t svf; + vp8_variance_fn_t svf_halfpix_h; + vp8_variance_fn_t svf_halfpix_v; + vp8_variance_fn_t svf_halfpix_hv; + vp8_sad_multi_fn_t sdx3f; + vp8_sad_multi1_fn_t sdx8f; + vp8_sad_multi_d_fn_t sdx4df; + vp8_copy32xn_fn_t copymem; } vp8_variance_fn_ptr_t; -#if CONFIG_RUNTIME_CPU_DETECT -#define VARIANCE_INVOKE(ctx,fn) (ctx)->fn -#define SSIMPF_INVOKE(ctx,fn) (ctx)->ssimpf_##fn -#else -#define VARIANCE_INVOKE(ctx,fn) vp8_variance_##fn -#define SSIMPF_INVOKE(ctx,fn) vp8_ssimpf_##fn -#endif - -#if CONFIG_NEWBESTREFMV -unsigned int vp8_sad2x16_c( - const unsigned char *src_ptr, - int src_stride, - const unsigned char *ref_ptr, - int ref_stride, - int max_sad); -unsigned int vp8_sad16x2_c( - const unsigned char *src_ptr, - int src_stride, - const unsigned char *ref_ptr, - int ref_stride, - int max_sad); -#endif - #endif diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h deleted file mode 100644 index 0971f11b0..000000000 --- a/vp8/encoder/x86/variance_x86.h +++ /dev/null @@ -1,328 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef VARIANCE_X86_H -#define VARIANCE_X86_H - - -/* Note: - * - * This platform is commonly built for runtime CPU detection. 
If you modify - * any of the function mappings present in this file, be sure to also update - * them in the function pointer initialization code - */ -#if HAVE_MMX -extern prototype_sad(vp8_sad4x4_mmx); -extern prototype_sad(vp8_sad8x8_mmx); -extern prototype_sad(vp8_sad8x16_mmx); -extern prototype_sad(vp8_sad16x8_mmx); -extern prototype_sad(vp8_sad16x16_mmx); -extern prototype_variance(vp8_variance4x4_mmx); -extern prototype_variance(vp8_variance8x8_mmx); -extern prototype_variance(vp8_variance8x16_mmx); -extern prototype_variance(vp8_variance16x8_mmx); -extern prototype_variance(vp8_variance16x16_mmx); -extern prototype_subpixvariance(vp8_sub_pixel_variance4x4_mmx); -extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_mmx); -extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_mmx); -extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_mmx); -extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_mmx); -extern prototype_variance(vp8_variance_halfpixvar16x16_h_mmx); -extern prototype_variance(vp8_variance_halfpixvar16x16_v_mmx); -extern prototype_variance(vp8_variance_halfpixvar16x16_hv_mmx); -extern prototype_subpixvariance(vp8_sub_pixel_mse16x16_mmx); -extern prototype_getmbss(vp8_get_mb_ss_mmx); -extern prototype_variance(vp8_mse16x16_mmx); -extern prototype_variance2(vp8_get8x8var_mmx); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp8_variance_sad4x4 -#define vp8_variance_sad4x4 vp8_sad4x4_mmx - -#undef vp8_variance_sad8x8 -#define vp8_variance_sad8x8 vp8_sad8x8_mmx - -#undef vp8_variance_sad8x16 -#define vp8_variance_sad8x16 vp8_sad8x16_mmx - -#undef vp8_variance_sad16x8 -#define vp8_variance_sad16x8 vp8_sad16x8_mmx - -#undef vp8_variance_sad16x16 -#define vp8_variance_sad16x16 vp8_sad16x16_mmx - -#undef vp8_variance_var4x4 -#define vp8_variance_var4x4 vp8_variance4x4_mmx - -#undef vp8_variance_var8x8 -#define vp8_variance_var8x8 vp8_variance8x8_mmx - -#undef vp8_variance_var8x16 -#define vp8_variance_var8x16 vp8_variance8x16_mmx - -#undef vp8_variance_var16x8 -#define vp8_variance_var16x8 vp8_variance16x8_mmx - -#undef vp8_variance_var16x16 -#define vp8_variance_var16x16 vp8_variance16x16_mmx - -#undef vp8_variance_subpixvar4x4 -#define vp8_variance_subpixvar4x4 vp8_sub_pixel_variance4x4_mmx - -#undef vp8_variance_subpixvar8x8 -#define vp8_variance_subpixvar8x8 vp8_sub_pixel_variance8x8_mmx - -#undef vp8_variance_subpixvar8x16 -#define vp8_variance_subpixvar8x16 vp8_sub_pixel_variance8x16_mmx - -#undef vp8_variance_subpixvar16x8 -#define vp8_variance_subpixvar16x8 vp8_sub_pixel_variance16x8_mmx - -#undef vp8_variance_subpixvar16x16 -#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_mmx - -#undef vp8_variance_halfpixvar16x16_h -#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_mmx - -#undef vp8_variance_halfpixvar16x16_v -#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_mmx - -#undef vp8_variance_halfpixvar16x16_hv -#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_mmx - -#undef vp8_variance_subpixmse16x16 -#define vp8_variance_subpixmse16x16 vp8_sub_pixel_mse16x16_mmx - -#undef vp8_variance_getmbss -#define vp8_variance_getmbss vp8_get_mb_ss_mmx - -#undef vp8_variance_mse16x16 -#define vp8_variance_mse16x16 vp8_mse16x16_mmx - -#endif -#endif - - -#if HAVE_SSE2 -extern prototype_sad(vp8_sad4x4_wmt); -extern prototype_sad(vp8_sad8x8_wmt); -extern prototype_sad(vp8_sad8x16_wmt); -extern prototype_sad(vp8_sad16x8_wmt); -extern prototype_sad(vp8_sad16x16_wmt); -extern 
prototype_sad(vp8_copy32xn_sse2); -extern prototype_variance(vp8_variance4x4_wmt); -extern prototype_variance(vp8_variance8x8_wmt); -extern prototype_variance(vp8_variance8x16_wmt); -extern prototype_variance(vp8_variance16x8_wmt); -extern prototype_variance(vp8_variance16x16_wmt); -extern prototype_subpixvariance(vp8_sub_pixel_variance4x4_wmt); -extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_wmt); -extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_wmt); -extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_wmt); -extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_wmt); -extern prototype_variance(vp8_variance_halfpixvar16x16_h_wmt); -extern prototype_variance(vp8_variance_halfpixvar16x16_v_wmt); -extern prototype_variance(vp8_variance_halfpixvar16x16_hv_wmt); -extern prototype_subpixvariance(vp8_sub_pixel_mse16x16_wmt); -extern prototype_getmbss(vp8_get_mb_ss_sse2); -extern prototype_variance(vp8_mse16x16_wmt); -extern prototype_variance2(vp8_get8x8var_sse2); -extern prototype_variance2(vp8_get16x16var_sse2); -extern prototype_ssimpf(vp8_ssim_parms_8x8_sse2) -extern prototype_ssimpf(vp8_ssim_parms_16x16_sse2) - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp8_variance_sad4x4 -#define vp8_variance_sad4x4 vp8_sad4x4_wmt - -#undef vp8_variance_sad8x8 -#define vp8_variance_sad8x8 vp8_sad8x8_wmt - -#undef vp8_variance_sad8x16 -#define vp8_variance_sad8x16 vp8_sad8x16_wmt - -#undef vp8_variance_sad16x8 -#define vp8_variance_sad16x8 vp8_sad16x8_wmt - -#undef vp8_variance_sad16x16 -#define vp8_variance_sad16x16 vp8_sad16x16_wmt - -#undef vp8_variance_copy32xn -#define vp8_variance_copy32xn vp8_copy32xn_sse2 - -#undef vp8_variance_var4x4 -#define vp8_variance_var4x4 vp8_variance4x4_wmt - -#undef vp8_variance_var8x8 -#define vp8_variance_var8x8 vp8_variance8x8_wmt - -#undef vp8_variance_var8x16 -#define vp8_variance_var8x16 vp8_variance8x16_wmt - -#undef vp8_variance_var16x8 -#define vp8_variance_var16x8 vp8_variance16x8_wmt - -#undef vp8_variance_var16x16 -#define vp8_variance_var16x16 vp8_variance16x16_wmt - -#undef vp8_variance_subpixvar4x4 -#define vp8_variance_subpixvar4x4 vp8_sub_pixel_variance4x4_wmt - -#undef vp8_variance_subpixvar8x8 -#define vp8_variance_subpixvar8x8 vp8_sub_pixel_variance8x8_wmt - -#undef vp8_variance_subpixvar8x16 -#define vp8_variance_subpixvar8x16 vp8_sub_pixel_variance8x16_wmt - -#undef vp8_variance_subpixvar16x8 -#define vp8_variance_subpixvar16x8 vp8_sub_pixel_variance16x8_wmt - -#undef vp8_variance_subpixvar16x16 -#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_wmt - -#undef vp8_variance_halfpixvar16x16_h -#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_wmt - -#undef vp8_variance_halfpixvar16x16_v -#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_wmt - -#undef vp8_variance_halfpixvar16x16_hv -#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_wmt - -#undef vp8_variance_subpixmse16x16 -#define vp8_variance_subpixmse16x16 vp8_sub_pixel_mse16x16_wmt - -#undef vp8_variance_getmbss -#define vp8_variance_getmbss vp8_get_mb_ss_sse2 - -#undef vp8_variance_mse16x16 -#define vp8_variance_mse16x16 vp8_mse16x16_wmt - -#if ARCH_X86_64 -#undef vp8_ssimpf_8x8 -#define vp8_ssimpf_8x8 vp8_ssim_parms_8x8_sse2 - -#undef vp8_ssimpf_16x16 -#define vp8_ssimpf_16x16 vp8_ssim_parms_16x16_sse2 -#endif - -#endif -#endif - - -#if HAVE_SSE3 -extern prototype_sad(vp8_sad16x16_sse3); -extern prototype_sad(vp8_sad16x8_sse3); -extern prototype_sad_multi_same_address(vp8_sad16x16x3_sse3); 
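
NOTE: the other half of this deleted header is the static-dispatch path. When CONFIG_RUNTIME_CPU_DETECT was off, each ISA block re-pointed the generic names at the fastest variant compiled in; a sketch of the pattern, copied in shape from the SSE2 block above:

#if !CONFIG_RUNTIME_CPU_DETECT
#undef  vp8_variance_var16x16
#define vp8_variance_var16x16 vp8_variance16x16_wmt  /* SSE2 kernel */
#endif

With runtime detection on, the same binding instead happened at startup by writing into the rtcd vtable (see the x86_csystemdependent.c hunk below); keeping those two paths in sync is the duplication this change removes, as the deleted comment in this file itself warns.
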
-extern prototype_sad_multi_same_address(vp8_sad16x8x3_sse3); -extern prototype_sad_multi_same_address(vp8_sad8x16x3_sse3); -extern prototype_sad_multi_same_address(vp8_sad8x8x3_sse3); -extern prototype_sad_multi_same_address(vp8_sad4x4x3_sse3); - -extern prototype_sad_multi_dif_address(vp8_sad16x16x4d_sse3); -extern prototype_sad_multi_dif_address(vp8_sad16x8x4d_sse3); -extern prototype_sad_multi_dif_address(vp8_sad8x16x4d_sse3); -extern prototype_sad_multi_dif_address(vp8_sad8x8x4d_sse3); -extern prototype_sad_multi_dif_address(vp8_sad4x4x4d_sse3); -extern prototype_sad(vp8_copy32xn_sse3); - -#if !CONFIG_RUNTIME_CPU_DETECT - -#undef vp8_variance_sad16x16 -#define vp8_variance_sad16x16 vp8_sad16x16_sse3 - -#undef vp8_variance_sad16x16x3 -#define vp8_variance_sad16x16x3 vp8_sad16x16x3_sse3 - -#undef vp8_variance_sad16x8x3 -#define vp8_variance_sad16x8x3 vp8_sad16x8x3_sse3 - -#undef vp8_variance_sad8x16x3 -#define vp8_variance_sad8x16x3 vp8_sad8x16x3_sse3 - -#undef vp8_variance_sad8x8x3 -#define vp8_variance_sad8x8x3 vp8_sad8x8x3_sse3 - -#undef vp8_variance_sad4x4x3 -#define vp8_variance_sad4x4x3 vp8_sad4x4x3_sse3 - -#undef vp8_variance_sad16x16x4d -#define vp8_variance_sad16x16x4d vp8_sad16x16x4d_sse3 - -#undef vp8_variance_sad16x8x4d -#define vp8_variance_sad16x8x4d vp8_sad16x8x4d_sse3 - -#undef vp8_variance_sad8x16x4d -#define vp8_variance_sad8x16x4d vp8_sad8x16x4d_sse3 - -#undef vp8_variance_sad8x8x4d -#define vp8_variance_sad8x8x4d vp8_sad8x8x4d_sse3 - -#undef vp8_variance_sad4x4x4d -#define vp8_variance_sad4x4x4d vp8_sad4x4x4d_sse3 - -#undef vp8_variance_copy32xn -#define vp8_variance_copy32xn vp8_copy32xn_sse3 - -#endif -#endif - - -#if HAVE_SSSE3 -extern prototype_sad_multi_same_address(vp8_sad16x16x3_ssse3); -extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3); -extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_ssse3); -extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_ssse3); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp8_variance_sad16x16x3 -#define vp8_variance_sad16x16x3 vp8_sad16x16x3_ssse3 - -#undef vp8_variance_sad16x8x3 -#define vp8_variance_sad16x8x3 vp8_sad16x8x3_ssse3 - -#undef vp8_variance_subpixvar16x8 -#define vp8_variance_subpixvar16x8 vp8_sub_pixel_variance16x8_ssse3 - -#undef vp8_variance_subpixvar16x16 -#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_ssse3 - -#endif -#endif - - -#if HAVE_SSE4_1 -extern prototype_sad_multi_same_address_1(vp8_sad16x16x8_sse4); -extern prototype_sad_multi_same_address_1(vp8_sad16x8x8_sse4); -extern prototype_sad_multi_same_address_1(vp8_sad8x16x8_sse4); -extern prototype_sad_multi_same_address_1(vp8_sad8x8x8_sse4); -extern prototype_sad_multi_same_address_1(vp8_sad4x4x8_sse4); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp8_variance_sad16x16x8 -#define vp8_variance_sad16x16x8 vp8_sad16x16x8_sse4 - -#undef vp8_variance_sad16x8x8 -#define vp8_variance_sad16x8x8 vp8_sad16x8x8_sse4 - -#undef vp8_variance_sad8x16x8 -#define vp8_variance_sad8x16x8 vp8_sad8x16x8_sse4 - -#undef vp8_variance_sad8x8x8 -#define vp8_variance_sad8x8x8 vp8_sad8x8x8_sse4 - -#undef vp8_variance_sad4x4x8 -#define vp8_variance_sad4x4x8 vp8_sad4x4x8_sse4 - -#endif -#endif - -#endif diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c index 71c51c14f..a169b493e 100644 --- a/vp8/encoder/x86/x86_csystemdependent.c +++ b/vp8/encoder/x86/x86_csystemdependent.c @@ -90,31 +90,6 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) { /* Override default functions with fastest ones for this CPU. 
*/ #if HAVE_MMX if (flags & HAS_MMX) { - cpi->rtcd.variance.sad16x16 = vp8_sad16x16_mmx; - cpi->rtcd.variance.sad16x8 = vp8_sad16x8_mmx; - cpi->rtcd.variance.sad8x16 = vp8_sad8x16_mmx; - cpi->rtcd.variance.sad8x8 = vp8_sad8x8_mmx; - cpi->rtcd.variance.sad4x4 = vp8_sad4x4_mmx; - - cpi->rtcd.variance.var4x4 = vp8_variance4x4_mmx; - cpi->rtcd.variance.var8x8 = vp8_variance8x8_mmx; - cpi->rtcd.variance.var8x16 = vp8_variance8x16_mmx; - cpi->rtcd.variance.var16x8 = vp8_variance16x8_mmx; - cpi->rtcd.variance.var16x16 = vp8_variance16x16_mmx; - - cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_mmx; - cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_mmx; - cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_mmx; - cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_mmx; - cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_mmx; - cpi->rtcd.variance.halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_mmx; - cpi->rtcd.variance.halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_mmx; - cpi->rtcd.variance.halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_mmx; - cpi->rtcd.variance.subpixmse16x16 = vp8_sub_pixel_mse16x16_mmx; - - cpi->rtcd.variance.mse16x16 = vp8_mse16x16_mmx; - cpi->rtcd.variance.getmbss = vp8_get_mb_ss_mmx; - cpi->rtcd.encodemb.berr = vp8_block_error_mmx; cpi->rtcd.encodemb.mberr = vp8_mbblock_error_mmx; cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_mmx; @@ -126,32 +101,6 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) { #if HAVE_SSE2 if (flags & HAS_SSE2) { - cpi->rtcd.variance.sad16x16 = vp8_sad16x16_wmt; - cpi->rtcd.variance.sad16x8 = vp8_sad16x8_wmt; - cpi->rtcd.variance.sad8x16 = vp8_sad8x16_wmt; - cpi->rtcd.variance.sad8x8 = vp8_sad8x8_wmt; - cpi->rtcd.variance.sad4x4 = vp8_sad4x4_wmt; - cpi->rtcd.variance.copy32xn = vp8_copy32xn_sse2; - - cpi->rtcd.variance.var4x4 = vp8_variance4x4_wmt; - cpi->rtcd.variance.var8x8 = vp8_variance8x8_wmt; - cpi->rtcd.variance.var8x16 = vp8_variance8x16_wmt; - cpi->rtcd.variance.var16x8 = vp8_variance16x8_wmt; - cpi->rtcd.variance.var16x16 = vp8_variance16x16_wmt; - - cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_wmt; - cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_wmt; - cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_wmt; - cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_wmt; - cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_wmt; - cpi->rtcd.variance.halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_wmt; - cpi->rtcd.variance.halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_wmt; - cpi->rtcd.variance.halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_wmt; - cpi->rtcd.variance.subpixmse16x16 = vp8_sub_pixel_mse16x16_wmt; - - cpi->rtcd.variance.mse16x16 = vp8_mse16x16_wmt; - cpi->rtcd.variance.getmbss = vp8_get_mb_ss_sse2; - cpi->rtcd.encodemb.berr = vp8_block_error_xmm; cpi->rtcd.encodemb.mberr = vp8_mbblock_error_xmm; cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_xmm; @@ -160,54 +109,20 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) { cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_sse2; cpi->rtcd.temporal.apply = vp8_temporal_filter_apply_sse2; -#if CONFIG_INTERNAL_STATS -#if ARCH_X86_64 - cpi->rtcd.variance.ssimpf_8x8 = vp8_ssim_parms_8x8_sse2; - cpi->rtcd.variance.ssimpf_16x16 = vp8_ssim_parms_16x16_sse2; -#endif -#endif } #endif #if HAVE_SSE3 if (flags & HAS_SSE3) { - cpi->rtcd.variance.sad16x16 = vp8_sad16x16_sse3; - cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_sse3; - cpi->rtcd.variance.sad16x8x3 = 
vp8_sad16x8x3_sse3; - cpi->rtcd.variance.sad8x16x3 = vp8_sad8x16x3_sse3; - cpi->rtcd.variance.sad8x8x3 = vp8_sad8x8x3_sse3; - cpi->rtcd.variance.sad4x4x3 = vp8_sad4x4x3_sse3; cpi->rtcd.search.full_search = vp8_full_search_sadx3; - cpi->rtcd.variance.sad16x16x4d = vp8_sad16x16x4d_sse3; - cpi->rtcd.variance.sad16x8x4d = vp8_sad16x8x4d_sse3; - cpi->rtcd.variance.sad8x16x4d = vp8_sad8x16x4d_sse3; - cpi->rtcd.variance.sad8x8x4d = vp8_sad8x8x4d_sse3; - cpi->rtcd.variance.sad4x4x4d = vp8_sad4x4x4d_sse3; - cpi->rtcd.variance.copy32xn = vp8_copy32xn_sse3; cpi->rtcd.search.diamond_search = vp8_diamond_search_sadx4; cpi->rtcd.search.refining_search = vp8_refining_search_sadx4; } #endif -#if HAVE_SSSE3 - if (flags & HAS_SSSE3) { - cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_ssse3; - cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_ssse3; - - cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_ssse3; - cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_ssse3; - } -#endif - - #if HAVE_SSE4_1 if (flags & HAS_SSE4_1) { - cpi->rtcd.variance.sad16x16x8 = vp8_sad16x16x8_sse4; - cpi->rtcd.variance.sad16x8x8 = vp8_sad16x8x8_sse4; - cpi->rtcd.variance.sad8x16x8 = vp8_sad8x16x8_sse4; - cpi->rtcd.variance.sad8x8x8 = vp8_sad8x8x8_sse4; - cpi->rtcd.variance.sad4x4x8 = vp8_sad4x4x8_sse4; cpi->rtcd.search.full_search = vp8_full_search_sadx8; } #endif diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk index 7058e316b..6d2f18080 100644 --- a/vp8/vp8cx.mk +++ b/vp8/vp8cx.mk @@ -92,7 +92,6 @@ VP8_CX_SRCS-yes += encoder/mbgraph.h VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodemb_x86.h VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/dct_x86.h VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/mcomp_x86.h -VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/variance_x86.h VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_x86.h VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/temporal_filter_x86.h VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/x86_csystemdependent.c
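
NOTE: taken together, these hunks leave vp8_variance_fn_ptr_t as the only dispatch surface for the SAD/variance kernels, and drop the rtcd vtable argument from the SSIM entry points. A minimal caller sketch, assuming a fn_ptr table the encoder has already populated and the relevant vp8 headers; the helper names below are illustrative, not from the patch:

/* Reach a variance kernel through the new function-pointer table. */
static unsigned int var_via_table(const vp8_variance_fn_ptr_t *fn,
                                  const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride) {
  unsigned int sse;
  /* fn->vf points at whichever block-size/ISA variant was installed. */
  return fn->vf(src, src_stride, ref, ref_stride, &sse);
}

/* And the SSIM entry point, per its simplified signature above:
 * no vp8_variance_rtcd_vtable_t argument is threaded through anymore. */
static double ssim_of(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest) {
  double weight;
  return vp8_calc_ssim(source, dest, 1, &weight);  /* lumamask = 1 */
}
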