125 files changed, 11695 insertions, 22081 deletions
diff --git a/vp9/common/generic/vp9_systemdependent.c b/vp9/common/generic/vp9_systemdependent.c
index b02f3f083..79092cd0e 100644
--- a/vp9/common/generic/vp9_systemdependent.c
+++ b/vp9/common/generic/vp9_systemdependent.c
@@ -11,8 +11,6 @@
 
 #include "./vpx_config.h"
 #include "vp9_rtcd.h"
-#include "vp9/common/vp9_subpixel.h"
-#include "vp9/common/vp9_loopfilter.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
 void vp9_machine_specific_config(VP9_COMMON *ctx) {
diff --git a/vp9/common/ppc/vp9_systemdependent.c b/vp9/common/ppc/vp9_systemdependent.c
index 106a2b763..02035191f 100644
--- a/vp9/common/ppc/vp9_systemdependent.c
+++ b/vp9/common/ppc/vp9_systemdependent.c
@@ -8,7 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vp9/common/vp9_subpixel.h"
 #include "vp9/common/vp9_loopfilter.h"
 #include "recon.h"
 #include "vp9/common/vp9_onyxc_int.h"
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index a2306f0d1..c3d6dae93 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -80,7 +80,6 @@ int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {
 
   for (i = 0; i < NUM_YV12_BUFFERS; i++) {
     oci->fb_idx_ref_cnt[i] = 0;
-    oci->yv12_fb[i].flags = 0;
     if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[i], width, height,
                                     VP9BORDERINPIXELS) < 0) {
       vp9_de_alloc_frame_buffers(oci);
@@ -88,15 +87,16 @@ int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {
     }
   }
 
-  oci->new_fb_idx = 0;
-  oci->lst_fb_idx = 1;
-  oci->gld_fb_idx = 2;
-  oci->alt_fb_idx = 3;
+  oci->new_fb_idx = NUM_YV12_BUFFERS - 1;
+  oci->fb_idx_ref_cnt[oci->new_fb_idx] = 1;
 
-  oci->fb_idx_ref_cnt[0] = 1;
-  oci->fb_idx_ref_cnt[1] = 1;
-  oci->fb_idx_ref_cnt[2] = 1;
-  oci->fb_idx_ref_cnt[3] = 1;
+  for (i = 0; i < 3; i++)
+    oci->active_ref_idx[i] = i;
+
+  for (i = 0; i < NUM_REF_FRAMES; i++) {
+    oci->ref_frame_map[i] = i;
+    oci->fb_idx_ref_cnt[i] = 1;
+  }
 
   if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame, width, 16,
                                   VP9BORDERINPIXELS) < 0) {
@@ -134,7 +134,8 @@ int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {
 
   oci->prev_mi = oci->prev_mip + oci->mode_info_stride + 1;
 
-  oci->above_context = vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * oci->mb_cols, 1);
+  oci->above_context =
+    vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * (3 + oci->mb_cols), 1);
 
   if (!oci->above_context) {
     vp9_de_alloc_frame_buffers(oci);
@@ -146,6 +147,7 @@ int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {
 
   return 0;
 }
+
 void vp9_setup_version(VP9_COMMON *cm) {
   if (cm->version & 0x4) {
     if (!CONFIG_EXPERIMENTAL)
@@ -204,9 +206,6 @@ void vp9_create_common(VP9_COMMON *oci) {
   /* Initialise reference frame sign bias structure to defaults */
   vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
 
-  /* Default disable buffer to buffer copying */
-  oci->copy_buffer_to_gf = 0;
-  oci->copy_buffer_to_arf = 0;
   oci->kf_ymode_probs_update = 0;
 }
 
@@ -220,8 +219,4 @@ void vp9_initialize_common() {
   vp9_entropy_mode_init();
 
   vp9_entropy_mv_init();
-
-#if CONFIG_NEWCOEFCONTEXT
-  vp9_init_neighbors();
-#endif
 }
diff --git a/vp9/common/vp9_blockd.c b/vp9/common/vp9_blockd.c
index 4ae8132bb..1eda3cc38 100644
--- a/vp9/common/vp9_blockd.c
+++ b/vp9/common/vp9_blockd.c
@@ -12,15 +12,15 @@
 #include "vp9/common/vp9_blockd.h"
 #include "vpx_mem/vpx_mem.h"
 
-const uint8_t vp9_block2left[TX_SIZE_MAX_SB][25] = {
-  {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8},
-  {0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8},
-  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 6, 6, 6, 6, 8},
-  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 6, 6, 6, 6, 8}
+const uint8_t vp9_block2left[TX_SIZE_MAX_SB][24] = {
+  {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7},
+  {0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6},
+  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 6, 6, 6, 6},
+  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 6, 6, 6, 6}
 };
-const uint8_t vp9_block2above[TX_SIZE_MAX_SB][25] = {
-  {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8},
-  {0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8},
-  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 6, 6, 6, 6, 8},
-  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 6, 6, 6, 6, 8}
+const uint8_t vp9_block2above[TX_SIZE_MAX_SB][24] = {
+  {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7},
+  {0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6},
+  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 6, 6, 6, 6},
+  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 6, 6, 6, 6}
 };
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index e838da221..b35c1c246 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -16,9 +16,9 @@ void vpx_log(const char *format, ...);
 
 #include "./vpx_config.h"
 #include "vpx_scale/yv12config.h"
+#include "vp9/common/vp9_convolve.h"
 #include "vp9/common/vp9_mv.h"
 #include "vp9/common/vp9_treecoder.h"
-#include "vp9/common/vp9_subpixel.h"
 #include "vpx_ports/mem.h"
 #include "vp9/common/vp9_common.h"
 
@@ -47,27 +47,13 @@ void vpx_log(const char *format, ...);
 #define MAX_MV_REFS 9
 #define MAX_MV_REF_CANDIDATES 4
 
-#if CONFIG_DWTDCTHYBRID
-#define DWT_MAX_LENGTH     64
-#define DWT_TYPE           26    // 26/53/97
-#define DWT_PRECISION_BITS 2
-#define DWT_PRECISION_RND  ((1 << DWT_PRECISION_BITS) / 2)
-
-#define DWTDCT16X16        0
-#define DWTDCT16X16_LEAN   1
-#define DWTDCT8X8          2
-#define DWTDCT_TYPE        DWTDCT16X16_LEAN
-#endif
-
 typedef struct {
   int r, c;
 } POS;
 
-typedef enum PlaneType {
-  PLANE_TYPE_Y_NO_DC = 0,
-  PLANE_TYPE_Y2,
-  PLANE_TYPE_UV,
+typedef enum {
   PLANE_TYPE_Y_WITH_DC,
+  PLANE_TYPE_UV,
 } PLANE_TYPE;
 
 typedef char ENTROPY_CONTEXT;
@@ -75,10 +61,9 @@ typedef struct {
   ENTROPY_CONTEXT y1[4];
   ENTROPY_CONTEXT u[2];
   ENTROPY_CONTEXT v[2];
-  ENTROPY_CONTEXT y2;
 } ENTROPY_CONTEXT_PLANES;
 
-#define VP9_COMBINEENTROPYCONTEXTS( Dest, A, B) \
+#define VP9_COMBINEENTROPYCONTEXTS(Dest, A, B) \
   Dest = ((A)!=0) + ((B)!=0);
 
 typedef enum {
@@ -86,8 +71,7 @@ typedef enum {
   INTER_FRAME = 1
 } FRAME_TYPE;
 
-typedef enum
-{
+typedef enum {
 #if CONFIG_ENABLE_6TAP
   SIXTAP,
 #endif
@@ -98,8 +82,7 @@ typedef enum
   SWITCHABLE  /* should be the last one */
 } INTERPOLATIONFILTERTYPE;
 
-typedef enum
-{
+typedef enum {
   DC_PRED,            /* average of above and left pixels */
   V_PRED,             /* vertical prediction */
   H_PRED,             /* horizontal prediction */
@@ -125,10 +108,9 @@ typedef enum {
   SEG_LVL_ALT_Q = 0,               // Use alternate Quantizer ....
   SEG_LVL_ALT_LF = 1,              // Use alternate loop filter value...
   SEG_LVL_REF_FRAME = 2,           // Optional Segment reference frame
-  SEG_LVL_MODE = 3,                // Optional Segment mode
-  SEG_LVL_EOB = 4,                 // EOB end stop marker.
-  SEG_LVL_TRANSFORM = 5,           // Block transform size.
-  SEG_LVL_MAX = 6                  // Number of MB level features supported
+  SEG_LVL_SKIP = 3,                // Optional Segment (0,0) + skip mode
+  SEG_LVL_TRANSFORM = 4,           // Block transform size.
+  SEG_LVL_MAX = 5                  // Number of MB level features supported
 } SEG_LVL_FEATURES;
 
 // Segment level features.
@@ -155,10 +137,7 @@ typedef enum {
 
 #define VP9_MVREFS (1 + SPLITMV - NEARESTMV)
 
-#if CONFIG_LOSSLESS
-#define WHT_UPSCALE_FACTOR 3
-#define Y2_WHT_UPSCALE_FACTOR 2
-#endif
+#define WHT_UPSCALE_FACTOR 2
 
 typedef enum {
   B_DC_PRED,          /* average of above and left pixels */
@@ -219,10 +198,7 @@ union b_mode_info {
     B_PREDICTION_MODE context;
 #endif
   } as_mode;
-  struct {
-    int_mv first;
-    int_mv second;
-  } as_mv;
+  int_mv as_mv[2];  // first, second inter predictor motion vectors
 };
 
 typedef enum {
@@ -298,36 +274,46 @@ typedef struct blockd {
   int dst;
   int dst_stride;
 
-  int eob;
-
   union b_mode_info bmi;
 } BLOCKD;
 
 typedef struct superblockd {
-  /* 32x32 Y and 16x16 U/V. No 2nd order transform yet. */
+  /* 32x32 Y and 16x16 U/V */
   DECLARE_ALIGNED(16, int16_t, diff[32*32+16*16*2]);
   DECLARE_ALIGNED(16, int16_t, qcoeff[32*32+16*16*2]);
   DECLARE_ALIGNED(16, int16_t, dqcoeff[32*32+16*16*2]);
 } SUPERBLOCKD;
 
+struct scale_factors {
+  int x_num;
+  int x_den;
+  int x_offset_q4;
+  int x_step_q4;
+  int y_num;
+  int y_den;
+  int y_offset_q4;
+  int y_step_q4;
+  convolve_fn_t predict[2][2][2];  // horiz, vert, avg
+};
+
 typedef struct macroblockd {
-  DECLARE_ALIGNED(16, int16_t,  diff[400]);      /* from idct diff */
+  DECLARE_ALIGNED(16, int16_t,  diff[384]);      /* from idct diff */
   DECLARE_ALIGNED(16, uint8_t,  predictor[384]);
-  DECLARE_ALIGNED(16, int16_t,  qcoeff[400]);
-  DECLARE_ALIGNED(16, int16_t,  dqcoeff[400]);
-  DECLARE_ALIGNED(16, uint16_t, eobs[25]);
+  DECLARE_ALIGNED(16, int16_t,  qcoeff[384]);
+  DECLARE_ALIGNED(16, int16_t,  dqcoeff[384]);
+  DECLARE_ALIGNED(16, uint16_t, eobs[24]);
 
   SUPERBLOCKD sb_coeff_data;
 
-  /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
-  BLOCKD block[25];
+  /* 16 Y blocks, 4 U, 4 V, each with 16 entries. */
+  BLOCKD block[24];
   int fullpixel_mask;
 
   YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */
-  struct {
-    uint8_t *y_buffer, *u_buffer, *v_buffer;
-  } second_pre;
+  YV12_BUFFER_CONFIG second_pre;
   YV12_BUFFER_CONFIG dst;
+  struct scale_factors scale_factor[2];
+  struct scale_factors scale_factor_uv[2];
 
   MODE_INFO *prev_mode_info_context;
   MODE_INFO *mode_info_context;
@@ -337,8 +323,9 @@ typedef struct macroblockd {
 
   int up_available;
   int left_available;
+  int right_available;
 
-  /* Y,U,V,Y2 */
+  /* Y,U,V */
   ENTROPY_CONTEXT_PLANES *above_context;
   ENTROPY_CONTEXT_PLANES *left_context;
 
@@ -359,6 +346,7 @@ typedef struct macroblockd {
 
   // Probability Tree used to code Segment number
   vp9_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS];
+  vp9_prob mb_segment_mispred_tree_probs[MAX_MB_SEGMENTS];
 
 #if CONFIG_NEW_MVREF
   vp9_prob mb_mv_ref_probs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES-1];
@@ -387,21 +375,20 @@ typedef struct macroblockd {
   unsigned int frames_since_golden;
   unsigned int frames_till_alt_ref_frame;
 
+  int lossless;
   /* Inverse transform function pointers. */
-  void (*inv_xform4x4_1_x8)(int16_t *input, int16_t *output, int pitch);
-  void (*inv_xform4x4_x8)(int16_t *input, int16_t *output, int pitch);
-  void (*inv_walsh4x4_1)(int16_t *in, int16_t *out);
-  void (*inv_walsh4x4_lossless)(int16_t *in, int16_t *out);
-
-
-  vp9_subpix_fn_t  subpixel_predict4x4;
-  vp9_subpix_fn_t  subpixel_predict8x4;
-  vp9_subpix_fn_t  subpixel_predict8x8;
-  vp9_subpix_fn_t  subpixel_predict16x16;
-  vp9_subpix_fn_t  subpixel_predict_avg4x4;
-  vp9_subpix_fn_t  subpixel_predict_avg8x4;
-  vp9_subpix_fn_t  subpixel_predict_avg8x8;
-  vp9_subpix_fn_t  subpixel_predict_avg16x16;
+  void (*inv_txm4x4_1)(int16_t *input, int16_t *output, int pitch);
+  void (*inv_txm4x4)(int16_t *input, int16_t *output, int pitch);
+  void (*itxm_add)(int16_t *input, const int16_t *dq,
+    uint8_t *pred, uint8_t *output, int pitch, int stride, int eob);
+  void (*itxm_add_y_block)(int16_t *q, const int16_t *dq,
+    uint8_t *pre, uint8_t *dst, int stride, struct macroblockd *xd);
+  void (*itxm_add_uv_block)(int16_t *q, const int16_t *dq,
+    uint8_t *pre, uint8_t *dst_u, uint8_t *dst_v, int stride,
+    struct macroblockd *xd);
+
+  struct subpix_fn_table  subpix;
+
   int allow_high_precision_mv;
 
   int corrupted;
@@ -412,74 +399,46 @@ typedef struct macroblockd {
 
 } MACROBLOCKD;
 
-#define ACTIVE_HT 110                // quantization stepsize threshold
+#define ACTIVE_HT   110                // quantization stepsize threshold
 
-#define ACTIVE_HT8 300
+#define ACTIVE_HT8  300
 
 #define ACTIVE_HT16 300
 
 // convert MB_PREDICTION_MODE to B_PREDICTION_MODE
 static B_PREDICTION_MODE pred_mode_conv(MB_PREDICTION_MODE mode) {
-  B_PREDICTION_MODE b_mode;
   switch (mode) {
-    case DC_PRED:
-      b_mode = B_DC_PRED;
-      break;
-    case V_PRED:
-      b_mode = B_VE_PRED;
-      break;
-    case H_PRED:
-      b_mode = B_HE_PRED;
-      break;
-    case TM_PRED:
-      b_mode = B_TM_PRED;
-      break;
-    case D45_PRED:
-      b_mode = B_LD_PRED;
-      break;
-    case D135_PRED:
-      b_mode = B_RD_PRED;
-      break;
-    case D117_PRED:
-      b_mode = B_VR_PRED;
-      break;
-    case D153_PRED:
-      b_mode = B_HD_PRED;
-      break;
-    case D27_PRED:
-      b_mode = B_HU_PRED;
-      break;
-    case D63_PRED:
-      b_mode = B_VL_PRED;
-      break;
-    default :
-      // for debug purpose, to be removed after full testing
-      assert(0);
-      break;
+    case DC_PRED: return B_DC_PRED;
+    case V_PRED: return B_VE_PRED;
+    case H_PRED: return B_HE_PRED;
+    case TM_PRED: return B_TM_PRED;
+    case D45_PRED: return B_LD_PRED;
+    case D135_PRED: return B_RD_PRED;
+    case D117_PRED: return B_VR_PRED;
+    case D153_PRED: return B_HD_PRED;
+    case D27_PRED: return B_HU_PRED;
+    case D63_PRED: return B_VL_PRED;
+    default:
+       assert(0);
+       return B_MODE_COUNT;  // Dummy value
   }
-  return b_mode;
 }
 
 // transform mapping
 static TX_TYPE txfm_map(B_PREDICTION_MODE bmode) {
-  // map transform type
-  TX_TYPE tx_type;
   switch (bmode) {
     case B_TM_PRED :
     case B_RD_PRED :
-      tx_type = ADST_ADST;
-      break;
+      return ADST_ADST;
 
     case B_VE_PRED :
     case B_VR_PRED :
-      tx_type = ADST_DCT;
-      break;
+      return ADST_DCT;
 
     case B_HE_PRED :
     case B_HD_PRED :
     case B_HU_PRED :
-      tx_type = DCT_ADST;
-      break;
+      return DCT_ADST;
 
 #if CONFIG_NEWBINTRAMODES
     case B_CONTEXT_PRED:
@@ -487,15 +446,13 @@ static TX_TYPE txfm_map(B_PREDICTION_MODE bmode) {
       break;
 #endif
 
-    default :
-      tx_type = DCT_DCT;
-      break;
+    default:
+      return DCT_DCT;
   }
-  return tx_type;
 }
 
-extern const uint8_t vp9_block2left[TX_SIZE_MAX_SB][25];
-extern const uint8_t vp9_block2above[TX_SIZE_MAX_SB][25];
+extern const uint8_t vp9_block2left[TX_SIZE_MAX_SB][24];
+extern const uint8_t vp9_block2above[TX_SIZE_MAX_SB][24];
 
 #define USE_ADST_FOR_I16X16_8X8   0
 #define USE_ADST_FOR_I16X16_4X4   0
@@ -509,6 +466,8 @@ static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, const BLOCKD *b) {
   int ib = (int)(b - xd->block);
   if (ib >= 16)
     return tx_type;
+  if (xd->lossless)
+    return DCT_DCT;
   // TODO(rbultje, debargha): Explore ADST usage for superblocks
   if (xd->mode_info_context->mbmi.sb_type)
     return tx_type;
@@ -625,30 +584,17 @@ static TX_TYPE get_tx_type(const MACROBLOCKD *xd, const BLOCKD *b) {
   return tx_type;
 }
 
-static int get_2nd_order_usage(const MACROBLOCKD *xd) {
-  int has_2nd_order = (xd->mode_info_context->mbmi.mode != SPLITMV &&
-                       xd->mode_info_context->mbmi.mode != I8X8_PRED &&
-                       xd->mode_info_context->mbmi.mode != B_PRED &&
-                       xd->mode_info_context->mbmi.txfm_size != TX_16X16);
-  if (has_2nd_order)
-    has_2nd_order = (get_tx_type(xd, xd->block) == DCT_DCT);
-  return has_2nd_order;
-}
-
-extern void vp9_build_block_doffsets(MACROBLOCKD *xd);
-extern void vp9_setup_block_dptrs(MACROBLOCKD *xd);
+void vp9_build_block_doffsets(MACROBLOCKD *xd);
+void vp9_setup_block_dptrs(MACROBLOCKD *xd);
 
 static void update_blockd_bmi(MACROBLOCKD *xd) {
-  int i;
-  int is_4x4;
-  is_4x4 = (xd->mode_info_context->mbmi.mode == SPLITMV) ||
-           (xd->mode_info_context->mbmi.mode == I8X8_PRED) ||
-           (xd->mode_info_context->mbmi.mode == B_PRED);
-
-  if (is_4x4) {
-    for (i = 0; i < 16; i++) {
+  const MB_PREDICTION_MODE mode = xd->mode_info_context->mbmi.mode;
+
+  if (mode == SPLITMV || mode == I8X8_PRED || mode == B_PRED) {
+    int i;
+    for (i = 0; i < 16; i++)
       xd->block[i].bmi = xd->mode_info_context->bmi[i];
-    }
   }
 }
+
 #endif  // VP9_COMMON_VP9_BLOCKD_H_
diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h
index 2e1ee4b1a..4295eba87 100644
--- a/vp9/common/vp9_common.h
+++ b/vp9/common/vp9_common.h
@@ -42,7 +42,7 @@
 
 #define vp9_zero_array(Dest, N) vpx_memset(Dest, 0, N * sizeof(*Dest));
 
-static __inline uint8_t clip_pixel(int val) {
+static INLINE uint8_t clip_pixel(int val) {
   return (val > 255) ? 255u : (val < 0) ? 0u : val;
 }
 
diff --git a/vp9/common/vp9_convolve.c b/vp9/common/vp9_convolve.c
new file mode 100644
index 000000000..b062e7dc7
--- /dev/null
+++ b/vp9/common/vp9_convolve.c
@@ -0,0 +1,376 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "vp9/common/vp9_convolve.h"
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+#define VP9_FILTER_WEIGHT 128
+#define VP9_FILTER_SHIFT  7
+
+/* Assume a bank of 16 filters to choose from. There are three options for
+ * the filter wrapping behavior, since we want to be able to pick which filter
+ * to start with. We could either:
+ *
+ * 1) make filter_ a pointer to the base of the filter array, and then add an
+ *    additional offset parameter, to choose the starting filter.
+ * 2) use a pointer to 2 periods worth of filters, so that even if the original
+ *    phase offset is at 15/16, we'll have valid data to read. The filter
+ *    tables become [32][8], and the second half is duplicated.
+ * 3) fix the alignment of the filter tables, so that we know the 0/16 is
+ *    always 256 byte aligned.
+ *
+ * Implementations 2 and 3 are likely preferable, as they avoid an extra 2
+ * parameters, and switching between them is trivial, with the
+ * ALIGN_FILTERS_256 macro, below.
+ */
+ #define ALIGN_FILTERS_256 1
+
+static void convolve_horiz_c(const uint8_t *src, int src_stride,
+                             uint8_t *dst, int dst_stride,
+                             const int16_t *filter_x0, int x_step_q4,
+                             const int16_t *filter_y, int y_step_q4,
+                             int w, int h, int taps) {
+  int x, y, k, sum;
+  const int16_t *filter_x_base = filter_x0;
+
+#if ALIGN_FILTERS_256
+  filter_x_base = (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);
+#endif
+
+  /* Adjust base pointer address for this source line */
+  src -= taps / 2 - 1;
+
+  for (y = 0; y < h; ++y) {
+    /* Pointer to filter to use */
+    const int16_t *filter_x = filter_x0;
+
+    /* Initial phase offset */
+    int x0_q4 = (filter_x - filter_x_base) / taps;
+    int x_q4 = x0_q4;
+
+    for (x = 0; x < w; ++x) {
+      /* Per-pixel src offset */
+      int src_x = (x_q4 - x0_q4) >> 4;
+
+      for (sum = 0, k = 0; k < taps; ++k) {
+        sum += src[src_x + k] * filter_x[k];
+      }
+      sum += (VP9_FILTER_WEIGHT >> 1);
+      dst[x] = clip_pixel(sum >> VP9_FILTER_SHIFT);
+
+      /* Adjust source and filter to use for the next pixel */
+      x_q4 += x_step_q4;
+      filter_x = filter_x_base + (x_q4 & 0xf) * taps;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void convolve_avg_horiz_c(const uint8_t *src, int src_stride,
+                                 uint8_t *dst, int dst_stride,
+                                 const int16_t *filter_x0, int x_step_q4,
+                                 const int16_t *filter_y, int y_step_q4,
+                                 int w, int h, int taps) {
+  int x, y, k, sum;
+  const int16_t *filter_x_base = filter_x0;
+
+#if ALIGN_FILTERS_256
+  filter_x_base = (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);
+#endif
+
+  /* Adjust base pointer address for this source line */
+  src -= taps / 2 - 1;
+
+  for (y = 0; y < h; ++y) {
+    /* Pointer to filter to use */
+    const int16_t *filter_x = filter_x0;
+
+    /* Initial phase offset */
+    int x0_q4 = (filter_x - filter_x_base) / taps;
+    int x_q4 = x0_q4;
+
+    for (x = 0; x < w; ++x) {
+      /* Per-pixel src offset */
+      int src_x = (x_q4 - x0_q4) >> 4;
+
+      for (sum = 0, k = 0; k < taps; ++k) {
+        sum += src[src_x + k] * filter_x[k];
+      }
+      sum += (VP9_FILTER_WEIGHT >> 1);
+      dst[x] = (dst[x] + clip_pixel(sum >> VP9_FILTER_SHIFT) + 1) >> 1;
+
+      /* Adjust source and filter to use for the next pixel */
+      x_q4 += x_step_q4;
+      filter_x = filter_x_base + (x_q4 & 0xf) * taps;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void convolve_vert_c(const uint8_t *src, int src_stride,
+                            uint8_t *dst, int dst_stride,
+                            const int16_t *filter_x, int x_step_q4,
+                            const int16_t *filter_y0, int y_step_q4,
+                            int w, int h, int taps) {
+  int x, y, k, sum;
+
+  const int16_t *filter_y_base = filter_y0;
+
+#if ALIGN_FILTERS_256
+  filter_y_base = (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff);
+#endif
+
+  /* Adjust base pointer address for this source column */
+  src -= src_stride * (taps / 2 - 1);
+  for (x = 0; x < w; ++x) {
+    /* Pointer to filter to use */
+    const int16_t *filter_y = filter_y0;
+
+    /* Initial phase offset */
+    int y0_q4 = (filter_y - filter_y_base) / taps;
+    int y_q4 = y0_q4;
+
+    for (y = 0; y < h; ++y) {
+      /* Per-pixel src offset */
+      int src_y = (y_q4 - y0_q4) >> 4;
+
+      for (sum = 0, k = 0; k < taps; ++k) {
+        sum += src[(src_y + k) * src_stride] * filter_y[k];
+      }
+      sum += (VP9_FILTER_WEIGHT >> 1);
+      dst[y * dst_stride] = clip_pixel(sum >> VP9_FILTER_SHIFT);
+
+      /* Adjust source and filter to use for the next pixel */
+      y_q4 += y_step_q4;
+      filter_y = filter_y_base + (y_q4 & 0xf) * taps;
+    }
+    ++src;
+    ++dst;
+  }
+}
+
+static void convolve_avg_vert_c(const uint8_t *src, int src_stride,
+                                uint8_t *dst, int dst_stride,
+                                const int16_t *filter_x, int x_step_q4,
+                                const int16_t *filter_y0, int y_step_q4,
+                                int w, int h, int taps) {
+  int x, y, k, sum;
+
+  const int16_t *filter_y_base = filter_y0;
+
+#if ALIGN_FILTERS_256
+  filter_y_base = (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff);
+#endif
+
+  /* Adjust base pointer address for this source column */
+  src -= src_stride * (taps / 2 - 1);
+  for (x = 0; x < w; ++x) {
+    /* Pointer to filter to use */
+    const int16_t *filter_y = filter_y0;
+
+    /* Initial phase offset */
+    int y0_q4 = (filter_y - filter_y_base) / taps;
+    int y_q4 = y0_q4;
+
+    for (y = 0; y < h; ++y) {
+      /* Per-pixel src offset */
+      int src_y = (y_q4 - y0_q4) >> 4;
+
+      for (sum = 0, k = 0; k < taps; ++k) {
+        sum += src[(src_y + k) * src_stride] * filter_y[k];
+      }
+      sum += (VP9_FILTER_WEIGHT >> 1);
+      dst[y * dst_stride] =
+          (dst[y * dst_stride] + clip_pixel(sum >> VP9_FILTER_SHIFT) + 1) >> 1;
+
+      /* Adjust source and filter to use for the next pixel */
+      y_q4 += y_step_q4;
+      filter_y = filter_y_base + (y_q4 & 0xf) * taps;
+    }
+    ++src;
+    ++dst;
+  }
+}
+
+static void convolve_c(const uint8_t *src, int src_stride,
+                       uint8_t *dst, int dst_stride,
+                       const int16_t *filter_x, int x_step_q4,
+                       const int16_t *filter_y, int y_step_q4,
+                       int w, int h, int taps) {
+  /* Fixed size intermediate buffer places limits on parameters.
+   * Maximum intermediate_height is 39, for y_step_q4 == 32,
+   * h == 16, taps == 8.
+   */
+  uint8_t temp[16 * 39];
+  int intermediate_height = ((h * y_step_q4) >> 4) + taps - 1;
+
+  assert(w <= 16);
+  assert(h <= 16);
+  assert(taps <= 8);
+  assert(y_step_q4 <= 32);
+
+  if (intermediate_height < h)
+    intermediate_height = h;
+
+  convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride,
+                   temp, 16,
+                   filter_x, x_step_q4, filter_y, y_step_q4,
+                   w, intermediate_height, taps);
+  convolve_vert_c(temp + 16 * (taps / 2 - 1), 16, dst, dst_stride,
+                  filter_x, x_step_q4, filter_y, y_step_q4,
+                  w, h, taps);
+}
+
+static void convolve_avg_c(const uint8_t *src, int src_stride,
+                           uint8_t *dst, int dst_stride,
+                           const int16_t *filter_x, int x_step_q4,
+                           const int16_t *filter_y, int y_step_q4,
+                           int w, int h, int taps) {
+  /* Fixed size intermediate buffer places limits on parameters.
+   * Maximum intermediate_height is 39, for y_step_q4 == 32,
+   * h == 16, taps == 8.
+   */
+  uint8_t temp[16 * 39];
+  int intermediate_height = ((h * y_step_q4) >> 4) + taps - 1;
+
+  assert(w <= 16);
+  assert(h <= 16);
+  assert(taps <= 8);
+  assert(y_step_q4 <= 32);
+
+  if (intermediate_height < h)
+    intermediate_height = h;
+
+  convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride,
+                   temp, 16,
+                   filter_x, x_step_q4, filter_y, y_step_q4,
+                   w, intermediate_height, taps);
+  convolve_avg_vert_c(temp + 16 * (taps / 2 - 1), 16, dst, dst_stride,
+                      filter_x, x_step_q4, filter_y, y_step_q4,
+                      w, h, taps);
+}
+
+void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride,
+                           uint8_t *dst, int dst_stride,
+                           const int16_t *filter_x, int x_step_q4,
+                           const int16_t *filter_y, int y_step_q4,
+                           int w, int h) {
+  convolve_horiz_c(src, src_stride, dst, dst_stride,
+                   filter_x, x_step_q4, filter_y, y_step_q4,
+                   w, h, 8);
+}
+
+void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride,
+                               uint8_t *dst, int dst_stride,
+                               const int16_t *filter_x, int x_step_q4,
+                               const int16_t *filter_y, int y_step_q4,
+                               int w, int h) {
+  convolve_avg_horiz_c(src, src_stride, dst, dst_stride,
+                       filter_x, x_step_q4, filter_y, y_step_q4,
+                       w, h, 8);
+}
+
+void vp9_convolve8_vert_c(const uint8_t *src, int src_stride,
+                          uint8_t *dst, int dst_stride,
+                          const int16_t *filter_x, int x_step_q4,
+                          const int16_t *filter_y, int y_step_q4,
+                          int w, int h) {
+  convolve_vert_c(src, src_stride, dst, dst_stride,
+                  filter_x, x_step_q4, filter_y, y_step_q4,
+                  w, h, 8);
+}
+
+void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride,
+                              uint8_t *dst, int dst_stride,
+                              const int16_t *filter_x, int x_step_q4,
+                              const int16_t *filter_y, int y_step_q4,
+                              int w, int h) {
+  convolve_avg_vert_c(src, src_stride, dst, dst_stride,
+                      filter_x, x_step_q4, filter_y, y_step_q4,
+                      w, h, 8);
+}
+
+void vp9_convolve8_c(const uint8_t *src, int src_stride,
+                     uint8_t *dst, int dst_stride,
+                     const int16_t *filter_x, int x_step_q4,
+                     const int16_t *filter_y, int y_step_q4,
+                     int w, int h) {
+  convolve_c(src, src_stride, dst, dst_stride,
+             filter_x, x_step_q4, filter_y, y_step_q4,
+             w, h, 8);
+}
+
+void vp9_convolve8_avg_c(const uint8_t *src, int src_stride,
+                         uint8_t *dst, int dst_stride,
+                         const int16_t *filter_x, int x_step_q4,
+                         const int16_t *filter_y, int y_step_q4,
+                         int w, int h) {
+  /* Fixed size intermediate buffer places limits on parameters. */
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16);
+  assert(w <= 16);
+  assert(h <= 16);
+
+  vp9_convolve8(src, src_stride,
+                temp, 16,
+                filter_x, x_step_q4,
+                filter_y, y_step_q4,
+                w, h);
+  vp9_convolve_avg(temp, 16,
+                   dst, dst_stride,
+                   NULL, 0, /* These unused parameters should be removed! */
+                   NULL, 0, /* These unused parameters should be removed! */
+                   w, h);
+}
+
+void vp9_convolve_copy(const uint8_t *src, int src_stride,
+                       uint8_t *dst, int dst_stride,
+                       const int16_t *filter_x, int filter_x_stride,
+                       const int16_t *filter_y, int filter_y_stride,
+                       int w, int h) {
+  if (w == 16 && h == 16) {
+    vp9_copy_mem16x16(src, src_stride, dst, dst_stride);
+  } else if (w == 8 && h == 8) {
+    vp9_copy_mem8x8(src, src_stride, dst, dst_stride);
+  } else if (w == 8 && h == 4) {
+    vp9_copy_mem8x4(src, src_stride, dst, dst_stride);
+  } else {
+    int r;
+
+    for (r = h; r > 0; --r) {
+      memcpy(dst, src, w);
+      src += src_stride;
+      dst += dst_stride;
+    }
+  }
+}
+
+void vp9_convolve_avg(const uint8_t *src, int src_stride,
+                      uint8_t *dst, int dst_stride,
+                      const int16_t *filter_x, int filter_x_stride,
+                      const int16_t *filter_y, int filter_y_stride,
+                      int w, int h) {
+  int x, y;
+
+  for (y = 0; y < h; ++y) {
+    for (x = 0; x < w; ++x) {
+      dst[x] = (dst[x] + src[x] + 1) >> 1;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
diff --git a/vp9/common/vp9_convolve.h b/vp9/common/vp9_convolve.h
new file mode 100644
index 000000000..8c4856187
--- /dev/null
+++ b/vp9/common/vp9_convolve.h
@@ -0,0 +1,40 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VP9_COMMON_CONVOLVE_H_
+#define VP9_COMMON_CONVOLVE_H_
+
+#include "vpx/vpx_integer.h"
+
+typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,
+                              uint8_t *dst, int dst_stride,
+                              const int16_t *filter_x, int x_step_q4,
+                              const int16_t *filter_y, int y_step_q4,
+                              int w, int h);
+
+// Not a convolution, a block copy conforming to the convolution prototype
+void vp9_convolve_copy(const uint8_t *src, int src_stride,
+                       uint8_t *dst, int dst_stride,
+                       const int16_t *filter_x, int x_step_q4,
+                       const int16_t *filter_y, int y_step_q4,
+                       int w, int h);
+
+// Not a convolution, a block average conforming to the convolution prototype
+void vp9_convolve_avg(const uint8_t *src, int src_stride,
+                      uint8_t *dst, int dst_stride,
+                      const int16_t *filter_x, int x_step_q4,
+                      const int16_t *filter_y, int y_step_q4,
+                      int w, int h);
+
+struct subpix_fn_table {
+  const int16_t (*filter_x)[8];
+  const int16_t (*filter_y)[8];
+};
+
+#endif  // VP9_COMMON_CONVOLVE_H_
diff --git a/vp9/common/vp9_debugmodes.c b/vp9/common/vp9_debugmodes.c
index 5ea7736b7..1953d60c6 100644
--- a/vp9/common/vp9_debugmodes.c
+++ b/vp9/common/vp9_debugmodes.c
@@ -129,8 +129,8 @@ void vp9_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols,
         mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
         bindex = (b_row & 3) * 4 + (b_col & 3);
         fprintf(mvs, "%3d:%-3d ",
-                mi[mb_index].bmi[bindex].as_mv.first.as_mv.row,
-                mi[mb_index].bmi[bindex].as_mv.first.as_mv.col);
+                mi[mb_index].bmi[bindex].as_mv[0].as_mv.row,
+                mi[mb_index].bmi[bindex].as_mv[0].as_mv.col);
 
       }
 
diff --git a/vp9/common/vp9_default_coef_probs.h b/vp9/common/vp9_default_coef_probs.h
index 10d3c389f..6309566a7 100644
--- a/vp9/common/vp9_default_coef_probs.h
+++ b/vp9/common/vp9_default_coef_probs.h
@@ -11,1201 +11,603 @@
 
 /*Generated file, included by vp9_entropy.c*/
 
-
-static const vp9_coeff_probs default_coef_probs_4x4[BLOCK_TYPES_4X4] = {
-  { /* block Type 0 */
-    { /* Coeff Band 0 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 1 */
-      { 224, 180, 254, 255, 234, 224, 255, 227, 128, 128, 128 },
-      { 187, 178, 250, 255, 226, 218, 255, 229, 255, 255, 128 },
-      { 145, 171, 243, 253, 219, 211, 254, 226, 255, 224, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 2 */
-      {   1, 187, 252, 255, 231, 220, 255, 229, 255, 255, 128 },
-      { 129, 174, 244, 254, 225, 216, 253, 219, 255, 255, 128 },
-      {  16, 131, 193, 251, 205, 205, 254, 222, 255, 255, 128 },
-      {   2,  93, 136, 236, 159, 179, 255, 197, 128, 128, 128 }
-    }, { /* Coeff Band 3 */
-      {   1, 188, 254, 255, 241, 236, 254, 220, 255, 255, 128 },
-      { 133, 165, 249, 255, 236, 220, 252, 220, 255, 255, 128 },
-      {  20, 112, 203, 254, 217, 214, 255, 224, 255, 255, 128 },
-      {   4,  61, 106, 240, 155, 189, 252, 202, 255, 255, 128 }
-    }, { /* Coeff Band 4 */
-      {   1, 168, 252, 255, 239, 228, 253, 217, 255, 255, 128 },
-      { 158, 163, 247, 255, 231, 221, 255, 242, 128, 128, 128 },
-      {  23, 127, 205, 253, 212, 224, 255, 234, 255, 255, 128 },
-      {   2,  83, 141, 237, 176, 210, 245, 207, 255, 255, 128 }
-    }, { /* Coeff Band 5 */
-      {   1, 233, 254, 255, 243, 241, 255, 213, 128, 128, 128 },
-      { 155, 213, 253, 255, 240, 221, 216, 112, 255, 255, 128 },
-      {  41, 159, 237, 254, 229, 216, 255, 161, 128, 128, 128 },
-      {  11,  95, 176, 244, 194, 191, 255, 167, 128, 128, 128 }
-    }, { /* Coeff Band 6 */
-      {   1, 160, 253, 255, 238, 231, 255, 230, 255, 255, 128 },
-      { 174, 152, 248, 255, 230, 223, 255, 223, 255, 255, 128 },
-      {  86, 125, 213, 253, 207, 207, 254, 224, 255, 171, 128 },
-      {  39,  89, 156, 240, 168, 190, 251, 181, 255, 255, 128 }
-    }, { /* Coeff Band 7 */
-      {   1, 101, 255, 255, 243, 244, 255, 255, 128, 128, 128 },
-      { 230,  66, 255, 255, 238, 238, 128, 128, 128, 128, 128 },
-      { 151,  92, 229, 255, 224, 197, 128, 128, 128, 128, 128 },
-      { 109,  57, 171, 255,  73, 255, 128, 128, 128, 128, 128 }
-    }
-  }, { /* block Type 1 */
-    { /* Coeff Band 0 */
-      { 148, 109, 219, 239, 203, 184, 222, 172, 238, 203, 192 },
-      { 101, 110, 206, 229, 181, 178, 224, 171, 250, 206, 180 },
-      {  67, 108, 186, 222, 172, 174, 216, 167, 246, 195, 221 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 1 */
-      {   1, 184, 249, 254, 226, 220, 253, 241, 255, 255, 128 },
-      {  84, 182, 244, 254, 222, 218, 254, 217, 255, 255, 128 },
-      {  56, 147, 210, 252, 208, 210, 253, 218, 255, 255, 128 },
-      {  32, 124, 170, 233, 165, 178, 249, 196, 255, 253, 128 }
-    }, { /* Coeff Band 2 */
-      {   1, 182, 242, 245, 208, 194, 239, 179, 255, 238, 128 },
-      {  28, 170, 230, 241, 202, 192, 243, 171, 255, 243, 128 },
-      {  16, 109, 165, 231, 182, 184, 237, 168, 255, 249, 255 },
-      {   2,  76, 113, 202, 141, 172, 221, 160, 252, 227, 255 }
-    }, { /* Coeff Band 3 */
-      {   1, 195, 249, 254, 230, 239, 251, 211, 255, 255, 128 },
-      {  39, 164, 242, 254, 224, 222, 255, 235, 255, 255, 128 },
-      {  16, 111, 179, 251, 204, 197, 251, 234, 255, 209, 128 },
-      {   3,  84, 130, 225, 155, 176, 226, 196, 255, 238, 128 }
-    }, { /* Coeff Band 4 */
-      {   1, 180, 248, 254, 227, 219, 254, 211, 255, 255, 128 },
-      {  38, 170, 242, 253, 222, 214, 254, 242, 255, 255, 128 },
-      {   5, 111, 176, 250, 204, 197, 255, 208, 128, 128, 128 },
-      {   1,  75, 120, 233, 146, 186, 250, 203, 255, 255, 128 }
-    }, { /* Coeff Band 5 */
-      {   1, 183, 251, 255, 232, 223, 252, 229, 255, 255, 128 },
-      {  51, 158, 245, 255, 230, 224, 255, 239, 128, 128, 128 },
-      {  13,  80, 158, 253, 206, 216, 255, 233, 128, 128, 128 },
-      {   4,  39,  76, 212, 107, 153, 252, 206, 255, 255, 128 }
-    }, { /* Coeff Band 6 */
-      {   1, 181, 252, 254, 231, 214, 242, 225, 255, 236, 128 },
-      {  81, 167, 247, 254, 229, 217, 252, 226, 255, 255, 128 },
-      {  20, 122, 195, 253, 213, 212, 249, 211, 255, 238, 128 },
-      {  18, 100, 153, 231, 158, 182, 244, 203, 255, 219, 128 }
-    }, { /* Coeff Band 7 */
-      {   1, 100, 254, 255, 242, 246, 255, 230, 128, 128, 128 },
-      { 177,  62, 250, 255, 246, 210, 255, 255, 128, 128, 128 },
-      {  65,  58, 186, 255, 227, 241, 255, 219, 128, 128, 128 },
-      {  45,  23, 118, 244, 162, 208, 255, 228, 128, 128, 128 }
-    }
-  }, { /* block Type 2 */
-    { /* Coeff Band 0 */
-      { 242,  73, 238, 244, 198, 192, 241, 189, 253, 226, 247 },
-      { 171,  70, 204, 231, 180, 183, 228, 172, 247, 215, 221 },
-      {  73,  62, 144, 202, 153, 169, 207, 153, 245, 199, 230 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 1 */
-      {   1, 163, 241, 245, 201, 192, 243, 191, 255, 229, 255 },
-      { 165, 147, 230, 245, 201, 193, 244, 193, 255, 231, 255 },
-      {  76, 109, 191, 243, 190, 193, 243, 192, 255, 231, 255 },
-      {  22,  63, 111, 202, 138, 164, 225, 164, 252, 218, 248 }
-    }, { /* Coeff Band 2 */
-      {   1, 113, 225, 245, 201, 195, 238, 185, 254, 225, 255 },
-      { 122, 105, 195, 236, 183, 186, 235, 180, 254, 227, 252 },
-      {  38,  79, 135, 217, 154, 172, 229, 171, 253, 220, 250 },
-      {   9,  53,  78, 161, 121, 151, 202, 141, 251, 207, 244 }
-    }, { /* Coeff Band 3 */
-      {   1, 150, 238, 250, 213, 202, 244, 194, 255, 236, 255 },
-      { 140, 132, 223, 247, 204, 199, 243, 193, 255, 234, 255 },
-      {  51, 101, 182, 240, 188, 189, 240, 186, 255, 232, 255 },
-      {   6,  59, 100, 201, 137, 165, 225, 161, 252, 221, 249 }
-    }, { /* Coeff Band 4 */
-      {   1, 151, 233, 248, 205, 199, 248, 196, 255, 243, 255 },
-      { 133, 140, 214, 244, 193, 193, 245, 194, 255, 236, 255 },
-      {  27, 104, 168, 235, 172, 183, 243, 187, 254, 235, 255 },
-      {   2,  61, 101, 202, 135, 164, 229, 167, 254, 223, 255 }
-    }, { /* Coeff Band 5 */
-      {   1, 227, 246, 254, 225, 215, 254, 217, 255, 255, 128 },
-      { 132, 195, 239, 253, 219, 210, 252, 212, 255, 255, 128 },
-      {  49, 143, 214, 251, 207, 204, 253, 212, 255, 238, 128 },
-      {  11,  93, 151, 235, 169, 185, 247, 190, 255, 238, 128 }
-    }, { /* Coeff Band 6 */
-      {   1, 143, 237, 251, 213, 203, 249, 203, 255, 243, 128 },
-      { 137, 120, 216, 246, 198, 196, 248, 199, 255, 240, 255 },
-      {  50,  94, 166, 233, 169, 181, 245, 189, 255, 240, 255 },
-      {   9,  56,  97, 190, 129, 158, 228, 159, 255, 226, 255 }
-    }, { /* Coeff Band 7 */
-      {   1,  96, 245, 254, 229, 216, 255, 212, 255, 255, 128 },
-      { 179,  81, 234, 253, 217, 209, 255, 230, 255, 255, 128 },
-      { 105,  56, 192, 248, 192, 197, 252, 212, 255, 205, 128 },
-      {  53,  32, 133, 228, 151, 177, 250, 192, 255, 255, 128 }
-    }
-  }, { /* block Type 3 */
-    { /* Coeff Band 0 */
-      { 209,  89, 216, 242, 191, 190, 245, 191, 240, 235, 168 },
-      { 142,  96, 196, 229, 173, 180, 233, 175, 247, 220, 174 },
-      {  66,  89, 157, 205, 155, 171, 209, 156, 243, 200, 197 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 1 */
-      {   1, 159, 235, 246, 202, 197, 237, 186, 248, 223, 223 },
-      {  96, 137, 223, 247, 203, 198, 242, 188, 241, 202, 209 },
-      {  22,  95, 167, 243, 184, 196, 237, 187, 247, 221, 221 },
-      {   3,  51,  81, 192, 125, 158, 220, 164, 242, 211, 197 }
-    }, { /* Coeff Band 2 */
-      {   1, 145, 226, 244, 196, 194, 240, 191, 247, 225, 233 },
-      {  66, 127, 203, 240, 188, 189, 239, 188, 248, 225, 220 },
-      {   9,  83, 136, 224, 159, 176, 235, 177, 247, 223, 207 },
-      {   2,  46,  71, 169, 121, 152, 210, 149, 241, 212, 199 }
-    }, { /* Coeff Band 3 */
-      {   1, 174, 238, 249, 209, 201, 245, 198, 241, 196, 241 },
-      {  76, 151, 223, 247, 203, 197, 245, 194, 243, 202, 198 },
-      {  12, 102, 170, 240, 183, 187, 242, 191, 247, 225, 209 },
-      {   1,  52,  85, 202, 135, 162, 225, 168, 240, 209, 221 }
-    }, { /* Coeff Band 4 */
-      {   1, 140, 230, 247, 204, 198, 242, 190, 249, 209, 248 },
-      {  94, 126, 213, 244, 195, 194, 240, 190, 247, 210, 237 },
-      {  13,  95, 159, 232, 171, 181, 237, 179, 245, 205, 237 },
-      {   1,  51,  83, 186, 128, 158, 216, 154, 240, 193, 229 }
-    }, { /* Coeff Band 5 */
-      {   1, 218, 244, 251, 214, 202, 243, 199, 253, 214, 255 },
-      {  91, 194, 238, 249, 210, 200, 247, 203, 251, 223, 255 },
-      {  18, 140, 207, 247, 198, 194, 246, 203, 252, 213, 255 },
-      {   3,  76, 126, 223, 156, 172, 233, 185, 251, 206, 255 }
-    }, { /* Coeff Band 6 */
-      {   1, 135, 235, 250, 210, 203, 246, 206, 251, 219, 241 },
-      { 105, 120, 214, 246, 196, 196, 245, 195, 250, 216, 243 },
-      {  24,  91, 154, 231, 166, 180, 241, 183, 250, 214, 242 },
-      {   3,  53,  84, 183, 127, 157, 218, 153, 244, 195, 237 }
-    }, { /* Coeff Band 7 */
-      {   1,  83, 246, 252, 215, 208, 246, 206, 255, 237, 128 },
-      { 184,  61, 233, 250, 208, 204, 245, 198, 254, 227, 255 },
-      {  83,  58, 190, 246, 189, 195, 244, 198, 255, 229, 128 },
-      {  41,  38, 125, 214, 144, 169, 229, 171, 251, 216, 255 }
-    }
-  }
-};
-static const vp9_coeff_probs default_hybrid_coef_probs_4x4[BLOCK_TYPES_4X4] = {
+static const vp9_coeff_probs default_coef_probs_4x4[BLOCK_TYPES] = {
   { /* block Type 0 */
-    { /* Coeff Band 0 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 1 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 2 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 3 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 4 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 5 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 6 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 7 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    { /* Intra */
+      { /* Coeff Band 0 */
+        { 208,  26, 124, 168, 135, 159, 164, 134, 213, 172, 169 },
+        { 112,  36, 114, 171, 139, 161, 165, 138, 208, 167, 168 },
+        {  21,  27,  55, 109, 115, 147, 126, 121, 190, 151, 167 }
+      }, { /* Coeff Band 1 */
+        {   1,  94, 156, 203, 156, 169, 200, 154, 230, 184, 206 },
+        {  85,  91, 161, 202, 155, 170, 198, 151, 233, 184, 214 },
+        {  57,  78, 131, 203, 157, 169, 198, 152, 231, 184, 214 },
+        {  36,  68, 104, 191, 135, 164, 199, 153, 231, 183, 208 },
+        {  16,  51,  68, 154, 115, 150, 192, 140, 231, 184, 210 },
+        {   5,  31,  32,  83, 100, 140, 121, 115, 192, 153, 182 }
+      }, { /* Coeff Band 2 */
+        {   1,  72, 146, 177, 149, 168, 157, 135, 200, 159, 184 },
+        {  68,  72, 148, 180, 153, 166, 157, 139, 198, 156, 187 },
+        {  29,  68, 112, 178, 150, 162, 159, 144, 191, 153, 183 },
+        {  12,  57,  83, 164, 125, 157, 162, 141, 186, 156, 178 },
+        {   4,  42,  52, 125, 108, 145, 161, 130, 190, 166, 170 },
+        {   1,  28,  25,  67,  98, 138,  99, 110, 175, 143, 167 }
+      }, { /* Coeff Band 3 */
+        {   1, 113, 176, 199, 161, 171, 167, 146, 188, 154, 194 },
+        {  75,  97, 166, 206, 161, 172, 188, 156, 203, 164, 208 },
+        {  31,  83, 131, 200, 152, 168, 191, 157, 200, 169, 206 },
+        {  18,  70,  99, 185, 131, 162, 194, 153, 202, 177, 201 },
+        {   8,  55,  70, 146, 115, 150, 187, 136, 215, 188, 191 },
+        {   2,  46,  42,  87, 109, 144, 111, 117, 185, 148, 182 }
+      }, { /* Coeff Band 4 */
+        {   1, 128, 191, 217, 169, 174, 203, 163, 201, 178, 196 },
+        {  73, 105, 177, 220, 168, 175, 212, 167, 222, 185, 212 },
+        {  22,  82, 135, 212, 157, 172, 212, 165, 220, 187, 213 },
+        {  10,  65,  95, 194, 133, 162, 210, 160, 223, 194, 208 },
+        {   5,  45,  59, 145, 108, 147, 196, 142, 230, 196, 197 },
+        {   2,  30,  29,  76,  98, 140, 119, 112, 205, 158, 185 }
+      }, { /* Coeff Band 5 */
+        {   1, 101, 208, 232, 179, 179, 236, 181, 243, 216, 210 },
+        { 110,  84, 194, 231, 177, 180, 233, 177, 246, 213, 224 },
+        {  50,  68, 148, 224, 166, 177, 229, 173, 245, 209, 215 },
+        {  29,  55, 105, 207, 139, 168, 224, 167, 244, 207, 225 },
+        {  17,  38,  65, 157, 111, 148, 206, 148, 242, 202, 215 },
+        {   7,  18,  28,  76,  96, 138, 125, 111, 219, 162, 206 }
+      }
+    }, { /* Inter */
+      { /* Coeff Band 0 */
+        { 221, 105, 211, 220, 170, 171, 233, 173, 241, 200, 201 },
+        { 144, 102, 184, 206, 160, 167, 209, 163, 227, 186, 193 },
+        {  51,  84, 132, 174, 146, 161, 165, 144, 190, 163, 175 }
+      }, { /* Coeff Band 1 */
+        {   1, 167, 216, 217, 170, 171, 217, 178, 213, 176, 216 },
+        {  89, 146, 210, 227, 185, 182, 198, 165, 203, 166, 216 },
+        {  37, 117, 171, 232, 185, 185, 197, 175, 191, 159, 212 },
+        {  30,  99, 128, 224, 150, 177, 210, 179, 183, 162, 211 },
+        {  25,  84,  92, 173, 121, 150, 222, 161, 201, 199, 201 },
+        {   8,  56,  65, 121, 119, 148, 150, 124, 198, 170, 191 }
+      }, { /* Coeff Band 2 */
+        {   1, 133, 198, 206, 166, 172, 188, 157, 211, 167, 206 },
+        {  62, 122, 178, 203, 170, 174, 160, 151, 193, 148, 203 },
+        {  19,  94, 126, 195, 156, 172, 156, 159, 172, 135, 199 },
+        {  15,  78,  89, 173, 122, 158, 163, 155, 153, 138, 191 },
+        {  13,  63,  61, 120, 109, 141, 167, 138, 159, 174, 174 },
+        {   1,  39,  44,  77, 113, 145,  92, 116, 162, 144, 166 }
+      }, { /* Coeff Band 3 */
+        {   1, 157, 214, 222, 176, 176, 208, 168, 213, 174, 219 },
+        {  80, 134, 199, 223, 180, 181, 191, 162, 200, 161, 218 },
+        {  27, 104, 155, 219, 168, 177, 192, 168, 186, 154, 214 },
+        {  23,  87, 114, 203, 138, 166, 199, 165, 178, 163, 210 },
+        {  16,  74,  84, 153, 118, 150, 198, 144, 194, 189, 198 },
+        {   1,  50,  64, 110, 121, 149, 118, 125, 177, 149, 194 }
+      }, { /* Coeff Band 4 */
+        {   1, 164, 216, 229, 181, 178, 223, 172, 228, 197, 221 },
+        {  86, 140, 196, 225, 176, 179, 215, 169, 220, 189, 222 },
+        {  30, 107, 149, 217, 160, 175, 216, 169, 212, 187, 219 },
+        {  24,  85, 109, 197, 133, 161, 215, 162, 211, 195, 214 },
+        {  17,  67,  76, 150, 114, 148, 202, 144, 222, 203, 204 },
+        {   3,  46,  55, 100, 111, 144, 140, 117, 215, 173, 197 }
+      }, { /* Coeff Band 5 */
+        {   1, 120, 224, 237, 184, 181, 241, 188, 249, 228, 231 },
+        { 139,  95, 209, 236, 184, 184, 237, 182, 247, 224, 230 },
+        {  67,  79, 160, 232, 172, 181, 236, 182, 246, 219, 233 },
+        {  48,  65, 120, 216, 141, 168, 234, 177, 245, 219, 229 },
+        {  32,  52,  85, 171, 119, 151, 222, 156, 246, 216, 224 },
+        {  13,  39,  58, 112, 111, 144, 157, 121, 229, 182, 211 }
+      }
     }
   }, { /* block Type 1 */
-    { /* Coeff Band 0 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 1 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 2 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 3 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 4 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 5 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 6 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 7 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }
-  }, { /* block Type 2 */
-    { /* Coeff Band 0 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 1 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 2 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 3 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 4 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 5 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 6 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 7 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }
-  }, { /* block Type 3 */
-    { /* Coeff Band 0 */
-      { 191,  34, 178, 193, 160, 173, 196, 142, 247, 191, 244 },
-      {  84,  45, 129, 187, 145, 170, 189, 145, 240, 186, 212 },
-      {  14,  36,  69, 149, 120, 154, 177, 136, 231, 177, 196 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 1 */
-      {   1,  76, 169, 226, 167, 180, 227, 171, 247, 218, 226 },
-      {  72,  75, 162, 226, 166, 181, 231, 172, 242, 200, 219 },
-      {  30,  63, 130, 218, 153, 175, 226, 170, 247, 216, 219 },
-      {   5,  39,  67, 156, 119, 151, 194, 140, 239, 202, 216 }
-    }, { /* Coeff Band 2 */
-      {   1,  79, 182, 228, 175, 183, 224, 170, 247, 215, 220 },
-      {  69,  77, 168, 224, 170, 180, 223, 168, 246, 215, 223 },
-      {  24,  63, 126, 209, 153, 171, 219, 160, 247, 215, 225 },
-      {   3,  35,  58, 151, 115, 151, 191, 138, 240, 199, 220 }
-    }, { /* Coeff Band 3 */
-      {   1, 139, 213, 238, 194, 192, 234, 180, 244, 193, 236 },
-      {  82, 127, 204, 238, 190, 186, 234, 175, 244, 191, 235 },
-      {  26,  93, 161, 230, 173, 179, 233, 178, 249, 217, 241 },
-      {   3,  48,  78, 186, 132, 158, 212, 157, 244, 205, 233 }
-    }, { /* Coeff Band 4 */
-      {   1, 100, 208, 233, 180, 182, 238, 175, 250, 206, 225 },
-      {  84,  87, 184, 230, 175, 180, 236, 179, 250, 209, 243 },
-      {  14,  61, 111, 217, 146, 171, 236, 174, 249, 207, 245 },
-      {   1,  32,  49, 150, 106, 142, 212, 145, 242, 191, 237 }
-    }, { /* Coeff Band 5 */
-      {   1, 130, 223, 241, 192, 189, 231, 176, 250, 209, 246 },
-      { 101, 120, 207, 239, 188, 187, 240, 196, 250, 202, 255 },
-      {  19,  90, 155, 232, 169, 181, 238, 190, 250, 207, 249 },
-      {   1,  54,  86, 197, 130, 161, 220, 170, 248, 196, 248 }
-    }, { /* Coeff Band 6 */
-      {   1, 103, 208, 236, 183, 185, 235, 190, 243, 202, 219 },
-      {  95,  92, 185, 230, 175, 181, 233, 174, 242, 203, 225 },
-      {  24,  72, 131, 213, 152, 171, 226, 164, 241, 202, 220 },
-      {   3,  45,  74, 169, 123, 154, 204, 145, 238, 188, 222 }
-    }, { /* Coeff Band 7 */
-      {   1,  63, 236, 247, 205, 194, 241, 189, 252, 222, 255 },
-      { 151,  48, 224, 245, 200, 193, 240, 187, 255, 234, 255 },
-      {  76,  45, 178, 240, 180, 189, 239, 182, 253, 231, 255 },
-      {  38,  31, 111, 187, 125, 154, 217, 155, 253, 214, 255 }
+    { /* Intra */
+      { /* Coeff Band 0 */
+        { 220,  21, 181, 217, 157, 178, 234, 145, 248, 236, 173 },
+        { 125,  26, 143, 206, 148, 172, 213, 151, 240, 207, 199 },
+        {  44,  28,  84, 150, 125, 154, 171, 133, 225, 179, 192 }
+      }, { /* Coeff Band 1 */
+        {   1, 137, 209, 231, 181, 181, 223, 173, 245, 202, 236 },
+        { 147, 130, 214, 232, 181, 183, 224, 172, 245, 204, 220 },
+        { 111, 112, 183, 234, 188, 186, 223, 175, 246, 202, 237 },
+        {  89, 100, 159, 227, 163, 178, 222, 173, 246, 203, 220 },
+        {  55,  80, 124, 201, 142, 166, 219, 163, 246, 205, 223 },
+        {  23,  45,  70, 130, 119, 151, 157, 128, 224, 170, 207 }
+      }, { /* Coeff Band 2 */
+        {   1,  62, 195, 228, 177, 179, 220, 170, 244, 201, 226 },
+        {  87,  84, 172, 218, 165, 176, 212, 163, 242, 199, 228 },
+        {  28,  87, 124, 206, 154, 168, 209, 159, 241, 195, 227 },
+        {  10,  72,  94, 181, 127, 159, 200, 150, 240, 193, 226 },
+        {   4,  47,  58, 129, 109, 145, 176, 132, 237, 183, 222 },
+        {   1,  24,  26,  65,  95, 137, 109, 104, 210, 151, 197 }
+      }, { /* Coeff Band 3 */
+        {   1, 127, 206, 236, 183, 183, 230, 180, 247, 211, 234 },
+        { 113, 118, 195, 228, 174, 180, 225, 172, 248, 208, 231 },
+        {  43, 109, 162, 221, 166, 175, 220, 168, 248, 207, 232 },
+        {  17,  88, 126, 208, 152, 171, 214, 161, 247, 203, 236 },
+        {   5,  60,  84, 172, 125, 154, 199, 149, 244, 194, 237 },
+        {   1,  29,  41,  99, 104, 147, 146, 116, 227, 170, 223 }
+      }, { /* Coeff Band 4 */
+        {   1, 151, 222, 239, 193, 188, 231, 177, 250, 218, 241 },
+        { 114, 126, 203, 230, 180, 181, 226, 171, 249, 212, 246 },
+        {  51,  97, 175, 218, 166, 176, 220, 165, 250, 211, 231 },
+        {  23,  77, 136, 204, 155, 169, 213, 157, 248, 205, 241 },
+        {   6,  50,  85, 169, 126, 158, 197, 146, 245, 197, 243 },
+        {   1,  21,  37,  97, 101, 146, 146, 119, 232, 169, 232 }
+      }, { /* Coeff Band 5 */
+        {   1, 117, 230, 239, 194, 187, 233, 179, 252, 222, 248 },
+        { 148, 109, 210, 232, 184, 182, 227, 173, 252, 211, 244 },
+        {  80,  84, 162, 222, 168, 178, 225, 167, 252, 207, 244 },
+        {  43,  64, 122, 201, 142, 169, 218, 162, 251, 208, 254 },
+        {  17,  41,  76, 155, 120, 154, 200, 141, 249, 204, 248 },
+        {   5,  19,  35,  89,  99, 151, 140, 115, 241, 174, 244 }
+      }
+    }, { /* Inter */
+      { /* Coeff Band 0 */
+        { 240,  71, 232, 234, 178, 179, 246, 180, 251, 225, 232 },
+        { 168,  71, 198, 225, 167, 173, 229, 173, 247, 211, 218 },
+        {  75,  63, 144, 195, 150, 164, 192, 147, 245, 202, 213 }
+      }, { /* Coeff Band 1 */
+        {   1, 165, 237, 243, 186, 184, 247, 206, 255, 238, 255 },
+        { 159, 149, 229, 241, 192, 183, 244, 194, 255, 237, 239 },
+        { 110, 126, 195, 243, 196, 196, 239, 187, 255, 237, 242 },
+        {  89, 114, 170, 237, 168, 181, 239, 192, 254, 232, 241 },
+        {  54,  96, 145, 210, 151, 166, 237, 173, 253, 234, 249 },
+        {  17,  65, 108, 187, 140, 165, 194, 148, 244, 199, 227 }
+      }, { /* Coeff Band 2 */
+        {   1, 124, 227, 239, 183, 184, 240, 195, 249, 224, 240 },
+        { 112, 132, 206, 235, 183, 184, 232, 180, 246, 220, 234 },
+        {  36, 116, 161, 228, 170, 180, 229, 176, 244, 218, 239 },
+        {  22, 107, 126, 210, 139, 167, 225, 169, 244, 218, 229 },
+        {   9,  82,  90, 163, 122, 151, 210, 149, 246, 212, 227 },
+        {   1,  43,  51, 102, 105, 144, 152, 117, 234, 182, 213 }
+      }, { /* Coeff Band 3 */
+        {   1, 160, 234, 244, 195, 188, 244, 197, 251, 231, 250 },
+        { 119, 142, 220, 241, 192, 189, 241, 188, 251, 229, 243 },
+        {  38, 110, 180, 238, 183, 185, 238, 185, 251, 227, 246 },
+        {  27,  95, 130, 229, 164, 181, 234, 178, 251, 223, 233 },
+        {  13,  79,  97, 185, 125, 153, 223, 164, 250, 217, 238 },
+        {   1,  45,  57, 110, 111, 143, 164, 119, 235, 183, 220 }
+      }, { /* Coeff Band 4 */
+        {   1, 166, 239, 247, 207, 196, 244, 198, 251, 225, 245 },
+        { 119, 146, 224, 244, 199, 192, 240, 192, 251, 223, 240 },
+        {  46, 108, 189, 237, 180, 191, 236, 186, 249, 218, 248 },
+        {  29,  89, 154, 223, 165, 177, 228, 173, 250, 213, 224 },
+        {   8,  63, 104, 189, 139, 163, 207, 154, 246, 200, 241 },
+        {   1,  27,  40, 103, 102, 144, 146, 118, 230, 165, 223 }
+      }, { /* Coeff Band 5 */
+        {   1, 131, 242, 247, 207, 193, 244, 199, 251, 225, 248 },
+        { 150, 118, 231, 244, 202, 191, 239, 191, 252, 214, 241 },
+        {  79,  98, 188, 236, 185, 186, 232, 182, 251, 212, 249 },
+        {  55,  80, 145, 217, 154, 174, 222, 172, 250, 204, 253 },
+        {  27,  56,  94, 162, 128, 153, 198, 143, 248, 199, 240 },
+        {   4,  19,  33,  77,  98, 144, 129, 110, 237, 167, 241 }
+      }
     }
   }
 };
-static const vp9_coeff_probs default_coef_probs_8x8[BLOCK_TYPES_8X8] = {
+static const vp9_coeff_probs default_coef_probs_8x8[BLOCK_TYPES] = {
   { /* block Type 0 */
-    { /* Coeff Band 0 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 1 */
-      { 179, 203, 246, 252, 217, 208, 249, 197, 238, 237, 255 },
-      { 136, 193, 232, 247, 202, 199, 245, 194, 255, 235, 255 },
-      {  66, 170, 209, 244, 190, 191, 250, 199, 255, 242, 192 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 2 */
-      {   1, 191, 232, 250, 204, 201, 248, 199, 254, 243, 213 },
-      {  50, 161, 209, 247, 196, 197, 250, 206, 253, 240, 213 },
-      {   6, 118, 160, 239, 173, 186, 249, 203, 254, 235, 255 },
-      {   2,  90, 110, 211, 141, 166, 242, 181, 254, 235, 255 }
-    }, { /* Coeff Band 3 */
-      {   1, 209, 242, 254, 223, 215, 253, 218, 255, 253, 128 },
-      {  58, 168, 227, 253, 216, 211, 254, 226, 255, 251, 128 },
-      {   7, 111, 178, 249, 195, 202, 253, 222, 254, 240, 255 },
-      {   2,  63, 103, 226, 142, 175, 250, 202, 255, 246, 128 }
-    }, { /* Coeff Band 4 */
-      {   1, 207, 241, 252, 213, 205, 252, 215, 255, 228, 255 },
-      {  55, 171, 225, 251, 209, 205, 251, 212, 254, 234, 255 },
-      {   5, 108, 173, 247, 187, 195, 251, 211, 255, 231, 128 },
-      {   2,  56,  97, 220, 138, 169, 248, 191, 253, 237, 255 }
-    }, { /* Coeff Band 5 */
-      {   1, 211, 245, 255, 227, 219, 255, 233, 255, 255, 128 },
-      {  58, 175, 228, 254, 217, 215, 255, 231, 255, 255, 128 },
-      {   6, 124, 181, 249, 191, 199, 255, 222, 255, 251, 128 },
-      {   2,  85, 122, 227, 149, 172, 250, 195, 255, 245, 128 }
-    }, { /* Coeff Band 6 */
-      {   1, 216, 246, 255, 231, 217, 254, 220, 255, 250, 128 },
-      {  74, 177, 236, 254, 222, 214, 254, 221, 255, 255, 128 },
-      {  13, 125, 192, 250, 200, 203, 254, 217, 255, 245, 128 },
-      {   2,  70, 114, 227, 147, 175, 251, 198, 255, 240, 128 }
-    }, { /* Coeff Band 7 */
-      {   1, 199, 246, 255, 238, 229, 255, 226, 255, 255, 128 },
-      { 132, 162, 240, 255, 229, 222, 255, 239, 255, 255, 128 },
-      {  79, 125, 207, 253, 213, 214, 255, 232, 255, 255, 128 },
-      {  41,  89, 149, 240, 161, 187, 250, 216, 255, 255, 128 }
+    { /* Intra */
+      { /* Coeff Band 0 */
+        { 158,  29, 127, 187, 147, 164, 183, 146, 227, 188, 162 },
+        {  74,  36, 101, 162, 138, 162, 154, 134, 206, 165, 167 },
+        {  15,  28,  56, 109, 119, 151, 122, 120, 190, 151, 164 }
+      }, { /* Coeff Band 1 */
+        {   1, 129, 178, 205, 163, 170, 200, 152, 236, 185, 215 },
+        {  61, 129, 178, 205, 162, 170, 201, 152, 237, 189, 212 },
+        {  42, 113, 161, 203, 159, 168, 200, 153, 237, 188, 212 },
+        {  30,  91, 129, 196, 149, 166, 201, 152, 236, 186, 213 },
+        {  12,  63,  86, 169, 126, 155, 195, 145, 236, 187, 213 },
+        {   6,  34,  33,  89, 100, 139, 132, 115, 206, 157, 183 }
+      }, { /* Coeff Band 2 */
+        {   1,  75, 147, 182, 152, 162, 189, 141, 223, 179, 198 },
+        {  36,  71, 125, 184, 141, 161, 204, 147, 241, 200, 202 },
+        {  10,  56,  83, 163, 129, 153, 194, 140, 241, 194, 215 },
+        {   6,  44,  59, 139, 110, 146, 178, 131, 237, 186, 219 },
+        {   5,  35,  35,  96, 101, 140, 152, 117, 227, 170, 210 },
+        {   2,  25,  14,  46,  88, 129,  90,  99, 186, 138, 173 }
+      }, { /* Coeff Band 3 */
+        {   1, 135, 179, 191, 161, 166, 198, 136, 234, 184, 215 },
+        {  55, 116, 171, 216, 163, 174, 214, 163, 232, 196, 201 },
+        {  17,  89, 134, 205, 153, 166, 214, 159, 241, 200, 209 },
+        {   9,  69,  98, 187, 132, 159, 206, 149, 243, 198, 215 },
+        {   9,  53,  58, 142, 113, 151, 189, 135, 240, 187, 219 },
+        {   3,  36,  23,  69,  90, 133, 121, 109, 206, 155, 183 }
+      }, { /* Coeff Band 4 */
+        {   1, 163, 194, 208, 171, 171, 214, 140, 240, 191, 227 },
+        {  45, 129, 180, 226, 172, 180, 216, 169, 229, 186, 224 },
+        {  13,  94, 138, 216, 160, 171, 219, 167, 238, 198, 217 },
+        {  13,  72,  99, 196, 131, 160, 213, 156, 243, 201, 213 },
+        {  18,  62,  54, 136, 109, 149, 197, 132, 242, 193, 212 },
+        {   5,  40,  25,  60,  92, 133, 111, 105, 200, 150, 179 }
+      }, { /* Coeff Band 5 */
+        {   1, 187, 223, 230, 197, 185, 216, 139, 241, 174, 241 },
+        {  58, 144, 205, 236, 189, 188, 209, 168, 231, 172, 234 },
+        {  18, 104, 160, 226, 171, 180, 211, 170, 234, 180, 230 },
+        {  11,  76, 115, 205, 143, 166, 205, 161, 234, 182, 218 },
+        {  14,  66,  66, 138, 116, 150, 192, 128, 231, 180, 204 },
+        {   1,  35,  32,  61, 104, 140,  89, 105, 187, 138, 171 }
+      }
+    }, { /* Inter */
+      { /* Coeff Band 0 */
+        { 177, 107, 211, 210, 163, 167, 237, 156, 238, 209, 204 },
+        { 119, 101, 183, 200, 161, 168, 215, 148, 234, 191, 204 },
+        {  39,  81, 127, 173, 144, 162, 182, 137, 226, 176, 202 }
+      }, { /* Coeff Band 1 */
+        {   1, 175, 199, 199, 161, 158, 242, 141, 254, 226, 249 },
+        {  89, 154, 205, 213, 178, 172, 235, 138, 254, 221, 245 },
+        {  32, 123, 180, 231, 183, 185, 229, 158, 253, 216, 237 },
+        {  23, 102, 134, 226, 155, 177, 231, 175, 253, 215, 244 },
+        {  20,  88,  95, 176, 119, 151, 228, 154, 253, 218, 240 },
+        {   7,  54,  64, 120, 115, 146, 168, 119, 238, 186, 212 }
+      }, { /* Coeff Band 2 */
+        {   1, 151, 196, 204, 163, 163, 238, 144, 252, 219, 235 },
+        {  43, 128, 179, 218, 171, 175, 227, 152, 251, 214, 231 },
+        {  15,  94, 126, 216, 158, 174, 223, 165, 250, 211, 231 },
+        {  17,  82,  90, 190, 120, 157, 219, 160, 249, 209, 228 },
+        {  15,  87,  66, 123, 104, 139, 201, 130, 247, 202, 228 },
+        {   1,  43,  35,  70,  98, 134, 134, 105, 226, 168, 203 }
+      }, { /* Coeff Band 3 */
+        {   1, 172, 203, 207, 167, 163, 242, 146, 254, 225, 243 },
+        {  52, 139, 194, 224, 179, 179, 232, 153, 253, 219, 237 },
+        {  19, 102, 148, 225, 166, 180, 227, 170, 252, 217, 236 },
+        {  24,  87, 105, 205, 132, 161, 225, 167, 252, 215, 235 },
+        {  23,  90,  76, 140, 108, 144, 213, 138, 251, 211, 235 },
+        {   2,  42,  39,  80,  97, 134, 151, 109, 236, 180, 216 }
+      }, { /* Coeff Band 4 */
+        {   1, 183, 216, 216, 178, 168, 245, 145, 255, 226, 245 },
+        {  48, 149, 203, 231, 186, 185, 233, 155, 254, 220, 243 },
+        {  20, 108, 154, 227, 170, 181, 227, 169, 253, 219, 240 },
+        {  32,  87, 109, 205, 136, 163, 223, 166, 253, 217, 241 },
+        {  33,  91,  76, 139, 110, 144, 212, 135, 252, 212, 241 },
+        {   2,  39,  39,  83,  99, 136, 150, 108, 239, 181, 226 }
+      }, { /* Coeff Band 5 */
+        {   1, 196, 231, 239, 202, 187, 244, 160, 254, 222, 242 },
+        {  60, 151, 213, 240, 193, 191, 236, 175, 254, 220, 242 },
+        {  13, 107, 164, 231, 173, 181, 232, 177, 253, 219, 240 },
+        {   9,  78, 118, 210, 145, 169, 227, 169, 253, 218, 242 },
+        {  18,  65,  76, 160, 117, 151, 210, 144, 251, 210, 239 },
+        {   1,  28,  38,  92, 101, 140, 148, 113, 237, 177, 227 }
+      }
     }
   }, { /* block Type 1 */
-    { /* Coeff Band 0 */
-      { 138,  65, 189, 212, 172, 169, 200, 153, 233, 182, 214 },
-      {  93,  60, 162, 203, 160, 169, 200, 153, 239, 190, 213 },
-      {  66,  55, 141, 195, 152, 166, 199, 152, 238, 190, 212 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 1 */
-      {   1, 102, 221, 247, 205, 198, 248, 201, 255, 235, 128 },
-      { 122,  95, 215, 247, 200, 197, 248, 200, 254, 227, 255 },
-      {  60,  81, 166, 241, 177, 190, 245, 193, 255, 246, 255 },
-      {  32,  61, 108, 195, 133, 159, 230, 163, 254, 230, 238 }
-    }, { /* Coeff Band 2 */
-      {   1,  58, 203, 242, 194, 193, 229, 177, 253, 225, 249 },
-      { 113,  62, 192, 237, 184, 187, 231, 181, 253, 220, 249 },
-      {  50,  50, 135, 225, 159, 177, 229, 172, 254, 222, 241 },
-      {  24,  34,  82, 185, 125, 152, 223, 158, 253, 212, 219 }
-    }, { /* Coeff Band 3 */
-      {   1,   1, 220, 253, 218, 209, 251, 213, 255, 255, 128 },
-      { 154,   1, 216, 252, 211, 206, 252, 212, 255, 252, 128 },
-      { 102,   1, 157, 249, 184, 200, 253, 214, 255, 247, 128 },
-      {  68,   1, 101, 213, 129, 161, 247, 186, 255, 237, 255 }
-    }, { /* Coeff Band 4 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 5 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 6 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 7 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }
-  }, { /* block Type 2 */
-    { /* Coeff Band 0 */
-      { 229,  64, 235, 236, 189, 190, 227, 179, 247, 203, 226 },
-      { 148,  70, 194, 228, 175, 182, 216, 170, 238, 192, 224 },
-      {  53,  63, 134, 207, 150, 169, 213, 161, 247, 204, 232 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 1 */
-      {   1, 173, 234, 244, 201, 193, 239, 180, 252, 214, 255 },
-      { 160, 156, 222, 243, 200, 193, 237, 179, 253, 216, 255 },
-      {  55, 119, 187, 240, 189, 192, 236, 180, 253, 226, 255 },
-      {  14,  65, 105, 193, 142, 165, 205, 151, 249, 200, 250 }
-    }, { /* Coeff Band 2 */
-      {   1, 124, 218, 246, 195, 196, 242, 198, 254, 229, 255 },
-      {  85, 114, 180, 240, 179, 187, 239, 191, 253, 223, 239 },
-      {  18,  81, 128, 220, 152, 173, 232, 176, 252, 221, 254 },
-      {   2,  42,  64, 150, 115, 149, 192, 137, 247, 197, 247 }
-    }, { /* Coeff Band 3 */
-      {   1, 164, 230, 251, 210, 204, 245, 201, 255, 238, 255 },
-      {  96, 137, 210, 248, 199, 199, 244, 198, 254, 218, 255 },
-      {  20,  97, 169, 240, 179, 188, 242, 190, 254, 228, 255 },
-      {   2,  58,  95, 197, 137, 164, 220, 158, 252, 217, 248 }
-    }, { /* Coeff Band 4 */
-      {   1, 193, 236, 245, 203, 194, 243, 191, 254, 223, 255 },
-      {  86, 163, 217, 241, 190, 188, 242, 189, 253, 220, 255 },
-      {  14, 108, 161, 228, 167, 178, 238, 180, 253, 224, 255 },
-      {   1,  51,  84, 186, 127, 159, 216, 155, 251, 208, 243 }
-    }, { /* Coeff Band 5 */
-      {   1, 183, 235, 248, 209, 197, 244, 195, 253, 236, 239 },
-      {  79, 144, 208, 243, 193, 190, 244, 191, 254, 231, 255 },
-      {  13, 100, 151, 227, 163, 176, 240, 180, 255, 233, 244 },
-      {   1,  48,  77, 171, 121, 153, 214, 150, 252, 214, 245 }
-    }, { /* Coeff Band 6 */
-      {   1, 202, 234, 252, 215, 207, 248, 207, 254, 242, 255 },
-      {  75, 153, 216, 249, 203, 201, 248, 203, 255, 239, 255 },
-      {  11, 104, 168, 241, 179, 189, 245, 194, 255, 237, 128 },
-      {   1,  57,  95, 201, 134, 163, 229, 165, 254, 223, 246 }
-    }, { /* Coeff Band 7 */
-      {   1, 184, 236, 254, 222, 212, 254, 225, 255, 255, 128 },
-      {  74, 149, 220, 252, 210, 208, 253, 223, 255, 249, 128 },
-      {  18, 109, 175, 247, 184, 195, 253, 211, 255, 250, 128 },
-      {   3,  64, 113, 219, 144, 171, 246, 187, 255, 250, 128 }
-    }
-  }, { /* block Type 3 */
-    { /* Coeff Band 0 */
-      { 140, 101, 214, 227, 176, 182, 218, 167, 233, 205, 164 },
-      {  96, 101, 176, 204, 161, 173, 193, 152, 223, 182, 182 },
-      {  27,  84, 123, 176, 140, 162, 190, 142, 238, 189, 210 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 1 */
-      {   1, 178, 218, 240, 189, 189, 238, 184, 250, 232, 189 },
-      {  69, 146, 204, 239, 187, 189, 238, 183, 251, 226, 221 },
-      {  16,  98, 157, 234, 170, 185, 237, 183, 252, 220, 218 },
-      {   3,  49,  78, 172, 122, 154, 204, 150, 242, 198, 207 }
-    }, { /* Coeff Band 2 */
-      {   1, 165, 207, 230, 179, 181, 234, 172, 252, 228, 218 },
-      {  25, 130, 175, 224, 169, 177, 232, 169, 252, 230, 207 },
-      {   4,  81, 118, 205, 144, 167, 227, 162, 252, 225, 219 },
-      {   2,  51,  63, 150, 114, 148, 197, 138, 244, 202, 204 }
-    }, { /* Coeff Band 3 */
-      {   1, 181, 222, 247, 200, 197, 246, 199, 252, 232, 228 },
-      {  25, 142, 200, 244, 190, 193, 245, 195, 253, 233, 204 },
-      {   3,  90, 146, 233, 166, 181, 242, 188, 252, 229, 216 },
-      {   1,  47,  79, 188, 124, 157, 222, 162, 245, 213, 203 }
-    }, { /* Coeff Band 4 */
-      {   1, 179, 220, 242, 195, 191, 237, 182, 251, 217, 231 },
-      {  27, 144, 200, 241, 188, 190, 238, 185, 250, 224, 235 },
-      {   3,  93, 149, 230, 166, 180, 235, 180, 249, 222, 221 },
-      {   1,  47,  79, 181, 125, 157, 211, 154, 241, 205, 198 }
-    }, { /* Coeff Band 5 */
-      {   1, 176, 222, 247, 202, 198, 247, 199, 252, 234, 219 },
-      {  24, 139, 197, 244, 190, 192, 246, 196, 253, 232, 220 },
-      {   2,  89, 140, 229, 161, 178, 243, 185, 253, 233, 234 },
-      {   1,  49,  76, 176, 121, 154, 214, 153, 243, 209, 208 }
-    }, { /* Coeff Band 6 */
-      {   1, 197, 233, 251, 213, 205, 247, 206, 249, 222, 247 },
-      {  35, 159, 216, 249, 203, 201, 246, 203, 250, 222, 223 },
-      {   4, 108, 167, 240, 178, 188, 244, 195, 248, 220, 235 },
-      {   1,  58,  93, 198, 133, 161, 220, 167, 233, 195, 221 }
-    }, { /* Coeff Band 7 */
-      {   1, 188, 240, 253, 221, 209, 248, 207, 252, 223, 255 },
-      {  84, 153, 227, 251, 212, 205, 247, 205, 254, 215, 255 },
-      {  25, 117, 182, 244, 186, 192, 243, 198, 250, 209, 255 },
-      {   7,  72, 108, 197, 138, 162, 203, 161, 240, 178, 247 }
+    { /* Intra */
+      { /* Coeff Band 0 */
+        { 202,  29, 181, 221, 168, 177, 217, 162, 235, 202, 157 },
+        { 117,  39, 146, 207, 155, 172, 203, 155, 236, 192, 208 },
+        {  46,  40,  99, 171, 136, 161, 176, 140, 229, 177, 208 }
+      }, { /* Coeff Band 1 */
+        {   1, 138, 204, 227, 179, 181, 224, 161, 249, 203, 237 },
+        { 116, 138, 209, 227, 179, 180, 222, 165, 248, 204, 241 },
+        {  63, 112, 184, 227, 183, 178, 223, 167, 248, 206, 237 },
+        {  47,  84, 140, 219, 163, 177, 223, 160, 249, 207, 241 },
+        {  25,  53,  76, 179, 120, 156, 217, 152, 248, 205, 232 },
+        {  10,  23,  29,  76,  91, 132, 145, 109, 228, 169, 214 }
+      }, { /* Coeff Band 2 */
+        {   1,  69, 198, 223, 179, 177, 225, 154, 251, 208, 227 },
+        {  78,  78, 170, 223, 170, 179, 218, 162, 248, 203, 245 },
+        {  26,  69, 117, 209, 154, 170, 215, 160, 249, 205, 239 },
+        {  16,  54,  79, 180, 119, 156, 208, 151, 248, 201, 238 },
+        {  12,  43,  45, 119, 102, 142, 186, 126, 245, 193, 236 },
+        {   1,  24,  22,  60,  92, 133, 114,  99, 221, 154, 210 }
+      }, { /* Coeff Band 3 */
+        {   1, 135, 214, 222, 183, 178, 230, 144, 252, 208, 241 },
+        { 107, 122, 201, 229, 181, 182, 221, 165, 250, 202, 243 },
+        {  38, 100, 168, 221, 168, 176, 220, 166, 250, 208, 240 },
+        {  21,  83, 125, 206, 149, 167, 217, 160, 250, 209, 238 },
+        {  16,  65,  80, 164, 122, 156, 208, 139, 250, 206, 246 },
+        {   3,  37,  43, 104, 103, 143, 156, 118, 237, 173, 227 }
+      }, { /* Coeff Band 4 */
+        {   1, 169, 223, 233, 193, 184, 234, 150, 254, 206, 243 },
+        {  83, 140, 201, 233, 184, 185, 228, 168, 252, 203, 223 },
+        {  19, 104, 158, 225, 168, 179, 228, 169, 253, 207, 248 },
+        {  10,  76, 117, 209, 145, 168, 223, 166, 252, 210, 243 },
+        {   8,  59,  79, 163, 119, 153, 213, 142, 250, 205, 230 },
+        {   1,  31,  43, 100, 103, 144, 149, 116, 240, 171, 221 }
+      }, { /* Coeff Band 5 */
+        {   1, 190, 234, 247, 211, 197, 239, 172, 255, 208, 236 },
+        {  65, 152, 218, 244, 199, 194, 236, 184, 252, 199, 249 },
+        {  17, 109, 173, 237, 179, 186, 235, 183, 250, 205, 255 },
+        {   6,  78, 127, 219, 153, 173, 231, 177, 251, 210, 249 },
+        {   3,  56,  77, 172, 121, 157, 215, 152, 249, 209, 247 },
+        {   1,  29,  38,  96,  97, 144, 152, 114, 239, 169, 243 }
+      }
+    }, { /* Inter */
+      { /* Coeff Band 0 */
+        { 223,  71, 225, 221, 176, 169, 242, 165, 248, 216, 201 },
+        { 147,  79, 197, 215, 175, 172, 230, 154, 243, 203, 184 },
+        {  69,  75, 152, 197, 158, 168, 203, 144, 231, 187, 177 }
+      }, { /* Coeff Band 1 */
+        {   1, 168, 219, 195, 168, 151, 249, 131, 255, 221, 255 },
+        { 152, 156, 226, 210, 189, 173, 240, 121, 255, 215, 238 },
+        {  82, 128, 198, 239, 201, 194, 220, 151, 254, 202, 251 },
+        {  74, 107, 150, 236, 163, 187, 222, 177, 255, 204, 255 },
+        {  59, 103, 120, 181, 125, 148, 232, 157, 255, 219, 245 },
+        {  21,  63,  84, 129, 122, 150, 171, 118, 246, 196, 226 }
+      }, { /* Coeff Band 2 */
+        {   1, 133, 219, 202, 174, 158, 244, 133, 255, 214, 237 },
+        { 101, 132, 204, 221, 187, 183, 225, 131, 253, 201, 247 },
+        {  41, 107, 147, 228, 174, 187, 211, 162, 252, 201, 246 },
+        {  40, 107, 107, 205, 129, 162, 213, 164, 252, 206, 232 },
+        {  24, 140,  90, 122, 111, 141, 210, 127, 251, 208, 239 },
+        {   1,  59,  55,  91, 111, 141, 144, 109, 241, 180, 226 }
+      }, { /* Coeff Band 3 */
+        {   1, 170, 226, 200, 179, 153, 245, 138, 255, 214, 241 },
+        { 111, 149, 217, 226, 194, 186, 223, 137, 255, 211, 253 },
+        {  40, 113, 174, 228, 180, 183, 211, 165, 255, 212, 247 },
+        {  44, 101, 126, 210, 151, 167, 212, 161, 255, 217, 241 },
+        {  43, 131, 103, 146, 119, 148, 211, 136, 254, 216, 250 },
+        {   1,  57,  63, 112, 116, 145, 158, 115, 249, 193, 236 }
+      }, { /* Coeff Band 4 */
+        {   1, 186, 233, 216, 191, 163, 241, 143, 255, 210, 255 },
+        {  91, 161, 214, 225, 190, 181, 224, 150, 255, 212, 253 },
+        {  26, 117, 163, 220, 172, 180, 218, 148, 255, 215, 252 },
+        {  27,  90, 122, 203, 143, 167, 212, 159, 255, 213, 255 },
+        {  21,  98, 113, 163, 130, 153, 208, 141, 255, 215, 248 },
+        {   1,  47,  66, 130, 118, 151, 167, 123, 252, 199, 235 }
+      }, { /* Coeff Band 5 */
+        {   1, 195, 236, 245, 211, 195, 238, 171, 255, 209, 248 },
+        {  65, 156, 218, 245, 200, 196, 230, 185, 255, 212, 248 },
+        {  13, 112, 172, 238, 180, 189, 231, 185, 255, 213, 250 },
+        {   6,  83, 130, 224, 155, 177, 227, 180, 255, 214, 244 },
+        {   5,  71,  91, 185, 133, 160, 214, 154, 254, 212, 248 },
+        {   1,  45,  63, 128, 112, 147, 169, 129, 248, 190, 236 }
+      }
     }
   }
 };
-static const vp9_coeff_probs default_hybrid_coef_probs_8x8[BLOCK_TYPES_8X8] = {
+static const vp9_coeff_probs default_coef_probs_16x16[BLOCK_TYPES] = {
   { /* block Type 0 */
-    { /* Coeff Band 0 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 1 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 2 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 3 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 4 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 5 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 6 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 7 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    { /* Intra */
+      { /* Coeff Band 0 */
+        {  22,  27,  75, 145, 125, 152, 158, 133, 203, 164, 150 },
+        {   6,  27,  63, 124, 120, 150, 135, 127, 190, 154, 152 },
+        {   1,  19,  36,  82, 107, 143, 101, 114, 176, 140, 152 }
+      }, { /* Coeff Band 1 */
+        {   1, 104, 143, 189, 150, 164, 194, 146, 239, 191, 205 },
+        {  49, 105, 143, 188, 149, 164, 194, 146, 238, 191, 204 },
+        {  29,  96, 133, 186, 147, 163, 194, 146, 238, 192, 202 },
+        {  14,  79, 112, 178, 139, 160, 193, 144, 237, 191, 205 },
+        {   5,  50,  74, 151, 119, 150, 187, 137, 237, 190, 205 },
+        {   1,  20,  29,  76,  98, 138, 116, 111, 197, 153, 168 }
+      }, { /* Coeff Band 2 */
+        {   1,  61, 124, 173, 145, 162, 176, 137, 234, 179, 218 },
+        {  22,  56,  98, 158, 134, 157, 171, 133, 234, 178, 216 },
+        {   7,  44,  70, 137, 122, 151, 162, 128, 232, 175, 214 },
+        {   2,  33,  50, 114, 110, 146, 149, 121, 229, 169, 213 },
+        {   1,  21,  32,  84, 100, 139, 127, 112, 220, 158, 207 },
+        {   1,  11,  16,  46,  91, 133,  79, 100, 175, 133, 163 }
+      }, { /* Coeff Band 3 */
+        {   1, 121, 166, 205, 160, 170, 204, 153, 240, 195, 210 },
+        {  34, 101, 146, 198, 153, 167, 202, 152, 239, 193, 213 },
+        {   9,  78, 118, 187, 142, 163, 198, 148, 238, 192, 211 },
+        {   3,  60,  90, 170, 130, 157, 192, 143, 237, 190, 210 },
+        {   1,  39,  59, 138, 112, 148, 177, 132, 233, 183, 207 },
+        {   1,  18,  28,  75,  96, 137, 117, 110, 199, 153, 173 }
+      }, { /* Coeff Band 4 */
+        {   1, 148, 183, 220, 169, 175, 217, 164, 244, 203, 216 },
+        {  24, 115, 157, 211, 159, 171, 214, 160, 243, 201, 217 },
+        {   3,  81, 120, 197, 145, 166, 209, 155, 243, 200, 216 },
+        {   1,  56,  88, 176, 129, 158, 200, 147, 241, 196, 216 },
+        {   1,  33,  53, 134, 108, 147, 178, 132, 236, 184, 213 },
+        {   1,  13,  20,  62,  91, 135, 107, 106, 197, 148, 179 }
+      }, { /* Coeff Band 5 */
+        {   1, 195, 212, 238, 191, 187, 229, 176, 247, 210, 222 },
+        {  22, 136, 185, 230, 176, 182, 226, 173, 247, 208, 219 },
+        {   3,  88, 137, 215, 156, 173, 222, 167, 246, 207, 220 },
+        {   1,  57,  94, 190, 133, 162, 213, 157, 245, 204, 217 },
+        {   1,  30,  52, 138, 107, 147, 188, 135, 241, 193, 215 },
+        {   1,  11,  19,  61,  89, 136, 110, 104, 203, 153, 175 }
+      }
+    }, { /* Inter */
+      { /* Coeff Band 0 */
+        {  27,  72, 214, 225, 182, 183, 188, 159, 196, 172, 138 },
+        {  13,  69, 175, 209, 167, 173, 188, 152, 216, 174, 171 },
+        {   5,  52, 103, 162, 138, 160, 159, 137, 202, 164, 167 }
+      }, { /* Coeff Band 1 */
+        {   1, 174, 218, 237, 187, 186, 229, 176, 247, 212, 225 },
+        { 122, 158, 210, 236, 185, 185, 228, 174, 247, 210, 217 },
+        {  48, 133, 188, 234, 182, 184, 228, 173, 247, 210, 229 },
+        {  24, 109, 157, 227, 165, 179, 227, 172, 248, 211, 226 },
+        {  12,  82, 114, 198, 137, 162, 223, 162, 247, 209, 220 },
+        {   6,  49,  66, 127, 116, 148, 159, 125, 221, 175, 178 }
+      }, { /* Coeff Band 2 */
+        {   1, 151, 207, 229, 180, 181, 223, 168, 247, 210, 211 },
+        {  40, 122, 175, 222, 170, 177, 220, 164, 245, 207, 209 },
+        {   4,  84, 125, 207, 151, 169, 215, 159, 244, 205, 209 },
+        {   1,  58,  89, 180, 129, 159, 206, 150, 243, 202, 204 },
+        {   1,  35,  53, 131, 108, 145, 181, 130, 238, 192, 198 },
+        {   1,  16,  24,  67,  95, 137, 109, 106, 192, 153, 155 }
+      }, { /* Coeff Band 3 */
+        {   1, 172, 215, 238, 189, 187, 231, 178, 248, 213, 210 },
+        {  44, 136, 190, 233, 179, 183, 229, 175, 248, 212, 212 },
+        {   4,  94, 144, 222, 161, 176, 226, 170, 247, 211, 212 },
+        {   1,  66, 103, 201, 139, 165, 219, 161, 247, 208, 212 },
+        {   1,  39,  61, 153, 111, 149, 199, 142, 243, 200, 210 },
+        {   1,  16,  25,  74,  93, 136, 124, 109, 204, 160, 171 }
+      }, { /* Coeff Band 4 */
+        {   1, 185, 218, 241, 192, 190, 231, 180, 248, 213, 213 },
+        {  32, 143, 191, 235, 181, 185, 229, 176, 248, 211, 215 },
+        {   2,  97, 144, 223, 162, 177, 226, 171, 248, 210, 222 },
+        {   1,  65, 102, 199, 138, 165, 218, 160, 247, 208, 214 },
+        {   1,  35,  56, 145, 109, 147, 193, 137, 243, 198, 213 },
+        {   1,  13,  21,  65,  91, 135, 115, 105, 205, 157, 179 }
+      }, { /* Coeff Band 5 */
+        {   1, 209, 222, 243, 199, 192, 233, 181, 249, 215, 228 },
+        {  23, 147, 197, 237, 185, 187, 231, 177, 249, 214, 227 },
+        {   1,  94, 146, 224, 163, 178, 228, 172, 249, 213, 229 },
+        {   1,  60,  99, 197, 136, 164, 220, 161, 248, 210, 227 },
+        {   1,  31,  54, 141, 107, 147, 194, 136, 246, 201, 226 },
+        {   1,  12,  21,  65,  90, 135, 119, 104, 217, 159, 201 }
+      }
     }
   }, { /* block Type 1 */
-    { /* Coeff Band 0 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 1 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 2 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 3 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 4 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 5 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 6 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 7 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }
-  }, { /* block Type 2 */
-    { /* Coeff Band 0 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 1 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 2 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 3 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 4 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 5 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 6 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 7 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }
-  }, { /* block Type 3 */
-    { /* Coeff Band 0 */
-      { 118,  27, 105, 170, 137, 166, 183, 137, 243, 189, 241 },
-      {  44,  34,  85, 142, 127, 158, 161, 128, 232, 174, 213 },
-      {   8,  26,  47, 104, 108, 145, 143, 117, 226, 168, 207 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 1 */
-      {   1, 134, 172, 217, 163, 175, 226, 167, 251, 220, 204 },
-      {  56, 129, 168, 217, 161, 174, 223, 164, 249, 218, 223 },
-      {  20, 110, 151, 215, 158, 174, 221, 165, 249, 209, 221 },
-      {   2,  59,  88, 169, 128, 157, 192, 143, 239, 189, 214 }
-    }, { /* Coeff Band 2 */
-      {   1,  65, 126, 191, 140, 163, 218, 153, 252, 218, 229 },
-      {  21,  57,  92, 175, 126, 156, 214, 148, 252, 218, 229 },
-      {   4,  44,  66, 148, 114, 148, 200, 136, 251, 211, 228 },
-      {   1,  28,  42, 108, 104, 141, 158, 119, 235, 180, 210 }
-    }, { /* Coeff Band 3 */
-      {   1, 114, 172, 227, 166, 177, 236, 178, 252, 226, 233 },
-      {  41,  94, 152, 218, 156, 172, 233, 172, 251, 223, 231 },
-      {   9,  69, 116, 202, 142, 165, 226, 162, 251, 221, 227 },
-      {   1,  36,  60, 151, 113, 148, 195, 140, 241, 198, 211 }
-    }, { /* Coeff Band 4 */
-      {   1, 186, 200, 227, 174, 178, 230, 169, 248, 210, 238 },
-      {  27, 148, 181, 221, 167, 176, 226, 166, 250, 218, 228 },
-      {   3,  96, 139, 208, 154, 170, 219, 161, 249, 214, 229 },
-      {   1,  44,  70, 156, 120, 152, 188, 139, 239, 193, 200 }
-    }, { /* Coeff Band 5 */
-      {   1, 169, 203, 238, 186, 186, 238, 184, 252, 224, 230 },
-      {  32, 119, 173, 232, 172, 181, 236, 182, 252, 222, 237 },
-      {   6,  84, 128, 215, 150, 170, 232, 172, 251, 221, 235 },
-      {   1,  49,  78, 167, 124, 154, 200, 145, 243, 198, 217 }
-    }, { /* Coeff Band 6 */
-      {   1, 193, 215, 244, 197, 195, 239, 192, 249, 213, 240 },
-      {  52, 136, 193, 239, 184, 189, 237, 189, 248, 211, 226 },
-      {  13,  90, 146, 227, 162, 178, 233, 182, 248, 211, 231 },
-      {   1,  49,  79, 177, 124, 156, 201, 154, 234, 188, 212 }
-    }, { /* Coeff Band 7 */
-      {   1, 189, 238, 248, 219, 196, 232, 180, 253, 211, 255 },
-      { 104, 148, 224, 245, 211, 194, 225, 171, 251, 206, 255 },
-      {  43, 116, 190, 231, 179, 183, 217, 168, 249, 199, 255 },
-      {  13,  65,  92, 154, 131, 152, 167, 132, 238, 174, 243 }
-    }
-  }
-};
-static const vp9_coeff_probs default_coef_probs_16x16[BLOCK_TYPES_16X16] = {
-  { /* block Type 0 */
-    { /* Coeff Band 0 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 1 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 2 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 3 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 4 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 5 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 6 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 7 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }
-  }, { /* block Type 1 */
-    { /* Coeff Band 0 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 1 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 2 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 3 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 4 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 5 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 6 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 7 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }
-  }, { /* block Type 2 */
-    { /* Coeff Band 0 */
-      { 223,  34, 236, 234, 193, 185, 216, 169, 239, 189, 229 },
-      { 125,  40, 195, 221, 173, 175, 209, 165, 220, 181, 196 },
-      {  41,  37, 127, 185, 145, 162, 191, 150, 227, 180, 219 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 1 */
-      {   1, 160, 224, 239, 193, 190, 213, 178, 244, 174, 255 },
-      { 199, 154, 212, 238, 190, 190, 210, 173, 246, 183, 249 },
-      {  88, 122, 178, 234, 180, 187, 213, 174, 244, 182, 247 },
-      {  27,  69, 100, 174, 139, 165, 159, 142, 225, 157, 240 }
-    }, { /* Coeff Band 2 */
-      {   1, 118, 207, 237, 179, 185, 234, 189, 241, 194, 237 },
-      {  86, 103, 161, 227, 163, 176, 231, 183, 241, 196, 234 },
-      {  19,  69, 113, 205, 140, 166, 220, 169, 240, 188, 242 },
-      {   3,  32,  49, 106, 111, 144, 132, 121, 225, 151, 237 }
-    }, { /* Coeff Band 3 */
-      {   1, 160, 218, 245, 197, 195, 235, 189, 254, 218, 255 },
-      {  90, 127, 193, 240, 186, 189, 235, 187, 251, 217, 230 },
-      {  18,  92, 148, 229, 164, 179, 228, 180, 254, 212, 229 },
-      {   2,  50,  79, 163, 126, 156, 186, 140, 247, 191, 236 }
-    }, { /* Coeff Band 4 */
-      {   1, 196, 231, 240, 203, 191, 225, 171, 253, 214, 255 },
-      {  71, 167, 210, 234, 194, 188, 218, 165, 253, 215, 236 },
-      {  11, 119, 165, 217, 171, 177, 213, 155, 252, 209, 255 },
-      {   1,  46,  70, 145, 121, 153, 180, 131, 249, 192, 246 }
-    }, { /* Coeff Band 5 */
-      {   1, 176, 223, 242, 202, 194, 222, 169, 253, 211, 244 },
-      {  62, 131, 191, 233, 185, 186, 219, 164, 251, 211, 252 },
-      {   7,  89, 133, 207, 156, 173, 211, 157, 251, 206, 247 },
-      {   1,  36,  56, 127, 113, 147, 166, 125, 243, 183, 242 }
-    }, { /* Coeff Band 6 */
-      {   1, 203, 232, 249, 213, 202, 245, 193, 254, 237, 255 },
-      {  51, 155, 212, 245, 199, 195, 244, 192, 254, 234, 255 },
-      {   7, 101, 158, 233, 170, 181, 244, 185, 253, 242, 255 },
-      {   1,  49,  82, 185, 123, 157, 226, 156, 252, 225, 240 }
-    }, { /* Coeff Band 7 */
-      {   1, 222, 233, 252, 220, 207, 247, 206, 255, 240, 128 },
-      {  40, 159, 216, 250, 205, 201, 248, 207, 249, 219, 255 },
-      {   6, 106, 163, 240, 176, 188, 247, 198, 251, 222, 255 },
-      {   1,  51,  88, 196, 127, 159, 232, 169, 252, 214, 255 }
-    }
-  }, { /* block Type 3 */
-    { /* Coeff Band 0 */
-      {  14,  78, 225, 217, 173, 181, 198, 153, 228, 185, 176 },
-      {   9,  74, 179, 191, 157, 171, 178, 143, 229, 175, 209 },
-      {   3,  48,  92, 128, 130, 155, 135, 123, 220, 155, 219 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 1 */
-      {   1, 178, 209, 214, 173, 175, 208, 152, 252, 210, 237 },
-      { 142, 151, 193, 212, 170, 175, 209, 151, 251, 208, 237 },
-      {  38, 105, 150, 206, 159, 173, 208, 151, 250, 209, 238 },
-      {   5,  44,  61, 128, 114, 147, 167, 125, 239, 184, 217 }
-    }, { /* Coeff Band 2 */
-      {   1, 154, 195, 202, 166, 173, 184, 144, 245, 184, 236 },
-      {  49, 110, 150, 188, 155, 168, 180, 141, 244, 183, 239 },
-      {   4,  63,  90, 158, 132, 157, 171, 134, 243, 179, 239 },
-      {   1,  25,  37,  93, 104, 141, 133, 114, 231, 161, 226 }
-    }, { /* Coeff Band 3 */
-      {   1, 184, 201, 223, 173, 177, 224, 164, 253, 220, 238 },
-      {  42, 127, 170, 215, 164, 173, 223, 162, 253, 219, 233 },
-      {   4,  75, 114, 195, 142, 164, 218, 155, 253, 217, 235 },
-      {   1,  32,  50, 128, 108, 144, 180, 127, 247, 197, 219 }
-    }, { /* Coeff Band 4 */
-      {   1, 190, 207, 232, 181, 184, 228, 172, 251, 216, 212 },
-      {  35, 136, 180, 227, 173, 180, 227, 171, 251, 216, 218 },
-      {   2,  85, 131, 214, 154, 173, 224, 166, 250, 214, 225 },
-      {   1,  44,  71, 162, 120, 153, 195, 143, 240, 195, 197 }
-    }, { /* Coeff Band 5 */
-      {   1, 185, 201, 230, 177, 180, 232, 172, 253, 225, 235 },
-      {  27, 122, 165, 221, 164, 175, 230, 169, 253, 224, 220 },
-      {   1,  72, 108, 197, 139, 163, 224, 159, 253, 224, 226 },
-      {   1,  33,  51, 132, 107, 144, 186, 130, 245, 201, 206 }
-    }, { /* Coeff Band 6 */
-      {   1, 203, 214, 240, 193, 191, 235, 178, 252, 225, 224 },
-      {  20, 140, 188, 235, 182, 186, 234, 177, 252, 226, 226 },
-      {   1,  85, 132, 218, 155, 174, 230, 170, 251, 224, 227 },
-      {   1,  39,  62, 154, 114, 150, 199, 141, 241, 203, 214 }
-    }, { /* Coeff Band 7 */
-      {   1, 217, 224, 244, 202, 193, 241, 187, 252, 227, 239 },
-      {  22, 151, 200, 239, 187, 188, 240, 184, 252, 226, 237 },
-      {   2,  90, 138, 222, 158, 174, 237, 176, 252, 226, 239 },
-      {   1,  41,  66, 163, 116, 151, 206, 146, 243, 201, 230 }
-    }
-  }
-};
-static const vp9_coeff_probs default_hybrid_coef_probs_16x16[BLOCK_TYPES_16X16] = {
-  { /* block Type 0 */
-    { /* Coeff Band 0 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 1 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 2 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 3 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 4 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 5 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 6 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 7 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }
-  }, { /* block Type 1 */
-    { /* Coeff Band 0 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 1 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 2 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 3 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 4 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 5 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 6 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 7 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }
-  }, { /* block Type 2 */
-    { /* Coeff Band 0 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 1 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 2 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 3 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 4 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 5 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 6 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 7 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }
-  }, { /* block Type 3 */
-    { /* Coeff Band 0 */
-      {   3,  29,  86, 140, 130, 163, 135, 131, 190, 148, 186 },
-      {   1,  26,  61, 105, 124, 156, 105, 119, 178, 138, 173 },
-      {   1,  15,  28,  60, 105, 142,  80, 105, 173, 128, 178 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 1 */
-      {   1, 130, 142, 172, 141, 161, 191, 140, 244, 193, 216 },
-      {  61, 124, 141, 173, 141, 161, 190, 139, 244, 194, 215 },
-      {  28, 103, 124, 171, 138, 160, 190, 140, 243, 194, 225 },
-      {   1,  36,  51, 111, 109, 144, 152, 120, 227, 173, 205 }
-    }, { /* Coeff Band 2 */
-      {   1,  60, 125, 153, 143, 159, 156, 127, 234, 170, 233 },
-      {  22,  48,  78, 129, 124, 152, 151, 123, 234, 170, 233 },
-      {   3,  32,  46,  98, 107, 142, 138, 114, 232, 165, 232 },
-      {   1,  15,  23,  61,  96, 135, 101, 103, 210, 144, 213 }
-    }, { /* Coeff Band 3 */
-      {   1, 102, 144, 182, 146, 162, 194, 143, 246, 196, 239 },
-      {  34,  76, 116, 171, 136, 159, 192, 140, 246, 195, 239 },
-      {   4,  51,  81, 153, 124, 153, 184, 135, 246, 192, 239 },
-      {   1,  23,  37,  98, 102, 140, 142, 116, 230, 167, 227 }
-    }, { /* Coeff Band 4 */
-      {   1, 165, 171, 214, 163, 174, 214, 160, 245, 203, 219 },
-      {  16, 120, 154, 210, 158, 172, 212, 159, 245, 201, 219 },
-      {   1,  80, 122, 199, 147, 167, 208, 154, 244, 200, 223 },
-      {   1,  40,  65, 145, 118, 151, 171, 135, 226, 175, 202 }
-    }, { /* Coeff Band 5 */
-      {   1, 146, 162, 215, 159, 172, 226, 165, 251, 218, 231 },
-      {  16,  92, 131, 205, 147, 167, 224, 162, 252, 217, 228 },
-      {   2,  60,  92, 182, 129, 158, 216, 152, 251, 214, 234 },
-      {   1,  32,  50, 126, 107, 144, 176, 128, 240, 189, 216 }
-    }, { /* Coeff Band 6 */
-      {   1, 178, 186, 224, 172, 178, 224, 167, 251, 214, 232 },
-      {  14, 118, 158, 215, 160, 173, 223, 164, 250, 214, 228 },
-      {   2,  70, 109, 194, 139, 164, 217, 156, 250, 213, 227 },
-      {   1,  32,  51, 129, 108, 146, 175, 128, 240, 187, 218 }
-    }, { /* Coeff Band 7 */
-      {   1, 210, 214, 240, 192, 188, 235, 182, 251, 221, 228 },
-      {  22, 140, 187, 233, 177, 183, 234, 178, 251, 219, 233 },
-      {   3,  82, 130, 215, 152, 171, 229, 171, 250, 217, 232 },
-      {   1,  38,  63, 154, 115, 149, 195, 141, 240, 196, 219 }
+    { /* Intra */
+      { /* Coeff Band 0 */
+        { 198,  28, 192, 217, 170, 174, 201, 162, 219, 179, 159 },
+        {  96,  36, 145, 198, 153, 167, 193, 153, 222, 180, 177 },
+        {  31,  35,  89, 156, 131, 157, 166, 136, 214, 170, 178 }
+      }, { /* Coeff Band 1 */
+        {   1, 138, 202, 225, 174, 178, 218, 164, 243, 200, 201 },
+        { 147, 134, 202, 223, 174, 177, 215, 162, 243, 204, 220 },
+        {  65, 115, 179, 224, 176, 177, 215, 162, 243, 202, 227 },
+        {  25,  86, 141, 217, 163, 177, 216, 159, 243, 201, 225 },
+        {   6,  48,  79, 181, 125, 157, 209, 151, 244, 201, 212 },
+        {   1,  16,  25,  77,  91, 134, 132, 112, 210, 162, 180 }
+      }, { /* Coeff Band 2 */
+        {   1,  78, 195, 222, 172, 177, 219, 162, 245, 205, 227 },
+        {  67,  79, 154, 211, 158, 171, 212, 159, 243, 201, 222 },
+        {  18,  63, 108, 192, 140, 163, 205, 152, 242, 197, 214 },
+        {   6,  49,  77, 163, 121, 154, 192, 142, 239, 191, 216 },
+        {   1,  34,  49, 112, 106, 143, 160, 122, 233, 178, 213 },
+        {   1,  14,  20,  56,  93, 135,  94, 102, 189, 141, 170 }
+      }, { /* Coeff Band 3 */
+        {   1, 137, 210, 229, 182, 181, 223, 164, 247, 214, 201 },
+        {  89, 123, 189, 226, 176, 180, 217, 165, 245, 207, 216 },
+        {  24, 100, 155, 217, 162, 176, 215, 163, 242, 198, 215 },
+        {   8,  78, 121, 199, 147, 167, 206, 155, 241, 198, 212 },
+        {   2,  52,  81, 161, 125, 156, 185, 139, 236, 186, 207 },
+        {   1,  22,  35,  88, 102, 141, 121, 116, 199, 153, 179 }
+      }, { /* Coeff Band 4 */
+        {   1, 169, 220, 239, 196, 191, 220, 173, 242, 201, 226 },
+        {  64, 139, 195, 231, 183, 184, 215, 169, 240, 196, 211 },
+        {  12, 103, 153, 217, 162, 174, 212, 163, 236, 195, 211 },
+        {   3,  71, 109, 190, 141, 164, 202, 152, 240, 192, 220 },
+        {   1,  38,  61, 139, 114, 149, 175, 133, 233, 183, 211 },
+        {   1,  13,  22,  61,  93, 134, 101, 106, 194, 145, 185 }
+      }, { /* Coeff Band 5 */
+        {   1, 204, 220, 234, 193, 185, 220, 166, 247, 207, 237 },
+        {  42, 139, 187, 221, 174, 177, 215, 161, 246, 201, 242 },
+        {   5,  83, 132, 204, 152, 168, 212, 158, 246, 203, 225 },
+        {   1,  48,  84, 175, 126, 157, 203, 148, 245, 199, 233 },
+        {   1,  24,  46, 123, 103, 142, 178, 128, 243, 189, 235 },
+        {   1,  10,  19,  58,  88, 134, 109, 101, 216, 151, 216 }
+      }
+    }, { /* Inter */
+      { /* Coeff Band 0 */
+        { 227,  36, 243, 237, 206, 186, 210, 157, 245, 195, 200 },
+        { 144,  41, 214, 226, 190, 182, 207, 155, 238, 193, 177 },
+        {  63,  37, 153, 199, 162, 169, 193, 145, 227, 187, 152 }
+      }, { /* Coeff Band 1 */
+        {   1, 170, 247, 248, 213, 201, 239, 188, 238, 203, 255 },
+        { 214, 166, 242, 248, 212, 198, 236, 191, 221, 219, 199 },
+        { 139, 148, 224, 247, 207, 197, 236, 189, 249, 241, 128 },
+        { 102, 127, 195, 244, 190, 198, 235, 189, 239, 202, 228 },
+        {  76, 106, 154, 227, 159, 176, 234, 182, 243, 216, 229 },
+        {  52,  69,  93, 158, 125, 155, 173, 139, 225, 170, 209 }
+      }, { /* Coeff Band 2 */
+        {   1, 139, 241, 245, 205, 193, 230, 177, 239, 198, 183 },
+        { 131, 139, 214, 240, 191, 189, 224, 181, 236, 203, 194 },
+        {  32, 102, 157, 228, 167, 177, 221, 174, 235, 191, 194 },
+        {  12,  75, 112, 201, 142, 163, 208, 161, 227, 180, 200 },
+        {   2,  45,  66, 142, 119, 154, 178, 141, 220, 171, 213 },
+        {   1,  15,  20,  56, 102, 151,  87, 104, 182, 136, 175 }
+      }, { /* Coeff Band 3 */
+        {   1, 174, 243, 248, 212, 201, 237, 194, 249, 207, 255 },
+        { 134, 155, 223, 244, 200, 195, 230, 184, 248, 189, 233 },
+        {  26, 115, 177, 235, 180, 185, 225, 176, 245, 198, 255 },
+        {   8,  82, 129, 217, 156, 175, 220, 168, 243, 204, 228 },
+        {   3,  48,  75, 165, 122, 155, 193, 145, 245, 189, 199 },
+        {   1,  15,  27,  73, 101, 139, 117, 112, 212, 157, 209 }
+      }, { /* Coeff Band 4 */
+        {   1, 191, 244, 248, 214, 200, 229, 185, 249, 207, 255 },
+        { 106, 167, 221, 242, 198, 192, 223, 178, 245, 202, 246 },
+        {  13, 117, 169, 229, 175, 182, 220, 170, 244, 202, 226 },
+        {   2,  74, 114, 203, 143, 170, 211, 160, 248, 199, 232 },
+        {   1,  35,  58, 141, 111, 144, 184, 132, 244, 196, 239 },
+        {   1,  12,  22,  66,  91, 138, 114, 102, 225, 156, 214 }
+      }, { /* Coeff Band 5 */
+        {   1, 220, 231, 246, 203, 196, 239, 188, 255, 212, 255 },
+        {  42, 155, 203, 241, 189, 191, 235, 184, 253, 220, 255 },
+        {   4,  95, 151, 230, 167, 182, 234, 178, 252, 217, 243 },
+        {   1,  61, 105, 206, 140, 168, 226, 167, 250, 215, 242 },
+        {   1,  31,  60, 151, 109, 148, 204, 142, 250, 208, 230 },
+        {   1,  13,  26,  76,  93, 132, 139, 106, 236, 171, 237 }
+      }
     }
   }
 };
 static const vp9_coeff_probs default_coef_probs_32x32[BLOCK_TYPES_32X32] = {
   { /* block Type 0 */
-    { /* Coeff Band 0 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 1 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 2 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 3 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 4 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 5 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 6 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 7 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }
-  }, { /* block Type 1 */
-    { /* Coeff Band 0 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 1 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 2 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 3 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 4 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 5 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 6 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 7 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }
-  }, { /* block Type 2 */
-    { /* Coeff Band 0 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 1 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 2 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 3 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 4 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 5 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 6 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 7 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }
-  }, { /* block Type 3 */
-    { /* Coeff Band 0 */
-      {   8,  40, 224, 217, 183, 181, 180, 148, 200, 180, 123 },
-      {   6,  37, 178, 193, 173, 171, 160, 139, 205, 166, 173 },
-      {   3,  27,  93, 133, 143, 159, 115, 125, 183, 141, 178 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
-    }, { /* Coeff Band 1 */
-      {   1, 170, 209, 202, 172, 175, 179, 143, 238, 181, 214 },
-      { 184, 164, 199, 199, 169, 173, 180, 143, 238, 184, 217 },
-      {  99, 128, 165, 194, 161, 171, 180, 142, 239, 182, 219 },
-      {  17,  49,  59, 102, 117, 148, 122, 116, 208, 152, 191 }
-    }, { /* Coeff Band 2 */
-      {   1, 136, 200, 197, 172, 172, 168, 142, 226, 170, 216 },
-      {  66, 104, 146, 175, 152, 165, 163, 139, 225, 170, 219 },
-      {  11,  52,  83, 144, 130, 156, 151, 130, 222, 165, 216 },
-      {   1,  16,  25,  65,  99, 137,  96, 106, 190, 138, 184 }
-    }, { /* Coeff Band 3 */
-      {   1, 180, 203, 198, 166, 170, 190, 143, 241, 190, 227 },
-      {  74, 125, 161, 187, 154, 165, 187, 142, 241, 189, 224 },
-      {  15,  70,  98, 163, 133, 157, 182, 137, 241, 187, 226 },
-      {   1,  25,  37,  89, 104, 140, 128, 113, 218, 158, 206 }
-    }, { /* Coeff Band 4 */
-      {   1, 191, 208, 213, 169, 173, 212, 156, 246, 206, 217 },
-      {  53, 136, 170, 205, 159, 170, 211, 156, 246, 205, 208 },
-      {   3,  75, 112, 189, 140, 163, 209, 151, 246, 205, 215 },
-      {   1,  32,  51, 127, 108, 145, 171, 128, 231, 183, 197 }
-    }, { /* Coeff Band 5 */
-      {   1, 183, 195, 202, 161, 168, 206, 150, 247, 202, 229 },
-      {  42, 113, 144, 190, 147, 163, 203, 148, 247, 202, 229 },
-      {   2,  56,  82, 160, 124, 153, 195, 140, 246, 200, 229 },
-      {   1,  22,  34,  93,  99, 138, 143, 115, 227, 170, 206 }
-    }, { /* Coeff Band 6 */
-      {   1, 202, 193, 221, 168, 175, 227, 167, 251, 217, 236 },
-      {  26, 122, 158, 213, 157, 171, 225, 165, 251, 216, 242 },
-      {   1,  68, 105, 194, 136, 162, 221, 158, 251, 215, 239 },
-      {   1,  32,  51, 131, 107, 145, 179, 130, 240, 188, 231 }
-    }, { /* Coeff Band 7 */
-      {   1, 234, 212, 243, 195, 192, 240, 187, 253, 226, 227 },
-      {  14, 141, 186, 237, 181, 186, 239, 184, 253, 226, 233 },
-      {   1,  85, 132, 221, 155, 174, 235, 176, 253, 224, 226 },
-      {   1,  39,  65, 159, 115, 150, 202, 144, 245, 202, 214 }
+    { /* Intra */
+      { /* Coeff Band 0 */
+        {  38,  32, 115, 163, 140, 164, 143, 139, 167, 157, 105 },
+        {  11,  27,  73, 131, 126, 154, 131, 129, 178, 151, 138 },
+        {   2,  19,  36,  83, 107, 144, 102, 116, 169, 140, 149 }
+      }, { /* Coeff Band 1 */
+        {   1, 116, 150, 184, 149, 164, 180, 140, 230, 178, 199 },
+        {  71, 114, 149, 183, 150, 164, 181, 141, 229, 179, 203 },
+        {  39, 102, 139, 182, 148, 164, 181, 142, 229, 179, 197 },
+        {  16,  82, 117, 176, 143, 161, 180, 141, 230, 180, 200 },
+        {   3,  49,  72, 148, 120, 152, 175, 134, 230, 178, 200 },
+        {   1,  14,  21,  56,  94, 135,  92, 103, 179, 141, 158 }
+      }, { /* Coeff Band 2 */
+        {   1,  56, 140, 180, 151, 164, 175, 140, 224, 175, 194 },
+        {  28,  51, 101, 162, 135, 158, 170, 136, 222, 175, 193 },
+        {   9,  38,  68, 137, 120, 151, 160, 129, 221, 172, 193 },
+        {   3,  28,  47, 111, 108, 145, 145, 121, 216, 165, 192 },
+        {   1,  17,  28,  76,  97, 137, 117, 110, 206, 152, 189 },
+        {   1,   7,  11,  34,  89, 131,  62,  96, 154, 123, 148 }
+      }, { /* Coeff Band 3 */
+        {   1, 129, 170, 198, 160, 169, 186, 147, 231, 181, 201 },
+        {  45, 106, 147, 191, 152, 166, 186, 145, 228, 182, 197 },
+        {  14,  81, 117, 178, 141, 161, 183, 143, 227, 184, 187 },
+        {   4,  61,  89, 159, 129, 156, 178, 137, 226, 182, 174 },
+        {   1,  39,  59, 126, 113, 146, 161, 126, 227, 176, 186 },
+        {   1,  18,  26,  67,  98, 137, 103, 107, 190, 146, 166 }
+      }, { /* Coeff Band 4 */
+        {   1, 152, 180, 211, 166, 173, 206, 154, 243, 197, 216 },
+        {  24, 112, 150, 202, 155, 169, 204, 152, 242, 196, 212 },
+        {   3,  76, 112, 186, 141, 163, 199, 148, 241, 195, 212 },
+        {   1,  51,  80, 164, 124, 155, 191, 141, 240, 192, 212 },
+        {   1,  30,  48, 123, 106, 144, 170, 127, 235, 182, 210 },
+        {   1,  13,  20,  60,  92, 134, 102, 105, 189, 146, 160 }
+      }, { /* Coeff Band 5 */
+        {   1, 212, 207, 235, 190, 187, 220, 170, 240, 200, 207 },
+        {  11, 134, 179, 226, 175, 181, 214, 166, 236, 195, 201 },
+        {   1,  86, 133, 210, 155, 172, 210, 161, 236, 194, 201 },
+        {   1,  54,  88, 180, 129, 159, 200, 150, 235, 191, 200 },
+        {   1,  27,  46, 122, 104, 143, 170, 128, 230, 181, 198 },
+        {   1,   8,  15,  45,  88, 132,  81,  99, 171, 135, 154 }
+      }
+    }, { /* Inter */
+      { /* Coeff Band 0 */
+        {  50,  51, 216, 230, 193, 186, 193, 156, 219, 181, 168 },
+        {  36,  44, 174, 210, 175, 174, 186, 149, 218, 179, 172 },
+        {  12,  32, 100, 161, 140, 159, 162, 135, 209, 168, 172 }
+      }, { /* Coeff Band 1 */
+        {   1, 179, 230, 238, 191, 185, 229, 171, 250, 213, 200 },
+        { 167, 173, 225, 237, 190, 186, 231, 171, 245, 209, 223 },
+        { 115, 153, 208, 237, 187, 186, 229, 174, 247, 215, 216 },
+        {  71, 131, 182, 233, 176, 184, 228, 172, 247, 210, 238 },
+        {  41, 108, 145, 214, 151, 169, 228, 169, 246, 208, 210 },
+        {  19,  78,  95, 151, 128, 155, 168, 134, 218, 173, 175 }
+      }, { /* Coeff Band 2 */
+        {   1, 147, 215, 231, 181, 181, 227, 171, 249, 212, 218 },
+        {  65, 122, 179, 222, 168, 175, 223, 166, 248, 213, 216 },
+        {  11,  85, 126, 204, 148, 167, 218, 159, 247, 208, 222 },
+        {   4,  61,  89, 177, 128, 158, 206, 147, 246, 204, 208 },
+        {   1,  38,  54, 130, 109, 145, 179, 128, 241, 191, 203 },
+        {   1,  18,  24,  68,  96, 137, 110, 107, 196, 153, 145 }
+      }, { /* Coeff Band 3 */
+        {   1, 182, 227, 239, 193, 187, 231, 177, 250, 214, 189 },
+        {  73, 147, 202, 234, 182, 183, 230, 174, 248, 213, 219 },
+        {  12, 104, 154, 223, 164, 176, 228, 171, 248, 210, 225 },
+        {   3,  74, 113, 205, 143, 167, 222, 163, 246, 211, 214 },
+        {   1,  45,  69, 163, 116, 151, 205, 144, 244, 202, 205 },
+        {   1,  19,  30,  87,  96, 138, 134, 115, 199, 165, 133 }
+      }, { /* Coeff Band 4 */
+        {   1, 198, 229, 242, 196, 190, 235, 182, 248, 216, 224 },
+        {  55, 154, 201, 236, 183, 185, 233, 179, 247, 214, 190 },
+        {   5, 101, 150, 225, 163, 177, 229, 172, 245, 210, 205 },
+        {   1,  68, 106, 203, 140, 165, 223, 165, 246, 209, 194 },
+        {   1,  38,  62, 154, 112, 149, 199, 143, 241, 198, 191 },
+        {   1,  14,  22,  66,  94, 133, 109, 107, 178, 154, 122 }
+      }, { /* Coeff Band 5 */
+        {   1, 237, 226, 244, 205, 196, 225, 177, 243, 203, 210 },
+        {  24, 154, 200, 238, 189, 189, 221, 173, 240, 199, 210 },
+        {   2,  98, 150, 224, 167, 179, 217, 168, 240, 199, 207 },
+        {   1,  61,  99, 193, 137, 164, 207, 155, 239, 197, 208 },
+        {   1,  28,  49, 128, 105, 145, 177, 130, 234, 185, 206 },
+        {   1,   9,  16,  48,  89, 134,  89,  99, 183, 140, 169 }
+      }
     }
   }
 };
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index 352e17c0c..759b90128 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -1,4 +1,4 @@
-/*
+/*
  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
@@ -41,12 +41,20 @@ DECLARE_ALIGNED(16, const uint8_t, vp9_norm[256]) = {
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 
-DECLARE_ALIGNED(16, const int, vp9_coef_bands_4x4[16]) = {
-  0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7
+// Unified coefficient band structure used by all block sizes
+DECLARE_ALIGNED(16, const int, vp9_coef_bands[32]) = {
+  0, 1, 2, 3, 3, 3, 4, 4,
+  4, 4, 4, 4, 4, 4, 4, 5,
+  5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5
+};
+DECLARE_ALIGNED(16, const int, vp9_coef_bands4x4[16]) = {
+  0, 1, 2, 3, 3, 3, 4, 4,
+  4, 4, 5, 5, 5, 5, 5, 5
 };
 
-DECLARE_ALIGNED(16, const uint8_t, vp9_prev_token_class[MAX_ENTROPY_TOKENS]) = {
-  0, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0
+DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]) = {
+  0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5
 };
 
 DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_4x4[16]) = {
@@ -70,17 +78,6 @@ DECLARE_ALIGNED(16, const int, vp9_row_scan_4x4[16]) = {
   12, 13, 14, 15
 };
 
-DECLARE_ALIGNED(64, const int, vp9_coef_bands_8x8[64]) = {
-  0, 1, 2, 3, 5, 4, 4, 5,
-  5, 3, 6, 3, 5, 4, 6, 6,
-  6, 5, 5, 6, 6, 6, 6, 6,
-  6, 6, 6, 6, 6, 6, 6, 6,
-  6, 6, 6, 6, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7
-};
-
 DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]) = {
   0,  1,  8, 16,  9,  2,  3, 10, 17, 24, 32, 25, 18, 11,  4,  5,
   12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13,  6,  7, 14, 21, 28,
@@ -88,26 +85,6 @@ DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]) = {
   58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63,
 };
 
-// Table can be optimized.
-DECLARE_ALIGNED(16, const int, vp9_coef_bands_16x16[256]) = {
-  0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, 6,
-  6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-  6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-};
-
 DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]) = {
   0,   1,  16,  32,  17,   2,   3,  18,
   33,  48,  64,  49,  34,  19,   4,   5,
@@ -143,694 +120,6 @@ DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]) = {
   237, 252, 253, 238, 223, 239, 254, 255,
 };
 
-#if CONFIG_DWTDCTHYBRID
-
-#if DWTDCT_TYPE == DWTDCT16X16_LEAN
-DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = {
-  0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, 6,
-  6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-  6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-
-  6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-};
-
-DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = {
-  0,    1,   32,   64,   33,    2,    3,   34,
-  65,   96, 128,   97,   66,   35,    4,  5,
-  36,   67,   98,  129,  160,  192,  161,  130,
-  99,   68,   37,    6,    7,   38,   69,  100,
-  131,  162,  193,  224, 256,  225,  194,  163,
-  132,  101,   70,   39,    8,    9,   40,   71,
-  102,  133,  164,  195,  226,  257,  288,  320,
-  289,  258,  227,  196,  165,  134,  103,   72,
-  41,   10,   11,   42,   73,  104,  135,  166,
-  197,  228,  259,  290,  321,  352,  384,  353,
-  322,  291,  260,  229,  198,  167,  136,  105,
-  74,   43,   12,   13,   44,   75,  106,  137,
-  168,  199,  230,  261,  292,  323,  354,  385,
-  416,  448,  417,  386,  355,  324,  293,  262,
-  231,  200,  169,  138,  107,   76,   45,   14,
-  15,   46,   77,  108,  139,  170,  201,  232,
-  263,  294,  325,  356,  387,  418,  449,  480,
-  481,  450,  419,  388,  357,  326,  295,  264,
-  233,  202,  171,  140,  109,   78,   47,   79,
-  110,  141,  172,  203,  234,  265,  296,  327,
-  358,  389,  420,  451,  482,  483,  452,  421,
-  390,  359,  328,  297,  266,  235,  204,  173,
-  142,  111,  143,  174,  205,  236,  267,  298,
-  329,  360,  391,  422,  453,  484,  485,  454,
-  423,  392,  361,  330,  299,  268,  237,  206,
-  175,  207,  238,  269,  300,  331,  362,  393,
-  424,  455,  486,  487,  456,  425,  394,  363,
-  332,  301,  270,  239,  271,  302,  333,  364,
-  395,  426,  457,  488,  489,  458,  427,  396,
-  365,  334,  303,  335,  366,  397,  428,  459,
-  490,  491,  460,  429,  398,  367,  399,  430,
-  461,  492,  493,  462,  431,  463,  494,  495,
-
-  16,   512,  528, 17,  513,  529,   48,  544,
-  560, 80,  576,  592,   49,  545,  561,   18,
-  514,  530,   19,  515,  531,   50,  546,  562,
-  81,  577,  593,  112,  608,  624,  144,  640,
-  656,  113,  609,  625,   82,  578,  594,   51,
-  547,  563,   20,  516,  532,   21,  517,  533,
-  52,  548,  564,   83,  579,  595,  114,  610,
-  626,  145,  641,  657,  176,  672,  688,  208,
-  704,  720,  177,  673,  689,  146,  642,  658,
-  115,  611,  627,   84,  580,  596,   53,  549,
-  565,   22,  518,  534,   23,  519,  535,   54,
-  550,  566,   85,  581,  597,  116,  612,  628,
-  147,  643,  659,  178,  674,  690,  209,  705,
-  721,  240,  736,  752,  272,  768,  784,  241,
-  737,  753,  210,  706,  722,  179,  675,  691,
-  148,  644,  660,  117,  613,  629,   86,  582,
-  598,   55,  551,  567,   24,  520,  536,   25,
-  521,  537,   56,  552,  568,   87,  583,  599,
-  118,  614,  630,  149,  645,  661,  180,  676,
-  692,  211,  707,  723,  242,  738,  754,  273,
-  769,  785,  304,  800,  816,  336,  832,  848,
-  305,  801,  817,  274,  770,  786,  243,  739,
-  755,  212,  708,  724,  181,  677,  693,  150,
-  646,  662,  119,  615,  631,   88,  584,  600,
-  57,  553,  569,   26,  522,  538,   27,  523,
-  539,   58,  554,  570,   89,  585,  601,  120,
-  616,  632,  151,  647,  663,  182,  678,  694,
-  213,  709,  725,  244,  740,  756,  275,  771,
-  787,  306,  802,  818,  337,  833,  849,  368,
-  864,  880,  400,  896,  912,  369,  865,  881,
-  338,  834,  850,  307,  803,  819,  276,  772,
-  788,  245,  741,  757,  214,  710,  726,  183,
-
-  679,  695,  152,  648,  664,  121,  617,  633,
-  90,  586,  602,   59,  555,  571,   28,  524,
-  540,   29,  525,  541,   60,  556,  572,   91,
-  587,  603,  122,  618,  634,  153,  649,  665,
-  184,  680,  696,  215,  711,  727,  246,  742,
-  758,  277,  773,  789,  308,  804,  820,  339,
-  835,  851,  370,  866,  882,  401,  897,  913,
-  432,  928,  944,  464,  960,  976,  433,  929,
-  945,  402,  898,  914,  371,  867,  883,  340,
-  836,  852,  309,  805,  821,  278,  774,  790,
-  247,  743,  759,  216,  712,  728,  185,  681,
-  697,  154,  650,  666,  123,  619,  635,   92,
-  588,  604,   61,  557,  573,   30,  526,  542,
-  31,  527,  543,   62,  558,  574,   93,  589,
-  605,  124,  620,  636,  155,  651,  667,  186,
-  682,  698,  217,  713,  729,  248,  744,  760,
-  279,  775,  791,  310,  806,  822,  341,  837,
-  853,  372,  868,  884,  403,  899,  915,  434,
-  930,  946,  465,  961,  977,  496,  992, 1008,
-  497,  993, 1009,  466,  962,  978,  435,  931,
-  947,  404,  900,  916,  373,  869,  885,  342,
-  838,  854,  311,  807,  823,  280,  776,  792,
-  249,  745,  761,  218,  714,  730,  187,  683,
-  699,  156,  652,  668,  125,  621,  637,   94,
-  590,  606,   63,  559,  575,   95,  591,  607,
-  126,  622,  638,  157,  653,  669,  188,  684,
-  700,  219,  715,  731,  250,  746,  762,  281,
-  777,  793,  312,  808,  824,  343,  839,  855,
-  374,  870,  886,  405,  901,  917,  436,  932,
-  948,  467,  963,  979,  498,  994, 1010,  499,
-  995, 1011,  468,  964,  980,  437,  933,  949,
-  406,  902,  918,  375,  871,  887,  344,  840,
-
-  856,  313,  809,  825,  282,  778,  794,  251,
-  747,  763,  220,  716,  732,  189,  685,  701,
-  158,  654,  670,  127,  623,  639,  159,  655,
-  671,  190,  686,  702,  221,  717,  733,  252,
-  748,  764,  283,  779,  795,  314,  810,  826,
-  345,  841,  857,  376,  872,  888,  407,  903,
-  919,  438,  934,  950,  469,  965,  981,  500,
-  996, 1012,  501,  997, 1013,  470,  966,  982,
-  439,  935,  951,  408,  904,  920,  377,  873,
-  889,  346,  842,  858,  315,  811,  827,  284,
-  780,  796,  253,  749,  765,  222,  718,  734,
-  191,  687,  703,  223,  719,  735,  254,  750,
-  766,  285,  781,  797,  316,  812,  828,  347,
-  843,  859,  378,  874,  890,  409,  905,  921,
-  440,  936,  952,  471,  967,  983,  502,  998,
-  1014,  503,  999, 1015,  472,  968,  984,  441,
-  937,  953,  410,  906,  922,  379,  875,  891,
-  348,  844,  860,  317,  813,  829,  286,  782,
-  798,  255,  751,  767,  287,  783,  799,  318,
-  814,  830,  349,  845,  861,  380,  876,  892,
-  411,  907,  923,  442,  938,  954,  473,  969,
-  985,  504, 1000, 1016,  505, 1001, 1017,  474,
-  970,  986,  443,  939,  955,  412,  908,  924,
-  381,  877,  893,  350,  846,  862,  319,  815,
-  831,  351,  847,  863,  382,  878,  894,  413,
-  909,  925,  444,  940,  956,  475,  971,  987,
-  506, 1002, 1018,  507, 1003, 1019,  476,  972,
-  988,  445,  941,  957,  414,  910,  926,  383,
-  879,  895,  415,  911,  927,  446,  942,  958,
-  477,  973,  989,  508, 1004, 1020,  509, 1005,
-  1021,  478,  974,  990,  447,  943,  959,  479,
-  975,  991,  510, 1006, 1022,  511, 1007, 1023,
-};
-
-#elif DWTDCT_TYPE == DWTDCT16X16
-
-DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = {
-  0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6,
-  6, 6, 6,
-  6,
-  6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-  6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-};
-
-DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = {
-  0,    1,   32,   64,   33,    2,    3,   34,
-  65,   96, 128,   97,   66,   35,    4,
-  16,   512,  528,
-  5,
-  36,   67,   98,  129,  160,  192,  161,  130,
-  99,   68,   37,    6,    7,   38,   69,  100,
-  131,  162,  193,  224, 256,  225,  194,  163,
-  132,  101,   70,   39,    8,    9,   40,   71,
-  102,  133,  164,  195,  226,  257,  288,  320,
-  289,  258,  227,  196,  165,  134,  103,   72,
-  41,   10,   11,   42,   73,  104,  135,  166,
-  197,  228,  259,  290,  321,  352,  384,  353,
-  322,  291,  260,  229,  198,  167,  136,  105,
-  74,   43,   12,   13,   44,   75,  106,  137,
-  168,  199,  230,  261,  292,  323,  354,  385,
-  416,  448,  417,  386,  355,  324,  293,  262,
-  231,  200,  169,  138,  107,   76,   45,   14,
-  15,   46,   77,  108,  139,  170,  201,  232,
-  263,  294,  325,  356,  387,  418,  449,  480,
-  481,  450,  419,  388,  357,  326,  295,  264,
-  233,  202,  171,  140,  109,   78,   47,   79,
-  110,  141,  172,  203,  234,  265,  296,  327,
-  358,  389,  420,  451,  482,  483,  452,  421,
-  390,  359,  328,  297,  266,  235,  204,  173,
-  142,  111,  143,  174,  205,  236,  267,  298,
-  329,  360,  391,  422,  453,  484,  485,  454,
-  423,  392,  361,  330,  299,  268,  237,  206,
-  175,  207,  238,  269,  300,  331,  362,  393,
-  424,  455,  486,  487,  456,  425,  394,  363,
-  332,  301,  270,  239,  271,  302,  333,  364,
-  395,  426,  457,  488,  489,  458,  427,  396,
-  365,  334,  303,  335,  366,  397,  428,  459,
-  490,  491,  460,  429,  398,  367,  399,  430,
-  461,  492,  493,  462,  431,  463,  494,  495,
-
-  17,  513,  529,   48,  544,
-  560, 80,  576,  592,   49,  545,  561,   18,
-  514,  530,   19,  515,  531,   50,  546,  562,
-  81,  577,  593,  112,  608,  624,  144,  640,
-  656,  113,  609,  625,   82,  578,  594,   51,
-  547,  563,   20,  516,  532,   21,  517,  533,
-  52,  548,  564,   83,  579,  595,  114,  610,
-  626,  145,  641,  657,  176,  672,  688,  208,
-  704,  720,  177,  673,  689,  146,  642,  658,
-  115,  611,  627,   84,  580,  596,   53,  549,
-  565,   22,  518,  534,   23,  519,  535,   54,
-  550,  566,   85,  581,  597,  116,  612,  628,
-  147,  643,  659,  178,  674,  690,  209,  705,
-  721,  240,  736,  752,  272,  768,  784,  241,
-  737,  753,  210,  706,  722,  179,  675,  691,
-  148,  644,  660,  117,  613,  629,   86,  582,
-  598,   55,  551,  567,   24,  520,  536,   25,
-  521,  537,   56,  552,  568,   87,  583,  599,
-  118,  614,  630,  149,  645,  661,  180,  676,
-  692,  211,  707,  723,  242,  738,  754,  273,
-  769,  785,  304,  800,  816,  336,  832,  848,
-  305,  801,  817,  274,  770,  786,  243,  739,
-  755,  212,  708,  724,  181,  677,  693,  150,
-  646,  662,  119,  615,  631,   88,  584,  600,
-  57,  553,  569,   26,  522,  538,   27,  523,
-  539,   58,  554,  570,   89,  585,  601,  120,
-  616,  632,  151,  647,  663,  182,  678,  694,
-  213,  709,  725,  244,  740,  756,  275,  771,
-  787,  306,  802,  818,  337,  833,  849,  368,
-  864,  880,  400,  896,  912,  369,  865,  881,
-  338,  834,  850,  307,  803,  819,  276,  772,
-  788,  245,  741,  757,  214,  710,  726,  183,
-
-  679,  695,  152,  648,  664,  121,  617,  633,
-  90,  586,  602,   59,  555,  571,   28,  524,
-  540,   29,  525,  541,   60,  556,  572,   91,
-  587,  603,  122,  618,  634,  153,  649,  665,
-  184,  680,  696,  215,  711,  727,  246,  742,
-  758,  277,  773,  789,  308,  804,  820,  339,
-  835,  851,  370,  866,  882,  401,  897,  913,
-  432,  928,  944,  464,  960,  976,  433,  929,
-  945,  402,  898,  914,  371,  867,  883,  340,
-  836,  852,  309,  805,  821,  278,  774,  790,
-  247,  743,  759,  216,  712,  728,  185,  681,
-  697,  154,  650,  666,  123,  619,  635,   92,
-  588,  604,   61,  557,  573,   30,  526,  542,
-  31,  527,  543,   62,  558,  574,   93,  589,
-  605,  124,  620,  636,  155,  651,  667,  186,
-  682,  698,  217,  713,  729,  248,  744,  760,
-  279,  775,  791,  310,  806,  822,  341,  837,
-  853,  372,  868,  884,  403,  899,  915,  434,
-  930,  946,  465,  961,  977,  496,  992, 1008,
-  497,  993, 1009,  466,  962,  978,  435,  931,
-  947,  404,  900,  916,  373,  869,  885,  342,
-  838,  854,  311,  807,  823,  280,  776,  792,
-  249,  745,  761,  218,  714,  730,  187,  683,
-  699,  156,  652,  668,  125,  621,  637,   94,
-  590,  606,   63,  559,  575,   95,  591,  607,
-  126,  622,  638,  157,  653,  669,  188,  684,
-  700,  219,  715,  731,  250,  746,  762,  281,
-  777,  793,  312,  808,  824,  343,  839,  855,
-  374,  870,  886,  405,  901,  917,  436,  932,
-  948,  467,  963,  979,  498,  994, 1010,  499,
-  995, 1011,  468,  964,  980,  437,  933,  949,
-  406,  902,  918,  375,  871,  887,  344,  840,
-
-  856,  313,  809,  825,  282,  778,  794,  251,
-  747,  763,  220,  716,  732,  189,  685,  701,
-  158,  654,  670,  127,  623,  639,  159,  655,
-  671,  190,  686,  702,  221,  717,  733,  252,
-  748,  764,  283,  779,  795,  314,  810,  826,
-  345,  841,  857,  376,  872,  888,  407,  903,
-  919,  438,  934,  950,  469,  965,  981,  500,
-  996, 1012,  501,  997, 1013,  470,  966,  982,
-  439,  935,  951,  408,  904,  920,  377,  873,
-  889,  346,  842,  858,  315,  811,  827,  284,
-  780,  796,  253,  749,  765,  222,  718,  734,
-  191,  687,  703,  223,  719,  735,  254,  750,
-  766,  285,  781,  797,  316,  812,  828,  347,
-  843,  859,  378,  874,  890,  409,  905,  921,
-  440,  936,  952,  471,  967,  983,  502,  998,
-  1014,  503,  999, 1015,  472,  968,  984,  441,
-  937,  953,  410,  906,  922,  379,  875,  891,
-  348,  844,  860,  317,  813,  829,  286,  782,
-  798,  255,  751,  767,  287,  783,  799,  318,
-  814,  830,  349,  845,  861,  380,  876,  892,
-  411,  907,  923,  442,  938,  954,  473,  969,
-  985,  504, 1000, 1016,  505, 1001, 1017,  474,
-  970,  986,  443,  939,  955,  412,  908,  924,
-  381,  877,  893,  350,  846,  862,  319,  815,
-  831,  351,  847,  863,  382,  878,  894,  413,
-  909,  925,  444,  940,  956,  475,  971,  987,
-  506, 1002, 1018,  507, 1003, 1019,  476,  972,
-  988,  445,  941,  957,  414,  910,  926,  383,
-  879,  895,  415,  911,  927,  446,  942,  958,
-  477,  973,  989,  508, 1004, 1020,  509, 1005,
-  1021,  478,  974,  990,  447,  943,  959,  479,
-  975,  991,  510, 1006, 1022,  511, 1007, 1023,
-};
-
-#elif DWTDCT_TYPE == DWTDCT8X8
-
-DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = {
-  0, 1, 2, 3, 5, 4, 4, 5,
-  5, 3, 6, 3, 5, 4, 6, 6,
-  6, 5, 5, 6, 6, 6, 6, 6,
-  6, 6, 6, 6, 6, 6, 6, 6,
-  6, 6, 6, 6, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7,
-
-  6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-
-  6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-};
-
-DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = {
-  0,    1,   32,   64,   33,    2,    3,   34,
-  65,   96,  128,   97,   66,   35,    4,    5,
-  36,   67,   98,  129,  160,  192,  161,  130,
-  99,   68,   37,    6,    7,   38,   69,  100,
-  131,  162,  193,  224,  225,  194,  163,  132,
-  101,   70,   39,   71,  102,  133,  164,  195,
-  226,  227,  196,  165,  134,  103,  135,  166,
-  197,  228,  229,  198,  167,  199,  230,  231,
-
-  8,  256,  264,    9,  257,  265,   40,  288, 296, 72,  320,  328,
-  41,  289,  297,   10, 258,  266, 11,  259,  267,   42,  290,  298,
-  73,  321,  329,  104,  352,  360,  136,  384, 392,  105,  353,  361,
-  74,  322,  330,   43, 291,  299,   12,  260,  268,   13,  261,  269,
-  44,  292,  300,   75,  323,  331,  106,  354, 362,  137,  385,  393,
-  168,  416,  424,  200, 448,  456,  169,  417,  425,  138,  386,  394,
-  107,  355,  363,   76,  324,  332,   45,  293, 301,   14,  262,  270,
-  15,  263,  271,   46, 294,  302,   77,  325,  333,  108,  356,  364,
-  139,  387,  395,  170, 418,  426,  201,  449, 457,  232,  480,  488,
-  233,  481,  489,  202, 450,  458,  171,  419,  427,  140,  388,  396,
-  109,  357,  365,   78,  326,  334,   47,  295, 303,   79,  327,  335,
-  110,  358,  366,  141, 389,  397,  172,  420,  428,  203,  451,  459,
-  234,  482,  490,  235,  483,  491,  204,  452, 460,  173,  421,  429,
-  142,  390,  398,  111, 359,  367,  143,  391,  399,  174,  422,  430,
-  205,  453,  461,  236,  484,  492,  237,  485, 493,  206,  454,  462,
-  175,  423,  431,  207, 455,  463,  238,  486,  494,  239,  487,  495,
-
-  16,  512,  528,   17,  513,  529,   18,  514,
-  530,   19,  515,  531,   20,  516,  532,   21,
-  517,  533,   22,  518,  534,   23,  519,  535,
-  24,  520,  536,   25,  521,  537,   26,  522,
-  538,   27,  523,  539,   28,  524,  540,   29,
-  525,  541,   30,  526,  542,   31,  527,  543,
-  48,  544,  560,   49,  545,  561,   50,  546,
-  562,   51,  547,  563,   52,  548,  564,   53,
-  549,  565,   54,  550,  566,   55,  551,  567,
-  56,  552,  568,   57,  553,  569,   58,  554,
-  570,   59,  555,  571,   60,  556,  572,   61,
-  557,  573,   62,  558,  574,   63,  559,  575,
-  80,  576,  592,   81,  577,  593,   82,  578,
-  594,   83,  579,  595,   84,  580,  596,   85,
-  581,  597,   86,  582,  598,   87,  583,  599,
-  88,  584,  600,   89,  585,  601,   90,  586,
-  602,   91,  587,  603,   92,  588,  604,   93,
-  589,  605,   94,  590,  606,   95,  591,  607,
-  112,  608,  624,  113,  609,  625,  114,  610,
-  626,  115,  611,  627,  116,  612,  628,  117,
-  613,  629,  118,  614,  630,  119,  615,  631,
-  120,  616,  632,  121,  617,  633,  122,  618,
-  634,  123,  619,  635,  124,  620,  636,  125,
-  621,  637,  126,  622,  638,  127,  623,  639,
-  144,  640,  656,  145,  641,  657,  146,  642,
-  658,  147,  643,  659,  148,  644,  660,  149,
-  645,  661,  150,  646,  662,  151,  647,  663,
-  152,  648,  664,  153,  649,  665,  154,  650,
-  666,  155,  651,  667,  156,  652,  668,  157,
-  653,  669,  158,  654,  670,  159,  655,  671,
-  176,  672,  688,  177,  673,  689,  178,  674,
-  690,  179,  675,  691,  180,  676,  692,  181,
-  677,  693,  182,  678,  694,  183,  679,  695,
-  184,  680,  696,  185,  681,  697,  186,  682,
-  698,  187,  683,  699,  188,  684,  700,  189,
-  685,  701,  190,  686,  702,  191,  687,  703,
-  208,  704,  720,  209,  705,  721,  210,  706,
-  722,  211,  707,  723,  212,  708,  724,  213,
-  709,  725,  214,  710,  726,  215,  711,  727,
-  216,  712,  728,  217,  713,  729,  218,  714,
-  730,  219,  715,  731,  220,  716,  732,  221,
-  717,  733,  222,  718,  734,  223,  719,  735,
-  240,  736,  752,  241,  737,  753,  242,  738,
-  754,  243,  739,  755,  244,  740,  756,  245,
-  741,  757,  246,  742,  758,  247,  743,  759,
-  248,  744,  760,  249,  745,  761,  250,  746,
-  762,  251,  747,  763,  252,  748,  764,  253,
-  749,  765,  254,  750,  766,  255,  751,  767,
-  272,  768,  784,  273,  769,  785,  274,  770,
-  786,  275,  771,  787,  276,  772,  788,  277,
-  773,  789,  278,  774,  790,  279,  775,  791,
-  280,  776,  792,  281,  777,  793,  282,  778,
-  794,  283,  779,  795,  284,  780,  796,  285,
-  781,  797,  286,  782,  798,  287,  783,  799,
-  304,  800,  816,  305,  801,  817,  306,  802,
-  818,  307,  803,  819,  308,  804,  820,  309,
-  805,  821,  310,  806,  822,  311,  807,  823,
-  312,  808,  824,  313,  809,  825,  314,  810,
-  826,  315,  811,  827,  316,  812,  828,  317,
-  813,  829,  318,  814,  830,  319,  815,  831,
-  336,  832,  848,  337,  833,  849,  338,  834,
-  850,  339,  835,  851,  340,  836,  852,  341,
-  837,  853,  342,  838,  854,  343,  839,  855,
-  344,  840,  856,  345,  841,  857,  346,  842,
-  858,  347,  843,  859,  348,  844,  860,  349,
-  845,  861,  350,  846,  862,  351,  847,  863,
-  368,  864,  880,  369,  865,  881,  370,  866,
-  882,  371,  867,  883,  372,  868,  884,  373,
-  869,  885,  374,  870,  886,  375,  871,  887,
-  376,  872,  888,  377,  873,  889,  378,  874,
-  890,  379,  875,  891,  380,  876,  892,  381,
-  877,  893,  382,  878,  894,  383,  879,  895,
-  400,  896,  912,  401,  897,  913,  402,  898,
-  914,  403,  899,  915,  404,  900,  916,  405,
-  901,  917,  406,  902,  918,  407,  903,  919,
-  408,  904,  920,  409,  905,  921,  410,  906,
-  922,  411,  907,  923,  412,  908,  924,  413,
-  909,  925,  414,  910,  926,  415,  911,  927,
-  432,  928,  944,  433,  929,  945,  434,  930,
-  946,  435,  931,  947,  436,  932,  948,  437,
-  933,  949,  438,  934,  950,  439,  935,  951,
-  440,  936,  952,  441,  937,  953,  442,  938,
-  954,  443,  939,  955,  444,  940,  956,  445,
-  941,  957,  446,  942,  958,  447,  943,  959,
-  464,  960,  976,  465,  961,  977,  466,  962,
-  978,  467,  963,  979,  468,  964,  980,  469,
-  965,  981,  470,  966,  982,  471,  967,  983,
-  472,  968,  984,  473,  969,  985,  474,  970,
-  986,  475,  971,  987,  476,  972,  988,  477,
-  973,  989,  478,  974,  990,  479,  975,  991,
-  496,  992, 1008,  497,  993, 1009,  498,  994,
-  1010,  499,  995, 1011,  500,  996, 1012,  501,
-  997, 1013,  502,  998, 1014,  503,  999, 1015,
-  504, 1000, 1016,  505, 1001, 1017,  506, 1002,
-  1018,  507, 1003, 1019,  508, 1004, 1020,  509,
-  1005, 1021,  510, 1006, 1022,  511, 1007, 1023,
-};
-#endif
-
-#else
-
-DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = {
-  0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, 6,
-  6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-  6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-};
-
 DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = {
     0,    1,   32,   64,   33,    2,    3,   34,   65,   96,  128,   97,   66,   35,    4,    5,   36,   67,   98,  129,  160,  192,  161,  130,   99,   68,   37,    6,    7,   38,   69,  100,
   131,  162,  193,  224,  256,  225,  194,  163,  132,  101,   70,   39,    8,    9,   40,   71,  102,  133,  164,  195,  226,  257,  288,  320,  289,  258,  227,  196,  165,  134,  103,   72,
@@ -865,7 +154,6 @@ DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = {
   951,  920,  889,  858,  827,  796,  765,  734,  703,  735,  766,  797,  828,  859,  890,  921,  952,  983, 1014, 1015,  984,  953,  922,  891,  860,  829,  798,  767,  799,  830,  861,  892,
   923,  954,  985, 1016, 1017,  986,  955,  924,  893,  862,  831,  863,  894,  925,  956,  987, 1018, 1019,  988,  957,  926,  895,  927,  958,  989, 1020, 1021,  990,  959,  991, 1022, 1023,
 };
-#endif  // CONFIG_DWTDCTHYBRID
 
 /* Array indices are identical to previously-existing CONTEXT_NODE indices */
 
@@ -937,162 +225,40 @@ vp9_extra_bit_struct vp9_extra_bits[12] = {
 
 #include "vp9/common/vp9_default_coef_probs.h"
 
-#if CONFIG_NEWCOEFCONTEXT
-
-// Neighborhood 5-tuples for various scans and blocksizes,
-// in {top, left, topleft, topright, bottomleft} order
-// for each position in raster scan order.
-// -1 indicates the neighbor does not exist.
-DECLARE_ALIGNED(16, int,
-                vp9_default_zig_zag1d_4x4_neighbors[16 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int,
-                vp9_col_scan_4x4_neighbors[16 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int,
-                vp9_row_scan_4x4_neighbors[16 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int,
-                vp9_default_zig_zag1d_8x8_neighbors[64 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int,
-                vp9_default_zig_zag1d_16x16_neighbors[256 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int,
-                vp9_default_zig_zag1d_32x32_neighbors[1024 * MAX_NEIGHBORS]);
-
-static int find_in_scan(const int *scan, int l, int m) {
-  int i, l2 = l * l;
-  for (i = 0; i < l2; ++i) {
-    if (scan[i] == m)
-      return i;
-  }
-  return -1;
-}
-
-static void init_scan_neighbors(const int *scan, int l, int *neighbors) {
-  int l2 = l * l;
-  int m, n, i, j, k;
-  for (n = 0; n < l2; ++n) {
-    int locn = find_in_scan(scan, l, n);
-    int z = -1;
-    i = n / l;
-    j = n % l;
-    for (k = 0; k < MAX_NEIGHBORS; ++k)
-      neighbors[MAX_NEIGHBORS * n + k] = -1;
-    if (i - 1 >= 0) {
-      m = (i - 1) * l + j;
-      if (find_in_scan(scan, l, m) < locn) {
-        neighbors[MAX_NEIGHBORS * n] = m;
-        if (m == 0) z = 0;
-      }
+// This function updates and then returns an AC coefficient context.
+// This is currently a placeholder function to allow experimentation
+// using various context models based on the energy of earlier tokens
+// within the current block.
+//
+// For now it just returns the previously used context.
+int vp9_get_coef_context(int * recent_energy, int token) {
+  // int token_energy;
+  // int av_energy;
+
+  /*token_energy = ((token != DCT_EOB_TOKEN) ? token : 0);
+  if (!token_energy) {
+    if (!(*recent_energy)) {
+      av_energy = 0;
+    } else {
+      av_energy = 1;
     }
-    if (j - 1 >= 0) {
-      m = i * l + j - 1;
-      if (find_in_scan(scan, l, m) < locn) {
-        neighbors[MAX_NEIGHBORS * n + 1] = m;
-        if (m == 0) z = 1;
-      }
-    }
-    if (i - 1 >= 0 && j - 1 >= 0) {
-      m = (i - 1) * l + j - 1;
-      if (find_in_scan(scan, l, m) < locn) {
-        neighbors[MAX_NEIGHBORS * n + 2] = m;
-        if (m == 0) z = 2;
-      }
-    }
-    if (i - 1 >= 0 && j + 1 < l) {
-      m = (i - 1) * l + j + 1;
-      if (find_in_scan(scan, l, m) < locn) {
-        neighbors[MAX_NEIGHBORS * n + 3] = m;
-        if (m == 0) z = 3;
-      }
-    }
-    if (i + 1 < l && j - 1 >= 0) {
-       m = (i + 1) * l + j - 1;
-      if (find_in_scan(scan, l, m) < locn) {
-        neighbors[MAX_NEIGHBORS * n + 4] = m;
-        if (m == 0) z = 4;
-      }
-    }
-    if (z != -1) {  // zero exists
-      int v = 0;
-      for (k = 0; k < MAX_NEIGHBORS; ++k)
-        v += (neighbors[MAX_NEIGHBORS * n + k] > 0);
-      if (v) {
-        neighbors[MAX_NEIGHBORS * n + z] = -1;
-      }
-    }
-  }
-}
-
-void vp9_init_neighbors() {
-  init_scan_neighbors(vp9_default_zig_zag1d_4x4, 4,
-                      vp9_default_zig_zag1d_4x4_neighbors);
-  init_scan_neighbors(vp9_row_scan_4x4, 4,
-                      vp9_row_scan_4x4_neighbors);
-  init_scan_neighbors(vp9_col_scan_4x4, 4,
-                      vp9_col_scan_4x4_neighbors);
-  init_scan_neighbors(vp9_default_zig_zag1d_8x8, 8,
-                      vp9_default_zig_zag1d_8x8_neighbors);
-  init_scan_neighbors(vp9_default_zig_zag1d_16x16, 16,
-                      vp9_default_zig_zag1d_16x16_neighbors);
-  init_scan_neighbors(vp9_default_zig_zag1d_32x32, 32,
-                      vp9_default_zig_zag1d_32x32_neighbors);
-}
-
-const int *vp9_get_coef_neighbors_handle(const int *scan) {
-  if (scan == vp9_default_zig_zag1d_4x4) {
-    return vp9_default_zig_zag1d_4x4_neighbors;
-  } else if (scan == vp9_row_scan_4x4) {
-    return vp9_row_scan_4x4_neighbors;
-  } else if (scan == vp9_col_scan_4x4) {
-    return vp9_col_scan_4x4_neighbors;
-  } else if (scan == vp9_default_zig_zag1d_8x8) {
-    return vp9_default_zig_zag1d_8x8_neighbors;
-  } else if (scan == vp9_default_zig_zag1d_16x16) {
-    return vp9_default_zig_zag1d_16x16_neighbors;
-  } else if (scan == vp9_default_zig_zag1d_32x32) {
-    return vp9_default_zig_zag1d_32x32_neighbors;
+  } else {
+    av_energy = ((token_energy + *recent_energy + 1) >> 1) + 1;
+    if (av_energy > DCT_VAL_CATEGORY6)
+      av_energy = DCT_VAL_CATEGORY6;
   }
-  return vp9_default_zig_zag1d_4x4_neighbors;
-}
+  *recent_energy = token_energy;*/
 
-int vp9_get_coef_neighbor_context(const short int *qcoeff_ptr, int nodc,
-                                  const int *neigbor_handle, int rc) {
-  static int neighbors_used = MAX_NEIGHBORS;   // maximum is MAX_NEIGHBORS
-  const int *nb = neigbor_handle + rc * MAX_NEIGHBORS;
-  int i, v, val = 0, n = 0;
-  for (i = 0; i < neighbors_used; ++i) {
-    if (nb[i] == -1 || (nb[i] == 0 && nodc)) {
-      continue;
-    }
-    v = abs(qcoeff_ptr[nb[i]]);
-    val = (v > val ? v : val);
-    n++;
-  }
-  if (n == 0)
-    return 0;
-  else if (val <= 1)
-    return val;
-  else if (val < 4)
-    return 2;
-  else
-    return 3;
-}
-#endif  /* CONFIG_NEWCOEFCONTEXT */
+  return vp9_pt_energy_class[token];
+};
 
 void vp9_default_coef_probs(VP9_COMMON *pc) {
   vpx_memcpy(pc->fc.coef_probs_4x4, default_coef_probs_4x4,
              sizeof(pc->fc.coef_probs_4x4));
-  vpx_memcpy(pc->fc.hybrid_coef_probs_4x4, default_hybrid_coef_probs_4x4,
-             sizeof(pc->fc.hybrid_coef_probs_4x4));
-
   vpx_memcpy(pc->fc.coef_probs_8x8, default_coef_probs_8x8,
              sizeof(pc->fc.coef_probs_8x8));
-  vpx_memcpy(pc->fc.hybrid_coef_probs_8x8, default_hybrid_coef_probs_8x8,
-             sizeof(pc->fc.hybrid_coef_probs_8x8));
-
   vpx_memcpy(pc->fc.coef_probs_16x16, default_coef_probs_16x16,
              sizeof(pc->fc.coef_probs_16x16));
-  vpx_memcpy(pc->fc.hybrid_coef_probs_16x16,
-             default_hybrid_coef_probs_16x16,
-             sizeof(pc->fc.hybrid_coef_probs_16x16));
   vpx_memcpy(pc->fc.coef_probs_32x32, default_coef_probs_32x32,
              sizeof(pc->fc.coef_probs_32x32));
 }
@@ -1115,28 +281,30 @@ static void update_coef_probs(vp9_coeff_probs *dst_coef_probs,
                               vp9_coeff_probs *pre_coef_probs,
                               int block_types, vp9_coeff_count *coef_counts,
                               int count_sat, int update_factor) {
-  int t, i, j, k, count;
+  int t, i, j, k, l, count;
   unsigned int branch_ct[ENTROPY_NODES][2];
   vp9_prob coef_probs[ENTROPY_NODES];
   int factor;
 
   for (i = 0; i < block_types; ++i)
-    for (j = 0; j < COEF_BANDS; ++j)
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-          continue;
-        vp9_tree_probs_from_distribution(MAX_ENTROPY_TOKENS,
-                                         vp9_coef_encodings, vp9_coef_tree,
-                                         coef_probs, branch_ct,
-                                         coef_counts[i][j][k]);
-        for (t = 0; t < ENTROPY_NODES; ++t) {
-          count = branch_ct[t][0] + branch_ct[t][1];
-          count = count > count_sat ? count_sat : count;
-          factor = (update_factor * count / count_sat);
-          dst_coef_probs[i][j][k][t] = weighted_prob(pre_coef_probs[i][j][k][t],
-                                                     coef_probs[t], factor);
+    for (j = 0; j < REF_TYPES; ++j)
+      for (k = 0; k < COEF_BANDS; ++k)
+        for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
+          if (l >= 3 && k == 0)
+            continue;
+          vp9_tree_probs_from_distribution(MAX_ENTROPY_TOKENS,
+                                           vp9_coef_encodings, vp9_coef_tree,
+                                           coef_probs, branch_ct,
+                                           coef_counts[i][j][k][l]);
+          for (t = 0; t < ENTROPY_NODES; ++t) {
+            count = branch_ct[t][0] + branch_ct[t][1];
+            count = count > count_sat ? count_sat : count;
+            factor = (update_factor * count / count_sat);
+            dst_coef_probs[i][j][k][l][t] =
+                weighted_prob(pre_coef_probs[i][j][k][l][t],
+                              coef_probs[t], factor);
+          }
         }
-      }
 }
 
 void vp9_adapt_coef_probs(VP9_COMMON *cm) {
@@ -1158,85 +326,14 @@ void vp9_adapt_coef_probs(VP9_COMMON *cm) {
     count_sat = COEF_COUNT_SAT;
   }
 
-#ifdef COEF_COUNT_TESTING
-  {
-    printf("static const unsigned int\ncoef_counts"
-           "[BLOCK_TYPES] [COEF_BANDS]"
-           "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n");
-    for (i = 0; i < BLOCK_TYPES; ++i) {
-      printf("  {\n");
-      for (j = 0; j < COEF_BANDS; ++j) {
-        printf("    {\n");
-        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-          printf("      {");
-          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-            printf("%d, ", cm->fc.coef_counts[i][j][k][t]);
-          printf("},\n");
-        }
-        printf("    },\n");
-      }
-      printf("  },\n");
-    }
-    printf("};\n");
-    printf("static const unsigned int\ncoef_counts_8x8"
-           "[BLOCK_TYPES_8X8] [COEF_BANDS]"
-           "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n");
-    for (i = 0; i < BLOCK_TYPES_8X8; ++i) {
-      printf("  {\n");
-      for (j = 0; j < COEF_BANDS; ++j) {
-        printf("    {\n");
-        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-          printf("      {");
-          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-            printf("%d, ", cm->fc.coef_counts_8x8[i][j][k][t]);
-          printf("},\n");
-        }
-        printf("    },\n");
-      }
-      printf("  },\n");
-    }
-    printf("};\n");
-    printf("static const unsigned int\nhybrid_coef_counts"
-           "[BLOCK_TYPES] [COEF_BANDS]"
-           "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n");
-    for (i = 0; i < BLOCK_TYPES; ++i) {
-      printf("  {\n");
-      for (j = 0; j < COEF_BANDS; ++j) {
-        printf("    {\n");
-        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-          printf("      {");
-          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-            printf("%d, ", cm->fc.hybrid_coef_counts[i][j][k][t]);
-          printf("},\n");
-        }
-        printf("    },\n");
-      }
-      printf("  },\n");
-    }
-    printf("};\n");
-  }
-#endif
-
   update_coef_probs(cm->fc.coef_probs_4x4, cm->fc.pre_coef_probs_4x4,
-                    BLOCK_TYPES_4X4, cm->fc.coef_counts_4x4,
-                    count_sat, update_factor);
-  update_coef_probs(cm->fc.hybrid_coef_probs_4x4,
-                    cm->fc.pre_hybrid_coef_probs_4x4,
-                    BLOCK_TYPES_4X4, cm->fc.hybrid_coef_counts_4x4,
+                    BLOCK_TYPES, cm->fc.coef_counts_4x4,
                     count_sat, update_factor);
   update_coef_probs(cm->fc.coef_probs_8x8, cm->fc.pre_coef_probs_8x8,
-                    BLOCK_TYPES_8X8, cm->fc.coef_counts_8x8,
-                    count_sat, update_factor);
-  update_coef_probs(cm->fc.hybrid_coef_probs_8x8,
-                    cm->fc.pre_hybrid_coef_probs_8x8,
-                    BLOCK_TYPES_8X8, cm->fc.hybrid_coef_counts_8x8,
+                    BLOCK_TYPES, cm->fc.coef_counts_8x8,
                     count_sat, update_factor);
   update_coef_probs(cm->fc.coef_probs_16x16, cm->fc.pre_coef_probs_16x16,
-                    BLOCK_TYPES_16X16, cm->fc.coef_counts_16x16,
-                    count_sat, update_factor);
-  update_coef_probs(cm->fc.hybrid_coef_probs_16x16,
-                    cm->fc.pre_hybrid_coef_probs_16x16,
-                    BLOCK_TYPES_16X16, cm->fc.hybrid_coef_counts_16x16,
+                    BLOCK_TYPES, cm->fc.coef_counts_16x16,
                     count_sat, update_factor);
   update_coef_probs(cm->fc.coef_probs_32x32, cm->fc.pre_coef_probs_32x32,
                     BLOCK_TYPES_32X32, cm->fc.coef_counts_32x32,
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 84e5255c2..20559a79b 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -59,31 +59,21 @@ extern vp9_extra_bit_struct vp9_extra_bits[12];    /* indexed by token value */
 
 /* Coefficients are predicted via a 3-dimensional probability table. */
 
-/* Outside dimension.  0 = Y no DC, 1 = Y2, 2 = UV, 3 = Y with DC */
-#define BLOCK_TYPES_4X4 4
-
-#define BLOCK_TYPES_8X8 4
-
-#define BLOCK_TYPES_16X16 4
-
-#define BLOCK_TYPES_32X32 4
-
-/* Middle dimension is a coarsening of the coefficient's
-   position within the 4x4 DCT. */
-
-#define COEF_BANDS 8
-extern DECLARE_ALIGNED(16, const int, vp9_coef_bands_4x4[16]);
-extern DECLARE_ALIGNED(64, const int, vp9_coef_bands_8x8[64]);
-extern DECLARE_ALIGNED(16, const int, vp9_coef_bands_16x16[256]);
-extern DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]);
-
-/* Inside dimension is 3-valued measure of nearby complexity, that is,
-   the extent to which nearby coefficients are nonzero.  For the first
-   coefficient (DC, unless block type is 0), we look at the (already encoded)
-   blocks above and to the left of the current block.  The context index is
-   then the number (0,1,or 2) of these blocks having nonzero coefficients.
-   After decoding a coefficient, the measure is roughly the size of the
-   most recently decoded coefficient (0 for 0, 1 for 1, 2 for >1).
+/* Outside dimension.  0 = Y with DC, 1 = UV */
+#define BLOCK_TYPES 2
+#define BLOCK_TYPES_32X32 1
+#define REF_TYPES 2  // intra=0, inter=1
+
+/* Middle dimension reflects the coefficient position within the transform. */
+#define COEF_BANDS 6
+
+/* Inside dimension is a measure of nearby complexity that reflects the energy
+   of nearby nonzero coefficients.  For the first coefficient (DC, unless
+   block type is 0), we look at the (already encoded) blocks above and to the
+   left of the current block.  The context index is then the number (0,1,or 2)
+   of these blocks having nonzero coefficients.
+   After decoding a coefficient, the measure is determined by the size of the
+   most recently decoded coefficient.
    Note that the intuitive meaning of this measure changes as coefficients
    are decoded, e.g., prior to the first token, a zero means that my neighbors
    are empty while, after the first token, because of the use of end-of-block,
@@ -94,21 +84,18 @@ extern DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]);
    distinct bands). */
 
 /*# define DC_TOKEN_CONTEXTS        3*/ /* 00, 0!0, !0!0 */
-#define PREV_COEF_CONTEXTS          4
+#define PREV_COEF_CONTEXTS          6
 
-typedef unsigned int vp9_coeff_count[COEF_BANDS][PREV_COEF_CONTEXTS]
+typedef unsigned int vp9_coeff_count[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
                                     [MAX_ENTROPY_TOKENS];
-typedef unsigned int vp9_coeff_stats[COEF_BANDS][PREV_COEF_CONTEXTS]
+typedef unsigned int vp9_coeff_stats[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
                                     [ENTROPY_NODES][2];
-typedef vp9_prob vp9_coeff_probs[COEF_BANDS][PREV_COEF_CONTEXTS]
+typedef vp9_prob vp9_coeff_probs[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
                                 [ENTROPY_NODES];
 
 #define SUBEXP_PARAM                4   /* Subexponential code parameter */
 #define MODULUS_PARAM               13  /* Modulus parameter */
 
-extern DECLARE_ALIGNED(16, const uint8_t,
-                       vp9_prev_token_class[MAX_ENTROPY_TOKENS]);
-
 struct VP9Common;
 void vp9_default_coef_probs(struct VP9Common *);
 extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_4x4[16]);
@@ -129,26 +116,19 @@ static void vp9_reset_mb_tokens_context(MACROBLOCKD* const xd) {
   vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
 }
 
-#if CONFIG_NEWCOEFCONTEXT
-
-#define MAX_NEIGHBORS 5
-#define NEWCOEFCONTEXT_BAND_COND(b)   ((b) >= 1)
-void vp9_init_neighbors(void);
-
-const int *vp9_get_coef_neighbors_handle(const int *scan);
-int vp9_get_coef_neighbor_context(const short int *qcoeff_ptr, int nodc,
-                                  const int *neigbor_handle, int rc);
-extern DECLARE_ALIGNED(16, int, vp9_default_zig_zag1d_4x4_neighbors[
-                       16 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int, vp9_row_scan_4x4_neighbors[
-                       16 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int, vp9_col_scan_4x4_neighbors[
-                       16 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int, vp9_default_zig_zag1d_8x8_neighbors[
-                       64 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int, vp9_default_zig_zag1d_16x16_neighbors[
-                       256 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int, vp9_default_zig_zag1d_32x32_neighbors[
-                       1024 * MAX_NEIGHBORS]);
-#endif  // CONFIG_NEWCOEFCONTEXT
+extern const int vp9_coef_bands[32];
+extern const int vp9_coef_bands4x4[16];
+
+static int get_coef_band(TX_SIZE tx_size, int coef_index) {
+  if (tx_size == TX_4X4) {  // 4x4 blocks use their own 16-entry band table
+    return vp9_coef_bands4x4[coef_index];
+  } else {
+    if (coef_index < 32)  // other sizes share the 32-entry table
+      return vp9_coef_bands[coef_index];
+    else
+      return 5;  // every index from 32 up maps to band 5 (COEF_BANDS - 1)
+  }
+}
+extern int vp9_get_coef_context(int * recent_energy, int token);
+
 #endif  // VP9_COMMON_VP9_ENTROPY_H_
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index ecae5e057..23b2abef7 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -11,9 +11,10 @@
 
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_modecont.h"
+#include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_alloccommon.h"
 #include "vpx_mem/vpx_mem.h"
 
-
 static const unsigned int kf_y_mode_cts[8][VP9_YMODES] = {
   /* DC V   H  D45 135 117 153 D27 D63 TM i8x8 BPRED */
   {12,  6,  5,  5,  5,  5,  5,  5,  5,  2, 22, 200},
@@ -344,6 +345,9 @@ void vp9_init_mbmode_probs(VP9_COMMON *x) {
 #if CONFIG_COMP_INTERINTRA_PRED
   x->fc.interintra_prob = VP9_DEF_INTERINTRA_PROB;
 #endif
+  x->ref_pred_probs[0] = 120;
+  x->ref_pred_probs[1] = 80;
+  x->ref_pred_probs[2] = 40;
 }
 
 
@@ -419,6 +423,14 @@ const int vp9_switchable_interp_map[SWITCHABLE+1] = {-1, -1, 0, 1, -1, -1};
 #else
 const int vp9_switchable_interp_map[SWITCHABLE+1] = {-1, 0, 1, -1, -1};
 #endif
+#endif  // VP9_SWITCHABLE_FILTERS
+
+// Indicates if the filter is interpolating or non-interpolating
+// Note currently only the EIGHTTAP_SMOOTH is non-interpolating
+#if CONFIG_ENABLE_6TAP
+const int vp9_is_interpolating_filter[SWITCHABLE + 1] = {1, 0, 1, 1, 1, -1};
+#else
+const int vp9_is_interpolating_filter[SWITCHABLE + 1] = {0, 1, 1, 1, -1};
 #endif
 
 void vp9_entropy_mode_init() {
@@ -480,7 +492,7 @@ void vp9_accum_mv_refs(VP9_COMMON *pc,
 
 #define MVREF_COUNT_SAT 20
 #define MVREF_MAX_UPDATE_FACTOR 128
-void vp9_update_mode_context(VP9_COMMON *pc) {
+void vp9_adapt_mode_context(VP9_COMMON *pc) {
   int i, j;
   unsigned int (*mv_ref_ct)[4][2];
   int (*mode_context)[4];
@@ -631,3 +643,65 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
   }
 #endif
 }
+
+static void set_default_lf_deltas(MACROBLOCKD *xd) {  // loop-filter defaults
+  xd->mode_ref_lf_delta_enabled = 1;
+  xd->mode_ref_lf_delta_update = 1;
+
+  xd->ref_lf_deltas[INTRA_FRAME] = 2;      // deltas per reference frame
+  xd->ref_lf_deltas[LAST_FRAME] = 0;
+  xd->ref_lf_deltas[GOLDEN_FRAME] = -2;
+  xd->ref_lf_deltas[ALTREF_FRAME] = -2;
+
+  xd->mode_lf_deltas[0] = 4;               // deltas per mode: BPRED
+  xd->mode_lf_deltas[1] = -2;              // Zero
+  xd->mode_lf_deltas[2] = 2;               // New mv
+  xd->mode_lf_deltas[3] = 4;               // Split mv
+}
+
+void vp9_setup_past_independence(VP9_COMMON *cm, MACROBLOCKD *xd) {
+  // Reset the segment feature data to the default state:
+  // all features cleared, delta coding selected, last segment map zeroed.
+  int i;
+  vp9_clearall_segfeatures(xd);
+  xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
+  if (cm->last_frame_seg_map)
+    vpx_memset(cm->last_frame_seg_map, 0, (cm->mb_rows * cm->mb_cols));
+
+  /* Clear the remembered loop-filter deltas, then install the defaults. */
+  vpx_memset(xd->last_ref_lf_deltas, 0, sizeof(xd->last_ref_lf_deltas));
+  vpx_memset(xd->last_mode_lf_deltas, 0, sizeof(xd->last_mode_lf_deltas));
+  set_default_lf_deltas(xd);
+  // Re-initialise coefficient, mode, bmode and mv probabilities in cm.
+  vp9_default_coef_probs(cm);
+  vp9_init_mbmode_probs(cm);
+  vp9_default_bmode_probs(cm->fc.bmode_prob);
+  vp9_kf_default_bmode_probs(cm->kf_bmode_prob);
+  vp9_init_mv_probs(cm);
+  // -1 is used here to force an update of the sharpness level.
+  cm->last_sharpness_level = -1;
+
+  vp9_init_mode_contexts(cm);
+  // Copy the freshly reset cm->fc into every saved frame context slot.
+  for (i = 0; i < NUM_FRAME_CONTEXTS; i++) {
+    vpx_memcpy(&cm->frame_contexts[i], &cm->fc, sizeof(cm->fc));
+  }
+  // Zero both mode-info arrays: (mb_cols + 1) * (mb_rows + 1) entries each.
+  vpx_memset(cm->prev_mip, 0,
+             (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
+  vpx_memset(cm->mip, 0,
+             (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
+
+  vp9_update_mode_info_border(cm, cm->mip);
+  vp9_update_mode_info_in_image(cm, cm->mi);
+
+#if CONFIG_NEW_MVREF
+  // Default probabilities for encoding the MV ref id signal.
+  vpx_memset(xd->mb_mv_ref_probs, VP9_DEFAULT_MV_REF_PROB,
+             sizeof(xd->mb_mv_ref_probs));
+#endif
+  cm->ref_frame_sign_bias[GOLDEN_FRAME] = 0;
+  cm->ref_frame_sign_bias[ALTREF_FRAME] = 0;
+
+  cm->frame_context_idx = 0;  // restart from frame context slot 0
+}
diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h
index e03c6fe6d..345eb0253 100644
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -76,11 +76,14 @@ void vp9_entropy_mode_init(void);
 
 struct VP9Common;
 
+/* Resets shared coding state to defaults so as to forget past dependence. */
+void vp9_setup_past_independence(struct VP9Common *cm, MACROBLOCKD *xd);
+
 void vp9_init_mbmode_probs(struct VP9Common *x);
 
 extern void vp9_init_mode_contexts(struct VP9Common *pc);
 
-extern void vp9_update_mode_context(struct VP9Common *pc);
+extern void vp9_adapt_mode_context(struct VP9Common *pc);
 
 extern void vp9_accum_mv_refs(struct VP9Common *pc,
                               MB_PREDICTION_MODE m,
@@ -101,6 +104,8 @@ extern const  INTERPOLATIONFILTERTYPE vp9_switchable_interp
 
 extern const  int vp9_switchable_interp_map[SWITCHABLE + 1];
 
+extern const  int vp9_is_interpolating_filter[SWITCHABLE + 1];
+
 extern const  vp9_tree_index vp9_switchable_interp_tree
                   [2 * (VP9_SWITCHABLE_FILTERS - 1)];
 
diff --git a/vp9/common/vp9_filter.c b/vp9/common/vp9_filter.c
index 07d8a169f..434c63e7e 100644
--- a/vp9/common/vp9_filter.c
+++ b/vp9/common/vp9_filter.c
@@ -15,28 +15,29 @@
 #include "vp9_rtcd.h"
 #include "vp9/common/vp9_common.h"
 
-DECLARE_ALIGNED(16, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS][2]) = {
-  { 128,   0 },
-  { 120,   8 },
-  { 112,  16 },
-  { 104,  24 },
-  {  96,  32 },
-  {  88,  40 },
-  {  80,  48 },
-  {  72,  56 },
-  {  64,  64 },
-  {  56,  72 },
-  {  48,  80 },
-  {  40,  88 },
-  {  32,  96 },
-  {  24, 104 },
-  {  16, 112 },
-  {   8, 120 }
+DECLARE_ALIGNED(256, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS][8]) = {
+  { 0, 0, 0, 128,   0, 0, 0, 0 },  /* taps in columns 3-4, rest zero */
+  { 0, 0, 0, 120,   8, 0, 0, 0 },
+  { 0, 0, 0, 112,  16, 0, 0, 0 },
+  { 0, 0, 0, 104,  24, 0, 0, 0 },
+  { 0, 0, 0,  96,  32, 0, 0, 0 },
+  { 0, 0, 0,  88,  40, 0, 0, 0 },
+  { 0, 0, 0,  80,  48, 0, 0, 0 },
+  { 0, 0, 0,  72,  56, 0, 0, 0 },
+  { 0, 0, 0,  64,  64, 0, 0, 0 },
+  { 0, 0, 0,  56,  72, 0, 0, 0 },
+  { 0, 0, 0,  48,  80, 0, 0, 0 },
+  { 0, 0, 0,  40,  88, 0, 0, 0 },
+  { 0, 0, 0,  32,  96, 0, 0, 0 },
+  { 0, 0, 0,  24, 104, 0, 0, 0 },
+  { 0, 0, 0,  16, 112, 0, 0, 0 },
+  { 0, 0, 0,   8, 120, 0, 0, 0 }
 };
 
 #define FILTER_ALPHA       0
 #define FILTER_ALPHA_SHARP 1
-DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = {
+DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8])
+    = {
 #if FILTER_ALPHA == 0
   /* Lagrangian interpolation filter */
   { 0,   0,   0, 128,   0,   0,   0,  0},
@@ -55,6 +56,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = {
   { -1,   3,  -9,  27, 118, -13,   4, -1},
   { 0,   2,  -6,  18, 122, -10,   3, -1},
   { 0,   1,  -3,   8, 126,  -5,   1,  0}
+
 #elif FILTER_ALPHA == 50
   /* Generated using MATLAB:
    * alpha = 0.5;
@@ -82,7 +84,8 @@ DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = {
 #endif  /* FILTER_ALPHA */
 };
 
-DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]) = {
+DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8])
+    = {
 #if FILTER_ALPHA_SHARP == 1
   /* dct based filter */
   {0,   0,   0, 128,   0,   0,   0, 0},
@@ -101,6 +104,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]) = {
   {-2,   5, -10,  27, 121, -17,   7, -3},
   {-1,   3,  -6,  17, 125, -13,   5, -2},
   {0,   1,  -3,   8, 127,  -7,   3, -1}
+
 #elif FILTER_ALPHA_SHARP == 75
   /* alpha = 0.75 */
   {0,   0,   0, 128,   0,   0,   0, 0},
@@ -122,7 +126,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]) = {
 #endif  /* FILTER_ALPHA_SHARP */
 };
 
-DECLARE_ALIGNED(16, const int16_t,
+DECLARE_ALIGNED(256, const int16_t,
                 vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8]) = {
   /* 8-tap lowpass filter */
   /* Hamming window */
@@ -144,1072 +148,22 @@ DECLARE_ALIGNED(16, const int16_t,
   { 1, -2, -7, 37, 80, 28, -8, -1}
 };
 
-DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6]) = {
-  {0,   0, 128,   0,   0, 0},
-  {1,  -5, 125,   8,  -2, 1},
-  {1,  -8, 122,  17,  -5, 1},
-  {2, -11, 116,  27,  -8, 2},
-  {3, -14, 110,  37, -10, 2},
-  {3, -15, 103,  47, -12, 2},
-  {3, -16,  95,  57, -14, 3},
-  {3, -16,  86,  67, -15, 3},
-  {3, -16,  77,  77, -16, 3},
-  {3, -15,  67,  86, -16, 3},
-  {3, -14,  57,  95, -16, 3},
-  {2, -12,  47, 103, -15, 3},
-  {2, -10,  37, 110, -14, 3},
-  {2,  -8,  27, 116, -11, 2},
-  {1,  -5,  17, 122,  -8, 1},
-  {1,  -2,   8, 125,  -5, 1}
+DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_6[SUBPEL_SHIFTS][8])
+    = {  /* 6-tap kernels, zero-padded at both ends to 8 columns */
+  {0, 0,   0, 128,   0,   0, 0,  0},
+  {0, 1,  -5, 125,   8,  -2, 1,  0},
+  {0, 1,  -8, 122,  17,  -5, 1,  0},
+  {0, 2, -11, 116,  27,  -8, 2,  0},
+  {0, 3, -14, 110,  37, -10, 2,  0},
+  {0, 3, -15, 103,  47, -12, 2,  0},
+  {0, 3, -16,  95,  57, -14, 3,  0},
+  {0, 3, -16,  86,  67, -15, 3,  0},
+  {0, 3, -16,  77,  77, -16, 3,  0},
+  {0, 3, -15,  67,  86, -16, 3,  0},
+  {0, 3, -14,  57,  95, -16, 3,  0},
+  {0, 2, -12,  47, 103, -15, 3,  0},
+  {0, 2, -10,  37, 110, -14, 3,  0},
+  {0, 2,  -8,  27, 116, -11, 2,  0},
+  {0, 1,  -5,  17, 122,  -8, 1,  0},
+  {0, 1,  -2,   8, 125,  -5, 1,  0}
 };
-
-static void filter_block2d_first_pass_6(uint8_t *src_ptr,
-                                        int *output_ptr,
-                                        unsigned int src_pixels_per_line,
-                                        unsigned int pixel_step,
-                                        unsigned int output_height,
-                                        unsigned int output_width,
-                                        const int16_t *vp9_filter) {
-  unsigned int i, j;
-  int temp;
-
-  for (i = 0; i < output_height; i++) {
-    for (j = 0; j < output_width; j++) {
-      temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
-             ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +
-             ((int)src_ptr[0]                    * vp9_filter[2]) +
-             ((int)src_ptr[pixel_step]           * vp9_filter[3]) +
-             ((int)src_ptr[2 * pixel_step]       * vp9_filter[4]) +
-             ((int)src_ptr[3 * pixel_step]       * vp9_filter[5]) +
-             (VP9_FILTER_WEIGHT >> 1);      /* Rounding */
-
-      /* Normalize back to 0-255 */
-      output_ptr[j] = clip_pixel(temp >> VP9_FILTER_SHIFT);
-      src_ptr++;
-    }
-
-    /* Next row... */
-    src_ptr    += src_pixels_per_line - output_width;
-    output_ptr += output_width;
-  }
-}
-
-static void filter_block2d_second_pass_6(int *src_ptr,
-                                         uint8_t *output_ptr,
-                                         int output_pitch,
-                                         unsigned int src_pixels_per_line,
-                                         unsigned int pixel_step,
-                                         unsigned int output_height,
-                                         unsigned int output_width,
-                                         const int16_t *vp9_filter) {
-  unsigned int i, j;
-  int temp;
-
-  for (i = 0; i < output_height; i++) {
-    for (j = 0; j < output_width; j++) {
-      /* Apply filter */
-      temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
-             ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +
-             ((int)src_ptr[0]                    * vp9_filter[2]) +
-             ((int)src_ptr[pixel_step]           * vp9_filter[3]) +
-             ((int)src_ptr[2 * pixel_step]         * vp9_filter[4]) +
-             ((int)src_ptr[3 * pixel_step]         * vp9_filter[5]) +
-             (VP9_FILTER_WEIGHT >> 1);   /* Rounding */
-
-      /* Normalize back to 0-255 */
-      output_ptr[j] = clip_pixel(temp >> VP9_FILTER_SHIFT);
-      src_ptr++;
-    }
-
-    /* Start next row */
-    src_ptr    += src_pixels_per_line - output_width;
-    output_ptr += output_pitch;
-  }
-}
-
-/*
- * The only functional difference between filter_block2d_second_pass()
- * and this function is that filter_block2d_second_pass() does a sixtap
- * filter on the input and stores it in the output. This function
- * (filter_block2d_second_pass_avg()) does a sixtap filter on the input,
- * and then averages that with the content already present in the output
- * ((filter_result + dest + 1) >> 1) and stores that in the output.
- */
-static void filter_block2d_second_pass_avg_6(int *src_ptr,
-                                             uint8_t *output_ptr,
-                                             int output_pitch,
-                                             unsigned int src_pixels_per_line,
-                                             unsigned int pixel_step,
-                                             unsigned int output_height,
-                                             unsigned int output_width,
-                                             const int16_t *vp9_filter) {
-  unsigned int i, j;
-  int temp;
-
-  for (i = 0; i < output_height; i++) {
-    for (j = 0; j < output_width; j++) {
-      /* Apply filter */
-      temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
-             ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +
-             ((int)src_ptr[0]                    * vp9_filter[2]) +
-             ((int)src_ptr[pixel_step]           * vp9_filter[3]) +
-             ((int)src_ptr[2 * pixel_step]         * vp9_filter[4]) +
-             ((int)src_ptr[3 * pixel_step]         * vp9_filter[5]) +
-             (VP9_FILTER_WEIGHT >> 1);   /* Rounding */
-
-      /* Normalize back to 0-255 */
-      output_ptr[j] = (clip_pixel(temp >> VP9_FILTER_SHIFT) +
-                       output_ptr[j] + 1) >> 1;
-      src_ptr++;
-    }
-
-    /* Start next row */
-    src_ptr    += src_pixels_per_line - output_width;
-    output_ptr += output_pitch;
-  }
-}
-
-#define Interp_Extend 3
-static void filter_block2d_6(uint8_t *src_ptr,
-                             uint8_t *output_ptr,
-                             unsigned int src_pixels_per_line,
-                             int output_pitch,
-                             const int16_t *HFilter,
-                             const int16_t *VFilter) {
-  int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(
-      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
-      src_pixels_per_line, 1, 3 + Interp_Extend * 2, 4, HFilter);
-
-  /* then filter vertically... */
-  filter_block2d_second_pass_6(FData + 4 * (Interp_Extend - 1), output_ptr,
-                               output_pitch, 4, 4, 4, 4, VFilter);
-}
-
-
-void vp9_sixtap_predict4x4_c(uint8_t *src_ptr,
-                             int src_pixels_per_line,
-                             int xoffset,
-                             int yoffset,
-                             uint8_t *dst_ptr,
-                             int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-
-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
-
-  filter_block2d_6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter,
-                   VFilter);
-}
-
-/*
- * The difference between filter_block2d_6() and filter_block2d_avg_6 is
- * that filter_block2d_6() does a 6-tap filter and stores it in the output
- * buffer, whereas filter_block2d_avg_6() does the same 6-tap filter, and
- * then averages that with the content already present in the output
- * ((filter_result + dest + 1) >> 1) and stores that in the output.
- */
-static void filter_block2d_avg_6(uint8_t *src_ptr,
-                                 uint8_t *output_ptr,
-                                 unsigned int src_pixels_per_line,
-                                 int output_pitch,
-                                 const int16_t *HFilter,
-                                 const int16_t *VFilter) {
-  int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(
-      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
-      src_pixels_per_line, 1, 3 + Interp_Extend * 2, 4, HFilter);
-
-  /* then filter vertically... */
-  filter_block2d_second_pass_avg_6(FData + 4 * (Interp_Extend - 1), output_ptr,
-                                   output_pitch, 4, 4, 4, 4, VFilter);
-}
-
-void vp9_sixtap_predict_avg4x4_c(uint8_t *src_ptr,
-                                 int src_pixels_per_line,
-                                 int xoffset,
-                                 int yoffset,
-                                 uint8_t *dst_ptr,
-                                 int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-
-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
-
-  filter_block2d_avg_6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch,
-                       HFilter, VFilter);
-}
-
-void vp9_sixtap_predict8x8_c(uint8_t *src_ptr,
-                             int src_pixels_per_line,
-                             int xoffset,
-                             int yoffset,
-                             uint8_t *dst_ptr,
-                             int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-  int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer */
-
-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(
-      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
-      src_pixels_per_line, 1, 7 + Interp_Extend * 2, 8, HFilter);
-
-  /* then filter vertically... */
-  filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr,
-                               dst_pitch, 8, 8, 8, 8, VFilter);
-
-}
-
-void vp9_sixtap_predict_avg8x8_c(uint8_t *src_ptr,
-                                 int src_pixels_per_line,
-                                 int xoffset,
-                                 int yoffset,
-                                 uint8_t *dst_ptr,
-                                 int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-  int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer */
-
-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(
-      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
-      src_pixels_per_line, 1, 7 + Interp_Extend * 2, 8, HFilter);
-
-  /* then filter vertically... */
-  filter_block2d_second_pass_avg_6(FData + 8 * (Interp_Extend - 1), dst_ptr,
-                                   dst_pitch, 8, 8, 8, 8, VFilter);
-}
-
-void vp9_sixtap_predict8x4_c(uint8_t *src_ptr,
-                             int src_pixels_per_line,
-                             int xoffset,
-                             int yoffset,
-                             uint8_t *dst_ptr,
-                             int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-  int FData[(3 + Interp_Extend * 2) * 8]; /* Temp data buffer */
-
-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(
-      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
-      src_pixels_per_line, 1, 3 + Interp_Extend * 2, 8, HFilter);
-
-  /* then filter vertically... */
-  filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr,
-                               dst_pitch, 8, 8, 4, 8, VFilter);
-}
-
-void vp9_sixtap_predict16x16_c(uint8_t *src_ptr,
-                               int src_pixels_per_line,
-                               int xoffset,
-                               int yoffset,
-                               uint8_t *dst_ptr,
-                               int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-  int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer */
-
-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(
-      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
-      src_pixels_per_line, 1, 15 + Interp_Extend * 2, 16, HFilter);
-
-  /* then filter vertically... */
-  filter_block2d_second_pass_6(FData + 16 * (Interp_Extend - 1), dst_ptr,
-                               dst_pitch, 16, 16, 16, 16, VFilter);
-}
-
-void vp9_sixtap_predict_avg16x16_c(uint8_t *src_ptr,
-                                   int src_pixels_per_line,
-                                   int xoffset,
-                                   int yoffset,
-                                   uint8_t *dst_ptr,
-                                   int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-  int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer */
-
-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(
-      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
-      src_pixels_per_line, 1, 15 + Interp_Extend * 2, 16, HFilter);
-
-  /* then filter vertically... */
-  filter_block2d_second_pass_avg_6(FData + 16 * (Interp_Extend - 1), dst_ptr,
-                                   dst_pitch, 16, 16, 16, 16, VFilter);
-}
-
-typedef enum {
-  VPX_FILTER_4x4 = 0,
-  VPX_FILTER_8x8 = 1,
-  VPX_FILTER_8x4 = 2,
-  VPX_FILTER_16x16 = 3,
-} filter_size_t;
-
-static const unsigned int filter_size_to_wh[][2] = {
-  {4, 4},
-  {8, 8},
-  {8, 4},
-  {16,16},
-};
-
-static void filter_block2d_8_c(const uint8_t *src_ptr,
-                               const unsigned int src_stride,
-                               const int16_t *HFilter,
-                               const int16_t *VFilter,
-                               const filter_size_t filter_size,
-                               uint8_t *dst_ptr,
-                               unsigned int dst_stride) {
-  const unsigned int output_width = filter_size_to_wh[filter_size][0];
-  const unsigned int output_height = filter_size_to_wh[filter_size][1];
-
-  // Between passes, we use an intermediate buffer whose height is extended to
-  // have enough horizontally filtered values as input for the vertical pass.
-  // This buffer is allocated to be big enough for the largest block type we
-  // support.
-  const int kInterp_Extend = 4;
-  const unsigned int intermediate_height =
-    (kInterp_Extend - 1) +     output_height + kInterp_Extend;
-
-  /* Size of intermediate_buffer is max_intermediate_height * filter_max_width,
-   * where max_intermediate_height = (kInterp_Extend - 1) + filter_max_height
-   *                                 + kInterp_Extend
-   *                               = 3 + 16 + 4
-   *                               = 23
-   * and filter_max_width = 16
-   */
-  uint8_t intermediate_buffer[23 * 16];
-  const int intermediate_next_stride = 1 - intermediate_height * output_width;
-
-  // Horizontal pass (src -> transposed intermediate).
-  {
-    uint8_t *output_ptr = intermediate_buffer;
-    const int src_next_row_stride = src_stride - output_width;
-    unsigned int i, j;
-    src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
-    for (i = 0; i < intermediate_height; i++) {
-      for (j = 0; j < output_width; j++) {
-        // Apply filter...
-        int temp = ((int)src_ptr[0] * HFilter[0]) +
-                   ((int)src_ptr[1] * HFilter[1]) +
-                   ((int)src_ptr[2] * HFilter[2]) +
-                   ((int)src_ptr[3] * HFilter[3]) +
-                   ((int)src_ptr[4] * HFilter[4]) +
-                   ((int)src_ptr[5] * HFilter[5]) +
-                   ((int)src_ptr[6] * HFilter[6]) +
-                   ((int)src_ptr[7] * HFilter[7]) +
-                   (VP9_FILTER_WEIGHT >> 1); // Rounding
-
-        // Normalize back to 0-255...
-        *output_ptr = clip_pixel(temp >> VP9_FILTER_SHIFT);
-        src_ptr++;
-        output_ptr += intermediate_height;
-      }
-      src_ptr += src_next_row_stride;
-      output_ptr += intermediate_next_stride;
-    }
-  }
-
-  // Vertical pass (transposed intermediate -> dst).
-  {
-    uint8_t *src_ptr = intermediate_buffer;
-    const int dst_next_row_stride = dst_stride - output_width;
-    unsigned int i, j;
-    for (i = 0; i < output_height; i++) {
-      for (j = 0; j < output_width; j++) {
-        // Apply filter...
-        int temp = ((int)src_ptr[0] * VFilter[0]) +
-                   ((int)src_ptr[1] * VFilter[1]) +
-                   ((int)src_ptr[2] * VFilter[2]) +
-                   ((int)src_ptr[3] * VFilter[3]) +
-                   ((int)src_ptr[4] * VFilter[4]) +
-                   ((int)src_ptr[5] * VFilter[5]) +
-                   ((int)src_ptr[6] * VFilter[6]) +
-                   ((int)src_ptr[7] * VFilter[7]) +
-                   (VP9_FILTER_WEIGHT >> 1); // Rounding
-
-        // Normalize back to 0-255...
-        *dst_ptr++ = clip_pixel(temp >> VP9_FILTER_SHIFT);
-        src_ptr += intermediate_height;
-      }
-      src_ptr += intermediate_next_stride;
-      dst_ptr += dst_next_row_stride;
-    }
-  }
-}
-
-void vp9_filter_block2d_4x4_8_c(const uint8_t *src_ptr,
-                                const unsigned int src_stride,
-                                const int16_t *HFilter_aligned16,
-                                const int16_t *VFilter_aligned16,
-                                uint8_t *dst_ptr,
-                                unsigned int dst_stride) {
-  filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16,
-                     VPX_FILTER_4x4, dst_ptr, dst_stride);
-}
-
-void vp9_filter_block2d_8x4_8_c(const uint8_t *src_ptr,
-                                const unsigned int src_stride,
-                                const int16_t *HFilter_aligned16,
-                                const int16_t *VFilter_aligned16,
-                                uint8_t *dst_ptr,
-                                unsigned int dst_stride) {
-  filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16,
-                     VPX_FILTER_8x4, dst_ptr, dst_stride);
-}
-
-void vp9_filter_block2d_8x8_8_c(const uint8_t *src_ptr,
-                                const unsigned int src_stride,
-                                const int16_t *HFilter_aligned16,
-                                const int16_t *VFilter_aligned16,
-                                uint8_t *dst_ptr,
-                                unsigned int dst_stride) {
-  filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16,
-                     VPX_FILTER_8x8, dst_ptr, dst_stride);
-}
-
-void vp9_filter_block2d_16x16_8_c(const uint8_t *src_ptr,
-                                  const unsigned int src_stride,
-                                  const int16_t *HFilter_aligned16,
-                                  const int16_t *VFilter_aligned16,
-                                  uint8_t *dst_ptr,
-                                  unsigned int dst_stride) {
-  filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16,
-                     VPX_FILTER_16x16, dst_ptr, dst_stride);
-}
-
-static void block2d_average_c(uint8_t *src,
-                              unsigned int src_stride,
-                              uint8_t *output_ptr,
-                              unsigned int output_stride,
-                              const filter_size_t filter_size) {
-  const unsigned int output_width = filter_size_to_wh[filter_size][0];
-  const unsigned int output_height = filter_size_to_wh[filter_size][1];
-
-  unsigned int i, j;
-  for (i = 0; i < output_height; i++) {
-    for (j = 0; j < output_width; j++) {
-      output_ptr[j] = (output_ptr[j] + src[i * src_stride + j] + 1) >> 1;
-    }
-    output_ptr += output_stride;
-  }
-}
-
-#define block2d_average block2d_average_c
-
-void vp9_eighttap_predict4x4_c(uint8_t *src_ptr,
-                               int src_pixels_per_line,
-                               int xoffset,
-                               int yoffset,
-                               uint8_t *dst_ptr,
-                               int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-
-  HFilter = vp9_sub_pel_filters_8[xoffset];
-  VFilter = vp9_sub_pel_filters_8[yoffset];
-
-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
-                           dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict_avg4x4_c(uint8_t *src_ptr,
-                                   int src_pixels_per_line,
-                                   int xoffset,
-                                   int yoffset,
-                                   uint8_t *dst_ptr,
-                                   int dst_pitch) {
-  const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];
-  uint8_t tmp[4 * 4];
-
-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,
-                           4);
-  block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4);
-}
-
-void vp9_eighttap_predict4x4_sharp_c(uint8_t *src_ptr,
-                                     int src_pixels_per_line,
-                                     int xoffset,
-                                     int yoffset,
-                                     uint8_t *dst_ptr,
-                                     int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-
-  HFilter = vp9_sub_pel_filters_8s[xoffset];
-  VFilter = vp9_sub_pel_filters_8s[yoffset];
-
-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
-                           dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict4x4_smooth_c(uint8_t *src_ptr,
-                                      int src_pixels_per_line,
-                                      int xoffset,
-                                      int yoffset,
-                                      uint8_t *dst_ptr,
-                                      int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-
-  HFilter = vp9_sub_pel_filters_8lp[xoffset];
-  VFilter = vp9_sub_pel_filters_8lp[yoffset];
-
-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
-                           dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict_avg4x4_sharp_c(uint8_t *src_ptr,
-                                         int src_pixels_per_line,
-                                         int xoffset,
-                                         int yoffset,
-                                         uint8_t *dst_ptr,
-                                         int dst_pitch) {
-  const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];
-  uint8_t tmp[4 * 4];
-
-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,
-                           4);
-  block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4);
-}
-
-void vp9_eighttap_predict_avg4x4_smooth_c(uint8_t *src_ptr,
-                                          int src_pixels_per_line,
-                                          int xoffset,
-                                          int yoffset,
-                                          uint8_t *dst_ptr,
-                                          int dst_pitch) {
-  const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];
-  uint8_t tmp[4 * 4];
-
-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,
-                           4);
-  block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4);
-}
-
-
-void vp9_eighttap_predict8x8_c(uint8_t *src_ptr,
-                               int src_pixels_per_line,
-                               int xoffset,
-                               int yoffset,
-                               uint8_t *dst_ptr,
-                               int dst_pitch) {
-  const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];
-
-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
-                           dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict8x8_sharp_c(uint8_t *src_ptr,
-                                     int src_pixels_per_line,
-                                     int xoffset,
-                                     int yoffset,
-                                     uint8_t *dst_ptr,
-                                     int dst_pitch) {
-  const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
-                           dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict8x8_smooth_c(uint8_t *src_ptr,
-                                      int src_pixels_per_line,
-                                      int xoffset,
-                                      int yoffset,
-                                      uint8_t *dst_ptr,
-                                      int dst_pitch) {
-  const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];
-
-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
-                           dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict_avg8x8_c(uint8_t *src_ptr,
-                                   int src_pixels_per_line,
-                                   int xoffset,
-                                   int yoffset,
-                                   uint8_t *dst_ptr,
-                                   int dst_pitch) {
-  uint8_t tmp[8 * 8];
-  const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];
-
-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,
-                           8);
-  block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8);
-}
-
-void vp9_eighttap_predict_avg8x8_sharp_c(uint8_t *src_ptr,
-                                         int src_pixels_per_line,
-                                         int xoffset,
-                                         int yoffset,
-                                         uint8_t *dst_ptr,
-                                         int dst_pitch) {
-  uint8_t tmp[8 * 8];
-  const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,
-                           8);
-  block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8);
-}
-
-void vp9_eighttap_predict_avg8x8_smooth_c(uint8_t *src_ptr,
-                                          int src_pixels_per_line,
-                                          int xoffset,
-                                          int yoffset,
-                                          uint8_t *dst_ptr,
-                                          int dst_pitch) {
-  uint8_t tmp[8 * 8];
-  const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];
-
-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,
-                           8);
-  block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8);
-}
-
-void vp9_eighttap_predict8x4_c(uint8_t *src_ptr,
-                               int src_pixels_per_line,
-                               int xoffset,
-                               int yoffset,
-                               uint8_t *dst_ptr,
-                               int dst_pitch) {
-  const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];
-
-  vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
-                           dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict8x4_sharp_c(uint8_t *src_ptr,
-                                     int src_pixels_per_line,
-                                     int xoffset,
-                                     int yoffset,
-                                     uint8_t *dst_ptr,
-                                     int dst_pitch) {
-  const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
-  vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
-                           dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict8x4_smooth_c(uint8_t *src_ptr,
-                                      int src_pixels_per_line,
-                                      int xoffset,
-                                      int yoffset,
-                                      uint8_t *dst_ptr,
-                                      int dst_pitch) {
-  const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];
-
-  vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
-                           dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict16x16_c(uint8_t *src_ptr,
-                                 int src_pixels_per_line,
-                                 int xoffset,
-                                 int yoffset,
-                                 uint8_t *dst_ptr,
-                                 int dst_pitch) {
-  const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];
-
-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
-                             dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict16x16_sharp_c(uint8_t *src_ptr,
-                                       int src_pixels_per_line,
-                                       int xoffset,
-                                       int yoffset,
-                                       uint8_t *dst_ptr,
-                                       int dst_pitch) {
-  const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
-                             dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict16x16_smooth_c(uint8_t *src_ptr,
-                                        int src_pixels_per_line,
-                                        int xoffset,
-                                        int yoffset,
-                                        uint8_t *dst_ptr,
-                                        int dst_pitch) {
-  const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];
-
-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
-                             dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict_avg16x16_c(uint8_t *src_ptr,
-                                     int src_pixels_per_line,
-                                     int xoffset,
-                                     int yoffset,
-                                     uint8_t *dst_ptr,
-                                     int dst_pitch) {
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp, 16 * 16);
-  const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];
-
-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
-                             tmp, 16);
-  block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16);
-}
-
-void vp9_eighttap_predict_avg16x16_sharp_c(uint8_t *src_ptr,
-                                           int src_pixels_per_line,
-                                           int xoffset,
-                                           int yoffset,
-                                           uint8_t *dst_ptr,
-                                           int dst_pitch) {
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp, 16 * 16);
-  const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
-                             tmp, 16);
-  block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16);
-}
-
-void vp9_eighttap_predict_avg16x16_smooth_c(uint8_t *src_ptr,
-                                            int src_pixels_per_line,
-                                            int xoffset,
-                                            int yoffset,
-                                            uint8_t *dst_ptr,
-                                            int dst_pitch) {
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp, 16 * 16);
-  const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];
-
-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
-                             tmp, 16);
-  block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16);
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : filter_block2d_bil_first_pass
- *
- *  INPUTS        : uint8_t  *src_ptr    : Pointer to source block.
- *                  uint32_t  src_stride : Stride of source block.
- *                  uint32_t  height     : Block height.
- *                  uint32_t  width      : Block width.
- *                  int32_t  *vp9_filter : Array of 2 bi-linear filter taps.
- *
- *  OUTPUTS       : int32_t  *dst_ptr    : Pointer to filtered block.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block
- *                  in the horizontal direction to produce the filtered output
- *                  block. Used to implement first-pass of 2-D separable filter.
- *
- *  SPECIAL NOTES : Produces int32_t output to retain precision for next pass.
- *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
- *
- ****************************************************************************/
-static void filter_block2d_bil_first_pass(uint8_t *src_ptr,
-                                          uint16_t *dst_ptr,
-                                          unsigned int src_stride,
-                                          unsigned int height,
-                                          unsigned int width,
-                                          const int16_t *vp9_filter) {
-  unsigned int i, j;
-
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      /* Apply bilinear filter */
-      dst_ptr[j] = (((int)src_ptr[0] * vp9_filter[0]) +
-                    ((int)src_ptr[1] * vp9_filter[1]) +
-                    (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT;
-      src_ptr++;
-    }
-
-    /* Next row... */
-    src_ptr += src_stride - width;
-    dst_ptr += width;
-  }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : filter_block2d_bil_second_pass
- *
- *  INPUTS        : int32_t  *src_ptr    : Pointer to source block.
- *                  uint32_t  dst_pitch  : Destination block pitch.
- *                  uint32_t  height     : Block height.
- *                  uint32_t  width      : Block width.
- *                  int32_t  *vp9_filter : Array of 2 bi-linear filter taps.
- *
- *  OUTPUTS       : uint16_t *dst_ptr    : Pointer to filtered block.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block
- *                  in the vertical direction to produce the filtered output
- *                  block. Used to implement second-pass of 2-D separable filter.
- *
- *  SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass.
- *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
- *
- ****************************************************************************/
-static void filter_block2d_bil_second_pass(uint16_t *src_ptr,
-                                           uint8_t *dst_ptr,
-                                           int dst_pitch,
-                                           unsigned int height,
-                                           unsigned int width,
-                                           const int16_t *vp9_filter) {
-  unsigned int i, j;
-  int temp;
-
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      /* Apply filter */
-      temp = ((int)src_ptr[0]     * vp9_filter[0]) +
-             ((int)src_ptr[width] * vp9_filter[1]) +
-             (VP9_FILTER_WEIGHT / 2);
-      dst_ptr[j] = (unsigned int)(temp >> VP9_FILTER_SHIFT);
-      src_ptr++;
-    }
-
-    /* Next row... */
-    dst_ptr += dst_pitch;
-  }
-}
-
-/*
- * As before for filter_block2d_second_pass_avg(), the functional difference
- * between filter_block2d_bil_second_pass() and filter_block2d_bil_second_pass_avg()
- * is that filter_block2d_bil_second_pass() does a bilinear filter on input
- * and stores the result in output; filter_block2d_bil_second_pass_avg(),
- * instead, does a bilinear filter on input, averages the resulting value
- * with the values already present in the output and stores the result of
- * that back into the output ((filter_result + dest + 1) >> 1).
- */
-static void filter_block2d_bil_second_pass_avg(uint16_t *src_ptr,
-                                               uint8_t *dst_ptr,
-                                               int dst_pitch,
-                                               unsigned int height,
-                                               unsigned int width,
-                                               const int16_t *vp9_filter) {
-  unsigned int i, j;
-  int temp;
-
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      /* Apply filter */
-      temp = (((int)src_ptr[0]     * vp9_filter[0]) +
-              ((int)src_ptr[width] * vp9_filter[1]) +
-              (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT;
-      dst_ptr[j] = (unsigned int)((temp + dst_ptr[j] + 1) >> 1);
-      src_ptr++;
-    }
-
-    /* Next row... */
-    dst_ptr += dst_pitch;
-  }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : filter_block2d_bil
- *
- *  INPUTS        : uint8_t  *src_ptr          : Pointer to source block.
- *                  uint32_t  src_pitch        : Stride of source block.
- *                  uint32_t  dst_pitch        : Stride of destination block.
- *                  int32_t  *HFilter          : Array of 2 horizontal filter taps.
- *                  int32_t  *VFilter          : Array of 2 vertical filter taps.
- *                  int32_t  Width             : Block width
- *                  int32_t  Height            : Block height
- *
- *  OUTPUTS       : uint16_t *dst_ptr       : Pointer to filtered block.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : 2-D filters an input block by applying a 2-tap
- *                  bi-linear filter horizontally followed by a 2-tap
- *                  bi-linear filter vertically on the result.
- *
- *  SPECIAL NOTES : The largest block size can be handled here is 16x16
- *
- ****************************************************************************/
-static void filter_block2d_bil(uint8_t *src_ptr,
-                               uint8_t *dst_ptr,
-                               unsigned int src_pitch,
-                               unsigned int dst_pitch,
-                               const int16_t *HFilter,
-                               const int16_t *VFilter,
-                               int Width,
-                               int Height) {
-
-  uint16_t FData[17 * 16];  /* Temp data buffer used in filtering */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
-
-  /* then 1-D vertically... */
-  filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
-}
-
-static void filter_block2d_bil_avg(uint8_t *src_ptr,
-                                   uint8_t *dst_ptr,
-                                   unsigned int src_pitch,
-                                   unsigned int dst_pitch,
-                                   const int16_t *HFilter,
-                                   const int16_t *VFilter,
-                                   int Width,
-                                   int Height) {
-  uint16_t FData[17 * 16];  /* Temp data buffer used in filtering */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
-
-  /* then 1-D vertically... */
-  filter_block2d_bil_second_pass_avg(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
-}
-
-void vp9_bilinear_predict4x4_c(uint8_t *src_ptr,
-                               int src_pixels_per_line,
-                               int xoffset,
-                               int yoffset,
-                               uint8_t *dst_ptr,
-                               int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
-}
-
-void vp9_bilinear_predict_avg4x4_c(uint8_t *src_ptr,
-                                   int src_pixels_per_line,
-                                   int xoffset,
-                                   int yoffset,
-                                   uint8_t *dst_ptr,
-                                   int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,
-                         dst_pitch, HFilter, VFilter, 4, 4);
-}
-
-void vp9_bilinear_predict8x8_c(uint8_t *src_ptr,
-                               int src_pixels_per_line,
-                               int xoffset,
-                               int yoffset,
-                               uint8_t *dst_ptr,
-                               int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
-
-}
-
-void vp9_bilinear_predict_avg8x8_c(uint8_t *src_ptr,
-                                   int src_pixels_per_line,
-                                   int xoffset,
-                                   int yoffset,
-                                   uint8_t *dst_ptr,
-                                   int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,
-                         dst_pitch, HFilter, VFilter, 8, 8);
-}
-
-void vp9_bilinear_predict8x4_c(uint8_t *src_ptr,
-                               int src_pixels_per_line,
-                               int xoffset,
-                               int yoffset,
-                               uint8_t *dst_ptr,
-                               int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
-
-}
-
-void vp9_bilinear_predict16x16_c(uint8_t *src_ptr,
-                                 int src_pixels_per_line,
-                                 int xoffset,
-                                 int yoffset,
-                                 uint8_t *dst_ptr,
-                                 int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
-}
-
-void vp9_bilinear_predict_avg16x16_c(uint8_t *src_ptr,
-                                     int src_pixels_per_line,
-                                     int xoffset,
-                                     int yoffset,
-                                     uint8_t *dst_ptr,
-                                     int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,
-                         dst_pitch, HFilter, VFilter, 16, 16);
-}
diff --git a/vp9/common/vp9_filter.h b/vp9/common/vp9_filter.h
index cd666578d..1ccfdaac2 100644
--- a/vp9/common/vp9_filter.h
+++ b/vp9/common/vp9_filter.h
@@ -21,10 +21,17 @@
 
 #define SUBPEL_SHIFTS 16
 
-extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][2];
-extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6];
+extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][8];
+extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][8];
 extern const int16_t vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8];
 extern const int16_t vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8];
 extern const int16_t vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8];
 
+// VP9_BILINEAR_FILTERS_2TAP(x) yields a pointer to the two middle taps
+// (starting at BF_OFFSET) of row x of vp9_bilinear_filters, as a 2 tap filter.
+#define BF_LENGTH (sizeof(vp9_bilinear_filters[0]) / \
+                   sizeof(vp9_bilinear_filters[0][0]))
+#define BF_OFFSET (BF_LENGTH / 2 - 1)
+#define VP9_BILINEAR_FILTERS_2TAP(x) (vp9_bilinear_filters[x] + BF_OFFSET)
+
 #endif  // VP9_COMMON_VP9_FILTER_H_
diff --git a/vp9/common/vp9_findnearmv.c b/vp9/common/vp9_findnearmv.c
index 1d11f4244..2f709bf58 100644
--- a/vp9/common/vp9_findnearmv.c
+++ b/vp9/common/vp9_findnearmv.c
@@ -87,8 +87,8 @@ unsigned int vp9_sub_pixel_variance16x2_c(const uint8_t *src_ptr,
   uint8_t temp2[2 * 16];
   const int16_t *HFilter, *VFilter;
 
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
   var_filter_block2d_bil_first_pass(src_ptr, FData3,
                                     src_pixels_per_line, 1, 3, 16, HFilter);
@@ -108,8 +108,8 @@ unsigned int vp9_sub_pixel_variance2x16_c(const uint8_t *src_ptr,
   uint8_t temp2[2 * 16];
   const int16_t *HFilter, *VFilter;
 
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
   var_filter_block2d_bil_first_pass(src_ptr, FData3,
                                     src_pixels_per_line, 1, 17, 2, HFilter);
@@ -141,130 +141,140 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
   int_mv sorted_mvs[MAX_MV_REF_CANDIDATES];
   int zero_seen = FALSE;
 
-  // Default all to 0,0 if nothing else available
-  nearest->as_int = near->as_int = 0;
-  vpx_memset(sorted_mvs, 0, sizeof(sorted_mvs));
+  if (ref_y_buffer) {
 
-  above_src = xd->dst.y_buffer - xd->dst.y_stride * 2;
-  above_ref = ref_y_buffer - ref_y_stride * 2;
+    // Default all to 0,0 if nothing else available
+    nearest->as_int = near->as_int = 0;
+    vpx_memset(sorted_mvs, 0, sizeof(sorted_mvs));
+
+    above_src = xd->dst.y_buffer - xd->dst.y_stride * 2;
+    above_ref = ref_y_buffer - ref_y_stride * 2;
 #if CONFIG_ABOVESPREFMV
-  above_src -= 4;
-  above_ref -= 4;
+    above_src -= 4;
+    above_ref -= 4;
 #else
-  left_src  = xd->dst.y_buffer - 2;
-  left_ref  = ref_y_buffer - 2;
+    left_src  = xd->dst.y_buffer - 2;
+    left_ref  = ref_y_buffer - 2;
 #endif
 
-  // Limit search to the predicted best few candidates
-  for(i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
-    int_mv this_mv;
-    int offset = 0;
-    int row_offset, col_offset;
+    // Limit search to the predicted best few candidates
+    for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
+      int_mv this_mv;
+      int offset = 0;
+      int row_offset, col_offset;
 
-    this_mv.as_int = mvlist[i].as_int;
+      this_mv.as_int = mvlist[i].as_int;
 
-    // If we see a 0,0 vector for a second time we have reached the end of
-    // the list of valid candidate vectors.
-    if (!this_mv.as_int && zero_seen)
-      break;
+      // If we see a 0,0 vector for a second time we have reached the end of
+      // the list of valid candidate vectors.
+      if (!this_mv.as_int && zero_seen)
+        break;
 
-    zero_seen = zero_seen || !this_mv.as_int;
+      zero_seen = zero_seen || !this_mv.as_int;
 
 #if !CONFIG_ABOVESPREFMV
-    clamp_mv(&this_mv,
-             xd->mb_to_left_edge - LEFT_TOP_MARGIN + 24,
-             xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
-             xd->mb_to_top_edge - LEFT_TOP_MARGIN + 24,
-             xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
+      clamp_mv(&this_mv,
+               xd->mb_to_left_edge - LEFT_TOP_MARGIN + 24,
+               xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
+               xd->mb_to_top_edge - LEFT_TOP_MARGIN + 24,
+               xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
 #else
-    clamp_mv(&this_mv,
-             xd->mb_to_left_edge - LEFT_TOP_MARGIN + 32,
-             xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
-             xd->mb_to_top_edge - LEFT_TOP_MARGIN + 24,
-             xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
+      clamp_mv(&this_mv,
+               xd->mb_to_left_edge - LEFT_TOP_MARGIN + 32,
+               xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
+               xd->mb_to_top_edge - LEFT_TOP_MARGIN + 24,
+               xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
 #endif
 
-    row_offset = this_mv.as_mv.row >> 3;
-    col_offset = this_mv.as_mv.col >> 3;
-    offset = ref_y_stride * row_offset + col_offset;
-    score = 0;
-    if (xd->up_available) {
-      vp9_sub_pixel_variance16x2(above_ref + offset, ref_y_stride,
-                                 SP(this_mv.as_mv.col),
-                                 SP(this_mv.as_mv.row),
-                                 above_src, xd->dst.y_stride, &sse);
-      score += sse;
-      if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {
-        vp9_sub_pixel_variance16x2(above_ref + offset + 16,
-                                   ref_y_stride,
-                                   SP(this_mv.as_mv.col),
-                                   SP(this_mv.as_mv.row),
-                                   above_src + 16, xd->dst.y_stride, &sse);
-        score += sse;
-      }
-      if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB64X64) {
-        vp9_sub_pixel_variance16x2(above_ref + offset + 32,
-                                   ref_y_stride,
-                                   SP(this_mv.as_mv.col),
-                                   SP(this_mv.as_mv.row),
-                                   above_src + 32, xd->dst.y_stride, &sse);
-        score += sse;
-        vp9_sub_pixel_variance16x2(above_ref + offset + 48,
-                                   ref_y_stride,
-                                   SP(this_mv.as_mv.col),
-                                   SP(this_mv.as_mv.row),
-                                   above_src + 48, xd->dst.y_stride, &sse);
-        score += sse;
-      }
-    }
+      row_offset = this_mv.as_mv.row >> 3;
+      col_offset = this_mv.as_mv.col >> 3;
+      offset = ref_y_stride * row_offset + col_offset;
+      score = 0;
 #if !CONFIG_ABOVESPREFMV
-    if (xd->left_available) {
-      vp9_sub_pixel_variance2x16_c(left_ref + offset, ref_y_stride,
+      if (xd->up_available) {
+#else
+      if (xd->up_available && xd->left_available) {
+#endif
+        vp9_sub_pixel_variance16x2(above_ref + offset, ref_y_stride,
                                    SP(this_mv.as_mv.col),
                                    SP(this_mv.as_mv.row),
-                                   left_src, xd->dst.y_stride, &sse);
-      score += sse;
-      if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {
-        vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 16,
+                                   above_src, xd->dst.y_stride, &sse);
+        score += sse;
+        if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {
+          vp9_sub_pixel_variance16x2(above_ref + offset + 16,
                                      ref_y_stride,
                                      SP(this_mv.as_mv.col),
                                      SP(this_mv.as_mv.row),
-                                     left_src + xd->dst.y_stride * 16,
-                                     xd->dst.y_stride, &sse);
-        score += sse;
-      }
-      if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB64X64) {
-        vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 32,
+                                     above_src + 16, xd->dst.y_stride, &sse);
+          score += sse;
+        }
+        if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB64X64) {
+          vp9_sub_pixel_variance16x2(above_ref + offset + 32,
                                      ref_y_stride,
                                      SP(this_mv.as_mv.col),
                                      SP(this_mv.as_mv.row),
-                                     left_src + xd->dst.y_stride * 32,
-                                     xd->dst.y_stride, &sse);
-        score += sse;
-        vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 48,
+                                     above_src + 32, xd->dst.y_stride, &sse);
+          score += sse;
+          vp9_sub_pixel_variance16x2(above_ref + offset + 48,
                                      ref_y_stride,
                                      SP(this_mv.as_mv.col),
                                      SP(this_mv.as_mv.row),
-                                     left_src + xd->dst.y_stride * 48,
-                                     xd->dst.y_stride, &sse);
+                                     above_src + 48, xd->dst.y_stride, &sse);
+          score += sse;
+        }
+      }
+#if !CONFIG_ABOVESPREFMV
+      if (xd->left_available) {
+        vp9_sub_pixel_variance2x16_c(left_ref + offset, ref_y_stride,
+                                     SP(this_mv.as_mv.col),
+                                     SP(this_mv.as_mv.row),
+                                     left_src, xd->dst.y_stride, &sse);
         score += sse;
+        if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {
+          vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 16,
+                                       ref_y_stride,
+                                       SP(this_mv.as_mv.col),
+                                       SP(this_mv.as_mv.row),
+                                       left_src + xd->dst.y_stride * 16,
+                                       xd->dst.y_stride, &sse);
+          score += sse;
+        }
+        if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB64X64) {
+          vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 32,
+                                     ref_y_stride,
+                                       SP(this_mv.as_mv.col),
+                                       SP(this_mv.as_mv.row),
+                                       left_src + xd->dst.y_stride * 32,
+                                       xd->dst.y_stride, &sse);
+          score += sse;
+          vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 48,
+                                       ref_y_stride,
+                                       SP(this_mv.as_mv.col),
+                                       SP(this_mv.as_mv.row),
+                                       left_src + xd->dst.y_stride * 48,
+                                       xd->dst.y_stride, &sse);
+          score += sse;
+        }
       }
-    }
 #endif
-    // Add the entry to our list and then resort the list on score.
-    ref_scores[i] = score;
-    sorted_mvs[i].as_int = this_mv.as_int;
-    j = i;
-    while (j > 0) {
-      if (ref_scores[j] < ref_scores[j-1]) {
-        ref_scores[j] = ref_scores[j-1];
-        sorted_mvs[j].as_int = sorted_mvs[j-1].as_int;
-        ref_scores[j-1] = score;
-        sorted_mvs[j-1].as_int = this_mv.as_int;
-        j--;
-      } else
-        break;
+      // Add the entry to our list and then re-sort the list on score.
+      ref_scores[i] = score;
+      sorted_mvs[i].as_int = this_mv.as_int;
+      j = i;
+      while (j > 0) {
+        if (ref_scores[j] < ref_scores[j-1]) {
+          ref_scores[j] = ref_scores[j-1];
+          sorted_mvs[j].as_int = sorted_mvs[j-1].as_int;
+          ref_scores[j-1] = score;
+          sorted_mvs[j-1].as_int = this_mv.as_int;
+          j--;
+        } else {
+          break;
+        }
+      }
     }
+  } else {
+    vpx_memcpy(sorted_mvs, mvlist, sizeof(sorted_mvs));
   }
 
   // Make sure all the candidates are properly clamped etc
@@ -273,21 +283,16 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
     clamp_mv2(&sorted_mvs[i], xd);
   }
 
-  // Provided that there are non zero vectors available there will not
-  // be more than one 0,0 entry in the sorted list.
-  // The best ref mv is always set to the first entry (which gave the best
-  // results. The nearest is set to the first non zero vector if available and
-  // near to the second non zero vector if available.
-  // We do not use 0,0 as a nearest or near as 0,0 has its own mode.
-  if ( sorted_mvs[0].as_int ) {
-    nearest->as_int = sorted_mvs[0].as_int;
-    if ( sorted_mvs[1].as_int )
-      near->as_int = sorted_mvs[1].as_int;
-    else
-      near->as_int = sorted_mvs[2].as_int;
+  // Nearest may be a 0,0 or non-zero vector and now matches the chosen
+  // "best reference". This has advantages when it is used as part of a
+  // compound predictor, as it means a non-zero vector can be paired using
+  // this mode with a 0 vector. The Near vector is still forced to be a
+  // non-zero candidate if one is available.
+  nearest->as_int = sorted_mvs[0].as_int;
+  if ( sorted_mvs[1].as_int ) {
+    near->as_int = sorted_mvs[1].as_int;
   } else {
-      nearest->as_int = sorted_mvs[1].as_int;
-      near->as_int = sorted_mvs[2].as_int;
+    near->as_int = sorted_mvs[2].as_int;
   }
 
   // Copy back the re-ordered mv list
diff --git a/vp9/common/vp9_findnearmv.h b/vp9/common/vp9_findnearmv.h
index a66a7de27..c42aab1a5 100644
--- a/vp9/common/vp9_findnearmv.h
+++ b/vp9/common/vp9_findnearmv.h
@@ -28,7 +28,8 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
                            int_mv *nearest,
                            int_mv *near);
 
-static void mv_bias(int refmb_ref_frame_sign_bias, int refframe, int_mv *mvp, const int *ref_frame_sign_bias) {
+static void mv_bias(int refmb_ref_frame_sign_bias, int refframe,
+                    int_mv *mvp, const int *ref_frame_sign_bias) {
   MV xmv;
   xmv = mvp->as_mv;
 
@@ -83,8 +84,12 @@ vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc,
 
 extern const uint8_t vp9_mbsplit_offset[4][16];
 
-static int left_block_mv(const MODE_INFO *cur_mb, int b) {
+static int left_block_mv(const MACROBLOCKD *xd,
+                         const MODE_INFO *cur_mb, int b) {
   if (!(b & 3)) {
+    if (!xd->left_available)
+      return 0;
+
     /* On L edge, get from MB to left of us */
     --cur_mb;
 
@@ -93,11 +98,15 @@ static int left_block_mv(const MODE_INFO *cur_mb, int b) {
     b += 4;
   }
 
-  return (cur_mb->bmi + b - 1)->as_mv.first.as_int;
+  return (cur_mb->bmi + b - 1)->as_mv[0].as_int;
 }
 
-static int left_block_second_mv(const MODE_INFO *cur_mb, int b) {
+static int left_block_second_mv(const MACROBLOCKD *xd,
+                                const MODE_INFO *cur_mb, int b) {
   if (!(b & 3)) {
+    if (!xd->left_available)
+      return 0;
+
     /* On L edge, get from MB to left of us */
     --cur_mb;
 
@@ -108,8 +117,8 @@ static int left_block_second_mv(const MODE_INFO *cur_mb, int b) {
   }
 
   return cur_mb->mbmi.second_ref_frame > 0 ?
-      (cur_mb->bmi + b - 1)->as_mv.second.as_int :
-      (cur_mb->bmi + b - 1)->as_mv.first.as_int;
+      (cur_mb->bmi + b - 1)->as_mv[1].as_int :
+      (cur_mb->bmi + b - 1)->as_mv[0].as_int;
 }
 
 static int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride) {
@@ -122,7 +131,7 @@ static int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride) {
     b += 16;
   }
 
-  return (cur_mb->bmi + b - 4)->as_mv.first.as_int;
+  return (cur_mb->bmi + b - 4)->as_mv[0].as_int;
 }
 
 static int above_block_second_mv(const MODE_INFO *cur_mb, int b, int mi_stride) {
@@ -137,8 +146,8 @@ static int above_block_second_mv(const MODE_INFO *cur_mb, int b, int mi_stride)
   }
 
   return cur_mb->mbmi.second_ref_frame > 0 ?
-      (cur_mb->bmi + b - 4)->as_mv.second.as_int :
-      (cur_mb->bmi + b - 4)->as_mv.first.as_int;
+      (cur_mb->bmi + b - 4)->as_mv[1].as_int :
+      (cur_mb->bmi + b - 4)->as_mv[0].as_int;
 }
 
 static B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) {
diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h
new file mode 100644
index 000000000..17d0134fa
--- /dev/null
+++ b/vp9/common/vp9_idct.h
@@ -0,0 +1,94 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_IDCT_H_
+#define VP9_COMMON_VP9_IDCT_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
+
+/* If we don't want to use ROUND_POWER_OF_TWO macro
+static INLINE int16_t round_power_of_two(int16_t value, int n) {
+  return (value + (1 << (n - 1))) >> n;
+}*/
+
+// Constants and Macros used by all idct/dct functions
+#define DCT_CONST_BITS 14
+#define DCT_CONST_ROUNDING  (1 << (DCT_CONST_BITS - 1))
+// Constants are round(16384 * cos(k*Pi/64)) where k = 1 to 31.
+// Note: sin(k*Pi/64) = cos((32-k)*Pi/64)
+static const int cospi_1_64  = 16364;
+static const int cospi_2_64  = 16305;
+static const int cospi_3_64  = 16207;
+static const int cospi_4_64  = 16069;
+static const int cospi_5_64  = 15893;
+static const int cospi_6_64  = 15679;
+static const int cospi_7_64  = 15426;
+static const int cospi_8_64  = 15137;
+static const int cospi_9_64  = 14811;
+static const int cospi_10_64 = 14449;
+static const int cospi_11_64 = 14053;
+static const int cospi_12_64 = 13623;
+static const int cospi_13_64 = 13160;
+static const int cospi_14_64 = 12665;
+static const int cospi_15_64 = 12140;
+static const int cospi_16_64 = 11585;
+static const int cospi_17_64 = 11003;
+static const int cospi_18_64 = 10394;
+static const int cospi_19_64 = 9760;
+static const int cospi_20_64 = 9102;
+static const int cospi_21_64 = 8423;
+static const int cospi_22_64 = 7723;
+static const int cospi_23_64 = 7005;
+static const int cospi_24_64 = 6270;
+static const int cospi_25_64 = 5520;
+static const int cospi_26_64 = 4756;
+static const int cospi_27_64 = 3981;
+static const int cospi_28_64 = 3196;
+static const int cospi_29_64 = 2404;
+static const int cospi_30_64 = 1606;
+static const int cospi_31_64 = 804;
+
+// Constants are round(16384 * sqrt(2) * sin(k*Pi/9) * 2 / 3) where k = 1 to 4.
+static const int sinpi_1_9 = 5283;
+static const int sinpi_2_9 = 9929;
+static const int sinpi_3_9 = 13377;
+static const int sinpi_4_9 = 15212;
+
+static INLINE int dct_const_round_shift(int input) {
+  int rv = (input + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+  assert(INT16_MIN <= rv && rv <= INT16_MAX);
+  return rv;
+}
+
+static INLINE int dct_32_round(int input) {
+  int rv = (input + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+  assert(-131072 <= rv && rv <= 131071);
+  return rv;
+}
+
+typedef void (*transform_1d)(int16_t*, int16_t*);
+
+typedef struct {
+  transform_1d cols, rows;  // vertical and horizontal
+} transform_2d;
+
+#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
+
+/* If we don't want to use ROUND_POWER_OF_TWO macro
+static INLINE int16_t round_power_of_two(int16_t value, int n) {
+  return (value + (1 << (n - 1))) >> n;
+}*/
+
+#endif  // VP9_COMMON_VP9_IDCT_H_
diff --git a/vp9/common/vp9_idctllm.c b/vp9/common/vp9_idctllm.c
index 106ef9c19..673abd7b1 100644
--- a/vp9/common/vp9_idctllm.c
+++ b/vp9/common/vp9_idctllm.c
@@ -24,400 +24,25 @@
  **************************************************************************/
 #include <assert.h>
 #include <math.h>
+
 #include "./vpx_config.h"
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_idct.h"
 
-static const int cospi8sqrt2minus1 = 20091;
-static const int sinpi8sqrt2      = 35468;
-static const int rounding = 0;
-
-static const int16_t idct_i4[16] = {
-  8192,  10703,  8192,   4433,
-  8192,   4433, -8192, -10703,
-  8192,  -4433, -8192,  10703,
-  8192, -10703,  8192,  -4433
-};
-
-static const int16_t iadst_i4[16] = {
-   3736,  9459, 10757,   7021,
-   7021,  9459, -3736, -10757,
-   9459,     0, -9459,   9459,
-  10757, -9459,  7021,  -3736
-};
-
-static const int16_t idct_i8[64] = {
-   5793,  8035,  7568,  6811,
-   5793,  4551,  3135,  1598,
-   5793,  6811,  3135, -1598,
-  -5793, -8035, -7568, -4551,
-   5793,  4551, -3135, -8035,
-  -5793,  1598,  7568,  6811,
-   5793,  1598, -7568, -4551,
-   5793,  6811, -3135, -8035,
-   5793, -1598, -7568,  4551,
-   5793, -6811, -3135,  8035,
-   5793, -4551, -3135,  8035,
-  -5793, -1598,  7568, -6811,
-   5793, -6811,  3135,  1598,
-  -5793,  8035, -7568,  4551,
-   5793, -8035,  7568, -6811,
-   5793, -4551,  3135, -1598
-};
-
-static const int16_t iadst_i8[64] = {
-   1460,  4184,  6342,  7644,
-   7914,  7114,  5354,  2871,
-   2871,  7114,  7644,  4184,
-  -1460, -6342, -7914, -5354,
-   4184,  7914,  2871, -5354,
-  -7644, -1460,  6342,  7114,
-   5354,  6342, -4184, -7114,
-   2871,  7644, -1460, -7914,
-   6342,  2871, -7914,  1460,
-   7114, -5354, -4184,  7644,
-   7114, -1460, -5354,  7914,
-  -4184, -2871,  7644, -6342,
-   7644, -5354,  1460,  2871,
-  -6342,  7914, -7114,  4184,
-   7914, -7644,  7114, -6342,
-   5354, -4184,  2871, -1460
-};
-
-
-
-static const int16_t idct_i16[256] = {
-   4096,  5765,  5681,  5543,  5352,  5109,  4816,  4478,
-   4096,  3675,  3218,  2731,  2217,  1682,  1130,   568,
-   4096,  5543,  4816,  3675,  2217,   568, -1130, -2731,
-  -4096, -5109, -5681, -5765, -5352, -4478, -3218, -1682,
-   4096,  5109,  3218,   568, -2217, -4478, -5681, -5543,
-  -4096, -1682,  1130,  3675,  5352,  5765,  4816,  2731,
-   4096,  4478,  1130, -2731, -5352, -5543, -3218,   568,
-   4096,  5765,  4816,  1682, -2217, -5109, -5681, -3675,
-   4096,  3675, -1130, -5109, -5352, -1682,  3218,  5765,
-   4096,  -568, -4816, -5543, -2217,  2731,  5681,  4478,
-   4096,  2731, -3218, -5765, -2217,  3675,  5681,  1682,
-  -4096, -5543, -1130,  4478,  5352,   568, -4816, -5109,
-   4096,  1682, -4816, -4478,  2217,  5765,  1130, -5109,
-  -4096,  2731,  5681,   568, -5352, -3675,  3218,  5543,
-   4096,   568, -5681, -1682,  5352,  2731, -4816, -3675,
-   4096,  4478, -3218, -5109,  2217,  5543, -1130, -5765,
-   4096,  -568, -5681,  1682,  5352, -2731, -4816,  3675,
-   4096, -4478, -3218,  5109,  2217, -5543, -1130,  5765,
-   4096, -1682, -4816,  4478,  2217, -5765,  1130,  5109,
-  -4096, -2731,  5681,  -568, -5352,  3675,  3218, -5543,
-   4096, -2731, -3218,  5765, -2217, -3675,  5681, -1682,
-  -4096,  5543, -1130, -4478,  5352,  -568, -4816,  5109,
-   4096, -3675, -1130,  5109, -5352,  1682,  3218, -5765,
-   4096,   568, -4816,  5543, -2217, -2731,  5681, -4478,
-   4096, -4478,  1130,  2731, -5352,  5543, -3218,  -568,
-   4096, -5765,  4816, -1682, -2217,  5109, -5681,  3675,
-   4096, -5109,  3218,  -568, -2217,  4478, -5681,  5543,
-  -4096,  1682,  1130, -3675,  5352, -5765,  4816, -2731,
-   4096, -5543,  4816, -3675,  2217,  -568, -1130,  2731,
-  -4096,  5109, -5681,  5765, -5352,  4478, -3218,  1682,
-   4096, -5765,  5681, -5543,  5352, -5109,  4816, -4478,
-   4096, -3675,  3218, -2731,  2217, -1682,  1130,  -568
-};
-
-static const int16_t iadst_i16[256] = {
-    542,  1607,  2614,  3526,  4311,  4940,  5390,  5646,
-   5698,  5543,  5189,  4646,  3936,  3084,  2120,  1080,
-   1080,  3084,  4646,  5543,  5646,  4940,  3526,  1607,
-   -542, -2614, -4311, -5390, -5698, -5189, -3936, -2120,
-   1607,  4311,  5646,  5189,  3084,     0, -3084, -5189,
-  -5646, -4311, -1607,  1607,  4311,  5646,  5189,  3084,
-   2120,  5189,  5390,  2614, -1607, -4940, -5543, -3084,
-   1080,  4646,  5646,  3526, -542,  -4311, -5698, -3936,
-   2614,  5646,  3936, -1080, -5189, -4940,  -542,  4311,
-   5543,  2120, -3084, -5698, -3526,  1607,  5390,  4646,
-   3084,  5646,  1607, -4311, -5189,     0,  5189,  4311,
-  -1607, -5646, -3084,  3084,  5646,  1607, -4311, -5189,
-   3526,  5189, -1080, -5698, -1607,  4940,  3936, -3084,
-  -5390,   542,  5646,  2120, -4646, -4311,  2614,  5543,
-   3936,  4311, -3526, -4646,  3084,  4940, -2614, -5189,
-   2120,  5390, -1607, -5543,  1080,  5646,  -542, -5698,
-   4311,  3084, -5189, -1607,  5646,     0, -5646,  1607,
-   5189, -3084, -4311,  4311,  3084, -5189, -1607,  5646,
-   4646,  1607, -5698,  2120,  4311, -4940, -1080,  5646,
-  -2614, -3936,  5189,   542, -5543,  3084,  3526, -5390,
-   4940,     0, -4940,  4940,     0, -4940,  4940,     0,
-  -4940,  4940,     0, -4940,  4940,     0, -4940,  4940,
-   5189, -1607, -3084,  5646, -4311,     0,  4311, -5646,
-   3084,  1607, -5189,  5189, -1607, -3084,  5646, -4311,
-   5390, -3084,  -542,  3936, -5646,  4940, -2120, -1607,
-   4646, -5698,  4311, -1080, -2614,  5189, -5543,  3526,
-   5543, -4311,  2120,   542, -3084,  4940, -5698,  5189,
-  -3526,  1080,  1607, -3936,  5390, -5646,  4646, -2614,
-   5646, -5189,  4311, -3084,  1607,     0, -1607,  3084,
-  -4311,  5189, -5646,  5646, -5189,  4311, -3084,  1607,
-   5698, -5646,  5543, -5390,  5189, -4940,  4646, -4311,
-   3936, -3526,  3084, -2614,  2120, -1607,  1080,  -542
-};
-
-
-/* Converted the transforms to integer form. */
-#define HORIZONTAL_SHIFT 14  // 16
-#define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1)
-#define VERTICAL_SHIFT 17  // 15
-#define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1)
-void vp9_ihtllm_c(const int16_t *input, int16_t *output, int pitch,
-                      TX_TYPE tx_type, int tx_dim, uint16_t eobs) {
-  int i, j, k;
-  int nz_dim;
-  int16_t imbuf[256];
-
-  const int16_t *ip = input;
-  int16_t *op = output;
-  int16_t *im = &imbuf[0];
-
-  /* pointers to vertical and horizontal transforms. */
-  const int16_t *ptv = NULL, *pth = NULL;
-  int shortpitch = pitch >> 1;
-
-  switch (tx_type) {
-    case ADST_ADST :
-      ptv = pth = (tx_dim == 4) ? &iadst_i4[0]
-                                  : ((tx_dim == 8) ? &iadst_i8[0]
-                                                     : &iadst_i16[0]);
-      break;
-    case ADST_DCT  :
-      ptv = (tx_dim == 4) ? &iadst_i4[0]
-                            : ((tx_dim == 8) ? &iadst_i8[0] : &iadst_i16[0]);
-      pth = (tx_dim == 4) ? &idct_i4[0]
-                            : ((tx_dim == 8) ? &idct_i8[0] : &idct_i16[0]);
-      break;
-    case  DCT_ADST :
-      ptv = (tx_dim == 4) ? &idct_i4[0]
-                            : ((tx_dim == 8) ? &idct_i8[0] : &idct_i16[0]);
-      pth = (tx_dim == 4) ? &iadst_i4[0]
-                            : ((tx_dim == 8) ? &iadst_i8[0] : &iadst_i16[0]);
-      break;
-    case  DCT_DCT :
-      ptv = pth = (tx_dim == 4) ? &idct_i4[0]
-                                  : ((tx_dim == 8) ? &idct_i8[0]
-                                                     : &idct_i16[0]);
-      break;
-    default:
-      assert(0);
-      break;
-  }
-
-  nz_dim = tx_dim;
-  if(tx_dim > 4) {
-    if(eobs < 36) {
-      vpx_memset(im, 0, 512);
-      nz_dim = 8;
-      if(eobs < 3) {
-        nz_dim = 2;
-      } else if(eobs < 10) {
-        nz_dim = 4;
-      }
-    }
-  }
-
-  /* 2-D inverse transform X = M1*Z*Transposed_M2 is calculated in 2 steps
-   * from right to left:
-   * 1. horizontal transform: Y= Z*Transposed_M2
-   * 2. vertical transform: X = M1*Y
-   * In SIMD, doing this way could eliminate the transpose needed if it is
-   * calculated from left to right.
-   */
-  /* Horizontal transformation */
-  for (j = 0; j < tx_dim; j++) {
-    for (i = 0; i < nz_dim; i++) {
-      int temp = 0;
-
-      for (k = 0; k < nz_dim; k++) {
-        temp += ip[k] * pth[k];
-      }
-
-      /* Calculate im and store it in its transposed position. */
-      im[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT);
-      ip += tx_dim;
-    }
-    im += tx_dim;
-    pth += tx_dim;
-    ip = input;
-  }
-
-  /* Vertical transformation */
-  im = &imbuf[0];
-
-  for (i = 0; i < tx_dim; i++) {
-    for (j = 0; j < tx_dim; j++) {
-      int temp = 0;
-
-      for (k = 0; k < nz_dim; k++) {
-        temp += ptv[k] * im[k];
-      }
-
-      op[j] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT);
-      im += tx_dim;
-    }
-    im = &imbuf[0];
-    ptv += tx_dim;
-    op += shortpitch;
-  }
-}
-
-void vp9_short_idct4x4llm_c(int16_t *input, int16_t *output, int pitch) {
-  int i;
-  int a1, b1, c1, d1;
-
-  int16_t *ip = input;
-  int16_t *op = output;
-  int temp1, temp2;
-  int shortpitch = pitch >> 1;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] + ip[8];
-    b1 = ip[0] - ip[8];
-
-    temp1 = (ip[4] * sinpi8sqrt2 + rounding) >> 16;
-    temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1 + rounding) >> 16);
-    c1 = temp1 - temp2;
-
-    temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1 + rounding) >> 16);
-    temp2 = (ip[12] * sinpi8sqrt2 + rounding) >> 16;
-    d1 = temp1 + temp2;
-
-    op[shortpitch * 0] = a1 + d1;
-    op[shortpitch * 3] = a1 - d1;
-
-    op[shortpitch * 1] = b1 + c1;
-    op[shortpitch * 2] = b1 - c1;
-
-    ip++;
-    op++;
-  }
-
-  ip = output;
-  op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] + ip[2];
-    b1 = ip[0] - ip[2];
-
-    temp1 = (ip[1] * sinpi8sqrt2 + rounding) >> 16;
-    temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1 + rounding) >> 16);
-    c1 = temp1 - temp2;
-
-    temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1 + rounding) >> 16);
-    temp2 = (ip[3] * sinpi8sqrt2 + rounding) >> 16;
-    d1 = temp1 + temp2;
-
-    op[0] = (a1 + d1 + 16) >> 5;
-    op[3] = (a1 - d1 + 16) >> 5;
-
-    op[1] = (b1 + c1 + 16) >> 5;
-    op[2] = (b1 - c1 + 16) >> 5;
-
-    ip += shortpitch;
-    op += shortpitch;
-  }
-}
-
-void vp9_short_idct4x4llm_1_c(int16_t *input, int16_t *output, int pitch) {
-  int i;
-  int a1;
-  int16_t *op = output;
-  int shortpitch = pitch >> 1;
-  a1 = ((input[0] + 16) >> 5);
-  for (i = 0; i < 4; i++) {
-    op[0] = a1;
-    op[1] = a1;
-    op[2] = a1;
-    op[3] = a1;
-    op += shortpitch;
-  }
-}
-
-void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr,
-                            uint8_t *dst_ptr, int pitch, int stride) {
-  int a1 = ((input_dc + 16) >> 5);
-  int r, c;
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      dst_ptr[c] = clip_pixel(a1 + pred_ptr[c]);
-    }
-
-    dst_ptr += stride;
-    pred_ptr += pitch;
-  }
-}
-
-void vp9_short_inv_walsh4x4_c(int16_t *input, int16_t *output) {
-  int i;
-  int a1, b1, c1, d1;
-  int16_t *ip = input;
-  int16_t *op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ((ip[0] + ip[3]));
-    b1 = ((ip[1] + ip[2]));
-    c1 = ((ip[1] - ip[2]));
-    d1 = ((ip[0] - ip[3]));
-
-    op[0] = (a1 + b1 + 1) >> 1;
-    op[1] = (c1 + d1) >> 1;
-    op[2] = (a1 - b1) >> 1;
-    op[3] = (d1 - c1) >> 1;
-
-    ip += 4;
-    op += 4;
-  }
-
-  ip = output;
-  op = output;
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] + ip[12];
-    b1 = ip[4] + ip[8];
-    c1 = ip[4] - ip[8];
-    d1 = ip[0] - ip[12];
-    op[0] = (a1 + b1 + 1) >> 1;
-    op[4] = (c1 + d1) >> 1;
-    op[8] = (a1 - b1) >> 1;
-    op[12] = (d1 - c1) >> 1;
-    ip++;
-    op++;
-  }
-}
-
-void vp9_short_inv_walsh4x4_1_c(int16_t *in, int16_t *out) {
-  int i;
-  int16_t tmp[4];
-  int16_t *ip = in;
-  int16_t *op = tmp;
-
-  op[0] = (ip[0] + 1) >> 1;
-  op[1] = op[2] = op[3] = (ip[0] >> 1);
-
-  ip = tmp;
-  op = out;
-  for (i = 0; i < 4; i++) {
-    op[0] = (ip[0] + 1) >> 1;
-    op[4] = op[8] = op[12] = (ip[0] >> 1);
-    ip++;
-    op++;
-  }
-}
-
-#if CONFIG_LOSSLESS
-void vp9_short_inv_walsh4x4_lossless_c(int16_t *input, int16_t *output) {
+void vp9_short_inv_walsh4x4_x8_c(int16_t *input, int16_t *output, int pitch) {
   int i;
   int a1, b1, c1, d1;
   int16_t *ip = input;
   int16_t *op = output;
+  const int half_pitch = pitch >> 1;
 
   for (i = 0; i < 4; i++) {
-    a1 = ((ip[0] + ip[3])) >> Y2_WHT_UPSCALE_FACTOR;
-    b1 = ((ip[1] + ip[2])) >> Y2_WHT_UPSCALE_FACTOR;
-    c1 = ((ip[1] - ip[2])) >> Y2_WHT_UPSCALE_FACTOR;
-    d1 = ((ip[0] - ip[3])) >> Y2_WHT_UPSCALE_FACTOR;
+    a1 = (ip[0] + ip[3]) >> WHT_UPSCALE_FACTOR;
+    b1 = (ip[1] + ip[2]) >> WHT_UPSCALE_FACTOR;
+    c1 = (ip[1] - ip[2]) >> WHT_UPSCALE_FACTOR;
+    d1 = (ip[0] - ip[3]) >> WHT_UPSCALE_FACTOR;
 
     op[0] = (a1 + b1 + 1) >> 1;
     op[1] = (c1 + d1) >> 1;
@@ -425,941 +50,602 @@ void vp9_short_inv_walsh4x4_lossless_c(int16_t *input, int16_t *output) {
     op[3] = (d1 - c1) >> 1;
 
     ip += 4;
-    op += 4;
+    op += half_pitch;
   }
 
   ip = output;
   op = output;
   for (i = 0; i < 4; i++) {
-    a1 = ip[0] + ip[12];
-    b1 = ip[4] + ip[8];
-    c1 = ip[4] - ip[8];
-    d1 = ip[0] - ip[12];
+    a1 = ip[half_pitch * 0] + ip[half_pitch * 3];
+    b1 = ip[half_pitch * 1] + ip[half_pitch * 2];
+    c1 = ip[half_pitch * 1] - ip[half_pitch * 2];
+    d1 = ip[half_pitch * 0] - ip[half_pitch * 3];
 
 
-    op[0] = ((a1 + b1 + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
-    op[4] = ((c1 + d1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
-    op[8] = ((a1 - b1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
-    op[12] = ((d1 - c1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
+    op[half_pitch * 0] = (a1 + b1 + 1) >> 1;
+    op[half_pitch * 1] = (c1 + d1) >> 1;
+    op[half_pitch * 2] = (a1 - b1) >> 1;
+    op[half_pitch * 3] = (d1 - c1) >> 1;
 
     ip++;
     op++;
   }
 }
 
-void vp9_short_inv_walsh4x4_1_lossless_c(int16_t *in, int16_t *out) {
+void vp9_short_inv_walsh4x4_1_x8_c(int16_t *in, int16_t *out, int pitch) {
   int i;
   int16_t tmp[4];
   int16_t *ip = in;
   int16_t *op = tmp;
+  const int half_pitch = pitch >> 1;
 
-  op[0] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) + 1) >> 1;
-  op[1] = op[2] = op[3] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) >> 1);
+  op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1;
+  op[1] = op[2] = op[3] = (ip[0] >> WHT_UPSCALE_FACTOR) >> 1;
 
   ip = tmp;
   op = out;
   for (i = 0; i < 4; i++) {
-    op[0] = ((ip[0] + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
-    op[4] = op[8] = op[12] = ((ip[0] >> 1)) << Y2_WHT_UPSCALE_FACTOR;
+    op[half_pitch * 0] = (ip[0] + 1) >> 1;
+    op[half_pitch * 1] = op[half_pitch * 2] = op[half_pitch * 3] = ip[0] >> 1;
     ip++;
     op++;
   }
 }
 
-void vp9_short_inv_walsh4x4_x8_c(int16_t *input, int16_t *output, int pitch) {
-  int i;
-  int a1, b1, c1, d1;
-  int16_t *ip = input;
-  int16_t *op = output;
-  int shortpitch = pitch >> 1;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ((ip[0] + ip[3])) >> WHT_UPSCALE_FACTOR;
-    b1 = ((ip[1] + ip[2])) >> WHT_UPSCALE_FACTOR;
-    c1 = ((ip[1] - ip[2])) >> WHT_UPSCALE_FACTOR;
-    d1 = ((ip[0] - ip[3])) >> WHT_UPSCALE_FACTOR;
+void vp9_dc_only_inv_walsh_add_c(int input_dc, uint8_t *pred_ptr,
+                                 uint8_t *dst_ptr,
+                                 int pitch, int stride) {
+  int r, c;
+  int16_t dc = input_dc;
+  int16_t tmp[4 * 4];
+  vp9_short_inv_walsh4x4_1_x8_c(&dc, tmp, 4 << 1);
 
-    op[0] = (a1 + b1 + 1) >> 1;
-    op[1] = (c1 + d1) >> 1;
-    op[2] = (a1 - b1) >> 1;
-    op[3] = (d1 - c1) >> 1;
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 4; c++)
+      dst_ptr[c] = clip_pixel(tmp[r * 4 + c] + pred_ptr[c]);
 
-    ip += 4;
-    op += shortpitch;
+    dst_ptr += stride;
+    pred_ptr += pitch;
   }
+}
 
-  ip = output;
-  op = output;
-  for (i = 0; i < 4; i++) {
-    a1 = ip[shortpitch * 0] + ip[shortpitch * 3];
-    b1 = ip[shortpitch * 1] + ip[shortpitch * 2];
-    c1 = ip[shortpitch * 1] - ip[shortpitch * 2];
-    d1 = ip[shortpitch * 0] - ip[shortpitch * 3];
+static void idct4_1d(int16_t *input, int16_t *output) {
+  int16_t step[4];
+  int temp1, temp2;
+  // stage 1
+  temp1 = (input[0] + input[2]) * cospi_16_64;
+  temp2 = (input[0] - input[2]) * cospi_16_64;
+  step[0] = dct_const_round_shift(temp1);
+  step[1] = dct_const_round_shift(temp2);
+  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+  step[2] = dct_const_round_shift(temp1);
+  step[3] = dct_const_round_shift(temp2);
+
+  // stage 2
+  output[0] = step[0] + step[3];
+  output[1] = step[1] + step[2];
+  output[2] = step[1] - step[2];
+  output[3] = step[0] - step[3];
+}
 
+void vp9_short_idct4x4llm_c(int16_t *input, int16_t *output, int pitch) {
+  int16_t out[4 * 4];
+  int16_t *outptr = out;
+  const int half_pitch = pitch >> 1;
+  int i, j;
+  int16_t temp_in[4], temp_out[4];
 
-    op[shortpitch * 0] = (a1 + b1 + 1) >> 1;
-    op[shortpitch * 1] = (c1 + d1) >> 1;
-    op[shortpitch * 2] = (a1 - b1) >> 1;
-    op[shortpitch * 3] = (d1 - c1) >> 1;
+  // Rows
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j)
+      temp_in[j] = input[j];
+    idct4_1d(temp_in, outptr);
+    input += 4;
+    outptr += 4;
+  }
 
-    ip++;
-    op++;
+  // Columns
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j)
+      temp_in[j] = out[j * 4 + i];
+    idct4_1d(temp_in, temp_out);
+    for (j = 0; j < 4; ++j)
+      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
   }
 }
 
-void vp9_short_inv_walsh4x4_1_x8_c(int16_t *in, int16_t *out, int pitch) {
+void vp9_short_idct4x4llm_1_c(int16_t *input, int16_t *output, int pitch) {
   int i;
-  int16_t tmp[4];
-  int16_t *ip = in;
-  int16_t *op = tmp;
-  int shortpitch = pitch >> 1;
-
-  op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1;
-  op[1] = op[2] = op[3] = ((ip[0] >> WHT_UPSCALE_FACTOR) >> 1);
-
+  int a1;
+  int16_t *op = output;
+  const int half_pitch = pitch >> 1;
+  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+  out = dct_const_round_shift(out * cospi_16_64);
+  a1 = ROUND_POWER_OF_TWO(out, 4);
 
-  ip = tmp;
-  op = out;
   for (i = 0; i < 4; i++) {
-    op[shortpitch * 0] = (ip[0] + 1) >> 1;
-    op[shortpitch * 1] = op[shortpitch * 2] = op[shortpitch * 3] = ip[0] >> 1;
-    ip++;
-    op++;
+    op[0] = op[1] = op[2] = op[3] = a1;
+    op += half_pitch;
   }
 }
 
-void vp9_dc_only_inv_walsh_add_c(short input_dc, uint8_t *pred_ptr,
-                                 uint8_t *dst_ptr,
-                                 int pitch, int stride) {
+void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr,
+                            uint8_t *dst_ptr, int pitch, int stride) {
+  int a1;
   int r, c;
-  short tmp[16];
-  vp9_short_inv_walsh4x4_1_x8_c(&input_dc, tmp, 4 << 1);
+  int16_t out = dct_const_round_shift(input_dc * cospi_16_64);
+  out = dct_const_round_shift(out * cospi_16_64);
+  a1 = ROUND_POWER_OF_TWO(out, 4);
 
   for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      dst_ptr[c] = clip_pixel(tmp[r * 4 + c] + pred_ptr[c]);
-    }
+    for (c = 0; c < 4; c++)
+      dst_ptr[c] = clip_pixel(a1 + pred_ptr[c]);
 
     dst_ptr += stride;
     pred_ptr += pitch;
   }
 }
-#endif
-
-void vp9_dc_only_idct_add_8x8_c(short input_dc,
-                                uint8_t *pred_ptr,
-                                uint8_t *dst_ptr,
-                                int pitch, int stride) {
-  int a1 = ((input_dc + 16) >> 5);
-  int r, c, b;
-  uint8_t *orig_pred = pred_ptr;
-  uint8_t *orig_dst = dst_ptr;
-  for (b = 0; b < 4; b++) {
-    for (r = 0; r < 4; r++) {
-      for (c = 0; c < 4; c++) {
-        dst_ptr[c] = clip_pixel(a1 + pred_ptr[c]);
-      }
-
-      dst_ptr += stride;
-      pred_ptr += pitch;
-    }
-    dst_ptr = orig_dst + (b + 1) % 2 * 4 + (b + 1) / 2 * 4 * stride;
-    pred_ptr = orig_pred + (b + 1) % 2 * 4 + (b + 1) / 2 * 4 * pitch;
-  }
+
+static void idct8_1d(int16_t *input, int16_t *output) {
+  int16_t step1[8], step2[8];
+  int temp1, temp2;
+  // stage 1
+  step1[0] = input[0];
+  step1[2] = input[4];
+  step1[1] = input[2];
+  step1[3] = input[6];
+  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+  step1[4] = dct_const_round_shift(temp1);
+  step1[7] = dct_const_round_shift(temp2);
+  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+  step1[5] = dct_const_round_shift(temp1);
+  step1[6] = dct_const_round_shift(temp2);
+
+  // stage 2 & stage 3 - even half
+  idct4_1d(step1, step1);
+
+  // stage 2 - odd half
+  step2[4] = step1[4] + step1[5];
+  step2[5] = step1[4] - step1[5];
+  step2[6] = -step1[6] + step1[7];
+  step2[7] = step1[6] + step1[7];
+
+  // stage 3 - odd half
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = dct_const_round_shift(temp1);
+  step1[6] = dct_const_round_shift(temp2);
+  step1[7] = step2[7];
+
+  // stage 4
+  output[0] = step1[0] + step1[7];
+  output[1] = step1[1] + step1[6];
+  output[2] = step1[2] + step1[5];
+  output[3] = step1[3] + step1[4];
+  output[4] = step1[3] - step1[4];
+  output[5] = step1[2] - step1[5];
+  output[6] = step1[1] - step1[6];
+  output[7] = step1[0] - step1[7];
 }
 
-#define W1 2841                 /* 2048*sqrt(2)*cos(1*pi/16) */
-#define W2 2676                 /* 2048*sqrt(2)*cos(2*pi/16) */
-#define W3 2408                 /* 2048*sqrt(2)*cos(3*pi/16) */
-#define W5 1609                 /* 2048*sqrt(2)*cos(5*pi/16) */
-#define W6 1108                 /* 2048*sqrt(2)*cos(6*pi/16) */
-#define W7 565                  /* 2048*sqrt(2)*cos(7*pi/16) */
+void vp9_short_idct8x8_c(int16_t *input, int16_t *output, int pitch) {
+  int16_t out[8 * 8];
+  int16_t *outptr = out;
+  const int half_pitch = pitch >> 1;
+  int i, j;
+  int16_t temp_in[8], temp_out[8];
 
-/* row (horizontal) IDCT
- *
- * 7                       pi         1 dst[k] = sum c[l] * src[l] * cos( -- *
- * ( k + - ) * l ) l=0                      8          2
- *
- * where: c[0]    = 128 c[1..7] = 128*sqrt(2) */
-
-static void idctrow(int *blk) {
-  int x0, x1, x2, x3, x4, x5, x6, x7, x8;
-  /* shortcut */
-  if (!((x1 = blk[4] << 11) | (x2 = blk[6]) | (x3 = blk[2]) |
-        (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3]))) {
-    blk[0] = blk[1] = blk[2] = blk[3] = blk[4]
-                                        = blk[5] = blk[6] = blk[7] = blk[0] << 3;
-    return;
+  // Rows
+  for (i = 0; i < 8; ++i) {
+    idct8_1d(input, outptr);
+    input += 8;
+    outptr += 8;
   }
 
-  x0 = (blk[0] << 11) + 128;    /* for proper rounding in the fourth stage */
-  /* first stage */
-  x8 = W7 * (x4 + x5);
-  x4 = x8 + (W1 - W7) * x4;
-  x5 = x8 - (W1 + W7) * x5;
-  x8 = W3 * (x6 + x7);
-  x6 = x8 - (W3 - W5) * x6;
-  x7 = x8 - (W3 + W5) * x7;
-
-  /* second stage */
-  x8 = x0 + x1;
-  x0 -= x1;
-  x1 = W6 * (x3 + x2);
-  x2 = x1 - (W2 + W6) * x2;
-  x3 = x1 + (W2 - W6) * x3;
-  x1 = x4 + x6;
-  x4 -= x6;
-  x6 = x5 + x7;
-  x5 -= x7;
-
-  /* third stage */
-  x7 = x8 + x3;
-  x8 -= x3;
-  x3 = x0 + x2;
-  x0 -= x2;
-  x2 = (181 * (x4 + x5) + 128) >> 8;
-  x4 = (181 * (x4 - x5) + 128) >> 8;
-
-  /* fourth stage */
-  blk[0] = (x7 + x1) >> 8;
-  blk[1] = (x3 + x2) >> 8;
-  blk[2] = (x0 + x4) >> 8;
-  blk[3] = (x8 + x6) >> 8;
-  blk[4] = (x8 - x6) >> 8;
-  blk[5] = (x0 - x4) >> 8;
-  blk[6] = (x3 - x2) >> 8;
-  blk[7] = (x7 - x1) >> 8;
-}
-
-/* column (vertical) IDCT
- *
- * 7                         pi         1 dst[8*k] = sum c[l] * src[8*l] *
- * cos( -- * ( k + - ) * l ) l=0                        8          2
- *
- * where: c[0]    = 1/1024 c[1..7] = (1/1024)*sqrt(2) */
-static void idctcol(int *blk) {
-  int x0, x1, x2, x3, x4, x5, x6, x7, x8;
-
-  /* shortcut */
-  if (!((x1 = (blk[8 * 4] << 8)) | (x2 = blk[8 * 6]) | (x3 = blk[8 * 2]) |
-        (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) |
-        (x7 = blk[8 * 3]))) {
-    blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3]
-        = blk[8 * 4] = blk[8 * 5] = blk[8 * 6]
-        = blk[8 * 7] = ((blk[8 * 0] + 32) >> 6);
-    return;
+  // Columns
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      temp_in[j] = out[j * 8 + i];
+    idct8_1d(temp_in, temp_out);
+    for (j = 0; j < 8; ++j)
+      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
   }
-
-  x0 = (blk[8 * 0] << 8) + 16384;
-
-  /* first stage */
-  x8 = W7 * (x4 + x5) + 4;
-  x4 = (x8 + (W1 - W7) * x4) >> 3;
-  x5 = (x8 - (W1 + W7) * x5) >> 3;
-  x8 = W3 * (x6 + x7) + 4;
-  x6 = (x8 - (W3 - W5) * x6) >> 3;
-  x7 = (x8 - (W3 + W5) * x7) >> 3;
-
-  /* second stage */
-  x8 = x0 + x1;
-  x0 -= x1;
-  x1 = W6 * (x3 + x2) + 4;
-  x2 = (x1 - (W2 + W6) * x2) >> 3;
-  x3 = (x1 + (W2 - W6) * x3) >> 3;
-  x1 = x4 + x6;
-  x4 -= x6;
-  x6 = x5 + x7;
-  x5 -= x7;
-
-  /* third stage */
-  x7 = x8 + x3;
-  x8 -= x3;
-  x3 = x0 + x2;
-  x0 -= x2;
-  x2 = (181 * (x4 + x5) + 128) >> 8;
-  x4 = (181 * (x4 - x5) + 128) >> 8;
-
-  /* fourth stage */
-  blk[8 * 0] = (x7 + x1) >> 14;
-  blk[8 * 1] = (x3 + x2) >> 14;
-  blk[8 * 2] = (x0 + x4) >> 14;
-  blk[8 * 3] = (x8 + x6) >> 14;
-  blk[8 * 4] = (x8 - x6) >> 14;
-  blk[8 * 5] = (x0 - x4) >> 14;
-  blk[8 * 6] = (x3 - x2) >> 14;
-  blk[8 * 7] = (x7 - x1) >> 14;
 }
 
-#define TX_DIM 8
-void vp9_short_idct8x8_c(int16_t *coefs, int16_t *block, int pitch) {
-  int X[TX_DIM * TX_DIM];
-  int i, j;
-  int shortpitch = pitch >> 1;
-
-  for (i = 0; i < TX_DIM; i++) {
-    for (j = 0; j < TX_DIM; j++) {
-      X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1
-                                + (coefs[i * TX_DIM + j] < 0)) >> 2;
-    }
-  }
-  for (i = 0; i < 8; i++)
-    idctrow(X + 8 * i);
+static void iadst4_1d(int16_t *input, int16_t *output) {
+  int s0, s1, s2, s3, s4, s5, s6, s7;
 
-  for (i = 0; i < 8; i++)
-    idctcol(X + i);
+  int x0 = input[0];
+  int x1 = input[1];
+  int x2 = input[2];
+  int x3 = input[3];
 
-  for (i = 0; i < TX_DIM; i++) {
-    for (j = 0; j < TX_DIM; j++) {
-      block[i * shortpitch + j]  = X[i * TX_DIM + j] >> 1;
-    }
+  if (!(x0 | x1 | x2 | x3)) {
+    output[0] = output[1] = output[2] = output[3] = 0;
+    return;
   }
+
+  s0 = sinpi_1_9 * x0;
+  s1 = sinpi_2_9 * x0;
+  s2 = sinpi_3_9 * x1;
+  s3 = sinpi_4_9 * x2;
+  s4 = sinpi_1_9 * x2;
+  s5 = sinpi_2_9 * x3;
+  s6 = sinpi_4_9 * x3;
+  s7 = x0 - x2 + x3;
+
+  x0 = s0 + s3 + s5;
+  x1 = s1 - s4 - s6;
+  x2 = sinpi_3_9 * s7;
+  x3 = s2;
+
+  s0 = x0 + x3;
+  s1 = x1 + x3;
+  s2 = x2;
+  s3 = x0 + x1 - x3;
+
+  // 1-D transform scaling factor is sqrt(2).
+  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+  // + 1b (addition) = 29b.
+  // Hence the output bit depth is 15b.
+  output[0] = dct_const_round_shift(s0);
+  output[1] = dct_const_round_shift(s1);
+  output[2] = dct_const_round_shift(s2);
+  output[3] = dct_const_round_shift(s3);
 }
 
-/* Row IDCT when only first 4 coefficients are non-zero. */
-static void idctrow10(int *blk) {
-  int x0, x1, x2, x3, x4, x5, x6, x7, x8;
+static const transform_2d IHT_4[] = {
+  { idct4_1d,  idct4_1d  },  // DCT_DCT  = 0
+  { iadst4_1d, idct4_1d  },  // ADST_DCT = 1
+  { idct4_1d,  iadst4_1d },  // DCT_ADST = 2
+  { iadst4_1d, iadst4_1d }   // ADST_ADST = 3
+};
 
-  /* shortcut */
-  if (!((x1 = blk[4] << 11) | (x2 = blk[6]) | (x3 = blk[2]) |
-        (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3]))) {
-    blk[0] = blk[1] = blk[2] = blk[3] = blk[4]
-           = blk[5] = blk[6] = blk[7] = blk[0] << 3;
-    return;
+void vp9_short_iht4x4_c(int16_t *input, int16_t *output,
+                        int pitch, TX_TYPE tx_type) {
+  int i, j;
+  int16_t out[4 * 4];
+  int16_t *outptr = out;
+  int16_t temp_in[4], temp_out[4];
+  const transform_2d ht = IHT_4[tx_type];
+
+  // inverse transform row vectors
+  for (i = 0; i < 4; ++i) {
+    ht.rows(input, outptr);
+    input  += 4;
+    outptr += 4;
+  }
+
+  // inverse transform column vectors
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j)
+      temp_in[j] = out[j * 4 + i];
+    ht.cols(temp_in, temp_out);
+    for (j = 0; j < 4; ++j)
+      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
   }
-
-  x0 = (blk[0] << 11) + 128;    /* for proper rounding in the fourth stage */
-  /* first stage */
-  x5 = W7 * x4;
-  x4 = W1 * x4;
-  x6 = W3 * x7;
-  x7 = -W5 * x7;
-
-  /* second stage */
-  x2 = W6 * x3;
-  x3 = W2 * x3;
-  x1 = x4 + x6;
-  x4 -= x6;
-  x6 = x5 + x7;
-  x5 -= x7;
-
-  /* third stage */
-  x7 = x0 + x3;
-  x8 = x0 - x3;
-  x3 = x0 + x2;
-  x0 -= x2;
-  x2 = (181 * (x4 + x5) + 128) >> 8;
-  x4 = (181 * (x4 - x5) + 128) >> 8;
-
-  /* fourth stage */
-  blk[0] = (x7 + x1) >> 8;
-  blk[1] = (x3 + x2) >> 8;
-  blk[2] = (x0 + x4) >> 8;
-  blk[3] = (x8 + x6) >> 8;
-  blk[4] = (x8 - x6) >> 8;
-  blk[5] = (x0 - x4) >> 8;
-  blk[6] = (x3 - x2) >> 8;
-  blk[7] = (x7 - x1) >> 8;
 }
 
-/* Column (vertical) IDCT when only first 4 coefficients are non-zero. */
-static void idctcol10(int *blk) {
-  int x0, x1, x2, x3, x4, x5, x6, x7, x8;
-
-  /* shortcut */
-  if (!((x1 = (blk[8 * 4] << 8)) | (x2 = blk[8 * 6]) | (x3 = blk[8 * 2]) |
-        (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) |
-        (x7 = blk[8 * 3]))) {
-    blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3]
-        = blk[8 * 4] = blk[8 * 5] = blk[8 * 6]
-        = blk[8 * 7] = ((blk[8 * 0] + 32) >> 6);
+static void iadst8_1d(int16_t *input, int16_t *output) {
+  int s0, s1, s2, s3, s4, s5, s6, s7;
+
+  int x0 = input[7];
+  int x1 = input[0];
+  int x2 = input[5];
+  int x3 = input[2];
+  int x4 = input[3];
+  int x5 = input[4];
+  int x6 = input[1];
+  int x7 = input[6];
+
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
+    output[0] = output[1] = output[2] = output[3] = output[4]
+              = output[5] = output[6] = output[7] = 0;
     return;
   }
 
-  x0 = (blk[8 * 0] << 8) + 16384;
-
-  /* first stage */
-  x5 = (W7 * x4 + 4) >> 3;
-  x4 = (W1 * x4 + 4) >> 3;
-  x6 = (W3 * x7 + 4) >> 3;
-  x7 = (-W5 * x7 + 4) >> 3;
-
-  /* second stage */
-  x2 = (W6 * x3 + 4) >> 3;
-  x3 = (W2 * x3 + 4) >> 3;
-  x1 = x4 + x6;
-  x4 -= x6;
-  x6 = x5 + x7;
-  x5 -= x7;
-
-  /* third stage */
-  x7 = x0 + x3;
-  x8 = x0 - x3;
-  x3 = x0 + x2;
-  x0 -= x2;
-  x2 = (181 * (x4 + x5) + 128) >> 8;
-  x4 = (181 * (x4 - x5) + 128) >> 8;
-
-  /* fourth stage */
-  blk[8 * 0] = (x7 + x1) >> 14;
-  blk[8 * 1] = (x3 + x2) >> 14;
-  blk[8 * 2] = (x0 + x4) >> 14;
-  blk[8 * 3] = (x8 + x6) >> 14;
-  blk[8 * 4] = (x8 - x6) >> 14;
-  blk[8 * 5] = (x0 - x4) >> 14;
-  blk[8 * 6] = (x3 - x2) >> 14;
-  blk[8 * 7] = (x7 - x1) >> 14;
+  // stage 1
+  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
+  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
+  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
+  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
+
+  x0 = dct_const_round_shift(s0 + s4);
+  x1 = dct_const_round_shift(s1 + s5);
+  x2 = dct_const_round_shift(s2 + s6);
+  x3 = dct_const_round_shift(s3 + s7);
+  x4 = dct_const_round_shift(s0 - s4);
+  x5 = dct_const_round_shift(s1 - s5);
+  x6 = dct_const_round_shift(s2 - s6);
+  x7 = dct_const_round_shift(s3 - s7);
+
+  // stage 2
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
+  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
+  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
+  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
+
+  x0 = s0 + s2;
+  x1 = s1 + s3;
+  x2 = s0 - s2;
+  x3 = s1 - s3;
+  x4 = dct_const_round_shift(s4 + s6);
+  x5 = dct_const_round_shift(s5 + s7);
+  x6 = dct_const_round_shift(s4 - s6);
+  x7 = dct_const_round_shift(s5 - s7);
+
+  // stage 3
+  s2 = cospi_16_64 * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (x6 - x7);
+
+  x2 = dct_const_round_shift(s2);
+  x3 = dct_const_round_shift(s3);
+  x6 = dct_const_round_shift(s6);
+  x7 = dct_const_round_shift(s7);
+
+  output[0] =  x0;
+  output[1] = -x4;
+  output[2] =  x6;
+  output[3] = -x2;
+  output[4] =  x3;
+  output[5] = -x7;
+  output[6] =  x5;
+  output[7] = -x1;
 }
 
-void vp9_short_idct10_8x8_c(int16_t *coefs, int16_t *block, int pitch) {
-  int X[TX_DIM * TX_DIM];
-  int i, j;
-  int shortpitch = pitch >> 1;
-
-  for (i = 0; i < TX_DIM; i++) {
-    for (j = 0; j < TX_DIM; j++) {
-      X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1
-                                + (coefs[i * TX_DIM + j] < 0)) >> 2;
-    }
-  }
-
-  /* Do first 4 row idct only since non-zero dct coefficients are all in
-   * upper-left 4x4 area. */
-  for (i = 0; i < 4; i++)
-    idctrow10(X + 8 * i);
+static const transform_2d IHT_8[] = {
+  { idct8_1d,  idct8_1d  },  // DCT_DCT  = 0
+  { iadst8_1d, idct8_1d  },  // ADST_DCT = 1
+  { idct8_1d,  iadst8_1d },  // DCT_ADST = 2
+  { iadst8_1d, iadst8_1d }   // ADST_ADST = 3
+};
 
-  for (i = 0; i < 8; i++)
-    idctcol10(X + i);
+void vp9_short_iht8x8_c(int16_t *input, int16_t *output,
+                        int pitch, TX_TYPE tx_type) {
+  int i, j;
+  int16_t out[8 * 8];
+  int16_t *outptr = out;
+  int16_t temp_in[8], temp_out[8];
+  const transform_2d ht = IHT_8[tx_type];
 
-  for (i = 0; i < TX_DIM; i++) {
-    for (j = 0; j < TX_DIM; j++) {
-      block[i * shortpitch + j]  = X[i * TX_DIM + j] >> 1;
-    }
+  // inverse transform row vectors
+  for (i = 0; i < 8; ++i) {
+    ht.rows(input, outptr);
+    input += 8;
+    outptr += 8;
   }
-}
 
-void vp9_short_ihaar2x2_c(int16_t *input, int16_t *output, int pitch) {
-  int i;
-  int16_t *ip = input;  // 0, 1, 4, 8
-  int16_t *op = output;
-  for (i = 0; i < 16; i++) {
-    op[i] = 0;
+  // inverse transform column vectors
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      temp_in[j] = out[j * 8 + i];
+    ht.cols(temp_in, temp_out);
+    for (j = 0; j < 8; ++j)
+      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
   }
-
-  op[0] = (ip[0] + ip[1] + ip[4] + ip[8] + 1) >> 1;
-  op[1] = (ip[0] - ip[1] + ip[4] - ip[8]) >> 1;
-  op[4] = (ip[0] + ip[1] - ip[4] - ip[8]) >> 1;
-  op[8] = (ip[0] - ip[1] - ip[4] + ip[8]) >> 1;
 }
 
+void vp9_short_idct10_8x8_c(int16_t *input, int16_t *output, int pitch) {
+  int16_t out[8 * 8];
+  int16_t *outptr = out;
+  const int half_pitch = pitch >> 1;
+  int i, j;
+  int16_t temp_in[8], temp_out[8];
 
-#if 0
-// Keep a really bad float version as reference for now.
-void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {
-
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-  {
-    double x;
-    const int short_pitch = pitch >> 1;
-    int i, j, k, l;
-    for (l = 0; l < 16; ++l) {
-      for (k = 0; k < 16; ++k) {
-        double s = 0;
-        for (i = 0; i < 16; ++i) {
-          for (j = 0; j < 16; ++j) {
-            x=cos(PI*j*(l+0.5)/16.0)*cos(PI*i*(k+0.5)/16.0)*input[i*16+j]/32;
-            if (i != 0)
-              x *= sqrt(2.0);
-            if (j != 0)
-              x *= sqrt(2.0);
-            s += x;
-          }
-        }
-        output[k*short_pitch+l] = (short)round(s);
-      }
-    }
-  }
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-}
-#endif
-
-#define TEST_INT_16x16_IDCT 1
-#if !TEST_INT_16x16_IDCT
-
-static void butterfly_16x16_idct_1d(double input[16], double output[16]) {
-
-  static const double C1 = 0.995184726672197;
-  static const double C2 = 0.98078528040323;
-  static const double C3 = 0.956940335732209;
-  static const double C4 = 0.923879532511287;
-  static const double C5 = 0.881921264348355;
-  static const double C6 = 0.831469612302545;
-  static const double C7 = 0.773010453362737;
-  static const double C8 = 0.707106781186548;
-  static const double C9 = 0.634393284163646;
-  static const double C10 = 0.555570233019602;
-  static const double C11 = 0.471396736825998;
-  static const double C12 = 0.38268343236509;
-  static const double C13 = 0.290284677254462;
-  static const double C14 = 0.195090322016128;
-  static const double C15 = 0.098017140329561;
-
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-  {
-    double step[16];
-    double intermediate[16];
-    double temp1, temp2;
-
-
-    // step 1 and 2
-    step[ 0] = input[0] + input[8];
-    step[ 1] = input[0] - input[8];
-
-    temp1 = input[4]*C12;
-    temp2 = input[12]*C4;
-
-    temp1 -= temp2;
-    temp1 *= C8;
-
-    step[ 2] = 2*(temp1);
-
-    temp1 = input[4]*C4;
-    temp2 = input[12]*C12;
-    temp1 += temp2;
-    temp1 = (temp1);
-    temp1 *= C8;
-    step[ 3] = 2*(temp1);
-
-    temp1 = input[2]*C8;
-    temp1 = 2*(temp1);
-    temp2 = input[6] + input[10];
-
-    step[ 4] = temp1 + temp2;
-    step[ 5] = temp1 - temp2;
-
-    temp1 = input[14]*C8;
-    temp1 = 2*(temp1);
-    temp2 = input[6] - input[10];
-
-    step[ 6] = temp2 - temp1;
-    step[ 7] = temp2 + temp1;
-
-    // for odd input
-    temp1 = input[3]*C12;
-    temp2 = input[13]*C4;
-    temp1 += temp2;
-    temp1 = (temp1);
-    temp1 *= C8;
-    intermediate[ 8] = 2*(temp1);
-
-    temp1 = input[3]*C4;
-    temp2 = input[13]*C12;
-    temp2 -= temp1;
-    temp2 = (temp2);
-    temp2 *= C8;
-    intermediate[ 9] = 2*(temp2);
-
-    intermediate[10] = 2*(input[9]*C8);
-    intermediate[11] = input[15] - input[1];
-    intermediate[12] = input[15] + input[1];
-    intermediate[13] = 2*((input[7]*C8));
-
-    temp1 = input[11]*C12;
-    temp2 = input[5]*C4;
-    temp2 -= temp1;
-    temp2 = (temp2);
-    temp2 *= C8;
-    intermediate[14] = 2*(temp2);
-
-    temp1 = input[11]*C4;
-    temp2 = input[5]*C12;
-    temp1 += temp2;
-    temp1 = (temp1);
-    temp1 *= C8;
-    intermediate[15] = 2*(temp1);
-
-    step[ 8] = intermediate[ 8] + intermediate[14];
-    step[ 9] = intermediate[ 9] + intermediate[15];
-    step[10] = intermediate[10] + intermediate[11];
-    step[11] = intermediate[10] - intermediate[11];
-    step[12] = intermediate[12] + intermediate[13];
-    step[13] = intermediate[12] - intermediate[13];
-    step[14] = intermediate[ 8] - intermediate[14];
-    step[15] = intermediate[ 9] - intermediate[15];
-
-    // step 3
-    output[0] = step[ 0] + step[ 3];
-    output[1] = step[ 1] + step[ 2];
-    output[2] = step[ 1] - step[ 2];
-    output[3] = step[ 0] - step[ 3];
-
-    temp1 = step[ 4]*C14;
-    temp2 = step[ 7]*C2;
-    temp1 -= temp2;
-    output[4] =  (temp1);
-
-    temp1 = step[ 4]*C2;
-    temp2 = step[ 7]*C14;
-    temp1 += temp2;
-    output[7] =  (temp1);
-
-    temp1 = step[ 5]*C10;
-    temp2 = step[ 6]*C6;
-    temp1 -= temp2;
-    output[5] =  (temp1);
-
-    temp1 = step[ 5]*C6;
-    temp2 = step[ 6]*C10;
-    temp1 += temp2;
-    output[6] =  (temp1);
-
-    output[8] = step[ 8] + step[11];
-    output[9] = step[ 9] + step[10];
-    output[10] = step[ 9] - step[10];
-    output[11] = step[ 8] - step[11];
-    output[12] = step[12] + step[15];
-    output[13] = step[13] + step[14];
-    output[14] = step[13] - step[14];
-    output[15] = step[12] - step[15];
-
-    // output 4
-    step[ 0] = output[0] + output[7];
-    step[ 1] = output[1] + output[6];
-    step[ 2] = output[2] + output[5];
-    step[ 3] = output[3] + output[4];
-    step[ 4] = output[3] - output[4];
-    step[ 5] = output[2] - output[5];
-    step[ 6] = output[1] - output[6];
-    step[ 7] = output[0] - output[7];
-
-    temp1 = output[8]*C7;
-    temp2 = output[15]*C9;
-    temp1 -= temp2;
-    step[ 8] = (temp1);
-
-    temp1 = output[9]*C11;
-    temp2 = output[14]*C5;
-    temp1 += temp2;
-    step[ 9] = (temp1);
-
-    temp1 = output[10]*C3;
-    temp2 = output[13]*C13;
-    temp1 -= temp2;
-    step[10] = (temp1);
-
-    temp1 = output[11]*C15;
-    temp2 = output[12]*C1;
-    temp1 += temp2;
-    step[11] = (temp1);
-
-    temp1 = output[11]*C1;
-    temp2 = output[12]*C15;
-    temp2 -= temp1;
-    step[12] = (temp2);
-
-    temp1 = output[10]*C13;
-    temp2 = output[13]*C3;
-    temp1 += temp2;
-    step[13] = (temp1);
-
-    temp1 = output[9]*C5;
-    temp2 = output[14]*C11;
-    temp2 -= temp1;
-    step[14] = (temp2);
-
-    temp1 = output[8]*C9;
-    temp2 = output[15]*C7;
-    temp1 += temp2;
-    step[15] = (temp1);
-
-    // step 5
-    output[0] = (step[0] + step[15]);
-    output[1] = (step[1] + step[14]);
-    output[2] = (step[2] + step[13]);
-    output[3] = (step[3] + step[12]);
-    output[4] = (step[4] + step[11]);
-    output[5] = (step[5] + step[10]);
-    output[6] = (step[6] + step[ 9]);
-    output[7] = (step[7] + step[ 8]);
-
-    output[15] = (step[0] - step[15]);
-    output[14] = (step[1] - step[14]);
-    output[13] = (step[2] - step[13]);
-    output[12] = (step[3] - step[12]);
-    output[11] = (step[4] - step[11]);
-    output[10] = (step[5] - step[10]);
-    output[9] = (step[6] - step[ 9]);
-    output[8] = (step[7] - step[ 8]);
+  vpx_memset(out, 0, sizeof(out));
+  // First transform rows
+  // only the first 4 rows have non-zero coefs
+  for (i = 0; i < 4; ++i) {
+    idct8_1d(input, outptr);
+    input += 8;
+    outptr += 8;
   }
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-}
 
-// Remove once an int version of iDCT is written
-#if 0
-void reference_16x16_idct_1d(double input[16], double output[16]) {
-
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-  {
-    const double kPi = 3.141592653589793238462643383279502884;
-    const double kSqrt2 = 1.414213562373095048801688724209698;
-    for (int k = 0; k < 16; k++) {
-      output[k] = 0.0;
-      for (int n = 0; n < 16; n++) {
-        output[k] += input[n]*cos(kPi*(2*k+1)*n/32.0);
-        if (n == 0)
-          output[k] = output[k]/kSqrt2;
-      }
-    }
+  // Then transform columns
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      temp_in[j] = out[j * 8 + i];
+    idct8_1d(temp_in, temp_out);
+    for (j = 0; j < 8; ++j)
+      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
   }
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
 }
-#endif
 
-void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {
-
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-  {
-    double out[16*16], out2[16*16];
-    const int short_pitch = pitch >> 1;
-    int i, j;
-      // First transform rows
-    for (i = 0; i < 16; ++i) {
-      double temp_in[16], temp_out[16];
-      for (j = 0; j < 16; ++j)
-        temp_in[j] = input[j + i*short_pitch];
-      butterfly_16x16_idct_1d(temp_in, temp_out);
-      for (j = 0; j < 16; ++j)
-        out[j + i*16] = temp_out[j];
-    }
-    // Then transform columns
-    for (i = 0; i < 16; ++i) {
-      double temp_in[16], temp_out[16];
-      for (j = 0; j < 16; ++j)
-        temp_in[j] = out[j*16 + i];
-      butterfly_16x16_idct_1d(temp_in, temp_out);
-      for (j = 0; j < 16; ++j)
-        out2[j*16 + i] = temp_out[j];
-    }
-    for (i = 0; i < 16*16; ++i)
-      output[i] = round(out2[i]/128);
-  }
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
+void vp9_short_idct1_8x8_c(int16_t *input, int16_t *output) {
+  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+  out = dct_const_round_shift(out * cospi_16_64);
+  output[0] = ROUND_POWER_OF_TWO(out, 5);
 }
 
-#else
-
-#define INITIAL_SHIFT 2
-#define INITIAL_ROUNDING (1 << (INITIAL_SHIFT - 1))
-#define RIGHT_SHIFT 14
-#define RIGHT_ROUNDING (1 << (RIGHT_SHIFT - 1))
-
-static const int16_t C1 = 16305;
-static const int16_t C2 = 16069;
-static const int16_t C3 = 15679;
-static const int16_t C4 = 15137;
-static const int16_t C5 = 14449;
-static const int16_t C6 = 13623;
-static const int16_t C7 = 12665;
-static const int16_t C8 = 11585;
-static const int16_t C9 = 10394;
-static const int16_t C10 = 9102;
-static const int16_t C11 = 7723;
-static const int16_t C12 = 6270;
-static const int16_t C13 = 4756;
-static const int16_t C14 = 3196;
-static const int16_t C15 = 1606;
-
-static void butterfly_16x16_idct_1d(int16_t input[16], int16_t output[16],
-                                    int last_shift_bits) {
-  int16_t step[16];
-  int intermediate[16];
+static void idct16_1d(int16_t *input, int16_t *output) {
+  int16_t step1[16], step2[16];
   int temp1, temp2;
 
-  int step1_shift = RIGHT_SHIFT + INITIAL_SHIFT;
-  int step1_rounding = 1 << (step1_shift - 1);
-  int last_rounding = 0;
-
-  if (last_shift_bits > 0)
-    last_rounding = 1 << (last_shift_bits - 1);
-
-  // step 1 and 2
-  step[ 0] = (input[0] + input[8] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
-  step[ 1] = (input[0] - input[8] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
-
-  temp1 = input[4] * C12;
-  temp2 = input[12] * C4;
-  temp1 = (temp1 - temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-  temp1  *= C8;
-  step[ 2] = (2 * (temp1) + step1_rounding) >> step1_shift;
-
-  temp1 = input[4] * C4;
-  temp2 = input[12] * C12;
-  temp1 = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-  temp1 *= C8;
-  step[ 3] = (2 * (temp1) + step1_rounding) >> step1_shift;
-
-  temp1 = input[2] * C8;
-  temp1 = (2 * (temp1) +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-  temp2 = input[6] + input[10];
-  step[ 4] = (temp1 + temp2 + INITIAL_ROUNDING) >> INITIAL_SHIFT;
-  step[ 5] = (temp1 - temp2 + INITIAL_ROUNDING) >> INITIAL_SHIFT;
-
-  temp1 = input[14] * C8;
-  temp1 = (2 * (temp1) +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-  temp2 = input[6] - input[10];
-  step[ 6] = (temp2 - temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT;
-  step[ 7] = (temp2 + temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT;
-
-  // for odd input
-  temp1 = input[3] * C12;
-  temp2 = input[13] * C4;
-  temp1 = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-  temp1 *= C8;
-  intermediate[ 8] = (2 * (temp1) +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-
-  temp1 = input[3] * C4;
-  temp2 = input[13] * C12;
-  temp2 = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
-  temp2 *= C8;
-  intermediate[ 9] = (2 * (temp2) +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-
-  intermediate[10] = (2 * (input[9] * C8) + RIGHT_ROUNDING) >> RIGHT_SHIFT;
-  intermediate[11] = input[15] - input[1];
-  intermediate[12] = input[15] + input[1];
-  intermediate[13] = (2 * (input[7] * C8) + RIGHT_ROUNDING) >> RIGHT_SHIFT;
-
-  temp1 = input[11] * C12;
-  temp2 = input[5] * C4;
-  temp2 = (temp2 - temp1 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-  temp2 *= C8;
-  intermediate[14] = (2 * (temp2) +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-
-  temp1 = input[11] * C4;
-  temp2 = input[5] * C12;
-  temp1 = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-  temp1 *= C8;
-  intermediate[15] = (2 * (temp1) +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-
-  step[ 8] = (intermediate[ 8] + intermediate[14] + INITIAL_ROUNDING)
-      >> INITIAL_SHIFT;
-  step[ 9] = (intermediate[ 9] + intermediate[15] + INITIAL_ROUNDING)
-      >> INITIAL_SHIFT;
-  step[10] = (intermediate[10] + intermediate[11] + INITIAL_ROUNDING)
-      >> INITIAL_SHIFT;
-  step[11] = (intermediate[10] - intermediate[11] + INITIAL_ROUNDING)
-      >> INITIAL_SHIFT;
-  step[12] = (intermediate[12] + intermediate[13] + INITIAL_ROUNDING)
-      >> INITIAL_SHIFT;
-  step[13] = (intermediate[12] - intermediate[13] + INITIAL_ROUNDING)
-      >> INITIAL_SHIFT;
-  step[14] = (intermediate[ 8] - intermediate[14] + INITIAL_ROUNDING)
-      >> INITIAL_SHIFT;
-  step[15] = (intermediate[ 9] - intermediate[15] + INITIAL_ROUNDING)
-      >> INITIAL_SHIFT;
-
-  // step 3
-  output[0] = step[ 0] + step[ 3];
-  output[1] = step[ 1] + step[ 2];
-  output[2] = step[ 1] - step[ 2];
-  output[3] = step[ 0] - step[ 3];
-
-  temp1 = step[ 4] * C14;
-  temp2 = step[ 7] * C2;
-  output[4] =  (temp1 - temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-
-  temp1 = step[ 4] * C2;
-  temp2 = step[ 7] * C14;
-  output[7] =  (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-
-  temp1 = step[ 5] * C10;
-  temp2 = step[ 6] * C6;
-  output[5] =  (temp1 - temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-
-  temp1 = step[ 5] * C6;
-  temp2 = step[ 6] * C10;
-  output[6] =  (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-
-  output[8] = step[ 8] + step[11];
-  output[9] = step[ 9] + step[10];
-  output[10] = step[ 9] - step[10];
-  output[11] = step[ 8] - step[11];
-  output[12] = step[12] + step[15];
-  output[13] = step[13] + step[14];
-  output[14] = step[13] - step[14];
-  output[15] = step[12] - step[15];
-
-  // output 4
-  step[ 0] = output[0] + output[7];
-  step[ 1] = output[1] + output[6];
-  step[ 2] = output[2] + output[5];
-  step[ 3] = output[3] + output[4];
-  step[ 4] = output[3] - output[4];
-  step[ 5] = output[2] - output[5];
-  step[ 6] = output[1] - output[6];
-  step[ 7] = output[0] - output[7];
-
-  temp1 = output[8] * C7;
-  temp2 = output[15] * C9;
-  step[ 8] = (temp1 - temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-
-  temp1 = output[9] * C11;
-  temp2 = output[14] * C5;
-  step[ 9] = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-
-  temp1 = output[10] * C3;
-  temp2 = output[13] * C13;
-  step[10] = (temp1 - temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-
-  temp1 = output[11] * C15;
-  temp2 = output[12] * C1;
-  step[11] = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-
-  temp1 = output[11] * C1;
-  temp2 = output[12] * C15;
-  step[12] = (temp2 - temp1 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-
-  temp1 = output[10] * C13;
-  temp2 = output[13] * C3;
-  step[13] = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-
-  temp1 = output[9] * C5;
-  temp2 = output[14] * C11;
-  step[14] = (temp2 - temp1 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-
-  temp1 = output[8] * C9;
-  temp2 = output[15] * C7;
-  step[15] = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-
-  // step 5
-  output[0] = (step[0] + step[15] + last_rounding) >> last_shift_bits;
-  output[1] = (step[1] + step[14] + last_rounding) >> last_shift_bits;
-  output[2] = (step[2] + step[13] + last_rounding) >> last_shift_bits;
-  output[3] = (step[3] + step[12] + last_rounding) >> last_shift_bits;
-  output[4] = (step[4] + step[11] + last_rounding) >> last_shift_bits;
-  output[5] = (step[5] + step[10] + last_rounding) >> last_shift_bits;
-  output[6] = (step[6] + step[ 9] + last_rounding) >> last_shift_bits;
-  output[7] = (step[7] + step[ 8] + last_rounding) >> last_shift_bits;
-
-  output[15] = (step[0] - step[15] + last_rounding) >> last_shift_bits;
-  output[14] = (step[1] - step[14] + last_rounding) >> last_shift_bits;
-  output[13] = (step[2] - step[13] + last_rounding) >> last_shift_bits;
-  output[12] = (step[3] - step[12] + last_rounding) >> last_shift_bits;
-  output[11] = (step[4] - step[11] + last_rounding) >> last_shift_bits;
-  output[10] = (step[5] - step[10] + last_rounding) >> last_shift_bits;
-  output[9] = (step[6] - step[ 9] + last_rounding) >> last_shift_bits;
-  output[8] = (step[7] - step[ 8] + last_rounding) >> last_shift_bits;
+  // stage 1
+  step1[0] = input[0/2];
+  step1[1] = input[16/2];
+  step1[2] = input[8/2];
+  step1[3] = input[24/2];
+  step1[4] = input[4/2];
+  step1[5] = input[20/2];
+  step1[6] = input[12/2];
+  step1[7] = input[28/2];
+  step1[8] = input[2/2];
+  step1[9] = input[18/2];
+  step1[10] = input[10/2];
+  step1[11] = input[26/2];
+  step1[12] = input[6/2];
+  step1[13] = input[22/2];
+  step1[14] = input[14/2];
+  step1[15] = input[30/2];
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[4] = step1[4];
+  step2[5] = step1[5];
+  step2[6] = step1[6];
+  step2[7] = step1[7];
+
+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+  step2[8] = dct_const_round_shift(temp1);
+  step2[15] = dct_const_round_shift(temp2);
+
+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+  step2[9] = dct_const_round_shift(temp1);
+  step2[14] = dct_const_round_shift(temp2);
+
+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+  step2[10] = dct_const_round_shift(temp1);
+  step2[13] = dct_const_round_shift(temp2);
+
+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+  step2[11] = dct_const_round_shift(temp1);
+  step2[12] = dct_const_round_shift(temp2);
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+
+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+  step1[4] = dct_const_round_shift(temp1);
+  step1[7] = dct_const_round_shift(temp2);
+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+  step1[5] = dct_const_round_shift(temp1);
+  step1[6] = dct_const_round_shift(temp2);
+
+  step1[8] = step2[8] + step2[9];
+  step1[9] = step2[8] - step2[9];
+  step1[10] = -step2[10] + step2[11];
+  step1[11] = step2[10] + step2[11];
+  step1[12] = step2[12] + step2[13];
+  step1[13] = step2[12] - step2[13];
+  step1[14] = -step2[14] + step2[15];
+  step1[15] = step2[14] + step2[15];
+  // stage 4
+  temp1 = (step1[0] + step1[1]) * cospi_16_64;
+  temp2 = (step1[0] - step1[1]) * cospi_16_64;
+  step2[0] = dct_const_round_shift(temp1);
+  step2[1] = dct_const_round_shift(temp2);
+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = dct_const_round_shift(temp1);
+  step2[3] = dct_const_round_shift(temp2);
+  step2[4] = step1[4] + step1[5];
+  step2[5] = step1[4] - step1[5];
+  step2[6] = -step1[6] + step1[7];
+  step2[7] = step1[6] + step1[7];
+
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+  step2[9] = dct_const_round_shift(temp1);
+  step2[14] = dct_const_round_shift(temp2);
+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+  step2[10] = dct_const_round_shift(temp1);
+  step2[13] = dct_const_round_shift(temp2);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  // stage 5
+  step1[0] = step2[0] + step2[3];
+  step1[1] = step2[1] + step2[2];
+  step1[2] = step2[1] - step2[2];
+  step1[3] = step2[0] - step2[3];
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = dct_const_round_shift(temp1);
+  step1[6] = dct_const_round_shift(temp2);
+  step1[7] = step2[7];
+
+  step1[8] = step2[8] + step2[11];
+  step1[9] = step2[9] + step2[10];
+  step1[10] = step2[9] - step2[10];
+  step1[11] = step2[8] - step2[11];
+  step1[12] = -step2[12] + step2[15];
+  step1[13] = -step2[13] + step2[14];
+  step1[14] = step2[13] + step2[14];
+  step1[15] = step2[12] + step2[15];
+
+  // stage 6
+  step2[0] = step1[0] + step1[7];
+  step2[1] = step1[1] + step1[6];
+  step2[2] = step1[2] + step1[5];
+  step2[3] = step1[3] + step1[4];
+  step2[4] = step1[3] - step1[4];
+  step2[5] = step1[2] - step1[5];
+  step2[6] = step1[1] - step1[6];
+  step2[7] = step1[0] - step1[7];
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+  temp2 = (step1[10] + step1[13]) * cospi_16_64;
+  step2[10] = dct_const_round_shift(temp1);
+  step2[13] = dct_const_round_shift(temp2);
+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+  temp2 = (step1[11] + step1[12]) * cospi_16_64;
+  step2[11] = dct_const_round_shift(temp1);
+  step2[12] = dct_const_round_shift(temp2);
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7
+  output[0] = step2[0] + step2[15];
+  output[1] = step2[1] + step2[14];
+  output[2] = step2[2] + step2[13];
+  output[3] = step2[3] + step2[12];
+  output[4] = step2[4] + step2[11];
+  output[5] = step2[5] + step2[10];
+  output[6] = step2[6] + step2[9];
+  output[7] = step2[7] + step2[8];
+  output[8] = step2[7] - step2[8];
+  output[9] = step2[6] - step2[9];
+  output[10] = step2[5] - step2[10];
+  output[11] = step2[4] - step2[11];
+  output[12] = step2[3] - step2[12];
+  output[13] = step2[2] - step2[13];
+  output[14] = step2[1] - step2[14];
+  output[15] = step2[0] - step2[15];
 }
 
 void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {
   int16_t out[16 * 16];
-  int16_t *outptr = &out[0];
-  const int short_pitch = pitch >> 1;
+  int16_t *outptr = out;
+  const int half_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[16], temp_out[16];
 
   // First transform rows
   for (i = 0; i < 16; ++i) {
-    butterfly_16x16_idct_1d(input, outptr, 0);
-    input += short_pitch;
+    idct16_1d(input, outptr);
+    input += half_pitch;
     outptr += 16;
   }
 
@@ -1367,144 +653,219 @@ void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {
   for (i = 0; i < 16; ++i) {
     for (j = 0; j < 16; ++j)
       temp_in[j] = out[j * 16 + i];
-    butterfly_16x16_idct_1d(temp_in, temp_out, 3);
+    idct16_1d(temp_in, temp_out);
     for (j = 0; j < 16; ++j)
-        output[j * 16 + i] = temp_out[j];
-    }
+      output[j * 16 + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+  }
 }
 
-/* The following function is called when we know the maximum number of non-zero
- * dct coefficients is less or equal 10.
- */
-static void butterfly_16x16_idct10_1d(int16_t input[16], int16_t output[16],
-                                      int last_shift_bits) {
-    int16_t step[16] = {0};
-    int intermediate[16] = {0};
-    int temp1, temp2;
-    int last_rounding = 0;
-
-    if (last_shift_bits > 0)
-      last_rounding = 1 << (last_shift_bits - 1);
-
-    // step 1 and 2
-    step[ 0] = (input[0] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
-    step[ 1] = (input[0] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
-
-    temp1 = (2 * (input[2] * C8) + RIGHT_ROUNDING) >> RIGHT_SHIFT;
-    step[ 4] = (temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT;
-    step[ 5] = (temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT;
-
-    // for odd input
-    temp1 = (input[3] * C12 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
-    temp1 *= C8;
-    intermediate[ 8] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT;
-
-    temp1 = (-input[3] * C4 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
-    temp1 *= C8;
-    intermediate[ 9] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT;
-
-    step[ 8] = (intermediate[ 8] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
-    step[ 9] = (intermediate[ 9] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
-    step[10] = (-input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
-    step[11] = (input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
-    step[12] = (input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
-    step[13] = (input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
-    step[14] = (intermediate[ 8] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
-    step[15] = (intermediate[ 9] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
-
-    // step 3
-    output[0] = step[ 0];
-    output[1] = step[ 1];
-    output[2] = step[ 1];
-    output[3] = step[ 0];
-
-    temp1 = step[ 4] * C14;
-    output[4] =  (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
-
-    temp1 = step[ 4] * C2;
-    output[7] =  (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
-
-    temp1 = step[ 5] * C10;
-    output[5] =  (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
-
-    temp1 = step[ 5] * C6;
-    output[6] =  (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
-
-    output[8] = step[ 8] + step[11];
-    output[9] = step[ 9] + step[10];
-    output[10] = step[ 9] - step[10];
-    output[11] = step[ 8] - step[11];
-    output[12] = step[12] + step[15];
-    output[13] = step[13] + step[14];
-    output[14] = step[13] - step[14];
-    output[15] = step[12] - step[15];
-
-    // output 4
-    step[ 0] = output[0] + output[7];
-    step[ 1] = output[1] + output[6];
-    step[ 2] = output[2] + output[5];
-    step[ 3] = output[3] + output[4];
-    step[ 4] = output[3] - output[4];
-    step[ 5] = output[2] - output[5];
-    step[ 6] = output[1] - output[6];
-    step[ 7] = output[0] - output[7];
-
-    temp1 = output[8] * C7;
-    temp2 = output[15] * C9;
-    step[ 8] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
-
-    temp1 = output[9] * C11;
-    temp2 = output[14] * C5;
-    step[ 9] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
-
-    temp1 = output[10] * C3;
-    temp2 = output[13] * C13;
-    step[10] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
-
-    temp1 = output[11] * C15;
-    temp2 = output[12] * C1;
-    step[11] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
-
-    temp1 = output[11] * C1;
-    temp2 = output[12] * C15;
-    step[12] = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
-
-    temp1 = output[10] * C13;
-    temp2 = output[13] * C3;
-    step[13] = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-
-    temp1 = output[9] * C5;
-    temp2 = output[14] * C11;
-    step[14] = (temp2 - temp1 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-
-    temp1 = output[8] * C9;
-    temp2 = output[15] * C7;
-    step[15] = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-
-    // step 5
-    output[0] = (step[0] + step[15] + last_rounding) >> last_shift_bits;
-    output[1] = (step[1] + step[14] + last_rounding) >> last_shift_bits;
-    output[2] = (step[2] + step[13] + last_rounding) >> last_shift_bits;
-    output[3] = (step[3] + step[12] + last_rounding) >> last_shift_bits;
-    output[4] = (step[4] + step[11] + last_rounding) >> last_shift_bits;
-    output[5] = (step[5] + step[10] + last_rounding) >> last_shift_bits;
-    output[6] = (step[6] + step[ 9] + last_rounding) >> last_shift_bits;
-    output[7] = (step[7] + step[ 8] + last_rounding) >> last_shift_bits;
-
-    output[15] = (step[0] - step[15] + last_rounding) >> last_shift_bits;
-    output[14] = (step[1] - step[14] + last_rounding) >> last_shift_bits;
-    output[13] = (step[2] - step[13] + last_rounding) >> last_shift_bits;
-    output[12] = (step[3] - step[12] + last_rounding) >> last_shift_bits;
-    output[11] = (step[4] - step[11] + last_rounding) >> last_shift_bits;
-    output[10] = (step[5] - step[10] + last_rounding) >> last_shift_bits;
-    output[9] = (step[6] - step[ 9] + last_rounding) >> last_shift_bits;
-    output[8] = (step[7] - step[ 8] + last_rounding) >> last_shift_bits;
+void iadst16_1d(int16_t *input, int16_t *output) {
+  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
+  // 16-point 1-D inverse ADST (per the name); reads input[0..15] into locals
+  int x0 = input[15];  // x[2k] = input[15 - 2k], x[2k + 1] = input[2k]
+  int x1 = input[0];
+  int x2 = input[13];
+  int x3 = input[2];
+  int x4 = input[11];
+  int x5 = input[4];
+  int x6 = input[9];
+  int x7 = input[6];
+  int x8 = input[7];
+  int x9 = input[8];
+  int x10 = input[5];
+  int x11 = input[10];
+  int x12 = input[3];
+  int x13 = input[12];
+  int x14 = input[1];
+  int x15 = input[14];
+
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
+           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {  // all inputs zero
+    output[0] = output[1] = output[2] = output[3] = output[4]
+              = output[5] = output[6] = output[7] = output[8]
+              = output[9] = output[10] = output[11] = output[12]
+              = output[13] = output[14] = output[15] = 0;
+    return;  // output already zeroed; skip the arithmetic
+  }
+
+  // stage 1: mix each (x[2k], x[2k + 1]) pair, then add/sub s[k], s[k + 8]
+  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
+  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
+  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
+  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
+  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
+
+  x0 = dct_const_round_shift(s0 + s8);  // presumably drops the cospi scaling
+  x1 = dct_const_round_shift(s1 + s9);
+  x2 = dct_const_round_shift(s2 + s10);
+  x3 = dct_const_round_shift(s3 + s11);
+  x4 = dct_const_round_shift(s4 + s12);
+  x5 = dct_const_round_shift(s5 + s13);
+  x6 = dct_const_round_shift(s6 + s14);
+  x7 = dct_const_round_shift(s7 + s15);
+  x8  = dct_const_round_shift(s0 - s8);
+  x9  = dct_const_round_shift(s1 - s9);
+  x10 = dct_const_round_shift(s2 - s10);
+  x11 = dct_const_round_shift(s3 - s11);
+  x12 = dct_const_round_shift(s4 - s12);
+  x13 = dct_const_round_shift(s5 - s13);
+  x14 = dct_const_round_shift(s6 - s14);
+  x15 = dct_const_round_shift(s7 - s15);
+
+  // stage 2: s0..s7 pass through, s8..s15 are mixed; add/sub at distance 4
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4;
+  s5 = x5;
+  s6 = x6;
+  s7 = x7;
+  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
+  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
+  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
+  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
+  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
+  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
+  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
+  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
+
+  x0 = s0 + s4;  // no multiply on s0..s7 here, so no round shift
+  x1 = s1 + s5;
+  x2 = s2 + s6;
+  x3 = s3 + s7;
+  x4 = s0 - s4;
+  x5 = s1 - s5;
+  x6 = s2 - s6;
+  x7 = s3 - s7;
+  x8 = dct_const_round_shift(s8 + s12);
+  x9 = dct_const_round_shift(s9 + s13);
+  x10 = dct_const_round_shift(s10 + s14);
+  x11 = dct_const_round_shift(s11 + s15);
+  x12 = dct_const_round_shift(s8 - s12);
+  x13 = dct_const_round_shift(s9 - s13);
+  x14 = dct_const_round_shift(s10 - s14);
+  x15 = dct_const_round_shift(s11 - s15);
+
+  // stage 3: only s4..s7 and s12..s15 are mixed; add/sub at distance 2
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
+  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
+  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
+  s8 = x8;
+  s9 = x9;
+  s10 = x10;
+  s11 = x11;
+  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
+  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
+  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
+
+  x0 = s0 + s2;
+  x1 = s1 + s3;
+  x2 = s0 - s2;
+  x3 = s1 - s3;
+  x4 = dct_const_round_shift(s4 + s6);
+  x5 = dct_const_round_shift(s5 + s7);
+  x6 = dct_const_round_shift(s4 - s6);
+  x7 = dct_const_round_shift(s5 - s7);
+  x8 = s8 + s10;
+  x9 = s9 + s11;
+  x10 = s8 - s10;
+  x11 = s9 - s11;
+  x12 = dct_const_round_shift(s12 + s14);
+  x13 = dct_const_round_shift(s13 + s15);
+  x14 = dct_const_round_shift(s12 - s14);
+  x15 = dct_const_round_shift(s13 - s15);
+
+  // stage 4: pair sums/diffs (2,3) (6,7) (10,11) (14,15) times +/-cospi_16_64
+  s2 = (- cospi_16_64) * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (- x6 + x7);
+  s10 = cospi_16_64 * (x10 + x11);
+  s11 = cospi_16_64 * (- x10 + x11);
+  s14 = (- cospi_16_64) * (x14 + x15);
+  s15 = cospi_16_64 * (x14 - x15);
+
+  x2 = dct_const_round_shift(s2);
+  x3 = dct_const_round_shift(s3);
+  x6 = dct_const_round_shift(s6);
+  x7 = dct_const_round_shift(s7);
+  x10 = dct_const_round_shift(s10);
+  x11 = dct_const_round_shift(s11);
+  x14 = dct_const_round_shift(s14);
+  x15 = dct_const_round_shift(s15);
+
+  output[0] =  x0;  // results are written permuted, some negated
+  output[1] = -x8;
+  output[2] =  x12;
+  output[3] = -x4;
+  output[4] =  x6;
+  output[5] =  x14;
+  output[6] =  x10;
+  output[7] =  x2;
+  output[8] =  x3;
+  output[9] =  x11;
+  output[10] =  x15;
+  output[11] =  x7;
+  output[12] =  x5;
+  output[13] = -x13;
+  output[14] =  x9;
+  output[15] = -x1;
+}
+
+static const transform_2d IHT_16[] = {  // indexed by tx_type (0..3)
+  { idct16_1d,  idct16_1d  },  // DCT_DCT  = 0
+  { iadst16_1d, idct16_1d  },  // ADST_DCT = 1
+  { idct16_1d,  iadst16_1d },  // DCT_ADST = 2
+  { iadst16_1d, iadst16_1d }   // ADST_ADST = 3
+};
+
+void vp9_short_iht16x16_c(int16_t *input, int16_t *output,
+                          int input_pitch, TX_TYPE tx_type) {
+  int i, j;  // 16x16 2-D transform: 1-D rows first, then 1-D columns
+  int16_t out[16 * 16];
+  int16_t *outptr = out;
+  int16_t temp_in[16], temp_out[16];
+  const transform_2d ht = IHT_16[tx_type];  // tx_type is not range-checked
+
+  // Rows: ht.rows on each of the 16 input rows, results packed into out[]
+  for (i = 0; i < 16; ++i) {
+    ht.rows(input, outptr);
+    input += input_pitch;  // advances by int16_t elements, not bytes
+    outptr += 16;
+  }
+
+  // Columns: ht.cols on column i of out[]; output is written with stride 16
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = out[j * 16 + i];
+    ht.cols(temp_in, temp_out);
+    for (j = 0; j < 16; ++j)
+      output[j * 16 + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+  }
 }
 
 void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) {
     int16_t out[16 * 16];
-    int16_t *outptr = &out[0];
-    const int short_pitch = pitch >> 1;
+    int16_t *outptr = out;
+    const int half_pitch = pitch >> 1;
     int i, j;
     int16_t temp_in[16], temp_out[16];
 
@@ -1513,8 +874,8 @@ void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) {
      */
     vpx_memset(out, 0, sizeof(out));
     for (i = 0; i < 4; ++i) {
-      butterfly_16x16_idct10_1d(input, outptr, 0);
-      input += short_pitch;
+      idct16_1d(input, outptr);
+      input += half_pitch;
       outptr += 16;
     }
 
@@ -1522,1149 +883,439 @@ void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) {
     for (i = 0; i < 16; ++i) {
       for (j = 0; j < 16; ++j)
         temp_in[j] = out[j*16 + i];
-      butterfly_16x16_idct10_1d(temp_in, temp_out, 3);
+      idct16_1d(temp_in, temp_out);
       for (j = 0; j < 16; ++j)
-        output[j*16 + i] = temp_out[j];
+        output[j*16 + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
     }
 }
-#undef INITIAL_SHIFT
-#undef INITIAL_ROUNDING
-#undef RIGHT_SHIFT
-#undef RIGHT_ROUNDING
-#endif
-
-#if !CONFIG_DWTDCTHYBRID
-#define DownshiftMultiplyBy2(x) x * 2
-#define DownshiftMultiply(x) x
-
-static void idct16(double *input, double *output, int stride) {
-  static const double C1 = 0.995184726672197;
-  static const double C2 = 0.98078528040323;
-  static const double C3 = 0.956940335732209;
-  static const double C4 = 0.923879532511287;
-  static const double C5 = 0.881921264348355;
-  static const double C6 = 0.831469612302545;
-  static const double C7 = 0.773010453362737;
-  static const double C8 = 0.707106781186548;
-  static const double C9 = 0.634393284163646;
-  static const double C10 = 0.555570233019602;
-  static const double C11 = 0.471396736825998;
-  static const double C12 = 0.38268343236509;
-  static const double C13 = 0.290284677254462;
-  static const double C14 = 0.195090322016128;
-  static const double C15 = 0.098017140329561;
-
-  double step[16];
-  double intermediate[16];
-  double temp1, temp2;
-
-  // step 1 and 2
-  step[ 0] = input[stride*0] + input[stride*8];
-  step[ 1] = input[stride*0] - input[stride*8];
-
-  temp1 = input[stride*4]*C12;
-  temp2 = input[stride*12]*C4;
-
-  temp1 -= temp2;
-  temp1 = DownshiftMultiply(temp1);
-  temp1 *= C8;
-
-  step[ 2] = DownshiftMultiplyBy2(temp1);
-
-  temp1 = input[stride*4]*C4;
-  temp2 = input[stride*12]*C12;
-  temp1 += temp2;
-  temp1 = DownshiftMultiply(temp1);
-  temp1 *= C8;
-  step[ 3] = DownshiftMultiplyBy2(temp1);
-
-  temp1 = input[stride*2]*C8;
-  temp1 = DownshiftMultiplyBy2(temp1);
-  temp2 = input[stride*6] + input[stride*10];
-
-  step[ 4] = temp1 + temp2;
-  step[ 5] = temp1 - temp2;
-
-  temp1 = input[stride*14]*C8;
-  temp1 = DownshiftMultiplyBy2(temp1);
-  temp2 = input[stride*6] - input[stride*10];
-
-  step[ 6] = temp2 - temp1;
-  step[ 7] = temp2 + temp1;
-
-  // for odd input
-  temp1 = input[stride*3]*C12;
-  temp2 = input[stride*13]*C4;
-  temp1 += temp2;
-  temp1 = DownshiftMultiply(temp1);
-  temp1 *= C8;
-  intermediate[ 8] = DownshiftMultiplyBy2(temp1);
-
-  temp1 = input[stride*3]*C4;
-  temp2 = input[stride*13]*C12;
-  temp2 -= temp1;
-  temp2 = DownshiftMultiply(temp2);
-  temp2 *= C8;
-  intermediate[ 9] = DownshiftMultiplyBy2(temp2);
-
-  intermediate[10] = DownshiftMultiplyBy2(input[stride*9]*C8);
-  intermediate[11] = input[stride*15] - input[stride*1];
-  intermediate[12] = input[stride*15] + input[stride*1];
-  intermediate[13] = DownshiftMultiplyBy2((input[stride*7]*C8));
-
-  temp1 = input[stride*11]*C12;
-  temp2 = input[stride*5]*C4;
-  temp2 -= temp1;
-  temp2 = DownshiftMultiply(temp2);
-  temp2 *= C8;
-  intermediate[14] = DownshiftMultiplyBy2(temp2);
-
-  temp1 = input[stride*11]*C4;
-  temp2 = input[stride*5]*C12;
-  temp1 += temp2;
-  temp1 = DownshiftMultiply(temp1);
-  temp1 *= C8;
-  intermediate[15] = DownshiftMultiplyBy2(temp1);
-
-  step[ 8] = intermediate[ 8] + intermediate[14];
-  step[ 9] = intermediate[ 9] + intermediate[15];
-  step[10] = intermediate[10] + intermediate[11];
-  step[11] = intermediate[10] - intermediate[11];
-  step[12] = intermediate[12] + intermediate[13];
-  step[13] = intermediate[12] - intermediate[13];
-  step[14] = intermediate[ 8] - intermediate[14];
-  step[15] = intermediate[ 9] - intermediate[15];
-
-  // step 3
-  output[stride*0] = step[ 0] + step[ 3];
-  output[stride*1] = step[ 1] + step[ 2];
-  output[stride*2] = step[ 1] - step[ 2];
-  output[stride*3] = step[ 0] - step[ 3];
-
-  temp1 = step[ 4]*C14;
-  temp2 = step[ 7]*C2;
-  temp1 -= temp2;
-  output[stride*4] =  DownshiftMultiply(temp1);
-
-  temp1 = step[ 4]*C2;
-  temp2 = step[ 7]*C14;
-  temp1 += temp2;
-  output[stride*7] =  DownshiftMultiply(temp1);
-
-  temp1 = step[ 5]*C10;
-  temp2 = step[ 6]*C6;
-  temp1 -= temp2;
-  output[stride*5] =  DownshiftMultiply(temp1);
-
-  temp1 = step[ 5]*C6;
-  temp2 = step[ 6]*C10;
-  temp1 += temp2;
-  output[stride*6] =  DownshiftMultiply(temp1);
-
-  output[stride*8] = step[ 8] + step[11];
-  output[stride*9] = step[ 9] + step[10];
-  output[stride*10] = step[ 9] - step[10];
-  output[stride*11] = step[ 8] - step[11];
-  output[stride*12] = step[12] + step[15];
-  output[stride*13] = step[13] + step[14];
-  output[stride*14] = step[13] - step[14];
-  output[stride*15] = step[12] - step[15];
-
-  // output 4
-  step[ 0] = output[stride*0] + output[stride*7];
-  step[ 1] = output[stride*1] + output[stride*6];
-  step[ 2] = output[stride*2] + output[stride*5];
-  step[ 3] = output[stride*3] + output[stride*4];
-  step[ 4] = output[stride*3] - output[stride*4];
-  step[ 5] = output[stride*2] - output[stride*5];
-  step[ 6] = output[stride*1] - output[stride*6];
-  step[ 7] = output[stride*0] - output[stride*7];
-
-  temp1 = output[stride*8]*C7;
-  temp2 = output[stride*15]*C9;
-  temp1 -= temp2;
-  step[ 8] = DownshiftMultiply(temp1);
-
-  temp1 = output[stride*9]*C11;
-  temp2 = output[stride*14]*C5;
-  temp1 += temp2;
-  step[ 9] = DownshiftMultiply(temp1);
-
-  temp1 = output[stride*10]*C3;
-  temp2 = output[stride*13]*C13;
-  temp1 -= temp2;
-  step[10] = DownshiftMultiply(temp1);
-
-  temp1 = output[stride*11]*C15;
-  temp2 = output[stride*12]*C1;
-  temp1 += temp2;
-  step[11] = DownshiftMultiply(temp1);
-
-  temp1 = output[stride*11]*C1;
-  temp2 = output[stride*12]*C15;
-  temp2 -= temp1;
-  step[12] = DownshiftMultiply(temp2);
-
-  temp1 = output[stride*10]*C13;
-  temp2 = output[stride*13]*C3;
-  temp1 += temp2;
-  step[13] = DownshiftMultiply(temp1);
-
-  temp1 = output[stride*9]*C5;
-  temp2 = output[stride*14]*C11;
-  temp2 -= temp1;
-  step[14] = DownshiftMultiply(temp2);
-
-  temp1 = output[stride*8]*C9;
-  temp2 = output[stride*15]*C7;
-  temp1 += temp2;
-  step[15] = DownshiftMultiply(temp1);
-
-  // step 5
-  output[stride*0] = step[0] + step[15];
-  output[stride*1] = step[1] + step[14];
-  output[stride*2] = step[2] + step[13];
-  output[stride*3] = step[3] + step[12];
-  output[stride*4] = step[4] + step[11];
-  output[stride*5] = step[5] + step[10];
-  output[stride*6] = step[6] + step[ 9];
-  output[stride*7] = step[7] + step[ 8];
-
-  output[stride*15] = step[0] - step[15];
-  output[stride*14] = step[1] - step[14];
-  output[stride*13] = step[2] - step[13];
-  output[stride*12] = step[3] - step[12];
-  output[stride*11] = step[4] - step[11];
-  output[stride*10] = step[5] - step[10];
-  output[stride*9] = step[6] - step[ 9];
-  output[stride*8] = step[7] - step[ 8];
-}
 
-static void butterfly_32_idct_1d(double *input, double *output, int stride) {
-  static const double C1 = 0.998795456205;  // cos(pi * 1 / 64)
-  static const double C3 = 0.989176509965;  // cos(pi * 3 / 64)
-  static const double C5 = 0.970031253195;  // cos(pi * 5 / 64)
-  static const double C7 = 0.941544065183;  // cos(pi * 7 / 64)
-  static const double C9 = 0.903989293123;  // cos(pi * 9 / 64)
-  static const double C11 = 0.857728610000;  // cos(pi * 11 / 64)
-  static const double C13 = 0.803207531481;  // cos(pi * 13 / 64)
-  static const double C15 = 0.740951125355;  // cos(pi * 15 / 64)
-  static const double C16 = 0.707106781187;  // cos(pi * 16 / 64)
-  static const double C17 = 0.671558954847;  // cos(pi * 17 / 64)
-  static const double C19 = 0.595699304492;  // cos(pi * 19 / 64)
-  static const double C21 = 0.514102744193;  // cos(pi * 21 / 64)
-  static const double C23 = 0.427555093430;  // cos(pi * 23 / 64)
-  static const double C25 = 0.336889853392;  // cos(pi * 25 / 64)
-  static const double C27 = 0.242980179903;  // cos(pi * 27 / 64)
-  static const double C29 = 0.146730474455;  // cos(pi * 29 / 64)
-  static const double C31 = 0.049067674327;  // cos(pi * 31 / 64)
-
-  double step1[32];
-  double step2[32];
-
-  step1[ 0] = input[stride*0];
-  step1[ 1] = input[stride*2];
-  step1[ 2] = input[stride*4];
-  step1[ 3] = input[stride*6];
-  step1[ 4] = input[stride*8];
-  step1[ 5] = input[stride*10];
-  step1[ 6] = input[stride*12];
-  step1[ 7] = input[stride*14];
-  step1[ 8] = input[stride*16];
-  step1[ 9] = input[stride*18];
-  step1[10] = input[stride*20];
-  step1[11] = input[stride*22];
-  step1[12] = input[stride*24];
-  step1[13] = input[stride*26];
-  step1[14] = input[stride*28];
-  step1[15] = input[stride*30];
-
-  step1[16] = DownshiftMultiplyBy2(input[stride*1]*C16);
-  step1[17] = (input[stride*3] + input[stride*1]);
-  step1[18] = (input[stride*5] + input[stride*3]);
-  step1[19] = (input[stride*7] + input[stride*5]);
-  step1[20] = (input[stride*9] + input[stride*7]);
-  step1[21] = (input[stride*11] + input[stride*9]);
-  step1[22] = (input[stride*13] + input[stride*11]);
-  step1[23] = (input[stride*15] + input[stride*13]);
-  step1[24] = (input[stride*17] + input[stride*15]);
-  step1[25] = (input[stride*19] + input[stride*17]);
-  step1[26] = (input[stride*21] + input[stride*19]);
-  step1[27] = (input[stride*23] + input[stride*21]);
-  step1[28] = (input[stride*25] + input[stride*23]);
-  step1[29] = (input[stride*27] + input[stride*25]);
-  step1[30] = (input[stride*29] + input[stride*27]);
-  step1[31] = (input[stride*31] + input[stride*29]);
-
-  idct16(step1, step2, 1);
-  idct16(step1 + 16, step2 + 16, 1);
-
-  step2[16] = DownshiftMultiply(step2[16] / (2*C1));
-  step2[17] = DownshiftMultiply(step2[17] / (2*C3));
-  step2[18] = DownshiftMultiply(step2[18] / (2*C5));
-  step2[19] = DownshiftMultiply(step2[19] / (2*C7));
-  step2[20] = DownshiftMultiply(step2[20] / (2*C9));
-  step2[21] = DownshiftMultiply(step2[21] / (2*C11));
-  step2[22] = DownshiftMultiply(step2[22] / (2*C13));
-  step2[23] = DownshiftMultiply(step2[23] / (2*C15));
-  step2[24] = DownshiftMultiply(step2[24] / (2*C17));
-  step2[25] = DownshiftMultiply(step2[25] / (2*C19));
-  step2[26] = DownshiftMultiply(step2[26] / (2*C21));
-  step2[27] = DownshiftMultiply(step2[27] / (2*C23));
-  step2[28] = DownshiftMultiply(step2[28] / (2*C25));
-  step2[29] = DownshiftMultiply(step2[29] / (2*C27));
-  step2[30] = DownshiftMultiply(step2[30] / (2*C29));
-  step2[31] = DownshiftMultiply(step2[31] / (2*C31));
-
-  output[stride* 0] = step2[ 0] + step2[16];
-  output[stride* 1] = step2[ 1] + step2[17];
-  output[stride* 2] = step2[ 2] + step2[18];
-  output[stride* 3] = step2[ 3] + step2[19];
-  output[stride* 4] = step2[ 4] + step2[20];
-  output[stride* 5] = step2[ 5] + step2[21];
-  output[stride* 6] = step2[ 6] + step2[22];
-  output[stride* 7] = step2[ 7] + step2[23];
-  output[stride* 8] = step2[ 8] + step2[24];
-  output[stride* 9] = step2[ 9] + step2[25];
-  output[stride*10] = step2[10] + step2[26];
-  output[stride*11] = step2[11] + step2[27];
-  output[stride*12] = step2[12] + step2[28];
-  output[stride*13] = step2[13] + step2[29];
-  output[stride*14] = step2[14] + step2[30];
-  output[stride*15] = step2[15] + step2[31];
-  output[stride*16] = step2[15] - step2[(31 - 0)];
-  output[stride*17] = step2[14] - step2[(31 - 1)];
-  output[stride*18] = step2[13] - step2[(31 - 2)];
-  output[stride*19] = step2[12] - step2[(31 - 3)];
-  output[stride*20] = step2[11] - step2[(31 - 4)];
-  output[stride*21] = step2[10] - step2[(31 - 5)];
-  output[stride*22] = step2[ 9] - step2[(31 - 6)];
-  output[stride*23] = step2[ 8] - step2[(31 - 7)];
-  output[stride*24] = step2[ 7] - step2[(31 - 8)];
-  output[stride*25] = step2[ 6] - step2[(31 - 9)];
-  output[stride*26] = step2[ 5] - step2[(31 - 10)];
-  output[stride*27] = step2[ 4] - step2[(31 - 11)];
-  output[stride*28] = step2[ 3] - step2[(31 - 12)];
-  output[stride*29] = step2[ 2] - step2[(31 - 13)];
-  output[stride*30] = step2[ 1] - step2[(31 - 14)];
-  output[stride*31] = step2[ 0] - step2[(31 - 15)];
-}
 
-void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-  {
-    double out[32*32], out2[32*32];
-    const int short_pitch = pitch >> 1;
-    int i, j;
-    // First transform rows
-    for (i = 0; i < 32; ++i) {
-      double temp_in[32], temp_out[32];
-      for (j = 0; j < 32; ++j)
-        temp_in[j] = input[j + i*short_pitch];
-      butterfly_32_idct_1d(temp_in, temp_out, 1);
-      for (j = 0; j < 32; ++j)
-        out[j + i*32] = temp_out[j];
-    }
-    // Then transform columns
-    for (i = 0; i < 32; ++i) {
-      double temp_in[32], temp_out[32];
-      for (j = 0; j < 32; ++j)
-        temp_in[j] = out[j*32 + i];
-      butterfly_32_idct_1d(temp_in, temp_out, 1);
-      for (j = 0; j < 32; ++j)
-        out2[j*32 + i] = temp_out[j];
-    }
-    for (i = 0; i < 32*32; ++i)
-      output[i] = round(out2[i]/128);
-  }
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
+void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output) {
+  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);  // input[0] only
+  out = dct_const_round_shift(out * cospi_16_64);  // scaled a second time
+  output[0] = ROUND_POWER_OF_TWO(out, 6);  // writes output[0] only
 }
 
-#else  // !CONFIG_DWTDCTHYBRID
-
-#if DWT_TYPE == 53
-
-// Note: block length must be even for this implementation
-static void synthesis_53_row(int length, int16_t *lowpass, int16_t *highpass,
-                             int16_t *x) {
-  int16_t r, *a, *b;
-  int n;
-
-  n = length >> 1;
-  b = highpass;
-  a = lowpass;
-  r = *highpass;
-  while (n--) {
-    *a++ -= (r + (*b) + 1) >> 1;
-    r = *b++;
-  }
-
-  n = length >> 1;
-  b = highpass;
-  a = lowpass;
-  while (--n) {
-    *x++ = ((r = *a++) + 1) >> 1;
-    *x++ = *b++ + ((r + (*a) + 2) >> 2);
-  }
-  *x++ = ((r = *a) + 1) >> 1;
-  *x++ = *b + ((r + 1) >> 1);
-}
-
-static void synthesis_53_col(int length, int16_t *lowpass, int16_t *highpass,
-                             int16_t *x) {
-  int16_t r, *a, *b;
-  int n;
-
-  n = length >> 1;
-  b = highpass;
-  a = lowpass;
-  r = *highpass;
-  while (n--) {
-    *a++ -= (r + (*b) + 1) >> 1;
-    r = *b++;
-  }
-
-  n = length >> 1;
-  b = highpass;
-  a = lowpass;
-  while (--n) {
-    r = *a++;
-    *x++ = r;
-    *x++ = ((*b++) << 1) + ((r + (*a) + 1) >> 1);
-  }
-  *x++ = *a;
-  *x++ = ((*b) << 1) + *a;
-}
-
-static void dyadic_synthesize_53(int levels, int width, int height, int16_t *c,
-                                 int pitch_c, int16_t *x, int pitch_x) {
-  int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width;
-  short buffer[2 * DWT_MAX_LENGTH];
-
-  th[0] = hh;
-  tw[0] = hw;
-  for (i = 1; i <= levels; i++) {
-    th[i] = (th[i - 1] + 1) >> 1;
-    tw[i] = (tw[i - 1] + 1) >> 1;
-  }
-  for (lv = levels - 1; lv >= 0; lv--) {
-    nh = th[lv];
-    nw = tw[lv];
-    hh = th[lv + 1];
-    hw = tw[lv + 1];
-    if ((nh < 2) || (nw < 2)) continue;
-    for (j = 0; j < nw; j++) {
-      for (i = 0; i < nh; i++)
-        buffer[i] = c[i * pitch_c + j];
-      synthesis_53_col(nh, buffer, buffer + hh, buffer + nh);
-      for (i = 0; i < nh; i++)
-        c[i * pitch_c + j] = buffer[i + nh];
-    }
-    for (i = 0; i < nh; i++) {
-      memcpy(buffer, &c[i * pitch_c], nw * sizeof(*buffer));
-      synthesis_53_row(nw, buffer, buffer + hw, &c[i * pitch_c]);
-    }
-  }
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      x[i * pitch_x + j] = c[i * pitch_c + j] >= 0 ?
-          ((c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS) :
-          -((-c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS);
-    }
-  }
-}
-
-#elif DWT_TYPE == 26
-
-// Note: block length must be even for this implementation
-static void synthesis_26_row(int length, int16_t *lowpass, int16_t *highpass,
-                             int16_t *x) {
-  int16_t r, s, *a, *b;
-  int i, n = length >> 1;
-
-  if (n >= 4) {
-    a = lowpass;
-    b = highpass;
-    r = *lowpass;
-    while (--n) {
-      *b++ += (r - a[1] + 4) >> 3;
-      r = *a++;
-    }
-    *b += (r - *a + 4) >> 3;
-  }
-  a = lowpass;
-  b = highpass;
-  for (i = length >> 1; i; i--) {
-    s = *b++;
-    r = *a++;
-    *x++ = (r + s + 1) >> 1;
-    *x++ = (r - s + 1) >> 1;
-  }
-}
-
-static void synthesis_26_col(int length, int16_t *lowpass, int16_t *highpass,
-                             int16_t *x) {
-  int16_t r, s, *a, *b;
-  int i, n = length >> 1;
-
-  if (n >= 4) {
-    a = lowpass;
-    b = highpass;
-    r = *lowpass;
-    while (--n) {
-      *b++ += (r - a[1] + 4) >> 3;
-      r = *a++;
-    }
-    *b += (r - *a + 4) >> 3;
-  }
-  a = lowpass;
-  b = highpass;
-  for (i = length >> 1; i; i--) {
-    s = *b++;
-    r = *a++;
-    *x++ = r + s;
-    *x++ = r - s;
-  }
-}
-
-static void dyadic_synthesize_26(int levels, int width, int height, int16_t *c,
-                                 int pitch_c, int16_t *x, int pitch_x) {
-  int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width;
-  int16_t buffer[2 * DWT_MAX_LENGTH];
-
-  th[0] = hh;
-  tw[0] = hw;
-  for (i = 1; i <= levels; i++) {
-    th[i] = (th[i - 1] + 1) >> 1;
-    tw[i] = (tw[i - 1] + 1) >> 1;
-  }
-  for (lv = levels - 1; lv >= 0; lv--) {
-    nh = th[lv];
-    nw = tw[lv];
-    hh = th[lv + 1];
-    hw = tw[lv + 1];
-    if ((nh < 2) || (nw < 2)) continue;
-    for (j = 0; j < nw; j++) {
-      for (i = 0; i < nh; i++)
-        buffer[i] = c[i * pitch_c + j];
-      synthesis_26_col(nh, buffer, buffer + hh, buffer + nh);
-      for (i = 0; i < nh; i++)
-        c[i * pitch_c + j] = buffer[i + nh];
-    }
-    for (i = 0; i < nh; i++) {
-      memcpy(buffer, &c[i * pitch_c], nw * sizeof(*buffer));
-      synthesis_26_row(nw, buffer, buffer + hw, &c[i * pitch_c]);
-    }
-  }
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      x[i * pitch_x + j] = c[i * pitch_c + j] >= 0 ?
-          ((c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS) :
-          -((-c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS);
-    }
-  }
-}
-
-#elif DWT_TYPE == 97
-
-static void synthesis_97(int length, double *lowpass, double *highpass,
-                         double *x) {
-  static const double a_predict1 = -1.586134342;
-  static const double a_update1 = -0.05298011854;
-  static const double a_predict2 = 0.8829110762;
-  static const double a_update2 = 0.4435068522;
-  static const double s_low = 1.149604398;
-  static const double s_high = 1/1.149604398;
-  static const double inv_s_low = 1 / s_low;
-  static const double inv_s_high = 1 / s_high;
-  int i;
-  double y[DWT_MAX_LENGTH];
-  // Undo pack and scale
-  for (i = 0; i < length / 2; i++) {
-    y[i * 2] = lowpass[i] * inv_s_low;
-    y[i * 2 + 1] = highpass[i] * inv_s_high;
-  }
-  memcpy(x, y, sizeof(*y) * length);
-  // Undo update 2
-  for (i = 2; i < length; i += 2) {
-    x[i] -= a_update2 * (x[i-1] + x[i+1]);
-  }
-  x[0] -= 2 * a_update2 * x[1];
-  // Undo predict 2
-  for (i = 1; i < length - 2; i += 2) {
-    x[i] -= a_predict2 * (x[i - 1] + x[i + 1]);
-  }
-  x[length - 1] -= 2 * a_predict2 * x[length - 2];
-  // Undo update 1
-  for (i = 2; i < length; i += 2) {
-    x[i] -= a_update1 * (x[i - 1] + x[i + 1]);
-  }
-  x[0] -= 2 * a_update1 * x[1];
-  // Undo predict 1
-  for (i = 1; i < length - 2; i += 2) {
-    x[i] -= a_predict1 * (x[i - 1] + x[i + 1]);
-  }
-  x[length - 1] -= 2 * a_predict1 * x[length - 2];
-}
-
-static void dyadic_synthesize_97(int levels, int width, int height, int16_t *c,
-                                 int pitch_c, int16_t *x, int pitch_x) {
-  int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width;
-  double buffer[2 * DWT_MAX_LENGTH];
-  double y[DWT_MAX_LENGTH * DWT_MAX_LENGTH];
-
-  th[0] = hh;
-  tw[0] = hw;
-  for (i = 1; i <= levels; i++) {
-    th[i] = (th[i - 1] + 1) >> 1;
-    tw[i] = (tw[i - 1] + 1) >> 1;
-  }
-  for (lv = levels - 1; lv >= 0; lv--) {
-    nh = th[lv];
-    nw = tw[lv];
-    hh = th[lv + 1];
-    hw = tw[lv + 1];
-    if ((nh < 2) || (nw < 2)) continue;
-    for (j = 0; j < nw; j++) {
-      for (i = 0; i < nh; i++)
-        buffer[i] = c[i * pitch_c + j];
-      synthesis_97(nh, buffer, buffer + hh, buffer + nh);
-      for (i = 0; i < nh; i++)
-        y[i * DWT_MAX_LENGTH + j] = buffer[i + nh];
-    }
-    for (i = 0; i < nh; i++) {
-      memcpy(buffer, &y[i * DWT_MAX_LENGTH], nw * sizeof(*buffer));
-      synthesis_97(nw, buffer, buffer + hw, &y[i * DWT_MAX_LENGTH]);
-    }
-  }
-  for (i = 0; i < height; i++)
-    for (j = 0; j < width; j++)
-      x[i * pitch_x + j] = round(y[i * DWT_MAX_LENGTH + j] /
-                                 (1 << DWT_PRECISION_BITS));
-}
-
-#endif  // DWT_TYPE
-
-// TODO(debargha): Implement scaling differently so as not to have to use the
-// floating point 16x16 dct
-static void butterfly_16x16_idct_1d_f(double input[16], double output[16]) {
-  static const double C1 = 0.995184726672197;
-  static const double C2 = 0.98078528040323;
-  static const double C3 = 0.956940335732209;
-  static const double C4 = 0.923879532511287;
-  static const double C5 = 0.881921264348355;
-  static const double C6 = 0.831469612302545;
-  static const double C7 = 0.773010453362737;
-  static const double C8 = 0.707106781186548;
-  static const double C9 = 0.634393284163646;
-  static const double C10 = 0.555570233019602;
-  static const double C11 = 0.471396736825998;
-  static const double C12 = 0.38268343236509;
-  static const double C13 = 0.290284677254462;
-  static const double C14 = 0.195090322016128;
-  static const double C15 = 0.098017140329561;
-
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-  {
-    double step[16];
-    double intermediate[16];
-    double temp1, temp2;
-
-
-    // step 1 and 2
-    step[ 0] = input[0] + input[8];
-    step[ 1] = input[0] - input[8];
-
-    temp1 = input[4]*C12;
-    temp2 = input[12]*C4;
-
-    temp1 -= temp2;
-    temp1 *= C8;
-
-    step[ 2] = 2*(temp1);
-
-    temp1 = input[4]*C4;
-    temp2 = input[12]*C12;
-    temp1 += temp2;
-    temp1 = (temp1);
-    temp1 *= C8;
-    step[ 3] = 2*(temp1);
-
-    temp1 = input[2]*C8;
-    temp1 = 2*(temp1);
-    temp2 = input[6] + input[10];
-
-    step[ 4] = temp1 + temp2;
-    step[ 5] = temp1 - temp2;
-
-    temp1 = input[14]*C8;
-    temp1 = 2*(temp1);
-    temp2 = input[6] - input[10];
-
-    step[ 6] = temp2 - temp1;
-    step[ 7] = temp2 + temp1;
-
-    // for odd input
-    temp1 = input[3]*C12;
-    temp2 = input[13]*C4;
-    temp1 += temp2;
-    temp1 = (temp1);
-    temp1 *= C8;
-    intermediate[ 8] = 2*(temp1);
-
-    temp1 = input[3]*C4;
-    temp2 = input[13]*C12;
-    temp2 -= temp1;
-    temp2 = (temp2);
-    temp2 *= C8;
-    intermediate[ 9] = 2*(temp2);
-
-    intermediate[10] = 2*(input[9]*C8);
-    intermediate[11] = input[15] - input[1];
-    intermediate[12] = input[15] + input[1];
-    intermediate[13] = 2*((input[7]*C8));
-
-    temp1 = input[11]*C12;
-    temp2 = input[5]*C4;
-    temp2 -= temp1;
-    temp2 = (temp2);
-    temp2 *= C8;
-    intermediate[14] = 2*(temp2);
-
-    temp1 = input[11]*C4;
-    temp2 = input[5]*C12;
-    temp1 += temp2;
-    temp1 = (temp1);
-    temp1 *= C8;
-    intermediate[15] = 2*(temp1);
-
-    step[ 8] = intermediate[ 8] + intermediate[14];
-    step[ 9] = intermediate[ 9] + intermediate[15];
-    step[10] = intermediate[10] + intermediate[11];
-    step[11] = intermediate[10] - intermediate[11];
-    step[12] = intermediate[12] + intermediate[13];
-    step[13] = intermediate[12] - intermediate[13];
-    step[14] = intermediate[ 8] - intermediate[14];
-    step[15] = intermediate[ 9] - intermediate[15];
-
-    // step 3
-    output[0] = step[ 0] + step[ 3];
-    output[1] = step[ 1] + step[ 2];
-    output[2] = step[ 1] - step[ 2];
-    output[3] = step[ 0] - step[ 3];
-
-    temp1 = step[ 4]*C14;
-    temp2 = step[ 7]*C2;
-    temp1 -= temp2;
-    output[4] =  (temp1);
-
-    temp1 = step[ 4]*C2;
-    temp2 = step[ 7]*C14;
-    temp1 += temp2;
-    output[7] =  (temp1);
-
-    temp1 = step[ 5]*C10;
-    temp2 = step[ 6]*C6;
-    temp1 -= temp2;
-    output[5] =  (temp1);
-
-    temp1 = step[ 5]*C6;
-    temp2 = step[ 6]*C10;
-    temp1 += temp2;
-    output[6] =  (temp1);
-
-    output[8] = step[ 8] + step[11];
-    output[9] = step[ 9] + step[10];
-    output[10] = step[ 9] - step[10];
-    output[11] = step[ 8] - step[11];
-    output[12] = step[12] + step[15];
-    output[13] = step[13] + step[14];
-    output[14] = step[13] - step[14];
-    output[15] = step[12] - step[15];
-
-    // output 4
-    step[ 0] = output[0] + output[7];
-    step[ 1] = output[1] + output[6];
-    step[ 2] = output[2] + output[5];
-    step[ 3] = output[3] + output[4];
-    step[ 4] = output[3] - output[4];
-    step[ 5] = output[2] - output[5];
-    step[ 6] = output[1] - output[6];
-    step[ 7] = output[0] - output[7];
-
-    temp1 = output[8]*C7;
-    temp2 = output[15]*C9;
-    temp1 -= temp2;
-    step[ 8] = (temp1);
-
-    temp1 = output[9]*C11;
-    temp2 = output[14]*C5;
-    temp1 += temp2;
-    step[ 9] = (temp1);
-
-    temp1 = output[10]*C3;
-    temp2 = output[13]*C13;
-    temp1 -= temp2;
-    step[10] = (temp1);
-
-    temp1 = output[11]*C15;
-    temp2 = output[12]*C1;
-    temp1 += temp2;
-    step[11] = (temp1);
-
-    temp1 = output[11]*C1;
-    temp2 = output[12]*C15;
-    temp2 -= temp1;
-    step[12] = (temp2);
-
-    temp1 = output[10]*C13;
-    temp2 = output[13]*C3;
-    temp1 += temp2;
-    step[13] = (temp1);
-
-    temp1 = output[9]*C5;
-    temp2 = output[14]*C11;
-    temp2 -= temp1;
-    step[14] = (temp2);
-
-    temp1 = output[8]*C9;
-    temp2 = output[15]*C7;
-    temp1 += temp2;
-    step[15] = (temp1);
-
-    // step 5
-    output[0] = (step[0] + step[15]);
-    output[1] = (step[1] + step[14]);
-    output[2] = (step[2] + step[13]);
-    output[3] = (step[3] + step[12]);
-    output[4] = (step[4] + step[11]);
-    output[5] = (step[5] + step[10]);
-    output[6] = (step[6] + step[ 9]);
-    output[7] = (step[7] + step[ 8]);
-
-    output[15] = (step[0] - step[15]);
-    output[14] = (step[1] - step[14]);
-    output[13] = (step[2] - step[13]);
-    output[12] = (step[3] - step[12]);
-    output[11] = (step[4] - step[11]);
-    output[10] = (step[5] - step[10]);
-    output[9] = (step[6] - step[ 9]);
-    output[8] = (step[7] - step[ 8]);
-  }
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-}
-
-static void vp9_short_idct16x16_c_f(int16_t *input, int16_t *output, int pitch,
-                                    int scale) {
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-  {
-    double out[16*16], out2[16*16];
-    const int short_pitch = pitch >> 1;
-    int i, j;
-      // First transform rows
-    for (i = 0; i < 16; ++i) {
-      double temp_in[16], temp_out[16];
-      for (j = 0; j < 16; ++j)
-        temp_in[j] = input[j + i*short_pitch];
-      butterfly_16x16_idct_1d_f(temp_in, temp_out);
-      for (j = 0; j < 16; ++j)
-        out[j + i*16] = temp_out[j];
-    }
-    // Then transform columns
-    for (i = 0; i < 16; ++i) {
-      double temp_in[16], temp_out[16];
-      for (j = 0; j < 16; ++j)
-        temp_in[j] = out[j*16 + i];
-      butterfly_16x16_idct_1d_f(temp_in, temp_out);
-      for (j = 0; j < 16; ++j)
-        out2[j*16 + i] = temp_out[j];
-    }
-    for (i = 0; i < 16*16; ++i)
-      output[i] = round(out2[i] / (128 >> scale));
-  }
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-}
-
-static void idct8_1d(double *x) {
-  int i, j;
-  double t[8];
-  static const double idctmat[64] = {
-    0.35355339059327,  0.49039264020162,  0.46193976625564,  0.41573480615127,
-    0.35355339059327,   0.2777851165098,  0.19134171618254, 0.097545161008064,
-    0.35355339059327,  0.41573480615127,  0.19134171618254, -0.097545161008064,
-    -0.35355339059327, -0.49039264020161, -0.46193976625564,  -0.2777851165098,
-    0.35355339059327,   0.2777851165098, -0.19134171618254, -0.49039264020162,
-    -0.35355339059327, 0.097545161008064,  0.46193976625564,  0.41573480615127,
-    0.35355339059327, 0.097545161008063, -0.46193976625564,  -0.2777851165098,
-    0.35355339059327,  0.41573480615127, -0.19134171618254, -0.49039264020162,
-    0.35355339059327, -0.097545161008063, -0.46193976625564,   0.2777851165098,
-    0.35355339059327, -0.41573480615127, -0.19134171618255,  0.49039264020162,
-    0.35355339059327,  -0.2777851165098, -0.19134171618254,  0.49039264020161,
-    -0.35355339059327, -0.097545161008064,  0.46193976625564, -0.41573480615127,
-    0.35355339059327, -0.41573480615127,  0.19134171618254, 0.097545161008065,
-    -0.35355339059327,  0.49039264020162, -0.46193976625564,   0.2777851165098,
-    0.35355339059327, -0.49039264020162,  0.46193976625564, -0.41573480615127,
-    0.35355339059327,  -0.2777851165098,  0.19134171618255, -0.097545161008064
-  };
-  for (i = 0; i < 8; ++i) {
-    t[i] = 0;
-    for (j = 0; j < 8; ++j)
-      t[i] += idctmat[i * 8 + j] * x[j];
-  }
-  for (i = 0; i < 8; ++i) {
-    x[i] = t[i];
-  }
-}
+static void idct32_1d(int16_t *input, int16_t *output) {
+  int16_t step1[32], step2[32];
+  int temp1, temp2;
 
-static void vp9_short_idct8x8_c_f(int16_t *coefs, int16_t *block, int pitch,
-                                  int scale) {
-  double X[8 * 8], Y[8];
-  int i, j;
-  int shortpitch = pitch >> 1;
-
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-  {
-    for (i = 0; i < 8; i++) {
-      for (j = 0; j < 8; j++) {
-        X[i * 8 + j] = (double)coefs[i * shortpitch + j];
-      }
-    }
-    for (i = 0; i < 8; i++)
-      idct8_1d(X + 8 * i);
-    for (i = 0; i < 8; i++) {
-      for (j = 0; j < 8; ++j)
-        Y[j] = X[i + 8 * j];
-      idct8_1d(Y);
-      for (j = 0; j < 8; ++j)
-        X[i + 8 * j] = Y[j];
-    }
-    for (i = 0; i < 8; i++) {
-      for (j = 0; j < 8; j++) {
-        block[i * 8 + j] = (int16_t)round(X[i * 8 + j] / (8 >> scale));
-      }
-    }
-  }
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
+  // stage 1
+  step1[0] = input[0];
+  step1[1] = input[16];
+  step1[2] = input[8];
+  step1[3] = input[24];
+  step1[4] = input[4];
+  step1[5] = input[20];
+  step1[6] = input[12];
+  step1[7] = input[28];
+  step1[8] = input[2];
+  step1[9] = input[18];
+  step1[10] = input[10];
+  step1[11] = input[26];
+  step1[12] = input[6];
+  step1[13] = input[22];
+  step1[14] = input[14];
+  step1[15] = input[30];
+
+  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
+  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
+  step1[16] = dct_const_round_shift(temp1);
+  step1[31] = dct_const_round_shift(temp2);
+
+  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
+  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
+  step1[17] = dct_const_round_shift(temp1);
+  step1[30] = dct_const_round_shift(temp2);
+
+  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
+  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
+  step1[18] = dct_const_round_shift(temp1);
+  step1[29] = dct_const_round_shift(temp2);
+
+  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
+  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
+  step1[19] = dct_const_round_shift(temp1);
+  step1[28] = dct_const_round_shift(temp2);
+
+  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
+  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
+  step1[20] = dct_const_round_shift(temp1);
+  step1[27] = dct_const_round_shift(temp2);
+
+  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
+  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
+  step1[21] = dct_const_round_shift(temp1);
+  step1[26] = dct_const_round_shift(temp2);
+
+  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
+  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
+  step1[22] = dct_const_round_shift(temp1);
+  step1[25] = dct_const_round_shift(temp2);
+
+  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
+  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
+  step1[23] = dct_const_round_shift(temp1);
+  step1[24] = dct_const_round_shift(temp2);
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[4] = step1[4];
+  step2[5] = step1[5];
+  step2[6] = step1[6];
+  step2[7] = step1[7];
+
+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+  step2[8] = dct_const_round_shift(temp1);
+  step2[15] = dct_const_round_shift(temp2);
+
+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+  step2[9] = dct_const_round_shift(temp1);
+  step2[14] = dct_const_round_shift(temp2);
+
+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+  step2[10] = dct_const_round_shift(temp1);
+  step2[13] = dct_const_round_shift(temp2);
+
+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+  step2[11] = dct_const_round_shift(temp1);
+  step2[12] = dct_const_round_shift(temp2);
+
+  step2[16] = step1[16] + step1[17];
+  step2[17] = step1[16] - step1[17];
+  step2[18] = -step1[18] + step1[19];
+  step2[19] = step1[18] + step1[19];
+  step2[20] = step1[20] + step1[21];
+  step2[21] = step1[20] - step1[21];
+  step2[22] = -step1[22] + step1[23];
+  step2[23] = step1[22] + step1[23];
+  step2[24] = step1[24] + step1[25];
+  step2[25] = step1[24] - step1[25];
+  step2[26] = -step1[26] + step1[27];
+  step2[27] = step1[26] + step1[27];
+  step2[28] = step1[28] + step1[29];
+  step2[29] = step1[28] - step1[29];
+  step2[30] = -step1[30] + step1[31];
+  step2[31] = step1[30] + step1[31];
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+
+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+  step1[4] = dct_const_round_shift(temp1);
+  step1[7] = dct_const_round_shift(temp2);
+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+  step1[5] = dct_const_round_shift(temp1);
+  step1[6] = dct_const_round_shift(temp2);
+
+  step1[8] = step2[8] + step2[9];
+  step1[9] = step2[8] - step2[9];
+  step1[10] = -step2[10] + step2[11];
+  step1[11] = step2[10] + step2[11];
+  step1[12] = step2[12] + step2[13];
+  step1[13] = step2[12] - step2[13];
+  step1[14] = -step2[14] + step2[15];
+  step1[15] = step2[14] + step2[15];
+
+  step1[16] = step2[16];
+  step1[31] = step2[31];
+  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
+  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
+  step1[17] = dct_const_round_shift(temp1);
+  step1[30] = dct_const_round_shift(temp2);
+  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
+  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
+  step1[18] = dct_const_round_shift(temp1);
+  step1[29] = dct_const_round_shift(temp2);
+  step1[19] = step2[19];
+  step1[20] = step2[20];
+  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
+  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
+  step1[21] = dct_const_round_shift(temp1);
+  step1[26] = dct_const_round_shift(temp2);
+  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
+  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
+  step1[22] = dct_const_round_shift(temp1);
+  step1[25] = dct_const_round_shift(temp2);
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[27] = step2[27];
+  step1[28] = step2[28];
+
+  // stage 4
+  temp1 = (step1[0] + step1[1]) * cospi_16_64;
+  temp2 = (step1[0] - step1[1]) * cospi_16_64;
+  step2[0] = dct_const_round_shift(temp1);
+  step2[1] = dct_const_round_shift(temp2);
+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = dct_const_round_shift(temp1);
+  step2[3] = dct_const_round_shift(temp2);
+  step2[4] = step1[4] + step1[5];
+  step2[5] = step1[4] - step1[5];
+  step2[6] = -step1[6] + step1[7];
+  step2[7] = step1[6] + step1[7];
+
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+  step2[9] = dct_const_round_shift(temp1);
+  step2[14] = dct_const_round_shift(temp2);
+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+  step2[10] = dct_const_round_shift(temp1);
+  step2[13] = dct_const_round_shift(temp2);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  step2[16] = step1[16] + step1[19];
+  step2[17] = step1[17] + step1[18];
+  step2[18] = step1[17] - step1[18];
+  step2[19] = step1[16] - step1[19];
+  step2[20] = -step1[20] + step1[23];
+  step2[21] = -step1[21] + step1[22];
+  step2[22] = step1[21] + step1[22];
+  step2[23] = step1[20] + step1[23];
+
+  step2[24] = step1[24] + step1[27];
+  step2[25] = step1[25] + step1[26];
+  step2[26] = step1[25] - step1[26];
+  step2[27] = step1[24] - step1[27];
+  step2[28] = -step1[28] + step1[31];
+  step2[29] = -step1[29] + step1[30];
+  step2[30] = step1[29] + step1[30];
+  step2[31] = step1[28] + step1[31];
+
+  // stage 5
+  step1[0] = step2[0] + step2[3];
+  step1[1] = step2[1] + step2[2];
+  step1[2] = step2[1] - step2[2];
+  step1[3] = step2[0] - step2[3];
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = dct_const_round_shift(temp1);
+  step1[6] = dct_const_round_shift(temp2);
+  step1[7] = step2[7];
+
+  step1[8] = step2[8] + step2[11];
+  step1[9] = step2[9] + step2[10];
+  step1[10] = step2[9] - step2[10];
+  step1[11] = step2[8] - step2[11];
+  step1[12] = -step2[12] + step2[15];
+  step1[13] = -step2[13] + step2[14];
+  step1[14] = step2[13] + step2[14];
+  step1[15] = step2[12] + step2[15];
+
+  step1[16] = step2[16];
+  step1[17] = step2[17];
+  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
+  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
+  step1[18] = dct_const_round_shift(temp1);
+  step1[29] = dct_const_round_shift(temp2);
+  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
+  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
+  step1[19] = dct_const_round_shift(temp1);
+  step1[28] = dct_const_round_shift(temp2);
+  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
+  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
+  step1[20] = dct_const_round_shift(temp1);
+  step1[27] = dct_const_round_shift(temp2);
+  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
+  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
+  step1[21] = dct_const_round_shift(temp1);
+  step1[26] = dct_const_round_shift(temp2);
+  step1[22] = step2[22];
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[25] = step2[25];
+  step1[30] = step2[30];
+  step1[31] = step2[31];
+
+  // stage 6
+  step2[0] = step1[0] + step1[7];
+  step2[1] = step1[1] + step1[6];
+  step2[2] = step1[2] + step1[5];
+  step2[3] = step1[3] + step1[4];
+  step2[4] = step1[3] - step1[4];
+  step2[5] = step1[2] - step1[5];
+  step2[6] = step1[1] - step1[6];
+  step2[7] = step1[0] - step1[7];
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+  temp2 = (step1[10] + step1[13]) * cospi_16_64;
+  step2[10] = dct_const_round_shift(temp1);
+  step2[13] = dct_const_round_shift(temp2);
+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+  temp2 = (step1[11] + step1[12]) * cospi_16_64;
+  step2[11] = dct_const_round_shift(temp1);
+  step2[12] = dct_const_round_shift(temp2);
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  step2[16] = step1[16] + step1[23];
+  step2[17] = step1[17] + step1[22];
+  step2[18] = step1[18] + step1[21];
+  step2[19] = step1[19] + step1[20];
+  step2[20] = step1[19] - step1[20];
+  step2[21] = step1[18] - step1[21];
+  step2[22] = step1[17] - step1[22];
+  step2[23] = step1[16] - step1[23];
+
+  step2[24] = -step1[24] + step1[31];
+  step2[25] = -step1[25] + step1[30];
+  step2[26] = -step1[26] + step1[29];
+  step2[27] = -step1[27] + step1[28];
+  step2[28] = step1[27] + step1[28];
+  step2[29] = step1[26] + step1[29];
+  step2[30] = step1[25] + step1[30];
+  step2[31] = step1[24] + step1[31];
+
+  // stage 7
+  step1[0] = step2[0] + step2[15];
+  step1[1] = step2[1] + step2[14];
+  step1[2] = step2[2] + step2[13];
+  step1[3] = step2[3] + step2[12];
+  step1[4] = step2[4] + step2[11];
+  step1[5] = step2[5] + step2[10];
+  step1[6] = step2[6] + step2[9];
+  step1[7] = step2[7] + step2[8];
+  step1[8] = step2[7] - step2[8];
+  step1[9] = step2[6] - step2[9];
+  step1[10] = step2[5] - step2[10];
+  step1[11] = step2[4] - step2[11];
+  step1[12] = step2[3] - step2[12];
+  step1[13] = step2[2] - step2[13];
+  step1[14] = step2[1] - step2[14];
+  step1[15] = step2[0] - step2[15];
+
+  step1[16] = step2[16];
+  step1[17] = step2[17];
+  step1[18] = step2[18];
+  step1[19] = step2[19];
+  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
+  temp2 = (step2[20] + step2[27]) * cospi_16_64;
+  step1[20] = dct_const_round_shift(temp1);
+  step1[27] = dct_const_round_shift(temp2);
+  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
+  temp2 = (step2[21] + step2[26]) * cospi_16_64;
+  step1[21] = dct_const_round_shift(temp1);
+  step1[26] = dct_const_round_shift(temp2);
+  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
+  temp2 = (step2[22] + step2[25]) * cospi_16_64;
+  step1[22] = dct_const_round_shift(temp1);
+  step1[25] = dct_const_round_shift(temp2);
+  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
+  temp2 = (step2[23] + step2[24]) * cospi_16_64;
+  step1[23] = dct_const_round_shift(temp1);
+  step1[24] = dct_const_round_shift(temp2);
+  step1[28] = step2[28];
+  step1[29] = step2[29];
+  step1[30] = step2[30];
+  step1[31] = step2[31];
+
+  // final stage
+  output[0] = step1[0] + step1[31];
+  output[1] = step1[1] + step1[30];
+  output[2] = step1[2] + step1[29];
+  output[3] = step1[3] + step1[28];
+  output[4] = step1[4] + step1[27];
+  output[5] = step1[5] + step1[26];
+  output[6] = step1[6] + step1[25];
+  output[7] = step1[7] + step1[24];
+  output[8] = step1[8] + step1[23];
+  output[9] = step1[9] + step1[22];
+  output[10] = step1[10] + step1[21];
+  output[11] = step1[11] + step1[20];
+  output[12] = step1[12] + step1[19];
+  output[13] = step1[13] + step1[18];
+  output[14] = step1[14] + step1[17];
+  output[15] = step1[15] + step1[16];
+  output[16] = step1[15] - step1[16];
+  output[17] = step1[14] - step1[17];
+  output[18] = step1[13] - step1[18];
+  output[19] = step1[12] - step1[19];
+  output[20] = step1[11] - step1[20];
+  output[21] = step1[10] - step1[21];
+  output[22] = step1[9] - step1[22];
+  output[23] = step1[8] - step1[23];
+  output[24] = step1[7] - step1[24];
+  output[25] = step1[6] - step1[25];
+  output[26] = step1[5] - step1[26];
+  output[27] = step1[4] - step1[27];
+  output[28] = step1[3] - step1[28];
+  output[29] = step1[2] - step1[29];
+  output[30] = step1[1] - step1[30];
+  output[31] = step1[0] - step1[31];
 }
 
-#define multiply_bits(d, n) ((n) < 0 ? (d) >> (n) : (d) << (n))
-
-#if DWTDCT_TYPE == DWTDCT16X16_LEAN
-
 void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
-  // assume output is a 32x32 buffer
-  // Temporary buffer to hold a 16x16 block for 16x16 inverse dct
-  int16_t buffer[16 * 16];
-  // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt
-  int16_t buffer2[32 * 32];
-  // Note: pitch is in bytes, short_pitch is in short units
-  const int short_pitch = pitch >> 1;
+  int16_t out[32 * 32];
+  int16_t *outptr = out;
+  const int half_pitch = pitch >> 1;
   int i, j;
+  int16_t temp_in[32], temp_out[32];
 
-  // TODO(debargha): Implement more efficiently by adding output pitch
-  // argument to the idct16x16 function
-  vp9_short_idct16x16_c_f(input, buffer, pitch,
-                          1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i) {
-    vpx_memcpy(buffer2 + i * 32, buffer + i * 16, sizeof(*buffer2) * 16);
-  }
-  for (i = 0; i < 16; ++i) {
-    for (j = 16; j < 32; ++j) {
-      buffer2[i * 32 + j] =
-          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2);
-    }
-  }
-  for (i = 16; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) {
-      buffer2[i * 32 + j] =
-          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2);
-    }
+  // Rows
+  for (i = 0; i < 32; ++i) {
+    idct32_1d(input, outptr);
+    input += half_pitch;
+    outptr += 32;
   }
-#if DWT_TYPE == 26
-  dyadic_synthesize_26(1, 32, 32, buffer2, 32, output, 32);
-#elif DWT_TYPE == 97
-  dyadic_synthesize_97(1, 32, 32, buffer2, 32, output, 32);
-#elif DWT_TYPE == 53
-  dyadic_synthesize_53(1, 32, 32, buffer2, 32, output, 32);
-#endif
-}
 
-#elif DWTDCT_TYPE == DWTDCT16X16
-
-void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
-  // assume output is a 32x32 buffer
-  // Temporary buffer to hold a 16x16 block for 16x16 inverse dct
-  int16_t buffer[16 * 16];
-  // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt
-  int16_t buffer2[32 * 32];
-  // Note: pitch is in bytes, short_pitch is in short units
-  const int short_pitch = pitch >> 1;
-  int i, j;
-
-  // TODO(debargha): Implement more efficiently by adding output pitch
-  // argument to the idct16x16 function
-  vp9_short_idct16x16_c_f(input, buffer, pitch,
-                          1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i) {
-    vpx_memcpy(buffer2 + i * 32, buffer + i * 16, sizeof(*buffer2) * 16);
-  }
-  vp9_short_idct16x16_c_f(input + 16, buffer, pitch,
-                          1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i) {
-    vpx_memcpy(buffer2 + i * 32 + 16, buffer + i * 16, sizeof(*buffer2) * 16);
-  }
-  vp9_short_idct16x16_c_f(input + 16 * short_pitch, buffer, pitch,
-                          1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i) {
-    vpx_memcpy(buffer2 + i * 32 + 16 * 32, buffer + i * 16,
-               sizeof(*buffer2) * 16);
-  }
-  vp9_short_idct16x16_c_f(input + 16 * short_pitch + 16, buffer, pitch,
-                          1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i) {
-    vpx_memcpy(buffer2 + i * 32 + 16 * 33, buffer + i * 16,
-               sizeof(*buffer2) * 16);
+  // Columns
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j)
+      temp_in[j] = out[j * 32 + i];
+    idct32_1d(temp_in, temp_out);
+    for (j = 0; j < 32; ++j)
+      output[j * 32 + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
   }
-#if DWT_TYPE == 26
-  dyadic_synthesize_26(1, 32, 32, buffer2, 32, output, 32);
-#elif DWT_TYPE == 97
-  dyadic_synthesize_97(1, 32, 32, buffer2, 32, output, 32);
-#elif DWT_TYPE == 53
-  dyadic_synthesize_53(1, 32, 32, buffer2, 32, output, 32);
-#endif
 }
 
-#elif DWTDCT_TYPE == DWTDCT8X8
-
-void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
-  // assume output is a 32x32 buffer
-  // Temporary buffer to hold a 16x16 block for 16x16 inverse dct
-  int16_t buffer[8 * 8];
-  // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt
-  int16_t buffer2[32 * 32];
-  // Note: pitch is in bytes, short_pitch is in short units
-  const int short_pitch = pitch >> 1;
-  int i, j;
-
-  // TODO(debargha): Implement more efficiently by adding output pitch
-  // argument to the idct16x16 function
-  vp9_short_idct8x8_c_f(input, buffer, pitch,
-                        1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 8; ++i) {
-    vpx_memcpy(buffer2 + i * 32, buffer + i * 8, sizeof(*buffer2) * 8);
-  }
-  vp9_short_idct8x8_c_f(input + 8, buffer, pitch,
-                        1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 8; ++i) {
-    vpx_memcpy(buffer2 + i * 32 + 8, buffer + i * 8, sizeof(*buffer2) * 8);
-  }
-  vp9_short_idct8x8_c_f(input + 8 * short_pitch, buffer, pitch,
-                        1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 8; ++i) {
-    vpx_memcpy(buffer2 + i * 32 + 8 * 32, buffer + i * 8,
-               sizeof(*buffer2) * 8);
-  }
-  vp9_short_idct8x8_c_f(input + 8 * short_pitch + 8, buffer, pitch,
-                        1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 8; ++i) {
-    vpx_memcpy(buffer2 + i * 32 + 8 * 33, buffer + i * 8,
-               sizeof(*buffer2) * 8);
-  }
-  for (i = 0; i < 16; ++i) {
-    for (j = 16; j < 32; ++j) {
-      buffer2[i * 32 + j] =
-          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2);
-    }
-  }
-  for (i = 16; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) {
-      buffer2[i * 32 + j] =
-          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2);
-    }
-  }
-#if DWT_TYPE == 26
-  dyadic_synthesize_26(2, 32, 32, buffer2, 32, output, 32);
-#elif DWT_TYPE == 97
-  dyadic_synthesize_97(2, 32, 32, buffer2, 32, output, 32);
-#elif DWT_TYPE == 53
-  dyadic_synthesize_53(2, 32, 32, buffer2, 32, output, 32);
-#endif
+void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output) {
+  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+  out = dct_const_round_shift(out * cospi_16_64);
+  output[0] = ROUND_POWER_OF_TWO(out, 6);
 }
 
-#endif
-
-#if CONFIG_TX64X64
-void vp9_short_idct64x64_c(int16_t *input, int16_t *output, int pitch) {
-  // assume output is a 64x64 buffer
-  // Temporary buffer to hold a 16x16 block for 16x16 inverse dct
-  int16_t buffer[16 * 16];
-  // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt
-  int16_t buffer2[64 * 64];
-  // Note: pitch is in bytes, short_pitch is in short units
-  const int short_pitch = pitch >> 1;
+void vp9_short_idct10_32x32_c(int16_t *input, int16_t *output, int pitch) {
+  int16_t out[32 * 32];
+  int16_t *outptr = out;
+  const int half_pitch = pitch >> 1;
   int i, j;
+  int16_t temp_in[32], temp_out[32];
 
-  // TODO(debargha): Implement more efficiently by adding output pitch
-  // argument to the idct16x16 function
-  vp9_short_idct16x16_c_f(input, buffer, pitch,
-                          2 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i) {
-    vpx_memcpy(buffer2 + i * 64, buffer + i * 16, sizeof(*buffer2) * 16);
-  }
-#if DWTDCT_TYPE == DWTDCT16X16_LEAN
-  for (i = 0; i < 16; ++i) {
-    for (j = 16; j < 64; ++j) {
-      buffer2[i * 64 + j] =
-          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1);
-    }
-  }
-  for (i = 16; i < 64; ++i) {
-    for (j = 0; j < 64; ++j) {
-      buffer2[i * 64 + j] =
-          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1);
-    }
-  }
-#elif DWTDCT_TYPE == DWTDCT16X16
-  vp9_short_idct16x16_c_f(input + 16, buffer, pitch,
-                          2 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i) {
-    vpx_memcpy(buffer2 + i * 64 + 16, buffer + i * 16, sizeof(*buffer2) * 16);
-  }
-  vp9_short_idct16x16_c_f(input + 16 * short_pitch, buffer, pitch,
-                          2 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i) {
-    vpx_memcpy(buffer2 + i * 64 + 16 * 64, buffer + i * 16,
-               sizeof(*buffer2) * 16);
-  }
-  vp9_short_idct16x16_c_f(input + 16 * short_pitch + 16, buffer, pitch,
-                          2 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i) {
-    vpx_memcpy(buffer2 + i * 64 + 16 * 65, buffer + i * 16,
-               sizeof(*buffer2) * 16);
+  /* First transform rows. Since all non-zero dct coefficients are in
+   * upper-left 4x4 area, we only need to calculate first 4 rows here.
+   */
+  vpx_memset(out, 0, sizeof(out));
+  for (i = 0; i < 4; ++i) {
+    idct32_1d(input, outptr);
+    input += half_pitch;
+    outptr += 32;
   }
 
-  // Copying and scaling highest bands into buffer2
+  // Columns
   for (i = 0; i < 32; ++i) {
-    for (j = 32; j < 64; ++j) {
-      buffer2[i * 64 + j] =
-          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1);
-    }
-  }
-  for (i = 32; i < 64; ++i) {
-    for (j = 0; j < 64; ++j) {
-      buffer2[i * 64 + j] =
-          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1);
-    }
+    for (j = 0; j < 32; ++j)
+      temp_in[j] = out[j * 32 + i];
+    idct32_1d(temp_in, temp_out);
+    for (j = 0; j < 32; ++j)
+      output[j * 32 + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
   }
-#endif  // DWTDCT_TYPE
-
-#if DWT_TYPE == 26
-  dyadic_synthesize_26(2, 64, 64, buffer2, 64, output, 64);
-#elif DWT_TYPE == 97
-  dyadic_synthesize_97(2, 64, 64, buffer2, 64, output, 64);
-#elif DWT_TYPE == 53
-  dyadic_synthesize_53(2, 64, 64, buffer2, 64, output, 64);
-#endif
 }
-#endif  // CONFIG_TX64X64
-#endif  // !CONFIG_DWTDCTHYBRID
diff --git a/vp9/common/vp9_invtrans.c b/vp9/common/vp9_invtrans.c
index b5e6e3cc2..1311b9111 100644
--- a/vp9/common/vp9_invtrans.c
+++ b/vp9/common/vp9_invtrans.c
@@ -11,48 +11,21 @@
 #include "vp9/common/vp9_invtrans.h"
 #include "./vp9_rtcd.h"
 
-static void recon_dcblock(MACROBLOCKD *xd) {
-  BLOCKD *b = &xd->block[24];
-  int i;
-
-  for (i = 0; i < 16; i++) {
-    xd->block[i].dqcoeff[0] = b->diff[i];
-  }
-}
-
-static void recon_dcblock_8x8(MACROBLOCKD *xd) {
-  BLOCKD *b = &xd->block[24]; // for coeff 0, 2, 8, 10
-
-  xd->block[0].dqcoeff[0] = b->diff[0];
-  xd->block[4].dqcoeff[0] = b->diff[1];
-  xd->block[8].dqcoeff[0] = b->diff[4];
-  xd->block[12].dqcoeff[0] = b->diff[8];
-}
-
 void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int block, int pitch) {
   BLOCKD *b = &xd->block[block];
-  if (b->eob <= 1)
-    xd->inv_xform4x4_1_x8(b->dqcoeff, b->diff, pitch);
+  if (xd->eobs[block] <= 1)
+    xd->inv_txm4x4_1(b->dqcoeff, b->diff, pitch);
   else
-    xd->inv_xform4x4_x8(b->dqcoeff, b->diff, pitch);
+    xd->inv_txm4x4(b->dqcoeff, b->diff, pitch);
 }
 
 void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd) {
   int i;
-  BLOCKD *blockd = xd->block;
-  int has_2nd_order = get_2nd_order_usage(xd);
-
-  if (has_2nd_order) {
-    /* do 2nd order transform on the dc block */
-    vp9_short_inv_walsh4x4(blockd[24].dqcoeff, blockd[24].diff);
-    recon_dcblock(xd);
-  }
 
   for (i = 0; i < 16; i++) {
     TX_TYPE tx_type = get_tx_type_4x4(xd, &xd->block[i]);
     if (tx_type != DCT_DCT) {
-      vp9_ihtllm(xd->block[i].dqcoeff, xd->block[i].diff, 32,
-                   tx_type, 4, xd->block[i].eob);
+      vp9_short_iht4x4(xd->block[i].dqcoeff, xd->block[i].diff, 16, tx_type);
     } else {
       vp9_inverse_transform_b_4x4(xd, i, 32);
     }
@@ -80,19 +53,11 @@ void vp9_inverse_transform_b_8x8(int16_t *input_dqcoeff, int16_t *output_coeff,
 void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd) {
   int i;
   BLOCKD *blockd = xd->block;
-  int has_2nd_order = get_2nd_order_usage(xd);
-
-  if (has_2nd_order) {
-    // do 2nd order transform on the dc block
-    vp9_short_ihaar2x2(blockd[24].dqcoeff, blockd[24].diff, 8);
-    recon_dcblock_8x8(xd); // need to change for 8x8
-  }
 
   for (i = 0; i < 9; i += 8) {
     TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[i]);
     if (tx_type != DCT_DCT) {
-      vp9_ihtllm(xd->block[i].dqcoeff, xd->block[i].diff, 32, tx_type, 8,
-                 xd->block[i].eob);
+      vp9_short_iht8x8(xd->block[i].dqcoeff, xd->block[i].diff, 16, tx_type);
     } else {
       vp9_inverse_transform_b_8x8(&blockd[i].dqcoeff[0],
                                   &blockd[i].diff[0], 32);
@@ -101,8 +66,8 @@ void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd) {
   for (i = 2; i < 11; i += 8) {
     TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[i]);
     if (tx_type != DCT_DCT) {
-      vp9_ihtllm(xd->block[i + 2].dqcoeff, xd->block[i].diff, 32, tx_type, 8,
-                 xd->block[i + 2].eob);
+      vp9_short_iht8x8(xd->block[i + 2].dqcoeff, xd->block[i].diff,
+                           16, tx_type);
     } else {
       vp9_inverse_transform_b_8x8(&blockd[i + 2].dqcoeff[0],
                                   &blockd[i].diff[0], 32);
@@ -134,7 +99,7 @@ void vp9_inverse_transform_mby_16x16(MACROBLOCKD *xd) {
   BLOCKD *bd = &xd->block[0];
   TX_TYPE tx_type = get_tx_type_16x16(xd, bd);
   if (tx_type != DCT_DCT) {
-    vp9_ihtllm(bd->dqcoeff, bd->diff, 32, tx_type, 16, bd->eob);
+    vp9_short_iht16x16(bd->dqcoeff, bd->diff, 16, tx_type);
   } else {
     vp9_inverse_transform_b_16x16(&xd->block[0].dqcoeff[0],
                                   &xd->block[0].diff[0], 32);
diff --git a/vp9/common/vp9_invtrans.h b/vp9/common/vp9_invtrans.h
index fd0eb3020..abd5b0fad 100644
--- a/vp9/common/vp9_invtrans.h
+++ b/vp9/common/vp9_invtrans.h
@@ -15,31 +15,31 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_blockd.h"
 
-extern void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int block, int pitch);
+void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int block, int pitch);
 
-extern void vp9_inverse_transform_mb_4x4(MACROBLOCKD *xd);
+void vp9_inverse_transform_mb_4x4(MACROBLOCKD *xd);
 
-extern void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd);
+void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd);
 
-extern void vp9_inverse_transform_mbuv_4x4(MACROBLOCKD *xd);
+void vp9_inverse_transform_mbuv_4x4(MACROBLOCKD *xd);
 
-extern void vp9_inverse_transform_b_8x8(int16_t *input_dqcoeff,
+void vp9_inverse_transform_b_8x8(int16_t *input_dqcoeff,
                                         int16_t *output_coeff, int pitch);
 
-extern void vp9_inverse_transform_mb_8x8(MACROBLOCKD *xd);
+void vp9_inverse_transform_mb_8x8(MACROBLOCKD *xd);
 
-extern void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd);
+void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd);
 
-extern void vp9_inverse_transform_mbuv_8x8(MACROBLOCKD *xd);
+void vp9_inverse_transform_mbuv_8x8(MACROBLOCKD *xd);
 
-extern void vp9_inverse_transform_b_16x16(int16_t *input_dqcoeff,
+void vp9_inverse_transform_b_16x16(int16_t *input_dqcoeff,
                                           int16_t *output_coeff, int pitch);
 
-extern void vp9_inverse_transform_mb_16x16(MACROBLOCKD *xd);
+void vp9_inverse_transform_mb_16x16(MACROBLOCKD *xd);
 
-extern void vp9_inverse_transform_mby_16x16(MACROBLOCKD *xd);
+void vp9_inverse_transform_mby_16x16(MACROBLOCKD *xd);
 
-extern void vp9_inverse_transform_sby_32x32(SUPERBLOCKD *xd_sb);
-extern void vp9_inverse_transform_sbuv_16x16(SUPERBLOCKD *xd_sb);
+void vp9_inverse_transform_sby_32x32(SUPERBLOCKD *xd_sb);
+void vp9_inverse_transform_sbuv_16x16(SUPERBLOCKD *xd_sb);
 
 #endif  // VP9_COMMON_VP9_INVTRANS_H_
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 7633887a3..9ce5a6378 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -109,6 +109,9 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm,
   loop_filter_info_n *lfi = &cm->lf_info;
 
   /* update limits if sharpness has changed */
+  // printf("vp9_loop_filter_frame_init %d\n", default_filt_lvl);
+  // printf("sharpness level: %d [%d]\n",
+  //        cm->sharpness_level, cm->last_sharpness_level);
   if (cm->last_sharpness_level != cm->sharpness_level) {
     vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level);
     cm->last_sharpness_level = cm->sharpness_level;
@@ -202,6 +205,7 @@ static int sb_mb_lf_skip(const MODE_INFO *const mip0,
           mbmi1->mv[mbmi1->ref_frame].as_int) &&
          mbmi0->ref_frame != INTRA_FRAME;
 }
+
 void vp9_loop_filter_frame(VP9_COMMON *cm,
                            MACROBLOCKD *xd,
                            int frame_filter_level,
@@ -271,7 +275,6 @@ void vp9_loop_filter_frame(VP9_COMMON *cm,
               vp9_loop_filter_bv(y_ptr, u_ptr, v_ptr, post->y_stride,
                                  post->uv_stride, &lfi);
             }
-
           }
           /* don't apply across umv border */
           if (mb_row > 0 &&
diff --git a/vp9/common/vp9_loopfilter_filters.c b/vp9/common/vp9_loopfilter_filters.c
index fbce50d05..6f434dafe 100644
--- a/vp9/common/vp9_loopfilter_filters.c
+++ b/vp9/common/vp9_loopfilter_filters.c
@@ -13,7 +13,7 @@
 #include "vp9/common/vp9_loopfilter.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
-static __inline int8_t signed_char_clamp(int t) {
+static INLINE int8_t signed_char_clamp(int t) {
   t = (t < -128 ? -128 : t);
   t = (t > 127 ? 127 : t);
   return (int8_t) t;
@@ -21,11 +21,11 @@ static __inline int8_t signed_char_clamp(int t) {
 
 
 /* should we apply any filter at all ( 11111111 yes, 00000000 no) */
-static __inline int8_t filter_mask(uint8_t limit, uint8_t blimit,
-                                   uint8_t p3, uint8_t p2,
-                                   uint8_t p1, uint8_t p0,
-                                   uint8_t q0, uint8_t q1,
-                                   uint8_t q2, uint8_t q3) {
+static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit,
+                                 uint8_t p3, uint8_t p2,
+                                 uint8_t p1, uint8_t p0,
+                                 uint8_t q0, uint8_t q1,
+                                 uint8_t q2, uint8_t q3) {
   int8_t mask = 0;
   mask |= (abs(p3 - p2) > limit) * -1;
   mask |= (abs(p2 - p1) > limit) * -1;
@@ -39,16 +39,16 @@ static __inline int8_t filter_mask(uint8_t limit, uint8_t blimit,
 }
 
 /* is there high variance internal edge ( 11111111 yes, 00000000 no) */
-static __inline int8_t hevmask(uint8_t thresh, uint8_t p1, uint8_t p0,
-                               uint8_t q0, uint8_t q1) {
+static INLINE int8_t hevmask(uint8_t thresh, uint8_t p1, uint8_t p0,
+                             uint8_t q0, uint8_t q1) {
   int8_t hev = 0;
   hev  |= (abs(p1 - p0) > thresh) * -1;
   hev  |= (abs(q1 - q0) > thresh) * -1;
   return hev;
 }
 
-static __inline void filter(int8_t mask, uint8_t hev, uint8_t *op1,
-                            uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
+static INLINE void filter(int8_t mask, uint8_t hev, uint8_t *op1,
+                          uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
   int8_t ps0, qs0;
   int8_t ps1, qs1;
   int8_t filter, Filter1, Filter2;
@@ -143,11 +143,11 @@ void vp9_loop_filter_vertical_edge_c(uint8_t *s,
     s += p;
   } while (++i < count * 8);
 }
-static __inline signed char flatmask(uint8_t thresh,
-                                     uint8_t p4, uint8_t p3, uint8_t p2,
-                                     uint8_t p1, uint8_t p0,
-                                     uint8_t q0, uint8_t q1, uint8_t q2,
-                                     uint8_t q3, uint8_t q4) {
+static INLINE signed char flatmask4(uint8_t thresh,
+                                    uint8_t p3, uint8_t p2,
+                                    uint8_t p1, uint8_t p0,
+                                    uint8_t q0, uint8_t q1,
+                                    uint8_t q2, uint8_t q3) {
   int8_t flat = 0;
   flat |= (abs(p1 - p0) > thresh) * -1;
   flat |= (abs(q1 - q0) > thresh) * -1;
@@ -155,26 +155,34 @@ static __inline signed char flatmask(uint8_t thresh,
   flat |= (abs(q0 - q2) > thresh) * -1;
   flat |= (abs(p3 - p0) > thresh) * -1;
   flat |= (abs(q3 - q0) > thresh) * -1;
+  flat = ~flat;
+  return flat;
+}
+static INLINE signed char flatmask5(uint8_t thresh,
+                                    uint8_t p4, uint8_t p3, uint8_t p2,
+                                    uint8_t p1, uint8_t p0,
+                                    uint8_t q0, uint8_t q1, uint8_t q2,
+                                    uint8_t q3, uint8_t q4) {
+  int8_t flat = 0;
   flat |= (abs(p4 - p0) > thresh) * -1;
   flat |= (abs(q4 - q0) > thresh) * -1;
   flat = ~flat;
-  return flat;
+  return flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3);
 }
 
-static __inline void mbfilter(int8_t mask, uint8_t hev, uint8_t flat,
-                              uint8_t *op4, uint8_t *op3, uint8_t *op2,
-                              uint8_t *op1, uint8_t *op0,
-                              uint8_t *oq0, uint8_t *oq1, uint8_t *oq2,
-                              uint8_t *oq3, uint8_t *oq4) {
+
+static INLINE void mbfilter(int8_t mask, uint8_t hev, uint8_t flat,
+                            uint8_t *op3, uint8_t *op2,
+                            uint8_t *op1, uint8_t *op0,
+                            uint8_t *oq0, uint8_t *oq1,
+                            uint8_t *oq2, uint8_t *oq3) {
   /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
   if (flat && mask) {
     uint8_t p0, q0;
     uint8_t p1, q1;
     uint8_t p2, q2;
     uint8_t p3, q3;
-    uint8_t p4, q4;
 
-    p4 = *op4;
     p3 = *op3;
     p2 = *op2;
     p1 = *op1;
@@ -183,14 +191,13 @@ static __inline void mbfilter(int8_t mask, uint8_t hev, uint8_t flat,
     q1 = *oq1;
     q2 = *oq2;
     q3 = *oq3;
-    q4 = *oq4;
 
-    *op2 = (p4 + p4 + p3 + p2 + p2 + p1 + p0 + q0 + 4) >> 3;
-    *op1 = (p4 + p3 + p2 + p1 + p1 + p0 + q0 + q1 + 4) >> 3;
+    *op2 = (p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0 + 4) >> 3;
+    *op1 = (p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1 + 4) >> 3;
     *op0 = (p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2 + 4) >> 3;
     *oq0 = (p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3 + 4) >> 3;
-    *oq1 = (p1 + p0 + q0 + q1 + q1 + q2 + q3 + q4 + 4) >> 3;
-    *oq2 = (p0 + q0 + q1 + q2 + q2 + q3 + q4 + q4 + 4) >> 3;
+    *oq1 = (p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3 + 4) >> 3;
+    *oq2 = (p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3 + 4) >> 3;
   } else {
     int8_t ps0, qs0;
     int8_t ps1, qs1;
@@ -254,12 +261,11 @@ void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s,
 
     hev = hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
 
-    flat = flatmask(1,
-                    s[-5 * p], s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
-                    s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p], s[ 4 * p]);
+    flat = flatmask4(1, s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
+                        s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);
     mbfilter(mask, hev, flat,
-             s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
-             s,       s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p);
+             s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
+             s,         s + 1 * p, s + 2 * p, s + 3 * p);
 
     ++s;
   } while (++i < count * 8);
@@ -283,21 +289,21 @@ void vp9_mbloop_filter_vertical_edge_c(uint8_t *s,
                        s[0], s[1], s[2], s[3]);
 
     hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
-    flat = flatmask(1,
-                    s[-5], s[-4], s[-3], s[-2], s[-1],
-                    s[ 0], s[ 1], s[ 2], s[ 3], s[ 4]);
+    flat = flatmask4(1,
+                    s[-4], s[-3], s[-2], s[-1],
+                    s[ 0], s[ 1], s[ 2], s[ 3]);
     mbfilter(mask, hev, flat,
-             s - 5, s - 4, s - 3, s - 2, s - 1,
-             s,     s + 1, s + 2, s + 3, s + 4);
+             s - 4, s - 3, s - 2, s - 1,
+             s,     s + 1, s + 2, s + 3);
     s += p;
   } while (++i < count * 8);
 
 }
 
 /* should we apply any filter at all ( 11111111 yes, 00000000 no) */
-static __inline int8_t simple_filter_mask(uint8_t blimit,
-                                          uint8_t p1, uint8_t p0,
-                                          uint8_t q0, uint8_t q1) {
+static INLINE int8_t simple_filter_mask(uint8_t blimit,
+                                        uint8_t p1, uint8_t p0,
+                                        uint8_t q0, uint8_t q1) {
   /* Why does this cause problems for win32?
    * error C2143: syntax error : missing ';' before 'type'
    *  (void) limit;
@@ -306,9 +312,9 @@ static __inline int8_t simple_filter_mask(uint8_t blimit,
   return mask;
 }
 
-static __inline void simple_filter(int8_t mask,
-                                   uint8_t *op1, uint8_t *op0,
-                                   uint8_t *oq0, uint8_t *oq1) {
+static INLINE void simple_filter(int8_t mask,
+                                 uint8_t *op1, uint8_t *op0,
+                                 uint8_t *oq0, uint8_t *oq1) {
   int8_t filter, Filter1, Filter2;
   int8_t p1 = (int8_t) *op1 ^ 0x80;
   int8_t p0 = (int8_t) *op0 ^ 0x80;
@@ -481,14 +487,14 @@ void vp9_loop_filter_bvs_c(uint8_t *y_ptr, int y_stride,
   vp9_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit);
 }
 
-static __inline void wide_mbfilter(int8_t mask, uint8_t hev,
-                                   uint8_t flat, uint8_t flat2,
-                                   uint8_t *op7, uint8_t *op6, uint8_t *op5,
-                                   uint8_t *op4, uint8_t *op3, uint8_t *op2,
-                                   uint8_t *op1, uint8_t *op0, uint8_t *oq0,
-                                   uint8_t *oq1, uint8_t *oq2, uint8_t *oq3,
-                                   uint8_t *oq4, uint8_t *oq5, uint8_t *oq6,
-                                   uint8_t *oq7) {
+static INLINE void wide_mbfilter(int8_t mask, uint8_t hev,
+                                 uint8_t flat, uint8_t flat2,
+                                 uint8_t *op7, uint8_t *op6, uint8_t *op5,
+                                 uint8_t *op4, uint8_t *op3, uint8_t *op2,
+                                 uint8_t *op1, uint8_t *op0, uint8_t *oq0,
+                                 uint8_t *oq1, uint8_t *oq2, uint8_t *oq3,
+                                 uint8_t *oq4, uint8_t *oq5, uint8_t *oq6,
+                                 uint8_t *oq7) {
   /* use a 15 tap filter [1,1,1,1,1,1,1,2,1,1,1,1,1,1,1] for flat line */
   if (flat2 && flat && mask) {
     uint8_t p0, q0;
@@ -550,9 +556,7 @@ static __inline void wide_mbfilter(int8_t mask, uint8_t hev,
     unsigned char p1, q1;
     unsigned char p2, q2;
     unsigned char p3, q3;
-    unsigned char p4, q4;
 
-    p4 = *op4;
     p3 = *op3;
     p2 = *op2;
     p1 = *op1;
@@ -561,14 +565,13 @@ static __inline void wide_mbfilter(int8_t mask, uint8_t hev,
     q1 = *oq1;
     q2 = *oq2;
     q3 = *oq3;
-    q4 = *oq4;
 
-    *op2 = (p4 + p4 + p3 + p2 + p2 + p1 + p0 + q0 + 4) >> 3;
-    *op1 = (p4 + p3 + p2 + p1 + p1 + p0 + q0 + q1 + 4) >> 3;
+    *op2 = (p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0 + 4) >> 3;
+    *op1 = (p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1 + 4) >> 3;
     *op0 = (p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2 + 4) >> 3;
     *oq0 = (p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3 + 4) >> 3;
-    *oq1 = (p1 + p0 + q0 + q1 + q1 + q2 + q3 + q4 + 4) >> 3;
-    *oq2 = (p0 + q0 + q1 + q2 + q2 + q3 + q4 + q4 + 4) >> 3;
+    *oq1 = (p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3 + 4) >> 3;
+    *oq2 = (p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3 + 4) >> 3;
   } else {
     signed char ps0, qs0;
     signed char ps1, qs1;
@@ -636,19 +639,19 @@ void vp9_mb_lpf_horizontal_edge_w
 
     hev = hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
 
-    flat = flatmask(1,
-                    s[-5 * p], s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
-                    s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p], s[ 4 * p]);
+    flat = flatmask4(1,
+                     s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
+                     s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);
 
-    flat2 = flatmask(1,
-                    s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], s[-1 * p],
-                    s[ 0 * p], s[ 4 * p], s[ 5 * p], s[ 6 * p], s[ 7 * p]);
+    flat2 = flatmask5(1,
+                      s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], s[-1 * p],
+                      s[ 0 * p], s[ 4 * p], s[ 5 * p], s[ 6 * p], s[ 7 * p]);
 
     wide_mbfilter(mask, hev, flat, flat2,
-             s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,
-             s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
-             s,         s + 1 * p, s + 2 * p, s + 3 * p,
-             s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p);
+                  s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,
+                  s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
+                  s,         s + 1 * p, s + 2 * p, s + 3 * p,
+                  s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p);
 
     ++s;
   } while (++i < count * 8);
@@ -674,18 +677,18 @@ void vp9_mb_lpf_vertical_edge_w
                        s[0], s[1], s[2], s[3]);
 
     hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
-    flat = flatmask(1,
-                    s[-5], s[-4], s[-3], s[-2], s[-1],
-                    s[ 0], s[ 1], s[ 2], s[ 3], s[ 4]);
-    flat2 = flatmask(1,
-                    s[-8], s[-7], s[-6], s[-5], s[-1],
-                    s[ 0], s[ 4], s[ 5], s[ 6], s[ 7]);
+    flat = flatmask4(1,
+                     s[-4], s[-3], s[-2], s[-1],
+                     s[ 0], s[ 1], s[ 2], s[ 3]);
+    flat2 = flatmask5(1,
+                     s[-8], s[-7], s[-6], s[-5], s[-1],
+                     s[ 0], s[ 4], s[ 5], s[ 6], s[ 7]);
 
     wide_mbfilter(mask, hev, flat, flat2,
-             s - 8, s - 7, s - 6, s - 5,
-             s - 4, s - 3, s - 2, s - 1,
-             s,     s + 1, s + 2, s + 3,
-             s + 4, s + 5, s + 6, s + 7);
+                  s - 8, s - 7, s - 6, s - 5,
+                  s - 4, s - 3, s - 2, s - 1,
+                  s,     s + 1, s + 2, s + 3,
+                  s + 4, s + 5, s + 6, s + 7);
     s += p;
   } while (++i < count * 8);
 }
diff --git a/vp9/common/vp9_mbpitch.c b/vp9/common/vp9_mbpitch.c
index e94144813..ed96292a4 100644
--- a/vp9/common/vp9_mbpitch.c
+++ b/vp9/common/vp9_mbpitch.c
@@ -102,9 +102,7 @@ void vp9_setup_block_dptrs(MACROBLOCKD *xd) {
     }
   }
 
-  blockd[24].diff = &xd->diff[384];
-
-  for (r = 0; r < 25; r++) {
+  for (r = 0; r < 24; r++) {
     blockd[r].qcoeff  = xd->qcoeff  + r * 16;
     blockd[r].dqcoeff = xd->dqcoeff + r * 16;
   }
diff --git a/vp9/common/vp9_modecont.c b/vp9/common/vp9_modecont.c
index f7f2b9013..73cb5e15e 100644
--- a/vp9/common/vp9_modecont.c
+++ b/vp9/common/vp9_modecont.c
@@ -12,7 +12,7 @@
 #include "vp9/common/vp9_entropy.h"
 
 const int vp9_default_mode_contexts[INTER_MODE_CONTEXTS][4] = {
-  {223,     1,     1,    237},  // 0,0 best: Only candidate
+  {1,       223,   1,    237},  // 0,0 best: Only candidate
   {87,      166,   26,   219},  // 0,0 best: non zero candidates
   {89,      67,    18,   125},  // 0,0 best: non zero candidates, split
   {16,      141,   69,   226},  // strong nz candidate(s), no split
diff --git a/vp9/common/vp9_mv.h b/vp9/common/vp9_mv.h
index 8acd4046b..a1eef4649 100644
--- a/vp9/common/vp9_mv.h
+++ b/vp9/common/vp9_mv.h
@@ -23,4 +23,14 @@ typedef union int_mv {
   MV as_mv;
 } int_mv; /* facilitates faster equality tests and copies */
 
+struct mv32 {
+  int32_t row;
+  int32_t col;
+};
+
+typedef union int_mv32 {
+  uint64_t    as_int;
+  struct mv32 as_mv;
+} int_mv32; /* facilitates faster equality tests and copies */
+
 #endif  // VP9_COMMON_VP9_MV_H_
diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c
index 786b02188..25aa53b5a 100644
--- a/vp9/common/vp9_mvref_common.c
+++ b/vp9/common/vp9_mvref_common.c
@@ -24,9 +24,9 @@ static int sb_mv_ref_search[MVREF_NEIGHBOURS][2] = {
 static int sb_ref_distance_weight[MVREF_NEIGHBOURS] =
   { 3, 3, 2, 2, 2, 1, 1, 1 };
 
-// clamp_mv
+// clamp_mv_ref
 #define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
-static void clamp_mv(const MACROBLOCKD *xd, int_mv *mv) {
+static void clamp_mv_ref(const MACROBLOCKD *xd, int_mv *mv) {
 
   if (mv->as_mv.col < (xd->mb_to_left_edge - MV_BORDER))
     mv->as_mv.col = xd->mb_to_left_edge - MV_BORDER;
@@ -85,18 +85,17 @@ static void get_non_matching_candidates(
 
     // Second candidate
     if ((candidate_mi->mbmi.second_ref_frame > INTRA_FRAME) &&
-        (candidate_mi->mbmi.second_ref_frame != ref_frame)) {  // &&
-        // (candidate_mi->mbmi.mv[1].as_int != 0) &&
-        // (candidate_mi->mbmi.mv[1].as_int !=
-        // candidate_mi->mbmi.mv[0].as_int)) {
+        (candidate_mi->mbmi.second_ref_frame != ref_frame) &&
+        (candidate_mi->mbmi.mv[1].as_int !=
+         candidate_mi->mbmi.mv[0].as_int)) {
       *c2_ref_frame = candidate_mi->mbmi.second_ref_frame;
       c2_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
     }
   }
 }
 
-// Performs mv adjustment based on reference frame and clamps the MV
-// if it goes off the edge of the buffer.
+
+// Performs mv sign inversion if indicated by the reference frame combination.
 static void scale_mv(
   MACROBLOCKD *xd,
   MV_REFERENCE_FRAME this_ref_frame,
@@ -104,54 +103,55 @@ static void scale_mv(
   int_mv *candidate_mv,
   int *ref_sign_bias
 ) {
-
-  if (candidate_ref_frame != this_ref_frame) {
-
-    //int frame_distances[MAX_REF_FRAMES];
-    //int last_distance = 1;
-    //int gf_distance = xd->frames_since_golden;
-    //int arf_distance = xd->frames_till_alt_ref_frame;
-
-    // Sign inversion where appropriate.
-    if (ref_sign_bias[candidate_ref_frame] != ref_sign_bias[this_ref_frame]) {
-      candidate_mv->as_mv.row = -candidate_mv->as_mv.row;
-      candidate_mv->as_mv.col = -candidate_mv->as_mv.col;
-    }
-
-    // Scale based on frame distance if the reference frames not the same.
-    /*frame_distances[INTRA_FRAME] = 1;   // should never be used
-    frame_distances[LAST_FRAME] = 1;
-    frame_distances[GOLDEN_FRAME] =
-      (xd->frames_since_golden) ? xd->frames_since_golden : 1;
-    frame_distances[ALTREF_FRAME] =
-      (xd->frames_till_alt_ref_frame) ? xd->frames_till_alt_ref_frame : 1;
-
-    if (frame_distances[this_ref_frame] &&
-        frame_distances[candidate_ref_frame]) {
-      candidate_mv->as_mv.row =
-        (short)(((int)(candidate_mv->as_mv.row) *
-                 frame_distances[this_ref_frame]) /
-                frame_distances[candidate_ref_frame]);
-
-      candidate_mv->as_mv.col =
-        (short)(((int)(candidate_mv->as_mv.col) *
-                 frame_distances[this_ref_frame]) /
-                frame_distances[candidate_ref_frame]);
-    }
-    */
+  // int frame_distances[MAX_REF_FRAMES];
+  // int last_distance = 1;
+  // int gf_distance = xd->frames_since_golden;
+  // int arf_distance = xd->frames_till_alt_ref_frame;
+
+  // Sign inversion where appropriate.
+  if (ref_sign_bias[candidate_ref_frame] != ref_sign_bias[this_ref_frame]) {
+    candidate_mv->as_mv.row = -candidate_mv->as_mv.row;
+    candidate_mv->as_mv.col = -candidate_mv->as_mv.col;
   }
 
-  // Clamp the MV so it does not point out of the frame buffer
-  clamp_mv(xd, candidate_mv);
+  /*
+  // Scale based on frame distance if the reference frames are not the same.
+  frame_distances[INTRA_FRAME] = 1;   // should never be used
+  frame_distances[LAST_FRAME] = 1;
+  frame_distances[GOLDEN_FRAME] =
+    (xd->frames_since_golden) ? xd->frames_since_golden : 1;
+  frame_distances[ALTREF_FRAME] =
+    (xd->frames_till_alt_ref_frame) ? xd->frames_till_alt_ref_frame : 1;
+
+  if (frame_distances[this_ref_frame] &&
+      frame_distances[candidate_ref_frame]) {
+    candidate_mv->as_mv.row =
+      (short)(((int)(candidate_mv->as_mv.row) *
+               frame_distances[this_ref_frame]) /
+              frame_distances[candidate_ref_frame]);
+
+    candidate_mv->as_mv.col =
+      (short)(((int)(candidate_mv->as_mv.col) *
+               frame_distances[this_ref_frame]) /
+              frame_distances[candidate_ref_frame]);
+  }
+  */
 }
 
-// Adds a new candidate reference vector to the list if indeed it is new.
-// If it is not new then the score of the existing candidate that it matches
-// is increased and the list is resorted.
+/*
+// Adds a new candidate reference vector to the sorted list.
+// If it is a repeat the weight of the existing entry is increased
+// and the order of the list is resorted.
+// This method of add plus sort has been deprecated for now as there is a
+// further sort of the best candidates in vp9_find_best_ref_mvs() and the
+// incremental benefit of both is small. If the decision is made to remove
+// the sort in vp9_find_best_ref_mvs() for performance reasons then it may be
+// worth re-instating some sort of list reordering by weight here.
+//
 static void addmv_and_shuffle(
   int_mv *mv_list,
   int *mv_scores,
-  int *index,
+  int *refmv_count,
   int_mv candidate_mv,
   int weight
 ) {
@@ -162,11 +162,11 @@ static void addmv_and_shuffle(
 
   // Check for duplicates. If there is one increase its score.
   // We only compare vs the current top candidates.
-  insert_point = (*index < (MAX_MV_REF_CANDIDATES - 1))
-                 ? *index : (MAX_MV_REF_CANDIDATES - 1);
+  insert_point = (*refmv_count < (MAX_MV_REF_CANDIDATES - 1))
+                 ? *refmv_count : (MAX_MV_REF_CANDIDATES - 1);
 
   i = insert_point;
-  if (*index > i)
+  if (*refmv_count > i)
     i++;
   while (i > 0) {
     i--;
@@ -184,7 +184,7 @@ static void addmv_and_shuffle(
       mv_scores[insert_point] = weight;
       i = insert_point;
     }
-    (*index)++;
+    (*refmv_count)++;
   }
 
   // Reshuffle the list so that highest scoring mvs at the top.
@@ -202,11 +202,48 @@ static void addmv_and_shuffle(
       break;
   }
 }
+*/
+
+// Adds a new candidate reference vector to the list.
+// The mv is thrown out if it is already in the list.
+// Unlike addmv_and_shuffle(), this does not reorder the list
+// but assumes that candidates are added in the order most likely to
+// match distance and reference frame bias.
+static void add_candidate_mv(
+  int_mv *mv_list,
+  int *mv_scores,
+  int *candidate_count,
+  int_mv candidate_mv,
+  int weight
+) {
+  int i;
+  int insert_point;
+
+  // Make sure we don't insert off the end of the list
+  insert_point = (*candidate_count < (MAX_MV_REF_CANDIDATES - 1))
+                 ? *candidate_count : (MAX_MV_REF_CANDIDATES - 1);
+
+  // Look for duplicates
+  for (i = 0; i <= insert_point; ++i) {
+    if (candidate_mv.as_int == mv_list[i].as_int)
+      break;
+  }
+
+  // Add the candidate. If the list is already full it is only desirable that
+  // it should overwrite if it has a higher weight than the last entry.
+  if ((i >= insert_point) &&
+      (weight > mv_scores[insert_point])) {
+    mv_list[insert_point].as_int = candidate_mv.as_int;
+    mv_scores[insert_point] = weight;
+    *candidate_count += (*candidate_count < MAX_MV_REF_CANDIDATES);
+  }
+}
 
 // This function searches the neighbourhood of a given MB/SB and populates a
 // list of candidate reference vectors.
 //
 void vp9_find_mv_refs(
+  VP9_COMMON *cm,
   MACROBLOCKD *xd,
   MODE_INFO *here,
   MODE_INFO *lf_here,
@@ -224,10 +261,12 @@ void vp9_find_mv_refs(
   MV_REFERENCE_FRAME c_ref_frame;
   MV_REFERENCE_FRAME c2_ref_frame;
   int candidate_scores[MAX_MV_REF_CANDIDATES];
-  int index = 0;
+  int refmv_count = 0;
   int split_count = 0;
   int (*mv_ref_search)[2];
   int *ref_distance_weight;
+  int zero_seen = FALSE;
+  const int mb_col = (-xd->mb_to_left_edge) >> 7;
 
   // Blank the reference vector lists and other local structures.
   vpx_memset(mv_ref_list, 0, sizeof(int_mv) * MAX_MV_REF_CANDIDATES);
@@ -245,39 +284,44 @@ void vp9_find_mv_refs(
   // We first scan for candidate vectors that match the current reference frame
   // Look at nearest neigbours
   for (i = 0; i < 2; ++i) {
-    if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) &&
+    const int mb_search_col = mb_col + mv_ref_search[i][0];
+
+    if ((mb_search_col >= cm->cur_tile_mb_col_start) &&
+        (mb_search_col < cm->cur_tile_mb_col_end) &&
         ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {
 
       candidate_mi = here + mv_ref_search[i][0] +
                      (mv_ref_search[i][1] * xd->mode_info_stride);
 
       if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv)) {
-        clamp_mv(xd, &c_refmv);
-        addmv_and_shuffle(candidate_mvs, candidate_scores,
-                          &index, c_refmv, ref_distance_weight[i] + 16);
+        add_candidate_mv(candidate_mvs, candidate_scores,
+                         &refmv_count, c_refmv, ref_distance_weight[i] + 16);
       }
       split_count += (candidate_mi->mbmi.mode == SPLITMV);
     }
   }
-  // Look in the last frame
-  candidate_mi = lf_here;
-  if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv)) {
-    clamp_mv(xd, &c_refmv);
-    addmv_and_shuffle(candidate_mvs, candidate_scores,
-                      &index, c_refmv, 18);
+  // Look in the last frame if it exists
+  if (lf_here) {
+    candidate_mi = lf_here;
+    if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv)) {
+      add_candidate_mv(candidate_mvs, candidate_scores,
+                       &refmv_count, c_refmv, 18);
+    }
   }
   // More distant neigbours
   for (i = 2; (i < MVREF_NEIGHBOURS) &&
-              (index < (MAX_MV_REF_CANDIDATES - 1)); ++i) {
-    if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) &&
+              (refmv_count < (MAX_MV_REF_CANDIDATES - 1)); ++i) {
+    const int mb_search_col = mb_col + mv_ref_search[i][0];
+
+    if ((mb_search_col >= cm->cur_tile_mb_col_start) &&
+        (mb_search_col < cm->cur_tile_mb_col_end) &&
         ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {
       candidate_mi = here + mv_ref_search[i][0] +
                      (mv_ref_search[i][1] * xd->mode_info_stride);
 
       if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv)) {
-        clamp_mv(xd, &c_refmv);
-        addmv_and_shuffle(candidate_mvs, candidate_scores,
-                          &index, c_refmv, ref_distance_weight[i] + 16);
+        add_candidate_mv(candidate_mvs, candidate_scores,
+                         &refmv_count, c_refmv, ref_distance_weight[i] + 16);
       }
     }
   }
@@ -286,9 +330,12 @@ void vp9_find_mv_refs(
   // reference frame does not match. Break out when we have
   // MAX_MV_REF_CANDIDATES candidates.
   // Look first at spatial neighbours
-  if (index < (MAX_MV_REF_CANDIDATES - 1)) {
+  if (refmv_count < (MAX_MV_REF_CANDIDATES - 1)) {
     for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
-      if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) &&
+      const int mb_search_col = mb_col + mv_ref_search[i][0];
+
+      if ((mb_search_col >= cm->cur_tile_mb_col_start) &&
+          (mb_search_col < cm->cur_tile_mb_col_end) &&
           ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {
 
         candidate_mi = here + mv_ref_search[i][0] +
@@ -300,24 +347,24 @@ void vp9_find_mv_refs(
 
         if (c_ref_frame != INTRA_FRAME) {
           scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);
-          addmv_and_shuffle(candidate_mvs, candidate_scores,
-                            &index, c_refmv, ref_distance_weight[i]);
+          add_candidate_mv(candidate_mvs, candidate_scores,
+                           &refmv_count, c_refmv, ref_distance_weight[i]);
         }
 
         if (c2_ref_frame != INTRA_FRAME) {
           scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);
-          addmv_and_shuffle(candidate_mvs, candidate_scores,
-                            &index, c2_refmv, ref_distance_weight[i]);
+          add_candidate_mv(candidate_mvs, candidate_scores,
+                           &refmv_count, c2_refmv, ref_distance_weight[i]);
         }
       }
 
-      if (index >= (MAX_MV_REF_CANDIDATES - 1)) {
+      if (refmv_count >= (MAX_MV_REF_CANDIDATES - 1)) {
         break;
       }
     }
   }
-  // Look at the last frame
-  if (index < (MAX_MV_REF_CANDIDATES - 1)) {
+  // Look at the last frame if it exists
+  if (refmv_count < (MAX_MV_REF_CANDIDATES - 1) && lf_here) {
     candidate_mi = lf_here;
     get_non_matching_candidates(candidate_mi, ref_frame,
                                 &c_ref_frame, &c_refmv,
@@ -325,14 +372,14 @@ void vp9_find_mv_refs(
 
     if (c_ref_frame != INTRA_FRAME) {
       scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);
-      addmv_and_shuffle(candidate_mvs, candidate_scores,
-                        &index, c_refmv, 2);
+      add_candidate_mv(candidate_mvs, candidate_scores,
+                       &refmv_count, c_refmv, 2);
     }
 
     if (c2_ref_frame != INTRA_FRAME) {
       scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);
-      addmv_and_shuffle(candidate_mvs, candidate_scores,
-                        &index, c2_refmv, 2);
+      add_candidate_mv(candidate_mvs, candidate_scores,
+                       &refmv_count, c2_refmv, 2);
     }
   }
 
@@ -340,7 +387,7 @@ void vp9_find_mv_refs(
   // 0,0 was best
   if (candidate_mvs[0].as_int == 0) {
     // 0,0 is only candidate
-    if (index <= 1) {
+    if (refmv_count <= 1) {
       mbmi->mb_mode_context[ref_frame] = 0;
     // non zero candidates candidates available
     } else if (split_count == 0) {
@@ -350,26 +397,30 @@ void vp9_find_mv_refs(
     }
   // Non zero best, No Split MV cases
   } else if (split_count == 0) {
-    if (candidate_scores[0] >= 32) {
+    if (candidate_scores[0] >= 16) {
       mbmi->mb_mode_context[ref_frame] = 3;
     } else {
       mbmi->mb_mode_context[ref_frame] = 4;
     }
   // Non zero best, some split mv
   } else {
-    if (candidate_scores[0] >= 32) {
+    if (candidate_scores[0] >= 16) {
       mbmi->mb_mode_context[ref_frame] = 5;
     } else {
       mbmi->mb_mode_context[ref_frame] = 6;
     }
   }
 
-  // 0,0 is always a valid reference.
+  // Scan for 0,0 case and clamp non zero choices
   for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
-    if (candidate_mvs[i].as_int == 0)
-      break;
+    if (candidate_mvs[i].as_int == 0) {
+      zero_seen = TRUE;
+    } else {
+      clamp_mv_ref(xd, &candidate_mvs[i]);
+    }
   }
-  if (i == MAX_MV_REF_CANDIDATES) {
+  // 0,0 is always a valid reference. Add it if not already seen.
+  if (!zero_seen) {
     candidate_mvs[MAX_MV_REF_CANDIDATES-1].as_int = 0;
   }
 
diff --git a/vp9/common/vp9_mvref_common.h b/vp9/common/vp9_mvref_common.h
index ca6d89e91..a81366997 100644
--- a/vp9/common/vp9_mvref_common.h
+++ b/vp9/common/vp9_mvref_common.h
@@ -14,7 +14,8 @@
 #ifndef VP9_COMMON_VP9_MVREF_COMMON_H_
 #define VP9_COMMON_VP9_MVREF_COMMON_H_
 
-void vp9_find_mv_refs(MACROBLOCKD *xd,
+void vp9_find_mv_refs(VP9_COMMON *cm,
+                      MACROBLOCKD *xd,
                       MODE_INFO *here,
                       MODE_INFO *lf_here,
                       MV_REFERENCE_FRAME ref_frame,
diff --git a/vp9/common/vp9_onyx.h b/vp9/common/vp9_onyx.h
index e4ad72f21..d93b7d5fb 100644
--- a/vp9/common/vp9_onyx.h
+++ b/vp9/common/vp9_onyx.h
@@ -16,6 +16,7 @@ extern "C"
 {
 #endif
 
+#include "./vpx_config.h"
 #include "vpx/internal/vpx_codec_internal.h"
 #include "vpx/vp8cx.h"
 #include "vpx_scale/yv12config.h"
@@ -62,7 +63,7 @@ extern "C"
 
 
 #include <assert.h>
-  static __inline void Scale2Ratio(int mode, int *hr, int *hs) {
+  static INLINE void Scale2Ratio(int mode, int *hr, int *hs) {
     switch (mode) {
       case    NORMAL:
         *hr = 1;
@@ -159,10 +160,25 @@ extern "C"
 
     int encode_breakout;  // early breakout encode threshold : for video conf recommend 800
 
+    /* Bitfield defining the error resiliency features to enable.
+     * Can provide decodable frames after losses in previous
+     * frames and decodable partitions after losses in the same frame.
+     */
+    unsigned int error_resilient_mode;
+
+    /* Bitfield defining the parallel decoding mode where the
+     * decoding in successive frames may be conducted in parallel
+     * just by decoding the frame headers.
+     */
+    unsigned int frame_parallel_decoding_mode;
+
     int arnr_max_frames;
     int arnr_strength;
     int arnr_type;
 
+    int tile_columns;
+    int tile_rows;
+
     struct vpx_fixed_buf         two_pass_stats_in;
     struct vpx_codec_pkt_list  *output_pkt_list;
 
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index ac66e4902..c4bb12340 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -37,7 +37,16 @@ void vp9_initialize_common(void);
 
 #define QINDEX_RANGE (MAXQ + 1)
 
-#define NUM_YV12_BUFFERS 4
+#define NUM_REF_FRAMES 3
+#define NUM_REF_FRAMES_LG2 2
+
+// 1 scratch frame for the new frame, 3 for scaled references on the encoder
+// TODO(jkoleszar): These 3 extra references could probably come from the
+// normal reference pool.
+#define NUM_YV12_BUFFERS (NUM_REF_FRAMES + 4)
+
+#define NUM_FRAME_CONTEXTS_LG2 2
+#define NUM_FRAME_CONTEXTS (1 << NUM_FRAME_CONTEXTS_LG2)
 
 #define COMP_PRED_CONTEXTS   2
 
@@ -49,12 +58,9 @@ typedef struct frame_contexts {
   vp9_prob i8x8_mode_prob[VP9_I8X8_MODES - 1];
   vp9_prob sub_mv_ref_prob[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
   vp9_prob mbsplit_prob[VP9_NUMMBSPLITS - 1];
-  vp9_coeff_probs coef_probs_4x4[BLOCK_TYPES_4X4];
-  vp9_coeff_probs hybrid_coef_probs_4x4[BLOCK_TYPES_4X4];
-  vp9_coeff_probs coef_probs_8x8[BLOCK_TYPES_8X8];
-  vp9_coeff_probs hybrid_coef_probs_8x8[BLOCK_TYPES_8X8];
-  vp9_coeff_probs coef_probs_16x16[BLOCK_TYPES_16X16];
-  vp9_coeff_probs hybrid_coef_probs_16x16[BLOCK_TYPES_16X16];
+  vp9_coeff_probs coef_probs_4x4[BLOCK_TYPES];
+  vp9_coeff_probs coef_probs_8x8[BLOCK_TYPES];
+  vp9_coeff_probs coef_probs_16x16[BLOCK_TYPES];
   vp9_coeff_probs coef_probs_32x32[BLOCK_TYPES_32X32];
 
   nmv_context nmvc;
@@ -74,20 +80,14 @@ typedef struct frame_contexts {
   unsigned int sub_mv_ref_counts[SUBMVREF_COUNT][VP9_SUBMVREFS];
   unsigned int mbsplit_counts[VP9_NUMMBSPLITS];
 
-  vp9_coeff_probs pre_coef_probs_4x4[BLOCK_TYPES_4X4];
-  vp9_coeff_probs pre_hybrid_coef_probs_4x4[BLOCK_TYPES_4X4];
-  vp9_coeff_probs pre_coef_probs_8x8[BLOCK_TYPES_8X8];
-  vp9_coeff_probs pre_hybrid_coef_probs_8x8[BLOCK_TYPES_8X8];
-  vp9_coeff_probs pre_coef_probs_16x16[BLOCK_TYPES_16X16];
-  vp9_coeff_probs pre_hybrid_coef_probs_16x16[BLOCK_TYPES_16X16];
+  vp9_coeff_probs pre_coef_probs_4x4[BLOCK_TYPES];
+  vp9_coeff_probs pre_coef_probs_8x8[BLOCK_TYPES];
+  vp9_coeff_probs pre_coef_probs_16x16[BLOCK_TYPES];
   vp9_coeff_probs pre_coef_probs_32x32[BLOCK_TYPES_32X32];
 
-  vp9_coeff_count coef_counts_4x4[BLOCK_TYPES_4X4];
-  vp9_coeff_count hybrid_coef_counts_4x4[BLOCK_TYPES_4X4];
-  vp9_coeff_count coef_counts_8x8[BLOCK_TYPES_8X8];
-  vp9_coeff_count hybrid_coef_counts_8x8[BLOCK_TYPES_8X8];
-  vp9_coeff_count coef_counts_16x16[BLOCK_TYPES_16X16];
-  vp9_coeff_count hybrid_coef_counts_16x16[BLOCK_TYPES_16X16];
+  vp9_coeff_count coef_counts_4x4[BLOCK_TYPES];
+  vp9_coeff_count coef_counts_8x8[BLOCK_TYPES];
+  vp9_coeff_count coef_counts_16x16[BLOCK_TYPES];
   vp9_coeff_count coef_counts_32x32[BLOCK_TYPES_32X32];
 
   nmv_context_counts NMVcount;
@@ -128,11 +128,12 @@ typedef struct VP9Common {
   struct vpx_internal_error_info  error;
 
   DECLARE_ALIGNED(16, int16_t, Y1dequant[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, int16_t, Y2dequant[QINDEX_RANGE][16]);
   DECLARE_ALIGNED(16, int16_t, UVdequant[QINDEX_RANGE][16]);
 
   int Width;
   int Height;
+  int last_width;
+  int last_height;
   int horiz_scale;
   int vert_scale;
 
@@ -142,8 +143,15 @@ typedef struct VP9Common {
   YV12_BUFFER_CONFIG *frame_to_show;
 
   YV12_BUFFER_CONFIG yv12_fb[NUM_YV12_BUFFERS];
-  int fb_idx_ref_cnt[NUM_YV12_BUFFERS];
-  int new_fb_idx, lst_fb_idx, gld_fb_idx, alt_fb_idx;
+  int fb_idx_ref_cnt[NUM_YV12_BUFFERS]; /* reference counts */
+  int ref_frame_map[NUM_REF_FRAMES]; /* maps fb_idx to reference slot */
+
+  /* TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and
+   * roll new_fb_idx into it.
+   */
+  int active_ref_idx[3]; /* each frame can reference 3 buffers */
+  int new_fb_idx;
+  struct scale_factors active_ref_scale[3];
 
   YV12_BUFFER_CONFIG post_proc_buffer;
   YV12_BUFFER_CONFIG temp_scale_frame;
@@ -173,8 +181,6 @@ typedef struct VP9Common {
   int last_kf_gf_q;  /* Q used on the last GF or KF */
 
   int y1dc_delta_q;
-  int y2dc_delta_q;
-  int y2ac_delta_q;
   int uvdc_delta_q;
   int uvac_delta_q;
 
@@ -202,18 +208,11 @@ typedef struct VP9Common {
   int last_sharpness_level;
   int sharpness_level;
 
-  int refresh_last_frame;       /* Two state 0 = NO, 1 = YES */
-  int refresh_golden_frame;     /* Two state 0 = NO, 1 = YES */
-  int refresh_alt_ref_frame;     /* Two state 0 = NO, 1 = YES */
-
-  int copy_buffer_to_gf;         /* 0 none, 1 Last to GF, 2 ARF to GF */
-  int copy_buffer_to_arf;        /* 0 none, 1 Last to ARF, 2 GF to ARF */
-
   int refresh_entropy_probs;    /* Two state 0 = NO, 1 = YES */
 
   int ref_frame_sign_bias[MAX_REF_FRAMES];    /* Two state 0, 1 */
 
-  /* Y,U,V,Y2 */
+  /* Y,U,V */
   ENTROPY_CONTEXT_PLANES *above_context;   /* row of context for each plane */
   ENTROPY_CONTEXT_PLANES left_context[4];  /* (up to) 4 contexts "" */
 
@@ -250,9 +249,9 @@ typedef struct VP9Common {
 
   vp9_prob mbskip_pred_probs[MBSKIP_CONTEXTS];
 
-  FRAME_CONTEXT lfc_a; /* last alt ref entropy */
-  FRAME_CONTEXT lfc; /* last frame entropy */
   FRAME_CONTEXT fc;  /* this frame entropy */
+  FRAME_CONTEXT frame_contexts[NUM_FRAME_CONTEXTS];
+  unsigned int  frame_context_idx; /* Context to use/update */
 
   unsigned int current_video_frame;
   int near_boffset[3];
@@ -272,6 +271,33 @@ typedef struct VP9Common {
   int use_interintra;
 #endif
 
+  int error_resilient_mode;
+  int frame_parallel_decoding_mode;
+
+  int tile_columns, log2_tile_columns;
+  int cur_tile_mb_col_start, cur_tile_mb_col_end, cur_tile_col_idx;
+  int tile_rows, log2_tile_rows;
+  int cur_tile_mb_row_start, cur_tile_mb_row_end, cur_tile_row_idx;
 } VP9_COMMON;
 
+static int get_free_fb(VP9_COMMON *cm) {
+  int idx = 0;
+  /* Claim the first frame buffer whose reference count is zero. */
+  while (idx < NUM_YV12_BUFFERS && cm->fb_idx_ref_cnt[idx] != 0)
+    ++idx;
+
+  assert(idx < NUM_YV12_BUFFERS);  /* callers must leave one buffer free */
+  cm->fb_idx_ref_cnt[idx] = 1;
+  return idx;
+}
+
+static void ref_cnt_fb(int *buf, int *idx, int new_idx) {
+  /* Release the count held via *idx (floored at 0), then retarget it. */
+  const int old_idx = *idx;
+  if (buf[old_idx] > 0)
+    --buf[old_idx];
+  *idx = new_idx;
+  ++buf[new_idx];
+}
+
 #endif  // VP9_COMMON_VP9_ONYXC_INT_H_
diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c
index 76ae0b36b..41a4e000b 100644
--- a/vp9/common/vp9_pred_common.c
+++ b/vp9/common/vp9_pred_common.c
@@ -29,14 +29,16 @@ unsigned char vp9_get_pred_context(const VP9_COMMON *const cm,
   // The prediction flags in these dummy entries are initialised to 0.
   switch (pred_id) {
     case PRED_SEG_ID:
-      pred_context = (m - 1)->mbmi.seg_id_predicted +
-                     (m - cm->mode_info_stride)->mbmi.seg_id_predicted;
+      pred_context = (m - cm->mode_info_stride)->mbmi.seg_id_predicted;
+      if (xd->left_available)
+        pred_context += (m - 1)->mbmi.seg_id_predicted;
       break;
 
 
     case PRED_REF:
-      pred_context = (m - 1)->mbmi.ref_predicted +
-                     (m - cm->mode_info_stride)->mbmi.ref_predicted;
+      pred_context = (m - cm->mode_info_stride)->mbmi.ref_predicted;
+      if (xd->left_available)
+        pred_context += (m - 1)->mbmi.ref_predicted;
       break;
 
     case PRED_COMP:
@@ -61,13 +63,14 @@ unsigned char vp9_get_pred_context(const VP9_COMMON *const cm,
       break;
 
     case PRED_MBSKIP:
-      pred_context = (m - 1)->mbmi.mb_skip_coeff +
-                     (m - cm->mode_info_stride)->mbmi.mb_skip_coeff;
+      pred_context = (m - cm->mode_info_stride)->mbmi.mb_skip_coeff;
+      if (xd->left_available)
+        pred_context += (m - 1)->mbmi.mb_skip_coeff;
       break;
 
     case PRED_SWITCHABLE_INTERP:
       {
-        int left_in_image = (m - 1)->mbmi.mb_in_image;
+        int left_in_image = xd->left_available && (m - 1)->mbmi.mb_in_image;
         int above_in_image = (m - cm->mode_info_stride)->mbmi.mb_in_image;
         int left_mode = (m - 1)->mbmi.mode;
         int above_mode = (m - cm->mode_info_stride)->mbmi.mode;
@@ -355,9 +358,10 @@ MV_REFERENCE_FRAME vp9_get_pred_ref(const VP9_COMMON *const cm,
   above_left = (m - 1 - cm->mode_info_stride)->mbmi.ref_frame;
 
   // Are neighbours in image
-  left_in_image = (m - 1)->mbmi.mb_in_image;
+  left_in_image = (m - 1)->mbmi.mb_in_image && xd->left_available;
   above_in_image = (m - cm->mode_info_stride)->mbmi.mb_in_image;
-  above_left_in_image = (m - 1 - cm->mode_info_stride)->mbmi.mb_in_image;
+  above_left_in_image = (m - 1 - cm->mode_info_stride)->mbmi.mb_in_image &&
+                        xd->left_available;
 
   // Adjust scores for candidate reference frames based on neigbours
   if (frame_allowed[left] && left_in_image) {
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 20de7b7f1..30e8951af 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -8,66 +8,130 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <assert.h>
 
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_filter.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
 
+void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
+                                       YV12_BUFFER_CONFIG *other,
+                                       int this_w, int this_h) {
+  const int other_w = other->y_width;
+  const int other_h = other->y_height;
+  int x_unscaled, y_unscaled;
+
+  // num / den is the other-to-this size ratio per axis; step_q4 is that
+  // ratio in 1/16 units, and a step of exactly 16 is treated as unscaled.
+  scale->x_num = other_w;
+  scale->x_den = this_w;
+  scale->x_step_q4 = 16 * other_w / this_w;
+  scale->x_offset_q4 = 0;  // calculated per-mb
+  scale->y_num = other_h;
+  scale->y_den = this_h;
+  scale->y_step_q4 = 16 * other_h / this_h;
+  scale->y_offset_q4 = 0;  // calculated per-mb
+  x_unscaled = scale->x_step_q4 == 16;
+  y_unscaled = scale->y_step_q4 == 16;
+
+  // TODO(agrange): Investigate the best choice of functions to use here
+  // for EIGHTTAP_SMOOTH. Since it is not interpolating, need to choose what
+  // to do at full-pel offsets. The current selection, where the filter is
+  // applied in one direction only, and not at all for 0,0, seems to give the
+  // best quality, but it may be worth trying an additional mode that does
+  // do the filtering on full-pel.
+  if (x_unscaled && y_unscaled) {
+    // No scaling in either direction.
+    scale->predict[0][0][0] = vp9_convolve_copy;
+    scale->predict[0][0][1] = vp9_convolve_avg;
+    scale->predict[0][1][0] = vp9_convolve8_vert;
+    scale->predict[0][1][1] = vp9_convolve8_avg_vert;
+    scale->predict[1][0][0] = vp9_convolve8_horiz;
+    scale->predict[1][0][1] = vp9_convolve8_avg_horiz;
+  } else if (x_unscaled) {
+    // No scaling in x direction. Must always scale in the y direction.
+    scale->predict[0][0][0] = vp9_convolve8_vert;
+    scale->predict[0][0][1] = vp9_convolve8_avg_vert;
+    scale->predict[0][1][0] = vp9_convolve8_vert;
+    scale->predict[0][1][1] = vp9_convolve8_avg_vert;
+    scale->predict[1][0][0] = vp9_convolve8;
+    scale->predict[1][0][1] = vp9_convolve8_avg;
+  } else if (y_unscaled) {
+    // No scaling in the y direction. Must always scale in the x direction.
+    scale->predict[0][0][0] = vp9_convolve8_horiz;
+    scale->predict[0][0][1] = vp9_convolve8_avg_horiz;
+    scale->predict[0][1][0] = vp9_convolve8;
+    scale->predict[0][1][1] = vp9_convolve8_avg;
+    scale->predict[1][0][0] = vp9_convolve8_horiz;
+    scale->predict[1][0][1] = vp9_convolve8_avg_horiz;
+  } else {
+    // Must always scale in both directions.
+    scale->predict[0][0][0] = vp9_convolve8;
+    scale->predict[0][0][1] = vp9_convolve8_avg;
+    scale->predict[0][1][0] = vp9_convolve8;
+    scale->predict[0][1][1] = vp9_convolve8_avg;
+    scale->predict[1][0][0] = vp9_convolve8;
+    scale->predict[1][0][1] = vp9_convolve8_avg;
+  }
+  // 2D subpel motion always gets filtered in both directions
+  scale->predict[1][1][0] = vp9_convolve8;
+  scale->predict[1][1][1] = vp9_convolve8_avg;
+}
+
 void vp9_setup_interp_filters(MACROBLOCKD *xd,
                               INTERPOLATIONFILTERTYPE mcomp_filter_type,
                               VP9_COMMON *cm) {
-#if CONFIG_ENABLE_6TAP
-  if (mcomp_filter_type == SIXTAP) {
-    xd->subpixel_predict4x4     = vp9_sixtap_predict4x4;
-    xd->subpixel_predict8x4     = vp9_sixtap_predict8x4;
-    xd->subpixel_predict8x8     = vp9_sixtap_predict8x8;
-    xd->subpixel_predict16x16   = vp9_sixtap_predict16x16;
-    xd->subpixel_predict_avg4x4 = vp9_sixtap_predict_avg4x4;
-    xd->subpixel_predict_avg8x8 = vp9_sixtap_predict_avg8x8;
-    xd->subpixel_predict_avg16x16 = vp9_sixtap_predict_avg16x16;
-  } else {
-#endif
-  if (mcomp_filter_type == EIGHTTAP || mcomp_filter_type == SWITCHABLE) {
-    xd->subpixel_predict4x4     = vp9_eighttap_predict4x4;
-    xd->subpixel_predict8x4     = vp9_eighttap_predict8x4;
-    xd->subpixel_predict8x8     = vp9_eighttap_predict8x8;
-    xd->subpixel_predict16x16   = vp9_eighttap_predict16x16;
-    xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4;
-    xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8;
-    xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16;
-  } else if (mcomp_filter_type == EIGHTTAP_SMOOTH) {
-    xd->subpixel_predict4x4     = vp9_eighttap_predict4x4_smooth;
-    xd->subpixel_predict8x4     = vp9_eighttap_predict8x4_smooth;
-    xd->subpixel_predict8x8     = vp9_eighttap_predict8x8_smooth;
-    xd->subpixel_predict16x16   = vp9_eighttap_predict16x16_smooth;
-    xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4_smooth;
-    xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8_smooth;
-    xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16_smooth;
-  } else if (mcomp_filter_type == EIGHTTAP_SHARP) {
-    xd->subpixel_predict4x4     = vp9_eighttap_predict4x4_sharp;
-    xd->subpixel_predict8x4     = vp9_eighttap_predict8x4_sharp;
-    xd->subpixel_predict8x8     = vp9_eighttap_predict8x8_sharp;
-    xd->subpixel_predict16x16   = vp9_eighttap_predict16x16_sharp;
-    xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4_sharp;
-    xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8_sharp;
-    xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16_sharp_c;
-  } else {
-    xd->subpixel_predict4x4     = vp9_bilinear_predict4x4;
-    xd->subpixel_predict8x4     = vp9_bilinear_predict8x4;
-    xd->subpixel_predict8x8     = vp9_bilinear_predict8x8;
-    xd->subpixel_predict16x16   = vp9_bilinear_predict16x16;
-    xd->subpixel_predict_avg4x4 = vp9_bilinear_predict_avg4x4;
-    xd->subpixel_predict_avg8x8 = vp9_bilinear_predict_avg8x8;
-    xd->subpixel_predict_avg16x16 = vp9_bilinear_predict_avg16x16;
+  int i;  // index over the 3 active reference slots
+
+  /* Calculate scaling factors for each of the 3 available references */
+  for (i = 0; i < 3; ++i) {
+    if (cm->active_ref_idx[i] >= NUM_YV12_BUFFERS) {
+      memset(&cm->active_ref_scale[i], 0, sizeof(cm->active_ref_scale[i]));
+      continue;
+    }
+
+    vp9_setup_scale_factors_for_frame(&cm->active_ref_scale[i],
+                                      &cm->yv12_fb[cm->active_ref_idx[i]],
+                                      cm->mb_cols * 16, cm->mb_rows * 16);
   }
-#if CONFIG_ENABLE_6TAP
+
+  if (xd->mode_info_context) {
+    MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+
+    set_scale_factors(xd,
+                      mbmi->ref_frame - 1,
+                      mbmi->second_ref_frame - 1,
+                      cm->active_ref_scale);
   }
+
+  // NOTE(review): no default case; an unlisted type leaves filter_x/y as is.
+  switch (mcomp_filter_type) {
+    case EIGHTTAP:
+    case SWITCHABLE:
+      xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8;
+      break;
+    case EIGHTTAP_SMOOTH:
+      xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8lp;
+      break;
+    case EIGHTTAP_SHARP:
+      xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8s;
+      break;
+    case BILINEAR:
+      xd->subpix.filter_x = xd->subpix.filter_y = vp9_bilinear_filters;
+      break;
+#if CONFIG_ENABLE_6TAP
+    case SIXTAP:
+      xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_6;
+      break;
 #endif
+  }
+  assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0);
 }
 
-void vp9_copy_mem16x16_c(uint8_t *src,
+void vp9_copy_mem16x16_c(const uint8_t *src,
                          int src_stride,
                          uint8_t *dst,
                          int dst_stride) {
@@ -93,10 +157,10 @@ void vp9_copy_mem16x16_c(uint8_t *src,
     dst[15] = src[15];
 
 #else
-    ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
-    ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
-    ((uint32_t *)dst)[2] = ((uint32_t *)src)[2];
-    ((uint32_t *)dst)[3] = ((uint32_t *)src)[3];
+    ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];
+    ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];
+    ((uint32_t *)dst)[2] = ((const uint32_t *)src)[2];
+    ((uint32_t *)dst)[3] = ((const uint32_t *)src)[3];
 
 #endif
     src += src_stride;
@@ -104,25 +168,7 @@ void vp9_copy_mem16x16_c(uint8_t *src,
   }
 }
 
-void vp9_avg_mem16x16_c(uint8_t *src,
-                        int src_stride,
-                        uint8_t *dst,
-                        int dst_stride) {
-  int r;
-
-  for (r = 0; r < 16; r++) {
-    int n;
-
-    for (n = 0; n < 16; n++) {
-      dst[n] = (dst[n] + src[n] + 1) >> 1;
-    }
-
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void vp9_copy_mem8x8_c(uint8_t *src,
+void vp9_copy_mem8x8_c(const uint8_t *src,
                        int src_stride,
                        uint8_t *dst,
                        int dst_stride) {
@@ -139,33 +185,15 @@ void vp9_copy_mem8x8_c(uint8_t *src,
     dst[6] = src[6];
     dst[7] = src[7];
 #else
-    ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
-    ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
+    ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];
+    ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];
 #endif
     src += src_stride;
     dst += dst_stride;
   }
 }
 
-void vp9_avg_mem8x8_c(uint8_t *src,
-                      int src_stride,
-                      uint8_t *dst,
-                      int dst_stride) {
-  int r;
-
-  for (r = 0; r < 8; r++) {
-    int n;
-
-    for (n = 0; n < 8; n++) {
-      dst[n] = (dst[n] + src[n] + 1) >> 1;
-    }
-
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void vp9_copy_mem8x4_c(uint8_t *src,
+void vp9_copy_mem8x4_c(const uint8_t *src,
                        int src_stride,
                        uint8_t *dst,
                        int dst_stride) {
@@ -182,153 +210,159 @@ void vp9_copy_mem8x4_c(uint8_t *src,
     dst[6] = src[6];
     dst[7] = src[7];
 #else
-    ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
-    ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
+    ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];
+    ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];
 #endif
     src += src_stride;
     dst += dst_stride;
   }
 }
 
-void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, vp9_subpix_fn_t sppf) {
-  int r;
-  uint8_t *ptr_base;
-  uint8_t *ptr;
-  uint8_t *pred_ptr = d->predictor;
-  int_mv mv;
-
-  ptr_base = *(d->base_pre);
-  mv.as_int = d->bmi.as_mv.first.as_int;
-
-  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
-    ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
-          (mv.as_mv.col >> 3);
-    sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1,
-         pred_ptr, pitch);
-  } else {
-    ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
-                (mv.as_mv.col >> 3);
-    ptr = ptr_base;
+static void set_scaled_offsets(struct scale_factors *scale,
+                               int row, int col) {
+  const int scaled_x_q4 = 16 * col * scale->x_num / scale->x_den;
+  const int scaled_y_q4 = 16 * row * scale->y_num / scale->y_den;
 
-    for (r = 0; r < 4; r++) {
-#if !(CONFIG_FAST_UNALIGNED)
-      pred_ptr[0]  = ptr[0];
-      pred_ptr[1]  = ptr[1];
-      pred_ptr[2]  = ptr[2];
-      pred_ptr[3]  = ptr[3];
-#else
-      *(uint32_t *)pred_ptr = *(uint32_t *)ptr;
-#endif
-      pred_ptr     += pitch;
-      ptr         += d->pre_stride;
-    }
-  }
+  scale->x_offset_q4 = scaled_x_q4 & 0xf;  // keep only the low 4 bits
+  scale->y_offset_q4 = scaled_y_q4 & 0xf;
 }
 
-/*
- * Similar to vp9_build_inter_predictors_b(), but instead of storing the
- * results in d->predictor, we average the contents of d->predictor (which
- * come from an earlier call to vp9_build_inter_predictors_b()) with the
- * predictor of the second reference frame / motion vector.
- */
-void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,
-                                      vp9_subpix_fn_t sppf) {
-  int r;
-  uint8_t *ptr_base;
-  uint8_t *ptr;
-  uint8_t *pred_ptr = d->predictor;
-  int_mv mv;
-
-  ptr_base = *(d->base_second_pre);
-  mv.as_int = d->bmi.as_mv.second.as_int;
-
-  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
-    ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
-          (mv.as_mv.col >> 3);
-    sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1,
-         pred_ptr, pitch);
-  } else {
-    ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
-                (mv.as_mv.col >> 3);
-    ptr = ptr_base;
-
-    for (r = 0; r < 4; r++) {
-      pred_ptr[0]  = (pred_ptr[0] + ptr[0] + 1) >> 1;
-      pred_ptr[1]  = (pred_ptr[1] + ptr[1] + 1) >> 1;
-      pred_ptr[2]  = (pred_ptr[2] + ptr[2] + 1) >> 1;
-      pred_ptr[3]  = (pred_ptr[3] + ptr[3] + 1) >> 1;
-      pred_ptr    += pitch;
-      ptr         += d->pre_stride;
-    }
-  }
+static int32_t scale_motion_vector_component_q3(int mv_q3,
+                                                int num,
+                                                int den,
+                                                int offset_q4) {
+  // Returns mv_q3 converted to q4, scaled by num / den, plus offset_q4.
+  // Multiply rather than shift: left-shifting a negative value is UB.
+  const int32_t mv_q4 = mv_q3 * 2;
+  /* TODO(jkoleszar): make fixed point, or as a second multiply? */
+  return mv_q4 * num / den + offset_q4;
 }
 
-void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
-  uint8_t *ptr_base;
-  uint8_t *ptr;
-  uint8_t *pred_ptr = d->predictor;
-  int_mv mv;
+static int32_t scale_motion_vector_component_q4(int mv_q4,
+                                                int num,
+                                                int den,
+                                                int offset_q4) {
+  const int scaled_q4 = mv_q4 * num / den;  // integer scale by num / den
 
-  ptr_base = *(d->base_pre);
-  mv.as_int = d->bmi.as_mv.first.as_int;
-  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
-        (mv.as_mv.col >> 3);
+  /* TODO(jkoleszar): make fixed point, or as a second multiply? */
+  return scaled_q4 + offset_q4;
+}
 
-  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
-    xd->subpixel_predict8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,
-                            (mv.as_mv.row & 7) << 1, pred_ptr, pitch);
-  } else {
-    vp9_copy_mem8x8(ptr, d->pre_stride, pred_ptr, pitch);
-  }
+static int_mv32 scale_motion_vector_q3_to_q4(
+    const int_mv *src_mv,
+    const struct scale_factors *scale) {
+  // Produces src_mv * scale + offset per axis: rows use the y factors and
+  // columns the x factors of scale.
+  const int32_t row_q4 = scale_motion_vector_component_q3(
+      src_mv->as_mv.row, scale->y_num, scale->y_den, scale->y_offset_q4);
+  const int32_t col_q4 = scale_motion_vector_component_q3(
+      src_mv->as_mv.col, scale->x_num, scale->x_den, scale->x_offset_q4);
+  int_mv32 scaled;
+
+  scaled.as_mv.row = row_q4;
+  scaled.as_mv.col = col_q4;
+
+  return scaled;
 }
 
-/*
- * Similar to build_inter_predictors_4b(), but instead of storing the
- * results in d->predictor, we average the contents of d->predictor (which
- * come from an earlier call to build_inter_predictors_4b()) with the
- * predictor of the second reference frame / motion vector.
+void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
+                               uint8_t *dst, int dst_stride,
+                               const int_mv *mv_q3,
+                               const struct scale_factors *scale,
+                               int w, int h, int do_avg,
+                               const struct subpix_fn_table *subpix) {
+  // Scale mv to q4, advance src by its whole pels, filter by the rest.
+  const int_mv32 mv = scale_motion_vector_q3_to_q4(mv_q3, scale);
+  const int subpel_x = mv.as_mv.col & 15;
+  const int subpel_y = mv.as_mv.row & 15;
+
+  src = src + (mv.as_mv.row >> 4) * src_stride + (mv.as_mv.col >> 4);
+  scale->predict[!!subpel_x][!!subpel_y][do_avg](
+      src, src_stride, dst, dst_stride,
+      subpix->filter_x[subpel_x], scale->x_step_q4,
+      subpix->filter_y[subpel_y], scale->y_step_q4, w, h);
+}
+
+/* Like vp9_build_inter_predictor, but takes the full-pel part of the
+ * mv separately, and the fractional part as a q4.
  */
-void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd,
-                                      BLOCKD *d, int pitch) {
-  uint8_t *ptr_base;
-  uint8_t *ptr;
-  uint8_t *pred_ptr = d->predictor;
-  int_mv mv;
-
-  ptr_base = *(d->base_second_pre);
-  mv.as_int = d->bmi.as_mv.second.as_int;
-  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
-        (mv.as_mv.col >> 3);
-
-  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
-    xd->subpixel_predict_avg8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,
-                               (mv.as_mv.row & 7) << 1, pred_ptr, pitch);
-  } else {
-    vp9_avg_mem8x8(ptr, d->pre_stride, pred_ptr, pitch);
-  }
+void vp9_build_inter_predictor_q4(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride,
+                                  const int_mv *fullpel_mv_q3,
+                                  const int_mv *frac_mv_q4,
+                                  const struct scale_factors *scale,
+                                  int w, int h, int do_avg,
+                                  const struct subpix_fn_table *subpix) {
+  // Rebuild a q4 vector: whole pels come from fullpel_mv_q3 and the low
+  // four fractional bits from frac_mv_q4. The whole-pel part is multiplied
+  // by 16 instead of shifted, as left-shifting a negative value is UB.
+  const int mv_row_q4 = (fullpel_mv_q3->as_mv.row >> 3) * 16
+                        + (frac_mv_q4->as_mv.row & 0xf);
+  const int mv_col_q4 = (fullpel_mv_q3->as_mv.col >> 3) * 16
+                        + (frac_mv_q4->as_mv.col & 0xf);
+  const int scaled_mv_row_q4 = scale_motion_vector_component_q4(
+      mv_row_q4, scale->y_num, scale->y_den, scale->y_offset_q4);
+  const int scaled_mv_col_q4 = scale_motion_vector_component_q4(
+      mv_col_q4, scale->x_num, scale->x_den, scale->x_offset_q4);
+  const int subpel_x = scaled_mv_col_q4 & 15;
+  const int subpel_y = scaled_mv_row_q4 & 15;
+
+  src = src + (scaled_mv_row_q4 >> 4) * src_stride + (scaled_mv_col_q4 >> 4);
+  scale->predict[!!subpel_x][!!subpel_y][do_avg](
+      src, src_stride, dst, dst_stride,
+      subpix->filter_x[subpel_x], scale->x_step_q4,
+      subpix->filter_y[subpel_y], scale->y_step_q4, w, h);
 }
 
-static void build_inter_predictors2b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
-  uint8_t *ptr_base;
-  uint8_t *ptr;
-  uint8_t *pred_ptr = d->predictor;
-  int_mv mv;
+static void build_2x1_inter_predictor(const BLOCKD *d0, const BLOCKD *d1,
+                                      struct scale_factors *scale,
+                                      int block_size, int stride, int which_mv,
+                                      const struct subpix_fn_table *subpix,
+                                      int row, int col) {
+  assert(d1->predictor - d0->predictor == block_size);
+  assert(d1->pre == d0->pre + block_size);  // d1 sits right after d0
 
-  ptr_base = *(d->base_pre);
-  mv.as_int = d->bmi.as_mv.first.as_int;
-  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
-        (mv.as_mv.col >> 3);
+  set_scaled_offsets(&scale[which_mv], row, col);
+  // Equal vectors: one call covers both blocks at twice the width.
+  if (d0->bmi.as_mv[which_mv].as_int == d1->bmi.as_mv[which_mv].as_int) {
+    uint8_t **base_pre = which_mv ? d0->base_second_pre : d0->base_pre;
+    // which_mv doubles as the do_avg argument of the predictor call.
+    vp9_build_inter_predictor(*base_pre + d0->pre,
+                              d0->pre_stride,
+                              d0->predictor, stride,
+                              &d0->bmi.as_mv[which_mv],
+                              &scale[which_mv],
+                              2 * block_size, block_size, which_mv,
+                              subpix);
 
-  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
-    xd->subpixel_predict8x4(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,
-                           (mv.as_mv.row & 7) << 1, pred_ptr, pitch);
   } else {
-    vp9_copy_mem8x4(ptr, d->pre_stride, pred_ptr, pitch);
+    uint8_t **base_pre0 = which_mv ? d0->base_second_pre : d0->base_pre;
+    uint8_t **base_pre1 = which_mv ? d1->base_second_pre : d1->base_pre;
+
+    vp9_build_inter_predictor(*base_pre0 + d0->pre,
+                              d0->pre_stride,
+                              d0->predictor, stride,
+                              &d0->bmi.as_mv[which_mv],
+                              &scale[which_mv],
+                              block_size, block_size, which_mv,
+                              subpix);
+    // Second block: recompute the offsets block_size columns further on.
+    set_scaled_offsets(&scale[which_mv], row, col + block_size);
+
+    vp9_build_inter_predictor(*base_pre1 + d1->pre,
+                              d1->pre_stride,
+                              d1->predictor, stride,
+                              &d1->bmi.as_mv[which_mv],
+                              &scale[which_mv],
+                              block_size, block_size, which_mv,
+                              subpix);
   }
 }
 
 /*encoder only*/
-void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd) {
+void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd,
+                                        int mb_row,
+                                        int mb_col) {
   int i, j;
   BLOCKD *blockd = xd->block;
 
@@ -340,38 +374,38 @@ void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd) {
       int voffset = 20 + i * 2 + j;
       int temp;
 
-      temp = blockd[yoffset  ].bmi.as_mv.first.as_mv.row
-             + blockd[yoffset + 1].bmi.as_mv.first.as_mv.row
-             + blockd[yoffset + 4].bmi.as_mv.first.as_mv.row
-             + blockd[yoffset + 5].bmi.as_mv.first.as_mv.row;
+      temp = blockd[yoffset  ].bmi.as_mv[0].as_mv.row
+             + blockd[yoffset + 1].bmi.as_mv[0].as_mv.row
+             + blockd[yoffset + 4].bmi.as_mv[0].as_mv.row
+             + blockd[yoffset + 5].bmi.as_mv[0].as_mv.row;
 
       if (temp < 0) temp -= 4;
       else temp += 4;
 
-      xd->block[uoffset].bmi.as_mv.first.as_mv.row = (temp / 8) &
+      xd->block[uoffset].bmi.as_mv[0].as_mv.row = (temp / 8) &
         xd->fullpixel_mask;
 
-      temp = blockd[yoffset  ].bmi.as_mv.first.as_mv.col
-             + blockd[yoffset + 1].bmi.as_mv.first.as_mv.col
-             + blockd[yoffset + 4].bmi.as_mv.first.as_mv.col
-             + blockd[yoffset + 5].bmi.as_mv.first.as_mv.col;
+      temp = blockd[yoffset  ].bmi.as_mv[0].as_mv.col
+             + blockd[yoffset + 1].bmi.as_mv[0].as_mv.col
+             + blockd[yoffset + 4].bmi.as_mv[0].as_mv.col
+             + blockd[yoffset + 5].bmi.as_mv[0].as_mv.col;
 
       if (temp < 0) temp -= 4;
       else temp += 4;
 
-      blockd[uoffset].bmi.as_mv.first.as_mv.col = (temp / 8) &
+      blockd[uoffset].bmi.as_mv[0].as_mv.col = (temp / 8) &
         xd->fullpixel_mask;
 
-      blockd[voffset].bmi.as_mv.first.as_mv.row =
-        blockd[uoffset].bmi.as_mv.first.as_mv.row;
-      blockd[voffset].bmi.as_mv.first.as_mv.col =
-        blockd[uoffset].bmi.as_mv.first.as_mv.col;
+      blockd[voffset].bmi.as_mv[0].as_mv.row =
+        blockd[uoffset].bmi.as_mv[0].as_mv.row;
+      blockd[voffset].bmi.as_mv[0].as_mv.col =
+        blockd[uoffset].bmi.as_mv[0].as_mv.col;
 
       if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-        temp = blockd[yoffset  ].bmi.as_mv.second.as_mv.row
-               + blockd[yoffset + 1].bmi.as_mv.second.as_mv.row
-               + blockd[yoffset + 4].bmi.as_mv.second.as_mv.row
-               + blockd[yoffset + 5].bmi.as_mv.second.as_mv.row;
+        temp = blockd[yoffset  ].bmi.as_mv[1].as_mv.row
+               + blockd[yoffset + 1].bmi.as_mv[1].as_mv.row
+               + blockd[yoffset + 4].bmi.as_mv[1].as_mv.row
+               + blockd[yoffset + 5].bmi.as_mv[1].as_mv.row;
 
         if (temp < 0) {
           temp -= 4;
@@ -379,13 +413,13 @@ void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd) {
           temp += 4;
         }
 
-        blockd[uoffset].bmi.as_mv.second.as_mv.row = (temp / 8) &
+        blockd[uoffset].bmi.as_mv[1].as_mv.row = (temp / 8) &
           xd->fullpixel_mask;
 
-        temp = blockd[yoffset  ].bmi.as_mv.second.as_mv.col
-               + blockd[yoffset + 1].bmi.as_mv.second.as_mv.col
-               + blockd[yoffset + 4].bmi.as_mv.second.as_mv.col
-               + blockd[yoffset + 5].bmi.as_mv.second.as_mv.col;
+        temp = blockd[yoffset  ].bmi.as_mv[1].as_mv.col
+               + blockd[yoffset + 1].bmi.as_mv[1].as_mv.col
+               + blockd[yoffset + 4].bmi.as_mv[1].as_mv.col
+               + blockd[yoffset + 5].bmi.as_mv[1].as_mv.col;
 
         if (temp < 0) {
           temp -= 4;
@@ -393,31 +427,29 @@ void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd) {
           temp += 4;
         }
 
-        blockd[uoffset].bmi.as_mv.second.as_mv.col = (temp / 8) &
+        blockd[uoffset].bmi.as_mv[1].as_mv.col = (temp / 8) &
           xd->fullpixel_mask;
 
-        blockd[voffset].bmi.as_mv.second.as_mv.row =
-          blockd[uoffset].bmi.as_mv.second.as_mv.row;
-        blockd[voffset].bmi.as_mv.second.as_mv.col =
-          blockd[uoffset].bmi.as_mv.second.as_mv.col;
+        blockd[voffset].bmi.as_mv[1].as_mv.row =
+          blockd[uoffset].bmi.as_mv[1].as_mv.row;
+        blockd[voffset].bmi.as_mv[1].as_mv.col =
+          blockd[uoffset].bmi.as_mv[1].as_mv.col;
       }
     }
   }
 
   for (i = 16; i < 24; i += 2) {
+    const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0;
+    const int x = 4 * (i & 1);
+    const int y = ((i - 16) >> 1) * 4;
+
+    int which_mv;
     BLOCKD *d0 = &blockd[i];
     BLOCKD *d1 = &blockd[i + 1];
 
-    if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
-      build_inter_predictors2b(xd, d0, 8);
-    else {
-      vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict4x4);
-      vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict4x4);
-    }
-
-    if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-      vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg4x4);
-      vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg4x4);
+    for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
+      build_2x1_inter_predictor(d0, d1, xd->scale_factor_uv, 4, 8, which_mv,
+                                &xd->subpix, mb_row * 8 + y, mb_col * 8 + x);
     }
   }
 }
@@ -459,90 +491,100 @@ static void clamp_uvmv_to_umv_border(MV *mv, const MACROBLOCKD *xd) {
 }
 
 /*encoder only*/
-void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,
-                                             uint8_t *dst_y,
-                                             int dst_ystride,
-                                             int clamp_mvs) {
-  uint8_t *ptr_base = xd->pre.y_buffer;
-  uint8_t *ptr;
-  int pre_stride = xd->block[0].pre_stride;
-  int_mv ymv;
-
-  ymv.as_int = xd->mode_info_context->mbmi.mv[0].as_int;
-
-  if (clamp_mvs)
-    clamp_mv_to_umv_border(&ymv.as_mv, xd);
-
-  ptr = ptr_base + (ymv.as_mv.row >> 3) * pre_stride + (ymv.as_mv.col >> 3);
-
-    if ((ymv.as_mv.row | ymv.as_mv.col) & 7) {
-      xd->subpixel_predict16x16(ptr, pre_stride,
-                                (ymv.as_mv.col & 7) << 1,
-                                (ymv.as_mv.row & 7) << 1,
-                                dst_y, dst_ystride);
-    } else {
-      vp9_copy_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
-    }
-}
-
-void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                              uint8_t *dst_u,
-                                              uint8_t *dst_v,
-                                              int dst_uvstride) {
-  int offset;
-  uint8_t *uptr, *vptr;
-  int pre_stride = xd->block[0].pre_stride;
-  int_mv _o16x16mv;
-  int_mv _16x16mv;
-
-  _16x16mv.as_int = xd->mode_info_context->mbmi.mv[0].as_int;
-
-  if (xd->mode_info_context->mbmi.need_to_clamp_mvs)
-    clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);
-
-  _o16x16mv = _16x16mv;
-  /* calc uv motion vectors */
-  if (_16x16mv.as_mv.row < 0)
-    _16x16mv.as_mv.row -= 1;
-  else
-    _16x16mv.as_mv.row += 1;
-
-  if (_16x16mv.as_mv.col < 0)
-    _16x16mv.as_mv.col -= 1;
-  else
-    _16x16mv.as_mv.col += 1;
-
-  _16x16mv.as_mv.row /= 2;
-  _16x16mv.as_mv.col /= 2;
-
-  _16x16mv.as_mv.row &= xd->fullpixel_mask;
-  _16x16mv.as_mv.col &= xd->fullpixel_mask;
-
-  pre_stride >>= 1;
-  offset = (_16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3);
-  uptr = xd->pre.u_buffer + offset;
-  vptr = xd->pre.v_buffer + offset;
-
-    if (_o16x16mv.as_int & 0x000f000f) {
-      xd->subpixel_predict8x8(uptr, pre_stride, _o16x16mv.as_mv.col & 15,
-                              _o16x16mv.as_mv.row & 15, dst_u, dst_uvstride);
-      xd->subpixel_predict8x8(vptr, pre_stride, _o16x16mv.as_mv.col & 15,
-                              _o16x16mv.as_mv.row & 15, dst_v, dst_uvstride);
-    } else {
-      vp9_copy_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);
-      vp9_copy_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);
-    }
+/* Builds the 16x16 luma inter prediction into dst_y. mv[0] predicts from
+ * xd->pre; if second_ref_frame > 0, mv[1] predicts from xd->second_pre. */
+void vp9_build_inter16x16_predictors_mby(MACROBLOCKD *xd,
+                                         uint8_t *dst_y, int dst_ystride,
+                                         int mb_row, int mb_col) {
+  const MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+  const int num_refs = mbmi->second_ref_frame > 0 ? 2 : 1;
+  int ref;
+
+  for (ref = 0; ref < num_refs; ++ref) {
+    uint8_t *src;
+    int src_stride, clamp;
+    int_mv mv;
+
+    if (ref) {
+      src = xd->second_pre.y_buffer;
+      src_stride = xd->second_pre.y_stride;
+      clamp = mbmi->need_to_clamp_secondmv;
+    } else {
+      src = xd->pre.y_buffer;
+      src_stride = xd->pre.y_stride;
+      clamp = mbmi->need_to_clamp_mvs;
+    }
+    mv.as_int = mbmi->mv[ref].as_int;
+    if (clamp)
+      clamp_mv_to_umv_border(&mv.as_mv, xd);
+    set_scaled_offsets(&xd->scale_factor[ref], mb_row * 16, mb_col * 16);
+    vp9_build_inter_predictor(src, src_stride, dst_y, dst_ystride,
+                              &mv, &xd->scale_factor[ref],
+                              16, 16, ref, &xd->subpix);
+  }
 }
 
-
-void vp9_build_1st_inter16x16_predictors_mb(MACROBLOCKD *xd,
-                                            uint8_t *dst_y,
-                                            uint8_t *dst_u,
-                                            uint8_t *dst_v,
-                                            int dst_ystride, int dst_uvstride) {
-  vp9_build_1st_inter16x16_predictors_mby(xd, dst_y, dst_ystride,
-      xd->mode_info_context->mbmi.need_to_clamp_mvs);
-  vp9_build_1st_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride);
+/* Builds the two 8x8 chroma inter predictions for the current macroblock,
+ * writing U into dst_u and V into dst_v (both with stride dst_uvstride).
+ * One pass is made for mv[0] from xd->pre and, when second_ref_frame > 0,
+ * another for mv[1] from xd->second_pre. Each pass hands both the derived
+ * chroma vector and the luma vector to vp9_build_inter_predictor_q4. */
+void vp9_build_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
+                                          uint8_t *dst_u, uint8_t *dst_v,
+                                          int dst_uvstride,
+                                          int mb_row, int mb_col) {
+  const MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+  const int num_refs = mbmi->second_ref_frame > 0 ? 2 : 1;
+  int ref;
+
+  for (ref = 0; ref < num_refs; ++ref) {
+    uint8_t *src_u, *src_v;
+    int uv_stride;
+    int clamp;
+    int_mv luma_mv;  /* luma vector after optional clamping */
+    int_mv uv_mv;    /* chroma vector derived from luma_mv */
+
+    if (ref) {
+      src_u = xd->second_pre.u_buffer;
+      src_v = xd->second_pre.v_buffer;
+      uv_stride = xd->second_pre.y_stride >> 1;
+      clamp = mbmi->need_to_clamp_secondmv;
+    } else {
+      src_u = xd->pre.u_buffer;
+      src_v = xd->pre.v_buffer;
+      uv_stride = xd->pre.y_stride >> 1;
+      clamp = mbmi->need_to_clamp_mvs;
+    }
+
+    luma_mv.as_int = mbmi->mv[ref].as_int;
+    if (clamp)
+      clamp_mv_to_umv_border(&luma_mv.as_mv, xd);
+
+    /* Chroma vector: move each component one unit away from zero, halve
+     * it (C integer division truncates toward zero), then apply the mask. */
+    uv_mv = luma_mv;
+    uv_mv.as_mv.row += (uv_mv.as_mv.row < 0) ? -1 : 1;
+    uv_mv.as_mv.col += (uv_mv.as_mv.col < 0) ? -1 : 1;
+    uv_mv.as_mv.row /= 2;
+    uv_mv.as_mv.col /= 2;
+    uv_mv.as_mv.row &= xd->fullpixel_mask;
+    uv_mv.as_mv.col &= xd->fullpixel_mask;
+
+    /* NOTE(review): the 4x4 chroma paths in this file pass mb_row * 8 and
+     * mb_col * 8 alongside scale_factor_uv; here the multiplier is 16.
+     * Kept as-is -- confirm which one is intended. */
+    set_scaled_offsets(&xd->scale_factor_uv[ref], mb_row * 16, mb_col * 16);
+
+    vp9_build_inter_predictor_q4(src_u, uv_stride, dst_u, dst_uvstride,
+                                 &uv_mv, &luma_mv,
+                                 &xd->scale_factor_uv[ref],
+                                 8, 8, ref, &xd->subpix);
+
+    vp9_build_inter_predictor_q4(src_v, uv_stride, dst_v, dst_uvstride,
+                                 &uv_mv, &luma_mv,
+                                 &xd->scale_factor_uv[ref],
+                                 8, 8, ref, &xd->subpix);
+  }
 }
 
 void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
@@ -550,7 +592,9 @@ void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
                                         uint8_t *dst_u,
                                         uint8_t *dst_v,
                                         int dst_ystride,
-                                        int dst_uvstride) {
+                                        int dst_uvstride,
+                                        int mb_row,
+                                        int mb_col) {
   uint8_t *y1 = x->pre.y_buffer, *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer;
   uint8_t *y2 = x->second_pre.y_buffer, *u2 = x->second_pre.u_buffer,
           *v2 = x->second_pre.v_buffer;
@@ -563,32 +607,43 @@ void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
 
   for (n = 0; n < 4; n++) {
     const int x_idx = n & 1, y_idx = n >> 1;
+    int scaled_uv_offset;
 
     x->mb_to_top_edge    = edge[0] -      ((y_idx  * 16) << 3);
     x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 16) << 3);
     x->mb_to_left_edge   = edge[2] -      ((x_idx  * 16) << 3);
     x->mb_to_right_edge  = edge[3] + (((1 - x_idx) * 16) << 3);
 
-    x->pre.y_buffer = y1 + y_idx * 16 * x->pre.y_stride  + x_idx * 16;
-    x->pre.u_buffer = u1 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;
-    x->pre.v_buffer = v1 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;
+    x->pre.y_buffer = y1 + scaled_buffer_offset(x_idx * 16,
+                                                y_idx * 16,
+                                                x->pre.y_stride,
+                                                &x->scale_factor[0]);
+    scaled_uv_offset = scaled_buffer_offset(x_idx * 8,
+                                            y_idx * 8,
+                                            x->pre.uv_stride,
+                                            &x->scale_factor_uv[0]);
+    x->pre.u_buffer = u1 + scaled_uv_offset;
+    x->pre.v_buffer = v1 + scaled_uv_offset;
 
-    vp9_build_1st_inter16x16_predictors_mb(x,
-      dst_y + y_idx * 16 * dst_ystride  + x_idx * 16,
-      dst_u + y_idx *  8 * dst_uvstride + x_idx *  8,
-      dst_v + y_idx *  8 * dst_uvstride + x_idx *  8,
-      dst_ystride, dst_uvstride);
     if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-      x->second_pre.y_buffer = y2 + y_idx * 16 * x->pre.y_stride  + x_idx * 16;
-      x->second_pre.u_buffer = u2 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;
-      x->second_pre.v_buffer = v2 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;
+      x->second_pre.y_buffer = y2 +
+          scaled_buffer_offset(x_idx * 16,
+                               y_idx * 16,
+                               x->second_pre.y_stride,
+                               &x->scale_factor[1]);
+      scaled_uv_offset = scaled_buffer_offset(x_idx * 8,
+                                              y_idx * 8,
+                                              x->second_pre.uv_stride,
+                                              &x->scale_factor_uv[1]);
+      x->second_pre.u_buffer = u2 + scaled_uv_offset;
+      x->second_pre.v_buffer = v2 + scaled_uv_offset;
+    }
 
-      vp9_build_2nd_inter16x16_predictors_mb(x,
+    vp9_build_inter16x16_predictors_mb(x,
         dst_y + y_idx * 16 * dst_ystride  + x_idx * 16,
         dst_u + y_idx *  8 * dst_uvstride + x_idx *  8,
         dst_v + y_idx *  8 * dst_uvstride + x_idx *  8,
-        dst_ystride, dst_uvstride);
-    }
+        dst_ystride, dst_uvstride, mb_row + y_idx, mb_col + x_idx);
   }
 
   x->mb_to_top_edge    = edge[0];
@@ -619,7 +674,9 @@ void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x,
                                         uint8_t *dst_u,
                                         uint8_t *dst_v,
                                         int dst_ystride,
-                                        int dst_uvstride) {
+                                        int dst_uvstride,
+                                        int mb_row,
+                                        int mb_col) {
   uint8_t *y1 = x->pre.y_buffer, *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer;
   uint8_t *y2 = x->second_pre.y_buffer, *u2 = x->second_pre.u_buffer,
           *v2 = x->second_pre.v_buffer;
@@ -632,27 +689,43 @@ void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x,
 
   for (n = 0; n < 4; n++) {
     const int x_idx = n & 1, y_idx = n >> 1;
+    int scaled_uv_offset;
 
     x->mb_to_top_edge    = edge[0] -      ((y_idx  * 32) << 3);
     x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 32) << 3);
     x->mb_to_left_edge   = edge[2] -      ((x_idx  * 32) << 3);
     x->mb_to_right_edge  = edge[3] + (((1 - x_idx) * 32) << 3);
 
-    x->pre.y_buffer = y1 + y_idx * 32 * x->pre.y_stride  + x_idx * 32;
-    x->pre.u_buffer = u1 + y_idx * 16 * x->pre.uv_stride + x_idx * 16;
-    x->pre.v_buffer = v1 + y_idx * 16 * x->pre.uv_stride + x_idx * 16;
+    x->pre.y_buffer = y1 + scaled_buffer_offset(x_idx * 32,
+                                                y_idx * 32,
+                                                x->pre.y_stride,
+                                                &x->scale_factor[0]);
+    scaled_uv_offset = scaled_buffer_offset(x_idx * 16,
+                                            y_idx * 16,
+                                            x->pre.uv_stride,
+                                            &x->scale_factor_uv[0]);
+    x->pre.u_buffer = u1 + scaled_uv_offset;
+    x->pre.v_buffer = v1 + scaled_uv_offset;
 
     if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-      x->second_pre.y_buffer = y2 + y_idx * 32 * x->pre.y_stride  + x_idx * 32;
-      x->second_pre.u_buffer = u2 + y_idx * 16 * x->pre.uv_stride + x_idx * 16;
-      x->second_pre.v_buffer = v2 + y_idx * 16 * x->pre.uv_stride + x_idx * 16;
+      x->second_pre.y_buffer = y2 +
+          scaled_buffer_offset(x_idx * 32,
+                               y_idx * 32,
+                               x->second_pre.y_stride,
+                               &x->scale_factor[1]);
+      scaled_uv_offset = scaled_buffer_offset(x_idx * 16,
+                                              y_idx * 16,
+                                              x->second_pre.uv_stride,
+                                              &x->scale_factor_uv[1]);
+      x->second_pre.u_buffer = u2 + scaled_uv_offset;
+      x->second_pre.v_buffer = v2 + scaled_uv_offset;
     }
 
     vp9_build_inter32x32_predictors_sb(x,
         dst_y + y_idx * 32 * dst_ystride  + x_idx * 32,
         dst_u + y_idx * 16 * dst_uvstride + x_idx * 16,
         dst_v + y_idx * 16 * dst_uvstride + x_idx * 16,
-        dst_ystride, dst_uvstride);
+        dst_ystride, dst_uvstride, mb_row + y_idx * 2, mb_col + x_idx * 2);
   }
 
   x->mb_to_top_edge    = edge[0];
@@ -678,170 +751,48 @@ void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x,
 #endif
 }
 
-/*
- * The following functions should be called after an initial
- * call to vp9_build_1st_inter16x16_predictors_mb() or _mby()/_mbuv().
- * It will run a second filter on a (different) ref
- * frame and average the result with the output of the
- * first filter. The second reference frame is stored
- * in x->second_pre (the reference frame index is in
- * x->mode_info_context->mbmi.second_ref_frame). The second
- * motion vector is x->mode_info_context->mbmi.second_mv.
- *
- * This allows blending prediction from two reference frames
- * which sometimes leads to better prediction than from a
- * single reference framer.
- */
-void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd,
-                                             uint8_t *dst_y,
-                                             int dst_ystride) {
-  uint8_t *ptr;
-
-  int_mv _16x16mv;
-  int mv_row;
-  int mv_col;
-
-  uint8_t *ptr_base = xd->second_pre.y_buffer;
-  int pre_stride = xd->block[0].pre_stride;
-
-  _16x16mv.as_int = xd->mode_info_context->mbmi.mv[1].as_int;
-
-  if (xd->mode_info_context->mbmi.need_to_clamp_secondmv)
-    clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);
-
-  mv_row = _16x16mv.as_mv.row;
-  mv_col = _16x16mv.as_mv.col;
-
-  ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
-
-  if ((mv_row | mv_col) & 7) {
-    xd->subpixel_predict_avg16x16(ptr, pre_stride, (mv_col & 7) << 1,
-                                  (mv_row & 7) << 1, dst_y, dst_ystride);
-  } else {
-    vp9_avg_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
-  }
-}
-
-void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                              uint8_t *dst_u,
-                                              uint8_t *dst_v,
-                                              int dst_uvstride) {
-  int offset;
-  uint8_t *uptr, *vptr;
-
-  int_mv _16x16mv;
-  int mv_row;
-  int mv_col;
-  int omv_row, omv_col;
-
-  int pre_stride = xd->block[0].pre_stride;
-
-  _16x16mv.as_int = xd->mode_info_context->mbmi.mv[1].as_int;
-
-  if (xd->mode_info_context->mbmi.need_to_clamp_secondmv)
-    clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);
-
-  mv_row = _16x16mv.as_mv.row;
-  mv_col = _16x16mv.as_mv.col;
-
-  /* calc uv motion vectors */
-  omv_row = mv_row;
-  omv_col = mv_col;
-  mv_row = (mv_row + (mv_row > 0)) >> 1;
-  mv_col = (mv_col + (mv_col > 0)) >> 1;
-
-  mv_row &= xd->fullpixel_mask;
-  mv_col &= xd->fullpixel_mask;
-
-  pre_stride >>= 1;
-  offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
-  uptr = xd->second_pre.u_buffer + offset;
-  vptr = xd->second_pre.v_buffer + offset;
-
-    if ((omv_row | omv_col) & 15) {
-      xd->subpixel_predict_avg8x8(uptr, pre_stride, omv_col & 15,
-                                  omv_row & 15, dst_u, dst_uvstride);
-      xd->subpixel_predict_avg8x8(vptr, pre_stride, omv_col & 15,
-                                  omv_row & 15, dst_v, dst_uvstride);
-    } else {
-      vp9_avg_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);
-      vp9_avg_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);
-    }
-}
-
-void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd,
-                                            uint8_t *dst_y,
-                                            uint8_t *dst_u,
-                                            uint8_t *dst_v,
-                                            int dst_ystride,
-                                            int dst_uvstride) {
-  vp9_build_2nd_inter16x16_predictors_mby(xd, dst_y, dst_ystride);
-  vp9_build_2nd_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride);
-}
-
-static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) {
+static void build_inter4x4_predictors_mb(MACROBLOCKD *xd,
+                                         int mb_row, int mb_col) {
   int i;
   MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
   BLOCKD *blockd = xd->block;
+  int which_mv = 0;
+  const int use_second_ref = mbmi->second_ref_frame > 0;
 
   if (xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4) {
-    blockd[ 0].bmi = xd->mode_info_context->bmi[ 0];
-    blockd[ 2].bmi = xd->mode_info_context->bmi[ 2];
-    blockd[ 8].bmi = xd->mode_info_context->bmi[ 8];
-    blockd[10].bmi = xd->mode_info_context->bmi[10];
-
-    if (mbmi->need_to_clamp_mvs) {
-      clamp_mv_to_umv_border(&blockd[ 0].bmi.as_mv.first.as_mv, xd);
-      clamp_mv_to_umv_border(&blockd[ 2].bmi.as_mv.first.as_mv, xd);
-      clamp_mv_to_umv_border(&blockd[ 8].bmi.as_mv.first.as_mv, xd);
-      clamp_mv_to_umv_border(&blockd[10].bmi.as_mv.first.as_mv, xd);
-      if (mbmi->second_ref_frame > 0) {
-        clamp_mv_to_umv_border(&blockd[ 0].bmi.as_mv.second.as_mv, xd);
-        clamp_mv_to_umv_border(&blockd[ 2].bmi.as_mv.second.as_mv, xd);
-        clamp_mv_to_umv_border(&blockd[ 8].bmi.as_mv.second.as_mv, xd);
-        clamp_mv_to_umv_border(&blockd[10].bmi.as_mv.second.as_mv, xd);
-      }
-    }
+    for (i = 0; i < 16; i += 8) {
+      BLOCKD *d0 = &blockd[i];
+      BLOCKD *d1 = &blockd[i + 2];
+      const int y = i & 8;
 
+      blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];
+      blockd[i + 2].bmi = xd->mode_info_context->bmi[i + 2];
 
-    vp9_build_inter_predictors4b(xd, &blockd[ 0], 16);
-    vp9_build_inter_predictors4b(xd, &blockd[ 2], 16);
-    vp9_build_inter_predictors4b(xd, &blockd[ 8], 16);
-    vp9_build_inter_predictors4b(xd, &blockd[10], 16);
+      for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
+        if (mbmi->need_to_clamp_mvs) {
+          clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv[which_mv].as_mv, xd);
+          clamp_mv_to_umv_border(&blockd[i + 2].bmi.as_mv[which_mv].as_mv, xd);
+        }
 
-    if (mbmi->second_ref_frame > 0) {
-      vp9_build_2nd_inter_predictors4b(xd, &blockd[ 0], 16);
-      vp9_build_2nd_inter_predictors4b(xd, &blockd[ 2], 16);
-      vp9_build_2nd_inter_predictors4b(xd, &blockd[ 8], 16);
-      vp9_build_2nd_inter_predictors4b(xd, &blockd[10], 16);
+        build_2x1_inter_predictor(d0, d1, xd->scale_factor, 8, 16,
+                                  which_mv, &xd->subpix,
+                                  mb_row * 16 + y, mb_col * 16);
+      }
     }
   } else {
     for (i = 0; i < 16; i += 2) {
       BLOCKD *d0 = &blockd[i];
       BLOCKD *d1 = &blockd[i + 1];
+      const int x = (i & 3) * 4;
+      const int y = (i >> 2) * 4;
 
       blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];
       blockd[i + 1].bmi = xd->mode_info_context->bmi[i + 1];
 
-      if (mbmi->need_to_clamp_mvs) {
-        clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv.first.as_mv, xd);
-        clamp_mv_to_umv_border(&blockd[i + 1].bmi.as_mv.first.as_mv, xd);
-        if (mbmi->second_ref_frame > 0) {
-          clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv.second.as_mv, xd);
-          clamp_mv_to_umv_border(&blockd[i + 1].bmi.as_mv.second.as_mv, xd);
-        }
-      }
-
-      if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
-        build_inter_predictors2b(xd, d0, 16);
-      else {
-        vp9_build_inter_predictors_b(d0, 16, xd->subpixel_predict4x4);
-        vp9_build_inter_predictors_b(d1, 16, xd->subpixel_predict4x4);
-      }
-
-      if (mbmi->second_ref_frame > 0) {
-        vp9_build_2nd_inter_predictors_b(d0, 16, xd->subpixel_predict_avg4x4);
-        vp9_build_2nd_inter_predictors_b(d1, 16, xd->subpixel_predict_avg4x4);
+      for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
+        build_2x1_inter_predictor(d0, d1, xd->scale_factor, 4, 16,
+                                  which_mv, &xd->subpix,
+                                  mb_row * 16 + y, mb_col * 16 + x);
       }
     }
   }
@@ -849,17 +800,13 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) {
   for (i = 16; i < 24; i += 2) {
     BLOCKD *d0 = &blockd[i];
     BLOCKD *d1 = &blockd[i + 1];
+    const int x = 4 * (i & 1);
+    const int y = ((i - 16) >> 1) * 4;
 
-    if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
-      build_inter_predictors2b(xd, d0, 8);
-    else {
-      vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict4x4);
-      vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict4x4);
-    }
-
-    if (mbmi->second_ref_frame > 0) {
-      vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg4x4);
-      vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg4x4);
+    for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
+      build_2x1_inter_predictor(d0, d1, xd->scale_factor_uv, 4, 8,
+                                which_mv, &xd->subpix,
+                                mb_row * 8 + y, mb_col * 8 + x);
     }
   }
 }
@@ -877,44 +824,44 @@ void build_4x4uvmvs(MACROBLOCKD *xd) {
 
       int temp;
 
-      temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.first.as_mv.row
-             + xd->mode_info_context->bmi[yoffset + 1].as_mv.first.as_mv.row
-             + xd->mode_info_context->bmi[yoffset + 4].as_mv.first.as_mv.row
-             + xd->mode_info_context->bmi[yoffset + 5].as_mv.first.as_mv.row;
+      temp = xd->mode_info_context->bmi[yoffset + 0].as_mv[0].as_mv.row
+             + xd->mode_info_context->bmi[yoffset + 1].as_mv[0].as_mv.row
+             + xd->mode_info_context->bmi[yoffset + 4].as_mv[0].as_mv.row
+             + xd->mode_info_context->bmi[yoffset + 5].as_mv[0].as_mv.row;
 
       if (temp < 0) temp -= 4;
       else temp += 4;
 
-      blockd[uoffset].bmi.as_mv.first.as_mv.row = (temp / 8) &
+      blockd[uoffset].bmi.as_mv[0].as_mv.row = (temp / 8) &
                                                   xd->fullpixel_mask;
 
-      temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.first.as_mv.col
-             + xd->mode_info_context->bmi[yoffset + 1].as_mv.first.as_mv.col
-             + xd->mode_info_context->bmi[yoffset + 4].as_mv.first.as_mv.col
-             + xd->mode_info_context->bmi[yoffset + 5].as_mv.first.as_mv.col;
+      temp = xd->mode_info_context->bmi[yoffset + 0].as_mv[0].as_mv.col
+             + xd->mode_info_context->bmi[yoffset + 1].as_mv[0].as_mv.col
+             + xd->mode_info_context->bmi[yoffset + 4].as_mv[0].as_mv.col
+             + xd->mode_info_context->bmi[yoffset + 5].as_mv[0].as_mv.col;
 
       if (temp < 0) temp -= 4;
       else temp += 4;
 
-      blockd[uoffset].bmi.as_mv.first.as_mv.col = (temp / 8) &
+      blockd[uoffset].bmi.as_mv[0].as_mv.col = (temp / 8) &
         xd->fullpixel_mask;
 
       // if (x->mode_info_context->mbmi.need_to_clamp_mvs)
-      clamp_uvmv_to_umv_border(&blockd[uoffset].bmi.as_mv.first.as_mv, xd);
+      clamp_uvmv_to_umv_border(&blockd[uoffset].bmi.as_mv[0].as_mv, xd);
 
       // if (x->mode_info_context->mbmi.need_to_clamp_mvs)
-      clamp_uvmv_to_umv_border(&blockd[uoffset].bmi.as_mv.first.as_mv, xd);
+      clamp_uvmv_to_umv_border(&blockd[uoffset].bmi.as_mv[0].as_mv, xd);
 
-      blockd[voffset].bmi.as_mv.first.as_mv.row =
-        blockd[uoffset].bmi.as_mv.first.as_mv.row;
-      blockd[voffset].bmi.as_mv.first.as_mv.col =
-        blockd[uoffset].bmi.as_mv.first.as_mv.col;
+      blockd[voffset].bmi.as_mv[0].as_mv.row =
+        blockd[uoffset].bmi.as_mv[0].as_mv.row;
+      blockd[voffset].bmi.as_mv[0].as_mv.col =
+        blockd[uoffset].bmi.as_mv[0].as_mv.col;
 
       if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-        temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.second.as_mv.row
-               + xd->mode_info_context->bmi[yoffset + 1].as_mv.second.as_mv.row
-               + xd->mode_info_context->bmi[yoffset + 4].as_mv.second.as_mv.row
-               + xd->mode_info_context->bmi[yoffset + 5].as_mv.second.as_mv.row;
+        temp = xd->mode_info_context->bmi[yoffset + 0].as_mv[1].as_mv.row
+               + xd->mode_info_context->bmi[yoffset + 1].as_mv[1].as_mv.row
+               + xd->mode_info_context->bmi[yoffset + 4].as_mv[1].as_mv.row
+               + xd->mode_info_context->bmi[yoffset + 5].as_mv[1].as_mv.row;
 
         if (temp < 0) {
           temp -= 4;
@@ -922,13 +869,13 @@ void build_4x4uvmvs(MACROBLOCKD *xd) {
           temp += 4;
         }
 
-       blockd[uoffset].bmi.as_mv.second.as_mv.row = (temp / 8) &
+       blockd[uoffset].bmi.as_mv[1].as_mv.row = (temp / 8) &
                                                     xd->fullpixel_mask;
 
-        temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.second.as_mv.col
-               + xd->mode_info_context->bmi[yoffset + 1].as_mv.second.as_mv.col
-               + xd->mode_info_context->bmi[yoffset + 4].as_mv.second.as_mv.col
-               + xd->mode_info_context->bmi[yoffset + 5].as_mv.second.as_mv.col;
+        temp = xd->mode_info_context->bmi[yoffset + 0].as_mv[1].as_mv.col
+               + xd->mode_info_context->bmi[yoffset + 1].as_mv[1].as_mv.col
+               + xd->mode_info_context->bmi[yoffset + 4].as_mv[1].as_mv.col
+               + xd->mode_info_context->bmi[yoffset + 5].as_mv[1].as_mv.col;
 
         if (temp < 0) {
           temp -= 4;
@@ -936,42 +883,51 @@ void build_4x4uvmvs(MACROBLOCKD *xd) {
           temp += 4;
         }
 
-        blockd[uoffset].bmi.as_mv.second.as_mv.col = (temp / 8) &
+        blockd[uoffset].bmi.as_mv[1].as_mv.col = (temp / 8) &
                                                         xd->fullpixel_mask;
 
         // if (mbmi->need_to_clamp_mvs)
         clamp_uvmv_to_umv_border(
-          &blockd[uoffset].bmi.as_mv.second.as_mv, xd);
+          &blockd[uoffset].bmi.as_mv[1].as_mv, xd);
 
         // if (mbmi->need_to_clamp_mvs)
         clamp_uvmv_to_umv_border(
-          &blockd[uoffset].bmi.as_mv.second.as_mv, xd);
+          &blockd[uoffset].bmi.as_mv[1].as_mv, xd);
 
-        blockd[voffset].bmi.as_mv.second.as_mv.row =
-          blockd[uoffset].bmi.as_mv.second.as_mv.row;
-        blockd[voffset].bmi.as_mv.second.as_mv.col =
-          blockd[uoffset].bmi.as_mv.second.as_mv.col;
+        blockd[voffset].bmi.as_mv[1].as_mv.row =
+          blockd[uoffset].bmi.as_mv[1].as_mv.row;
+        blockd[voffset].bmi.as_mv[1].as_mv.col =
+          blockd[uoffset].bmi.as_mv[1].as_mv.col;
       }
     }
   }
 }
 
-void vp9_build_inter_predictors_mb(MACROBLOCKD *xd) {
+/* Builds one macroblock's 16x16 inter prediction by running the mby builder
+ * on dst_y and then the mbuv builder on dst_u / dst_v. */
+void vp9_build_inter16x16_predictors_mb(MACROBLOCKD *xd,
+                                        uint8_t *dst_y,
+                                        uint8_t *dst_u,
+                                        uint8_t *dst_v,
+                                        int dst_ystride, int dst_uvstride,
+                                        int mb_row, int mb_col) {
+  vp9_build_inter16x16_predictors_mby(xd, dst_y, dst_ystride, mb_row, mb_col);
+  vp9_build_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride,
+                                       mb_row, mb_col);
+}
+
+
+void vp9_build_inter_predictors_mb(MACROBLOCKD *xd,
+                                   int mb_row,
+                                   int mb_col) {
   if (xd->mode_info_context->mbmi.mode != SPLITMV) {
-    vp9_build_1st_inter16x16_predictors_mb(xd, xd->predictor,
-                                           &xd->predictor[256],
-                                           &xd->predictor[320], 16, 8);
-
-    if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-      /* 256 = offset of U plane in Y+U+V buffer;
-       * 320 = offset of V plane in Y+U+V buffer.
-       * (256=16x16, 320=16x16+8x8). */
-      vp9_build_2nd_inter16x16_predictors_mb(xd, xd->predictor,
-                                             &xd->predictor[256],
-                                             &xd->predictor[320], 16, 8);
-    }
+    vp9_build_inter16x16_predictors_mb(xd, xd->predictor,
+                                       &xd->predictor[256],
+                                       &xd->predictor[320], 16, 8,
+                                       mb_row, mb_col);
+
 #if CONFIG_COMP_INTERINTRA_PRED
-    else if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
+    if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
       vp9_build_interintra_16x16_predictors_mb(xd, xd->predictor,
                                                &xd->predictor[256],
                                                &xd->predictor[320], 16, 8);
@@ -979,6 +935,6 @@ void vp9_build_inter_predictors_mb(MACROBLOCKD *xd) {
 #endif
   } else {
     build_4x4uvmvs(xd);
-    build_inter4x4_predictors_mb(xd);
+    build_inter4x4_predictors_mb(xd, mb_row, mb_col);
   }
 }
diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h
index 89868b95e..831ce2a73 100644
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -14,71 +14,128 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
-extern void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,
-                                                    uint8_t *dst_y,
-                                                    int dst_ystride,
-                                                    int clamp_mvs);
-
-extern void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                                     uint8_t *dst_u,
-                                                     uint8_t *dst_v,
-                                                     int dst_uvstride);
-
-extern void vp9_build_1st_inter16x16_predictors_mb(MACROBLOCKD *xd,
-                                                   uint8_t *dst_y,
-                                                   uint8_t *dst_u,
-                                                   uint8_t *dst_v,
-                                                   int dst_ystride,
-                                                   int dst_uvstride);
-
-extern void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd,
-                                                    uint8_t *dst_y,
-                                                    int dst_ystride);
-
-extern void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                                     uint8_t *dst_u,
-                                                     uint8_t *dst_v,
-                                                     int dst_uvstride);
-
-extern void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd,
-                                                   uint8_t *dst_y,
-                                                   uint8_t *dst_u,
-                                                   uint8_t *dst_v,
-                                                   int dst_ystride,
-                                                   int dst_uvstride);
-
-extern void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
-                                               uint8_t *dst_y,
-                                               uint8_t *dst_u,
-                                               uint8_t *dst_v,
-                                               int dst_ystride,
-                                               int dst_uvstride);
-
-extern void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x,
-                                               uint8_t *dst_y,
-                                               uint8_t *dst_u,
-                                               uint8_t *dst_v,
-                                               int dst_ystride,
-                                               int dst_uvstride);
-
-extern void vp9_build_inter_predictors_mb(MACROBLOCKD *xd);
-
-extern void vp9_build_inter_predictors_b(BLOCKD *d, int pitch,
-                                         vp9_subpix_fn_t sppf);
-
-extern void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,
-                                             vp9_subpix_fn_t sppf);
-
-extern void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d,
-                                         int pitch);
-
-extern void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd,
-                                             BLOCKD *d, int pitch);
-
-extern void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd);
-
-extern void vp9_setup_interp_filters(MACROBLOCKD *xd,
-                                     INTERPOLATIONFILTERTYPE filter,
-                                     VP9_COMMON *cm);
+struct subpix_fn_table;
+
+void vp9_build_inter16x16_predictors_mby(MACROBLOCKD *xd,
+                                         uint8_t *dst_y,
+                                         int dst_ystride,
+                                         int mb_row,
+                                         int mb_col);
+
+void vp9_build_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
+                                          uint8_t *dst_u,
+                                          uint8_t *dst_v,
+                                          int dst_uvstride,
+                                          int mb_row,
+                                          int mb_col);
+
+void vp9_build_inter16x16_predictors_mb(MACROBLOCKD *xd,
+                                        uint8_t *dst_y,
+                                        uint8_t *dst_u,
+                                        uint8_t *dst_v,
+                                        int dst_ystride,
+                                        int dst_uvstride,
+                                        int mb_row,
+                                        int mb_col);
+
+void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
+                                        uint8_t *dst_y,
+                                        uint8_t *dst_u,
+                                        uint8_t *dst_v,
+                                        int dst_ystride,
+                                        int dst_uvstride,
+                                        int mb_row,
+                                        int mb_col);
+
+void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x,
+                                        uint8_t *dst_y,
+                                        uint8_t *dst_u,
+                                        uint8_t *dst_v,
+                                        int dst_ystride,
+                                        int dst_uvstride,
+                                        int mb_row,
+                                        int mb_col);
+
+void vp9_build_inter_predictors_mb(MACROBLOCKD *xd,
+                                   int mb_row,
+                                   int mb_col);
+
+void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd,
+                                        int mb_row,
+                                        int mb_col);
+
+void vp9_setup_interp_filters(MACROBLOCKD *xd,
+                              INTERPOLATIONFILTERTYPE filter,
+                              VP9_COMMON *cm);
+
+void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
+                                       YV12_BUFFER_CONFIG *other,
+                                       int this_w, int this_h);
+
+void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
+                               uint8_t *dst, int dst_stride,
+                               const int_mv *mv_q3,
+                               const struct scale_factors *scale,
+                               int w, int h, int do_avg,
+                               const struct subpix_fn_table *subpix);
+
+void vp9_build_inter_predictor_q4(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride,
+                                  const int_mv *fullpel_mv_q3,
+                                  const int_mv *frac_mv_q4,
+                                  const struct scale_factors *scale,
+                                  int w, int h, int do_avg,
+                                  const struct subpix_fn_table *subpix);
+
+static int scale_value_x(int val, const struct scale_factors *scale) {
+  return (val * scale->x_num) / scale->x_den;  /* multiply, then divide */
+}
+
+static int scale_value_y(int val, const struct scale_factors *scale) {
+  return (val * scale->y_num) / scale->y_den;  /* multiply, then divide */
+}
+
+/* Linear offset of the scaled (x_offset, y_offset) for the given stride. */
+static int scaled_buffer_offset(int x_offset, int y_offset, int stride,
+                                const struct scale_factors *scale) {
+  const int row = scale_value_y(y_offset, scale);
+  const int col = scale_value_x(x_offset, scale);
+  return row * stride + col;
+}
+
+/* Copies *src into *dst, then advances dst's y/u/v buffer pointers to the
+ * macroblock at (mb_row, mb_col): 16 units per block for y, 8 for u and v.
+ * With a non-NULL scale the offsets go through scaled_buffer_offset(). */
+static void setup_pred_block(YV12_BUFFER_CONFIG *dst,
+                             const YV12_BUFFER_CONFIG *src,
+                             int mb_row, int mb_col,
+                             const struct scale_factors *scale,
+                             const struct scale_factors *scale_uv) {
+  const int y_stride = src->y_stride;
+  const int uv_stride = src->uv_stride;
+  const int y_col = 16 * mb_col, y_row = 16 * mb_row;
+  const int uv_col = 8 * mb_col, uv_row = 8 * mb_row;
+  /* scale_uv is only used when scale is non-NULL. */
+  const int y_off = scale ? scaled_buffer_offset(y_col, y_row, y_stride, scale)
+                          : y_row * y_stride + y_col;
+  const int uv_off = scale
+      ? scaled_buffer_offset(uv_col, uv_row, uv_stride, scale_uv)
+      : uv_row * uv_stride + uv_col;
+
+  *dst = *src;
+  dst->y_buffer += y_off;
+  dst->u_buffer += uv_off;
+  dst->v_buffer += uv_off;
+}
+
+/* Copies scale_factor[ref0] / scale_factor[ref1] into xd->scale_factor[0..1]
+ * (a negative ref selects entry 0) and mirrors both into scale_factor_uv. */
+static void set_scale_factors(MACROBLOCKD *xd, int ref0, int ref1,
+    struct scale_factors scale_factor[MAX_REF_FRAMES]) {
+  xd->scale_factor[0] = scale_factor[ref0 < 0 ? 0 : ref0];
+  xd->scale_factor[1] = scale_factor[ref1 < 0 ? 0 : ref1];
+  xd->scale_factor_uv[0] = xd->scale_factor[0];
+  xd->scale_factor_uv[1] = xd->scale_factor[1];
+}
 
 #endif  // VP9_COMMON_VP9_RECONINTER_H_
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index 9b2fad5b1..eb8de2126 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -18,6 +18,23 @@
  * and vp9_build_intra_predictors_mbuv_s(MACROBLOCKD *xd).
  */
 
+/* Using multiplication and shifting instead of division in diagonal prediction.
+ * iscale[i] = ((1<<16) + (i+2)/2) / (i+2), i.e. 65536 / (i+2) rounded to the
+ * nearest integer, so that ((A + B) * iscale[i] + (1<<15)) >> 16 stands in for
+ * a rounded division of (A + B) by (i + 2); A and B are weighted pixel values.
+ */
+static const unsigned int iscale[64] = {
+  32768, 21845, 16384, 13107, 10923,  9362,  8192,  7282,
+   6554,  5958,  5461,  5041,  4681,  4369,  4096,  3855,
+   3641,  3449,  3277,  3121,  2979,  2849,  2731,  2621,
+   2521,  2427,  2341,  2260,  2185,  2114,  2048,  1986,
+   1928,  1872,  1820,  1771,  1725,  1680,  1638,  1598,
+   1560,  1524,  1489,  1456,  1425,  1394,  1365,  1337,
+   1311,  1285,  1260,  1237,  1214,  1192,  1170,  1150,
+   1130,  1111,  1092,  1074,  1057,  1040,  1024,  1008,
+};
+
+
 static void d27_predictor(uint8_t *ypred_ptr, int y_stride, int n,
                           uint8_t *yabove_row, uint8_t *yleft_col) {
   int r, c, h, w, v;
@@ -29,7 +46,7 @@ static void d27_predictor(uint8_t *ypred_ptr, int y_stride, int n,
     else
       a = (yleft_col[r] + yleft_col[r + 1] + 1) >> 1;
     b = yabove_row[c + 2];
-    ypred_ptr[c] = (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3);
+    ypred_ptr[c] = ((2 * a + (c + 1) * b) * iscale[1+c] + (1<<15)) >> 16;
   }
   for (r = 1; r < n / 2 - 1; r++) {
     for (c = 0; c < n - 2 - 2 * r; c++) {
@@ -38,7 +55,8 @@ static void d27_predictor(uint8_t *ypred_ptr, int y_stride, int n,
       else
         a = (yleft_col[r] + yleft_col[r + 1] + 1) >> 1;
       b = ypred_ptr[(r - 1) * y_stride + c + 2];
-      ypred_ptr[r * y_stride + c] = (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3);
+      ypred_ptr[r * y_stride + c] =
+                ((2 * a + (c + 1) * b) * iscale[1+c] + (1<<15)) >> 16;
     }
   }
   for (; r < n - 1; ++r) {
@@ -77,7 +95,8 @@ static void d63_predictor(uint8_t *ypred_ptr, int y_stride, int n,
     else
       a = (yabove_row[c] + yabove_row[c + 1] + 1) >> 1;
     b = yleft_col[r + 2];
-    ypred_ptr[r * y_stride] = (2 * a + (r + 1) * b + (r + 3) / 2) / (r + 3);
+    ypred_ptr[r * y_stride] = ((2 * a + (r + 1) * b) * iscale[1+r] +
+                              (1<<15)) >> 16;
   }
   for (c = 1; c < n / 2 - 1; c++) {
     for (r = 0; r < n - 2 - 2 * c; r++) {
@@ -86,7 +105,8 @@ static void d63_predictor(uint8_t *ypred_ptr, int y_stride, int n,
       else
         a = (yabove_row[c] + yabove_row[c + 1] + 1) >> 1;
       b = ypred_ptr[(r + 2) * y_stride + c - 1];
-      ypred_ptr[r * y_stride + c] = (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3);
+      ypred_ptr[r * y_stride + c] = ((2 * a + (c + 1) * b) * iscale[1+c] +
+                                    (1<<15)) >> 16;
     }
   }
   for (; c < n - 1; ++c) {
@@ -119,8 +139,8 @@ static void d45_predictor(uint8_t *ypred_ptr, int y_stride, int n,
   for (r = 0; r < n - 1; ++r) {
     for (c = 0; c <= r; ++c) {
       ypred_ptr[(r - c) * y_stride + c] =
-        (yabove_row[r + 1] * (c + 1) +
-         yleft_col[r + 1] * (r - c + 1) + r / 2 + 1) / (r + 2);
+        ((yabove_row[r + 1] * (c + 1) +
+          yleft_col[r + 1] * (r - c + 1)) * iscale[r] + (1<<15)) >> 16;
     }
   }
   for (c = 0; c <= r; ++c) {
@@ -129,8 +149,8 @@ static void d45_predictor(uint8_t *ypred_ptr, int y_stride, int n,
     int yleft_ext = yleft_col[r];  // clip_pixel(2 * yleft_col[r] -
                                    //            yleft_col[r-1]);
     ypred_ptr[(r - c) * y_stride + c] =
-      (yabove_ext * (c + 1) +
-       yleft_ext * (r - c + 1) + r / 2 + 1) / (r + 2);
+      ((yabove_ext * (c + 1) +
+        yleft_ext * (r - c + 1)) * iscale[r] + (1<<15)) >> 16;
   }
   for (r = 1; r < n; ++r) {
     for (c = n - r; c < n; ++c) {
@@ -251,16 +271,40 @@ void vp9_recon_intra_mbuv(MACROBLOCKD *xd) {
 void vp9_build_intra_predictors_internal(uint8_t *src, int src_stride,
                                          uint8_t *ypred_ptr,
                                          int y_stride, int mode, int bsize,
-                                         int up_available, int left_available) {
-
-  uint8_t *yabove_row = src - src_stride;
-  uint8_t yleft_col[64];
-  uint8_t ytop_left = yabove_row[-1];
+                                         int up_available, int left_available,
+                                         int right_available) {
   int r, c, i;
+  uint8_t yleft_col[64], yabove_data[65], ytop_left;
+  uint8_t *yabove_row = yabove_data + 1;
+  /*
+   * 127 127 127 .. 127 127 127 127 127 127
+   * 129  A   B  ..  Y   Z
+   * 129  C   D  ..  W   X
+   * 129  E   F  ..  U   V
+   * 129  G   H  ..  S   T   T   T   T   T
+   *  ..
+   */
+
+  if (left_available) {
+    for (i = 0; i < bsize; i++)
+      yleft_col[i] = src[i * src_stride - 1];
+  } else {
+    vpx_memset(yleft_col, 129, bsize);
+  }
 
-  for (i = 0; i < bsize; i++) {
-    yleft_col[i] = src[i * src_stride - 1];
+  if (up_available) {
+    uint8_t *yabove_ptr = src - src_stride;
+    vpx_memcpy(yabove_row, yabove_ptr, bsize);
+    if (left_available) {
+      ytop_left = yabove_ptr[-1];
+    } else {
+      ytop_left = 127;
+    }
+  } else {
+    vpx_memset(yabove_row, 127, bsize);
+    ytop_left = 127;
   }
+  yabove_row[-1] = ytop_left;
 
   /* for Y */
   switch (mode) {
@@ -383,155 +427,28 @@ static void combine_interintra(MB_PREDICTION_MODE mode,
   static const int scale_max = 256;     // 1 << scale_bits;
   static const int scale_round = 127;   // (1 << (scale_bits - 1));
   // This table is a function A + B*exp(-kx), where x is hor. index
-  static const int weights1d[32] = {
-    128, 122, 116, 111, 107, 103,  99,  96,
-    93, 90, 88, 85, 83, 81, 80, 78,
-    77, 76, 75, 74, 73, 72, 71, 70,
-    70, 69, 69, 68, 68, 68, 67, 67,
-  };
-  // This table is a function A + B*exp(-k.sqrt(xy)), where x, y are
-  // hor. and vert. indices
-  static const int weights2d[1024] = {
-    128, 128, 128, 128, 128, 128, 128, 128,
-    128, 128, 128, 128, 128, 128, 128, 128,
-    128, 128, 128, 128, 128, 128, 128, 128,
-    128, 128, 128, 128, 128, 128, 128, 128,
-    128, 122, 120, 118, 116, 115, 114, 113,
-    112, 111, 111, 110, 109, 109, 108, 107,
-    107, 106, 106, 105, 105, 104, 104, 104,
-    103, 103, 102, 102, 102, 101, 101, 101,
-    128, 120, 116, 114, 112, 111, 109, 108,
-    107, 106, 105, 104, 103, 102, 102, 101,
-    100, 100,  99,  99,  98,  97,  97,  96,
-    96,  96,  95,  95,  94,  94,  93,  93,
-    128, 118, 114, 111, 109, 107, 106, 104,
-    103, 102, 101, 100,  99,  98,  97,  97,
-    96,  95,  95,  94,  93,  93,  92,  92,
-    91,  91,  90,  90,  90,  89,  89,  88,
-    128, 116, 112, 109, 107, 105, 103, 102,
-    100,  99,  98,  97,  96,  95,  94,  93,
-    93,  92,  91,  91,  90,  90,  89,  89,
-    88,  88,  87,  87,  86,  86,  85,  85,
-    128, 115, 111, 107, 105, 103, 101,  99,
-    98,  97,  96,  94,  93,  93,  92,  91,
-    90,  89,  89,  88,  88,  87,  86,  86,
-    85,  85,  84,  84,  84,  83,  83,  82,
-    128, 114, 109, 106, 103, 101,  99,  97,
-    96,  95,  93,  92,  91,  90,  90,  89,
-    88,  87,  87,  86,  85,  85,  84,  84,
-    83,  83,  82,  82,  82,  81,  81,  80,
-    128, 113, 108, 104, 102,  99,  97,  96,
-    94,  93,  92,  91,  90,  89,  88,  87,
-    86,  85,  85,  84,  84,  83,  83,  82,
-    82,  81,  81,  80,  80,  79,  79,  79,
-    128, 112, 107, 103, 100,  98,  96,  94,
-    93,  91,  90,  89,  88,  87,  86,  85,
-    85,  84,  83,  83,  82,  82,  81,  80,
-    80,  80,  79,  79,  78,  78,  78,  77,
-    128, 111, 106, 102,  99,  97,  95,  93,
-    91,  90,  89,  88,  87,  86,  85,  84,
-    83,  83,  82,  81,  81,  80,  80,  79,
-    79,  78,  78,  77,  77,  77,  76,  76,
-    128, 111, 105, 101,  98,  96,  93,  92,
-    90,  89,  88,  86,  85,  84,  84,  83,
-    82,  81,  81,  80,  80,  79,  79,  78,
-    78,  77,  77,  76,  76,  76,  75,  75,
-    128, 110, 104, 100,  97,  94,  92,  91,
-    89,  88,  86,  85,  84,  83,  83,  82,
-    81,  80,  80,  79,  79,  78,  78,  77,
-    77,  76,  76,  75,  75,  75,  74,  74,
-    128, 109, 103,  99,  96,  93,  91,  90,
-    88,  87,  85,  84,  83,  82,  82,  81,
-    80,  79,  79,  78,  78,  77,  77,  76,
-    76,  75,  75,  75,  74,  74,  74,  73,
-    128, 109, 102,  98,  95,  93,  90,  89,
-    87,  86,  84,  83,  82,  81,  81,  80,
-    79,  78,  78,  77,  77,  76,  76,  75,
-    75,  75,  74,  74,  73,  73,  73,  73,
-    128, 108, 102,  97,  94,  92,  90,  88,
-    86,  85,  84,  83,  82,  81,  80,  79,
-    78,  78,  77,  77,  76,  76,  75,  75,
-    74,  74,  73,  73,  73,  73,  72,  72,
-    128, 107, 101,  97,  93,  91,  89,  87,
-    85,  84,  83,  82,  81,  80,  79,  78,
-    78,  77,  76,  76,  75,  75,  74,  74,
-    74,  73,  73,  73,  72,  72,  72,  71,
-    128, 107, 100,  96,  93,  90,  88,  86,
-    85,  83,  82,  81,  80,  79,  78,  78,
-    77,  76,  76,  75,  75,  74,  74,  73,
-    73,  73,  72,  72,  72,  71,  71,  71,
-    128, 106, 100,  95,  92,  89,  87,  85,
-    84,  83,  81,  80,  79,  78,  78,  77,
-    76,  76,  75,  75,  74,  74,  73,  73,
-    72,  72,  72,  72,  71,  71,  71,  70,
-    128, 106,  99,  95,  91,  89,  87,  85,
-    83,  82,  81,  80,  79,  78,  77,  76,
-    76,  75,  75,  74,  74,  73,  73,  72,
-    72,  72,  71,  71,  71,  71,  70,  70,
-    128, 105,  99,  94,  91,  88,  86,  84,
-    83,  81,  80,  79,  78,  77,  77,  76,
-    75,  75,  74,  74,  73,  73,  72,  72,
-    72,  71,  71,  71,  70,  70,  70,  70,
-    128, 105,  98,  93,  90,  88,  85,  84,
-    82,  81,  80,  79,  78,  77,  76,  75,
-    75,  74,  74,  73,  73,  72,  72,  71,
-    71,  71,  71,  70,  70,  70,  70,  69,
-    128, 104,  97,  93,  90,  87,  85,  83,
-    82,  80,  79,  78,  77,  76,  76,  75,
-    74,  74,  73,  73,  72,  72,  71,  71,
-    71,  70,  70,  70,  70,  69,  69,  69,
-    128, 104,  97,  92,  89,  86,  84,  83,
-    81,  80,  79,  78,  77,  76,  75,  74,
-    74,  73,  73,  72,  72,  71,  71,  71,
-    70,  70,  70,  70,  69,  69,  69,  69,
-    128, 104,  96,  92,  89,  86,  84,  82,
-    80,  79,  78,  77,  76,  75,  75,  74,
-    73,  73,  72,  72,  71,  71,  71,  70,
-    70,  70,  70,  69,  69,  69,  69,  68,
-    128, 103,  96,  91,  88,  85,  83,  82,
-    80,  79,  78,  77,  76,  75,  74,  74,
-    73,  72,  72,  72,  71,  71,  70,  70,
-    70,  70,  69,  69,  69,  69,  68,  68,
-    128, 103,  96,  91,  88,  85,  83,  81,
-    80,  78,  77,  76,  75,  75,  74,  73,
-    73,  72,  72,  71,  71,  70,  70,  70,
-    70,  69,  69,  69,  69,  68,  68,  68,
-    128, 102,  95,  90,  87,  84,  82,  81,
-    79,  78,  77,  76,  75,  74,  73,  73,
-    72,  72,  71,  71,  71,  70,  70,  70,
-    69,  69,  69,  69,  68,  68,  68,  68,
-    128, 102,  95,  90,  87,  84,  82,  80,
-    79,  77,  76,  75,  75,  74,  73,  73,
-    72,  72,  71,  71,  70,  70,  70,  69,
-    69,  69,  69,  68,  68,  68,  68,  68,
-    128, 102,  94,  90,  86,  84,  82,  80,
-    78,  77,  76,  75,  74,  73,  73,  72,
-    72,  71,  71,  70,  70,  70,  69,  69,
-    69,  69,  68,  68,  68,  68,  68,  67,
-    128, 101,  94,  89,  86,  83,  81,  79,
-    78,  77,  76,  75,  74,  73,  73,  72,
-    71,  71,  71,  70,  70,  69,  69,  69,
-    69,  68,  68,  68,  68,  68,  67,  67,
-    128, 101,  93,  89,  85,  83,  81,  79,
-    78,  76,  75,  74,  74,  73,  72,  72,
-    71,  71,  70,  70,  70,  69,  69,  69,
-    68,  68,  68,  68,  68,  67,  67,  67,
-    128, 101,  93,  88,  85,  82,  80,  79,
-    77,  76,  75,  74,  73,  73,  72,  71,
-    71,  70,  70,  70,  69,  69,  69,  68,
-    68,  68,  68,  68,  67,  67,  67,  67,
+  static const int weights1d[64] = {
+    128, 125, 122, 119, 116, 114, 111, 109,
+    107, 105, 103, 101,  99,  97,  96,  94,
+     93,  91,  90,  89,  88,  86,  85,  84,
+     83,  82,  81,  81,  80,  79,  78,  78,
+     77,  76,  76,  75,  75,  74,  74,  73,
+     73,  72,  72,  71,  71,  71,  70,  70,
+     70,  70,  69,  69,  69,  69,  68,  68,
+     68,  68,  68,  67,  67,  67,  67,  67,
   };
-  int size_scale = (size >= 32 ? 1 :
-                    size == 16 ? 2 :
-                    size == 8  ? 4 : 8);
-  int size_shift = size == 64 ? 1 : 0;
+
+  int size_scale = (size >= 64 ? 1:
+                    size == 32 ? 2 :
+                    size == 16 ? 4 :
+                    size == 8  ? 8 : 16);
   int i, j;
   switch (mode) {
     case V_PRED:
       for (i = 0; i < size; ++i) {
         for (j = 0; j < size; ++j) {
           int k = i * interstride + j;
-          int scale = weights1d[i * size_scale >> size_shift];
+          int scale = weights1d[i * size_scale];
           interpred[k] =
               ((scale_max - scale) * interpred[k] +
                scale * intrapred[i * intrastride + j] + scale_round)
@@ -544,7 +461,7 @@ static void combine_interintra(MB_PREDICTION_MODE mode,
       for (i = 0; i < size; ++i) {
         for (j = 0; j < size; ++j) {
           int k = i * interstride + j;
-          int scale = weights1d[j * size_scale >> size_shift];
+          int scale = weights1d[j * size_scale];
           interpred[k] =
               ((scale_max - scale) * interpred[k] +
                scale * intrapred[i * intrastride + j] + scale_round)
@@ -558,9 +475,8 @@ static void combine_interintra(MB_PREDICTION_MODE mode,
       for (i = 0; i < size; ++i) {
         for (j = 0; j < size; ++j) {
           int k = i * interstride + j;
-          int scale = (weights2d[(i * size_scale * 32 +
-                                  j * size_scale) >> size_shift] +
-                       weights1d[i * size_scale >> size_shift]) >> 1;
+          int scale = (weights1d[i * size_scale] * 3 +
+                       weights1d[j * size_scale]) >> 2;
           interpred[k] =
               ((scale_max - scale) * interpred[k] +
                scale * intrapred[i * intrastride + j] + scale_round)
@@ -574,9 +490,8 @@ static void combine_interintra(MB_PREDICTION_MODE mode,
       for (i = 0; i < size; ++i) {
         for (j = 0; j < size; ++j) {
           int k = i * interstride + j;
-          int scale = (weights2d[(i * size_scale * 32 +
-                                  j * size_scale) >> size_shift] +
-                       weights1d[j * size_scale >> size_shift]) >> 1;
+          int scale = (weights1d[j * size_scale] * 3 +
+                       weights1d[i * size_scale]) >> 2;
           interpred[k] =
               ((scale_max - scale) * interpred[k] +
                scale * intrapred[i * intrastride + j] + scale_round)
@@ -589,8 +504,7 @@ static void combine_interintra(MB_PREDICTION_MODE mode,
       for (i = 0; i < size; ++i) {
         for (j = 0; j < size; ++j) {
           int k = i * interstride + j;
-          int scale = weights2d[(i * size_scale * 32 +
-                                 j * size_scale) >> size_shift];
+          int scale = weights1d[(i < j ? i : j) * size_scale];
           interpred[k] =
               ((scale_max - scale) * interpred[k] +
                scale * intrapred[i * intrastride + j] + scale_round)
@@ -600,8 +514,21 @@ static void combine_interintra(MB_PREDICTION_MODE mode,
       break;
 
     case D45_PRED:
-    case DC_PRED:
+      for (i = 0; i < size; ++i) {
+        for (j = 0; j < size; ++j) {
+          int k = i * interstride + j;
+          int scale = (weights1d[i * size_scale] +
+                       weights1d[j * size_scale]) >> 1;
+          interpred[k] =
+              ((scale_max - scale) * interpred[k] +
+               scale * intrapred[i * intrastride + j] + scale_round)
+              >> scale_bits;
+        }
+      }
+      break;
+
     case TM_PRED:
+    case DC_PRED:
     default:
       // simple average
       for (i = 0; i < size; ++i) {
@@ -631,7 +558,7 @@ void vp9_build_interintra_16x16_predictors_mby(MACROBLOCKD *xd,
       xd->dst.y_buffer, xd->dst.y_stride,
       intrapredictor, 16,
       xd->mode_info_context->mbmi.interintra_mode, 16,
-      xd->up_available, xd->left_available);
+      xd->up_available, xd->left_available, xd->right_available);
   combine_interintra(xd->mode_info_context->mbmi.interintra_mode,
                      ypred, ystride, intrapredictor, 16, 16);
 }
@@ -646,12 +573,12 @@ void vp9_build_interintra_16x16_predictors_mbuv(MACROBLOCKD *xd,
       xd->dst.u_buffer, xd->dst.uv_stride,
       uintrapredictor, 8,
       xd->mode_info_context->mbmi.interintra_uv_mode, 8,
-      xd->up_available, xd->left_available);
+      xd->up_available, xd->left_available, xd->right_available);
   vp9_build_intra_predictors_internal(
       xd->dst.v_buffer, xd->dst.uv_stride,
       vintrapredictor, 8,
       xd->mode_info_context->mbmi.interintra_uv_mode, 8,
-      xd->up_available, xd->left_available);
+      xd->up_available, xd->left_available, xd->right_available);
   combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,
                      upred, uvstride, uintrapredictor, 8, 8);
   combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,
@@ -666,7 +593,7 @@ void vp9_build_interintra_32x32_predictors_sby(MACROBLOCKD *xd,
       xd->dst.y_buffer, xd->dst.y_stride,
       intrapredictor, 32,
       xd->mode_info_context->mbmi.interintra_mode, 32,
-      xd->up_available, xd->left_available);
+      xd->up_available, xd->left_available, xd->right_available);
   combine_interintra(xd->mode_info_context->mbmi.interintra_mode,
                      ypred, ystride, intrapredictor, 32, 32);
 }
@@ -681,12 +608,12 @@ void vp9_build_interintra_32x32_predictors_sbuv(MACROBLOCKD *xd,
       xd->dst.u_buffer, xd->dst.uv_stride,
       uintrapredictor, 16,
       xd->mode_info_context->mbmi.interintra_uv_mode, 16,
-      xd->up_available, xd->left_available);
+      xd->up_available, xd->left_available, xd->right_available);
   vp9_build_intra_predictors_internal(
       xd->dst.v_buffer, xd->dst.uv_stride,
       vintrapredictor, 16,
       xd->mode_info_context->mbmi.interintra_uv_mode, 16,
-      xd->up_available, xd->left_available);
+      xd->up_available, xd->left_available, xd->right_available);
   combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,
                      upred, uvstride, uintrapredictor, 16, 16);
   combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,
@@ -710,7 +637,8 @@ void vp9_build_interintra_64x64_predictors_sby(MACROBLOCKD *xd,
   const int mode = xd->mode_info_context->mbmi.interintra_mode;
   vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
                                       intrapredictor, 64, mode, 64,
-                                      xd->up_available, xd->left_available);
+                                      xd->up_available, xd->left_available,
+                                      xd->right_available);
   combine_interintra(xd->mode_info_context->mbmi.interintra_mode,
                      ypred, ystride, intrapredictor, 64, 64);
 }
@@ -724,10 +652,12 @@ void vp9_build_interintra_64x64_predictors_sbuv(MACROBLOCKD *xd,
   const int mode = xd->mode_info_context->mbmi.interintra_uv_mode;
   vp9_build_intra_predictors_internal(xd->dst.u_buffer, xd->dst.uv_stride,
                                       uintrapredictor, 32, mode, 32,
-                                      xd->up_available, xd->left_available);
+                                      xd->up_available, xd->left_available,
+                                      xd->right_available);
   vp9_build_intra_predictors_internal(xd->dst.v_buffer, xd->dst.uv_stride,
                                       vintrapredictor, 32, mode, 32,
-                                      xd->up_available, xd->left_available);
+                                      xd->up_available, xd->left_available,
+                                      xd->right_available);
   combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,
                      upred, uvstride, uintrapredictor, 32, 32);
   combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,
@@ -749,28 +679,32 @@ void vp9_build_intra_predictors_mby(MACROBLOCKD *xd) {
   vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
                                       xd->predictor, 16,
                                       xd->mode_info_context->mbmi.mode, 16,
-                                      xd->up_available, xd->left_available);
+                                      xd->up_available, xd->left_available,
+                                      xd->right_available);
 }
 
 void vp9_build_intra_predictors_mby_s(MACROBLOCKD *xd) {
   vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
                                       xd->dst.y_buffer, xd->dst.y_stride,
                                       xd->mode_info_context->mbmi.mode, 16,
-                                      xd->up_available, xd->left_available);
+                                      xd->up_available, xd->left_available,
+                                      xd->right_available);
 }
 
 void vp9_build_intra_predictors_sby_s(MACROBLOCKD *xd) {
   vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
                                       xd->dst.y_buffer, xd->dst.y_stride,
                                       xd->mode_info_context->mbmi.mode, 32,
-                                      xd->up_available, xd->left_available);
+                                      xd->up_available, xd->left_available,
+                                      xd->right_available);
 }
 
 void vp9_build_intra_predictors_sb64y_s(MACROBLOCKD *xd) {
   vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
                                       xd->dst.y_buffer, xd->dst.y_stride,
                                       xd->mode_info_context->mbmi.mode, 64,
-                                      xd->up_available, xd->left_available);
+                                      xd->up_available, xd->left_available,
+                                      xd->right_available);
 }
 
 void vp9_build_intra_predictors_mbuv_internal(MACROBLOCKD *xd,
@@ -780,10 +714,12 @@ void vp9_build_intra_predictors_mbuv_internal(MACROBLOCKD *xd,
                                               int mode, int bsize) {
   vp9_build_intra_predictors_internal(xd->dst.u_buffer, xd->dst.uv_stride,
                                       upred_ptr, uv_stride, mode, bsize,
-                                      xd->up_available, xd->left_available);
+                                      xd->up_available, xd->left_available,
+                                      xd->right_available);
   vp9_build_intra_predictors_internal(xd->dst.v_buffer, xd->dst.uv_stride,
                                       vpred_ptr, uv_stride, mode, bsize,
-                                      xd->up_available, xd->left_available);
+                                      xd->up_available, xd->left_available,
+                                      xd->right_available);
 }
 
 void vp9_build_intra_predictors_mbuv(MACROBLOCKD *xd) {
@@ -815,20 +751,35 @@ void vp9_build_intra_predictors_sb64uv_s(MACROBLOCKD *xd) {
                                            32);
 }
 
-void vp9_intra8x8_predict(BLOCKD *xd,
+void vp9_intra8x8_predict(MACROBLOCKD *xd,
+                          BLOCKD *b,
                           int mode,
                           uint8_t *predictor) {
-  vp9_build_intra_predictors_internal(*(xd->base_dst) + xd->dst,
-                                      xd->dst_stride, predictor, 16,
-                                      mode, 8, 1, 1);
+  const int block4x4_idx = (b - xd->block);
+  const int block_idx = (block4x4_idx >> 2) | !!(block4x4_idx & 2);
+  const int have_top = (block_idx >> 1) || xd->up_available;
+  const int have_left = (block_idx & 1)  || xd->left_available;
+  const int have_right = !(block_idx & 1) || xd->right_available;
+
+  vp9_build_intra_predictors_internal(*(b->base_dst) + b->dst,
+                                      b->dst_stride, predictor, 16,
+                                      mode, 8, have_top, have_left,
+                                      have_right);
 }
 
-void vp9_intra_uv4x4_predict(BLOCKD *xd,
+void vp9_intra_uv4x4_predict(MACROBLOCKD *xd,
+                             BLOCKD *b,
                              int mode,
                              uint8_t *predictor) {
-  vp9_build_intra_predictors_internal(*(xd->base_dst) + xd->dst,
-                                      xd->dst_stride, predictor, 8,
-                                      mode, 4, 1, 1);
+  const int block_idx = (b - xd->block) & 3;
+  const int have_top = (block_idx >> 1) || xd->up_available;
+  const int have_left = (block_idx & 1)  || xd->left_available;
+  const int have_right = !(block_idx & 1) || xd->right_available;
+
+  vp9_build_intra_predictors_internal(*(b->base_dst) + b->dst,
+                                      b->dst_stride, predictor, 8,
+                                      mode, 4, have_top, have_left,
+                                      have_right);
 }
 
 /* TODO: try different ways of use Y-UV mode correlation
diff --git a/vp9/common/vp9_reconintra.h b/vp9/common/vp9_reconintra.h
index 88584ad3b..3031fb699 100644
--- a/vp9/common/vp9_reconintra.h
+++ b/vp9/common/vp9_reconintra.h
@@ -14,37 +14,43 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_blockd.h"
 
-extern void vp9_recon_intra_mbuv(MACROBLOCKD *xd);
-extern B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
-                                                     int stride, int n);
-extern B_PREDICTION_MODE vp9_find_bpred_context(BLOCKD *x);
+void vp9_recon_intra_mbuv(MACROBLOCKD *xd);
+
+B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
+                                              int stride, int n);
+
+B_PREDICTION_MODE vp9_find_bpred_context(BLOCKD *x);
+
 #if CONFIG_COMP_INTERINTRA_PRED
-extern void vp9_build_interintra_16x16_predictors_mb(MACROBLOCKD *xd,
-                                                     uint8_t *ypred,
-                                                     uint8_t *upred,
-                                                     uint8_t *vpred,
-                                                     int ystride,
-                                                     int uvstride);
-extern void vp9_build_interintra_16x16_predictors_mby(MACROBLOCKD *xd,
-                                                      uint8_t *ypred,
-                                                      int ystride);
-extern void vp9_build_interintra_16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                                       uint8_t *upred,
-                                                       uint8_t *vpred,
-                                                       int uvstride);
+void vp9_build_interintra_16x16_predictors_mb(MACROBLOCKD *xd,
+                                              uint8_t *ypred,
+                                              uint8_t *upred,
+                                              uint8_t *vpred,
+                                              int ystride,
+                                              int uvstride);
+
+void vp9_build_interintra_16x16_predictors_mby(MACROBLOCKD *xd,
+                                               uint8_t *ypred,
+                                               int ystride);
+
+void vp9_build_interintra_16x16_predictors_mbuv(MACROBLOCKD *xd,
+                                                uint8_t *upred,
+                                                uint8_t *vpred,
+                                                int uvstride);
 #endif  // CONFIG_COMP_INTERINTRA_PRED
 
-extern void vp9_build_interintra_32x32_predictors_sb(MACROBLOCKD *xd,
-                                                     uint8_t *ypred,
-                                                     uint8_t *upred,
-                                                     uint8_t *vpred,
-                                                     int ystride,
-                                                     int uvstride);
-extern void vp9_build_interintra_64x64_predictors_sb(MACROBLOCKD *xd,
-                                                     uint8_t *ypred,
-                                                     uint8_t *upred,
-                                                     uint8_t *vpred,
-                                                     int ystride,
-                                                     int uvstride);
+void vp9_build_interintra_32x32_predictors_sb(MACROBLOCKD *xd,
+                                              uint8_t *ypred,
+                                              uint8_t *upred,
+                                              uint8_t *vpred,
+                                              int ystride,
+                                              int uvstride);
+
+void vp9_build_interintra_64x64_predictors_sb(MACROBLOCKD *xd,
+                                              uint8_t *ypred,
+                                              uint8_t *upred,
+                                              uint8_t *vpred,
+                                              int ystride,
+                                              int uvstride);
 
 #endif  // VP9_COMMON_VP9_RECONINTRA_H_
diff --git a/vp9/common/vp9_reconintra4x4.c b/vp9/common/vp9_reconintra4x4.c
index da607e81c..7fbee7c32 100644
--- a/vp9/common/vp9_reconintra4x4.c
+++ b/vp9/common/vp9_reconintra4x4.c
@@ -151,19 +151,99 @@ B_PREDICTION_MODE vp9_find_bpred_context(BLOCKD *x) {
 }
 #endif
 
-void vp9_intra4x4_predict(BLOCKD *x,
+void vp9_intra4x4_predict(MACROBLOCKD *xd,
+                          BLOCKD *x,
                           int b_mode,
                           uint8_t *predictor) {
   int i, r, c;
+  const int block_idx = x - xd->block;
+  const int have_top = (block_idx >> 2) || xd->up_available;
+  const int have_left = (block_idx & 3)  || xd->left_available;
+  const int have_right = (block_idx & 3) != 3 || xd->right_available;
+  uint8_t left[4], above[8], top_left;
+  /*
+   * 127 127 127 .. 127 127 127 127 127 127
+   * 129  A   B  ..  Y   Z
+   * 129  C   D  ..  W   X
+   * 129  E   F  ..  U   V
+   * 129  G   H  ..  S   T   T   T   T   T
+   *  ..
+   */
+
+  if (have_left) {
+    uint8_t *left_ptr = *(x->base_dst) + x->dst - 1;
+    const int stride = x->dst_stride;
+
+    left[0] = left_ptr[0 * stride];
+    left[1] = left_ptr[1 * stride];
+    left[2] = left_ptr[2 * stride];
+    left[3] = left_ptr[3 * stride];
+  } else {
+    left[0] = left[1] = left[2] = left[3] = 129;
+  }
+
+  if (have_top) {
+    uint8_t *above_ptr = *(x->base_dst) + x->dst - x->dst_stride;
 
-  uint8_t *above = *(x->base_dst) + x->dst - x->dst_stride;
-  uint8_t left[4];
-  uint8_t top_left = above[-1];
+    if (have_left) {
+      top_left = above_ptr[-1];
+    } else {
+      top_left = 127;
+    }
 
-  left[0] = (*(x->base_dst))[x->dst - 1];
-  left[1] = (*(x->base_dst))[x->dst - 1 + x->dst_stride];
-  left[2] = (*(x->base_dst))[x->dst - 1 + 2 * x->dst_stride];
-  left[3] = (*(x->base_dst))[x->dst - 1 + 3 * x->dst_stride];
+    above[0] = above_ptr[0];
+    above[1] = above_ptr[1];
+    above[2] = above_ptr[2];
+    above[3] = above_ptr[3];
+    if (((block_idx & 3) != 3) ||
+        (have_right && block_idx == 3 &&
+         ((xd->mb_index != 3 && xd->sb_index != 3) ||
+          ((xd->mb_index & 1) == 0 && xd->sb_index == 3)))) {
+      above[4] = above_ptr[4];
+      above[5] = above_ptr[5];
+      above[6] = above_ptr[6];
+      above[7] = above_ptr[7];
+    } else if (have_right) {
+      uint8_t *above_right = above_ptr + 4;
+
+      if (xd->sb_index == 3 && (xd->mb_index & 1))
+        above_right -= 32 * x->dst_stride;
+      if (xd->mb_index == 3)
+        above_right -= 16 * x->dst_stride;
+      above_right -= (block_idx & ~3) * x->dst_stride;
+
+      /* Use a more distant above-right edge (taken from the closest available
+       * top-right corner), with a "localized DC" shift (similar to TM-pred):
+       *
+       *  A   B   C   D   E   F   G   H
+       *  I   J   K   L
+       *  M   N   O   P
+       *  Q   R   S   T
+       *  U   V   W   X   x1  x2  x3  x4
+       *
+       * Where:
+       * x1 = clip_pixel(E + X - D)
+       * x2 = clip_pixel(F + X - D)
+       * x3 = clip_pixel(G + X - D)
+       * x4 = clip_pixel(H + X - D)
+       *
+       * This is applied whenever we use a "distant" above-right edge, i.e.
+       * one that is not immediately to the top-right of the block that we
+       * are about to intra predict.
+       */
+      above[4] = clip_pixel(above_right[0] + above_ptr[3] - above_right[-1]);
+      above[5] = clip_pixel(above_right[1] + above_ptr[3] - above_right[-1]);
+      above[6] = clip_pixel(above_right[2] + above_ptr[3] - above_right[-1]);
+      above[7] = clip_pixel(above_right[3] + above_ptr[3] - above_right[-1]);
+    } else {
+      // extend edge
+      above[4] = above[5] = above[6] = above[7] = above[3];
+    }
+  } else {
+    above[0] = above[1] = above[2] = above[3] = 127;
+    above[4] = above[5] = above[6] = above[7] = 127;
+    top_left = 127;
+  }
 
 #if CONFIG_NEWBINTRAMODES
   if (b_mode == B_CONTEXT_PRED)
@@ -411,39 +491,3 @@ void vp9_intra4x4_predict(BLOCKD *x,
 #endif
   }
 }
-
-/* copy 4 bytes from the above right down so that the 4x4 prediction modes using pixels above and
- * to the right prediction have filled in pixels to use.
- */
-void vp9_intra_prediction_down_copy(MACROBLOCKD *xd) {
-  int extend_edge = xd->mb_to_right_edge == 0 && xd->mb_index < 2;
-  uint8_t *above_right = *(xd->block[0].base_dst) + xd->block[0].dst -
-                               xd->block[0].dst_stride + 16;
-  uint32_t *dst_ptr0 = (uint32_t *)above_right;
-  uint32_t *dst_ptr1 =
-    (uint32_t *)(above_right + 4 * xd->block[0].dst_stride);
-  uint32_t *dst_ptr2 =
-    (uint32_t *)(above_right + 8 * xd->block[0].dst_stride);
-  uint32_t *dst_ptr3 =
-    (uint32_t *)(above_right + 12 * xd->block[0].dst_stride);
-
-  uint32_t *src_ptr = (uint32_t *) above_right;
-
-  if ((xd->sb_index >= 2 && xd->mb_to_right_edge == 0) ||
-      (xd->sb_index == 3 && xd->mb_index & 1))
-    src_ptr = (uint32_t *) (((uint8_t *) src_ptr) - 32 *
-                                                    xd->block[0].dst_stride);
-  if (xd->mb_index == 3 ||
-      (xd->mb_to_right_edge == 0 && xd->mb_index == 2))
-    src_ptr = (uint32_t *) (((uint8_t *) src_ptr) - 16 *
-                                                    xd->block[0].dst_stride);
-
-  if (extend_edge) {
-    *src_ptr = ((uint8_t *) src_ptr)[-1] * 0x01010101U;
-  }
-
-  *dst_ptr0 = *src_ptr;
-  *dst_ptr1 = *src_ptr;
-  *dst_ptr2 = *src_ptr;
-  *dst_ptr3 = *src_ptr;
-}
diff --git a/vp9/common/vp9_reconintra4x4.h b/vp9/common/vp9_reconintra4x4.h
deleted file mode 100644
index 4e58731e8..000000000
--- a/vp9/common/vp9_reconintra4x4.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP9_COMMON_VP9_RECONINTRA4X4_H_
-#define VP9_COMMON_VP9_RECONINTRA4X4_H_
-
-extern void vp9_intra_prediction_down_copy(MACROBLOCKD *xd);
-
-#endif  // VP9_COMMON_VP9_RECONINTRA4X4_H_
diff --git a/vp9/common/vp9_rtcd.c b/vp9/common/vp9_rtcd.c
index 277d5b217..72613ae07 100644
--- a/vp9/common/vp9_rtcd.c
+++ b/vp9/common/vp9_rtcd.c
@@ -12,10 +12,9 @@
 #include "vp9_rtcd.h"
 #include "vpx_ports/vpx_once.h"
 
-extern void vpx_scale_rtcd(void);
+void vpx_scale_rtcd(void);
 
-void vp9_rtcd()
-{
+void vp9_rtcd() {
     vpx_scale_rtcd();
     once(setup_rtcd_internal);
 }
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index ddca11931..b9acadea7 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -23,90 +23,50 @@ EOF
 }
 forward_decls vp9_common_forward_decls
 
-prototype void vp9_filter_block2d_4x4_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"
-prototype void vp9_filter_block2d_8x4_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"
-prototype void vp9_filter_block2d_8x8_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"
-prototype void vp9_filter_block2d_16x16_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"
-
-# At the very least, MSVC 2008 has compiler bug exhibited by this code; code
-# compiles warning free but a dissassembly of generated code show bugs. To be
-# on the safe side, only enabled when compiled with 'gcc'.
-if [ "$CONFIG_GCC" = "yes" ]; then
-    specialize vp9_filter_block2d_4x4_8 sse4_1 sse2
-fi
-    specialize vp9_filter_block2d_8x4_8 ssse3 #sse4_1 sse2
-    specialize vp9_filter_block2d_8x8_8 ssse3 #sse4_1 sse2
-    specialize vp9_filter_block2d_16x16_8 ssse3 #sse4_1 sse2
-
 #
 # Dequant
 #
-prototype void vp9_dequantize_b "struct blockd *x"
-specialize vp9_dequantize_b
-
-prototype void vp9_dequantize_b_2x2 "struct blockd *x"
-specialize vp9_dequantize_b_2x2
-
-prototype void vp9_dequant_dc_idct_add_y_block_8x8 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs, const int16_t *dc, struct macroblockd *xd"
-specialize vp9_dequant_dc_idct_add_y_block_8x8
-
-prototype void vp9_dequant_idct_add_y_block_8x8 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs, struct macroblockd *xd"
+prototype void vp9_dequant_idct_add_y_block_8x8 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, struct macroblockd *xd"
 specialize vp9_dequant_idct_add_y_block_8x8
 
-prototype void vp9_dequant_idct_add_uv_block_8x8 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dstu, uint8_t *dstv, int stride, uint16_t *eobs, struct macroblockd *xd"
+prototype void vp9_dequant_idct_add_uv_block_8x8 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dstu, uint8_t *dstv, int stride, struct macroblockd *xd"
 specialize vp9_dequant_idct_add_uv_block_8x8
 
 prototype void vp9_dequant_idct_add_16x16 "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int eob"
 specialize vp9_dequant_idct_add_16x16
 
-prototype void vp9_dequant_idct_add_8x8 "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int dc, int eob"
+prototype void vp9_dequant_idct_add_8x8 "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int eob"
 specialize vp9_dequant_idct_add_8x8
 
-prototype void vp9_dequant_idct_add "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride"
+prototype void vp9_dequant_idct_add "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int eob"
 specialize vp9_dequant_idct_add
 
-prototype void vp9_dequant_dc_idct_add "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int dc"
-specialize vp9_dequant_dc_idct_add
-
-prototype void vp9_dequant_dc_idct_add_y_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs, const int16_t *dcs"
-specialize vp9_dequant_dc_idct_add_y_block
-
-prototype void vp9_dequant_idct_add_y_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs"
+prototype void vp9_dequant_idct_add_y_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, struct macroblockd *xd"
 specialize vp9_dequant_idct_add_y_block
 
-prototype void vp9_dequant_idct_add_uv_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dstu, uint8_t *dstv, int stride, uint16_t *eobs"
+prototype void vp9_dequant_idct_add_uv_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dstu, uint8_t *dstv, int stride, struct macroblockd *xd"
 specialize vp9_dequant_idct_add_uv_block
 
 prototype void vp9_dequant_idct_add_32x32 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int pitch, int stride, int eob"
 specialize vp9_dequant_idct_add_32x32
 
-prototype void vp9_dequant_idct_add_uv_block_16x16 "int16_t *q, const int16_t *dq, uint8_t *dstu, uint8_t *dstv, int stride, uint16_t *eobs"
+prototype void vp9_dequant_idct_add_uv_block_16x16 "int16_t *q, const int16_t *dq, uint8_t *dstu, uint8_t *dstv, int stride, struct macroblockd *xd"
 specialize vp9_dequant_idct_add_uv_block_16x16
 
 #
 # RECON
 #
-prototype void vp9_copy_mem16x16 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
+prototype void vp9_copy_mem16x16 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
 specialize vp9_copy_mem16x16 mmx sse2 dspr2
 vp9_copy_mem16x16_dspr2=vp9_copy_mem16x16_dspr2
 
-prototype void vp9_copy_mem8x8 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
+prototype void vp9_copy_mem8x8 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
 specialize vp9_copy_mem8x8 mmx dspr2
 vp9_copy_mem8x8_dspr2=vp9_copy_mem8x8_dspr2
 
-prototype void vp9_copy_mem8x4 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
+prototype void vp9_copy_mem8x4 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
 specialize vp9_copy_mem8x4 mmx
 
-prototype void vp9_avg_mem16x16 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
-specialize vp9_avg_mem16x16
-
-prototype void vp9_avg_mem8x8 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
-specialize vp9_avg_mem8x8
-
-prototype void vp9_copy_mem8x4 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
-specialize vp9_copy_mem8x4 mmx dspr2
-vp9_copy_mem8x4_dspr2=vp9_copy_mem8x4_dspr2
-
 prototype void vp9_recon_b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
 specialize vp9_recon_b
 
@@ -164,13 +124,13 @@ specialize vp9_build_intra_predictors_sb64y_s;
 prototype void vp9_build_intra_predictors_sb64uv_s "struct macroblockd *x"
 specialize vp9_build_intra_predictors_sb64uv_s;
 
-prototype void vp9_intra4x4_predict "struct blockd *x, int b_mode, uint8_t *predictor"
+prototype void vp9_intra4x4_predict "struct macroblockd *xd, struct blockd *x, int b_mode, uint8_t *predictor"
 specialize vp9_intra4x4_predict;
 
-prototype void vp9_intra8x8_predict "struct blockd *x, int b_mode, uint8_t *predictor"
+prototype void vp9_intra8x8_predict "struct macroblockd *xd, struct blockd *x, int b_mode, uint8_t *predictor"
 specialize vp9_intra8x8_predict;
 
-prototype void vp9_intra_uv4x4_predict "struct blockd *x, int b_mode, uint8_t *predictor"
+prototype void vp9_intra_uv4x4_predict "struct macroblockd *xd, struct blockd *x, int b_mode, uint8_t *predictor"
 specialize vp9_intra_uv4x4_predict;
 
 #
@@ -263,116 +223,29 @@ specialize vp9_sad16x3 sse2
 prototype unsigned int vp9_sad3x16 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int ref_stride"
 specialize vp9_sad3x16 sse2
 
-prototype unsigned int vp9_sub_pixel_variance16x2 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
+prototype unsigned int vp9_sub_pixel_variance16x2 "const uint8_t *src_ptr, const int source_stride, const int xoffset, const int  yoffset, const uint8_t *ref_ptr, const int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance16x2 sse2
 
 #
 # Sub Pixel Filters
 #
-prototype void vp9_eighttap_predict16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict16x16
-
-prototype void vp9_eighttap_predict8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict8x8
-
-prototype void vp9_eighttap_predict_avg16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict_avg16x16
-
-prototype void vp9_eighttap_predict_avg8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict_avg8x8
-
-prototype void vp9_eighttap_predict_avg4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict_avg4x4
-
-prototype void vp9_eighttap_predict8x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict8x4
-
-prototype void vp9_eighttap_predict4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict4x4
-
-prototype void vp9_eighttap_predict16x16_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict16x16_sharp
-
-prototype void vp9_eighttap_predict8x8_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict8x8_sharp
-
-prototype void vp9_eighttap_predict_avg16x16_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict_avg16x16_sharp
-
-prototype void vp9_eighttap_predict_avg8x8_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict_avg8x8_sharp
-
-prototype void vp9_eighttap_predict_avg4x4_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict_avg4x4_sharp
-
-prototype void vp9_eighttap_predict8x4_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict8x4_sharp
-
-prototype void vp9_eighttap_predict4x4_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict4x4_sharp
-
-prototype void vp9_eighttap_predict16x16_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict16x16_smooth
-
-prototype void vp9_eighttap_predict8x8_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict8x8_smooth
-
-prototype void vp9_eighttap_predict_avg16x16_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict_avg16x16_smooth
-
-prototype void vp9_eighttap_predict_avg8x8_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict_avg8x8_smooth
+prototype void vp9_convolve8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+specialize vp9_convolve8 ssse3
 
-prototype void vp9_eighttap_predict_avg4x4_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict_avg4x4_smooth
+prototype void vp9_convolve8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+specialize vp9_convolve8_horiz ssse3
 
-prototype void vp9_eighttap_predict8x4_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict8x4_smooth
+prototype void vp9_convolve8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+specialize vp9_convolve8_vert ssse3
 
-prototype void vp9_eighttap_predict4x4_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict4x4_smooth
+prototype void vp9_convolve8_avg "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+specialize vp9_convolve8_avg ssse3
 
-prototype void vp9_sixtap_predict16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_sixtap_predict16x16
+prototype void vp9_convolve8_avg_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+specialize vp9_convolve8_avg_horiz ssse3
 
-prototype void vp9_sixtap_predict8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_sixtap_predict8x8
-
-prototype void vp9_sixtap_predict_avg16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_sixtap_predict_avg16x16
-
-prototype void vp9_sixtap_predict_avg8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_sixtap_predict_avg8x8
-
-prototype void vp9_sixtap_predict8x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_sixtap_predict8x4
-
-prototype void vp9_sixtap_predict4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_sixtap_predict4x4
-
-prototype void vp9_sixtap_predict_avg4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_sixtap_predict_avg4x4
-
-prototype void vp9_bilinear_predict16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_bilinear_predict16x16 sse2
-
-prototype void vp9_bilinear_predict8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_bilinear_predict8x8 sse2
-
-prototype void vp9_bilinear_predict_avg16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_bilinear_predict_avg16x16
-
-prototype void vp9_bilinear_predict_avg8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_bilinear_predict_avg8x8
-
-prototype void vp9_bilinear_predict8x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_bilinear_predict8x4
-
-prototype void vp9_bilinear_predict4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_bilinear_predict4x4
-
-prototype void vp9_bilinear_predict_avg4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_bilinear_predict_avg4x4
+prototype void vp9_convolve8_avg_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+specialize vp9_convolve8_avg_vert ssse3
 
 #
 # dct
@@ -389,8 +262,8 @@ specialize vp9_short_idct8x8
 prototype void vp9_short_idct10_8x8 "int16_t *input, int16_t *output, int pitch"
 specialize vp9_short_idct10_8x8
 
-prototype void vp9_short_ihaar2x2 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_ihaar2x2
+prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output"
+specialize vp9_short_idct1_8x8
 
 prototype void vp9_short_idct16x16 "int16_t *input, int16_t *output, int pitch"
 specialize vp9_short_idct16x16
@@ -398,36 +271,39 @@ specialize vp9_short_idct16x16
 prototype void vp9_short_idct10_16x16 "int16_t *input, int16_t *output, int pitch"
 specialize vp9_short_idct10_16x16
 
+prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output"
+specialize vp9_short_idct1_16x16
+
+
 prototype void vp9_short_idct32x32 "int16_t *input, int16_t *output, int pitch"
 specialize vp9_short_idct32x32
 
-prototype void vp9_ihtllm "const int16_t *input, int16_t *output, int pitch, int tx_type, int tx_dim, int16_t eobs"
-specialize vp9_ihtllm
+prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output"
+specialize vp9_short_idct1_32x32
 
-#
-# 2nd order
-#
-prototype void vp9_short_inv_walsh4x4_1 "int16_t *in, int16_t *out"
-specialize vp9_short_inv_walsh4x4_1
+prototype void vp9_short_idct10_32x32 "int16_t *input, int16_t *output, int pitch"
+specialize vp9_short_idct10_32x32
 
-prototype void vp9_short_inv_walsh4x4 "int16_t *in, int16_t *out"
-specialize vp9_short_inv_walsh4x4_
+prototype void vp9_short_iht8x8 "int16_t *input, int16_t *output, int pitch, int tx_type"
+specialize vp9_short_iht8x8
 
+prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type"
+specialize vp9_short_iht4x4
+
+prototype void vp9_short_iht16x16 "int16_t *input, int16_t *output, int pitch, int tx_type"
+specialize vp9_short_iht16x16
 
 # dct and add
-prototype void vp9_dc_only_idct_add_8x8 "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
-specialize vp9_dc_only_idct_add_8x8
 
 prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
-specialize vp9_dc_only_idct_add
+specialize vp9_dc_only_idct_add sse2
 
-if [ "$CONFIG_LOSSLESS" = "yes" ]; then
 prototype void vp9_short_inv_walsh4x4_1_x8 "int16_t *input, int16_t *output, int pitch"
+specialize vp9_short_inv_walsh4x4_1_x8
 prototype void vp9_short_inv_walsh4x4_x8 "int16_t *input, int16_t *output, int pitch"
+specialize vp9_short_inv_walsh4x4_x8
 prototype void vp9_dc_only_inv_walsh_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
-prototype void vp9_short_inv_walsh4x4_1_lossless "int16_t *in, int16_t *out"
-prototype void vp9_short_inv_walsh4x4_lossless "int16_t *in, int16_t *out"
-fi
+specialize vp9_dc_only_inv_walsh_add
 
 prototype unsigned int vp9_sad32x3 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad"
 specialize vp9_sad32x3
@@ -475,58 +351,52 @@ specialize vp9_variance4x4 mmx sse2
 vp9_variance4x4_sse2=vp9_variance4x4_wmt
 vp9_variance4x4_mmx=vp9_variance4x4_mmx
 
-prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp9_sub_pixel_variance64x64
+prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_sub_pixel_variance64x64 sse2
 
-prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp9_sub_pixel_variance32x32
+prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_sub_pixel_variance32x32 sse2
 
-prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
+prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3
-vp9_sub_pixel_variance16x16_sse2=vp9_sub_pixel_variance16x16_wmt
 
-prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
+prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance8x16 sse2 mmx
 vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt
 
-prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
+prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3
 vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_ssse3;
 vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt
 
-prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
+prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance8x8 sse2 mmx
 vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt
 
-prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
+prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance4x4 sse2 mmx
 vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
 
 prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
-specialize vp9_sad64x64
+specialize vp9_sad64x64 sse2
 
 prototype unsigned int vp9_sad32x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
-specialize vp9_sad32x32
+specialize vp9_sad32x32 sse2
 
 prototype unsigned int vp9_sad16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
-specialize vp9_sad16x16 mmx sse2 sse3
-vp9_sad16x16_sse2=vp9_sad16x16_wmt
+specialize vp9_sad16x16 mmx sse2
 
 prototype unsigned int vp9_sad16x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
 specialize vp9_sad16x8 mmx sse2
-vp9_sad16x8_sse2=vp9_sad16x8_wmt
 
 prototype unsigned int vp9_sad8x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
 specialize vp9_sad8x16 mmx sse2
-vp9_sad8x16_sse2=vp9_sad8x16_wmt
 
 prototype unsigned int vp9_sad8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
 specialize vp9_sad8x8 mmx sse2
-vp9_sad8x8_sse2=vp9_sad8x8_wmt
 
 prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
-specialize vp9_sad4x4 mmx sse2
-vp9_sad4x4_sse2=vp9_sad4x4_wmt
+specialize vp9_sad4x4 mmx sse
 
 prototype unsigned int vp9_variance_halfpixvar16x16_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance_halfpixvar16x16_h mmx sse2
@@ -579,76 +449,64 @@ specialize vp9_sad8x8x3 sse3
 prototype void vp9_sad4x4x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad4x4x3 sse3
 
-prototype void vp9_sad64x64x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"
+prototype void vp9_sad64x64x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
 specialize vp9_sad64x64x8
 
-prototype void vp9_sad32x32x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"
+prototype void vp9_sad32x32x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
 specialize vp9_sad32x32x8
 
-prototype void vp9_sad16x16x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"
+prototype void vp9_sad16x16x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
 specialize vp9_sad16x16x8 sse4
 
-prototype void vp9_sad16x8x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"
+prototype void vp9_sad16x8x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
 specialize vp9_sad16x8x8 sse4
 
-prototype void vp9_sad8x16x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"
+prototype void vp9_sad8x16x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
 specialize vp9_sad8x16x8 sse4
 
-prototype void vp9_sad8x8x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"
+prototype void vp9_sad8x8x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
 specialize vp9_sad8x8x8 sse4
 
-prototype void vp9_sad4x4x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"
+prototype void vp9_sad4x4x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
 specialize vp9_sad4x4x8 sse4
 
 prototype void vp9_sad64x64x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad64x64x4d
+specialize vp9_sad64x64x4d sse2
 
 prototype void vp9_sad32x32x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad32x32x4d
+specialize vp9_sad32x32x4d sse2
 
 prototype void vp9_sad16x16x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad16x16x4d sse3
+specialize vp9_sad16x16x4d sse2
 
 prototype void vp9_sad16x8x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad16x8x4d sse3
+specialize vp9_sad16x8x4d sse2
 
 prototype void vp9_sad8x16x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad8x16x4d sse3
+specialize vp9_sad8x16x4d sse2
 
 prototype void vp9_sad8x8x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad8x8x4d sse3
+specialize vp9_sad8x8x4d sse2
 
 prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad4x4x4d sse3
-
-#
-# Block copy
-#
-case $arch in
-    x86*)
-    prototype void vp9_copy32xn "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, int n"
-    specialize vp9_copy32xn sse2 sse3
-    ;;
-esac
-
+specialize vp9_sad4x4x4d sse
 prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
 specialize vp9_sub_pixel_mse16x16 sse2 mmx
-vp9_sub_pixel_mse16x16_sse2=vp9_sub_pixel_mse16x16_wmt
 
 prototype unsigned int vp9_mse16x16 "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse"
 specialize vp9_mse16x16 mmx sse2
 vp9_mse16x16_sse2=vp9_mse16x16_wmt
 
-prototype unsigned int vp9_sub_pixel_mse64x64 "const uint8_t *src_ptr, int  source_stride, int  xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
+prototype unsigned int vp9_sub_pixel_mse64x64 "const uint8_t *src_ptr, int  source_stride, int  xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_mse64x64
 
-prototype unsigned int vp9_sub_pixel_mse32x32 "const uint8_t *src_ptr, int  source_stride, int  xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
+prototype unsigned int vp9_sub_pixel_mse32x32 "const uint8_t *src_ptr, int  source_stride, int  xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_mse32x32
 
 prototype unsigned int vp9_get_mb_ss "const int16_t *"
 specialize vp9_get_mb_ss mmx sse2
 # ENCODEMB INVOKE
-prototype int vp9_mbblock_error "struct macroblock *mb, int dc"
+prototype int vp9_mbblock_error "struct macroblock *mb"
 specialize vp9_mbblock_error mmx sse2
 vp9_mbblock_error_sse2=vp9_mbblock_error_xmm
 
@@ -686,14 +544,17 @@ if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then
 fi
 
 # fdct functions
-prototype void vp9_fht "const int16_t *input, int pitch, int16_t *output, int tx_type, int tx_dim"
-specialize vp9_fht
+prototype void vp9_short_fht4x4 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type"
+specialize vp9_short_fht4x4
 
-prototype void vp9_short_fdct8x8 "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_fdct8x8
+prototype void vp9_short_fht8x8 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type"
+specialize vp9_short_fht8x8
+
+prototype void vp9_short_fht16x16 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type"
+specialize vp9_short_fht16x16
 
-prototype void vp9_short_fhaar2x2 "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_fhaar2x2
+prototype void vp9_short_fdct8x8 "int16_t *InputData, int16_t *OutputData, int pitch"
+specialize vp9_short_fdct8x8 sse2
 
 prototype void vp9_short_fdct4x4 "int16_t *InputData, int16_t *OutputData, int pitch"
 specialize vp9_short_fdct4x4
@@ -701,18 +562,12 @@ specialize vp9_short_fdct4x4
 prototype void vp9_short_fdct8x4 "int16_t *InputData, int16_t *OutputData, int pitch"
 specialize vp9_short_fdct8x4
 
-prototype void vp9_short_walsh4x4 "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_walsh4x4
-
 prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int pitch"
 specialize vp9_short_fdct32x32
 
 prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int pitch"
 specialize vp9_short_fdct16x16
 
-prototype void vp9_short_walsh4x4_lossless "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_walsh4x4_lossless
-
 prototype void vp9_short_walsh4x4_x8 "int16_t *InputData, int16_t *OutputData, int pitch"
 specialize vp9_short_walsh4x4_x8
 
diff --git a/vp9/common/vp9_sadmxn.h b/vp9/common/vp9_sadmxn.h
index fe3cdc2b3..b2dfd63f9 100644
--- a/vp9/common/vp9_sadmxn.h
+++ b/vp9/common/vp9_sadmxn.h
@@ -11,14 +11,15 @@
 #ifndef VP9_COMMON_VP9_SADMXN_H_
 #define VP9_COMMON_VP9_SADMXN_H_
 
+#include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 
-static __inline unsigned int sad_mx_n_c(const uint8_t *src_ptr,
-                                        int src_stride,
-                                        const uint8_t *ref_ptr,
-                                        int ref_stride,
-                                        int m,
-                                        int n) {
+static INLINE unsigned int sad_mx_n_c(const uint8_t *src_ptr,
+                                      int src_stride,
+                                      const uint8_t *ref_ptr,
+                                      int ref_stride,
+                                      int m,
+                                      int n) {
   int r, c;
   unsigned int sad = 0;
 
diff --git a/vp9/common/vp9_seg_common.c b/vp9/common/vp9_seg_common.c
index 89c1e458d..07a4d4484 100644
--- a/vp9/common/vp9_seg_common.c
+++ b/vp9/common/vp9_seg_common.c
@@ -12,9 +12,9 @@
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_seg_common.h"
 
-static const int segfeaturedata_signed[SEG_LVL_MAX] = { 1, 1, 0, 0, 0, 0 };
+static const int segfeaturedata_signed[SEG_LVL_MAX] = { 1, 1, 0, 0, 0 };
 static const int seg_feature_data_max[SEG_LVL_MAX] =
-                 { MAXQ, 63, 0xf, MB_MODE_COUNT - 1, 255, TX_SIZE_MAX_SB - 1};
+                 { MAXQ, 63, 0xf, 0xf, TX_SIZE_MAX_SB - 1};
 
 // These functions provide access to new segment level features.
 // Eventually these function may be "optimized out" but for the moment,
diff --git a/vp9/common/vp9_setupintrarecon.h b/vp9/common/vp9_setupintrarecon.h
index 457265528..e389f3c91 100644
--- a/vp9/common/vp9_setupintrarecon.h
+++ b/vp9/common/vp9_setupintrarecon.h
@@ -13,6 +13,6 @@
 
 #include "vpx_scale/yv12config.h"
 
-extern void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf);
+void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf);
 
 #endif  // VP9_COMMON_VP9_SETUPINTRARECON_H_
diff --git a/vp9/common/vp9_subpixel.h b/vp9/common/vp9_subpixel.h
deleted file mode 100644
index dc4eadfb1..000000000
--- a/vp9/common/vp9_subpixel.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_COMMON_VP9_SUBPIXEL_H_
-#define VP9_COMMON_VP9_SUBPIXEL_H_
-
-#define prototype_subpixel_predict(sym) \
-  void sym(uint8_t *src, int src_pitch, int xofst, int yofst, \
-           uint8_t *dst, int dst_pitch)
-
-typedef prototype_subpixel_predict((*vp9_subpix_fn_t));
-
-#endif  // VP9_COMMON_VP9_SUBPIXEL_H_
diff --git a/vp9/common/vp9_textblit.c b/vp9/common/vp9_textblit.c
index 52c6b87c6..60e95e08f 100644
--- a/vp9/common/vp9_textblit.c
+++ b/vp9/common/vp9_textblit.c
@@ -12,22 +12,26 @@
 
 #include "vp9/common/vp9_textblit.h"
 
+static const int font[] = {
+  0x0, 0x5C00, 0x8020, 0xAFABEA, 0xD7EC0, 0x1111111, 0x1855740, 0x18000,
+  0x45C0, 0x74400, 0x51140, 0x23880, 0xC4000, 0x21080, 0x80000, 0x111110,
+  0xE9D72E, 0x87E40, 0x12AD732, 0xAAD62A, 0x4F94C4, 0x4D6B7, 0x456AA,
+  0x3E8423, 0xAAD6AA, 0xAAD6A2, 0x2800, 0x2A00, 0x8A880, 0x52940, 0x22A20,
+  0x15422, 0x6AD62E, 0x1E4A53E, 0xAAD6BF, 0x8C62E, 0xE8C63F, 0x118D6BF,
+  0x1094BF, 0xCAC62E, 0x1F2109F, 0x118FE31, 0xF8C628, 0x8A89F, 0x108421F,
+  0x1F1105F, 0x1F4105F, 0xE8C62E, 0x2294BF, 0x164C62E, 0x12694BF, 0x8AD6A2,
+  0x10FC21, 0x1F8421F, 0x744107, 0xF8220F, 0x1151151, 0x117041, 0x119D731,
+  0x47E0, 0x1041041, 0xFC400, 0x10440, 0x1084210, 0x820
+};
+
+static void plot(int x, int y, unsigned char *image, int pitch) {
+  image[x + y * pitch] ^= 255;  // XOR with 255 inverts the byte at (x, y)
+}
+
 void vp9_blit_text(const char *msg, unsigned char *address, const int pitch) {
   int letter_bitmap;
   unsigned char *output_pos = address;
-  int colpos;
-  const int font[] = {
-    0x0, 0x5C00, 0x8020, 0xAFABEA, 0xD7EC0, 0x1111111, 0x1855740, 0x18000,
-    0x45C0, 0x74400, 0x51140, 0x23880, 0xC4000, 0x21080, 0x80000, 0x111110,
-    0xE9D72E, 0x87E40, 0x12AD732, 0xAAD62A, 0x4F94C4, 0x4D6B7, 0x456AA,
-    0x3E8423, 0xAAD6AA, 0xAAD6A2, 0x2800, 0x2A00, 0x8A880, 0x52940, 0x22A20,
-    0x15422, 0x6AD62E, 0x1E4A53E, 0xAAD6BF, 0x8C62E, 0xE8C63F, 0x118D6BF,
-    0x1094BF, 0xCAC62E, 0x1F2109F, 0x118FE31, 0xF8C628, 0x8A89F, 0x108421F,
-    0x1F1105F, 0x1F4105F, 0xE8C62E, 0x2294BF, 0x164C62E, 0x12694BF, 0x8AD6A2,
-    0x10FC21, 0x1F8421F, 0x744107, 0xF8220F, 0x1151151, 0x117041, 0x119D731,
-    0x47E0, 0x1041041, 0xFC400, 0x10440, 0x1084210, 0x820
-  };
-  colpos = 0;
+  int colpos = 0;
 
   while (msg[colpos] != 0) {
     char letter = msg[colpos];
@@ -50,12 +54,11 @@ void vp9_blit_text(const char *msg, unsigned char *address, const int pitch) {
   }
 }
 
-static void plot(const int x, const int y, unsigned char *image, const int pitch) {
-  image [x + y * pitch] ^= 255;
-}
+
 
 /* Bresenham line algorithm */
-void vp9_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch) {
+void vp9_blit_line(int x0, int x1, int y0, int y1, unsigned char *image,
+                   int pitch) {
   int steep = abs(y1 - y0) > abs(x1 - x0);
   int deltax, deltay;
   int error, ystep, y, x;
diff --git a/vp9/common/vp9_textblit.h b/vp9/common/vp9_textblit.h
index 8285aa7fd..c968628fe 100644
--- a/vp9/common/vp9_textblit.h
+++ b/vp9/common/vp9_textblit.h
@@ -11,9 +11,9 @@
 #ifndef VP9_COMMON_VP9_TEXTBLIT_H_
 #define VP9_COMMON_VP9_TEXTBLIT_H_
 
-extern void vp9_blit_text(const char *msg, unsigned char *address,
-                          const int pitch);
-extern void vp9_blit_line(int x0, int x1, int y0, int y1,
-                          unsigned char *image, const int pitch);
+void vp9_blit_text(const char *msg, unsigned char *address, int pitch);
+
+void vp9_blit_line(int x0, int x1, int y0, int y1, unsigned char *image,
+                   int pitch);
 
 #endif  // VP9_COMMON_VP9_TEXTBLIT_H_
diff --git a/vp9/common/vp9_tile_common.c b/vp9/common/vp9_tile_common.c
new file mode 100644
index 000000000..29f89b618
--- /dev/null
+++ b/vp9/common/vp9_tile_common.c
@@ -0,0 +1,55 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/vp9_tile_common.h"
+
+static void vp9_get_tile_offsets(VP9_COMMON *cm, int *min_tile_off,
+                                 int *max_tile_off, int tile_idx,
+                                 int log2_n_tiles, int n_mbs) {  // cm unused
+  const int n_sbs = (n_mbs + 3) >> 2;  // groups of 4 MBs, rounded up
+  const int sb_off1 =  (tile_idx      * n_sbs) >> log2_n_tiles;  // tile start
+  const int sb_off2 = ((tile_idx + 1) * n_sbs) >> log2_n_tiles;  // tile end
+
+  *min_tile_off = (sb_off1 << 2) > n_mbs ? n_mbs : (sb_off1 << 2);  // clamp
+  *max_tile_off = (sb_off2 << 2) > n_mbs ? n_mbs : (sb_off2 << 2);  // clamp
+}
+
+void vp9_get_tile_col_offsets(VP9_COMMON *cm, int tile_col_idx) {
+  cm->cur_tile_col_idx = tile_col_idx;  // record the selected tile column
+  vp9_get_tile_offsets(cm, &cm->cur_tile_mb_col_start,
+                       &cm->cur_tile_mb_col_end, tile_col_idx,
+                       cm->log2_tile_columns, cm->mb_cols);  // sets start/end
+}
+
+void vp9_get_tile_row_offsets(VP9_COMMON *cm, int tile_row_idx) {
+  cm->cur_tile_row_idx = tile_row_idx;  // record the selected tile row
+  vp9_get_tile_offsets(cm, &cm->cur_tile_mb_row_start,
+                       &cm->cur_tile_mb_row_end, tile_row_idx,
+                       cm->log2_tile_rows, cm->mb_rows);  // sets start/end
+}
+
+#define MIN_TILE_WIDTH_SBS (MIN_TILE_WIDTH >> 6)
+#define MAX_TILE_WIDTH_SBS (MAX_TILE_WIDTH >> 6)
+
+void vp9_get_tile_n_bits(VP9_COMMON *cm, int *min_log2_n_tiles_ptr,
+                         int *delta_log2_n_tiles) {
+  // Outputs the smallest legal log2(tile columns) and the spread up to the
+  // largest one, so that tiles stay within [MIN,MAX]_TILE_WIDTH_SBS.
+  const int sb_cols = (cm->mb_cols + 3) >> 2;
+  int min_log2_n_tiles = 0, max_log2_n_tiles = 0;
+
+  // Probe the next split first: accepting it must not go under the minimum.
+  while ((sb_cols >> (max_log2_n_tiles + 1)) >= MIN_TILE_WIDTH_SBS)
+    max_log2_n_tiles++;
+  while ((MAX_TILE_WIDTH_SBS << min_log2_n_tiles) < sb_cols)
+    min_log2_n_tiles++;
+  *min_log2_n_tiles_ptr = min_log2_n_tiles;
+  *delta_log2_n_tiles = max_log2_n_tiles - min_log2_n_tiles;
+}
diff --git a/vp9/common/vp9_tile_common.h b/vp9/common/vp9_tile_common.h
new file mode 100644
index 000000000..ea6935601
--- /dev/null
+++ b/vp9/common/vp9_tile_common.h
@@ -0,0 +1,26 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_TILE_COMMON_H_
+#define VP9_COMMON_VP9_TILE_COMMON_H_
+
+#include "vp9/common/vp9_onyxc_int.h"
+
+#define MIN_TILE_WIDTH 256
+#define MAX_TILE_WIDTH 4096
+
+void vp9_get_tile_col_offsets(VP9_COMMON *cm, int tile_col_idx);
+
+void vp9_get_tile_row_offsets(VP9_COMMON *cm, int tile_row_idx);
+
+void vp9_get_tile_n_bits(VP9_COMMON *cm, int *min_log2_n_tiles,
+                         int *delta_log2_n_tiles);
+
+#endif  // VP9_COMMON_VP9_TILE_COMMON_H_
diff --git a/vp9/common/vp9_treecoder.h b/vp9/common/vp9_treecoder.h
index 0c0c5e96e..f9f1d135e 100644
--- a/vp9/common/vp9_treecoder.h
+++ b/vp9/common/vp9_treecoder.h
@@ -11,6 +11,7 @@
 #ifndef VP9_COMMON_VP9_TREECODER_H_
 #define VP9_COMMON_VP9_TREECODER_H_
 
+#include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 
 typedef uint8_t vp9_prob;
@@ -53,20 +54,29 @@ void vp9_tree_probs_from_distribution(int n,  /* n = size of alphabet */
                                       unsigned int branch_ct[ /* n - 1 */ ][2],
                                       const unsigned int num_events[ /* n */ ]);
 
-static __inline vp9_prob clip_prob(int p) {
+static INLINE vp9_prob clip_prob(int p) {
   return (p > 255) ? 255u : (p < 1) ? 1u : p;
 }
 
-static __inline vp9_prob get_prob(int num, int den) {
+// int64 is not needed for normal frame level calculations.
+// However when outputting entropy stats accumulated over many frames
+// or even clips we can overflow int math.
+#ifdef ENTROPY_STATS
+static INLINE vp9_prob get_prob(int num, int den) {
+  return (den == 0) ? 128u : clip_prob(((int64_t)num * 256 + (den >> 1)) / den);
+}
+#else
+static INLINE vp9_prob get_prob(int num, int den) {
   return (den == 0) ? 128u : clip_prob((num * 256 + (den >> 1)) / den);
 }
+#endif
 
-static __inline vp9_prob get_binary_prob(int n0, int n1) {
+static INLINE vp9_prob get_binary_prob(int n0, int n1) {
   return get_prob(n0, n0 + n1);
 }
 
 /* this function assumes prob1 and prob2 are already within [1,255] range */
-static __inline vp9_prob weighted_prob(int prob1, int prob2, int factor) {
+static INLINE vp9_prob weighted_prob(int prob1, int prob2, int factor) {
   return (prob1 * (256 - factor) + prob2 * factor + 128) >> 8;
 }
 
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index f09e2d78b..6d3bb021a 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -8,91 +8,11 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <assert.h>
 
 #include "./vpx_config.h"
+#include "./vp9_rtcd.h"
 #include "vpx_ports/mem.h"
-#include "vp9/common/vp9_subpixel.h"
-
-extern const short vp9_six_tap_mmx[8][6 * 8];
-
-extern void vp9_filter_block1d_h6_mmx(unsigned char   *src_ptr,
-                                      unsigned short  *output_ptr,
-                                      unsigned int     src_pixels_per_line,
-                                      unsigned int     pixel_step,
-                                      unsigned int     output_height,
-                                      unsigned int     output_width,
-                                      const short     *vp9_filter);
-
-extern void vp9_filter_block1dc_v6_mmx(unsigned short *src_ptr,
-                                       unsigned char  *output_ptr,
-                                       int             output_pitch,
-                                       unsigned int    pixels_per_line,
-                                       unsigned int    pixel_step,
-                                       unsigned int    output_height,
-                                       unsigned int    output_width,
-                                       const short    *vp9_filter);
-
-extern void vp9_filter_block1d8_h6_sse2(unsigned char  *src_ptr,
-                                        unsigned short *output_ptr,
-                                        unsigned int    src_pixels_per_line,
-                                        unsigned int    pixel_step,
-                                        unsigned int    output_height,
-                                        unsigned int    output_width,
-                                        const short    *vp9_filter);
-
-extern void vp9_filter_block1d16_h6_sse2(unsigned char  *src_ptr,
-                                         unsigned short *output_ptr,
-                                         unsigned int    src_pixels_per_line,
-                                         unsigned int    pixel_step,
-                                         unsigned int    output_height,
-                                         unsigned int    output_width,
-                                         const short    *vp9_filter);
-
-extern void vp9_filter_block1d8_v6_sse2(unsigned short *src_ptr,
-                                        unsigned char *output_ptr,
-                                        int dst_ptich,
-                                        unsigned int pixels_per_line,
-                                        unsigned int pixel_step,
-                                        unsigned int output_height,
-                                        unsigned int output_width,
-                                        const short    *vp9_filter);
-
-extern void vp9_filter_block1d16_v6_sse2(unsigned short *src_ptr,
-                                         unsigned char *output_ptr,
-                                         int dst_ptich,
-                                         unsigned int pixels_per_line,
-                                         unsigned int pixel_step,
-                                         unsigned int output_height,
-                                         unsigned int output_width,
-                                         const short    *vp9_filter);
-
-extern void vp9_unpack_block1d16_h6_sse2(unsigned char  *src_ptr,
-                                         unsigned short *output_ptr,
-                                         unsigned int    src_pixels_per_line,
-                                         unsigned int    output_height,
-                                         unsigned int    output_width);
-
-extern void vp9_filter_block1d8_h6_only_sse2(unsigned char *src_ptr,
-                                             unsigned int   src_pixels_per_line,
-                                             unsigned char *output_ptr,
-                                             int            dst_pitch,
-                                             unsigned int   output_height,
-                                             const short   *vp9_filter);
-
-extern void vp9_filter_block1d16_h6_only_sse2(unsigned char *src_ptr,
-                                              unsigned int   src_pixels_per_lin,
-                                              unsigned char *output_ptr,
-                                              int            dst_pitch,
-                                              unsigned int   output_height,
-                                              const short   *vp9_filter);
-
-extern void vp9_filter_block1d8_v6_only_sse2(unsigned char *src_ptr,
-                                             unsigned int   src_pixels_per_line,
-                                             unsigned char *output_ptr,
-                                             int            dst_pitch,
-                                             unsigned int   output_height,
-                                             const short   *vp9_filter);
-
 ///////////////////////////////////////////////////////////////////////////
 // the mmx function that does the bilinear filtering and var calculation //
 // int one pass                                                          //
@@ -116,389 +36,7 @@ DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = {
   {   8,  8,  8,  8, 120, 120, 120, 120 }
 };
 
-#if HAVE_MMX
-void vp9_sixtap_predict4x4_mmx(unsigned char  *src_ptr,
-                               int  src_pixels_per_line,
-                               int  xoffset,
-                               int  yoffset,
-                               unsigned char *dst_ptr,
-                               int  dst_pitch) {
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict4x4_mmx\n");
-#endif
-  /* Temp data bufffer used in filtering */
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 16 * 16);
-  const short *hfilter, *vfilter;
-  hfilter = vp9_six_tap_mmx[xoffset];
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), fdata2,
-                            src_pixels_per_line, 1, 9, 8, hfilter);
-  vfilter = vp9_six_tap_mmx[yoffset];
-  vp9_filter_block1dc_v6_mmx(fdata2 + 8, dst_ptr, dst_pitch,
-                             8, 4, 4, 4, vfilter);
-}
-
-void vp9_sixtap_predict16x16_mmx(unsigned char  *src_ptr,
-                                 int  src_pixels_per_line,
-                                 int  xoffset,
-                                 int  yoffset,
-                                 unsigned char *dst_ptr,
-                                 int dst_pitch) {
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict16x16_mmx\n");
-#endif
-  /* Temp data bufffer used in filtering */
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24);
-  const short *hfilter, *vfilter;
-
-  hfilter = vp9_six_tap_mmx[xoffset];
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
-                            fdata2,   src_pixels_per_line, 1, 21, 32,
-                            hfilter);
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
-                            fdata2 + 4, src_pixels_per_line, 1, 21, 32,
-                            hfilter);
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8,
-                            fdata2 + 8, src_pixels_per_line, 1, 21, 32,
-                            hfilter);
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12,
-                            fdata2 + 12, src_pixels_per_line, 1, 21, 32,
-                            hfilter);
-
-  vfilter = vp9_six_tap_mmx[yoffset];
-  vp9_filter_block1dc_v6_mmx(fdata2 + 32, dst_ptr,      dst_pitch,
-                             32, 16, 16, 16, vfilter);
-  vp9_filter_block1dc_v6_mmx(fdata2 + 36, dst_ptr + 4,  dst_pitch,
-                             32, 16, 16, 16, vfilter);
-  vp9_filter_block1dc_v6_mmx(fdata2 + 40, dst_ptr + 8,  dst_pitch,
-                             32, 16, 16, 16, vfilter);
-  vp9_filter_block1dc_v6_mmx(fdata2 + 44, dst_ptr + 12, dst_pitch,
-                             32, 16, 16, 16, vfilter);
-}
-
-void vp9_sixtap_predict8x8_mmx(unsigned char  *src_ptr,
-                               int  src_pixels_per_line,
-                               int  xoffset,
-                               int  yoffset,
-                               unsigned char *dst_ptr,
-                               int  dst_pitch) {
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict8x8_mmx\n");
-#endif
-  /* Temp data bufffer used in filtering */
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
-  const short *hfilter, *vfilter;
-
-  hfilter = vp9_six_tap_mmx[xoffset];
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
-                            fdata2,   src_pixels_per_line, 1, 13, 16,
-                            hfilter);
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
-                            fdata2 + 4, src_pixels_per_line, 1, 13, 16,
-                            hfilter);
-
-  vfilter = vp9_six_tap_mmx[yoffset];
-  vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr,     dst_pitch,
-                             16, 8, 8, 8, vfilter);
-  vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,
-                             16, 8, 8, 8, vfilter);
-}
-
-void vp9_sixtap_predict8x4_mmx(unsigned char  *src_ptr,
-                               int  src_pixels_per_line,
-                               int  xoffset,
-                               int  yoffset,
-                               unsigned char *dst_ptr,
-                               int  dst_pitch) {
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict8x4_mmx\n");
-#endif
-  /* Temp data bufffer used in filtering */
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
-  const short *hfilter, *vfilter;
-
-  hfilter = vp9_six_tap_mmx[xoffset];
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
-                            fdata2,   src_pixels_per_line, 1, 9, 16, hfilter);
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
-                            fdata2 + 4, src_pixels_per_line, 1, 9, 16, hfilter);
-
-  vfilter = vp9_six_tap_mmx[yoffset];
-  vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr,     dst_pitch,
-                             16, 8, 4, 8, vfilter);
-  vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,
-                             16, 8, 4, 8, vfilter);
-}
-#endif
-
-#if HAVE_SSE2
-void vp9_sixtap_predict16x16_sse2(unsigned char  *src_ptr,
-                                  int  src_pixels_per_line,
-                                  int  xoffset,
-                                  int  yoffset,
-                                  unsigned char *dst_ptr,
-                                  int  dst_pitch) {
-  /* Temp data bufffer used in filtering */
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24);
-  const short *hfilter, *vfilter;
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict16x16_sse2\n");
-#endif
-
-  if (xoffset) {
-    if (yoffset) {
-      hfilter = vp9_six_tap_mmx[xoffset];
-      vp9_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
-                                   src_pixels_per_line, 1, 21, 32, hfilter);
-      vfilter = vp9_six_tap_mmx[yoffset];
-      vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch,
-                                   32, 16, 16, dst_pitch, vfilter);
-    } else {
-      /* First-pass only */
-      hfilter = vp9_six_tap_mmx[xoffset];
-      vp9_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line,
-                                        dst_ptr, dst_pitch, 16, hfilter);
-    }
-  } else {
-    /* Second-pass only */
-    vfilter = vp9_six_tap_mmx[yoffset];
-    vp9_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
-                                 src_pixels_per_line, 21, 32);
-    vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch,
-                                 32, 16, 16, dst_pitch, vfilter);
-  }
-}
-
-void vp9_sixtap_predict8x8_sse2(unsigned char  *src_ptr,
-                                int  src_pixels_per_line,
-                                int  xoffset,
-                                int  yoffset,
-                                unsigned char *dst_ptr,
-                                int  dst_pitch) {
-  /* Temp data bufffer used in filtering */
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
-  const short *hfilter, *vfilter;
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict8x8_sse2\n");
-#endif
-
-  if (xoffset) {
-    if (yoffset) {
-      hfilter = vp9_six_tap_mmx[xoffset];
-      vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
-                                  src_pixels_per_line, 1, 13, 16, hfilter);
-      vfilter = vp9_six_tap_mmx[yoffset];
-      vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch,
-                                  16, 8, 8, dst_pitch, vfilter);
-    } else {
-      /* First-pass only */
-      hfilter = vp9_six_tap_mmx[xoffset];
-      vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line,
-                                       dst_ptr, dst_pitch, 8, hfilter);
-    }
-  } else {
-    /* Second-pass only */
-    vfilter = vp9_six_tap_mmx[yoffset];
-    vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
-                                     src_pixels_per_line,
-                                     dst_ptr, dst_pitch, 8, vfilter);
-  }
-}
-
-void vp9_sixtap_predict8x4_sse2(unsigned char  *src_ptr,
-                                int  src_pixels_per_line,
-                                int  xoffset,
-                                int  yoffset,
-                                unsigned char *dst_ptr,
-                                int  dst_pitch) {
-  /* Temp data bufffer used in filtering */
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
-  const short *hfilter, *vfilter;
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict8x4_sse2\n");
-#endif
-
-  if (xoffset) {
-    if (yoffset) {
-      hfilter = vp9_six_tap_mmx[xoffset];
-      vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
-                                  src_pixels_per_line, 1, 9, 16, hfilter);
-      vfilter = vp9_six_tap_mmx[yoffset];
-      vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch,
-                                  16, 8, 4, dst_pitch, vfilter);
-    } else {
-      /* First-pass only */
-      hfilter = vp9_six_tap_mmx[xoffset];
-      vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line,
-                                       dst_ptr, dst_pitch, 4, hfilter);
-    }
-  } else {
-    /* Second-pass only */
-    vfilter = vp9_six_tap_mmx[yoffset];
-    vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
-                                     src_pixels_per_line,
-                                     dst_ptr, dst_pitch, 4, vfilter);
-  }
-}
-#endif
-
 #if HAVE_SSSE3
-extern void vp9_filter_block1d8_h6_ssse3(unsigned char  *src_ptr,
-                                         unsigned int    src_pixels_per_line,
-                                         unsigned char  *output_ptr,
-                                         unsigned int    output_pitch,
-                                         unsigned int    output_height,
-                                         unsigned int    vp9_filter_index);
-
-extern void vp9_filter_block1d16_h6_ssse3(unsigned char  *src_ptr,
-                                          unsigned int    src_pixels_per_line,
-                                          unsigned char  *output_ptr,
-                                          unsigned int    output_pitch,
-                                          unsigned int    output_height,
-                                          unsigned int    vp9_filter_index);
-
-extern void vp9_filter_block1d16_v6_ssse3(unsigned char *src_ptr,
-                                          unsigned int   src_pitch,
-                                          unsigned char *output_ptr,
-                                          unsigned int   out_pitch,
-                                          unsigned int   output_height,
-                                          unsigned int   vp9_filter_index);
-
-extern void vp9_filter_block1d8_v6_ssse3(unsigned char *src_ptr,
-                                         unsigned int   src_pitch,
-                                         unsigned char *output_ptr,
-                                         unsigned int   out_pitch,
-                                         unsigned int   output_height,
-                                         unsigned int   vp9_filter_index);
-
-extern void vp9_filter_block1d4_h6_ssse3(unsigned char  *src_ptr,
-                                         unsigned int    src_pixels_per_line,
-                                         unsigned char  *output_ptr,
-                                         unsigned int    output_pitch,
-                                         unsigned int    output_height,
-                                         unsigned int    vp9_filter_index);
-
-extern void vp9_filter_block1d4_v6_ssse3(unsigned char *src_ptr,
-                                         unsigned int   src_pitch,
-                                         unsigned char *output_ptr,
-                                         unsigned int   out_pitch,
-                                         unsigned int   output_height,
-                                         unsigned int   vp9_filter_index);
-
-void vp9_sixtap_predict16x16_ssse3(unsigned char  *src_ptr,
-                                   int  src_pixels_per_line,
-                                   int  xoffset,
-                                   int  yoffset,
-                                   unsigned char *dst_ptr,
-                                   int  dst_pitch) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 24 * 24);
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict16x16_ssse3\n");
-#endif
-
-  if (xoffset) {
-    if (yoffset) {
-      vp9_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
-                                    src_pixels_per_line,
-                                    fdata2, 16, 21, xoffset);
-      vp9_filter_block1d16_v6_ssse3(fdata2, 16, dst_ptr, dst_pitch,
-                                    16, yoffset);
-    } else {
-      /* First-pass only */
-      vp9_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line,
-                                    dst_ptr, dst_pitch, 16, xoffset);
-    }
-  } else {
-    /* Second-pass only */
-    vp9_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
-                                  src_pixels_per_line,
-                                  dst_ptr, dst_pitch, 16, yoffset);
-  }
-}
-
-void vp9_sixtap_predict8x8_ssse3(unsigned char  *src_ptr,
-                                 int  src_pixels_per_line,
-                                 int  xoffset,
-                                 int  yoffset,
-                                 unsigned char *dst_ptr,
-                                 int  dst_pitch) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256);
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict8x8_ssse3\n");
-#endif
-
-  if (xoffset) {
-    if (yoffset) {
-      vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
-                                   src_pixels_per_line, fdata2, 8, 13, xoffset);
-      vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 8, yoffset);
-    } else {
-      vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
-                                   dst_ptr, dst_pitch, 8, xoffset);
-    }
-  } else {
-    /* Second-pass only */
-    vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
-                                 src_pixels_per_line,
-                                 dst_ptr, dst_pitch, 8, yoffset);
-  }
-}
-
-void vp9_sixtap_predict8x4_ssse3(unsigned char  *src_ptr,
-                                 int  src_pixels_per_line,
-                                 int  xoffset,
-                                 int  yoffset,
-                                 unsigned char *dst_ptr,
-                                 int  dst_pitch) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256);
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict8x4_ssse3\n");
-#endif
-
-  if (xoffset) {
-    if (yoffset) {
-      vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
-                                   src_pixels_per_line, fdata2, 8, 9, xoffset);
-      vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 4, yoffset);
-    } else {
-      /* First-pass only */
-      vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
-                                   dst_ptr, dst_pitch, 4, xoffset);
-    }
-  } else {
-    /* Second-pass only */
-    vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
-                                 src_pixels_per_line,
-                                 dst_ptr, dst_pitch, 4, yoffset);
-  }
-}
-
-void vp9_sixtap_predict4x4_ssse3(unsigned char  *src_ptr,
-                                 int   src_pixels_per_line,
-                                 int  xoffset,
-                                 int  yoffset,
-                                 unsigned char *dst_ptr,
-                                 int dst_pitch) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 4 * 9);
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict4x4_ssse3\n");
-#endif
-
-  if (xoffset) {
-    if (yoffset) {
-      vp9_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
-                                   src_pixels_per_line, fdata2, 4, 9, xoffset);
-      vp9_filter_block1d4_v6_ssse3(fdata2, 4, dst_ptr, dst_pitch, 4, yoffset);
-    } else {
-      vp9_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line,
-                                   dst_ptr, dst_pitch, 4, xoffset);
-    }
-  } else {
-    vp9_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
-                                 src_pixels_per_line,
-                                 dst_ptr, dst_pitch, 4, yoffset);
-  }
-}
-
 void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr,
                                    const unsigned int src_pitch,
                                    unsigned char *output_ptr,
@@ -513,30 +51,6 @@ void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr,
                                    unsigned int output_height,
                                    const short *filter);
 
-void vp9_filter_block2d_16x16_8_ssse3(const unsigned char *src_ptr,
-                                      const unsigned int src_stride,
-                                      const short *hfilter_aligned16,
-                                      const short *vfilter_aligned16,
-                                      unsigned char *dst_ptr,
-                                      unsigned int dst_stride) {
-  if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
-    DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
-
-    vp9_filter_block1d16_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
-                                  fdata2, 16, 23, hfilter_aligned16);
-    vp9_filter_block1d16_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 16,
-                                  vfilter_aligned16);
-  } else {
-    if (hfilter_aligned16[3] != 128) {
-      vp9_filter_block1d16_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride,
-                                    16, hfilter_aligned16);
-    } else {
-      vp9_filter_block1d16_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
-                                    dst_ptr, dst_stride, 16, vfilter_aligned16);
-    }
-  }
-}
-
 void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr,
                                    const unsigned int src_pitch,
                                    unsigned char *output_ptr,
@@ -551,51 +65,303 @@ void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr,
                                    unsigned int output_height,
                                    const short *filter);
 
-void vp9_filter_block2d_8x8_8_ssse3(const unsigned char *src_ptr,
-                                    const unsigned int src_stride,
-                                    const short *hfilter_aligned16,
-                                    const short *vfilter_aligned16,
-                                    unsigned char *dst_ptr,
-                                    unsigned int dst_stride) {
-  if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
-    DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
+void vp9_filter_block1d4_v8_ssse3(const unsigned char *src_ptr,
+                                   const unsigned int src_pitch,
+                                   unsigned char *output_ptr,
+                                   unsigned int out_pitch,
+                                   unsigned int output_height,
+                                   const short *filter);
+
+void vp9_filter_block1d4_h8_ssse3(const unsigned char *src_ptr,
+                                   const unsigned int src_pitch,
+                                   unsigned char *output_ptr,
+                                   unsigned int out_pitch,
+                                   unsigned int output_height,
+                                   const short *filter);
 
-    vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
-                                 fdata2, 16, 15, hfilter_aligned16);
-    vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 8,
-                                 vfilter_aligned16);
-  } else {
-    if (hfilter_aligned16[3] != 128) {
-      vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 8,
-                                   hfilter_aligned16);
-    } else {
-      vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
-                                   dst_ptr, dst_stride, 8, vfilter_aligned16);
+void vp9_filter_block1d16_v8_avg_ssse3(const unsigned char *src_ptr,
+                                       const unsigned int src_pitch,
+                                       unsigned char *output_ptr,
+                                       unsigned int out_pitch,
+                                       unsigned int output_height,
+                                       const short *filter);
+
+void vp9_filter_block1d16_h8_avg_ssse3(const unsigned char *src_ptr,
+                                       const unsigned int src_pitch,
+                                       unsigned char *output_ptr,
+                                       unsigned int out_pitch,
+                                       unsigned int output_height,
+                                       const short *filter);
+
+void vp9_filter_block1d8_v8_avg_ssse3(const unsigned char *src_ptr,
+                                     const unsigned int src_pitch,
+                                     unsigned char *output_ptr,
+                                     unsigned int out_pitch,
+                                     unsigned int output_height,
+                                     const short *filter);
+
+void vp9_filter_block1d8_h8_avg_ssse3(const unsigned char *src_ptr,
+                                     const unsigned int src_pitch,
+                                     unsigned char *output_ptr,
+                                     unsigned int out_pitch,
+                                     unsigned int output_height,
+                                     const short *filter);
+
+void vp9_filter_block1d4_v8_avg_ssse3(const unsigned char *src_ptr,
+                                     const unsigned int src_pitch,
+                                     unsigned char *output_ptr,
+                                     unsigned int out_pitch,
+                                     unsigned int output_height,
+                                     const short *filter);
+
+void vp9_filter_block1d4_h8_avg_ssse3(const unsigned char *src_ptr,
+                                     const unsigned int src_pitch,
+                                     unsigned char *output_ptr,
+                                     unsigned int out_pitch,
+                                     unsigned int output_height,
+                                     const short *filter);
+
+void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride,
+                               uint8_t *dst, int dst_stride,
+                               const int16_t *filter_x, int x_step_q4,
+                               const int16_t *filter_y, int y_step_q4,
+                               int w, int h) {
+  if (x_step_q4 == 16 && filter_x[3] != 128) {
+    while (w >= 16) {
+      vp9_filter_block1d16_h8_ssse3(src, src_stride,
+                                    dst, dst_stride,
+                                    h, filter_x);
+      src += 16;
+      dst += 16;
+      w -= 16;
     }
+    while (w >= 8) {
+      vp9_filter_block1d8_h8_ssse3(src, src_stride,
+                                   dst, dst_stride,
+                                   h, filter_x);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    }
+    while (w >= 4) {
+      vp9_filter_block1d4_h8_ssse3(src, src_stride,
+                                   dst, dst_stride,
+                                   h, filter_x);
+      src += 4;
+      dst += 4;
+      w -= 4;
+    }
+  }
+  if (w) {
+    vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+                          filter_x, x_step_q4, filter_y, y_step_q4,
+                          w, h);
+  }
+}
+
+void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride,
+                              uint8_t *dst, int dst_stride,
+                              const int16_t *filter_x, int x_step_q4,
+                              const int16_t *filter_y, int y_step_q4,
+                              int w, int h) {
+  if (y_step_q4 == 16 && filter_y[3] != 128) {
+    while (w >= 16) {
+      vp9_filter_block1d16_v8_ssse3(src - src_stride * 3, src_stride,
+                                    dst, dst_stride,
+                                    h, filter_y);
+      src += 16;
+      dst += 16;
+      w -= 16;
+    }
+    while (w >= 8) {
+      vp9_filter_block1d8_v8_ssse3(src - src_stride * 3, src_stride,
+                                   dst, dst_stride,
+                                   h, filter_y);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    }
+    while (w >= 4) {
+      vp9_filter_block1d4_v8_ssse3(src - src_stride * 3, src_stride,
+                                   dst, dst_stride,
+                                   h, filter_y);
+      src += 4;
+      dst += 4;
+      w -= 4;
+    }
+  }
+  if (w) {
+    vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
+                         filter_x, x_step_q4, filter_y, y_step_q4,
+                         w, h);
   }
 }
 
-void vp9_filter_block2d_8x4_8_ssse3(const unsigned char *src_ptr,
-                                    const unsigned int src_stride,
-                                    const short *hfilter_aligned16,
-                                    const short *vfilter_aligned16,
-                                    unsigned char *dst_ptr,
-                                    unsigned int dst_stride) {
-  if (hfilter_aligned16[3] !=128 && vfilter_aligned16[3] != 128) {
-      DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
+void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, int src_stride,
+                               uint8_t *dst, int dst_stride,
+                               const int16_t *filter_x, int x_step_q4,
+                               const int16_t *filter_y, int y_step_q4,
+                               int w, int h) {
+  if (x_step_q4 == 16 && filter_x[3] != 128) {
+    while (w >= 16) {
+      vp9_filter_block1d16_h8_avg_ssse3(src, src_stride,
+                                    dst, dst_stride,
+                                    h, filter_x);
+      src += 16;
+      dst += 16;
+      w -= 16;
+    }
+    while (w >= 8) {
+      vp9_filter_block1d8_h8_avg_ssse3(src, src_stride,
+                                   dst, dst_stride,
+                                   h, filter_x);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    }
+    while (w >= 4) {
+      vp9_filter_block1d4_h8_avg_ssse3(src, src_stride,
+                                   dst, dst_stride,
+                                   h, filter_x);
+      src += 4;
+      dst += 4;
+      w -= 4;
+    }
+  }
+  if (w) {
+    vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+                              filter_x, x_step_q4, filter_y, y_step_q4,
+                              w, h);
+  }
+}
 
-      vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
-                                   fdata2, 16, 11, hfilter_aligned16);
-      vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 4,
-                                   vfilter_aligned16);
-  } else {
-    if (hfilter_aligned16[3] != 128) {
-      vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 4,
-                                   hfilter_aligned16);
-    } else {
-      vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
-                                   dst_ptr, dst_stride, 4, vfilter_aligned16);
+void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, int src_stride,
+                              uint8_t *dst, int dst_stride,
+                              const int16_t *filter_x, int x_step_q4,
+                              const int16_t *filter_y, int y_step_q4,
+                              int w, int h) {
+  if (y_step_q4 == 16 && filter_y[3] != 128) {
+    while (w >= 16) {
+      vp9_filter_block1d16_v8_avg_ssse3(src - src_stride * 3, src_stride,
+                                    dst, dst_stride,
+                                    h, filter_y);
+      src += 16;
+      dst += 16;
+      w -= 16;
+    }
+    while (w >= 8) {
+      vp9_filter_block1d8_v8_avg_ssse3(src - src_stride * 3, src_stride,
+                                   dst, dst_stride,
+                                   h, filter_y);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    }
+    while (w >= 4) {
+      vp9_filter_block1d4_v8_avg_ssse3(src - src_stride * 3, src_stride,
+                                   dst, dst_stride,
+                                   h, filter_y);
+      src += 4;
+      dst += 4;
+      w -= 4;
+    }
+  }
+  if (w) {
+    vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+                             filter_x, x_step_q4, filter_y, y_step_q4,
+                             w, h);
+  }
+}
+
+void vp9_convolve8_ssse3(const uint8_t *src, int src_stride,
+                         uint8_t *dst, int dst_stride,
+                         const int16_t *filter_x, int x_step_q4,
+                         const int16_t *filter_y, int y_step_q4,
+                         int w, int h) {
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*23);
+
+  // check w/h due to fixed size fdata2 array
+  assert(w <= 16);
+  assert(h <= 16);
+
+  if (x_step_q4 == 16 && y_step_q4 == 16 &&
+      filter_x[3] != 128 && filter_y[3] != 128) {
+    if (w == 16) {
+      vp9_filter_block1d16_h8_ssse3(src - 3 * src_stride, src_stride,
+                                    fdata2, 16,
+                                    h + 7, filter_x);
+      vp9_filter_block1d16_v8_ssse3(fdata2, 16,
+                                    dst, dst_stride,
+                                    h, filter_y);
+      return;
+    }
+    if (w == 8) {
+      vp9_filter_block1d8_h8_ssse3(src - 3 * src_stride, src_stride,
+                                   fdata2, 16,
+                                   h + 7, filter_x);
+      vp9_filter_block1d8_v8_ssse3(fdata2, 16,
+                                   dst, dst_stride,
+                                   h, filter_y);
+      return;
+    }
+    if (w == 4) {
+      vp9_filter_block1d4_h8_ssse3(src - 3 * src_stride, src_stride,
+                                   fdata2, 16,
+                                   h + 7, filter_x);
+      vp9_filter_block1d4_v8_ssse3(fdata2, 16,
+                                   dst, dst_stride,
+                                   h, filter_y);
+      return;
+    }
+  }
+  vp9_convolve8_c(src, src_stride, dst, dst_stride,
+                  filter_x, x_step_q4, filter_y, y_step_q4,
+                  w, h);
+}
+
+void vp9_convolve8_avg_ssse3(const uint8_t *src, int src_stride,
+                         uint8_t *dst, int dst_stride,
+                         const int16_t *filter_x, int x_step_q4,
+                         const int16_t *filter_y, int y_step_q4,
+                         int w, int h) {
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*23);
+
+  // check w/h due to fixed size fdata2 array
+  assert(w <= 16);
+  assert(h <= 16);
+
+  if (x_step_q4 == 16 && y_step_q4 == 16 &&
+      filter_x[3] != 128 && filter_y[3] != 128) {
+    if (w == 16) {
+      vp9_filter_block1d16_h8_ssse3(src - 3 * src_stride, src_stride,
+                                    fdata2, 16,
+                                    h + 7, filter_x);
+      vp9_filter_block1d16_v8_avg_ssse3(fdata2, 16,
+                                        dst, dst_stride,
+                                        h, filter_y);
+      return;
+    }
+    if (w == 8) {
+      vp9_filter_block1d8_h8_ssse3(src - 3 * src_stride, src_stride,
+                                   fdata2, 16,
+                                   h + 7, filter_x);
+      vp9_filter_block1d8_v8_avg_ssse3(fdata2, 16,
+                                       dst, dst_stride,
+                                       h, filter_y);
+      return;
+    }
+    if (w == 4) {
+      vp9_filter_block1d4_h8_ssse3(src - 3 * src_stride, src_stride,
+                                   fdata2, 16,
+                                   h + 7, filter_x);
+      vp9_filter_block1d4_v8_avg_ssse3(fdata2, 16,
+                                       dst, dst_stride,
+                                       h, filter_y);
+      return;
     }
   }
+  vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
+                      filter_x, x_step_q4, filter_y, y_step_q4,
+                      w, h);
 }
 #endif
diff --git a/vp9/common/x86/vp9_filter_sse2.c b/vp9/common/x86/vp9_filter_sse2.c
deleted file mode 100644
index 8e02ac197..000000000
--- a/vp9/common/x86/vp9_filter_sse2.c
+++ /dev/null
@@ -1,290 +0,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h> // for alignment checks
-#include <emmintrin.h> // SSE2
-#include "vp9/common/vp9_filter.h"
-#include "vpx_ports/emmintrin_compat.h"
-#include "vpx_ports/mem.h" // for DECLARE_ALIGNED
-#include "vp9_rtcd.h"
-
-// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is
-//           just a quick partial snapshot so that other can already use some
-//           speedup.
-// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap
-//           filtering.
-// TODO(cd): Add some comments, better variable naming.
-// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coeficients (no sum
-//           of positive above 128), or have higher precision filter
-//           coefficients.
-
-DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {
-  VP9_FILTER_WEIGHT >> 1,
-  VP9_FILTER_WEIGHT >> 1,
-  VP9_FILTER_WEIGHT >> 1,
-  VP9_FILTER_WEIGHT >> 1,
-};
-
-// Creating a macro to do more than four pixels at once to hide instruction
-// latency is actually slower :-(
-#define DO_FOUR_PIXELS(result, src_ptr, offset)                                \
-  {                                                                            \
-  /* Do shifted load to achieve require shuffles through unpacking */          \
-  const __m128i src0  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 0)); \
-  const __m128i src1  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 1)); \
-  const __m128i src2  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 2)); \
-  const __m128i src3  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 3)); \
-  const __m128i src01 = _mm_unpacklo_epi8(src0, src1);                         \
-  const __m128i src01_16 = _mm_unpacklo_epi8(src01, zero);                     \
-  const __m128i src23 = _mm_unpacklo_epi8(src2, src3);                         \
-  const __m128i src23_16 = _mm_unpacklo_epi8(src23, zero);                     \
-  /* Shit by 4 bytes through suffle to get additional shifted loads */         \
-  const __m128i src4  = _mm_shuffle_epi32(src0, _MM_SHUFFLE(3, 3, 2, 1));      \
-  const __m128i src5  = _mm_shuffle_epi32(src1, _MM_SHUFFLE(3, 3, 2, 1));      \
-  const __m128i src6  = _mm_shuffle_epi32(src2, _MM_SHUFFLE(3, 3, 2, 1));      \
-  const __m128i src7  = _mm_shuffle_epi32(src3, _MM_SHUFFLE(3, 3, 2, 1));      \
-  const __m128i src45 = _mm_unpacklo_epi8(src4, src5);                         \
-  const __m128i src45_16 = _mm_unpacklo_epi8(src45, zero);                     \
-  const __m128i src67 = _mm_unpacklo_epi8(src6, src7);                         \
-  const __m128i src67_16 = _mm_unpacklo_epi8(src67, zero);                     \
-  /* multiply accumulate them */                                               \
-  const __m128i mad01 = _mm_madd_epi16(src01_16, fil01);                       \
-  const __m128i mad23 = _mm_madd_epi16(src23_16, fil23);                       \
-  const __m128i mad45 = _mm_madd_epi16(src45_16, fil45);                       \
-  const __m128i mad67 = _mm_madd_epi16(src67_16, fil67);                       \
-  const __m128i mad0123 = _mm_add_epi32(mad01, mad23);                         \
-  const __m128i mad4567 = _mm_add_epi32(mad45, mad67);                         \
-  __m128i mad_all = _mm_add_epi32(mad0123, mad4567);                           \
-  mad_all = _mm_add_epi32(mad_all, rounding);                                  \
-  result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);                          \
-  }
-
-void vp9_filter_block2d_4x4_8_sse2
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  __m128i intermediateA, intermediateB, intermediateC;
-
-  const int kInterp_Extend = 4;
-
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);
-
-  // check alignment
-  assert(0 == ((long)HFilter_aligned16)%16);
-  assert(0 == ((long)VFilter_aligned16)%16);
-
-  {
-    __m128i transpose3_0;
-    __m128i transpose3_1;
-    __m128i transpose3_2;
-    __m128i transpose3_3;
-
-    // Horizontal pass (src -> intermediate).
-    {
-      const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16);
-      // get first two columns filter coefficients
-      __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0));
-      __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1));
-      __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2));
-      __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3));
-      src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
-
-      {
-        __m128i mad_all0;
-        __m128i mad_all1;
-        __m128i mad_all2;
-        __m128i mad_all3;
-        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
-        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
-        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
-        DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)
-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
-        intermediateA = _mm_packus_epi16(mad_all0, mad_all2);
-        // --
-        src_ptr += src_stride*4;
-        // --
-        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
-        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
-        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
-        DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)
-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
-        intermediateB = _mm_packus_epi16(mad_all0, mad_all2);
-        // --
-        src_ptr += src_stride*4;
-        // --
-        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
-        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
-        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all2);
-        intermediateC = _mm_packus_epi16(mad_all0, mad_all2);
-      }
-    }
-
-    // Transpose result (intermediate -> transpose3_x)
-    {
-      // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
-      // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73
-      // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx
-      const __m128i transpose0_0 = _mm_unpacklo_epi8(intermediateA, intermediateB);
-      const __m128i transpose0_1 = _mm_unpackhi_epi8(intermediateA, intermediateB);
-      const __m128i transpose0_2 = _mm_unpacklo_epi8(intermediateC, intermediateC);
-      const __m128i transpose0_3 = _mm_unpackhi_epi8(intermediateC, intermediateC);
-      // 00 40 01 41 02 42 03 43 10 50 11 51 12 52 13 53
-      // 20 60 21 61 22 62 23 63 30 70 31 71 32 72 33 73
-      // 80 xx 81 xx 82 xx 83 xx 90 xx 91 xx 92 xx 93 xx
-      // A0 xx A1 xx A2 xx A3 xx xx xx xx xx xx xx xx xx
-      const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
-      const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
-      const __m128i transpose1_2 = _mm_unpacklo_epi8(transpose0_2, transpose0_3);
-      const __m128i transpose1_3 = _mm_unpackhi_epi8(transpose0_2, transpose0_3);
-      // 00 20 40 60 01 21 41 61 02 22 42 62 03 23 43 63
-      // 10 30 50 70 11 31 51 71 12 32 52 72 13 33 53 73
-      // 80 A0 xx xx 81 A1 xx xx 82 A2 xx xx 83 A3 xx xx
-      // 90 xx xx xx 91 xx xx xx 92 xx xx xx 93 xx xx xx
-      const __m128i transpose2_0 = _mm_unpacklo_epi8(transpose1_0, transpose1_1);
-      const __m128i transpose2_1 = _mm_unpackhi_epi8(transpose1_0, transpose1_1);
-      const __m128i transpose2_2 = _mm_unpacklo_epi8(transpose1_2, transpose1_3);
-      const __m128i transpose2_3 = _mm_unpackhi_epi8(transpose1_2, transpose1_3);
-      // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
-      // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-      // 80 90 A0 xx xx xx xx xx 81 91 A1 xx xx xx xx xx
-      // 82 92 A2 xx xx xx xx xx 83 93 A3 xx xx xx xx xx
-      transpose3_0 = _mm_castps_si128(
-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
-                                           _mm_castsi128_ps(transpose2_2),
-                                           _MM_SHUFFLE(1, 0, 1, 0)));
-      transpose3_1 = _mm_castps_si128(
-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
-                                           _mm_castsi128_ps(transpose2_2),
-                                           _MM_SHUFFLE(3, 2, 3, 2)));
-      transpose3_2 = _mm_castps_si128(
-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
-                                           _mm_castsi128_ps(transpose2_3),
-                                           _MM_SHUFFLE(1, 0, 1, 0)));
-      transpose3_3 = _mm_castps_si128(
-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
-                                           _mm_castsi128_ps(transpose2_3),
-                                           _MM_SHUFFLE(3, 2, 3, 2)));
-      // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx
-      // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx
-      // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx
-      // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx
-    }
-
-    // Vertical pass (transpose3_x -> dst).
-    {
-      const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16);
-      // get first two columns filter coefficients
-      __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0));
-      __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1));
-      __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2));
-      __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3));
-      __m128i col0, col1, col2, col3;
-        DECLARE_ALIGNED(16, unsigned char, temp[32]);
-      {
-        _mm_store_si128((__m128i *)temp, transpose3_0);
-        DO_FOUR_PIXELS(col0, temp, 0);
-      }
-      {
-        _mm_store_si128((__m128i *)temp, transpose3_1);
-        DO_FOUR_PIXELS(col1, temp, 0);
-      }
-      {
-        _mm_store_si128((__m128i *)temp, transpose3_2);
-        DO_FOUR_PIXELS(col2, temp, 0);
-      }
-      {
-        _mm_store_si128((__m128i *)temp, transpose3_3);
-        DO_FOUR_PIXELS(col3, temp, 0);
-      }
-      // transpose
-      {
-        __m128i T0 = _mm_unpacklo_epi32(col0, col1);
-        __m128i T1 = _mm_unpacklo_epi32(col2, col3);
-        __m128i T2 = _mm_unpackhi_epi32(col0, col1);
-        __m128i T3 = _mm_unpackhi_epi32(col2, col3);
-        col0 = _mm_unpacklo_epi64(T0, T1);
-        col1 = _mm_unpackhi_epi64(T0, T1);
-        col2 = _mm_unpacklo_epi64(T2, T3);
-        col3 = _mm_unpackhi_epi64(T2, T3);
-      }
-      // saturate to 8 bit
-      {
-        col0 = _mm_packs_epi32(col0, col0);
-        col0 = _mm_packus_epi16(col0, col0);
-        col1 = _mm_packs_epi32(col1, col1);
-        col1 = _mm_packus_epi16(col1, col1);
-        col2 = _mm_packs_epi32 (col2, col2);
-        col2 = _mm_packus_epi16(col2, col2);
-        col3 = _mm_packs_epi32 (col3, col3);
-        col3 = _mm_packus_epi16(col3, col3);
-      }
-      // store
-      {
-        *((unsigned int *)&dst_ptr[dst_stride * 0]) = _mm_cvtsi128_si32(col0);
-        *((unsigned int *)&dst_ptr[dst_stride * 1]) = _mm_cvtsi128_si32(col1);
-        *((unsigned int *)&dst_ptr[dst_stride * 2]) = _mm_cvtsi128_si32(col2);
-        *((unsigned int *)&dst_ptr[dst_stride * 3]) = _mm_cvtsi128_si32(col3);
-      }
-    }
-  }
-}
-
-void vp9_filter_block2d_8x4_8_sse2
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  int j;
-  for (j=0; j<8; j+=4) {
-    vp9_filter_block2d_4x4_8_sse2(src_ptr + j, src_stride,
-                                  HFilter_aligned16, VFilter_aligned16,
-                                  dst_ptr + j, dst_stride);
-  }
-}
-
-void vp9_filter_block2d_8x8_8_sse2
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  int i, j;
-  for (i=0; i<8; i+=4) {
-    for (j=0; j<8; j+=4) {
-      vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,
-                                    HFilter_aligned16, VFilter_aligned16,
-                                    dst_ptr + j + i*dst_stride, dst_stride);
-    }
-  }
-}
-
-void vp9_filter_block2d_16x16_8_sse2
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  int i, j;
-  for (i=0; i<16; i+=4) {
-    for (j=0; j<16; j+=4) {
-      vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,
-                                    HFilter_aligned16, VFilter_aligned16,
-                                    dst_ptr + j + i*dst_stride, dst_stride);
-    }
-  }
-}
diff --git a/vp9/common/x86/vp9_filter_sse4.c b/vp9/common/x86/vp9_filter_sse4.c
deleted file mode 100644
index 52c35b296..000000000
--- a/vp9/common/x86/vp9_filter_sse4.c
+++ /dev/null
@@ -1,362 +0,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h> // for alignment checks
-#include <smmintrin.h> // SSE4.1
-#include "vp9/common/vp9_filter.h"
-#include "vpx_ports/mem.h" // for DECLARE_ALIGNED
-#include "vp9_rtcd.h"
-
-// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is
-//           just a quick partial snapshot so that other can already use some
-//           speedup.
-// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap
-//           filtering.
-// TODO(cd): Reduce source size by using macros instead of current code
-//           duplication.
-// TODO(cd): Add some comments, better variable naming.
-// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coeficients (no sum
-//           of positive above 128), or have higher precision filter
-//           coefficients.
-
-DECLARE_ALIGNED(16, static const unsigned char, mask0123_c[16]) = {
-  0x00, 0x01,
-  0x01, 0x02,
-  0x02, 0x03,
-  0x03, 0x04,
-  0x02, 0x03,
-  0x03, 0x04,
-  0x04, 0x05,
-  0x05, 0x06,
-};
-DECLARE_ALIGNED(16, static const unsigned char, mask4567_c[16]) = {
-  0x04, 0x05,
-  0x05, 0x06,
-  0x06, 0x07,
-  0x07, 0x08,
-  0x06, 0x07,
-  0x07, 0x08,
-  0x08, 0x09,
-  0x09, 0x0A,
-};
-DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {
-  VP9_FILTER_WEIGHT >> 1,
-  VP9_FILTER_WEIGHT >> 1,
-  VP9_FILTER_WEIGHT >> 1,
-  VP9_FILTER_WEIGHT >> 1,
-};
-DECLARE_ALIGNED(16, static const unsigned char, transpose_c[16]) = {
-  0, 4,  8, 12,
-  1, 5,  9, 13,
-  2, 6, 10, 14,
-  3, 7, 11, 15
-};
-
-// Creating a macro to do more than four pixels at once to hide instruction
-// latency is actually slower :-(
-#define DO_FOUR_PIXELS(result, offset)                                         \
-  {                                                                            \
-  /*load pixels*/                                                              \
-  __m128i src  = _mm_loadu_si128((const __m128i *)(src_ptr + offset));         \
-  /* extract the ones used for first column */                                 \
-  __m128i src0123 = _mm_shuffle_epi8(src, mask0123);                           \
-  __m128i src4567 = _mm_shuffle_epi8(src, mask4567);                           \
-  __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);                         \
-  __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);                         \
-  __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);                         \
-  __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);                         \
-  /* multiply accumulate them */                                               \
-  __m128i mad01 = _mm_madd_epi16(src01_16, fil01);                             \
-  __m128i mad23 = _mm_madd_epi16(src23_16, fil23);                             \
-  __m128i mad45 = _mm_madd_epi16(src45_16, fil45);                             \
-  __m128i mad67 = _mm_madd_epi16(src67_16, fil67);                             \
-  __m128i mad0123 = _mm_add_epi32(mad01, mad23);                               \
-  __m128i mad4567 = _mm_add_epi32(mad45, mad67);                               \
-  __m128i mad_all = _mm_add_epi32(mad0123, mad4567);                           \
-  mad_all = _mm_add_epi32(mad_all, rounding);                                  \
-  result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);                          \
-  }
-
-void vp9_filter_block2d_4x4_8_sse4_1
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  __m128i intermediateA, intermediateB, intermediateC;
-
-  const int kInterp_Extend = 4;
-
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i mask0123 = _mm_load_si128((const __m128i *)mask0123_c);
-  const __m128i mask4567 = _mm_load_si128((const __m128i *)mask4567_c);
-  const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);
-  const __m128i transpose = _mm_load_si128((const __m128i *)transpose_c);
-
-  // check alignment
-  assert(0 == ((long)HFilter_aligned16)%16);
-  assert(0 == ((long)VFilter_aligned16)%16);
-
-  {
-    __m128i transpose3_0;
-    __m128i transpose3_1;
-    __m128i transpose3_2;
-    __m128i transpose3_3;
-
-    // Horizontal pass (src -> intermediate).
-    {
-      const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16);
-      // get first two columns filter coefficients
-      __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0));
-      __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1));
-      __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2));
-      __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3));
-      src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
-
-      {
-        __m128i mad_all0;
-        __m128i mad_all1;
-        __m128i mad_all2;
-        __m128i mad_all3;
-        DO_FOUR_PIXELS(mad_all0, 0*src_stride)
-        DO_FOUR_PIXELS(mad_all1, 1*src_stride)
-        DO_FOUR_PIXELS(mad_all2, 2*src_stride)
-        DO_FOUR_PIXELS(mad_all3, 3*src_stride)
-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
-        intermediateA = _mm_packus_epi16(mad_all0, mad_all2);
-        // --
-        src_ptr += src_stride*4;
-        // --
-        DO_FOUR_PIXELS(mad_all0, 0*src_stride)
-        DO_FOUR_PIXELS(mad_all1, 1*src_stride)
-        DO_FOUR_PIXELS(mad_all2, 2*src_stride)
-        DO_FOUR_PIXELS(mad_all3, 3*src_stride)
-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
-        intermediateB = _mm_packus_epi16(mad_all0, mad_all2);
-        // --
-        src_ptr += src_stride*4;
-        // --
-        DO_FOUR_PIXELS(mad_all0, 0*src_stride)
-        DO_FOUR_PIXELS(mad_all1, 1*src_stride)
-        DO_FOUR_PIXELS(mad_all2, 2*src_stride)
-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all2);
-        intermediateC = _mm_packus_epi16(mad_all0, mad_all2);
-      }
-    }
-
-    // Transpose result (intermediate -> transpose3_x)
-    {
-      // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
-      // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73
-      // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx
-      const __m128i transpose1_0 = _mm_shuffle_epi8(intermediateA, transpose);
-      const __m128i transpose1_1 = _mm_shuffle_epi8(intermediateB, transpose);
-      const __m128i transpose1_2 = _mm_shuffle_epi8(intermediateC, transpose);
-      // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-      // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
-      // 80 90 A0 xx 81 91 A1 xx 82 92 A2 xx 83 93 A3 xx
-      const __m128i transpose2_0 = _mm_unpacklo_epi32(transpose1_0, transpose1_1);
-      const __m128i transpose2_1 = _mm_unpackhi_epi32(transpose1_0, transpose1_1);
-      // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
-      // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-      transpose3_0 = _mm_castps_si128(
-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
-                                           _mm_castsi128_ps(transpose1_2),
-                                           _MM_SHUFFLE(0, 0, 1, 0)));
-      transpose3_1 = _mm_castps_si128(
-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
-                                           _mm_castsi128_ps(transpose1_2),
-                                           _MM_SHUFFLE(1, 1, 3, 2)));
-      transpose3_2 = _mm_castps_si128(
-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
-                                           _mm_castsi128_ps(transpose1_2),
-                                           _MM_SHUFFLE(2, 2, 1, 0)));
-      transpose3_3 = _mm_castps_si128(
-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
-                                           _mm_castsi128_ps(transpose1_2),
-                                           _MM_SHUFFLE(3, 3, 3, 2)));
-      // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx
-      // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx
-      // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx
-      // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx
-    }
-
-    // Vertical pass (transpose3_x -> dst).
-    {
-      const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16);
-      // get first two columns filter coefficients
-      __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0));
-      __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1));
-      __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2));
-      __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3));
-      __m128i col0, col1, col2, col3;
-      {
-        //load pixels
-        __m128i src  = transpose3_0;
-        // extract the ones used for first column
-        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
-        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
-        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
-        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
-        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
-        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
-        // multiply accumulate them
-        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
-        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
-        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
-        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
-        __m128i mad0123 = _mm_add_epi32(mad01, mad23);
-        __m128i mad4567 = _mm_add_epi32(mad45, mad67);
-        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
-        mad_all = _mm_add_epi32(mad_all, rounding);
-        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
-        mad_all = _mm_packs_epi32(mad_all, mad_all);
-        col0 = _mm_packus_epi16(mad_all, mad_all);
-      }
-      {
-        //load pixels
-        __m128i src  = transpose3_1;
-        // extract the ones used for first column
-        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
-        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
-        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
-        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
-        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
-        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
-        // multiply accumulate them
-        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
-        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
-        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
-        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
-        __m128i mad0123 = _mm_add_epi32(mad01, mad23);
-        __m128i mad4567 = _mm_add_epi32(mad45, mad67);
-        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
-        mad_all = _mm_add_epi32(mad_all, rounding);
-        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
-        mad_all = _mm_packs_epi32(mad_all, mad_all);
-        col1 = _mm_packus_epi16(mad_all, mad_all);
-      }
-      {
-        //load pixels
-        __m128i src  = transpose3_2;
-        // extract the ones used for first column
-        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
-        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
-        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
-        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
-        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
-        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
-        // multiply accumulate them
-        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
-        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
-        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
-        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
-        __m128i mad0123 = _mm_add_epi32(mad01, mad23);
-        __m128i mad4567 = _mm_add_epi32(mad45, mad67);
-        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
-        mad_all = _mm_add_epi32(mad_all, rounding);
-        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
-        mad_all = _mm_packs_epi32(mad_all, mad_all);
-        col2 = _mm_packus_epi16(mad_all, mad_all);
-      }
-      {
-        //load pixels
-        __m128i src  = transpose3_3;
-        // extract the ones used for first column
-        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
-        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
-        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
-        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
-        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
-        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
-        // multiply accumulate them
-        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
-        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
-        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
-        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
-        __m128i mad0123 = _mm_add_epi32(mad01, mad23);
-        __m128i mad4567 = _mm_add_epi32(mad45, mad67);
-        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
-        mad_all = _mm_add_epi32(mad_all, rounding);
-        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
-        mad_all = _mm_packs_epi32(mad_all, mad_all);
-        col3 = _mm_packus_epi16(mad_all, mad_all);
-      }
-      {
-        __m128i col01 = _mm_unpacklo_epi8(col0, col1);
-        __m128i col23 = _mm_unpacklo_epi8(col2, col3);
-        __m128i col0123 = _mm_unpacklo_epi16(col01, col23);
-        //TODO(cd): look into Ronald's comment:
-        //    Future suggestion: I believe here, too, you can merge the
-        //    packs_epi32() and pacus_epi16() for the 4 cols above, so that
-        //    you get the data in a single register, and then use pshufb
-        //    (shuffle_epi8()) instead of the unpacks here. Should be
-        //    2+3+2 instructions faster.
-        *((unsigned int *)&dst_ptr[dst_stride * 0]) =
-            _mm_extract_epi32(col0123, 0);
-        *((unsigned int *)&dst_ptr[dst_stride * 1]) =
-            _mm_extract_epi32(col0123, 1);
-        *((unsigned int *)&dst_ptr[dst_stride * 2]) =
-            _mm_extract_epi32(col0123, 2);
-        *((unsigned int *)&dst_ptr[dst_stride * 3]) =
-            _mm_extract_epi32(col0123, 3);
-      }
-    }
-  }
-}
-
-void vp9_filter_block2d_8x4_8_sse4_1
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  int j;
-  for (j=0; j<8; j+=4) {
-    vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j, src_stride,
-                                    HFilter_aligned16, VFilter_aligned16,
-                                    dst_ptr + j, dst_stride);
-  }
-}
-
-void vp9_filter_block2d_8x8_8_sse4_1
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  int i, j;
-  for (i=0; i<8; i+=4) {
-    for (j=0; j<8; j+=4) {
-      vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride,
-                                      HFilter_aligned16, VFilter_aligned16,
-                                      dst_ptr + j + i*dst_stride, dst_stride);
-    }
-  }
-}
-
-void vp9_filter_block2d_16x16_8_sse4_1
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  int i, j;
-  for (i=0; i<16; i+=4) {
-    for (j=0; j<16; j+=4) {
-      vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride,
-                                      HFilter_aligned16, VFilter_aligned16,
-                                      dst_ptr + j + i*dst_stride, dst_stride);
-    }
-  }
-}
diff --git a/vp9/common/x86/vp9_idctllm_mmx.asm b/vp9/common/x86/vp9_idctllm_mmx.asm
deleted file mode 100644
index 15e81addb..000000000
--- a/vp9/common/x86/vp9_idctllm_mmx.asm
+++ /dev/null
@@ -1,241 +0,0 @@
-;
-;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-align 16
-x_s1sqr2:      times 4 dw 0x8A8C
-align 16
-x_c1sqr2less1: times 4 dw 0x4E7B
-align 16
-pw_16:         times 4 dw 16
-
-SECTION .text
-
-
-; /****************************************************************************
-; * Notes:
-; *
-; * This implementation makes use of 16 bit fixed point version of two multiply
-; * constants:
-; *        1.   sqrt(2) * cos (pi/8)
-; *        2.   sqrt(2) * sin (pi/8)
-; * Because the first constant is bigger than 1, to maintain the same 16 bit
-; * fixed point precision as the second one, we use a trick of
-; *        x * a = x + x*(a-1)
-; * so
-; *        x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
-; *
-; * For the second constant, because of the 16bit version is 35468, which
-; * is bigger than 32768, in signed 16 bit multiply, it becomes a negative
-; * number.
-; *        (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x
-; *
-; **************************************************************************/
-
-INIT_MMX
-
-;void short_idct4x4llm_mmx(short *input, short *output, int pitch)
-cglobal short_idct4x4llm_mmx, 3,3,0, inp, out, pit
-    mova            m0,     [inpq +0]
-    mova            m1,     [inpq +8]
-
-    mova            m2,     [inpq+16]
-    mova            m3,     [inpq+24]
-
-    psubw           m0,      m2             ; b1= 0-2
-    paddw           m2,      m2             ;
-
-    mova            m5,      m1
-    paddw           m2,      m0             ; a1 =0+2
-
-    pmulhw          m5,     [x_s1sqr2]       ;
-    paddw           m5,      m1             ; ip1 * sin(pi/8) * sqrt(2)
-
-    mova            m7,      m3             ;
-    pmulhw          m7,     [x_c1sqr2less1]   ;
-
-    paddw           m7,      m3             ; ip3 * cos(pi/8) * sqrt(2)
-    psubw           m7,      m5             ; c1
-
-    mova            m5,      m1
-    mova            m4,      m3
-
-    pmulhw          m5,     [x_c1sqr2less1]
-    paddw           m5,      m1
-
-    pmulhw          m3,     [x_s1sqr2]
-    paddw           m3,      m4
-
-    paddw           m3,      m5             ; d1
-    mova            m6,      m2             ; a1
-
-    mova            m4,      m0             ; b1
-    paddw           m2,      m3             ;0
-
-    paddw           m4,      m7             ;1
-    psubw           m0,      m7             ;2
-
-    psubw           m6,      m3             ;3
-
-    mova            m1,      m2             ; 03 02 01 00
-    mova            m3,      m4             ; 23 22 21 20
-
-    punpcklwd       m1,      m0             ; 11 01 10 00
-    punpckhwd       m2,      m0             ; 13 03 12 02
-
-    punpcklwd       m3,      m6             ; 31 21 30 20
-    punpckhwd       m4,      m6             ; 33 23 32 22
-
-    mova            m0,      m1             ; 11 01 10 00
-    mova            m5,      m2             ; 13 03 12 02
-
-    punpckldq       m0,      m3             ; 30 20 10 00
-    punpckhdq       m1,      m3             ; 31 21 11 01
-
-    punpckldq       m2,      m4             ; 32 22 12 02
-    punpckhdq       m5,      m4             ; 33 23 13 03
-
-    mova            m3,      m5             ; 33 23 13 03
-
-    psubw           m0,      m2             ; b1= 0-2
-    paddw           m2,      m2             ;
-
-    mova            m5,      m1
-    paddw           m2,      m0             ; a1 =0+2
-
-    pmulhw          m5,     [x_s1sqr2]        ;
-    paddw           m5,      m1             ; ip1 * sin(pi/8) * sqrt(2)
-
-    mova            m7,      m3             ;
-    pmulhw          m7,     [x_c1sqr2less1]   ;
-
-    paddw           m7,      m3             ; ip3 * cos(pi/8) * sqrt(2)
-    psubw           m7,      m5             ; c1
-
-    mova            m5,      m1
-    mova            m4,      m3
-
-    pmulhw          m5,     [x_c1sqr2less1]
-    paddw           m5,      m1
-
-    pmulhw          m3,     [x_s1sqr2]
-    paddw           m3,      m4
-
-    paddw           m3,      m5             ; d1
-    paddw           m0,     [pw_16]
-
-    paddw           m2,     [pw_16]
-    mova            m6,      m2             ; a1
-
-    mova            m4,      m0             ; b1
-    paddw           m2,      m3             ;0
-
-    paddw           m4,      m7             ;1
-    psubw           m0,      m7             ;2
-
-    psubw           m6,      m3             ;3
-    psraw           m2,      5
-
-    psraw           m0,      5
-    psraw           m4,      5
-
-    psraw           m6,      5
-
-    mova            m1,      m2             ; 03 02 01 00
-    mova            m3,      m4             ; 23 22 21 20
-
-    punpcklwd       m1,      m0             ; 11 01 10 00
-    punpckhwd       m2,      m0             ; 13 03 12 02
-
-    punpcklwd       m3,      m6             ; 31 21 30 20
-    punpckhwd       m4,      m6             ; 33 23 32 22
-
-    mova            m0,      m1             ; 11 01 10 00
-    mova            m5,      m2             ; 13 03 12 02
-
-    punpckldq       m0,      m3             ; 30 20 10 00
-    punpckhdq       m1,      m3             ; 31 21 11 01
-
-    punpckldq       m2,      m4             ; 32 22 12 02
-    punpckhdq       m5,      m4             ; 33 23 13 03
-
-    mova        [outq],      m0
-
-    mova     [outq+r2],      m1
-    mova [outq+pitq*2],      m2
-
-    add           outq,      pitq
-    mova [outq+pitq*2],      m5
-    RET
-
-;void short_idct4x4llm_1_mmx(short *input, short *output, int pitch)
-cglobal short_idct4x4llm_1_mmx,3,3,0,inp,out,pit
-    movh            m0,     [inpq]
-    paddw           m0,     [pw_16]
-    psraw           m0,      5
-    punpcklwd       m0,      m0
-    punpckldq       m0,      m0
-
-    mova        [outq],      m0
-    mova   [outq+pitq],      m0
-
-    mova [outq+pitq*2],      m0
-    add             r1,      r2
-
-    mova [outq+pitq*2],      m0
-    RET
-
-
-;void dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride)
-cglobal dc_only_idct_add_mmx, 4,5,0,in_dc,pred,dst,pit,stride
-%if ARCH_X86_64
-    movsxd         strideq,      dword stridem
-%else
-    mov            strideq,      stridem
-%endif
-    pxor                m0,      m0
-
-    movh                m5,      in_dcq ; dc
-    paddw               m5,     [pw_16]
-
-    psraw               m5,      5
-
-    punpcklwd           m5,      m5
-    punpckldq           m5,      m5
-
-    movh                m1,     [predq]
-    punpcklbw           m1,      m0
-    paddsw              m1,      m5
-    packuswb            m1,      m0              ; pack and unpack to saturate
-    movh            [dstq],      m1
-
-    movh                m2,     [predq+pitq]
-    punpcklbw           m2,      m0
-    paddsw              m2,      m5
-    packuswb            m2,      m0              ; pack and unpack to saturate
-    movh    [dstq+strideq],      m2
-
-    movh                m3,     [predq+2*pitq]
-    punpcklbw           m3,      m0
-    paddsw              m3,      m5
-    packuswb            m3,      m0              ; pack and unpack to saturate
-    movh  [dstq+2*strideq],      m3
-
-    add               dstq,      strideq
-    add              predq,      pitq
-    movh                m4,     [predq+2*pitq]
-    punpcklbw           m4,      m0
-    paddsw              m4,      m5
-    packuswb            m4,      m0              ; pack and unpack to saturate
-    movh  [dstq+2*strideq],      m4
-    RET
-
diff --git a/vp9/common/x86/vp9_idctllm_x86.c b/vp9/common/x86/vp9_idctllm_x86.c
new file mode 100644
index 000000000..667f5c1d3
--- /dev/null
+++ b/vp9/common/x86/vp9_idctllm_x86.c
@@ -0,0 +1,99 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <string.h>
+#include <emmintrin.h>  // SSE2
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_idct.h"
+
+#if HAVE_SSE2
+// Loads 4 bytes from src (no alignment requirement) into the low 32 bits of
+// an XMM register; memcpy avoids a misaligned, type-punned int dereference.
+static __m128i load_4_bytes(const uint8_t *src) {
+  int value;
+  memcpy(&value, src, sizeof(value));
+  return _mm_cvtsi32_si128(value);
+}
+
+// Stores the low 32 bits of reg to dst (no alignment requirement).
+static void store_4_bytes(uint8_t *dst, __m128i reg) {
+  const int value = _mm_cvtsi128_si32(reg);
+  memcpy(dst, &value, sizeof(value));
+}
+
+// Adds a DC-only inverse transform result to a 4x4 block of prediction
+// pixels and writes the saturated sums to dst_ptr.
+//
+//   input_dc  DC coefficient; the per-pixel offset is derived from it via
+//             dct_const_round_shift() and ROUND_POWER_OF_TWO(., 4).
+//   pred_ptr  4 rows of 4 prediction bytes, rows `pitch` bytes apart.
+//   dst_ptr   4 rows of 4 output bytes, rows `stride` bytes apart.
+//
+// To improve performance, the magnitude of the offset is clipped to
+// [0, 255], which allows the additions/subtractions to be kept in 8 bits.
+void vp9_dc_only_idct_add_sse2(int input_dc, uint8_t *pred_ptr,
+                               uint8_t *dst_ptr, int pitch, int stride) {
+  int a1;
+  int16_t out;
+  uint8_t abs_diff;
+  __m128i p0, p1, p2, p3;
+  unsigned int extended_diff;
+  __m128i diff;
+
+  out = dct_const_round_shift(input_dc * cospi_16_64);
+  out = dct_const_round_shift(out * cospi_16_64);
+  a1 = ROUND_POWER_OF_TWO(out, 4);
+
+  // Read prediction data.
+  p0 = load_4_bytes(pred_ptr + 0 * pitch);
+  p1 = load_4_bytes(pred_ptr + 1 * pitch);
+  p2 = load_4_bytes(pred_ptr + 2 * pitch);
+  p3 = load_4_bytes(pred_ptr + 3 * pitch);
+
+  // Unpack prediction data, and store 4x4 array in 1 XMM register.
+  p0 = _mm_unpacklo_epi32(p0, p1);
+  p2 = _mm_unpacklo_epi32(p2, p3);
+  p0 = _mm_unpacklo_epi64(p0, p2);
+
+  // Clip dc value to [0, 255] range. Then, do addition or subtraction
+  // according to its sign.
+  if (a1 >= 0) {
+    abs_diff = (a1 > 255) ? 255 : a1;
+    extended_diff = abs_diff * 0x01010101u;  // replicate byte into 4 lanes
+    diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0);
+
+    p1 = _mm_adds_epu8(p0, diff);
+  } else {
+    abs_diff = (a1 < -255) ? 255 : -a1;
+    extended_diff = abs_diff * 0x01010101u;  // replicate byte into 4 lanes
+    diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0);
+
+    p1 = _mm_subs_epu8(p0, diff);
+  }
+
+  // Store results to dst, one 4-byte row at a time.
+  store_4_bytes(dst_ptr, p1);
+  dst_ptr += stride;
+
+  p1 = _mm_srli_si128(p1, 4);
+  store_4_bytes(dst_ptr, p1);
+  dst_ptr += stride;
+
+  p1 = _mm_srli_si128(p1, 4);
+  store_4_bytes(dst_ptr, p1);
+  dst_ptr += stride;
+
+  p1 = _mm_srli_si128(p1, 4);
+  store_4_bytes(dst_ptr, p1);
+}
+#endif
diff --git a/vp9/common/x86/vp9_loopfilter_x86.c b/vp9/common/x86/vp9_loopfilter_x86.c
index e73850dd9..c8487547b 100644
--- a/vp9/common/x86/vp9_loopfilter_x86.c
+++ b/vp9/common/x86/vp9_loopfilter_x86.c
@@ -94,14 +94,16 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
   DECLARE_ALIGNED(16, unsigned char, flat2_op[7][16]);
   DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][16]);
 
-  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_op[3][16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_oq[3][16]);
+
+  DECLARE_ALIGNED(16, unsigned char, ap[8][16]);
+  DECLARE_ALIGNED(16, unsigned char, aq[8][16]);
+
+
   __m128i mask, hev, flat, flat2;
   const __m128i zero = _mm_set1_epi16(0);
+  const __m128i one = _mm_set1_epi8(1);
   __m128i p7, p6, p5;
   __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
   __m128i q5, q6, q7;
@@ -126,12 +128,24 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
   q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
   q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
   q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
+
+  _mm_store_si128((__m128i *)ap[4], p4);
+  _mm_store_si128((__m128i *)ap[3], p3);
+  _mm_store_si128((__m128i *)ap[2], p2);
+  _mm_store_si128((__m128i *)ap[1], p1);
+  _mm_store_si128((__m128i *)ap[0], p0);
+  _mm_store_si128((__m128i *)aq[4], q4);
+  _mm_store_si128((__m128i *)aq[3], q3);
+  _mm_store_si128((__m128i *)aq[2], q2);
+  _mm_store_si128((__m128i *)aq[1], q1);
+  _mm_store_si128((__m128i *)aq[0], q0);
+
+
   {
     const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
                                           _mm_subs_epu8(p0, p1));
     const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
                                           _mm_subs_epu8(q0, q1));
-    const __m128i one = _mm_set1_epi8(1);
     const __m128i fe = _mm_set1_epi8(0xfe);
     const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
     __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
@@ -163,246 +177,8 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
     mask = _mm_max_epu8(work, mask);
     mask = _mm_subs_epu8(mask, limit);
     mask = _mm_cmpeq_epi8(mask, zero);
-
-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
-                                     _mm_subs_epu8(p0, p2)),
-                         _mm_or_si128(_mm_subs_epu8(q2, q0),
-                                      _mm_subs_epu8(q0, q2)));
-    flat = _mm_max_epu8(work, flat);
-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
-                                     _mm_subs_epu8(p0, p3)),
-                         _mm_or_si128(_mm_subs_epu8(q3, q0),
-                                      _mm_subs_epu8(q0, q3)));
-    flat = _mm_max_epu8(work, flat);
-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
-                                     _mm_subs_epu8(p0, p4)),
-                         _mm_or_si128(_mm_subs_epu8(q4, q0),
-                                      _mm_subs_epu8(q0, q4)));
-    flat = _mm_max_epu8(work, flat);
-    flat = _mm_subs_epu8(flat, one);
-    flat = _mm_cmpeq_epi8(flat, zero);
-    flat = _mm_and_si128(flat, mask);
   }
 
-  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  // calculate flat2
-  p4 = _mm_loadu_si128((__m128i *)(s - 8 * p));
-  p3 = _mm_loadu_si128((__m128i *)(s - 7 * p));
-  p2 = _mm_loadu_si128((__m128i *)(s - 6 * p));
-  p1 = _mm_loadu_si128((__m128i *)(s - 5 * p));
-//  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-//  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
-  q1 = _mm_loadu_si128((__m128i *)(s + 4 * p));
-  q2 = _mm_loadu_si128((__m128i *)(s + 5 * p));
-  q3 = _mm_loadu_si128((__m128i *)(s + 6 * p));
-  q4 = _mm_loadu_si128((__m128i *)(s + 7 * p));
-
-  {
-    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
-                                          _mm_subs_epu8(p0, p1));
-    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
-                                          _mm_subs_epu8(q0, q1));
-    const __m128i one = _mm_set1_epi8(1);
-    __m128i work;
-    flat2 = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
-                                     _mm_subs_epu8(p0, p2)),
-                         _mm_or_si128(_mm_subs_epu8(q2, q0),
-                                      _mm_subs_epu8(q0, q2)));
-    flat2 = _mm_max_epu8(work, flat2);
-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
-                                     _mm_subs_epu8(p0, p3)),
-                         _mm_or_si128(_mm_subs_epu8(q3, q0),
-                                      _mm_subs_epu8(q0, q3)));
-    flat2 = _mm_max_epu8(work, flat2);
-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
-                                     _mm_subs_epu8(p0, p4)),
-                         _mm_or_si128(_mm_subs_epu8(q4, q0),
-                                      _mm_subs_epu8(q0, q4)));
-    flat2 = _mm_max_epu8(work, flat2);
-    flat2 = _mm_subs_epu8(flat2, one);
-    flat2 = _mm_cmpeq_epi8(flat2, zero);
-    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
-  }
-  // calculate flat2
-  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-  {
-    const __m128i four = _mm_set1_epi16(4);
-    unsigned char *src = s;
-    i = 0;
-    do {
-      __m128i workp_a, workp_b, workp_shft;
-      p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero);
-      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
-      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
-      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
-      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
-      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
-      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
-      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
-      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
-      q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero);
-
-      workp_a = _mm_add_epi16(_mm_add_epi16(p4, p3), _mm_add_epi16(p2, p1));
-      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
-      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p4);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op2[i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op1[i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q2);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op0[i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq0[i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q4);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq1[i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q4);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq2[i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      src += 8;
-    } while (++i < 2);
-  }
-  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  // wide flat
-  // TODO(slavarnway): interleave with the flat pixel calculations (see above)
-  {
-    const __m128i eight = _mm_set1_epi16(8);
-    unsigned char *src = s;
-    int i = 0;
-    do {
-      __m128i workp_a, workp_b, workp_shft;
-      p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 8 * p)), zero);
-      p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 7 * p)), zero);
-      p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 6 * p)), zero);
-      p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero);
-      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
-      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
-      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
-      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
-      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
-      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
-      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
-      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
-      q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero);
-      q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 5 * p)), zero);
-      q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 6 * p)), zero);
-      q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 7 * p)), zero);
-
-
-      workp_a = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7);  // p7 * 7
-      workp_a = _mm_add_epi16(_mm_slli_epi16(p6, 1), workp_a);
-      workp_b = _mm_add_epi16(_mm_add_epi16(p5, p4), _mm_add_epi16(p3, p2));
-      workp_a = _mm_add_epi16(_mm_add_epi16(p1, p0), workp_a);
-      workp_b = _mm_add_epi16(_mm_add_epi16(q0, eight), workp_b);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
-      _mm_storel_epi64((__m128i *)&flat2_op[6][i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p5);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p6), q1);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
-      _mm_storel_epi64((__m128i *)&flat2_op[5][i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p4);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p5), q2);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
-      _mm_storel_epi64((__m128i *)&flat2_op[4][i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p3);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p4), q3);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
-      _mm_storel_epi64((__m128i *)&flat2_op[3][i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p2);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p3), q4);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
-      _mm_storel_epi64((__m128i *)&flat2_op[2][i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p1);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p2), q5);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
-      _mm_storel_epi64((__m128i *)&flat2_op[1][i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p0);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), q6);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
-      _mm_storel_epi64((__m128i *)&flat2_op[0][i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), q0);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q7);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
-      _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p6), q1);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q7);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
-      _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p5), q2);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q7);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
-      _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q3);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q2), q7);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
-      _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q4);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q3), q7);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
-      _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q5);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q4), q7);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
-      _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q6);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q5), q7);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
-      _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      src += 8;
-    } while (++i < 2);
-  }
-  // wide flat
-  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
   // lp filter
   {
     const __m128i t4 = _mm_set1_epi8(4);
@@ -413,14 +189,10 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
     const __m128i t1 = _mm_set1_epi8(0x1);
     const __m128i t7f = _mm_set1_epi8(0x7f);
 
-    __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
-                                      t80);
-    __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
-                                      t80);
-    __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
-                                      t80);
-    __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
-                                      t80);
+    __m128i ps1 = _mm_xor_si128(p1, t80);
+    __m128i ps0 = _mm_xor_si128(p0, t80);
+    __m128i qs0 = _mm_xor_si128(q0, t80);
+    __m128i qs1 = _mm_xor_si128(q1, t80);
     __m128i filt;
     __m128i work_a;
     __m128i filter1, filter2;
@@ -442,6 +214,7 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
     work_a = _mm_and_si128(work_a, te0);
     filter1 = _mm_and_si128(filter1, t1f);
     filter1 = _mm_or_si128(filter1, work_a);
+    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
 
     /* Filter2 >> 3 */
     work_a = _mm_cmpgt_epi8(zero, filter2);
@@ -449,6 +222,7 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
     work_a = _mm_and_si128(work_a, te0);
     filter2 = _mm_and_si128(filter2, t1f);
     filter2 = _mm_or_si128(filter2, work_a);
+    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
 
     /* filt >> 1 */
     filt = _mm_adds_epi8(filter1, t1);
@@ -457,20 +231,265 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
     work_a = _mm_and_si128(work_a, t80);
     filt = _mm_and_si128(filt, t7f);
     filt = _mm_or_si128(filt, work_a);
-
     filt = _mm_andnot_si128(hev, filt);
-
-    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
     ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
-    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
     qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+    // loopfilter done
+
+    {
+      __m128i work;
+      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
+                                       _mm_subs_epu8(p0, p2)),
+                           _mm_or_si128(_mm_subs_epu8(q2, q0),
+                                        _mm_subs_epu8(q0, q2)));
+      flat = _mm_max_epu8(work, flat);
+      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
+                                       _mm_subs_epu8(p0, p3)),
+                           _mm_or_si128(_mm_subs_epu8(q3, q0),
+                                        _mm_subs_epu8(q0, q3)));
+      flat = _mm_max_epu8(work, flat);
+      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
+                                       _mm_subs_epu8(p0, p4)),
+                           _mm_or_si128(_mm_subs_epu8(q4, q0),
+                                        _mm_subs_epu8(q0, q4)));
+      flat = _mm_subs_epu8(flat, one);
+      flat = _mm_cmpeq_epi8(flat, zero);
+      flat = _mm_and_si128(flat, mask);
+
+      p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
+      q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
+      flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0),
+                                       _mm_subs_epu8(p0, p5)),
+                           _mm_or_si128(_mm_subs_epu8(q5, q0),
+                                        _mm_subs_epu8(q0, q5)));
+      _mm_store_si128((__m128i *)ap[5], p5);
+      _mm_store_si128((__m128i *)aq[5], q5);
+      flat2 = _mm_max_epu8(work, flat2);
+      p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
+      q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
+      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0),
+                                       _mm_subs_epu8(p0, p6)),
+                           _mm_or_si128(_mm_subs_epu8(q6, q0),
+                                        _mm_subs_epu8(q0, q6)));
+      _mm_store_si128((__m128i *)ap[6], p6);
+      _mm_store_si128((__m128i *)aq[6], q6);
+      flat2 = _mm_max_epu8(work, flat2);
+
+      p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
+      q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
+      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0),
+                                       _mm_subs_epu8(p0, p7)),
+                           _mm_or_si128(_mm_subs_epu8(q7, q0),
+                                        _mm_subs_epu8(q0, q7)));
+      _mm_store_si128((__m128i *)ap[7], p7);
+      _mm_store_si128((__m128i *)aq[7], q7);
+      flat2 = _mm_max_epu8(work, flat2);
+      flat2 = _mm_subs_epu8(flat2, one);
+      flat2 = _mm_cmpeq_epi8(flat2, zero);
+      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
+    }
+
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    // flat and wide flat calculations
+    {
+      const __m128i eight = _mm_set1_epi16(8);
+      const __m128i four = _mm_set1_epi16(4);
+      __m128i temp_flat2 = flat2;
+      unsigned char *src = s;
+      int i = 0;
+      do {
+        __m128i workp_shft;
+        __m128i a, b, c;
+
+        unsigned int off = i * 8;
+        p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[7] + off)), zero);
+        p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6] + off)), zero);
+        p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5] + off)), zero);
+        p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4] + off)), zero);
+        p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3] + off)), zero);
+        p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2] + off)), zero);
+        p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1] + off)), zero);
+        p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0] + off)), zero);
+        q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0] + off)), zero);
+        q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1] + off)), zero);
+        q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2] + off)), zero);
+        q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3] + off)), zero);
+        q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4] + off)), zero);
+        q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5] + off)), zero);
+        q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6] + off)), zero);
+        q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7] + off)), zero);
+
+        c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7);  // p7 * 7
+        c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c));
+
+        b = _mm_add_epi16(_mm_add_epi16(p3, four), _mm_add_epi16(p3, p2));
+        a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1));
+        a = _mm_add_epi16(_mm_add_epi16(p0, q0), a);
+
+        _mm_storel_epi64((__m128i *)&flat_op[2][i*8],
+                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
+                                          , b));
+
+        c = _mm_add_epi16(_mm_add_epi16(p5, eight), c);
+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+        _mm_storel_epi64((__m128i *)&flat2_op[6][i*8],
+                         _mm_packus_epi16(workp_shft, workp_shft));
+
+        a = _mm_add_epi16(q1, a);
+        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1);
+        _mm_storel_epi64((__m128i *)&flat_op[1][i*8],
+                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
+                                          , b));
+
+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5);
+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+        _mm_storel_epi64((__m128i *)&flat2_op[5][i*8],
+                         _mm_packus_epi16(workp_shft, workp_shft));
+
+        a = _mm_add_epi16(q2, a);
+        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0);
+        _mm_storel_epi64((__m128i *)&flat_op[0][i*8],
+                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
+                                          , b));
+
+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4);
+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+        _mm_storel_epi64((__m128i *)&flat2_op[4][i*8],
+                         _mm_packus_epi16(workp_shft, workp_shft));
+
+        a = _mm_add_epi16(q3, a);
+        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0);
+        _mm_storel_epi64((__m128i *)&flat_oq[0][i*8],
+                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
+                                          , b));
+
+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3);
+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+        _mm_storel_epi64((__m128i *)&flat2_op[3][i*8],
+                         _mm_packus_epi16(workp_shft, workp_shft));
+
+        b = _mm_add_epi16(q3, b);
+        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1);
+        _mm_storel_epi64((__m128i *)&flat_oq[1][i*8],
+                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
+                                          , b));
+
+        c = _mm_add_epi16(q4, c);
+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2);
+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+        _mm_storel_epi64((__m128i *)&flat2_op[2][i*8],
+                         _mm_packus_epi16(workp_shft, workp_shft));
+
+        b = _mm_add_epi16(q3, b);
+        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2);
+        _mm_storel_epi64((__m128i *)&flat_oq[2][i*8],
+                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
+                                          , b));
+        a = _mm_add_epi16(q5, a);
+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1);
+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+        _mm_storel_epi64((__m128i *)&flat2_op[1][i*8],
+                         _mm_packus_epi16(workp_shft, workp_shft));
+
+        a = _mm_add_epi16(q6, a);
+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0);
+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+        _mm_storel_epi64((__m128i *)&flat2_op[0][i*8],
+                         _mm_packus_epi16(workp_shft, workp_shft));
+
+        a = _mm_add_epi16(q7, a);
+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0);
+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+        _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8],
+                         _mm_packus_epi16(workp_shft, workp_shft));
+
+        a = _mm_add_epi16(q7, a);
+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1);
+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+        _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8],
+                         _mm_packus_epi16(workp_shft, workp_shft));
+
+        a = _mm_add_epi16(q7, a);
+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2);
+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+        _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8],
+                         _mm_packus_epi16(workp_shft, workp_shft));
+
+        a = _mm_add_epi16(q7, a);
+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3);
+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+        _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8],
+                         _mm_packus_epi16(workp_shft, workp_shft));
+
+        a = _mm_add_epi16(q7, a);
+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4);
+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+        _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8],
+                         _mm_packus_epi16(workp_shft, workp_shft));
+
+        a = _mm_add_epi16(q7, a);
+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5);
+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+        _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8],
+                         _mm_packus_epi16(workp_shft, workp_shft));
+
+        a = _mm_add_epi16(q7, a);
+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6);
+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+        _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8],
+                         _mm_packus_epi16(workp_shft, workp_shft));
+
+        temp_flat2 = _mm_srli_si128(temp_flat2, 8);
+        src += 8;
+      } while (++i < 2);
+    }
+    // wide flat
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    work_a = _mm_load_si128((__m128i *)ap[2]);
+    p2 = _mm_load_si128((__m128i *)flat_op[2]);
+    work_a = _mm_andnot_si128(flat, work_a);
+    p2 = _mm_and_si128(flat, p2);
+    p2 = _mm_or_si128(work_a, p2);
+    _mm_store_si128((__m128i *)flat_op[2], p2);
+
+    p1 = _mm_load_si128((__m128i *)flat_op[1]);
+    work_a = _mm_andnot_si128(flat, ps1);
+    p1 = _mm_and_si128(flat, p1);
+    p1 = _mm_or_si128(work_a, p1);
+    _mm_store_si128((__m128i *)flat_op[1], p1);
+
+    p0 = _mm_load_si128((__m128i *)flat_op[0]);
+    work_a = _mm_andnot_si128(flat, ps0);
+    p0 = _mm_and_si128(flat, p0);
+    p0 = _mm_or_si128(work_a, p0);
+    _mm_store_si128((__m128i *)flat_op[0], p0);
+
+    q0 = _mm_load_si128((__m128i *)flat_oq[0]);
+    work_a = _mm_andnot_si128(flat, qs0);
+    q0 = _mm_and_si128(flat, q0);
+    q0 = _mm_or_si128(work_a, q0);
+    _mm_store_si128((__m128i *)flat_oq[0], q0);
+
+    q1 = _mm_load_si128((__m128i *)flat_oq[1]);
+    work_a = _mm_andnot_si128(flat, qs1);
+    q1 = _mm_and_si128(flat, q1);
+    q1 = _mm_or_si128(work_a, q1);
+    _mm_store_si128((__m128i *)flat_oq[1], q1);
+
+    work_a = _mm_load_si128((__m128i *)aq[2]);
+    q2 = _mm_load_si128((__m128i *)flat_oq[2]);
+    work_a = _mm_andnot_si128(flat, work_a);
+    q2 = _mm_and_si128(flat, q2);
+    q2 = _mm_or_si128(work_a, q2);
+    _mm_store_si128((__m128i *)flat_oq[2], q2);
 
     // write out op6 - op3
     {
       unsigned char *dst = (s - 7 * p);
       for (i = 6; i > 2; i--) {
         __m128i flat2_output;
-        work_a = _mm_loadu_si128((__m128i *)dst);
+        work_a = _mm_load_si128((__m128i *)ap[i]);
         flat2_output = _mm_load_si128((__m128i *)flat2_op[i]);
         work_a = _mm_andnot_si128(flat2, work_a);
         flat2_output = _mm_and_si128(flat2, flat2_output);
@@ -480,62 +499,42 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
       }
     }
 
-    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
-    p2 = _mm_load_si128((__m128i *)flat_op2);
-    work_a = _mm_andnot_si128(flat, work_a);
-    p2 = _mm_and_si128(flat, p2);
-    work_a = _mm_or_si128(work_a, p2);
+    work_a = _mm_load_si128((__m128i *)flat_op[2]);
     p2 = _mm_load_si128((__m128i *)flat2_op[2]);
     work_a = _mm_andnot_si128(flat2, work_a);
     p2 = _mm_and_si128(flat2, p2);
     p2 = _mm_or_si128(work_a, p2);
     _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
 
-    p1 = _mm_load_si128((__m128i *)flat_op1);
-    work_a = _mm_andnot_si128(flat, ps1);
-    p1 = _mm_and_si128(flat, p1);
-    work_a = _mm_or_si128(work_a, p1);
+    work_a = _mm_load_si128((__m128i *)flat_op[1]);
     p1 = _mm_load_si128((__m128i *)flat2_op[1]);
     work_a = _mm_andnot_si128(flat2, work_a);
     p1 = _mm_and_si128(flat2, p1);
     p1 = _mm_or_si128(work_a, p1);
     _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
 
-    p0 = _mm_load_si128((__m128i *)flat_op0);
-    work_a = _mm_andnot_si128(flat, ps0);
-    p0 = _mm_and_si128(flat, p0);
-    work_a = _mm_or_si128(work_a, p0);
+    work_a = _mm_load_si128((__m128i *)flat_op[0]);
     p0 = _mm_load_si128((__m128i *)flat2_op[0]);
     work_a = _mm_andnot_si128(flat2, work_a);
     p0 = _mm_and_si128(flat2, p0);
     p0 = _mm_or_si128(work_a, p0);
     _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
 
-    q0 = _mm_load_si128((__m128i *)flat_oq0);
-    work_a = _mm_andnot_si128(flat, qs0);
-    q0 = _mm_and_si128(flat, q0);
-    work_a = _mm_or_si128(work_a, q0);
+    work_a = _mm_load_si128((__m128i *)flat_oq[0]);
     q0 = _mm_load_si128((__m128i *)flat2_oq[0]);
     work_a = _mm_andnot_si128(flat2, work_a);
     q0 = _mm_and_si128(flat2, q0);
     q0 = _mm_or_si128(work_a, q0);
     _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
 
-    q1 = _mm_load_si128((__m128i *)flat_oq1);
-    work_a = _mm_andnot_si128(flat, qs1);
-    q1 = _mm_and_si128(flat, q1);
-    work_a = _mm_or_si128(work_a, q1);
+    work_a = _mm_load_si128((__m128i *)flat_oq[1]);
     q1 = _mm_load_si128((__m128i *)flat2_oq[1]);
     work_a = _mm_andnot_si128(flat2, work_a);
     q1 = _mm_and_si128(flat2, q1);
     q1 = _mm_or_si128(work_a, q1);
     _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
 
-    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
-    q2 = _mm_load_si128((__m128i *)flat_oq2);
-    work_a = _mm_andnot_si128(flat, work_a);
-    q2 = _mm_and_si128(flat, q2);
-    work_a = _mm_or_si128(work_a, q2);
+    work_a = _mm_load_si128((__m128i *)flat_oq[2]);
     q2 = _mm_load_si128((__m128i *)flat2_oq[2]);
     work_a = _mm_andnot_si128(flat2, work_a);
     q2 = _mm_and_si128(flat2, q2);
@@ -547,7 +546,7 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
       unsigned char *dst = (s + 3 * p);
       for (i = 3; i < 7; i++) {
         __m128i flat2_output;
-        work_a = _mm_loadu_si128((__m128i *)dst);
+        work_a = _mm_load_si128((__m128i *)aq[i]);
         flat2_output = _mm_load_si128((__m128i *)flat2_oq[i]);
         work_a = _mm_andnot_si128(flat2, work_a);
         flat2_output = _mm_and_si128(flat2, flat2_output);
@@ -572,7 +571,7 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
   DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
   __m128i mask, hev, flat;
   const __m128i zero = _mm_set1_epi16(0);
-  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
+  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
   const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
   const unsigned int extended_limit  = _limit[0]  * 0x01010101u;
   const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
@@ -583,7 +582,6 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
   const __m128i blimit =
       _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);
 
-  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
   p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
   p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
   p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
@@ -592,7 +590,6 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
   q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
   q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
   q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
-  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
   {
     const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
                                           _mm_subs_epu8(p0, p1));
@@ -641,11 +638,6 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
                          _mm_or_si128(_mm_subs_epu8(q3, q0),
                                       _mm_subs_epu8(q0, q3)));
     flat = _mm_max_epu8(work, flat);
-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
-                                     _mm_subs_epu8(p0, p4)),
-                         _mm_or_si128(_mm_subs_epu8(q4, q0),
-                                      _mm_subs_epu8(q0, q4)));
-    flat = _mm_max_epu8(work, flat);
     flat = _mm_subs_epu8(flat, one);
     flat = _mm_cmpeq_epi8(flat, zero);
     flat = _mm_and_si128(flat, mask);
@@ -656,7 +648,6 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
     int i = 0;
     do {
       __m128i workp_a, workp_b, workp_shft;
-      p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero);
       p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
       p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
       p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
@@ -665,11 +656,10 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
       q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
       q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
       q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
-      q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero);
 
-      workp_a = _mm_add_epi16(_mm_add_epi16(p4, p3), _mm_add_epi16(p2, p1));
+      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
       workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
-      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p4);
+      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
       _mm_storel_epi64((__m128i *)&flat_op2[i*8],
                        _mm_packus_epi16(workp_shft, workp_shft));
@@ -679,7 +669,7 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
       _mm_storel_epi64((__m128i *)&flat_op1[i*8],
                        _mm_packus_epi16(workp_shft, workp_shft));
 
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q2);
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
       _mm_storel_epi64((__m128i *)&flat_op0[i*8],
@@ -691,13 +681,13 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
       _mm_storel_epi64((__m128i *)&flat_oq0[i*8],
                        _mm_packus_epi16(workp_shft, workp_shft));
 
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q4);
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
       _mm_storel_epi64((__m128i *)&flat_oq1[i*8],
                        _mm_packus_epi16(workp_shft, workp_shft));
 
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q4);
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
       _mm_storel_epi64((__m128i *)&flat_oq2[i*8],
@@ -881,8 +871,8 @@ void vp9_mbloop_filter_horizontal_edge_uv_sse2(unsigned char *u,
                    _mm_loadl_epi64((__m128i *)(src + 120)));
 }
 
-static __inline void transpose8x16(unsigned char *in0, unsigned char *in1,
-                                   int in_p, unsigned char *out, int out_p) {
+static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
+                                 int in_p, unsigned char *out, int out_p) {
   __m128i x0, x1, x2, x3, x4, x5, x6, x7;
   __m128i x8, x9, x10, x11, x12, x13, x14, x15;
 
@@ -947,9 +937,9 @@ static __inline void transpose8x16(unsigned char *in0, unsigned char *in1,
   _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
 }
 
-static __inline void transpose(unsigned char *src[], int in_p,
-                               unsigned char *dst[], int out_p,
-                               int num_8x8_to_transpose) {
+static INLINE void transpose(unsigned char *src[], int in_p,
+                             unsigned char *dst[], int out_p,
+                             int num_8x8_to_transpose) {
   int idx8x8 = 0;
   __m128i x0, x1, x2, x3, x4, x5, x6, x7;
   do {
diff --git a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
index b644da64c..32f00e289 100644
--- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
+++ b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
@@ -21,34 +21,92 @@
 ;
 ;*************************************************************************************/
 
-;void vp9_filter_block1d8_v8_ssse3
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    short *filter
-;)
-global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
-sym(vp9_filter_block1d8_v8_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
 
-    ALIGN_STACK 16, rax
-    sub         rsp, 16*5
-    %define k0k1 [rsp + 16*0]
-    %define k2k3 [rsp + 16*1]
-    %define k4k5 [rsp + 16*2]
-    %define k6k7 [rsp + 16*3]
-    %define krd [rsp + 16*4]
+%macro VERTx4 1
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040              ;64 in each 16-bit word (rounding term)
+
+    movdqa      xmm4, [rdx]                 ;load filters
+    movd        xmm5, rcx
+    packsswb    xmm4, xmm4                  ;16-bit taps -> signed bytes (saturating)
+    pshuflw     xmm0, xmm4, 0b              ;k0_k1
+    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
+    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
+    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
+
+    punpcklqdq  xmm0, xmm0                  ;copy low qword into high qword
+    punpcklqdq  xmm1, xmm1
+    punpcklqdq  xmm2, xmm2
+    punpcklqdq  xmm3, xmm3
+
+    movdqa      k0k1, xmm0                  ;spill tap pairs to the stack slots
+    movdqa      k2k3, xmm1
+    pshufd      xmm5, xmm5, 0               ;broadcast rounding dword to all lanes
+    movdqa      k4k5, xmm2
+    movdqa      k6k7, xmm3
+    movdqa      krd, xmm5
+
+    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line
+
+%if ABI_IS_32BIT=0
+    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
+%endif
+    mov         rax, rsi
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+    add         rax, rdx                    ;rax = src + pitch
+
+    lea         rbx, [rdx + rdx*4]          ;pitch * 5
+    add         rbx, rdx                    ;pitch * 6
+
+.loop:
+    movd        xmm0, [rsi]                 ;A
+    movd        xmm1, [rsi + rdx]           ;B
+    movd        xmm2, [rsi + rdx * 2]       ;C
+    movd        xmm3, [rax + rdx * 2]       ;D
+    movd        xmm4, [rsi + rdx * 4]       ;E
+    movd        xmm5, [rax + rdx * 4]       ;F
+
+    punpcklbw   xmm0, xmm1                  ;A B
+    punpcklbw   xmm2, xmm3                  ;C D
+    punpcklbw   xmm4, xmm5                  ;E F
+
+    movd        xmm6, [rsi + rbx]           ;G
+    movd        xmm7, [rax + rbx]           ;H
+
+    pmaddubsw   xmm0, k0k1                  ;A*k0 + B*k1 per pixel
+    pmaddubsw   xmm2, k2k3                  ;C*k2 + D*k3
+    punpcklbw   xmm6, xmm7                  ;G H
+    pmaddubsw   xmm4, k4k5                  ;E*k4 + F*k5
+    pmaddubsw   xmm6, k6k7                  ;G*k6 + H*k7
+
+    paddsw      xmm0, xmm2
+    paddsw      xmm0, krd                   ;add rounding term
+    paddsw      xmm4, xmm6
+    paddsw      xmm0, xmm4                  ;sum of all eight taps (saturating)
 
+    psraw       xmm0, 7                     ;arithmetic shift right by 7
+    packuswb    xmm0, xmm0                  ;saturate to unsigned bytes
+
+    add         rsi,  rdx                   ;advance both source pointers one row
+    add         rax,  rdx
+%if %1
+    movd        xmm1, [rdi]                 ;macro arg != 0: fetch current dst pixels
+    pavgb       xmm0, xmm1                  ;rounded average of result and dst
+%endif
+    movd        [rdi], xmm0                 ;store 4 output pixels
+
+%if ABI_IS_32BIT
+    add         rdi, DWORD PTR arg(3)       ;out_pitch
+%else
+    add         rdi, r8
+%endif
+    dec         rcx                         ;one output row done
+    jnz         .loop
+%endm
+
+%macro VERTx8 1
     mov         rdx, arg(5)                 ;filter ptr
     mov         rsi, arg(0)                 ;src_ptr
     mov         rdi, arg(2)                 ;output_ptr
@@ -86,7 +144,7 @@ sym(vp9_filter_block1d8_v8_ssse3):
     lea         rbx, [rdx + rdx*4]
     add         rbx, rdx                    ;pitch * 6
 
-.vp9_filter_block1d8_v8_ssse3_loop:
+.loop:
     movq        xmm0, [rsi]                 ;A
     movq        xmm1, [rsi + rdx]           ;B
     movq        xmm2, [rsi + rdx * 2]       ;C
@@ -117,7 +175,10 @@ sym(vp9_filter_block1d8_v8_ssse3):
 
     add         rsi,  rdx
     add         rax,  rdx
-
+%if %1
+    movq        xmm1, [rdi]
+    pavgb       xmm0, xmm1
+%endif
     movq        [rdi], xmm0
 
 %if ABI_IS_32BIT
@@ -126,47 +187,11 @@ sym(vp9_filter_block1d8_v8_ssse3):
     add         rdi, r8
 %endif
     dec         rcx
-    jnz         .vp9_filter_block1d8_v8_ssse3_loop
-
-    add rsp, 16*5
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_filter_block1d16_v8_ssse3
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    short *filter
-;)
-global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE
-sym(vp9_filter_block1d16_v8_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
+    jnz         .loop
+%endm
 
-    ALIGN_STACK 16, rax
-    sub         rsp, 16*5
-    %define k0k1 [rsp + 16*0]
-    %define k2k3 [rsp + 16*1]
-    %define k4k5 [rsp + 16*2]
-    %define k6k7 [rsp + 16*3]
-    %define krd [rsp + 16*4]
 
+%macro VERTx16 1
     mov         rdx, arg(5)                 ;filter ptr
     mov         rsi, arg(0)                 ;src_ptr
     mov         rdi, arg(2)                 ;output_ptr
@@ -204,7 +229,7 @@ sym(vp9_filter_block1d16_v8_ssse3):
     lea         rbx, [rdx + rdx*4]
     add         rbx, rdx                    ;pitch * 6
 
-.vp9_filter_block1d16_v8_ssse3_loop:
+.loop:
     movq        xmm0, [rsi]                 ;A
     movq        xmm1, [rsi + rdx]           ;B
     movq        xmm2, [rsi + rdx * 2]       ;C
@@ -232,7 +257,10 @@ sym(vp9_filter_block1d16_v8_ssse3):
 
     psraw       xmm0, 7
     packuswb    xmm0, xmm0
-
+%if %1
+    movq        xmm1, [rdi]
+    pavgb       xmm0, xmm1
+%endif
     movq        [rdi], xmm0
 
     movq        xmm0, [rsi + 8]             ;A
@@ -267,6 +295,10 @@ sym(vp9_filter_block1d16_v8_ssse3):
 
     add         rsi,  rdx
     add         rax,  rdx
+%if %1
+    movq    xmm1, [rdi+8]
+    pavgb   xmm0, xmm1
+%endif
 
     movq        [rdi+8], xmm0
 
@@ -276,7 +308,38 @@ sym(vp9_filter_block1d16_v8_ssse3):
     add         rdi, r8
 %endif
     dec         rcx
-    jnz         .vp9_filter_block1d16_v8_ssse3_loop
+    jnz         .loop
+%endm
+
+;void vp9_filter_block1d4_v8_ssse3
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE
+sym(vp9_filter_block1d4_v8_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+
+    VERTx4 0
 
     add rsp, 16*5
     pop rsp
@@ -289,24 +352,65 @@ sym(vp9_filter_block1d16_v8_ssse3):
     pop         rbp
     ret
 
-;void vp9_filter_block1d8_h8_ssse3
+;void vp9_filter_block1d8_v8_ssse3
 ;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE
-sym(vp9_filter_block1d8_h8_ssse3):
+global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
+sym(vp9_filter_block1d8_v8_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx                         ;rbx holds pitch * 6 inside the filter macro
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5                   ;five 16-byte slots, named by the defines below
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+    ; macro argument 0: store the filtered row without averaging it with dst
+    VERTx8 0
+
+    add rsp, 16*5                           ;release the five slots
+    pop rsp                                 ;presumably the rsp saved by ALIGN_STACK - confirm in x86_abi_support.asm
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_filter_block1d16_v8_ssse3
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE
+sym(vp9_filter_block1d16_v8_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
     SAVE_XMM 7
-    GET_GOT     rbx
     push        rsi
     push        rdi
+    push        rbx
     ; end prolog
 
     ALIGN_STACK 16, rax
@@ -317,6 +421,121 @@ sym(vp9_filter_block1d8_h8_ssse3):
     %define k6k7 [rsp + 16*3]
     %define krd [rsp + 16*4]
 
+    VERTx16 0
+
+    add rsp, 16*5
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+global sym(vp9_filter_block1d4_v8_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d4_v8_avg_ssse3):
+    push        rbp                         ; rbp becomes the frame pointer for this call
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi                         ; rsi, rdi and rbx are saved here and popped in reverse order below
+    push        rdi
+    push        rbx
+    ; end prolog
+    ; Scratch area: five 16-byte stack slots, addressed through the k0k1..k6k7 and krd names.
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+    ; VERTx4 is defined earlier in the file. NOTE(review): argument 1 presumably averages the result with the destination, as the HORIZ macros do with pavgb - confirm.
+    VERTx4 1
+
+    add rsp, 16*5                           ; drop the five scratch slots
+    pop rsp                                 ; NOTE(review): presumably reloads the rsp saved by ALIGN_STACK - confirm
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d8_v8_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d8_v8_avg_ssse3):
+    push        rbp                         ; rbp becomes the frame pointer for this call
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi                         ; rsi, rdi and rbx are saved here and popped in reverse order below
+    push        rdi
+    push        rbx
+    ; end prolog
+    ; Scratch area: five 16-byte stack slots, addressed through the k0k1..k6k7 and krd names.
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+    ; VERTx8 is defined earlier in the file. NOTE(review): argument 1 presumably averages the result with the destination, as the HORIZ macros do with pavgb - confirm.
+    VERTx8 1
+
+    add rsp, 16*5                           ; drop the five scratch slots
+    pop rsp                                 ; NOTE(review): presumably reloads the rsp saved by ALIGN_STACK - confirm
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d16_v8_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d16_v8_avg_ssse3):
+    push        rbp                         ; rbp becomes the frame pointer for this call
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi                         ; rsi, rdi and rbx are saved here and popped in reverse order below
+    push        rdi
+    push        rbx
+    ; end prolog
+    ; Scratch area: five 16-byte stack slots, addressed through the k0k1..k6k7 and krd names.
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+    ; VERTx16 is defined earlier in the file. NOTE(review): argument 1 presumably averages the result with the destination, as the HORIZ macros do with pavgb - confirm.
+    VERTx16 1
+
+    add rsp, 16*5                           ; drop the five scratch slots
+    pop rsp                                 ; NOTE(review): presumably reloads the rsp saved by ALIGN_STACK - confirm
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+%macro HORIZx4 1
     mov         rdx, arg(5)                 ;filter ptr
     mov         rsi, arg(0)                 ;src_ptr
     mov         rdi, arg(2)                 ;output_ptr
@@ -340,19 +559,16 @@ sym(vp9_filter_block1d8_h8_ssse3):
     pshufd      xmm5, xmm5, 0
     movdqa      k4k5, xmm2
     movdqa      k6k7, xmm3
-;    movdqa      krd, xmm5
+    movdqa      krd, xmm5
 
     movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
     movsxd      rdx, dword ptr arg(3)       ;output_pitch
     movsxd      rcx, dword ptr arg(4)       ;output_height
 
-.filter_block1d8_h8_rowloop_ssse3:
+.loop:
     movq        xmm0,   [rsi - 3]    ; -3 -2 -1  0  1  2  3  4
 
-;    movq        xmm3,   [rsi + 4]    ; 4  5  6  7  8  9 10 11
     movq        xmm3,   [rsi + 5]    ; 5  6  7  8  9 10 11 12
-;note: if we create a k0_k7 filter, we can save a pshufb
-;    punpcklbw   xmm0,   xmm3         ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11
     punpcklqdq  xmm0,   xmm3
 
     movdqa      xmm1,   xmm0
@@ -371,59 +587,94 @@ sym(vp9_filter_block1d8_h8_ssse3):
     pmaddubsw   xmm4,   k6k7
 
     paddsw      xmm0,   xmm1
-    paddsw      xmm0,   xmm2
-    paddsw      xmm0,   xmm5
     paddsw      xmm0,   xmm4
+    paddsw      xmm0,   xmm2
+    paddsw      xmm0,   krd
     psraw       xmm0,   7
     packuswb    xmm0,   xmm0
-
+%if %1
+    movd        xmm1,   [rdi]
+    pavgb       xmm0,   xmm1
+%endif
     lea         rsi,    [rsi + rax]
-    movq        [rdi],  xmm0
+    movd        [rdi],  xmm0
 
     lea         rdi,    [rdi + rdx]
     dec         rcx
-    jnz         .filter_block1d8_h8_rowloop_ssse3
+    jnz         .loop
+%endm
 
-    add rsp, 16*5
-    pop rsp
+%macro HORIZx8 1
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040              ;two words of 0x0040: rounding term for the shift by 7 below
 
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
+    movdqa      xmm4, [rdx]                 ;load filters
+    movd        xmm5, rcx                   ;rounding words into the low end of xmm5
+    packsswb    xmm4, xmm4                  ;narrow the eight word taps to signed bytes (saturating)
+    pshuflw     xmm0, xmm4, 0b              ;k0_k1
+    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
+    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
+    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
 
-;void vp9_filter_block1d16_h8_ssse3
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    short *filter
-;)
-global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE
-sym(vp9_filter_block1d16_h8_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
+    punpcklqdq  xmm0, xmm0                  ;copy the low qword up: the tap pair now fills all 8 words
+    punpcklqdq  xmm1, xmm1
+    punpcklqdq  xmm2, xmm2
+    punpcklqdq  xmm3, xmm3
 
-    ALIGN_STACK 16, rax
-    sub         rsp, 16*5
-    %define k0k1 [rsp + 16*0]
-    %define k2k3 [rsp + 16*1]
-    %define k4k5 [rsp + 16*2]
-    %define k6k7 [rsp + 16*3]
-    %define krd [rsp + 16*4]
+    movdqa      k0k1, xmm0                  ;spill the broadcast tap pairs to the stack slots
+    movdqa      k2k3, xmm1
+    pshufd      xmm5, xmm5, 0               ;broadcast the rounding dword to all four dword lanes
+    movdqa      k4k5, xmm2
+    movdqa      k6k7, xmm3
+    movdqa      krd, xmm5
+    ; Each pass of the loop below produces one row of 8 output bytes; rcx counts the rows.
+    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
+    movsxd      rdx, dword ptr arg(3)       ;output_pitch
+    movsxd      rcx, dword ptr arg(4)       ;output_height
 
+.loop:
+    movq        xmm0,   [rsi - 3]    ; -3 -2 -1  0  1  2  3  4
+
+    movq        xmm3,   [rsi + 5]    ; 5  6  7  8  9 10 11 12
+    punpcklqdq  xmm0,   xmm3         ; xmm0 = 16 source bytes, offsets -3..12
+
+    movdqa      xmm1,   xmm0         ; keep an unshuffled copy for the next tap pair
+    pshufb      xmm0,   [GLOBAL(shuf_t0t1)] ; NOTE(review): table presumably pairs the pixels for taps 0/1 - confirm
+    pmaddubsw   xmm0,   k0k1         ; unsigned pixel * signed tap, adjacent products summed into words
+
+    movdqa      xmm2,   xmm1
+    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
+    pmaddubsw   xmm1,   k2k3
+
+    movdqa      xmm4,   xmm2
+    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
+    pmaddubsw   xmm2,   k4k5
+
+    pshufb      xmm4,   [GLOBAL(shuf_t6t7)]
+    pmaddubsw   xmm4,   k6k7
+
+    paddsw      xmm0,   xmm1         ; saturating sum of the four partial results
+    paddsw      xmm0,   xmm4
+    paddsw      xmm0,   xmm2
+    paddsw      xmm0,   krd          ; add the rounding term (64 per word)
+    psraw       xmm0,   7            ; arithmetic shift right by 7
+    packuswb    xmm0,   xmm0         ; clamp the words to unsigned bytes
+%if %1
+    movq        xmm1,   [rdi]        ; macro argument nonzero: fetch the 8 bytes already at the destination
+    pavgb       xmm0,   xmm1         ; rounded byte average of new and existing pixels
+%endif
+
+    lea         rsi,    [rsi + rax]  ; advance to the next source row
+    movq        [rdi],  xmm0         ; store 8 output bytes
+
+    lea         rdi,    [rdi + rdx]  ; advance to the next output row
+    dec         rcx
+    jnz         .loop
+%endm
+
+%macro HORIZx16 1
     mov         rdx, arg(5)                 ;filter ptr
     mov         rsi, arg(0)                 ;src_ptr
     mov         rdi, arg(2)                 ;output_ptr
@@ -453,13 +704,10 @@ sym(vp9_filter_block1d16_h8_ssse3):
     movsxd      rdx, dword ptr arg(3)       ;output_pitch
     movsxd      rcx, dword ptr arg(4)       ;output_height
 
-.filter_block1d16_h8_rowloop_ssse3:
+.loop:
     movq        xmm0,   [rsi - 3]    ; -3 -2 -1  0  1  2  3  4
 
-;    movq        xmm3,   [rsi + 4]    ; 4  5  6  7  8  9 10 11
     movq        xmm3,   [rsi + 5]    ; 5  6  7  8  9 10 11 12
-;note: if we create a k0_k7 filter, we can save a pshufb
-;    punpcklbw   xmm0,   xmm3         ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11
     punpcklqdq  xmm0,   xmm3
 
     movdqa      xmm1,   xmm0
@@ -486,10 +734,7 @@ sym(vp9_filter_block1d16_h8_ssse3):
 
 
     movq        xmm3,   [rsi +  5]
-;    movq        xmm7,   [rsi + 12]
     movq        xmm7,   [rsi + 13]
-;note: same as above
-;    punpcklbw   xmm3,   xmm7
     punpcklqdq  xmm3,   xmm7
 
     movdqa      xmm1,   xmm3
@@ -508,19 +753,54 @@ sym(vp9_filter_block1d16_h8_ssse3):
     pmaddubsw   xmm4,   k6k7
 
     paddsw      xmm3,   xmm1
+    paddsw      xmm3,   xmm4
     paddsw      xmm3,   xmm2
     paddsw      xmm3,   krd
-    paddsw      xmm3,   xmm4
     psraw       xmm3,   7
     packuswb    xmm3,   xmm3
     punpcklqdq  xmm0,   xmm3
+%if %1
+    movdqa      xmm1,   [rdi]
+    pavgb       xmm0,   xmm1
+%endif
 
     lea         rsi,    [rsi + rax]
     movdqa      [rdi],  xmm0
 
     lea         rdi,    [rdi + rdx]
     dec         rcx
-    jnz         .filter_block1d16_h8_rowloop_ssse3
+    jnz         .loop
+%endm
+
+;void vp9_filter_block1d4_h8_ssse3
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE
+sym(vp9_filter_block1d4_h8_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+
+    HORIZx4 0
 
     add rsp, 16*5
     pop rsp
@@ -534,7 +814,188 @@ sym(vp9_filter_block1d16_h8_ssse3):
     pop         rbp
     ret
 
+;void vp9_filter_block1d8_h8_ssse3
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE
+sym(vp9_filter_block1d8_h8_ssse3):
+    push        rbp                         ; rbp becomes the frame pointer for this call
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx                         ; NOTE(review): presumably sets up rbx for the GLOBAL() table loads in HORIZx8 - confirm
+    push        rsi                         ; rsi and rdi are saved here and popped in reverse order below
+    push        rdi
+    ; end prolog
+    ; Scratch area: five 16-byte stack slots; HORIZx8 fills k0k1..k6k7 with tap pairs and krd with the rounding term.
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+    ; Expand the 8-bytes-per-row horizontal filter; argument 0 leaves out the pavgb blend with the destination.
+    HORIZx8 0
+
+    add rsp, 16*5                           ; drop the five scratch slots
+    pop rsp                                 ; NOTE(review): presumably reloads the rsp saved by ALIGN_STACK - confirm
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_filter_block1d16_h8_ssse3
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE
+sym(vp9_filter_block1d16_h8_ssse3):
+    push        rbp                         ; rbp becomes the frame pointer for this call
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx                         ; NOTE(review): presumably sets up rbx for GLOBAL() data references in HORIZx16 - confirm
+    push        rsi                         ; rsi and rdi are saved here and popped in reverse order below
+    push        rdi
+    ; end prolog
+    ; Scratch area: five 16-byte stack slots, addressed through the k0k1..k6k7 and krd names.
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+    ; Expand the 16-bytes-per-row horizontal filter without the pavgb blend. It stores rows with movdqa, so each output row must be 16-byte aligned.
+    HORIZx16 0
 
+    add rsp, 16*5                           ; drop the five scratch slots
+    pop rsp                                 ; NOTE(review): presumably reloads the rsp saved by ALIGN_STACK - confirm
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d4_h8_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d4_h8_avg_ssse3):
+    push        rbp                         ; rbp becomes the frame pointer for this call
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx                         ; NOTE(review): presumably sets up rbx for GLOBAL() data references in HORIZx4 - confirm
+    push        rsi                         ; rsi and rdi are saved here and popped in reverse order below
+    push        rdi
+    ; end prolog
+    ; Scratch area: five 16-byte stack slots, addressed through the k0k1..k6k7 and krd names.
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+    ; Expand the 4-bytes-per-row horizontal filter; argument 1 makes it pavgb the result with the 4 bytes already at the destination.
+    HORIZx4 1
+
+    add rsp, 16*5                           ; drop the five scratch slots
+    pop rsp                                 ; NOTE(review): presumably reloads the rsp saved by ALIGN_STACK - confirm
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d8_h8_avg_ssse3):
+    push        rbp                         ; rbp becomes the frame pointer for this call
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx                         ; NOTE(review): presumably sets up rbx for the GLOBAL() table loads in HORIZx8 - confirm
+    push        rsi                         ; rsi and rdi are saved here and popped in reverse order below
+    push        rdi
+    ; end prolog
+    ; Scratch area: five 16-byte stack slots; HORIZx8 fills k0k1..k6k7 with tap pairs and krd with the rounding term.
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+    ; Expand the 8-bytes-per-row horizontal filter; argument 1 makes it pavgb the result with the 8 bytes already at the destination.
+    HORIZx8 1
+
+    add rsp, 16*5                           ; drop the five scratch slots
+    pop rsp                                 ; NOTE(review): presumably reloads the rsp saved by ALIGN_STACK - confirm
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vp9_filter_block1d16_h8_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d16_h8_avg_ssse3):
+    push        rbp                         ; rbp becomes the frame pointer for this call
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx                         ; NOTE(review): presumably sets up rbx for GLOBAL() data references in HORIZx16 - confirm
+    push        rsi                         ; rsi and rdi are saved here and popped in reverse order below
+    push        rdi
+    ; end prolog
+    ; Scratch area: five 16-byte stack slots, addressed through the k0k1..k6k7 and krd names.
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+    ; Expand the 16-bytes-per-row horizontal filter with the pavgb blend. It loads and stores destination rows with movdqa, so each row must be 16-byte aligned.
+    HORIZx16 1
+
+    add rsp, 16*5                           ; drop the five scratch slots
+    pop rsp                                 ; NOTE(review): presumably reloads the rsp saved by ALIGN_STACK - confirm
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
 SECTION_RODATA
 align 16
 shuf_t0t1:
diff --git a/vp9/common/x86/vp9_subpixel_mmx.asm b/vp9/common/x86/vp9_subpixel_mmx.asm
deleted file mode 100644
index dee29b8fb..000000000
--- a/vp9/common/x86/vp9_subpixel_mmx.asm
+++ /dev/null
@@ -1,268 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-
-%define BLOCK_HEIGHT_WIDTH 4
-%define vp9_filter_weight 128
-%define VP9_FILTER_SHIFT  7
-
-
-;void vp9_filter_block1d_h6_mmx
-;(
-;    unsigned char   *src_ptr,
-;    unsigned short  *output_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned int    pixel_step,
-;    unsigned int    output_height,
-;    unsigned int    output_width,
-;    short           * vp9_filter
-;)
-global sym(vp9_filter_block1d_h6_mmx) PRIVATE
-sym(vp9_filter_block1d_h6_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rdx,    arg(6) ;vp9_filter
-
-        movq        mm1,    [rdx + 16]             ; do both the negative taps first!!!
-        movq        mm2,    [rdx + 32]         ;
-        movq        mm6,    [rdx + 48]        ;
-        movq        mm7,    [rdx + 64]        ;
-
-        mov         rdi,    arg(1) ;output_ptr
-        mov         rsi,    arg(0) ;src_ptr
-        movsxd      rcx,    dword ptr arg(4) ;output_height
-        movsxd      rax,    dword ptr arg(5) ;output_width      ; destination pitch?
-        pxor        mm0,    mm0              ; mm0 = 00000000
-
-.nextrow:
-        movq        mm3,    [rsi-2]          ; mm3 = p-2..p5
-        movq        mm4,    mm3              ; mm4 = p-2..p5
-        psrlq       mm3,    8                ; mm3 = p-1..p5
-        punpcklbw   mm3,    mm0              ; mm3 = p-1..p2
-        pmullw      mm3,    mm1              ; mm3 *= kernel 1 modifiers.
-
-        movq        mm5,    mm4              ; mm5 = p-2..p5
-        punpckhbw   mm4,    mm0              ; mm5 = p2..p5
-        pmullw      mm4,    mm7              ; mm5 *= kernel 4 modifiers
-        paddsw      mm3,    mm4              ; mm3 += mm5
-
-        movq        mm4,    mm5              ; mm4 = p-2..p5;
-        psrlq       mm5,    16               ; mm5 = p0..p5;
-        punpcklbw   mm5,    mm0              ; mm5 = p0..p3
-        pmullw      mm5,    mm2              ; mm5 *= kernel 2 modifiers
-        paddsw      mm3,    mm5              ; mm3 += mm5
-
-        movq        mm5,    mm4              ; mm5 = p-2..p5
-        psrlq       mm4,    24               ; mm4 = p1..p5
-        punpcklbw   mm4,    mm0              ; mm4 = p1..p4
-        pmullw      mm4,    mm6              ; mm5 *= kernel 3 modifiers
-        paddsw      mm3,    mm4              ; mm3 += mm5
-
-        ; do outer positive taps
-        movd        mm4,    [rsi+3]
-        punpcklbw   mm4,    mm0              ; mm5 = p3..p6
-        pmullw      mm4,    [rdx+80]         ; mm5 *= kernel 0 modifiers
-        paddsw      mm3,    mm4              ; mm3 += mm5
-
-        punpcklbw   mm5,    mm0              ; mm5 = p-2..p1
-        pmullw      mm5,    [rdx]            ; mm5 *= kernel 5 modifiers
-        paddsw      mm3,    mm5              ; mm3 += mm5
-
-        paddsw      mm3,    [GLOBAL(rd)]              ; mm3 += round value
-        psraw       mm3,    VP9_FILTER_SHIFT     ; mm3 /= 128
-        packuswb    mm3,    mm0              ; pack and unpack to saturate
-        punpcklbw   mm3,    mm0              ;
-
-        movq        [rdi],  mm3              ; store the results in the destination
-
-%if ABI_IS_32BIT
-        add         rsi,    dword ptr arg(2) ;src_pixels_per_line ; next line
-        add         rdi,    rax;
-%else
-        movsxd      r8,     dword ptr arg(2) ;src_pixels_per_line
-        add         rdi,    rax;
-
-        add         rsi,    r8               ; next line
-%endif
-
-        dec         rcx                      ; decrement count
-        jnz         .nextrow                 ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_filter_block1dc_v6_mmx
-;(
-;   short *src_ptr,
-;   unsigned char *output_ptr,
-;    int output_pitch,
-;   unsigned int pixels_per_line,
-;   unsigned int pixel_step,
-;   unsigned int output_height,
-;   unsigned int output_width,
-;   short * vp9_filter
-;)
-global sym(vp9_filter_block1dc_v6_mmx) PRIVATE
-sym(vp9_filter_block1dc_v6_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        movq      mm5, [GLOBAL(rd)]
-        push        rbx
-        mov         rbx, arg(7) ;vp9_filter
-        movq      mm1, [rbx + 16]             ; do both the negative taps first!!!
-        movq      mm2, [rbx + 32]         ;
-        movq      mm6, [rbx + 48]        ;
-        movq      mm7, [rbx + 64]        ;
-
-        movsxd      rdx, dword ptr arg(3) ;pixels_per_line
-        mov         rdi, arg(1) ;output_ptr
-        mov         rsi, arg(0) ;src_ptr
-        sub         rsi, rdx
-        sub         rsi, rdx
-        movsxd      rcx, DWORD PTR arg(5) ;output_height
-        movsxd      rax, DWORD PTR arg(2) ;output_pitch      ; destination pitch?
-        pxor        mm0, mm0              ; mm0 = 00000000
-
-
-.nextrow_cv:
-        movq        mm3, [rsi+rdx]        ; mm3 = p0..p8  = row -1
-        pmullw      mm3, mm1              ; mm3 *= kernel 1 modifiers.
-
-
-        movq        mm4, [rsi + 4*rdx]      ; mm4 = p0..p3  = row 2
-        pmullw      mm4, mm7              ; mm4 *= kernel 4 modifiers.
-        paddsw      mm3, mm4              ; mm3 += mm4
-
-        movq        mm4, [rsi + 2*rdx]           ; mm4 = p0..p3  = row 0
-        pmullw      mm4, mm2              ; mm4 *= kernel 2 modifiers.
-        paddsw      mm3, mm4              ; mm3 += mm4
-
-        movq        mm4, [rsi]            ; mm4 = p0..p3  = row -2
-        pmullw      mm4, [rbx]            ; mm4 *= kernel 0 modifiers.
-        paddsw      mm3, mm4              ; mm3 += mm4
-
-
-        add         rsi, rdx              ; move source forward 1 line to avoid 3 * pitch
-        movq        mm4, [rsi + 2*rdx]     ; mm4 = p0..p3  = row 1
-        pmullw      mm4, mm6              ; mm4 *= kernel 3 modifiers.
-        paddsw      mm3, mm4              ; mm3 += mm4
-
-        movq        mm4, [rsi + 4*rdx]    ; mm4 = p0..p3  = row 3
-        pmullw      mm4, [rbx +80]        ; mm4 *= kernel 3 modifiers.
-        paddsw      mm3, mm4              ; mm3 += mm4
-
-
-        paddsw      mm3, mm5               ; mm3 += round value
-        psraw       mm3, VP9_FILTER_SHIFT     ; mm3 /= 128
-        packuswb    mm3, mm0              ; pack and saturate
-
-        movd        [rdi],mm3             ; store the results in the destination
-        ; the subsequent iterations repeat 3 out of 4 of these reads.  Since the
-        ; recon block should be in cache this shouldn't cost much.  Its obviously
-        ; avoidable!!!.
-        lea         rdi,  [rdi+rax] ;
-        dec         rcx                   ; decrement count
-        jnz         .nextrow_cv           ; next row
-
-        pop         rbx
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-rd:
-    times 4 dw 0x40
-
-align 16
-global HIDDEN_DATA(sym(vp9_six_tap_mmx))
-sym(vp9_six_tap_mmx):
-    times 8 dw 0
-    times 8 dw 0
-    times 8 dw 128
-    times 8 dw 0
-    times 8 dw 0
-    times 8 dw 0
-
-    times 8 dw 0
-    times 8 dw -6
-    times 8 dw 123
-    times 8 dw 12
-    times 8 dw -1
-    times 8 dw 0
-
-    times 8 dw 2
-    times 8 dw -11
-    times 8 dw 108
-    times 8 dw 36
-    times 8 dw -8
-    times 8 dw 1
-
-    times 8 dw 0
-    times 8 dw -9
-    times 8 dw 93
-    times 8 dw 50
-    times 8 dw -6
-    times 8 dw 0
-
-    times 8 dw 3
-    times 8 dw -16
-    times 8 dw 77
-    times 8 dw 77
-    times 8 dw -16
-    times 8 dw 3
-
-    times 8 dw 0
-    times 8 dw -6
-    times 8 dw 50
-    times 8 dw 93
-    times 8 dw -9
-    times 8 dw 0
-
-    times 8 dw 1
-    times 8 dw -8
-    times 8 dw 36
-    times 8 dw 108
-    times 8 dw -11
-    times 8 dw 2
-
-    times 8 dw 0
-    times 8 dw -1
-    times 8 dw 12
-    times 8 dw 123
-    times 8 dw -6
-    times 8 dw 0
-
diff --git a/vp9/common/x86/vp9_subpixel_sse2.asm b/vp9/common/x86/vp9_subpixel_sse2.asm
deleted file mode 100644
index b0c4f1282..000000000
--- a/vp9/common/x86/vp9_subpixel_sse2.asm
+++ /dev/null
@@ -1,1372 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define BLOCK_HEIGHT_WIDTH 4
-%define VP9_FILTER_WEIGHT 128
-%define VP9_FILTER_SHIFT  7
-
-
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 8 pixels in horizontal direction, calculating ONE
-; rows each iteration to take advantage of the 128 bits operations.
-;*************************************************************************************/
-;void vp9_filter_block1d8_h6_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned short *output_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned int    pixel_step,
-;    unsigned int    output_height,
-;    unsigned int    output_width,
-;    short           *vp9_filter
-;)
-global sym(vp9_filter_block1d8_h6_sse2) PRIVATE
-sym(vp9_filter_block1d8_h6_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rdx,        arg(6) ;vp9_filter
-        mov         rsi,        arg(0) ;src_ptr
-
-        mov         rdi,        arg(1) ;output_ptr
-
-        movsxd      rcx,        dword ptr arg(4) ;output_height
-        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(5) ;output_width
-%endif
-        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
-
-.filter_block1d8_h6_rowloop:
-        movq        xmm3,       MMWORD PTR [rsi - 2]
-        movq        xmm1,       MMWORD PTR [rsi + 6]
-
-        prefetcht2  [rsi+rax-2]
-
-        pslldq      xmm1,       8
-        por         xmm1,       xmm3
-
-        movdqa      xmm4,       xmm1
-        movdqa      xmm5,       xmm1
-
-        movdqa      xmm6,       xmm1
-        movdqa      xmm7,       xmm1
-
-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
-
-
-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
-
-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
-
-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
-        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-
-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
-
-        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
-        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
-
-
-        paddsw      xmm4,       xmm7
-        paddsw      xmm4,       xmm5
-
-        paddsw      xmm4,       xmm3
-        paddsw      xmm4,       xmm6
-
-        paddsw      xmm4,       xmm1
-        paddsw      xmm4,       [GLOBAL(rd)]
-
-        psraw       xmm4,       7
-
-        packuswb    xmm4,       xmm0
-        punpcklbw   xmm4,       xmm0
-
-        movdqa      XMMWORD Ptr [rdi],         xmm4
-        lea         rsi,        [rsi + rax]
-
-%if ABI_IS_32BIT
-        add         rdi,        DWORD Ptr arg(5) ;[output_width]
-%else
-        add         rdi,        r8
-%endif
-        dec         rcx
-
-        jnz         .filter_block1d8_h6_rowloop                ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_filter_block1d16_h6_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned short *output_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned int    pixel_step,
-;    unsigned int    output_height,
-;    unsigned int    output_width,
-;    short           *vp9_filter
-;)
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 8 pixels in horizontal direction, calculating ONE
-; rows each iteration to take advantage of the 128 bits operations.
-;*************************************************************************************/
-global sym(vp9_filter_block1d16_h6_sse2) PRIVATE
-sym(vp9_filter_block1d16_h6_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rdx,        arg(6) ;vp9_filter
-        mov         rsi,        arg(0) ;src_ptr
-
-        mov         rdi,        arg(1) ;output_ptr
-
-        movsxd      rcx,        dword ptr arg(4) ;output_height
-        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(5) ;output_width
-%endif
-
-        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
-
-.filter_block1d16_h6_sse2_rowloop:
-        movq        xmm3,       MMWORD PTR [rsi - 2]
-        movq        xmm1,       MMWORD PTR [rsi + 6]
-
-        movq        xmm2,       MMWORD PTR [rsi +14]
-        pslldq      xmm2,       8
-
-        por         xmm2,       xmm1
-        prefetcht2  [rsi+rax-2]
-
-        pslldq      xmm1,       8
-        por         xmm1,       xmm3
-
-        movdqa      xmm4,       xmm1
-        movdqa      xmm5,       xmm1
-
-        movdqa      xmm6,       xmm1
-        movdqa      xmm7,       xmm1
-
-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
-
-
-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
-
-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
-
-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
-        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-
-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
-
-        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
-        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
-
-        paddsw      xmm4,       xmm7
-        paddsw      xmm4,       xmm5
-
-        paddsw      xmm4,       xmm3
-        paddsw      xmm4,       xmm6
-
-        paddsw      xmm4,       xmm1
-        paddsw      xmm4,       [GLOBAL(rd)]
-
-        psraw       xmm4,       7
-
-        packuswb    xmm4,       xmm0
-        punpcklbw   xmm4,       xmm0
-
-        movdqa      XMMWORD Ptr [rdi],         xmm4
-
-        movdqa      xmm3,       xmm2
-        movdqa      xmm4,       xmm2
-
-        movdqa      xmm5,       xmm2
-        movdqa      xmm6,       xmm2
-
-        movdqa      xmm7,       xmm2
-
-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
-
-
-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
-
-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
-
-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
-        psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
-
-        punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
-        pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
-
-
-        paddsw      xmm4,       xmm7
-        paddsw      xmm4,       xmm5
-
-        paddsw      xmm4,       xmm3
-        paddsw      xmm4,       xmm6
-
-        paddsw      xmm4,       xmm2
-        paddsw      xmm4,       [GLOBAL(rd)]
-
-        psraw       xmm4,       7
-
-        packuswb    xmm4,       xmm0
-        punpcklbw   xmm4,       xmm0
-
-        movdqa      XMMWORD Ptr [rdi+16],      xmm4
-
-        lea         rsi,        [rsi + rax]
-%if ABI_IS_32BIT
-        add         rdi,        DWORD Ptr arg(5) ;[output_width]
-%else
-        add         rdi,        r8
-%endif
-
-        dec         rcx
-        jnz         .filter_block1d16_h6_sse2_rowloop                ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_filter_block1d8_v6_sse2
-;(
-;    short *src_ptr,
-;    unsigned char *output_ptr,
-;    int dst_ptich,
-;    unsigned int pixels_per_line,
-;    unsigned int pixel_step,
-;    unsigned int output_height,
-;    unsigned int output_width,
-;    short * vp9_filter
-;)
-;/************************************************************************************
-; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The
-; input pixel array has output_height rows.
-;*************************************************************************************/
-global sym(vp9_filter_block1d8_v6_sse2) PRIVATE
-sym(vp9_filter_block1d8_v6_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rax,        arg(7) ;vp9_filter
-        movsxd      rdx,        dword ptr arg(3) ;pixels_per_line
-
-        mov         rdi,        arg(1) ;output_ptr
-        mov         rsi,        arg(0) ;src_ptr
-
-        sub         rsi,        rdx
-        sub         rsi,        rdx
-
-        movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
-        pxor        xmm0,       xmm0                        ; clear xmm0
-
-        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(2) ; dst_ptich
-%endif
-
-.vp9_filter_block1d8_v6_sse2_loop:
-        movdqa      xmm1,       XMMWORD PTR [rsi]
-        pmullw      xmm1,       [rax]
-
-        movdqa      xmm2,       XMMWORD PTR [rsi + rdx]
-        pmullw      xmm2,       [rax + 16]
-
-        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]
-        pmullw      xmm3,       [rax + 32]
-
-        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]
-        pmullw      xmm5,       [rax + 64]
-
-        add         rsi,        rdx
-        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2]
-
-        pmullw      xmm4,       [rax + 48]
-        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4]
-
-        pmullw      xmm6,       [rax + 80]
-
-        paddsw      xmm2,       xmm5
-        paddsw      xmm2,       xmm3
-
-        paddsw      xmm2,       xmm1
-        paddsw      xmm2,       xmm4
-
-        paddsw      xmm2,       xmm6
-        paddsw      xmm2,       xmm7
-
-        psraw       xmm2,       7
-        packuswb    xmm2,       xmm0              ; pack and saturate
-
-        movq        QWORD PTR [rdi], xmm2         ; store the results in the destination
-%if ABI_IS_32BIT
-        add         rdi,        DWORD PTR arg(2) ;[dst_ptich]
-%else
-        add         rdi,        r8
-%endif
-        dec         rcx         ; decrement count
-        jnz         .vp9_filter_block1d8_v6_sse2_loop               ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_filter_block1d16_v6_sse2
-;(
-;    unsigned short *src_ptr,
-;    unsigned char *output_ptr,
-;    int dst_ptich,
-;    unsigned int pixels_per_line,
-;    unsigned int pixel_step,
-;    unsigned int output_height,
-;    unsigned int output_width,
-;    const short    *vp9_filter
-;)
-;/************************************************************************************
-; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The
-; input pixel array has output_height rows.
-;*************************************************************************************/
-global sym(vp9_filter_block1d16_v6_sse2) PRIVATE
-sym(vp9_filter_block1d16_v6_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rax,        arg(7) ;vp9_filter
-        movsxd      rdx,        dword ptr arg(3) ;pixels_per_line
-
-        mov         rdi,        arg(1) ;output_ptr
-        mov         rsi,        arg(0) ;src_ptr
-
-        sub         rsi,        rdx
-        sub         rsi,        rdx
-
-        movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(2) ; dst_ptich
-%endif
-
-.vp9_filter_block1d16_v6_sse2_loop:
-; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
-        movdqa      xmm1,       XMMWORD PTR [rsi + rdx]       ; line 2
-        movdqa      xmm2,       XMMWORD PTR [rsi + rdx + 16]
-        pmullw      xmm1,       [rax + 16]
-        pmullw      xmm2,       [rax + 16]
-
-        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 4]       ; line 5
-        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 4 + 16]
-        pmullw      xmm3,       [rax + 64]
-        pmullw      xmm4,       [rax + 64]
-
-        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 2]       ; line 3
-        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 2 + 16]
-        pmullw      xmm5,       [rax + 32]
-        pmullw      xmm6,       [rax + 32]
-
-        movdqa      xmm7,       XMMWORD PTR [rsi]       ; line 1
-        movdqa      xmm0,       XMMWORD PTR [rsi + 16]
-        pmullw      xmm7,       [rax]
-        pmullw      xmm0,       [rax]
-
-        paddsw      xmm1,       xmm3
-        paddsw      xmm2,       xmm4
-        paddsw      xmm1,       xmm5
-        paddsw      xmm2,       xmm6
-        paddsw      xmm1,       xmm7
-        paddsw      xmm2,       xmm0
-
-        add         rsi,        rdx
-
-        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]       ; line 4
-        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2 + 16]
-        pmullw      xmm3,       [rax + 48]
-        pmullw      xmm4,       [rax + 48]
-
-        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]       ; line 6
-        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4 + 16]
-        pmullw      xmm5,       [rax + 80]
-        pmullw      xmm6,       [rax + 80]
-
-        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]
-        pxor        xmm0,       xmm0                        ; clear xmm0
-
-        paddsw      xmm1,       xmm3
-        paddsw      xmm2,       xmm4
-        paddsw      xmm1,       xmm5
-        paddsw      xmm2,       xmm6
-
-        paddsw      xmm1,       xmm7
-        paddsw      xmm2,       xmm7
-
-        psraw       xmm1,       7
-        psraw       xmm2,       7
-
-        packuswb    xmm1,       xmm2              ; pack and saturate
-        movdqa      XMMWORD PTR [rdi], xmm1       ; store the results in the destination
-%if ABI_IS_32BIT
-        add         rdi,        DWORD PTR arg(2) ;[dst_ptich]
-%else
-        add         rdi,        r8
-%endif
-        dec         rcx         ; decrement count
-        jnz         .vp9_filter_block1d16_v6_sse2_loop              ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_filter_block1d8_h6_only_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    int dst_ptich,
-;    unsigned int    output_height,
-;    const short    *vp9_filter
-;)
-; First-pass filter only when yoffset==0
-global sym(vp9_filter_block1d8_h6_only_sse2) PRIVATE
-sym(vp9_filter_block1d8_h6_only_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rdx,        arg(5) ;vp9_filter
-        mov         rsi,        arg(0) ;src_ptr
-
-        mov         rdi,        arg(2) ;output_ptr
-
-        movsxd      rcx,        dword ptr arg(4) ;output_height
-        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(3) ;dst_ptich
-%endif
-        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
-
-.filter_block1d8_h6_only_rowloop:
-        movq        xmm3,       MMWORD PTR [rsi - 2]
-        movq        xmm1,       MMWORD PTR [rsi + 6]
-
-        prefetcht2  [rsi+rax-2]
-
-        pslldq      xmm1,       8
-        por         xmm1,       xmm3
-
-        movdqa      xmm4,       xmm1
-        movdqa      xmm5,       xmm1
-
-        movdqa      xmm6,       xmm1
-        movdqa      xmm7,       xmm1
-
-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
-
-
-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
-
-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
-
-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
-        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-
-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
-
-        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
-        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
-
-
-        paddsw      xmm4,       xmm7
-        paddsw      xmm4,       xmm5
-
-        paddsw      xmm4,       xmm3
-        paddsw      xmm4,       xmm6
-
-        paddsw      xmm4,       xmm1
-        paddsw      xmm4,       [GLOBAL(rd)]
-
-        psraw       xmm4,       7
-
-        packuswb    xmm4,       xmm0
-
-        movq        QWORD PTR [rdi],   xmm4       ; store the results in the destination
-        lea         rsi,        [rsi + rax]
-
-%if ABI_IS_32BIT
-        add         rdi,        DWORD Ptr arg(3) ;dst_ptich
-%else
-        add         rdi,        r8
-%endif
-        dec         rcx
-
-        jnz         .filter_block1d8_h6_only_rowloop               ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_filter_block1d16_h6_only_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    int dst_ptich,
-;    unsigned int    output_height,
-;    const short    *vp9_filter
-;)
-; First-pass filter only when yoffset==0
-global sym(vp9_filter_block1d16_h6_only_sse2) PRIVATE
-sym(vp9_filter_block1d16_h6_only_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rdx,        arg(5) ;vp9_filter
-        mov         rsi,        arg(0) ;src_ptr
-
-        mov         rdi,        arg(2) ;output_ptr
-
-        movsxd      rcx,        dword ptr arg(4) ;output_height
-        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(3) ;dst_ptich
-%endif
-
-        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
-
-.filter_block1d16_h6_only_sse2_rowloop:
-        movq        xmm3,       MMWORD PTR [rsi - 2]
-        movq        xmm1,       MMWORD PTR [rsi + 6]
-
-        movq        xmm2,       MMWORD PTR [rsi +14]
-        pslldq      xmm2,       8
-
-        por         xmm2,       xmm1
-        prefetcht2  [rsi+rax-2]
-
-        pslldq      xmm1,       8
-        por         xmm1,       xmm3
-
-        movdqa      xmm4,       xmm1
-        movdqa      xmm5,       xmm1
-
-        movdqa      xmm6,       xmm1
-        movdqa      xmm7,       xmm1
-
-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
-
-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
-
-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
-
-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
-        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
-
-        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
-        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
-
-        paddsw      xmm4,       xmm7
-        paddsw      xmm4,       xmm5
-
-        paddsw      xmm4,       xmm3
-        paddsw      xmm4,       xmm6
-
-        paddsw      xmm4,       xmm1
-        paddsw      xmm4,       [GLOBAL(rd)]
-
-        psraw       xmm4,       7
-
-        packuswb    xmm4,       xmm0                        ; lower 8 bytes
-
-        movq        QWORD Ptr [rdi],         xmm4           ; store the results in the destination
-
-        movdqa      xmm3,       xmm2
-        movdqa      xmm4,       xmm2
-
-        movdqa      xmm5,       xmm2
-        movdqa      xmm6,       xmm2
-
-        movdqa      xmm7,       xmm2
-
-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
-
-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
-
-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
-
-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
-        psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
-
-        punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
-        pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
-
-        paddsw      xmm4,       xmm7
-        paddsw      xmm4,       xmm5
-
-        paddsw      xmm4,       xmm3
-        paddsw      xmm4,       xmm6
-
-        paddsw      xmm4,       xmm2
-        paddsw      xmm4,       [GLOBAL(rd)]
-
-        psraw       xmm4,       7
-
-        packuswb    xmm4,       xmm0                        ; higher 8 bytes
-
-        movq        QWORD Ptr [rdi+8],      xmm4            ; store the results in the destination
-
-        lea         rsi,        [rsi + rax]
-%if ABI_IS_32BIT
-        add         rdi,        DWORD Ptr arg(3) ;dst_ptich
-%else
-        add         rdi,        r8
-%endif
-
-        dec         rcx
-        jnz         .filter_block1d16_h6_only_sse2_rowloop               ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_filter_block1d8_v6_only_sse2
-;(
-;    unsigned char *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char *output_ptr,
-;    int dst_ptich,
-;    unsigned int output_height,
-;    const short    *vp9_filter
-;)
-; Second-pass filter only when xoffset==0
-global sym(vp9_filter_block1d8_v6_only_sse2) PRIVATE
-sym(vp9_filter_block1d8_v6_only_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0) ;src_ptr
-        mov         rdi,        arg(2) ;output_ptr
-
-        movsxd      rcx,        dword ptr arg(4) ;output_height
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
-
-        mov         rax,        arg(5) ;vp9_filter
-
-        pxor        xmm0,       xmm0                        ; clear xmm0
-
-        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(3) ; dst_ptich
-%endif
-
-.vp9_filter_block1d8_v6_only_sse2_loop:
-        movq        xmm1,       MMWORD PTR [rsi]
-        movq        xmm2,       MMWORD PTR [rsi + rdx]
-        movq        xmm3,       MMWORD PTR [rsi + rdx * 2]
-        movq        xmm5,       MMWORD PTR [rsi + rdx * 4]
-        add         rsi,        rdx
-        movq        xmm4,       MMWORD PTR [rsi + rdx * 2]
-        movq        xmm6,       MMWORD PTR [rsi + rdx * 4]
-
-        punpcklbw   xmm1,       xmm0
-        pmullw      xmm1,       [rax]
-
-        punpcklbw   xmm2,       xmm0
-        pmullw      xmm2,       [rax + 16]
-
-        punpcklbw   xmm3,       xmm0
-        pmullw      xmm3,       [rax + 32]
-
-        punpcklbw   xmm5,       xmm0
-        pmullw      xmm5,       [rax + 64]
-
-        punpcklbw   xmm4,       xmm0
-        pmullw      xmm4,       [rax + 48]
-
-        punpcklbw   xmm6,       xmm0
-        pmullw      xmm6,       [rax + 80]
-
-        paddsw      xmm2,       xmm5
-        paddsw      xmm2,       xmm3
-
-        paddsw      xmm2,       xmm1
-        paddsw      xmm2,       xmm4
-
-        paddsw      xmm2,       xmm6
-        paddsw      xmm2,       xmm7
-
-        psraw       xmm2,       7
-        packuswb    xmm2,       xmm0              ; pack and saturate
-
-        movq        QWORD PTR [rdi], xmm2         ; store the results in the destination
-%if ABI_IS_32BIT
-        add         rdi,        DWORD PTR arg(3) ;[dst_ptich]
-%else
-        add         rdi,        r8
-%endif
-        dec         rcx         ; decrement count
-        jnz         .vp9_filter_block1d8_v6_only_sse2_loop              ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_unpack_block1d16_h6_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned short *output_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned int    output_height,
-;    unsigned int    output_width
-;)
-global sym(vp9_unpack_block1d16_h6_sse2) PRIVATE
-sym(vp9_unpack_block1d16_h6_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0) ;src_ptr
-        mov         rdi,        arg(1) ;output_ptr
-
-        movsxd      rcx,        dword ptr arg(3) ;output_height
-        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
-
-        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(4) ;output_width            ; Pitch for Source
-%endif
-
-.unpack_block1d16_h6_sse2_rowloop:
-        movq        xmm1,       MMWORD PTR [rsi]            ; 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2
-        movq        xmm3,       MMWORD PTR [rsi+8]          ; make copy of xmm1
-
-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
-        punpcklbw   xmm1,       xmm0
-
-        movdqa      XMMWORD Ptr [rdi],         xmm1
-        movdqa      XMMWORD Ptr [rdi + 16],    xmm3
-
-        lea         rsi,        [rsi + rax]
-%if ABI_IS_32BIT
-        add         rdi,        DWORD Ptr arg(4) ;[output_width]
-%else
-        add         rdi,        r8
-%endif
-        dec         rcx
-        jnz         .unpack_block1d16_h6_sse2_rowloop               ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_bilinear_predict16x16_sse2
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;    unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-extern sym(vp9_bilinear_filters_mmx)
-global sym(vp9_bilinear_predict16x16_sse2) PRIVATE
-sym(vp9_bilinear_predict16x16_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ;const short *HFilter = bilinear_filters_mmx[xoffset]
-    ;const short *VFilter = bilinear_filters_mmx[yoffset]
-
-        lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_mmx))]
-        movsxd      rax,        dword ptr arg(2) ;xoffset
-
-        cmp         rax,        0      ;skip first_pass filter if xoffset=0
-        je          .b16x16_sp_only
-
-        shl         rax,        5
-        add         rax,        rcx    ;HFilter
-
-        mov         rdi,        arg(4) ;dst_ptr
-        mov         rsi,        arg(0) ;src_ptr
-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
-
-        movdqa      xmm1,       [rax]
-        movdqa      xmm2,       [rax+16]
-
-        movsxd      rax,        dword ptr arg(3) ;yoffset
-
-        cmp         rax,        0      ;skip second_pass filter if yoffset=0
-        je          .b16x16_fp_only
-
-        shl         rax,        5
-        add         rax,        rcx    ;VFilter
-
-        lea         rcx,        [rdi+rdx*8]
-        lea         rcx,        [rcx+rdx*8]
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
-
-        pxor        xmm0,       xmm0
-
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(5) ;dst_pitch
-%endif
-        ; get the first horizontal line done
-        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-
-        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
-        punpckhbw   xmm4,       xmm0
-
-        pmullw      xmm3,       xmm1
-        pmullw      xmm4,       xmm1
-
-        movdqu      xmm5,       [rsi+1]
-        movdqa      xmm6,       xmm5
-
-        punpcklbw   xmm5,       xmm0
-        punpckhbw   xmm6,       xmm0
-
-        pmullw      xmm5,       xmm2
-        pmullw      xmm6,       xmm2
-
-        paddw       xmm3,       xmm5
-        paddw       xmm4,       xmm6
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm4,       VP9_FILTER_SHIFT
-
-        movdqa      xmm7,       xmm3
-        packuswb    xmm7,       xmm4
-
-        add         rsi,        rdx                 ; next line
-.next_row:
-        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-
-        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
-        punpckhbw   xmm4,       xmm0
-
-        pmullw      xmm3,       xmm1
-        pmullw      xmm4,       xmm1
-
-        movdqu      xmm5,       [rsi+1]
-        movdqa      xmm6,       xmm5
-
-        punpcklbw   xmm5,       xmm0
-        punpckhbw   xmm6,       xmm0
-
-        pmullw      xmm5,       xmm2
-        pmullw      xmm6,       xmm2
-
-        paddw       xmm3,       xmm5
-        paddw       xmm4,       xmm6
-
-        movdqa      xmm5,       xmm7
-        movdqa      xmm6,       xmm7
-
-        punpcklbw   xmm5,       xmm0
-        punpckhbw   xmm6,       xmm0
-
-        pmullw      xmm5,       [rax]
-        pmullw      xmm6,       [rax]
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm4,       VP9_FILTER_SHIFT
-
-        movdqa      xmm7,       xmm3
-        packuswb    xmm7,       xmm4
-
-        pmullw      xmm3,       [rax+16]
-        pmullw      xmm4,       [rax+16]
-
-        paddw       xmm3,       xmm5
-        paddw       xmm4,       xmm6
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm4,       VP9_FILTER_SHIFT
-
-        packuswb    xmm3,       xmm4
-        movdqa      [rdi],      xmm3                 ; store the results in the destination
-
-        add         rsi,        rdx                 ; next line
-%if ABI_IS_32BIT
-        add         rdi,        DWORD PTR arg(5) ;dst_pitch
-%else
-        add         rdi,        r8
-%endif
-
-        cmp         rdi,        rcx
-        jne         .next_row
-
-        jmp         .done
-
-.b16x16_sp_only:
-        movsxd      rax,        dword ptr arg(3) ;yoffset
-        shl         rax,        5
-        add         rax,        rcx    ;VFilter
-
-        mov         rdi,        arg(4) ;dst_ptr
-        mov         rsi,        arg(0) ;src_ptr
-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
-
-        movdqa      xmm1,       [rax]
-        movdqa      xmm2,       [rax+16]
-
-        lea         rcx,        [rdi+rdx*8]
-        lea         rcx,        [rcx+rdx*8]
-        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
-
-        pxor        xmm0,       xmm0
-
-        ; get the first horizontal line done
-        movdqu      xmm7,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-
-        add         rsi,        rax                 ; next line
-.next_row_spo:
-        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-
-        movdqa      xmm5,       xmm7
-        movdqa      xmm6,       xmm7
-
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-        movdqa      xmm7,       xmm3
-
-        punpcklbw   xmm5,       xmm0
-        punpckhbw   xmm6,       xmm0
-        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
-        punpckhbw   xmm4,       xmm0
-
-        pmullw      xmm5,       xmm1
-        pmullw      xmm6,       xmm1
-        pmullw      xmm3,       xmm2
-        pmullw      xmm4,       xmm2
-
-        paddw       xmm3,       xmm5
-        paddw       xmm4,       xmm6
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm4,       VP9_FILTER_SHIFT
-
-        packuswb    xmm3,       xmm4
-        movdqa      [rdi],      xmm3                 ; store the results in the destination
-
-        add         rsi,        rax                 ; next line
-        add         rdi,        rdx                 ;dst_pitch
-        cmp         rdi,        rcx
-        jne         .next_row_spo
-
-        jmp         .done
-
-.b16x16_fp_only:
-        lea         rcx,        [rdi+rdx*8]
-        lea         rcx,        [rcx+rdx*8]
-        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
-        pxor        xmm0,       xmm0
-
-.next_row_fpo:
-        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-
-        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
-        punpckhbw   xmm4,       xmm0
-
-        pmullw      xmm3,       xmm1
-        pmullw      xmm4,       xmm1
-
-        movdqu      xmm5,       [rsi+1]
-        movdqa      xmm6,       xmm5
-
-        punpcklbw   xmm5,       xmm0
-        punpckhbw   xmm6,       xmm0
-
-        pmullw      xmm5,       xmm2
-        pmullw      xmm6,       xmm2
-
-        paddw       xmm3,       xmm5
-        paddw       xmm4,       xmm6
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm4,       VP9_FILTER_SHIFT
-
-        packuswb    xmm3,       xmm4
-        movdqa      [rdi],      xmm3                 ; store the results in the destination
-
-        add         rsi,        rax                 ; next line
-        add         rdi,        rdx                 ; dst_pitch
-        cmp         rdi,        rcx
-        jne         .next_row_fpo
-
-.done:
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_bilinear_predict8x8_sse2
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;    unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-extern sym(vp9_bilinear_filters_mmx)
-global sym(vp9_bilinear_predict8x8_sse2) PRIVATE
-sym(vp9_bilinear_predict8x8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 144                         ; reserve 144 bytes
-
-    ;const short *HFilter = bilinear_filters_mmx[xoffset]
-    ;const short *VFilter = bilinear_filters_mmx[yoffset]
-        lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_mmx))]
-
-        mov         rsi,        arg(0) ;src_ptr
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
-
-    ;Read 9-line unaligned data in and put them on stack. This gives a big
-    ;performance boost.
-        movdqu      xmm0,       [rsi]
-        lea         rax,        [rdx + rdx*2]
-        movdqu      xmm1,       [rsi+rdx]
-        movdqu      xmm2,       [rsi+rdx*2]
-        add         rsi,        rax
-        movdqu      xmm3,       [rsi]
-        movdqu      xmm4,       [rsi+rdx]
-        movdqu      xmm5,       [rsi+rdx*2]
-        add         rsi,        rax
-        movdqu      xmm6,       [rsi]
-        movdqu      xmm7,       [rsi+rdx]
-
-        movdqa      XMMWORD PTR [rsp],            xmm0
-
-        movdqu      xmm0,       [rsi+rdx*2]
-
-        movdqa      XMMWORD PTR [rsp+16],         xmm1
-        movdqa      XMMWORD PTR [rsp+32],         xmm2
-        movdqa      XMMWORD PTR [rsp+48],         xmm3
-        movdqa      XMMWORD PTR [rsp+64],         xmm4
-        movdqa      XMMWORD PTR [rsp+80],         xmm5
-        movdqa      XMMWORD PTR [rsp+96],         xmm6
-        movdqa      XMMWORD PTR [rsp+112],        xmm7
-        movdqa      XMMWORD PTR [rsp+128],        xmm0
-
-        movsxd      rax,        dword ptr arg(2) ;xoffset
-        shl         rax,        5
-        add         rax,        rcx    ;HFilter
-
-        mov         rdi,        arg(4) ;dst_ptr
-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
-
-        movdqa      xmm1,       [rax]
-        movdqa      xmm2,       [rax+16]
-
-        movsxd      rax,        dword ptr arg(3) ;yoffset
-        shl         rax,        5
-        add         rax,        rcx    ;VFilter
-
-        lea         rcx,        [rdi+rdx*8]
-
-        movdqa      xmm5,       [rax]
-        movdqa      xmm6,       [rax+16]
-
-        pxor        xmm0,       xmm0
-
-        ; get the first horizontal line done
-        movdqa      xmm3,       XMMWORD PTR [rsp]
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-        psrldq      xmm4,       1
-
-        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
-        punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08
-
-        pmullw      xmm3,       xmm1
-        pmullw      xmm4,       xmm2
-
-        paddw       xmm3,       xmm4
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        movdqa      xmm7,       xmm3
-        add         rsp,        16                 ; next line
-.next_row8x8:
-        movdqa      xmm3,       XMMWORD PTR [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-        psrldq      xmm4,       1
-
-        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
-        punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08
-
-        pmullw      xmm3,       xmm1
-        pmullw      xmm4,       xmm2
-
-        paddw       xmm3,       xmm4
-        pmullw      xmm7,       xmm5
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        movdqa      xmm4,       xmm3
-
-        pmullw      xmm3,       xmm6
-        paddw       xmm3,       xmm7
-
-        movdqa      xmm7,       xmm4
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        packuswb    xmm3,       xmm0
-        movq        [rdi],      xmm3                 ; store the results in the destination
-
-        add         rsp,        16                 ; next line
-        add         rdi,        rdx
-
-        cmp         rdi,        rcx
-        jne         .next_row8x8
-
-    ;add rsp, 144
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-SECTION_RODATA
-align 16
-rd:
-    times 8 dw 0x40
diff --git a/vp9/common/x86/vp9_subpixel_ssse3.asm b/vp9/common/x86/vp9_subpixel_ssse3.asm
deleted file mode 100644
index b260480e0..000000000
--- a/vp9/common/x86/vp9_subpixel_ssse3.asm
+++ /dev/null
@@ -1,1515 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define BLOCK_HEIGHT_WIDTH 4
-%define VP9_FILTER_WEIGHT 128
-%define VP9_FILTER_SHIFT  7
-
-
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 8 pixels in horizontal direction, calculating ONE
-; rows each iteration to take advantage of the 128 bits operations.
-;
-; This is an implementation of some of the SSE optimizations first seen in ffvp8
-;
-;*************************************************************************************/
-;void vp9_filter_block1d8_h6_ssse3
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    unsigned int    vp9_filter_index
-;)
-global sym(vp9_filter_block1d8_h6_ssse3) PRIVATE
-sym(vp9_filter_block1d8_h6_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    movsxd      rdx, DWORD PTR arg(5)   ;table index
-    xor         rsi, rsi
-    shl         rdx, 4
-
-    movdqa      xmm7, [GLOBAL(rd)]
-
-    lea         rax, [GLOBAL(k0_k5)]
-    add         rax, rdx
-    mov         rdi, arg(2)             ;output_ptr
-
-    cmp         esi, DWORD PTR [rax]
-    je          vp9_filter_block1d8_h4_ssse3
-
-    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
-
-    mov         rsi, arg(0)             ;src_ptr
-    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
-    movsxd      rcx, dword ptr arg(4)   ;output_height
-
-    movsxd      rdx, dword ptr arg(3)   ;output_pitch
-
-    sub         rdi, rdx
-;xmm3 free
-.filter_block1d8_h6_rowloop_ssse3:
-    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
-
-    movq        xmm2,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
-
-    punpcklbw   xmm0,   xmm2                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
-
-    movdqa      xmm1,   xmm0
-    pmaddubsw   xmm0,   xmm4
-
-    movdqa      xmm2,   xmm1
-    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
-
-    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
-    pmaddubsw   xmm1,   xmm5
-
-    lea         rdi,    [rdi + rdx]
-    pmaddubsw   xmm2,   xmm6
-
-    lea         rsi,    [rsi + rax]
-    dec         rcx
-
-    paddsw      xmm0,   xmm1
-    paddsw      xmm2,   xmm7
-
-    paddsw      xmm0,   xmm2
-
-    psraw       xmm0,   7
-
-    packuswb    xmm0,   xmm0
-
-    movq        MMWORD Ptr [rdi], xmm0
-    jnz         .filter_block1d8_h6_rowloop_ssse3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-vp9_filter_block1d8_h4_ssse3:
-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
-
-    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
-    movdqa      xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]
-
-    mov         rsi, arg(0)             ;src_ptr
-
-    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
-    movsxd      rcx, dword ptr arg(4)   ;output_height
-
-    movsxd      rdx, dword ptr arg(3)   ;output_pitch
-
-    sub         rdi, rdx
-
-.filter_block1d8_h4_rowloop_ssse3:
-    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
-
-    movq        xmm1,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
-
-    punpcklbw   xmm0,   xmm1                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
-
-    movdqa      xmm2,   xmm0
-    pshufb      xmm0,   xmm3
-
-    pshufb      xmm2,   xmm4
-    pmaddubsw   xmm0,   xmm5
-
-    lea         rdi,    [rdi + rdx]
-    pmaddubsw   xmm2,   xmm6
-
-    lea         rsi,    [rsi + rax]
-    dec         rcx
-
-    paddsw      xmm0,   xmm7
-
-    paddsw      xmm0,   xmm2
-
-    psraw       xmm0,   7
-
-    packuswb    xmm0,   xmm0
-
-    movq        MMWORD Ptr [rdi], xmm0
-
-    jnz         .filter_block1d8_h4_rowloop_ssse3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-;void vp9_filter_block1d16_h6_ssse3
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    unsigned int    vp9_filter_index
-;)
-global sym(vp9_filter_block1d16_h6_ssse3) PRIVATE
-sym(vp9_filter_block1d16_h6_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    movsxd      rdx, DWORD PTR arg(5)           ;table index
-    xor         rsi, rsi
-    shl         rdx, 4      ;
-
-    lea         rax, [GLOBAL(k0_k5)]
-    add         rax, rdx
-
-    mov         rdi, arg(2)                     ;output_ptr
-
-    mov         rsi, arg(0)                     ;src_ptr
-
-    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
-
-    movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
-    movsxd      rcx, dword ptr arg(4)           ;output_height
-    movsxd      rdx, dword ptr arg(3)           ;output_pitch
-
-.filter_block1d16_h6_rowloop_ssse3:
-    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
-
-    movq        xmm3,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
-
-    punpcklbw   xmm0,   xmm3                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
-
-    movdqa      xmm1,   xmm0
-    pmaddubsw   xmm0,   xmm4
-
-    movdqa      xmm2,   xmm1
-    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
-
-    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
-    movq        xmm3,   MMWORD PTR [rsi +  6]
-
-    pmaddubsw   xmm1,   xmm5
-    movq        xmm7,   MMWORD PTR [rsi + 11]
-
-    pmaddubsw   xmm2,   xmm6
-    punpcklbw   xmm3,   xmm7
-
-    paddsw      xmm0,   xmm1
-    movdqa      xmm1,   xmm3
-
-    pmaddubsw   xmm3,   xmm4
-    paddsw      xmm0,   xmm2
-
-    movdqa      xmm2,   xmm1
-    paddsw      xmm0,   [GLOBAL(rd)]
-
-    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
-    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
-
-    psraw       xmm0,   7
-    pmaddubsw   xmm1,   xmm5
-
-    pmaddubsw   xmm2,   xmm6
-    packuswb    xmm0,   xmm0
-
-    lea         rsi,    [rsi + rax]
-    paddsw      xmm3,   xmm1
-
-    paddsw      xmm3,   xmm2
-
-    paddsw      xmm3,   [GLOBAL(rd)]
-
-    psraw       xmm3,   7
-
-    packuswb    xmm3,   xmm3
-
-    punpcklqdq  xmm0,   xmm3
-
-    movdqa      XMMWORD Ptr [rdi], xmm0
-
-    lea         rdi,    [rdi + rdx]
-    dec         rcx
-    jnz         .filter_block1d16_h6_rowloop_ssse3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_filter_block1d4_h6_ssse3
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    unsigned int    vp9_filter_index
-;)
-global sym(vp9_filter_block1d4_h6_ssse3) PRIVATE
-sym(vp9_filter_block1d4_h6_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    movsxd      rdx, DWORD PTR arg(5)   ;table index
-    xor         rsi, rsi
-    shl         rdx, 4      ;
-
-    lea         rax, [GLOBAL(k0_k5)]
-    add         rax, rdx
-    movdqa      xmm7, [GLOBAL(rd)]
-
-    cmp         esi, DWORD PTR [rax]
-    je          .vp9_filter_block1d4_h4_ssse3
-
-    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
-
-    mov         rsi, arg(0)             ;src_ptr
-    mov         rdi, arg(2)             ;output_ptr
-    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
-    movsxd      rcx, dword ptr arg(4)   ;output_height
-
-    movsxd      rdx, dword ptr arg(3)   ;output_pitch
-
-;xmm3 free
-.filter_block1d4_h6_rowloop_ssse3:
-    movdqu      xmm0,   XMMWORD PTR [rsi - 2]
-
-    movdqa      xmm1, xmm0
-    pshufb      xmm0, [GLOBAL(shuf1b)]
-
-    movdqa      xmm2, xmm1
-    pshufb      xmm1, [GLOBAL(shuf2b)]
-    pmaddubsw   xmm0, xmm4
-    pshufb      xmm2, [GLOBAL(shuf3b)]
-    pmaddubsw   xmm1, xmm5
-
-;--
-    pmaddubsw   xmm2, xmm6
-
-    lea         rsi,    [rsi + rax]
-;--
-    paddsw      xmm0, xmm1
-    paddsw      xmm0, xmm7
-    pxor        xmm1, xmm1
-    paddsw      xmm0, xmm2
-    psraw       xmm0, 7
-    packuswb    xmm0, xmm0
-
-    movd        DWORD PTR [rdi], xmm0
-
-    add         rdi, rdx
-    dec         rcx
-    jnz         .filter_block1d4_h6_rowloop_ssse3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-.vp9_filter_block1d4_h4_ssse3:
-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
-    movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
-    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf3b)]
-
-    mov         rsi, arg(0)             ;src_ptr
-    mov         rdi, arg(2)             ;output_ptr
-    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
-    movsxd      rcx, dword ptr arg(4)   ;output_height
-
-    movsxd      rdx, dword ptr arg(3)   ;output_pitch
-
-.filter_block1d4_h4_rowloop_ssse3:
-    movdqu      xmm1,   XMMWORD PTR [rsi - 2]
-
-    movdqa      xmm2, xmm1
-    pshufb      xmm1, xmm0 ;;[GLOBAL(shuf2b)]
-    pshufb      xmm2, xmm3 ;;[GLOBAL(shuf3b)]
-    pmaddubsw   xmm1, xmm5
-
-;--
-    pmaddubsw   xmm2, xmm6
-
-    lea         rsi,    [rsi + rax]
-;--
-    paddsw      xmm1, xmm7
-    paddsw      xmm1, xmm2
-    psraw       xmm1, 7
-    packuswb    xmm1, xmm1
-
-    movd        DWORD PTR [rdi], xmm1
-
-    add         rdi, rdx
-    dec         rcx
-    jnz         .filter_block1d4_h4_rowloop_ssse3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-;void vp9_filter_block1d16_v6_ssse3
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    unsigned int   vp9_filter_index
-;)
-global sym(vp9_filter_block1d16_v6_ssse3) PRIVATE
-sym(vp9_filter_block1d16_v6_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    movsxd      rdx, DWORD PTR arg(5)   ;table index
-    xor         rsi, rsi
-    shl         rdx, 4      ;
-
-    lea         rax, [GLOBAL(k0_k5)]
-    add         rax, rdx
-
-    cmp         esi, DWORD PTR [rax]
-    je          .vp9_filter_block1d16_v4_ssse3
-
-    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
-    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
-
-    mov         rsi, arg(0)             ;src_ptr
-    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
-    mov         rdi, arg(2)             ;output_ptr
-
-%if ABI_IS_32BIT=0
-    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
-%endif
-    mov         rax, rsi
-    movsxd      rcx, DWORD PTR arg(4)   ;output_height
-    add         rax, rdx
-
-
-.vp9_filter_block1d16_v6_ssse3_loop:
-    movq        xmm1, MMWORD PTR [rsi]                  ;A
-    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
-    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
-    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
-    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
-
-    punpcklbw   xmm2, xmm4                  ;B D
-    punpcklbw   xmm3, xmm0                  ;C E
-
-    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
-
-    pmaddubsw   xmm3, xmm6
-    punpcklbw   xmm1, xmm0                  ;A F
-    pmaddubsw   xmm2, xmm7
-    pmaddubsw   xmm1, xmm5
-
-    paddsw      xmm2, xmm3
-    paddsw      xmm2, xmm1
-    paddsw      xmm2, [GLOBAL(rd)]
-    psraw       xmm2, 7
-    packuswb    xmm2, xmm2
-
-    movq        MMWORD PTR [rdi], xmm2          ;store the results
-
-    movq        xmm1, MMWORD PTR [rsi + 8]                  ;A
-    movq        xmm2, MMWORD PTR [rsi + rdx + 8]            ;B
-    movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
-    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
-    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E
-
-    punpcklbw   xmm2, xmm4                  ;B D
-    punpcklbw   xmm3, xmm0                  ;C E
-
-    movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8]        ;F
-    pmaddubsw   xmm3, xmm6
-    punpcklbw   xmm1, xmm0                  ;A F
-    pmaddubsw   xmm2, xmm7
-    pmaddubsw   xmm1, xmm5
-
-    add         rsi,  rdx
-    add         rax,  rdx
-;--
-;--
-    paddsw      xmm2, xmm3
-    paddsw      xmm2, xmm1
-    paddsw      xmm2, [GLOBAL(rd)]
-    psraw       xmm2, 7
-    packuswb    xmm2, xmm2
-
-    movq        MMWORD PTR [rdi+8], xmm2
-
-%if ABI_IS_32BIT
-    add         rdi,        DWORD PTR arg(3) ;out_pitch
-%else
-    add         rdi,        r8
-%endif
-    dec         rcx
-    jnz         .vp9_filter_block1d16_v6_ssse3_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-.vp9_filter_block1d16_v4_ssse3:
-    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
-
-    mov         rsi, arg(0)             ;src_ptr
-    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
-    mov         rdi, arg(2)             ;output_ptr
-
-%if ABI_IS_32BIT=0
-    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
-%endif
-    mov         rax, rsi
-    movsxd      rcx, DWORD PTR arg(4)   ;output_height
-    add         rax, rdx
-
-.vp9_filter_block1d16_v4_ssse3_loop:
-    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
-    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
-    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
-    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
-
-    punpcklbw   xmm2, xmm4                  ;B D
-    punpcklbw   xmm3, xmm0                  ;C E
-
-    pmaddubsw   xmm3, xmm6
-    pmaddubsw   xmm2, xmm7
-    movq        xmm5, MMWORD PTR [rsi + rdx + 8]            ;B
-    movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
-    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
-    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E
-
-    paddsw      xmm2, [GLOBAL(rd)]
-    paddsw      xmm2, xmm3
-    psraw       xmm2, 7
-    packuswb    xmm2, xmm2
-
-    punpcklbw   xmm5, xmm4                  ;B D
-    punpcklbw   xmm1, xmm0                  ;C E
-
-    pmaddubsw   xmm1, xmm6
-    pmaddubsw   xmm5, xmm7
-
-    movdqa      xmm4, [GLOBAL(rd)]
-    add         rsi,  rdx
-    add         rax,  rdx
-;--
-;--
-    paddsw      xmm5, xmm1
-    paddsw      xmm5, xmm4
-    psraw       xmm5, 7
-    packuswb    xmm5, xmm5
-
-    punpcklqdq  xmm2, xmm5
-
-    movdqa       XMMWORD PTR [rdi], xmm2
-
-%if ABI_IS_32BIT
-    add         rdi,        DWORD PTR arg(3) ;out_pitch
-%else
-    add         rdi,        r8
-%endif
-    dec         rcx
-    jnz         .vp9_filter_block1d16_v4_ssse3_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_filter_block1d8_v6_ssse3
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    unsigned int   vp9_filter_index
-;)
-global sym(vp9_filter_block1d8_v6_ssse3) PRIVATE
-sym(vp9_filter_block1d8_v6_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    movsxd      rdx, DWORD PTR arg(5)   ;table index
-    xor         rsi, rsi
-    shl         rdx, 4      ;
-
-    lea         rax, [GLOBAL(k0_k5)]
-    add         rax, rdx
-
-    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
-    mov         rdi, arg(2)             ;output_ptr
-%if ABI_IS_32BIT=0
-    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
-%endif
-    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
-
-    cmp         esi, DWORD PTR [rax]
-    je          .vp9_filter_block1d8_v4_ssse3
-
-    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
-    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
-
-    mov         rsi, arg(0)             ;src_ptr
-
-    mov         rax, rsi
-    add         rax, rdx
-
-.vp9_filter_block1d8_v6_ssse3_loop:
-    movq        xmm1, MMWORD PTR [rsi]                  ;A
-    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
-    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
-    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
-    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
-
-    punpcklbw   xmm2, xmm4                  ;B D
-    punpcklbw   xmm3, xmm0                  ;C E
-
-    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
-    movdqa      xmm4, [GLOBAL(rd)]
-
-    pmaddubsw   xmm3, xmm6
-    punpcklbw   xmm1, xmm0                  ;A F
-    pmaddubsw   xmm2, xmm7
-    pmaddubsw   xmm1, xmm5
-    add         rsi,  rdx
-    add         rax,  rdx
-;--
-;--
-    paddsw      xmm2, xmm3
-    paddsw      xmm2, xmm1
-    paddsw      xmm2, xmm4
-    psraw       xmm2, 7
-    packuswb    xmm2, xmm2
-
-    movq        MMWORD PTR [rdi], xmm2
-
-%if ABI_IS_32BIT
-    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
-%else
-    add         rdi,        r8
-%endif
-    dec         rcx
-    jnz         .vp9_filter_block1d8_v6_ssse3_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-.vp9_filter_block1d8_v4_ssse3:
-    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
-    movdqa      xmm5, [GLOBAL(rd)]
-
-    mov         rsi, arg(0)             ;src_ptr
-
-    mov         rax, rsi
-    add         rax, rdx
-
-.vp9_filter_block1d8_v4_ssse3_loop:
-    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
-    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
-    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
-    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
-
-    punpcklbw   xmm2, xmm4                  ;B D
-    punpcklbw   xmm3, xmm0                  ;C E
-
-    pmaddubsw   xmm3, xmm6
-    pmaddubsw   xmm2, xmm7
-    add         rsi,  rdx
-    add         rax,  rdx
-;--
-;--
-    paddsw      xmm2, xmm3
-    paddsw      xmm2, xmm5
-    psraw       xmm2, 7
-    packuswb    xmm2, xmm2
-
-    movq        MMWORD PTR [rdi], xmm2
-
-%if ABI_IS_32BIT
-    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
-%else
-    add         rdi,        r8
-%endif
-    dec         rcx
-    jnz         .vp9_filter_block1d8_v4_ssse3_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-;void vp9_filter_block1d4_v6_ssse3
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    unsigned int   vp9_filter_index
-;)
-global sym(vp9_filter_block1d4_v6_ssse3) PRIVATE
-sym(vp9_filter_block1d4_v6_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    movsxd      rdx, DWORD PTR arg(5)   ;table index
-    xor         rsi, rsi
-    shl         rdx, 4      ;
-
-    lea         rax, [GLOBAL(k0_k5)]
-    add         rax, rdx
-
-    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
-    mov         rdi, arg(2)             ;output_ptr
-%if ABI_IS_32BIT=0
-    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
-%endif
-    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
-
-    cmp         esi, DWORD PTR [rax]
-    je          .vp9_filter_block1d4_v4_ssse3
-
-    movq        mm5, MMWORD PTR [rax]         ;k0_k5
-    movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
-    movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
-
-    mov         rsi, arg(0)             ;src_ptr
-
-    mov         rax, rsi
-    add         rax, rdx
-
-.vp9_filter_block1d4_v6_ssse3_loop:
-    movd        mm1, DWORD PTR [rsi]                  ;A
-    movd        mm2, DWORD PTR [rsi + rdx]            ;B
-    movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
-    movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
-    movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E
-
-    punpcklbw   mm2, mm4                  ;B D
-    punpcklbw   mm3, mm0                  ;C E
-
-    movd        mm0, DWORD PTR [rax + rdx * 4]        ;F
-
-    movq        mm4, [GLOBAL(rd)]
-
-    pmaddubsw   mm3, mm6
-    punpcklbw   mm1, mm0                  ;A F
-    pmaddubsw   mm2, mm7
-    pmaddubsw   mm1, mm5
-    add         rsi,  rdx
-    add         rax,  rdx
-;--
-;--
-    paddsw      mm2, mm3
-    paddsw      mm2, mm1
-    paddsw      mm2, mm4
-    psraw       mm2, 7
-    packuswb    mm2, mm2
-
-    movd        DWORD PTR [rdi], mm2
-
-%if ABI_IS_32BIT
-    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
-%else
-    add         rdi,        r8
-%endif
-    dec         rcx
-    jnz         .vp9_filter_block1d4_v6_ssse3_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-.vp9_filter_block1d4_v4_ssse3:
-    movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
-    movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
-    movq        mm5, MMWORD PTR [GLOBAL(rd)]
-
-    mov         rsi, arg(0)             ;src_ptr
-
-    mov         rax, rsi
-    add         rax, rdx
-
-.vp9_filter_block1d4_v4_ssse3_loop:
-    movd        mm2, DWORD PTR [rsi + rdx]            ;B
-    movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
-    movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
-    movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E
-
-    punpcklbw   mm2, mm4                  ;B D
-    punpcklbw   mm3, mm0                  ;C E
-
-    pmaddubsw   mm3, mm6
-    pmaddubsw   mm2, mm7
-    add         rsi,  rdx
-    add         rax,  rdx
-;--
-;--
-    paddsw      mm2, mm3
-    paddsw      mm2, mm5
-    psraw       mm2, 7
-    packuswb    mm2, mm2
-
-    movd        DWORD PTR [rdi], mm2
-
-%if ABI_IS_32BIT
-    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
-%else
-    add         rdi,        r8
-%endif
-    dec         rcx
-    jnz         .vp9_filter_block1d4_v4_ssse3_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_bilinear_predict16x16_ssse3
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;    unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-global sym(vp9_bilinear_predict16x16_ssse3) PRIVATE
-sym(vp9_bilinear_predict16x16_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        lea         rcx,        [GLOBAL(bilinear_filters_ssse3)]
-        movsxd      rax,        dword ptr arg(2)    ; xoffset
-
-        cmp         rax,        0                   ; skip first_pass filter if xoffset=0
-        je          .b16x16_sp_only
-
-        shl         rax,        4
-        lea         rax,        [rax + rcx]         ; HFilter
-
-        mov         rdi,        arg(4)              ; dst_ptr
-        mov         rsi,        arg(0)              ; src_ptr
-        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
-
-        movdqa      xmm1,       [rax]
-
-        movsxd      rax,        dword ptr arg(3)    ; yoffset
-
-        cmp         rax,        0                   ; skip second_pass filter if yoffset=0
-        je          .b16x16_fp_only
-
-        shl         rax,        4
-        lea         rax,        [rax + rcx]         ; VFilter
-
-        lea         rcx,        [rdi+rdx*8]
-        lea         rcx,        [rcx+rdx*8]
-        movsxd      rdx,        dword ptr arg(1)    ; src_pixels_per_line
-
-        movdqa      xmm2,       [rax]
-
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(5)    ; dst_pitch
-%endif
-        movq        xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07
-        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
-
-        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
-        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
-
-        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
-
-        lea         rsi,        [rsi + rdx]         ; next line
-
-        pmaddubsw   xmm3,       xmm1                ; 00 02 04 06 08 10 12 14
-
-        punpcklbw   xmm4,       xmm5                ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
-        pmaddubsw   xmm4,       xmm1                ; 01 03 05 07 09 11 13 15
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT    ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
-        psraw       xmm4,       VP9_FILTER_SHIFT    ; xmm4 /= 128
-
-        movdqa      xmm7,       xmm3
-        packuswb    xmm7,       xmm4                ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-
-.next_row:
-        movq        xmm6,       [rsi]               ; 00 01 02 03 04 05 06 07
-        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
-
-        punpcklbw   xmm6,       xmm5
-        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
-
-        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
-        lea         rsi,        [rsi + rdx]         ; next line
-
-        pmaddubsw   xmm6,       xmm1
-
-        punpcklbw   xmm4,       xmm5
-        pmaddubsw   xmm4,       xmm1
-
-        paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
-        psraw       xmm6,       VP9_FILTER_SHIFT    ; xmm6 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
-        psraw       xmm4,       VP9_FILTER_SHIFT    ; xmm4 /= 128
-
-        packuswb    xmm6,       xmm4
-        movdqa      xmm5,       xmm7
-
-        punpcklbw   xmm5,       xmm6
-        pmaddubsw   xmm5,       xmm2
-
-        punpckhbw   xmm7,       xmm6
-        pmaddubsw   xmm7,       xmm2
-
-        paddw       xmm5,       [GLOBAL(rd)]        ; xmm5 += round value
-        psraw       xmm5,       VP9_FILTER_SHIFT    ; xmm5 /= 128
-
-        paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
-        psraw       xmm7,       VP9_FILTER_SHIFT    ; xmm7 /= 128
-
-        packuswb    xmm5,       xmm7
-        movdqa      xmm7,       xmm6
-
-        movdqa      [rdi],      xmm5                ; store the results in the destination
-%if ABI_IS_32BIT
-        add         rdi,        DWORD PTR arg(5)    ; dst_pitch
-%else
-        add         rdi,        r8
-%endif
-
-        cmp         rdi,        rcx
-        jne         .next_row
-
-        jmp         .done
-
-.b16x16_sp_only:
-        movsxd      rax,        dword ptr arg(3)    ; yoffset
-        shl         rax,        4
-        lea         rax,        [rax + rcx]         ; VFilter
-
-        mov         rdi,        arg(4)              ; dst_ptr
-        mov         rsi,        arg(0)              ; src_ptr
-        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
-
-        movdqa      xmm1,       [rax]               ; VFilter
-
-        lea         rcx,        [rdi+rdx*8]
-        lea         rcx,        [rcx+rdx*8]
-        movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
-
-        ; get the first horizontal line done
-        movq        xmm4,       [rsi]               ; load row 0
-        movq        xmm2,       [rsi + 8]           ; load row 0
-
-        lea         rsi,        [rsi + rax]         ; next line
-.next_row_sp:
-        movq        xmm3,       [rsi]               ; load row + 1
-        movq        xmm5,       [rsi + 8]           ; load row + 1
-
-        punpcklbw   xmm4,       xmm3
-        punpcklbw   xmm2,       xmm5
-
-        pmaddubsw   xmm4,       xmm1
-        movq        xmm7,       [rsi + rax]         ; load row + 2
-
-        pmaddubsw   xmm2,       xmm1
-        movq        xmm6,       [rsi + rax + 8]     ; load row + 2
-
-        punpcklbw   xmm3,       xmm7
-        punpcklbw   xmm5,       xmm6
-
-        pmaddubsw   xmm3,       xmm1
-        paddw       xmm4,       [GLOBAL(rd)]
-
-        pmaddubsw   xmm5,       xmm1
-        paddw       xmm2,       [GLOBAL(rd)]
-
-        psraw       xmm4,       VP9_FILTER_SHIFT
-        psraw       xmm2,       VP9_FILTER_SHIFT
-
-        packuswb    xmm4,       xmm2
-        paddw       xmm3,       [GLOBAL(rd)]
-
-        movdqa      [rdi],      xmm4                ; store row 0
-        paddw       xmm5,       [GLOBAL(rd)]
-
-        psraw       xmm3,       VP9_FILTER_SHIFT
-        psraw       xmm5,       VP9_FILTER_SHIFT
-
-        packuswb    xmm3,       xmm5
-        movdqa      xmm4,       xmm7
-
-        movdqa      [rdi + rdx],xmm3                ; store row 1
-        lea         rsi,        [rsi + 2*rax]
-
-        movdqa      xmm2,       xmm6
-        lea         rdi,        [rdi + 2*rdx]
-
-        cmp         rdi,        rcx
-        jne         .next_row_sp
-
-        jmp         .done
-
-.b16x16_fp_only:
-        lea         rcx,        [rdi+rdx*8]
-        lea         rcx,        [rcx+rdx*8]
-        movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
-
-.next_row_fp:
-        movq        xmm2,       [rsi]               ; 00 01 02 03 04 05 06 07
-        movq        xmm4,       [rsi+1]             ; 01 02 03 04 05 06 07 08
-
-        punpcklbw   xmm2,       xmm4
-        movq        xmm3,       [rsi+8]             ; 08 09 10 11 12 13 14 15
-
-        pmaddubsw   xmm2,       xmm1
-        movq        xmm4,       [rsi+9]             ; 09 10 11 12 13 14 15 16
-
-        lea         rsi,        [rsi + rax]         ; next line
-        punpcklbw   xmm3,       xmm4
-
-        pmaddubsw   xmm3,       xmm1
-        movq        xmm5,       [rsi]
-
-        paddw       xmm2,       [GLOBAL(rd)]
-        movq        xmm7,       [rsi+1]
-
-        movq        xmm6,       [rsi+8]
-        psraw       xmm2,       VP9_FILTER_SHIFT
-
-        punpcklbw   xmm5,       xmm7
-        movq        xmm7,       [rsi+9]
-
-        paddw       xmm3,       [GLOBAL(rd)]
-        pmaddubsw   xmm5,       xmm1
-
-        psraw       xmm3,       VP9_FILTER_SHIFT
-        punpcklbw   xmm6,       xmm7
-
-        packuswb    xmm2,       xmm3
-        pmaddubsw   xmm6,       xmm1
-
-        movdqa      [rdi],      xmm2                ; store the results in the destination
-        paddw       xmm5,       [GLOBAL(rd)]
-
-        lea         rdi,        [rdi + rdx]         ; dst_pitch
-        psraw       xmm5,       VP9_FILTER_SHIFT
-
-        paddw       xmm6,       [GLOBAL(rd)]
-        psraw       xmm6,       VP9_FILTER_SHIFT
-
-        packuswb    xmm5,       xmm6
-        lea         rsi,        [rsi + rax]         ; next line
-
-        movdqa      [rdi],      xmm5                ; store the results in the destination
-        lea         rdi,        [rdi + rdx]         ; dst_pitch
-
-        cmp         rdi,        rcx
-
-        jne         .next_row_fp
-
-.done:
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_bilinear_predict8x8_ssse3
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;    unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-global sym(vp9_bilinear_predict8x8_ssse3) PRIVATE
-sym(vp9_bilinear_predict8x8_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 144                         ; reserve 144 bytes
-
-        lea         rcx,        [GLOBAL(bilinear_filters_ssse3)]
-
-        mov         rsi,        arg(0) ;src_ptr
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
-
-    ;Read 9-line unaligned data in and put them on stack. This gives a big
-    ;performance boost.
-        movdqu      xmm0,       [rsi]
-        lea         rax,        [rdx + rdx*2]
-        movdqu      xmm1,       [rsi+rdx]
-        movdqu      xmm2,       [rsi+rdx*2]
-        add         rsi,        rax
-        movdqu      xmm3,       [rsi]
-        movdqu      xmm4,       [rsi+rdx]
-        movdqu      xmm5,       [rsi+rdx*2]
-        add         rsi,        rax
-        movdqu      xmm6,       [rsi]
-        movdqu      xmm7,       [rsi+rdx]
-
-        movdqa      XMMWORD PTR [rsp],            xmm0
-
-        movdqu      xmm0,       [rsi+rdx*2]
-
-        movdqa      XMMWORD PTR [rsp+16],         xmm1
-        movdqa      XMMWORD PTR [rsp+32],         xmm2
-        movdqa      XMMWORD PTR [rsp+48],         xmm3
-        movdqa      XMMWORD PTR [rsp+64],         xmm4
-        movdqa      XMMWORD PTR [rsp+80],         xmm5
-        movdqa      XMMWORD PTR [rsp+96],         xmm6
-        movdqa      XMMWORD PTR [rsp+112],        xmm7
-        movdqa      XMMWORD PTR [rsp+128],        xmm0
-
-        movsxd      rax,        dword ptr arg(2)    ; xoffset
-        cmp         rax,        0                   ; skip first_pass filter if xoffset=0
-        je          .b8x8_sp_only
-
-        shl         rax,        4
-        add         rax,        rcx                 ; HFilter
-
-        mov         rdi,        arg(4)              ; dst_ptr
-        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
-
-        movdqa      xmm0,       [rax]
-
-        movsxd      rax,        dword ptr arg(3)    ; yoffset
-        cmp         rax,        0                   ; skip second_pass filter if yoffset=0
-        je          .b8x8_fp_only
-
-        shl         rax,        4
-        lea         rax,        [rax + rcx]         ; VFilter
-
-        lea         rcx,        [rdi+rdx*8]
-
-        movdqa      xmm1,       [rax]
-
-        ; get the first horizontal line done
-        movdqa      xmm3,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-        movdqa      xmm5,       xmm3                ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
-
-        psrldq      xmm5,       1
-        lea         rsp,        [rsp + 16]          ; next line
-
-        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
-        pmaddubsw   xmm3,       xmm0                ; 00 02 04 06 08 10 12 14
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT    ; xmm3 /= 128
-
-        movdqa      xmm7,       xmm3
-        packuswb    xmm7,       xmm7                ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-
-.next_row:
-        movdqa      xmm6,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-        lea         rsp,        [rsp + 16]          ; next line
-
-        movdqa      xmm5,       xmm6
-
-        psrldq      xmm5,       1
-
-        punpcklbw   xmm6,       xmm5
-        pmaddubsw   xmm6,       xmm0
-
-        paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
-        psraw       xmm6,       VP9_FILTER_SHIFT    ; xmm6 /= 128
-
-        packuswb    xmm6,       xmm6
-
-        punpcklbw   xmm7,       xmm6
-        pmaddubsw   xmm7,       xmm1
-
-        paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
-        psraw       xmm7,       VP9_FILTER_SHIFT    ; xmm7 /= 128
-
-        packuswb    xmm7,       xmm7
-
-        movq        [rdi],      xmm7                ; store the results in the destination
-        lea         rdi,        [rdi + rdx]
-
-        movdqa      xmm7,       xmm6
-
-        cmp         rdi,        rcx
-        jne         .next_row
-
-        jmp         .done8x8
-
-.b8x8_sp_only:
-        movsxd      rax,        dword ptr arg(3)    ; yoffset
-        shl         rax,        4
-        lea         rax,        [rax + rcx]         ; VFilter
-
-        mov         rdi,        arg(4) ;dst_ptr
-        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
-
-        movdqa      xmm0,       [rax]               ; VFilter
-
-        movq        xmm1,       XMMWORD PTR [rsp]
-        movq        xmm2,       XMMWORD PTR [rsp+16]
-
-        movq        xmm3,       XMMWORD PTR [rsp+32]
-        punpcklbw   xmm1,       xmm2
-
-        movq        xmm4,       XMMWORD PTR [rsp+48]
-        punpcklbw   xmm2,       xmm3
-
-        movq        xmm5,       XMMWORD PTR [rsp+64]
-        punpcklbw   xmm3,       xmm4
-
-        movq        xmm6,       XMMWORD PTR [rsp+80]
-        punpcklbw   xmm4,       xmm5
-
-        movq        xmm7,       XMMWORD PTR [rsp+96]
-        punpcklbw   xmm5,       xmm6
-
-        pmaddubsw   xmm1,       xmm0
-        pmaddubsw   xmm2,       xmm0
-
-        pmaddubsw   xmm3,       xmm0
-        pmaddubsw   xmm4,       xmm0
-
-        pmaddubsw   xmm5,       xmm0
-        punpcklbw   xmm6,       xmm7
-
-        pmaddubsw   xmm6,       xmm0
-        paddw       xmm1,       [GLOBAL(rd)]
-
-        paddw       xmm2,       [GLOBAL(rd)]
-        psraw       xmm1,       VP9_FILTER_SHIFT
-
-        paddw       xmm3,       [GLOBAL(rd)]
-        psraw       xmm2,       VP9_FILTER_SHIFT
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm3,       VP9_FILTER_SHIFT
-
-        paddw       xmm5,       [GLOBAL(rd)]
-        psraw       xmm4,       VP9_FILTER_SHIFT
-
-        paddw       xmm6,       [GLOBAL(rd)]
-        psraw       xmm5,       VP9_FILTER_SHIFT
-
-        psraw       xmm6,       VP9_FILTER_SHIFT
-        packuswb    xmm1,       xmm1
-
-        packuswb    xmm2,       xmm2
-        movq        [rdi],      xmm1
-
-        packuswb    xmm3,       xmm3
-        movq        [rdi+rdx],  xmm2
-
-        packuswb    xmm4,       xmm4
-        movq        xmm1,       XMMWORD PTR [rsp+112]
-
-        lea         rdi,        [rdi + 2*rdx]
-        movq        xmm2,       XMMWORD PTR [rsp+128]
-
-        packuswb    xmm5,       xmm5
-        movq        [rdi],      xmm3
-
-        packuswb    xmm6,       xmm6
-        movq        [rdi+rdx],  xmm4
-
-        lea         rdi,        [rdi + 2*rdx]
-        punpcklbw   xmm7,       xmm1
-
-        movq        [rdi],      xmm5
-        pmaddubsw   xmm7,       xmm0
-
-        movq        [rdi+rdx],  xmm6
-        punpcklbw   xmm1,       xmm2
-
-        pmaddubsw   xmm1,       xmm0
-        paddw       xmm7,       [GLOBAL(rd)]
-
-        psraw       xmm7,       VP9_FILTER_SHIFT
-        paddw       xmm1,       [GLOBAL(rd)]
-
-        psraw       xmm1,       VP9_FILTER_SHIFT
-        packuswb    xmm7,       xmm7
-
-        packuswb    xmm1,       xmm1
-        lea         rdi,        [rdi + 2*rdx]
-
-        movq        [rdi],      xmm7
-
-        movq        [rdi+rdx],  xmm1
-        lea         rsp,        [rsp + 144]
-
-        jmp         .done8x8
-
-.b8x8_fp_only:
-        lea         rcx,        [rdi+rdx*8]
-
-.next_row_fp:
-        movdqa      xmm1,       XMMWORD PTR [rsp]
-        movdqa      xmm3,       XMMWORD PTR [rsp+16]
-
-        movdqa      xmm2,       xmm1
-        movdqa      xmm5,       XMMWORD PTR [rsp+32]
-
-        psrldq      xmm2,       1
-        movdqa      xmm7,       XMMWORD PTR [rsp+48]
-
-        movdqa      xmm4,       xmm3
-        psrldq      xmm4,       1
-
-        movdqa      xmm6,       xmm5
-        psrldq      xmm6,       1
-
-        punpcklbw   xmm1,       xmm2
-        pmaddubsw   xmm1,       xmm0
-
-        punpcklbw   xmm3,       xmm4
-        pmaddubsw   xmm3,       xmm0
-
-        punpcklbw   xmm5,       xmm6
-        pmaddubsw   xmm5,       xmm0
-
-        movdqa      xmm2,       xmm7
-        psrldq      xmm2,       1
-
-        punpcklbw   xmm7,       xmm2
-        pmaddubsw   xmm7,       xmm0
-
-        paddw       xmm1,       [GLOBAL(rd)]
-        psraw       xmm1,       VP9_FILTER_SHIFT
-
-        paddw       xmm3,       [GLOBAL(rd)]
-        psraw       xmm3,       VP9_FILTER_SHIFT
-
-        paddw       xmm5,       [GLOBAL(rd)]
-        psraw       xmm5,       VP9_FILTER_SHIFT
-
-        paddw       xmm7,       [GLOBAL(rd)]
-        psraw       xmm7,       VP9_FILTER_SHIFT
-
-        packuswb    xmm1,       xmm1
-        packuswb    xmm3,       xmm3
-
-        packuswb    xmm5,       xmm5
-        movq        [rdi],      xmm1
-
-        packuswb    xmm7,       xmm7
-        movq        [rdi+rdx],  xmm3
-
-        lea         rdi,        [rdi + 2*rdx]
-        movq        [rdi],      xmm5
-
-        lea         rsp,        [rsp + 4*16]
-        movq        [rdi+rdx],  xmm7
-
-        lea         rdi,        [rdi + 2*rdx]
-        cmp         rdi,        rcx
-
-        jne         .next_row_fp
-
-        lea         rsp,        [rsp + 16]
-
-.done8x8:
-    ;add rsp, 144
-    pop         rsp
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-shuf1b:
-    db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
-shuf2b:
-    db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
-shuf3b:
-    db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
-
-align 16
-shuf2bfrom1:
-    db  4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
-align 16
-shuf3bfrom1:
-    db  2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11
-
-align 16
-rd:
-    times 8 dw 0x40
-
-align 16
-k0_k5:
-    times 8 db 0, 0             ;placeholder
-    times 8 db 0, 0
-    times 8 db 2, 1
-    times 8 db 0, 0
-    times 8 db 3, 3
-    times 8 db 0, 0
-    times 8 db 1, 2
-    times 8 db 0, 0
-k1_k3:
-    times 8 db  0,    0         ;placeholder
-    times 8 db  -6,  12
-    times 8 db -11,  36
-    times 8 db  -9,  50
-    times 8 db -16,  77
-    times 8 db  -6,  93
-    times 8 db  -8, 108
-    times 8 db  -1, 123
-k2_k4:
-    times 8 db 128,    0        ;placeholder
-    times 8 db 123,   -1
-    times 8 db 108,   -8
-    times 8 db  93,   -6
-    times 8 db  77,  -16
-    times 8 db  50,   -9
-    times 8 db  36,  -11
-    times 8 db  12,   -6
-align 16
-bilinear_filters_ssse3:
-    times 8 db 128, 0
-    times 8 db 120, 8
-    times 8 db 112, 16
-    times 8 db 104, 24
-    times 8 db 96,  32
-    times 8 db 88,  40
-    times 8 db 80,  48
-    times 8 db 72,  56
-    times 8 db 64,  64
-    times 8 db 56,  72
-    times 8 db 48,  80
-    times 8 db 40,  88
-    times 8 db 32,  96
-    times 8 db 24,  104
-    times 8 db 16,  112
-    times 8 db 8,   120
-
diff --git a/vp9/common/x86/vp9_subpixel_x86.h b/vp9/common/x86/vp9_subpixel_x86.h
deleted file mode 100644
index 25bc26d9b..000000000
--- a/vp9/common/x86/vp9_subpixel_x86.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP9_COMMON_X86_VP9_SUBPIXEL_X86_H_
-#define VP9_COMMON_X86_VP9_SUBPIXEL_X86_H_
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code
- */
-
-#if HAVE_MMX
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_mmx);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_mmx);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_mmx);
-extern prototype_subpixel_predict(vp9_sixtap_predict4x4_mmx);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_mmx);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_mmx
-
-#undef  vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_mmx
-
-#undef  vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_mmx
-
-#undef  vp9_subpix_sixtap4x4
-#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_mmx
-
-#undef  vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_mmx
-
-#endif
-#endif
-
-
-#if HAVE_SSE2
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_sse2);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_sse2);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_sse2);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_sse2);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_sse2);
-
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_sse2
-
-#undef  vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_sse2
-
-#undef  vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_sse2
-
-#undef  vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_sse2
-
-#undef  vp9_subpix_bilinear8x8
-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_sse2
-
-#endif
-#endif
-
-#if HAVE_SSSE3
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_ssse3);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_ssse3);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_ssse3);
-extern prototype_subpixel_predict(vp9_sixtap_predict4x4_ssse3);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_ssse3);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_ssse3);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_ssse3
-
-#undef  vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_ssse3
-
-#undef  vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_ssse3
-
-#undef  vp9_subpix_sixtap4x4
-#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_ssse3
-
-
-#undef  vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_ssse3
-
-#undef  vp9_subpix_bilinear8x8
-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_ssse3
-
-#endif
-#endif
-
-
-
-#endif
diff --git a/vp9/decoder/vp9_dboolhuff.c b/vp9/decoder/vp9_dboolhuff.c
index 5f1ef0408..99c3664b2 100644
--- a/vp9/decoder/vp9_dboolhuff.c
+++ b/vp9/decoder/vp9_dboolhuff.c
@@ -17,10 +17,10 @@ int vp9_start_decode(BOOL_DECODER *br,
                      const unsigned char *source,
                      unsigned int source_sz) {
   br->user_buffer_end = source + source_sz;
-  br->user_buffer     = source;
-  br->value    = 0;
-  br->count    = -8;
-  br->range    = 255;
+  br->user_buffer = source;
+  br->value = 0;
+  br->count = -8;
+  br->range = 255;
 
   if (source_sz && !source)
     return 1;
@@ -33,16 +33,27 @@ int vp9_start_decode(BOOL_DECODER *br,
 
 
 void vp9_bool_decoder_fill(BOOL_DECODER *br) {
-  const unsigned char *bufptr;
-  const unsigned char *bufend;
-  VP9_BD_VALUE         value;
-  int                  count;
-  bufend = br->user_buffer_end;
-  bufptr = br->user_buffer;
-  value = br->value;
-  count = br->count;
+  const unsigned char *bufptr = br->user_buffer;
+  const unsigned char *bufend = br->user_buffer_end;
+  VP9_BD_VALUE value = br->value;
+  int count = br->count;
+  int shift = VP9_BD_VALUE_SIZE - 8 - (count + 8);
+  int loop_end = 0;
+  int bits_left = (int)((bufend - bufptr)*CHAR_BIT);
+  int x = shift + CHAR_BIT - bits_left;
+
+  if (x >= 0) {
+    count += VP9_LOTS_OF_BITS;
+    loop_end = x;
+  }
 
-  VP9DX_BOOL_DECODER_FILL(count, value, bufptr, bufend);
+  if (x < 0 || bits_left) {
+    while (shift >= loop_end) {
+      count += CHAR_BIT;
+      value |= (VP9_BD_VALUE)*bufptr++ << shift;
+      shift -= CHAR_BIT;
+    }
+  }
 
   br->user_buffer = bufptr;
   br->value = value;
@@ -52,7 +63,9 @@ void vp9_bool_decoder_fill(BOOL_DECODER *br) {
 
 static int get_unsigned_bits(unsigned num_values) {
   int cat = 0;
-  if ((num_values--) <= 1) return 0;
+  if (num_values <= 1)
+    return 0;
+  num_values--;
   while (num_values > 0) {
     cat++;
     num_values >>= 1;
@@ -61,9 +74,12 @@ static int get_unsigned_bits(unsigned num_values) {
 }
 
 int vp9_inv_recenter_nonneg(int v, int m) {
-  if (v > (m << 1)) return v;
-  else if ((v & 1) == 0) return (v >> 1) + m;
-  else return m - ((v + 1) >> 1);
+  if (v > (m << 1))
+    return v;
+  else if ((v & 1) == 0)
+    return (v >> 1) + m;
+  else
+    return m - ((v + 1) >> 1);
 }
 
 int vp9_decode_uniform(BOOL_DECODER *br, int n) {
diff --git a/vp9/decoder/vp9_dboolhuff.h b/vp9/decoder/vp9_dboolhuff.h
index 5afdd67c8..cf31d380a 100644
--- a/vp9/decoder/vp9_dboolhuff.h
+++ b/vp9/decoder/vp9_dboolhuff.h
@@ -19,11 +19,11 @@
 
 typedef size_t VP9_BD_VALUE;
 
-# define VP9_BD_VALUE_SIZE ((int)sizeof(VP9_BD_VALUE)*CHAR_BIT)
+#define VP9_BD_VALUE_SIZE ((int)sizeof(VP9_BD_VALUE)*CHAR_BIT)
 /*This is meant to be a large, positive constant that can still be efficiently
    loaded as an immediate (on platforms like ARM, for example).
   Even relatively modest values like 100 would work fine.*/
-# define VP9_LOTS_OF_BITS (0x40000000)
+#define VP9_LOTS_OF_BITS (0x40000000)
 
 typedef struct {
   const unsigned char *user_buffer_end;
@@ -45,46 +45,13 @@ int vp9_decode_uniform(BOOL_DECODER *br, int n);
 int vp9_decode_term_subexp(BOOL_DECODER *br, int k, int num_syms);
 int vp9_inv_recenter_nonneg(int v, int m);
 
-/*The refill loop is used in several places, so define it in a macro to make
-   sure they're all consistent.
-  An inline function would be cleaner, but has a significant penalty, because
-   multiple BOOL_DECODER fields must be modified, and the compiler is not smart
-   enough to eliminate the stores to those fields and the subsequent reloads
-   from them when inlining the function.*/
-#define VP9DX_BOOL_DECODER_FILL(_count,_value,_bufptr,_bufend) \
-  do \
-  { \
-    int shift = VP9_BD_VALUE_SIZE - 8 - ((_count) + 8); \
-    int loop_end, x; \
-    int bits_left = (int)(((_bufend)-(_bufptr))*CHAR_BIT); \
-    \
-    x = shift + CHAR_BIT - bits_left; \
-    loop_end = 0; \
-    if(x >= 0) \
-    { \
-      (_count) += VP9_LOTS_OF_BITS; \
-      loop_end = x; \
-      if(!bits_left) break; \
-    } \
-    while(shift >= loop_end) \
-    { \
-      (_count) += CHAR_BIT; \
-      (_value) |= (VP9_BD_VALUE)*(_bufptr)++ << shift; \
-      shift -= CHAR_BIT; \
-    } \
-  } \
-  while(0) \
-
-
 static int decode_bool(BOOL_DECODER *br, int probability) {
   unsigned int bit = 0;
   VP9_BD_VALUE value;
-  unsigned int split;
   VP9_BD_VALUE bigsplit;
   int count;
   unsigned int range;
-
-  split = 1 + (((br->range - 1) * probability) >> 8);
+  unsigned int split = 1 + (((br->range - 1) * probability) >> 8);
 
   if (br->count < 0)
     vp9_bool_decoder_fill(br);
@@ -150,6 +117,6 @@ static int bool_error(BOOL_DECODER *br) {
   return 0;
 }
 
-extern int vp9_decode_unsigned_max(BOOL_DECODER *br, int max);
+int vp9_decode_unsigned_max(BOOL_DECODER *br, int max);
 
 #endif  // VP9_DECODER_VP9_DBOOLHUFF_H_
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index c6c3d1576..326c80239 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -12,6 +12,7 @@
 #include "vp9/decoder/vp9_treereader.h"
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_reconinter.h"
 #include "vp9/decoder/vp9_onyxd_int.h"
 #include "vp9/common/vp9_findnearmv.h"
 #include "vp9/common/vp9_common.h"
@@ -28,12 +29,13 @@
 #ifdef DEBUG_DEC_MV
 int dec_mvcount = 0;
 #endif
+
 // #define DEC_DEBUG
 #ifdef DEC_DEBUG
 extern int dec_debug;
 #endif
 
-static int read_bmode(vp9_reader *bc, const vp9_prob *p) {
+static B_PREDICTION_MODE read_bmode(vp9_reader *bc, const vp9_prob *p) {
   B_PREDICTION_MODE m = treed_read(bc, vp9_bmode_tree, p);
 #if CONFIG_NEWBINTRAMODES
   if (m == B_CONTEXT_PRED - CONTEXT_PRED_REPLACEMENTS)
@@ -43,53 +45,71 @@ static int read_bmode(vp9_reader *bc, const vp9_prob *p) {
   return m;
 }
 
-static int read_kf_bmode(vp9_reader *bc, const vp9_prob *p) {
-  return treed_read(bc, vp9_kf_bmode_tree, p);
+static B_PREDICTION_MODE read_kf_bmode(vp9_reader *bc, const vp9_prob *p) {
+  return (B_PREDICTION_MODE)treed_read(bc, vp9_kf_bmode_tree, p);
 }
 
-static int read_ymode(vp9_reader *bc, const vp9_prob *p) {
-  return treed_read(bc, vp9_ymode_tree, p);
+static MB_PREDICTION_MODE read_ymode(vp9_reader *bc, const vp9_prob *p) {
+  return (MB_PREDICTION_MODE)treed_read(bc, vp9_ymode_tree, p);
 }
 
-static int read_sb_ymode(vp9_reader *bc, const vp9_prob *p) {
-  return treed_read(bc, vp9_sb_ymode_tree, p);
+static MB_PREDICTION_MODE read_sb_ymode(vp9_reader *bc, const vp9_prob *p) {
+  return (MB_PREDICTION_MODE)treed_read(bc, vp9_sb_ymode_tree, p);
 }
 
-static int read_kf_sb_ymode(vp9_reader *bc, const vp9_prob *p) {
-  return treed_read(bc, vp9_uv_mode_tree, p);
+static MB_PREDICTION_MODE read_kf_sb_ymode(vp9_reader *bc, const vp9_prob *p) {
+  return (MB_PREDICTION_MODE)treed_read(bc, vp9_uv_mode_tree, p);
 }
 
-static int read_kf_mb_ymode(vp9_reader *bc, const vp9_prob *p) {
-  return treed_read(bc, vp9_kf_ymode_tree, p);
+static MB_PREDICTION_MODE read_kf_mb_ymode(vp9_reader *bc, const vp9_prob *p) {
+  return (MB_PREDICTION_MODE)treed_read(bc, vp9_kf_ymode_tree, p);
 }
 
 static int read_i8x8_mode(vp9_reader *bc, const vp9_prob *p) {
   return treed_read(bc, vp9_i8x8_mode_tree, p);
 }
 
-static int read_uv_mode(vp9_reader *bc, const vp9_prob *p) {
-  return treed_read(bc, vp9_uv_mode_tree, p);
+static MB_PREDICTION_MODE read_uv_mode(vp9_reader *bc, const vp9_prob *p) {
+  return (MB_PREDICTION_MODE)treed_read(bc, vp9_uv_mode_tree, p);
 }
 
 // This function reads the current macro block's segnent id from the bitstream
 // It should only be called if a segment map update is indicated.
-static void read_mb_segid(vp9_reader *r, MB_MODE_INFO *mi,
-                          MACROBLOCKD *xd) {
+static void read_mb_segid(vp9_reader *r, MB_MODE_INFO *mi, MACROBLOCKD *xd) {
   /* Is segmentation enabled */
   if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
     /* If so then read the segment id. */
-    if (vp9_read(r, xd->mb_segment_tree_probs[0]))
-      mi->segment_id =
-        (unsigned char)(2 + vp9_read(r, xd->mb_segment_tree_probs[2]));
-    else
-      mi->segment_id =
+    mi->segment_id = vp9_read(r, xd->mb_segment_tree_probs[0]) ?
+        (unsigned char)(2 + vp9_read(r, xd->mb_segment_tree_probs[2])):
         (unsigned char)(vp9_read(r, xd->mb_segment_tree_probs[1]));
   }
 }
 
+// Reads the current macroblock's segment id from the bitstream, excluding the
+// predicted id. It should only be called if a segment map update is indicated.
+static void read_mb_segid_except(VP9_COMMON *cm,
+                                 vp9_reader *r, MB_MODE_INFO *mi,
+                                 MACROBLOCKD *xd, int mb_row, int mb_col) {
+  int pred_seg_id = vp9_get_pred_mb_segid(cm, xd,
+                                          mb_row * cm->mb_cols + mb_col);
+  const vp9_prob *p = xd->mb_segment_tree_probs;
+  vp9_prob p1 = xd->mb_segment_mispred_tree_probs[pred_seg_id];
+
+  /* Is segmentation enabled */
+  if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
+    /* If so then read the segment id. */
+    if (vp9_read(r, p1)) {
+      mi->segment_id = 2 +
+          (pred_seg_id < 2 ? vp9_read(r, p[2]) : (pred_seg_id == 2));
+    } else {
+      mi->segment_id =
+          pred_seg_id >= 2 ? vp9_read(r, p[1]) : (pred_seg_id == 0);
+    }
+  }
+}
+
 #if CONFIG_NEW_MVREF
-int vp9_read_mv_ref_id(vp9_reader *r,
-                       vp9_prob * ref_id_probs) {
+int vp9_read_mv_ref_id(vp9_reader *r, vp9_prob *ref_id_probs) {
   int ref_index = 0;
 
   if (vp9_read(r, ref_id_probs[0])) {
@@ -111,10 +131,13 @@ static void kfread_modes(VP9D_COMP *pbi,
                          int mb_col,
                          BOOL_DECODER* const bc) {
   VP9_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd  = &pbi->mb;
   const int mis = pbi->common.mode_info_stride;
   int map_index = mb_row * pbi->common.mb_cols + mb_col;
   MB_PREDICTION_MODE y_mode;
 
+  m->mbmi.ref_frame = INTRA_FRAME;
+
   // Read the Macroblock segmentation map if it is being updated explicitly
   // this frame (reset to 0 by default).
   m->mbmi.segment_id = 0;
@@ -139,60 +162,56 @@ static void kfread_modes(VP9D_COMP *pbi,
 
   m->mbmi.mb_skip_coeff = 0;
   if (pbi->common.mb_no_coeff_skip &&
-      (!vp9_segfeature_active(&pbi->mb,
-                              m->mbmi.segment_id, SEG_LVL_EOB) ||
-       (vp9_get_segdata(&pbi->mb,
-                        m->mbmi.segment_id, SEG_LVL_EOB) != 0))) {
+      (!vp9_segfeature_active(&pbi->mb, m->mbmi.segment_id, SEG_LVL_SKIP))) {
     MACROBLOCKD *const xd  = &pbi->mb;
     m->mbmi.mb_skip_coeff =
-      vp9_read(bc, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));
+        vp9_read(bc, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));
   } else {
-    if (vp9_segfeature_active(&pbi->mb,
-                              m->mbmi.segment_id, SEG_LVL_EOB) &&
-        (vp9_get_segdata(&pbi->mb,
-                         m->mbmi.segment_id, SEG_LVL_EOB) == 0)) {
+    if (vp9_segfeature_active(&pbi->mb, m->mbmi.segment_id, SEG_LVL_SKIP))
       m->mbmi.mb_skip_coeff = 1;
-    } else
+    else
       m->mbmi.mb_skip_coeff = 0;
   }
 
-  if (m->mbmi.sb_type) {
-    y_mode = (MB_PREDICTION_MODE) read_kf_sb_ymode(bc,
-      pbi->common.sb_kf_ymode_prob[pbi->common.kf_ymode_probs_index]);
-  } else {
-    y_mode = (MB_PREDICTION_MODE) read_kf_mb_ymode(bc,
-      pbi->common.kf_ymode_prob[pbi->common.kf_ymode_probs_index]);
-  }
+
+  y_mode = m->mbmi.sb_type ?
+      read_kf_sb_ymode(bc,
+          pbi->common.sb_kf_ymode_prob[pbi->common.kf_ymode_probs_index]):
+      read_kf_mb_ymode(bc,
+          pbi->common.kf_ymode_prob[pbi->common.kf_ymode_probs_index]);
 
   m->mbmi.ref_frame = INTRA_FRAME;
 
   if ((m->mbmi.mode = y_mode) == B_PRED) {
     int i = 0;
     do {
-      const B_PREDICTION_MODE A = above_block_mode(m, i, mis);
-      const B_PREDICTION_MODE L = left_block_mode(m, i);
+      const B_PREDICTION_MODE a = above_block_mode(m, i, mis);
+      const B_PREDICTION_MODE l = (xd->left_available || (i & 3)) ?
+                                  left_block_mode(m, i) : B_DC_PRED;
 
-      m->bmi[i].as_mode.first =
-        (B_PREDICTION_MODE) read_kf_bmode(
-          bc, pbi->common.kf_bmode_prob [A] [L]);
+      m->bmi[i].as_mode.first = read_kf_bmode(bc,
+                                              pbi->common.kf_bmode_prob[a][l]);
     } while (++i < 16);
   }
+
   if ((m->mbmi.mode = y_mode) == I8X8_PRED) {
     int i;
-    int mode8x8;
     for (i = 0; i < 4; i++) {
-      int ib = vp9_i8x8_block[i];
-      mode8x8 = read_i8x8_mode(bc, pbi->common.fc.i8x8_mode_prob);
+      const int ib = vp9_i8x8_block[i];
+      const int mode8x8 = read_i8x8_mode(bc, pbi->common.fc.i8x8_mode_prob);
+
       m->bmi[ib + 0].as_mode.first = mode8x8;
       m->bmi[ib + 1].as_mode.first = mode8x8;
       m->bmi[ib + 4].as_mode.first = mode8x8;
       m->bmi[ib + 5].as_mode.first = mode8x8;
     }
-  } else
-    m->mbmi.uv_mode = (MB_PREDICTION_MODE)read_uv_mode(bc,
-                                                       pbi->common.kf_uv_mode_prob[m->mbmi.mode]);
+  } else {
+    m->mbmi.uv_mode = read_uv_mode(bc,
+                                   pbi->common.kf_uv_mode_prob[m->mbmi.mode]);
+  }
 
-  if (cm->txfm_mode == TX_MODE_SELECT && m->mbmi.mb_skip_coeff == 0 &&
+  if (cm->txfm_mode == TX_MODE_SELECT &&
+      m->mbmi.mb_skip_coeff == 0 &&
       m->mbmi.mode <= I8X8_PRED) {
     // FIXME(rbultje) code ternary symbol once all experiments are merged
     m->mbmi.txfm_size = vp9_read(bc, cm->prob_tx[0]);
@@ -215,23 +234,23 @@ static void kfread_modes(VP9D_COMP *pbi,
 static int read_nmv_component(vp9_reader *r,
                               int rv,
                               const nmv_component *mvcomp) {
-  int v, s, z, c, o, d;
-  s = vp9_read(r, mvcomp->sign);
-  c = treed_read(r, vp9_mv_class_tree, mvcomp->classes);
-  if (c == MV_CLASS_0) {
+  int mag, d;
+  const int sign = vp9_read(r, mvcomp->sign);
+  const int mv_class = treed_read(r, vp9_mv_class_tree, mvcomp->classes);
+
+  if (mv_class == MV_CLASS_0) {
     d = treed_read(r, vp9_mv_class0_tree, mvcomp->class0);
   } else {
-    int i, b;
+    int i;
+    int n = mv_class + CLASS0_BITS - 1;  // number of bits
+
     d = 0;
-    b = c + CLASS0_BITS - 1;  /* number of bits */
-    for (i = 0; i < b; ++i)
-      d |= (vp9_read(r, mvcomp->bits[i]) << i);
+    for (i = 0; i < n; ++i)
+      d |= vp9_read(r, mvcomp->bits[i]) << i;
   }
-  o = d << 3;
 
-  z = vp9_get_mv_mag(c, o);
-  v = (s ? -(z + 8) : (z + 8));
-  return v;
+  mag = vp9_get_mv_mag(mv_class, d << 3);
+  return sign ? -(mag + 8) : (mag + 8);
 }
 
 static int read_nmv_component_fp(vp9_reader *r,
@@ -239,43 +258,34 @@ static int read_nmv_component_fp(vp9_reader *r,
                                  int rv,
                                  const nmv_component *mvcomp,
                                  int usehp) {
-  int s, z, c, o, d, e, f;
-  s = v < 0;
-  z = (s ? -v : v) - 1;       /* magnitude - 1 */
-  z &= ~7;
-
-  c = vp9_get_mv_class(z, &o);
-  d = o >> 3;
+  const int sign = v < 0;
+  int mag = ((sign ? -v : v) - 1) & ~7;  // magnitude - 1
+  int offset;
+  const int mv_class = vp9_get_mv_class(mag, &offset);
+  const int f = mv_class == MV_CLASS_0 ?
+      treed_read(r, vp9_mv_fp_tree, mvcomp->class0_fp[offset >> 3]):
+      treed_read(r, vp9_mv_fp_tree, mvcomp->fp);
 
-  if (c == MV_CLASS_0) {
-    f = treed_read(r, vp9_mv_fp_tree, mvcomp->class0_fp[d]);
-  } else {
-    f = treed_read(r, vp9_mv_fp_tree, mvcomp->fp);
-  }
-  o += (f << 1);
+  offset += f << 1;
 
   if (usehp) {
-    if (c == MV_CLASS_0) {
-      e = vp9_read(r, mvcomp->class0_hp);
-    } else {
-      e = vp9_read(r, mvcomp->hp);
-    }
-    o += e;
+    offset += mv_class == MV_CLASS_0 ?
+        vp9_read(r, mvcomp->class0_hp) : vp9_read(r, mvcomp->hp);
   } else {
-    ++o;  /* Note if hp is not used, the default value of the hp bit is 1 */
+    offset += 1;  // If hp is not used, the default value of the hp bit is 1
   }
-  z = vp9_get_mv_mag(c, o);
-  v = (s ? -(z + 1) : (z + 1));
-  return v;
+  mag = vp9_get_mv_mag(mv_class, offset);
+  return sign ? -(mag + 1) : (mag + 1);
 }
 
 static void read_nmv(vp9_reader *r, MV *mv, const MV *ref,
                      const nmv_context *mvctx) {
-  MV_JOINT_TYPE j = treed_read(r, vp9_mv_joint_tree, mvctx->joints);
+  const MV_JOINT_TYPE j = treed_read(r, vp9_mv_joint_tree, mvctx->joints);
   mv->row = mv-> col = 0;
   if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
     mv->row = read_nmv_component(r, ref->row, &mvctx->comps[0]);
   }
+
   if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
     mv->col = read_nmv_component(r, ref->col, &mvctx->comps[1]);
   }
@@ -283,7 +293,7 @@ static void read_nmv(vp9_reader *r, MV *mv, const MV *ref,
 
 static void read_nmv_fp(vp9_reader *r, MV *mv, const MV *ref,
                         const nmv_context *mvctx, int usehp) {
-  MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
+  const MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
   usehp = usehp && vp9_use_nmv_hp(ref);
   if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
     mv->row = read_nmv_component_fp(r, mv->row, ref->row, &mvctx->comps[0],
@@ -293,7 +303,10 @@ static void read_nmv_fp(vp9_reader *r, MV *mv, const MV *ref,
     mv->col = read_nmv_component_fp(r, mv->col, ref->col, &mvctx->comps[1],
                                     usehp);
   }
-  //printf("  %d: %d %d ref: %d %d\n", usehp, mv->row, mv-> col, ref->row, ref->col);
+  /*
+  printf("MV: %d %d REF: %d %d\n", mv->row + ref->row, mv->col + ref->col,
+	 ref->row, ref->col);
+	 */
 }
 
 static void update_nmv(vp9_reader *bc, vp9_prob *const p,
@@ -310,48 +323,40 @@ static void update_nmv(vp9_reader *bc, vp9_prob *const p,
 static void read_nmvprobs(vp9_reader *bc, nmv_context *mvctx,
                           int usehp) {
   int i, j, k;
+
 #ifdef MV_GROUP_UPDATE
-  if (!vp9_read_bit(bc)) return;
+  if (!vp9_read_bit(bc))
+    return;
 #endif
-  for (j = 0; j < MV_JOINTS - 1; ++j) {
-    update_nmv(bc, &mvctx->joints[j],
-               VP9_NMV_UPDATE_PROB);
-  }
+  for (j = 0; j < MV_JOINTS - 1; ++j)
+    update_nmv(bc, &mvctx->joints[j], VP9_NMV_UPDATE_PROB);
+
   for (i = 0; i < 2; ++i) {
-    update_nmv(bc, &mvctx->comps[i].sign,
-               VP9_NMV_UPDATE_PROB);
-    for (j = 0; j < MV_CLASSES - 1; ++j) {
-      update_nmv(bc, &mvctx->comps[i].classes[j],
-                 VP9_NMV_UPDATE_PROB);
-    }
-    for (j = 0; j < CLASS0_SIZE - 1; ++j) {
-      update_nmv(bc, &mvctx->comps[i].class0[j],
-                 VP9_NMV_UPDATE_PROB);
-    }
-    for (j = 0; j < MV_OFFSET_BITS; ++j) {
-      update_nmv(bc, &mvctx->comps[i].bits[j],
-                 VP9_NMV_UPDATE_PROB);
-    }
+    update_nmv(bc, &mvctx->comps[i].sign, VP9_NMV_UPDATE_PROB);
+    for (j = 0; j < MV_CLASSES - 1; ++j)
+      update_nmv(bc, &mvctx->comps[i].classes[j], VP9_NMV_UPDATE_PROB);
+
+    for (j = 0; j < CLASS0_SIZE - 1; ++j)
+      update_nmv(bc, &mvctx->comps[i].class0[j], VP9_NMV_UPDATE_PROB);
+
+    for (j = 0; j < MV_OFFSET_BITS; ++j)
+      update_nmv(bc, &mvctx->comps[i].bits[j], VP9_NMV_UPDATE_PROB);
   }
 
   for (i = 0; i < 2; ++i) {
     for (j = 0; j < CLASS0_SIZE; ++j) {
       for (k = 0; k < 3; ++k)
-        update_nmv(bc, &mvctx->comps[i].class0_fp[j][k],
-                   VP9_NMV_UPDATE_PROB);
-    }
-    for (j = 0; j < 3; ++j) {
-      update_nmv(bc, &mvctx->comps[i].fp[j],
-                 VP9_NMV_UPDATE_PROB);
+        update_nmv(bc, &mvctx->comps[i].class0_fp[j][k], VP9_NMV_UPDATE_PROB);
     }
+
+    for (j = 0; j < 3; ++j)
+      update_nmv(bc, &mvctx->comps[i].fp[j], VP9_NMV_UPDATE_PROB);
   }
 
   if (usehp) {
     for (i = 0; i < 2; ++i) {
-      update_nmv(bc, &mvctx->comps[i].class0_hp,
-                 VP9_NMV_UPDATE_PROB);
-      update_nmv(bc, &mvctx->comps[i].hp,
-                 VP9_NMV_UPDATE_PROB);
+      update_nmv(bc, &mvctx->comps[i].class0_hp, VP9_NMV_UPDATE_PROB);
+      update_nmv(bc, &mvctx->comps[i].hp, VP9_NMV_UPDATE_PROB);
     }
   }
 }
@@ -361,15 +366,11 @@ static MV_REFERENCE_FRAME read_ref_frame(VP9D_COMP *pbi,
                                          vp9_reader *const bc,
                                          unsigned char segment_id) {
   MV_REFERENCE_FRAME ref_frame;
-  int seg_ref_active;
-  int seg_ref_count = 0;
-
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
 
-  seg_ref_active = vp9_segfeature_active(xd,
-                                         segment_id,
-                                         SEG_LVL_REF_FRAME);
+  int seg_ref_count = 0;
+  int seg_ref_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME);
 
   // If segment coding enabled does the segment allow for more than one
   // possible reference frame
@@ -492,12 +493,12 @@ unsigned int vp9_mv_cont_count[5][4] = {
 };
 #endif
 
-static const unsigned char mbsplit_fill_count[4] = {8, 8, 4, 1};
+static const unsigned char mbsplit_fill_count[4] = { 8, 8, 4, 1 };
 static const unsigned char mbsplit_fill_offset[4][16] = {
-  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15},
-  { 0,  1,  4,  5,  8,  9, 12, 13,  2,  3,   6,  7, 10, 11, 14, 15},
-  { 0,  1,  4,  5,  2,  3,  6,  7,  8,  9,  12, 13, 10, 11, 14, 15},
-  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}
+  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15 },
+  { 0,  1,  4,  5,  8,  9, 12, 13,  2,  3,   6,  7, 10, 11, 14, 15 },
+  { 0,  1,  4,  5,  2,  3,  6,  7,  8,  9,  12, 13, 10, 11, 14, 15 },
+  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15 }
 };
 
 static void read_switchable_interp_probs(VP9D_COMP* const pbi,
@@ -603,7 +604,7 @@ static void read_mb_segment_id(VP9D_COMP *pbi,
         }
         // Else .... decode it explicitly
         else {
-          read_mb_segid(bc, mbmi, xd);
+          read_mb_segid_except(cm, bc, mbmi, xd, mb_row, mb_col);
         }
       }
       // Normal unpredicted coding mode
@@ -636,8 +637,7 @@ static void read_mb_segment_id(VP9D_COMP *pbi,
         for (y = 0; y < ymbs; y++) {
           for (x = 0; x < xmbs; x++) {
             segment_id = MIN(segment_id,
-                             cm->last_frame_seg_map[index + x +
-                                                    y * cm->mb_cols]);
+                cm->last_frame_seg_map[index + x + y * cm->mb_cols]);
           }
         }
         mbmi->segment_id = segment_id;
@@ -664,25 +664,28 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
   int_mv *const mv = &mbmi->mv[0];
   int mb_to_left_edge;
   int mb_to_right_edge;
-  int mb_to_top_edge;
-  int mb_to_bottom_edge;
   const int mb_size = 1 << mi->mbmi.sb_type;
 
-  mb_to_top_edge = xd->mb_to_top_edge;
-  mb_to_bottom_edge = xd->mb_to_bottom_edge;
-  mb_to_top_edge -= LEFT_TOP_MARGIN;
-  mb_to_bottom_edge += RIGHT_BOTTOM_MARGIN;
+  const int use_prev_in_find_mv_refs = cm->Width == cm->last_width &&
+                                       cm->Height == cm->last_height &&
+                                       !cm->error_resilient_mode;
+
+  int mb_to_top_edge = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
+  int mb_to_bottom_edge = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
+
   mbmi->need_to_clamp_mvs = 0;
   mbmi->need_to_clamp_secondmv = 0;
   mbmi->second_ref_frame = NONE;
-  /* Distance of Mb to the various image edges.
-   * These specified to 8th pel as they are always compared to MV values that are in 1/8th pel units
-   */
-  xd->mb_to_left_edge =
-    mb_to_left_edge = -((mb_col * 16) << 3);
+
+  // Distance of Mb to the various image edges.
+  // These are specified to 8th pel as they are always compared to MV values
+  // that are in 1/8th pel units
+  xd->mb_to_left_edge = mb_to_left_edge
+                      = -((mb_col * 16) << 3);
   mb_to_left_edge -= LEFT_TOP_MARGIN;
-  xd->mb_to_right_edge =
-      mb_to_right_edge = ((pbi->common.mb_cols - mb_size - mb_col) * 16) << 3;
+
+  xd->mb_to_right_edge = mb_to_right_edge
+                       = ((pbi->common.mb_cols - mb_size - mb_col) * 16) << 3;
   mb_to_right_edge += RIGHT_BOTTOM_MARGIN;
 
   // Make sure the MACROBLOCKD mode info pointer is pointed at the
@@ -694,75 +697,74 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
   read_mb_segment_id(pbi, mb_row, mb_col, bc);
 
   if (pbi->common.mb_no_coeff_skip &&
-      (!vp9_segfeature_active(xd,
-                              mbmi->segment_id, SEG_LVL_EOB) ||
-       (vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_EOB) != 0))) {
+      (!vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_SKIP))) {
     // Read the macroblock coeff skip flag if this feature is in use,
     // else default to 0
     mbmi->mb_skip_coeff = vp9_read(bc, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));
   } else {
-    if (vp9_segfeature_active(xd,
-                              mbmi->segment_id, SEG_LVL_EOB) &&
-        (vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_EOB) == 0)) {
-      mbmi->mb_skip_coeff = 1;
-    } else
-      mbmi->mb_skip_coeff = 0;
+    mbmi->mb_skip_coeff =
+        vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_SKIP) ? 1 : 0;
   }
 
   // Read the reference frame
-  if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_MODE)
-      && vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE) < NEARESTMV)
-    mbmi->ref_frame = INTRA_FRAME;
-  else
-    mbmi->ref_frame = read_ref_frame(pbi, bc, mbmi->segment_id);
+  mbmi->ref_frame = read_ref_frame(pbi, bc, mbmi->segment_id);
+
+  /*
+  if (pbi->common.current_video_frame == 1)
+    printf("ref frame: %d [%d %d]\n", mbmi->ref_frame, mb_row, mb_col);
+    */
 
   // If reference frame is an Inter frame
   if (mbmi->ref_frame) {
     int_mv nearest, nearby, best_mv;
     int_mv nearest_second, nearby_second, best_mv_second;
-    vp9_prob mv_ref_p [VP9_MVREFS - 1];
+    vp9_prob mv_ref_p[VP9_MVREFS - 1];
 
-    int recon_y_stride, recon_yoffset;
-    int recon_uv_stride, recon_uvoffset;
     MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame;
+    xd->scale_factor[0] = cm->active_ref_scale[mbmi->ref_frame - 1];
 
     {
       int ref_fb_idx;
+      const int use_prev_in_find_best_ref =
+          xd->scale_factor[0].x_num == xd->scale_factor[0].x_den &&
+          xd->scale_factor[0].y_num == xd->scale_factor[0].y_den &&
+          !cm->error_resilient_mode &&
+          !cm->frame_parallel_decoding_mode;
 
       /* Select the appropriate reference frame for this MB */
-      if (ref_frame == LAST_FRAME)
-        ref_fb_idx = cm->lst_fb_idx;
-      else if (ref_frame == GOLDEN_FRAME)
-        ref_fb_idx = cm->gld_fb_idx;
-      else
-        ref_fb_idx = cm->alt_fb_idx;
-
-      recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride  ;
-      recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
-
-      recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
-      recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
+      ref_fb_idx = cm->active_ref_idx[ref_frame - 1];
 
-      xd->pre.y_buffer = cm->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
-      xd->pre.u_buffer = cm->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
-      xd->pre.v_buffer = cm->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+      setup_pred_block(&xd->pre, &cm->yv12_fb[ref_fb_idx],
+          mb_row, mb_col, &xd->scale_factor[0], &xd->scale_factor_uv[0]);
 
 #ifdef DEC_DEBUG
       if (dec_debug)
         printf("%d %d\n", xd->mode_info_context->mbmi.mv[0].as_mv.row,
                xd->mode_info_context->mbmi.mv[0].as_mv.col);
 #endif
-      vp9_find_mv_refs(xd, mi, prev_mi,
+      // if (cm->current_video_frame == 1 && mb_row == 4 && mb_col == 5)
+      //  printf("Dello\n");
+      vp9_find_mv_refs(cm, xd, mi, use_prev_in_find_mv_refs ? prev_mi : NULL,
                        ref_frame, mbmi->ref_mvs[ref_frame],
                        cm->ref_frame_sign_bias);
 
       vp9_mv_ref_probs(&pbi->common, mv_ref_p,
                        mbmi->mb_mode_context[ref_frame]);
+      /*
+      if (pbi->common.current_video_frame == 1) {
+	int k = mbmi->mb_mode_context[ref_frame];
+	printf("vp9_mode_contexts: [%d %d %d %d] %d %d %d %d\n",
+	       mb_row, mb_col, ref_frame, k,
+	       cm->fc.vp9_mode_contexts[k][0],
+	       cm->fc.vp9_mode_contexts[k][1],
+	       cm->fc.vp9_mode_contexts[k][2],
+	       cm->fc.vp9_mode_contexts[k][3]);
+      }
+      */
 
-      // Is the segment level mode feature enabled for this segment
-      if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_MODE)) {
-        mbmi->mode =
-          vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE);
+      // If the segment level skip mode is enabled
+      if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_SKIP)) {
+        mbmi->mode = ZEROMV;
       } else {
         if (mbmi->sb_type)
           mbmi->mode = read_sb_mv_ref(bc, mv_ref_p);
@@ -775,8 +777,9 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
 
       if (mbmi->mode != ZEROMV) {
         vp9_find_best_ref_mvs(xd,
-                              xd->pre.y_buffer,
-                              recon_y_stride,
+                              use_prev_in_find_best_ref ?
+                                  xd->pre.y_buffer : NULL,
+                              xd->pre.y_stride,
                               mbmi->ref_mvs[ref_frame],
                               &nearest, &nearby);
 
@@ -791,8 +794,7 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
 #endif
     }
 
-    if (mbmi->mode >= NEARESTMV && mbmi->mode <= SPLITMV)
-    {
+    if (mbmi->mode >= NEARESTMV && mbmi->mode <= SPLITMV) {
       if (cm->mcomp_filter_type == SWITCHABLE) {
         mbmi->interp_filter = vp9_switchable_interp[
             treed_read(bc, vp9_switchable_interp_tree,
@@ -817,31 +819,31 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
         mbmi->second_ref_frame = 1;
       if (mbmi->second_ref_frame > 0) {
         int second_ref_fb_idx;
+        int use_prev_in_find_best_ref;
+
+        xd->scale_factor[1] = cm->active_ref_scale[mbmi->second_ref_frame - 1];
+        use_prev_in_find_best_ref =
+            xd->scale_factor[1].x_num == xd->scale_factor[1].x_den &&
+            xd->scale_factor[1].y_num == xd->scale_factor[1].y_den &&
+            !cm->error_resilient_mode &&
+            !cm->frame_parallel_decoding_mode;
+
         /* Select the appropriate reference frame for this MB */
-        if (mbmi->second_ref_frame == LAST_FRAME)
-          second_ref_fb_idx = cm->lst_fb_idx;
-        else if (mbmi->second_ref_frame ==
-          GOLDEN_FRAME)
-          second_ref_fb_idx = cm->gld_fb_idx;
-        else
-          second_ref_fb_idx = cm->alt_fb_idx;
+        second_ref_fb_idx = cm->active_ref_idx[mbmi->second_ref_frame - 1];
 
-        xd->second_pre.y_buffer =
-          cm->yv12_fb[second_ref_fb_idx].y_buffer + recon_yoffset;
-        xd->second_pre.u_buffer =
-          cm->yv12_fb[second_ref_fb_idx].u_buffer + recon_uvoffset;
-        xd->second_pre.v_buffer =
-          cm->yv12_fb[second_ref_fb_idx].v_buffer + recon_uvoffset;
+        setup_pred_block(&xd->second_pre, &cm->yv12_fb[second_ref_fb_idx],
+            mb_row, mb_col, &xd->scale_factor[1], &xd->scale_factor_uv[1]);
 
-        vp9_find_mv_refs(xd, mi, prev_mi,
+        vp9_find_mv_refs(cm, xd, mi, use_prev_in_find_mv_refs ? prev_mi : NULL,
                          mbmi->second_ref_frame,
                          mbmi->ref_mvs[mbmi->second_ref_frame],
                          cm->ref_frame_sign_bias);
 
         if (mbmi->mode != ZEROMV) {
           vp9_find_best_ref_mvs(xd,
-                                xd->second_pre.y_buffer,
-                                recon_y_stride,
+                                use_prev_in_find_best_ref ?
+                                    xd->second_pre.y_buffer : NULL,
+                                xd->second_pre.y_stride,
                                 mbmi->ref_mvs[mbmi->second_ref_frame],
                                 &nearest_second,
                                 &nearby_second);
@@ -861,12 +863,11 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
         pbi->common.fc.interintra_counts[
             mbmi->second_ref_frame == INTRA_FRAME]++;
         if (mbmi->second_ref_frame == INTRA_FRAME) {
-          mbmi->interintra_mode = (MB_PREDICTION_MODE)read_ymode(
-              bc, pbi->common.fc.ymode_prob);
+          mbmi->interintra_mode = read_ymode(bc, pbi->common.fc.ymode_prob);
           pbi->common.fc.ymode_counts[mbmi->interintra_mode]++;
 #if SEPARATE_INTERINTRA_UV
-          mbmi->interintra_uv_mode = (MB_PREDICTION_MODE)read_uv_mode(
-              bc, pbi->common.fc.uv_mode_prob[mbmi->interintra_mode]);
+          mbmi->interintra_uv_mode = read_uv_mode(bc,
+              pbi->common.fc.uv_mode_prob[mbmi->interintra_mode]);
           pbi->common.fc.uv_mode_counts[mbmi->interintra_mode]
                                        [mbmi->interintra_uv_mode]++;
 #else
@@ -912,21 +913,19 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
         cm->fc.mbsplit_counts[s]++;
 
         mbmi->need_to_clamp_mvs = 0;
-        do { /* for each subset j */
+        do {  // for each subset j
           int_mv leftmv, abovemv, second_leftmv, second_abovemv;
           int_mv blockmv, secondmv;
-          int k;  /* first block in subset j */
           int mv_contz;
           int blockmode;
+          int k = vp9_mbsplit_offset[s][j];  // first block in subset j
 
-          k = vp9_mbsplit_offset[s][j];
-
-          leftmv.as_int = left_block_mv(mi, k);
+          leftmv.as_int = left_block_mv(xd, mi, k);
           abovemv.as_int = above_block_mv(mi, k, mis);
           second_leftmv.as_int = 0;
           second_abovemv.as_int = 0;
           if (mbmi->second_ref_frame > 0) {
-            second_leftmv.as_int = left_block_second_mv(mi, k);
+            second_leftmv.as_int = left_block_second_mv(xd, mi, k);
             second_abovemv.as_int = above_block_second_mv(mi, k, mis);
           }
           mv_contz = vp9_mv_cont(&leftmv, &abovemv);
@@ -1005,15 +1004,14 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
             /* Fill (uniform) modes, mvs of jth subset.
              Must do it here because ensuing subsets can
              refer back to us via "left" or "above". */
-            const unsigned char *fill_offset;
             unsigned int fill_count = mbsplit_fill_count[s];
-
-            fill_offset = &mbsplit_fill_offset[s][(unsigned char)j * mbsplit_fill_count[s]];
+            const unsigned char *fill_offset =
+                &mbsplit_fill_offset[s][j * fill_count];
 
             do {
-              mi->bmi[ *fill_offset].as_mv.first.as_int = blockmv.as_int;
+              mi->bmi[*fill_offset].as_mv[0].as_int = blockmv.as_int;
               if (mbmi->second_ref_frame > 0)
-                mi->bmi[ *fill_offset].as_mv.second.as_int = secondmv.as_int;
+                mi->bmi[*fill_offset].as_mv[1].as_int = secondmv.as_int;
               fill_offset++;
             } while (--fill_count);
           }
@@ -1021,8 +1019,8 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
         } while (++j < num_p);
       }
 
-      mv->as_int = mi->bmi[15].as_mv.first.as_int;
-      mbmi->mv[1].as_int = mi->bmi[15].as_mv.second.as_int;
+      mv->as_int = mi->bmi[15].as_mv[0].as_int;
+      mbmi->mv[1].as_int = mi->bmi[15].as_mv[1].as_int;
 
       break;  /* done with SPLITMV */
 
@@ -1057,7 +1055,6 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
         break;
 
       case NEWMV:
-
         read_nmv(bc, &mv->as_mv, &best_mv.as_mv, nmvc);
         read_nmv_fp(bc, &mv->as_mv, &best_mv.as_mv, nmvc,
                     xd->allow_high_precision_mv);
@@ -1086,10 +1083,9 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
                             &cm->fc.NMVcount, xd->allow_high_precision_mv);
           mbmi->mv[1].as_mv.row += best_mv_second.as_mv.row;
           mbmi->mv[1].as_mv.col += best_mv_second.as_mv.col;
-          mbmi->need_to_clamp_secondmv |=
-            check_mv_bounds(&mbmi->mv[1],
-                            mb_to_left_edge, mb_to_right_edge,
-                            mb_to_top_edge, mb_to_bottom_edge);
+          mbmi->need_to_clamp_secondmv |= check_mv_bounds(&mbmi->mv[1],
+              mb_to_left_edge, mb_to_right_edge,
+              mb_to_top_edge, mb_to_bottom_edge);
         }
         break;
       default:
@@ -1102,16 +1098,11 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
     /* required for left and above block mv */
     mbmi->mv[0].as_int = 0;
 
-    if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_MODE)) {
-      mbmi->mode = (MB_PREDICTION_MODE)
-                   vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE);
-    } else if (mbmi->sb_type) {
-      mbmi->mode = (MB_PREDICTION_MODE)
-                   read_sb_ymode(bc, pbi->common.fc.sb_ymode_prob);
+    if (mbmi->sb_type) {
+      mbmi->mode = read_sb_ymode(bc, pbi->common.fc.sb_ymode_prob);
       pbi->common.fc.sb_ymode_counts[mbmi->mode]++;
     } else {
-      mbmi->mode = (MB_PREDICTION_MODE)
-                   read_ymode(bc, pbi->common.fc.ymode_prob);
+      mbmi->mode = read_ymode(bc, pbi->common.fc.ymode_prob);
       pbi->common.fc.ymode_counts[mbmi->mode]++;
     }
 
@@ -1120,8 +1111,8 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
       int j = 0;
       do {
         int m;
-        m = mi->bmi[j].as_mode.first = (B_PREDICTION_MODE)
-            read_bmode(bc, pbi->common.fc.bmode_prob);
+        m = mi->bmi[j].as_mode.first = read_bmode(bc,
+                                                  pbi->common.fc.bmode_prob);
 #if CONFIG_NEWBINTRAMODES
         if (m == B_CONTEXT_PRED) m -= CONTEXT_PRED_REPLACEMENTS;
 #endif
@@ -1131,10 +1122,10 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
 
     if (mbmi->mode == I8X8_PRED) {
       int i;
-      int mode8x8;
       for (i = 0; i < 4; i++) {
-        int ib = vp9_i8x8_block[i];
-        mode8x8 = read_i8x8_mode(bc, pbi->common.fc.i8x8_mode_prob);
+        const int ib = vp9_i8x8_block[i];
+        const int mode8x8 = read_i8x8_mode(bc, pbi->common.fc.i8x8_mode_prob);
+
         mi->bmi[ib + 0].as_mode.first = mode8x8;
         mi->bmi[ib + 1].as_mode.first = mode8x8;
         mi->bmi[ib + 4].as_mode.first = mode8x8;
@@ -1142,11 +1133,14 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
         pbi->common.fc.i8x8_mode_counts[mode8x8]++;
       }
     } else {
-      mbmi->uv_mode = (MB_PREDICTION_MODE)read_uv_mode(
-        bc, pbi->common.fc.uv_mode_prob[mbmi->mode]);
+      mbmi->uv_mode = read_uv_mode(bc, pbi->common.fc.uv_mode_prob[mbmi->mode]);
       pbi->common.fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++;
     }
   }
+  /*
+  if (pbi->common.current_video_frame == 1)
+    printf("mode: %d skip: %d\n", mbmi->mode, mbmi->mb_skip_coeff);
+    */
 
   if (cm->txfm_mode == TX_MODE_SELECT && mbmi->mb_skip_coeff == 0 &&
       ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= I8X8_PRED) ||
@@ -1188,6 +1182,7 @@ void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, BOOL_DECODER* const bc) {
 
   mb_mode_mv_init(pbi, bc);
 }
+
 void vp9_decode_mb_mode_mv(VP9D_COMP* const pbi,
                            MACROBLOCKD* const xd,
                            int mb_row,
@@ -1196,8 +1191,12 @@ void vp9_decode_mb_mode_mv(VP9D_COMP* const pbi,
   MODE_INFO *mi = xd->mode_info_context;
   MODE_INFO *prev_mi = xd->prev_mode_info_context;
 
-  if (pbi->common.frame_type == KEY_FRAME)
+  if (pbi->common.frame_type == KEY_FRAME) {
     kfread_modes(pbi, mi, mb_row, mb_col, bc);
-  else
+  } else {
     read_mb_modes_mv(pbi, mi, &mi->mbmi, prev_mi, mb_row, mb_col, bc);
+    set_scale_factors(xd,
+                      mi->mbmi.ref_frame - 1, mi->mbmi.second_ref_frame - 1,
+                      pbi->common.active_ref_scale);
+  }
 }
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index c3b9637a6..86806d2d0 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -13,7 +13,6 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_header.h"
 #include "vp9/common/vp9_reconintra.h"
-#include "vp9/common/vp9_reconintra4x4.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/decoder/vp9_decodframe.h"
@@ -32,7 +31,7 @@
 #include "vp9/decoder/vp9_dboolhuff.h"
 
 #include "vp9/common/vp9_seg_common.h"
-#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_tile_common.h"
 #include "vp9_rtcd.h"
 
 #include <assert.h>
@@ -79,103 +78,82 @@ static vp9_prob read_prob_diff_update(vp9_reader *const bc, int oldp) {
 
 void vp9_init_de_quantizer(VP9D_COMP *pbi) {
   int i;
-  int Q;
+  int q;
   VP9_COMMON *const pc = &pbi->common;
 
-  for (Q = 0; Q < QINDEX_RANGE; Q++) {
-    pc->Y1dequant[Q][0] = (int16_t)vp9_dc_quant(Q, pc->y1dc_delta_q);
-    pc->Y2dequant[Q][0] = (int16_t)vp9_dc2quant(Q, pc->y2dc_delta_q);
-    pc->UVdequant[Q][0] = (int16_t)vp9_dc_uv_quant(Q, pc->uvdc_delta_q);
+  for (q = 0; q < QINDEX_RANGE; q++) {
+    pc->Y1dequant[q][0] = (int16_t)vp9_dc_quant(q, pc->y1dc_delta_q);
+    pc->UVdequant[q][0] = (int16_t)vp9_dc_uv_quant(q, pc->uvdc_delta_q);
 
     /* all the ac values =; */
     for (i = 1; i < 16; i++) {
       int rc = vp9_default_zig_zag1d_4x4[i];
 
-      pc->Y1dequant[Q][rc] = (int16_t)vp9_ac_yquant(Q);
-      pc->Y2dequant[Q][rc] = (int16_t)vp9_ac2quant(Q, pc->y2ac_delta_q);
-      pc->UVdequant[Q][rc] = (int16_t)vp9_ac_uv_quant(Q, pc->uvac_delta_q);
+      pc->Y1dequant[q][rc] = (int16_t)vp9_ac_yquant(q);
+      pc->UVdequant[q][rc] = (int16_t)vp9_ac_uv_quant(q, pc->uvac_delta_q);
     }
   }
 }
 
 static void mb_init_dequantizer(VP9D_COMP *pbi, MACROBLOCKD *xd) {
   int i;
-  int QIndex;
+  int qindex;
   VP9_COMMON *const pc = &pbi->common;
   int segment_id = xd->mode_info_context->mbmi.segment_id;
 
   // Set the Q baseline allowing for any segment level adjustment
   if (vp9_segfeature_active(xd, segment_id, SEG_LVL_ALT_Q)) {
-    /* Abs Value */
     if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA)
-      QIndex = vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
-
-    /* Delta Value */
+      /* Abs Value */
+      qindex = vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
     else {
-      QIndex = pc->base_qindex +
+      /* Delta Value */
+      qindex = pc->base_qindex +
                vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
-      QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0;    /* Clamp to valid range */
+      /* Clamp to valid range */
+      qindex = (qindex >= 0) ? ((qindex <= MAXQ) ? qindex : MAXQ) : 0;
     }
   } else
-    QIndex = pc->base_qindex;
-  xd->q_index = QIndex;
+    qindex = pc->base_qindex;
+
+  xd->q_index = qindex;
 
   /* Set up the block level dequant pointers */
   for (i = 0; i < 16; i++) {
-    xd->block[i].dequant = pc->Y1dequant[QIndex];
+    xd->block[i].dequant = pc->Y1dequant[qindex];
   }
 
-#if CONFIG_LOSSLESS
-  if (!QIndex) {
-    pbi->mb.inv_xform4x4_1_x8     = vp9_short_inv_walsh4x4_1_x8;
-    pbi->mb.inv_xform4x4_x8       = vp9_short_inv_walsh4x4_x8;
-    pbi->mb.inv_walsh4x4_1        = vp9_short_inv_walsh4x4_1_lossless;
-    pbi->mb.inv_walsh4x4_lossless = vp9_short_inv_walsh4x4_lossless;
-    pbi->idct_add            = vp9_dequant_idct_add_lossless_c;
-    pbi->dc_idct_add         = vp9_dequant_dc_idct_add_lossless_c;
-    pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block_lossless_c;
-    pbi->idct_add_y_block    = vp9_dequant_idct_add_y_block_lossless_c;
-    pbi->idct_add_uv_block   = vp9_dequant_idct_add_uv_block_lossless_c;
-  } else {
-    pbi->mb.inv_xform4x4_1_x8     = vp9_short_idct4x4llm_1;
-    pbi->mb.inv_xform4x4_x8       = vp9_short_idct4x4llm;
-    pbi->mb.inv_walsh4x4_1        = vp9_short_inv_walsh4x4_1;
-    pbi->mb.inv_walsh4x4_lossless = vp9_short_inv_walsh4x4;
-    pbi->idct_add            = vp9_dequant_idct_add;
-    pbi->dc_idct_add         = vp9_dequant_dc_idct_add;
-    pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block;
-    pbi->idct_add_y_block    = vp9_dequant_idct_add_y_block;
-    pbi->idct_add_uv_block   = vp9_dequant_idct_add_uv_block;
+  xd->inv_txm4x4_1      = vp9_short_idct4x4llm_1;
+  xd->inv_txm4x4        = vp9_short_idct4x4llm;
+  xd->itxm_add          = vp9_dequant_idct_add;
+  xd->itxm_add_y_block  = vp9_dequant_idct_add_y_block;
+  xd->itxm_add_uv_block = vp9_dequant_idct_add_uv_block;
+  if (xd->lossless) {
+    assert(qindex == 0);
+    xd->inv_txm4x4_1      = vp9_short_inv_walsh4x4_1_x8;
+    xd->inv_txm4x4        = vp9_short_inv_walsh4x4_x8;
+    xd->itxm_add          = vp9_dequant_idct_add_lossless_c;
+    xd->itxm_add_y_block  = vp9_dequant_idct_add_y_block_lossless_c;
+    xd->itxm_add_uv_block = vp9_dequant_idct_add_uv_block_lossless_c;
   }
-#else
-  pbi->mb.inv_xform4x4_1_x8     = vp9_short_idct4x4llm_1;
-  pbi->mb.inv_xform4x4_x8       = vp9_short_idct4x4llm;
-  pbi->mb.inv_walsh4x4_1        = vp9_short_inv_walsh4x4_1;
-  pbi->mb.inv_walsh4x4_lossless = vp9_short_inv_walsh4x4;
-  pbi->idct_add            = vp9_dequant_idct_add;
-  pbi->dc_idct_add         = vp9_dequant_dc_idct_add;
-  pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block;
-  pbi->idct_add_y_block    = vp9_dequant_idct_add_y_block;
-  pbi->idct_add_uv_block   = vp9_dequant_idct_add_uv_block;
-#endif
 
   for (i = 16; i < 24; i++) {
-    xd->block[i].dequant = pc->UVdequant[QIndex];
+    xd->block[i].dequant = pc->UVdequant[qindex];
   }
-
-  xd->block[24].dequant = pc->Y2dequant[QIndex];
-
 }
 
 /* skip_recon_mb() is Modified: Instead of writing the result to predictor buffer and then copying it
  *  to dst buffer, we can write the result directly to dst buffer. This eliminates unnecessary copy.
  */
-static void skip_recon_mb(VP9D_COMP *pbi, MACROBLOCKD *xd) {
+static void skip_recon_mb(VP9D_COMP *pbi, MACROBLOCKD *xd,
+                          int mb_row, int mb_col) {
+  BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
+
   if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
-    if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
+    if (sb_type == BLOCK_SIZE_SB64X64) {
       vp9_build_intra_predictors_sb64uv_s(xd);
       vp9_build_intra_predictors_sb64y_s(xd);
-    } else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32) {
+    } else if (sb_type == BLOCK_SIZE_SB32X32) {
       vp9_build_intra_predictors_sbuv_s(xd);
       vp9_build_intra_predictors_sby_s(xd);
     } else {
@@ -183,38 +161,32 @@ static void skip_recon_mb(VP9D_COMP *pbi, MACROBLOCKD *xd) {
       vp9_build_intra_predictors_mby_s(xd);
     }
   } else {
-    if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
+    if (sb_type == BLOCK_SIZE_SB64X64) {
       vp9_build_inter64x64_predictors_sb(xd,
                                          xd->dst.y_buffer,
                                          xd->dst.u_buffer,
                                          xd->dst.v_buffer,
                                          xd->dst.y_stride,
-                                         xd->dst.uv_stride);
-    } else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32) {
+                                         xd->dst.uv_stride,
+                                         mb_row, mb_col);
+    } else if (sb_type == BLOCK_SIZE_SB32X32) {
       vp9_build_inter32x32_predictors_sb(xd,
                                          xd->dst.y_buffer,
                                          xd->dst.u_buffer,
                                          xd->dst.v_buffer,
                                          xd->dst.y_stride,
-                                         xd->dst.uv_stride);
+                                         xd->dst.uv_stride,
+                                         mb_row, mb_col);
     } else {
-      vp9_build_1st_inter16x16_predictors_mb(xd,
-                                             xd->dst.y_buffer,
-                                             xd->dst.u_buffer,
-                                             xd->dst.v_buffer,
-                                             xd->dst.y_stride,
-                                             xd->dst.uv_stride);
-
-      if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-        vp9_build_2nd_inter16x16_predictors_mb(xd,
-                                               xd->dst.y_buffer,
-                                               xd->dst.u_buffer,
-                                               xd->dst.v_buffer,
-                                               xd->dst.y_stride,
-                                               xd->dst.uv_stride);
-      }
+      vp9_build_inter16x16_predictors_mb(xd,
+                                         xd->dst.y_buffer,
+                                         xd->dst.u_buffer,
+                                         xd->dst.v_buffer,
+                                         xd->dst.y_stride,
+                                         xd->dst.uv_stride,
+                                         mb_row, mb_col);
 #if CONFIG_COMP_INTERINTRA_PRED
-      else if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
+      if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
         vp9_build_interintra_16x16_predictors_mb(xd,
                                                  xd->dst.y_buffer,
                                                  xd->dst.u_buffer,
@@ -231,7 +203,6 @@ static void decode_16x16(VP9D_COMP *pbi, MACROBLOCKD *xd,
                          BOOL_DECODER* const bc) {
   BLOCKD *bd = &xd->block[0];
   TX_TYPE tx_type = get_tx_type_16x16(xd, bd);
-  assert(get_2nd_order_usage(xd) == 0);
 #ifdef DEC_DEBUG
   if (dec_debug) {
     int i;
@@ -262,7 +233,7 @@ static void decode_16x16(VP9D_COMP *pbi, MACROBLOCKD *xd,
   vp9_dequant_idct_add_uv_block_8x8(
       xd->qcoeff + 16 * 16, xd->block[16].dequant,
       xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,
-      xd->dst.uv_stride, xd->eobs + 16, xd);
+      xd->dst.uv_stride, xd);
 }
 
 static void decode_8x8(VP9D_COMP *pbi, MACROBLOCKD *xd,
@@ -283,7 +254,6 @@ static void decode_8x8(VP9D_COMP *pbi, MACROBLOCKD *xd,
 #endif
   if (tx_type != DCT_DCT || xd->mode_info_context->mbmi.mode == I8X8_PRED) {
     int i;
-    assert(get_2nd_order_usage(xd) == 0);
     for (i = 0; i < 4; i++) {
       int ib = vp9_i8x8_block[i];
       int idx = (ib & 0x02) ? (ib + 2) : ib;
@@ -295,7 +265,7 @@ static void decode_8x8(VP9D_COMP *pbi, MACROBLOCKD *xd,
       BLOCKD *b = &xd->block[ib];
       if (xd->mode_info_context->mbmi.mode == I8X8_PRED) {
         int i8x8mode = b->bmi.as_mode.first;
-        vp9_intra8x8_predict(b, i8x8mode, b->predictor);
+        vp9_intra8x8_predict(xd, b, i8x8mode, b->predictor);
       }
       tx_type = get_tx_type_8x8(xd, &xd->block[ib]);
       if (tx_type != DCT_DCT) {
@@ -303,38 +273,16 @@ static void decode_8x8(VP9D_COMP *pbi, MACROBLOCKD *xd,
                                       xd->eobs[idx]);
       } else {
         vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride,
-                                   0, xd->eobs[idx]);
+                                   xd->eobs[idx]);
       }
     }
-  } else if (xd->mode_info_context->mbmi.mode == SPLITMV) {
-    assert(get_2nd_order_usage(xd) == 0);
+  } else {
     vp9_dequant_idct_add_y_block_8x8(xd->qcoeff,
                                      xd->block[0].dequant,
                                      xd->predictor,
                                      xd->dst.y_buffer,
                                      xd->dst.y_stride,
-                                     xd->eobs, xd);
-  } else {
-    BLOCKD *b = &xd->block[24];
-    assert(get_2nd_order_usage(xd) == 1);
-    vp9_dequantize_b_2x2(b);
-    vp9_short_ihaar2x2(&b->dqcoeff[0], b->diff, 8);
-    ((int *)b->qcoeff)[0] = 0;  // 2nd order block are set to 0 after idct
-    ((int *)b->qcoeff)[1] = 0;
-    ((int *)b->qcoeff)[2] = 0;
-    ((int *)b->qcoeff)[3] = 0;
-    ((int *)b->qcoeff)[4] = 0;
-    ((int *)b->qcoeff)[5] = 0;
-    ((int *)b->qcoeff)[6] = 0;
-    ((int *)b->qcoeff)[7] = 0;
-    vp9_dequant_dc_idct_add_y_block_8x8(xd->qcoeff,
-                                        xd->block[0].dequant,
-                                        xd->predictor,
-                                        xd->dst.y_buffer,
-                                        xd->dst.y_stride,
-                                        xd->eobs,
-                                        xd->block[24].diff,
-                                        xd);
+                                     xd);
   }
 
   // Now do UV
@@ -345,23 +293,23 @@ static void decode_8x8(VP9D_COMP *pbi, MACROBLOCKD *xd,
       BLOCKD *b = &xd->block[ib];
       int i8x8mode = b->bmi.as_mode.first;
       b = &xd->block[16 + i];
-      vp9_intra_uv4x4_predict(&xd->block[16 + i], i8x8mode, b->predictor);
-      pbi->idct_add(b->qcoeff, b->dequant, b->predictor,
-                    *(b->base_dst) + b->dst, 8, b->dst_stride);
+      vp9_intra_uv4x4_predict(xd, &xd->block[16 + i], i8x8mode, b->predictor);
+      xd->itxm_add(b->qcoeff, b->dequant, b->predictor,
+                   *(b->base_dst) + b->dst, 8, b->dst_stride, xd->eobs[16 + i]);
       b = &xd->block[20 + i];
-      vp9_intra_uv4x4_predict(&xd->block[20 + i], i8x8mode, b->predictor);
-      pbi->idct_add(b->qcoeff, b->dequant, b->predictor,
-                    *(b->base_dst) + b->dst, 8, b->dst_stride);
+      vp9_intra_uv4x4_predict(xd, &xd->block[20 + i], i8x8mode, b->predictor);
+      xd->itxm_add(b->qcoeff, b->dequant, b->predictor,
+                   *(b->base_dst) + b->dst, 8, b->dst_stride, xd->eobs[20 + i]);
     }
   } else if (xd->mode_info_context->mbmi.mode == SPLITMV) {
-    pbi->idct_add_uv_block(xd->qcoeff + 16 * 16, xd->block[16].dequant,
+    xd->itxm_add_uv_block(xd->qcoeff + 16 * 16, xd->block[16].dequant,
          xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,
-         xd->dst.uv_stride, xd->eobs + 16);
+         xd->dst.uv_stride, xd);
   } else {
     vp9_dequant_idct_add_uv_block_8x8
         (xd->qcoeff + 16 * 16, xd->block[16].dequant,
          xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,
-         xd->dst.uv_stride, xd->eobs + 16, xd);
+         xd->dst.uv_stride, xd);
   }
 #ifdef DEC_DEBUG
   if (dec_debug) {
@@ -382,7 +330,6 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd,
   int i, eobtotal = 0;
   MB_PREDICTION_MODE mode = xd->mode_info_context->mbmi.mode;
   if (mode == I8X8_PRED) {
-    assert(get_2nd_order_usage(xd) == 0);
     for (i = 0; i < 4; i++) {
       int ib = vp9_i8x8_block[i];
       const int iblock[4] = {0, 1, 4, 5};
@@ -391,7 +338,7 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd,
       BLOCKD *b;
       b = &xd->block[ib];
       i8x8mode = b->bmi.as_mode.first;
-      vp9_intra8x8_predict(b, i8x8mode, b->predictor);
+      vp9_intra8x8_predict(xd, b, i8x8mode, b->predictor);
       for (j = 0; j < 4; j++) {
         b = &xd->block[ib + iblock[j]];
         tx_type = get_tx_type_4x4(xd, b);
@@ -399,23 +346,23 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd,
           vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff,
                                     b->dequant, b->predictor,
                                     *(b->base_dst) + b->dst, 16,
-                                    b->dst_stride, b->eob);
+                                    b->dst_stride, xd->eobs[ib + iblock[j]]);
         } else {
-          vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor,
-                               *(b->base_dst) + b->dst, 16, b->dst_stride);
+          xd->itxm_add(b->qcoeff, b->dequant, b->predictor,
+                       *(b->base_dst) + b->dst, 16, b->dst_stride,
+                       xd->eobs[ib + iblock[j]]);
         }
       }
       b = &xd->block[16 + i];
-      vp9_intra_uv4x4_predict(b, i8x8mode, b->predictor);
-      pbi->idct_add(b->qcoeff, b->dequant, b->predictor,
-                    *(b->base_dst) + b->dst, 8, b->dst_stride);
+      vp9_intra_uv4x4_predict(xd, b, i8x8mode, b->predictor);
+      xd->itxm_add(b->qcoeff, b->dequant, b->predictor,
+                   *(b->base_dst) + b->dst, 8, b->dst_stride, xd->eobs[16 + i]);
       b = &xd->block[20 + i];
-      vp9_intra_uv4x4_predict(b, i8x8mode, b->predictor);
-      pbi->idct_add(b->qcoeff, b->dequant, b->predictor,
-                    *(b->base_dst) + b->dst, 8, b->dst_stride);
+      vp9_intra_uv4x4_predict(xd, b, i8x8mode, b->predictor);
+      xd->itxm_add(b->qcoeff, b->dequant, b->predictor,
+                   *(b->base_dst) + b->dst, 8, b->dst_stride, xd->eobs[20 + i]);
     }
   } else if (mode == B_PRED) {
-    assert(get_2nd_order_usage(xd) == 0);
     for (i = 0; i < 16; i++) {
       int b_mode;
       BLOCKD *b = &xd->block[i];
@@ -427,46 +374,43 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd,
       if (!xd->mode_info_context->mbmi.mb_skip_coeff)
         eobtotal += vp9_decode_coefs_4x4(pbi, xd, bc, PLANE_TYPE_Y_WITH_DC, i);
 
-      vp9_intra4x4_predict(b, b_mode, b->predictor);
+      vp9_intra4x4_predict(xd, b, b_mode, b->predictor);
       tx_type = get_tx_type_4x4(xd, b);
       if (tx_type != DCT_DCT) {
         vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff,
                                   b->dequant, b->predictor,
                                   *(b->base_dst) + b->dst, 16, b->dst_stride,
-                                  b->eob);
+                                  xd->eobs[i]);
       } else {
-        vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor,
-                             *(b->base_dst) + b->dst, 16, b->dst_stride);
+        xd->itxm_add(b->qcoeff, b->dequant, b->predictor,
+                      *(b->base_dst) + b->dst, 16, b->dst_stride, xd->eobs[i]);
       }
     }
     if (!xd->mode_info_context->mbmi.mb_skip_coeff) {
       vp9_decode_mb_tokens_4x4_uv(pbi, xd, bc);
     }
-    xd->above_context->y2 = 0;
-    xd->left_context->y2 = 0;
     vp9_build_intra_predictors_mbuv(xd);
-    pbi->idct_add_uv_block(xd->qcoeff + 16 * 16,
+    xd->itxm_add_uv_block(xd->qcoeff + 16 * 16,
                            xd->block[16].dequant,
                            xd->predictor + 16 * 16,
                            xd->dst.u_buffer,
                            xd->dst.v_buffer,
                            xd->dst.uv_stride,
-                           xd->eobs + 16);
-  } else if (mode == SPLITMV) {
-    assert(get_2nd_order_usage(xd) == 0);
-    pbi->idct_add_y_block(xd->qcoeff,
+                           xd);
+  } else if (mode == SPLITMV || get_tx_type_4x4(xd, &xd->block[0]) == DCT_DCT) {
+    xd->itxm_add_y_block(xd->qcoeff,
                           xd->block[0].dequant,
                           xd->predictor,
                           xd->dst.y_buffer,
                           xd->dst.y_stride,
-                          xd->eobs);
-    pbi->idct_add_uv_block(xd->qcoeff + 16 * 16,
+                          xd);
+    xd->itxm_add_uv_block(xd->qcoeff + 16 * 16,
                            xd->block[16].dequant,
                            xd->predictor + 16 * 16,
                            xd->dst.u_buffer,
                            xd->dst.v_buffer,
                            xd->dst.uv_stride,
-                           xd->eobs + 16);
+                           xd);
   } else {
 #ifdef DEC_DEBUG
     if (dec_debug) {
@@ -485,56 +429,26 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd,
       }
     }
 #endif
-    tx_type = get_tx_type_4x4(xd, &xd->block[0]);
-    if (tx_type != DCT_DCT) {
-      assert(get_2nd_order_usage(xd) == 0);
-      for (i = 0; i < 16; i++) {
-        BLOCKD *b = &xd->block[i];
-        tx_type = get_tx_type_4x4(xd, b);
-        if (tx_type != DCT_DCT) {
-          vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff,
-                                    b->dequant, b->predictor,
-                                    *(b->base_dst) + b->dst, 16,
-                                    b->dst_stride, b->eob);
-        } else {
-          vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor,
-                               *(b->base_dst) + b->dst, 16, b->dst_stride);
-        }
-      }
-    } else {
-      BLOCKD *b = &xd->block[24];
-      assert(get_2nd_order_usage(xd) == 1);
-      vp9_dequantize_b(b);
-      if (xd->eobs[24] > 1) {
-        vp9_short_inv_walsh4x4(&b->dqcoeff[0], b->diff);
-        ((int *)b->qcoeff)[0] = 0;
-        ((int *)b->qcoeff)[1] = 0;
-        ((int *)b->qcoeff)[2] = 0;
-        ((int *)b->qcoeff)[3] = 0;
-        ((int *)b->qcoeff)[4] = 0;
-        ((int *)b->qcoeff)[5] = 0;
-        ((int *)b->qcoeff)[6] = 0;
-        ((int *)b->qcoeff)[7] = 0;
+    for (i = 0; i < 16; i++) {
+      BLOCKD *b = &xd->block[i];
+      tx_type = get_tx_type_4x4(xd, b);
+      if (tx_type != DCT_DCT) {
+        vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff,
+                                  b->dequant, b->predictor,
+                                  *(b->base_dst) + b->dst, 16,
+                                  b->dst_stride, xd->eobs[i]);
       } else {
-        xd->inv_walsh4x4_1(&b->dqcoeff[0], b->diff);
-        ((int *)b->qcoeff)[0] = 0;
+        xd->itxm_add(b->qcoeff, b->dequant, b->predictor,
+                      *(b->base_dst) + b->dst, 16, b->dst_stride, xd->eobs[i]);
       }
-      vp9_dequantize_b(b);
-      pbi->dc_idct_add_y_block(xd->qcoeff,
-                               xd->block[0].dequant,
-                               xd->predictor,
-                               xd->dst.y_buffer,
-                               xd->dst.y_stride,
-                               xd->eobs,
-                               xd->block[24].diff);
     }
-    pbi->idct_add_uv_block(xd->qcoeff + 16 * 16,
+    xd->itxm_add_uv_block(xd->qcoeff + 16 * 16,
                            xd->block[16].dequant,
                            xd->predictor + 16 * 16,
                            xd->dst.u_buffer,
                            xd->dst.v_buffer,
                            xd->dst.uv_stride,
-                           xd->eobs + 16);
+                           xd);
   }
 }
 
@@ -548,7 +462,7 @@ static void decode_16x16_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
         tx_type, xd->qcoeff, xd->block[0].dequant,
         xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
         xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
-        xd->dst.y_stride, xd->dst.y_stride, xd->block[0].eob);
+        xd->dst.y_stride, xd->dst.y_stride, xd->eobs[0]);
   } else {
     vp9_dequant_idct_add_16x16(
         xd->qcoeff, xd->block[0].dequant,
@@ -561,14 +475,13 @@ static void decode_16x16_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
       xd->block[16].dequant,
       xd->dst.u_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
       xd->dst.v_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
-      xd->dst.uv_stride, xd->eobs + 16, xd);
+      xd->dst.uv_stride, xd);
 };
 
 static void decode_8x8_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
                           BOOL_DECODER* const bc, int n,
                           int maska, int shiftb) {
   int x_idx = n & maska, y_idx = n >> shiftb;
-  BLOCKD *b = &xd->block[24];
   TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[0]);
   if (tx_type != DCT_DCT) {
     int i;
@@ -578,7 +491,6 @@ static void decode_8x8_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
       int16_t *q  = xd->block[idx].qcoeff;
       int16_t *dq = xd->block[0].dequant;
       int stride = xd->dst.y_stride;
-      BLOCKD *b = &xd->block[ib];
       tx_type = get_tx_type_8x8(xd, &xd->block[ib]);
       if (tx_type != DCT_DCT) {
         vp9_ht_dequant_idct_add_8x8_c(
@@ -587,7 +499,7 @@ static void decode_8x8_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
             + x_idx * 16 + (i & 1) * 8,
             xd->dst.y_buffer + (y_idx * 16 + (i / 2) * 8) * xd->dst.y_stride
             + x_idx * 16 + (i & 1) * 8,
-            stride, stride, b->eob);
+            stride, stride, xd->eobs[idx]);
       } else {
         vp9_dequant_idct_add_8x8_c(
             q, dq,
@@ -595,42 +507,26 @@ static void decode_8x8_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
             + x_idx * 16 + (i & 1) * 8,
             xd->dst.y_buffer + (y_idx * 16 + (i / 2) * 8) * xd->dst.y_stride
             + x_idx * 16 + (i & 1) * 8,
-            stride, stride, 0, b->eob);
+            stride, stride, xd->eobs[idx]);
       }
-      vp9_dequant_idct_add_uv_block_8x8_inplace_c(
-          xd->qcoeff + 16 * 16, xd->block[16].dequant,
-          xd->dst.u_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
-          xd->dst.v_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
-          xd->dst.uv_stride, xd->eobs + 16, xd);
     }
   } else {
-    vp9_dequantize_b_2x2(b);
-    vp9_short_ihaar2x2(&b->dqcoeff[0], b->diff, 8);
-    ((int *)b->qcoeff)[0] = 0;  // 2nd order block are set to 0 after idct
-    ((int *)b->qcoeff)[1] = 0;
-    ((int *)b->qcoeff)[2] = 0;
-    ((int *)b->qcoeff)[3] = 0;
-    ((int *)b->qcoeff)[4] = 0;
-    ((int *)b->qcoeff)[5] = 0;
-    ((int *)b->qcoeff)[6] = 0;
-    ((int *)b->qcoeff)[7] = 0;
-    vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(
+    vp9_dequant_idct_add_y_block_8x8_inplace_c(
         xd->qcoeff, xd->block[0].dequant,
         xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
-        xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd);
-    vp9_dequant_idct_add_uv_block_8x8_inplace_c(
-        xd->qcoeff + 16 * 16, xd->block[16].dequant,
-        xd->dst.u_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
-        xd->dst.v_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
-        xd->dst.uv_stride, xd->eobs + 16, xd);
+        xd->dst.y_stride, xd);
   }
+  vp9_dequant_idct_add_uv_block_8x8_inplace_c(
+      xd->qcoeff + 16 * 16, xd->block[16].dequant,
+      xd->dst.u_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
+      xd->dst.v_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
+      xd->dst.uv_stride, xd);
 };
 
 static void decode_4x4_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
                           BOOL_DECODER* const bc, int n,
                           int maska, int shiftb) {
   int x_idx = n & maska, y_idx = n >> shiftb;
-  BLOCKD *b = &xd->block[24];
   TX_TYPE tx_type = get_tx_type_4x4(xd, &xd->block[0]);
   if (tx_type != DCT_DCT) {
     int i;
@@ -644,49 +540,34 @@ static void decode_4x4_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
             + x_idx * 16 + (i & 3) * 4,
             xd->dst.y_buffer + (y_idx * 16 + (i / 4) * 4) * xd->dst.y_stride
             + x_idx * 16 + (i & 3) * 4,
-            xd->dst.y_stride, xd->dst.y_stride, b->eob);
+            xd->dst.y_stride, xd->dst.y_stride, xd->eobs[i]);
       } else {
-        vp9_dequant_idct_add_c(
+        xd->itxm_add(
             b->qcoeff, b->dequant,
             xd->dst.y_buffer + (y_idx * 16 + (i / 4) * 4) * xd->dst.y_stride
             + x_idx * 16 + (i & 3) * 4,
             xd->dst.y_buffer + (y_idx * 16 + (i / 4) * 4) * xd->dst.y_stride
             + x_idx * 16 + (i & 3) * 4,
-            xd->dst.y_stride, xd->dst.y_stride);
+            xd->dst.y_stride, xd->dst.y_stride, xd->eobs[i]);
       }
     }
   } else {
-    vp9_dequantize_b(b);
-    if (xd->eobs[24] > 1) {
-      vp9_short_inv_walsh4x4(&b->dqcoeff[0], b->diff);
-      ((int *)b->qcoeff)[0] = 0;
-      ((int *)b->qcoeff)[1] = 0;
-      ((int *)b->qcoeff)[2] = 0;
-      ((int *)b->qcoeff)[3] = 0;
-      ((int *)b->qcoeff)[4] = 0;
-      ((int *)b->qcoeff)[5] = 0;
-      ((int *)b->qcoeff)[6] = 0;
-      ((int *)b->qcoeff)[7] = 0;
-    } else {
-      xd->inv_walsh4x4_1(&b->dqcoeff[0], b->diff);
-      ((int *)b->qcoeff)[0] = 0;
-    }
-    vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(
+    vp9_dequant_idct_add_y_block_4x4_inplace_c(
         xd->qcoeff, xd->block[0].dequant,
         xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
-        xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd);
+        xd->dst.y_stride, xd);
   }
   vp9_dequant_idct_add_uv_block_4x4_inplace_c(
       xd->qcoeff + 16 * 16, xd->block[16].dequant,
       xd->dst.u_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
       xd->dst.v_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
-      xd->dst.uv_stride, xd->eobs + 16, xd);
+      xd->dst.uv_stride, xd);
 };
 
 static void decode_superblock64(VP9D_COMP *pbi, MACROBLOCKD *xd,
                                 int mb_row, int mb_col,
                                 BOOL_DECODER* const bc) {
-  int i, n, eobtotal;
+  int n, eobtotal;
   TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
   VP9_COMMON *const pc = &pbi->common;
   MODE_INFO *orig_mi = xd->mode_info_context;
@@ -720,7 +601,7 @@ static void decode_superblock64(VP9D_COMP *pbi, MACROBLOCKD *xd,
     /* Special case:  Force the loopfilter to skip when eobtotal and
      * mb_skip_coeff are zero.
      */
-    skip_recon_mb(pbi, xd);
+    skip_recon_mb(pbi, xd, mb_row, mb_col);
     return;
   }
 
@@ -731,7 +612,8 @@ static void decode_superblock64(VP9D_COMP *pbi, MACROBLOCKD *xd,
   } else {
     vp9_build_inter64x64_predictors_sb(xd, xd->dst.y_buffer,
                                        xd->dst.u_buffer, xd->dst.v_buffer,
-                                       xd->dst.y_stride, xd->dst.uv_stride);
+                                       xd->dst.y_stride, xd->dst.uv_stride,
+                                       mb_row, mb_col);
   }
 
   /* dequantization and idct */
@@ -770,7 +652,7 @@ static void decode_superblock64(VP9D_COMP *pbi, MACROBLOCKD *xd,
                                                 xd->dst.uv_stride * y_idx * 16,
                                               xd->dst.v_buffer + x_idx * 16 +
                                                 xd->dst.uv_stride * y_idx * 16,
-                                              xd->dst.uv_stride, xd->eobs + 16);
+                                              xd->dst.uv_stride, xd);
       }
     }
   } else {
@@ -783,10 +665,6 @@ static void decode_superblock64(VP9D_COMP *pbi, MACROBLOCKD *xd,
       xd->above_context = pc->above_context + mb_col + x_idx;
       xd->left_context = pc->left_context + y_idx;
       xd->mode_info_context = orig_mi + x_idx + y_idx * mis;
-      for (i = 0; i < 25; i++) {
-        xd->block[i].eob = 0;
-        xd->eobs[i] = 0;
-      }
 
       eobtotal = vp9_decode_mb_tokens(pbi, xd, bc);
       if (eobtotal == 0) {  // skip loopfilter
@@ -812,7 +690,7 @@ static void decode_superblock64(VP9D_COMP *pbi, MACROBLOCKD *xd,
 static void decode_superblock32(VP9D_COMP *pbi, MACROBLOCKD *xd,
                                 int mb_row, int mb_col,
                                 BOOL_DECODER* const bc) {
-  int i, n, eobtotal;
+  int n, eobtotal;
   TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
   VP9_COMMON *const pc = &pbi->common;
   MODE_INFO *orig_mi = xd->mode_info_context;
@@ -842,7 +720,7 @@ static void decode_superblock32(VP9D_COMP *pbi, MACROBLOCKD *xd,
     /* Special case:  Force the loopfilter to skip when eobtotal and
      * mb_skip_coeff are zero.
      */
-    skip_recon_mb(pbi, xd);
+    skip_recon_mb(pbi, xd, mb_row, mb_col);
     return;
   }
 
@@ -853,7 +731,8 @@ static void decode_superblock32(VP9D_COMP *pbi, MACROBLOCKD *xd,
   } else {
     vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
                                        xd->dst.u_buffer, xd->dst.v_buffer,
-                                       xd->dst.y_stride, xd->dst.uv_stride);
+                                       xd->dst.y_stride, xd->dst.uv_stride,
+                                       mb_row, mb_col);
   }
 
   /* dequantization and idct */
@@ -876,7 +755,7 @@ static void decode_superblock32(VP9D_COMP *pbi, MACROBLOCKD *xd,
       vp9_dequant_idct_add_uv_block_16x16_c(xd->sb_coeff_data.qcoeff + 1024,
                                             xd->block[16].dequant,
                                             xd->dst.u_buffer, xd->dst.v_buffer,
-                                            xd->dst.uv_stride, xd->eobs + 16);
+                                            xd->dst.uv_stride, xd);
     }
   } else {
     for (n = 0; n < 4; n++) {
@@ -888,10 +767,6 @@ static void decode_superblock32(VP9D_COMP *pbi, MACROBLOCKD *xd,
       xd->above_context = pc->above_context + mb_col + x_idx;
       xd->left_context = pc->left_context + y_idx + (mb_row & 2);
       xd->mode_info_context = orig_mi + x_idx + y_idx * mis;
-      for (i = 0; i < 25; i++) {
-        xd->block[i].eob = 0;
-        xd->eobs[i] = 0;
-      }
 
       eobtotal = vp9_decode_mb_tokens(pbi, xd, bc);
       if (eobtotal == 0) {  // skip loopfilter
@@ -919,7 +794,6 @@ static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
                               BOOL_DECODER* const bc) {
   int eobtotal = 0;
   MB_PREDICTION_MODE mode;
-  int i;
   int tx_size;
 
   assert(!xd->mode_info_context->mbmi.sb_type);
@@ -934,10 +808,6 @@ static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
   if (xd->mode_info_context->mbmi.mb_skip_coeff) {
     vp9_reset_mb_tokens_context(xd);
   } else if (!bool_error(bc)) {
-    for (i = 0; i < 25; i++) {
-      xd->block[i].eob = 0;
-      xd->eobs[i] = 0;
-    }
     if (mode != B_PRED) {
       eobtotal = vp9_decode_mb_tokens(pbi, xd, bc);
     }
@@ -948,14 +818,15 @@ static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
     vp9_setup_interp_filters(xd, xd->mode_info_context->mbmi.interp_filter,
                              &pbi->common);
 
-  if (eobtotal == 0 && mode != B_PRED && mode != SPLITMV
-      && mode != I8X8_PRED
-      && !bool_error(bc)) {
+  if (eobtotal == 0 &&
+      mode != B_PRED &&
+      mode != SPLITMV &&
+      mode != I8X8_PRED &&
+      !bool_error(bc)) {
     /* Special case:  Force the loopfilter to skip when eobtotal and
-     * mb_skip_coeff are zero.
-     * */
+       mb_skip_coeff are zero. */
     xd->mode_info_context->mbmi.mb_skip_coeff = 1;
-    skip_recon_mb(pbi, xd);
+    skip_recon_mb(pbi, xd, mb_row, mb_col);
     return;
   }
 #ifdef DEC_DEBUG
@@ -982,7 +853,7 @@ static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
            xd->mode_info_context->mbmi.mode, tx_size,
            xd->mode_info_context->mbmi.interp_filter);
 #endif
-    vp9_build_inter_predictors_mb(xd);
+    vp9_build_inter_predictors_mb(xd, mb_row, mb_col);
   }
 
   if (tx_size == TX_16X16) {
@@ -1072,8 +943,9 @@ static void set_offsets(VP9D_COMP *pbi, int block_size,
   xd->mb_to_bottom_edge = ((cm->mb_rows - block_size - mb_row) * 16) << 3;
   xd->mb_to_right_edge = ((cm->mb_cols - block_size - mb_col) * 16) << 3;
 
-  xd->up_available = (mb_row != 0);
-  xd->left_available = (mb_col != 0);
+  xd->up_available    = (mb_row != 0);
+  xd->left_available  = (mb_col > cm->cur_tile_mb_col_start);
+  xd->right_available = (mb_col + block_size < cm->cur_tile_mb_col_end);
 
   xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
   xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
@@ -1088,23 +960,14 @@ static void set_refs(VP9D_COMP *pbi, int block_size,
   MB_MODE_INFO *const mbmi = &mi->mbmi;
 
   if (mbmi->ref_frame > INTRA_FRAME) {
-    int ref_fb_idx, ref_yoffset, ref_uvoffset, ref_y_stride, ref_uv_stride;
+    int ref_fb_idx;
 
     /* Select the appropriate reference frame for this MB */
-    if (mbmi->ref_frame == LAST_FRAME)
-      ref_fb_idx = cm->lst_fb_idx;
-    else if (mbmi->ref_frame == GOLDEN_FRAME)
-      ref_fb_idx = cm->gld_fb_idx;
-    else
-      ref_fb_idx = cm->alt_fb_idx;
-
-    ref_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
-    ref_yoffset = mb_row * 16 * ref_y_stride + 16 * mb_col;
-    xd->pre.y_buffer = cm->yv12_fb[ref_fb_idx].y_buffer + ref_yoffset;
-    ref_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
-    ref_uvoffset = mb_row * 8 * ref_uv_stride + 8 * mb_col;
-    xd->pre.u_buffer = cm->yv12_fb[ref_fb_idx].u_buffer + ref_uvoffset;
-    xd->pre.v_buffer = cm->yv12_fb[ref_fb_idx].v_buffer + ref_uvoffset;
+    ref_fb_idx = cm->active_ref_idx[mbmi->ref_frame - 1];
+    xd->scale_factor[0] = cm->active_ref_scale[mbmi->ref_frame - 1];
+    xd->scale_factor_uv[0] = cm->active_ref_scale[mbmi->ref_frame - 1];
+    setup_pred_block(&xd->pre, &cm->yv12_fb[ref_fb_idx], mb_row, mb_col,
+                     &xd->scale_factor[0], &xd->scale_factor_uv[0]);
 
     /* propagate errors from reference frames */
     xd->corrupted |= cm->yv12_fb[ref_fb_idx].corrupted;
@@ -1113,19 +976,11 @@ static void set_refs(VP9D_COMP *pbi, int block_size,
       int second_ref_fb_idx;
 
       /* Select the appropriate reference frame for this MB */
-      if (mbmi->second_ref_frame == LAST_FRAME)
-        second_ref_fb_idx = cm->lst_fb_idx;
-      else if (mbmi->second_ref_frame == GOLDEN_FRAME)
-        second_ref_fb_idx = cm->gld_fb_idx;
-      else
-        second_ref_fb_idx = cm->alt_fb_idx;
-
-      xd->second_pre.y_buffer =
-          cm->yv12_fb[second_ref_fb_idx].y_buffer + ref_yoffset;
-      xd->second_pre.u_buffer =
-          cm->yv12_fb[second_ref_fb_idx].u_buffer + ref_uvoffset;
-      xd->second_pre.v_buffer =
-          cm->yv12_fb[second_ref_fb_idx].v_buffer + ref_uvoffset;
+      second_ref_fb_idx = cm->active_ref_idx[mbmi->second_ref_frame - 1];
+
+      setup_pred_block(&xd->second_pre, &cm->yv12_fb[second_ref_fb_idx],
+                       mb_row, mb_col,
+                       &xd->scale_factor[1], &xd->scale_factor_uv[1]);
 
       /* propagate errors from reference frames */
       xd->corrupted |= cm->yv12_fb[second_ref_fb_idx].corrupted;
@@ -1156,7 +1011,8 @@ static void decode_sb_row(VP9D_COMP *pbi, VP9_COMMON *pc,
   // For a SB there are 2 left contexts, each pertaining to a MB row within
   vpx_memset(pc->left_context, 0, sizeof(pc->left_context));
 
-  for (mb_col = 0; mb_col < pc->mb_cols; mb_col += 4) {
+  for (mb_col = pc->cur_tile_mb_col_start;
+       mb_col < pc->cur_tile_mb_col_end; mb_col += 4) {
     if (vp9_read(bc, pc->sb64_coded)) {
       set_offsets(pbi, 64, mb_row, mb_col);
       vp9_decode_mb_mode_mv(pbi, xd, mb_row, mb_col, bc);
@@ -1204,8 +1060,7 @@ static void decode_sb_row(VP9D_COMP *pbi, VP9_COMMON *pc,
             vp9_decode_mb_mode_mv(pbi, xd, mb_row + y_idx, mb_col + x_idx, bc);
             update_blockd_bmi(xd);
             set_refs(pbi, 16, mb_row + y_idx, mb_col + x_idx);
-            vp9_intra_prediction_down_copy(xd);
-            decode_macroblock(pbi, xd, mb_row, mb_col, bc);
+            decode_macroblock(pbi, xd, mb_row + y_idx, mb_col + x_idx, bc);
 
             /* check if the boolean decoder has suffered an error */
             xd->corrupted |= bool_error(bc);
@@ -1225,7 +1080,7 @@ static unsigned int read_partition_size(const unsigned char *cx_size) {
 static int read_is_valid(const unsigned char *start,
                          size_t               len,
                          const unsigned char *end) {
-  return (start + len > start && start + len <= end);
+  return start + len > start && start + len <= end;
 }
 
 
@@ -1265,55 +1120,14 @@ static void init_frame(VP9D_COMP *pbi) {
   MACROBLOCKD *const xd  = &pbi->mb;
 
   if (pc->frame_type == KEY_FRAME) {
-
-    if (pc->last_frame_seg_map)
-      vpx_memset(pc->last_frame_seg_map, 0, (pc->mb_rows * pc->mb_cols));
-
-    vp9_init_mv_probs(pc);
-
-    vp9_init_mbmode_probs(pc);
-    vp9_default_bmode_probs(pc->fc.bmode_prob);
-
-    vp9_default_coef_probs(pc);
-    vp9_kf_default_bmode_probs(pc->kf_bmode_prob);
-
-    // Reset the segment feature data to the default stats:
-    // Features disabled, 0, with delta coding (Default state).
-    vp9_clearall_segfeatures(xd);
-
-    xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
-
-    /* reset the mode ref deltasa for loop filter */
-    vpx_memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
-    vpx_memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
-
+    vp9_setup_past_independence(pc, xd);
     /* All buffers are implicitly updated on key frames. */
-    pc->refresh_golden_frame = 1;
-    pc->refresh_alt_ref_frame = 1;
-    pc->copy_buffer_to_gf = 0;
-    pc->copy_buffer_to_arf = 0;
-
-    /* Note that Golden and Altref modes cannot be used on a key frame so
-     * ref_frame_sign_bias[] is undefined and meaningless
-     */
-    pc->ref_frame_sign_bias[GOLDEN_FRAME] = 0;
-    pc->ref_frame_sign_bias[ALTREF_FRAME] = 0;
-
-    vp9_init_mode_contexts(&pbi->common);
-    vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
-    vpx_memcpy(&pc->lfc_a, &pc->fc, sizeof(pc->fc));
-
-    vpx_memset(pc->prev_mip, 0,
-               (pc->mb_cols + 1) * (pc->mb_rows + 1)* sizeof(MODE_INFO));
-    vpx_memset(pc->mip, 0,
-               (pc->mb_cols + 1) * (pc->mb_rows + 1)* sizeof(MODE_INFO));
-
-    vp9_update_mode_info_border(pc, pc->mip);
-    vp9_update_mode_info_in_image(pc, pc->mi);
-
-
-  } else {
+    pbi->refresh_frame_flags = (1 << NUM_REF_FRAMES) - 1;
+  } else if (pc->error_resilient_mode) {
+    vp9_setup_past_independence(pc, xd);
+  }
 
+  if (pc->frame_type != KEY_FRAME) {
     if (!pc->use_bilinear_mc_filter)
       pc->mcomp_filter_type = EIGHTTAP;
     else
@@ -1333,27 +1147,26 @@ static void init_frame(VP9D_COMP *pbi) {
   xd->fullpixel_mask = 0xffffffff;
   if (pc->full_pixel)
     xd->fullpixel_mask = 0xfffffff8;
-
 }
 
 static void read_coef_probs_common(BOOL_DECODER* const bc,
                                    vp9_coeff_probs *coef_probs,
                                    int block_types) {
-  int i, j, k, l;
+  int i, j, k, l, m;
 
   if (vp9_read_bit(bc)) {
     for (i = 0; i < block_types; i++) {
-      for (j = !i; j < COEF_BANDS; j++) {
-        /* NB: This j loop starts from 1 on block type i == 0 */
-        for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
-          if (k >= 3 && ((i == 0 && j == 1) ||
-                         (i > 0 && j == 0)))
-            continue;
-          for (l = 0; l < ENTROPY_NODES; l++) {
-            vp9_prob *const p = coef_probs[i][j][k] + l;
-
-            if (vp9_read(bc, COEF_UPDATE_PROB)) {
-              *p = read_prob_diff_update(bc, *p);
+      for (j = 0; j < REF_TYPES; j++) {
+        for (k = 0; k < COEF_BANDS; k++) {
+          for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
+            if (l >= 3 && k == 0)
+              continue;
+            for (m = 0; m < ENTROPY_NODES; m++) {
+              vp9_prob *const p = coef_probs[i][j][k][l] + m;
+
+              if (vp9_read(bc, COEF_UPDATE_PROB)) {
+                *p = read_prob_diff_update(bc, *p);
+              }
             }
           }
         }
@@ -1365,23 +1178,39 @@ static void read_coef_probs_common(BOOL_DECODER* const bc,
 static void read_coef_probs(VP9D_COMP *pbi, BOOL_DECODER* const bc) {
   VP9_COMMON *const pc = &pbi->common;
 
-  read_coef_probs_common(bc, pc->fc.coef_probs_4x4, BLOCK_TYPES_4X4);
-  read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_4x4, BLOCK_TYPES_4X4);
+  read_coef_probs_common(bc, pc->fc.coef_probs_4x4, BLOCK_TYPES);
 
   if (pbi->common.txfm_mode != ONLY_4X4) {
-    read_coef_probs_common(bc, pc->fc.coef_probs_8x8, BLOCK_TYPES_8X8);
-    read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_8x8, BLOCK_TYPES_8X8);
+    read_coef_probs_common(bc, pc->fc.coef_probs_8x8, BLOCK_TYPES);
   }
   if (pbi->common.txfm_mode > ALLOW_8X8) {
-    read_coef_probs_common(bc, pc->fc.coef_probs_16x16, BLOCK_TYPES_16X16);
-    read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_16x16,
-                           BLOCK_TYPES_16X16);
+    read_coef_probs_common(bc, pc->fc.coef_probs_16x16, BLOCK_TYPES);
   }
   if (pbi->common.txfm_mode > ALLOW_16X16) {
     read_coef_probs_common(bc, pc->fc.coef_probs_32x32, BLOCK_TYPES_32X32);
   }
 }
 
+static void update_frame_size(VP9D_COMP *pbi) {
+  VP9_COMMON *cm = &pbi->common;
+
+  /* our internal buffers are always multiples of 16 */
+  int width = (cm->Width + 15) & ~15;
+  int height = (cm->Height + 15) & ~15;
+
+  cm->mb_rows = height >> 4;
+  cm->mb_cols = width >> 4;
+  cm->MBs = cm->mb_rows * cm->mb_cols;
+  cm->mode_info_stride = cm->mb_cols + 1;
+  memset(cm->mip, 0,
+        (cm->mb_cols + 1) * (cm->mb_rows + 1) * sizeof(MODE_INFO));
+  vp9_update_mode_info_border(cm, cm->mip);
+
+  cm->mi = cm->mip + cm->mode_info_stride + 1;
+  cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;
+  vp9_update_mode_info_in_image(cm, cm->mi);
+}
+
 int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
   BOOL_DECODER header_bc, residual_bc;
   VP9_COMMON *const pc = &pbi->common;
@@ -1394,13 +1223,13 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
   int i, j;
   int corrupt_tokens = 0;
 
+  // printf("Decoding frame %d\n", pc->current_video_frame);
   /* start with no corruption of current frame */
   xd->corrupted = 0;
   pc->yv12_fb[pc->new_fb_idx].corrupted = 0;
 
   if (data_end - data < 3) {
-    vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
-                       "Truncated packet");
+    vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet");
   } else {
     pc->last_frame_type = pc->frame_type;
     pc->frame_type = (FRAME_TYPE)(data[0] & 1);
@@ -1419,9 +1248,6 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
     vp9_setup_version(pc);
 
     if (pc->frame_type == KEY_FRAME) {
-      const int Width = pc->Width;
-      const int Height = pc->Height;
-
       /* vet via sync code */
       /* When error concealment is enabled we should only check the sync
        * code if we have enough bits available
@@ -1431,41 +1257,59 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
           vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM,
                              "Invalid frame sync code");
       }
+      data += 3;
+    }
+    {
+      const int width = pc->Width;
+      const int height = pc->Height;
 
       /* If error concealment is enabled we should only parse the new size
        * if we have enough data. Otherwise we will end up with the wrong
        * size.
        */
-      if (data + 6 < data_end) {
-        pc->Width = (data[3] | (data[4] << 8)) & 0x3fff;
-        pc->horiz_scale = data[4] >> 6;
-        pc->Height = (data[5] | (data[6] << 8)) & 0x3fff;
-        pc->vert_scale = data[6] >> 6;
+      if (data + 4 < data_end) {
+        pc->Width = (data[0] | (data[1] << 8)) & 0x3fff;
+        pc->horiz_scale = data[1] >> 6;
+        pc->Height = (data[2] | (data[3] << 8)) & 0x3fff;
+        pc->vert_scale = data[3] >> 6;
       }
-      data += 7;
+      data += 4;
 
-      if (Width != pc->Width  ||  Height != pc->Height) {
+      if (width != pc->Width || height != pc->Height) {
         if (pc->Width <= 0) {
-          pc->Width = Width;
+          pc->Width = width;
           vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
                              "Invalid frame width");
         }
 
         if (pc->Height <= 0) {
-          pc->Height = Height;
+          pc->Height = height;
           vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
                              "Invalid frame height");
         }
 
-        if (vp9_alloc_frame_buffers(pc, pc->Width, pc->Height))
-          vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
-                             "Failed to allocate frame buffers");
+        if (!pbi->initial_width || !pbi->initial_height) {
+          if (vp9_alloc_frame_buffers(pc, pc->Width, pc->Height))
+            vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+                               "Failed to allocate frame buffers");
+          pbi->initial_width = pc->Width;
+          pbi->initial_height = pc->Height;
+        }
+
+        if (pc->Width > pbi->initial_width) {
+          vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                             "Frame width too large");
+        }
+
+        if (pc->Height > pbi->initial_height) {
+          vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                             "Frame height too large");
+        }
+
+        update_frame_size(pbi);
       }
     }
   }
-#ifdef DEC_DEBUG
-  printf("Decode frame %d\n", pc->current_video_frame);
-#endif
 
   if ((!pbi->decoded_key_frame && pc->frame_type != KEY_FRAME) ||
       pc->Width == 0 || pc->Height == 0) {
@@ -1474,15 +1318,19 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
 
   init_frame(pbi);
 
+  /* Reset the frame pointers to the current frame size */
+  vp8_yv12_realloc_frame_buffer(&pc->yv12_fb[pc->new_fb_idx],
+                                pc->mb_cols * 16, pc->mb_rows * 16,
+                                VP9BORDERINPIXELS);
+
   if (vp9_start_decode(&header_bc, data,
                        (unsigned int)first_partition_length_in_bytes))
     vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate bool decoder 0");
-  if (pc->frame_type == KEY_FRAME) {
-    pc->clr_type    = (YUV_TYPE)vp9_read_bit(&header_bc);
-    pc->clamp_type  = (CLAMP_TYPE)vp9_read_bit(&header_bc);
-  }
+  pc->clr_type    = (YUV_TYPE)vp9_read_bit(&header_bc);
+  pc->clamp_type  = (CLAMP_TYPE)vp9_read_bit(&header_bc);
 
+  pc->error_resilient_mode = vp9_read_bit(&header_bc);
   /* Is segmentation enabled */
   xd->segmentation_enabled = (unsigned char)vp9_read_bit(&header_bc);
 
@@ -1512,6 +1360,22 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
           pc->segment_pred_probs[i] = 255;
         }
       }
+
+      if (pc->temporal_update) {
+        int count[4];
+        const vp9_prob *p = xd->mb_segment_tree_probs;
+        vp9_prob *p_mod = xd->mb_segment_mispred_tree_probs;
+
+        count[0] =        p[0]  *        p[1];
+        count[1] =        p[0]  * (256 - p[1]);
+        count[2] = (256 - p[0]) *        p[2];
+        count[3] = (256 - p[0]) * (256 - p[2]);
+
+        p_mod[0] = get_binary_prob(count[1], count[2] + count[3]);
+        p_mod[1] = get_binary_prob(count[0], count[2] + count[3]);
+        p_mod[2] = get_binary_prob(count[0] + count[1], count[3]);
+        p_mod[3] = get_binary_prob(count[0] + count[1], count[2]);
+      }
     }
     // Is the segment data being updated
     xd->update_mb_segmentation_data = (unsigned char)vp9_read_bit(&header_bc);
@@ -1566,17 +1430,20 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
 
   pc->sb64_coded = vp9_read_literal(&header_bc, 8);
   pc->sb32_coded = vp9_read_literal(&header_bc, 8);
-
-  /* Read the loop filter level and type */
-  pc->txfm_mode = vp9_read_literal(&header_bc, 2);
-  if (pc->txfm_mode == 3)
-    pc->txfm_mode += vp9_read_bit(&header_bc);
-  if (pc->txfm_mode == TX_MODE_SELECT) {
-    pc->prob_tx[0] = vp9_read_literal(&header_bc, 8);
-    pc->prob_tx[1] = vp9_read_literal(&header_bc, 8);
-    pc->prob_tx[2] = vp9_read_literal(&header_bc, 8);
+  xd->lossless = vp9_read_bit(&header_bc);
+  if (xd->lossless) {
+    pc->txfm_mode = ONLY_4X4;
+  } else {
+    /* Read the transform mode (and prob_tx[] when TX_MODE_SELECT) */
+    pc->txfm_mode = vp9_read_literal(&header_bc, 2);
+    if (pc->txfm_mode == 3)
+      pc->txfm_mode += vp9_read_bit(&header_bc);
+    if (pc->txfm_mode == TX_MODE_SELECT) {
+      pc->prob_tx[0] = vp9_read_literal(&header_bc, 8);
+      pc->prob_tx[1] = vp9_read_literal(&header_bc, 8);
+      pc->prob_tx[2] = vp9_read_literal(&header_bc, 8);
+    }
   }
-
   pc->filter_type = (LOOPFILTERTYPE) vp9_read_bit(&header_bc);
   pc->filter_level = vp9_read_literal(&header_bc, 6);
   pc->sharpness_level = vp9_read_literal(&header_bc, 3);
@@ -1617,20 +1484,13 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
   // Dummy read for now
   vp9_read_literal(&header_bc, 2);
 
-  setup_token_decoder(pbi, data + first_partition_length_in_bytes,
-                      &residual_bc);
-
   /* Read the default quantizers. */
   {
-    int Q, q_update;
+    int q_update = 0;
+    pc->base_qindex = vp9_read_literal(&header_bc, QINDEX_BITS);
 
-    Q = vp9_read_literal(&header_bc, QINDEX_BITS);
-    pc->base_qindex = Q;
-    q_update = 0;
     /* AC 1st order Q = default */
     pc->y1dc_delta_q = get_delta_q(&header_bc, pc->y1dc_delta_q, &q_update);
-    pc->y2dc_delta_q = get_delta_q(&header_bc, pc->y2dc_delta_q, &q_update);
-    pc->y2ac_delta_q = get_delta_q(&header_bc, pc->y2ac_delta_q, &q_update);
     pc->uvdc_delta_q = get_delta_q(&header_bc, pc->uvdc_delta_q, &q_update);
     pc->uvac_delta_q = get_delta_q(&header_bc, pc->uvac_delta_q, &q_update);
 
@@ -1645,27 +1505,20 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
    * For all non key frames the GF and ARF refresh flags and sign bias
    * flags must be set explicitly.
    */
-  if (pc->frame_type != KEY_FRAME) {
+  if (pc->frame_type == KEY_FRAME) {
+    pc->active_ref_idx[0] = pc->new_fb_idx;
+    pc->active_ref_idx[1] = pc->new_fb_idx;
+    pc->active_ref_idx[2] = pc->new_fb_idx;
+  } else {
     /* Should the GF or ARF be updated from the current frame */
-    pc->refresh_golden_frame = vp9_read_bit(&header_bc);
-    pc->refresh_alt_ref_frame = vp9_read_bit(&header_bc);
-
-    if (pc->refresh_alt_ref_frame) {
-      vpx_memcpy(&pc->fc, &pc->lfc_a, sizeof(pc->fc));
-    } else {
-      vpx_memcpy(&pc->fc, &pc->lfc, sizeof(pc->fc));
-    }
-
-    /* Buffer to buffer copy flags. */
-    pc->copy_buffer_to_gf = 0;
+    pbi->refresh_frame_flags = vp9_read_literal(&header_bc, NUM_REF_FRAMES);
 
-    if (!pc->refresh_golden_frame)
-      pc->copy_buffer_to_gf = vp9_read_literal(&header_bc, 2);
+    /* Select active reference frames */
+    for (i = 0; i < 3; i++) {
+      int ref_frame_num = vp9_read_literal(&header_bc, NUM_REF_FRAMES_LG2);
 
-    pc->copy_buffer_to_arf = 0;
-
-    if (!pc->refresh_alt_ref_frame)
-      pc->copy_buffer_to_arf = vp9_read_literal(&header_bc, 2);
+      pc->active_ref_idx[i] = pc->ref_frame_map[ref_frame_num];
+    }
 
     pc->ref_frame_sign_bias[GOLDEN_FRAME] = vp9_read_bit(&header_bc);
     pc->ref_frame_sign_bias[ALTREF_FRAME] = vp9_read_bit(&header_bc);
@@ -1685,13 +1538,16 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
     vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc);
   }
 
-  pc->refresh_entropy_probs = vp9_read_bit(&header_bc);
-  if (pc->refresh_entropy_probs == 0) {
-    vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
+  if (!pc->error_resilient_mode) {
+    pc->refresh_entropy_probs = vp9_read_bit(&header_bc);
+    pc->frame_parallel_decoding_mode = vp9_read_bit(&header_bc);
+  } else {
+    pc->refresh_entropy_probs = 0;
+    pc->frame_parallel_decoding_mode = 1;
   }
-
-  pc->refresh_last_frame = (pc->frame_type == KEY_FRAME)
-                           || vp9_read_bit(&header_bc);
+  pc->frame_context_idx = vp9_read_literal(&header_bc, NUM_FRAME_CONTEXTS_LG2);
+  vpx_memcpy(&pc->fc, &pc->frame_contexts[pc->frame_context_idx],
+             sizeof(pc->fc));
 
   // Read inter mode probability context updates
   if (pc->frame_type != KEY_FRAME) {
@@ -1708,11 +1564,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
 
 #if CONFIG_NEW_MVREF
   // If Key frame reset mv ref id probabilities to defaults
-  if (pc->frame_type == KEY_FRAME) {
-    // Defaults probabilities for encoding the MV ref id signal
-    vpx_memset(xd->mb_mv_ref_probs, VP9_DEFAULT_MV_REF_PROB,
-               sizeof(xd->mb_mv_ref_probs));
-  } else {
+  if (pc->frame_type != KEY_FRAME) {
     // Read any mv_ref index probability updates
     int i, j;
 
@@ -1735,28 +1587,20 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
 
   if (0) {
     FILE *z = fopen("decodestats.stt", "a");
-    fprintf(z, "%6d F:%d,G:%d,A:%d,L:%d,Q:%d\n",
+    fprintf(z, "%6d F:%d,R:%d,Q:%d\n",
             pc->current_video_frame,
             pc->frame_type,
-            pc->refresh_golden_frame,
-            pc->refresh_alt_ref_frame,
-            pc->refresh_last_frame,
+            pbi->refresh_frame_flags,
             pc->base_qindex);
     fclose(z);
   }
 
   vp9_copy(pbi->common.fc.pre_coef_probs_4x4,
            pbi->common.fc.coef_probs_4x4);
-  vp9_copy(pbi->common.fc.pre_hybrid_coef_probs_4x4,
-           pbi->common.fc.hybrid_coef_probs_4x4);
   vp9_copy(pbi->common.fc.pre_coef_probs_8x8,
            pbi->common.fc.coef_probs_8x8);
-  vp9_copy(pbi->common.fc.pre_hybrid_coef_probs_8x8,
-           pbi->common.fc.hybrid_coef_probs_8x8);
   vp9_copy(pbi->common.fc.pre_coef_probs_16x16,
            pbi->common.fc.coef_probs_16x16);
-  vp9_copy(pbi->common.fc.pre_hybrid_coef_probs_16x16,
-           pbi->common.fc.hybrid_coef_probs_16x16);
   vp9_copy(pbi->common.fc.pre_coef_probs_32x32,
            pbi->common.fc.coef_probs_32x32);
   vp9_copy(pbi->common.fc.pre_ymode_prob, pbi->common.fc.ymode_prob);
@@ -1771,11 +1615,8 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
 #endif
   pbi->common.fc.pre_nmvc = pbi->common.fc.nmvc;
   vp9_zero(pbi->common.fc.coef_counts_4x4);
-  vp9_zero(pbi->common.fc.hybrid_coef_counts_4x4);
   vp9_zero(pbi->common.fc.coef_counts_8x8);
-  vp9_zero(pbi->common.fc.hybrid_coef_counts_8x8);
   vp9_zero(pbi->common.fc.coef_counts_16x16);
-  vp9_zero(pbi->common.fc.hybrid_coef_counts_16x16);
   vp9_zero(pbi->common.fc.coef_counts_32x32);
   vp9_zero(pbi->common.fc.ymode_counts);
   vp9_zero(pbi->common.fc.sb_ymode_counts);
@@ -1792,8 +1633,11 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
 
   read_coef_probs(pbi, &header_bc);
 
-  vpx_memcpy(&xd->pre, &pc->yv12_fb[pc->lst_fb_idx], sizeof(YV12_BUFFER_CONFIG));
-  vpx_memcpy(&xd->dst, &pc->yv12_fb[pc->new_fb_idx], sizeof(YV12_BUFFER_CONFIG));
+  /* Initialize xd pointers. Any reference should do for xd->pre, so use 0. */
+  vpx_memcpy(&xd->pre, &pc->yv12_fb[pc->active_ref_idx[0]],
+             sizeof(YV12_BUFFER_CONFIG));
+  vpx_memcpy(&xd->dst, &pc->yv12_fb[pc->new_fb_idx],
+             sizeof(YV12_BUFFER_CONFIG));
 
   // Create the segmentation map structure and set to 0
   if (!pc->last_frame_seg_map)
@@ -1815,14 +1659,106 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
 
   vp9_decode_mode_mvs_init(pbi, &header_bc);
 
-  vpx_memset(pc->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols);
+  /* tile info */
+  {
+    const unsigned char *data_ptr = data + first_partition_length_in_bytes;
+    int tile_row, tile_col, delta_log2_tiles;
+
+    vp9_get_tile_n_bits(pc, &pc->log2_tile_columns, &delta_log2_tiles);
+    while (delta_log2_tiles--) {
+      if (vp9_read_bit(&header_bc)) {
+        pc->log2_tile_columns++;
+      } else {
+        break;
+      }
+    }
+    pc->log2_tile_rows = vp9_read_bit(&header_bc);
+    if (pc->log2_tile_rows)
+      pc->log2_tile_rows += vp9_read_bit(&header_bc);
+    pc->tile_columns = 1 << pc->log2_tile_columns;
+    pc->tile_rows    = 1 << pc->log2_tile_rows;
+
+    vpx_memset(pc->above_context, 0,
+               sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols);
+
+    if (pbi->oxcf.inv_tile_order) {
+      const int n_cols = pc->tile_columns;
+      const unsigned char *data_ptr2[4][1 << 6];
+      BOOL_DECODER UNINITIALIZED_IS_SAFE(bc_bak);
+
+      // pre-initialize the offsets, we're going to read in inverse order
+      data_ptr2[0][0] = data_ptr;
+      for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) {
+        if (tile_row) {
+          int size = data_ptr2[tile_row - 1][n_cols - 1][0] |
+                    (data_ptr2[tile_row - 1][n_cols - 1][1] << 8) |
+                    (data_ptr2[tile_row - 1][n_cols - 1][2] << 16) |
+                    (data_ptr2[tile_row - 1][n_cols - 1][3] << 24);
+          data_ptr2[tile_row - 1][n_cols - 1] += 4;
+          data_ptr2[tile_row][0] = data_ptr2[tile_row - 1][n_cols - 1] + size;
+        }
+
+        for (tile_col = 1; tile_col < n_cols; tile_col++) {
+          int size = data_ptr2[tile_row][tile_col - 1][0] |
+                    (data_ptr2[tile_row][tile_col - 1][1] << 8) |
+                    (data_ptr2[tile_row][tile_col - 1][2] << 16) |
+                    (data_ptr2[tile_row][tile_col - 1][3] << 24);
+          data_ptr2[tile_row][tile_col - 1] += 4;
+          data_ptr2[tile_row][tile_col] =
+              data_ptr2[tile_row][tile_col - 1] + size;
+        }
+      }
+
+      for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) {
+        vp9_get_tile_row_offsets(pc, tile_row);
+        for (tile_col = n_cols - 1; tile_col >= 0; tile_col--) {
+          vp9_get_tile_col_offsets(pc, tile_col);
+          setup_token_decoder(pbi, data_ptr2[tile_row][tile_col], &residual_bc);
+
+          /* Decode every superblock row of the current tile */
+          for (mb_row = pc->cur_tile_mb_row_start;
+               mb_row < pc->cur_tile_mb_row_end; mb_row += 4) {
+            decode_sb_row(pbi, pc, mb_row, xd, &residual_bc);
+          }
+          if (tile_row == pc->tile_rows - 1 && tile_col == n_cols - 1)
+            bc_bak = residual_bc;
+        }
+      }
+      residual_bc = bc_bak;
+    } else {
+      for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) {
+        vp9_get_tile_row_offsets(pc, tile_row);
+        for (tile_col = 0; tile_col < pc->tile_columns; tile_col++) {
+          vp9_get_tile_col_offsets(pc, tile_col);
+
+          if (tile_col < pc->tile_columns - 1 || tile_row < pc->tile_rows - 1)
+            setup_token_decoder(pbi, data_ptr + 4, &residual_bc);
+          else
+            setup_token_decoder(pbi, data_ptr, &residual_bc);
+
+          /* Decode every superblock row of the current tile */
+          for (mb_row = pc->cur_tile_mb_row_start;
+               mb_row < pc->cur_tile_mb_row_end; mb_row += 4) {
+            decode_sb_row(pbi, pc, mb_row, xd, &residual_bc);
+          }
 
-  /* Decode a row of superblocks */
-  for (mb_row = 0; mb_row < pc->mb_rows; mb_row += 4) {
-    decode_sb_row(pbi, pc, mb_row, xd, &residual_bc);
+          if (tile_col < pc->tile_columns - 1 || tile_row < pc->tile_rows - 1) {
+            int size = data_ptr[0] |
+                      (data_ptr[1] << 8) |
+                      (data_ptr[2] << 16) |
+                      (data_ptr[3] << 24);
+            data_ptr += 4 + size;
+          }
+        }
+      }
+    }
   }
   corrupt_tokens |= xd->corrupted;
 
+  // keep track of the last coded dimensions
+  pc->last_width = pc->Width;
+  pc->last_height = pc->Height;
+
   /* Collect information about decoder corruption. */
   /* 1. Check first boolean decoder for errors. */
   pc->yv12_fb[pc->new_fb_idx].corrupted = bool_error(&header_bc);
@@ -1838,23 +1774,21 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
                          "A stream must start with a complete key frame");
   }
 
-  vp9_adapt_coef_probs(pc);
+  if (!pc->error_resilient_mode &&
+      !pc->frame_parallel_decoding_mode)
+    vp9_adapt_coef_probs(pc);
   if (pc->frame_type != KEY_FRAME) {
-    vp9_adapt_mode_probs(pc);
-    vp9_adapt_nmv_probs(pc, xd->allow_high_precision_mv);
-    vp9_update_mode_context(&pbi->common);
+    if (!pc->error_resilient_mode &&
+        !pc->frame_parallel_decoding_mode) {
+      vp9_adapt_mode_probs(pc);
+      vp9_adapt_nmv_probs(pc, xd->allow_high_precision_mv);
+      vp9_adapt_mode_context(&pbi->common);
+    }
   }
 
-  /* If this was a kf or Gf note the Q used */
-  if ((pc->frame_type == KEY_FRAME) ||
-      pc->refresh_golden_frame || pc->refresh_alt_ref_frame) {
-    pc->last_kf_gf_q = pc->base_qindex;
-  }
   if (pc->refresh_entropy_probs) {
-    if (pc->refresh_alt_ref_frame)
-      vpx_memcpy(&pc->lfc_a, &pc->fc, sizeof(pc->fc));
-    else
-      vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
+    vpx_memcpy(&pc->frame_contexts[pc->frame_context_idx], &pc->fc,
+               sizeof(pc->fc));
   }
 
 #ifdef PACKET_TESTING
@@ -1866,7 +1800,6 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
     fclose(f);
   }
 #endif
-  // printf("Frame %d Done\n", frame_count++);
 
   /* Find the end of the coded buffer */
   while (residual_bc.count > CHAR_BIT
diff --git a/vp9/decoder/vp9_decodframe.h b/vp9/decoder/vp9_decodframe.h
index ae25428c4..391a26519 100644
--- a/vp9/decoder/vp9_decodframe.h
+++ b/vp9/decoder/vp9_decodframe.h
@@ -14,6 +14,6 @@
 
 struct VP9Decompressor;
 
-extern void vp9_init_de_quantizer(struct VP9Decompressor *pbi);
+void vp9_init_de_quantizer(struct VP9Decompressor *pbi);
 
 #endif  // VP9_DECODER_VP9_DECODFRAME_H_
diff --git a/vp9/decoder/vp9_dequantize.c b/vp9/decoder/vp9_dequantize.c
index 354d2bd36..5a98b1150 100644
--- a/vp9/decoder/vp9_dequantize.c
+++ b/vp9/decoder/vp9_dequantize.c
@@ -14,14 +14,14 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vp9/decoder/vp9_onyxd_int.h"
 #include "vp9/common/vp9_common.h"
+
 static void add_residual(const int16_t *diff, const uint8_t *pred, int pitch,
                          uint8_t *dest, int stride, int width, int height) {
   int r, c;
 
   for (r = 0; r < height; r++) {
-    for (c = 0; c < width; c++) {
+    for (c = 0; c < width; c++)
       dest[c] = clip_pixel(diff[c] + pred[c]);
-    }
 
     dest += stride;
     diff += width;
@@ -35,126 +35,107 @@ static void add_constant_residual(const int16_t diff, const uint8_t *pred,
   int r, c;
 
   for (r = 0; r < height; r++) {
-    for (c = 0; c < width; c++) {
+    for (c = 0; c < width; c++)
       dest[c] = clip_pixel(diff + pred[c]);
-    }
 
     dest += stride;
     pred += pitch;
   }
 }
 
-void vp9_dequantize_b_c(BLOCKD *d) {
-
-  int i;
-  int16_t *DQ  = d->dqcoeff;
-  const int16_t *Q   = d->qcoeff;
-  const int16_t *DQC = d->dequant;
-
-  for (i = 0; i < 16; i++) {
-    DQ[i] = Q[i] * DQC[i];
-  }
-}
-
-
 void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input,
                                const int16_t *dq,
                                uint8_t *pred, uint8_t *dest,
-                               int pitch, int stride, uint16_t eobs) {
-  int16_t output[16];
-  int16_t *diff_ptr = output;
+                               int pitch, int stride, int eob) {
   int i;
+  int16_t output[16];
 
-  for (i = 0; i < 16; i++) {
-    input[i] = dq[i] * input[i];
-  }
-
-  vp9_ihtllm(input, output, 4 << 1, tx_type, 4, eobs);
+  for (i = 0; i < 16; i++)
+    input[i] *= dq[i];
 
+  vp9_short_iht4x4(input, output, 4, tx_type);
   vpx_memset(input, 0, 32);
-
-  add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
+  add_residual(output, pred, pitch, dest, stride, 4, 4);
 }
 
 void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input,
                                    const int16_t *dq,
                                    uint8_t *pred, uint8_t *dest,
-                                   int pitch, int stride, uint16_t eobs) {
+                                   int pitch, int stride, int eob) {
   int16_t output[64];
-  int16_t *diff_ptr = output;
-  int i;
-  if (eobs == 0) {
-    /* All 0 DCT coefficient */
+
+  if (eob == 0) {
+    // All 0 DCT coefficients
     vp9_copy_mem8x8(pred, pitch, dest, stride);
-  } else if (eobs > 0) {
-    input[0] = dq[0] * input[0];
-    for (i = 1; i < 64; i++) {
-      input[i] = dq[1] * input[i];
-    }
+  } else if (eob > 0) {
+    int i;
 
-    vp9_ihtllm(input, output, 16, tx_type, 8, eobs);
+    input[0] *= dq[0];
+    for (i = 1; i < 64; i++)
+      input[i] *= dq[1];
 
+    vp9_short_iht8x8(input, output, 8, tx_type);
     vpx_memset(input, 0, 128);
-
-    add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8);
+    add_residual(output, pred, pitch, dest, stride, 8, 8);
   }
 }
 
 void vp9_dequant_idct_add_c(int16_t *input, const int16_t *dq, uint8_t *pred,
-                            uint8_t *dest, int pitch, int stride) {
-  int16_t output[16];
-  int16_t *diff_ptr = output;
+                            uint8_t *dest, int pitch, int stride, int eob) {
   int i;
+  int16_t output[16];
 
-  for (i = 0; i < 16; i++) {
-    input[i] = dq[i] * input[i];
-  }
+  if (eob > 1) {
+    for (i = 0; i < 16; i++)
+      input[i] *= dq[i];
 
-  /* the idct halves ( >> 1) the pitch */
-  vp9_short_idct4x4llm_c(input, output, 4 << 1);
+    // the idct halves ( >> 1) the pitch
+    vp9_short_idct4x4llm_c(input, output, 4 << 1);
 
-  vpx_memset(input, 0, 32);
+    vpx_memset(input, 0, 32);
 
-  add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
+    add_residual(output, pred, pitch, dest, stride, 4, 4);
+  } else {
+    vp9_dc_only_idct_add(input[0] * dq[0], pred, dest, pitch, stride);
+    input[0] = input[1] = 0;  // clear first two coeffs (no int* pun)
+  }
 }
 
 void vp9_dequant_dc_idct_add_c(int16_t *input, const int16_t *dq, uint8_t *pred,
-                               uint8_t *dest, int pitch, int stride, int Dc) {
+                               uint8_t *dest, int pitch, int stride, int dc) {
   int i;
   int16_t output[16];
-  int16_t *diff_ptr = output;
 
-  input[0] = (int16_t)Dc;
+  input[0] = dc;
 
-  for (i = 1; i < 16; i++) {
-    input[i] = dq[i] * input[i];
-  }
+  for (i = 1; i < 16; i++)
+    input[i] *= dq[i];
 
-  /* the idct halves ( >> 1) the pitch */
+  // the idct halves ( >> 1) the pitch
   vp9_short_idct4x4llm_c(input, output, 4 << 1);
-
   vpx_memset(input, 0, 32);
-
-  add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
+  add_residual(output, pred, pitch, dest, stride, 4, 4);
 }
 
-#if CONFIG_LOSSLESS
 void vp9_dequant_idct_add_lossless_c(int16_t *input, const int16_t *dq,
                                      uint8_t *pred, uint8_t *dest,
-                                     int pitch, int stride) {
-  int16_t output[16];
-  int16_t *diff_ptr = output;
+                                     int pitch, int stride, int eob) {
   int i;
+  int16_t output[16];
 
-  for (i = 0; i < 16; i++) {
-    input[i] = dq[i] * input[i];
-  }
+  if (eob > 1) {
+    for (i = 0; i < 16; i++)
+      input[i] *= dq[i];
 
-  vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);
+    vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);
 
-  vpx_memset(input, 0, 32);
+    vpx_memset(input, 0, 32);
 
-  add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
+    add_residual(output, pred, pitch, dest, stride, 4, 4);
+  } else {
+    vp9_dc_only_inv_walsh_add(input[0] * dq[0], pred, dest, pitch, stride);
+    input[0] = input[1] = 0;  // clear first two coeffs (no int* pun)
+  }
 }
 
 void vp9_dequant_dc_idct_add_lossless_c(int16_t *input, const int16_t *dq,
@@ -163,76 +144,55 @@ void vp9_dequant_dc_idct_add_lossless_c(int16_t *input, const int16_t *dq,
                                         int pitch, int stride, int dc) {
   int i;
   int16_t output[16];
-  int16_t *diff_ptr = output;
 
-  input[0] = (int16_t)dc;
+  input[0] = dc;
 
-  for (i = 1; i < 16; i++) {
-    input[i] = dq[i] * input[i];
-  }
+  for (i = 1; i < 16; i++)
+    input[i] *= dq[i];
 
   vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);
   vpx_memset(input, 0, 32);
-
-  add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
-}
-#endif
-
-void vp9_dequantize_b_2x2_c(BLOCKD *d) {
-  int i;
-  int16_t *DQ  = d->dqcoeff;
-  const int16_t *Q   = d->qcoeff;
-  const int16_t *DQC = d->dequant;
-
-  for (i = 0; i < 16; i++) {
-    DQ[i] = (int16_t)((Q[i] * DQC[i]));
-  }
+  add_residual(output, pred, pitch, dest, stride, 4, 4);
 }
 
 void vp9_dequant_idct_add_8x8_c(int16_t *input, const int16_t *dq,
                                 uint8_t *pred, uint8_t *dest, int pitch,
-                                int stride, int dc, int eob) {
+                                int stride, int eob) {
   int16_t output[64];
-  int16_t *diff_ptr = output;
-  int i;
 
-  /* If dc is 1, then input[0] is the reconstructed value, do not need
-   * dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
-   */
-  if (!dc)
-    input[0] *= dq[0];
 
-  /* The calculation can be simplified if there are not many non-zero dct
-   * coefficients. Use eobs to decide what to do.
-   * TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
-   * Combine that with code here.
-   */
+  // Dequantize the DC coefficient up front; every eob > 0 path below reads
+  // input[0] in its dequantized form.
+  input[0] *= dq[0];
+
+  // The calculation can be simplified if there are not many non-zero dct
+  // coefficients. Use eobs to decide what to do.
+  // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
+  // Combine that with code here.
   if (eob == 0) {
-    /* All 0 DCT coefficient */
+    // All 0 DCT coefficients
     vp9_copy_mem8x8(pred, pitch, dest, stride);
   } else if (eob == 1) {
-    /* DC only DCT coefficient. */
+    // DC only DCT coefficient
+    int16_t in = input[0];
     int16_t out;
 
-    /* Note: the idct1 will need to be modified accordingly whenever
-     * vp9_short_idct8x8_c() is modified. */
-    out = (input[0] + 1 + (input[0] < 0)) >> 2;
-    out = out << 3;
-    out = (out + 32) >> 7;
-
+    // Note: vp9_short_idct1_8x8_c() will need to be modified accordingly
+    // whenever vp9_short_idct8x8_c() is modified.
+    vp9_short_idct1_8x8_c(&in, &out);
     input[0] = 0;
 
     add_constant_residual(out, pred, pitch, dest, stride, 8, 8);
   } else if (eob <= 10) {
-    input[1] = input[1] * dq[1];
-    input[2] = input[2] * dq[1];
-    input[3] = input[3] * dq[1];
-    input[8] = input[8] * dq[1];
-    input[9] = input[9] * dq[1];
-    input[10] = input[10] * dq[1];
-    input[16] = input[16] * dq[1];
-    input[17] = input[17] * dq[1];
-    input[24] = input[24] * dq[1];
+    input[1] *= dq[1];
+    input[2] *= dq[1];
+    input[3] *= dq[1];
+    input[8] *= dq[1];
+    input[9] *= dq[1];
+    input[10] *= dq[1];
+    input[16] *= dq[1];
+    input[17] *= dq[1];
+    input[24] *= dq[1];
 
     vp9_short_idct10_8x8_c(input, output, 16);
 
@@ -241,48 +201,48 @@ void vp9_dequant_idct_add_8x8_c(int16_t *input, const int16_t *dq,
     input[16] = input[17] = 0;
     input[24] = 0;
 
-    add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8);
+    add_residual(output, pred, pitch, dest, stride, 8, 8);
   } else {
+    int i;
+
     // recover quantizer for 4 4x4 blocks
-    for (i = 1; i < 64; i++) {
-      input[i] = input[i] * dq[1];
-    }
-    // the idct halves ( >> 1) the pitch
-    vp9_short_idct8x8_c(input, output, 16);
+    for (i = 1; i < 64; i++)
+      input[i] *= dq[1];
 
+    // the idct halves ( >> 1) the pitch
+    vp9_short_idct8x8_c(input, output, 8 << 1);
     vpx_memset(input, 0, 128);
-
-    add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8);
-
+    add_residual(output, pred, pitch, dest, stride, 8, 8);
   }
 }
 
 void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, int16_t *input,
                                      const int16_t *dq, uint8_t *pred,
                                      uint8_t *dest, int pitch, int stride,
-                                     uint16_t eobs) {
+                                     int eob) {
   int16_t output[256];
-  int16_t *diff_ptr = output;
-  int i;
-  if (eobs == 0) {
-    /* All 0 DCT coefficient */
+
+  if (eob == 0) {
+    // All 0 DCT coefficients
     vp9_copy_mem16x16(pred, pitch, dest, stride);
-  } else if (eobs > 0) {
-    input[0]= input[0] * dq[0];
+  } else if (eob > 0) {
+    int i;
+
+    input[0] *= dq[0];
 
     // recover quantizer for 4 4x4 blocks
     for (i = 1; i < 256; i++)
-      input[i] = input[i] * dq[1];
+      input[i] *= dq[1];
 
     // inverse hybrid transform
-    vp9_ihtllm(input, output, 32, tx_type, 16, eobs);
+    vp9_short_iht16x16(input, output, 16, tx_type);
 
     // the idct halves ( >> 1) the pitch
     // vp9_short_idct16x16_c(input, output, 32);
 
     vpx_memset(input, 0, 512);
 
-    add_residual(diff_ptr, pred, pitch, dest, stride, 16, 16);
+    add_residual(output, pred, pitch, dest, stride, 16, 16);
   }
 }
 
@@ -290,8 +250,6 @@ void vp9_dequant_idct_add_16x16_c(int16_t *input, const int16_t *dq,
                                   uint8_t *pred, uint8_t *dest, int pitch,
                                   int stride, int eob) {
   int16_t output[256];
-  int16_t *diff_ptr = output;
-  int i;
 
   /* The calculation can be simplified if there are not many non-zero dct
    * coefficients. Use eobs to separate different cases. */
@@ -300,28 +258,26 @@ void vp9_dequant_idct_add_16x16_c(int16_t *input, const int16_t *dq,
     vp9_copy_mem16x16(pred, pitch, dest, stride);
   } else if (eob == 1) {
     /* DC only DCT coefficient. */
+    int16_t in = input[0] * dq[0];
     int16_t out;
-
     /* Note: the idct1 will need to be modified accordingly whenever
      * vp9_short_idct16x16_c() is modified. */
-    out = (input[0] * dq[0] + 2) >> 2;
-    out = (out + 2) >> 2;
-    out = (out + 4) >> 3;
-
+    vp9_short_idct1_16x16_c(&in, &out);
     input[0] = 0;
 
     add_constant_residual(out, pred, pitch, dest, stride, 16, 16);
   } else if (eob <= 10) {
-    input[0]= input[0] * dq[0];
-    input[1] = input[1] * dq[1];
-    input[2] = input[2] * dq[1];
-    input[3] = input[3] * dq[1];
-    input[16] = input[16] * dq[1];
-    input[17] = input[17] * dq[1];
-    input[18] = input[18] * dq[1];
-    input[32] = input[32] * dq[1];
-    input[33] = input[33] * dq[1];
-    input[48] = input[48] * dq[1];
+    input[0] *= dq[0];
+
+    input[1] *= dq[1];
+    input[2] *= dq[1];
+    input[3] *= dq[1];
+    input[16] *= dq[1];
+    input[17] *= dq[1];
+    input[18] *= dq[1];
+    input[32] *= dq[1];
+    input[33] *= dq[1];
+    input[48] *= dq[1];
 
     // the idct halves ( >> 1) the pitch
     vp9_short_idct10_16x16_c(input, output, 32);
@@ -331,20 +287,22 @@ void vp9_dequant_idct_add_16x16_c(int16_t *input, const int16_t *dq,
     input[32] = input[33] = 0;
     input[48] = 0;
 
-    add_residual(diff_ptr, pred, pitch, dest, stride, 16, 16);
+    add_residual(output, pred, pitch, dest, stride, 16, 16);
   } else {
-    input[0]= input[0] * dq[0];
+    int i;
+
+    input[0] *= dq[0];
 
     // recover quantizer for 4 4x4 blocks
     for (i = 1; i < 256; i++)
-      input[i] = input[i] * dq[1];
+      input[i] *= dq[1];
 
     // the idct halves ( >> 1) the pitch
-    vp9_short_idct16x16_c(input, output, 32);
+    vp9_short_idct16x16_c(input, output, 16 << 1);
 
     vpx_memset(input, 0, 512);
 
-    add_residual(diff_ptr, pred, pitch, dest, stride, 16, 16);
+    add_residual(output, pred, pitch, dest, stride, 16, 16);
   }
 }
 
@@ -352,23 +310,51 @@ void vp9_dequant_idct_add_32x32_c(int16_t *input, const int16_t *dq,
                                   uint8_t *pred, uint8_t *dest, int pitch,
                                   int stride, int eob) {
   int16_t output[1024];
-  int i;
-
-  input[0]= input[0] * dq[0] / 2;
-  for (i = 1; i < 1024; i++)
-    input[i] = input[i] * dq[1] / 2;
-  vp9_short_idct32x32_c(input, output, 64);
-  vpx_memset(input, 0, 2048);
 
-  add_residual(output, pred, pitch, dest, stride, 32, 32);
+  if (eob) {
+    input[0] = input[0] * dq[0] / 2;
+    if (eob == 1) {
+      vp9_short_idct1_32x32(input, output);
+      add_constant_residual(output[0], pred, pitch, dest, stride, 32, 32);
+      input[0] = 0;
+    } else if (eob <= 10) {
+      input[1] = input[1] * dq[1] / 2;
+      input[2] = input[2] * dq[1] / 2;
+      input[3] = input[3] * dq[1] / 2;
+      input[32] = input[32] * dq[1] / 2;
+      input[33] = input[33] * dq[1] / 2;
+      input[34] = input[34] * dq[1] / 2;
+      input[64] = input[64] * dq[1] / 2;
+      input[65] = input[65] * dq[1] / 2;
+      input[96] = input[96] * dq[1] / 2;
+
+      // the idct halves ( >> 1) the pitch
+      vp9_short_idct10_32x32(input, output, 64);
+
+      input[0] = input[1] = input[2] = input[3] = 0;
+      input[32] = input[33] = input[34] = 0;
+      input[64] = input[65] = 0;
+      input[96] = 0;
+
+      add_residual(output, pred, pitch, dest, stride, 32, 32);
+    } else {
+      int i;
+      for (i = 1; i < 1024; i++)
+        input[i] = input[i] * dq[1] / 2;
+      vp9_short_idct32x32(input, output, 64);
+      vpx_memset(input, 0, 2048);
+      add_residual(output, pred, pitch, dest, stride, 32, 32);
+    }
+  }
 }
 
 void vp9_dequant_idct_add_uv_block_16x16_c(int16_t *q, const int16_t *dq,
                                            uint8_t *dstu,
                                            uint8_t *dstv,
                                            int stride,
-                                           uint16_t *eobs) {
-  vp9_dequant_idct_add_16x16_c(q, dq, dstu, dstu, stride, stride, eobs[0]);
-  vp9_dequant_idct_add_16x16_c(q + 256, dq,
-                               dstv, dstv, stride, stride, eobs[4]);
+                                           MACROBLOCKD *xd) {
+  vp9_dequant_idct_add_16x16_c(q, dq, dstu, dstu, stride, stride,
+                               xd->eobs[16]);
+  vp9_dequant_idct_add_16x16_c(q + 256, dq, dstv, dstv, stride, stride,
+                               xd->eobs[20]);
 }
diff --git a/vp9/decoder/vp9_dequantize.h b/vp9/decoder/vp9_dequantize.h
index 2a0ae80e8..bde27bb7a 100644
--- a/vp9/decoder/vp9_dequantize.h
+++ b/vp9/decoder/vp9_dequantize.h
@@ -11,91 +11,86 @@
 
 #ifndef VP9_DECODER_VP9_DEQUANTIZE_H_
 #define VP9_DECODER_VP9_DEQUANTIZE_H_
+
 #include "vp9/common/vp9_blockd.h"
 
-#if CONFIG_LOSSLESS
-extern void vp9_dequant_idct_add_lossless_c(int16_t *input, const int16_t *dq,
-                                            unsigned char *pred,
-                                            unsigned char *output,
-                                            int pitch, int stride);
-extern void vp9_dequant_dc_idct_add_lossless_c(int16_t *input, const int16_t *dq,
-                                               unsigned char *pred,
-                                               unsigned char *output,
-                                               int pitch, int stride, int dc);
-extern void vp9_dequant_dc_idct_add_y_block_lossless_c(int16_t *q,
-                                                       const int16_t *dq,
-                                                       unsigned char *pre,
-                                                       unsigned char *dst,
-                                                       int stride,
-                                                       uint16_t *eobs,
-                                                       const int16_t *dc);
-extern void vp9_dequant_idct_add_y_block_lossless_c(int16_t *q, const int16_t *dq,
-                                                    unsigned char *pre,
-                                                    unsigned char *dst,
-                                                    int stride,
-                                                    uint16_t *eobs);
-extern void vp9_dequant_idct_add_uv_block_lossless_c(int16_t *q, const int16_t *dq,
-                                                     unsigned char *pre,
-                                                     unsigned char *dst_u,
-                                                     unsigned char *dst_v,
-                                                     int stride,
-                                                     uint16_t *eobs);
-#endif
-
-typedef void (*vp9_dequant_idct_add_fn_t)(int16_t *input, const int16_t *dq,
-    unsigned char *pred, unsigned char *output, int pitch, int stride);
-typedef void(*vp9_dequant_dc_idct_add_fn_t)(int16_t *input, const int16_t *dq,
-    unsigned char *pred, unsigned char *output, int pitch, int stride, int dc);
-
-typedef void(*vp9_dequant_dc_idct_add_y_block_fn_t)(int16_t *q, const int16_t *dq,
-    unsigned char *pre, unsigned char *dst, int stride, uint16_t *eobs,
-    const int16_t *dc);
-typedef void(*vp9_dequant_idct_add_y_block_fn_t)(int16_t *q, const int16_t *dq,
-    unsigned char *pre, unsigned char *dst, int stride, uint16_t *eobs);
-typedef void(*vp9_dequant_idct_add_uv_block_fn_t)(int16_t *q, const int16_t *dq,
-    unsigned char *pre, unsigned char *dst_u, unsigned char *dst_v, int stride,
-    uint16_t *eobs);
+
+void vp9_dequant_idct_add_lossless_c(int16_t *input, const int16_t *dq,
+                                     unsigned char *pred,
+                                     unsigned char *output,
+                                     int pitch, int stride, int eob);
+
+void vp9_dequant_dc_idct_add_lossless_c(int16_t *input, const int16_t *dq,
+                                        unsigned char *pred,
+                                        unsigned char *output,
+                                        int pitch, int stride, int dc);
+
+void vp9_dequant_dc_idct_add_y_block_lossless_c(int16_t *q,
+                                                const int16_t *dq,
+                                                unsigned char *pre,
+                                                unsigned char *dst,
+                                                int stride,
+                                                const int16_t *dc);
+
+void vp9_dequant_idct_add_y_block_lossless_c(int16_t *q, const int16_t *dq,
+                                             unsigned char *pre,
+                                             unsigned char *dst,
+                                             int stride,
+                                             struct macroblockd *xd);
+
+void vp9_dequant_idct_add_uv_block_lossless_c(int16_t *q, const int16_t *dq,
+                                              unsigned char *pre,
+                                              unsigned char *dst_u,
+                                              unsigned char *dst_v,
+                                              int stride,
+                                              struct macroblockd *xd);
 
 void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input, const int16_t *dq,
                                     unsigned char *pred, unsigned char *dest,
-                                    int pitch, int stride, uint16_t eobs);
+                                    int pitch, int stride, int eob);
 
 void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input,
                                    const int16_t *dq, unsigned char *pred,
                                    unsigned char *dest, int pitch, int stride,
-                                   uint16_t eobs);
+                                   int eob);
 
 void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, int16_t *input,
                                      const int16_t *dq, unsigned char *pred,
                                      unsigned char *dest,
-                                     int pitch, int stride, uint16_t eobs);
+                                     int pitch, int stride, int eob);
 
 void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(int16_t *q, const int16_t *dq,
                                                    unsigned char *dst,
                                                    int stride,
-                                                   uint16_t *eobs,
                                                    const int16_t *dc,
                                                    MACROBLOCKD *xd);
 
+void vp9_dequant_idct_add_y_block_8x8_inplace_c(int16_t *q, const int16_t *dq,
+                                                unsigned char *dst,
+                                                int stride,
+                                                MACROBLOCKD *xd);
+
 void vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(int16_t *q, const int16_t *dq,
                                                    unsigned char *dst,
                                                    int stride,
-                                                   uint16_t *eobs,
                                                    const int16_t *dc,
                                                    MACROBLOCKD *xd);
 
+void vp9_dequant_idct_add_y_block_4x4_inplace_c(int16_t *q, const int16_t *dq,
+                                                unsigned char *dst,
+                                                int stride,
+                                                MACROBLOCKD *xd);
+
 void vp9_dequant_idct_add_uv_block_8x8_inplace_c(int16_t *q, const int16_t *dq,
                                                  unsigned char *dstu,
                                                  unsigned char *dstv,
                                                  int stride,
-                                                 uint16_t *eobs,
                                                  MACROBLOCKD *xd);
 
 void vp9_dequant_idct_add_uv_block_4x4_inplace_c(int16_t *q, const int16_t *dq,
                                                  unsigned char *dstu,
                                                  unsigned char *dstv,
                                                  int stride,
-                                                 uint16_t *eobs,
                                                  MACROBLOCKD *xd);
 
-#endif
+#endif  // VP9_DECODER_VP9_DEQUANTIZE_H_
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 335c335ca..d3fb25ace 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -63,24 +63,11 @@ static int get_signed(BOOL_DECODER *br, int value_to_sign) {
   return decode_bool(br, 128) ? -value_to_sign : value_to_sign;
 }
 
-#if CONFIG_NEWCOEFCONTEXT
-#define PT pn
-#define INCREMENT_COUNT(token)                       \
-  do {                                               \
-    coef_counts[type][coef_bands[c]][pn][token]++;   \
-    pn = pt = vp9_prev_token_class[token];           \
-    if (c < seg_eob - 1 && NEWCOEFCONTEXT_BAND_COND(coef_bands[c + 1]))  \
-      pn = vp9_get_coef_neighbor_context(            \
-          qcoeff_ptr, nodc, neighbors, scan[c + 1]); \
-  } while (0)
-#else
-#define PT pt
 #define INCREMENT_COUNT(token)               \
   do {                                       \
-    coef_counts[type][coef_bands[c]][pt][token]++; \
-    pt = vp9_prev_token_class[token];              \
+    coef_counts[type][ref][get_coef_band(txfm_size, c)][pt][token]++;     \
+    pt = vp9_get_coef_context(&recent_energy, token);         \
   } while (0)
-#endif  /* CONFIG_NEWCOEFCONTEXT */
 
 #define WRITE_COEF_CONTINUE(val, token)                       \
   {                                                           \
@@ -97,77 +84,90 @@ static int get_signed(BOOL_DECODER *br, int value_to_sign) {
   } while (0);
 
 static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd,
-                        BOOL_DECODER* const br,
-                        ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                        PLANE_TYPE type,
-                        TX_TYPE tx_type,
+                        BOOL_DECODER* const br, int block_idx,
+                        PLANE_TYPE type, TX_TYPE tx_type,
                         int seg_eob, int16_t *qcoeff_ptr,
-                        const int *const scan, TX_SIZE txfm_size,
-                        const int *coef_bands) {
+                        const int *const scan, TX_SIZE txfm_size) {
+  ENTROPY_CONTEXT* const A0 = (ENTROPY_CONTEXT *) xd->above_context;
+  ENTROPY_CONTEXT* const L0 = (ENTROPY_CONTEXT *) xd->left_context;
+  const int aidx = vp9_block2above[txfm_size][block_idx];
+  const int lidx = vp9_block2left[txfm_size][block_idx];
+  ENTROPY_CONTEXT above_ec = A0[aidx] != 0, left_ec = L0[lidx] != 0;
   FRAME_CONTEXT *const fc = &dx->common.fc;
-#if CONFIG_NEWCOEFCONTEXT
-  const int *neighbors;
-  int pn;
-#endif
-  int nodc = (type == PLANE_TYPE_Y_NO_DC);
-  int pt, c = nodc;
+  int recent_energy = 0;
+  int pt, c = 0;
   vp9_coeff_probs *coef_probs;
   vp9_prob *prob;
   vp9_coeff_count *coef_counts;
+  const int ref = xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME;
 
   switch (txfm_size) {
     default:
     case TX_4X4:
-      if (tx_type == DCT_DCT) {
-        coef_probs  = fc->coef_probs_4x4;
-        coef_counts = fc->coef_counts_4x4;
-      } else {
-        coef_probs  = fc->hybrid_coef_probs_4x4;
-        coef_counts = fc->hybrid_coef_counts_4x4;
-      }
+      coef_probs  = fc->coef_probs_4x4;
+      coef_counts = fc->coef_counts_4x4;
       break;
     case TX_8X8:
-      if (tx_type == DCT_DCT) {
-        coef_probs  = fc->coef_probs_8x8;
-        coef_counts = fc->coef_counts_8x8;
-      } else {
-        coef_probs  = fc->hybrid_coef_probs_8x8;
-        coef_counts = fc->hybrid_coef_counts_8x8;
-      }
+      coef_probs  = fc->coef_probs_8x8;
+      coef_counts = fc->coef_counts_8x8;
+      above_ec = (A0[aidx] + A0[aidx + 1]) != 0;
+      left_ec  = (L0[lidx] + L0[lidx + 1]) != 0;
       break;
     case TX_16X16:
-      if (tx_type == DCT_DCT) {
-        coef_probs  = fc->coef_probs_16x16;
-        coef_counts = fc->coef_counts_16x16;
+      coef_probs  = fc->coef_probs_16x16;
+      coef_counts = fc->coef_counts_16x16;
+      if (type == PLANE_TYPE_UV) {
+        ENTROPY_CONTEXT *A1 = (ENTROPY_CONTEXT *) (xd->above_context + 1);
+        ENTROPY_CONTEXT *L1 = (ENTROPY_CONTEXT *) (xd->left_context + 1);
+        above_ec = (A0[aidx] + A0[aidx + 1] + A1[aidx] + A1[aidx + 1]) != 0;
+        left_ec  = (L0[lidx] + L0[lidx + 1] + L1[lidx] + L1[lidx + 1]) != 0;
       } else {
-        coef_probs  = fc->hybrid_coef_probs_16x16;
-        coef_counts = fc->hybrid_coef_counts_16x16;
+        above_ec = (A0[aidx] + A0[aidx + 1] + A0[aidx + 2] + A0[aidx + 3]) != 0;
+        left_ec  = (L0[lidx] + L0[lidx + 1] + L0[lidx + 2] + L0[lidx + 3]) != 0;
       }
       break;
     case TX_32X32:
       coef_probs = fc->coef_probs_32x32;
       coef_counts = fc->coef_counts_32x32;
+      if (type == PLANE_TYPE_UV) {
+        ENTROPY_CONTEXT *A1 = (ENTROPY_CONTEXT *) (xd->above_context + 1);
+        ENTROPY_CONTEXT *L1 = (ENTROPY_CONTEXT *) (xd->left_context + 1);
+        ENTROPY_CONTEXT *A2 = (ENTROPY_CONTEXT *) (xd->above_context + 2);
+        ENTROPY_CONTEXT *L2 = (ENTROPY_CONTEXT *) (xd->left_context + 2);
+        ENTROPY_CONTEXT *A3 = (ENTROPY_CONTEXT *) (xd->above_context + 3);
+        ENTROPY_CONTEXT *L3 = (ENTROPY_CONTEXT *) (xd->left_context + 3);
+        above_ec = (A0[aidx] + A0[aidx + 1] + A1[aidx] + A1[aidx + 1] +
+                    A2[aidx] + A2[aidx + 1] + A3[aidx] + A3[aidx + 1]) != 0;
+        left_ec  = (L0[lidx] + L0[lidx + 1] + L1[lidx] + L1[lidx + 1] +
+                    L2[lidx] + L2[lidx + 1] + L3[lidx] + L3[lidx + 1]) != 0;
+      } else {
+        ENTROPY_CONTEXT *A1 = (ENTROPY_CONTEXT *) (xd->above_context + 1);
+        ENTROPY_CONTEXT *L1 = (ENTROPY_CONTEXT *) (xd->left_context + 1);
+        above_ec = (A0[aidx] + A0[aidx + 1] + A0[aidx + 2] + A0[aidx + 3] +
+                    A1[aidx] + A1[aidx + 1] + A1[aidx + 2] + A1[aidx + 3]) != 0;
+        left_ec  = (L0[lidx] + L0[lidx + 1] + L0[lidx + 2] + L0[lidx + 3] +
+                    L1[lidx] + L1[lidx + 1] + L1[lidx + 2] + L1[lidx + 3]) != 0;
+      }
       break;
   }
 
-  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
-#if CONFIG_NEWCOEFCONTEXT
-  pn = pt;
-  neighbors = vp9_get_coef_neighbors_handle(scan);
-#endif
+  VP9_COMBINEENTROPYCONTEXTS(pt, above_ec, left_ec);
   while (1) {
     int val;
     const uint8_t *cat6 = cat6_prob;
-    if (c >= seg_eob) break;
-    prob = coef_probs[type][coef_bands[c]][PT];
+
+    if (c >= seg_eob)
+      break;
+    prob = coef_probs[type][ref][get_coef_band(txfm_size, c)][pt];
     if (!vp9_read(br, prob[EOB_CONTEXT_NODE]))
       break;
 SKIP_START:
-    if (c >= seg_eob) break;
+    if (c >= seg_eob)
+      break;
     if (!vp9_read(br, prob[ZERO_CONTEXT_NODE])) {
       INCREMENT_COUNT(ZERO_TOKEN);
       ++c;
-      prob = coef_probs[type][coef_bands[c]][PT];
+      prob = coef_probs[type][ref][get_coef_band(txfm_size, c)][pt];
       goto SKIP_START;
     }
     // ONE_CONTEXT_NODE_0_
@@ -231,193 +231,110 @@ SKIP_START:
   }
 
   if (c < seg_eob)
-    coef_counts[type][coef_bands[c]][PT][DCT_EOB_TOKEN]++;
-
-  a[0] = l[0] = (c > !type);
+    coef_counts[type][ref][get_coef_band(txfm_size, c)][pt][DCT_EOB_TOKEN]++;
+
+  A0[aidx] = L0[lidx] = c > 0;
+  if (txfm_size >= TX_8X8) {
+    A0[aidx + 1] = L0[lidx + 1] = A0[aidx];
+    if (txfm_size >= TX_16X16) {
+      if (type == PLANE_TYPE_UV) {
+        ENTROPY_CONTEXT *A1 = (ENTROPY_CONTEXT *) (xd->above_context + 1);
+        ENTROPY_CONTEXT *L1 = (ENTROPY_CONTEXT *) (xd->left_context + 1);
+        A1[aidx] = A1[aidx + 1] = L1[lidx] = L1[lidx + 1] = A0[aidx];
+        if (txfm_size >= TX_32X32) {
+          ENTROPY_CONTEXT *A2 = (ENTROPY_CONTEXT *) (xd->above_context + 2);
+          ENTROPY_CONTEXT *L2 = (ENTROPY_CONTEXT *) (xd->left_context + 2);
+          ENTROPY_CONTEXT *A3 = (ENTROPY_CONTEXT *) (xd->above_context + 3);
+          ENTROPY_CONTEXT *L3 = (ENTROPY_CONTEXT *) (xd->left_context + 3);
+          A2[aidx] = A2[aidx + 1] = A3[aidx] = A3[aidx + 1] = A0[aidx];
+          L2[lidx] = L2[lidx + 1] = L3[lidx] = L3[lidx + 1] = A0[aidx];
+        }
+      } else {
+        A0[aidx + 2] = A0[aidx + 3] = L0[lidx + 2] = L0[lidx + 3] = A0[aidx];
+        if (txfm_size >= TX_32X32) {
+          ENTROPY_CONTEXT *A1 = (ENTROPY_CONTEXT *) (xd->above_context + 1);
+          ENTROPY_CONTEXT *L1 = (ENTROPY_CONTEXT *) (xd->left_context + 1);
+          A1[aidx] = A1[aidx + 1] = A1[aidx + 2] = A1[aidx + 3] = A0[aidx];
+          L1[lidx] = L1[lidx + 1] = L1[lidx + 2] = L1[lidx + 3] = A0[aidx];
+        }
+      }
+    }
+  }
 
   return c;
 }
 
 static int get_eob(MACROBLOCKD* const xd, int segment_id, int eob_max) {
-  int active = vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB);
-  int eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-
-  if (!active || eob > eob_max)
-    eob = eob_max;
-  return eob;
+  return vp9_get_segdata(xd, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
 }
 
 int vp9_decode_sb_tokens(VP9D_COMP* const pbi,
                          MACROBLOCKD* const xd,
                          BOOL_DECODER* const bc) {
-  ENTROPY_CONTEXT* const A = (ENTROPY_CONTEXT *)xd->above_context;
-  ENTROPY_CONTEXT* const L = (ENTROPY_CONTEXT *)xd->left_context;
-  ENTROPY_CONTEXT* const A1 = (ENTROPY_CONTEXT *)(&xd->above_context[1]);
-  ENTROPY_CONTEXT* const L1 = (ENTROPY_CONTEXT *)(&xd->left_context[1]);
-  uint16_t *const eobs = xd->eobs;
   const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  int c, i, eobtotal = 0, seg_eob;
+  int i, eobtotal = 0, seg_eob;
 
   // Luma block
-#if CONFIG_CNVCONTEXT
-  ENTROPY_CONTEXT above_ec = (A[0] + A[1] + A[2] + A[3] +
-                              A1[0] + A1[1] + A1[2] + A1[3]) != 0;
-  ENTROPY_CONTEXT left_ec =  (L[0] + L[1] + L[2] + L[3] +
-                              L1[0] + L1[1] + L1[2] + L1[3]) != 0;
-#else
-  ENTROPY_CONTEXT above_ec = A[0];
-  ENTROPY_CONTEXT left_ec =  L[0];
-#endif
-  eobs[0] = c = decode_coefs(pbi, xd, bc, &above_ec, &left_ec,
-                             PLANE_TYPE_Y_WITH_DC,
-                             DCT_DCT, get_eob(xd, segment_id, 1024),
-                             xd->sb_coeff_data.qcoeff,
-                             vp9_default_zig_zag1d_32x32,
-                             TX_32X32, vp9_coef_bands_32x32);
-  A[1] = A[2] = A[3] = A[0] = above_ec;
-  L[1] = L[2] = L[3] = L[0] = left_ec;
-  A1[1] = A1[2] = A1[3] = A1[0] = above_ec;
-  L1[1] = L1[2] = L1[3] = L1[0] = left_ec;
-
+  int c = decode_coefs(pbi, xd, bc, 0, PLANE_TYPE_Y_WITH_DC,
+                       DCT_DCT, get_eob(xd, segment_id, 1024),
+                       xd->sb_coeff_data.qcoeff,
+                       vp9_default_zig_zag1d_32x32, TX_32X32);
+  xd->eobs[0] = c;
   eobtotal += c;
 
   // 16x16 chroma blocks
   seg_eob = get_eob(xd, segment_id, 256);
-
   for (i = 16; i < 24; i += 4) {
-    ENTROPY_CONTEXT* const a = A + vp9_block2above[TX_16X16][i];
-    ENTROPY_CONTEXT* const l = L + vp9_block2left[TX_16X16][i];
-    ENTROPY_CONTEXT* const a1 = A1 + vp9_block2above[TX_16X16][i];
-    ENTROPY_CONTEXT* const l1 = L1 + vp9_block2left[TX_16X16][i];
-#if CONFIG_CNVCONTEXT
-    above_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
-    left_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
-#else
-    above_ec = a[0];
-    left_ec = l[0];
-#endif
-
-    eobs[i] = c = decode_coefs(pbi, xd, bc,
-                               &above_ec, &left_ec,
-                               PLANE_TYPE_UV,
-                               DCT_DCT, seg_eob,
-                               xd->sb_coeff_data.qcoeff + 1024 + (i - 16) * 64,
-                               vp9_default_zig_zag1d_16x16,
-                               TX_16X16, vp9_coef_bands_16x16);
-
-    a1[1] = a1[0] = a[1] = a[0] = above_ec;
-    l1[1] = l1[0] = l[1] = l[0] = left_ec;
+    c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, DCT_DCT, seg_eob,
+                     xd->sb_coeff_data.qcoeff + 1024 + (i - 16) * 64,
+                     vp9_default_zig_zag1d_16x16, TX_16X16);
+    xd->eobs[i] = c;
     eobtotal += c;
   }
-  // no Y2 block
-  A[8] = L[8] = A1[8] = L1[8] = 0;
+
   return eobtotal;
 }
 
 static int vp9_decode_mb_tokens_16x16(VP9D_COMP* const pbi,
                                       MACROBLOCKD* const xd,
                                       BOOL_DECODER* const bc) {
-  ENTROPY_CONTEXT* const A = (ENTROPY_CONTEXT *)xd->above_context;
-  ENTROPY_CONTEXT* const L = (ENTROPY_CONTEXT *)xd->left_context;
-  uint16_t *const eobs = xd->eobs;
   const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  int c, i, eobtotal = 0, seg_eob;
-  // Luma block
+  int i, eobtotal = 0, seg_eob;
 
-#if CONFIG_CNVCONTEXT
-  ENTROPY_CONTEXT above_ec = (A[0] + A[1] + A[2] + A[3]) != 0;
-  ENTROPY_CONTEXT left_ec = (L[0] + L[1] + L[2] + L[3]) != 0;
-#else
-  ENTROPY_CONTEXT above_ec = A[0];
-  ENTROPY_CONTEXT left_ec = L[0];
-#endif
-  eobs[0] = c = decode_coefs(pbi, xd, bc, &above_ec, &left_ec,
-                             PLANE_TYPE_Y_WITH_DC,
-                             get_tx_type(xd, &xd->block[0]),
-                             get_eob(xd, segment_id, 256),
-                             xd->qcoeff, vp9_default_zig_zag1d_16x16,
-                             TX_16X16, vp9_coef_bands_16x16);
-  A[1] = A[2] = A[3] = A[0] = above_ec;
-  L[1] = L[2] = L[3] = L[0] = left_ec;
+  // Luma block
+  int c = decode_coefs(pbi, xd, bc, 0, PLANE_TYPE_Y_WITH_DC,
+                       get_tx_type(xd, &xd->block[0]),
+                       get_eob(xd, segment_id, 256),
+                       xd->qcoeff, vp9_default_zig_zag1d_16x16, TX_16X16);
+  xd->eobs[0] = c;
   eobtotal += c;
 
   // 8x8 chroma blocks
   seg_eob = get_eob(xd, segment_id, 64);
   for (i = 16; i < 24; i += 4) {
-    ENTROPY_CONTEXT* const a = A + vp9_block2above[TX_8X8][i];
-    ENTROPY_CONTEXT* const l = L + vp9_block2left[TX_8X8][i];
-#if CONFIG_CNVCONTEXT
-    above_ec = (a[0] + a[1]) != 0;
-    left_ec = (l[0] + l[1]) != 0;
-#else
-    above_ec = a[0];
-    left_ec = l[0];
-#endif
-    eobs[i] = c = decode_coefs(pbi, xd, bc,
-                               &above_ec, &left_ec,
-                               PLANE_TYPE_UV,
-                               DCT_DCT, seg_eob, xd->block[i].qcoeff,
-                               vp9_default_zig_zag1d_8x8,
-                               TX_8X8, vp9_coef_bands_8x8);
-    a[1] = a[0] = above_ec;
-    l[1] = l[0] = left_ec;
+    c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV,
+                     DCT_DCT, seg_eob, xd->block[i].qcoeff,
+                     vp9_default_zig_zag1d_8x8, TX_8X8);
+    xd->eobs[i] = c;
     eobtotal += c;
   }
-  A[8] = 0;
-  L[8] = 0;
   return eobtotal;
 }
 
 static int vp9_decode_mb_tokens_8x8(VP9D_COMP* const pbi,
                                     MACROBLOCKD* const xd,
                                     BOOL_DECODER* const bc) {
-  ENTROPY_CONTEXT *const A = (ENTROPY_CONTEXT *)xd->above_context;
-  ENTROPY_CONTEXT *const L = (ENTROPY_CONTEXT *)xd->left_context;
-  uint16_t *const eobs = xd->eobs;
-  PLANE_TYPE type;
   int c, i, eobtotal = 0, seg_eob;
   const int segment_id = xd->mode_info_context->mbmi.segment_id;
 
-  int has_2nd_order = get_2nd_order_usage(xd);
-  // 2nd order DC block
-  if (has_2nd_order) {
-    ENTROPY_CONTEXT *const a = A + vp9_block2above[TX_8X8][24];
-    ENTROPY_CONTEXT *const l = L + vp9_block2left[TX_8X8][24];
-
-    eobs[24] = c = decode_coefs(pbi, xd, bc, a, l, PLANE_TYPE_Y2,
-                                DCT_DCT, get_eob(xd, segment_id, 4),
-                                xd->block[24].qcoeff,
-                                vp9_default_zig_zag1d_4x4, TX_8X8,
-                                vp9_coef_bands_4x4);
-    eobtotal += c - 4;
-    type = PLANE_TYPE_Y_NO_DC;
-  } else {
-    xd->above_context->y2 = 0;
-    xd->left_context->y2 = 0;
-    eobs[24] = 0;
-    type = PLANE_TYPE_Y_WITH_DC;
-  }
-
   // luma blocks
   seg_eob = get_eob(xd, segment_id, 64);
   for (i = 0; i < 16; i += 4) {
-    ENTROPY_CONTEXT *const a = A + vp9_block2above[TX_8X8][i];
-    ENTROPY_CONTEXT *const l = L + vp9_block2left[TX_8X8][i];
-#if CONFIG_CNVCONTEXT
-    ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0;
-    ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0;
-#else
-    ENTROPY_CONTEXT above_ec = a[0];
-    ENTROPY_CONTEXT left_ec = l[0];
-#endif
-    eobs[i] = c = decode_coefs(pbi, xd, bc, &above_ec, &left_ec, type,
-                               type == PLANE_TYPE_Y_WITH_DC ?
-                               get_tx_type(xd, xd->block + i) : DCT_DCT,
-                               seg_eob, xd->block[i].qcoeff,
-                               vp9_default_zig_zag1d_8x8,
-                               TX_8X8, vp9_coef_bands_8x8);
-    a[1] = a[0] = above_ec;
-    l[1] = l[0] = left_ec;
+    c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC,
+                     get_tx_type(xd, xd->block + i),
+                     seg_eob, xd->block[i].qcoeff,
+                     vp9_default_zig_zag1d_8x8, TX_8X8);
+    xd->eobs[i] = c;
     eobtotal += c;
   }
 
@@ -427,34 +344,18 @@ static int vp9_decode_mb_tokens_8x8(VP9D_COMP* const pbi,
     // use 4x4 transform for U, V components in I8X8/splitmv prediction mode
     seg_eob = get_eob(xd, segment_id, 16);
     for (i = 16; i < 24; i++) {
-      ENTROPY_CONTEXT *const a = A + vp9_block2above[TX_4X4][i];
-      ENTROPY_CONTEXT *const l = L + vp9_block2left[TX_4X4][i];
-
-      eobs[i] = c = decode_coefs(pbi, xd, bc, a, l, PLANE_TYPE_UV,
-                                 DCT_DCT, seg_eob, xd->block[i].qcoeff,
-                                 vp9_default_zig_zag1d_4x4, TX_4X4,
-                                 vp9_coef_bands_4x4);
+      c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV,
+                       DCT_DCT, seg_eob, xd->block[i].qcoeff,
+                       vp9_default_zig_zag1d_4x4, TX_4X4);
+      xd->eobs[i] = c;
       eobtotal += c;
     }
   } else {
     for (i = 16; i < 24; i += 4) {
-      ENTROPY_CONTEXT *const a = A + vp9_block2above[TX_8X8][i];
-      ENTROPY_CONTEXT *const l = L + vp9_block2left[TX_8X8][i];
-#if CONFIG_CNVCONTEXT
-      ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0;
-      ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0;
-#else
-      ENTROPY_CONTEXT above_ec = a[0];
-      ENTROPY_CONTEXT left_ec = l[0];
-#endif
-      eobs[i] = c = decode_coefs(pbi, xd, bc,
-                                 &above_ec, &left_ec,
-                                 PLANE_TYPE_UV,
-                                 DCT_DCT, seg_eob, xd->block[i].qcoeff,
-                                 vp9_default_zig_zag1d_8x8,
-                                 TX_8X8, vp9_coef_bands_8x8);
-      a[1] = a[0] = above_ec;
-      l[1] = l[0] = left_ec;
+      c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV,
+                       DCT_DCT, seg_eob, xd->block[i].qcoeff,
+                       vp9_default_zig_zag1d_8x8, TX_8X8);
+      xd->eobs[i] = c;
       eobtotal += c;
     }
   }
@@ -466,17 +367,9 @@ static int decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd,
                             BOOL_DECODER* const bc,
                             PLANE_TYPE type, int i, int seg_eob,
                             TX_TYPE tx_type, const int *scan) {
-  ENTROPY_CONTEXT *const A = (ENTROPY_CONTEXT *)xd->above_context;
-  ENTROPY_CONTEXT *const L = (ENTROPY_CONTEXT *)xd->left_context;
-  ENTROPY_CONTEXT *const a = A + vp9_block2above[TX_4X4][i];
-  ENTROPY_CONTEXT *const l = L + vp9_block2left[TX_4X4][i];
-  uint16_t *const eobs = xd->eobs;
-  int c;
-
-  c = decode_coefs(dx, xd, bc, a, l, type, tx_type, seg_eob,
-                   xd->block[i].qcoeff, scan, TX_4X4, vp9_coef_bands_4x4);
-  eobs[i] = c;
-
+  int c = decode_coefs(dx, xd, bc, i, type, tx_type, seg_eob,
+                       xd->block[i].qcoeff, scan, TX_4X4);
+  xd->eobs[i] = c;
   return c;
 }
 
@@ -539,26 +432,13 @@ static int vp9_decode_mb_tokens_4x4(VP9D_COMP* const dx,
                                     MACROBLOCKD* const xd,
                                     BOOL_DECODER* const bc) {
   int i, eobtotal = 0;
-  PLANE_TYPE type;
   const int segment_id = xd->mode_info_context->mbmi.segment_id;
   const int seg_eob = get_eob(xd, segment_id, 16);
-  const int has_2nd_order = get_2nd_order_usage(xd);
-
-  // 2nd order DC block
-  if (has_2nd_order) {
-    eobtotal += decode_coefs_4x4(dx, xd, bc, PLANE_TYPE_Y2, 24, seg_eob,
-                                 DCT_DCT, vp9_default_zig_zag1d_4x4) - 16;
-    type = PLANE_TYPE_Y_NO_DC;
-  } else {
-    xd->above_context->y2 = 0;
-    xd->left_context->y2 = 0;
-    xd->eobs[24] = 0;
-    type = PLANE_TYPE_Y_WITH_DC;
-  }
 
   // luma blocks
   for (i = 0; i < 16; ++i) {
-    eobtotal += decode_coefs_4x4_y(dx, xd, bc, type, i, seg_eob);
+    eobtotal += decode_coefs_4x4_y(dx, xd, bc,
+                                   PLANE_TYPE_Y_WITH_DC, i, seg_eob);
   }
 
   // chroma blocks
@@ -571,16 +451,13 @@ int vp9_decode_mb_tokens(VP9D_COMP* const dx,
                          MACROBLOCKD* const xd,
                          BOOL_DECODER* const bc) {
   const TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
-  int eobtotal;
-
-  if (tx_size == TX_16X16) {
-    eobtotal = vp9_decode_mb_tokens_16x16(dx, xd, bc);
-  } else if (tx_size == TX_8X8) {
-    eobtotal = vp9_decode_mb_tokens_8x8(dx, xd, bc);
-  } else {
-    assert(tx_size == TX_4X4);
-    eobtotal = vp9_decode_mb_tokens_4x4(dx, xd, bc);
+  switch (tx_size) {
+    case TX_16X16:
+      return vp9_decode_mb_tokens_16x16(dx, xd, bc);
+    case TX_8X8:
+      return vp9_decode_mb_tokens_8x8(dx, xd, bc);
+    default:
+      assert(tx_size == TX_4X4);
+      return vp9_decode_mb_tokens_4x4(dx, xd, bc);
   }
-
-  return eobtotal;
 }
diff --git a/vp9/decoder/vp9_idct_blk.c b/vp9/decoder/vp9_idct_blk.c
index 152527cff..b17955b1c 100644
--- a/vp9/decoder/vp9_idct_blk.c
+++ b/vp9/decoder/vp9_idct_blk.c
@@ -10,54 +10,20 @@
 
 #include "vp9_rtcd.h"
 #include "vp9/common/vp9_blockd.h"
-#if CONFIG_LOSSLESS
 #include "vp9/decoder/vp9_dequantize.h"
-#endif
 
-void vp9_dequant_dc_idct_add_y_block_c(int16_t *q, const int16_t *dq,
-                                       uint8_t *pre,
-                                       uint8_t *dst,
-                                       int stride, uint16_t *eobs,
-                                       const int16_t *dc) {
-  int i, j;
-
-  for (i = 0; i < 4; i++) {
-    for (j = 0; j < 4; j++) {
-      if (*eobs++ > 1)
-        vp9_dequant_dc_idct_add_c(q, dq, pre, dst, 16, stride, dc[0]);
-      else
-        vp9_dc_only_idct_add_c(dc[0], pre, dst, 16, stride);
-
-      q   += 16;
-      pre += 4;
-      dst += 4;
-      dc++;
-    }
-
-    pre += 64 - 16;
-    dst += 4 * stride - 16;
-  }
-}
-
-void vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(int16_t *q,
-                                                   const int16_t *dq,
-                                                   uint8_t *dst,
-                                                   int stride,
-                                                   uint16_t *eobs,
-                                                   const int16_t *dc,
-                                                   MACROBLOCKD *xd) {
+void vp9_dequant_idct_add_y_block_4x4_inplace_c(int16_t *q,
+                                                const int16_t *dq,
+                                                uint8_t *dst,
+                                                int stride,
+                                                MACROBLOCKD *xd) {
   int i, j;
 
   for (i = 0; i < 4; i++) {
     for (j = 0; j < 4; j++) {
-      if (*eobs++ > 1)
-        vp9_dequant_dc_idct_add_c(q, dq, dst, dst, stride, stride, dc[0]);
-      else
-        vp9_dc_only_idct_add_c(dc[0], dst, dst, stride, stride);
-
+      xd->itxm_add(q, dq, dst, dst, stride, stride, xd->eobs[i * 4 + j]);
       q   += 16;
       dst += 4;
-      dc++;
     }
 
     dst += 4 * stride - 16;
@@ -67,18 +33,12 @@ void vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(int16_t *q,
 void vp9_dequant_idct_add_y_block_c(int16_t *q, const int16_t *dq,
                                     uint8_t *pre,
                                     uint8_t *dst,
-                                    int stride, uint16_t *eobs) {
+                                    int stride, MACROBLOCKD *xd) {
   int i, j;
 
   for (i = 0; i < 4; i++) {
     for (j = 0; j < 4; j++) {
-      if (*eobs++ > 1)
-        vp9_dequant_idct_add_c(q, dq, pre, dst, 16, stride);
-      else {
-        vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dst, 16, stride);
-        ((int *)q)[0] = 0;
-      }
-
+      vp9_dequant_idct_add(q, dq, pre, dst, 16, stride, xd->eobs[i * 4 + j]);
       q   += 16;
       pre += 4;
       dst += 4;
@@ -92,18 +52,13 @@ void vp9_dequant_idct_add_y_block_c(int16_t *q, const int16_t *dq,
 void vp9_dequant_idct_add_uv_block_c(int16_t *q, const int16_t *dq,
                                      uint8_t *pre, uint8_t *dstu,
                                      uint8_t *dstv, int stride,
-                                     uint16_t *eobs) {
+                                     MACROBLOCKD *xd) {
   int i, j;
 
   for (i = 0; i < 2; i++) {
     for (j = 0; j < 2; j++) {
-      if (*eobs++ > 1)
-        vp9_dequant_idct_add_c(q, dq, pre, dstu, 8, stride);
-      else {
-        vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dstu, 8, stride);
-        ((int *)q)[0] = 0;
-      }
-
+      vp9_dequant_idct_add(q, dq, pre, dstu, 8, stride,
+                           xd->eobs[16 + i * 2 + j]);
       q    += 16;
       pre  += 4;
       dstu += 4;
@@ -115,13 +70,8 @@ void vp9_dequant_idct_add_uv_block_c(int16_t *q, const int16_t *dq,
 
   for (i = 0; i < 2; i++) {
     for (j = 0; j < 2; j++) {
-      if (*eobs++ > 1)
-        vp9_dequant_idct_add_c(q, dq, pre, dstv, 8, stride);
-      else {
-        vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dstv, 8, stride);
-        ((int *)q)[0] = 0;
-      }
-
+      vp9_dequant_idct_add(q, dq, pre, dstv, 8, stride,
+                           xd->eobs[20 + i * 2 + j]);
       q    += 16;
       pre  += 4;
       dstv += 4;
@@ -136,19 +86,12 @@ void vp9_dequant_idct_add_uv_block_4x4_inplace_c(int16_t *q, const int16_t *dq,
                                                  uint8_t *dstu,
                                                  uint8_t *dstv,
                                                  int stride,
-                                                 uint16_t *eobs,
                                                  MACROBLOCKD *xd) {
   int i, j;
 
   for (i = 0; i < 2; i++) {
     for (j = 0; j < 2; j++) {
-      if (*eobs++ > 1) {
-        vp9_dequant_idct_add_c(q, dq, dstu, dstu, stride, stride);
-      } else {
-        vp9_dc_only_idct_add_c(q[0]*dq[0], dstu, dstu, stride, stride);
-        ((int *)q)[0] = 0;
-      }
-
+      xd->itxm_add(q, dq, dstu, dstu, stride, stride, xd->eobs[16 + i * 2 + j]);
       q    += 16;
       dstu += 4;
     }
@@ -158,13 +101,7 @@ void vp9_dequant_idct_add_uv_block_4x4_inplace_c(int16_t *q, const int16_t *dq,
 
   for (i = 0; i < 2; i++) {
     for (j = 0; j < 2; j++) {
-      if (*eobs++ > 1) {
-        vp9_dequant_idct_add_c(q, dq, dstv, dstv, stride, stride);
-      } else {
-        vp9_dc_only_idct_add_c(q[0]*dq[0], dstv, dstv, stride, stride);
-        ((int *)q)[0] = 0;
-      }
-
+      xd->itxm_add(q, dq, dstv, dstv, stride, stride, xd->eobs[20 + i * 2 + j]);
       q    += 16;
       dstv += 4;
     }
@@ -173,69 +110,40 @@ void vp9_dequant_idct_add_uv_block_4x4_inplace_c(int16_t *q, const int16_t *dq,
   }
 }
 
-void vp9_dequant_dc_idct_add_y_block_8x8_c(int16_t *q, const int16_t *dq,
-                                           uint8_t *pre,
-                                           uint8_t *dst,
-                                           int stride, uint16_t *eobs,
-                                           const int16_t *dc,
-                                           MACROBLOCKD *xd) {
-  q[0] = dc[0];
-  vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride, 1, xd->eobs[0]);
-
-  q[64] = dc[1];
-  vp9_dequant_idct_add_8x8_c(&q[64], dq, pre + 8, dst + 8, 16, stride, 1,
-                             xd->eobs[4]);
-
-  q[128] = dc[4];
-  vp9_dequant_idct_add_8x8_c(&q[128], dq, pre + 8 * 16,
-                                dst + 8 * stride, 16, stride, 1, xd->eobs[8]);
-
-  q[192] = dc[8];
-  vp9_dequant_idct_add_8x8_c(&q[192], dq, pre + 8 * 16 + 8,
-                                dst + 8 * stride + 8, 16, stride, 1,
-                                xd->eobs[12]);
-}
+void vp9_dequant_idct_add_y_block_8x8_inplace_c(int16_t *q,
+                                                const int16_t *dq,
+                                                uint8_t *dst,
+                                                int stride,
+                                                MACROBLOCKD *xd) {
+  vp9_dequant_idct_add_8x8_c(q, dq, dst, dst, stride, stride, xd->eobs[0]);
 
-void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(int16_t *q,
-                                                   const int16_t *dq,
-                                                   uint8_t *dst,
-                                                   int stride,
-                                                   uint16_t *eobs,
-                                                   const int16_t *dc,
-                                                   MACROBLOCKD *xd) {
-  q[0] = dc[0];
-  vp9_dequant_idct_add_8x8_c(q, dq, dst, dst, stride, stride, 1, xd->eobs[0]);
-
-  q[64] = dc[1];
   vp9_dequant_idct_add_8x8_c(&q[64], dq, dst + 8,
-                                dst + 8, stride, stride, 1, xd->eobs[4]);
+                             dst + 8, stride, stride, xd->eobs[4]);
 
-  q[128] = dc[4];
   vp9_dequant_idct_add_8x8_c(&q[128], dq, dst + 8 * stride,
-                                dst + 8 * stride, stride, stride, 1,
-                                xd->eobs[8]);
+                             dst + 8 * stride, stride, stride,
+                             xd->eobs[8]);
 
-  q[192] = dc[8];
   vp9_dequant_idct_add_8x8_c(&q[192], dq, dst + 8 * stride + 8,
-                                dst + 8 * stride + 8, stride, stride, 1,
-                                xd->eobs[12]);
+                             dst + 8 * stride + 8, stride, stride,
+                             xd->eobs[12]);
 }
 
 void vp9_dequant_idct_add_y_block_8x8_c(int16_t *q, const int16_t *dq,
                                         uint8_t *pre,
                                         uint8_t *dst,
-                                        int stride, uint16_t *eobs,
-                                        MACROBLOCKD *xd) {
+                                        int stride, MACROBLOCKD *xd) {
   uint8_t *origdest = dst;
   uint8_t *origpred = pre;
 
-  vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride, 0, xd->eobs[0]);
+  vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride, xd->eobs[0]);
   vp9_dequant_idct_add_8x8_c(&q[64], dq, origpred + 8,
-                             origdest + 8, 16, stride, 0, xd->eobs[4]);
+                             origdest + 8, 16, stride, xd->eobs[4]);
   vp9_dequant_idct_add_8x8_c(&q[128], dq, origpred + 8 * 16,
-                             origdest + 8 * stride, 16, stride, 0, xd->eobs[8]);
+                             origdest + 8 * stride, 16, stride,
+                             xd->eobs[8]);
   vp9_dequant_idct_add_8x8_c(&q[192], dq, origpred + 8 * 16 + 8,
-                             origdest + 8 * stride + 8, 16, stride, 0,
+                             origdest + 8 * stride + 8, 16, stride,
                              xd->eobs[12]);
 }
 
@@ -243,72 +151,39 @@ void vp9_dequant_idct_add_uv_block_8x8_c(int16_t *q, const int16_t *dq,
                                          uint8_t *pre,
                                          uint8_t *dstu,
                                          uint8_t *dstv,
-                                         int stride, uint16_t *eobs,
-                                         MACROBLOCKD *xd) {
-  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstu, 8, stride, 0, xd->eobs[16]);
+                                         int stride, MACROBLOCKD *xd) {
+  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstu, 8, stride, xd->eobs[16]);
 
   q    += 64;
   pre  += 64;
 
-  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstv, 8, stride, 0, xd->eobs[20]);
+  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstv, 8, stride, xd->eobs[20]);
 }
 
 void vp9_dequant_idct_add_uv_block_8x8_inplace_c(int16_t *q, const int16_t *dq,
                                                  uint8_t *dstu,
                                                  uint8_t *dstv,
                                                  int stride,
-                                                 uint16_t *eobs,
                                                  MACROBLOCKD *xd) {
-  vp9_dequant_idct_add_8x8_c(q, dq, dstu, dstu, stride, stride, 0,
+  vp9_dequant_idct_add_8x8_c(q, dq, dstu, dstu, stride, stride,
                              xd->eobs[16]);
 
   q += 64;
-  vp9_dequant_idct_add_8x8_c(q, dq, dstv, dstv, stride, stride, 0,
+  vp9_dequant_idct_add_8x8_c(q, dq, dstv, dstv, stride, stride,
                              xd->eobs[20]);
 }
 
-#if CONFIG_LOSSLESS
-void vp9_dequant_dc_idct_add_y_block_lossless_c(int16_t *q, const int16_t *dq,
-                                                uint8_t *pre,
-                                                uint8_t *dst,
-                                                int stride,
-                                                uint16_t *eobs,
-                                                const int16_t *dc) {
-  int i, j;
-
-  for (i = 0; i < 4; i++) {
-    for (j = 0; j < 4; j++) {
-      if (*eobs++ > 1)
-        vp9_dequant_dc_idct_add_lossless_c(q, dq, pre, dst, 16, stride, dc[0]);
-      else
-        vp9_dc_only_inv_walsh_add_c(dc[0], pre, dst, 16, stride);
-
-      q   += 16;
-      pre += 4;
-      dst += 4;
-      dc++;
-    }
-
-    pre += 64 - 16;
-    dst += 4 * stride - 16;
-  }
-}
 
 void vp9_dequant_idct_add_y_block_lossless_c(int16_t *q, const int16_t *dq,
                                              uint8_t *pre,
                                              uint8_t *dst,
-                                             int stride, uint16_t *eobs) {
+                                             int stride, MACROBLOCKD *xd) {
   int i, j;
 
   for (i = 0; i < 4; i++) {
     for (j = 0; j < 4; j++) {
-      if (*eobs++ > 1)
-        vp9_dequant_idct_add_lossless_c(q, dq, pre, dst, 16, stride);
-      else {
-        vp9_dc_only_inv_walsh_add_c(q[0]*dq[0], pre, dst, 16, stride);
-        ((int *)q)[0] = 0;
-      }
-
+      vp9_dequant_idct_add_lossless_c(q, dq, pre, dst, 16, stride,
+                                      xd->eobs[i * 4 + j]);
       q   += 16;
       pre += 4;
       dst += 4;
@@ -324,18 +199,13 @@ void vp9_dequant_idct_add_uv_block_lossless_c(int16_t *q, const int16_t *dq,
                                               uint8_t *dstu,
                                               uint8_t *dstv,
                                               int stride,
-                                              uint16_t *eobs) {
+                                              MACROBLOCKD *xd) {
   int i, j;
 
   for (i = 0; i < 2; i++) {
     for (j = 0; j < 2; j++) {
-      if (*eobs++ > 1)
-        vp9_dequant_idct_add_lossless_c(q, dq, pre, dstu, 8, stride);
-      else {
-        vp9_dc_only_inv_walsh_add_c(q[0]*dq[0], pre, dstu, 8, stride);
-        ((int *)q)[0] = 0;
-      }
-
+      vp9_dequant_idct_add_lossless_c(q, dq, pre, dstu, 8, stride,
+                                      xd->eobs[16 + i * 2 + j]);
       q    += 16;
       pre  += 4;
       dstu += 4;
@@ -347,13 +217,8 @@ void vp9_dequant_idct_add_uv_block_lossless_c(int16_t *q, const int16_t *dq,
 
   for (i = 0; i < 2; i++) {
     for (j = 0; j < 2; j++) {
-      if (*eobs++ > 1)
-        vp9_dequant_idct_add_lossless_c(q, dq, pre, dstv, 8, stride);
-      else {
-        vp9_dc_only_inv_walsh_add_c(q[0]*dq[0], pre, dstv, 8, stride);
-        ((int *)q)[0] = 0;
-      }
-
+      vp9_dequant_idct_add_lossless_c(q, dq, pre, dstv, 8, stride,
+                                      xd->eobs[20 + i * 2 + j]);
       q    += 16;
       pre  += 4;
       dstv += 4;
@@ -363,5 +228,4 @@ void vp9_dequant_idct_add_uv_block_lossless_c(int16_t *q, const int16_t *dq,
     dstv += 4 * stride - 8;
   }
 }
-#endif
 
diff --git a/vp9/decoder/vp9_onyxd.h b/vp9/decoder/vp9_onyxd.h
index 93321ef34..748fc7ea3 100644
--- a/vp9/decoder/vp9_onyxd.h
+++ b/vp9/decoder/vp9_onyxd.h
@@ -27,6 +27,7 @@ extern "C" {
     int     Version;
     int     postprocess;
     int     max_threads;
+    int     inv_tile_order;
     int     input_partition;
   } VP9D_CONFIG;
   typedef enum {
diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c
index b3b75af70..ce7958c3b 100644
--- a/vp9/decoder/vp9_onyxd_if.c
+++ b/vp9/decoder/vp9_onyxd_if.c
@@ -30,34 +30,34 @@
 #include "vp9/decoder/vp9_detokenize.h"
 #include "./vpx_scale_rtcd.h"
 
-static int get_free_fb(VP9_COMMON *cm);
-static void ref_cnt_fb(int *buf, int *idx, int new_idx);
-
 #define WRITE_RECON_BUFFER 0
 #if WRITE_RECON_BUFFER == 1
-static void recon_write_yuv_frame(char *name, YV12_BUFFER_CONFIG *s) {
+static void recon_write_yuv_frame(const char *name,
+                                  const YV12_BUFFER_CONFIG *s,
+                                  int w, int _h) {
   FILE *yuv_file = fopen((char *)name, "ab");
-  uint8_t *src = s->y_buffer;
-  int h = s->y_height;
+  const uint8_t *src = s->y_buffer;
+  int h = _h;
 
   do {
-    fwrite(src, s->y_width, 1,  yuv_file);
+    fwrite(src, w, 1,  yuv_file);
     src += s->y_stride;
   } while (--h);
 
   src = s->u_buffer;
-  h = s->uv_height;
+  h = (_h + 1) >> 1;
+  w = (w + 1) >> 1;
 
   do {
-    fwrite(src, s->uv_width, 1,  yuv_file);
+    fwrite(src, w, 1,  yuv_file);
     src += s->uv_stride;
   } while (--h);
 
   src = s->v_buffer;
-  h = s->uv_height;
+  h = (_h + 1) >> 1;
 
   do {
-    fwrite(src, s->uv_width, 1, yuv_file);
+    fwrite(src, w, 1, yuv_file);
     src += s->uv_stride;
   } while (--h);
 
@@ -127,6 +127,7 @@ VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) {
   vp9_initialize_dec();
 
   vp9_create_common(&pbi->common);
+  pbi->oxcf = *oxcf;
 
   pbi->common.current_video_frame = 0;
   pbi->ready_for_new_data = 1;
@@ -168,12 +169,13 @@ vpx_codec_err_t vp9_get_reference_dec(VP9D_PTR ptr, VP9_REFFRAME ref_frame_flag,
   VP9_COMMON *cm = &pbi->common;
   int ref_fb_idx;
 
+  /* TODO(jkoleszar): The decoder doesn't have any real knowledge of what the
+   * encoder is using the frame buffers for. This is just a stub to keep the
+   * vpxenc --test-decode functionality working, and will be replaced in a
+   * later commit that adds VP9-specific controls for this functionality.
+   */
   if (ref_frame_flag == VP9_LAST_FLAG)
-    ref_fb_idx = cm->lst_fb_idx;
-  else if (ref_frame_flag == VP9_GOLD_FLAG)
-    ref_fb_idx = cm->gld_fb_idx;
-  else if (ref_frame_flag == VP9_ALT_FLAG)
-    ref_fb_idx = cm->alt_fb_idx;
+    ref_fb_idx = pbi->common.new_fb_idx;
   else {
     vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
                        "Invalid reference frame");
@@ -200,12 +202,17 @@ vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR ptr, VP9_REFFRAME ref_frame_flag,
   int *ref_fb_ptr = NULL;
   int free_fb;
 
+  /* TODO(jkoleszar): The decoder doesn't have any real knowledge of what the
+   * encoder is using the frame buffers for. This is just a stub to keep the
+   * vpxenc --test-decode functionality working, and will be replaced in a
+   * later commit that adds VP9-specific controls for this functionality.
+   */
   if (ref_frame_flag == VP9_LAST_FLAG)
-    ref_fb_ptr = &cm->lst_fb_idx;
+    ref_fb_ptr = &pbi->common.active_ref_idx[0];
   else if (ref_frame_flag == VP9_GOLD_FLAG)
-    ref_fb_ptr = &cm->gld_fb_idx;
+    ref_fb_ptr = &pbi->common.active_ref_idx[1];
   else if (ref_frame_flag == VP9_ALT_FLAG)
-    ref_fb_ptr = &cm->alt_fb_idx;
+    ref_fb_ptr = &pbi->common.active_ref_idx[2];
   else {
     vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
                        "Invalid reference frame");
@@ -234,77 +241,25 @@ vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR ptr, VP9_REFFRAME ref_frame_flag,
 }
 
 
-static int get_free_fb(VP9_COMMON *cm) {
-  int i;
-  for (i = 0; i < NUM_YV12_BUFFERS; i++)
-    if (cm->fb_idx_ref_cnt[i] == 0)
-      break;
-
-  assert(i < NUM_YV12_BUFFERS);
-  cm->fb_idx_ref_cnt[i] = 1;
-  return i;
-}
-
-static void ref_cnt_fb(int *buf, int *idx, int new_idx) {
-  if (buf[*idx] > 0)
-    buf[*idx]--;
-
-  *idx = new_idx;
-
-  buf[new_idx]++;
-}
-
-/* If any buffer copy / swapping is signalled it should be done here. */
-static int swap_frame_buffers(VP9_COMMON *cm) {
-  int err = 0;
-
-  /* The alternate reference frame or golden frame can be updated
-   *  using the new, last, or golden/alt ref frame.  If it
-   *  is updated using the newly decoded frame it is a refresh.
-   *  An update using the last or golden/alt ref frame is a copy.
-   */
-  if (cm->copy_buffer_to_arf) {
-    int new_fb = 0;
+/* If any buffer updating is signalled it should be done here. */
+static void swap_frame_buffers(VP9D_COMP *pbi) {
+  int ref_index = 0, mask;
 
-    if (cm->copy_buffer_to_arf == 1)
-      new_fb = cm->lst_fb_idx;
-    else if (cm->copy_buffer_to_arf == 2)
-      new_fb = cm->gld_fb_idx;
-    else
-      err = -1;
-
-    ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->alt_fb_idx, new_fb);
-  }
-
-  if (cm->copy_buffer_to_gf) {
-    int new_fb = 0;
-
-    if (cm->copy_buffer_to_gf == 1)
-      new_fb = cm->lst_fb_idx;
-    else if (cm->copy_buffer_to_gf == 2)
-      new_fb = cm->alt_fb_idx;
-    else
-      err = -1;
-
-    ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->gld_fb_idx, new_fb);
+  for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
+    if (mask & 1) {
+      ref_cnt_fb(pbi->common.fb_idx_ref_cnt,
+                 &pbi->common.ref_frame_map[ref_index],
+                 pbi->common.new_fb_idx);
+    }
+    ++ref_index;
   }
 
-  if (cm->refresh_golden_frame)
-    ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->gld_fb_idx, cm->new_fb_idx);
-
-  if (cm->refresh_alt_ref_frame)
-    ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->alt_fb_idx, cm->new_fb_idx);
+  pbi->common.frame_to_show = &pbi->common.yv12_fb[pbi->common.new_fb_idx];
+  pbi->common.fb_idx_ref_cnt[pbi->common.new_fb_idx]--;
 
-  if (cm->refresh_last_frame) {
-    ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->lst_fb_idx, cm->new_fb_idx);
-
-    cm->frame_to_show = &cm->yv12_fb[cm->lst_fb_idx];
-  } else
-    cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
-
-  cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
-
-  return err;
+  /* Invalidate these references until the next frame starts. */
+  for (ref_index = 0; ref_index < 3; ref_index++)
+    pbi->common.active_ref_idx[ref_index] = INT_MAX;
 }
 
 int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size,
@@ -332,8 +287,13 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size,
      * We do not know if the missing frame(s) was supposed to update
      * any of the reference buffers, but we act conservative and
      * mark only the last buffer as corrupted.
+     *
+     * TODO(jkoleszar): Error concealment is undefined and non-normative
+     * at this point, but if it becomes normative, [0] may not always be the
+     * correct thing to do here.
      */
-    cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
+    if (cm->active_ref_idx[0] != INT_MAX)
+      cm->yv12_fb[cm->active_ref_idx[0]].corrupted = 1;
   }
 
   cm->new_fb_idx = get_free_fb(cm);
@@ -344,8 +304,13 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size,
     /* We do not know if the missing frame(s) was supposed to update
      * any of the reference buffers, but we act conservative and
      * mark only the last buffer as corrupted.
+     *
+     * TODO(jkoleszar): Error concealment is undefined and non-normative
+     * at this point, but if it becomes normative, [0] may not always be the
+     * correct thing to do here.
      */
-    cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
+    if (cm->active_ref_idx[0] != INT_MAX)
+      cm->yv12_fb[cm->active_ref_idx[0]].corrupted = 1;
 
     if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
       cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
@@ -365,11 +330,7 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size,
   }
 
   {
-    if (swap_frame_buffers(cm)) {
-      pbi->common.error.error_code = VPX_CODEC_ERROR;
-      pbi->common.error.setjmp = 0;
-      return -1;
-    }
+    swap_frame_buffers(pbi);
 
 #if WRITE_RECON_BUFFER == 2
     if (cm->show_frame)
@@ -389,7 +350,8 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size,
 
 #if WRITE_RECON_BUFFER == 1
   if (cm->show_frame)
-    recon_write_yuv_frame("recon.yuv", cm->frame_to_show);
+    recon_write_yuv_frame("recon.yuv", cm->frame_to_show,
+                          cm->Width, cm->Height);
 #endif
 
   vp9_clear_system_state();
diff --git a/vp9/decoder/vp9_onyxd_int.h b/vp9/decoder/vp9_onyxd_int.h
index 64975468d..0e6d059af 100644
--- a/vp9/decoder/vp9_onyxd_int.h
+++ b/vp9/decoder/vp9_onyxd_int.h
@@ -18,41 +18,6 @@
 
 // #define DEC_DEBUG
 
-typedef struct {
-  int ithread;
-  void *ptr1;
-  void *ptr2;
-} DECODETHREAD_DATA;
-
-typedef struct {
-  MACROBLOCKD  mbd;
-  int mb_row;
-  int current_mb_col;
-  short *coef_ptr;
-} MB_ROW_DEC;
-
-typedef struct {
-  int const *scan;
-  int const *scan_8x8;
-  uint8_t const *ptr_block2leftabove;
-  vp9_tree_index const *vp9_coef_tree_ptr;
-  unsigned char *norm_ptr;
-  uint8_t *ptr_coef_bands_x;
-  uint8_t *ptr_coef_bands_x_8x8;
-
-  ENTROPY_CONTEXT_PLANES *A;
-  ENTROPY_CONTEXT_PLANES *L;
-
-  int16_t *qcoeff_start_ptr;
-
-  vp9_prob const *coef_probs_4x4[BLOCK_TYPES_4X4];
-  vp9_prob const *coef_probs_8x8[BLOCK_TYPES_8X8];
-  vp9_prob const *coef_probs_16X16[BLOCK_TYPES_16X16];
-
-  uint8_t eob[25];
-
-} DETOK;
-
 typedef struct VP9Decompressor {
   DECLARE_ALIGNED(16, MACROBLOCKD, mb);
 
@@ -68,18 +33,13 @@ typedef struct VP9Decompressor {
   int64_t last_time_stamp;
   int   ready_for_new_data;
 
-  DETOK detoken;
-
-  vp9_dequant_idct_add_fn_t            idct_add;
-  vp9_dequant_dc_idct_add_fn_t         dc_idct_add;
-  vp9_dequant_dc_idct_add_y_block_fn_t dc_idct_add_y_block;
-  vp9_dequant_idct_add_y_block_fn_t    idct_add_y_block;
-  vp9_dequant_idct_add_uv_block_fn_t   idct_add_uv_block;
-
+  int refresh_frame_flags;
   vp9_prob prob_skip_false;
 
   int decoded_key_frame;
 
+  int initial_width;
+  int initial_height;
 } VP9D_COMP;
 
 int vp9_decode_frame(VP9D_COMP *cpi, const unsigned char **p_data_end);
diff --git a/vp9/decoder/x86/vp9_dequantize_mmx.asm b/vp9/decoder/x86/vp9_dequantize_mmx.asm
deleted file mode 100644
index 23080bfee..000000000
--- a/vp9/decoder/x86/vp9_dequantize_mmx.asm
+++ /dev/null
@@ -1,406 +0,0 @@
-;
-;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-align 16
-x_s1sqr2:      times 4 dw 0x8A8C
-align 16
-x_c1sqr2less1: times 4 dw 0x4E7B
-align 16
-pw_16:         times 4 dw 16
-
-SECTION .text
-
-INIT_MMX
-
-
-;void dequantize_b_impl_mmx(short *sq, short *dq, short *q)
-cglobal dequantize_b_impl_mmx, 3,3,0,sq,dq,arg3
-    mova       m1, [sqq]
-    pmullw     m1, [arg3q+0]            ; mm4 *= kernel 0 modifiers.
-    mova [dqq+ 0], m1
-
-    mova       m1, [sqq+8]
-    pmullw     m1, [arg3q+8]            ; mm4 *= kernel 0 modifiers.
-    mova [dqq+ 8], m1
-
-    mova       m1, [sqq+16]
-    pmullw     m1, [arg3q+16]            ; mm4 *= kernel 0 modifiers.
-    mova [dqq+16], m1
-
-    mova       m1, [sqq+24]
-    pmullw     m1, [arg3q+24]            ; mm4 *= kernel 0 modifiers.
-    mova [dqq+24], m1
-    RET
-
-
-;void dequant_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride)
-cglobal dequant_idct_add_mmx, 4,6,0,inp,dq,pred,dest,pit,stride
-
-%if ARCH_X86_64
-    movsxd              strideq,  dword stridem
-    movsxd              pitq,     dword pitm
-%else
-    mov                 strideq,  stridem
-    mov                 pitq,     pitm
-%endif
-
-    mova                m0,       [inpq+ 0]
-    pmullw              m0,       [dqq]
-
-    mova                m1,       [inpq+ 8]
-    pmullw              m1,       [dqq+ 8]
-
-    mova                m2,       [inpq+16]
-    pmullw              m2,       [dqq+16]
-
-    mova                m3,       [inpq+24]
-    pmullw              m3,       [dqq+24]
-
-    pxor                m7,        m7
-    mova            [inpq],        m7
-    mova          [inpq+8],        m7
-    mova         [inpq+16],        m7
-    mova         [inpq+24],        m7
-
-
-    psubw               m0,        m2             ; b1= 0-2
-    paddw               m2,        m2             ;
-
-    mova                m5,        m1
-    paddw               m2,        m0             ; a1 =0+2
-
-    pmulhw              m5,       [x_s1sqr2];
-    paddw               m5,        m1             ; ip1 * sin(pi/8) * sqrt(2)
-
-    mova                m7,        m3             ;
-    pmulhw              m7,       [x_c1sqr2less1];
-
-    paddw               m7,        m3             ; ip3 * cos(pi/8) * sqrt(2)
-    psubw               m7,        m5             ; c1
-
-    mova                m5,        m1
-    mova                m4,        m3
-
-    pmulhw              m5,       [x_c1sqr2less1]
-    paddw               m5,        m1
-
-    pmulhw              m3,       [x_s1sqr2]
-    paddw               m3,        m4
-
-    paddw               m3,        m5             ; d1
-    mova                m6,        m2             ; a1
-
-    mova                m4,        m0             ; b1
-    paddw               m2,        m3             ;0
-
-    paddw               m4,        m7             ;1
-    psubw               m0,        m7             ;2
-
-    psubw               m6,        m3             ;3
-
-    mova                m1,        m2             ; 03 02 01 00
-    mova                m3,        m4             ; 23 22 21 20
-
-    punpcklwd           m1,        m0             ; 11 01 10 00
-    punpckhwd           m2,        m0             ; 13 03 12 02
-
-    punpcklwd           m3,        m6             ; 31 21 30 20
-    punpckhwd           m4,        m6             ; 33 23 32 22
-
-    mova                m0,        m1             ; 11 01 10 00
-    mova                m5,        m2             ; 13 03 12 02
-
-    punpckldq           m0,        m3             ; 30 20 10 00
-    punpckhdq           m1,        m3             ; 31 21 11 01
-
-    punpckldq           m2,        m4             ; 32 22 12 02
-    punpckhdq           m5,        m4             ; 33 23 13 03
-
-    mova                m3,        m5             ; 33 23 13 03
-
-    psubw               m0,        m2             ; b1= 0-2
-    paddw               m2,        m2             ;
-
-    mova                m5,        m1
-    paddw               m2,        m0             ; a1 =0+2
-
-    pmulhw              m5,       [x_s1sqr2];
-    paddw               m5,        m1             ; ip1 * sin(pi/8) * sqrt(2)
-
-    mova                m7,        m3             ;
-    pmulhw              m7,       [x_c1sqr2less1];
-
-    paddw               m7,        m3             ; ip3 * cos(pi/8) * sqrt(2)
-    psubw               m7,        m5             ; c1
-
-    mova                m5,        m1
-    mova                m4,        m3
-
-    pmulhw              m5,       [x_c1sqr2less1]
-    paddw               m5,        m1
-
-    pmulhw              m3,       [x_s1sqr2]
-    paddw               m3,        m4
-
-    paddw               m3,        m5             ; d1
-    paddw               m0,       [pw_16]
-
-    paddw               m2,       [pw_16]
-    mova                m6,        m2             ; a1
-
-    mova                m4,        m0             ; b1
-    paddw               m2,        m3             ;0
-
-    paddw               m4,        m7             ;1
-    psubw               m0,        m7             ;2
-
-    psubw               m6,        m3             ;3
-    psraw               m2,        5
-
-    psraw               m0,        5
-    psraw               m4,        5
-
-    psraw               m6,        5
-
-    mova                m1,        m2             ; 03 02 01 00
-    mova                m3,        m4             ; 23 22 21 20
-
-    punpcklwd           m1,        m0             ; 11 01 10 00
-    punpckhwd           m2,        m0             ; 13 03 12 02
-
-    punpcklwd           m3,        m6             ; 31 21 30 20
-    punpckhwd           m4,        m6             ; 33 23 32 22
-
-    mova                m0,        m1             ; 11 01 10 00
-    mova                m5,        m2             ; 13 03 12 02
-
-    punpckldq           m0,        m3             ; 30 20 10 00
-    punpckhdq           m1,        m3             ; 31 21 11 01
-
-    punpckldq           m2,        m4             ; 32 22 12 02
-    punpckhdq           m5,        m4             ; 33 23 13 03
-
-    pxor                m7,        m7
-
-    movh                m4,       [predq]
-    punpcklbw           m4,        m7
-    paddsw              m0,        m4
-    packuswb            m0,        m7
-    movh           [destq],      m0
-
-    movh                m4,       [predq+pitq]
-    punpcklbw           m4,        m7
-    paddsw              m1,        m4
-    packuswb            m1,        m7
-    movh   [destq+strideq],        m1
-
-    movh                m4,       [predq+2*pitq]
-    punpcklbw           m4,        m7
-    paddsw              m2,        m4
-    packuswb            m2,        m7
-    movh [destq+strideq*2],        m2
-
-    add              destq,        strideq
-    add              predq,        pitq
-
-    movh                m4,       [predq+2*pitq]
-    punpcklbw           m4,        m7
-    paddsw              m5,        m4
-    packuswb            m5,        m7
-    movh [destq+strideq*2],        m5
-    RET
-
-
-;void dequant_dc_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc)
-cglobal dequant_dc_idct_add_mmx, 4,7,0,inp,dq,pred,dest,pit,stride,Dc
-
-%if ARCH_X86_64
-    movsxd              strideq,   dword stridem
-    movsxd              pitq,      dword pitm
-%else
-    mov                 strideq,   stridem
-    mov                 pitq,      pitm
-%endif
-
-    mov                 Dcq, Dcm
-    mova                m0,       [inpq+ 0]
-    pmullw              m0,       [dqq+ 0]
-
-    mova                m1,       [inpq+ 8]
-    pmullw              m1,       [dqq+ 8]
-
-    mova                m2,       [inpq+16]
-    pmullw              m2,       [dqq+16]
-
-    mova                m3,       [inpq+24]
-    pmullw              m3,       [dqq+24]
-
-    pxor                m7,        m7
-    mova         [inpq+ 0],        m7
-    mova         [inpq+ 8],        m7
-    mova         [inpq+16],        m7
-    mova         [inpq+24],        m7
-
-    ; move lower word of Dc to lower word of m0
-    psrlq               m0,        16
-    psllq               m0,        16
-    and                Dcq,        0xFFFF         ; If Dc < 0, we don't want the full dword precision.
-    movh                m7,        Dcq
-    por                 m0,        m7
-    psubw               m0,        m2             ; b1= 0-2
-    paddw               m2,        m2             ;
-
-    mova                m5,        m1
-    paddw               m2,        m0             ; a1 =0+2
-
-    pmulhw              m5,       [x_s1sqr2];
-    paddw               m5,        m1             ; ip1 * sin(pi/8) * sqrt(2)
-
-    mova                m7,        m3             ;
-    pmulhw              m7,       [x_c1sqr2less1];
-
-    paddw               m7,        m3             ; ip3 * cos(pi/8) * sqrt(2)
-    psubw               m7,        m5             ; c1
-
-    mova                m5,        m1
-    mova                m4,        m3
-
-    pmulhw              m5,       [x_c1sqr2less1]
-    paddw               m5,        m1
-
-    pmulhw              m3,       [x_s1sqr2]
-    paddw               m3,        m4
-
-    paddw               m3,        m5             ; d1
-    mova                m6,        m2             ; a1
-
-    mova                m4,        m0             ; b1
-    paddw               m2,        m3             ;0
-
-    paddw               m4,        m7             ;1
-    psubw               m0,        m7             ;2
-
-    psubw               m6,        m3             ;3
-
-    mova                m1,        m2             ; 03 02 01 00
-    mova                m3,        m4             ; 23 22 21 20
-
-    punpcklwd           m1,        m0             ; 11 01 10 00
-    punpckhwd           m2,        m0             ; 13 03 12 02
-
-    punpcklwd           m3,        m6             ; 31 21 30 20
-    punpckhwd           m4,        m6             ; 33 23 32 22
-
-    mova                m0,        m1             ; 11 01 10 00
-    mova                m5,        m2             ; 13 03 12 02
-
-    punpckldq           m0,        m3             ; 30 20 10 00
-    punpckhdq           m1,        m3             ; 31 21 11 01
-
-    punpckldq           m2,        m4             ; 32 22 12 02
-    punpckhdq           m5,        m4             ; 33 23 13 03
-
-    mova                m3,        m5             ; 33 23 13 03
-
-    psubw               m0,        m2             ; b1= 0-2
-    paddw               m2,        m2             ;
-
-    mova                m5,        m1
-    paddw               m2,        m0             ; a1 =0+2
-
-    pmulhw              m5,       [x_s1sqr2];
-    paddw               m5,        m1             ; ip1 * sin(pi/8) * sqrt(2)
-
-    mova                m7,        m3             ;
-    pmulhw              m7,       [x_c1sqr2less1];
-
-    paddw               m7,        m3             ; ip3 * cos(pi/8) * sqrt(2)
-    psubw               m7,        m5             ; c1
-
-    mova                m5,        m1
-    mova                m4,        m3
-
-    pmulhw              m5,       [x_c1sqr2less1]
-    paddw               m5,        m1
-
-    pmulhw              m3,       [x_s1sqr2]
-    paddw               m3,        m4
-
-    paddw               m3,        m5             ; d1
-    paddw               m0,       [pw_16]
-
-    paddw               m2,       [pw_16]
-    mova                m6,        m2             ; a1
-
-    mova                m4,        m0             ; b1
-    paddw               m2,        m3             ;0
-
-    paddw               m4,        m7             ;1
-    psubw               m0,        m7             ;2
-
-    psubw               m6,        m3             ;3
-    psraw               m2,        5
-
-    psraw               m0,        5
-    psraw               m4,        5
-
-    psraw               m6,        5
-
-    mova                m1,        m2             ; 03 02 01 00
-    mova                m3,        m4             ; 23 22 21 20
-
-    punpcklwd           m1,        m0             ; 11 01 10 00
-    punpckhwd           m2,        m0             ; 13 03 12 02
-
-    punpcklwd           m3,        m6             ; 31 21 30 20
-    punpckhwd           m4,        m6             ; 33 23 32 22
-
-    mova                m0,        m1             ; 11 01 10 00
-    mova                m5,        m2             ; 13 03 12 02
-
-    punpckldq           m0,        m3             ; 30 20 10 00
-    punpckhdq           m1,        m3             ; 31 21 11 01
-
-    punpckldq           m2,        m4             ; 32 22 12 02
-    punpckhdq           m5,        m4             ; 33 23 13 03
-
-    pxor                m7,        m7
-
-    movh                m4,       [predq]
-    punpcklbw           m4,        m7
-    paddsw              m0,        m4
-    packuswb            m0,        m7
-    movh           [destq],        m0
-
-    movh                m4,       [predq+pitq]
-    punpcklbw           m4,        m7
-    paddsw              m1,        m4
-    packuswb            m1,        m7
-    movh   [destq+strideq],        m1
-
-    movh                m4,       [predq+2*pitq]
-    punpcklbw           m4,        m7
-    paddsw              m2,        m4
-    packuswb            m2,        m7
-    movh [destq+strideq*2],        m2
-
-    add              destq,        strideq
-    add              predq,        pitq
-
-    movh                m4,       [predq+2*pitq]
-    punpcklbw           m4,        m7
-    paddsw              m5,        m4
-    packuswb            m5,        m7
-    movh [destq+strideq*2],        m5
-    RET
-
diff --git a/vp9/decoder/x86/vp9_idct_blk_mmx.c b/vp9/decoder/x86/vp9_idct_blk_mmx.c
deleted file mode 100644
index 8279eaa4a..000000000
--- a/vp9/decoder/x86/vp9_idct_blk_mmx.c
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_config.h"
-#include "vp9/common/vp9_blockd.h"
-#include "vp9/decoder/vp9_dequantize.h"
-#include "vp9/decoder/x86/vp9_idct_mmx.h"
-
-void vp9_dequant_dc_idct_add_y_block_mmx(short *q, const short *dq,
-                                         unsigned char *pre,
-                                         unsigned char *dst,
-                                         int stride, unsigned short *eobs,
-                                         const short *dc) {
-  int i;
-
-  for (i = 0; i < 4; i++) {
-    if (eobs[0] > 1)
-      vp9_dequant_dc_idct_add_mmx(q, dq, pre, dst, 16, stride, dc[0]);
-    else
-      vp9_dc_only_idct_add_mmx(dc[0], pre, dst, 16, stride);
-
-    if (eobs[1] > 1)
-      vp9_dequant_dc_idct_add_mmx(q + 16, dq, pre + 4,
-                                  dst + 4, 16, stride, dc[1]);
-    else
-      vp9_dc_only_idct_add_mmx(dc[1], pre + 4, dst + 4, 16, stride);
-
-    if (eobs[2] > 1)
-      vp9_dequant_dc_idct_add_mmx(q + 32, dq, pre + 8,
-                                  dst + 8, 16, stride, dc[2]);
-    else
-      vp9_dc_only_idct_add_mmx(dc[2], pre + 8, dst + 8, 16, stride);
-
-    if (eobs[3] > 1)
-      vp9_dequant_dc_idct_add_mmx(q + 48, dq, pre + 12,
-                                  dst + 12, 16, stride, dc[3]);
-    else
-      vp9_dc_only_idct_add_mmx(dc[3], pre + 12, dst + 12, 16, stride);
-
-    q    += 64;
-    dc   += 4;
-    pre  += 64;
-    dst  += 4 * stride;
-    eobs += 4;
-  }
-}
-
-void vp9_dequant_idct_add_y_block_mmx(short *q, const short *dq,
-                                      unsigned char *pre,
-                                      unsigned char *dst,
-                                      int stride, unsigned short *eobs) {
-  int i;
-
-  for (i = 0; i < 4; i++) {
-    if (eobs[0] > 1)
-      vp9_dequant_idct_add_mmx(q, dq, pre, dst, 16, stride);
-    else {
-      vp9_dc_only_idct_add_mmx(q[0]*dq[0], pre, dst, 16, stride);
-      ((int *)q)[0] = 0;
-    }
-
-    if (eobs[1] > 1)
-      vp9_dequant_idct_add_mmx(q + 16, dq, pre + 4, dst + 4, 16, stride);
-    else {
-      vp9_dc_only_idct_add_mmx(q[16]*dq[0], pre + 4, dst + 4, 16, stride);
-      ((int *)(q + 16))[0] = 0;
-    }
-
-    if (eobs[2] > 1)
-      vp9_dequant_idct_add_mmx(q + 32, dq, pre + 8, dst + 8, 16, stride);
-    else {
-      vp9_dc_only_idct_add_mmx(q[32]*dq[0], pre + 8, dst + 8, 16, stride);
-      ((int *)(q + 32))[0] = 0;
-    }
-
-    if (eobs[3] > 1)
-      vp9_dequant_idct_add_mmx(q + 48, dq, pre + 12, dst + 12, 16, stride);
-    else {
-      vp9_dc_only_idct_add_mmx(q[48]*dq[0], pre + 12, dst + 12, 16, stride);
-      ((int *)(q + 48))[0] = 0;
-    }
-
-    q    += 64;
-    pre  += 64;
-    dst  += 4 * stride;
-    eobs += 4;
-  }
-}
-
-void vp9_dequant_idct_add_uv_block_mmx(short *q, const short *dq,
-                                       unsigned char *pre,
-                                       unsigned char *dstu,
-                                       unsigned char *dstv,
-                                       int stride, unsigned short *eobs) {
-  int i;
-
-  for (i = 0; i < 2; i++) {
-    if (eobs[0] > 1)
-      vp9_dequant_idct_add_mmx(q, dq, pre, dstu, 8, stride);
-    else {
-      vp9_dc_only_idct_add_mmx(q[0]*dq[0], pre, dstu, 8, stride);
-      ((int *)q)[0] = 0;
-    }
-
-    if (eobs[1] > 1)
-      vp9_dequant_idct_add_mmx(q + 16, dq, pre + 4, dstu + 4, 8, stride);
-    else {
-      vp9_dc_only_idct_add_mmx(q[16]*dq[0], pre + 4, dstu + 4, 8, stride);
-      ((int *)(q + 16))[0] = 0;
-    }
-
-    q    += 32;
-    pre  += 32;
-    dstu += 4 * stride;
-    eobs += 2;
-  }
-
-  for (i = 0; i < 2; i++) {
-    if (eobs[0] > 1)
-      vp9_dequant_idct_add_mmx(q, dq, pre, dstv, 8, stride);
-    else {
-      vp9_dc_only_idct_add_mmx(q[0]*dq[0], pre, dstv, 8, stride);
-      ((int *)q)[0] = 0;
-    }
-
-    if (eobs[1] > 1)
-      vp9_dequant_idct_add_mmx(q + 16, dq, pre + 4, dstv + 4, 8, stride);
-    else {
-      vp9_dc_only_idct_add_mmx(q[16]*dq[0], pre + 4, dstv + 4, 8, stride);
-      ((int *)(q + 16))[0] = 0;
-    }
-
-    q    += 32;
-    pre  += 32;
-    dstv += 4 * stride;
-    eobs += 2;
-  }
-}
diff --git a/vp9/decoder/x86/vp9_x86_dsystemdependent.c b/vp9/decoder/x86/vp9_x86_dsystemdependent.c
deleted file mode 100644
index 51ee8ec31..000000000
--- a/vp9/decoder/x86/vp9_x86_dsystemdependent.c
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_config.h"
-#include "vpx_ports/x86.h"
-#include "vp9/decoder/vp9_onyxd_int.h"
-
-#if HAVE_MMX
-void vp9_dequantize_b_impl_mmx(short *sq, short *dq, short *q);
-
-void vp9_dequantize_b_mmx(BLOCKD *d) {
-  short *sq = (short *) d->qcoeff;
-  short *dq = (short *) d->dqcoeff;
-  short *q = (short *) d->dequant;
-  vp9_dequantize_b_impl_mmx(sq, dq, q);
-}
-#endif
-
-
diff --git a/vp9/encoder/vp9_asm_enc_offsets.c b/vp9/encoder/vp9_asm_enc_offsets.c
index 71fad2e07..e174a894a 100644
--- a/vp9/encoder/vp9_asm_enc_offsets.c
+++ b/vp9/encoder/vp9_asm_enc_offsets.c
@@ -32,7 +32,6 @@ DEFINE(vp9_block_quant_shift,                   offsetof(BLOCK, quant_shift));
 DEFINE(vp9_blockd_qcoeff,                       offsetof(BLOCKD, qcoeff));
 DEFINE(vp9_blockd_dequant,                      offsetof(BLOCKD, dequant));
 DEFINE(vp9_blockd_dqcoeff,                      offsetof(BLOCKD, dqcoeff));
-DEFINE(vp9_blockd_eob,                          offsetof(BLOCKD, eob));
 
 END
 
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 61aac5cd1..7101947a6 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -14,6 +14,7 @@
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_findnearmv.h"
+#include "vp9/common/vp9_tile_common.h"
 #include "vp9/encoder/vp9_mcomp.h"
 #include "vp9/common/vp9_systemdependent.h"
 #include <assert.h>
@@ -41,12 +42,9 @@ unsigned __int64 Sectionbits[500];
 int intra_mode_stats[VP9_KF_BINTRAMODES]
                     [VP9_KF_BINTRAMODES]
                     [VP9_KF_BINTRAMODES];
-vp9_coeff_stats tree_update_hist_4x4[BLOCK_TYPES_4X4];
-vp9_coeff_stats hybrid_tree_update_hist_4x4[BLOCK_TYPES_4X4];
-vp9_coeff_stats tree_update_hist_8x8[BLOCK_TYPES_8X8];
-vp9_coeff_stats hybrid_tree_update_hist_8x8[BLOCK_TYPES_8X8];
-vp9_coeff_stats tree_update_hist_16x16[BLOCK_TYPES_16X16];
-vp9_coeff_stats hybrid_tree_update_hist_16x16[BLOCK_TYPES_16X16];
+vp9_coeff_stats tree_update_hist_4x4[BLOCK_TYPES];
+vp9_coeff_stats tree_update_hist_8x8[BLOCK_TYPES];
+vp9_coeff_stats tree_update_hist_16x16[BLOCK_TYPES];
 vp9_coeff_stats tree_update_hist_32x32[BLOCK_TYPES_32X32];
 
 extern unsigned int active_section;
@@ -189,15 +187,7 @@ static void update_refpred_stats(VP9_COMP *cpi) {
   int old_cost, new_cost;
 
   // Set the prediction probability structures to defaults
-  if (cm->frame_type == KEY_FRAME) {
-    // Set the prediction probabilities to defaults
-    cm->ref_pred_probs[0] = 120;
-    cm->ref_pred_probs[1] = 80;
-    cm->ref_pred_probs[2] = 40;
-
-    vpx_memset(cpi->ref_pred_probs_update, 0,
-               sizeof(cpi->ref_pred_probs_update));
-  } else {
+  if (cm->frame_type != KEY_FRAME) {
     // From the prediction counts set the probabilities for each context
     for (i = 0; i < PREDICTION_PROBS; i++) {
       new_pred_probs[i] = get_binary_prob(cpi->ref_pred_count[i][0],
@@ -219,7 +209,6 @@ static void update_refpred_stats(VP9_COMP *cpi) {
         cm->ref_pred_probs[i] = new_pred_probs[i];
       } else
         cpi->ref_pred_probs_update[i] = 0;
-
     }
   }
 }
@@ -230,8 +219,8 @@ static void update_refpred_stats(VP9_COMP *cpi) {
 //
 // The branch counts table is re-populated during the actual pack stage and in
 // the decoder to facilitate backwards update of the context.
-static void update_mode_probs(VP9_COMMON *cm,
-                              int mode_context[INTER_MODE_CONTEXTS][4]) {
+static void update_inter_mode_probs(VP9_COMMON *cm,
+                                    int mode_context[INTER_MODE_CONTEXTS][4]) {
   int i, j;
   unsigned int (*mv_ref_ct)[4][2];
 
@@ -508,7 +497,8 @@ static void write_sub_mv_ref
               vp9_sub_mv_ref_encoding_array - LEFT4X4 + m);
 }
 
-static void write_nmv(vp9_writer *bc, const MV *mv, const int_mv *ref,
+static void write_nmv(VP9_COMP *cpi, vp9_writer *bc,
+                      const MV *mv, const int_mv *ref,
                       const nmv_context *nmvc, int usehp) {
   MV e;
   e.row = mv->row - ref->as_mv.row;
@@ -585,6 +575,28 @@ static void write_mb_segid(vp9_writer *bc,
   }
 }
 
+static void write_mb_segid_except(VP9_COMMON *cm,
+                                  vp9_writer *bc,
+                                  const MB_MODE_INFO *mi,
+                                  const MACROBLOCKD *xd,
+                                  int mb_row, int mb_col) {
+  // Code mi->segment_id relative to the predicted segment id of this MB.
+  int seg_id = mi->segment_id;
+  int pred_seg_id = vp9_get_pred_mb_segid(cm, xd,
+                                          mb_row * cm->mb_cols + mb_col);
+  const vp9_prob *p = xd->mb_segment_tree_probs;
+  const vp9_prob p1 = xd->mb_segment_mispred_tree_probs[pred_seg_id];
+
+  if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
+    vp9_write(bc, seg_id >= 2, p1);  // upper/lower half, prob by pred id
+    if (pred_seg_id >= 2 && seg_id < 2) {
+      vp9_write(bc, seg_id == 1, p[1]);  // other half than pred: is it 1?
+    } else if (pred_seg_id < 2 && seg_id >= 2) {
+      vp9_write(bc, seg_id == 3, p[2]);  // other half than pred: is it 3?
+    }  // same half as pred: no second bit (presumably id != pred; verify)
+  }
+}
+
 // This function encodes the reference frame
 static void encode_ref_frame(vp9_writer *const bc,
                              VP9_COMMON *const cm,
@@ -728,7 +740,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
 
       // If the mb segment id wasn't predicted code explicitly
       if (!prediction_flag)
-        write_mb_segid(bc, mi, &cpi->mb.e_mbd);
+        write_mb_segid_except(pc, bc, mi, &cpi->mb.e_mbd, mb_row, mb_col);
     } else {
       // Normal unpredicted coding
       write_mb_segid(bc, mi, &cpi->mb.e_mbd);
@@ -737,8 +749,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
 
   if (!pc->mb_no_coeff_skip) {
     skip_coeff = 0;
-  } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
-             vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) {
+  } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
     skip_coeff = 1;
   } else {
     const int nmbs = mb_size;
@@ -758,24 +769,18 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
   }
 
   // Encode the reference frame.
-  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)
-      || vp9_get_segdata(xd, segment_id, SEG_LVL_MODE) >= NEARESTMV) {
-    encode_ref_frame(bc, pc, xd, segment_id, rf);
-  } else {
-    assert(rf == INTRA_FRAME);
-  }
+  encode_ref_frame(bc, pc, xd, segment_id, rf);
 
   if (rf == INTRA_FRAME) {
 #ifdef ENTROPY_STATS
     active_section = 6;
 #endif
 
-    if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
-      if (m->mbmi.sb_type)
-        write_sb_ymode(bc, mode, pc->fc.sb_ymode_prob);
-      else
-        write_ymode(bc, mode, pc->fc.ymode_prob);
-    }
+    if (m->mbmi.sb_type)
+      write_sb_ymode(bc, mode, pc->fc.sb_ymode_prob);
+    else
+      write_ymode(bc, mode, pc->fc.ymode_prob);
+
     if (mode == B_PRED) {
       int j = 0;
       do {
@@ -801,14 +806,12 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
 
     vp9_mv_ref_probs(&cpi->common, mv_ref_p, mi->mb_mode_context[rf]);
 
-    // #ifdef ENTROPY_STATS
 #ifdef ENTROPY_STATS
-    accum_mv_refs(mode, ct);
     active_section = 3;
 #endif
 
-    // Is the segment coding of mode enabled
-    if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
+    // If segment skip is not enabled code the mode.
+    if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
       if (mi->sb_type) {
         write_sb_mv_ref(bc, mode, mv_ref_p);
       } else {
@@ -878,12 +881,12 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
 #ifdef ENTROPY_STATS
         active_section = 5;
 #endif
-        write_nmv(bc, &mi->mv[0].as_mv, &mi->best_mv,
+        write_nmv(cpi, bc, &mi->mv[0].as_mv, &mi->best_mv,
                   (const nmv_context*) nmvc,
                   xd->allow_high_precision_mv);
 
         if (mi->second_ref_frame > 0) {
-          write_nmv(bc, &mi->mv[1].as_mv, &mi->best_second_mv,
+          write_nmv(cpi, bc, &mi->mv[1].as_mv, &mi->best_second_mv,
                     (const nmv_context*) nmvc,
                     xd->allow_high_precision_mv);
         }
@@ -915,7 +918,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
 #else
           while (j != L[++k]);
 #endif
-          leftmv.as_int = left_block_mv(m, k);
+          leftmv.as_int = left_block_mv(xd, m, k);
           abovemv.as_int = above_block_mv(m, k, mis);
           mv_contz = vp9_mv_cont(&leftmv, &abovemv);
 
@@ -926,12 +929,12 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
 #ifdef ENTROPY_STATS
             active_section = 11;
 #endif
-            write_nmv(bc, &blockmv.as_mv, &mi->best_mv,
+            write_nmv(cpi, bc, &blockmv.as_mv, &mi->best_mv,
                       (const nmv_context*) nmvc,
                       xd->allow_high_precision_mv);
 
             if (mi->second_ref_frame > 0) {
-              write_nmv(bc,
+              write_nmv(cpi, bc,
                         &cpi->mb.partition_info->bmi[j].second_mv.as_mv,
                         &mi->best_second_mv,
                         (const nmv_context*) nmvc,
@@ -951,8 +954,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
                                mi->partitioning == PARTITIONING_4X4))) &&
       pc->txfm_mode == TX_MODE_SELECT &&
       !((pc->mb_no_coeff_skip && skip_coeff) ||
-        (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
-         vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
+        (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {
     TX_SIZE sz = mi->txfm_size;
     // FIXME(rbultje) code ternary symbol once all experiments are merged
     vp9_write(bc, sz != TX_4X4, pc->prob_tx[0]);
@@ -981,8 +983,7 @@ static void write_mb_modes_kf(const VP9_COMP *cpi,
 
   if (!c->mb_no_coeff_skip) {
     skip_coeff = 0;
-  } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
-             vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) {
+  } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
     skip_coeff = 1;
   } else {
     const int nmbs = 1 << m->mbmi.sb_type;
@@ -1013,7 +1014,8 @@ static void write_mb_modes_kf(const VP9_COMP *cpi,
     int i = 0;
     do {
       const B_PREDICTION_MODE A = above_block_mode(m, i, mis);
-      const B_PREDICTION_MODE L = left_block_mode(m, i);
+      const B_PREDICTION_MODE L = (xd->left_available || (i & 3)) ?
+                                  left_block_mode(m, i) : B_DC_PRED;
       const int bm = m->bmi[i].as_mode.first;
 
 #ifdef ENTROPY_STATS
@@ -1041,8 +1043,7 @@ static void write_mb_modes_kf(const VP9_COMP *cpi,
 
   if (ym <= I8X8_PRED && c->txfm_mode == TX_MODE_SELECT &&
       !((c->mb_no_coeff_skip && skip_coeff) ||
-        (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
-         vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
+        (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {
     TX_SIZE sz = m->mbmi.txfm_size;
     // FIXME(rbultje) code ternary symbol once all experiments are merged
     vp9_write(bc, sz != TX_4X4, c->prob_tx[0]);
@@ -1061,6 +1062,10 @@ static void write_modes_b(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
 
   xd->mode_info_context = m;
+  xd->left_available = mb_col > c->cur_tile_mb_col_start;
+  xd->right_available =
+      (mb_col + (1 << m->mbmi.sb_type)) < c->cur_tile_mb_col_end;
+  xd->up_available = mb_row > 0;
   if (c->frame_type == KEY_FRAME) {
     write_mb_modes_kf(cpi, m, bc,
                       c->mb_rows - mb_row, c->mb_cols - mb_col);
@@ -1079,20 +1084,22 @@ static void write_modes_b(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
   pack_mb_tokens(bc, tok, tok_end);
 }
 
-static void write_modes(VP9_COMP *cpi, vp9_writer* const bc) {
+static void write_modes(VP9_COMP *cpi, vp9_writer* const bc,
+                        TOKENEXTRA **tok, TOKENEXTRA *tok_end) {
   VP9_COMMON *const c = &cpi->common;
   const int mis = c->mode_info_stride;
   MODE_INFO *m, *m_ptr = c->mi;
   int i, mb_row, mb_col;
-  TOKENEXTRA *tok = cpi->tok;
-  TOKENEXTRA *tok_end = tok + cpi->tok_count;
 
-  for (mb_row = 0; mb_row < c->mb_rows; mb_row += 4, m_ptr += 4 * mis) {
+  m_ptr += c->cur_tile_mb_col_start + c->cur_tile_mb_row_start * mis;
+  for (mb_row = c->cur_tile_mb_row_start;
+       mb_row < c->cur_tile_mb_row_end; mb_row += 4, m_ptr += 4 * mis) {
     m = m_ptr;
-    for (mb_col = 0; mb_col < c->mb_cols; mb_col += 4, m += 4) {
+    for (mb_col = c->cur_tile_mb_col_start;
+         mb_col < c->cur_tile_mb_col_end; mb_col += 4, m += 4) {
       vp9_write(bc, m->mbmi.sb_type == BLOCK_SIZE_SB64X64, c->sb64_coded);
       if (m->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
-        write_modes_b(cpi, m, bc, &tok, tok_end, mb_row, mb_col);
+        write_modes_b(cpi, m, bc, tok, tok_end, mb_row, mb_col);
       } else {
         int j;
 
@@ -1107,7 +1114,7 @@ static void write_modes(VP9_COMP *cpi, vp9_writer* const bc) {
           vp9_write(bc, sb_m->mbmi.sb_type, c->sb32_coded);
           if (sb_m->mbmi.sb_type) {
             assert(sb_m->mbmi.sb_type == BLOCK_SIZE_SB32X32);
-            write_modes_b(cpi, sb_m, bc, &tok, tok_end,
+            write_modes_b(cpi, sb_m, bc, tok, tok_end,
                           mb_row + y_idx_sb, mb_col + x_idx_sb);
           } else {
             // Process the 4 MBs in the order:
@@ -1123,7 +1130,7 @@ static void write_modes(VP9_COMP *cpi, vp9_writer* const bc) {
               }
 
               assert(mb_m->mbmi.sb_type == BLOCK_SIZE_MB16X16);
-              write_modes_b(cpi, mb_m, bc, &tok, tok_end,
+              write_modes_b(cpi, mb_m, bc, tok, tok_end,
                             mb_row + y_idx, mb_col + x_idx);
             }
           }
@@ -1135,20 +1142,23 @@ static void write_modes(VP9_COMP *cpi, vp9_writer* const bc) {
 
 
 /* This function is used for debugging probability trees. */
-static void print_prob_tree(vp9_coeff_probs *coef_probs) {
+static void print_prob_tree(vp9_coeff_probs *coef_probs, int block_types) {
   /* print coef probability tree */
-  int i, j, k, l;
+  int i, j, k, l, m;
   FILE *f = fopen("enc_tree_probs.txt", "a");
   fprintf(f, "{\n");
-  for (i = 0; i < BLOCK_TYPES_4X4; i++) {
+  for (i = 0; i < block_types; i++) {
     fprintf(f, "  {\n");
-    for (j = 0; j < COEF_BANDS; j++) {
-      fprintf(f, "    {\n");
-      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
-        fprintf(f, "      {");
-        for (l = 0; l < ENTROPY_NODES; l++) {
-          fprintf(f, "%3u, ",
-                  (unsigned int)(coef_probs [i][j][k][l]));
+    for (j = 0; j < REF_TYPES; ++j) {
+      fprintf(f, "  {\n");
+      for (k = 0; k < COEF_BANDS; k++) {
+        fprintf(f, "    {\n");
+        for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
+          fprintf(f, "      {");
+          for (m = 0; m < ENTROPY_NODES; m++) {
+            fprintf(f, "%3u, ",
+                    (unsigned int)(coef_probs[i][j][k][l][m]));
+          }
         }
         fprintf(f, " }\n");
       }
@@ -1168,26 +1178,28 @@ static void build_tree_distribution(vp9_coeff_probs *coef_probs,
 #endif
                                     vp9_coeff_stats *coef_branch_ct,
                                     int block_types) {
-  int i = 0, j, k;
+  int i, j, k, l;
 #ifdef ENTROPY_STATS
   int t = 0;
 #endif
 
   for (i = 0; i < block_types; ++i) {
-    for (j = 0; j < COEF_BANDS; ++j) {
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-          continue;
-        vp9_tree_probs_from_distribution(MAX_ENTROPY_TOKENS,
-                                         vp9_coef_encodings, vp9_coef_tree,
-                                         coef_probs[i][j][k],
-                                         coef_branch_ct[i][j][k],
-                                         coef_counts[i][j][k]);
+    for (j = 0; j < REF_TYPES; ++j) {
+      for (k = 0; k < COEF_BANDS; ++k) {
+        for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
+          if (l >= 3 && k == 0)
+            continue;
+          vp9_tree_probs_from_distribution(MAX_ENTROPY_TOKENS,
+                                           vp9_coef_encodings, vp9_coef_tree,
+                                           coef_probs[i][j][k][l],
+                                           coef_branch_ct[i][j][k][l],
+                                           coef_counts[i][j][k][l]);
 #ifdef ENTROPY_STATS
         if (!cpi->dummy_packing)
           for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-            context_counters[i][j][k][t] += coef_counts[i][j][k][t];
+            context_counters[i][j][k][l][t] += coef_counts[i][j][k][l][t];
 #endif
+        }
       }
     }
   }
@@ -1199,37 +1211,19 @@ static void build_coeff_contexts(VP9_COMP *cpi) {
 #ifdef ENTROPY_STATS
                           cpi, context_counters_4x4,
 #endif
-                          cpi->frame_branch_ct_4x4, BLOCK_TYPES_4X4);
-  build_tree_distribution(cpi->frame_hybrid_coef_probs_4x4,
-                          cpi->hybrid_coef_counts_4x4,
-#ifdef ENTROPY_STATS
-                          cpi, hybrid_context_counters_4x4,
-#endif
-                          cpi->frame_hybrid_branch_ct_4x4, BLOCK_TYPES_4X4);
+                          cpi->frame_branch_ct_4x4, BLOCK_TYPES);
   build_tree_distribution(cpi->frame_coef_probs_8x8,
                           cpi->coef_counts_8x8,
 #ifdef ENTROPY_STATS
                           cpi, context_counters_8x8,
 #endif
-                          cpi->frame_branch_ct_8x8, BLOCK_TYPES_8X8);
-  build_tree_distribution(cpi->frame_hybrid_coef_probs_8x8,
-                          cpi->hybrid_coef_counts_8x8,
-#ifdef ENTROPY_STATS
-                          cpi, hybrid_context_counters_8x8,
-#endif
-                          cpi->frame_hybrid_branch_ct_8x8, BLOCK_TYPES_8X8);
+                          cpi->frame_branch_ct_8x8, BLOCK_TYPES);
   build_tree_distribution(cpi->frame_coef_probs_16x16,
                           cpi->coef_counts_16x16,
 #ifdef ENTROPY_STATS
                           cpi, context_counters_16x16,
 #endif
-                          cpi->frame_branch_ct_16x16, BLOCK_TYPES_16X16);
-  build_tree_distribution(cpi->frame_hybrid_coef_probs_16x16,
-                          cpi->hybrid_coef_counts_16x16,
-#ifdef ENTROPY_STATS
-                          cpi, hybrid_context_counters_16x16,
-#endif
-                          cpi->frame_hybrid_branch_ct_16x16, BLOCK_TYPES_16X16);
+                          cpi->frame_branch_ct_16x16, BLOCK_TYPES);
   build_tree_distribution(cpi->frame_coef_probs_32x32,
                           cpi->coef_counts_32x32,
 #ifdef ENTROPY_STATS
@@ -1247,7 +1241,7 @@ static void update_coef_probs_common(vp9_writer* const bc,
                                      vp9_coeff_probs *old_frame_coef_probs,
                                      vp9_coeff_stats *frame_branch_ct,
                                      int block_types) {
-  int i, j, k, t;
+  int i, j, k, l, t;
   int update[2] = {0, 0};
   int savings;
   // vp9_prob bestupd = find_coef_update_prob(cpi);
@@ -1255,38 +1249,39 @@ static void update_coef_probs_common(vp9_writer* const bc,
   /* dry run to see if there is any udpate at all needed */
   savings = 0;
   for (i = 0; i < block_types; ++i) {
-    for (j = !i; j < COEF_BANDS; ++j) {
-      int prev_coef_savings[ENTROPY_NODES] = {0};
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        for (t = 0; t < ENTROPY_NODES; ++t) {
-          vp9_prob newp = new_frame_coef_probs[i][j][k][t];
-          const vp9_prob oldp = old_frame_coef_probs[i][j][k][t];
-          const vp9_prob upd = COEF_UPDATE_PROB;
-          int s = prev_coef_savings[t];
-          int u = 0;
-          if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-            continue;
+    for (j = 0; j < REF_TYPES; ++j) {
+      for (k = 0; k < COEF_BANDS; ++k) {
+        int prev_coef_savings[ENTROPY_NODES] = {0};
+        for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
+          for (t = 0; t < ENTROPY_NODES; ++t) {
+            vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
+            const vp9_prob oldp = old_frame_coef_probs[i][j][k][l][t];
+            const vp9_prob upd = COEF_UPDATE_PROB;
+            int s = prev_coef_savings[t];
+            int u = 0;
+
+            if (l >= 3 && k == 0)
+              continue;
 #if defined(SEARCH_NEWP)
-          s = prob_diff_update_savings_search(
-                frame_branch_ct[i][j][k][t],
-                oldp, &newp, upd);
-          if (s > 0 && newp != oldp)
-            u = 1;
-          if (u)
-            savings += s - (int)(vp9_cost_zero(upd));
-          else
-            savings -= (int)(vp9_cost_zero(upd));
+            s = prob_diff_update_savings_search(frame_branch_ct[i][j][k][l][t],
+                                                oldp, &newp, upd);
+            if (s > 0 && newp != oldp)
+              u = 1;
+            if (u)
+              savings += s - (int)(vp9_cost_zero(upd));
+            else
+              savings -= (int)(vp9_cost_zero(upd));
 #else
-          s = prob_update_savings(
-                frame_branch_ct[i][j][k][t],
-                oldp, newp, upd);
-          if (s > 0)
-            u = 1;
-          if (u)
-            savings += s;
+            s = prob_update_savings(frame_branch_ct[i][j][k][l][t],
+                                    oldp, newp, upd);
+            if (s > 0)
+              u = 1;
+            if (u)
+              savings += s;
 #endif
 
-          update[u]++;
+            update[u]++;
+          }
         }
       }
     }
@@ -1299,41 +1294,42 @@ static void update_coef_probs_common(vp9_writer* const bc,
   } else {
     vp9_write_bit(bc, 1);
     for (i = 0; i < block_types; ++i) {
-      for (j = !i; j < COEF_BANDS; ++j) {
-        int prev_coef_savings[ENTROPY_NODES] = {0};
-        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-          // calc probs and branch cts for this frame only
-          for (t = 0; t < ENTROPY_NODES; ++t) {
-            vp9_prob newp = new_frame_coef_probs[i][j][k][t];
-            vp9_prob *oldp = old_frame_coef_probs[i][j][k] + t;
-            const vp9_prob upd = COEF_UPDATE_PROB;
-            int s = prev_coef_savings[t];
-            int u = 0;
-            if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-              continue;
+      for (j = 0; j < REF_TYPES; ++j) {
+        for (k = 0; k < COEF_BANDS; ++k) {
+          int prev_coef_savings[ENTROPY_NODES] = {0};
+          for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
+            // calc probs and branch cts for this frame only
+            for (t = 0; t < ENTROPY_NODES; ++t) {
+              vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
+              vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t;
+              const vp9_prob upd = COEF_UPDATE_PROB;
+              int s = prev_coef_savings[t];
+              int u = 0;
+              if (l >= 3 && k == 0)
+                continue;
 
 #if defined(SEARCH_NEWP)
-            s = prob_diff_update_savings_search(
-                  frame_branch_ct[i][j][k][t],
-                  *oldp, &newp, upd);
-            if (s > 0 && newp != *oldp)
-              u = 1;
+              s = prob_diff_update_savings_search(
+                      frame_branch_ct[i][j][k][l][t],
+                      *oldp, &newp, upd);
+              if (s > 0 && newp != *oldp)
+                u = 1;
 #else
-            s = prob_update_savings(
-                  frame_branch_ct[i][j][k][t],
-                  *oldp, newp, upd);
-            if (s > 0)
-              u = 1;
+              s = prob_update_savings(frame_branch_ct[i][j][k][l][t],
+                                      *oldp, newp, upd);
+              if (s > 0)
+                u = 1;
 #endif
-            vp9_write(bc, u, upd);
+              vp9_write(bc, u, upd);
 #ifdef ENTROPY_STATS
-            if (!cpi->dummy_packing)
-              ++tree_update_hist[i][j][k][t][u];
+              if (!cpi->dummy_packing)
+                ++tree_update_hist[i][j][k][l][t][u];
 #endif
-            if (u) {
-              /* send/use new probability */
-              write_prob_diff_update(bc, newp, *oldp);
-              *oldp = newp;
+              if (u) {
+                /* send/use new probability */
+                write_prob_diff_update(bc, newp, *oldp);
+                *oldp = newp;
+              }
             }
           }
         }
@@ -1356,17 +1352,7 @@ static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) {
                            cpi->frame_coef_probs_4x4,
                            cpi->common.fc.coef_probs_4x4,
                            cpi->frame_branch_ct_4x4,
-                           BLOCK_TYPES_4X4);
-
-  update_coef_probs_common(bc,
-#ifdef ENTROPY_STATS
-                           cpi,
-                           hybrid_tree_update_hist_4x4,
-#endif
-                           cpi->frame_hybrid_coef_probs_4x4,
-                           cpi->common.fc.hybrid_coef_probs_4x4,
-                           cpi->frame_hybrid_branch_ct_4x4,
-                           BLOCK_TYPES_4X4);
+                           BLOCK_TYPES);
 
   /* do not do this if not even allowed */
   if (cpi->common.txfm_mode != ONLY_4X4) {
@@ -1378,17 +1364,7 @@ static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) {
                              cpi->frame_coef_probs_8x8,
                              cpi->common.fc.coef_probs_8x8,
                              cpi->frame_branch_ct_8x8,
-                             BLOCK_TYPES_8X8);
-
-    update_coef_probs_common(bc,
-#ifdef ENTROPY_STATS
-                             cpi,
-                             hybrid_tree_update_hist_8x8,
-#endif
-                             cpi->frame_hybrid_coef_probs_8x8,
-                             cpi->common.fc.hybrid_coef_probs_8x8,
-                             cpi->frame_hybrid_branch_ct_8x8,
-                             BLOCK_TYPES_8X8);
+                             BLOCK_TYPES);
   }
 
   if (cpi->common.txfm_mode > ALLOW_8X8) {
@@ -1400,16 +1376,7 @@ static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) {
                              cpi->frame_coef_probs_16x16,
                              cpi->common.fc.coef_probs_16x16,
                              cpi->frame_branch_ct_16x16,
-                             BLOCK_TYPES_16X16);
-    update_coef_probs_common(bc,
-#ifdef ENTROPY_STATS
-                             cpi,
-                             hybrid_tree_update_hist_16x16,
-#endif
-                             cpi->frame_hybrid_coef_probs_16x16,
-                             cpi->common.fc.hybrid_coef_probs_16x16,
-                             cpi->frame_hybrid_branch_ct_16x16,
-                             BLOCK_TYPES_16X16);
+                             BLOCK_TYPES);
   }
 
   if (cpi->common.txfm_mode > ALLOW_16X16) {
@@ -1523,33 +1490,37 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
    * and color type.
    */
   if (oh.type == KEY_FRAME) {
-    int v;
-
     // Start / synch code
     cx_data[0] = 0x9D;
     cx_data[1] = 0x01;
     cx_data[2] = 0x2a;
+    extra_bytes_packed = 3;
+    cx_data += extra_bytes_packed;
+  }
+  {
+    int v;
 
+    /* TODO(jkoleszar): support arbitrary resolutions */
     v = (pc->horiz_scale << 14) | pc->Width;
-    cx_data[3] = v;
-    cx_data[4] = v >> 8;
+    cx_data[0] = v;
+    cx_data[1] = v >> 8;
 
     v = (pc->vert_scale << 14) | pc->Height;
-    cx_data[5] = v;
-    cx_data[6] = v >> 8;
+    cx_data[2] = v;
+    cx_data[3] = v >> 8;
 
-    extra_bytes_packed = 7;
-    cx_data += extra_bytes_packed;
+    extra_bytes_packed += 4;
+    cx_data += 4;
+  }
 
-    vp9_start_encode(&header_bc, cx_data);
+  vp9_start_encode(&header_bc, cx_data);
 
-    // signal clr type
-    vp9_write_bit(&header_bc, pc->clr_type);
-    vp9_write_bit(&header_bc, pc->clamp_type);
+  // TODO(jkoleszar): remove these two unused bits?
+  vp9_write_bit(&header_bc, pc->clr_type);
+  vp9_write_bit(&header_bc, pc->clamp_type);
 
-  } else {
-    vp9_start_encode(&header_bc, cx_data);
-  }
+  // error resilient mode
+  vp9_write_bit(&header_bc, pc->error_resilient_mode);
 
   // Signal whether or not Segmentation is enabled
   vp9_write_bit(&header_bc, (xd->segmentation_enabled) ? 1 : 0);
@@ -1655,7 +1626,10 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
   pc->sb32_coded = get_binary_prob(cpi->sb32_count[0], cpi->sb32_count[1]);
   vp9_write_literal(&header_bc, pc->sb32_coded, 8);
 
-  {
+  vp9_write_bit(&header_bc, cpi->mb.e_mbd.lossless);
+  if (cpi->mb.e_mbd.lossless) {
+    pc->txfm_mode = ONLY_4X4;
+  } else {
     if (pc->txfm_mode == TX_MODE_SELECT) {
       pc->prob_tx[0] = get_prob(cpi->txfm_count_32x32p[TX_4X4] +
                                 cpi->txfm_count_16x16p[TX_4X4] +
@@ -1765,29 +1739,35 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
 
   // Transmit Dc, Second order and Uv quantizer delta information
   put_delta_q(&header_bc, pc->y1dc_delta_q);
-  put_delta_q(&header_bc, pc->y2dc_delta_q);
-  put_delta_q(&header_bc, pc->y2ac_delta_q);
   put_delta_q(&header_bc, pc->uvdc_delta_q);
   put_delta_q(&header_bc, pc->uvac_delta_q);
 
   // When there is a key frame all reference buffers are updated using the new key frame
   if (pc->frame_type != KEY_FRAME) {
-    // Should the GF or ARF be updated using the transmitted frame or buffer
-    vp9_write_bit(&header_bc, pc->refresh_golden_frame);
-    vp9_write_bit(&header_bc, pc->refresh_alt_ref_frame);
-
-    // For inter frames the current default behavior is that when
-    // cm->refresh_golden_frame is set we copy the old GF over to
-    // the ARF buffer. This is purely an encoder decision at present.
-    if (pc->refresh_golden_frame)
-      pc->copy_buffer_to_arf  = 2;
-
-    // If not being updated from current frame should either GF or ARF be updated from another buffer
-    if (!pc->refresh_golden_frame)
-      vp9_write_literal(&header_bc, pc->copy_buffer_to_gf, 2);
+    int refresh_mask;
 
-    if (!pc->refresh_alt_ref_frame)
-      vp9_write_literal(&header_bc, pc->copy_buffer_to_arf, 2);
+    // Should the GF or ARF be updated using the transmitted frame or buffer
+    if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) {
+      /* Preserve the previously existing golden frame and update the frame in
+       * the alt ref slot instead. This is highly specific to the use of
+       * alt-ref as a forward reference, and this needs to be generalized as
+       * other uses are implemented (like RTC/temporal scaling)
+       *
+       * gld_fb_idx and alt_fb_idx need to be swapped for future frames, but
+       * that happens in vp9_onyx_if.c:update_reference_frames() so that it can
+       * be done outside of the recode loop.
+       */
+      refresh_mask = (cpi->refresh_last_frame << cpi->lst_fb_idx) |
+                     (cpi->refresh_golden_frame << cpi->alt_fb_idx);
+    } else {
+      refresh_mask = (cpi->refresh_last_frame << cpi->lst_fb_idx) |
+                     (cpi->refresh_golden_frame << cpi->gld_fb_idx) |
+                     (cpi->refresh_alt_ref_frame << cpi->alt_fb_idx);
+    }
+    vp9_write_literal(&header_bc, refresh_mask, NUM_REF_FRAMES);
+    vp9_write_literal(&header_bc, cpi->lst_fb_idx, NUM_REF_FRAMES_LG2);
+    vp9_write_literal(&header_bc, cpi->gld_fb_idx, NUM_REF_FRAMES_LG2);
+    vp9_write_literal(&header_bc, cpi->alt_fb_idx, NUM_REF_FRAMES_LG2);
 
     // Indicate reference frame sign bias for Golden and ARF frames (always 0 for last frame buffer)
     vp9_write_bit(&header_bc, pc->ref_frame_sign_bias[GOLDEN_FRAME]);
@@ -1831,10 +1811,13 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
 #endif
   }
 
-  vp9_write_bit(&header_bc, pc->refresh_entropy_probs);
+  if (!pc->error_resilient_mode) {
+    vp9_write_bit(&header_bc, pc->refresh_entropy_probs);
+    vp9_write_bit(&header_bc, pc->frame_parallel_decoding_mode);
+  }
 
-  if (pc->frame_type != KEY_FRAME)
-    vp9_write_bit(&header_bc, pc->refresh_last_frame);
+  vp9_write_literal(&header_bc, pc->frame_context_idx,
+                    NUM_FRAME_CONTEXTS_LG2);
 
 #ifdef ENTROPY_STATS
   if (pc->frame_type == INTER_FRAME)
@@ -1848,7 +1831,13 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
   if (pc->frame_type != KEY_FRAME) {
     int i, j;
     int new_context[INTER_MODE_CONTEXTS][4];
-    update_mode_probs(pc, new_context);
+    if (!cpi->dummy_packing) {
+      update_inter_mode_probs(pc, new_context);
+    } else {
+      // In dummy pack assume context unchanged.
+      vpx_memcpy(new_context, pc->fc.vp9_mode_contexts,
+                 sizeof(pc->fc.vp9_mode_contexts));
+    }
 
     for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
       for (j = 0; j < 4; j++) {
@@ -1902,16 +1891,10 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
 
   vp9_copy(cpi->common.fc.pre_coef_probs_4x4,
            cpi->common.fc.coef_probs_4x4);
-  vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_4x4,
-           cpi->common.fc.hybrid_coef_probs_4x4);
   vp9_copy(cpi->common.fc.pre_coef_probs_8x8,
            cpi->common.fc.coef_probs_8x8);
-  vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_8x8,
-           cpi->common.fc.hybrid_coef_probs_8x8);
   vp9_copy(cpi->common.fc.pre_coef_probs_16x16,
            cpi->common.fc.coef_probs_16x16);
-  vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_16x16,
-           cpi->common.fc.hybrid_coef_probs_16x16);
   vp9_copy(cpi->common.fc.pre_coef_probs_32x32,
            cpi->common.fc.coef_probs_32x32);
   vp9_copy(cpi->common.fc.pre_sb_ymode_prob, cpi->common.fc.sb_ymode_prob);
@@ -1960,7 +1943,7 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
     if (pc->mcomp_filter_type == SWITCHABLE)
       update_switchable_interp_probs(cpi, &header_bc);
 
-    #if CONFIG_COMP_INTERINTRA_PRED
+#if CONFIG_COMP_INTERINTRA_PRED
     if (pc->use_interintra) {
       vp9_cond_prob_update(&header_bc,
                            &pc->fc.interintra_prob,
@@ -1995,6 +1978,25 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
     vp9_write_nmv_probs(cpi, xd->allow_high_precision_mv, &header_bc);
   }
 
+  /* tiling */
+  {
+    int min_log2_tiles, delta_log2_tiles, n_tile_bits, n;
+
+    vp9_get_tile_n_bits(pc, &min_log2_tiles, &delta_log2_tiles);
+    n_tile_bits = pc->log2_tile_columns - min_log2_tiles;
+    for (n = 0; n < delta_log2_tiles; n++) {
+      if (n_tile_bits--) {
+        vp9_write_bit(&header_bc, 1);
+      } else {
+        vp9_write_bit(&header_bc, 0);
+        break;
+      }
+    }
+    vp9_write_bit(&header_bc, pc->log2_tile_rows != 0);
+    if (pc->log2_tile_rows != 0)
+      vp9_write_bit(&header_bc, pc->log2_tile_rows != 1);
+  }
+
   vp9_stop_encode(&header_bc);
 
   oh.first_partition_length_in_bytes = header_bc.pos;
@@ -2012,42 +2014,80 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
   }
 
   *size = VP9_HEADER_SIZE + extra_bytes_packed + header_bc.pos;
-  vp9_start_encode(&residual_bc, cx_data + header_bc.pos);
 
   if (pc->frame_type == KEY_FRAME) {
     decide_kf_ymode_entropy(cpi);
-    write_modes(cpi, &residual_bc);
   } else {
     /* This is not required if the counts in cpi are consistent with the
      * final packing pass */
     // if (!cpi->dummy_packing) vp9_zero(cpi->NMVcount);
-    write_modes(cpi, &residual_bc);
-
-    vp9_update_mode_context(&cpi->common);
   }
 
-  vp9_stop_encode(&residual_bc);
+  {
+    int tile_row, tile_col, total_size = 0;
+    unsigned char *data_ptr = cx_data + header_bc.pos;
+    TOKENEXTRA *tok[1 << 6], *tok_end;
+
+    tok[0] = cpi->tok;
+    for (tile_col = 1; tile_col < pc->tile_columns; tile_col++)
+      tok[tile_col] = tok[tile_col - 1] + cpi->tok_count[tile_col - 1];
+
+    for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) {
+      vp9_get_tile_row_offsets(pc, tile_row);
+      tok_end = cpi->tok;  /* moved to the end of each column's tokens below */
+      for (tile_col = 0; tile_col < pc->tile_columns; tile_col++) {
+        tok_end += cpi->tok_count[tile_col];  /* index stays < tile_columns */
+        vp9_get_tile_col_offsets(pc, tile_col);
+        /* Every tile except the last leaves 4 bytes in front for its size. */
+        if (tile_col < pc->tile_columns - 1 || tile_row < pc->tile_rows - 1)
+          vp9_start_encode(&residual_bc, data_ptr + total_size + 4);
+        else
+          vp9_start_encode(&residual_bc, data_ptr + total_size);
+        write_modes(cpi, &residual_bc, &tok[tile_col], tok_end);
+        vp9_stop_encode(&residual_bc);
+        if (tile_col < pc->tile_columns - 1 || tile_row < pc->tile_rows - 1) {
+          /* tile size: 4 bytes, little-endian, stored ahead of the tile data */
+          data_ptr[total_size + 0] = residual_bc.pos;
+          data_ptr[total_size + 1] = residual_bc.pos >> 8;
+          data_ptr[total_size + 2] = residual_bc.pos >> 16;
+          data_ptr[total_size + 3] = residual_bc.pos >> 24;
+          total_size += 4;
+        }
+
+        total_size += residual_bc.pos;
+      }
+    }
+
+    assert((unsigned int)(tok[0] - cpi->tok) == cpi->tok_count[0]);
+    for (tile_col = 1; tile_col < pc->tile_columns; tile_col++)
+      assert((unsigned int)(tok[tile_col] - tok[tile_col - 1]) ==
+                  cpi->tok_count[tile_col]);
 
-  *size += residual_bc.pos;
+    *size += total_size;
+  }
 }
 
 #ifdef ENTROPY_STATS
 static void print_tree_update_for_type(FILE *f,
                                        vp9_coeff_stats *tree_update_hist,
                                        int block_types, const char *header) {
-  int i, j, k, l;
+  int i, j, k, l, m;
 
   fprintf(f, "const vp9_coeff_prob %s = {\n", header);
   for (i = 0; i < block_types; i++) {
     fprintf(f, "  { \n");
-    for (j = 0; j < COEF_BANDS; j++) {
-      fprintf(f, "    {\n");
-      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
-        fprintf(f, "      {");
-        for (l = 0; l < ENTROPY_NODES; l++) {
-          fprintf(f, "%3d, ",
-                  get_binary_prob(tree_update_hist[i][j][k][l][0],
-                                  tree_update_hist[i][j][k][l][1]));
+    for (j = 0; j < REF_TYPES; j++) {
+      fprintf(f, "  { \n");
+      for (k = 0; k < COEF_BANDS; k++) {
+        fprintf(f, "    {\n");
+        for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
+          fprintf(f, "      {");
+          for (m = 0; m < ENTROPY_NODES; m++) {
+            fprintf(f, "%3d, ",
+                    get_binary_prob(tree_update_hist[i][j][k][l][m][0],
+                                    tree_update_hist[i][j][k][l][m][1]));
+          }
+          fprintf(f, "},\n");
         }
         fprintf(f, "},\n");
       }
@@ -2062,18 +2102,11 @@ void print_tree_update_probs() {
   FILE *f = fopen("coefupdprob.h", "w");
   fprintf(f, "\n/* Update probabilities for token entropy tree. */\n\n");
 
-  print_tree_update_for_type(f, tree_update_hist_4x4, BLOCK_TYPES_4X4,
+  print_tree_update_for_type(f, tree_update_hist_4x4, BLOCK_TYPES,
                              "vp9_coef_update_probs_4x4[BLOCK_TYPES_4X4]");
-  print_tree_update_for_type(f, hybrid_tree_update_hist_4x4, BLOCK_TYPES_4X4,
-                             "vp9_coef_update_probs_4x4[BLOCK_TYPES_4X4]");
-  print_tree_update_for_type(f, tree_update_hist_8x8, BLOCK_TYPES_8X8,
-                             "vp9_coef_update_probs_8x8[BLOCK_TYPES_8X8]");
-  print_tree_update_for_type(f, hybrid_tree_update_hist_8x8, BLOCK_TYPES_8X8,
+  print_tree_update_for_type(f, tree_update_hist_8x8, BLOCK_TYPES,
                              "vp9_coef_update_probs_8x8[BLOCK_TYPES_8X8]");
-  print_tree_update_for_type(f, tree_update_hist_16x16, BLOCK_TYPES_16X16,
-                             "vp9_coef_update_probs_16x16[BLOCK_TYPES_16X16]");
-  print_tree_update_for_type(f, hybrid_tree_update_hist_16x16,
-                             BLOCK_TYPES_16X16,
+  print_tree_update_for_type(f, tree_update_hist_16x16, BLOCK_TYPES,
                              "vp9_coef_update_probs_16x16[BLOCK_TYPES_16X16]");
   print_tree_update_for_type(f, tree_update_hist_32x32, BLOCK_TYPES_32X32,
                              "vp9_coef_update_probs_32x32[BLOCK_TYPES_32X32]");
@@ -2083,6 +2116,7 @@ void print_tree_update_probs() {
   fwrite(tree_update_hist_4x4, sizeof(tree_update_hist_4x4), 1, f);
   fwrite(tree_update_hist_8x8, sizeof(tree_update_hist_8x8), 1, f);
   fwrite(tree_update_hist_16x16, sizeof(tree_update_hist_16x16), 1, f);
+  fwrite(tree_update_hist_32x32, sizeof(tree_update_hist_32x32), 1, f);
   fclose(f);
 }
 #endif
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 1960b9162..79a021cfb 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -50,10 +50,7 @@ typedef struct block {
   int src;
   int src_stride;
 
-  int eob_max_offset;
-  int eob_max_offset_8x8;
-  int eob_max_offset_16x16;
-  int eob_max_offset_32x32;
+  int skip_block;
 } BLOCK;
 
 typedef struct {
@@ -91,12 +88,12 @@ typedef struct superblock {
   DECLARE_ALIGNED(16, int16_t, coeff[32*32+16*16*2]);
 } SUPERBLOCK;
 
-typedef struct macroblock {
-  DECLARE_ALIGNED(16, int16_t, src_diff[400]);  // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
-  DECLARE_ALIGNED(16, int16_t, coeff[400]);     // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
+typedef struct macroblock MACROBLOCK;
+struct macroblock {
+  DECLARE_ALIGNED(16, int16_t, src_diff[384]);  // 16x16 Y 8x8 U 8x8 V
+  DECLARE_ALIGNED(16, int16_t, coeff[384]);     // 16x16 Y 8x8 U 8x8 V
   // 16 Y blocks, 4 U blocks, 4 V blocks,
-  // 1 DC 2nd order block each with 16 entries
-  BLOCK block[25];
+  BLOCK block[24];
 
   SUPERBLOCK sb_coeff_data;
 
@@ -160,8 +157,7 @@ typedef struct macroblock {
 
   unsigned char *active_ptr;
 
-  vp9_coeff_count token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES_4X4];
-  vp9_coeff_count hybrid_token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES_4X4];
+  vp9_coeff_count token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES];
 
   int optimize;
 
@@ -172,17 +168,14 @@ typedef struct macroblock {
   PICK_MODE_CONTEXT sb32_context[4];
   PICK_MODE_CONTEXT sb64_context;
 
-  void (*vp9_short_fdct4x4)(int16_t *input, int16_t *output, int pitch);
-  void (*vp9_short_fdct8x4)(int16_t *input, int16_t *output, int pitch);
-  void (*short_walsh4x4)(int16_t *input, int16_t *output, int pitch);
-  void (*quantize_b_4x4)(BLOCK *b, BLOCKD *d);
-  void (*quantize_b_4x4_pair)(BLOCK *b1, BLOCK *b2, BLOCKD *d0, BLOCKD *d1);
-  void (*vp9_short_fdct8x8)(int16_t *input, int16_t *output, int pitch);
-  void (*vp9_short_fdct16x16)(int16_t *input, int16_t *output, int pitch);
-  void (*short_fhaar2x2)(int16_t *input, int16_t *output, int pitch);
-  void (*quantize_b_16x16)(BLOCK *b, BLOCKD *d);
-  void (*quantize_b_8x8)(BLOCK *b, BLOCKD *d);
-  void (*quantize_b_2x2)(BLOCK *b, BLOCKD *d);
-} MACROBLOCK;
+  void (*fwd_txm4x4)(int16_t *input, int16_t *output, int pitch);
+  void (*fwd_txm8x4)(int16_t *input, int16_t *output, int pitch);
+  void (*fwd_txm8x8)(int16_t *input, int16_t *output, int pitch);
+  void (*fwd_txm16x16)(int16_t *input, int16_t *output, int pitch);
+  void (*quantize_b_4x4)(MACROBLOCK *x, int b_idx);
+  void (*quantize_b_4x4_pair)(MACROBLOCK *x, int b_idx1, int b_idx2);
+  void (*quantize_b_16x16)(MACROBLOCK *x, int b_idx);
+  void (*quantize_b_8x8)(MACROBLOCK *x, int b_idx);
+};
 
 #endif  // VP9_ENCODER_VP9_BLOCK_H_
diff --git a/vp9/encoder/vp9_boolhuff.c b/vp9/encoder/vp9_boolhuff.c
index d1b1e0e89..a590902c2 100644
--- a/vp9/encoder/vp9_boolhuff.c
+++ b/vp9/encoder/vp9_boolhuff.c
@@ -40,7 +40,6 @@ const unsigned int vp9_prob_cost[256] = {
 };
 
 void vp9_start_encode(BOOL_CODER *br, unsigned char *source) {
-
   br->lowvalue = 0;
   br->range    = 255;
   br->value    = 0;
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index bfde02ccb..e4ac2ce36 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -15,842 +15,362 @@
 #include "vp9/common/vp9_systemdependent.h"
 
 #include "vp9/common/vp9_blockd.h"
-
-// TODO: these transforms can be converted into integer forms to reduce
-//       the complexity
-static const float dct_4[16] = {
-  0.500000000000000,  0.500000000000000,  0.500000000000000,  0.500000000000000,
-  0.653281482438188,  0.270598050073099, -0.270598050073099, -0.653281482438188,
-  0.500000000000000, -0.500000000000000, -0.500000000000000,  0.500000000000000,
-  0.270598050073099, -0.653281482438188,  0.653281482438188, -0.270598050073099
-};
-
-static const float adst_4[16] = {
-  0.228013428883779,  0.428525073124360,  0.577350269189626,  0.656538502008139,
-  0.577350269189626,  0.577350269189626,  0.000000000000000, -0.577350269189626,
-  0.656538502008139, -0.228013428883779, -0.577350269189626,  0.428525073124359,
-  0.428525073124360, -0.656538502008139,  0.577350269189626, -0.228013428883779
-};
-
-static const float dct_8[64] = {
-  0.353553390593274,   0.353553390593274,   0.353553390593274,   0.353553390593274,
-  0.353553390593274,   0.353553390593274,   0.353553390593274,   0.353553390593274,
-  0.490392640201615,   0.415734806151273,   0.277785116509801,   0.097545161008064,
- -0.097545161008064,  -0.277785116509801,  -0.415734806151273,  -0.490392640201615,
-  0.461939766255643,   0.191341716182545,  -0.191341716182545,  -0.461939766255643,
- -0.461939766255643,  -0.191341716182545,   0.191341716182545,   0.461939766255643,
-  0.415734806151273,  -0.097545161008064,  -0.490392640201615,  -0.277785116509801,
-  0.277785116509801,   0.490392640201615,   0.097545161008064,  -0.415734806151273,
-  0.353553390593274,  -0.353553390593274,  -0.353553390593274,   0.353553390593274,
-  0.353553390593274,  -0.353553390593274,  -0.353553390593274,   0.353553390593274,
-  0.277785116509801,  -0.490392640201615,   0.097545161008064,   0.415734806151273,
- -0.415734806151273,  -0.097545161008064,   0.490392640201615,  -0.277785116509801,
-  0.191341716182545,  -0.461939766255643,   0.461939766255643,  -0.191341716182545,
- -0.191341716182545,   0.461939766255643,  -0.461939766255643,   0.191341716182545,
-  0.097545161008064,  -0.277785116509801,   0.415734806151273,  -0.490392640201615,
-  0.490392640201615,  -0.415734806151273,   0.277785116509801,  -0.097545161008064
-};
-
-static const float adst_8[64] = {
-  0.089131608307533,   0.175227946595735,   0.255357107325376,   0.326790388032145,
-  0.387095214016349,   0.434217976756762,   0.466553967085785,   0.483002021635509,
-  0.255357107325376,   0.434217976756762,   0.483002021635509,   0.387095214016349,
-  0.175227946595735,  -0.089131608307533,  -0.326790388032145,  -0.466553967085785,
-  0.387095214016349,   0.466553967085785,   0.175227946595735,  -0.255357107325376,
- -0.483002021635509,  -0.326790388032145,   0.089131608307533,   0.434217976756762,
-  0.466553967085785,   0.255357107325376,  -0.326790388032145,  -0.434217976756762,
-  0.089131608307533,   0.483002021635509,   0.175227946595735,  -0.387095214016348,
-  0.483002021635509,  -0.089131608307533,  -0.466553967085785,   0.175227946595735,
-  0.434217976756762,  -0.255357107325376,  -0.387095214016348,   0.326790388032145,
-  0.434217976756762,  -0.387095214016348,  -0.089131608307533,   0.466553967085786,
- -0.326790388032145,  -0.175227946595735,   0.483002021635509,  -0.255357107325375,
-  0.326790388032145,  -0.483002021635509,   0.387095214016349,  -0.089131608307534,
- -0.255357107325377,   0.466553967085785,  -0.434217976756762,   0.175227946595736,
-  0.175227946595735,  -0.326790388032145,   0.434217976756762,  -0.483002021635509,
-  0.466553967085785,  -0.387095214016348,   0.255357107325376,  -0.089131608307532
-};
-
-/* Converted the transforms to integers. */
-static const int16_t dct_i4[16] = {
-  16384,  16384,  16384,  16384,
-  21407,   8867,  -8867, -21407,
-  16384, -16384, -16384,  16384,
-   8867, -21407,  21407,  -8867
-};
-
-static const int16_t adst_i4[16] = {
-   7472,  14042,  18919,  21513,
-  18919,  18919,      0, -18919,
-  21513,  -7472, -18919,  14042,
-  14042, -21513,  18919,  -7472
-};
-
-static const int16_t dct_i8[64] = {
-   11585,  11585,  11585,  11585,
-   11585,  11585,  11585,  11585,
-   16069,  13623,   9102,   3196,
-   -3196,  -9102, -13623, -16069,
-   15137,   6270,  -6270, -15137,
-  -15137,  -6270,   6270,  15137,
-   13623,  -3196, -16069,  -9102,
-    9102,  16069,   3196, -13623,
-   11585, -11585, -11585,  11585,
-   11585, -11585, -11585,  11585,
-    9102, -16069,   3196,  13623,
-  -13623,  -3196,  16069,  -9102,
-    6270, -15137,  15137,  -6270,
-   -6270,  15137, -15137,   6270,
-    3196,  -9102,  13623, -16069,
-   16069, -13623,   9102,  -3196
-};
-
-static const int16_t adst_i8[64] = {
-    2921,   5742,   8368,  10708,
-   12684,  14228,  15288,  15827,
-    8368,  14228,  15827,  12684,
-    5742,  -2921, -10708, -15288,
-   12684,  15288,   5742,  -8368,
-  -15827, -10708,   2921,  14228,
-   15288,   8368, -10708, -14228,
-    2921,  15827,   5742, -12684,
-   15827,  -2921, -15288,   5742,
-   14228,  -8368, -12684,  10708,
-   14228, -12684,  -2921,  15288,
-  -10708,  -5742,  15827,  -8368,
-   10708, -15827,  12684,  -2921,
-   -8368,  15288, -14228,   5742,
-    5742, -10708,  14228, -15827,
-   15288, -12684,   8368,  -2921
-};
-
-static const float dct_16[256] = {
-  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,
-  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,
-  0.351851,  0.338330,  0.311806,  0.273300,  0.224292,  0.166664,  0.102631,  0.034654,
- -0.034654, -0.102631, -0.166664, -0.224292, -0.273300, -0.311806, -0.338330, -0.351851,
-  0.346760,  0.293969,  0.196424,  0.068975, -0.068975, -0.196424, -0.293969, -0.346760,
- -0.346760, -0.293969, -0.196424, -0.068975,  0.068975,  0.196424,  0.293969,  0.346760,
-  0.338330,  0.224292,  0.034654, -0.166664, -0.311806, -0.351851, -0.273300, -0.102631,
-  0.102631,  0.273300,  0.351851,  0.311806,  0.166664, -0.034654, -0.224292, -0.338330,
-  0.326641,  0.135299, -0.135299, -0.326641, -0.326641, -0.135299,  0.135299,  0.326641,
-  0.326641,  0.135299, -0.135299, -0.326641, -0.326641, -0.135299,  0.135299,  0.326641,
-  0.311806,  0.034654, -0.273300, -0.338330, -0.102631,  0.224292,  0.351851,  0.166664,
- -0.166664, -0.351851, -0.224292,  0.102631,  0.338330,  0.273300, -0.034654, -0.311806,
-  0.293969, -0.068975, -0.346760, -0.196424,  0.196424,  0.346760,  0.068975, -0.293969,
- -0.293969,  0.068975,  0.346760,  0.196424, -0.196424, -0.346760, -0.068975,  0.293969,
-  0.273300, -0.166664, -0.338330,  0.034654,  0.351851,  0.102631, -0.311806, -0.224292,
-  0.224292,  0.311806, -0.102631, -0.351851, -0.034654,  0.338330,  0.166664, -0.273300,
-  0.250000, -0.250000, -0.250000,  0.250000,  0.250000, -0.250000, -0.250000,  0.250000,
-  0.250000, -0.250000, -0.250000,  0.250000,  0.250000, -0.250000, -0.250000,  0.250000,
-  0.224292, -0.311806, -0.102631,  0.351851, -0.034654, -0.338330,  0.166664,  0.273300,
- -0.273300, -0.166664,  0.338330,  0.034654, -0.351851,  0.102631,  0.311806, -0.224292,
-  0.196424, -0.346760,  0.068975,  0.293969, -0.293969, -0.068975,  0.346760, -0.196424,
- -0.196424,  0.346760, -0.068975, -0.293969,  0.293969,  0.068975, -0.346760,  0.196424,
-  0.166664, -0.351851,  0.224292,  0.102631, -0.338330,  0.273300,  0.034654, -0.311806,
-  0.311806, -0.034654, -0.273300,  0.338330, -0.102631, -0.224292,  0.351851, -0.166664,
-  0.135299, -0.326641,  0.326641, -0.135299, -0.135299,  0.326641, -0.326641,  0.135299,
-  0.135299, -0.326641,  0.326641, -0.135299, -0.135299,  0.326641, -0.326641,  0.135299,
-  0.102631, -0.273300,  0.351851, -0.311806,  0.166664,  0.034654, -0.224292,  0.338330,
- -0.338330,  0.224292, -0.034654, -0.166664,  0.311806, -0.351851,  0.273300, -0.102631,
-  0.068975, -0.196424,  0.293969, -0.346760,  0.346760, -0.293969,  0.196424, -0.068975,
- -0.068975,  0.196424, -0.293969,  0.346760, -0.346760,  0.293969, -0.196424,  0.068975,
-  0.034654, -0.102631,  0.166664, -0.224292,  0.273300, -0.311806,  0.338330, -0.351851,
-  0.351851, -0.338330,  0.311806, -0.273300,  0.224292, -0.166664,  0.102631, -0.034654
-};
-
-static const float adst_16[256] = {
-  0.033094,  0.065889,  0.098087,  0.129396,  0.159534,  0.188227,  0.215215,  0.240255,
-  0.263118,  0.283599,  0.301511,  0.316693,  0.329007,  0.338341,  0.344612,  0.347761,
-  0.098087,  0.188227,  0.263118,  0.316693,  0.344612,  0.344612,  0.316693,  0.263118,
-  0.188227,  0.098087,  0.000000, -0.098087, -0.188227, -0.263118, -0.316693, -0.344612,
-  0.159534,  0.283599,  0.344612,  0.329007,  0.240255,  0.098087, -0.065889, -0.215215,
- -0.316693, -0.347761, -0.301511, -0.188227, -0.033094,  0.129396,  0.263118,  0.338341,
-  0.215215,  0.338341,  0.316693,  0.159534, -0.065889, -0.263118, -0.347761, -0.283599,
- -0.098087,  0.129396,  0.301511,  0.344612,  0.240255,  0.033094, -0.188227, -0.329007,
-  0.263118,  0.344612,  0.188227, -0.098087, -0.316693, -0.316693, -0.098087,  0.188227,
-  0.344612,  0.263118,  0.000000, -0.263118, -0.344612, -0.188227,  0.098087,  0.316693,
-  0.301511,  0.301511,  0.000000, -0.301511, -0.301511, -0.000000,  0.301511,  0.301511,
-  0.000000, -0.301511, -0.301511, -0.000000,  0.301511,  0.301511,  0.000000, -0.301511,
-  0.329007,  0.215215, -0.188227, -0.338341, -0.033094,  0.316693,  0.240255, -0.159534,
- -0.344612, -0.065889,  0.301511,  0.263118, -0.129396, -0.347761, -0.098087,  0.283599,
-  0.344612,  0.098087, -0.316693, -0.188227,  0.263118,  0.263118, -0.188227, -0.316693,
-  0.098087,  0.344612,  0.000000, -0.344612, -0.098087,  0.316693,  0.188227, -0.263118,
-  0.347761, -0.033094, -0.344612,  0.065889,  0.338341, -0.098087, -0.329007,  0.129396,
-  0.316693, -0.159534, -0.301511,  0.188227,  0.283599, -0.215215, -0.263118,  0.240255,
-  0.338341, -0.159534, -0.263118,  0.283599,  0.129396, -0.344612,  0.033094,  0.329007,
- -0.188227, -0.240255,  0.301511,  0.098087, -0.347761,  0.065889,  0.316693, -0.215215,
-  0.316693, -0.263118, -0.098087,  0.344612, -0.188227, -0.188227,  0.344612, -0.098087,
- -0.263118,  0.316693,  0.000000, -0.316693,  0.263118,  0.098087, -0.344612,  0.188227,
-  0.283599, -0.329007,  0.098087,  0.215215, -0.347761,  0.188227,  0.129396, -0.338341,
-  0.263118,  0.033094, -0.301511,  0.316693, -0.065889, -0.240255,  0.344612, -0.159534,
-  0.240255, -0.347761,  0.263118, -0.033094, -0.215215,  0.344612, -0.283599,  0.065889,
-  0.188227, -0.338341,  0.301511, -0.098087, -0.159534,  0.329007, -0.316693,  0.129396,
-  0.188227, -0.316693,  0.344612, -0.263118,  0.098087,  0.098087, -0.263118,  0.344612,
- -0.316693,  0.188227,  0.000000, -0.188227,  0.316693, -0.344612,  0.263118, -0.098087,
-  0.129396, -0.240255,  0.316693, -0.347761,  0.329007, -0.263118,  0.159534, -0.033094,
- -0.098087,  0.215215, -0.301511,  0.344612, -0.338341,  0.283599, -0.188227,  0.065889,
-  0.065889, -0.129396,  0.188227, -0.240255,  0.283599, -0.316693,  0.338341, -0.347761,
-  0.344612, -0.329007,  0.301511, -0.263118,  0.215215, -0.159534,  0.098087, -0.033094
-};
-
-/* Converted the transforms to integers. */
-static const int16_t dct_i16[256] = {
-    8192,   8192,   8192,   8192,   8192,   8192,   8192,   8192,
-    8192,   8192,   8192,   8192,   8192,   8192,   8192,   8192,
-   11529,  11086,  10217,   8955,   7350,   5461,   3363,   1136,
-   -1136,  -3363,  -5461,  -7350,  -8955, -10217, -11086, -11529,
-   11363,   9633,   6436,   2260,  -2260,  -6436,  -9633, -11363,
-  -11363,  -9633,  -6436,  -2260,   2260,   6436,   9633,  11363,
-   11086,   7350,   1136,  -5461, -10217, -11529,  -8955,  -3363,
-    3363,   8955,  11529,  10217,   5461,  -1136,  -7350, -11086,
-   10703,   4433,  -4433, -10703, -10703,  -4433,   4433,  10703,
-   10703,   4433,  -4433, -10703, -10703,  -4433,   4433,  10703,
-   10217,   1136,  -8955, -11086,  -3363,   7350,  11529,   5461,
-   -5461, -11529,  -7350,   3363,  11086,   8955,  -1136, -10217,
-    9633,  -2260, -11363,  -6436,   6436,  11363,   2260,  -9633,
-   -9633,   2260,  11363,   6436,  -6436, -11363,  -2260,   9633,
-    8955,  -5461, -11086,   1136,  11529,   3363, -10217,  -7350,
-    7350,  10217,  -3363, -11529,  -1136,  11086,   5461,  -8955,
-    8192,  -8192,  -8192,   8192,   8192,  -8192,  -8192,   8192,
-    8192,  -8192,  -8192,   8192,   8192,  -8192,  -8192,   8192,
-    7350, -10217,  -3363,  11529,  -1136, -11086,   5461,   8955,
-   -8955,  -5461,  11086,   1136, -11529,   3363,  10217,  -7350,
-    6436, -11363,   2260,   9633,  -9633,  -2260,  11363,  -6436,
-   -6436,  11363,  -2260,  -9633,   9633,   2260, -11363,   6436,
-    5461, -11529,   7350,   3363, -11086,   8955,   1136, -10217,
-   10217,  -1136,  -8955,  11086,  -3363,  -7350,  11529,  -5461,
-    4433, -10703,  10703,  -4433,  -4433,  10703, -10703,   4433,
-    4433, -10703,  10703,  -4433,  -4433,  10703, -10703,   4433,
-    3363,  -8955,  11529, -10217,   5461,   1136,  -7350,  11086,
-  -11086,   7350,  -1136,  -5461,  10217, -11529,   8955,  -3363,
-    2260,  -6436,   9633, -11363,  11363,  -9633,   6436,  -2260,
-   -2260,   6436,  -9633,  11363, -11363,   9633,  -6436,   2260,
-    1136,  -3363,   5461,  -7350,   8955, -10217,  11086, -11529,
-   11529, -11086,  10217,  -8955,   7350,  -5461,   3363,  -1136
-};
-
-static const int16_t adst_i16[256] = {
-    1084,   2159,   3214,   4240,   5228,   6168,   7052,   7873,
-    8622,   9293,   9880,  10377,  10781,  11087,  11292,  11395,
-    3214,   6168,   8622,  10377,  11292,  11292,  10377,   8622,
-    6168,   3214,      0,  -3214,  -6168,  -8622, -10377, -11292,
-    5228,   9293,  11292,  10781,   7873,   3214,  -2159,  -7052,
-  -10377, -11395,  -9880,  -6168,  -1084,   4240,   8622,  11087,
-    7052,  11087,  10377,   5228,  -2159,  -8622, -11395,  -9293,
-   -3214,   4240,   9880,  11292,   7873,   1084,  -6168, -10781,
-    8622,  11292,   6168,  -3214, -10377, -10377,  -3214,   6168,
-   11292,   8622,      0,  -8622, -11292,  -6168,   3214,  10377,
-    9880,   9880,      0,  -9880,  -9880,      0,   9880,   9880,
-       0,  -9880,  -9880,      0,   9880,   9880,      0,  -9880,
-   10781,   7052,  -6168, -11087,  -1084,  10377,   7873,  -5228,
-  -11292,  -2159,   9880,   8622,  -4240, -11395,  -3214,   9293,
-   11292,   3214, -10377,  -6168,   8622,   8622,  -6168, -10377,
-    3214,  11292,      0, -11292,  -3214,  10377,   6168,  -8622,
-   11395,  -1084, -11292,   2159,  11087,  -3214, -10781,   4240,
-   10377,  -5228,  -9880,   6168,   9293,  -7052,  -8622,   7873,
-   11087,  -5228,  -8622,   9293,   4240, -11292,   1084,  10781,
-   -6168,  -7873,   9880,   3214, -11395,   2159,  10377,  -7052,
-   10377,  -8622,  -3214,  11292,  -6168,  -6168,  11292,  -3214,
-   -8622,  10377,      0, -10377,   8622,   3214, -11292,   6168,
-    9293, -10781,   3214,   7052, -11395,   6168,   4240, -11087,
-    8622,   1084,  -9880,  10377,  -2159,  -7873,  11292,  -5228,
-    7873, -11395,   8622,  -1084,  -7052,  11292,  -9293,   2159,
-    6168, -11087,   9880,  -3214,  -5228,  10781, -10377,   4240,
-    6168, -10377,  11292,  -8622,   3214,   3214,  -8622,  11292,
-  -10377,   6168,      0,  -6168,  10377, -11292,   8622,  -3214,
-    4240,  -7873,  10377, -11395,  10781,  -8622,   5228,  -1084,
-   -3214,   7052,  -9880,  11292, -11087,   9293,  -6168,   2159,
-    2159,  -4240,   6168,  -7873,   9293, -10377,  11087, -11395,
-   11292, -10781,   9880,  -8622,   7052,  -5228,   3214,  -1084
-};
-
-static const int xC1S7 = 16069;
-static const int xC2S6 = 15137;
-static const int xC3S5 = 13623;
-static const int xC4S4 = 11585;
-static const int xC5S3 =  9102;
-static const int xC6S2 =  6270;
-static const int xC7S1 =  3196;
-
-#define SHIFT_BITS 14
-#define DOROUND(X) X += (1<<(SHIFT_BITS-1));
-
-#define FINAL_SHIFT 3
-#define FINAL_ROUNDING (1<<(FINAL_SHIFT -1))
-#define IN_SHIFT (FINAL_SHIFT+1)
-
-
-void vp9_short_fdct8x8_c(short *InputData, short *OutputData, int pitch) {
-  int loop;
-  int short_pitch = pitch >> 1;
-  int is07, is12, is34, is56;
-  int is0734, is1256;
-  int id07, id12, id34, id56;
-  int irot_input_x, irot_input_y;
-  int icommon_product1;      // Re-used product  (c4s4 * (s12 - s56))
-  int icommon_product2;      // Re-used product  (c4s4 * (d12 + d56))
-  int temp1, temp2;          // intermediate variable for computation
-
-  int  InterData[64];
-  int  *ip = InterData;
-  short *op = OutputData;
-
-  for (loop = 0; loop < 8; loop++) {
-    // Pre calculate some common sums and differences.
-    is07 = (InputData[0] + InputData[7]) << IN_SHIFT;
-    is12 = (InputData[1] + InputData[2]) << IN_SHIFT;
-    is34 = (InputData[3] + InputData[4]) << IN_SHIFT;
-    is56 = (InputData[5] + InputData[6]) << IN_SHIFT;
-    id07 = (InputData[0] - InputData[7]) << IN_SHIFT;
-    id12 = (InputData[1] - InputData[2]) << IN_SHIFT;
-    id34 = (InputData[3] - InputData[4]) << IN_SHIFT;
-    id56 = (InputData[5] - InputData[6]) << IN_SHIFT;
-
-    is0734 = is07 + is34;
-    is1256 = is12 + is56;
-
-    // Pre-Calculate some common product terms.
-    icommon_product1 = xC4S4 * (is12 - is56);
-    DOROUND(icommon_product1)
-    icommon_product1 >>= SHIFT_BITS;
-
-    icommon_product2 = xC4S4 * (id12 + id56);
-    DOROUND(icommon_product2)
-    icommon_product2 >>= SHIFT_BITS;
-
-
-    ip[0] = (xC4S4 * (is0734 + is1256));
-    DOROUND(ip[0]);
-    ip[0] >>= SHIFT_BITS;
-
-    ip[4] = (xC4S4 * (is0734 - is1256));
-    DOROUND(ip[4]);
-    ip[4] >>= SHIFT_BITS;
-
-    // Define inputs to rotation for outputs 2 and 6
-    irot_input_x = id12 - id56;
-    irot_input_y = is07 - is34;
-
-    // Apply rotation for outputs 2 and 6.
-    temp1 = xC6S2 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC2S6 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    ip[2] = temp1 + temp2;
-
-    temp1 = xC6S2 * irot_input_y;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC2S6 * irot_input_x;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    ip[6] = temp1 - temp2;
-
-    // Define inputs to rotation for outputs 1 and 7
-    irot_input_x = icommon_product1 + id07;
-    irot_input_y = -(id34 + icommon_product2);
-
-    // Apply rotation for outputs 1 and 7.
-    temp1 = xC1S7 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC7S1 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    ip[1] = temp1 - temp2;
-
-    temp1 = xC7S1 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC1S7 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    ip[7] = temp1 + temp2;
-
-    // Define inputs to rotation for outputs 3 and 5
-    irot_input_x = id07 - icommon_product1;
-    irot_input_y = id34 - icommon_product2;
-
-    // Apply rotation for outputs 3 and 5.
-    temp1 = xC3S5 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC5S3 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    ip[3] = temp1 - temp2;
-
-
-    temp1 = xC5S3 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC3S5 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    ip[5] = temp1 + temp2;
-
-    // Increment data pointer for next row
-    InputData += short_pitch;
-    ip += 8;
-  }
-
-  // Performed DCT on rows, now transform the columns
-  ip = InterData;
-  for (loop = 0; loop < 8; loop++) {
-    // Pre calculate some common sums and differences.
-    is07 = ip[0 * 8] + ip[7 * 8];
-    is12 = ip[1 * 8] + ip[2 * 8];
-    is34 = ip[3 * 8] + ip[4 * 8];
-    is56 = ip[5 * 8] + ip[6 * 8];
-
-    id07 = ip[0 * 8] - ip[7 * 8];
-    id12 = ip[1 * 8] - ip[2 * 8];
-    id34 = ip[3 * 8] - ip[4 * 8];
-    id56 = ip[5 * 8] - ip[6 * 8];
-
-    is0734 = is07 + is34;
-    is1256 = is12 + is56;
-
-    // Pre-Calculate some common product terms
-    icommon_product1 = xC4S4 * (is12 - is56);
-    icommon_product2 = xC4S4 * (id12 + id56);
-    DOROUND(icommon_product1)
-    DOROUND(icommon_product2)
-    icommon_product1 >>= SHIFT_BITS;
-    icommon_product2 >>= SHIFT_BITS;
-
-
-    temp1 = xC4S4 * (is0734 + is1256);
-    temp2 = xC4S4 * (is0734 - is1256);
-    DOROUND(temp1);
-    DOROUND(temp2);
-    temp1 >>= SHIFT_BITS;
-
-    temp2 >>= SHIFT_BITS;
-    op[0 * 8] = (temp1 + FINAL_ROUNDING) >> FINAL_SHIFT;
-    op[4 * 8] = (temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
-    // Define inputs to rotation for outputs 2 and 6
-    irot_input_x = id12 - id56;
-    irot_input_y = is07 - is34;
-
-    // Apply rotation for outputs 2 and 6.
-    temp1 = xC6S2 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC2S6 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    op[2 * 8] = (temp1 + temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
-    temp1 = xC6S2 * irot_input_y;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC2S6 * irot_input_x;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    op[6 * 8] = (temp1 - temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
-    // Define inputs to rotation for outputs 1 and 7
-    irot_input_x = icommon_product1 + id07;
-    irot_input_y = -(id34 + icommon_product2);
-
-    // Apply rotation for outputs 1 and 7.
-    temp1 = xC1S7 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC7S1 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    op[1 * 8] = (temp1 - temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
-    temp1 = xC7S1 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC1S7 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    op[7 * 8] = (temp1 + temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
-    // Define inputs to rotation for outputs 3 and 5
-    irot_input_x = id07 - icommon_product1;
-    irot_input_y = id34 - icommon_product2;
-
-    // Apply rotation for outputs 3 and 5.
-    temp1 = xC3S5 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC5S3 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    op[3 * 8] = (temp1 - temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
-
-    temp1 = xC5S3 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC3S5 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    op[5 * 8] = (temp1 + temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
-    // Increment data pointer for next column.
-    ip++;
-    op++;
-  }
+#include "vp9/common/vp9_idct.h"
+
+static void fdct4_1d(int16_t *input, int16_t *output) {
+  int16_t step[4];
+  int temp1, temp2;
+  // 4-point 1-D forward DCT, step 1: sums/differences of mirrored inputs.
+  step[0] = input[0] + input[3];
+  step[1] = input[1] + input[2];
+  step[2] = input[1] - input[2];
+  step[3] = input[0] - input[3];
+  // Step 2: step[0..1] give output[0]/[2]; step[2..3] give output[1]/[3].
+  temp1 = (step[0] + step[1]) * cospi_16_64;
+  temp2 = (step[0] - step[1]) * cospi_16_64;
+  output[0] = dct_const_round_shift(temp1);
+  output[2] = dct_const_round_shift(temp2);
+  temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
+  temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
+  output[1] = dct_const_round_shift(temp1);
+  output[3] = dct_const_round_shift(temp2);
 }
 
-void vp9_short_fhaar2x2_c(short *input, short *output, int pitch) {
-  /* [1 1; 1 -1] orthogonal transform */
-  /* use position: 0,1, 4, 8 */
-  int i;
-  short *ip1 = input;
-  short *op1 = output;
-  for (i = 0; i < 16; i++) {
-    op1[i] = 0;
+void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int pitch) {
+  int16_t out[4 * 4];
+  int16_t *outptr = &out[0];
+  const int short_pitch = pitch >> 1;
+  int i, j;
+  int16_t temp_in[4], temp_out[4];
+
+  // Columns
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j)
+      temp_in[j] = input[j * short_pitch + i] << 4;
+    if (i == 0 && temp_in[0])
+      temp_in[0] += 1;
+    fdct4_1d(temp_in, temp_out);
+    for (j = 0; j < 4; ++j)
+      outptr[j * 4 + i] = temp_out[j];
   }
 
-  op1[0] = (ip1[0] + ip1[1] + ip1[4] + ip1[8] + 1) >> 1;
-  op1[1] = (ip1[0] - ip1[1] + ip1[4] - ip1[8]) >> 1;
-  op1[4] = (ip1[0] + ip1[1] - ip1[4] - ip1[8]) >> 1;
-  op1[8] = (ip1[0] - ip1[1] - ip1[4] + ip1[8]) >> 1;
-}
-
-/* For test */
-#define TEST_INT 1
-#if TEST_INT
-#define vp9_fht_int_c vp9_fht_c
-#else
-#define vp9_fht_float_c vp9_fht_c
-#endif
-
-void vp9_fht_float_c(const int16_t *input, int pitch, int16_t *output,
-               TX_TYPE tx_type, int tx_dim) {
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-  {
-    int i, j, k;
-    float bufa[256], bufb[256];  // buffers are for floating-point test purpose
-                                 // the implementation could be simplified in
-                                 // conjunction with integer transform
-    const int16_t *ip = input;
-    int16_t *op = output;
-
-    float *pfa = &bufa[0];
-    float *pfb = &bufb[0];
-
-    // pointers to vertical and horizontal transforms
-    const float *ptv, *pth;
-
-    assert(tx_type != DCT_DCT);
-    // load and convert residual array into floating-point
-    for (j = 0; j < tx_dim; j++) {
-      for (i = 0; i < tx_dim; i++) {
-        pfa[i] = (float)ip[i];
-      }
-      pfa += tx_dim;
-      ip  += pitch / 2;
-    }
-
-    // vertical transformation
-    pfa = &bufa[0];
-    pfb = &bufb[0];
-
-    switch (tx_type) {
-      case ADST_ADST :
-      case ADST_DCT  :
-        ptv = (tx_dim == 4) ? &adst_4[0] :
-                              ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
-        break;
-
-      default :
-        ptv = (tx_dim == 4) ? &dct_4[0] :
-                              ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
-        break;
-    }
-
-    for (j = 0; j < tx_dim; j++) {
-      for (i = 0; i < tx_dim; i++) {
-        pfb[i] = 0;
-        for (k = 0; k < tx_dim; k++) {
-          pfb[i] += ptv[k] * pfa[(k * tx_dim)];
-        }
-        pfa += 1;
-      }
-      pfb += tx_dim;
-      ptv += tx_dim;
-      pfa = &bufa[0];
-    }
-
-    // horizontal transformation
-    pfa = &bufa[0];
-    pfb = &bufb[0];
-
-    switch (tx_type) {
-      case ADST_ADST :
-      case  DCT_ADST :
-        pth = (tx_dim == 4) ? &adst_4[0] :
-                              ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
-        break;
-
-      default :
-        pth = (tx_dim == 4) ? &dct_4[0] :
-                              ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
-        break;
-    }
-
-    for (j = 0; j < tx_dim; j++) {
-      for (i = 0; i < tx_dim; i++) {
-        pfa[i] = 0;
-        for (k = 0; k < tx_dim; k++) {
-          pfa[i] += pfb[k] * pth[k];
-        }
-        pth += tx_dim;
-      }
-
-      pfa += tx_dim;
-      pfb += tx_dim;
-      // pth -= tx_dim * tx_dim;
-
-      switch (tx_type) {
-        case ADST_ADST :
-        case  DCT_ADST :
-          pth = (tx_dim == 4) ? &adst_4[0] :
-                                ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
-          break;
-
-        default :
-          pth = (tx_dim == 4) ? &dct_4[0] :
-                                ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
-          break;
-      }
-    }
-
-    // convert to short integer format and load BLOCKD buffer
-    op = output;
-    pfa = &bufa[0];
-
-    for (j = 0; j < tx_dim; j++) {
-      for (i = 0; i < tx_dim; i++) {
-        op[i] = (pfa[i] > 0 ) ? (int16_t)( 8 * pfa[i] + 0.49) :
-                                     -(int16_t)(- 8 * pfa[i] + 0.49);
-      }
-      op  += tx_dim;
-      pfa += tx_dim;
-    }
+  // Rows
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j)
+      temp_in[j] = out[j + i * 4];
+    fdct4_1d(temp_in, temp_out);
+    for (j = 0; j < 4; ++j)
+        output[j + i * 4] = (temp_out[j] + 1) >> 2;
   }
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
 }
 
-/* Converted the transforms to integer form. */
-#define VERTICAL_SHIFT 11
-#define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1)
-#define HORIZONTAL_SHIFT 16
-#define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1)
-void vp9_fht_int_c(const int16_t *input, int pitch, int16_t *output,
-                   TX_TYPE tx_type, int tx_dim) {
-  int i, j, k;
-  int16_t imbuf[256];
-
-  const int16_t *ip = input;
-  int16_t *op = output;
-  int16_t *im = &imbuf[0];
-
-  /* pointers to vertical and horizontal transforms. */
-  const int16_t *ptv = NULL, *pth = NULL;
-
-  switch (tx_type) {
-    case ADST_ADST :
-      ptv = pth = (tx_dim == 4) ? &adst_i4[0]
-                                  : ((tx_dim == 8) ? &adst_i8[0]
-                                                     : &adst_i16[0]);
-      break;
-    case ADST_DCT  :
-      ptv = (tx_dim == 4) ? &adst_i4[0]
-                            : ((tx_dim == 8) ? &adst_i8[0] : &adst_i16[0]);
-      pth = (tx_dim == 4) ? &dct_i4[0]
-                            : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);
-      break;
-    case  DCT_ADST :
-      ptv = (tx_dim == 4) ? &dct_i4[0]
-                            : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);
-      pth = (tx_dim == 4) ? &adst_i4[0]
-                            : ((tx_dim == 8) ? &adst_i8[0] : &adst_i16[0]);
-      break;
-    case  DCT_DCT :
-      ptv = pth = (tx_dim == 4) ? &dct_i4[0]
-                                  : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);
-      break;
-    default:
-      assert(0);
-      break;
-  }
-
-  /* vertical transformation */
-  for (j = 0; j < tx_dim; j++) {
-    for (i = 0; i < tx_dim; i++) {
-      int temp = 0;
+static void fadst4_1d(int16_t *input, int16_t *output) {
+  int x0, x1, x2, x3;
+  int s0, s1, s2, s3, s4, s5, s6, s7;
 
-      for (k = 0; k < tx_dim; k++) {
-        temp += ptv[k] * ip[(k * (pitch >> 1))];
-      }
+  x0 = input[0];
+  x1 = input[1];
+  x2 = input[2];
+  x3 = input[3];
 
-      im[i] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT);
-      ip++;
-    }
-    im += tx_dim;  // 16
-    ptv += tx_dim;
-    ip = input;
+  if (!(x0 | x1 | x2 | x3)) {
+    output[0] = output[1] = output[2] = output[3] = 0;
+    return;
   }
 
-  /* horizontal transformation */
-  im = &imbuf[0];
-
-  for (j = 0; j < tx_dim; j++) {
-    const int16_t *pthc = pth;
-
-    for (i = 0; i < tx_dim; i++) {
-      int temp = 0;
-
-      for (k = 0; k < tx_dim; k++) {
-        temp += im[k] * pthc[k];
-      }
-
-      op[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT);
-      pthc += tx_dim;
-    }
-
-    im += tx_dim;  // 16
-    op += tx_dim;
-  }
+  s0 = sinpi_1_9 * x0;
+  s1 = sinpi_4_9 * x0;
+  s2 = sinpi_2_9 * x1;
+  s3 = sinpi_1_9 * x1;
+  s4 = sinpi_3_9 * x2;
+  s5 = sinpi_4_9 * x3;
+  s6 = sinpi_2_9 * x3;
+  s7 = x0 + x1 - x3;
+
+  x0 = s0 + s2 + s5;
+  x1 = sinpi_3_9 * s7;
+  x2 = s1 - s3 + s6;
+  x3 = s4;
+
+  s0 = x0 + x3;
+  s1 = x1;
+  s2 = x2 - x3;
+  s3 = x2 - x0 + x3;
+
+  // 1-D transform scaling factor is sqrt(2).
+  output[0] = dct_const_round_shift(s0);
+  output[1] = dct_const_round_shift(s1);
+  output[2] = dct_const_round_shift(s2);
+  output[3] = dct_const_round_shift(s3);
 }
 
-void vp9_short_fdct4x4_c(short *input, short *output, int pitch) {
-  int i;
-  int a1, b1, c1, d1;
-  short *ip = input;
-  short *op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ((ip[0] + ip[3]) << 5);
-    b1 = ((ip[1] + ip[2]) << 5);
-    c1 = ((ip[1] - ip[2]) << 5);
-    d1 = ((ip[0] - ip[3]) << 5);
-
-    op[0] = a1 + b1;
-    op[2] = a1 - b1;
-
-    op[1] = (c1 * 2217 + d1 * 5352 +  14500) >> 12;
-    op[3] = (d1 * 2217 - c1 * 5352 +   7500) >> 12;
-
-    ip += pitch / 2;
-    op += 4;
+static const transform_2d FHT_4[] = {
+  { fdct4_1d,  fdct4_1d  },  // DCT_DCT  = 0
+  { fadst4_1d, fdct4_1d  },  // ADST_DCT = 1
+  { fdct4_1d,  fadst4_1d },  // DCT_ADST = 2
+  { fadst4_1d, fadst4_1d }   // ADST_ADST = 3
+};
 
+void vp9_short_fht4x4_c(int16_t *input, int16_t *output,
+                        int pitch, TX_TYPE tx_type) {
+  int16_t out[4 * 4];
+  int16_t *outptr = &out[0];
+  int i, j;
+  int16_t temp_in[4], temp_out[4];
+  const transform_2d ht = FHT_4[tx_type];
+
+  // Columns
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j)
+      temp_in[j] = input[j * pitch + i] << 4;
+    if (i == 0 && temp_in[0])
+      temp_in[0] += 1;
+    ht.cols(temp_in, temp_out);
+    for (j = 0; j < 4; ++j)
+      outptr[j * 4 + i] = temp_out[j];
   }
-  ip = output;
-  op = output;
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] + ip[12];
-    b1 = ip[4] + ip[8];
-    c1 = ip[4] - ip[8];
-    d1 = ip[0] - ip[12];
-
-    op[0]  = (a1 + b1 + 7) >> 4;
-    op[8]  = (a1 - b1 + 7) >> 4;
 
-    op[4]  = ((c1 * 2217 + d1 * 5352 +  12000) >> 16) + (d1 != 0);
-    op[12] = (d1 * 2217 - c1 * 5352 +  51000) >> 16;
-
-    ip++;
-    op++;
+  // Rows
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j)
+      temp_in[j] = out[j + i * 4];
+    ht.rows(temp_in, temp_out);
+    for (j = 0; j < 4; ++j)
+      output[j + i * 4] = (temp_out[j] + 1) >> 2;
   }
 }
 
-void vp9_short_fdct8x4_c(short *input, short *output, int pitch)
-{
-    vp9_short_fdct4x4_c(input,   output,    pitch);
+void vp9_short_fdct8x4_c(int16_t *input, int16_t *output, int pitch) {
+    vp9_short_fdct4x4_c(input, output, pitch);
     vp9_short_fdct4x4_c(input + 4, output + 16, pitch);
 }
 
-void vp9_short_walsh4x4_c(short *input, short *output, int pitch) {
-  int i;
-  int a1, b1, c1, d1;
-  short *ip = input;
-  short *op = output;
-  int pitch_short = pitch >> 1;
+static void fdct8_1d(int16_t *input, int16_t *output) {
+  /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
+  /*needs32*/ int t0, t1, t2, t3;
+  /*canbe16*/ int x0, x1, x2, x3;
+
+  // Stage 1
+  s0 = input[0] + input[7];
+  s1 = input[1] + input[6];
+  s2 = input[2] + input[5];
+  s3 = input[3] + input[4];
+  s4 = input[3] - input[4];
+  s5 = input[2] - input[5];
+  s6 = input[1] - input[6];
+  s7 = input[0] - input[7];
+
+  // 4-point DCT of s0..s3 (cf. fdct4_1d) -> even-indexed outputs.
+  x0 = s0 + s3;
+  x1 = s1 + s2;
+  x2 = s1 - s2;
+  x3 = s0 - s3;
+  t0 = (x0 + x1) * cospi_16_64;
+  t1 = (x0 - x1) * cospi_16_64;
+  t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
+  t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
+  output[0] = dct_const_round_shift(t0);
+  output[2] = dct_const_round_shift(t2);
+  output[4] = dct_const_round_shift(t1);
+  output[6] = dct_const_round_shift(t3);
 
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0 * pitch_short] + ip[3 * pitch_short];
-    b1 = ip[1 * pitch_short] + ip[2 * pitch_short];
-    c1 = ip[1 * pitch_short] - ip[2 * pitch_short];
-    d1 = ip[0 * pitch_short] - ip[3 * pitch_short];
+  // Stage 2
+  t0 = (s6 - s5) * cospi_16_64;
+  t1 = (s6 + s5) * cospi_16_64;
+  t2 = dct_const_round_shift(t0);
+  t3 = dct_const_round_shift(t1);
 
-    op[0] = (a1 + b1 + 1) >> 1;
-    op[4] = (c1 + d1) >> 1;
-    op[8] = (a1 - b1) >> 1;
-    op[12] = (d1 - c1) >> 1;
+  // Stage 3
+  x0 = s4 + t2;
+  x1 = s4 - t2;
+  x2 = s7 - t3;
+  x3 = s7 + t3;
 
-    ip++;
-    op++;
-  }
-  ip = output;
-  op = output;
+  // Stage 4
+  t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
+  t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
+  t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+  t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
+  output[1] = dct_const_round_shift(t0);
+  output[3] = dct_const_round_shift(t2);
+  output[5] = dct_const_round_shift(t1);
+  output[7] = dct_const_round_shift(t3);
+}
 
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] + ip[3];
-    b1 = ip[1] + ip[2];
-    c1 = ip[1] - ip[2];
-    d1 = ip[0] - ip[3];
+void vp9_short_fdct8x8_c(int16_t *input, int16_t *final_output, int pitch) {
+  const int stride = pitch >> 1;
+  int i, j;
+  int16_t intermediate[64];
 
-    op[0] = (a1 + b1 + 1) >> 1;
-    op[1] = (c1 + d1) >> 1;
-    op[2] = (a1 - b1) >> 1;
-    op[3] = (d1 - c1) >> 1;
+  // Transform columns
+  {
+    int16_t *output = intermediate;
+    /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
+    /*needs32*/ int t0, t1, t2, t3;
+    /*canbe16*/ int x0, x1, x2, x3;
+
+    int i;
+    for (i = 0; i < 8; i++) {
+      // Stage 1
+      s0 = (input[0 * stride] + input[7 * stride]) << 2;
+      s1 = (input[1 * stride] + input[6 * stride]) << 2;
+      s2 = (input[2 * stride] + input[5 * stride]) << 2;
+      s3 = (input[3 * stride] + input[4 * stride]) << 2;
+      s4 = (input[3 * stride] - input[4 * stride]) << 2;
+      s5 = (input[2 * stride] - input[5 * stride]) << 2;
+      s6 = (input[1 * stride] - input[6 * stride]) << 2;
+      s7 = (input[0 * stride] - input[7 * stride]) << 2;
+
+      // 4-point DCT of s0..s3 (cf. fdct4_1d) -> even-indexed outputs.
+      x0 = s0 + s3;
+      x1 = s1 + s2;
+      x2 = s1 - s2;
+      x3 = s0 - s3;
+      t0 = (x0 + x1) * cospi_16_64;
+      t1 = (x0 - x1) * cospi_16_64;
+      t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
+      t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
+      output[0 * 8] = dct_const_round_shift(t0);
+      output[2 * 8] = dct_const_round_shift(t2);
+      output[4 * 8] = dct_const_round_shift(t1);
+      output[6 * 8] = dct_const_round_shift(t3);
+
+      // Stage 2
+      t0 = (s6 - s5) * cospi_16_64;
+      t1 = (s6 + s5) * cospi_16_64;
+      t2 = dct_const_round_shift(t0);
+      t3 = dct_const_round_shift(t1);
+
+      // Stage 3
+      x0 = s4 + t2;
+      x1 = s4 - t2;
+      x2 = s7 - t3;
+      x3 = s7 + t3;
+
+      // Stage 4
+      t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
+      t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
+      t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+      t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
+      output[1 * 8] = dct_const_round_shift(t0);
+      output[3 * 8] = dct_const_round_shift(t2);
+      output[5 * 8] = dct_const_round_shift(t1);
+      output[7 * 8] = dct_const_round_shift(t3);
+      input++;
+      output++;
+    }
+  }
 
-    ip += 4;
-    op += 4;
+  // Rows
+  for (i = 0; i < 8; ++i) {
+    fdct8_1d(&intermediate[i * 8], &final_output[i * 8]);
+    for (j = 0; j < 8; ++j)
+      final_output[j + i * 8] /= 2;
   }
 }
 
-#if CONFIG_LOSSLESS
-void vp9_short_walsh4x4_lossless_c(short *input, short *output, int pitch) {
-  int i;
-  int a1, b1, c1, d1;
-  short *ip = input;
-  short *op = output;
-  int pitch_short = pitch >> 1;
-
-  for (i = 0; i < 4; i++) {
-    a1 = (ip[0 * pitch_short] + ip[3 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;
-    b1 = (ip[1 * pitch_short] + ip[2 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;
-    c1 = (ip[1 * pitch_short] - ip[2 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;
-    d1 = (ip[0 * pitch_short] - ip[3 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;
+static void fadst8_1d(int16_t *input, int16_t *output) {
+  int s0, s1, s2, s3, s4, s5, s6, s7;
+
+  int x0 = input[7];
+  int x1 = input[0];
+  int x2 = input[5];
+  int x3 = input[2];
+  int x4 = input[3];
+  int x5 = input[4];
+  int x6 = input[1];
+  int x7 = input[6];
+
+  // stage 1
+  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
+  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
+  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
+  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
+
+  x0 = dct_const_round_shift(s0 + s4);
+  x1 = dct_const_round_shift(s1 + s5);
+  x2 = dct_const_round_shift(s2 + s6);
+  x3 = dct_const_round_shift(s3 + s7);
+  x4 = dct_const_round_shift(s0 - s4);
+  x5 = dct_const_round_shift(s1 - s5);
+  x6 = dct_const_round_shift(s2 - s6);
+  x7 = dct_const_round_shift(s3 - s7);
+
+  // stage 2
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = cospi_8_64  * x4 + cospi_24_64 * x5;
+  s5 = cospi_24_64 * x4 - cospi_8_64  * x5;
+  s6 = - cospi_24_64 * x6 + cospi_8_64  * x7;
+  s7 =   cospi_8_64  * x6 + cospi_24_64 * x7;
+
+  x0 = s0 + s2;
+  x1 = s1 + s3;
+  x2 = s0 - s2;
+  x3 = s1 - s3;
+  x4 = dct_const_round_shift(s4 + s6);
+  x5 = dct_const_round_shift(s5 + s7);
+  x6 = dct_const_round_shift(s4 - s6);
+  x7 = dct_const_round_shift(s5 - s7);
+
+  // stage 3
+  s2 = cospi_16_64 * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (x6 - x7);
+
+  x2 = dct_const_round_shift(s2);
+  x3 = dct_const_round_shift(s3);
+  x6 = dct_const_round_shift(s6);
+  x7 = dct_const_round_shift(s7);
+
+  output[0] =   x0;
+  output[1] = - x4;
+  output[2] =   x6;
+  output[3] = - x2;
+  output[4] =   x3;
+  output[5] = - x7;
+  output[6] =   x5;
+  output[7] = - x1;
+}
 
-    op[0] = (a1 + b1 + 1) >> 1;
-    op[4] = (c1 + d1) >> 1;
-    op[8] = (a1 - b1) >> 1;
-    op[12] = (d1 - c1) >> 1;
+static const transform_2d FHT_8[] = {
+  { fdct8_1d,  fdct8_1d  },  // DCT_DCT  = 0
+  { fadst8_1d, fdct8_1d  },  // ADST_DCT = 1
+  { fdct8_1d,  fadst8_1d },  // DCT_ADST = 2
+  { fadst8_1d, fadst8_1d }   // ADST_ADST = 3
+};
 
-    ip++;
-    op++;
+void vp9_short_fht8x8_c(int16_t *input, int16_t *output,
+                        int pitch, TX_TYPE tx_type) {
+  int16_t out[64];
+  int16_t *outptr = &out[0];
+  int i, j;
+  int16_t temp_in[8], temp_out[8];
+  const transform_2d ht = FHT_8[tx_type];
+
+  // Columns
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      temp_in[j] = input[j * pitch + i] << 2;
+    ht.cols(temp_in, temp_out);
+    for (j = 0; j < 8; ++j)
+      outptr[j * 8 + i] = temp_out[j];
   }
-  ip = output;
-  op = output;
 
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] + ip[3];
-    b1 = ip[1] + ip[2];
-    c1 = ip[1] - ip[2];
-    d1 = ip[0] - ip[3];
-
-    op[0] = ((a1 + b1 + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
-    op[1] = ((c1 + d1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
-    op[2] = ((a1 - b1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
-    op[3] = ((d1 - c1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
-
-    ip += 4;
-    op += 4;
+  // Rows
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      temp_in[j] = out[j + i * 8];
+    ht.rows(temp_in, temp_out);
+    for (j = 0; j < 8; ++j)
+      output[j + i * 8] = temp_out[j] >> 1;
   }
 }
 
@@ -898,1491 +418,642 @@ void vp9_short_walsh8x4_x8_c(short *input, short *output, int pitch) {
   vp9_short_walsh4x4_x8_c(input,   output,    pitch);
   vp9_short_walsh4x4_x8_c(input + 4, output + 16, pitch);
 }
-#endif
-
-#define TEST_INT_16x16_DCT 1
-#if !TEST_INT_16x16_DCT
-
-static void dct16x16_1d(double input[16], double output[16]) {
-  static const double C1 = 0.995184726672197;
-  static const double C2 = 0.98078528040323;
-  static const double C3 = 0.956940335732209;
-  static const double C4 = 0.923879532511287;
-  static const double C5 = 0.881921264348355;
-  static const double C6 = 0.831469612302545;
-  static const double C7 = 0.773010453362737;
-  static const double C8 = 0.707106781186548;
-  static const double C9 = 0.634393284163646;
-  static const double C10 = 0.555570233019602;
-  static const double C11 = 0.471396736825998;
-  static const double C12 = 0.38268343236509;
-  static const double C13 = 0.290284677254462;
-  static const double C14 = 0.195090322016128;
-  static const double C15 = 0.098017140329561;
-
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-  {
-    double step[16];
-    double intermediate[16];
-    double temp1, temp2;
-
-    // step 1
-    step[ 0] = input[0] + input[15];
-    step[ 1] = input[1] + input[14];
-    step[ 2] = input[2] + input[13];
-    step[ 3] = input[3] + input[12];
-    step[ 4] = input[4] + input[11];
-    step[ 5] = input[5] + input[10];
-    step[ 6] = input[6] + input[ 9];
-    step[ 7] = input[7] + input[ 8];
-    step[ 8] = input[7] - input[ 8];
-    step[ 9] = input[6] - input[ 9];
-    step[10] = input[5] - input[10];
-    step[11] = input[4] - input[11];
-    step[12] = input[3] - input[12];
-    step[13] = input[2] - input[13];
-    step[14] = input[1] - input[14];
-    step[15] = input[0] - input[15];
-
-    // step 2
-    output[0] = step[0] + step[7];
-    output[1] = step[1] + step[6];
-    output[2] = step[2] + step[5];
-    output[3] = step[3] + step[4];
-    output[4] = step[3] - step[4];
-    output[5] = step[2] - step[5];
-    output[6] = step[1] - step[6];
-    output[7] = step[0] - step[7];
-
-    temp1 = step[ 8]*C7;
-    temp2 = step[15]*C9;
-    output[ 8] = temp1 + temp2;
-
-    temp1 = step[ 9]*C11;
-    temp2 = step[14]*C5;
-    output[ 9] = temp1 - temp2;
-
-    temp1 = step[10]*C3;
-    temp2 = step[13]*C13;
-    output[10] = temp1 + temp2;
-
-    temp1 = step[11]*C15;
-    temp2 = step[12]*C1;
-    output[11] = temp1 - temp2;
-
-    temp1 = step[11]*C1;
-    temp2 = step[12]*C15;
-    output[12] = temp2 + temp1;
-
-    temp1 = step[10]*C13;
-    temp2 = step[13]*C3;
-    output[13] = temp2 - temp1;
-
-    temp1 = step[ 9]*C5;
-    temp2 = step[14]*C11;
-    output[14] = temp2 + temp1;
-
-    temp1 = step[ 8]*C9;
-    temp2 = step[15]*C7;
-    output[15] = temp2 - temp1;
-
-    // step 3
-    step[ 0] = output[0] + output[3];
-    step[ 1] = output[1] + output[2];
-    step[ 2] = output[1] - output[2];
-    step[ 3] = output[0] - output[3];
-
-    temp1 = output[4]*C14;
-    temp2 = output[7]*C2;
-    step[ 4] = temp1 + temp2;
-
-    temp1 = output[5]*C10;
-    temp2 = output[6]*C6;
-    step[ 5] = temp1 + temp2;
-
-    temp1 = output[5]*C6;
-    temp2 = output[6]*C10;
-    step[ 6] = temp2 - temp1;
-
-    temp1 = output[4]*C2;
-    temp2 = output[7]*C14;
-    step[ 7] = temp2 - temp1;
-
-    step[ 8] = output[ 8] + output[11];
-    step[ 9] = output[ 9] + output[10];
-    step[10] = output[ 9] - output[10];
-    step[11] = output[ 8] - output[11];
-
-    step[12] = output[12] + output[15];
-    step[13] = output[13] + output[14];
-    step[14] = output[13] - output[14];
-    step[15] = output[12] - output[15];
-
-    // step 4
-    output[ 0] = (step[ 0] + step[ 1]);
-    output[ 8] = (step[ 0] - step[ 1]);
-
-    temp1 = step[2]*C12;
-    temp2 = step[3]*C4;
-    temp1 = temp1 + temp2;
-    output[ 4] = 2*(temp1*C8);
-
-    temp1 = step[2]*C4;
-    temp2 = step[3]*C12;
-    temp1 = temp2 - temp1;
-    output[12] = 2*(temp1*C8);
-
-    output[ 2] = 2*((step[4] + step[ 5])*C8);
-    output[14] = 2*((step[7] - step[ 6])*C8);
-
-    temp1 = step[4] - step[5];
-    temp2 = step[6] + step[7];
-    output[ 6] = (temp1 + temp2);
-    output[10] = (temp1 - temp2);
-
-    intermediate[8] = step[8] + step[14];
-    intermediate[9] = step[9] + step[15];
-
-    temp1 = intermediate[8]*C12;
-    temp2 = intermediate[9]*C4;
-    temp1 = temp1 - temp2;
-    output[3] = 2*(temp1*C8);
-
-    temp1 = intermediate[8]*C4;
-    temp2 = intermediate[9]*C12;
-    temp1 = temp2 + temp1;
-    output[13] = 2*(temp1*C8);
-
-    output[ 9] = 2*((step[10] + step[11])*C8);
-
-    intermediate[11] = step[10] - step[11];
-    intermediate[12] = step[12] + step[13];
-    intermediate[13] = step[12] - step[13];
-    intermediate[14] = step[ 8] - step[14];
-    intermediate[15] = step[ 9] - step[15];
-
-    output[15] = (intermediate[11] + intermediate[12]);
-    output[ 1] = -(intermediate[11] - intermediate[12]);
-
-    output[ 7] = 2*(intermediate[13]*C8);
-
-    temp1 = intermediate[14]*C12;
-    temp2 = intermediate[15]*C4;
-    temp1 = temp1 - temp2;
-    output[11] = -2*(temp1*C8);
-
-    temp1 = intermediate[14]*C4;
-    temp2 = intermediate[15]*C12;
-    temp1 = temp2 + temp1;
-    output[ 5] = 2*(temp1*C8);
-  }
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-}
 
-void vp9_short_fdct16x16_c(short *input, short *out, int pitch) {
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-  {
-    int shortpitch = pitch >> 1;
-    int i, j;
-    double output[256];
-    // First transform columns
-    for (i = 0; i < 16; i++) {
-        double temp_in[16], temp_out[16];
-        for (j = 0; j < 16; j++)
-            temp_in[j] = input[j*shortpitch + i];
-        dct16x16_1d(temp_in, temp_out);
-        for (j = 0; j < 16; j++)
-            output[j*16 + i] = temp_out[j];
-    }
-    // Then transform rows
-    for (i = 0; i < 16; ++i) {
-        double temp_in[16], temp_out[16];
-        for (j = 0; j < 16; ++j)
-            temp_in[j] = output[j + i*16];
-        dct16x16_1d(temp_in, temp_out);
-        for (j = 0; j < 16; ++j)
-            output[j + i*16] = temp_out[j];
-    }
-    // Scale by some magic number
-    for (i = 0; i < 256; i++)
-        out[i] = (short)round(output[i]/2);
-  }
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-}
 
-#else
-static const int16_t C1 = 16305;
-static const int16_t C2 = 16069;
-static const int16_t C3 = 15679;
-static const int16_t C4 = 15137;
-static const int16_t C5 = 14449;
-static const int16_t C6 = 13623;
-static const int16_t C7 = 12665;
-static const int16_t C8 = 11585;
-static const int16_t C9 = 10394;
-static const int16_t C10 = 9102;
-static const int16_t C11 = 7723;
-static const int16_t C12 = 6270;
-static const int16_t C13 = 4756;
-static const int16_t C14 = 3196;
-static const int16_t C15 = 1606;
-
-#define RIGHT_SHIFT 14
-#define ROUNDING (1 << (RIGHT_SHIFT - 1))
-
-static void dct16x16_1d(int16_t input[16], int16_t output[16],
-                        int last_shift_bits) {
-    int16_t step[16];
-    int intermediate[16];
-    int temp1, temp2;
-    int final_shift = RIGHT_SHIFT;
-    int final_rounding = ROUNDING;
-    int output_shift = 0;
-    int output_rounding = 0;
-
-    final_shift += last_shift_bits;
-    if (final_shift > 0)
-    final_rounding = 1 << (final_shift - 1);
-
-    output_shift += last_shift_bits;
-    if (output_shift > 0)
-      output_rounding = 1 << (output_shift - 1);
-
-    // step 1
-    step[ 0] = input[0] + input[15];
-    step[ 1] = input[1] + input[14];
-    step[ 2] = input[2] + input[13];
-    step[ 3] = input[3] + input[12];
-    step[ 4] = input[4] + input[11];
-    step[ 5] = input[5] + input[10];
-    step[ 6] = input[6] + input[ 9];
-    step[ 7] = input[7] + input[ 8];
-    step[ 8] = input[7] - input[ 8];
-    step[ 9] = input[6] - input[ 9];
-    step[10] = input[5] - input[10];
-    step[11] = input[4] - input[11];
-    step[12] = input[3] - input[12];
-    step[13] = input[2] - input[13];
-    step[14] = input[1] - input[14];
-    step[15] = input[0] - input[15];
-
-    // step 2
-    output[0] = step[0] + step[7];
-    output[1] = step[1] + step[6];
-    output[2] = step[2] + step[5];
-    output[3] = step[3] + step[4];
-    output[4] = step[3] - step[4];
-    output[5] = step[2] - step[5];
-    output[6] = step[1] - step[6];
-    output[7] = step[0] - step[7];
-
-    temp1 = step[ 8] * C7;
-    temp2 = step[15] * C9;
-    output[ 8] = (temp1 + temp2 + ROUNDING) >> RIGHT_SHIFT;
-
-    temp1 = step[ 9] * C11;
-    temp2 = step[14] * C5;
-    output[ 9] = (temp1 - temp2 + ROUNDING) >> RIGHT_SHIFT;
-
-    temp1 = step[10] * C3;
-    temp2 = step[13] * C13;
-    output[10] = (temp1 + temp2 + ROUNDING) >> RIGHT_SHIFT;
-
-    temp1 = step[11] * C15;
-    temp2 = step[12] * C1;
-    output[11] = (temp1 - temp2 + ROUNDING) >> RIGHT_SHIFT;
-
-    temp1 = step[11] * C1;
-    temp2 = step[12] * C15;
-    output[12] = (temp2 + temp1 + ROUNDING) >> RIGHT_SHIFT;
-
-    temp1 = step[10] * C13;
-    temp2 = step[13] * C3;
-    output[13] = (temp2 - temp1 + ROUNDING) >> RIGHT_SHIFT;
-
-    temp1 = step[ 9] * C5;
-    temp2 = step[14] * C11;
-    output[14] = (temp2 + temp1 + ROUNDING) >> RIGHT_SHIFT;
-
-    temp1 = step[ 8] * C9;
-    temp2 = step[15] * C7;
-    output[15] = (temp2 - temp1 + ROUNDING) >> RIGHT_SHIFT;
-
-    // step 3
-    step[ 0] = output[0] + output[3];
-    step[ 1] = output[1] + output[2];
-    step[ 2] = output[1] - output[2];
-    step[ 3] = output[0] - output[3];
-
-    temp1 = output[4] * C14;
-    temp2 = output[7] * C2;
-    step[ 4] = (temp1 + temp2 + ROUNDING) >> RIGHT_SHIFT;
-
-    temp1 = output[5] * C10;
-    temp2 = output[6] * C6;
-    step[ 5] = (temp1 + temp2 + ROUNDING) >> RIGHT_SHIFT;
-
-    temp1 = output[5] * C6;
-    temp2 = output[6] * C10;
-    step[ 6] = (temp2 - temp1 + ROUNDING) >> RIGHT_SHIFT;
-
-    temp1 = output[4] * C2;
-    temp2 = output[7] * C14;
-    step[ 7] = (temp2 - temp1 + ROUNDING) >> RIGHT_SHIFT;
-
-    step[ 8] = output[ 8] + output[11];
-    step[ 9] = output[ 9] + output[10];
-    step[10] = output[ 9] - output[10];
-    step[11] = output[ 8] - output[11];
-
-    step[12] = output[12] + output[15];
-    step[13] = output[13] + output[14];
-    step[14] = output[13] - output[14];
-    step[15] = output[12] - output[15];
-
-    // step 4
-    output[ 0] = (step[ 0] + step[ 1] + output_rounding) >> output_shift;
-    output[ 8] = (step[ 0] - step[ 1] + output_rounding) >> output_shift;
-
-    temp1 = step[2] * C12;
-    temp2 = step[3] * C4;
-    temp1 = (temp1 + temp2 + final_rounding) >> final_shift;
-    output[ 4] = (2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT;
-
-    temp1 = step[2] * C4;
-    temp2 = step[3] * C12;
-    temp1 = (temp2 - temp1 + final_rounding) >> final_shift;
-    output[12] = (2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT;
-
-    output[ 2] = (2 * ((step[4] + step[ 5]) * C8) + final_rounding)
-        >> final_shift;
-    output[14] = (2 * ((step[7] - step[ 6]) * C8) + final_rounding)
-        >> final_shift;
-
-    temp1 = step[4] - step[5];
-    temp2 = step[6] + step[7];
-    output[ 6] = (temp1 + temp2 + output_rounding) >> output_shift;
-    output[10] = (temp1 - temp2 + output_rounding) >> output_shift;
-
-    intermediate[8] = step[8] + step[14];
-    intermediate[9] = step[9] + step[15];
-
-    temp1 = intermediate[8] * C12;
-    temp2 = intermediate[9] * C4;
-    temp1 = (temp1 - temp2 + final_rounding) >> final_shift;
-    output[3] = (2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT;
-
-    temp1 = intermediate[8] * C4;
-    temp2 = intermediate[9] * C12;
-    temp1 = (temp2 + temp1 + final_rounding) >> final_shift;
-    output[13] = (2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT;
-
-    output[ 9] = (2 * ((step[10] + step[11]) * C8) + final_rounding)
-        >> final_shift;
-
-    intermediate[11] = step[10] - step[11];
-    intermediate[12] = step[12] + step[13];
-    intermediate[13] = step[12] - step[13];
-    intermediate[14] = step[ 8] - step[14];
-    intermediate[15] = step[ 9] - step[15];
-
-    output[15] = (intermediate[11] + intermediate[12] + output_rounding)
-        >> output_shift;
-    output[ 1] = -(intermediate[11] - intermediate[12] + output_rounding)
-        >> output_shift;
-
-    output[ 7] = (2 * (intermediate[13] * C8) + final_rounding) >> final_shift;
-
-    temp1 = intermediate[14] * C12;
-    temp2 = intermediate[15] * C4;
-    temp1 = (temp1 - temp2 + final_rounding) >> final_shift;
-    output[11] = (-2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT;
-
-    temp1 = intermediate[14] * C4;
-    temp2 = intermediate[15] * C12;
-    temp1 = (temp2 + temp1 + final_rounding) >> final_shift;
-    output[ 5] = (2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT;
+// Rewritten to use the same algorithm as the others.
+static void fdct16_1d(int16_t input[16], int16_t output[16]) {
+  int16_t step[16];
+  int temp1, temp2;
+
+  // step 1
+  step[ 0] = input[0] + input[15];
+  step[ 1] = input[1] + input[14];
+  step[ 2] = input[2] + input[13];
+  step[ 3] = input[3] + input[12];
+  step[ 4] = input[4] + input[11];
+  step[ 5] = input[5] + input[10];
+  step[ 6] = input[6] + input[ 9];
+  step[ 7] = input[7] + input[ 8];
+  step[ 8] = input[7] - input[ 8];
+  step[ 9] = input[6] - input[ 9];
+  step[10] = input[5] - input[10];
+  step[11] = input[4] - input[11];
+  step[12] = input[3] - input[12];
+  step[13] = input[2] - input[13];
+  step[14] = input[1] - input[14];
+  step[15] = input[0] - input[15];
+
+  fdct8_1d(step, step);
+
+  // step 2
+  output[8] = step[8];
+  output[9] = step[9];
+  temp1 = (-step[10] + step[13]) * cospi_16_64;
+  temp2 = (-step[11] + step[12]) * cospi_16_64;
+  output[10] = dct_const_round_shift(temp1);
+  output[11] = dct_const_round_shift(temp2);
+  temp1 = (step[11] + step[12]) * cospi_16_64;
+  temp2 = (step[10] + step[13]) * cospi_16_64;
+  output[12] = dct_const_round_shift(temp1);
+  output[13] = dct_const_round_shift(temp2);
+  output[14] = step[14];
+  output[15] = step[15];
+
+  // step 3
+  step[ 8] = output[8] + output[11];
+  step[ 9] = output[9] + output[10];
+  step[ 10] = output[9] - output[10];
+  step[ 11] = output[8] - output[11];
+  step[ 12] = -output[12] + output[15];
+  step[ 13] = -output[13] + output[14];
+  step[ 14] = output[13] + output[14];
+  step[ 15] = output[12] + output[15];
+
+  // step 4
+  output[8] = step[8];
+  temp1 = -step[9] * cospi_8_64 + step[14] * cospi_24_64;
+  temp2 = -step[10] * cospi_24_64 - step[13] * cospi_8_64;
+  output[9] = dct_const_round_shift(temp1);
+  output[10] = dct_const_round_shift(temp2);
+  output[11] = step[11];
+  output[12] = step[12];
+  temp1 = -step[10] * cospi_8_64 + step[13] * cospi_24_64;
+  temp2 = step[9] * cospi_24_64 + step[14] * cospi_8_64;
+  output[13] = dct_const_round_shift(temp1);
+  output[14] = dct_const_round_shift(temp2);
+  output[15] = step[15];
+
+  // step 5
+  step[8] = output[8] + output[9];
+  step[9] = output[8] - output[9];
+  step[10] = -output[10] + output[11];
+  step[11] = output[10] + output[11];
+  step[12] = output[12] + output[13];
+  step[13] = output[12] - output[13];
+  step[14] = -output[14] + output[15];
+  step[15] = output[14] + output[15];
+
+  // step 6
+  output[0] = step[0];
+  output[8] = step[4];
+  output[4] = step[2];
+  output[12] = step[6];
+  output[2] = step[1];
+  output[10] = step[5];
+  output[6] = step[3];
+  output[14] = step[7];
+
+  temp1 = step[8] * cospi_30_64 + step[15] * cospi_2_64;
+  temp2 = step[9] * cospi_14_64 + step[14] * cospi_18_64;
+  output[1] = dct_const_round_shift(temp1);
+  output[9] = dct_const_round_shift(temp2);
+
+  temp1 = step[10] * cospi_22_64 + step[13] * cospi_10_64;
+  temp2 = step[11] * cospi_6_64 + step[12] * cospi_26_64;
+  output[5] = dct_const_round_shift(temp1);
+  output[13] = dct_const_round_shift(temp2);
+
+  temp1 = -step[11] * cospi_26_64 + step[12] * cospi_6_64;
+  temp2 = -step[10] * cospi_10_64 + step[13] * cospi_22_64;
+  output[3] = dct_const_round_shift(temp1);
+  output[11] = dct_const_round_shift(temp2);
+
+  temp1 = -step[9] * cospi_18_64 + step[14] * cospi_14_64;
+  temp2 = -step[8] * cospi_2_64 + step[15] * cospi_30_64;
+  output[7] = dct_const_round_shift(temp1);
+  output[15] = dct_const_round_shift(temp2);
 }
 
 void vp9_short_fdct16x16_c(int16_t *input, int16_t *out, int pitch) {
-    int shortpitch = pitch >> 1;
-    int i, j;
-    int16_t output[256];
-    int16_t *outptr = &output[0];
-
-    // First transform columns
-    for (i = 0; i < 16; i++) {
-        int16_t temp_in[16];
-        int16_t temp_out[16];
-        for (j = 0; j < 16; j++)
-            temp_in[j] = input[j * shortpitch + i];
-        dct16x16_1d(temp_in, temp_out, 0);
-        for (j = 0; j < 16; j++)
-            output[j * 16 + i] = temp_out[j];
-    }
-
-    // Then transform rows
-    for (i = 0; i < 16; ++i) {
-        dct16x16_1d(outptr, out, 1);
-        outptr += 16;
-        out += 16;
-    }
-}
-#undef RIGHT_SHIFT
-#undef ROUNDING
-#endif
-
-#if !CONFIG_DWTDCTHYBRID
-static void dct32_1d(double *input, double *output, int stride) {
-  static const double C1 = 0.998795456205;  // cos(pi * 1 / 64)
-  static const double C2 = 0.995184726672;  // cos(pi * 2 / 64)
-  static const double C3 = 0.989176509965;  // cos(pi * 3 / 64)
-  static const double C4 = 0.980785280403;  // cos(pi * 4 / 64)
-  static const double C5 = 0.970031253195;  // cos(pi * 5 / 64)
-  static const double C6 = 0.956940335732;  // cos(pi * 6 / 64)
-  static const double C7 = 0.941544065183;  // cos(pi * 7 / 64)
-  static const double C8 = 0.923879532511;  // cos(pi * 8 / 64)
-  static const double C9 = 0.903989293123;  // cos(pi * 9 / 64)
-  static const double C10 = 0.881921264348;  // cos(pi * 10 / 64)
-  static const double C11 = 0.857728610000;  // cos(pi * 11 / 64)
-  static const double C12 = 0.831469612303;  // cos(pi * 12 / 64)
-  static const double C13 = 0.803207531481;  // cos(pi * 13 / 64)
-  static const double C14 = 0.773010453363;  // cos(pi * 14 / 64)
-  static const double C15 = 0.740951125355;  // cos(pi * 15 / 64)
-  static const double C16 = 0.707106781187;  // cos(pi * 16 / 64)
-  static const double C17 = 0.671558954847;  // cos(pi * 17 / 64)
-  static const double C18 = 0.634393284164;  // cos(pi * 18 / 64)
-  static const double C19 = 0.595699304492;  // cos(pi * 19 / 64)
-  static const double C20 = 0.555570233020;  // cos(pi * 20 / 64)
-  static const double C21 = 0.514102744193;  // cos(pi * 21 / 64)
-  static const double C22 = 0.471396736826;  // cos(pi * 22 / 64)
-  static const double C23 = 0.427555093430;  // cos(pi * 23 / 64)
-  static const double C24 = 0.382683432365;  // cos(pi * 24 / 64)
-  static const double C25 = 0.336889853392;  // cos(pi * 25 / 64)
-  static const double C26 = 0.290284677254;  // cos(pi * 26 / 64)
-  static const double C27 = 0.242980179903;  // cos(pi * 27 / 64)
-  static const double C28 = 0.195090322016;  // cos(pi * 28 / 64)
-  static const double C29 = 0.146730474455;  // cos(pi * 29 / 64)
-  static const double C30 = 0.098017140330;  // cos(pi * 30 / 64)
-  static const double C31 = 0.049067674327;  // cos(pi * 31 / 64)
-
-  double step[32];
-
-  // Stage 1
-  step[0] = input[stride*0] + input[stride*(32 - 1)];
-  step[1] = input[stride*1] + input[stride*(32 - 2)];
-  step[2] = input[stride*2] + input[stride*(32 - 3)];
-  step[3] = input[stride*3] + input[stride*(32 - 4)];
-  step[4] = input[stride*4] + input[stride*(32 - 5)];
-  step[5] = input[stride*5] + input[stride*(32 - 6)];
-  step[6] = input[stride*6] + input[stride*(32 - 7)];
-  step[7] = input[stride*7] + input[stride*(32 - 8)];
-  step[8] = input[stride*8] + input[stride*(32 - 9)];
-  step[9] = input[stride*9] + input[stride*(32 - 10)];
-  step[10] = input[stride*10] + input[stride*(32 - 11)];
-  step[11] = input[stride*11] + input[stride*(32 - 12)];
-  step[12] = input[stride*12] + input[stride*(32 - 13)];
-  step[13] = input[stride*13] + input[stride*(32 - 14)];
-  step[14] = input[stride*14] + input[stride*(32 - 15)];
-  step[15] = input[stride*15] + input[stride*(32 - 16)];
-  step[16] = -input[stride*16] + input[stride*(32 - 17)];
-  step[17] = -input[stride*17] + input[stride*(32 - 18)];
-  step[18] = -input[stride*18] + input[stride*(32 - 19)];
-  step[19] = -input[stride*19] + input[stride*(32 - 20)];
-  step[20] = -input[stride*20] + input[stride*(32 - 21)];
-  step[21] = -input[stride*21] + input[stride*(32 - 22)];
-  step[22] = -input[stride*22] + input[stride*(32 - 23)];
-  step[23] = -input[stride*23] + input[stride*(32 - 24)];
-  step[24] = -input[stride*24] + input[stride*(32 - 25)];
-  step[25] = -input[stride*25] + input[stride*(32 - 26)];
-  step[26] = -input[stride*26] + input[stride*(32 - 27)];
-  step[27] = -input[stride*27] + input[stride*(32 - 28)];
-  step[28] = -input[stride*28] + input[stride*(32 - 29)];
-  step[29] = -input[stride*29] + input[stride*(32 - 30)];
-  step[30] = -input[stride*30] + input[stride*(32 - 31)];
-  step[31] = -input[stride*31] + input[stride*(32 - 32)];
-
-  // Stage 2
-  output[stride*0] = step[0] + step[16 - 1];
-  output[stride*1] = step[1] + step[16 - 2];
-  output[stride*2] = step[2] + step[16 - 3];
-  output[stride*3] = step[3] + step[16 - 4];
-  output[stride*4] = step[4] + step[16 - 5];
-  output[stride*5] = step[5] + step[16 - 6];
-  output[stride*6] = step[6] + step[16 - 7];
-  output[stride*7] = step[7] + step[16 - 8];
-  output[stride*8] = -step[8] + step[16 - 9];
-  output[stride*9] = -step[9] + step[16 - 10];
-  output[stride*10] = -step[10] + step[16 - 11];
-  output[stride*11] = -step[11] + step[16 - 12];
-  output[stride*12] = -step[12] + step[16 - 13];
-  output[stride*13] = -step[13] + step[16 - 14];
-  output[stride*14] = -step[14] + step[16 - 15];
-  output[stride*15] = -step[15] + step[16 - 16];
-
-  output[stride*16] = step[16];
-  output[stride*17] = step[17];
-  output[stride*18] = step[18];
-  output[stride*19] = step[19];
-
-  output[stride*20] = (-step[20] + step[27])*C16;
-  output[stride*21] = (-step[21] + step[26])*C16;
-  output[stride*22] = (-step[22] + step[25])*C16;
-  output[stride*23] = (-step[23] + step[24])*C16;
-
-  output[stride*24] = (step[24] + step[23])*C16;
-  output[stride*25] = (step[25] + step[22])*C16;
-  output[stride*26] = (step[26] + step[21])*C16;
-  output[stride*27] = (step[27] + step[20])*C16;
-
-  output[stride*28] = step[28];
-  output[stride*29] = step[29];
-  output[stride*30] = step[30];
-  output[stride*31] = step[31];
-
-  // Stage 3
-  step[0] = output[stride*0] + output[stride*(8 - 1)];
-  step[1] = output[stride*1] + output[stride*(8 - 2)];
-  step[2] = output[stride*2] + output[stride*(8 - 3)];
-  step[3] = output[stride*3] + output[stride*(8 - 4)];
-  step[4] = -output[stride*4] + output[stride*(8 - 5)];
-  step[5] = -output[stride*5] + output[stride*(8 - 6)];
-  step[6] = -output[stride*6] + output[stride*(8 - 7)];
-  step[7] = -output[stride*7] + output[stride*(8 - 8)];
-  step[8] = output[stride*8];
-  step[9] = output[stride*9];
-  step[10] = (-output[stride*10] + output[stride*13])*C16;
-  step[11] = (-output[stride*11] + output[stride*12])*C16;
-  step[12] = (output[stride*12] + output[stride*11])*C16;
-  step[13] = (output[stride*13] + output[stride*10])*C16;
-  step[14] = output[stride*14];
-  step[15] = output[stride*15];
-
-  step[16] = output[stride*16] + output[stride*23];
-  step[17] = output[stride*17] + output[stride*22];
-  step[18] = output[stride*18] + output[stride*21];
-  step[19] = output[stride*19] + output[stride*20];
-  step[20] = -output[stride*20] + output[stride*19];
-  step[21] = -output[stride*21] + output[stride*18];
-  step[22] = -output[stride*22] + output[stride*17];
-  step[23] = -output[stride*23] + output[stride*16];
-  step[24] = -output[stride*24] + output[stride*31];
-  step[25] = -output[stride*25] + output[stride*30];
-  step[26] = -output[stride*26] + output[stride*29];
-  step[27] = -output[stride*27] + output[stride*28];
-  step[28] = output[stride*28] + output[stride*27];
-  step[29] = output[stride*29] + output[stride*26];
-  step[30] = output[stride*30] + output[stride*25];
-  step[31] = output[stride*31] + output[stride*24];
-
-  // Stage 4
-  output[stride*0] = step[0] + step[3];
-  output[stride*1] = step[1] + step[2];
-  output[stride*2] = -step[2] + step[1];
-  output[stride*3] = -step[3] + step[0];
-  output[stride*4] = step[4];
-  output[stride*5] = (-step[5] + step[6])*C16;
-  output[stride*6] = (step[6] + step[5])*C16;
-  output[stride*7] = step[7];
-  output[stride*8] = step[8] + step[11];
-  output[stride*9] = step[9] + step[10];
-  output[stride*10] = -step[10] + step[9];
-  output[stride*11] = -step[11] + step[8];
-  output[stride*12] = -step[12] + step[15];
-  output[stride*13] = -step[13] + step[14];
-  output[stride*14] = step[14] + step[13];
-  output[stride*15] = step[15] + step[12];
-
-  output[stride*16] = step[16];
-  output[stride*17] = step[17];
-  output[stride*18] = step[18]*-C8 + step[29]*C24;
-  output[stride*19] = step[19]*-C8 + step[28]*C24;
-  output[stride*20] = step[20]*-C24 + step[27]*-C8;
-  output[stride*21] = step[21]*-C24 + step[26]*-C8;
-  output[stride*22] = step[22];
-  output[stride*23] = step[23];
-  output[stride*24] = step[24];
-  output[stride*25] = step[25];
-  output[stride*26] = step[26]*C24 + step[21]*-C8;
-  output[stride*27] = step[27]*C24 + step[20]*-C8;
-  output[stride*28] = step[28]*C8 + step[19]*C24;
-  output[stride*29] = step[29]*C8 + step[18]*C24;
-  output[stride*30] = step[30];
-  output[stride*31] = step[31];
-
-  // Stage 5
-  step[0] = (output[stride*0] + output[stride*1]) * C16;
-  step[1] = (-output[stride*1] + output[stride*0]) * C16;
-  step[2] = output[stride*2]*C24 + output[stride*3] * C8;
-  step[3] = output[stride*3]*C24 - output[stride*2] * C8;
-  step[4] = output[stride*4] + output[stride*5];
-  step[5] = -output[stride*5] + output[stride*4];
-  step[6] = -output[stride*6] + output[stride*7];
-  step[7] = output[stride*7] + output[stride*6];
-  step[8] = output[stride*8];
-  step[9] = output[stride*9]*-C8 + output[stride*14]*C24;
-  step[10] = output[stride*10]*-C24 + output[stride*13]*-C8;
-  step[11] = output[stride*11];
-  step[12] = output[stride*12];
-  step[13] = output[stride*13]*C24 + output[stride*10]*-C8;
-  step[14] = output[stride*14]*C8 + output[stride*9]*C24;
-  step[15] = output[stride*15];
-
-  step[16] = output[stride*16] + output[stride*19];
-  step[17] = output[stride*17] + output[stride*18];
-  step[18] = -output[stride*18] + output[stride*17];
-  step[19] = -output[stride*19] + output[stride*16];
-  step[20] = -output[stride*20] + output[stride*23];
-  step[21] = -output[stride*21] + output[stride*22];
-  step[22] = output[stride*22] + output[stride*21];
-  step[23] = output[stride*23] + output[stride*20];
-  step[24] = output[stride*24] + output[stride*27];
-  step[25] = output[stride*25] + output[stride*26];
-  step[26] = -output[stride*26] + output[stride*25];
-  step[27] = -output[stride*27] + output[stride*24];
-  step[28] = -output[stride*28] + output[stride*31];
-  step[29] = -output[stride*29] + output[stride*30];
-  step[30] = output[stride*30] + output[stride*29];
-  step[31] = output[stride*31] + output[stride*28];
-
-  // Stage 6
-  output[stride*0] = step[0];
-  output[stride*1] = step[1];
-  output[stride*2] = step[2];
-  output[stride*3] = step[3];
-  output[stride*4] = step[4]*C28 + step[7]*C4;
-  output[stride*5] = step[5]*C12 + step[6]*C20;
-  output[stride*6] = step[6]*C12 + step[5]*-C20;
-  output[stride*7] = step[7]*C28 + step[4]*-C4;
-  output[stride*8] = step[8] + step[9];
-  output[stride*9] = -step[9] + step[8];
-  output[stride*10] = -step[10] + step[11];
-  output[stride*11] = step[11] + step[10];
-  output[stride*12] = step[12] + step[13];
-  output[stride*13] = -step[13] + step[12];
-  output[stride*14] = -step[14] + step[15];
-  output[stride*15] = step[15] + step[14];
-
-  output[stride*16] = step[16];
-  output[stride*17] = step[17]*-C4 + step[30]*C28;
-  output[stride*18] = step[18]*-C28 + step[29]*-C4;
-  output[stride*19] = step[19];
-  output[stride*20] = step[20];
-  output[stride*21] = step[21]*-C20 + step[26]*C12;
-  output[stride*22] = step[22]*-C12 + step[25]*-C20;
-  output[stride*23] = step[23];
-  output[stride*24] = step[24];
-  output[stride*25] = step[25]*C12 + step[22]*-C20;
-  output[stride*26] = step[26]*C20 + step[21]*C12;
-  output[stride*27] = step[27];
-  output[stride*28] = step[28];
-  output[stride*29] = step[29]*C28 + step[18]*-C4;
-  output[stride*30] = step[30]*C4 + step[17]*C28;
-  output[stride*31] = step[31];
-
-  // Stage 7
-  step[0] = output[stride*0];
-  step[1] = output[stride*1];
-  step[2] = output[stride*2];
-  step[3] = output[stride*3];
-  step[4] = output[stride*4];
-  step[5] = output[stride*5];
-  step[6] = output[stride*6];
-  step[7] = output[stride*7];
-  step[8] = output[stride*8]*C30 + output[stride*15]*C2;
-  step[9] = output[stride*9]*C14 + output[stride*14]*C18;
-  step[10] = output[stride*10]*C22 + output[stride*13]*C10;
-  step[11] = output[stride*11]*C6 + output[stride*12]*C26;
-  step[12] = output[stride*12]*C6 + output[stride*11]*-C26;
-  step[13] = output[stride*13]*C22 + output[stride*10]*-C10;
-  step[14] = output[stride*14]*C14 + output[stride*9]*-C18;
-  step[15] = output[stride*15]*C30 + output[stride*8]*-C2;
-
-  step[16] = output[stride*16] + output[stride*17];
-  step[17] = -output[stride*17] + output[stride*16];
-  step[18] = -output[stride*18] + output[stride*19];
-  step[19] = output[stride*19] + output[stride*18];
-  step[20] = output[stride*20] + output[stride*21];
-  step[21] = -output[stride*21] + output[stride*20];
-  step[22] = -output[stride*22] + output[stride*23];
-  step[23] = output[stride*23] + output[stride*22];
-  step[24] = output[stride*24] + output[stride*25];
-  step[25] = -output[stride*25] + output[stride*24];
-  step[26] = -output[stride*26] + output[stride*27];
-  step[27] = output[stride*27] + output[stride*26];
-  step[28] = output[stride*28] + output[stride*29];
-  step[29] = -output[stride*29] + output[stride*28];
-  step[30] = -output[stride*30] + output[stride*31];
-  step[31] = output[stride*31] + output[stride*30];
-
-  // Final stage --- outputs indices are bit-reversed.
-  output[stride*0] = step[0];
-  output[stride*16] = step[1];
-  output[stride*8] = step[2];
-  output[stride*24] = step[3];
-  output[stride*4] = step[4];
-  output[stride*20] = step[5];
-  output[stride*12] = step[6];
-  output[stride*28] = step[7];
-  output[stride*2] = step[8];
-  output[stride*18] = step[9];
-  output[stride*10] = step[10];
-  output[stride*26] = step[11];
-  output[stride*6] = step[12];
-  output[stride*22] = step[13];
-  output[stride*14] = step[14];
-  output[stride*30] = step[15];
-
-  output[stride*1] = step[16]*C31 + step[31]*C1;
-  output[stride*17] = step[17]*C15 + step[30]*C17;
-  output[stride*9] = step[18]*C23 + step[29]*C9;
-  output[stride*25] = step[19]*C7 + step[28]*C25;
-  output[stride*5] = step[20]*C27 + step[27]*C5;
-  output[stride*21] = step[21]*C11 + step[26]*C21;
-  output[stride*13] = step[22]*C19 + step[25]*C13;
-  output[stride*29] = step[23]*C3 + step[24]*C29;
-  output[stride*3] = step[24]*C3 + step[23]*-C29;
-  output[stride*19] = step[25]*C19 + step[22]*-C13;
-  output[stride*11] = step[26]*C11 + step[21]*-C21;
-  output[stride*27] = step[27]*C27 + step[20]*-C5;
-  output[stride*7] = step[28]*C7 + step[19]*-C25;
-  output[stride*23] = step[29]*C23 + step[18]*-C9;
-  output[stride*15] = step[30]*C15 + step[17]*-C17;
-  output[stride*31] = step[31]*C31 + step[16]*-C1;
-}
-
-void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) {
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-  {
-    int shortpitch = pitch >> 1;
-    int i, j;
-    double output[1024];
-    // First transform columns
-    for (i = 0; i < 32; i++) {
-      double temp_in[32], temp_out[32];
-      for (j = 0; j < 32; j++)
-        temp_in[j] = input[j*shortpitch + i];
-      dct32_1d(temp_in, temp_out, 1);
-      for (j = 0; j < 32; j++)
-        output[j*32 + i] = temp_out[j];
-    }
-    // Then transform rows
-    for (i = 0; i < 32; ++i) {
-      double temp_in[32], temp_out[32];
-      for (j = 0; j < 32; ++j)
-        temp_in[j] = output[j + i*32];
-      dct32_1d(temp_in, temp_out, 1);
-      for (j = 0; j < 32; ++j)
-        output[j + i*32] = temp_out[j];
-    }
-    // Scale by some magic number
-    for (i = 0; i < 1024; i++) {
-      out[i] = (short)round(output[i]/4);
-    }
-  }
-
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-}
-
-#else  // CONFIG_DWTDCTHYBRID
-
-#if DWT_TYPE == 53
-
-// Note: block length must be even for this implementation
-static void analysis_53_row(int length, short *x,
-                            short *lowpass, short *highpass) {
-  int n;
-  short r, *a, *b;
+  int shortpitch = pitch >> 1;  // input row stride in int16_t elements
+  int i, j;
+  int16_t output[256];  // 16x16 intermediate holding the column-pass result
+  int16_t temp_in[16], temp_out[16];  // one 16-sample column or row at a time
 
-  n = length >> 1;
-  b = highpass;
-  a = lowpass;
-  while (--n) {
-    *a++ = (r = *x++) << 1;
-    *b++ = *x - ((r + x[1] + 1) >> 1);
-    x++;
-  }
-  *a = (r = *x++) << 1;
-  *b = *x - r;
-
-  n = length >> 1;
-  b = highpass;
-  a = lowpass;
-  r = *highpass;
-  while (n--) {
-    *a++ += (r + (*b) + 1) >> 1;
-    r = *b++;
+  // First transform columns: samples are scaled by 4 on the way in
+  for (i = 0; i < 16; i++) {
+    for (j = 0; j < 16; j++)
+      temp_in[j] = input[j * shortpitch + i] * 4;  // not << 2: UB if negative
+    fdct16_1d(temp_in, temp_out);
+    for (j = 0; j < 16; j++)
+      output[j * 16 + i] = (temp_out[j] + 1) >> 2;  // undo scale, +1 bias
   }
-}
 
-static void analysis_53_col(int length, short *x,
-                            short *lowpass, short *highpass) {
-  int n;
-  short r, *a, *b;
-
-  n = length >> 1;
-  b = highpass;
-  a = lowpass;
-  while (--n) {
-    *a++ = (r = *x++);
-    *b++ = (((*x) << 1) - (r + x[1]) + 2) >> 2;
-    x++;
-  }
-  *a = (r = *x++);
-  *b = (*x - r + 1) >> 1;
-
-  n = length >> 1;
-  b = highpass;
-  a = lowpass;
-  r = *highpass;
-  while (n--) {
-    *a++ += (r + (*b) + 1) >> 1;
-    r = *b++;
+  // Then transform rows of the intermediate; results go to out unscaled
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = output[j + i * 16];
+    fdct16_1d(temp_in, temp_out);
+    for (j = 0; j < 16; ++j)
+      out[j + i * 16] = temp_out[j];  // out is written with a row stride of 16
   }
 }
 
-static void dyadic_analyze_53(int levels, int width, int height,
-                              short *x, int pitch_x, short *c, int pitch_c) {
-  int lv, i, j, nh, nw, hh = height, hw = width;
-  short buffer[2 * DWT_MAX_LENGTH];
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      c[i * pitch_c + j] = x[i * pitch_x + j] << DWT_PRECISION_BITS;
-    }
-  }
-  for (lv = 0; lv < levels; lv++) {
-    nh = hh;
-    hh = (hh + 1) >> 1;
-    nw = hw;
-    hw = (hw + 1) >> 1;
-    if ((nh < 2) || (nw < 2)) return;
-    for (i = 0; i < nh; i++) {
-      memcpy(buffer, &c[i * pitch_c], nw * sizeof(short));
-      analysis_53_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw);
-    }
-    for (j = 0; j < nw; j++) {
-      for (i = 0; i < nh; i++)
-        buffer[i + nh] = c[i * pitch_c + j];
-      analysis_53_col(nh, buffer + nh, buffer, buffer + hh);
-      for (i = 0; i < nh; i++)
-        c[i * pitch_c + j] = buffer[i];
-    }
-  }
+void fadst16_1d(int16_t *input, int16_t *output) {  // 16-point 1-D ADST
+  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
+
+  int x0 = input[15];  // inputs are loaded in a permuted order
+  int x1 = input[0];
+  int x2 = input[13];
+  int x3 = input[2];
+  int x4 = input[11];
+  int x5 = input[4];
+  int x6 = input[9];
+  int x7 = input[6];
+  int x8 = input[7];
+  int x9 = input[8];
+  int x10 = input[5];
+  int x11 = input[10];
+  int x12 = input[3];
+  int x13 = input[12];
+  int x14 = input[1];
+  int x15 = input[14];
+
+  // stage 1: rotate pairs (x0,x1)..(x14,x15); butterfly s[k] with s[k+8]
+  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
+  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
+  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
+  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
+  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
+
+  x0 = dct_const_round_shift(s0 + s8);
+  x1 = dct_const_round_shift(s1 + s9);
+  x2 = dct_const_round_shift(s2 + s10);
+  x3 = dct_const_round_shift(s3 + s11);
+  x4 = dct_const_round_shift(s4 + s12);
+  x5 = dct_const_round_shift(s5 + s13);
+  x6 = dct_const_round_shift(s6 + s14);
+  x7 = dct_const_round_shift(s7 + s15);
+  x8  = dct_const_round_shift(s0 - s8);
+  x9  = dct_const_round_shift(s1 - s9);
+  x10 = dct_const_round_shift(s2 - s10);
+  x11 = dct_const_round_shift(s3 - s11);
+  x12 = dct_const_round_shift(s4 - s12);
+  x13 = dct_const_round_shift(s5 - s13);
+  x14 = dct_const_round_shift(s6 - s14);
+  x15 = dct_const_round_shift(s7 - s15);
+
+  // stage 2: x0..x7 copied and butterflied unrounded; x8..x15 rotated, rounded
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4;
+  s5 = x5;
+  s6 = x6;
+  s7 = x7;
+  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
+  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
+  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
+  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
+  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
+  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
+  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
+  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
+
+  x0 = s0 + s4;
+  x1 = s1 + s5;
+  x2 = s2 + s6;
+  x3 = s3 + s7;
+  x4 = s0 - s4;
+  x5 = s1 - s5;
+  x6 = s2 - s6;
+  x7 = s3 - s7;
+  x8 = dct_const_round_shift(s8 + s12);
+  x9 = dct_const_round_shift(s9 + s13);
+  x10 = dct_const_round_shift(s10 + s14);
+  x11 = dct_const_round_shift(s11 + s15);
+  x12 = dct_const_round_shift(s8 - s12);
+  x13 = dct_const_round_shift(s9 - s13);
+  x14 = dct_const_round_shift(s10 - s14);
+  x15 = dct_const_round_shift(s11 - s15);
+
+  // stage 3: rotate x4..x7 and x12..x15 pairs; the rest butterfly unrounded
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
+  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
+  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
+  s8 = x8;
+  s9 = x9;
+  s10 = x10;
+  s11 = x11;
+  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
+  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
+  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
+
+  x0 = s0 + s2;
+  x1 = s1 + s3;
+  x2 = s0 - s2;
+  x3 = s1 - s3;
+  x4 = dct_const_round_shift(s4 + s6);
+  x5 = dct_const_round_shift(s5 + s7);
+  x6 = dct_const_round_shift(s4 - s6);
+  x7 = dct_const_round_shift(s5 - s7);
+  x8 = s8 + s10;
+  x9 = s9 + s11;
+  x10 = s8 - s10;
+  x11 = s9 - s11;
+  x12 = dct_const_round_shift(s12 + s14);
+  x13 = dct_const_round_shift(s13 + s15);
+  x14 = dct_const_round_shift(s12 - s14);
+  x15 = dct_const_round_shift(s13 - s15);
+
+  // stage 4: pairs (2,3) (6,7) (10,11) (14,15) combined, scaled by cospi_16_64
+  s2 = (- cospi_16_64) * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (- x6 + x7);
+  s10 = cospi_16_64 * (x10 + x11);
+  s11 = cospi_16_64 * (- x10 + x11);
+  s14 = (- cospi_16_64) * (x14 + x15);
+  s15 = cospi_16_64 * (x14 - x15);
+
+  x2 = dct_const_round_shift(s2);
+  x3 = dct_const_round_shift(s3);
+  x6 = dct_const_round_shift(s6);
+  x7 = dct_const_round_shift(s7);
+  x10 = dct_const_round_shift(s10);
+  x11 = dct_const_round_shift(s11);
+  x14 = dct_const_round_shift(s14);
+  x15 = dct_const_round_shift(s15);
+
+  output[0] = x0;  // outputs are a permutation of x0..x15, four negated
+  output[1] = - x8;
+  output[2] = x12;
+  output[3] = - x4;
+  output[4] = x6;
+  output[5] = x14;
+  output[6] = x10;
+  output[7] = x2;
+  output[8] = x3;
+  output[9] =  x11;
+  output[10] = x15;
+  output[11] = x7;
+  output[12] = x5;
+  output[13] = - x13;
+  output[14] = x9;
+  output[15] = - x1;
 }
 
-#elif DWT_TYPE == 26
-
-static void analysis_26_row(int length, short *x,
-                            short *lowpass, short *highpass) {
-  int i, n;
-  short r, s, *a, *b;
-  a = lowpass;
-  b = highpass;
-  for (i = length >> 1; i; i--) {
-    r = *x++;
-    s = *x++;
-    *a++ = r + s;
-    *b++ = r - s;
-  }
-  n = length >> 1;
-  if (n >= 4) {
-    a = lowpass;
-    b = highpass;
-    r = *lowpass;
-    while (--n) {
-      *b++ -= (r - a[1] + 4) >> 3;
-      r = *a++;
-    }
-    *b -= (r - *a + 4) >> 3;
-  }
-}
+static const transform_2d FHT_16[] = {  // indexed by tx_type
+  { fdct16_1d,  fdct16_1d  },  // DCT_DCT  = 0
+  { fadst16_1d, fdct16_1d  },  // ADST_DCT = 1
+  { fdct16_1d,  fadst16_1d },  // DCT_ADST = 2
+  { fadst16_1d, fadst16_1d }   // ADST_ADST = 3
+};  // NOTE(review): member order presumably { cols, rows } - confirm
 
-static void analysis_26_col(int length, short *x,
-                            short *lowpass, short *highpass) {
-  int i, n;
-  short r, s, *a, *b;
-  a = lowpass;
-  b = highpass;
-  for (i = length >> 1; i; i--) {
-    r = *x++;
-    s = *x++;
-    *a++ = (r + s + 1) >> 1;
-    *b++ = (r - s + 1) >> 1;
-  }
-  n = length >> 1;
-  if (n >= 4) {
-    a = lowpass;
-    b = highpass;
-    r = *lowpass;
-    while (--n) {
-      *b++ -= (r - a[1] + 4) >> 3;
-      r = *a++;
-    }
-    *b -= (r - *a + 4) >> 3;
-  }
-}
+void vp9_short_fht16x16_c(int16_t *input, int16_t *output,
+                          int pitch, TX_TYPE tx_type) {
+  int16_t out[256];  // 16x16 intermediate written by the column pass
+  int16_t *outptr = &out[0];  // alias of out used for the column-pass stores
+  int i, j;
+  int16_t temp_in[16], temp_out[16];  // one 16-sample column or row at a time
+  const transform_2d ht = FHT_16[tx_type];  // 1-D transform pair for tx_type
 
-static void dyadic_analyze_26(int levels, int width, int height,
-                              short *x, int pitch_x, short *c, int pitch_c) {
-  int lv, i, j, nh, nw, hh = height, hw = width;
-  short buffer[2 * DWT_MAX_LENGTH];
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      c[i * pitch_c + j] = x[i * pitch_x + j] << DWT_PRECISION_BITS;
-    }
-  }
-  for (lv = 0; lv < levels; lv++) {
-    nh = hh;
-    hh = (hh + 1) >> 1;
-    nw = hw;
-    hw = (hw + 1) >> 1;
-    if ((nh < 2) || (nw < 2)) return;
-    for (i = 0; i < nh; i++) {
-      memcpy(buffer, &c[i * pitch_c], nw * sizeof(short));
-      analysis_26_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw);
-    }
-    for (j = 0; j < nw; j++) {
-      for (i = 0; i < nh; i++)
-        buffer[i + nh] = c[i * pitch_c + j];
-      analysis_26_col(nh, buffer + nh, buffer, buffer + hh);
-      for (i = 0; i < nh; i++)
-        c[i * pitch_c + j] = buffer[i];
-    }
+  // Columns. NOTE(review): pitch is used as an int16_t stride - confirm.
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = input[j * pitch + i] * 4;  // not << 2: UB if negative
+    ht.cols(temp_in, temp_out);
+    for (j = 0; j < 16; ++j)
+      outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
   }
-}
 
-#elif DWT_TYPE == 97
-
-static void analysis_97(int length, double *x,
-                        double *lowpass, double *highpass) {
-  static const double a_predict1 = -1.586134342;
-  static const double a_update1 = -0.05298011854;
-  static const double a_predict2 = 0.8829110762;
-  static const double a_update2 = 0.4435068522;
-  static const double s_low = 1.149604398;
-  static const double s_high = 1/1.149604398;
-  int i;
-  double y[DWT_MAX_LENGTH];
-  // Predict 1
-  for (i = 1; i < length - 2; i += 2) {
-    x[i] += a_predict1 * (x[i - 1] + x[i + 1]);
-  }
-  x[length - 1] += 2 * a_predict1 * x[length - 2];
-  // Update 1
-  for (i = 2; i < length; i += 2) {
-    x[i] += a_update1 * (x[i - 1] + x[i + 1]);
-  }
-  x[0] += 2 * a_update1 * x[1];
-  // Predict 2
-  for (i = 1; i < length - 2; i += 2) {
-    x[i] += a_predict2 * (x[i - 1] + x[i + 1]);
-  }
-  x[length - 1] += 2 * a_predict2 * x[length - 2];
-  // Update 2
-  for (i = 2; i < length; i += 2) {
-    x[i] += a_update2 * (x[i - 1] + x[i + 1]);
-  }
-  x[0] += 2 * a_update2 * x[1];
-  memcpy(y, x, sizeof(*y) * length);
-  // Scale and pack
-  for (i = 0; i < length / 2; i++) {
-    lowpass[i] = y[2 * i] * s_low;
-    highpass[i] = y[2 * i + 1] * s_high;
+  // Rows: transform each row of the intermediate; results stored unscaled
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = out[j + i * 16];
+    ht.rows(temp_in, temp_out);
+    for (j = 0; j < 16; ++j)
+      output[j + i * 16] = temp_out[j];  // written with a row stride of 16
   }
 }
 
-static void dyadic_analyze_97(int levels, int width, int height,
-                             short *x, int pitch_x, short *c, int pitch_c) {
-  int lv, i, j, nh, nw, hh = height, hw = width;
-  double buffer[2 * DWT_MAX_LENGTH];
-  double y[DWT_MAX_LENGTH * DWT_MAX_LENGTH];
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      y[i * DWT_MAX_LENGTH + j] = x[i * pitch_x + j] << DWT_PRECISION_BITS;
-    }
-  }
-  for (lv = 0; lv < levels; lv++) {
-    nh = hh;
-    hh = (hh + 1) >> 1;
-    nw = hw;
-    hw = (hw + 1) >> 1;
-    if ((nh < 2) || (nw < 2)) return;
-    for (i = 0; i < nh; i++) {
-      memcpy(buffer, &y[i * DWT_MAX_LENGTH], nw * sizeof(*buffer));
-      analysis_97(nw, buffer, &y[i * DWT_MAX_LENGTH],
-                  &y[i * DWT_MAX_LENGTH] + hw);
-    }
-    for (j = 0; j < nw; j++) {
-      for (i = 0; i < nh; i++)
-        buffer[i + nh] = y[i * DWT_MAX_LENGTH + j];
-      analysis_97(nh, buffer + nh, buffer, buffer + hh);
-      for (i = 0; i < nh; i++)
-        c[i * pitch_c + j] = round(buffer[i]);
-    }
-  }
-}
 
-#endif  // DWT_TYPE
-
-// TODO(debargha): Implement the scaling differently so as not to have to
-// use the floating point dct
-static void dct16x16_1d_f(double input[16], double output[16]) {
-  static const double C1 = 0.995184726672197;
-  static const double C2 = 0.98078528040323;
-  static const double C3 = 0.956940335732209;
-  static const double C4 = 0.923879532511287;
-  static const double C5 = 0.881921264348355;
-  static const double C6 = 0.831469612302545;
-  static const double C7 = 0.773010453362737;
-  static const double C8 = 0.707106781186548;
-  static const double C9 = 0.634393284163646;
-  static const double C10 = 0.555570233019602;
-  static const double C11 = 0.471396736825998;
-  static const double C12 = 0.38268343236509;
-  static const double C13 = 0.290284677254462;
-  static const double C14 = 0.195090322016128;
-  static const double C15 = 0.098017140329561;
-
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-  {
-    double step[16];
-    double intermediate[16];
-    double temp1, temp2;
-
-    // step 1
-    step[ 0] = input[0] + input[15];
-    step[ 1] = input[1] + input[14];
-    step[ 2] = input[2] + input[13];
-    step[ 3] = input[3] + input[12];
-    step[ 4] = input[4] + input[11];
-    step[ 5] = input[5] + input[10];
-    step[ 6] = input[6] + input[ 9];
-    step[ 7] = input[7] + input[ 8];
-    step[ 8] = input[7] - input[ 8];
-    step[ 9] = input[6] - input[ 9];
-    step[10] = input[5] - input[10];
-    step[11] = input[4] - input[11];
-    step[12] = input[3] - input[12];
-    step[13] = input[2] - input[13];
-    step[14] = input[1] - input[14];
-    step[15] = input[0] - input[15];
-
-    // step 2
-    output[0] = step[0] + step[7];
-    output[1] = step[1] + step[6];
-    output[2] = step[2] + step[5];
-    output[3] = step[3] + step[4];
-    output[4] = step[3] - step[4];
-    output[5] = step[2] - step[5];
-    output[6] = step[1] - step[6];
-    output[7] = step[0] - step[7];
-
-    temp1 = step[ 8]*C7;
-    temp2 = step[15]*C9;
-    output[ 8] = temp1 + temp2;
-
-    temp1 = step[ 9]*C11;
-    temp2 = step[14]*C5;
-    output[ 9] = temp1 - temp2;
-
-    temp1 = step[10]*C3;
-    temp2 = step[13]*C13;
-    output[10] = temp1 + temp2;
-
-    temp1 = step[11]*C15;
-    temp2 = step[12]*C1;
-    output[11] = temp1 - temp2;
-
-    temp1 = step[11]*C1;
-    temp2 = step[12]*C15;
-    output[12] = temp2 + temp1;
-
-    temp1 = step[10]*C13;
-    temp2 = step[13]*C3;
-    output[13] = temp2 - temp1;
-
-    temp1 = step[ 9]*C5;
-    temp2 = step[14]*C11;
-    output[14] = temp2 + temp1;
-
-    temp1 = step[ 8]*C9;
-    temp2 = step[15]*C7;
-    output[15] = temp2 - temp1;
-
-    // step 3
-    step[ 0] = output[0] + output[3];
-    step[ 1] = output[1] + output[2];
-    step[ 2] = output[1] - output[2];
-    step[ 3] = output[0] - output[3];
-
-    temp1 = output[4]*C14;
-    temp2 = output[7]*C2;
-    step[ 4] = temp1 + temp2;
-
-    temp1 = output[5]*C10;
-    temp2 = output[6]*C6;
-    step[ 5] = temp1 + temp2;
-
-    temp1 = output[5]*C6;
-    temp2 = output[6]*C10;
-    step[ 6] = temp2 - temp1;
-
-    temp1 = output[4]*C2;
-    temp2 = output[7]*C14;
-    step[ 7] = temp2 - temp1;
-
-    step[ 8] = output[ 8] + output[11];
-    step[ 9] = output[ 9] + output[10];
-    step[10] = output[ 9] - output[10];
-    step[11] = output[ 8] - output[11];
-
-    step[12] = output[12] + output[15];
-    step[13] = output[13] + output[14];
-    step[14] = output[13] - output[14];
-    step[15] = output[12] - output[15];
-
-    // step 4
-    output[ 0] = (step[ 0] + step[ 1]);
-    output[ 8] = (step[ 0] - step[ 1]);
-
-    temp1 = step[2]*C12;
-    temp2 = step[3]*C4;
-    temp1 = temp1 + temp2;
-    output[ 4] = 2*(temp1*C8);
-
-    temp1 = step[2]*C4;
-    temp2 = step[3]*C12;
-    temp1 = temp2 - temp1;
-    output[12] = 2*(temp1*C8);
-
-    output[ 2] = 2*((step[4] + step[ 5])*C8);
-    output[14] = 2*((step[7] - step[ 6])*C8);
-
-    temp1 = step[4] - step[5];
-    temp2 = step[6] + step[7];
-    output[ 6] = (temp1 + temp2);
-    output[10] = (temp1 - temp2);
-
-    intermediate[8] = step[8] + step[14];
-    intermediate[9] = step[9] + step[15];
-
-    temp1 = intermediate[8]*C12;
-    temp2 = intermediate[9]*C4;
-    temp1 = temp1 - temp2;
-    output[3] = 2*(temp1*C8);
-
-    temp1 = intermediate[8]*C4;
-    temp2 = intermediate[9]*C12;
-    temp1 = temp2 + temp1;
-    output[13] = 2*(temp1*C8);
-
-    output[ 9] = 2*((step[10] + step[11])*C8);
-
-    intermediate[11] = step[10] - step[11];
-    intermediate[12] = step[12] + step[13];
-    intermediate[13] = step[12] - step[13];
-    intermediate[14] = step[ 8] - step[14];
-    intermediate[15] = step[ 9] - step[15];
-
-    output[15] = (intermediate[11] + intermediate[12]);
-    output[ 1] = -(intermediate[11] - intermediate[12]);
-
-    output[ 7] = 2*(intermediate[13]*C8);
-
-    temp1 = intermediate[14]*C12;
-    temp2 = intermediate[15]*C4;
-    temp1 = temp1 - temp2;
-    output[11] = -2*(temp1*C8);
-
-    temp1 = intermediate[14]*C4;
-    temp2 = intermediate[15]*C12;
-    temp1 = temp2 + temp1;
-    output[ 5] = 2*(temp1*C8);
-  }
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-}
+static void dct32_1d(int *input, int *output) {
+  int step[32];
+  // Stage 1
+  step[0] = input[0] + input[(32 - 1)];
+  step[1] = input[1] + input[(32 - 2)];
+  step[2] = input[2] + input[(32 - 3)];
+  step[3] = input[3] + input[(32 - 4)];
+  step[4] = input[4] + input[(32 - 5)];
+  step[5] = input[5] + input[(32 - 6)];
+  step[6] = input[6] + input[(32 - 7)];
+  step[7] = input[7] + input[(32 - 8)];
+  step[8] = input[8] + input[(32 - 9)];
+  step[9] = input[9] + input[(32 - 10)];
+  step[10] = input[10] + input[(32 - 11)];
+  step[11] = input[11] + input[(32 - 12)];
+  step[12] = input[12] + input[(32 - 13)];
+  step[13] = input[13] + input[(32 - 14)];
+  step[14] = input[14] + input[(32 - 15)];
+  step[15] = input[15] + input[(32 - 16)];
+  step[16] = -input[16] + input[(32 - 17)];
+  step[17] = -input[17] + input[(32 - 18)];
+  step[18] = -input[18] + input[(32 - 19)];
+  step[19] = -input[19] + input[(32 - 20)];
+  step[20] = -input[20] + input[(32 - 21)];
+  step[21] = -input[21] + input[(32 - 22)];
+  step[22] = -input[22] + input[(32 - 23)];
+  step[23] = -input[23] + input[(32 - 24)];
+  step[24] = -input[24] + input[(32 - 25)];
+  step[25] = -input[25] + input[(32 - 26)];
+  step[26] = -input[26] + input[(32 - 27)];
+  step[27] = -input[27] + input[(32 - 28)];
+  step[28] = -input[28] + input[(32 - 29)];
+  step[29] = -input[29] + input[(32 - 30)];
+  step[30] = -input[30] + input[(32 - 31)];
+  step[31] = -input[31] + input[(32 - 32)];
 
-static void vp9_short_fdct16x16_c_f(short *input, short *out, int pitch,
-                                    int scale) {
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-  {
-    int shortpitch = pitch >> 1;
-    int i, j;
-    double output[256];
-    // First transform columns
-    for (i = 0; i < 16; i++) {
-        double temp_in[16], temp_out[16];
-        for (j = 0; j < 16; j++)
-            temp_in[j] = input[j*shortpitch + i];
-        dct16x16_1d_f(temp_in, temp_out);
-        for (j = 0; j < 16; j++)
-            output[j*16 + i] = temp_out[j];
-    }
-    // Then transform rows
-    for (i = 0; i < 16; ++i) {
-        double temp_in[16], temp_out[16];
-        for (j = 0; j < 16; ++j)
-            temp_in[j] = output[j + i*16];
-        dct16x16_1d_f(temp_in, temp_out);
-        for (j = 0; j < 16; ++j)
-            output[j + i*16] = temp_out[j];
-    }
-    // Scale by some magic number
-    for (i = 0; i < 256; i++)
-        out[i] = (short)round(output[i] / (2 << scale));
-  }
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-}
+  // Stage 2
+  output[0] = step[0] + step[16 - 1];
+  output[1] = step[1] + step[16 - 2];
+  output[2] = step[2] + step[16 - 3];
+  output[3] = step[3] + step[16 - 4];
+  output[4] = step[4] + step[16 - 5];
+  output[5] = step[5] + step[16 - 6];
+  output[6] = step[6] + step[16 - 7];
+  output[7] = step[7] + step[16 - 8];
+  output[8] = -step[8] + step[16 - 9];
+  output[9] = -step[9] + step[16 - 10];
+  output[10] = -step[10] + step[16 - 11];
+  output[11] = -step[11] + step[16 - 12];
+  output[12] = -step[12] + step[16 - 13];
+  output[13] = -step[13] + step[16 - 14];
+  output[14] = -step[14] + step[16 - 15];
+  output[15] = -step[15] + step[16 - 16];
+
+  output[16] = step[16];
+  output[17] = step[17];
+  output[18] = step[18];
+  output[19] = step[19];
+
+  output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
+  output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
+  output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
+  output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
+
+  output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
+  output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
+  output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
+  output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
+
+  output[28] = step[28];
+  output[29] = step[29];
+  output[30] = step[30];
+  output[31] = step[31];
 
-void vp9_short_fdct8x8_c_f(short *block, short *coefs, int pitch, int scale) {
-  int j1, i, j, k;
-  float b[8];
-  float b1[8];
-  float d[8][8];
-  float f0 = (float) .7071068;
-  float f1 = (float) .4903926;
-  float f2 = (float) .4619398;
-  float f3 = (float) .4157348;
-  float f4 = (float) .3535534;
-  float f5 = (float) .2777851;
-  float f6 = (float) .1913417;
-  float f7 = (float) .0975452;
-  pitch = pitch / 2;
-  for (i = 0, k = 0; i < 8; i++, k += pitch) {
-    for (j = 0; j < 8; j++) {
-      b[j] = (float)(block[k + j] << (3 - scale));
-    }
-    /* Horizontal transform */
-    for (j = 0; j < 4; j++) {
-      j1 = 7 - j;
-      b1[j] = b[j] + b[j1];
-      b1[j1] = b[j] - b[j1];
-    }
-    b[0] = b1[0] + b1[3];
-    b[1] = b1[1] + b1[2];
-    b[2] = b1[1] - b1[2];
-    b[3] = b1[0] - b1[3];
-    b[4] = b1[4];
-    b[5] = (b1[6] - b1[5]) * f0;
-    b[6] = (b1[6] + b1[5]) * f0;
-    b[7] = b1[7];
-    d[i][0] = (b[0] + b[1]) * f4;
-    d[i][4] = (b[0] - b[1]) * f4;
-    d[i][2] = b[2] * f6 + b[3] * f2;
-    d[i][6] = b[3] * f6 - b[2] * f2;
-    b1[4] = b[4] + b[5];
-    b1[7] = b[7] + b[6];
-    b1[5] = b[4] - b[5];
-    b1[6] = b[7] - b[6];
-    d[i][1] = b1[4] * f7 + b1[7] * f1;
-    d[i][5] = b1[5] * f3 + b1[6] * f5;
-    d[i][7] = b1[7] * f7 - b1[4] * f1;
-    d[i][3] = b1[6] * f3 - b1[5] * f5;
-  }
-  /* Vertical transform */
-  for (i = 0; i < 8; i++) {
-    for (j = 0; j < 4; j++) {
-      j1 = 7 - j;
-      b1[j] = d[j][i] + d[j1][i];
-      b1[j1] = d[j][i] - d[j1][i];
-    }
-    b[0] = b1[0] + b1[3];
-    b[1] = b1[1] + b1[2];
-    b[2] = b1[1] - b1[2];
-    b[3] = b1[0] - b1[3];
-    b[4] = b1[4];
-    b[5] = (b1[6] - b1[5]) * f0;
-    b[6] = (b1[6] + b1[5]) * f0;
-    b[7] = b1[7];
-    d[0][i] = (b[0] + b[1]) * f4;
-    d[4][i] = (b[0] - b[1]) * f4;
-    d[2][i] = b[2] * f6 + b[3] * f2;
-    d[6][i] = b[3] * f6 - b[2] * f2;
-    b1[4] = b[4] + b[5];
-    b1[7] = b[7] + b[6];
-    b1[5] = b[4] - b[5];
-    b1[6] = b[7] - b[6];
-    d[1][i] = b1[4] * f7 + b1[7] * f1;
-    d[5][i] = b1[5] * f3 + b1[6] * f5;
-    d[7][i] = b1[7] * f7 - b1[4] * f1;
-    d[3][i] = b1[6] * f3 - b1[5] * f5;
-  }
-  for (i = 0; i < 8; i++) {
-    for (j = 0; j < 8; j++) {
-      *(coefs + j + i * 8) = (short) floor(d[i][j] + 0.5);
-    }
-  }
-  return;
-}
+  // Stage 3
+  step[0] = output[0] + output[(8 - 1)];
+  step[1] = output[1] + output[(8 - 2)];
+  step[2] = output[2] + output[(8 - 3)];
+  step[3] = output[3] + output[(8 - 4)];
+  step[4] = -output[4] + output[(8 - 5)];
+  step[5] = -output[5] + output[(8 - 6)];
+  step[6] = -output[6] + output[(8 - 7)];
+  step[7] = -output[7] + output[(8 - 8)];
+  step[8] = output[8];
+  step[9] = output[9];
+  step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
+  step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
+  step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
+  step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
+  step[14] = output[14];
+  step[15] = output[15];
+
+  step[16] = output[16] + output[23];
+  step[17] = output[17] + output[22];
+  step[18] = output[18] + output[21];
+  step[19] = output[19] + output[20];
+  step[20] = -output[20] + output[19];
+  step[21] = -output[21] + output[18];
+  step[22] = -output[22] + output[17];
+  step[23] = -output[23] + output[16];
+  step[24] = -output[24] + output[31];
+  step[25] = -output[25] + output[30];
+  step[26] = -output[26] + output[29];
+  step[27] = -output[27] + output[28];
+  step[28] = output[28] + output[27];
+  step[29] = output[29] + output[26];
+  step[30] = output[30] + output[25];
+  step[31] = output[31] + output[24];
 
-#define divide_bits(d, n) ((n) < 0 ? (d) << (n) : (d) >> (n))
+  // Stage 4
+  output[0] = step[0] + step[3];
+  output[1] = step[1] + step[2];
+  output[2] = -step[2] + step[1];
+  output[3] = -step[3] + step[0];
+  output[4] = step[4];
+  output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
+  output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
+  output[7] = step[7];
+  output[8] = step[8] + step[11];
+  output[9] = step[9] + step[10];
+  output[10] = -step[10] + step[9];
+  output[11] = -step[11] + step[8];
+  output[12] = -step[12] + step[15];
+  output[13] = -step[13] + step[14];
+  output[14] = step[14] + step[13];
+  output[15] = step[15] + step[12];
+
+  output[16] = step[16];
+  output[17] = step[17];
+  output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
+  output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
+  output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
+  output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
+  output[22] = step[22];
+  output[23] = step[23];
+  output[24] = step[24];
+  output[25] = step[25];
+  output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
+  output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
+  output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
+  output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
+  output[30] = step[30];
+  output[31] = step[31];
 
-#if DWTDCT_TYPE == DWTDCT16X16_LEAN
+  // Stage 5
+  step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
+  step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
+  step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
+  step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
+  step[4] = output[4] + output[5];
+  step[5] = -output[5] + output[4];
+  step[6] = -output[6] + output[7];
+  step[7] = output[7] + output[6];
+  step[8] = output[8];
+  step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
+  step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
+  step[11] = output[11];
+  step[12] = output[12];
+  step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
+  step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
+  step[15] = output[15];
+
+  step[16] = output[16] + output[19];
+  step[17] = output[17] + output[18];
+  step[18] = -output[18] + output[17];
+  step[19] = -output[19] + output[16];
+  step[20] = -output[20] + output[23];
+  step[21] = -output[21] + output[22];
+  step[22] = output[22] + output[21];
+  step[23] = output[23] + output[20];
+  step[24] = output[24] + output[27];
+  step[25] = output[25] + output[26];
+  step[26] = -output[26] + output[25];
+  step[27] = -output[27] + output[24];
+  step[28] = -output[28] + output[31];
+  step[29] = -output[29] + output[30];
+  step[30] = output[30] + output[29];
+  step[31] = output[31] + output[28];
 
-void vp9_short_fdct32x32_c(short *input, short *out, int pitch) {
-  // assume out is a 32x32 buffer
-  short buffer[16 * 16];
-  int i, j;
-  const int short_pitch = pitch >> 1;
-#if DWT_TYPE == 26
-  dyadic_analyze_26(1, 32, 32, input, short_pitch, out, 32);
-#elif DWT_TYPE == 97
-  dyadic_analyze_97(1, 32, 32, input, short_pitch, out, 32);
-#elif DWT_TYPE == 53
-  dyadic_analyze_53(1, 32, 32, input, short_pitch, out, 32);
-#endif
-  // TODO(debargha): Implement more efficiently by adding output pitch
-  // argument to the dct16x16 function
-  vp9_short_fdct16x16_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i)
-    vpx_memcpy(out + i * 32, buffer + i * 16, sizeof(short) * 16);
-  for (i = 0; i < 16; ++i) {
-    for (j = 16; j < 32; ++j) {
-      out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2);
-    }
-  }
-  for (i = 16; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) {
-      out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2);
-    }
-  }
-}
+  // Stage 6
+  output[0] = step[0];
+  output[1] = step[1];
+  output[2] = step[2];
+  output[3] = step[3];
+  output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
+  output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
+  output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
+  output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
+  output[8] = step[8] + step[9];
+  output[9] = -step[9] + step[8];
+  output[10] = -step[10] + step[11];
+  output[11] = step[11] + step[10];
+  output[12] = step[12] + step[13];
+  output[13] = -step[13] + step[12];
+  output[14] = -step[14] + step[15];
+  output[15] = step[15] + step[14];
+
+  output[16] = step[16];
+  output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
+  output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
+  output[19] = step[19];
+  output[20] = step[20];
+  output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
+  output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
+  output[23] = step[23];
+  output[24] = step[24];
+  output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
+  output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
+  output[27] = step[27];
+  output[28] = step[28];
+  output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
+  output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
+  output[31] = step[31];
 
-#elif DWTDCT_TYPE == DWTDCT16X16
+  // Stage 7
+  step[0] = output[0];
+  step[1] = output[1];
+  step[2] = output[2];
+  step[3] = output[3];
+  step[4] = output[4];
+  step[5] = output[5];
+  step[6] = output[6];
+  step[7] = output[7];
+  step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
+  step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
+  step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
+  step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
+  step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
+  step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
+  step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
+  step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
+
+  step[16] = output[16] + output[17];
+  step[17] = -output[17] + output[16];
+  step[18] = -output[18] + output[19];
+  step[19] = output[19] + output[18];
+  step[20] = output[20] + output[21];
+  step[21] = -output[21] + output[20];
+  step[22] = -output[22] + output[23];
+  step[23] = output[23] + output[22];
+  step[24] = output[24] + output[25];
+  step[25] = -output[25] + output[24];
+  step[26] = -output[26] + output[27];
+  step[27] = output[27] + output[26];
+  step[28] = output[28] + output[29];
+  step[29] = -output[29] + output[28];
+  step[30] = -output[30] + output[31];
+  step[31] = output[31] + output[30];
 
-void vp9_short_fdct32x32_c(short *input, short *out, int pitch) {
-  // assume out is a 32x32 buffer
-  short buffer[16 * 16];
-  int i, j;
-  const int short_pitch = pitch >> 1;
-#if DWT_TYPE == 26
-  dyadic_analyze_26(1, 32, 32, input, short_pitch, out, 32);
-#elif DWT_TYPE == 97
-  dyadic_analyze_97(1, 32, 32, input, short_pitch, out, 32);
-#elif DWT_TYPE == 53
-  dyadic_analyze_53(1, 32, 32, input, short_pitch, out, 32);
-#endif
-  // TODO(debargha): Implement more efficiently by adding output pitch
-  // argument to the dct16x16 function
-  vp9_short_fdct16x16_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i)
-    vpx_memcpy(out + i * 32, buffer + i * 16, sizeof(short) * 16);
-  vp9_short_fdct16x16_c_f(out + 16, buffer, 64, 1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i)
-    vpx_memcpy(out + i * 32 + 16, buffer + i * 16, sizeof(short) * 16);
-
-  vp9_short_fdct16x16_c_f(out + 32 * 16, buffer, 64, 1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i)
-    vpx_memcpy(out + i * 32 + 32 * 16, buffer + i * 16, sizeof(short) * 16);
-
-  vp9_short_fdct16x16_c_f(out + 33 * 16, buffer, 64, 1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i)
-    vpx_memcpy(out + i * 32 + 33 * 16, buffer + i * 16, sizeof(short) * 16);
+  // Final stage --- outputs indices are bit-reversed.
+  output[0]  = step[0];
+  output[16] = step[1];
+  output[8]  = step[2];
+  output[24] = step[3];
+  output[4]  = step[4];
+  output[20] = step[5];
+  output[12] = step[6];
+  output[28] = step[7];
+  output[2]  = step[8];
+  output[18] = step[9];
+  output[10] = step[10];
+  output[26] = step[11];
+  output[6]  = step[12];
+  output[22] = step[13];
+  output[14] = step[14];
+  output[30] = step[15];
+
+  output[1]  = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
+  output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
+  output[9]  = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
+  output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
+  output[5]  = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
+  output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
+  output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
+  output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
+  output[3]  = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
+  output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
+  output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
+  output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
+  output[7]  = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
+  output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
+  output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
+  output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
 }
 
-#elif DWTDCT_TYPE == DWTDCT8X8
-
-void vp9_short_fdct32x32_c(short *input, short *out, int pitch) {
-  // assume out is a 32x32 buffer
-  short buffer[8 * 8];
+void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) {
+  int shortpitch = pitch >> 1;
   int i, j;
-  const int short_pitch = pitch >> 1;
-#if DWT_TYPE == 26
-  dyadic_analyze_26(2, 32, 32, input, short_pitch, out, 32);
-#elif DWT_TYPE == 97
-  dyadic_analyze_97(2, 32, 32, input, short_pitch, out, 32);
-#elif DWT_TYPE == 53
-  dyadic_analyze_53(2, 32, 32, input, short_pitch, out, 32);
-#endif
-  // TODO(debargha): Implement more efficiently by adding output pitch
-  // argument to the dct16x16 function
-  vp9_short_fdct8x8_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 8; ++i)
-    vpx_memcpy(out + i * 32, buffer + i * 8, sizeof(short) * 8);
-
-  vp9_short_fdct8x8_c_f(out + 8, buffer, 64, 1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 8; ++i)
-    vpx_memcpy(out + i * 32 + 8, buffer + i * 8, sizeof(short) * 8);
-
-  vp9_short_fdct8x8_c_f(out + 32 * 8, buffer, 64, 1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 8; ++i)
-    vpx_memcpy(out + i * 32 + 32 * 8, buffer + i * 8, sizeof(short) * 8);
-
-  vp9_short_fdct8x8_c_f(out + 33 * 8, buffer, 64, 1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 8; ++i)
-    vpx_memcpy(out + i * 32 + 33 * 8, buffer + i * 8, sizeof(short) * 8);
-
-  for (i = 0; i < 16; ++i) {
-    for (j = 16; j < 32; ++j) {
-      out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2);
-    }
+  int output[32 * 32];
+
+  // Columns
+  for (i = 0; i < 32; i++) {
+    int temp_in[32], temp_out[32];
+    for (j = 0; j < 32; j++)
+      temp_in[j] = input[j * shortpitch + i] << 2;
+    dct32_1d(temp_in, temp_out);
+    for (j = 0; j < 32; j++)
+      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
   }
-  for (i = 16; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) {
-      out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2);
-    }
-  }
-}
 
-#endif
-
-#if CONFIG_TX64X64
-void vp9_short_fdct64x64_c(short *input, short *out, int pitch) {
-  // assume out is a 64x64 buffer
-  short buffer[16 * 16];
-  int i, j;
-  const int short_pitch = pitch >> 1;
-#if DWT_TYPE == 26
-  dyadic_analyze_26(2, 64, 64, input, short_pitch, out, 64);
-#elif DWT_TYPE == 97
-  dyadic_analyze_97(2, 64, 64, input, short_pitch, out, 64);
-#elif DWT_TYPE == 53
-  dyadic_analyze_53(2, 64, 64, input, short_pitch, out, 64);
-#endif
-  // TODO(debargha): Implement more efficiently by adding output pitch
-  // argument to the dct16x16 function
-  vp9_short_fdct16x16_c_f(out, buffer, 128, 2 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i)
-    vpx_memcpy(out + i * 64, buffer + i * 16, sizeof(short) * 16);
-
-#if DWTDCT_TYPE == DWTDCT16X16_LEAN
-  for (i = 0; i < 16; ++i) {
-    for (j = 16; j < 48; ++j) {
-      out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1);
-    }
-  }
-  for (i = 16; i < 64; ++i) {
-    for (j = 0; j < 64; ++j) {
-      out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1);
-    }
-  }
-#elif DWTDCT_TYPE == DWTDCT16X16
-  vp9_short_fdct16x16_c_f(out + 16, buffer, 128, 2 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i)
-    vpx_memcpy(out + i * 64 + 16, buffer + i * 16, sizeof(short) * 16);
-
-  vp9_short_fdct16x16_c_f(out + 64 * 16, buffer, 128, 2 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i)
-    vpx_memcpy(out + i * 64 + 64 * 16, buffer + i * 16, sizeof(short) * 16);
-
-  vp9_short_fdct16x16_c_f(out + 65 * 16, buffer, 128, 2 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i)
-    vpx_memcpy(out + i * 64 + 65 * 16, buffer + i * 16, sizeof(short) * 16);
-
-  // There is no dct used on the highest bands for now.
-  // Need to scale these coeffs by a factor of 2/2^DWT_PRECISION_BITS
-  // TODO(debargha): experiment with turning these coeffs to 0
+  // Rows
   for (i = 0; i < 32; ++i) {
-    for (j = 32; j < 64; ++j) {
-      out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1);
-    }
-  }
-  for (i = 32; i < 64; ++i) {
-    for (j = 0; j < 64; ++j) {
-      out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1);
-    }
+    int temp_in[32], temp_out[32];
+    for (j = 0; j < 32; ++j)
+      temp_in[j] = output[j + i * 32];
+    dct32_1d(temp_in, temp_out);
+    for (j = 0; j < 32; ++j)
+      out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
   }
-#endif  // DWTDCT_TYPE
 }
-#endif  // CONFIG_TX64X64
-#endif  // CONFIG_DWTDCTHYBRID
+
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 3f5133062..5271a597c 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -21,7 +21,6 @@
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/encoder/vp9_segmentation.h"
 #include "vp9/common/vp9_setupintrarecon.h"
-#include "vp9/common/vp9_reconintra4x4.h"
 #include "vp9/encoder/vp9_encodeintra.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_invtrans.h"
@@ -29,8 +28,9 @@
 #include "vp9/common/vp9_findnearmv.h"
 #include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_tile_common.h"
 #include "vp9/encoder/vp9_tokenize.h"
-#include "vp9_rtcd.h"
+#include "./vp9_rtcd.h"
 #include <stdio.h>
 #include <math.h>
 #include <limits.h>
@@ -45,18 +45,15 @@
 int enc_debug = 0;
 #endif
 
-extern void select_interp_filter_type(VP9_COMP *cpi);
+void vp9_select_interp_filter_type(VP9_COMP *cpi);
 
 static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
-                              int recon_yoffset, int recon_uvoffset,
                               int output_enabled, int mb_row, int mb_col);
 
 static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t,
-                                int recon_yoffset, int recon_uvoffset,
                                 int output_enabled, int mb_row, int mb_col);
 
 static void encode_superblock64(VP9_COMP *cpi, TOKENEXTRA **t,
-                                int recon_yoffset, int recon_uvoffset,
                                 int output_enabled, int mb_row, int mb_col);
 
 static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x);
@@ -103,7 +100,7 @@ static unsigned int tt_activity_measure(VP9_COMP *cpi, MACROBLOCK *x) {
    */
   act = vp9_variance16x16(x->src.y_buffer, x->src.y_stride, VP9_VAR_OFFS, 0,
                           &sse);
-  act = act << 4;
+  act <<= 4;
 
   /* If the region is flat, lower the activity some more. */
   if (act < 8 << 12)
@@ -488,8 +485,7 @@ static void update_state(VP9_COMP *cpi,
 
   {
     int segment_id = mbmi->segment_id;
-    if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
-        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB)) {
+    if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
       for (i = 0; i < NB_TXFM_MODES; i++) {
         cpi->rd_tx_select_diff[i] += ctx->txfm_rd_diff[i];
       }
@@ -625,27 +621,19 @@ static unsigned find_seg_id(uint8_t *buf, int block_size,
 }
 
 static void set_offsets(VP9_COMP *cpi,
-                        int mb_row, int mb_col, int block_size,
-                        int *ref_yoffset, int *ref_uvoffset) {
+                        int mb_row, int mb_col, int block_size) {
   MACROBLOCK *const x = &cpi->mb;
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi;
   const int dst_fb_idx = cm->new_fb_idx;
-  const int recon_y_stride = cm->yv12_fb[dst_fb_idx].y_stride;
-  const int recon_uv_stride = cm->yv12_fb[dst_fb_idx].uv_stride;
-  const int recon_yoffset = 16 * mb_row * recon_y_stride + 16 * mb_col;
-  const int recon_uvoffset = 8 * mb_row * recon_uv_stride + 8 * mb_col;
-  const int src_y_stride = x->src.y_stride;
-  const int src_uv_stride = x->src.uv_stride;
-  const int src_yoffset = 16 * mb_row * src_y_stride + 16 * mb_col;
-  const int src_uvoffset = 8 * mb_row * src_uv_stride + 8 * mb_col;
-  const int ref_fb_idx = cm->lst_fb_idx;
-  const int ref_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
-  const int ref_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
   const int idx_map = mb_row * cm->mb_cols + mb_col;
   const int idx_str = xd->mode_info_stride * mb_row + mb_col;
 
+#ifdef ENC_DEBUG
+  enc_debug = (cpi->common.current_video_frame == 2 &&
+               mb_row == 4 && mb_col == 5);
+#endif
   // entropy context structures
   xd->above_context = cm->above_context + mb_col;
   xd->left_context  = cm->left_context + (mb_row & 3);
@@ -664,9 +652,9 @@ static void set_offsets(VP9_COMP *cpi,
   xd->prev_mode_info_context = cm->prev_mi + idx_str;
 
   // Set up destination pointers
-  xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
-  xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
-  xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+  setup_pred_block(&xd->dst,
+                   &cm->yv12_fb[dst_fb_idx],
+                   mb_row, mb_col, NULL, NULL);
 
   /* Set up limit values for MV components to prevent them from
    * extending beyond the UMV borders assuming 16x16 block size */
@@ -686,17 +674,12 @@ static void set_offsets(VP9_COMP *cpi,
   xd->mb_to_right_edge  = ((cm->mb_cols - block_size - mb_col) * 16) << 3;
 
   // Are edges available for intra prediction?
-  xd->up_available   = (mb_row != 0);
-  xd->left_available = (mb_col != 0);
-
-  /* Reference buffer offsets */
-  *ref_yoffset  = (mb_row * ref_y_stride * 16) + (mb_col * 16);
-  *ref_uvoffset = (mb_row * ref_uv_stride * 8) + (mb_col *  8);
+  xd->up_available    = (mb_row != 0);
+  xd->left_available  = (mb_col > cm->cur_tile_mb_col_start);
+  xd->right_available = (mb_col + block_size < cm->cur_tile_mb_col_end);
 
   /* set up source buffers */
-  x->src.y_buffer = cpi->Source->y_buffer + src_yoffset;
-  x->src.u_buffer = cpi->Source->u_buffer + src_uvoffset;
-  x->src.v_buffer = cpi->Source->v_buffer + src_uvoffset;
+  setup_pred_block(&x->src, cpi->Source, mb_row, mb_col, NULL, NULL);
 
   /* R/D setup */
   x->rddiv = cpi->RDDIV;
@@ -727,34 +710,36 @@ static void set_offsets(VP9_COMP *cpi,
       const int x = mb_col & ~3;
       const int p16 = ((mb_row & 1) << 1) +  (mb_col & 1);
       const int p32 = ((mb_row & 2) << 2) + ((mb_col & 2) << 1);
+      const int tile_progress = cm->cur_tile_mb_col_start * cm->mb_rows;
+      const int mb_cols = cm->cur_tile_mb_col_end - cm->cur_tile_mb_col_start;
 
       cpi->seg0_progress =
-          ((y * cm->mb_cols + x * 4 + p32 + p16) << 16) / cm->MBs;
+          ((y * mb_cols + x * 4 + p32 + p16 + tile_progress) << 16) / cm->MBs;
     }
   } else {
     mbmi->segment_id = 0;
   }
 }
 
-static void pick_mb_modes(VP9_COMP *cpi,
-                          int mb_row,
-                          int mb_col,
-                          TOKENEXTRA **tp,
-                          int *totalrate,
-                          int *totaldist) {
+static int pick_mb_modes(VP9_COMP *cpi,
+                         int mb_row0,
+                         int mb_col0,
+                         TOKENEXTRA **tp,
+                         int *totalrate,
+                         int *totaldist) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   int i;
-  int recon_yoffset, recon_uvoffset;
+  int splitmodes_used = 0;
   ENTROPY_CONTEXT_PLANES left_context[2];
   ENTROPY_CONTEXT_PLANES above_context[2];
   ENTROPY_CONTEXT_PLANES *initial_above_context_ptr = cm->above_context
-                                                      + mb_col;
+                                                      + mb_col0;
 
   /* Function should not modify L & A contexts; save and restore on exit */
   vpx_memcpy(left_context,
-             cm->left_context + (mb_row & 2),
+             cm->left_context + (mb_row0 & 2),
              sizeof(left_context));
   vpx_memcpy(above_context,
              initial_above_context_ptr,
@@ -763,17 +748,18 @@ static void pick_mb_modes(VP9_COMP *cpi,
   /* Encode MBs in raster order within the SB */
   for (i = 0; i < 4; i++) {
     const int x_idx = i & 1, y_idx = i >> 1;
+    const int mb_row = mb_row0 + y_idx;
+    const int mb_col = mb_col0 + x_idx;
     MB_MODE_INFO *mbmi;
 
-    if ((mb_row + y_idx >= cm->mb_rows) || (mb_col + x_idx >= cm->mb_cols)) {
+    if ((mb_row >= cm->mb_rows) || (mb_col >= cm->mb_cols)) {
       // MB lies outside frame, move on
       continue;
     }
 
     // Index of the MB in the SB 0..3
     xd->mb_index = i;
-    set_offsets(cpi, mb_row + y_idx, mb_col + x_idx, 16,
-                &recon_yoffset, &recon_uvoffset);
+    set_offsets(cpi, mb_row, mb_col, 16);
 
     if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
       vp9_activity_masking(cpi, x);
@@ -781,10 +767,6 @@ static void pick_mb_modes(VP9_COMP *cpi,
     mbmi = &xd->mode_info_context->mbmi;
     mbmi->sb_type = BLOCK_SIZE_MB16X16;
 
-    cpi->update_context = 0;    // TODO Do we need this now??
-
-    vp9_intra_prediction_down_copy(xd);
-
     // Find best coding mode & reconstruct the MB so it is available
     // as a predictor for MBs that follow in the SB
     if (cm->frame_type == KEY_FRAME) {
@@ -798,8 +780,8 @@ static void pick_mb_modes(VP9_COMP *cpi,
       *totaldist += d;
 
       // Dummy encode, do not do the tokenization
-      encode_macroblock(cpi, tp, recon_yoffset, recon_uvoffset, 0,
-                        mb_row + y_idx, mb_col + x_idx);
+      encode_macroblock(cpi, tp, 0, mb_row, mb_col);
+
       // Note the encoder may have changed the segment_id
 
       // Save the coding context
@@ -812,14 +794,14 @@ static void pick_mb_modes(VP9_COMP *cpi,
       if (enc_debug)
         printf("inter pick_mb_modes %d %d\n", mb_row, mb_col);
 #endif
-      vp9_pick_mode_inter_macroblock(cpi, x, recon_yoffset,
-                                     recon_uvoffset, &r, &d);
+      vp9_pick_mode_inter_macroblock(cpi, x, mb_row, mb_col, &r, &d);
       *totalrate += r;
       *totaldist += d;
 
+      splitmodes_used += (mbmi->mode == SPLITMV);
+
       // Dummy encode, do not do the tokenization
-      encode_macroblock(cpi, tp, recon_yoffset, recon_uvoffset, 0,
-                        mb_row + y_idx, mb_col + x_idx);
+      encode_macroblock(cpi, tp, 0, mb_row, mb_col);
 
       seg_id = mbmi->segment_id;
       if (cpi->mb.e_mbd.segmentation_enabled && seg_id == 0) {
@@ -842,12 +824,14 @@ static void pick_mb_modes(VP9_COMP *cpi,
   }
 
   /* Restore L & A coding context to those in place on entry */
-  vpx_memcpy(cm->left_context + (mb_row & 2),
+  vpx_memcpy(cm->left_context + (mb_row0 & 2),
              left_context,
              sizeof(left_context));
   vpx_memcpy(initial_above_context_ptr,
              above_context,
              sizeof(above_context));
+
+  return splitmodes_used;
 }
 
 static void pick_sb_modes(VP9_COMP *cpi,
@@ -859,13 +843,11 @@ static void pick_sb_modes(VP9_COMP *cpi,
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  int recon_yoffset, recon_uvoffset;
 
-  set_offsets(cpi, mb_row, mb_col, 32, &recon_yoffset, &recon_uvoffset);
+  set_offsets(cpi, mb_row, mb_col, 32);
   xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB32X32;
   if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
     vp9_activity_masking(cpi, x);
-  cpi->update_context = 0;    // TODO Do we need this now??
 
   /* Find best coding mode & reconstruct the MB so it is available
    * as a predictor for MBs that follow in the SB */
@@ -878,11 +860,7 @@ static void pick_sb_modes(VP9_COMP *cpi,
     vpx_memcpy(&x->sb32_context[xd->sb_index].mic, xd->mode_info_context,
                sizeof(MODE_INFO));
   } else {
-    vp9_rd_pick_inter_mode_sb32(cpi, x,
-                                recon_yoffset,
-                                recon_uvoffset,
-                                totalrate,
-                                totaldist);
+    vp9_rd_pick_inter_mode_sb32(cpi, x, mb_row, mb_col, totalrate, totaldist);
   }
 }
 
@@ -895,30 +873,21 @@ static void pick_sb64_modes(VP9_COMP *cpi,
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  int recon_yoffset, recon_uvoffset;
 
-  set_offsets(cpi, mb_row, mb_col, 64, &recon_yoffset, &recon_uvoffset);
+  set_offsets(cpi, mb_row, mb_col, 64);
   xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB64X64;
   if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
     vp9_activity_masking(cpi, x);
-  cpi->update_context = 0;    // TODO(rbultje) Do we need this now??
 
   /* Find best coding mode & reconstruct the MB so it is available
    * as a predictor for MBs that follow in the SB */
   if (cm->frame_type == KEY_FRAME) {
-    vp9_rd_pick_intra_mode_sb64(cpi, x,
-                                totalrate,
-                                totaldist);
+    vp9_rd_pick_intra_mode_sb64(cpi, x, totalrate, totaldist);
 
     /* Save the coding context */
-    vpx_memcpy(&x->sb64_context.mic, xd->mode_info_context,
-               sizeof(MODE_INFO));
+    vpx_memcpy(&x->sb64_context.mic, xd->mode_info_context, sizeof(MODE_INFO));
   } else {
-    vp9_rd_pick_inter_mode_sb64(cpi, x,
-                                recon_yoffset,
-                                recon_uvoffset,
-                                totalrate,
-                                totaldist);
+    vp9_rd_pick_inter_mode_sb64(cpi, x, mb_row, mb_col, totalrate, totaldist);
   }
 }
 
@@ -986,14 +955,13 @@ static void encode_sb(VP9_COMP *cpi,
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  int recon_yoffset, recon_uvoffset;
 
   cpi->sb32_count[is_sb]++;
   if (is_sb) {
-    set_offsets(cpi, mb_row, mb_col, 32, &recon_yoffset, &recon_uvoffset);
+    set_offsets(cpi, mb_row, mb_col, 32);
     update_state(cpi, &x->sb32_context[xd->sb_index], 32, output_enabled);
 
-    encode_superblock32(cpi, tp, recon_yoffset, recon_uvoffset,
+    encode_superblock32(cpi, tp,
                         output_enabled, mb_row, mb_col);
     if (output_enabled)
       update_stats(cpi);
@@ -1015,17 +983,14 @@ static void encode_sb(VP9_COMP *cpi,
         continue;
       }
 
-      set_offsets(cpi, mb_row + y_idx, mb_col + x_idx, 16,
-                  &recon_yoffset, &recon_uvoffset);
+      set_offsets(cpi, mb_row + y_idx, mb_col + x_idx, 16);
       xd->mb_index = i;
       update_state(cpi, &x->mb_context[xd->sb_index][i], 16, output_enabled);
 
       if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
         vp9_activity_masking(cpi, x);
 
-      vp9_intra_prediction_down_copy(xd);
-
-      encode_macroblock(cpi, tp, recon_yoffset, recon_uvoffset,
+      encode_macroblock(cpi, tp,
                         output_enabled, mb_row + y_idx, mb_col + x_idx);
       if (output_enabled)
         update_stats(cpi);
@@ -1060,11 +1025,9 @@ static void encode_sb64(VP9_COMP *cpi,
 
   cpi->sb64_count[is_sb[0] == 2]++;
   if (is_sb[0] == 2) {
-    int recon_yoffset, recon_uvoffset;
-
-    set_offsets(cpi, mb_row, mb_col, 64, &recon_yoffset, &recon_uvoffset);
+    set_offsets(cpi, mb_row, mb_col, 64);
     update_state(cpi, &x->sb64_context, 64, 1);
-    encode_superblock64(cpi, tp, recon_yoffset, recon_uvoffset,
+    encode_superblock64(cpi, tp,
                         1, mb_row, mb_col);
     update_stats(cpi);
 
@@ -1098,17 +1061,18 @@ static void encode_sb_row(VP9_COMP *cpi,
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   int mb_col;
-  int mb_cols = cm->mb_cols;
 
   // Initialize the left context for the new SB row
   vpx_memset(cm->left_context, 0, sizeof(cm->left_context));
 
   // Code each SB in the row
-  for (mb_col = 0; mb_col < mb_cols; mb_col += 4) {
+  for (mb_col = cm->cur_tile_mb_col_start;
+       mb_col < cm->cur_tile_mb_col_end; mb_col += 4) {
     int i;
     int sb32_rate = 0, sb32_dist = 0;
     int is_sb[4];
     int sb64_rate = INT_MAX, sb64_dist;
+    int sb64_skip = 0;
     ENTROPY_CONTEXT_PLANES l[4], a[4];
     TOKENEXTRA *tp_orig = *tp;
 
@@ -1118,18 +1082,27 @@ static void encode_sb_row(VP9_COMP *cpi,
       const int x_idx = (i & 1) << 1, y_idx = i & 2;
       int mb_rate = 0, mb_dist = 0;
       int sb_rate = INT_MAX, sb_dist;
+      int splitmodes_used = 0;
+      int sb32_skip = 0;
 
       if (mb_row + y_idx >= cm->mb_rows || mb_col + x_idx >= cm->mb_cols)
         continue;
 
       xd->sb_index = i;
 
-      pick_mb_modes(cpi, mb_row + y_idx, mb_col + x_idx,
-                    tp, &mb_rate, &mb_dist);
+      splitmodes_used = pick_mb_modes(cpi, mb_row + y_idx, mb_col + x_idx,
+                                      tp, &mb_rate, &mb_dist);
+
       mb_rate += vp9_cost_bit(cm->sb32_coded, 0);
 
-      if (!(((    mb_cols & 1) && mb_col + x_idx ==     mb_cols - 1) ||
-            ((cm->mb_rows & 1) && mb_row + y_idx == cm->mb_rows - 1))) {
+      if (cpi->sf.splitmode_breakout) {
+        sb32_skip = splitmodes_used;
+        sb64_skip += splitmodes_used;
+      }
+
+      if ( !sb32_skip &&
+           !(((cm->mb_cols & 1) && mb_col + x_idx == cm->mb_cols - 1) ||
+             ((cm->mb_rows & 1) && mb_row + y_idx == cm->mb_rows - 1))) {
         /* Pick a mode assuming that it applies to all 4 of the MBs in the SB */
         pick_sb_modes(cpi, mb_row + y_idx, mb_col + x_idx,
                       tp, &sb_rate, &sb_dist);
@@ -1147,6 +1120,11 @@ static void encode_sb_row(VP9_COMP *cpi,
         is_sb[i] = 0;
         sb32_rate += mb_rate;
         sb32_dist += mb_dist;
+
+        // If we used 16x16 instead of 32x32 then skip 64x64 (if enabled).
+        if (cpi->sf.mb16_breakout) {
+          ++sb64_skip;
+        }
       }
 
       /* Encode SB using best computed mode(s) */
@@ -1162,7 +1140,8 @@ static void encode_sb_row(VP9_COMP *cpi,
     memcpy(cm->left_context, &l, sizeof(l));
     sb32_rate += vp9_cost_bit(cm->sb64_coded, 0);
 
-    if (!(((    mb_cols & 3) && mb_col + 3 >=     mb_cols) ||
+    if (!sb64_skip &&
+        !(((cm->mb_cols & 3) && mb_col + 3 >= cm->mb_cols) ||
           ((cm->mb_rows & 3) && mb_row + 3 >= cm->mb_rows))) {
       pick_sb64_modes(cpi, mb_row, mb_col, tp, &sb64_rate, &sb64_dist);
       sb64_rate += vp9_cost_bit(cm->sb64_coded, 1);
@@ -1205,7 +1184,7 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
 
   // Copy data over into macro block data structures.
   x->src = *cpi->Source;
-  xd->pre = cm->yv12_fb[cm->lst_fb_idx];
+  xd->pre = cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]];
   xd->dst = cm->yv12_fb[cm->new_fb_idx];
 
   // set up frame for intra coded blocks
@@ -1239,18 +1218,33 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
   vpx_memset(cm->above_context, 0,
              sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols);
 
-  xd->fullpixel_mask = 0xffffffff;
-  if (cm->full_pixel)
-    xd->fullpixel_mask = 0xfffffff8;
+  xd->fullpixel_mask = cm->full_pixel ? 0xfffffff8 : 0xffffffff;
 }
 
+static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
+  if (lossless) {  // nonzero: install the walsh 8x4/4x4 transform hooks
+    cpi->mb.fwd_txm8x4            = vp9_short_walsh8x4_x8;
+    cpi->mb.fwd_txm4x4            = vp9_short_walsh4x4_x8;
+    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_inv_walsh4x4_1_x8;
+    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_inv_walsh4x4_x8;
+    cpi->mb.optimize              = 0;         // NOTE: these four overrides
+    cpi->common.filter_level      = 0;         // are written only in this
+    cpi->zbin_mode_boost_enabled  = FALSE;     // branch; the else branch
+    cpi->common.txfm_mode         = ONLY_4X4;  // does not reset them.
+  } else {  // zero: install the fdct / idct4x4llm transform hooks
+    cpi->mb.fwd_txm8x4            = vp9_short_fdct8x4;
+    cpi->mb.fwd_txm4x4            = vp9_short_fdct4x4;
+    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_idct4x4llm_1;
+    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_idct4x4llm;
+  }
+}
+
+
 static void encode_frame_internal(VP9_COMP *cpi) {
   int mb_row;
   MACROBLOCK *const x = &cpi->mb;
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
-
-  TOKENEXTRA *tp = cpi->tok;
   int totalrate;
 
   // printf("encode_frame_internal frame %d (%d)\n",
@@ -1273,9 +1267,6 @@ static void encode_frame_internal(VP9_COMP *cpi) {
 
   totalrate = 0;
 
-  // Functions setup for all frame types so we can use MC in AltRef
-  vp9_setup_interp_filters(xd, cm->mcomp_filter_type, cm);
-
   // Reset frame count of inter 0,0 motion vector usage.
   cpi->inter_zz_count = 0;
 
@@ -1292,16 +1283,21 @@ static void encode_frame_internal(VP9_COMP *cpi) {
 
   vp9_zero(cpi->NMVcount);
   vp9_zero(cpi->coef_counts_4x4);
-  vp9_zero(cpi->hybrid_coef_counts_4x4);
   vp9_zero(cpi->coef_counts_8x8);
-  vp9_zero(cpi->hybrid_coef_counts_8x8);
   vp9_zero(cpi->coef_counts_16x16);
-  vp9_zero(cpi->hybrid_coef_counts_16x16);
   vp9_zero(cpi->coef_counts_32x32);
 #if CONFIG_NEW_MVREF
   vp9_zero(cpi->mb_mv_ref_count);
 #endif
 
+
+  // force lossless mode when Q0 is selected
+  cpi->mb.e_mbd.lossless = (cm->base_qindex == 0 &&
+                            cm->y1dc_delta_q == 0 &&
+                            cm->uvdc_delta_q == 0 &&
+                            cm->uvac_delta_q == 0);
+  switch_lossless_mode(cpi, cpi->mb.e_mbd.lossless);
+
   vp9_frame_init_quantizer(cpi);
 
   vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y1dc_delta_q);
@@ -1330,12 +1326,20 @@ static void encode_frame_internal(VP9_COMP *cpi) {
     vpx_usec_timer_start(&emr_timer);
 
     {
-      // For each row of SBs in the frame
-      for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 4) {
-        encode_sb_row(cpi, mb_row, &tp, &totalrate);
-      }
+      // Take tiles into account and give start/end MB
+      int tile_col;
+      TOKENEXTRA *tp = cpi->tok;
+
+      for (tile_col = 0; tile_col < cm->tile_columns; tile_col++) {
+        TOKENEXTRA *tp_old = tp;
 
-      cpi->tok_count = (unsigned int)(tp - cpi->tok);
+        // For each row of SBs in the frame
+        vp9_get_tile_col_offsets(cm, tile_col);
+        for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 4) {
+          encode_sb_row(cpi, mb_row, &tp, &totalrate);
+        }
+        cpi->tok_count[tile_col] = (unsigned int)(tp - tp_old);
+      }
     }
 
     vpx_usec_timer_mark(&emr_timer);
@@ -1388,8 +1392,7 @@ static void reset_skip_txfm_size_mb(VP9_COMP *cpi,
     const int segment_id = mbmi->segment_id;
 
     xd->mode_info_context = mi;
-    assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
-            vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) ||
+    assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) ||
            (cm->mb_no_coeff_skip && mbmi->mb_skip_coeff));
     mbmi->txfm_size = txfm_max;
   }
@@ -1413,9 +1416,8 @@ static void set_txfm_flag(MODE_INFO *mi, int mis, int ymbs, int xmbs,
   int x, y;
 
   for (y = 0; y < ymbs; y++) {
-    for (x = 0; x < xmbs; x++) {
+    for (x = 0; x < xmbs; x++)
       mi[y * mis + x].mbmi.txfm_size = txfm_size;
-    }
   }
 }
 
@@ -1433,8 +1435,7 @@ static void reset_skip_txfm_size_sb32(VP9_COMP *cpi, MODE_INFO *mi,
     const int xmbs = MIN(2, mb_cols_left);
 
     xd->mode_info_context = mi;
-    assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
-            vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) ||
+    assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) ||
            (cm->mb_no_coeff_skip && get_skip_flag(mi, mis, ymbs, xmbs)));
     set_txfm_flag(mi, mis, ymbs, xmbs, txfm_max);
   }
@@ -1454,8 +1455,7 @@ static void reset_skip_txfm_size_sb64(VP9_COMP *cpi, MODE_INFO *mi,
     const int xmbs = MIN(4, mb_cols_left);
 
     xd->mode_info_context = mi;
-    assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
-            vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) ||
+    assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) ||
            (cm->mb_no_coeff_skip && get_skip_flag(mi, mis, ymbs, xmbs)));
     set_txfm_flag(mi, mis, ymbs, xmbs, txfm_max);
   }
@@ -1526,9 +1526,9 @@ void vp9_encode_frame(VP9_COMP *cpi) {
      */
     if (cpi->common.frame_type == KEY_FRAME)
       frame_type = 0;
-    else if (cpi->is_src_frame_alt_ref && cpi->common.refresh_golden_frame)
+    else if (cpi->is_src_frame_alt_ref && cpi->refresh_golden_frame)
       frame_type = 3;
-    else if (cpi->common.refresh_golden_frame || cpi->common.refresh_alt_ref_frame)
+    else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)
       frame_type = 1;
     else
       frame_type = 2;
@@ -1549,11 +1549,12 @@ void vp9_encode_frame(VP9_COMP *cpi) {
       pred_type = HYBRID_PREDICTION;
 
     /* transform size (4x4, 8x8, 16x16 or select-per-mb) selection */
-#if CONFIG_LOSSLESS
+
+    cpi->mb.e_mbd.lossless = 0;
     if (cpi->oxcf.lossless) {
       txfm_type = ONLY_4X4;
+      cpi->mb.e_mbd.lossless = 1;
     } else
-#endif
     /* FIXME (rbultje)
      * this is a hack (no really), basically to work around the complete
      * nonsense coefficient cost prediction for keyframes. The probabilities
@@ -1671,7 +1672,7 @@ void vp9_encode_frame(VP9_COMP *cpi) {
 
     // Update interpolation filter strategy for next frame.
     if ((cpi->common.frame_type != KEY_FRAME) && (cpi->sf.search_best_filter))
-      select_interp_filter_type(cpi);
+      vp9_select_interp_filter_type(cpi);
   } else {
     encode_frame_internal(cpi);
   }
@@ -1683,30 +1684,23 @@ void vp9_setup_block_ptrs(MACROBLOCK *x) {
   int i;
 
   for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
+    for (c = 0; c < 4; c++)
       x->block[r * 4 + c].src_diff = x->src_diff + r * 4 * 16 + c * 4;
-    }
   }
 
   for (r = 0; r < 2; r++) {
-    for (c = 0; c < 2; c++) {
+    for (c = 0; c < 2; c++)
       x->block[16 + r * 2 + c].src_diff = x->src_diff + 256 + r * 4 * 8 + c * 4;
-    }
   }
 
 
   for (r = 0; r < 2; r++) {
-    for (c = 0; c < 2; c++) {
+    for (c = 0; c < 2; c++)
       x->block[20 + r * 2 + c].src_diff = x->src_diff + 320 + r * 4 * 8 + c * 4;
-    }
   }
 
-  x->block[24].src_diff = x->src_diff + 384;
-
-
-  for (i = 0; i < 25; i++) {
+  for (i = 0; i < 24; i++)
     x->block[i].coeff = x->coeff + i * 16;
-  }
 }
 
 void vp9_build_block_offsets(MACROBLOCK *x) {
@@ -1995,7 +1989,6 @@ static void update_sb64_skip_coeff_state(VP9_COMP *cpi,
 }
 
 static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
-                              int recon_yoffset, int recon_uvoffset,
                               int output_enabled,
                               int mb_row, int mb_col) {
   VP9_COMMON *const cm = &cpi->common;
@@ -2007,8 +2000,8 @@ static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
   assert(!xd->mode_info_context->mbmi.sb_type);
 
 #ifdef ENC_DEBUG
-  enc_debug = (cpi->common.current_video_frame == 46 &&
-               mb_row == 5 && mb_col == 2);
+  enc_debug = (cpi->common.current_video_frame == 2 &&
+               mb_row == 5 && mb_col == 18);
   if (enc_debug)
     printf("Encode MB %d %d output %d\n", mb_row, mb_col, output_enabled);
 #endif
@@ -2086,58 +2079,50 @@ static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
     assert(cm->frame_type != KEY_FRAME);
 
     if (mbmi->ref_frame == LAST_FRAME)
-      ref_fb_idx = cpi->common.lst_fb_idx;
+      ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
     else if (mbmi->ref_frame == GOLDEN_FRAME)
-      ref_fb_idx = cpi->common.gld_fb_idx;
+      ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
     else
-      ref_fb_idx = cpi->common.alt_fb_idx;
+      ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
 
-    xd->pre.y_buffer = cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
-    xd->pre.u_buffer = cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
-    xd->pre.v_buffer = cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+    setup_pred_block(&xd->pre,
+                     &cpi->common.yv12_fb[ref_fb_idx],
+                     mb_row, mb_col,
+                     &xd->scale_factor[0], &xd->scale_factor_uv[0]);
 
     if (mbmi->second_ref_frame > 0) {
       int second_ref_fb_idx;
 
       if (mbmi->second_ref_frame == LAST_FRAME)
-        second_ref_fb_idx = cpi->common.lst_fb_idx;
+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
       else if (mbmi->second_ref_frame == GOLDEN_FRAME)
-        second_ref_fb_idx = cpi->common.gld_fb_idx;
+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
       else
-        second_ref_fb_idx = cpi->common.alt_fb_idx;
+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
 
-      xd->second_pre.y_buffer = cpi->common.yv12_fb[second_ref_fb_idx].y_buffer +
-                                recon_yoffset;
-      xd->second_pre.u_buffer = cpi->common.yv12_fb[second_ref_fb_idx].u_buffer +
-                                recon_uvoffset;
-      xd->second_pre.v_buffer = cpi->common.yv12_fb[second_ref_fb_idx].v_buffer +
-                                recon_uvoffset;
+      setup_pred_block(&xd->second_pre,
+                       &cpi->common.yv12_fb[second_ref_fb_idx],
+                       mb_row, mb_col,
+                       &xd->scale_factor[1], &xd->scale_factor_uv[1]);
     }
 
     if (!x->skip) {
-      vp9_encode_inter16x16(x);
+      vp9_encode_inter16x16(x, mb_row, mb_col);
 
       // Clear mb_skip_coeff if mb_no_coeff_skip is not set
       if (!cpi->common.mb_no_coeff_skip)
         mbmi->mb_skip_coeff = 0;
 
     } else {
-      vp9_build_1st_inter16x16_predictors_mb(xd,
-                                             xd->dst.y_buffer,
-                                             xd->dst.u_buffer,
-                                             xd->dst.v_buffer,
-                                             xd->dst.y_stride,
-                                             xd->dst.uv_stride);
-      if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-        vp9_build_2nd_inter16x16_predictors_mb(xd,
-                                               xd->dst.y_buffer,
-                                               xd->dst.u_buffer,
-                                               xd->dst.v_buffer,
-                                               xd->dst.y_stride,
-                                               xd->dst.uv_stride);
-      }
+      vp9_build_inter16x16_predictors_mb(xd,
+                                         xd->dst.y_buffer,
+                                         xd->dst.u_buffer,
+                                         xd->dst.v_buffer,
+                                         xd->dst.y_stride,
+                                         xd->dst.uv_stride,
+                                         mb_row, mb_col);
 #if CONFIG_COMP_INTERINTRA_PRED
-      else if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
+      if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
         vp9_build_interintra_16x16_predictors_mb(xd,
                                                  xd->dst.y_buffer,
                                                  xd->dst.u_buffer,
@@ -2150,7 +2135,7 @@ static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
   }
 
   if (!x->skip) {
-#ifdef ENC_DEBUG
+#if 0  // def ENC_DEBUG
     if (enc_debug) {
       int i, j;
       printf("\n");
@@ -2227,8 +2212,7 @@ static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
     int segment_id = mbmi->segment_id;
     if (cpi->common.txfm_mode == TX_MODE_SELECT &&
         !((cpi->common.mb_no_coeff_skip && mbmi->mb_skip_coeff) ||
-          (vp9_segfeature_active(&x->e_mbd, segment_id, SEG_LVL_EOB) &&
-           vp9_get_segdata(&x->e_mbd, segment_id, SEG_LVL_EOB) == 0))) {
+          (vp9_segfeature_active(&x->e_mbd, segment_id, SEG_LVL_SKIP)))) {
       assert(mbmi->txfm_size <= TX_16X16);
       if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED &&
           mbmi->mode != SPLITMV) {
@@ -2253,7 +2237,6 @@ static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
 }
 
 static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t,
-                                int recon_yoffset, int recon_uvoffset,
                                 int output_enabled, int mb_row, int mb_col) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
@@ -2326,37 +2309,37 @@ static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t,
     assert(cm->frame_type != KEY_FRAME);
 
     if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
-      ref_fb_idx = cpi->common.lst_fb_idx;
+      ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
     else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
-      ref_fb_idx = cpi->common.gld_fb_idx;
+      ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
     else
-      ref_fb_idx = cpi->common.alt_fb_idx;
+      ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
 
-    xd->pre.y_buffer = cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
-    xd->pre.u_buffer = cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
-    xd->pre.v_buffer = cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+    setup_pred_block(&xd->pre,
+                     &cpi->common.yv12_fb[ref_fb_idx],
+                     mb_row, mb_col,
+                     &xd->scale_factor[0], &xd->scale_factor_uv[0]);
 
     if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
       int second_ref_fb_idx;
 
       if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)
-        second_ref_fb_idx = cpi->common.lst_fb_idx;
+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
       else if (xd->mode_info_context->mbmi.second_ref_frame == GOLDEN_FRAME)
-        second_ref_fb_idx = cpi->common.gld_fb_idx;
+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
       else
-        second_ref_fb_idx = cpi->common.alt_fb_idx;
+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
 
-      xd->second_pre.y_buffer = cpi->common.yv12_fb[second_ref_fb_idx].y_buffer +
-                                    recon_yoffset;
-      xd->second_pre.u_buffer = cpi->common.yv12_fb[second_ref_fb_idx].u_buffer +
-                                    recon_uvoffset;
-      xd->second_pre.v_buffer = cpi->common.yv12_fb[second_ref_fb_idx].v_buffer +
-                                    recon_uvoffset;
+      setup_pred_block(&xd->second_pre,
+                       &cpi->common.yv12_fb[second_ref_fb_idx],
+                       mb_row, mb_col,
+                       &xd->scale_factor[1], &xd->scale_factor_uv[1]);
     }
 
     vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
                                        xd->dst.u_buffer, xd->dst.v_buffer,
-                                       xd->dst.y_stride, xd->dst.uv_stride);
+                                       xd->dst.y_stride, xd->dst.uv_stride,
+                                       mb_row, mb_col);
   }
 
   if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) {
@@ -2465,8 +2448,7 @@ static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t,
   if (output_enabled) {
     if (cm->txfm_mode == TX_MODE_SELECT &&
         !((cm->mb_no_coeff_skip && skip[0] && skip[1] && skip[2] && skip[3]) ||
-          (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
-           vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
+          (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {
       cpi->txfm_count_32x32p[mi->mbmi.txfm_size]++;
     } else {
       TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ?
@@ -2485,7 +2467,6 @@ static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t,
 }
 
 static void encode_superblock64(VP9_COMP *cpi, TOKENEXTRA **t,
-                                int recon_yoffset, int recon_uvoffset,
                                 int output_enabled, int mb_row, int mb_col) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
@@ -2557,40 +2538,37 @@ static void encode_superblock64(VP9_COMP *cpi, TOKENEXTRA **t,
     assert(cm->frame_type != KEY_FRAME);
 
     if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
-      ref_fb_idx = cpi->common.lst_fb_idx;
+      ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
     else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
-      ref_fb_idx = cpi->common.gld_fb_idx;
+      ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
     else
-      ref_fb_idx = cpi->common.alt_fb_idx;
+      ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
 
-    xd->pre.y_buffer =
-        cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
-    xd->pre.u_buffer =
-        cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
-    xd->pre.v_buffer =
-        cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+    setup_pred_block(&xd->pre,
+                     &cpi->common.yv12_fb[ref_fb_idx],
+                     mb_row, mb_col,
+                     &xd->scale_factor[0], &xd->scale_factor_uv[0]);
 
     if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
       int second_ref_fb_idx;
 
       if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)
-        second_ref_fb_idx = cpi->common.lst_fb_idx;
+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
       else if (xd->mode_info_context->mbmi.second_ref_frame == GOLDEN_FRAME)
-        second_ref_fb_idx = cpi->common.gld_fb_idx;
+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
       else
-        second_ref_fb_idx = cpi->common.alt_fb_idx;
+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
 
-      xd->second_pre.y_buffer =
-          cpi->common.yv12_fb[second_ref_fb_idx].y_buffer + recon_yoffset;
-      xd->second_pre.u_buffer =
-          cpi->common.yv12_fb[second_ref_fb_idx].u_buffer + recon_uvoffset;
-      xd->second_pre.v_buffer =
-          cpi->common.yv12_fb[second_ref_fb_idx].v_buffer + recon_uvoffset;
+      setup_pred_block(&xd->second_pre,
+                       &cpi->common.yv12_fb[second_ref_fb_idx],
+                       mb_row, mb_col,
+                       &xd->scale_factor[1], &xd->scale_factor_uv[1]);
     }
 
     vp9_build_inter64x64_predictors_sb(xd, xd->dst.y_buffer,
                                        xd->dst.u_buffer, xd->dst.v_buffer,
-                                       xd->dst.y_stride, xd->dst.uv_stride);
+                                       xd->dst.y_stride, xd->dst.uv_stride,
+                                       mb_row, mb_col);
   }
 
   if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) {
@@ -2729,8 +2707,7 @@ static void encode_superblock64(VP9_COMP *cpi, TOKENEXTRA **t,
              skip[4] && skip[5] && skip[6] && skip[7] &&
              skip[8] && skip[9] && skip[10] && skip[11] &&
              skip[12] && skip[13] && skip[14] && skip[15]))) ||
-          (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
-           vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
+          (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {
       cpi->txfm_count_32x32p[mi->mbmi.txfm_size]++;
     } else {
       int x, y;
diff --git a/vp9/encoder/vp9_encodeframe.h b/vp9/encoder/vp9_encodeframe.h
index 1b056e163..9f13edcec 100644
--- a/vp9/encoder/vp9_encodeframe.h
+++ b/vp9/encoder/vp9_encodeframe.h
@@ -14,8 +14,8 @@
 
 struct macroblock;
 
-extern void vp9_build_block_offsets(struct macroblock *x);
+void vp9_build_block_offsets(struct macroblock *x);
 
-extern void vp9_setup_block_ptrs(struct macroblock *x);
+void vp9_setup_block_ptrs(struct macroblock *x);
 
 #endif  // VP9_ENCODER_VP9_ENCODEFRAME_H_
diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c
index ce9a38003..be9c224b3 100644
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@@ -12,14 +12,11 @@
 #include "vp9_rtcd.h"
 #include "vp9/encoder/vp9_quantize.h"
 #include "vp9/common/vp9_reconintra.h"
-#include "vp9/common/vp9_reconintra4x4.h"
 #include "vp9/encoder/vp9_encodemb.h"
 #include "vp9/common/vp9_invtrans.h"
 #include "vp9/encoder/vp9_encodeintra.h"
 
 int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) {
-  int i;
-  int intra_pred_var = 0;
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
   (void) cpi;
 
@@ -30,15 +27,15 @@ int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) {
 
     vp9_encode_intra16x16mby(x);
   } else {
+    int i;
+
     for (i = 0; i < 16; i++) {
       x->e_mbd.block[i].bmi.as_mode.first = B_DC_PRED;
       vp9_encode_intra4x4block(x, i);
     }
   }
 
-  intra_pred_var = vp9_get_mb_ss(x->src_diff);
-
-  return intra_pred_var;
+  return vp9_get_mb_ss(x->src_diff);
 }
 
 void vp9_encode_intra4x4block(MACROBLOCK *x, int ib) {
@@ -50,17 +47,17 @@ void vp9_encode_intra4x4block(MACROBLOCK *x, int ib) {
   b->bmi.as_mode.context = vp9_find_bpred_context(b);
 #endif
 
-  vp9_intra4x4_predict(b, b->bmi.as_mode.first, b->predictor);
+  vp9_intra4x4_predict(&x->e_mbd, b, b->bmi.as_mode.first, b->predictor);
   vp9_subtract_b(be, b, 16);
 
   tx_type = get_tx_type_4x4(&x->e_mbd, b);
   if (tx_type != DCT_DCT) {
-    vp9_fht(be->src_diff, 32, be->coeff, tx_type, 4);
-    vp9_ht_quantize_b_4x4(be, b, tx_type);
-    vp9_ihtllm(b->dqcoeff, b->diff, 32, tx_type, 4, b->eob);
+    vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
+    vp9_ht_quantize_b_4x4(x, ib, tx_type);
+    vp9_short_iht4x4(b->dqcoeff, b->diff, 16, tx_type);
   } else {
-    x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
-    x->quantize_b_4x4(be, b) ;
+    x->fwd_txm4x4(be->src_diff, be->coeff, 32);
+    x->quantize_b_4x4(x, ib);
     vp9_inverse_transform_b_4x4(&x->e_mbd, ib, 32);
   }
 
@@ -72,7 +69,6 @@ void vp9_encode_intra4x4mby(MACROBLOCK *mb) {
 
   for (i = 0; i < 16; i++)
     vp9_encode_intra4x4block(mb, i);
-  return;
 }
 
 void vp9_encode_intra16x16mby(MACROBLOCK *x) {
@@ -84,24 +80,28 @@ void vp9_encode_intra16x16mby(MACROBLOCK *x) {
 
   vp9_subtract_mby(x->src_diff, *(b->base_src), xd->predictor, b->src_stride);
 
-  if (tx_size == TX_16X16) {
-    vp9_transform_mby_16x16(x);
-    vp9_quantize_mby_16x16(x);
-    if (x->optimize)
-      vp9_optimize_mby_16x16(x);
-    vp9_inverse_transform_mby_16x16(xd);
-  } else if (tx_size == TX_8X8) {
-    vp9_transform_mby_8x8(x);
-    vp9_quantize_mby_8x8(x);
-    if (x->optimize)
-      vp9_optimize_mby_8x8(x);
-    vp9_inverse_transform_mby_8x8(xd);
-  } else {
-    vp9_transform_mby_4x4(x);
-    vp9_quantize_mby_4x4(x);
-    if (x->optimize)
-      vp9_optimize_mby_4x4(x);
-    vp9_inverse_transform_mby_4x4(xd);
+  switch (tx_size) {
+    case TX_16X16:
+      vp9_transform_mby_16x16(x);
+      vp9_quantize_mby_16x16(x);
+      if (x->optimize)
+        vp9_optimize_mby_16x16(x);
+      vp9_inverse_transform_mby_16x16(xd);
+      break;
+    case TX_8X8:
+      vp9_transform_mby_8x8(x);
+      vp9_quantize_mby_8x8(x);
+      if (x->optimize)
+        vp9_optimize_mby_8x8(x);
+      vp9_inverse_transform_mby_8x8(xd);
+      break;
+    default:
+      vp9_transform_mby_4x4(x);
+      vp9_quantize_mby_4x4(x);
+      if (x->optimize)
+        vp9_optimize_mby_4x4(x);
+      vp9_inverse_transform_mby_4x4(xd);
+      break;
   }
 
   vp9_recon_mby(xd);
@@ -116,19 +116,22 @@ void vp9_encode_intra16x16mbuv(MACROBLOCK *x) {
   vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
                     xd->predictor, x->src.uv_stride);
 
-  if (tx_size == TX_4X4) {
-    vp9_transform_mbuv_4x4(x);
-    vp9_quantize_mbuv_4x4(x);
-    if (x->optimize)
-      vp9_optimize_mbuv_4x4(x);
-    vp9_inverse_transform_mbuv_4x4(xd);
-  } else /* 16x16 or 8x8 */ {
-    vp9_transform_mbuv_8x8(x);
-    vp9_quantize_mbuv_8x8(x);
-    if (x->optimize)
-      vp9_optimize_mbuv_8x8(x);
-    vp9_inverse_transform_mbuv_8x8(xd);
-  }
+  switch (tx_size) {
+    case TX_4X4:
+      vp9_transform_mbuv_4x4(x);
+      vp9_quantize_mbuv_4x4(x);
+      if (x->optimize)
+        vp9_optimize_mbuv_4x4(x);
+      vp9_inverse_transform_mbuv_4x4(xd);
+      break;
+    default:  // 16x16 or 8x8
+      vp9_transform_mbuv_8x8(x);
+      vp9_quantize_mbuv_8x8(x);
+      if (x->optimize)
+        vp9_optimize_mbuv_8x8(x);
+      vp9_inverse_transform_mbuv_8x8(xd);
+      break;
+    }
 
   vp9_recon_intra_mbuv(xd);
 }
@@ -141,7 +144,7 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) {
   int i;
   TX_TYPE tx_type;
 
-  vp9_intra8x8_predict(b, b->bmi.as_mode.first, b->predictor);
+  vp9_intra8x8_predict(xd, b, b->bmi.as_mode.first, b->predictor);
   // generate residual blocks
   vp9_subtract_4b_c(be, b, 16);
 
@@ -150,14 +153,13 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) {
 
     tx_type = get_tx_type_8x8(xd, &xd->block[ib]);
     if (tx_type != DCT_DCT) {
-      vp9_fht(be->src_diff, 32, (x->block + idx)->coeff,
-                tx_type, 8);
-      x->quantize_b_8x8(x->block + idx, xd->block + idx);
-      vp9_ihtllm(xd->block[idx].dqcoeff, xd->block[ib].diff, 32,
-                   tx_type, 8, xd->block[idx].eob);
+      vp9_short_fht8x8(be->src_diff, (x->block + idx)->coeff, 16, tx_type);
+      x->quantize_b_8x8(x, idx);
+      vp9_short_iht8x8(xd->block[idx].dqcoeff, xd->block[ib].diff,
+                            16, tx_type);
     } else {
-      x->vp9_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
-      x->quantize_b_8x8(x->block + idx, xd->block + idx);
+      x->fwd_txm8x8(be->src_diff, (x->block + idx)->coeff, 32);
+      x->quantize_b_8x8(x, idx);
       vp9_short_idct8x8(xd->block[idx].dqcoeff, xd->block[ib].diff, 32);
     }
   } else {
@@ -166,12 +168,18 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) {
       be = &x->block[ib + iblock[i]];
       tx_type = get_tx_type_4x4(xd, b);
       if (tx_type != DCT_DCT) {
-        vp9_fht_c(be->src_diff, 32, be->coeff, tx_type, 4);
-        vp9_ht_quantize_b_4x4(be, b, tx_type);
-        vp9_ihtllm(b->dqcoeff, b->diff, 32, tx_type, 4, b->eob);
+        vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
+        vp9_ht_quantize_b_4x4(x, ib + iblock[i], tx_type);
+        vp9_short_iht4x4(b->dqcoeff, b->diff, 16, tx_type);
+      } else if (!(i & 1) && get_tx_type_4x4(xd, b + 1) == DCT_DCT) {
+        x->fwd_txm8x4(be->src_diff, be->coeff, 32);
+        x->quantize_b_4x4_pair(x, ib + iblock[i], ib + iblock[i] + 1);
+        vp9_inverse_transform_b_4x4(xd, ib + iblock[i], 32);
+        vp9_inverse_transform_b_4x4(xd, ib + iblock[i] + 1, 32);
+        i++;
       } else {
-        x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
-        x->quantize_b_4x4(be, b);
+        x->fwd_txm4x4(be->src_diff, be->coeff, 32);
+        x->quantize_b_4x4(x, ib + iblock[i]);
         vp9_inverse_transform_b_4x4(xd, ib + iblock[i], 32);
       }
     }
@@ -186,25 +194,22 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) {
 }
 
 void vp9_encode_intra8x8mby(MACROBLOCK *x) {
-  int i, ib;
+  int i;
 
-  for (i = 0; i < 4; i++) {
-    ib = vp9_i8x8_block[i];
-    vp9_encode_intra8x8(x, ib);
-  }
+  for (i = 0; i < 4; i++)
+    vp9_encode_intra8x8(x, vp9_i8x8_block[i]);
 }
 
-static void encode_intra_uv4x4(MACROBLOCK *x, int ib,
-                               int mode) {
+static void encode_intra_uv4x4(MACROBLOCK *x, int ib, int mode) {
   BLOCKD *b = &x->e_mbd.block[ib];
   BLOCK *be = &x->block[ib];
 
-  vp9_intra_uv4x4_predict(b, mode, b->predictor);
+  vp9_intra_uv4x4_predict(&x->e_mbd, b, mode, b->predictor);
 
   vp9_subtract_b(be, b, 8);
 
-  x->vp9_short_fdct4x4(be->src_diff, be->coeff, 16);
-  x->quantize_b_4x4(be, b);
+  x->fwd_txm4x4(be->src_diff, be->coeff, 16);
+  x->quantize_b_4x4(x, ib);
   vp9_inverse_transform_b_4x4(&x->e_mbd, ib, 16);
 
   vp9_recon_uv_b_c(b->predictor, b->diff, *(b->base_dst) + b->dst,
@@ -212,17 +217,13 @@ static void encode_intra_uv4x4(MACROBLOCK *x, int ib,
 }
 
 void vp9_encode_intra8x8mbuv(MACROBLOCK *x) {
-  int i, ib, mode;
-  BLOCKD *b;
+  int i;
 
   for (i = 0; i < 4; i++) {
-    ib = vp9_i8x8_block[i];
-    b = &x->e_mbd.block[ib];
-    mode = b->bmi.as_mode.first;
-
-    /*u */
-    encode_intra_uv4x4(x, i + 16, mode);
-    /*v */
-    encode_intra_uv4x4(x, i + 20, mode);
+    BLOCKD *b = &x->e_mbd.block[vp9_i8x8_block[i]];
+    int mode = b->bmi.as_mode.first;
+
+    encode_intra_uv4x4(x, i + 16, mode);  // u
+    encode_intra_uv4x4(x, i + 20, mode);  // v
   }
 }
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 45278a71b..62f1a2a30 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -29,9 +29,8 @@ void vp9_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch) {
   int r, c;
 
   for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
+    for (c = 0; c < 4; c++)
       diff_ptr[c] = src_ptr[c] - pred_ptr[c];
-    }
 
     diff_ptr += pitch;
     pred_ptr += pitch;
@@ -47,9 +46,9 @@ void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch) {
   int r, c;
 
   for (r = 0; r < 8; r++) {
-    for (c = 0; c < 8; c++) {
+    for (c = 0; c < 8; c++)
       diff_ptr[c] = src_ptr[c] - pred_ptr[c];
-    }
+
     diff_ptr += pitch;
     pred_ptr += pitch;
     src_ptr  += src_stride;
@@ -65,9 +64,8 @@ void vp9_subtract_mbuv_s_c(int16_t *diff, const uint8_t *usrc,
   int r, c;
 
   for (r = 0; r < 8; r++) {
-    for (c = 0; c < 8; c++) {
+    for (c = 0; c < 8; c++)
       udiff[c] = usrc[c] - upred[c];
-    }
 
     udiff += 8;
     upred += dst_stride;
@@ -98,9 +96,8 @@ void vp9_subtract_mby_s_c(int16_t *diff, const uint8_t *src, int src_stride,
   int r, c;
 
   for (r = 0; r < 16; r++) {
-    for (c = 0; c < 16; c++) {
+    for (c = 0; c < 16; c++)
       diff[c] = src[c] - pred[c];
-    }
 
     diff += 16;
     pred += dst_stride;
@@ -113,9 +110,8 @@ void vp9_subtract_sby_s_c(int16_t *diff, const uint8_t *src, int src_stride,
   int r, c;
 
   for (r = 0; r < 32; r++) {
-    for (c = 0; c < 32; c++) {
+    for (c = 0; c < 32; c++)
       diff[c] = src[c] - pred[c];
-    }
 
     diff += 32;
     pred += dst_stride;
@@ -132,9 +128,8 @@ void vp9_subtract_sbuv_s_c(int16_t *diff, const uint8_t *usrc,
   int r, c;
 
   for (r = 0; r < 16; r++) {
-    for (c = 0; c < 16; c++) {
+    for (c = 0; c < 16; c++)
       udiff[c] = usrc[c] - upred[c];
-    }
 
     udiff += 16;
     upred += dst_stride;
@@ -142,9 +137,8 @@ void vp9_subtract_sbuv_s_c(int16_t *diff, const uint8_t *usrc,
   }
 
   for (r = 0; r < 16; r++) {
-    for (c = 0; c < 16; c++) {
+    for (c = 0; c < 16; c++)
       vdiff[c] = vsrc[c] - vpred[c];
-    }
 
     vdiff += 16;
     vpred += dst_stride;
@@ -166,52 +160,29 @@ static void subtract_mb(MACROBLOCK *x) {
                     x->e_mbd.predictor, x->src.uv_stride);
 }
 
-static void build_dcblock_4x4(MACROBLOCK *x) {
-  int16_t *src_diff_ptr = &x->src_diff[384];
-  int i;
-
-  for (i = 0; i < 16; i++) {
-    src_diff_ptr[i] = x->coeff[i * 16];
-    x->coeff[i * 16] = 0;
-  }
-}
-
 void vp9_transform_mby_4x4(MACROBLOCK *x) {
   int i;
   MACROBLOCKD *xd = &x->e_mbd;
-  int has_2nd_order = get_2nd_order_usage(xd);
 
   for (i = 0; i < 16; i++) {
     BLOCK *b = &x->block[i];
     TX_TYPE tx_type = get_tx_type_4x4(xd, &xd->block[i]);
     if (tx_type != DCT_DCT) {
-      assert(has_2nd_order == 0);
-      vp9_fht_c(b->src_diff, 32, b->coeff, tx_type, 4);
+      vp9_short_fht4x4(b->src_diff, b->coeff, 16, tx_type);
+    } else if (!(i & 1) && get_tx_type_4x4(xd, &xd->block[i + 1]) == DCT_DCT) {
+      x->fwd_txm8x4(x->block[i].src_diff, x->block[i].coeff, 32);
+      i++;
     } else {
-      x->vp9_short_fdct4x4(&x->block[i].src_diff[0],
-                           &x->block[i].coeff[0], 32);
+      x->fwd_txm4x4(x->block[i].src_diff, x->block[i].coeff, 32);
     }
   }
-
-  if (has_2nd_order) {
-    // build dc block from 16 y dc values
-    build_dcblock_4x4(x);
-
-    // do 2nd order transform on the dc block
-    x->short_walsh4x4(&x->block[24].src_diff[0],
-                      &x->block[24].coeff[0], 8);
-  } else {
-    vpx_memset(x->block[24].coeff, 0, 16 * sizeof(x->block[24].coeff[0]));
-  }
 }
 
 void vp9_transform_mbuv_4x4(MACROBLOCK *x) {
   int i;
 
-  for (i = 16; i < 24; i += 2) {
-    x->vp9_short_fdct8x4(&x->block[i].src_diff[0],
-                         &x->block[i].coeff[0], 16);
-  }
+  for (i = 16; i < 24; i += 2)
+    x->fwd_txm8x4(x->block[i].src_diff, x->block[i].coeff, 16);
 }
 
 static void transform_mb_4x4(MACROBLOCK *x) {
@@ -219,71 +190,36 @@ static void transform_mb_4x4(MACROBLOCK *x) {
   vp9_transform_mbuv_4x4(x);
 }
 
-static void build_dcblock_8x8(MACROBLOCK *x) {
-  int16_t *src_diff_ptr = x->block[24].src_diff;
-  int i;
-
-  for (i = 0; i < 16; i++) {
-    src_diff_ptr[i] = 0;
-  }
-  src_diff_ptr[0] = x->coeff[0 * 16];
-  src_diff_ptr[1] = x->coeff[4 * 16];
-  src_diff_ptr[4] = x->coeff[8 * 16];
-  src_diff_ptr[8] = x->coeff[12 * 16];
-  x->coeff[0 * 16] = 0;
-  x->coeff[4 * 16] = 0;
-  x->coeff[8 * 16] = 0;
-  x->coeff[12 * 16] = 0;
-}
-
 void vp9_transform_mby_8x8(MACROBLOCK *x) {
   int i;
   MACROBLOCKD *xd = &x->e_mbd;
   TX_TYPE tx_type;
-  int has_2nd_order = get_2nd_order_usage(xd);
 
   for (i = 0; i < 9; i += 8) {
     BLOCK *b = &x->block[i];
     tx_type = get_tx_type_8x8(xd, &xd->block[i]);
     if (tx_type != DCT_DCT) {
-      assert(has_2nd_order == 0);
-      vp9_fht_c(b->src_diff, 32, b->coeff, tx_type, 8);
+      vp9_short_fht8x8(b->src_diff, b->coeff, 16, tx_type);
     } else {
-      x->vp9_short_fdct8x8(&x->block[i].src_diff[0],
-                           &x->block[i].coeff[0], 32);
+      x->fwd_txm8x8(x->block[i].src_diff, x->block[i].coeff, 32);
     }
   }
   for (i = 2; i < 11; i += 8) {
     BLOCK *b = &x->block[i];
     tx_type = get_tx_type_8x8(xd, &xd->block[i]);
     if (tx_type != DCT_DCT) {
-      assert(has_2nd_order == 0);
-      vp9_fht_c(b->src_diff, 32, (b + 2)->coeff, tx_type, 8);
+      vp9_short_fht8x8(b->src_diff, (b + 2)->coeff, 16, tx_type);
     } else {
-      x->vp9_short_fdct8x8(&x->block[i].src_diff[0],
-                           &x->block[i + 2].coeff[0], 32);
+      x->fwd_txm8x8(x->block[i].src_diff, x->block[i + 2].coeff, 32);
     }
   }
-
-  if (has_2nd_order) {
-    // build dc block from 2x2 y dc values
-    build_dcblock_8x8(x);
-
-    // do 2nd order transform on the dc block
-    x->short_fhaar2x2(&x->block[24].src_diff[0],
-                      &x->block[24].coeff[0], 8);
-  } else {
-    vpx_memset(x->block[24].coeff, 0, 16 * sizeof(x->block[24].coeff[0]));
-  }
 }
 
 void vp9_transform_mbuv_8x8(MACROBLOCK *x) {
   int i;
 
-  for (i = 16; i < 24; i += 4) {
-    x->vp9_short_fdct8x8(&x->block[i].src_diff[0],
-                         &x->block[i].coeff[0], 16);
-  }
+  for (i = 16; i < 24; i += 4)
+    x->fwd_txm8x8(x->block[i].src_diff, x->block[i].coeff, 16);
 }
 
 void vp9_transform_mb_8x8(MACROBLOCK *x) {
@@ -297,10 +233,9 @@ void vp9_transform_mby_16x16(MACROBLOCK *x) {
   TX_TYPE tx_type = get_tx_type_16x16(xd, &xd->block[0]);
   vp9_clear_system_state();
   if (tx_type != DCT_DCT) {
-    vp9_fht_c(b->src_diff, 32, b->coeff, tx_type, 16);
+    vp9_short_fht16x16(b->src_diff, b->coeff, 16, tx_type);
   } else {
-    x->vp9_short_fdct16x16(&x->block[0].src_diff[0],
-                           &x->block[0].coeff[0], 32);
+    x->fwd_txm16x16(x->block[0].src_diff, x->block[0].coeff, 32);
   }
 }
 
@@ -317,10 +252,8 @@ void vp9_transform_sby_32x32(MACROBLOCK *x) {
 void vp9_transform_sbuv_16x16(MACROBLOCK *x) {
   SUPERBLOCK * const x_sb = &x->sb_coeff_data;
   vp9_clear_system_state();
-  x->vp9_short_fdct16x16(x_sb->src_diff + 1024,
-                         x_sb->coeff + 1024, 32);
-  x->vp9_short_fdct16x16(x_sb->src_diff + 1280,
-                         x_sb->coeff + 1280, 32);
+  x->fwd_txm16x16(x_sb->src_diff + 1024, x_sb->coeff + 1024, 32);
+  x->fwd_txm16x16(x_sb->src_diff + 1280, x_sb->coeff + 1280, 32);
 }
 
 #define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
@@ -338,13 +271,10 @@ struct vp9_token_state {
 // TODO: experiments to find optimal multiple numbers
 #define Y1_RD_MULT 4
 #define UV_RD_MULT 2
-#define Y2_RD_MULT 4
 
 static const int plane_rd_mult[4] = {
   Y1_RD_MULT,
-  Y2_RD_MULT,
   UV_RD_MULT,
-  Y1_RD_MULT
 };
 
 #define UPDATE_RD_COST()\
@@ -357,34 +287,39 @@ static const int plane_rd_mult[4] = {
   }\
 }
 
+// This function is a placeholder for now, but it may ultimately need
+// to scan previous tokens to work out the correct context.
+static int trellis_get_coeff_context(int token) {
+  int recent_energy = 0;
+  return vp9_get_coef_context(&recent_energy, token);
+}
+
 static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type,
                        ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
                        int tx_size) {
+  const int ref = mb->e_mbd.mode_info_context->mbmi.ref_frame != INTRA_FRAME;
+  MACROBLOCKD *const xd = &mb->e_mbd;
   BLOCK *b = &mb->block[i];
-  BLOCKD *d = &mb->e_mbd.block[i];
+  BLOCKD *d = &xd->block[i];
   vp9_token_state tokens[257][2];
   unsigned best_index[257][2];
   const int16_t *dequant_ptr = d->dequant, *coeff_ptr = b->coeff;
   int16_t *qcoeff_ptr = d->qcoeff;
   int16_t *dqcoeff_ptr = d->dqcoeff;
-  int eob = d->eob, final_eob, sz = 0;
-  int i0 = (type == PLANE_TYPE_Y_NO_DC);
+  int eob = xd->eobs[i], final_eob, sz = 0;
+  const int i0 = 0;
   int rc, x, next;
   int64_t rdmult, rddiv, rd_cost0, rd_cost1;
   int rate0, rate1, error0, error1, t0, t1;
   int best, band, pt;
   int err_mult = plane_rd_mult[type];
   int default_eob;
-  int const *scan, *bands;
-#if CONFIG_NEWCOEFCONTEXT
-  const int *neighbors;
-#endif
+  int const *scan;
 
   switch (tx_size) {
     default:
     case TX_4X4:
       scan = vp9_default_zig_zag1d_4x4;
-      bands = vp9_coef_bands_4x4;
       default_eob = 16;
       // TODO: this isn't called (for intra4x4 modes), but will be left in
       // since it could be used later
@@ -411,18 +346,13 @@ static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type,
       break;
     case TX_8X8:
       scan = vp9_default_zig_zag1d_8x8;
-      bands = vp9_coef_bands_8x8;
       default_eob = 64;
       break;
     case TX_16X16:
       scan = vp9_default_zig_zag1d_16x16;
-      bands = vp9_coef_bands_16x16;
       default_eob = 256;
       break;
   }
-#if CONFIG_NEWCOEFCONTEXT
-  neighbors = vp9_get_coef_neighbors_handle(scan);
-#endif
 
   /* Now set up a Viterbi trellis to evaluate alternative roundings. */
   rdmult = mb->rdmult * err_mult;
@@ -454,17 +384,12 @@ static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type,
       t0 = (vp9_dct_value_tokens_ptr + x)->Token;
       /* Consider both possible successor states. */
       if (next < default_eob) {
-        band = bands[i + 1];
-        pt = vp9_prev_token_class[t0];
-#if CONFIG_NEWCOEFCONTEXT
-        if (NEWCOEFCONTEXT_BAND_COND(band))
-          pt = vp9_get_coef_neighbor_context(
-              qcoeff_ptr, i0, neighbors, scan[i + 1]);
-#endif
+        band = get_coef_band(tx_size, i + 1);
+        pt = trellis_get_coeff_context(t0);
         rate0 +=
-          mb->token_costs[tx_size][type][band][pt][tokens[next][0].token];
+          mb->token_costs[tx_size][type][ref][band][pt][tokens[next][0].token];
         rate1 +=
-          mb->token_costs[tx_size][type][band][pt][tokens[next][1].token];
+          mb->token_costs[tx_size][type][ref][band][pt][tokens[next][1].token];
       }
       UPDATE_RD_COST();
       /* And pick the best. */
@@ -506,37 +431,15 @@ static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type,
         t0 = t1 = (vp9_dct_value_tokens_ptr + x)->Token;
       }
       if (next < default_eob) {
-        band = bands[i + 1];
+        band = get_coef_band(tx_size, i + 1);
         if (t0 != DCT_EOB_TOKEN) {
-#if CONFIG_NEWCOEFCONTEXT
-          int tmp = qcoeff_ptr[scan[i]];
-          qcoeff_ptr[scan[i]] = x;
-          if (NEWCOEFCONTEXT_BAND_COND(band))
-            pt = vp9_get_coef_neighbor_context(
-                qcoeff_ptr, i0, neighbors, scan[i + 1]);
-          else
-            pt = vp9_prev_token_class[t0];
-          qcoeff_ptr[scan[i]] = tmp;
-#else
-          pt = vp9_prev_token_class[t0];
-#endif
-          rate0 += mb->token_costs[tx_size][type][band][pt][
+          pt = trellis_get_coeff_context(t0);
+          rate0 += mb->token_costs[tx_size][type][ref][band][pt][
               tokens[next][0].token];
         }
         if (t1 != DCT_EOB_TOKEN) {
-#if CONFIG_NEWCOEFCONTEXT
-          int tmp = qcoeff_ptr[scan[i]];
-          qcoeff_ptr[scan[i]] = x;
-          if (NEWCOEFCONTEXT_BAND_COND(band))
-            pt = vp9_get_coef_neighbor_context(
-                qcoeff_ptr, i0, neighbors, scan[i + 1]);
-          else
-            pt = vp9_prev_token_class[t1];
-          qcoeff_ptr[scan[i]] = tmp;
-#else
-          pt = vp9_prev_token_class[t1];
-#endif
-          rate1 += mb->token_costs[tx_size][type][band][pt][
+          pt = trellis_get_coeff_context(t1);
+          rate1 += mb->token_costs[tx_size][type][ref][band][pt][
               tokens[next][1].token];
         }
       }
@@ -563,16 +466,18 @@ static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type,
      *  add a new trellis node, but we do need to update the costs.
      */
     else {
-      band = bands[i + 1];
+      band = get_coef_band(tx_size, i + 1);
       t0 = tokens[next][0].token;
       t1 = tokens[next][1].token;
       /* Update the cost of each path if we're past the EOB token. */
       if (t0 != DCT_EOB_TOKEN) {
-        tokens[next][0].rate += mb->token_costs[tx_size][type][band][0][t0];
+        tokens[next][0].rate +=
+            mb->token_costs[tx_size][type][ref][band][0][t0];
         tokens[next][0].token = ZERO_TOKEN;
       }
       if (t1 != DCT_EOB_TOKEN) {
-        tokens[next][1].rate += mb->token_costs[tx_size][type][band][0][t1];
+        tokens[next][1].rate +=
+            mb->token_costs[tx_size][type][ref][band][0][t1];
         tokens[next][1].token = ZERO_TOKEN;
       }
       /* Don't update next, because we didn't add a new node. */
@@ -580,7 +485,7 @@ static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type,
   }
 
   /* Now pick the best path through the whole trellis. */
-  band = bands[i + 1];
+  band = get_coef_band(tx_size, i + 1);
   VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
   rate0 = tokens[next][0].rate;
   rate1 = tokens[next][1].rate;
@@ -588,8 +493,8 @@ static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type,
   error1 = tokens[next][1].error;
   t0 = tokens[next][0].token;
   t1 = tokens[next][1].token;
-  rate0 += mb->token_costs[tx_size][type][band][pt][t0];
-  rate1 += mb->token_costs[tx_size][type][band][pt][t1];
+  rate0 += mb->token_costs[tx_size][type][ref][band][pt][t0];
+  rate1 += mb->token_costs[tx_size][type][ref][band][pt][t1];
   UPDATE_RD_COST();
   best = rd_cost1 < rd_cost0;
   final_eob = i0 - 1;
@@ -606,81 +511,12 @@ static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type,
   }
   final_eob++;
 
-  d->eob = final_eob;
-  *a = *l = (d->eob > !type);
-}
-
-/**************************************************************************
-our inverse hadamard transform effectively is weighted sum of all 16 inputs
-with weight either 1 or -1. It has a last stage scaling of (sum+1)>>2. And
-dc only idct is (dc+16)>>5. So if all the sums are between -65 and 63 the
-output after inverse wht and idct will be all zero. A sum of absolute value
-smaller than 65 guarantees all 16 different (+1/-1) weighted sums in wht
-fall between -65 and +65.
-**************************************************************************/
-#define SUM_2ND_COEFF_THRESH 65
-
-static void check_reset_2nd_coeffs(MACROBLOCKD *xd,
-                                   ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
-  int sum = 0;
-  int i;
-  BLOCKD *bd = &xd->block[24];
-  if (bd->dequant[0] >= SUM_2ND_COEFF_THRESH
-      && bd->dequant[1] >= SUM_2ND_COEFF_THRESH)
-    return;
-
-  for (i = 0; i < bd->eob; i++) {
-    int coef = bd->dqcoeff[vp9_default_zig_zag1d_4x4[i]];
-    sum += (coef >= 0) ? coef : -coef;
-    if (sum >= SUM_2ND_COEFF_THRESH)
-      return;
-  }
-
-  if (sum < SUM_2ND_COEFF_THRESH) {
-    for (i = 0; i < bd->eob; i++) {
-      int rc = vp9_default_zig_zag1d_4x4[i];
-      bd->qcoeff[rc] = 0;
-      bd->dqcoeff[rc] = 0;
-    }
-    bd->eob = 0;
-    *a = *l = (bd->eob != 0);
-  }
-}
-
-#define SUM_2ND_COEFF_THRESH_8X8 32
-static void check_reset_8x8_2nd_coeffs(MACROBLOCKD *xd,
-                                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
-  int sum = 0;
-  BLOCKD *bd = &xd->block[24];
-  int coef;
-
-  coef = bd->dqcoeff[0];
-  sum += (coef >= 0) ? coef : -coef;
-  coef = bd->dqcoeff[1];
-  sum += (coef >= 0) ? coef : -coef;
-  coef = bd->dqcoeff[4];
-  sum += (coef >= 0) ? coef : -coef;
-  coef = bd->dqcoeff[8];
-  sum += (coef >= 0) ? coef : -coef;
-
-  if (sum < SUM_2ND_COEFF_THRESH_8X8) {
-    bd->qcoeff[0] = 0;
-    bd->dqcoeff[0] = 0;
-    bd->qcoeff[1] = 0;
-    bd->dqcoeff[1] = 0;
-    bd->qcoeff[4] = 0;
-    bd->dqcoeff[4] = 0;
-    bd->qcoeff[8] = 0;
-    bd->dqcoeff[8] = 0;
-    bd->eob = 0;
-    *a = *l = (bd->eob != 0);
-  }
+  xd->eobs[d - xd->block] = final_eob;
+  *a = *l = (final_eob > 0);
 }
 
 void vp9_optimize_mby_4x4(MACROBLOCK *x) {
   int b;
-  PLANE_TYPE type;
-  int has_2nd_order;
   ENTROPY_CONTEXT_PLANES t_above, t_left;
   ENTROPY_CONTEXT *ta;
   ENTROPY_CONTEXT *tl;
@@ -694,25 +530,11 @@ void vp9_optimize_mby_4x4(MACROBLOCK *x) {
   ta = (ENTROPY_CONTEXT *)&t_above;
   tl = (ENTROPY_CONTEXT *)&t_left;
 
-  has_2nd_order = get_2nd_order_usage(&x->e_mbd);
-
-  type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
-
   for (b = 0; b < 16; b++) {
-    optimize_b(x, b, type,
+    optimize_b(x, b, PLANE_TYPE_Y_WITH_DC,
                ta + vp9_block2above[TX_4X4][b],
                tl + vp9_block2left[TX_4X4][b], TX_4X4);
   }
-
-  if (has_2nd_order) {
-    b = 24;
-    optimize_b(x, b, PLANE_TYPE_Y2,
-               ta + vp9_block2above[TX_4X4][b],
-               tl + vp9_block2left[TX_4X4][b], TX_4X4);
-    check_reset_2nd_coeffs(&x->e_mbd,
-                           ta + vp9_block2above[TX_4X4][b],
-                           tl + vp9_block2left[TX_4X4][b]);
-  }
 }
 
 void vp9_optimize_mbuv_4x4(MACROBLOCK *x) {
@@ -744,11 +566,9 @@ static void optimize_mb_4x4(MACROBLOCK *x) {
 
 void vp9_optimize_mby_8x8(MACROBLOCK *x) {
   int b;
-  PLANE_TYPE type;
   ENTROPY_CONTEXT_PLANES t_above, t_left;
   ENTROPY_CONTEXT *ta;
   ENTROPY_CONTEXT *tl;
-  int has_2nd_order = get_2nd_order_usage(&x->e_mbd);
 
   if (!x->e_mbd.above_context || !x->e_mbd.left_context)
     return;
@@ -758,28 +578,15 @@ void vp9_optimize_mby_8x8(MACROBLOCK *x) {
 
   ta = (ENTROPY_CONTEXT *)&t_above;
   tl = (ENTROPY_CONTEXT *)&t_left;
-  type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
   for (b = 0; b < 16; b += 4) {
     ENTROPY_CONTEXT *const a = ta + vp9_block2above[TX_8X8][b];
     ENTROPY_CONTEXT *const l = tl + vp9_block2left[TX_8X8][b];
-#if CONFIG_CNVCONTEXT
     ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0;
     ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0;
-#else
-    ENTROPY_CONTEXT above_ec = a[0];
-    ENTROPY_CONTEXT left_ec = l[0];
-#endif
-    optimize_b(x, b, type, &above_ec, &left_ec, TX_8X8);
+    optimize_b(x, b, PLANE_TYPE_Y_WITH_DC, &above_ec, &left_ec, TX_8X8);
     a[1] = a[0] = above_ec;
     l[1] = l[0] = left_ec;
   }
-
-  // 8x8 always have 2nd order block
-  if (has_2nd_order) {
-    check_reset_8x8_2nd_coeffs(&x->e_mbd,
-                               ta + vp9_block2above[TX_8X8][24],
-                               tl + vp9_block2left[TX_8X8][24]);
-  }
 }
 
 void vp9_optimize_mbuv_8x8(MACROBLOCK *x) {
@@ -793,13 +600,8 @@ void vp9_optimize_mbuv_8x8(MACROBLOCK *x) {
   for (b = 16; b < 24; b += 4) {
     ENTROPY_CONTEXT *const a = ta + vp9_block2above[TX_8X8][b];
     ENTROPY_CONTEXT *const l = tl + vp9_block2left[TX_8X8][b];
-#if CONFIG_CNVCONTEXT
     ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0;
     ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0;
-#else
-    ENTROPY_CONTEXT above_ec = a[0];
-    ENTROPY_CONTEXT left_ec = l[0];
-#endif
     optimize_b(x, b, PLANE_TYPE_UV, &above_ec, &left_ec, TX_8X8);
   }
 }
@@ -817,13 +619,8 @@ void vp9_optimize_mby_16x16(MACROBLOCK *x) {
   if (!t_above || !t_left)
     return;
 
-#if CONFIG_CNVCONTEXT
   ta = (t_above->y1[0] + t_above->y1[1] + t_above->y1[2] + t_above->y1[3]) != 0;
   tl = (t_left->y1[0] + t_left->y1[1] + t_left->y1[2] + t_left->y1[3]) != 0;
-#else
-  ta = t_above->y1[0];
-  tl = t_left->y1[0];
-#endif
   optimize_b(x, 0, PLANE_TYPE_Y_WITH_DC, &ta, &tl, TX_16X16);
 }
 
@@ -871,21 +668,21 @@ void vp9_fidct_mb(MACROBLOCK *x) {
   }
 }
 
-void vp9_encode_inter16x16(MACROBLOCK *x) {
+void vp9_encode_inter16x16(MACROBLOCK *x, int mb_row, int mb_col) {
   MACROBLOCKD *const xd = &x->e_mbd;
 
-  vp9_build_inter_predictors_mb(xd);
+  vp9_build_inter_predictors_mb(xd, mb_row, mb_col);
   subtract_mb(x);
   vp9_fidct_mb(x);
   vp9_recon_mb(xd);
 }
 
 /* this function is used by first pass only */
-void vp9_encode_inter16x16y(MACROBLOCK *x) {
+void vp9_encode_inter16x16y(MACROBLOCK *x, int mb_row, int mb_col) {
   MACROBLOCKD *xd = &x->e_mbd;
   BLOCK *b = &x->block[0];
 
-  vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);
+  vp9_build_inter16x16_predictors_mby(xd, xd->predictor, 16, mb_row, mb_col);
 
   vp9_subtract_mby(x->src_diff, *(b->base_src), xd->predictor, b->src_stride);
 
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index f3c679227..6356df215 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -23,14 +23,14 @@ typedef struct {
 
 #include "vp9/encoder/vp9_onyx_int.h"
 struct VP9_ENCODER_RTCD;
-void vp9_encode_inter16x16(MACROBLOCK *x);
+void vp9_encode_inter16x16(MACROBLOCK *x, int mb_row, int mb_col);
 
 void vp9_transform_mbuv_4x4(MACROBLOCK *x);
 void vp9_transform_mby_4x4(MACROBLOCK *x);
 
 void vp9_optimize_mby_4x4(MACROBLOCK *x);
 void vp9_optimize_mbuv_4x4(MACROBLOCK *x);
-void vp9_encode_inter16x16y(MACROBLOCK *x);
+void vp9_encode_inter16x16y(MACROBLOCK *x, int mb_row, int mb_col);
 
 void vp9_transform_mb_8x8(MACROBLOCK *mb);
 void vp9_transform_mby_8x8(MACROBLOCK *x);
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 8df6c20a7..337276d59 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -435,9 +435,11 @@ void vp9_first_pass(VP9_COMP *cpi) {
   MACROBLOCKD *const xd = &x->e_mbd;
 
   int recon_yoffset, recon_uvoffset;
-  YV12_BUFFER_CONFIG *lst_yv12 = &cm->yv12_fb[cm->lst_fb_idx];
+  YV12_BUFFER_CONFIG *lst_yv12 =
+      &cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]];
   YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
-  YV12_BUFFER_CONFIG *gld_yv12 = &cm->yv12_fb[cm->gld_fb_idx];
+  YV12_BUFFER_CONFIG *gld_yv12 =
+      &cm->yv12_fb[cm->ref_frame_map[cpi->gld_fb_idx]];
   int recon_y_stride = lst_yv12->y_stride;
   int recon_uv_stride = lst_yv12->uv_stride;
   int64_t intra_error = 0;
@@ -611,7 +613,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
           this_error = motion_error;
           vp9_set_mbmode_and_mvs(x, NEWMV, &mv);
           xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-          vp9_encode_inter16x16y(x);
+          vp9_encode_inter16x16y(x, mb_row, mb_col);
           sum_mvr += mv.as_mv.row;
           sum_mvr_abs += abs(mv.as_mv.row);
           sum_mvc += mv.as_mv.col;
@@ -843,16 +845,13 @@ static double calc_correction_factor(double err_per_mb,
   power_term = (vp9_convert_qindex_to_q(Q) * 0.01) + pt_low;
   power_term = (power_term > pt_high) ? pt_high : power_term;
 
-  // Adjustments to error term
-  // TBD
-
   // Calculate correction factor
   correction_factor = pow(error_term, power_term);
 
   // Clip range
   correction_factor =
     (correction_factor < 0.05)
-    ? 0.05 : (correction_factor > 2.0) ? 2.0 : correction_factor;
+    ? 0.05 : (correction_factor > 5.0) ? 5.0 : correction_factor;
 
   return correction_factor;
 }
@@ -886,8 +885,7 @@ static void adjust_maxq_qrange(VP9_COMP *cpi) {
 
 static int estimate_max_q(VP9_COMP *cpi,
                           FIRSTPASS_STATS *fpstats,
-                          int section_target_bandwitdh,
-                          int overhead_bits) {
+                          int section_target_bandwitdh) {
   int Q;
   int num_mbs = cpi->common.MBs;
   int target_norm_bits_per_mb;
@@ -898,7 +896,6 @@ static int estimate_max_q(VP9_COMP *cpi,
   double err_per_mb = section_err / num_mbs;
   double err_correction_factor;
   double speed_correction = 1.0;
-  double overhead_bits_per_mb;
 
   if (section_target_bandwitdh <= 0)
     return cpi->twopass.maxq_max_limit;          // Highest value allowed
@@ -950,13 +947,6 @@ static int estimate_max_q(VP9_COMP *cpi,
       speed_correction = 1.25;
   }
 
-  // Estimate of overhead bits per mb
-  // Correction to overhead bits for min allowed Q.
-  // PGW TODO.. This code is broken for the extended Q range
-  //            for now overhead set to 0.
-  overhead_bits_per_mb = overhead_bits / num_mbs;
-  overhead_bits_per_mb *= pow(0.98, (double)cpi->twopass.maxq_min_limit);
-
   // Try and pick a max Q that will be high enough to encode the
   // content at the given rate.
   for (Q = cpi->twopass.maxq_min_limit; Q < cpi->twopass.maxq_max_limit; Q++) {
@@ -967,23 +957,9 @@ static int estimate_max_q(VP9_COMP *cpi,
       sr_correction * speed_correction *
       cpi->twopass.est_max_qcorrection_factor;
 
-    if (err_correction_factor < 0.05)
-      err_correction_factor = 0.05;
-    else if (err_correction_factor > 5.0)
-      err_correction_factor = 5.0;
 
     bits_per_mb_at_this_q =
-      vp9_bits_per_mb(INTER_FRAME, Q) + (int)overhead_bits_per_mb;
-
-    bits_per_mb_at_this_q = (int)(.5 + err_correction_factor *
-                                  (double)bits_per_mb_at_this_q);
-
-    // Mode and motion overhead
-    // As Q rises in real encode loop rd code will force overhead down
-    // We make a crude adjustment for this here as *.98 per Q step.
-    // PGW TODO.. This code is broken for the extended Q range
-    //            for now overhead set to 0.
-    // overhead_bits_per_mb = (int)((double)overhead_bits_per_mb * 0.98);
+      vp9_bits_per_mb(INTER_FRAME, Q, err_correction_factor);
 
     if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
       break;
@@ -1001,7 +977,7 @@ static int estimate_max_q(VP9_COMP *cpi,
   // PGW TODO.. This code is broken for the extended Q range
   if ((cpi->ni_frames >
        ((int)cpi->twopass.total_stats->count >> 8)) &&
-      (cpi->ni_frames > 150)) {
+      (cpi->ni_frames > 25)) {
     adjust_maxq_qrange(cpi);
   }
 
@@ -1012,8 +988,7 @@ static int estimate_max_q(VP9_COMP *cpi,
 // complexity and data rate.
 static int estimate_cq(VP9_COMP *cpi,
                        FIRSTPASS_STATS *fpstats,
-                       int section_target_bandwitdh,
-                       int overhead_bits) {
+                       int section_target_bandwitdh) {
   int Q;
   int num_mbs = cpi->common.MBs;
   int target_norm_bits_per_mb;
@@ -1026,15 +1001,11 @@ static int estimate_cq(VP9_COMP *cpi,
   double speed_correction = 1.0;
   double clip_iiratio;
   double clip_iifactor;
-  double overhead_bits_per_mb;
-
 
   target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20))
                             ? (512 * section_target_bandwitdh) / num_mbs
                             : 512 * (section_target_bandwitdh / num_mbs);
 
-  // Estimate of overhead bits per mb
-  overhead_bits_per_mb = overhead_bits / num_mbs;
 
   // Corrections for higher compression speed settings
   // (reduced compression expected)
@@ -1073,23 +1044,8 @@ static int estimate_cq(VP9_COMP *cpi,
       calc_correction_factor(err_per_mb, 100.0, 0.4, 0.90, Q) *
       sr_correction * speed_correction * clip_iifactor;
 
-    if (err_correction_factor < 0.05)
-      err_correction_factor = 0.05;
-    else if (err_correction_factor > 5.0)
-      err_correction_factor = 5.0;
-
     bits_per_mb_at_this_q =
-      vp9_bits_per_mb(INTER_FRAME, Q) + (int)overhead_bits_per_mb;
-
-    bits_per_mb_at_this_q = (int)(.5 + err_correction_factor *
-                                  (double)bits_per_mb_at_this_q);
-
-    // Mode and motion overhead
-    // As Q rises in real encode loop rd code will force overhead down
-    // We make a crude adjustment for this here as *.98 per Q step.
-    // PGW TODO.. This code is broken for the extended Q range
-    //            for now overhead set to 0.
-    overhead_bits_per_mb = (int)((double)overhead_bits_per_mb * 0.98);
+      vp9_bits_per_mb(INTER_FRAME, Q, err_correction_factor);
 
     if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
       break;
@@ -1953,8 +1909,6 @@ void vp9_second_pass(VP9_COMP *cpi) {
   double this_frame_intra_error;
   double this_frame_coded_error;
 
-  int overhead_bits;
-
   if (!cpi->twopass.stats_in) {
     return;
   }
@@ -2018,11 +1972,6 @@ void vp9_second_pass(VP9_COMP *cpi) {
   if (cpi->target_bandwidth < 0)
     cpi->target_bandwidth = 0;
 
-
-  // Account for mv, mode and other overheads.
-  overhead_bits = (int)estimate_modemvcost(
-                        cpi, cpi->twopass.total_left_stats);
-
   // Special case code for first frame.
   if (cpi->common.current_video_frame == 0) {
     cpi->twopass.est_max_qcorrection_factor = 1.0;
@@ -2034,8 +1983,7 @@ void vp9_second_pass(VP9_COMP *cpi) {
       est_cq =
         estimate_cq(cpi,
                     cpi->twopass.total_left_stats,
-                    (int)(cpi->twopass.bits_left / frames_left),
-                    overhead_bits);
+                    (int)(cpi->twopass.bits_left / frames_left));
 
       cpi->cq_target_quality = cpi->oxcf.cq_level;
       if (est_cq > cpi->cq_target_quality)
@@ -2049,21 +1997,23 @@ void vp9_second_pass(VP9_COMP *cpi) {
     tmp_q = estimate_max_q(
               cpi,
               cpi->twopass.total_left_stats,
-              (int)(cpi->twopass.bits_left / frames_left),
-              overhead_bits);
+              (int)(cpi->twopass.bits_left / frames_left));
 
     cpi->active_worst_quality         = tmp_q;
     cpi->ni_av_qi                     = tmp_q;
     cpi->avg_q                        = vp9_convert_qindex_to_q(tmp_q);
 
+#ifndef ONE_SHOT_Q_ESTIMATE
     // Limit the maxq value returned subsequently.
     // This increases the risk of overspend or underspend if the initial
     // estimate for the clip is bad, but helps prevent excessive
     // variation in Q, especially near the end of a clip
     // where for example a small overspend may cause Q to crash
     adjust_maxq_qrange(cpi);
+#endif
   }
 
+#ifndef ONE_SHOT_Q_ESTIMATE
   // The last few frames of a clip almost always have to few or too many
   // bits and for the sake of over exact rate control we dont want to make
   // radical adjustments to the allowed quantizer range just to use up a
@@ -2078,13 +2028,13 @@ void vp9_second_pass(VP9_COMP *cpi) {
     tmp_q = estimate_max_q(
               cpi,
               cpi->twopass.total_left_stats,
-              (int)(cpi->twopass.bits_left / frames_left),
-              overhead_bits);
+              (int)(cpi->twopass.bits_left / frames_left));
 
     // Make a damped adjustment to active max Q
     cpi->active_worst_quality =
       adjust_active_maxq(cpi->active_worst_quality, tmp_q);
   }
+#endif
 
   cpi->twopass.frames_to_key--;
 
@@ -2092,7 +2042,6 @@ void vp9_second_pass(VP9_COMP *cpi) {
   subtract_stats(cpi->twopass.total_left_stats, &this_frame);
 }
 
-
 static int test_candidate_kf(VP9_COMP *cpi,
                              FIRSTPASS_STATS *last_frame,
                              FIRSTPASS_STATS *this_frame,
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index 19bc4d67d..2296a6669 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -11,12 +11,12 @@
 #ifndef VP9_ENCODER_VP9_FIRSTPASS_H_
 #define VP9_ENCODER_VP9_FIRSTPASS_H_
 
-extern void vp9_init_first_pass(VP9_COMP *cpi);
-extern void vp9_first_pass(VP9_COMP *cpi);
-extern void vp9_end_first_pass(VP9_COMP *cpi);
+void vp9_init_first_pass(VP9_COMP *cpi);
+void vp9_first_pass(VP9_COMP *cpi);
+void vp9_end_first_pass(VP9_COMP *cpi);
 
-extern void vp9_init_second_pass(VP9_COMP *cpi);
-extern void vp9_second_pass(VP9_COMP *cpi);
-extern void vp9_end_second_pass(VP9_COMP *cpi);
+void vp9_init_second_pass(VP9_COMP *cpi);
+void vp9_second_pass(VP9_COMP *cpi);
+void vp9_end_second_pass(VP9_COMP *cpi);
 
 #endif  // VP9_ENCODER_VP9_FIRSTPASS_H_
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index 0ff60c8b0..121de653f 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -20,14 +20,16 @@
 
 static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
                                               int_mv *ref_mv,
-                                              int_mv *dst_mv) {
+                                              int_mv *dst_mv,
+                                              int mb_row,
+                                              int mb_col) {
   MACROBLOCK   *const x  = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   BLOCK *b  = &x->block[0];
   BLOCKD *d = &xd->block[0];
   vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
   unsigned int best_err;
-  int step_param;
+
 
   int tmp_col_min = x->mv_col_min;
   int tmp_col_max = x->mv_col_max;
@@ -36,11 +38,8 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
   int_mv ref_full;
 
   // Further step/diamond searches as necessary
-  if (cpi->Speed < 8) {
-    step_param = cpi->sf.first_step + ((cpi->Speed > 5) ? 1 : 0);
-  } else {
-    step_param = cpi->sf.first_step + 2;
-  }
+  int step_param = cpi->sf.first_step +
+      (cpi->Speed < 8 ? (cpi->Speed > 5 ? 1 : 0) : 2);
 
   vp9_clamp_mv_min_max(x, ref_mv);
 
@@ -72,7 +71,7 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
   }
 
   vp9_set_mbmode_and_mvs(x, NEWMV, dst_mv);
-  vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);
+  vp9_build_inter16x16_predictors_mby(xd, xd->predictor, 16, mb_row, mb_col);
   best_err = vp9_sad16x16(xd->dst.y_buffer, xd->dst.y_stride,
                           xd->predictor, 16, INT_MAX);
 
@@ -93,8 +92,9 @@ static int do_16x16_motion_search
   YV12_BUFFER_CONFIG *buf,
   int buf_mb_y_offset,
   YV12_BUFFER_CONFIG *ref,
-  int mb_y_offset
-) {
+  int mb_y_offset,
+  int mb_row,
+  int mb_col) {
   MACROBLOCK   *const x  = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   unsigned int err, tmp_err;
@@ -124,7 +124,7 @@ static int do_16x16_motion_search
 
   // Test last reference frame using the previous best mv as the
   // starting point (best reference) for the search
-  tmp_err = do_16x16_motion_iteration(cpi, ref_mv, &tmp_mv);
+  tmp_err = do_16x16_motion_iteration(cpi, ref_mv, &tmp_mv, mb_row, mb_col);
   if (tmp_err < err) {
     err            = tmp_err;
     dst_mv->as_int = tmp_mv.as_int;
@@ -136,7 +136,8 @@ static int do_16x16_motion_search
     int_mv zero_ref_mv, tmp_mv;
 
     zero_ref_mv.as_int = 0;
-    tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv);
+    tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv,
+                                        mb_row, mb_col);
     if (tmp_err < err) {
       dst_mv->as_int = tmp_mv.as_int;
       err = tmp_err;
@@ -229,7 +230,9 @@ static void update_mbgraph_mb_stats
   int gld_y_offset,
   YV12_BUFFER_CONFIG *alt_ref,
   int_mv *prev_alt_ref_mv,
-  int arf_y_offset
+  int arf_y_offset,
+  int mb_row,
+  int mb_col
 ) {
   MACROBLOCK   *const x  = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -249,7 +252,8 @@ static void update_mbgraph_mb_stats
     int g_motion_error = do_16x16_motion_search(cpi, prev_golden_ref_mv,
                                                 &stats->ref[GOLDEN_FRAME].m.mv,
                                                 buf, mb_y_offset,
-                                                golden_ref, gld_y_offset);
+                                                golden_ref, gld_y_offset,
+                                                mb_row, mb_col);
     stats->ref[GOLDEN_FRAME].err = g_motion_error;
   } else {
     stats->ref[GOLDEN_FRAME].err = INT_MAX;
@@ -292,6 +296,9 @@ static void update_mbgraph_frame_stats
   int_mv arf_top_mv, gld_top_mv;
   MODE_INFO mi_local;
 
+  // Make sure the mi context starts in a consistent state.
+  memset(&mi_local, 0, sizeof(mi_local));
+
   // Set up limit values for motion vectors to prevent them extending outside the UMV borders
   arf_top_mv.as_int = 0;
   gld_top_mv.as_int = 0;
@@ -323,7 +330,8 @@ static void update_mbgraph_frame_stats
 
       update_mbgraph_mb_stats(cpi, mb_stats, buf, mb_y_in_offset,
                               golden_ref, &gld_left_mv, gld_y_in_offset,
-                              alt_ref,    &arf_left_mv, arf_y_in_offset);
+                              alt_ref,    &arf_left_mv, arf_y_in_offset,
+                              mb_row, mb_col);
       arf_left_mv.as_int = mb_stats->ref[ALTREF_FRAME].m.mv.as_int;
       gld_left_mv.as_int = mb_stats->ref[GOLDEN_FRAME].m.mv.as_int;
       if (mb_col == 0) {
@@ -427,13 +435,11 @@ static void separate_arf_mbs(VP9_COMP *cpi) {
   vpx_free(arf_not_zz);
 }
 
-void vp9_update_mbgraph_stats
-(
-  VP9_COMP *cpi
-) {
+void vp9_update_mbgraph_stats(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   int i, n_frames = vp9_lookahead_depth(cpi->lookahead);
-  YV12_BUFFER_CONFIG *golden_ref = &cm->yv12_fb[cm->gld_fb_idx];
+  YV12_BUFFER_CONFIG *golden_ref =
+      &cm->yv12_fb[cm->ref_frame_map[cpi->gld_fb_idx]];
 
   // we need to look ahead beyond where the ARF transitions into
   // being a GF - so exit if we don't look ahead beyond that
diff --git a/vp9/encoder/vp9_mbgraph.h b/vp9/encoder/vp9_mbgraph.h
index db23eca33..c5bca4d01 100644
--- a/vp9/encoder/vp9_mbgraph.h
+++ b/vp9/encoder/vp9_mbgraph.h
@@ -11,6 +11,6 @@
 #ifndef VP9_ENCODER_VP9_MBGRAPH_H_
 #define VP9_ENCODER_VP9_MBGRAPH_H_
 
-extern void vp9_update_mbgraph_stats(VP9_COMP *cpi);
+void vp9_update_mbgraph_stats(VP9_COMP *cpi);
 
 #endif  // VP9_ENCODER_VP9_MBGRAPH_H_
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 4694a92c6..300d9f85c 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -8,22 +8,17 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <stdio.h>
+#include <limits.h>
+#include <math.h>
 
 #include "vp9/encoder/vp9_onyx_int.h"
 #include "vp9/encoder/vp9_mcomp.h"
 #include "vpx_mem/vpx_mem.h"
 #include "./vpx_config.h"
-#include <stdio.h>
-#include <limits.h>
-#include <math.h>
 #include "vp9/common/vp9_findnearmv.h"
 #include "vp9/common/vp9_common.h"
 
-#ifdef ENTROPY_STATS
-static int mv_ref_ct [31] [4] [2];
-static int mv_mode_cts [4] [2];
-#endif
-
 void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv) {
   int col_min = (ref_mv->as_mv.col >> 3) - MAX_FULL_PEL_VAL +
       ((ref_mv->as_mv.col & 7) ? 1 : 0);
@@ -44,21 +39,20 @@ void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv) {
 }
 
 int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost, int *mvcost[2],
-                    int Weight, int ishp) {
+                    int weight, int ishp) {
   MV v;
-  v.row = (mv->as_mv.row - ref->as_mv.row);
-  v.col = (mv->as_mv.col - ref->as_mv.col);
+  v.row = mv->as_mv.row - ref->as_mv.row;
+  v.col = mv->as_mv.col - ref->as_mv.col;
   return ((mvjcost[vp9_get_mv_joint(v)] +
-           mvcost[0][v.row] + mvcost[1][v.col]) *
-          Weight) >> 7;
+           mvcost[0][v.row] + mvcost[1][v.col]) * weight) >> 7;
 }
 
 static int mv_err_cost(int_mv *mv, int_mv *ref, int *mvjcost, int *mvcost[2],
                        int error_per_bit, int ishp) {
   if (mvcost) {
     MV v;
-    v.row = (mv->as_mv.row - ref->as_mv.row);
-    v.col = (mv->as_mv.col - ref->as_mv.col);
+    v.row = mv->as_mv.row - ref->as_mv.row;
+    v.col = mv->as_mv.col - ref->as_mv.col;
     return ((mvjcost[vp9_get_mv_joint(v)] +
              mvcost[0][v.row] + mvcost[1][v.col]) *
             error_per_bit + 128) >> 8;
@@ -68,11 +62,10 @@ static int mv_err_cost(int_mv *mv, int_mv *ref, int *mvjcost, int *mvcost[2],
 
 static int mvsad_err_cost(int_mv *mv, int_mv *ref, int *mvjsadcost,
                           int *mvsadcost[2], int error_per_bit) {
-
   if (mvsadcost) {
     MV v;
-    v.row = (mv->as_mv.row - ref->as_mv.row);
-    v.col = (mv->as_mv.col - ref->as_mv.col);
+    v.row = mv->as_mv.row - ref->as_mv.row;
+    v.col = mv->as_mv.col - ref->as_mv.col;
     return ((mvjsadcost[vp9_get_mv_joint(v)] +
              mvsadcost[0][v.row] + mvsadcost[1][v.col]) *
             error_per_bit + 128) >> 8;
@@ -81,45 +74,39 @@ static int mvsad_err_cost(int_mv *mv, int_mv *ref, int *mvjsadcost,
 }
 
 void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride) {
-  int Len;
+  int len;
   int search_site_count = 0;
 
-
   // Generate offsets for 4 search sites per step.
-  Len = MAX_FIRST_STEP;
   x->ss[search_site_count].mv.col = 0;
   x->ss[search_site_count].mv.row = 0;
   x->ss[search_site_count].offset = 0;
   search_site_count++;
 
-  while (Len > 0) {
-
+  for (len = MAX_FIRST_STEP; len > 0; len /= 2) {
     // Compute offsets for search sites.
     x->ss[search_site_count].mv.col = 0;
-    x->ss[search_site_count].mv.row = -Len;
-    x->ss[search_site_count].offset = -Len * stride;
+    x->ss[search_site_count].mv.row = -len;
+    x->ss[search_site_count].offset = -len * stride;
     search_site_count++;
 
     // Compute offsets for search sites.
     x->ss[search_site_count].mv.col = 0;
-    x->ss[search_site_count].mv.row = Len;
-    x->ss[search_site_count].offset = Len * stride;
+    x->ss[search_site_count].mv.row = len;
+    x->ss[search_site_count].offset = len * stride;
     search_site_count++;
 
     // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = -Len;
+    x->ss[search_site_count].mv.col = -len;
     x->ss[search_site_count].mv.row = 0;
-    x->ss[search_site_count].offset = -Len;
+    x->ss[search_site_count].offset = -len;
     search_site_count++;
 
     // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = Len;
+    x->ss[search_site_count].mv.col = len;
     x->ss[search_site_count].mv.row = 0;
-    x->ss[search_site_count].offset = Len;
+    x->ss[search_site_count].offset = len;
     search_site_count++;
-
-    // Contract.
-    Len /= 2;
   }
 
   x->ss_count = search_site_count;
@@ -127,68 +114,63 @@ void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride) {
 }
 
 void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) {
-  int Len;
+  int len;
   int search_site_count = 0;
 
   // Generate offsets for 8 search sites per step.
-  Len = MAX_FIRST_STEP;
   x->ss[search_site_count].mv.col = 0;
   x->ss[search_site_count].mv.row = 0;
   x->ss[search_site_count].offset = 0;
   search_site_count++;
 
-  while (Len > 0) {
-
+  for (len = MAX_FIRST_STEP; len > 0; len /= 2) {
     // Compute offsets for search sites.
     x->ss[search_site_count].mv.col = 0;
-    x->ss[search_site_count].mv.row = -Len;
-    x->ss[search_site_count].offset = -Len * stride;
+    x->ss[search_site_count].mv.row = -len;
+    x->ss[search_site_count].offset = -len * stride;
     search_site_count++;
 
     // Compute offsets for search sites.
     x->ss[search_site_count].mv.col = 0;
-    x->ss[search_site_count].mv.row = Len;
-    x->ss[search_site_count].offset = Len * stride;
+    x->ss[search_site_count].mv.row = len;
+    x->ss[search_site_count].offset = len * stride;
     search_site_count++;
 
     // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = -Len;
+    x->ss[search_site_count].mv.col = -len;
     x->ss[search_site_count].mv.row = 0;
-    x->ss[search_site_count].offset = -Len;
+    x->ss[search_site_count].offset = -len;
     search_site_count++;
 
     // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = Len;
+    x->ss[search_site_count].mv.col = len;
     x->ss[search_site_count].mv.row = 0;
-    x->ss[search_site_count].offset = Len;
+    x->ss[search_site_count].offset = len;
     search_site_count++;
 
     // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = -Len;
-    x->ss[search_site_count].mv.row = -Len;
-    x->ss[search_site_count].offset = -Len * stride - Len;
+    x->ss[search_site_count].mv.col = -len;
+    x->ss[search_site_count].mv.row = -len;
+    x->ss[search_site_count].offset = -len * stride - len;
     search_site_count++;
 
     // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = Len;
-    x->ss[search_site_count].mv.row = -Len;
-    x->ss[search_site_count].offset = -Len * stride + Len;
+    x->ss[search_site_count].mv.col = len;
+    x->ss[search_site_count].mv.row = -len;
+    x->ss[search_site_count].offset = -len * stride + len;
     search_site_count++;
 
     // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = -Len;
-    x->ss[search_site_count].mv.row = Len;
-    x->ss[search_site_count].offset = Len * stride - Len;
+    x->ss[search_site_count].mv.col = -len;
+    x->ss[search_site_count].mv.row = len;
+    x->ss[search_site_count].offset = len * stride - len;
     search_site_count++;
 
     // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = Len;
-    x->ss[search_site_count].mv.row = Len;
-    x->ss[search_site_count].offset = Len * stride + Len;
+    x->ss[search_site_count].mv.col = len;
+    x->ss[search_site_count].mv.row = len;
+    x->ss[search_site_count].offset = len * stride + len;
     search_site_count++;
-
-    // Contract.
-    Len /= 2;
   }
 
   x->ss_count = search_site_count;
@@ -1546,7 +1528,7 @@ int vp9_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
   int in_what_stride = d->pre_stride;
   int mv_stride = d->pre_stride;
   uint8_t *bestaddress;
-  int_mv *best_mv = &d->bmi.as_mv.first;
+  int_mv *best_mv = &d->bmi.as_mv[0];
   int_mv this_mv;
   int bestsad = INT_MAX;
   int r, c;
@@ -1641,7 +1623,7 @@ int vp9_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
   int in_what_stride = d->pre_stride;
   int mv_stride = d->pre_stride;
   uint8_t *bestaddress;
-  int_mv *best_mv = &d->bmi.as_mv.first;
+  int_mv *best_mv = &d->bmi.as_mv[0];
   int_mv this_mv;
   unsigned int bestsad = INT_MAX;
   int r, c;
@@ -1770,7 +1752,7 @@ int vp9_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
   int in_what_stride = d->pre_stride;
   int mv_stride = d->pre_stride;
   uint8_t *bestaddress;
-  int_mv *best_mv = &d->bmi.as_mv.first;
+  int_mv *best_mv = &d->bmi.as_mv[0];
   int_mv this_mv;
   unsigned int bestsad = INT_MAX;
   int r, c;
@@ -1787,7 +1769,7 @@ int vp9_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
   int col_min = ref_col - distance;
   int col_max = ref_col + distance;
 
-  DECLARE_ALIGNED_ARRAY(16, uint16_t, sad_array8, 8);
+  DECLARE_ALIGNED_ARRAY(16, uint32_t, sad_array8, 8);
   unsigned int sad_array[3];
   int_mv fcenter_mv;
 
@@ -2023,12 +2005,10 @@ int vp9_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
 
   for (i = 0; i < search_range; i++) {
     int best_site = -1;
-    int all_in = 1;
-
-    all_in &= ((ref_mv->as_mv.row - 1) > x->mv_row_min);
-    all_in &= ((ref_mv->as_mv.row + 1) < x->mv_row_max);
-    all_in &= ((ref_mv->as_mv.col - 1) > x->mv_col_min);
-    all_in &= ((ref_mv->as_mv.col + 1) < x->mv_col_max);
+    int all_in = ((ref_mv->as_mv.row - 1) > x->mv_row_min) &
+                 ((ref_mv->as_mv.row + 1) < x->mv_row_max) &
+                 ((ref_mv->as_mv.col - 1) > x->mv_col_min) &
+                 ((ref_mv->as_mv.col + 1) < x->mv_col_max);
 
     if (all_in) {
       unsigned int sad_array[4];
@@ -2103,21 +2083,22 @@ int vp9_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
 
 
 #ifdef ENTROPY_STATS
-void print_mode_context(void) {
+void print_mode_context(VP9_COMMON *pc) {
   FILE *f = fopen("vp9_modecont.c", "a");
   int i, j;
 
   fprintf(f, "#include \"vp9_entropy.h\"\n");
-  fprintf(f, "const int vp9_mode_contexts[6][4] =");
+  fprintf(f, "const int vp9_mode_contexts[INTER_MODE_CONTEXTS][4] =");
   fprintf(f, "{\n");
-  for (j = 0; j < 6; j++) {
+  for (j = 0; j < INTER_MODE_CONTEXTS; j++) {
     fprintf(f, "  {/* %d */ ", j);
     fprintf(f, "    ");
     for (i = 0; i < 4; i++) {
       int this_prob;
 
       // context probs
-      this_prob = get_binary_prob(mv_ref_ct[j][i][0], mv_ref_ct[j][i][1]);
+      this_prob = get_binary_prob(pc->fc.mv_ref_ct[j][i][0],
+                                  pc->fc.mv_ref_ct[j][i][1]);
 
       fprintf(f, "%5d, ", this_prob);
     }
@@ -2128,44 +2109,4 @@ void print_mode_context(void) {
   fclose(f);
 }
 
-/* MV ref count ENTROPY_STATS stats code */
-void init_mv_ref_counts() {
-  vpx_memset(mv_ref_ct, 0, sizeof(mv_ref_ct));
-  vpx_memset(mv_mode_cts, 0, sizeof(mv_mode_cts));
-}
-
-void accum_mv_refs(MB_PREDICTION_MODE m, const int ct[4]) {
-  if (m == ZEROMV) {
-    ++mv_ref_ct [ct[0]] [0] [0];
-    ++mv_mode_cts[0][0];
-  } else {
-    ++mv_ref_ct [ct[0]] [0] [1];
-    ++mv_mode_cts[0][1];
-
-    if (m == NEARESTMV) {
-      ++mv_ref_ct [ct[1]] [1] [0];
-      ++mv_mode_cts[1][0];
-    } else {
-      ++mv_ref_ct [ct[1]] [1] [1];
-      ++mv_mode_cts[1][1];
-
-      if (m == NEARMV) {
-        ++mv_ref_ct [ct[2]] [2] [0];
-        ++mv_mode_cts[2][0];
-      } else {
-        ++mv_ref_ct [ct[2]] [2] [1];
-        ++mv_mode_cts[2][1];
-
-        if (m == NEWMV) {
-          ++mv_ref_ct [ct[3]] [3] [0];
-          ++mv_mode_cts[3][0];
-        } else {
-          ++mv_ref_ct [ct[3]] [3] [1];
-          ++mv_mode_cts[3][1];
-        }
-      }
-    }
-  }
-}
-
 #endif/* END MV ref count ENTROPY_STATS stats code */
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index 358d10bc6..2479d7235 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -16,9 +16,7 @@
 #include "vp9/encoder/vp9_variance.h"
 
 #ifdef ENTROPY_STATS
-extern void init_mv_ref_counts();
-extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]);
-void print_mode_context(void);
+void print_mode_context(VP9_COMMON *pc);
 #endif
 
 
@@ -26,11 +24,12 @@ void print_mode_context(void);
 #define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1)      // Max full pel mv specified in 1 pel units
 #define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1))            // Maximum size of the first step in full pel units
 
-extern void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv);
-extern int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost,
-                           int *mvcost[2], int Weight, int ishp);
-extern void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride);
-extern void vp9_init3smotion_compensation(MACROBLOCK *x,  int stride);
+void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv);
+int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost,
+                    int *mvcost[2], int weight, int ishp);
+void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride);
+void vp9_init3smotion_compensation(MACROBLOCK *x,  int stride);
+
 // Runs sequence of diamond searches in smaller steps for RD
 struct VP9_COMP;
 int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK *b,
@@ -39,20 +38,13 @@ int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK *b,
                            vp9_variance_fn_ptr_t *fn_ptr,
                            int_mv *ref_mv, int_mv *dst_mv);
 
-extern int vp9_hex_search
-(
-  MACROBLOCK *x,
-  BLOCK *b,
-  BLOCKD *d,
-  int_mv *ref_mv,
-  int_mv *best_mv,
-  int search_param,
-  int error_per_bit,
-  const vp9_variance_fn_ptr_t *vf,
-  int *mvjsadcost, int *mvsadcost[2],
-  int *mvjcost, int *mvcost[2],
-  int_mv *center_mv
-);
+int vp9_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+                   int_mv *ref_mv, int_mv *best_mv,
+                   int search_param, int error_per_bit,
+                   const vp9_variance_fn_ptr_t *vf,
+                   int *mvjsadcost, int *mvsadcost[2],
+                   int *mvjcost, int *mvcost[2],
+                   int_mv *center_mv);
 
 typedef int (fractional_mv_step_fp) (MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv
   *bestmv, int_mv *ref_mv, int error_per_bit, const vp9_variance_fn_ptr_t *vfp,
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 27e0e48a3..5278ac2a3 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -10,7 +10,9 @@
 
 
 #include "vpx_config.h"
+#include "vp9/common/vp9_filter.h"
 #include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_reconinter.h"
 #include "vp9/encoder/vp9_onyx_int.h"
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9/encoder/vp9_quantize.h"
@@ -22,6 +24,7 @@
 #include "vp9/common/vp9_extend.h"
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_tile_common.h"
 #include "vp9/encoder/vp9_segmentation.h"
 #include "./vp9_rtcd.h"
 #include "./vpx_scale_rtcd.h"
@@ -236,12 +239,12 @@ static void update_base_skip_probs(VP9_COMP *cpi) {
   if (cm->frame_type != KEY_FRAME) {
     vp9_update_skip_probs(cpi);
 
-    if (cm->refresh_alt_ref_frame) {
+    if (cpi->refresh_alt_ref_frame) {
       int k;
       for (k = 0; k < MBSKIP_CONTEXTS; ++k)
         cpi->last_skip_false_probs[2][k] = cm->mbskip_pred_probs[k];
       cpi->last_skip_probs_q[2] = cm->base_qindex;
-    } else if (cpi->common.refresh_golden_frame) {
+    } else if (cpi->refresh_golden_frame) {
       int k;
       for (k = 0; k < MBSKIP_CONTEXTS; ++k)
         cpi->last_skip_false_probs[1][k] = cm->mbskip_pred_probs[k];
@@ -388,7 +391,7 @@ static int compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget) {
   return target_index - start_index;
 }
 
-static void init_seg_features(VP9_COMP *cpi) {
+static void configure_static_seg_features(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &cpi->mb.e_mbd;
 
@@ -408,10 +411,8 @@ static void init_seg_features(VP9_COMP *cpi) {
 
     // Clear down the segment features.
     vp9_clearall_segfeatures(xd);
-  }
-
-  // If this is an alt ref frame
-  else if (cm->refresh_alt_ref_frame) {
+  } else if (cpi->refresh_alt_ref_frame) {
+    // If this is an alt ref frame
     // Clear down the global segmentation map
     vpx_memset(cpi->segmentation_map, 0, (cm->mb_rows * cm->mb_cols));
     xd->update_mb_segmentation_map = 0;
@@ -448,7 +449,7 @@ static void init_seg_features(VP9_COMP *cpi) {
   else if (xd->segmentation_enabled) {
     // First normal frame in a valid gf or alt ref group
     if (cpi->common.frames_since_golden == 0) {
-      // Set up segment features for normal frames in an af group
+      // Set up segment features for normal frames in an arf group
       if (cpi->source_alt_ref_active) {
         xd->update_mb_segmentation_map = 0;
         xd->update_mb_segmentation_data = 1;
@@ -465,16 +466,9 @@ static void init_seg_features(VP9_COMP *cpi) {
 
         // Segment coding disabled for compred testing
         if (high_q || (cpi->static_mb_pct == 100)) {
-          // set_segref(xd, 1, LAST_FRAME);
           vp9_set_segref(xd, 1, ALTREF_FRAME);
           vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME);
-
-          vp9_set_segdata(xd, 1, SEG_LVL_MODE, ZEROMV);
-          vp9_enable_segfeature(xd, 1, SEG_LVL_MODE);
-
-          // EOB segment coding not fixed for 8x8 yet
-          vp9_set_segdata(xd, 1, SEG_LVL_EOB, 0);
-          vp9_enable_segfeature(xd, 1, SEG_LVL_EOB);
+          vp9_enable_segfeature(xd, 1, SEG_LVL_SKIP);
         }
       }
       // Disable segmentation and clear down features if alt ref
@@ -493,29 +487,23 @@ static void init_seg_features(VP9_COMP *cpi) {
     }
 
     // Special case where we are coding over the top of a previous
-    // alt ref frame
+    // alt ref frame.
     // Segment coding disabled for compred testing
     else if (cpi->is_src_frame_alt_ref) {
-      // Enable mode and ref frame features for segment 0 as well
+      // Enable ref frame features for segment 0 as well
       vp9_enable_segfeature(xd, 0, SEG_LVL_REF_FRAME);
-      vp9_enable_segfeature(xd, 0, SEG_LVL_MODE);
       vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME);
-      vp9_enable_segfeature(xd, 1, SEG_LVL_MODE);
 
-      // All mbs should use ALTREF_FRAME, ZEROMV exclusively
+      // All mbs should use ALTREF_FRAME
       vp9_clear_segref(xd, 0);
       vp9_set_segref(xd, 0, ALTREF_FRAME);
       vp9_clear_segref(xd, 1);
       vp9_set_segref(xd, 1, ALTREF_FRAME);
-      vp9_set_segdata(xd, 0, SEG_LVL_MODE, ZEROMV);
-      vp9_set_segdata(xd, 1, SEG_LVL_MODE, ZEROMV);
 
-      // Skip all MBs if high Q
+      // Skip all MBs if high Q (0,0 mv and skip coeffs)
       if (high_q) {
-        vp9_enable_segfeature(xd, 0, SEG_LVL_EOB);
-        vp9_set_segdata(xd, 0, SEG_LVL_EOB, 0);
-        vp9_enable_segfeature(xd, 1, SEG_LVL_EOB);
-        vp9_set_segdata(xd, 1, SEG_LVL_EOB, 0);
+          vp9_enable_segfeature(xd, 0, SEG_LVL_SKIP);
+          vp9_enable_segfeature(xd, 1, SEG_LVL_SKIP);
       }
       // Enable data udpate
       xd->update_mb_segmentation_data = 1;
@@ -590,16 +578,165 @@ static void set_default_lf_deltas(VP9_COMP *cpi) {
   cpi->mb.e_mbd.mode_lf_deltas[3] = 4;               // Split mv
 }
 
+static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode, int speed) {
+  SPEED_FEATURES *sf = &cpi->sf;
+  int speed_multiplier = speed + 1;  // scales every '+=' increment below
+  int i;
+  // Fills sf->thresh_mult[]: baseline, per-mode bumps, then ref-flag disables.
+  // Baseline for every mode: -500 when mode == 0, otherwise 0.
+  for (i = 0; i < MAX_MODES; ++i) {
+    sf->thresh_mult[i] = (mode == 0) ? -500 : 0;
+  }
+
+  sf->thresh_mult[THR_ZEROMV   ] = 0;  // '=' overrides the baseline
+  sf->thresh_mult[THR_ZEROG    ] = 0;
+  sf->thresh_mult[THR_ZEROA    ] = 0;
+
+  sf->thresh_mult[THR_NEARESTMV] = 0;
+  sf->thresh_mult[THR_NEARESTG ] = 0;
+  sf->thresh_mult[THR_NEARESTA ] = 0;
+
+  sf->thresh_mult[THR_NEARMV   ] += speed_multiplier * 1000;
+  sf->thresh_mult[THR_NEARG    ] += speed_multiplier * 1000;
+  sf->thresh_mult[THR_NEARA    ] += speed_multiplier * 1000;
+
+  sf->thresh_mult[THR_DC       ] = 0;
+  sf->thresh_mult[THR_TM       ] += speed_multiplier * 1000;
+  sf->thresh_mult[THR_V_PRED   ] += speed_multiplier * 1000;
+  sf->thresh_mult[THR_H_PRED   ] += speed_multiplier * 1000;
+  sf->thresh_mult[THR_D45_PRED ] += speed_multiplier * 1500;
+  sf->thresh_mult[THR_D135_PRED] += speed_multiplier * 1500;
+  sf->thresh_mult[THR_D117_PRED] += speed_multiplier * 1500;
+  sf->thresh_mult[THR_D153_PRED] += speed_multiplier * 1500;
+  sf->thresh_mult[THR_D27_PRED ] += speed_multiplier * 1500;
+  sf->thresh_mult[THR_D63_PRED ] += speed_multiplier * 1500;
+
+  sf->thresh_mult[THR_B_PRED   ] += speed_multiplier * 2500;
+  sf->thresh_mult[THR_I8X8_PRED] += speed_multiplier * 2500;
+
+  sf->thresh_mult[THR_NEWMV    ] += speed_multiplier * 1000;
+  sf->thresh_mult[THR_NEWG     ] += speed_multiplier * 1000;
+  sf->thresh_mult[THR_NEWA     ] += speed_multiplier * 1000;
+
+  sf->thresh_mult[THR_SPLITMV  ] += speed_multiplier * 2500;
+  sf->thresh_mult[THR_SPLITG   ] += speed_multiplier * 2500;
+  sf->thresh_mult[THR_SPLITA   ] += speed_multiplier * 2500;
+
+  sf->thresh_mult[THR_COMP_ZEROLG   ] += speed_multiplier * 1500;
+  sf->thresh_mult[THR_COMP_ZEROLA   ] += speed_multiplier * 1500;
+  sf->thresh_mult[THR_COMP_ZEROGA   ] += speed_multiplier * 1500;
+
+  sf->thresh_mult[THR_COMP_NEARESTLG] += speed_multiplier * 1500;
+  sf->thresh_mult[THR_COMP_NEARESTLA] += speed_multiplier * 1500;
+  sf->thresh_mult[THR_COMP_NEARESTGA] += speed_multiplier * 1500;
+
+  sf->thresh_mult[THR_COMP_NEARLG   ] += speed_multiplier * 1500;
+  sf->thresh_mult[THR_COMP_NEARLA   ] += speed_multiplier * 1500;
+  sf->thresh_mult[THR_COMP_NEARGA   ] += speed_multiplier * 1500;
+
+  sf->thresh_mult[THR_COMP_NEWLG    ] += speed_multiplier * 2000;
+  sf->thresh_mult[THR_COMP_NEWLA    ] += speed_multiplier * 2000;
+  sf->thresh_mult[THR_COMP_NEWGA    ] += speed_multiplier * 2000;
+
+  sf->thresh_mult[THR_COMP_SPLITLA  ] += speed_multiplier * 4500;
+  sf->thresh_mult[THR_COMP_SPLITGA  ] += speed_multiplier * 4500;
+  sf->thresh_mult[THR_COMP_SPLITLG  ] += speed_multiplier * 4500;
+
+#if CONFIG_COMP_INTERINTRA_PRED
+  sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL   ] += speed_multiplier * 1500;
+  sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG   ] += speed_multiplier * 1500;
+  sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA   ] += speed_multiplier * 1500;
+
+  sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] += speed_multiplier * 1500;
+  sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] += speed_multiplier * 1500;
+  sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] += speed_multiplier * 1500;
+
+  sf->thresh_mult[THR_COMP_INTERINTRA_NEARL   ] += speed_multiplier * 1500;
+  sf->thresh_mult[THR_COMP_INTERINTRA_NEARG   ] += speed_multiplier * 1500;
+  sf->thresh_mult[THR_COMP_INTERINTRA_NEARA   ] += speed_multiplier * 1500;
+
+  sf->thresh_mult[THR_COMP_INTERINTRA_NEWL    ] += speed_multiplier * 2000;
+  sf->thresh_mult[THR_COMP_INTERINTRA_NEWG    ] += speed_multiplier * 2000;
+  sf->thresh_mult[THR_COMP_INTERINTRA_NEWA    ] += speed_multiplier * 2000;
+#endif
+
+  /* Force INT_MAX for modes whose reference frame flag is not set. */
+  if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) {
+    sf->thresh_mult[THR_NEWMV    ] = INT_MAX;
+    sf->thresh_mult[THR_NEARESTMV] = INT_MAX;
+    sf->thresh_mult[THR_ZEROMV   ] = INT_MAX;
+    sf->thresh_mult[THR_NEARMV   ] = INT_MAX;
+    sf->thresh_mult[THR_SPLITMV  ] = INT_MAX;
+#if CONFIG_COMP_INTERINTRA_PRED
+    sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL   ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] = INT_MAX;
+    sf->thresh_mult[THR_COMP_INTERINTRA_NEARL   ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_INTERINTRA_NEWL    ] = INT_MAX;
+#endif
+  }
+  if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) {
+    sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
+    sf->thresh_mult[THR_ZEROG    ] = INT_MAX;
+    sf->thresh_mult[THR_NEARG    ] = INT_MAX;
+    sf->thresh_mult[THR_NEWG     ] = INT_MAX;
+    sf->thresh_mult[THR_SPLITG   ] = INT_MAX;
+#if CONFIG_COMP_INTERINTRA_PRED
+    sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG   ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] = INT_MAX;
+    sf->thresh_mult[THR_COMP_INTERINTRA_NEARG   ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_INTERINTRA_NEWG    ] = INT_MAX;
+#endif
+  }
+  if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) {
+    sf->thresh_mult[THR_NEARESTA ] = INT_MAX;
+    sf->thresh_mult[THR_ZEROA    ] = INT_MAX;
+    sf->thresh_mult[THR_NEARA    ] = INT_MAX;
+    sf->thresh_mult[THR_NEWA     ] = INT_MAX;
+    sf->thresh_mult[THR_SPLITA   ] = INT_MAX;
+#if CONFIG_COMP_INTERINTRA_PRED
+    sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA   ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] = INT_MAX;
+    sf->thresh_mult[THR_COMP_INTERINTRA_NEARA   ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_INTERINTRA_NEWA    ] = INT_MAX;
+#endif
+  }
+
+  if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_GOLD_FLAG)) !=
+      (VP9_LAST_FLAG | VP9_GOLD_FLAG)) {  // THR_COMP_*LG need both flags
+    sf->thresh_mult[THR_COMP_ZEROLG   ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_NEARESTLG] = INT_MAX;
+    sf->thresh_mult[THR_COMP_NEARLG   ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_NEWLG    ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_SPLITLG  ] = INT_MAX;
+  }
+  if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) !=
+      (VP9_LAST_FLAG | VP9_ALT_FLAG)) {  // THR_COMP_*LA need both flags
+    sf->thresh_mult[THR_COMP_ZEROLA   ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_NEARESTLA] = INT_MAX;
+    sf->thresh_mult[THR_COMP_NEARLA   ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_NEWLA    ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_SPLITLA  ] = INT_MAX;
+  }
+  if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) !=
+      (VP9_GOLD_FLAG | VP9_ALT_FLAG)) {  // THR_COMP_*GA need both flags
+    sf->thresh_mult[THR_COMP_ZEROGA   ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_NEARESTGA] = INT_MAX;
+    sf->thresh_mult[THR_COMP_NEARGA   ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_NEWGA    ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_SPLITGA  ] = INT_MAX;
+  }
+}
+
 void vp9_set_speed_features(VP9_COMP *cpi) {
   SPEED_FEATURES *sf = &cpi->sf;
-  int Mode = cpi->compressor_speed;
-  int Speed = cpi->Speed;
+  int mode = cpi->compressor_speed;
+  int speed = cpi->Speed;
   int i;
   VP9_COMMON *cm = &cpi->common;
 
   // Only modes 0 and 1 supported for now in experimental code basae
-  if (Mode > 1)
-    Mode = 1;
+  if (mode > 1)
+    mode = 1;
 
   // Initialise default mode frequency sampling variables
   for (i = 0; i < MAX_MODES; i ++) {
@@ -617,167 +754,29 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   sf->quarter_pixel_search = 1;
   sf->half_pixel_search = 1;
   sf->iterative_sub_pixel = 1;
-#if CONFIG_LOSSLESS
-  sf->optimize_coefficients = 0;
-#else
-  sf->optimize_coefficients = 1;
-#endif
   sf->no_skip_block4x4_search = 1;
+  if (cpi->oxcf.lossless)
+    sf->optimize_coefficients = 0;
+  else
+    sf->optimize_coefficients = 1;
 
   sf->first_step = 0;
   sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
+  sf->static_segmentation = 1;
+  sf->splitmode_breakout = 0;
+  sf->mb16_breakout = 0;
 
-  // default thresholds to 0
-  for (i = 0; i < MAX_MODES; i++)
-    sf->thresh_mult[i] = 0;
-
-  switch (Mode) {
+  switch (mode) {
     case 0: // best quality mode
-      sf->thresh_mult[THR_ZEROMV   ] = 0;
-      sf->thresh_mult[THR_ZEROG    ] = 0;
-      sf->thresh_mult[THR_ZEROA    ] = 0;
-      sf->thresh_mult[THR_NEARESTMV] = 0;
-      sf->thresh_mult[THR_NEARESTG ] = 0;
-      sf->thresh_mult[THR_NEARESTA ] = 0;
-      sf->thresh_mult[THR_NEARMV   ] = 0;
-      sf->thresh_mult[THR_NEARG    ] = 0;
-      sf->thresh_mult[THR_NEARA    ] = 0;
-
-      sf->thresh_mult[THR_DC       ] = 0;
-
-      sf->thresh_mult[THR_V_PRED   ] = 1000;
-      sf->thresh_mult[THR_H_PRED   ] = 1000;
-      sf->thresh_mult[THR_D45_PRED ] = 1000;
-      sf->thresh_mult[THR_D135_PRED] = 1000;
-      sf->thresh_mult[THR_D117_PRED] = 1000;
-      sf->thresh_mult[THR_D153_PRED] = 1000;
-      sf->thresh_mult[THR_D27_PRED ] = 1000;
-      sf->thresh_mult[THR_D63_PRED ] = 1000;
-      sf->thresh_mult[THR_B_PRED   ] = 2000;
-      sf->thresh_mult[THR_I8X8_PRED] = 2000;
-      sf->thresh_mult[THR_TM       ] = 1000;
-
-      sf->thresh_mult[THR_NEWMV    ] = 1000;
-      sf->thresh_mult[THR_NEWG     ] = 1000;
-      sf->thresh_mult[THR_NEWA     ] = 1000;
-
-      sf->thresh_mult[THR_SPLITMV  ] = 2500;
-      sf->thresh_mult[THR_SPLITG   ] = 5000;
-      sf->thresh_mult[THR_SPLITA   ] = 5000;
-
-      sf->thresh_mult[THR_COMP_ZEROLG   ] = 0;
-      sf->thresh_mult[THR_COMP_NEARESTLG] = 0;
-      sf->thresh_mult[THR_COMP_NEARLG   ] = 0;
-      sf->thresh_mult[THR_COMP_ZEROLA   ] = 0;
-      sf->thresh_mult[THR_COMP_NEARESTLA] = 0;
-      sf->thresh_mult[THR_COMP_NEARLA   ] = 0;
-      sf->thresh_mult[THR_COMP_ZEROGA   ] = 0;
-      sf->thresh_mult[THR_COMP_NEARESTGA] = 0;
-      sf->thresh_mult[THR_COMP_NEARGA   ] = 0;
-
-      sf->thresh_mult[THR_COMP_NEWLG    ] = 1000;
-      sf->thresh_mult[THR_COMP_NEWLA    ] = 1000;
-      sf->thresh_mult[THR_COMP_NEWGA    ] = 1000;
-
-      sf->thresh_mult[THR_COMP_SPLITLA  ] = 2500;
-      sf->thresh_mult[THR_COMP_SPLITGA  ] = 5000;
-      sf->thresh_mult[THR_COMP_SPLITLG  ] = 5000;
-
-#if CONFIG_COMP_INTERINTRA_PRED
-      sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL   ] = 0;
-      sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] = 0;
-      sf->thresh_mult[THR_COMP_INTERINTRA_NEARL   ] = 0;
-      sf->thresh_mult[THR_COMP_INTERINTRA_NEWL    ] = 0;
-      sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG   ] = 0;
-      sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] = 0;
-      sf->thresh_mult[THR_COMP_INTERINTRA_NEARG   ] = 0;
-      sf->thresh_mult[THR_COMP_INTERINTRA_NEWG    ] = 0;
-      sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA   ] = 0;
-      sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] = 0;
-      sf->thresh_mult[THR_COMP_INTERINTRA_NEARA   ] = 0;
-      sf->thresh_mult[THR_COMP_INTERINTRA_NEWA    ] = 0;
-#endif
-
-      sf->first_step = 0;
-      sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
       sf->search_best_filter = SEARCH_BEST_FILTER;
       break;
+
     case 1:
-      sf->thresh_mult[THR_NEARESTMV] = 0;
-      sf->thresh_mult[THR_ZEROMV   ] = 0;
-      sf->thresh_mult[THR_DC       ] = 0;
-      sf->thresh_mult[THR_NEARMV   ] = 0;
-      sf->thresh_mult[THR_V_PRED   ] = 1000;
-      sf->thresh_mult[THR_H_PRED   ] = 1000;
-      sf->thresh_mult[THR_D45_PRED ] = 1000;
-      sf->thresh_mult[THR_D135_PRED] = 1000;
-      sf->thresh_mult[THR_D117_PRED] = 1000;
-      sf->thresh_mult[THR_D153_PRED] = 1000;
-      sf->thresh_mult[THR_D27_PRED ] = 1000;
-      sf->thresh_mult[THR_D63_PRED ] = 1000;
-      sf->thresh_mult[THR_B_PRED   ] = 2500;
-      sf->thresh_mult[THR_I8X8_PRED] = 2500;
-      sf->thresh_mult[THR_TM       ] = 1000;
-
-      sf->thresh_mult[THR_NEARESTG ] = 1000;
-      sf->thresh_mult[THR_NEARESTA ] = 1000;
-
-      sf->thresh_mult[THR_ZEROG    ] = 1000;
-      sf->thresh_mult[THR_ZEROA    ] = 1000;
-      sf->thresh_mult[THR_NEARG    ] = 1000;
-      sf->thresh_mult[THR_NEARA    ] = 1000;
-
-      sf->thresh_mult[THR_ZEROMV   ] = 0;
-      sf->thresh_mult[THR_ZEROG    ] = 0;
-      sf->thresh_mult[THR_ZEROA    ] = 0;
-      sf->thresh_mult[THR_NEARESTMV] = 0;
-      sf->thresh_mult[THR_NEARESTG ] = 0;
-      sf->thresh_mult[THR_NEARESTA ] = 0;
-      sf->thresh_mult[THR_NEARMV   ] = 0;
-      sf->thresh_mult[THR_NEARG    ] = 0;
-      sf->thresh_mult[THR_NEARA    ] = 0;
-
-      sf->thresh_mult[THR_NEWMV    ] = 1000;
-      sf->thresh_mult[THR_NEWG     ] = 1000;
-      sf->thresh_mult[THR_NEWA     ] = 1000;
-
-      sf->thresh_mult[THR_SPLITMV  ] = 1700;
-      sf->thresh_mult[THR_SPLITG   ] = 4500;
-      sf->thresh_mult[THR_SPLITA   ] = 4500;
-
-      sf->thresh_mult[THR_COMP_ZEROLG   ] = 0;
-      sf->thresh_mult[THR_COMP_NEARESTLG] = 0;
-      sf->thresh_mult[THR_COMP_NEARLG   ] = 0;
-      sf->thresh_mult[THR_COMP_ZEROLA   ] = 0;
-      sf->thresh_mult[THR_COMP_NEARESTLA] = 0;
-      sf->thresh_mult[THR_COMP_NEARLA   ] = 0;
-      sf->thresh_mult[THR_COMP_ZEROGA   ] = 0;
-      sf->thresh_mult[THR_COMP_NEARESTGA] = 0;
-      sf->thresh_mult[THR_COMP_NEARGA   ] = 0;
-
-      sf->thresh_mult[THR_COMP_NEWLG    ] = 1000;
-      sf->thresh_mult[THR_COMP_NEWLA    ] = 1000;
-      sf->thresh_mult[THR_COMP_NEWGA    ] = 1000;
-
-      sf->thresh_mult[THR_COMP_SPLITLA  ] = 1700;
-      sf->thresh_mult[THR_COMP_SPLITGA  ] = 4500;
-      sf->thresh_mult[THR_COMP_SPLITLG  ] = 4500;
-#if CONFIG_COMP_INTERINTRA_PRED
-      sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL   ] = 0;
-      sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] = 0;
-      sf->thresh_mult[THR_COMP_INTERINTRA_NEARL   ] = 0;
-      sf->thresh_mult[THR_COMP_INTERINTRA_NEWL    ] = 0;
-      sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG   ] = 0;
-      sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] = 0;
-      sf->thresh_mult[THR_COMP_INTERINTRA_NEARG   ] = 0;
-      sf->thresh_mult[THR_COMP_INTERINTRA_NEWG    ] = 0;
-      sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA   ] = 0;
-      sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] = 0;
-      sf->thresh_mult[THR_COMP_INTERINTRA_NEARA   ] = 0;
-      sf->thresh_mult[THR_COMP_INTERINTRA_NEWA    ] = 0;
-#endif
+      sf->static_segmentation = 1;
+      sf->splitmode_breakout = 1;
+      sf->mb16_breakout = 0;
 
-      if (Speed > 0) {
+      if (speed > 0) {
         /* Disable coefficient optimization above speed 0 */
         sf->optimize_coefficients = 0;
         sf->no_skip_block4x4_search = 0;
@@ -793,7 +792,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
         cpi->mode_check_freq[THR_COMP_SPLITLA] = 0;
       }
 
-      if (Speed > 1) {
+      if (speed > 1) {
         cpi->mode_check_freq[THR_SPLITG] = 4;
         cpi->mode_check_freq[THR_SPLITA] = 4;
         cpi->mode_check_freq[THR_SPLITMV] = 2;
@@ -801,73 +800,9 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
         cpi->mode_check_freq[THR_COMP_SPLITGA] = 4;
         cpi->mode_check_freq[THR_COMP_SPLITLG] = 4;
         cpi->mode_check_freq[THR_COMP_SPLITLA] = 2;
-
-        sf->thresh_mult[THR_TM       ] = 1500;
-        sf->thresh_mult[THR_V_PRED   ] = 1500;
-        sf->thresh_mult[THR_H_PRED   ] = 1500;
-        sf->thresh_mult[THR_D45_PRED ] = 1500;
-        sf->thresh_mult[THR_D135_PRED] = 1500;
-        sf->thresh_mult[THR_D117_PRED] = 1500;
-        sf->thresh_mult[THR_D153_PRED] = 1500;
-        sf->thresh_mult[THR_D27_PRED ] = 1500;
-        sf->thresh_mult[THR_D63_PRED ] = 1500;
-        sf->thresh_mult[THR_B_PRED   ] = 5000;
-        sf->thresh_mult[THR_I8X8_PRED] = 5000;
-
-        if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
-          sf->thresh_mult[THR_NEWMV    ] = 2000;
-          sf->thresh_mult[THR_SPLITMV  ] = 10000;
-          sf->thresh_mult[THR_COMP_SPLITLG  ] = 20000;
-        }
-
-        if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
-          sf->thresh_mult[THR_NEARESTG ] = 1500;
-          sf->thresh_mult[THR_ZEROG    ] = 1500;
-          sf->thresh_mult[THR_NEARG    ] = 1500;
-          sf->thresh_mult[THR_NEWG     ] = 2000;
-          sf->thresh_mult[THR_SPLITG   ] = 20000;
-          sf->thresh_mult[THR_COMP_SPLITGA  ] = 20000;
-        }
-
-        if (cpi->ref_frame_flags & VP9_ALT_FLAG) {
-          sf->thresh_mult[THR_NEARESTA ] = 1500;
-          sf->thresh_mult[THR_ZEROA    ] = 1500;
-          sf->thresh_mult[THR_NEARA    ] = 1500;
-          sf->thresh_mult[THR_NEWA     ] = 2000;
-          sf->thresh_mult[THR_SPLITA   ] = 20000;
-          sf->thresh_mult[THR_COMP_SPLITLA  ] = 10000;
-        }
-
-        sf->thresh_mult[THR_COMP_ZEROLG   ] = 1500;
-        sf->thresh_mult[THR_COMP_NEARESTLG] = 1500;
-        sf->thresh_mult[THR_COMP_NEARLG   ] = 1500;
-        sf->thresh_mult[THR_COMP_ZEROLA   ] = 1500;
-        sf->thresh_mult[THR_COMP_NEARESTLA] = 1500;
-        sf->thresh_mult[THR_COMP_NEARLA   ] = 1500;
-        sf->thresh_mult[THR_COMP_ZEROGA   ] = 1500;
-        sf->thresh_mult[THR_COMP_NEARESTGA] = 1500;
-        sf->thresh_mult[THR_COMP_NEARGA   ] = 1500;
-
-        sf->thresh_mult[THR_COMP_NEWLG    ] = 2000;
-        sf->thresh_mult[THR_COMP_NEWLA    ] = 2000;
-        sf->thresh_mult[THR_COMP_NEWGA    ] = 2000;
-#if CONFIG_COMP_INTERINTRA_PRED
-        sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL   ] = 0;
-        sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] = 0;
-        sf->thresh_mult[THR_COMP_INTERINTRA_NEARL   ] = 0;
-        sf->thresh_mult[THR_COMP_INTERINTRA_NEWL    ] = 0;
-        sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG   ] = 0;
-        sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] = 0;
-        sf->thresh_mult[THR_COMP_INTERINTRA_NEARG   ] = 0;
-        sf->thresh_mult[THR_COMP_INTERINTRA_NEWG    ] = 0;
-        sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA   ] = 0;
-        sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] = 0;
-        sf->thresh_mult[THR_COMP_INTERINTRA_NEARA   ] = 0;
-        sf->thresh_mult[THR_COMP_INTERINTRA_NEWA    ] = 0;
-#endif
       }
 
-      if (Speed > 2) {
+      if (speed > 2) {
         cpi->mode_check_freq[THR_SPLITG] = 15;
         cpi->mode_check_freq[THR_SPLITA] = 15;
         cpi->mode_check_freq[THR_SPLITMV] = 7;
@@ -876,150 +811,19 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
         cpi->mode_check_freq[THR_COMP_SPLITLG] = 15;
         cpi->mode_check_freq[THR_COMP_SPLITLA] = 7;
 
-        sf->thresh_mult[THR_TM       ] = 2000;
-        sf->thresh_mult[THR_V_PRED   ] = 2000;
-        sf->thresh_mult[THR_H_PRED   ] = 2000;
-        sf->thresh_mult[THR_D45_PRED ] = 2000;
-        sf->thresh_mult[THR_D135_PRED] = 2000;
-        sf->thresh_mult[THR_D117_PRED] = 2000;
-        sf->thresh_mult[THR_D153_PRED] = 2000;
-        sf->thresh_mult[THR_D27_PRED ] = 2000;
-        sf->thresh_mult[THR_D63_PRED ] = 2000;
-        sf->thresh_mult[THR_B_PRED   ] = 7500;
-        sf->thresh_mult[THR_I8X8_PRED] = 7500;
-
-        if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
-          sf->thresh_mult[THR_NEWMV    ] = 2000;
-          sf->thresh_mult[THR_SPLITMV  ] = 25000;
-          sf->thresh_mult[THR_COMP_SPLITLG  ] = 50000;
-        }
-
-        if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
-          sf->thresh_mult[THR_NEARESTG ] = 2000;
-          sf->thresh_mult[THR_ZEROG    ] = 2000;
-          sf->thresh_mult[THR_NEARG    ] = 2000;
-          sf->thresh_mult[THR_NEWG     ] = 2500;
-          sf->thresh_mult[THR_SPLITG   ] = 50000;
-          sf->thresh_mult[THR_COMP_SPLITGA  ] = 50000;
-        }
-
-        if (cpi->ref_frame_flags & VP9_ALT_FLAG) {
-          sf->thresh_mult[THR_NEARESTA ] = 2000;
-          sf->thresh_mult[THR_ZEROA    ] = 2000;
-          sf->thresh_mult[THR_NEARA    ] = 2000;
-          sf->thresh_mult[THR_NEWA     ] = 2500;
-          sf->thresh_mult[THR_SPLITA   ] = 50000;
-          sf->thresh_mult[THR_COMP_SPLITLA  ] = 25000;
-        }
-
-        sf->thresh_mult[THR_COMP_ZEROLG   ] = 2000;
-        sf->thresh_mult[THR_COMP_NEARESTLG] = 2000;
-        sf->thresh_mult[THR_COMP_NEARLG   ] = 2000;
-        sf->thresh_mult[THR_COMP_ZEROLA   ] = 2000;
-        sf->thresh_mult[THR_COMP_NEARESTLA] = 2000;
-        sf->thresh_mult[THR_COMP_NEARLA   ] = 2000;
-        sf->thresh_mult[THR_COMP_ZEROGA   ] = 2000;
-        sf->thresh_mult[THR_COMP_NEARESTGA] = 2000;
-        sf->thresh_mult[THR_COMP_NEARGA   ] = 2000;
-
-        sf->thresh_mult[THR_COMP_NEWLG    ] = 2500;
-        sf->thresh_mult[THR_COMP_NEWLA    ] = 2500;
-        sf->thresh_mult[THR_COMP_NEWGA    ] = 2500;
-#if CONFIG_COMP_INTERINTRA_PRED
-        sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL   ] = 0;
-        sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] = 0;
-        sf->thresh_mult[THR_COMP_INTERINTRA_NEARL   ] = 0;
-        sf->thresh_mult[THR_COMP_INTERINTRA_NEWL    ] = 0;
-        sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG   ] = 0;
-        sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] = 0;
-        sf->thresh_mult[THR_COMP_INTERINTRA_NEARG   ] = 0;
-        sf->thresh_mult[THR_COMP_INTERINTRA_NEWG    ] = 0;
-        sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA   ] = 0;
-        sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] = 0;
-        sf->thresh_mult[THR_COMP_INTERINTRA_NEARA   ] = 0;
-        sf->thresh_mult[THR_COMP_INTERINTRA_NEWA    ] = 0;
-#endif
-
         sf->improved_dct = 0;
 
         // Only do recode loop on key frames, golden frames and
         // alt ref frames
         sf->recode_loop = 2;
-
       }
 
       break;
 
   }; /* switch */
 
-  /* disable frame modes if flags not set */
-  if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) {
-    sf->thresh_mult[THR_NEWMV    ] = INT_MAX;
-    sf->thresh_mult[THR_NEARESTMV] = INT_MAX;
-    sf->thresh_mult[THR_ZEROMV   ] = INT_MAX;
-    sf->thresh_mult[THR_NEARMV   ] = INT_MAX;
-    sf->thresh_mult[THR_SPLITMV  ] = INT_MAX;
-  }
-
-  if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) {
-    sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
-    sf->thresh_mult[THR_ZEROG    ] = INT_MAX;
-    sf->thresh_mult[THR_NEARG    ] = INT_MAX;
-    sf->thresh_mult[THR_NEWG     ] = INT_MAX;
-#if CONFIG_COMP_INTERINTRA_PRED
-    sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEARG   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEWG    ] = INT_MAX;
-#endif
-    sf->thresh_mult[THR_SPLITG   ] = INT_MAX;
-  }
-
-  if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) {
-    sf->thresh_mult[THR_NEARESTA ] = INT_MAX;
-    sf->thresh_mult[THR_ZEROA    ] = INT_MAX;
-    sf->thresh_mult[THR_NEARA    ] = INT_MAX;
-    sf->thresh_mult[THR_NEWA     ] = INT_MAX;
-#if CONFIG_COMP_INTERINTRA_PRED
-    sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEARA   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEWA    ] = INT_MAX;
-#endif
-    sf->thresh_mult[THR_SPLITA   ] = INT_MAX;
-  }
-
-  if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_GOLD_FLAG)) != (VP9_LAST_FLAG | VP9_GOLD_FLAG)) {
-    sf->thresh_mult[THR_COMP_ZEROLG   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEARESTLG] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEARLG   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEWLG    ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_SPLITLG  ] = INT_MAX;
-  }
-
-  if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) != (VP9_LAST_FLAG | VP9_ALT_FLAG)) {
-    sf->thresh_mult[THR_COMP_ZEROLA   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEARESTLA] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEARLA   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEWLA    ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_SPLITLA  ] = INT_MAX;
-  }
-
-  if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) != (VP9_GOLD_FLAG | VP9_ALT_FLAG)) {
-    sf->thresh_mult[THR_COMP_ZEROGA   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEARESTGA] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEARGA   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEWGA    ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_SPLITGA  ] = INT_MAX;
-  }
-#if CONFIG_COMP_INTERINTRA_PRED
-  if ((cpi->ref_frame_flags & VP9_LAST_FLAG) != VP9_LAST_FLAG) {
-    sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEARL   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEWL    ] = INT_MAX;
-  }
-#endif
+  // Set rd thresholds based on mode and speed setting
+  set_rd_speed_thresholds(cpi, mode, speed);
 
   // Slow quant, dct and trellis not worthwhile for first pass
   // so make sure they are always turned off.
@@ -1028,36 +832,29 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
     sf->improved_dct = 0;
   }
 
-  if (cpi->sf.search_method == NSTEP) {
-    vp9_init3smotion_compensation(&cpi->mb,
-                                  cm->yv12_fb[cm->lst_fb_idx].y_stride);
-  } else if (cpi->sf.search_method == DIAMOND) {
-    vp9_init_dsmotion_compensation(&cpi->mb,
-                                   cm->yv12_fb[cm->lst_fb_idx].y_stride);
-  }
+  {
+    int y_stride = cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]].y_stride;
 
-  cpi->mb.vp9_short_fdct16x16 = vp9_short_fdct16x16;
-  cpi->mb.vp9_short_fdct8x8 = vp9_short_fdct8x8;
-  cpi->mb.vp9_short_fdct8x4 = vp9_short_fdct8x4;
-  cpi->mb.vp9_short_fdct4x4 = vp9_short_fdct4x4;
-  cpi->mb.short_walsh4x4 = vp9_short_walsh4x4;
-  cpi->mb.short_fhaar2x2 = vp9_short_fhaar2x2;
+    if (cpi->sf.search_method == NSTEP) {
+      vp9_init3smotion_compensation(&cpi->mb, y_stride);
+    } else if (cpi->sf.search_method == DIAMOND) {
+      vp9_init_dsmotion_compensation(&cpi->mb, y_stride);
+    }
+  }
 
-#if CONFIG_LOSSLESS
-  if (cpi->oxcf.lossless) {
-    cpi->mb.vp9_short_fdct8x4 = vp9_short_walsh8x4_x8;
-    cpi->mb.vp9_short_fdct4x4 = vp9_short_walsh4x4_x8;
-    cpi->mb.short_walsh4x4 = vp9_short_walsh4x4;
-    cpi->mb.short_fhaar2x2 = vp9_short_fhaar2x2;
-    cpi->mb.short_walsh4x4 = vp9_short_walsh4x4_lossless;
+  cpi->mb.fwd_txm16x16  = vp9_short_fdct16x16;
+  cpi->mb.fwd_txm8x8    = vp9_short_fdct8x8;
+  cpi->mb.fwd_txm8x4    = vp9_short_fdct8x4;
+  cpi->mb.fwd_txm4x4    = vp9_short_fdct4x4;
+  if (cpi->oxcf.lossless || cpi->mb.e_mbd.lossless) {
+    cpi->mb.fwd_txm8x4    = vp9_short_walsh8x4_x8;
+    cpi->mb.fwd_txm4x4    = vp9_short_walsh4x4_x8;
   }
-#endif
 
   cpi->mb.quantize_b_4x4      = vp9_regular_quantize_b_4x4;
   cpi->mb.quantize_b_4x4_pair = vp9_regular_quantize_b_4x4_pair;
   cpi->mb.quantize_b_8x8      = vp9_regular_quantize_b_8x8;
   cpi->mb.quantize_b_16x16    = vp9_regular_quantize_b_16x16;
-  cpi->mb.quantize_b_2x2      = vp9_regular_quantize_b_2x2;
 
   vp9_init_quantizer(cpi);
 
@@ -1078,6 +875,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   frames_at_speed[cpi->Speed]++;
 #endif
 }
+
 static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
   int width = (cpi->oxcf.Width + 15) & ~15;
   int height = (cpi->oxcf.Height + 15) & ~15;
@@ -1144,7 +942,6 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) {
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate scaled source buffer");
 
-
   vpx_free(cpi->tok);
 
   {
@@ -1199,6 +996,38 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) {
 }
 
 
+/* Re-derive MB geometry from cm->Width/Height; resize frame-local buffers. */
+static void update_frame_size(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+
+  /* our internal buffers are always multiples of 16 */
+  int width = (cm->Width + 15) & ~15;
+  int height = (cm->Height + 15) & ~15;
+
+  cm->mb_rows = height >> 4;
+  cm->mb_cols = width >> 4;
+  cm->MBs = cm->mb_rows * cm->mb_cols;
+  cm->mode_info_stride = cm->mb_cols + 1;  // one extra entry per row
+  vpx_memset(cm->mip, 0,
+             cm->mode_info_stride * (cm->mb_rows + 1) * sizeof(MODE_INFO));
+  vp9_update_mode_info_border(cm, cm->mip);
+
+  cm->mi = cm->mip + cm->mode_info_stride + 1;  // skip one row + one entry
+  cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;
+  vp9_update_mode_info_in_image(cm, cm->mi);
+
+  /* Update size of buffers local to this frame; failures are reported */
+  /* through vpx_internal_error() as VPX_CODEC_MEM_ERROR. */
+  if (vp8_yv12_realloc_frame_buffer(&cpi->last_frame_uf,
+                                    width, height, VP9BORDERINPIXELS))
+    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                       "Failed to reallocate last frame buffer");
+
+  if (vp8_yv12_realloc_frame_buffer(&cpi->scaled_source,
+                                    width, height, VP9BORDERINPIXELS))
+    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                       "Failed to reallocate scaled source buffer");
+}
+
 // TODO perhaps change number of steps expose to outside world when setting
 // max and min limits. Also this will likely want refining for the extended Q
 // range.
@@ -1239,10 +1068,7 @@ void vp9_new_frame_rate(VP9_COMP *cpi, double framerate) {
     cpi->min_frame_bandwidth = FRAME_OVERHEAD_BITS;
 
   // Set Maximum gf/arf interval
-  cpi->max_gf_interval = ((int)(cpi->output_frame_rate / 2.0) + 2);
-
-  if (cpi->max_gf_interval < 12)
-    cpi->max_gf_interval = 12;
+  cpi->max_gf_interval = 15;
 
   // Extended interval for genuinely static scenes
   cpi->twopass.static_scene_max_gf_interval = cpi->key_frame_frequency >> 1;
@@ -1270,10 +1096,26 @@ rescale(int val, int num, int denom) {
   return (int)(llval * llnum / llden);
 }
 
+/* Copy the tile settings from the encoder config (log2 counts) and clamp
+ * the column setting to the range derived from vp9_get_tile_n_bits(). */
+static void set_tile_limits(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  int lo_log2, hi_log2;
+
+  cm->log2_tile_rows = cpi->oxcf.tile_rows;
+  cm->log2_tile_columns = cpi->oxcf.tile_columns;
+  vp9_get_tile_n_bits(cm, &lo_log2, &hi_log2);
+  hi_log2 += lo_log2;  /* upper bound = first output + second output */
+  cm->log2_tile_columns =
+      cm->log2_tile_columns < lo_log2 ? lo_log2 :
+      cm->log2_tile_columns > hi_log2 ? hi_log2 : cm->log2_tile_columns;
+  cm->tile_rows = 1 << cm->log2_tile_rows;
+  cm->tile_columns = 1 << cm->log2_tile_columns;
+}
 
 static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
   VP9_COMP *cpi = (VP9_COMP *)(ptr);
-  VP9_COMMON *cm = &cpi->common;
+  VP9_COMMON *const cm = &cpi->common;
 
   cpi->oxcf = *oxcf;
 
@@ -1304,6 +1146,12 @@ static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
 
   cpi->static_mb_pct = 0;
 
+  cpi->lst_fb_idx = 0;
+  cpi->gld_fb_idx = 1;
+  cpi->alt_fb_idx = 2;
+
+  set_tile_limits(cpi);
+
 #if VP9_TEMPORAL_ALT_REF
   {
     int i;
@@ -1319,7 +1167,7 @@ static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
 
 void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
   VP9_COMP *cpi = (VP9_COMP *)(ptr);
-  VP9_COMMON *cm = &cpi->common;
+  VP9_COMMON *const cm = &cpi->common;
 
   if (!cpi)
     return;
@@ -1351,7 +1199,6 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
 
       if (cpi->oxcf.cpu_used > 5)
         cpi->oxcf.cpu_used = 5;
-
       break;
 
     case MODE_SECONDPASS_BEST:
@@ -1364,20 +1211,14 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
   cpi->oxcf.best_allowed_q = q_trans[oxcf->best_allowed_q];
   cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level];
 
-  cpi->mb.e_mbd.inv_xform4x4_1_x8     = vp9_short_idct4x4llm_1;
-  cpi->mb.e_mbd.inv_xform4x4_x8       = vp9_short_idct4x4llm;
-  cpi->mb.e_mbd.inv_walsh4x4_1        = vp9_short_inv_walsh4x4_1;
-  cpi->mb.e_mbd.inv_walsh4x4_lossless = vp9_short_inv_walsh4x4;
-
-#if CONFIG_LOSSLESS
   cpi->oxcf.lossless = oxcf->lossless;
   if (cpi->oxcf.lossless) {
-    cpi->mb.e_mbd.inv_xform4x4_1_x8     = vp9_short_inv_walsh4x4_1_x8;
-    cpi->mb.e_mbd.inv_xform4x4_x8       = vp9_short_inv_walsh4x4_x8;
-    cpi->mb.e_mbd.inv_walsh4x4_1        = vp9_short_inv_walsh4x4_1_lossless;
-    cpi->mb.e_mbd.inv_walsh4x4_lossless = vp9_short_inv_walsh4x4_lossless;
+    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_inv_walsh4x4_1_x8;
+    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_inv_walsh4x4_x8;
+  } else {
+    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_idct4x4llm_1;
+    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_idct4x4llm;
   }
-#endif
 
   cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
 
@@ -1385,8 +1226,8 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
 
   // cpi->use_golden_frame_only = 0;
   // cpi->use_last_frame_only = 0;
-  cm->refresh_golden_frame = 0;
-  cm->refresh_last_frame = 1;
+  cpi->refresh_golden_frame = 0;
+  cpi->refresh_last_frame = 1;
   cm->refresh_entropy_probs = 1;
 
   setup_features(cpi);
@@ -1491,14 +1332,18 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
     cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs;
   }
 
-  if (((cm->Width + 15) & 0xfffffff0) !=
-      cm->yv12_fb[cm->lst_fb_idx].y_width ||
-      ((cm->Height + 15) & 0xfffffff0) !=
-      cm->yv12_fb[cm->lst_fb_idx].y_height ||
-      cm->yv12_fb[cm->lst_fb_idx].y_width == 0) {
+  // Increasing the size of the frame beyond the first seen frame, or some
+  // otherwise signalled maximum size, is not supported.
+  // TODO(jkoleszar): exit gracefully.
+  if (!cpi->initial_width) {
     alloc_raw_frame_buffers(cpi);
     vp9_alloc_compressor_data(cpi);
+    cpi->initial_width = cm->Width;
+    cpi->initial_height = cm->Height;
   }
+  assert(cm->Width <= cpi->initial_width);
+  assert(cm->Height <= cpi->initial_height);
+  update_frame_size(cpi);
 
   if (cpi->oxcf.fixed_q >= 0) {
     cpi->last_q[0] = cpi->oxcf.fixed_q;
@@ -1526,6 +1371,7 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
   cpi->last_frame_distortion = 0;
 #endif
 
+  set_tile_limits(cpi);
 }
 
 #define M_LOG2_E 0.693147180559945309417
@@ -1693,7 +1539,7 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
 
   cpi->source_alt_ref_pending = FALSE;
   cpi->source_alt_ref_active = FALSE;
-  cpi->common.refresh_alt_ref_frame = 0;
+  cpi->refresh_alt_ref_frame = 0;
 
   cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
 #if CONFIG_INTERNAL_STATS
@@ -1795,10 +1641,6 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
     cpi->rd_thresh_mult[i] = 128;
   }
 
-#ifdef ENTROPY_STATS
-  init_mv_ref_counts();
-#endif
-
 #define BFP(BT, SDF, VF, SVF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF) \
     cpi->fn_ptr[BT].sdf            = SDF; \
     cpi->fn_ptr[BT].vf             = VF; \
@@ -1838,14 +1680,6 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
   BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4,
       NULL, NULL, NULL, vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
 
-#if ARCH_X86 || ARCH_X86_64
-  cpi->fn_ptr[BLOCK_16X16].copymem  = vp9_copy32xn;
-  cpi->fn_ptr[BLOCK_16X8].copymem   = vp9_copy32xn;
-  cpi->fn_ptr[BLOCK_8X16].copymem   = vp9_copy32xn;
-  cpi->fn_ptr[BLOCK_8X8].copymem    = vp9_copy32xn;
-  cpi->fn_ptr[BLOCK_4X4].copymem    = vp9_copy32xn;
-#endif
-
   cpi->full_search_sad = vp9_full_search_sad;
   cpi->diamond_search_sad = vp9_diamond_search_sad;
   cpi->refining_search_sad = vp9_refining_search_sad;
@@ -1885,7 +1719,7 @@ void vp9_remove_compressor(VP9_PTR *ptr) {
     if (cpi->pass != 1) {
       print_context_counters();
       print_tree_update_probs();
-      print_mode_context();
+      print_mode_context(&cpi->common);
     }
 #endif
 #ifdef NMV_STATS
@@ -1908,7 +1742,8 @@ void vp9_remove_compressor(VP9_PTR *ptr) {
       print_mode_contexts(&cpi->common);
 #endif
       if (cpi->b_calculate_psnr) {
-        YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx];
+        YV12_BUFFER_CONFIG *lst_yv12 =
+            &cpi->common.yv12_fb[cpi->common.ref_frame_map[cpi->lst_fb_idx]];
         double samples = 3.0 / 2 * cpi->count * lst_yv12->y_width * lst_yv12->y_height;
         double total_psnr = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error);
         double total_psnr2 = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error2);
@@ -2230,18 +2065,18 @@ int vp9_update_reference(VP9_PTR ptr, int ref_frame_flags) {
   if (ref_frame_flags > 7)
     return -1;
 
-  cpi->common.refresh_golden_frame = 0;
-  cpi->common.refresh_alt_ref_frame = 0;
-  cpi->common.refresh_last_frame   = 0;
+  cpi->refresh_golden_frame = 0;
+  cpi->refresh_alt_ref_frame = 0;
+  cpi->refresh_last_frame   = 0;
 
   if (ref_frame_flags & VP9_LAST_FLAG)
-    cpi->common.refresh_last_frame = 1;
+    cpi->refresh_last_frame = 1;
 
   if (ref_frame_flags & VP9_GOLD_FLAG)
-    cpi->common.refresh_golden_frame = 1;
+    cpi->refresh_golden_frame = 1;
 
   if (ref_frame_flags & VP9_ALT_FLAG)
-    cpi->common.refresh_alt_ref_frame = 1;
+    cpi->refresh_alt_ref_frame = 1;
 
   return 0;
 }
@@ -2253,11 +2088,11 @@ int vp9_get_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,
   int ref_fb_idx;
 
   if (ref_frame_flag == VP9_LAST_FLAG)
-    ref_fb_idx = cm->lst_fb_idx;
+    ref_fb_idx = cm->ref_frame_map[cpi->lst_fb_idx];
   else if (ref_frame_flag == VP9_GOLD_FLAG)
-    ref_fb_idx = cm->gld_fb_idx;
+    ref_fb_idx = cm->ref_frame_map[cpi->gld_fb_idx];
   else if (ref_frame_flag == VP9_ALT_FLAG)
-    ref_fb_idx = cm->alt_fb_idx;
+    ref_fb_idx = cm->ref_frame_map[cpi->alt_fb_idx];
   else
     return -1;
 
@@ -2274,11 +2109,11 @@ int vp9_set_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,
   int ref_fb_idx;
 
   if (ref_frame_flag == VP9_LAST_FLAG)
-    ref_fb_idx = cm->lst_fb_idx;
+    ref_fb_idx = cm->ref_frame_map[cpi->lst_fb_idx];
   else if (ref_frame_flag == VP9_GOLD_FLAG)
-    ref_fb_idx = cm->gld_fb_idx;
+    ref_fb_idx = cm->ref_frame_map[cpi->gld_fb_idx];
   else if (ref_frame_flag == VP9_ALT_FLAG)
-    ref_fb_idx = cm->alt_fb_idx;
+    ref_fb_idx = cm->ref_frame_map[cpi->alt_fb_idx];
   else
     return -1;
 
@@ -2349,9 +2184,73 @@ void vp9_write_yuv_rec_frame(VP9_COMMON *cm) {
     fwrite(src, s->uv_width, 1, yuv_rec_file);
     src += s->uv_stride;
   } while (--h);
+  fflush(yuv_rec_file);
 }
 #endif
 
+/* Resample src_fb into dst_fb, whose dimensions may differ, and then extend
+ * the borders of dst_fb.
+ *
+ * Output is produced in 16x16 luma tiles, each paired with an 8x8 tile in
+ * the U and V planes, and every tile is generated by vp9_convolve8().
+ * Source positions are tracked scaled by 16 (the _q4 names); their low four
+ * bits index vp9_sub_pel_filters_8. The loops step by 16 with no tail
+ * case -- presumably dst_fb's luma size is 16-aligned; TODO confirm.
+ */
+static void scale_and_extend_frame(YV12_BUFFER_CONFIG *src_fb,
+                                   YV12_BUFFER_CONFIG *dst_fb) {
+  const int in_w = src_fb->y_width;
+  const int in_h = src_fb->y_height;
+  const int out_w = dst_fb->y_width;
+  const int out_h = dst_fb->y_height;
+  int x, y;
+
+  for (y = 0; y < out_h; y += 16) {
+    /* Terms that depend only on the tile row. */
+    const int y_q4 = y * 16 * in_h / out_h;
+    const int uv_y_q4 = y_q4 >> 1;
+    const int y_step_q4 = 16 * in_h / out_h;
+    const int src_row = y * in_h / out_h * src_fb->y_stride;
+    const int dst_row = y * dst_fb->y_stride;
+    const int uv_src_row = y / 2 * in_h / out_h * src_fb->uv_stride;
+    const int uv_dst_row = y / 2 * dst_fb->uv_stride;
+
+    for (x = 0; x < out_w; x += 16) {
+      const int x_q4 = x * 16 * in_w / out_w;
+      const int uv_x_q4 = x_q4 >> 1;
+      const int x_step_q4 = 16 * in_w / out_w;
+      /* Offsets of this tile within either chroma plane. */
+      const int uv_src_off = uv_src_row + x / 2 * in_w / out_w;
+      const int uv_dst_off = uv_dst_row + x / 2;
+
+      /* Luma tile. */
+      vp9_convolve8(src_fb->y_buffer + src_row + x * in_w / out_w,
+                    src_fb->y_stride,
+                    dst_fb->y_buffer + dst_row + x, dst_fb->y_stride,
+                    vp9_sub_pel_filters_8[x_q4 & 0xf], x_step_q4,
+                    vp9_sub_pel_filters_8[y_q4 & 0xf], y_step_q4,
+                    16, 16);
+
+      /* U tile. */
+      vp9_convolve8(src_fb->u_buffer + uv_src_off, src_fb->uv_stride,
+                    dst_fb->u_buffer + uv_dst_off, dst_fb->uv_stride,
+                    vp9_sub_pel_filters_8[uv_x_q4 & 0xf], x_step_q4,
+                    vp9_sub_pel_filters_8[uv_y_q4 & 0xf], y_step_q4,
+                    8, 8);
+
+      /* V tile: same geometry as U. */
+      vp9_convolve8(src_fb->v_buffer + uv_src_off, src_fb->uv_stride,
+                    dst_fb->v_buffer + uv_dst_off, dst_fb->uv_stride,
+                    vp9_sub_pel_filters_8[uv_x_q4 & 0xf], x_step_q4,
+                    vp9_sub_pel_filters_8[uv_y_q4 & 0xf], y_step_q4,
+                    8, 8);
+    }
+  }
+
+  vp8_yv12_extend_frame_borders(dst_fb);
+}
+
+
 static void update_alt_ref_frame_stats(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
 
@@ -2374,13 +2273,13 @@ static void update_golden_frame_stats(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
 
   // Update the Golden frame usage counts.
-  if (cm->refresh_golden_frame) {
+  if (cpi->refresh_golden_frame) {
     // Update data structure that monitors level of reference to last GF
     vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
     cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
 
     // this frame refreshes means next frames don't unless specified by user
-    cm->refresh_golden_frame = 0;
+    cpi->refresh_golden_frame = 0;
     cpi->common.frames_since_golden = 0;
 
     // if ( cm->frame_type == KEY_FRAME )
@@ -2402,7 +2301,7 @@ static void update_golden_frame_stats(VP9_COMP *cpi) {
     // ******** Fixed Q test code only ************
     // If we are going to use the ALT reference for the next group of frames set a flag to say so.
     if (cpi->oxcf.fixed_q >= 0 &&
-        cpi->oxcf.play_alternate && !cpi->common.refresh_alt_ref_frame) {
+        cpi->oxcf.play_alternate && !cpi->refresh_alt_ref_frame) {
       cpi->source_alt_ref_pending = TRUE;
       cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
     }
@@ -2414,7 +2313,7 @@ static void update_golden_frame_stats(VP9_COMP *cpi) {
     if (cpi->frames_till_gf_update_due > 0)
       cpi->frames_till_gf_update_due--;
 
-  } else if (!cpi->common.refresh_alt_ref_frame) {
+  } else if (!cpi->refresh_alt_ref_frame) {
     // Decrement count down till next gf
     if (cpi->frames_till_gf_update_due > 0)
       cpi->frames_till_gf_update_due--;
@@ -2535,8 +2434,8 @@ static int recode_loop_test(VP9_COMP *cpi,
   if ((cpi->sf.recode_loop == 1) ||
       ((cpi->sf.recode_loop == 2) &&
        ((cm->frame_type == KEY_FRAME) ||
-        cm->refresh_golden_frame ||
-        cm->refresh_alt_ref_frame))) {
+        cpi->refresh_golden_frame ||
+        cpi->refresh_alt_ref_frame))) {
     // General over and under shoot tests
     if (((cpi->projected_frame_size > high_limit) && (q < maxq)) ||
         ((cpi->projected_frame_size < low_limit) && (q > minq))) {
@@ -2563,86 +2462,56 @@ static int recode_loop_test(VP9_COMP *cpi,
   return force_recode;
 }
 
-static void update_reference_frames(VP9_COMMON *cm) {
-  YV12_BUFFER_CONFIG *yv12_fb = cm->yv12_fb;
+static void update_reference_frames(VP9_COMP * const cpi) {
+  VP9_COMMON * const cm = &cpi->common;
 
   // At this point the new frame has been encoded.
   // If any buffer copy / swapping is signaled it should be done here.
-
   if (cm->frame_type == KEY_FRAME) {
-    yv12_fb[cm->new_fb_idx].flags |= VP9_GOLD_FLAG | VP9_ALT_FLAG;
-
-    yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG;
-    yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG;
-
-    cm->alt_fb_idx = cm->gld_fb_idx = cm->new_fb_idx;
-  } else { /* For non key frames */
-    if (cm->refresh_alt_ref_frame) {
-      assert(!cm->copy_buffer_to_arf);
-
-      cm->yv12_fb[cm->new_fb_idx].flags |= VP9_ALT_FLAG;
-      cm->yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG;
-      cm->alt_fb_idx = cm->new_fb_idx;
-    } else if (cm->copy_buffer_to_arf) {
-      assert(!(cm->copy_buffer_to_arf & ~0x3));
-
-      if (cm->copy_buffer_to_arf == 1) {
-        if (cm->alt_fb_idx != cm->lst_fb_idx) {
-          yv12_fb[cm->lst_fb_idx].flags |= VP9_ALT_FLAG;
-          yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG;
-          cm->alt_fb_idx = cm->lst_fb_idx;
-        }
-      } else { /* if (cm->copy_buffer_to_arf == 2) */
-        if (cm->alt_fb_idx != cm->gld_fb_idx) {
-          yv12_fb[cm->gld_fb_idx].flags |= VP9_ALT_FLAG;
-          yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG;
-          cm->alt_fb_idx = cm->gld_fb_idx;
-        }
-      }
+    ref_cnt_fb(cm->fb_idx_ref_cnt,
+               &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
+    ref_cnt_fb(cm->fb_idx_ref_cnt,
+               &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
+  } else if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) {
+    /* Preserve the previously existing golden frame and update the frame in
+     * the alt ref slot instead. This is highly specific to the current use of
+     * alt-ref as a forward reference, and this needs to be generalized as
+     * other uses are implemented (like RTC/temporal scaling)
+     *
+     * The update to the buffer in the alt ref slot was signalled in
+     * vp9_pack_bitstream(), now swap the buffer pointers so that it's treated
+     * as the golden frame next time.
+     */
+    int tmp;
+
+    ref_cnt_fb(cm->fb_idx_ref_cnt,
+               &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
+
+    tmp = cpi->alt_fb_idx;
+    cpi->alt_fb_idx = cpi->gld_fb_idx;
+    cpi->gld_fb_idx = tmp;
+  } else { /* For non key/golden frames */
+    if (cpi->refresh_alt_ref_frame) {
+      ref_cnt_fb(cm->fb_idx_ref_cnt,
+                 &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
     }
 
-    if (cm->refresh_golden_frame) {
-      assert(!cm->copy_buffer_to_gf);
-
-      cm->yv12_fb[cm->new_fb_idx].flags |= VP9_GOLD_FLAG;
-      cm->yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG;
-      cm->gld_fb_idx = cm->new_fb_idx;
-    } else if (cm->copy_buffer_to_gf) {
-      assert(!(cm->copy_buffer_to_arf & ~0x3));
-
-      if (cm->copy_buffer_to_gf == 1) {
-        if (cm->gld_fb_idx != cm->lst_fb_idx) {
-          yv12_fb[cm->lst_fb_idx].flags |= VP9_GOLD_FLAG;
-          yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG;
-          cm->gld_fb_idx = cm->lst_fb_idx;
-        }
-      } else { /* if (cm->copy_buffer_to_gf == 2) */
-        if (cm->alt_fb_idx != cm->gld_fb_idx) {
-          yv12_fb[cm->alt_fb_idx].flags |= VP9_GOLD_FLAG;
-          yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG;
-          cm->gld_fb_idx = cm->alt_fb_idx;
-        }
-      }
+    if (cpi->refresh_golden_frame) {
+      ref_cnt_fb(cm->fb_idx_ref_cnt,
+                 &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
     }
   }
 
-  if (cm->refresh_last_frame) {
-    cm->yv12_fb[cm->new_fb_idx].flags |= VP9_LAST_FLAG;
-    cm->yv12_fb[cm->lst_fb_idx].flags &= ~VP9_LAST_FLAG;
-    cm->lst_fb_idx = cm->new_fb_idx;
+  if (cpi->refresh_last_frame) {
+    ref_cnt_fb(cm->fb_idx_ref_cnt,
+               &cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx);
   }
 }
 
 static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
-  if (cm->no_lpf) {
+  if (cm->no_lpf || cpi->mb.e_mbd.lossless) {
     cm->filter_level = 0;
-  }
-#if CONFIG_LOSSLESS
-  else if (cpi->oxcf.lossless) {
-    cm->filter_level = 0;
-  }
-#endif
-  else {
+  } else {
     struct vpx_usec_timer timer;
 
     vp9_clear_system_state();
@@ -2666,7 +2535,7 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
 
 }
 
-void select_interp_filter_type(VP9_COMP *cpi) {
+void vp9_select_interp_filter_type(VP9_COMP *cpi) {
   int i;
   int high_filter_index = 0;
   unsigned int thresh;
@@ -2719,6 +2588,38 @@ static void select_interintra_mode(VP9_COMP *cpi) {
 }
 #endif
 
+/* Fill cpi->scaled_ref_idx[] with a frame buffer per reference slot whose
+ * luma size matches the coded frame; a failed resize raises a codec error. */
+static void scale_references(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+  const int width = cm->mb_cols * 16, height = cm->mb_rows * 16;
+  int i;
+  for (i = 0; i < 3; i++) {
+    YV12_BUFFER_CONFIG *ref = &cm->yv12_fb[cm->ref_frame_map[i]];
+    if (ref->y_width != width || ref->y_height != height) {
+      const int new_fb = get_free_fb(cm);
+      if (vp8_yv12_realloc_frame_buffer(&cm->yv12_fb[new_fb], width, height,
+                                        VP9BORDERINPIXELS))
+        vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                           "Failed to reallocate scaled reference buffer");
+      scale_and_extend_frame(ref, &cm->yv12_fb[new_fb]);
+      cpi->scaled_ref_idx[i] = new_fb;
+    } else {
+      cpi->scaled_ref_idx[i] = cm->ref_frame_map[i];  /* size already matches */
+      cm->fb_idx_ref_cnt[cm->ref_frame_map[i]]++;     /* hold it while in use */
+    }
+  }
+}
+
+/* Decrement the count of every buffer listed in cpi->scaled_ref_idx[]. */
+static void release_scaled_references(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  int slot = 0;
+
+  while (slot < 3)
+    cm->fb_idx_ref_cnt[cpi->scaled_ref_idx[slot++]] -= 1;
+}
+
 static void encode_frame_to_data_rate(VP9_COMP *cpi,
                                       unsigned long *size,
                                       unsigned char *dest,
@@ -2735,8 +2636,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
 
   int q_low;
   int q_high;
-  int zbin_oq_high;
-  int zbin_oq_low = 0;
 
   int top_index;
   int bottom_index;
@@ -2749,11 +2648,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
 #if RESET_FOREACH_FILTER
   int q_low0;
   int q_high0;
-  int zbin_oq_high0;
-  int zbin_oq_low0 = 0;
   int Q0;
-  int last_zbin_oq;
-  int last_zbin_oq0;
   int active_best_quality0;
   int active_worst_quality0;
   double rate_correction_factor0;
@@ -2773,36 +2668,43 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   int mcomp_filter_index = 0;
   int64_t mcomp_filter_cost[4];
 
+  /* Scale the source buffer, if required */
+  if (cm->mb_cols * 16 != cpi->un_scaled_source->y_width ||
+      cm->mb_rows * 16 != cpi->un_scaled_source->y_height) {
+    scale_and_extend_frame(cpi->un_scaled_source, &cpi->scaled_source);
+    cpi->Source = &cpi->scaled_source;
+  } else {
+    cpi->Source = cpi->un_scaled_source;
+  }
+
+  scale_references(cpi);
+
   // Clear down mmx registers to allow floating point in what follows
   vp9_clear_system_state();
 
 
   // For an alt ref frame in 2 pass we skip the call to the second
   // pass function that sets the target bandwidth so must set it here
-  if (cpi->common.refresh_alt_ref_frame) {
+  if (cpi->refresh_alt_ref_frame) {
     cpi->per_frame_bandwidth = cpi->twopass.gf_bits;                           // Per frame bit target for the alt ref frame
     // per second target bitrate
     cpi->target_bandwidth = (int)(cpi->twopass.gf_bits *
                                   cpi->output_frame_rate);
   }
 
-  // Default turn off buffer to buffer copying
-  cm->copy_buffer_to_gf = 0;
-  cm->copy_buffer_to_arf = 0;
-
   // Clear zbin over-quant value and mode boost values.
-  cpi->zbin_over_quant = 0;
   cpi->zbin_mode_boost = 0;
 
   // Enable or disable mode based tweaking of the zbin
   // For 2 Pass Only used where GF/ARF prediction quality
   // is above a threshold
   cpi->zbin_mode_boost = 0;
-#if CONFIG_LOSSLESS
-  cpi->zbin_mode_boost_enabled = FALSE;
-#else
-  cpi->zbin_mode_boost_enabled = TRUE;
-#endif
+
+  if (cpi->oxcf.lossless)
+    cpi->zbin_mode_boost_enabled = FALSE;
+  else
+    cpi->zbin_mode_boost_enabled = TRUE;
+
   if (cpi->gfu_boost <= 400) {
     cpi->zbin_mode_boost_enabled = FALSE;
   }
@@ -2846,10 +2748,22 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
     for (i = 0; i < MAX_MODES; i++) {
       cpi->rd_thresh_mult[i] = 128;
     }
+
+    cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0);
+    cm->frame_parallel_decoding_mode =
+      (cpi->oxcf.frame_parallel_decoding_mode != 0);
+    if (cm->error_resilient_mode) {
+      cm->frame_parallel_decoding_mode = 1;
+      cm->refresh_entropy_probs = 0;
+    }
   }
 
-  // Test code for new segment features
-  init_seg_features(cpi);
+  // Configure use of segmentation for enhanced coding of static regions.
+  // Only allowed for now in second pass of two pass (as requires lagged coding)
+  // and if the relevant speed feature flag is set.
+  if ((cpi->pass == 2) && (cpi->sf.static_segmentation)) {
+    configure_static_seg_features(cpi);
+  }
 
   // Decide how big to make the frame
   vp9_pick_frame_size(cpi);
@@ -2896,9 +2810,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
       if (cpi->active_best_quality < cpi->best_quality)
         cpi->active_best_quality = cpi->best_quality;
     }
-  }
-
-  else if (cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame) {
+  } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) {
     int high = 2000;
     int low = 400;
 
@@ -2971,17 +2883,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
     // Determine initial Q to try
     Q = vp9_regulate_q(cpi, cpi->this_frame_target);
   }
-#if RESET_FOREACH_FILTER
-  last_zbin_oq = cpi->zbin_over_quant;
-#endif
-
-  // Set highest allowed value for Zbin over quant
-  if (cm->frame_type == KEY_FRAME)
-    zbin_oq_high = 0; // ZBIN_OQ_MAX/16
-  else if (cm->refresh_alt_ref_frame || (cm->refresh_golden_frame && !cpi->source_alt_ref_active))
-    zbin_oq_high = 16;
-  else
-    zbin_oq_high = ZBIN_OQ_MAX;
 
   vp9_compute_frame_size_bounds(cpi, &frame_under_shoot_limit,
                                 &frame_over_shoot_limit);
@@ -3064,9 +2965,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
     q_low0 = q_low;
     q_high0 = q_high;
     Q0 = Q;
-    zbin_oq_low0 = zbin_oq_low;
-    zbin_oq_high0 = zbin_oq_high;
-    last_zbin_oq0 = last_zbin_oq;
     rate_correction_factor0 = cpi->rate_correction_factor;
     gf_rate_correction_factor0 = cpi->gf_rate_correction_factor;
     active_best_quality0 = cpi->active_best_quality;
@@ -3087,12 +2985,12 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
           cm->mbskip_pred_probs[k] = cpi->base_skip_false_prob[Q][k];
 
         if (cm->frame_type != KEY_FRAME) {
-          if (cpi->common.refresh_alt_ref_frame) {
+          if (cpi->refresh_alt_ref_frame) {
             for (k = 0; k < MBSKIP_CONTEXTS; k++) {
               if (cpi->last_skip_false_probs[2][k] != 0)
                 cm->mbskip_pred_probs[k] = cpi->last_skip_false_probs[2][k];
             }
-          } else if (cpi->common.refresh_golden_frame) {
+          } else if (cpi->refresh_golden_frame) {
             for (k = 0; k < MBSKIP_CONTEXTS; k++) {
               if (cpi->last_skip_false_probs[1][k] != 0)
                 cm->mbskip_pred_probs[k] = cpi->last_skip_false_probs[1][k];
@@ -3124,10 +3022,21 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
       }
 
       // Set up entropy depending on frame type.
-      if (cm->frame_type == KEY_FRAME)
+      if (cm->frame_type == KEY_FRAME) {
+        /* Choose which entropy context to use. When using a forward reference
+	 * frame, it immediately follows the keyframe, and thus benefits from
+	 * using the same entropy context established by the keyframe. Otherwise,
+	 * use the default context 0.
+	 */
+        cm->frame_context_idx = cpi->oxcf.play_alternate;
         vp9_setup_key_frame(cpi);
-      else
+      } else {
+	/* Choose which entropy context to use. Currently there are only two
+	 * contexts used, one for normal frames and one for alt ref frames.
+	 */
+        cpi->common.frame_context_idx = cpi->refresh_alt_ref_frame;
         vp9_setup_inter_frame(cpi);
+      }
     }
 
     // transform / motion compensation build reconstruction frame
@@ -3214,23 +3123,12 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
       if (cpi->projected_frame_size > cpi->this_frame_target) {
         q_low = (Q < q_high) ? (Q + 1) : q_high; // Raise Qlow as to at least the current value
 
-        if (cpi->zbin_over_quant > 0)            // If we are using over quant do the same for zbin_oq_low
-          zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high;
-
         if (undershoot_seen || (loop_count > 1)) {
           // Update rate_correction_factor unless cpi->active_worst_quality has changed.
           if (!active_worst_qchanged)
             vp9_update_rate_correction_factors(cpi, 1);
 
           Q = (q_high + q_low + 1) / 2;
-
-          // Adjust cpi->zbin_over_quant (only allowed when Q is max)
-          if (Q < MAXQ)
-            cpi->zbin_over_quant = 0;
-          else {
-            zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high;
-            cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2;
-          }
         } else {
           // Update rate_correction_factor unless cpi->active_worst_quality has changed.
           if (!active_worst_qchanged)
@@ -3238,7 +3136,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
 
           Q = vp9_regulate_q(cpi, cpi->this_frame_target);
 
-          while (((Q < q_low) || (cpi->zbin_over_quant < zbin_oq_low)) && (Retries < 10)) {
+          while ((Q < q_low) && (Retries < 10)) {
             vp9_update_rate_correction_factors(cpi, 0);
             Q = vp9_regulate_q(cpi, cpi->this_frame_target);
             Retries++;
@@ -3249,10 +3147,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
       }
       // Frame is too small
       else {
-        if (cpi->zbin_over_quant == 0)
-          q_high = (Q > q_low) ? (Q - 1) : q_low; // Lower q_high if not using over quant
-        else                                    // else lower zbin_oq_high
-          zbin_oq_high = (cpi->zbin_over_quant > zbin_oq_low) ? (cpi->zbin_over_quant - 1) : zbin_oq_low;
+        q_high = (Q > q_low) ? (Q - 1) : q_low;
 
         if (overshoot_seen || (loop_count > 1)) {
           // Update rate_correction_factor unless cpi->active_worst_quality has changed.
@@ -3260,12 +3155,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
             vp9_update_rate_correction_factors(cpi, 1);
 
           Q = (q_high + q_low) / 2;
-
-          // Adjust cpi->zbin_over_quant (only allowed when Q is max)
-          if (Q < MAXQ)
-            cpi->zbin_over_quant = 0;
-          else
-            cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2;
         } else {
           // Update rate_correction_factor unless cpi->active_worst_quality has changed.
           if (!active_worst_qchanged)
@@ -3282,7 +3171,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
             q_low = Q;
           }
 
-          while (((Q > q_high) || (cpi->zbin_over_quant > zbin_oq_high)) && (Retries < 10)) {
+          while ((Q > q_high) && (Retries < 10)) {
             vp9_update_rate_correction_factors(cpi, 0);
             Q = vp9_regulate_q(cpi, cpi->this_frame_target);
             Retries++;
@@ -3298,16 +3187,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
       else if (Q < q_low)
         Q = q_low;
 
-      // Clamp cpi->zbin_over_quant
-      cpi->zbin_over_quant = (cpi->zbin_over_quant < zbin_oq_low) ?
-          zbin_oq_low : (cpi->zbin_over_quant > zbin_oq_high) ?
-          zbin_oq_high : cpi->zbin_over_quant;
-
-      // Loop = ((Q != last_q) || (last_zbin_oq != cpi->zbin_over_quant)) ? TRUE : FALSE;
       Loop = ((Q != last_q)) ? TRUE : FALSE;
-#if RESET_FOREACH_FILTER
-      last_zbin_oq = cpi->zbin_over_quant;
-#endif
     } else
       Loop = FALSE;
 
@@ -3351,12 +3231,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
         if (Loop == TRUE) {
           overshoot_seen = FALSE;
           undershoot_seen = FALSE;
-          zbin_oq_low = zbin_oq_low0;
-          zbin_oq_high = zbin_oq_high0;
           q_low = q_low0;
           q_high = q_high0;
           Q = Q0;
-          cpi->zbin_over_quant = last_zbin_oq = last_zbin_oq0;
           cpi->rate_correction_factor = rate_correction_factor0;
           cpi->gf_rate_correction_factor = gf_rate_correction_factor0;
           cpi->active_best_quality = active_best_quality0;
@@ -3412,12 +3289,18 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   vp9_update_gf_useage_maps(cpi, cm, &cpi->mb);
 
   if (cm->frame_type == KEY_FRAME)
-    cm->refresh_last_frame = 1;
+    cpi->refresh_last_frame = 1;
 
 #if 0
   {
     FILE *f = fopen("gfactive.stt", "a");
-    fprintf(f, "%8d %8d %8d %8d %8d\n", cm->current_video_frame, (100 * cpi->gf_active_count) / (cpi->common.mb_rows * cpi->common.mb_cols), cpi->this_iiratio, cpi->next_iiratio, cm->refresh_golden_frame);
+    fprintf(f, "%8d %8d %8d %8d %8d\n",
+            cm->current_video_frame,
+            (100 * cpi->gf_active_count)
+              / (cpi->common.mb_rows * cpi->common.mb_cols),
+            cpi->this_iiratio,
+            cpi->next_iiratio,
+            cpi->refresh_golden_frame);
     fclose(f);
   }
 #endif
@@ -3444,18 +3327,15 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
     update_reference_segmentation_map(cpi);
   }
 
-  update_reference_frames(cm);
+  release_scaled_references(cpi);
+  update_reference_frames(cpi);
   vp9_copy(cpi->common.fc.coef_counts_4x4, cpi->coef_counts_4x4);
-  vp9_copy(cpi->common.fc.hybrid_coef_counts_4x4,
-           cpi->hybrid_coef_counts_4x4);
   vp9_copy(cpi->common.fc.coef_counts_8x8, cpi->coef_counts_8x8);
-  vp9_copy(cpi->common.fc.hybrid_coef_counts_8x8,
-           cpi->hybrid_coef_counts_8x8);
   vp9_copy(cpi->common.fc.coef_counts_16x16, cpi->coef_counts_16x16);
-  vp9_copy(cpi->common.fc.hybrid_coef_counts_16x16,
-           cpi->hybrid_coef_counts_16x16);
   vp9_copy(cpi->common.fc.coef_counts_32x32, cpi->coef_counts_32x32);
-  vp9_adapt_coef_probs(&cpi->common);
+  if (!cpi->common.error_resilient_mode &&
+      !cpi->common.frame_parallel_decoding_mode)
+    vp9_adapt_coef_probs(&cpi->common);
   if (cpi->common.frame_type != KEY_FRAME) {
     vp9_copy(cpi->common.fc.sb_ymode_counts, cpi->sb_ymode_count);
     vp9_copy(cpi->common.fc.ymode_counts, cpi->ymode_count);
@@ -3467,14 +3347,13 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
 #if CONFIG_COMP_INTERINTRA_PRED
     vp9_copy(cpi->common.fc.interintra_counts, cpi->interintra_count);
 #endif
-    vp9_adapt_mode_probs(&cpi->common);
-
     cpi->common.fc.NMVcount = cpi->NMVcount;
-    /*
-    printf("2: %d %d %d %d\n", cpi->NMVcount.joints[0], cpi->NMVcount.joints[1],
-                      cpi->NMVcount.joints[2], cpi->NMVcount.joints[3]);
-                      */
-    vp9_adapt_nmv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv);
+    if (!cpi->common.error_resilient_mode &&
+        !cpi->common.frame_parallel_decoding_mode) {
+      vp9_adapt_mode_probs(&cpi->common);
+      vp9_adapt_mode_context(&cpi->common);
+      vp9_adapt_nmv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv);
+    }
   }
 #if CONFIG_COMP_INTERINTRA_PRED
   if (cm->frame_type != KEY_FRAME)
@@ -3502,8 +3381,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   if ((cm->base_qindex < cpi->last_boosted_qindex) ||
       ((cpi->static_mb_pct < 100) &&
        ((cm->frame_type == KEY_FRAME) ||
-        cm->refresh_alt_ref_frame ||
-        (cm->refresh_golden_frame && !cpi->is_src_frame_alt_ref)))) {
+        cpi->refresh_alt_ref_frame ||
+        (cpi->refresh_golden_frame && !cpi->is_src_frame_alt_ref)))) {
     cpi->last_boosted_qindex = cm->base_qindex;
   }
 
@@ -3516,7 +3395,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
     cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex + cm->base_qindex) >> 2;
 
   // Keep a record from which we can calculate the average Q excluding GF updates and key frames
-  if ((cm->frame_type != KEY_FRAME) && !cm->refresh_golden_frame && !cm->refresh_alt_ref_frame) {
+  if ((cm->frame_type != KEY_FRAME)
+      && !cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) {
     cpi->ni_frames++;
     cpi->tot_q += vp9_convert_qindex_to_q(Q);
     cpi->avg_q = cpi->tot_q / (double)cpi->ni_frames;
@@ -3538,11 +3418,19 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size)
     cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;
 
-  // Rolling monitors of whether we are over or underspending used to help regulate min and Max Q in two pass.
-  cpi->rolling_target_bits = ((cpi->rolling_target_bits * 3) + cpi->this_frame_target + 2) / 4;
-  cpi->rolling_actual_bits = ((cpi->rolling_actual_bits * 3) + cpi->projected_frame_size + 2) / 4;
-  cpi->long_rolling_target_bits = ((cpi->long_rolling_target_bits * 31) + cpi->this_frame_target + 16) / 32;
-  cpi->long_rolling_actual_bits = ((cpi->long_rolling_actual_bits * 31) + cpi->projected_frame_size + 16) / 32;
+  // Rolling monitors of whether we are over- or under-spending, used to help
+  // regulate min and max Q in two-pass encoding.
+  if (cm->frame_type != KEY_FRAME) {
+    cpi->rolling_target_bits =
+      ((cpi->rolling_target_bits * 3) + cpi->this_frame_target + 2) / 4;
+    cpi->rolling_actual_bits =
+      ((cpi->rolling_actual_bits * 3) + cpi->projected_frame_size + 2) / 4;
+    cpi->long_rolling_target_bits =
+      ((cpi->long_rolling_target_bits * 31) + cpi->this_frame_target + 16) / 32;
+    cpi->long_rolling_actual_bits =
+      ((cpi->long_rolling_actual_bits * 31) +
+       cpi->projected_frame_size + 16) / 32;
+  }
 
   // Actual bits spent
   cpi->total_actual_bits    += cpi->projected_frame_size;
@@ -3558,7 +3446,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
 
     if (cpi->twopass.kf_group_bits < 0)
       cpi->twopass.kf_group_bits = 0;
-  } else if (cm->refresh_golden_frame || cm->refresh_alt_ref_frame) {
+  } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) {
     cpi->twopass.gf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
 
     if (cpi->twopass.gf_group_bits < 0)
@@ -3582,7 +3470,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
     if (cpi->twopass.total_left_stats->coded_error != 0.0)
       fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d"
               "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
-              "%6d %5d %5d %5d %8d %8.2f %10d %10.3f"
+              "%6d %5d %5d %5d %8.2f %10d %10.3f"
               "%10.3f %8d %10d %10d %10d\n",
               cpi->common.current_video_frame, cpi->this_frame_target,
               cpi->projected_frame_size, 0, //loop_size_estimate,
@@ -3597,9 +3485,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
               cpi->avg_q,
               vp9_convert_qindex_to_q(cpi->ni_av_qi),
               vp9_convert_qindex_to_q(cpi->cq_target_quality),
-              cpi->zbin_over_quant,
-              // cpi->avg_frame_qindex, cpi->zbin_over_quant,
-              cm->refresh_golden_frame, cm->refresh_alt_ref_frame,
+              cpi->refresh_golden_frame, cpi->refresh_alt_ref_frame,
               cm->frame_type, cpi->gfu_boost,
               cpi->twopass.est_max_qcorrection_factor,
               (int)cpi->twopass.bits_left,
@@ -3611,7 +3497,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
     else
       fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d"
               "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
-              "%6d %5d %5d %5d %8d %8.2f %10d %10.3f"
+              "%5d %5d %8d %8d %8.2f %10d %10.3f"
               "%8d %10d %10d %10d\n",
               cpi->common.current_video_frame,
               cpi->this_frame_target, cpi->projected_frame_size,
@@ -3627,9 +3513,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
               cpi->avg_q,
               vp9_convert_qindex_to_q(cpi->ni_av_qi),
               vp9_convert_qindex_to_q(cpi->cq_target_quality),
-              cpi->zbin_over_quant,
-              // cpi->avg_frame_qindex, cpi->zbin_over_quant,
-              cm->refresh_golden_frame, cm->refresh_alt_ref_frame,
+              cpi->refresh_golden_frame, cpi->refresh_alt_ref_frame,
               cm->frame_type, cpi->gfu_boost,
               cpi->twopass.est_max_qcorrection_factor,
               (int)cpi->twopass.bits_left,
@@ -3645,8 +3529,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
 
       fprintf(fmodes, "%6d:%1d:%1d:%1d ",
               cpi->common.current_video_frame,
-              cm->frame_type, cm->refresh_golden_frame,
-              cm->refresh_alt_ref_frame);
+              cm->frame_type, cpi->refresh_golden_frame,
+              cpi->refresh_alt_ref_frame);
 
       for (i = 0; i < MAX_MODES; i++)
         fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
@@ -3665,33 +3549,34 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
 #endif
 
   // If this was a kf or Gf note the Q
-  if ((cm->frame_type == KEY_FRAME) || cm->refresh_golden_frame || cm->refresh_alt_ref_frame)
+  if ((cm->frame_type == KEY_FRAME)
+      || cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)
     cm->last_kf_gf_q = cm->base_qindex;
 
-  if (cm->refresh_golden_frame == 1)
+  if (cpi->refresh_golden_frame == 1)
     cm->frame_flags = cm->frame_flags | FRAMEFLAGS_GOLDEN;
   else
     cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_GOLDEN;
 
-  if (cm->refresh_alt_ref_frame == 1)
+  if (cpi->refresh_alt_ref_frame == 1)
     cm->frame_flags = cm->frame_flags | FRAMEFLAGS_ALTREF;
   else
     cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_ALTREF;
 
 
-  if (cm->refresh_last_frame & cm->refresh_golden_frame) // both refreshed
+  if (cpi->refresh_last_frame & cpi->refresh_golden_frame)
     cpi->gold_is_last = 1;
-  else if (cm->refresh_last_frame ^ cm->refresh_golden_frame) // 1 refreshed but not the other
+  else if (cpi->refresh_last_frame ^ cpi->refresh_golden_frame)
     cpi->gold_is_last = 0;
 
-  if (cm->refresh_last_frame & cm->refresh_alt_ref_frame) // both refreshed
+  if (cpi->refresh_last_frame & cpi->refresh_alt_ref_frame)
     cpi->alt_is_last = 1;
-  else if (cm->refresh_last_frame ^ cm->refresh_alt_ref_frame) // 1 refreshed but not the other
+  else if (cpi->refresh_last_frame ^ cpi->refresh_alt_ref_frame)
     cpi->alt_is_last = 0;
 
-  if (cm->refresh_alt_ref_frame & cm->refresh_golden_frame) // both refreshed
+  if (cpi->refresh_alt_ref_frame & cpi->refresh_golden_frame)
     cpi->gold_is_alt = 1;
-  else if (cm->refresh_alt_ref_frame ^ cm->refresh_golden_frame) // 1 refreshed but not the other
+  else if (cpi->refresh_alt_ref_frame ^ cpi->refresh_golden_frame)
     cpi->gold_is_alt = 0;
 
   cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
@@ -3705,7 +3590,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   if (cpi->gold_is_alt)
     cpi->ref_frame_flags &= ~VP9_ALT_FLAG;
 
-  if (cpi->oxcf.play_alternate && cm->refresh_alt_ref_frame && (cm->frame_type != KEY_FRAME))
+  if (cpi->oxcf.play_alternate && cpi->refresh_alt_ref_frame
+      && (cm->frame_type != KEY_FRAME))
     // Update the alternate reference frame stats as appropriate.
     update_alt_ref_frame_stats(cpi);
   else
@@ -3727,6 +3613,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   xd->update_mb_segmentation_data = 0;
   xd->mode_ref_lf_delta_update = 0;
 
+  // keep track of the last coded dimensions
+  cm->last_width = cm->Width;
+  cm->last_height = cm->Height;
 
   // Dont increment frame counters if this was an altref buffer update not a real frame
   if (cm->show_frame) {
@@ -3744,8 +3633,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
     FILE *recon_file;
     sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
     recon_file = fopen(filename, "wb");
-    fwrite(cm->yv12_fb[cm->lst_fb_idx].buffer_alloc,
-           cm->yv12_fb[cm->lst_fb_idx].frame_size, 1, recon_file);
+    fwrite(cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]].buffer_alloc,
+           cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]].frame_size,
+           1, recon_file);
     fclose(recon_file);
   }
 #endif
@@ -3765,13 +3655,18 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
 static void Pass2Encode(VP9_COMP *cpi, unsigned long *size,
                         unsigned char *dest, unsigned int *frame_flags) {
 
-  if (!cpi->common.refresh_alt_ref_frame)
+  if (!cpi->refresh_alt_ref_frame)
     vp9_second_pass(cpi);
 
   encode_frame_to_data_rate(cpi, size, dest, frame_flags);
+
+#ifdef DISABLE_RC_LONG_TERM_MEM
+  cpi->twopass.bits_left -=  cpi->this_frame_target;
+#else
   cpi->twopass.bits_left -= 8 * *size;
+#endif
 
-  if (!cpi->common.refresh_alt_ref_frame) {
+  if (!cpi->refresh_alt_ref_frame) {
     double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.frame_rate;
     double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
                                         * cpi->oxcf.two_pass_vbrmin_section / 100);
@@ -3808,9 +3703,8 @@ static int frame_is_reference(const VP9_COMP *cpi) {
   const VP9_COMMON *cm = &cpi->common;
   const MACROBLOCKD *xd = &cpi->mb.e_mbd;
 
-  return cm->frame_type == KEY_FRAME || cm->refresh_last_frame
-         || cm->refresh_golden_frame || cm->refresh_alt_ref_frame
-         || cm->copy_buffer_to_gf || cm->copy_buffer_to_arf
+  return cm->frame_type == KEY_FRAME || cpi->refresh_last_frame
+         || cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame
          || cm->refresh_entropy_probs
          || xd->mode_ref_lf_delta_update
          || xd->update_mb_segmentation_map || xd->update_mb_segmentation_data;
@@ -3846,9 +3740,9 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
         force_src_buffer = &cpi->alt_ref_buffer;
       }
       cm->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due;
-      cm->refresh_alt_ref_frame = 1;
-      cm->refresh_golden_frame = 0;
-      cm->refresh_last_frame = 0;
+      cpi->refresh_alt_ref_frame = 1;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_last_frame = 0;
       cm->show_frame = 0;
       cpi->source_alt_ref_pending = FALSE;   // Clear Pending altf Ref flag.
       cpi->is_src_frame_alt_ref = 0;
@@ -3889,7 +3783,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
   }
 
   // adjust frame rates based on timestamps given
-  if (!cm->refresh_alt_ref_frame) {
+  if (!cpi->refresh_alt_ref_frame) {
     int64_t this_duration;
     int step = 0;
 
@@ -3945,28 +3839,34 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
 
 #if 0
 
-  if (cm->refresh_alt_ref_frame) {
-    // cm->refresh_golden_frame = 1;
-    cm->refresh_golden_frame = 0;
-    cm->refresh_last_frame = 0;
+  if (cpi->refresh_alt_ref_frame) {
+    // cpi->refresh_golden_frame = 1;
+    cpi->refresh_golden_frame = 0;
+    cpi->refresh_last_frame = 0;
   } else {
-    cm->refresh_golden_frame = 0;
-    cm->refresh_last_frame = 1;
+    cpi->refresh_golden_frame = 0;
+    cpi->refresh_last_frame = 1;
   }
 
 #endif
-  /* find a free buffer for the new frame */
-  {
-    int i = 0;
-    for (; i < NUM_YV12_BUFFERS; i++) {
-      if (!cm->yv12_fb[i].flags) {
-        cm->new_fb_idx = i;
-        break;
-      }
-    }
 
-    assert(i < NUM_YV12_BUFFERS);
-  }
+  /* find a free buffer for the new frame, releasing the reference previously
+   * held.
+   */
+  cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
+  cm->new_fb_idx = get_free_fb(cm);
+
+  /* Get the mapping of L/G/A to the reference buffer pool */
+  cm->active_ref_idx[0] = cm->ref_frame_map[cpi->lst_fb_idx];
+  cm->active_ref_idx[1] = cm->ref_frame_map[cpi->gld_fb_idx];
+  cm->active_ref_idx[2] = cm->ref_frame_map[cpi->alt_fb_idx];
+
+  /* Reset the frame pointers to the current frame size */
+  vp8_yv12_realloc_frame_buffer(&cm->yv12_fb[cm->new_fb_idx],
+                                cm->mb_cols * 16, cm->mb_rows * 16,
+                                VP9BORDERINPIXELS);
+
+  vp9_setup_interp_filters(&cpi->mb.e_mbd, DEFAULT_INTERP_FILTER, cm);
   if (cpi->pass == 1) {
     Pass1Encode(cpi, size, dest, frame_flags);
   } else if (cpi->pass == 2) {
@@ -3976,10 +3876,8 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
   }
 
   if (cm->refresh_entropy_probs) {
-    if (cm->refresh_alt_ref_frame)
-      vpx_memcpy(&cm->lfc_a, &cm->fc, sizeof(cm->fc));
-    else
-      vpx_memcpy(&cm->lfc, &cm->fc, sizeof(cm->fc));
+    vpx_memcpy(&cm->frame_contexts[cm->frame_context_idx], &cm->fc,
+               sizeof(cm->fc));
   }
 
   // if its a dropped frame honor the requests on subsequent frames
@@ -3988,9 +3886,9 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
 
     // return to normal state
     cm->refresh_entropy_probs = 1;
-    cm->refresh_alt_ref_frame = 0;
-    cm->refresh_golden_frame = 0;
-    cm->refresh_last_frame = 1;
+    cpi->refresh_alt_ref_frame = 0;
+    cpi->refresh_golden_frame = 0;
+    cpi->refresh_last_frame = 1;
     cm->frame_type = INTER_FRAME;
 
   }
@@ -4113,7 +4011,7 @@ int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest,
                               vp9_ppflags_t *flags) {
   VP9_COMP *cpi = (VP9_COMP *) comp;
 
-  if (cpi->common.refresh_alt_ref_frame)
+  if (cpi->refresh_alt_ref_frame)
     return -1;
   else {
     int ret;
@@ -4217,17 +4115,31 @@ int vp9_set_active_map(VP9_PTR comp, unsigned char *map,
 int vp9_set_internal_size(VP9_PTR comp,
                           VPX_SCALING horiz_mode, VPX_SCALING vert_mode) {
   VP9_COMP *cpi = (VP9_COMP *) comp;
+  VP9_COMMON *cm = &cpi->common;
 
-  if (horiz_mode <= ONETWO)
-    cpi->common.horiz_scale = horiz_mode;
-  else
+  if (horiz_mode > ONETWO)
     return -1;
 
-  if (vert_mode <= ONETWO)
-    cpi->common.vert_scale  = vert_mode;
-  else
+  if (vert_mode > ONETWO)
     return -1;
 
+  if (cm->horiz_scale != horiz_mode || cm->vert_scale != vert_mode) {
+    int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
+    int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
+
+    cm->horiz_scale = horiz_mode;
+    cm->vert_scale = vert_mode;
+
+    Scale2Ratio(cm->horiz_scale, &hr, &hs);
+    Scale2Ratio(cm->vert_scale, &vr, &vs);
+
+    // Round up to the next whole number (ceiling division).
+    cm->Width = (hs - 1 + cpi->oxcf.Width * hr) / hs;
+    cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs;
+  }
+  assert(cm->Width <= cpi->initial_width);
+  assert(cm->Height <= cpi->initial_height);
+  update_frame_size(cpi);
   return 0;
 }
 
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 74a58b430..02a371964 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -29,6 +29,10 @@
 #include "vp9/common/vp9_findnearmv.h"
 #include "vp9/encoder/vp9_lookahead.h"
 
+// Experimental rate control switches
+// #define ONE_SHOT_Q_ESTIMATE 1
+// #define DISABLE_RC_LONG_TERM_MEM 1
+
 // #define SPEEDSTATS 1
 #define MIN_GF_INTERVAL             4
 #define DEFAULT_GF_INTERVAL         7
@@ -53,7 +57,6 @@
 #define GF_ZEROMV_ZBIN_BOOST 12
 #define LF_ZEROMV_ZBIN_BOOST 6
 #define MV_ZBIN_BOOST        4
-#define ZBIN_OQ_MAX 192
 
 #define VP9_TEMPORAL_ALT_REF 1
 
@@ -86,12 +89,9 @@ typedef struct {
   // 0 = BPRED, ZERO_MV, MV, SPLIT
   signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
 
-  vp9_coeff_probs coef_probs_4x4[BLOCK_TYPES_4X4];
-  vp9_coeff_probs hybrid_coef_probs_4x4[BLOCK_TYPES_4X4];
-  vp9_coeff_probs coef_probs_8x8[BLOCK_TYPES_8X8];
-  vp9_coeff_probs hybrid_coef_probs_8x8[BLOCK_TYPES_8X8];
-  vp9_coeff_probs coef_probs_16x16[BLOCK_TYPES_16X16];
-  vp9_coeff_probs hybrid_coef_probs_16x16[BLOCK_TYPES_16X16];
+  vp9_coeff_probs coef_probs_4x4[BLOCK_TYPES];
+  vp9_coeff_probs coef_probs_8x8[BLOCK_TYPES];
+  vp9_coeff_probs coef_probs_16x16[BLOCK_TYPES];
   vp9_coeff_probs coef_probs_32x32[BLOCK_TYPES_32X32];
 
   vp9_prob sb_ymode_prob[VP9_I32X32_MODES - 1];
@@ -259,7 +259,9 @@ typedef struct {
   int optimize_coefficients;
   int no_skip_block4x4_search;
   int search_best_filter;
-
+  int splitmode_breakout;
+  int mb16_breakout;
+  int static_segmentation;
 } SPEED_FEATURES;
 
 typedef struct {
@@ -301,41 +303,14 @@ typedef struct VP9_COMP {
   DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][16]);
   DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][16]);
 
-  DECLARE_ALIGNED(16, short, Y2quant[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, unsigned char, Y2quant_shift[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, Y2zbin[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, Y2round[QINDEX_RANGE][16]);
-
   DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][16]);
   DECLARE_ALIGNED(16, unsigned char, UVquant_shift[QINDEX_RANGE][16]);
   DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][16]);
   DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][16]);
 
   DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2[QINDEX_RANGE][16]);
   DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]);
 
-  DECLARE_ALIGNED(64, short, Y1zbin_8x8[QINDEX_RANGE][64]);
-  DECLARE_ALIGNED(64, short, Y2zbin_8x8[QINDEX_RANGE][64]);
-  DECLARE_ALIGNED(64, short, UVzbin_8x8[QINDEX_RANGE][64]);
-  DECLARE_ALIGNED(64, short, zrun_zbin_boost_y1_8x8[QINDEX_RANGE][64]);
-  DECLARE_ALIGNED(64, short, zrun_zbin_boost_y2_8x8[QINDEX_RANGE][64]);
-  DECLARE_ALIGNED(64, short, zrun_zbin_boost_uv_8x8[QINDEX_RANGE][64]);
-
-  DECLARE_ALIGNED(16, short, Y1zbin_16x16[QINDEX_RANGE][256]);
-  DECLARE_ALIGNED(16, short, Y2zbin_16x16[QINDEX_RANGE][256]);
-  DECLARE_ALIGNED(16, short, UVzbin_16x16[QINDEX_RANGE][256]);
-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1_16x16[QINDEX_RANGE][256]);
-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2_16x16[QINDEX_RANGE][256]);
-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv_16x16[QINDEX_RANGE][256]);
-
-  DECLARE_ALIGNED(16, short, Y1zbin_32x32[QINDEX_RANGE][1024]);
-  DECLARE_ALIGNED(16, short, Y2zbin_32x32[QINDEX_RANGE][1024]);
-  DECLARE_ALIGNED(16, short, UVzbin_32x32[QINDEX_RANGE][1024]);
-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1_32x32[QINDEX_RANGE][1024]);
-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2_32x32[QINDEX_RANGE][1024]);
-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv_32x32[QINDEX_RANGE][1024]);
-
   MACROBLOCK mb;
   VP9_COMMON common;
   VP9_CONFIG oxcf;
@@ -357,11 +332,17 @@ typedef struct VP9_COMP {
   int alt_is_last;  // Alt reference frame same as last ( short circuit altref search)
   int gold_is_alt;  // don't do both alt and gold search ( just do gold).
 
-  // int refresh_alt_ref_frame;
+  int scaled_ref_idx[3];
+  int lst_fb_idx;
+  int gld_fb_idx;
+  int alt_fb_idx;
+  int refresh_last_frame;
+  int refresh_golden_frame;
+  int refresh_alt_ref_frame;
   YV12_BUFFER_CONFIG last_frame_uf;
 
   TOKENEXTRA *tok;
-  unsigned int tok_count;
+  unsigned int tok_count[1 << 6];
 
 
   unsigned int frames_since_key;
@@ -441,7 +422,6 @@ typedef struct VP9_COMP {
   double tot_q;
   double avg_q;
 
-  int zbin_over_quant;
   int zbin_mode_boost;
   int zbin_mode_boost_enabled;
 
@@ -484,26 +464,17 @@ typedef struct VP9_COMP {
 
   nmv_context_counts NMVcount;
 
-  vp9_coeff_count coef_counts_4x4[BLOCK_TYPES_4X4];
-  vp9_coeff_probs frame_coef_probs_4x4[BLOCK_TYPES_4X4];
-  vp9_coeff_stats frame_branch_ct_4x4[BLOCK_TYPES_4X4];
-  vp9_coeff_count hybrid_coef_counts_4x4[BLOCK_TYPES_4X4];
-  vp9_coeff_probs frame_hybrid_coef_probs_4x4[BLOCK_TYPES_4X4];
-  vp9_coeff_stats frame_hybrid_branch_ct_4x4[BLOCK_TYPES_4X4];
-
-  vp9_coeff_count coef_counts_8x8[BLOCK_TYPES_8X8];
-  vp9_coeff_probs frame_coef_probs_8x8[BLOCK_TYPES_8X8];
-  vp9_coeff_stats frame_branch_ct_8x8[BLOCK_TYPES_8X8];
-  vp9_coeff_count hybrid_coef_counts_8x8[BLOCK_TYPES_8X8];
-  vp9_coeff_probs frame_hybrid_coef_probs_8x8[BLOCK_TYPES_8X8];
-  vp9_coeff_stats frame_hybrid_branch_ct_8x8[BLOCK_TYPES_8X8];
-
-  vp9_coeff_count coef_counts_16x16[BLOCK_TYPES_16X16];
-  vp9_coeff_probs frame_coef_probs_16x16[BLOCK_TYPES_16X16];
-  vp9_coeff_stats frame_branch_ct_16x16[BLOCK_TYPES_16X16];
-  vp9_coeff_count hybrid_coef_counts_16x16[BLOCK_TYPES_16X16];
-  vp9_coeff_probs frame_hybrid_coef_probs_16x16[BLOCK_TYPES_16X16];
-  vp9_coeff_stats frame_hybrid_branch_ct_16x16[BLOCK_TYPES_16X16];
+  vp9_coeff_count coef_counts_4x4[BLOCK_TYPES];
+  vp9_coeff_probs frame_coef_probs_4x4[BLOCK_TYPES];
+  vp9_coeff_stats frame_branch_ct_4x4[BLOCK_TYPES];
+
+  vp9_coeff_count coef_counts_8x8[BLOCK_TYPES];
+  vp9_coeff_probs frame_coef_probs_8x8[BLOCK_TYPES];
+  vp9_coeff_stats frame_branch_ct_8x8[BLOCK_TYPES];
+
+  vp9_coeff_count coef_counts_16x16[BLOCK_TYPES];
+  vp9_coeff_probs frame_coef_probs_16x16[BLOCK_TYPES];
+  vp9_coeff_stats frame_branch_ct_16x16[BLOCK_TYPES];
 
   vp9_coeff_count coef_counts_32x32[BLOCK_TYPES_32X32];
   vp9_coeff_probs frame_coef_probs_32x32[BLOCK_TYPES_32X32];
@@ -683,9 +654,6 @@ typedef struct VP9_COMP {
 
   int droppable;
 
-  // TODO Do we still need this??
-  int update_context;
-
   int dummy_packing;    /* flag to indicate if packing is dummy */
 
   unsigned int switchable_interp_count[VP9_SWITCHABLE_FILTERS + 1]
@@ -696,6 +664,8 @@ typedef struct VP9_COMP {
   unsigned int mb_mv_ref_count[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
 #endif
 
+  int initial_width;
+  int initial_height;
 } VP9_COMP;
 
 void vp9_encode_frame(VP9_COMP *cpi);
diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c
index b443ede6f..6f9333521 100644
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
+#include <assert.h>
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/encoder/vp9_onyx_int.h"
 #include "vp9/encoder/vp9_picklpf.h"
@@ -27,6 +27,7 @@ void vp9_yv12_copy_partial_frame_c(YV12_BUFFER_CONFIG *src_ybc,
   int yoffset;
   int linestocopy;
 
+  assert(src_ybc->y_stride == dst_ybc->y_stride);
   yheight  = src_ybc->y_height;
   ystride  = src_ybc->y_stride;
 
diff --git a/vp9/encoder/vp9_psnr.c b/vp9/encoder/vp9_psnr.c
index eb00f4159..94394341d 100644
--- a/vp9/encoder/vp9_psnr.c
+++ b/vp9/encoder/vp9_psnr.c
@@ -11,17 +11,16 @@
 
 #include "vpx_scale/yv12config.h"
 #include "math.h"
-#include "vp9/common/vp9_systemdependent.h" /* for vp9_clear_system_state() */
 
 #define MAX_PSNR 100
 
-double vp9_mse2psnr(double Samples, double Peak, double Mse) {
+double vp9_mse2psnr(double samples, double peak, double mse) {
   double psnr;
 
-  if ((double)Mse > 0.0)
-    psnr = 10.0 * log10(Peak * Peak * Samples / Mse);
+  if (mse > 0.0)
+    psnr = 10.0 * log10(peak * peak * samples / mse);
   else
-    psnr = MAX_PSNR;      // Limit to prevent / 0
+    psnr = MAX_PSNR;  // Limit to prevent / 0
 
   if (psnr > MAX_PSNR)
     psnr = MAX_PSNR;
diff --git a/vp9/encoder/vp9_psnr.h b/vp9/encoder/vp9_psnr.h
index 121f0dc98..15dd8366b 100644
--- a/vp9/encoder/vp9_psnr.h
+++ b/vp9/encoder/vp9_psnr.h
@@ -12,6 +12,6 @@
 #ifndef VP9_ENCODER_VP9_PSNR_H_
 #define VP9_ENCODER_VP9_PSNR_H_
 
-extern double vp9_mse2psnr(double Samples, double Peak, double Mse);
+double vp9_mse2psnr(double samples, double peak, double mse);
 
 #endif  // VP9_ENCODER_VP9_PSNR_H_
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 36b656713..399e8ecda 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -21,7 +21,10 @@
 extern int enc_debug;
 #endif
 
-void vp9_ht_quantize_b_4x4(BLOCK *b, BLOCKD *d, TX_TYPE tx_type) {
+void vp9_ht_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type) {
+  MACROBLOCKD *const xd = &mb->e_mbd;
+  BLOCK *const b = &mb->block[b_idx];
+  BLOCKD *const d = &xd->block[b_idx];
   int i, rc, eob;
   int zbin;
   int x, y, z, sz;
@@ -57,35 +60,40 @@ void vp9_ht_quantize_b_4x4(BLOCK *b, BLOCKD *d, TX_TYPE tx_type) {
 
   eob = -1;
 
-  for (i = 0; i < b->eob_max_offset; i++) {
-    rc   = pt_scan[i];
-    z    = coeff_ptr[rc];
-
-    zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
-    zbin_boost_ptr ++;
-
-    sz = (z >> 31);                                 // sign of z
-    x  = (z ^ sz) - sz;                             // x = abs(z)
-
-    if (x >= zbin) {
-      x += round_ptr[rc];
-      y  = (((x * quant_ptr[rc]) >> 16) + x)
-           >> quant_shift_ptr[rc];                // quantize (x)
-      x  = (y ^ sz) - sz;                         // get the sign back
-      qcoeff_ptr[rc]  = x;                        // write to destination
-      dqcoeff_ptr[rc] = x * dequant_ptr[rc];      // dequantized value
-
-      if (y) {
-        eob = i;                                // last nonzero coeffs
-        zbin_boost_ptr = b->zrun_zbin_boost;    // reset zero runlength
+  if (!b->skip_block) {
+    for (i = 0; i < 16; i++) {
+      rc   = pt_scan[i];
+      z    = coeff_ptr[rc];
+
+      zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
+      zbin_boost_ptr++;
+
+      sz = (z >> 31);                                 // sign of z
+      x  = (z ^ sz) - sz;                             // x = abs(z)
+
+      if (x >= zbin) {
+        x += round_ptr[rc];
+        y  = (((x * quant_ptr[rc]) >> 16) + x)
+             >> quant_shift_ptr[rc];                // quantize (x)
+        x  = (y ^ sz) - sz;                         // get the sign back
+        qcoeff_ptr[rc]  = x;                        // write to destination
+        dqcoeff_ptr[rc] = x * dequant_ptr[rc];      // dequantized value
+
+        if (y) {
+          eob = i;                                // last nonzero coeffs
+          zbin_boost_ptr = b->zrun_zbin_boost;    // reset zero runlength
+        }
       }
     }
   }
 
-  d->eob = eob + 1;
+  xd->eobs[b_idx] = eob + 1;
 }
 
-void vp9_regular_quantize_b_4x4(BLOCK *b, BLOCKD *d) {
+void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx) {
+  MACROBLOCKD *const xd = &mb->e_mbd;
+  BLOCK *const b = &mb->block[b_idx];
+  BLOCKD *const d = &xd->block[b_idx];
   int i, rc, eob;
   int zbin;
   int x, y, z, sz;
@@ -105,64 +113,55 @@ void vp9_regular_quantize_b_4x4(BLOCK *b, BLOCKD *d) {
 
   eob = -1;
 
-  for (i = 0; i < b->eob_max_offset; i++) {
-    rc   = vp9_default_zig_zag1d_4x4[i];
-    z    = coeff_ptr[rc];
+  if (!b->skip_block) {
+    for (i = 0; i < 16; i++) {
+      rc   = vp9_default_zig_zag1d_4x4[i];
+      z    = coeff_ptr[rc];
 
-    zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
-    zbin_boost_ptr ++;
+      zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
+      zbin_boost_ptr++;
 
-    sz = (z >> 31);                                 // sign of z
-    x  = (z ^ sz) - sz;                             // x = abs(z)
+      sz = (z >> 31);                                 // sign of z
+      x  = (z ^ sz) - sz;                             // x = abs(z)
 
-    if (x >= zbin) {
-      x += round_ptr[rc];
+      if (x >= zbin) {
+        x += round_ptr[rc];
 
-      y  = (((x * quant_ptr[rc]) >> 16) + x)
-           >> quant_shift_ptr[rc];                // quantize (x)
-      x  = (y ^ sz) - sz;                         // get the sign back
-      qcoeff_ptr[rc]  = x;                        // write to destination
-      dqcoeff_ptr[rc] = x * dequant_ptr[rc];      // dequantized value
+        y  = (((x * quant_ptr[rc]) >> 16) + x)
+             >> quant_shift_ptr[rc];                // quantize (x)
+        x  = (y ^ sz) - sz;                         // get the sign back
+        qcoeff_ptr[rc]  = x;                        // write to destination
+        dqcoeff_ptr[rc] = x * dequant_ptr[rc];      // dequantized value
 
-      if (y) {
-        eob = i;                                // last nonzero coeffs
-        zbin_boost_ptr = b->zrun_zbin_boost;    // reset zero runlength
+        if (y) {
+          eob = i;                                // last nonzero coeffs
+          zbin_boost_ptr = b->zrun_zbin_boost;    // reset zero runlength
+        }
       }
     }
   }
 
-  d->eob = eob + 1;
+  xd->eobs[b_idx] = eob + 1;
 }
 
 void vp9_quantize_mby_4x4_c(MACROBLOCK *x) {
   int i;
-  int has_2nd_order = get_2nd_order_usage(&x->e_mbd);
 
   for (i = 0; i < 16; i++) {
     TX_TYPE tx_type = get_tx_type_4x4(&x->e_mbd, &x->e_mbd.block[i]);
     if (tx_type != DCT_DCT) {
-      assert(has_2nd_order == 0);
-      vp9_ht_quantize_b_4x4(&x->block[i], &x->e_mbd.block[i], tx_type);
+      vp9_ht_quantize_b_4x4(x, i, tx_type);
     } else {
-      x->quantize_b_4x4(&x->block[i], &x->e_mbd.block[i]);
+      x->quantize_b_4x4(x, i);
     }
   }
-  if (has_2nd_order) {
-    x->quantize_b_4x4(&x->block[24], &x->e_mbd.block[24]);
-  } else {
-    vpx_memset(x->e_mbd.block[24].qcoeff, 0,
-               16 * sizeof(x->e_mbd.block[24].qcoeff[0]));
-    vpx_memset(x->e_mbd.block[24].dqcoeff, 0,
-               16 * sizeof(x->e_mbd.block[24].dqcoeff[0]));
-    x->e_mbd.block[24].eob = 0;
-  }
 }
 
 void vp9_quantize_mbuv_4x4_c(MACROBLOCK *x) {
   int i;
 
   for (i = 16; i < 24; i++)
-    x->quantize_b_4x4(&x->block[i], &x->e_mbd.block[i]);
+    x->quantize_b_4x4(x, i);
 }
 
 void vp9_quantize_mb_4x4_c(MACROBLOCK *x) {
@@ -170,138 +169,101 @@ void vp9_quantize_mb_4x4_c(MACROBLOCK *x) {
   vp9_quantize_mbuv_4x4_c(x);
 }
 
-void vp9_regular_quantize_b_2x2(BLOCK *b, BLOCKD *d) {
-  int i, rc, eob;
-  int zbin;
-  int x, y, z, sz;
-  int16_t *zbin_boost_ptr = b->zrun_zbin_boost;
-  int zbin_zrun_index = 0;
-  int16_t *coeff_ptr  = b->coeff;
-  int16_t *zbin_ptr   = b->zbin;
-  int16_t *round_ptr  = b->round;
-  int16_t *quant_ptr  = b->quant;
-  uint8_t *quant_shift_ptr = b->quant_shift;
-  int16_t *qcoeff_ptr = d->qcoeff;
-  int16_t *dqcoeff_ptr = d->dqcoeff;
-  int16_t *dequant_ptr = d->dequant;
-  int zbin_oq_value    = b->zbin_extra;
-  // double q2nd = 4;
-  vpx_memset(qcoeff_ptr, 0, 32);
-  vpx_memset(dqcoeff_ptr, 0, 32);
-
-  eob = -1;
-
-  for (i = 0; i < b->eob_max_offset_8x8; i++) {
-    rc   = vp9_default_zig_zag1d_4x4[i];
-    z    = coeff_ptr[rc];
-
-    zbin_boost_ptr = &b->zrun_zbin_boost[zbin_zrun_index];
-    zbin_zrun_index += 4;
-    zbin = (zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value);
-
-    sz = (z >> 31);                               // sign of z
-    x  = (z ^ sz) - sz;                           // x = abs(z)
-
-    if (x >= zbin) {
-      x += (round_ptr[rc]);
-      y  = ((int)((int)(x * quant_ptr[rc]) >> 16) + x)
-           >> quant_shift_ptr[rc];                // quantize (x)
-      x  = (y ^ sz) - sz;                         // get the sign back
-      qcoeff_ptr[rc]  = x;                        // write to destination
-      dqcoeff_ptr[rc] = x * dequant_ptr[rc];      // dequantized value
-
-      if (y) {
-        eob = i;                                  // last nonzero coeffs
-        zbin_zrun_index = 0;
-      }
-    }
-  }
-
-  d->eob = eob + 1;
-}
-
-void vp9_regular_quantize_b_8x8(BLOCK *b, BLOCKD *d) {
-  int i, rc, eob;
-  int zbin;
-  int x, y, z, sz;
-  int16_t *zbin_boost_ptr = b->zrun_zbin_boost_8x8;
-  int16_t *coeff_ptr  = b->coeff;
-  int16_t *zbin_ptr   = b->zbin_8x8;
-  int16_t *round_ptr  = b->round;
-  int16_t *quant_ptr  = b->quant;
-  uint8_t *quant_shift_ptr = b->quant_shift;
+void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx) {
+  MACROBLOCKD *const xd = &mb->e_mbd;
+  BLOCK *const b = &mb->block[b_idx];
+  BLOCKD *const d = &xd->block[b_idx];
   int16_t *qcoeff_ptr = d->qcoeff;
   int16_t *dqcoeff_ptr = d->dqcoeff;
-  int16_t *dequant_ptr = d->dequant;
-  int zbin_oq_value = b->zbin_extra;
 
   vpx_memset(qcoeff_ptr, 0, 64 * sizeof(int16_t));
   vpx_memset(dqcoeff_ptr, 0, 64 * sizeof(int16_t));
 
-  eob = -1;
-
-  for (i = 0; i < b->eob_max_offset_8x8; i++) {
-    rc   = vp9_default_zig_zag1d_8x8[i];
-    z    = coeff_ptr[rc];
-
-    zbin = (zbin_ptr[rc != 0] + *zbin_boost_ptr + zbin_oq_value);
-    zbin_boost_ptr++;
-
-    sz = (z >> 31);                               // sign of z
-    x  = (z ^ sz) - sz;                           // x = abs(z)
-
-    if (x >= zbin) {
-      x += (round_ptr[rc != 0]);
-      y  = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x))
-           >> quant_shift_ptr[rc != 0];            // quantize (x)
-      x  = (y ^ sz) - sz;                         // get the sign back
-      qcoeff_ptr[rc]  = x;                        // write to destination
-      dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0]; // dequantized value
-
-      if (y) {
-        eob = i;                                  // last nonzero coeffs
-        zbin_boost_ptr = b->zrun_zbin_boost_8x8;
+  if (!b->skip_block) {
+    int i, rc, eob;
+    int zbin;
+    int x, y, z, sz;
+    int zero_run;
+    int16_t *zbin_boost_ptr = b->zrun_zbin_boost;
+    int16_t *coeff_ptr  = b->coeff;
+    int16_t *zbin_ptr   = b->zbin;
+    int16_t *round_ptr  = b->round;
+    int16_t *quant_ptr  = b->quant;
+    uint8_t *quant_shift_ptr = b->quant_shift;
+    int16_t *dequant_ptr = d->dequant;
+    int zbin_oq_value = b->zbin_extra;
+
+    eob = -1;
+
+    // DC is peeled out of the loop: it uses entry 0 of {zbin, round, quant,
+    // quant_shift, dequant}_ptr, which previously required "[rc != 0]".
+    {
+      z    = coeff_ptr[0];
+      zbin = (zbin_ptr[0] + zbin_boost_ptr[0] + zbin_oq_value);
+      zero_run = 1;
+
+      sz = (z >> 31);                                // sign of z
+      x  = (z ^ sz) - sz;                            // x = abs(z)
+
+      if (x >= zbin) {
+        x += (round_ptr[0]);
+        y  = ((int)(((int)(x * quant_ptr[0]) >> 16) + x))
+             >> quant_shift_ptr[0];                  // quantize (x)
+        x  = (y ^ sz) - sz;                          // get the sign back
+        qcoeff_ptr[0]  = x;                          // write to destination
+        dqcoeff_ptr[0] = x * dequant_ptr[0];         // dequantized value
+
+        if (y) {
+          eob = 0;                                   // last nonzero coeffs
+          zero_run = 0;
+        }
+      }
+    }
+    for (i = 1; i < 64; i++) {
+      rc   = vp9_default_zig_zag1d_8x8[i];
+      z    = coeff_ptr[rc];
+      zbin = (zbin_ptr[1] + zbin_boost_ptr[zero_run] + zbin_oq_value);
+      // The original code was incrementing zero_run while keeping it at
+      // maximum 15 by adding "(zero_run < 15)". The same is achieved by
+      // subtracting the sign mask of "(zero_run - 15)" (-1 while below 15).
+      zero_run -= (zero_run - 15) >> 31;
+
+      sz = (z >> 31);                                // sign of z
+      x  = (z ^ sz) - sz;                            // x = abs(z)
+
+      if (x >= zbin) {
+        x += (round_ptr[rc != 0]);
+        y  = ((int)(((int)(x * quant_ptr[1]) >> 16) + x))
+             >> quant_shift_ptr[1];                  // quantize (x)
+        x  = (y ^ sz) - sz;                          // get the sign back
+        qcoeff_ptr[rc]  = x;                         // write to destination
+        dqcoeff_ptr[rc] = x * dequant_ptr[1];        // dequantized value
+
+        if (y) {
+          eob = i;                                   // last nonzero coeffs
+          zero_run = 0;
+        }
       }
     }
+    xd->eobs[b_idx] = eob + 1;
+  } else {
+    xd->eobs[b_idx] = 0;
   }
-
-  d->eob = eob + 1;
 }
 
 void vp9_quantize_mby_8x8(MACROBLOCK *x) {
   int i;
-  int has_2nd_order = get_2nd_order_usage(&x->e_mbd);
 
-  for (i = 0; i < 16; i ++) {
-    x->e_mbd.block[i].eob = 0;
-  }
-  x->e_mbd.block[24].eob = 0;
   for (i = 0; i < 16; i += 4) {
-    int ib = (i & 8) + ((i & 4) >> 1);
-    TX_TYPE tx_type = get_tx_type_8x8(&x->e_mbd, &x->e_mbd.block[ib]);
-    if (tx_type != DCT_DCT)
-      assert(has_2nd_order == 0);
-    x->quantize_b_8x8(&x->block[i], &x->e_mbd.block[i]);
-  }
-
-  if (has_2nd_order) {
-    x->quantize_b_2x2(&x->block[24], &x->e_mbd.block[24]);
-  } else {
-    vpx_memset(x->e_mbd.block[24].qcoeff, 0,
-               16 * sizeof(x->e_mbd.block[24].qcoeff[0]));
-    vpx_memset(x->e_mbd.block[24].dqcoeff, 0,
-               16 * sizeof(x->e_mbd.block[24].dqcoeff[0]));
-    x->e_mbd.block[24].eob = 0;
+    x->quantize_b_8x8(x, i);
   }
 }
 
 void vp9_quantize_mbuv_8x8(MACROBLOCK *x) {
   int i;
 
-  for (i = 16; i < 24; i ++)
-    x->e_mbd.block[i].eob = 0;
   for (i = 16; i < 24; i += 4)
-    x->quantize_b_8x8(&x->block[i], &x->e_mbd.block[i]);
+    x->quantize_b_8x8(x, i);
 }
 
 void vp9_quantize_mb_8x8(MACROBLOCK *x) {
@@ -310,12 +272,7 @@ void vp9_quantize_mb_8x8(MACROBLOCK *x) {
 }
 
 void vp9_quantize_mby_16x16(MACROBLOCK *x) {
-  int i;
-
-  for (i = 0; i < 16; i++)
-    x->e_mbd.block[i].eob = 0;
-  x->e_mbd.block[24].eob = 0;
-  x->quantize_b_16x16(&x->block[0], &x->e_mbd.block[0]);
+  x->quantize_b_16x16(x, 0);
 }
 
 void vp9_quantize_mb_16x16(MACROBLOCK *x) {
@@ -324,42 +281,46 @@ void vp9_quantize_mb_16x16(MACROBLOCK *x) {
 }
 
 static void quantize(int16_t *zbin_boost_orig_ptr,
-                     int16_t *coeff_ptr, int n_coeffs, int max_coeffs,
+                     int16_t *coeff_ptr, int n_coeffs, int skip_block,
                      int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr,
                      uint8_t *quant_shift_ptr,
                      int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                      int16_t *dequant_ptr, int zbin_oq_value,
-                     int *eob_ptr, const int *scan, int mul) {
+                     uint16_t *eob_ptr, const int *scan, int mul) {
   int i, rc, eob;
   int zbin;
   int x, y, z, sz;
+  int zero_run = 0;
   int16_t *zbin_boost_ptr = zbin_boost_orig_ptr;
 
   vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
   vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
 
   eob = -1;
-  for (i = 0; i < max_coeffs; i++) {
-    rc   = scan[i];
-    z    = coeff_ptr[rc] * mul;
-
-    zbin = (zbin_ptr[rc!=0] + *zbin_boost_ptr + zbin_oq_value);
-    zbin_boost_ptr ++;
-
-    sz = (z >> 31);                               // sign of z
-    x  = (z ^ sz) - sz;                           // x = abs(z)
-
-    if (x >= zbin) {
-      x += (round_ptr[rc!=0]);
-      y  = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x))
-          >> quant_shift_ptr[rc!=0];              // quantize (x)
-      x  = (y ^ sz) - sz;                         // get the sign back
-      qcoeff_ptr[rc]  = x;                        // write to destination
-      dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / mul;  // dequantized value
-
-      if (y) {
-        eob = i;                                  // last nonzero coeffs
-        zbin_boost_ptr = zbin_boost_orig_ptr;
+
+  if (!skip_block) {
+    for (i = 0; i < n_coeffs; i++) {
+      rc   = scan[i];
+      z    = coeff_ptr[rc] * mul;
+
+      zbin = (zbin_ptr[rc != 0] + zbin_boost_ptr[zero_run] + zbin_oq_value);
+      zero_run += (zero_run < 15);
+
+      sz = (z >> 31);                               // sign of z
+      x  = (z ^ sz) - sz;                           // x = abs(z)
+
+      if (x >= zbin) {
+        x += (round_ptr[rc != 0]);
+        y  = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x))
+            >> quant_shift_ptr[rc != 0];            // quantize (x)
+        x  = (y ^ sz) - sz;                         // get the sign back
+        qcoeff_ptr[rc]  = x;                        // write to destination
+        dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / mul;  // dequantized value
+
+        if (y) {
+          eob = i;                                  // last nonzero coeffs
+          zero_run = 0;
+        }
       }
     }
   }
@@ -367,49 +328,54 @@ static void quantize(int16_t *zbin_boost_orig_ptr,
   *eob_ptr = eob + 1;
 }
 
-void vp9_regular_quantize_b_16x16(BLOCK *b, BLOCKD *d) {
-  quantize(b->zrun_zbin_boost_16x16,
+void vp9_regular_quantize_b_16x16(MACROBLOCK *mb, int b_idx) {
+  MACROBLOCKD *const xd = &mb->e_mbd;
+  BLOCK *const b = &mb->block[b_idx];
+  BLOCKD *const d = &xd->block[b_idx];
+  quantize(b->zrun_zbin_boost,
            b->coeff,
-           256, b->eob_max_offset_16x16,
-           b->zbin_16x16, b->round, b->quant, b->quant_shift,
+           256, b->skip_block,
+           b->zbin, b->round, b->quant, b->quant_shift,
            d->qcoeff,
            d->dqcoeff,
            d->dequant,
            b->zbin_extra,
-           &d->eob, vp9_default_zig_zag1d_16x16, 1);
+           &xd->eobs[b_idx], vp9_default_zig_zag1d_16x16, 1);
 }
 
 void vp9_quantize_sby_32x32(MACROBLOCK *x) {
-  x->e_mbd.block[0].eob = 0;
-  quantize(x->block[0].zrun_zbin_boost_32x32,
+  MACROBLOCKD *xd = &x->e_mbd;
+  BLOCK *b = &x->block[0];
+  BLOCKD *d = &xd->block[0];
+
+  quantize(b->zrun_zbin_boost,
            x->sb_coeff_data.coeff,
-           1024, x->block[0].eob_max_offset_32x32,
-           x->block[0].zbin_32x32,
-           x->block[0].round, x->block[0].quant, x->block[0].quant_shift,
-           x->e_mbd.sb_coeff_data.qcoeff,
-           x->e_mbd.sb_coeff_data.dqcoeff,
-           x->e_mbd.block[0].dequant,
-           x->block[0].zbin_extra,
-           &x->e_mbd.block[0].eob,
+           1024, b->skip_block,
+           b->zbin,
+           b->round, b->quant, b->quant_shift,
+           xd->sb_coeff_data.qcoeff,
+           xd->sb_coeff_data.dqcoeff,
+           d->dequant,
+           b->zbin_extra,
+           &xd->eobs[0],
            vp9_default_zig_zag1d_32x32, 2);
 }
 
 void vp9_quantize_sbuv_16x16(MACROBLOCK *x) {
   int i;
+  MACROBLOCKD *xd = &x->e_mbd;
 
-  x->e_mbd.block[16].eob = 0;
-  x->e_mbd.block[20].eob = 0;
   for (i = 16; i < 24; i += 4)
-    quantize(x->block[i].zrun_zbin_boost_16x16,
+    quantize(x->block[i].zrun_zbin_boost,
              x->sb_coeff_data.coeff + 1024 + (i - 16) * 64,
-             256, x->block[i].eob_max_offset_16x16,
-             x->block[i].zbin_16x16,
+             256, x->block[i].skip_block,
+             x->block[i].zbin,
              x->block[i].round, x->block[0].quant, x->block[i].quant_shift,
-             x->e_mbd.sb_coeff_data.qcoeff + 1024 + (i - 16) * 64,
-             x->e_mbd.sb_coeff_data.dqcoeff + 1024 + (i - 16) * 64,
-             x->e_mbd.block[i].dequant,
+             xd->sb_coeff_data.qcoeff + 1024 + (i - 16) * 64,
+             xd->sb_coeff_data.dqcoeff + 1024 + (i - 16) * 64,
+             xd->block[i].dequant,
              x->block[i].zbin_extra,
-             &x->e_mbd.block[i].eob,
+             &xd->eobs[i],
              vp9_default_zig_zag1d_16x16, 1);
 }
 
@@ -417,10 +383,9 @@ void vp9_quantize_sbuv_16x16(MACROBLOCK *x) {
  * these two C functions if corresponding optimized routine is not available.
  * NEON optimized version implements currently the fast quantization for pair
  * of blocks. */
-void vp9_regular_quantize_b_4x4_pair(BLOCK *b1, BLOCK *b2,
-                                     BLOCKD *d1, BLOCKD *d2) {
-  vp9_regular_quantize_b_4x4(b1, d1);
-  vp9_regular_quantize_b_4x4(b2, d2);
+void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *x, int b_idx1, int b_idx2) {
+  vp9_regular_quantize_b_4x4(x, b_idx1);
+  vp9_regular_quantize_b_4x4(x, b_idx2);
 }
 
 static void invert_quant(int16_t *quant,
@@ -439,164 +404,33 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
   int i;
   int quant_val;
   int Q;
-  static const int zbin_boost[16] = {  0,  0,  8, 10, 12, 14, 16, 20,
-                                      24, 28, 32, 36, 40, 44, 44, 44
-                                    };
-
-  static const int zbin_boost_8x8[64] = {  0,  0,  0,  8,  8,  8, 10, 12,
-                                          14, 16, 18, 20, 22, 24, 26, 28,
-                                          30, 32, 34, 36, 38, 40, 42, 44,
-                                          46, 48, 48, 48, 48, 48, 48, 48,
-                                          48, 48, 48, 48, 48, 48, 48, 48,
-                                          48, 48, 48, 48, 48, 48, 48, 48,
-                                          48, 48, 48, 48, 48, 48, 48, 48,
-                                          48, 48, 48, 48, 48, 48, 48, 48
-                                        };
-  static const int zbin_boost_16x16[256] = {
-     0,  0,  0,  8,  8,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28,
-    30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-  };
-  static const int zbin_boost_32x32[1024] = {
-    0,  0,  0,  8,  8,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28,
-    30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-  };
-  int qrounding_factor = 48;
 
+  static const int zbin_boost[16] = { 0,  0,  0,  8,  8,  8, 10, 12,
+                                     14, 16, 20, 24, 28, 32, 36, 40 };
 
   for (Q = 0; Q < QINDEX_RANGE; Q++) {
     int qzbin_factor = (vp9_dc_quant(Q, 0) < 148) ? 84 : 80;
-
-#if CONFIG_LOSSLESS
-    if (cpi->oxcf.lossless) {
-      if (Q == 0) {
-        qzbin_factor = 64;
-        qrounding_factor = 64;
-      }
+    int qrounding_factor = 48;
+    if (Q == 0) {
+      qzbin_factor = 64;
+      qrounding_factor = 64;
     }
-#endif
-
     // dc values
     quant_val = vp9_dc_quant(Q, cpi->common.y1dc_delta_q);
     invert_quant(cpi->Y1quant[Q] + 0,
                  cpi->Y1quant_shift[Q] + 0, quant_val);
     cpi->Y1zbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
-    cpi->Y1zbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
-    cpi->Y1zbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
     cpi->Y1round[Q][0] = (qrounding_factor * quant_val) >> 7;
     cpi->common.Y1dequant[Q][0] = quant_val;
     cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;
-    cpi->zrun_zbin_boost_y1_8x8[Q][0] =
-      ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
-    cpi->zrun_zbin_boost_y1_16x16[Q][0] =
-      ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
-    cpi->Y1zbin_32x32[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
-    cpi->zrun_zbin_boost_y1_32x32[Q][0] =
-     ((quant_val * zbin_boost_32x32[0]) + 64) >> 7;
-
-
-    quant_val = vp9_dc2quant(Q, cpi->common.y2dc_delta_q);
-    invert_quant(cpi->Y2quant[Q] + 0,
-                 cpi->Y2quant_shift[Q] + 0, quant_val);
-    cpi->Y2zbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
-    cpi->Y2zbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
-    cpi->Y2zbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
-    cpi->Y2round[Q][0] = (qrounding_factor * quant_val) >> 7;
-    cpi->common.Y2dequant[Q][0] = quant_val;
-    cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;
-    cpi->zrun_zbin_boost_y2_8x8[Q][0] =
-      ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
-    cpi->zrun_zbin_boost_y2_16x16[Q][0] =
-      ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
 
     quant_val = vp9_dc_uv_quant(Q, cpi->common.uvdc_delta_q);
     invert_quant(cpi->UVquant[Q] + 0,
                  cpi->UVquant_shift[Q] + 0, quant_val);
     cpi->UVzbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
-    cpi->UVzbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
-    cpi->UVzbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
     cpi->UVround[Q][0] = (qrounding_factor * quant_val) >> 7;
     cpi->common.UVdequant[Q][0] = quant_val;
     cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7;
-    cpi->zrun_zbin_boost_uv_8x8[Q][0] =
-      ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
-    cpi->zrun_zbin_boost_uv_16x16[Q][0] =
-      ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
 
     // all the 4x4 ac values =;
     for (i = 1; i < 16; i++) {
@@ -611,15 +445,6 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
       cpi->zrun_zbin_boost_y1[Q][i] =
         ((quant_val * zbin_boost[i]) + 64) >> 7;
 
-      quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q);
-      invert_quant(cpi->Y2quant[Q] + rc,
-                   cpi->Y2quant_shift[Q] + rc, quant_val);
-      cpi->Y2zbin[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->Y2round[Q][rc] = (qrounding_factor * quant_val) >> 7;
-      cpi->common.Y2dequant[Q][rc] = quant_val;
-      cpi->zrun_zbin_boost_y2[Q][i] =
-        ((quant_val * zbin_boost[i]) + 64) >> 7;
-
       quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q);
       invert_quant(cpi->UVquant[Q] + rc,
                    cpi->UVquant_shift[Q] + rc, quant_val);
@@ -629,57 +454,6 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
       cpi->zrun_zbin_boost_uv[Q][i] =
         ((quant_val * zbin_boost[i]) + 64) >> 7;
     }
-
-    // 8x8 structures... only zbin seperated out for now
-    // This needs cleaning up for 8x8 especially if we are to add
-    // support for non flat Q matices
-    for (i = 1; i < 64; i++) {
-      int rc = vp9_default_zig_zag1d_8x8[i];
-
-      quant_val = vp9_ac_yquant(Q);
-      cpi->Y1zbin_8x8[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->zrun_zbin_boost_y1_8x8[Q][i] =
-        ((quant_val * zbin_boost_8x8[i]) + 64) >> 7;
-
-      quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q);
-      cpi->Y2zbin_8x8[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->zrun_zbin_boost_y2_8x8[Q][i] =
-        ((quant_val * zbin_boost_8x8[i]) + 64) >> 7;
-
-      quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q);
-      cpi->UVzbin_8x8[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->zrun_zbin_boost_uv_8x8[Q][i] =
-        ((quant_val * zbin_boost_8x8[i]) + 64) >> 7;
-    }
-
-    // 16x16 structures. Same comment above applies.
-    for (i = 1; i < 256; i++) {
-      int rc = vp9_default_zig_zag1d_16x16[i];
-
-      quant_val = vp9_ac_yquant(Q);
-      cpi->Y1zbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->zrun_zbin_boost_y1_16x16[Q][i] =
-        ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
-
-      quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q);
-      cpi->Y2zbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->zrun_zbin_boost_y2_16x16[Q][i] =
-        ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
-
-      quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q);
-      cpi->UVzbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->zrun_zbin_boost_uv_16x16[Q][i] =
-        ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
-    }
-    // 32x32 structures. Same comment above applies.
-    for (i = 1; i < 1024; i++) {
-      int rc = vp9_default_zig_zag1d_32x32[i];
-
-      quant_val = vp9_ac_yquant(Q);
-      cpi->Y1zbin_32x32[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->zrun_zbin_boost_y1_32x32[Q][i] =
-        ((quant_val * zbin_boost_32x32[i]) + 64) >> 7;
-    }
   }
 }
 
@@ -709,106 +483,40 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
 
   // Y
   zbin_extra = (cpi->common.Y1dequant[QIndex][1] *
-                (cpi->zbin_over_quant +
-                 cpi->zbin_mode_boost +
+                (cpi->zbin_mode_boost +
                  x->act_zbin_adj)) >> 7;
 
   for (i = 0; i < 16; i++) {
     x->block[i].quant = cpi->Y1quant[QIndex];
     x->block[i].quant_shift = cpi->Y1quant_shift[QIndex];
     x->block[i].zbin = cpi->Y1zbin[QIndex];
-    x->block[i].zbin_8x8 = cpi->Y1zbin_8x8[QIndex];
-    x->block[i].zbin_16x16 = cpi->Y1zbin_16x16[QIndex];
-    x->block[i].zbin_32x32 = cpi->Y1zbin_32x32[QIndex];
     x->block[i].round = cpi->Y1round[QIndex];
     x->e_mbd.block[i].dequant = cpi->common.Y1dequant[QIndex];
     x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_y1[QIndex];
-    x->block[i].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_y1_8x8[QIndex];
-    x->block[i].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_y1_16x16[QIndex];
-    x->block[i].zrun_zbin_boost_32x32 = cpi->zrun_zbin_boost_y1_32x32[QIndex];
     x->block[i].zbin_extra = (int16_t)zbin_extra;
 
-    // Segment max eob offset feature.
-    if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) {
-      x->block[i].eob_max_offset =
-        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-      x->block[i].eob_max_offset_8x8 =
-        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-      x->block[i].eob_max_offset_16x16 =
-        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-      x->block[i].eob_max_offset_32x32 =
-      vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-    } else {
-      x->block[i].eob_max_offset = 16;
-      x->block[i].eob_max_offset_8x8 = 64;
-      x->block[i].eob_max_offset_16x16 = 256;
-      x->block[i].eob_max_offset_32x32 = 1024;
-    }
+    // Segment skip feature.
+    x->block[i].skip_block =
+      vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
   }
 
   // UV
   zbin_extra = (cpi->common.UVdequant[QIndex][1] *
-                (cpi->zbin_over_quant +
-                 cpi->zbin_mode_boost +
+                (cpi->zbin_mode_boost +
                  x->act_zbin_adj)) >> 7;
 
   for (i = 16; i < 24; i++) {
     x->block[i].quant = cpi->UVquant[QIndex];
     x->block[i].quant_shift = cpi->UVquant_shift[QIndex];
     x->block[i].zbin = cpi->UVzbin[QIndex];
-    x->block[i].zbin_8x8 = cpi->UVzbin_8x8[QIndex];
-    x->block[i].zbin_16x16 = cpi->UVzbin_16x16[QIndex];
     x->block[i].round = cpi->UVround[QIndex];
     x->e_mbd.block[i].dequant = cpi->common.UVdequant[QIndex];
     x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[QIndex];
-    x->block[i].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_uv_8x8[QIndex];
-    x->block[i].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_uv_16x16[QIndex];
-
     x->block[i].zbin_extra = (int16_t)zbin_extra;
 
-    // Segment max eob offset feature.
-    if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) {
-      x->block[i].eob_max_offset =
-        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-      x->block[i].eob_max_offset_8x8 =
-        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-      x->block[i].eob_max_offset_16x16 =
-      vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-    } else {
-      x->block[i].eob_max_offset = 16;
-      x->block[i].eob_max_offset_8x8 = 64;
-      x->block[i].eob_max_offset_16x16 = 256;
-    }
-  }
-
-  // Y2
-  zbin_extra = (cpi->common.Y2dequant[QIndex][1] *
-                ((cpi->zbin_over_quant / 2) +
-                 cpi->zbin_mode_boost +
-                 x->act_zbin_adj)) >> 7;
-
-  x->block[24].quant = cpi->Y2quant[QIndex];
-  x->block[24].quant_shift = cpi->Y2quant_shift[QIndex];
-  x->block[24].zbin = cpi->Y2zbin[QIndex];
-  x->block[24].zbin_8x8 = cpi->Y2zbin_8x8[QIndex];
-  x->block[24].zbin_16x16 = cpi->Y2zbin_16x16[QIndex];
-  x->block[24].round = cpi->Y2round[QIndex];
-  x->e_mbd.block[24].dequant = cpi->common.Y2dequant[QIndex];
-  x->block[24].zrun_zbin_boost = cpi->zrun_zbin_boost_y2[QIndex];
-  x->block[24].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_y2_8x8[QIndex];
-  x->block[24].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_y2_16x16[QIndex];
-  x->block[24].zbin_extra = (int16_t)zbin_extra;
-
-  // TBD perhaps not use for Y2
-  // Segment max eob offset feature.
-  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) {
-    x->block[24].eob_max_offset =
-      vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-    x->block[24].eob_max_offset_8x8 =
-      vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-  } else {
-    x->block[24].eob_max_offset = 16;
-    x->block[24].eob_max_offset_8x8 = 4;
+    // Segment skip feature.
+    x->block[i].skip_block =
+      vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
   }
 
   /* save this macroblock QIndex for vp9_update_zbin_extra() */
@@ -822,8 +530,7 @@ void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) {
 
   // Y
   zbin_extra = (cpi->common.Y1dequant[QIndex][1] *
-                (cpi->zbin_over_quant +
-                 cpi->zbin_mode_boost +
+                (cpi->zbin_mode_boost +
                  x->act_zbin_adj)) >> 7;
   for (i = 0; i < 16; i++) {
     x->block[i].zbin_extra = (int16_t)zbin_extra;
@@ -831,21 +538,12 @@ void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) {
 
   // UV
   zbin_extra = (cpi->common.UVdequant[QIndex][1] *
-                (cpi->zbin_over_quant +
-                 cpi->zbin_mode_boost +
+                (cpi->zbin_mode_boost +
                  x->act_zbin_adj)) >> 7;
 
   for (i = 16; i < 24; i++) {
     x->block[i].zbin_extra = (int16_t)zbin_extra;
   }
-
-  // Y2
-  zbin_extra = (cpi->common.Y2dequant[QIndex][1] *
-                ((cpi->zbin_over_quant / 2) +
-                 cpi->zbin_mode_boost +
-                 x->act_zbin_adj)) >> 7;
-
-  x->block[24].zbin_extra = (int16_t)zbin_extra;
 }
 
 void vp9_frame_init_quantizer(VP9_COMP *cpi) {
@@ -864,10 +562,8 @@ void vp9_set_quantizer(struct VP9_COMP *cpi, int Q) {
   // if any of the delta_q values are changing update flag will
   // have to be set.
   cm->y1dc_delta_q = 0;
-  cm->y2ac_delta_q = 0;
   cm->uvdc_delta_q = 0;
   cm->uvac_delta_q = 0;
-  cm->y2dc_delta_q = 0;
 
   // quantizer has to be reinitialized if any delta_q changes.
   // As there are not any here for now this is inactive code.
diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h
index ac44a751c..d338e620a 100644
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -14,10 +14,10 @@
 #include "vp9/encoder/vp9_block.h"
 
 #define prototype_quantize_block(sym) \
-  void (sym)(BLOCK *b,BLOCKD *d)
+  void (sym)(MACROBLOCK *mb, int b_idx)
 
 #define prototype_quantize_block_pair(sym) \
-  void (sym)(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
+  void (sym)(MACROBLOCK *mb, int b_idx1, int b_idx2)
 
 #define prototype_quantize_mb(sym) \
   void (sym)(MACROBLOCK *x)
@@ -27,7 +27,7 @@
 #endif
 
 #define prototype_quantize_block_type(sym) \
-  void (sym)(BLOCK *b, BLOCKD *d, TX_TYPE type)
+  void (sym)(MACROBLOCK *mb, int b_ix, TX_TYPE type)
 extern prototype_quantize_block_type(vp9_ht_quantize_b_4x4);
 
 #ifndef vp9_quantize_quantb_4x4
@@ -50,11 +50,6 @@ extern prototype_quantize_block(vp9_quantize_quantb_8x8);
 #endif
 extern prototype_quantize_block(vp9_quantize_quantb_16x16);
 
-#ifndef vp9_quantize_quantb_2x2
-#define vp9_quantize_quantb_2x2 vp9_regular_quantize_b_2x2
-#endif
-extern prototype_quantize_block(vp9_quantize_quantb_2x2);
-
 #ifndef vp9_quantize_mb_4x4
 #define vp9_quantize_mb_4x4 vp9_quantize_mb_4x4_c
 #endif
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index f663b56c9..a2a79574d 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -14,8 +14,8 @@
 #include <string.h>
 #include <limits.h>
 #include <assert.h>
+#include <math.h>
 
-#include "math.h"
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_modecont.h"
 #include "vp9/common/vp9_common.h"
@@ -25,9 +25,10 @@
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9/encoder/vp9_encodemv.h"
 #include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_seg_common.h"
 
-#define MIN_BPB_FACTOR          0.005
-#define MAX_BPB_FACTOR          50
+#define MIN_BPB_FACTOR 0.005
+#define MAX_BPB_FACTOR 50
 
 #ifdef MODE_STATS
 extern unsigned int y_modes[VP9_YMODES];
@@ -113,13 +114,19 @@ static int kfboost_qadjust(int qindex) {
   return retval;
 }
 
-int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex) {
-  if (frame_type == KEY_FRAME)
-    return (int)(4500000 / vp9_convert_qindex_to_q(qindex));
-  else
-    return (int)(2850000 / vp9_convert_qindex_to_q(qindex));
-}
+/* Rate estimate for one macroblock at quantizer index `qindex`: a constant
+ * chosen by frame type is scaled by `correction_factor`, divided by the
+ * real-valued quantizer, then rounded by adding 0.5 and truncating to int. */
+int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex,
+                    double correction_factor) {
+  const double q = vp9_convert_qindex_to_q(qindex);
+  // Key frames use a larger numerator than every other frame type.
+  const int numerator = (frame_type == KEY_FRAME) ? 4500000 : 2850000;
+  // NOTE(review): assumes q is never zero; vp9_convert_qindex_to_q() is not
+  // visible in this hunk -- confirm.
 
+  return (int)(0.5 + (numerator * correction_factor / q));
+}
 
 void vp9_save_coding_context(VP9_COMP *cpi) {
   CODING_CONTEXT *const cc = &cpi->coding_context;
@@ -168,11 +175,8 @@ void vp9_save_coding_context(VP9_COMP *cpi) {
   vp9_copy(cc->last_mode_lf_deltas, xd->last_mode_lf_deltas);
 
   vp9_copy(cc->coef_probs_4x4, cm->fc.coef_probs_4x4);
-  vp9_copy(cc->hybrid_coef_probs_4x4, cm->fc.hybrid_coef_probs_4x4);
   vp9_copy(cc->coef_probs_8x8, cm->fc.coef_probs_8x8);
-  vp9_copy(cc->hybrid_coef_probs_8x8, cm->fc.hybrid_coef_probs_8x8);
   vp9_copy(cc->coef_probs_16x16, cm->fc.coef_probs_16x16);
-  vp9_copy(cc->hybrid_coef_probs_16x16, cm->fc.hybrid_coef_probs_16x16);
   vp9_copy(cc->coef_probs_32x32, cm->fc.coef_probs_32x32);
   vp9_copy(cc->switchable_interp_prob, cm->fc.switchable_interp_prob);
 #if CONFIG_COMP_INTERINTRA_PRED
@@ -226,11 +230,8 @@ void vp9_restore_coding_context(VP9_COMP *cpi) {
   vp9_copy(xd->last_mode_lf_deltas, cc->last_mode_lf_deltas);
 
   vp9_copy(cm->fc.coef_probs_4x4, cc->coef_probs_4x4);
-  vp9_copy(cm->fc.hybrid_coef_probs_4x4, cc->hybrid_coef_probs_4x4);
   vp9_copy(cm->fc.coef_probs_8x8, cc->coef_probs_8x8);
-  vp9_copy(cm->fc.hybrid_coef_probs_8x8, cc->hybrid_coef_probs_8x8);
   vp9_copy(cm->fc.coef_probs_16x16, cc->coef_probs_16x16);
-  vp9_copy(cm->fc.hybrid_coef_probs_16x16, cc->hybrid_coef_probs_16x16);
   vp9_copy(cm->fc.coef_probs_32x32, cc->coef_probs_32x32);
   vp9_copy(cm->fc.switchable_interp_prob, cc->switchable_interp_prob);
 #if CONFIG_COMP_INTERINTRA_PRED
@@ -238,68 +239,33 @@ void vp9_restore_coding_context(VP9_COMP *cpi) {
 #endif
 }
 
-
 void vp9_setup_key_frame(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
-  // Setup for Key frame:
-  vp9_default_coef_probs(& cpi->common);
-  vp9_kf_default_bmode_probs(cpi->common.kf_bmode_prob);
-  vp9_init_mbmode_probs(& cpi->common);
-  vp9_default_bmode_probs(cm->fc.bmode_prob);
-
-  if(cm->last_frame_seg_map)
-    vpx_memset(cm->last_frame_seg_map, 0, (cm->mb_rows * cm->mb_cols));
-
-  vp9_init_mv_probs(& cpi->common);
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
 
-  // cpi->common.filter_level = 0;      // Reset every key frame.
-  cpi->common.filter_level = cpi->common.base_qindex * 3 / 8;
+  vp9_setup_past_independence(cm, xd);
 
   // interval before next GF
   cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
-
-  cpi->common.refresh_golden_frame = TRUE;
-  cpi->common.refresh_alt_ref_frame = TRUE;
-
-  vp9_init_mode_contexts(&cpi->common);
-  vpx_memcpy(&cpi->common.lfc, &cpi->common.fc, sizeof(cpi->common.fc));
-  vpx_memcpy(&cpi->common.lfc_a, &cpi->common.fc, sizeof(cpi->common.fc));
-
-  vpx_memset(cm->prev_mip, 0,
-    (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
-  vpx_memset(cm->mip, 0,
-    (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
-
-  vp9_update_mode_info_border(cm, cm->mip);
-  vp9_update_mode_info_in_image(cm, cm->mi);
-
-#if CONFIG_NEW_MVREF
-  if (1) {
-    MACROBLOCKD *xd = &cpi->mb.e_mbd;
-
-    // Defaults probabilities for encoding the MV ref id signal
-    vpx_memset(xd->mb_mv_ref_probs, VP9_DEFAULT_MV_REF_PROB,
-               sizeof(xd->mb_mv_ref_probs));
-  }
-#endif
+  /* All buffers are implicitly updated on key frames. */
+  cpi->refresh_golden_frame = TRUE;
+  cpi->refresh_alt_ref_frame = TRUE;
 }
 
 void vp9_setup_inter_frame(VP9_COMP *cpi) {
-  if (cpi->common.refresh_alt_ref_frame) {
-    vpx_memcpy(&cpi->common.fc,
-               &cpi->common.lfc_a,
-               sizeof(cpi->common.fc));
-  } else {
-    vpx_memcpy(&cpi->common.fc,
-               &cpi->common.lfc,
-               sizeof(cpi->common.fc));
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+  if (cm->error_resilient_mode) {
+    vp9_setup_past_independence(cm, xd);
   }
+  assert(cm->frame_context_idx < NUM_FRAME_CONTEXTS);
+  vpx_memcpy(&cm->fc, &cm->frame_contexts[cm->frame_context_idx],
+             sizeof(cm->fc));
 }
 
-
 static int estimate_bits_at_q(int frame_kind, int Q, int MBs,
                               double correction_factor) {
-  int Bpm = (int)(.5 + correction_factor * vp9_bits_per_mb(frame_kind, Q));
+  int Bpm = (int)(vp9_bits_per_mb(frame_kind, Q, correction_factor));
 
   /* Attempt to retain reasonable accuracy without overflow. The cutoff is
    * chosen such that the maximum product of Bpm and MBs fits 31 bits. The
@@ -358,7 +324,7 @@ static void calc_pframe_target_size(VP9_COMP *cpi) {
 
 
   // Special alt reference frame case
-  if (cpi->common.refresh_alt_ref_frame) {
+  if (cpi->refresh_alt_ref_frame) {
     // Per frame bit target for the alt ref frame
     cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
     cpi->this_frame_target = cpi->per_frame_bandwidth;
@@ -377,7 +343,7 @@ static void calc_pframe_target_size(VP9_COMP *cpi) {
   if (cpi->this_frame_target < min_frame_target)
     cpi->this_frame_target = min_frame_target;
 
-  if (!cpi->common.refresh_alt_ref_frame)
+  if (!cpi->refresh_alt_ref_frame)
     // Note the baseline target data rate for this inter frame.
     cpi->inter_frame_target = cpi->this_frame_target;
 
@@ -386,7 +352,7 @@ static void calc_pframe_target_size(VP9_COMP *cpi) {
     // int Boost = 0;
     int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
 
-    cpi->common.refresh_golden_frame = TRUE;
+    cpi->refresh_golden_frame = TRUE;
 
     calc_gf_params(cpi);
 
@@ -431,35 +397,18 @@ void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
   if (cpi->common.frame_type == KEY_FRAME) {
     rate_correction_factor = cpi->key_frame_rate_correction_factor;
   } else {
-    if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+    if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)
       rate_correction_factor = cpi->gf_rate_correction_factor;
     else
       rate_correction_factor = cpi->rate_correction_factor;
   }
 
-  // Work out how big we would have expected the frame to be at this Q given the current correction factor.
+  // Work out how big we would have expected the frame to be at this Q given
+  // the current correction factor.
   // Stay in double to avoid int overflow when values are large
   projected_size_based_on_q =
-    (int)(((.5 + rate_correction_factor *
-            vp9_bits_per_mb(cpi->common.frame_type, Q)) *
-           cpi->common.MBs) / (1 << BPER_MB_NORMBITS));
-
-  // Make some allowance for cpi->zbin_over_quant
-  if (cpi->zbin_over_quant > 0) {
-    int Z = cpi->zbin_over_quant;
-    double Factor = 0.99;
-    double factor_adjustment = 0.01 / 256.0; // (double)ZBIN_OQ_MAX;
-
-    while (Z > 0) {
-      Z--;
-      projected_size_based_on_q =
-        (int)(Factor * projected_size_based_on_q);
-      Factor += factor_adjustment;
-
-      if (Factor  >= 0.999)
-        Factor = 0.999;
-    }
-  }
+    estimate_bits_at_q(cpi->common.frame_type, Q,
+                       cpi->common.MBs, rate_correction_factor);
 
   // Work out a size correction factor.
   // if ( cpi->this_frame_target > 0 )
@@ -505,7 +454,7 @@ void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
   if (cpi->common.frame_type == KEY_FRAME)
     cpi->key_frame_rate_correction_factor = rate_correction_factor;
   else {
-    if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+    if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)
       cpi->gf_rate_correction_factor = rate_correction_factor;
     else
       cpi->rate_correction_factor = rate_correction_factor;
@@ -522,14 +471,11 @@ int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) {
   int bits_per_mb_at_this_q;
   double correction_factor;
 
-  // Reset Zbin OQ value
-  cpi->zbin_over_quant = 0;
-
   // Select the appropriate correction factor based upon type of frame.
   if (cpi->common.frame_type == KEY_FRAME)
     correction_factor = cpi->key_frame_rate_correction_factor;
   else {
-    if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+    if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)
       correction_factor = cpi->gf_rate_correction_factor;
     else
       correction_factor = cpi->rate_correction_factor;
@@ -545,8 +491,7 @@ int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) {
 
   do {
     bits_per_mb_at_this_q =
-      (int)(.5 + correction_factor *
-            vp9_bits_per_mb(cpi->common.frame_type, i));
+      (int)(vp9_bits_per_mb(cpi->common.frame_type, i, correction_factor));
 
     if (bits_per_mb_at_this_q <= target_bits_per_mb) {
       if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)
@@ -559,45 +504,6 @@ int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) {
       last_error = bits_per_mb_at_this_q - target_bits_per_mb;
   } while (++i <= cpi->active_worst_quality);
 
-
-  // If we are at MAXQ then enable Q over-run which seeks to claw back additional bits through things like
-  // the RD multiplier and zero bin size.
-  if (Q >= MAXQ) {
-    int zbin_oqmax;
-
-    double Factor = 0.99;
-    double factor_adjustment = 0.01 / 256.0; // (double)ZBIN_OQ_MAX;
-
-    if (cpi->common.frame_type == KEY_FRAME)
-      zbin_oqmax = 0; // ZBIN_OQ_MAX/16
-    else if (cpi->common.refresh_alt_ref_frame || (cpi->common.refresh_golden_frame && !cpi->source_alt_ref_active))
-      zbin_oqmax = 16;
-    else
-      zbin_oqmax = ZBIN_OQ_MAX;
-
-    // Each incrment in the zbin is assumed to have a fixed effect on bitrate. This is not of course true.
-    // The effect will be highly clip dependent and may well have sudden steps.
-    // The idea here is to acheive higher effective quantizers than the normal maximum by expanding the zero
-    // bin and hence decreasing the number of low magnitude non zero coefficients.
-    while (cpi->zbin_over_quant < zbin_oqmax) {
-      cpi->zbin_over_quant++;
-
-      if (cpi->zbin_over_quant > zbin_oqmax)
-        cpi->zbin_over_quant = zbin_oqmax;
-
-      // Adjust bits_per_mb_at_this_q estimate
-      bits_per_mb_at_this_q = (int)(Factor * bits_per_mb_at_this_q);
-      Factor += factor_adjustment;
-
-      if (Factor  >= 0.999)
-        Factor = 0.999;
-
-      if (bits_per_mb_at_this_q <= target_bits_per_mb)    // Break out if we get down to the target rate
-        break;
-    }
-
-  }
-
   return Q;
 }
 
@@ -671,7 +577,7 @@ void vp9_compute_frame_size_bounds(VP9_COMP *cpi, int *frame_under_shoot_limit,
       *frame_over_shoot_limit  = cpi->this_frame_target * 9 / 8;
       *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;
     } else {
-      if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame) {
+      if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) {
         *frame_over_shoot_limit  = cpi->this_frame_target * 9 / 8;
         *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;
       } else {
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index c6484817f..473317605 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -16,23 +16,24 @@
 
 #define FRAME_OVERHEAD_BITS 200
 
-extern void vp9_save_coding_context(VP9_COMP *cpi);
-extern void vp9_restore_coding_context(VP9_COMP *cpi);
+void vp9_save_coding_context(VP9_COMP *cpi);
+void vp9_restore_coding_context(VP9_COMP *cpi);
 
-extern void vp9_setup_key_frame(VP9_COMP *cpi);
-extern void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var);
-extern int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame);
-extern void vp9_adjust_key_frame_context(VP9_COMP *cpi);
-extern void vp9_compute_frame_size_bounds(VP9_COMP *cpi,
-                                          int *frame_under_shoot_limit,
-                                          int *frame_over_shoot_limit);
+void vp9_setup_key_frame(VP9_COMP *cpi);
+void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var);
+int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame);
+void vp9_adjust_key_frame_context(VP9_COMP *cpi);
+void vp9_compute_frame_size_bounds(VP9_COMP *cpi,
+                                   int *frame_under_shoot_limit,
+                                   int *frame_over_shoot_limit);
 
 // return of 0 means drop frame
-extern int vp9_pick_frame_size(VP9_COMP *cpi);
+int vp9_pick_frame_size(VP9_COMP *cpi);
 
-extern double vp9_convert_qindex_to_q(int qindex);
-extern int vp9_gfboost_qadjust(int qindex);
-extern int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex);
+double vp9_convert_qindex_to_q(int qindex);
+int vp9_gfboost_qadjust(int qindex);
+int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex,
+                    double correction_factor);
 void vp9_setup_inter_frame(VP9_COMP *cpi);
 
 #endif  // VP9_ENCODER_VP9_RATECTRL_H_
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index e8d0cc68e..59e33a464 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -23,7 +23,6 @@
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
-#include "vp9/common/vp9_reconintra4x4.h"
 #include "vp9/common/vp9_findnearmv.h"
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/encoder/vp9_encodemb.h"
@@ -151,20 +150,21 @@ const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
 static void fill_token_costs(vp9_coeff_count *c,
                              vp9_coeff_probs *p,
                              int block_type_counts) {
-  int i, j, k;
+  int i, j, k, l;
 
   for (i = 0; i < block_type_counts; i++)
-    for (j = 0; j < COEF_BANDS; j++)
-      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
-        if (k == 0 && ((j > 0 && i > 0) || (j > 1 && i == 0)))
-          vp9_cost_tokens_skip((int *)(c[i][j][k]),
-                               p[i][j][k],
-                               vp9_coef_tree);
-        else
-          vp9_cost_tokens((int *)(c[i][j][k]),
-                          p[i][j][k],
-                          vp9_coef_tree);
-      }
+    for (j = 0; j < REF_TYPES; j++)
+      for (k = 0; k < COEF_BANDS; k++)
+        for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
+          if (l == 0 && k > 0)
+            vp9_cost_tokens_skip((int *)(c[i][j][k][l]),
+                                 p[i][j][k][l],
+                                 vp9_coef_tree);
+          else
+            vp9_cost_tokens((int *)(c[i][j][k][l]),
+                            p[i][j][k][l],
+                            vp9_coef_tree);
+        }
 }
 
 
@@ -218,16 +218,6 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int QIndex) {
 
   cpi->RDMULT = compute_rd_mult(QIndex);
 
-  // Extend rate multiplier along side quantizer zbin increases
-  if (cpi->zbin_over_quant  > 0) {
-    double oq_factor;
-
-    // Experimental code using the same basic equation as used for Q above
-    // The units of cpi->zbin_over_quant are 1/128 of Q bin size
-    oq_factor = 1.0 + ((double)0.0015625 * cpi->zbin_over_quant);
-    cpi->RDMULT = (int)((double)cpi->RDMULT * oq_factor * oq_factor);
-  }
-
   if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
     if (cpi->twopass.next_iiratio > 31)
       cpi->RDMULT += (cpi->RDMULT * rd_iifactor[31]) >> 4;
@@ -279,20 +269,11 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int QIndex) {
   }
 
   fill_token_costs(cpi->mb.token_costs[TX_4X4],
-                   cpi->common.fc.coef_probs_4x4, BLOCK_TYPES_4X4);
-  fill_token_costs(cpi->mb.hybrid_token_costs[TX_4X4],
-                   cpi->common.fc.hybrid_coef_probs_4x4, BLOCK_TYPES_4X4);
-
+                   cpi->common.fc.coef_probs_4x4, BLOCK_TYPES);
   fill_token_costs(cpi->mb.token_costs[TX_8X8],
-                   cpi->common.fc.coef_probs_8x8, BLOCK_TYPES_8X8);
-  fill_token_costs(cpi->mb.hybrid_token_costs[TX_8X8],
-                   cpi->common.fc.hybrid_coef_probs_8x8, BLOCK_TYPES_8X8);
-
+                   cpi->common.fc.coef_probs_8x8, BLOCK_TYPES);
   fill_token_costs(cpi->mb.token_costs[TX_16X16],
-                   cpi->common.fc.coef_probs_16x16, BLOCK_TYPES_16X16);
-  fill_token_costs(cpi->mb.hybrid_token_costs[TX_16X16],
-                   cpi->common.fc.hybrid_coef_probs_16x16, BLOCK_TYPES_16X16);
-
+                   cpi->common.fc.coef_probs_16x16, BLOCK_TYPES);
   fill_token_costs(cpi->mb.token_costs[TX_32X32],
                    cpi->common.fc.coef_probs_32x32, BLOCK_TYPES_32X32);
 
@@ -321,26 +302,7 @@ int vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, int block_size) {
   return error;
 }
 
-int vp9_mbblock_error_8x8_c(MACROBLOCK *mb, int dc) {
-  BLOCK  *be;
-  BLOCKD *bd;
-  int i, j;
-  int berror, error = 0;
-
-  for (i = 0; i < 16; i+=4) {
-    be = &mb->block[i];
-    bd = &mb->e_mbd.block[i];
-    berror = 0;
-    for (j = dc; j < 64; j++) {
-      int this_diff = be->coeff[j] - bd->dqcoeff[j];
-      berror += this_diff * this_diff;
-    }
-    error += berror;
-  }
-  return error;
-}
-
-int vp9_mbblock_error_c(MACROBLOCK *mb, int dc) {
+int vp9_mbblock_error_c(MACROBLOCK *mb) {
   BLOCK  *be;
   BLOCKD *bd;
   int i, j;
@@ -350,7 +312,7 @@ int vp9_mbblock_error_c(MACROBLOCK *mb, int dc) {
     be = &mb->block[i];
     bd = &mb->e_mbd.block[i];
     berror = 0;
-    for (j = dc; j < 16; j++) {
+    for (j = 0; j < 16; j++) {
       int this_diff = be->coeff[j] - bd->dqcoeff[j];
       berror += this_diff * this_diff;
     }
@@ -417,41 +379,36 @@ int vp9_uvsse(MACROBLOCK *x) {
     sse2 += sse1;
   }
   return sse2;
-
 }
 
-#if CONFIG_NEWCOEFCONTEXT
-#define PT pn
-#else
-#define PT pt
-#endif
-static int cost_coeffs(MACROBLOCK *mb,
-                       BLOCKD *b, PLANE_TYPE type,
-                       ENTROPY_CONTEXT *a,
-                       ENTROPY_CONTEXT *l,
-                       TX_SIZE tx_size) {
+static INLINE int cost_coeffs(MACROBLOCK *mb,
+                              BLOCKD *b, PLANE_TYPE type,
+                              ENTROPY_CONTEXT *a,
+                              ENTROPY_CONTEXT *l,
+                              TX_SIZE tx_size) {
   int pt;
-  const int eob = b->eob;
-  MACROBLOCKD *xd = &mb->e_mbd;
+  MACROBLOCKD *const xd = &mb->e_mbd;
   const int ib = (int)(b - xd->block);
-  int c = (type == PLANE_TYPE_Y_NO_DC) ? 1 : 0;
+  const int eob = xd->eobs[ib];
+  int c = 0;
   int cost = 0, seg_eob;
   const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  const int *scan, *band;
+  const int *scan;
   int16_t *qcoeff_ptr = b->qcoeff;
+  const int ref = xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME;
   const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
                           get_tx_type(xd, b) : DCT_DCT;
-#if CONFIG_NEWCOEFCONTEXT
-  const int *neighbors;
-  int pn;
-#endif
-
+  unsigned int (*token_costs)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
+      mb->token_costs[tx_size][type][ref];
   ENTROPY_CONTEXT a_ec = *a, l_ec = *l;
+  ENTROPY_CONTEXT *const a1 = a +
+      sizeof(ENTROPY_CONTEXT_PLANES)/sizeof(ENTROPY_CONTEXT);
+  ENTROPY_CONTEXT *const l1 = l +
+      sizeof(ENTROPY_CONTEXT_PLANES)/sizeof(ENTROPY_CONTEXT);
 
   switch (tx_size) {
     case TX_4X4:
       scan = vp9_default_zig_zag1d_4x4;
-      band = vp9_coef_bands_4x4;
       seg_eob = 16;
       if (type == PLANE_TYPE_Y_WITH_DC) {
         if (tx_type == ADST_DCT) {
@@ -462,30 +419,32 @@ static int cost_coeffs(MACROBLOCK *mb,
       }
       break;
     case TX_8X8:
-      if (type == PLANE_TYPE_Y2) {
-        scan = vp9_default_zig_zag1d_4x4;
-        band = vp9_coef_bands_4x4;
-        seg_eob = 4;
-      } else {
-        scan = vp9_default_zig_zag1d_8x8;
-        band = vp9_coef_bands_8x8;
-        seg_eob = 64;
-      }
+      a_ec = (a[0] + a[1]) != 0;
+      l_ec = (l[0] + l[1]) != 0;
+      scan = vp9_default_zig_zag1d_8x8;
+      seg_eob = 64;
       break;
     case TX_16X16:
       scan = vp9_default_zig_zag1d_16x16;
-      band = vp9_coef_bands_16x16;
       seg_eob = 256;
       if (type == PLANE_TYPE_UV) {
         const int uv_idx = ib - 16;
         qcoeff_ptr = xd->sb_coeff_data.qcoeff + 1024 + 64 * uv_idx;
+        a_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
+        l_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
+      } else {
+        a_ec = (a[0] + a[1] + a[2] + a[3]) != 0;
+        l_ec = (l[0] + l[1] + l[2] + l[3]) != 0;
       }
       break;
     case TX_32X32:
       scan = vp9_default_zig_zag1d_32x32;
-      band = vp9_coef_bands_32x32;
       seg_eob = 1024;
       qcoeff_ptr = xd->sb_coeff_data.qcoeff;
+      a_ec = (a[0] + a[1] + a[2] + a[3] +
+              a1[0] + a1[1] + a1[2] + a1[3]) != 0;
+      l_ec = (l[0] + l[1] + l[2] + l[3] +
+              l1[0] + l1[1] + l1[2] + l1[3]) != 0;
       break;
     default:
       abort();
@@ -493,59 +452,45 @@ static int cost_coeffs(MACROBLOCK *mb,
   }
 
   VP9_COMBINEENTROPYCONTEXTS(pt, a_ec, l_ec);
-#if CONFIG_NEWCOEFCONTEXT
-  neighbors = vp9_get_coef_neighbors_handle(scan);
-  pn = pt;
-#endif
 
-  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB))
-    seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))
+    seg_eob = 0;
 
-  if (tx_type != DCT_DCT) {
-    for (; c < eob; c++) {
-      int v = qcoeff_ptr[scan[c]];
-      int t = vp9_dct_value_tokens_ptr[v].Token;
-      cost += mb->hybrid_token_costs[tx_size][type][band[c]][PT][t];
-      cost += vp9_dct_value_cost_ptr[v];
-      pt = vp9_prev_token_class[t];
-#if CONFIG_NEWCOEFCONTEXT
-      if (c < seg_eob - 1 && NEWCOEFCONTEXT_BAND_COND(band[c + 1]))
-        pn = vp9_get_coef_neighbor_context(
-            qcoeff_ptr, (type == PLANE_TYPE_Y_NO_DC), neighbors, scan[c + 1]);
-      else
-        pn = pt;
-#endif
-    }
-    if (c < seg_eob)
-      cost += mb->hybrid_token_costs[tx_size][type][band[c]]
-          [PT][DCT_EOB_TOKEN];
-  } else {
+  {
+    int recent_energy = 0;
     for (; c < eob; c++) {
       int v = qcoeff_ptr[scan[c]];
       int t = vp9_dct_value_tokens_ptr[v].Token;
-      cost += mb->token_costs[tx_size][type][band[c]][pt][t];
+      cost += token_costs[get_coef_band(tx_size, c)][pt][t];
       cost += vp9_dct_value_cost_ptr[v];
-      pt = vp9_prev_token_class[t];
-#if CONFIG_NEWCOEFCONTEXT
-      if (c < seg_eob - 1 && NEWCOEFCONTEXT_BAND_COND(band[c + 1]))
-        pn = vp9_get_coef_neighbor_context(
-            qcoeff_ptr, (type == PLANE_TYPE_Y_NO_DC), neighbors, scan[c + 1]);
-      else
-        pn = pt;
-#endif
+      pt = vp9_get_coef_context(&recent_energy, t);
     }
     if (c < seg_eob)
-      cost += mb->token_costs[tx_size][type][band[c]]
-          [PT][DCT_EOB_TOKEN];
+      cost += mb->token_costs[tx_size][type][ref][get_coef_band(tx_size, c)]
+          [pt][DCT_EOB_TOKEN];
   }
 
   // is eob first coefficient;
-  pt = (c > !type);
+  pt = (c > 0);
   *a = *l = pt;
+  if (tx_size >= TX_8X8) {
+    a[1] = l[1] = pt;
+    if (tx_size >= TX_16X16) {
+      if (type == PLANE_TYPE_UV) {
+        a1[0] = a1[1] = l1[0] = l1[1] = pt;
+      } else {
+        a[2] = a[3] = l[2] = l[3] = pt;
+        if (tx_size >= TX_32X32) {
+          a1[0] = a1[1] = a1[2] = a1[3] = pt;
+          l1[0] = l1[1] = l1[2] = l1[3] = pt;
+        }
+      }
+    }
+  }
   return cost;
 }
 
-static int rdcost_mby_4x4(MACROBLOCK *mb, int has_2nd_order, int backup) {
+static int rdcost_mby_4x4(MACROBLOCK *mb, int backup) {
   int cost = 0;
   int b;
   MACROBLOCKD *xd = &mb->e_mbd;
@@ -565,19 +510,11 @@ static int rdcost_mby_4x4(MACROBLOCK *mb, int has_2nd_order, int backup) {
   }
 
   for (b = 0; b < 16; b++)
-    cost += cost_coeffs(mb, xd->block + b,
-                        (has_2nd_order ?
-                         PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC),
+    cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_Y_WITH_DC,
                         ta + vp9_block2above[TX_4X4][b],
                         tl + vp9_block2left[TX_4X4][b],
                         TX_4X4);
 
-  if (has_2nd_order)
-    cost += cost_coeffs(mb, xd->block + 24, PLANE_TYPE_Y2,
-                        ta + vp9_block2above[TX_4X4][24],
-                        tl + vp9_block2left[TX_4X4][24],
-                        TX_4X4);
-
   return cost;
 }
 
@@ -586,26 +523,17 @@ static void macro_block_yrd_4x4(MACROBLOCK *mb,
                                 int *Distortion,
                                 int *skippable, int backup) {
   MACROBLOCKD *const xd = &mb->e_mbd;
-  BLOCK   *const mb_y2 = mb->block + 24;
-  BLOCKD *const x_y2  = xd->block + 24;
-  int d, has_2nd_order;
 
   xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-  has_2nd_order = get_2nd_order_usage(xd);
-  // Fdct and building the 2nd order block
   vp9_transform_mby_4x4(mb);
   vp9_quantize_mby_4x4(mb);
-  d = vp9_mbblock_error(mb, has_2nd_order);
-  if (has_2nd_order)
-    d += vp9_block_error(mb_y2->coeff, x_y2->dqcoeff, 16);
-
-  *Distortion = (d >> 2);
-  // rate
-  *Rate = rdcost_mby_4x4(mb, has_2nd_order, backup);
-  *skippable = vp9_mby_is_skippable_4x4(&mb->e_mbd, has_2nd_order);
+
+  *Distortion = vp9_mbblock_error(mb) >> 2;
+  *Rate = rdcost_mby_4x4(mb, backup);
+  *skippable = vp9_mby_is_skippable_4x4(xd);
 }
 
-static int rdcost_mby_8x8(MACROBLOCK *mb, int has_2nd_order, int backup) {
+static int rdcost_mby_8x8(MACROBLOCK *mb, int backup) {
   int cost = 0;
   int b;
   MACROBLOCKD *xd = &mb->e_mbd;
@@ -625,18 +553,11 @@ static int rdcost_mby_8x8(MACROBLOCK *mb, int has_2nd_order, int backup) {
   }
 
   for (b = 0; b < 16; b += 4)
-    cost += cost_coeffs(mb, xd->block + b,
-                        (has_2nd_order ?
-                         PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC),
+    cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_Y_WITH_DC,
                         ta + vp9_block2above[TX_8X8][b],
                         tl + vp9_block2left[TX_8X8][b],
                         TX_8X8);
 
-  if (has_2nd_order)
-    cost += cost_coeffs(mb, xd->block + 24, PLANE_TYPE_Y2,
-                            ta + vp9_block2above[TX_8X8][24],
-                            tl + vp9_block2left[TX_8X8][24],
-                            TX_8X8);
   return cost;
 }
 
@@ -645,23 +566,14 @@ static void macro_block_yrd_8x8(MACROBLOCK *mb,
                                 int *Distortion,
                                 int *skippable, int backup) {
   MACROBLOCKD *const xd = &mb->e_mbd;
-  BLOCK   *const mb_y2 = mb->block + 24;
-  BLOCKD *const x_y2  = xd->block + 24;
-  int d, has_2nd_order;
 
   xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-
   vp9_transform_mby_8x8(mb);
   vp9_quantize_mby_8x8(mb);
-  has_2nd_order = get_2nd_order_usage(xd);
-  d = vp9_mbblock_error_8x8_c(mb, has_2nd_order);
-  if (has_2nd_order)
-    d += vp9_block_error(mb_y2->coeff, x_y2->dqcoeff, 16);
-
-  *Distortion = (d >> 2);
-  // rate
-  *Rate = rdcost_mby_8x8(mb, has_2nd_order, backup);
-  *skippable = vp9_mby_is_skippable_8x8(&mb->e_mbd, has_2nd_order);
+
+  *Distortion = vp9_mbblock_error(mb) >> 2;
+  *Rate = rdcost_mby_8x8(mb, backup);
+  *skippable = vp9_mby_is_skippable_8x8(xd);
 }
 
 static int rdcost_mby_16x16(MACROBLOCK *mb, int backup) {
@@ -687,7 +599,6 @@ static int rdcost_mby_16x16(MACROBLOCK *mb, int backup) {
 
 static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion,
                                   int *skippable, int backup) {
-  int d;
   MACROBLOCKD *xd = &mb->e_mbd;
 
   xd->mode_info_context->mbmi.txfm_size = TX_16X16;
@@ -696,15 +607,13 @@ static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion,
   // TODO(jingning) is it possible to quickly determine whether to force
   //                trailing coefficients to be zero, instead of running trellis
   //                optimization in the rate-distortion optimization loop?
-  if (mb->e_mbd.mode_info_context->mbmi.mode < I8X8_PRED)
+  if (mb->optimize &&
+      xd->mode_info_context->mbmi.mode < I8X8_PRED)
     vp9_optimize_mby_16x16(mb);
 
-  d = vp9_mbblock_error(mb, 0);
-
-  *Distortion = (d >> 2);
-  // rate
+  *Distortion = vp9_mbblock_error(mb) >> 2;
   *Rate = rdcost_mby_16x16(mb, backup);
-  *skippable = vp9_mby_is_skippable_16x16(&mb->e_mbd);
+  *skippable = vp9_mby_is_skippable_16x16(xd);
 }
 
 static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
@@ -820,15 +729,15 @@ static void copy_predictor(uint8_t *dst, const uint8_t *predictor) {
 
 static int rdcost_sby_32x32(MACROBLOCK *x, int backup) {
   MACROBLOCKD * const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
   ENTROPY_CONTEXT *ta, *tl;
 
   if (backup) {
     ta = (ENTROPY_CONTEXT *) &t_above,
     tl = (ENTROPY_CONTEXT *) &t_left;
 
-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left,  xd->left_context,  sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
+    vpx_memcpy(&t_left,  xd->left_context,  sizeof(ENTROPY_CONTEXT_PLANES) * 2);
   } else {
     ta = (ENTROPY_CONTEXT *) xd->above_context;
     tl = (ENTROPY_CONTEXT *) xd->left_context;
@@ -857,21 +766,18 @@ static void super_block_yrd_32x32(MACROBLOCK *x,
   SUPERBLOCK  * const x_sb = &x->sb_coeff_data;
   MACROBLOCKD * const xd = &x->e_mbd;
   SUPERBLOCKD * const xd_sb = &xd->sb_coeff_data;
-#if DEBUG_ERROR || CONFIG_DWTDCTHYBRID
+#if DEBUG_ERROR
   int16_t out[1024];
 #endif
 
   vp9_transform_sby_32x32(x);
   vp9_quantize_sby_32x32(x);
-#if DEBUG_ERROR || CONFIG_DWTDCTHYBRID
+#if DEBUG_ERROR
   vp9_short_idct32x32(xd_sb->dqcoeff, out, 64);
 #endif
 
-#if !CONFIG_DWTDCTHYBRID
   *distortion = vp9_sb_block_error_c(x_sb->coeff, xd_sb->dqcoeff, 1024);
-#else
-  *distortion = vp9_block_error_c(x_sb->src_diff, out, 1024) << 4;
-#endif
+
 #if DEBUG_ERROR
   printf("IDCT/FDCT error 32x32: %d (d: %d)\n",
          vp9_block_error_c(x_sb->src_diff, out, 1024), *distortion);
@@ -1129,17 +1035,17 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,
     rate = bmode_costs[mode];
 #endif
 
-    vp9_intra4x4_predict(b, mode, b->predictor);
+    vp9_intra4x4_predict(xd, b, mode, b->predictor);
     vp9_subtract_b(be, b, 16);
 
     b->bmi.as_mode.first = mode;
     tx_type = get_tx_type_4x4(xd, b);
     if (tx_type != DCT_DCT) {
-      vp9_fht(be->src_diff, 32, be->coeff, tx_type, 4);
-      vp9_ht_quantize_b_4x4(be, b, tx_type);
+      vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
+      vp9_ht_quantize_b_4x4(x, be - x->block, tx_type);
     } else {
-      x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
-      x->quantize_b_4x4(be, b);
+      x->fwd_txm4x4(be->src_diff, be->coeff, 32);
+      x->quantize_b_4x4(x, be - x->block);
     }
 
     tempa = ta;
@@ -1168,9 +1074,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,
 
   // inverse transform
   if (best_tx_type != DCT_DCT)
-    vp9_ihtllm(best_dqcoeff, b->diff, 32, best_tx_type, 4, b->eob);
+    vp9_short_iht4x4(best_dqcoeff, b->diff, 16, best_tx_type);
   else
-    xd->inv_xform4x4_x8(best_dqcoeff, b->diff, 32);
+    xd->inv_txm4x4(best_dqcoeff, b->diff, 32);
 
   vp9_recon_b(best_predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
 
@@ -1179,8 +1085,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,
 
 static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
                                          int *Rate, int *rate_y,
-                                         int *Distortion, int64_t best_rd,
-                                         int update_contexts) {
+                                         int *Distortion, int64_t best_rd) {
   int i;
   MACROBLOCKD *const xd = &mb->e_mbd;
   int cost = mb->mbmode_cost [xd->frame_type] [B_PRED];
@@ -1191,18 +1096,13 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
   ENTROPY_CONTEXT *ta, *tl;
   int *bmode_costs;
 
-  if (update_contexts) {
-    ta = (ENTROPY_CONTEXT *)xd->above_context;
-    tl = (ENTROPY_CONTEXT *)xd->left_context;
-  } else {
-    vpx_memcpy(&t_above, xd->above_context,
-               sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, xd->left_context,
-               sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(&t_above, xd->above_context,
+             sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(&t_left, xd->left_context,
+             sizeof(ENTROPY_CONTEXT_PLANES));
 
-    ta = (ENTROPY_CONTEXT *)&t_above;
-    tl = (ENTROPY_CONTEXT *)&t_left;
-  }
+  ta = (ENTROPY_CONTEXT *)&t_above;
+  tl = (ENTROPY_CONTEXT *)&t_left;
 
   xd->mode_info_context->mbmi.mode = B_PRED;
   bmode_costs = mb->inter_bmode_costs;
@@ -1407,8 +1307,9 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
   int distortion = 0, rate = 0;
   BLOCK  *be = x->block + ib;
   BLOCKD *b = xd->block + ib;
-  ENTROPY_CONTEXT ta0, ta1, besta0 = 0, besta1 = 0;
-  ENTROPY_CONTEXT tl0, tl1, bestl0 = 0, bestl1 = 0;
+  ENTROPY_CONTEXT_PLANES ta, tl;
+  ENTROPY_CONTEXT *ta0, *ta1, besta0 = 0, besta1 = 0;
+  ENTROPY_CONTEXT *tl0, *tl1, bestl0 = 0, bestl1 = 0;
 
   /*
    * The predictor buffer is a 2d buffer with a stride of 16.  Create
@@ -1430,58 +1331,75 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
     rate = mode_costs[mode];
     b->bmi.as_mode.first = mode;
 
-    vp9_intra8x8_predict(b, mode, b->predictor);
+    vp9_intra8x8_predict(xd, b, mode, b->predictor);
 
     vp9_subtract_4b_c(be, b, 16);
 
-    assert(get_2nd_order_usage(xd) == 0);
     if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
       TX_TYPE tx_type = get_tx_type_8x8(xd, b);
       if (tx_type != DCT_DCT)
-        vp9_fht(be->src_diff, 32, (x->block + idx)->coeff, tx_type, 8);
+        vp9_short_fht8x8(be->src_diff, (x->block + idx)->coeff, 16, tx_type);
       else
-        x->vp9_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
-      x->quantize_b_8x8(x->block + idx, xd->block + idx);
+        x->fwd_txm8x8(be->src_diff, (x->block + idx)->coeff, 32);
+      x->quantize_b_8x8(x, idx);
 
       // compute quantization mse of 8x8 block
       distortion = vp9_block_error_c((x->block + idx)->coeff,
                                      (xd->block + idx)->dqcoeff, 64);
-      ta0 = a[vp9_block2above[TX_8X8][idx]];
-      tl0 = l[vp9_block2left[TX_8X8][idx]];
+
+      vpx_memcpy(&ta, a, sizeof(ENTROPY_CONTEXT_PLANES));
+      vpx_memcpy(&tl, l, sizeof(ENTROPY_CONTEXT_PLANES));
+
+      ta0 = ((ENTROPY_CONTEXT*)&ta) + vp9_block2above[TX_8X8][idx];
+      tl0 = ((ENTROPY_CONTEXT*)&tl) + vp9_block2left[TX_8X8][idx];
+      ta1 = ta0 + 1;
+      tl1 = tl0 + 1;
 
       rate_t = cost_coeffs(x, xd->block + idx, PLANE_TYPE_Y_WITH_DC,
-                           &ta0, &tl0, TX_8X8);
+                           ta0, tl0, TX_8X8);
 
       rate += rate_t;
-      ta1 = ta0;
-      tl1 = tl0;
     } else {
       static const int iblock[4] = {0, 1, 4, 5};
       TX_TYPE tx_type;
       int i;
-      ta0 = a[vp9_block2above[TX_4X4][ib]];
-      ta1 = a[vp9_block2above[TX_4X4][ib + 1]];
-      tl0 = l[vp9_block2left[TX_4X4][ib]];
-      tl1 = l[vp9_block2left[TX_4X4][ib + 4]];
+      vpx_memcpy(&ta, a, sizeof(ENTROPY_CONTEXT_PLANES));
+      vpx_memcpy(&tl, l, sizeof(ENTROPY_CONTEXT_PLANES));
+      ta0 = ((ENTROPY_CONTEXT*)&ta) + vp9_block2above[TX_4X4][ib];
+      tl0 = ((ENTROPY_CONTEXT*)&tl) + vp9_block2left[TX_4X4][ib];
+      ta1 = ta0 + 1;
+      tl1 = tl0 + 1;
       distortion = 0;
       rate_t = 0;
       for (i = 0; i < 4; ++i) {
+        int do_two = 0;
         b = &xd->block[ib + iblock[i]];
         be = &x->block[ib + iblock[i]];
         tx_type = get_tx_type_4x4(xd, b);
         if (tx_type != DCT_DCT) {
-          vp9_fht_c(be->src_diff, 32, be->coeff, tx_type, 4);
-          vp9_ht_quantize_b_4x4(be, b, tx_type);
+          vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
+          vp9_ht_quantize_b_4x4(x, ib + iblock[i], tx_type);
+        } else if (!(i & 1) && get_tx_type_4x4(xd, b + 1) == DCT_DCT) {
+          x->fwd_txm8x4(be->src_diff, be->coeff, 32);
+          x->quantize_b_4x4_pair(x, ib + iblock[i], ib + iblock[i] + 1);
+          do_two = 1;
         } else {
-          x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
-          x->quantize_b_4x4(be, b);
+          x->fwd_txm4x4(be->src_diff, be->coeff, 32);
+          x->quantize_b_4x4(x, ib + iblock[i]);
         }
-        distortion += vp9_block_error_c(be->coeff, b->dqcoeff, 16);
+        distortion += vp9_block_error_c(be->coeff, b->dqcoeff, 16 << do_two);
         rate_t += cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC,
-                              // i&1 ? &ta1 : &ta0, i&2 ? &tl1 : &tl0,
-                              &ta0, &tl0,
+                              i&1 ? ta1 : ta0, i&2 ? tl1 : tl0,
                               TX_4X4);
+        if (do_two) {
+          i++;
+          rate_t += cost_coeffs(x, b + 1, PLANE_TYPE_Y_WITH_DC,
+                                i&1 ? ta1 : ta0, i&2 ? tl1 : tl0,
+                                TX_4X4);
+        }
       }
+      b = &xd->block[ib];
+      be = &x->block[ib];
       rate += rate_t;
     }
 
@@ -1491,10 +1409,10 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
       *bestrate = rate;
       *bestratey = rate_t;
       *bestdistortion = distortion;
-      besta0 = ta0;
-      besta1 = ta1;
-      bestl0 = tl0;
-      bestl1 = tl1;
+      besta0 = *ta0;
+      besta1 = *ta1;
+      bestl0 = *tl0;
+      bestl1 = *tl1;
       best_rd = this_rd;
       *best_mode = mode;
       copy_predictor_8x8(best_predictor, b->predictor);
@@ -1647,12 +1565,12 @@ static int rd_cost_sbuv_16x16(MACROBLOCK *x, int backup) {
   int b;
   int cost = 0;
   MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
   ENTROPY_CONTEXT *ta, *tl;
 
   if (backup) {
-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
+    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
 
     ta = (ENTROPY_CONTEXT *) &t_above;
     tl = (ENTROPY_CONTEXT *) &t_left;
@@ -1752,8 +1670,9 @@ static int64_t rd_inter64x64_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
 }
 
 static int64_t rd_inter4x4_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                              int *distortion, int *skip, int fullpixel) {
-  vp9_build_inter4x4_predictors_mbuv(&x->e_mbd);
+                              int *distortion, int *skip, int fullpixel,
+                              int mb_row, int mb_col) {
+  vp9_build_inter4x4_predictors_mbuv(&x->e_mbd, mb_row, mb_col);
   vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
                     x->e_mbd.predictor, x->src.uv_stride);
   return rd_inter16x16_uv_4x4(cpi, x, rate, distortion, fullpixel, skip, 1);
@@ -2082,12 +2001,8 @@ int vp9_cost_mv_ref(VP9_COMP *cpi,
   MACROBLOCKD *xd = &cpi->mb.e_mbd;
   int segment_id = xd->mode_info_context->mbmi.segment_id;
 
-  // If the mode coding is done entirely at the segment level
-  // we should not account for it at the per mb level in rd code.
-  // Note that if the segment level coding is expanded from single mode
-  // to multiple mode masks as per reference frame coding we will need
-  // to do something different here.
-  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
+  // Don't account for mode here if segment skip is enabled.
+  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
     VP9_COMMON *pc = &cpi->common;
 
     vp9_prob p [VP9_MVREFS - 1];
@@ -2156,14 +2071,18 @@ static int labels2mode(
           }
           break;
         case LEFT4X4:
-          this_mv->as_int = col ? d[-1].bmi.as_mv.first.as_int : left_block_mv(mic, i);
+          this_mv->as_int = col ? d[-1].bmi.as_mv[0].as_int :
+                                  left_block_mv(xd, mic, i);
           if (mbmi->second_ref_frame > 0)
-            this_second_mv->as_int = col ? d[-1].bmi.as_mv.second.as_int : left_block_second_mv(mic, i);
+            this_second_mv->as_int = col ? d[-1].bmi.as_mv[1].as_int :
+                                           left_block_second_mv(xd, mic, i);
           break;
         case ABOVE4X4:
-          this_mv->as_int = row ? d[-4].bmi.as_mv.first.as_int : above_block_mv(mic, i, mis);
+          this_mv->as_int = row ? d[-4].bmi.as_mv[0].as_int :
+                                  above_block_mv(mic, i, mis);
           if (mbmi->second_ref_frame > 0)
-            this_second_mv->as_int = row ? d[-4].bmi.as_mv.second.as_int : above_block_second_mv(mic, i, mis);
+            this_second_mv->as_int = row ? d[-4].bmi.as_mv[1].as_int :
+                                           above_block_second_mv(mic, i, mis);
           break;
         case ZERO4X4:
           this_mv->as_int = 0;
@@ -2178,11 +2097,11 @@ static int labels2mode(
         int_mv left_mv, left_second_mv;
 
         left_second_mv.as_int = 0;
-        left_mv.as_int = col ? d[-1].bmi.as_mv.first.as_int :
-                         left_block_mv(mic, i);
+        left_mv.as_int = col ? d[-1].bmi.as_mv[0].as_int :
+                         left_block_mv(xd, mic, i);
         if (mbmi->second_ref_frame > 0)
-          left_second_mv.as_int = col ? d[-1].bmi.as_mv.second.as_int :
-                                  left_block_second_mv(mic, i);
+          left_second_mv.as_int = col ? d[-1].bmi.as_mv[1].as_int :
+                                  left_block_second_mv(xd, mic, i);
 
         if (left_mv.as_int == this_mv->as_int &&
             (mbmi->second_ref_frame <= 0 ||
@@ -2198,9 +2117,9 @@ static int labels2mode(
 #endif
     }
 
-    d->bmi.as_mv.first.as_int = this_mv->as_int;
+    d->bmi.as_mv[0].as_int = this_mv->as_int;
     if (mbmi->second_ref_frame > 0)
-      d->bmi.as_mv.second.as_int = this_second_mv->as_int;
+      d->bmi.as_mv[1].as_int = this_second_mv->as_int;
 
     x->partition_info->bmi[i].mode = m;
     x->partition_info->bmi[i].mv.as_int = this_mv->as_int;
@@ -2230,12 +2149,25 @@ static int64_t encode_inter_mb_segment(MACROBLOCK *x,
       BLOCK *be = &x->block[i];
       int thisdistortion;
 
-      vp9_build_inter_predictors_b(bd, 16, xd->subpixel_predict4x4);
-      if (xd->mode_info_context->mbmi.second_ref_frame > 0)
-        vp9_build_2nd_inter_predictors_b(bd, 16, xd->subpixel_predict_avg4x4);
+      vp9_build_inter_predictor(*(bd->base_pre) + bd->pre,
+                                bd->pre_stride,
+                                bd->predictor, 16,
+                                &bd->bmi.as_mv[0],
+                                &xd->scale_factor[0],
+                                4, 4, 0 /* no avg */, &xd->subpix);
+
+      if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
+        vp9_build_inter_predictor(*(bd->base_second_pre) + bd->pre,
+                                  bd->pre_stride,
+                                  bd->predictor, 16,
+                                  &bd->bmi.as_mv[1],
+                                  &xd->scale_factor[1],
+                                  4, 4, 1 /* avg */, &xd->subpix);
+      }
+
       vp9_subtract_b(be, bd, 16);
-      x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
-      x->quantize_b_4x4(be, bd);
+      x->fwd_txm4x4(be->src_diff, be->coeff, 32);
+      x->quantize_b_4x4(x, i);
       thisdistortion = vp9_block_error(be->coeff, bd->dqcoeff, 16);
       *distortion += thisdistortion;
       *labelyrate += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC,
@@ -2274,20 +2206,31 @@ static int64_t encode_inter_mb_segment_8x8(MACROBLOCK *x,
     int ib = vp9_i8x8_block[i];
 
     if (labels[ib] == which_label) {
+      const int use_second_ref =
+          xd->mode_info_context->mbmi.second_ref_frame > 0;
+      int which_mv;
       int idx = (ib & 8) + ((ib & 2) << 1);
       BLOCKD *bd = &xd->block[ib], *bd2 = &xd->block[idx];
       BLOCK *be = &x->block[ib], *be2 = &x->block[idx];
       int thisdistortion;
 
-      vp9_build_inter_predictors4b(xd, bd, 16);
-      if (xd->mode_info_context->mbmi.second_ref_frame > 0)
-        vp9_build_2nd_inter_predictors4b(xd, bd, 16);
+      for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
+        uint8_t **base_pre = which_mv ? bd->base_second_pre : bd->base_pre;
+
+        vp9_build_inter_predictor(*base_pre + bd->pre,
+                                  bd->pre_stride,
+                                  bd->predictor, 16,
+                                  &bd->bmi.as_mv[which_mv],
+                                  &xd->scale_factor[which_mv],
+                                  8, 8, which_mv, &xd->subpix);
+      }
+
       vp9_subtract_4b_c(be, bd, 16);
 
       if (xd->mode_info_context->mbmi.txfm_size == TX_4X4) {
         if (otherrd) {
-          x->vp9_short_fdct8x8(be->src_diff, be2->coeff, 32);
-          x->quantize_b_8x8(be2, bd2);
+          x->fwd_txm8x8(be->src_diff, be2->coeff, 32);
+          x->quantize_b_8x8(x, idx);
           thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64);
           otherdist += thisdistortion;
           othercost += cost_coeffs(x, bd2, PLANE_TYPE_Y_WITH_DC,
@@ -2298,8 +2241,8 @@ static int64_t encode_inter_mb_segment_8x8(MACROBLOCK *x,
         for (j = 0; j < 4; j += 2) {
           bd = &xd->block[ib + iblock[j]];
           be = &x->block[ib + iblock[j]];
-          x->vp9_short_fdct8x4(be->src_diff, be->coeff, 32);
-          x->quantize_b_4x4_pair(be, be + 1, bd, bd + 1);
+          x->fwd_txm8x4(be->src_diff, be->coeff, 32);
+          x->quantize_b_4x4_pair(x, ib + iblock[j], ib + iblock[j] + 1);
           thisdistortion = vp9_block_error_c(be->coeff, bd->dqcoeff, 32);
           *distortion += thisdistortion;
           *labelyrate += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC,
@@ -2316,8 +2259,8 @@ static int64_t encode_inter_mb_segment_8x8(MACROBLOCK *x,
           for (j = 0; j < 4; j += 2) {
             BLOCKD *bd = &xd->block[ib + iblock[j]];
             BLOCK *be = &x->block[ib + iblock[j]];
-            x->vp9_short_fdct8x4(be->src_diff, be->coeff, 32);
-            x->quantize_b_4x4_pair(be, be + 1, bd, bd + 1);
+            x->fwd_txm8x4(be->src_diff, be->coeff, 32);
+            x->quantize_b_4x4_pair(x, ib + iblock[j], ib + iblock[j] + 1);
             thisdistortion = vp9_block_error_c(be->coeff, bd->dqcoeff, 32);
             otherdist += thisdistortion;
             othercost += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC,
@@ -2330,8 +2273,8 @@ static int64_t encode_inter_mb_segment_8x8(MACROBLOCK *x,
                            TX_4X4);
           }
         }
-        x->vp9_short_fdct8x8(be->src_diff, be2->coeff, 32);
-        x->quantize_b_8x8(be2, bd2);
+        x->fwd_txm8x8(be->src_diff, be2->coeff, 32);
+        x->quantize_b_8x8(x, idx);
         thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64);
         *distortion += thisdistortion;
         *labelyrate += cost_coeffs(x, bd2, PLANE_TYPE_Y_WITH_DC,
@@ -2373,8 +2316,7 @@ typedef struct {
 
 } BEST_SEG_INFO;
 
-static __inline
-int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {
+static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {
   int r = 0;
   r |= (mv->as_mv.row >> 3) < x->mv_row_min;
   r |= (mv->as_mv.row >> 3) > x->mv_row_max;
@@ -2487,9 +2429,9 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
 
           // use previous block's result as next block's MV predictor.
           if (segmentation == PARTITIONING_4X4 && i > 0) {
-            bsi->mvp.as_int = x->e_mbd.block[i - 1].bmi.as_mv.first.as_int;
+            bsi->mvp.as_int = x->e_mbd.block[i - 1].bmi.as_mv[0].as_int;
             if (i == 4 || i == 8 || i == 12)
-              bsi->mvp.as_int = x->e_mbd.block[i - 4].bmi.as_mv.first.as_int;
+              bsi->mvp.as_int = x->e_mbd.block[i - 4].bmi.as_mv[0].as_int;
             step_param = 2;
           }
         }
@@ -2528,11 +2470,11 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
 
             if (thissme < bestsme) {
               bestsme = thissme;
-              mode_mv[NEW4X4].as_int = e->bmi.as_mv.first.as_int;
+              mode_mv[NEW4X4].as_int = e->bmi.as_mv[0].as_int;
             } else {
               /* The full search result is actually worse so re-instate the
                * previous best vector */
-              e->bmi.as_mv.first.as_int = mode_mv[NEW4X4].as_int;
+              e->bmi.as_mv[0].as_int = mode_mv[NEW4X4].as_int;
             }
           }
         }
@@ -2595,13 +2537,13 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
         if (x->e_mbd.mode_info_context->mbmi.txfm_size == TX_4X4) {
           for (j = 0; j < 16; j++)
             if (labels[j] == i)
-              best_eobs[j] = x->e_mbd.block[j].eob;
+              best_eobs[j] = x->e_mbd.eobs[j];
         } else {
           for (j = 0; j < 4; j++) {
             int ib = vp9_i8x8_block[j], idx = j * 4;
 
             if (labels[ib] == i)
-              best_eobs[idx] = x->e_mbd.block[idx].eob;
+              best_eobs[idx] = x->e_mbd.eobs[idx];
           }
         }
         if (other_rd < best_other_rd)
@@ -2734,8 +2676,9 @@ static void rd_check_segment(VP9_COMP *cpi, MACROBLOCK *x,
       if (base_rd < txfm_cache[ONLY_4X4]) {
         txfm_cache[ONLY_4X4] = base_rd;
       }
-      if (base_rd + diff < txfm_cache[1]) {
-        txfm_cache[ALLOW_8X8] = txfm_cache[ALLOW_16X16] = base_rd + diff;
+      if (base_rd + diff < txfm_cache[ALLOW_8X8]) {
+        txfm_cache[ALLOW_8X8] = txfm_cache[ALLOW_16X16] =
+            txfm_cache[ALLOW_32X32] = base_rd + diff;
       }
       if (diff < 0) {
         base_rd += diff + RDCOST(x->rdmult, x->rddiv, cost8x8, 0);
@@ -2749,7 +2692,7 @@ static void rd_check_segment(VP9_COMP *cpi, MACROBLOCK *x,
   }
 }
 
-static __inline void cal_step_param(int sr, int *sp) {
+static INLINE void cal_step_param(int sr, int *sp) {
   int step = 0;
 
   if (sr > MAX_FIRST_STEP) sr = MAX_FIRST_STEP;
@@ -2872,18 +2815,18 @@ static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
   for (i = 0; i < 16; i++) {
     BLOCKD *bd = &x->e_mbd.block[i];
 
-    bd->bmi.as_mv.first.as_int = bsi.mvs[i].as_int;
+    bd->bmi.as_mv[0].as_int = bsi.mvs[i].as_int;
     if (mbmi->second_ref_frame > 0)
-      bd->bmi.as_mv.second.as_int = bsi.second_mvs[i].as_int;
-    bd->eob = bsi.eobs[i];
+      bd->bmi.as_mv[1].as_int = bsi.second_mvs[i].as_int;
+    x->e_mbd.eobs[i] = bsi.eobs[i];
   }
 
   *returntotrate = bsi.r;
   *returndistortion = bsi.d;
   *returnyrate = bsi.segment_yrate;
   *skippable = bsi.txfm_size == TX_4X4 ?
-                    vp9_mby_is_skippable_4x4(&x->e_mbd, 0) :
-                    vp9_mby_is_skippable_8x8(&x->e_mbd, 0);
+                    vp9_mby_is_skippable_4x4(&x->e_mbd) :
+                    vp9_mby_is_skippable_8x8(&x->e_mbd);
 
   /* save partitions */
   mbmi->txfm_size = bsi.txfm_size;
@@ -3016,7 +2959,8 @@ static void estimate_curframe_refprobs(VP9_COMP *cpi, vp9_prob mod_refprobs[3],
   }
 }
 
-static __inline unsigned weighted_cost(vp9_prob *tab0, vp9_prob *tab1, int idx, int val, int weight) {
+static INLINE unsigned weighted_cost(vp9_prob *tab0, vp9_prob *tab1,
+                                     int idx, int val, int weight) {
   unsigned cost0 = tab0[idx] ? vp9_cost_bit(tab0[idx], val) : 0;
   unsigned cost1 = tab1[idx] ? vp9_cost_bit(tab1[idx], val) : 0;
   // weight is 16-bit fixed point, so this basically calculates:
@@ -3160,43 +3104,104 @@ static void inter_mode_cost(VP9_COMP *cpi, MACROBLOCK *x,
 static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
                                int idx, MV_REFERENCE_FRAME frame_type,
                                int block_size,
-                               int recon_yoffset, int recon_uvoffset,
+                               int mb_row, int mb_col,
                                int_mv frame_nearest_mv[MAX_REF_FRAMES],
                                int_mv frame_near_mv[MAX_REF_FRAMES],
                                int frame_mdcounts[4][4],
-                               uint8_t *y_buffer[4],
-                               uint8_t *u_buffer[4],
-                               uint8_t *v_buffer[4]) {
-  YV12_BUFFER_CONFIG *yv12 = &cpi->common.yv12_fb[idx];
+                               YV12_BUFFER_CONFIG yv12_mb[4],
+                               struct scale_factors scale[MAX_REF_FRAMES]) {
+  VP9_COMMON *cm = &cpi->common;
+  YV12_BUFFER_CONFIG *yv12 = &cm->yv12_fb[cpi->common.ref_frame_map[idx]];
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+  int use_prev_in_find_mv_refs, use_prev_in_find_best_ref;
+
+  // Copy the reference's scale factors, then set per-MB x/y offsets (& 0xf).
+  scale[frame_type] = cpi->common.active_ref_scale[frame_type - 1];
+  scale[frame_type].x_offset_q4 =
+      (mb_col * 16 * scale[frame_type].x_num / scale[frame_type].x_den) & 0xf;
+  scale[frame_type].y_offset_q4 =
+      (mb_row * 16 * scale[frame_type].y_num / scale[frame_type].y_den) & 0xf;
 
-  y_buffer[frame_type] = yv12->y_buffer + recon_yoffset;
-  u_buffer[frame_type] = yv12->u_buffer + recon_uvoffset;
-  v_buffer[frame_type] = yv12->v_buffer + recon_uvoffset;
+  // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
+  // use the UV scaling factors.
+  setup_pred_block(&yv12_mb[frame_type], yv12, mb_row, mb_col,
+                   &scale[frame_type], &scale[frame_type]);
 
   // Gets an initial list of candidate vectors from neighbours and orders them
-  vp9_find_mv_refs(xd, xd->mode_info_context,
-                   xd->prev_mode_info_context,
+  use_prev_in_find_mv_refs = cm->Width == cm->last_width &&
+                             cm->Height == cm->last_height &&
+                             !cpi->common.error_resilient_mode;
+  vp9_find_mv_refs(&cpi->common, xd, xd->mode_info_context,
+                   use_prev_in_find_mv_refs ? xd->prev_mode_info_context : NULL,
                    frame_type,
                    mbmi->ref_mvs[frame_type],
                    cpi->common.ref_frame_sign_bias);
 
   // Candidate refinement carried out at encoder and decoder
-  vp9_find_best_ref_mvs(xd, y_buffer[frame_type],
+  use_prev_in_find_best_ref =
+      scale[frame_type].x_num == scale[frame_type].x_den &&
+      scale[frame_type].y_num == scale[frame_type].y_den &&
+      !cm->error_resilient_mode &&
+      !cm->frame_parallel_decoding_mode;
+  vp9_find_best_ref_mvs(xd,
+                        use_prev_in_find_best_ref ?
+                            yv12_mb[frame_type].y_buffer : NULL,
                         yv12->y_stride,
                         mbmi->ref_mvs[frame_type],
                         &frame_nearest_mv[frame_type],
                         &frame_near_mv[frame_type]);
 
-
   // Further refinement that is encode side only to test the top few candidates
   // in full and choose the best as the centre point for subsequent searches.
-  mv_pred(cpi, x, y_buffer[frame_type], yv12->y_stride,
+  mv_pred(cpi, x, yv12_mb[frame_type].y_buffer, yv12->y_stride,
           frame_type, block_size);
 
 }
 
+static void model_rd_from_var_lapndz(int var, int n, int qstep,
+                                     int *rate, int *dist) {
+  // Models the rate and distortion of a Laplacian source with the given
+  // variance when quantized by a uniform quantizer with step size qstep,
+  // as a piecewise approximation to the closed form expressions in: Hang
+  // and Chen, "Source Model for transform video coder and its application
+  // - Part I: Fundamental Theory", IEEE Trans. Circ. Sys. for Video Tech.,
+  // April 1997. The model works on s2 = var / n (n presumably a sample
+  // count). Writes *rate = n * R * 256 and *dist = n * D * s2, adding 0.5
+  // before conversion to int; if R < 0, writes *rate = 0 and *dist = var.
+  // TODO(debargha): Implement the functions by interpolating from a
+  // look-up table
+  vp9_clear_system_state();
+  {
+    double D, R;
+    double s2 = (double) var / n;
+    double s = sqrt(s2);
+    double x = qstep / s;
+    if (x > 1.0) {
+      double y = exp(-x / 2);
+      double y2 = y * y;
+      D = 2.069981728764738 * y2 - 2.764286806516079 * y + 1.003956960819275;
+      R = 0.924056758535089 * y2 + 2.738636469814024 * y - 0.005169662030017;
+    } else {
+      double x2 = x * x;
+      D = 0.075303187668830 * x2 + 0.004296954321112 * x - 0.000413209252807;
+      if (x > 0.125)
+        R = 1 / (-0.03459733614226 * x2 + 0.36561675733603 * x +
+                 0.1626989668625);
+      else
+        R = -1.442252874826093 * log(x) + 1.944647760719664;
+    }
+    if (R < 0) {
+      *rate = 0;
+      *dist = var;
+    } else {
+      *rate = (n * R * 256 + 0.5);
+      *dist = (n * D * s2 + 0.5);
+    }
+  }
+  vp9_clear_system_state();
+}
+
 static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                  enum BlockSize block_size,
                                  int *saddone, int near_sadidx[],
@@ -3209,9 +3214,12 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                  int *rate_y, int *distortion_y,
                                  int *rate_uv, int *distortion_uv,
                                  int *mode_excluded, int *disable_skip,
-                                 int recon_yoffset, int mode_index,
+                                 int mode_index,
+                                 INTERPOLATIONFILTERTYPE *best_filter,
                                  int_mv frame_mv[MB_MODE_COUNT]
-                                                [MAX_REF_FRAMES]) {
+                                                [MAX_REF_FRAMES],
+                                 YV12_BUFFER_CONFIG *scaled_ref_frame,
+                                 int mb_row, int mb_col) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
@@ -3229,6 +3237,13 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   int_mv cur_mv[2];
   int_mv ref_mv[2];
   int64_t this_rd = 0;
+  unsigned char tmp_ybuf[64 * 64];
+  unsigned char tmp_ubuf[32 * 32];
+  unsigned char tmp_vbuf[32 * 32];
+  int pred_exists = 0;
+  int interpolating_intpel_seen = 0;
+  int intpel_mv;
+  int64_t rd, best_rd = INT64_MAX;
 
   switch (this_mode) {
     case NEWMV:
@@ -3248,6 +3263,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                   x->nmvjointcost, x->mvcost, 96,
                                   x->e_mbd.allow_high_precision_mv);
       } else {
+        YV12_BUFFER_CONFIG backup_yv12 = xd->pre;
         int bestsme = INT_MAX;
         int further_steps, step_param = cpi->sf.first_step;
         int sadpb = x->sadperbit16;
@@ -3259,6 +3275,16 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         int tmp_row_min = x->mv_row_min;
         int tmp_row_max = x->mv_row_max;
 
+        if (scaled_ref_frame) {
+          // Swap out the reference frame for a version that's been scaled to
+          // match the resolution of the current frame, allowing the existing
+          // motion search code to be used without additional modifications.
+          xd->pre = *scaled_ref_frame;
+          xd->pre.y_buffer += mb_row * 16 * xd->pre.y_stride + mb_col * 16;
+          xd->pre.u_buffer += mb_row * 8 * xd->pre.uv_stride + mb_col * 8;
+          xd->pre.v_buffer += mb_row * 8 * xd->pre.uv_stride + mb_col * 8;
+        }
+
         vp9_clamp_mv_min_max(x, &ref_mv[0]);
 
         // mvp_full.as_int = ref_mv[0].as_int;
@@ -3267,9 +3293,6 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
         mvp_full.as_mv.col >>= 3;
         mvp_full.as_mv.row >>= 3;
-        if (mvp_full.as_int != mvp_full.as_int) {
-          mvp_full.as_int = mvp_full.as_int;
-        }
 
         // adjust search range according to sr from mv prediction
         step_param = MAX(step_param, sr);
@@ -3297,22 +3320,22 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                        x->nmvjointcost, x->mvcost,
                                        &dis, &sse);
         }
-        d->bmi.as_mv.first.as_int = tmp_mv.as_int;
-        frame_mv[NEWMV][refs[0]].as_int = d->bmi.as_mv.first.as_int;
+        d->bmi.as_mv[0].as_int = tmp_mv.as_int;
+        frame_mv[NEWMV][refs[0]].as_int = d->bmi.as_mv[0].as_int;
 
         // Add the new motion vector cost to our rolling cost variable
         *rate2 += vp9_mv_bit_cost(&tmp_mv, &ref_mv[0],
                                   x->nmvjointcost, x->mvcost,
                                   96, xd->allow_high_precision_mv);
+
+        // restore the predictor, if required
+        if (scaled_ref_frame) {
+          xd->pre = backup_yv12;
+        }
       }
       break;
-    case NEARESTMV:
     case NEARMV:
-      // Do not bother proceeding if the vector (from newmv, nearest or
-      // near) is 0,0 as this should then be coded using the zeromv mode.
-      for (i = 0; i < num_refs; ++i)
-        if (frame_mv[this_mode][refs[i]].as_int == 0)
-          return INT64_MAX;
+    case NEARESTMV:
     case ZEROMV:
     default:
       break;
@@ -3326,11 +3349,6 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     mbmi->mv[i].as_int = cur_mv[i].as_int;
   }
 
-  if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-    const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);
-    const int m = vp9_switchable_interp_map[mbmi->interp_filter];
-    *rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
-  }
 
   /* We don't include the cost of the second reference here, because there
    * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
@@ -3355,36 +3373,363 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   }
 #endif
 
+  pred_exists = 0;
+  interpolating_intpel_seen = 0;
+  // Are all MVs integer pel for Y and UV
+  intpel_mv = (mbmi->mv[0].as_mv.row & 15) == 0 &&
+              (mbmi->mv[0].as_mv.col & 15) == 0;
+  if (is_comp_pred)
+    intpel_mv &= (mbmi->mv[1].as_mv.row & 15) == 0 &&
+                 (mbmi->mv[1].as_mv.col & 15) == 0;
+  // Search for best switchable filter by checking the variance of
+  // pred error irrespective of whether the filter will be used
   if (block_size == BLOCK_64X64) {
-    vp9_build_inter64x64_predictors_sb(xd,
-                                       xd->dst.y_buffer,
-                                       xd->dst.u_buffer,
-                                       xd->dst.v_buffer,
-                                       xd->dst.y_stride,
-                                       xd->dst.uv_stride);
+    int switchable_filter_index, newbest;
+    int tmp_rate_y_i = 0, tmp_rate_u_i = 0, tmp_rate_v_i = 0;
+    int tmp_dist_y_i = 0, tmp_dist_u_i = 0, tmp_dist_v_i = 0;
+    for (switchable_filter_index = 0;
+         switchable_filter_index < VP9_SWITCHABLE_FILTERS;
+         ++switchable_filter_index) {
+      int rs = 0;
+      mbmi->interp_filter = vp9_switchable_interp[switchable_filter_index];
+      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+
+      if (cpi->common.mcomp_filter_type == SWITCHABLE) {
+        const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);
+        const int m = vp9_switchable_interp_map[mbmi->interp_filter];
+        rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
+      }
+      if (interpolating_intpel_seen && intpel_mv &&
+          vp9_is_interpolating_filter[mbmi->interp_filter]) {
+        rd = RDCOST(x->rdmult, x->rddiv,
+                    rs + tmp_rate_y_i + tmp_rate_u_i + tmp_rate_v_i,
+                    tmp_dist_y_i + tmp_dist_u_i + tmp_dist_v_i);
+      } else {
+        unsigned int sse, var;
+        int tmp_rate_y, tmp_rate_u, tmp_rate_v;
+        int tmp_dist_y, tmp_dist_u, tmp_dist_v;
+        vp9_build_inter64x64_predictors_sb(xd,
+                                           xd->dst.y_buffer,
+                                           xd->dst.u_buffer,
+                                           xd->dst.v_buffer,
+                                           xd->dst.y_stride,
+                                           xd->dst.uv_stride,
+                                           mb_row, mb_col);
+        var = vp9_variance64x64(*(b->base_src), b->src_stride,
+                                xd->dst.y_buffer, xd->dst.y_stride, &sse);
+        // Note our transform coeffs are 8 times an orthogonal transform.
+        // Hence quantizer step is also 8 times. To get effective quantizer
+        // we need to divide by 8 before sending to modeling function.
+        model_rd_from_var_lapndz(var, 64 * 64, xd->block[0].dequant[1] >> 3,
+                                 &tmp_rate_y, &tmp_dist_y);
+        var = vp9_variance32x32(x->src.u_buffer, x->src.uv_stride,
+                                xd->dst.u_buffer, xd->dst.uv_stride, &sse);
+        model_rd_from_var_lapndz(var, 32 * 32, xd->block[16].dequant[1] >> 3,
+                                 &tmp_rate_u, &tmp_dist_u);
+        var = vp9_variance32x32(x->src.v_buffer, x->src.uv_stride,
+                                xd->dst.v_buffer, xd->dst.uv_stride, &sse);
+        model_rd_from_var_lapndz(var, 32 * 32, xd->block[20].dequant[1] >> 3,
+                                 &tmp_rate_v, &tmp_dist_v);
+        rd = RDCOST(x->rdmult, x->rddiv,
+                    rs + tmp_rate_y + tmp_rate_u + tmp_rate_v,
+                    tmp_dist_y + tmp_dist_u + tmp_dist_v);
+        if (!interpolating_intpel_seen && intpel_mv &&
+            vp9_is_interpolating_filter[mbmi->interp_filter]) {
+          tmp_rate_y_i = tmp_rate_y;
+          tmp_rate_u_i = tmp_rate_u;
+          tmp_rate_v_i = tmp_rate_v;
+          tmp_dist_y_i = tmp_dist_y;
+          tmp_dist_u_i = tmp_dist_u;
+          tmp_dist_v_i = tmp_dist_v;
+        }
+      }
+      newbest = (switchable_filter_index == 0 || rd < best_rd);
+      if (newbest) {
+        best_rd = rd;
+        *best_filter = mbmi->interp_filter;
+      }
+      if ((cm->mcomp_filter_type == SWITCHABLE && newbest) ||
+          (cm->mcomp_filter_type != SWITCHABLE &&
+           cm->mcomp_filter_type == mbmi->interp_filter)) {
+        int i;
+        for (i = 0; i < 64; ++i)
+          vpx_memcpy(tmp_ybuf + i * 64,
+                     xd->dst.y_buffer + i * xd->dst.y_stride,
+                     sizeof(unsigned char) * 64);
+        for (i = 0; i < 32; ++i)
+          vpx_memcpy(tmp_ubuf + i * 32,
+                     xd->dst.u_buffer + i * xd->dst.uv_stride,
+                     sizeof(unsigned char) * 32);
+        for (i = 0; i < 32; ++i)
+          vpx_memcpy(tmp_vbuf + i * 32,
+                     xd->dst.v_buffer + i * xd->dst.uv_stride,
+                     sizeof(unsigned char) * 32);
+        pred_exists = 1;
+      }
+      interpolating_intpel_seen |=
+        intpel_mv && vp9_is_interpolating_filter[mbmi->interp_filter];
+    }
   } else if (block_size == BLOCK_32X32) {
-    vp9_build_inter32x32_predictors_sb(xd,
-                                       xd->dst.y_buffer,
-                                       xd->dst.u_buffer,
-                                       xd->dst.v_buffer,
-                                       xd->dst.y_stride,
-                                       xd->dst.uv_stride);
+    int switchable_filter_index, newbest;
+    int tmp_rate_y_i = 0, tmp_rate_u_i = 0, tmp_rate_v_i = 0;
+    int tmp_dist_y_i = 0, tmp_dist_u_i = 0, tmp_dist_v_i = 0;
+    for (switchable_filter_index = 0;
+       switchable_filter_index < VP9_SWITCHABLE_FILTERS;
+       ++switchable_filter_index) {
+      int rs = 0;
+      mbmi->interp_filter = vp9_switchable_interp[switchable_filter_index];
+      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+      if (cpi->common.mcomp_filter_type == SWITCHABLE) {
+        const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);
+        const int m = vp9_switchable_interp_map[mbmi->interp_filter];
+        rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
+      }
+      if (interpolating_intpel_seen && intpel_mv &&
+          vp9_is_interpolating_filter[mbmi->interp_filter]) {
+        rd = RDCOST(x->rdmult, x->rddiv,
+                    rs + tmp_rate_y_i + tmp_rate_u_i + tmp_rate_v_i,
+                    tmp_dist_y_i + tmp_dist_u_i + tmp_dist_v_i);
+      } else {
+        unsigned int sse, var;
+        int tmp_rate_y, tmp_rate_u, tmp_rate_v;
+        int tmp_dist_y, tmp_dist_u, tmp_dist_v;
+        vp9_build_inter32x32_predictors_sb(xd,
+                                           xd->dst.y_buffer,
+                                           xd->dst.u_buffer,
+                                           xd->dst.v_buffer,
+                                           xd->dst.y_stride,
+                                           xd->dst.uv_stride,
+                                           mb_row, mb_col);
+        var = vp9_variance32x32(*(b->base_src), b->src_stride,
+                                xd->dst.y_buffer, xd->dst.y_stride, &sse);
+        // Note our transform coeffs are 8 times an orthogonal transform.
+        // Hence quantizer step is also 8 times. To get effective quantizer
+        // we need to divide by 8 before sending to modeling function.
+        model_rd_from_var_lapndz(var, 32 * 32, xd->block[0].dequant[1] >> 3,
+                                 &tmp_rate_y, &tmp_dist_y);
+        var = vp9_variance16x16(x->src.u_buffer, x->src.uv_stride,
+                                xd->dst.u_buffer, xd->dst.uv_stride, &sse);
+        model_rd_from_var_lapndz(var, 16 * 16, xd->block[16].dequant[1] >> 3,
+                                 &tmp_rate_u, &tmp_dist_u);
+        var = vp9_variance16x16(x->src.v_buffer, x->src.uv_stride,
+                                xd->dst.v_buffer, xd->dst.uv_stride, &sse);
+        model_rd_from_var_lapndz(var, 16 * 16, xd->block[20].dequant[1] >> 3,
+                                 &tmp_rate_v, &tmp_dist_v);
+        rd = RDCOST(x->rdmult, x->rddiv,
+                    rs + tmp_rate_y + tmp_rate_u + tmp_rate_v,
+                    tmp_dist_y + tmp_dist_u + tmp_dist_v);
+        if (!interpolating_intpel_seen && intpel_mv &&
+            vp9_is_interpolating_filter[mbmi->interp_filter]) {
+          tmp_rate_y_i = tmp_rate_y;
+          tmp_rate_u_i = tmp_rate_u;
+          tmp_rate_v_i = tmp_rate_v;
+          tmp_dist_y_i = tmp_dist_y;
+          tmp_dist_u_i = tmp_dist_u;
+          tmp_dist_v_i = tmp_dist_v;
+        }
+      }
+      newbest = (switchable_filter_index == 0 || rd < best_rd);
+      if (newbest) {
+        best_rd = rd;
+        *best_filter = mbmi->interp_filter;
+      }
+      if ((cm->mcomp_filter_type == SWITCHABLE && newbest) ||
+          (cm->mcomp_filter_type != SWITCHABLE &&
+           cm->mcomp_filter_type == mbmi->interp_filter)) {
+        int i;
+        for (i = 0; i < 32; ++i)
+          vpx_memcpy(tmp_ybuf + i * 64,
+                     xd->dst.y_buffer + i * xd->dst.y_stride,
+                     sizeof(unsigned char) * 32);
+        for (i = 0; i < 16; ++i)
+          vpx_memcpy(tmp_ubuf + i * 32,
+                     xd->dst.u_buffer + i * xd->dst.uv_stride,
+                     sizeof(unsigned char) * 16);
+        for (i = 0; i < 16; ++i)
+          vpx_memcpy(tmp_vbuf + i * 32,
+                     xd->dst.v_buffer + i * xd->dst.uv_stride,
+                     sizeof(unsigned char) * 16);
+        pred_exists = 1;
+      }
+      interpolating_intpel_seen |=
+        intpel_mv && vp9_is_interpolating_filter[mbmi->interp_filter];
+    }
   } else {
+    int switchable_filter_index, newbest;
+    int tmp_rate_y_i = 0, tmp_rate_u_i = 0, tmp_rate_v_i = 0;
+    int tmp_dist_y_i = 0, tmp_dist_u_i = 0, tmp_dist_v_i = 0;
     assert(block_size == BLOCK_16X16);
-    vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);
-    if (is_comp_pred)
-      vp9_build_2nd_inter16x16_predictors_mby(xd, xd->predictor, 16);
+    for (switchable_filter_index = 0;
+       switchable_filter_index < VP9_SWITCHABLE_FILTERS;
+       ++switchable_filter_index) {
+      int rs = 0;
+      mbmi->interp_filter = vp9_switchable_interp[switchable_filter_index];
+      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+      if (cpi->common.mcomp_filter_type == SWITCHABLE) {
+        const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);
+        const int m = vp9_switchable_interp_map[mbmi->interp_filter];
+        rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
+      }
+      if (interpolating_intpel_seen && intpel_mv &&
+          vp9_is_interpolating_filter[mbmi->interp_filter]) {
+        rd = RDCOST(x->rdmult, x->rddiv,
+                    rs + tmp_rate_y_i + tmp_rate_u_i + tmp_rate_v_i,
+                    tmp_dist_y_i + tmp_dist_u_i + tmp_dist_v_i);
+      } else {
+        unsigned int sse, var;
+        int tmp_rate_y, tmp_rate_u, tmp_rate_v;
+        int tmp_dist_y, tmp_dist_u, tmp_dist_v;
+        // TODO(jkoleszar): these 2 y/uv should be replaced with one call to
+        // vp9_build_interintra_16x16_predictors_mb().
+        vp9_build_inter16x16_predictors_mby(xd, xd->predictor, 16,
+                                            mb_row, mb_col);
+
 #if CONFIG_COMP_INTERINTRA_PRED
-    if (is_comp_interintra_pred) {
-      vp9_build_interintra_16x16_predictors_mby(xd, xd->predictor, 16);
+        if (is_comp_interintra_pred) {
+          vp9_build_interintra_16x16_predictors_mby(xd, xd->predictor, 16);
+        }
+#endif
+
+        vp9_build_inter16x16_predictors_mbuv(xd, xd->predictor + 256,
+                                             xd->predictor + 320, 8,
+                                             mb_row, mb_col);
+
+#if CONFIG_COMP_INTERINTRA_PRED
+        if (is_comp_interintra_pred) {
+          vp9_build_interintra_16x16_predictors_mbuv(xd, xd->predictor + 256,
+                                                     xd->predictor + 320, 8);
+        }
+#endif
+        var = vp9_variance16x16(*(b->base_src), b->src_stride,
+                                xd->predictor, 16, &sse);
+        // Note our transform coeffs are 8 times an orthogonal transform.
+        // Hence quantizer step is also 8 times. To get effective quantizer
+        // we need to divide by 8 before sending to modeling function.
+        model_rd_from_var_lapndz(var, 16 * 16, xd->block[0].dequant[1] >> 3,
+                                 &tmp_rate_y, &tmp_dist_y);
+        var = vp9_variance8x8(x->src.u_buffer, x->src.uv_stride,
+                              &xd->predictor[256], 8, &sse);
+        model_rd_from_var_lapndz(var, 8 * 8, xd->block[16].dequant[1] >> 3,
+                                 &tmp_rate_u, &tmp_dist_u);
+        var = vp9_variance8x8(x->src.v_buffer, x->src.uv_stride,
+                              &xd->predictor[320], 8, &sse);
+        model_rd_from_var_lapndz(var, 8 * 8, xd->block[20].dequant[1] >> 3,
+                                 &tmp_rate_v, &tmp_dist_v);
+        rd = RDCOST(x->rdmult, x->rddiv,
+                    rs + tmp_rate_y + tmp_rate_u + tmp_rate_v,
+                    tmp_dist_y + tmp_dist_u + tmp_dist_v);
+        if (!interpolating_intpel_seen && intpel_mv &&
+            vp9_is_interpolating_filter[mbmi->interp_filter]) {
+          tmp_rate_y_i = tmp_rate_y;
+          tmp_rate_u_i = tmp_rate_u;
+          tmp_rate_v_i = tmp_rate_v;
+          tmp_dist_y_i = tmp_dist_y;
+          tmp_dist_u_i = tmp_dist_u;
+          tmp_dist_v_i = tmp_dist_v;
+        }
+      }
+      newbest = (switchable_filter_index == 0 || rd < best_rd);
+      if (newbest) {
+        best_rd = rd;
+        *best_filter = mbmi->interp_filter;
+      }
+      if ((cm->mcomp_filter_type == SWITCHABLE && newbest) ||
+          (cm->mcomp_filter_type != SWITCHABLE &&
+           cm->mcomp_filter_type == mbmi->interp_filter)) {
+        vpx_memcpy(tmp_ybuf, xd->predictor, sizeof(unsigned char) * 256);
+        vpx_memcpy(tmp_ubuf, xd->predictor + 256, sizeof(unsigned char) * 64);
+        vpx_memcpy(tmp_vbuf, xd->predictor + 320, sizeof(unsigned char) * 64);
+        pred_exists = 1;
+      }
+      interpolating_intpel_seen |=
+        intpel_mv && vp9_is_interpolating_filter[mbmi->interp_filter];
     }
+  }
+
+  // Set the appropriate filter
+  if (cm->mcomp_filter_type != SWITCHABLE)
+    mbmi->interp_filter = cm->mcomp_filter_type;
+  else
+    mbmi->interp_filter = *best_filter;
+  vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+
+  if (pred_exists) {
+    if (block_size == BLOCK_64X64) {
+      for (i = 0; i < 64; ++i)
+        vpx_memcpy(xd->dst.y_buffer + i * xd->dst.y_stride,  tmp_ybuf + i * 64,
+                   sizeof(unsigned char) * 64);
+      for (i = 0; i < 32; ++i)
+        vpx_memcpy(xd->dst.u_buffer + i * xd->dst.uv_stride, tmp_ubuf + i * 32,
+                   sizeof(unsigned char) * 32);
+      for (i = 0; i < 32; ++i)
+        vpx_memcpy(xd->dst.v_buffer + i * xd->dst.uv_stride, tmp_vbuf + i * 32,
+                   sizeof(unsigned char) * 32);
+    } else if (block_size == BLOCK_32X32) {
+      for (i = 0; i < 32; ++i)
+        vpx_memcpy(xd->dst.y_buffer + i * xd->dst.y_stride,  tmp_ybuf + i * 64,
+                   sizeof(unsigned char) * 32);
+      for (i = 0; i < 16; ++i)
+        vpx_memcpy(xd->dst.u_buffer + i * xd->dst.uv_stride, tmp_ubuf + i * 32,
+                   sizeof(unsigned char) * 16);
+      for (i = 0; i < 16; ++i)
+        vpx_memcpy(xd->dst.v_buffer + i * xd->dst.uv_stride, tmp_vbuf + i * 32,
+                   sizeof(unsigned char) * 16);
+    } else {
+      vpx_memcpy(xd->predictor, tmp_ybuf, sizeof(unsigned char) * 256);
+      vpx_memcpy(xd->predictor + 256, tmp_ubuf, sizeof(unsigned char) * 64);
+      vpx_memcpy(xd->predictor + 320, tmp_vbuf, sizeof(unsigned char) * 64);
+    }
+  } else {
+    // Handles the special case when a filter that is not in the
+    // switchable list (ex. bilinear, 6-tap) is indicated at the frame level
+    if (block_size == BLOCK_64X64) {
+      vp9_build_inter64x64_predictors_sb(xd,
+                                         xd->dst.y_buffer,
+                                         xd->dst.u_buffer,
+                                         xd->dst.v_buffer,
+                                         xd->dst.y_stride,
+                                         xd->dst.uv_stride,
+                                         mb_row, mb_col);
+    } else if (block_size == BLOCK_32X32) {
+      vp9_build_inter32x32_predictors_sb(xd,
+                                         xd->dst.y_buffer,
+                                         xd->dst.u_buffer,
+                                         xd->dst.v_buffer,
+                                         xd->dst.y_stride,
+                                         xd->dst.uv_stride,
+                                         mb_row, mb_col);
+    } else {
+      // TODO(jkoleszar): These y/uv fns can be replaced with their mb
+      // equivalent
+      vp9_build_inter16x16_predictors_mby(xd, xd->predictor, 16,
+                                          mb_row, mb_col);
+#if CONFIG_COMP_INTERINTRA_PRED
+      if (is_comp_interintra_pred) {
+        vp9_build_interintra_16x16_predictors_mby(xd, xd->predictor, 16);
+      }
+#endif
+      vp9_build_inter16x16_predictors_mbuv(xd, &xd->predictor[256],
+                                           &xd->predictor[320], 8,
+                                           mb_row, mb_col);
+#if CONFIG_COMP_INTERINTRA_PRED
+      if (is_comp_interintra_pred) {
+        vp9_build_interintra_16x16_predictors_mbuv(xd, &xd->predictor[256],
+                                                   &xd->predictor[320], 8);
+      }
 #endif
+    }
+  }
+
+  if (cpi->common.mcomp_filter_type == SWITCHABLE) {
+    const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);
+    const int m = vp9_switchable_interp_map[mbmi->interp_filter];
+    *rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
   }
 
   if (cpi->active_map_enabled && x->active_ptr[0] == 0)
     x->skip = 1;
   else if (x->encode_breakout) {
-    unsigned int sse, var;
+    unsigned int var, sse;
     int threshold = (xd->block[0].dequant[1]
                      * xd->block[0].dequant[1] >> 4);
 
@@ -3404,9 +3749,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     }
 
     if ((int)sse < threshold) {
-      unsigned int q2dc = xd->block[24].dequant[0];
+      unsigned int q2dc = xd->block[0].dequant[0];
       /* If there is no codeable 2nd order dc
-       or a very small uniform pixel change change */
+         or a very small uniform pixel change */
       if ((sse - var < q2dc * q2dc >> 4) ||
           (sse / 2 > var && sse - var < 64)) {
         // Check u and v to make sure skip is ok
@@ -3447,17 +3792,6 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     }
   }
 
-  if (!(*mode_excluded)) {
-    if (is_comp_pred) {
-      *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY);
-    } else {
-      *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY);
-    }
-#if CONFIG_COMP_INTERINTRA_PRED
-    if (is_comp_interintra_pred && !cm->use_interintra) *mode_excluded = 1;
-#endif
-  }
-
   if (!x->skip) {
     if (block_size == BLOCK_64X64) {
       int skippable_y, skippable_uv;
@@ -3491,30 +3825,32 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       *skippable = skippable_y && skippable_uv;
     } else {
       assert(block_size == BLOCK_16X16);
-
-      vp9_build_1st_inter16x16_predictors_mbuv(xd, &xd->predictor[256],
-                                               &xd->predictor[320], 8);
-      if (is_comp_pred)
-        vp9_build_2nd_inter16x16_predictors_mbuv(xd, &xd->predictor[256],
-                                                 &xd->predictor[320], 8);
-#if CONFIG_COMP_INTERINTRA_PRED
-      if (is_comp_interintra_pred) {
-        vp9_build_interintra_16x16_predictors_mbuv(xd, &xd->predictor[256],
-                                                   &xd->predictor[320], 8);
-      }
-#endif
       inter_mode_cost(cpi, x, rate2, distortion,
                       rate_y, distortion_y, rate_uv, distortion_uv,
                       skippable, txfm_cache);
     }
   }
+
+  if (!(*mode_excluded)) {
+    if (is_comp_pred) {
+      *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY);
+    } else {
+      *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY);
+    }
+#if CONFIG_COMP_INTERINTRA_PRED
+    if (is_comp_interintra_pred && !cm->use_interintra) *mode_excluded = 1;
+#endif
+  }
+
   return this_rd;  // if 0, this will be re-calculated by caller
 }
 
 static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                               int recon_yoffset, int recon_uvoffset,
+                               int mb_row, int mb_col,
                                int *returnrate, int *returndistortion,
                                int64_t *returnintra) {
+  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
+    VP9_ALT_FLAG };
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   union b_mode_info best_bmodes[16];
@@ -3544,6 +3880,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 #endif
   int64_t best_overall_rd = INT64_MAX;
   INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE;
+  INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE;
   int uv_intra_rate, uv_intra_distortion, uv_intra_rate_tokenonly;
   int uv_intra_skippable = 0;
   int uv_intra_rate_8x8 = 0, uv_intra_distortion_8x8 = 0, uv_intra_rate_tokenonly_8x8 = 0;
@@ -3551,7 +3888,6 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   int rate_y, UNINITIALIZED_IS_SAFE(rate_uv);
   int distortion_uv = INT_MAX;
   int64_t best_yrd = INT64_MAX;
-  int switchable_filter_index = 0;
 
   MB_PREDICTION_MODE uv_intra_mode;
   MB_PREDICTION_MODE uv_intra_mode_8x8 = 0;
@@ -3561,7 +3897,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
   int frame_mdcounts[4][4];
-  uint8_t *y_buffer[4], *u_buffer[4], *v_buffer[4];
+  YV12_BUFFER_CONFIG yv12_mb[4];
 
   unsigned int ref_costs[MAX_REF_FRAMES];
   int_mv seg_mvs[NB_PARTITIONINGS][16 /* n_blocks */][MAX_REF_FRAMES - 1];
@@ -3569,6 +3905,8 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex,
                                              cpi->common.y1dc_delta_q);
 
+  struct scale_factors scale_factor[4];
+
   vpx_memset(mode8x8, 0, sizeof(mode8x8));
   vpx_memset(&frame_mv, 0, sizeof(frame_mv));
   vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
@@ -3592,24 +3930,24 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   }
 
   if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
-    setup_buffer_inter(cpi, x, cpi->common.lst_fb_idx, LAST_FRAME,
-                       BLOCK_16X16, recon_yoffset, recon_uvoffset,
+    setup_buffer_inter(cpi, x, cpi->lst_fb_idx,
+                       LAST_FRAME, BLOCK_16X16, mb_row, mb_col,
                        frame_mv[NEARESTMV], frame_mv[NEARMV],
-                       frame_mdcounts, y_buffer, u_buffer, v_buffer);
+                       frame_mdcounts, yv12_mb, scale_factor);
   }
 
   if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
-    setup_buffer_inter(cpi, x, cpi->common.gld_fb_idx, GOLDEN_FRAME,
-                       BLOCK_16X16, recon_yoffset, recon_uvoffset,
+    setup_buffer_inter(cpi, x, cpi->gld_fb_idx,
+                       GOLDEN_FRAME, BLOCK_16X16, mb_row, mb_col,
                        frame_mv[NEARESTMV], frame_mv[NEARMV],
-                       frame_mdcounts, y_buffer, u_buffer, v_buffer);
+                       frame_mdcounts, yv12_mb, scale_factor);
   }
 
   if (cpi->ref_frame_flags & VP9_ALT_FLAG) {
-    setup_buffer_inter(cpi, x, cpi->common.alt_fb_idx, ALTREF_FRAME,
-                       BLOCK_16X16, recon_yoffset, recon_uvoffset,
+    setup_buffer_inter(cpi, x, cpi->alt_fb_idx,
+                       ALTREF_FRAME, BLOCK_16X16, mb_row, mb_col,
                        frame_mv[NEARESTMV], frame_mv[NEARMV],
-                       frame_mdcounts, y_buffer, u_buffer, v_buffer);
+                       frame_mdcounts, yv12_mb, scale_factor);
   }
 
   *returnintra = INT64_MAX;
@@ -3638,8 +3976,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   // that depend on the current prediction etc.
   estimate_ref_frame_costs(cpi, segment_id, ref_costs);
 
-  for (mode_index = 0; mode_index < MAX_MODES;
-       mode_index += (!switchable_filter_index)) {
+  for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
     int64_t this_rd = INT64_MAX;
     int disable_skip = 0, skippable = 0;
     int other_cost = 0;
@@ -3649,6 +3986,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 #endif
     int mode_excluded = 0;
     int64_t txfm_cache[NB_TXFM_MODES] = { 0 };
+    YV12_BUFFER_CONFIG *scaled_ref_frame;
 
     // These variables hold are rolling total cost and distortion for this mode
     rate2 = 0;
@@ -3664,24 +4002,38 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     mbmi->ref_frame = vp9_mode_order[mode_index].ref_frame;
     mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
 
-    // Evaluate all sub-pel filters irrespective of whether we can use
-    // them for this frame.
-    if (this_mode >= NEARESTMV && this_mode <= SPLITMV) {
-      mbmi->interp_filter =
-          vp9_switchable_interp[switchable_filter_index++];
-      if (switchable_filter_index == VP9_SWITCHABLE_FILTERS)
-        switchable_filter_index = 0;
-      if ((cm->mcomp_filter_type != SWITCHABLE) &&
-          (cm->mcomp_filter_type != mbmi->interp_filter)) {
-        mode_excluded = 1;
-      }
-      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
-    }
+    mbmi->interp_filter = cm->mcomp_filter_type;
+
+    set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
+                      scale_factor);
+
+    vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
 
     // Test best rd so far against threshold for trying this mode.
     if (best_rd <= cpi->rd_threshes[mode_index])
       continue;
 
+    // Ensure that the references used by this mode are available.
+    if (mbmi->ref_frame &&
+        !(cpi->ref_frame_flags & flag_list[mbmi->ref_frame]))
+      continue;
+
+    if (mbmi->second_ref_frame > 0 &&
+        !(cpi->ref_frame_flags & flag_list[mbmi->second_ref_frame]))
+      continue;
+
+    // only scale on zeromv.
+    if (mbmi->ref_frame > 0 &&
+          (yv12_mb[mbmi->ref_frame].y_width != cm->mb_cols * 16 ||
+           yv12_mb[mbmi->ref_frame].y_height != cm->mb_rows * 16) &&
+        this_mode != ZEROMV)
+      continue;
+    if (mbmi->second_ref_frame > 0 &&
+          (yv12_mb[mbmi->second_ref_frame].y_width != cm->mb_cols * 16 ||
+           yv12_mb[mbmi->second_ref_frame].y_height != cm->mb_rows * 16) &&
+        this_mode != ZEROMV)
+      continue;
+
     // current coding mode under rate-distortion optimization test loop
 #if CONFIG_COMP_INTERINTRA_PRED
     mbmi->interintra_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
@@ -3693,18 +4045,16 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
         !vp9_check_segref(xd, segment_id, mbmi->ref_frame)) {
       continue;
-    // If the segment mode feature is enabled....
+    // If the segment skip feature is enabled....
     // then do nothing if the current mode is not allowed..
-    } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) &&
-               (this_mode !=
-                vp9_get_segdata(xd, segment_id, SEG_LVL_MODE))) {
+    } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) &&
+               (this_mode != ZEROMV)) {
       continue;
-    // Disable this drop out case if either the mode or ref frame
-    // segment level feature is enabled for this segment. This is to
+    // Disable this drop out case if the ref frame segment
+    // level feature is enabled for this segment. This is to
     // prevent the possibility that the we end up unable to pick any mode.
-    } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
-               !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
-      // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
+    } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME)) {
+      // Only consider ZEROMV/ALTREF_FRAME for alt ref frame overlay,
       // unless ARNR filtering is enabled in which case we want
       // an unfiltered alternative
       if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
@@ -3716,22 +4066,31 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     }
 
     /* everything but intra */
+    scaled_ref_frame = NULL;
     if (mbmi->ref_frame) {
       int ref = mbmi->ref_frame;
+      int fb;
 
-      xd->pre.y_buffer = y_buffer[ref];
-      xd->pre.u_buffer = u_buffer[ref];
-      xd->pre.v_buffer = v_buffer[ref];
+      xd->pre = yv12_mb[ref];
       best_ref_mv = mbmi->ref_mvs[ref][0];
       vpx_memcpy(mdcounts, frame_mdcounts[ref], sizeof(mdcounts));
+
+      if (mbmi->ref_frame == LAST_FRAME) {
+        fb = cpi->lst_fb_idx;
+      } else if (mbmi->ref_frame == GOLDEN_FRAME) {
+        fb = cpi->gld_fb_idx;
+      } else {
+        fb = cpi->alt_fb_idx;
+      }
+
+      if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])
+        scaled_ref_frame = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
     }
 
     if (mbmi->second_ref_frame > 0) {
       int ref = mbmi->second_ref_frame;
 
-      xd->second_pre.y_buffer = y_buffer[ref];
-      xd->second_pre.u_buffer = u_buffer[ref];
-      xd->second_pre.v_buffer = v_buffer[ref];
+      xd->second_pre = yv12_mb[ref];
       second_best_ref_mv = mbmi->ref_mvs[ref][0];
     }
 
@@ -3798,8 +4157,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
           // the BPRED mode : x->mbmode_cost[xd->frame_type][BPRED];
           mbmi->txfm_size = TX_4X4;
           tmp_rd = rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y,
-                                             &distortion, best_yrd,
-                                             cpi->update_context);
+                                             &distortion, best_yrd);
           rate2 += rate;
           rate2 += intra_cost_penalty;
           distortion2 += distortion;
@@ -3898,29 +4256,108 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     // special case it.
     else if (this_mode == SPLITMV) {
       const int is_comp_pred = mbmi->second_ref_frame > 0;
-      int64_t tmp_rd, this_rd_thresh;
+      int64_t this_rd_thresh;
+      int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;
+      int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;
+      int tmp_best_distortion = INT_MAX, tmp_best_skippable = 0;
+      int switchable_filter_index;
       int_mv *second_ref = is_comp_pred ? &second_best_ref_mv : NULL;
+      union b_mode_info tmp_best_bmodes[16];
+      MB_MODE_INFO tmp_best_mbmode;
+      PARTITION_INFO tmp_best_partition;
+      int pred_exists = 0;
 
       this_rd_thresh =
-              (mbmi->ref_frame == LAST_FRAME) ?
+          (mbmi->ref_frame == LAST_FRAME) ?
           cpi->rd_threshes[THR_NEWMV] : cpi->rd_threshes[THR_NEWA];
       this_rd_thresh =
-              (mbmi->ref_frame == GOLDEN_FRAME) ?
+          (mbmi->ref_frame == GOLDEN_FRAME) ?
           cpi->rd_threshes[THR_NEWG] : this_rd_thresh;
 
-      tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,
-                                           second_ref, best_yrd, mdcounts,
-                                           &rate, &rate_y, &distortion,
-                                           &skippable,
-                                           (int)this_rd_thresh, seg_mvs,
-                                           txfm_cache);
+      for (switchable_filter_index = 0;
+           switchable_filter_index < VP9_SWITCHABLE_FILTERS;
+           ++switchable_filter_index) {
+        int newbest;
+        mbmi->interp_filter =
+            vp9_switchable_interp[switchable_filter_index];
+        vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+
+        tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,
+                                             second_ref, best_yrd, mdcounts,
+                                             &rate, &rate_y, &distortion,
+                                             &skippable,
+                                             (int)this_rd_thresh, seg_mvs,
+                                             txfm_cache);
+        if (cpi->common.mcomp_filter_type == SWITCHABLE) {
+          int rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
+                   [vp9_get_pred_context(&cpi->common, xd,
+                                         PRED_SWITCHABLE_INTERP)]
+                   [vp9_switchable_interp_map[mbmi->interp_filter]];
+          tmp_rd += RDCOST(x->rdmult, x->rddiv, rs, 0);
+        }
+        newbest = (tmp_rd < tmp_best_rd);
+        if (newbest) {
+          tmp_best_filter = mbmi->interp_filter;
+          tmp_best_rd = tmp_rd;
+        }
+        if ((newbest && cm->mcomp_filter_type == SWITCHABLE) ||
+            (mbmi->interp_filter == cm->mcomp_filter_type &&
+             cm->mcomp_filter_type != SWITCHABLE)) {
+          tmp_best_rdu = tmp_rd;
+          tmp_best_rate = rate;
+          tmp_best_ratey = rate_y;
+          tmp_best_distortion = distortion;
+          tmp_best_skippable = skippable;
+          vpx_memcpy(&tmp_best_mbmode, mbmi, sizeof(MB_MODE_INFO));
+          vpx_memcpy(&tmp_best_partition, x->partition_info,
+                     sizeof(PARTITION_INFO));
+          for (i = 0; i < 16; i++) {
+            tmp_best_bmodes[i] = xd->block[i].bmi;
+          }
+          pred_exists = 1;
+        }
+      }  // switchable_filter_index loop
+
+      mbmi->interp_filter = (cm->mcomp_filter_type == SWITCHABLE ?
+                             tmp_best_filter : cm->mcomp_filter_type);
+      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+      if (!pred_exists) {
+        // Handles the special case when a filter that is not in the
+        // switchable list (bilinear, 6-tap) is indicated at the frame level
+        tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,
+                                             second_ref, best_yrd, mdcounts,
+                                             &rate, &rate_y, &distortion,
+                                             &skippable,
+                                             (int)this_rd_thresh, seg_mvs,
+                                             txfm_cache);
+      } else {
+        if (cpi->common.mcomp_filter_type == SWITCHABLE) {
+          int rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
+                   [vp9_get_pred_context(&cpi->common, xd,
+                                         PRED_SWITCHABLE_INTERP)]
+                   [vp9_switchable_interp_map[mbmi->interp_filter]];
+          tmp_best_rdu -= RDCOST(x->rdmult, x->rddiv, rs, 0);
+        }
+        tmp_rd = tmp_best_rdu;
+        rate = tmp_best_rate;
+        rate_y = tmp_best_ratey;
+        distortion = tmp_best_distortion;
+        skippable = tmp_best_skippable;
+        vpx_memcpy(mbmi, &tmp_best_mbmode, sizeof(MB_MODE_INFO));
+        vpx_memcpy(x->partition_info, &tmp_best_partition,
+                   sizeof(PARTITION_INFO));
+        for (i = 0; i < 16; i++) {
+          xd->block[i].bmi = tmp_best_bmodes[i];
+        }
+      }
+
       rate2 += rate;
       distortion2 += distortion;
 
       if (cpi->common.mcomp_filter_type == SWITCHABLE)
         rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
             [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)]
-                [vp9_switchable_interp_map[mbmi->interp_filter]];
+            [vp9_switchable_interp_map[mbmi->interp_filter]];
 
       // If even the 'Y' rd value of split is higher than best so far
       // then dont bother looking at UV
@@ -3928,7 +4365,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         int uv_skippable;
 
         rd_inter4x4_uv(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
-                       cpi->common.full_pixel);
+                       cpi->common.full_pixel, mb_row, mb_col);
         rate2 += rate_uv;
         distortion2 += distortion_uv;
         skippable = skippable && uv_skippable;
@@ -3969,8 +4406,9 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 #endif
                                   &rate_y, &distortion,
                                   &rate_uv, &distortion_uv,
-                                  &mode_excluded, &disable_skip, recon_yoffset,
-                                  mode_index, frame_mv);
+                                  &mode_excluded, &disable_skip,
+                                  mode_index, &tmp_best_filter, frame_mv,
+                                  scaled_ref_frame, mb_row, mb_col);
       if (this_rd == INT64_MAX)
         continue;
     }
@@ -3995,10 +4433,8 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       if (cpi->common.mb_no_coeff_skip) {
         int mb_skip_allowed;
 
-        // Is Mb level skip allowed for this mb.
-        mb_skip_allowed =
-          !vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
-          vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+        // Is Mb level skip allowed (i.e. not coded at segment level).
+        mb_skip_allowed = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
 
         if (skippable) {
           mbmi->mb_skip_coeff = 1;
@@ -4061,7 +4497,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
     if (this_rd < best_overall_rd) {
       best_overall_rd = this_rd;
-      best_filter = mbmi->interp_filter;
+      best_filter = tmp_best_filter;
       best_mode = this_mode;
 #if CONFIG_COMP_INTERINTRA_PRED
       is_best_interintra = (mbmi->second_ref_frame == INTRA_FRAME);
@@ -4175,7 +4611,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
     if (x->skip && !mode_excluded)
       break;
-    }
+  }
 
   assert((cm->mcomp_filter_type == SWITCHABLE) ||
          (cm->mcomp_filter_type == best_mbmode.interp_filter) ||
@@ -4204,12 +4640,11 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         cpi->rd_thresh_mult[best_mode_index];
   }
 
-  // This code force Altref,0,0 and skip for the frame that overlays a
+  // This code forces Altref,0,0 and skip for the frame that overlays
   // an alrtef unless Altref is filtered. However, this is unsafe if
-  // segment level coding of ref frame or mode is enabled for this
+  // segment level coding of ref frame is enabled for this
   // segment.
   if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
-      !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) &&
       cpi->is_src_frame_alt_ref &&
       (cpi->oxcf.arnr_max_frames == 0) &&
       (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) {
@@ -4224,6 +4659,8 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     mbmi->mb_skip_coeff =
       (cpi->common.mb_no_coeff_skip) ? 1 : 0;
     mbmi->partitioning = 0;
+    set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
+                      scale_factor);
 
     vpx_memset(best_pred_diff, 0, sizeof(best_pred_diff));
     vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
@@ -4244,10 +4681,12 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
   if (best_mbmode.mode == SPLITMV) {
     for (i = 0; i < 16; i++)
-      xd->mode_info_context->bmi[i].as_mv.first.as_int = best_bmodes[i].as_mv.first.as_int;
+      xd->mode_info_context->bmi[i].as_mv[0].as_int =
+          best_bmodes[i].as_mv[0].as_int;
     if (mbmi->second_ref_frame > 0)
       for (i = 0; i < 16; i++)
-        xd->mode_info_context->bmi[i].as_mv.second.as_int = best_bmodes[i].as_mv.second.as_int;
+        xd->mode_info_context->bmi[i].as_mv[1].as_int =
+            best_bmodes[i].as_mv[1].as_int;
 
     vpx_memcpy(x->partition_info, &best_partition, sizeof(PARTITION_INFO));
 
@@ -4265,7 +4704,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   if (!x->skip) {
     for (i = 0; i < NB_TXFM_MODES; i++) {
       if (best_txfm_rd[i] == INT64_MAX)
-        best_txfm_diff[i] = INT_MIN;
+        best_txfm_diff[i] = 0;
       else
         best_txfm_diff[i] = best_rd - best_txfm_rd[i];
     }
@@ -4274,6 +4713,8 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   }
 
 end:
+  set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
+                    scale_factor);
   store_coding_context(x, &x->mb_context[xd->sb_index][xd->mb_index],
                        best_mode_index, &best_partition,
                        &mbmi->ref_mvs[mbmi->ref_frame][0],
@@ -4291,22 +4732,28 @@ void vp9_rd_pick_intra_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,
   int rate_y_tokenonly = 0, rate_uv_tokenonly;
   int dist_y = 0, dist_uv;
   int y_skip = 0, uv_skip;
-  int64_t txfm_cache[NB_TXFM_MODES];
+  int64_t txfm_cache[NB_TXFM_MODES], err;
+  int i;
 
-  rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
-                                   &dist_y, &y_skip, txfm_cache);
+  err = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
+                               &dist_y, &y_skip, txfm_cache);
   rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
-                                     &dist_uv, &uv_skip);
+                          &dist_uv, &uv_skip);
 
   if (cpi->common.mb_no_coeff_skip && y_skip && uv_skip) {
     *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
                   vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
     *returndist = dist_y + (dist_uv >> 2);
+    memset(x->sb32_context[xd->sb_index].txfm_rd_diff, 0,
+           sizeof(x->sb32_context[xd->sb_index].txfm_rd_diff));
   } else {
     *returnrate = rate_y + rate_uv;
     if (cpi->common.mb_no_coeff_skip)
       *returnrate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
     *returndist = dist_y + (dist_uv >> 2);
+    for (i = 0; i < NB_TXFM_MODES; i++) {
+      x->sb32_context[xd->sb_index].txfm_rd_diff[i] = err - txfm_cache[i];
+    }
   }
 }
 
@@ -4319,22 +4766,28 @@ void vp9_rd_pick_intra_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,
   int rate_y_tokenonly = 0, rate_uv_tokenonly;
   int dist_y = 0, dist_uv;
   int y_skip = 0, uv_skip;
-  int64_t txfm_cache[NB_TXFM_MODES];
+  int64_t txfm_cache[NB_TXFM_MODES], err;
+  int i;
 
-  rd_pick_intra_sb64y_mode(cpi, x, &rate_y, &rate_y_tokenonly,
-                                     &dist_y, &y_skip, txfm_cache);
+  err = rd_pick_intra_sb64y_mode(cpi, x, &rate_y, &rate_y_tokenonly,
+                                 &dist_y, &y_skip, txfm_cache);
   rd_pick_intra_sb64uv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
-                                       &dist_uv, &uv_skip);
+                            &dist_uv, &uv_skip);
 
   if (cpi->common.mb_no_coeff_skip && y_skip && uv_skip) {
     *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
     vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
     *returndist = dist_y + (dist_uv >> 2);
+    memset(x->sb64_context.txfm_rd_diff, 0,
+           sizeof(x->sb64_context.txfm_rd_diff));
   } else {
     *returnrate = rate_y + rate_uv;
     if (cm->mb_no_coeff_skip)
       *returnrate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
     *returndist = dist_y + (dist_uv >> 2);
+    for (i = 0; i < NB_TXFM_MODES; i++) {
+      x->sb64_context.txfm_rd_diff[i] = err - txfm_cache[i];
+    }
   }
 }
 
@@ -4392,10 +4845,10 @@ void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
   mode8x8[2]= xd->mode_info_context->bmi[8].as_mode.first;
   mode8x8[3]= xd->mode_info_context->bmi[10].as_mode.first;
 
+  mbmi->txfm_size = TX_4X4;
   error4x4 = rd_pick_intra4x4mby_modes(cpi, x,
                                        &rate4x4, &rate4x4_tokenonly,
-                                       &dist4x4, error16x16,
-                                       cpi->update_context);
+                                       &dist4x4, error16x16);
 
   mbmi->mb_skip_coeff = 0;
   if (cpi->common.mb_no_coeff_skip &&
@@ -4457,7 +4910,7 @@ void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
 }
 
 static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
-                                         int recon_yoffset, int recon_uvoffset,
+                                         int mb_row, int mb_col,
                                          int *returnrate,
                                          int *returndistortion,
                                          int block_size) {
@@ -4471,13 +4924,13 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   int comp_pred, i;
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
   int frame_mdcounts[4][4];
-  uint8_t *y_buffer[4];
-  uint8_t *u_buffer[4];
-  uint8_t *v_buffer[4];
+  YV12_BUFFER_CONFIG yv12_mb[4];
   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
                                     VP9_ALT_FLAG };
-  int idx_list[4] = { 0, cpi->common.lst_fb_idx, cpi->common.gld_fb_idx,
-                      cpi->common.alt_fb_idx };
+  int idx_list[4] = {0,
+                     cpi->lst_fb_idx,
+                     cpi->gld_fb_idx,
+                     cpi->alt_fb_idx};
   int mdcounts[4];
   int near_sadidx[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
   int saddone = 0;
@@ -4496,16 +4949,16 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 #endif
   int64_t best_overall_rd = INT64_MAX;
   INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE;
+  INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE;
   int rate_uv_4x4 = 0, rate_uv_8x8 = 0, rate_uv_tokenonly_4x4 = 0,
       rate_uv_tokenonly_8x8 = 0;
   int dist_uv_4x4 = 0, dist_uv_8x8 = 0, uv_skip_4x4 = 0, uv_skip_8x8 = 0;
   MB_PREDICTION_MODE mode_uv_4x4 = NEARESTMV, mode_uv_8x8 = NEARESTMV;
-  int switchable_filter_index = 0;
   int rate_uv_16x16 = 0, rate_uv_tokenonly_16x16 = 0;
   int dist_uv_16x16 = 0, uv_skip_16x16 = 0;
   MB_PREDICTION_MODE mode_uv_16x16 = NEARESTMV;
+  struct scale_factors scale_factor[4];
 
-  x->skip = 0;
   xd->mode_info_context->mbmi.segment_id = segment_id;
   estimate_ref_frame_costs(cpi, segment_id, ref_costs);
   vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
@@ -4518,9 +4971,9 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
       setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame, block_size,
-                         recon_yoffset, recon_uvoffset, frame_mv[NEARESTMV],
+                         mb_row, mb_col, frame_mv[NEARESTMV],
                          frame_mv[NEARMV], frame_mdcounts,
-                         y_buffer, u_buffer, v_buffer);
+                         yv12_mb, scale_factor);
     }
     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
     frame_mv[ZEROMV][ref_frame].as_int = 0;
@@ -4570,8 +5023,7 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     }
   }
 
-  for (mode_index = 0; mode_index < MAX_MODES;
-       mode_index += (!switchable_filter_index)) {
+  for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
     int mode_excluded = 0;
     int64_t this_rd = INT64_MAX;
     int disable_skip = 0;
@@ -4588,10 +5040,10 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     // Test best rd so far against threshold for trying this mode.
     if (best_rd <= cpi->rd_threshes[mode_index] ||
         cpi->rd_threshes[mode_index] == INT_MAX) {
-      switchable_filter_index = 0;
       continue;
     }
 
+    x->skip = 0;
     this_mode = vp9_mode_order[mode_index].mode;
     ref_frame = vp9_mode_order[mode_index].ref_frame;
     if (!(ref_frame == INTRA_FRAME ||
@@ -4600,6 +5052,8 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     }
     mbmi->ref_frame = ref_frame;
     mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
+    set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
+                      scale_factor);
     comp_pred = mbmi->second_ref_frame > INTRA_FRAME;
     mbmi->mode = this_mode;
     mbmi->uv_mode = DC_PRED;
@@ -4607,19 +5061,11 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     mbmi->interintra_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
     mbmi->interintra_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
 #endif
+
     // Evaluate all sub-pel filters irrespective of whether we can use
     // them for this frame.
-    if (this_mode >= NEARESTMV && this_mode <= SPLITMV) {
-      mbmi->interp_filter =
-          vp9_switchable_interp[switchable_filter_index++];
-      if (switchable_filter_index == VP9_SWITCHABLE_FILTERS)
-        switchable_filter_index = 0;
-      if ((cm->mcomp_filter_type != SWITCHABLE) &&
-          (cm->mcomp_filter_type != mbmi->interp_filter)) {
-        mode_excluded = 1;
-      }
-      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
-    }
+    mbmi->interp_filter = cm->mcomp_filter_type;
+    vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
 
     // if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
     //  continue;
@@ -4640,10 +5086,10 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       if (!(cpi->ref_frame_flags & flag_list[second_ref]))
         continue;
       mbmi->second_ref_frame = second_ref;
+      set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
+                        scale_factor);
 
-      xd->second_pre.y_buffer = y_buffer[second_ref];
-      xd->second_pre.u_buffer = u_buffer[second_ref];
-      xd->second_pre.v_buffer = v_buffer[second_ref];
+      xd->second_pre = yv12_mb[second_ref];
       mode_excluded =
           mode_excluded ?
               mode_excluded : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;
@@ -4661,9 +5107,7 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       }
     }
 
-    xd->pre.y_buffer = y_buffer[ref_frame];
-    xd->pre.u_buffer = u_buffer[ref_frame];
-    xd->pre.v_buffer = v_buffer[ref_frame];
+    xd->pre = yv12_mb[ref_frame];
     vpx_memcpy(mdcounts, frame_mdcounts[ref_frame], sizeof(mdcounts));
 
     // If the segment reference frame feature is enabled....
@@ -4671,16 +5115,15 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
         !vp9_check_segref(xd, segment_id, ref_frame)) {
       continue;
-    // If the segment mode feature is enabled....
+    // If the segment skip feature is enabled....
     // then do nothing if the current mode is not allowed..
-    } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) &&
-               (this_mode != vp9_get_segdata(xd, segment_id, SEG_LVL_MODE))) {
+    } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) &&
+               (this_mode != ZEROMV)) {
       continue;
-    // Disable this drop out case if either the mode or ref frame
+    // Disable this drop out case if the ref frame
     // segment level feature is enabled for this segment. This is to
     // prevent the possibility that we end up unable to pick any mode.
-    } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
-               !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
+    } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME)) {
       // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
       // unless ARNR filtering is enabled in which case we want
       // an unfiltered alternative
@@ -4722,6 +5165,20 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       rate2 = rate_y + x->mbmode_cost[cm->frame_type][mbmi->mode] + rate_uv;
       distortion2 = distortion_y + distortion_uv;
     } else {
+      YV12_BUFFER_CONFIG *scaled_ref_frame = NULL;
+      int fb;
+
+      if (mbmi->ref_frame == LAST_FRAME) {
+        fb = cpi->lst_fb_idx;
+      } else if (mbmi->ref_frame == GOLDEN_FRAME) {
+        fb = cpi->gld_fb_idx;
+      } else {
+        fb = cpi->alt_fb_idx;
+      }
+
+      if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])
+        scaled_ref_frame = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
+
 #if CONFIG_COMP_INTERINTRA_PRED
       if (mbmi->second_ref_frame == INTRA_FRAME) {
         if (best_intra16_mode == DC_PRED - 1) continue;
@@ -4742,8 +5199,9 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 #endif
                                   &rate_y, &distortion_y,
                                   &rate_uv, &distortion_uv,
-                                  &mode_excluded, &disable_skip, recon_yoffset,
-                                  mode_index, frame_mv);
+                                  &mode_excluded, &disable_skip,
+                                  mode_index, &tmp_best_filter, frame_mv,
+                                  scaled_ref_frame, mb_row, mb_col);
       if (this_rd == INT64_MAX)
         continue;
     }
@@ -4769,10 +5227,8 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       if (cpi->common.mb_no_coeff_skip) {
         int mb_skip_allowed;
 
-        // Is Mb level skip allowed for this mb.
-        mb_skip_allowed =
-          !vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
-          vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+        // Is Mb level skip allowed (i.e. not coded at segment level).
+        mb_skip_allowed = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
 
         if (skippable) {
           // Back out the coefficient coding costs
@@ -4832,7 +5288,7 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 
     if (this_rd < best_overall_rd) {
       best_overall_rd = this_rd;
-      best_filter = mbmi->interp_filter;
+      best_filter = tmp_best_filter;
       best_mode = this_mode;
 #if CONFIG_COMP_INTERINTRA_PRED
       is_best_interintra = (mbmi->second_ref_frame == INTRA_FRAME);
@@ -4956,10 +5412,8 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 
   // This code forces Altref,0,0 and skip for the frame that overlays a
   // an alrtef unless Altref is filtered. However, this is unsafe if
-  // segment level coding of ref frame or mode is enabled for this
-  // segment.
+  // segment level coding of ref frame is enabled for this segment.
   if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
-      !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) &&
       cpi->is_src_frame_alt_ref &&
       (cpi->oxcf.arnr_max_frames == 0) &&
       (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) {
@@ -4971,7 +5425,7 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     mbmi->mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0;
     mbmi->partitioning = 0;
     mbmi->txfm_size = cm->txfm_mode == TX_MODE_SELECT ?
-                      TX_16X16 : cm->txfm_mode;
+                      TX_32X32 : cm->txfm_mode;
 
     vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
     vpx_memset(best_pred_diff, 0, sizeof(best_pred_diff));
@@ -4991,7 +5445,7 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   if (!x->skip) {
     for (i = 0; i < NB_TXFM_MODES; i++) {
       if (best_txfm_rd[i] == INT64_MAX)
-        best_txfm_diff[i] = INT_MIN;
+        best_txfm_diff[i] = 0;
       else
         best_txfm_diff[i] = best_rd - best_txfm_rd[i];
     }
@@ -5000,6 +5454,8 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   }
 
  end:
+  set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
+                    scale_factor);
   {
     PICK_MODE_CONTEXT *p = (block_size == BLOCK_32X32) ?
                             &x->sb32_context[xd->sb_index] :
@@ -5015,24 +5471,23 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 }
 
 int64_t vp9_rd_pick_inter_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,
-                                    int recon_yoffset, int recon_uvoffset,
+                                    int mb_row, int mb_col,
                                     int *returnrate,
                                     int *returndistortion) {
-  return vp9_rd_pick_inter_mode_sb(cpi, x, recon_yoffset, recon_uvoffset,
+  return vp9_rd_pick_inter_mode_sb(cpi, x, mb_row, mb_col,
                                    returnrate, returndistortion, BLOCK_32X32);
 }
 
 int64_t vp9_rd_pick_inter_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,
-                                    int recon_yoffset, int recon_uvoffset,
+                                    int mb_row, int mb_col,
                                     int *returnrate,
                                     int *returndistortion) {
-  return vp9_rd_pick_inter_mode_sb(cpi, x, recon_yoffset, recon_uvoffset,
+  return vp9_rd_pick_inter_mode_sb(cpi, x, mb_row, mb_col,
                                    returnrate, returndistortion, BLOCK_64X64);
 }
 
 void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
-                                    int recon_yoffset,
-                                    int recon_uvoffset,
+                                    int mb_row, int mb_col,
                                     int *totalrate, int *totaldist) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
@@ -5050,7 +5505,7 @@ void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
   {
     int zbin_mode_boost_enabled = cpi->zbin_mode_boost_enabled;
 
-    rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate,
+    rd_pick_inter_mode(cpi, x, mb_row, mb_col, &rate,
                        &distortion, &intra_error);
 
     /* restore cpi->zbin_mode_boost_enabled */
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index 8ee2c0bf9..01b156044 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -29,15 +29,15 @@ extern void vp9_rd_pick_intra_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,
                                         int *r, int *d);
 
 extern void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
-                                           int ref_yoffset, int ref_uvoffset,
+                                           int mb_row, int mb_col,
                                            int *r, int *d);
 
 extern int64_t vp9_rd_pick_inter_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,
-                                           int ref_yoffset, int ref_uvoffset,
+                                           int mb_row, int mb_col,
                                            int *r, int *d);
 
 extern int64_t vp9_rd_pick_inter_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,
-                                           int ref_yoffset, int ref_uvoffset,
+                                           int mb_row, int mb_col,
                                            int *r, int *d);
 
 extern void vp9_init_me_luts();
diff --git a/vp9/encoder/vp9_sad_c.c b/vp9/encoder/vp9_sad_c.c
index 84121f79c..dc21f02f6 100644
--- a/vp9/encoder/vp9_sad_c.c
+++ b/vp9/encoder/vp9_sad_c.c
@@ -13,12 +13,13 @@
 #include "vp9/common/vp9_sadmxn.h"
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
+#include "./vp9_rtcd.h"
 
 unsigned int vp9_sad64x64_c(const uint8_t *src_ptr,
                             int  src_stride,
                             const uint8_t *ref_ptr,
                             int  ref_stride,
-                            int max_sad) {
+                            unsigned int max_sad) {
   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 64, 64);
 }
 
@@ -26,7 +27,7 @@ unsigned int vp9_sad32x32_c(const uint8_t *src_ptr,
                             int  src_stride,
                             const uint8_t *ref_ptr,
                             int  ref_stride,
-                            int max_sad) {
+                            unsigned int max_sad) {
   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32);
 }
 
@@ -34,7 +35,7 @@ unsigned int vp9_sad16x16_c(const uint8_t *src_ptr,
                             int  src_stride,
                             const uint8_t *ref_ptr,
                             int  ref_stride,
-                            int max_sad) {
+                            unsigned int max_sad) {
   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16);
 }
 
@@ -42,7 +43,7 @@ unsigned int vp9_sad8x8_c(const uint8_t *src_ptr,
                           int  src_stride,
                           const uint8_t *ref_ptr,
                           int  ref_stride,
-                          int max_sad) {
+                          unsigned int max_sad) {
   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 8);
 }
 
@@ -51,7 +52,7 @@ unsigned int vp9_sad16x8_c(const uint8_t *src_ptr,
                            int  src_stride,
                            const uint8_t *ref_ptr,
                            int  ref_stride,
-                           int max_sad) {
+                           unsigned int max_sad) {
   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 8);
 }
 
@@ -59,7 +60,7 @@ unsigned int vp9_sad8x16_c(const uint8_t *src_ptr,
                            int  src_stride,
                            const uint8_t *ref_ptr,
                            int  ref_stride,
-                           int max_sad) {
+                           unsigned int max_sad) {
   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 16);
 }
 
@@ -68,7 +69,7 @@ unsigned int vp9_sad4x4_c(const uint8_t *src_ptr,
                           int  src_stride,
                           const uint8_t *ref_ptr,
                           int  ref_stride,
-                          int max_sad) {
+                          unsigned int max_sad) {
   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4);
 }
 
@@ -77,12 +78,12 @@ void vp9_sad64x64x3_c(const uint8_t *src_ptr,
                       const uint8_t *ref_ptr,
                       int  ref_stride,
                       unsigned int *sad_array) {
-  sad_array[0] = vp9_sad64x64_c(src_ptr, src_stride,
-                                ref_ptr, ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad64x64_c(src_ptr, src_stride,
-                                ref_ptr + 1, ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad64x64_c(src_ptr, src_stride,
-                                ref_ptr + 2, ref_stride, 0x7fffffff);
+  sad_array[0] = vp9_sad64x64(src_ptr, src_stride, ref_ptr, ref_stride,
+                              0x7fffffff);
+  sad_array[1] = vp9_sad64x64(src_ptr, src_stride, ref_ptr + 1, ref_stride,
+                              0x7fffffff);
+  sad_array[2] = vp9_sad64x64(src_ptr, src_stride, ref_ptr + 2, ref_stride,
+                              0x7fffffff);
 }
 
 void vp9_sad32x32x3_c(const uint8_t *src_ptr,
@@ -90,74 +91,74 @@ void vp9_sad32x32x3_c(const uint8_t *src_ptr,
                       const uint8_t *ref_ptr,
                       int  ref_stride,
                       unsigned int *sad_array) {
-  sad_array[0] = vp9_sad32x32_c(src_ptr, src_stride,
-                                ref_ptr, ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad32x32_c(src_ptr, src_stride,
-                                ref_ptr + 1, ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad32x32_c(src_ptr, src_stride,
-                                ref_ptr + 2, ref_stride, 0x7fffffff);
+  sad_array[0] = vp9_sad32x32(src_ptr, src_stride,
+                              ref_ptr, ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad32x32(src_ptr, src_stride,
+                              ref_ptr + 1, ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad32x32(src_ptr, src_stride,
+                              ref_ptr + 2, ref_stride, 0x7fffffff);
 }
 
 void vp9_sad64x64x8_c(const uint8_t *src_ptr,
                       int  src_stride,
                       const uint8_t *ref_ptr,
                       int  ref_stride,
-                      uint16_t *sad_array) {
-  sad_array[0] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride,
-                                          ref_ptr, ref_stride,
-                                          0x7fffffff);
-  sad_array[1] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride,
-                                          ref_ptr + 1, ref_stride,
-                                          0x7fffffff);
-  sad_array[2] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride,
-                                          ref_ptr + 2, ref_stride,
-                                          0x7fffffff);
-  sad_array[3] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride,
-                                          ref_ptr + 3, ref_stride,
-                                          0x7fffffff);
-  sad_array[4] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride,
-                                          ref_ptr + 4, ref_stride,
-                                          0x7fffffff);
-  sad_array[5] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride,
-                                          ref_ptr + 5, ref_stride,
-                                          0x7fffffff);
-  sad_array[6] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride,
-                                          ref_ptr + 6, ref_stride,
-                                          0x7fffffff);
-  sad_array[7] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride,
-                                          ref_ptr + 7, ref_stride,
-                                          0x7fffffff);
+                      unsigned int *sad_array) {
+  sad_array[0] = vp9_sad64x64(src_ptr, src_stride,
+                              ref_ptr, ref_stride,
+                              0x7fffffff);
+  sad_array[1] = vp9_sad64x64(src_ptr, src_stride,
+                              ref_ptr + 1, ref_stride,
+                              0x7fffffff);
+  sad_array[2] = vp9_sad64x64(src_ptr, src_stride,
+                              ref_ptr + 2, ref_stride,
+                              0x7fffffff);
+  sad_array[3] = vp9_sad64x64(src_ptr, src_stride,
+                              ref_ptr + 3, ref_stride,
+                              0x7fffffff);
+  sad_array[4] = vp9_sad64x64(src_ptr, src_stride,
+                              ref_ptr + 4, ref_stride,
+                              0x7fffffff);
+  sad_array[5] = vp9_sad64x64(src_ptr, src_stride,
+                              ref_ptr + 5, ref_stride,
+                              0x7fffffff);
+  sad_array[6] = vp9_sad64x64(src_ptr, src_stride,
+                              ref_ptr + 6, ref_stride,
+                              0x7fffffff);
+  sad_array[7] = vp9_sad64x64(src_ptr, src_stride,
+                              ref_ptr + 7, ref_stride,
+                              0x7fffffff);
 }
 
 void vp9_sad32x32x8_c(const uint8_t *src_ptr,
                       int  src_stride,
                       const uint8_t *ref_ptr,
                       int  ref_stride,
-                      uint16_t *sad_array) {
-  sad_array[0] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride,
-                                          ref_ptr, ref_stride,
-                                          0x7fffffff);
-  sad_array[1] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride,
-                                          ref_ptr + 1, ref_stride,
-                                          0x7fffffff);
-  sad_array[2] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride,
-                                          ref_ptr + 2, ref_stride,
-                                          0x7fffffff);
-  sad_array[3] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride,
-                                          ref_ptr + 3, ref_stride,
-                                          0x7fffffff);
-  sad_array[4] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride,
-                                          ref_ptr + 4, ref_stride,
-                                          0x7fffffff);
-  sad_array[5] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride,
-                                          ref_ptr + 5, ref_stride,
-                                          0x7fffffff);
-  sad_array[6] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride,
-                                          ref_ptr + 6, ref_stride,
-                                          0x7fffffff);
-  sad_array[7] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride,
-                                          ref_ptr + 7, ref_stride,
-                                          0x7fffffff);
+                      unsigned int *sad_array) {
+  sad_array[0] = vp9_sad32x32(src_ptr, src_stride,
+                              ref_ptr, ref_stride,
+                              0x7fffffff);
+  sad_array[1] = vp9_sad32x32(src_ptr, src_stride,
+                              ref_ptr + 1, ref_stride,
+                              0x7fffffff);
+  sad_array[2] = vp9_sad32x32(src_ptr, src_stride,
+                              ref_ptr + 2, ref_stride,
+                              0x7fffffff);
+  sad_array[3] = vp9_sad32x32(src_ptr, src_stride,
+                              ref_ptr + 3, ref_stride,
+                              0x7fffffff);
+  sad_array[4] = vp9_sad32x32(src_ptr, src_stride,
+                              ref_ptr + 4, ref_stride,
+                              0x7fffffff);
+  sad_array[5] = vp9_sad32x32(src_ptr, src_stride,
+                              ref_ptr + 5, ref_stride,
+                              0x7fffffff);
+  sad_array[6] = vp9_sad32x32(src_ptr, src_stride,
+                              ref_ptr + 6, ref_stride,
+                              0x7fffffff);
+  sad_array[7] = vp9_sad32x32(src_ptr, src_stride,
+                              ref_ptr + 7, ref_stride,
+                              0x7fffffff);
 }
 
 void vp9_sad16x16x3_c(const uint8_t *src_ptr,
@@ -165,43 +166,43 @@ void vp9_sad16x16x3_c(const uint8_t *src_ptr,
                       const uint8_t *ref_ptr,
                       int  ref_stride,
                       unsigned int *sad_array) {
-  sad_array[0] = vp9_sad16x16_c(src_ptr, src_stride,
-                                ref_ptr, ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad16x16_c(src_ptr, src_stride,
-                                ref_ptr + 1, ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad16x16_c(src_ptr, src_stride,
-                                ref_ptr + 2, ref_stride, 0x7fffffff);
+  sad_array[0] = vp9_sad16x16(src_ptr, src_stride,
+                              ref_ptr, ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad16x16(src_ptr, src_stride,
+                              ref_ptr + 1, ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad16x16(src_ptr, src_stride,
+                              ref_ptr + 2, ref_stride, 0x7fffffff);
 }
 
 void vp9_sad16x16x8_c(const uint8_t *src_ptr,
                       int  src_stride,
                       const uint8_t *ref_ptr,
                       int  ref_stride,
-                      uint16_t *sad_array) {
-  sad_array[0] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride,
-                                          ref_ptr, ref_stride,
-                                          0x7fffffff);
-  sad_array[1] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride,
-                                          ref_ptr + 1, ref_stride,
-                                          0x7fffffff);
-  sad_array[2] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride,
-                                          ref_ptr + 2, ref_stride,
-                                          0x7fffffff);
-  sad_array[3] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride,
-                                          ref_ptr + 3, ref_stride,
-                                          0x7fffffff);
-  sad_array[4] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride,
-                                          ref_ptr + 4, ref_stride,
-                                          0x7fffffff);
-  sad_array[5] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride,
-                                          ref_ptr + 5, ref_stride,
-                                          0x7fffffff);
-  sad_array[6] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride,
-                                          ref_ptr + 6, ref_stride,
-                                          0x7fffffff);
-  sad_array[7] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride,
-                                          ref_ptr + 7, ref_stride,
-                                          0x7fffffff);
+                      uint32_t *sad_array) {
+  sad_array[0] = vp9_sad16x16(src_ptr, src_stride,
+                              ref_ptr, ref_stride,
+                              0x7fffffff);
+  sad_array[1] = vp9_sad16x16(src_ptr, src_stride,
+                              ref_ptr + 1, ref_stride,
+                              0x7fffffff);
+  sad_array[2] = vp9_sad16x16(src_ptr, src_stride,
+                              ref_ptr + 2, ref_stride,
+                              0x7fffffff);
+  sad_array[3] = vp9_sad16x16(src_ptr, src_stride,
+                              ref_ptr + 3, ref_stride,
+                              0x7fffffff);
+  sad_array[4] = vp9_sad16x16(src_ptr, src_stride,
+                              ref_ptr + 4, ref_stride,
+                              0x7fffffff);
+  sad_array[5] = vp9_sad16x16(src_ptr, src_stride,
+                              ref_ptr + 5, ref_stride,
+                              0x7fffffff);
+  sad_array[6] = vp9_sad16x16(src_ptr, src_stride,
+                              ref_ptr + 6, ref_stride,
+                              0x7fffffff);
+  sad_array[7] = vp9_sad16x16(src_ptr, src_stride,
+                              ref_ptr + 7, ref_stride,
+                              0x7fffffff);
 }
 
 void vp9_sad16x8x3_c(const uint8_t *src_ptr,
@@ -209,43 +210,43 @@ void vp9_sad16x8x3_c(const uint8_t *src_ptr,
                      const uint8_t *ref_ptr,
                      int  ref_stride,
                      unsigned int *sad_array) {
-  sad_array[0] = vp9_sad16x8_c(src_ptr, src_stride,
-                               ref_ptr, ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad16x8_c(src_ptr, src_stride,
-                               ref_ptr + 1, ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad16x8_c(src_ptr, src_stride,
-                               ref_ptr + 2, ref_stride, 0x7fffffff);
+  sad_array[0] = vp9_sad16x8(src_ptr, src_stride,
+                             ref_ptr, ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad16x8(src_ptr, src_stride,
+                             ref_ptr + 1, ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad16x8(src_ptr, src_stride,
+                             ref_ptr + 2, ref_stride, 0x7fffffff);
 }
 
 void vp9_sad16x8x8_c(const uint8_t *src_ptr,
                      int  src_stride,
                      const uint8_t *ref_ptr,
                      int  ref_stride,
-                     uint16_t *sad_array) {
-  sad_array[0] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride,
-                                         ref_ptr, ref_stride,
-                                         0x7fffffff);
-  sad_array[1] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride,
-                                         ref_ptr + 1, ref_stride,
-                                         0x7fffffff);
-  sad_array[2] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride,
-                                         ref_ptr + 2, ref_stride,
-                                         0x7fffffff);
-  sad_array[3] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride,
-                                         ref_ptr + 3, ref_stride,
-                                         0x7fffffff);
-  sad_array[4] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride,
-                                         ref_ptr + 4, ref_stride,
-                                         0x7fffffff);
-  sad_array[5] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride,
-                                         ref_ptr + 5, ref_stride,
-                                         0x7fffffff);
-  sad_array[6] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride,
-                                         ref_ptr + 6, ref_stride,
-                                         0x7fffffff);
-  sad_array[7] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride,
-                                         ref_ptr + 7, ref_stride,
-                                         0x7fffffff);
+                     uint32_t *sad_array) {
+  sad_array[0] = vp9_sad16x8(src_ptr, src_stride,
+                             ref_ptr, ref_stride,
+                             0x7fffffff);
+  sad_array[1] = vp9_sad16x8(src_ptr, src_stride,
+                             ref_ptr + 1, ref_stride,
+                             0x7fffffff);
+  sad_array[2] = vp9_sad16x8(src_ptr, src_stride,
+                             ref_ptr + 2, ref_stride,
+                             0x7fffffff);
+  sad_array[3] = vp9_sad16x8(src_ptr, src_stride,
+                             ref_ptr + 3, ref_stride,
+                             0x7fffffff);
+  sad_array[4] = vp9_sad16x8(src_ptr, src_stride,
+                             ref_ptr + 4, ref_stride,
+                             0x7fffffff);
+  sad_array[5] = vp9_sad16x8(src_ptr, src_stride,
+                             ref_ptr + 5, ref_stride,
+                             0x7fffffff);
+  sad_array[6] = vp9_sad16x8(src_ptr, src_stride,
+                             ref_ptr + 6, ref_stride,
+                             0x7fffffff);
+  sad_array[7] = vp9_sad16x8(src_ptr, src_stride,
+                             ref_ptr + 7, ref_stride,
+                             0x7fffffff);
 }
 
 void vp9_sad8x8x3_c(const uint8_t *src_ptr,
@@ -253,43 +254,43 @@ void vp9_sad8x8x3_c(const uint8_t *src_ptr,
                     const uint8_t *ref_ptr,
                     int  ref_stride,
                     unsigned int *sad_array) {
-  sad_array[0] = vp9_sad8x8_c(src_ptr, src_stride,
-                              ref_ptr, ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad8x8_c(src_ptr, src_stride,
-                              ref_ptr + 1, ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad8x8_c(src_ptr, src_stride,
-                              ref_ptr + 2, ref_stride, 0x7fffffff);
+  sad_array[0] = vp9_sad8x8(src_ptr, src_stride,
+                            ref_ptr, ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad8x8(src_ptr, src_stride,
+                            ref_ptr + 1, ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad8x8(src_ptr, src_stride,
+                            ref_ptr + 2, ref_stride, 0x7fffffff);
 }
 
 void vp9_sad8x8x8_c(const uint8_t *src_ptr,
                     int  src_stride,
                     const uint8_t *ref_ptr,
                     int  ref_stride,
-                    uint16_t *sad_array) {
-  sad_array[0] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride,
-                                        ref_ptr, ref_stride,
-                                        0x7fffffff);
-  sad_array[1] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride,
-                                        ref_ptr + 1, ref_stride,
-                                        0x7fffffff);
-  sad_array[2] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride,
-                                        ref_ptr + 2, ref_stride,
-                                        0x7fffffff);
-  sad_array[3] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride,
-                                        ref_ptr + 3, ref_stride,
-                                        0x7fffffff);
-  sad_array[4] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride,
-                                        ref_ptr + 4, ref_stride,
-                                        0x7fffffff);
-  sad_array[5] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride,
-                                        ref_ptr + 5, ref_stride,
-                                        0x7fffffff);
-  sad_array[6] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride,
-                                        ref_ptr + 6, ref_stride,
-                                        0x7fffffff);
-  sad_array[7] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride,
-                                        ref_ptr + 7, ref_stride,
-                                        0x7fffffff);
+                    uint32_t *sad_array) {
+  sad_array[0] = vp9_sad8x8(src_ptr, src_stride,
+                            ref_ptr, ref_stride,
+                            0x7fffffff);
+  sad_array[1] = vp9_sad8x8(src_ptr, src_stride,
+                            ref_ptr + 1, ref_stride,
+                            0x7fffffff);
+  sad_array[2] = vp9_sad8x8(src_ptr, src_stride,
+                            ref_ptr + 2, ref_stride,
+                            0x7fffffff);
+  sad_array[3] = vp9_sad8x8(src_ptr, src_stride,
+                            ref_ptr + 3, ref_stride,
+                            0x7fffffff);
+  sad_array[4] = vp9_sad8x8(src_ptr, src_stride,
+                            ref_ptr + 4, ref_stride,
+                            0x7fffffff);
+  sad_array[5] = vp9_sad8x8(src_ptr, src_stride,
+                            ref_ptr + 5, ref_stride,
+                            0x7fffffff);
+  sad_array[6] = vp9_sad8x8(src_ptr, src_stride,
+                            ref_ptr + 6, ref_stride,
+                            0x7fffffff);
+  sad_array[7] = vp9_sad8x8(src_ptr, src_stride,
+                            ref_ptr + 7, ref_stride,
+                            0x7fffffff);
 }
 
 void vp9_sad8x16x3_c(const uint8_t *src_ptr,
@@ -297,43 +298,43 @@ void vp9_sad8x16x3_c(const uint8_t *src_ptr,
                      const uint8_t *ref_ptr,
                      int  ref_stride,
                      unsigned int *sad_array) {
-  sad_array[0] = vp9_sad8x16_c(src_ptr, src_stride,
-                               ref_ptr, ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad8x16_c(src_ptr, src_stride,
-                               ref_ptr + 1, ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad8x16_c(src_ptr, src_stride,
-                               ref_ptr + 2, ref_stride, 0x7fffffff);
+  sad_array[0] = vp9_sad8x16(src_ptr, src_stride,
+                             ref_ptr, ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad8x16(src_ptr, src_stride,
+                             ref_ptr + 1, ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad8x16(src_ptr, src_stride,
+                             ref_ptr + 2, ref_stride, 0x7fffffff);
 }
 
 void vp9_sad8x16x8_c(const uint8_t *src_ptr,
                      int  src_stride,
                      const uint8_t *ref_ptr,
                      int  ref_stride,
-                     uint16_t *sad_array) {
-  sad_array[0] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride,
-                                         ref_ptr, ref_stride,
-                                         0x7fffffff);
-  sad_array[1] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride,
-                                         ref_ptr + 1, ref_stride,
-                                         0x7fffffff);
-  sad_array[2] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride,
-                                         ref_ptr + 2, ref_stride,
-                                         0x7fffffff);
-  sad_array[3] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride,
-                                         ref_ptr + 3, ref_stride,
-                                         0x7fffffff);
-  sad_array[4] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride,
-                                         ref_ptr + 4, ref_stride,
-                                         0x7fffffff);
-  sad_array[5] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride,
-                                         ref_ptr + 5, ref_stride,
-                                         0x7fffffff);
-  sad_array[6] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride,
-                                         ref_ptr + 6, ref_stride,
-                                         0x7fffffff);
-  sad_array[7] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride,
-                                         ref_ptr + 7, ref_stride,
-                                         0x7fffffff);
+                     uint32_t *sad_array) {
+  sad_array[0] = vp9_sad8x16(src_ptr, src_stride,
+                             ref_ptr, ref_stride,
+                             0x7fffffff);
+  sad_array[1] = vp9_sad8x16(src_ptr, src_stride,
+                             ref_ptr + 1, ref_stride,
+                             0x7fffffff);
+  sad_array[2] = vp9_sad8x16(src_ptr, src_stride,
+                             ref_ptr + 2, ref_stride,
+                             0x7fffffff);
+  sad_array[3] = vp9_sad8x16(src_ptr, src_stride,
+                             ref_ptr + 3, ref_stride,
+                             0x7fffffff);
+  sad_array[4] = vp9_sad8x16(src_ptr, src_stride,
+                             ref_ptr + 4, ref_stride,
+                             0x7fffffff);
+  sad_array[5] = vp9_sad8x16(src_ptr, src_stride,
+                             ref_ptr + 5, ref_stride,
+                             0x7fffffff);
+  sad_array[6] = vp9_sad8x16(src_ptr, src_stride,
+                             ref_ptr + 6, ref_stride,
+                             0x7fffffff);
+  sad_array[7] = vp9_sad8x16(src_ptr, src_stride,
+                             ref_ptr + 7, ref_stride,
+                             0x7fffffff);
 }
 
 void vp9_sad4x4x3_c(const uint8_t *src_ptr,
@@ -341,204 +342,147 @@ void vp9_sad4x4x3_c(const uint8_t *src_ptr,
                     const uint8_t *ref_ptr,
                     int  ref_stride,
                     unsigned int *sad_array) {
-  sad_array[0] = vp9_sad4x4_c(src_ptr, src_stride,
-                              ref_ptr, ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad4x4_c(src_ptr, src_stride,
-                              ref_ptr + 1, ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad4x4_c(src_ptr, src_stride,
-                              ref_ptr + 2, ref_stride, 0x7fffffff);
+  sad_array[0] = vp9_sad4x4(src_ptr, src_stride,
+                            ref_ptr, ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad4x4(src_ptr, src_stride,
+                            ref_ptr + 1, ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad4x4(src_ptr, src_stride,
+                            ref_ptr + 2, ref_stride, 0x7fffffff);
 }
 
 void vp9_sad4x4x8_c(const uint8_t *src_ptr,
                     int  src_stride,
                     const uint8_t *ref_ptr,
                     int  ref_stride,
-                    uint16_t *sad_array) {
-  sad_array[0] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride,
-                                        ref_ptr, ref_stride,
-                                        0x7fffffff);
-  sad_array[1] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride,
-                                        ref_ptr + 1, ref_stride,
-                                        0x7fffffff);
-  sad_array[2] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride,
-                                        ref_ptr + 2, ref_stride,
-                                        0x7fffffff);
-  sad_array[3] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride,
-                                        ref_ptr + 3, ref_stride,
-                                        0x7fffffff);
-  sad_array[4] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride,
-                                        ref_ptr + 4, ref_stride,
-                                        0x7fffffff);
-  sad_array[5] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride,
-                                        ref_ptr + 5, ref_stride,
-                                        0x7fffffff);
-  sad_array[6] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride,
-                                        ref_ptr + 6, ref_stride,
-                                        0x7fffffff);
-  sad_array[7] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride,
-                                        ref_ptr + 7, ref_stride,
-                                        0x7fffffff);
+                    uint32_t *sad_array) {
+  sad_array[0] = vp9_sad4x4(src_ptr, src_stride,
+                            ref_ptr, ref_stride,
+                            0x7fffffff);
+  sad_array[1] = vp9_sad4x4(src_ptr, src_stride,
+                            ref_ptr + 1, ref_stride,
+                            0x7fffffff);
+  sad_array[2] = vp9_sad4x4(src_ptr, src_stride,
+                            ref_ptr + 2, ref_stride,
+                            0x7fffffff);
+  sad_array[3] = vp9_sad4x4(src_ptr, src_stride,
+                            ref_ptr + 3, ref_stride,
+                            0x7fffffff);
+  sad_array[4] = vp9_sad4x4(src_ptr, src_stride,
+                            ref_ptr + 4, ref_stride,
+                            0x7fffffff);
+  sad_array[5] = vp9_sad4x4(src_ptr, src_stride,
+                            ref_ptr + 5, ref_stride,
+                            0x7fffffff);
+  sad_array[6] = vp9_sad4x4(src_ptr, src_stride,
+                            ref_ptr + 6, ref_stride,
+                            0x7fffffff);
+  sad_array[7] = vp9_sad4x4(src_ptr, src_stride,
+                            ref_ptr + 7, ref_stride,
+                            0x7fffffff);
 }
 
 void vp9_sad64x64x4d_c(const uint8_t *src_ptr,
                        int  src_stride,
-                       uint8_t *ref_ptr[],
+                       const uint8_t *ref_ptr[],
                        int  ref_stride,
                        unsigned int *sad_array) {
-  sad_array[0] = vp9_sad64x64_c(src_ptr, src_stride,
-                                ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad64x64_c(src_ptr, src_stride,
-                                ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad64x64_c(src_ptr, src_stride,
-                                ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad64x64_c(src_ptr, src_stride,
-                                ref_ptr[3], ref_stride, 0x7fffffff);
+  sad_array[0] = vp9_sad64x64(src_ptr, src_stride,
+                              ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad64x64(src_ptr, src_stride,
+                              ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad64x64(src_ptr, src_stride,
+                              ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad64x64(src_ptr, src_stride,
+                              ref_ptr[3], ref_stride, 0x7fffffff);
 }
 
 void vp9_sad32x32x4d_c(const uint8_t *src_ptr,
                        int  src_stride,
-                       uint8_t *ref_ptr[],
+                       const uint8_t *ref_ptr[],
                        int  ref_stride,
                        unsigned int *sad_array) {
-  sad_array[0] = vp9_sad32x32_c(src_ptr, src_stride,
-                                ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad32x32_c(src_ptr, src_stride,
-                                ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad32x32_c(src_ptr, src_stride,
-                                ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad32x32_c(src_ptr, src_stride,
-                                ref_ptr[3], ref_stride, 0x7fffffff);
+  sad_array[0] = vp9_sad32x32(src_ptr, src_stride,
+                              ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad32x32(src_ptr, src_stride,
+                              ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad32x32(src_ptr, src_stride,
+                              ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad32x32(src_ptr, src_stride,
+                              ref_ptr[3], ref_stride, 0x7fffffff);
 }
 
 void vp9_sad16x16x4d_c(const uint8_t *src_ptr,
                        int  src_stride,
-                       uint8_t *ref_ptr[],
+                       const uint8_t *ref_ptr[],
                        int  ref_stride,
                        unsigned int *sad_array) {
-  sad_array[0] = vp9_sad16x16_c(src_ptr, src_stride,
-                                ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad16x16_c(src_ptr, src_stride,
-                                ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad16x16_c(src_ptr, src_stride,
-                                ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad16x16_c(src_ptr, src_stride,
-                                ref_ptr[3], ref_stride, 0x7fffffff);
+  sad_array[0] = vp9_sad16x16(src_ptr, src_stride,
+                              ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad16x16(src_ptr, src_stride,
+                              ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad16x16(src_ptr, src_stride,
+                              ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad16x16(src_ptr, src_stride,
+                              ref_ptr[3], ref_stride, 0x7fffffff);
 }
 
 void vp9_sad16x8x4d_c(const uint8_t *src_ptr,
                       int  src_stride,
-                      uint8_t *ref_ptr[],
+                      const uint8_t *ref_ptr[],
                       int  ref_stride,
                       unsigned int *sad_array) {
-  sad_array[0] = vp9_sad16x8_c(src_ptr, src_stride,
-                               ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad16x8_c(src_ptr, src_stride,
-                               ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad16x8_c(src_ptr, src_stride,
-                               ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad16x8_c(src_ptr, src_stride,
-                               ref_ptr[3], ref_stride, 0x7fffffff);
+  sad_array[0] = vp9_sad16x8(src_ptr, src_stride,
+                             ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad16x8(src_ptr, src_stride,
+                             ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad16x8(src_ptr, src_stride,
+                             ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad16x8(src_ptr, src_stride,
+                             ref_ptr[3], ref_stride, 0x7fffffff);
 }
 
 void vp9_sad8x8x4d_c(const uint8_t *src_ptr,
                      int  src_stride,
-                     uint8_t *ref_ptr[],
+                     const uint8_t *ref_ptr[],
                      int  ref_stride,
                      unsigned int *sad_array) {
-  sad_array[0] = vp9_sad8x8_c(src_ptr, src_stride,
-                              ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad8x8_c(src_ptr, src_stride,
-                              ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad8x8_c(src_ptr, src_stride,
-                              ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad8x8_c(src_ptr, src_stride,
-                              ref_ptr[3], ref_stride, 0x7fffffff);
+  sad_array[0] = vp9_sad8x8(src_ptr, src_stride,
+                            ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad8x8(src_ptr, src_stride,
+                            ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad8x8(src_ptr, src_stride,
+                            ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad8x8(src_ptr, src_stride,
+                            ref_ptr[3], ref_stride, 0x7fffffff);
 }
 
 void vp9_sad8x16x4d_c(const uint8_t *src_ptr,
                       int  src_stride,
-                      uint8_t *ref_ptr[],
+                      const uint8_t *ref_ptr[],
                       int  ref_stride,
                       unsigned int *sad_array) {
-  sad_array[0] = vp9_sad8x16_c(src_ptr, src_stride,
-                               ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad8x16_c(src_ptr, src_stride,
-                               ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad8x16_c(src_ptr, src_stride,
-                               ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad8x16_c(src_ptr, src_stride,
-                               ref_ptr[3], ref_stride, 0x7fffffff);
+  sad_array[0] = vp9_sad8x16(src_ptr, src_stride,
+                             ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad8x16(src_ptr, src_stride,
+                             ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad8x16(src_ptr, src_stride,
+                             ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad8x16(src_ptr, src_stride,
+                             ref_ptr[3], ref_stride, 0x7fffffff);
 }
 
 void vp9_sad4x4x4d_c(const uint8_t *src_ptr,
                      int  src_stride,
-                     uint8_t *ref_ptr[],
+                     const uint8_t *ref_ptr[],
                      int  ref_stride,
                      unsigned int *sad_array) {
-  sad_array[0] = vp9_sad4x4_c(src_ptr, src_stride,
-                              ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad4x4_c(src_ptr, src_stride,
-                              ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad4x4_c(src_ptr, src_stride,
-                              ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad4x4_c(src_ptr, src_stride,
-                              ref_ptr[3], ref_stride, 0x7fffffff);
+  sad_array[0] = vp9_sad4x4(src_ptr, src_stride,
+                            ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad4x4(src_ptr, src_stride,
+                            ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad4x4(src_ptr, src_stride,
+                            ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad4x4(src_ptr, src_stride,
+                            ref_ptr[3], ref_stride, 0x7fffffff);
 }
 
-/* Copy 2 macroblocks to a buffer */
-void vp9_copy32xn_c(uint8_t *src_ptr,
-                    int  src_stride,
-                    uint8_t *dst_ptr,
-                    int  dst_stride,
-                    int height) {
-  int r;
-
-  for (r = 0; r < height; r++) {
-#if !(CONFIG_FAST_UNALIGNED)
-    dst_ptr[0] = src_ptr[0];
-    dst_ptr[1] = src_ptr[1];
-    dst_ptr[2] = src_ptr[2];
-    dst_ptr[3] = src_ptr[3];
-    dst_ptr[4] = src_ptr[4];
-    dst_ptr[5] = src_ptr[5];
-    dst_ptr[6] = src_ptr[6];
-    dst_ptr[7] = src_ptr[7];
-    dst_ptr[8] = src_ptr[8];
-    dst_ptr[9] = src_ptr[9];
-    dst_ptr[10] = src_ptr[10];
-    dst_ptr[11] = src_ptr[11];
-    dst_ptr[12] = src_ptr[12];
-    dst_ptr[13] = src_ptr[13];
-    dst_ptr[14] = src_ptr[14];
-    dst_ptr[15] = src_ptr[15];
-    dst_ptr[16] = src_ptr[16];
-    dst_ptr[17] = src_ptr[17];
-    dst_ptr[18] = src_ptr[18];
-    dst_ptr[19] = src_ptr[19];
-    dst_ptr[20] = src_ptr[20];
-    dst_ptr[21] = src_ptr[21];
-    dst_ptr[22] = src_ptr[22];
-    dst_ptr[23] = src_ptr[23];
-    dst_ptr[24] = src_ptr[24];
-    dst_ptr[25] = src_ptr[25];
-    dst_ptr[26] = src_ptr[26];
-    dst_ptr[27] = src_ptr[27];
-    dst_ptr[28] = src_ptr[28];
-    dst_ptr[29] = src_ptr[29];
-    dst_ptr[30] = src_ptr[30];
-    dst_ptr[31] = src_ptr[31];
-#else
-    ((uint32_t *)dst_ptr)[0] = ((uint32_t *)src_ptr)[0];
-    ((uint32_t *)dst_ptr)[1] = ((uint32_t *)src_ptr)[1];
-    ((uint32_t *)dst_ptr)[2] = ((uint32_t *)src_ptr)[2];
-    ((uint32_t *)dst_ptr)[3] = ((uint32_t *)src_ptr)[3];
-    ((uint32_t *)dst_ptr)[4] = ((uint32_t *)src_ptr)[4];
-    ((uint32_t *)dst_ptr)[5] = ((uint32_t *)src_ptr)[5];
-    ((uint32_t *)dst_ptr)[6] = ((uint32_t *)src_ptr)[6];
-    ((uint32_t *)dst_ptr)[7] = ((uint32_t *)src_ptr)[7];
-#endif
-    src_ptr += src_stride;
-    dst_ptr += dst_stride;
-
-  }
-}
diff --git a/vp9/encoder/vp9_satd_c.c b/vp9/encoder/vp9_satd_c.c
deleted file mode 100644
index 212c2243d..000000000
--- a/vp9/encoder/vp9_satd_c.c
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdlib.h>
-#include "vpx_ports/mem.h"
-#include "./vp9_rtcd.h"
-
-unsigned int vp9_satd16x16_c(const uint8_t *src_ptr,
-                             int  src_stride,
-                             const uint8_t *ref_ptr,
-                             int  ref_stride,
-                             unsigned int *psatd) {
-  int r, c, i;
-  unsigned int satd = 0;
-  DECLARE_ALIGNED(16, int16_t, diff_in[256]);
-  DECLARE_ALIGNED(16, int16_t, diff_out[16]);
-  int16_t *in;
-
-  for (r = 0; r < 16; r++) {
-    for (c = 0; c < 16; c++) {
-      diff_in[r * 16 + c] = src_ptr[c] - ref_ptr[c];
-    }
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-  }
-
-  in = diff_in;
-  for (r = 0; r < 16; r += 4) {
-    for (c = 0; c < 16; c += 4) {
-      vp9_short_walsh4x4_c(in + c, diff_out, 32);
-      for (i = 0; i < 16; i++)
-        satd += abs(diff_out[i]);
-    }
-    in += 64;
-  }
-
-  if (psatd)
-    *psatd = satd;
-
-  return satd;
-}
diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c
index 49195e80c..cfaf5f592 100644
--- a/vp9/encoder/vp9_segmentation.c
+++ b/vp9/encoder/vp9_segmentation.c
@@ -9,10 +9,11 @@
  */
 
 
-#include "limits.h"
+#include <limits.h>
 #include "vpx_mem/vpx_mem.h"
 #include "vp9/encoder/vp9_segmentation.h"
 #include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_tile_common.h"
 
 void vp9_update_gf_useage_maps(VP9_COMP *cpi, VP9_COMMON *cm, MACROBLOCK *x) {
   int mb_row, mb_col;
@@ -21,7 +22,7 @@ void vp9_update_gf_useage_maps(VP9_COMP *cpi, VP9_COMMON *cm, MACROBLOCK *x) {
 
   x->gf_active_ptr = (signed char *)cpi->gf_active_flags;
 
-  if ((cm->frame_type == KEY_FRAME) || (cm->refresh_golden_frame)) {
+  if ((cm->frame_type == KEY_FRAME) || (cpi->refresh_golden_frame)) {
     // Reset Gf useage monitors
     vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
     cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
@@ -143,11 +144,74 @@ static int cost_segmap(MACROBLOCKD *xd,
   return cost;
 }
 
+// Based on set of segment counts calculate a probability tree
+static void calc_segtree_probs_pred(MACROBLOCKD *xd,
+                                    int (*segcounts)[MAX_MB_SEGMENTS],
+                                    vp9_prob *segment_tree_probs,
+                                    vp9_prob *mod_probs) {
+  int count[4];
+
+  assert(!segcounts[0][0] && !segcounts[1][1] &&
+         !segcounts[2][2] && !segcounts[3][3]);
+
+  // Total count for each segment, summed over all predicted segment ids
+  count[0] = segcounts[3][0] + segcounts[1][0] + segcounts[2][0];
+  count[1] = segcounts[2][1] + segcounts[0][1] + segcounts[3][1];
+  count[2] = segcounts[0][2] + segcounts[3][2] + segcounts[1][2];
+  count[3] = segcounts[1][3] + segcounts[2][3] + segcounts[0][3];
+
+  // Work out probabilities of each segment
+  segment_tree_probs[0] = get_binary_prob(count[0] + count[1],
+                                          count[2] + count[3]);
+  segment_tree_probs[1] = get_binary_prob(count[0], count[1]);
+  segment_tree_probs[2] = get_binary_prob(count[2], count[3]);
+
+  // Now work out modified counts that the decoder would have
+  count[0] =        segment_tree_probs[0]  *        segment_tree_probs[1];
+  count[1] =        segment_tree_probs[0]  * (256 - segment_tree_probs[1]);
+  count[2] = (256 - segment_tree_probs[0]) *        segment_tree_probs[2];
+  count[3] = (256 - segment_tree_probs[0]) * (256 - segment_tree_probs[2]);
+
+  // Work out modified probabilities depending on what segment was predicted
+  mod_probs[0] = get_binary_prob(count[1], count[2] + count[3]);
+  mod_probs[1] = get_binary_prob(count[0], count[2] + count[3]);
+  mod_probs[2] = get_binary_prob(count[0] + count[1], count[3]);
+  mod_probs[3] = get_binary_prob(count[0] + count[1], count[2]);
+}
+
+// Based on set of segment counts and probabilities calculate a cost estimate
+static int cost_segmap_pred(MACROBLOCKD *xd,
+                            int (*segcounts)[MAX_MB_SEGMENTS],
+                            vp9_prob *probs, vp9_prob *mod_probs) {
+  int pred_seg, cost = 0;
+
+  for (pred_seg = 0; pred_seg < MAX_MB_SEGMENTS; pred_seg++) {
+    int count1, count2;
+
+    // Cost the top node of the tree
+    count1 = segcounts[pred_seg][0] + segcounts[pred_seg][1];
+    count2 = segcounts[pred_seg][2] + segcounts[pred_seg][3];
+    cost += count1 * vp9_cost_zero(mod_probs[pred_seg]) +
+            count2 * vp9_cost_one(mod_probs[pred_seg]);
+
+    // Now add the cost of each individual segment branch
+    if (pred_seg >= 2 && count1) {
+      cost += segcounts[pred_seg][0] * vp9_cost_zero(probs[1]) +
+              segcounts[pred_seg][1] * vp9_cost_one(probs[1]);
+    } else if (pred_seg < 2 && count2 > 0) {
+      cost += segcounts[pred_seg][2] * vp9_cost_zero(probs[2]) +
+              segcounts[pred_seg][3] * vp9_cost_one(probs[2]);
+    }
+  }
+
+  return cost;
+}
+
 static void count_segs(VP9_COMP *cpi,
                        MODE_INFO *mi,
                        int *no_pred_segcounts,
                        int (*temporal_predictor_count)[2],
-                       int *t_unpred_seg_counts,
+                       int (*t_unpred_seg_counts)[MAX_MB_SEGMENTS],
                        int mb_size, int mb_row, int mb_col) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
@@ -166,8 +230,8 @@ static void count_segs(VP9_COMP *cpi,
   // Temporal prediction not allowed on key frames
   if (cm->frame_type != KEY_FRAME) {
     // Test to see if the segment id matches the predicted value.
-    const int seg_predicted =
-        (segment_id == vp9_get_pred_mb_segid(cm, xd, segmap_index));
+    const int pred_seg_id = vp9_get_pred_mb_segid(cm, xd, segmap_index);
+    const int seg_predicted = (segment_id == pred_seg_id);
 
     // Get the segment id prediction context
     const int pred_context = vp9_get_pred_context(cm, xd, PRED_SEG_ID);
@@ -179,7 +243,7 @@ static void count_segs(VP9_COMP *cpi,
 
     if (!seg_predicted)
       // Update the "unpredicted" segment count
-      t_unpred_seg_counts[segment_id]++;
+      t_unpred_seg_counts[pred_seg_id][segment_id]++;
   }
 }
 
@@ -191,18 +255,19 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
   int t_pred_cost = INT_MAX;
 
   int i;
-  int mb_row, mb_col;
+  int tile_col, mb_row, mb_col;
 
   int temporal_predictor_count[PREDICTION_PROBS][2];
   int no_pred_segcounts[MAX_MB_SEGMENTS];
-  int t_unpred_seg_counts[MAX_MB_SEGMENTS];
+  int t_unpred_seg_counts[MAX_MB_SEGMENTS][MAX_MB_SEGMENTS];
 
   vp9_prob no_pred_tree[MB_FEATURE_TREE_PROBS];
   vp9_prob t_pred_tree[MB_FEATURE_TREE_PROBS];
+  vp9_prob t_pred_tree_mod[MAX_MB_SEGMENTS];
   vp9_prob t_nopred_prob[PREDICTION_PROBS];
 
   const int mis = cm->mode_info_stride;
-  MODE_INFO *mi_ptr = cm->mi, *mi;
+  MODE_INFO *mi_ptr, *mi;
 
   // Set default state for the segment tree probabilities and the
   // temporal coding probabilities
@@ -218,42 +283,49 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
   // First of all generate stats regarding how well the last segment map
   // predicts this one
 
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 4, mi_ptr += 4 * mis) {
-    mi = mi_ptr;
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 4, mi += 4) {
-      if (mi->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
-        count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count,
-                   t_unpred_seg_counts, 4, mb_row, mb_col);
-      } else {
-        for (i = 0; i < 4; i++) {
-          int x_idx = (i & 1) << 1, y_idx = i & 2;
-          MODE_INFO *sb_mi = mi + y_idx * mis + x_idx;
-
-          if (mb_col + x_idx >= cm->mb_cols ||
-              mb_row + y_idx >= cm->mb_rows) {
-            continue;
-          }
-
-          if (sb_mi->mbmi.sb_type) {
-            assert(sb_mi->mbmi.sb_type == BLOCK_SIZE_SB32X32);
-            count_segs(cpi, sb_mi, no_pred_segcounts, temporal_predictor_count,
-                       t_unpred_seg_counts, 2, mb_row + y_idx, mb_col + x_idx);
-          } else {
-            int j;
-
-            for (j = 0; j < 4; j++) {
-              const int x_idx_mb = x_idx + (j & 1), y_idx_mb = y_idx + (j >> 1);
-              MODE_INFO *mb_mi = mi + x_idx_mb + y_idx_mb * mis;
+  for (tile_col = 0; tile_col < cm->tile_columns; tile_col++) {
+    vp9_get_tile_col_offsets(cm, tile_col);
+    mi_ptr = cm->mi + cm->cur_tile_mb_col_start;
+    for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 4, mi_ptr += 4 * mis) {
+      mi = mi_ptr;
+      for (mb_col = cm->cur_tile_mb_col_start;
+           mb_col < cm->cur_tile_mb_col_end; mb_col += 4, mi += 4) {
+        if (mi->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
+          count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count,
+                     t_unpred_seg_counts, 4, mb_row, mb_col);
+        } else {
+          for (i = 0; i < 4; i++) {
+            int x_idx = (i & 1) << 1, y_idx = i & 2;
+            MODE_INFO *sb_mi = mi + y_idx * mis + x_idx;
+
+            if (mb_col + x_idx >= cm->mb_cols ||
+                mb_row + y_idx >= cm->mb_rows) {
+              continue;
+            }
 
-              if (mb_col + x_idx_mb >= cm->mb_cols ||
-                  mb_row + y_idx_mb >= cm->mb_rows) {
-                continue;
+            if (sb_mi->mbmi.sb_type) {
+              assert(sb_mi->mbmi.sb_type == BLOCK_SIZE_SB32X32);
+              count_segs(cpi, sb_mi, no_pred_segcounts,
+                         temporal_predictor_count, t_unpred_seg_counts, 2,
+                         mb_row + y_idx, mb_col + x_idx);
+            } else {
+              int j;
+
+              for (j = 0; j < 4; j++) {
+                const int x_idx_mb = x_idx + (j & 1);
+                const int y_idx_mb = y_idx + (j >> 1);
+                MODE_INFO *mb_mi = mi + x_idx_mb + y_idx_mb * mis;
+
+                if (mb_col + x_idx_mb >= cm->mb_cols ||
+                    mb_row + y_idx_mb >= cm->mb_rows) {
+                  continue;
+                }
+
+                assert(mb_mi->mbmi.sb_type == BLOCK_SIZE_MB16X16);
+                count_segs(cpi, mb_mi, no_pred_segcounts,
+                           temporal_predictor_count, t_unpred_seg_counts,
+                           1, mb_row + y_idx_mb, mb_col + x_idx_mb);
               }
-
-              assert(mb_mi->mbmi.sb_type == BLOCK_SIZE_MB16X16);
-              count_segs(cpi, mb_mi, no_pred_segcounts,
-                         temporal_predictor_count, t_unpred_seg_counts,
-                         1, mb_row + y_idx_mb, mb_col + x_idx_mb);
             }
           }
         }
@@ -270,8 +342,10 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
   if (cm->frame_type != KEY_FRAME) {
     // Work out probability tree for coding those segments not
     // predicted using the temporal method and the cost.
-    calc_segtree_probs(xd, t_unpred_seg_counts, t_pred_tree);
-    t_pred_cost = cost_segmap(xd, t_unpred_seg_counts, t_pred_tree);
+    calc_segtree_probs_pred(xd, t_unpred_seg_counts, t_pred_tree,
+                            t_pred_tree_mod);
+    t_pred_cost = cost_segmap_pred(xd, t_unpred_seg_counts, t_pred_tree,
+                                   t_pred_tree_mod);
 
     // Add in the cost of the signalling for each prediction context
     for (i = 0; i < PREDICTION_PROBS; i++) {
@@ -291,6 +365,8 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
     cm->temporal_update = 1;
     vpx_memcpy(xd->mb_segment_tree_probs,
                t_pred_tree, sizeof(t_pred_tree));
+    vpx_memcpy(xd->mb_segment_mispred_tree_probs,
+               t_pred_tree_mod, sizeof(t_pred_tree_mod));
     vpx_memcpy(&cm->segment_pred_probs,
                t_nopred_prob, sizeof(t_nopred_prob));
   } else {
diff --git a/vp9/encoder/vp9_segmentation.h b/vp9/encoder/vp9_segmentation.h
index 3c75c68d8..1c90c2f2d 100644
--- a/vp9/encoder/vp9_segmentation.h
+++ b/vp9/encoder/vp9_segmentation.h
@@ -9,23 +9,20 @@
  */
 
 
-#include "string.h"
-#include "vp9/common/vp9_blockd.h"
-#include "vp9/encoder/vp9_onyx_int.h"
-
 #ifndef VP9_ENCODER_VP9_SEGMENTATION_H_
 #define VP9_ENCODER_VP9_SEGMENTATION_H_
 
-extern void vp9_update_gf_useage_maps(VP9_COMP *cpi, VP9_COMMON *cm,
-                                      MACROBLOCK *x);
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/encoder/vp9_onyx_int.h"
+
+void vp9_update_gf_useage_maps(VP9_COMP *cpi, VP9_COMMON *cm, MACROBLOCK *x);
 
-extern void vp9_enable_segmentation(VP9_PTR ptr);
-extern void vp9_disable_segmentation(VP9_PTR ptr);
+void vp9_enable_segmentation(VP9_PTR ptr);
+void vp9_disable_segmentation(VP9_PTR ptr);
 
 // Valid values for a segment are 0 to 3
 // Segmentation map is arrange as [Rows][Columns]
-extern void vp9_set_segmentation_map(VP9_PTR ptr,
-                                     unsigned char *segmentation_map);
+void vp9_set_segmentation_map(VP9_PTR ptr, unsigned char *segmentation_map);
 
 // The values given for each segment can be either deltas (from the default
 // value chosen for the frame) or absolute values.
@@ -37,10 +34,9 @@ extern void vp9_set_segmentation_map(VP9_PTR ptr,
 //
 // abs_delta = SEGMENT_DELTADATA (deltas) abs_delta = SEGMENT_ABSDATA (use
 // the absolute values given).
-//
-extern void vp9_set_segment_data(VP9_PTR ptr, signed char *feature_data,
-                                 unsigned char abs_delta);
+void vp9_set_segment_data(VP9_PTR ptr, signed char *feature_data,
+                          unsigned char abs_delta);
 
-extern void vp9_choose_segmap_coding_method(VP9_COMP *cpi);
+void vp9_choose_segmap_coding_method(VP9_COMP *cpi);
 
 #endif  // VP9_ENCODER_VP9_SEGMENTATION_H_
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 8bbe53486..a6cd1c0c3 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -8,8 +8,11 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <math.h>
+#include <limits.h>
 
 #include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_reconinter.h"
 #include "vp9/encoder/vp9_onyx_int.h"
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9/encoder/vp9_quantize.h"
@@ -26,9 +29,6 @@
 #include "vp9/common/vp9_swapyv12buffer.h"
 #include "vpx_ports/vpx_timer.h"
 
-#include <math.h>
-#include <limits.h>
-
 #define ALT_REF_MC_ENABLED 1    // dis/enable MC in AltRef filtering
 #define ALT_REF_SUBPEL_ENABLED 1 // dis/enable subpel in MC AltRef filtering
 
@@ -43,39 +43,35 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
                                             int mv_row,
                                             int mv_col,
                                             uint8_t *pred) {
-  int offset;
-  uint8_t *yptr, *uptr, *vptr;
-  int omv_row, omv_col;
-
-  // Y
-  yptr = y_mb_ptr + (mv_row >> 3) * stride + (mv_col >> 3);
+  const int which_mv = 0;
+  int_mv subpel_mv;
+  int_mv fullpel_mv;
+
+  subpel_mv.as_mv.row = mv_row;
+  subpel_mv.as_mv.col = mv_col;
+  // TODO(jkoleszar): Make this rounding consistent with the rest of the code
+  fullpel_mv.as_mv.row = (mv_row >> 1) & ~7;
+  fullpel_mv.as_mv.col = (mv_col >> 1) & ~7;
+
+  vp9_build_inter_predictor(y_mb_ptr, stride,
+                            &pred[0], 16,
+                            &subpel_mv,
+                            &xd->scale_factor[which_mv],
+                            16, 16, which_mv, &xd->subpix);
 
-  if ((mv_row | mv_col) & 7) {
-    xd->subpixel_predict16x16(yptr, stride,
-                             (mv_col & 7) << 1, (mv_row & 7) << 1, &pred[0], 16);
-  } else {
-    vp9_copy_mem16x16(yptr, stride, &pred[0], 16);
-  }
-
-  // U & V
-  omv_row = mv_row;
-  omv_col = mv_col;
-  mv_row >>= 1;
-  mv_col >>= 1;
   stride = (stride + 1) >> 1;
-  offset = (mv_row >> 3) * stride + (mv_col >> 3);
-  uptr = u_mb_ptr + offset;
-  vptr = v_mb_ptr + offset;
-
-  if ((omv_row | omv_col) & 15) {
-    xd->subpixel_predict8x8(uptr, stride,
-                           (omv_col & 15), (omv_row & 15), &pred[256], 8);
-    xd->subpixel_predict8x8(vptr, stride,
-                           (omv_col & 15), (omv_row & 15), &pred[320], 8);
-  } else {
-    vp9_copy_mem8x8(uptr, stride, &pred[256], 8);
-    vp9_copy_mem8x8(vptr, stride, &pred[320], 8);
-  }
+
+  vp9_build_inter_predictor_q4(u_mb_ptr, stride,
+                               &pred[256], 8,
+                               &fullpel_mv, &subpel_mv,
+                               &xd->scale_factor_uv[which_mv],
+                               8, 8, which_mv, &xd->subpix);
+
+  vp9_build_inter_predictor_q4(v_mb_ptr, stride,
+                               &pred[320], 8,
+                               &fullpel_mv, &subpel_mv,
+                               &xd->scale_factor_uv[which_mv],
+                               8, 8, which_mv, &xd->subpix);
 }
 
 void vp9_temporal_filter_apply_c(uint8_t *frame1,
@@ -170,7 +166,7 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
   /*cpi->sf.search_method == HEX*/
   // TODO Check that the 16x16 vf & sdf are selected here
   // Ignore mv costing by sending NULL pointer instead of cost arrays
-  bestsme = vp9_hex_search(x, b, d, &best_ref_mv1_full, &d->bmi.as_mv.first,
+  bestsme = vp9_hex_search(x, b, d, &best_ref_mv1_full, &d->bmi.as_mv[0],
                            step_param, sadpb, &cpi->fn_ptr[BLOCK_16X16],
                            NULL, NULL, NULL, NULL,
                            &best_ref_mv1);
@@ -182,7 +178,7 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
     int distortion;
     unsigned int sse;
     // Ignore mv costing by sending NULL pointer instead of cost array
-    bestsme = cpi->find_fractional_mv_step(x, b, d, &d->bmi.as_mv.first,
+    bestsme = cpi->find_fractional_mv_step(x, b, d, &d->bmi.as_mv[0],
                                            &best_ref_mv1,
                                            x->errorperbit,
                                            &cpi->fn_ptr[BLOCK_16X16],
@@ -262,8 +258,8 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
         if (cpi->frames[frame] == NULL)
           continue;
 
-        mbd->block[0].bmi.as_mv.first.as_mv.row = 0;
-        mbd->block[0].bmi.as_mv.first.as_mv.col = 0;
+        mbd->block[0].bmi.as_mv[0].as_mv.row = 0;
+        mbd->block[0].bmi.as_mv[0].as_mv.col = 0;
 
         if (frame == alt_ref_index) {
           filter_weight = 2;
@@ -296,8 +292,8 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
            cpi->frames[frame]->u_buffer + mb_uv_offset,
            cpi->frames[frame]->v_buffer + mb_uv_offset,
            cpi->frames[frame]->y_stride,
-           mbd->block[0].bmi.as_mv.first.as_mv.row,
-           mbd->block[0].bmi.as_mv.first.as_mv.col,
+           mbd->block[0].bmi.as_mv[0].as_mv.row,
+           mbd->block[0].bmi.as_mv[0].as_mv.col,
            predictor);
 
           // Apply the filter (YUV)
@@ -375,11 +371,7 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
   mbd->pre.v_buffer = v_buffer;
 }
 
-void vp9_temporal_filter_prepare
-(
-  VP9_COMP *cpi,
-  int distance
-) {
+void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
   int frame = 0;
 
   int num_frames_backward = 0;
@@ -464,6 +456,13 @@ void vp9_temporal_filter_prepare
 , start_frame);
 #endif
 
+  // Setup scaling factors. Scaling on each of the arnr frames is not supported
+  vp9_setup_scale_factors_for_frame(&cpi->mb.e_mbd.scale_factor[0],
+      &cpi->common.yv12_fb[cpi->common.new_fb_idx],
+      16 * cpi->common.mb_cols,
+      16 * cpi->common.mb_rows);
+  cpi->mb.e_mbd.scale_factor_uv[0] = cpi->mb.e_mbd.scale_factor[0];
+
   // Setup frame pointers, NULL indicates frame not included in filter
   vpx_memset(cpi->frames, 0, max_frames * sizeof(YV12_BUFFER_CONFIG *));
   for (frame = 0; frame < frames_to_blur; frame++) {
diff --git a/vp9/encoder/vp9_temporal_filter.h b/vp9/encoder/vp9_temporal_filter.h
index 27fc35f82..f3ca8c616 100644
--- a/vp9/encoder/vp9_temporal_filter.h
+++ b/vp9/encoder/vp9_temporal_filter.h
@@ -11,6 +11,6 @@
 #ifndef VP9_ENCODER_VP9_TEMPORAL_FILTER_H_
 #define VP9_ENCODER_VP9_TEMPORAL_FILTER_H_
 
-extern void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance);
+void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance);
 
 #endif  // VP9_ENCODER_VP9_TEMPORAL_FILTER_H_
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index fc99311ae..95a2e1227 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -25,20 +25,14 @@
    compressions, then generating vp9_context.c = initial stats. */
 
 #ifdef ENTROPY_STATS
-vp9_coeff_accum context_counters_4x4[BLOCK_TYPES_4X4];
-vp9_coeff_accum hybrid_context_counters_4x4[BLOCK_TYPES_4X4];
-vp9_coeff_accum context_counters_8x8[BLOCK_TYPES_8X8];
-vp9_coeff_accum hybrid_context_counters_8x8[BLOCK_TYPES_8X8];
-vp9_coeff_accum context_counters_16x16[BLOCK_TYPES_16X16];
-vp9_coeff_accum hybrid_context_counters_16x16[BLOCK_TYPES_16X16];
+vp9_coeff_accum context_counters_4x4[BLOCK_TYPES];
+vp9_coeff_accum context_counters_8x8[BLOCK_TYPES];
+vp9_coeff_accum context_counters_16x16[BLOCK_TYPES];
 vp9_coeff_accum context_counters_32x32[BLOCK_TYPES_32X32];
 
-extern vp9_coeff_stats tree_update_hist_4x4[BLOCK_TYPES_4X4];
-extern vp9_coeff_stats hybrid_tree_update_hist_4x4[BLOCK_TYPES_4X4];
-extern vp9_coeff_stats tree_update_hist_8x8[BLOCK_TYPES_8X8];
-extern vp9_coeff_stats hybrid_tree_update_hist_8x8[BLOCK_TYPES_8X8];
-extern vp9_coeff_stats tree_update_hist_16x16[BLOCK_TYPES_16X16];
-extern vp9_coeff_stats hybrid_tree_update_hist_16x16[BLOCK_TYPES_16X16];
+extern vp9_coeff_stats tree_update_hist_4x4[BLOCK_TYPES];
+extern vp9_coeff_stats tree_update_hist_8x8[BLOCK_TYPES];
+extern vp9_coeff_stats tree_update_hist_16x16[BLOCK_TYPES];
 extern vp9_coeff_stats tree_update_hist_32x32[BLOCK_TYPES_32X32];
 #endif  /* ENTROPY_STATS */
 
@@ -100,12 +94,6 @@ static void fill_value_tokens() {
   vp9_dct_value_cost_ptr   = dct_value_cost + DCT_MAX_VALUE;
 }
 
-#if CONFIG_NEWCOEFCONTEXT
-#define PT pn
-#else
-#define PT pt
-#endif
-
 static void tokenize_b(VP9_COMP *cpi,
                        MACROBLOCKD *xd,
                        const int ib,
@@ -114,22 +102,20 @@ static void tokenize_b(VP9_COMP *cpi,
                        TX_SIZE tx_size,
                        int dry_run) {
   int pt; /* near block/prev token context index */
-  int c = (type == PLANE_TYPE_Y_NO_DC) ? 1 : 0;
+  int c = 0;
+  int recent_energy = 0;
   const BLOCKD * const b = xd->block + ib;
-  const int eob = b->eob;     /* one beyond last nonzero coeff */
+  const int eob = xd->eobs[ib];     /* one beyond last nonzero coeff */
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
   int16_t *qcoeff_ptr = b->qcoeff;
   int seg_eob;
   const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  const int *bands, *scan;
+  const int *scan;
   vp9_coeff_count *counts;
   vp9_coeff_probs *probs;
   const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
                           get_tx_type(xd, b) : DCT_DCT;
-#if CONFIG_NEWCOEFCONTEXT
-  const int *neighbors;
-  int pn;
-#endif
+  const int ref = xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME;
 
   ENTROPY_CONTEXT *const a = (ENTROPY_CONTEXT *)xd->above_context +
       vp9_block2above[tx_size][ib];
@@ -147,45 +133,26 @@ static void tokenize_b(VP9_COMP *cpi,
     default:
     case TX_4X4:
       seg_eob = 16;
-      bands = vp9_coef_bands_4x4;
       scan = vp9_default_zig_zag1d_4x4;
       if (tx_type != DCT_DCT) {
-        counts = cpi->hybrid_coef_counts_4x4;
-        probs = cpi->common.fc.hybrid_coef_probs_4x4;
         if (tx_type == ADST_DCT) {
           scan = vp9_row_scan_4x4;
         } else if (tx_type == DCT_ADST) {
           scan = vp9_col_scan_4x4;
         }
-      } else {
-        counts = cpi->coef_counts_4x4;
-        probs = cpi->common.fc.coef_probs_4x4;
       }
+      counts = cpi->coef_counts_4x4;
+      probs = cpi->common.fc.coef_probs_4x4;
       break;
     case TX_8X8:
-      if (type == PLANE_TYPE_Y2) {
-        seg_eob = 4;
-        bands = vp9_coef_bands_4x4;
-        scan = vp9_default_zig_zag1d_4x4;
-      } else {
-#if CONFIG_CNVCONTEXT
-        a_ec = (a[0] + a[1]) != 0;
-        l_ec = (l[0] + l[1]) != 0;
-#endif
-        seg_eob = 64;
-        bands = vp9_coef_bands_8x8;
-        scan = vp9_default_zig_zag1d_8x8;
-      }
-      if (tx_type != DCT_DCT) {
-        counts = cpi->hybrid_coef_counts_8x8;
-        probs = cpi->common.fc.hybrid_coef_probs_8x8;
-      } else {
-        counts = cpi->coef_counts_8x8;
-        probs = cpi->common.fc.coef_probs_8x8;
-      }
+      a_ec = (a[0] + a[1]) != 0;
+      l_ec = (l[0] + l[1]) != 0;
+      seg_eob = 64;
+      scan = vp9_default_zig_zag1d_8x8;
+      counts = cpi->coef_counts_8x8;
+      probs = cpi->common.fc.coef_probs_8x8;
       break;
     case TX_16X16:
-#if CONFIG_CNVCONTEXT
       if (type != PLANE_TYPE_UV) {
         a_ec = (a[0] + a[1] + a[2] + a[3]) != 0;
         l_ec = (l[0] + l[1] + l[2] + l[3]) != 0;
@@ -193,33 +160,23 @@ static void tokenize_b(VP9_COMP *cpi,
         a_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
         l_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
       }
-#endif
       seg_eob = 256;
-      bands = vp9_coef_bands_16x16;
       scan = vp9_default_zig_zag1d_16x16;
-      if (tx_type != DCT_DCT) {
-        counts = cpi->hybrid_coef_counts_16x16;
-        probs = cpi->common.fc.hybrid_coef_probs_16x16;
-      } else {
-        counts = cpi->coef_counts_16x16;
-        probs = cpi->common.fc.coef_probs_16x16;
-      }
+      counts = cpi->coef_counts_16x16;
+      probs = cpi->common.fc.coef_probs_16x16;
       if (type == PLANE_TYPE_UV) {
         int uv_idx = (ib - 16) >> 2;
         qcoeff_ptr = xd->sb_coeff_data.qcoeff + 1024 + 256 * uv_idx;
       }
       break;
     case TX_32X32:
-#if CONFIG_CNVCONTEXT
       a_ec = a[0] + a[1] + a[2] + a[3] +
              a1[0] + a1[1] + a1[2] + a1[3];
       l_ec = l[0] + l[1] + l[2] + l[3] +
              l1[0] + l1[1] + l1[2] + l1[3];
       a_ec = a_ec != 0;
       l_ec = l_ec != 0;
-#endif
       seg_eob = 1024;
-      bands = vp9_coef_bands_32x32;
       scan = vp9_default_zig_zag1d_32x32;
       counts = cpi->coef_counts_32x32;
       probs = cpi->common.fc.coef_probs_32x32;
@@ -228,16 +185,12 @@ static void tokenize_b(VP9_COMP *cpi,
   }
 
   VP9_COMBINEENTROPYCONTEXTS(pt, a_ec, l_ec);
-#if CONFIG_NEWCOEFCONTEXT
-  neighbors = vp9_get_coef_neighbors_handle(scan);
-  pn = pt;
-#endif
 
-  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB))
-    seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))
+    seg_eob = 0;
 
   do {
-    const int band = bands[c];
+    const int band = get_coef_band(tx_size, c);
     int token;
 
     if (c < eob) {
@@ -252,30 +205,23 @@ static void tokenize_b(VP9_COMP *cpi,
     }
 
     t->Token = token;
-    t->context_tree = probs[type][band][PT];
-    t->skip_eob_node = (pt == 0) && ((band > 0 && type != PLANE_TYPE_Y_NO_DC) ||
-                                     (band > 1 && type == PLANE_TYPE_Y_NO_DC));
+    t->context_tree = probs[type][ref][band][pt];
+    t->skip_eob_node = (pt == 0) && (band > 0);
     assert(vp9_coef_encodings[t->Token].Len - t->skip_eob_node > 0);
     if (!dry_run) {
-      ++counts[type][band][PT][token];
+      ++counts[type][ref][band][pt][token];
     }
-    pt = vp9_prev_token_class[token];
-#if CONFIG_NEWCOEFCONTEXT
-    if (c < seg_eob - 1 && NEWCOEFCONTEXT_BAND_COND(bands[c + 1]))
-      pn = vp9_get_coef_neighbor_context(
-          qcoeff_ptr, (type == PLANE_TYPE_Y_NO_DC), neighbors, scan[c + 1]);
-    else
-      pn = pt;
-#endif
+
+    pt = vp9_get_coef_context(&recent_energy, token);
     ++t;
   } while (c < eob && ++c < seg_eob);
 
   *tp = t;
-  a_ec = l_ec = (c > !type); /* 0 <-> all coeff data is zero */
+  a_ec = l_ec = (c > 0); /* 0 <-> all coeff data is zero */
   a[0] = a_ec;
   l[0] = l_ec;
 
-  if (tx_size == TX_8X8 && type != PLANE_TYPE_Y2) {
+  if (tx_size == TX_8X8) {
     a[1] = a_ec;
     l[1] = l_ec;
   } else if (tx_size == TX_16X16) {
@@ -294,18 +240,13 @@ static void tokenize_b(VP9_COMP *cpi,
   }
 }
 
-int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd, int has_2nd_order) {
+int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd) {
   int skip = 1;
   int i = 0;
 
-  if (has_2nd_order) {
-    for (i = 0; i < 16; i++)
-      skip &= (xd->block[i].eob < 2);
-    skip &= (!xd->block[24].eob);
-  } else {
-    for (i = 0; i < 16; i++)
-      skip &= (!xd->block[i].eob);
-  }
+  for (i = 0; i < 16; i++)
+    skip &= (!xd->eobs[i]);
+
   return skip;
 }
 
@@ -314,47 +255,42 @@ int vp9_mbuv_is_skippable_4x4(MACROBLOCKD *xd) {
   int i;
 
   for (i = 16; i < 24; i++)
-    skip &= (!xd->block[i].eob);
+    skip &= (!xd->eobs[i]);
   return skip;
 }
 
-static int mb_is_skippable_4x4(MACROBLOCKD *xd, int has_2nd_order) {
-  return (vp9_mby_is_skippable_4x4(xd, has_2nd_order) &
+static int mb_is_skippable_4x4(MACROBLOCKD *xd) {
+  return (vp9_mby_is_skippable_4x4(xd) &
           vp9_mbuv_is_skippable_4x4(xd));
 }
 
-int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd, int has_2nd_order) {
+int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd) {
   int skip = 1;
   int i = 0;
 
-  if (has_2nd_order) {
-    for (i = 0; i < 16; i += 4)
-      skip &= (xd->block[i].eob < 2);
-    skip &= (!xd->block[24].eob);
-  } else {
-    for (i = 0; i < 16; i += 4)
-      skip &= (!xd->block[i].eob);
-  }
+  for (i = 0; i < 16; i += 4)
+    skip &= (!xd->eobs[i]);
+
   return skip;
 }
 
 int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd) {
-  return (!xd->block[16].eob) & (!xd->block[20].eob);
+  return (!xd->eobs[16]) & (!xd->eobs[20]);
 }
 
-static int mb_is_skippable_8x8(MACROBLOCKD *xd, int has_2nd_order) {
-  return (vp9_mby_is_skippable_8x8(xd, has_2nd_order) &
+static int mb_is_skippable_8x8(MACROBLOCKD *xd) {
+  return (vp9_mby_is_skippable_8x8(xd) &
           vp9_mbuv_is_skippable_8x8(xd));
 }
 
-static int mb_is_skippable_8x8_4x4uv(MACROBLOCKD *xd, int has_2nd_order) {
-  return (vp9_mby_is_skippable_8x8(xd, has_2nd_order) &
+static int mb_is_skippable_8x8_4x4uv(MACROBLOCKD *xd) {
+  return (vp9_mby_is_skippable_8x8(xd) &
           vp9_mbuv_is_skippable_4x4(xd));
 }
 
 int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd) {
   int skip = 1;
-  skip &= !xd->block[0].eob;
+  skip &= !xd->eobs[0];
   return skip;
 }
 
@@ -364,12 +300,12 @@ static int mb_is_skippable_16x16(MACROBLOCKD *xd) {
 
 int vp9_sby_is_skippable_32x32(MACROBLOCKD *xd) {
   int skip = 1;
-  skip &= !xd->block[0].eob;
+  skip &= !xd->eobs[0];
   return skip;
 }
 
 int vp9_sbuv_is_skippable_16x16(MACROBLOCKD *xd) {
-  return (!xd->block[16].eob) & (!xd->block[20].eob);
+  return (!xd->eobs[16]) & (!xd->eobs[20]);
 }
 
 static int sb_is_skippable_32x32(MACROBLOCKD *xd) {
@@ -384,14 +320,9 @@ void vp9_tokenize_sb(VP9_COMP *cpi,
   VP9_COMMON * const cm = &cpi->common;
   MB_MODE_INFO * const mbmi = &xd->mode_info_context->mbmi;
   TOKENEXTRA *t_backup = *t;
-  ENTROPY_CONTEXT *A[2] = { (ENTROPY_CONTEXT *) (xd->above_context + 0),
-                            (ENTROPY_CONTEXT *) (xd->above_context + 1), };
-  ENTROPY_CONTEXT *L[2] = { (ENTROPY_CONTEXT *) (xd->left_context + 0),
-                            (ENTROPY_CONTEXT *) (xd->left_context + 1), };
   const int mb_skip_context = vp9_get_pred_context(cm, xd, PRED_MBSKIP);
   const int segment_id = mbmi->segment_id;
-  const int skip_inc =  !vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
-                        (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0);
+  const int skip_inc = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
   int b;
 
   mbmi->mb_skip_coeff = sb_is_skippable_32x32(xd);
@@ -419,7 +350,6 @@ void vp9_tokenize_sb(VP9_COMP *cpi,
     tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,
                TX_16X16, dry_run);
   }
-  A[0][8] = L[0][8] = A[1][8] = L[1][8] = 0;
   if (dry_run)
     *t = t_backup;
 }
@@ -428,8 +358,6 @@ void vp9_tokenize_mb(VP9_COMP *cpi,
                      MACROBLOCKD *xd,
                      TOKENEXTRA **t,
                      int dry_run) {
-  PLANE_TYPE plane_type;
-  int has_2nd_order;
   int b;
   int tx_size = xd->mode_info_context->mbmi.txfm_size;
   int mb_skip_context = vp9_get_pred_context(&cpi->common, xd, PRED_MBSKIP);
@@ -441,14 +369,11 @@ void vp9_tokenize_mb(VP9_COMP *cpi,
   int skip_inc;
   int segment_id = xd->mode_info_context->mbmi.segment_id;
 
-  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
-      (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0)) {
+  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
     skip_inc = 1;
   } else
     skip_inc = 0;
 
-  has_2nd_order = get_2nd_order_usage(xd);
-
   switch (tx_size) {
     case TX_16X16:
 
@@ -458,15 +383,15 @@ void vp9_tokenize_mb(VP9_COMP *cpi,
       if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
           xd->mode_info_context->mbmi.mode == SPLITMV)
         xd->mode_info_context->mbmi.mb_skip_coeff =
-            mb_is_skippable_8x8_4x4uv(xd, 0);
+            mb_is_skippable_8x8_4x4uv(xd);
       else
         xd->mode_info_context->mbmi.mb_skip_coeff =
-            mb_is_skippable_8x8(xd, has_2nd_order);
+            mb_is_skippable_8x8(xd);
       break;
 
     default:
       xd->mode_info_context->mbmi.mb_skip_coeff =
-          mb_is_skippable_4x4(xd, has_2nd_order);
+          mb_is_skippable_4x4(xd);
       break;
   }
 
@@ -487,15 +412,6 @@ void vp9_tokenize_mb(VP9_COMP *cpi,
   if (!dry_run)
     cpi->skip_false_count[mb_skip_context] += skip_inc;
 
-  if (has_2nd_order) {
-    tokenize_b(cpi, xd, 24, t, PLANE_TYPE_Y2, tx_size, dry_run);
-    plane_type = PLANE_TYPE_Y_NO_DC;
-  } else {
-    xd->above_context->y2 = 0;
-    xd->left_context->y2 = 0;
-    plane_type = PLANE_TYPE_Y_WITH_DC;
-  }
-
   if (tx_size == TX_16X16) {
     tokenize_b(cpi, xd, 0, t, PLANE_TYPE_Y_WITH_DC, TX_16X16, dry_run);
     for (b = 16; b < 24; b += 4) {
@@ -503,7 +419,7 @@ void vp9_tokenize_mb(VP9_COMP *cpi,
     }
   } else if (tx_size == TX_8X8) {
     for (b = 0; b < 16; b += 4) {
-      tokenize_b(cpi, xd, b, t, plane_type, TX_8X8, dry_run);
+      tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_8X8, dry_run);
     }
     if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
         xd->mode_info_context->mbmi.mode == SPLITMV) {
@@ -516,11 +432,10 @@ void vp9_tokenize_mb(VP9_COMP *cpi,
       }
     }
   } else {
-    for (b = 0; b < 24; b++) {
-      if (b >= 16)
-        plane_type = PLANE_TYPE_UV;
-      tokenize_b(cpi, xd, b, t, plane_type, TX_4X4, dry_run);
-    }
+    for (b = 0; b < 16; b++)
+      tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_4X4, dry_run);
+    for (b = 16; b < 24; b++)
+      tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run);
   }
   if (dry_run)
     *t = t_backup;
@@ -531,25 +446,13 @@ void init_context_counters(void) {
   FILE *f = fopen("context.bin", "rb");
   if (!f) {
     vpx_memset(context_counters_4x4, 0, sizeof(context_counters_4x4));
-    vpx_memset(hybrid_context_counters_4x4, 0,
-               sizeof(hybrid_context_counters_4x4));
     vpx_memset(context_counters_8x8, 0, sizeof(context_counters_8x8));
-    vpx_memset(hybrid_context_counters_8x8, 0,
-               sizeof(hybrid_context_counters_8x8));
     vpx_memset(context_counters_16x16, 0, sizeof(context_counters_16x16));
-    vpx_memset(hybrid_context_counters_16x16, 0,
-               sizeof(hybrid_context_counters_16x16));
     vpx_memset(context_counters_32x32, 0, sizeof(context_counters_32x32));
   } else {
     fread(context_counters_4x4, sizeof(context_counters_4x4), 1, f);
-    fread(hybrid_context_counters_4x4,
-          sizeof(hybrid_context_counters_4x4), 1, f);
     fread(context_counters_8x8, sizeof(context_counters_8x8), 1, f);
-    fread(hybrid_context_counters_8x8,
-          sizeof(hybrid_context_counters_8x8), 1, f);
     fread(context_counters_16x16, sizeof(context_counters_16x16), 1, f);
-    fread(hybrid_context_counters_16x16,
-          sizeof(hybrid_context_counters_16x16), 1, f);
     fread(context_counters_32x32, sizeof(context_counters_32x32), 1, f);
     fclose(f);
   }
@@ -557,25 +460,13 @@ void init_context_counters(void) {
   f = fopen("treeupdate.bin", "rb");
   if (!f) {
     vpx_memset(tree_update_hist_4x4, 0, sizeof(tree_update_hist_4x4));
-    vpx_memset(hybrid_tree_update_hist_4x4, 0,
-               sizeof(hybrid_tree_update_hist_4x4));
     vpx_memset(tree_update_hist_8x8, 0, sizeof(tree_update_hist_8x8));
-    vpx_memset(hybrid_tree_update_hist_8x8, 0,
-               sizeof(hybrid_tree_update_hist_8x8));
     vpx_memset(tree_update_hist_16x16, 0, sizeof(tree_update_hist_16x16));
-    vpx_memset(hybrid_tree_update_hist_16x16, 0,
-               sizeof(hybrid_tree_update_hist_16x16));
     vpx_memset(tree_update_hist_32x32, 0, sizeof(tree_update_hist_32x32));
   } else {
     fread(tree_update_hist_4x4, sizeof(tree_update_hist_4x4), 1, f);
-    fread(hybrid_tree_update_hist_4x4,
-          sizeof(hybrid_tree_update_hist_4x4), 1, f);
     fread(tree_update_hist_8x8, sizeof(tree_update_hist_8x8), 1, f);
-    fread(hybrid_tree_update_hist_8x8,
-          sizeof(hybrid_tree_update_hist_8x8), 1, f);
     fread(tree_update_hist_16x16, sizeof(tree_update_hist_16x16), 1, f);
-    fread(hybrid_tree_update_hist_16x16,
-          sizeof(hybrid_tree_update_hist_16x16), 1, f);
     fread(tree_update_hist_32x32, sizeof(tree_update_hist_32x32), 1, f);
     fclose(f);
   }
@@ -583,33 +474,38 @@ void init_context_counters(void) {
 
 static void print_counter(FILE *f, vp9_coeff_accum *context_counters,
                           int block_types, const char *header) {
-  int type, band, pt, t;
+  int type, ref, band, pt, t;
 
   fprintf(f, "static const vp9_coeff_count %s = {\n", header);
 
 #define Comma(X) (X ? "," : "")
   type = 0;
   do {
+    ref = 0;
     fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
-    band = 0;
     do {
-      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
-      pt = 0;
+      fprintf(f, "%s\n    { /* %s */", Comma(ref), ref ? "Inter" : "Intra");
+      band = 0;
       do {
-        fprintf(f, "%s\n      {", Comma(pt));
-
-        t = 0;
+        fprintf(f, "%s\n      { /* Coeff Band %d */", Comma(band), band);
+        pt = 0;
         do {
-          const int64_t x = context_counters[type][band][pt][t];
-          const int y = (int) x;
-
-          assert(x == (int64_t) y);  /* no overflow handling yet */
-          fprintf(f, "%s %d", Comma(t), y);
-        } while (++t < MAX_ENTROPY_TOKENS);
-        fprintf(f, "}");
-      } while (++pt < PREV_COEF_CONTEXTS);
+          fprintf(f, "%s\n        {", Comma(pt));
+
+          t = 0;
+          do {
+            const int64_t x = context_counters[type][ref][band][pt][t];
+            const int y = (int) x;
+
+            assert(x == (int64_t) y);  /* no overflow handling yet */
+            fprintf(f, "%s %d", Comma(t), y);
+          } while (++t < MAX_ENTROPY_TOKENS);
+          fprintf(f, "}");
+        } while (++pt < PREV_COEF_CONTEXTS);
+        fprintf(f, "\n      }");
+      } while (++band < COEF_BANDS);
       fprintf(f, "\n    }");
-    } while (++band < COEF_BANDS);
+    } while (++ref < REF_TYPES);
     fprintf(f, "\n  }");
   } while (++type < block_types);
   fprintf(f, "\n};\n");
@@ -617,7 +513,7 @@ static void print_counter(FILE *f, vp9_coeff_accum *context_counters,
 
 static void print_probs(FILE *f, vp9_coeff_accum *context_counters,
                         int block_types, const char *header) {
-  int type, band, pt, t;
+  int type, ref, band, pt, t;
 
   fprintf(f, "static const vp9_coeff_probs %s = {", header);
 
@@ -626,32 +522,38 @@ static void print_probs(FILE *f, vp9_coeff_accum *context_counters,
   do {
     fprintf(f, "%s%s{ /* block Type %d */",
             Comma(type), Newline(type, "  "), type);
-    band = 0;
+    ref = 0;
     do {
-      fprintf(f, "%s%s{ /* Coeff Band %d */",
-              Comma(band), Newline(band, "    "), band);
-      pt = 0;
+      fprintf(f, "%s%s{ /* %s */",
+              Comma(ref), Newline(ref, "    "), ref ? "Inter" : "Intra");
+      band = 0;
       do {
-        unsigned int branch_ct[ENTROPY_NODES][2];
-        unsigned int coef_counts[MAX_ENTROPY_TOKENS];
-        vp9_prob coef_probs[ENTROPY_NODES];
-
-        for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-          coef_counts[t] = context_counters[type][band][pt][t];
-        vp9_tree_probs_from_distribution(MAX_ENTROPY_TOKENS,
-                                         vp9_coef_encodings, vp9_coef_tree,
-                                         coef_probs, branch_ct, coef_counts);
-        fprintf(f, "%s\n      {", Comma(pt));
-
-        t = 0;
+        fprintf(f, "%s%s{ /* Coeff Band %d */",
+                Comma(band), Newline(band, "      "), band);
+        pt = 0;
         do {
-          fprintf(f, "%s %3d", Comma(t), coef_probs[t]);
-        } while (++t < ENTROPY_NODES);
-
-        fprintf(f, " }");
-      } while (++pt < PREV_COEF_CONTEXTS);
+          unsigned int branch_ct[ENTROPY_NODES][2];
+          unsigned int coef_counts[MAX_ENTROPY_TOKENS];
+          vp9_prob coef_probs[ENTROPY_NODES];
+
+          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+            coef_counts[t] = context_counters[type][ref][band][pt][t];
+          vp9_tree_probs_from_distribution(MAX_ENTROPY_TOKENS,
+                                           vp9_coef_encodings, vp9_coef_tree,
+                                           coef_probs, branch_ct, coef_counts);
+          fprintf(f, "%s\n      {", Comma(pt));
+
+          t = 0;
+          do {
+            fprintf(f, "%s %3d", Comma(t), coef_probs[t]);
+          } while (++t < ENTROPY_NODES);
+
+          fprintf(f, " }");
+        } while (++pt < PREV_COEF_CONTEXTS);
+        fprintf(f, "\n      }");
+      } while (++band < COEF_BANDS);
       fprintf(f, "\n    }");
-    } while (++band < COEF_BANDS);
+    } while (++ref < REF_TYPES);
     fprintf(f, "\n  }");
   } while (++type < block_types);
   fprintf(f, "\n};\n");
@@ -664,34 +566,22 @@ void print_context_counters() {
   fprintf(f, "\n/* *** GENERATED FILE: DO NOT EDIT *** */\n\n");
 
   /* print counts */
-  print_counter(f, context_counters_4x4, BLOCK_TYPES_4X4,
+  print_counter(f, context_counters_4x4, BLOCK_TYPES,
                 "vp9_default_coef_counts_4x4[BLOCK_TYPES_4X4]");
-  print_counter(f, hybrid_context_counters_4x4, BLOCK_TYPES_4X4,
-                "vp9_default_hybrid_coef_counts_4x4[BLOCK_TYPES_4X4]");
-  print_counter(f, context_counters_8x8, BLOCK_TYPES_8X8,
+  print_counter(f, context_counters_8x8, BLOCK_TYPES,
                 "vp9_default_coef_counts_8x8[BLOCK_TYPES_8X8]");
-  print_counter(f, hybrid_context_counters_8x8, BLOCK_TYPES_8X8,
-                "vp9_default_hybrid_coef_counts_8x8[BLOCK_TYPES_8X8]");
-  print_counter(f, context_counters_16x16, BLOCK_TYPES_16X16,
+  print_counter(f, context_counters_16x16, BLOCK_TYPES,
                 "vp9_default_coef_counts_16x16[BLOCK_TYPES_16X16]");
-  print_counter(f, hybrid_context_counters_16x16, BLOCK_TYPES_16X16,
-                "vp9_default_hybrid_coef_counts_16x16[BLOCK_TYPES_16X16]");
   print_counter(f, context_counters_32x32, BLOCK_TYPES_32X32,
                 "vp9_default_coef_counts_32x32[BLOCK_TYPES_32X32]");
 
   /* print coefficient probabilities */
-  print_probs(f, context_counters_4x4, BLOCK_TYPES_4X4,
+  print_probs(f, context_counters_4x4, BLOCK_TYPES,
               "default_coef_probs_4x4[BLOCK_TYPES_4X4]");
-  print_probs(f, hybrid_context_counters_4x4, BLOCK_TYPES_4X4,
-              "default_hybrid_coef_probs_4x4[BLOCK_TYPES_4X4]");
-  print_probs(f, context_counters_8x8, BLOCK_TYPES_8X8,
+  print_probs(f, context_counters_8x8, BLOCK_TYPES,
               "default_coef_probs_8x8[BLOCK_TYPES_8X8]");
-  print_probs(f, hybrid_context_counters_8x8, BLOCK_TYPES_8X8,
-              "default_hybrid_coef_probs_8x8[BLOCK_TYPES_8X8]");
-  print_probs(f, context_counters_16x16, BLOCK_TYPES_16X16,
+  print_probs(f, context_counters_16x16, BLOCK_TYPES,
               "default_coef_probs_16x16[BLOCK_TYPES_16X16]");
-  print_probs(f, hybrid_context_counters_16x16, BLOCK_TYPES_16X16,
-              "default_hybrid_coef_probs_16x16[BLOCK_TYPES_16X16]");
   print_probs(f, context_counters_32x32, BLOCK_TYPES_32X32,
               "default_coef_probs_32x32[BLOCK_TYPES_32X32]");
 
@@ -699,14 +589,8 @@ void print_context_counters() {
 
   f = fopen("context.bin", "wb");
   fwrite(context_counters_4x4, sizeof(context_counters_4x4), 1, f);
-  fwrite(hybrid_context_counters_4x4,
-         sizeof(hybrid_context_counters_4x4), 1, f);
   fwrite(context_counters_8x8, sizeof(context_counters_8x8), 1, f);
-  fwrite(hybrid_context_counters_8x8,
-         sizeof(hybrid_context_counters_8x8), 1, f);
   fwrite(context_counters_16x16, sizeof(context_counters_16x16), 1, f);
-  fwrite(hybrid_context_counters_16x16,
-         sizeof(hybrid_context_counters_16x16), 1, f);
   fwrite(context_counters_32x32, sizeof(context_counters_32x32), 1, f);
   fclose(f);
 }
@@ -716,21 +600,18 @@ void vp9_tokenize_initialize() {
   fill_value_tokens();
 }
 
-static __inline void stuff_b(VP9_COMP *cpi,
-                             MACROBLOCKD *xd,
-                             const int ib,
-                             TOKENEXTRA **tp,
-                             PLANE_TYPE type,
-                             TX_SIZE tx_size,
-                             int dry_run) {
-  const BLOCKD * const b = xd->block + ib;
-  const int *bands;
+static INLINE void stuff_b(VP9_COMP *cpi,
+                           MACROBLOCKD *xd,
+                           const int ib,
+                           TOKENEXTRA **tp,
+                           PLANE_TYPE type,
+                           TX_SIZE tx_size,
+                           int dry_run) {
   vp9_coeff_count *counts;
   vp9_coeff_probs *probs;
   int pt, band;
   TOKENEXTRA *t = *tp;
-  const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                          get_tx_type(xd, b) : DCT_DCT;
+  const int ref = xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME;
   ENTROPY_CONTEXT *const a = (ENTROPY_CONTEXT *)xd->above_context +
       vp9_block2above[tx_size][ib];
   ENTROPY_CONTEXT *const l = (ENTROPY_CONTEXT *)xd->left_context +
@@ -744,33 +625,16 @@ static __inline void stuff_b(VP9_COMP *cpi,
   switch (tx_size) {
     default:
     case TX_4X4:
-      bands = vp9_coef_bands_4x4;
-      if (tx_type != DCT_DCT) {
-        counts = cpi->hybrid_coef_counts_4x4;
-        probs = cpi->common.fc.hybrid_coef_probs_4x4;
-      } else {
-        counts = cpi->coef_counts_4x4;
-        probs = cpi->common.fc.coef_probs_4x4;
-      }
+      counts = cpi->coef_counts_4x4;
+      probs = cpi->common.fc.coef_probs_4x4;
       break;
     case TX_8X8:
-#if CONFIG_CNVCONTEXT
-      if (type != PLANE_TYPE_Y2) {
-        a_ec = (a[0] + a[1]) != 0;
-        l_ec = (l[0] + l[1]) != 0;
-      }
-#endif
-      bands = vp9_coef_bands_8x8;
-      if (tx_type != DCT_DCT) {
-        counts = cpi->hybrid_coef_counts_8x8;
-        probs = cpi->common.fc.hybrid_coef_probs_8x8;
-      } else {
-        counts = cpi->coef_counts_8x8;
-        probs = cpi->common.fc.coef_probs_8x8;
-      }
+      a_ec = (a[0] + a[1]) != 0;
+      l_ec = (l[0] + l[1]) != 0;
+      counts = cpi->coef_counts_8x8;
+      probs = cpi->common.fc.coef_probs_8x8;
       break;
     case TX_16X16:
-#if CONFIG_CNVCONTEXT
       if (type != PLANE_TYPE_UV) {
         a_ec = (a[0] + a[1] + a[2] + a[3]) != 0;
         l_ec = (l[0] + l[1] + l[2] + l[3]) != 0;
@@ -778,26 +642,16 @@ static __inline void stuff_b(VP9_COMP *cpi,
         a_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
         l_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
       }
-#endif
-      bands = vp9_coef_bands_16x16;
-      if (tx_type != DCT_DCT) {
-        counts = cpi->hybrid_coef_counts_16x16;
-        probs = cpi->common.fc.hybrid_coef_probs_16x16;
-      } else {
-        counts = cpi->coef_counts_16x16;
-        probs = cpi->common.fc.coef_probs_16x16;
-      }
+      counts = cpi->coef_counts_16x16;
+      probs = cpi->common.fc.coef_probs_16x16;
       break;
     case TX_32X32:
-#if CONFIG_CNVCONTEXT
       a_ec = a[0] + a[1] + a[2] + a[3] +
              a1[0] + a1[1] + a1[2] + a1[3];
       l_ec = l[0] + l[1] + l[2] + l[3] +
              l1[0] + l1[1] + l1[2] + l1[3];
       a_ec = a_ec != 0;
       l_ec = l_ec != 0;
-#endif
-      bands = vp9_coef_bands_32x32;
       counts = cpi->coef_counts_32x32;
       probs = cpi->common.fc.coef_probs_32x32;
       break;
@@ -805,14 +659,14 @@ static __inline void stuff_b(VP9_COMP *cpi,
 
   VP9_COMBINEENTROPYCONTEXTS(pt, a_ec, l_ec);
 
-  band = bands[(type == PLANE_TYPE_Y_NO_DC) ? 1 : 0];
+  band = get_coef_band(tx_size, 0);
   t->Token = DCT_EOB_TOKEN;
-  t->context_tree = probs[type][band][pt];
+  t->context_tree = probs[type][ref][band][pt];
   t->skip_eob_node = 0;
   ++t;
   *tp = t;
   *a = *l = 0;
-  if (tx_size == TX_8X8 && type != PLANE_TYPE_Y2) {
+  if (tx_size == TX_8X8) {
     a[1] = 0;
     l[1] = 0;
   } else if (tx_size == TX_16X16) {
@@ -831,32 +685,18 @@ static __inline void stuff_b(VP9_COMP *cpi,
   }
 
   if (!dry_run) {
-    ++counts[type][band][pt][DCT_EOB_TOKEN];
+    ++counts[type][ref][band][pt][DCT_EOB_TOKEN];
   }
 }
 
 static void stuff_mb_8x8(VP9_COMP *cpi, MACROBLOCKD *xd,
                          TOKENEXTRA **t, int dry_run) {
-  PLANE_TYPE plane_type;
   int b;
-  int has_2nd_order = get_2nd_order_usage(xd);
-
-  if (has_2nd_order) {
-    stuff_b(cpi, xd, 24, t, PLANE_TYPE_Y2, TX_8X8, dry_run);
-    plane_type = PLANE_TYPE_Y_NO_DC;
-  } else {
-#if CONFIG_CNVCONTEXT
-    xd->above_context->y2 = 0;
-    xd->left_context->y2 = 0;
-#endif
-    plane_type = PLANE_TYPE_Y_WITH_DC;
-  }
 
-  for (b = 0; b < 24; b += 4) {
-    if (b >= 16)
-      plane_type = PLANE_TYPE_UV;
-    stuff_b(cpi, xd, b, t, plane_type, TX_8X8, dry_run);
-  }
+  for (b = 0; b < 16; b += 4)
+    stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_8X8, dry_run);
+  for (b = 16; b < 24; b += 4)
+    stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run);
 }
 
 static void stuff_mb_16x16(VP9_COMP *cpi, MACROBLOCKD *xd,
@@ -867,56 +707,26 @@ static void stuff_mb_16x16(VP9_COMP *cpi, MACROBLOCKD *xd,
   for (b = 16; b < 24; b += 4) {
     stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run);
   }
-#if CONFIG_CNVCONTEXT
-  xd->above_context->y2 = 0;
-  xd->left_context->y2 = 0;
-#endif
 }
 
 static void stuff_mb_4x4(VP9_COMP *cpi, MACROBLOCKD *xd,
                          TOKENEXTRA **t, int dry_run) {
   int b;
-  PLANE_TYPE plane_type;
-  int has_2nd_order = get_2nd_order_usage(xd);
-
-  if (has_2nd_order) {
-    stuff_b(cpi, xd, 24, t, PLANE_TYPE_Y2, TX_4X4, dry_run);
-    plane_type = PLANE_TYPE_Y_NO_DC;
-  } else {
-    xd->above_context->y2 = 0;
-    xd->left_context->y2 = 0;
-    plane_type = PLANE_TYPE_Y_WITH_DC;
-  }
 
-  for (b = 0; b < 24; b++) {
-    if (b >= 16)
-      plane_type = PLANE_TYPE_UV;
-    stuff_b(cpi, xd, b, t, plane_type, TX_4X4, dry_run);
-  }
+  for (b = 0; b < 16; b++)
+    stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_4X4, dry_run);
+  for (b = 16; b < 24; b++)
+    stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run);
 }
 
 static void stuff_mb_8x8_4x4uv(VP9_COMP *cpi, MACROBLOCKD *xd,
                                TOKENEXTRA **t, int dry_run) {
-  PLANE_TYPE plane_type;
   int b;
 
-  int has_2nd_order = get_2nd_order_usage(xd);
-  if (has_2nd_order) {
-    stuff_b(cpi, xd, 24, t, PLANE_TYPE_Y2, TX_8X8, dry_run);
-    plane_type = PLANE_TYPE_Y_NO_DC;
-  } else {
-    xd->above_context->y2 = 0;
-    xd->left_context->y2 = 0;
-    plane_type = PLANE_TYPE_Y_WITH_DC;
-  }
-
-  for (b = 0; b < 16; b += 4) {
-    stuff_b(cpi, xd, b, t, plane_type, TX_8X8, dry_run);
-  }
-
-  for (b = 16; b < 24; b++) {
+  for (b = 0; b < 16; b += 4)
+    stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_8X8, dry_run);
+  for (b = 16; b < 24; b++)
     stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run);
-  }
 }
 
 void vp9_stuff_mb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) {
diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h
index 3eeb8fa5a..6ac19ba71 100644
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h
@@ -28,42 +28,39 @@ typedef struct {
   uint8_t         skip_eob_node;
 } TOKENEXTRA;
 
-typedef int64_t vp9_coeff_accum[COEF_BANDS][PREV_COEF_CONTEXTS]
+typedef int64_t vp9_coeff_accum[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
                                [MAX_ENTROPY_TOKENS];
 
-extern int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd, int has_y2_block);
-extern int vp9_mbuv_is_skippable_4x4(MACROBLOCKD *xd);
-extern int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd, int has_y2_block);
-extern int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd);
-extern int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd);
-extern int vp9_sby_is_skippable_32x32(MACROBLOCKD *xd);
-extern int vp9_sbuv_is_skippable_16x16(MACROBLOCKD *xd);
+int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd);
+int vp9_mbuv_is_skippable_4x4(MACROBLOCKD *xd);
+int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd);
+int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd);
+int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd);
+int vp9_sby_is_skippable_32x32(MACROBLOCKD *xd);
+int vp9_sbuv_is_skippable_16x16(MACROBLOCKD *xd);
 
 struct VP9_COMP;
 
-extern void vp9_tokenize_mb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
-                            TOKENEXTRA **t, int dry_run);
-extern void vp9_tokenize_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
-                            TOKENEXTRA **t, int dry_run);
+void vp9_tokenize_mb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
+                     TOKENEXTRA **t, int dry_run);
+void vp9_tokenize_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
+                     TOKENEXTRA **t, int dry_run);
 
-extern void vp9_stuff_mb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
-                         TOKENEXTRA **t, int dry_run);
-extern void vp9_stuff_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
-                         TOKENEXTRA **t, int dry_run);
+void vp9_stuff_mb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
+                  TOKENEXTRA **t, int dry_run);
+void vp9_stuff_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
+                  TOKENEXTRA **t, int dry_run);
+
+void vp9_fix_contexts_sb(MACROBLOCKD *xd);
 
-extern void vp9_fix_contexts_sb(MACROBLOCKD *xd);
 #ifdef ENTROPY_STATS
 void init_context_counters();
 void print_context_counters();
 
-extern vp9_coeff_accum context_counters_4x4[BLOCK_TYPES_4X4];
-extern vp9_coeff_accum context_counters_8x8[BLOCK_TYPES_8X8];
-extern vp9_coeff_accum context_counters_16x16[BLOCK_TYPES_16X16];
+extern vp9_coeff_accum context_counters_4x4[BLOCK_TYPES];
+extern vp9_coeff_accum context_counters_8x8[BLOCK_TYPES];
+extern vp9_coeff_accum context_counters_16x16[BLOCK_TYPES];
 extern vp9_coeff_accum context_counters_32x32[BLOCK_TYPES_32X32];
-
-extern vp9_coeff_accum hybrid_context_counters_4x4[BLOCK_TYPES_4X4];
-extern vp9_coeff_accum hybrid_context_counters_8x8[BLOCK_TYPES_8X8];
-extern vp9_coeff_accum hybrid_context_counters_16x16[BLOCK_TYPES_16X16];
 #endif
 
 extern const int *vp9_dct_value_cost_ptr;
diff --git a/vp9/encoder/vp9_treewriter.h b/vp9/encoder/vp9_treewriter.h
index 4e0e5e12c..832471aa8 100644
--- a/vp9/encoder/vp9_treewriter.h
+++ b/vp9/encoder/vp9_treewriter.h
@@ -36,30 +36,28 @@ typedef BOOL_CODER vp9_writer;
 
 
 /* Both of these return bits, not scaled bits. */
-
-static __inline unsigned int cost_branch(const unsigned int ct[2],
-                                         vp9_prob p) {
+static INLINE unsigned int cost_branch256(const unsigned int ct[2],
+                                          vp9_prob p) {
   /* Imitate existing calculation */
-  return ((ct[0] * vp9_cost_zero(p))
-          + (ct[1] * vp9_cost_one(p))) >> 8;
+  return ct[0] * vp9_cost_zero(p) + ct[1] * vp9_cost_one(p);
 }
 
-static __inline unsigned int cost_branch256(const unsigned int ct[2],
-                                            vp9_prob p) {
+static INLINE unsigned int cost_branch(const unsigned int ct[2],
+                                       vp9_prob p) {
   /* Imitate existing calculation */
-  return ((ct[0] * vp9_cost_zero(p))
-          + (ct[1] * vp9_cost_one(p)));
+  return cost_branch256(ct, p) >> 8;
 }
 
+
 /* Small functions to write explicit values and tokens, as well as
    estimate their lengths. */
 
-static __inline void treed_write(vp9_writer *const w,
-                                 vp9_tree t,
-                                 const vp9_prob *const p,
-                                 int v,
-                                 /* number of bits in v, assumed nonzero */
-                                 int n) {
+static INLINE void treed_write(vp9_writer *const w,
+                               vp9_tree t,
+                               const vp9_prob *const p,
+                               int v,
+                               /* number of bits in v, assumed nonzero */
+                               int n) {
   vp9_tree_index i = 0;
 
   do {
@@ -69,18 +67,18 @@ static __inline void treed_write(vp9_writer *const w,
   } while (n);
 }
 
-static __inline void write_token(vp9_writer *const w,
-                                 vp9_tree t,
-                                 const vp9_prob *const p,
-                                 vp9_token *const x) {
+static INLINE void write_token(vp9_writer *const w,
+                               vp9_tree t,
+                               const vp9_prob *const p,
+                               vp9_token *const x) {
   treed_write(w, t, p, x->value, x->Len);
 }
 
-static __inline int treed_cost(vp9_tree t,
-                               const vp9_prob *const p,
-                               int v,
-                               /* number of bits in v, assumed nonzero */
-                               int n) {
+static INLINE int treed_cost(vp9_tree t,
+                             const vp9_prob *const p,
+                             int v,
+                             /* number of bits in v, assumed nonzero */
+                             int n) {
   int c = 0;
   vp9_tree_index i = 0;
 
@@ -93,9 +91,9 @@ static __inline int treed_cost(vp9_tree t,
   return c;
 }
 
-static __inline int cost_token(vp9_tree t,
-                               const vp9_prob *const p,
-                               vp9_token *const x) {
+static INLINE int cost_token(vp9_tree t,
+                             const vp9_prob *const p,
+                             vp9_token *const x) {
   return treed_cost(t, p, x->value, x->Len);
 }
 
diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h
index 675dbb63e..7120c5fe7 100644
--- a/vp9/encoder/vp9_variance.h
+++ b/vp9/encoder/vp9_variance.h
@@ -19,12 +19,6 @@ typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
                                     int ref_stride,
                                     unsigned int max_sad);
 
-typedef void (*vp9_copy32xn_fn_t)(const uint8_t *src_ptr,
-                                  int source_stride,
-                                  const uint8_t *ref_ptr,
-                                  int ref_stride,
-                                  int n);
-
 typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr,
                                    int source_stride,
                                    const uint8_t *ref_ptr,
@@ -35,7 +29,7 @@ typedef void (*vp9_sad_multi1_fn_t)(const uint8_t *src_ptr,
                                     int source_stride,
                                     const uint8_t *ref_ptr,
                                     int  ref_stride,
-                                    unsigned short *sad_array);
+                                    unsigned int *sad_array);
 
 typedef void (*vp9_sad_multi_d_fn_t)(const uint8_t *src_ptr,
                                      int source_stride,
@@ -79,7 +73,6 @@ typedef struct vp9_variance_vtable {
     vp9_sad_multi_fn_t      sdx3f;
     vp9_sad_multi1_fn_t     sdx8f;
     vp9_sad_multi_d_fn_t    sdx4df;
-    vp9_copy32xn_fn_t       copymem;
 } vp9_variance_fn_ptr_t;
 
 #endif  // VP9_ENCODER_VP9_VARIANCE_H_
diff --git a/vp9/encoder/vp9_variance_c.c b/vp9/encoder/vp9_variance_c.c
index d03e285c6..d07a65b45 100644
--- a/vp9/encoder/vp9_variance_c.c
+++ b/vp9/encoder/vp9_variance_c.c
@@ -142,8 +142,8 @@ unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr,
   const int16_t *HFilter, *VFilter;
   uint16_t FData3[5 * 4];  // Temp data bufffer used in filtering
 
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
   // First filter 1d Horizontal
   var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter);
@@ -166,8 +166,8 @@ unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr,
   uint8_t temp2[20 * 16];
   const int16_t *HFilter, *VFilter;
 
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
   var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
   var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);
@@ -186,8 +186,8 @@ unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr,
   uint8_t temp2[20 * 16];
   const int16_t *HFilter, *VFilter;
 
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
   var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
   var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);
@@ -206,8 +206,8 @@ unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr,
   uint8_t temp2[68 * 64];
   const int16_t *HFilter, *VFilter;
 
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
   var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line,
                                     1, 65, 64, HFilter);
@@ -227,8 +227,8 @@ unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr,
   uint8_t temp2[36 * 32];
   const int16_t *HFilter, *VFilter;
 
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
   var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 33, 32, HFilter);
   var_filter_block2d_bil_second_pass(FData3, temp2, 32, 32, 32, 32, VFilter);
@@ -367,8 +367,8 @@ unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr,
   uint8_t temp2[20 * 16];
   const int16_t *HFilter, *VFilter;
 
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
   var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
   var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);
@@ -387,8 +387,8 @@ unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr,
   uint8_t temp2[20 * 16];
   const int16_t *HFilter, *VFilter;
 
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
   var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line,
                                     1, 17, 8, HFilter);
diff --git a/vp9/encoder/x86/vp9_dct_sse2_intrinsics.c b/vp9/encoder/x86/vp9_dct_sse2_intrinsics.c
new file mode 100644
index 000000000..ff884d999
--- /dev/null
+++ b/vp9/encoder/x86/vp9_dct_sse2_intrinsics.c
@@ -0,0 +1,272 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>  // SSE2
+#include "vp9/common/vp9_idct.h"  // for cospi constants
+
+#define pair_set_epi16(a, b) /* 32-bit lanes: a in bits 0-15, b in 16-31 */ \
+  _mm_set1_epi32((int)((uint16_t)(a) | ((uint32_t)(uint16_t)(b) << 16)))
+
+void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
+  const int stride = pitch >> 1;
+  int pass;
+  // Constants
+  //    When we use them, in one case, they are all the same. In all others
+  //    it's a pair of them that we need to repeat four times. This is done
+  //    by constructing the 32 bit constant corresponding to that pair.
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  // Load input
+  __m128i in0  = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
+  __m128i in1  = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
+  __m128i in2  = _mm_loadu_si128((const __m128i *)(input + 2 * stride));
+  __m128i in3  = _mm_loadu_si128((const __m128i *)(input + 3 * stride));
+  __m128i in4  = _mm_loadu_si128((const __m128i *)(input + 4 * stride));
+  __m128i in5  = _mm_loadu_si128((const __m128i *)(input + 5 * stride));
+  __m128i in6  = _mm_loadu_si128((const __m128i *)(input + 6 * stride));
+  __m128i in7  = _mm_loadu_si128((const __m128i *)(input + 7 * stride));
+  // Pre-condition input (shift by two)
+  in0 = _mm_slli_epi16(in0, 2);
+  in1 = _mm_slli_epi16(in1, 2);
+  in2 = _mm_slli_epi16(in2, 2);
+  in3 = _mm_slli_epi16(in3, 2);
+  in4 = _mm_slli_epi16(in4, 2);
+  in5 = _mm_slli_epi16(in5, 2);
+  in6 = _mm_slli_epi16(in6, 2);
+  in7 = _mm_slli_epi16(in7, 2);
+
+  // We do two passes, first the columns, then the rows. The results of the
+  // first pass are transposed so that the same column code can be reused. The
+  // results of the second pass are also transposed so that the rows (processed
+  // as columns) are put back in row positions.
+  for (pass = 0; pass < 2; pass++) {
+    // To store results of each pass before the transpose.
+    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+    // Add/subtract
+    const __m128i q0 = _mm_add_epi16(in0, in7);
+    const __m128i q1 = _mm_add_epi16(in1, in6);
+    const __m128i q2 = _mm_add_epi16(in2, in5);
+    const __m128i q3 = _mm_add_epi16(in3, in4);
+    const __m128i q4 = _mm_sub_epi16(in3, in4);
+    const __m128i q5 = _mm_sub_epi16(in2, in5);
+    const __m128i q6 = _mm_sub_epi16(in1, in6);
+    const __m128i q7 = _mm_sub_epi16(in0, in7);
+    // Work on first four results
+    {
+      // Add/subtract
+      const __m128i r0 = _mm_add_epi16(q0, q3);
+      const __m128i r1 = _mm_add_epi16(q1, q2);
+      const __m128i r2 = _mm_sub_epi16(q1, q2);
+      const __m128i r3 = _mm_sub_epi16(q0, q3);
+      // Interleave to do the multiply by constants which gets us into 32bits
+      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+      // dct_const_round_shift
+      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+      // Combine
+      res0 = _mm_packs_epi32(w0, w1);
+      res4 = _mm_packs_epi32(w2, w3);
+      res2 = _mm_packs_epi32(w4, w5);
+      res6 = _mm_packs_epi32(w6, w7);
+    }
+    // Work on next four results
+    {
+      // Interleave to do the multiply by constants which gets us into 32bits
+      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
+      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
+      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
+      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
+      // dct_const_round_shift
+      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
+      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
+      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
+      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
+      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
+      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
+      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
+      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
+      // Combine
+      const __m128i r0 = _mm_packs_epi32(s0, s1);
+      const __m128i r1 = _mm_packs_epi32(s2, s3);
+      // Add/subtract
+      const __m128i x0 = _mm_add_epi16(q4, r0);
+      const __m128i x1 = _mm_sub_epi16(q4, r0);
+      const __m128i x2 = _mm_sub_epi16(q7, r1);
+      const __m128i x3 = _mm_add_epi16(q7, r1);
+      // Interleave to do the multiply by constants which gets us into 32bits
+      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+      // dct_const_round_shift
+      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+      // Combine
+      res1 = _mm_packs_epi32(w0, w1);
+      res7 = _mm_packs_epi32(w2, w3);
+      res5 = _mm_packs_epi32(w4, w5);
+      res3 = _mm_packs_epi32(w6, w7);
+    }
+    // Transpose the 8x8.
+    {
+      // 00 01 02 03 04 05 06 07
+      // 10 11 12 13 14 15 16 17
+      // 20 21 22 23 24 25 26 27
+      // 30 31 32 33 34 35 36 37
+      // 40 41 42 43 44 45 46 47
+      // 50 51 52 53 54 55 56 57
+      // 60 61 62 63 64 65 66 67
+      // 70 71 72 73 74 75 76 77
+      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
+      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
+      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
+      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
+      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
+      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
+      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
+      // 00 10 01 11 02 12 03 13
+      // 20 30 21 31 22 32 23 33
+      // 04 14 05 15 06 16 07 17
+      // 24 34 25 35 26 36 27 37
+      // 40 50 41 51 42 52 43 53
+      // 60 70 61 71 62 72 63 73
+      // 44 54 45 55 46 56 47 57
+      // 64 74 65 75 66 76 67 77
+      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+      // 00 10 20 30 01 11 21 31
+      // 40 50 60 70 41 51 61 71
+      // 02 12 22 32 03 13 23 33
+      // 42 52 62 72 43 53 63 73
+      // 04 14 24 34 05 15 25 35
+      // 44 54 64 74 45 55 65 75
+      // 06 16 26 36 07 17 27 37
+      // 46 56 66 76 47 57 67 77
+      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+      // 00 10 20 30 40 50 60 70
+      // 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72
+      // 03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74
+      // 05 15 25 35 45 55 65 75
+      // 06 16 26 36 46 56 66 76
+      // 07 17 27 37 47 57 67 77
+    }
+  }
+  // Post-condition output and store it
+  {
+    // Post-condition (division by two)
+    //    division by two of 16-bit signed numbers using shifts
+    //    n / 2 = (n - (n >> 15)) >> 1
+    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
+    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
+    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
+    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
+    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
+    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
+    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
+    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
+    in0 = _mm_sub_epi16(in0, sign_in0);
+    in1 = _mm_sub_epi16(in1, sign_in1);
+    in2 = _mm_sub_epi16(in2, sign_in2);
+    in3 = _mm_sub_epi16(in3, sign_in3);
+    in4 = _mm_sub_epi16(in4, sign_in4);
+    in5 = _mm_sub_epi16(in5, sign_in5);
+    in6 = _mm_sub_epi16(in6, sign_in6);
+    in7 = _mm_sub_epi16(in7, sign_in7);
+    in0 = _mm_srai_epi16(in0, 1);
+    in1 = _mm_srai_epi16(in1, 1);
+    in2 = _mm_srai_epi16(in2, 1);
+    in3 = _mm_srai_epi16(in3, 1);
+    in4 = _mm_srai_epi16(in4, 1);
+    in5 = _mm_srai_epi16(in5, 1);
+    in6 = _mm_srai_epi16(in6, 1);
+    in7 = _mm_srai_epi16(in7, 1);
+    // store results
+    _mm_storeu_si128 ((__m128i *)(output + 0 * 8), in0);
+    _mm_storeu_si128 ((__m128i *)(output + 1 * 8), in1);
+    _mm_storeu_si128 ((__m128i *)(output + 2 * 8), in2);
+    _mm_storeu_si128 ((__m128i *)(output + 3 * 8), in3);
+    _mm_storeu_si128 ((__m128i *)(output + 4 * 8), in4);
+    _mm_storeu_si128 ((__m128i *)(output + 5 * 8), in5);
+    _mm_storeu_si128 ((__m128i *)(output + 6 * 8), in6);
+    _mm_storeu_si128 ((__m128i *)(output + 7 * 8), in7);
+  }
+}
diff --git a/vp9/encoder/x86/vp9_encodeopt.asm b/vp9/encoder/x86/vp9_encodeopt.asm
index 5d9f7769d..90c793d4f 100644
--- a/vp9/encoder/x86/vp9_encodeopt.asm
+++ b/vp9/encoder/x86/vp9_encodeopt.asm
@@ -125,7 +125,7 @@ sym(vp9_block_error_mmx):
     ret
 
 
-;int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+;int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr);
 global sym(vp9_mbblock_error_mmx_impl) PRIVATE
 sym(vp9_mbblock_error_mmx_impl):
     push        rbp
@@ -142,10 +142,6 @@ sym(vp9_mbblock_error_mmx_impl):
         mov         rdi,        arg(1) ;dcoef_ptr
         pxor        mm2,        mm2
 
-        movd        mm1,        dword ptr arg(2) ;dc
-        por         mm1,        mm2
-
-        pcmpeqw     mm1,        mm7
         mov         rcx,        16
 
 .mberror_loop_mmx:
@@ -160,7 +156,6 @@ sym(vp9_mbblock_error_mmx_impl):
         pmaddwd     mm5,        mm5
 
         psubw       mm3,        mm4
-        pand        mm3,        mm1
 
         pmaddwd     mm3,        mm3
         paddd       mm2,        mm5
@@ -202,28 +197,24 @@ sym(vp9_mbblock_error_mmx_impl):
     ret
 
 
-;int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+;int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr);
 global sym(vp9_mbblock_error_xmm_impl) PRIVATE
 sym(vp9_mbblock_error_xmm_impl):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 3
-    SAVE_XMM 6
+    SAVE_XMM 5
     push rsi
     push rdi
     ; end prolog
 
 
         mov         rsi,        arg(0) ;coeff_ptr
-        pxor        xmm6,       xmm6
+        pxor        xmm5,       xmm5
 
         mov         rdi,        arg(1) ;dcoef_ptr
         pxor        xmm4,       xmm4
 
-        movd        xmm5,       dword ptr arg(2) ;dc
-        por         xmm5,       xmm4
-
-        pcmpeqw     xmm5,       xmm6
         mov         rcx,        16
 
 .mberror_loop:
@@ -238,7 +229,6 @@ sym(vp9_mbblock_error_xmm_impl):
         pmaddwd     xmm2,       xmm2
 
         psubw       xmm0,       xmm1
-        pand        xmm0,       xmm5
 
         pmaddwd     xmm0,       xmm0
         add         rsi,        32
@@ -252,9 +242,9 @@ sym(vp9_mbblock_error_xmm_impl):
         jnz         .mberror_loop
 
         movdqa      xmm0,       xmm4
-        punpckldq   xmm0,       xmm6
+        punpckldq   xmm0,       xmm5
 
-        punpckhdq   xmm4,       xmm6
+        punpckhdq   xmm4,       xmm5
         paddd       xmm0,       xmm4
 
         movdqa      xmm1,       xmm0
diff --git a/vp9/encoder/x86/vp9_sad4d_sse2.asm b/vp9/encoder/x86/vp9_sad4d_sse2.asm
new file mode 100644
index 000000000..3716d91ec
--- /dev/null
+++ b/vp9/encoder/x86/vp9_sad4d_sse2.asm
@@ -0,0 +1,225 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end -- SAD of two 4-px src rows against ref1..ref4, summed in m6/m7
+%macro PROCESS_4x2x4 5-6 0
+  movd                  m0, [srcq +%2]  ; 4 src pixels of the first row
+%if %1 == 1
+  movd                  m6, [ref1q+%3]  ; first call: load straight into the accumulators (m6, m7) and their partners (m4, m5)
+  movd                  m4, [ref2q+%3]
+  movd                  m7, [ref3q+%3]
+  movd                  m5, [ref4q+%3]
+  punpckldq             m0, [srcq +%4]  ; append the second row, so each register now holds 2 rows x 4 px
+  punpckldq             m6, [ref1q+%5]
+  punpckldq             m4, [ref2q+%5]
+  punpckldq             m7, [ref3q+%5]
+  punpckldq             m5, [ref4q+%5]
+  psadbw                m6, m0  ; one SAD per reference, left in the low part of each register
+  psadbw                m4, m0
+  psadbw                m7, m0
+  psadbw                m5, m0
+  punpckldq             m6, m4  ; m6 = {SAD ref1, SAD ref2}
+  punpckldq             m7, m5  ; m7 = {SAD ref3, SAD ref4}
+%else
+  movd                  m1, [ref1q+%3]  ; later calls: work in scratch m1..m4, then add into m6/m7
+  movd                  m2, [ref2q+%3]
+  movd                  m3, [ref3q+%3]
+  movd                  m4, [ref4q+%3]
+  punpckldq             m0, [srcq +%4]  ; append the second src row
+  punpckldq             m1, [ref1q+%5]
+  punpckldq             m2, [ref2q+%5]
+  punpckldq             m3, [ref3q+%5]
+  punpckldq             m4, [ref4q+%5]
+  psadbw                m1, m0
+  psadbw                m2, m0
+  psadbw                m3, m0
+  psadbw                m4, m0
+  punpckldq             m1, m2  ; pair the results the same way as the accumulators
+  punpckldq             m3, m4
+  paddd                 m6, m1  ; running {ref1, ref2} totals
+  paddd                 m7, m3  ; running {ref3, ref4} totals
+%endif
+%if %6 == 1
+  lea                 srcq, [srcq +src_strideq*2]  ; advance_at_end: step all five pointers down two rows
+  lea                ref1q, [ref1q+ref_strideq*2]
+  lea                ref2q, [ref2q+ref_strideq*2]
+  lea                ref3q, [ref3q+ref_strideq*2]
+  lea                ref4q, [ref4q+ref_strideq*2]
+%endif
+%endmacro
+
+; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end -- SAD of two 8-px src rows against ref1..ref4, summed in m4..m7
+%macro PROCESS_8x2x4 5-6 0
+  movh                  m0, [srcq +%2]  ; first src row into the low half of m0
+%if %1 == 1
+  movh                  m4, [ref1q+%3]  ; first call: m4..m7 are initialised directly, one register per reference
+  movh                  m5, [ref2q+%3]
+  movh                  m6, [ref3q+%3]
+  movh                  m7, [ref4q+%3]
+  movhps                m0, [srcq +%4]  ; second row into the high half, so each register holds 2 rows x 8 px
+  movhps                m4, [ref1q+%5]
+  movhps                m5, [ref2q+%5]
+  movhps                m6, [ref3q+%5]
+  movhps                m7, [ref4q+%5]
+  psadbw                m4, m0  ; each register now carries two partial SADs (one per 8-byte half)
+  psadbw                m5, m0
+  psadbw                m6, m0
+  psadbw                m7, m0
+%else
+  movh                  m1, [ref1q+%3]  ; later calls: ref1..ref3 go through scratch m1..m3
+  movh                  m2, [ref2q+%3]
+  movh                  m3, [ref3q+%3]
+  movhps                m0, [srcq +%4]  ; second src row into the high half
+  movhps                m1, [ref1q+%5]
+  movhps                m2, [ref2q+%5]
+  movhps                m3, [ref3q+%5]
+  psadbw                m1, m0
+  psadbw                m2, m0
+  psadbw                m3, m0
+  paddd                 m4, m1  ; fold ref1 first so m1 is free again
+  movh                  m1, [ref4q+%3]  ; ref4 reuses m1 (m0..m7 are all in use)
+  movhps                m1, [ref4q+%5]
+  paddd                 m5, m2
+  paddd                 m6, m3
+  psadbw                m1, m0
+  paddd                 m7, m1
+%endif
+%if %6 == 1
+  lea                 srcq, [srcq +src_strideq*2]  ; advance_at_end: step all five pointers down two rows
+  lea                ref1q, [ref1q+ref_strideq*2]
+  lea                ref2q, [ref2q+ref_strideq*2]
+  lea                ref3q, [ref3q+ref_strideq*2]
+  lea                ref4q, [ref4q+ref_strideq*2]
+%endif
+%endmacro
+
+; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end -- SAD of two 16-px src spans against ref1..ref4, summed in m4..m7
+%macro PROCESS_16x2x4 5-6 0
+  ; 1st 16 px (the "first" src/ref offsets)
+  mova                  m0, [srcq +%2]  ; aligned load: src is presumably 16-byte aligned here -- TODO confirm against callers
+%if %1 == 1
+  movu                  m4, [ref1q+%3]  ; first call: m4..m7 are initialised directly; refs use unaligned loads
+  movu                  m5, [ref2q+%3]
+  movu                  m6, [ref3q+%3]
+  movu                  m7, [ref4q+%3]
+  psadbw                m4, m0  ; each register carries two partial SADs (one per 8-byte half)
+  psadbw                m5, m0
+  psadbw                m6, m0
+  psadbw                m7, m0
+%else
+  movu                  m1, [ref1q+%3]  ; later calls: ref1..ref3 go through scratch m1..m3
+  movu                  m2, [ref2q+%3]
+  movu                  m3, [ref3q+%3]
+  psadbw                m1, m0
+  psadbw                m2, m0
+  psadbw                m3, m0
+  paddd                 m4, m1  ; fold ref1 first so m1 is free again
+  movu                  m1, [ref4q+%3]  ; ref4 reuses m1
+  paddd                 m5, m2
+  paddd                 m6, m3
+  psadbw                m1, m0
+  paddd                 m7, m1
+%endif
+
+  ; 2nd 16 px (the "second" src/ref offsets); always accumulates
+  mova                  m0, [srcq +%4]  ; aligned load, same alignment assumption as above
+  movu                  m1, [ref1q+%5]
+  movu                  m2, [ref2q+%5]
+  movu                  m3, [ref3q+%5]
+  psadbw                m1, m0
+  psadbw                m2, m0
+  psadbw                m3, m0
+  paddd                 m4, m1
+  movu                  m1, [ref4q+%5]  ; last memory read of this invocation
+  paddd                 m5, m2
+  paddd                 m6, m3
+%if %6 == 1
+  lea                 srcq, [srcq +src_strideq*2]  ; pointers may advance now: the remaining work is register-only
+  lea                ref1q, [ref1q+ref_strideq*2]
+  lea                ref2q, [ref2q+ref_strideq*2]
+  lea                ref3q, [ref3q+ref_strideq*2]
+  lea                ref4q, [ref4q+ref_strideq*2]
+%endif
+  psadbw                m1, m0
+  paddd                 m7, m1
+%endmacro
+
+; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end -- two 32-px rows, each handled as two 16-px halves
+%macro PROCESS_32x2x4 5-6 0
+  PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16  ; first row: left and right 16-px halves, no pointer advance
+  PROCESS_16x2x4  0, %4, %5, %4 + 16, %5 + 16, %6  ; second row: always accumulates, forwards advance_at_end
+%endmacro
+
+; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end -- two 64-px rows, each handled as two 32-px halves
+%macro PROCESS_64x2x4 5-6 0
+  PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32  ; first row: left and right 32-px halves, no pointer advance
+  PROCESS_32x2x4  0, %4, %5, %4 + 32, %5 + 32, %6  ; second row: always accumulates, forwards advance_at_end
+%endmacro
+
+; void vp9_sadNxNx4d_sse2(uint8_t *src,    int src_stride,
+;                         uint8_t *ref[4], int ref_stride,
+;                         unsigned int res[4]);
+; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8 (SSE2), or 4x4 (built with INIT_MMX sse below); res[i] = SAD(src, ref[i])
+%macro SADNXN4D 2
+%if UNIX64
+cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+                              res, ref2, ref3, ref4
+%else
+cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+                              ref2, ref3, ref4
+%endif
+  movsxdifnidn src_strideq, src_strided
+  movsxdifnidn ref_strideq, ref_strided
+  mov                ref2q, [ref1q+gprsize*1]  ; ref1q arrives holding the ref[4] pointer array; unpack its entries
+  mov                ref3q, [ref1q+gprsize*2]
+  mov                ref4q, [ref1q+gprsize*3]
+  mov                ref1q, [ref1q+gprsize*0]  ; loaded last because it overwrites the array base
+
+  PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1  ; rows 0-1: initialise the accumulators
+%rep (%2-4)/2
+  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1  ; middle row pairs: accumulate and advance
+%endrep
+  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0  ; last row pair: no pointer advance needed
+
+%if mmsize == 16
+  pslldq                m5, 4  ; m4..m7 hold ref1..ref4 sums as two partials (dwords 0 and 2); shift ref2's to dwords 1 and 3
+  pslldq                m7, 4  ; same for ref4
+  por                   m4, m5  ; m4 = {ref1 lo, ref2 lo, ref1 hi, ref2 hi}
+  por                   m6, m7  ; m6 = {ref3 lo, ref4 lo, ref3 hi, ref4 hi}
+  mova                  m5, m4
+  mova                  m7, m6
+  punpcklqdq            m4, m6  ; m4 = low partials of ref1..ref4
+  punpckhqdq            m5, m7  ; m5 = high partials of ref1..ref4
+  movifnidn             r4, r4mp  ; fetch the res pointer; r4 is only named res on UNIX64, elsewhere it was used as ref2
+  paddd                 m4, m5  ; four final SADs, one per dword
+  movu                [r4], m4  ; res[0..3]
+  RET
+%else
+  movifnidn             r4, r4mp  ; fetch the res pointer (see the note in the 16-byte path)
+  movq               [r4+0], m6  ; res[0], res[1] (m6 = {ref1, ref2} from PROCESS_4x2x4)
+  movq               [r4+8], m7  ; res[2], res[3] (m7 = {ref3, ref4})
+  RET
+%endif
+%endmacro
+
+INIT_XMM sse2
+SADNXN4D 64, 64
+SADNXN4D 32, 32
+SADNXN4D 16, 16
+SADNXN4D 16,  8
+SADNXN4D  8, 16
+SADNXN4D  8,  8
+
+INIT_MMX sse
+SADNXN4D  4,  4
diff --git a/vp9/encoder/x86/vp9_sad_sse2.asm b/vp9/encoder/x86/vp9_sad_sse2.asm
index 33271635c..ea482e071 100644
--- a/vp9/encoder/x86/vp9_sad_sse2.asm
+++ b/vp9/encoder/x86/vp9_sad_sse2.asm
@@ -8,403 +8,175 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;unsigned int vp9_sad16x16_wmt(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-global sym(vp9_sad16x16_wmt) PRIVATE
-sym(vp9_sad16x16_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    SAVE_XMM 6
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rax*8]
-
-        lea             rcx,        [rcx+rax*8]
-        pxor            xmm6,       xmm6
-
-.x16x16sad_wmt_loop:
-
-        movq            xmm0,       QWORD PTR [rsi]
-        movq            xmm2,       QWORD PTR [rsi+8]
-
-        movq            xmm1,       QWORD PTR [rdi]
-        movq            xmm3,       QWORD PTR [rdi+8]
-
-        movq            xmm4,       QWORD PTR [rsi+rax]
-        movq            xmm5,       QWORD PTR [rdi+rdx]
-
-
-        punpcklbw       xmm0,       xmm2
-        punpcklbw       xmm1,       xmm3
-
-        psadbw          xmm0,       xmm1
-        movq            xmm2,       QWORD PTR [rsi+rax+8]
-
-        movq            xmm3,       QWORD PTR [rdi+rdx+8]
-        lea             rsi,        [rsi+rax*2]
-
-        lea             rdi,        [rdi+rdx*2]
-        punpcklbw       xmm4,       xmm2
-
-        punpcklbw       xmm5,       xmm3
-        psadbw          xmm4,       xmm5
-
-        paddw           xmm6,       xmm0
-        paddw           xmm6,       xmm4
-
-        cmp             rsi,        rcx
-        jne             .x16x16sad_wmt_loop
-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movq            rax,        xmm0
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;unsigned int vp9_sad8x16_wmt(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  max_err)
-global sym(vp9_sad8x16_wmt) PRIVATE
-sym(vp9_sad8x16_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rbx,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rbx*8]
-
-        lea             rcx,        [rcx+rbx*8]
-        pxor            mm7,        mm7
-
-.x8x16sad_wmt_loop:
-
-        movq            rax,        mm7
-        cmp             eax,        arg(4)
-        jg              .x8x16sad_wmt_early_exit
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm1,        QWORD PTR [rdi]
-
-        movq            mm2,        QWORD PTR [rsi+rbx]
-        movq            mm3,        QWORD PTR [rdi+rdx]
-
-        psadbw          mm0,        mm1
-        psadbw          mm2,        mm3
-
-        lea             rsi,        [rsi+rbx*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        paddw           mm7,        mm0
-        paddw           mm7,        mm2
-
-        cmp             rsi,        rcx
-        jne             .x8x16sad_wmt_loop
-
-        movq            rax,        mm7
-
-.x8x16sad_wmt_early_exit:
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    pop         rbx
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp9_sad8x8_wmt(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-global sym(vp9_sad8x8_wmt) PRIVATE
-sym(vp9_sad8x8_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rbx,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rbx*8]
-        pxor            mm7,        mm7
-
-.x8x8sad_wmt_loop:
-
-        movq            rax,        mm7
-        cmp             eax,        arg(4)
-        jg              .x8x8sad_wmt_early_exit
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm1,        QWORD PTR [rdi]
-
-        psadbw          mm0,        mm1
-        lea             rsi,        [rsi+rbx]
-
-        add             rdi,        rdx
-        paddw           mm7,        mm0
-
-        cmp             rsi,        rcx
-        jne             .x8x8sad_wmt_loop
-
-        movq            rax,        mm7
-.x8x8sad_wmt_early_exit:
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    pop         rbx
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;unsigned int vp9_sad4x4_wmt(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-global sym(vp9_sad4x4_wmt) PRIVATE
-sym(vp9_sad4x4_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        movd            mm0,        DWORD PTR [rsi]
-        movd            mm1,        DWORD PTR [rdi]
-
-        movd            mm2,        DWORD PTR [rsi+rax]
-        movd            mm3,        DWORD PTR [rdi+rdx]
-
-        punpcklbw       mm0,        mm2
-        punpcklbw       mm1,        mm3
-
-        psadbw          mm0,        mm1
-        lea             rsi,        [rsi+rax*2]
-
-        lea             rdi,        [rdi+rdx*2]
-        movd            mm4,        DWORD PTR [rsi]
-
-        movd            mm5,        DWORD PTR [rdi]
-        movd            mm6,        DWORD PTR [rsi+rax]
-
-        movd            mm7,        DWORD PTR [rdi+rdx]
-        punpcklbw       mm4,        mm6
-
-        punpcklbw       mm5,        mm7
-        psadbw          mm4,        mm5
-
-        paddw           mm0,        mm4
-        movq            rax,        mm0
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp9_sad16x8_wmt(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-global sym(vp9_sad16x8_wmt) PRIVATE
-sym(vp9_sad16x8_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rbx,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rbx*8]
-        pxor            mm7,        mm7
-
-.x16x8sad_wmt_loop:
-
-        movq            rax,        mm7
-        cmp             eax,        arg(4)
-        jg              .x16x8sad_wmt_early_exit
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm2,        QWORD PTR [rsi+8]
-
-        movq            mm1,        QWORD PTR [rdi]
-        movq            mm3,        QWORD PTR [rdi+8]
-
-        movq            mm4,        QWORD PTR [rsi+rbx]
-        movq            mm5,        QWORD PTR [rdi+rdx]
-
-        psadbw          mm0,        mm1
-        psadbw          mm2,        mm3
-
-        movq            mm1,        QWORD PTR [rsi+rbx+8]
-        movq            mm3,        QWORD PTR [rdi+rdx+8]
-
-        psadbw          mm4,        mm5
-        psadbw          mm1,        mm3
-
-        lea             rsi,        [rsi+rbx*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        paddw           mm0,        mm2
-        paddw           mm4,        mm1
-
-        paddw           mm7,        mm0
-        paddw           mm7,        mm4
-
-        cmp             rsi,        rcx
-        jne             .x16x8sad_wmt_loop
-
-        movq            rax,        mm7
-
-.x16x8sad_wmt_early_exit:
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    pop         rbx
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_copy32xn_sse2(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *dst_ptr,
-;    int  dst_stride,
-;    int height);
-global sym(vp9_copy32xn_sse2) PRIVATE
-sym(vp9_copy32xn_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;dst_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;dst_stride
-        movsxd          rcx,        dword ptr arg(4) ;height
-
-.block_copy_sse2_loopx4:
-        movdqu          xmm0,       XMMWORD PTR [rsi]
-        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
-        movdqu          xmm2,       XMMWORD PTR [rsi + rax]
-        movdqu          xmm3,       XMMWORD PTR [rsi + rax + 16]
-
-        lea             rsi,        [rsi+rax*2]
-
-        movdqu          xmm4,       XMMWORD PTR [rsi]
-        movdqu          xmm5,       XMMWORD PTR [rsi + 16]
-        movdqu          xmm6,       XMMWORD PTR [rsi + rax]
-        movdqu          xmm7,       XMMWORD PTR [rsi + rax + 16]
-
-        lea             rsi,    [rsi+rax*2]
-
-        movdqa          XMMWORD PTR [rdi], xmm0
-        movdqa          XMMWORD PTR [rdi + 16], xmm1
-        movdqa          XMMWORD PTR [rdi + rdx], xmm2
-        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm3
-
-        lea             rdi,    [rdi+rdx*2]
-
-        movdqa          XMMWORD PTR [rdi], xmm4
-        movdqa          XMMWORD PTR [rdi + 16], xmm5
-        movdqa          XMMWORD PTR [rdi + rdx], xmm6
-        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm7
-
-        lea             rdi,    [rdi+rdx*2]
-
-        sub             rcx,     4
-        cmp             rcx,     4
-        jge             .block_copy_sse2_loopx4
-
-        cmp             rcx, 0
-        je              .copy_is_done
-
-.block_copy_sse2_loop:
-        movdqu          xmm0,       XMMWORD PTR [rsi]
-        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
-        lea             rsi,    [rsi+rax]
-
-        movdqa          XMMWORD PTR [rdi], xmm0
-        movdqa          XMMWORD PTR [rdi + 16], xmm1
-        lea             rdi,    [rdi+rdx]
-
-        sub             rcx,     1
-        jne             .block_copy_sse2_loop
-
-.copy_is_done:
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride,
+;                                uint8_t *ref, int ref_stride);  -- returns the 64x64 sum of absolute differences
+INIT_XMM sse2
+cglobal sad64x64, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
+  movsxdifnidn src_strideq, src_strided
+  movsxdifnidn ref_strideq, ref_strided
+  mov              n_rowsd, 64  ; one 64-px row per iteration
+  pxor                  m0, m0  ; m0 = running total, kept as two partial sums (one per 8-byte half)
+.loop:
+  movu                  m1, [refq]  ; ref row as four 16-byte pieces, unaligned loads
+  movu                  m2, [refq+16]
+  movu                  m3, [refq+32]
+  movu                  m4, [refq+48]
+  psadbw                m1, [srcq]  ; memory operand: src is presumably 16-byte aligned -- TODO confirm against callers
+  psadbw                m2, [srcq+16]
+  psadbw                m3, [srcq+32]
+  psadbw                m4, [srcq+48]
+  paddd                 m1, m2
+  paddd                 m3, m4
+  add                 refq, ref_strideq  ; next ref row (interleaved with the adds)
+  paddd                 m0, m1
+  add                 srcq, src_strideq  ; next src row
+  paddd                 m0, m3
+  dec              n_rowsd
+  jg .loop
+
+  movhlps               m1, m0  ; bring the high partial sum down
+  paddd                 m0, m1  ; low dword = total SAD
+  movd                 eax, m0  ; return value
+  RET
+
+; unsigned int vp9_sad32x32_sse2(uint8_t *src, int src_stride,
+;                                uint8_t *ref, int ref_stride);  -- returns the 32x32 sum of absolute differences
+INIT_XMM sse2
+cglobal sad32x32, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
+  movsxdifnidn src_strideq, src_strided
+  movsxdifnidn ref_strideq, ref_strided
+  mov              n_rowsd, 16  ; 16 iterations x 2 rows each = 32 rows
+  pxor                  m0, m0  ; m0 = running total, kept as two partial sums (one per 8-byte half)
+
+.loop:
+  movu                  m1, [refq]  ; row 0, left 16 px
+  movu                  m2, [refq+16]  ; row 0, right 16 px
+  movu                  m3, [refq+ref_strideq]  ; row 1, left 16 px
+  movu                  m4, [refq+ref_strideq+16]  ; row 1, right 16 px
+  psadbw                m1, [srcq]  ; memory operand: src is presumably 16-byte aligned -- TODO confirm against callers
+  psadbw                m2, [srcq+16]
+  psadbw                m3, [srcq+src_strideq]
+  psadbw                m4, [srcq+src_strideq+16]
+  paddd                 m1, m2
+  paddd                 m3, m4
+  lea                 refq, [refq+ref_strideq*2]  ; step two rows down
+  paddd                 m0, m1
+  lea                 srcq, [srcq+src_strideq*2]
+  paddd                 m0, m3
+  dec              n_rowsd
+  jg .loop
+
+  movhlps               m1, m0  ; bring the high partial sum down
+  paddd                 m0, m1  ; low dword = total SAD
+  movd                 eax, m0  ; return value
+  RET
+
+; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
+;                                    uint8_t *ref, int ref_stride);  -- macro argument is the block height
+%macro SAD16XN 1
+cglobal sad16x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
+                           src_stride3, ref_stride3, n_rows
+  movsxdifnidn src_strideq, src_strided
+  movsxdifnidn ref_strideq, ref_strided
+  lea         src_stride3q, [src_strideq*3]  ; 3*stride, used to address the fourth row of each group
+  lea         ref_stride3q, [ref_strideq*3]
+  mov              n_rowsd, %1/4  ; four rows are handled per iteration
+  pxor                  m0, m0  ; m0 = running total, kept as two partial sums (one per 8-byte half)
+
+.loop:
+  movu                  m1, [refq]  ; ref rows 0..3 of this group, unaligned loads
+  movu                  m2, [refq+ref_strideq]
+  movu                  m3, [refq+ref_strideq*2]
+  movu                  m4, [refq+ref_stride3q]
+  psadbw                m1, [srcq]  ; memory operand: src is presumably 16-byte aligned -- TODO confirm against callers
+  psadbw                m2, [srcq+src_strideq]
+  psadbw                m3, [srcq+src_strideq*2]
+  psadbw                m4, [srcq+src_stride3q]
+  paddd                 m1, m2
+  paddd                 m3, m4
+  lea                 refq, [refq+ref_strideq*4]  ; step four rows down
+  paddd                 m0, m1
+  lea                 srcq, [srcq+src_strideq*4]
+  paddd                 m0, m3
+  dec              n_rowsd
+  jg .loop
+
+  movhlps               m1, m0  ; bring the high partial sum down
+  paddd                 m0, m1  ; low dword = total SAD
+  movd                 eax, m0  ; return value
+  RET
+%endmacro
+
+INIT_XMM sse2
+SAD16XN 16 ; sad16x16_sse2
+SAD16XN  8 ; sad16x8_sse2
+
+; unsigned int vp9_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
+;                                   uint8_t *ref, int ref_stride);  -- macro argument is the block height
+%macro SAD8XN 1
+cglobal sad8x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
+                          src_stride3, ref_stride3, n_rows
+  movsxdifnidn src_strideq, src_strided
+  movsxdifnidn ref_strideq, ref_strided
+  lea         src_stride3q, [src_strideq*3]  ; 3*stride, used to address the fourth row of each group
+  lea         ref_stride3q, [ref_strideq*3]
+  mov              n_rowsd, %1/4  ; four rows are handled per iteration
+  pxor                  m0, m0  ; m0 = running total, kept as two partial sums (one per 8-byte half)
+
+.loop:
+  movh                  m1, [refq]  ; m1 = ref rows 0 (low half) and 1 (high half)
+  movhps                m1, [refq+ref_strideq]
+  movh                  m2, [refq+ref_strideq*2]  ; m2 = ref rows 2 and 3
+  movhps                m2, [refq+ref_stride3q]
+  movh                  m3, [srcq]  ; m3 = src rows 0 and 1, packed the same way
+  movhps                m3, [srcq+src_strideq]
+  movh                  m4, [srcq+src_strideq*2]  ; m4 = src rows 2 and 3
+  movhps                m4, [srcq+src_stride3q]
+  psadbw                m1, m3  ; register operands only: 8-byte loads above, so no 16-byte memory operand is involved
+  psadbw                m2, m4
+  lea                 refq, [refq+ref_strideq*4]  ; step four rows down
+  paddd                 m0, m1
+  lea                 srcq, [srcq+src_strideq*4]
+  paddd                 m0, m2
+  dec              n_rowsd
+  jg .loop
+
+  movhlps               m1, m0  ; bring the high partial sum down
+  paddd                 m0, m1  ; low dword = total SAD
+  movd                 eax, m0  ; return value
+  RET
+%endmacro
+
+INIT_XMM sse2
+SAD8XN 16 ; sad8x16_sse2
+SAD8XN  8 ; sad8x8_sse2
+
+; unsigned int vp9_sad4x4_sse(uint8_t *src, int src_stride,
+;                             uint8_t *ref, int ref_stride);  -- returns the 4x4 sum of absolute differences (MMX registers)
+INIT_MMX sse
+cglobal sad4x4, 4, 4, 8, src, src_stride, ref, ref_stride
+  movsxdifnidn src_strideq, src_strided
+  movsxdifnidn ref_strideq, ref_strided
+  movd                  m0, [refq]  ; ref rows 0 and 1, 4 px each
+  movd                  m1, [refq+ref_strideq]
+  movd                  m2, [srcq]  ; src rows 0 and 1
+  movd                  m3, [srcq+src_strideq]
+  lea                 refq, [refq+ref_strideq*2]  ; move on to rows 2 and 3
+  lea                 srcq, [srcq+src_strideq*2]
+  movd                  m4, [refq]  ; ref rows 2 and 3
+  movd                  m5, [refq+ref_strideq]
+  movd                  m6, [srcq]  ; src rows 2 and 3
+  movd                  m7, [srcq+src_strideq]
+  punpckldq             m0, m1  ; pack two 4-px rows into each 8-byte register
+  punpckldq             m2, m3
+  punpckldq             m4, m5
+  punpckldq             m6, m7
+  psadbw                m0, m2  ; SAD of rows 0-1
+  psadbw                m4, m6  ; SAD of rows 2-3
+  paddd                 m0, m4  ; total over the 4x4 block
+  movd                 eax, m0  ; return value
+  RET
diff --git a/vp9/encoder/x86/vp9_sad_sse3.asm b/vp9/encoder/x86/vp9_sad_sse3.asm
index 1c39a08f8..2b90a5d54 100644
--- a/vp9/encoder/x86/vp9_sad_sse3.asm
+++ b/vp9/encoder/x86/vp9_sad_sse3.asm
@@ -83,87 +83,6 @@
     ret
 %endmacro
 
-%macro STACK_FRAME_CREATE_X4 0
-%if ABI_IS_32BIT
-  %define     src_ptr       rsi
-  %define     src_stride    rax
-  %define     r0_ptr        rcx
-  %define     r1_ptr        rdx
-  %define     r2_ptr        rbx
-  %define     r3_ptr        rdi
-  %define     ref_stride    rbp
-  %define     result_ptr    arg(4)
-    push        rbp
-    mov         rbp,        rsp
-    push        rsi
-    push        rdi
-    push        rbx
-
-    push        rbp
-    mov         rdi,        arg(2)              ; ref_ptr_base
-
-    LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
-
-    mov         rsi,        arg(0)              ; src_ptr
-
-    movsxd      rbx,        dword ptr arg(1)    ; src_stride
-    movsxd      rbp,        dword ptr arg(3)    ; ref_stride
-
-    xchg        rbx,        rax
-%else
-  %if LIBVPX_YASM_WIN64
-    SAVE_XMM 7, u
-    %define     src_ptr     rcx
-    %define     src_stride  rdx
-    %define     r0_ptr      rsi
-    %define     r1_ptr      r10
-    %define     r2_ptr      r11
-    %define     r3_ptr      r8
-    %define     ref_stride  r9
-    %define     result_ptr  [rsp+xmm_stack_space+16+4*8]
-    push        rsi
-
-    LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
-  %else
-    %define     src_ptr     rdi
-    %define     src_stride  rsi
-    %define     r0_ptr      r9
-    %define     r1_ptr      r10
-    %define     r2_ptr      r11
-    %define     r3_ptr      rdx
-    %define     ref_stride  rcx
-    %define     result_ptr  r8
-
-    LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr
-
-  %endif
-%endif
-%endmacro
-
-%macro STACK_FRAME_DESTROY_X4 0
-  %define     src_ptr
-  %define     src_stride
-  %define     r0_ptr
-  %define     r1_ptr
-  %define     r2_ptr
-  %define     r3_ptr
-  %define     ref_stride
-  %define     result_ptr
-
-%if ABI_IS_32BIT
-    pop         rbx
-    pop         rdi
-    pop         rsi
-    pop         rbp
-%else
-  %if LIBVPX_YASM_WIN64
-    pop         rsi
-    RESTORE_XMM
-  %endif
-%endif
-    ret
-%endmacro
-
 %macro PROCESS_16X2X3 5
 %if %1==0
         movdqa          xmm0,       XMMWORD PTR [%2]
@@ -250,130 +169,6 @@
         paddw           mm7,       mm3
 %endmacro
 
-%macro LOAD_X4_ADDRESSES 5
-        mov             %2,         [%1+REG_SZ_BYTES*0]
-        mov             %3,         [%1+REG_SZ_BYTES*1]
-
-        mov             %4,         [%1+REG_SZ_BYTES*2]
-        mov             %5,         [%1+REG_SZ_BYTES*3]
-%endmacro
-
-%macro PROCESS_16X2X4 8
-%if %1==0
-        movdqa          xmm0,       XMMWORD PTR [%2]
-        lddqu           xmm4,       XMMWORD PTR [%3]
-        lddqu           xmm5,       XMMWORD PTR [%4]
-        lddqu           xmm6,       XMMWORD PTR [%5]
-        lddqu           xmm7,       XMMWORD PTR [%6]
-
-        psadbw          xmm4,       xmm0
-        psadbw          xmm5,       xmm0
-        psadbw          xmm6,       xmm0
-        psadbw          xmm7,       xmm0
-%else
-        movdqa          xmm0,       XMMWORD PTR [%2]
-        lddqu           xmm1,       XMMWORD PTR [%3]
-        lddqu           xmm2,       XMMWORD PTR [%4]
-        lddqu           xmm3,       XMMWORD PTR [%5]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm4,       xmm1
-        lddqu           xmm1,       XMMWORD PTR [%6]
-        paddw           xmm5,       xmm2
-        paddw           xmm6,       xmm3
-
-        psadbw          xmm1,       xmm0
-        paddw           xmm7,       xmm1
-%endif
-        movdqa          xmm0,       XMMWORD PTR [%2+%7]
-        lddqu           xmm1,       XMMWORD PTR [%3+%8]
-        lddqu           xmm2,       XMMWORD PTR [%4+%8]
-        lddqu           xmm3,       XMMWORD PTR [%5+%8]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm4,       xmm1
-        lddqu           xmm1,       XMMWORD PTR [%6+%8]
-        paddw           xmm5,       xmm2
-        paddw           xmm6,       xmm3
-
-%if %1==0 || %1==1
-        lea             %2,         [%2+%7*2]
-        lea             %3,         [%3+%8*2]
-
-        lea             %4,         [%4+%8*2]
-        lea             %5,         [%5+%8*2]
-
-        lea             %6,         [%6+%8*2]
-%endif
-        psadbw          xmm1,       xmm0
-        paddw           xmm7,       xmm1
-
-%endmacro
-
-%macro PROCESS_8X2X4 8
-%if %1==0
-        movq            mm0,        QWORD PTR [%2]
-        movq            mm4,        QWORD PTR [%3]
-        movq            mm5,        QWORD PTR [%4]
-        movq            mm6,        QWORD PTR [%5]
-        movq            mm7,        QWORD PTR [%6]
-
-        psadbw          mm4,        mm0
-        psadbw          mm5,        mm0
-        psadbw          mm6,        mm0
-        psadbw          mm7,        mm0
-%else
-        movq            mm0,        QWORD PTR [%2]
-        movq            mm1,        QWORD PTR [%3]
-        movq            mm2,        QWORD PTR [%4]
-        movq            mm3,        QWORD PTR [%5]
-
-        psadbw          mm1,        mm0
-        psadbw          mm2,        mm0
-        psadbw          mm3,        mm0
-
-        paddw           mm4,        mm1
-        movq            mm1,        QWORD PTR [%6]
-        paddw           mm5,        mm2
-        paddw           mm6,        mm3
-
-        psadbw          mm1,        mm0
-        paddw           mm7,        mm1
-%endif
-        movq            mm0,        QWORD PTR [%2+%7]
-        movq            mm1,        QWORD PTR [%3+%8]
-        movq            mm2,        QWORD PTR [%4+%8]
-        movq            mm3,        QWORD PTR [%5+%8]
-
-        psadbw          mm1,        mm0
-        psadbw          mm2,        mm0
-        psadbw          mm3,        mm0
-
-        paddw           mm4,        mm1
-        movq            mm1,        QWORD PTR [%6+%8]
-        paddw           mm5,        mm2
-        paddw           mm6,        mm3
-
-%if %1==0 || %1==1
-        lea             %2,         [%2+%7*2]
-        lea             %3,         [%3+%8*2]
-
-        lea             %4,         [%4+%8*2]
-        lea             %5,         [%5+%8*2]
-
-        lea             %6,         [%6+%8*2]
-%endif
-        psadbw          mm1,        mm0
-        paddw           mm7,        mm1
-
-%endmacro
-
 ;void int vp9_sad16x16x3_sse3(
 ;    unsigned char *src_ptr,
 ;    int  src_stride,
@@ -581,380 +376,3 @@ sym(vp9_sad4x4x3_sse3):
         movd            [rcx+8],    mm7
 
     STACK_FRAME_DESTROY_X3
-
-;unsigned int vp9_sad16x16_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  max_err)
-;%define lddqu movdqu
-global sym(vp9_sad16x16_sse3) PRIVATE
-sym(vp9_sad16x16_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        mov             end_ptr,    4
-        pxor            xmm7,        xmm7
-
-.vp9_sad16x16_sse3_loop:
-        movdqa          xmm0,       XMMWORD PTR [src_ptr]
-        movdqu          xmm1,       XMMWORD PTR [ref_ptr]
-        movdqa          xmm2,       XMMWORD PTR [src_ptr+src_stride]
-        movdqu          xmm3,       XMMWORD PTR [ref_ptr+ref_stride]
-
-        lea             src_ptr,    [src_ptr+src_stride*2]
-        lea             ref_ptr,    [ref_ptr+ref_stride*2]
-
-        movdqa          xmm4,       XMMWORD PTR [src_ptr]
-        movdqu          xmm5,       XMMWORD PTR [ref_ptr]
-        movdqa          xmm6,       XMMWORD PTR [src_ptr+src_stride]
-
-        psadbw          xmm0,       xmm1
-
-        movdqu          xmm1,       XMMWORD PTR [ref_ptr+ref_stride]
-
-        psadbw          xmm2,       xmm3
-        psadbw          xmm4,       xmm5
-        psadbw          xmm6,       xmm1
-
-        lea             src_ptr,    [src_ptr+src_stride*2]
-        lea             ref_ptr,    [ref_ptr+ref_stride*2]
-
-        paddw           xmm7,        xmm0
-        paddw           xmm7,        xmm2
-        paddw           xmm7,        xmm4
-        paddw           xmm7,        xmm6
-
-        sub             end_ptr,     1
-        jne             .vp9_sad16x16_sse3_loop
-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-        paddw           xmm0,       xmm7
-        movq            rax,        xmm0
-
-    STACK_FRAME_DESTROY_X3
-
-;void vp9_copy32xn_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *dst_ptr,
-;    int  dst_stride,
-;    int height);
-global sym(vp9_copy32xn_sse3) PRIVATE
-sym(vp9_copy32xn_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-.block_copy_sse3_loopx4:
-        lea             end_ptr,    [src_ptr+src_stride*2]
-
-        movdqu          xmm0,       XMMWORD PTR [src_ptr]
-        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
-        movdqu          xmm2,       XMMWORD PTR [src_ptr + src_stride]
-        movdqu          xmm3,       XMMWORD PTR [src_ptr + src_stride + 16]
-        movdqu          xmm4,       XMMWORD PTR [end_ptr]
-        movdqu          xmm5,       XMMWORD PTR [end_ptr + 16]
-        movdqu          xmm6,       XMMWORD PTR [end_ptr + src_stride]
-        movdqu          xmm7,       XMMWORD PTR [end_ptr + src_stride + 16]
-
-        lea             src_ptr,    [src_ptr+src_stride*4]
-
-        lea             end_ptr,    [ref_ptr+ref_stride*2]
-
-        movdqa          XMMWORD PTR [ref_ptr], xmm0
-        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
-        movdqa          XMMWORD PTR [ref_ptr + ref_stride], xmm2
-        movdqa          XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
-        movdqa          XMMWORD PTR [end_ptr], xmm4
-        movdqa          XMMWORD PTR [end_ptr + 16], xmm5
-        movdqa          XMMWORD PTR [end_ptr + ref_stride], xmm6
-        movdqa          XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
-
-        lea             ref_ptr,    [ref_ptr+ref_stride*4]
-
-        sub             height,     4
-        cmp             height,     4
-        jge             .block_copy_sse3_loopx4
-
-        ;Check to see if there is more rows need to be copied.
-        cmp             height, 0
-        je              .copy_is_done
-
-.block_copy_sse3_loop:
-        movdqu          xmm0,       XMMWORD PTR [src_ptr]
-        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
-        lea             src_ptr,    [src_ptr+src_stride]
-
-        movdqa          XMMWORD PTR [ref_ptr], xmm0
-        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
-        lea             ref_ptr,    [ref_ptr+ref_stride]
-
-        sub             height,     1
-        jne             .block_copy_sse3_loop
-
-.copy_is_done:
-    STACK_FRAME_DESTROY_X3
-
-;void vp9_sad16x16x4d_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr_base,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp9_sad16x16x4d_sse3) PRIVATE
-sym(vp9_sad16x16x4d_sse3):
-
-    STACK_FRAME_CREATE_X4
-
-        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
-%if ABI_IS_32BIT
-        pop             rbp
-%endif
-        mov             rcx,        result_ptr
-
-        movq            xmm0,       xmm4
-        psrldq          xmm4,       8
-
-        paddw           xmm0,       xmm4
-        movd            [rcx],      xmm0
-;-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rcx+4],    xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rcx+8],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rcx+12],   xmm0
-
-    STACK_FRAME_DESTROY_X4
-
-;void vp9_sad16x8x4d_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr_base,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp9_sad16x8x4d_sse3) PRIVATE
-sym(vp9_sad16x8x4d_sse3):
-
-    STACK_FRAME_CREATE_X4
-
-        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
-%if ABI_IS_32BIT
-        pop             rbp
-%endif
-        mov             rcx,        result_ptr
-
-        movq            xmm0,       xmm4
-        psrldq          xmm4,       8
-
-        paddw           xmm0,       xmm4
-        movd            [rcx],      xmm0
-;-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rcx+4],    xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rcx+8],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rcx+12],   xmm0
-
-    STACK_FRAME_DESTROY_X4
-
-;void int vp9_sad8x16x4d_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp9_sad8x16x4d_sse3) PRIVATE
-sym(vp9_sad8x16x4d_sse3):
-
-    STACK_FRAME_CREATE_X4
-
-        PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
-%if ABI_IS_32BIT
-        pop             rbp
-%endif
-        mov             rcx,        result_ptr
-
-        punpckldq       mm4,        mm5
-        punpckldq       mm6,        mm7
-
-        movq            [rcx],      mm4
-        movq            [rcx+8],    mm6
-
-    STACK_FRAME_DESTROY_X4
-
-;void int vp9_sad8x8x4d_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp9_sad8x8x4d_sse3) PRIVATE
-sym(vp9_sad8x8x4d_sse3):
-
-    STACK_FRAME_CREATE_X4
-
-        PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
-%if ABI_IS_32BIT
-        pop             rbp
-%endif
-        mov             rcx,        result_ptr
-
-        punpckldq       mm4,        mm5
-        punpckldq       mm6,        mm7
-
-        movq            [rcx],      mm4
-        movq            [rcx+8],    mm6
-
-    STACK_FRAME_DESTROY_X4
-
-;void int vp9_sad4x4x4d_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp9_sad4x4x4d_sse3) PRIVATE
-sym(vp9_sad4x4x4d_sse3):
-
-    STACK_FRAME_CREATE_X4
-
-        movd            mm0,        DWORD PTR [src_ptr]
-        movd            mm1,        DWORD PTR [r0_ptr]
-
-        movd            mm2,        DWORD PTR [src_ptr+src_stride]
-        movd            mm3,        DWORD PTR [r0_ptr+ref_stride]
-
-        punpcklbw       mm0,        mm2
-        punpcklbw       mm1,        mm3
-
-        movd            mm4,        DWORD PTR [r1_ptr]
-        movd            mm5,        DWORD PTR [r2_ptr]
-
-        movd            mm6,        DWORD PTR [r3_ptr]
-        movd            mm2,        DWORD PTR [r1_ptr+ref_stride]
-
-        movd            mm3,        DWORD PTR [r2_ptr+ref_stride]
-        movd            mm7,        DWORD PTR [r3_ptr+ref_stride]
-
-        psadbw          mm1,        mm0
-
-        punpcklbw       mm4,        mm2
-        punpcklbw       mm5,        mm3
-
-        punpcklbw       mm6,        mm7
-        psadbw          mm4,        mm0
-
-        psadbw          mm5,        mm0
-        psadbw          mm6,        mm0
-
-
-
-        lea             src_ptr,    [src_ptr+src_stride*2]
-        lea             r0_ptr,     [r0_ptr+ref_stride*2]
-
-        lea             r1_ptr,     [r1_ptr+ref_stride*2]
-        lea             r2_ptr,     [r2_ptr+ref_stride*2]
-
-        lea             r3_ptr,     [r3_ptr+ref_stride*2]
-
-        movd            mm0,        DWORD PTR [src_ptr]
-        movd            mm2,        DWORD PTR [r0_ptr]
-
-        movd            mm3,        DWORD PTR [src_ptr+src_stride]
-        movd            mm7,        DWORD PTR [r0_ptr+ref_stride]
-
-        punpcklbw       mm0,        mm3
-        punpcklbw       mm2,        mm7
-
-        movd            mm3,        DWORD PTR [r1_ptr]
-        movd            mm7,        DWORD PTR [r2_ptr]
-
-        psadbw          mm2,        mm0
-%if ABI_IS_32BIT
-        mov             rax,        rbp
-
-        pop             rbp
-%define     ref_stride    rax
-%endif
-        mov             rsi,        result_ptr
-
-        paddw           mm1,        mm2
-        movd            [rsi],      mm1
-
-        movd            mm2,        DWORD PTR [r1_ptr+ref_stride]
-        movd            mm1,        DWORD PTR [r2_ptr+ref_stride]
-
-        punpcklbw       mm3,        mm2
-        punpcklbw       mm7,        mm1
-
-        psadbw          mm3,        mm0
-        psadbw          mm7,        mm0
-
-        movd            mm2,        DWORD PTR [r3_ptr]
-        movd            mm1,        DWORD PTR [r3_ptr+ref_stride]
-
-        paddw           mm3,        mm4
-        paddw           mm7,        mm5
-
-        movd            [rsi+4],    mm3
-        punpcklbw       mm2,        mm1
-
-        movd            [rsi+8],    mm7
-        psadbw          mm2,        mm0
-
-        paddw           mm2,        mm6
-        movd            [rsi+12],   mm2
-
-
-    STACK_FRAME_DESTROY_X4
-
diff --git a/vp9/encoder/x86/vp9_sad_sse4.asm b/vp9/encoder/x86/vp9_sad_sse4.asm
index b42982a1f..faf1768a9 100644
--- a/vp9/encoder/x86/vp9_sad_sse4.asm
+++ b/vp9/encoder/x86/vp9_sad_sse4.asm
@@ -154,6 +154,16 @@
         paddw           xmm1,       xmm5
 %endmacro
 
+%macro WRITE_AS_INTS 0  ; xmm1 (8 words) -> 8 dwords at arg(4); overwrites rdi, xmm0-xmm2
+    mov             rdi,        arg(4)           ;Results
+    pxor            xmm0, xmm0                   ;all-zero register used for widening
+    movdqa          xmm2, xmm1                   ;keep a copy for the upper four words
+    punpcklwd       xmm1, xmm0                   ;words 0-3 zero-extended to dwords
+    punpckhwd       xmm2, xmm0                   ;words 4-7 zero-extended to dwords
+    ; Two aligned 16-byte stores (32 bytes total); movdqa requires [rdi] 16-byte aligned.
+    movdqa          [rdi],    xmm1
+    movdqa          [rdi + 16],    xmm2
+%endmacro
 
 ;void vp9_sad16x16x8_sse4(
 ;    const unsigned char *src_ptr,
@@ -170,23 +180,22 @@ sym(vp9_sad16x16x8_sse4):
     push        rdi
     ; end prolog
 
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
+    mov             rsi,        arg(0)           ;src_ptr
+    mov             rdi,        arg(2)           ;ref_ptr
 
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+    movsxd          rax,        dword ptr arg(1) ;src_stride
+    movsxd          rdx,        dword ptr arg(3) ;ref_stride
 
-        PROCESS_16X2X8 1
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
+    PROCESS_16X2X8 1
+    PROCESS_16X2X8 0
+    PROCESS_16X2X8 0
+    PROCESS_16X2X8 0
+    PROCESS_16X2X8 0
+    PROCESS_16X2X8 0
+    PROCESS_16X2X8 0
+    PROCESS_16X2X8 0
 
-        mov             rdi,        arg(4)           ;Results
-        movdqa          XMMWORD PTR [rdi],    xmm1
+    WRITE_AS_INTS
 
     ; begin epilog
     pop         rdi
@@ -212,19 +221,18 @@ sym(vp9_sad16x8x8_sse4):
     push        rdi
     ; end prolog
 
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
+    mov             rsi,        arg(0)           ;src_ptr
+    mov             rdi,        arg(2)           ;ref_ptr
 
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+    movsxd          rax,        dword ptr arg(1) ;src_stride
+    movsxd          rdx,        dword ptr arg(3) ;ref_stride
 
-        PROCESS_16X2X8 1
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
+    PROCESS_16X2X8 1
+    PROCESS_16X2X8 0
+    PROCESS_16X2X8 0
+    PROCESS_16X2X8 0
 
-        mov             rdi,        arg(4)           ;Results
-        movdqa          XMMWORD PTR [rdi],    xmm1
+    WRITE_AS_INTS
 
     ; begin epilog
     pop         rdi
@@ -250,19 +258,18 @@ sym(vp9_sad8x8x8_sse4):
     push        rdi
     ; end prolog
 
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
+    mov             rsi,        arg(0)           ;src_ptr
+    mov             rdi,        arg(2)           ;ref_ptr
 
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+    movsxd          rax,        dword ptr arg(1) ;src_stride
+    movsxd          rdx,        dword ptr arg(3) ;ref_stride
 
-        PROCESS_8X2X8 1
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
+    PROCESS_8X2X8 1
+    PROCESS_8X2X8 0
+    PROCESS_8X2X8 0
+    PROCESS_8X2X8 0
 
-        mov             rdi,        arg(4)           ;Results
-        movdqa          XMMWORD PTR [rdi],    xmm1
+    WRITE_AS_INTS
 
     ; begin epilog
     pop         rdi
@@ -288,22 +295,22 @@ sym(vp9_sad8x16x8_sse4):
     push        rdi
     ; end prolog
 
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
+    mov             rsi,        arg(0)           ;src_ptr
+    mov             rdi,        arg(2)           ;ref_ptr
+
+    movsxd          rax,        dword ptr arg(1) ;src_stride
+    movsxd          rdx,        dword ptr arg(3) ;ref_stride
 
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+    PROCESS_8X2X8 1
+    PROCESS_8X2X8 0
+    PROCESS_8X2X8 0
+    PROCESS_8X2X8 0
+    PROCESS_8X2X8 0
+    PROCESS_8X2X8 0
+    PROCESS_8X2X8 0
+    PROCESS_8X2X8 0
 
-        PROCESS_8X2X8 1
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        mov             rdi,        arg(4)           ;Results
-        movdqa          XMMWORD PTR [rdi],    xmm1
+    WRITE_AS_INTS
 
     ; begin epilog
     pop         rdi
@@ -329,17 +336,16 @@ sym(vp9_sad4x4x8_sse4):
     push        rdi
     ; end prolog
 
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
+    mov             rsi,        arg(0)           ;src_ptr
+    mov             rdi,        arg(2)           ;ref_ptr
 
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+    movsxd          rax,        dword ptr arg(1) ;src_stride
+    movsxd          rdx,        dword ptr arg(3) ;ref_stride
 
-        PROCESS_4X2X8 1
-        PROCESS_4X2X8 0
+    PROCESS_4X2X8 1
+    PROCESS_4X2X8 0
 
-        mov             rdi,        arg(4)           ;Results
-        movdqa          XMMWORD PTR [rdi],    xmm1
+    WRITE_AS_INTS
 
     ; begin epilog
     pop         rdi
diff --git a/vp9/encoder/x86/vp9_variance_sse2.c b/vp9/encoder/x86/vp9_variance_sse2.c
index 36fae6e8c..fc363b6b0 100644
--- a/vp9/encoder/x86/vp9_variance_sse2.c
+++ b/vp9/encoder/x86/vp9_variance_sse2.c
@@ -186,6 +186,7 @@ unsigned int vp9_variance16x16_wmt
   *sse = sse0;
   return (sse0 - (((unsigned int)sum0 * sum0) >> 8));
 }
+
 unsigned int vp9_mse16x16_wmt(
   const unsigned char *src_ptr,
   int  source_stride,
@@ -305,20 +306,16 @@ unsigned int vp9_sub_pixel_variance8x8_wmt
   return (xxsum - (((unsigned int)xsum * xsum) >> 6));
 }
 
-unsigned int vp9_sub_pixel_variance16x16_wmt
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
+static void sub_pixel_variance16x16_sse2(const uint8_t *src_ptr,
+                                         int src_pixels_per_line,
+                                         int xoffset,
+                                         int yoffset,
+                                         const uint8_t *dst_ptr,
+                                         int dst_pixels_per_line,
+                                         unsigned int *sse, int *avg) {
   int xsum0, xsum1;
   unsigned int xxsum0, xxsum1;
 
-
   // note we could avoid these if statements if the calling function
   // just called the appropriate functions inside.
   if (xoffset == HALFNDX && yoffset == 0) {
@@ -355,10 +352,136 @@ unsigned int vp9_sub_pixel_variance16x16_wmt
   }
 
   *sse = xxsum0;
-  return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
+  *avg = xsum0;
+}
+
+// 16x16 sub-pixel variance: returns sse - ((sum * sum) >> 8) using the sse and
+// sum reported by sub_pixel_variance16x16_sse2(); also stores sse in *sse_ptr.
+unsigned int vp9_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr,
+                                              int src_pixels_per_line,
+                                              int xoffset, int yoffset,
+                                              const uint8_t *dst_ptr,
+                                              int dst_pixels_per_line,
+                                              unsigned int *sse_ptr) {
+  unsigned int sq_err;
+  int sum;
+
+  sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, yoffset,
+                               dst_ptr, dst_pixels_per_line, &sq_err, &sum);
+  *sse_ptr = sq_err;
+
+  return sq_err - (((unsigned int)sum * sum) >> 8);
+}
+
+// 32x32 sub-pixel variance, accumulated from the block's four 16x16 quadrants.
+// Stores the total SSE in *sse_ptr and returns sse - ((sum * sum) >> 10).
+unsigned int vp9_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr,
+                                              int src_pixels_per_line,
+                                              int xoffset, int yoffset,
+                                              const uint8_t *dst_ptr,
+                                              int dst_pixels_per_line,
+                                              unsigned int *sse_ptr) {
+  int avg0, avg1, avg2, avg3;
+  unsigned int sse0, sse1, sse2, sse3;
+  sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
+                               yoffset, dst_ptr, dst_pixels_per_line,
+                               &sse0, &avg0);
+  sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
+                               yoffset, dst_ptr + 16, dst_pixels_per_line,
+                               &sse1, &avg1);
+  src_ptr += 16 * src_pixels_per_line;
+  dst_ptr += 16 * dst_pixels_per_line;
+  sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
+                               yoffset, dst_ptr, dst_pixels_per_line,
+                               &sse2, &avg2);
+  sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
+                               yoffset, dst_ptr + 16, dst_pixels_per_line,
+                               &sse3, &avg3);
+  sse0 += sse1 + sse2 + sse3;
+  avg0 += avg1 + avg2 + avg3;
+  *sse_ptr = sse0;
+  // Square in 64 bits: the four-quadrant sum can exceed 16 bits in magnitude.
+  return (sse0 - (unsigned int)(((int64_t)avg0 * avg0) >> 10));
+}
+
+// 64x64 sub-pixel variance, accumulated from sixteen 16x16 blocks (4 per row).
+// Stores the total SSE in *sse_ptr and returns sse - ((sum * sum) >> 12).
+unsigned int vp9_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr,
+                                              int src_pixels_per_line,
+                                              int xoffset, int yoffset,
+                                              const uint8_t *dst_ptr,
+                                              int dst_pixels_per_line,
+                                              unsigned int *sse_ptr) {
+  int avg0, avg1, avg2, avg3, avg4;
+  unsigned int sse0, sse1, sse2, sse3, sse4;
+  sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
+                               yoffset, dst_ptr, dst_pixels_per_line,
+                               &sse0, &avg0);
+  sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
+                               yoffset, dst_ptr + 16, dst_pixels_per_line,
+                               &sse1, &avg1);
+  sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset,
+                               yoffset, dst_ptr + 32, dst_pixels_per_line,
+                               &sse2, &avg2);
+  sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset,
+                               yoffset, dst_ptr + 48, dst_pixels_per_line,
+                               &sse3, &avg3);
+  src_ptr += 16 * src_pixels_per_line;
+  dst_ptr += 16 * dst_pixels_per_line;
+  avg0 += avg1 + avg2 + avg3;
+  sse0 += sse1 + sse2 + sse3;
+  sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
+                               yoffset, dst_ptr, dst_pixels_per_line,
+                               &sse1, &avg1);
+  sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
+                               yoffset, dst_ptr + 16, dst_pixels_per_line,
+                               &sse2, &avg2);
+  sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset,
+                               yoffset, dst_ptr + 32, dst_pixels_per_line,
+                               &sse3, &avg3);
+  sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset,
+                               yoffset, dst_ptr + 48, dst_pixels_per_line,
+                               &sse4, &avg4);
+  src_ptr += 16 * src_pixels_per_line;
+  dst_ptr += 16 * dst_pixels_per_line;
+  avg0 += avg1 + avg2 + avg3 + avg4;
+  sse0 += sse1 + sse2 + sse3 + sse4;
+  sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
+                               yoffset, dst_ptr, dst_pixels_per_line,
+                               &sse1, &avg1);
+  sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
+                               yoffset, dst_ptr + 16, dst_pixels_per_line,
+                               &sse2, &avg2);
+  sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset,
+                               yoffset, dst_ptr + 32, dst_pixels_per_line,
+                               &sse3, &avg3);
+  sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset,
+                               yoffset, dst_ptr + 48, dst_pixels_per_line,
+                               &sse4, &avg4);
+  src_ptr += 16 * src_pixels_per_line;
+  dst_ptr += 16 * dst_pixels_per_line;
+  avg0 += avg1 + avg2 + avg3 + avg4;
+  sse0 += sse1 + sse2 + sse3 + sse4;
+  sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
+                               yoffset, dst_ptr, dst_pixels_per_line,
+                               &sse1, &avg1);
+  sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
+                               yoffset, dst_ptr + 16, dst_pixels_per_line,
+                               &sse2, &avg2);
+  sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset,
+                               yoffset, dst_ptr + 32, dst_pixels_per_line,
+                               &sse3, &avg3);
+  sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset,
+                               yoffset, dst_ptr + 48, dst_pixels_per_line,
+                               &sse4, &avg4);
+  avg0 += avg1 + avg2 + avg3 + avg4;
+  sse0 += sse1 + sse2 + sse3 + sse4;
+  *sse_ptr = sse0;
+  // Square in 64 bits: the summed avg of 16 blocks can exceed 16 bits.
+  return (sse0 - (unsigned int)(((int64_t)avg0 * avg0) >> 12));
 }
 
-unsigned int vp9_sub_pixel_mse16x16_wmt(
+unsigned int vp9_sub_pixel_mse16x16_sse2(
   const unsigned char  *src_ptr,
   int  src_pixels_per_line,
   int  xoffset,
@@ -367,7 +490,8 @@ unsigned int vp9_sub_pixel_mse16x16_wmt(
   int dst_pixels_per_line,
   unsigned int *sse
 ) {
-  vp9_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+  vp9_sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
+                                   yoffset, dst_ptr, dst_pixels_per_line, sse);
   return *sse;
 }
 
diff --git a/vp9/encoder/x86/vp9_x86_csystemdependent.c b/vp9/encoder/x86/vp9_x86_csystemdependent.c
index 3beef53a2..2bf32c569 100644
--- a/vp9/encoder/x86/vp9_x86_csystemdependent.c
+++ b/vp9/encoder/x86/vp9_x86_csystemdependent.c
@@ -23,11 +23,11 @@ void vp9_short_fdct8x4_mmx(short *input, short *output, int pitch) {
   vp9_short_fdct4x4_mmx(input + 4, output + 16, pitch);
 }
 
-int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
-int vp9_mbblock_error_mmx(MACROBLOCK *mb, int dc) {
+int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr);
+int vp9_mbblock_error_mmx(MACROBLOCK *mb) {
   short *coeff_ptr =  mb->block[0].coeff;
   short *dcoef_ptr =  mb->e_mbd.block[0].dqcoeff;
-  return vp9_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr, dc);
+  return vp9_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr);
 }
 
 int vp9_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
@@ -51,11 +51,11 @@ void vp9_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) {
 #endif
 
 #if HAVE_SSE2
-int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
-int vp9_mbblock_error_xmm(MACROBLOCK *mb, int dc) {
+int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr);
+int vp9_mbblock_error_xmm(MACROBLOCK *mb) {
   short *coeff_ptr =  mb->block[0].coeff;
   short *dcoef_ptr =  mb->e_mbd.block[0].dqcoeff;
-  return vp9_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr, dc);
+  return vp9_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr);
 }
 
 int vp9_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 0d208e9a3..f330b464a 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -16,6 +16,8 @@ VP9_COMMON_SRCS-yes += common/vp9_alloccommon.c
 VP9_COMMON_SRCS-yes += common/vp9_asm_com_offsets.c
 VP9_COMMON_SRCS-yes += common/vp9_blockd.c
 VP9_COMMON_SRCS-yes += common/vp9_coefupdateprobs.h
+VP9_COMMON_SRCS-yes += common/vp9_convolve.c
+VP9_COMMON_SRCS-yes += common/vp9_convolve.h
 VP9_COMMON_SRCS-yes += common/vp9_debugmodes.c
 VP9_COMMON_SRCS-yes += common/vp9_default_coef_probs.h
 VP9_COMMON_SRCS-yes += common/vp9_entropy.c
@@ -36,6 +38,7 @@ VP9_COMMON_SRCS-yes += common/vp9_entropymv.h
 VP9_COMMON_SRCS-yes += common/vp9_extend.h
 VP9_COMMON_SRCS-yes += common/vp9_findnearmv.h
 VP9_COMMON_SRCS-yes += common/vp9_header.h
+VP9_COMMON_SRCS-yes += common/vp9_idct.h
 VP9_COMMON_SRCS-yes += common/vp9_invtrans.h
 VP9_COMMON_SRCS-yes += common/vp9_loopfilter.h
 VP9_COMMON_SRCS-yes += common/vp9_modecont.h
@@ -46,7 +49,6 @@ VP9_COMMON_SRCS-yes += common/vp9_pred_common.c
 VP9_COMMON_SRCS-yes += common/vp9_quant_common.h
 VP9_COMMON_SRCS-yes += common/vp9_reconinter.h
 VP9_COMMON_SRCS-yes += common/vp9_reconintra.h
-VP9_COMMON_SRCS-yes += common/vp9_reconintra4x4.h
 VP9_COMMON_SRCS-yes += common/vp9_rtcd.c
 VP9_COMMON_SRCS-yes += common/vp9_rtcd_defs.sh
 VP9_COMMON_SRCS-yes += common/vp9_sadmxn.h
@@ -54,10 +56,11 @@ VP9_COMMON_SRCS-yes += common/vp9_subpelvar.h
 VP9_COMMON_SRCS-yes += common/vp9_seg_common.h
 VP9_COMMON_SRCS-yes += common/vp9_seg_common.c
 VP9_COMMON_SRCS-yes += common/vp9_setupintrarecon.h
-VP9_COMMON_SRCS-yes += common/vp9_subpixel.h
 VP9_COMMON_SRCS-yes += common/vp9_swapyv12buffer.h
 VP9_COMMON_SRCS-yes += common/vp9_systemdependent.h
 VP9_COMMON_SRCS-yes += common/vp9_textblit.h
+VP9_COMMON_SRCS-yes += common/vp9_tile_common.h
+VP9_COMMON_SRCS-yes += common/vp9_tile_common.c
 VP9_COMMON_SRCS-yes += common/vp9_treecoder.h
 VP9_COMMON_SRCS-yes += common/vp9_invtrans.c
 VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c
@@ -79,7 +82,6 @@ VP9_COMMON_SRCS-yes += common/vp9_treecoder.c
 VP9_COMMON_SRCS-$(CONFIG_IMPLICIT_SEGMENTATION) += common/vp9_implicit_segmentation.c
 
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_idct_x86.h
-VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_subpixel_x86.h
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_x86.h
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
@@ -88,7 +90,6 @@ VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.h
 VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.c
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_iwalsh_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_recon_mmx.asm
-VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_subpixel_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idctllm_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_iwalsh_sse2.asm
@@ -96,10 +97,8 @@ VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_loopfilter_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_wrapper_sse2.c
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpel_variance_impl_sse2.asm
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_variance_sse2.c
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
-VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_ssse3.asm
 ifeq ($(CONFIG_POSTPROC),yes)
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
@@ -111,19 +110,13 @@ VP9_COMMON_SRCS-yes += common/vp9_maskingmv.c
 VP9_COMMON_SRCS-$(HAVE_SSE3) += common/x86/vp9_mask_sse3.asm
 endif
 
-VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_filter_sse4.c
-ifeq ($(HAVE_SSE4_1),yes)
-vp9/common/x86/vp9_filter_sse4.c.o: CFLAGS += -msse4
-vp9/common/x86/vp9_filter_sse4.c.d: CFLAGS += -msse4
-endif
-
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_filter_sse2.c
+VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_idctllm_x86.c
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_x86.c
 ifeq ($(HAVE_SSE2),yes)
-vp9/common/x86/vp9_filter_sse2.c.o: CFLAGS += -msse2
+vp9/common/x86/vp9_idctllm_x86.c.o: CFLAGS += -msse2
 vp9/common/x86/vp9_loopfilter_x86.c.o: CFLAGS += -msse2
 vp9/common/x86/vp9_sadmxn_x86.c.o: CFLAGS += -msse2
-vp9/common/x86/vp9_filter_sse2.c.d: CFLAGS += -msse2
+vp9/common/x86/vp9_idctllm_x86.c.d: CFLAGS += -msse2
 vp9/common/x86/vp9_loopfilter_x86.c.d: CFLAGS += -msse2
 vp9/common/x86/vp9_sadmxn_x86.c.d: CFLAGS += -msse2
 endif
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 1ef5ff19e..2653954d0 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -26,7 +26,8 @@ struct vp8_extracfg {
   unsigned int                noise_sensitivity;
   unsigned int                Sharpness;
   unsigned int                static_thresh;
-  unsigned int                token_partitions;
+  unsigned int                tile_columns;
+  unsigned int                tile_rows;
   unsigned int                arnr_max_frames;    /* alt_ref Noise Reduction Max Frame Count */
   unsigned int                arnr_strength;    /* alt_ref Noise Reduction Strength */
   unsigned int                arnr_type;        /* alt_ref filter type */
@@ -34,9 +35,7 @@ struct vp8_extracfg {
   vp8e_tuning                 tuning;
   unsigned int                cq_level;         /* constrained quality level */
   unsigned int                rc_max_intra_bitrate_pct;
-#if CONFIG_LOSSLESS
   unsigned int                lossless;
-#endif
 };
 
 struct extraconfig_map {
@@ -54,7 +53,8 @@ static const struct extraconfig_map extracfg_map[] = {
       0,                          /* noise_sensitivity */
       0,                          /* Sharpness */
       0,                          /* static_thresh */
-      VP8_ONE_TOKENPARTITION,     /* token_partitions */
+      0,                          /* tile_columns */
+      0,                          /* tile_rows */
       0,                          /* arnr_max_frames */
       3,                          /* arnr_strength */
       3,                          /* arnr_type*/
@@ -62,9 +62,7 @@ static const struct extraconfig_map extracfg_map[] = {
       0,                          /* tuning*/
       10,                         /* cq_level */
       0,                          /* rc_max_intra_bitrate_pct */
-#if CONFIG_LOSSLESS
       0,                          /* lossless */
-#endif
     }
   }
 };
@@ -80,7 +78,6 @@ struct vpx_codec_alg_priv {
   unsigned char          *pending_cx_data;
   unsigned int            pending_cx_data_sz;
   vpx_image_t             preview_img;
-  unsigned int            next_frame_flag;
   vp8_postproc_cfg_t      preview_ppcfg;
   vpx_codec_pkt_list_decl(64) pkt_list;              // changed to accomendate the maximum number of lagged frames allowed
   unsigned int                fixed_kf_cntr;
@@ -137,13 +134,11 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
 
   RANGE_CHECK_HI(cfg, rc_max_quantizer,   63);
   RANGE_CHECK_HI(cfg, rc_min_quantizer,   cfg->rc_max_quantizer);
-#if CONFIG_LOSSLESS
   RANGE_CHECK_BOOL(vp8_cfg, lossless);
   if (vp8_cfg->lossless) {
     RANGE_CHECK_HI(cfg, rc_max_quantizer, 0);
     RANGE_CHECK_HI(cfg, rc_min_quantizer, 0);
   }
-#endif
 
   RANGE_CHECK_HI(cfg, g_threads,          64);
   RANGE_CHECK_HI(cfg, g_lag_in_frames,    MAX_LAG_BUFFERS);
@@ -172,7 +167,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
 
   RANGE_CHECK_HI(vp8_cfg, noise_sensitivity,  6);
 
-  RANGE_CHECK(vp8_cfg, token_partitions,   VP8_ONE_TOKENPARTITION, VP8_EIGHT_TOKENPARTITION);
+  RANGE_CHECK(vp8_cfg, tile_columns, 0, 6);
+  RANGE_CHECK(vp8_cfg, tile_rows, 0, 2);
   RANGE_CHECK_HI(vp8_cfg, Sharpness,       7);
   RANGE_CHECK(vp8_cfg, arnr_max_frames, 0, 15);
   RANGE_CHECK_HI(vp8_cfg, arnr_strength,   6);
@@ -309,37 +305,43 @@ static vpx_codec_err_t set_vp8e_config(VP9_CONFIG *oxcf,
 
   oxcf->tuning = vp8_cfg.tuning;
 
-#if CONFIG_LOSSLESS
+  oxcf->tile_columns = vp8_cfg.tile_columns;
+  oxcf->tile_rows = vp8_cfg.tile_rows;
+
   oxcf->lossless = vp8_cfg.lossless;
-#endif
 
+  oxcf->error_resilient_mode = cfg.g_error_resilient;
+  oxcf->frame_parallel_decoding_mode = cfg.g_frame_parallel_decoding;
   /*
-      printf("Current VP8 Settings: \n");
-      printf("target_bandwidth: %d\n", oxcf->target_bandwidth);
-      printf("noise_sensitivity: %d\n", oxcf->noise_sensitivity);
-      printf("Sharpness: %d\n",    oxcf->Sharpness);
-      printf("cpu_used: %d\n",  oxcf->cpu_used);
-      printf("Mode: %d\n",     oxcf->Mode);
-      printf("delete_first_pass_file: %d\n",  oxcf->delete_first_pass_file);
-      printf("auto_key: %d\n",  oxcf->auto_key);
-      printf("key_freq: %d\n", oxcf->key_freq);
-      printf("end_usage: %d\n", oxcf->end_usage);
-      printf("under_shoot_pct: %d\n", oxcf->under_shoot_pct);
-      printf("over_shoot_pct: %d\n", oxcf->over_shoot_pct);
-      printf("starting_buffer_level: %d\n", oxcf->starting_buffer_level);
-      printf("optimal_buffer_level: %d\n",  oxcf->optimal_buffer_level);
-      printf("maximum_buffer_size: %d\n", oxcf->maximum_buffer_size);
-      printf("fixed_q: %d\n",  oxcf->fixed_q);
-      printf("worst_allowed_q: %d\n", oxcf->worst_allowed_q);
-      printf("best_allowed_q: %d\n", oxcf->best_allowed_q);
-      printf("two_pass_vbrbias: %d\n",  oxcf->two_pass_vbrbias);
-      printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section);
-      printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section);
-      printf("allow_lag: %d\n", oxcf->allow_lag);
-      printf("lag_in_frames: %d\n", oxcf->lag_in_frames);
-      printf("play_alternate: %d\n", oxcf->play_alternate);
-      printf("Version: %d\n", oxcf->Version);
-      printf("encode_breakout: %d\n", oxcf->encode_breakout);
+  printf("Current VP9 Settings: \n");
+  printf("target_bandwidth: %d\n", oxcf->target_bandwidth);
+  printf("noise_sensitivity: %d\n", oxcf->noise_sensitivity);
+  printf("Sharpness: %d\n",    oxcf->Sharpness);
+  printf("cpu_used: %d\n",  oxcf->cpu_used);
+  printf("Mode: %d\n",     oxcf->Mode);
+  // printf("delete_first_pass_file: %d\n",  oxcf->delete_first_pass_file);
+  printf("auto_key: %d\n",  oxcf->auto_key);
+  printf("key_freq: %d\n", oxcf->key_freq);
+  printf("end_usage: %d\n", oxcf->end_usage);
+  printf("under_shoot_pct: %d\n", oxcf->under_shoot_pct);
+  printf("over_shoot_pct: %d\n", oxcf->over_shoot_pct);
+  printf("starting_buffer_level: %d\n", oxcf->starting_buffer_level);
+  printf("optimal_buffer_level: %d\n",  oxcf->optimal_buffer_level);
+  printf("maximum_buffer_size: %d\n", oxcf->maximum_buffer_size);
+  printf("fixed_q: %d\n",  oxcf->fixed_q);
+  printf("worst_allowed_q: %d\n", oxcf->worst_allowed_q);
+  printf("best_allowed_q: %d\n", oxcf->best_allowed_q);
+  printf("two_pass_vbrbias: %d\n",  oxcf->two_pass_vbrbias);
+  printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section);
+  printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section);
+  printf("allow_lag: %d\n", oxcf->allow_lag);
+  printf("lag_in_frames: %d\n", oxcf->lag_in_frames);
+  printf("play_alternate: %d\n", oxcf->play_alternate);
+  printf("Version: %d\n", oxcf->Version);
+  printf("encode_breakout: %d\n", oxcf->encode_breakout);
+  printf("error resilient: %d\n", oxcf->error_resilient_mode);
+  printf("frame parallel detokenization: %d\n",
+         oxcf->frame_parallel_decoding_mode);
   */
   return VPX_CODEC_OK;
 }
@@ -409,7 +411,8 @@ static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx,
       MAP(VP8E_SET_NOISE_SENSITIVITY,     xcfg.noise_sensitivity);
       MAP(VP8E_SET_SHARPNESS,             xcfg.Sharpness);
       MAP(VP8E_SET_STATIC_THRESHOLD,      xcfg.static_thresh);
-      MAP(VP8E_SET_TOKEN_PARTITIONS,      xcfg.token_partitions);
+      MAP(VP9E_SET_TILE_COLUMNS,          xcfg.tile_columns);
+      MAP(VP9E_SET_TILE_ROWS,             xcfg.tile_rows);
 
       MAP(VP8E_SET_ARNR_MAXFRAMES,        xcfg.arnr_max_frames);
       MAP(VP8E_SET_ARNR_STRENGTH,        xcfg.arnr_strength);
@@ -417,9 +420,7 @@ static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx,
       MAP(VP8E_SET_TUNING,                xcfg.tuning);
       MAP(VP8E_SET_CQ_LEVEL,              xcfg.cq_level);
       MAP(VP8E_SET_MAX_INTRA_BITRATE_PCT, xcfg.rc_max_intra_bitrate_pct);
-#if CONFIG_LOSSLESS
       MAP(VP9E_SET_LOSSLESS,              xcfg.lossless);
-#endif
   }
 
   res = validate_config(ctx, &ctx->cfg, &xcfg);
@@ -670,14 +671,11 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t  *ctx,
     if (img != NULL) {
       res = image2yuvconfig(img, &sd);
 
-      if (vp9_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags,
+      if (vp9_receive_raw_frame(ctx->cpi, lib_flags,
                                 &sd, dst_time_stamp, dst_end_time_stamp)) {
         VP9_COMP *cpi = (VP9_COMP *)ctx->cpi;
         res = update_error_state(ctx, &cpi->common.error);
       }
-
-      /* reset for next frame */
-      ctx->next_frame_flag = 0;
     }
 
     cx_data = ctx->cx_data;
@@ -979,8 +977,6 @@ static vpx_codec_err_t vp8e_set_scalemode(vpx_codec_alg_priv_t *ctx,
                                 scalemode.v_scaling_mode);
 
     if (!res) {
-      /*force next frame a key frame to effect scaling mode */
-      ctx->next_frame_flag |= FRAMEFLAGS_KEY;
       return VPX_CODEC_OK;
     } else
       return VPX_CODEC_INVALID_PARAM;
@@ -1004,7 +1000,8 @@ static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] = {
   {VP8E_SET_ENABLEAUTOALTREF,         set_param},
   {VP8E_SET_SHARPNESS,                set_param},
   {VP8E_SET_STATIC_THRESHOLD,         set_param},
-  {VP8E_SET_TOKEN_PARTITIONS,         set_param},
+  {VP9E_SET_TILE_COLUMNS,             set_param},
+  {VP9E_SET_TILE_ROWS,                set_param},
   {VP8E_GET_LAST_QUANTIZER,           get_param},
   {VP8E_GET_LAST_QUANTIZER_64,        get_param},
   {VP8E_SET_ARNR_MAXFRAMES,           set_param},
@@ -1013,9 +1010,7 @@ static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] = {
   {VP8E_SET_TUNING,                   set_param},
   {VP8E_SET_CQ_LEVEL,                 set_param},
   {VP8E_SET_MAX_INTRA_BITRATE_PCT,    set_param},
-#if CONFIG_LOSSLESS
   {VP9E_SET_LOSSLESS,                 set_param},
-#endif
   { -1, NULL},
 };
 
@@ -1032,6 +1027,7 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = {
       {1, 30},            /* g_timebase */
 
       0,                  /* g_error_resilient */
+      0,                  /* g_frame_parallel_decoding */
 
       VPX_RC_ONE_PASS,    /* g_pass */
 
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 2d7e41369..b2ce9aa2e 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -362,6 +362,7 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t  *ctx,
       oxcf.Version = 9;
       oxcf.postprocess = 0;
       oxcf.max_threads = ctx->cfg.threads;
+      oxcf.inv_tile_order = ctx->cfg.inv_tile_order;
       optr = vp9_create_decompressor(&oxcf);
 
       /* If postprocessing was enabled by the application and a
@@ -431,7 +432,7 @@ static vpx_codec_err_t vp9_decode(vpx_codec_alg_priv_t  *ctx,
                                   long                   deadline) {
   const uint8_t *data_start = data;
   const uint8_t *data_end = data + data_sz;
-  vpx_codec_err_t res;
+  vpx_codec_err_t res = 0;
 
   do {
     res = decode_one(ctx, &data_start, data_sz, user_priv, deadline);
@@ -645,9 +646,7 @@ static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
   VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi;
 
   if (update_info) {
-    *update_info = pbi->common.refresh_alt_ref_frame * (int) VP8_ALTR_FRAME
-                   + pbi->common.refresh_golden_frame * (int) VP8_GOLD_FRAME
-                   + pbi->common.refresh_last_frame * (int) VP8_LAST_FRAME;
+    *update_info = pbi->refresh_frame_flags;
 
     return VPX_CODEC_OK;
   } else
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 12d1ec4e7..43dba1373 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -65,7 +65,6 @@ VP9_CX_SRCS-yes += encoder/vp9_quantize.c
 VP9_CX_SRCS-yes += encoder/vp9_ratectrl.c
 VP9_CX_SRCS-yes += encoder/vp9_rdopt.c
 VP9_CX_SRCS-yes += encoder/vp9_sad_c.c
-VP9_CX_SRCS-yes += encoder/vp9_satd_c.c
 VP9_CX_SRCS-yes += encoder/vp9_segmentation.c
 VP9_CX_SRCS-yes += encoder/vp9_segmentation.h
 VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_ssim.c
@@ -95,21 +94,28 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_fwalsh_sse2.asm
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.asm
+#VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_ssse3.c
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_impl_ssse3.asm
-VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.asm
+#VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.asm
 VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm
-VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_quantize_sse4.asm
+#VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_quantize_sse4.asm
 VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_quantize_mmx.asm
 VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_encodeopt.asm
 VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm
 
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2_intrinsics.c
+ifeq ($(HAVE_SSE2),yes)
+vp9/encoder/x86/vp9_dct_sse2_intrinsics.c.d: CFLAGS += -msse2
+vp9/encoder/x86/vp9_dct_sse2_intrinsics.c.o: CFLAGS += -msse2
+endif
+
 
 VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))