29 files changed, 697 insertions, 865 deletions
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 165e2c8f0..9555a9ab5 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -36,12 +36,15 @@ typedef unsigned int (*sad_m_by_n_fn_t)(const unsigned char *source_ptr,
                                         const unsigned char *reference_ptr,
                                         int reference_stride,
                                         unsigned int max_sad);
+typedef std::tr1::tuple<int, int, sad_m_by_n_fn_t> sad_m_by_n_test_param_t;
 
 typedef void (*sad_n_by_n_by_4_fn_t)(const uint8_t *src_ptr,
                                      int src_stride,
                                      const unsigned char * const ref_ptr[],
                                      int ref_stride,
                                      unsigned int *sad_array);
+typedef std::tr1::tuple<int, int, sad_n_by_n_by_4_fn_t>
+        sad_n_by_n_by_4_test_param_t;
 
 using libvpx_test::ACMRandom;
 
@@ -124,8 +127,7 @@ class SADTestBase : public ::testing::Test {
 };
 
 class SADTest : public SADTestBase,
-    public ::testing::WithParamInterface<
-        std::tr1::tuple<int, int, sad_m_by_n_fn_t> > {
+    public ::testing::WithParamInterface<sad_m_by_n_test_param_t> {
  public:
   SADTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {}
 
@@ -156,8 +158,7 @@ class SADTest : public SADTestBase,
 };
 
 class SADx4Test : public SADTestBase,
-    public ::testing::WithParamInterface<
-        std::tr1::tuple<int, int, sad_n_by_n_by_4_fn_t> > {
+    public ::testing::WithParamInterface<sad_n_by_n_by_4_test_param_t> {
  public:
   SADx4Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {}
 
@@ -293,12 +294,6 @@ TEST_P(SADTest, MaxSAD) {
 
 using std::tr1::make_tuple;
 
-#if CONFIG_VP8_ENCODER && CONFIG_VP9_ENCODER
-#define VP8_VP9_SEPARATOR ,
-#else
-#define VP8_VP9_SEPARATOR
-#endif
-
 #if CONFIG_VP8_ENCODER
 const sad_m_by_n_fn_t sad_16x16_c = vp8_sad16x16_c;
 const sad_m_by_n_fn_t sad_8x16_c = vp8_sad8x16_c;
@@ -315,25 +310,25 @@ const sad_m_by_n_fn_t sad_16x8_c_vp9 = vp9_sad16x8_c;
 const sad_m_by_n_fn_t sad_8x8_c_vp9 = vp9_sad8x8_c;
 const sad_m_by_n_fn_t sad_4x4_c_vp9 = vp9_sad4x4_c;
 #endif
-INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::Values(
+const sad_m_by_n_test_param_t c_tests[] = {  // tuple = (int, int, SAD fn)
 #if CONFIG_VP8_ENCODER
-                        make_tuple(16, 16, sad_16x16_c),
-                        make_tuple(8, 16, sad_8x16_c),
-                        make_tuple(16, 8, sad_16x8_c),
-                        make_tuple(8, 8, sad_8x8_c),
-                        make_tuple(4, 4, sad_4x4_c)
+  make_tuple(16, 16, sad_16x16_c),
+  make_tuple(8, 16, sad_8x16_c),
+  make_tuple(16, 8, sad_16x8_c),
+  make_tuple(8, 8, sad_8x8_c),
+  make_tuple(4, 4, sad_4x4_c),  // every entry ends in ',' so either
 #endif
-                        VP8_VP9_SEPARATOR
 #if CONFIG_VP9_ENCODER
-                        make_tuple(64, 64, sad_64x64_c_vp9),
-                        make_tuple(32, 32, sad_32x32_c_vp9),
-                        make_tuple(16, 16, sad_16x16_c_vp9),
-                        make_tuple(8, 16, sad_8x16_c_vp9),
-                        make_tuple(16, 8, sad_16x8_c_vp9),
-                        make_tuple(8, 8, sad_8x8_c_vp9),
-                        make_tuple(4, 4, sad_4x4_c_vp9)
+  make_tuple(64, 64, sad_64x64_c_vp9),  // #if section can be compiled out
+  make_tuple(32, 32, sad_32x32_c_vp9),
+  make_tuple(16, 16, sad_16x16_c_vp9),
+  make_tuple(8, 16, sad_8x16_c_vp9),
+  make_tuple(16, 8, sad_16x8_c_vp9),
+  make_tuple(8, 8, sad_8x8_c_vp9),
+  make_tuple(4, 4, sad_4x4_c_vp9),
 #endif
-                        ));
+};
+INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::ValuesIn(c_tests));
 
 #if CONFIG_VP9_ENCODER
 const sad_n_by_n_by_4_fn_t sad_64x64x4d_c = vp9_sad64x64x4d_c;
@@ -387,23 +382,23 @@ const sad_m_by_n_fn_t sad_8x8_mmx_vp9 = vp9_sad8x8_mmx;
 const sad_m_by_n_fn_t sad_4x4_mmx_vp9 = vp9_sad4x4_mmx;
 #endif
 
-INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::Values(
+const sad_m_by_n_test_param_t mmx_tests[] = {  // tuple = (int, int, SAD fn)
 #if CONFIG_VP8_ENCODER
-                        make_tuple(16, 16, sad_16x16_mmx),
-                        make_tuple(8, 16, sad_8x16_mmx),
-                        make_tuple(16, 8, sad_16x8_mmx),
-                        make_tuple(8, 8, sad_8x8_mmx),
-                        make_tuple(4, 4, sad_4x4_mmx)
+  make_tuple(16, 16, sad_16x16_mmx),
+  make_tuple(8, 16, sad_8x16_mmx),
+  make_tuple(16, 8, sad_16x8_mmx),
+  make_tuple(8, 8, sad_8x8_mmx),
+  make_tuple(4, 4, sad_4x4_mmx),  // every entry ends in ',' so either
 #endif
-                        VP8_VP9_SEPARATOR
 #if CONFIG_VP9_ENCODER
-                        make_tuple(16, 16, sad_16x16_mmx_vp9),
-                        make_tuple(8, 16, sad_8x16_mmx_vp9),
-                        make_tuple(16, 8, sad_16x8_mmx_vp9),
-                        make_tuple(8, 8, sad_8x8_mmx_vp9),
-                        make_tuple(4, 4, sad_4x4_mmx_vp9)
+  make_tuple(16, 16, sad_16x16_mmx_vp9),  // #if section can be compiled out
+  make_tuple(8, 16, sad_8x16_mmx_vp9),
+  make_tuple(16, 8, sad_16x8_mmx_vp9),
+  make_tuple(8, 8, sad_8x8_mmx_vp9),
+  make_tuple(4, 4, sad_4x4_mmx_vp9),
 #endif
-                        ));
+};
+INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests));
 #endif
 
 #if HAVE_SSE
@@ -434,24 +429,24 @@ const sad_m_by_n_fn_t sad_8x16_sse2_vp9 = vp9_sad8x16_sse2;
 const sad_m_by_n_fn_t sad_16x8_sse2_vp9 = vp9_sad16x8_sse2;
 const sad_m_by_n_fn_t sad_8x8_sse2_vp9 = vp9_sad8x8_sse2;
 #endif
-INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::Values(
+const sad_m_by_n_test_param_t sse2_tests[] = {  // tuple = (int, int, SAD fn)
 #if CONFIG_VP8_ENCODER
-                        make_tuple(16, 16, sad_16x16_wmt),
-                        make_tuple(8, 16, sad_8x16_wmt),
-                        make_tuple(16, 8, sad_16x8_wmt),
-                        make_tuple(8, 8, sad_8x8_wmt),
-                        make_tuple(4, 4, sad_4x4_wmt)
+  make_tuple(16, 16, sad_16x16_wmt),
+  make_tuple(8, 16, sad_8x16_wmt),
+  make_tuple(16, 8, sad_16x8_wmt),
+  make_tuple(8, 8, sad_8x8_wmt),
+  make_tuple(4, 4, sad_4x4_wmt),  // every entry ends in ',' so either
 #endif
-                        VP8_VP9_SEPARATOR
 #if CONFIG_VP9_ENCODER
-                        make_tuple(64, 64, sad_64x64_sse2_vp9),
-                        make_tuple(32, 32, sad_32x32_sse2_vp9),
-                        make_tuple(16, 16, sad_16x16_sse2_vp9),
-                        make_tuple(8, 16, sad_8x16_sse2_vp9),
-                        make_tuple(16, 8, sad_16x8_sse2_vp9),
-                        make_tuple(8, 8, sad_8x8_sse2_vp9)
+  make_tuple(64, 64, sad_64x64_sse2_vp9),  // #if section can be compiled out
+  make_tuple(32, 32, sad_32x32_sse2_vp9),
+  make_tuple(16, 16, sad_16x16_sse2_vp9),
+  make_tuple(8, 16, sad_8x16_sse2_vp9),
+  make_tuple(16, 8, sad_16x8_sse2_vp9),
+  make_tuple(8, 8, sad_8x8_sse2_vp9),
 #endif
-                        ));
+};
+INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));
 
 #if CONFIG_VP9_ENCODER
 const sad_n_by_n_by_4_fn_t sad_64x64x4d_sse2 = vp9_sad64x64x4d_sse2;
diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h
index b6252d93e..5c97f9863 100644
--- a/vp9/common/vp9_common.h
+++ b/vp9/common/vp9_common.h
@@ -60,4 +60,9 @@ static INLINE int multiple16(int value) {
   return (value + 15) & ~15;
 }
 
+#define SYNC_CODE_0 0x49  // 1st sync byte the decoder checks on key frames.
+#define SYNC_CODE_1 0x83  // 2nd sync byte.
+#define SYNC_CODE_2 0x42  // 3rd sync byte.
+
+
 #endif  // VP9_COMMON_VP9_COMMON_H_
diff --git a/vp9/common/vp9_debugmodes.c b/vp9/common/vp9_debugmodes.c
deleted file mode 100644
index 827e4bf84..000000000
--- a/vp9/common/vp9_debugmodes.c
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdio.h>
-#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/vp9_blockd.h"
-#include "vp9/common/vp9_tile_common.h"
-typedef struct {
-  char *debug_array;
-  int w;
-  int h;
-} DEBUG_MODE_STRUCT;
-
-static void draw_rect(int r, int c, int w, int h, DEBUG_MODE_STRUCT *da) {
-  int i;
-  da->debug_array[r / 2 * da->w + c] = '+';
-  for (i = r / 2 + 1; i < r / 2 + h / 2; i++) {
-    da->debug_array[i * da->w + c] = '|';
-  }
-  for (i = c + 1; i < c + w; i++) {
-    da->debug_array[r / 2 * da->w + i] = '-';
-  }
-}
-static void debug_partitioning(VP9_COMMON * cm, MODE_INFO *m, int mi_row,
-                               int mi_col, BLOCK_SIZE_TYPE bsize,
-                               DEBUG_MODE_STRUCT *da) {
-  const int mis = cm->mode_info_stride;
-  int bwl, bhl;
-  int bw, bh;
-  int bsl = mi_width_log2(bsize), bs = (1 << bsl) / 2;
-  int n;
-  PARTITION_TYPE partition;
-  BLOCK_SIZE_TYPE subsize;
-
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
-    return;
-
-  bwl = mi_width_log2(m->mbmi.sb_type);
-  bhl = mi_height_log2(m->mbmi.sb_type);
-  bw = 1 << bwl;
-  bh = 1 << bhl;
-
-  // parse the partition type
-  if ((bwl == bsl) && (bhl == bsl))
-    partition = PARTITION_NONE;
-  else if ((bwl == bsl) && (bhl < bsl))
-    partition = PARTITION_HORZ;
-  else if ((bwl < bsl) && (bhl == bsl))
-    partition = PARTITION_VERT;
-  else if ((bwl < bsl) && (bhl < bsl))
-    partition = PARTITION_SPLIT;
-  else
-    assert(0);
-
-#if CONFIG_AB4X4
-  if (bsize == BLOCK_SIZE_SB8X8 && m->mbmi.sb_type < BLOCK_SIZE_SB8X8)
-  partition = PARTITION_SPLIT;
-  if (bsize < BLOCK_SIZE_SB8X8)
-  return;
-#endif
-
-#if CONFIG_AB4X4
-  if (bsize >= BLOCK_SIZE_SB8X8) {
-#else
-  if (bsize > BLOCK_SIZE_SB8X8) {
-#endif
-  }
-
-  subsize = get_subsize(bsize, partition);
-  switch (partition) {
-    case PARTITION_NONE:
-      draw_rect(mi_row * 8, mi_col * 8, bw * 8, bh * 8, da);
-      break;
-    case PARTITION_HORZ:
-      draw_rect(mi_row * 8, mi_col * 8, bw * 8, bh * 8, da);
-      if ((mi_row + bh) < cm->mi_rows)
-        draw_rect(8 * bs + mi_row * 8, mi_col * 8, bw * 8, bh * 8, da);
-      break;
-    case PARTITION_VERT:
-      draw_rect(mi_row * 8, mi_col * 8, bw * 8, bh * 8, da);
-      if ((mi_col + bw) < cm->mi_cols)
-        draw_rect(mi_row * 8, 8 * bs + mi_col * 8, bw * 8, bh * 8, da);
-      break;
-    case PARTITION_SPLIT:
-      for (n = 0; n < 4; n++) {
-        int j = n >> 1, i = n & 0x01;
-        debug_partitioning(cm, m + j * bs * mis + i * bs, mi_row + j * bs,
-                           mi_col + i * bs, subsize, da);
-      }
-      break;
-    default:
-      assert(0);
-  }
-}
-static void debug_partitionings(VP9_COMMON *c, DEBUG_MODE_STRUCT *da) {
-  const int mis = c->mode_info_stride;
-  MODE_INFO *m, *m_ptr = c->mi;
-  int mi_row, mi_col;
-
-  m_ptr += c->cur_tile_mi_col_start + c->cur_tile_mi_row_start * mis;
-
-  for (mi_row = c->cur_tile_mi_row_start; mi_row < c->cur_tile_mi_row_end;
-      mi_row += 8, m_ptr += 8 * mis) {
-    m = m_ptr;
-    for (mi_col = c->cur_tile_mi_col_start; mi_col < c->cur_tile_mi_col_end;
-        mi_col += 8, m += 8) {
-      debug_partitioning(c, m, mi_row, mi_col, BLOCK_SIZE_SB64X64, da);
-    }
-  }
-}
-void vp9_debug_tile_partitionings(VP9_COMMON *pc) {
-  int tile_row, tile_col;
-  DEBUG_MODE_STRUCT da;
-
-  da.w = pc->width;
-  da.h = pc->height / 2;
-  da.debug_array = vpx_malloc(da.h * da.w);
-  vpx_memset(da.debug_array, ' ', da.h * da.w);
-  for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) {
-    vp9_get_tile_row_offsets(pc, tile_row);
-    for (tile_col = 0; tile_col < pc->tile_columns; tile_col++) {
-      vp9_get_tile_col_offsets(pc, tile_col);
-
-      debug_partitionings(pc, &da);
-    }
-  }
-  {
-    FILE *f = fopen("partitionings.txt", "a");
-    int i, j;
-    fprintf(f, "\n\n\nFrame: %d \n", pc->current_video_frame);
-    for (i = 0; i < da.h; i++) {
-      for (j = 0; j < da.w; j++) {
-        fprintf(f, "%c", da.debug_array[i * da.w + j]);
-      }
-      fprintf(f, "\n");
-    }
-    fclose(f);
-  }
-  vpx_free(da.debug_array);
-}
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index 1b7da6cd5..1ae35864c 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -468,7 +468,7 @@ int vp9_get_coef_context(const int *scan, const int *neighbors,
     } else {
       ctx = token_cache[scan[neighbors[MAX_NEIGHBORS * c + 0]]];
     }
-    return vp9_pt_energy_class[ctx];
+    return ctx;
   }
 };
 
@@ -642,6 +642,17 @@ void vp9_coef_tree_initialize() {
 #define COEF_COUNT_SAT_AFTER_KEY 24
 #define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128
 
+void vp9_full_to_model_count(unsigned int *model_count,  // out: model counts
+                             unsigned int *full_count) {  // in: full counts
+  int n;  // token index
+  model_count[ZERO_TOKEN] = full_count[ZERO_TOKEN];  // copied one-to-one
+  model_count[ONE_TOKEN] = full_count[ONE_TOKEN];
+  model_count[TWO_TOKEN] = full_count[TWO_TOKEN];
+  for (n = THREE_TOKEN; n < DCT_EOB_TOKEN; ++n)  // tokens from THREE_TOKEN up
+    model_count[TWO_TOKEN] += full_count[n];     // are merged into TWO_TOKEN
+  model_count[DCT_EOB_MODEL_TOKEN] = full_count[DCT_EOB_TOKEN];  // EOB slot
+}
+
 void vp9_full_to_model_counts(
     vp9_coeff_count_model *model_count, vp9_coeff_count *full_count) {
   int i, j, k, l;
@@ -649,19 +660,10 @@ void vp9_full_to_model_counts(
     for (j = 0; j < REF_TYPES; ++j)
       for (k = 0; k < COEF_BANDS; ++k)
         for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
-          int n;
           if (l >= 3 && k == 0)
             continue;
-          model_count[i][j][k][l][ZERO_TOKEN] =
-              full_count[i][j][k][l][ZERO_TOKEN];
-          model_count[i][j][k][l][ONE_TOKEN] =
-              full_count[i][j][k][l][ONE_TOKEN];
-          model_count[i][j][k][l][TWO_TOKEN] =
-              full_count[i][j][k][l][TWO_TOKEN];
-          for (n = THREE_TOKEN; n < DCT_EOB_TOKEN; ++n)
-            model_count[i][j][k][l][TWO_TOKEN] += full_count[i][j][k][l][n];
-          model_count[i][j][k][l][DCT_EOB_MODEL_TOKEN] =
-              full_count[i][j][k][l][DCT_EOB_TOKEN];
+          vp9_full_to_model_count(model_count[i][j][k][l],
+                                  full_count[i][j][k][l]);
         }
 }
 
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 5d57f149e..e76211a7c 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -174,6 +174,8 @@ typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS]
 typedef unsigned int vp9_coeff_stats_model[REF_TYPES][COEF_BANDS]
                                           [PREV_COEF_CONTEXTS]
                                           [UNCONSTRAINED_NODES][2];
+extern void vp9_full_to_model_count(unsigned int *model_count,
+                                    unsigned int *full_count);
 extern void vp9_full_to_model_counts(
     vp9_coeff_count_model *model_count, vp9_coeff_count *full_count);
 
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index 622f1dcf4..9c390dfd0 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -141,13 +141,6 @@ const vp9_tree_index vp9_uv_mode_tree[VP9_UV_MODES * 2 - 2] = {
   -H_PRED, -TM_PRED
 };
 
-const vp9_tree_index vp9_mv_ref_tree[8] = {
-  -ZEROMV, 2,
-  -NEARESTMV, 4,
-  -NEARMV, 6,
-  -NEWMV, -SPLITMV
-};
-
 const vp9_tree_index vp9_sb_mv_ref_tree[6] = {
   -ZEROMV, 2,
   -NEARESTMV, 4,
@@ -168,7 +161,6 @@ struct vp9_token vp9_sb_kf_ymode_encodings[VP9_I32X32_MODES];
 struct vp9_token vp9_kf_ymode_encodings[VP9_YMODES];
 struct vp9_token vp9_uv_mode_encodings[VP9_UV_MODES];
 
-struct vp9_token vp9_mv_ref_encoding_array[VP9_MVREFS];
 struct vp9_token vp9_sb_mv_ref_encoding_array[VP9_MVREFS];
 
 struct vp9_token vp9_partition_encodings[PARTITION_TYPES];
@@ -265,8 +257,6 @@ void vp9_entropy_mode_init() {
                        vp9_switchable_interp_tree);
   vp9_tokens_from_tree(vp9_partition_encodings, vp9_partition_tree);
 
-  vp9_tokens_from_tree_offset(vp9_mv_ref_encoding_array,
-                              vp9_mv_ref_tree, NEARESTMV);
   vp9_tokens_from_tree_offset(vp9_sb_mv_ref_encoding_array,
                               vp9_sb_mv_ref_tree, NEARESTMV);
 }
diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h
index 8fbc6f20e..7a83c702e 100644
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -29,9 +29,7 @@ extern const vp9_tree_index  vp9_kf_ymode_tree[];
 extern const vp9_tree_index  vp9_uv_mode_tree[];
 #define vp9_sb_ymode_tree vp9_uv_mode_tree
 #define vp9_sb_kf_ymode_tree vp9_uv_mode_tree
-extern const vp9_tree_index  vp9_mv_ref_tree[];
 extern const vp9_tree_index  vp9_sb_mv_ref_tree[];
-extern const vp9_tree_index  vp9_sub_mv_ref_tree[];
 
 extern struct vp9_token vp9_bmode_encodings[VP9_BINTRAMODES];
 extern struct vp9_token vp9_kf_bmode_encodings[VP9_BINTRAMODES];
@@ -43,7 +41,6 @@ extern struct vp9_token vp9_uv_mode_encodings[VP9_UV_MODES];
 
 /* Inter mode values do not start at zero */
 
-extern struct vp9_token vp9_mv_ref_encoding_array[VP9_MVREFS];
 extern struct vp9_token vp9_sb_mv_ref_encoding_array[VP9_MVREFS];
 
 // probability models for partition information
diff --git a/vp9/common/vp9_header.h b/vp9/common/vp9_header.h
deleted file mode 100644
index 96b04e7d7..000000000
--- a/vp9/common/vp9_header.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_COMMON_VP9_HEADER_H_
-#define VP9_COMMON_VP9_HEADER_H_
-
-/* 24 bits total */
-typedef struct {
-  unsigned int type: 1;
-  unsigned int version: 3;
-  unsigned int show_frame: 1;
-
-  /* Allow 2^20 bytes = 8 megabits for first partition */
-
-  unsigned int first_partition_length_in_bytes: 19;
-
-#ifdef PACKET_TESTING
-  unsigned int frame_number;
-  unsigned int update_gold: 1;
-  unsigned int uses_gold: 1;
-  unsigned int update_last: 1;
-  unsigned int uses_last: 1;
-#endif
-} VP9_HEADER;
-
-#ifdef PACKET_TESTING
-#define VP9_HEADER_SIZE 8
-#else
-#define VP9_HEADER_SIZE 3
-#endif
-
-#endif  // VP9_COMMON_VP9_HEADER_H_
diff --git a/vp9/common/vp9_modecont.h b/vp9/common/vp9_modecont.h
index 24f1a6cb3..30deb72d3 100644
--- a/vp9/common/vp9_modecont.h
+++ b/vp9/common/vp9_modecont.h
@@ -11,6 +11,8 @@
 #ifndef VP9_COMMON_VP9_MODECONT_H_
 #define VP9_COMMON_VP9_MODECONT_H_
 
+#include "vp9/common/vp9_entropy.h"
+
 extern const int vp9_default_mode_contexts[INTER_MODE_CONTEXTS][4];
 
 #endif  // VP9_COMMON_VP9_MODECONT_H_
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index c277ea3cb..b148a6377 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -24,10 +24,6 @@
 #include "vp9/common/vp9_postproc.h"
 #endif
 
-/*#ifdef PACKET_TESTING*/
-#include "vp9/common/vp9_header.h"
-/*#endif*/
-
 /* Create/destroy static data structures. */
 
 void vp9_initialize_common(void);
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index dd60a76c7..b1f327b43 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -402,7 +402,7 @@ void vp9_predict_intra_block(MACROBLOCKD *xd,
       (block_idx >> bwl) || xd->up_available;
   const int have_left =
       (block_idx & wmask) || xd->left_available;
-  const int have_right = ((block_idx & wmask) != wmask);
+  int have_right = ((block_idx & wmask) != wmask);
   const int txfm_block_size = 4 << tx_size;
 
   assert(bwl >= 0);
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 6b511b504..813a26cdc 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -362,10 +362,6 @@ static MB_PREDICTION_MODE read_sb_mv_ref(vp9_reader *r, const vp9_prob *p) {
   return (MB_PREDICTION_MODE) treed_read(r, vp9_sb_mv_ref_tree, p);
 }
 
-static MB_PREDICTION_MODE read_mv_ref(vp9_reader *r, const vp9_prob *p) {
-  return (MB_PREDICTION_MODE) treed_read(r, vp9_mv_ref_tree, p);
-}
-
 #ifdef VPX_MODE_COUNT
 unsigned int vp9_mv_cont_count[5][4] = {
   { 0, 0, 0, 0 },
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 70e0c8759..a4a8226a0 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -10,29 +10,30 @@
 
 #include <assert.h>
 
-#include "vp9/decoder/vp9_onyxd_int.h"
+#include "./vp9_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/vpx_scale.h"
+
+#include "vp9/common/vp9_extend.h"
+#include "vp9/common/vp9_modecont.h"
 #include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_header.h"
 #include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_entropy.h"
-#include "vp9/decoder/vp9_decodframe.h"
-#include "vp9/decoder/vp9_detokenize.h"
 #include "vp9/common/vp9_invtrans.h"
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_quant_common.h"
-#include "vpx_scale/vpx_scale.h"
+#include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_tile_common.h"
 
-#include "vp9/decoder/vp9_decodemv.h"
-#include "vp9/common/vp9_extend.h"
-#include "vp9/common/vp9_modecont.h"
-#include "vpx_mem/vpx_mem.h"
 #include "vp9/decoder/vp9_dboolhuff.h"
+#include "vp9/decoder/vp9_decodframe.h"
+#include "vp9/decoder/vp9_detokenize.h"
+#include "vp9/decoder/vp9_decodemv.h"
+#include "vp9/decoder/vp9_onyxd_int.h"
+#include "vp9/decoder/vp9_read_bit_buffer.h"
 
-#include "vp9/common/vp9_seg_common.h"
-#include "vp9/common/vp9_tile_common.h"
-#include "./vp9_rtcd.h"
 
 // #define DEC_DEBUG
 #ifdef DEC_DEBUG
@@ -743,32 +744,24 @@ static INTERPOLATIONFILTERTYPE read_mcomp_filter_type(vp9_reader *r) {
                          : vp9_read_literal(r, 2);
 }
 
-static const uint8_t *read_frame_size(VP9_COMMON *const pc, const uint8_t *data,
-                                      const uint8_t *data_end,
-                                      int *width, int *height) {
-  if (data + 4 < data_end) {
-    const int w = read_le16(data);
-    const int h = read_le16(data + 2);
-    if (w <= 0)
-      vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
-                         "Invalid frame width");
-
-    if (h <= 0)
-      vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
-                         "Invalid frame height");
-    *width = w;
-    *height = h;
-    data += 4;
-  } else {
-    vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
-                       "Failed to read frame size");
-  }
-  return data;
+static void read_frame_size(VP9_COMMON *cm,  // cm->error takes any failure
+                            struct vp9_read_bit_buffer *rb,
+                            int *width, int *height) {  // both are outputs
+  const int w = vp9_rb_read_literal(rb, 16);  // width is read first,
+  const int h = vp9_rb_read_literal(rb, 16);  // then height; 16 bits each
+  if (w <= 0)  // non-positive sizes are reported as a corrupt frame
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Invalid frame width");
+
+  if (h <= 0)
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Invalid frame height");
+  *width = w;
+  *height = h;
 }
 
-static const uint8_t *setup_frame_size(VP9D_COMP *pbi, int scaling_active,
-                                       const uint8_t *data,
-                                       const uint8_t *data_end) {
+static void setup_frame_size(VP9D_COMP *pbi, int scaling_active,
+                             struct vp9_read_bit_buffer *rb) {
   // If error concealment is enabled we should only parse the new size
   // if we have enough data. Otherwise we will end up with the wrong size.
   VP9_COMMON *const pc = &pbi->common;
@@ -778,9 +771,9 @@ static const uint8_t *setup_frame_size(VP9D_COMP *pbi, int scaling_active,
   int height = pc->height;
 
   if (scaling_active)
-    data = read_frame_size(pc, data, data_end, &display_width, &display_height);
+    read_frame_size(pc, rb, &display_width, &display_height);
 
-  data = read_frame_size(pc, data, data_end, &width, &height);
+  read_frame_size(pc, rb, &width, &height);
 
   if (pc->width != width || pc->height != height) {
     if (!pbi->initial_width || !pbi->initial_height) {
@@ -806,8 +799,6 @@ static const uint8_t *setup_frame_size(VP9D_COMP *pbi, int scaling_active,
 
     vp9_update_frame_size(pc);
   }
-
-  return data;
 }
 
 static void update_frame_context(FRAME_CONTEXT *fc) {
@@ -937,59 +928,78 @@ static void decode_tiles(VP9D_COMP *pbi,
   }
 }
 
-int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
-  vp9_reader header_bc, residual_bc;
-  VP9_COMMON *const pc = &pbi->common;
-  MACROBLOCKD *const xd  = &pbi->mb;
-  const uint8_t *data = pbi->source;
-  const uint8_t *data_end = data + pbi->source_sz;
-  size_t first_partition_size = 0;
-  YV12_BUFFER_CONFIG *new_fb = &pc->yv12_fb[pc->new_fb_idx];
-  int i;
 
-  xd->corrupted = 0;  // start with no corruption of current frame
-  new_fb->corrupted = 0;
+static void error_handler(void *data, int bit_offset) {  // bit_offset unused
+  VP9_COMMON *const cm = (VP9_COMMON *)data;  // presumably vp9_decode_frame's pc
+  vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet");
+}
+
+static size_t read_uncompressed_header(VP9D_COMP *pbi,
+                                       struct vp9_read_bit_buffer *rb) {
+  VP9_COMMON *const cm = &pbi->common;
+  // Parses the uncompressed header from rb, in bitstream order, into cm.
+  int scaling_active;
+  cm->last_frame_type = cm->frame_type;
+  cm->frame_type = (FRAME_TYPE) vp9_rb_read_bit(rb);
+  cm->version = vp9_rb_read_literal(rb, 3);
+  cm->show_frame = vp9_rb_read_bit(rb);
+  scaling_active = vp9_rb_read_bit(rb);
+  cm->subsampling_x = vp9_rb_read_bit(rb);
+  cm->subsampling_y = vp9_rb_read_bit(rb);
+  // Key frames additionally carry the three sync-code bytes.
+  if (cm->frame_type == KEY_FRAME) {
+    if (vp9_rb_read_literal(rb, 8) != SYNC_CODE_0 ||
+        vp9_rb_read_literal(rb, 8) != SYNC_CODE_1 ||
+        vp9_rb_read_literal(rb, 8) != SYNC_CODE_2) {
+      vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                         "Invalid frame sync code");
+    }
+  }
+  setup_frame_size(pbi, scaling_active, rb);
 
-  if (data_end - data < 3) {
-    vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet");
+  cm->frame_context_idx = vp9_rb_read_literal(rb, NUM_FRAME_CONTEXTS_LG2);
+  cm->clr_type = (YUV_TYPE)vp9_rb_read_bit(rb);
+  // The two flags below are coded only when error resilience is off.
+  cm->error_resilient_mode = vp9_rb_read_bit(rb);
+  if (!cm->error_resilient_mode) {
+    cm->refresh_frame_context = vp9_rb_read_bit(rb);
+    cm->frame_parallel_decoding_mode = vp9_rb_read_bit(rb);
   } else {
-    int scaling_active;
-    pc->last_frame_type = pc->frame_type;
-    pc->frame_type = (FRAME_TYPE)(data[0] & 1);
-    pc->version = (data[0] >> 1) & 7;
-    pc->show_frame = (data[0] >> 4) & 1;
-    scaling_active = (data[0] >> 5) & 1;
-    pc->subsampling_x = (data[0] >> 6) & 1;
-    pc->subsampling_y = (data[0] >> 7) & 1;
-    first_partition_size = read_le16(data + 1);
-
-    if (!read_is_valid(data, first_partition_size, data_end))
-      vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
-                         "Truncated packet or corrupt partition 0 length");
+    cm->refresh_frame_context = 0;
+    cm->frame_parallel_decoding_mode = 1;
+  }
 
-    data += 3;
+  return vp9_rb_read_literal(rb, 16);  // caller uses it as first partition size
+}
 
-    vp9_setup_version(pc);
+int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
+  int i;
+  vp9_reader header_bc, residual_bc;
+  VP9_COMMON *const pc = &pbi->common;
+  MACROBLOCKD *const xd = &pbi->mb;
+  YV12_BUFFER_CONFIG *new_fb = &pc->yv12_fb[pc->new_fb_idx];
+  const uint8_t *data = pbi->source;
+  const uint8_t *data_end = pbi->source + pbi->source_sz;
 
-    if (pc->frame_type == KEY_FRAME) {
-      // When error concealment is enabled we should only check the sync
-      // code if we have enough bits available
-      if (data + 3 < data_end) {
-        if (data[0] != 0x49 || data[1] != 0x83 || data[2] != 0x42)
-          vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM,
-                             "Invalid frame sync code");
-      }
-      data += 3;
-    }
+  struct vp9_read_bit_buffer rb = { data, data_end, 0,
+                                    pc, error_handler };
+  const size_t first_partition_size = read_uncompressed_header(pbi, &rb);
+  const int keyframe = pc->frame_type == KEY_FRAME;
 
-    data = setup_frame_size(pbi, scaling_active, data, data_end);
-  }
+  data += vp9_rb_bytes_read(&rb);
+  xd->corrupted = 0;
+  new_fb->corrupted = 0;
 
-  if ((!pbi->decoded_key_frame && pc->frame_type != KEY_FRAME) ||
+  if ((!pbi->decoded_key_frame && !keyframe) ||
       pc->width == 0 || pc->height == 0) {
     return -1;
   }
 
+  vp9_setup_version(pc);
+  if (!read_is_valid(data, first_partition_size, data_end))
+      vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                         "Truncated packet or corrupt partition 0 length");
+
   init_frame(pbi);
 
   // Reset the frame pointers to the current frame size
@@ -1001,9 +1011,6 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
     vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate bool decoder 0");
 
-  pc->clr_type = (YUV_TYPE)vp9_read_bit(&header_bc);
-  pc->error_resilient_mode = vp9_read_bit(&header_bc);
-
   setup_loopfilter(pc, xd, &header_bc);
 
   setup_quantization(pbi, &header_bc);
@@ -1025,7 +1032,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
   // Determine if the golden frame or ARF buffer should be updated and how.
   // For all non key frames the GF and ARF refresh flags and sign bias
   // flags must be set explicitly.
-  if (pc->frame_type == KEY_FRAME) {
+  if (keyframe) {
     for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i)
       pc->active_ref_idx[i] = pc->new_fb_idx;
   } else {
@@ -1050,15 +1057,6 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
     vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc);
   }
 
-  if (!pc->error_resilient_mode) {
-    pc->refresh_frame_context = vp9_read_bit(&header_bc);
-    pc->frame_parallel_decoding_mode = vp9_read_bit(&header_bc);
-  } else {
-    pc->refresh_frame_context = 0;
-    pc->frame_parallel_decoding_mode = 1;
-  }
-
-  pc->frame_context_idx = vp9_read_literal(&header_bc, NUM_FRAME_CONTEXTS_LG2);
   pc->fc = pc->frame_contexts[pc->frame_context_idx];
 
   setup_segmentation(pc, xd, &header_bc);
@@ -1068,7 +1066,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
   setup_txfm_mode(pc, xd->lossless, &header_bc);
 
   // Read inter mode probability context updates
-  if (pc->frame_type != KEY_FRAME) {
+  if (!keyframe) {
     int i, j;
     for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
       for (j = 0; j < 4; ++j)
@@ -1076,7 +1074,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
           pc->fc.vp9_mode_contexts[i][j] = vp9_read_prob(&header_bc);
   }
   // Is this needed ?
-  if (pc->frame_type == KEY_FRAME)
+  if (keyframe)
     vp9_default_coef_probs(pc);
 
   update_frame_context(&pc->fc);
@@ -1109,7 +1107,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
   new_fb->corrupted = vp9_reader_has_error(&header_bc) | xd->corrupted;
 
   if (!pbi->decoded_key_frame) {
-    if (pc->frame_type == KEY_FRAME && !new_fb->corrupted)
+    if (keyframe && !new_fb->corrupted)
       pbi->decoded_key_frame = 1;
     else
       vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
@@ -1120,7 +1118,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
   if (!pc->error_resilient_mode && !pc->frame_parallel_decoding_mode) {
     vp9_adapt_coef_probs(pc);
 
-    if (pc->frame_type != KEY_FRAME) {
+    if (!keyframe) {
       vp9_adapt_mode_probs(pc);
       vp9_adapt_mode_context(pc);
       vp9_adapt_nmv_probs(pc, xd->allow_high_precision_mv);
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 02900c08c..890d5d0d6 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -58,13 +58,15 @@ static const vp9_prob cat6_prob[15] = {
   254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0
 };
 
+DECLARE_ALIGNED(16, extern const uint8_t,
+                vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
 #define INCREMENT_COUNT(token)               \
   do {                                       \
     coef_counts[type][ref][band][pt]         \
                [token >= TWO_TOKEN ?     \
                 (token == DCT_EOB_TOKEN ? DCT_EOB_MODEL_TOKEN : TWO_TOKEN) : \
                 token]++;     \
-    token_cache[scan[c]] = token; \
+    token_cache[scan[c]] = vp9_pt_energy_class[token]; \
   } while (0)
 
 #define WRITE_COEF_CONTINUE(val, token)                  \
diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c
index 24f9ca3f9..3cef88bcd 100644
--- a/vp9/decoder/vp9_onyxd_if.c
+++ b/vp9/decoder/vp9_onyxd_if.c
@@ -387,9 +387,6 @@ int vp9_receive_compressed_data(VP9D_PTR ptr,
     cm->current_video_frame++;
   }
 
-  /*vp9_print_modes_and_motion_vectors(cm->mi, cm->mb_rows,cm->mb_cols,
-                                       cm->current_video_frame);*/
-
   pbi->ready_for_new_data = 0;
   pbi->last_time_stamp = time_stamp;
   pbi->source_sz = 0;
diff --git a/vp9/decoder/vp9_read_bit_buffer.h b/vp9/decoder/vp9_read_bit_buffer.h
new file mode 100644
index 000000000..fa2dbee8d
--- /dev/null
+++ b/vp9/decoder/vp9_read_bit_buffer.h
@@ -0,0 +1,69 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_READ_BIT_BUFFER_
+#define VP9_READ_BIT_BUFFER_
+
+#include <limits.h>
+#include <stddef.h>
+
+#include "vpx/vpx_integer.h"
+
+// Callback invoked when a read is attempted past the end of the buffer.
+// |data| is the reader's error_handler_data; |bit_offset| is the offset of
+// the bit that could not be read.
+typedef void (*vp9_rb_error_handler)(void *data, int bit_offset);
+
+// MSB-first bit reader over the byte range [bit_buffer, bit_buffer_end).
+struct vp9_read_bit_buffer {
+  const uint8_t *bit_buffer;      // first byte of the input
+  const uint8_t *bit_buffer_end;  // one past the last readable byte
+  size_t bit_offset;              // index of the next bit to read
+
+  void *error_handler_data;            // passed through to error_handler
+  vp9_rb_error_handler error_handler;  // may be NULL (errors then ignored)
+};
+
+// Returns the number of bytes touched so far, i.e. bit_offset rounded up to
+// a whole byte.
+static size_t vp9_rb_bytes_read(struct vp9_read_bit_buffer *rb) {
+  return rb->bit_offset / CHAR_BIT + (rb->bit_offset % CHAR_BIT > 0);
+}
+
+// Reads one bit and advances the reader. On a read past the end of the
+// buffer the error handler (if any) is called, the offset is left unchanged
+// and 0 is returned.
+static int vp9_rb_read_bit(struct vp9_read_bit_buffer *rb) {
+  // Keep the offset in size_t: narrowing it to int would wrap for large
+  // offsets and defeat the bounds check below.
+  const size_t off = rb->bit_offset;
+  const size_t p = off / CHAR_BIT;
+  const int q = CHAR_BIT - 1 - (int)(off % CHAR_BIT);
+  if (rb->bit_buffer + p >= rb->bit_buffer_end) {
+    if (rb->error_handler != NULL)
+      rb->error_handler(rb->error_handler_data, (int)off);
+    return 0;
+  } else {
+    const int bit = (rb->bit_buffer[p] >> q) & 1;
+    rb->bit_offset = off + 1;
+    return bit;
+  }
+}
+
+// Reads |bits| bits, most significant first, and returns them as an
+// integer. Bits past the end of the buffer read as 0.
+static int vp9_rb_read_literal(struct vp9_read_bit_buffer *rb, int bits) {
+  int value = 0, bit;
+  for (bit = bits - 1; bit >= 0; bit--)
+    value |= vp9_rb_read_bit(rb) << bit;
+  return value;
+}
+
+#endif  // VP9_READ_BIT_BUFFER_
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 15253406d..40489d59c 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -12,35 +12,37 @@
 #include <stdio.h>
 #include <limits.h>
 
-#include "vp9/common/vp9_header.h"
-#include "vp9/encoder/vp9_encodemv.h"
+#include "vpx/vpx_encoder.h"
+#include "vpx_mem/vpx_mem.h"
+
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_findnearmv.h"
 #include "vp9/common/vp9_tile_common.h"
-#include "vp9/encoder/vp9_mcomp.h"
-#include "vp9/common/vp9_systemdependent.h"
-#include "vp9/common/vp9_pragmas.h"
-#include "vpx/vpx_encoder.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp9/encoder/vp9_bitstream.h"
-#include "vp9/encoder/vp9_segmentation.h"
-
 #include "vp9/common/vp9_seg_common.h"
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_mvref_common.h"
 #include "vp9/common/vp9_treecoder.h"
+#include "vp9/common/vp9_systemdependent.h"
+#include "vp9/common/vp9_pragmas.h"
+
+#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_bitstream.h"
+#include "vp9/encoder/vp9_segmentation.h"
+#include "vp9/encoder/vp9_write_bit_buffer.h"
+
 
 #if defined(SECTIONBITS_OUTPUT)
 unsigned __int64 Sectionbits[500];
 #endif
 
 #ifdef ENTROPY_STATS
-int intra_mode_stats[VP9_KF_BINTRAMODES]
-                    [VP9_KF_BINTRAMODES]
-                    [VP9_KF_BINTRAMODES];
+int intra_mode_stats[VP9_BINTRAMODES]
+                    [VP9_BINTRAMODES]
+                    [VP9_BINTRAMODES];
 vp9_coeff_stats tree_update_hist_4x4[BLOCK_TYPES];
 vp9_coeff_stats tree_update_hist_8x8[BLOCK_TYPES];
 vp9_coeff_stats tree_update_hist_16x16[BLOCK_TYPES];
@@ -524,15 +526,6 @@ static void pack_mb_tokens(vp9_writer* const bc,
   *tp = p;
 }
 
-static void write_mv_ref(vp9_writer *bc, MB_PREDICTION_MODE m,
-                         const vp9_prob *p) {
-#if CONFIG_DEBUG
-  assert(NEARESTMV <= m  &&  m <= SPLITMV);
-#endif
-  write_token(bc, vp9_mv_ref_tree, p,
-              vp9_mv_ref_encoding_array - NEARESTMV + m);
-}
-
 static void write_sb_mv_ref(vp9_writer *bc, MB_PREDICTION_MODE m,
                             const vp9_prob *p) {
 #if CONFIG_DEBUG
@@ -1045,7 +1038,7 @@ static void print_prob_tree(vp9_coeff_probs *coef_probs, int block_types) {
   fclose(f);
 }
 
-static void build_tree_distribution(vp9_coeff_probs *coef_probs,
+static void build_tree_distribution(vp9_coeff_probs_model *coef_probs,
                                     vp9_coeff_count *coef_counts,
                                     unsigned int (*eob_branch_ct)[REF_TYPES]
                                                                  [COEF_BANDS]
@@ -1054,12 +1047,13 @@ static void build_tree_distribution(vp9_coeff_probs *coef_probs,
                                     VP9_COMP *cpi,
                                     vp9_coeff_accum *context_counters,
 #endif
-                                    vp9_coeff_stats *coef_branch_ct,
+                                    vp9_coeff_stats_model *coef_branch_ct,
                                     int block_types) {
   int i, j, k, l;
 #ifdef ENTROPY_STATS
   int t = 0;
 #endif
+  unsigned int model_counts[UNCONSTRAINED_NODES + 1];
 
   for (i = 0; i < block_types; ++i) {
     for (j = 0; j < REF_TYPES; ++j) {
@@ -1067,10 +1061,11 @@ static void build_tree_distribution(vp9_coeff_probs *coef_probs,
         for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
           if (l >= 3 && k == 0)
             continue;
-          vp9_tree_probs_from_distribution(vp9_coef_tree,
+          vp9_full_to_model_count(model_counts, coef_counts[i][j][k][l]);
+          vp9_tree_probs_from_distribution(vp9_coefmodel_tree,
                                            coef_probs[i][j][k][l],
                                            coef_branch_ct[i][j][k][l],
-                                           coef_counts[i][j][k][l], 0);
+                                           model_counts, 0);
           coef_branch_ct[i][j][k][l][0][1] = eob_branch_ct[i][j][k][l] -
                                              coef_branch_ct[i][j][k][l][0][0];
           coef_probs[i][j][k][l][0] =
@@ -1127,9 +1122,9 @@ static void update_coef_probs_common(
 #ifdef ENTROPY_STATS
     vp9_coeff_stats *tree_update_hist,
 #endif
-    vp9_coeff_probs *new_frame_coef_probs,
+    vp9_coeff_probs_model *new_frame_coef_probs,
     vp9_coeff_probs_model *old_frame_coef_probs,
-    vp9_coeff_stats *frame_branch_ct,
+    vp9_coeff_stats_model *frame_branch_ct,
     TX_SIZE tx_size) {
   int i, j, k, l, t;
   int update[2] = {0, 0};
@@ -1278,10 +1273,6 @@ static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) {
   }
 }
 
-#ifdef PACKET_TESTING
-FILE *vpxlogc = 0;
-#endif
-
 static void decide_kf_ymode_entropy(VP9_COMP *cpi) {
   int mode_cost[MB_MODE_COUNT];
   int bestcost = INT_MAX;
@@ -1482,60 +1473,63 @@ static void encode_segmentation(VP9_COMP *cpi, vp9_writer *w) {
   }
 }
 
-void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) {
-  int i;
-  VP9_HEADER oh;
-  VP9_COMMON *const pc = &cpi->common;
-  vp9_writer header_bc, residual_bc;
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-  int extra_bytes_packed = 0;
 
-  uint8_t *cx_data = dest;
+void write_uncompressed_header(VP9_COMMON *cm,
+                               struct vp9_write_bit_buffer *wb) {
+  const int scaling_active = cm->width != cm->display_width ||
+                             cm->height != cm->display_height;
 
-  oh.show_frame = (int) pc->show_frame;
-  oh.type = (int)pc->frame_type;
-  oh.version = pc->version;
-  oh.first_partition_length_in_bytes = 0;
+  vp9_wb_write_bit(wb, cm->frame_type);
+  vp9_wb_write_literal(wb, cm->version, 3);
+  vp9_wb_write_bit(wb, cm->show_frame);
+  vp9_wb_write_bit(wb, scaling_active);
+  vp9_wb_write_bit(wb, cm->subsampling_x);
+  vp9_wb_write_bit(wb, cm->subsampling_y);
 
-  cx_data += 3;
+  if (cm->frame_type == KEY_FRAME) {
+    vp9_wb_write_literal(wb, SYNC_CODE_0, 8);
+    vp9_wb_write_literal(wb, SYNC_CODE_1, 8);
+    vp9_wb_write_literal(wb, SYNC_CODE_2, 8);
+  }
 
-#if defined(SECTIONBITS_OUTPUT)
-  Sectionbits[active_section = 1] += sizeof(VP9_HEADER) * 8 * 256;
-#endif
+  if (scaling_active) {
+    vp9_wb_write_literal(wb, cm->display_width, 16);
+    vp9_wb_write_literal(wb, cm->display_height, 16);
+  }
 
-  compute_update_table();
+  vp9_wb_write_literal(wb, cm->width, 16);
+  vp9_wb_write_literal(wb, cm->height, 16);
 
-  /* every keyframe send startcode, width, height, scale factor, clamp
-   * and color type.
-   */
-  if (oh.type == KEY_FRAME) {
-    // Start / synch code
-    cx_data[0] = 0x49;
-    cx_data[1] = 0x83;
-    cx_data[2] = 0x42;
-    extra_bytes_packed = 3;
-    cx_data += extra_bytes_packed;
-  }
+  vp9_wb_write_literal(wb, cm->frame_context_idx, NUM_FRAME_CONTEXTS_LG2);
+  vp9_wb_write_bit(wb, cm->clr_type);
 
-  if (pc->width != pc->display_width || pc->height != pc->display_height) {
-    write_le16(cx_data, pc->display_width);
-    write_le16(cx_data + 2, pc->display_height);
-    cx_data += 4;
-    extra_bytes_packed += 4;
+  vp9_wb_write_bit(wb, cm->error_resilient_mode);
+  if (!cm->error_resilient_mode) {
+    vp9_wb_write_bit(wb, cm->refresh_frame_context);
+    vp9_wb_write_bit(wb, cm->frame_parallel_decoding_mode);
   }
+}
+
+void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) {
+  int i, bytes_packed;
+  VP9_COMMON *const pc = &cpi->common;
+  vp9_writer header_bc, residual_bc;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
 
-  write_le16(cx_data, pc->width);
-  write_le16(cx_data + 2, pc->height);
-  extra_bytes_packed += 4;
-  cx_data += 4;
+  uint8_t *cx_data = dest;
+  struct vp9_write_bit_buffer wb = {dest, 0};
+  struct vp9_write_bit_buffer first_partition_size_wb;
 
-  vp9_start_encode(&header_bc, cx_data);
+  write_uncompressed_header(pc, &wb);
+  first_partition_size_wb = wb;
+  vp9_wb_write_literal(&wb, 0, 16);  // don't know in advance first part. size
+
+  bytes_packed = vp9_rb_bytes_written(&wb);
+  cx_data += bytes_packed;
 
-  // TODO(jkoleszar): remove these two unused bits?
-  vp9_write_bit(&header_bc, pc->clr_type);
+  compute_update_table();
 
-  // error resilient mode
-  vp9_write_bit(&header_bc, pc->error_resilient_mode);
+  vp9_start_encode(&header_bc, cx_data);
 
   encode_loopfilter(pc, xd, &header_bc);
 
@@ -1617,14 +1611,6 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) {
       vp9_write_literal(&header_bc, (pc->mcomp_filter_type), 2);
   }
 
-  if (!pc->error_resilient_mode) {
-    vp9_write_bit(&header_bc, pc->refresh_frame_context);
-    vp9_write_bit(&header_bc, pc->frame_parallel_decoding_mode);
-  }
-
-  vp9_write_literal(&header_bc, pc->frame_context_idx,
-                    NUM_FRAME_CONTEXTS_LG2);
-
 #ifdef ENTROPY_STATS
   if (pc->frame_type == INTER_FRAME)
     active_section = 0;
@@ -1820,27 +1806,11 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) {
 
   vp9_stop_encode(&header_bc);
 
-  oh.first_partition_length_in_bytes = header_bc.pos;
-
-  /* update frame tag */
-  {
-    int scaling = (pc->width != pc->display_width ||
-                   pc->height != pc->display_height);
-    int v = (oh.first_partition_length_in_bytes << 8) |
-            (pc->subsampling_y << 7) |
-            (pc->subsampling_x << 6) |
-            (scaling << 5) |
-            (oh.show_frame << 4) |
-            (oh.version << 1) |
-            oh.type;
-
-    assert(oh.first_partition_length_in_bytes <= 0xffff);
-    dest[0] = v;
-    dest[1] = v >> 8;
-    dest[2] = v >> 16;
-  }
 
-  *size = VP9_HEADER_SIZE + extra_bytes_packed + header_bc.pos;
+  // first partition size
+  assert(header_bc.pos <= 0xffff);
+  vp9_wb_write_literal(&first_partition_size_wb, header_bc.pos, 16);
+  *size = bytes_packed + header_bc.pos;
 
   if (pc->frame_type == KEY_FRAME) {
     decide_kf_ymode_entropy(cpi);
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 211eca4b4..0e9b6804c 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -117,7 +117,6 @@ struct macroblock {
   int mbmode_cost[2][MB_MODE_COUNT];
   int intra_uv_mode_cost[2][MB_MODE_COUNT];
   int bmode_costs[VP9_BINTRAMODES][VP9_BINTRAMODES][VP9_BINTRAMODES];
-  int inter_bmode_costs[INTRA_MODE_COUNT];
   int switchable_interp_costs[VP9_SWITCHABLE_FILTERS + 1]
                              [VP9_SWITCHABLE_FILTERS];
 
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index b7f60b127..755ff21bf 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -20,6 +20,9 @@
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9_rtcd.h"
 
+DECLARE_ALIGNED(16, extern const uint8_t,
+                vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
+
 void vp9_subtract_block(int rows, int cols,
                         int16_t *diff_ptr, int diff_stride,
                         const uint8_t *src_ptr, int src_stride,
@@ -105,7 +108,7 @@ static int trellis_get_coeff_context(const int *scan,
                                      uint8_t *token_cache,
                                      int pad, int l) {
   int bak = token_cache[scan[idx]], pt;
-  token_cache[scan[idx]] = token;
+  token_cache[scan[idx]] = vp9_pt_energy_class[token];
   pt = vp9_get_coef_context(scan, nb, pad, token_cache, idx + 1, l);
   token_cache[scan[idx]] = bak;
   return pt;
@@ -189,7 +192,8 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
   *(tokens[eob] + 1) = *(tokens[eob] + 0);
   next = eob;
   for (i = 0; i < eob; i++)
-    token_cache[scan[i]] = vp9_dct_value_tokens_ptr[qcoeff_ptr[scan[i]]].token;
+    token_cache[scan[i]] = vp9_pt_energy_class[vp9_dct_value_tokens_ptr[
+        qcoeff_ptr[scan[i]]].token];
   nb = vp9_get_coef_neighbors_handle(scan, &pad);
 
   for (i = eob; i-- > i0;) {
@@ -610,6 +614,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
   struct encode_b_args* const args = arg;
   MACROBLOCK* const x = args->x;
   MACROBLOCKD* const xd = &x->e_mbd;
+  MB_MODE_INFO* const mbmi = &xd->mode_info_context->mbmi;
   const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size / 2);
   const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x);
   const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
@@ -634,9 +639,9 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
   TX_TYPE tx_type;
   int mode, b_mode;
 
-  mode = plane == 0? xd->mode_info_context->mbmi.mode:
-                     xd->mode_info_context->mbmi.uv_mode;
-  if (bsize <= BLOCK_SIZE_SB8X8 && mode == I4X4_PRED && plane == 0)
+  mode = plane == 0? mbmi->mode: mbmi->uv_mode;
+  if (mbmi->sb_type < BLOCK_SIZE_SB8X8 && plane == 0 &&
+      mbmi->ref_frame == INTRA_FRAME)
     b_mode = xd->mode_info_context->bmi[ib].as_mode.first;
   else
     b_mode = mode;
diff --git a/vp9/encoder/vp9_modecosts.c b/vp9/encoder/vp9_modecosts.c
index 171b44bf9..67658f575 100644
--- a/vp9/encoder/vp9_modecosts.c
+++ b/vp9/encoder/vp9_modecosts.c
@@ -17,7 +17,6 @@
 
 void vp9_init_mode_costs(VP9_COMP *c) {
   VP9_COMMON *x = &c->common;
-  const vp9_tree_p T = vp9_bmode_tree;
   const vp9_tree_p KT = vp9_bmode_tree;
   int i, j;
 
@@ -28,8 +27,6 @@ void vp9_init_mode_costs(VP9_COMP *c) {
     }
   }
 
-  vp9_cost_tokens((int *)c->mb.inter_bmode_costs, x->fc.bmode_prob, T);
-
   // TODO(rbultje) separate tables for superblock costing?
   vp9_cost_tokens(c->mb.mbmode_cost[1], x->fc.sb_ymode_prob,
                   vp9_sb_ymode_tree);
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index d42bcbb7e..b484925bb 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -103,9 +103,9 @@ extern int skip_false_count;
 
 
 #ifdef ENTROPY_STATS
-extern int intra_mode_stats[VP9_KF_BINTRAMODES]
-                           [VP9_KF_BINTRAMODES]
-                           [VP9_KF_BINTRAMODES];
+extern int intra_mode_stats[VP9_BINTRAMODES]
+                           [VP9_BINTRAMODES]
+                           [VP9_BINTRAMODES];
 #endif
 
 #ifdef NMV_STATS
@@ -258,9 +258,6 @@ void vp9_initialize_enc() {
     init_done = 1;
   }
 }
-#ifdef PACKET_TESTING
-extern FILE *vpxlogc;
-#endif
 
 static void setup_features(VP9_COMP *cpi) {
   MACROBLOCKD *xd = &cpi->mb.e_mbd;
@@ -1756,18 +1753,18 @@ void vp9_remove_compressor(VP9_PTR *ptr) {
 
       fprintf(fmode, "\n#include \"vp9_entropymode.h\"\n\n");
       fprintf(fmode, "const unsigned int vp9_kf_default_bmode_counts ");
-      fprintf(fmode, "[VP9_KF_BINTRAMODES][VP9_KF_BINTRAMODES]"
-                     "[VP9_KF_BINTRAMODES] =\n{\n");
+      fprintf(fmode, "[VP9_BINTRAMODES][VP9_BINTRAMODES]"
+                     "[VP9_BINTRAMODES] =\n{\n");
 
-      for (i = 0; i < VP9_KF_BINTRAMODES; i++) {
+      for (i = 0; i < VP9_BINTRAMODES; i++) {
 
         fprintf(fmode, "    { // Above Mode :  %d\n", i);
 
-        for (j = 0; j < VP9_KF_BINTRAMODES; j++) {
+        for (j = 0; j < VP9_BINTRAMODES; j++) {
 
           fprintf(fmode, "        {");
 
-          for (k = 0; k < VP9_KF_BINTRAMODES; k++) {
+          for (k = 0; k < VP9_BINTRAMODES; k++) {
             if (!intra_mode_stats[i][j][k])
               fprintf(fmode, " %5d, ", 1);
             else
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 15f9571bb..24a2acbd5 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -422,20 +422,20 @@ typedef struct VP9_COMP {
   nmv_context_counts NMVcount;
 
   vp9_coeff_count coef_counts_4x4[BLOCK_TYPES];
-  vp9_coeff_probs frame_coef_probs_4x4[BLOCK_TYPES];
-  vp9_coeff_stats frame_branch_ct_4x4[BLOCK_TYPES];
+  vp9_coeff_probs_model frame_coef_probs_4x4[BLOCK_TYPES];
+  vp9_coeff_stats_model frame_branch_ct_4x4[BLOCK_TYPES];
 
   vp9_coeff_count coef_counts_8x8[BLOCK_TYPES];
-  vp9_coeff_probs frame_coef_probs_8x8[BLOCK_TYPES];
-  vp9_coeff_stats frame_branch_ct_8x8[BLOCK_TYPES];
+  vp9_coeff_probs_model frame_coef_probs_8x8[BLOCK_TYPES];
+  vp9_coeff_stats_model frame_branch_ct_8x8[BLOCK_TYPES];
 
   vp9_coeff_count coef_counts_16x16[BLOCK_TYPES];
-  vp9_coeff_probs frame_coef_probs_16x16[BLOCK_TYPES];
-  vp9_coeff_stats frame_branch_ct_16x16[BLOCK_TYPES];
+  vp9_coeff_probs_model frame_coef_probs_16x16[BLOCK_TYPES];
+  vp9_coeff_stats_model frame_branch_ct_16x16[BLOCK_TYPES];
 
   vp9_coeff_count coef_counts_32x32[BLOCK_TYPES];
-  vp9_coeff_probs frame_coef_probs_32x32[BLOCK_TYPES];
-  vp9_coeff_stats frame_branch_ct_32x32[BLOCK_TYPES];
+  vp9_coeff_probs_model frame_coef_probs_32x32[BLOCK_TYPES];
+  vp9_coeff_stats_model frame_branch_ct_32x32[BLOCK_TYPES];
 
   int gfu_boost;
   int last_boost;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index cc1c480da..52b4cc39e 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -46,6 +46,9 @@
 /* Factor to weigh the rate for switchable interp filters */
 #define SWITCHABLE_INTERP_RATE_FACTOR 1
 
+DECLARE_ALIGNED(16, extern const uint8_t,
+                vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
+
 const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
   {ZEROMV,    LAST_FRAME,   NONE},
   {DC_PRED,   INTRA_FRAME,  NONE},
@@ -366,7 +369,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
 
       if (!c || token_cache[scan[c - 1]])
         cost += vp9_cost_bit(coef_probs[band][pt][0], 1);
-      token_cache[scan[c]] = t;
+      token_cache[scan[c]] = vp9_pt_energy_class[t];
     }
     if (c < seg_eob) {
       if (c)
@@ -611,7 +614,6 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
     int64_t this_rd;
     int ratey = 0;
 
-    xd->mode_info_context->bmi[ib].as_mode.first = mode;
     if (cm->frame_type == KEY_FRAME)
       rate = bmode_costs[mode];
     else
@@ -653,9 +655,6 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
         distortion += vp9_block_error(coeff, BLOCK_OFFSET(xd->plane[0].dqcoeff,
                                                          block, 16), 16) >> 2;
 
-        vp9_intra4x4_predict(xd, block, BLOCK_SIZE_SB8X8, mode,
-                             dst, xd->plane[0].dst.stride);
-
         if (best_tx_type != DCT_DCT)
           vp9_short_iht4x4_add(BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16),
                                dst, xd->plane[0].dst.stride, best_tx_type);
@@ -731,7 +730,7 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
   vpx_memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
 
   xd->mode_info_context->mbmi.mode = I4X4_PRED;
-  bmode_costs = mb->inter_bmode_costs;
+  bmode_costs = mb->mbmode_cost[cpi->common.frame_type];
 
   for (idy = 0; idy < 2; idy += bh) {
     for (idx = 0; idx < 2; idx += bw) {
@@ -927,8 +926,8 @@ int vp9_cost_mv_ref(VP9_COMP *cpi,
     vp9_prob p [VP9_MVREFS - 1];
     assert(NEARESTMV <= m  &&  m <= SPLITMV);
     vp9_mv_ref_probs(pc, p, mode_context);
-    return cost_token(vp9_mv_ref_tree, p,
-                      vp9_mv_ref_encoding_array - NEARESTMV + m);
+    return cost_token(vp9_sb_mv_ref_tree, p,
+                      vp9_sb_mv_ref_encoding_array - NEARESTMV + m);
   } else
     return 0;
 }
@@ -938,8 +937,7 @@ void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv) {
   x->e_mbd.mode_info_context->mbmi.mv[0].as_int = mv->as_int;
 }
 
-static int labels2mode(MACROBLOCK *x,
-                       int const *labelings, int which_label,
+static int labels2mode(MACROBLOCK *x, int i,
                        MB_PREDICTION_MODE this_mode,
                        int_mv *this_mv, int_mv *this_second_mv,
                        int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
@@ -950,7 +948,7 @@ static int labels2mode(MACROBLOCK *x,
   MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO *const mic = xd->mode_info_context;
   MB_MODE_INFO * mbmi = &mic->mbmi;
-  int i, cost = 0, thismvcost = 0;
+  int cost = 0, thismvcost = 0;
   int idx, idy;
   int bw = 1 << b_width_log2(mbmi->sb_type);
   int bh = 1 << b_height_log2(mbmi->sb_type);
@@ -958,72 +956,64 @@ static int labels2mode(MACROBLOCK *x,
   /* We have to be careful retrieving previously-encoded motion vectors.
    Ones from this macroblock have to be pulled from the BLOCKD array
    as they have not yet made it to the bmi array in our MB_MODE_INFO. */
-  for (i = 0; i < 4; ++i) {
-    MB_PREDICTION_MODE m;
+  MB_PREDICTION_MODE m;
 
-    if (labelings[i] != which_label)
-      continue;
-
-    {
-      // the only time we should do costing for new motion vector or mode
-      // is when we are on a new label  (jbb May 08, 2007)
-      switch (m = this_mode) {
-        case NEWMV:
-          if (mbmi->second_ref_frame > 0) {
-            this_mv->as_int = seg_mvs[mbmi->ref_frame - 1].as_int;
-            this_second_mv->as_int =
-            seg_mvs[mbmi->second_ref_frame - 1].as_int;
-          }
+  // the only time we should do costing for new motion vector or mode
+  // is when we are on a new label  (jbb May 08, 2007)
+  switch (m = this_mode) {
+    case NEWMV:
+      if (mbmi->second_ref_frame > 0) {
+        this_mv->as_int = seg_mvs[mbmi->ref_frame - 1].as_int;
+        this_second_mv->as_int = seg_mvs[mbmi->second_ref_frame - 1].as_int;
+      }
 
-          thismvcost  = vp9_mv_bit_cost(this_mv, best_ref_mv, mvjcost, mvcost,
-                                        102, xd->allow_high_precision_mv);
-          if (mbmi->second_ref_frame > 0) {
-            thismvcost += vp9_mv_bit_cost(this_second_mv, second_best_ref_mv,
-                                          mvjcost, mvcost, 102,
-                                          xd->allow_high_precision_mv);
-          }
-          break;
-        case NEARESTMV:
-          this_mv->as_int = frame_mv[NEARESTMV][mbmi->ref_frame].as_int;
-          if (mbmi->second_ref_frame > 0)
-            this_second_mv->as_int =
-                frame_mv[NEARESTMV][mbmi->second_ref_frame].as_int;
-          break;
-        case NEARMV:
-          this_mv->as_int = frame_mv[NEARMV][mbmi->ref_frame].as_int;
-          if (mbmi->second_ref_frame > 0)
-            this_second_mv->as_int =
-                frame_mv[NEARMV][mbmi->second_ref_frame].as_int;
-          break;
-        case ZEROMV:
-          this_mv->as_int = 0;
-          if (mbmi->second_ref_frame > 0)
-            this_second_mv->as_int = 0;
-          break;
-        default:
-          break;
+      thismvcost  = vp9_mv_bit_cost(this_mv, best_ref_mv, mvjcost, mvcost,
+                                    102, xd->allow_high_precision_mv);
+      if (mbmi->second_ref_frame > 0) {
+        thismvcost += vp9_mv_bit_cost(this_second_mv, second_best_ref_mv,
+                                      mvjcost, mvcost, 102,
+                                      xd->allow_high_precision_mv);
       }
+      break;
+    case NEARESTMV:
+      this_mv->as_int = frame_mv[NEARESTMV][mbmi->ref_frame].as_int;
+      if (mbmi->second_ref_frame > 0)
+        this_second_mv->as_int =
+            frame_mv[NEARESTMV][mbmi->second_ref_frame].as_int;
+      break;
+    case NEARMV:
+      this_mv->as_int = frame_mv[NEARMV][mbmi->ref_frame].as_int;
+      if (mbmi->second_ref_frame > 0)
+        this_second_mv->as_int =
+            frame_mv[NEARMV][mbmi->second_ref_frame].as_int;
+      break;
+    case ZEROMV:
+      this_mv->as_int = 0;
+      if (mbmi->second_ref_frame > 0)
+        this_second_mv->as_int = 0;
+      break;
+    default:
+      break;
+  }
 
-      cost = vp9_cost_mv_ref(cpi, this_mode,
-                             mbmi->mb_mode_context[mbmi->ref_frame]);
-    }
+  cost = vp9_cost_mv_ref(cpi, this_mode,
+                         mbmi->mb_mode_context[mbmi->ref_frame]);
 
-    mic->bmi[i].as_mv[0].as_int = this_mv->as_int;
-    if (mbmi->second_ref_frame > 0)
-      mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int;
+  mic->bmi[i].as_mv[0].as_int = this_mv->as_int;
+  if (mbmi->second_ref_frame > 0)
+    mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int;
 
-    x->partition_info->bmi[i].mode = m;
-    x->partition_info->bmi[i].mv.as_int = this_mv->as_int;
-    if (mbmi->second_ref_frame > 0)
-      x->partition_info->bmi[i].second_mv.as_int = this_second_mv->as_int;
-    for (idy = 0; idy < bh; ++idy) {
-      for (idx = 0; idx < bw; ++idx) {
-        vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
-                   &mic->bmi[i], sizeof(mic->bmi[i]));
-        vpx_memcpy(&x->partition_info->bmi[i + idy * 2 + idx],
-                   &x->partition_info->bmi[i],
-                   sizeof(x->partition_info->bmi[i]));
-      }
+  x->partition_info->bmi[i].mode = m;
+  x->partition_info->bmi[i].mv.as_int = this_mv->as_int;
+  if (mbmi->second_ref_frame > 0)
+    x->partition_info->bmi[i].second_mv.as_int = this_second_mv->as_int;
+  for (idy = 0; idy < bh; ++idy) {
+    for (idx = 0; idx < bw; ++idx) {
+      vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
+                 &mic->bmi[i], sizeof(mic->bmi[i]));
+      vpx_memcpy(&x->partition_info->bmi[i + idy * 2 + idx],
+                 &x->partition_info->bmi[i],
+                 sizeof(x->partition_info->bmi[i]));
     }
   }
 
@@ -1033,90 +1023,86 @@ static int labels2mode(MACROBLOCK *x,
 
 static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
                                        MACROBLOCK *x,
-                                       int const *labels,
-                                       int which_label,
+                                       int i,
                                        int *labelyrate,
                                        int *distortion,
                                        ENTROPY_CONTEXT *ta,
                                        ENTROPY_CONTEXT *tl) {
-  int i, k;
+  int k;
   MACROBLOCKD *xd = &x->e_mbd;
   BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
   int bwl = b_width_log2(bsize), bw = 1 << bwl;
   int bhl = b_height_log2(bsize), bh = 1 << bhl;
   int idx, idy;
+  const int src_stride = x->plane[0].src.stride;
+  uint8_t* const src =
+  raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                            x->plane[0].src.buf, src_stride);
+  int16_t* src_diff =
+  raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, i,
+                            x->plane[0].src_diff);
+  int16_t* coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, i);
+  uint8_t* const pre =
+  raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                            xd->plane[0].pre[0].buf,
+                            xd->plane[0].pre[0].stride);
+  uint8_t* const dst =
+  raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                            xd->plane[0].dst.buf,
+                            xd->plane[0].dst.stride);
+  int thisdistortion = 0;
+  int thisrate = 0;
 
   *labelyrate = 0;
   *distortion = 0;
-  for (i = 0; i < 4; i++) {
-    if (labels[i] == which_label) {
-      const int src_stride = x->plane[0].src.stride;
-      uint8_t* const src =
-      raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
-                                x->plane[0].src.buf, src_stride);
-      int16_t* src_diff =
-      raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, i,
-                                x->plane[0].src_diff);
-      int16_t* coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, i);
-      uint8_t* const pre =
-      raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
-                                xd->plane[0].pre[0].buf,
-                                xd->plane[0].pre[0].stride);
-      uint8_t* const dst =
-      raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
-                                xd->plane[0].dst.buf,
-                                xd->plane[0].dst.stride);
-      int thisdistortion = 0;
-      int thisrate = 0;
-
-      vp9_build_inter_predictor(pre,
-                                xd->plane[0].pre[0].stride,
-                                dst,
-                                xd->plane[0].dst.stride,
-                                &xd->mode_info_context->bmi[i].as_mv[0],
-                                &xd->scale_factor[0],
-                                4 * bw, 4 * bh, 0 /* no avg */, &xd->subpix);
-
-      // TODO(debargha): Make this work properly with the
-      // implicit-compoundinter-weight experiment when implicit
-      // weighting for splitmv modes is turned on.
-      if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-        uint8_t* const second_pre =
-        raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
-                                  xd->plane[0].pre[1].buf,
-                                  xd->plane[0].pre[1].stride);
-        vp9_build_inter_predictor(second_pre, xd->plane[0].pre[1].stride,
-                                  dst, xd->plane[0].dst.stride,
-                                  &xd->mode_info_context->bmi[i].as_mv[1],
-                                  &xd->scale_factor[1], 4 * bw, 4 * bh, 1,
-                                  &xd->subpix);
-      }
 
-      vp9_subtract_block(4 * bh, 4 * bw, src_diff, 8,
-                         src, src_stride,
-                         dst, xd->plane[0].dst.stride);
+  vp9_build_inter_predictor(pre,
+                            xd->plane[0].pre[0].stride,
+                            dst,
+                            xd->plane[0].dst.stride,
+                            &xd->mode_info_context->bmi[i].as_mv[0],
+                            &xd->scale_factor[0],
+                            4 * bw, 4 * bh, 0 /* no avg */, &xd->subpix);
+
+  // TODO(debargha): Make this work properly with the
+  // implicit-compoundinter-weight experiment when implicit
+  // weighting for splitmv modes is turned on.
+  if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
+    uint8_t* const second_pre =
+    raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                              xd->plane[0].pre[1].buf,
+                              xd->plane[0].pre[1].stride);
+    vp9_build_inter_predictor(second_pre, xd->plane[0].pre[1].stride,
+                              dst, xd->plane[0].dst.stride,
+                              &xd->mode_info_context->bmi[i].as_mv[1],
+                              &xd->scale_factor[1], 4 * bw, 4 * bh, 1,
+                              &xd->subpix);
+  }
 
-      k = i;
-      for (idy = 0; idy < bh; ++idy) {
-        for (idx = 0; idx < bw; ++idx) {
-          k += (idy * 2 + idx);
-          src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, k,
-                                               x->plane[0].src_diff);
-          coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, k);
-          x->fwd_txm4x4(src_diff, coeff, 16);
-          x->quantize_b_4x4(x, k, DCT_DCT, 16);
-          thisdistortion += vp9_block_error(coeff,
-                                            BLOCK_OFFSET(xd->plane[0].dqcoeff,
-                                                         k, 16), 16);
-          thisrate += cost_coeffs(cm, x, 0, k, PLANE_TYPE_Y_WITH_DC,
-                                  ta + (k & 1),
-                                  tl + (k >> 1), TX_4X4, 16);
-        }
-      }
-      *distortion += thisdistortion;
-      *labelyrate += thisrate;
+  vp9_subtract_block(4 * bh, 4 * bw, src_diff, 8,
+                     src, src_stride,
+                     dst, xd->plane[0].dst.stride);
+
+  k = i;
+  for (idy = 0; idy < bh; ++idy) {
+    for (idx = 0; idx < bw; ++idx) {
+      k += (idy * 2 + idx);
+      src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, k,
+                                           x->plane[0].src_diff);
+      coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, k);
+      x->fwd_txm4x4(src_diff, coeff, 16);
+      x->quantize_b_4x4(x, k, DCT_DCT, 16);
+      thisdistortion += vp9_block_error(coeff,
+                                        BLOCK_OFFSET(xd->plane[0].dqcoeff,
+                                                     k, 16), 16);
+      thisrate += cost_coeffs(cm, x, 0, k, PLANE_TYPE_Y_WITH_DC,
+                              ta + (k & 1),
+                              tl + (k >> 1), TX_4X4, 16);
     }
   }
+  *distortion += thisdistortion;
+  *labelyrate += thisrate;
+
   *distortion >>= 2;
   return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
 }
@@ -1192,7 +1178,6 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
                                     BEST_SEG_INFO *bsi,
                                     int_mv seg_mvs[4][MAX_REF_FRAMES - 1]) {
   int i, j;
-  static const int labels[4] = { 0, 1, 2, 3 };
   int br = 0, bd = 0;
   MB_PREDICTION_MODE this_mode;
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
@@ -1261,12 +1246,13 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
 
         // motion search for newmv (single predictor case only)
         if (mbmi->second_ref_frame <= 0 && this_mode == NEWMV) {
-          int sseshift, n;
           int step_param = 0;
           int further_steps;
           int thissme, bestsme = INT_MAX;
           const struct buf_2d orig_src = x->plane[0].src;
           const struct buf_2d orig_pre = x->e_mbd.plane[0].pre[0];
+          int sadpb = x->sadperbit4;
+          int_mv mvp_full;
 
           /* Is the best so far sufficiently good that we cant justify doing
            * and new motion search. */
@@ -1287,55 +1273,44 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
 
           further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
 
-          {
-            int sadpb = x->sadperbit4;
-            int_mv mvp_full;
-
-            mvp_full.as_mv.row = bsi->mvp.as_mv.row >> 3;
-            mvp_full.as_mv.col = bsi->mvp.as_mv.col >> 3;
-
-            // find first label
-            n = i;
-
-            // adjust src pointer for this segment
-            x->plane[0].src.buf =
-            raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, n,
-                                      x->plane[0].src.buf,
-                                      x->plane[0].src.stride);
-            assert(((intptr_t)x->e_mbd.plane[0].pre[0].buf & 0x7) == 0);
-            x->e_mbd.plane[0].pre[0].buf =
-            raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, n,
-                                      x->e_mbd.plane[0].pre[0].buf,
-                                      x->e_mbd.plane[0].pre[0].stride);
-
-            bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
-                                             sadpb, further_steps, 0, v_fn_ptr,
-                                             bsi->ref_mv, &mode_mv[NEWMV]);
-
-            sseshift = 0;
-
-            // Should we do a full search (best quality only)
-            if ((cpi->compressor_speed == 0) && (bestsme >> sseshift) > 4000) {
-              /* Check if mvp_full is within the range. */
-              clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
-                       x->mv_row_min, x->mv_row_max);
-
-              thissme = cpi->full_search_sad(x, &mvp_full,
-                                             sadpb, 16, v_fn_ptr,
-                                             x->nmvjointcost, x->mvcost,
-                                             bsi->ref_mv,
-                                             n);
-
-              if (thissme < bestsme) {
-                bestsme = thissme;
-                mode_mv[NEWMV].as_int =
-                x->e_mbd.mode_info_context->bmi[n].as_mv[0].as_int;
-              } else {
-                /* The full search result is actually worse so re-instate the
-                 * previous best vector */
-                x->e_mbd.mode_info_context->bmi[n].as_mv[0].as_int =
-                mode_mv[NEWMV].as_int;
-              }
+          mvp_full.as_mv.row = bsi->mvp.as_mv.row >> 3;
+          mvp_full.as_mv.col = bsi->mvp.as_mv.col >> 3;
+
+          // adjust src pointer for this segment
+          x->plane[0].src.buf =
+              raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, i,
+                                        x->plane[0].src.buf,
+                                        x->plane[0].src.stride);
+          assert(((intptr_t)x->e_mbd.plane[0].pre[0].buf & 0x7) == 0);
+          x->e_mbd.plane[0].pre[0].buf =
+              raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, i,
+                                        x->e_mbd.plane[0].pre[0].buf,
+                                        x->e_mbd.plane[0].pre[0].stride);
+
+          bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
+                                           sadpb, further_steps, 0, v_fn_ptr,
+                                           bsi->ref_mv, &mode_mv[NEWMV]);
+
+          // Should we do a full search (best quality only)
+          if (cpi->compressor_speed == 0) {
+            /* Check if mvp_full is within the range. */
+            clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
+                     x->mv_row_min, x->mv_row_max);
+
+            thissme = cpi->full_search_sad(x, &mvp_full,
+                                           sadpb, 16, v_fn_ptr,
+                                           x->nmvjointcost, x->mvcost,
+                                           bsi->ref_mv, i);
+
+            if (thissme < bestsme) {
+              bestsme = thissme;
+              mode_mv[NEWMV].as_int =
+                  x->e_mbd.mode_info_context->bmi[i].as_mv[0].as_int;
+            } else {
+              /* The full search result is actually worse so re-instate the
+               * previous best vector */
+              x->e_mbd.mode_info_context->bmi[i].as_mv[0].as_int =
+                  mode_mv[NEWMV].as_int;
             }
           }
 
@@ -1355,16 +1330,12 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
           x->plane[0].src = orig_src;
           x->e_mbd.plane[0].pre[0] = orig_pre;
         } else if (mbmi->second_ref_frame > 0 && this_mode == NEWMV) {
-          /* NEW4X4 */
-          /* motion search not completed? Then skip newmv for this block with
-           * comppred */
           if (seg_mvs[i][mbmi->second_ref_frame - 1].as_int == INVALID_MV ||
-              seg_mvs[i][mbmi->ref_frame        - 1].as_int == INVALID_MV) {
+              seg_mvs[i][mbmi->ref_frame        - 1].as_int == INVALID_MV)
             continue;
-          }
         }
 
-        rate = labels2mode(x, labels, i, this_mode, &mode_mv[this_mode],
+        rate = labels2mode(x, i, this_mode, &mode_mv[this_mode],
                            &second_mode_mv[this_mode], frame_mv, seg_mvs[i],
                            bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
                            x->mvcost, cpi);
@@ -1381,7 +1352,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
           continue;
 
         this_rd = encode_inter_mb_segment(&cpi->common,
-                                          x, labels, i, &labelyrate,
+                                          x, i, &labelyrate,
                                           &distortion, t_above_s, t_left_s);
         this_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
         rate += labelyrate;
@@ -1392,10 +1363,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
           bestlabelyrate = labelyrate;
           mode_selected = this_mode;
           best_label_rd = this_rd;
-          for (j = 0; j < 4; j++)
-            if (labels[j] == i)
-              best_eobs[j] = x->e_mbd.plane[0].eobs[j];
-
+          best_eobs[i] = x->e_mbd.plane[0].eobs[i];
           vpx_memcpy(t_above_b, t_above_s, sizeof(t_above_s));
           vpx_memcpy(t_left_b, t_left_s, sizeof(t_left_s));
         }
@@ -1404,7 +1372,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
       vpx_memcpy(t_above, t_above_b, sizeof(t_above));
       vpx_memcpy(t_left, t_left_b, sizeof(t_left));
 
-      labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected],
+      labels2mode(x, i, mode_selected, &mode_mv[mode_selected],
                   &second_mode_mv[mode_selected], frame_mv, seg_mvs[i],
                   bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
                   x->mvcost, cpi);
@@ -1882,6 +1850,154 @@ static INLINE int get_switchable_rate(VP9_COMMON *cm, MACROBLOCK *x) {
   return SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
 }
 
+static void iterative_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+                                    BLOCK_SIZE_TYPE bsize,
+                                    int_mv *frame_mv,
+                                    YV12_BUFFER_CONFIG **scaled_ref_frame,
+                                    int mi_row, int mi_col,
+                                    int_mv single_newmv[MAX_REF_FRAMES]) {
+  int pw = 4 << b_width_log2(bsize), ph = 4 << b_height_log2(bsize);
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+  int refs[2] = { mbmi->ref_frame,
+                  (mbmi->second_ref_frame < 0 ? 0 : mbmi->second_ref_frame) };
+  int_mv ref_mv[2];
+  const enum BlockSize block_size = get_plane_block_size(bsize, &xd->plane[0]);
+  int ite;
+  // Scratch buffer (pw * ph bytes) for the prediction from the other ref.
+  uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
+  // Joint motion search in compound mode: the mvs in frame_mv[] for the two
+  // refs are refined in turn, each against the prediction from the other.
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};  // saved pre[0] planes
+  struct buf_2d backup_second_yv12[MAX_MB_PLANE] = {{0}};  // saved pre[1]
+  struct buf_2d scaled_first_yv12;  // plane 0 pre[0] as set up for the loop
+  int last_besterr[2] = {INT_MAX, INT_MAX};  // best error so far, per ref
+  // NOTE(review): second_pred is not checked for NULL before it is used.
+  ref_mv[0] = mbmi->ref_mvs[refs[0]][0];
+  ref_mv[1] = mbmi->ref_mvs[refs[1]][0];
+
+  if (scaled_ref_frame[0]) {
+    int i;
+    // Swap out the reference frame for a version that's been scaled to
+    // match the resolution of the current frame, allowing the existing
+    // motion search code to be used without additional modifications.
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      backup_yv12[i] = xd->plane[i].pre[0];
+    setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col,
+                     NULL, NULL);
+  }
+  // Likewise for the second reference; here pre[1] is what gets backed up.
+  if (scaled_ref_frame[1]) {
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      backup_second_yv12[i] = xd->plane[i].pre[1];
+    // NOTE(review): same argument slot as the pre[0] call; confirm vs. pre[1].
+    setup_pre_planes(xd, scaled_ref_frame[1], NULL, mi_row, mi_col,
+                     NULL, NULL);
+  }
+
+  xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0],
+                                          mi_row, mi_col);
+  xd->scale_factor[1].set_scaled_offsets(&xd->scale_factor[1],
+                                          mi_row, mi_col);
+  scaled_first_yv12 = xd->plane[0].pre[0];
+
+  // Initialize mv using single prediction mode result.
+  frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
+  frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
+
+  // Allow joint search multiple times iteratively for each ref frame
+  // and break out the search loop if it couldn't find better mv.
+  for (ite = 0; ite < 4; ite++) {
+    struct buf_2d ref_yv12[2];
+    int bestsme = INT_MAX;
+    int sadpb = x->sadperbit16;
+    int_mv tmp_mv;
+    int search_range = 3;
+    // Snapshot of the mv limits; written back after the full-pixel search.
+    int tmp_col_min = x->mv_col_min;
+    int tmp_col_max = x->mv_col_max;
+    int tmp_row_min = x->mv_row_min;
+    int tmp_row_max = x->mv_row_max;
+    int id = ite % 2;  // ref refined in this pass: 0, 1, 0, 1
+
+    // Initialized here because of compiler problem in Visual Studio.
+    ref_yv12[0] = xd->plane[0].pre[0];
+    ref_yv12[1] = xd->plane[0].pre[1];
+
+    // Build the prediction from the ref that is NOT refined in this pass.
+    vp9_build_inter_predictor(ref_yv12[!id].buf,
+                              ref_yv12[!id].stride,
+                              second_pred, pw,
+                              &frame_mv[refs[!id]],
+                              &xd->scale_factor[!id],
+                              pw, ph, 0,
+                              &xd->subpix);
+
+    // When refining ref 1, temporarily expose it as pre[0] (undone below).
+    if (id)
+      xd->plane[0].pre[0] = ref_yv12[id];
+    vp9_clamp_mv_min_max(x, &ref_mv[id]);
+
+    // Start from the current mv of this ref (initially the single-mode mv).
+    tmp_mv.as_int = frame_mv[refs[id]].as_int;
+    // Drop the 3 low bits (presumably 1/8-pel -> full-pel; confirm).
+    tmp_mv.as_mv.col >>= 3;
+    tmp_mv.as_mv.row >>= 3;
+
+    // Small-range full-pixel motion search
+    bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
+                                       search_range,
+                                       &cpi->fn_ptr[block_size],
+                                       x->nmvjointcost, x->mvcost,
+                                       &ref_mv[id], second_pred,
+                                       pw, ph);
+
+    x->mv_col_min = tmp_col_min;
+    x->mv_col_max = tmp_col_max;
+    x->mv_row_min = tmp_row_min;
+    x->mv_row_max = tmp_row_max;
+
+    if (bestsme < INT_MAX) {
+      int dis; /* TODO: use dis in distortion calculation later. */
+      unsigned int sse;
+      // Follow up with the compound sub-pixel search from the same mv.
+      bestsme = vp9_find_best_sub_pixel_comp(x, &tmp_mv,
+                                             &ref_mv[id],
+                                             x->errorperbit,
+                                             &cpi->fn_ptr[block_size],
+                                             x->nmvjointcost, x->mvcost,
+                                             &dis, &sse, second_pred,
+                                             pw, ph);
+    }
+    // Undo the temporary pre[0] swap made above.
+    if (id)
+      xd->plane[0].pre[0] = scaled_first_yv12;
+    // Keep the new mv only if it beats this ref's previous best; else stop.
+    if (bestsme < last_besterr[id]) {
+      frame_mv[refs[id]].as_int = tmp_mv.as_int;
+      last_besterr[id] = bestsme;
+    } else {
+      break;
+    }
+  }
+
+  // Restore the predictor planes that were backed up before the search.
+  if (scaled_ref_frame[0]) {
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[0] = backup_yv12[i];
+  }
+
+  if (scaled_ref_frame[1]) {
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[1] = backup_second_yv12[i];
+  }
+
+  vpx_free(second_pred);
+}
+
 static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                  BLOCK_SIZE_TYPE bsize,
                                  int64_t txfm_cache[],
@@ -1924,145 +2040,13 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       ref_mv[1] = mbmi->ref_mvs[refs[1]][0];
 
       if (is_comp_pred) {
-        if (cpi->sf.comp_inter_joint_serach) {
-          int pw = 4 << b_width_log2(bsize), ph = 4 << b_height_log2(bsize);
-          int ite;
-          // Prediction buffer from second frame.
-          uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
-
-          // Do joint motion search in compound mode to get more accurate mv.
-          struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
-          struct buf_2d backup_second_yv12[MAX_MB_PLANE] = {{0}};
-          struct buf_2d scaled_first_yv12;
-          int last_besterr[2] = {INT_MAX, INT_MAX};
-
-          if (scaled_ref_frame[0]) {
-            int i;
-
-            // Swap out the reference frame for a version that's been scaled to
-            // match the resolution of the current frame, allowing the existing
-            // motion search code to be used without additional modifications.
-            for (i = 0; i < MAX_MB_PLANE; i++)
-              backup_yv12[i] = xd->plane[i].pre[0];
-
-            setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col,
-                             NULL, NULL);
-          }
-
-          if (scaled_ref_frame[1]) {
-            int i;
-
-            for (i = 0; i < MAX_MB_PLANE; i++)
-              backup_second_yv12[i] = xd->plane[i].pre[1];
-
-            setup_pre_planes(xd, scaled_ref_frame[1], NULL, mi_row, mi_col,
-                             NULL, NULL);
-          }
-          xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0],
-                                                  mi_row, mi_col);
-          xd->scale_factor[1].set_scaled_offsets(&xd->scale_factor[1],
-                                                  mi_row, mi_col);
-
-          scaled_first_yv12 = xd->plane[0].pre[0];
-
-          // Initialize mv using single prediction mode result.
-          frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
-          frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
-
-          // Allow joint search multiple times iteratively for each ref frame
-          // and break out the search loop if it couldn't find better mv.
-          for (ite = 0; ite < 4; ite++) {
-            struct buf_2d ref_yv12[2];
-            int bestsme = INT_MAX;
-            int sadpb = x->sadperbit16;
-            int_mv tmp_mv;
-            int search_range = 3;
-
-            int tmp_col_min = x->mv_col_min;
-            int tmp_col_max = x->mv_col_max;
-            int tmp_row_min = x->mv_row_min;
-            int tmp_row_max = x->mv_row_max;
-            int id = ite % 2;
-
-            // Initialized here because of compiler problem in Visual Studio.
-            ref_yv12[0] = xd->plane[0].pre[0];
-            ref_yv12[1] = xd->plane[0].pre[1];
-
-            // Get pred block from second frame.
-            vp9_build_inter_predictor(ref_yv12[!id].buf,
-                                      ref_yv12[!id].stride,
-                                      second_pred, pw,
-                                      &frame_mv[refs[!id]],
-                                      &xd->scale_factor[!id],
-                                      pw, ph, 0,
-                                      &xd->subpix);
-
-            // Compound motion search on first ref frame.
-            if (id)
-              xd->plane[0].pre[0] = ref_yv12[id];
-            vp9_clamp_mv_min_max(x, &ref_mv[id]);
-
-            // Use mv result from single mode as mvp.
-            tmp_mv.as_int = frame_mv[refs[id]].as_int;
-
-            tmp_mv.as_mv.col >>= 3;
-            tmp_mv.as_mv.row >>= 3;
-
-            // Small-range full-pixel motion search
-            bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
-                                               search_range,
-                                               &cpi->fn_ptr[block_size],
-                                               x->nmvjointcost, x->mvcost,
-                                               &ref_mv[id], second_pred,
-                                               pw, ph);
-
-            x->mv_col_min = tmp_col_min;
-            x->mv_col_max = tmp_col_max;
-            x->mv_row_min = tmp_row_min;
-            x->mv_row_max = tmp_row_max;
-
-            if (bestsme < INT_MAX) {
-              int dis; /* TODO: use dis in distortion calculation later. */
-              unsigned int sse;
-
-              bestsme = vp9_find_best_sub_pixel_comp(x, &tmp_mv,
-                                                     &ref_mv[id],
-                                                     x->errorperbit,
-                                                     &cpi->fn_ptr[block_size],
-                                                     x->nmvjointcost, x->mvcost,
-                                                     &dis, &sse, second_pred,
-                                                     pw, ph);
-            }
-
-            if (id)
-              xd->plane[0].pre[0] = scaled_first_yv12;
-
-            if (bestsme < last_besterr[id]) {
-              frame_mv[refs[id]].as_int =
-                  xd->mode_info_context->bmi[0].as_mv[1].as_int = tmp_mv.as_int;
-              last_besterr[id] = bestsme;
-            } else {
-              break;
-            }
-          }
-
-          // restore the predictor
-          if (scaled_ref_frame[0]) {
-            int i;
+        // Initialize mv using single prediction mode result.
+        frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
+        frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
 
-            for (i = 0; i < MAX_MB_PLANE; i++)
-              xd->plane[i].pre[0] = backup_yv12[i];
-          }
-
-          if (scaled_ref_frame[1]) {
-            int i;
-
-            for (i = 0; i < MAX_MB_PLANE; i++)
-              xd->plane[i].pre[1] = backup_second_yv12[i];
-          }
-
-          vpx_free(second_pred);
-        }
+        if (cpi->sf.comp_inter_joint_serach)
+          iterative_motion_search(cpi, x, bsize, frame_mv, scaled_ref_frame,
+                                  mi_row, mi_col, single_newmv);
 
         if (frame_mv[refs[0]].as_int == INVALID_MV ||
             frame_mv[refs[1]].as_int == INVALID_MV)
@@ -2138,8 +2122,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                        x->nmvjointcost, x->mvcost,
                                        &dis, &sse);
         }
-        frame_mv[refs[0]].as_int =
-          xd->mode_info_context->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
+        frame_mv[refs[0]].as_int = tmp_mv.as_int;
         single_newmv[refs[0]].as_int = tmp_mv.as_int;
 
         // Add the new motion vector cost to our rolling cost variable
@@ -2688,8 +2671,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     if (this_mode == I4X4_PRED) {
       int rate;
 
-      // Note the rate value returned here includes the cost of coding
-      // the I4X4_PRED mode : x->mbmode_cost[xd->frame_type][I4X4_PRED];
       mbmi->txfm_size = TX_4X4;
       rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y,
                                 &distortion_y, INT64_MAX);
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 08efc84d4..eb79de1d9 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -36,6 +36,9 @@ extern vp9_coeff_stats tree_update_hist_16x16[BLOCK_TYPES];
 extern vp9_coeff_stats tree_update_hist_32x32[BLOCK_TYPES];
 #endif  /* ENTROPY_STATS */
 
+DECLARE_ALIGNED(16, extern const uint8_t,
+                vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
+
 static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2];
 const TOKENVALUE *vp9_dct_value_tokens_ptr;
 static int dct_value_cost[DCT_MAX_VALUE * 2];
@@ -228,7 +231,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
       if (!t->skip_eob_node)
         ++cpi->common.fc.eob_branch_counts[tx_size][type][ref][band][pt];
     }
-    token_cache[scan[c]] = token;
+    token_cache[scan[c]] = vp9_pt_energy_class[token];
     ++t;
   } while (c < eob && ++c < seg_eob);
 
diff --git a/vp9/encoder/vp9_write_bit_buffer.h b/vp9/encoder/vp9_write_bit_buffer.h
new file mode 100644
index 000000000..18cf40366
--- /dev/null
+++ b/vp9/encoder/vp9_write_bit_buffer.h
@@ -0,0 +1,42 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_BIT_WRITE_BUFFER_H_
+#define VP9_BIT_WRITE_BUFFER_H_
+#include <limits.h>  // CHAR_BIT
+#include "vpx/vpx_integer.h"
+// Cursor for writing individual bits into a caller-supplied byte buffer.
+struct vp9_write_bit_buffer {
+  uint8_t *bit_buffer;  // bytes that receive the written bits
+  size_t bit_offset;    // position (in bits) of the next bit to write
+};
+// Returns how many bytes the bits written so far occupy (rounded up).
+static size_t vp9_rb_bytes_written(struct vp9_write_bit_buffer *wb) {
+  return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT ? 1 : 0);
+}
+// Writes one bit (must be 0 or 1), MSB-first per byte; advances bit_offset.
+static void vp9_wb_write_bit(struct vp9_write_bit_buffer *wb, int bit) {
+  const size_t off = wb->bit_offset;  // stay in size_t: int would truncate
+  const size_t p = off / CHAR_BIT;    // index of the byte being modified
+  const int q = CHAR_BIT - 1 - (int)(off % CHAR_BIT);  // shift inside byte
+  wb->bit_buffer[p] &= ~(1 << q);  // clear the target bit first ...
+  wb->bit_buffer[p] |= bit << q;   // ... then OR in the new value
+  wb->bit_offset = off + 1;
+}
+// Writes the low 'bits' bits of data, most significant of them first.
+static void vp9_wb_write_literal(struct vp9_write_bit_buffer *wb,
+                                 int data, int bits) {
+  int shift = bits;
+  while (--shift >= 0)
+    vp9_wb_write_bit(wb, (data >> shift) & 1);
+}
+
+
+#endif  // VP9_BIT_WRITE_BUFFER_H_
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 147804d03..29e832384 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -18,7 +18,6 @@ VP9_COMMON_SRCS-yes += common/vp9_asm_com_offsets.c
 VP9_COMMON_SRCS-yes += common/vp9_coefupdateprobs.h
 VP9_COMMON_SRCS-yes += common/vp9_convolve.c
 VP9_COMMON_SRCS-yes += common/vp9_convolve.h
-VP9_COMMON_SRCS-yes += common/vp9_debugmodes.c
 VP9_COMMON_SRCS-yes += common/vp9_default_coef_probs.h
 VP9_COMMON_SRCS-yes += common/vp9_entropy.c
 VP9_COMMON_SRCS-yes += common/vp9_entropymode.c
@@ -38,7 +37,6 @@ VP9_COMMON_SRCS-yes += common/vp9_entropymv.h
 VP9_COMMON_SRCS-yes += common/vp9_enums.h
 VP9_COMMON_SRCS-yes += common/vp9_extend.h
 VP9_COMMON_SRCS-yes += common/vp9_findnearmv.h
-VP9_COMMON_SRCS-yes += common/vp9_header.h
 VP9_COMMON_SRCS-yes += common/vp9_idct.h
 VP9_COMMON_SRCS-yes += common/vp9_invtrans.h
 VP9_COMMON_SRCS-yes += common/vp9_loopfilter.h
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index ee1130402..b5aa10d52 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -215,26 +215,19 @@ static vpx_codec_err_t vp8_peek_si(const uint8_t         *data,
   if (data + data_sz <= data)
     res = VPX_CODEC_INVALID_PARAM;
   else {
-    /* Parse uncompresssed part of key frame header.
-     * 3 bytes:- including version, frame type and an offset
-     * 3 bytes:- sync code (0x49, 0x83, 0x42)
-     * 4 bytes:- including image width and height in the lowest 14 bits
-     *           of each 2-byte value.
-     */
     si->is_kf = 0;
 
-    if (data_sz >= 10 && !(data[0] & 0x01)) { /* I-Frame */
-      const uint8_t *c = data + 3;
+    if (data_sz >= 8 && !(data[0] & 0x80)) { /* I-Frame */
+      const uint8_t *c = data + 1;
       si->is_kf = 1;
 
-      /* vet via sync code */
-      if (c[0] != 0x49 || c[1] != 0x83 || c[2] != 0x42)
+      if (c[0] != SYNC_CODE_0 || c[1] != SYNC_CODE_1 || c[2] != SYNC_CODE_2)
         res = VPX_CODEC_UNSUP_BITSTREAM;
 
-      si->w = (c[3] | (c[4] << 8));
-      si->h = (c[5] | (c[6] << 8));
+      si->w = (c[3] << 8) | c[4];
+      si->h = (c[5] << 8) | c[6];
 
-      /*printf("w=%d, h=%d\n", si->w, si->h);*/
+      // printf("w=%d, h=%d\n", si->w, si->h);
       if (!(si->h | si->w))
         res = VPX_CODEC_UNSUP_BITSTREAM;
     } else
@@ -242,7 +235,6 @@ static vpx_codec_err_t vp8_peek_si(const uint8_t         *data,
   }
 
   return res;
-
 }
 
 static vpx_codec_err_t vp8_get_si(vpx_codec_alg_priv_t    *ctx,
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 42ab02d31..86fd08850 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -28,6 +28,7 @@ VP9_CX_SRCS-yes += encoder/vp9_encodemv.c
 VP9_CX_SRCS-yes += encoder/vp9_firstpass.c
 VP9_CX_SRCS-yes += encoder/vp9_block.h
 VP9_CX_SRCS-yes += encoder/vp9_boolhuff.h
+VP9_CX_SRCS-yes += encoder/vp9_write_bit_buffer.h
 VP9_CX_SRCS-yes += encoder/vp9_bitstream.h
 VP9_CX_SRCS-yes += encoder/vp9_encodeintra.h
 VP9_CX_SRCS-yes += encoder/vp9_encodemb.h
diff --git a/vp9/vp9dx.mk b/vp9/vp9dx.mk
index 3be0b6dde..7ae3219ca 100644
--- a/vp9/vp9dx.mk
+++ b/vp9/vp9dx.mk
@@ -24,6 +24,7 @@ VP9_DX_SRCS-yes += decoder/vp9_decodframe.c
 VP9_DX_SRCS-yes += decoder/vp9_decodframe.h
 VP9_DX_SRCS-yes += decoder/vp9_detokenize.c
 VP9_DX_SRCS-yes += decoder/vp9_dboolhuff.h
+VP9_DX_SRCS-yes += decoder/vp9_read_bit_buffer.h
 VP9_DX_SRCS-yes += decoder/vp9_decodemv.h
 VP9_DX_SRCS-yes += decoder/vp9_detokenize.h
 VP9_DX_SRCS-yes += decoder/vp9_onyxd.h