40 files changed, 1015 insertions, 2882 deletions
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index 01fa63fdb..c3d6dae93 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -219,8 +219,4 @@ void vp9_initialize_common() {
   vp9_entropy_mode_init();
 
   vp9_entropy_mv_init();
-
-#if CONFIG_NEWCOEFCONTEXT
-  vp9_init_neighbors();
-#endif
 }
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 3351e6928..054d58dba 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -47,18 +47,6 @@ void vpx_log(const char *format, ...);
 #define MAX_MV_REFS 9
 #define MAX_MV_REF_CANDIDATES 4
 
-#if CONFIG_DWTDCTHYBRID
-#define DWT_MAX_LENGTH     64
-#define DWT_TYPE           26    // 26/53/97
-#define DWT_PRECISION_BITS 2
-#define DWT_PRECISION_RND  ((1 << DWT_PRECISION_BITS) / 2)
-
-#define DWTDCT16X16        0
-#define DWTDCT16X16_LEAN   1
-#define DWTDCT8X8          2
-#define DWTDCT_TYPE        DWTDCT16X16_LEAN
-#endif
-
 typedef struct {
   int r, c;
 } POS;
@@ -218,10 +206,7 @@ union b_mode_info {
     B_PREDICTION_MODE context;
 #endif
   } as_mode;
-  struct {
-    int_mv first;
-    int_mv second;
-  } as_mv;
+  int_mv as_mv[2];  // first, second inter predictor motion vectors
 };
 
 typedef enum {
@@ -386,11 +371,28 @@ typedef struct macroblockd {
   unsigned int frames_since_golden;
   unsigned int frames_till_alt_ref_frame;
 
+#if CONFIG_LOSSLESS
+  int lossless;
+#endif
   /* Inverse transform function pointers. */
-  void (*inv_xform4x4_1_x8)(int16_t *input, int16_t *output, int pitch);
-  void (*inv_xform4x4_x8)(int16_t *input, int16_t *output, int pitch);
-  void (*inv_walsh4x4_1)(int16_t *in, int16_t *out);
-  void (*inv_walsh4x4_lossless)(int16_t *in, int16_t *out);
+  void (*inv_txm4x4_1)(int16_t *input, int16_t *output, int pitch);
+  void (*inv_txm4x4)(int16_t *input, int16_t *output, int pitch);
+  void (*inv_2ndtxm4x4_1)(int16_t *in, int16_t *out);
+  void (*inv_2ndtxm4x4)(int16_t *in, int16_t *out);
+  void (*itxm_add)(int16_t *input, const int16_t *dq,
+    uint8_t *pred, uint8_t *output, int pitch, int stride);
+  void (*dc_itxm_add)(int16_t *input, const int16_t *dq,
+    uint8_t *pred, uint8_t *output, int pitch, int stride, int dc);
+  void (*dc_only_itxm_add)(int input_dc, uint8_t *pred_ptr,
+    uint8_t *dst_ptr, int pitch, int stride);
+  void (*dc_itxm_add_y_block)(int16_t *q, const int16_t *dq,
+    uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs,
+    const int16_t *dc);
+  void (*itxm_add_y_block)(int16_t *q, const int16_t *dq,
+    uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs);
+  void (*itxm_add_uv_block)(int16_t *q, const int16_t *dq,
+    uint8_t *pre, uint8_t *dst_u, uint8_t *dst_v, int stride,
+    uint16_t *eobs);
 
   struct subpix_fn_table  subpix;
 
@@ -501,6 +503,10 @@ static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, const BLOCKD *b) {
   int ib = (int)(b - xd->block);
   if (ib >= 16)
     return tx_type;
+#if CONFIG_LOSSLESS
+  if (xd->lossless)
+    return DCT_DCT;
+#endif
   // TODO(rbultje, debargha): Explore ADST usage for superblocks
   if (xd->mode_info_context->mbmi.sb_type)
     return tx_type;
diff --git a/vp9/common/vp9_convolve.c b/vp9/common/vp9_convolve.c
index f21f1d84e..b87c410df 100644
--- a/vp9/common/vp9_convolve.c
+++ b/vp9/common/vp9_convolve.c
@@ -7,12 +7,15 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include "vp9/common/vp9_convolve.h"
+
 #include <assert.h>
 
 #include "./vpx_config.h"
 #include "./vp9_rtcd.h"
 #include "vp9/common/vp9_common.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
 
 #define VP9_FILTER_WEIGHT 128
 #define VP9_FILTER_SHIFT  7
@@ -293,9 +296,21 @@ void vp9_convolve8_avg_c(const uint8_t *src, int src_stride,
                          const int16_t *filter_x, int x_step_q4,
                          const int16_t *filter_y, int y_step_q4,
                          int w, int h) {
-  convolve_avg_c(src, src_stride, dst, dst_stride,
-                 filter_x, x_step_q4, filter_y, y_step_q4,
-                 w, h, 8);
+  /* Fixed size intermediate buffer places limits on parameters. */
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16);
+  assert(w <= 16);
+  assert(h <= 16);
+
+  vp9_convolve8(src, src_stride,
+                temp, 16,
+                filter_x, x_step_q4,
+                filter_y, y_step_q4,
+                w, h);
+  vp9_convolve_avg(temp, 16,
+                   dst, dst_stride,
+                   NULL, 0, /* These unused parameters should be removed! */
+                   NULL, 0, /* These unused parameters should be removed! */
+                   w, h);
 }
 
 void vp9_convolve_copy(const uint8_t *src, int src_stride,
diff --git a/vp9/common/vp9_debugmodes.c b/vp9/common/vp9_debugmodes.c
index 5ea7736b7..1953d60c6 100644
--- a/vp9/common/vp9_debugmodes.c
+++ b/vp9/common/vp9_debugmodes.c
@@ -129,8 +129,8 @@ void vp9_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols,
         mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
         bindex = (b_row & 3) * 4 + (b_col & 3);
         fprintf(mvs, "%3d:%-3d ",
-                mi[mb_index].bmi[bindex].as_mv.first.as_mv.row,
-                mi[mb_index].bmi[bindex].as_mv.first.as_mv.col);
+                mi[mb_index].bmi[bindex].as_mv[0].as_mv.row,
+                mi[mb_index].bmi[bindex].as_mv[0].as_mv.col);
 
       }
 
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index 352e17c0c..e21eaba83 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -143,624 +143,6 @@ DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]) = {
   237, 252, 253, 238, 223, 239, 254, 255,
 };
 
-#if CONFIG_DWTDCTHYBRID
-
-#if DWTDCT_TYPE == DWTDCT16X16_LEAN
-DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = {
-  0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, 6,
-  6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-  6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-
-  6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-};
-
-DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = {
-  0,    1,   32,   64,   33,    2,    3,   34,
-  65,   96, 128,   97,   66,   35,    4,  5,
-  36,   67,   98,  129,  160,  192,  161,  130,
-  99,   68,   37,    6,    7,   38,   69,  100,
-  131,  162,  193,  224, 256,  225,  194,  163,
-  132,  101,   70,   39,    8,    9,   40,   71,
-  102,  133,  164,  195,  226,  257,  288,  320,
-  289,  258,  227,  196,  165,  134,  103,   72,
-  41,   10,   11,   42,   73,  104,  135,  166,
-  197,  228,  259,  290,  321,  352,  384,  353,
-  322,  291,  260,  229,  198,  167,  136,  105,
-  74,   43,   12,   13,   44,   75,  106,  137,
-  168,  199,  230,  261,  292,  323,  354,  385,
-  416,  448,  417,  386,  355,  324,  293,  262,
-  231,  200,  169,  138,  107,   76,   45,   14,
-  15,   46,   77,  108,  139,  170,  201,  232,
-  263,  294,  325,  356,  387,  418,  449,  480,
-  481,  450,  419,  388,  357,  326,  295,  264,
-  233,  202,  171,  140,  109,   78,   47,   79,
-  110,  141,  172,  203,  234,  265,  296,  327,
-  358,  389,  420,  451,  482,  483,  452,  421,
-  390,  359,  328,  297,  266,  235,  204,  173,
-  142,  111,  143,  174,  205,  236,  267,  298,
-  329,  360,  391,  422,  453,  484,  485,  454,
-  423,  392,  361,  330,  299,  268,  237,  206,
-  175,  207,  238,  269,  300,  331,  362,  393,
-  424,  455,  486,  487,  456,  425,  394,  363,
-  332,  301,  270,  239,  271,  302,  333,  364,
-  395,  426,  457,  488,  489,  458,  427,  396,
-  365,  334,  303,  335,  366,  397,  428,  459,
-  490,  491,  460,  429,  398,  367,  399,  430,
-  461,  492,  493,  462,  431,  463,  494,  495,
-
-  16,   512,  528, 17,  513,  529,   48,  544,
-  560, 80,  576,  592,   49,  545,  561,   18,
-  514,  530,   19,  515,  531,   50,  546,  562,
-  81,  577,  593,  112,  608,  624,  144,  640,
-  656,  113,  609,  625,   82,  578,  594,   51,
-  547,  563,   20,  516,  532,   21,  517,  533,
-  52,  548,  564,   83,  579,  595,  114,  610,
-  626,  145,  641,  657,  176,  672,  688,  208,
-  704,  720,  177,  673,  689,  146,  642,  658,
-  115,  611,  627,   84,  580,  596,   53,  549,
-  565,   22,  518,  534,   23,  519,  535,   54,
-  550,  566,   85,  581,  597,  116,  612,  628,
-  147,  643,  659,  178,  674,  690,  209,  705,
-  721,  240,  736,  752,  272,  768,  784,  241,
-  737,  753,  210,  706,  722,  179,  675,  691,
-  148,  644,  660,  117,  613,  629,   86,  582,
-  598,   55,  551,  567,   24,  520,  536,   25,
-  521,  537,   56,  552,  568,   87,  583,  599,
-  118,  614,  630,  149,  645,  661,  180,  676,
-  692,  211,  707,  723,  242,  738,  754,  273,
-  769,  785,  304,  800,  816,  336,  832,  848,
-  305,  801,  817,  274,  770,  786,  243,  739,
-  755,  212,  708,  724,  181,  677,  693,  150,
-  646,  662,  119,  615,  631,   88,  584,  600,
-  57,  553,  569,   26,  522,  538,   27,  523,
-  539,   58,  554,  570,   89,  585,  601,  120,
-  616,  632,  151,  647,  663,  182,  678,  694,
-  213,  709,  725,  244,  740,  756,  275,  771,
-  787,  306,  802,  818,  337,  833,  849,  368,
-  864,  880,  400,  896,  912,  369,  865,  881,
-  338,  834,  850,  307,  803,  819,  276,  772,
-  788,  245,  741,  757,  214,  710,  726,  183,
-
-  679,  695,  152,  648,  664,  121,  617,  633,
-  90,  586,  602,   59,  555,  571,   28,  524,
-  540,   29,  525,  541,   60,  556,  572,   91,
-  587,  603,  122,  618,  634,  153,  649,  665,
-  184,  680,  696,  215,  711,  727,  246,  742,
-  758,  277,  773,  789,  308,  804,  820,  339,
-  835,  851,  370,  866,  882,  401,  897,  913,
-  432,  928,  944,  464,  960,  976,  433,  929,
-  945,  402,  898,  914,  371,  867,  883,  340,
-  836,  852,  309,  805,  821,  278,  774,  790,
-  247,  743,  759,  216,  712,  728,  185,  681,
-  697,  154,  650,  666,  123,  619,  635,   92,
-  588,  604,   61,  557,  573,   30,  526,  542,
-  31,  527,  543,   62,  558,  574,   93,  589,
-  605,  124,  620,  636,  155,  651,  667,  186,
-  682,  698,  217,  713,  729,  248,  744,  760,
-  279,  775,  791,  310,  806,  822,  341,  837,
-  853,  372,  868,  884,  403,  899,  915,  434,
-  930,  946,  465,  961,  977,  496,  992, 1008,
-  497,  993, 1009,  466,  962,  978,  435,  931,
-  947,  404,  900,  916,  373,  869,  885,  342,
-  838,  854,  311,  807,  823,  280,  776,  792,
-  249,  745,  761,  218,  714,  730,  187,  683,
-  699,  156,  652,  668,  125,  621,  637,   94,
-  590,  606,   63,  559,  575,   95,  591,  607,
-  126,  622,  638,  157,  653,  669,  188,  684,
-  700,  219,  715,  731,  250,  746,  762,  281,
-  777,  793,  312,  808,  824,  343,  839,  855,
-  374,  870,  886,  405,  901,  917,  436,  932,
-  948,  467,  963,  979,  498,  994, 1010,  499,
-  995, 1011,  468,  964,  980,  437,  933,  949,
-  406,  902,  918,  375,  871,  887,  344,  840,
-
-  856,  313,  809,  825,  282,  778,  794,  251,
-  747,  763,  220,  716,  732,  189,  685,  701,
-  158,  654,  670,  127,  623,  639,  159,  655,
-  671,  190,  686,  702,  221,  717,  733,  252,
-  748,  764,  283,  779,  795,  314,  810,  826,
-  345,  841,  857,  376,  872,  888,  407,  903,
-  919,  438,  934,  950,  469,  965,  981,  500,
-  996, 1012,  501,  997, 1013,  470,  966,  982,
-  439,  935,  951,  408,  904,  920,  377,  873,
-  889,  346,  842,  858,  315,  811,  827,  284,
-  780,  796,  253,  749,  765,  222,  718,  734,
-  191,  687,  703,  223,  719,  735,  254,  750,
-  766,  285,  781,  797,  316,  812,  828,  347,
-  843,  859,  378,  874,  890,  409,  905,  921,
-  440,  936,  952,  471,  967,  983,  502,  998,
-  1014,  503,  999, 1015,  472,  968,  984,  441,
-  937,  953,  410,  906,  922,  379,  875,  891,
-  348,  844,  860,  317,  813,  829,  286,  782,
-  798,  255,  751,  767,  287,  783,  799,  318,
-  814,  830,  349,  845,  861,  380,  876,  892,
-  411,  907,  923,  442,  938,  954,  473,  969,
-  985,  504, 1000, 1016,  505, 1001, 1017,  474,
-  970,  986,  443,  939,  955,  412,  908,  924,
-  381,  877,  893,  350,  846,  862,  319,  815,
-  831,  351,  847,  863,  382,  878,  894,  413,
-  909,  925,  444,  940,  956,  475,  971,  987,
-  506, 1002, 1018,  507, 1003, 1019,  476,  972,
-  988,  445,  941,  957,  414,  910,  926,  383,
-  879,  895,  415,  911,  927,  446,  942,  958,
-  477,  973,  989,  508, 1004, 1020,  509, 1005,
-  1021,  478,  974,  990,  447,  943,  959,  479,
-  975,  991,  510, 1006, 1022,  511, 1007, 1023,
-};
-
-#elif DWTDCT_TYPE == DWTDCT16X16
-
-DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = {
-  0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6,
-  6, 6, 6,
-  6,
-  6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-  6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-};
-
-DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = {
-  0,    1,   32,   64,   33,    2,    3,   34,
-  65,   96, 128,   97,   66,   35,    4,
-  16,   512,  528,
-  5,
-  36,   67,   98,  129,  160,  192,  161,  130,
-  99,   68,   37,    6,    7,   38,   69,  100,
-  131,  162,  193,  224, 256,  225,  194,  163,
-  132,  101,   70,   39,    8,    9,   40,   71,
-  102,  133,  164,  195,  226,  257,  288,  320,
-  289,  258,  227,  196,  165,  134,  103,   72,
-  41,   10,   11,   42,   73,  104,  135,  166,
-  197,  228,  259,  290,  321,  352,  384,  353,
-  322,  291,  260,  229,  198,  167,  136,  105,
-  74,   43,   12,   13,   44,   75,  106,  137,
-  168,  199,  230,  261,  292,  323,  354,  385,
-  416,  448,  417,  386,  355,  324,  293,  262,
-  231,  200,  169,  138,  107,   76,   45,   14,
-  15,   46,   77,  108,  139,  170,  201,  232,
-  263,  294,  325,  356,  387,  418,  449,  480,
-  481,  450,  419,  388,  357,  326,  295,  264,
-  233,  202,  171,  140,  109,   78,   47,   79,
-  110,  141,  172,  203,  234,  265,  296,  327,
-  358,  389,  420,  451,  482,  483,  452,  421,
-  390,  359,  328,  297,  266,  235,  204,  173,
-  142,  111,  143,  174,  205,  236,  267,  298,
-  329,  360,  391,  422,  453,  484,  485,  454,
-  423,  392,  361,  330,  299,  268,  237,  206,
-  175,  207,  238,  269,  300,  331,  362,  393,
-  424,  455,  486,  487,  456,  425,  394,  363,
-  332,  301,  270,  239,  271,  302,  333,  364,
-  395,  426,  457,  488,  489,  458,  427,  396,
-  365,  334,  303,  335,  366,  397,  428,  459,
-  490,  491,  460,  429,  398,  367,  399,  430,
-  461,  492,  493,  462,  431,  463,  494,  495,
-
-  17,  513,  529,   48,  544,
-  560, 80,  576,  592,   49,  545,  561,   18,
-  514,  530,   19,  515,  531,   50,  546,  562,
-  81,  577,  593,  112,  608,  624,  144,  640,
-  656,  113,  609,  625,   82,  578,  594,   51,
-  547,  563,   20,  516,  532,   21,  517,  533,
-  52,  548,  564,   83,  579,  595,  114,  610,
-  626,  145,  641,  657,  176,  672,  688,  208,
-  704,  720,  177,  673,  689,  146,  642,  658,
-  115,  611,  627,   84,  580,  596,   53,  549,
-  565,   22,  518,  534,   23,  519,  535,   54,
-  550,  566,   85,  581,  597,  116,  612,  628,
-  147,  643,  659,  178,  674,  690,  209,  705,
-  721,  240,  736,  752,  272,  768,  784,  241,
-  737,  753,  210,  706,  722,  179,  675,  691,
-  148,  644,  660,  117,  613,  629,   86,  582,
-  598,   55,  551,  567,   24,  520,  536,   25,
-  521,  537,   56,  552,  568,   87,  583,  599,
-  118,  614,  630,  149,  645,  661,  180,  676,
-  692,  211,  707,  723,  242,  738,  754,  273,
-  769,  785,  304,  800,  816,  336,  832,  848,
-  305,  801,  817,  274,  770,  786,  243,  739,
-  755,  212,  708,  724,  181,  677,  693,  150,
-  646,  662,  119,  615,  631,   88,  584,  600,
-  57,  553,  569,   26,  522,  538,   27,  523,
-  539,   58,  554,  570,   89,  585,  601,  120,
-  616,  632,  151,  647,  663,  182,  678,  694,
-  213,  709,  725,  244,  740,  756,  275,  771,
-  787,  306,  802,  818,  337,  833,  849,  368,
-  864,  880,  400,  896,  912,  369,  865,  881,
-  338,  834,  850,  307,  803,  819,  276,  772,
-  788,  245,  741,  757,  214,  710,  726,  183,
-
-  679,  695,  152,  648,  664,  121,  617,  633,
-  90,  586,  602,   59,  555,  571,   28,  524,
-  540,   29,  525,  541,   60,  556,  572,   91,
-  587,  603,  122,  618,  634,  153,  649,  665,
-  184,  680,  696,  215,  711,  727,  246,  742,
-  758,  277,  773,  789,  308,  804,  820,  339,
-  835,  851,  370,  866,  882,  401,  897,  913,
-  432,  928,  944,  464,  960,  976,  433,  929,
-  945,  402,  898,  914,  371,  867,  883,  340,
-  836,  852,  309,  805,  821,  278,  774,  790,
-  247,  743,  759,  216,  712,  728,  185,  681,
-  697,  154,  650,  666,  123,  619,  635,   92,
-  588,  604,   61,  557,  573,   30,  526,  542,
-  31,  527,  543,   62,  558,  574,   93,  589,
-  605,  124,  620,  636,  155,  651,  667,  186,
-  682,  698,  217,  713,  729,  248,  744,  760,
-  279,  775,  791,  310,  806,  822,  341,  837,
-  853,  372,  868,  884,  403,  899,  915,  434,
-  930,  946,  465,  961,  977,  496,  992, 1008,
-  497,  993, 1009,  466,  962,  978,  435,  931,
-  947,  404,  900,  916,  373,  869,  885,  342,
-  838,  854,  311,  807,  823,  280,  776,  792,
-  249,  745,  761,  218,  714,  730,  187,  683,
-  699,  156,  652,  668,  125,  621,  637,   94,
-  590,  606,   63,  559,  575,   95,  591,  607,
-  126,  622,  638,  157,  653,  669,  188,  684,
-  700,  219,  715,  731,  250,  746,  762,  281,
-  777,  793,  312,  808,  824,  343,  839,  855,
-  374,  870,  886,  405,  901,  917,  436,  932,
-  948,  467,  963,  979,  498,  994, 1010,  499,
-  995, 1011,  468,  964,  980,  437,  933,  949,
-  406,  902,  918,  375,  871,  887,  344,  840,
-
-  856,  313,  809,  825,  282,  778,  794,  251,
-  747,  763,  220,  716,  732,  189,  685,  701,
-  158,  654,  670,  127,  623,  639,  159,  655,
-  671,  190,  686,  702,  221,  717,  733,  252,
-  748,  764,  283,  779,  795,  314,  810,  826,
-  345,  841,  857,  376,  872,  888,  407,  903,
-  919,  438,  934,  950,  469,  965,  981,  500,
-  996, 1012,  501,  997, 1013,  470,  966,  982,
-  439,  935,  951,  408,  904,  920,  377,  873,
-  889,  346,  842,  858,  315,  811,  827,  284,
-  780,  796,  253,  749,  765,  222,  718,  734,
-  191,  687,  703,  223,  719,  735,  254,  750,
-  766,  285,  781,  797,  316,  812,  828,  347,
-  843,  859,  378,  874,  890,  409,  905,  921,
-  440,  936,  952,  471,  967,  983,  502,  998,
-  1014,  503,  999, 1015,  472,  968,  984,  441,
-  937,  953,  410,  906,  922,  379,  875,  891,
-  348,  844,  860,  317,  813,  829,  286,  782,
-  798,  255,  751,  767,  287,  783,  799,  318,
-  814,  830,  349,  845,  861,  380,  876,  892,
-  411,  907,  923,  442,  938,  954,  473,  969,
-  985,  504, 1000, 1016,  505, 1001, 1017,  474,
-  970,  986,  443,  939,  955,  412,  908,  924,
-  381,  877,  893,  350,  846,  862,  319,  815,
-  831,  351,  847,  863,  382,  878,  894,  413,
-  909,  925,  444,  940,  956,  475,  971,  987,
-  506, 1002, 1018,  507, 1003, 1019,  476,  972,
-  988,  445,  941,  957,  414,  910,  926,  383,
-  879,  895,  415,  911,  927,  446,  942,  958,
-  477,  973,  989,  508, 1004, 1020,  509, 1005,
-  1021,  478,  974,  990,  447,  943,  959,  479,
-  975,  991,  510, 1006, 1022,  511, 1007, 1023,
-};
-
-#elif DWTDCT_TYPE == DWTDCT8X8
-
-DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = {
-  0, 1, 2, 3, 5, 4, 4, 5,
-  5, 3, 6, 3, 5, 4, 6, 6,
-  6, 5, 5, 6, 6, 6, 6, 6,
-  6, 6, 6, 6, 6, 6, 6, 6,
-  6, 6, 6, 6, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7,
-
-  6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-
-  6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-};
-
-DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = {
-  0,    1,   32,   64,   33,    2,    3,   34,
-  65,   96,  128,   97,   66,   35,    4,    5,
-  36,   67,   98,  129,  160,  192,  161,  130,
-  99,   68,   37,    6,    7,   38,   69,  100,
-  131,  162,  193,  224,  225,  194,  163,  132,
-  101,   70,   39,   71,  102,  133,  164,  195,
-  226,  227,  196,  165,  134,  103,  135,  166,
-  197,  228,  229,  198,  167,  199,  230,  231,
-
-  8,  256,  264,    9,  257,  265,   40,  288, 296, 72,  320,  328,
-  41,  289,  297,   10, 258,  266, 11,  259,  267,   42,  290,  298,
-  73,  321,  329,  104,  352,  360,  136,  384, 392,  105,  353,  361,
-  74,  322,  330,   43, 291,  299,   12,  260,  268,   13,  261,  269,
-  44,  292,  300,   75,  323,  331,  106,  354, 362,  137,  385,  393,
-  168,  416,  424,  200, 448,  456,  169,  417,  425,  138,  386,  394,
-  107,  355,  363,   76,  324,  332,   45,  293, 301,   14,  262,  270,
-  15,  263,  271,   46, 294,  302,   77,  325,  333,  108,  356,  364,
-  139,  387,  395,  170, 418,  426,  201,  449, 457,  232,  480,  488,
-  233,  481,  489,  202, 450,  458,  171,  419,  427,  140,  388,  396,
-  109,  357,  365,   78,  326,  334,   47,  295, 303,   79,  327,  335,
-  110,  358,  366,  141, 389,  397,  172,  420,  428,  203,  451,  459,
-  234,  482,  490,  235,  483,  491,  204,  452, 460,  173,  421,  429,
-  142,  390,  398,  111, 359,  367,  143,  391,  399,  174,  422,  430,
-  205,  453,  461,  236,  484,  492,  237,  485, 493,  206,  454,  462,
-  175,  423,  431,  207, 455,  463,  238,  486,  494,  239,  487,  495,
-
-  16,  512,  528,   17,  513,  529,   18,  514,
-  530,   19,  515,  531,   20,  516,  532,   21,
-  517,  533,   22,  518,  534,   23,  519,  535,
-  24,  520,  536,   25,  521,  537,   26,  522,
-  538,   27,  523,  539,   28,  524,  540,   29,
-  525,  541,   30,  526,  542,   31,  527,  543,
-  48,  544,  560,   49,  545,  561,   50,  546,
-  562,   51,  547,  563,   52,  548,  564,   53,
-  549,  565,   54,  550,  566,   55,  551,  567,
-  56,  552,  568,   57,  553,  569,   58,  554,
-  570,   59,  555,  571,   60,  556,  572,   61,
-  557,  573,   62,  558,  574,   63,  559,  575,
-  80,  576,  592,   81,  577,  593,   82,  578,
-  594,   83,  579,  595,   84,  580,  596,   85,
-  581,  597,   86,  582,  598,   87,  583,  599,
-  88,  584,  600,   89,  585,  601,   90,  586,
-  602,   91,  587,  603,   92,  588,  604,   93,
-  589,  605,   94,  590,  606,   95,  591,  607,
-  112,  608,  624,  113,  609,  625,  114,  610,
-  626,  115,  611,  627,  116,  612,  628,  117,
-  613,  629,  118,  614,  630,  119,  615,  631,
-  120,  616,  632,  121,  617,  633,  122,  618,
-  634,  123,  619,  635,  124,  620,  636,  125,
-  621,  637,  126,  622,  638,  127,  623,  639,
-  144,  640,  656,  145,  641,  657,  146,  642,
-  658,  147,  643,  659,  148,  644,  660,  149,
-  645,  661,  150,  646,  662,  151,  647,  663,
-  152,  648,  664,  153,  649,  665,  154,  650,
-  666,  155,  651,  667,  156,  652,  668,  157,
-  653,  669,  158,  654,  670,  159,  655,  671,
-  176,  672,  688,  177,  673,  689,  178,  674,
-  690,  179,  675,  691,  180,  676,  692,  181,
-  677,  693,  182,  678,  694,  183,  679,  695,
-  184,  680,  696,  185,  681,  697,  186,  682,
-  698,  187,  683,  699,  188,  684,  700,  189,
-  685,  701,  190,  686,  702,  191,  687,  703,
-  208,  704,  720,  209,  705,  721,  210,  706,
-  722,  211,  707,  723,  212,  708,  724,  213,
-  709,  725,  214,  710,  726,  215,  711,  727,
-  216,  712,  728,  217,  713,  729,  218,  714,
-  730,  219,  715,  731,  220,  716,  732,  221,
-  717,  733,  222,  718,  734,  223,  719,  735,
-  240,  736,  752,  241,  737,  753,  242,  738,
-  754,  243,  739,  755,  244,  740,  756,  245,
-  741,  757,  246,  742,  758,  247,  743,  759,
-  248,  744,  760,  249,  745,  761,  250,  746,
-  762,  251,  747,  763,  252,  748,  764,  253,
-  749,  765,  254,  750,  766,  255,  751,  767,
-  272,  768,  784,  273,  769,  785,  274,  770,
-  786,  275,  771,  787,  276,  772,  788,  277,
-  773,  789,  278,  774,  790,  279,  775,  791,
-  280,  776,  792,  281,  777,  793,  282,  778,
-  794,  283,  779,  795,  284,  780,  796,  285,
-  781,  797,  286,  782,  798,  287,  783,  799,
-  304,  800,  816,  305,  801,  817,  306,  802,
-  818,  307,  803,  819,  308,  804,  820,  309,
-  805,  821,  310,  806,  822,  311,  807,  823,
-  312,  808,  824,  313,  809,  825,  314,  810,
-  826,  315,  811,  827,  316,  812,  828,  317,
-  813,  829,  318,  814,  830,  319,  815,  831,
-  336,  832,  848,  337,  833,  849,  338,  834,
-  850,  339,  835,  851,  340,  836,  852,  341,
-  837,  853,  342,  838,  854,  343,  839,  855,
-  344,  840,  856,  345,  841,  857,  346,  842,
-  858,  347,  843,  859,  348,  844,  860,  349,
-  845,  861,  350,  846,  862,  351,  847,  863,
-  368,  864,  880,  369,  865,  881,  370,  866,
-  882,  371,  867,  883,  372,  868,  884,  373,
-  869,  885,  374,  870,  886,  375,  871,  887,
-  376,  872,  888,  377,  873,  889,  378,  874,
-  890,  379,  875,  891,  380,  876,  892,  381,
-  877,  893,  382,  878,  894,  383,  879,  895,
-  400,  896,  912,  401,  897,  913,  402,  898,
-  914,  403,  899,  915,  404,  900,  916,  405,
-  901,  917,  406,  902,  918,  407,  903,  919,
-  408,  904,  920,  409,  905,  921,  410,  906,
-  922,  411,  907,  923,  412,  908,  924,  413,
-  909,  925,  414,  910,  926,  415,  911,  927,
-  432,  928,  944,  433,  929,  945,  434,  930,
-  946,  435,  931,  947,  436,  932,  948,  437,
-  933,  949,  438,  934,  950,  439,  935,  951,
-  440,  936,  952,  441,  937,  953,  442,  938,
-  954,  443,  939,  955,  444,  940,  956,  445,
-  941,  957,  446,  942,  958,  447,  943,  959,
-  464,  960,  976,  465,  961,  977,  466,  962,
-  978,  467,  963,  979,  468,  964,  980,  469,
-  965,  981,  470,  966,  982,  471,  967,  983,
-  472,  968,  984,  473,  969,  985,  474,  970,
-  986,  475,  971,  987,  476,  972,  988,  477,
-  973,  989,  478,  974,  990,  479,  975,  991,
-  496,  992, 1008,  497,  993, 1009,  498,  994,
-  1010,  499,  995, 1011,  500,  996, 1012,  501,
-  997, 1013,  502,  998, 1014,  503,  999, 1015,
-  504, 1000, 1016,  505, 1001, 1017,  506, 1002,
-  1018,  507, 1003, 1019,  508, 1004, 1020,  509,
-  1005, 1021,  510, 1006, 1022,  511, 1007, 1023,
-};
-#endif
-
-#else
-
 DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = {
   0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, 6,
   6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
@@ -865,7 +247,6 @@ DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = {
   951,  920,  889,  858,  827,  796,  765,  734,  703,  735,  766,  797,  828,  859,  890,  921,  952,  983, 1014, 1015,  984,  953,  922,  891,  860,  829,  798,  767,  799,  830,  861,  892,
   923,  954,  985, 1016, 1017,  986,  955,  924,  893,  862,  831,  863,  894,  925,  956,  987, 1018, 1019,  988,  957,  926,  895,  927,  958,  989, 1020, 1021,  990,  959,  991, 1022, 1023,
 };
-#endif  // CONFIG_DWTDCTHYBRID
 
 /* Array indices are identical to previously-existing CONTEXT_NODE indices */
 
@@ -937,145 +318,28 @@ vp9_extra_bit_struct vp9_extra_bits[12] = {
 
 #include "vp9/common/vp9_default_coef_probs.h"
 
-#if CONFIG_NEWCOEFCONTEXT
-
-// Neighborhood 5-tuples for various scans and blocksizes,
-// in {top, left, topleft, topright, bottomleft} order
-// for each position in raster scan order.
-// -1 indicates the neighbor does not exist.
-DECLARE_ALIGNED(16, int,
-                vp9_default_zig_zag1d_4x4_neighbors[16 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int,
-                vp9_col_scan_4x4_neighbors[16 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int,
-                vp9_row_scan_4x4_neighbors[16 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int,
-                vp9_default_zig_zag1d_8x8_neighbors[64 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int,
-                vp9_default_zig_zag1d_16x16_neighbors[256 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int,
-                vp9_default_zig_zag1d_32x32_neighbors[1024 * MAX_NEIGHBORS]);
-
-static int find_in_scan(const int *scan, int l, int m) {
-  int i, l2 = l * l;
-  for (i = 0; i < l2; ++i) {
-    if (scan[i] == m)
-      return i;
-  }
-  return -1;
-}
-
-static void init_scan_neighbors(const int *scan, int l, int *neighbors) {
-  int l2 = l * l;
-  int m, n, i, j, k;
-  for (n = 0; n < l2; ++n) {
-    int locn = find_in_scan(scan, l, n);
-    int z = -1;
-    i = n / l;
-    j = n % l;
-    for (k = 0; k < MAX_NEIGHBORS; ++k)
-      neighbors[MAX_NEIGHBORS * n + k] = -1;
-    if (i - 1 >= 0) {
-      m = (i - 1) * l + j;
-      if (find_in_scan(scan, l, m) < locn) {
-        neighbors[MAX_NEIGHBORS * n] = m;
-        if (m == 0) z = 0;
-      }
-    }
-    if (j - 1 >= 0) {
-      m = i * l + j - 1;
-      if (find_in_scan(scan, l, m) < locn) {
-        neighbors[MAX_NEIGHBORS * n + 1] = m;
-        if (m == 0) z = 1;
-      }
-    }
-    if (i - 1 >= 0 && j - 1 >= 0) {
-      m = (i - 1) * l + j - 1;
-      if (find_in_scan(scan, l, m) < locn) {
-        neighbors[MAX_NEIGHBORS * n + 2] = m;
-        if (m == 0) z = 2;
-      }
-    }
-    if (i - 1 >= 0 && j + 1 < l) {
-      m = (i - 1) * l + j + 1;
-      if (find_in_scan(scan, l, m) < locn) {
-        neighbors[MAX_NEIGHBORS * n + 3] = m;
-        if (m == 0) z = 3;
-      }
-    }
-    if (i + 1 < l && j - 1 >= 0) {
-       m = (i + 1) * l + j - 1;
-      if (find_in_scan(scan, l, m) < locn) {
-        neighbors[MAX_NEIGHBORS * n + 4] = m;
-        if (m == 0) z = 4;
-      }
-    }
-    if (z != -1) {  // zero exists
-      int v = 0;
-      for (k = 0; k < MAX_NEIGHBORS; ++k)
-        v += (neighbors[MAX_NEIGHBORS * n + k] > 0);
-      if (v) {
-        neighbors[MAX_NEIGHBORS * n + z] = -1;
-      }
-    }
-  }
-}
-
-void vp9_init_neighbors() {
-  init_scan_neighbors(vp9_default_zig_zag1d_4x4, 4,
-                      vp9_default_zig_zag1d_4x4_neighbors);
-  init_scan_neighbors(vp9_row_scan_4x4, 4,
-                      vp9_row_scan_4x4_neighbors);
-  init_scan_neighbors(vp9_col_scan_4x4, 4,
-                      vp9_col_scan_4x4_neighbors);
-  init_scan_neighbors(vp9_default_zig_zag1d_8x8, 8,
-                      vp9_default_zig_zag1d_8x8_neighbors);
-  init_scan_neighbors(vp9_default_zig_zag1d_16x16, 16,
-                      vp9_default_zig_zag1d_16x16_neighbors);
-  init_scan_neighbors(vp9_default_zig_zag1d_32x32, 32,
-                      vp9_default_zig_zag1d_32x32_neighbors);
-}
-
-const int *vp9_get_coef_neighbors_handle(const int *scan) {
-  if (scan == vp9_default_zig_zag1d_4x4) {
-    return vp9_default_zig_zag1d_4x4_neighbors;
-  } else if (scan == vp9_row_scan_4x4) {
-    return vp9_row_scan_4x4_neighbors;
-  } else if (scan == vp9_col_scan_4x4) {
-    return vp9_col_scan_4x4_neighbors;
-  } else if (scan == vp9_default_zig_zag1d_8x8) {
-    return vp9_default_zig_zag1d_8x8_neighbors;
-  } else if (scan == vp9_default_zig_zag1d_16x16) {
-    return vp9_default_zig_zag1d_16x16_neighbors;
-  } else if (scan == vp9_default_zig_zag1d_32x32) {
-    return vp9_default_zig_zag1d_32x32_neighbors;
+// This function updates and then returns an AC coefficient context.
+// This is currently a placeholder function to allow experimentation
+// using various context models based on the energy of earlier tokens
+// within the current block.
+//
+// For now it just returns the previously used context.
+int vp9_get_coef_context(int * recent_energy, int token) {
+  // int token_energy;
+  // int av_energy;
+
+  // Placeholder code for experiments with token energy
+  // as a coefficient context.
+  /*token_energy = ((token != DCT_EOB_TOKEN) ? token : 0);
+  if (token_energy) {
+    av_energy = (token_energy + *recent_energy + 1) >> 1;
+  } else {
+    av_energy = 0;
   }
-  return vp9_default_zig_zag1d_4x4_neighbors;
-}
+  *recent_energy = token_energy;*/
 
-int vp9_get_coef_neighbor_context(const short int *qcoeff_ptr, int nodc,
-                                  const int *neigbor_handle, int rc) {
-  static int neighbors_used = MAX_NEIGHBORS;   // maximum is MAX_NEIGHBORS
-  const int *nb = neigbor_handle + rc * MAX_NEIGHBORS;
-  int i, v, val = 0, n = 0;
-  for (i = 0; i < neighbors_used; ++i) {
-    if (nb[i] == -1 || (nb[i] == 0 && nodc)) {
-      continue;
-    }
-    v = abs(qcoeff_ptr[nb[i]]);
-    val = (v > val ? v : val);
-    n++;
-  }
-  if (n == 0)
-    return 0;
-  else if (val <= 1)
-    return val;
-  else if (val < 4)
-    return 2;
-  else
-    return 3;
-}
-#endif  /* CONFIG_NEWCOEFCONTEXT */
+  return vp9_prev_token_class[token];
+};
 
 void vp9_default_coef_probs(VP9_COMMON *pc) {
   vpx_memcpy(pc->fc.coef_probs_4x4, default_coef_probs_4x4,
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 84e5255c2..1979638d4 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -106,9 +106,6 @@ typedef vp9_prob vp9_coeff_probs[COEF_BANDS][PREV_COEF_CONTEXTS]
 #define SUBEXP_PARAM                4   /* Subexponential code parameter */
 #define MODULUS_PARAM               13  /* Modulus parameter */
 
-extern DECLARE_ALIGNED(16, const uint8_t,
-                       vp9_prev_token_class[MAX_ENTROPY_TOKENS]);
-
 struct VP9Common;
 void vp9_default_coef_probs(struct VP9Common *);
 extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_4x4[16]);
@@ -129,26 +126,5 @@ static void vp9_reset_mb_tokens_context(MACROBLOCKD* const xd) {
   vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
 }
 
-#if CONFIG_NEWCOEFCONTEXT
-
-#define MAX_NEIGHBORS 5
-#define NEWCOEFCONTEXT_BAND_COND(b)   ((b) >= 1)
-void vp9_init_neighbors(void);
-
-const int *vp9_get_coef_neighbors_handle(const int *scan);
-int vp9_get_coef_neighbor_context(const short int *qcoeff_ptr, int nodc,
-                                  const int *neigbor_handle, int rc);
-extern DECLARE_ALIGNED(16, int, vp9_default_zig_zag1d_4x4_neighbors[
-                       16 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int, vp9_row_scan_4x4_neighbors[
-                       16 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int, vp9_col_scan_4x4_neighbors[
-                       16 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int, vp9_default_zig_zag1d_8x8_neighbors[
-                       64 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int, vp9_default_zig_zag1d_16x16_neighbors[
-                       256 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int, vp9_default_zig_zag1d_32x32_neighbors[
-                       1024 * MAX_NEIGHBORS]);
-#endif  // CONFIG_NEWCOEFCONTEXT
+extern int vp9_get_coef_context(int * recent_energy, int token);
 #endif  // VP9_COMMON_VP9_ENTROPY_H_
diff --git a/vp9/common/vp9_findnearmv.h b/vp9/common/vp9_findnearmv.h
index 74fce7aad..c42aab1a5 100644
--- a/vp9/common/vp9_findnearmv.h
+++ b/vp9/common/vp9_findnearmv.h
@@ -98,7 +98,7 @@ static int left_block_mv(const MACROBLOCKD *xd,
     b += 4;
   }
 
-  return (cur_mb->bmi + b - 1)->as_mv.first.as_int;
+  return (cur_mb->bmi + b - 1)->as_mv[0].as_int;
 }
 
 static int left_block_second_mv(const MACROBLOCKD *xd,
@@ -117,8 +117,8 @@ static int left_block_second_mv(const MACROBLOCKD *xd,
   }
 
   return cur_mb->mbmi.second_ref_frame > 0 ?
-      (cur_mb->bmi + b - 1)->as_mv.second.as_int :
-      (cur_mb->bmi + b - 1)->as_mv.first.as_int;
+      (cur_mb->bmi + b - 1)->as_mv[1].as_int :
+      (cur_mb->bmi + b - 1)->as_mv[0].as_int;
 }
 
 static int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride) {
@@ -131,7 +131,7 @@ static int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride) {
     b += 16;
   }
 
-  return (cur_mb->bmi + b - 4)->as_mv.first.as_int;
+  return (cur_mb->bmi + b - 4)->as_mv[0].as_int;
 }
 
 static int above_block_second_mv(const MODE_INFO *cur_mb, int b, int mi_stride) {
@@ -146,8 +146,8 @@ static int above_block_second_mv(const MODE_INFO *cur_mb, int b, int mi_stride)
   }
 
   return cur_mb->mbmi.second_ref_frame > 0 ?
-      (cur_mb->bmi + b - 4)->as_mv.second.as_int :
-      (cur_mb->bmi + b - 4)->as_mv.first.as_int;
+      (cur_mb->bmi + b - 4)->as_mv[1].as_int :
+      (cur_mb->bmi + b - 4)->as_mv[0].as_int;
 }
 
 static B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) {
diff --git a/vp9/common/vp9_idctllm.c b/vp9/common/vp9_idctllm.c
index 2f847dc78..2fec98e50 100644
--- a/vp9/common/vp9_idctllm.c
+++ b/vp9/common/vp9_idctllm.c
@@ -476,12 +476,13 @@ void vp9_short_inv_walsh4x4_1_x8_c(int16_t *in, int16_t *out, int pitch) {
   }
 }
 
-void vp9_dc_only_inv_walsh_add_c(short input_dc, uint8_t *pred_ptr,
+void vp9_dc_only_inv_walsh_add_c(int input_dc, uint8_t *pred_ptr,
                                  uint8_t *dst_ptr,
                                  int pitch, int stride) {
   int r, c;
-  short tmp[16];
-  vp9_short_inv_walsh4x4_1_x8_c(&input_dc, tmp, 4 << 1);
+  int16_t dc = input_dc;
+  int16_t tmp[16];
+  vp9_short_inv_walsh4x4_1_x8_c(&dc, tmp, 4 << 1);
 
   for (r = 0; r < 4; r++) {
     for (c = 0; c < 4; c++) {
@@ -1152,8 +1153,6 @@ void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output) {
   *output = (out + 32) >> 6;
 }
 
-
-#if !CONFIG_DWTDCTHYBRID
 void idct32_1d(int16_t *input, int16_t *output) {
   int16_t step1[32], step2[32];
   int temp1, temp2;
@@ -1521,7 +1520,6 @@ void idct32_1d(int16_t *input, int16_t *output) {
   output[31] = step1[0] - step1[31];
 }
 
-
 void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
   int16_t out[32 * 32];
   int16_t *outptr = &out[0];
@@ -1554,792 +1552,3 @@ void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output) {
   out = dct_const_round_shift(tmp);
   *output = (out + 32) >> 6;
 }
-
-#else  // !CONFIG_DWTDCTHYBRID
-
-#if DWT_TYPE == 53
-
-// Note: block length must be even for this implementation
-static void synthesis_53_row(int length, int16_t *lowpass, int16_t *highpass,
-                             int16_t *x) {
-  int16_t r, *a, *b;
-  int n;
-
-  n = length >> 1;
-  b = highpass;
-  a = lowpass;
-  r = *highpass;
-  while (n--) {
-    *a++ -= (r + (*b) + 1) >> 1;
-    r = *b++;
-  }
-
-  n = length >> 1;
-  b = highpass;
-  a = lowpass;
-  while (--n) {
-    *x++ = ((r = *a++) + 1) >> 1;
-    *x++ = *b++ + ((r + (*a) + 2) >> 2);
-  }
-  *x++ = ((r = *a) + 1) >> 1;
-  *x++ = *b + ((r + 1) >> 1);
-}
-
-static void synthesis_53_col(int length, int16_t *lowpass, int16_t *highpass,
-                             int16_t *x) {
-  int16_t r, *a, *b;
-  int n;
-
-  n = length >> 1;
-  b = highpass;
-  a = lowpass;
-  r = *highpass;
-  while (n--) {
-    *a++ -= (r + (*b) + 1) >> 1;
-    r = *b++;
-  }
-
-  n = length >> 1;
-  b = highpass;
-  a = lowpass;
-  while (--n) {
-    r = *a++;
-    *x++ = r;
-    *x++ = ((*b++) << 1) + ((r + (*a) + 1) >> 1);
-  }
-  *x++ = *a;
-  *x++ = ((*b) << 1) + *a;
-}
-
-static void dyadic_synthesize_53(int levels, int width, int height, int16_t *c,
-                                 int pitch_c, int16_t *x, int pitch_x) {
-  int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width;
-  short buffer[2 * DWT_MAX_LENGTH];
-
-  th[0] = hh;
-  tw[0] = hw;
-  for (i = 1; i <= levels; i++) {
-    th[i] = (th[i - 1] + 1) >> 1;
-    tw[i] = (tw[i - 1] + 1) >> 1;
-  }
-  for (lv = levels - 1; lv >= 0; lv--) {
-    nh = th[lv];
-    nw = tw[lv];
-    hh = th[lv + 1];
-    hw = tw[lv + 1];
-    if ((nh < 2) || (nw < 2)) continue;
-    for (j = 0; j < nw; j++) {
-      for (i = 0; i < nh; i++)
-        buffer[i] = c[i * pitch_c + j];
-      synthesis_53_col(nh, buffer, buffer + hh, buffer + nh);
-      for (i = 0; i < nh; i++)
-        c[i * pitch_c + j] = buffer[i + nh];
-    }
-    for (i = 0; i < nh; i++) {
-      memcpy(buffer, &c[i * pitch_c], nw * sizeof(*buffer));
-      synthesis_53_row(nw, buffer, buffer + hw, &c[i * pitch_c]);
-    }
-  }
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      x[i * pitch_x + j] = c[i * pitch_c + j] >= 0 ?
-          ((c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS) :
-          -((-c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS);
-    }
-  }
-}
-
-#elif DWT_TYPE == 26
-
-// Note: block length must be even for this implementation
-static void synthesis_26_row(int length, int16_t *lowpass, int16_t *highpass,
-                             int16_t *x) {
-  int16_t r, s, *a, *b;
-  int i, n = length >> 1;
-
-  if (n >= 4) {
-    a = lowpass;
-    b = highpass;
-    r = *lowpass;
-    while (--n) {
-      *b++ += (r - a[1] + 4) >> 3;
-      r = *a++;
-    }
-    *b += (r - *a + 4) >> 3;
-  }
-  a = lowpass;
-  b = highpass;
-  for (i = length >> 1; i; i--) {
-    s = *b++;
-    r = *a++;
-    *x++ = (r + s + 1) >> 1;
-    *x++ = (r - s + 1) >> 1;
-  }
-}
-
-static void synthesis_26_col(int length, int16_t *lowpass, int16_t *highpass,
-                             int16_t *x) {
-  int16_t r, s, *a, *b;
-  int i, n = length >> 1;
-
-  if (n >= 4) {
-    a = lowpass;
-    b = highpass;
-    r = *lowpass;
-    while (--n) {
-      *b++ += (r - a[1] + 4) >> 3;
-      r = *a++;
-    }
-    *b += (r - *a + 4) >> 3;
-  }
-  a = lowpass;
-  b = highpass;
-  for (i = length >> 1; i; i--) {
-    s = *b++;
-    r = *a++;
-    *x++ = r + s;
-    *x++ = r - s;
-  }
-}
-
-static void dyadic_synthesize_26(int levels, int width, int height, int16_t *c,
-                                 int pitch_c, int16_t *x, int pitch_x) {
-  int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width;
-  int16_t buffer[2 * DWT_MAX_LENGTH];
-
-  th[0] = hh;
-  tw[0] = hw;
-  for (i = 1; i <= levels; i++) {
-    th[i] = (th[i - 1] + 1) >> 1;
-    tw[i] = (tw[i - 1] + 1) >> 1;
-  }
-  for (lv = levels - 1; lv >= 0; lv--) {
-    nh = th[lv];
-    nw = tw[lv];
-    hh = th[lv + 1];
-    hw = tw[lv + 1];
-    if ((nh < 2) || (nw < 2)) continue;
-    for (j = 0; j < nw; j++) {
-      for (i = 0; i < nh; i++)
-        buffer[i] = c[i * pitch_c + j];
-      synthesis_26_col(nh, buffer, buffer + hh, buffer + nh);
-      for (i = 0; i < nh; i++)
-        c[i * pitch_c + j] = buffer[i + nh];
-    }
-    for (i = 0; i < nh; i++) {
-      memcpy(buffer, &c[i * pitch_c], nw * sizeof(*buffer));
-      synthesis_26_row(nw, buffer, buffer + hw, &c[i * pitch_c]);
-    }
-  }
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      x[i * pitch_x + j] = c[i * pitch_c + j] >= 0 ?
-          ((c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS) :
-          -((-c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS);
-    }
-  }
-}
-
-#elif DWT_TYPE == 97
-
-static void synthesis_97(int length, double *lowpass, double *highpass,
-                         double *x) {
-  static const double a_predict1 = -1.586134342;
-  static const double a_update1 = -0.05298011854;
-  static const double a_predict2 = 0.8829110762;
-  static const double a_update2 = 0.4435068522;
-  static const double s_low = 1.149604398;
-  static const double s_high = 1/1.149604398;
-  static const double inv_s_low = 1 / s_low;
-  static const double inv_s_high = 1 / s_high;
-  int i;
-  double y[DWT_MAX_LENGTH];
-  // Undo pack and scale
-  for (i = 0; i < length / 2; i++) {
-    y[i * 2] = lowpass[i] * inv_s_low;
-    y[i * 2 + 1] = highpass[i] * inv_s_high;
-  }
-  memcpy(x, y, sizeof(*y) * length);
-  // Undo update 2
-  for (i = 2; i < length; i += 2) {
-    x[i] -= a_update2 * (x[i-1] + x[i+1]);
-  }
-  x[0] -= 2 * a_update2 * x[1];
-  // Undo predict 2
-  for (i = 1; i < length - 2; i += 2) {
-    x[i] -= a_predict2 * (x[i - 1] + x[i + 1]);
-  }
-  x[length - 1] -= 2 * a_predict2 * x[length - 2];
-  // Undo update 1
-  for (i = 2; i < length; i += 2) {
-    x[i] -= a_update1 * (x[i - 1] + x[i + 1]);
-  }
-  x[0] -= 2 * a_update1 * x[1];
-  // Undo predict 1
-  for (i = 1; i < length - 2; i += 2) {
-    x[i] -= a_predict1 * (x[i - 1] + x[i + 1]);
-  }
-  x[length - 1] -= 2 * a_predict1 * x[length - 2];
-}
-
-static void dyadic_synthesize_97(int levels, int width, int height, int16_t *c,
-                                 int pitch_c, int16_t *x, int pitch_x) {
-  int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width;
-  double buffer[2 * DWT_MAX_LENGTH];
-  double y[DWT_MAX_LENGTH * DWT_MAX_LENGTH];
-
-  th[0] = hh;
-  tw[0] = hw;
-  for (i = 1; i <= levels; i++) {
-    th[i] = (th[i - 1] + 1) >> 1;
-    tw[i] = (tw[i - 1] + 1) >> 1;
-  }
-  for (lv = levels - 1; lv >= 0; lv--) {
-    nh = th[lv];
-    nw = tw[lv];
-    hh = th[lv + 1];
-    hw = tw[lv + 1];
-    if ((nh < 2) || (nw < 2)) continue;
-    for (j = 0; j < nw; j++) {
-      for (i = 0; i < nh; i++)
-        buffer[i] = c[i * pitch_c + j];
-      synthesis_97(nh, buffer, buffer + hh, buffer + nh);
-      for (i = 0; i < nh; i++)
-        y[i * DWT_MAX_LENGTH + j] = buffer[i + nh];
-    }
-    for (i = 0; i < nh; i++) {
-      memcpy(buffer, &y[i * DWT_MAX_LENGTH], nw * sizeof(*buffer));
-      synthesis_97(nw, buffer, buffer + hw, &y[i * DWT_MAX_LENGTH]);
-    }
-  }
-  for (i = 0; i < height; i++)
-    for (j = 0; j < width; j++)
-      x[i * pitch_x + j] = round(y[i * DWT_MAX_LENGTH + j] /
-                                 (1 << DWT_PRECISION_BITS));
-}
-
-#endif  // DWT_TYPE
-
-// TODO(debargha): Implement scaling differently so as not to have to use the
-// floating point 16x16 dct
-static void butterfly_16x16_idct_1d_f(double input[16], double output[16]) {
-  static const double C1 = 0.995184726672197;
-  static const double C2 = 0.98078528040323;
-  static const double C3 = 0.956940335732209;
-  static const double C4 = 0.923879532511287;
-  static const double C5 = 0.881921264348355;
-  static const double C6 = 0.831469612302545;
-  static const double C7 = 0.773010453362737;
-  static const double C8 = 0.707106781186548;
-  static const double C9 = 0.634393284163646;
-  static const double C10 = 0.555570233019602;
-  static const double C11 = 0.471396736825998;
-  static const double C12 = 0.38268343236509;
-  static const double C13 = 0.290284677254462;
-  static const double C14 = 0.195090322016128;
-  static const double C15 = 0.098017140329561;
-
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-  {
-    double step[16];
-    double intermediate[16];
-    double temp1, temp2;
-
-
-    // step 1 and 2
-    step[ 0] = input[0] + input[8];
-    step[ 1] = input[0] - input[8];
-
-    temp1 = input[4]*C12;
-    temp2 = input[12]*C4;
-
-    temp1 -= temp2;
-    temp1 *= C8;
-
-    step[ 2] = 2*(temp1);
-
-    temp1 = input[4]*C4;
-    temp2 = input[12]*C12;
-    temp1 += temp2;
-    temp1 = (temp1);
-    temp1 *= C8;
-    step[ 3] = 2*(temp1);
-
-    temp1 = input[2]*C8;
-    temp1 = 2*(temp1);
-    temp2 = input[6] + input[10];
-
-    step[ 4] = temp1 + temp2;
-    step[ 5] = temp1 - temp2;
-
-    temp1 = input[14]*C8;
-    temp1 = 2*(temp1);
-    temp2 = input[6] - input[10];
-
-    step[ 6] = temp2 - temp1;
-    step[ 7] = temp2 + temp1;
-
-    // for odd input
-    temp1 = input[3]*C12;
-    temp2 = input[13]*C4;
-    temp1 += temp2;
-    temp1 = (temp1);
-    temp1 *= C8;
-    intermediate[ 8] = 2*(temp1);
-
-    temp1 = input[3]*C4;
-    temp2 = input[13]*C12;
-    temp2 -= temp1;
-    temp2 = (temp2);
-    temp2 *= C8;
-    intermediate[ 9] = 2*(temp2);
-
-    intermediate[10] = 2*(input[9]*C8);
-    intermediate[11] = input[15] - input[1];
-    intermediate[12] = input[15] + input[1];
-    intermediate[13] = 2*((input[7]*C8));
-
-    temp1 = input[11]*C12;
-    temp2 = input[5]*C4;
-    temp2 -= temp1;
-    temp2 = (temp2);
-    temp2 *= C8;
-    intermediate[14] = 2*(temp2);
-
-    temp1 = input[11]*C4;
-    temp2 = input[5]*C12;
-    temp1 += temp2;
-    temp1 = (temp1);
-    temp1 *= C8;
-    intermediate[15] = 2*(temp1);
-
-    step[ 8] = intermediate[ 8] + intermediate[14];
-    step[ 9] = intermediate[ 9] + intermediate[15];
-    step[10] = intermediate[10] + intermediate[11];
-    step[11] = intermediate[10] - intermediate[11];
-    step[12] = intermediate[12] + intermediate[13];
-    step[13] = intermediate[12] - intermediate[13];
-    step[14] = intermediate[ 8] - intermediate[14];
-    step[15] = intermediate[ 9] - intermediate[15];
-
-    // step 3
-    output[0] = step[ 0] + step[ 3];
-    output[1] = step[ 1] + step[ 2];
-    output[2] = step[ 1] - step[ 2];
-    output[3] = step[ 0] - step[ 3];
-
-    temp1 = step[ 4]*C14;
-    temp2 = step[ 7]*C2;
-    temp1 -= temp2;
-    output[4] =  (temp1);
-
-    temp1 = step[ 4]*C2;
-    temp2 = step[ 7]*C14;
-    temp1 += temp2;
-    output[7] =  (temp1);
-
-    temp1 = step[ 5]*C10;
-    temp2 = step[ 6]*C6;
-    temp1 -= temp2;
-    output[5] =  (temp1);
-
-    temp1 = step[ 5]*C6;
-    temp2 = step[ 6]*C10;
-    temp1 += temp2;
-    output[6] =  (temp1);
-
-    output[8] = step[ 8] + step[11];
-    output[9] = step[ 9] + step[10];
-    output[10] = step[ 9] - step[10];
-    output[11] = step[ 8] - step[11];
-    output[12] = step[12] + step[15];
-    output[13] = step[13] + step[14];
-    output[14] = step[13] - step[14];
-    output[15] = step[12] - step[15];
-
-    // output 4
-    step[ 0] = output[0] + output[7];
-    step[ 1] = output[1] + output[6];
-    step[ 2] = output[2] + output[5];
-    step[ 3] = output[3] + output[4];
-    step[ 4] = output[3] - output[4];
-    step[ 5] = output[2] - output[5];
-    step[ 6] = output[1] - output[6];
-    step[ 7] = output[0] - output[7];
-
-    temp1 = output[8]*C7;
-    temp2 = output[15]*C9;
-    temp1 -= temp2;
-    step[ 8] = (temp1);
-
-    temp1 = output[9]*C11;
-    temp2 = output[14]*C5;
-    temp1 += temp2;
-    step[ 9] = (temp1);
-
-    temp1 = output[10]*C3;
-    temp2 = output[13]*C13;
-    temp1 -= temp2;
-    step[10] = (temp1);
-
-    temp1 = output[11]*C15;
-    temp2 = output[12]*C1;
-    temp1 += temp2;
-    step[11] = (temp1);
-
-    temp1 = output[11]*C1;
-    temp2 = output[12]*C15;
-    temp2 -= temp1;
-    step[12] = (temp2);
-
-    temp1 = output[10]*C13;
-    temp2 = output[13]*C3;
-    temp1 += temp2;
-    step[13] = (temp1);
-
-    temp1 = output[9]*C5;
-    temp2 = output[14]*C11;
-    temp2 -= temp1;
-    step[14] = (temp2);
-
-    temp1 = output[8]*C9;
-    temp2 = output[15]*C7;
-    temp1 += temp2;
-    step[15] = (temp1);
-
-    // step 5
-    output[0] = (step[0] + step[15]);
-    output[1] = (step[1] + step[14]);
-    output[2] = (step[2] + step[13]);
-    output[3] = (step[3] + step[12]);
-    output[4] = (step[4] + step[11]);
-    output[5] = (step[5] + step[10]);
-    output[6] = (step[6] + step[ 9]);
-    output[7] = (step[7] + step[ 8]);
-
-    output[15] = (step[0] - step[15]);
-    output[14] = (step[1] - step[14]);
-    output[13] = (step[2] - step[13]);
-    output[12] = (step[3] - step[12]);
-    output[11] = (step[4] - step[11]);
-    output[10] = (step[5] - step[10]);
-    output[9] = (step[6] - step[ 9]);
-    output[8] = (step[7] - step[ 8]);
-  }
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-}
-
-static void vp9_short_idct16x16_c_f(int16_t *input, int16_t *output, int pitch,
-                                    int scale) {
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-  {
-    double out[16*16], out2[16*16];
-    const int short_pitch = pitch >> 1;
-    int i, j;
-      // First transform rows
-    for (i = 0; i < 16; ++i) {
-      double temp_in[16], temp_out[16];
-      for (j = 0; j < 16; ++j)
-        temp_in[j] = input[j + i*short_pitch];
-      butterfly_16x16_idct_1d_f(temp_in, temp_out);
-      for (j = 0; j < 16; ++j)
-        out[j + i*16] = temp_out[j];
-    }
-    // Then transform columns
-    for (i = 0; i < 16; ++i) {
-      double temp_in[16], temp_out[16];
-      for (j = 0; j < 16; ++j)
-        temp_in[j] = out[j*16 + i];
-      butterfly_16x16_idct_1d_f(temp_in, temp_out);
-      for (j = 0; j < 16; ++j)
-        out2[j*16 + i] = temp_out[j];
-    }
-    for (i = 0; i < 16*16; ++i)
-      output[i] = round(out2[i] / (128 >> scale));
-  }
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-}
-
-static void idct8_1d_f(double *x) {
-  int i, j;
-  double t[8];
-  static const double idctmat[64] = {
-    0.35355339059327,  0.49039264020162,  0.46193976625564,  0.41573480615127,
-    0.35355339059327,   0.2777851165098,  0.19134171618254, 0.097545161008064,
-    0.35355339059327,  0.41573480615127,  0.19134171618254, -0.097545161008064,
-    -0.35355339059327, -0.49039264020161, -0.46193976625564,  -0.2777851165098,
-    0.35355339059327,   0.2777851165098, -0.19134171618254, -0.49039264020162,
-    -0.35355339059327, 0.097545161008064,  0.46193976625564,  0.41573480615127,
-    0.35355339059327, 0.097545161008063, -0.46193976625564,  -0.2777851165098,
-    0.35355339059327,  0.41573480615127, -0.19134171618254, -0.49039264020162,
-    0.35355339059327, -0.097545161008063, -0.46193976625564,   0.2777851165098,
-    0.35355339059327, -0.41573480615127, -0.19134171618255,  0.49039264020162,
-    0.35355339059327,  -0.2777851165098, -0.19134171618254,  0.49039264020161,
-    -0.35355339059327, -0.097545161008064,  0.46193976625564, -0.41573480615127,
-    0.35355339059327, -0.41573480615127,  0.19134171618254, 0.097545161008065,
-    -0.35355339059327,  0.49039264020162, -0.46193976625564,   0.2777851165098,
-    0.35355339059327, -0.49039264020162,  0.46193976625564, -0.41573480615127,
-    0.35355339059327,  -0.2777851165098,  0.19134171618255, -0.097545161008064
-  };
-  for (i = 0; i < 8; ++i) {
-    t[i] = 0;
-    for (j = 0; j < 8; ++j)
-      t[i] += idctmat[i * 8 + j] * x[j];
-  }
-  for (i = 0; i < 8; ++i) {
-    x[i] = t[i];
-  }
-}
-
-static void vp9_short_idct8x8_c_f(int16_t *coefs, int16_t *block, int pitch,
-                                  int scale) {
-  double X[8 * 8], Y[8];
-  int i, j;
-  int shortpitch = pitch >> 1;
-
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-  {
-    for (i = 0; i < 8; i++) {
-      for (j = 0; j < 8; j++) {
-        X[i * 8 + j] = (double)coefs[i * shortpitch + j];
-      }
-    }
-    for (i = 0; i < 8; i++)
-      idct8_1d_f(X + 8 * i);
-    for (i = 0; i < 8; i++) {
-      for (j = 0; j < 8; ++j)
-        Y[j] = X[i + 8 * j];
-      idct8_1d_f(Y);
-      for (j = 0; j < 8; ++j)
-        X[i + 8 * j] = Y[j];
-    }
-    for (i = 0; i < 8; i++) {
-      for (j = 0; j < 8; j++) {
-        block[i * 8 + j] = (int16_t)round(X[i * 8 + j] / (8 >> scale));
-      }
-    }
-  }
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-}
-
-#define multiply_bits(d, n) ((n) < 0 ? (d) >> (n) : (d) << (n))
-
-#if DWTDCT_TYPE == DWTDCT16X16_LEAN
-
-void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
-  // assume output is a 32x32 buffer
-  // Temporary buffer to hold a 16x16 block for 16x16 inverse dct
-  int16_t buffer[16 * 16];
-  // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt
-  int16_t buffer2[32 * 32];
-  // Note: pitch is in bytes, short_pitch is in short units
-  const int short_pitch = pitch >> 1;
-  int i, j;
-
-  // TODO(debargha): Implement more efficiently by adding output pitch
-  // argument to the idct16x16 function
-  vp9_short_idct16x16_c_f(input, buffer, pitch,
-                          1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i) {
-    vpx_memcpy(buffer2 + i * 32, buffer + i * 16, sizeof(*buffer2) * 16);
-  }
-  for (i = 0; i < 16; ++i) {
-    for (j = 16; j < 32; ++j) {
-      buffer2[i * 32 + j] =
-          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2);
-    }
-  }
-  for (i = 16; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) {
-      buffer2[i * 32 + j] =
-          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2);
-    }
-  }
-#if DWT_TYPE == 26
-  dyadic_synthesize_26(1, 32, 32, buffer2, 32, output, 32);
-#elif DWT_TYPE == 97
-  dyadic_synthesize_97(1, 32, 32, buffer2, 32, output, 32);
-#elif DWT_TYPE == 53
-  dyadic_synthesize_53(1, 32, 32, buffer2, 32, output, 32);
-#endif
-}
-
-#elif DWTDCT_TYPE == DWTDCT16X16
-
-void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
-  // assume output is a 32x32 buffer
-  // Temporary buffer to hold a 16x16 block for 16x16 inverse dct
-  int16_t buffer[16 * 16];
-  // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt
-  int16_t buffer2[32 * 32];
-  // Note: pitch is in bytes, short_pitch is in short units
-  const int short_pitch = pitch >> 1;
-  int i, j;
-
-  // TODO(debargha): Implement more efficiently by adding output pitch
-  // argument to the idct16x16 function
-  vp9_short_idct16x16_c_f(input, buffer, pitch,
-                          1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i) {
-    vpx_memcpy(buffer2 + i * 32, buffer + i * 16, sizeof(*buffer2) * 16);
-  }
-  vp9_short_idct16x16_c_f(input + 16, buffer, pitch,
-                          1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i) {
-    vpx_memcpy(buffer2 + i * 32 + 16, buffer + i * 16, sizeof(*buffer2) * 16);
-  }
-  vp9_short_idct16x16_c_f(input + 16 * short_pitch, buffer, pitch,
-                          1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i) {
-    vpx_memcpy(buffer2 + i * 32 + 16 * 32, buffer + i * 16,
-               sizeof(*buffer2) * 16);
-  }
-  vp9_short_idct16x16_c_f(input + 16 * short_pitch + 16, buffer, pitch,
-                          1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i) {
-    vpx_memcpy(buffer2 + i * 32 + 16 * 33, buffer + i * 16,
-               sizeof(*buffer2) * 16);
-  }
-#if DWT_TYPE == 26
-  dyadic_synthesize_26(1, 32, 32, buffer2, 32, output, 32);
-#elif DWT_TYPE == 97
-  dyadic_synthesize_97(1, 32, 32, buffer2, 32, output, 32);
-#elif DWT_TYPE == 53
-  dyadic_synthesize_53(1, 32, 32, buffer2, 32, output, 32);
-#endif
-}
-
-#elif DWTDCT_TYPE == DWTDCT8X8
-
-void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
-  // assume output is a 32x32 buffer
-  // Temporary buffer to hold a 16x16 block for 16x16 inverse dct
-  int16_t buffer[8 * 8];
-  // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt
-  int16_t buffer2[32 * 32];
-  // Note: pitch is in bytes, short_pitch is in short units
-  const int short_pitch = pitch >> 1;
-  int i, j;
-
-  // TODO(debargha): Implement more efficiently by adding output pitch
-  // argument to the idct16x16 function
-  vp9_short_idct8x8_c_f(input, buffer, pitch,
-                        1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 8; ++i) {
-    vpx_memcpy(buffer2 + i * 32, buffer + i * 8, sizeof(*buffer2) * 8);
-  }
-  vp9_short_idct8x8_c_f(input + 8, buffer, pitch,
-                        1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 8; ++i) {
-    vpx_memcpy(buffer2 + i * 32 + 8, buffer + i * 8, sizeof(*buffer2) * 8);
-  }
-  vp9_short_idct8x8_c_f(input + 8 * short_pitch, buffer, pitch,
-                        1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 8; ++i) {
-    vpx_memcpy(buffer2 + i * 32 + 8 * 32, buffer + i * 8,
-               sizeof(*buffer2) * 8);
-  }
-  vp9_short_idct8x8_c_f(input + 8 * short_pitch + 8, buffer, pitch,
-                        1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 8; ++i) {
-    vpx_memcpy(buffer2 + i * 32 + 8 * 33, buffer + i * 8,
-               sizeof(*buffer2) * 8);
-  }
-  for (i = 0; i < 16; ++i) {
-    for (j = 16; j < 32; ++j) {
-      buffer2[i * 32 + j] =
-          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2);
-    }
-  }
-  for (i = 16; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) {
-      buffer2[i * 32 + j] =
-          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2);
-    }
-  }
-#if DWT_TYPE == 26
-  dyadic_synthesize_26(2, 32, 32, buffer2, 32, output, 32);
-#elif DWT_TYPE == 97
-  dyadic_synthesize_97(2, 32, 32, buffer2, 32, output, 32);
-#elif DWT_TYPE == 53
-  dyadic_synthesize_53(2, 32, 32, buffer2, 32, output, 32);
-#endif
-}
-
-#endif
-
-#if CONFIG_TX64X64
-void vp9_short_idct64x64_c(int16_t *input, int16_t *output, int pitch) {
-  // assume output is a 64x64 buffer
-  // Temporary buffer to hold a 16x16 block for 16x16 inverse dct
-  int16_t buffer[16 * 16];
-  // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt
-  int16_t buffer2[64 * 64];
-  // Note: pitch is in bytes, short_pitch is in short units
-  const int short_pitch = pitch >> 1;
-  int i, j;
-
-  // TODO(debargha): Implement more efficiently by adding output pitch
-  // argument to the idct16x16 function
-  vp9_short_idct16x16_c_f(input, buffer, pitch,
-                          2 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i) {
-    vpx_memcpy(buffer2 + i * 64, buffer + i * 16, sizeof(*buffer2) * 16);
-  }
-#if DWTDCT_TYPE == DWTDCT16X16_LEAN
-  for (i = 0; i < 16; ++i) {
-    for (j = 16; j < 64; ++j) {
-      buffer2[i * 64 + j] =
-          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1);
-    }
-  }
-  for (i = 16; i < 64; ++i) {
-    for (j = 0; j < 64; ++j) {
-      buffer2[i * 64 + j] =
-          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1);
-    }
-  }
-#elif DWTDCT_TYPE == DWTDCT16X16
-  vp9_short_idct16x16_c_f(input + 16, buffer, pitch,
-                          2 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i) {
-    vpx_memcpy(buffer2 + i * 64 + 16, buffer + i * 16, sizeof(*buffer2) * 16);
-  }
-  vp9_short_idct16x16_c_f(input + 16 * short_pitch, buffer, pitch,
-                          2 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i) {
-    vpx_memcpy(buffer2 + i * 64 + 16 * 64, buffer + i * 16,
-               sizeof(*buffer2) * 16);
-  }
-  vp9_short_idct16x16_c_f(input + 16 * short_pitch + 16, buffer, pitch,
-                          2 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i) {
-    vpx_memcpy(buffer2 + i * 64 + 16 * 65, buffer + i * 16,
-               sizeof(*buffer2) * 16);
-  }
-
-  // Copying and scaling highest bands into buffer2
-  for (i = 0; i < 32; ++i) {
-    for (j = 32; j < 64; ++j) {
-      buffer2[i * 64 + j] =
-          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1);
-    }
-  }
-  for (i = 32; i < 64; ++i) {
-    for (j = 0; j < 64; ++j) {
-      buffer2[i * 64 + j] =
-          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1);
-    }
-  }
-#endif  // DWTDCT_TYPE
-
-#if DWT_TYPE == 26
-  dyadic_synthesize_26(2, 64, 64, buffer2, 64, output, 64);
-#elif DWT_TYPE == 97
-  dyadic_synthesize_97(2, 64, 64, buffer2, 64, output, 64);
-#elif DWT_TYPE == 53
-  dyadic_synthesize_53(2, 64, 64, buffer2, 64, output, 64);
-#endif
-}
-#endif  // CONFIG_TX64X64
-#endif  // !CONFIG_DWTDCTHYBRID
diff --git a/vp9/common/vp9_invtrans.c b/vp9/common/vp9_invtrans.c
index e7cfe207b..cb9a3db63 100644
--- a/vp9/common/vp9_invtrans.c
+++ b/vp9/common/vp9_invtrans.c
@@ -32,9 +32,9 @@ static void recon_dcblock_8x8(MACROBLOCKD *xd) {
 void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int block, int pitch) {
   BLOCKD *b = &xd->block[block];
   if (b->eob <= 1)
-    xd->inv_xform4x4_1_x8(b->dqcoeff, b->diff, pitch);
+    xd->inv_txm4x4_1(b->dqcoeff, b->diff, pitch);
   else
-    xd->inv_xform4x4_x8(b->dqcoeff, b->diff, pitch);
+    xd->inv_txm4x4(b->dqcoeff, b->diff, pitch);
 }
 
 void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd) {
@@ -44,7 +44,7 @@ void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd) {
 
   if (has_2nd_order) {
     /* do 2nd order transform on the dc block */
-    vp9_short_inv_walsh4x4(blockd[24].dqcoeff, blockd[24].diff);
+    xd->inv_2ndtxm4x4(blockd[24].dqcoeff, blockd[24].diff);
     recon_dcblock(xd);
   }
 
diff --git a/vp9/common/vp9_onyx.h b/vp9/common/vp9_onyx.h
index 0b7d98a58..d93b7d5fb 100644
--- a/vp9/common/vp9_onyx.h
+++ b/vp9/common/vp9_onyx.h
@@ -177,6 +177,7 @@ extern "C"
     int arnr_type;
 
     int tile_columns;
+    int tile_rows;
 
     struct vpx_fixed_buf         two_pass_stats_in;
     struct vpx_codec_pkt_list  *output_pkt_list;
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 5e57228b4..6295514ea 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -279,8 +279,10 @@ typedef struct VP9Common {
   int error_resilient_mode;
   int frame_parallel_decoding_mode;
 
-  int tile_columns;
-  int cur_tile_mb_col_start, cur_tile_mb_col_end, cur_tile_idx;
+  int tile_columns, log2_tile_columns;
+  int cur_tile_mb_col_start, cur_tile_mb_col_end, cur_tile_col_idx;
+  int tile_rows, log2_tile_rows;
+  int cur_tile_mb_row_start, cur_tile_mb_row_end, cur_tile_row_idx;
 } VP9_COMMON;
 
 static int get_free_fb(VP9_COMMON *cm) {
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index d4435d872..b75525e2c 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -154,7 +154,7 @@ void vp9_build_inter_predictors_b(BLOCKD *d, int pitch,
   int_mv mv;
 
   ptr_base = *(d->base_pre);
-  mv.as_int = d->bmi.as_mv.first.as_int;
+  mv.as_int = d->bmi.as_mv[0].as_int;
   ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
         (mv.as_mv.col >> 3);
 
@@ -179,7 +179,7 @@ void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,
   int_mv mv;
 
   ptr_base = *(d->base_second_pre);
-  mv.as_int = d->bmi.as_mv.second.as_int;
+  mv.as_int = d->bmi.as_mv[1].as_int;
   ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
         (mv.as_mv.col >> 3);
 
@@ -197,7 +197,7 @@ void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
   int_mv mv;
 
   ptr_base = *(d->base_pre);
-  mv.as_int = d->bmi.as_mv.first.as_int;
+  mv.as_int = d->bmi.as_mv[0].as_int;
   ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
         (mv.as_mv.col >> 3);
 
@@ -222,7 +222,7 @@ void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd,
   int_mv mv;
 
   ptr_base = *(d->base_second_pre);
-  mv.as_int = d->bmi.as_mv.second.as_int;
+  mv.as_int = d->bmi.as_mv[1].as_int;
   ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
         (mv.as_mv.col >> 3);
 
@@ -240,7 +240,7 @@ static void build_inter_predictors2b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
   int_mv mv;
 
   ptr_base = *(d->base_pre);
-  mv.as_int = d->bmi.as_mv.first.as_int;
+  mv.as_int = d->bmi.as_mv[0].as_int;
   ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
         (mv.as_mv.col >> 3);
 
@@ -264,38 +264,38 @@ void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd) {
       int voffset = 20 + i * 2 + j;
       int temp;
 
-      temp = blockd[yoffset  ].bmi.as_mv.first.as_mv.row
-             + blockd[yoffset + 1].bmi.as_mv.first.as_mv.row
-             + blockd[yoffset + 4].bmi.as_mv.first.as_mv.row
-             + blockd[yoffset + 5].bmi.as_mv.first.as_mv.row;
+      temp = blockd[yoffset  ].bmi.as_mv[0].as_mv.row
+             + blockd[yoffset + 1].bmi.as_mv[0].as_mv.row
+             + blockd[yoffset + 4].bmi.as_mv[0].as_mv.row
+             + blockd[yoffset + 5].bmi.as_mv[0].as_mv.row;
 
       if (temp < 0) temp -= 4;
       else temp += 4;
 
-      xd->block[uoffset].bmi.as_mv.first.as_mv.row = (temp / 8) &
+      xd->block[uoffset].bmi.as_mv[0].as_mv.row = (temp / 8) &
         xd->fullpixel_mask;
 
-      temp = blockd[yoffset  ].bmi.as_mv.first.as_mv.col
-             + blockd[yoffset + 1].bmi.as_mv.first.as_mv.col
-             + blockd[yoffset + 4].bmi.as_mv.first.as_mv.col
-             + blockd[yoffset + 5].bmi.as_mv.first.as_mv.col;
+      temp = blockd[yoffset  ].bmi.as_mv[0].as_mv.col
+             + blockd[yoffset + 1].bmi.as_mv[0].as_mv.col
+             + blockd[yoffset + 4].bmi.as_mv[0].as_mv.col
+             + blockd[yoffset + 5].bmi.as_mv[0].as_mv.col;
 
       if (temp < 0) temp -= 4;
       else temp += 4;
 
-      blockd[uoffset].bmi.as_mv.first.as_mv.col = (temp / 8) &
+      blockd[uoffset].bmi.as_mv[0].as_mv.col = (temp / 8) &
         xd->fullpixel_mask;
 
-      blockd[voffset].bmi.as_mv.first.as_mv.row =
-        blockd[uoffset].bmi.as_mv.first.as_mv.row;
-      blockd[voffset].bmi.as_mv.first.as_mv.col =
-        blockd[uoffset].bmi.as_mv.first.as_mv.col;
+      blockd[voffset].bmi.as_mv[0].as_mv.row =
+        blockd[uoffset].bmi.as_mv[0].as_mv.row;
+      blockd[voffset].bmi.as_mv[0].as_mv.col =
+        blockd[uoffset].bmi.as_mv[0].as_mv.col;
 
       if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-        temp = blockd[yoffset  ].bmi.as_mv.second.as_mv.row
-               + blockd[yoffset + 1].bmi.as_mv.second.as_mv.row
-               + blockd[yoffset + 4].bmi.as_mv.second.as_mv.row
-               + blockd[yoffset + 5].bmi.as_mv.second.as_mv.row;
+        temp = blockd[yoffset  ].bmi.as_mv[1].as_mv.row
+               + blockd[yoffset + 1].bmi.as_mv[1].as_mv.row
+               + blockd[yoffset + 4].bmi.as_mv[1].as_mv.row
+               + blockd[yoffset + 5].bmi.as_mv[1].as_mv.row;
 
         if (temp < 0) {
           temp -= 4;
@@ -303,13 +303,13 @@ void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd) {
           temp += 4;
         }
 
-        blockd[uoffset].bmi.as_mv.second.as_mv.row = (temp / 8) &
+        blockd[uoffset].bmi.as_mv[1].as_mv.row = (temp / 8) &
           xd->fullpixel_mask;
 
-        temp = blockd[yoffset  ].bmi.as_mv.second.as_mv.col
-               + blockd[yoffset + 1].bmi.as_mv.second.as_mv.col
-               + blockd[yoffset + 4].bmi.as_mv.second.as_mv.col
-               + blockd[yoffset + 5].bmi.as_mv.second.as_mv.col;
+        temp = blockd[yoffset  ].bmi.as_mv[1].as_mv.col
+               + blockd[yoffset + 1].bmi.as_mv[1].as_mv.col
+               + blockd[yoffset + 4].bmi.as_mv[1].as_mv.col
+               + blockd[yoffset + 5].bmi.as_mv[1].as_mv.col;
 
         if (temp < 0) {
           temp -= 4;
@@ -317,13 +317,13 @@ void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd) {
           temp += 4;
         }
 
-        blockd[uoffset].bmi.as_mv.second.as_mv.col = (temp / 8) &
+        blockd[uoffset].bmi.as_mv[1].as_mv.col = (temp / 8) &
           xd->fullpixel_mask;
 
-        blockd[voffset].bmi.as_mv.second.as_mv.row =
-          blockd[uoffset].bmi.as_mv.second.as_mv.row;
-        blockd[voffset].bmi.as_mv.second.as_mv.col =
-          blockd[uoffset].bmi.as_mv.second.as_mv.col;
+        blockd[voffset].bmi.as_mv[1].as_mv.row =
+          blockd[uoffset].bmi.as_mv[1].as_mv.row;
+        blockd[voffset].bmi.as_mv[1].as_mv.col =
+          blockd[uoffset].bmi.as_mv[1].as_mv.col;
       }
     }
   }
@@ -332,7 +332,7 @@ void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd) {
     BLOCKD *d0 = &blockd[i];
     BLOCKD *d1 = &blockd[i + 1];
 
-    if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
+    if (d0->bmi.as_mv[0].as_int == d1->bmi.as_mv[0].as_int)
       build_inter_predictors2b(xd, d0, 8);
     else {
       vp9_build_inter_predictors_b(d0, 8, &xd->subpix);
@@ -717,15 +717,15 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) {
     blockd[10].bmi = xd->mode_info_context->bmi[10];
 
     if (mbmi->need_to_clamp_mvs) {
-      clamp_mv_to_umv_border(&blockd[ 0].bmi.as_mv.first.as_mv, xd);
-      clamp_mv_to_umv_border(&blockd[ 2].bmi.as_mv.first.as_mv, xd);
-      clamp_mv_to_umv_border(&blockd[ 8].bmi.as_mv.first.as_mv, xd);
-      clamp_mv_to_umv_border(&blockd[10].bmi.as_mv.first.as_mv, xd);
+      clamp_mv_to_umv_border(&blockd[ 0].bmi.as_mv[0].as_mv, xd);
+      clamp_mv_to_umv_border(&blockd[ 2].bmi.as_mv[0].as_mv, xd);
+      clamp_mv_to_umv_border(&blockd[ 8].bmi.as_mv[0].as_mv, xd);
+      clamp_mv_to_umv_border(&blockd[10].bmi.as_mv[0].as_mv, xd);
       if (mbmi->second_ref_frame > 0) {
-        clamp_mv_to_umv_border(&blockd[ 0].bmi.as_mv.second.as_mv, xd);
-        clamp_mv_to_umv_border(&blockd[ 2].bmi.as_mv.second.as_mv, xd);
-        clamp_mv_to_umv_border(&blockd[ 8].bmi.as_mv.second.as_mv, xd);
-        clamp_mv_to_umv_border(&blockd[10].bmi.as_mv.second.as_mv, xd);
+        clamp_mv_to_umv_border(&blockd[ 0].bmi.as_mv[1].as_mv, xd);
+        clamp_mv_to_umv_border(&blockd[ 2].bmi.as_mv[1].as_mv, xd);
+        clamp_mv_to_umv_border(&blockd[ 8].bmi.as_mv[1].as_mv, xd);
+        clamp_mv_to_umv_border(&blockd[10].bmi.as_mv[1].as_mv, xd);
       }
     }
 
@@ -750,15 +750,15 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) {
       blockd[i + 1].bmi = xd->mode_info_context->bmi[i + 1];
 
       if (mbmi->need_to_clamp_mvs) {
-        clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv.first.as_mv, xd);
-        clamp_mv_to_umv_border(&blockd[i + 1].bmi.as_mv.first.as_mv, xd);
+        clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv[0].as_mv, xd);
+        clamp_mv_to_umv_border(&blockd[i + 1].bmi.as_mv[0].as_mv, xd);
         if (mbmi->second_ref_frame > 0) {
-          clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv.second.as_mv, xd);
-          clamp_mv_to_umv_border(&blockd[i + 1].bmi.as_mv.second.as_mv, xd);
+          clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv[1].as_mv, xd);
+          clamp_mv_to_umv_border(&blockd[i + 1].bmi.as_mv[1].as_mv, xd);
         }
       }
 
-      if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
+      if (d0->bmi.as_mv[0].as_int == d1->bmi.as_mv[0].as_int)
         build_inter_predictors2b(xd, d0, 16);
       else {
         vp9_build_inter_predictors_b(d0, 16, &xd->subpix);
@@ -776,7 +776,7 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) {
     BLOCKD *d0 = &blockd[i];
     BLOCKD *d1 = &blockd[i + 1];
 
-    if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
+    if (d0->bmi.as_mv[0].as_int == d1->bmi.as_mv[0].as_int)
       build_inter_predictors2b(xd, d0, 8);
     else {
       vp9_build_inter_predictors_b(d0, 8, &xd->subpix);
@@ -803,44 +803,44 @@ void build_4x4uvmvs(MACROBLOCKD *xd) {
 
       int temp;
 
-      temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.first.as_mv.row
-             + xd->mode_info_context->bmi[yoffset + 1].as_mv.first.as_mv.row
-             + xd->mode_info_context->bmi[yoffset + 4].as_mv.first.as_mv.row
-             + xd->mode_info_context->bmi[yoffset + 5].as_mv.first.as_mv.row;
+      temp = xd->mode_info_context->bmi[yoffset + 0].as_mv[0].as_mv.row
+             + xd->mode_info_context->bmi[yoffset + 1].as_mv[0].as_mv.row
+             + xd->mode_info_context->bmi[yoffset + 4].as_mv[0].as_mv.row
+             + xd->mode_info_context->bmi[yoffset + 5].as_mv[0].as_mv.row;
 
       if (temp < 0) temp -= 4;
       else temp += 4;
 
-      blockd[uoffset].bmi.as_mv.first.as_mv.row = (temp / 8) &
+      blockd[uoffset].bmi.as_mv[0].as_mv.row = (temp / 8) &
                                                   xd->fullpixel_mask;
 
-      temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.first.as_mv.col
-             + xd->mode_info_context->bmi[yoffset + 1].as_mv.first.as_mv.col
-             + xd->mode_info_context->bmi[yoffset + 4].as_mv.first.as_mv.col
-             + xd->mode_info_context->bmi[yoffset + 5].as_mv.first.as_mv.col;
+      temp = xd->mode_info_context->bmi[yoffset + 0].as_mv[0].as_mv.col
+             + xd->mode_info_context->bmi[yoffset + 1].as_mv[0].as_mv.col
+             + xd->mode_info_context->bmi[yoffset + 4].as_mv[0].as_mv.col
+             + xd->mode_info_context->bmi[yoffset + 5].as_mv[0].as_mv.col;
 
       if (temp < 0) temp -= 4;
       else temp += 4;
 
-      blockd[uoffset].bmi.as_mv.first.as_mv.col = (temp / 8) &
+      blockd[uoffset].bmi.as_mv[0].as_mv.col = (temp / 8) &
         xd->fullpixel_mask;
 
       // if (x->mode_info_context->mbmi.need_to_clamp_mvs)
-      clamp_uvmv_to_umv_border(&blockd[uoffset].bmi.as_mv.first.as_mv, xd);
+      clamp_uvmv_to_umv_border(&blockd[uoffset].bmi.as_mv[0].as_mv, xd);
 
       // if (x->mode_info_context->mbmi.need_to_clamp_mvs)
-      clamp_uvmv_to_umv_border(&blockd[uoffset].bmi.as_mv.first.as_mv, xd);
+      clamp_uvmv_to_umv_border(&blockd[uoffset].bmi.as_mv[0].as_mv, xd);
 
-      blockd[voffset].bmi.as_mv.first.as_mv.row =
-        blockd[uoffset].bmi.as_mv.first.as_mv.row;
-      blockd[voffset].bmi.as_mv.first.as_mv.col =
-        blockd[uoffset].bmi.as_mv.first.as_mv.col;
+      blockd[voffset].bmi.as_mv[0].as_mv.row =
+        blockd[uoffset].bmi.as_mv[0].as_mv.row;
+      blockd[voffset].bmi.as_mv[0].as_mv.col =
+        blockd[uoffset].bmi.as_mv[0].as_mv.col;
 
       if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-        temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.second.as_mv.row
-               + xd->mode_info_context->bmi[yoffset + 1].as_mv.second.as_mv.row
-               + xd->mode_info_context->bmi[yoffset + 4].as_mv.second.as_mv.row
-               + xd->mode_info_context->bmi[yoffset + 5].as_mv.second.as_mv.row;
+        temp = xd->mode_info_context->bmi[yoffset + 0].as_mv[1].as_mv.row
+               + xd->mode_info_context->bmi[yoffset + 1].as_mv[1].as_mv.row
+               + xd->mode_info_context->bmi[yoffset + 4].as_mv[1].as_mv.row
+               + xd->mode_info_context->bmi[yoffset + 5].as_mv[1].as_mv.row;
 
         if (temp < 0) {
           temp -= 4;
@@ -848,13 +848,13 @@ void build_4x4uvmvs(MACROBLOCKD *xd) {
           temp += 4;
         }
 
-       blockd[uoffset].bmi.as_mv.second.as_mv.row = (temp / 8) &
+       blockd[uoffset].bmi.as_mv[1].as_mv.row = (temp / 8) &
                                                     xd->fullpixel_mask;
 
-        temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.second.as_mv.col
-               + xd->mode_info_context->bmi[yoffset + 1].as_mv.second.as_mv.col
-               + xd->mode_info_context->bmi[yoffset + 4].as_mv.second.as_mv.col
-               + xd->mode_info_context->bmi[yoffset + 5].as_mv.second.as_mv.col;
+        temp = xd->mode_info_context->bmi[yoffset + 0].as_mv[1].as_mv.col
+               + xd->mode_info_context->bmi[yoffset + 1].as_mv[1].as_mv.col
+               + xd->mode_info_context->bmi[yoffset + 4].as_mv[1].as_mv.col
+               + xd->mode_info_context->bmi[yoffset + 5].as_mv[1].as_mv.col;
 
         if (temp < 0) {
           temp -= 4;
@@ -862,21 +862,21 @@ void build_4x4uvmvs(MACROBLOCKD *xd) {
           temp += 4;
         }
 
-        blockd[uoffset].bmi.as_mv.second.as_mv.col = (temp / 8) &
+        blockd[uoffset].bmi.as_mv[1].as_mv.col = (temp / 8) &
                                                         xd->fullpixel_mask;
 
         // if (mbmi->need_to_clamp_mvs)
         clamp_uvmv_to_umv_border(
-          &blockd[uoffset].bmi.as_mv.second.as_mv, xd);
+          &blockd[uoffset].bmi.as_mv[1].as_mv, xd);
 
         // if (mbmi->need_to_clamp_mvs)
         clamp_uvmv_to_umv_border(
-          &blockd[uoffset].bmi.as_mv.second.as_mv, xd);
+          &blockd[uoffset].bmi.as_mv[1].as_mv, xd);
 
-        blockd[voffset].bmi.as_mv.second.as_mv.row =
-          blockd[uoffset].bmi.as_mv.second.as_mv.row;
-        blockd[voffset].bmi.as_mv.second.as_mv.col =
-          blockd[uoffset].bmi.as_mv.second.as_mv.col;
+        blockd[voffset].bmi.as_mv[1].as_mv.row =
+          blockd[uoffset].bmi.as_mv[1].as_mv.row;
+        blockd[voffset].bmi.as_mv[1].as_mv.col =
+          blockd[uoffset].bmi.as_mv[1].as_mv.col;
       }
     }
   }
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 066989272..3bd1f250f 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -329,10 +329,15 @@ specialize vp9_dc_only_idct_add
 
 if [ "$CONFIG_LOSSLESS" = "yes" ]; then
 prototype void vp9_short_inv_walsh4x4_1_x8 "int16_t *input, int16_t *output, int pitch"
+specialize vp9_short_inv_walsh4x4_1_x8
 prototype void vp9_short_inv_walsh4x4_x8 "int16_t *input, int16_t *output, int pitch"
+specialize vp9_short_inv_walsh4x4_x8
 prototype void vp9_dc_only_inv_walsh_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
+specialize vp9_dc_only_inv_walsh_add
 prototype void vp9_short_inv_walsh4x4_1_lossless "int16_t *in, int16_t *out"
+specialize vp9_short_inv_walsh4x4_1_lossless
 prototype void vp9_short_inv_walsh4x4_lossless "int16_t *in, int16_t *out"
+specialize vp9_short_inv_walsh4x4_lossless
 fi
 
 prototype unsigned int vp9_sad32x3 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad"
diff --git a/vp9/common/vp9_tile_common.c b/vp9/common/vp9_tile_common.c
new file mode 100644
index 000000000..29f89b618
--- /dev/null
+++ b/vp9/common/vp9_tile_common.c
@@ -0,0 +1,55 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/vp9_tile_common.h"
+
+static void vp9_get_tile_offsets(VP9_COMMON *cm, int *min_tile_off,
+                                 int *max_tile_off, int tile_idx,
+                                 int log2_n_tiles, int n_mbs) {
+  const int n_sbs = (n_mbs + 3) >> 2;  // ceil(n_mbs / 4); cm is not used
+  const int sb_off1 =  (tile_idx      * n_sbs) >> log2_n_tiles;  // tile start
+  const int sb_off2 = ((tile_idx + 1) * n_sbs) >> log2_n_tiles;  // tile end
+  // Outputs: both offsets multiplied back by 4, each capped at n_mbs.
+  *min_tile_off = (sb_off1 << 2) > n_mbs ? n_mbs : (sb_off1 << 2);
+  *max_tile_off = (sb_off2 << 2) > n_mbs ? n_mbs : (sb_off2 << 2);
+}
+
+void vp9_get_tile_col_offsets(VP9_COMMON *cm, int tile_col_idx) {
+  cm->cur_tile_col_idx = tile_col_idx;  // remember the selected tile column
+  vp9_get_tile_offsets(cm, &cm->cur_tile_mb_col_start,
+                       &cm->cur_tile_mb_col_end, tile_col_idx,
+                       cm->log2_tile_columns, cm->mb_cols);  // fills start/end
+}
+
+void vp9_get_tile_row_offsets(VP9_COMMON *cm, int tile_row_idx) {
+  cm->cur_tile_row_idx = tile_row_idx;  // remember the selected tile row
+  vp9_get_tile_offsets(cm, &cm->cur_tile_mb_row_start,
+                       &cm->cur_tile_mb_row_end, tile_row_idx,
+                       cm->log2_tile_rows, cm->mb_rows);  // fills start/end
+}
+
+#define MIN_TILE_WIDTH_SBS (MIN_TILE_WIDTH >> 6)  // MIN_TILE_WIDTH / 64
+#define MAX_TILE_WIDTH_SBS (MAX_TILE_WIDTH >> 6)  // MAX_TILE_WIDTH / 64
+
+void vp9_get_tile_n_bits(VP9_COMMON *cm, int *min_log2_n_tiles_ptr,
+                         int *delta_log2_n_tiles) {
+  const int sb_cols = (cm->mb_cols + 3) >> 2;  // ceil(cm->mb_cols / 4)
+  int min_log2_n_tiles, max_log2_n_tiles;
+  // max: first k at which (sb_cols >> k) falls below MIN_TILE_WIDTH_SBS.
+  for (max_log2_n_tiles = 0;
+       (sb_cols >> max_log2_n_tiles) >= MIN_TILE_WIDTH_SBS;
+       max_log2_n_tiles++) {}  // NOTE(review): one past last passing k?
+  for (min_log2_n_tiles = 0;  // min: first k with (MAX << k) >= sb_cols
+       (MAX_TILE_WIDTH_SBS << min_log2_n_tiles) < sb_cols;
+       min_log2_n_tiles++) {}
+  // Outputs: the minimum, and how far the maximum lies above it.
+  *min_log2_n_tiles_ptr = min_log2_n_tiles;
+  *delta_log2_n_tiles = max_log2_n_tiles - min_log2_n_tiles;
+}
diff --git a/vp9/common/vp9_tile_common.h b/vp9/common/vp9_tile_common.h
new file mode 100644
index 000000000..92bf50897
--- /dev/null
+++ b/vp9/common/vp9_tile_common.h
@@ -0,0 +1,26 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_TILE_COMMON_H_
+#define VP9_COMMON_VP9_TILE_COMMON_H_
+
+#include "vp9/common/vp9_onyxc_int.h"
+
+#define MIN_TILE_WIDTH 256   // presumably in pixels -- TODO confirm
+#define MAX_TILE_WIDTH 4096  // presumably in pixels -- TODO confirm
+// Stores tile_col_idx and that tile's MB column start/end offsets in cm.
+extern void vp9_get_tile_col_offsets(VP9_COMMON *cm, int tile_col_idx);
+// Stores tile_row_idx and that tile's MB row start/end offsets in cm.
+extern void vp9_get_tile_row_offsets(VP9_COMMON *cm, int tile_row_idx);
+// From cm->mb_cols: writes the minimum log2 tile count and (max - min).
+extern void vp9_get_tile_n_bits(VP9_COMMON *cm, int *min_log2_n_tiles,
+                                int *delta_log2_n_tiles);
+
+#endif  // VP9_COMMON_VP9_TILE_COMMON_H_
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index 3e2346f29..fbc95b6ce 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -65,6 +65,20 @@ void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr,
                                    unsigned int output_height,
                                    const short *filter);
 
+void vp9_filter_block1d4_v8_ssse3(const unsigned char *src_ptr,
+                                   const unsigned int src_pitch,
+                                   unsigned char *output_ptr,
+                                   unsigned int out_pitch,
+                                   unsigned int output_height,
+                                   const short *filter);  // defined in asm
+
+void vp9_filter_block1d4_h8_ssse3(const unsigned char *src_ptr,
+                                   const unsigned int src_pitch,
+                                   unsigned char *output_ptr,
+                                   unsigned int out_pitch,
+                                   unsigned int output_height,
+                                   const short *filter);  // defined in asm
+
 void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride,
                                const int16_t *filter_x, int x_step_q4,
@@ -87,6 +101,14 @@ void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride,
       dst += 8;
       w -= 8;
     }
+    while (w >= 4) {
+      vp9_filter_block1d4_h8_ssse3(src, src_stride,
+                                   dst, dst_stride,
+                                   h, filter_x);
+      src += 4;
+      dst += 4;
+      w -= 4;
+    }
   }
   if (w) {
     vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
@@ -117,6 +139,14 @@ void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride,
       dst += 8;
       w -= 8;
     }
+    while (w >= 4) {
+      vp9_filter_block1d4_v8_ssse3(src - src_stride * 3, src_stride,
+                                   dst, dst_stride,
+                                   h, filter_y);
+      src += 4;
+      dst += 4;
+      w -= 4;
+    }
   }
   if (w) {
     vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
@@ -156,6 +186,15 @@ void vp9_convolve8_ssse3(const uint8_t *src, int src_stride,
                                    h, filter_y);
       return;
     }
+    if (w == 4) {
+      vp9_filter_block1d4_h8_ssse3(src - 3 * src_stride, src_stride,
+                                   fdata2, 16,
+                                   h + 7, filter_x);
+      vp9_filter_block1d4_v8_ssse3(fdata2, 16,
+                                   dst, dst_stride,
+                                   h, filter_y);
+      return;
+    }
   }
   vp9_convolve8_c(src, src_stride, dst, dst_stride,
                   filter_x, x_step_q4, filter_y, y_step_q4,
diff --git a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
index c6d65e904..5f039454a 100644
--- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
+++ b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
@@ -30,6 +30,124 @@
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
+global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE
+sym(vp9_filter_block1d4_v8_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5                   ;five 16-byte slots, named below
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040              ;64 in each 16-bit half
+
+    movdqa      xmm4, [rdx]                 ;load filters
+    movd        xmm5, rcx
+    packsswb    xmm4, xmm4                  ;16-bit taps -> signed bytes
+    pshuflw     xmm0, xmm4, 0b              ;k0_k1
+    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
+    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
+    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
+
+    punpcklqdq  xmm0, xmm0
+    punpcklqdq  xmm1, xmm1
+    punpcklqdq  xmm2, xmm2
+    punpcklqdq  xmm3, xmm3
+
+    movdqa      k0k1, xmm0
+    movdqa      k2k3, xmm1
+    pshufd      xmm5, xmm5, 0               ;broadcast dword 0 to all four
+    movdqa      k4k5, xmm2
+    movdqa      k6k7, xmm3
+    movdqa      krd, xmm5                   ;constant added before the >> 7
+
+    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line
+
+%if ABI_IS_32BIT=0
+    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
+%endif
+    mov         rax, rsi
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+    add         rax, rdx                    ;rax = src_ptr + pitch
+
+    lea         rbx, [rdx + rdx*4]          ;pitch * 5
+    add         rbx, rdx                    ;pitch * 6
+    ; Each pass: read 4 bytes from 8 consecutive rows (A..H), write 4 bytes.
+.vp9_filter_block1d4_v8_ssse3_loop:
+    movd        xmm0, [rsi]                 ;A
+    movd        xmm1, [rsi + rdx]           ;B
+    movd        xmm2, [rsi + rdx * 2]       ;C
+    movd        xmm3, [rax + rdx * 2]       ;D
+    movd        xmm4, [rsi + rdx * 4]       ;E
+    movd        xmm5, [rax + rdx * 4]       ;F
+
+    punpcklbw   xmm0, xmm1                  ;A B
+    punpcklbw   xmm2, xmm3                  ;C D
+    punpcklbw   xmm4, xmm5                  ;E F
+
+    movd        xmm6, [rsi + rbx]           ;G
+    movd        xmm7, [rax + rbx]           ;H
+
+    pmaddubsw   xmm0, k0k1
+    pmaddubsw   xmm2, k2k3
+    punpcklbw   xmm6, xmm7                  ;G H
+    pmaddubsw   xmm4, k4k5
+    pmaddubsw   xmm6, k6k7
+
+    paddsw      xmm0, xmm2
+    paddsw      xmm0, krd
+    paddsw      xmm4, xmm6
+    paddsw      xmm0, xmm4                  ;sum of all four tap pairs + krd
+
+    psraw       xmm0, 7                     ;arithmetic shift right by 7
+    packuswb    xmm0, xmm0                  ;saturate words to unsigned bytes
+
+    add         rsi,  rdx                   ;advance one source row
+    add         rax,  rdx
+
+    movd        [rdi], xmm0                 ;store 4 output bytes
+
+%if ABI_IS_32BIT
+    add         rdi, DWORD PTR arg(3)       ;out_pitch
+%else
+    add         rdi, r8
+%endif
+    dec         rcx
+    jnz         .vp9_filter_block1d4_v8_ssse3_loop
+
+    add rsp, 16*5                           ;release the five slots
+    pop rsp                                 ;presumably undoes ALIGN_STACK -- confirm
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_filter_block1d8_v8_ssse3
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    short *filter
+;)
 global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
 sym(vp9_filter_block1d8_v8_ssse3):
     push        rbp
@@ -289,6 +407,110 @@ sym(vp9_filter_block1d16_v8_ssse3):
     pop         rbp
     ret
 
+;void vp9_filter_block1d4_h8_ssse3          ; 8-tap horizontal filter, stores 4 bytes per row
+;(
+;    unsigned char  *src_ptr,                ; arg(0): loop reads bytes [src_ptr - 3, src_ptr + 12] of each row
+;    unsigned int    src_pixels_per_line,    ; arg(1): added to the source pointer after each row
+;    unsigned char  *output_ptr,             ; arg(2): 4 bytes are written here per row
+;    unsigned int    output_pitch,           ; arg(3): added to the output pointer after each row
+;    unsigned int    output_height,          ; arg(4): row count; loop is dec/jnz, so it must be non-zero
+;    short *filter                           ; arg(5): 8 taps, loaded with movdqa (16-byte aligned)
+;)
+global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE
+sym(vp9_filter_block1d4_h8_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5                   ;five 16-byte scratch slots, named below
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040              ;two 16-bit words of 64: rounding term for the >> 7 below
+
+    movdqa      xmm4, [rdx]                 ;load filters
+    movd        xmm5, rcx                   ;rounding constant into xmm5
+    packsswb    xmm4, xmm4                  ;taps: 16-bit words -> signed bytes (saturating)
+    pshuflw     xmm0, xmm4, 0b              ;k0_k1
+    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
+    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
+    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
+
+    punpcklqdq  xmm0, xmm0                  ;copy low qword to high: tap pair fills all 16 bytes
+    punpcklqdq  xmm1, xmm1
+    punpcklqdq  xmm2, xmm2
+    punpcklqdq  xmm3, xmm3
+
+    movdqa      k0k1, xmm0
+    movdqa      k2k3, xmm1
+    pshufd      xmm5, xmm5, 0               ;broadcast rounding dword -> eight words of 64
+    movdqa      k4k5, xmm2
+    movdqa      k6k7, xmm3
+    movdqa      krd, xmm5
+
+    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
+    movsxd      rdx, dword ptr arg(3)       ;output_pitch
+    movsxd      rcx, dword ptr arg(4)       ;output_height
+
+.filter_block1d4_h8_rowloop_ssse3:
+    movq        xmm0,   [rsi - 3]    ; -3 -2 -1  0  1  2  3  4
+
+    movq        xmm3,   [rsi + 5]    ; 5  6  7  8  9 10 11 12
+    punpcklqdq  xmm0,   xmm3         ; xmm0 = source bytes -3..12
+
+    movdqa      xmm1,   xmm0
+    pshufb      xmm0,   [GLOBAL(shuf_t0t1)]  ; arrange source bytes for k0k1 (table defined outside this hunk)
+    pmaddubsw   xmm0,   k0k1         ; unsigned src bytes * signed taps, adjacent pairs summed to words
+
+    movdqa      xmm2,   xmm1
+    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
+    pmaddubsw   xmm1,   k2k3
+
+    movdqa      xmm4,   xmm2
+    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
+    pmaddubsw   xmm2,   k4k5
+
+    pshufb      xmm4,   [GLOBAL(shuf_t6t7)]
+    pmaddubsw   xmm4,   k6k7
+
+    paddsw      xmm0,   xmm1         ; saturating adds, so summation order can affect the result
+    paddsw      xmm0,   xmm4
+    paddsw      xmm0,   xmm2
+    paddsw      xmm0,   krd          ; + 64 rounding term
+    psraw       xmm0,   7            ; arithmetic shift right by 7
+    packuswb    xmm0,   xmm0         ; saturate words to unsigned bytes
+
+    lea         rsi,    [rsi + rax]  ; advance to next source row
+    movd        [rdi],  xmm0         ; store low 4 bytes
+
+    lea         rdi,    [rdi + rdx]  ; advance to next output row
+    dec         rcx
+    jnz         .filter_block1d4_h8_rowloop_ssse3
+
+    add rsp, 16*5                    ; release the five scratch slots
+    pop rsp                          ; NOTE(review): presumably the rsp saved by ALIGN_STACK - confirm in the macro
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
 ;void vp9_filter_block1d8_h8_ssse3
 ;(
 ;    unsigned char  *src_ptr,
@@ -340,7 +562,7 @@ sym(vp9_filter_block1d8_h8_ssse3):
     pshufd      xmm5, xmm5, 0
     movdqa      k4k5, xmm2
     movdqa      k6k7, xmm3
-;    movdqa      krd, xmm5
+    movdqa      krd, xmm5
 
     movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
     movsxd      rdx, dword ptr arg(3)       ;output_pitch
@@ -349,10 +571,7 @@ sym(vp9_filter_block1d8_h8_ssse3):
 .filter_block1d8_h8_rowloop_ssse3:
     movq        xmm0,   [rsi - 3]    ; -3 -2 -1  0  1  2  3  4
 
-;    movq        xmm3,   [rsi + 4]    ; 4  5  6  7  8  9 10 11
     movq        xmm3,   [rsi + 5]    ; 5  6  7  8  9 10 11 12
-;note: if we create a k0_k7 filter, we can save a pshufb
-;    punpcklbw   xmm0,   xmm3         ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11
     punpcklqdq  xmm0,   xmm3
 
     movdqa      xmm1,   xmm0
@@ -371,9 +590,9 @@ sym(vp9_filter_block1d8_h8_ssse3):
     pmaddubsw   xmm4,   k6k7
 
     paddsw      xmm0,   xmm1
-    paddsw      xmm0,   xmm2
-    paddsw      xmm0,   xmm5
     paddsw      xmm0,   xmm4
+    paddsw      xmm0,   xmm2
+    paddsw      xmm0,   krd
     psraw       xmm0,   7
     packuswb    xmm0,   xmm0
 
@@ -456,10 +675,7 @@ sym(vp9_filter_block1d16_h8_ssse3):
 .filter_block1d16_h8_rowloop_ssse3:
     movq        xmm0,   [rsi - 3]    ; -3 -2 -1  0  1  2  3  4
 
-;    movq        xmm3,   [rsi + 4]    ; 4  5  6  7  8  9 10 11
     movq        xmm3,   [rsi + 5]    ; 5  6  7  8  9 10 11 12
-;note: if we create a k0_k7 filter, we can save a pshufb
-;    punpcklbw   xmm0,   xmm3         ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11
     punpcklqdq  xmm0,   xmm3
 
     movdqa      xmm1,   xmm0
@@ -486,10 +702,7 @@ sym(vp9_filter_block1d16_h8_ssse3):
 
 
     movq        xmm3,   [rsi +  5]
-;    movq        xmm7,   [rsi + 12]
     movq        xmm7,   [rsi + 13]
-;note: same as above
-;    punpcklbw   xmm3,   xmm7
     punpcklqdq  xmm3,   xmm7
 
     movdqa      xmm1,   xmm3
@@ -508,9 +721,9 @@ sym(vp9_filter_block1d16_h8_ssse3):
     pmaddubsw   xmm4,   k6k7
 
     paddsw      xmm3,   xmm1
+    paddsw      xmm3,   xmm4
     paddsw      xmm3,   xmm2
     paddsw      xmm3,   krd
-    paddsw      xmm3,   xmm4
     psraw       xmm3,   7
     packuswb    xmm3,   xmm3
     punpcklqdq  xmm0,   xmm3
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 5d6a4a717..316bda33b 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -1041,9 +1041,9 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
             fill_offset = &mbsplit_fill_offset[s][(unsigned char)j * mbsplit_fill_count[s]];
 
             do {
-              mi->bmi[ *fill_offset].as_mv.first.as_int = blockmv.as_int;
+              mi->bmi[ *fill_offset].as_mv[0].as_int = blockmv.as_int;
               if (mbmi->second_ref_frame > 0)
-                mi->bmi[ *fill_offset].as_mv.second.as_int = secondmv.as_int;
+                mi->bmi[ *fill_offset].as_mv[1].as_int = secondmv.as_int;
               fill_offset++;
             } while (--fill_count);
           }
@@ -1051,8 +1051,8 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
         } while (++j < num_p);
       }
 
-      mv->as_int = mi->bmi[15].as_mv.first.as_int;
-      mbmi->mv[1].as_int = mi->bmi[15].as_mv.second.as_int;
+      mv->as_int = mi->bmi[15].as_mv[0].as_int;
+      mbmi->mv[1].as_int = mi->bmi[15].as_mv[1].as_int;
 
       break;  /* done with SPLITMV */
 
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 9f4db6bf7..facd761f0 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -31,6 +31,7 @@
 #include "vp9/decoder/vp9_dboolhuff.h"
 
 #include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_tile_common.h"
 #include "vp9_rtcd.h"
 
 #include <assert.h>
@@ -123,38 +124,30 @@ static void mb_init_dequantizer(VP9D_COMP *pbi, MACROBLOCKD *xd) {
     xd->block[i].dequant = pc->Y1dequant[QIndex];
   }
 
+  xd->inv_txm4x4_1        = vp9_short_idct4x4llm_1;
+  xd->inv_txm4x4          = vp9_short_idct4x4llm;
+  xd->inv_2ndtxm4x4_1     = vp9_short_inv_walsh4x4_1;
+  xd->inv_2ndtxm4x4       = vp9_short_inv_walsh4x4;
+  xd->itxm_add            = vp9_dequant_idct_add;
+  xd->dc_only_itxm_add    = vp9_dc_only_idct_add_c;
+  xd->dc_itxm_add         = vp9_dequant_dc_idct_add;
+  xd->dc_itxm_add_y_block = vp9_dequant_dc_idct_add_y_block;
+  xd->itxm_add_y_block    = vp9_dequant_idct_add_y_block;
+  xd->itxm_add_uv_block   = vp9_dequant_idct_add_uv_block;
 #if CONFIG_LOSSLESS
-  if (!QIndex) {
-    pbi->mb.inv_xform4x4_1_x8     = vp9_short_inv_walsh4x4_1_x8;
-    pbi->mb.inv_xform4x4_x8       = vp9_short_inv_walsh4x4_x8;
-    pbi->mb.inv_walsh4x4_1        = vp9_short_inv_walsh4x4_1_lossless;
-    pbi->mb.inv_walsh4x4_lossless = vp9_short_inv_walsh4x4_lossless;
-    pbi->idct_add            = vp9_dequant_idct_add_lossless_c;
-    pbi->dc_idct_add         = vp9_dequant_dc_idct_add_lossless_c;
-    pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block_lossless_c;
-    pbi->idct_add_y_block    = vp9_dequant_idct_add_y_block_lossless_c;
-    pbi->idct_add_uv_block   = vp9_dequant_idct_add_uv_block_lossless_c;
-  } else {
-    pbi->mb.inv_xform4x4_1_x8     = vp9_short_idct4x4llm_1;
-    pbi->mb.inv_xform4x4_x8       = vp9_short_idct4x4llm;
-    pbi->mb.inv_walsh4x4_1        = vp9_short_inv_walsh4x4_1;
-    pbi->mb.inv_walsh4x4_lossless = vp9_short_inv_walsh4x4;
-    pbi->idct_add            = vp9_dequant_idct_add;
-    pbi->dc_idct_add         = vp9_dequant_dc_idct_add;
-    pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block;
-    pbi->idct_add_y_block    = vp9_dequant_idct_add_y_block;
-    pbi->idct_add_uv_block   = vp9_dequant_idct_add_uv_block;
+  if (xd->lossless) {
+    assert(QIndex == 0);
+    xd->inv_txm4x4_1        = vp9_short_inv_walsh4x4_1_x8;
+    xd->inv_txm4x4          = vp9_short_inv_walsh4x4_x8;
+    xd->inv_2ndtxm4x4_1     = vp9_short_inv_walsh4x4_1_lossless;
+    xd->inv_2ndtxm4x4       = vp9_short_inv_walsh4x4_lossless;
+    xd->itxm_add            = vp9_dequant_idct_add_lossless_c;
+    xd->dc_only_itxm_add    = vp9_dc_only_inv_walsh_add_c;
+    xd->dc_itxm_add         = vp9_dequant_dc_idct_add_lossless_c;
+    xd->dc_itxm_add_y_block = vp9_dequant_dc_idct_add_y_block_lossless_c;
+    xd->itxm_add_y_block    = vp9_dequant_idct_add_y_block_lossless_c;
+    xd->itxm_add_uv_block   = vp9_dequant_idct_add_uv_block_lossless_c;
   }
-#else
-  pbi->mb.inv_xform4x4_1_x8     = vp9_short_idct4x4llm_1;
-  pbi->mb.inv_xform4x4_x8       = vp9_short_idct4x4llm;
-  pbi->mb.inv_walsh4x4_1        = vp9_short_inv_walsh4x4_1;
-  pbi->mb.inv_walsh4x4_lossless = vp9_short_inv_walsh4x4;
-  pbi->idct_add            = vp9_dequant_idct_add;
-  pbi->dc_idct_add         = vp9_dequant_dc_idct_add;
-  pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block;
-  pbi->idct_add_y_block    = vp9_dequant_idct_add_y_block;
-  pbi->idct_add_uv_block   = vp9_dequant_idct_add_uv_block;
 #endif
 
   for (i = 16; i < 24; i++) {
@@ -345,15 +338,15 @@ static void decode_8x8(VP9D_COMP *pbi, MACROBLOCKD *xd,
       int i8x8mode = b->bmi.as_mode.first;
       b = &xd->block[16 + i];
       vp9_intra_uv4x4_predict(xd, &xd->block[16 + i], i8x8mode, b->predictor);
-      pbi->idct_add(b->qcoeff, b->dequant, b->predictor,
+      xd->itxm_add(b->qcoeff, b->dequant, b->predictor,
                     *(b->base_dst) + b->dst, 8, b->dst_stride);
       b = &xd->block[20 + i];
       vp9_intra_uv4x4_predict(xd, &xd->block[20 + i], i8x8mode, b->predictor);
-      pbi->idct_add(b->qcoeff, b->dequant, b->predictor,
+      xd->itxm_add(b->qcoeff, b->dequant, b->predictor,
                     *(b->base_dst) + b->dst, 8, b->dst_stride);
     }
   } else if (xd->mode_info_context->mbmi.mode == SPLITMV) {
-    pbi->idct_add_uv_block(xd->qcoeff + 16 * 16, xd->block[16].dequant,
+    xd->itxm_add_uv_block(xd->qcoeff + 16 * 16, xd->block[16].dequant,
          xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,
          xd->dst.uv_stride, xd->eobs + 16);
   } else {
@@ -400,17 +393,17 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd,
                                     *(b->base_dst) + b->dst, 16,
                                     b->dst_stride, b->eob);
         } else {
-          vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor,
-                               *(b->base_dst) + b->dst, 16, b->dst_stride);
+          xd->itxm_add(b->qcoeff, b->dequant, b->predictor,
+                        *(b->base_dst) + b->dst, 16, b->dst_stride);
         }
       }
       b = &xd->block[16 + i];
       vp9_intra_uv4x4_predict(xd, b, i8x8mode, b->predictor);
-      pbi->idct_add(b->qcoeff, b->dequant, b->predictor,
+      xd->itxm_add(b->qcoeff, b->dequant, b->predictor,
                     *(b->base_dst) + b->dst, 8, b->dst_stride);
       b = &xd->block[20 + i];
       vp9_intra_uv4x4_predict(xd, b, i8x8mode, b->predictor);
-      pbi->idct_add(b->qcoeff, b->dequant, b->predictor,
+      xd->itxm_add(b->qcoeff, b->dequant, b->predictor,
                     *(b->base_dst) + b->dst, 8, b->dst_stride);
     }
   } else if (mode == B_PRED) {
@@ -434,8 +427,8 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd,
                                   *(b->base_dst) + b->dst, 16, b->dst_stride,
                                   b->eob);
       } else {
-        vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor,
-                             *(b->base_dst) + b->dst, 16, b->dst_stride);
+        xd->itxm_add(b->qcoeff, b->dequant, b->predictor,
+                      *(b->base_dst) + b->dst, 16, b->dst_stride);
       }
     }
     if (!xd->mode_info_context->mbmi.mb_skip_coeff) {
@@ -444,7 +437,7 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd,
     xd->above_context->y2 = 0;
     xd->left_context->y2 = 0;
     vp9_build_intra_predictors_mbuv(xd);
-    pbi->idct_add_uv_block(xd->qcoeff + 16 * 16,
+    xd->itxm_add_uv_block(xd->qcoeff + 16 * 16,
                            xd->block[16].dequant,
                            xd->predictor + 16 * 16,
                            xd->dst.u_buffer,
@@ -453,13 +446,13 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd,
                            xd->eobs + 16);
   } else if (mode == SPLITMV || get_2nd_order_usage(xd) == 0) {
     assert(get_2nd_order_usage(xd) == 0);
-    pbi->idct_add_y_block(xd->qcoeff,
+    xd->itxm_add_y_block(xd->qcoeff,
                           xd->block[0].dequant,
                           xd->predictor,
                           xd->dst.y_buffer,
                           xd->dst.y_stride,
                           xd->eobs);
-    pbi->idct_add_uv_block(xd->qcoeff + 16 * 16,
+    xd->itxm_add_uv_block(xd->qcoeff + 16 * 16,
                            xd->block[16].dequant,
                            xd->predictor + 16 * 16,
                            xd->dst.u_buffer,
@@ -496,8 +489,8 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd,
                                     *(b->base_dst) + b->dst, 16,
                                     b->dst_stride, b->eob);
         } else {
-          vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor,
-                               *(b->base_dst) + b->dst, 16, b->dst_stride);
+          xd->itxm_add(b->qcoeff, b->dequant, b->predictor,
+                        *(b->base_dst) + b->dst, 16, b->dst_stride);
         }
       }
     } else {
@@ -505,7 +498,7 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd,
       assert(get_2nd_order_usage(xd) == 1);
       vp9_dequantize_b(b);
       if (xd->eobs[24] > 1) {
-        vp9_short_inv_walsh4x4(&b->dqcoeff[0], b->diff);
+        xd->inv_2ndtxm4x4(&b->dqcoeff[0], b->diff);
         ((int *)b->qcoeff)[0] = 0;
         ((int *)b->qcoeff)[1] = 0;
         ((int *)b->qcoeff)[2] = 0;
@@ -515,11 +508,11 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd,
         ((int *)b->qcoeff)[6] = 0;
         ((int *)b->qcoeff)[7] = 0;
       } else {
-        xd->inv_walsh4x4_1(&b->dqcoeff[0], b->diff);
+        xd->inv_2ndtxm4x4_1(&b->dqcoeff[0], b->diff);
         ((int *)b->qcoeff)[0] = 0;
       }
       vp9_dequantize_b(b);
-      pbi->dc_idct_add_y_block(xd->qcoeff,
+      xd->dc_itxm_add_y_block(xd->qcoeff,
                                xd->block[0].dequant,
                                xd->predictor,
                                xd->dst.y_buffer,
@@ -527,7 +520,7 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd,
                                xd->eobs,
                                xd->block[24].diff);
     }
-    pbi->idct_add_uv_block(xd->qcoeff + 16 * 16,
+    xd->itxm_add_uv_block(xd->qcoeff + 16 * 16,
                            xd->block[16].dequant,
                            xd->predictor + 16 * 16,
                            xd->dst.u_buffer,
@@ -645,7 +638,7 @@ static void decode_4x4_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
             + x_idx * 16 + (i & 3) * 4,
             xd->dst.y_stride, xd->dst.y_stride, b->eob);
       } else {
-        vp9_dequant_idct_add_c(
+        xd->itxm_add(
             b->qcoeff, b->dequant,
             xd->dst.y_buffer + (y_idx * 16 + (i / 4) * 4) * xd->dst.y_stride
             + x_idx * 16 + (i & 3) * 4,
@@ -657,7 +650,7 @@ static void decode_4x4_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
   } else if (get_2nd_order_usage(xd) == 1) {
     vp9_dequantize_b(b);
     if (xd->eobs[24] > 1) {
-      vp9_short_inv_walsh4x4(&b->dqcoeff[0], b->diff);
+      xd->inv_2ndtxm4x4(&b->dqcoeff[0], b->diff);
       ((int *)b->qcoeff)[0] = 0;
       ((int *)b->qcoeff)[1] = 0;
       ((int *)b->qcoeff)[2] = 0;
@@ -667,7 +660,7 @@ static void decode_4x4_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
       ((int *)b->qcoeff)[6] = 0;
       ((int *)b->qcoeff)[7] = 0;
     } else {
-      xd->inv_walsh4x4_1(&b->dqcoeff[0], b->diff);
+      xd->inv_2ndtxm4x4_1(&b->dqcoeff[0], b->diff);
       ((int *)b->qcoeff)[0] = 0;
     }
     vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(
@@ -1534,17 +1527,24 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
 
   pc->sb64_coded = vp9_read_literal(&header_bc, 8);
   pc->sb32_coded = vp9_read_literal(&header_bc, 8);
-
-  /* Read the loop filter level and type */
-  pc->txfm_mode = vp9_read_literal(&header_bc, 2);
-  if (pc->txfm_mode == 3)
-    pc->txfm_mode += vp9_read_bit(&header_bc);
-  if (pc->txfm_mode == TX_MODE_SELECT) {
-    pc->prob_tx[0] = vp9_read_literal(&header_bc, 8);
-    pc->prob_tx[1] = vp9_read_literal(&header_bc, 8);
-    pc->prob_tx[2] = vp9_read_literal(&header_bc, 8);
+#if CONFIG_LOSSLESS
+  xd->lossless = vp9_read_bit(&header_bc);
+  if (xd->lossless) {
+    pc->txfm_mode = ONLY_4X4;
+  }
+  else
+#endif
+  {
+    /* Read the transform mode and, if selectable, its probabilities */
+    pc->txfm_mode = vp9_read_literal(&header_bc, 2);
+    if (pc->txfm_mode == 3)
+      pc->txfm_mode += vp9_read_bit(&header_bc);
+    if (pc->txfm_mode == TX_MODE_SELECT) {
+      pc->prob_tx[0] = vp9_read_literal(&header_bc, 8);
+      pc->prob_tx[1] = vp9_read_literal(&header_bc, 8);
+      pc->prob_tx[2] = vp9_read_literal(&header_bc, 8);
+    }
   }
-
   pc->filter_type = (LOOPFILTERTYPE) vp9_read_bit(&header_bc);
   pc->filter_level = vp9_read_literal(&header_bc, 6);
   pc->sharpness_level = vp9_read_literal(&header_bc, 3);
@@ -1775,78 +1775,91 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
 
   /* tile info */
   {
-    int log2_tile_cols;
     const unsigned char *data_ptr = data + first_partition_length_in_bytes;
-    int tile, mb_start, mb_end;
+    int tile_row, tile_col, delta_log2_tiles;
 
-    log2_tile_cols = vp9_read_bit(&header_bc);
-    if (log2_tile_cols) {
-      log2_tile_cols += vp9_read_bit(&header_bc);
+    vp9_get_tile_n_bits(pc, &pc->log2_tile_columns, &delta_log2_tiles);
+    while (delta_log2_tiles--) {
+      if (vp9_read_bit(&header_bc)) {
+        pc->log2_tile_columns++;
+      } else {
+        break;
+      }
     }
-    pc->tile_columns = 1 << log2_tile_cols;
+    pc->log2_tile_rows = vp9_read_bit(&header_bc);
+    if (pc->log2_tile_rows)
+      pc->log2_tile_rows += vp9_read_bit(&header_bc);
+    pc->tile_columns = 1 << pc->log2_tile_columns;
+    pc->tile_rows    = 1 << pc->log2_tile_rows;
 
     vpx_memset(pc->above_context, 0,
                sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols);
 
     if (pbi->oxcf.inv_tile_order) {
-      const unsigned char *data_ptr2[4];
+      const int n_cols = pc->tile_columns;
+      const unsigned char *data_ptr2[4][1 << 6];
       BOOL_DECODER UNINITIALIZED_IS_SAFE(bc_bak);
 
-      data_ptr2[0] = data_ptr;
-      for (tile = 1; tile < pc->tile_columns; tile++) {
-        int size = data_ptr2[tile - 1][0] + (data_ptr2[tile - 1][1] << 8) +
-                (data_ptr2[tile - 1][2] << 16) + (data_ptr2[tile - 1][3] << 24);
-        data_ptr2[tile - 1] += 4;
-        data_ptr2[tile] = data_ptr2[tile - 1] + size;
+      // pre-initialize the offsets, we're going to read in inverse order
+      data_ptr2[0][0] = data_ptr;
+      for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) {
+        if (tile_row) {
+          int size = data_ptr2[tile_row - 1][n_cols - 1][0] +
+                    (data_ptr2[tile_row - 1][n_cols - 1][1] << 8) +
+                    (data_ptr2[tile_row - 1][n_cols - 1][2] << 16) +
+                    (data_ptr2[tile_row - 1][n_cols - 1][3] << 24);
+          data_ptr2[tile_row - 1][n_cols - 1] += 4;
+          data_ptr2[tile_row][0] = data_ptr2[tile_row - 1][n_cols - 1] + size;
+        }
+
+        for (tile_col = 1; tile_col < n_cols; tile_col++) {
+          int size = data_ptr2[tile_row][tile_col - 1][0] +
+                    (data_ptr2[tile_row][tile_col - 1][1] << 8) +
+                    (data_ptr2[tile_row][tile_col - 1][2] << 16) +
+                    (data_ptr2[tile_row][tile_col - 1][3] << 24);
+          data_ptr2[tile_row][tile_col - 1] += 4;
+          data_ptr2[tile_row][tile_col] =
+              data_ptr2[tile_row][tile_col - 1] + size;
+        }
       }
-      for (mb_end = pc->mb_cols, tile = pc->tile_columns - 1;
-           tile >= 0; tile--) {
-        // calculate end of tile column
-        const int sb_cols = (pc->mb_cols + 3) >> 2;
-        const int sb_start = (sb_cols * tile) >> log2_tile_cols;
-        mb_start = ((sb_start << 2) > pc->mb_cols) ?
-                    pc->mb_cols : (sb_start << 2);
-
-        pc->cur_tile_idx = tile;
-        pc->cur_tile_mb_col_start = mb_start;
-        pc->cur_tile_mb_col_end   = mb_end;
-
-        setup_token_decoder(pbi, data_ptr2[tile], &residual_bc);
-
-        /* Decode a row of superblocks */
-        for (mb_row = 0; mb_row < pc->mb_rows; mb_row += 4) {
-          decode_sb_row(pbi, pc, mb_row, xd, &residual_bc);
+
+      for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) {
+        vp9_get_tile_row_offsets(pc, tile_row);
+        for (tile_col = n_cols - 1; tile_col >= 0; tile_col--) {
+          vp9_get_tile_col_offsets(pc, tile_col);
+          setup_token_decoder(pbi, data_ptr2[tile_row][tile_col], &residual_bc);
+
+          /* Decode a row of superblocks */
+          for (mb_row = pc->cur_tile_mb_row_start;
+               mb_row < pc->cur_tile_mb_row_end; mb_row += 4) {
+            decode_sb_row(pbi, pc, mb_row, xd, &residual_bc);
+          }
+          if (tile_row == pc->tile_rows - 1 && tile_col == n_cols - 1)
+            bc_bak = residual_bc;
         }
-        mb_end = mb_start;
-        if (tile == pc->tile_columns - 1)
-          bc_bak = residual_bc;
       }
       residual_bc = bc_bak;
     } else {
-      for (mb_start = 0, tile = 0; tile < pc->tile_columns; tile++) {
-        // calculate end of tile column
-        const int sb_cols = (pc->mb_cols + 3) >> 2;
-        const int sb_end = (sb_cols * (tile + 1)) >> log2_tile_cols;
-        mb_end = ((sb_end << 2) > pc->mb_cols) ? pc->mb_cols : (sb_end << 2);
-
-        pc->cur_tile_idx = tile;
-        pc->cur_tile_mb_col_start = mb_start;
-        pc->cur_tile_mb_col_end   = mb_end;
-
-        if (tile < pc->tile_columns - 1)
-          setup_token_decoder(pbi, data_ptr + 4, &residual_bc);
-        else
-          setup_token_decoder(pbi, data_ptr, &residual_bc);
-
-        /* Decode a row of superblocks */
-        for (mb_row = 0; mb_row < pc->mb_rows; mb_row += 4) {
-          decode_sb_row(pbi, pc, mb_row, xd, &residual_bc);
-        }
-        mb_start = mb_end;
-        if (tile < pc->tile_columns - 1) {
-          int size = data_ptr[0] + (data_ptr[1] << 8) + (data_ptr[2] << 16) +
-                    (data_ptr[3] << 24);
-          data_ptr += 4 + size;
+      for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) {
+        vp9_get_tile_row_offsets(pc, tile_row);
+        for (tile_col = 0; tile_col < pc->tile_columns; tile_col++) {
+          vp9_get_tile_col_offsets(pc, tile_col);
+
+          if (tile_col < pc->tile_columns - 1 || tile_row < pc->tile_rows - 1)
+            setup_token_decoder(pbi, data_ptr + 4, &residual_bc);
+          else
+            setup_token_decoder(pbi, data_ptr, &residual_bc);
+
+          /* Decode a row of superblocks */
+          for (mb_row = pc->cur_tile_mb_row_start;
+               mb_row < pc->cur_tile_mb_row_end; mb_row += 4) {
+            decode_sb_row(pbi, pc, mb_row, xd, &residual_bc);
+          }
+          if (tile_col < pc->tile_columns - 1 || tile_row < pc->tile_rows - 1) {
+            int size = data_ptr[0] + (data_ptr[1] << 8) + (data_ptr[2] << 16) +
+                      (data_ptr[3] << 24);
+            data_ptr += 4 + size;
+          }
         }
       }
     }
diff --git a/vp9/decoder/vp9_dequantize.c b/vp9/decoder/vp9_dequantize.c
index 1f64767fa..92a9df84c 100644
--- a/vp9/decoder/vp9_dequantize.c
+++ b/vp9/decoder/vp9_dequantize.c
@@ -357,21 +357,17 @@ void vp9_dequant_idct_add_32x32_c(int16_t *input, const int16_t *dq,
 
   if (eob) {
     input[0] = input[0] * dq[0] / 2;
-#if !CONFIG_DWTDCTHYBRID
     if (eob == 1) {
       vp9_short_idct1_32x32_c(input, output);
       add_constant_residual(output[0], pred, pitch, dest, stride, 32, 32);
       input[0] = 0;
     } else {
-#endif
       for (i = 1; i < 1024; i++)
         input[i] = input[i] * dq[1] / 2;
       vp9_short_idct32x32_c(input, output, 64);
       vpx_memset(input, 0, 2048);
       add_residual(output, pred, pitch, dest, stride, 32, 32);
-#if !CONFIG_DWTDCTHYBRID
     }
-#endif
   }
 }
 
diff --git a/vp9/decoder/vp9_dequantize.h b/vp9/decoder/vp9_dequantize.h
index 2edbd6a3a..b7efb44f1 100644
--- a/vp9/decoder/vp9_dequantize.h
+++ b/vp9/decoder/vp9_dequantize.h
@@ -42,20 +42,6 @@ extern void vp9_dequant_idct_add_uv_block_lossless_c(int16_t *q, const int16_t *
                                                      uint16_t *eobs);
 #endif
 
-typedef void (*vp9_dequant_idct_add_fn_t)(int16_t *input, const int16_t *dq,
-    unsigned char *pred, unsigned char *output, int pitch, int stride);
-typedef void(*vp9_dequant_dc_idct_add_fn_t)(int16_t *input, const int16_t *dq,
-    unsigned char *pred, unsigned char *output, int pitch, int stride, int dc);
-
-typedef void(*vp9_dequant_dc_idct_add_y_block_fn_t)(int16_t *q, const int16_t *dq,
-    unsigned char *pre, unsigned char *dst, int stride, uint16_t *eobs,
-    const int16_t *dc);
-typedef void(*vp9_dequant_idct_add_y_block_fn_t)(int16_t *q, const int16_t *dq,
-    unsigned char *pre, unsigned char *dst, int stride, uint16_t *eobs);
-typedef void(*vp9_dequant_idct_add_uv_block_fn_t)(int16_t *q, const int16_t *dq,
-    unsigned char *pre, unsigned char *dst_u, unsigned char *dst_v, int stride,
-    uint16_t *eobs);
-
 void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input, const int16_t *dq,
                                     unsigned char *pred, unsigned char *dest,
                                     int pitch, int stride, uint16_t eobs);
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index acf69d906..bfdb486b8 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -63,24 +63,11 @@ static int get_signed(BOOL_DECODER *br, int value_to_sign) {
   return decode_bool(br, 128) ? -value_to_sign : value_to_sign;
 }
 
-#if CONFIG_NEWCOEFCONTEXT
-#define PT pn
-#define INCREMENT_COUNT(token)                       \
-  do {                                               \
-    coef_counts[type][coef_bands[c]][pn][token]++;   \
-    pn = pt = vp9_prev_token_class[token];           \
-    if (c < seg_eob - 1 && NEWCOEFCONTEXT_BAND_COND(coef_bands[c + 1]))  \
-      pn = vp9_get_coef_neighbor_context(            \
-          qcoeff_ptr, nodc, neighbors, scan[c + 1]); \
-  } while (0)
-#else
-#define PT pt
 #define INCREMENT_COUNT(token)               \
   do {                                       \
-    coef_counts[type][coef_bands[c]][pt][token]++; \
-    pt = vp9_prev_token_class[token];              \
+    coef_counts[type][coef_bands[c]][pt][token]++;            \
+    pt = vp9_get_coef_context(&recent_energy, token);         \
   } while (0)
-#endif  /* CONFIG_NEWCOEFCONTEXT */
 
 #define WRITE_COEF_CONTINUE(val, token)                       \
   {                                                           \
@@ -108,10 +95,7 @@ static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd,
   const int lidx = vp9_block2left[txfm_size][block_idx];
   ENTROPY_CONTEXT above_ec = A0[aidx] != 0, left_ec = L0[lidx] != 0;
   FRAME_CONTEXT *const fc = &dx->common.fc;
-#if CONFIG_NEWCOEFCONTEXT
-  const int *neighbors;
-  int pn;
-#endif
+  int recent_energy = 0;
   int nodc = (type == PLANE_TYPE_Y_NO_DC);
   int pt, c = nodc;
   vp9_coeff_probs *coef_probs;
@@ -192,15 +176,11 @@ static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd,
   }
 
   VP9_COMBINEENTROPYCONTEXTS(pt, above_ec, left_ec);
-#if CONFIG_NEWCOEFCONTEXT
-  pn = pt;
-  neighbors = vp9_get_coef_neighbors_handle(scan);
-#endif
   while (1) {
     int val;
     const uint8_t *cat6 = cat6_prob;
     if (c >= seg_eob) break;
-    prob = coef_probs[type][coef_bands[c]][PT];
+    prob = coef_probs[type][coef_bands[c]][pt];
     if (!vp9_read(br, prob[EOB_CONTEXT_NODE]))
       break;
 SKIP_START:
@@ -208,7 +188,7 @@ SKIP_START:
     if (!vp9_read(br, prob[ZERO_CONTEXT_NODE])) {
       INCREMENT_COUNT(ZERO_TOKEN);
       ++c;
-      prob = coef_probs[type][coef_bands[c]][PT];
+      prob = coef_probs[type][coef_bands[c]][pt];
       goto SKIP_START;
     }
     // ONE_CONTEXT_NODE_0_
@@ -272,7 +252,7 @@ SKIP_START:
   }
 
   if (c < seg_eob)
-    coef_counts[type][coef_bands[c]][PT][DCT_EOB_TOKEN]++;
+    coef_counts[type][coef_bands[c]][pt][DCT_EOB_TOKEN]++;
 
   A0[aidx] = L0[lidx] = (c > !type);
   if (txfm_size >= TX_8X8 && type != PLANE_TYPE_Y2) {
diff --git a/vp9/decoder/vp9_idct_blk.c b/vp9/decoder/vp9_idct_blk.c
index b350e4d68..80b301931 100644
--- a/vp9/decoder/vp9_idct_blk.c
+++ b/vp9/decoder/vp9_idct_blk.c
@@ -51,9 +51,9 @@ void vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(int16_t *q,
   for (i = 0; i < 4; i++) {
     for (j = 0; j < 4; j++) {
       if (*eobs++ > 1)
-        vp9_dequant_dc_idct_add_c(q, dq, dst, dst, stride, stride, dc[0]);
+        xd->dc_itxm_add(q, dq, dst, dst, stride, stride, dc[0]);
       else
-        vp9_dc_only_idct_add_c(dc[0], dst, dst, stride, stride);
+        xd->dc_only_itxm_add(dc[0], dst, dst, stride, stride);
 
       q   += 16;
       dst += 4;
@@ -168,9 +168,9 @@ void vp9_dequant_idct_add_uv_block_4x4_inplace_c(int16_t *q, const int16_t *dq,
   for (i = 0; i < 2; i++) {
     for (j = 0; j < 2; j++) {
       if (*eobs++ > 1) {
-        vp9_dequant_idct_add_c(q, dq, dstu, dstu, stride, stride);
+        xd->itxm_add(q, dq, dstu, dstu, stride, stride);
       } else {
-        vp9_dc_only_idct_add_c(q[0]*dq[0], dstu, dstu, stride, stride);
+        xd->dc_only_itxm_add(q[0]*dq[0], dstu, dstu, stride, stride);
         ((int *)q)[0] = 0;
       }
 
@@ -184,9 +184,9 @@ void vp9_dequant_idct_add_uv_block_4x4_inplace_c(int16_t *q, const int16_t *dq,
   for (i = 0; i < 2; i++) {
     for (j = 0; j < 2; j++) {
       if (*eobs++ > 1) {
-        vp9_dequant_idct_add_c(q, dq, dstv, dstv, stride, stride);
+        xd->itxm_add(q, dq, dstv, dstv, stride, stride);
       } else {
-        vp9_dc_only_idct_add_c(q[0]*dq[0], dstv, dstv, stride, stride);
+        xd->dc_only_itxm_add(q[0]*dq[0], dstv, dstv, stride, stride);
         ((int *)q)[0] = 0;
       }
 
diff --git a/vp9/decoder/vp9_onyxd_int.h b/vp9/decoder/vp9_onyxd_int.h
index e04b9f5e4..0b0b90356 100644
--- a/vp9/decoder/vp9_onyxd_int.h
+++ b/vp9/decoder/vp9_onyxd_int.h
@@ -70,12 +70,6 @@ typedef struct VP9Decompressor {
 
   DETOK detoken;
 
-  vp9_dequant_idct_add_fn_t            idct_add;
-  vp9_dequant_dc_idct_add_fn_t         dc_idct_add;
-  vp9_dequant_dc_idct_add_y_block_fn_t dc_idct_add_y_block;
-  vp9_dequant_idct_add_y_block_fn_t    idct_add_y_block;
-  vp9_dequant_idct_add_uv_block_fn_t   idct_add_uv_block;
-
   int refresh_frame_flags;
   vp9_prob prob_skip_false;
 
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index a3c407865..257ddb2c5 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -14,6 +14,7 @@
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_findnearmv.h"
+#include "vp9/common/vp9_tile_common.h"
 #include "vp9/encoder/vp9_mcomp.h"
 #include "vp9/common/vp9_systemdependent.h"
 #include <assert.h>
@@ -1088,14 +1089,15 @@ static void write_modes_b(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
 }
 
 static void write_modes(VP9_COMP *cpi, vp9_writer* const bc,
-                        TOKENEXTRA **tok) {
+                        TOKENEXTRA **tok, TOKENEXTRA *tok_end) {
   VP9_COMMON *const c = &cpi->common;
   const int mis = c->mode_info_stride;
-  MODE_INFO *m, *m_ptr = c->mi + c->cur_tile_mb_col_start;
+  MODE_INFO *m, *m_ptr = c->mi;
   int i, mb_row, mb_col;
-  TOKENEXTRA *tok_end = *tok + cpi->tok_count;
 
-  for (mb_row = 0; mb_row < c->mb_rows; mb_row += 4, m_ptr += 4 * mis) {
+  m_ptr += c->cur_tile_mb_col_start + c->cur_tile_mb_row_start * mis;
+  for (mb_row = c->cur_tile_mb_row_start;
+       mb_row < c->cur_tile_mb_row_end; mb_row += 4, m_ptr += 4 * mis) {
     m = m_ptr;
     for (mb_col = c->cur_tile_mb_col_start;
          mb_col < c->cur_tile_mb_col_end; mb_col += 4, m += 4) {
@@ -1667,7 +1669,13 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
   vp9_write_literal(&header_bc, pc->sb64_coded, 8);
   pc->sb32_coded = get_binary_prob(cpi->sb32_count[0], cpi->sb32_count[1]);
   vp9_write_literal(&header_bc, pc->sb32_coded, 8);
-
+#if CONFIG_LOSSLESS
+  vp9_write_bit(&header_bc, cpi->oxcf.lossless);
+  if (cpi->oxcf.lossless) {
+    pc->txfm_mode = ONLY_4X4;
+  }
+  else
+#endif
   {
     if (pc->txfm_mode == TX_MODE_SELECT) {
       pc->prob_tx[0] = get_prob(cpi->txfm_count_32x32p[TX_4X4] +
@@ -2026,9 +2034,22 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
   }
 
   /* tiling */
-  vp9_write(&header_bc, pc->tile_columns > 1, 128);
-  if (pc->tile_columns > 1) {
-    vp9_write(&header_bc, pc->tile_columns > 2, 128);
+  {
+    int min_log2_tiles, delta_log2_tiles, n_tile_bits, n;
+
+    vp9_get_tile_n_bits(pc, &min_log2_tiles, &delta_log2_tiles);
+    n_tile_bits = pc->log2_tile_columns - min_log2_tiles;  // 1 bits to write
+    for (n = 0; n < delta_log2_tiles; n++) {  // at most delta_log2_tiles bits
+      if (n_tile_bits--) {
+        vp9_write_bit(&header_bc, 1);
+      } else {
+        vp9_write_bit(&header_bc, 0);  // a 0 bit ends the run early
+        break;
+      }
+    }
+    vp9_write_bit(&header_bc, pc->log2_tile_rows != 0);  // rows: 0 -> "0"
+    if (pc->log2_tile_rows != 0)
+      vp9_write_bit(&header_bc, pc->log2_tile_rows != 1);  // 1 -> "10" else "11"
+  }
 
   vp9_stop_encode(&header_bc);
@@ -2058,41 +2079,45 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
   }
 
   {
-    int mb_start = 0, tile;
-    int total_size = 0;
+    int tile_row, tile_col, total_size = 0;
     unsigned char *data_ptr = cx_data + header_bc.pos;
-    TOKENEXTRA *tok = cpi->tok;
-
-    for (tile = 0; tile < pc->tile_columns; tile++) {
-      // calculate end of tile column
-      const int sb_cols = (pc->mb_cols + 3) >> 2;
-      const int sb_end = (sb_cols * (tile + 1)) >> cpi->oxcf.tile_columns;
-      const int mb_end = ((sb_end << 2) > pc->mb_cols) ?
-                          pc->mb_cols : (sb_end << 2);
-
-      pc->cur_tile_idx = tile;
-      pc->cur_tile_mb_col_start = mb_start;
-      pc->cur_tile_mb_col_end = mb_end;
-
-      if (tile < pc->tile_columns - 1)
-        vp9_start_encode(&residual_bc, data_ptr + total_size + 4);
-      else
-        vp9_start_encode(&residual_bc, data_ptr + total_size);
-      write_modes(cpi, &residual_bc, &tok);
-      vp9_stop_encode(&residual_bc);
-      if (tile < pc->tile_columns - 1) {
-        /* size of this tile */
-        data_ptr[total_size + 0] = residual_bc.pos;
-        data_ptr[total_size + 1] = residual_bc.pos >> 8;
-        data_ptr[total_size + 2] = residual_bc.pos >> 16;
-        data_ptr[total_size + 3] = residual_bc.pos >> 24;
-        total_size += 4;
-      }
+    TOKENEXTRA *tok[1 << 6], *tok_end;
+
+    tok[0] = cpi->tok;
+    for (tile_col = 1; tile_col < pc->tile_columns; tile_col++)
+      tok[tile_col] = tok[tile_col - 1] + cpi->tok_count[tile_col - 1];
+
+    for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) {
+      vp9_get_tile_row_offsets(pc, tile_row);
+      tok_end = cpi->tok;  // running end pointer, advanced per tile column
+      for (tile_col = 0; tile_col < pc->tile_columns; tile_col++) {
+        tok_end += cpi->tok_count[tile_col];  // end of this column's tokens
+        vp9_get_tile_col_offsets(pc, tile_col);
+
+        if (tile_col < pc->tile_columns - 1 || tile_row < pc->tile_rows - 1)
+          vp9_start_encode(&residual_bc, data_ptr + total_size + 4);
+        else
+          vp9_start_encode(&residual_bc, data_ptr + total_size);
+        write_modes(cpi, &residual_bc, &tok[tile_col], tok_end);
+        vp9_stop_encode(&residual_bc);
+        if (tile_col < pc->tile_columns - 1 || tile_row < pc->tile_rows - 1) {
+          /* size of this tile */
+          data_ptr[total_size + 0] = residual_bc.pos;
+          data_ptr[total_size + 1] = residual_bc.pos >> 8;
+          data_ptr[total_size + 2] = residual_bc.pos >> 16;
+          data_ptr[total_size + 3] = residual_bc.pos >> 24;
+          total_size += 4;
+        }
 
-      mb_start = mb_end;
-      total_size += residual_bc.pos;
+        total_size += residual_bc.pos;
+      }
     }
 
+    assert((unsigned int)(tok[0] - cpi->tok) == cpi->tok_count[0]);
+    for (tile_col = 1; tile_col < pc->tile_columns; tile_col++)
+      assert((unsigned int)(tok[tile_col] - tok[tile_col - 1]) ==
+                  cpi->tok_count[tile_col]);
+
     *size += total_size;
   }
 }
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 91d4c4530..d5110c810 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -169,14 +169,14 @@ typedef struct macroblock {
   PICK_MODE_CONTEXT sb32_context[4];
   PICK_MODE_CONTEXT sb64_context;
 
-  void (*vp9_short_fdct4x4)(int16_t *input, int16_t *output, int pitch);
-  void (*vp9_short_fdct8x4)(int16_t *input, int16_t *output, int pitch);
-  void (*short_walsh4x4)(int16_t *input, int16_t *output, int pitch);
+  void (*fwd_txm4x4)(int16_t *input, int16_t *output, int pitch);
+  void (*fwd_txm8x4)(int16_t *input, int16_t *output, int pitch);
+  void (*fwd_2ndtxm4x4)(int16_t *input, int16_t *output, int pitch);
+  void (*fwd_txm8x8)(int16_t *input, int16_t *output, int pitch);
+  void (*fwd_txm16x16)(int16_t *input, int16_t *output, int pitch);
+  void (*fwd_2ndtxm2x2)(int16_t *input, int16_t *output, int pitch);
   void (*quantize_b_4x4)(BLOCK *b, BLOCKD *d);
   void (*quantize_b_4x4_pair)(BLOCK *b1, BLOCK *b2, BLOCKD *d0, BLOCKD *d1);
-  void (*vp9_short_fdct8x8)(int16_t *input, int16_t *output, int pitch);
-  void (*vp9_short_fdct16x16)(int16_t *input, int16_t *output, int pitch);
-  void (*short_fhaar2x2)(int16_t *input, int16_t *output, int pitch);
   void (*quantize_b_16x16)(BLOCK *b, BLOCKD *d);
   void (*quantize_b_8x8)(BLOCK *b, BLOCKD *d);
   void (*quantize_b_2x2)(BLOCK *b, BLOCKD *d);
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index 7af044fe4..746648291 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -323,6 +323,8 @@ static const int16_t adst_i16[256] = {
 };
 #endif
 
+#define NEW_FDCT8x8 1
+#if !NEW_FDCT8x8
 static const int xC1S7 = 16069;
 static const int xC2S6 = 15137;
 static const int xC3S5 = 13623;
@@ -560,6 +562,7 @@ void vp9_short_fdct8x8_c(short *InputData, short *OutputData, int pitch) {
     op++;
   }
 }
+#endif
 
 void vp9_short_fhaar2x2_c(short *input, short *output, int pitch) {
   /* [1 1; 1 -1] orthogonal transform */
@@ -836,6 +839,79 @@ void vp9_short_fdct8x4_c(short *input, short *output, int pitch)
     vp9_short_fdct4x4_c(input + 4, output + 16, pitch);
 }
 
+#if NEW_FDCT8x8
+static void fdct8_1d(int16_t *input, int16_t *output) {
+  int16_t step[8];  // scratch reused by every stage
+  int temp1, temp2;  // int products fed to dct_const_round_shift
+  // 8-point 1-D forward transform: reads input[0..7], writes output[0..7].
+  // Stage 1: sums of mirrored inputs in step[0..3], differences in step[4..7]
+  step[0] = input[0] + input[7];
+  step[1] = input[1] + input[6];
+  step[2] = input[2] + input[5];
+  step[3] = input[3] + input[4];
+  step[4] = input[3] - input[4];
+  step[5] = input[2] - input[5];
+  step[6] = input[1] - input[6];
+  step[7] = input[0] - input[7];
+
+  fdct4_1d(step, step);  // in place on step[0..3]? TODO confirm
+
+  // Stage 2: step[4], step[7] copied; step[5]/step[6] mixed via cospi_16_64
+  output[4] = step[4];
+  temp1 = (-step[5] + step[6]) * cospi_16_64;
+  temp2 = (step[6] + step[5]) * cospi_16_64;
+  output[5] = dct_const_round_shift(temp1);
+  output[6] = dct_const_round_shift(temp2);
+  output[7] = step[7];
+
+  // Stage 3: sums/differences of the stage-2 values parked in output[4..7]
+  step[4] = output[4] + output[5];
+  step[5] = -output[5] + output[4];
+  step[6] = -output[6] + output[7];
+  step[7] = output[7] + output[6];
+
+  // Stage 4: final outputs; output[4..7] scratch values are overwritten here
+  output[0] = step[0];
+  output[4] = step[2];
+  output[2] = step[1];
+  output[6] = step[3];
+
+  temp1 = step[4] * cospi_28_64 + step[7] * cospi_4_64;
+  temp2 = step[5] * cospi_12_64 + step[6] * cospi_20_64;
+  output[1] = dct_const_round_shift(temp1);
+  output[5] = dct_const_round_shift(temp2);
+  temp1 = step[6] * cospi_12_64 + step[5] * -cospi_20_64;
+  temp2 = step[7] * cospi_28_64 + step[4] * -cospi_4_64;
+  output[3] = dct_const_round_shift(temp1);
+  output[7] = dct_const_round_shift(temp2);
+}
+
+void vp9_short_fdct8x8_c(int16_t *input, int16_t *output, int pitch) {
+  int shortpitch = pitch >> 1;  // input row stride in int16_t elements
+  int i, j;
+  int16_t out[64];  // 8x8 intermediate produced by the column pass
+  int16_t temp_in[8], temp_out[8];
+  // 2-D 8x8 forward transform: fdct8_1d over columns, then over rows.
+  // Column pass: scale each input by 4, then transform column i.
+  for (i = 0; i < 8; i++) {
+    for (j = 0; j < 8; j++)
+      temp_in[j] = input[j * shortpitch + i] * 4;  // not << 2: UB if negative
+    fdct8_1d(temp_in, temp_out);
+    for (j = 0; j < 8; j++)
+      out[j * 8 + i] = temp_out[j];
+  }
+
+  // Row pass: transform row i, shifting each result right by 1 on output.
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      temp_in[j] = out[j + i * 8];
+    fdct8_1d(temp_in, temp_out);
+    for (j = 0; j < 8; ++j)
+      output[j + i * 8] = temp_out[j] >> 1;
+  }
+}
+#endif
+
 void vp9_short_walsh4x4_c(short *input, short *output, int pitch) {
   int i;
   int a1, b1, c1, d1;
@@ -1395,8 +1471,6 @@ void vp9_short_fdct16x16_c(int16_t *input, int16_t *out, int pitch) {
 #undef ROUNDING
 #endif
 
-#if !CONFIG_DWTDCTHYBRID
-
 #define TEST_INT_32x32_DCT 1
 
 #if !TEST_INT_32x32_DCT
@@ -2134,706 +2208,3 @@ void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) {
 }
 
 #endif
-
-#else  // CONFIG_DWTDCTHYBRID
-
-#if DWT_TYPE == 53
-
-// Note: block length must be even for this implementation
-static void analysis_53_row(int length, short *x,
-                            short *lowpass, short *highpass) {
-  int n;
-  short r, *a, *b;
-
-  n = length >> 1;
-  b = highpass;
-  a = lowpass;
-  while (--n) {
-    *a++ = (r = *x++) << 1;
-    *b++ = *x - ((r + x[1] + 1) >> 1);
-    x++;
-  }
-  *a = (r = *x++) << 1;
-  *b = *x - r;
-
-  n = length >> 1;
-  b = highpass;
-  a = lowpass;
-  r = *highpass;
-  while (n--) {
-    *a++ += (r + (*b) + 1) >> 1;
-    r = *b++;
-  }
-}
-
-static void analysis_53_col(int length, short *x,
-                            short *lowpass, short *highpass) {
-  int n;
-  short r, *a, *b;
-
-  n = length >> 1;
-  b = highpass;
-  a = lowpass;
-  while (--n) {
-    *a++ = (r = *x++);
-    *b++ = (((*x) << 1) - (r + x[1]) + 2) >> 2;
-    x++;
-  }
-  *a = (r = *x++);
-  *b = (*x - r + 1) >> 1;
-
-  n = length >> 1;
-  b = highpass;
-  a = lowpass;
-  r = *highpass;
-  while (n--) {
-    *a++ += (r + (*b) + 1) >> 1;
-    r = *b++;
-  }
-}
-
-static void dyadic_analyze_53(int levels, int width, int height,
-                              short *x, int pitch_x, short *c, int pitch_c) {
-  int lv, i, j, nh, nw, hh = height, hw = width;
-  short buffer[2 * DWT_MAX_LENGTH];
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      c[i * pitch_c + j] = x[i * pitch_x + j] << DWT_PRECISION_BITS;
-    }
-  }
-  for (lv = 0; lv < levels; lv++) {
-    nh = hh;
-    hh = (hh + 1) >> 1;
-    nw = hw;
-    hw = (hw + 1) >> 1;
-    if ((nh < 2) || (nw < 2)) return;
-    for (i = 0; i < nh; i++) {
-      memcpy(buffer, &c[i * pitch_c], nw * sizeof(short));
-      analysis_53_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw);
-    }
-    for (j = 0; j < nw; j++) {
-      for (i = 0; i < nh; i++)
-        buffer[i + nh] = c[i * pitch_c + j];
-      analysis_53_col(nh, buffer + nh, buffer, buffer + hh);
-      for (i = 0; i < nh; i++)
-        c[i * pitch_c + j] = buffer[i];
-    }
-  }
-}
-
-#elif DWT_TYPE == 26
-
-static void analysis_26_row(int length, short *x,
-                            short *lowpass, short *highpass) {
-  int i, n;
-  short r, s, *a, *b;
-  a = lowpass;
-  b = highpass;
-  for (i = length >> 1; i; i--) {
-    r = *x++;
-    s = *x++;
-    *a++ = r + s;
-    *b++ = r - s;
-  }
-  n = length >> 1;
-  if (n >= 4) {
-    a = lowpass;
-    b = highpass;
-    r = *lowpass;
-    while (--n) {
-      *b++ -= (r - a[1] + 4) >> 3;
-      r = *a++;
-    }
-    *b -= (r - *a + 4) >> 3;
-  }
-}
-
-static void analysis_26_col(int length, short *x,
-                            short *lowpass, short *highpass) {
-  int i, n;
-  short r, s, *a, *b;
-  a = lowpass;
-  b = highpass;
-  for (i = length >> 1; i; i--) {
-    r = *x++;
-    s = *x++;
-    *a++ = (r + s + 1) >> 1;
-    *b++ = (r - s + 1) >> 1;
-  }
-  n = length >> 1;
-  if (n >= 4) {
-    a = lowpass;
-    b = highpass;
-    r = *lowpass;
-    while (--n) {
-      *b++ -= (r - a[1] + 4) >> 3;
-      r = *a++;
-    }
-    *b -= (r - *a + 4) >> 3;
-  }
-}
-
-static void dyadic_analyze_26(int levels, int width, int height,
-                              short *x, int pitch_x, short *c, int pitch_c) {
-  int lv, i, j, nh, nw, hh = height, hw = width;
-  short buffer[2 * DWT_MAX_LENGTH];
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      c[i * pitch_c + j] = x[i * pitch_x + j] << DWT_PRECISION_BITS;
-    }
-  }
-  for (lv = 0; lv < levels; lv++) {
-    nh = hh;
-    hh = (hh + 1) >> 1;
-    nw = hw;
-    hw = (hw + 1) >> 1;
-    if ((nh < 2) || (nw < 2)) return;
-    for (i = 0; i < nh; i++) {
-      memcpy(buffer, &c[i * pitch_c], nw * sizeof(short));
-      analysis_26_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw);
-    }
-    for (j = 0; j < nw; j++) {
-      for (i = 0; i < nh; i++)
-        buffer[i + nh] = c[i * pitch_c + j];
-      analysis_26_col(nh, buffer + nh, buffer, buffer + hh);
-      for (i = 0; i < nh; i++)
-        c[i * pitch_c + j] = buffer[i];
-    }
-  }
-}
-
-#elif DWT_TYPE == 97
-
-static void analysis_97(int length, double *x,
-                        double *lowpass, double *highpass) {
-  static const double a_predict1 = -1.586134342;
-  static const double a_update1 = -0.05298011854;
-  static const double a_predict2 = 0.8829110762;
-  static const double a_update2 = 0.4435068522;
-  static const double s_low = 1.149604398;
-  static const double s_high = 1/1.149604398;
-  int i;
-  double y[DWT_MAX_LENGTH];
-  // Predict 1
-  for (i = 1; i < length - 2; i += 2) {
-    x[i] += a_predict1 * (x[i - 1] + x[i + 1]);
-  }
-  x[length - 1] += 2 * a_predict1 * x[length - 2];
-  // Update 1
-  for (i = 2; i < length; i += 2) {
-    x[i] += a_update1 * (x[i - 1] + x[i + 1]);
-  }
-  x[0] += 2 * a_update1 * x[1];
-  // Predict 2
-  for (i = 1; i < length - 2; i += 2) {
-    x[i] += a_predict2 * (x[i - 1] + x[i + 1]);
-  }
-  x[length - 1] += 2 * a_predict2 * x[length - 2];
-  // Update 2
-  for (i = 2; i < length; i += 2) {
-    x[i] += a_update2 * (x[i - 1] + x[i + 1]);
-  }
-  x[0] += 2 * a_update2 * x[1];
-  memcpy(y, x, sizeof(*y) * length);
-  // Scale and pack
-  for (i = 0; i < length / 2; i++) {
-    lowpass[i] = y[2 * i] * s_low;
-    highpass[i] = y[2 * i + 1] * s_high;
-  }
-}
-
-static void dyadic_analyze_97(int levels, int width, int height,
-                             short *x, int pitch_x, short *c, int pitch_c) {
-  int lv, i, j, nh, nw, hh = height, hw = width;
-  double buffer[2 * DWT_MAX_LENGTH];
-  double y[DWT_MAX_LENGTH * DWT_MAX_LENGTH];
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      y[i * DWT_MAX_LENGTH + j] = x[i * pitch_x + j] << DWT_PRECISION_BITS;
-    }
-  }
-  for (lv = 0; lv < levels; lv++) {
-    nh = hh;
-    hh = (hh + 1) >> 1;
-    nw = hw;
-    hw = (hw + 1) >> 1;
-    if ((nh < 2) || (nw < 2)) return;
-    for (i = 0; i < nh; i++) {
-      memcpy(buffer, &y[i * DWT_MAX_LENGTH], nw * sizeof(*buffer));
-      analysis_97(nw, buffer, &y[i * DWT_MAX_LENGTH],
-                  &y[i * DWT_MAX_LENGTH] + hw);
-    }
-    for (j = 0; j < nw; j++) {
-      for (i = 0; i < nh; i++)
-        buffer[i + nh] = y[i * DWT_MAX_LENGTH + j];
-      analysis_97(nh, buffer + nh, buffer, buffer + hh);
-      for (i = 0; i < nh; i++)
-        c[i * pitch_c + j] = round(buffer[i]);
-    }
-  }
-}
-
-#endif  // DWT_TYPE
-
-// TODO(debargha): Implement the scaling differently so as not to have to
-// use the floating point dct
-static void dct16x16_1d_f(double input[16], double output[16]) {
-  static const double C1 = 0.995184726672197;
-  static const double C2 = 0.98078528040323;
-  static const double C3 = 0.956940335732209;
-  static const double C4 = 0.923879532511287;
-  static const double C5 = 0.881921264348355;
-  static const double C6 = 0.831469612302545;
-  static const double C7 = 0.773010453362737;
-  static const double C8 = 0.707106781186548;
-  static const double C9 = 0.634393284163646;
-  static const double C10 = 0.555570233019602;
-  static const double C11 = 0.471396736825998;
-  static const double C12 = 0.38268343236509;
-  static const double C13 = 0.290284677254462;
-  static const double C14 = 0.195090322016128;
-  static const double C15 = 0.098017140329561;
-
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-  {
-    double step[16];
-    double intermediate[16];
-    double temp1, temp2;
-
-    // step 1
-    step[ 0] = input[0] + input[15];
-    step[ 1] = input[1] + input[14];
-    step[ 2] = input[2] + input[13];
-    step[ 3] = input[3] + input[12];
-    step[ 4] = input[4] + input[11];
-    step[ 5] = input[5] + input[10];
-    step[ 6] = input[6] + input[ 9];
-    step[ 7] = input[7] + input[ 8];
-    step[ 8] = input[7] - input[ 8];
-    step[ 9] = input[6] - input[ 9];
-    step[10] = input[5] - input[10];
-    step[11] = input[4] - input[11];
-    step[12] = input[3] - input[12];
-    step[13] = input[2] - input[13];
-    step[14] = input[1] - input[14];
-    step[15] = input[0] - input[15];
-
-    // step 2
-    output[0] = step[0] + step[7];
-    output[1] = step[1] + step[6];
-    output[2] = step[2] + step[5];
-    output[3] = step[3] + step[4];
-    output[4] = step[3] - step[4];
-    output[5] = step[2] - step[5];
-    output[6] = step[1] - step[6];
-    output[7] = step[0] - step[7];
-
-    temp1 = step[ 8]*C7;
-    temp2 = step[15]*C9;
-    output[ 8] = temp1 + temp2;
-
-    temp1 = step[ 9]*C11;
-    temp2 = step[14]*C5;
-    output[ 9] = temp1 - temp2;
-
-    temp1 = step[10]*C3;
-    temp2 = step[13]*C13;
-    output[10] = temp1 + temp2;
-
-    temp1 = step[11]*C15;
-    temp2 = step[12]*C1;
-    output[11] = temp1 - temp2;
-
-    temp1 = step[11]*C1;
-    temp2 = step[12]*C15;
-    output[12] = temp2 + temp1;
-
-    temp1 = step[10]*C13;
-    temp2 = step[13]*C3;
-    output[13] = temp2 - temp1;
-
-    temp1 = step[ 9]*C5;
-    temp2 = step[14]*C11;
-    output[14] = temp2 + temp1;
-
-    temp1 = step[ 8]*C9;
-    temp2 = step[15]*C7;
-    output[15] = temp2 - temp1;
-
-    // step 3
-    step[ 0] = output[0] + output[3];
-    step[ 1] = output[1] + output[2];
-    step[ 2] = output[1] - output[2];
-    step[ 3] = output[0] - output[3];
-
-    temp1 = output[4]*C14;
-    temp2 = output[7]*C2;
-    step[ 4] = temp1 + temp2;
-
-    temp1 = output[5]*C10;
-    temp2 = output[6]*C6;
-    step[ 5] = temp1 + temp2;
-
-    temp1 = output[5]*C6;
-    temp2 = output[6]*C10;
-    step[ 6] = temp2 - temp1;
-
-    temp1 = output[4]*C2;
-    temp2 = output[7]*C14;
-    step[ 7] = temp2 - temp1;
-
-    step[ 8] = output[ 8] + output[11];
-    step[ 9] = output[ 9] + output[10];
-    step[10] = output[ 9] - output[10];
-    step[11] = output[ 8] - output[11];
-
-    step[12] = output[12] + output[15];
-    step[13] = output[13] + output[14];
-    step[14] = output[13] - output[14];
-    step[15] = output[12] - output[15];
-
-    // step 4
-    output[ 0] = (step[ 0] + step[ 1]);
-    output[ 8] = (step[ 0] - step[ 1]);
-
-    temp1 = step[2]*C12;
-    temp2 = step[3]*C4;
-    temp1 = temp1 + temp2;
-    output[ 4] = 2*(temp1*C8);
-
-    temp1 = step[2]*C4;
-    temp2 = step[3]*C12;
-    temp1 = temp2 - temp1;
-    output[12] = 2*(temp1*C8);
-
-    output[ 2] = 2*((step[4] + step[ 5])*C8);
-    output[14] = 2*((step[7] - step[ 6])*C8);
-
-    temp1 = step[4] - step[5];
-    temp2 = step[6] + step[7];
-    output[ 6] = (temp1 + temp2);
-    output[10] = (temp1 - temp2);
-
-    intermediate[8] = step[8] + step[14];
-    intermediate[9] = step[9] + step[15];
-
-    temp1 = intermediate[8]*C12;
-    temp2 = intermediate[9]*C4;
-    temp1 = temp1 - temp2;
-    output[3] = 2*(temp1*C8);
-
-    temp1 = intermediate[8]*C4;
-    temp2 = intermediate[9]*C12;
-    temp1 = temp2 + temp1;
-    output[13] = 2*(temp1*C8);
-
-    output[ 9] = 2*((step[10] + step[11])*C8);
-
-    intermediate[11] = step[10] - step[11];
-    intermediate[12] = step[12] + step[13];
-    intermediate[13] = step[12] - step[13];
-    intermediate[14] = step[ 8] - step[14];
-    intermediate[15] = step[ 9] - step[15];
-
-    output[15] = (intermediate[11] + intermediate[12]);
-    output[ 1] = -(intermediate[11] - intermediate[12]);
-
-    output[ 7] = 2*(intermediate[13]*C8);
-
-    temp1 = intermediate[14]*C12;
-    temp2 = intermediate[15]*C4;
-    temp1 = temp1 - temp2;
-    output[11] = -2*(temp1*C8);
-
-    temp1 = intermediate[14]*C4;
-    temp2 = intermediate[15]*C12;
-    temp1 = temp2 + temp1;
-    output[ 5] = 2*(temp1*C8);
-  }
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-}
-
-static void vp9_short_fdct16x16_c_f(short *input, short *out, int pitch,
-                                    int scale) {
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-  {
-    int shortpitch = pitch >> 1;
-    int i, j;
-    double output[256];
-    // First transform columns
-    for (i = 0; i < 16; i++) {
-        double temp_in[16], temp_out[16];
-        for (j = 0; j < 16; j++)
-            temp_in[j] = input[j*shortpitch + i];
-        dct16x16_1d_f(temp_in, temp_out);
-        for (j = 0; j < 16; j++)
-            output[j*16 + i] = temp_out[j];
-    }
-    // Then transform rows
-    for (i = 0; i < 16; ++i) {
-        double temp_in[16], temp_out[16];
-        for (j = 0; j < 16; ++j)
-            temp_in[j] = output[j + i*16];
-        dct16x16_1d_f(temp_in, temp_out);
-        for (j = 0; j < 16; ++j)
-            output[j + i*16] = temp_out[j];
-    }
-    // Scale by some magic number
-    for (i = 0; i < 256; i++)
-        out[i] = (short)round(output[i] / (2 << scale));
-  }
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-}
-
-void vp9_short_fdct8x8_c_f(short *block, short *coefs, int pitch, int scale) {
-  int j1, i, j, k;
-  float b[8];
-  float b1[8];
-  float d[8][8];
-  float f0 = (float) .7071068;
-  float f1 = (float) .4903926;
-  float f2 = (float) .4619398;
-  float f3 = (float) .4157348;
-  float f4 = (float) .3535534;
-  float f5 = (float) .2777851;
-  float f6 = (float) .1913417;
-  float f7 = (float) .0975452;
-  pitch = pitch / 2;
-  for (i = 0, k = 0; i < 8; i++, k += pitch) {
-    for (j = 0; j < 8; j++) {
-      b[j] = (float)(block[k + j] << (3 - scale));
-    }
-    /* Horizontal transform */
-    for (j = 0; j < 4; j++) {
-      j1 = 7 - j;
-      b1[j] = b[j] + b[j1];
-      b1[j1] = b[j] - b[j1];
-    }
-    b[0] = b1[0] + b1[3];
-    b[1] = b1[1] + b1[2];
-    b[2] = b1[1] - b1[2];
-    b[3] = b1[0] - b1[3];
-    b[4] = b1[4];
-    b[5] = (b1[6] - b1[5]) * f0;
-    b[6] = (b1[6] + b1[5]) * f0;
-    b[7] = b1[7];
-    d[i][0] = (b[0] + b[1]) * f4;
-    d[i][4] = (b[0] - b[1]) * f4;
-    d[i][2] = b[2] * f6 + b[3] * f2;
-    d[i][6] = b[3] * f6 - b[2] * f2;
-    b1[4] = b[4] + b[5];
-    b1[7] = b[7] + b[6];
-    b1[5] = b[4] - b[5];
-    b1[6] = b[7] - b[6];
-    d[i][1] = b1[4] * f7 + b1[7] * f1;
-    d[i][5] = b1[5] * f3 + b1[6] * f5;
-    d[i][7] = b1[7] * f7 - b1[4] * f1;
-    d[i][3] = b1[6] * f3 - b1[5] * f5;
-  }
-  /* Vertical transform */
-  for (i = 0; i < 8; i++) {
-    for (j = 0; j < 4; j++) {
-      j1 = 7 - j;
-      b1[j] = d[j][i] + d[j1][i];
-      b1[j1] = d[j][i] - d[j1][i];
-    }
-    b[0] = b1[0] + b1[3];
-    b[1] = b1[1] + b1[2];
-    b[2] = b1[1] - b1[2];
-    b[3] = b1[0] - b1[3];
-    b[4] = b1[4];
-    b[5] = (b1[6] - b1[5]) * f0;
-    b[6] = (b1[6] + b1[5]) * f0;
-    b[7] = b1[7];
-    d[0][i] = (b[0] + b[1]) * f4;
-    d[4][i] = (b[0] - b[1]) * f4;
-    d[2][i] = b[2] * f6 + b[3] * f2;
-    d[6][i] = b[3] * f6 - b[2] * f2;
-    b1[4] = b[4] + b[5];
-    b1[7] = b[7] + b[6];
-    b1[5] = b[4] - b[5];
-    b1[6] = b[7] - b[6];
-    d[1][i] = b1[4] * f7 + b1[7] * f1;
-    d[5][i] = b1[5] * f3 + b1[6] * f5;
-    d[7][i] = b1[7] * f7 - b1[4] * f1;
-    d[3][i] = b1[6] * f3 - b1[5] * f5;
-  }
-  for (i = 0; i < 8; i++) {
-    for (j = 0; j < 8; j++) {
-      *(coefs + j + i * 8) = (short) floor(d[i][j] + 0.5);
-    }
-  }
-  return;
-}
-
-#define divide_bits(d, n) ((n) < 0 ? (d) << (n) : (d) >> (n))
-
-#if DWTDCT_TYPE == DWTDCT16X16_LEAN
-
-void vp9_short_fdct32x32_c(short *input, short *out, int pitch) {
-  // assume out is a 32x32 buffer
-  short buffer[16 * 16];
-  int i, j;
-  const int short_pitch = pitch >> 1;
-#if DWT_TYPE == 26
-  dyadic_analyze_26(1, 32, 32, input, short_pitch, out, 32);
-#elif DWT_TYPE == 97
-  dyadic_analyze_97(1, 32, 32, input, short_pitch, out, 32);
-#elif DWT_TYPE == 53
-  dyadic_analyze_53(1, 32, 32, input, short_pitch, out, 32);
-#endif
-  // TODO(debargha): Implement more efficiently by adding output pitch
-  // argument to the dct16x16 function
-  vp9_short_fdct16x16_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i)
-    vpx_memcpy(out + i * 32, buffer + i * 16, sizeof(short) * 16);
-  for (i = 0; i < 16; ++i) {
-    for (j = 16; j < 32; ++j) {
-      out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2);
-    }
-  }
-  for (i = 16; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) {
-      out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2);
-    }
-  }
-}
-
-#elif DWTDCT_TYPE == DWTDCT16X16
-
-void vp9_short_fdct32x32_c(short *input, short *out, int pitch) {
-  // assume out is a 32x32 buffer
-  short buffer[16 * 16];
-  int i, j;
-  const int short_pitch = pitch >> 1;
-#if DWT_TYPE == 26
-  dyadic_analyze_26(1, 32, 32, input, short_pitch, out, 32);
-#elif DWT_TYPE == 97
-  dyadic_analyze_97(1, 32, 32, input, short_pitch, out, 32);
-#elif DWT_TYPE == 53
-  dyadic_analyze_53(1, 32, 32, input, short_pitch, out, 32);
-#endif
-  // TODO(debargha): Implement more efficiently by adding output pitch
-  // argument to the dct16x16 function
-  vp9_short_fdct16x16_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i)
-    vpx_memcpy(out + i * 32, buffer + i * 16, sizeof(short) * 16);
-  vp9_short_fdct16x16_c_f(out + 16, buffer, 64, 1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i)
-    vpx_memcpy(out + i * 32 + 16, buffer + i * 16, sizeof(short) * 16);
-
-  vp9_short_fdct16x16_c_f(out + 32 * 16, buffer, 64, 1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i)
-    vpx_memcpy(out + i * 32 + 32 * 16, buffer + i * 16, sizeof(short) * 16);
-
-  vp9_short_fdct16x16_c_f(out + 33 * 16, buffer, 64, 1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i)
-    vpx_memcpy(out + i * 32 + 33 * 16, buffer + i * 16, sizeof(short) * 16);
-}
-
-#elif DWTDCT_TYPE == DWTDCT8X8
-
-void vp9_short_fdct32x32_c(short *input, short *out, int pitch) {
-  // assume out is a 32x32 buffer
-  short buffer[8 * 8];
-  int i, j;
-  const int short_pitch = pitch >> 1;
-#if DWT_TYPE == 26
-  dyadic_analyze_26(2, 32, 32, input, short_pitch, out, 32);
-#elif DWT_TYPE == 97
-  dyadic_analyze_97(2, 32, 32, input, short_pitch, out, 32);
-#elif DWT_TYPE == 53
-  dyadic_analyze_53(2, 32, 32, input, short_pitch, out, 32);
-#endif
-  // TODO(debargha): Implement more efficiently by adding output pitch
-  // argument to the dct16x16 function
-  vp9_short_fdct8x8_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 8; ++i)
-    vpx_memcpy(out + i * 32, buffer + i * 8, sizeof(short) * 8);
-
-  vp9_short_fdct8x8_c_f(out + 8, buffer, 64, 1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 8; ++i)
-    vpx_memcpy(out + i * 32 + 8, buffer + i * 8, sizeof(short) * 8);
-
-  vp9_short_fdct8x8_c_f(out + 32 * 8, buffer, 64, 1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 8; ++i)
-    vpx_memcpy(out + i * 32 + 32 * 8, buffer + i * 8, sizeof(short) * 8);
-
-  vp9_short_fdct8x8_c_f(out + 33 * 8, buffer, 64, 1 + DWT_PRECISION_BITS);
-  for (i = 0; i < 8; ++i)
-    vpx_memcpy(out + i * 32 + 33 * 8, buffer + i * 8, sizeof(short) * 8);
-
-  for (i = 0; i < 16; ++i) {
-    for (j = 16; j < 32; ++j) {
-      out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2);
-    }
-  }
-  for (i = 16; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) {
-      out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2);
-    }
-  }
-}
-
-#endif
-
-#if CONFIG_TX64X64
-void vp9_short_fdct64x64_c(short *input, short *out, int pitch) {
-  // assume out is a 64x64 buffer
-  short buffer[16 * 16];
-  int i, j;
-  const int short_pitch = pitch >> 1;
-#if DWT_TYPE == 26
-  dyadic_analyze_26(2, 64, 64, input, short_pitch, out, 64);
-#elif DWT_TYPE == 97
-  dyadic_analyze_97(2, 64, 64, input, short_pitch, out, 64);
-#elif DWT_TYPE == 53
-  dyadic_analyze_53(2, 64, 64, input, short_pitch, out, 64);
-#endif
-  // TODO(debargha): Implement more efficiently by adding output pitch
-  // argument to the dct16x16 function
-  vp9_short_fdct16x16_c_f(out, buffer, 128, 2 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i)
-    vpx_memcpy(out + i * 64, buffer + i * 16, sizeof(short) * 16);
-
-#if DWTDCT_TYPE == DWTDCT16X16_LEAN
-  for (i = 0; i < 16; ++i) {
-    for (j = 16; j < 48; ++j) {
-      out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1);
-    }
-  }
-  for (i = 16; i < 64; ++i) {
-    for (j = 0; j < 64; ++j) {
-      out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1);
-    }
-  }
-#elif DWTDCT_TYPE == DWTDCT16X16
-  vp9_short_fdct16x16_c_f(out + 16, buffer, 128, 2 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i)
-    vpx_memcpy(out + i * 64 + 16, buffer + i * 16, sizeof(short) * 16);
-
-  vp9_short_fdct16x16_c_f(out + 64 * 16, buffer, 128, 2 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i)
-    vpx_memcpy(out + i * 64 + 64 * 16, buffer + i * 16, sizeof(short) * 16);
-
-  vp9_short_fdct16x16_c_f(out + 65 * 16, buffer, 128, 2 + DWT_PRECISION_BITS);
-  for (i = 0; i < 16; ++i)
-    vpx_memcpy(out + i * 64 + 65 * 16, buffer + i * 16, sizeof(short) * 16);
-
-  // There is no dct used on the highest bands for now.
-  // Need to scale these coeffs by a factor of 2/2^DWT_PRECISION_BITS
-  // TODO(debargha): experiment with turning these coeffs to 0
-  for (i = 0; i < 32; ++i) {
-    for (j = 32; j < 64; ++j) {
-      out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1);
-    }
-  }
-  for (i = 32; i < 64; ++i) {
-    for (j = 0; j < 64; ++j) {
-      out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1);
-    }
-  }
-#endif  // DWTDCT_TYPE
-}
-#endif  // CONFIG_TX64X64
-#endif  // CONFIG_DWTDCTHYBRID
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 927a1b901..fe33f2ebf 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -28,6 +28,7 @@
 #include "vp9/common/vp9_findnearmv.h"
 #include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_tile_common.h"
 #include "vp9/encoder/vp9_tokenize.h"
 #include "vp9_rtcd.h"
 #include <stdio.h>
@@ -1230,8 +1231,6 @@ static void encode_frame_internal(VP9_COMP *cpi) {
   MACROBLOCK *const x = &cpi->mb;
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
-
-  TOKENEXTRA *tp = cpi->tok;
   int totalrate;
 
   // printf("encode_frame_internal frame %d (%d)\n",
@@ -1312,26 +1311,19 @@ static void encode_frame_internal(VP9_COMP *cpi) {
 
     {
       // Take tiles into account and give start/end MB
-      int tile, mb_start = 0;
+      int tile_col;
+      TOKENEXTRA *tp = cpi->tok;
 
-      for (tile = 0; tile < cm->tile_columns; tile++) {
-        // calculate end of tile column
-        const int sb_cols = (cm->mb_cols + 3) >> 2;
-        const int sb_end = (sb_cols * (tile + 1)) >> cpi->oxcf.tile_columns;
-        const int mb_end = ((sb_end << 2) > cm->mb_cols) ?
-                            cm->mb_cols : (sb_end << 2);
+      for (tile_col = 0; tile_col < cm->tile_columns; tile_col++) {
+        TOKENEXTRA *tp_old = tp;
 
         // For each row of SBs in the frame
-        cm->cur_tile_idx = tile;
-        cm->cur_tile_mb_col_start = mb_start;
-        cm->cur_tile_mb_col_end = mb_end;
+        vp9_get_tile_col_offsets(cm, tile_col);
         for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 4) {
           encode_sb_row(cpi, mb_row, &tp, &totalrate);
         }
-        mb_start = mb_end;
+        cpi->tok_count[tile_col] = (unsigned int)(tp - tp_old);
       }
-
-      cpi->tok_count = (unsigned int)(tp - cpi->tok);
     }
 
     vpx_usec_timer_mark(&emr_timer);
@@ -1543,8 +1535,10 @@ void vp9_encode_frame(VP9_COMP *cpi) {
 
     /* transform size (4x4, 8x8, 16x16 or select-per-mb) selection */
 #if CONFIG_LOSSLESS
+    cpi->mb.e_mbd.lossless = 0;
     if (cpi->oxcf.lossless) {
       txfm_type = ONLY_4X4;
+      cpi->mb.e_mbd.lossless = 1;
     } else
 #endif
     /* FIXME (rbultje)
diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c
index 1dd30130a..a52763080 100644
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@@ -62,7 +62,7 @@ void vp9_encode_intra4x4block(MACROBLOCK *x, int ib) {
     vp9_ihtllm(b->dqcoeff, b->diff, 32, tx_type, 4, b->eob);
 #endif
   } else {
-    x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
+    x->fwd_txm4x4(be->src_diff, be->coeff, 32);
     x->quantize_b_4x4(be, b) ;
     vp9_inverse_transform_b_4x4(&x->e_mbd, ib, 32);
   }
@@ -165,7 +165,7 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) {
                    tx_type, 8, xd->block[idx].eob);
 #endif
     } else {
-      x->vp9_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
+      x->fwd_txm8x8(be->src_diff, (x->block + idx)->coeff, 32);
       x->quantize_b_8x8(x->block + idx, xd->block + idx);
       vp9_short_idct8x8(xd->block[idx].dqcoeff, xd->block[ib].diff, 32);
     }
@@ -183,13 +183,13 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) {
         vp9_ihtllm(b->dqcoeff, b->diff, 32, tx_type, 4, b->eob);
 #endif
       } else if (!(i & 1) && get_tx_type_4x4(xd, b + 1) == DCT_DCT) {
-        x->vp9_short_fdct8x4(be->src_diff, be->coeff, 32);
+        x->fwd_txm8x4(be->src_diff, be->coeff, 32);
         x->quantize_b_4x4_pair(be, be + 1, b, b + 1);
         vp9_inverse_transform_b_4x4(xd, ib + iblock[i], 32);
         vp9_inverse_transform_b_4x4(xd, ib + iblock[i] + 1, 32);
         i++;
       } else {
-        x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
+        x->fwd_txm4x4(be->src_diff, be->coeff, 32);
         x->quantize_b_4x4(be, b);
         vp9_inverse_transform_b_4x4(xd, ib + iblock[i], 32);
       }
@@ -222,7 +222,7 @@ static void encode_intra_uv4x4(MACROBLOCK *x, int ib,
 
   vp9_subtract_b(be, b, 8);
 
-  x->vp9_short_fdct4x4(be->src_diff, be->coeff, 16);
+  x->fwd_txm4x4(be->src_diff, be->coeff, 16);
   x->quantize_b_4x4(be, b);
   vp9_inverse_transform_b_4x4(&x->e_mbd, ib, 16);
 
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 52eabf129..12082a88d 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -188,11 +188,11 @@ void vp9_transform_mby_4x4(MACROBLOCK *x) {
       assert(has_2nd_order == 0);
       vp9_fht_c(b->src_diff, 32, b->coeff, tx_type, 4);
     } else if (!(i & 1) && get_tx_type_4x4(xd, &xd->block[i + 1]) == DCT_DCT) {
-      x->vp9_short_fdct8x4(&x->block[i].src_diff[0],
+      x->fwd_txm8x4(&x->block[i].src_diff[0],
                            &x->block[i].coeff[0], 32);
       i++;
     } else {
-      x->vp9_short_fdct4x4(&x->block[i].src_diff[0],
+      x->fwd_txm4x4(&x->block[i].src_diff[0],
                            &x->block[i].coeff[0], 32);
     }
   }
@@ -202,7 +202,7 @@ void vp9_transform_mby_4x4(MACROBLOCK *x) {
     build_dcblock_4x4(x);
 
     // do 2nd order transform on the dc block
-    x->short_walsh4x4(&x->block[24].src_diff[0],
+    x->fwd_2ndtxm4x4(&x->block[24].src_diff[0],
                       &x->block[24].coeff[0], 8);
   } else {
     vpx_memset(x->block[24].coeff, 0, 16 * sizeof(x->block[24].coeff[0]));
@@ -213,7 +213,7 @@ void vp9_transform_mbuv_4x4(MACROBLOCK *x) {
   int i;
 
   for (i = 16; i < 24; i += 2) {
-    x->vp9_short_fdct8x4(&x->block[i].src_diff[0],
+    x->fwd_txm8x4(&x->block[i].src_diff[0],
                          &x->block[i].coeff[0], 16);
   }
 }
@@ -253,7 +253,7 @@ void vp9_transform_mby_8x8(MACROBLOCK *x) {
       assert(has_2nd_order == 0);
       vp9_fht_c(b->src_diff, 32, b->coeff, tx_type, 8);
     } else {
-      x->vp9_short_fdct8x8(&x->block[i].src_diff[0],
+      x->fwd_txm8x8(&x->block[i].src_diff[0],
                            &x->block[i].coeff[0], 32);
     }
   }
@@ -264,7 +264,7 @@ void vp9_transform_mby_8x8(MACROBLOCK *x) {
       assert(has_2nd_order == 0);
       vp9_fht_c(b->src_diff, 32, (b + 2)->coeff, tx_type, 8);
     } else {
-      x->vp9_short_fdct8x8(&x->block[i].src_diff[0],
+      x->fwd_txm8x8(&x->block[i].src_diff[0],
                            &x->block[i + 2].coeff[0], 32);
     }
   }
@@ -274,7 +274,7 @@ void vp9_transform_mby_8x8(MACROBLOCK *x) {
     build_dcblock_8x8(x);
 
     // do 2nd order transform on the dc block
-    x->short_fhaar2x2(&x->block[24].src_diff[0],
+    x->fwd_2ndtxm2x2(&x->block[24].src_diff[0],
                       &x->block[24].coeff[0], 8);
   } else {
     vpx_memset(x->block[24].coeff, 0, 16 * sizeof(x->block[24].coeff[0]));
@@ -285,7 +285,7 @@ void vp9_transform_mbuv_8x8(MACROBLOCK *x) {
   int i;
 
   for (i = 16; i < 24; i += 4) {
-    x->vp9_short_fdct8x8(&x->block[i].src_diff[0],
+    x->fwd_txm8x8(&x->block[i].src_diff[0],
                          &x->block[i].coeff[0], 16);
   }
 }
@@ -303,7 +303,7 @@ void vp9_transform_mby_16x16(MACROBLOCK *x) {
   if (tx_type != DCT_DCT) {
     vp9_fht_c(b->src_diff, 32, b->coeff, tx_type, 16);
   } else {
-    x->vp9_short_fdct16x16(&x->block[0].src_diff[0],
+    x->fwd_txm16x16(&x->block[0].src_diff[0],
                            &x->block[0].coeff[0], 32);
   }
 }
@@ -321,9 +321,9 @@ void vp9_transform_sby_32x32(MACROBLOCK *x) {
 void vp9_transform_sbuv_16x16(MACROBLOCK *x) {
   SUPERBLOCK * const x_sb = &x->sb_coeff_data;
   vp9_clear_system_state();
-  x->vp9_short_fdct16x16(x_sb->src_diff + 1024,
+  x->fwd_txm16x16(x_sb->src_diff + 1024,
                          x_sb->coeff + 1024, 32);
-  x->vp9_short_fdct16x16(x_sb->src_diff + 1280,
+  x->fwd_txm16x16(x_sb->src_diff + 1280,
                          x_sb->coeff + 1280, 32);
 }
 
@@ -361,6 +361,13 @@ static const int plane_rd_mult[4] = {
   }\
 }
 
+// Placeholder: returns vp9_get_coef_context() for `token` starting from a
+// zeroed recent_energy; may ultimately need to scan the previous tokens.
+static int trellis_get_coeff_context(int token) {
+  int recent_energy = 0;  // reset on every call, so no history is carried
+  return vp9_get_coef_context(&recent_energy, token);
+}
+
 static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type,
                        ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
                        int tx_size) {
@@ -380,9 +387,6 @@ static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type,
   int err_mult = plane_rd_mult[type];
   int default_eob;
   int const *scan, *bands;
-#if CONFIG_NEWCOEFCONTEXT
-  const int *neighbors;
-#endif
 
   switch (tx_size) {
     default:
@@ -424,9 +428,6 @@ static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type,
       default_eob = 256;
       break;
   }
-#if CONFIG_NEWCOEFCONTEXT
-  neighbors = vp9_get_coef_neighbors_handle(scan);
-#endif
 
   /* Now set up a Viterbi trellis to evaluate alternative roundings. */
   rdmult = mb->rdmult * err_mult;
@@ -459,12 +460,7 @@ static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type,
       /* Consider both possible successor states. */
       if (next < default_eob) {
         band = bands[i + 1];
-        pt = vp9_prev_token_class[t0];
-#if CONFIG_NEWCOEFCONTEXT
-        if (NEWCOEFCONTEXT_BAND_COND(band))
-          pt = vp9_get_coef_neighbor_context(
-              qcoeff_ptr, i0, neighbors, scan[i + 1]);
-#endif
+        pt = trellis_get_coeff_context(t0);
         rate0 +=
           mb->token_costs[tx_size][type][band][pt][tokens[next][0].token];
         rate1 +=
@@ -512,34 +508,12 @@ static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type,
       if (next < default_eob) {
         band = bands[i + 1];
         if (t0 != DCT_EOB_TOKEN) {
-#if CONFIG_NEWCOEFCONTEXT
-          int tmp = qcoeff_ptr[scan[i]];
-          qcoeff_ptr[scan[i]] = x;
-          if (NEWCOEFCONTEXT_BAND_COND(band))
-            pt = vp9_get_coef_neighbor_context(
-                qcoeff_ptr, i0, neighbors, scan[i + 1]);
-          else
-            pt = vp9_prev_token_class[t0];
-          qcoeff_ptr[scan[i]] = tmp;
-#else
-          pt = vp9_prev_token_class[t0];
-#endif
+          pt = trellis_get_coeff_context(t0);
           rate0 += mb->token_costs[tx_size][type][band][pt][
               tokens[next][0].token];
         }
         if (t1 != DCT_EOB_TOKEN) {
-#if CONFIG_NEWCOEFCONTEXT
-          int tmp = qcoeff_ptr[scan[i]];
-          qcoeff_ptr[scan[i]] = x;
-          if (NEWCOEFCONTEXT_BAND_COND(band))
-            pt = vp9_get_coef_neighbor_context(
-                qcoeff_ptr, i0, neighbors, scan[i + 1]);
-          else
-            pt = vp9_prev_token_class[t1];
-          qcoeff_ptr[scan[i]] = tmp;
-#else
-          pt = vp9_prev_token_class[t1];
-#endif
+          pt = trellis_get_coeff_context(t1);
           rate1 += mb->token_costs[tx_size][type][band][pt][
               tokens[next][1].token];
         }
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 4694a92c6..3791737d2 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -1546,7 +1546,7 @@ int vp9_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
   int in_what_stride = d->pre_stride;
   int mv_stride = d->pre_stride;
   uint8_t *bestaddress;
-  int_mv *best_mv = &d->bmi.as_mv.first;
+  int_mv *best_mv = &d->bmi.as_mv[0];
   int_mv this_mv;
   int bestsad = INT_MAX;
   int r, c;
@@ -1641,7 +1641,7 @@ int vp9_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
   int in_what_stride = d->pre_stride;
   int mv_stride = d->pre_stride;
   uint8_t *bestaddress;
-  int_mv *best_mv = &d->bmi.as_mv.first;
+  int_mv *best_mv = &d->bmi.as_mv[0];
   int_mv this_mv;
   unsigned int bestsad = INT_MAX;
   int r, c;
@@ -1770,7 +1770,7 @@ int vp9_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
   int in_what_stride = d->pre_stride;
   int mv_stride = d->pre_stride;
   uint8_t *bestaddress;
-  int_mv *best_mv = &d->bmi.as_mv.first;
+  int_mv *best_mv = &d->bmi.as_mv[0];
   int_mv this_mv;
   unsigned int bestsad = INT_MAX;
   int r, c;
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index ad5fe7819..3e5940f55 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -23,6 +23,7 @@
 #include "vp9/common/vp9_extend.h"
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_tile_common.h"
 #include "vp9/encoder/vp9_segmentation.h"
 #include "./vp9_rtcd.h"
 #include "./vpx_scale_rtcd.h"
@@ -752,10 +753,10 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   sf->quarter_pixel_search = 1;
   sf->half_pixel_search = 1;
   sf->iterative_sub_pixel = 1;
-#if CONFIG_LOSSLESS
-  sf->optimize_coefficients = 0;
-#else
   sf->optimize_coefficients = 1;
+#if CONFIG_LOSSLESS
+  if (cpi->oxcf.lossless)
+    sf->optimize_coefficients = 0;
 #endif
   sf->no_skip_block4x4_search = 1;
   sf->first_step = 0;
@@ -840,20 +841,18 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
     }
   }
 
-  cpi->mb.vp9_short_fdct16x16 = vp9_short_fdct16x16;
-  cpi->mb.vp9_short_fdct8x8 = vp9_short_fdct8x8;
-  cpi->mb.vp9_short_fdct8x4 = vp9_short_fdct8x4;
-  cpi->mb.vp9_short_fdct4x4 = vp9_short_fdct4x4;
-  cpi->mb.short_walsh4x4 = vp9_short_walsh4x4;
-  cpi->mb.short_fhaar2x2 = vp9_short_fhaar2x2;
+  cpi->mb.fwd_txm16x16  = vp9_short_fdct16x16;
+  cpi->mb.fwd_txm8x8    = vp9_short_fdct8x8;
+  cpi->mb.fwd_txm8x4    = vp9_short_fdct8x4;
+  cpi->mb.fwd_txm4x4    = vp9_short_fdct4x4;
+  cpi->mb.fwd_2ndtxm4x4 = vp9_short_walsh4x4;
+  cpi->mb.fwd_2ndtxm2x2 = vp9_short_fhaar2x2;
 
 #if CONFIG_LOSSLESS
   if (cpi->oxcf.lossless) {
-    cpi->mb.vp9_short_fdct8x4 = vp9_short_walsh8x4_x8;
-    cpi->mb.vp9_short_fdct4x4 = vp9_short_walsh4x4_x8;
-    cpi->mb.short_walsh4x4 = vp9_short_walsh4x4;
-    cpi->mb.short_fhaar2x2 = vp9_short_fhaar2x2;
-    cpi->mb.short_walsh4x4 = vp9_short_walsh4x4_lossless;
+    cpi->mb.fwd_txm8x4    = vp9_short_walsh8x4_x8;
+    cpi->mb.fwd_txm4x4    = vp9_short_walsh4x4_x8;
+    cpi->mb.fwd_2ndtxm4x4 = vp9_short_walsh4x4_lossless;
   }
 #endif
 
@@ -949,7 +948,6 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) {
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate scaled source buffer");
 
-
   vpx_free(cpi->tok);
 
   {
@@ -1107,6 +1105,22 @@ rescale(int val, int num, int denom) {
   return (int)(llval * llnum / llden);
 }
 
+static void set_tile_limits(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  int min_log2_tiles, max_log2_tiles;
+  // Derive cm's tile counts from the encoder config (values are log2).
+  cm->log2_tile_columns = cpi->oxcf.tile_columns;
+  cm->log2_tile_rows = cpi->oxcf.tile_rows;
+  // Clamp the column count to the bounds from vp9_get_tile_n_bits().
+  vp9_get_tile_n_bits(cm, &min_log2_tiles, &max_log2_tiles);
+  max_log2_tiles += min_log2_tiles;  // upper bound used is min + returned max
+  if (cm->log2_tile_columns < min_log2_tiles)
+    cm->log2_tile_columns = min_log2_tiles;
+  else if (cm->log2_tile_columns > max_log2_tiles)
+    cm->log2_tile_columns = max_log2_tiles;
+  cm->tile_columns = 1 << cm->log2_tile_columns;
+  cm->tile_rows = 1 << cm->log2_tile_rows;  // rows are not clamped here
+}
 
 static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
   VP9_COMP *cpi = (VP9_COMP *)(ptr);
@@ -1145,7 +1159,7 @@ static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
   cpi->gld_fb_idx = 1;
   cpi->alt_fb_idx = 2;
 
-  cm->tile_columns = 1 << cpi->oxcf.tile_columns;
+  set_tile_limits(cpi);
 
 #if VP9_TEMPORAL_ALT_REF
   {
@@ -1206,18 +1220,18 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
   cpi->oxcf.best_allowed_q = q_trans[oxcf->best_allowed_q];
   cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level];
 
-  cpi->mb.e_mbd.inv_xform4x4_1_x8     = vp9_short_idct4x4llm_1;
-  cpi->mb.e_mbd.inv_xform4x4_x8       = vp9_short_idct4x4llm;
-  cpi->mb.e_mbd.inv_walsh4x4_1        = vp9_short_inv_walsh4x4_1;
-  cpi->mb.e_mbd.inv_walsh4x4_lossless = vp9_short_inv_walsh4x4;
+  cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_idct4x4llm_1;
+  cpi->mb.e_mbd.inv_txm4x4      = vp9_short_idct4x4llm;
+  cpi->mb.e_mbd.inv_2ndtxm4x4_1 = vp9_short_inv_walsh4x4_1;
+  cpi->mb.e_mbd.inv_2ndtxm4x4   = vp9_short_inv_walsh4x4;
 
 #if CONFIG_LOSSLESS
   cpi->oxcf.lossless = oxcf->lossless;
   if (cpi->oxcf.lossless) {
-    cpi->mb.e_mbd.inv_xform4x4_1_x8     = vp9_short_inv_walsh4x4_1_x8;
-    cpi->mb.e_mbd.inv_xform4x4_x8       = vp9_short_inv_walsh4x4_x8;
-    cpi->mb.e_mbd.inv_walsh4x4_1        = vp9_short_inv_walsh4x4_1_lossless;
-    cpi->mb.e_mbd.inv_walsh4x4_lossless = vp9_short_inv_walsh4x4_lossless;
+    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_inv_walsh4x4_1_x8;
+    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_inv_walsh4x4_x8;
+    cpi->mb.e_mbd.inv_2ndtxm4x4_1 = vp9_short_inv_walsh4x4_1_lossless;
+    cpi->mb.e_mbd.inv_2ndtxm4x4   = vp9_short_inv_walsh4x4_lossless;
   }
 #endif
 
@@ -1372,7 +1386,7 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
   cpi->last_frame_distortion = 0;
 #endif
 
-  cm->tile_columns = 1 << cpi->oxcf.tile_columns;
+  set_tile_limits(cpi);
 }
 
 #define M_LOG2_E 0.693147180559945309417
@@ -2619,10 +2633,10 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   // For 2 Pass Only used where GF/ARF prediction quality
   // is above a threshold
   cpi->zbin_mode_boost = 0;
-#if CONFIG_LOSSLESS
-  cpi->zbin_mode_boost_enabled = FALSE;
-#else
   cpi->zbin_mode_boost_enabled = TRUE;
+#if CONFIG_LOSSLESS
+  if (cpi->oxcf.lossless)
+    cpi->zbin_mode_boost_enabled = FALSE;
 #endif
   if (cpi->gfu_boost <= 400) {
     cpi->zbin_mode_boost_enabled = FALSE;
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 7acaef472..1476de4da 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -347,7 +347,7 @@ typedef struct VP9_COMP {
   YV12_BUFFER_CONFIG last_frame_uf;
 
   TOKENEXTRA *tok;
-  unsigned int tok_count;
+  unsigned int tok_count[1 << 6];
 
 
   unsigned int frames_since_key;
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index b5dbef0b3..e66db7499 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -228,43 +228,71 @@ void vp9_regular_quantize_b_2x2(BLOCK *b, BLOCKD *d) {
 }
 
 void vp9_regular_quantize_b_8x8(BLOCK *b, BLOCKD *d) {
-  int i, rc, eob;
-  int zbin;
-  int x, y, z, sz;
-  int zero_run = 0;
-  int16_t *zbin_boost_ptr = b->zrun_zbin_boost;
-  int16_t *coeff_ptr  = b->coeff;
-  int16_t *zbin_ptr   = b->zbin;
-  int16_t *round_ptr  = b->round;
-  int16_t *quant_ptr  = b->quant;
-  uint8_t *quant_shift_ptr = b->quant_shift;
   int16_t *qcoeff_ptr = d->qcoeff;
   int16_t *dqcoeff_ptr = d->dqcoeff;
-  int16_t *dequant_ptr = d->dequant;
-  int zbin_oq_value = b->zbin_extra;
 
   vpx_memset(qcoeff_ptr, 0, 64 * sizeof(int16_t));
   vpx_memset(dqcoeff_ptr, 0, 64 * sizeof(int16_t));
 
-  eob = -1;
-
   if (!b->skip_block) {
-    for (i = 0; i < 64; i++) {
+    int i, rc, eob;
+    int zbin;
+    int x, y, z, sz;
+    int zero_run;
+    int16_t *zbin_boost_ptr = b->zrun_zbin_boost;
+    int16_t *coeff_ptr  = b->coeff;
+    int16_t *zbin_ptr   = b->zbin;
+    int16_t *round_ptr  = b->round;
+    int16_t *quant_ptr  = b->quant;
+    uint8_t *quant_shift_ptr = b->quant_shift;
+    int16_t *dequant_ptr = d->dequant;
+    int zbin_oq_value = b->zbin_extra;
+
+    eob = -1;
+
+    // Special case for DC as it is the one triggering access in various
+    // tables: {zbin, quant, quant_shift, dequant}_ptr[rc != 0]
+    {
+      z    = coeff_ptr[0];
+      zbin = (zbin_ptr[0] + zbin_boost_ptr[0] + zbin_oq_value);
+      zero_run = 1;
+
+      sz = (z >> 31);                                // sign of z
+      x  = (z ^ sz) - sz;                            // x = abs(z)
+
+      if (x >= zbin) {
+        x += (round_ptr[0]);
+        y  = ((int)(((int)(x * quant_ptr[0]) >> 16) + x))
+             >> quant_shift_ptr[0];                  // quantize (x)
+        x  = (y ^ sz) - sz;                          // get the sign back
+        qcoeff_ptr[0]  = x;                          // write to destination
+        dqcoeff_ptr[0] = x * dequant_ptr[0];         // dequantized value
+
+        if (y) {
+          eob = 0;                                   // last nonzero coeffs
+          zero_run = 0;
+        }
+      }
+    }
+    for (i = 1; i < 64; i++) {
       rc   = vp9_default_zig_zag1d_8x8[i];
       z    = coeff_ptr[rc];
-      zbin = (zbin_ptr[rc != 0] + zbin_boost_ptr[zero_run] + zbin_oq_value);
-      zero_run += (zero_run < 15);
+      zbin = (zbin_ptr[1] + zbin_boost_ptr[zero_run] + zbin_oq_value);
+      // The original code incremented zero_run while capping it at 15 by
+      // adding "(zero_run < 15)". The same is achieved by subtracting the
+      // sign mask of "(zero_run - 15)", which is -1 while zero_run < 15.
+      zero_run -= (zero_run - 15) >> 31;
 
       sz = (z >> 31);                                // sign of z
       x  = (z ^ sz) - sz;                            // x = abs(z)
 
       if (x >= zbin) {
         x += (round_ptr[rc != 0]);
-        y  = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x))
-             >> quant_shift_ptr[rc != 0];            // quantize (x)
+        y  = ((int)(((int)(x * quant_ptr[1]) >> 16) + x))
+             >> quant_shift_ptr[1];                  // quantize (x)
         x  = (y ^ sz) - sz;                          // get the sign back
         qcoeff_ptr[rc]  = x;                         // write to destination
-        dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0];  // dequantized value
+        dqcoeff_ptr[rc] = x * dequant_ptr[1];        // dequantized value
 
         if (y) {
           eob = i;                                   // last nonzero coeffs
@@ -272,8 +300,10 @@ void vp9_regular_quantize_b_8x8(BLOCK *b, BLOCKD *d) {
         }
       }
     }
+    d->eob = eob + 1;
+  } else {
+    d->eob = 0;
   }
-  d->eob = eob + 1;
 }
 
 void vp9_quantize_mby_8x8(MACROBLOCK *x) {
@@ -460,18 +490,14 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
   static const int zbin_boost[16] = { 0,  0,  0,  8,  8,  8, 10, 12,
                                      14, 16, 20, 24, 28, 32, 36, 40 };
 
-
-  int qrounding_factor = 48;
-
   for (Q = 0; Q < QINDEX_RANGE; Q++) {
     int qzbin_factor = (vp9_dc_quant(Q, 0) < 148) ? 84 : 80;
 
+    int qrounding_factor = 48;
 #if CONFIG_LOSSLESS
-    if (cpi->oxcf.lossless) {
-      if (Q == 0) {
-        qzbin_factor = 64;
-        qrounding_factor = 64;
-      }
+    if (cpi->oxcf.lossless && Q == 0) {
+      qzbin_factor = 64;
+      qrounding_factor = 64;
     }
 #endif
 
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 29893b819..8385a1872 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -419,11 +419,6 @@ int vp9_uvsse(MACROBLOCK *x) {
 
 }
 
-#if CONFIG_NEWCOEFCONTEXT
-#define PT pn
-#else
-#define PT pt
-#endif
 static INLINE int cost_coeffs(MACROBLOCK *mb,
                               BLOCKD *b, PLANE_TYPE type,
                               ENTROPY_CONTEXT *a,
@@ -443,11 +438,6 @@ static INLINE int cost_coeffs(MACROBLOCK *mb,
   unsigned int (*token_costs)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
       (tx_type == DCT_DCT) ? mb->token_costs[tx_size][type] :
                              mb->hybrid_token_costs[tx_size][type];
-#if CONFIG_NEWCOEFCONTEXT
-  const int *neighbors;
-  int pn;
-#endif
-
   ENTROPY_CONTEXT a_ec = *a, l_ec = *l;
 
   switch (tx_size) {
@@ -495,50 +485,34 @@ static INLINE int cost_coeffs(MACROBLOCK *mb,
   }
 
   VP9_COMBINEENTROPYCONTEXTS(pt, a_ec, l_ec);
-#if CONFIG_NEWCOEFCONTEXT
-  neighbors = vp9_get_coef_neighbors_handle(scan);
-  pn = pt;
-#endif
 
   if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))
     seg_eob = 0;
 
   if (tx_type != DCT_DCT) {
+    int recent_energy = 0;
     for (; c < eob; c++) {
       int v = qcoeff_ptr[scan[c]];
       int t = vp9_dct_value_tokens_ptr[v].Token;
-      cost += token_costs[band[c]][PT][t];
+      cost += token_costs[band[c]][pt][t];
       cost += vp9_dct_value_cost_ptr[v];
-      pt = vp9_prev_token_class[t];
-#if CONFIG_NEWCOEFCONTEXT
-      if (c < seg_eob - 1 && NEWCOEFCONTEXT_BAND_COND(band[c + 1]))
-        pn = vp9_get_coef_neighbor_context(
-            qcoeff_ptr, (type == PLANE_TYPE_Y_NO_DC), neighbors, scan[c + 1]);
-      else
-        pn = pt;
-#endif
+      pt = vp9_get_coef_context(&recent_energy, t);
     }
     if (c < seg_eob)
       cost += mb->hybrid_token_costs[tx_size][type][band[c]]
-          [PT][DCT_EOB_TOKEN];
+          [pt][DCT_EOB_TOKEN];
   } else {
+    int recent_energy = 0;
     for (; c < eob; c++) {
       int v = qcoeff_ptr[scan[c]];
       int t = vp9_dct_value_tokens_ptr[v].Token;
       cost += token_costs[band[c]][pt][t];
       cost += vp9_dct_value_cost_ptr[v];
-      pt = vp9_prev_token_class[t];
-#if CONFIG_NEWCOEFCONTEXT
-      if (c < seg_eob - 1 && NEWCOEFCONTEXT_BAND_COND(band[c + 1]))
-        pn = vp9_get_coef_neighbor_context(
-            qcoeff_ptr, (type == PLANE_TYPE_Y_NO_DC), neighbors, scan[c + 1]);
-      else
-        pn = pt;
-#endif
+      pt = vp9_get_coef_context(&recent_energy, t);
     }
     if (c < seg_eob)
       cost += mb->token_costs[tx_size][type][band[c]]
-          [PT][DCT_EOB_TOKEN];
+          [pt][DCT_EOB_TOKEN];
   }
 
   // is eob first coefficient;
@@ -698,7 +672,8 @@ static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion,
   // TODO(jingning) is it possible to quickly determine whether to force
   //                trailing coefficients to be zero, instead of running trellis
   //                optimization in the rate-distortion optimization loop?
-  if (mb->e_mbd.mode_info_context->mbmi.mode < I8X8_PRED)
+  if (mb->optimize &&
+      xd->mode_info_context->mbmi.mode < I8X8_PRED)
     vp9_optimize_mby_16x16(mb);
 
   d = vp9_mbblock_error(mb, 0);
@@ -859,21 +834,18 @@ static void super_block_yrd_32x32(MACROBLOCK *x,
   SUPERBLOCK  * const x_sb = &x->sb_coeff_data;
   MACROBLOCKD * const xd = &x->e_mbd;
   SUPERBLOCKD * const xd_sb = &xd->sb_coeff_data;
-#if DEBUG_ERROR || CONFIG_DWTDCTHYBRID
+#if DEBUG_ERROR
   int16_t out[1024];
 #endif
 
   vp9_transform_sby_32x32(x);
   vp9_quantize_sby_32x32(x);
-#if DEBUG_ERROR || CONFIG_DWTDCTHYBRID
+#if DEBUG_ERROR
   vp9_short_idct32x32(xd_sb->dqcoeff, out, 64);
 #endif
 
-#if !CONFIG_DWTDCTHYBRID
   *distortion = vp9_sb_block_error_c(x_sb->coeff, xd_sb->dqcoeff, 1024);
-#else
-  *distortion = vp9_block_error_c(x_sb->src_diff, out, 1024) << 4;
-#endif
+
 #if DEBUG_ERROR
   printf("IDCT/FDCT error 32x32: %d (d: %d)\n",
          vp9_block_error_c(x_sb->src_diff, out, 1024), *distortion);
@@ -1140,7 +1112,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,
       vp9_fht(be->src_diff, 32, be->coeff, tx_type, 4);
       vp9_ht_quantize_b_4x4(be, b, tx_type);
     } else {
-      x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
+      x->fwd_txm4x4(be->src_diff, be->coeff, 32);
       x->quantize_b_4x4(be, b);
     }
 
@@ -1176,7 +1148,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,
     vp9_ihtllm(best_dqcoeff, b->diff, 32, best_tx_type, 4, b->eob);
 #endif
   else
-    xd->inv_xform4x4_x8(best_dqcoeff, b->diff, 32);
+    xd->inv_txm4x4(best_dqcoeff, b->diff, 32);
 
   vp9_recon_b(best_predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
 
@@ -1440,7 +1412,7 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
       if (tx_type != DCT_DCT)
         vp9_fht(be->src_diff, 32, (x->block + idx)->coeff, tx_type, 8);
       else
-        x->vp9_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
+        x->fwd_txm8x8(be->src_diff, (x->block + idx)->coeff, 32);
       x->quantize_b_8x8(x->block + idx, xd->block + idx);
 
       // compute quantization mse of 8x8 block
@@ -1474,11 +1446,11 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
           vp9_fht_c(be->src_diff, 32, be->coeff, tx_type, 4);
           vp9_ht_quantize_b_4x4(be, b, tx_type);
         } else if (!(i & 1) && get_tx_type_4x4(xd, b + 1) == DCT_DCT) {
-          x->vp9_short_fdct8x4(be->src_diff, be->coeff, 32);
+          x->fwd_txm8x4(be->src_diff, be->coeff, 32);
           x->quantize_b_4x4_pair(be, be + 1, b, b + 1);
           do_two = 1;
         } else {
-          x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
+          x->fwd_txm4x4(be->src_diff, be->coeff, 32);
           x->quantize_b_4x4(be, b);
         }
         distortion += vp9_block_error_c(be->coeff, b->dqcoeff, 16 << do_two);
@@ -2166,17 +2138,17 @@ static int labels2mode(
           }
           break;
         case LEFT4X4:
-          this_mv->as_int = col ? d[-1].bmi.as_mv.first.as_int :
+          this_mv->as_int = col ? d[-1].bmi.as_mv[0].as_int :
                                   left_block_mv(xd, mic, i);
           if (mbmi->second_ref_frame > 0)
-            this_second_mv->as_int = col ? d[-1].bmi.as_mv.second.as_int :
+            this_second_mv->as_int = col ? d[-1].bmi.as_mv[1].as_int :
                                            left_block_second_mv(xd, mic, i);
           break;
         case ABOVE4X4:
-          this_mv->as_int = row ? d[-4].bmi.as_mv.first.as_int :
+          this_mv->as_int = row ? d[-4].bmi.as_mv[0].as_int :
                                   above_block_mv(mic, i, mis);
           if (mbmi->second_ref_frame > 0)
-            this_second_mv->as_int = row ? d[-4].bmi.as_mv.second.as_int :
+            this_second_mv->as_int = row ? d[-4].bmi.as_mv[1].as_int :
                                            above_block_second_mv(mic, i, mis);
           break;
         case ZERO4X4:
@@ -2192,10 +2164,10 @@ static int labels2mode(
         int_mv left_mv, left_second_mv;
 
         left_second_mv.as_int = 0;
-        left_mv.as_int = col ? d[-1].bmi.as_mv.first.as_int :
+        left_mv.as_int = col ? d[-1].bmi.as_mv[0].as_int :
                          left_block_mv(xd, mic, i);
         if (mbmi->second_ref_frame > 0)
-          left_second_mv.as_int = col ? d[-1].bmi.as_mv.second.as_int :
+          left_second_mv.as_int = col ? d[-1].bmi.as_mv[1].as_int :
                                   left_block_second_mv(xd, mic, i);
 
         if (left_mv.as_int == this_mv->as_int &&
@@ -2212,9 +2184,9 @@ static int labels2mode(
 #endif
     }
 
-    d->bmi.as_mv.first.as_int = this_mv->as_int;
+    d->bmi.as_mv[0].as_int = this_mv->as_int;
     if (mbmi->second_ref_frame > 0)
-      d->bmi.as_mv.second.as_int = this_second_mv->as_int;
+      d->bmi.as_mv[1].as_int = this_second_mv->as_int;
 
     x->partition_info->bmi[i].mode = m;
     x->partition_info->bmi[i].mv.as_int = this_mv->as_int;
@@ -2248,7 +2220,7 @@ static int64_t encode_inter_mb_segment(MACROBLOCK *x,
       if (xd->mode_info_context->mbmi.second_ref_frame > 0)
         vp9_build_2nd_inter_predictors_b(bd, 16, &xd->subpix);
       vp9_subtract_b(be, bd, 16);
-      x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
+      x->fwd_txm4x4(be->src_diff, be->coeff, 32);
       x->quantize_b_4x4(be, bd);
       thisdistortion = vp9_block_error(be->coeff, bd->dqcoeff, 16);
       *distortion += thisdistortion;
@@ -2300,7 +2272,7 @@ static int64_t encode_inter_mb_segment_8x8(MACROBLOCK *x,
 
       if (xd->mode_info_context->mbmi.txfm_size == TX_4X4) {
         if (otherrd) {
-          x->vp9_short_fdct8x8(be->src_diff, be2->coeff, 32);
+          x->fwd_txm8x8(be->src_diff, be2->coeff, 32);
           x->quantize_b_8x8(be2, bd2);
           thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64);
           otherdist += thisdistortion;
@@ -2312,7 +2284,7 @@ static int64_t encode_inter_mb_segment_8x8(MACROBLOCK *x,
         for (j = 0; j < 4; j += 2) {
           bd = &xd->block[ib + iblock[j]];
           be = &x->block[ib + iblock[j]];
-          x->vp9_short_fdct8x4(be->src_diff, be->coeff, 32);
+          x->fwd_txm8x4(be->src_diff, be->coeff, 32);
           x->quantize_b_4x4_pair(be, be + 1, bd, bd + 1);
           thisdistortion = vp9_block_error_c(be->coeff, bd->dqcoeff, 32);
           *distortion += thisdistortion;
@@ -2330,7 +2302,7 @@ static int64_t encode_inter_mb_segment_8x8(MACROBLOCK *x,
           for (j = 0; j < 4; j += 2) {
             BLOCKD *bd = &xd->block[ib + iblock[j]];
             BLOCK *be = &x->block[ib + iblock[j]];
-            x->vp9_short_fdct8x4(be->src_diff, be->coeff, 32);
+            x->fwd_txm8x4(be->src_diff, be->coeff, 32);
             x->quantize_b_4x4_pair(be, be + 1, bd, bd + 1);
             thisdistortion = vp9_block_error_c(be->coeff, bd->dqcoeff, 32);
             otherdist += thisdistortion;
@@ -2344,7 +2316,7 @@ static int64_t encode_inter_mb_segment_8x8(MACROBLOCK *x,
                            TX_4X4);
           }
         }
-        x->vp9_short_fdct8x8(be->src_diff, be2->coeff, 32);
+        x->fwd_txm8x8(be->src_diff, be2->coeff, 32);
         x->quantize_b_8x8(be2, bd2);
         thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64);
         *distortion += thisdistortion;
@@ -2500,9 +2472,9 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
 
           // use previous block's result as next block's MV predictor.
           if (segmentation == PARTITIONING_4X4 && i > 0) {
-            bsi->mvp.as_int = x->e_mbd.block[i - 1].bmi.as_mv.first.as_int;
+            bsi->mvp.as_int = x->e_mbd.block[i - 1].bmi.as_mv[0].as_int;
             if (i == 4 || i == 8 || i == 12)
-              bsi->mvp.as_int = x->e_mbd.block[i - 4].bmi.as_mv.first.as_int;
+              bsi->mvp.as_int = x->e_mbd.block[i - 4].bmi.as_mv[0].as_int;
             step_param = 2;
           }
         }
@@ -2541,11 +2513,11 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
 
             if (thissme < bestsme) {
               bestsme = thissme;
-              mode_mv[NEW4X4].as_int = e->bmi.as_mv.first.as_int;
+              mode_mv[NEW4X4].as_int = e->bmi.as_mv[0].as_int;
             } else {
               /* The full search result is actually worse so re-instate the
                * previous best vector */
-              e->bmi.as_mv.first.as_int = mode_mv[NEW4X4].as_int;
+              e->bmi.as_mv[0].as_int = mode_mv[NEW4X4].as_int;
             }
           }
         }
@@ -2885,9 +2857,9 @@ static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
   for (i = 0; i < 16; i++) {
     BLOCKD *bd = &x->e_mbd.block[i];
 
-    bd->bmi.as_mv.first.as_int = bsi.mvs[i].as_int;
+    bd->bmi.as_mv[0].as_int = bsi.mvs[i].as_int;
     if (mbmi->second_ref_frame > 0)
-      bd->bmi.as_mv.second.as_int = bsi.second_mvs[i].as_int;
+      bd->bmi.as_mv[1].as_int = bsi.second_mvs[i].as_int;
     bd->eob = bsi.eobs[i];
   }
 
@@ -3307,8 +3279,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                        x->nmvjointcost, x->mvcost,
                                        &dis, &sse);
         }
-        d->bmi.as_mv.first.as_int = tmp_mv.as_int;
-        frame_mv[NEWMV][refs[0]].as_int = d->bmi.as_mv.first.as_int;
+        d->bmi.as_mv[0].as_int = tmp_mv.as_int;
+        frame_mv[NEWMV][refs[0]].as_int = d->bmi.as_mv[0].as_int;
 
         // Add the new motion vector cost to our rolling cost variable
         *rate2 += vp9_mv_bit_cost(&tmp_mv, &ref_mv[0],
@@ -4251,10 +4223,12 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
   if (best_mbmode.mode == SPLITMV) {
     for (i = 0; i < 16; i++)
-      xd->mode_info_context->bmi[i].as_mv.first.as_int = best_bmodes[i].as_mv.first.as_int;
+      xd->mode_info_context->bmi[i].as_mv[0].as_int =
+          best_bmodes[i].as_mv[0].as_int;
     if (mbmi->second_ref_frame > 0)
       for (i = 0; i < 16; i++)
-        xd->mode_info_context->bmi[i].as_mv.second.as_int = best_bmodes[i].as_mv.second.as_int;
+        xd->mode_info_context->bmi[i].as_mv[1].as_int =
+            best_bmodes[i].as_mv[1].as_int;
 
     vpx_memcpy(x->partition_info, &best_partition, sizeof(PARTITION_INFO));
 
diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c
index 17d8f25bd..b125a486e 100644
--- a/vp9/encoder/vp9_segmentation.c
+++ b/vp9/encoder/vp9_segmentation.c
@@ -13,6 +13,7 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vp9/encoder/vp9_segmentation.h"
 #include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_tile_common.h"
 
 void vp9_update_gf_useage_maps(VP9_COMP *cpi, VP9_COMMON *cm, MACROBLOCK *x) {
   int mb_row, mb_col;
@@ -254,7 +255,7 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
   int t_pred_cost = INT_MAX;
 
   int i;
-  int tile, mb_row, mb_col, mb_start = 0;
+  int tile_col, mb_row, mb_col;
 
   int temporal_predictor_count[PREDICTION_PROBS][2];
   int no_pred_segcounts[MAX_MB_SEGMENTS];
@@ -282,21 +283,13 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
   // First of all generate stats regarding how well the last segment map
   // predicts this one
 
-  for (tile = 0; tile < cm->tile_columns; tile++) {
-    // calculate end of tile column
-    const int sb_cols = (cm->mb_cols + 3) >> 2;
-    const int sb_end = (sb_cols * (tile + 1)) >> cpi->oxcf.tile_columns;
-    const int mb_end = ((sb_end << 2) > cm->mb_cols) ?
-                        cm->mb_cols : (sb_end << 2);
-
-    cm->cur_tile_idx = tile;
-    cm->cur_tile_mb_col_start = mb_start;
-    cm->cur_tile_mb_col_end = mb_end;
-
-    mi_ptr = cm->mi + mb_start;
+  for (tile_col = 0; tile_col < cm->tile_columns; tile_col++) {
+    vp9_get_tile_col_offsets(cm, tile_col);
+    mi_ptr = cm->mi + cm->cur_tile_mb_col_start;
     for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 4, mi_ptr += 4 * mis) {
       mi = mi_ptr;
-      for (mb_col = mb_start; mb_col < mb_end; mb_col += 4, mi += 4) {
+      for (mb_col = cm->cur_tile_mb_col_start;
+           mb_col < cm->cur_tile_mb_col_end; mb_col += 4, mi += 4) {
         if (mi->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
           count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count,
                      t_unpred_seg_counts, 4, mb_row, mb_col);
@@ -338,8 +331,6 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
         }
       }
     }
-
-    mb_start = mb_end;
   }
 
   // Work out probability tree for coding segments without prediction
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 7bca01e05..164709009 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -171,7 +171,7 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
   /*cpi->sf.search_method == HEX*/
   // TODO Check that the 16x16 vf & sdf are selected here
   // Ignore mv costing by sending NULL pointer instead of cost arrays
-  bestsme = vp9_hex_search(x, b, d, &best_ref_mv1_full, &d->bmi.as_mv.first,
+  bestsme = vp9_hex_search(x, b, d, &best_ref_mv1_full, &d->bmi.as_mv[0],
                            step_param, sadpb, &cpi->fn_ptr[BLOCK_16X16],
                            NULL, NULL, NULL, NULL,
                            &best_ref_mv1);
@@ -183,7 +183,7 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
     int distortion;
     unsigned int sse;
     // Ignore mv costing by sending NULL pointer instead of cost array
-    bestsme = cpi->find_fractional_mv_step(x, b, d, &d->bmi.as_mv.first,
+    bestsme = cpi->find_fractional_mv_step(x, b, d, &d->bmi.as_mv[0],
                                            &best_ref_mv1,
                                            x->errorperbit,
                                            &cpi->fn_ptr[BLOCK_16X16],
@@ -263,8 +263,8 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
         if (cpi->frames[frame] == NULL)
           continue;
 
-        mbd->block[0].bmi.as_mv.first.as_mv.row = 0;
-        mbd->block[0].bmi.as_mv.first.as_mv.col = 0;
+        mbd->block[0].bmi.as_mv[0].as_mv.row = 0;
+        mbd->block[0].bmi.as_mv[0].as_mv.col = 0;
 
         if (frame == alt_ref_index) {
           filter_weight = 2;
@@ -297,8 +297,8 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
            cpi->frames[frame]->u_buffer + mb_uv_offset,
            cpi->frames[frame]->v_buffer + mb_uv_offset,
            cpi->frames[frame]->y_stride,
-           mbd->block[0].bmi.as_mv.first.as_mv.row,
-           mbd->block[0].bmi.as_mv.first.as_mv.col,
+           mbd->block[0].bmi.as_mv[0].as_mv.row,
+           mbd->block[0].bmi.as_mv[0].as_mv.col,
            predictor);
 
           // Apply the filter (YUV)
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 2dedb1a51..12fee9037 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -100,12 +100,6 @@ static void fill_value_tokens() {
   vp9_dct_value_cost_ptr   = dct_value_cost + DCT_MAX_VALUE;
 }
 
-#if CONFIG_NEWCOEFCONTEXT
-#define PT pn
-#else
-#define PT pt
-#endif
-
 static void tokenize_b(VP9_COMP *cpi,
                        MACROBLOCKD *xd,
                        const int ib,
@@ -115,6 +109,7 @@ static void tokenize_b(VP9_COMP *cpi,
                        int dry_run) {
   int pt; /* near block/prev token context index */
   int c = (type == PLANE_TYPE_Y_NO_DC) ? 1 : 0;
+  int recent_energy = 0;
   const BLOCKD * const b = xd->block + ib;
   const int eob = b->eob;     /* one beyond last nonzero coeff */
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
@@ -126,10 +121,6 @@ static void tokenize_b(VP9_COMP *cpi,
   vp9_coeff_probs *probs;
   const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
                           get_tx_type(xd, b) : DCT_DCT;
-#if CONFIG_NEWCOEFCONTEXT
-  const int *neighbors;
-  int pn;
-#endif
 
   ENTROPY_CONTEXT *const a = (ENTROPY_CONTEXT *)xd->above_context +
       vp9_block2above[tx_size][ib];
@@ -228,10 +219,6 @@ static void tokenize_b(VP9_COMP *cpi,
   }
 
   VP9_COMBINEENTROPYCONTEXTS(pt, a_ec, l_ec);
-#if CONFIG_NEWCOEFCONTEXT
-  neighbors = vp9_get_coef_neighbors_handle(scan);
-  pn = pt;
-#endif
 
   if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))
     seg_eob = 0;
@@ -252,21 +239,15 @@ static void tokenize_b(VP9_COMP *cpi,
     }
 
     t->Token = token;
-    t->context_tree = probs[type][band][PT];
+    t->context_tree = probs[type][band][pt];
     t->skip_eob_node = (pt == 0) && ((band > 0 && type != PLANE_TYPE_Y_NO_DC) ||
                                      (band > 1 && type == PLANE_TYPE_Y_NO_DC));
     assert(vp9_coef_encodings[t->Token].Len - t->skip_eob_node > 0);
     if (!dry_run) {
-      ++counts[type][band][PT][token];
+      ++counts[type][band][pt][token];
     }
-    pt = vp9_prev_token_class[token];
-#if CONFIG_NEWCOEFCONTEXT
-    if (c < seg_eob - 1 && NEWCOEFCONTEXT_BAND_COND(bands[c + 1]))
-      pn = vp9_get_coef_neighbor_context(
-          qcoeff_ptr, (type == PLANE_TYPE_Y_NO_DC), neighbors, scan[c + 1]);
-    else
-      pn = pt;
-#endif
+
+    pt = vp9_get_coef_context(&recent_energy, token);
     ++t;
   } while (c < eob && ++c < seg_eob);
 
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index d8d95a136..eb152f521 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -59,6 +59,8 @@ VP9_COMMON_SRCS-yes += common/vp9_setupintrarecon.h
 VP9_COMMON_SRCS-yes += common/vp9_swapyv12buffer.h
 VP9_COMMON_SRCS-yes += common/vp9_systemdependent.h
 VP9_COMMON_SRCS-yes += common/vp9_textblit.h
+VP9_COMMON_SRCS-yes += common/vp9_tile_common.h
+VP9_COMMON_SRCS-yes += common/vp9_tile_common.c
 VP9_COMMON_SRCS-yes += common/vp9_treecoder.h
 VP9_COMMON_SRCS-yes += common/vp9_invtrans.c
 VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 0b8677285..81f02ee6b 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -27,6 +27,7 @@ struct vp8_extracfg {
   unsigned int                Sharpness;
   unsigned int                static_thresh;
   unsigned int                tile_columns;
+  unsigned int                tile_rows;
   unsigned int                arnr_max_frames;    /* alt_ref Noise Reduction Max Frame Count */
   unsigned int                arnr_strength;    /* alt_ref Noise Reduction Strength */
   unsigned int                arnr_type;        /* alt_ref filter type */
@@ -54,7 +55,8 @@ static const struct extraconfig_map extracfg_map[] = {
       0,                          /* noise_sensitivity */
       0,                          /* Sharpness */
       0,                          /* static_thresh */
-      VP8_ONE_TILE_COLUMN,        /* tile_columns */
+      0,                          /* tile_columns */
+      0,                          /* tile_rows */
       0,                          /* arnr_max_frames */
       3,                          /* arnr_strength */
       3,                          /* arnr_type*/
@@ -171,8 +173,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
 
   RANGE_CHECK_HI(vp8_cfg, noise_sensitivity,  6);
 
-  RANGE_CHECK(vp8_cfg, tile_columns,
-              VP8_ONE_TILE_COLUMN, VP8_FOUR_TILE_COLUMNS);
+  RANGE_CHECK(vp8_cfg, tile_columns, 0, 6);
+  RANGE_CHECK(vp8_cfg, tile_rows, 0, 2);
   RANGE_CHECK_HI(vp8_cfg, Sharpness,       7);
   RANGE_CHECK(vp8_cfg, arnr_max_frames, 0, 15);
   RANGE_CHECK_HI(vp8_cfg, arnr_strength,   6);
@@ -310,6 +312,7 @@ static vpx_codec_err_t set_vp8e_config(VP9_CONFIG *oxcf,
   oxcf->tuning = vp8_cfg.tuning;
 
   oxcf->tile_columns = vp8_cfg.tile_columns;
+  oxcf->tile_rows = vp8_cfg.tile_rows;
 
 #if CONFIG_LOSSLESS
   oxcf->lossless = vp8_cfg.lossless;
@@ -417,6 +420,7 @@ static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx,
       MAP(VP8E_SET_SHARPNESS,             xcfg.Sharpness);
       MAP(VP8E_SET_STATIC_THRESHOLD,      xcfg.static_thresh);
       MAP(VP9E_SET_TILE_COLUMNS,          xcfg.tile_columns);
+      MAP(VP9E_SET_TILE_ROWS,             xcfg.tile_rows);
 
       MAP(VP8E_SET_ARNR_MAXFRAMES,        xcfg.arnr_max_frames);
       MAP(VP8E_SET_ARNR_STRENGTH,        xcfg.arnr_strength);
@@ -1007,6 +1011,7 @@ static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] = {
   {VP8E_SET_SHARPNESS,                set_param},
   {VP8E_SET_STATIC_THRESHOLD,         set_param},
   {VP9E_SET_TILE_COLUMNS,             set_param},
+  {VP9E_SET_TILE_ROWS,                set_param},
   {VP8E_GET_LAST_QUANTIZER,           get_param},
   {VP8E_GET_LAST_QUANTIZER_64,        get_param},
   {VP8E_SET_ARNR_MAXFRAMES,           set_param},