56 files changed, 7827 insertions, 2118 deletions
diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm b/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm
index e559272cd..751bc74bc 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm
+++ b/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm
@@ -112,27 +112,27 @@
     vabd.u8     q4, q10, q9                 ; m6 = abs(q3 - q2)
 
     ; only compare the largest value to limit
-    vmax.u8     q11, q11, q12               ; m1 = max(m1, m2)
-    vmax.u8     q12, q13, q14               ; m2 = max(m3, m4)
+    vmax.u8     q11, q11, q12               ; m7 = max(m1, m2)
+    vmax.u8     q12, q13, q14               ; m8 = max(m3, m4)
 
     vabd.u8     q9, q6, q7                  ; abs(p0 - q0)
 
-    vmax.u8     q3, q3, q4                  ; m3 = max(m5, m6)
+    vmax.u8     q3, q3, q4                  ; m9 = max(m5, m6)
 
     vmov.u8     q10, #0x80
 
-    vmax.u8     q15, q11, q12               ; m1 = max(m1, m2)
+    vmax.u8     q15, q11, q12               ; m10 = max(m7, m8)
 
     vcgt.u8     q13, q13, q2                ; (abs(p1 - p0) > thresh)*-1
     vcgt.u8     q14, q14, q2                ; (abs(q1 - q0) > thresh)*-1
-    vmax.u8     q15, q15, q3                ; m1 = max(m1, m3)
+    vmax.u8     q15, q15, q3                ; m11 = max(m10, m9)
 
     vabd.u8     q2, q5, q8                  ; a = abs(p1 - q1)
     vqadd.u8    q9, q9, q9                  ; b = abs(p0 - q0) * 2
 
     veor        q7, q7, q10                 ; qs0
 
-    vcge.u8     q15, q1, q15                ; abs(m1) > limit
+    vcge.u8     q15, q1, q15                ; (limit >= m11)*-1
 
     vshr.u8     q2, q2, #1                  ; a = a / 2
     veor        q6, q6, q10                 ; ps0
@@ -142,7 +142,7 @@
 
     veor        q8, q8, q10                 ; qs1
 
-    vmov.u8     q4, #3
+    vmov.u16    q4, #3                      ; 3 per 16-bit lane for vmul.i16
 
     vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
     vsubl.s8    q11, d15, d13
@@ -150,13 +150,15 @@
     vcge.u8     q9, q0, q9                  ; a > blimit
 
     vqsub.s8    q1, q5, q8                  ; filter = clamp(ps1-qs1)
-    vorr        q14, q13, q14               ; hevmask
+    vorr        q14, q13, q14               ; hev
 
     vmul.i16    q2, q2, q4                  ; 3 * ( qs0 - ps0)
     vmul.i16    q11, q11, q4
 
     vand        q1, q1, q14                 ; filter &= hev
-    vand        q15, q15, q9                ; filter_mask
+    vand        q15, q15, q9                ; mask
+
+    vmov.u8     q4, #3                      ; 3 per byte lane again
 
     vaddw.s8    q2, q2, d2                  ; filter + 3 * (qs0 - ps0)
     vaddw.s8    q11, q11, d3
@@ -180,15 +182,14 @@
     ; outer tap adjustments
     vrshr.s8    q1, q1, #1                  ; filter = ++filter1 >> 1
 
-    veor        q6, q11, q10                ; *op0 = u^0x80
+    veor        q7, q0,  q10                ; *oq0 = u^0x80
 
     vbic        q1, q1, q14                 ; filter &= ~hev
 
     vqadd.s8    q13, q5, q1                 ; u = clamp(ps1 + filter)
     vqsub.s8    q12, q8, q1                 ; u = clamp(qs1 - filter)
 
-
-    veor        q7, q0,  q10                ; *oq0 = u^0x80
+    veor        q6, q11, q10                ; *op0 = u^0x80
     veor        q5, q13, q10                ; *op1 = u^0x80
     veor        q8, q12, q10                ; *oq1 = u^0x80
 
diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
index 2f022dc1d..b97e7aa4a 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
+++ b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
@@ -10,17 +10,6 @@
 
 #include "./vp9_rtcd.h"
 
-void vp9_loop_filter_horizontal_edge_16_neon(uint8_t *s, int p /* pitch */,
-                                             const uint8_t *blimit0,
-                                             const uint8_t *limit0,
-                                             const uint8_t *thresh0,
-                                             const uint8_t *blimit1,
-                                             const uint8_t *limit1,
-                                             const uint8_t *thresh1) {
-  vp9_loop_filter_horizontal_edge(s, p, blimit0, limit0, thresh0, 1);
-  vp9_loop_filter_horizontal_edge(s + 8, p, blimit1, limit1, thresh1, 1);
-}
-
 void vp9_mbloop_filter_horizontal_edge_16_neon(uint8_t *s, int p /* pitch */,
                                                const uint8_t *blimit0,
                                                const uint8_t *limit0,
@@ -31,3 +20,34 @@ void vp9_mbloop_filter_horizontal_edge_16_neon(uint8_t *s, int p /* pitch */,
   vp9_mbloop_filter_horizontal_edge(s, p, blimit0, limit0, thresh0, 1);
   vp9_mbloop_filter_horizontal_edge(s + 8, p, blimit1, limit1, thresh1, 1);
 }
+
+void vp9_loop_filter_vertical_edge_16_neon(uint8_t *s, int p /* pitch */,
+                                           const uint8_t *blimit0,
+                                           const uint8_t *limit0,
+                                           const uint8_t *thresh0,
+                                           const uint8_t *blimit1,
+                                           const uint8_t *limit1,
+                                           const uint8_t *thresh1) {
+  vp9_loop_filter_vertical_edge_neon(s, p, blimit0, limit0, thresh0, 1);
+  vp9_loop_filter_vertical_edge_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
+}
+
+void vp9_mbloop_filter_vertical_edge_16_neon(uint8_t *s, int p /* pitch */,
+                                             const uint8_t *blimit0,
+                                             const uint8_t *limit0,
+                                             const uint8_t *thresh0,
+                                             const uint8_t *blimit1,
+                                             const uint8_t *limit1,
+                                             const uint8_t *thresh1) {
+  vp9_mbloop_filter_vertical_edge_neon(s, p, blimit0, limit0, thresh0, 1);
+  vp9_mbloop_filter_vertical_edge_neon(s + 8 * p, p, blimit1, limit1, thresh1,
+                                       1);
+}
+
+void vp9_mb_lpf_vertical_edge_w_16_neon(uint8_t *s, int p /* pitch */,
+                                        const uint8_t *blimit,
+                                        const uint8_t *limit,
+                                        const uint8_t *thresh) {
+  vp9_mb_lpf_vertical_edge_w_neon(s, p, blimit, limit, thresh);
+  vp9_mb_lpf_vertical_edge_w_neon(s + 8 * p, p, blimit, limit, thresh);
+}
diff --git a/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c b/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
index 36cfc83c4..0c0f155ae 100644
--- a/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
@@ -306,4 +306,59 @@ void vp9_loop_filter_vertical_edge_dspr2(unsigned char *s,
     }
   }
 }
+
+void vp9_loop_filter_horizontal_edge_16_dspr2(uint8_t *s, int p /* pitch */,
+                                              const uint8_t *blimit0,
+                                              const uint8_t *limit0,
+                                              const uint8_t *thresh0,
+                                              const uint8_t *blimit1,
+                                              const uint8_t *limit1,
+                                              const uint8_t *thresh1) {
+  vp9_loop_filter_horizontal_edge_dspr2(s, p, blimit0, limit0, thresh0, 1);
+  vp9_loop_filter_horizontal_edge_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
+}
+
+void vp9_mbloop_filter_horizontal_edge_16_dspr2(uint8_t *s, int p /* pitch */,
+                                                const uint8_t *blimit0,
+                                                const uint8_t *limit0,
+                                                const uint8_t *thresh0,
+                                                const uint8_t *blimit1,
+                                                const uint8_t *limit1,
+                                                const uint8_t *thresh1) {
+  vp9_mbloop_filter_horizontal_edge_dspr2(s, p, blimit0, limit0, thresh0, 1);
+  vp9_mbloop_filter_horizontal_edge_dspr2(s + 8, p, blimit1, limit1, thresh1,
+                                          1);
+}
+
+void vp9_loop_filter_vertical_edge_16_dspr2(uint8_t *s, int p /* pitch */,
+                                            const uint8_t *blimit0,
+                                            const uint8_t *limit0,
+                                            const uint8_t *thresh0,
+                                            const uint8_t *blimit1,
+                                            const uint8_t *limit1,
+                                            const uint8_t *thresh1) {
+  vp9_loop_filter_vertical_edge_dspr2(s, p, blimit0, limit0, thresh0, 1);
+  vp9_loop_filter_vertical_edge_dspr2(s + 8 * p, p, blimit1, limit1, thresh1,
+                                      1);
+}
+
+void vp9_mbloop_filter_vertical_edge_16_dspr2(uint8_t *s, int p /* pitch */,
+                                              const uint8_t *blimit0,
+                                              const uint8_t *limit0,
+                                              const uint8_t *thresh0,
+                                              const uint8_t *blimit1,
+                                              const uint8_t *limit1,
+                                              const uint8_t *thresh1) {
+  vp9_mbloop_filter_vertical_edge_dspr2(s, p, blimit0, limit0, thresh0, 1);
+  vp9_mbloop_filter_vertical_edge_dspr2(s + 8 * p, p, blimit1, limit1, thresh1,
+                                       1);
+}
+
+void vp9_mb_lpf_vertical_edge_w_16_dspr2(uint8_t *s, int p /* pitch */,
+                                         const uint8_t *blimit,
+                                         const uint8_t *limit,
+                                         const uint8_t *thresh) {
+  vp9_mb_lpf_vertical_edge_w_dspr2(s, p, blimit, limit, thresh);
+  vp9_mb_lpf_vertical_edge_w_dspr2(s + 8 * p, p, blimit, limit, thresh);
+}
 #endif  // #if HAVE_DSPR2
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index d2981601b..f495c29f3 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -200,9 +200,6 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
 
 void vp9_create_common(VP9_COMMON *cm) {
   vp9_machine_specific_config(cm);
-
-  cm->tx_mode = ONLY_4X4;
-  cm->comp_pred_mode = HYBRID_PREDICTION;
 }
 
 void vp9_remove_common(VP9_COMMON *cm) {
@@ -211,9 +208,6 @@ void vp9_remove_common(VP9_COMMON *cm) {
 
 void vp9_initialize_common() {
   vp9_init_neighbors();
-  vp9_coef_tree_initialize();
-  vp9_entropy_mode_init();
-  vp9_entropy_mv_init();
 }
 
 void vp9_update_frame_size(VP9_COMMON *cm) {
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index df963d1cc..993ee7935 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -153,6 +153,34 @@ static INLINE int has_second_ref(const MB_MODE_INFO *mbmi) {
   return mbmi->ref_frame[1] > INTRA_FRAME;
 }
 
+// Mode left of sub-block b (0..3); DC_PRED if it needs a NULL/inter left_mi.
+static INLINE MB_PREDICTION_MODE left_block_mode(
+    const MODE_INFO *cur_mi, const MODE_INFO *left_mi, int b) {
+  if (b == 0 || b == 2) {
+    if (!left_mi || is_inter_block(&left_mi->mbmi))
+      return DC_PRED;
+
+    return left_mi->mbmi.sb_type < BLOCK_8X8 ? left_mi->bmi[b + 1].as_mode
+                                             : left_mi->mbmi.mode;
+  }
+  assert(b == 1 || b == 3);
+  return cur_mi->bmi[b - 1].as_mode;
+}
+
+// Mode above sub-block b (0..3); DC_PRED if it needs a NULL/inter above_mi.
+static INLINE MB_PREDICTION_MODE above_block_mode(
+    const MODE_INFO *cur_mi, const MODE_INFO *above_mi, int b) {
+  if (b == 0 || b == 1) {
+    if (!above_mi || is_inter_block(&above_mi->mbmi))
+      return DC_PRED;
+
+    return above_mi->mbmi.sb_type < BLOCK_8X8 ? above_mi->bmi[b + 2].as_mode
+                                              : above_mi->mbmi.mode;
+  }
+  assert(b == 2 || b == 3);
+  return cur_mi->bmi[b - 2].as_mode;
+}
+
 enum mv_precision {
   MV_PRECISION_Q3,
   MV_PRECISION_Q4
@@ -170,7 +198,6 @@ struct buf_2d {
 };
 
 struct macroblockd_plane {
-  int16_t *qcoeff;
   int16_t *dqcoeff;
   uint16_t *eobs;
   PLANE_TYPE plane_type;
@@ -359,19 +386,6 @@ static INLINE void foreach_transformed_block_uv(
     foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg);
 }
 
-static int raster_block_offset(BLOCK_SIZE plane_bsize,
-                               int raster_block, int stride) {
-  const int bw = b_width_log2(plane_bsize);
-  const int y = 4 * (raster_block >> bw);
-  const int x = 4 * (raster_block & ((1 << bw) - 1));
-  return y * stride + x;
-}
-static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize,
-                                          int raster_block, int16_t *base) {
-  const int stride = 4 << b_width_log2(plane_bsize);
-  return base + raster_block_offset(plane_bsize, raster_block, stride);
-}
-
 static void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize,
                                      TX_SIZE tx_size, int block,
                                      int *x, int *y) {
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index 0f978cc95..b35c43fcd 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -113,49 +113,6 @@ DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]) = {
   0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5
 };
 
-// Array indices are identical to previously-existing CONTEXT_NODE indices
-const vp9_tree_index vp9_coef_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)] = {
-  -DCT_EOB_TOKEN, 2,                          /* 0 = EOB */
-  -ZERO_TOKEN, 4,                             /* 1 = ZERO */
-  -ONE_TOKEN, 6,                              /* 2 = ONE */
-  8, 12,                                      /* 3 = LOW_VAL */
-  -TWO_TOKEN, 10,                            /* 4 = TWO */
-  -THREE_TOKEN, -FOUR_TOKEN,                /* 5 = THREE */
-  14, 16,                                   /* 6 = HIGH_LOW */
-  -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2,   /* 7 = CAT_ONE */
-  18, 20,                                   /* 8 = CAT_THREEFOUR */
-  -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4,   /* 9 = CAT_THREE */
-  -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6    /* 10 = CAT_FIVE */
-};
-
-// Unconstrained Node Tree
-const vp9_tree_index vp9_coef_con_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)] = {
-  2, 6,                                     /* 0 = LOW_VAL */
-  -TWO_TOKEN, 4,                            /* 1 = TWO */
-  -THREE_TOKEN, -FOUR_TOKEN,                /* 2 = THREE */
-  8, 10,                                    /* 3 = HIGH_LOW */
-  -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2,   /* 4 = CAT_ONE */
-  12, 14,                                   /* 5 = CAT_THREEFOUR */
-  -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4,   /* 6 = CAT_THREE */
-  -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6    /* 7 = CAT_FIVE */
-};
-
-
-
-struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS];
-
-/* Trees for extra bits.  Probabilities are constant and
-   do not depend on previously encoded bits */
-
-static const vp9_prob Pcat1[] = { 159};
-static const vp9_prob Pcat2[] = { 165, 145};
-static const vp9_prob Pcat3[] = { 173, 148, 140};
-static const vp9_prob Pcat4[] = { 176, 155, 140, 135};
-static const vp9_prob Pcat5[] = { 180, 157, 141, 134, 130};
-static const vp9_prob Pcat6[] = {
-  254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129
-};
-
 const vp9_tree_index vp9_coefmodel_tree[6] = {
   -DCT_EOB_MODEL_TOKEN, 2,                      /* 0 = EOB */
   -ZERO_TOKEN, 4,                               /* 1 = ZERO */
@@ -446,43 +403,6 @@ void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full) {
   extend_to_full_distribution(&full[UNCONSTRAINED_NODES], model[PIVOT_NODE]);
 }
 
-static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28];
-
-static void init_bit_tree(vp9_tree_index *p, int n) {
-  int i = 0;
-
-  while (++i < n) {
-    p[0] = p[1] = i << 1;
-    p += 2;
-  }
-
-  p[0] = p[1] = 0;
-}
-
-static void init_bit_trees() {
-  init_bit_tree(cat1, 1);
-  init_bit_tree(cat2, 2);
-  init_bit_tree(cat3, 3);
-  init_bit_tree(cat4, 4);
-  init_bit_tree(cat5, 5);
-  init_bit_tree(cat6, 14);
-}
-
-const vp9_extra_bit vp9_extra_bits[MAX_ENTROPY_TOKENS] = {
-  {0, 0, 0, 0},           // ZERO_TOKEN
-  {0, 0, 0, 1},           // ONE_TOKEN
-  {0, 0, 0, 2},           // TWO_TOKEN
-  {0, 0, 0, 3},           // THREE_TOKEN
-  {0, 0, 0, 4},           // FOUR_TOKEN
-  {cat1, Pcat1, 1, 5},    // DCT_VAL_CATEGORY1
-  {cat2, Pcat2, 2, 7},    // DCT_VAL_CATEGORY2
-  {cat3, Pcat3, 3, 11},   // DCT_VAL_CATEGORY3
-  {cat4, Pcat4, 4, 19},   // DCT_VAL_CATEGORY4
-  {cat5, Pcat5, 5, 35},   // DCT_VAL_CATEGORY5
-  {cat6, Pcat6, 14, 67},  // DCT_VAL_CATEGORY6
-  {0, 0, 0, 0}            // DCT_EOB_TOKEN
-};
-
 #include "vp9/common/vp9_default_coef_probs.h"
 
 void vp9_default_coef_probs(VP9_COMMON *cm) {
@@ -492,11 +412,6 @@ void vp9_default_coef_probs(VP9_COMMON *cm) {
   vp9_copy(cm->fc.coef_probs[TX_32X32], default_coef_probs_32x32);
 }
 
-void vp9_coef_tree_initialize() {
-  init_bit_trees();
-  vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);
-}
-
 #define COEF_COUNT_SAT 24
 #define COEF_MAX_UPDATE_FACTOR 112
 #define COEF_COUNT_SAT_KEY 24
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 92a6c592a..941b251c3 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -44,15 +44,9 @@
 extern DECLARE_ALIGNED(16, const uint8_t,
                        vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
 
-extern const vp9_tree_index vp9_coef_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)];
-
-extern const vp9_tree_index vp9_coef_con_tree[];
-
 #define DCT_EOB_MODEL_TOKEN     3      /* EOB       Extra Bits 0+0 */
 extern const vp9_tree_index vp9_coefmodel_tree[];
 
-extern struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS];
-
 typedef struct {
   const vp9_tree_index *tree;
   const vp9_prob *prob;
@@ -105,8 +99,6 @@ typedef unsigned int vp9_coeff_stats[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
 
 struct VP9Common;
 void vp9_default_coef_probs(struct VP9Common *cm);
-
-void vp9_coef_tree_initialize();
 void vp9_adapt_coef_probs(struct VP9Common *cm);
 
 static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index 3b2510dcd..265242129 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -232,21 +232,18 @@ const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = {
   -D63_PRED, 16,                    /* 7 = D63_NODE */
   -D153_PRED, -D207_PRED             /* 8 = D153_NODE */
 };
-struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
 
 const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)] = {
   -INTER_OFFSET(ZEROMV), 2,
   -INTER_OFFSET(NEARESTMV), 4,
   -INTER_OFFSET(NEARMV), -INTER_OFFSET(NEWMV)
 };
-struct vp9_token vp9_inter_mode_encodings[INTER_MODES];
 
 const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)] = {
   -PARTITION_NONE, 2,
   -PARTITION_HORZ, 4,
   -PARTITION_VERT, -PARTITION_SPLIT
 };
-struct vp9_token vp9_partition_encodings[PARTITION_TYPES];
 
 static const vp9_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = {
   9, 102, 187, 225
@@ -329,6 +326,7 @@ void vp9_init_mbmode_probs(VP9_COMMON *cm) {
   vp9_copy(cm->fc.single_ref_prob, default_single_ref_p);
   cm->fc.tx_probs = default_tx_probs;
   vp9_copy(cm->fc.mbskip_probs, default_mbskip_probs);
+  vp9_copy(cm->fc.inter_mode_probs, default_inter_mode_probs);
 }
 
 const vp9_tree_index vp9_switchable_interp_tree
@@ -336,15 +334,6 @@ const vp9_tree_index vp9_switchable_interp_tree
   -EIGHTTAP, 2,
   -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP
 };
-struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS];
-
-void vp9_entropy_mode_init() {
-  vp9_tokens_from_tree(vp9_intra_mode_encodings, vp9_intra_mode_tree);
-  vp9_tokens_from_tree(vp9_switchable_interp_encodings,
-                       vp9_switchable_interp_tree);
-  vp9_tokens_from_tree(vp9_partition_encodings, vp9_partition_tree);
-  vp9_tokens_from_tree(vp9_inter_mode_encodings, vp9_inter_mode_tree);
-}
 
 #define COUNT_SAT 20
 #define MAX_UPDATE_FACTOR 128
@@ -466,7 +455,6 @@ void vp9_setup_past_independence(VP9_COMMON *cm) {
   vp9_default_coef_probs(cm);
   vp9_init_mbmode_probs(cm);
   vp9_init_mv_probs(cm);
-  vp9_copy(cm->fc.inter_mode_probs, default_inter_mode_probs);
 
   if (cm->frame_type == KEY_FRAME ||
       cm->error_resilient_mode || cm->reset_frame_context == 3) {
diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h
index 38b419948..df58bea3c 100644
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -37,24 +37,13 @@ struct tx_counts {
 extern const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
 extern const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
                                         [INTRA_MODES - 1];
-
 extern const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
                                             [PARTITION_TYPES - 1];
-
 extern const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)];
-extern struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
-
 extern const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)];
-extern struct vp9_token vp9_inter_mode_encodings[INTER_MODES];
-
 extern const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)];
-extern struct vp9_token vp9_partition_encodings[PARTITION_TYPES];
-
 extern const vp9_tree_index vp9_switchable_interp_tree
                                 [TREE_SIZE(SWITCHABLE_FILTERS)];
-extern struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS];
-
-void vp9_entropy_mode_init();
 
 void vp9_setup_past_independence(struct VP9Common *cm);
 
diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c
index 290dcdd17..60ae79fdc 100644
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c
@@ -23,7 +23,6 @@ const vp9_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)] = {
   -MV_JOINT_HNZVZ, 4,
   -MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ
 };
-struct vp9_token vp9_mv_joint_encodings[MV_JOINTS];
 
 const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = {
   -MV_CLASS_0, 2,
@@ -37,19 +36,16 @@ const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = {
   -MV_CLASS_7, -MV_CLASS_8,
   -MV_CLASS_9, -MV_CLASS_10,
 };
-struct vp9_token vp9_mv_class_encodings[MV_CLASSES];
 
 const vp9_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = {
   -0, -1,
 };
-struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE];
 
-const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(4)] = {
+const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)] = {
   -0, 2,
   -1, 4,
   -2, -3
 };
-struct vp9_token vp9_mv_fp_encodings[4];
 
 static const nmv_context default_nmv_context = {
   {32, 64, 96},
@@ -235,13 +231,6 @@ void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) {
   }
 }
 
-void vp9_entropy_mv_init() {
-  vp9_tokens_from_tree(vp9_mv_joint_encodings, vp9_mv_joint_tree);
-  vp9_tokens_from_tree(vp9_mv_class_encodings, vp9_mv_class_tree);
-  vp9_tokens_from_tree(vp9_mv_class0_encodings, vp9_mv_class0_tree);
-  vp9_tokens_from_tree(vp9_mv_fp_encodings, vp9_mv_fp_tree);
-}
-
 void vp9_init_mv_probs(VP9_COMMON *cm) {
   cm->fc.nmvc = default_nmv_context;
 }
diff --git a/vp9/common/vp9_entropymv.h b/vp9/common/vp9_entropymv.h
index b62f7c42f..3175a1e49 100644
--- a/vp9/common/vp9_entropymv.h
+++ b/vp9/common/vp9_entropymv.h
@@ -18,7 +18,6 @@
 
 struct VP9Common;
 
-void vp9_entropy_mv_init();
 void vp9_init_mv_probs(struct VP9Common *cm);
 
 void vp9_adapt_mv_probs(struct VP9Common *cm, int usehp);
@@ -72,17 +71,10 @@ typedef enum {
 #define MV_UPP   ((1 << MV_IN_USE_BITS) - 1)
 #define MV_LOW   (-(1 << MV_IN_USE_BITS))
 
-extern const vp9_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)];
-extern struct vp9_token vp9_mv_joint_encodings[MV_JOINTS];
-
-extern const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)];
-extern struct vp9_token vp9_mv_class_encodings[MV_CLASSES];
-
-extern const vp9_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)];
-extern struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE];
-
-extern const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)];
-extern struct vp9_token vp9_mv_fp_encodings[4];
+extern const vp9_tree_index vp9_mv_joint_tree[];
+extern const vp9_tree_index vp9_mv_class_tree[];
+extern const vp9_tree_index vp9_mv_class0_tree[];
+extern const vp9_tree_index vp9_mv_fp_tree[];
 
 typedef struct {
   vp9_prob sign;
diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h
index 9e4117e17..34411a34f 100644
--- a/vp9/common/vp9_enums.h
+++ b/vp9/common/vp9_enums.h
@@ -52,20 +52,22 @@ typedef enum PARTITION_TYPE {
 #define PARTITION_PLOFFSET   4  // number of probability models per block size
 #define PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET)
 
+// block transform size
 typedef enum {
-  TX_4X4 = 0,                      // 4x4 dct transform
-  TX_8X8 = 1,                      // 8x8 dct transform
-  TX_16X16 = 2,                    // 16x16 dct transform
-  TX_32X32 = 3,                    // 32x32 dct transform
+  TX_4X4 = 0,                      // 4x4 transform
+  TX_8X8 = 1,                      // 8x8 transform
+  TX_16X16 = 2,                    // 16x16 transform
+  TX_32X32 = 3,                    // 32x32 transform
   TX_SIZES
 } TX_SIZE;
 
+// frame transform mode
 typedef enum {
-  ONLY_4X4            = 0,
-  ALLOW_8X8           = 1,
-  ALLOW_16X16         = 2,
-  ALLOW_32X32         = 3,
-  TX_MODE_SELECT      = 4,
+  ONLY_4X4            = 0,        // only 4x4 transform used
+  ALLOW_8X8           = 1,        // allow block transform size up to 8x8
+  ALLOW_16X16         = 2,        // allow block transform size up to 16x16
+  ALLOW_32X32         = 3,        // allow block transform size up to 32x32
+  TX_MODE_SELECT      = 4,        // transform specified for each block
   TX_MODES            = 5,
 } TX_MODE;
 
diff --git a/vp9/common/vp9_findnearmv.c b/vp9/common/vp9_findnearmv.c
index 66178cd1b..ad97c0277 100644
--- a/vp9/common/vp9_findnearmv.c
+++ b/vp9/common/vp9_findnearmv.c
@@ -63,10 +63,12 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
         break;
       }
   } else {
-    int_mv candidates[2 + MAX_MV_REF_CANDIDATES] = { bmi[1].as_mv[ref_idx],
-                                                     bmi[0].as_mv[ref_idx],
-                                                     mv_list[0],
-                                                     mv_list[1] };
+    int_mv candidates[2 + MAX_MV_REF_CANDIDATES];
+    candidates[0] = bmi[1].as_mv[ref_idx];
+    candidates[1] = bmi[0].as_mv[ref_idx];
+    candidates[2] = mv_list[0];
+    candidates[3] = mv_list[1];
+
     assert(block_idx == 3);
     dst_nearest->as_int = bmi[2].as_mv[ref_idx].as_int;
     for (n = 0; n < 2 + MAX_MV_REF_CANDIDATES; ++n) {
diff --git a/vp9/common/vp9_findnearmv.h b/vp9/common/vp9_findnearmv.h
index 2362caa41..e9d4e1171 100644
--- a/vp9/common/vp9_findnearmv.h
+++ b/vp9/common/vp9_findnearmv.h
@@ -41,32 +41,4 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
                                    int block_idx, int ref_idx,
                                    int mi_row, int mi_col);
 
-static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mi,
-                                          const MODE_INFO *left_mi, int b) {
-  if (b == 0 || b == 2) {
-    if (!left_mi || is_inter_block(&left_mi->mbmi))
-      return DC_PRED;
-
-    return left_mi->mbmi.sb_type < BLOCK_8X8 ? left_mi->bmi[b + 1].as_mode
-                                             : left_mi->mbmi.mode;
-  } else {
-    assert(b == 1 || b == 3);
-    return cur_mi->bmi[b - 1].as_mode;
-  }
-}
-
-static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mi,
-                                           const MODE_INFO *above_mi, int b) {
-  if (b == 0 || b == 1) {
-    if (!above_mi || is_inter_block(&above_mi->mbmi))
-      return DC_PRED;
-
-    return above_mi->mbmi.sb_type < BLOCK_8X8 ? above_mi->bmi[b + 2].as_mode
-                                              : above_mi->mbmi.mode;
-  } else {
-    assert(b == 2 || b == 3);
-    return cur_mi->bmi[b - 2].as_mode;
-  }
-}
-
 #endif  // VP9_COMMON_VP9_FINDNEARMV_H_
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 0b48de2cb..ff2bc45e4 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -353,29 +353,17 @@ static void filter_selectively_vert_row2(PLANE_TYPE plane_type,
 
     // TODO(yunqingwang): count in loopfilter functions should be removed.
     if (mask & 1) {
-      if ((mask_16x16_0 | mask_16x16_1) & 1) {
-        if ((mask_16x16_0 & mask_16x16_1) & 1) {
-          // TODO(yunqingwang): Combine 2 calls as 1 wide filtering.
-          vp9_mb_lpf_vertical_edge_w(s, pitch, lfi0->mblim, lfi0->lim,
-                                     lfi0->hev_thr);
-          vp9_mb_lpf_vertical_edge_w(s + 8 *pitch, pitch, lfi1->mblim,
-                                     lfi1->lim, lfi1->hev_thr);
-        } else if (mask_16x16_0 & 1) {
-          vp9_mb_lpf_vertical_edge_w(s, pitch, lfi0->mblim, lfi0->lim,
-                                     lfi0->hev_thr);
-        } else {
-          vp9_mb_lpf_vertical_edge_w(s + 8 *pitch, pitch, lfi1->mblim,
-                                     lfi1->lim, lfi1->hev_thr);
-        }
+      if (mask_16x16_0 & 1) {
+        // If (mask_16x16_0 & 1) is 1, then (mask_16x16_1 & 1) is also 1.
+        vp9_mb_lpf_vertical_edge_w_16(s, pitch, lfi0->mblim, lfi0->lim,
+                                      lfi0->hev_thr);
       }
 
       if ((mask_8x8_0 | mask_8x8_1) & 1) {
         if ((mask_8x8_0 & mask_8x8_1) & 1) {
-          // TODO(yunqingwang): Combine 2 calls as 1 wide filtering.
-          vp9_mbloop_filter_vertical_edge(s, pitch, lfi0->mblim, lfi0->lim,
-                                          lfi0->hev_thr, 1);
-          vp9_mbloop_filter_vertical_edge(s + 8 *pitch, pitch, lfi1->mblim,
-                                          lfi1->lim, lfi1->hev_thr, 1);
+          vp9_mbloop_filter_vertical_edge_16(s, pitch, lfi0->mblim, lfi0->lim,
+                                          lfi0->hev_thr, lfi1->mblim,
+                                          lfi1->lim, lfi1->hev_thr);
         } else if (mask_8x8_0 & 1) {
           vp9_mbloop_filter_vertical_edge(s, pitch, lfi0->mblim, lfi0->lim,
                                           lfi0->hev_thr, 1);
@@ -387,11 +375,9 @@ static void filter_selectively_vert_row2(PLANE_TYPE plane_type,
 
       if ((mask_4x4_0 | mask_4x4_1) & 1) {
         if ((mask_4x4_0 & mask_4x4_1) & 1) {
-          // TODO(yunqingwang): Combine 2 calls as 1 wide filtering.
-          vp9_loop_filter_vertical_edge(s, pitch, lfi0->mblim, lfi0->lim,
-                                        lfi0->hev_thr, 1);
-          vp9_loop_filter_vertical_edge(s + 8 *pitch, pitch, lfi1->mblim,
-                                        lfi1->lim, lfi1->hev_thr, 1);
+          vp9_loop_filter_vertical_edge_16(s, pitch, lfi0->mblim, lfi0->lim,
+                                        lfi0->hev_thr, lfi1->mblim,
+                                        lfi1->lim, lfi1->hev_thr);
         } else if (mask_4x4_0 & 1) {
           vp9_loop_filter_vertical_edge(s, pitch, lfi0->mblim, lfi0->lim,
                                         lfi0->hev_thr, 1);
@@ -403,11 +389,9 @@ static void filter_selectively_vert_row2(PLANE_TYPE plane_type,
 
       if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {
         if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {
-          // TODO(yunqingwang): Combine 2 calls as 1 wide filtering.
-          vp9_loop_filter_vertical_edge(s + 4, pitch, lfi0->mblim, lfi0->lim,
-                                        lfi0->hev_thr, 1);
-          vp9_loop_filter_vertical_edge(s + 8 *pitch + 4, pitch, lfi1->mblim,
-                                        lfi1->lim, lfi1->hev_thr, 1);
+          vp9_loop_filter_vertical_edge_16(s + 4, pitch, lfi0->mblim, lfi0->lim,
+                                        lfi0->hev_thr, lfi1->mblim,
+                                        lfi1->lim, lfi1->hev_thr);
         } else if (mask_4x4_int_0 & 1) {
           vp9_loop_filter_vertical_edge(s + 4, pitch, lfi0->mblim, lfi0->lim,
                                         lfi0->hev_thr, 1);
@@ -448,14 +432,10 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
     count = 1;
     if (mask & 1) {
       if (mask_16x16 & 1) {
-        if ((mask_16x16 & 3) == 3) {
-          vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
-                                       lfi->hev_thr, 2);
-          count = 2;
-        } else {
-          vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
-                                       lfi->hev_thr, 1);
-        }
+        // If (mask_16x16 & 1) is 1, then (mask_16x16 & 3) is 3.
+        vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
+                                     lfi->hev_thr, 2);
+        count = 2;
       } else if (mask_8x8 & 1) {
         if ((mask_8x8 & 3) == 3) {
           // Next block's thresholds
diff --git a/vp9/common/vp9_loopfilter_filters.c b/vp9/common/vp9_loopfilter_filters.c
index 9edf8701f..ef8de2010 100644
--- a/vp9/common/vp9_loopfilter_filters.c
+++ b/vp9/common/vp9_loopfilter_filters.c
@@ -169,6 +169,34 @@ void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch,
   }
 }
 
+void vp9_loop_filter_vertical_edge_16_c(uint8_t *s, int pitch,
+                                        const uint8_t *blimit0,
+                                        const uint8_t *limit0,
+                                        const uint8_t *thresh0,
+                                        const uint8_t *blimit1,
+                                        const uint8_t *limit1,
+                                        const uint8_t *thresh1) {
+  int i, j;
+  const uint8_t *blimit = blimit0;
+  const uint8_t *limit = limit0;
+  const uint8_t *thresh = thresh0;
+  // Two passes of 8 lines each: set 0 limits first, then set 1 limits.
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < 8; ++j) {
+      const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+      const uint8_t q0 = s[0],  q1 = s[1],  q2 = s[2],  q3 = s[3];
+      const int8_t mask = filter_mask(*limit, *blimit,
+                                      p3, p2, p1, p0, q0, q1, q2, q3);
+      const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1);
+      filter4(mask, hev, s - 2, s - 1, s, s + 1);
+      s += pitch;  // advance one pitch to the next line
+    }
+    blimit = blimit1;  // the second pass uses the second parameter set
+    limit = limit1;
+    thresh = thresh1;
+  }
+}
+
 static INLINE void filter8(int8_t mask, uint8_t hev, uint8_t flat,
                            uint8_t *op3, uint8_t *op2,
                            uint8_t *op1, uint8_t *op0,
@@ -264,6 +292,36 @@ void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch,
   }
 }
 
+void vp9_mbloop_filter_vertical_edge_16_c(uint8_t *s, int pitch,
+                                          const uint8_t *blimit0,
+                                          const uint8_t *limit0,
+                                          const uint8_t *thresh0,
+                                          const uint8_t *blimit1,
+                                          const uint8_t *limit1,
+                                          const uint8_t *thresh1) {
+  int i, j;
+  const uint8_t *blimit = blimit0;
+  const uint8_t *limit = limit0;
+  const uint8_t *thresh = thresh0;
+  // Two passes of 8 lines each: set 0 limits first, then set 1 limits.
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < 8; ++j) {
+      const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+      const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+      const int8_t mask = filter_mask(*limit, *blimit,
+                                      p3, p2, p1, p0, q0, q1, q2, q3);
+      const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1);
+      const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+      filter8(mask, hev, flat, s - 4, s - 3, s - 2, s - 1,
+                               s,     s + 1, s + 2, s + 3);
+      s += pitch;  // advance one pitch to the next line
+    }
+    blimit = blimit1;  // the second pass uses the second parameter set
+    limit = limit1;
+    thresh = thresh1;
+  }
+}
+
 static INLINE void filter16(int8_t mask, uint8_t hev,
                             uint8_t flat, uint8_t flat2,
                             uint8_t *op7, uint8_t *op6,
@@ -366,3 +424,26 @@ void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int p,
     s += p;
   }
 }
+
+void vp9_mb_lpf_vertical_edge_w_16_c(uint8_t *s, int p,
+                                     const uint8_t *blimit,
+                                     const uint8_t *limit,
+                                     const uint8_t *thresh) {
+  int i;
+  // 16 lines share one limit set; each line reads s[-8] through s[7].
+  for (i = 0; i < 16; ++i) {
+    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+    const uint8_t q0 = s[0], q1 = s[1],  q2 = s[2], q3 = s[3];
+    const int8_t mask = filter_mask(*limit, *blimit,
+                                    p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1);
+    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
+                                    q0, s[4], s[5], s[6], s[7]);
+    // flat2 is computed from the four outer samples on each side.
+    filter16(mask, hev, flat, flat2,
+             s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
+             s,     s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7);
+    s += p;  // advance one pitch to the next line
+  }
+}
diff --git a/vp9/common/vp9_onyx.h b/vp9/common/vp9_onyx.h
index cda68a285..c5faf88f8 100644
--- a/vp9/common/vp9_onyx.h
+++ b/vp9/common/vp9_onyx.h
@@ -67,6 +67,7 @@ extern "C"
   typedef enum {
     NO_AQ = 0,
     VARIANCE_AQ = 1,
+    COMPLEXITY_AQ = 2,
     AQ_MODES_COUNT  // This should always be the last member of the enum
   } AQ_MODES;
 
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index fb959cb36..751accf02 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -81,11 +81,11 @@ typedef struct {
 
 
 typedef enum {
-  SINGLE_PREDICTION_ONLY = 0,
-  COMP_PREDICTION_ONLY   = 1,
-  HYBRID_PREDICTION      = 2,
-  NB_PREDICTION_TYPES    = 3,
-} COMPPREDMODE_TYPE;
+  SINGLE_REFERENCE      = 0,
+  COMPOUND_REFERENCE    = 1,
+  REFERENCE_MODE_SELECT = 2,
+  REFERENCE_MODES       = 3,
+} REFERENCE_MODE;
 
 typedef struct VP9Common {
   struct vpx_internal_error_info  error;
@@ -195,7 +195,7 @@ typedef struct VP9Common {
   int allow_comp_inter_inter;
   MV_REFERENCE_FRAME comp_fixed_ref;
   MV_REFERENCE_FRAME comp_var_ref[2];
-  COMPPREDMODE_TYPE comp_pred_mode;
+  REFERENCE_MODE comp_pred_mode;
 
   FRAME_CONTEXT fc;  /* this frame entropy */
   FRAME_CONTEXT frame_contexts[NUM_FRAME_CONTEXTS];
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index aa17b85c8..09a4fc826 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -98,7 +98,6 @@ MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv,
   return clamped_mv;
 }
 
-
 // TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could
 // calculate the subsampled BLOCK_SIZE, but that type isn't defined for
 // sizes smaller than 16x16 yet.
@@ -206,6 +205,96 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
                                     MAX_MB_PLANE - 1);
 }
 
+// TODO(jingning): This function serves as a placeholder for decoder prediction
+// using on demand border extension. It should be moved to /decoder/ directory.
+static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
+                                       BLOCK_SIZE bsize, int pred_w, int pred_h,
+                                       int mi_x, int mi_y) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+  const int bwl = b_width_log2(plane_bsize);
+  const int bw = 4 << bwl;
+  const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize];
+  const int x = 4 * (block & ((1 << bwl) - 1));  // low bwl bits of block
+  const int y = 4 * (block >> bwl);              // remaining high bits
+  const MODE_INFO *mi = xd->mi_8x8[0];
+  const int is_compound = has_second_ref(&mi->mbmi);
+  int ref;
+
+  assert(x < bw);
+  assert(y < bh);
+  assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_w == bw);
+  assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_h == bh);
+
+  for (ref = 0; ref < 1 + is_compound; ++ref) {  // 1 + is_compound passes
+    struct scale_factors *const scale = &xd->scale_factor[ref];
+    struct buf_2d *const pre_buf = &pd->pre[ref];
+    struct buf_2d *const dst_buf = &pd->dst;
+    uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
+
+    // TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the
+    // same MV (the average of the 4 luma MVs) but we could do something
+    // smarter for non-4:2:0. Just punt for now, pending the changes to get
+    // rid of SPLITMV mode entirely.
+    const MV mv = mi->mbmi.sb_type < BLOCK_8X8
+               ? (plane == 0 ? mi->bmi[block].as_mv[ref].as_mv
+                             : mi_mv_pred_q4(mi, ref))
+               : mi->mbmi.mv[ref].as_mv;
+
+    // TODO(jkoleszar): This clamping is done in the incorrect place for the
+    // scaling case. It needs to be done on the scaled MV, not the pre-scaling
+    // MV. Note however that it performs the subsampling aware scaling so
+    // that the result is always q4.
+    // The resulting MV precision is MV_PRECISION_Q4.
+    const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
+                                               pd->subsampling_x,
+                                               pd->subsampling_y);
+
+    uint8_t *pre;
+    MV32 scaled_mv;
+    int xs, ys;
+
+    if (vp9_is_scaled(scale->sfc)) {
+      pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, scale);
+      scale->sfc->set_scaled_offsets(scale, mi_y + y, mi_x + x);
+      scaled_mv = scale->sfc->scale_mv(&mv_q4, scale);
+      xs = scale->sfc->x_step_q4;
+      ys = scale->sfc->y_step_q4;
+    } else {  // unscaled reference: copy the MV, fixed step of 16
+      pre = pre_buf->buf + (y * pre_buf->stride + x);
+      scaled_mv.row = mv_q4.row;
+      scaled_mv.col = mv_q4.col;
+      xs = ys = 16;
+    }
+
+    inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
+                    &scaled_mv, scale,
+                    4 << pred_w, 4 << pred_h, ref,
+                    &xd->subpix, xs, ys);
+  }
+}
+
+void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                       BLOCK_SIZE bsize) {
+  int plane;
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    const int mi_x = mi_col * MI_SIZE;
+    const int mi_y = mi_row * MI_SIZE;
+    const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
+    const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
+    // Sub-8x8 modes: one call per sub-block with pred_w = pred_h = 0.
+    if (xd->mi_8x8[0]->mbmi.sb_type < BLOCK_8X8) {
+      int i = 0, x, y;
+      assert(bsize == BLOCK_8X8);
+      for (y = 0; y < 1 << bhl; ++y)
+        for (x = 0; x < 1 << bwl; ++x)
+          dec_build_inter_predictors(xd, plane, i++, bsize, 0, 0, mi_x, mi_y);
+    } else {  // otherwise one call with the plane-adjusted log2 sizes
+      dec_build_inter_predictors(xd, plane, 0, bsize, bwl, bhl, mi_x, mi_y);
+    }
+  }
+}
+
 // TODO(dkovalev: find better place for this function)
 void vp9_setup_scale_factors(VP9_COMMON *cm, int i) {
   const int ref = cm->active_ref_idx[i];
@@ -219,9 +308,6 @@ void vp9_setup_scale_factors(VP9_COMMON *cm, int i) {
     vp9_setup_scale_factors_for_frame(sf, sfc,
                                       fb->y_crop_width, fb->y_crop_height,
                                       cm->width, cm->height);
-
-    if (vp9_is_scaled(sfc))
-      vp9_extend_frame_borders(fb, cm->subsampling_x, cm->subsampling_y);
   }
 }
 
diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h
index b328754e7..4a302f988 100644
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -24,6 +24,9 @@ void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
 void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
                                    BLOCK_SIZE bsize);
 
+void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                       BLOCK_SIZE bsize);
+
 void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride,
                                const MV *mv_q3,
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index e18e757c1..627ea31ed 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -193,12 +193,21 @@ specialize vp9_dc_128_predictor_32x32
 prototype void vp9_mb_lpf_vertical_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"
 specialize vp9_mb_lpf_vertical_edge_w sse2 neon dspr2
 
+prototype void vp9_mb_lpf_vertical_edge_w_16 "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"
+specialize vp9_mb_lpf_vertical_edge_w_16 sse2 neon dspr2
+
 prototype void vp9_mbloop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
 specialize vp9_mbloop_filter_vertical_edge sse2 neon dspr2
 
+prototype void vp9_mbloop_filter_vertical_edge_16 "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"
+specialize vp9_mbloop_filter_vertical_edge_16 sse2 neon dspr2
+
 prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
 specialize vp9_loop_filter_vertical_edge mmx neon dspr2
 
+prototype void vp9_loop_filter_vertical_edge_16 "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"
+specialize vp9_loop_filter_vertical_edge_16 sse2 neon dspr2
+
 prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
 specialize vp9_mb_lpf_horizontal_edge_w sse2 avx2 neon dspr2
 
@@ -206,13 +215,13 @@ prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const u
 specialize vp9_mbloop_filter_horizontal_edge sse2 neon dspr2
 
 prototype void vp9_mbloop_filter_horizontal_edge_16 "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"
-specialize vp9_mbloop_filter_horizontal_edge_16 sse2 neon
+specialize vp9_mbloop_filter_horizontal_edge_16 sse2 neon dspr2
 
 prototype void vp9_loop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
 specialize vp9_loop_filter_horizontal_edge mmx neon dspr2
 
 prototype void vp9_loop_filter_horizontal_edge_16 "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"
-specialize vp9_loop_filter_horizontal_edge_16 sse2 neon
+specialize vp9_loop_filter_horizontal_edge_16 sse2 neon dspr2
 
 #
 # post proc
@@ -698,31 +707,31 @@ fi
 
 # fdct functions
 prototype void vp9_short_fht4x4 "const int16_t *input, int16_t *output, int stride, int tx_type"
-specialize vp9_short_fht4x4 sse2
+specialize vp9_short_fht4x4 sse2 avx2
 
 prototype void vp9_short_fht8x8 "const int16_t *input, int16_t *output, int stride, int tx_type"
-specialize vp9_short_fht8x8 sse2
+specialize vp9_short_fht8x8 sse2 avx2
 
 prototype void vp9_short_fht16x16 "const int16_t *input, int16_t *output, int stride, int tx_type"
-specialize vp9_short_fht16x16 sse2
+specialize vp9_short_fht16x16 sse2 avx2
 
 prototype void vp9_fwht4x4 "const int16_t *input, int16_t *output, int stride"
 specialize vp9_fwht4x4
 
 prototype void vp9_fdct4x4 "const int16_t *input, int16_t *output, int stride"
-specialize vp9_fdct4x4 sse2
+specialize vp9_fdct4x4 sse2 avx2
 
 prototype void vp9_fdct8x8 "const int16_t *input, int16_t *output, int stride"
-specialize vp9_fdct8x8 sse2
+specialize vp9_fdct8x8 sse2 avx2
 
 prototype void vp9_fdct16x16 "const int16_t *input, int16_t *output, int stride"
-specialize vp9_fdct16x16 sse2
+specialize vp9_fdct16x16 sse2 avx2
 
 prototype void vp9_fdct32x32 "const int16_t *input, int16_t *output, int stride"
-specialize vp9_fdct32x32 sse2
+specialize vp9_fdct32x32 sse2 avx2
 
 prototype void vp9_fdct32x32_rd "const int16_t *input, int16_t *output, int stride"
-specialize vp9_fdct32x32_rd sse2
+specialize vp9_fdct32x32_rd sse2 avx2
 
 #
 # Motion search
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index c65184f9c..947c0ba44 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -431,6 +431,27 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
       res3 = _mm_packs_epi32(tmp6, tmp7); \
   }
 
+#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
+  { /* writes the enclosing scope's tmp0..tmp3; reads its rounding */ \
+      tmp0 = _mm_madd_epi16(lo_0, cst0); \
+      tmp1 = _mm_madd_epi16(hi_0, cst0); \
+      tmp2 = _mm_madd_epi16(lo_0, cst1); \
+      tmp3 = _mm_madd_epi16(hi_0, cst1); \
+      /* add the rounding term to each 32-bit sum */ \
+      tmp0 = _mm_add_epi32(tmp0, rounding); \
+      tmp1 = _mm_add_epi32(tmp1, rounding); \
+      tmp2 = _mm_add_epi32(tmp2, rounding); \
+      tmp3 = _mm_add_epi32(tmp3, rounding); \
+      /* arithmetic shift right by DCT_CONST_BITS */ \
+      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+      /* saturating pack: res0 from cst0 products, res1 from cst1 */ \
+      res0 = _mm_packs_epi32(tmp0, tmp1); \
+      res1 = _mm_packs_epi32(tmp2, tmp3); \
+  }
+
 #define IDCT8_1D  \
   /* Stage1 */      \
   { \
@@ -629,6 +650,25 @@ static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
   res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
 }
 
+static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+  // Only the low four 16-bit lanes of in[0..7] are read (unpacklo).
+  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+  // out[k], k < 4, collects lane k of in[0..7]; out[4..7] are zeroed.
+  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
+  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
+  out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
+  out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
+  out[4] = out[5] = out[6] = out[7] = zero;
+}
+
 static void idct8_1d_sse2(__m128i *in) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
@@ -1118,14 +1158,14 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
 #define IDCT16_1D \
   /* Stage2 */ \
   { \
-    const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \
-    const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \
-    const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7);   \
-    const __m128i hi_9_7 = _mm_unpackhi_epi16(in9, in7);   \
-    const __m128i lo_5_11 = _mm_unpacklo_epi16(in5, in11); \
-    const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \
-    const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \
-    const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \
+    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
+    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
+    const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]);   \
+    const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]);   \
+    const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
+    const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
+    const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
+    const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
     \
     MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
                            stg2_0, stg2_1, stg2_2, stg2_3, \
@@ -1138,10 +1178,10 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
     \
   /* Stage3 */ \
   { \
-    const __m128i lo_2_14 = _mm_unpacklo_epi16(in2, in14); \
-    const __m128i hi_2_14 = _mm_unpackhi_epi16(in2, in14); \
-    const __m128i lo_10_6 = _mm_unpacklo_epi16(in10, in6); \
-    const __m128i hi_10_6 = _mm_unpackhi_epi16(in10, in6); \
+    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
+    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
+    const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
+    const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
     \
     MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
                            stg3_0, stg3_1, stg3_2, stg3_3, \
@@ -1160,10 +1200,10 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
   \
   /* Stage4 */ \
   { \
-    const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); \
-    const __m128i hi_0_8 = _mm_unpackhi_epi16(in0, in8); \
-    const __m128i lo_4_12 = _mm_unpacklo_epi16(in4, in12); \
-    const __m128i hi_4_12 = _mm_unpackhi_epi16(in4, in12); \
+    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
+    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
+    const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
+    const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
     \
     const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
     const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
@@ -1275,16 +1315,7 @@ void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
 
   const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
 
-  __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,
-          in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,
-          in10 = zero, in11 = zero, in12 = zero, in13 = zero,
-          in14 = zero, in15 = zero;
-  __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,
-          l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,
-          l12 = zero, l13 = zero, l14 = zero, l15 = zero;
-  __m128i r0 = zero, r1 = zero, r2 = zero, r3 = zero, r4 = zero, r5 = zero,
-          r6 = zero, r7 = zero, r8 = zero, r9 = zero, r10 = zero, r11 = zero,
-          r12 = zero, r13 = zero, r14 = zero, r15 = zero;
+  __m128i in[16], l[16], r[16], *curr1;
   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
           stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
           stp1_8_0, stp1_12_0;
@@ -1293,162 +1324,132 @@ void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   int i;
 
-  // We work on a 8x16 block each time, and loop 4 times for 2-D 16x16 idct.
-  for (i = 0; i < 4; i++) {
-    // 1-D idct
-    if (i < 2) {
-      if (i == 1) input += 128;
+  curr1 = l;
+  for (i = 0; i < 2; i++) {
+      // 1-D idct
 
       // Load input data.
-      in0 = _mm_load_si128((const __m128i *)input);
-      in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
-      in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
-      in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
-      in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
-      in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
-      in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
-      in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));
-      in4 = _mm_load_si128((const __m128i *)(input + 8 * 8));
-      in12 = _mm_load_si128((const __m128i *)(input + 8 * 9));
-      in5 = _mm_load_si128((const __m128i *)(input + 8 * 10));
-      in13 = _mm_load_si128((const __m128i *)(input + 8 * 11));
-      in6 = _mm_load_si128((const __m128i *)(input + 8 * 12));
-      in14 = _mm_load_si128((const __m128i *)(input + 8 * 13));
-      in7 = _mm_load_si128((const __m128i *)(input + 8 * 14));
-      in15 = _mm_load_si128((const __m128i *)(input + 8 * 15));
-
-      TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                    in4, in5, in6, in7);
-      TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
-                    in10, in11, in12, in13, in14, in15);
-    }
-
-    if (i == 2) {
-      TRANSPOSE_8X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4,
-                    in5, in6, in7);
-      TRANSPOSE_8X8(r0, r1, r2, r3, r4, r5, r6, r7, in8, in9, in10, in11, in12,
-                    in13, in14, in15);
-    }
+      in[0] = _mm_load_si128((const __m128i *)input);
+      in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
+      in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
+      in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
+      in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
+      in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
+      in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
+      in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
+      in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
+      in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
+      in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
+      in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
+      in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
+      in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
+      in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
+      in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));
+
+      array_transpose_8x8(in, in);
+      array_transpose_8x8(in+8, in+8);
+
+      IDCT16_1D
+
+      // Stage7
+      curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
+      curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
+      curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
+      curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
+      curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
+      curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
+      curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
+      curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
+      curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
+      curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
+      curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
+      curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
+      curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
+      curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
+      curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
+      curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+      curr1 = r;
+      input += 128;
+  }
+  for (i = 0; i < 2; i++) {
+      // 1-D idct
+      array_transpose_8x8(l+i*8, in);
+      array_transpose_8x8(r+i*8, in+8);
 
-    if (i == 3) {
-      TRANSPOSE_8X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3,
-                    in4, in5, in6, in7);
-      TRANSPOSE_8X8(r8, r9, r10, r11, r12, r13, r14, r15, in8, in9, in10, in11,
-                    in12, in13, in14, in15);
-    }
+      IDCT16_1D
 
-    IDCT16_1D
-
-    // Stage7
-    if (i == 0) {
-      // Left 8x16
-      l0 = _mm_add_epi16(stp2_0, stp1_15);
-      l1 = _mm_add_epi16(stp2_1, stp1_14);
-      l2 = _mm_add_epi16(stp2_2, stp2_13);
-      l3 = _mm_add_epi16(stp2_3, stp2_12);
-      l4 = _mm_add_epi16(stp2_4, stp2_11);
-      l5 = _mm_add_epi16(stp2_5, stp2_10);
-      l6 = _mm_add_epi16(stp2_6, stp1_9);
-      l7 = _mm_add_epi16(stp2_7, stp1_8);
-      l8 = _mm_sub_epi16(stp2_7, stp1_8);
-      l9 = _mm_sub_epi16(stp2_6, stp1_9);
-      l10 = _mm_sub_epi16(stp2_5, stp2_10);
-      l11 = _mm_sub_epi16(stp2_4, stp2_11);
-      l12 = _mm_sub_epi16(stp2_3, stp2_12);
-      l13 = _mm_sub_epi16(stp2_2, stp2_13);
-      l14 = _mm_sub_epi16(stp2_1, stp1_14);
-      l15 = _mm_sub_epi16(stp2_0, stp1_15);
-    } else if (i == 1) {
-      // Right 8x16
-      r0 = _mm_add_epi16(stp2_0, stp1_15);
-      r1 = _mm_add_epi16(stp2_1, stp1_14);
-      r2 = _mm_add_epi16(stp2_2, stp2_13);
-      r3 = _mm_add_epi16(stp2_3, stp2_12);
-      r4 = _mm_add_epi16(stp2_4, stp2_11);
-      r5 = _mm_add_epi16(stp2_5, stp2_10);
-      r6 = _mm_add_epi16(stp2_6, stp1_9);
-      r7 = _mm_add_epi16(stp2_7, stp1_8);
-      r8 = _mm_sub_epi16(stp2_7, stp1_8);
-      r9 = _mm_sub_epi16(stp2_6, stp1_9);
-      r10 = _mm_sub_epi16(stp2_5, stp2_10);
-      r11 = _mm_sub_epi16(stp2_4, stp2_11);
-      r12 = _mm_sub_epi16(stp2_3, stp2_12);
-      r13 = _mm_sub_epi16(stp2_2, stp2_13);
-      r14 = _mm_sub_epi16(stp2_1, stp1_14);
-      r15 = _mm_sub_epi16(stp2_0, stp1_15);
-    } else {
       // 2-D
-      in0 = _mm_add_epi16(stp2_0, stp1_15);
-      in1 = _mm_add_epi16(stp2_1, stp1_14);
-      in2 = _mm_add_epi16(stp2_2, stp2_13);
-      in3 = _mm_add_epi16(stp2_3, stp2_12);
-      in4 = _mm_add_epi16(stp2_4, stp2_11);
-      in5 = _mm_add_epi16(stp2_5, stp2_10);
-      in6 = _mm_add_epi16(stp2_6, stp1_9);
-      in7 = _mm_add_epi16(stp2_7, stp1_8);
-      in8 = _mm_sub_epi16(stp2_7, stp1_8);
-      in9 = _mm_sub_epi16(stp2_6, stp1_9);
-      in10 = _mm_sub_epi16(stp2_5, stp2_10);
-      in11 = _mm_sub_epi16(stp2_4, stp2_11);
-      in12 = _mm_sub_epi16(stp2_3, stp2_12);
-      in13 = _mm_sub_epi16(stp2_2, stp2_13);
-      in14 = _mm_sub_epi16(stp2_1, stp1_14);
-      in15 = _mm_sub_epi16(stp2_0, stp1_15);
+      in[0] = _mm_add_epi16(stp2_0, stp1_15);
+      in[1] = _mm_add_epi16(stp2_1, stp1_14);
+      in[2] = _mm_add_epi16(stp2_2, stp2_13);
+      in[3] = _mm_add_epi16(stp2_3, stp2_12);
+      in[4] = _mm_add_epi16(stp2_4, stp2_11);
+      in[5] = _mm_add_epi16(stp2_5, stp2_10);
+      in[6] = _mm_add_epi16(stp2_6, stp1_9);
+      in[7] = _mm_add_epi16(stp2_7, stp1_8);
+      in[8] = _mm_sub_epi16(stp2_7, stp1_8);
+      in[9] = _mm_sub_epi16(stp2_6, stp1_9);
+      in[10] = _mm_sub_epi16(stp2_5, stp2_10);
+      in[11] = _mm_sub_epi16(stp2_4, stp2_11);
+      in[12] = _mm_sub_epi16(stp2_3, stp2_12);
+      in[13] = _mm_sub_epi16(stp2_2, stp2_13);
+      in[14] = _mm_sub_epi16(stp2_1, stp1_14);
+      in[15] = _mm_sub_epi16(stp2_0, stp1_15);
 
       // Final rounding and shift
-      in0 = _mm_adds_epi16(in0, final_rounding);
-      in1 = _mm_adds_epi16(in1, final_rounding);
-      in2 = _mm_adds_epi16(in2, final_rounding);
-      in3 = _mm_adds_epi16(in3, final_rounding);
-      in4 = _mm_adds_epi16(in4, final_rounding);
-      in5 = _mm_adds_epi16(in5, final_rounding);
-      in6 = _mm_adds_epi16(in6, final_rounding);
-      in7 = _mm_adds_epi16(in7, final_rounding);
-      in8 = _mm_adds_epi16(in8, final_rounding);
-      in9 = _mm_adds_epi16(in9, final_rounding);
-      in10 = _mm_adds_epi16(in10, final_rounding);
-      in11 = _mm_adds_epi16(in11, final_rounding);
-      in12 = _mm_adds_epi16(in12, final_rounding);
-      in13 = _mm_adds_epi16(in13, final_rounding);
-      in14 = _mm_adds_epi16(in14, final_rounding);
-      in15 = _mm_adds_epi16(in15, final_rounding);
-
-      in0 = _mm_srai_epi16(in0, 6);
-      in1 = _mm_srai_epi16(in1, 6);
-      in2 = _mm_srai_epi16(in2, 6);
-      in3 = _mm_srai_epi16(in3, 6);
-      in4 = _mm_srai_epi16(in4, 6);
-      in5 = _mm_srai_epi16(in5, 6);
-      in6 = _mm_srai_epi16(in6, 6);
-      in7 = _mm_srai_epi16(in7, 6);
-      in8 = _mm_srai_epi16(in8, 6);
-      in9 = _mm_srai_epi16(in9, 6);
-      in10 = _mm_srai_epi16(in10, 6);
-      in11 = _mm_srai_epi16(in11, 6);
-      in12 = _mm_srai_epi16(in12, 6);
-      in13 = _mm_srai_epi16(in13, 6);
-      in14 = _mm_srai_epi16(in14, 6);
-      in15 = _mm_srai_epi16(in15, 6);
-
-      RECON_AND_STORE(dest, in0);
-      RECON_AND_STORE(dest, in1);
-      RECON_AND_STORE(dest, in2);
-      RECON_AND_STORE(dest, in3);
-      RECON_AND_STORE(dest, in4);
-      RECON_AND_STORE(dest, in5);
-      RECON_AND_STORE(dest, in6);
-      RECON_AND_STORE(dest, in7);
-      RECON_AND_STORE(dest, in8);
-      RECON_AND_STORE(dest, in9);
-      RECON_AND_STORE(dest, in10);
-      RECON_AND_STORE(dest, in11);
-      RECON_AND_STORE(dest, in12);
-      RECON_AND_STORE(dest, in13);
-      RECON_AND_STORE(dest, in14);
-      RECON_AND_STORE(dest, in15);
+      in[0] = _mm_adds_epi16(in[0], final_rounding);
+      in[1] = _mm_adds_epi16(in[1], final_rounding);
+      in[2] = _mm_adds_epi16(in[2], final_rounding);
+      in[3] = _mm_adds_epi16(in[3], final_rounding);
+      in[4] = _mm_adds_epi16(in[4], final_rounding);
+      in[5] = _mm_adds_epi16(in[5], final_rounding);
+      in[6] = _mm_adds_epi16(in[6], final_rounding);
+      in[7] = _mm_adds_epi16(in[7], final_rounding);
+      in[8] = _mm_adds_epi16(in[8], final_rounding);
+      in[9] = _mm_adds_epi16(in[9], final_rounding);
+      in[10] = _mm_adds_epi16(in[10], final_rounding);
+      in[11] = _mm_adds_epi16(in[11], final_rounding);
+      in[12] = _mm_adds_epi16(in[12], final_rounding);
+      in[13] = _mm_adds_epi16(in[13], final_rounding);
+      in[14] = _mm_adds_epi16(in[14], final_rounding);
+      in[15] = _mm_adds_epi16(in[15], final_rounding);
+
+      in[0] = _mm_srai_epi16(in[0], 6);
+      in[1] = _mm_srai_epi16(in[1], 6);
+      in[2] = _mm_srai_epi16(in[2], 6);
+      in[3] = _mm_srai_epi16(in[3], 6);
+      in[4] = _mm_srai_epi16(in[4], 6);
+      in[5] = _mm_srai_epi16(in[5], 6);
+      in[6] = _mm_srai_epi16(in[6], 6);
+      in[7] = _mm_srai_epi16(in[7], 6);
+      in[8] = _mm_srai_epi16(in[8], 6);
+      in[9] = _mm_srai_epi16(in[9], 6);
+      in[10] = _mm_srai_epi16(in[10], 6);
+      in[11] = _mm_srai_epi16(in[11], 6);
+      in[12] = _mm_srai_epi16(in[12], 6);
+      in[13] = _mm_srai_epi16(in[13], 6);
+      in[14] = _mm_srai_epi16(in[14], 6);
+      in[15] = _mm_srai_epi16(in[15], 6);
+
+      RECON_AND_STORE(dest, in[0]);
+      RECON_AND_STORE(dest, in[1]);
+      RECON_AND_STORE(dest, in[2]);
+      RECON_AND_STORE(dest, in[3]);
+      RECON_AND_STORE(dest, in[4]);
+      RECON_AND_STORE(dest, in[5]);
+      RECON_AND_STORE(dest, in[6]);
+      RECON_AND_STORE(dest, in[7]);
+      RECON_AND_STORE(dest, in[8]);
+      RECON_AND_STORE(dest, in[9]);
+      RECON_AND_STORE(dest, in[10]);
+      RECON_AND_STORE(dest, in[11]);
+      RECON_AND_STORE(dest, in[12]);
+      RECON_AND_STORE(dest, in[13]);
+      RECON_AND_STORE(dest, in[14]);
+      RECON_AND_STORE(dest, in[15]);
 
       dest += 8 - (stride * 16);
-    }
   }
 }
 
@@ -2468,15 +2469,7 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
   const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
 
   const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,
-          in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,
-          in10 = zero, in11 = zero, in12 = zero, in13 = zero,
-          in14 = zero, in15 = zero;
-  __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,
-          l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,
-          l12 = zero, l13 = zero, l14 = zero, l15 = zero;
-
+  __m128i in[16], l[16];
   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
           stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
           stp1_8_0, stp1_12_0;
@@ -2484,25 +2477,26 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
           stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   int i;
+  in[4] = in[5] = in[6] = in[7] = in[12] = in[13] = in[14] = in[15] = zero;
   // 1-D idct. Load input data.
-  in0 = _mm_load_si128((const __m128i *)input);
-  in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
-  in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
-  in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
-  in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
-  in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
-  in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
-  in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));
+  in[0] = _mm_load_si128((const __m128i *)input);
+  in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
+  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
+  in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
+  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
+  in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
+  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
+  in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
 
-  TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3);
-  TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11);
+  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1], in[2], in[3]);
+  TRANSPOSE_8X4(in[8], in[9], in[10], in[11], in[8], in[9], in[10], in[11]);
 
   // Stage2
   {
-    const __m128i lo_1_15 = _mm_unpackhi_epi16(in0, in11);
-    const __m128i lo_9_7 = _mm_unpackhi_epi16(in8, in3);
-    const __m128i lo_5_11 = _mm_unpackhi_epi16(in2, in9);
-    const __m128i lo_13_3 = _mm_unpackhi_epi16(in10, in1);
+    const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], in[11]);
+    const __m128i lo_9_7 = _mm_unpackhi_epi16(in[8], in[3]);
+    const __m128i lo_5_11 = _mm_unpackhi_epi16(in[2], in[9]);
+    const __m128i lo_13_3 = _mm_unpackhi_epi16(in[10], in[1]);
 
     tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
     tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
@@ -2544,8 +2538,8 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
 
   // Stage3
   {
-    const __m128i lo_2_14 = _mm_unpacklo_epi16(in1, in11);
-    const __m128i lo_10_6 = _mm_unpacklo_epi16(in9, in3);
+    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], in[11]);
+    const __m128i lo_10_6 = _mm_unpacklo_epi16(in[9], in[3]);
 
     tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
     tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
@@ -2580,8 +2574,8 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
 
   // Stage4
   {
-    const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8);
-    const __m128i lo_4_12 = _mm_unpacklo_epi16(in2, in10);
+    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]);
+    const __m128i lo_4_12 = _mm_unpacklo_epi16(in[2], in[10]);
     const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
 
@@ -2690,106 +2684,99 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
   }
 
   // Stage7. Left 8x16 only.
-  l0 = _mm_add_epi16(stp2_0, stp1_15);
-  l1 = _mm_add_epi16(stp2_1, stp1_14);
-  l2 = _mm_add_epi16(stp2_2, stp2_13);
-  l3 = _mm_add_epi16(stp2_3, stp2_12);
-  l4 = _mm_add_epi16(stp2_4, stp2_11);
-  l5 = _mm_add_epi16(stp2_5, stp2_10);
-  l6 = _mm_add_epi16(stp2_6, stp1_9);
-  l7 = _mm_add_epi16(stp2_7, stp1_8);
-  l8 = _mm_sub_epi16(stp2_7, stp1_8);
-  l9 = _mm_sub_epi16(stp2_6, stp1_9);
-  l10 = _mm_sub_epi16(stp2_5, stp2_10);
-  l11 = _mm_sub_epi16(stp2_4, stp2_11);
-  l12 = _mm_sub_epi16(stp2_3, stp2_12);
-  l13 = _mm_sub_epi16(stp2_2, stp2_13);
-  l14 = _mm_sub_epi16(stp2_1, stp1_14);
-  l15 = _mm_sub_epi16(stp2_0, stp1_15);
+  l[0] = _mm_add_epi16(stp2_0, stp1_15);
+  l[1] = _mm_add_epi16(stp2_1, stp1_14);
+  l[2] = _mm_add_epi16(stp2_2, stp2_13);
+  l[3] = _mm_add_epi16(stp2_3, stp2_12);
+  l[4] = _mm_add_epi16(stp2_4, stp2_11);
+  l[5] = _mm_add_epi16(stp2_5, stp2_10);
+  l[6] = _mm_add_epi16(stp2_6, stp1_9);
+  l[7] = _mm_add_epi16(stp2_7, stp1_8);
+  l[8] = _mm_sub_epi16(stp2_7, stp1_8);
+  l[9] = _mm_sub_epi16(stp2_6, stp1_9);
+  l[10] = _mm_sub_epi16(stp2_5, stp2_10);
+  l[11] = _mm_sub_epi16(stp2_4, stp2_11);
+  l[12] = _mm_sub_epi16(stp2_3, stp2_12);
+  l[13] = _mm_sub_epi16(stp2_2, stp2_13);
+  l[14] = _mm_sub_epi16(stp2_1, stp1_14);
+  l[15] = _mm_sub_epi16(stp2_0, stp1_15);
 
   // 2-D idct. We do 2 8x16 blocks.
   for (i = 0; i < 2; i++) {
-    if (i == 0)
-      TRANSPOSE_4X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4,
-                    in5, in6, in7);
-
-    if (i == 1)
-      TRANSPOSE_4X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3,
-                    in4, in5, in6, in7);
-
-    in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero;
+    array_transpose_4X8(l + 8*i, in);
+    in[8] = in[9] = in[10] = in[11] = in[12] = in[13] = in[14] = in[15] = zero;
 
     IDCT16_1D
 
     // Stage7
-    in0 = _mm_add_epi16(stp2_0, stp1_15);
-    in1 = _mm_add_epi16(stp2_1, stp1_14);
-    in2 = _mm_add_epi16(stp2_2, stp2_13);
-    in3 = _mm_add_epi16(stp2_3, stp2_12);
-    in4 = _mm_add_epi16(stp2_4, stp2_11);
-    in5 = _mm_add_epi16(stp2_5, stp2_10);
-    in6 = _mm_add_epi16(stp2_6, stp1_9);
-    in7 = _mm_add_epi16(stp2_7, stp1_8);
-    in8 = _mm_sub_epi16(stp2_7, stp1_8);
-    in9 = _mm_sub_epi16(stp2_6, stp1_9);
-    in10 = _mm_sub_epi16(stp2_5, stp2_10);
-    in11 = _mm_sub_epi16(stp2_4, stp2_11);
-    in12 = _mm_sub_epi16(stp2_3, stp2_12);
-    in13 = _mm_sub_epi16(stp2_2, stp2_13);
-    in14 = _mm_sub_epi16(stp2_1, stp1_14);
-    in15 = _mm_sub_epi16(stp2_0, stp1_15);
+    in[0] = _mm_add_epi16(stp2_0, stp1_15);
+    in[1] = _mm_add_epi16(stp2_1, stp1_14);
+    in[2] = _mm_add_epi16(stp2_2, stp2_13);
+    in[3] = _mm_add_epi16(stp2_3, stp2_12);
+    in[4] = _mm_add_epi16(stp2_4, stp2_11);
+    in[5] = _mm_add_epi16(stp2_5, stp2_10);
+    in[6] = _mm_add_epi16(stp2_6, stp1_9);
+    in[7] = _mm_add_epi16(stp2_7, stp1_8);
+    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
+    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
+    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
+    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
+    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
+    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
+    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
+    in[15] = _mm_sub_epi16(stp2_0, stp1_15);
 
     // Final rounding and shift
-    in0 = _mm_adds_epi16(in0, final_rounding);
-    in1 = _mm_adds_epi16(in1, final_rounding);
-    in2 = _mm_adds_epi16(in2, final_rounding);
-    in3 = _mm_adds_epi16(in3, final_rounding);
-    in4 = _mm_adds_epi16(in4, final_rounding);
-    in5 = _mm_adds_epi16(in5, final_rounding);
-    in6 = _mm_adds_epi16(in6, final_rounding);
-    in7 = _mm_adds_epi16(in7, final_rounding);
-    in8 = _mm_adds_epi16(in8, final_rounding);
-    in9 = _mm_adds_epi16(in9, final_rounding);
-    in10 = _mm_adds_epi16(in10, final_rounding);
-    in11 = _mm_adds_epi16(in11, final_rounding);
-    in12 = _mm_adds_epi16(in12, final_rounding);
-    in13 = _mm_adds_epi16(in13, final_rounding);
-    in14 = _mm_adds_epi16(in14, final_rounding);
-    in15 = _mm_adds_epi16(in15, final_rounding);
-
-    in0 = _mm_srai_epi16(in0, 6);
-    in1 = _mm_srai_epi16(in1, 6);
-    in2 = _mm_srai_epi16(in2, 6);
-    in3 = _mm_srai_epi16(in3, 6);
-    in4 = _mm_srai_epi16(in4, 6);
-    in5 = _mm_srai_epi16(in5, 6);
-    in6 = _mm_srai_epi16(in6, 6);
-    in7 = _mm_srai_epi16(in7, 6);
-    in8 = _mm_srai_epi16(in8, 6);
-    in9 = _mm_srai_epi16(in9, 6);
-    in10 = _mm_srai_epi16(in10, 6);
-    in11 = _mm_srai_epi16(in11, 6);
-    in12 = _mm_srai_epi16(in12, 6);
-    in13 = _mm_srai_epi16(in13, 6);
-    in14 = _mm_srai_epi16(in14, 6);
-    in15 = _mm_srai_epi16(in15, 6);
-
-    RECON_AND_STORE(dest, in0);
-    RECON_AND_STORE(dest, in1);
-    RECON_AND_STORE(dest, in2);
-    RECON_AND_STORE(dest, in3);
-    RECON_AND_STORE(dest, in4);
-    RECON_AND_STORE(dest, in5);
-    RECON_AND_STORE(dest, in6);
-    RECON_AND_STORE(dest, in7);
-    RECON_AND_STORE(dest, in8);
-    RECON_AND_STORE(dest, in9);
-    RECON_AND_STORE(dest, in10);
-    RECON_AND_STORE(dest, in11);
-    RECON_AND_STORE(dest, in12);
-    RECON_AND_STORE(dest, in13);
-    RECON_AND_STORE(dest, in14);
-    RECON_AND_STORE(dest, in15);
+    in[0] = _mm_adds_epi16(in[0], final_rounding);
+    in[1] = _mm_adds_epi16(in[1], final_rounding);
+    in[2] = _mm_adds_epi16(in[2], final_rounding);
+    in[3] = _mm_adds_epi16(in[3], final_rounding);
+    in[4] = _mm_adds_epi16(in[4], final_rounding);
+    in[5] = _mm_adds_epi16(in[5], final_rounding);
+    in[6] = _mm_adds_epi16(in[6], final_rounding);
+    in[7] = _mm_adds_epi16(in[7], final_rounding);
+    in[8] = _mm_adds_epi16(in[8], final_rounding);
+    in[9] = _mm_adds_epi16(in[9], final_rounding);
+    in[10] = _mm_adds_epi16(in[10], final_rounding);
+    in[11] = _mm_adds_epi16(in[11], final_rounding);
+    in[12] = _mm_adds_epi16(in[12], final_rounding);
+    in[13] = _mm_adds_epi16(in[13], final_rounding);
+    in[14] = _mm_adds_epi16(in[14], final_rounding);
+    in[15] = _mm_adds_epi16(in[15], final_rounding);
+
+    in[0] = _mm_srai_epi16(in[0], 6);
+    in[1] = _mm_srai_epi16(in[1], 6);
+    in[2] = _mm_srai_epi16(in[2], 6);
+    in[3] = _mm_srai_epi16(in[3], 6);
+    in[4] = _mm_srai_epi16(in[4], 6);
+    in[5] = _mm_srai_epi16(in[5], 6);
+    in[6] = _mm_srai_epi16(in[6], 6);
+    in[7] = _mm_srai_epi16(in[7], 6);
+    in[8] = _mm_srai_epi16(in[8], 6);
+    in[9] = _mm_srai_epi16(in[9], 6);
+    in[10] = _mm_srai_epi16(in[10], 6);
+    in[11] = _mm_srai_epi16(in[11], 6);
+    in[12] = _mm_srai_epi16(in[12], 6);
+    in[13] = _mm_srai_epi16(in[13], 6);
+    in[14] = _mm_srai_epi16(in[14], 6);
+    in[15] = _mm_srai_epi16(in[15], 6);
+
+    RECON_AND_STORE(dest, in[0]);
+    RECON_AND_STORE(dest, in[1]);
+    RECON_AND_STORE(dest, in[2]);
+    RECON_AND_STORE(dest, in[3]);
+    RECON_AND_STORE(dest, in[4]);
+    RECON_AND_STORE(dest, in[5]);
+    RECON_AND_STORE(dest, in[6]);
+    RECON_AND_STORE(dest, in[7]);
+    RECON_AND_STORE(dest, in[8]);
+    RECON_AND_STORE(dest, in[9]);
+    RECON_AND_STORE(dest, in[10]);
+    RECON_AND_STORE(dest, in[11]);
+    RECON_AND_STORE(dest, in[12]);
+    RECON_AND_STORE(dest, in[13]);
+    RECON_AND_STORE(dest, in[14]);
+    RECON_AND_STORE(dest, in[15]);
 
     dest += 8 - (stride * 16);
   }
@@ -2801,28 +2788,329 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
     input += 8; \
   }  \
 
+#define IDCT32_1D_34 \
+/* 32-point 1-D IDCT reading only in[0]..in[7]; other inputs are zero. */ \
+{ /* Stage1 */ \
+  const __m128i zero = _mm_setzero_si128(); \
+  const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
+  const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
+  \
+  const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \
+  const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
+  \
+  const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
+  const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
+  \
+  const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
+  const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
+  \
+  MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \
+                         stg1_1, stp1_16, stp1_31); \
+  MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \
+                         stg1_7, stp1_19, stp1_28); \
+  MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \
+                         stg1_9, stp1_20, stp1_27); \
+  MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \
+                         stg1_15, stp1_23, stp1_24); \
+} \
+\
+/* Stage2 */ \
+{ \
+  const __m128i zero = _mm_setzero_si128(); \
+  const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
+  const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
+  \
+  const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
+  const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
+  \
+  MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \
+                         stg2_1, stp2_8, stp2_15); \
+  MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \
+                         stg2_7, stp2_11, stp2_12); \
+  /* The Stage1 outputs are carried over unchanged. */ \
+  stp2_16 = stp1_16; \
+  stp2_19 = stp1_19; \
+  \
+  stp2_20 = stp1_20; \
+  stp2_23 = stp1_23; \
+  \
+  stp2_24 = stp1_24; \
+  stp2_27 = stp1_27; \
+  \
+  stp2_28 = stp1_28; \
+  stp2_31 = stp1_31; \
+} \
+\
+/* Stage3 */ \
+{ \
+  const __m128i zero = _mm_setzero_si128(); \
+  const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
+  const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
+  /* Pairs below are named after the outputs they produce. */ \
+  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
+  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
+  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
+  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
+  \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
+  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
+  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp1_24); \
+  \
+  MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \
+                         stg3_1, stp1_4, stp1_7); \
+  \
+  stp1_8 = stp2_8; \
+  stp1_11 = stp2_11; \
+  stp1_12 = stp2_12; \
+  stp1_15 = stp2_15; \
+  \
+  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
+                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
+                         stp1_18, stp1_29) \
+  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
+                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
+                         stp1_22, stp1_25) \
+  \
+  stp1_16 = stp2_16; \
+  stp1_31 = stp2_31; \
+  stp1_19 = stp2_19; \
+  stp1_20 = stp2_20; \
+  stp1_23 = stp2_23; \
+  stp1_24 = stp2_24; \
+  stp1_27 = stp2_27; \
+  stp1_28 = stp2_28; \
+} \
+\
+/* Stage4 */ \
+{ \
+  const __m128i zero = _mm_setzero_si128(); \
+  const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
+  const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
+  \
+  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
+  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
+  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
+  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
+  \
+  MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \
+                         stg4_1, stp2_0, stp2_1); \
+  /* stp1_4 and stp1_7 each feed two outputs. */ \
+  stp2_4 = stp1_4; \
+  stp2_5 = stp1_4; \
+  stp2_6 = stp1_7; \
+  stp2_7 = stp1_7; \
+  \
+  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
+                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
+                         stp2_10, stp2_13) \
+  \
+  stp2_8 = stp1_8; \
+  stp2_15 = stp1_15; \
+  stp2_11 = stp1_11; \
+  stp2_12 = stp1_12; \
+  \
+  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
+  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
+  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
+  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
+  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
+  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
+  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
+  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
+  \
+  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
+  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
+  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
+  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
+  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
+  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
+  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
+  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
+} \
+\
+/* Stage5 */ \
+{ \
+  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+  \
+  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
+  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
+  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+  \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+  /* stp2_0 and stp2_1 each feed two outputs. */ \
+  stp1_0 = stp2_0; \
+  stp1_1 = stp2_1; \
+  stp1_2 = stp2_1; \
+  stp1_3 = stp2_0; \
+  /* stp1_5, stp1_6: madd, round, shift by DCT_CONST_BITS, pack. */ \
+  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+  \
+  tmp0 = _mm_add_epi32(tmp0, rounding); \
+  tmp1 = _mm_add_epi32(tmp1, rounding); \
+  tmp2 = _mm_add_epi32(tmp2, rounding); \
+  tmp3 = _mm_add_epi32(tmp3, rounding); \
+  \
+  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+  \
+  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+  \
+  stp1_4 = stp2_4; \
+  stp1_7 = stp2_7; \
+  \
+  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
+  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
+  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
+  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
+  \
+  stp1_16 = stp2_16; \
+  stp1_17 = stp2_17; \
+  \
+  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
+                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
+                         stp1_19, stp1_28) \
+  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
+                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
+                         stp1_21, stp1_26) \
+  \
+  stp1_22 = stp2_22; \
+  stp1_23 = stp2_23; \
+  stp1_24 = stp2_24; \
+  stp1_25 = stp2_25; \
+  stp1_30 = stp2_30; \
+  stp1_31 = stp2_31; \
+} \
+\
+/* Stage6 */ \
+{ \
+  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+  \
+  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
+  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
+  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
+  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
+  \
+  stp2_8 = stp1_8; \
+  stp2_9 = stp1_9; \
+  stp2_14 = stp1_14; \
+  stp2_15 = stp1_15; \
+  \
+  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
+                         stp2_13, stp2_11, stp2_12) \
+  \
+  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
+  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
+  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
+  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
+  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
+  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
+  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
+  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
+  \
+  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
+  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
+  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
+  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
+  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
+  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
+  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
+  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
+} \
+\
+/* Stage7: leaves the results in stp1_0..stp1_31. */ \
+{ \
+  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+  \
+  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
+  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
+  \
+  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
+  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
+  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
+  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
+  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
+  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
+  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
+  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
+  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
+  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
+  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
+  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
+  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
+  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
+  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
+  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
+  \
+  stp1_16 = stp2_16; \
+  stp1_17 = stp2_17; \
+  stp1_18 = stp2_18; \
+  stp1_19 = stp2_19; \
+  \
+  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
+                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
+                         stp1_21, stp1_26) \
+  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
+                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
+                         stp1_23, stp1_24) \
+  \
+  stp1_28 = stp2_28; \
+  stp1_29 = stp2_29; \
+  stp1_30 = stp2_30; \
+  stp1_31 = stp2_31; \
+}
+
+
 #define IDCT32_1D \
 /* Stage1 */ \
 { \
-  const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); \
-  const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); \
-  const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); \
-  const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); \
-  \
-  const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); \
-  const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); \
-  const __m128i lo_25_7= _mm_unpacklo_epi16(in25, in7); \
-  const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); \
-  \
-  const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); \
-  const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); \
-  const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); \
-  const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); \
-  \
-  const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); \
-  const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); \
-  const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); \
-  const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); \
+  const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
+  const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
+  const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
+  const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
+  \
+  const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
+  const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
+  const __m128i lo_25_7= _mm_unpacklo_epi16(in[25], in[7]); \
+  const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
+  \
+  const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
+  const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
+  const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
+  const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
+  \
+  const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
+  const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
+  const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
+  const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
   \
   MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
                          stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
@@ -2840,15 +3128,15 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
 \
 /* Stage2 */ \
 { \
-  const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); \
-  const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); \
-  const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); \
-  const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); \
+  const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
+  const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
+  const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
+  const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
   \
-  const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); \
-  const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); \
-  const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); \
-  const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); \
+  const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
+  const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
+  const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
+  const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
   \
   MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
                          stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
@@ -2880,10 +3168,10 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
 \
 /* Stage3 */ \
 { \
-  const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); \
-  const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); \
-  const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); \
-  const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); \
+  const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
+  const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
+  const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
+  const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
   \
   const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
   const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
@@ -2927,10 +3215,10 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
 \
 /* Stage4 */ \
 { \
-  const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); \
-  const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); \
-  const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); \
-  const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); \
+  const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
+  const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
+  const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
+  const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
   \
   const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
   const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
@@ -3187,10 +3475,7 @@ void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
 
   const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
 
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
-          in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
-          in24, in25, in26, in27, in28, in29, in30, in31;
-  __m128i col[128];
+  __m128i in[32], col[32];
   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
           stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
           stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
@@ -3202,296 +3487,225 @@ void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
           stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
           stp2_30, stp2_31;
   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i, j, i32;
-
-  // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct.
-  for (i = 0; i < 8; i++) {
-    i32 = (i << 5);
-    if (i == 0) {
-      // First 1-D idct: first 8 rows
-      // Load input data.
-      LOAD_DQCOEFF(in0, input);
-      LOAD_DQCOEFF(in8, input);
-      LOAD_DQCOEFF(in16, input);
-      LOAD_DQCOEFF(in24, input);
-      LOAD_DQCOEFF(in1, input);
-      LOAD_DQCOEFF(in9, input);
-      LOAD_DQCOEFF(in17, input);
-      LOAD_DQCOEFF(in25, input);
-      LOAD_DQCOEFF(in2, input);
-      LOAD_DQCOEFF(in10, input);
-      LOAD_DQCOEFF(in18, input);
-      LOAD_DQCOEFF(in26, input);
-      LOAD_DQCOEFF(in3, input);
-      LOAD_DQCOEFF(in11, input);
-      LOAD_DQCOEFF(in19, input);
-      LOAD_DQCOEFF(in27, input);
-
-      LOAD_DQCOEFF(in4, input);
-      LOAD_DQCOEFF(in12, input);
-      LOAD_DQCOEFF(in20, input);
-      LOAD_DQCOEFF(in28, input);
-      LOAD_DQCOEFF(in5, input);
-      LOAD_DQCOEFF(in13, input);
-      LOAD_DQCOEFF(in21, input);
-      LOAD_DQCOEFF(in29, input);
-      LOAD_DQCOEFF(in6, input);
-      LOAD_DQCOEFF(in14, input);
-      LOAD_DQCOEFF(in22, input);
-      LOAD_DQCOEFF(in30, input);
-      LOAD_DQCOEFF(in7, input);
-      LOAD_DQCOEFF(in15, input);
-      LOAD_DQCOEFF(in23, input);
-      LOAD_DQCOEFF(in31, input);
-
-      // Transpose 32x8 block to 8x32 block
-      TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                    in4, in5, in6, in7);
-      TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
-                    in10, in11, in12, in13, in14, in15);
-      TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
-                    in18, in19, in20, in21, in22, in23);
-      TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
-                    in26, in27, in28, in29, in30, in31);
-    } else if (i < 4) {
-      // First 1-D idct: next 24 zero-coeff rows
-      col[i32 + 0] = _mm_setzero_si128();
-      col[i32 + 1] = _mm_setzero_si128();
-      col[i32 + 2] = _mm_setzero_si128();
-      col[i32 + 3] = _mm_setzero_si128();
-      col[i32 + 4] = _mm_setzero_si128();
-      col[i32 + 5] = _mm_setzero_si128();
-      col[i32 + 6] = _mm_setzero_si128();
-      col[i32 + 7] = _mm_setzero_si128();
-      col[i32 + 8] = _mm_setzero_si128();
-      col[i32 + 9] = _mm_setzero_si128();
-      col[i32 + 10] = _mm_setzero_si128();
-      col[i32 + 11] = _mm_setzero_si128();
-      col[i32 + 12] = _mm_setzero_si128();
-      col[i32 + 13] = _mm_setzero_si128();
-      col[i32 + 14] = _mm_setzero_si128();
-      col[i32 + 15] = _mm_setzero_si128();
-      col[i32 + 16] = _mm_setzero_si128();
-      col[i32 + 17] = _mm_setzero_si128();
-      col[i32 + 18] = _mm_setzero_si128();
-      col[i32 + 19] = _mm_setzero_si128();
-      col[i32 + 20] = _mm_setzero_si128();
-      col[i32 + 21] = _mm_setzero_si128();
-      col[i32 + 22] = _mm_setzero_si128();
-      col[i32 + 23] = _mm_setzero_si128();
-      col[i32 + 24] = _mm_setzero_si128();
-      col[i32 + 25] = _mm_setzero_si128();
-      col[i32 + 26] = _mm_setzero_si128();
-      col[i32 + 27] = _mm_setzero_si128();
-      col[i32 + 28] = _mm_setzero_si128();
-      col[i32 + 29] = _mm_setzero_si128();
-      col[i32 + 30] = _mm_setzero_si128();
-      col[i32 + 31] = _mm_setzero_si128();
-      continue;
-    } else {
-      // Second 1-D idct
-      j = i - 4;
-
-      // Transpose 32x8 block to 8x32 block
-      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
-                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
-                    col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
-                    in5, in6, in7);
-      j += 4;
-      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
-                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
-                    col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
-                    in11, in12, in13, in14, in15);
-      j += 4;
-      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
-                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
-                    col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
-                    in19, in20, in21, in22, in23);
-      j += 4;
-      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
-                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
-                    col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
-                    in28, in29, in30, in31);
-    }
-
-    IDCT32_1D
+  int i;
+  // Load input data.
+  LOAD_DQCOEFF(in[0], input);
+  LOAD_DQCOEFF(in[8], input);
+  LOAD_DQCOEFF(in[16], input);
+  LOAD_DQCOEFF(in[24], input);
+  LOAD_DQCOEFF(in[1], input);
+  LOAD_DQCOEFF(in[9], input);
+  LOAD_DQCOEFF(in[17], input);
+  LOAD_DQCOEFF(in[25], input);
+  LOAD_DQCOEFF(in[2], input);
+  LOAD_DQCOEFF(in[10], input);
+  LOAD_DQCOEFF(in[18], input);
+  LOAD_DQCOEFF(in[26], input);
+  LOAD_DQCOEFF(in[3], input);
+  LOAD_DQCOEFF(in[11], input);
+  LOAD_DQCOEFF(in[19], input);
+  LOAD_DQCOEFF(in[27], input);
+
+  LOAD_DQCOEFF(in[4], input);
+  LOAD_DQCOEFF(in[12], input);
+  LOAD_DQCOEFF(in[20], input);
+  LOAD_DQCOEFF(in[28], input);
+  LOAD_DQCOEFF(in[5], input);
+  LOAD_DQCOEFF(in[13], input);
+  LOAD_DQCOEFF(in[21], input);
+  LOAD_DQCOEFF(in[29], input);
+  LOAD_DQCOEFF(in[6], input);
+  LOAD_DQCOEFF(in[14], input);
+  LOAD_DQCOEFF(in[22], input);
+  LOAD_DQCOEFF(in[30], input);
+  LOAD_DQCOEFF(in[7], input);
+  LOAD_DQCOEFF(in[15], input);
+  LOAD_DQCOEFF(in[23], input);
+  LOAD_DQCOEFF(in[31], input);
 
-    // final stage
-    if (i < 4) {
-      // 1_D: Store 32 intermediate results for each 8x32 block.
-      col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
-      col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
-      col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
-      col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
-      col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
-      col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
-      col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
-      col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
-      col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
-      col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
-      col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
-      col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
-      col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
-      col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
-      col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
-      col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
-      col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
-      col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
-      col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
-      col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
-      col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
-      col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
-      col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
-      col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
-      col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
-      col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
-      col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
-      col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
-      col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
-      col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
-      col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
-      col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
-    } else {
+  array_transpose_8x8(in, in);
+  array_transpose_8x8(in+8, in+8);
+  array_transpose_8x8(in+16, in+16);
+  array_transpose_8x8(in+24, in+24);
+
+  IDCT32_1D
+
+  // 1_D: Store 32 intermediate results for each 8x32 block.
+  col[0] = _mm_add_epi16(stp1_0, stp1_31);
+  col[1] = _mm_add_epi16(stp1_1, stp1_30);
+  col[2] = _mm_add_epi16(stp1_2, stp1_29);
+  col[3] = _mm_add_epi16(stp1_3, stp1_28);
+  col[4] = _mm_add_epi16(stp1_4, stp1_27);
+  col[5] = _mm_add_epi16(stp1_5, stp1_26);
+  col[6] = _mm_add_epi16(stp1_6, stp1_25);
+  col[7] = _mm_add_epi16(stp1_7, stp1_24);
+  col[8] = _mm_add_epi16(stp1_8, stp1_23);
+  col[9] = _mm_add_epi16(stp1_9, stp1_22);
+  col[10] = _mm_add_epi16(stp1_10, stp1_21);
+  col[11] = _mm_add_epi16(stp1_11, stp1_20);
+  col[12] = _mm_add_epi16(stp1_12, stp1_19);
+  col[13] = _mm_add_epi16(stp1_13, stp1_18);
+  col[14] = _mm_add_epi16(stp1_14, stp1_17);
+  col[15] = _mm_add_epi16(stp1_15, stp1_16);
+  col[16] = _mm_sub_epi16(stp1_15, stp1_16);
+  col[17] = _mm_sub_epi16(stp1_14, stp1_17);
+  col[18] = _mm_sub_epi16(stp1_13, stp1_18);
+  col[19] = _mm_sub_epi16(stp1_12, stp1_19);
+  col[20] = _mm_sub_epi16(stp1_11, stp1_20);
+  col[21] = _mm_sub_epi16(stp1_10, stp1_21);
+  col[22] = _mm_sub_epi16(stp1_9, stp1_22);
+  col[23] = _mm_sub_epi16(stp1_8, stp1_23);
+  col[24] = _mm_sub_epi16(stp1_7, stp1_24);
+  col[25] = _mm_sub_epi16(stp1_6, stp1_25);
+  col[26] = _mm_sub_epi16(stp1_5, stp1_26);
+  col[27] = _mm_sub_epi16(stp1_4, stp1_27);
+  col[28] = _mm_sub_epi16(stp1_3, stp1_28);
+  col[29] = _mm_sub_epi16(stp1_2, stp1_29);
+  col[30] = _mm_sub_epi16(stp1_1, stp1_30);
+  col[31] = _mm_sub_epi16(stp1_0, stp1_31);
+  for (i = 0; i < 4; i++) {
       const __m128i zero = _mm_setzero_si128();
+      // Transpose the 8x8 block starting at col[i * 8] into in
+      array_transpose_8x8(col+i*8, in);
+      IDCT32_1D_34
 
       // 2_D: Calculate the results and store them to destination.
-      in0 = _mm_add_epi16(stp1_0, stp1_31);
-      in1 = _mm_add_epi16(stp1_1, stp1_30);
-      in2 = _mm_add_epi16(stp1_2, stp1_29);
-      in3 = _mm_add_epi16(stp1_3, stp1_28);
-      in4 = _mm_add_epi16(stp1_4, stp1_27);
-      in5 = _mm_add_epi16(stp1_5, stp1_26);
-      in6 = _mm_add_epi16(stp1_6, stp1_25);
-      in7 = _mm_add_epi16(stp1_7, stp1_24);
-      in8 = _mm_add_epi16(stp1_8, stp1_23);
-      in9 = _mm_add_epi16(stp1_9, stp1_22);
-      in10 = _mm_add_epi16(stp1_10, stp1_21);
-      in11 = _mm_add_epi16(stp1_11, stp1_20);
-      in12 = _mm_add_epi16(stp1_12, stp1_19);
-      in13 = _mm_add_epi16(stp1_13, stp1_18);
-      in14 = _mm_add_epi16(stp1_14, stp1_17);
-      in15 = _mm_add_epi16(stp1_15, stp1_16);
-      in16 = _mm_sub_epi16(stp1_15, stp1_16);
-      in17 = _mm_sub_epi16(stp1_14, stp1_17);
-      in18 = _mm_sub_epi16(stp1_13, stp1_18);
-      in19 = _mm_sub_epi16(stp1_12, stp1_19);
-      in20 = _mm_sub_epi16(stp1_11, stp1_20);
-      in21 = _mm_sub_epi16(stp1_10, stp1_21);
-      in22 = _mm_sub_epi16(stp1_9, stp1_22);
-      in23 = _mm_sub_epi16(stp1_8, stp1_23);
-      in24 = _mm_sub_epi16(stp1_7, stp1_24);
-      in25 = _mm_sub_epi16(stp1_6, stp1_25);
-      in26 = _mm_sub_epi16(stp1_5, stp1_26);
-      in27 = _mm_sub_epi16(stp1_4, stp1_27);
-      in28 = _mm_sub_epi16(stp1_3, stp1_28);
-      in29 = _mm_sub_epi16(stp1_2, stp1_29);
-      in30 = _mm_sub_epi16(stp1_1, stp1_30);
-      in31 = _mm_sub_epi16(stp1_0, stp1_31);
+      in[0] = _mm_add_epi16(stp1_0, stp1_31);
+      in[1] = _mm_add_epi16(stp1_1, stp1_30);
+      in[2] = _mm_add_epi16(stp1_2, stp1_29);
+      in[3] = _mm_add_epi16(stp1_3, stp1_28);
+      in[4] = _mm_add_epi16(stp1_4, stp1_27);
+      in[5] = _mm_add_epi16(stp1_5, stp1_26);
+      in[6] = _mm_add_epi16(stp1_6, stp1_25);
+      in[7] = _mm_add_epi16(stp1_7, stp1_24);
+      in[8] = _mm_add_epi16(stp1_8, stp1_23);
+      in[9] = _mm_add_epi16(stp1_9, stp1_22);
+      in[10] = _mm_add_epi16(stp1_10, stp1_21);
+      in[11] = _mm_add_epi16(stp1_11, stp1_20);
+      in[12] = _mm_add_epi16(stp1_12, stp1_19);
+      in[13] = _mm_add_epi16(stp1_13, stp1_18);
+      in[14] = _mm_add_epi16(stp1_14, stp1_17);
+      in[15] = _mm_add_epi16(stp1_15, stp1_16);
+      in[16] = _mm_sub_epi16(stp1_15, stp1_16);
+      in[17] = _mm_sub_epi16(stp1_14, stp1_17);
+      in[18] = _mm_sub_epi16(stp1_13, stp1_18);
+      in[19] = _mm_sub_epi16(stp1_12, stp1_19);
+      in[20] = _mm_sub_epi16(stp1_11, stp1_20);
+      in[21] = _mm_sub_epi16(stp1_10, stp1_21);
+      in[22] = _mm_sub_epi16(stp1_9, stp1_22);
+      in[23] = _mm_sub_epi16(stp1_8, stp1_23);
+      in[24] = _mm_sub_epi16(stp1_7, stp1_24);
+      in[25] = _mm_sub_epi16(stp1_6, stp1_25);
+      in[26] = _mm_sub_epi16(stp1_5, stp1_26);
+      in[27] = _mm_sub_epi16(stp1_4, stp1_27);
+      in[28] = _mm_sub_epi16(stp1_3, stp1_28);
+      in[29] = _mm_sub_epi16(stp1_2, stp1_29);
+      in[30] = _mm_sub_epi16(stp1_1, stp1_30);
+      in[31] = _mm_sub_epi16(stp1_0, stp1_31);
 
       // Final rounding and shift
-      in0 = _mm_adds_epi16(in0, final_rounding);
-      in1 = _mm_adds_epi16(in1, final_rounding);
-      in2 = _mm_adds_epi16(in2, final_rounding);
-      in3 = _mm_adds_epi16(in3, final_rounding);
-      in4 = _mm_adds_epi16(in4, final_rounding);
-      in5 = _mm_adds_epi16(in5, final_rounding);
-      in6 = _mm_adds_epi16(in6, final_rounding);
-      in7 = _mm_adds_epi16(in7, final_rounding);
-      in8 = _mm_adds_epi16(in8, final_rounding);
-      in9 = _mm_adds_epi16(in9, final_rounding);
-      in10 = _mm_adds_epi16(in10, final_rounding);
-      in11 = _mm_adds_epi16(in11, final_rounding);
-      in12 = _mm_adds_epi16(in12, final_rounding);
-      in13 = _mm_adds_epi16(in13, final_rounding);
-      in14 = _mm_adds_epi16(in14, final_rounding);
-      in15 = _mm_adds_epi16(in15, final_rounding);
-      in16 = _mm_adds_epi16(in16, final_rounding);
-      in17 = _mm_adds_epi16(in17, final_rounding);
-      in18 = _mm_adds_epi16(in18, final_rounding);
-      in19 = _mm_adds_epi16(in19, final_rounding);
-      in20 = _mm_adds_epi16(in20, final_rounding);
-      in21 = _mm_adds_epi16(in21, final_rounding);
-      in22 = _mm_adds_epi16(in22, final_rounding);
-      in23 = _mm_adds_epi16(in23, final_rounding);
-      in24 = _mm_adds_epi16(in24, final_rounding);
-      in25 = _mm_adds_epi16(in25, final_rounding);
-      in26 = _mm_adds_epi16(in26, final_rounding);
-      in27 = _mm_adds_epi16(in27, final_rounding);
-      in28 = _mm_adds_epi16(in28, final_rounding);
-      in29 = _mm_adds_epi16(in29, final_rounding);
-      in30 = _mm_adds_epi16(in30, final_rounding);
-      in31 = _mm_adds_epi16(in31, final_rounding);
-
-      in0 = _mm_srai_epi16(in0, 6);
-      in1 = _mm_srai_epi16(in1, 6);
-      in2 = _mm_srai_epi16(in2, 6);
-      in3 = _mm_srai_epi16(in3, 6);
-      in4 = _mm_srai_epi16(in4, 6);
-      in5 = _mm_srai_epi16(in5, 6);
-      in6 = _mm_srai_epi16(in6, 6);
-      in7 = _mm_srai_epi16(in7, 6);
-      in8 = _mm_srai_epi16(in8, 6);
-      in9 = _mm_srai_epi16(in9, 6);
-      in10 = _mm_srai_epi16(in10, 6);
-      in11 = _mm_srai_epi16(in11, 6);
-      in12 = _mm_srai_epi16(in12, 6);
-      in13 = _mm_srai_epi16(in13, 6);
-      in14 = _mm_srai_epi16(in14, 6);
-      in15 = _mm_srai_epi16(in15, 6);
-      in16 = _mm_srai_epi16(in16, 6);
-      in17 = _mm_srai_epi16(in17, 6);
-      in18 = _mm_srai_epi16(in18, 6);
-      in19 = _mm_srai_epi16(in19, 6);
-      in20 = _mm_srai_epi16(in20, 6);
-      in21 = _mm_srai_epi16(in21, 6);
-      in22 = _mm_srai_epi16(in22, 6);
-      in23 = _mm_srai_epi16(in23, 6);
-      in24 = _mm_srai_epi16(in24, 6);
-      in25 = _mm_srai_epi16(in25, 6);
-      in26 = _mm_srai_epi16(in26, 6);
-      in27 = _mm_srai_epi16(in27, 6);
-      in28 = _mm_srai_epi16(in28, 6);
-      in29 = _mm_srai_epi16(in29, 6);
-      in30 = _mm_srai_epi16(in30, 6);
-      in31 = _mm_srai_epi16(in31, 6);
-
-      RECON_AND_STORE(dest, in0);
-      RECON_AND_STORE(dest, in1);
-      RECON_AND_STORE(dest, in2);
-      RECON_AND_STORE(dest, in3);
-      RECON_AND_STORE(dest, in4);
-      RECON_AND_STORE(dest, in5);
-      RECON_AND_STORE(dest, in6);
-      RECON_AND_STORE(dest, in7);
-      RECON_AND_STORE(dest, in8);
-      RECON_AND_STORE(dest, in9);
-      RECON_AND_STORE(dest, in10);
-      RECON_AND_STORE(dest, in11);
-      RECON_AND_STORE(dest, in12);
-      RECON_AND_STORE(dest, in13);
-      RECON_AND_STORE(dest, in14);
-      RECON_AND_STORE(dest, in15);
-      RECON_AND_STORE(dest, in16);
-      RECON_AND_STORE(dest, in17);
-      RECON_AND_STORE(dest, in18);
-      RECON_AND_STORE(dest, in19);
-      RECON_AND_STORE(dest, in20);
-      RECON_AND_STORE(dest, in21);
-      RECON_AND_STORE(dest, in22);
-      RECON_AND_STORE(dest, in23);
-      RECON_AND_STORE(dest, in24);
-      RECON_AND_STORE(dest, in25);
-      RECON_AND_STORE(dest, in26);
-      RECON_AND_STORE(dest, in27);
-      RECON_AND_STORE(dest, in28);
-      RECON_AND_STORE(dest, in29);
-      RECON_AND_STORE(dest, in30);
-      RECON_AND_STORE(dest, in31);
+      in[0] = _mm_adds_epi16(in[0], final_rounding);
+      in[1] = _mm_adds_epi16(in[1], final_rounding);
+      in[2] = _mm_adds_epi16(in[2], final_rounding);
+      in[3] = _mm_adds_epi16(in[3], final_rounding);
+      in[4] = _mm_adds_epi16(in[4], final_rounding);
+      in[5] = _mm_adds_epi16(in[5], final_rounding);
+      in[6] = _mm_adds_epi16(in[6], final_rounding);
+      in[7] = _mm_adds_epi16(in[7], final_rounding);
+      in[8] = _mm_adds_epi16(in[8], final_rounding);
+      in[9] = _mm_adds_epi16(in[9], final_rounding);
+      in[10] = _mm_adds_epi16(in[10], final_rounding);
+      in[11] = _mm_adds_epi16(in[11], final_rounding);
+      in[12] = _mm_adds_epi16(in[12], final_rounding);
+      in[13] = _mm_adds_epi16(in[13], final_rounding);
+      in[14] = _mm_adds_epi16(in[14], final_rounding);
+      in[15] = _mm_adds_epi16(in[15], final_rounding);
+      in[16] = _mm_adds_epi16(in[16], final_rounding);
+      in[17] = _mm_adds_epi16(in[17], final_rounding);
+      in[18] = _mm_adds_epi16(in[18], final_rounding);
+      in[19] = _mm_adds_epi16(in[19], final_rounding);
+      in[20] = _mm_adds_epi16(in[20], final_rounding);
+      in[21] = _mm_adds_epi16(in[21], final_rounding);
+      in[22] = _mm_adds_epi16(in[22], final_rounding);
+      in[23] = _mm_adds_epi16(in[23], final_rounding);
+      in[24] = _mm_adds_epi16(in[24], final_rounding);
+      in[25] = _mm_adds_epi16(in[25], final_rounding);
+      in[26] = _mm_adds_epi16(in[26], final_rounding);
+      in[27] = _mm_adds_epi16(in[27], final_rounding);
+      in[28] = _mm_adds_epi16(in[28], final_rounding);
+      in[29] = _mm_adds_epi16(in[29], final_rounding);
+      in[30] = _mm_adds_epi16(in[30], final_rounding);
+      in[31] = _mm_adds_epi16(in[31], final_rounding);
+
+      in[0] = _mm_srai_epi16(in[0], 6);
+      in[1] = _mm_srai_epi16(in[1], 6);
+      in[2] = _mm_srai_epi16(in[2], 6);
+      in[3] = _mm_srai_epi16(in[3], 6);
+      in[4] = _mm_srai_epi16(in[4], 6);
+      in[5] = _mm_srai_epi16(in[5], 6);
+      in[6] = _mm_srai_epi16(in[6], 6);
+      in[7] = _mm_srai_epi16(in[7], 6);
+      in[8] = _mm_srai_epi16(in[8], 6);
+      in[9] = _mm_srai_epi16(in[9], 6);
+      in[10] = _mm_srai_epi16(in[10], 6);
+      in[11] = _mm_srai_epi16(in[11], 6);
+      in[12] = _mm_srai_epi16(in[12], 6);
+      in[13] = _mm_srai_epi16(in[13], 6);
+      in[14] = _mm_srai_epi16(in[14], 6);
+      in[15] = _mm_srai_epi16(in[15], 6);
+      in[16] = _mm_srai_epi16(in[16], 6);
+      in[17] = _mm_srai_epi16(in[17], 6);
+      in[18] = _mm_srai_epi16(in[18], 6);
+      in[19] = _mm_srai_epi16(in[19], 6);
+      in[20] = _mm_srai_epi16(in[20], 6);
+      in[21] = _mm_srai_epi16(in[21], 6);
+      in[22] = _mm_srai_epi16(in[22], 6);
+      in[23] = _mm_srai_epi16(in[23], 6);
+      in[24] = _mm_srai_epi16(in[24], 6);
+      in[25] = _mm_srai_epi16(in[25], 6);
+      in[26] = _mm_srai_epi16(in[26], 6);
+      in[27] = _mm_srai_epi16(in[27], 6);
+      in[28] = _mm_srai_epi16(in[28], 6);
+      in[29] = _mm_srai_epi16(in[29], 6);
+      in[30] = _mm_srai_epi16(in[30], 6);
+      in[31] = _mm_srai_epi16(in[31], 6);
+
+      RECON_AND_STORE(dest, in[0]);
+      RECON_AND_STORE(dest, in[1]);
+      RECON_AND_STORE(dest, in[2]);
+      RECON_AND_STORE(dest, in[3]);
+      RECON_AND_STORE(dest, in[4]);
+      RECON_AND_STORE(dest, in[5]);
+      RECON_AND_STORE(dest, in[6]);
+      RECON_AND_STORE(dest, in[7]);
+      RECON_AND_STORE(dest, in[8]);
+      RECON_AND_STORE(dest, in[9]);
+      RECON_AND_STORE(dest, in[10]);
+      RECON_AND_STORE(dest, in[11]);
+      RECON_AND_STORE(dest, in[12]);
+      RECON_AND_STORE(dest, in[13]);
+      RECON_AND_STORE(dest, in[14]);
+      RECON_AND_STORE(dest, in[15]);
+      RECON_AND_STORE(dest, in[16]);
+      RECON_AND_STORE(dest, in[17]);
+      RECON_AND_STORE(dest, in[18]);
+      RECON_AND_STORE(dest, in[19]);
+      RECON_AND_STORE(dest, in[20]);
+      RECON_AND_STORE(dest, in[21]);
+      RECON_AND_STORE(dest, in[22]);
+      RECON_AND_STORE(dest, in[23]);
+      RECON_AND_STORE(dest, in[24]);
+      RECON_AND_STORE(dest, in[25]);
+      RECON_AND_STORE(dest, in[26]);
+      RECON_AND_STORE(dest, in[27]);
+      RECON_AND_STORE(dest, in[28]);
+      RECON_AND_STORE(dest, in[29]);
+      RECON_AND_STORE(dest, in[30]);
+      RECON_AND_STORE(dest, in[31]);
 
       dest += 8 - (stride * 32);
     }
   }
-}
 
 void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
                                  int stride) {
@@ -3546,10 +3760,7 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
 
   const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
 
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
-          in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
-          in24, in25, in26, in27, in28, in29, in30, in31;
-  __m128i col[128];
+  __m128i in[32], col[128], zero_idx[16];
   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
           stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
           stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
@@ -3562,66 +3773,63 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
           stp2_30, stp2_31;
   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   int i, j, i32;
-  __m128i zero_idx[16];
   int zero_flag[2];
 
-  // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct.
-  for (i = 0; i < 8; i++) {
+  for (i = 0; i < 4; i++) {
     i32 = (i << 5);
-    if (i < 4) {
       // First 1-D idct
       // Load input data.
-      LOAD_DQCOEFF(in0, input);
-      LOAD_DQCOEFF(in8, input);
-      LOAD_DQCOEFF(in16, input);
-      LOAD_DQCOEFF(in24, input);
-      LOAD_DQCOEFF(in1, input);
-      LOAD_DQCOEFF(in9, input);
-      LOAD_DQCOEFF(in17, input);
-      LOAD_DQCOEFF(in25, input);
-      LOAD_DQCOEFF(in2, input);
-      LOAD_DQCOEFF(in10, input);
-      LOAD_DQCOEFF(in18, input);
-      LOAD_DQCOEFF(in26, input);
-      LOAD_DQCOEFF(in3, input);
-      LOAD_DQCOEFF(in11, input);
-      LOAD_DQCOEFF(in19, input);
-      LOAD_DQCOEFF(in27, input);
-
-      LOAD_DQCOEFF(in4, input);
-      LOAD_DQCOEFF(in12, input);
-      LOAD_DQCOEFF(in20, input);
-      LOAD_DQCOEFF(in28, input);
-      LOAD_DQCOEFF(in5, input);
-      LOAD_DQCOEFF(in13, input);
-      LOAD_DQCOEFF(in21, input);
-      LOAD_DQCOEFF(in29, input);
-      LOAD_DQCOEFF(in6, input);
-      LOAD_DQCOEFF(in14, input);
-      LOAD_DQCOEFF(in22, input);
-      LOAD_DQCOEFF(in30, input);
-      LOAD_DQCOEFF(in7, input);
-      LOAD_DQCOEFF(in15, input);
-      LOAD_DQCOEFF(in23, input);
-      LOAD_DQCOEFF(in31, input);
+      LOAD_DQCOEFF(in[0], input);
+      LOAD_DQCOEFF(in[8], input);
+      LOAD_DQCOEFF(in[16], input);
+      LOAD_DQCOEFF(in[24], input);
+      LOAD_DQCOEFF(in[1], input);
+      LOAD_DQCOEFF(in[9], input);
+      LOAD_DQCOEFF(in[17], input);
+      LOAD_DQCOEFF(in[25], input);
+      LOAD_DQCOEFF(in[2], input);
+      LOAD_DQCOEFF(in[10], input);
+      LOAD_DQCOEFF(in[18], input);
+      LOAD_DQCOEFF(in[26], input);
+      LOAD_DQCOEFF(in[3], input);
+      LOAD_DQCOEFF(in[11], input);
+      LOAD_DQCOEFF(in[19], input);
+      LOAD_DQCOEFF(in[27], input);
+
+      LOAD_DQCOEFF(in[4], input);
+      LOAD_DQCOEFF(in[12], input);
+      LOAD_DQCOEFF(in[20], input);
+      LOAD_DQCOEFF(in[28], input);
+      LOAD_DQCOEFF(in[5], input);
+      LOAD_DQCOEFF(in[13], input);
+      LOAD_DQCOEFF(in[21], input);
+      LOAD_DQCOEFF(in[29], input);
+      LOAD_DQCOEFF(in[6], input);
+      LOAD_DQCOEFF(in[14], input);
+      LOAD_DQCOEFF(in[22], input);
+      LOAD_DQCOEFF(in[30], input);
+      LOAD_DQCOEFF(in[7], input);
+      LOAD_DQCOEFF(in[15], input);
+      LOAD_DQCOEFF(in[23], input);
+      LOAD_DQCOEFF(in[31], input);
 
       // checking if all entries are zero
-      zero_idx[0] = _mm_or_si128(in0, in1);
-      zero_idx[1] = _mm_or_si128(in2, in3);
-      zero_idx[2] = _mm_or_si128(in4, in5);
-      zero_idx[3] = _mm_or_si128(in6, in7);
-      zero_idx[4] = _mm_or_si128(in8, in9);
-      zero_idx[5] = _mm_or_si128(in10, in11);
-      zero_idx[6] = _mm_or_si128(in12, in13);
-      zero_idx[7] = _mm_or_si128(in14, in15);
-      zero_idx[8] = _mm_or_si128(in16, in17);
-      zero_idx[9] = _mm_or_si128(in18, in19);
-      zero_idx[10] = _mm_or_si128(in20, in21);
-      zero_idx[11] = _mm_or_si128(in22, in23);
-      zero_idx[12] = _mm_or_si128(in24, in25);
-      zero_idx[13] = _mm_or_si128(in26, in27);
-      zero_idx[14] = _mm_or_si128(in28, in29);
-      zero_idx[15] = _mm_or_si128(in30, in31);
+      zero_idx[0] = _mm_or_si128(in[0], in[1]);
+      zero_idx[1] = _mm_or_si128(in[2], in[3]);
+      zero_idx[2] = _mm_or_si128(in[4], in[5]);
+      zero_idx[3] = _mm_or_si128(in[6], in[7]);
+      zero_idx[4] = _mm_or_si128(in[8], in[9]);
+      zero_idx[5] = _mm_or_si128(in[10], in[11]);
+      zero_idx[6] = _mm_or_si128(in[12], in[13]);
+      zero_idx[7] = _mm_or_si128(in[14], in[15]);
+      zero_idx[8] = _mm_or_si128(in[16], in[17]);
+      zero_idx[9] = _mm_or_si128(in[18], in[19]);
+      zero_idx[10] = _mm_or_si128(in[20], in[21]);
+      zero_idx[11] = _mm_or_si128(in[22], in[23]);
+      zero_idx[12] = _mm_or_si128(in[24], in[25]);
+      zero_idx[13] = _mm_or_si128(in[26], in[27]);
+      zero_idx[14] = _mm_or_si128(in[28], in[29]);
+      zero_idx[15] = _mm_or_si128(in[30], in[31]);
 
       zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
       zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
@@ -3683,44 +3891,13 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
       }
 
       // Transpose 32x8 block to 8x32 block
-      TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                    in4, in5, in6, in7);
-      TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
-                    in10, in11, in12, in13, in14, in15);
-      TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
-                    in18, in19, in20, in21, in22, in23);
-      TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
-                    in26, in27, in28, in29, in30, in31);
-    } else {
-      // Second 1-D idct
-      j = i - 4;
+      array_transpose_8x8(in, in);
+      array_transpose_8x8(in+8, in+8);
+      array_transpose_8x8(in+16, in+16);
+      array_transpose_8x8(in+24, in+24);
 
-      // Transpose 32x8 block to 8x32 block
-      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
-                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
-                    col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
-                    in5, in6, in7);
-      j += 4;
-      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
-                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
-                    col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
-                    in11, in12, in13, in14, in15);
-      j += 4;
-      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
-                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
-                    col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
-                    in19, in20, in21, in22, in23);
-      j += 4;
-      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
-                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
-                    col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
-                    in28, in29, in30, in31);
-    }
-
-    IDCT32_1D
+      IDCT32_1D
 
-    // final stage
-    if (i < 4) {
       // 1_D: Store 32 intermediate results for each 8x32 block.
       col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
       col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
@@ -3754,146 +3931,156 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
       col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
       col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
       col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
-    } else {
+    }
+  for (i = 0; i < 4; i++) {
       const __m128i zero = _mm_setzero_si128();
+      // Second 1-D idct
+      j = i << 3;
+
+      // Transpose 32x8 block to 8x32 block
+      array_transpose_8x8(col+j, in);
+      array_transpose_8x8(col+j+32, in+8);
+      array_transpose_8x8(col+j+64, in+16);
+      array_transpose_8x8(col+j+96, in+24);
+
+      IDCT32_1D
 
       // 2_D: Calculate the results and store them to destination.
-      in0 = _mm_add_epi16(stp1_0, stp1_31);
-      in1 = _mm_add_epi16(stp1_1, stp1_30);
-      in2 = _mm_add_epi16(stp1_2, stp1_29);
-      in3 = _mm_add_epi16(stp1_3, stp1_28);
-      in4 = _mm_add_epi16(stp1_4, stp1_27);
-      in5 = _mm_add_epi16(stp1_5, stp1_26);
-      in6 = _mm_add_epi16(stp1_6, stp1_25);
-      in7 = _mm_add_epi16(stp1_7, stp1_24);
-      in8 = _mm_add_epi16(stp1_8, stp1_23);
-      in9 = _mm_add_epi16(stp1_9, stp1_22);
-      in10 = _mm_add_epi16(stp1_10, stp1_21);
-      in11 = _mm_add_epi16(stp1_11, stp1_20);
-      in12 = _mm_add_epi16(stp1_12, stp1_19);
-      in13 = _mm_add_epi16(stp1_13, stp1_18);
-      in14 = _mm_add_epi16(stp1_14, stp1_17);
-      in15 = _mm_add_epi16(stp1_15, stp1_16);
-      in16 = _mm_sub_epi16(stp1_15, stp1_16);
-      in17 = _mm_sub_epi16(stp1_14, stp1_17);
-      in18 = _mm_sub_epi16(stp1_13, stp1_18);
-      in19 = _mm_sub_epi16(stp1_12, stp1_19);
-      in20 = _mm_sub_epi16(stp1_11, stp1_20);
-      in21 = _mm_sub_epi16(stp1_10, stp1_21);
-      in22 = _mm_sub_epi16(stp1_9, stp1_22);
-      in23 = _mm_sub_epi16(stp1_8, stp1_23);
-      in24 = _mm_sub_epi16(stp1_7, stp1_24);
-      in25 = _mm_sub_epi16(stp1_6, stp1_25);
-      in26 = _mm_sub_epi16(stp1_5, stp1_26);
-      in27 = _mm_sub_epi16(stp1_4, stp1_27);
-      in28 = _mm_sub_epi16(stp1_3, stp1_28);
-      in29 = _mm_sub_epi16(stp1_2, stp1_29);
-      in30 = _mm_sub_epi16(stp1_1, stp1_30);
-      in31 = _mm_sub_epi16(stp1_0, stp1_31);
+      in[0] = _mm_add_epi16(stp1_0, stp1_31);
+      in[1] = _mm_add_epi16(stp1_1, stp1_30);
+      in[2] = _mm_add_epi16(stp1_2, stp1_29);
+      in[3] = _mm_add_epi16(stp1_3, stp1_28);
+      in[4] = _mm_add_epi16(stp1_4, stp1_27);
+      in[5] = _mm_add_epi16(stp1_5, stp1_26);
+      in[6] = _mm_add_epi16(stp1_6, stp1_25);
+      in[7] = _mm_add_epi16(stp1_7, stp1_24);
+      in[8] = _mm_add_epi16(stp1_8, stp1_23);
+      in[9] = _mm_add_epi16(stp1_9, stp1_22);
+      in[10] = _mm_add_epi16(stp1_10, stp1_21);
+      in[11] = _mm_add_epi16(stp1_11, stp1_20);
+      in[12] = _mm_add_epi16(stp1_12, stp1_19);
+      in[13] = _mm_add_epi16(stp1_13, stp1_18);
+      in[14] = _mm_add_epi16(stp1_14, stp1_17);
+      in[15] = _mm_add_epi16(stp1_15, stp1_16);
+      in[16] = _mm_sub_epi16(stp1_15, stp1_16);
+      in[17] = _mm_sub_epi16(stp1_14, stp1_17);
+      in[18] = _mm_sub_epi16(stp1_13, stp1_18);
+      in[19] = _mm_sub_epi16(stp1_12, stp1_19);
+      in[20] = _mm_sub_epi16(stp1_11, stp1_20);
+      in[21] = _mm_sub_epi16(stp1_10, stp1_21);
+      in[22] = _mm_sub_epi16(stp1_9, stp1_22);
+      in[23] = _mm_sub_epi16(stp1_8, stp1_23);
+      in[24] = _mm_sub_epi16(stp1_7, stp1_24);
+      in[25] = _mm_sub_epi16(stp1_6, stp1_25);
+      in[26] = _mm_sub_epi16(stp1_5, stp1_26);
+      in[27] = _mm_sub_epi16(stp1_4, stp1_27);
+      in[28] = _mm_sub_epi16(stp1_3, stp1_28);
+      in[29] = _mm_sub_epi16(stp1_2, stp1_29);
+      in[30] = _mm_sub_epi16(stp1_1, stp1_30);
+      in[31] = _mm_sub_epi16(stp1_0, stp1_31);
 
       // Final rounding and shift
-      in0 = _mm_adds_epi16(in0, final_rounding);
-      in1 = _mm_adds_epi16(in1, final_rounding);
-      in2 = _mm_adds_epi16(in2, final_rounding);
-      in3 = _mm_adds_epi16(in3, final_rounding);
-      in4 = _mm_adds_epi16(in4, final_rounding);
-      in5 = _mm_adds_epi16(in5, final_rounding);
-      in6 = _mm_adds_epi16(in6, final_rounding);
-      in7 = _mm_adds_epi16(in7, final_rounding);
-      in8 = _mm_adds_epi16(in8, final_rounding);
-      in9 = _mm_adds_epi16(in9, final_rounding);
-      in10 = _mm_adds_epi16(in10, final_rounding);
-      in11 = _mm_adds_epi16(in11, final_rounding);
-      in12 = _mm_adds_epi16(in12, final_rounding);
-      in13 = _mm_adds_epi16(in13, final_rounding);
-      in14 = _mm_adds_epi16(in14, final_rounding);
-      in15 = _mm_adds_epi16(in15, final_rounding);
-      in16 = _mm_adds_epi16(in16, final_rounding);
-      in17 = _mm_adds_epi16(in17, final_rounding);
-      in18 = _mm_adds_epi16(in18, final_rounding);
-      in19 = _mm_adds_epi16(in19, final_rounding);
-      in20 = _mm_adds_epi16(in20, final_rounding);
-      in21 = _mm_adds_epi16(in21, final_rounding);
-      in22 = _mm_adds_epi16(in22, final_rounding);
-      in23 = _mm_adds_epi16(in23, final_rounding);
-      in24 = _mm_adds_epi16(in24, final_rounding);
-      in25 = _mm_adds_epi16(in25, final_rounding);
-      in26 = _mm_adds_epi16(in26, final_rounding);
-      in27 = _mm_adds_epi16(in27, final_rounding);
-      in28 = _mm_adds_epi16(in28, final_rounding);
-      in29 = _mm_adds_epi16(in29, final_rounding);
-      in30 = _mm_adds_epi16(in30, final_rounding);
-      in31 = _mm_adds_epi16(in31, final_rounding);
-
-      in0 = _mm_srai_epi16(in0, 6);
-      in1 = _mm_srai_epi16(in1, 6);
-      in2 = _mm_srai_epi16(in2, 6);
-      in3 = _mm_srai_epi16(in3, 6);
-      in4 = _mm_srai_epi16(in4, 6);
-      in5 = _mm_srai_epi16(in5, 6);
-      in6 = _mm_srai_epi16(in6, 6);
-      in7 = _mm_srai_epi16(in7, 6);
-      in8 = _mm_srai_epi16(in8, 6);
-      in9 = _mm_srai_epi16(in9, 6);
-      in10 = _mm_srai_epi16(in10, 6);
-      in11 = _mm_srai_epi16(in11, 6);
-      in12 = _mm_srai_epi16(in12, 6);
-      in13 = _mm_srai_epi16(in13, 6);
-      in14 = _mm_srai_epi16(in14, 6);
-      in15 = _mm_srai_epi16(in15, 6);
-      in16 = _mm_srai_epi16(in16, 6);
-      in17 = _mm_srai_epi16(in17, 6);
-      in18 = _mm_srai_epi16(in18, 6);
-      in19 = _mm_srai_epi16(in19, 6);
-      in20 = _mm_srai_epi16(in20, 6);
-      in21 = _mm_srai_epi16(in21, 6);
-      in22 = _mm_srai_epi16(in22, 6);
-      in23 = _mm_srai_epi16(in23, 6);
-      in24 = _mm_srai_epi16(in24, 6);
-      in25 = _mm_srai_epi16(in25, 6);
-      in26 = _mm_srai_epi16(in26, 6);
-      in27 = _mm_srai_epi16(in27, 6);
-      in28 = _mm_srai_epi16(in28, 6);
-      in29 = _mm_srai_epi16(in29, 6);
-      in30 = _mm_srai_epi16(in30, 6);
-      in31 = _mm_srai_epi16(in31, 6);
-
-      RECON_AND_STORE(dest, in0);
-      RECON_AND_STORE(dest, in1);
-      RECON_AND_STORE(dest, in2);
-      RECON_AND_STORE(dest, in3);
-      RECON_AND_STORE(dest, in4);
-      RECON_AND_STORE(dest, in5);
-      RECON_AND_STORE(dest, in6);
-      RECON_AND_STORE(dest, in7);
-      RECON_AND_STORE(dest, in8);
-      RECON_AND_STORE(dest, in9);
-      RECON_AND_STORE(dest, in10);
-      RECON_AND_STORE(dest, in11);
-      RECON_AND_STORE(dest, in12);
-      RECON_AND_STORE(dest, in13);
-      RECON_AND_STORE(dest, in14);
-      RECON_AND_STORE(dest, in15);
-      RECON_AND_STORE(dest, in16);
-      RECON_AND_STORE(dest, in17);
-      RECON_AND_STORE(dest, in18);
-      RECON_AND_STORE(dest, in19);
-      RECON_AND_STORE(dest, in20);
-      RECON_AND_STORE(dest, in21);
-      RECON_AND_STORE(dest, in22);
-      RECON_AND_STORE(dest, in23);
-      RECON_AND_STORE(dest, in24);
-      RECON_AND_STORE(dest, in25);
-      RECON_AND_STORE(dest, in26);
-      RECON_AND_STORE(dest, in27);
-      RECON_AND_STORE(dest, in28);
-      RECON_AND_STORE(dest, in29);
-      RECON_AND_STORE(dest, in30);
-      RECON_AND_STORE(dest, in31);
+      in[0] = _mm_adds_epi16(in[0], final_rounding);
+      in[1] = _mm_adds_epi16(in[1], final_rounding);
+      in[2] = _mm_adds_epi16(in[2], final_rounding);
+      in[3] = _mm_adds_epi16(in[3], final_rounding);
+      in[4] = _mm_adds_epi16(in[4], final_rounding);
+      in[5] = _mm_adds_epi16(in[5], final_rounding);
+      in[6] = _mm_adds_epi16(in[6], final_rounding);
+      in[7] = _mm_adds_epi16(in[7], final_rounding);
+      in[8] = _mm_adds_epi16(in[8], final_rounding);
+      in[9] = _mm_adds_epi16(in[9], final_rounding);
+      in[10] = _mm_adds_epi16(in[10], final_rounding);
+      in[11] = _mm_adds_epi16(in[11], final_rounding);
+      in[12] = _mm_adds_epi16(in[12], final_rounding);
+      in[13] = _mm_adds_epi16(in[13], final_rounding);
+      in[14] = _mm_adds_epi16(in[14], final_rounding);
+      in[15] = _mm_adds_epi16(in[15], final_rounding);
+      in[16] = _mm_adds_epi16(in[16], final_rounding);
+      in[17] = _mm_adds_epi16(in[17], final_rounding);
+      in[18] = _mm_adds_epi16(in[18], final_rounding);
+      in[19] = _mm_adds_epi16(in[19], final_rounding);
+      in[20] = _mm_adds_epi16(in[20], final_rounding);
+      in[21] = _mm_adds_epi16(in[21], final_rounding);
+      in[22] = _mm_adds_epi16(in[22], final_rounding);
+      in[23] = _mm_adds_epi16(in[23], final_rounding);
+      in[24] = _mm_adds_epi16(in[24], final_rounding);
+      in[25] = _mm_adds_epi16(in[25], final_rounding);
+      in[26] = _mm_adds_epi16(in[26], final_rounding);
+      in[27] = _mm_adds_epi16(in[27], final_rounding);
+      in[28] = _mm_adds_epi16(in[28], final_rounding);
+      in[29] = _mm_adds_epi16(in[29], final_rounding);
+      in[30] = _mm_adds_epi16(in[30], final_rounding);
+      in[31] = _mm_adds_epi16(in[31], final_rounding);
+
+      in[0] = _mm_srai_epi16(in[0], 6);
+      in[1] = _mm_srai_epi16(in[1], 6);
+      in[2] = _mm_srai_epi16(in[2], 6);
+      in[3] = _mm_srai_epi16(in[3], 6);
+      in[4] = _mm_srai_epi16(in[4], 6);
+      in[5] = _mm_srai_epi16(in[5], 6);
+      in[6] = _mm_srai_epi16(in[6], 6);
+      in[7] = _mm_srai_epi16(in[7], 6);
+      in[8] = _mm_srai_epi16(in[8], 6);
+      in[9] = _mm_srai_epi16(in[9], 6);
+      in[10] = _mm_srai_epi16(in[10], 6);
+      in[11] = _mm_srai_epi16(in[11], 6);
+      in[12] = _mm_srai_epi16(in[12], 6);
+      in[13] = _mm_srai_epi16(in[13], 6);
+      in[14] = _mm_srai_epi16(in[14], 6);
+      in[15] = _mm_srai_epi16(in[15], 6);
+      in[16] = _mm_srai_epi16(in[16], 6);
+      in[17] = _mm_srai_epi16(in[17], 6);
+      in[18] = _mm_srai_epi16(in[18], 6);
+      in[19] = _mm_srai_epi16(in[19], 6);
+      in[20] = _mm_srai_epi16(in[20], 6);
+      in[21] = _mm_srai_epi16(in[21], 6);
+      in[22] = _mm_srai_epi16(in[22], 6);
+      in[23] = _mm_srai_epi16(in[23], 6);
+      in[24] = _mm_srai_epi16(in[24], 6);
+      in[25] = _mm_srai_epi16(in[25], 6);
+      in[26] = _mm_srai_epi16(in[26], 6);
+      in[27] = _mm_srai_epi16(in[27], 6);
+      in[28] = _mm_srai_epi16(in[28], 6);
+      in[29] = _mm_srai_epi16(in[29], 6);
+      in[30] = _mm_srai_epi16(in[30], 6);
+      in[31] = _mm_srai_epi16(in[31], 6);
+
+      RECON_AND_STORE(dest, in[0]);
+      RECON_AND_STORE(dest, in[1]);
+      RECON_AND_STORE(dest, in[2]);
+      RECON_AND_STORE(dest, in[3]);
+      RECON_AND_STORE(dest, in[4]);
+      RECON_AND_STORE(dest, in[5]);
+      RECON_AND_STORE(dest, in[6]);
+      RECON_AND_STORE(dest, in[7]);
+      RECON_AND_STORE(dest, in[8]);
+      RECON_AND_STORE(dest, in[9]);
+      RECON_AND_STORE(dest, in[10]);
+      RECON_AND_STORE(dest, in[11]);
+      RECON_AND_STORE(dest, in[12]);
+      RECON_AND_STORE(dest, in[13]);
+      RECON_AND_STORE(dest, in[14]);
+      RECON_AND_STORE(dest, in[15]);
+      RECON_AND_STORE(dest, in[16]);
+      RECON_AND_STORE(dest, in[17]);
+      RECON_AND_STORE(dest, in[18]);
+      RECON_AND_STORE(dest, in[19]);
+      RECON_AND_STORE(dest, in[20]);
+      RECON_AND_STORE(dest, in[21]);
+      RECON_AND_STORE(dest, in[22]);
+      RECON_AND_STORE(dest, in[23]);
+      RECON_AND_STORE(dest, in[24]);
+      RECON_AND_STORE(dest, in[25]);
+      RECON_AND_STORE(dest, in[26]);
+      RECON_AND_STORE(dest, in[27]);
+      RECON_AND_STORE(dest, in[28]);
+      RECON_AND_STORE(dest, in[29]);
+      RECON_AND_STORE(dest, in[30]);
+      RECON_AND_STORE(dest, in[31]);
 
       dest += 8 - (stride * 32);
     }
-  }
 }  //NOLINT
 
 void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
diff --git a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
index 925f74d19..3ca55cfc3 100644
--- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
+++ b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <emmintrin.h>  /* SSE2 */
+#include <emmintrin.h>  // SSE2
 #include "vp9/common/vp9_loopfilter.h"
 #include "vpx_ports/emmintrin_compat.h"
 
@@ -99,7 +99,7 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
-    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+    // (vp9_filter + 3 * (qs0 - ps0)) & mask
     filt = _mm_and_si128(filt, mask);
 
     filter1 = _mm_adds_epi8(filt, t4);
@@ -110,11 +110,11 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
     filter2 = _mm_unpacklo_epi8(zero, filter2);
     filter2 = _mm_srai_epi16(filter2, 0xB);
 
-    /* Filter1 >> 3 */
+    // Filter1 >> 3
     filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
     qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
 
-    /* filt >> 1 */
+    // filt >> 1
     filt = _mm_adds_epi16(filter1, t1);
     filt = _mm_srai_epi16(filt, 1);
     filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
@@ -473,13 +473,13 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
-    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+    // (vp9_filter + 3 * (qs0 - ps0)) & mask
     filt = _mm_and_si128(filt, mask);
 
     filter1 = _mm_adds_epi8(filt, t4);
     filter2 = _mm_adds_epi8(filt, t3);
 
-    /* Filter1 >> 3 */
+    // Filter1 >> 3
     work_a = _mm_cmpgt_epi8(zero, filter1);
     filter1 = _mm_srli_epi16(filter1, 3);
     work_a = _mm_and_si128(work_a, te0);
@@ -487,7 +487,7 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
     filter1 = _mm_or_si128(filter1, work_a);
     qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
 
-    /* Filter2 >> 3 */
+    // Filter2 >> 3
     work_a = _mm_cmpgt_epi8(zero, filter2);
     filter2 = _mm_srli_epi16(filter2, 3);
     work_a = _mm_and_si128(work_a, te0);
@@ -495,7 +495,7 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
     filter2 = _mm_or_si128(filter2, work_a);
     ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
 
-    /* filt >> 1 */
+    // filt >> 1
     filt = _mm_adds_epi8(filter1, t1);
     work_a = _mm_cmpgt_epi8(zero, filt);
     filt = _mm_srli_epi16(filt, 1);
@@ -1014,23 +1014,23 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
-    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+    // (vp9_filter + 3 * (qs0 - ps0)) & mask
     filt = _mm_and_si128(filt, mask);
 
     filter1 = _mm_adds_epi8(filt, t4);
     filter2 = _mm_adds_epi8(filt, t3);
 
-    /* Filter1 >> 3 */
+    // Filter1 >> 3
     filter1 = _mm_unpacklo_epi8(zero, filter1);
     filter1 = _mm_srai_epi16(filter1, 11);
     filter1 = _mm_packs_epi16(filter1, filter1);
 
-    /* Filter2 >> 3 */
+    // Filter2 >> 3
     filter2 = _mm_unpacklo_epi8(zero, filter2);
     filter2 = _mm_srai_epi16(filter2, 11);
     filter2 = _mm_packs_epi16(filter2, zero);
 
-    /* filt >> 1 */
+    // filt >> 1
     filt = _mm_adds_epi8(filter1, t1);
     filt = _mm_unpacklo_epi8(zero, filt);
     filt = _mm_srai_epi16(filt, 9);
@@ -1083,7 +1083,7 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
   }
 }
 
-void vp9_mbloop_filter_horizontal_edge_16_sse2(uint8_t *s, int p /* pitch */,
+void vp9_mbloop_filter_horizontal_edge_16_sse2(uint8_t *s, int p,
                                                const uint8_t *_blimit0,
                                                const uint8_t *_limit0,
                                                const uint8_t *_thresh0,
@@ -1255,27 +1255,27 @@ void vp9_mbloop_filter_horizontal_edge_16_sse2(uint8_t *s, int p /* pitch */,
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
-    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+    // (vp9_filter + 3 * (qs0 - ps0)) & mask
     filt = _mm_and_si128(filt, mask);
 
     filter1 = _mm_adds_epi8(filt, t4);
     filter2 = _mm_adds_epi8(filt, t3);
 
-    /* Filter1 >> 3 */
+    // Filter1 >> 3
     work_a = _mm_cmpgt_epi8(zero, filter1);
     filter1 = _mm_srli_epi16(filter1, 3);
     work_a = _mm_and_si128(work_a, te0);
     filter1 = _mm_and_si128(filter1, t1f);
     filter1 = _mm_or_si128(filter1, work_a);
 
-    /* Filter2 >> 3 */
+    // Filter2 >> 3
     work_a = _mm_cmpgt_epi8(zero, filter2);
     filter2 = _mm_srli_epi16(filter2, 3);
     work_a = _mm_and_si128(work_a, te0);
     filter2 = _mm_and_si128(filter2, t1f);
     filter2 = _mm_or_si128(filter2, work_a);
 
-    /* filt >> 1 */
+    // filt >> 1
     filt = _mm_adds_epi8(filter1, t1);
     work_a = _mm_cmpgt_epi8(zero, filt);
     filt = _mm_srli_epi16(filt, 1);
@@ -1427,27 +1427,27 @@ void vp9_loop_filter_horizontal_edge_16_sse2(unsigned char *s,
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
-    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+    // (vp9_filter + 3 * (qs0 - ps0)) & mask
     filt = _mm_and_si128(filt, mask);
 
     filter1 = _mm_adds_epi8(filt, t4);
     filter2 = _mm_adds_epi8(filt, t3);
 
-    /* Filter1 >> 3 */
+    // Filter1 >> 3
     work_a = _mm_cmpgt_epi8(zero, filter1);
     filter1 = _mm_srli_epi16(filter1, 3);
     work_a = _mm_and_si128(work_a, te0);
     filter1 = _mm_and_si128(filter1, t1f);
     filter1 = _mm_or_si128(filter1, work_a);
 
-    /* Filter2 >> 3 */
+    // Filter2 >> 3
     work_a = _mm_cmpgt_epi8(zero, filter2);
     filter2 = _mm_srli_epi16(filter2, 3);
     work_a = _mm_and_si128(work_a, te0);
     filter2 = _mm_and_si128(filter2, t1f);
     filter2 = _mm_or_si128(filter2, work_a);
 
-    /* filt >> 1 */
+    // filt >> 1
     filt = _mm_adds_epi8(filter1, t1);
     work_a = _mm_cmpgt_epi8(zero, filt);
     filt = _mm_srli_epi16(filt, 1);
@@ -1474,7 +1474,7 @@ static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
   __m128i x0, x1, x2, x3, x4, x5, x6, x7;
   __m128i x8, x9, x10, x11, x12, x13, x14, x15;
 
-  /* Read in 16 lines */
+  // Read in 16 lines
   x0 = _mm_loadl_epi64((__m128i *)in0);
   x8 = _mm_loadl_epi64((__m128i *)in1);
   x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));
@@ -1512,7 +1512,7 @@ static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
   x14 = _mm_unpacklo_epi32(x12, x13);
   x15 = _mm_unpackhi_epi32(x12, x13);
 
-  /* Store first 4-line result */
+  // Store first 4-line result
   _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
   _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
   _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
@@ -1528,7 +1528,7 @@ static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
   x14 = _mm_unpacklo_epi32(x12, x13);
   x15 = _mm_unpackhi_epi32(x12, x13);
 
-  /* Store second 4-line result */
+  // Store second 4-line result
   _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
   _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
   _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
@@ -1598,61 +1598,129 @@ static INLINE void transpose(unsigned char *src[], int in_p,
   } while (++idx8x8 < num_8x8_to_transpose);
 }
 
-void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s,
-                                          int p,
+void vp9_loop_filter_vertical_edge_16_sse2(uint8_t *s, int p,
+                                           const uint8_t *blimit0,
+                                           const uint8_t *limit0,
+                                           const uint8_t *thresh0,
+                                           const uint8_t *blimit1,
+                                           const uint8_t *limit1,
+                                           const uint8_t *thresh1) {
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8);  // 8 rows, pitch 16
+  unsigned char *src[2];
+  unsigned char *dst[2];
+
+  // Transpose 8x16: 16 rows of 8 bytes starting at s - 4 go into t_dst.
+  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+
+  // Loop filtering, as a horizontal edge at row 4 of the transposed copy.
+  vp9_loop_filter_horizontal_edge_16_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
+                                          thresh0, blimit1, limit1, thresh1);
+  src[0] = t_dst;
+  src[1] = t_dst + 8;
+  dst[0] = s - 4;
+  dst[1] = s - 4 + p * 8;
+
+  // Transpose back: two 8x8 tiles of t_dst return to s - 4 and 8 rows below.
+  transpose(src, 16, dst, p, 2);
+}
+
+void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s, int p,
                                           const unsigned char *blimit,
                                           const unsigned char *limit,
                                           const unsigned char *thresh,
                                           int count) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
+  DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 8);
+  unsigned char *src[1];
+  unsigned char *dst[1];
+  (void)count;  // unused: exactly one 8x8 tile is transposed and filtered
+
+  // Transpose 8x8: the tile starting at s - 4 is copied into t_dst (pitch 8).
+  src[0] = s - 4;
+  dst[0] = t_dst;
+
+  transpose(src, p, dst, 8, 1);
+
+  // Loop filtering, as a horizontal edge at row 4 of the transposed tile.
+  vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 4 * 8, 8, blimit, limit,
+                                         thresh, 1);
+
+  src[0] = t_dst;
+  dst[0] = s - 4;
+
+  // Transpose back: the filtered tile overwrites the region it was read from.
+  transpose(src, 8, dst, p, 1);
+}
+
+void vp9_mbloop_filter_vertical_edge_16_sse2(uint8_t *s, int p,
+                                             const uint8_t *blimit0,
+                                             const uint8_t *limit0,
+                                             const uint8_t *thresh0,
+                                             const uint8_t *blimit1,
+                                             const uint8_t *limit1,
+                                             const uint8_t *thresh1) {
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8);
   unsigned char *src[2];
   unsigned char *dst[2];
 
-  (void)count;
-  /* Transpose 16x16 */
-  transpose8x16(s - 8, s - 8 + p * 8, p, t_dst, 16);
-  transpose8x16(s, s + p * 8, p, t_dst + 16 * 8, 16);
+  // Transpose 8x16: 16 rows of 8 bytes starting at s - 4 go into t_dst.
+  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
 
-  /* Loop filtering */
-  vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit,
-                                         thresh, 1);
-  src[0] = t_dst + 3 * 16;
-  src[1] = t_dst + 3 * 16 + 8;
+  // Loop filtering, as a horizontal edge at row 4 of the transposed copy.
+  vp9_mbloop_filter_horizontal_edge_16_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
+                                            thresh0, blimit1, limit1, thresh1);
+  src[0] = t_dst;
+  src[1] = t_dst + 8;
 
-  dst[0] = s - 5;
-  dst[1] = s - 5 + p * 8;
+  dst[0] = s - 4;
+  dst[1] = s - 4 + p * 8;
 
-  /* Transpose 16x8 */
+  // Transpose back: two 8x8 tiles of t_dst return to s - 4 and 8 rows below.
   transpose(src, 16, dst, p, 2);
 }
 
-void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s,
-                                     int p,
+void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s, int p,
                                      const unsigned char *blimit,
                                      const unsigned char *limit,
                                      const unsigned char *thresh) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
-  unsigned char *src[4];
-  unsigned char *dst[4];
-
-  dst[0] = t_dst;
-  dst[1] = t_dst + 8 * 16;
+  DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 16);
+  unsigned char *src[2];
+  unsigned char *dst[2];
 
   src[0] = s - 8;
-  src[1] = s - 8 + 8;
+  src[1] = s;
+  dst[0] = t_dst;
+  dst[1] = t_dst + 8 * 8;
 
-  /* Transpose 16x16 */
-  transpose(src, p, dst, 16, 2);
+  // Transpose 16x8: tiles at s - 8 and s become rows 0-7 and 8-15 of t_dst.
+  transpose(src, p, dst, 8, 2);
 
-  /* Loop filtering */
-  vp9_mb_lpf_horizontal_edge_w_sse2(t_dst + 8 * 16, 16, blimit, limit,
-                                    thresh, 1);
+  // Loop filtering, as a horizontal edge at row 8 of the transposed copy.
+  mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit, thresh);
 
   src[0] = t_dst;
-  src[1] = t_dst + 8 * 16;
-
+  src[1] = t_dst + 8 * 8;
   dst[0] = s - 8;
-  dst[1] = s - 8 + 8;
+  dst[1] = s;
 
-  transpose(src, 16, dst, p, 2);
+  // Transpose back: both 8x8 tiles return to the positions they came from.
+  transpose(src, 8, dst, p, 2);
+}
+
+void vp9_mb_lpf_vertical_edge_w_16_sse2(unsigned char *s, int p,
+                                        const uint8_t *blimit,
+                                        const uint8_t *limit,
+                                        const uint8_t *thresh) {
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
+
+  // Transpose 16x16: columns s-8..s-1 fill rows 0-7, s..s+7 fill rows 8-15.
+  transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
+  transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
+
+  // Loop filtering, as a horizontal edge at row 8 of the transposed copy.
+  mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit,
+                                   thresh);
+
+  // Transpose back: rows 0-7 at s - 8 first, then the 8 rows below them.
+  transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
+  transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
 }
diff --git a/vp9/decoder/vp9_dboolhuff.c b/vp9/decoder/vp9_dboolhuff.c
index 06acec4db..4f16e95b0 100644
--- a/vp9/decoder/vp9_dboolhuff.c
+++ b/vp9/decoder/vp9_dboolhuff.c
@@ -18,32 +18,28 @@
 // Even relatively modest values like 100 would work fine.
 #define LOTS_OF_BITS 0x40000000
 
-
 int vp9_reader_init(vp9_reader *r, const uint8_t *buffer, size_t size) {
-  int marker_bit;
-
-  r->buffer_end = buffer + size;
-  r->buffer = buffer;
-  r->value = 0;
-  r->count = -8;
-  r->range = 255;
-
-  if (size && !buffer)
+  if (size && !buffer) {  // non-zero size but no buffer: r is left untouched
     return 1;
-
-  vp9_reader_fill(r);
-  marker_bit = vp9_read_bit(r);
-  return marker_bit != 0;
+  } else {
+    r->buffer_end = buffer + size;
+    r->buffer = buffer;
+    r->value = 0;
+    r->count = -8;
+    r->range = 255;
+    vp9_reader_fill(r);
+    return vp9_read_bit(r) != 0;  // marker bit: returns 1 if it is set
+  }
 }
 
 void vp9_reader_fill(vp9_reader *r) {
   const uint8_t *const buffer_end = r->buffer_end;
   const uint8_t *buffer = r->buffer;
-  VP9_BD_VALUE value = r->value;
+  BD_VALUE value = r->value;
   int count = r->count;
-  int shift = BD_VALUE_SIZE - 8 - (count + 8);
+  int shift = BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT);
   int loop_end = 0;
-  const int bits_left = (int)((buffer_end - buffer)*CHAR_BIT);
+  const int bits_left = (int)((buffer_end - buffer) * CHAR_BIT);
   const int x = shift + CHAR_BIT - bits_left;
 
   if (x >= 0) {
@@ -54,7 +50,7 @@ void vp9_reader_fill(vp9_reader *r) {
   if (x < 0 || bits_left) {
     while (shift >= loop_end) {
       count += CHAR_BIT;
-      value |= (VP9_BD_VALUE)*buffer++ << shift;
+      value |= (BD_VALUE)*buffer++ << shift;
       shift -= CHAR_BIT;
     }
   }
diff --git a/vp9/decoder/vp9_dboolhuff.h b/vp9/decoder/vp9_dboolhuff.h
index fd8e74ca4..8339c2701 100644
--- a/vp9/decoder/vp9_dboolhuff.h
+++ b/vp9/decoder/vp9_dboolhuff.h
@@ -18,46 +18,50 @@
 #include "vpx_ports/mem.h"
 #include "vpx/vpx_integer.h"
 
-typedef size_t VP9_BD_VALUE;
+#include "vp9/common/vp9_treecoder.h"
 
-#define BD_VALUE_SIZE ((int)sizeof(VP9_BD_VALUE)*CHAR_BIT)
+typedef size_t BD_VALUE;
+
+#define BD_VALUE_SIZE ((int)sizeof(BD_VALUE) * CHAR_BIT)
+
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]);
 
 typedef struct {
   const uint8_t *buffer_end;
   const uint8_t *buffer;
-  VP9_BD_VALUE value;
+  BD_VALUE value;
   int count;
   unsigned int range;
 } vp9_reader;
 
-DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]);
-
 int vp9_reader_init(vp9_reader *r, const uint8_t *buffer, size_t size);
 
 void vp9_reader_fill(vp9_reader *r);
 
+int vp9_reader_has_error(vp9_reader *r);
+
 const uint8_t *vp9_reader_find_end(vp9_reader *r);
 
-static int vp9_read(vp9_reader *br, int probability) {
+static int vp9_read(vp9_reader *r, int prob) {
   unsigned int bit = 0;
-  VP9_BD_VALUE value;
-  VP9_BD_VALUE bigsplit;
+  BD_VALUE value;
+  BD_VALUE bigsplit;
   int count;
   unsigned int range;
-  unsigned int split = ((br->range * probability) + (256 - probability)) >> 8;
+  unsigned int split = (r->range * prob + (256 - prob)) >> CHAR_BIT;
 
-  if (br->count < 0)
-    vp9_reader_fill(br);
+  if (r->count < 0)
+    vp9_reader_fill(r);
 
-  value = br->value;
-  count = br->count;
+  value = r->value;
+  count = r->count;
 
-  bigsplit = (VP9_BD_VALUE)split << (BD_VALUE_SIZE - 8);
+  bigsplit = (BD_VALUE)split << (BD_VALUE_SIZE - CHAR_BIT);
 
   range = split;
 
   if (value >= bigsplit) {
-    range = br->range - split;
+    range = r->range - split;
     value = value - bigsplit;
     bit = 1;
   }
@@ -68,9 +72,9 @@ static int vp9_read(vp9_reader *br, int probability) {
     value <<= shift;
     count -= shift;
   }
-  br->value = value;
-  br->count = count;
-  br->range = range;
+  r->value = value;
+  r->count = count;
+  r->range = range;
 
   return bit;
 }
@@ -79,15 +83,23 @@ static int vp9_read_bit(vp9_reader *r) {
   return vp9_read(r, 128);  // vp9_prob_half
 }
 
-static int vp9_read_literal(vp9_reader *br, int bits) {
-  int z = 0, bit;
+static int vp9_read_literal(vp9_reader *r, int bits) {
+  int literal = 0, bit;
 
   for (bit = bits - 1; bit >= 0; bit--)
-    z |= vp9_read_bit(br) << bit;
+    literal |= vp9_read_bit(r) << bit;  // most significant bit is read first
 
-  return z;
+  return literal;
 }
 
-int vp9_reader_has_error(vp9_reader *r);
+static int vp9_read_tree(vp9_reader *r, const vp9_tree_index *tree,
+                         const vp9_prob *probs) {
+  vp9_tree_index i = 0;
+
+  while ((i = tree[i + vp9_read(r, probs[i >> 1])]) > 0)
+    continue;  // a positive entry is the index of the next pair of entries
+
+  return -i;  // a non-positive entry ends the walk; its negation is returned
+}
 
 #endif  // VP9_DECODER_VP9_DBOOLHUFF_H_
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 916cb424e..9b6740eea 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -11,6 +11,8 @@
 #include <assert.h>
 
 #include "./vp9_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_scale/vpx_scale.h"
 
@@ -34,14 +36,11 @@
 #include "vp9/decoder/vp9_onyxd_int.h"
 #include "vp9/decoder/vp9_read_bit_buffer.h"
 #include "vp9/decoder/vp9_thread.h"
-#include "vp9/decoder/vp9_treereader.h"
 
 typedef struct TileWorkerData {
   VP9_COMMON *cm;
   vp9_reader bit_reader;
   DECLARE_ALIGNED(16, MACROBLOCKD, xd);
-  DECLARE_ALIGNED(16, unsigned char, token_cache[1024]);
-  DECLARE_ALIGNED(16, int16_t,  qcoeff[MAX_MB_PLANE][64 * 64]);
   DECLARE_ALIGNED(16, int16_t,  dqcoeff[MAX_MB_PLANE][64 * 64]);
   DECLARE_ALIGNED(16, uint16_t, eobs[MAX_MB_PLANE][256]);
 } TileWorkerData;
@@ -50,7 +49,7 @@ static int read_be32(const uint8_t *p) {
   return (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
 }
 
-static int is_compound_prediction_allowed(const VP9_COMMON *cm) {
+static int is_compound_reference_allowed(const VP9_COMMON *cm) {
   int i;
   for (i = 1; i < ALLOWED_REFS_PER_FRAME; ++i)
     if  (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1])
@@ -59,7 +58,7 @@ static int is_compound_prediction_allowed(const VP9_COMMON *cm) {
   return 0;
 }
 
-static void setup_compound_prediction(VP9_COMMON *cm) {
+static void setup_compound_reference(VP9_COMMON *cm) {
   if (cm->ref_frame_sign_bias[LAST_FRAME] ==
           cm->ref_frame_sign_bias[GOLDEN_FRAME]) {
     cm->comp_fixed_ref = ALTREF_FRAME;
@@ -94,7 +93,7 @@ static TX_MODE read_tx_mode(vp9_reader *r) {
   return tx_mode;
 }
 
-static void read_tx_probs(struct tx_probs *tx_probs, vp9_reader *r) {
+static void read_tx_mode_probs(struct tx_probs *tx_probs, vp9_reader *r) {
   int i, j;
 
   for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
@@ -124,33 +123,31 @@ static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
       vp9_diff_update_prob(r, &fc->inter_mode_probs[i][j]);
 }
 
-static INLINE COMPPREDMODE_TYPE read_comp_pred_mode(vp9_reader *r) {
-  COMPPREDMODE_TYPE mode = vp9_read_bit(r);
-  if (mode)
-    mode += vp9_read_bit(r);
-  return mode;
+static REFERENCE_MODE read_reference_mode(VP9_COMMON *cm, vp9_reader *r) {
+  REFERENCE_MODE mode = SINGLE_REFERENCE;  // result when no bits are coded
+  if (is_compound_reference_allowed(cm)) {
+    // The second bit is read only when the first bit is set.
+    mode = vp9_read_bit(r);
+    if (mode)
+      mode += vp9_read_bit(r);
+    setup_compound_reference(cm);
+  }
+  return mode;
 }
 
-static void read_comp_pred(VP9_COMMON *cm, vp9_reader *r) {
+static void read_reference_mode_probs(VP9_COMMON *cm, vp9_reader *r) {
   int i;
-
-  const int compound_allowed = is_compound_prediction_allowed(cm);
-  cm->comp_pred_mode = compound_allowed ? read_comp_pred_mode(r)
-                                        : SINGLE_PREDICTION_ONLY;
-  if (compound_allowed)
-    setup_compound_prediction(cm);
-
-  if (cm->comp_pred_mode == HYBRID_PREDICTION)
+  if (cm->comp_pred_mode == REFERENCE_MODE_SELECT)
     for (i = 0; i < COMP_INTER_CONTEXTS; i++)
       vp9_diff_update_prob(r, &cm->fc.comp_inter_prob[i]);
 
-  if (cm->comp_pred_mode != COMP_PREDICTION_ONLY)
+  if (cm->comp_pred_mode != COMPOUND_REFERENCE)
     for (i = 0; i < REF_CONTEXTS; i++) {
       vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][0]);
       vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][1]);
     }
 
-  if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
+  if (cm->comp_pred_mode != SINGLE_REFERENCE)
     for (i = 0; i < REF_CONTEXTS; i++)
       vp9_diff_update_prob(r, &cm->fc.comp_ref_prob[i]);
 }
@@ -241,8 +238,7 @@ static void alloc_tile_storage(VP9D_COMP *pbi, int tile_rows, int tile_cols) {
 }
 
 static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
-                                    TX_SIZE tx_size, uint8_t *dst, int stride,
-                                    uint8_t *token_cache) {
+                                    TX_SIZE tx_size, uint8_t *dst, int stride) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const int eob = pd->eobs[block];
   if (eob > 0) {
@@ -275,20 +271,13 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
 
     if (eob == 1) {
       vpx_memset(dqcoeff, 0, 2 * sizeof(dqcoeff[0]));
-      vpx_memset(token_cache, 0, 2 * sizeof(token_cache[0]));
     } else {
-      if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10) {
+      if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
         vpx_memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
-        vpx_memset(token_cache, 0,
-                   4 * (4 << tx_size) * sizeof(token_cache[0]));
-      } else if (tx_size == TX_32X32 && eob <= 34) {
+      else if (tx_size == TX_32X32 && eob <= 34)
         vpx_memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
-        vpx_memset(token_cache, 0, 256 * sizeof(token_cache[0]));
-      } else {
+      else
         vpx_memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
-        vpx_memset(token_cache, 0,
-                   (16 << (tx_size << 1)) * sizeof(token_cache[0]));
-      }
     }
   }
 }
@@ -297,7 +286,6 @@ struct intra_args {
   VP9_COMMON *cm;
   MACROBLOCKD *xd;
   vp9_reader *r;
-  uint8_t *token_cache;
 };
 
 static void predict_and_reconstruct_intra_block(int plane, int block,
@@ -326,9 +314,8 @@ static void predict_and_reconstruct_intra_block(int plane, int block,
 
   if (!mi->mbmi.skip_coeff) {
     vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, x, y, tx_size,
-                            args->r, args->token_cache);
-    inverse_transform_block(xd, plane, block, tx_size, dst, pd->dst.stride,
-                            args->token_cache);
+                            args->r);
+    inverse_transform_block(xd, plane, block, tx_size, dst, pd->dst.stride);
   }
 }
 
@@ -337,7 +324,6 @@ struct inter_args {
   MACROBLOCKD *xd;
   vp9_reader *r;
   int *eobtotal;
-  uint8_t *token_cache;
 };
 
 static void reconstruct_inter_block(int plane, int block,
@@ -351,10 +337,10 @@ static void reconstruct_inter_block(int plane, int block,
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
   *args->eobtotal += vp9_decode_block_tokens(cm, xd, plane, block,
                                              plane_bsize, x, y, tx_size,
-                                             args->r, args->token_cache);
+                                             args->r);
   inverse_transform_block(xd, plane, block, tx_size,
                           &pd->dst.buf[4 * y * pd->dst.stride + 4 * x],
-                          pd->dst.stride, args->token_cache);
+                          pd->dst.stride);
 }
 
 static void set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
@@ -404,8 +390,7 @@ static void set_ref(VP9_COMMON *const cm, MACROBLOCKD *const xd,
 static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd,
                            const TileInfo *const tile,
                            int mi_row, int mi_col,
-                           vp9_reader *r, BLOCK_SIZE bsize,
-                           uint8_t *token_cache) {
+                           vp9_reader *r, BLOCK_SIZE bsize) {
   const int less8x8 = bsize < BLOCK_8X8;
   MB_MODE_INFO *mbmi;
 
@@ -427,9 +412,7 @@ static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd,
   }
 
   if (!is_inter_block(mbmi)) {
-    struct intra_args arg = {
-      cm, xd, r, token_cache
-    };
+    struct intra_args arg = { cm, xd, r };
     foreach_transformed_block(xd, bsize, predict_and_reconstruct_intra_block,
                               &arg);
   } else {
@@ -442,14 +425,12 @@ static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd,
         vp9_get_filter_kernel(mbmi->interp_filter);
 
     // Prediction
-    vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+    vp9_dec_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
 
     // Reconstruction
     if (!mbmi->skip_coeff) {
       int eobtotal = 0;
-      struct inter_args arg = {
-        cm, xd, r, &eobtotal, token_cache
-      };
+      struct inter_args arg = { cm, xd, r, &eobtotal };
       foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg);
       if (!less8x8 && eobtotal == 0)
         mbmi->skip_coeff = 1;  // skip loopfilter
@@ -471,7 +452,7 @@ static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd, int hbs,
   PARTITION_TYPE p;
 
   if (has_rows && has_cols)
-    p = treed_read(r, vp9_partition_tree, probs);
+    p = vp9_read_tree(r, vp9_partition_tree, probs);
   else if (!has_rows && has_cols)
     p = vp9_read(r, probs[1]) ? PARTITION_SPLIT : PARTITION_HORZ;
   else if (has_rows && !has_cols)
@@ -488,8 +469,7 @@ static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd, int hbs,
 static void decode_modes_sb(VP9_COMMON *const cm, MACROBLOCKD *const xd,
                             const TileInfo *const tile,
                             int mi_row, int mi_col,
-                            vp9_reader* r, BLOCK_SIZE bsize,
-                            uint8_t *token_cache) {
+                            vp9_reader* r, BLOCK_SIZE bsize) {
   const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
   PARTITION_TYPE partition;
   BLOCK_SIZE subsize;
@@ -500,33 +480,27 @@ static void decode_modes_sb(VP9_COMMON *const cm, MACROBLOCKD *const xd,
   partition = read_partition(cm, xd, hbs, mi_row, mi_col, bsize, r);
   subsize = get_subsize(bsize, partition);
   if (subsize < BLOCK_8X8) {
-    decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
+    decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize);
   } else {
     switch (partition) {
       case PARTITION_NONE:
-        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
+        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize);
         break;
       case PARTITION_HORZ:
-        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
+        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize);
         if (mi_row + hbs < cm->mi_rows)
-          decode_modes_b(cm, xd, tile, mi_row + hbs, mi_col, r, subsize,
-                         token_cache);
+          decode_modes_b(cm, xd, tile, mi_row + hbs, mi_col, r, subsize);
         break;
       case PARTITION_VERT:
-        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
+        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize);
         if (mi_col + hbs < cm->mi_cols)
-          decode_modes_b(cm, xd, tile, mi_row, mi_col + hbs, r, subsize,
-                         token_cache);
+          decode_modes_b(cm, xd, tile, mi_row, mi_col + hbs, r, subsize);
         break;
       case PARTITION_SPLIT:
-        decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, subsize,
-                        token_cache);
-        decode_modes_sb(cm, xd, tile, mi_row, mi_col + hbs, r, subsize,
-                        token_cache);
-        decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col, r, subsize,
-                        token_cache);
-        decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize,
-                        token_cache);
+        decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, subsize);
+        decode_modes_sb(cm, xd, tile, mi_row, mi_col + hbs, r, subsize);
+        decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col, r, subsize);
+        decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize);
         break;
       default:
         assert(!"Invalid partition type");
@@ -809,8 +783,7 @@ static void decode_tile(VP9D_COMP *pbi, const TileInfo *const tile,
     vp9_zero(xd->left_seg_context);
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
          mi_col += MI_BLOCK_SIZE) {
-      decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64,
-                      pbi->token_cache);
+      decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64);
     }
 
     if (pbi->do_loopfilter_inline) {
@@ -951,11 +924,9 @@ static void setup_tile_macroblockd(TileWorkerData *const tile_data) {
   int i;
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
-    pd[i].qcoeff  = tile_data->qcoeff[i];
     pd[i].dqcoeff = tile_data->dqcoeff[i];
     pd[i].eobs    = tile_data->eobs[i];
     vpx_memset(xd->plane[i].dqcoeff, 0, 64 * 64 * sizeof(int16_t));
-    vpx_memset(tile_data->token_cache, 0, sizeof(tile_data->token_cache));
   }
 }
 
@@ -971,8 +942,7 @@ static int tile_worker_hook(void *arg1, void *arg2) {
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
          mi_col += MI_BLOCK_SIZE) {
       decode_modes_sb(tile_data->cm, &tile_data->xd, tile,
-                      mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64,
-                      tile_data->token_cache);
+                      mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64);
     }
   }
   return !tile_data->xd.corrupted;
@@ -1164,8 +1134,12 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi,
       cm->allow_high_precision_mv = vp9_rb_read_bit(rb);
       cm->mcomp_filter_type = read_interp_filter_type(rb);
 
-      for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i)
+      for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
         vp9_setup_scale_factors(cm, i);
+        if (vp9_is_scaled(&cm->active_ref_scale_comm[i]))
+          vp9_extend_frame_borders(&cm->yv12_fb[cm->active_ref_idx[i]],
+                                   cm->subsampling_x, cm->subsampling_y);
+      }
     }
   }
 
@@ -1212,7 +1186,7 @@ static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data,
 
   cm->tx_mode = xd->lossless ? ONLY_4X4 : read_tx_mode(&r);
   if (cm->tx_mode == TX_MODE_SELECT)
-    read_tx_probs(&fc->tx_probs, &r);
+    read_tx_mode_probs(&fc->tx_probs, &r);
   read_coef_probs(fc, cm->tx_mode, &r);
 
   for (k = 0; k < MBSKIP_CONTEXTS; ++k)
@@ -1230,7 +1204,8 @@ static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data,
     for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
       vp9_diff_update_prob(&r, &fc->intra_inter_prob[i]);
 
-    read_comp_pred(cm, &r);
+    cm->comp_pred_mode = read_reference_mode(cm, &r);
+    read_reference_mode_probs(cm, &r);
 
     for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
       for (i = 0; i < INTRA_MODES - 1; ++i)
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 75f0ae865..327a9166c 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -20,13 +20,13 @@
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_seg_common.h"
 
+#include "vp9/decoder/vp9_dboolhuff.h"
 #include "vp9/decoder/vp9_decodemv.h"
 #include "vp9/decoder/vp9_decodeframe.h"
 #include "vp9/decoder/vp9_onyxd_int.h"
-#include "vp9/decoder/vp9_treereader.h"
 
 static MB_PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) {
-  return (MB_PREDICTION_MODE)treed_read(r, vp9_intra_mode_tree, p);
+  return (MB_PREDICTION_MODE)vp9_read_tree(r, vp9_intra_mode_tree, p);
 }
 
 static MB_PREDICTION_MODE read_intra_mode_y(VP9_COMMON *cm, vp9_reader *r,
@@ -49,8 +49,8 @@ static MB_PREDICTION_MODE read_intra_mode_uv(VP9_COMMON *cm, vp9_reader *r,
 
 static MB_PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, vp9_reader *r,
                                           int ctx) {
-  const int mode = treed_read(r, vp9_inter_mode_tree,
-                              cm->fc.inter_mode_probs[ctx]);
+  const int mode = vp9_read_tree(r, vp9_inter_mode_tree,
+                                 cm->fc.inter_mode_probs[ctx]);
   if (!cm->frame_parallel_decoding_mode)
     ++cm->counts.inter_mode[ctx][mode];
 
@@ -58,7 +58,7 @@ static MB_PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, vp9_reader *r,
 }
 
 static int read_segment_id(vp9_reader *r, const struct segmentation *seg) {
-  return treed_read(r, vp9_segment_tree, seg->tree_probs);
+  return vp9_read_tree(r, vp9_segment_tree, seg->tree_probs);
 }
 
 static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
@@ -210,12 +210,12 @@ static int read_mv_component(vp9_reader *r,
                              const nmv_component *mvcomp, int usehp) {
   int mag, d, fr, hp;
   const int sign = vp9_read(r, mvcomp->sign);
-  const int mv_class = treed_read(r, vp9_mv_class_tree, mvcomp->classes);
+  const int mv_class = vp9_read_tree(r, vp9_mv_class_tree, mvcomp->classes);
   const int class0 = mv_class == MV_CLASS_0;
 
   // Integer part
   if (class0) {
-    d = treed_read(r, vp9_mv_class0_tree, mvcomp->class0);
+    d = vp9_read_tree(r, vp9_mv_class0_tree, mvcomp->class0);
   } else {
     int i;
     const int n = mv_class + CLASS0_BITS - 1;  // number of bits
@@ -226,8 +226,8 @@ static int read_mv_component(vp9_reader *r,
   }
 
   // Fractional part
-  fr = treed_read(r, vp9_mv_fp_tree,
-                  class0 ? mvcomp->class0_fp[d] : mvcomp->fp);
+  fr = vp9_read_tree(r, vp9_mv_fp_tree, class0 ? mvcomp->class0_fp[d]
+                                               : mvcomp->fp);
 
 
   // High precision part (if hp is not used, the default value of the hp is 1)
@@ -242,7 +242,7 @@ static int read_mv_component(vp9_reader *r,
 static INLINE void read_mv(vp9_reader *r, MV *mv, const MV *ref,
                            const nmv_context *ctx,
                            nmv_context_counts *counts, int allow_hp) {
-  const MV_JOINT_TYPE j = treed_read(r, vp9_mv_joint_tree, ctx->joints);
+  const MV_JOINT_TYPE j = vp9_read_tree(r, vp9_mv_joint_tree, ctx->joints);
   const int use_hp = allow_hp && vp9_use_mv_hp(ref);
   MV diff = {0, 0};
 
@@ -258,14 +258,14 @@ static INLINE void read_mv(vp9_reader *r, MV *mv, const MV *ref,
   mv->col = ref->col + diff.col;
 }
 
-static COMPPREDMODE_TYPE read_reference_mode(VP9_COMMON *cm,
-                                             const MACROBLOCKD *xd,
-                                             vp9_reader *r) {
+static REFERENCE_MODE read_reference_mode(VP9_COMMON *cm,
+                                          const MACROBLOCKD *xd,
+                                          vp9_reader *r) {
   const int ctx = vp9_get_pred_context_comp_inter_inter(cm, xd);
   const int mode = vp9_read(r, cm->fc.comp_inter_prob[ctx]);
   if (!cm->frame_parallel_decoding_mode)
     ++cm->counts.comp_inter[ctx][mode];
-  return mode;  // SINGLE_PREDICTION_ONLY or COMP_PREDICTION_ONLY
+  return mode;  // SINGLE_REFERENCE or COMPOUND_REFERENCE
 }
 
 // Read the referncence frame
@@ -279,12 +279,12 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
     ref_frame[0] = vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
     ref_frame[1] = NONE;
   } else {
-    const COMPPREDMODE_TYPE mode = (cm->comp_pred_mode == HYBRID_PREDICTION)
+    const REFERENCE_MODE mode = (cm->comp_pred_mode == REFERENCE_MODE_SELECT)
                                       ? read_reference_mode(cm, xd, r)
                                       : cm->comp_pred_mode;
 
     // FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding
-    if (mode == COMP_PREDICTION_ONLY) {
+    if (mode == COMPOUND_REFERENCE) {
       const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
       const int ctx = vp9_get_pred_context_comp_ref_p(cm, xd);
       const int bit = vp9_read(r, fc->comp_ref_prob[ctx]);
@@ -292,7 +292,7 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
         ++counts->comp_ref[ctx][bit];
       ref_frame[idx] = cm->comp_fixed_ref;
       ref_frame[!idx] = cm->comp_var_ref[bit];
-    } else if (mode == SINGLE_PREDICTION_ONLY) {
+    } else if (mode == SINGLE_REFERENCE) {
       const int ctx0 = vp9_get_pred_context_single_ref_p1(xd);
       const int bit0 = vp9_read(r, fc->single_ref_prob[ctx0][0]);
       if (!cm->frame_parallel_decoding_mode)
@@ -318,8 +318,8 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
 static INLINE INTERPOLATION_TYPE read_switchable_filter_type(
     VP9_COMMON *const cm, MACROBLOCKD *const xd, vp9_reader *r) {
   const int ctx = vp9_get_pred_context_switchable_interp(xd);
-  const int type = treed_read(r, vp9_switchable_interp_tree,
-                              cm->fc.switchable_interp_prob[ctx]);
+  const int type = vp9_read_tree(r, vp9_switchable_interp_tree,
+                                 cm->fc.switchable_interp_prob[ctx]);
   if (!cm->frame_parallel_decoding_mode)
     ++cm->counts.switchable_interp[ctx][type];
   return type;
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 214c1c198..bdbe67dbc 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -18,7 +18,6 @@
 #include "vp9/decoder/vp9_dboolhuff.h"
 #include "vp9/decoder/vp9_detokenize.h"
 #include "vp9/decoder/vp9_onyxd_int.h"
-#include "vp9/decoder/vp9_treereader.h"
 
 #define EOB_CONTEXT_NODE            0
 #define ZERO_CONTEXT_NODE           1
@@ -61,16 +60,10 @@ static const vp9_prob cat6_prob[15] = {
   254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0
 };
 
-static const int token_to_counttoken[MAX_ENTROPY_TOKENS] = {
-  ZERO_TOKEN, ONE_TOKEN, TWO_TOKEN, TWO_TOKEN,
-  TWO_TOKEN, TWO_TOKEN, TWO_TOKEN, TWO_TOKEN,
-  TWO_TOKEN, TWO_TOKEN, TWO_TOKEN, DCT_EOB_MODEL_TOKEN
-};
-
 #define INCREMENT_COUNT(token)                              \
   do {                                                      \
      if (!cm->frame_parallel_decoding_mode)                 \
-       ++coef_counts[band][pt][token_to_counttoken[token]]; \
+       ++coef_counts[band][pt][token];                      \
   } while (0)
 
 
@@ -78,7 +71,6 @@ static const int token_to_counttoken[MAX_ENTROPY_TOKENS] = {
   {                                                      \
     v = (val * dqv) >> dq_shift; \
     dqcoeff_ptr[scan[c]] = (vp9_read_bit(r) ? -v : v); \
-    INCREMENT_COUNT(token);                              \
     token_cache[scan[c]] = vp9_pt_energy_class[token];   \
     ++c;                                                 \
     pt = get_coef_context(nb, token_cache, c);           \
@@ -94,9 +86,8 @@ static const int token_to_counttoken[MAX_ENTROPY_TOKENS] = {
 
 static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
                         vp9_reader *r, int block_idx,
-                        PLANE_TYPE type, int seg_eob, int16_t *dqcoeff_ptr,
-                        TX_SIZE tx_size, const int16_t *dq, int pt,
-                        uint8_t *token_cache) {
+                        PLANE_TYPE type, int max_eob, int16_t *dqcoeff_ptr,
+                        TX_SIZE tx_size, const int16_t *dq, int pt) {
   const FRAME_CONTEXT *const fc = &cm->fc;
   FRAME_COUNTS *const counts = &cm->counts;
   const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi);
@@ -108,6 +99,7 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
       counts->coef[tx_size][type][ref];
   unsigned int (*eob_branch_count)[PREV_COEF_CONTEXTS] =
       counts->eob_branch[tx_size][type][ref];
+  uint8_t token_cache[32 * 32];
   const uint8_t *cat6;
   const uint8_t *band_translate = get_band_translate(tx_size);
   const int dq_shift = (tx_size == TX_32X32);
@@ -117,38 +109,39 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
   int v;
   int16_t dqv = dq[0];
 
-
-
-  while (c < seg_eob) {
+  while (c < max_eob) {
     int val;
     band = *band_translate++;
     prob = coef_probs[band][pt];
     if (!cm->frame_parallel_decoding_mode)
       ++eob_branch_count[band][pt];
-    if (!vp9_read(r, prob[EOB_CONTEXT_NODE]))
+    if (!vp9_read(r, prob[EOB_CONTEXT_NODE])) {
+      INCREMENT_COUNT(DCT_EOB_MODEL_TOKEN);
       break;
+    }
 
-  DECODE_ZERO:
-    if (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) {
+    while (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) {
       INCREMENT_COUNT(ZERO_TOKEN);
       dqv = dq[1];
+      token_cache[scan[c]] = 0;
       ++c;
-      if (c >= seg_eob)
-        break;
+      if (c >= max_eob)
+        return c;  // zero tokens at the end (no eob token)
       pt = get_coef_context(nb, token_cache, c);
       band = *band_translate++;
       prob = coef_probs[band][pt];
-      goto DECODE_ZERO;
     }
 
     // ONE_CONTEXT_NODE_0_
     if (!vp9_read(r, prob[ONE_CONTEXT_NODE])) {
+      INCREMENT_COUNT(ONE_TOKEN);
       WRITE_COEF_CONTINUE(1, ONE_TOKEN);
     }
 
-    prob = vp9_pareto8_full[coef_probs[band][pt][PIVOT_NODE]-1];
+    INCREMENT_COUNT(TWO_TOKEN);
+
+    prob = vp9_pareto8_full[prob[PIVOT_NODE] - 1];
 
-    // LOW_VAL_CONTEXT_NODE_0_
     if (!vp9_read(r, prob[LOW_VAL_CONTEXT_NODE])) {
       if (!vp9_read(r, prob[TWO_CONTEXT_NODE])) {
         WRITE_COEF_CONTINUE(2, TWO_TOKEN);
@@ -158,7 +151,7 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
       }
       WRITE_COEF_CONTINUE(4, FOUR_TOKEN);
     }
-    // HIGH_LOW_CONTEXT_NODE_0_
+
     if (!vp9_read(r, prob[HIGH_LOW_CONTEXT_NODE])) {
       if (!vp9_read(r, prob[CAT_ONE_CONTEXT_NODE])) {
         val = CAT1_MIN_VAL;
@@ -170,7 +163,7 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
       ADJUST_COEF(CAT2_PROB0, 0);
       WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY2);
     }
-    // CAT_THREEFOUR_CONTEXT_NODE_0_
+
     if (!vp9_read(r, prob[CAT_THREEFOUR_CONTEXT_NODE])) {
       if (!vp9_read(r, prob[CAT_THREE_CONTEXT_NODE])) {
         val = CAT3_MIN_VAL;
@@ -186,7 +179,7 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
       ADJUST_COEF(CAT4_PROB0, 0);
       WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY4);
     }
-    // CAT_FIVE_CONTEXT_NODE_0_:
+
     if (!vp9_read(r, prob[CAT_FIVE_CONTEXT_NODE])) {
       val = CAT5_MIN_VAL;
       ADJUST_COEF(CAT5_PROB4, 4);
@@ -205,18 +198,12 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
     WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY6);
   }
 
-  if (c < seg_eob) {
-    if (!cm->frame_parallel_decoding_mode)
-      ++coef_counts[band][pt][DCT_EOB_MODEL_TOKEN];
-  }
-
   return c;
 }
 
 int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
                             int plane, int block, BLOCK_SIZE plane_bsize,
-                            int x, int y, TX_SIZE tx_size, vp9_reader *r,
-                            uint8_t *token_cache) {
+                            int x, int y, TX_SIZE tx_size, vp9_reader *r) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const int seg_eob = get_tx_eob(&cm->seg, xd->mi_8x8[0]->mbmi.segment_id,
                                  tx_size);
@@ -224,7 +211,7 @@ int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
                                               pd->left_context + y);
   const int eob = decode_coefs(cm, xd, r, block, pd->plane_type, seg_eob,
                                BLOCK_OFFSET(pd->dqcoeff, block), tx_size,
-                               pd->dequant, pt, token_cache);
+                               pd->dequant, pt);
   set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, x, y);
   pd->eobs[block] = eob;
   return eob;
diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h
index e858a19f7..2a8807379 100644
--- a/vp9/decoder/vp9_detokenize.h
+++ b/vp9/decoder/vp9_detokenize.h
@@ -17,7 +17,6 @@
 
 int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
                             int plane, int block, BLOCK_SIZE plane_bsize,
-                            int x, int y, TX_SIZE tx_size, vp9_reader *r,
-                            uint8_t *token_cache);
+                            int x, int y, TX_SIZE tx_size, vp9_reader *r);
 
 #endif  // VP9_DECODER_VP9_DETOKENIZE_H_
diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c
index 7c0f91d88..740ad72cb 100644
--- a/vp9/decoder/vp9_onyxd_if.c
+++ b/vp9/decoder/vp9_onyxd_if.c
@@ -113,7 +113,6 @@ static void init_macroblockd(VP9D_COMP *const pbi) {
   int i;
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
-    pd[i].qcoeff  = pbi->qcoeff[i];
     pd[i].dqcoeff = pbi->dqcoeff[i];
     pd[i].eobs    = pbi->eobs[i];
   }
diff --git a/vp9/decoder/vp9_onyxd_int.h b/vp9/decoder/vp9_onyxd_int.h
index d3d29e98d..038cd96a5 100644
--- a/vp9/decoder/vp9_onyxd_int.h
+++ b/vp9/decoder/vp9_onyxd_int.h
@@ -22,7 +22,6 @@ typedef struct VP9Decompressor {
 
   DECLARE_ALIGNED(16, VP9_COMMON, common);
 
-  DECLARE_ALIGNED(16, int16_t,  qcoeff[MAX_MB_PLANE][64 * 64]);
   DECLARE_ALIGNED(16, int16_t,  dqcoeff[MAX_MB_PLANE][64 * 64]);
   DECLARE_ALIGNED(16, uint16_t, eobs[MAX_MB_PLANE][256]);
 
@@ -53,8 +52,6 @@ typedef struct VP9Decompressor {
 
   ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
   PARTITION_CONTEXT *above_seg_context;
-
-  DECLARE_ALIGNED(16, uint8_t, token_cache[1024]);
 } VP9D_COMP;
 
 #endif  // VP9_DECODER_VP9_ONYXD_INT_H_
diff --git a/vp9/decoder/vp9_treereader.h b/vp9/decoder/vp9_treereader.h
deleted file mode 100644
index 41680d245..000000000
--- a/vp9/decoder/vp9_treereader.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP9_DECODER_VP9_TREEREADER_H_
-#define VP9_DECODER_VP9_TREEREADER_H_
-
-#include "vp9/common/vp9_treecoder.h"
-#include "vp9/decoder/vp9_dboolhuff.h"
-
-// Intent of tree data structure is to make decoding trivial.
-static int treed_read(vp9_reader *const r, /* !!! must return a 0 or 1 !!! */
-                      vp9_tree t,
-                      const vp9_prob *const p) {
-  register vp9_tree_index i = 0;
-
-  while ((i = t[ i + vp9_read(r, p[i >> 1])]) > 0)
-    continue;
-
-  return -i;
-}
-
-#endif  // VP9_DECODER_VP9_TREEREADER_H_
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index a0fced576..9f79f8cdc 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -32,6 +32,7 @@
 #include "vp9/encoder/vp9_bitstream.h"
 #include "vp9/encoder/vp9_segmentation.h"
 #include "vp9/encoder/vp9_subexp.h"
+#include "vp9/encoder/vp9_tokenize.h"
 #include "vp9/encoder/vp9_write_bit_buffer.h"
 
 
@@ -151,6 +152,30 @@ void write_switchable_interp_stats() {
 }
 #endif
 
+static struct vp9_token intra_mode_encodings[INTRA_MODES];
+static struct vp9_token switchable_interp_encodings[SWITCHABLE_FILTERS];
+static struct vp9_token partition_encodings[PARTITION_TYPES];
+static struct vp9_token inter_mode_encodings[INTER_MODES];
+
+void vp9_entropy_mode_init(void) {  // populates the static token tables
+  vp9_tokens_from_tree(intra_mode_encodings, vp9_intra_mode_tree);
+  vp9_tokens_from_tree(switchable_interp_encodings, vp9_switchable_interp_tree);
+  vp9_tokens_from_tree(partition_encodings, vp9_partition_tree);
+  vp9_tokens_from_tree(inter_mode_encodings, vp9_inter_mode_tree);
+}
+
+static void write_intra_mode(vp9_writer *w, MB_PREDICTION_MODE mode,
+                             const vp9_prob *probs) {  // token index == mode
+  write_token(w, vp9_intra_mode_tree, probs, intra_mode_encodings + mode);
+}
+
+static void write_inter_mode(vp9_writer *w, MB_PREDICTION_MODE mode,
+                             const vp9_prob *probs) {
+  const int offset = INTER_OFFSET(mode);  // index into inter_mode_encodings
+  assert(is_inter_mode(mode));
+  write_token(w, vp9_inter_mode_tree, probs, inter_mode_encodings + offset);
+}
+
 static INLINE void write_be32(uint8_t *p, int value) {
   p[0] = value >> 24;
   p[1] = value >> 16;
@@ -169,6 +194,8 @@ static void prob_diff_update(const vp9_tree_index *tree,
                              int n, vp9_writer *w) {
   int i;
   unsigned int branch_ct[32][2];
+
+  // branch_ct has room for 32 entries, so at most 32 probabilities (n <= 32).
   assert(n <= 32);
 
   vp9_tree_probs_from_distribution(tree, branch_ct, counts);
@@ -211,10 +238,6 @@ void vp9_update_skip_probs(VP9_COMP *cpi, vp9_writer *w) {
     vp9_cond_prob_diff_update(w, &cm->fc.mbskip_probs[k], cm->counts.mbskip[k]);
 }
 
-static void write_intra_mode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_intra_mode_tree, p, vp9_intra_mode_encodings + m);
-}
-
 static void update_switchable_interp_probs(VP9_COMP *cpi, vp9_writer *w) {
   VP9_COMMON *const cm = &cpi->common;
   int j;
@@ -291,14 +314,6 @@ static void pack_mb_tokens(vp9_writer* const w,
   *tp = p + (p->token == EOSB_TOKEN);
 }
 
-static void write_sb_mv_ref(vp9_writer *w, MB_PREDICTION_MODE mode,
-                            const vp9_prob *p) {
-  assert(is_inter_mode(mode));
-  write_token(w, vp9_inter_mode_tree, p,
-              &vp9_inter_mode_encodings[INTER_OFFSET(mode)]);
-}
-
-
 static void write_segment_id(vp9_writer *w, const struct segmentation *seg,
                              int segment_id) {
   if (seg->enabled && seg->update_map)
@@ -319,12 +334,12 @@ static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) {
   if (!seg_ref_active) {
     // does the feature use compound prediction or not
     // (if not specified at the frame/segment level)
-    if (cm->comp_pred_mode == HYBRID_PREDICTION) {
+    if (cm->comp_pred_mode == REFERENCE_MODE_SELECT) {
       vp9_write(bc, mi->ref_frame[1] > INTRA_FRAME,
                 vp9_get_pred_prob_comp_inter_inter(cm, xd));
     } else {
       assert((mi->ref_frame[1] <= INTRA_FRAME) ==
-                 (cm->comp_pred_mode == SINGLE_PREDICTION_ONLY));
+                 (cm->comp_pred_mode == SINGLE_REFERENCE));
     }
 
     if (mi->ref_frame[1] > INTRA_FRAME) {
@@ -420,7 +435,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
     // If segment skip is not enabled code the mode.
     if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
       if (bsize >= BLOCK_8X8) {
-        write_sb_mv_ref(bc, mode, mv_ref_p);
+        write_inter_mode(bc, mode, mv_ref_p);
         ++cm->counts.inter_mode[mi->mode_context[rf]]
                                [INTER_OFFSET(mode)];
       }
@@ -430,7 +445,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
       const int ctx = vp9_get_pred_context_switchable_interp(xd);
       write_token(bc, vp9_switchable_interp_tree,
                   cm->fc.switchable_interp_prob[ctx],
-                  &vp9_switchable_interp_encodings[mi->interp_filter]);
+                  &switchable_interp_encodings[mi->interp_filter]);
     } else {
       assert(mi->interp_filter == cm->mcomp_filter_type);
     }
@@ -443,7 +458,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
         for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
           const int j = idy * 2 + idx;
           const MB_PREDICTION_MODE blockmode = m->bmi[j].as_mode;
-          write_sb_mv_ref(bc, blockmode, mv_ref_p);
+          write_inter_mode(bc, blockmode, mv_ref_p);
           ++cm->counts.inter_mode[mi->mode_context[rf]]
                                  [INTER_OFFSET(blockmode)];
 
@@ -559,7 +574,7 @@ static void write_partition(VP9_COMP *cpi, int hbs, int mi_row, int mi_col,
   const int has_cols = (mi_col + hbs) < cm->mi_cols;
 
   if (has_rows && has_cols) {
-    write_token(w, vp9_partition_tree, probs, &vp9_partition_encodings[p]);
+    write_token(w, vp9_partition_tree, probs, &partition_encodings[p]);
   } else if (!has_rows && has_cols) {
     assert(p == PARTITION_SPLIT || p == PARTITION_HORZ);
     vp9_write(w, p == PARTITION_SPLIT, probs[1]);
@@ -1357,8 +1372,8 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
 
     if (cm->allow_comp_inter_inter) {
       const int comp_pred_mode = cpi->common.comp_pred_mode;
-      const int use_compound_pred = comp_pred_mode != SINGLE_PREDICTION_ONLY;
-      const int use_hybrid_pred = comp_pred_mode == HYBRID_PREDICTION;
+      const int use_compound_pred = comp_pred_mode != SINGLE_REFERENCE;
+      const int use_hybrid_pred = comp_pred_mode == REFERENCE_MODE_SELECT;
 
       vp9_write_bit(&header_bc, use_compound_pred);
       if (use_compound_pred) {
@@ -1370,7 +1385,7 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
       }
     }
 
-    if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) {
+    if (cm->comp_pred_mode != COMPOUND_REFERENCE) {
       for (i = 0; i < REF_CONTEXTS; i++) {
         vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][0],
                                   cpi->single_ref_count[i][0]);
@@ -1379,7 +1394,7 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
       }
     }
 
-    if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
+    if (cm->comp_pred_mode != SINGLE_REFERENCE)
       for (i = 0; i < REF_CONTEXTS; i++)
         vp9_cond_prob_diff_update(&header_bc, &fc->comp_ref_prob[i],
                                   cpi->comp_ref_count[i]);
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 444597067..71f7e7a52 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -69,6 +69,7 @@ typedef struct {
 
 struct macroblock_plane {
   DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]);
+  int16_t *qcoeff;
   int16_t *coeff;
   struct buf_2d src;
 
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 33839370a..89da78190 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -360,6 +360,52 @@ void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x) {
   adjust_act_zbin(cpi, x);
 }
 
+// Select a segment for the current SB64
+static void select_in_frame_q_segment(VP9_COMP *cpi,
+                                      int mi_row, int mi_col,
+                                      int output_enabled, int projected_rate) {
+  VP9_COMMON * const cm = &cpi->common;
+  int target_rate = cpi->rc.sb64_target_rate << 8;   // convert to bits << 8
+
+  const int mi_offset = mi_row * cm->mi_cols + mi_col;
+  const int bw = 1 << mi_width_log2(BLOCK_64X64);
+  const int bh = 1 << mi_height_log2(BLOCK_64X64);
+  const int xmis = MIN(cm->mi_cols - mi_col, bw);
+  const int ymis = MIN(cm->mi_rows - mi_row, bh);
+  int complexity_metric = 64;
+  int x, y;
+
+  unsigned char segment;
+
+  if (!output_enabled) {
+    segment = 0;
+  } else {
+    // Rate depends on fraction of a SB64 in frame (xmis * ymis) / (bw * bh).
+    // It is converted to bits * 256 units
+    target_rate = (cpi->rc.sb64_target_rate * xmis * ymis * 256) / (bw * bh);
+
+    if (projected_rate < (target_rate / 4)) {
+      segment = 2;
+    } else if (projected_rate < (target_rate / 2)) {
+      segment = 1;
+    } else {
+      segment = 0;
+    }
+
+    complexity_metric =
+      clamp((int)((projected_rate * 64) / target_rate), 16, 255);
+  }
+
+  // Fill in the entries in the segment map corresponding to this SB64
+  for (y = 0; y < ymis; y++) {
+    for (x = 0; x < xmis; x++) {
+      cpi->segmentation_map[mi_offset + y * cm->mi_cols + x] = segment;
+      cpi->complexity_map[mi_offset + y * cm->mi_cols + x] =
+        (unsigned char)complexity_metric;
+    }
+  }
+}
+
 static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
                          BLOCK_SIZE bsize, int output_enabled) {
   int i, x_idx, y;
@@ -383,19 +429,24 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
   assert(mi->mbmi.ref_frame[1] < MAX_REF_FRAMES);
   assert(mi->mbmi.sb_type == bsize);
 
+  // For in-frame adaptive Q, copy over the chosen segment id into the
+  // mode info context for the chosen mode / partition.
+  if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && output_enabled)
+    mi->mbmi.segment_id = xd->mi_8x8[0]->mbmi.segment_id;
+
   *mi_addr = *mi;
 
   max_plane = is_inter_block(mbmi) ? MAX_MB_PLANE : 1;
   for (i = 0; i < max_plane; ++i) {
     p[i].coeff = ctx->coeff_pbuf[i][1];
-    pd[i].qcoeff = ctx->qcoeff_pbuf[i][1];
+    p[i].qcoeff = ctx->qcoeff_pbuf[i][1];
     pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
     pd[i].eobs = ctx->eobs_pbuf[i][1];
   }
 
   for (i = max_plane; i < MAX_MB_PLANE; ++i) {
     p[i].coeff = ctx->coeff_pbuf[i][2];
-    pd[i].qcoeff = ctx->qcoeff_pbuf[i][2];
+    p[i].qcoeff = ctx->qcoeff_pbuf[i][2];
     pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2];
     pd[i].eobs = ctx->eobs_pbuf[i][2];
   }
@@ -405,10 +456,12 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
   for (y = 0; y < mi_height; y++)
     for (x_idx = 0; x_idx < mi_width; x_idx++)
       if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx
-          && (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y)
+        && (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) {
         xd->mi_8x8[x_idx + y * mis] = mi_addr;
+      }
 
-  if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+    if ((cpi->oxcf.aq_mode == VARIANCE_AQ) ||
+        (cpi->oxcf.aq_mode == COMPLEXITY_AQ)) {
     vp9_mb_init_quantizer(cpi, x);
   }
 
@@ -478,9 +531,9 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
       ++cm->counts.switchable_interp[ctx][mbmi->interp_filter];
     }
 
-    cpi->rd_comp_pred_diff[SINGLE_PREDICTION_ONLY] += ctx->single_pred_diff;
-    cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff;
-    cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff;
+    cpi->rd_comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
+    cpi->rd_comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
+    cpi->rd_comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
 
     for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
       cpi->rd_filter_diff[i] += ctx->best_filter_diff[i];
@@ -557,7 +610,7 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
 
   /* segment ID */
   if (seg->enabled) {
-    if (!cpi->oxcf.aq_mode == VARIANCE_AQ) {
+    if (cpi->oxcf.aq_mode != VARIANCE_AQ) {
       uint8_t *map = seg->update_map ? cpi->segmentation_map
           : cm->last_frame_seg_map;
       mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
@@ -622,7 +675,7 @@ static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     p[i].coeff = ctx->coeff_pbuf[i][0];
-    pd[i].qcoeff = ctx->qcoeff_pbuf[i][0];
+    p[i].qcoeff = ctx->qcoeff_pbuf[i][0];
     pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][0];
     pd[i].eobs = ctx->eobs_pbuf[i][0];
   }
@@ -653,6 +706,14 @@ static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
   if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
     vp9_clear_system_state();  // __asm emms;
     x->rdmult = round(x->rdmult * rdmult_ratio);
+  } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+    const int mi_offset = mi_row * cm->mi_cols + mi_col;
+    unsigned char complexity = cpi->complexity_map[mi_offset];
+    const int is_edge = (mi_row == 0) || (mi_row == (cm->mi_rows - 1)) ||
+                        (mi_col == 0) || (mi_col == (cm->mi_cols - 1));
+
+    if (!is_edge && (complexity > 128))
+      x->rdmult = x->rdmult  + ((x->rdmult * (complexity - 128)) / 256);
   }
 
   // Find best coding mode & reconstruct the MB so it is available
@@ -697,7 +758,7 @@ static void update_stats(VP9_COMP *cpi) {
     // reference frame allowed for the segment so exclude it from
     // the reference frame counts used to work out probabilities.
     if (is_inter_block(mbmi) && !seg_ref_active) {
-      if (cm->comp_pred_mode == HYBRID_PREDICTION)
+      if (cm->comp_pred_mode == REFERENCE_MODE_SELECT)
         cpi->comp_inter_count[vp9_get_pred_context_comp_inter_inter(cm, xd)]
                              [has_second_ref(mbmi)]++;
 
@@ -1261,8 +1322,19 @@ static void rd_use_partition(VP9_COMP *cpi,
   if ( bsize == BLOCK_64X64)
     assert(chosen_rate < INT_MAX && chosen_dist < INT_MAX);
 
-  if (do_recon)
-    encode_sb(cpi, tile, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize);
+  if (do_recon) {
+    int output_enabled = (bsize == BLOCK_64X64);
+
+    // Check the projected output rate for this SB against its target
+    // and if necessary apply a Q delta using segmentation to get
+    // closer to the target.
+    if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) {
+      select_in_frame_q_segment(cpi, mi_row, mi_col,
+                                output_enabled, chosen_rate);
+    }
+
+    encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize);
+  }
 
   *rate = chosen_rate;
   *dist = chosen_dist;
@@ -1495,10 +1567,14 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
   // Override skipping rectangular partition operations for edge blocks
   const int force_horz_split = (mi_row + ms >= cm->mi_rows);
   const int force_vert_split = (mi_col + ms >= cm->mi_cols);
+  const int xss = x->e_mbd.plane[1].subsampling_x;
+  const int yss = x->e_mbd.plane[1].subsampling_y;
 
   int partition_none_allowed = !force_horz_split && !force_vert_split;
-  int partition_horz_allowed = !force_vert_split && bsize >= BLOCK_8X8;
-  int partition_vert_allowed = !force_horz_split && bsize >= BLOCK_8X8;
+  int partition_horz_allowed = !force_vert_split && yss <= xss &&
+                               bsize >= BLOCK_8X8;
+  int partition_vert_allowed = !force_horz_split && xss <= yss &&
+                               bsize >= BLOCK_8X8;
 
   int partition_split_done = 0;
   (void) *tp_orig;
@@ -1740,8 +1816,17 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
   *rate = best_rate;
   *dist = best_dist;
 
-  if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon)
-    encode_sb(cpi, tile, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize);
+  if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon) {
+    int output_enabled = (bsize == BLOCK_64X64);
+
+    // Check the projected output rate for this SB against its target
+    // and if necessary apply a Q delta using segmentation to get
+    // closer to the target.
+    if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) {
+      select_in_frame_q_segment(cpi, mi_row, mi_col, output_enabled, best_rate);
+    }
+    encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize);
+  }
   if (bsize == BLOCK_64X64) {
     assert(tp_orig < *tp);
     assert(best_rate < INT_MAX);
@@ -1868,10 +1953,6 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
 
   xd->mode_info_stride = cm->mode_info_stride;
 
-  // reset intra mode contexts
-  if (frame_is_intra_only(cm))
-    vp9_init_mbmode_probs(cm);
-
   // Copy data over into macro block data structures.
   vp9_setup_src_planes(x, cpi->Source, 0, 0);
 
@@ -2234,18 +2315,18 @@ void vp9_encode_frame(VP9_COMP *cpi) {
 
     /* prediction (compound, single or hybrid) mode selection */
     if (frame_type == 3 || !cm->allow_comp_inter_inter)
-      pred_type = SINGLE_PREDICTION_ONLY;
+      pred_type = SINGLE_REFERENCE;
     else if (cpi->rd_prediction_type_threshes[frame_type][1]
              > cpi->rd_prediction_type_threshes[frame_type][0]
              && cpi->rd_prediction_type_threshes[frame_type][1]
              > cpi->rd_prediction_type_threshes[frame_type][2]
              && check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100)
-      pred_type = COMP_PREDICTION_ONLY;
+      pred_type = COMPOUND_REFERENCE;
     else if (cpi->rd_prediction_type_threshes[frame_type][0]
              > cpi->rd_prediction_type_threshes[frame_type][2])
-      pred_type = SINGLE_PREDICTION_ONLY;
+      pred_type = SINGLE_REFERENCE;
     else
-      pred_type = HYBRID_PREDICTION;
+      pred_type = REFERENCE_MODE_SELECT;
 
     /* filter type selection */
     // FIXME(rbultje) for some odd reason, we often select smooth_filter
@@ -2282,7 +2363,7 @@ void vp9_encode_frame(VP9_COMP *cpi) {
     cpi->common.mcomp_filter_type = filter_type;
     encode_frame_internal(cpi);
 
-    for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
+    for (i = 0; i < REFERENCE_MODES; ++i) {
       const int diff = (int) (cpi->rd_comp_pred_diff[i] / cpi->common.MBs);
       cpi->rd_prediction_type_threshes[frame_type][i] += diff;
       cpi->rd_prediction_type_threshes[frame_type][i] >>= 1;
@@ -2305,7 +2386,7 @@ void vp9_encode_frame(VP9_COMP *cpi) {
       cpi->rd_tx_select_threshes[frame_type][i] /= 2;
     }
 
-    if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+    if (cpi->common.comp_pred_mode == REFERENCE_MODE_SELECT) {
       int single_count_zero = 0;
       int comp_count_zero = 0;
 
@@ -2315,10 +2396,10 @@ void vp9_encode_frame(VP9_COMP *cpi) {
       }
 
       if (comp_count_zero == 0) {
-        cpi->common.comp_pred_mode = SINGLE_PREDICTION_ONLY;
+        cpi->common.comp_pred_mode = SINGLE_REFERENCE;
         vp9_zero(cpi->comp_inter_count);
       } else if (single_count_zero == 0) {
-        cpi->common.comp_pred_mode = COMP_PREDICTION_ONLY;
+        cpi->common.comp_pred_mode = COMPOUND_REFERENCE;
         vp9_zero(cpi->comp_inter_count);
       }
     }
@@ -2415,7 +2496,8 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
   const int mis = cm->mode_info_stride;
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
-  x->skip_recode = !x->select_txfm_size && mbmi->sb_type >= BLOCK_8X8;
+  x->skip_recode = !x->select_txfm_size && mbmi->sb_type >= BLOCK_8X8 &&
+                   (cpi->oxcf.aq_mode != COMPLEXITY_AQ);
   x->skip_optimize = ctx->is_coded;
   ctx->is_coded = 1;
   x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct;
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 88cf11214..3691e7a7b 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -136,6 +136,7 @@ static void optimize_b(MACROBLOCK *mb,
                        ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
                        TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &mb->e_mbd;
+  struct macroblock_plane *p = &mb->plane[plane];
   struct macroblockd_plane *pd = &xd->plane[plane];
   const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi);
   vp9_token_state tokens[1025][2];
@@ -163,7 +164,7 @@ static void optimize_b(MACROBLOCK *mb,
 
   assert((!type && !plane) || (type && plane));
   dqcoeff_ptr = BLOCK_OFFSET(pd->dqcoeff, block);
-  qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
+  qcoeff_ptr = BLOCK_OFFSET(p->qcoeff, block);
   assert(eob <= default_eob);
 
   /* Now set up a Viterbi trellis to evaluate alternative roundings. */
@@ -368,26 +369,23 @@ void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize,
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
   int16_t *coeff = BLOCK_OFFSET(p->coeff, block);
-  int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block);
+  int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   const scan_order *so;
   uint16_t *eob = &pd->eobs[block];
-  const int bwl = b_width_log2(plane_bsize), bw = 1 << bwl;
-  const int twl = bwl - tx_size, twmask = (1 << twl) - 1;
-  int xoff, yoff;
+  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  int i, j;
   int16_t *src_diff;
+  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+  src_diff = &p->src_diff[4 * (j * diff_stride + i)];
 
   switch (tx_size) {
     case TX_32X32:
       so = &vp9_default_scan_orders[TX_32X32];
-      block >>= 6;
-      xoff = 32 * (block & twmask);
-      yoff = 32 * (block >> twl);
-      src_diff = p->src_diff + 4 * bw * yoff + xoff;
       if (x->use_lp32x32fdct)
-        vp9_fdct32x32_rd(src_diff, coeff, bw * 4);
+        vp9_fdct32x32_rd(src_diff, coeff, diff_stride);
       else
-        vp9_fdct32x32(src_diff, coeff, bw * 4);
+        vp9_fdct32x32(src_diff, coeff, diff_stride);
       vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
                            p->quant, p->quant_shift, qcoeff, dqcoeff,
                            pd->dequant, p->zbin_extra, eob, so->scan,
@@ -395,32 +393,21 @@ void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize,
       break;
     case TX_16X16:
       so = &vp9_default_scan_orders[TX_16X16];
-      block >>= 4;
-      xoff = 16 * (block & twmask);
-      yoff = 16 * (block >> twl);
-      src_diff = p->src_diff + 4 * bw * yoff + xoff;
-      vp9_fdct16x16(src_diff, coeff, bw * 4);
+      vp9_fdct16x16(src_diff, coeff, diff_stride);
       vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
                      p->quant, p->quant_shift, qcoeff, dqcoeff,
                      pd->dequant, p->zbin_extra, eob, so->scan, so->iscan);
       break;
     case TX_8X8:
       so = &vp9_default_scan_orders[TX_8X8];
-      block >>= 2;
-      xoff = 8 * (block & twmask);
-      yoff = 8 * (block >> twl);
-      src_diff = p->src_diff + 4 * bw * yoff + xoff;
-      vp9_fdct8x8(src_diff, coeff, bw * 4);
+      vp9_fdct8x8(src_diff, coeff, diff_stride);
       vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
                      p->quant, p->quant_shift, qcoeff, dqcoeff,
                      pd->dequant, p->zbin_extra, eob, so->scan, so->iscan);
       break;
     case TX_4X4:
       so = &vp9_default_scan_orders[TX_4X4];
-      xoff = 4 * (block & twmask);
-      yoff = 4 * (block >> twl);
-      src_diff = p->src_diff + 4 * bw * yoff + xoff;
-      x->fwd_txm4x4(src_diff, coeff, bw * 4);
+      x->fwd_txm4x4(src_diff, coeff, diff_stride);
       vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
                      p->quant, p->quant_shift, qcoeff, dqcoeff,
                      pd->dequant, p->zbin_extra, eob, so->scan, so->iscan);
@@ -544,7 +531,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
   int16_t *coeff = BLOCK_OFFSET(p->coeff, block);
-  int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block);
+  int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   const scan_order *so;
   TX_TYPE tx_type;
@@ -572,8 +559,9 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
       mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
       block >>= 6;
       vp9_predict_intra_block(xd, block, bwl, TX_32X32, mode,
-                              dst, pd->dst.stride, dst, pd->dst.stride);
-
+                              x->skip_encode ? src : dst,
+                              x->skip_encode ? p->src.stride : pd->dst.stride,
+                              dst, pd->dst.stride);
       if (!x->skip_recode) {
         vp9_subtract_block(32, 32, src_diff, diff_stride,
                            src, p->src.stride, dst, pd->dst.stride);
@@ -595,7 +583,9 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
       mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
       block >>= 4;
       vp9_predict_intra_block(xd, block, bwl, TX_16X16, mode,
-                              dst, pd->dst.stride, dst, pd->dst.stride);
+                              x->skip_encode ? src : dst,
+                              x->skip_encode ? p->src.stride : pd->dst.stride,
+                              dst, pd->dst.stride);
       if (!x->skip_recode) {
         vp9_subtract_block(16, 16, src_diff, diff_stride,
                            src, p->src.stride, dst, pd->dst.stride);
@@ -613,7 +603,9 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
       mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
       block >>= 2;
       vp9_predict_intra_block(xd, block, bwl, TX_8X8, mode,
-                              dst, pd->dst.stride, dst, pd->dst.stride);
+                              x->skip_encode ? src : dst,
+                              x->skip_encode ? p->src.stride : pd->dst.stride,
+                              dst, pd->dst.stride);
       if (!x->skip_recode) {
         vp9_subtract_block(8, 8, src_diff, diff_stride,
                            src, p->src.stride, dst, pd->dst.stride);
@@ -634,7 +626,9 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
         mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
 
       vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode,
-                              dst, pd->dst.stride, dst, pd->dst.stride);
+                              x->skip_encode ? src : dst,
+                              x->skip_encode ? p->src.stride : pd->dst.stride,
+                              dst, pd->dst.stride);
 
       if (!x->skip_recode) {
         vp9_subtract_block(4, 4, src_diff, diff_stride,
diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index cc4e347a3..3f01c778f 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -15,11 +15,22 @@
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9/encoder/vp9_encodemv.h"
 
-
 #ifdef ENTROPY_STATS
 extern unsigned int active_section;
 #endif
 
+static struct vp9_token mv_joint_encodings[MV_JOINTS];
+static struct vp9_token mv_class_encodings[MV_CLASSES];
+static struct vp9_token mv_fp_encodings[MV_FP_SIZE];
+static struct vp9_token mv_class0_encodings[CLASS0_SIZE];
+
+void vp9_entropy_mv_init() {
+  vp9_tokens_from_tree(mv_joint_encodings, vp9_mv_joint_tree);
+  vp9_tokens_from_tree(mv_class_encodings, vp9_mv_class_tree);
+  vp9_tokens_from_tree(mv_class0_encodings, vp9_mv_class0_tree);
+  vp9_tokens_from_tree(mv_fp_encodings, vp9_mv_fp_tree);
+}
+
 static void encode_mv_component(vp9_writer* w, int comp,
                                 const nmv_component* mvcomp, int usehp) {
   int offset;
@@ -37,12 +48,12 @@ static void encode_mv_component(vp9_writer* w, int comp,
 
   // Class
   write_token(w, vp9_mv_class_tree, mvcomp->classes,
-              &vp9_mv_class_encodings[mv_class]);
+              &mv_class_encodings[mv_class]);
 
   // Integer bits
   if (mv_class == MV_CLASS_0) {
     write_token(w, vp9_mv_class0_tree, mvcomp->class0,
-                &vp9_mv_class0_encodings[d]);
+                &mv_class0_encodings[d]);
   } else {
     int i;
     const int n = mv_class + CLASS0_BITS - 1;  // number of bits
@@ -53,7 +64,7 @@ static void encode_mv_component(vp9_writer* w, int comp,
   // Fractional bits
   write_token(w, vp9_mv_fp_tree,
               mv_class == MV_CLASS_0 ?  mvcomp->class0_fp[d] : mvcomp->fp,
-              &vp9_mv_fp_encodings[fr]);
+              &mv_fp_encodings[fr]);
 
   // High precision bit
   if (usehp)
@@ -137,111 +148,55 @@ static int update_mv(vp9_writer *w, const unsigned int ct[2], vp9_prob *cur_p,
   return update;
 }
 
-static void counts_to_nmv_context(
-    nmv_context_counts *nmv_count,
-    int usehp,
-    unsigned int (*branch_ct_joint)[2],
-    unsigned int (*branch_ct_sign)[2],
-    unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2],
-    unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2],
-    unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2],
-    unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][MV_FP_SIZE - 1][2],
-    unsigned int (*branch_ct_fp)[MV_FP_SIZE - 1][2],
-    unsigned int (*branch_ct_class0_hp)[2],
-    unsigned int (*branch_ct_hp)[2]) {
-  int i, j, k;
-  vp9_tree_probs_from_distribution(vp9_mv_joint_tree, branch_ct_joint,
-                                   nmv_count->joints);
-  for (i = 0; i < 2; ++i) {
-    branch_ct_sign[i][0] = nmv_count->comps[i].sign[0];
-    branch_ct_sign[i][1] = nmv_count->comps[i].sign[1];
-    vp9_tree_probs_from_distribution(vp9_mv_class_tree,
-                                    branch_ct_classes[i],
-                                    nmv_count->comps[i].classes);
-    vp9_tree_probs_from_distribution(vp9_mv_class0_tree,
-                                     branch_ct_class0[i],
-                                     nmv_count->comps[i].class0);
-    for (j = 0; j < MV_OFFSET_BITS; ++j) {
-      branch_ct_bits[i][j][0] = nmv_count->comps[i].bits[j][0];
-      branch_ct_bits[i][j][1] = nmv_count->comps[i].bits[j][1];
-    }
-  }
-  for (i = 0; i < 2; ++i) {
-    for (k = 0; k < CLASS0_SIZE; ++k) {
-      vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
-                                       branch_ct_class0_fp[i][k],
-                                       nmv_count->comps[i].class0_fp[k]);
-    }
-    vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
-                                     branch_ct_fp[i],
-                                     nmv_count->comps[i].fp);
-  }
-  if (usehp) {
-    for (i = 0; i < 2; ++i) {
-      branch_ct_class0_hp[i][0] = nmv_count->comps[i].class0_hp[0];
-      branch_ct_class0_hp[i][1] = nmv_count->comps[i].class0_hp[1];
+static void write_mv_update(const vp9_tree_index *tree,
+                            vp9_prob probs[/*n - 1*/],
+                            const unsigned int counts[/*n - 1*/],
+                            int n, vp9_writer *w) {
+  int i;
+  unsigned int branch_ct[32][2];
 
-      branch_ct_hp[i][0] = nmv_count->comps[i].hp[0];
-      branch_ct_hp[i][1] = nmv_count->comps[i].hp[1];
-    }
-  }
+  // Assuming max number of probabilities <= 32
+  assert(n <= 32);
+
+  vp9_tree_probs_from_distribution(tree, branch_ct, counts);
+  for (i = 0; i < n - 1; ++i)
+    update_mv(w, branch_ct[i], &probs[i], NMV_UPDATE_PROB);
 }
 
-void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) {
+void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer *w) {
   int i, j;
-  unsigned int branch_ct_joint[MV_JOINTS - 1][2];
-  unsigned int branch_ct_sign[2][2];
-  unsigned int branch_ct_classes[2][MV_CLASSES - 1][2];
-  unsigned int branch_ct_class0[2][CLASS0_SIZE - 1][2];
-  unsigned int branch_ct_bits[2][MV_OFFSET_BITS][2];
-  unsigned int branch_ct_class0_fp[2][CLASS0_SIZE][MV_FP_SIZE - 1][2];
-  unsigned int branch_ct_fp[2][MV_FP_SIZE - 1][2];
-  unsigned int branch_ct_class0_hp[2][2];
-  unsigned int branch_ct_hp[2][2];
   nmv_context *mvc = &cpi->common.fc.nmvc;
+  nmv_context_counts *counts = &cpi->NMVcount;
 
-  counts_to_nmv_context(&cpi->NMVcount, usehp,
-                        branch_ct_joint, branch_ct_sign, branch_ct_classes,
-                        branch_ct_class0, branch_ct_bits,
-                        branch_ct_class0_fp, branch_ct_fp,
-                        branch_ct_class0_hp, branch_ct_hp);
-
-  for (j = 0; j < MV_JOINTS - 1; ++j)
-    update_mv(bc, branch_ct_joint[j], &mvc->joints[j], NMV_UPDATE_PROB);
+  write_mv_update(vp9_mv_joint_tree, mvc->joints, counts->joints, MV_JOINTS, w);
 
   for (i = 0; i < 2; ++i) {
-    update_mv(bc, branch_ct_sign[i], &mvc->comps[i].sign, NMV_UPDATE_PROB);
-    for (j = 0; j < MV_CLASSES - 1; ++j)
-      update_mv(bc, branch_ct_classes[i][j], &mvc->comps[i].classes[j],
-                NMV_UPDATE_PROB);
-
-    for (j = 0; j < CLASS0_SIZE - 1; ++j)
-      update_mv(bc, branch_ct_class0[i][j], &mvc->comps[i].class0[j],
-                NMV_UPDATE_PROB);
-
+    nmv_component *comp = &mvc->comps[i];
+    nmv_component_counts *comp_counts = &counts->comps[i];
+
+    update_mv(w, comp_counts->sign, &comp->sign, NMV_UPDATE_PROB);
+    write_mv_update(vp9_mv_class_tree, comp->classes, comp_counts->classes,
+                    MV_CLASSES, w);
+    write_mv_update(vp9_mv_class0_tree, comp->class0, comp_counts->class0,
+                    CLASS0_SIZE, w);
     for (j = 0; j < MV_OFFSET_BITS; ++j)
-      update_mv(bc, branch_ct_bits[i][j], &mvc->comps[i].bits[j],
-                NMV_UPDATE_PROB);
+      update_mv(w, comp_counts->bits[j], &comp->bits[j], NMV_UPDATE_PROB);
   }
 
   for (i = 0; i < 2; ++i) {
-    for (j = 0; j < CLASS0_SIZE; ++j) {
-      int k;
-      for (k = 0; k < MV_FP_SIZE - 1; ++k)
-        update_mv(bc, branch_ct_class0_fp[i][j][k],
-                  &mvc->comps[i].class0_fp[j][k], NMV_UPDATE_PROB);
-    }
+    for (j = 0; j < CLASS0_SIZE; ++j)
+      write_mv_update(vp9_mv_fp_tree, mvc->comps[i].class0_fp[j],
+                      counts->comps[i].class0_fp[j], MV_FP_SIZE, w);
 
-    for (j = 0; j < MV_FP_SIZE - 1; ++j)
-      update_mv(bc, branch_ct_fp[i][j], &mvc->comps[i].fp[j], NMV_UPDATE_PROB);
+    write_mv_update(vp9_mv_fp_tree, mvc->comps[i].fp, counts->comps[i].fp,
+                    MV_FP_SIZE, w);
   }
 
   if (usehp) {
     for (i = 0; i < 2; ++i) {
-      update_mv(bc, branch_ct_class0_hp[i], &mvc->comps[i].class0_hp,
-                NMV_UPDATE_PROB);
-      update_mv(bc, branch_ct_hp[i], &mvc->comps[i].hp,
+      update_mv(w, counts->comps[i].class0_hp, &mvc->comps[i].class0_hp,
                 NMV_UPDATE_PROB);
+      update_mv(w, counts->comps[i].hp, &mvc->comps[i].hp, NMV_UPDATE_PROB);
     }
   }
 }
@@ -254,7 +209,7 @@ void vp9_encode_mv(VP9_COMP* cpi, vp9_writer* w,
   const MV_JOINT_TYPE j = vp9_get_mv_joint(&diff);
   usehp = usehp && vp9_use_mv_hp(ref);
 
-  write_token(w, vp9_mv_joint_tree, mvctx->joints, &vp9_mv_joint_encodings[j]);
+  write_token(w, vp9_mv_joint_tree, mvctx->joints, &mv_joint_encodings[j]);
   if (mv_joint_vertical(j))
     encode_mv_component(w, diff.row, &mvctx->comps[0], usehp);
 
@@ -314,3 +269,4 @@ void vp9_update_mv_count(VP9_COMP *cpi, MACROBLOCK *x, int_mv best_ref_mv[2]) {
     inc_mvs(mbmi->mv, best_ref_mv, is_compound, &cpi->NMVcount);
   }
 }
+
diff --git a/vp9/encoder/vp9_encodemv.h b/vp9/encoder/vp9_encodemv.h
index 633177885..4cc10da73 100644
--- a/vp9/encoder/vp9_encodemv.h
+++ b/vp9/encoder/vp9_encodemv.h
@@ -14,6 +14,8 @@
 
 #include "vp9/encoder/vp9_onyx_int.h"
 
+void vp9_entropy_mv_init();
+
 void vp9_write_nmv_probs(VP9_COMP* const, int usehp, vp9_writer* const);
 
 void vp9_encode_mv(VP9_COMP *cpi, vp9_writer* w, const MV* mv, const MV* ref,
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index df2841020..50d803680 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -76,6 +76,19 @@ static int select_cq_level(int qindex) {
   return ret_val;
 }
 
+static int gfboost_qadjust(int qindex) {  // percentage applied to gfu_boost
+  const double q = vp9_convert_qindex_to_q(qindex);  // q index -> real q
+  return (int)((0.00000828 * q * q * q) +  // cubic in q, truncated to int
+               (-0.0055 * q * q) +
+               (1.32 * q) + 79.3);
+}
+
+static int kfboost_qadjust(int qindex) {
+  const double q = vp9_convert_qindex_to_q(qindex);  // q index -> real q
+  return (int)((0.00000973 * q * q * q) +  // cubic in q, truncated to int
+               (-0.00613 * q * q) +
+               (1.316 * q) + 121.2);
+}
 
 // Resets the first pass file to the given position using a relative seek from
 // the current position.
@@ -336,9 +349,11 @@ static int frame_max_bits(VP9_COMP *cpi) {
   const double max_bits = (1.0 * cpi->twopass.bits_left /
       (cpi->twopass.total_stats.count - cpi->common.current_video_frame)) *
       (cpi->oxcf.two_pass_vbrmax_section / 100.0);
-
-  // Trap case where we are out of bits.
-  return MAX((int)max_bits, 0);
+  if (max_bits < 0)
+      return 0;
+  if (max_bits >= INT_MAX)
+    return INT_MAX;
+  return (int)max_bits;
 }
 
 void vp9_init_first_pass(VP9_COMP *cpi) {
@@ -528,7 +543,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     p[i].coeff = ctx->coeff_pbuf[i][1];
-    pd[i].qcoeff = ctx->qcoeff_pbuf[i][1];
+    p[i].qcoeff = ctx->qcoeff_pbuf[i][1];
     pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
     pd[i].eobs = ctx->eobs_pbuf[i][1];
   }
@@ -926,11 +941,11 @@ static int64_t estimate_modemvcost(VP9_COMP *cpi,
   intra_cost = bitcost(av_intra);
 
   // Estimate of extra bits per mv overhead for mbs
-  // << 9 is the normalization to the (bits * 512) used in vp9_bits_per_mb
+  // << 9 is the normalization to the (bits * 512) used in vp9_rc_bits_per_mb
   mv_cost = ((int)(fpstats->new_mv_count / fpstats->count) * 8) << 9;
 
   // Crude estimate of overhead cost from modes
-  // << 9 is the normalization to (bits * 512) used in vp9_bits_per_mb
+  // << 9 is the normalization to (bits * 512) used in vp9_rc_bits_per_mb
   mode_cost =
     (int)((((av_pct_inter - av_pct_motion) * zz_cost) +
            (av_pct_motion * motion_cost) +
@@ -1050,8 +1065,8 @@ static int estimate_max_q(VP9_COMP *cpi,
                                 sr_correction * speed_correction *
                                 cpi->twopass.est_max_qcorrection_factor;
 
-    bits_per_mb_at_this_q = vp9_bits_per_mb(INTER_FRAME, q,
-                                            err_correction_factor);
+    bits_per_mb_at_this_q = vp9_rc_bits_per_mb(INTER_FRAME, q,
+                                               err_correction_factor);
 
     if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
       break;
@@ -1138,7 +1153,7 @@ static int estimate_cq(VP9_COMP *cpi,
       sr_correction * speed_correction * clip_iifactor;
 
     bits_per_mb_at_this_q =
-      vp9_bits_per_mb(INTER_FRAME, q, err_correction_factor);
+      vp9_rc_bits_per_mb(INTER_FRAME, q, err_correction_factor);
 
     if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
       break;
@@ -1934,7 +1949,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     int q = cpi->rc.last_q[INTER_FRAME];
     int gf_bits;
 
-    int boost = (cpi->rc.gfu_boost * vp9_gfboost_qadjust(q)) / 100;
+    int boost = (cpi->rc.gfu_boost * gfboost_qadjust(q)) / 100;
 
     // Set max and minimum boost and hence minimum allocation
     boost = clamp(boost, 125, (cpi->rc.baseline_gf_interval + 1) * 200);
@@ -2726,3 +2741,21 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   // sizes.
   cpi->twopass.modified_error_left -= kf_group_err;
 }
+
+void vp9_twopass_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
+#ifdef DISABLE_RC_LONG_TERM_MEM  // NOTE(review): true even if defined as 0
+  cpi->twopass.bits_left -=  cpi->rc.this_frame_target;
+#else
+  cpi->twopass.bits_left -= 8 * bytes_used;  // charge the actual size in bits
+#endif
+  if (!cpi->refresh_alt_ref_frame) {  // no top-up on alt ref refreshes
+    double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.framerate;
+    double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth *
+                                        cpi->oxcf.two_pass_vbrmin_section
+                                        / 100);
+    if (two_pass_min_rate < lower_bounds_min_rate)  // keep the larger rate
+      two_pass_min_rate = lower_bounds_min_rate;
+    cpi->twopass.bits_left += (int64_t)(two_pass_min_rate /
+                                        cpi->oxcf.framerate);
+  }
+}
diff --git a/vp9/encoder/vp9_modecosts.c b/vp9/encoder/vp9_modecosts.c
deleted file mode 100644
index 7eb659232..000000000
--- a/vp9/encoder/vp9_modecosts.c
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp9/common/vp9_blockd.h"
-#include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/encoder/vp9_treewriter.h"
-#include "vp9/common/vp9_entropymode.h"
-
-
-void vp9_init_mode_costs(VP9_COMP *c) {
-  VP9_COMMON *const cm = &c->common;
-  const vp9_tree_index *KT = vp9_intra_mode_tree;
-  int i, j;
-
-  for (i = 0; i < INTRA_MODES; i++) {
-    for (j = 0; j < INTRA_MODES; j++) {
-      vp9_cost_tokens((int *)c->mb.y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j],
-                      KT);
-    }
-  }
-
-  // TODO(rbultje) separate tables for superblock costing?
-  vp9_cost_tokens(c->mb.mbmode_cost, cm->fc.y_mode_prob[1],
-                  vp9_intra_mode_tree);
-  vp9_cost_tokens(c->mb.intra_uv_mode_cost[1],
-                  cm->fc.uv_mode_prob[INTRA_MODES - 1], vp9_intra_mode_tree);
-  vp9_cost_tokens(c->mb.intra_uv_mode_cost[0],
-                  vp9_kf_uv_mode_prob[INTRA_MODES - 1],
-                  vp9_intra_mode_tree);
-
-  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-    vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
-                    cm->fc.switchable_interp_prob[i],
-                    vp9_switchable_interp_tree);
-}
diff --git a/vp9/encoder/vp9_modecosts.h b/vp9/encoder/vp9_modecosts.h
deleted file mode 100644
index f43033e5f..000000000
--- a/vp9/encoder/vp9_modecosts.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP9_ENCODER_VP9_MODECOSTS_H_
-#define VP9_ENCODER_VP9_MODECOSTS_H_
-
-void vp9_init_mode_costs(VP9_COMP *x);
-
-#endif  // VP9_ENCODER_VP9_MODECOSTS_H_
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index d7b179689..8ae70c9bb 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -24,6 +24,8 @@
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9/common/vp9_tile_common.h"
+
+#include "vp9/encoder/vp9_encodemv.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/encoder/vp9_mbgraph.h"
 #include "vp9/encoder/vp9_onyx_int.h"
@@ -37,6 +39,9 @@
 
 #include "vpx_ports/vpx_timer.h"
 
+void vp9_entropy_mode_init(void);
+void vp9_coef_tree_initialize(void);
+
 static void set_default_lf_deltas(struct loopfilter *lf);
 
 #define DEFAULT_INTERP_FILTER SWITCHABLE
@@ -109,6 +114,9 @@ extern unsigned __int64 Sectionbits[500];
 
 extern void vp9_init_quantizer(VP9_COMP *cpi);
 
+static const double in_frame_q_adj_ratio[MAX_SEGMENTS] =
+  {1.0, 1.5, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0};  // rate ratio per segment id
+
 static INLINE void Scale2Ratio(int mode, int *hr, int *hs) {
   switch (mode) {
     case NORMAL:
@@ -151,11 +159,14 @@ void vp9_initialize_enc() {
 
   if (!init_done) {
     vp9_initialize_common();
+    vp9_coef_tree_initialize();
     vp9_tokenize_initialize();
     vp9_init_quant_tables();
     vp9_init_me_luts();
-    vp9_init_minq_luts();
+    vp9_rc_init_minq_luts();
     // init_base_skip_probs();
+    vp9_entropy_mv_init();
+    vp9_entropy_mode_init();
     init_done = 1;
   }
 }
@@ -192,6 +203,8 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
   vpx_free(cpi->coding_context.last_frame_seg_map_copy);
   cpi->coding_context.last_frame_seg_map_copy = 0;
 
+  vpx_free(cpi->complexity_map);
+  cpi->complexity_map = 0;
   vpx_free(cpi->active_map);
   cpi->active_map = 0;
 
@@ -243,6 +256,79 @@ int vp9_compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget) {
   return target_index - start_index;
 }
 
+// Computes a q delta (in "q index" terms) to get from a starting q index
+// to the first index whose projected bits per mb are no more than
+// rate_target_ratio times the projection at the starting index.
+int vp9_compute_qdelta_by_rate(VP9_COMP *cpi,
+                               double base_q_index, double rate_target_ratio) {
+  int i;
+  int base_bits_per_mb;
+  int target_bits_per_mb;
+  int target_index = cpi->rc.worst_quality;
+
+  // Make SURE use of floating point in this function is safe.
+  vp9_clear_system_state();
+
+  // Look up the current projected bits per block for the base index.
+  base_bits_per_mb = vp9_rc_bits_per_mb(cpi->common.frame_type,
+                                        (int)base_q_index, 1.0);
+
+  // Find the target bits per mb based on the base value and given ratio.
+  target_bits_per_mb = (int)(rate_target_ratio * base_bits_per_mb);
+
+  // Scan upwards from best quality for the first index that fits the target.
+  for (i = cpi->rc.best_quality; i < cpi->rc.worst_quality; i++) {
+    target_index = i;
+    if (vp9_rc_bits_per_mb(cpi->common.frame_type, i, 1.0) <=
+        target_bits_per_mb)
+      break;
+  }
+
+  return (int)(target_index - base_q_index);
+}
+
+// Sets up segments carrying delta Q values around the baseline frame
+// quantizer, for use by the in frame Q adjustment.
+static void setup_in_frame_q_adj(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  struct segmentation *const seg = &cm->seg;
+  const int map_size = cm->mi_rows * cm->mi_cols;
+  int seg_id;
+
+  // Make SURE use of floating point in this function is safe.
+  vp9_clear_system_state();
+
+  // Only key frames, alt ref refreshes and golden refreshes that are not
+  // alt ref sourced reconfigure the segments.
+  if (cm->frame_type != KEY_FRAME &&
+      !cpi->refresh_alt_ref_frame &&
+      !(cpi->refresh_golden_frame && !cpi->is_src_frame_alt_ref))
+    return;
+
+  // Clear down the segment map and the complexity map used for rd.
+  vpx_memset(cpi->segmentation_map, 0, map_size);
+  vpx_memset(cpi->complexity_map, 0, map_size);
+
+  // Enable segmentation and clear all segment features.
+  vp9_enable_segmentation((VP9_PTR)cpi);
+  vp9_clearall_segfeatures(seg);
+
+  // Select delta coding method.
+  seg->abs_delta = SEGMENT_DELTADATA;
+
+  // Segment 0 "Q" feature is disabled so it defaults to the baseline Q.
+  vp9_disable_segfeature(seg, 0, SEG_LVL_ALT_Q);
+
+  // Segments 1 and 2 get a q index delta derived from their rate ratio.
+  for (seg_id = 1; seg_id <= 2; ++seg_id) {
+    const int delta_q =
+        vp9_compute_qdelta_by_rate(cpi, cm->base_qindex,
+                                   in_frame_q_adj_ratio[seg_id]);
+    vp9_enable_segfeature(seg, seg_id, SEG_LVL_ALT_Q);
+    vp9_set_segdata(seg, seg_id, SEG_LVL_ALT_Q, delta_q);
+  }
+}
+
 static void configure_static_seg_features(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
   struct segmentation *seg = &cm->seg;
@@ -1446,6 +1532,11 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
   CHECK_MEM_ERROR(cm, cpi->segmentation_map,
                   vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
 
+  // Create a complexity map used for rd adjustment
+  CHECK_MEM_ERROR(cm, cpi->complexity_map,
+                  vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
+
+
   // And a place holder structure is the coding context
   // for use if we want to save and restore it
   CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy,
@@ -2597,7 +2688,7 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) {
 static void encode_with_recode_loop(VP9_COMP *cpi,
                                     unsigned long *size,
                                     uint8_t *dest,
-                                    int q,
+                                    int *q,
                                     int bottom_index,
                                     int top_index,
                                     int frame_over_shoot_limit,
@@ -2607,12 +2698,11 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
   int loop = 0;
   int overshoot_seen = 0;
   int undershoot_seen = 0;
-  int active_worst_qchanged = 0;
   int q_low = bottom_index, q_high = top_index;
   do {
     vp9_clear_system_state();  // __asm emms;
 
-    vp9_set_quantizer(cpi, q);
+    vp9_set_quantizer(cpi, *q);
 
     if (loop_count == 0) {
       // Set up entropy context depending on frame type. The decoder mandates
@@ -2630,8 +2720,12 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
       }
     }
 
+    // Variance adaptive and in frame q adjustment experiments are mutually
+    // exclusive.
     if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
-        vp9_vaq_frame_setup(cpi);
+      vp9_vaq_frame_setup(cpi);
+    } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+      setup_in_frame_q_adj(cpi);
     }
 
     // transform / motion compensation build reconstruction frame
@@ -2655,14 +2749,13 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
 
     if (frame_over_shoot_limit == 0)
       frame_over_shoot_limit = 1;
-    active_worst_qchanged = 0;
 
     if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
       loop = 0;
     } else {
       // Special case handling for forced key frames
       if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
-        int last_q = q;
+        int last_q = *q;
         int kf_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm));
 
         int high_err_target = cpi->ambient_err;
@@ -2678,32 +2771,32 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
             (kf_err > low_err_target &&
              cpi->rc.projected_frame_size <= frame_under_shoot_limit)) {
           // Lower q_high
-          q_high = q > q_low ? q - 1 : q_low;
+          q_high = *q > q_low ? *q - 1 : q_low;
 
           // Adjust Q
-          q = (q * high_err_target) / kf_err;
-          q = MIN(q, (q_high + q_low) >> 1);
+          *q = ((*q) * high_err_target) / kf_err;
+          *q = MIN((*q), (q_high + q_low) >> 1);
         } else if (kf_err < low_err_target &&
                    cpi->rc.projected_frame_size >= frame_under_shoot_limit) {
           // The key frame is much better than the previous frame
           // Raise q_low
-          q_low = q < q_high ? q + 1 : q_high;
+          q_low = *q < q_high ? *q + 1 : q_high;
 
           // Adjust Q
-          q = (q * low_err_target) / kf_err;
-          q = MIN(q, (q_high + q_low + 1) >> 1);
+          *q = ((*q) * low_err_target) / kf_err;
+          *q = MIN((*q), (q_high + q_low + 1) >> 1);
         }
 
         // Clamp Q to upper and lower limits:
-        q = clamp(q, q_low, q_high);
+        *q = clamp(*q, q_low, q_high);
 
-        loop = q != last_q;
+        loop = *q != last_q;
       } else if (recode_loop_test(
           cpi, frame_over_shoot_limit, frame_under_shoot_limit,
-          q, top_index, bottom_index)) {
+          *q, top_index, bottom_index)) {
         // Is the projected frame size out of range and are we allowed
         // to attempt to recode.
-        int last_q = q;
+        int last_q = *q;
         int retries = 0;
 
         // Frame size out of permitted range:
@@ -2712,26 +2805,22 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
         // Frame is too large
         if (cpi->rc.projected_frame_size > cpi->rc.this_frame_target) {
           // Raise Qlow as to at least the current value
-          q_low = q < q_high ? q + 1 : q_high;
+          q_low = *q < q_high ? *q + 1 : q_high;
 
           if (undershoot_seen || loop_count > 1) {
             // Update rate_correction_factor unless
-            // cpi->rc.active_worst_quality has changed.
-            if (!active_worst_qchanged)
-              vp9_update_rate_correction_factors(cpi, 1);
+            vp9_rc_update_rate_correction_factors(cpi, 1);
 
-            q = (q_high + q_low + 1) / 2;
+            *q = (q_high + q_low + 1) / 2;
           } else {
             // Update rate_correction_factor unless
-            // cpi->rc.active_worst_quality has changed.
-            if (!active_worst_qchanged)
-              vp9_update_rate_correction_factors(cpi, 0);
+            vp9_rc_update_rate_correction_factors(cpi, 0);
 
-            q = vp9_regulate_q(cpi, cpi->rc.this_frame_target);
+            *q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target);
 
-            while (q < q_low && retries < 10) {
-              vp9_update_rate_correction_factors(cpi, 0);
-              q = vp9_regulate_q(cpi, cpi->rc.this_frame_target);
+            while (*q < q_low && retries < 10) {
+              vp9_rc_update_rate_correction_factors(cpi, 0);
+              *q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target);
               retries++;
             }
           }
@@ -2739,34 +2828,33 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
           overshoot_seen = 1;
         } else {
           // Frame is too small
-          q_high = q > q_low ? q - 1 : q_low;
+          q_high = *q > q_low ? *q - 1 : q_low;
 
           if (overshoot_seen || loop_count > 1) {
             // Update rate_correction_factor unless
             // cpi->rc.active_worst_quality has changed.
-            if (!active_worst_qchanged)
-              vp9_update_rate_correction_factors(cpi, 1);
+            vp9_rc_update_rate_correction_factors(cpi, 1);
 
-            q = (q_high + q_low) / 2;
+            *q = (q_high + q_low) / 2;
           } else {
             // Update rate_correction_factor unless
             // cpi->rc.active_worst_quality has changed.
-            if (!active_worst_qchanged)
-              vp9_update_rate_correction_factors(cpi, 0);
+            vp9_rc_update_rate_correction_factors(cpi, 0);
 
-            q = vp9_regulate_q(cpi, cpi->rc.this_frame_target);
+            *q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target);
 
             // Special case reset for qlow for constrained quality.
             // This should only trigger where there is very substantial
             // undershoot on a frame and the auto cq level is above
             // the user passsed in value.
-            if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY && q < q_low) {
-              q_low = q;
+            if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY &&
+                *q < q_low) {
+              q_low = *q;
             }
 
-            while (q > q_high && retries < 10) {
-              vp9_update_rate_correction_factors(cpi, 0);
-              q = vp9_regulate_q(cpi, cpi->rc.this_frame_target);
+            while (*q > q_high && retries < 10) {
+              vp9_rc_update_rate_correction_factors(cpi, 0);
+              *q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target);
               retries++;
             }
           }
@@ -2775,9 +2863,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
         }
 
         // Clamp Q to upper and lower limits:
-        q = clamp(q, q_low, q_high);
+        *q = clamp(*q, q_low, q_high);
 
-        loop = q != last_q;
+        loop = *q != last_q;
       } else {
         loop = 0;
       }
@@ -2794,7 +2882,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
 #endif
     }
   } while (loop);
-  cpi->rc.active_worst_qchanged = active_worst_qchanged;
 }
 
 static void encode_frame_to_data_rate(VP9_COMP *cpi,
@@ -2919,15 +3006,16 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
     configure_static_seg_features(cpi);
   }
 
-  // Decide how big to make the frame.
-  vp9_pick_frame_size(cpi);
-
   vp9_clear_system_state();
 
-  q = vp9_pick_q_and_adjust_q_bounds(cpi, &bottom_index, &top_index);
+  // Decide how big to make the frame.
+  vp9_rc_pick_frame_size_and_bounds(cpi,
+                                    &frame_under_shoot_limit,
+                                    &frame_over_shoot_limit);
 
-  vp9_compute_frame_size_bounds(cpi, &frame_under_shoot_limit,
-                                &frame_over_shoot_limit);
+  q = vp9_rc_pick_q_and_adjust_q_bounds(cpi,
+                                        &bottom_index,
+                                        &top_index);
 
 #if CONFIG_MULTIPLE_ARF
   // Force the quantizer determined by the coding order pattern.
@@ -2991,7 +3079,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   encode_with_recode_loop(cpi,
                           size,
                           dest,
-                          q,
+                          &q,
                           bottom_index,
                           top_index,
                           frame_over_shoot_limit,
@@ -3073,106 +3161,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
    * needed in motion search besides loopfilter */
   cm->last_frame_type = cm->frame_type;
 
-  // Update rate control heuristics
-  cpi->rc.projected_frame_size = (*size) << 3;
-
-  // Post encode loop adjustment of Q prediction.
-  if (!cpi->rc.active_worst_qchanged)
-    vp9_update_rate_correction_factors(cpi, (cpi->sf.recode_loop ||
-        cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) ? 2 : 0);
-
-
-  cpi->rc.last_q[cm->frame_type] = cm->base_qindex;
-
-  // Keep record of last boosted (KF/KF/ARF) Q value.
-  // If the current frame is coded at a lower Q then we also update it.
-  // If all mbs in this group are skipped only update if the Q value is
-  // better than that already stored.
-  // This is used to help set quality in forced key frames to reduce popping
-  if ((cm->base_qindex < cpi->rc.last_boosted_qindex) ||
-      ((cpi->static_mb_pct < 100) &&
-       ((cm->frame_type == KEY_FRAME) ||
-        cpi->refresh_alt_ref_frame ||
-        (cpi->refresh_golden_frame && !cpi->is_src_frame_alt_ref)))) {
-    cpi->rc.last_boosted_qindex = cm->base_qindex;
-  }
-
-  if (cm->frame_type == KEY_FRAME) {
-    vp9_adjust_key_frame_context(cpi);
-  }
-
-  // Keep a record of ambient average Q.
-  if (cm->frame_type != KEY_FRAME)
-    cpi->rc.avg_frame_qindex = (2 + 3 * cpi->rc.avg_frame_qindex +
-                            cm->base_qindex) >> 2;
-
-  // Keep a record from which we can calculate the average Q excluding GF
-  // updates and key frames.
-  if (cm->frame_type != KEY_FRAME &&
-      !cpi->refresh_golden_frame &&
-      !cpi->refresh_alt_ref_frame) {
-    cpi->rc.ni_frames++;
-    cpi->rc.tot_q += vp9_convert_qindex_to_q(q);
-    cpi->rc.avg_q = cpi->rc.tot_q / (double)cpi->rc.ni_frames;
-
-    // Calculate the average Q for normal inter frames (not key or GFU frames).
-    cpi->rc.ni_tot_qi += q;
-    cpi->rc.ni_av_qi = cpi->rc.ni_tot_qi / cpi->rc.ni_frames;
-  }
-
-  // Update the buffer level variable.
-  // Non-viewable frames are a special case and are treated as pure overhead.
-  if (!cm->show_frame)
-    cpi->rc.bits_off_target -= cpi->rc.projected_frame_size;
-  else
-    cpi->rc.bits_off_target += cpi->rc.av_per_frame_bandwidth -
-                               cpi->rc.projected_frame_size;
-
-  // Clip the buffer level at the maximum buffer size
-  if (cpi->rc.bits_off_target > cpi->oxcf.maximum_buffer_size)
-    cpi->rc.bits_off_target = cpi->oxcf.maximum_buffer_size;
-
-  // Rolling monitors of whether we are over or underspending used to help
-  // regulate min and Max Q in two pass.
-  if (cm->frame_type != KEY_FRAME) {
-    cpi->rc.rolling_target_bits =
-        ((cpi->rc.rolling_target_bits * 3) +
-         cpi->rc.this_frame_target + 2) / 4;
-    cpi->rc.rolling_actual_bits =
-        ((cpi->rc.rolling_actual_bits * 3) +
-         cpi->rc.projected_frame_size + 2) / 4;
-    cpi->rc.long_rolling_target_bits =
-        ((cpi->rc.long_rolling_target_bits * 31) +
-         cpi->rc.this_frame_target + 16) / 32;
-    cpi->rc.long_rolling_actual_bits =
-        ((cpi->rc.long_rolling_actual_bits * 31) +
-         cpi->rc.projected_frame_size + 16) / 32;
-  }
-
-  // Actual bits spent
-  cpi->rc.total_actual_bits += cpi->rc.projected_frame_size;
-
-  // Debug stats
-  cpi->rc.total_target_vs_actual += (cpi->rc.this_frame_target -
-                                     cpi->rc.projected_frame_size);
-
-  cpi->rc.buffer_level = cpi->rc.bits_off_target;
-
-#ifndef DISABLE_RC_LONG_TERM_MEM
-  // Update bits left to the kf and gf groups to account for overshoot or
-  // undershoot on these frames
-  if (cm->frame_type == KEY_FRAME) {
-    cpi->twopass.kf_group_bits += cpi->rc.this_frame_target -
-                                  cpi->rc.projected_frame_size;
-
-    cpi->twopass.kf_group_bits = MAX(cpi->twopass.kf_group_bits, 0);
-  } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) {
-    cpi->twopass.gf_group_bits += cpi->rc.this_frame_target -
-                                  cpi->rc.projected_frame_size;
-
-    cpi->twopass.gf_group_bits = MAX(cpi->twopass.gf_group_bits, 0);
-  }
-#endif
+  vp9_rc_postencode_update(cpi, *size, q);
 
 #if 0
   output_frame_level_debug_stats(cpi);
@@ -3302,6 +3291,10 @@ static void Pass2Encode(VP9_COMP *cpi, unsigned long *size,
 
   encode_frame_to_data_rate(cpi, size, dest, frame_flags);
   // vp9_print_modes_and_motion_vectors(&cpi->common, "encode.stt");
+
+  vp9_twopass_postencode_update(cpi, *size);
+
+  /*
 #ifdef DISABLE_RC_LONG_TERM_MEM
   cpi->twopass.bits_left -=  cpi->rc.this_frame_target;
 #else
@@ -3320,6 +3313,7 @@ static void Pass2Encode(VP9_COMP *cpi, unsigned long *size,
     cpi->twopass.bits_left += (int64_t)(two_pass_min_rate
                               / cpi->oxcf.framerate);
   }
+  */
 }
 
 static void check_initial_width(VP9_COMP *cpi, YV12_BUFFER_CONFIG *sd) {
@@ -3614,8 +3608,12 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
                            VP9BORDERINPIXELS);
 
   // Calculate scaling factors for each of the 3 available references
-  for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i)
+  for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
     vp9_setup_scale_factors(cm, i);
+    if (vp9_is_scaled(&cm->active_ref_scale_comm[i]))
+      vp9_extend_frame_borders(&cm->yv12_fb[cm->active_ref_idx[i]],
+                               cm->subsampling_x, cm->subsampling_y);
+  }
 
   vp9_setup_interp_filters(&cpi->mb.e_mbd, DEFAULT_INTERP_FILTER, cm);
 
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 54af75633..b8602e094 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -30,7 +30,6 @@
 #include "vp9/encoder/vp9_lookahead.h"
 
 #define DISABLE_RC_LONG_TERM_MEM 0
-
 // #define MODE_TEST_HIT_STATS
 
 // #define SPEEDSTATS 1
@@ -293,6 +292,7 @@ typedef struct {
   // Rate targetting variables
   int this_frame_target;
   int projected_frame_size;
+  int sb64_target_rate;
   int last_q[2];                   // Separate values for Intra/Inter
   int last_boosted_qindex;         // Last boosted GF/KF/ARF q
 
@@ -339,7 +339,6 @@ typedef struct {
   int active_worst_quality;
   int best_quality;
   int active_best_quality;
-  int active_worst_qchanged;
 } RATE_CONTROL;
 
 typedef struct VP9_COMP {
@@ -431,8 +430,8 @@ typedef struct VP9_COMP {
   int rd_thresh_sub8x8[MAX_SEGMENTS][BLOCK_SIZES][MAX_REFS];
   int rd_thresh_freq_sub8x8[BLOCK_SIZES][MAX_REFS];
 
-  int64_t rd_comp_pred_diff[NB_PREDICTION_TYPES];
-  int64_t rd_prediction_type_threshes[4][NB_PREDICTION_TYPES];
+  int64_t rd_comp_pred_diff[REFERENCE_MODES];
+  int64_t rd_prediction_type_threshes[4][REFERENCE_MODES];
   unsigned int intra_inter_count[INTRA_INTER_CONTEXTS][2];
   unsigned int comp_inter_count[COMP_INTER_CONTEXTS][2];
   unsigned int single_ref_count[REF_CONTEXTS][2][2];
@@ -516,6 +515,8 @@ typedef struct VP9_COMP {
   // segment threashold for encode breakout
   int  segment_encode_breakout[MAX_SEGMENTS];
 
+  unsigned char *complexity_map;
+
   unsigned char *active_map;
   unsigned int active_map_enabled;
 
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index d24be96f6..2591a5783 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -137,45 +137,18 @@ void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
   *eob_ptr = eob + 1;
 }
 
-struct plane_block_idx {
-  int plane;
-  int block;
-};
-
-// TODO(jkoleszar): returning a struct so it can be used in a const context,
-// expect to refactor this further later.
-static INLINE struct plane_block_idx plane_block_idx(int y_blocks,
-                                                     int b_idx) {
-  const int v_offset = y_blocks * 5 / 4;
-  struct plane_block_idx res;
-
-  if (b_idx < y_blocks) {
-    res.plane = 0;
-    res.block = b_idx;
-  } else if (b_idx < v_offset) {
-    res.plane = 1;
-    res.block = b_idx - y_blocks;
-  } else {
-    assert(b_idx < y_blocks * 3 / 2);
-    res.plane = 2;
-    res.block = b_idx - v_offset;
-  }
-  return res;
-}
-
-void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int y_blocks, int b_idx,
+void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
                                 const int16_t *scan, const int16_t *iscan) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
-  struct macroblock_plane* p = &x->plane[pb_idx.plane];
-  struct macroblockd_plane* pd = &xd->plane[pb_idx.plane];
+  struct macroblock_plane* p = &x->plane[plane];
+  struct macroblockd_plane* pd = &xd->plane[plane];
 
-  vp9_quantize_b(BLOCK_OFFSET(p->coeff, pb_idx.block),
+  vp9_quantize_b(BLOCK_OFFSET(p->coeff, block),
            16, x->skip_block,
            p->zbin, p->round, p->quant, p->quant_shift,
-           BLOCK_OFFSET(pd->qcoeff, pb_idx.block),
-           BLOCK_OFFSET(pd->dqcoeff, pb_idx.block),
-           pd->dequant, p->zbin_extra, &pd->eobs[pb_idx.block], scan, iscan);
+           BLOCK_OFFSET(p->qcoeff, block),
+           BLOCK_OFFSET(pd->dqcoeff, block),
+           pd->dequant, p->zbin_extra, &pd->eobs[block], scan, iscan);
 }
 
 static void invert_quant(int16_t *quant, int16_t *shift, int d) {
diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h
index c078e1d41..41cfa5283 100644
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -13,7 +13,7 @@
 
 #include "vp9/encoder/vp9_block.h"
 
-void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int y_blocks, int b_idx,
+void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
                                 const int16_t *scan, const int16_t *iscan);
 
 struct VP9_COMP;
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 1293e860f..bf1fc4f31 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -26,6 +26,8 @@
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/common/vp9_seg_common.h"
 
+#define LIMIT_QRANGE_FOR_ALTREF_AND_KEY 1
+
 #define MIN_BPB_FACTOR 0.005
 #define MAX_BPB_FACTOR 50
 
@@ -67,7 +69,7 @@ static int calculate_minq_index(double maxq,
   return QINDEX_RANGE - 1;
 }
 
-void vp9_init_minq_luts(void) {
+void vp9_rc_init_minq_luts(void) {
   int i;
 
   for (i = 0; i < QINDEX_RANGE; i++) {
@@ -121,22 +123,8 @@ double vp9_convert_qindex_to_q(int qindex) {
   return vp9_ac_quant(qindex, 0) / 4.0;
 }
 
-int vp9_gfboost_qadjust(int qindex) {
-  const double q = vp9_convert_qindex_to_q(qindex);
-  return (int)((0.00000828 * q * q * q) +
-               (-0.0055 * q * q) +
-               (1.32 * q) + 79.3);
-}
-
-static int kfboost_qadjust(int qindex) {
-  const double q = vp9_convert_qindex_to_q(qindex);
-  return (int)((0.00000973 * q * q * q) +
-               (-0.00613 * q * q) +
-               (1.316 * q) + 121.2);
-}
-
-int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex,
-                    double correction_factor) {
+int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
+                       double correction_factor) {
   const double q = vp9_convert_qindex_to_q(qindex);
   int enumerator = frame_type == KEY_FRAME ? 3300000 : 2250000;
 
@@ -213,7 +201,7 @@ void vp9_setup_inter_frame(VP9_COMP *cpi) {
 
 static int estimate_bits_at_q(int frame_kind, int q, int mbs,
                               double correction_factor) {
-  const int bpm = (int)(vp9_bits_per_mb(frame_kind, q, correction_factor));
+  const int bpm = (int)(vp9_rc_bits_per_mb(frame_kind, q, correction_factor));
 
   // Attempt to retain reasonable accuracy without overflow. The cutoff is
   // chosen such that the maximum product of Bpm and MBs fits 31 bits. The
@@ -240,11 +228,9 @@ static void calc_iframe_target_size(VP9_COMP *cpi) {
     if (target > max_rate)
       target = max_rate;
   }
-
   cpi->rc.this_frame_target = target;
 }
 
-
 //  Do the best we can to define the parameters for the next GF based
 //  on what information we have available.
 //
@@ -300,7 +286,7 @@ static void calc_pframe_target_size(VP9_COMP *cpi) {
 }
 
 
-void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
+void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
   const int q = cpi->common.base_qindex;
   int correction_factor = 100;
   double rate_correction_factor;
@@ -381,7 +367,7 @@ void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
 }
 
 
-int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) {
+int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame) {
   int q = cpi->rc.active_worst_quality;
 
   int i;
@@ -413,8 +399,8 @@ int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) {
   i = cpi->rc.active_best_quality;
 
   do {
-    bits_per_mb_at_this_q = (int)vp9_bits_per_mb(cpi->common.frame_type, i,
-                                                 correction_factor);
+    bits_per_mb_at_this_q = (int)vp9_rc_bits_per_mb(cpi->common.frame_type, i,
+                                                    correction_factor);
 
     if (bits_per_mb_at_this_q <= target_bits_per_mb) {
       if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)
@@ -452,8 +438,9 @@ static int get_active_quality(int q,
   return active_best_quality;
 }
 
-int vp9_pick_q_and_adjust_q_bounds(VP9_COMP *cpi,
-                                   int * bottom_index, int * top_index) {
+int vp9_rc_pick_q_and_adjust_q_bounds(VP9_COMP *cpi,
+                                      int *bottom_index,
+                                      int *top_index) {
   // Set an active best quality and if necessary active worst quality
   int q = cpi->rc.active_worst_quality;
   VP9_COMMON *const cm = &cpi->common;
@@ -472,7 +459,12 @@ int vp9_pick_q_and_adjust_q_bounds(VP9_COMP *cpi,
                                         (last_boosted_q * 0.75));
 
       cpi->rc.active_best_quality = MAX(qindex + delta_qindex,
-                                     cpi->rc.best_quality);
+                                        cpi->rc.best_quality);
+    } else if (cpi->pass == 0 && cpi->common.current_video_frame == 0) {
+      // If this is the first (key) frame in 1-pass, active best/worst is
+      // the user best/worst-allowed, and leave the top_index to active_worst.
+      cpi->rc.active_best_quality = cpi->oxcf.best_allowed_q;
+      cpi->rc.active_worst_quality = cpi->oxcf.worst_allowed_q;
     } else {
       int high = 5000;
       int low = 400;
@@ -481,9 +473,9 @@ int vp9_pick_q_and_adjust_q_bounds(VP9_COMP *cpi,
 
       // Baseline value derived from cpi->active_worst_quality and kf boost
       cpi->rc.active_best_quality = get_active_quality(q, cpi->rc.kf_boost,
-                                                    low, high,
-                                                    kf_low_motion_minq,
-                                                    kf_high_motion_minq);
+                                                       low, high,
+                                                       kf_low_motion_minq,
+                                                       kf_high_motion_minq);
 
       // Allow somewhat lower kf minq with small image formats.
       if ((cm->width * cm->height) <= (352 * 288)) {
@@ -524,14 +516,14 @@ int vp9_pick_q_and_adjust_q_bounds(VP9_COMP *cpi,
         q = cpi->cq_target_quality;
       if (cpi->frames_since_key > 1) {
         cpi->rc.active_best_quality = get_active_quality(q, cpi->rc.gfu_boost,
-                                                      low, high,
-                                                      afq_low_motion_minq,
-                                                      afq_high_motion_minq);
+                                                         low, high,
+                                                         afq_low_motion_minq,
+                                                         afq_high_motion_minq);
       } else {
         cpi->rc.active_best_quality = get_active_quality(q, cpi->rc.gfu_boost,
-                                                      low, high,
-                                                      gf_low_motion_minq,
-                                                      gf_high_motion_minq);
+                                                         low, high,
+                                                         gf_low_motion_minq,
+                                                         gf_high_motion_minq);
       }
       // Constrained quality use slightly lower active best.
       cpi->rc.active_best_quality = cpi->rc.active_best_quality * 15 / 16;
@@ -541,22 +533,19 @@ int vp9_pick_q_and_adjust_q_bounds(VP9_COMP *cpi,
         cpi->rc.active_best_quality = cpi->cq_target_quality;
       } else {
         if (cpi->frames_since_key > 1) {
-          cpi->rc.active_best_quality = get_active_quality(q, cpi->rc.gfu_boost,
-                                                        low, high,
-                                                        afq_low_motion_minq,
-                                                        afq_high_motion_minq);
+          cpi->rc.active_best_quality = get_active_quality(
+              q, cpi->rc.gfu_boost, low, high,
+              afq_low_motion_minq, afq_high_motion_minq);
         } else {
-          cpi->rc.active_best_quality = get_active_quality(q, cpi->rc.gfu_boost,
-                                                        low, high,
-                                                        gf_low_motion_minq,
-                                                        gf_high_motion_minq);
+          cpi->rc.active_best_quality = get_active_quality(
+              q, cpi->rc.gfu_boost, low, high,
+              gf_low_motion_minq, gf_high_motion_minq);
         }
       }
     } else {
-        cpi->rc.active_best_quality = get_active_quality(q, cpi->rc.gfu_boost,
-                                                         low, high,
-                                                         gf_low_motion_minq,
-                                                         gf_high_motion_minq);
+        cpi->rc.active_best_quality = get_active_quality(
+            q, cpi->rc.gfu_boost, low, high,
+            gf_low_motion_minq, gf_high_motion_minq);
     }
   } else {
     if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
@@ -596,25 +585,23 @@ int vp9_pick_q_and_adjust_q_bounds(VP9_COMP *cpi,
   if (cpi->rc.active_worst_quality < cpi->rc.active_best_quality)
     cpi->rc.active_worst_quality = cpi->rc.active_best_quality;
 
+  *top_index = cpi->rc.active_worst_quality;
+  *bottom_index = cpi->rc.active_best_quality;
+
+#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
   // Limit Q range for the adaptive loop.
   if (cm->frame_type == KEY_FRAME && !cpi->this_key_frame_forced) {
-    *top_index =
-      (cpi->rc.active_worst_quality + cpi->rc.active_best_quality * 3) / 4;
-    // If this is the first (key) frame in 1-pass, active best is the user
-    // best-allowed, and leave the top_index to active_worst.
-    if (cpi->pass == 0 && cpi->common.current_video_frame == 0) {
-      cpi->rc.active_best_quality = cpi->oxcf.best_allowed_q;
-      *top_index = cpi->oxcf.worst_allowed_q;
+    if (!(cpi->pass == 0 && cpi->common.current_video_frame == 0)) {
+      *top_index =
+          (cpi->rc.active_worst_quality + cpi->rc.active_best_quality * 3) / 4;
     }
   } else if (!cpi->is_src_frame_alt_ref &&
              (cpi->oxcf.end_usage != USAGE_STREAM_FROM_SERVER) &&
              (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
     *top_index =
       (cpi->rc.active_worst_quality + cpi->rc.active_best_quality) / 2;
-  } else {
-    *top_index = cpi->rc.active_worst_quality;
   }
-  *bottom_index = cpi->rc.active_best_quality;
+#endif
 
   if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
     q = cpi->rc.active_best_quality;
@@ -627,14 +614,13 @@ int vp9_pick_q_and_adjust_q_bounds(VP9_COMP *cpi,
       // 1-pass: for now, use per-frame-bw for target size of frame, scaled
       // by |x| for key frame.
       int scale = (cm->frame_type == KEY_FRAME) ? 5 : 1;
-      q = vp9_regulate_q(cpi, scale * cpi->rc.av_per_frame_bandwidth);
+      q = vp9_rc_regulate_q(cpi, scale * cpi->rc.av_per_frame_bandwidth);
     } else {
-      q = vp9_regulate_q(cpi, cpi->rc.this_frame_target);
+      q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target);
     }
     if (q > *top_index)
       q = *top_index;
   }
-
   return q;
 }
 
@@ -686,7 +672,7 @@ static int estimate_keyframe_frequency(VP9_COMP *cpi) {
 }
 
 
-void vp9_adjust_key_frame_context(VP9_COMP *cpi) {
+static void adjust_key_frame_context(VP9_COMP *cpi) {
   // Clear down mmx registers to allow floating point in what follows
   vp9_clear_system_state();
 
@@ -695,28 +681,30 @@ void vp9_adjust_key_frame_context(VP9_COMP *cpi) {
 }
 
 
-void vp9_compute_frame_size_bounds(VP9_COMP *cpi, int *frame_under_shoot_limit,
-                                   int *frame_over_shoot_limit) {
+static void compute_frame_size_bounds(const VP9_COMP *cpi,
+                                      int this_frame_target,
+                                      int *frame_under_shoot_limit,
+                                      int *frame_over_shoot_limit) {
   // Set-up bounds on acceptable frame size:
   if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
     *frame_under_shoot_limit = 0;
     *frame_over_shoot_limit  = INT_MAX;
   } else {
     if (cpi->common.frame_type == KEY_FRAME) {
-      *frame_over_shoot_limit  = cpi->rc.this_frame_target * 9 / 8;
-      *frame_under_shoot_limit = cpi->rc.this_frame_target * 7 / 8;
+      *frame_over_shoot_limit  = this_frame_target * 9 / 8;
+      *frame_under_shoot_limit = this_frame_target * 7 / 8;
     } else {
       if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) {
-        *frame_over_shoot_limit  = cpi->rc.this_frame_target * 9 / 8;
-        *frame_under_shoot_limit = cpi->rc.this_frame_target * 7 / 8;
+        *frame_over_shoot_limit  = this_frame_target * 9 / 8;
+        *frame_under_shoot_limit = this_frame_target * 7 / 8;
       } else {
         // Stron overshoot limit for constrained quality
         if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
-          *frame_over_shoot_limit  = cpi->rc.this_frame_target * 11 / 8;
-          *frame_under_shoot_limit = cpi->rc.this_frame_target * 2 / 8;
+          *frame_over_shoot_limit  = this_frame_target * 11 / 8;
+          *frame_under_shoot_limit = this_frame_target * 2 / 8;
         } else {
-          *frame_over_shoot_limit  = cpi->rc.this_frame_target * 11 / 8;
-          *frame_under_shoot_limit = cpi->rc.this_frame_target * 5 / 8;
+          *frame_over_shoot_limit  = this_frame_target * 11 / 8;
+          *frame_under_shoot_limit = this_frame_target * 5 / 8;
         }
       }
     }
@@ -731,9 +719,10 @@ void vp9_compute_frame_size_bounds(VP9_COMP *cpi, int *frame_under_shoot_limit,
   }
 }
 
-
 // return of 0 means drop frame
-int vp9_pick_frame_size(VP9_COMP *cpi) {
+int vp9_rc_pick_frame_size_and_bounds(VP9_COMP *cpi,
+                                      int *frame_under_shoot_limit,
+                                      int *frame_over_shoot_limit) {
   VP9_COMMON *cm = &cpi->common;
 
   if (cm->frame_type == KEY_FRAME)
@@ -741,5 +730,112 @@ int vp9_pick_frame_size(VP9_COMP *cpi) {
   else
     calc_pframe_target_size(cpi);
 
+  // Target rate per SB64 (including partial SB64s).
+  cpi->rc.sb64_target_rate = ((int64_t)cpi->rc.this_frame_target * 64 * 64) /
+                             (cpi->common.width * cpi->common.height);
+  compute_frame_size_bounds(cpi, cpi->rc.this_frame_target,
+                            frame_under_shoot_limit, frame_over_shoot_limit);
+
   return 1;
 }
+
+void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used, int q) {
+  VP9_COMMON *const cm = &cpi->common;
+  // Update rate control heuristics from the actual encoded frame size.
+  cpi->rc.projected_frame_size = (bytes_used << 3);  // bytes -> bits
+
+  // Post encode loop adjustment of Q prediction.
+  vp9_rc_update_rate_correction_factors(
+      cpi, (cpi->sf.recode_loop ||
+            cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) ? 2 : 0);
+
+  cpi->rc.last_q[cm->frame_type] = cm->base_qindex;
+
+  // Keep record of last boosted (KF/GF/ARF) Q value.
+  // If the current frame is coded at a lower Q then we also update it.
+  // If all mbs in this group are skipped only update if the Q value is
+  // better than that already stored.
+  // This is used to help set quality in forced key frames to reduce popping.
+  if ((cm->base_qindex < cpi->rc.last_boosted_qindex) ||
+      ((cpi->static_mb_pct < 100) &&
+       ((cm->frame_type == KEY_FRAME) || cpi->refresh_alt_ref_frame ||
+        (cpi->refresh_golden_frame && !cpi->is_src_frame_alt_ref)))) {
+    cpi->rc.last_boosted_qindex = cm->base_qindex;
+  }
+
+  if (cm->frame_type == KEY_FRAME) {
+    adjust_key_frame_context(cpi);
+  }
+
+  // Keep a record of ambient average Q (rounded 3:1 weighted running mean).
+  if (cm->frame_type != KEY_FRAME)
+    cpi->rc.avg_frame_qindex = (2 + 3 * cpi->rc.avg_frame_qindex +
+                            cm->base_qindex) >> 2;
+
+  // Keep a record from which we can calculate the average Q excluding GF
+  // updates and key frames.
+  if (cm->frame_type != KEY_FRAME &&
+      !cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) {
+    cpi->rc.ni_frames++;
+    cpi->rc.tot_q += vp9_convert_qindex_to_q(q);
+    cpi->rc.avg_q = cpi->rc.tot_q / (double)cpi->rc.ni_frames;
+
+    // Calculate the average Q for normal inter frames (not key or GFU frames).
+    cpi->rc.ni_tot_qi += q;
+    cpi->rc.ni_av_qi = cpi->rc.ni_tot_qi / cpi->rc.ni_frames;
+  }
+
+  // Update the buffer level variable.
+  // Non-viewable frames are a special case and are treated as pure overhead.
+  if (!cm->show_frame)
+    cpi->rc.bits_off_target -= cpi->rc.projected_frame_size;
+  else
+    cpi->rc.bits_off_target += cpi->rc.av_per_frame_bandwidth -
+                               cpi->rc.projected_frame_size;
+
+  // Clip the buffer level at the maximum buffer size.
+  if (cpi->rc.bits_off_target > cpi->oxcf.maximum_buffer_size)
+    cpi->rc.bits_off_target = cpi->oxcf.maximum_buffer_size;
+
+  // Rolling monitors of whether we are over or underspending used to help
+  // regulate min and max Q in two pass.
+  if (cm->frame_type != KEY_FRAME) {
+    cpi->rc.rolling_target_bits =
+        ((cpi->rc.rolling_target_bits * 3) +
+         cpi->rc.this_frame_target + 2) / 4;
+    cpi->rc.rolling_actual_bits =
+        ((cpi->rc.rolling_actual_bits * 3) +
+         cpi->rc.projected_frame_size + 2) / 4;
+    cpi->rc.long_rolling_target_bits =
+        ((cpi->rc.long_rolling_target_bits * 31) +
+         cpi->rc.this_frame_target + 16) / 32;
+    cpi->rc.long_rolling_actual_bits =
+        ((cpi->rc.long_rolling_actual_bits * 31) +
+         cpi->rc.projected_frame_size + 16) / 32;
+  }
+
+  // Actual bits spent
+  cpi->rc.total_actual_bits += cpi->rc.projected_frame_size;
+
+  // Debug stats: running sum of (target - actual) bits.
+  cpi->rc.total_target_vs_actual += (cpi->rc.this_frame_target -
+                                     cpi->rc.projected_frame_size);
+
+  cpi->rc.buffer_level = cpi->rc.bits_off_target;
+
+#ifndef DISABLE_RC_LONG_TERM_MEM
+  // Update bits left to the kf and gf groups to account for overshoot or
+  // undershoot on these frames; the remaining budget is clamped at zero.
+  if (cm->frame_type == KEY_FRAME) {
+    cpi->twopass.kf_group_bits += cpi->rc.this_frame_target -
+                                  cpi->rc.projected_frame_size;
+
+    cpi->twopass.kf_group_bits = MAX(cpi->twopass.kf_group_bits, 0);
+  } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) {
+    cpi->twopass.gf_group_bits += cpi->rc.this_frame_target -
+                                  cpi->rc.projected_frame_size;
+
+    cpi->twopass.gf_group_bits = MAX(cpi->twopass.gf_group_bits, 0);
+  }
+#endif
+}
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index 57dcd3f15..f01d18672 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -20,24 +20,41 @@ void vp9_save_coding_context(VP9_COMP *cpi);
 void vp9_restore_coding_context(VP9_COMP *cpi);
 
 void vp9_setup_key_frame(VP9_COMP *cpi);
-void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var);
-int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame);
-void vp9_adjust_key_frame_context(VP9_COMP *cpi);
-void vp9_compute_frame_size_bounds(VP9_COMP *cpi,
-                                   int *frame_under_shoot_limit,
-                                   int *frame_over_shoot_limit);
+void vp9_setup_inter_frame(VP9_COMP *cpi);
 
-void vp9_init_minq_luts(void);
+double vp9_convert_qindex_to_q(int qindex);
 
-// return of 0 means drop frame
-int vp9_pick_frame_size(VP9_COMP *cpi);
+// Updates rate correction factors
+void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var);
 
-double vp9_convert_qindex_to_q(int qindex);
-int vp9_gfboost_qadjust(int qindex);
-int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex,
-                    double correction_factor);
-void vp9_setup_inter_frame(VP9_COMP *cpi);
-int vp9_pick_q_and_adjust_q_bounds(VP9_COMP *cpi,
-                                   int * bottom_index, int * top_index);
+// initialize luts for minq
+void vp9_rc_init_minq_luts(void);
+
+// return of 0 means drop frame
+// Changes rc.this_frame_target and rc.sb64_target_rate
+int vp9_rc_pick_frame_size_and_bounds(VP9_COMP *cpi,
+                                      int *frame_under_shoot_limit,
+                                      int *frame_over_shoot_limit);
+// Picks q and q bounds given the target for bits
+int vp9_rc_pick_q_and_adjust_q_bounds(VP9_COMP *cpi,
+                                      int * bottom_index,
+                                      int * top_index);
+
+// Estimates q to achieve a target bits per frame
+int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame);
+
+// Post encode update of the rate control parameters based
+// on bytes used and q used for the frame
+void vp9_rc_postencode_update(VP9_COMP *cpi,
+                              uint64_t bytes_used,
+                              int q_used);
+
+// estimates bits per mb for a given qindex and correction factor
+int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
+                       double correction_factor);
+
+// Post encode update of the rate control parameters for 2-pass
+void vp9_twopass_postencode_update(VP9_COMP *cpi,
+                                   uint64_t bytes_used);
 
 #endif  // VP9_ENCODER_VP9_RATECTRL_H_
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index fde84298f..65cf5c797 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -17,7 +17,6 @@
 #include "vp9/encoder/vp9_tokenize.h"
 #include "vp9/encoder/vp9_treewriter.h"
 #include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/encoder/vp9_modecosts.h"
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
@@ -114,6 +113,43 @@ static int rd_thresh_block_size_factor[BLOCK_SIZES] =
 #define MV_COST_WEIGHT      108
 #define MV_COST_WEIGHT_SUB  120
 
+static int raster_block_offset(BLOCK_SIZE plane_bsize,
+                               int raster_block, int stride) {
+  const int bw = b_width_log2(plane_bsize);  // presumably log2(blocks per row)
+  const int y = 4 * (raster_block >> bw);  // block row * 4
+  const int x = 4 * (raster_block & ((1 << bw) - 1));  // block column * 4
+  return y * stride + x;  // element offset of (x, y) for the given stride
+}
+static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize,
+                                          int raster_block, int16_t *base) {
+  const int stride = 4 << b_width_log2(plane_bsize);  // 4 * 2^bw per row
+  return base + raster_block_offset(plane_bsize, raster_block, stride);
+}
+
+static void fill_mode_costs(VP9_COMP *c) {  // fills the c->mb mode cost tables
+  VP9_COMMON *const cm = &c->common;
+  int i, j;
+
+  for (i = 0; i < INTRA_MODES; i++)
+    for (j = 0; j < INTRA_MODES; j++)  // costs from the vp9_kf_y_mode_prob table
+      vp9_cost_tokens((int *)c->mb.y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j],
+                      vp9_intra_mode_tree);
+
+  // TODO(rbultje) separate tables for superblock costing?
+  vp9_cost_tokens(c->mb.mbmode_cost, cm->fc.y_mode_prob[1],
+                  vp9_intra_mode_tree);
+  vp9_cost_tokens(c->mb.intra_uv_mode_cost[1],  // from cm->fc probabilities
+                  cm->fc.uv_mode_prob[INTRA_MODES - 1], vp9_intra_mode_tree);
+  vp9_cost_tokens(c->mb.intra_uv_mode_cost[0],  // from the vp9_kf_* table
+                  vp9_kf_uv_mode_prob[INTRA_MODES - 1],
+                  vp9_intra_mode_tree);
+
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)  // one cost row per context
+    vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
+                    cm->fc.switchable_interp_prob[i],
+                    vp9_switchable_interp_tree);
+}
+
 static void fill_token_costs(vp9_coeff_cost *c,
                              vp9_coeff_probs_model (*p)[BLOCK_TYPES]) {
   int i, j, k, l;
@@ -247,7 +283,7 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) {
 
   cpi->mb.select_txfm_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
                               cm->frame_type != KEY_FRAME) ?
-                             0 : 1;
+                              0 : 1;
 
   set_block_thresholds(cpi);
 
@@ -258,7 +294,7 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) {
                     vp9_partition_tree);
 
   /*rough estimate for costing*/
-  vp9_init_mode_costs(cpi);
+  fill_mode_costs(cpi);
 
   if (!frame_is_intra_only(cm)) {
     vp9_build_nmv_cost_table(
@@ -267,15 +303,9 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) {
         &cm->fc.nmvc,
         cm->allow_high_precision_mv, 1, 1);
 
-    for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
-      MB_PREDICTION_MODE m;
-
-      for (m = NEARESTMV; m < MB_MODE_COUNT; m++)
-        cpi->mb.inter_mode_cost[i][INTER_OFFSET(m)] =
-            cost_token(vp9_inter_mode_tree,
-                       cm->fc.inter_mode_probs[i],
-                       &vp9_inter_mode_encodings[INTER_OFFSET(m)]);
-    }
+    for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+      vp9_cost_tokens((int *)cpi->mb.inter_mode_cost[i],
+                      cm->fc.inter_mode_probs[i], vp9_inter_mode_tree);
   }
 }
 
@@ -491,11 +521,12 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
                               const int16_t *scan, const int16_t *nb) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  struct macroblock_plane *p = &x->plane[plane];
   struct macroblockd_plane *pd = &xd->plane[plane];
   const PLANE_TYPE type = pd->plane_type;
   const int16_t *band_count = &band_counts[tx_size][1];
   const int eob = pd->eobs[block];
-  const int16_t *const qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
+  const int16_t *const qcoeff_ptr = BLOCK_OFFSET(p->qcoeff, block);
   const int ref = mbmi->ref_frame[0] != INTRA_FRAME;
   unsigned int (*token_costs)[2][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
                    x->token_costs[tx_size][type][ref];
@@ -588,8 +619,8 @@ static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize,
                            args->scan, args->nb);
 }
 
-static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
-                           TX_SIZE tx_size, void *arg) {
+static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
+                          TX_SIZE tx_size, void *arg) {
   struct rdcost_block_args *args = arg;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -698,7 +729,7 @@ static void txfm_rd_in_plane(MACROBLOCK *x,
   rd_stack->nb = so->neighbors;
 
   foreach_transformed_block_in_plane(xd, bsize, plane,
-                                     block_yrd_txfm, rd_stack);
+                                     block_rd_txfm, rd_stack);
   if (rd_stack->skip) {
     *rate       = INT_MAX;
     *distortion = INT64_MAX;
@@ -745,59 +776,42 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
   int64_t rd[TX_SIZES][2];
   int n, m;
   int s0, s1;
+  const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
+  int64_t best_rd = INT64_MAX;
+  TX_SIZE best_tx = TX_4X4;
 
   const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs);
-
-  for (n = TX_4X4; n <= max_tx_size; n++) {
-    r[n][1] = r[n][0];
-    if (r[n][0] == INT_MAX)
-      continue;
-    for (m = 0; m <= n - (n == max_tx_size); m++) {
-      if (m == n)
-        r[n][1] += vp9_cost_zero(tx_probs[m]);
-      else
-        r[n][1] += vp9_cost_one(tx_probs[m]);
-    }
-  }
-
   assert(skip_prob > 0);
   s0 = vp9_cost_bit(skip_prob, 0);
   s1 = vp9_cost_bit(skip_prob, 1);
 
   for (n = TX_4X4; n <= max_tx_size; n++) {
+    r[n][1] = r[n][0];
+    if (r[n][0] < INT_MAX) {
+      for (m = 0; m <= n - (n == max_tx_size); m++) {
+        if (m == n)
+          r[n][1] += vp9_cost_zero(tx_probs[m]);
+        else
+          r[n][1] += vp9_cost_one(tx_probs[m]);
+      }
+    }
     if (d[n] == INT64_MAX) {
       rd[n][0] = rd[n][1] = INT64_MAX;
-      continue;
-    }
-    if (s[n]) {
+    } else if (s[n]) {
       rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
     } else {
       rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
       rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
     }
-  }
 
-  if (max_tx_size == TX_32X32 &&
-      (cm->tx_mode == ALLOW_32X32 ||
-       (cm->tx_mode == TX_MODE_SELECT &&
-        rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
-        rd[TX_32X32][1] < rd[TX_4X4][1]))) {
-    mbmi->tx_size = TX_32X32;
-  } else if (max_tx_size >= TX_16X16 &&
-             (cm->tx_mode == ALLOW_16X16 ||
-              cm->tx_mode == ALLOW_32X32 ||
-              (cm->tx_mode == TX_MODE_SELECT &&
-               rd[TX_16X16][1] < rd[TX_8X8][1] &&
-               rd[TX_16X16][1] < rd[TX_4X4][1]))) {
-    mbmi->tx_size = TX_16X16;
-  } else if (cm->tx_mode == ALLOW_8X8 ||
-             cm->tx_mode == ALLOW_16X16 ||
-             cm->tx_mode == ALLOW_32X32 ||
-           (cm->tx_mode == TX_MODE_SELECT && rd[TX_8X8][1] < rd[TX_4X4][1])) {
-    mbmi->tx_size = TX_8X8;
-  } else {
-    mbmi->tx_size = TX_4X4;
+    if (rd[n][1] < best_rd) {
+      best_tx = n;
+      best_rd = rd[n][1];
+    }
   }
+  mbmi->tx_size = cm->tx_mode == TX_MODE_SELECT ?
+                      best_tx : MIN(max_tx_size, max_mode_tx_size);
+
 
   *distortion = d[mbmi->tx_size];
   *rate       = r[mbmi->tx_size][cm->tx_mode == TX_MODE_SELECT];
@@ -807,29 +821,18 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
   tx_cache[ALLOW_8X8] = rd[TX_8X8][0];
   tx_cache[ALLOW_16X16] = rd[MIN(max_tx_size, TX_16X16)][0];
   tx_cache[ALLOW_32X32] = rd[MIN(max_tx_size, TX_32X32)][0];
-  if (max_tx_size == TX_32X32 &&
-      rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
-      rd[TX_32X32][1] < rd[TX_4X4][1])
-    tx_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
-  else if (max_tx_size >= TX_16X16 &&
-           rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1])
-    tx_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
-  else
-    tx_cache[TX_MODE_SELECT] = rd[TX_4X4][1] < rd[TX_8X8][1] ?
-                                 rd[TX_4X4][1] : rd[TX_8X8][1];
 
-  if (max_tx_size == TX_32X32 &&
-      rd[TX_32X32][1] < rd[TX_16X16][1] &&
-      rd[TX_32X32][1] < rd[TX_8X8][1] &&
-      rd[TX_32X32][1] < rd[TX_4X4][1]) {
+  if (max_tx_size == TX_32X32 && best_tx == TX_32X32) {
+    tx_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
     cpi->tx_stepdown_count[0]++;
-  } else if (max_tx_size >= TX_16X16 &&
-             rd[TX_16X16][1] < rd[TX_8X8][1] &&
-             rd[TX_16X16][1] < rd[TX_4X4][1]) {
+  } else if (max_tx_size >= TX_16X16 && best_tx == TX_16X16) {
+    tx_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
     cpi->tx_stepdown_count[max_tx_size - TX_16X16]++;
   } else if (rd[TX_8X8][1] < rd[TX_4X4][1]) {
+    tx_cache[TX_MODE_SELECT] = rd[TX_8X8][1];
     cpi->tx_stepdown_count[max_tx_size - TX_8X8]++;
   } else {
+    tx_cache[TX_MODE_SELECT] = rd[TX_4X4][1];
     cpi->tx_stepdown_count[max_tx_size - TX_4X4]++;
   }
 }
@@ -849,14 +852,17 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
   int n, m;
   int s0, s1;
   double scale_rd[TX_SIZES] = {1.73, 1.44, 1.20, 1.00};
-  // double scale_r[TX_SIZES] = {2.82, 2.00, 1.41, 1.00};
+  const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
+  int64_t best_rd = INT64_MAX;
+  TX_SIZE best_tx = TX_4X4;
 
   const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs);
-
-  // for (n = TX_4X4; n <= max_txfm_size; n++)
-  //   r[n][0] = (r[n][0] * scale_r[n]);
+  assert(skip_prob > 0);
+  s0 = vp9_cost_bit(skip_prob, 0);
+  s1 = vp9_cost_bit(skip_prob, 1);
 
   for (n = TX_4X4; n <= max_tx_size; n++) {
+    double scale = scale_rd[n];
     r[n][1] = r[n][0];
     for (m = 0; m <= n - (n == max_tx_size); m++) {
       if (m == n)
@@ -864,62 +870,29 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
       else
         r[n][1] += vp9_cost_one(tx_probs[m]);
     }
-  }
-
-  assert(skip_prob > 0);
-  s0 = vp9_cost_bit(skip_prob, 0);
-  s1 = vp9_cost_bit(skip_prob, 1);
-
-  for (n = TX_4X4; n <= max_tx_size; n++) {
     if (s[n]) {
-      rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
+      rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]) * scale;
     } else {
-      rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
-      rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
+      rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]) * scale;
+      rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]) * scale;
+    }
+    if (rd[n][1] < best_rd) {
+      best_rd = rd[n][1];
+      best_tx = n;
     }
-  }
-  for (n = TX_4X4; n <= max_tx_size; n++) {
-    rd[n][0] = (int64_t)(scale_rd[n] * rd[n][0]);
-    rd[n][1] = (int64_t)(scale_rd[n] * rd[n][1]);
   }
 
-  if (max_tx_size == TX_32X32 &&
-      (cm->tx_mode == ALLOW_32X32 ||
-       (cm->tx_mode == TX_MODE_SELECT &&
-        rd[TX_32X32][1] <= rd[TX_16X16][1] &&
-        rd[TX_32X32][1] <= rd[TX_8X8][1] &&
-        rd[TX_32X32][1] <= rd[TX_4X4][1]))) {
-    mbmi->tx_size = TX_32X32;
-  } else if (max_tx_size >= TX_16X16 &&
-             (cm->tx_mode == ALLOW_16X16 ||
-              cm->tx_mode == ALLOW_32X32 ||
-              (cm->tx_mode == TX_MODE_SELECT &&
-               rd[TX_16X16][1] <= rd[TX_8X8][1] &&
-               rd[TX_16X16][1] <= rd[TX_4X4][1]))) {
-    mbmi->tx_size = TX_16X16;
-  } else if (cm->tx_mode == ALLOW_8X8 ||
-             cm->tx_mode == ALLOW_16X16 ||
-             cm->tx_mode == ALLOW_32X32 ||
-           (cm->tx_mode == TX_MODE_SELECT &&
-            rd[TX_8X8][1] <= rd[TX_4X4][1])) {
-    mbmi->tx_size = TX_8X8;
-  } else {
-    mbmi->tx_size = TX_4X4;
-  }
+  mbmi->tx_size = cm->tx_mode == TX_MODE_SELECT ?
+                      best_tx : MIN(max_tx_size, max_mode_tx_size);
 
   // Actually encode using the chosen mode if a model was used, but do not
   // update the r, d costs
   txfm_rd_in_plane(x, &cpi->rdcost_stack, rate, distortion, skip,
                    &sse[mbmi->tx_size], ref_best_rd, 0, bs, mbmi->tx_size);
 
-  if (max_tx_size == TX_32X32 &&
-      rd[TX_32X32][1] <= rd[TX_16X16][1] &&
-      rd[TX_32X32][1] <= rd[TX_8X8][1] &&
-      rd[TX_32X32][1] <= rd[TX_4X4][1]) {
+  if (max_tx_size == TX_32X32 && best_tx == TX_32X32) {
     cpi->tx_stepdown_count[0]++;
-  } else if (max_tx_size >= TX_16X16 &&
-             rd[TX_16X16][1] <= rd[TX_8X8][1] &&
-             rd[TX_16X16][1] <= rd[TX_4X4][1]) {
+  } else if (max_tx_size >= TX_16X16 &&  best_tx == TX_16X16) {
     cpi->tx_stepdown_count[max_tx_size - TX_16X16]++;
   } else if (rd[TX_8X8][1] <= rd[TX_4X4][1]) {
     cpi->tx_stepdown_count[max_tx_size - TX_8X8]++;
@@ -939,6 +912,9 @@ static void super_block_yrd(VP9_COMP *cpi,
   MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
   struct rdcost_block_args *rdcost_stack = &cpi->rdcost_stack;
   const int b_inter_mode = is_inter_block(mbmi);
+  const TX_SIZE max_tx_size = max_txsize_lookup[bs];
+  TX_SIZE tx_size;
+
 
   assert(bs == mbmi->sb_type);
   if (b_inter_mode)
@@ -957,34 +933,16 @@ static void super_block_yrd(VP9_COMP *cpi,
 
   if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER &&
       b_inter_mode) {
-    if (bs >= BLOCK_32X32)
-      model_rd_for_sb_y_tx(cpi, bs, TX_32X32, x, xd,
-                           &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);
-    if (bs >= BLOCK_16X16)
-      model_rd_for_sb_y_tx(cpi, bs, TX_16X16, x, xd,
-                           &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]);
-
-    model_rd_for_sb_y_tx(cpi, bs, TX_8X8, x, xd,
-                         &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]);
-
-    model_rd_for_sb_y_tx(cpi, bs, TX_4X4, x, xd,
-                         &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]);
-
+    for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
+      model_rd_for_sb_y_tx(cpi, bs, tx_size, x, xd,
+                           &r[tx_size][0], &d[tx_size], &s[tx_size]);
     choose_txfm_size_from_modelrd(cpi, x, r, rate, d, distortion, s,
                                   skip, sse, ref_best_rd, bs);
   } else {
-    if (bs >= BLOCK_32X32)
-      txfm_rd_in_plane(x, rdcost_stack, &r[TX_32X32][0], &d[TX_32X32],
-                       &s[TX_32X32], &sse[TX_32X32],
-                       ref_best_rd, 0, bs, TX_32X32);
-    if (bs >= BLOCK_16X16)
-      txfm_rd_in_plane(x, rdcost_stack, &r[TX_16X16][0], &d[TX_16X16],
-                       &s[TX_16X16], &sse[TX_16X16],
-                       ref_best_rd, 0, bs, TX_16X16);
-    txfm_rd_in_plane(x, rdcost_stack, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],
-                     &sse[TX_8X8], ref_best_rd, 0, bs, TX_8X8);
-    txfm_rd_in_plane(x, rdcost_stack, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],
-                     &sse[TX_4X4], ref_best_rd, 0, bs, TX_4X4);
+    for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
+      txfm_rd_in_plane(x, rdcost_stack, &r[tx_size][0], &d[tx_size],
+                       &s[tx_size], &sse[tx_size],
+                       ref_best_rd, 0, bs, tx_size);
     choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
                              skip, txfm_cache, bs);
   }
@@ -1097,7 +1055,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
         else
           x->fwd_txm4x4(src_diff, coeff, 8);
 
-        vp9_regular_quantize_b_4x4(x, 4, block, so->scan, so->iscan);
+        vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
 
         ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
                              so->scan, so->neighbors);
@@ -1341,11 +1299,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
   int this_rate_tokenonly, this_rate, s;
   int64_t this_distortion, this_sse;
 
-  // int mode_mask = (bsize <= BLOCK_8X8)
-  //                ? ALL_INTRA_MODES : cpi->sf.intra_uv_mode_mask;
-
-  for (mode = DC_PRED; mode <= TM_PRED; mode ++) {
-    // if (!(mode_mask & (1 << mode)))
+  for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
     if (!(cpi->sf.intra_uv_mode_mask[max_uv_txsize_lookup[bsize]]
           & (1 << mode)))
       continue;
@@ -1373,7 +1327,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
         struct macroblockd_plane *const pd = x->e_mbd.plane;
         for (i = 1; i < MAX_MB_PLANE; ++i) {
           p[i].coeff    = ctx->coeff_pbuf[i][2];
-          pd[i].qcoeff  = ctx->qcoeff_pbuf[i][2];
+          p[i].qcoeff   = ctx->qcoeff_pbuf[i][2];
           pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2];
           pd[i].eobs    = ctx->eobs_pbuf[i][2];
 
@@ -1383,7 +1337,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
           ctx->eobs_pbuf[i][2]    = ctx->eobs_pbuf[i][0];
 
           ctx->coeff_pbuf[i][0]   = p[i].coeff;
-          ctx->qcoeff_pbuf[i][0]  = pd[i].qcoeff;
+          ctx->qcoeff_pbuf[i][0]  = p[i].qcoeff;
           ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff;
           ctx->eobs_pbuf[i][0]    = pd[i].eobs;
         }
@@ -1392,7 +1346,6 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
   }
 
   x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode_selected;
-
   return best_rd;
 }
 
@@ -1588,7 +1541,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
       coeff = BLOCK_OFFSET(p->coeff, k);
       x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
                     coeff, 8);
-      vp9_regular_quantize_b_4x4(x, 4, k, so->scan, so->iscan);
+      vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan);
       thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
                                         16, &ssz);
       thissse += ssz;
@@ -2135,7 +2088,7 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
   int best_index = 0;
   int best_sad = INT_MAX;
   int this_sad = INT_MAX;
-  unsigned int max_mv = 0;
+  int max_mv = 0;
 
   uint8_t *src_y_ptr = x->plane[0].src.buf;
   uint8_t *ref_y_ptr;
@@ -2194,7 +2147,7 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
     vp9_prob intra_inter_p = vp9_get_pred_prob_intra_inter(cm, xd);
     vp9_prob comp_inter_p = 128;
 
-    if (cm->comp_pred_mode == HYBRID_PREDICTION) {
+    if (cm->comp_pred_mode == REFERENCE_MODE_SELECT) {
       comp_inter_p = vp9_get_pred_prob_comp_inter_inter(cm, xd);
       *comp_mode_p = comp_inter_p;
     } else {
@@ -2203,12 +2156,12 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
 
     ref_costs_single[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
 
-    if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) {
+    if (cm->comp_pred_mode != COMPOUND_REFERENCE) {
       vp9_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
       vp9_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
       unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
 
-      if (cm->comp_pred_mode == HYBRID_PREDICTION)
+      if (cm->comp_pred_mode == REFERENCE_MODE_SELECT)
         base_cost += vp9_cost_bit(comp_inter_p, 0);
 
       ref_costs_single[LAST_FRAME] = ref_costs_single[GOLDEN_FRAME] =
@@ -2223,11 +2176,11 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
       ref_costs_single[GOLDEN_FRAME] = 512;
       ref_costs_single[ALTREF_FRAME] = 512;
     }
-    if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY) {
+    if (cm->comp_pred_mode != SINGLE_REFERENCE) {
       vp9_prob ref_comp_p = vp9_get_pred_prob_comp_ref_p(cm, xd);
       unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
 
-      if (cm->comp_pred_mode == HYBRID_PREDICTION)
+      if (cm->comp_pred_mode == REFERENCE_MODE_SELECT)
         base_cost += vp9_cost_bit(comp_inter_p, 1);
 
       ref_costs_comp[LAST_FRAME]   = base_cost + vp9_cost_bit(ref_comp_p, 0);
@@ -2243,7 +2196,7 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
                          int mode_index,
                          int_mv *ref_mv,
                          int_mv *second_ref_mv,
-                         int64_t comp_pred_diff[NB_PREDICTION_TYPES],
+                         int64_t comp_pred_diff[REFERENCE_MODES],
                          int64_t tx_size_diff[TX_MODES],
                          int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]) {
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -2257,9 +2210,9 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
   ctx->best_ref_mv.as_int = ref_mv->as_int;
   ctx->second_best_ref_mv.as_int = second_ref_mv->as_int;
 
-  ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_PREDICTION_ONLY];
-  ctx->comp_pred_diff   = (int)comp_pred_diff[COMP_PREDICTION_ONLY];
-  ctx->hybrid_pred_diff = (int)comp_pred_diff[HYBRID_PREDICTION];
+  ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
+  ctx->comp_pred_diff   = (int)comp_pred_diff[COMPOUND_REFERENCE];
+  ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT];
 
   vpx_memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff));
   vpx_memcpy(ctx->best_filter_diff, best_filter_diff,
@@ -2782,9 +2735,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
   if (!(*mode_excluded)) {
     if (is_comp_pred) {
-      *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY);
+      *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_REFERENCE);
     } else {
-      *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY);
+      *mode_excluded = (cpi->common.comp_pred_mode == COMPOUND_REFERENCE);
     }
   }
 
@@ -3050,7 +3003,7 @@ static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
 
   for (i = 0; i < max_plane; ++i) {
     p[i].coeff    = ctx->coeff_pbuf[i][1];
-    pd[i].qcoeff  = ctx->qcoeff_pbuf[i][1];
+    p[i].qcoeff  = ctx->qcoeff_pbuf[i][1];
     pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
     pd[i].eobs    = ctx->eobs_pbuf[i][1];
 
@@ -3060,7 +3013,7 @@ static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
     ctx->eobs_pbuf[i][1]    = ctx->eobs_pbuf[i][0];
 
     ctx->coeff_pbuf[i][0]   = p[i].coeff;
-    ctx->qcoeff_pbuf[i][0]  = pd[i].qcoeff;
+    ctx->qcoeff_pbuf[i][0]  = p[i].qcoeff;
     ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff;
     ctx->eobs_pbuf[i][0]    = pd[i].eobs;
   }
@@ -3149,8 +3102,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   int64_t best_rd = best_rd_so_far;
   int64_t best_tx_rd[TX_MODES];
   int64_t best_tx_diff[TX_MODES];
-  int64_t best_pred_diff[NB_PREDICTION_TYPES];
-  int64_t best_pred_rd[NB_PREDICTION_TYPES];
+  int64_t best_pred_diff[REFERENCE_MODES];
+  int64_t best_pred_rd[REFERENCE_MODES];
   int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
   int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
   MB_MODE_INFO best_mbmode = { 0 };
@@ -3186,7 +3139,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp,
                            &comp_mode_p);
 
-  for (i = 0; i < NB_PREDICTION_TYPES; ++i)
+  for (i = 0; i < REFERENCE_MODES; ++i)
     best_pred_rd[i] = INT64_MAX;
   for (i = 0; i < TX_MODES; i++)
     best_tx_rd[i] = INT64_MAX;
@@ -3363,12 +3316,12 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 
       mode_excluded = mode_excluded
                          ? mode_excluded
-                         : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;
+                         : cm->comp_pred_mode == SINGLE_REFERENCE;
     } else {
       if (ref_frame != INTRA_FRAME && second_ref_frame != INTRA_FRAME) {
         mode_excluded =
             mode_excluded ?
-                mode_excluded : cm->comp_pred_mode == COMP_PREDICTION_ONLY;
+                mode_excluded : cm->comp_pred_mode == COMPOUND_REFERENCE;
       }
     }
 
@@ -3491,7 +3444,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         continue;
     }
 
-    if (cm->comp_pred_mode == HYBRID_PREDICTION) {
+    if (cm->comp_pred_mode == REFERENCE_MODE_SELECT) {
       rate2 += compmode_cost;
     }
 
@@ -3576,7 +3529,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     }
 
     if (!disable_skip && ref_frame == INTRA_FRAME) {
-      for (i = 0; i < NB_PREDICTION_TYPES; ++i)
+      for (i = 0; i < REFERENCE_MODES; ++i)
         best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
         best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
@@ -3636,9 +3589,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 
     /* keep record of best compound/single-only prediction */
     if (!disable_skip && ref_frame != INTRA_FRAME) {
-      int single_rd, hybrid_rd, single_rate, hybrid_rate;
+      int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
 
-      if (cm->comp_pred_mode == HYBRID_PREDICTION) {
+      if (cm->comp_pred_mode == REFERENCE_MODE_SELECT) {
         single_rate = rate2 - compmode_cost;
         hybrid_rate = rate2;
       } else {
@@ -3650,14 +3603,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
 
       if (second_ref_frame <= INTRA_FRAME &&
-          single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) {
-        best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd;
+          single_rd < best_pred_rd[SINGLE_REFERENCE]) {
+        best_pred_rd[SINGLE_REFERENCE] = single_rd;
       } else if (second_ref_frame > INTRA_FRAME &&
-                 single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) {
-        best_pred_rd[COMP_PREDICTION_ONLY] = single_rd;
+                 single_rd < best_pred_rd[COMPOUND_REFERENCE]) {
+        best_pred_rd[COMPOUND_REFERENCE] = single_rd;
       }
-      if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION])
-        best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
+      if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
+        best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
     }
 
     /* keep record of best filter type */
@@ -3717,7 +3670,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   if (cpi->sf.use_uv_intra_rd_estimate) {
     // Do Intra UV best rd mode selection if best mode choice above was intra.
     if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME) {
-      TX_SIZE uv_tx_size = get_uv_tx_size(mbmi);
+      TX_SIZE uv_tx_size;
+      *mbmi = best_mbmode;
+      uv_tx_size = get_uv_tx_size(mbmi);
       rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
                               &rate_uv_tokenonly[uv_tx_size],
                               &dist_uv[uv_tx_size],
@@ -3779,7 +3734,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   *mbmi = best_mbmode;
   x->skip |= best_skip2;
 
-  for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
+  for (i = 0; i < REFERENCE_MODES; ++i) {
     if (best_pred_rd[i] == INT64_MAX)
       best_pred_diff[i] = INT_MIN;
     else
@@ -3850,8 +3805,8 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
   int64_t best_yrd = best_rd_so_far;  // FIXME(rbultje) more precise
   int64_t best_tx_rd[TX_MODES];
   int64_t best_tx_diff[TX_MODES];
-  int64_t best_pred_diff[NB_PREDICTION_TYPES];
-  int64_t best_pred_rd[NB_PREDICTION_TYPES];
+  int64_t best_pred_diff[REFERENCE_MODES];
+  int64_t best_pred_rd[REFERENCE_MODES];
   int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
   int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
   MB_MODE_INFO best_mbmode = { 0 };
@@ -3886,7 +3841,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
   estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp,
                            &comp_mode_p);
 
-  for (i = 0; i < NB_PREDICTION_TYPES; ++i)
+  for (i = 0; i < REFERENCE_MODES; ++i)
     best_pred_rd[i] = INT64_MAX;
   for (i = 0; i < TX_MODES; i++)
     best_tx_rd[i] = INT64_MAX;
@@ -4030,12 +3985,12 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
 
       mode_excluded = mode_excluded
                          ? mode_excluded
-                         : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;
+                         : cm->comp_pred_mode == SINGLE_REFERENCE;
     } else {
       if (ref_frame != INTRA_FRAME && second_ref_frame != INTRA_FRAME) {
         mode_excluded =
             mode_excluded ?
-                mode_excluded : cm->comp_pred_mode == COMP_PREDICTION_ONLY;
+                mode_excluded : cm->comp_pred_mode == COMPOUND_REFERENCE;
       }
     }
 
@@ -4241,9 +4196,9 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
 
       if (!mode_excluded) {
         if (comp_pred)
-          mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
+          mode_excluded = cpi->common.comp_pred_mode == SINGLE_REFERENCE;
         else
-          mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
+          mode_excluded = cpi->common.comp_pred_mode == COMPOUND_REFERENCE;
       }
       compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);
 
@@ -4271,7 +4226,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
       }
     }
 
-    if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+    if (cpi->common.comp_pred_mode == REFERENCE_MODE_SELECT) {
       rate2 += compmode_cost;
     }
 
@@ -4332,7 +4287,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
     }
 
     if (!disable_skip && ref_frame == INTRA_FRAME) {
-      for (i = 0; i < NB_PREDICTION_TYPES; ++i)
+      for (i = 0; i < REFERENCE_MODES; ++i)
         best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
         best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
@@ -4387,9 +4342,9 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
 
     /* keep record of best compound/single-only prediction */
     if (!disable_skip && ref_frame != INTRA_FRAME) {
-      int single_rd, hybrid_rd, single_rate, hybrid_rate;
+      int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
 
-      if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+      if (cpi->common.comp_pred_mode == REFERENCE_MODE_SELECT) {
         single_rate = rate2 - compmode_cost;
         hybrid_rate = rate2;
       } else {
@@ -4401,14 +4356,14 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
       hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
 
       if (second_ref_frame <= INTRA_FRAME &&
-          single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) {
-        best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd;
+          single_rd < best_pred_rd[SINGLE_REFERENCE]) {
+        best_pred_rd[SINGLE_REFERENCE] = single_rd;
       } else if (second_ref_frame > INTRA_FRAME &&
-                 single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) {
-        best_pred_rd[COMP_PREDICTION_ONLY] = single_rd;
+                 single_rd < best_pred_rd[COMPOUND_REFERENCE]) {
+        best_pred_rd[COMPOUND_REFERENCE] = single_rd;
       }
-      if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION])
-        best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
+      if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
+        best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
     }
 
     /* keep record of best filter type */
@@ -4465,7 +4420,9 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
   if (cpi->sf.use_uv_intra_rd_estimate) {
     // Do Intra UV best rd mode selection if best mode choice above was intra.
     if (vp9_ref_order[best_mode_index].ref_frame == INTRA_FRAME) {
-      TX_SIZE uv_tx_size = get_uv_tx_size(mbmi);
+      TX_SIZE uv_tx_size;
+      *mbmi = best_mbmode;
+      uv_tx_size = get_uv_tx_size(mbmi);
       rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
                               &rate_uv_tokenonly[uv_tx_size],
                               &dist_uv[uv_tx_size],
@@ -4524,7 +4481,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
     mbmi->mv[1].as_int = xd->mi_8x8[0]->bmi[3].as_mv[1].as_int;
   }
 
-  for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
+  for (i = 0; i < REFERENCE_MODES; ++i) {
     if (best_pred_rd[i] == INT64_MAX)
       best_pred_diff[i] = INT_MIN;
     else
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 3f1cc6fe8..389ec152a 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -26,6 +26,86 @@ const TOKENVALUE *vp9_dct_value_tokens_ptr;
 static int dct_value_cost[DCT_MAX_VALUE * 2];
 const int *vp9_dct_value_cost_ptr;
 
+// Array indices are identical to previously-existing CONTEXT_NODE indices
+const vp9_tree_index vp9_coef_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)] = {
+  -DCT_EOB_TOKEN, 2,                          /* 0 = EOB */
+  -ZERO_TOKEN, 4,                             /* 1 = ZERO */
+  -ONE_TOKEN, 6,                              /* 2 = ONE */
+  8, 12,                                      /* 3 = LOW_VAL */
+  -TWO_TOKEN, 10,                            /* 4 = TWO */
+  -THREE_TOKEN, -FOUR_TOKEN,                /* 5 = THREE */
+  14, 16,                                   /* 6 = HIGH_LOW */
+  -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2,   /* 7 = CAT_ONE */
+  18, 20,                                   /* 8 = CAT_THREEFOUR */
+  -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4,   /* 9 = CAT_THREE */
+  -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6    /* 10 = CAT_FIVE */
+};
+
+// Unconstrained Node Tree
+const vp9_tree_index vp9_coef_con_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)] = {
+  2, 6,                                     /* 0 = LOW_VAL */
+  -TWO_TOKEN, 4,                            /* 1 = TWO */
+  -THREE_TOKEN, -FOUR_TOKEN,                /* 2 = THREE */
+  8, 10,                                    /* 3 = HIGH_LOW */
+  -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2,   /* 4 = CAT_ONE */
+  12, 14,                                   /* 5 = CAT_THREEFOUR */
+  -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4,   /* 6 = CAT_THREE */
+  -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6    /* 7 = CAT_FIVE */
+};
+
+static const vp9_prob Pcat1[] = { 159};
+static const vp9_prob Pcat2[] = { 165, 145};
+static const vp9_prob Pcat3[] = { 173, 148, 140};
+static const vp9_prob Pcat4[] = { 176, 155, 140, 135};
+static const vp9_prob Pcat5[] = { 180, 157, 141, 134, 130};
+static const vp9_prob Pcat6[] = {
+  254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129
+};
+
+static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28];
+
+static void init_bit_tree(vp9_tree_index *p, int n) {
+  int i = 0;
+
+  while (++i < n) {
+    p[0] = p[1] = i << 1;
+    p += 2;
+  }
+
+  p[0] = p[1] = 0;
+}
+
+static void init_bit_trees() {
+  init_bit_tree(cat1, 1);
+  init_bit_tree(cat2, 2);
+  init_bit_tree(cat3, 3);
+  init_bit_tree(cat4, 4);
+  init_bit_tree(cat5, 5);
+  init_bit_tree(cat6, 14);
+}
+
+const vp9_extra_bit vp9_extra_bits[MAX_ENTROPY_TOKENS] = {
+  {0, 0, 0, 0},           // ZERO_TOKEN
+  {0, 0, 0, 1},           // ONE_TOKEN
+  {0, 0, 0, 2},           // TWO_TOKEN
+  {0, 0, 0, 3},           // THREE_TOKEN
+  {0, 0, 0, 4},           // FOUR_TOKEN
+  {cat1, Pcat1, 1, 5},    // DCT_VAL_CATEGORY1
+  {cat2, Pcat2, 2, 7},    // DCT_VAL_CATEGORY2
+  {cat3, Pcat3, 3, 11},   // DCT_VAL_CATEGORY3
+  {cat4, Pcat4, 4, 19},   // DCT_VAL_CATEGORY4
+  {cat5, Pcat5, 5, 35},   // DCT_VAL_CATEGORY5
+  {cat6, Pcat6, 14, 67},  // DCT_VAL_CATEGORY6
+  {0, 0, 0, 0}            // DCT_EOB_TOKEN
+};
+
+struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS];
+
+void vp9_coef_tree_initialize() {
+  init_bit_trees();
+  vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);
+}
+
 static void fill_value_tokens() {
   TOKENVALUE *const t = dct_value_tokens + DCT_MAX_VALUE;
   const vp9_extra_bit *const e = vp9_extra_bits;
@@ -108,7 +188,8 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
   const int eob = pd->eobs[block];
   const PLANE_TYPE type = pd->plane_type;
-  const int16_t *qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
+  struct macroblock_plane *p = &cpi->mb.plane[plane];
+  const int16_t *qcoeff_ptr = BLOCK_OFFSET(p->qcoeff, block);
 
   const int segment_id = mbmi->segment_id;
   const int16_t *scan, *nb;
diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h
index e24e31b80..2e3bf5203 100644
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h
@@ -28,6 +28,10 @@ typedef struct {
   uint8_t         skip_eob_node;
 } TOKENEXTRA;
 
+extern const vp9_tree_index vp9_coef_tree[];
+extern const vp9_tree_index vp9_coef_con_tree[];
+extern struct vp9_token vp9_coef_encodings[];
+
 int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE bsize);
 int vp9_is_skippable_in_plane(MACROBLOCKD *xd, BLOCK_SIZE bsize,
                               int plane);
diff --git a/vp9/encoder/vp9_treewriter.h b/vp9/encoder/vp9_treewriter.h
index c9bf4dabe..3245960ac 100644
--- a/vp9/encoder/vp9_treewriter.h
+++ b/vp9/encoder/vp9_treewriter.h
@@ -64,11 +64,6 @@ static INLINE int treed_cost(vp9_tree tree, const vp9_prob *probs,
   return cost;
 }
 
-static INLINE int cost_token(vp9_tree tree, const vp9_prob *probs,
-                             const struct vp9_token *token) {
-  return treed_cost(tree, probs, token->value, token->len);
-}
-
 void vp9_cost_tokens(int *costs, const vp9_prob *probs, vp9_tree tree);
 void vp9_cost_tokens_skip(int *costs, const vp9_prob *probs, vp9_tree tree);
 
diff --git a/vp9/encoder/x86/vp9_dct32x32_avx2.c b/vp9/encoder/x86/vp9_dct32x32_avx2.c
new file mode 100644
index 000000000..9ea22fed2
--- /dev/null
+++ b/vp9/encoder/x86/vp9_dct32x32_avx2.c
@@ -0,0 +1,2710 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>  // AVX2
+#include "vp9/common/vp9_idct.h"  // for cospi constants
+#include "vpx_ports/mem.h"
+
+#define pair256_set_epi16(a, b) /* 8 x (a, b) int16 pairs, a lowest */ \
+  _mm256_set_epi16(b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a)
+
+#define pair256_set_epi32(a, b) /* 4 x (a, b) int32 pairs, a lowest */ \
+  _mm256_set_epi32(b, a, b, a, b, a, b, a)
+
+
+
+
+#if FDCT32x32_HIGH_PRECISION
+static INLINE __m256i k_madd_epi32_avx2(__m256i a, __m256i b) {
+  __m256i buf0, buf1;             // per 64-bit element: a.lo*b.lo + a.hi*b.hi
+  buf0 = _mm256_mul_epu32(a, b);  // unsigned product of the low 32-bit halves
+  a = _mm256_srli_epi64(a, 32);   // bring the high 32-bit halves down
+  b = _mm256_srli_epi64(b, 32);
+  buf1 = _mm256_mul_epu32(a, b);  // unsigned product of the high halves
+  return _mm256_add_epi64(buf0, buf1);  // 64-bit sum of the two products
+}
+
+static INLINE __m256i k_packs_epi64_avx2(__m256i a, __m256i b) {
+  __m256i buf0 = _mm256_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
+  __m256i buf1 = _mm256_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
+  return _mm256_unpacklo_epi64(buf0, buf1);  // a's low dwords, then b's
+}  // Keeps the low 32 bits of each 64-bit element (no saturation), per lane.
+#endif
+
+void FDCT32x32_2D_AVX2(const int16_t *input,
+                  int16_t *output_org, int stride) {
+  // Calculate pre-multiplied strides
+  const int str1 = stride;
+  const int str2 = 2 * stride;
+  const int str3 = 2 * stride + str1;
+  // We need an intermediate buffer between passes.
+  DECLARE_ALIGNED(32, int16_t, intermediate[32 * 32]);
+  // Constants
+  //    When we use them, in one case, they are all the same. In all others
+  //    it's a pair of them that we need to repeat eight times. This is done
+  //    by constructing the 32 bit constant corresponding to that pair.
+  const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(+cospi_16_64);
+  const __m256i k__cospi_p16_m16 = pair256_set_epi16(+cospi_16_64, -cospi_16_64);
+  const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64,   cospi_24_64);
+  const __m256i k__cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m256i k__cospi_p24_p08 = pair256_set_epi16(+cospi_24_64,  cospi_8_64);
+  const __m256i k__cospi_p12_p20 = pair256_set_epi16(+cospi_12_64,  cospi_20_64);
+  const __m256i k__cospi_m20_p12 = pair256_set_epi16(-cospi_20_64,  cospi_12_64);
+  const __m256i k__cospi_m04_p28 = pair256_set_epi16(-cospi_4_64,   cospi_28_64);
+  const __m256i k__cospi_p28_p04 = pair256_set_epi16(+cospi_28_64,  cospi_4_64);
+  const __m256i k__cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
+  const __m256i k__cospi_m12_m20 = pair256_set_epi16(-cospi_12_64, -cospi_20_64);
+  const __m256i k__cospi_p30_p02 = pair256_set_epi16(+cospi_30_64,  cospi_2_64);
+  const __m256i k__cospi_p14_p18 = pair256_set_epi16(+cospi_14_64,  cospi_18_64);
+  const __m256i k__cospi_p22_p10 = pair256_set_epi16(+cospi_22_64,  cospi_10_64);
+  const __m256i k__cospi_p06_p26 = pair256_set_epi16(+cospi_6_64,   cospi_26_64);
+  const __m256i k__cospi_m26_p06 = pair256_set_epi16(-cospi_26_64,  cospi_6_64);
+  const __m256i k__cospi_m10_p22 = pair256_set_epi16(-cospi_10_64,  cospi_22_64);
+  const __m256i k__cospi_m18_p14 = pair256_set_epi16(-cospi_18_64,  cospi_14_64);
+  const __m256i k__cospi_m02_p30 = pair256_set_epi16(-cospi_2_64,   cospi_30_64);
+  const __m256i k__cospi_p31_p01 = pair256_set_epi16(+cospi_31_64,  cospi_1_64);
+  const __m256i k__cospi_p15_p17 = pair256_set_epi16(+cospi_15_64,  cospi_17_64);
+  const __m256i k__cospi_p23_p09 = pair256_set_epi16(+cospi_23_64,  cospi_9_64);
+  const __m256i k__cospi_p07_p25 = pair256_set_epi16(+cospi_7_64,   cospi_25_64);
+  const __m256i k__cospi_m25_p07 = pair256_set_epi16(-cospi_25_64,  cospi_7_64);
+  const __m256i k__cospi_m09_p23 = pair256_set_epi16(-cospi_9_64,   cospi_23_64);
+  const __m256i k__cospi_m17_p15 = pair256_set_epi16(-cospi_17_64,  cospi_15_64);
+  const __m256i k__cospi_m01_p31 = pair256_set_epi16(-cospi_1_64,   cospi_31_64);
+  const __m256i k__cospi_p27_p05 = pair256_set_epi16(+cospi_27_64,  cospi_5_64);
+  const __m256i k__cospi_p11_p21 = pair256_set_epi16(+cospi_11_64,  cospi_21_64);
+  const __m256i k__cospi_p19_p13 = pair256_set_epi16(+cospi_19_64,  cospi_13_64);
+  const __m256i k__cospi_p03_p29 = pair256_set_epi16(+cospi_3_64,   cospi_29_64);
+  const __m256i k__cospi_m29_p03 = pair256_set_epi16(-cospi_29_64,  cospi_3_64);
+  const __m256i k__cospi_m13_p19 = pair256_set_epi16(-cospi_13_64,  cospi_19_64);
+  const __m256i k__cospi_m21_p11 = pair256_set_epi16(-cospi_21_64,  cospi_11_64);
+  const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64,   cospi_27_64);
+  const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+  const __m256i kZero = _mm256_set1_epi16(0);
+  const __m256i kOne  = _mm256_set1_epi16(1);
+  // Do the two transform/transpose passes
+  int pass;
+  for (pass = 0; pass < 2; ++pass) {
+    // We process sixteen columns (transposed rows in second pass) at a time.
+    int column_start;
+    for (column_start = 0; column_start < 32; column_start += 16) {
+      __m256i step1[32];
+      __m256i step2[32];
+      __m256i step3[32];
+      __m256i out[32];
+      // Stage 1
+      // Note: even though all the loads below are aligned, using the aligned
+      //       intrinsic makes the code slightly slower.
+      if (0 == pass) {
+        const int16_t *in  = &input[column_start];
+        // step1[i] =  (in[ 0 * stride] + in[(32 -  1) * stride]) << 2;
+        // Note: the next four blocks could be in a loop. That would help the
+        //       instruction cache but is actually slower.
+        {
+          const int16_t *ina =  in +  0 * str1;
+          const int16_t *inb =  in + 31 * str1;
+          __m256i *step1a = &step1[ 0];
+          __m256i *step1b = &step1[31];
+          const __m256i ina0  = _mm256_loadu_si256((const __m256i *)(ina));
+          const __m256i ina1  = _mm256_loadu_si256((const __m256i *)(ina + str1));
+          const __m256i ina2  = _mm256_loadu_si256((const __m256i *)(ina + str2));
+          const __m256i ina3  = _mm256_loadu_si256((const __m256i *)(ina + str3));
+          const __m256i inb3  = _mm256_loadu_si256((const __m256i *)(inb - str3));
+          const __m256i inb2  = _mm256_loadu_si256((const __m256i *)(inb - str2));
+          const __m256i inb1  = _mm256_loadu_si256((const __m256i *)(inb - str1));
+          const __m256i inb0  = _mm256_loadu_si256((const __m256i *)(inb));
+          step1a[ 0] = _mm256_add_epi16(ina0, inb0);
+          step1a[ 1] = _mm256_add_epi16(ina1, inb1);
+          step1a[ 2] = _mm256_add_epi16(ina2, inb2);
+          step1a[ 3] = _mm256_add_epi16(ina3, inb3);
+          step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+          step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+          step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+          step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+          step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2);
+          step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2);
+          step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2);
+          step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2);
+          step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+          step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+          step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+          step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+        }
+        {
+          const int16_t *ina =  in +  4 * str1;
+          const int16_t *inb =  in + 27 * str1;
+          __m256i *step1a = &step1[ 4];
+          __m256i *step1b = &step1[27];
+          const __m256i ina0  = _mm256_loadu_si256((const __m256i *)(ina));
+          const __m256i ina1  = _mm256_loadu_si256((const __m256i *)(ina + str1));
+          const __m256i ina2  = _mm256_loadu_si256((const __m256i *)(ina + str2));
+          const __m256i ina3  = _mm256_loadu_si256((const __m256i *)(ina + str3));
+          const __m256i inb3  = _mm256_loadu_si256((const __m256i *)(inb - str3));
+          const __m256i inb2  = _mm256_loadu_si256((const __m256i *)(inb - str2));
+          const __m256i inb1  = _mm256_loadu_si256((const __m256i *)(inb - str1));
+          const __m256i inb0  = _mm256_loadu_si256((const __m256i *)(inb));
+          step1a[ 0] = _mm256_add_epi16(ina0, inb0);
+          step1a[ 1] = _mm256_add_epi16(ina1, inb1);
+          step1a[ 2] = _mm256_add_epi16(ina2, inb2);
+          step1a[ 3] = _mm256_add_epi16(ina3, inb3);
+          step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+          step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+          step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+          step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+          step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2);
+          step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2);
+          step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2);
+          step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2);
+          step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+          step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+          step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+          step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+        }
+        {
+          const int16_t *ina =  in +  8 * str1;
+          const int16_t *inb =  in + 23 * str1;
+          __m256i *step1a = &step1[ 8];
+          __m256i *step1b = &step1[23];
+          const __m256i ina0  = _mm256_loadu_si256((const __m256i *)(ina));
+          const __m256i ina1  = _mm256_loadu_si256((const __m256i *)(ina + str1));
+          const __m256i ina2  = _mm256_loadu_si256((const __m256i *)(ina + str2));
+          const __m256i ina3  = _mm256_loadu_si256((const __m256i *)(ina + str3));
+          const __m256i inb3  = _mm256_loadu_si256((const __m256i *)(inb - str3));
+          const __m256i inb2  = _mm256_loadu_si256((const __m256i *)(inb - str2));
+          const __m256i inb1  = _mm256_loadu_si256((const __m256i *)(inb - str1));
+          const __m256i inb0  = _mm256_loadu_si256((const __m256i *)(inb));
+          step1a[ 0] = _mm256_add_epi16(ina0, inb0);
+          step1a[ 1] = _mm256_add_epi16(ina1, inb1);
+          step1a[ 2] = _mm256_add_epi16(ina2, inb2);
+          step1a[ 3] = _mm256_add_epi16(ina3, inb3);
+          step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+          step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+          step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+          step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+          step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2);
+          step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2);
+          step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2);
+          step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2);
+          step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+          step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+          step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+          step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+        }
+        {
+          const int16_t *ina =  in + 12 * str1;
+          const int16_t *inb =  in + 19 * str1;
+          __m256i *step1a = &step1[12];
+          __m256i *step1b = &step1[19];
+          const __m256i ina0  = _mm256_loadu_si256((const __m256i *)(ina));
+          const __m256i ina1  = _mm256_loadu_si256((const __m256i *)(ina + str1));
+          const __m256i ina2  = _mm256_loadu_si256((const __m256i *)(ina + str2));
+          const __m256i ina3  = _mm256_loadu_si256((const __m256i *)(ina + str3));
+          const __m256i inb3  = _mm256_loadu_si256((const __m256i *)(inb - str3));
+          const __m256i inb2  = _mm256_loadu_si256((const __m256i *)(inb - str2));
+          const __m256i inb1  = _mm256_loadu_si256((const __m256i *)(inb - str1));
+          const __m256i inb0  = _mm256_loadu_si256((const __m256i *)(inb));
+          step1a[ 0] = _mm256_add_epi16(ina0, inb0);
+          step1a[ 1] = _mm256_add_epi16(ina1, inb1);
+          step1a[ 2] = _mm256_add_epi16(ina2, inb2);
+          step1a[ 3] = _mm256_add_epi16(ina3, inb3);
+          step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+          step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+          step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+          step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+          step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2);
+          step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2);
+          step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2);
+          step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2);
+          step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+          step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+          step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+          step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+        }
+      } else {
+        int16_t *in = &intermediate[column_start];
+        // step1[i] =  in[ 0 * 32] + in[(32 -  1) * 32];
+        // Note: using the same approach as above to have common offset is
+        //       counter-productive as all offsets can be calculated at compile
+        //       time.
+        // Note: the next four blocks could be in a loop. That would help the
+        //       instruction cache but is actually slower.
+        {
+          __m256i in00  = _mm256_loadu_si256((const __m256i *)(in +  0 * 32));
+          __m256i in01  = _mm256_loadu_si256((const __m256i *)(in +  1 * 32));
+          __m256i in02  = _mm256_loadu_si256((const __m256i *)(in +  2 * 32));
+          __m256i in03  = _mm256_loadu_si256((const __m256i *)(in +  3 * 32));
+          __m256i in28  = _mm256_loadu_si256((const __m256i *)(in + 28 * 32));
+          __m256i in29  = _mm256_loadu_si256((const __m256i *)(in + 29 * 32));
+          __m256i in30  = _mm256_loadu_si256((const __m256i *)(in + 30 * 32));
+          __m256i in31  = _mm256_loadu_si256((const __m256i *)(in + 31 * 32));
+          step1[ 0] = _mm256_add_epi16(in00, in31);
+          step1[ 1] = _mm256_add_epi16(in01, in30);
+          step1[ 2] = _mm256_add_epi16(in02, in29);
+          step1[ 3] = _mm256_add_epi16(in03, in28);
+          step1[28] = _mm256_sub_epi16(in03, in28);
+          step1[29] = _mm256_sub_epi16(in02, in29);
+          step1[30] = _mm256_sub_epi16(in01, in30);
+          step1[31] = _mm256_sub_epi16(in00, in31);
+        }
+        {
+          __m256i in04  = _mm256_loadu_si256((const __m256i *)(in +  4 * 32));
+          __m256i in05  = _mm256_loadu_si256((const __m256i *)(in +  5 * 32));
+          __m256i in06  = _mm256_loadu_si256((const __m256i *)(in +  6 * 32));
+          __m256i in07  = _mm256_loadu_si256((const __m256i *)(in +  7 * 32));
+          __m256i in24  = _mm256_loadu_si256((const __m256i *)(in + 24 * 32));
+          __m256i in25  = _mm256_loadu_si256((const __m256i *)(in + 25 * 32));
+          __m256i in26  = _mm256_loadu_si256((const __m256i *)(in + 26 * 32));
+          __m256i in27  = _mm256_loadu_si256((const __m256i *)(in + 27 * 32));
+          step1[ 4] = _mm256_add_epi16(in04, in27);
+          step1[ 5] = _mm256_add_epi16(in05, in26);
+          step1[ 6] = _mm256_add_epi16(in06, in25);
+          step1[ 7] = _mm256_add_epi16(in07, in24);
+          step1[24] = _mm256_sub_epi16(in07, in24);
+          step1[25] = _mm256_sub_epi16(in06, in25);
+          step1[26] = _mm256_sub_epi16(in05, in26);
+          step1[27] = _mm256_sub_epi16(in04, in27);
+        }
+        {
+          __m256i in08  = _mm256_loadu_si256((const __m256i *)(in +  8 * 32));
+          __m256i in09  = _mm256_loadu_si256((const __m256i *)(in +  9 * 32));
+          __m256i in10  = _mm256_loadu_si256((const __m256i *)(in + 10 * 32));
+          __m256i in11  = _mm256_loadu_si256((const __m256i *)(in + 11 * 32));
+          __m256i in20  = _mm256_loadu_si256((const __m256i *)(in + 20 * 32));
+          __m256i in21  = _mm256_loadu_si256((const __m256i *)(in + 21 * 32));
+          __m256i in22  = _mm256_loadu_si256((const __m256i *)(in + 22 * 32));
+          __m256i in23  = _mm256_loadu_si256((const __m256i *)(in + 23 * 32));
+          step1[ 8] = _mm256_add_epi16(in08, in23);
+          step1[ 9] = _mm256_add_epi16(in09, in22);
+          step1[10] = _mm256_add_epi16(in10, in21);
+          step1[11] = _mm256_add_epi16(in11, in20);
+          step1[20] = _mm256_sub_epi16(in11, in20);
+          step1[21] = _mm256_sub_epi16(in10, in21);
+          step1[22] = _mm256_sub_epi16(in09, in22);
+          step1[23] = _mm256_sub_epi16(in08, in23);
+        }
+        {
+          __m256i in12  = _mm256_loadu_si256((const __m256i *)(in + 12 * 32));
+          __m256i in13  = _mm256_loadu_si256((const __m256i *)(in + 13 * 32));
+          __m256i in14  = _mm256_loadu_si256((const __m256i *)(in + 14 * 32));
+          __m256i in15  = _mm256_loadu_si256((const __m256i *)(in + 15 * 32));
+          __m256i in16  = _mm256_loadu_si256((const __m256i *)(in + 16 * 32));
+          __m256i in17  = _mm256_loadu_si256((const __m256i *)(in + 17 * 32));
+          __m256i in18  = _mm256_loadu_si256((const __m256i *)(in + 18 * 32));
+          __m256i in19  = _mm256_loadu_si256((const __m256i *)(in + 19 * 32));
+          step1[12] = _mm256_add_epi16(in12, in19);
+          step1[13] = _mm256_add_epi16(in13, in18);
+          step1[14] = _mm256_add_epi16(in14, in17);
+          step1[15] = _mm256_add_epi16(in15, in16);
+          step1[16] = _mm256_sub_epi16(in15, in16);
+          step1[17] = _mm256_sub_epi16(in14, in17);
+          step1[18] = _mm256_sub_epi16(in13, in18);
+          step1[19] = _mm256_sub_epi16(in12, in19);
+        }
+      }
+      // Stage 2
+      {
+        step2[ 0] = _mm256_add_epi16(step1[0], step1[15]);
+        step2[ 1] = _mm256_add_epi16(step1[1], step1[14]);
+        step2[ 2] = _mm256_add_epi16(step1[2], step1[13]);
+        step2[ 3] = _mm256_add_epi16(step1[3], step1[12]);
+        step2[ 4] = _mm256_add_epi16(step1[4], step1[11]);
+        step2[ 5] = _mm256_add_epi16(step1[5], step1[10]);
+        step2[ 6] = _mm256_add_epi16(step1[6], step1[ 9]);
+        step2[ 7] = _mm256_add_epi16(step1[7], step1[ 8]);
+        step2[ 8] = _mm256_sub_epi16(step1[7], step1[ 8]);
+        step2[ 9] = _mm256_sub_epi16(step1[6], step1[ 9]);
+        step2[10] = _mm256_sub_epi16(step1[5], step1[10]);
+        step2[11] = _mm256_sub_epi16(step1[4], step1[11]);
+        step2[12] = _mm256_sub_epi16(step1[3], step1[12]);
+        step2[13] = _mm256_sub_epi16(step1[2], step1[13]);
+        step2[14] = _mm256_sub_epi16(step1[1], step1[14]);
+        step2[15] = _mm256_sub_epi16(step1[0], step1[15]);
+      }
+      {
+        const __m256i s2_20_0 = _mm256_unpacklo_epi16(step1[27], step1[20]);
+        const __m256i s2_20_1 = _mm256_unpackhi_epi16(step1[27], step1[20]);
+        const __m256i s2_21_0 = _mm256_unpacklo_epi16(step1[26], step1[21]);
+        const __m256i s2_21_1 = _mm256_unpackhi_epi16(step1[26], step1[21]);
+        const __m256i s2_22_0 = _mm256_unpacklo_epi16(step1[25], step1[22]);
+        const __m256i s2_22_1 = _mm256_unpackhi_epi16(step1[25], step1[22]);
+        const __m256i s2_23_0 = _mm256_unpacklo_epi16(step1[24], step1[23]);
+        const __m256i s2_23_1 = _mm256_unpackhi_epi16(step1[24], step1[23]);
+        const __m256i s2_20_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_m16);
+        const __m256i s2_20_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_m16);
+        const __m256i s2_21_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_m16);
+        const __m256i s2_21_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_m16);
+        const __m256i s2_22_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_m16);
+        const __m256i s2_22_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_m16);
+        const __m256i s2_23_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_m16);
+        const __m256i s2_23_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_m16);
+        const __m256i s2_24_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_p16);
+        const __m256i s2_24_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_p16);
+        const __m256i s2_25_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_p16);
+        const __m256i s2_25_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_p16);
+        const __m256i s2_26_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_p16);
+        const __m256i s2_26_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_p16);
+        const __m256i s2_27_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_p16);
+        const __m256i s2_27_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_p16);
+        // dct_const_round_shift
+        const __m256i s2_20_4 = _mm256_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_20_5 = _mm256_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_21_4 = _mm256_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_21_5 = _mm256_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_22_4 = _mm256_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_22_5 = _mm256_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_23_4 = _mm256_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_23_5 = _mm256_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_24_4 = _mm256_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_24_5 = _mm256_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_25_4 = _mm256_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_25_5 = _mm256_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_26_4 = _mm256_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_26_5 = _mm256_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_27_4 = _mm256_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_27_5 = _mm256_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_20_6 = _mm256_srai_epi32(s2_20_4, DCT_CONST_BITS);
+        const __m256i s2_20_7 = _mm256_srai_epi32(s2_20_5, DCT_CONST_BITS);
+        const __m256i s2_21_6 = _mm256_srai_epi32(s2_21_4, DCT_CONST_BITS);
+        const __m256i s2_21_7 = _mm256_srai_epi32(s2_21_5, DCT_CONST_BITS);
+        const __m256i s2_22_6 = _mm256_srai_epi32(s2_22_4, DCT_CONST_BITS);
+        const __m256i s2_22_7 = _mm256_srai_epi32(s2_22_5, DCT_CONST_BITS);
+        const __m256i s2_23_6 = _mm256_srai_epi32(s2_23_4, DCT_CONST_BITS);
+        const __m256i s2_23_7 = _mm256_srai_epi32(s2_23_5, DCT_CONST_BITS);
+        const __m256i s2_24_6 = _mm256_srai_epi32(s2_24_4, DCT_CONST_BITS);
+        const __m256i s2_24_7 = _mm256_srai_epi32(s2_24_5, DCT_CONST_BITS);
+        const __m256i s2_25_6 = _mm256_srai_epi32(s2_25_4, DCT_CONST_BITS);
+        const __m256i s2_25_7 = _mm256_srai_epi32(s2_25_5, DCT_CONST_BITS);
+        const __m256i s2_26_6 = _mm256_srai_epi32(s2_26_4, DCT_CONST_BITS);
+        const __m256i s2_26_7 = _mm256_srai_epi32(s2_26_5, DCT_CONST_BITS);
+        const __m256i s2_27_6 = _mm256_srai_epi32(s2_27_4, DCT_CONST_BITS);
+        const __m256i s2_27_7 = _mm256_srai_epi32(s2_27_5, DCT_CONST_BITS);
+        // Combine
+        step2[20] = _mm256_packs_epi32(s2_20_6, s2_20_7);
+        step2[21] = _mm256_packs_epi32(s2_21_6, s2_21_7);
+        step2[22] = _mm256_packs_epi32(s2_22_6, s2_22_7);
+        step2[23] = _mm256_packs_epi32(s2_23_6, s2_23_7);
+        step2[24] = _mm256_packs_epi32(s2_24_6, s2_24_7);
+        step2[25] = _mm256_packs_epi32(s2_25_6, s2_25_7);
+        step2[26] = _mm256_packs_epi32(s2_26_6, s2_26_7);
+        step2[27] = _mm256_packs_epi32(s2_27_6, s2_27_7);
+      }
+
+#if !FDCT32x32_HIGH_PRECISION
+      // damp the magnitude by 4 (rounding shift right by 2), hence the
+      // intermediate values are within the range of 16 bits.
+      if (1 == pass) {
+        __m256i s3_00_0 = _mm256_cmpgt_epi16(kZero,step2[ 0]);
+        __m256i s3_01_0 = _mm256_cmpgt_epi16(kZero,step2[ 1]);
+        __m256i s3_02_0 = _mm256_cmpgt_epi16(kZero,step2[ 2]);
+        __m256i s3_03_0 = _mm256_cmpgt_epi16(kZero,step2[ 3]);
+        __m256i s3_04_0 = _mm256_cmpgt_epi16(kZero,step2[ 4]);
+        __m256i s3_05_0 = _mm256_cmpgt_epi16(kZero,step2[ 5]);
+        __m256i s3_06_0 = _mm256_cmpgt_epi16(kZero,step2[ 6]);
+        __m256i s3_07_0 = _mm256_cmpgt_epi16(kZero,step2[ 7]);
+        __m256i s2_08_0 = _mm256_cmpgt_epi16(kZero,step2[ 8]);
+        __m256i s2_09_0 = _mm256_cmpgt_epi16(kZero,step2[ 9]);
+        __m256i s3_10_0 = _mm256_cmpgt_epi16(kZero,step2[10]);
+        __m256i s3_11_0 = _mm256_cmpgt_epi16(kZero,step2[11]);
+        __m256i s3_12_0 = _mm256_cmpgt_epi16(kZero,step2[12]);
+        __m256i s3_13_0 = _mm256_cmpgt_epi16(kZero,step2[13]);
+        __m256i s2_14_0 = _mm256_cmpgt_epi16(kZero,step2[14]);
+        __m256i s2_15_0 = _mm256_cmpgt_epi16(kZero,step2[15]);
+        __m256i s3_16_0 = _mm256_cmpgt_epi16(kZero,step1[16]);
+        __m256i s3_17_0 = _mm256_cmpgt_epi16(kZero,step1[17]);
+        __m256i s3_18_0 = _mm256_cmpgt_epi16(kZero,step1[18]);
+        __m256i s3_19_0 = _mm256_cmpgt_epi16(kZero,step1[19]);
+        __m256i s3_20_0 = _mm256_cmpgt_epi16(kZero,step2[20]);
+        __m256i s3_21_0 = _mm256_cmpgt_epi16(kZero,step2[21]);
+        __m256i s3_22_0 = _mm256_cmpgt_epi16(kZero,step2[22]);
+        __m256i s3_23_0 = _mm256_cmpgt_epi16(kZero,step2[23]);
+        __m256i s3_24_0 = _mm256_cmpgt_epi16(kZero,step2[24]);
+        __m256i s3_25_0 = _mm256_cmpgt_epi16(kZero,step2[25]);
+        __m256i s3_26_0 = _mm256_cmpgt_epi16(kZero,step2[26]);
+        __m256i s3_27_0 = _mm256_cmpgt_epi16(kZero,step2[27]);
+        __m256i s3_28_0 = _mm256_cmpgt_epi16(kZero,step1[28]);
+        __m256i s3_29_0 = _mm256_cmpgt_epi16(kZero,step1[29]);
+        __m256i s3_30_0 = _mm256_cmpgt_epi16(kZero,step1[30]);
+        __m256i s3_31_0 = _mm256_cmpgt_epi16(kZero,step1[31]);
+
+        step2[ 0] = _mm256_sub_epi16(step2[ 0], s3_00_0);
+        step2[ 1] = _mm256_sub_epi16(step2[ 1], s3_01_0);
+        step2[ 2] = _mm256_sub_epi16(step2[ 2], s3_02_0);
+        step2[ 3] = _mm256_sub_epi16(step2[ 3], s3_03_0);
+        step2[ 4] = _mm256_sub_epi16(step2[ 4], s3_04_0);
+        step2[ 5] = _mm256_sub_epi16(step2[ 5], s3_05_0);
+        step2[ 6] = _mm256_sub_epi16(step2[ 6], s3_06_0);
+        step2[ 7] = _mm256_sub_epi16(step2[ 7], s3_07_0);
+        step2[ 8] = _mm256_sub_epi16(step2[ 8], s2_08_0);
+        step2[ 9] = _mm256_sub_epi16(step2[ 9], s2_09_0);
+        step2[10] = _mm256_sub_epi16(step2[10], s3_10_0);
+        step2[11] = _mm256_sub_epi16(step2[11], s3_11_0);
+        step2[12] = _mm256_sub_epi16(step2[12], s3_12_0);
+        step2[13] = _mm256_sub_epi16(step2[13], s3_13_0);
+        step2[14] = _mm256_sub_epi16(step2[14], s2_14_0);
+        step2[15] = _mm256_sub_epi16(step2[15], s2_15_0);
+        step1[16] = _mm256_sub_epi16(step1[16], s3_16_0);
+        step1[17] = _mm256_sub_epi16(step1[17], s3_17_0);
+        step1[18] = _mm256_sub_epi16(step1[18], s3_18_0);
+        step1[19] = _mm256_sub_epi16(step1[19], s3_19_0);
+        step2[20] = _mm256_sub_epi16(step2[20], s3_20_0);
+        step2[21] = _mm256_sub_epi16(step2[21], s3_21_0);
+        step2[22] = _mm256_sub_epi16(step2[22], s3_22_0);
+        step2[23] = _mm256_sub_epi16(step2[23], s3_23_0);
+        step2[24] = _mm256_sub_epi16(step2[24], s3_24_0);
+        step2[25] = _mm256_sub_epi16(step2[25], s3_25_0);
+        step2[26] = _mm256_sub_epi16(step2[26], s3_26_0);
+        step2[27] = _mm256_sub_epi16(step2[27], s3_27_0);
+        step1[28] = _mm256_sub_epi16(step1[28], s3_28_0);
+        step1[29] = _mm256_sub_epi16(step1[29], s3_29_0);
+        step1[30] = _mm256_sub_epi16(step1[30], s3_30_0);
+        step1[31] = _mm256_sub_epi16(step1[31], s3_31_0);
+
+        step2[ 0] = _mm256_add_epi16(step2[ 0], kOne);
+        step2[ 1] = _mm256_add_epi16(step2[ 1], kOne);
+        step2[ 2] = _mm256_add_epi16(step2[ 2], kOne);
+        step2[ 3] = _mm256_add_epi16(step2[ 3], kOne);
+        step2[ 4] = _mm256_add_epi16(step2[ 4], kOne);
+        step2[ 5] = _mm256_add_epi16(step2[ 5], kOne);
+        step2[ 6] = _mm256_add_epi16(step2[ 6], kOne);
+        step2[ 7] = _mm256_add_epi16(step2[ 7], kOne);
+        step2[ 8] = _mm256_add_epi16(step2[ 8], kOne);
+        step2[ 9] = _mm256_add_epi16(step2[ 9], kOne);
+        step2[10] = _mm256_add_epi16(step2[10], kOne);
+        step2[11] = _mm256_add_epi16(step2[11], kOne);
+        step2[12] = _mm256_add_epi16(step2[12], kOne);
+        step2[13] = _mm256_add_epi16(step2[13], kOne);
+        step2[14] = _mm256_add_epi16(step2[14], kOne);
+        step2[15] = _mm256_add_epi16(step2[15], kOne);
+        step1[16] = _mm256_add_epi16(step1[16], kOne);
+        step1[17] = _mm256_add_epi16(step1[17], kOne);
+        step1[18] = _mm256_add_epi16(step1[18], kOne);
+        step1[19] = _mm256_add_epi16(step1[19], kOne);
+        step2[20] = _mm256_add_epi16(step2[20], kOne);
+        step2[21] = _mm256_add_epi16(step2[21], kOne);
+        step2[22] = _mm256_add_epi16(step2[22], kOne);
+        step2[23] = _mm256_add_epi16(step2[23], kOne);
+        step2[24] = _mm256_add_epi16(step2[24], kOne);
+        step2[25] = _mm256_add_epi16(step2[25], kOne);
+        step2[26] = _mm256_add_epi16(step2[26], kOne);
+        step2[27] = _mm256_add_epi16(step2[27], kOne);
+        step1[28] = _mm256_add_epi16(step1[28], kOne);
+        step1[29] = _mm256_add_epi16(step1[29], kOne);
+        step1[30] = _mm256_add_epi16(step1[30], kOne);
+        step1[31] = _mm256_add_epi16(step1[31], kOne);
+
+        step2[ 0] = _mm256_srai_epi16(step2[ 0], 2);
+        step2[ 1] = _mm256_srai_epi16(step2[ 1], 2);
+        step2[ 2] = _mm256_srai_epi16(step2[ 2], 2);
+        step2[ 3] = _mm256_srai_epi16(step2[ 3], 2);
+        step2[ 4] = _mm256_srai_epi16(step2[ 4], 2);
+        step2[ 5] = _mm256_srai_epi16(step2[ 5], 2);
+        step2[ 6] = _mm256_srai_epi16(step2[ 6], 2);
+        step2[ 7] = _mm256_srai_epi16(step2[ 7], 2);
+        step2[ 8] = _mm256_srai_epi16(step2[ 8], 2);
+        step2[ 9] = _mm256_srai_epi16(step2[ 9], 2);
+        step2[10] = _mm256_srai_epi16(step2[10], 2);
+        step2[11] = _mm256_srai_epi16(step2[11], 2);
+        step2[12] = _mm256_srai_epi16(step2[12], 2);
+        step2[13] = _mm256_srai_epi16(step2[13], 2);
+        step2[14] = _mm256_srai_epi16(step2[14], 2);
+        step2[15] = _mm256_srai_epi16(step2[15], 2);
+        step1[16] = _mm256_srai_epi16(step1[16], 2);
+        step1[17] = _mm256_srai_epi16(step1[17], 2);
+        step1[18] = _mm256_srai_epi16(step1[18], 2);
+        step1[19] = _mm256_srai_epi16(step1[19], 2);
+        step2[20] = _mm256_srai_epi16(step2[20], 2);
+        step2[21] = _mm256_srai_epi16(step2[21], 2);
+        step2[22] = _mm256_srai_epi16(step2[22], 2);
+        step2[23] = _mm256_srai_epi16(step2[23], 2);
+        step2[24] = _mm256_srai_epi16(step2[24], 2);
+        step2[25] = _mm256_srai_epi16(step2[25], 2);
+        step2[26] = _mm256_srai_epi16(step2[26], 2);
+        step2[27] = _mm256_srai_epi16(step2[27], 2);
+        step1[28] = _mm256_srai_epi16(step1[28], 2);
+        step1[29] = _mm256_srai_epi16(step1[29], 2);
+        step1[30] = _mm256_srai_epi16(step1[30], 2);
+        step1[31] = _mm256_srai_epi16(step1[31], 2);
+      }
+#endif
+
+#if FDCT32x32_HIGH_PRECISION
+      if (pass == 0) {
+#endif
+      // Stage 3
+      {
+        step3[0] = _mm256_add_epi16(step2[(8 - 1)], step2[0]);
+        step3[1] = _mm256_add_epi16(step2[(8 - 2)], step2[1]);
+        step3[2] = _mm256_add_epi16(step2[(8 - 3)], step2[2]);
+        step3[3] = _mm256_add_epi16(step2[(8 - 4)], step2[3]);
+        step3[4] = _mm256_sub_epi16(step2[(8 - 5)], step2[4]);
+        step3[5] = _mm256_sub_epi16(step2[(8 - 6)], step2[5]);
+        step3[6] = _mm256_sub_epi16(step2[(8 - 7)], step2[6]);
+        step3[7] = _mm256_sub_epi16(step2[(8 - 8)], step2[7]);
+      }
+      {
+        const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]);
+        const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]);
+        const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]);
+        const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]);
+        const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16);
+        const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16);
+        const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16);
+        const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16);
+        const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16);
+        const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16);
+        const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16);
+        const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16);
+        // dct_const_round_shift
+        const __m256i s3_10_4 = _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_10_5 = _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_11_4 = _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_11_5 = _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_12_4 = _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_12_5 = _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_13_4 = _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_13_5 = _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_10_6 = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS);
+        const __m256i s3_10_7 = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS);
+        const __m256i s3_11_6 = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS);
+        const __m256i s3_11_7 = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS);
+        const __m256i s3_12_6 = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS);
+        const __m256i s3_12_7 = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS);
+        const __m256i s3_13_6 = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS);
+        const __m256i s3_13_7 = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS);
+        // Combine
+        step3[10] = _mm256_packs_epi32(s3_10_6, s3_10_7);
+        step3[11] = _mm256_packs_epi32(s3_11_6, s3_11_7);
+        step3[12] = _mm256_packs_epi32(s3_12_6, s3_12_7);
+        step3[13] = _mm256_packs_epi32(s3_13_6, s3_13_7);
+      }
+      {
+        step3[16] = _mm256_add_epi16(step2[23], step1[16]);
+        step3[17] = _mm256_add_epi16(step2[22], step1[17]);
+        step3[18] = _mm256_add_epi16(step2[21], step1[18]);
+        step3[19] = _mm256_add_epi16(step2[20], step1[19]);
+        step3[20] = _mm256_sub_epi16(step1[19], step2[20]);
+        step3[21] = _mm256_sub_epi16(step1[18], step2[21]);
+        step3[22] = _mm256_sub_epi16(step1[17], step2[22]);
+        step3[23] = _mm256_sub_epi16(step1[16], step2[23]);
+        step3[24] = _mm256_sub_epi16(step1[31], step2[24]);
+        step3[25] = _mm256_sub_epi16(step1[30], step2[25]);
+        step3[26] = _mm256_sub_epi16(step1[29], step2[26]);
+        step3[27] = _mm256_sub_epi16(step1[28], step2[27]);
+        step3[28] = _mm256_add_epi16(step2[27], step1[28]);
+        step3[29] = _mm256_add_epi16(step2[26], step1[29]);
+        step3[30] = _mm256_add_epi16(step2[25], step1[30]);
+        step3[31] = _mm256_add_epi16(step2[24], step1[31]);
+      }
+
+      // Stage 4
+      {
+        step1[ 0] = _mm256_add_epi16(step3[ 3], step3[ 0]);
+        step1[ 1] = _mm256_add_epi16(step3[ 2], step3[ 1]);
+        step1[ 2] = _mm256_sub_epi16(step3[ 1], step3[ 2]);
+        step1[ 3] = _mm256_sub_epi16(step3[ 0], step3[ 3]);
+        step1[ 8] = _mm256_add_epi16(step3[11], step2[ 8]);
+        step1[ 9] = _mm256_add_epi16(step3[10], step2[ 9]);
+        step1[10] = _mm256_sub_epi16(step2[ 9], step3[10]);
+        step1[11] = _mm256_sub_epi16(step2[ 8], step3[11]);
+        step1[12] = _mm256_sub_epi16(step2[15], step3[12]);
+        step1[13] = _mm256_sub_epi16(step2[14], step3[13]);
+        step1[14] = _mm256_add_epi16(step3[13], step2[14]);
+        step1[15] = _mm256_add_epi16(step3[12], step2[15]);
+      }
+      {
+        const __m256i s1_05_0 = _mm256_unpacklo_epi16(step3[6], step3[5]);
+        const __m256i s1_05_1 = _mm256_unpackhi_epi16(step3[6], step3[5]);
+        const __m256i s1_05_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_m16);
+        const __m256i s1_05_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_m16);
+        const __m256i s1_06_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_p16);
+        const __m256i s1_06_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_p16);
+        // dct_const_round_shift
+        const __m256i s1_05_4 = _mm256_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_05_5 = _mm256_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_06_4 = _mm256_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_06_5 = _mm256_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_05_6 = _mm256_srai_epi32(s1_05_4, DCT_CONST_BITS);
+        const __m256i s1_05_7 = _mm256_srai_epi32(s1_05_5, DCT_CONST_BITS);
+        const __m256i s1_06_6 = _mm256_srai_epi32(s1_06_4, DCT_CONST_BITS);
+        const __m256i s1_06_7 = _mm256_srai_epi32(s1_06_5, DCT_CONST_BITS);
+        // Combine
+        step1[5] = _mm256_packs_epi32(s1_05_6, s1_05_7);
+        step1[6] = _mm256_packs_epi32(s1_06_6, s1_06_7);
+      }
+      {
+        const __m256i s1_18_0 = _mm256_unpacklo_epi16(step3[18], step3[29]);
+        const __m256i s1_18_1 = _mm256_unpackhi_epi16(step3[18], step3[29]);
+        const __m256i s1_19_0 = _mm256_unpacklo_epi16(step3[19], step3[28]);
+        const __m256i s1_19_1 = _mm256_unpackhi_epi16(step3[19], step3[28]);
+        const __m256i s1_20_0 = _mm256_unpacklo_epi16(step3[20], step3[27]);
+        const __m256i s1_20_1 = _mm256_unpackhi_epi16(step3[20], step3[27]);
+        const __m256i s1_21_0 = _mm256_unpacklo_epi16(step3[21], step3[26]);
+        const __m256i s1_21_1 = _mm256_unpackhi_epi16(step3[21], step3[26]);
+        const __m256i s1_18_2 = _mm256_madd_epi16(s1_18_0, k__cospi_m08_p24);
+        const __m256i s1_18_3 = _mm256_madd_epi16(s1_18_1, k__cospi_m08_p24);
+        const __m256i s1_19_2 = _mm256_madd_epi16(s1_19_0, k__cospi_m08_p24);
+        const __m256i s1_19_3 = _mm256_madd_epi16(s1_19_1, k__cospi_m08_p24);
+        const __m256i s1_20_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m24_m08);
+        const __m256i s1_20_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m24_m08);
+        const __m256i s1_21_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m24_m08);
+        const __m256i s1_21_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m24_m08);
+        const __m256i s1_26_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m08_p24);
+        const __m256i s1_26_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m08_p24);
+        const __m256i s1_27_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m08_p24);
+        const __m256i s1_27_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m08_p24);
+        const __m256i s1_28_2 = _mm256_madd_epi16(s1_19_0, k__cospi_p24_p08);
+        const __m256i s1_28_3 = _mm256_madd_epi16(s1_19_1, k__cospi_p24_p08);
+        const __m256i s1_29_2 = _mm256_madd_epi16(s1_18_0, k__cospi_p24_p08);
+        const __m256i s1_29_3 = _mm256_madd_epi16(s1_18_1, k__cospi_p24_p08);
+        // dct_const_round_shift
+        const __m256i s1_18_4 = _mm256_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_18_5 = _mm256_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_19_4 = _mm256_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_19_5 = _mm256_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_20_4 = _mm256_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_20_5 = _mm256_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_21_4 = _mm256_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_21_5 = _mm256_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_26_4 = _mm256_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_26_5 = _mm256_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_27_4 = _mm256_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_27_5 = _mm256_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_28_4 = _mm256_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_28_5 = _mm256_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_29_4 = _mm256_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_29_5 = _mm256_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_18_6 = _mm256_srai_epi32(s1_18_4, DCT_CONST_BITS);
+        const __m256i s1_18_7 = _mm256_srai_epi32(s1_18_5, DCT_CONST_BITS);
+        const __m256i s1_19_6 = _mm256_srai_epi32(s1_19_4, DCT_CONST_BITS);
+        const __m256i s1_19_7 = _mm256_srai_epi32(s1_19_5, DCT_CONST_BITS);
+        const __m256i s1_20_6 = _mm256_srai_epi32(s1_20_4, DCT_CONST_BITS);
+        const __m256i s1_20_7 = _mm256_srai_epi32(s1_20_5, DCT_CONST_BITS);
+        const __m256i s1_21_6 = _mm256_srai_epi32(s1_21_4, DCT_CONST_BITS);
+        const __m256i s1_21_7 = _mm256_srai_epi32(s1_21_5, DCT_CONST_BITS);
+        const __m256i s1_26_6 = _mm256_srai_epi32(s1_26_4, DCT_CONST_BITS);
+        const __m256i s1_26_7 = _mm256_srai_epi32(s1_26_5, DCT_CONST_BITS);
+        const __m256i s1_27_6 = _mm256_srai_epi32(s1_27_4, DCT_CONST_BITS);
+        const __m256i s1_27_7 = _mm256_srai_epi32(s1_27_5, DCT_CONST_BITS);
+        const __m256i s1_28_6 = _mm256_srai_epi32(s1_28_4, DCT_CONST_BITS);
+        const __m256i s1_28_7 = _mm256_srai_epi32(s1_28_5, DCT_CONST_BITS);
+        const __m256i s1_29_6 = _mm256_srai_epi32(s1_29_4, DCT_CONST_BITS);
+        const __m256i s1_29_7 = _mm256_srai_epi32(s1_29_5, DCT_CONST_BITS);
+        // Combine
+        step1[18] = _mm256_packs_epi32(s1_18_6, s1_18_7);
+        step1[19] = _mm256_packs_epi32(s1_19_6, s1_19_7);
+        step1[20] = _mm256_packs_epi32(s1_20_6, s1_20_7);
+        step1[21] = _mm256_packs_epi32(s1_21_6, s1_21_7);
+        step1[26] = _mm256_packs_epi32(s1_26_6, s1_26_7);
+        step1[27] = _mm256_packs_epi32(s1_27_6, s1_27_7);
+        step1[28] = _mm256_packs_epi32(s1_28_6, s1_28_7);
+        step1[29] = _mm256_packs_epi32(s1_29_6, s1_29_7);
+      }
+      // Stage 5
+      {
+        step2[4] = _mm256_add_epi16(step1[5], step3[4]);
+        step2[5] = _mm256_sub_epi16(step3[4], step1[5]);
+        step2[6] = _mm256_sub_epi16(step3[7], step1[6]);
+        step2[7] = _mm256_add_epi16(step1[6], step3[7]);
+      }
+      {
+        const __m256i out_00_0 = _mm256_unpacklo_epi16(step1[0], step1[1]);
+        const __m256i out_00_1 = _mm256_unpackhi_epi16(step1[0], step1[1]);
+        const __m256i out_08_0 = _mm256_unpacklo_epi16(step1[2], step1[3]);
+        const __m256i out_08_1 = _mm256_unpackhi_epi16(step1[2], step1[3]);
+        const __m256i out_00_2 = _mm256_madd_epi16(out_00_0, k__cospi_p16_p16);
+        const __m256i out_00_3 = _mm256_madd_epi16(out_00_1, k__cospi_p16_p16);
+        const __m256i out_16_2 = _mm256_madd_epi16(out_00_0, k__cospi_p16_m16);
+        const __m256i out_16_3 = _mm256_madd_epi16(out_00_1, k__cospi_p16_m16);
+        const __m256i out_08_2 = _mm256_madd_epi16(out_08_0, k__cospi_p24_p08);
+        const __m256i out_08_3 = _mm256_madd_epi16(out_08_1, k__cospi_p24_p08);
+        const __m256i out_24_2 = _mm256_madd_epi16(out_08_0, k__cospi_m08_p24);
+        const __m256i out_24_3 = _mm256_madd_epi16(out_08_1, k__cospi_m08_p24);
+        // dct_const_round_shift
+        const __m256i out_00_4 = _mm256_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_00_5 = _mm256_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_16_4 = _mm256_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_16_5 = _mm256_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_08_4 = _mm256_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_08_5 = _mm256_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_24_4 = _mm256_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_24_5 = _mm256_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_00_6 = _mm256_srai_epi32(out_00_4, DCT_CONST_BITS);
+        const __m256i out_00_7 = _mm256_srai_epi32(out_00_5, DCT_CONST_BITS);
+        const __m256i out_16_6 = _mm256_srai_epi32(out_16_4, DCT_CONST_BITS);
+        const __m256i out_16_7 = _mm256_srai_epi32(out_16_5, DCT_CONST_BITS);
+        const __m256i out_08_6 = _mm256_srai_epi32(out_08_4, DCT_CONST_BITS);
+        const __m256i out_08_7 = _mm256_srai_epi32(out_08_5, DCT_CONST_BITS);
+        const __m256i out_24_6 = _mm256_srai_epi32(out_24_4, DCT_CONST_BITS);
+        const __m256i out_24_7 = _mm256_srai_epi32(out_24_5, DCT_CONST_BITS);
+        // Combine
+        out[ 0] = _mm256_packs_epi32(out_00_6, out_00_7);
+        out[16] = _mm256_packs_epi32(out_16_6, out_16_7);
+        out[ 8] = _mm256_packs_epi32(out_08_6, out_08_7);
+        out[24] = _mm256_packs_epi32(out_24_6, out_24_7);
+      }
+      {
+        const __m256i s2_09_0 = _mm256_unpacklo_epi16(step1[ 9], step1[14]);
+        const __m256i s2_09_1 = _mm256_unpackhi_epi16(step1[ 9], step1[14]);
+        const __m256i s2_10_0 = _mm256_unpacklo_epi16(step1[10], step1[13]);
+        const __m256i s2_10_1 = _mm256_unpackhi_epi16(step1[10], step1[13]);
+        const __m256i s2_09_2 = _mm256_madd_epi16(s2_09_0, k__cospi_m08_p24);
+        const __m256i s2_09_3 = _mm256_madd_epi16(s2_09_1, k__cospi_m08_p24);
+        const __m256i s2_10_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m24_m08);
+        const __m256i s2_10_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m24_m08);
+        const __m256i s2_13_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m08_p24);
+        const __m256i s2_13_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m08_p24);
+        const __m256i s2_14_2 = _mm256_madd_epi16(s2_09_0, k__cospi_p24_p08);
+        const __m256i s2_14_3 = _mm256_madd_epi16(s2_09_1, k__cospi_p24_p08);
+        // dct_const_round_shift
+        const __m256i s2_09_4 = _mm256_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_09_5 = _mm256_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_10_4 = _mm256_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_10_5 = _mm256_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_13_4 = _mm256_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_13_5 = _mm256_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_14_4 = _mm256_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_14_5 = _mm256_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_09_6 = _mm256_srai_epi32(s2_09_4, DCT_CONST_BITS);
+        const __m256i s2_09_7 = _mm256_srai_epi32(s2_09_5, DCT_CONST_BITS);
+        const __m256i s2_10_6 = _mm256_srai_epi32(s2_10_4, DCT_CONST_BITS);
+        const __m256i s2_10_7 = _mm256_srai_epi32(s2_10_5, DCT_CONST_BITS);
+        const __m256i s2_13_6 = _mm256_srai_epi32(s2_13_4, DCT_CONST_BITS);
+        const __m256i s2_13_7 = _mm256_srai_epi32(s2_13_5, DCT_CONST_BITS);
+        const __m256i s2_14_6 = _mm256_srai_epi32(s2_14_4, DCT_CONST_BITS);
+        const __m256i s2_14_7 = _mm256_srai_epi32(s2_14_5, DCT_CONST_BITS);
+        // Combine
+        step2[ 9] = _mm256_packs_epi32(s2_09_6, s2_09_7);
+        step2[10] = _mm256_packs_epi32(s2_10_6, s2_10_7);
+        step2[13] = _mm256_packs_epi32(s2_13_6, s2_13_7);
+        step2[14] = _mm256_packs_epi32(s2_14_6, s2_14_7);
+      }
+      {
+        step2[16] = _mm256_add_epi16(step1[19], step3[16]);
+        step2[17] = _mm256_add_epi16(step1[18], step3[17]);
+        step2[18] = _mm256_sub_epi16(step3[17], step1[18]);
+        step2[19] = _mm256_sub_epi16(step3[16], step1[19]);
+        step2[20] = _mm256_sub_epi16(step3[23], step1[20]);
+        step2[21] = _mm256_sub_epi16(step3[22], step1[21]);
+        step2[22] = _mm256_add_epi16(step1[21], step3[22]);
+        step2[23] = _mm256_add_epi16(step1[20], step3[23]);
+        step2[24] = _mm256_add_epi16(step1[27], step3[24]);
+        step2[25] = _mm256_add_epi16(step1[26], step3[25]);
+        step2[26] = _mm256_sub_epi16(step3[25], step1[26]);
+        step2[27] = _mm256_sub_epi16(step3[24], step1[27]);
+        step2[28] = _mm256_sub_epi16(step3[31], step1[28]);
+        step2[29] = _mm256_sub_epi16(step3[30], step1[29]);
+        step2[30] = _mm256_add_epi16(step1[29], step3[30]);
+        step2[31] = _mm256_add_epi16(step1[28], step3[31]);
+      }
+      // Stage 6
+      {
+        const __m256i out_04_0 = _mm256_unpacklo_epi16(step2[4], step2[7]);
+        const __m256i out_04_1 = _mm256_unpackhi_epi16(step2[4], step2[7]);
+        const __m256i out_20_0 = _mm256_unpacklo_epi16(step2[5], step2[6]);
+        const __m256i out_20_1 = _mm256_unpackhi_epi16(step2[5], step2[6]);
+        const __m256i out_12_0 = _mm256_unpacklo_epi16(step2[5], step2[6]);
+        const __m256i out_12_1 = _mm256_unpackhi_epi16(step2[5], step2[6]);
+        const __m256i out_28_0 = _mm256_unpacklo_epi16(step2[4], step2[7]);
+        const __m256i out_28_1 = _mm256_unpackhi_epi16(step2[4], step2[7]);
+        const __m256i out_04_2 = _mm256_madd_epi16(out_04_0, k__cospi_p28_p04);
+        const __m256i out_04_3 = _mm256_madd_epi16(out_04_1, k__cospi_p28_p04);
+        const __m256i out_20_2 = _mm256_madd_epi16(out_20_0, k__cospi_p12_p20);
+        const __m256i out_20_3 = _mm256_madd_epi16(out_20_1, k__cospi_p12_p20);
+        const __m256i out_12_2 = _mm256_madd_epi16(out_12_0, k__cospi_m20_p12);
+        const __m256i out_12_3 = _mm256_madd_epi16(out_12_1, k__cospi_m20_p12);
+        const __m256i out_28_2 = _mm256_madd_epi16(out_28_0, k__cospi_m04_p28);
+        const __m256i out_28_3 = _mm256_madd_epi16(out_28_1, k__cospi_m04_p28);
+        // dct_const_round_shift
+        const __m256i out_04_4 = _mm256_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_04_5 = _mm256_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_20_4 = _mm256_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_20_5 = _mm256_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_12_4 = _mm256_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_12_5 = _mm256_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_28_4 = _mm256_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_28_5 = _mm256_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_04_6 = _mm256_srai_epi32(out_04_4, DCT_CONST_BITS);
+        const __m256i out_04_7 = _mm256_srai_epi32(out_04_5, DCT_CONST_BITS);
+        const __m256i out_20_6 = _mm256_srai_epi32(out_20_4, DCT_CONST_BITS);
+        const __m256i out_20_7 = _mm256_srai_epi32(out_20_5, DCT_CONST_BITS);
+        const __m256i out_12_6 = _mm256_srai_epi32(out_12_4, DCT_CONST_BITS);
+        const __m256i out_12_7 = _mm256_srai_epi32(out_12_5, DCT_CONST_BITS);
+        const __m256i out_28_6 = _mm256_srai_epi32(out_28_4, DCT_CONST_BITS);
+        const __m256i out_28_7 = _mm256_srai_epi32(out_28_5, DCT_CONST_BITS);
+        // Combine
+        out[ 4] = _mm256_packs_epi32(out_04_6, out_04_7);
+        out[20] = _mm256_packs_epi32(out_20_6, out_20_7);
+        out[12] = _mm256_packs_epi32(out_12_6, out_12_7);
+        out[28] = _mm256_packs_epi32(out_28_6, out_28_7);
+      }
+      {
+        step3[ 8] = _mm256_add_epi16(step2[ 9], step1[ 8]);
+        step3[ 9] = _mm256_sub_epi16(step1[ 8], step2[ 9]);
+        step3[10] = _mm256_sub_epi16(step1[11], step2[10]);
+        step3[11] = _mm256_add_epi16(step2[10], step1[11]);
+        step3[12] = _mm256_add_epi16(step2[13], step1[12]);
+        step3[13] = _mm256_sub_epi16(step1[12], step2[13]);
+        step3[14] = _mm256_sub_epi16(step1[15], step2[14]);
+        step3[15] = _mm256_add_epi16(step2[14], step1[15]);
+      }
+      {
+        const __m256i s3_17_0 = _mm256_unpacklo_epi16(step2[17], step2[30]);
+        const __m256i s3_17_1 = _mm256_unpackhi_epi16(step2[17], step2[30]);
+        const __m256i s3_18_0 = _mm256_unpacklo_epi16(step2[18], step2[29]);
+        const __m256i s3_18_1 = _mm256_unpackhi_epi16(step2[18], step2[29]);
+        const __m256i s3_21_0 = _mm256_unpacklo_epi16(step2[21], step2[26]);
+        const __m256i s3_21_1 = _mm256_unpackhi_epi16(step2[21], step2[26]);
+        const __m256i s3_22_0 = _mm256_unpacklo_epi16(step2[22], step2[25]);
+        const __m256i s3_22_1 = _mm256_unpackhi_epi16(step2[22], step2[25]);
+        const __m256i s3_17_2 = _mm256_madd_epi16(s3_17_0, k__cospi_m04_p28);
+        const __m256i s3_17_3 = _mm256_madd_epi16(s3_17_1, k__cospi_m04_p28);
+        const __m256i s3_18_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m28_m04);
+        const __m256i s3_18_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m28_m04);
+        const __m256i s3_21_2 = _mm256_madd_epi16(s3_21_0, k__cospi_m20_p12);
+        const __m256i s3_21_3 = _mm256_madd_epi16(s3_21_1, k__cospi_m20_p12);
+        const __m256i s3_22_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m12_m20);
+        const __m256i s3_22_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m12_m20);
+        const __m256i s3_25_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m20_p12);
+        const __m256i s3_25_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m20_p12);
+        const __m256i s3_26_2 = _mm256_madd_epi16(s3_21_0, k__cospi_p12_p20);
+        const __m256i s3_26_3 = _mm256_madd_epi16(s3_21_1, k__cospi_p12_p20);
+        const __m256i s3_29_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m04_p28);
+        const __m256i s3_29_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m04_p28);
+        const __m256i s3_30_2 = _mm256_madd_epi16(s3_17_0, k__cospi_p28_p04);
+        const __m256i s3_30_3 = _mm256_madd_epi16(s3_17_1, k__cospi_p28_p04);
+        // dct_const_round_shift
+        const __m256i s3_17_4 = _mm256_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_17_5 = _mm256_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_18_4 = _mm256_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_18_5 = _mm256_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_21_4 = _mm256_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_21_5 = _mm256_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_22_4 = _mm256_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_22_5 = _mm256_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_17_6 = _mm256_srai_epi32(s3_17_4, DCT_CONST_BITS);
+        const __m256i s3_17_7 = _mm256_srai_epi32(s3_17_5, DCT_CONST_BITS);
+        const __m256i s3_18_6 = _mm256_srai_epi32(s3_18_4, DCT_CONST_BITS);
+        const __m256i s3_18_7 = _mm256_srai_epi32(s3_18_5, DCT_CONST_BITS);
+        const __m256i s3_21_6 = _mm256_srai_epi32(s3_21_4, DCT_CONST_BITS);
+        const __m256i s3_21_7 = _mm256_srai_epi32(s3_21_5, DCT_CONST_BITS);
+        const __m256i s3_22_6 = _mm256_srai_epi32(s3_22_4, DCT_CONST_BITS);
+        const __m256i s3_22_7 = _mm256_srai_epi32(s3_22_5, DCT_CONST_BITS);
+        const __m256i s3_25_4 = _mm256_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_25_5 = _mm256_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_26_4 = _mm256_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_26_5 = _mm256_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_29_4 = _mm256_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_29_5 = _mm256_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_30_4 = _mm256_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_30_5 = _mm256_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_25_6 = _mm256_srai_epi32(s3_25_4, DCT_CONST_BITS);
+        const __m256i s3_25_7 = _mm256_srai_epi32(s3_25_5, DCT_CONST_BITS);
+        const __m256i s3_26_6 = _mm256_srai_epi32(s3_26_4, DCT_CONST_BITS);
+        const __m256i s3_26_7 = _mm256_srai_epi32(s3_26_5, DCT_CONST_BITS);
+        const __m256i s3_29_6 = _mm256_srai_epi32(s3_29_4, DCT_CONST_BITS);
+        const __m256i s3_29_7 = _mm256_srai_epi32(s3_29_5, DCT_CONST_BITS);
+        const __m256i s3_30_6 = _mm256_srai_epi32(s3_30_4, DCT_CONST_BITS);
+        const __m256i s3_30_7 = _mm256_srai_epi32(s3_30_5, DCT_CONST_BITS);
+        // Combine
+        step3[17] = _mm256_packs_epi32(s3_17_6, s3_17_7);
+        step3[18] = _mm256_packs_epi32(s3_18_6, s3_18_7);
+        step3[21] = _mm256_packs_epi32(s3_21_6, s3_21_7);
+        step3[22] = _mm256_packs_epi32(s3_22_6, s3_22_7);
+        // Combine
+        step3[25] = _mm256_packs_epi32(s3_25_6, s3_25_7);
+        step3[26] = _mm256_packs_epi32(s3_26_6, s3_26_7);
+        step3[29] = _mm256_packs_epi32(s3_29_6, s3_29_7);
+        step3[30] = _mm256_packs_epi32(s3_30_6, s3_30_7);
+      }
+      // Stage 7
+      {
+        const __m256i out_02_0 = _mm256_unpacklo_epi16(step3[ 8], step3[15]);
+        const __m256i out_02_1 = _mm256_unpackhi_epi16(step3[ 8], step3[15]);
+        const __m256i out_18_0 = _mm256_unpacklo_epi16(step3[ 9], step3[14]);
+        const __m256i out_18_1 = _mm256_unpackhi_epi16(step3[ 9], step3[14]);
+        const __m256i out_10_0 = _mm256_unpacklo_epi16(step3[10], step3[13]);
+        const __m256i out_10_1 = _mm256_unpackhi_epi16(step3[10], step3[13]);
+        const __m256i out_26_0 = _mm256_unpacklo_epi16(step3[11], step3[12]);
+        const __m256i out_26_1 = _mm256_unpackhi_epi16(step3[11], step3[12]);
+        const __m256i out_02_2 = _mm256_madd_epi16(out_02_0, k__cospi_p30_p02);
+        const __m256i out_02_3 = _mm256_madd_epi16(out_02_1, k__cospi_p30_p02);
+        const __m256i out_18_2 = _mm256_madd_epi16(out_18_0, k__cospi_p14_p18);
+        const __m256i out_18_3 = _mm256_madd_epi16(out_18_1, k__cospi_p14_p18);
+        const __m256i out_10_2 = _mm256_madd_epi16(out_10_0, k__cospi_p22_p10);
+        const __m256i out_10_3 = _mm256_madd_epi16(out_10_1, k__cospi_p22_p10);
+        const __m256i out_26_2 = _mm256_madd_epi16(out_26_0, k__cospi_p06_p26);
+        const __m256i out_26_3 = _mm256_madd_epi16(out_26_1, k__cospi_p06_p26);
+        const __m256i out_06_2 = _mm256_madd_epi16(out_26_0, k__cospi_m26_p06);
+        const __m256i out_06_3 = _mm256_madd_epi16(out_26_1, k__cospi_m26_p06);
+        const __m256i out_22_2 = _mm256_madd_epi16(out_10_0, k__cospi_m10_p22);
+        const __m256i out_22_3 = _mm256_madd_epi16(out_10_1, k__cospi_m10_p22);
+        const __m256i out_14_2 = _mm256_madd_epi16(out_18_0, k__cospi_m18_p14);
+        const __m256i out_14_3 = _mm256_madd_epi16(out_18_1, k__cospi_m18_p14);
+        const __m256i out_30_2 = _mm256_madd_epi16(out_02_0, k__cospi_m02_p30);
+        const __m256i out_30_3 = _mm256_madd_epi16(out_02_1, k__cospi_m02_p30);
+        // dct_const_round_shift
+        const __m256i out_02_4 = _mm256_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_02_5 = _mm256_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_18_4 = _mm256_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_18_5 = _mm256_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_10_4 = _mm256_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_10_5 = _mm256_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_26_4 = _mm256_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_26_5 = _mm256_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_06_4 = _mm256_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_06_5 = _mm256_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_22_4 = _mm256_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_22_5 = _mm256_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_14_4 = _mm256_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_14_5 = _mm256_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_30_4 = _mm256_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_30_5 = _mm256_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_02_6 = _mm256_srai_epi32(out_02_4, DCT_CONST_BITS);
+        const __m256i out_02_7 = _mm256_srai_epi32(out_02_5, DCT_CONST_BITS);
+        const __m256i out_18_6 = _mm256_srai_epi32(out_18_4, DCT_CONST_BITS);
+        const __m256i out_18_7 = _mm256_srai_epi32(out_18_5, DCT_CONST_BITS);
+        const __m256i out_10_6 = _mm256_srai_epi32(out_10_4, DCT_CONST_BITS);
+        const __m256i out_10_7 = _mm256_srai_epi32(out_10_5, DCT_CONST_BITS);
+        const __m256i out_26_6 = _mm256_srai_epi32(out_26_4, DCT_CONST_BITS);
+        const __m256i out_26_7 = _mm256_srai_epi32(out_26_5, DCT_CONST_BITS);
+        const __m256i out_06_6 = _mm256_srai_epi32(out_06_4, DCT_CONST_BITS);
+        const __m256i out_06_7 = _mm256_srai_epi32(out_06_5, DCT_CONST_BITS);
+        const __m256i out_22_6 = _mm256_srai_epi32(out_22_4, DCT_CONST_BITS);
+        const __m256i out_22_7 = _mm256_srai_epi32(out_22_5, DCT_CONST_BITS);
+        const __m256i out_14_6 = _mm256_srai_epi32(out_14_4, DCT_CONST_BITS);
+        const __m256i out_14_7 = _mm256_srai_epi32(out_14_5, DCT_CONST_BITS);
+        const __m256i out_30_6 = _mm256_srai_epi32(out_30_4, DCT_CONST_BITS);
+        const __m256i out_30_7 = _mm256_srai_epi32(out_30_5, DCT_CONST_BITS);
+        // Combine
+        out[ 2] = _mm256_packs_epi32(out_02_6, out_02_7);
+        out[18] = _mm256_packs_epi32(out_18_6, out_18_7);
+        out[10] = _mm256_packs_epi32(out_10_6, out_10_7);
+        out[26] = _mm256_packs_epi32(out_26_6, out_26_7);
+        out[ 6] = _mm256_packs_epi32(out_06_6, out_06_7);
+        out[22] = _mm256_packs_epi32(out_22_6, out_22_7);
+        out[14] = _mm256_packs_epi32(out_14_6, out_14_7);
+        out[30] = _mm256_packs_epi32(out_30_6, out_30_7);
+      }
+      {
+        step1[16] = _mm256_add_epi16(step3[17], step2[16]);
+        step1[17] = _mm256_sub_epi16(step2[16], step3[17]);
+        step1[18] = _mm256_sub_epi16(step2[19], step3[18]);
+        step1[19] = _mm256_add_epi16(step3[18], step2[19]);
+        step1[20] = _mm256_add_epi16(step3[21], step2[20]);
+        step1[21] = _mm256_sub_epi16(step2[20], step3[21]);
+        step1[22] = _mm256_sub_epi16(step2[23], step3[22]);
+        step1[23] = _mm256_add_epi16(step3[22], step2[23]);
+        step1[24] = _mm256_add_epi16(step3[25], step2[24]);
+        step1[25] = _mm256_sub_epi16(step2[24], step3[25]);
+        step1[26] = _mm256_sub_epi16(step2[27], step3[26]);
+        step1[27] = _mm256_add_epi16(step3[26], step2[27]);
+        step1[28] = _mm256_add_epi16(step3[29], step2[28]);
+        step1[29] = _mm256_sub_epi16(step2[28], step3[29]);
+        step1[30] = _mm256_sub_epi16(step2[31], step3[30]);
+        step1[31] = _mm256_add_epi16(step3[30], step2[31]);
+      }
+      // Final stage --- output indices are bit-reversed.
+      {
+        const __m256i out_01_0 = _mm256_unpacklo_epi16(step1[16], step1[31]);
+        const __m256i out_01_1 = _mm256_unpackhi_epi16(step1[16], step1[31]);
+        const __m256i out_17_0 = _mm256_unpacklo_epi16(step1[17], step1[30]);
+        const __m256i out_17_1 = _mm256_unpackhi_epi16(step1[17], step1[30]);
+        const __m256i out_09_0 = _mm256_unpacklo_epi16(step1[18], step1[29]);
+        const __m256i out_09_1 = _mm256_unpackhi_epi16(step1[18], step1[29]);
+        const __m256i out_25_0 = _mm256_unpacklo_epi16(step1[19], step1[28]);
+        const __m256i out_25_1 = _mm256_unpackhi_epi16(step1[19], step1[28]);
+        const __m256i out_01_2 = _mm256_madd_epi16(out_01_0, k__cospi_p31_p01);
+        const __m256i out_01_3 = _mm256_madd_epi16(out_01_1, k__cospi_p31_p01);
+        const __m256i out_17_2 = _mm256_madd_epi16(out_17_0, k__cospi_p15_p17);
+        const __m256i out_17_3 = _mm256_madd_epi16(out_17_1, k__cospi_p15_p17);
+        const __m256i out_09_2 = _mm256_madd_epi16(out_09_0, k__cospi_p23_p09);
+        const __m256i out_09_3 = _mm256_madd_epi16(out_09_1, k__cospi_p23_p09);
+        const __m256i out_25_2 = _mm256_madd_epi16(out_25_0, k__cospi_p07_p25);
+        const __m256i out_25_3 = _mm256_madd_epi16(out_25_1, k__cospi_p07_p25);
+        const __m256i out_07_2 = _mm256_madd_epi16(out_25_0, k__cospi_m25_p07);
+        const __m256i out_07_3 = _mm256_madd_epi16(out_25_1, k__cospi_m25_p07);
+        const __m256i out_23_2 = _mm256_madd_epi16(out_09_0, k__cospi_m09_p23);
+        const __m256i out_23_3 = _mm256_madd_epi16(out_09_1, k__cospi_m09_p23);
+        const __m256i out_15_2 = _mm256_madd_epi16(out_17_0, k__cospi_m17_p15);
+        const __m256i out_15_3 = _mm256_madd_epi16(out_17_1, k__cospi_m17_p15);
+        const __m256i out_31_2 = _mm256_madd_epi16(out_01_0, k__cospi_m01_p31);
+        const __m256i out_31_3 = _mm256_madd_epi16(out_01_1, k__cospi_m01_p31);
+        // dct_const_round_shift
+        const __m256i out_01_4 = _mm256_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_01_5 = _mm256_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_17_4 = _mm256_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_17_5 = _mm256_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_09_4 = _mm256_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_09_5 = _mm256_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_25_4 = _mm256_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_25_5 = _mm256_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_07_4 = _mm256_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_07_5 = _mm256_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_23_4 = _mm256_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_23_5 = _mm256_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_15_4 = _mm256_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_15_5 = _mm256_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_31_4 = _mm256_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_31_5 = _mm256_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_01_6 = _mm256_srai_epi32(out_01_4, DCT_CONST_BITS);
+        const __m256i out_01_7 = _mm256_srai_epi32(out_01_5, DCT_CONST_BITS);
+        const __m256i out_17_6 = _mm256_srai_epi32(out_17_4, DCT_CONST_BITS);
+        const __m256i out_17_7 = _mm256_srai_epi32(out_17_5, DCT_CONST_BITS);
+        const __m256i out_09_6 = _mm256_srai_epi32(out_09_4, DCT_CONST_BITS);
+        const __m256i out_09_7 = _mm256_srai_epi32(out_09_5, DCT_CONST_BITS);
+        const __m256i out_25_6 = _mm256_srai_epi32(out_25_4, DCT_CONST_BITS);
+        const __m256i out_25_7 = _mm256_srai_epi32(out_25_5, DCT_CONST_BITS);
+        const __m256i out_07_6 = _mm256_srai_epi32(out_07_4, DCT_CONST_BITS);
+        const __m256i out_07_7 = _mm256_srai_epi32(out_07_5, DCT_CONST_BITS);
+        const __m256i out_23_6 = _mm256_srai_epi32(out_23_4, DCT_CONST_BITS);
+        const __m256i out_23_7 = _mm256_srai_epi32(out_23_5, DCT_CONST_BITS);
+        const __m256i out_15_6 = _mm256_srai_epi32(out_15_4, DCT_CONST_BITS);
+        const __m256i out_15_7 = _mm256_srai_epi32(out_15_5, DCT_CONST_BITS);
+        const __m256i out_31_6 = _mm256_srai_epi32(out_31_4, DCT_CONST_BITS);
+        const __m256i out_31_7 = _mm256_srai_epi32(out_31_5, DCT_CONST_BITS);
+        // Combine
+        out[ 1] = _mm256_packs_epi32(out_01_6, out_01_7);
+        out[17] = _mm256_packs_epi32(out_17_6, out_17_7);
+        out[ 9] = _mm256_packs_epi32(out_09_6, out_09_7);
+        out[25] = _mm256_packs_epi32(out_25_6, out_25_7);
+        out[ 7] = _mm256_packs_epi32(out_07_6, out_07_7);
+        out[23] = _mm256_packs_epi32(out_23_6, out_23_7);
+        out[15] = _mm256_packs_epi32(out_15_6, out_15_7);
+        out[31] = _mm256_packs_epi32(out_31_6, out_31_7);
+      }
+      {
+        const __m256i out_05_0 = _mm256_unpacklo_epi16(step1[20], step1[27]);
+        const __m256i out_05_1 = _mm256_unpackhi_epi16(step1[20], step1[27]);
+        const __m256i out_21_0 = _mm256_unpacklo_epi16(step1[21], step1[26]);
+        const __m256i out_21_1 = _mm256_unpackhi_epi16(step1[21], step1[26]);
+        const __m256i out_13_0 = _mm256_unpacklo_epi16(step1[22], step1[25]);
+        const __m256i out_13_1 = _mm256_unpackhi_epi16(step1[22], step1[25]);
+        const __m256i out_29_0 = _mm256_unpacklo_epi16(step1[23], step1[24]);
+        const __m256i out_29_1 = _mm256_unpackhi_epi16(step1[23], step1[24]);
+        const __m256i out_05_2 = _mm256_madd_epi16(out_05_0, k__cospi_p27_p05);
+        const __m256i out_05_3 = _mm256_madd_epi16(out_05_1, k__cospi_p27_p05);
+        const __m256i out_21_2 = _mm256_madd_epi16(out_21_0, k__cospi_p11_p21);
+        const __m256i out_21_3 = _mm256_madd_epi16(out_21_1, k__cospi_p11_p21);
+        const __m256i out_13_2 = _mm256_madd_epi16(out_13_0, k__cospi_p19_p13);
+        const __m256i out_13_3 = _mm256_madd_epi16(out_13_1, k__cospi_p19_p13);
+        const __m256i out_29_2 = _mm256_madd_epi16(out_29_0, k__cospi_p03_p29);
+        const __m256i out_29_3 = _mm256_madd_epi16(out_29_1, k__cospi_p03_p29);
+        const __m256i out_03_2 = _mm256_madd_epi16(out_29_0, k__cospi_m29_p03);
+        const __m256i out_03_3 = _mm256_madd_epi16(out_29_1, k__cospi_m29_p03);
+        const __m256i out_19_2 = _mm256_madd_epi16(out_13_0, k__cospi_m13_p19);
+        const __m256i out_19_3 = _mm256_madd_epi16(out_13_1, k__cospi_m13_p19);
+        const __m256i out_11_2 = _mm256_madd_epi16(out_21_0, k__cospi_m21_p11);
+        const __m256i out_11_3 = _mm256_madd_epi16(out_21_1, k__cospi_m21_p11);
+        const __m256i out_27_2 = _mm256_madd_epi16(out_05_0, k__cospi_m05_p27);
+        const __m256i out_27_3 = _mm256_madd_epi16(out_05_1, k__cospi_m05_p27);
+        // dct_const_round_shift
+        const __m256i out_05_4 = _mm256_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_05_5 = _mm256_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_21_4 = _mm256_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_21_5 = _mm256_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_13_4 = _mm256_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_13_5 = _mm256_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_29_4 = _mm256_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_29_5 = _mm256_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_03_4 = _mm256_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_03_5 = _mm256_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_19_4 = _mm256_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_19_5 = _mm256_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_11_4 = _mm256_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_11_5 = _mm256_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_27_4 = _mm256_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_27_5 = _mm256_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_05_6 = _mm256_srai_epi32(out_05_4, DCT_CONST_BITS);
+        const __m256i out_05_7 = _mm256_srai_epi32(out_05_5, DCT_CONST_BITS);
+        const __m256i out_21_6 = _mm256_srai_epi32(out_21_4, DCT_CONST_BITS);
+        const __m256i out_21_7 = _mm256_srai_epi32(out_21_5, DCT_CONST_BITS);
+        const __m256i out_13_6 = _mm256_srai_epi32(out_13_4, DCT_CONST_BITS);
+        const __m256i out_13_7 = _mm256_srai_epi32(out_13_5, DCT_CONST_BITS);
+        const __m256i out_29_6 = _mm256_srai_epi32(out_29_4, DCT_CONST_BITS);
+        const __m256i out_29_7 = _mm256_srai_epi32(out_29_5, DCT_CONST_BITS);
+        const __m256i out_03_6 = _mm256_srai_epi32(out_03_4, DCT_CONST_BITS);
+        const __m256i out_03_7 = _mm256_srai_epi32(out_03_5, DCT_CONST_BITS);
+        const __m256i out_19_6 = _mm256_srai_epi32(out_19_4, DCT_CONST_BITS);
+        const __m256i out_19_7 = _mm256_srai_epi32(out_19_5, DCT_CONST_BITS);
+        const __m256i out_11_6 = _mm256_srai_epi32(out_11_4, DCT_CONST_BITS);
+        const __m256i out_11_7 = _mm256_srai_epi32(out_11_5, DCT_CONST_BITS);
+        const __m256i out_27_6 = _mm256_srai_epi32(out_27_4, DCT_CONST_BITS);
+        const __m256i out_27_7 = _mm256_srai_epi32(out_27_5, DCT_CONST_BITS);
+        // Combine
+        out[ 5] = _mm256_packs_epi32(out_05_6, out_05_7);
+        out[21] = _mm256_packs_epi32(out_21_6, out_21_7);
+        out[13] = _mm256_packs_epi32(out_13_6, out_13_7);
+        out[29] = _mm256_packs_epi32(out_29_6, out_29_7);
+        out[ 3] = _mm256_packs_epi32(out_03_6, out_03_7);
+        out[19] = _mm256_packs_epi32(out_19_6, out_19_7);
+        out[11] = _mm256_packs_epi32(out_11_6, out_11_7);
+        out[27] = _mm256_packs_epi32(out_27_6, out_27_7);
+      }
+#if FDCT32x32_HIGH_PRECISION
+      } else {
+        __m256i lstep1[64], lstep2[64], lstep3[64];
+        __m256i u[32], v[32], sign[16];
+        const __m256i K32One = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
+        // start using 32-bit operations
+        // stage 3
+        {
+          // expanding to 32-bit length prior to addition operations
+          lstep2[ 0] = _mm256_unpacklo_epi16(step2[ 0], kZero);
+          lstep2[ 1] = _mm256_unpackhi_epi16(step2[ 0], kZero);
+          lstep2[ 2] = _mm256_unpacklo_epi16(step2[ 1], kZero);
+          lstep2[ 3] = _mm256_unpackhi_epi16(step2[ 1], kZero);
+          lstep2[ 4] = _mm256_unpacklo_epi16(step2[ 2], kZero);
+          lstep2[ 5] = _mm256_unpackhi_epi16(step2[ 2], kZero);
+          lstep2[ 6] = _mm256_unpacklo_epi16(step2[ 3], kZero);
+          lstep2[ 7] = _mm256_unpackhi_epi16(step2[ 3], kZero);
+          lstep2[ 8] = _mm256_unpacklo_epi16(step2[ 4], kZero);
+          lstep2[ 9] = _mm256_unpackhi_epi16(step2[ 4], kZero);
+          lstep2[10] = _mm256_unpacklo_epi16(step2[ 5], kZero);
+          lstep2[11] = _mm256_unpackhi_epi16(step2[ 5], kZero);
+          lstep2[12] = _mm256_unpacklo_epi16(step2[ 6], kZero);
+          lstep2[13] = _mm256_unpackhi_epi16(step2[ 6], kZero);
+          lstep2[14] = _mm256_unpacklo_epi16(step2[ 7], kZero);
+          lstep2[15] = _mm256_unpackhi_epi16(step2[ 7], kZero);
+          lstep2[ 0] = _mm256_madd_epi16(lstep2[ 0], kOne);
+          lstep2[ 1] = _mm256_madd_epi16(lstep2[ 1], kOne);
+          lstep2[ 2] = _mm256_madd_epi16(lstep2[ 2], kOne);
+          lstep2[ 3] = _mm256_madd_epi16(lstep2[ 3], kOne);
+          lstep2[ 4] = _mm256_madd_epi16(lstep2[ 4], kOne);
+          lstep2[ 5] = _mm256_madd_epi16(lstep2[ 5], kOne);
+          lstep2[ 6] = _mm256_madd_epi16(lstep2[ 6], kOne);
+          lstep2[ 7] = _mm256_madd_epi16(lstep2[ 7], kOne);
+          lstep2[ 8] = _mm256_madd_epi16(lstep2[ 8], kOne);
+          lstep2[ 9] = _mm256_madd_epi16(lstep2[ 9], kOne);
+          lstep2[10] = _mm256_madd_epi16(lstep2[10], kOne);
+          lstep2[11] = _mm256_madd_epi16(lstep2[11], kOne);
+          lstep2[12] = _mm256_madd_epi16(lstep2[12], kOne);
+          lstep2[13] = _mm256_madd_epi16(lstep2[13], kOne);
+          lstep2[14] = _mm256_madd_epi16(lstep2[14], kOne);
+          lstep2[15] = _mm256_madd_epi16(lstep2[15], kOne);
+
+          lstep3[ 0] = _mm256_add_epi32(lstep2[14], lstep2[ 0]);
+          lstep3[ 1] = _mm256_add_epi32(lstep2[15], lstep2[ 1]);
+          lstep3[ 2] = _mm256_add_epi32(lstep2[12], lstep2[ 2]);
+          lstep3[ 3] = _mm256_add_epi32(lstep2[13], lstep2[ 3]);
+          lstep3[ 4] = _mm256_add_epi32(lstep2[10], lstep2[ 4]);
+          lstep3[ 5] = _mm256_add_epi32(lstep2[11], lstep2[ 5]);
+          lstep3[ 6] = _mm256_add_epi32(lstep2[ 8], lstep2[ 6]);
+          lstep3[ 7] = _mm256_add_epi32(lstep2[ 9], lstep2[ 7]);
+          lstep3[ 8] = _mm256_sub_epi32(lstep2[ 6], lstep2[ 8]);
+          lstep3[ 9] = _mm256_sub_epi32(lstep2[ 7], lstep2[ 9]);
+          lstep3[10] = _mm256_sub_epi32(lstep2[ 4], lstep2[10]);
+          lstep3[11] = _mm256_sub_epi32(lstep2[ 5], lstep2[11]);
+          lstep3[12] = _mm256_sub_epi32(lstep2[ 2], lstep2[12]);
+          lstep3[13] = _mm256_sub_epi32(lstep2[ 3], lstep2[13]);
+          lstep3[14] = _mm256_sub_epi32(lstep2[ 0], lstep2[14]);
+          lstep3[15] = _mm256_sub_epi32(lstep2[ 1], lstep2[15]);
+        }
+        {
+          const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]);
+          const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]);
+          const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]);
+          const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]);
+          const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16);
+          const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16);
+          const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16);
+          const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16);
+          const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16);
+          const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16);
+          const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16);
+          const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16);
+          // dct_const_round_shift
+          const __m256i s3_10_4 = _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+          const __m256i s3_10_5 = _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+          const __m256i s3_11_4 = _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+          const __m256i s3_11_5 = _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+          const __m256i s3_12_4 = _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+          const __m256i s3_12_5 = _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+          const __m256i s3_13_4 = _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+          const __m256i s3_13_5 = _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+          lstep3[20] = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS);
+          lstep3[21] = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS);
+          lstep3[22] = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS);
+          lstep3[23] = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS);
+          lstep3[24] = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS);
+          lstep3[25] = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS);
+          lstep3[26] = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS);
+          lstep3[27] = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS);
+        }
+        {
+          lstep2[40] = _mm256_unpacklo_epi16(step2[20], kZero);
+          lstep2[41] = _mm256_unpackhi_epi16(step2[20], kZero);
+          lstep2[42] = _mm256_unpacklo_epi16(step2[21], kZero);
+          lstep2[43] = _mm256_unpackhi_epi16(step2[21], kZero);
+          lstep2[44] = _mm256_unpacklo_epi16(step2[22], kZero);
+          lstep2[45] = _mm256_unpackhi_epi16(step2[22], kZero);
+          lstep2[46] = _mm256_unpacklo_epi16(step2[23], kZero);
+          lstep2[47] = _mm256_unpackhi_epi16(step2[23], kZero);
+          lstep2[48] = _mm256_unpacklo_epi16(step2[24], kZero);
+          lstep2[49] = _mm256_unpackhi_epi16(step2[24], kZero);
+          lstep2[50] = _mm256_unpacklo_epi16(step2[25], kZero);
+          lstep2[51] = _mm256_unpackhi_epi16(step2[25], kZero);
+          lstep2[52] = _mm256_unpacklo_epi16(step2[26], kZero);
+          lstep2[53] = _mm256_unpackhi_epi16(step2[26], kZero);
+          lstep2[54] = _mm256_unpacklo_epi16(step2[27], kZero);
+          lstep2[55] = _mm256_unpackhi_epi16(step2[27], kZero);
+          lstep2[40] = _mm256_madd_epi16(lstep2[40], kOne);
+          lstep2[41] = _mm256_madd_epi16(lstep2[41], kOne);
+          lstep2[42] = _mm256_madd_epi16(lstep2[42], kOne);
+          lstep2[43] = _mm256_madd_epi16(lstep2[43], kOne);
+          lstep2[44] = _mm256_madd_epi16(lstep2[44], kOne);
+          lstep2[45] = _mm256_madd_epi16(lstep2[45], kOne);
+          lstep2[46] = _mm256_madd_epi16(lstep2[46], kOne);
+          lstep2[47] = _mm256_madd_epi16(lstep2[47], kOne);
+          lstep2[48] = _mm256_madd_epi16(lstep2[48], kOne);
+          lstep2[49] = _mm256_madd_epi16(lstep2[49], kOne);
+          lstep2[50] = _mm256_madd_epi16(lstep2[50], kOne);
+          lstep2[51] = _mm256_madd_epi16(lstep2[51], kOne);
+          lstep2[52] = _mm256_madd_epi16(lstep2[52], kOne);
+          lstep2[53] = _mm256_madd_epi16(lstep2[53], kOne);
+          lstep2[54] = _mm256_madd_epi16(lstep2[54], kOne);
+          lstep2[55] = _mm256_madd_epi16(lstep2[55], kOne);
+
+          lstep1[32] = _mm256_unpacklo_epi16(step1[16], kZero);
+          lstep1[33] = _mm256_unpackhi_epi16(step1[16], kZero);
+          lstep1[34] = _mm256_unpacklo_epi16(step1[17], kZero);
+          lstep1[35] = _mm256_unpackhi_epi16(step1[17], kZero);
+          lstep1[36] = _mm256_unpacklo_epi16(step1[18], kZero);
+          lstep1[37] = _mm256_unpackhi_epi16(step1[18], kZero);
+          lstep1[38] = _mm256_unpacklo_epi16(step1[19], kZero);
+          lstep1[39] = _mm256_unpackhi_epi16(step1[19], kZero);
+          lstep1[56] = _mm256_unpacklo_epi16(step1[28], kZero);
+          lstep1[57] = _mm256_unpackhi_epi16(step1[28], kZero);
+          lstep1[58] = _mm256_unpacklo_epi16(step1[29], kZero);
+          lstep1[59] = _mm256_unpackhi_epi16(step1[29], kZero);
+          lstep1[60] = _mm256_unpacklo_epi16(step1[30], kZero);
+          lstep1[61] = _mm256_unpackhi_epi16(step1[30], kZero);
+          lstep1[62] = _mm256_unpacklo_epi16(step1[31], kZero);
+          lstep1[63] = _mm256_unpackhi_epi16(step1[31], kZero);
+          lstep1[32] = _mm256_madd_epi16(lstep1[32], kOne);
+          lstep1[33] = _mm256_madd_epi16(lstep1[33], kOne);
+          lstep1[34] = _mm256_madd_epi16(lstep1[34], kOne);
+          lstep1[35] = _mm256_madd_epi16(lstep1[35], kOne);
+          lstep1[36] = _mm256_madd_epi16(lstep1[36], kOne);
+          lstep1[37] = _mm256_madd_epi16(lstep1[37], kOne);
+          lstep1[38] = _mm256_madd_epi16(lstep1[38], kOne);
+          lstep1[39] = _mm256_madd_epi16(lstep1[39], kOne);
+          lstep1[56] = _mm256_madd_epi16(lstep1[56], kOne);
+          lstep1[57] = _mm256_madd_epi16(lstep1[57], kOne);
+          lstep1[58] = _mm256_madd_epi16(lstep1[58], kOne);
+          lstep1[59] = _mm256_madd_epi16(lstep1[59], kOne);
+          lstep1[60] = _mm256_madd_epi16(lstep1[60], kOne);
+          lstep1[61] = _mm256_madd_epi16(lstep1[61], kOne);
+          lstep1[62] = _mm256_madd_epi16(lstep1[62], kOne);
+          lstep1[63] = _mm256_madd_epi16(lstep1[63], kOne);
+
+          lstep3[32] = _mm256_add_epi32(lstep2[46], lstep1[32]);
+          lstep3[33] = _mm256_add_epi32(lstep2[47], lstep1[33]);
+
+          lstep3[34] = _mm256_add_epi32(lstep2[44], lstep1[34]);
+          lstep3[35] = _mm256_add_epi32(lstep2[45], lstep1[35]);
+          lstep3[36] = _mm256_add_epi32(lstep2[42], lstep1[36]);
+          lstep3[37] = _mm256_add_epi32(lstep2[43], lstep1[37]);
+          lstep3[38] = _mm256_add_epi32(lstep2[40], lstep1[38]);
+          lstep3[39] = _mm256_add_epi32(lstep2[41], lstep1[39]);
+          lstep3[40] = _mm256_sub_epi32(lstep1[38], lstep2[40]);
+          lstep3[41] = _mm256_sub_epi32(lstep1[39], lstep2[41]);
+          lstep3[42] = _mm256_sub_epi32(lstep1[36], lstep2[42]);
+          lstep3[43] = _mm256_sub_epi32(lstep1[37], lstep2[43]);
+          lstep3[44] = _mm256_sub_epi32(lstep1[34], lstep2[44]);
+          lstep3[45] = _mm256_sub_epi32(lstep1[35], lstep2[45]);
+          lstep3[46] = _mm256_sub_epi32(lstep1[32], lstep2[46]);
+          lstep3[47] = _mm256_sub_epi32(lstep1[33], lstep2[47]);
+          lstep3[48] = _mm256_sub_epi32(lstep1[62], lstep2[48]);
+          lstep3[49] = _mm256_sub_epi32(lstep1[63], lstep2[49]);
+          lstep3[50] = _mm256_sub_epi32(lstep1[60], lstep2[50]);
+          lstep3[51] = _mm256_sub_epi32(lstep1[61], lstep2[51]);
+          lstep3[52] = _mm256_sub_epi32(lstep1[58], lstep2[52]);
+          lstep3[53] = _mm256_sub_epi32(lstep1[59], lstep2[53]);
+          lstep3[54] = _mm256_sub_epi32(lstep1[56], lstep2[54]);
+          lstep3[55] = _mm256_sub_epi32(lstep1[57], lstep2[55]);
+          lstep3[56] = _mm256_add_epi32(lstep2[54], lstep1[56]);
+          lstep3[57] = _mm256_add_epi32(lstep2[55], lstep1[57]);
+          lstep3[58] = _mm256_add_epi32(lstep2[52], lstep1[58]);
+          lstep3[59] = _mm256_add_epi32(lstep2[53], lstep1[59]);
+          lstep3[60] = _mm256_add_epi32(lstep2[50], lstep1[60]);
+          lstep3[61] = _mm256_add_epi32(lstep2[51], lstep1[61]);
+          lstep3[62] = _mm256_add_epi32(lstep2[48], lstep1[62]);
+          lstep3[63] = _mm256_add_epi32(lstep2[49], lstep1[63]);
+        }
+
+        // stage 4
+        {
+          // expanding to 32-bit length prior to addition operations
+          lstep2[16] = _mm256_unpacklo_epi16(step2[ 8], kZero);
+          lstep2[17] = _mm256_unpackhi_epi16(step2[ 8], kZero);
+          lstep2[18] = _mm256_unpacklo_epi16(step2[ 9], kZero);
+          lstep2[19] = _mm256_unpackhi_epi16(step2[ 9], kZero);
+          lstep2[28] = _mm256_unpacklo_epi16(step2[14], kZero);
+          lstep2[29] = _mm256_unpackhi_epi16(step2[14], kZero);
+          lstep2[30] = _mm256_unpacklo_epi16(step2[15], kZero);
+          lstep2[31] = _mm256_unpackhi_epi16(step2[15], kZero);
+          lstep2[16] = _mm256_madd_epi16(lstep2[16], kOne);
+          lstep2[17] = _mm256_madd_epi16(lstep2[17], kOne);
+          lstep2[18] = _mm256_madd_epi16(lstep2[18], kOne);
+          lstep2[19] = _mm256_madd_epi16(lstep2[19], kOne);
+          lstep2[28] = _mm256_madd_epi16(lstep2[28], kOne);
+          lstep2[29] = _mm256_madd_epi16(lstep2[29], kOne);
+          lstep2[30] = _mm256_madd_epi16(lstep2[30], kOne);
+          lstep2[31] = _mm256_madd_epi16(lstep2[31], kOne);
+
+          lstep1[ 0] = _mm256_add_epi32(lstep3[ 6], lstep3[ 0]);
+          lstep1[ 1] = _mm256_add_epi32(lstep3[ 7], lstep3[ 1]);
+          lstep1[ 2] = _mm256_add_epi32(lstep3[ 4], lstep3[ 2]);
+          lstep1[ 3] = _mm256_add_epi32(lstep3[ 5], lstep3[ 3]);
+          lstep1[ 4] = _mm256_sub_epi32(lstep3[ 2], lstep3[ 4]);
+          lstep1[ 5] = _mm256_sub_epi32(lstep3[ 3], lstep3[ 5]);
+          lstep1[ 6] = _mm256_sub_epi32(lstep3[ 0], lstep3[ 6]);
+          lstep1[ 7] = _mm256_sub_epi32(lstep3[ 1], lstep3[ 7]);
+          lstep1[16] = _mm256_add_epi32(lstep3[22], lstep2[16]);
+          lstep1[17] = _mm256_add_epi32(lstep3[23], lstep2[17]);
+          lstep1[18] = _mm256_add_epi32(lstep3[20], lstep2[18]);
+          lstep1[19] = _mm256_add_epi32(lstep3[21], lstep2[19]);
+          lstep1[20] = _mm256_sub_epi32(lstep2[18], lstep3[20]);
+          lstep1[21] = _mm256_sub_epi32(lstep2[19], lstep3[21]);
+          lstep1[22] = _mm256_sub_epi32(lstep2[16], lstep3[22]);
+          lstep1[23] = _mm256_sub_epi32(lstep2[17], lstep3[23]);
+          lstep1[24] = _mm256_sub_epi32(lstep2[30], lstep3[24]);
+          lstep1[25] = _mm256_sub_epi32(lstep2[31], lstep3[25]);
+          lstep1[26] = _mm256_sub_epi32(lstep2[28], lstep3[26]);
+          lstep1[27] = _mm256_sub_epi32(lstep2[29], lstep3[27]);
+          lstep1[28] = _mm256_add_epi32(lstep3[26], lstep2[28]);
+          lstep1[29] = _mm256_add_epi32(lstep3[27], lstep2[29]);
+          lstep1[30] = _mm256_add_epi32(lstep3[24], lstep2[30]);
+          lstep1[31] = _mm256_add_epi32(lstep3[25], lstep2[31]);
+        }
+        {
+        // to be continued...
+        //
+        const __m256i k32_p16_p16 = pair256_set_epi32(cospi_16_64, cospi_16_64);
+        const __m256i k32_p16_m16 = pair256_set_epi32(cospi_16_64, -cospi_16_64);
+
+        u[0] = _mm256_unpacklo_epi32(lstep3[12], lstep3[10]);
+        u[1] = _mm256_unpackhi_epi32(lstep3[12], lstep3[10]);
+        u[2] = _mm256_unpacklo_epi32(lstep3[13], lstep3[11]);
+        u[3] = _mm256_unpackhi_epi32(lstep3[13], lstep3[11]);
+
+        // TODO(jingning): manually inline k_madd_epi32_avx2 to further hide
+        // instruction latency.
+        v[ 0] = k_madd_epi32_avx2(u[0], k32_p16_m16);
+        v[ 1] = k_madd_epi32_avx2(u[1], k32_p16_m16);
+        v[ 2] = k_madd_epi32_avx2(u[2], k32_p16_m16);
+        v[ 3] = k_madd_epi32_avx2(u[3], k32_p16_m16);
+        v[ 4] = k_madd_epi32_avx2(u[0], k32_p16_p16);
+        v[ 5] = k_madd_epi32_avx2(u[1], k32_p16_p16);
+        v[ 6] = k_madd_epi32_avx2(u[2], k32_p16_p16);
+        v[ 7] = k_madd_epi32_avx2(u[3], k32_p16_p16);
+
+        u[0] = k_packs_epi64_avx2(v[0], v[1]);
+        u[1] = k_packs_epi64_avx2(v[2], v[3]);
+        u[2] = k_packs_epi64_avx2(v[4], v[5]);
+        u[3] = k_packs_epi64_avx2(v[6], v[7]);
+
+        v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+        v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+        v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+        v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+
+        lstep1[10] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+        lstep1[11] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+        lstep1[12] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+        lstep1[13] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+        }
+        {
+          const __m256i k32_m08_p24 = pair256_set_epi32(-cospi_8_64, cospi_24_64);
+          const __m256i k32_m24_m08 = pair256_set_epi32(-cospi_24_64, -cospi_8_64);
+          const __m256i k32_p24_p08 = pair256_set_epi32(cospi_24_64, cospi_8_64);
+
+          u[ 0] = _mm256_unpacklo_epi32(lstep3[36], lstep3[58]);
+          u[ 1] = _mm256_unpackhi_epi32(lstep3[36], lstep3[58]);
+          u[ 2] = _mm256_unpacklo_epi32(lstep3[37], lstep3[59]);
+          u[ 3] = _mm256_unpackhi_epi32(lstep3[37], lstep3[59]);
+          u[ 4] = _mm256_unpacklo_epi32(lstep3[38], lstep3[56]);
+          u[ 5] = _mm256_unpackhi_epi32(lstep3[38], lstep3[56]);
+          u[ 6] = _mm256_unpacklo_epi32(lstep3[39], lstep3[57]);
+          u[ 7] = _mm256_unpackhi_epi32(lstep3[39], lstep3[57]);
+          u[ 8] = _mm256_unpacklo_epi32(lstep3[40], lstep3[54]);
+          u[ 9] = _mm256_unpackhi_epi32(lstep3[40], lstep3[54]);
+          u[10] = _mm256_unpacklo_epi32(lstep3[41], lstep3[55]);
+          u[11] = _mm256_unpackhi_epi32(lstep3[41], lstep3[55]);
+          u[12] = _mm256_unpacklo_epi32(lstep3[42], lstep3[52]);
+          u[13] = _mm256_unpackhi_epi32(lstep3[42], lstep3[52]);
+          u[14] = _mm256_unpacklo_epi32(lstep3[43], lstep3[53]);
+          u[15] = _mm256_unpackhi_epi32(lstep3[43], lstep3[53]);
+
+          v[ 0] = k_madd_epi32_avx2(u[ 0], k32_m08_p24);
+          v[ 1] = k_madd_epi32_avx2(u[ 1], k32_m08_p24);
+          v[ 2] = k_madd_epi32_avx2(u[ 2], k32_m08_p24);
+          v[ 3] = k_madd_epi32_avx2(u[ 3], k32_m08_p24);
+          v[ 4] = k_madd_epi32_avx2(u[ 4], k32_m08_p24);
+          v[ 5] = k_madd_epi32_avx2(u[ 5], k32_m08_p24);
+          v[ 6] = k_madd_epi32_avx2(u[ 6], k32_m08_p24);
+          v[ 7] = k_madd_epi32_avx2(u[ 7], k32_m08_p24);
+          v[ 8] = k_madd_epi32_avx2(u[ 8], k32_m24_m08);
+          v[ 9] = k_madd_epi32_avx2(u[ 9], k32_m24_m08);
+          v[10] = k_madd_epi32_avx2(u[10], k32_m24_m08);
+          v[11] = k_madd_epi32_avx2(u[11], k32_m24_m08);
+          v[12] = k_madd_epi32_avx2(u[12], k32_m24_m08);
+          v[13] = k_madd_epi32_avx2(u[13], k32_m24_m08);
+          v[14] = k_madd_epi32_avx2(u[14], k32_m24_m08);
+          v[15] = k_madd_epi32_avx2(u[15], k32_m24_m08);
+          v[16] = k_madd_epi32_avx2(u[12], k32_m08_p24);
+          v[17] = k_madd_epi32_avx2(u[13], k32_m08_p24);
+          v[18] = k_madd_epi32_avx2(u[14], k32_m08_p24);
+          v[19] = k_madd_epi32_avx2(u[15], k32_m08_p24);
+          v[20] = k_madd_epi32_avx2(u[ 8], k32_m08_p24);
+          v[21] = k_madd_epi32_avx2(u[ 9], k32_m08_p24);
+          v[22] = k_madd_epi32_avx2(u[10], k32_m08_p24);
+          v[23] = k_madd_epi32_avx2(u[11], k32_m08_p24);
+          v[24] = k_madd_epi32_avx2(u[ 4], k32_p24_p08);
+          v[25] = k_madd_epi32_avx2(u[ 5], k32_p24_p08);
+          v[26] = k_madd_epi32_avx2(u[ 6], k32_p24_p08);
+          v[27] = k_madd_epi32_avx2(u[ 7], k32_p24_p08);
+          v[28] = k_madd_epi32_avx2(u[ 0], k32_p24_p08);
+          v[29] = k_madd_epi32_avx2(u[ 1], k32_p24_p08);
+          v[30] = k_madd_epi32_avx2(u[ 2], k32_p24_p08);
+          v[31] = k_madd_epi32_avx2(u[ 3], k32_p24_p08);
+
+          u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
+          u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
+          u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
+          u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
+          u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
+          u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
+          u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
+          u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
+          u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
+          u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+          u[10] = k_packs_epi64_avx2(v[20], v[21]);
+          u[11] = k_packs_epi64_avx2(v[22], v[23]);
+          u[12] = k_packs_epi64_avx2(v[24], v[25]);
+          u[13] = k_packs_epi64_avx2(v[26], v[27]);
+          u[14] = k_packs_epi64_avx2(v[28], v[29]);
+          u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+          v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+          v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+          v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+          v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+          v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+          v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+          v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+          v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+          v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+          v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+          lstep1[36] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
+          lstep1[37] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
+          lstep1[38] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
+          lstep1[39] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
+          lstep1[40] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
+          lstep1[41] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
+          lstep1[42] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
+          lstep1[43] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
+          lstep1[52] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
+          lstep1[53] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+          lstep1[54] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+          lstep1[55] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+          lstep1[56] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+          lstep1[57] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+          lstep1[58] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+          lstep1[59] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+        }
+        // stage 5
+        {
+          lstep2[ 8] = _mm256_add_epi32(lstep1[10], lstep3[ 8]);
+          lstep2[ 9] = _mm256_add_epi32(lstep1[11], lstep3[ 9]);
+          lstep2[10] = _mm256_sub_epi32(lstep3[ 8], lstep1[10]);
+          lstep2[11] = _mm256_sub_epi32(lstep3[ 9], lstep1[11]);
+          lstep2[12] = _mm256_sub_epi32(lstep3[14], lstep1[12]);
+          lstep2[13] = _mm256_sub_epi32(lstep3[15], lstep1[13]);
+          lstep2[14] = _mm256_add_epi32(lstep1[12], lstep3[14]);
+          lstep2[15] = _mm256_add_epi32(lstep1[13], lstep3[15]);
+        }
+        {
+          const __m256i k32_p16_p16 = pair256_set_epi32(cospi_16_64, cospi_16_64);
+          const __m256i k32_p16_m16 = pair256_set_epi32(cospi_16_64, -cospi_16_64);
+          const __m256i k32_p24_p08 = pair256_set_epi32(cospi_24_64, cospi_8_64);
+          const __m256i k32_m08_p24 = pair256_set_epi32(-cospi_8_64, cospi_24_64);
+
+          u[0] = _mm256_unpacklo_epi32(lstep1[0], lstep1[2]);
+          u[1] = _mm256_unpackhi_epi32(lstep1[0], lstep1[2]);
+          u[2] = _mm256_unpacklo_epi32(lstep1[1], lstep1[3]);
+          u[3] = _mm256_unpackhi_epi32(lstep1[1], lstep1[3]);
+          u[4] = _mm256_unpacklo_epi32(lstep1[4], lstep1[6]);
+          u[5] = _mm256_unpackhi_epi32(lstep1[4], lstep1[6]);
+          u[6] = _mm256_unpacklo_epi32(lstep1[5], lstep1[7]);
+          u[7] = _mm256_unpackhi_epi32(lstep1[5], lstep1[7]);
+
+          // TODO(jingning): manually inline k_madd_epi32_avx2 to further hide
+          // instruction latency.
+          v[ 0] = k_madd_epi32_avx2(u[0], k32_p16_p16);
+          v[ 1] = k_madd_epi32_avx2(u[1], k32_p16_p16);
+          v[ 2] = k_madd_epi32_avx2(u[2], k32_p16_p16);
+          v[ 3] = k_madd_epi32_avx2(u[3], k32_p16_p16);
+          v[ 4] = k_madd_epi32_avx2(u[0], k32_p16_m16);
+          v[ 5] = k_madd_epi32_avx2(u[1], k32_p16_m16);
+          v[ 6] = k_madd_epi32_avx2(u[2], k32_p16_m16);
+          v[ 7] = k_madd_epi32_avx2(u[3], k32_p16_m16);
+          v[ 8] = k_madd_epi32_avx2(u[4], k32_p24_p08);
+          v[ 9] = k_madd_epi32_avx2(u[5], k32_p24_p08);
+          v[10] = k_madd_epi32_avx2(u[6], k32_p24_p08);
+          v[11] = k_madd_epi32_avx2(u[7], k32_p24_p08);
+          v[12] = k_madd_epi32_avx2(u[4], k32_m08_p24);
+          v[13] = k_madd_epi32_avx2(u[5], k32_m08_p24);
+          v[14] = k_madd_epi32_avx2(u[6], k32_m08_p24);
+          v[15] = k_madd_epi32_avx2(u[7], k32_m08_p24);
+
+          u[0] = k_packs_epi64_avx2(v[0], v[1]);
+          u[1] = k_packs_epi64_avx2(v[2], v[3]);
+          u[2] = k_packs_epi64_avx2(v[4], v[5]);
+          u[3] = k_packs_epi64_avx2(v[6], v[7]);
+          u[4] = k_packs_epi64_avx2(v[8], v[9]);
+          u[5] = k_packs_epi64_avx2(v[10], v[11]);
+          u[6] = k_packs_epi64_avx2(v[12], v[13]);
+          u[7] = k_packs_epi64_avx2(v[14], v[15]);
+
+          v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+          v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+          v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+          v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+          v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+          v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+          v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+          v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+          u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+          u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+          u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+          u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+          u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+          u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+          u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+          u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+
+          sign[0] = _mm256_cmpgt_epi32(kZero,u[0]);
+          sign[1] = _mm256_cmpgt_epi32(kZero,u[1]);
+          sign[2] = _mm256_cmpgt_epi32(kZero,u[2]);
+          sign[3] = _mm256_cmpgt_epi32(kZero,u[3]);
+          sign[4] = _mm256_cmpgt_epi32(kZero,u[4]);
+          sign[5] = _mm256_cmpgt_epi32(kZero,u[5]);
+          sign[6] = _mm256_cmpgt_epi32(kZero,u[6]);
+          sign[7] = _mm256_cmpgt_epi32(kZero,u[7]);
+
+          u[0] = _mm256_sub_epi32(u[0], sign[0]);
+          u[1] = _mm256_sub_epi32(u[1], sign[1]);
+          u[2] = _mm256_sub_epi32(u[2], sign[2]);
+          u[3] = _mm256_sub_epi32(u[3], sign[3]);
+          u[4] = _mm256_sub_epi32(u[4], sign[4]);
+          u[5] = _mm256_sub_epi32(u[5], sign[5]);
+          u[6] = _mm256_sub_epi32(u[6], sign[6]);
+          u[7] = _mm256_sub_epi32(u[7], sign[7]);
+
+          u[0] = _mm256_add_epi32(u[0], K32One);
+          u[1] = _mm256_add_epi32(u[1], K32One);
+          u[2] = _mm256_add_epi32(u[2], K32One);
+          u[3] = _mm256_add_epi32(u[3], K32One);
+          u[4] = _mm256_add_epi32(u[4], K32One);
+          u[5] = _mm256_add_epi32(u[5], K32One);
+          u[6] = _mm256_add_epi32(u[6], K32One);
+          u[7] = _mm256_add_epi32(u[7], K32One);
+
+          u[0] = _mm256_srai_epi32(u[0], 2);
+          u[1] = _mm256_srai_epi32(u[1], 2);
+          u[2] = _mm256_srai_epi32(u[2], 2);
+          u[3] = _mm256_srai_epi32(u[3], 2);
+          u[4] = _mm256_srai_epi32(u[4], 2);
+          u[5] = _mm256_srai_epi32(u[5], 2);
+          u[6] = _mm256_srai_epi32(u[6], 2);
+          u[7] = _mm256_srai_epi32(u[7], 2);
+
+          // Combine
+          out[ 0] = _mm256_packs_epi32(u[0], u[1]);
+          out[16] = _mm256_packs_epi32(u[2], u[3]);
+          out[ 8] = _mm256_packs_epi32(u[4], u[5]);
+          out[24] = _mm256_packs_epi32(u[6], u[7]);
+        }
+        {
+          const __m256i k32_m08_p24 = pair256_set_epi32(-cospi_8_64, cospi_24_64);
+          const __m256i k32_m24_m08 = pair256_set_epi32(-cospi_24_64, -cospi_8_64);
+          const __m256i k32_p24_p08 = pair256_set_epi32(cospi_24_64, cospi_8_64);
+
+          u[0] = _mm256_unpacklo_epi32(lstep1[18], lstep1[28]);
+          u[1] = _mm256_unpackhi_epi32(lstep1[18], lstep1[28]);
+          u[2] = _mm256_unpacklo_epi32(lstep1[19], lstep1[29]);
+          u[3] = _mm256_unpackhi_epi32(lstep1[19], lstep1[29]);
+          u[4] = _mm256_unpacklo_epi32(lstep1[20], lstep1[26]);
+          u[5] = _mm256_unpackhi_epi32(lstep1[20], lstep1[26]);
+          u[6] = _mm256_unpacklo_epi32(lstep1[21], lstep1[27]);
+          u[7] = _mm256_unpackhi_epi32(lstep1[21], lstep1[27]);
+
+          v[0] = k_madd_epi32_avx2(u[0], k32_m08_p24);
+          v[1] = k_madd_epi32_avx2(u[1], k32_m08_p24);
+          v[2] = k_madd_epi32_avx2(u[2], k32_m08_p24);
+          v[3] = k_madd_epi32_avx2(u[3], k32_m08_p24);
+          v[4] = k_madd_epi32_avx2(u[4], k32_m24_m08);
+          v[5] = k_madd_epi32_avx2(u[5], k32_m24_m08);
+          v[6] = k_madd_epi32_avx2(u[6], k32_m24_m08);
+          v[7] = k_madd_epi32_avx2(u[7], k32_m24_m08);
+          v[ 8] = k_madd_epi32_avx2(u[4], k32_m08_p24);
+          v[ 9] = k_madd_epi32_avx2(u[5], k32_m08_p24);
+          v[10] = k_madd_epi32_avx2(u[6], k32_m08_p24);
+          v[11] = k_madd_epi32_avx2(u[7], k32_m08_p24);
+          v[12] = k_madd_epi32_avx2(u[0], k32_p24_p08);
+          v[13] = k_madd_epi32_avx2(u[1], k32_p24_p08);
+          v[14] = k_madd_epi32_avx2(u[2], k32_p24_p08);
+          v[15] = k_madd_epi32_avx2(u[3], k32_p24_p08);
+
+          u[0] = k_packs_epi64_avx2(v[0], v[1]);
+          u[1] = k_packs_epi64_avx2(v[2], v[3]);
+          u[2] = k_packs_epi64_avx2(v[4], v[5]);
+          u[3] = k_packs_epi64_avx2(v[6], v[7]);
+          u[4] = k_packs_epi64_avx2(v[8], v[9]);
+          u[5] = k_packs_epi64_avx2(v[10], v[11]);
+          u[6] = k_packs_epi64_avx2(v[12], v[13]);
+          u[7] = k_packs_epi64_avx2(v[14], v[15]);
+
+          u[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+          u[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+          u[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+          u[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+          u[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+          u[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+          u[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+          u[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+          lstep2[18] = _mm256_srai_epi32(u[0], DCT_CONST_BITS);
+          lstep2[19] = _mm256_srai_epi32(u[1], DCT_CONST_BITS);
+          lstep2[20] = _mm256_srai_epi32(u[2], DCT_CONST_BITS);
+          lstep2[21] = _mm256_srai_epi32(u[3], DCT_CONST_BITS);
+          lstep2[26] = _mm256_srai_epi32(u[4], DCT_CONST_BITS);
+          lstep2[27] = _mm256_srai_epi32(u[5], DCT_CONST_BITS);
+          lstep2[28] = _mm256_srai_epi32(u[6], DCT_CONST_BITS);
+          lstep2[29] = _mm256_srai_epi32(u[7], DCT_CONST_BITS);
+        }
+        {
+          lstep2[32] = _mm256_add_epi32(lstep1[38], lstep3[32]);
+          lstep2[33] = _mm256_add_epi32(lstep1[39], lstep3[33]);
+          lstep2[34] = _mm256_add_epi32(lstep1[36], lstep3[34]);
+          lstep2[35] = _mm256_add_epi32(lstep1[37], lstep3[35]);
+          lstep2[36] = _mm256_sub_epi32(lstep3[34], lstep1[36]);
+          lstep2[37] = _mm256_sub_epi32(lstep3[35], lstep1[37]);
+          lstep2[38] = _mm256_sub_epi32(lstep3[32], lstep1[38]);
+          lstep2[39] = _mm256_sub_epi32(lstep3[33], lstep1[39]);
+          lstep2[40] = _mm256_sub_epi32(lstep3[46], lstep1[40]);
+          lstep2[41] = _mm256_sub_epi32(lstep3[47], lstep1[41]);
+          lstep2[42] = _mm256_sub_epi32(lstep3[44], lstep1[42]);
+          lstep2[43] = _mm256_sub_epi32(lstep3[45], lstep1[43]);
+          lstep2[44] = _mm256_add_epi32(lstep1[42], lstep3[44]);
+          lstep2[45] = _mm256_add_epi32(lstep1[43], lstep3[45]);
+          lstep2[46] = _mm256_add_epi32(lstep1[40], lstep3[46]);
+          lstep2[47] = _mm256_add_epi32(lstep1[41], lstep3[47]);
+          lstep2[48] = _mm256_add_epi32(lstep1[54], lstep3[48]);
+          lstep2[49] = _mm256_add_epi32(lstep1[55], lstep3[49]);
+          lstep2[50] = _mm256_add_epi32(lstep1[52], lstep3[50]);
+          lstep2[51] = _mm256_add_epi32(lstep1[53], lstep3[51]);
+          lstep2[52] = _mm256_sub_epi32(lstep3[50], lstep1[52]);
+          lstep2[53] = _mm256_sub_epi32(lstep3[51], lstep1[53]);
+          lstep2[54] = _mm256_sub_epi32(lstep3[48], lstep1[54]);
+          lstep2[55] = _mm256_sub_epi32(lstep3[49], lstep1[55]);
+          lstep2[56] = _mm256_sub_epi32(lstep3[62], lstep1[56]);
+          lstep2[57] = _mm256_sub_epi32(lstep3[63], lstep1[57]);
+          lstep2[58] = _mm256_sub_epi32(lstep3[60], lstep1[58]);
+          lstep2[59] = _mm256_sub_epi32(lstep3[61], lstep1[59]);
+          lstep2[60] = _mm256_add_epi32(lstep1[58], lstep3[60]);
+          lstep2[61] = _mm256_add_epi32(lstep1[59], lstep3[61]);
+          lstep2[62] = _mm256_add_epi32(lstep1[56], lstep3[62]);
+          lstep2[63] = _mm256_add_epi32(lstep1[57], lstep3[63]);
+        }
+        // stage 6
+        {
+          const __m256i k32_p28_p04 = pair256_set_epi32(cospi_28_64, cospi_4_64);
+          const __m256i k32_p12_p20 = pair256_set_epi32(cospi_12_64, cospi_20_64);
+          const __m256i k32_m20_p12 = pair256_set_epi32(-cospi_20_64, cospi_12_64);
+          const __m256i k32_m04_p28 = pair256_set_epi32(-cospi_4_64, cospi_28_64);
+
+          u[0] = _mm256_unpacklo_epi32(lstep2[ 8], lstep2[14]);
+          u[1] = _mm256_unpackhi_epi32(lstep2[ 8], lstep2[14]);
+          u[2] = _mm256_unpacklo_epi32(lstep2[ 9], lstep2[15]);
+          u[3] = _mm256_unpackhi_epi32(lstep2[ 9], lstep2[15]);
+          u[4] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]);
+          u[5] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]);
+          u[6] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]);
+          u[7] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]);
+          u[8] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]);
+          u[9] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]);
+          u[10] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]);
+          u[11] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]);
+          u[12] = _mm256_unpacklo_epi32(lstep2[ 8], lstep2[14]);
+          u[13] = _mm256_unpackhi_epi32(lstep2[ 8], lstep2[14]);
+          u[14] = _mm256_unpacklo_epi32(lstep2[ 9], lstep2[15]);
+          u[15] = _mm256_unpackhi_epi32(lstep2[ 9], lstep2[15]);
+
+          v[0] = k_madd_epi32_avx2(u[0], k32_p28_p04);
+          v[1] = k_madd_epi32_avx2(u[1], k32_p28_p04);
+          v[2] = k_madd_epi32_avx2(u[2], k32_p28_p04);
+          v[3] = k_madd_epi32_avx2(u[3], k32_p28_p04);
+          v[4] = k_madd_epi32_avx2(u[4], k32_p12_p20);
+          v[5] = k_madd_epi32_avx2(u[5], k32_p12_p20);
+          v[6] = k_madd_epi32_avx2(u[6], k32_p12_p20);
+          v[7] = k_madd_epi32_avx2(u[7], k32_p12_p20);
+          v[ 8] = k_madd_epi32_avx2(u[ 8], k32_m20_p12);
+          v[ 9] = k_madd_epi32_avx2(u[ 9], k32_m20_p12);
+          v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12);
+          v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12);
+          v[12] = k_madd_epi32_avx2(u[12], k32_m04_p28);
+          v[13] = k_madd_epi32_avx2(u[13], k32_m04_p28);
+          v[14] = k_madd_epi32_avx2(u[14], k32_m04_p28);
+          v[15] = k_madd_epi32_avx2(u[15], k32_m04_p28);
+
+          u[0] = k_packs_epi64_avx2(v[0], v[1]);
+          u[1] = k_packs_epi64_avx2(v[2], v[3]);
+          u[2] = k_packs_epi64_avx2(v[4], v[5]);
+          u[3] = k_packs_epi64_avx2(v[6], v[7]);
+          u[4] = k_packs_epi64_avx2(v[8], v[9]);
+          u[5] = k_packs_epi64_avx2(v[10], v[11]);
+          u[6] = k_packs_epi64_avx2(v[12], v[13]);
+          u[7] = k_packs_epi64_avx2(v[14], v[15]);
+
+          v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+          v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+          v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+          v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+          v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+          v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+          v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+          v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+          u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+          u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+          u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+          u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+          u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+          u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+          u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+          u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+
+          sign[0] = _mm256_cmpgt_epi32(kZero,u[0]);
+          sign[1] = _mm256_cmpgt_epi32(kZero,u[1]);
+          sign[2] = _mm256_cmpgt_epi32(kZero,u[2]);
+          sign[3] = _mm256_cmpgt_epi32(kZero,u[3]);
+          sign[4] = _mm256_cmpgt_epi32(kZero,u[4]);
+          sign[5] = _mm256_cmpgt_epi32(kZero,u[5]);
+          sign[6] = _mm256_cmpgt_epi32(kZero,u[6]);
+          sign[7] = _mm256_cmpgt_epi32(kZero,u[7]);
+
+          u[0] = _mm256_sub_epi32(u[0], sign[0]);
+          u[1] = _mm256_sub_epi32(u[1], sign[1]);
+          u[2] = _mm256_sub_epi32(u[2], sign[2]);
+          u[3] = _mm256_sub_epi32(u[3], sign[3]);
+          u[4] = _mm256_sub_epi32(u[4], sign[4]);
+          u[5] = _mm256_sub_epi32(u[5], sign[5]);
+          u[6] = _mm256_sub_epi32(u[6], sign[6]);
+          u[7] = _mm256_sub_epi32(u[7], sign[7]);
+
+          u[0] = _mm256_add_epi32(u[0], K32One);
+          u[1] = _mm256_add_epi32(u[1], K32One);
+          u[2] = _mm256_add_epi32(u[2], K32One);
+          u[3] = _mm256_add_epi32(u[3], K32One);
+          u[4] = _mm256_add_epi32(u[4], K32One);
+          u[5] = _mm256_add_epi32(u[5], K32One);
+          u[6] = _mm256_add_epi32(u[6], K32One);
+          u[7] = _mm256_add_epi32(u[7], K32One);
+
+          u[0] = _mm256_srai_epi32(u[0], 2);
+          u[1] = _mm256_srai_epi32(u[1], 2);
+          u[2] = _mm256_srai_epi32(u[2], 2);
+          u[3] = _mm256_srai_epi32(u[3], 2);
+          u[4] = _mm256_srai_epi32(u[4], 2);
+          u[5] = _mm256_srai_epi32(u[5], 2);
+          u[6] = _mm256_srai_epi32(u[6], 2);
+          u[7] = _mm256_srai_epi32(u[7], 2);
+
+          out[ 4] = _mm256_packs_epi32(u[0], u[1]);
+          out[20] = _mm256_packs_epi32(u[2], u[3]);
+          out[12] = _mm256_packs_epi32(u[4], u[5]);
+          out[28] = _mm256_packs_epi32(u[6], u[7]);
+        }
+        {
+          lstep3[16] = _mm256_add_epi32(lstep2[18], lstep1[16]);
+          lstep3[17] = _mm256_add_epi32(lstep2[19], lstep1[17]);
+          lstep3[18] = _mm256_sub_epi32(lstep1[16], lstep2[18]);
+          lstep3[19] = _mm256_sub_epi32(lstep1[17], lstep2[19]);
+          lstep3[20] = _mm256_sub_epi32(lstep1[22], lstep2[20]);
+          lstep3[21] = _mm256_sub_epi32(lstep1[23], lstep2[21]);
+          lstep3[22] = _mm256_add_epi32(lstep2[20], lstep1[22]);
+          lstep3[23] = _mm256_add_epi32(lstep2[21], lstep1[23]);
+          lstep3[24] = _mm256_add_epi32(lstep2[26], lstep1[24]);
+          lstep3[25] = _mm256_add_epi32(lstep2[27], lstep1[25]);
+          lstep3[26] = _mm256_sub_epi32(lstep1[24], lstep2[26]);
+          lstep3[27] = _mm256_sub_epi32(lstep1[25], lstep2[27]);
+          lstep3[28] = _mm256_sub_epi32(lstep1[30], lstep2[28]);
+          lstep3[29] = _mm256_sub_epi32(lstep1[31], lstep2[29]);
+          lstep3[30] = _mm256_add_epi32(lstep2[28], lstep1[30]);
+          lstep3[31] = _mm256_add_epi32(lstep2[29], lstep1[31]);
+        }
+        {
+          const __m256i k32_m04_p28 = pair256_set_epi32(-cospi_4_64, cospi_28_64);
+          const __m256i k32_m28_m04 = pair256_set_epi32(-cospi_28_64, -cospi_4_64);
+          const __m256i k32_m20_p12 = pair256_set_epi32(-cospi_20_64, cospi_12_64);
+          const __m256i k32_m12_m20 = pair256_set_epi32(-cospi_12_64,
+                                                     -cospi_20_64);
+          const __m256i k32_p12_p20 = pair256_set_epi32(cospi_12_64, cospi_20_64);
+          const __m256i k32_p28_p04 = pair256_set_epi32(cospi_28_64, cospi_4_64);
+
+          u[ 0] = _mm256_unpacklo_epi32(lstep2[34], lstep2[60]);
+          u[ 1] = _mm256_unpackhi_epi32(lstep2[34], lstep2[60]);
+          u[ 2] = _mm256_unpacklo_epi32(lstep2[35], lstep2[61]);
+          u[ 3] = _mm256_unpackhi_epi32(lstep2[35], lstep2[61]);
+          u[ 4] = _mm256_unpacklo_epi32(lstep2[36], lstep2[58]);
+          u[ 5] = _mm256_unpackhi_epi32(lstep2[36], lstep2[58]);
+          u[ 6] = _mm256_unpacklo_epi32(lstep2[37], lstep2[59]);
+          u[ 7] = _mm256_unpackhi_epi32(lstep2[37], lstep2[59]);
+          u[ 8] = _mm256_unpacklo_epi32(lstep2[42], lstep2[52]);
+          u[ 9] = _mm256_unpackhi_epi32(lstep2[42], lstep2[52]);
+          u[10] = _mm256_unpacklo_epi32(lstep2[43], lstep2[53]);
+          u[11] = _mm256_unpackhi_epi32(lstep2[43], lstep2[53]);
+          u[12] = _mm256_unpacklo_epi32(lstep2[44], lstep2[50]);
+          u[13] = _mm256_unpackhi_epi32(lstep2[44], lstep2[50]);
+          u[14] = _mm256_unpacklo_epi32(lstep2[45], lstep2[51]);
+          u[15] = _mm256_unpackhi_epi32(lstep2[45], lstep2[51]);
+
+          v[ 0] = k_madd_epi32_avx2(u[ 0], k32_m04_p28);
+          v[ 1] = k_madd_epi32_avx2(u[ 1], k32_m04_p28);
+          v[ 2] = k_madd_epi32_avx2(u[ 2], k32_m04_p28);
+          v[ 3] = k_madd_epi32_avx2(u[ 3], k32_m04_p28);
+          v[ 4] = k_madd_epi32_avx2(u[ 4], k32_m28_m04);
+          v[ 5] = k_madd_epi32_avx2(u[ 5], k32_m28_m04);
+          v[ 6] = k_madd_epi32_avx2(u[ 6], k32_m28_m04);
+          v[ 7] = k_madd_epi32_avx2(u[ 7], k32_m28_m04);
+          v[ 8] = k_madd_epi32_avx2(u[ 8], k32_m20_p12);
+          v[ 9] = k_madd_epi32_avx2(u[ 9], k32_m20_p12);
+          v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12);
+          v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12);
+          v[12] = k_madd_epi32_avx2(u[12], k32_m12_m20);
+          v[13] = k_madd_epi32_avx2(u[13], k32_m12_m20);
+          v[14] = k_madd_epi32_avx2(u[14], k32_m12_m20);
+          v[15] = k_madd_epi32_avx2(u[15], k32_m12_m20);
+          v[16] = k_madd_epi32_avx2(u[12], k32_m20_p12);
+          v[17] = k_madd_epi32_avx2(u[13], k32_m20_p12);
+          v[18] = k_madd_epi32_avx2(u[14], k32_m20_p12);
+          v[19] = k_madd_epi32_avx2(u[15], k32_m20_p12);
+          v[20] = k_madd_epi32_avx2(u[ 8], k32_p12_p20);
+          v[21] = k_madd_epi32_avx2(u[ 9], k32_p12_p20);
+          v[22] = k_madd_epi32_avx2(u[10], k32_p12_p20);
+          v[23] = k_madd_epi32_avx2(u[11], k32_p12_p20);
+          v[24] = k_madd_epi32_avx2(u[ 4], k32_m04_p28);
+          v[25] = k_madd_epi32_avx2(u[ 5], k32_m04_p28);
+          v[26] = k_madd_epi32_avx2(u[ 6], k32_m04_p28);
+          v[27] = k_madd_epi32_avx2(u[ 7], k32_m04_p28);
+          v[28] = k_madd_epi32_avx2(u[ 0], k32_p28_p04);
+          v[29] = k_madd_epi32_avx2(u[ 1], k32_p28_p04);
+          v[30] = k_madd_epi32_avx2(u[ 2], k32_p28_p04);
+          v[31] = k_madd_epi32_avx2(u[ 3], k32_p28_p04);
+
+          u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
+          u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
+          u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
+          u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
+          u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
+          u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
+          u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
+          u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
+          u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
+          u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+          u[10] = k_packs_epi64_avx2(v[20], v[21]);
+          u[11] = k_packs_epi64_avx2(v[22], v[23]);
+          u[12] = k_packs_epi64_avx2(v[24], v[25]);
+          u[13] = k_packs_epi64_avx2(v[26], v[27]);
+          u[14] = k_packs_epi64_avx2(v[28], v[29]);
+          u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+          v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+          v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+          v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+          v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+          v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+          v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+          v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+          v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+          v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+          v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+          lstep3[34] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
+          lstep3[35] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
+          lstep3[36] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
+          lstep3[37] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
+          lstep3[42] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
+          lstep3[43] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
+          lstep3[44] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
+          lstep3[45] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
+          lstep3[50] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
+          lstep3[51] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+          lstep3[52] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+          lstep3[53] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+          lstep3[58] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+          lstep3[59] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+          lstep3[60] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+          lstep3[61] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+        }
+        // stage 7
+        {
+          const __m256i k32_p30_p02 = pair256_set_epi32(cospi_30_64, cospi_2_64);
+          const __m256i k32_p14_p18 = pair256_set_epi32(cospi_14_64, cospi_18_64);
+          const __m256i k32_p22_p10 = pair256_set_epi32(cospi_22_64, cospi_10_64);
+          const __m256i k32_p06_p26 = pair256_set_epi32(cospi_6_64,  cospi_26_64);
+          const __m256i k32_m26_p06 = pair256_set_epi32(-cospi_26_64, cospi_6_64);
+          const __m256i k32_m10_p22 = pair256_set_epi32(-cospi_10_64, cospi_22_64);
+          const __m256i k32_m18_p14 = pair256_set_epi32(-cospi_18_64, cospi_14_64);
+          const __m256i k32_m02_p30 = pair256_set_epi32(-cospi_2_64, cospi_30_64);
+
+          u[ 0] = _mm256_unpacklo_epi32(lstep3[16], lstep3[30]);
+          u[ 1] = _mm256_unpackhi_epi32(lstep3[16], lstep3[30]);
+          u[ 2] = _mm256_unpacklo_epi32(lstep3[17], lstep3[31]);
+          u[ 3] = _mm256_unpackhi_epi32(lstep3[17], lstep3[31]);
+          u[ 4] = _mm256_unpacklo_epi32(lstep3[18], lstep3[28]);
+          u[ 5] = _mm256_unpackhi_epi32(lstep3[18], lstep3[28]);
+          u[ 6] = _mm256_unpacklo_epi32(lstep3[19], lstep3[29]);
+          u[ 7] = _mm256_unpackhi_epi32(lstep3[19], lstep3[29]);
+          u[ 8] = _mm256_unpacklo_epi32(lstep3[20], lstep3[26]);
+          u[ 9] = _mm256_unpackhi_epi32(lstep3[20], lstep3[26]);
+          u[10] = _mm256_unpacklo_epi32(lstep3[21], lstep3[27]);
+          u[11] = _mm256_unpackhi_epi32(lstep3[21], lstep3[27]);
+          u[12] = _mm256_unpacklo_epi32(lstep3[22], lstep3[24]);
+          u[13] = _mm256_unpackhi_epi32(lstep3[22], lstep3[24]);
+          u[14] = _mm256_unpacklo_epi32(lstep3[23], lstep3[25]);
+          u[15] = _mm256_unpackhi_epi32(lstep3[23], lstep3[25]);
+
+          v[ 0] = k_madd_epi32_avx2(u[ 0], k32_p30_p02);
+          v[ 1] = k_madd_epi32_avx2(u[ 1], k32_p30_p02);
+          v[ 2] = k_madd_epi32_avx2(u[ 2], k32_p30_p02);
+          v[ 3] = k_madd_epi32_avx2(u[ 3], k32_p30_p02);
+          v[ 4] = k_madd_epi32_avx2(u[ 4], k32_p14_p18);
+          v[ 5] = k_madd_epi32_avx2(u[ 5], k32_p14_p18);
+          v[ 6] = k_madd_epi32_avx2(u[ 6], k32_p14_p18);
+          v[ 7] = k_madd_epi32_avx2(u[ 7], k32_p14_p18);
+          v[ 8] = k_madd_epi32_avx2(u[ 8], k32_p22_p10);
+          v[ 9] = k_madd_epi32_avx2(u[ 9], k32_p22_p10);
+          v[10] = k_madd_epi32_avx2(u[10], k32_p22_p10);
+          v[11] = k_madd_epi32_avx2(u[11], k32_p22_p10);
+          v[12] = k_madd_epi32_avx2(u[12], k32_p06_p26);
+          v[13] = k_madd_epi32_avx2(u[13], k32_p06_p26);
+          v[14] = k_madd_epi32_avx2(u[14], k32_p06_p26);
+          v[15] = k_madd_epi32_avx2(u[15], k32_p06_p26);
+          v[16] = k_madd_epi32_avx2(u[12], k32_m26_p06);
+          v[17] = k_madd_epi32_avx2(u[13], k32_m26_p06);
+          v[18] = k_madd_epi32_avx2(u[14], k32_m26_p06);
+          v[19] = k_madd_epi32_avx2(u[15], k32_m26_p06);
+          v[20] = k_madd_epi32_avx2(u[ 8], k32_m10_p22);
+          v[21] = k_madd_epi32_avx2(u[ 9], k32_m10_p22);
+          v[22] = k_madd_epi32_avx2(u[10], k32_m10_p22);
+          v[23] = k_madd_epi32_avx2(u[11], k32_m10_p22);
+          v[24] = k_madd_epi32_avx2(u[ 4], k32_m18_p14);
+          v[25] = k_madd_epi32_avx2(u[ 5], k32_m18_p14);
+          v[26] = k_madd_epi32_avx2(u[ 6], k32_m18_p14);
+          v[27] = k_madd_epi32_avx2(u[ 7], k32_m18_p14);
+          v[28] = k_madd_epi32_avx2(u[ 0], k32_m02_p30);
+          v[29] = k_madd_epi32_avx2(u[ 1], k32_m02_p30);
+          v[30] = k_madd_epi32_avx2(u[ 2], k32_m02_p30);
+          v[31] = k_madd_epi32_avx2(u[ 3], k32_m02_p30);
+
+          u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
+          u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
+          u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
+          u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
+          u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
+          u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
+          u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
+          u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
+          u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
+          u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+          u[10] = k_packs_epi64_avx2(v[20], v[21]);
+          u[11] = k_packs_epi64_avx2(v[22], v[23]);
+          u[12] = k_packs_epi64_avx2(v[24], v[25]);
+          u[13] = k_packs_epi64_avx2(v[26], v[27]);
+          u[14] = k_packs_epi64_avx2(v[28], v[29]);
+          u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+          v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+          v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+          v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+          v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+          v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+          v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+          v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+          v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+          v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+          v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+          u[ 0] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
+          u[ 1] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
+          u[ 2] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
+          u[ 3] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
+          u[ 4] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
+          u[ 5] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
+          u[ 6] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
+          u[ 7] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
+          u[ 8] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
+          u[ 9] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+          u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+          u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+          u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+          u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+          u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+          u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+
+          v[ 0] = _mm256_cmpgt_epi32(kZero,u[ 0]);
+          v[ 1] = _mm256_cmpgt_epi32(kZero,u[ 1]);
+          v[ 2] = _mm256_cmpgt_epi32(kZero,u[ 2]);
+          v[ 3] = _mm256_cmpgt_epi32(kZero,u[ 3]);
+          v[ 4] = _mm256_cmpgt_epi32(kZero,u[ 4]);
+          v[ 5] = _mm256_cmpgt_epi32(kZero,u[ 5]);
+          v[ 6] = _mm256_cmpgt_epi32(kZero,u[ 6]);
+          v[ 7] = _mm256_cmpgt_epi32(kZero,u[ 7]);
+          v[ 8] = _mm256_cmpgt_epi32(kZero,u[ 8]);
+          v[ 9] = _mm256_cmpgt_epi32(kZero,u[ 9]);
+          v[10] = _mm256_cmpgt_epi32(kZero,u[10]);
+          v[11] = _mm256_cmpgt_epi32(kZero,u[11]);
+          v[12] = _mm256_cmpgt_epi32(kZero,u[12]);
+          v[13] = _mm256_cmpgt_epi32(kZero,u[13]);
+          v[14] = _mm256_cmpgt_epi32(kZero,u[14]);
+          v[15] = _mm256_cmpgt_epi32(kZero,u[15]);
+
+          u[ 0] = _mm256_sub_epi32(u[ 0], v[ 0]);
+          u[ 1] = _mm256_sub_epi32(u[ 1], v[ 1]);
+          u[ 2] = _mm256_sub_epi32(u[ 2], v[ 2]);
+          u[ 3] = _mm256_sub_epi32(u[ 3], v[ 3]);
+          u[ 4] = _mm256_sub_epi32(u[ 4], v[ 4]);
+          u[ 5] = _mm256_sub_epi32(u[ 5], v[ 5]);
+          u[ 6] = _mm256_sub_epi32(u[ 6], v[ 6]);
+          u[ 7] = _mm256_sub_epi32(u[ 7], v[ 7]);
+          u[ 8] = _mm256_sub_epi32(u[ 8], v[ 8]);
+          u[ 9] = _mm256_sub_epi32(u[ 9], v[ 9]);
+          u[10] = _mm256_sub_epi32(u[10], v[10]);
+          u[11] = _mm256_sub_epi32(u[11], v[11]);
+          u[12] = _mm256_sub_epi32(u[12], v[12]);
+          u[13] = _mm256_sub_epi32(u[13], v[13]);
+          u[14] = _mm256_sub_epi32(u[14], v[14]);
+          u[15] = _mm256_sub_epi32(u[15], v[15]);
+
+          v[ 0] = _mm256_add_epi32(u[ 0], K32One);
+          v[ 1] = _mm256_add_epi32(u[ 1], K32One);
+          v[ 2] = _mm256_add_epi32(u[ 2], K32One);
+          v[ 3] = _mm256_add_epi32(u[ 3], K32One);
+          v[ 4] = _mm256_add_epi32(u[ 4], K32One);
+          v[ 5] = _mm256_add_epi32(u[ 5], K32One);
+          v[ 6] = _mm256_add_epi32(u[ 6], K32One);
+          v[ 7] = _mm256_add_epi32(u[ 7], K32One);
+          v[ 8] = _mm256_add_epi32(u[ 8], K32One);
+          v[ 9] = _mm256_add_epi32(u[ 9], K32One);
+          v[10] = _mm256_add_epi32(u[10], K32One);
+          v[11] = _mm256_add_epi32(u[11], K32One);
+          v[12] = _mm256_add_epi32(u[12], K32One);
+          v[13] = _mm256_add_epi32(u[13], K32One);
+          v[14] = _mm256_add_epi32(u[14], K32One);
+          v[15] = _mm256_add_epi32(u[15], K32One);
+
+          u[ 0] = _mm256_srai_epi32(v[ 0], 2);
+          u[ 1] = _mm256_srai_epi32(v[ 1], 2);
+          u[ 2] = _mm256_srai_epi32(v[ 2], 2);
+          u[ 3] = _mm256_srai_epi32(v[ 3], 2);
+          u[ 4] = _mm256_srai_epi32(v[ 4], 2);
+          u[ 5] = _mm256_srai_epi32(v[ 5], 2);
+          u[ 6] = _mm256_srai_epi32(v[ 6], 2);
+          u[ 7] = _mm256_srai_epi32(v[ 7], 2);
+          u[ 8] = _mm256_srai_epi32(v[ 8], 2);
+          u[ 9] = _mm256_srai_epi32(v[ 9], 2);
+          u[10] = _mm256_srai_epi32(v[10], 2);
+          u[11] = _mm256_srai_epi32(v[11], 2);
+          u[12] = _mm256_srai_epi32(v[12], 2);
+          u[13] = _mm256_srai_epi32(v[13], 2);
+          u[14] = _mm256_srai_epi32(v[14], 2);
+          u[15] = _mm256_srai_epi32(v[15], 2);
+
+          out[ 2] = _mm256_packs_epi32(u[0], u[1]);
+          out[18] = _mm256_packs_epi32(u[2], u[3]);
+          out[10] = _mm256_packs_epi32(u[4], u[5]);
+          out[26] = _mm256_packs_epi32(u[6], u[7]);
+          out[ 6] = _mm256_packs_epi32(u[8], u[9]);
+          out[22] = _mm256_packs_epi32(u[10], u[11]);
+          out[14] = _mm256_packs_epi32(u[12], u[13]);
+          out[30] = _mm256_packs_epi32(u[14], u[15]);
+        }
+        {
+          lstep1[32] = _mm256_add_epi32(lstep3[34], lstep2[32]);
+          lstep1[33] = _mm256_add_epi32(lstep3[35], lstep2[33]);
+          lstep1[34] = _mm256_sub_epi32(lstep2[32], lstep3[34]);
+          lstep1[35] = _mm256_sub_epi32(lstep2[33], lstep3[35]);
+          lstep1[36] = _mm256_sub_epi32(lstep2[38], lstep3[36]);
+          lstep1[37] = _mm256_sub_epi32(lstep2[39], lstep3[37]);
+          lstep1[38] = _mm256_add_epi32(lstep3[36], lstep2[38]);
+          lstep1[39] = _mm256_add_epi32(lstep3[37], lstep2[39]);
+          lstep1[40] = _mm256_add_epi32(lstep3[42], lstep2[40]);
+          lstep1[41] = _mm256_add_epi32(lstep3[43], lstep2[41]);
+          lstep1[42] = _mm256_sub_epi32(lstep2[40], lstep3[42]);
+          lstep1[43] = _mm256_sub_epi32(lstep2[41], lstep3[43]);
+          lstep1[44] = _mm256_sub_epi32(lstep2[46], lstep3[44]);
+          lstep1[45] = _mm256_sub_epi32(lstep2[47], lstep3[45]);
+          lstep1[46] = _mm256_add_epi32(lstep3[44], lstep2[46]);
+          lstep1[47] = _mm256_add_epi32(lstep3[45], lstep2[47]);
+          lstep1[48] = _mm256_add_epi32(lstep3[50], lstep2[48]);
+          lstep1[49] = _mm256_add_epi32(lstep3[51], lstep2[49]);
+          lstep1[50] = _mm256_sub_epi32(lstep2[48], lstep3[50]);
+          lstep1[51] = _mm256_sub_epi32(lstep2[49], lstep3[51]);
+          lstep1[52] = _mm256_sub_epi32(lstep2[54], lstep3[52]);
+          lstep1[53] = _mm256_sub_epi32(lstep2[55], lstep3[53]);
+          lstep1[54] = _mm256_add_epi32(lstep3[52], lstep2[54]);
+          lstep1[55] = _mm256_add_epi32(lstep3[53], lstep2[55]);
+          lstep1[56] = _mm256_add_epi32(lstep3[58], lstep2[56]);
+          lstep1[57] = _mm256_add_epi32(lstep3[59], lstep2[57]);
+          lstep1[58] = _mm256_sub_epi32(lstep2[56], lstep3[58]);
+          lstep1[59] = _mm256_sub_epi32(lstep2[57], lstep3[59]);
+          lstep1[60] = _mm256_sub_epi32(lstep2[62], lstep3[60]);
+          lstep1[61] = _mm256_sub_epi32(lstep2[63], lstep3[61]);
+          lstep1[62] = _mm256_add_epi32(lstep3[60], lstep2[62]);
+          lstep1[63] = _mm256_add_epi32(lstep3[61], lstep2[63]);
+        }
+        // stage 8
+        {
+          const __m256i k32_p31_p01 = pair256_set_epi32(cospi_31_64, cospi_1_64);
+          const __m256i k32_p15_p17 = pair256_set_epi32(cospi_15_64, cospi_17_64);
+          const __m256i k32_p23_p09 = pair256_set_epi32(cospi_23_64, cospi_9_64);
+          const __m256i k32_p07_p25 = pair256_set_epi32(cospi_7_64, cospi_25_64);
+          const __m256i k32_m25_p07 = pair256_set_epi32(-cospi_25_64, cospi_7_64);
+          const __m256i k32_m09_p23 = pair256_set_epi32(-cospi_9_64, cospi_23_64);
+          const __m256i k32_m17_p15 = pair256_set_epi32(-cospi_17_64, cospi_15_64);
+          const __m256i k32_m01_p31 = pair256_set_epi32(-cospi_1_64, cospi_31_64);
+
+          u[ 0] = _mm256_unpacklo_epi32(lstep1[32], lstep1[62]);
+          u[ 1] = _mm256_unpackhi_epi32(lstep1[32], lstep1[62]);
+          u[ 2] = _mm256_unpacklo_epi32(lstep1[33], lstep1[63]);
+          u[ 3] = _mm256_unpackhi_epi32(lstep1[33], lstep1[63]);
+          u[ 4] = _mm256_unpacklo_epi32(lstep1[34], lstep1[60]);
+          u[ 5] = _mm256_unpackhi_epi32(lstep1[34], lstep1[60]);
+          u[ 6] = _mm256_unpacklo_epi32(lstep1[35], lstep1[61]);
+          u[ 7] = _mm256_unpackhi_epi32(lstep1[35], lstep1[61]);
+          u[ 8] = _mm256_unpacklo_epi32(lstep1[36], lstep1[58]);
+          u[ 9] = _mm256_unpackhi_epi32(lstep1[36], lstep1[58]);
+          u[10] = _mm256_unpacklo_epi32(lstep1[37], lstep1[59]);
+          u[11] = _mm256_unpackhi_epi32(lstep1[37], lstep1[59]);
+          u[12] = _mm256_unpacklo_epi32(lstep1[38], lstep1[56]);
+          u[13] = _mm256_unpackhi_epi32(lstep1[38], lstep1[56]);
+          u[14] = _mm256_unpacklo_epi32(lstep1[39], lstep1[57]);
+          u[15] = _mm256_unpackhi_epi32(lstep1[39], lstep1[57]);
+
+          v[ 0] = k_madd_epi32_avx2(u[ 0], k32_p31_p01);
+          v[ 1] = k_madd_epi32_avx2(u[ 1], k32_p31_p01);
+          v[ 2] = k_madd_epi32_avx2(u[ 2], k32_p31_p01);
+          v[ 3] = k_madd_epi32_avx2(u[ 3], k32_p31_p01);
+          v[ 4] = k_madd_epi32_avx2(u[ 4], k32_p15_p17);
+          v[ 5] = k_madd_epi32_avx2(u[ 5], k32_p15_p17);
+          v[ 6] = k_madd_epi32_avx2(u[ 6], k32_p15_p17);
+          v[ 7] = k_madd_epi32_avx2(u[ 7], k32_p15_p17);
+          v[ 8] = k_madd_epi32_avx2(u[ 8], k32_p23_p09);
+          v[ 9] = k_madd_epi32_avx2(u[ 9], k32_p23_p09);
+          v[10] = k_madd_epi32_avx2(u[10], k32_p23_p09);
+          v[11] = k_madd_epi32_avx2(u[11], k32_p23_p09);
+          v[12] = k_madd_epi32_avx2(u[12], k32_p07_p25);
+          v[13] = k_madd_epi32_avx2(u[13], k32_p07_p25);
+          v[14] = k_madd_epi32_avx2(u[14], k32_p07_p25);
+          v[15] = k_madd_epi32_avx2(u[15], k32_p07_p25);
+          v[16] = k_madd_epi32_avx2(u[12], k32_m25_p07);
+          v[17] = k_madd_epi32_avx2(u[13], k32_m25_p07);
+          v[18] = k_madd_epi32_avx2(u[14], k32_m25_p07);
+          v[19] = k_madd_epi32_avx2(u[15], k32_m25_p07);
+          v[20] = k_madd_epi32_avx2(u[ 8], k32_m09_p23);
+          v[21] = k_madd_epi32_avx2(u[ 9], k32_m09_p23);
+          v[22] = k_madd_epi32_avx2(u[10], k32_m09_p23);
+          v[23] = k_madd_epi32_avx2(u[11], k32_m09_p23);
+          v[24] = k_madd_epi32_avx2(u[ 4], k32_m17_p15);
+          v[25] = k_madd_epi32_avx2(u[ 5], k32_m17_p15);
+          v[26] = k_madd_epi32_avx2(u[ 6], k32_m17_p15);
+          v[27] = k_madd_epi32_avx2(u[ 7], k32_m17_p15);
+          v[28] = k_madd_epi32_avx2(u[ 0], k32_m01_p31);
+          v[29] = k_madd_epi32_avx2(u[ 1], k32_m01_p31);
+          v[30] = k_madd_epi32_avx2(u[ 2], k32_m01_p31);
+          v[31] = k_madd_epi32_avx2(u[ 3], k32_m01_p31);
+
+          u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
+          u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
+          u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
+          u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
+          u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
+          u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
+          u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
+          u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
+          u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
+          u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+          u[10] = k_packs_epi64_avx2(v[20], v[21]);
+          u[11] = k_packs_epi64_avx2(v[22], v[23]);
+          u[12] = k_packs_epi64_avx2(v[24], v[25]);
+          u[13] = k_packs_epi64_avx2(v[26], v[27]);
+          u[14] = k_packs_epi64_avx2(v[28], v[29]);
+          u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+          v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+          v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+          v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+          v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+          v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+          v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+          v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+          v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+          v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+          v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+          u[ 0] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
+          u[ 1] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
+          u[ 2] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
+          u[ 3] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
+          u[ 4] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
+          u[ 5] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
+          u[ 6] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
+          u[ 7] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
+          u[ 8] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
+          u[ 9] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+          u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+          u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+          u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+          u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+          u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+          u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+
+          v[ 0] = _mm256_cmpgt_epi32(kZero,u[ 0]);
+          v[ 1] = _mm256_cmpgt_epi32(kZero,u[ 1]);
+          v[ 2] = _mm256_cmpgt_epi32(kZero,u[ 2]);
+          v[ 3] = _mm256_cmpgt_epi32(kZero,u[ 3]);
+          v[ 4] = _mm256_cmpgt_epi32(kZero,u[ 4]);
+          v[ 5] = _mm256_cmpgt_epi32(kZero,u[ 5]);
+          v[ 6] = _mm256_cmpgt_epi32(kZero,u[ 6]);
+          v[ 7] = _mm256_cmpgt_epi32(kZero,u[ 7]);
+          v[ 8] = _mm256_cmpgt_epi32(kZero,u[ 8]);
+          v[ 9] = _mm256_cmpgt_epi32(kZero,u[ 9]);
+          v[10] = _mm256_cmpgt_epi32(kZero,u[10]);
+          v[11] = _mm256_cmpgt_epi32(kZero,u[11]);
+          v[12] = _mm256_cmpgt_epi32(kZero,u[12]);
+          v[13] = _mm256_cmpgt_epi32(kZero,u[13]);
+          v[14] = _mm256_cmpgt_epi32(kZero,u[14]);
+          v[15] = _mm256_cmpgt_epi32(kZero,u[15]);
+
+          u[ 0] = _mm256_sub_epi32(u[ 0], v[ 0]);
+          u[ 1] = _mm256_sub_epi32(u[ 1], v[ 1]);
+          u[ 2] = _mm256_sub_epi32(u[ 2], v[ 2]);
+          u[ 3] = _mm256_sub_epi32(u[ 3], v[ 3]);
+          u[ 4] = _mm256_sub_epi32(u[ 4], v[ 4]);
+          u[ 5] = _mm256_sub_epi32(u[ 5], v[ 5]);
+          u[ 6] = _mm256_sub_epi32(u[ 6], v[ 6]);
+          u[ 7] = _mm256_sub_epi32(u[ 7], v[ 7]);
+          u[ 8] = _mm256_sub_epi32(u[ 8], v[ 8]);
+          u[ 9] = _mm256_sub_epi32(u[ 9], v[ 9]);
+          u[10] = _mm256_sub_epi32(u[10], v[10]);
+          u[11] = _mm256_sub_epi32(u[11], v[11]);
+          u[12] = _mm256_sub_epi32(u[12], v[12]);
+          u[13] = _mm256_sub_epi32(u[13], v[13]);
+          u[14] = _mm256_sub_epi32(u[14], v[14]);
+          u[15] = _mm256_sub_epi32(u[15], v[15]);
+
+          v[0] = _mm256_add_epi32(u[0], K32One);
+          v[1] = _mm256_add_epi32(u[1], K32One);
+          v[2] = _mm256_add_epi32(u[2], K32One);
+          v[3] = _mm256_add_epi32(u[3], K32One);
+          v[4] = _mm256_add_epi32(u[4], K32One);
+          v[5] = _mm256_add_epi32(u[5], K32One);
+          v[6] = _mm256_add_epi32(u[6], K32One);
+          v[7] = _mm256_add_epi32(u[7], K32One);
+          v[8] = _mm256_add_epi32(u[8], K32One);
+          v[9] = _mm256_add_epi32(u[9], K32One);
+          v[10] = _mm256_add_epi32(u[10], K32One);
+          v[11] = _mm256_add_epi32(u[11], K32One);
+          v[12] = _mm256_add_epi32(u[12], K32One);
+          v[13] = _mm256_add_epi32(u[13], K32One);
+          v[14] = _mm256_add_epi32(u[14], K32One);
+          v[15] = _mm256_add_epi32(u[15], K32One);
+
+          u[0] = _mm256_srai_epi32(v[0], 2);
+          u[1] = _mm256_srai_epi32(v[1], 2);
+          u[2] = _mm256_srai_epi32(v[2], 2);
+          u[3] = _mm256_srai_epi32(v[3], 2);
+          u[4] = _mm256_srai_epi32(v[4], 2);
+          u[5] = _mm256_srai_epi32(v[5], 2);
+          u[6] = _mm256_srai_epi32(v[6], 2);
+          u[7] = _mm256_srai_epi32(v[7], 2);
+          u[8] = _mm256_srai_epi32(v[8], 2);
+          u[9] = _mm256_srai_epi32(v[9], 2);
+          u[10] = _mm256_srai_epi32(v[10], 2);
+          u[11] = _mm256_srai_epi32(v[11], 2);
+          u[12] = _mm256_srai_epi32(v[12], 2);
+          u[13] = _mm256_srai_epi32(v[13], 2);
+          u[14] = _mm256_srai_epi32(v[14], 2);
+          u[15] = _mm256_srai_epi32(v[15], 2);
+
+          out[ 1] = _mm256_packs_epi32(u[0], u[1]);
+          out[17] = _mm256_packs_epi32(u[2], u[3]);
+          out[ 9] = _mm256_packs_epi32(u[4], u[5]);
+          out[25] = _mm256_packs_epi32(u[6], u[7]);
+          out[ 7] = _mm256_packs_epi32(u[8], u[9]);
+          out[23] = _mm256_packs_epi32(u[10], u[11]);
+          out[15] = _mm256_packs_epi32(u[12], u[13]);
+          out[31] = _mm256_packs_epi32(u[14], u[15]);
+        }
+        {
+          const __m256i k32_p27_p05 = pair256_set_epi32(cospi_27_64, cospi_5_64);
+          const __m256i k32_p11_p21 = pair256_set_epi32(cospi_11_64, cospi_21_64);
+          const __m256i k32_p19_p13 = pair256_set_epi32(cospi_19_64, cospi_13_64);
+          const __m256i k32_p03_p29 = pair256_set_epi32(cospi_3_64, cospi_29_64);
+          const __m256i k32_m29_p03 = pair256_set_epi32(-cospi_29_64, cospi_3_64);
+          const __m256i k32_m13_p19 = pair256_set_epi32(-cospi_13_64, cospi_19_64);
+          const __m256i k32_m21_p11 = pair256_set_epi32(-cospi_21_64, cospi_11_64);
+          const __m256i k32_m05_p27 = pair256_set_epi32(-cospi_5_64, cospi_27_64);
+
+          u[ 0] = _mm256_unpacklo_epi32(lstep1[40], lstep1[54]);
+          u[ 1] = _mm256_unpackhi_epi32(lstep1[40], lstep1[54]);
+          u[ 2] = _mm256_unpacklo_epi32(lstep1[41], lstep1[55]);
+          u[ 3] = _mm256_unpackhi_epi32(lstep1[41], lstep1[55]);
+          u[ 4] = _mm256_unpacklo_epi32(lstep1[42], lstep1[52]);
+          u[ 5] = _mm256_unpackhi_epi32(lstep1[42], lstep1[52]);
+          u[ 6] = _mm256_unpacklo_epi32(lstep1[43], lstep1[53]);
+          u[ 7] = _mm256_unpackhi_epi32(lstep1[43], lstep1[53]);
+          u[ 8] = _mm256_unpacklo_epi32(lstep1[44], lstep1[50]);
+          u[ 9] = _mm256_unpackhi_epi32(lstep1[44], lstep1[50]);
+          u[10] = _mm256_unpacklo_epi32(lstep1[45], lstep1[51]);
+          u[11] = _mm256_unpackhi_epi32(lstep1[45], lstep1[51]);
+          u[12] = _mm256_unpacklo_epi32(lstep1[46], lstep1[48]);
+          u[13] = _mm256_unpackhi_epi32(lstep1[46], lstep1[48]);
+          u[14] = _mm256_unpacklo_epi32(lstep1[47], lstep1[49]);
+          u[15] = _mm256_unpackhi_epi32(lstep1[47], lstep1[49]);
+
+          v[ 0] = k_madd_epi32_avx2(u[ 0], k32_p27_p05);
+          v[ 1] = k_madd_epi32_avx2(u[ 1], k32_p27_p05);
+          v[ 2] = k_madd_epi32_avx2(u[ 2], k32_p27_p05);
+          v[ 3] = k_madd_epi32_avx2(u[ 3], k32_p27_p05);
+          v[ 4] = k_madd_epi32_avx2(u[ 4], k32_p11_p21);
+          v[ 5] = k_madd_epi32_avx2(u[ 5], k32_p11_p21);
+          v[ 6] = k_madd_epi32_avx2(u[ 6], k32_p11_p21);
+          v[ 7] = k_madd_epi32_avx2(u[ 7], k32_p11_p21);
+          v[ 8] = k_madd_epi32_avx2(u[ 8], k32_p19_p13);
+          v[ 9] = k_madd_epi32_avx2(u[ 9], k32_p19_p13);
+          v[10] = k_madd_epi32_avx2(u[10], k32_p19_p13);
+          v[11] = k_madd_epi32_avx2(u[11], k32_p19_p13);
+          v[12] = k_madd_epi32_avx2(u[12], k32_p03_p29);
+          v[13] = k_madd_epi32_avx2(u[13], k32_p03_p29);
+          v[14] = k_madd_epi32_avx2(u[14], k32_p03_p29);
+          v[15] = k_madd_epi32_avx2(u[15], k32_p03_p29);
+          v[16] = k_madd_epi32_avx2(u[12], k32_m29_p03);
+          v[17] = k_madd_epi32_avx2(u[13], k32_m29_p03);
+          v[18] = k_madd_epi32_avx2(u[14], k32_m29_p03);
+          v[19] = k_madd_epi32_avx2(u[15], k32_m29_p03);
+          v[20] = k_madd_epi32_avx2(u[ 8], k32_m13_p19);
+          v[21] = k_madd_epi32_avx2(u[ 9], k32_m13_p19);
+          v[22] = k_madd_epi32_avx2(u[10], k32_m13_p19);
+          v[23] = k_madd_epi32_avx2(u[11], k32_m13_p19);
+          v[24] = k_madd_epi32_avx2(u[ 4], k32_m21_p11);
+          v[25] = k_madd_epi32_avx2(u[ 5], k32_m21_p11);
+          v[26] = k_madd_epi32_avx2(u[ 6], k32_m21_p11);
+          v[27] = k_madd_epi32_avx2(u[ 7], k32_m21_p11);
+          v[28] = k_madd_epi32_avx2(u[ 0], k32_m05_p27);
+          v[29] = k_madd_epi32_avx2(u[ 1], k32_m05_p27);
+          v[30] = k_madd_epi32_avx2(u[ 2], k32_m05_p27);
+          v[31] = k_madd_epi32_avx2(u[ 3], k32_m05_p27);
+
+          u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
+          u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
+          u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
+          u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
+          u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
+          u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
+          u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
+          u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
+          u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
+          u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+          u[10] = k_packs_epi64_avx2(v[20], v[21]);
+          u[11] = k_packs_epi64_avx2(v[22], v[23]);
+          u[12] = k_packs_epi64_avx2(v[24], v[25]);
+          u[13] = k_packs_epi64_avx2(v[26], v[27]);
+          u[14] = k_packs_epi64_avx2(v[28], v[29]);
+          u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+          v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+          v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+          v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+          v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+          v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+          v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+          v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+          v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+          v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+          v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+          u[ 0] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
+          u[ 1] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
+          u[ 2] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
+          u[ 3] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
+          u[ 4] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
+          u[ 5] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
+          u[ 6] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
+          u[ 7] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
+          u[ 8] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
+          u[ 9] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+          u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+          u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+          u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+          u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+          u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+          u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+
+          v[ 0] = _mm256_cmpgt_epi32(kZero,u[ 0]);
+          v[ 1] = _mm256_cmpgt_epi32(kZero,u[ 1]);
+          v[ 2] = _mm256_cmpgt_epi32(kZero,u[ 2]);
+          v[ 3] = _mm256_cmpgt_epi32(kZero,u[ 3]);
+          v[ 4] = _mm256_cmpgt_epi32(kZero,u[ 4]);
+          v[ 5] = _mm256_cmpgt_epi32(kZero,u[ 5]);
+          v[ 6] = _mm256_cmpgt_epi32(kZero,u[ 6]);
+          v[ 7] = _mm256_cmpgt_epi32(kZero,u[ 7]);
+          v[ 8] = _mm256_cmpgt_epi32(kZero,u[ 8]);
+          v[ 9] = _mm256_cmpgt_epi32(kZero,u[ 9]);
+          v[10] = _mm256_cmpgt_epi32(kZero,u[10]);
+          v[11] = _mm256_cmpgt_epi32(kZero,u[11]);
+          v[12] = _mm256_cmpgt_epi32(kZero,u[12]);
+          v[13] = _mm256_cmpgt_epi32(kZero,u[13]);
+          v[14] = _mm256_cmpgt_epi32(kZero,u[14]);
+          v[15] = _mm256_cmpgt_epi32(kZero,u[15]);
+
+          u[ 0] = _mm256_sub_epi32(u[ 0], v[ 0]);
+          u[ 1] = _mm256_sub_epi32(u[ 1], v[ 1]);
+          u[ 2] = _mm256_sub_epi32(u[ 2], v[ 2]);
+          u[ 3] = _mm256_sub_epi32(u[ 3], v[ 3]);
+          u[ 4] = _mm256_sub_epi32(u[ 4], v[ 4]);
+          u[ 5] = _mm256_sub_epi32(u[ 5], v[ 5]);
+          u[ 6] = _mm256_sub_epi32(u[ 6], v[ 6]);
+          u[ 7] = _mm256_sub_epi32(u[ 7], v[ 7]);
+          u[ 8] = _mm256_sub_epi32(u[ 8], v[ 8]);
+          u[ 9] = _mm256_sub_epi32(u[ 9], v[ 9]);
+          u[10] = _mm256_sub_epi32(u[10], v[10]);
+          u[11] = _mm256_sub_epi32(u[11], v[11]);
+          u[12] = _mm256_sub_epi32(u[12], v[12]);
+          u[13] = _mm256_sub_epi32(u[13], v[13]);
+          u[14] = _mm256_sub_epi32(u[14], v[14]);
+          u[15] = _mm256_sub_epi32(u[15], v[15]);
+
+          v[0] = _mm256_add_epi32(u[0], K32One);
+          v[1] = _mm256_add_epi32(u[1], K32One);
+          v[2] = _mm256_add_epi32(u[2], K32One);
+          v[3] = _mm256_add_epi32(u[3], K32One);
+          v[4] = _mm256_add_epi32(u[4], K32One);
+          v[5] = _mm256_add_epi32(u[5], K32One);
+          v[6] = _mm256_add_epi32(u[6], K32One);
+          v[7] = _mm256_add_epi32(u[7], K32One);
+          v[8] = _mm256_add_epi32(u[8], K32One);
+          v[9] = _mm256_add_epi32(u[9], K32One);
+          v[10] = _mm256_add_epi32(u[10], K32One);
+          v[11] = _mm256_add_epi32(u[11], K32One);
+          v[12] = _mm256_add_epi32(u[12], K32One);
+          v[13] = _mm256_add_epi32(u[13], K32One);
+          v[14] = _mm256_add_epi32(u[14], K32One);
+          v[15] = _mm256_add_epi32(u[15], K32One);
+
+          u[0] = _mm256_srai_epi32(v[0], 2);
+          u[1] = _mm256_srai_epi32(v[1], 2);
+          u[2] = _mm256_srai_epi32(v[2], 2);
+          u[3] = _mm256_srai_epi32(v[3], 2);
+          u[4] = _mm256_srai_epi32(v[4], 2);
+          u[5] = _mm256_srai_epi32(v[5], 2);
+          u[6] = _mm256_srai_epi32(v[6], 2);
+          u[7] = _mm256_srai_epi32(v[7], 2);
+          u[8] = _mm256_srai_epi32(v[8], 2);
+          u[9] = _mm256_srai_epi32(v[9], 2);
+          u[10] = _mm256_srai_epi32(v[10], 2);
+          u[11] = _mm256_srai_epi32(v[11], 2);
+          u[12] = _mm256_srai_epi32(v[12], 2);
+          u[13] = _mm256_srai_epi32(v[13], 2);
+          u[14] = _mm256_srai_epi32(v[14], 2);
+          u[15] = _mm256_srai_epi32(v[15], 2);
+
+          out[ 5] = _mm256_packs_epi32(u[0], u[1]);
+          out[21] = _mm256_packs_epi32(u[2], u[3]);
+          out[13] = _mm256_packs_epi32(u[4], u[5]);
+          out[29] = _mm256_packs_epi32(u[6], u[7]);
+          out[ 3] = _mm256_packs_epi32(u[8], u[9]);
+          out[19] = _mm256_packs_epi32(u[10], u[11]);
+          out[11] = _mm256_packs_epi32(u[12], u[13]);
+          out[27] = _mm256_packs_epi32(u[14], u[15]);
+        }
+      }
+#endif
+      // Transpose the results, do it as four 8x8 transposes.
+      {
+        int transpose_block;
+        int16_t *output_currStep,*output_nextStep;
+        if (0 == pass){
+                 output_currStep = &intermediate[column_start * 32];
+                 output_nextStep = &intermediate[(column_start + 8) * 32];
+        } else{
+                 output_currStep = &output_org[column_start * 32];
+                 output_nextStep = &output_org[(column_start + 8) * 32];
+        }
+        for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
+          __m256i *this_out = &out[8 * transpose_block];
+          // 00  01  02  03  04  05  06  07  08  09  10  11  12  13  14  15
+          // 20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
+          // 40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55
+          // 60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75
+          // 80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95
+          // 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
+          // 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
+          // 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
+          const __m256i tr0_0 = _mm256_unpacklo_epi16(this_out[0], this_out[1]);
+          const __m256i tr0_1 = _mm256_unpacklo_epi16(this_out[2], this_out[3]);
+          const __m256i tr0_2 = _mm256_unpackhi_epi16(this_out[0], this_out[1]);
+          const __m256i tr0_3 = _mm256_unpackhi_epi16(this_out[2], this_out[3]);
+          const __m256i tr0_4 = _mm256_unpacklo_epi16(this_out[4], this_out[5]);
+          const __m256i tr0_5 = _mm256_unpacklo_epi16(this_out[6], this_out[7]);
+          const __m256i tr0_6 = _mm256_unpackhi_epi16(this_out[4], this_out[5]);
+          const __m256i tr0_7 = _mm256_unpackhi_epi16(this_out[6], this_out[7]);
+          // 00  20  01  21  02  22  03  23  08  28  09  29  10  30  11  31
+          // 40  60  41  61  42  62  43  63  48  68  49  69  50  70  51  71
+          // 04  24  05  25  06  26  07  27  12  32  13  33  14  34  15  35
+          // 44  64  45  65  46  66  47  67  52  72  53  73  54  74  55  75
+          // 80  100 81  101 82  102 83  103 88  108 89  109 90  110 91  111
+          // 120 140 121 141 122 142 123 143 128 148 129 149 130 150 131 151
+          // 84  104 85  105 86  106 87  107 92  112 93  113 94  114 95  115
+          // 124 144 125 145 126 146 127 147 132 152 133 153 134 154 135 155
+
+          const __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1);
+          const __m256i tr1_1 = _mm256_unpacklo_epi32(tr0_2, tr0_3);
+          const __m256i tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1);
+          const __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_2, tr0_3);
+          const __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_5);
+          const __m256i tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7);
+          const __m256i tr1_6 = _mm256_unpackhi_epi32(tr0_4, tr0_5);
+          const __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7);
+          // 00 20  40  60  01 21  41  61  08 28  48  68  09 29  49  69
+          // 04 24  44  64  05 25  45  65  12 32  52  72  13 33  53  73
+          // 02 22  42  62  03 23  43  63  10 30  50  70  11 31  51  71
+          // 06 26  46  66  07 27  47  67  14 34  54  74  15 35  55  75
+          // 80 100 120 140 81 101 121 141 88 108 128 148 89 109 129 149
+          // 84 104 124 144 85 105 125 145 92 112 132 152 93 113 133 153
+          // 82 102 122 142 83 103 123 143 90 110 130 150 91 111 131 151
+          // 86 106 126 146 87 107 127 147 94 114 134 154 95 115 135 155
+          __m256i tr2_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
+          __m256i tr2_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
+          __m256i tr2_2 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
+          __m256i tr2_3 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
+          __m256i tr2_4 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
+          __m256i tr2_5 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
+          __m256i tr2_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
+          __m256i tr2_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
+          // 00 20 40 60 80 100 120 140 08 28 48 68 88 108 128 148
+          // 01 21 41 61 81 101 121 141 09 29 49 69 89 109 129 149
+          // 02 22 42 62 82 102 122 142 10 30 50 70 90 110 130 150
+          // 03 23 43 63 83 103 123 143 11 31 51 71 91 111 131 151
+          // 04 24 44 64 84 104 124 144 12 32 52 72 92 112 132 152
+          // 05 25 45 65 85 105 125 145 13 33 53 73 93 113 133 153
+          // 06 26 46 66 86 106 126 146 14 34 54 74 94 114 134 154
+          // 07 27 47 67 87 107 127 147 15 35 55 75 95 115 135 155
+          if (0 == pass) {
+            // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
+            // TODO(cd): see quality impact of only doing
+            //           output[j] = (output[j] + 1) >> 2;
+            //           which would remove the code between here ...
+            __m256i tr2_0_0 = _mm256_cmpgt_epi16(tr2_0, kZero);
+            __m256i tr2_1_0 = _mm256_cmpgt_epi16(tr2_1, kZero);
+            __m256i tr2_2_0 = _mm256_cmpgt_epi16(tr2_2, kZero);
+            __m256i tr2_3_0 = _mm256_cmpgt_epi16(tr2_3, kZero);
+            __m256i tr2_4_0 = _mm256_cmpgt_epi16(tr2_4, kZero);
+            __m256i tr2_5_0 = _mm256_cmpgt_epi16(tr2_5, kZero);
+            __m256i tr2_6_0 = _mm256_cmpgt_epi16(tr2_6, kZero);
+            __m256i tr2_7_0 = _mm256_cmpgt_epi16(tr2_7, kZero);
+            tr2_0 = _mm256_sub_epi16(tr2_0, tr2_0_0);
+            tr2_1 = _mm256_sub_epi16(tr2_1, tr2_1_0);
+            tr2_2 = _mm256_sub_epi16(tr2_2, tr2_2_0);
+            tr2_3 = _mm256_sub_epi16(tr2_3, tr2_3_0);
+            tr2_4 = _mm256_sub_epi16(tr2_4, tr2_4_0);
+            tr2_5 = _mm256_sub_epi16(tr2_5, tr2_5_0);
+            tr2_6 = _mm256_sub_epi16(tr2_6, tr2_6_0);
+            tr2_7 = _mm256_sub_epi16(tr2_7, tr2_7_0);
+            //           ... and here.
+            //           PS: also change code in vp9/encoder/vp9_dct.c
+            tr2_0 = _mm256_add_epi16(tr2_0, kOne);
+            tr2_1 = _mm256_add_epi16(tr2_1, kOne);
+            tr2_2 = _mm256_add_epi16(tr2_2, kOne);
+            tr2_3 = _mm256_add_epi16(tr2_3, kOne);
+            tr2_4 = _mm256_add_epi16(tr2_4, kOne);
+            tr2_5 = _mm256_add_epi16(tr2_5, kOne);
+            tr2_6 = _mm256_add_epi16(tr2_6, kOne);
+            tr2_7 = _mm256_add_epi16(tr2_7, kOne);
+            tr2_0 = _mm256_srai_epi16(tr2_0, 2);
+            tr2_1 = _mm256_srai_epi16(tr2_1, 2);
+            tr2_2 = _mm256_srai_epi16(tr2_2, 2);
+            tr2_3 = _mm256_srai_epi16(tr2_3, 2);
+            tr2_4 = _mm256_srai_epi16(tr2_4, 2);
+            tr2_5 = _mm256_srai_epi16(tr2_5, 2);
+            tr2_6 = _mm256_srai_epi16(tr2_6, 2);
+            tr2_7 = _mm256_srai_epi16(tr2_7, 2);
+          }
+          // Note: even though all these stores are aligned, using the aligned
+          //       intrinsic makes the code slightly slower.
+          _mm_storeu_si128((__m128i *)(output_currStep + 0 * 32), _mm256_castsi256_si128(tr2_0));
+          _mm_storeu_si128((__m128i *)(output_currStep + 1 * 32), _mm256_castsi256_si128(tr2_1));
+          _mm_storeu_si128((__m128i *)(output_currStep + 2 * 32), _mm256_castsi256_si128(tr2_2));
+          _mm_storeu_si128((__m128i *)(output_currStep + 3 * 32), _mm256_castsi256_si128(tr2_3));
+          _mm_storeu_si128((__m128i *)(output_currStep + 4 * 32), _mm256_castsi256_si128(tr2_4));
+          _mm_storeu_si128((__m128i *)(output_currStep + 5 * 32), _mm256_castsi256_si128(tr2_5));
+          _mm_storeu_si128((__m128i *)(output_currStep + 6 * 32), _mm256_castsi256_si128(tr2_6));
+          _mm_storeu_si128((__m128i *)(output_currStep + 7 * 32), _mm256_castsi256_si128(tr2_7));
+
+          _mm_storeu_si128((__m128i *)(output_nextStep + 0 * 32), _mm256_extractf128_si256(tr2_0,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 1 * 32), _mm256_extractf128_si256(tr2_1,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 2 * 32), _mm256_extractf128_si256(tr2_2,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 3 * 32), _mm256_extractf128_si256(tr2_3,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 4 * 32), _mm256_extractf128_si256(tr2_4,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 5 * 32), _mm256_extractf128_si256(tr2_5,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32), _mm256_extractf128_si256(tr2_6,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32), _mm256_extractf128_si256(tr2_7,1));
+          // Process next 8x8
+          output_currStep += 8;
+          output_nextStep += 8;
+        }
+      }
+    }
+  }
+}  // NOLINT
diff --git a/vp9/encoder/x86/vp9_dct_avx2.c b/vp9/encoder/x86/vp9_dct_avx2.c
new file mode 100644
index 000000000..d81b72bba
--- /dev/null
+++ b/vp9/encoder/x86/vp9_dct_avx2.c
@@ -0,0 +1,2579 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>  // AVX2
+#include "vp9/common/vp9_idct.h"  // for cospi constants
+#include "vpx_ports/mem.h"
+
+void vp9_fdct4x4_avx2(const int16_t *input, int16_t *output, int stride) {
+  // Forward 4x4 DCT of `input` (rows `stride` int16s apart) into `output`.
+  // The 2D transform is done with two nearly identical passes: each pass
+  // transforms the columns and transposes the results. Since the first pass
+  // leaves the block transposed, the second pass effectively transforms the
+  // rows, and its transpose puts the results back in normal/row positions.
+  // Input is pre-scaled by 16; the final output is (v + 1) >> 2.
+  int pass;
+  // Constants
+  //    When we use them, in one case, they are all the same. In all others
+  //    it's a pair of them that we need to repeat four times. This is done
+  //    by constructing the 32 bit constant corresponding to that pair.
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
+  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+  const __m128i kOne = _mm_set1_epi16(1);
+  __m128i in0, in1, in2, in3;
+  // Load inputs (64 bits, i.e. four int16 values, per row).
+  {
+    in0  = _mm_loadl_epi64((const __m128i *)(input +  0 * stride));
+    in1  = _mm_loadl_epi64((const __m128i *)(input +  1 * stride));
+    in2  = _mm_loadl_epi64((const __m128i *)(input +  2 * stride));
+    in3  = _mm_loadl_epi64((const __m128i *)(input +  3 * stride));
+    // x = x << 4
+    in0 = _mm_slli_epi16(in0, 4);
+    in1 = _mm_slli_epi16(in1, 4);
+    in2 = _mm_slli_epi16(in2, 4);
+    in3 = _mm_slli_epi16(in3, 4);
+    // if (i == 0 && input[0]) input[0] += 1;
+    {
+      // The mask will only contain whether the first value is zero, all
+      // other comparisons will fail as something shifted by 4 (above << 4)
+      // can never be equal to one. To increment in the non-zero case, we
+      // add the mask and one for the first element:
+      //   - if zero, mask = -1, v = v - 1 + 1 = v
+      //   - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
+      __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
+      in0 = _mm_add_epi16(in0, mask);
+      in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
+    }
+  }
+  // Do the two transform/transpose passes
+  for (pass = 0; pass < 2; ++pass) {
+    // Transform 1/2: Add/subtract
+    const __m128i r0 = _mm_add_epi16(in0, in3);
+    const __m128i r1 = _mm_add_epi16(in1, in2);
+    const __m128i r2 = _mm_sub_epi16(in1, in2);
+    const __m128i r3 = _mm_sub_epi16(in0, in3);
+    // Transform 1/2: Interleave to do the multiply by constants which gets us
+    //                into 32 bits.
+    const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+    const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+    const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+    const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+    const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+    const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+    const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+    const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+    // Combine (pack back to 16 bits with signed saturation) and transpose
+    const __m128i res0 = _mm_packs_epi32(w0, w2);
+    const __m128i res1 = _mm_packs_epi32(w4, w6);
+    // 00 01 02 03 20 21 22 23
+    // 10 11 12 13 30 31 32 33
+    const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+    const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1);
+    // 00 10 01 11 02 12 03 13
+    // 20 30 21 31 22 32 23 33
+    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+    in2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+    // 00 10 20 30 01 11 21 31      in0 contains 0 followed by 1
+    // 02 12 22 32 03 13 23 33      in2 contains 2 followed by 3
+    if (0 == pass) {
+      // Extract values in the high part for second pass as transform code
+      // only uses the first four values.
+      in1 = _mm_unpackhi_epi64(in0, in0);
+      in3 = _mm_unpackhi_epi64(in2, in2);
+    } else {
+      // Post-condition output and store it (v + 1) >> 2, taking advantage
+      // of the fact 1/3 are stored just after 0/2.
+      __m128i out01 = _mm_add_epi16(in0, kOne);
+      __m128i out23 = _mm_add_epi16(in2, kOne);
+      out01 = _mm_srai_epi16(out01, 2);
+      out23 = _mm_srai_epi16(out23, 2);
+      _mm_storeu_si128((__m128i *)(output + 0 * 4), out01);
+      _mm_storeu_si128((__m128i *)(output + 2 * 4), out23);
+    }
+  }
+}
+
+static INLINE void load_buffer_4x4_avx2(const int16_t *input, __m128i *in,
+                                   int stride) {
+  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
+  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+  __m128i mask;
+  // Load four rows of four int16 (64 bits each, `stride` elements apart).
+  in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+  in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+  in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+  in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+  // Pre-scale every value: x = x << 4.
+  in[0] = _mm_slli_epi16(in[0], 4);
+  in[1] = _mm_slli_epi16(in[1], 4);
+  in[2] = _mm_slli_epi16(in[2], 4);
+  in[3] = _mm_slli_epi16(in[3], 4);
+  // Add 1 to lane 0 of in[0] only if it is non-zero (mask lane 0 is -1 if 0).
+  mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);
+  in[0] = _mm_add_epi16(in[0], mask);
+  in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
+}
+
+static INLINE void write_buffer_4x4_avx2(int16_t *output, __m128i *res) {
+  const __m128i kOne = _mm_set1_epi16(1);  // rounding term for the >> 2 below
+  __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);  // low halves: rows 0|1
+  __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);  // low halves: rows 2|3
+  __m128i out01 = _mm_add_epi16(in01, kOne);
+  __m128i out23 = _mm_add_epi16(in23, kOne);
+  out01 = _mm_srai_epi16(out01, 2);  // (v + 1) >> 2
+  out23 = _mm_srai_epi16(out23, 2);
+  // Aligned stores of 16 contiguous int16: output must be 16-byte aligned.
+  _mm_store_si128((__m128i *)(output + 0 * 8), out01);
+  _mm_store_si128((__m128i *)(output + 1 * 8), out23);
+}
+
+static INLINE void transpose_4x4_avx2(__m128i *res) {
+  // Transpose a 4x4 block given as rows 0|2 in res[0] and rows 1|3 in res[1]:
+  // 00 01 02 03 20 21 22 23
+  // 10 11 12 13 30 31 32 33
+  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
+  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
+
+  // 00 10 01 11 02 12 03 13
+  // 20 30 21 31 22 32 23 33
+  res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);
+
+  // 00 10 20 30 01 11 21 31
+  // 02 12 22 32 03 13 23 33
+  // Copy the high halves down so each res[i] has row i in its first 4 lanes.
+  res[1] = _mm_unpackhi_epi64(res[0], res[0]);
+  res[3] = _mm_unpackhi_epi64(res[2], res[2]);
+}
+
+void fdct4_1d_avx2(__m128i *in) {
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  // 4-point forward DCT on the low four lanes of in[0..3], then a transpose.
+  __m128i u[4], v[4];
+  u[0]=_mm_unpacklo_epi16(in[0], in[1]);
+  u[1]=_mm_unpacklo_epi16(in[3], in[2]);
+  // v[0] pairs (in0 + in3, in1 + in2); v[1] pairs (in0 - in3, in1 - in2).
+  v[0] = _mm_add_epi16(u[0], u[1]);
+  v[1] = _mm_sub_epi16(u[0], u[1]);
+  // Multiply-add each pair; the trailing number is the output index produced.
+  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);  // 0
+  u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16);  // 2
+  u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24);  // 1
+  u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08);  // 3
+  // Round: add DCT_CONST_ROUNDING, then shift right by DCT_CONST_BITS.
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  // Pack to 16 bits (saturating): in[0] = outputs 0|2, in[1] = outputs 1|3.
+  in[0] = _mm_packs_epi32(u[0], u[1]);
+  in[1] = _mm_packs_epi32(u[2], u[3]);
+  transpose_4x4_avx2(in);
+}
+
+void fadst4_1d_avx2(__m128i *in) {
+  const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
+  const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
+  const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
+  const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
+  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
+  const __m128i kZero = _mm_set1_epi16(0);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i u[8], v[8];
+  __m128i in7 = _mm_add_epi16(in[0], in[1]);
+  // 4-point forward ADST on the low four lanes of in[0..3], then a transpose.
+  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+  u[1] = _mm_unpacklo_epi16(in[2], in[3]);
+  u[2] = _mm_unpacklo_epi16(in7, kZero);  // (in0 + in1) paired with zero
+  u[3] = _mm_unpacklo_epi16(in[2], kZero);  // in2 paired with zero
+  u[4] = _mm_unpacklo_epi16(in[3], kZero);  // in3 paired with zero
+  // Pairwise multiply-add into 32-bit lanes.
+  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02);  // s0 + s2
+  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04);  // s4 + s5
+  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x1
+  v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01);  // s1 - s3
+  v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02);  // -s4 + s6
+  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s4
+  v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);  // in3 * sinpi_3_9
+  // Combine partial sums; u[5] is 3 * v[5], computed as (v[5] << 2) - v[5].
+  u[0] = _mm_add_epi32(v[0], v[1]);
+  u[1] = _mm_sub_epi32(v[2], v[6]);
+  u[2] = _mm_add_epi32(v[3], v[4]);
+  u[3] = _mm_sub_epi32(u[2], u[0]);
+  u[4] = _mm_slli_epi32(v[5], 2);
+  u[5] = _mm_sub_epi32(u[4], v[5]);
+  u[6] = _mm_add_epi32(u[3], u[5]);
+  // Round the four results u[0], u[1], u[2] and u[6].
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  // Pack to 16 bits (saturating) in the two-register layout the transpose uses.
+  in[0] = _mm_packs_epi32(u[0], u[2]);
+  in[1] = _mm_packs_epi32(u[1], u[3]);
+  transpose_4x4_avx2(in);
+}
+
+void vp9_short_fht4x4_avx2(const int16_t *input, int16_t *output,
+                           int stride, int tx_type) {
+  __m128i in[4];  // the 4x4 block, one row per register
+  load_buffer_4x4_avx2(input, in, stride);  // load, << 4, bias first value
+  switch (tx_type) {  // picks the two 1-D transforms, applied in this order
+    case 0:  // DCT_DCT
+      fdct4_1d_avx2(in);
+      fdct4_1d_avx2(in);
+      break;
+    case 1:  // ADST_DCT
+      fadst4_1d_avx2(in);
+      fdct4_1d_avx2(in);
+      break;
+    case 2:  // DCT_ADST
+      fdct4_1d_avx2(in);
+      fadst4_1d_avx2(in);
+      break;
+    case 3:  // ADST_ADST
+      fadst4_1d_avx2(in);
+      fadst4_1d_avx2(in);
+      break;
+    default:
+      assert(0);  // tx_type outside 0..3 is a caller error
+      break;
+  }
+  write_buffer_4x4_avx2(output, in);  // (v + 1) >> 2, then store 16 values
+}
+
+void vp9_fdct8x8_avx2(const int16_t *input, int16_t *output, int stride) {
+  int pass;
+  // Forward 8x8 DCT of `input` (rows `stride` apart) into `output`. Constants:
+  //    When we use them, in one case, they are all the same. In all others
+  //    it's a pair of them that we need to repeat four times. This is done
+  //    by constructing the 32 bit constant corresponding to that pair.
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  // Load input (aligned loads: every row must start on a 16-byte boundary)
+  __m128i in0  = _mm_load_si128((const __m128i *)(input + 0 * stride));
+  __m128i in1  = _mm_load_si128((const __m128i *)(input + 1 * stride));
+  __m128i in2  = _mm_load_si128((const __m128i *)(input + 2 * stride));
+  __m128i in3  = _mm_load_si128((const __m128i *)(input + 3 * stride));
+  __m128i in4  = _mm_load_si128((const __m128i *)(input + 4 * stride));
+  __m128i in5  = _mm_load_si128((const __m128i *)(input + 5 * stride));
+  __m128i in6  = _mm_load_si128((const __m128i *)(input + 6 * stride));
+  __m128i in7  = _mm_load_si128((const __m128i *)(input + 7 * stride));
+  // Pre-condition input (shift by two)
+  in0 = _mm_slli_epi16(in0, 2);
+  in1 = _mm_slli_epi16(in1, 2);
+  in2 = _mm_slli_epi16(in2, 2);
+  in3 = _mm_slli_epi16(in3, 2);
+  in4 = _mm_slli_epi16(in4, 2);
+  in5 = _mm_slli_epi16(in5, 2);
+  in6 = _mm_slli_epi16(in6, 2);
+  in7 = _mm_slli_epi16(in7, 2);
+
+  // We do two passes, first the columns, then the rows. The results of the
+  // first pass are transposed so that the same column code can be reused. The
+  // results of the second pass are also transposed so that the rows (processed
+  // as columns) are put back in row positions.
+  for (pass = 0; pass < 2; pass++) {
+    // To store results of each pass before the transpose.
+    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+    // Add/subtract
+    const __m128i q0 = _mm_add_epi16(in0, in7);
+    const __m128i q1 = _mm_add_epi16(in1, in6);
+    const __m128i q2 = _mm_add_epi16(in2, in5);
+    const __m128i q3 = _mm_add_epi16(in3, in4);
+    const __m128i q4 = _mm_sub_epi16(in3, in4);
+    const __m128i q5 = _mm_sub_epi16(in2, in5);
+    const __m128i q6 = _mm_sub_epi16(in1, in6);
+    const __m128i q7 = _mm_sub_epi16(in0, in7);
+    // Work on first four results
+    {
+      // Add/subtract
+      const __m128i r0 = _mm_add_epi16(q0, q3);
+      const __m128i r1 = _mm_add_epi16(q1, q2);
+      const __m128i r2 = _mm_sub_epi16(q1, q2);
+      const __m128i r3 = _mm_sub_epi16(q0, q3);
+      // Interleave to do the multiply by constants which gets us into 32bits
+      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+      // dct_const_round_shift
+      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+      // Combine (even-numbered output rows 0, 4, 2, 6)
+      res0 = _mm_packs_epi32(w0, w1);
+      res4 = _mm_packs_epi32(w2, w3);
+      res2 = _mm_packs_epi32(w4, w5);
+      res6 = _mm_packs_epi32(w6, w7);
+    }
+    // Work on next four results
+    {
+      // Interleave to do the multiply by constants which gets us into 32bits
+      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
+      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
+      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
+      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
+      // dct_const_round_shift
+      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
+      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
+      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
+      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
+      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
+      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
+      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
+      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
+      // Combine
+      const __m128i r0 = _mm_packs_epi32(s0, s1);
+      const __m128i r1 = _mm_packs_epi32(s2, s3);
+      // Add/subtract
+      const __m128i x0 = _mm_add_epi16(q4, r0);
+      const __m128i x1 = _mm_sub_epi16(q4, r0);
+      const __m128i x2 = _mm_sub_epi16(q7, r1);
+      const __m128i x3 = _mm_add_epi16(q7, r1);
+      // Interleave to do the multiply by constants which gets us into 32bits
+      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+      // dct_const_round_shift
+      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+      // Combine (odd-numbered output rows 1, 7, 5, 3)
+      res1 = _mm_packs_epi32(w0, w1);
+      res7 = _mm_packs_epi32(w2, w3);
+      res5 = _mm_packs_epi32(w4, w5);
+      res3 = _mm_packs_epi32(w6, w7);
+    }
+    // Transpose the 8x8.
+    {
+      // 00 01 02 03 04 05 06 07
+      // 10 11 12 13 14 15 16 17
+      // 20 21 22 23 24 25 26 27
+      // 30 31 32 33 34 35 36 37
+      // 40 41 42 43 44 45 46 47
+      // 50 51 52 53 54 55 56 57
+      // 60 61 62 63 64 65 66 67
+      // 70 71 72 73 74 75 76 77
+      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
+      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
+      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
+      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
+      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
+      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
+      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
+      // 00 10 01 11 02 12 03 13
+      // 20 30 21 31 22 32 23 33
+      // 04 14 05 15 06 16 07 17
+      // 24 34 25 35 26 36 27 37
+      // 40 50 41 51 42 52 43 53
+      // 60 70 61 71 62 72 63 73
+      // 44 54 45 55 46 56 47 57
+      // 64 74 65 75 66 76 67 77
+      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+      // 00 10 20 30 01 11 21 31
+      // 40 50 60 70 41 51 61 71
+      // 02 12 22 32 03 13 23 33
+      // 42 52 62 72 43 53 63 73
+      // 04 14 24 34 05 15 25 35
+      // 44 54 64 74 45 55 65 75
+      // 06 16 26 36 07 17 27 37
+      // 46 56 66 76 47 57 67 77
+      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+      // 00 10 20 30 40 50 60 70
+      // 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72
+      // 03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74
+      // 05 15 25 35 45 55 65 75
+      // 06 16 26 36 46 56 66 76
+      // 07 17 27 37 47 57 67 77
+    }
+  }
+  // Post-condition output and store it
+  {
+    // Post-condition (division by two)
+    //    division of two 16 bits signed numbers using shifts
+    //    n / 2 = (n - (n >> 15)) >> 1
+    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
+    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
+    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
+    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
+    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
+    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
+    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
+    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
+    in0 = _mm_sub_epi16(in0, sign_in0);
+    in1 = _mm_sub_epi16(in1, sign_in1);
+    in2 = _mm_sub_epi16(in2, sign_in2);
+    in3 = _mm_sub_epi16(in3, sign_in3);
+    in4 = _mm_sub_epi16(in4, sign_in4);
+    in5 = _mm_sub_epi16(in5, sign_in5);
+    in6 = _mm_sub_epi16(in6, sign_in6);
+    in7 = _mm_sub_epi16(in7, sign_in7);
+    in0 = _mm_srai_epi16(in0, 1);
+    in1 = _mm_srai_epi16(in1, 1);
+    in2 = _mm_srai_epi16(in2, 1);
+    in3 = _mm_srai_epi16(in3, 1);
+    in4 = _mm_srai_epi16(in4, 1);
+    in5 = _mm_srai_epi16(in5, 1);
+    in6 = _mm_srai_epi16(in6, 1);
+    in7 = _mm_srai_epi16(in7, 1);
+    // store results (aligned stores: output must be 16-byte aligned)
+    _mm_store_si128((__m128i *)(output + 0 * 8), in0);
+    _mm_store_si128((__m128i *)(output + 1 * 8), in1);
+    _mm_store_si128((__m128i *)(output + 2 * 8), in2);
+    _mm_store_si128((__m128i *)(output + 3 * 8), in3);
+    _mm_store_si128((__m128i *)(output + 4 * 8), in4);
+    _mm_store_si128((__m128i *)(output + 5 * 8), in5);
+    _mm_store_si128((__m128i *)(output + 6 * 8), in6);
+    _mm_store_si128((__m128i *)(output + 7 * 8), in7);
+  }
+}
+
+// Load an 8x8 int16 block (rows `stride` apart) into in[0..7], scaled by 4.
+static INLINE void load_buffer_8x8_avx2(const int16_t *input, __m128i *in,
+                                   int stride) {
+  in[0]  = _mm_load_si128((const __m128i *)(input + 0 * stride));
+  in[1]  = _mm_load_si128((const __m128i *)(input + 1 * stride));
+  in[2]  = _mm_load_si128((const __m128i *)(input + 2 * stride));
+  in[3]  = _mm_load_si128((const __m128i *)(input + 3 * stride));
+  in[4]  = _mm_load_si128((const __m128i *)(input + 4 * stride));
+  in[5]  = _mm_load_si128((const __m128i *)(input + 5 * stride));
+  in[6]  = _mm_load_si128((const __m128i *)(input + 6 * stride));
+  in[7]  = _mm_load_si128((const __m128i *)(input + 7 * stride));
+  // Pre-condition: x = x << 2. (Loads above are aligned: rows need 16 bytes.)
+  in[0] = _mm_slli_epi16(in[0], 2);
+  in[1] = _mm_slli_epi16(in[1], 2);
+  in[2] = _mm_slli_epi16(in[2], 2);
+  in[3] = _mm_slli_epi16(in[3], 2);
+  in[4] = _mm_slli_epi16(in[4], 2);
+  in[5] = _mm_slli_epi16(in[5], 2);
+  in[6] = _mm_slli_epi16(in[6], 2);
+  in[7] = _mm_slli_epi16(in[7], 2);
+}
+
+// Right shift all eight rows of res by `bit`, with the rounding steps below.
+static INLINE void right_shift_8x8_avx2(__m128i *res, int const bit) {
+  const __m128i kOne = _mm_set1_epi16(1);
+  const int bit_m02 = bit - 2;
+  __m128i sign0 = _mm_srai_epi16(res[0], 15);  // -1 if negative, else 0
+  __m128i sign1 = _mm_srai_epi16(res[1], 15);
+  __m128i sign2 = _mm_srai_epi16(res[2], 15);
+  __m128i sign3 = _mm_srai_epi16(res[3], 15);
+  __m128i sign4 = _mm_srai_epi16(res[4], 15);
+  __m128i sign5 = _mm_srai_epi16(res[5], 15);
+  __m128i sign6 = _mm_srai_epi16(res[6], 15);
+  __m128i sign7 = _mm_srai_epi16(res[7], 15);
+  // Rounding term 1 << (bit - 2) is added only when bit >= 2.
+  if (bit_m02 >= 0) {
+    __m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02);
+    res[0] = _mm_add_epi16(res[0], k_const_rounding);
+    res[1] = _mm_add_epi16(res[1], k_const_rounding);
+    res[2] = _mm_add_epi16(res[2], k_const_rounding);
+    res[3] = _mm_add_epi16(res[3], k_const_rounding);
+    res[4] = _mm_add_epi16(res[4], k_const_rounding);
+    res[5] = _mm_add_epi16(res[5], k_const_rounding);
+    res[6] = _mm_add_epi16(res[6], k_const_rounding);
+    res[7] = _mm_add_epi16(res[7], k_const_rounding);
+  }
+  // Subtracting the sign adds 1 to every lane that was negative on entry.
+  res[0] = _mm_sub_epi16(res[0], sign0);
+  res[1] = _mm_sub_epi16(res[1], sign1);
+  res[2] = _mm_sub_epi16(res[2], sign2);
+  res[3] = _mm_sub_epi16(res[3], sign3);
+  res[4] = _mm_sub_epi16(res[4], sign4);
+  res[5] = _mm_sub_epi16(res[5], sign5);
+  res[6] = _mm_sub_epi16(res[6], sign6);
+  res[7] = _mm_sub_epi16(res[7], sign7);
+  // Arithmetic (sign-preserving) shift right by `bit`.
+  res[0] = _mm_srai_epi16(res[0], bit);
+  res[1] = _mm_srai_epi16(res[1], bit);
+  res[2] = _mm_srai_epi16(res[2], bit);
+  res[3] = _mm_srai_epi16(res[3], bit);
+  res[4] = _mm_srai_epi16(res[4], bit);
+  res[5] = _mm_srai_epi16(res[5], bit);
+  res[6] = _mm_srai_epi16(res[6], bit);
+  res[7] = _mm_srai_epi16(res[7], bit);
+}
+
+// Store res[0..7] as rows `stride` int16s apart (aligned 16-byte stores).
+static INLINE void write_buffer_8x8_avx2(int16_t *output, __m128i *res, int stride) {
+  _mm_store_si128((__m128i *)(output + 0 * stride), res[0]);
+  _mm_store_si128((__m128i *)(output + 1 * stride), res[1]);
+  _mm_store_si128((__m128i *)(output + 2 * stride), res[2]);
+  _mm_store_si128((__m128i *)(output + 3 * stride), res[3]);
+  _mm_store_si128((__m128i *)(output + 4 * stride), res[4]);
+  _mm_store_si128((__m128i *)(output + 5 * stride), res[5]);
+  _mm_store_si128((__m128i *)(output + 6 * stride), res[6]);
+  _mm_store_si128((__m128i *)(output + 7 * stride), res[7]);
+}
+
+// Transpose the 8x8 int16 block in[] into res[]; res may be the same as in.
+static INLINE void array_transpose_8x8_avx2(__m128i *in, __m128i *res) {
+  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
+  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
+  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
+  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
+  // 00 10 01 11 02 12 03 13
+  // 20 30 21 31 22 32 23 33
+  // 04 14 05 15 06 16 07 17
+  // 24 34 25 35 26 36 27 37
+  // 40 50 41 51 42 52 43 53
+  // 60 70 61 71 62 72 63 73
+  // 44 54 45 55 46 56 47 57
+  // 64 74 65 75 66 76 67 77
+  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+  // 00 10 20 30 01 11 21 31
+  // 40 50 60 70 41 51 61 71
+  // 02 12 22 32 03 13 23 33
+  // 42 52 62 72 43 53 63 73
+  // 04 14 24 34 05 15 25 35
+  // 44 54 64 74 45 55 65 75
+  // 06 16 26 36 07 17 27 37
+  // 46 56 66 76 47 57 67 77
+  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);  // every in[] was read above,
+  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);  // so writing res[] is safe
+  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);  // even when res == in
+  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
+  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
+  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
+  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
+  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
+  // 00 10 20 30 40 50 60 70
+  // 01 11 21 31 41 51 61 71
+  // 02 12 22 32 42 52 62 72
+  // 03 13 23 33 43 53 63 73
+  // 04 14 24 34 44 54 64 74
+  // 05 15 25 35 45 55 65 75
+  // 06 16 26 36 46 56 66 76
+  // 07 17 27 37 47 57 67 77
+}
+
+void fdct8_1d_avx2(__m128i *in) {
+  // constants
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+
+  // stage 1
+  s0 = _mm_add_epi16(in[0], in[7]);
+  s1 = _mm_add_epi16(in[1], in[6]);
+  s2 = _mm_add_epi16(in[2], in[5]);
+  s3 = _mm_add_epi16(in[3], in[4]);
+  s4 = _mm_sub_epi16(in[3], in[4]);
+  s5 = _mm_sub_epi16(in[2], in[5]);
+  s6 = _mm_sub_epi16(in[1], in[6]);
+  s7 = _mm_sub_epi16(in[0], in[7]);
+
+  u0 = _mm_add_epi16(s0, s3);
+  u1 = _mm_add_epi16(s1, s2);
+  u2 = _mm_sub_epi16(s1, s2);
+  u3 = _mm_sub_epi16(s0, s3);
+  // interleave and perform butterfly multiplication/addition
+  v0 = _mm_unpacklo_epi16(u0, u1);
+  v1 = _mm_unpackhi_epi16(u0, u1);
+  v2 = _mm_unpacklo_epi16(u2, u3);
+  v3 = _mm_unpackhi_epi16(u2, u3);
+
+  u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
+  u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
+  u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
+  u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
+  u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
+  u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
+  u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
+  u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);
+
+  // shift and rounding
+  v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+  in[0] = _mm_packs_epi32(u0, u1);
+  in[2] = _mm_packs_epi32(u4, u5);
+  in[4] = _mm_packs_epi32(u2, u3);
+  in[6] = _mm_packs_epi32(u6, u7);
+
+  // stage 2
+  // interleave and perform butterfly multiplication/addition
+  u0 = _mm_unpacklo_epi16(s6, s5);
+  u1 = _mm_unpackhi_epi16(s6, s5);
+  v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+  v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+  v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+  v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+
+  // shift and rounding
+  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+
+  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+
+  u0 = _mm_packs_epi32(v0, v1);
+  u1 = _mm_packs_epi32(v2, v3);
+
+  // stage 3
+  s0 = _mm_add_epi16(s4, u0);
+  s1 = _mm_sub_epi16(s4, u0);
+  s2 = _mm_sub_epi16(s7, u1);
+  s3 = _mm_add_epi16(s7, u1);
+
+  // stage 4
+  u0 = _mm_unpacklo_epi16(s0, s3);
+  u1 = _mm_unpackhi_epi16(s0, s3);
+  u2 = _mm_unpacklo_epi16(s1, s2);
+  u3 = _mm_unpackhi_epi16(s1, s2);
+
+  v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
+  v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
+  v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
+  v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
+  v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
+  v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
+  v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
+  v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);
+
+  // shift and rounding
+  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+  in[1] = _mm_packs_epi32(v0, v1);
+  in[3] = _mm_packs_epi32(v4, v5);
+  in[5] = _mm_packs_epi32(v2, v3);
+  in[7] = _mm_packs_epi32(v6, v7);
+
+  // transpose
+  array_transpose_8x8_avx2(in, in);
+}
+
+void fadst8_1d_avx2(__m128i *in) {
+  // One 1-D ADST pass over in[0..7], done in place. Constants for the madds:
+  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__const_0 = _mm_set1_epi16(0);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
+  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
+  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+  // reorder rows so butterfly partners are adjacent: (7,0) (5,2) (3,4) (1,6)
+  in0  = in[7];
+  in1  = in[0];
+  in2  = in[5];
+  in3  = in[2];
+  in4  = in[3];
+  in5  = in[4];
+  in6  = in[1];
+  in7  = in[6];
+
+  // column transformation
+  // stage 1
+  // interleave each pair so one madd yields a*c0 + b*c1 in a 32-bit lane
+  s0 = _mm_unpacklo_epi16(in0, in1);
+  s1 = _mm_unpackhi_epi16(in0, in1);
+  s2 = _mm_unpacklo_epi16(in2, in3);
+  s3 = _mm_unpackhi_epi16(in2, in3);
+  s4 = _mm_unpacklo_epi16(in4, in5);
+  s5 = _mm_unpackhi_epi16(in4, in5);
+  s6 = _mm_unpacklo_epi16(in6, in7);
+  s7 = _mm_unpackhi_epi16(in6, in7);
+
+  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
+  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
+  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
+  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
+  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
+  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
+  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
+  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
+  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
+  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
+  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
+  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
+  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
+  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
+  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
+  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
+
+  // butterfly in 32 bits: sums feed in[0..3], differences feed in[4..7]
+  w0 = _mm_add_epi32(u0, u8);
+  w1 = _mm_add_epi32(u1, u9);
+  w2 = _mm_add_epi32(u2, u10);
+  w3 = _mm_add_epi32(u3, u11);
+  w4 = _mm_add_epi32(u4, u12);
+  w5 = _mm_add_epi32(u5, u13);
+  w6 = _mm_add_epi32(u6, u14);
+  w7 = _mm_add_epi32(u7, u15);
+  w8 = _mm_sub_epi32(u0, u8);
+  w9 = _mm_sub_epi32(u1, u9);
+  w10 = _mm_sub_epi32(u2, u10);
+  w11 = _mm_sub_epi32(u3, u11);
+  w12 = _mm_sub_epi32(u4, u12);
+  w13 = _mm_sub_epi32(u5, u13);
+  w14 = _mm_sub_epi32(u6, u14);
+  w15 = _mm_sub_epi32(u7, u15);
+
+  // rounding shift: add DCT_CONST_ROUNDING, then arithmetic >> DCT_CONST_BITS
+  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
+  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
+  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
+  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
+  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
+  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
+  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
+  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
+  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
+  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
+  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
+  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
+  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
+  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
+  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
+
+  // back to 16-bit (signed saturation), packing 8 integers per __m128i
+  in[0] = _mm_packs_epi32(u0, u1);
+  in[1] = _mm_packs_epi32(u2, u3);
+  in[2] = _mm_packs_epi32(u4, u5);
+  in[3] = _mm_packs_epi32(u6, u7);
+  in[4] = _mm_packs_epi32(u8, u9);
+  in[5] = _mm_packs_epi32(u10, u11);
+  in[6] = _mm_packs_epi32(u12, u13);
+  in[7] = _mm_packs_epi32(u14, u15);
+
+  // stage 2: 16-bit add/sub on in[0..3]; in[4..7] go through 8/24 cosine madds
+  s0 = _mm_add_epi16(in[0], in[2]);
+  s1 = _mm_add_epi16(in[1], in[3]);
+  s2 = _mm_sub_epi16(in[0], in[2]);
+  s3 = _mm_sub_epi16(in[1], in[3]);
+  u0 = _mm_unpacklo_epi16(in[4], in[5]);
+  u1 = _mm_unpackhi_epi16(in[4], in[5]);
+  u2 = _mm_unpacklo_epi16(in[6], in[7]);
+  u3 = _mm_unpackhi_epi16(in[6], in[7]);
+
+  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
+  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
+  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
+  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
+  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
+  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
+  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
+  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
+  // butterfly on the 32-bit products: sums become s4/s5, differences s6/s7
+  w0 = _mm_add_epi32(v0, v4);
+  w1 = _mm_add_epi32(v1, v5);
+  w2 = _mm_add_epi32(v2, v6);
+  w3 = _mm_add_epi32(v3, v7);
+  w4 = _mm_sub_epi32(v0, v4);
+  w5 = _mm_sub_epi32(v1, v5);
+  w6 = _mm_sub_epi32(v2, v6);
+  w7 = _mm_sub_epi32(v3, v7);
+  // rounding shift: add DCT_CONST_ROUNDING, then arithmetic >> DCT_CONST_BITS
+  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+  // back to 16-bit integers
+  s4 = _mm_packs_epi32(u0, u1);
+  s5 = _mm_packs_epi32(u2, u3);
+  s6 = _mm_packs_epi32(u4, u5);
+  s7 = _mm_packs_epi32(u6, u7);
+
+  // stage 3: (s2,s3) and (s6,s7) become (a+b) and (a-b) times cospi_16_64
+  u0 = _mm_unpacklo_epi16(s2, s3);
+  u1 = _mm_unpackhi_epi16(s2, s3);
+  u2 = _mm_unpacklo_epi16(s6, s7);
+  u3 = _mm_unpackhi_epi16(s6, s7);
+
+  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
+  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
+  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
+  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
+  // rounding shift: add DCT_CONST_ROUNDING, then arithmetic >> DCT_CONST_BITS
+  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+  // back to 16-bit (signed saturation), overwriting s2, s3, s6 and s7
+  s2 = _mm_packs_epi32(v0, v1);
+  s3 = _mm_packs_epi32(v2, v3);
+  s6 = _mm_packs_epi32(v4, v5);
+  s7 = _mm_packs_epi32(v6, v7);
+  // final output order; odd-indexed outputs are negated by computing 0 - x
+  // FIXME(jingning): do subtract using bit inversion?
+  in[0] = s0;
+  in[1] = _mm_sub_epi16(k__const_0, s4);
+  in[2] = s6;
+  in[3] = _mm_sub_epi16(k__const_0, s2);
+  in[4] = s3;
+  in[5] = _mm_sub_epi16(k__const_0, s7);
+  in[6] = s5;
+  in[7] = _mm_sub_epi16(k__const_0, s1);
+
+  // transpose the block in place (in is both source and destination)
+  array_transpose_8x8_avx2(in, in);
+}
+
+void vp9_short_fht8x8_avx2(const int16_t *input, int16_t *output,
+                           int stride, int tx_type) {
+  // 8x8 forward hybrid transform: two 1-D passes picked by tx_type, with
+  // 0 = DCT_DCT, 1 = ADST_DCT, 2 = DCT_ADST, 3 = ADST_ADST.
+  __m128i buf[8];
+  const int known_type = (tx_type >= 0) && (tx_type <= 3);
+  load_buffer_8x8_avx2(input, buf, stride);
+  if (!known_type) {
+    // Unknown type: assert in debug builds; otherwise no 1-D pass is run.
+    assert(0);
+  } else {
+    // Bit 0 of tx_type selects the first pass: set = ADST, clear = DCT.
+    if (tx_type & 1) {
+      fadst8_1d_avx2(buf);
+    } else {
+      fdct8_1d_avx2(buf);
+    }
+    // Bit 1 of tx_type selects the second pass: set = ADST, clear = DCT.
+    if (tx_type & 2) {
+      fadst8_1d_avx2(buf);
+    } else {
+      fdct8_1d_avx2(buf);
+    }
+  }
+  // Post-scale via right_shift_8x8_avx2(buf, 1), then store to output.
+  right_shift_8x8_avx2(buf, 1);
+  write_buffer_8x8_avx2(output, buf, 8);
+}
+
+void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) {
+  // The 2D transform is done with two passes which are actually pretty
+  // similar. In the first one, we transform the columns and transpose
+  // the results. In the second one, we transform the rows. To achieve that,
+  // as the first pass results are transposed, we transform the columns (that
+  // is the transposed rows) and transpose the results (so that it goes back
+  // in normal/row positions).
+  int pass;
+  // We need an intermediate buffer between passes.
+  DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
+  const int16_t *in = input;
+  int16_t *out = intermediate;
+  // Constants
+  //    When we use them, in one case, they are all the same. In all others
+  //    it's a pair of them that we need to repeat four times. This is done
+  //    by constructing the 32 bit constant corresponding to that pair.
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
+  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
+  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
+  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
+  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i kOne = _mm_set1_epi16(1);
+  // Do the two transform/transpose passes
+  for (pass = 0; pass < 2; ++pass) {
+    // We process eight columns (transposed rows in second pass) at a time.
+    int column_start;
+    for (column_start = 0; column_start < 16; column_start += 8) {
+      __m128i in00, in01, in02, in03, in04, in05, in06, in07;
+      __m128i in08, in09, in10, in11, in12, in13, in14, in15;
+      __m128i input0, input1, input2, input3, input4, input5, input6, input7;
+      __m128i step1_0, step1_1, step1_2, step1_3;
+      __m128i step1_4, step1_5, step1_6, step1_7;
+      __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+      __m128i step3_0, step3_1, step3_2, step3_3;
+      __m128i step3_4, step3_5, step3_6, step3_7;
+      __m128i res00, res01, res02, res03, res04, res05, res06, res07;
+      __m128i res08, res09, res10, res11, res12, res13, res14, res15;
+      // Load and pre-condition input.
+      if (0 == pass) {
+        in00  = _mm_load_si128((const __m128i *)(in +  0 * stride));
+        in01  = _mm_load_si128((const __m128i *)(in +  1 * stride));
+        in02  = _mm_load_si128((const __m128i *)(in +  2 * stride));
+        in03  = _mm_load_si128((const __m128i *)(in +  3 * stride));
+        in04  = _mm_load_si128((const __m128i *)(in +  4 * stride));
+        in05  = _mm_load_si128((const __m128i *)(in +  5 * stride));
+        in06  = _mm_load_si128((const __m128i *)(in +  6 * stride));
+        in07  = _mm_load_si128((const __m128i *)(in +  7 * stride));
+        in08  = _mm_load_si128((const __m128i *)(in +  8 * stride));
+        in09  = _mm_load_si128((const __m128i *)(in +  9 * stride));
+        in10  = _mm_load_si128((const __m128i *)(in + 10 * stride));
+        in11  = _mm_load_si128((const __m128i *)(in + 11 * stride));
+        in12  = _mm_load_si128((const __m128i *)(in + 12 * stride));
+        in13  = _mm_load_si128((const __m128i *)(in + 13 * stride));
+        in14  = _mm_load_si128((const __m128i *)(in + 14 * stride));
+        in15  = _mm_load_si128((const __m128i *)(in + 15 * stride));
+        // x = x << 2
+        in00 = _mm_slli_epi16(in00, 2);
+        in01 = _mm_slli_epi16(in01, 2);
+        in02 = _mm_slli_epi16(in02, 2);
+        in03 = _mm_slli_epi16(in03, 2);
+        in04 = _mm_slli_epi16(in04, 2);
+        in05 = _mm_slli_epi16(in05, 2);
+        in06 = _mm_slli_epi16(in06, 2);
+        in07 = _mm_slli_epi16(in07, 2);
+        in08 = _mm_slli_epi16(in08, 2);
+        in09 = _mm_slli_epi16(in09, 2);
+        in10 = _mm_slli_epi16(in10, 2);
+        in11 = _mm_slli_epi16(in11, 2);
+        in12 = _mm_slli_epi16(in12, 2);
+        in13 = _mm_slli_epi16(in13, 2);
+        in14 = _mm_slli_epi16(in14, 2);
+        in15 = _mm_slli_epi16(in15, 2);
+      } else {
+        in00  = _mm_load_si128((const __m128i *)(in +  0 * 16));
+        in01  = _mm_load_si128((const __m128i *)(in +  1 * 16));
+        in02  = _mm_load_si128((const __m128i *)(in +  2 * 16));
+        in03  = _mm_load_si128((const __m128i *)(in +  3 * 16));
+        in04  = _mm_load_si128((const __m128i *)(in +  4 * 16));
+        in05  = _mm_load_si128((const __m128i *)(in +  5 * 16));
+        in06  = _mm_load_si128((const __m128i *)(in +  6 * 16));
+        in07  = _mm_load_si128((const __m128i *)(in +  7 * 16));
+        in08  = _mm_load_si128((const __m128i *)(in +  8 * 16));
+        in09  = _mm_load_si128((const __m128i *)(in +  9 * 16));
+        in10  = _mm_load_si128((const __m128i *)(in + 10 * 16));
+        in11  = _mm_load_si128((const __m128i *)(in + 11 * 16));
+        in12  = _mm_load_si128((const __m128i *)(in + 12 * 16));
+        in13  = _mm_load_si128((const __m128i *)(in + 13 * 16));
+        in14  = _mm_load_si128((const __m128i *)(in + 14 * 16));
+        in15  = _mm_load_si128((const __m128i *)(in + 15 * 16));
+        // x = (x + 1) >> 2
+        in00 = _mm_add_epi16(in00, kOne);
+        in01 = _mm_add_epi16(in01, kOne);
+        in02 = _mm_add_epi16(in02, kOne);
+        in03 = _mm_add_epi16(in03, kOne);
+        in04 = _mm_add_epi16(in04, kOne);
+        in05 = _mm_add_epi16(in05, kOne);
+        in06 = _mm_add_epi16(in06, kOne);
+        in07 = _mm_add_epi16(in07, kOne);
+        in08 = _mm_add_epi16(in08, kOne);
+        in09 = _mm_add_epi16(in09, kOne);
+        in10 = _mm_add_epi16(in10, kOne);
+        in11 = _mm_add_epi16(in11, kOne);
+        in12 = _mm_add_epi16(in12, kOne);
+        in13 = _mm_add_epi16(in13, kOne);
+        in14 = _mm_add_epi16(in14, kOne);
+        in15 = _mm_add_epi16(in15, kOne);
+        in00 = _mm_srai_epi16(in00, 2);
+        in01 = _mm_srai_epi16(in01, 2);
+        in02 = _mm_srai_epi16(in02, 2);
+        in03 = _mm_srai_epi16(in03, 2);
+        in04 = _mm_srai_epi16(in04, 2);
+        in05 = _mm_srai_epi16(in05, 2);
+        in06 = _mm_srai_epi16(in06, 2);
+        in07 = _mm_srai_epi16(in07, 2);
+        in08 = _mm_srai_epi16(in08, 2);
+        in09 = _mm_srai_epi16(in09, 2);
+        in10 = _mm_srai_epi16(in10, 2);
+        in11 = _mm_srai_epi16(in11, 2);
+        in12 = _mm_srai_epi16(in12, 2);
+        in13 = _mm_srai_epi16(in13, 2);
+        in14 = _mm_srai_epi16(in14, 2);
+        in15 = _mm_srai_epi16(in15, 2);
+      }
+      in += 8;
+      // Calculate input for the first 8 results.
+      {
+        input0 = _mm_add_epi16(in00, in15);
+        input1 = _mm_add_epi16(in01, in14);
+        input2 = _mm_add_epi16(in02, in13);
+        input3 = _mm_add_epi16(in03, in12);
+        input4 = _mm_add_epi16(in04, in11);
+        input5 = _mm_add_epi16(in05, in10);
+        input6 = _mm_add_epi16(in06, in09);
+        input7 = _mm_add_epi16(in07, in08);
+      }
+      // Calculate input for the next 8 results.
+      {
+        step1_0 = _mm_sub_epi16(in07, in08);
+        step1_1 = _mm_sub_epi16(in06, in09);
+        step1_2 = _mm_sub_epi16(in05, in10);
+        step1_3 = _mm_sub_epi16(in04, in11);
+        step1_4 = _mm_sub_epi16(in03, in12);
+        step1_5 = _mm_sub_epi16(in02, in13);
+        step1_6 = _mm_sub_epi16(in01, in14);
+        step1_7 = _mm_sub_epi16(in00, in15);
+      }
+      // Work on the first eight values; fdct8_1d(input, even_results);
+      {
+        // Add/subtract
+        const __m128i q0 = _mm_add_epi16(input0, input7);
+        const __m128i q1 = _mm_add_epi16(input1, input6);
+        const __m128i q2 = _mm_add_epi16(input2, input5);
+        const __m128i q3 = _mm_add_epi16(input3, input4);
+        const __m128i q4 = _mm_sub_epi16(input3, input4);
+        const __m128i q5 = _mm_sub_epi16(input2, input5);
+        const __m128i q6 = _mm_sub_epi16(input1, input6);
+        const __m128i q7 = _mm_sub_epi16(input0, input7);
+        // Work on first four results
+        {
+          // Add/subtract
+          const __m128i r0 = _mm_add_epi16(q0, q3);
+          const __m128i r1 = _mm_add_epi16(q1, q2);
+          const __m128i r2 = _mm_sub_epi16(q1, q2);
+          const __m128i r3 = _mm_sub_epi16(q0, q3);
+          // Interleave to do the multiply by constants which gets us
+          // into 32 bits.
+          const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+          const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+          const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+          const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+          const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+          const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+          const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+          const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+          const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+          const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+          const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+          const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+          const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+          const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+          const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+          const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+          // Combine
+          res00 = _mm_packs_epi32(w0, w1);
+          res08 = _mm_packs_epi32(w2, w3);
+          res04 = _mm_packs_epi32(w4, w5);
+          res12 = _mm_packs_epi32(w6, w7);
+        }
+        // Work on next four results
+        {
+          // Interleave to do the multiply by constants which gets us
+          // into 32 bits.
+          const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+          const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+          const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
+          const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
+          const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
+          const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
+          // dct_const_round_shift
+          const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
+          const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
+          const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
+          const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
+          const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
+          const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
+          const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
+          const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
+          // Combine
+          const __m128i r0 = _mm_packs_epi32(s0, s1);
+          const __m128i r1 = _mm_packs_epi32(s2, s3);
+          // Add/subtract
+          const __m128i x0 = _mm_add_epi16(q4, r0);
+          const __m128i x1 = _mm_sub_epi16(q4, r0);
+          const __m128i x2 = _mm_sub_epi16(q7, r1);
+          const __m128i x3 = _mm_add_epi16(q7, r1);
+          // Interleave to do the multiply by constants which gets us
+          // into 32 bits.
+          const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+          const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+          const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+          const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+          const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+          const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+          const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+          const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+          const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+          const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+          const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+          const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+          const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+          const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+          const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+          const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+          // Combine
+          res02 = _mm_packs_epi32(w0, w1);
+          res14 = _mm_packs_epi32(w2, w3);
+          res10 = _mm_packs_epi32(w4, w5);
+          res06 = _mm_packs_epi32(w6, w7);
+        }
+      }
+      // Work on the next eight values; step1 -> odd_results
+      {
+        // step 2
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
+          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
+          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
+          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16);
+          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          // Combine
+          step2_2 = _mm_packs_epi32(w0, w1);
+          step2_3 = _mm_packs_epi32(w2, w3);
+        }
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
+          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
+          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
+          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16);
+          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          // Combine
+          step2_5 = _mm_packs_epi32(w0, w1);
+          step2_4 = _mm_packs_epi32(w2, w3);
+        }
+        // step 3
+        {
+          step3_0 = _mm_add_epi16(step1_0, step2_3);
+          step3_1 = _mm_add_epi16(step1_1, step2_2);
+          step3_2 = _mm_sub_epi16(step1_1, step2_2);
+          step3_3 = _mm_sub_epi16(step1_0, step2_3);
+          step3_4 = _mm_sub_epi16(step1_7, step2_4);
+          step3_5 = _mm_sub_epi16(step1_6, step2_5);
+          step3_6 = _mm_add_epi16(step1_6, step2_5);
+          step3_7 = _mm_add_epi16(step1_7, step2_4);
+        }
+        // step 4
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
+          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
+          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
+          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24);
+          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08);
+          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          // Combine
+          step2_1 = _mm_packs_epi32(w0, w1);
+          step2_2 = _mm_packs_epi32(w2, w3);
+        }
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
+          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
+          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
+          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08);
+          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          // Combine
+          step2_6 = _mm_packs_epi32(w0, w1);
+          step2_5 = _mm_packs_epi32(w2, w3);
+        }
+        // step 5
+        {
+          step1_0 = _mm_add_epi16(step3_0, step2_1);
+          step1_1 = _mm_sub_epi16(step3_0, step2_1);
+          step1_2 = _mm_sub_epi16(step3_3, step2_2);
+          step1_3 = _mm_add_epi16(step3_3, step2_2);
+          step1_4 = _mm_add_epi16(step3_4, step2_5);
+          step1_5 = _mm_sub_epi16(step3_4, step2_5);
+          step1_6 = _mm_sub_epi16(step3_7, step2_6);
+          step1_7 = _mm_add_epi16(step3_7, step2_6);
+        }
+        // step 6
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
+          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
+          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
+          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02);
+          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18);
+          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          // Combine
+          res01 = _mm_packs_epi32(w0, w1);
+          res09 = _mm_packs_epi32(w2, w3);
+        }
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
+          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
+          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
+          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10);
+          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26);
+          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          // Combine
+          res05 = _mm_packs_epi32(w0, w1);
+          res13 = _mm_packs_epi32(w2, w3);
+        }
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
+          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
+          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
+          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22);
+          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06);
+          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          // Combine
+          res11 = _mm_packs_epi32(w0, w1);
+          res03 = _mm_packs_epi32(w2, w3);
+        }
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
+          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
+          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
+          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30);
+          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14);
+          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          // Combine
+          res15 = _mm_packs_epi32(w0, w1);
+          res07 = _mm_packs_epi32(w2, w3);
+        }
+      }
+      // Transpose the results, do it as two 8x8 transposes.
+      {
+        // 00 01 02 03 04 05 06 07
+        // 10 11 12 13 14 15 16 17
+        // 20 21 22 23 24 25 26 27
+        // 30 31 32 33 34 35 36 37
+        // 40 41 42 43 44 45 46 47
+        // 50 51 52 53 54 55 56 57
+        // 60 61 62 63 64 65 66 67
+        // 70 71 72 73 74 75 76 77
+        const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01);
+        const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03);
+        const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01);
+        const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03);
+        const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05);
+        const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07);
+        const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05);
+        const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07);
+        // 00 10 01 11 02 12 03 13
+        // 20 30 21 31 22 32 23 33
+        // 04 14 05 15 06 16 07 17
+        // 24 34 25 35 26 36 27 37
+        // 40 50 41 51 42 52 43 53
+        // 60 70 61 71 62 72 63 73
+        // 44 54 45 55 46 56 47 57
+        // 64 74 65 75 66 76 67 77
+        const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+        const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+        const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+        const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+        const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+        const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+        const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+        const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+        // 00 10 20 30 01 11 21 31
+        // 40 50 60 70 41 51 61 71
+        // 02 12 22 32 03 13 23 33
+        // 42 52 62 72 43 53 63 73
+        // 04 14 24 34 05 15 25 35
+        // 44 54 64 74 45 55 65 75
+        // 06 16 26 36 07 17 27 37
+        // 46 56 66 76 47 57 67 77
+        const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+        const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+        const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+        const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+        const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+        const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+        const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+        const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+        // 00 10 20 30 40 50 60 70
+        // 01 11 21 31 41 51 61 71
+        // 02 12 22 32 42 52 62 72
+        // 03 13 23 33 43 53 63 73
+        // 04 14 24 34 44 54 64 74
+        // 05 15 25 35 45 55 65 75
+        // 06 16 26 36 46 56 66 76
+        // 07 17 27 37 47 57 67 77
+        _mm_storeu_si128((__m128i *)(out + 0 * 16), tr2_0);
+        _mm_storeu_si128((__m128i *)(out + 1 * 16), tr2_1);
+        _mm_storeu_si128((__m128i *)(out + 2 * 16), tr2_2);
+        _mm_storeu_si128((__m128i *)(out + 3 * 16), tr2_3);
+        _mm_storeu_si128((__m128i *)(out + 4 * 16), tr2_4);
+        _mm_storeu_si128((__m128i *)(out + 5 * 16), tr2_5);
+        _mm_storeu_si128((__m128i *)(out + 6 * 16), tr2_6);
+        _mm_storeu_si128((__m128i *)(out + 7 * 16), tr2_7);
+      }
+      {
+        // 00 01 02 03 04 05 06 07
+        // 10 11 12 13 14 15 16 17
+        // 20 21 22 23 24 25 26 27
+        // 30 31 32 33 34 35 36 37
+        // 40 41 42 43 44 45 46 47
+        // 50 51 52 53 54 55 56 57
+        // 60 61 62 63 64 65 66 67
+        // 70 71 72 73 74 75 76 77
+        const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09);
+        const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11);
+        const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09);
+        const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11);
+        const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13);
+        const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15);
+        const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13);
+        const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15);
+        // 00 10 01 11 02 12 03 13
+        // 20 30 21 31 22 32 23 33
+        // 04 14 05 15 06 16 07 17
+        // 24 34 25 35 26 36 27 37
+        // 40 50 41 51 42 52 43 53
+        // 60 70 61 71 62 72 63 73
+        // 44 54 45 55 46 56 47 57
+        // 64 74 65 75 66 76 67 77
+        const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+        const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+        const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+        const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+        const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+        const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+        const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+        const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+        // 00 10 20 30 01 11 21 31
+        // 40 50 60 70 41 51 61 71
+        // 02 12 22 32 03 13 23 33
+        // 42 52 62 72 43 53 63 73
+        // 04 14 24 34 05 15 25 35
+        // 44 54 64 74 45 55 65 75
+        // 06 16 26 36 07 17 27 37
+        // 46 56 66 76 47 57 67 77
+        const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+        const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+        const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+        const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+        const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+        const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+        const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+        const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+        // 00 10 20 30 40 50 60 70
+        // 01 11 21 31 41 51 61 71
+        // 02 12 22 32 42 52 62 72
+        // 03 13 23 33 43 53 63 73
+        // 04 14 24 34 44 54 64 74
+        // 05 15 25 35 45 55 65 75
+        // 06 16 26 36 46 56 66 76
+        // 07 17 27 37 47 57 67 77
+        // Store results
+        _mm_store_si128((__m128i *)(out + 8 + 0 * 16), tr2_0);
+        _mm_store_si128((__m128i *)(out + 8 + 1 * 16), tr2_1);
+        _mm_store_si128((__m128i *)(out + 8 + 2 * 16), tr2_2);
+        _mm_store_si128((__m128i *)(out + 8 + 3 * 16), tr2_3);
+        _mm_store_si128((__m128i *)(out + 8 + 4 * 16), tr2_4);
+        _mm_store_si128((__m128i *)(out + 8 + 5 * 16), tr2_5);
+        _mm_store_si128((__m128i *)(out + 8 + 6 * 16), tr2_6);
+        _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7);
+      }
+      out += 8*16;
+    }
+    // Setup in/out for next pass.
+    in = intermediate;
+    out = output;
+  }
+}
+
+// Loads a 16x16 block as four 8x8 tiles via load_buffer_8x8_avx2.
+static INLINE void load_buffer_16x16_avx2(const int16_t* input, __m128i *in0,
+                                     __m128i *in1, int stride) {
+  const int16_t *const below = input + 8 * stride;  // 8 strides past input
+  // first 8 columns: in0 gets the tile at input, in0 + 8 the tile below it
+  load_buffer_8x8_avx2(input, in0, stride);
+  load_buffer_8x8_avx2(below, in0 + 8, stride);
+  // second 8 columns (8 samples to the right), split across in1 / in1 + 8
+  load_buffer_8x8_avx2(input + 8, in1, stride);
+  load_buffer_8x8_avx2(below + 8, in1 + 8, stride);
+}
+
+// Stores a 16x16 block as four 8x8 tiles via write_buffer_8x8_avx2.
+static INLINE void write_buffer_16x16_avx2(int16_t *output, __m128i *in0,
+                                      __m128i *in1, int stride) {
+  int16_t *const below = output + 8 * stride;  // 8 strides past output
+  // first 8 columns: in0 goes to output, in0 + 8 to the tile below it
+  write_buffer_8x8_avx2(output, in0, stride);
+  write_buffer_8x8_avx2(below, in0 + 8, stride);
+  write_buffer_8x8_avx2(output + 8, in1, stride);  // second 8 columns
+  write_buffer_8x8_avx2(below + 8, in1 + 8, stride);
+}
+
+// Transposes, in place, the 16x16 block held in res0[0..15] and res1[0..15],
+// working on one 8-register tile at a time with array_transpose_8x8_avx2.
+static INLINE void array_transpose_16x16_avx2(__m128i *res0, __m128i *res1) {
+  __m128i saved[8];  // transpose of res1[0..7], parked until the final copy
+  int k;
+
+  // res0[0..7] and res1[8..15] are transposed where they sit.
+  array_transpose_8x8_avx2(res0, res0);
+  // res1[0..7] and res0[8..15] trade places: the call writing res1 must come
+  // after the call that reads it, hence the detour through saved[].
+  array_transpose_8x8_avx2(res1, saved);
+  array_transpose_8x8_avx2(res0 + 8, res1);
+  array_transpose_8x8_avx2(res1 + 8, res1 + 8);
+
+  for (k = 0; k < 8; ++k) res0[8 + k] = saved[k];
+}
+
+// Calls right_shift_8x8_avx2 (bit count 2) at offsets 0 and 8 of each half.
+static INLINE void right_shift_16x16_avx2(__m128i *res0, __m128i *res1) {
+  int off;
+  // rounding pass: res0 is finished before res1; the count stays a literal
+  for (off = 0; off < 16; off += 8) right_shift_8x8_avx2(res0 + off, 2);
+  for (off = 0; off < 16; off += 8) right_shift_8x8_avx2(res1 + off, 2);
+}
+
+void fdct16_1d_8col_avx2(__m128i *in) {
+  // perform the 1-D DCT of a 16x16 block for 8 columns, in place on in[0..15]
+  __m128i i[8], s[8], p[8], t[8], u[16], v[16];  // u[], v[]: reused scratch
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
+  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
+  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
+  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
+  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  // stage 1: i[k] = in[k] + in[15 - k], s[k] = in[7 - k] - in[8 + k]
+  i[0] = _mm_add_epi16(in[0], in[15]);
+  i[1] = _mm_add_epi16(in[1], in[14]);
+  i[2] = _mm_add_epi16(in[2], in[13]);
+  i[3] = _mm_add_epi16(in[3], in[12]);
+  i[4] = _mm_add_epi16(in[4], in[11]);
+  i[5] = _mm_add_epi16(in[5], in[10]);
+  i[6] = _mm_add_epi16(in[6], in[9]);
+  i[7] = _mm_add_epi16(in[7], in[8]);
+
+  s[0] = _mm_sub_epi16(in[7], in[8]);
+  s[1] = _mm_sub_epi16(in[6], in[9]);
+  s[2] = _mm_sub_epi16(in[5], in[10]);
+  s[3] = _mm_sub_epi16(in[4], in[11]);
+  s[4] = _mm_sub_epi16(in[3], in[12]);
+  s[5] = _mm_sub_epi16(in[2], in[13]);
+  s[6] = _mm_sub_epi16(in[1], in[14]);
+  s[7] = _mm_sub_epi16(in[0], in[15]);
+  // i[] is folded again the same way: sums in p[0..3], differences in p[4..7]
+  p[0] = _mm_add_epi16(i[0], i[7]);
+  p[1] = _mm_add_epi16(i[1], i[6]);
+  p[2] = _mm_add_epi16(i[2], i[5]);
+  p[3] = _mm_add_epi16(i[3], i[4]);
+  p[4] = _mm_sub_epi16(i[3], i[4]);
+  p[5] = _mm_sub_epi16(i[2], i[5]);
+  p[6] = _mm_sub_epi16(i[1], i[6]);
+  p[7] = _mm_sub_epi16(i[0], i[7]);
+  // p[0..3] -> in[0], in[8], in[4], in[12]
+  u[0] = _mm_add_epi16(p[0], p[3]);
+  u[1] = _mm_add_epi16(p[1], p[2]);
+  u[2] = _mm_sub_epi16(p[1], p[2]);
+  u[3] = _mm_sub_epi16(p[0], p[3]);
+  // interleave the two operands so that each madd lane sees one pair of them
+  v[0] = _mm_unpacklo_epi16(u[0], u[1]);
+  v[1] = _mm_unpackhi_epi16(u[0], u[1]);
+  v[2] = _mm_unpacklo_epi16(u[2], u[3]);
+  v[3] = _mm_unpackhi_epi16(u[2], u[3]);
+
+  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
+  u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
+  u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
+  u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
+  u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
+  u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
+  u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
+  u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);
+  // round: add k__DCT_CONST_ROUNDING, then shift right by DCT_CONST_BITS
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+  // in[] may be overwritten now: i[] and s[] already captured all 16 inputs
+  in[0] = _mm_packs_epi32(u[0], u[1]);
+  in[4] = _mm_packs_epi32(u[4], u[5]);
+  in[8] = _mm_packs_epi32(u[2], u[3]);
+  in[12] = _mm_packs_epi32(u[6], u[7]);
+  // combine p[5] and p[6] with the +/-cospi_16_64 constants into u[0], u[1]
+  u[0] = _mm_unpacklo_epi16(p[5], p[6]);
+  u[1] = _mm_unpackhi_epi16(p[5], p[6]);
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+
+  u[0] = _mm_packs_epi32(v[0], v[1]);
+  u[1] = _mm_packs_epi32(v[2], v[3]);
+  // t[0..3]: p[4] and p[7] plus/minus the two values just computed
+  t[0] = _mm_add_epi16(p[4], u[0]);
+  t[1] = _mm_sub_epi16(p[4], u[0]);
+  t[2] = _mm_sub_epi16(p[7], u[1]);
+  t[3] = _mm_add_epi16(p[7], u[1]);
+  // (t[0], t[3]) -> in[2], in[14]; (t[1], t[2]) -> in[10], in[6]
+  u[0] = _mm_unpacklo_epi16(t[0], t[3]);
+  u[1] = _mm_unpackhi_epi16(t[0], t[3]);
+  u[2] = _mm_unpacklo_epi16(t[1], t[2]);
+  u[3] = _mm_unpackhi_epi16(t[1], t[2]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
+  v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
+  v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
+  v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
+  v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+  in[2] = _mm_packs_epi32(v[0], v[1]);
+  in[6] = _mm_packs_epi32(v[4], v[5]);
+  in[10] = _mm_packs_epi32(v[2], v[3]);
+  in[14] = _mm_packs_epi32(v[6], v[7]);
+
+  // stage 2: s[2..5] -> t[2..5] (stages 2-6 produce the odd-indexed outputs)
+  u[0] = _mm_unpacklo_epi16(s[2], s[5]);
+  u[1] = _mm_unpackhi_epi16(s[2], s[5]);
+  u[2] = _mm_unpacklo_epi16(s[3], s[4]);
+  u[3] = _mm_unpackhi_epi16(s[3], s[4]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+  v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+  v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+  v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+  t[2] = _mm_packs_epi32(v[0], v[1]);
+  t[3] = _mm_packs_epi32(v[2], v[3]);
+  t[4] = _mm_packs_epi32(v[4], v[5]);
+  t[5] = _mm_packs_epi32(v[6], v[7]);
+
+  // stage 3: add/subtract t[2..5] to/from s[0], s[1], s[6], s[7]
+  p[0] = _mm_add_epi16(s[0], t[3]);
+  p[1] = _mm_add_epi16(s[1], t[2]);
+  p[2] = _mm_sub_epi16(s[1], t[2]);
+  p[3] = _mm_sub_epi16(s[0], t[3]);
+  p[4] = _mm_sub_epi16(s[7], t[4]);
+  p[5] = _mm_sub_epi16(s[6], t[5]);
+  p[6] = _mm_add_epi16(s[6], t[5]);
+  p[7] = _mm_add_epi16(s[7], t[4]);
+
+  // stage 4: (p[1], p[6]) -> t[1], t[6]; (p[2], p[5]) -> t[2], t[5]
+  u[0] = _mm_unpacklo_epi16(p[1], p[6]);
+  u[1] = _mm_unpackhi_epi16(p[1], p[6]);
+  u[2] = _mm_unpacklo_epi16(p[2], p[5]);
+  u[3] = _mm_unpackhi_epi16(p[2], p[5]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
+  v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08);
+  v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24);
+  v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
+  v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+  t[1] = _mm_packs_epi32(v[0], v[1]);
+  t[2] = _mm_packs_epi32(v[2], v[3]);
+  t[5] = _mm_packs_epi32(v[4], v[5]);
+  t[6] = _mm_packs_epi32(v[6], v[7]);
+
+  // stage 5: s[] is reused; p[0], p[3], p[4], p[7] +/- the stage 4 values
+  s[0] = _mm_add_epi16(p[0], t[1]);
+  s[1] = _mm_sub_epi16(p[0], t[1]);
+  s[2] = _mm_sub_epi16(p[3], t[2]);
+  s[3] = _mm_add_epi16(p[3], t[2]);
+  s[4] = _mm_add_epi16(p[4], t[5]);
+  s[5] = _mm_sub_epi16(p[4], t[5]);
+  s[6] = _mm_sub_epi16(p[7], t[6]);
+  s[7] = _mm_add_epi16(p[7], t[6]);
+
+  // stage 6: each pair (s[k], s[7 - k]) yields two of the odd-indexed outputs
+  u[0] = _mm_unpacklo_epi16(s[0], s[7]);
+  u[1] = _mm_unpackhi_epi16(s[0], s[7]);
+  u[2] = _mm_unpacklo_epi16(s[1], s[6]);
+  u[3] = _mm_unpackhi_epi16(s[1], s[6]);
+  u[4] = _mm_unpacklo_epi16(s[2], s[5]);
+  u[5] = _mm_unpackhi_epi16(s[2], s[5]);
+  u[6] = _mm_unpacklo_epi16(s[3], s[4]);
+  u[7] = _mm_unpackhi_epi16(s[3], s[4]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
+  v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
+  v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
+  v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
+  v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
+  v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
+  v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
+  v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
+  v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
+  v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
+  v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
+  v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
+  v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+  // odd-indexed outputs, stored in the order their products were formed above
+  in[1]  = _mm_packs_epi32(v[0], v[1]);
+  in[9]  = _mm_packs_epi32(v[2], v[3]);
+  in[5]  = _mm_packs_epi32(v[4], v[5]);
+  in[13] = _mm_packs_epi32(v[6], v[7]);
+  in[3]  = _mm_packs_epi32(v[8], v[9]);
+  in[11] = _mm_packs_epi32(v[10], v[11]);
+  in[7]  = _mm_packs_epi32(v[12], v[13]);
+  in[15] = _mm_packs_epi32(v[14], v[15]);
+}
+
+void fadst16_1d_8col_avx2(__m128i *in) {
+  // perform 16x16 1-D ADST for 8 columns
+  __m128i s[16], x[16], u[32], v[32];
+  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
+  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
+  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
+  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
+  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
+  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
+  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
+  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i kZero = _mm_set1_epi16(0);
+
+  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
+  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
+  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
+  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
+  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
+  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
+  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
+  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
+  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
+  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
+  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
+  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
+  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
+  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
+  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
+  u[15] = _mm_unpackhi_epi16(in[1], in[14]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
+  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
+  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
+  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
+  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
+  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
+  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
+  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
+  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
+  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
+  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
+  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
+  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
+  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
+  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
+  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
+  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
+
+  u[0] = _mm_add_epi32(v[0], v[16]);
+  u[1] = _mm_add_epi32(v[1], v[17]);
+  u[2] = _mm_add_epi32(v[2], v[18]);
+  u[3] = _mm_add_epi32(v[3], v[19]);
+  u[4] = _mm_add_epi32(v[4], v[20]);
+  u[5] = _mm_add_epi32(v[5], v[21]);
+  u[6] = _mm_add_epi32(v[6], v[22]);
+  u[7] = _mm_add_epi32(v[7], v[23]);
+  u[8] = _mm_add_epi32(v[8], v[24]);
+  u[9] = _mm_add_epi32(v[9], v[25]);
+  u[10] = _mm_add_epi32(v[10], v[26]);
+  u[11] = _mm_add_epi32(v[11], v[27]);
+  u[12] = _mm_add_epi32(v[12], v[28]);
+  u[13] = _mm_add_epi32(v[13], v[29]);
+  u[14] = _mm_add_epi32(v[14], v[30]);
+  u[15] = _mm_add_epi32(v[15], v[31]);
+  u[16] = _mm_sub_epi32(v[0], v[16]);
+  u[17] = _mm_sub_epi32(v[1], v[17]);
+  u[18] = _mm_sub_epi32(v[2], v[18]);
+  u[19] = _mm_sub_epi32(v[3], v[19]);
+  u[20] = _mm_sub_epi32(v[4], v[20]);
+  u[21] = _mm_sub_epi32(v[5], v[21]);
+  u[22] = _mm_sub_epi32(v[6], v[22]);
+  u[23] = _mm_sub_epi32(v[7], v[23]);
+  u[24] = _mm_sub_epi32(v[8], v[24]);
+  u[25] = _mm_sub_epi32(v[9], v[25]);
+  u[26] = _mm_sub_epi32(v[10], v[26]);
+  u[27] = _mm_sub_epi32(v[11], v[27]);
+  u[28] = _mm_sub_epi32(v[12], v[28]);
+  u[29] = _mm_sub_epi32(v[13], v[29]);
+  u[30] = _mm_sub_epi32(v[14], v[30]);
+  u[31] = _mm_sub_epi32(v[15], v[31]);
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
+  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
+  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
+  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
+  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
+  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
+  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
+  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
+  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
+  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
+  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
+  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
+  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
+  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
+  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
+  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
+  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
+  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
+  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
+  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
+  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
+  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
+  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
+  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
+  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
+  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
+  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
+  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
+  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
+  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
+  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
+
+  s[0] = _mm_packs_epi32(u[0], u[1]);
+  s[1] = _mm_packs_epi32(u[2], u[3]);
+  s[2] = _mm_packs_epi32(u[4], u[5]);
+  s[3] = _mm_packs_epi32(u[6], u[7]);
+  s[4] = _mm_packs_epi32(u[8], u[9]);
+  s[5] = _mm_packs_epi32(u[10], u[11]);
+  s[6] = _mm_packs_epi32(u[12], u[13]);
+  s[7] = _mm_packs_epi32(u[14], u[15]);
+  s[8] = _mm_packs_epi32(u[16], u[17]);
+  s[9] = _mm_packs_epi32(u[18], u[19]);
+  s[10] = _mm_packs_epi32(u[20], u[21]);
+  s[11] = _mm_packs_epi32(u[22], u[23]);
+  s[12] = _mm_packs_epi32(u[24], u[25]);
+  s[13] = _mm_packs_epi32(u[26], u[27]);
+  s[14] = _mm_packs_epi32(u[28], u[29]);
+  s[15] = _mm_packs_epi32(u[30], u[31]);
+
+  // stage 2
+  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
+  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
+  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
+  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
+  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
+  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
+  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
+
+  u[0] = _mm_add_epi32(v[0], v[8]);
+  u[1] = _mm_add_epi32(v[1], v[9]);
+  u[2] = _mm_add_epi32(v[2], v[10]);
+  u[3] = _mm_add_epi32(v[3], v[11]);
+  u[4] = _mm_add_epi32(v[4], v[12]);
+  u[5] = _mm_add_epi32(v[5], v[13]);
+  u[6] = _mm_add_epi32(v[6], v[14]);
+  u[7] = _mm_add_epi32(v[7], v[15]);
+  u[8] = _mm_sub_epi32(v[0], v[8]);
+  u[9] = _mm_sub_epi32(v[1], v[9]);
+  u[10] = _mm_sub_epi32(v[2], v[10]);
+  u[11] = _mm_sub_epi32(v[3], v[11]);
+  u[12] = _mm_sub_epi32(v[4], v[12]);
+  u[13] = _mm_sub_epi32(v[5], v[13]);
+  u[14] = _mm_sub_epi32(v[6], v[14]);
+  u[15] = _mm_sub_epi32(v[7], v[15]);
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+  x[0] = _mm_add_epi16(s[0], s[4]);
+  x[1] = _mm_add_epi16(s[1], s[5]);
+  x[2] = _mm_add_epi16(s[2], s[6]);
+  x[3] = _mm_add_epi16(s[3], s[7]);
+  x[4] = _mm_sub_epi16(s[0], s[4]);
+  x[5] = _mm_sub_epi16(s[1], s[5]);
+  x[6] = _mm_sub_epi16(s[2], s[6]);
+  x[7] = _mm_sub_epi16(s[3], s[7]);
+  x[8] = _mm_packs_epi32(u[0], u[1]);
+  x[9] = _mm_packs_epi32(u[2], u[3]);
+  x[10] = _mm_packs_epi32(u[4], u[5]);
+  x[11] = _mm_packs_epi32(u[6], u[7]);
+  x[12] = _mm_packs_epi32(u[8], u[9]);
+  x[13] = _mm_packs_epi32(u[10], u[11]);
+  x[14] = _mm_packs_epi32(u[12], u[13]);
+  x[15] = _mm_packs_epi32(u[14], u[15]);
+
+  // stage 3
+  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
+  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
+  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
+  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
+  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
+  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
+  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
+  u[7] = _mm_unpackhi_epi16(x[14], x[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
+
+  u[0] = _mm_add_epi32(v[0], v[4]);
+  u[1] = _mm_add_epi32(v[1], v[5]);
+  u[2] = _mm_add_epi32(v[2], v[6]);
+  u[3] = _mm_add_epi32(v[3], v[7]);
+  u[4] = _mm_sub_epi32(v[0], v[4]);
+  u[5] = _mm_sub_epi32(v[1], v[5]);
+  u[6] = _mm_sub_epi32(v[2], v[6]);
+  u[7] = _mm_sub_epi32(v[3], v[7]);
+  u[8] = _mm_add_epi32(v[8], v[12]);
+  u[9] = _mm_add_epi32(v[9], v[13]);
+  u[10] = _mm_add_epi32(v[10], v[14]);
+  u[11] = _mm_add_epi32(v[11], v[15]);
+  u[12] = _mm_sub_epi32(v[8], v[12]);
+  u[13] = _mm_sub_epi32(v[9], v[13]);
+  u[14] = _mm_sub_epi32(v[10], v[14]);
+  u[15] = _mm_sub_epi32(v[11], v[15]);
+
+  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  s[0] = _mm_add_epi16(x[0], x[2]);
+  s[1] = _mm_add_epi16(x[1], x[3]);
+  s[2] = _mm_sub_epi16(x[0], x[2]);
+  s[3] = _mm_sub_epi16(x[1], x[3]);
+  s[4] = _mm_packs_epi32(v[0], v[1]);
+  s[5] = _mm_packs_epi32(v[2], v[3]);
+  s[6] = _mm_packs_epi32(v[4], v[5]);
+  s[7] = _mm_packs_epi32(v[6], v[7]);
+  s[8] = _mm_add_epi16(x[8], x[10]);
+  s[9] = _mm_add_epi16(x[9], x[11]);
+  s[10] = _mm_sub_epi16(x[8], x[10]);
+  s[11] = _mm_sub_epi16(x[9], x[11]);
+  s[12] = _mm_packs_epi32(v[8], v[9]);
+  s[13] = _mm_packs_epi32(v[10], v[11]);
+  s[14] = _mm_packs_epi32(v[12], v[13]);
+  s[15] = _mm_packs_epi32(v[14], v[15]);
+
+  // stage 4
+  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
+  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
+  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
+  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
+  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
+  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
+  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  in[0] = s[0];
+  in[1] = _mm_sub_epi16(kZero, s[8]);
+  in[2] = s[12];
+  in[3] = _mm_sub_epi16(kZero, s[4]);
+  in[4] = _mm_packs_epi32(v[4], v[5]);
+  in[5] = _mm_packs_epi32(v[12], v[13]);
+  in[6] = _mm_packs_epi32(v[8], v[9]);
+  in[7] = _mm_packs_epi32(v[0], v[1]);
+  in[8] = _mm_packs_epi32(v[2], v[3]);
+  in[9] = _mm_packs_epi32(v[10], v[11]);
+  in[10] = _mm_packs_epi32(v[14], v[15]);
+  in[11] = _mm_packs_epi32(v[6], v[7]);
+  in[12] = s[5];
+  in[13] = _mm_sub_epi16(kZero, s[13]);
+  in[14] = s[9];
+  in[15] = _mm_sub_epi16(kZero, s[1]);
+}
+
+void fdct16_1d_avx2(__m128i *in0, __m128i *in1) {  // callers pass [16] each
+  fdct16_1d_8col_avx2(in0);  // pass over in0; presumably writes it in place
+  fdct16_1d_8col_avx2(in1);  // same kernel on the second buffer
+  array_transpose_16x16_avx2(in0, in1);  // runs after both kernel calls
+}
+
+void fadst16_1d_avx2(__m128i *in0, __m128i *in1) {  // callers pass [16] each
+  fadst16_1d_8col_avx2(in0);  // overwrites in0[0..15] with its results
+  fadst16_1d_8col_avx2(in1);  // same kernel on the second buffer
+  array_transpose_16x16_avx2(in0, in1);  // runs after both kernel calls
+}
+
+void vp9_short_fht16x16_avx2(const int16_t *input, int16_t *output,
+                             int stride, int tx_type) {
+  // tx_type: 0 = DCT_DCT, 1 = ADST_DCT, 2 = DCT_ADST, 3 = ADST_ADST.
+  // Bit 0 selects ADST for the first 1-D pass and bit 1 selects ADST for
+  // the second one; a clear bit means DCT for that pass.
+  __m128i in0[16], in1[16];
+
+  load_buffer_16x16_avx2(input, in0, in1, stride);
+  if (tx_type < 0 || tx_type > 3) {
+    // Unknown type: trap in debug builds. When assert() is compiled out,
+    // the loaded data reaches the store below without either pass applied.
+    assert(0);
+  } else {
+    // first pass
+    if (tx_type & 1) {
+      fadst16_1d_avx2(in0, in1);
+    } else {
+      fdct16_1d_avx2(in0, in1);
+    }
+    // right_shift_16x16_avx2 always runs between the two passes
+    right_shift_16x16_avx2(in0, in1);
+    // second pass
+    if (tx_type & 2) {
+      fadst16_1d_avx2(in0, in1);
+    } else {
+      fdct16_1d_avx2(in0, in1);
+    }
+  }
+  // The store's last argument is the constant 16 (presumably output stride).
+  write_buffer_16x16_avx2(output, in0, in1, 16);
+}
+
+#define FDCT32x32_2D_AVX2 vp9_fdct32x32_rd_avx2  // presumably the emitted name
+#define FDCT32x32_HIGH_PRECISION 0  // 0 here, 1 for the second inclusion
+#include "vp9/encoder/x86/vp9_dct32x32_avx2.c"
+#undef  FDCT32x32_2D_AVX2  // cleared so it can be redefined below
+#undef  FDCT32x32_HIGH_PRECISION
+
+#define FDCT32x32_2D_AVX2 vp9_fdct32x32_avx2  // presumably the emitted name
+#define FDCT32x32_HIGH_PRECISION 1  // same file included again with 1
+#include "vp9/encoder/x86/vp9_dct32x32_avx2.c" // NOLINT
+#undef  FDCT32x32_2D_AVX2
+#undef  FDCT32x32_HIGH_PRECISION
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 6e4a498cb..eefbd1ac9 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -124,7 +124,7 @@ VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_16_neon.c
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_avg_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM)
-#VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_16_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_16_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct4x4_1_add_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct4x4_add_neon$(ASM)
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index bd13518f5..ce83a6703 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -38,7 +38,6 @@ VP9_CX_SRCS-yes += encoder/vp9_firstpass.h
 VP9_CX_SRCS-yes += encoder/vp9_lookahead.c
 VP9_CX_SRCS-yes += encoder/vp9_lookahead.h
 VP9_CX_SRCS-yes += encoder/vp9_mcomp.h
-VP9_CX_SRCS-yes += encoder/vp9_modecosts.h
 VP9_CX_SRCS-yes += encoder/vp9_onyx_int.h
 VP9_CX_SRCS-yes += encoder/vp9_psnr.h
 VP9_CX_SRCS-yes += encoder/vp9_quantize.h
@@ -49,7 +48,6 @@ VP9_CX_SRCS-yes += encoder/vp9_tokenize.h
 VP9_CX_SRCS-yes += encoder/vp9_treewriter.h
 VP9_CX_SRCS-yes += encoder/vp9_variance.h
 VP9_CX_SRCS-yes += encoder/vp9_mcomp.c
-VP9_CX_SRCS-yes += encoder/vp9_modecosts.c
 VP9_CX_SRCS-yes += encoder/vp9_onyx_if.c
 VP9_CX_SRCS-yes += encoder/vp9_picklpf.c
 VP9_CX_SRCS-yes += encoder/vp9_picklpf.h
@@ -106,4 +104,7 @@ VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct32x32_sse2.c
 
+VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c
+VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct32x32_avx2.c
+
 VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
diff --git a/vp9/vp9dx.mk b/vp9/vp9dx.mk
index 7e76682d4..f43172170 100644
--- a/vp9/vp9dx.mk
+++ b/vp9/vp9dx.mk
@@ -30,7 +30,6 @@ VP9_DX_SRCS-yes += decoder/vp9_onyxd.h
 VP9_DX_SRCS-yes += decoder/vp9_onyxd_int.h
 VP9_DX_SRCS-yes += decoder/vp9_thread.c
 VP9_DX_SRCS-yes += decoder/vp9_thread.h
-VP9_DX_SRCS-yes += decoder/vp9_treereader.h
 VP9_DX_SRCS-yes += decoder/vp9_onyxd_if.c
 VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c
 VP9_DX_SRCS-yes += decoder/vp9_dsubexp.h