-rw-r--r--  vp9/common/vp9_entropymv.c                 |  40
-rw-r--r--  vp9/common/vp9_entropymv.h                 |   3
-rw-r--r--  vp9/common/vp9_quant_common.c              |  31
-rw-r--r--  vp9/common/vp9_reconintra.h                |   5
-rw-r--r--  vp9/common/vp9_reconintra4x4.c             |  64
-rw-r--r--  vp9/common/vp9_rtcd_defs.sh                |   2
-rw-r--r--  vp9/decoder/vp9_decodframe.c               | 151
-rw-r--r--  vp9/encoder/vp9_dct.c                      | 420
-rw-r--r--  vp9/encoder/vp9_encodeframe.c              |  10
-rw-r--r--  vp9/encoder/vp9_encodeintra.c              |   2
-rw-r--r--  vp9/encoder/vp9_firstpass.c                |  56
-rw-r--r--  vp9/encoder/vp9_mcomp.h                    |   2
-rw-r--r--  vp9/encoder/vp9_onyx_int.h                 |   5
-rw-r--r--  vp9/encoder/vp9_rdopt.c                    |   4
-rw-r--r--  vp9/encoder/x86/vp9_dct_sse2_intrinsics.c  | 626
15 files changed, 1106 insertions(+), 315 deletions(-)
diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c
index 9a7be4578..89dea4edc 100644
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c
@@ -42,9 +42,10 @@ const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2] = {
-MV_CLASS_2, -MV_CLASS_3,
10, 12,
-MV_CLASS_4, -MV_CLASS_5,
- 14, 16,
- -MV_CLASS_6, -MV_CLASS_7,
- -MV_CLASS_8, -MV_CLASS_9,
+ -MV_CLASS_6, 14,
+ 16, 18,
+ -MV_CLASS_7, -MV_CLASS_8,
+ -MV_CLASS_9, -MV_CLASS_10,
};
struct vp9_token_struct vp9_mv_class_encodings[MV_CLASSES];
@@ -64,24 +65,24 @@ const nmv_context vp9_default_nmv_context = {
{32, 64, 96},
{
{ /* vert component */
- 128, /* sign */
- {224, 144, 192, 168, 192, 176, 192, 198, 198}, /* class */
- {216}, /* class0 */
- {136, 140, 148, 160, 176, 192, 224, 234, 234}, /* bits */
- {{128, 128, 64}, {96, 112, 64}}, /* class0_fp */
- {64, 96, 64}, /* fp */
- 160, /* class0_hp bit */
- 128, /* hp */
+ 128, /* sign */
+ {224, 144, 192, 168, 192, 176, 192, 198, 198, 245}, /* class */
+ {216}, /* class0 */
+ {136, 140, 148, 160, 176, 192, 224, 234, 234, 240}, /* bits */
+ {{128, 128, 64}, {96, 112, 64}}, /* class0_fp */
+ {64, 96, 64}, /* fp */
+ 160, /* class0_hp bit */
+ 128, /* hp */
},
{ /* hor component */
- 128, /* sign */
- {216, 128, 176, 160, 176, 176, 192, 198, 198}, /* class */
- {208}, /* class0 */
- {136, 140, 148, 160, 176, 192, 224, 234, 234}, /* bits */
- {{128, 128, 64}, {96, 112, 64}}, /* class0_fp */
- {64, 96, 64}, /* fp */
- 160, /* class0_hp bit */
- 128, /* hp */
+ 128, /* sign */
+ {216, 128, 176, 160, 176, 176, 192, 198, 198, 208}, /* class */
+ {208}, /* class0 */
+ {136, 140, 148, 160, 176, 192, 224, 234, 234, 240}, /* bits */
+ {{128, 128, 64}, {96, 112, 64}}, /* class0_fp */
+ {64, 96, 64}, /* fp */
+ 160, /* class0_hp bit */
+ 128, /* hp */
}
},
};
@@ -107,6 +108,7 @@ MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) {
else if (z < CLASS0_SIZE * 1024) c = MV_CLASS_7;
else if (z < CLASS0_SIZE * 2048) c = MV_CLASS_8;
else if (z < CLASS0_SIZE * 4096) c = MV_CLASS_9;
+ else if (z < CLASS0_SIZE * 8192) c = MV_CLASS_10;
else assert(0);
if (offset)
*offset = z - mv_class_base(c);
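
For context, a minimal standalone sketch of the magnitude-to-class lookup that the new MV_CLASS_10 branch extends (z is a motion-vector component magnitude in 1/8-pel units; CLASS0_SIZE is assumed to be 2, as defined in vp9_entropymv.h):

  enum { SKETCH_CLASS0_SIZE = 2 };

  static int sketch_mv_class(int z) {
    int c;
    for (c = 0; c < 10; ++c)
      if (z < (SKETCH_CLASS0_SIZE * 8) << c)  /* class c covers z < CLASS0_SIZE * 8 * 2^c */
        return c;
    return 10;  /* MV_CLASS_10: (1024, 2048] integer pel; anything larger asserts above */
  }

Under that assumption, CLASS0_SIZE * 8192 is 16384 eighth-pel units, i.e. 2048 integer pels, the upper bound of the new class.
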
diff --git a/vp9/common/vp9_entropymv.h b/vp9/common/vp9_entropymv.h
index 33500069e..162d2b44f 100644
--- a/vp9/common/vp9_entropymv.h
+++ b/vp9/common/vp9_entropymv.h
@@ -49,7 +49,7 @@ extern const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2];
extern struct vp9_token_struct vp9_mv_joint_encodings [MV_JOINTS];
/* Symbols for coding magnitude class of nonzero components */
-#define MV_CLASSES 10
+#define MV_CLASSES 11
typedef enum {
MV_CLASS_0 = 0, /* (0, 2] integer pel */
MV_CLASS_1 = 1, /* (2, 4] integer pel */
@@ -61,6 +61,7 @@ typedef enum {
MV_CLASS_7 = 7, /* (128, 256] integer pel */
MV_CLASS_8 = 8, /* (256, 512] integer pel */
MV_CLASS_9 = 9, /* (512, 1024] integer pel */
+ MV_CLASS_10 = 10, /* (1024,2048] integer pel */
} MV_CLASS_TYPE;
extern const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2];
diff --git a/vp9/common/vp9_quant_common.c b/vp9/common/vp9_quant_common.c
index 119038121..90eb88ed3 100644
--- a/vp9/common/vp9_quant_common.c
+++ b/vp9/common/vp9_quant_common.c
@@ -52,21 +52,6 @@ int vp9_dc_quant(int QIndex, int Delta) {
return retval;
}
-int vp9_dc2quant(int QIndex, int Delta) {
- int retval;
-
- QIndex = QIndex + Delta;
-
- if (QIndex > MAXQ)
- QIndex = MAXQ;
- else if (QIndex < 0)
- QIndex = 0;
-
- retval = dc_qlookup[ QIndex ];
-
- return retval;
-
-}
int vp9_dc_uv_quant(int QIndex, int Delta) {
int retval;
@@ -94,22 +79,6 @@ int vp9_ac_yquant(int QIndex) {
return retval;
}
-int vp9_ac2quant(int QIndex, int Delta) {
- int retval;
-
- QIndex = QIndex + Delta;
-
- if (QIndex > MAXQ)
- QIndex = MAXQ;
- else if (QIndex < 0)
- QIndex = 0;
-
- retval = (ac_qlookup[ QIndex ] * 775) / 1000;
- if (retval < 4)
- retval = 4;
-
- return retval;
-}
int vp9_ac_uv_quant(int QIndex, int Delta) {
int retval;
diff --git a/vp9/common/vp9_reconintra.h b/vp9/common/vp9_reconintra.h
index 3031fb699..b97b6089d 100644
--- a/vp9/common/vp9_reconintra.h
+++ b/vp9/common/vp9_reconintra.h
@@ -17,9 +17,10 @@
void vp9_recon_intra_mbuv(MACROBLOCKD *xd);
B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
- int stride, int n);
+ int stride, int n,
+ int tx, int ty);
-B_PREDICTION_MODE vp9_find_bpred_context(BLOCKD *x);
+B_PREDICTION_MODE vp9_find_bpred_context(MACROBLOCKD *xd, BLOCKD *x);
#if CONFIG_COMP_INTERINTRA_PRED
void vp9_build_interintra_16x16_predictors_mb(MACROBLOCKD *xd,
diff --git a/vp9/common/vp9_reconintra4x4.c b/vp9/common/vp9_reconintra4x4.c
index 7fbee7c32..eab5ab495 100644
--- a/vp9/common/vp9_reconintra4x4.c
+++ b/vp9/common/vp9_reconintra4x4.c
@@ -15,17 +15,17 @@
#include "vp9_rtcd.h"
#if CONFIG_NEWBINTRAMODES
-static int find_grad_measure(uint8_t *x, int stride, int n, int t,
+static int find_grad_measure(uint8_t *x, int stride, int n, int tx, int ty,
int dx, int dy) {
int i, j;
int count = 0, gsum = 0, gdiv;
/* TODO: Make this code more efficient by breaking up into two loops */
- for (i = -t; i < n; ++i)
- for (j = -t; j < n; ++j) {
+ for (i = -ty; i < n; ++i)
+ for (j = -tx; j < n; ++j) {
int g;
if (i >= 0 && j >= 0) continue;
if (i + dy >= 0 && j + dx >= 0) continue;
- if (i + dy < -t || i + dy >= n || j + dx < -t || j + dx >= n) continue;
+ if (i + dy < -ty || i + dy >= n || j + dx < -tx || j + dx >= n) continue;
g = abs(x[(i + dy) * stride + j + dx] - x[i * stride + j]);
gsum += g * g;
count++;
@@ -36,14 +36,15 @@ static int find_grad_measure(uint8_t *x, int stride, int n, int t,
#if CONTEXT_PRED_REPLACEMENTS == 6
B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
- int stride, int n) {
+ int stride, int n,
+ int tx, int ty) {
int g[8], i, imin, imax;
- g[1] = find_grad_measure(ptr, stride, n, 4, 2, 1);
- g[2] = find_grad_measure(ptr, stride, n, 4, 1, 1);
- g[3] = find_grad_measure(ptr, stride, n, 4, 1, 2);
- g[5] = find_grad_measure(ptr, stride, n, 4, -1, 2);
- g[6] = find_grad_measure(ptr, stride, n, 4, -1, 1);
- g[7] = find_grad_measure(ptr, stride, n, 4, -2, 1);
+ g[1] = find_grad_measure(ptr, stride, n, tx, ty, 2, 1);
+ g[2] = find_grad_measure(ptr, stride, n, tx, ty, 1, 1);
+ g[3] = find_grad_measure(ptr, stride, n, tx, ty, 1, 2);
+ g[5] = find_grad_measure(ptr, stride, n, tx, ty, -1, 2);
+ g[6] = find_grad_measure(ptr, stride, n, tx, ty, -1, 1);
+ g[7] = find_grad_measure(ptr, stride, n, tx, ty, -2, 1);
imin = 1;
for (i = 2; i < 8; i += 1 + (i == 3))
imin = (g[i] < g[imin] ? i : imin);
@@ -73,12 +74,13 @@ B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
}
#elif CONTEXT_PRED_REPLACEMENTS == 4
B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
- int stride, int n) {
+ int stride, int n,
+ int tx, int ty) {
int g[8], i, imin, imax;
- g[1] = find_grad_measure(ptr, stride, n, 4, 2, 1);
- g[3] = find_grad_measure(ptr, stride, n, 4, 1, 2);
- g[5] = find_grad_measure(ptr, stride, n, 4, -1, 2);
- g[7] = find_grad_measure(ptr, stride, n, 4, -2, 1);
+ g[1] = find_grad_measure(ptr, stride, n, tx, ty, 2, 1);
+ g[3] = find_grad_measure(ptr, stride, n, tx, ty, 1, 2);
+ g[5] = find_grad_measure(ptr, stride, n, tx, ty, -1, 2);
+ g[7] = find_grad_measure(ptr, stride, n, tx, ty, -2, 1);
imin = 1;
for (i = 3; i < 8; i+=2)
imin = (g[i] < g[imin] ? i : imin);
@@ -104,16 +106,17 @@ B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
}
#elif CONTEXT_PRED_REPLACEMENTS == 0
B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
- int stride, int n) {
+ int stride, int n,
+ int tx, int ty) {
int g[8], i, imin, imax;
- g[0] = find_grad_measure(ptr, stride, n, 4, 1, 0);
- g[1] = find_grad_measure(ptr, stride, n, 4, 2, 1);
- g[2] = find_grad_measure(ptr, stride, n, 4, 1, 1);
- g[3] = find_grad_measure(ptr, stride, n, 4, 1, 2);
- g[4] = find_grad_measure(ptr, stride, n, 4, 0, 1);
- g[5] = find_grad_measure(ptr, stride, n, 4, -1, 2);
- g[6] = find_grad_measure(ptr, stride, n, 4, -1, 1);
- g[7] = find_grad_measure(ptr, stride, n, 4, -2, 1);
+ g[0] = find_grad_measure(ptr, stride, n, tx, ty, 1, 0);
+ g[1] = find_grad_measure(ptr, stride, n, tx, ty, 2, 1);
+ g[2] = find_grad_measure(ptr, stride, n, tx, ty, 1, 1);
+ g[3] = find_grad_measure(ptr, stride, n, tx, ty, 1, 2);
+ g[4] = find_grad_measure(ptr, stride, n, tx, ty, 0, 1);
+ g[5] = find_grad_measure(ptr, stride, n, tx, ty, -1, 2);
+ g[6] = find_grad_measure(ptr, stride, n, tx, ty, -1, 1);
+ g[7] = find_grad_measure(ptr, stride, n, tx, ty, -2, 1);
imax = 0;
for (i = 1; i < 8; i++)
imax = (g[i] > g[imax] ? i : imax);
@@ -144,10 +147,17 @@ B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
}
#endif
-B_PREDICTION_MODE vp9_find_bpred_context(BLOCKD *x) {
+B_PREDICTION_MODE vp9_find_bpred_context(MACROBLOCKD *xd, BLOCKD *x) {
+ const int block_idx = x - xd->block;
+ const int have_top = (block_idx >> 2) || xd->up_available;
+ const int have_left = (block_idx & 3) || xd->left_available;
uint8_t *ptr = *(x->base_dst) + x->dst;
int stride = x->dst_stride;
- return vp9_find_dominant_direction(ptr, stride, 4);
+ int tx = have_left ? 4 : 0;
+ int ty = have_top ? 4 : 0;
+ if (!have_left && !have_top)
+ return B_DC_PRED;
+ return vp9_find_dominant_direction(ptr, stride, 4, tx, ty);
}
#endif
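
A hypothetical standalone form of the availability logic added to vp9_find_bpred_context() above: the 16 4x4 blocks of a macroblock are indexed in raster order, so row = block_idx >> 2 and column = block_idx & 3, and only blocks on the top row or left column depend on the macroblock-level up/left availability flags.

  static void sketch_bpred_availability(int block_idx,      /* 0..15 within the MB */
                                        int mb_up_available,
                                        int mb_left_available,
                                        int *have_top, int *have_left) {
    *have_top  = (block_idx >> 2) != 0 || mb_up_available;
    *have_left = (block_idx & 3)  != 0 || mb_left_available;
    /* When neither neighbour exists, the caller above falls back to B_DC_PRED. */
  }
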
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 3e941859e..0c2a5c94a 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -598,7 +598,7 @@ prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int
specialize vp9_short_fdct32x32
prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_fdct16x16
+specialize vp9_short_fdct16x16 sse2
prototype void vp9_short_walsh4x4 "int16_t *InputData, int16_t *OutputData, int pitch"
specialize vp9_short_walsh4x4
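
The extra "sse2" token lets run-time CPU detection route vp9_short_fdct16x16 to the SSE2 version added in this change. Roughly, and only as a sketch of what the generated vp9_rtcd() setup ends up doing (not the literal generated code):

  void (*vp9_short_fdct16x16)(int16_t *InputData, int16_t *OutputData, int pitch) =
      vp9_short_fdct16x16_c;

  static void sketch_rtcd_setup(int have_sse2) {
    if (have_sse2)
      vp9_short_fdct16x16 = vp9_short_fdct16x16_sse2;
  }
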
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 07cf227b8..d1d07c753 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -44,6 +44,21 @@
int dec_debug = 0;
#endif
+
+static int read_le16(const uint8_t *p) {
+ return (p[1] << 8) | p[0];
+}
+
+static int read_le32(const uint8_t *p) {
+ return (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
+}
+
+// len == 0 is not allowed
+static int read_is_valid(const unsigned char *start, size_t len,
+ const unsigned char *end) {
+ return start + len > start && start + len <= end;
+}
+
static int merge_index(int v, int n, int modulus) {
int max1 = (n - 1 - modulus / 2) / modulus + 1;
if (v < max1) v = v * modulus + modulus / 2;
@@ -61,14 +76,13 @@ static int merge_index(int v, int n, int modulus) {
static int inv_remap_prob(int v, int m) {
const int n = 256;
const int modulus = MODULUS_PARAM;
- int i;
+
v = merge_index(v, n - 1, modulus);
if ((m << 1) <= n) {
- i = vp9_inv_recenter_nonneg(v + 1, m);
+ return vp9_inv_recenter_nonneg(v + 1, m);
} else {
- i = n - 1 - vp9_inv_recenter_nonneg(v + 1, n - 1 - m);
+ return n - 1 - vp9_inv_recenter_nonneg(v + 1, n - 1 - m);
}
- return i;
}
static vp9_prob read_prob_diff_update(vp9_reader *const bc, int oldp) {
@@ -112,8 +126,8 @@ static void mb_init_dequantizer(VP9D_COMP *pbi, MACROBLOCKD *mb) {
int i;
VP9_COMMON *const pc = &pbi->common;
- int segment_id = mb->mode_info_context->mbmi.segment_id;
- int qindex = get_qindex(mb, segment_id, pc->base_qindex);
+ const int segment_id = mb->mode_info_context->mbmi.segment_id;
+ const int qindex = get_qindex(mb, segment_id, pc->base_qindex);
mb->q_index = qindex;
for (i = 0; i < 16; i++)
@@ -287,12 +301,14 @@ static void decode_8x8(VP9D_COMP *pbi, MACROBLOCKD *xd,
int ib = vp9_i8x8_block[i];
BLOCKD *b = &xd->block[ib];
int i8x8mode = b->bmi.as_mode.first;
+
b = &xd->block[16 + i];
- vp9_intra_uv4x4_predict(xd, &xd->block[16 + i], i8x8mode, b->predictor);
+ vp9_intra_uv4x4_predict(xd, b, i8x8mode, b->predictor);
xd->itxm_add(b->qcoeff, b->dequant, b->predictor,
*(b->base_dst) + b->dst, 8, b->dst_stride, xd->eobs[16 + i]);
+
b = &xd->block[20 + i];
- vp9_intra_uv4x4_predict(xd, &xd->block[20 + i], i8x8mode, b->predictor);
+ vp9_intra_uv4x4_predict(xd, b, i8x8mode, b->predictor);
xd->itxm_add(b->qcoeff, b->dequant, b->predictor,
*(b->base_dst) + b->dst, 8, b->dst_stride, xd->eobs[20 + i]);
}
@@ -361,7 +377,7 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd,
int b_mode = xd->mode_info_context->bmi[i].as_mode.first;
#if CONFIG_NEWBINTRAMODES
xd->mode_info_context->bmi[i].as_mode.context = b->bmi.as_mode.context =
- vp9_find_bpred_context(b);
+ vp9_find_bpred_context(xd, b);
#endif
if (!xd->mode_info_context->mbmi.mb_skip_coeff)
eobtotal += vp9_decode_coefs_4x4(pbi, xd, bc, PLANE_TYPE_Y_WITH_DC, i);
@@ -798,9 +814,8 @@ static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
if (xd->mode_info_context->mbmi.mb_skip_coeff) {
vp9_reset_mb_tokens_context(xd);
} else if (!bool_error(bc)) {
- if (mode != B_PRED) {
+ if (mode != B_PRED)
eobtotal = vp9_decode_mb_tokens(pbi, xd, bc);
- }
}
//mode = xd->mode_info_context->mbmi.mode;
@@ -923,10 +938,9 @@ static void set_offsets(VP9D_COMP *pbi, int block_size,
xd->above_context = cm->above_context + mb_col;
xd->left_context = cm->left_context + (mb_row & 3);
- /* Distance of Mb to the various image edges.
- * These are specified to 8th pel as they are always compared to
- * values that are in 1/8th pel units
- */
+ // Distance of Mb to the various image edges.
+ // These are specified to 8th pel as they are always compared to
+ // values that are in 1/8th pel units
block_size >>= 4; // in mb units
set_mb_row(cm, xd, mb_row, block_size);
@@ -937,37 +951,31 @@ static void set_offsets(VP9D_COMP *pbi, int block_size,
xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
}
-static void set_refs(VP9D_COMP *pbi, int block_size,
- int mb_row, int mb_col) {
+static void set_refs(VP9D_COMP *pbi, int block_size, int mb_row, int mb_col) {
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
- MODE_INFO *mi = xd->mode_info_context;
- MB_MODE_INFO *const mbmi = &mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
if (mbmi->ref_frame > INTRA_FRAME) {
- int ref_fb_idx;
-
- /* Select the appropriate reference frame for this MB */
- ref_fb_idx = cm->active_ref_idx[mbmi->ref_frame - 1];
+ // Select the appropriate reference frame for this MB
+ int ref_fb_idx = cm->active_ref_idx[mbmi->ref_frame - 1];
xd->scale_factor[0] = cm->active_ref_scale[mbmi->ref_frame - 1];
xd->scale_factor_uv[0] = cm->active_ref_scale[mbmi->ref_frame - 1];
setup_pred_block(&xd->pre, &cm->yv12_fb[ref_fb_idx], mb_row, mb_col,
&xd->scale_factor[0], &xd->scale_factor_uv[0]);
- /* propagate errors from reference frames */
+ // propagate errors from reference frames
xd->corrupted |= cm->yv12_fb[ref_fb_idx].corrupted;
if (mbmi->second_ref_frame > INTRA_FRAME) {
- int second_ref_fb_idx;
-
- /* Select the appropriate reference frame for this MB */
- second_ref_fb_idx = cm->active_ref_idx[mbmi->second_ref_frame - 1];
+ // Select the appropriate reference frame for this MB
+ int second_ref_fb_idx = cm->active_ref_idx[mbmi->second_ref_frame - 1];
setup_pred_block(&xd->second_pre, &cm->yv12_fb[second_ref_fb_idx],
mb_row, mb_col,
&xd->scale_factor[1], &xd->scale_factor_uv[1]);
- /* propagate errors from reference frames */
+ // propagate errors from reference frames
xd->corrupted |= cm->yv12_fb[second_ref_fb_idx].corrupted;
}
}
@@ -1054,15 +1062,6 @@ static void decode_sb_row(VP9D_COMP *pbi, VP9_COMMON *pc,
}
}
-static unsigned int read_partition_size(const unsigned char *cx_size) {
- return cx_size[0] + (cx_size[1] << 8) + (cx_size[2] << 16);
-}
-
-static int read_is_valid(const unsigned char *start, size_t len,
- const unsigned char *end) {
- return start + len > start && start + len <= end;
-}
-
static void setup_token_decoder(VP9D_COMP *pbi,
const unsigned char *cx_data,
@@ -1090,7 +1089,7 @@ static void setup_token_decoder(VP9D_COMP *pbi,
static void init_frame(VP9D_COMP *pbi) {
VP9_COMMON *const pc = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
+ MACROBLOCKD *const xd = &pbi->mb;
if (pc->frame_type == KEY_FRAME) {
vp9_setup_past_independence(pc, xd);
@@ -1113,7 +1112,6 @@ static void init_frame(VP9D_COMP *pbi) {
xd->mode_info_context->mbmi.mode = DC_PRED;
xd->mode_info_stride = pc->mode_info_stride;
xd->corrupted = 0;
-
xd->fullpixel_mask = pc->full_pixel ? 0xfffffff8 : 0xffffffff;
}
@@ -1241,8 +1239,8 @@ static void update_frame_size(VP9D_COMP *pbi) {
VP9_COMMON *cm = &pbi->common;
/* our internal buffers are always multiples of 16 */
- int width = (cm->Width + 15) & ~15;
- int height = (cm->Height + 15) & ~15;
+ const int width = (cm->Width + 15) & ~15;
+ const int height = (cm->Height + 15) & ~15;
cm->mb_rows = height >> 4;
cm->mb_cols = width >> 4;
@@ -1261,8 +1259,8 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
BOOL_DECODER header_bc, residual_bc;
VP9_COMMON *const pc = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
- const unsigned char *data = (const unsigned char *)pbi->Source;
- const unsigned char *data_end = data + pbi->source_sz;
+ const uint8_t *data = (const uint8_t *)pbi->Source;
+ const uint8_t *data_end = data + pbi->source_sz;
ptrdiff_t first_partition_length_in_bytes = 0;
int mb_row;
@@ -1284,8 +1282,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
first_partition_length_in_bytes =
(data[0] | (data[1] << 8) | (data[2] << 16)) >> 5;
- if ((data + first_partition_length_in_bytes > data_end
- || data + first_partition_length_in_bytes < data))
+ if (!read_is_valid(data, first_partition_length_in_bytes, data_end))
vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
"Truncated packet or corrupt partition 0 length");
@@ -1314,8 +1311,8 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
* size.
*/
if (data + 5 < data_end) {
- pc->Width = (data[0] | (data[1] << 8));
- pc->Height = (data[2] | (data[3] << 8));
+ pc->Width = read_le16(data);
+ pc->Height = read_le16(data + 2);
pc->horiz_scale = data[4] >> 4;
pc->vert_scale = data[4] & 0x0F;
@@ -1467,7 +1464,6 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
pc->ref_pred_probs[0] = 120;
pc->ref_pred_probs[1] = 80;
pc->ref_pred_probs[2] = 40;
-
} else {
for (i = 0; i < PREDICTION_PROBS; i++) {
if (vp9_read_bit(&header_bc))
@@ -1481,10 +1477,11 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
if (xd->lossless) {
pc->txfm_mode = ONLY_4X4;
} else {
- /* Read the loop filter level and type */
+ // Read the loop filter level and type
pc->txfm_mode = vp9_read_literal(&header_bc, 2);
if (pc->txfm_mode == 3)
pc->txfm_mode += vp9_read_bit(&header_bc);
+
if (pc->txfm_mode == TX_MODE_SELECT) {
pc->prob_tx[0] = vp9_read_literal(&header_bc, 8);
pc->prob_tx[1] = vp9_read_literal(&header_bc, 8);
@@ -1511,7 +1508,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
xd->ref_lf_deltas[i] = (signed char)vp9_read_literal(&header_bc, 6);
if (vp9_read_bit(&header_bc)) /* Apply sign */
- xd->ref_lf_deltas[i] = xd->ref_lf_deltas[i] * -1;
+ xd->ref_lf_deltas[i] = -xd->ref_lf_deltas[i];
}
}
@@ -1522,7 +1519,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
xd->mode_lf_deltas[i] = (signed char)vp9_read_literal(&header_bc, 6);
if (vp9_read_bit(&header_bc)) /* Apply sign */
- xd->mode_lf_deltas[i] = xd->mode_lf_deltas[i] * -1;
+ xd->mode_lf_deltas[i] = -xd->mode_lf_deltas[i];
}
}
}
@@ -1570,14 +1567,13 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
pc->ref_frame_sign_bias[GOLDEN_FRAME] = vp9_read_bit(&header_bc);
pc->ref_frame_sign_bias[ALTREF_FRAME] = vp9_read_bit(&header_bc);
- /* Is high precision mv allowed */
+ // Is high precision mv allowed
xd->allow_high_precision_mv = (unsigned char)vp9_read_bit(&header_bc);
+
// Read the type of subpel filter to use
- if (vp9_read_bit(&header_bc)) {
- pc->mcomp_filter_type = SWITCHABLE;
- } else {
- pc->mcomp_filter_type = vp9_read_literal(&header_bc, 2);
- }
+ pc->mcomp_filter_type = vp9_read_bit(&header_bc) ? SWITCHABLE :
+ vp9_read_literal(&header_bc, 2);
+
#if CONFIG_COMP_INTERINTRA_PRED
pc->use_interintra = vp9_read_bit(&header_bc);
#endif
@@ -1731,7 +1727,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
/* tile info */
{
- const unsigned char *data_ptr = data + first_partition_length_in_bytes;
+ const uint8_t *data_ptr = data + first_partition_length_in_bytes;
int tile_row, tile_col, delta_log2_tiles;
vp9_get_tile_n_bits(pc, &pc->log2_tile_columns, &delta_log2_tiles);
@@ -1753,26 +1749,20 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
if (pbi->oxcf.inv_tile_order) {
const int n_cols = pc->tile_columns;
- const unsigned char *data_ptr2[4][1 << 6];
+ const uint8_t *data_ptr2[4][1 << 6];
BOOL_DECODER UNINITIALIZED_IS_SAFE(bc_bak);
// pre-initialize the offsets, we're going to read in inverse order
data_ptr2[0][0] = data_ptr;
for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) {
if (tile_row) {
- int size = data_ptr2[tile_row - 1][n_cols - 1][0] |
- (data_ptr2[tile_row - 1][n_cols - 1][1] << 8) |
- (data_ptr2[tile_row - 1][n_cols - 1][2] << 16) |
- (data_ptr2[tile_row - 1][n_cols - 1][3] << 24);
+ const int size = read_le32(data_ptr2[tile_row - 1][n_cols - 1]);
data_ptr2[tile_row - 1][n_cols - 1] += 4;
data_ptr2[tile_row][0] = data_ptr2[tile_row - 1][n_cols - 1] + size;
}
for (tile_col = 1; tile_col < n_cols; tile_col++) {
- int size = data_ptr2[tile_row][tile_col - 1][0] |
- (data_ptr2[tile_row][tile_col - 1][1] << 8) |
- (data_ptr2[tile_row][tile_col - 1][2] << 16) |
- (data_ptr2[tile_row][tile_col - 1][3] << 24);
+ const int size = read_le32(data_ptr2[tile_row][tile_col - 1]);
data_ptr2[tile_row][tile_col - 1] += 4;
data_ptr2[tile_row][tile_col] =
data_ptr2[tile_row][tile_col - 1] + size;
@@ -1813,10 +1803,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
}
if (tile_col < pc->tile_columns - 1 || tile_row < pc->tile_rows - 1) {
- int size = data_ptr[0] |
- (data_ptr[1] << 8) |
- (data_ptr[2] << 16) |
- (data_ptr[3] << 24);
+ int size = read_le32(data_ptr);
data_ptr += 4 + size;
}
}
@@ -1829,31 +1816,29 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
pc->last_width = pc->Width;
pc->last_height = pc->Height;
- /* Collect information about decoder corruption. */
- /* 1. Check first boolean decoder for errors. */
- pc->yv12_fb[pc->new_fb_idx].corrupted = bool_error(&header_bc);
- /* 2. Check the macroblock information */
- pc->yv12_fb[pc->new_fb_idx].corrupted |= corrupt_tokens;
+ // Collect information about decoder corruption.
+ // 1. Check first boolean decoder for errors.
+ // 2. Check the macroblock information
+ pc->yv12_fb[pc->new_fb_idx].corrupted = bool_error(&header_bc) |
+ corrupt_tokens;
if (!pbi->decoded_key_frame) {
- if (pc->frame_type == KEY_FRAME &&
- !pc->yv12_fb[pc->new_fb_idx].corrupted)
+ if (pc->frame_type == KEY_FRAME && !pc->yv12_fb[pc->new_fb_idx].corrupted)
pbi->decoded_key_frame = 1;
else
vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME,
"A stream must start with a complete key frame");
}
- if (!pc->error_resilient_mode &&
- !pc->frame_parallel_decoding_mode) {
+ if (!pc->error_resilient_mode && !pc->frame_parallel_decoding_mode) {
vp9_adapt_coef_probs(pc);
#if CONFIG_CODE_NONZEROCOUNT
vp9_adapt_nzc_probs(pc);
#endif
}
+
if (pc->frame_type != KEY_FRAME) {
- if (!pc->error_resilient_mode &&
- !pc->frame_parallel_decoding_mode) {
+ if (!pc->error_resilient_mode && !pc->frame_parallel_decoding_mode) {
vp9_adapt_mode_probs(pc);
vp9_adapt_nmv_probs(pc, xd->allow_high_precision_mv);
vp9_adapt_mode_context(&pbi->common);
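
A note on the byte-stream helpers introduced at the top of this file: partition and tile sizes are little-endian byte sequences, and read_is_valid() rejects both zero-length and out-of-range reads. A hypothetical standalone version of the partition-0 length check (the >> 5 skips the low header bits of the 3-byte field, which this change leaves untouched):

  #include <stddef.h>
  #include <stdint.h>

  static int sketch_partition0_ok(const uint8_t *data, const uint8_t *data_end) {
    const size_t len = (size_t)((data[0] | (data[1] << 8) | (data[2] << 16)) >> 5);
    /* Same idea as read_is_valid(): non-empty and entirely inside the buffer. */
    return len > 0 && data + len <= data_end;
  }
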
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index 6c0ab863a..6365ed9a2 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -269,6 +269,185 @@ void vp9_short_fdct8x8_c(int16_t *input, int16_t *final_output, int pitch) {
}
}
+void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) {
+ // The 2D transform is done with two passes which are actually pretty
+ // similar. In the first one, we transform the columns and transpose
+ // the results. In the second one, we transform the rows. To achieve that,
+ // as the first pass results are transposed, we transpose the columns (that
+ // is the transposed rows) and transpose the results (so that it goes back
+ // in normal/row positions).
+ const int stride = pitch >> 1;
+ int pass;
+ // We need an intermediate buffer between passes.
+ int16_t intermediate[256];
+ int16_t *in = input;
+ int16_t *out = intermediate;
+ // Do the two transform/transpose passes
+ for (pass = 0; pass < 2; ++pass) {
+ /*canbe16*/ int step1[8];
+ /*canbe16*/ int step2[8];
+ /*canbe16*/ int step3[8];
+ /*canbe16*/ int input[8];
+ /*needs32*/ int temp1, temp2;
+ int i;
+ for (i = 0; i < 16; i++) {
+ if (0 == pass) {
+ // Calculate input for the first 8 results.
+ input[0] = (in[0 * stride] + in[15 * stride]) << 2;
+ input[1] = (in[1 * stride] + in[14 * stride]) << 2;
+ input[2] = (in[2 * stride] + in[13 * stride]) << 2;
+ input[3] = (in[3 * stride] + in[12 * stride]) << 2;
+ input[4] = (in[4 * stride] + in[11 * stride]) << 2;
+ input[5] = (in[5 * stride] + in[10 * stride]) << 2;
+ input[6] = (in[6 * stride] + in[ 9 * stride]) << 2;
+ input[7] = (in[7 * stride] + in[ 8 * stride]) << 2;
+ // Calculate input for the next 8 results.
+ step1[0] = (in[7 * stride] - in[ 8 * stride]) << 2;
+ step1[1] = (in[6 * stride] - in[ 9 * stride]) << 2;
+ step1[2] = (in[5 * stride] - in[10 * stride]) << 2;
+ step1[3] = (in[4 * stride] - in[11 * stride]) << 2;
+ step1[4] = (in[3 * stride] - in[12 * stride]) << 2;
+ step1[5] = (in[2 * stride] - in[13 * stride]) << 2;
+ step1[6] = (in[1 * stride] - in[14 * stride]) << 2;
+ step1[7] = (in[0 * stride] - in[15 * stride]) << 2;
+ } else {
+ // Calculate input for the first 8 results.
+ input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
+ input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);
+ input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);
+ input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
+ input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
+ input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
+ input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);
+ input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);
+ // Calculate input for the next 8 results.
+ step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);
+ step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);
+ step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
+ step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
+ step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
+ step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);
+ step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
+ step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
+ }
+ // Work on the first eight values; fdct8_1d(input, even_results);
+ {
+ /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
+ /*needs32*/ int t0, t1, t2, t3;
+ /*canbe16*/ int x0, x1, x2, x3;
+
+ // stage 1
+ s0 = input[0] + input[7];
+ s1 = input[1] + input[6];
+ s2 = input[2] + input[5];
+ s3 = input[3] + input[4];
+ s4 = input[3] - input[4];
+ s5 = input[2] - input[5];
+ s6 = input[1] - input[6];
+ s7 = input[0] - input[7];
+
+ // fdct4_1d(step, step);
+ x0 = s0 + s3;
+ x1 = s1 + s2;
+ x2 = s1 - s2;
+ x3 = s0 - s3;
+ t0 = (x0 + x1) * cospi_16_64;
+ t1 = (x0 - x1) * cospi_16_64;
+ t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
+ t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
+ out[0] = dct_const_round_shift(t0);
+ out[4] = dct_const_round_shift(t2);
+ out[8] = dct_const_round_shift(t1);
+ out[12] = dct_const_round_shift(t3);
+
+ // Stage 2
+ t0 = (s6 - s5) * cospi_16_64;
+ t1 = (s6 + s5) * cospi_16_64;
+ t2 = dct_const_round_shift(t0);
+ t3 = dct_const_round_shift(t1);
+
+ // Stage 3
+ x0 = s4 + t2;
+ x1 = s4 - t2;
+ x2 = s7 - t3;
+ x3 = s7 + t3;
+
+ // Stage 4
+ t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
+ t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
+ t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+ t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
+ out[2] = dct_const_round_shift(t0);
+ out[6] = dct_const_round_shift(t2);
+ out[10] = dct_const_round_shift(t1);
+ out[14] = dct_const_round_shift(t3);
+ }
+ // Work on the next eight values; step1 -> odd_results
+ {
+ // step 2
+ temp1 = (step1[5] - step1[2]) * cospi_16_64;
+ temp2 = (step1[4] - step1[3]) * cospi_16_64;
+ step2[2] = dct_const_round_shift(temp1);
+ step2[3] = dct_const_round_shift(temp2);
+ temp1 = (step1[4] + step1[3]) * cospi_16_64;
+ temp2 = (step1[5] + step1[2]) * cospi_16_64;
+ step2[4] = dct_const_round_shift(temp1);
+ step2[5] = dct_const_round_shift(temp2);
+ // step 3
+ step3[0] = step1[0] + step2[3];
+ step3[1] = step1[1] + step2[2];
+ step3[2] = step1[1] - step2[2];
+ step3[3] = step1[0] - step2[3];
+ step3[4] = step1[7] - step2[4];
+ step3[5] = step1[6] - step2[5];
+ step3[6] = step1[6] + step2[5];
+ step3[7] = step1[7] + step2[4];
+ // step 4
+ temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
+ temp2 = step3[2] * -cospi_24_64 - step3[5] * cospi_8_64;
+ step2[1] = dct_const_round_shift(temp1);
+ step2[2] = dct_const_round_shift(temp2);
+ temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;
+ temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
+ step2[5] = dct_const_round_shift(temp1);
+ step2[6] = dct_const_round_shift(temp2);
+ // step 5
+ step1[0] = step3[0] + step2[1];
+ step1[1] = step3[0] - step2[1];
+ step1[2] = step3[3] - step2[2];
+ step1[3] = step3[3] + step2[2];
+ step1[4] = step3[4] + step2[5];
+ step1[5] = step3[4] - step2[5];
+ step1[6] = step3[7] - step2[6];
+ step1[7] = step3[7] + step2[6];
+ // step 6
+ temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
+ temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
+ out[1] = dct_const_round_shift(temp1);
+ out[9] = dct_const_round_shift(temp2);
+ temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
+ temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
+ out[5] = dct_const_round_shift(temp1);
+ out[13] = dct_const_round_shift(temp2);
+ temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
+ temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
+ out[3] = dct_const_round_shift(temp1);
+ out[11] = dct_const_round_shift(temp2);
+ temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
+ temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
+ out[7] = dct_const_round_shift(temp1);
+ out[15] = dct_const_round_shift(temp2);
+ }
+ // Do next column (which is a transposed row in second/horizontal pass)
+ in++;
+ out += 16;
+ }
+ // Setup in/out for next pass.
+ in = intermediate;
+ out = output;
+ }
+}
+
static void fadst8_1d(int16_t *input, int16_t *output) {
int s0, s1, s2, s3, s4, s5, s6, s7;
@@ -421,132 +600,145 @@ void vp9_short_walsh8x4_c(short *input, short *output, int pitch) {
// Rewrote to use same algorithm as others.
-static void fdct16_1d(int16_t input[16], int16_t output[16]) {
- int16_t step[16];
- int temp1, temp2;
+static void fdct16_1d(int16_t in[16], int16_t out[16]) {
+ /*canbe16*/ int step1[8];
+ /*canbe16*/ int step2[8];
+ /*canbe16*/ int step3[8];
+ /*canbe16*/ int input[8];
+ /*needs32*/ int temp1, temp2;
// step 1
- step[ 0] = input[0] + input[15];
- step[ 1] = input[1] + input[14];
- step[ 2] = input[2] + input[13];
- step[ 3] = input[3] + input[12];
- step[ 4] = input[4] + input[11];
- step[ 5] = input[5] + input[10];
- step[ 6] = input[6] + input[ 9];
- step[ 7] = input[7] + input[ 8];
- step[ 8] = input[7] - input[ 8];
- step[ 9] = input[6] - input[ 9];
- step[10] = input[5] - input[10];
- step[11] = input[4] - input[11];
- step[12] = input[3] - input[12];
- step[13] = input[2] - input[13];
- step[14] = input[1] - input[14];
- step[15] = input[0] - input[15];
-
- fdct8_1d(step, step);
+ input[0] = in[0] + in[15];
+ input[1] = in[1] + in[14];
+ input[2] = in[2] + in[13];
+ input[3] = in[3] + in[12];
+ input[4] = in[4] + in[11];
+ input[5] = in[5] + in[10];
+ input[6] = in[6] + in[ 9];
+ input[7] = in[7] + in[ 8];
+
+ step1[0] = in[7] - in[ 8];
+ step1[1] = in[6] - in[ 9];
+ step1[2] = in[5] - in[10];
+ step1[3] = in[4] - in[11];
+ step1[4] = in[3] - in[12];
+ step1[5] = in[2] - in[13];
+ step1[6] = in[1] - in[14];
+ step1[7] = in[0] - in[15];
+
+ // fdct8_1d(step, step);
+ {
+ /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
+ /*needs32*/ int t0, t1, t2, t3;
+ /*canbe16*/ int x0, x1, x2, x3;
+
+ // stage 1
+ s0 = input[0] + input[7];
+ s1 = input[1] + input[6];
+ s2 = input[2] + input[5];
+ s3 = input[3] + input[4];
+ s4 = input[3] - input[4];
+ s5 = input[2] - input[5];
+ s6 = input[1] - input[6];
+ s7 = input[0] - input[7];
+
+ // fdct4_1d(step, step);
+ x0 = s0 + s3;
+ x1 = s1 + s2;
+ x2 = s1 - s2;
+ x3 = s0 - s3;
+ t0 = (x0 + x1) * cospi_16_64;
+ t1 = (x0 - x1) * cospi_16_64;
+ t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
+ t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
+ out[0] = dct_const_round_shift(t0);
+ out[4] = dct_const_round_shift(t2);
+ out[8] = dct_const_round_shift(t1);
+ out[12] = dct_const_round_shift(t3);
+
+ // Stage 2
+ t0 = (s6 - s5) * cospi_16_64;
+ t1 = (s6 + s5) * cospi_16_64;
+ t2 = dct_const_round_shift(t0);
+ t3 = dct_const_round_shift(t1);
+
+ // Stage 3
+ x0 = s4 + t2;
+ x1 = s4 - t2;
+ x2 = s7 - t3;
+ x3 = s7 + t3;
+
+ // Stage 4
+ t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
+ t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
+ t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+ t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
+ out[2] = dct_const_round_shift(t0);
+ out[6] = dct_const_round_shift(t2);
+ out[10] = dct_const_round_shift(t1);
+ out[14] = dct_const_round_shift(t3);
+ }
// step 2
- output[8] = step[8];
- output[9] = step[9];
- temp1 = (-step[10] + step[13]) * cospi_16_64;
- temp2 = (-step[11] + step[12]) * cospi_16_64;
- output[10] = dct_const_round_shift(temp1);
- output[11] = dct_const_round_shift(temp2);
- temp1 = (step[11] + step[12]) * cospi_16_64;
- temp2 = (step[10] + step[13]) * cospi_16_64;
- output[12] = dct_const_round_shift(temp1);
- output[13] = dct_const_round_shift(temp2);
- output[14] = step[14];
- output[15] = step[15];
+ temp1 = (step1[5] - step1[2]) * cospi_16_64;
+ temp2 = (step1[4] - step1[3]) * cospi_16_64;
+ step2[2] = dct_const_round_shift(temp1);
+ step2[3] = dct_const_round_shift(temp2);
+ temp1 = (step1[4] + step1[3]) * cospi_16_64;
+ temp2 = (step1[5] + step1[2]) * cospi_16_64;
+ step2[4] = dct_const_round_shift(temp1);
+ step2[5] = dct_const_round_shift(temp2);
// step 3
- step[ 8] = output[8] + output[11];
- step[ 9] = output[9] + output[10];
- step[ 10] = output[9] - output[10];
- step[ 11] = output[8] - output[11];
- step[ 12] = -output[12] + output[15];
- step[ 13] = -output[13] + output[14];
- step[ 14] = output[13] + output[14];
- step[ 15] = output[12] + output[15];
+ step3[0] = step1[0] + step2[3];
+ step3[1] = step1[1] + step2[2];
+ step3[2] = step1[1] - step2[2];
+ step3[3] = step1[0] - step2[3];
+ step3[4] = step1[7] - step2[4];
+ step3[5] = step1[6] - step2[5];
+ step3[6] = step1[6] + step2[5];
+ step3[7] = step1[7] + step2[4];
// step 4
- output[8] = step[8];
- temp1 = -step[9] * cospi_8_64 + step[14] * cospi_24_64;
- temp2 = -step[10] * cospi_24_64 - step[13] * cospi_8_64;
- output[9] = dct_const_round_shift(temp1);
- output[10] = dct_const_round_shift(temp2);
- output[11] = step[11];
- output[12] = step[12];
- temp1 = -step[10] * cospi_8_64 + step[13] * cospi_24_64;
- temp2 = step[9] * cospi_24_64 + step[14] * cospi_8_64;
- output[13] = dct_const_round_shift(temp1);
- output[14] = dct_const_round_shift(temp2);
- output[15] = step[15];
+ temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
+ temp2 = step3[2] * -cospi_24_64 - step3[5] * cospi_8_64;
+ step2[1] = dct_const_round_shift(temp1);
+ step2[2] = dct_const_round_shift(temp2);
+ temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;
+ temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
+ step2[5] = dct_const_round_shift(temp1);
+ step2[6] = dct_const_round_shift(temp2);
// step 5
- step[8] = output[8] + output[9];
- step[9] = output[8] - output[9];
- step[10] = -output[10] + output[11];
- step[11] = output[10] + output[11];
- step[12] = output[12] + output[13];
- step[13] = output[12] - output[13];
- step[14] = -output[14] + output[15];
- step[15] = output[14] + output[15];
+ step1[0] = step3[0] + step2[1];
+ step1[1] = step3[0] - step2[1];
+ step1[2] = step3[3] - step2[2];
+ step1[3] = step3[3] + step2[2];
+ step1[4] = step3[4] + step2[5];
+ step1[5] = step3[4] - step2[5];
+ step1[6] = step3[7] - step2[6];
+ step1[7] = step3[7] + step2[6];
// step 6
- output[0] = step[0];
- output[8] = step[4];
- output[4] = step[2];
- output[12] = step[6];
- output[2] = step[1];
- output[10] = step[5];
- output[6] = step[3];
- output[14] = step[7];
-
- temp1 = step[8] * cospi_30_64 + step[15] * cospi_2_64;
- temp2 = step[9] * cospi_14_64 + step[14] * cospi_18_64;
- output[1] = dct_const_round_shift(temp1);
- output[9] = dct_const_round_shift(temp2);
-
- temp1 = step[10] * cospi_22_64 + step[13] * cospi_10_64;
- temp2 = step[11] * cospi_6_64 + step[12] * cospi_26_64;
- output[5] = dct_const_round_shift(temp1);
- output[13] = dct_const_round_shift(temp2);
-
- temp1 = -step[11] * cospi_26_64 + step[12] * cospi_6_64;
- temp2 = -step[10] * cospi_10_64 + step[13] * cospi_22_64;
- output[3] = dct_const_round_shift(temp1);
- output[11] = dct_const_round_shift(temp2);
-
- temp1 = -step[9] * cospi_18_64 + step[14] * cospi_14_64;
- temp2 = -step[8] * cospi_2_64 + step[15] * cospi_30_64;
- output[7] = dct_const_round_shift(temp1);
- output[15] = dct_const_round_shift(temp2);
-}
-
-void vp9_short_fdct16x16_c(int16_t *input, int16_t *out, int pitch) {
- int shortpitch = pitch >> 1;
- int i, j;
- int16_t output[256];
- int16_t temp_in[16], temp_out[16];
-
- // First transform columns
- for (i = 0; i < 16; i++) {
- for (j = 0; j < 16; j++)
- temp_in[j] = input[j * shortpitch + i] << 2;
- fdct16_1d(temp_in, temp_out);
- for (j = 0; j < 16; j++)
- output[j * 16 + i] = (temp_out[j] + 1) >> 2;
- }
-
- // Then transform rows
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j)
- temp_in[j] = output[j + i * 16];
- fdct16_1d(temp_in, temp_out);
- for (j = 0; j < 16; ++j)
- out[j + i * 16] = temp_out[j];
- }
+ temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
+ temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
+ out[1] = dct_const_round_shift(temp1);
+ out[9] = dct_const_round_shift(temp2);
+
+ temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
+ temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
+ out[5] = dct_const_round_shift(temp1);
+ out[13] = dct_const_round_shift(temp2);
+
+ temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
+ temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
+ out[3] = dct_const_round_shift(temp1);
+ out[11] = dct_const_round_shift(temp2);
+
+ temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
+ temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
+ out[7] = dct_const_round_shift(temp1);
+ out[15] = dct_const_round_shift(temp2);
}
void fadst16_1d(int16_t *input, int16_t *output) {
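
Both 16x16 paths in this file follow the same fixed-point plan: pass 1 scales the input up by 4 (<< 2) for precision and writes its results transposed into the intermediate buffer, and pass 2 scales back down with (x + 1) >> 2 before transforming the transposed rows. The rounding primitive they share, sketched standalone below, assumes DCT_CONST_BITS is 14 and that the cospi_* values are the usual 14-bit fixed-point cosine constants (e.g. cospi_16_64 is 16384 * cos(pi/4), rounded), as defined elsewhere in the codec:

  #define SKETCH_DCT_CONST_BITS     14
  #define SKETCH_DCT_CONST_ROUNDING (1 << (SKETCH_DCT_CONST_BITS - 1))

  /* Round a 32-bit product of a sample and a cospi_* constant back down to
     coefficient scale. */
  static int sketch_dct_const_round_shift(int input) {
    return (input + SKETCH_DCT_CONST_ROUNDING) >> SKETCH_DCT_CONST_BITS;
  }
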
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 8922661fe..428e585e1 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -594,9 +594,6 @@ static void update_state(VP9_COMP *cpi,
[vp9_switchable_interp_map[mbmi->interp_filter]];
}
- cpi->prediction_error += ctx->distortion;
- cpi->intra_error += ctx->intra_error;
-
cpi->rd_comp_pred_diff[SINGLE_PREDICTION_ONLY] += ctx->single_pred_diff;
cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff;
cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff;
@@ -1265,8 +1262,6 @@ static void encode_frame_internal(VP9_COMP *cpi) {
// Reset frame count of inter 0,0 motion vector usage.
cpi->inter_zz_count = 0;
- cpi->prediction_error = 0;
- cpi->intra_error = 0;
cpi->skip_true_count[0] = cpi->skip_true_count[1] = cpi->skip_true_count[2] = 0;
cpi->skip_false_count[0] = cpi->skip_false_count[1] = cpi->skip_false_count[2] = 0;
@@ -1292,8 +1287,9 @@ static void encode_frame_internal(VP9_COMP *cpi) {
vp9_zero(cpi->mb_mv_ref_count);
#endif
-
- // force lossless mode when Q0 is selected
+ // force lossless mode
+ if (cm->base_qindex <= 4)
+ cm->base_qindex = 0;
cpi->mb.e_mbd.lossless = (cm->base_qindex == 0 &&
cm->y1dc_delta_q == 0 &&
cm->uvdc_delta_q == 0 &&
diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c
index 3c98d4aa6..9e5bcea16 100644
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@@ -44,7 +44,7 @@ void vp9_encode_intra4x4block(MACROBLOCK *x, int ib) {
TX_TYPE tx_type;
#if CONFIG_NEWBINTRAMODES
- b->bmi.as_mode.context = vp9_find_bpred_context(b);
+ b->bmi.as_mode.context = vp9_find_bpred_context(&x->e_mbd, b);
#endif
vp9_intra4x4_predict(&x->e_mbd, b, b->bmi.as_mode.first, b->predictor);
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 5c2067b00..d646fe222 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -859,6 +859,8 @@ static double calc_correction_factor(double err_per_mb,
power_term = (power_term > pt_high) ? pt_high : power_term;
// Calculate correction factor
+ if (power_term < 1.0)
+ assert(error_term >= 0.0);
correction_factor = pow(error_term, power_term);
// Clip range
@@ -920,15 +922,19 @@ static int estimate_max_q(VP9_COMP *cpi,
// Look at the drop in prediction quality between the last frame
// and the GF buffer (which contained an older frame).
- sr_err_diff =
- (fpstats->sr_coded_error - fpstats->coded_error) /
- (fpstats->count * cpi->common.MBs);
- sr_correction = (sr_err_diff / 32.0);
- sr_correction = pow(sr_correction, 0.25);
- if (sr_correction < 0.75)
+ if (fpstats->sr_coded_error > fpstats->coded_error) {
+ sr_err_diff =
+ (fpstats->sr_coded_error - fpstats->coded_error) /
+ (fpstats->count * cpi->common.MBs);
+ sr_correction = (sr_err_diff / 32.0);
+ sr_correction = pow(sr_correction, 0.25);
+ if (sr_correction < 0.75)
+ sr_correction = 0.75;
+ else if (sr_correction > 1.25)
+ sr_correction = 1.25;
+ } else {
sr_correction = 0.75;
- else if (sr_correction > 1.25)
- sr_correction = 1.25;
+ }
// Calculate a corrective factor based on a rolling ratio of bits spent
// vs target bits
@@ -1031,15 +1037,19 @@ static int estimate_cq(VP9_COMP *cpi,
// Look at the drop in prediction quality between the last frame
// and the GF buffer (which contained an older frame).
- sr_err_diff =
- (fpstats->sr_coded_error - fpstats->coded_error) /
- (fpstats->count * cpi->common.MBs);
- sr_correction = (sr_err_diff / 32.0);
- sr_correction = pow(sr_correction, 0.25);
- if (sr_correction < 0.75)
+ if (fpstats->sr_coded_error > fpstats->coded_error) {
+ sr_err_diff =
+ (fpstats->sr_coded_error - fpstats->coded_error) /
+ (fpstats->count * cpi->common.MBs);
+ sr_correction = (sr_err_diff / 32.0);
+ sr_correction = pow(sr_correction, 0.25);
+ if (sr_correction < 0.75)
+ sr_correction = 0.75;
+ else if (sr_correction > 1.25)
+ sr_correction = 1.25;
+ } else {
sr_correction = 0.75;
- else if (sr_correction > 1.25)
- sr_correction = 1.25;
+ }
// II ratio correction factor for clip as a whole
clip_iiratio = cpi->twopass.total_stats->intra_error /
@@ -1178,12 +1188,16 @@ static double get_prediction_decay_rate(VP9_COMP *cpi,
mb_sr_err_diff =
(next_frame->sr_coded_error - next_frame->coded_error) /
(cpi->common.MBs);
- second_ref_decay = 1.0 - (mb_sr_err_diff / 512.0);
- second_ref_decay = pow(second_ref_decay, 0.5);
- if (second_ref_decay < 0.85)
+ if (mb_sr_err_diff <= 512.0) {
+ second_ref_decay = 1.0 - (mb_sr_err_diff / 512.0);
+ second_ref_decay = pow(second_ref_decay, 0.5);
+ if (second_ref_decay < 0.85)
+ second_ref_decay = 0.85;
+ else if (second_ref_decay > 1.0)
+ second_ref_decay = 1.0;
+ } else {
second_ref_decay = 0.85;
- else if (second_ref_decay > 1.0)
- second_ref_decay = 1.0;
+ }
if (second_ref_decay < prediction_decay_rate)
prediction_decay_rate = second_ref_decay;
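
The three hunks above share one pattern: pow() is only applied when its base is non-negative, and the correction is held inside its clip range. A hypothetical standalone form of the second-reference correction used in estimate_max_q()/estimate_cq():

  #include <math.h>

  static double sketch_sr_correction(double sr_coded_error, double coded_error,
                                     double count, double mbs) {
    double c = 0.75;
    if (sr_coded_error > coded_error) {
      const double sr_err_diff = (sr_coded_error - coded_error) / (count * mbs);
      c = pow(sr_err_diff / 32.0, 0.25);
      if (c < 0.75)
        c = 0.75;
      else if (c > 1.25)
        c = 1.25;
    }
    return c;
  }
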
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index d5c7032a9..fd1bb2b4e 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -21,7 +21,7 @@ void print_mode_context(VP9_COMMON *pc);
// The maximum number of steps in a step search given the largest
// allowed initial step
-#define MAX_MVSEARCH_STEPS 10
+#define MAX_MVSEARCH_STEPS 11
// Max full pel mv specified in 1 pel units
#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1)
// Maximum size of the first step in full pel units
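
With MAX_MVSEARCH_STEPS raised to 11, MAX_FULL_PEL_VAL works out to (1 << 11) - 1 = 2047 full pels, which lines up with the new (1024, 2048] integer-pel MV_CLASS_10 range added in vp9_entropymv.h.
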
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 300e12869..2e9f96c09 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -390,11 +390,6 @@ typedef struct VP9_COMP {
CODING_CONTEXT coding_context;
// Rate targetting variables
- int64_t prediction_error;
- int64_t last_prediction_error;
- int64_t intra_error;
- int64_t last_intra_error;
-
int this_frame_target;
int projected_frame_size;
int last_q[2]; // Separate values for Intra/Inter
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 3004d6bf3..d21224318 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1168,7 +1168,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,
DECLARE_ALIGNED_ARRAY(16, int16_t, best_dqcoeff, 16);
#if CONFIG_NEWBINTRAMODES
- b->bmi.as_mode.context = vp9_find_bpred_context(b);
+ b->bmi.as_mode.context = vp9_find_bpred_context(xd, b);
#endif
xd->mode_info_context->mbmi.txfm_size = TX_4X4;
for (mode = B_DC_PRED; mode < LEFT4X4; mode++) {
@@ -1279,7 +1279,7 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
bmode_costs = mb->bmode_costs[A][L];
}
#if CONFIG_NEWBINTRAMODES
- mic->bmi[i].as_mode.context = vp9_find_bpred_context(xd->block + i);
+ mic->bmi[i].as_mode.context = vp9_find_bpred_context(xd, xd->block + i);
#endif
total_rd += rd_pick_intra4x4block(
diff --git a/vp9/encoder/x86/vp9_dct_sse2_intrinsics.c b/vp9/encoder/x86/vp9_dct_sse2_intrinsics.c
index ff884d999..28c4c754e 100644
--- a/vp9/encoder/x86/vp9_dct_sse2_intrinsics.c
+++ b/vp9/encoder/x86/vp9_dct_sse2_intrinsics.c
@@ -270,3 +270,629 @@ void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
_mm_storeu_si128 ((__m128i *)(output + 7 * 8), in7);
}
}
+
+void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
+ // The 2D transform is done with two passes which are actually pretty
+ // similar. In the first one, we transform the columns and transpose
+ // the results. In the second one, we transform the rows. To achieve that,
+ // as the first pass results are transposed, we transpose the columns (that
+ // is the transposed rows) and transpose the results (so that it goes back
+ // in normal/row positions).
+ const int stride = pitch >> 1;
+ int pass;
+ // We need an intermediate buffer between passes.
+ int16_t intermediate[256];
+ int16_t *in = input;
+ int16_t *out = intermediate;
+ // Constants
+ // When we use them, in one case, they are all the same. In all others
+ // it's a pair of them that we need to repeat four times. This is done
+ // by constructing the 32 bit constant corresponding to that pair.
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
+ const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
+ const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
+ const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
+ const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i kOne = _mm_set1_epi16(1);
+ // Do the two transform/transpose passes
+ for (pass = 0; pass < 2; ++pass) {
+ // We process eight columns (transposed rows in second pass) at a time.
+ int column_start;
+ for (column_start = 0; column_start < 16; column_start += 8) {
+ __m128i in00, in01, in02, in03, in04, in05, in06, in07;
+ __m128i in08, in09, in10, in11, in12, in13, in14, in15;
+ __m128i input0, input1, input2, input3, input4, input5, input6, input7;
+ __m128i step1_0, step1_1, step1_2, step1_3;
+ __m128i step1_4, step1_5, step1_6, step1_7;
+ __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+ __m128i step3_0, step3_1, step3_2, step3_3;
+ __m128i step3_4, step3_5, step3_6, step3_7;
+ __m128i res00, res01, res02, res03, res04, res05, res06, res07;
+ __m128i res08, res09, res10, res11, res12, res13, res14, res15;
+ // Load and pre-condition input.
+ if (0 == pass) {
+ in00 = _mm_loadu_si128((const __m128i *)(in + 0 * stride));
+ in01 = _mm_loadu_si128((const __m128i *)(in + 1 * stride));
+ in02 = _mm_loadu_si128((const __m128i *)(in + 2 * stride));
+ in03 = _mm_loadu_si128((const __m128i *)(in + 3 * stride));
+ in04 = _mm_loadu_si128((const __m128i *)(in + 4 * stride));
+ in05 = _mm_loadu_si128((const __m128i *)(in + 5 * stride));
+ in06 = _mm_loadu_si128((const __m128i *)(in + 6 * stride));
+ in07 = _mm_loadu_si128((const __m128i *)(in + 7 * stride));
+ in08 = _mm_loadu_si128((const __m128i *)(in + 8 * stride));
+ in09 = _mm_loadu_si128((const __m128i *)(in + 9 * stride));
+ in10 = _mm_loadu_si128((const __m128i *)(in + 10 * stride));
+ in11 = _mm_loadu_si128((const __m128i *)(in + 11 * stride));
+ in12 = _mm_loadu_si128((const __m128i *)(in + 12 * stride));
+ in13 = _mm_loadu_si128((const __m128i *)(in + 13 * stride));
+ in14 = _mm_loadu_si128((const __m128i *)(in + 14 * stride));
+ in15 = _mm_loadu_si128((const __m128i *)(in + 15 * stride));
+ // x = x << 2
+ in00 = _mm_slli_epi16(in00, 2);
+ in01 = _mm_slli_epi16(in01, 2);
+ in02 = _mm_slli_epi16(in02, 2);
+ in03 = _mm_slli_epi16(in03, 2);
+ in04 = _mm_slli_epi16(in04, 2);
+ in05 = _mm_slli_epi16(in05, 2);
+ in06 = _mm_slli_epi16(in06, 2);
+ in07 = _mm_slli_epi16(in07, 2);
+ in08 = _mm_slli_epi16(in08, 2);
+ in09 = _mm_slli_epi16(in09, 2);
+ in10 = _mm_slli_epi16(in10, 2);
+ in11 = _mm_slli_epi16(in11, 2);
+ in12 = _mm_slli_epi16(in12, 2);
+ in13 = _mm_slli_epi16(in13, 2);
+ in14 = _mm_slli_epi16(in14, 2);
+ in15 = _mm_slli_epi16(in15, 2);
+ } else {
+ in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 16));
+ in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 16));
+ in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 16));
+ in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 16));
+ in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 16));
+ in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 16));
+ in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 16));
+ in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 16));
+ in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 16));
+ in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 16));
+ in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 16));
+ in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 16));
+ in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 16));
+ in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 16));
+ in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 16));
+ in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 16));
+ // x = (x + 1) >> 2
+ in00 = _mm_add_epi16(in00, kOne);
+ in01 = _mm_add_epi16(in01, kOne);
+ in02 = _mm_add_epi16(in02, kOne);
+ in03 = _mm_add_epi16(in03, kOne);
+ in04 = _mm_add_epi16(in04, kOne);
+ in05 = _mm_add_epi16(in05, kOne);
+ in06 = _mm_add_epi16(in06, kOne);
+ in07 = _mm_add_epi16(in07, kOne);
+ in08 = _mm_add_epi16(in08, kOne);
+ in09 = _mm_add_epi16(in09, kOne);
+ in10 = _mm_add_epi16(in10, kOne);
+ in11 = _mm_add_epi16(in11, kOne);
+ in12 = _mm_add_epi16(in12, kOne);
+ in13 = _mm_add_epi16(in13, kOne);
+ in14 = _mm_add_epi16(in14, kOne);
+ in15 = _mm_add_epi16(in15, kOne);
+ in00 = _mm_srai_epi16(in00, 2);
+ in01 = _mm_srai_epi16(in01, 2);
+ in02 = _mm_srai_epi16(in02, 2);
+ in03 = _mm_srai_epi16(in03, 2);
+ in04 = _mm_srai_epi16(in04, 2);
+ in05 = _mm_srai_epi16(in05, 2);
+ in06 = _mm_srai_epi16(in06, 2);
+ in07 = _mm_srai_epi16(in07, 2);
+ in08 = _mm_srai_epi16(in08, 2);
+ in09 = _mm_srai_epi16(in09, 2);
+ in10 = _mm_srai_epi16(in10, 2);
+ in11 = _mm_srai_epi16(in11, 2);
+ in12 = _mm_srai_epi16(in12, 2);
+ in13 = _mm_srai_epi16(in13, 2);
+ in14 = _mm_srai_epi16(in14, 2);
+ in15 = _mm_srai_epi16(in15, 2);
+ }
+ in += 8;
+ // Calculate input for the first 8 results.
+ {
+ input0 = _mm_add_epi16(in00, in15);
+ input1 = _mm_add_epi16(in01, in14);
+ input2 = _mm_add_epi16(in02, in13);
+ input3 = _mm_add_epi16(in03, in12);
+ input4 = _mm_add_epi16(in04, in11);
+ input5 = _mm_add_epi16(in05, in10);
+ input6 = _mm_add_epi16(in06, in09);
+ input7 = _mm_add_epi16(in07, in08);
+ }
+ // Calculate input for the next 8 results.
+ {
+ step1_0 = _mm_sub_epi16(in07, in08);
+ step1_1 = _mm_sub_epi16(in06, in09);
+ step1_2 = _mm_sub_epi16(in05, in10);
+ step1_3 = _mm_sub_epi16(in04, in11);
+ step1_4 = _mm_sub_epi16(in03, in12);
+ step1_5 = _mm_sub_epi16(in02, in13);
+ step1_6 = _mm_sub_epi16(in01, in14);
+ step1_7 = _mm_sub_epi16(in00, in15);
+ }
+ // Work on the first eight values; fdct8_1d(input, even_results);
+ {
+ // Add/subtract
+ const __m128i q0 = _mm_add_epi16(input0, input7);
+ const __m128i q1 = _mm_add_epi16(input1, input6);
+ const __m128i q2 = _mm_add_epi16(input2, input5);
+ const __m128i q3 = _mm_add_epi16(input3, input4);
+ const __m128i q4 = _mm_sub_epi16(input3, input4);
+ const __m128i q5 = _mm_sub_epi16(input2, input5);
+ const __m128i q6 = _mm_sub_epi16(input1, input6);
+ const __m128i q7 = _mm_sub_epi16(input0, input7);
+ // Work on first four results
+ {
+ // Add/subtract
+ const __m128i r0 = _mm_add_epi16(q0, q3);
+ const __m128i r1 = _mm_add_epi16(q1, q2);
+ const __m128i r2 = _mm_sub_epi16(q1, q2);
+ const __m128i r3 = _mm_sub_epi16(q0, q3);
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+ const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res00 = _mm_packs_epi32(w0, w1);
+ res08 = _mm_packs_epi32(w2, w3);
+ res04 = _mm_packs_epi32(w4, w5);
+ res12 = _mm_packs_epi32(w6, w7);
+ }
+ // Work on next four results
+ {
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+ const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+ const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
+ const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
+ const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
+ const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
+ const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
+ const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
+ const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
+ const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
+ const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
+ const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
+ const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
+ // Combine
+ const __m128i r0 = _mm_packs_epi32(s0, s1);
+ const __m128i r1 = _mm_packs_epi32(s2, s3);
+ // Add/subtract
+ const __m128i x0 = _mm_add_epi16(q4, r0);
+ const __m128i x1 = _mm_sub_epi16(q4, r0);
+ const __m128i x2 = _mm_sub_epi16(q7, r1);
+ const __m128i x3 = _mm_add_epi16(q7, r1);
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+ const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+ const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+ const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res02 = _mm_packs_epi32(w0, w1);
+ res14 = _mm_packs_epi32(w2, w3);
+ res10 = _mm_packs_epi32(w4, w5);
+ res06 = _mm_packs_epi32(w6, w7);
+ }
+ }
+ // Work on the next eight values; step1 -> odd_results
+ {
+ // step 2
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // Combine
+ step2_2 = _mm_packs_epi32(w0, w1);
+ step2_3 = _mm_packs_epi32(w2, w3);
+ }
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // Combine
+ step2_5 = _mm_packs_epi32(w0, w1);
+ step2_4 = _mm_packs_epi32(w2, w3);
+ }
+ // step 3
+ {
+ step3_0 = _mm_add_epi16(step1_0, step2_3);
+ step3_1 = _mm_add_epi16(step1_1, step2_2);
+ step3_2 = _mm_sub_epi16(step1_1, step2_2);
+ step3_3 = _mm_sub_epi16(step1_0, step2_3);
+ step3_4 = _mm_sub_epi16(step1_7, step2_4);
+ step3_5 = _mm_sub_epi16(step1_6, step2_5);
+ step3_6 = _mm_add_epi16(step1_6, step2_5);
+ step3_7 = _mm_add_epi16(step1_7, step2_4);
+ }
+ // step 4
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
+ const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
+ const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
+ const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // Combine
+ step2_1 = _mm_packs_epi32(w0, w1);
+ step2_2 = _mm_packs_epi32(w2, w3);
+ }
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
+ const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
+ const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
+ const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // Combine
+ step2_6 = _mm_packs_epi32(w0, w1);
+ step2_5 = _mm_packs_epi32(w2, w3);
+ }
+ // step 5
+ {
+ step1_0 = _mm_add_epi16(step3_0, step2_1);
+ step1_1 = _mm_sub_epi16(step3_0, step2_1);
+ step1_2 = _mm_sub_epi16(step3_3, step2_2);
+ step1_3 = _mm_add_epi16(step3_3, step2_2);
+ step1_4 = _mm_add_epi16(step3_4, step2_5);
+ step1_5 = _mm_sub_epi16(step3_4, step2_5);
+ step1_6 = _mm_sub_epi16(step3_7, step2_6);
+ step1_7 = _mm_add_epi16(step3_7, step2_6);
+ }
+ // step 6
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // Combine
+ res01 = _mm_packs_epi32(w0, w1);
+ res09 = _mm_packs_epi32(w2, w3);
+ }
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // Combine
+ res05 = _mm_packs_epi32(w0, w1);
+ res13 = _mm_packs_epi32(w2, w3);
+ }
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // Combine
+ res11 = _mm_packs_epi32(w0, w1);
+ res03 = _mm_packs_epi32(w2, w3);
+ }
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // Combine
+ res15 = _mm_packs_epi32(w0, w1);
+ res07 = _mm_packs_epi32(w2, w3);
+ }
+ }
+ // Transpose the results; do it as two 8x8 transposes.
+ {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+ // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+ // 04 14 24 34 05 15 25 35
+ // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
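+ // Store results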
+ _mm_storeu_si128 ((__m128i *)(out + 0 * 16), tr2_0);
+ _mm_storeu_si128 ((__m128i *)(out + 1 * 16), tr2_1);
+ _mm_storeu_si128 ((__m128i *)(out + 2 * 16), tr2_2);
+ _mm_storeu_si128 ((__m128i *)(out + 3 * 16), tr2_3);
+ _mm_storeu_si128 ((__m128i *)(out + 4 * 16), tr2_4);
+ _mm_storeu_si128 ((__m128i *)(out + 5 * 16), tr2_5);
+ _mm_storeu_si128 ((__m128i *)(out + 6 * 16), tr2_6);
+ _mm_storeu_si128 ((__m128i *)(out + 7 * 16), tr2_7);
+ }
+ {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+ // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+ // 04 14 24 34 05 15 25 35
+ // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ // Store results
+ _mm_storeu_si128 ((__m128i *)(out + 8 + 0 * 16), tr2_0);
+ _mm_storeu_si128 ((__m128i *)(out + 8 + 1 * 16), tr2_1);
+ _mm_storeu_si128 ((__m128i *)(out + 8 + 2 * 16), tr2_2);
+ _mm_storeu_si128 ((__m128i *)(out + 8 + 3 * 16), tr2_3);
+ _mm_storeu_si128 ((__m128i *)(out + 8 + 4 * 16), tr2_4);
+ _mm_storeu_si128 ((__m128i *)(out + 8 + 5 * 16), tr2_5);
+ _mm_storeu_si128 ((__m128i *)(out + 8 + 6 * 16), tr2_6);
+ _mm_storeu_si128 ((__m128i *)(out + 8 + 7 * 16), tr2_7);
+ }
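+ // Advance past the 8 rows of results just written (8 rows * 16 coefficients).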
+ out += 8*16;
+ }
+ // Set up in/out for the next pass.
+ in = intermediate;
+ out = output;
+ }
+}