8 files changed, 96 insertions, 133 deletions
diff --git a/vp9/common/vp9_blockd.c b/vp9/common/vp9_blockd.c
index 7094a0118..3cd9f44e9 100644
--- a/vp9/common/vp9_blockd.c
+++ b/vp9/common/vp9_blockd.c
@@ -50,39 +50,25 @@ void vp9_foreach_transformed_block_in_plane(
   const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
   const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
   const int step = 1 << (tx_size << 1);
-  int i;
+  int i = 0, r, c;
 
   // If mb_to_right_edge is < 0 we are in a situation in which
   // the current block size extends into the UMV and we won't
   // visit the sub blocks that are wholly within the UMV.
-  if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) {
-    int r, c;
-
-    int max_blocks_wide = num_4x4_w;
-    int max_blocks_high = num_4x4_h;
-
-    // xd->mb_to_right_edge is in units of pixels * 8.  This converts
-    // it to 4x4 block sizes.
-    if (xd->mb_to_right_edge < 0)
-      max_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
-
-    if (xd->mb_to_bottom_edge < 0)
-      max_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
-
-    i = 0;
-    // Unlike the normal case - in here we have to keep track of the
-    // row and column of the blocks we use so that we know if we are in
-    // the unrestricted motion border.
-    for (r = 0; r < num_4x4_h; r += (1 << tx_size)) {
-      for (c = 0; c < num_4x4_w; c += (1 << tx_size)) {
-        if (r < max_blocks_high && c < max_blocks_wide)
-          visit(plane, i, plane_bsize, tx_size, arg);
-        i += step;
-      }
+  const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 :
+      xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+  const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 :
+      xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+
+  // Walk the transform blocks in raster order. Rows at or beyond
+  // max_blocks_high are never entered, so only columns need a per-block test.
+  for (r = 0; r < max_blocks_high; r += (1 << tx_size)) {
+    for (c = 0; c < num_4x4_w; c += (1 << tx_size)) {
+      // Skip sub blocks wholly within the UMV; i still advances past them.
+      if (c < max_blocks_wide)
+        visit(plane, i, plane_bsize, tx_size, arg);
+      i += step;
     }
-  } else {
-    for (i = 0; i < num_4x4_w * num_4x4_h; i += step)
-      visit(plane, i, plane_bsize, tx_size, arg);
   }
 }
 
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.h b/vp9/encoder/vp9_aq_cyclicrefresh.h
index 048a0ed35..4df1b588c 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.h
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.h
@@ -65,7 +65,7 @@ void vp9_cyclic_refresh_update__map(struct VP9_COMP *const cpi);
 void vp9_cyclic_refresh_postencode(struct VP9_COMP *const cpi);
 
 // Set golden frame update interval, for non-svc 1 pass CBR mode.
-void vp9_cyclic_refresh_set_golden_update(struct VP9_COMP *cpi);
+void vp9_cyclic_refresh_set_golden_update(struct VP9_COMP *const cpi);
 
 // Check if we should not update golden reference, based on past refresh stats.
 void vp9_cyclic_refresh_check_golden_update(struct VP9_COMP *const cpi);
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 5c841cfdc..cf8ac0a22 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -514,12 +514,6 @@ void vp9_set_vbp_thresholds(VP9_COMP *cpi, int q) {
   }
 }
 
-#if CONFIG_VP9_HIGHBITDEPTH
-#define GLOBAL_MOTION 0
-#else
-#define GLOBAL_MOTION 1
-#endif
-
 // This function chooses partitioning based on the variance between source and
 // reconstructed last, where variance is computed for down-sampled inputs.
 static void choose_partitioning(VP9_COMP *cpi,
@@ -564,7 +558,7 @@ static void choose_partitioning(VP9_COMP *cpi,
     MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
     unsigned int uv_sad;
     const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
-#if GLOBAL_MOTION
+
     const YV12_BUFFER_CONFIG *yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
     unsigned int y_sad, y_sad_g;
     BLOCK_SIZE bsize;
@@ -576,9 +570,9 @@ static void choose_partitioning(VP9_COMP *cpi,
       bsize = BLOCK_64X32;
     else
       bsize = BLOCK_32X32;
-#endif
+
     assert(yv12 != NULL);
-#if GLOBAL_MOTION
+
     if (yv12_g && yv12_g != yv12) {
       vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
                            &cm->frame_refs[GOLDEN_FRAME - 1].sf);
@@ -589,7 +583,7 @@ static void choose_partitioning(VP9_COMP *cpi,
     } else {
       y_sad_g = UINT_MAX;
     }
-#endif
+
     vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
                          &cm->frame_refs[LAST_FRAME - 1].sf);
     mbmi->ref_frame[0] = LAST_FRAME;
@@ -597,7 +591,7 @@ static void choose_partitioning(VP9_COMP *cpi,
     mbmi->sb_type = BLOCK_64X64;
     mbmi->mv[0].as_int = 0;
     mbmi->interp_filter = BILINEAR;
-#if GLOBAL_MOTION
+
     y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize);
     if (y_sad_g < y_sad) {
       vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
@@ -608,29 +602,21 @@ static void choose_partitioning(VP9_COMP *cpi,
     } else {
       x->pred_mv[LAST_FRAME] = mbmi->mv[0].as_mv;
     }
-#endif
 
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
 
     for (i = 1; i <= 2; ++i) {
       struct macroblock_plane  *p = &x->plane[i];
       struct macroblockd_plane *pd = &xd->plane[i];
-#if GLOBAL_MOTION
       const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
-#else
-      const BLOCK_SIZE bs = get_plane_block_size(BLOCK_64X64, pd);
-#endif
+
       if (bs == BLOCK_INVALID)
-        uv_sad = INT_MAX;
+        uv_sad = UINT_MAX;
       else
         uv_sad = cpi->fn_ptr[bs].sdf(p->src.buf, p->src.stride,
                                      pd->dst.buf, pd->dst.stride);
 
-#if GLOBAL_MOTION
-      x->color_sensitivity[i - 1] = uv_sad * 4 > y_sad;
-#else
-      x->color_sensitivity[i - 1] = (uv_sad > 512);
-#endif
+      x->color_sensitivity[i - 1] = uv_sad > (y_sad >> 2);
     }
 
     d = xd->plane[0].dst.buf;
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 0730467b0..12882e432 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -1807,6 +1807,13 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
   int best_sad;
   MV this_mv;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  tmp_mv->row = 0;
+  tmp_mv->col = 0;
+  return cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
+                                xd->plane[0].pre[0].buf, ref_stride);
+#endif
+
   // Set up prediction 1-D reference set
   ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);
   for (idx = 0; idx < search_width; idx += 16) {
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index f457f20b1..e3acc5a82 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -137,10 +137,6 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   int cost_list[5];
   const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
                                                                         ref);
-  if (cpi->common.show_frame &&
-      (x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[LAST_FRAME])
-    return rv;
-
   if (scaled_ref_frame) {
     int i;
     // Swap out the reference frame for a version that's been scaled to
@@ -796,15 +792,43 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         continue;
 
       if (this_mode == NEWMV) {
-        if (ref_frame > LAST_FRAME)
-          continue;
         if (cpi->sf.partition_search_type != VAR_BASED_PARTITION &&
             best_rdc.rdcost < (int64_t)(1 << num_pels_log2_lookup[bsize]))
           continue;
-        if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
-                                    &frame_mv[NEWMV][ref_frame],
-                                    &rate_mv, best_rdc.rdcost))
+
+        if (ref_frame > LAST_FRAME) {
+          int tmp_sad;
+          int dis, cost_list[5];
+
+          if (bsize < BLOCK_16X16)
+            continue;
+
+          tmp_sad = vp9_int_pro_motion_estimation(cpi, x, bsize);
+          if (tmp_sad > x->pred_mv_sad[LAST_FRAME])
+            continue;
+
+          frame_mv[NEWMV][ref_frame].as_int = mbmi->mv[0].as_int;
+          rate_mv = vp9_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv,
+                                    &mbmi->ref_mvs[ref_frame][0].as_mv,
+                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+          frame_mv[NEWMV][ref_frame].as_mv.row >>= 3;
+          frame_mv[NEWMV][ref_frame].as_mv.col >>= 3;
+
+          cpi->find_fractional_mv_step(x, &frame_mv[NEWMV][ref_frame].as_mv,
+                                       &mbmi->ref_mvs[ref_frame][0].as_mv,
+                                       cpi->common.allow_high_precision_mv,
+                                       x->errorperbit,
+                                       &cpi->fn_ptr[bsize],
+                                       cpi->sf.mv.subpel_force_stop,
+                                       cpi->sf.mv.subpel_iters_per_step,
+                                       cond_cost_list(cpi, cost_list),
+                                       x->nmvjointcost, x->mvcost, &dis,
+                                       &x->pred_sse[ref_frame], NULL, 0, 0);
+        } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
+                                           &frame_mv[NEWMV][ref_frame],
+                                           &rate_mv, best_rdc.rdcost)) {
           continue;
+        }
       }
 
       if (this_mode != NEARESTMV &&
@@ -829,7 +853,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       }
 
       if ((this_mode == NEWMV || filter_ref == SWITCHABLE) &&
-          pred_filter_search &&
+          pred_filter_search && (ref_frame == LAST_FRAME) &&
           ((mbmi->mv[0].as_mv.row & 0x07) != 0 ||
            (mbmi->mv[0].as_mv.col & 0x07) != 0)) {
         int pf_rate[3];
@@ -1064,6 +1088,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         mode_idx[INTRA_FRAME][mbmi->mode];
     PREDICTION_MODE this_mode;
     for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
+      if (best_ref_frame != ref_frame) continue;
       for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
         THR_MODES thr_mode_idx = mode_idx[ref_frame][INTER_OFFSET(this_mode)];
         int *freq_fact = &tile_data->thresh_freq_fact[bsize][thr_mode_idx];
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index c7c5e972d..7783f7bdc 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -377,7 +377,7 @@ static double get_rate_correction_factor(const VP9_COMP *cpi) {
       rcf = rc->rate_correction_factors[INTER_NORMAL];
   }
   rcf *= rcf_mult[rc->frame_size_selector];
-  return rcf > MAX_BPB_FACTOR ? MAX_BPB_FACTOR : rcf;
+  return fclamp(rcf, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
 }
 
 static void set_rate_correction_factor(VP9_COMP *cpi, double factor) {
@@ -386,6 +386,8 @@ static void set_rate_correction_factor(VP9_COMP *cpi, double factor) {
   // Normalize RCF to account for the size-dependent scaling factor.
   factor /= rcf_mult[cpi->rc.frame_size_selector];
 
+  factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
+
   if (cpi->common.frame_type == KEY_FRAME) {
     rc->rate_correction_factors[KF_STD] = factor;
   } else if (cpi->oxcf.pass == 2) {
@@ -754,7 +756,7 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi,
 
 static int get_active_cq_level(const RATE_CONTROL *rc,
                                const VP9EncoderConfig *const oxcf) {
-  static const double cq_adjust_threshold = 0.5;
+  static const double cq_adjust_threshold = 0.1;
   int active_cq_level = oxcf->cq_level;
   if (oxcf->rc_mode == VPX_CQ &&
       rc->total_target_bits > 0) {
@@ -1687,7 +1689,7 @@ void vp9_set_target_rate(VP9_COMP *cpi) {
   int target_rate = rc->base_frame_target;
 
   // Correction to rate target based on prior over or under shoot.
-  if (cpi->oxcf.rc_mode == VPX_VBR)
+  if (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.rc_mode == VPX_CQ)
     vbr_rate_correction(cpi, &target_rate, rc->vbr_bits_off_target);
   vp9_rc_set_frame_target(cpi, target_rate);
 }
diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
index f49949940..618b5f73d 100644
--- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
@@ -61,7 +61,7 @@ void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
                           const int ref_stride, const int height) {
   int idx;
   __m128i zero = _mm_setzero_si128();
-  __m128i src_line = _mm_load_si128((const __m128i *)ref);
+  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
   __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
   __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
   __m128i t0, t1;
@@ -69,14 +69,14 @@ void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
   ref += ref_stride;
 
   for (idx = 1; idx < height_1; idx += 2) {
-    src_line = _mm_load_si128((const __m128i *)ref);
+    src_line = _mm_loadu_si128((const __m128i *)ref);
     t0 = _mm_unpacklo_epi8(src_line, zero);
     t1 = _mm_unpackhi_epi8(src_line, zero);
     s0 = _mm_adds_epu16(s0, t0);
     s1 = _mm_adds_epu16(s1, t1);
     ref += ref_stride;
 
-    src_line = _mm_load_si128((const __m128i *)ref);
+    src_line = _mm_loadu_si128((const __m128i *)ref);
     t0 = _mm_unpacklo_epi8(src_line, zero);
     t1 = _mm_unpackhi_epi8(src_line, zero);
     s0 = _mm_adds_epu16(s0, t0);
@@ -84,7 +84,7 @@ void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
     ref += ref_stride;
   }
 
-  src_line = _mm_load_si128((const __m128i *)ref);
+  src_line = _mm_loadu_si128((const __m128i *)ref);
   t0 = _mm_unpacklo_epi8(src_line, zero);
   t1 = _mm_unpackhi_epi8(src_line, zero);
   s0 = _mm_adds_epu16(s0, t0);
@@ -101,9 +101,9 @@ void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
     s1 = _mm_srai_epi16(s1, 3);
   }
 
-  _mm_store_si128((__m128i *)hbuf, s0);
+  _mm_storeu_si128((__m128i *)hbuf, s0);
   hbuf += 8;
-  _mm_store_si128((__m128i *)hbuf, s1);
+  _mm_storeu_si128((__m128i *)hbuf, s1);
 }
 
 int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) {
diff --git a/vp9/encoder/x86/vp9_dct_ssse3.c b/vp9/encoder/x86/vp9_dct_ssse3.c
index 5c0ad7892..a1a2bda80 100644
--- a/vp9/encoder/x86/vp9_dct_ssse3.c
+++ b/vp9/encoder/x86/vp9_dct_ssse3.c
@@ -102,99 +102,56 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride,
       const __m128i r2 = _mm_sub_epi16(q1, q2);
       const __m128i r3 = _mm_sub_epi16(q0, q3);
       // Interleave to do the multiply by constants which gets us into 32bits
-      const __m128i t0 = _mm_add_epi16(r0, r1);
-      const __m128i t1 = _mm_sub_epi16(r0, r1);
+      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
       const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
       const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
 
-      const __m128i u0 = _mm_mulhrs_epi16(t0, k__dual_p16_p16);
-      const __m128i u1 = _mm_mulhrs_epi16(t1, k__dual_p16_p16);
+      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+
       const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
       const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
       const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
       const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
       // dct_const_round_shift
-      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-      // Combine
-      res0 = u0;
-      res4 = u1;
-      res2 = _mm_packs_epi32(w4, w5);
-      res6 = _mm_packs_epi32(w6, w7);
-    }
-    // Work on next four results
-    if (pass == 1) {
-      // Interleave to do the multiply by constants which gets us into 32bits
-      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
-      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
-      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
-      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
-      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
-      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
-      // dct_const_round_shift
-      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
-      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
-      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
-      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
-      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
-      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
-      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
-      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
-      // Combine
-      const __m128i r0 = _mm_packs_epi32(s0, s1);
-      const __m128i r1 = _mm_packs_epi32(s2, s3);
-      // Add/subtract
-      const __m128i x0 = _mm_add_epi16(q4, r0);
-      const __m128i x1 = _mm_sub_epi16(q4, r0);
-      const __m128i x2 = _mm_sub_epi16(q7, r1);
-      const __m128i x3 = _mm_add_epi16(q7, r1);
-      // Interleave to do the multiply by constants which gets us into 32bits
-      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
-      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
-      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
-      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
-      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
-      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
-      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
-      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
-      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
-      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
-      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
-      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
-      // dct_const_round_shift
+
       const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
       const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
       const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
       const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+
       const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
       const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
       const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
       const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
       const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
       const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
       const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
       const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+
       const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
       const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
       const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
       const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
       // Combine
-      res1 = _mm_packs_epi32(w0, w1);
-      res7 = _mm_packs_epi32(w2, w3);
-      res5 = _mm_packs_epi32(w4, w5);
-      res3 = _mm_packs_epi32(w6, w7);
-    } else {
+
+      res0 = _mm_packs_epi32(w0, w1);
+      res4 = _mm_packs_epi32(w2, w3);
+      res2 = _mm_packs_epi32(w4, w5);
+      res6 = _mm_packs_epi32(w6, w7);
+    }
+    // Work on next four results
+    {
       // Interleave to do the multiply by constants which gets us into 32bits
       const __m128i d0 = _mm_sub_epi16(q6, q5);
       const __m128i d1 = _mm_add_epi16(q6, q5);
       const __m128i r0 = _mm_mulhrs_epi16(d0, k__dual_p16_p16);
       const __m128i r1 = _mm_mulhrs_epi16(d1, k__dual_p16_p16);
+
       // Add/subtract
       const __m128i x0 = _mm_add_epi16(q4, r0);
       const __m128i x1 = _mm_sub_epi16(q4, r0);