12 files changed, 270 insertions, 177 deletions
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 2101ec58c..676382dc8 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -34,10 +34,10 @@
 //
 // A loopfilter should be applied to every other 8x8 horizontally.
 static const uint64_t left_64x64_txform_mask[TX_SIZES]= {
-  0xffffffffffffffff,  // TX_4X4
-  0xffffffffffffffff,  // TX_8x8
-  0x5555555555555555,  // TX_16x16
-  0x1111111111111111,  // TX_32x32
+  0xffffffffffffffffULL,  // TX_4X4
+  0xffffffffffffffffULL,  // TX_8x8
+  0x5555555555555555ULL,  // TX_16x16
+  0x1111111111111111ULL,  // TX_32x32
 };
 
 // 64 bit masks for above transform size. Each 1 represents a position where
@@ -58,10 +58,10 @@ static const uint64_t left_64x64_txform_mask[TX_SIZES]= {
 //
 // A loopfilter should be applied to every other 4 the row vertically.
 static const uint64_t above_64x64_txform_mask[TX_SIZES]= {
-  0xffffffffffffffff,  // TX_4X4
-  0xffffffffffffffff,  // TX_8x8
-  0x00ff00ff00ff00ff,  // TX_16x16
-  0x000000ff000000ff,  // TX_32x32
+  0xffffffffffffffffULL,  // TX_4X4
+  0xffffffffffffffffULL,  // TX_8x8
+  0x00ff00ff00ff00ffULL,  // TX_16x16
+  0x000000ff000000ffULL,  // TX_32x32
 };
 
 // 64 bit masks for prediction sizes (left). Each 1 represents a position
@@ -80,59 +80,59 @@ static const uint64_t above_64x64_txform_mask[TX_SIZES]= {
 //  00000000
 //  00000000
 static const uint64_t left_prediction_mask[BLOCK_SIZES] = {
-  0x0000000000000001,  // BLOCK_4X4,
-  0x0000000000000001,  // BLOCK_4X8,
-  0x0000000000000001,  // BLOCK_8X4,
-  0x0000000000000001,  // BLOCK_8X8,
-  0x0000000000000101,  // BLOCK_8X16,
-  0x0000000000000001,  // BLOCK_16X8,
-  0x0000000000000101,  // BLOCK_16X16,
-  0x0000000001010101,  // BLOCK_16X32,
-  0x0000000000000101,  // BLOCK_32X16,
-  0x0000000001010101,  // BLOCK_32X32,
-  0x0101010101010101,  // BLOCK_32X64,
-  0x0000000001010101,  // BLOCK_64X32,
-  0x0101010101010101,  // BLOCK_64X64
+  0x0000000000000001ULL,  // BLOCK_4X4,
+  0x0000000000000001ULL,  // BLOCK_4X8,
+  0x0000000000000001ULL,  // BLOCK_8X4,
+  0x0000000000000001ULL,  // BLOCK_8X8,
+  0x0000000000000101ULL,  // BLOCK_8X16,
+  0x0000000000000001ULL,  // BLOCK_16X8,
+  0x0000000000000101ULL,  // BLOCK_16X16,
+  0x0000000001010101ULL,  // BLOCK_16X32,
+  0x0000000000000101ULL,  // BLOCK_32X16,
+  0x0000000001010101ULL,  // BLOCK_32X32,
+  0x0101010101010101ULL,  // BLOCK_32X64,
+  0x0000000001010101ULL,  // BLOCK_64X32,
+  0x0101010101010101ULL,  // BLOCK_64X64
 };
 
 // 64 bit mask to shift and set for each prediction size.
 static const uint64_t above_prediction_mask[BLOCK_SIZES] = {
-  0x0000000000000001,  // BLOCK_4X4
-  0x0000000000000001,  // BLOCK_4X8
-  0x0000000000000001,  // BLOCK_8X4
-  0x0000000000000001,  // BLOCK_8X8
-  0x0000000000000001,  // BLOCK_8X16,
-  0x0000000000000003,  // BLOCK_16X8
-  0x0000000000000003,  // BLOCK_16X16
-  0x0000000000000003,  // BLOCK_16X32,
-  0x000000000000000f,  // BLOCK_32X16,
-  0x000000000000000f,  // BLOCK_32X32,
-  0x000000000000000f,  // BLOCK_32X64,
-  0x00000000000000ff,  // BLOCK_64X32,
-  0x00000000000000ff,  // BLOCK_64X64
+  0x0000000000000001ULL,  // BLOCK_4X4
+  0x0000000000000001ULL,  // BLOCK_4X8
+  0x0000000000000001ULL,  // BLOCK_8X4
+  0x0000000000000001ULL,  // BLOCK_8X8
+  0x0000000000000001ULL,  // BLOCK_8X16,
+  0x0000000000000003ULL,  // BLOCK_16X8
+  0x0000000000000003ULL,  // BLOCK_16X16
+  0x0000000000000003ULL,  // BLOCK_16X32,
+  0x000000000000000fULL,  // BLOCK_32X16,
+  0x000000000000000fULL,  // BLOCK_32X32,
+  0x000000000000000fULL,  // BLOCK_32X64,
+  0x00000000000000ffULL,  // BLOCK_64X32,
+  0x00000000000000ffULL,  // BLOCK_64X64
 };
 // 64 bit mask to shift and set for each prediction size. A bit is set for
 // each 8x8 block that would be in the left most block of the given block
 // size in the 64x64 block.
 static const uint64_t size_mask[BLOCK_SIZES] = {
-  0x0000000000000001,  // BLOCK_4X4
-  0x0000000000000001,  // BLOCK_4X8
-  0x0000000000000001,  // BLOCK_8X4
-  0x0000000000000001,  // BLOCK_8X8
-  0x0000000000000101,  // BLOCK_8X16,
-  0x0000000000000003,  // BLOCK_16X8
-  0x0000000000000303,  // BLOCK_16X16
-  0x0000000003030303,  // BLOCK_16X32,
-  0x0000000000000f0f,  // BLOCK_32X16,
-  0x000000000f0f0f0f,  // BLOCK_32X32,
-  0x0f0f0f0f0f0f0f0f,  // BLOCK_32X64,
-  0x00000000ffffffff,  // BLOCK_64X32,
-  0xffffffffffffffff,  // BLOCK_64X64
+  0x0000000000000001ULL,  // BLOCK_4X4
+  0x0000000000000001ULL,  // BLOCK_4X8
+  0x0000000000000001ULL,  // BLOCK_8X4
+  0x0000000000000001ULL,  // BLOCK_8X8
+  0x0000000000000101ULL,  // BLOCK_8X16,
+  0x0000000000000003ULL,  // BLOCK_16X8
+  0x0000000000000303ULL,  // BLOCK_16X16
+  0x0000000003030303ULL,  // BLOCK_16X32,
+  0x0000000000000f0fULL,  // BLOCK_32X16,
+  0x000000000f0f0f0fULL,  // BLOCK_32X32,
+  0x0f0f0f0f0f0f0f0fULL,  // BLOCK_32X64,
+  0x00000000ffffffffULL,  // BLOCK_64X32,
+  0xffffffffffffffffULL,  // BLOCK_64X64
 };
 
 // These are used for masking the left and above borders.
-static const uint64_t left_border =  0x1111111111111111;
-static const uint64_t above_border = 0x000000ff000000ff;
+static const uint64_t left_border =  0x1111111111111111ULL;
+static const uint64_t above_border = 0x000000ff000000ffULL;
 
 // 16 bit masks for uv transform sizes.
 static const uint16_t left_64x64_txform_mask_uv[TX_SIZES]= {
@@ -773,7 +773,7 @@ static void build_masks(const loop_filter_info_n *const lfi_n,
   // an 8x8 in that the internal ones can be skipped and don't depend on
   // the prediction block size.
   if (tx_size_y == TX_4X4)
-    *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y;
+    *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffffULL) << shift_y;
 
   if (tx_size_uv == TX_4X4)
     *int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv;
@@ -819,7 +819,7 @@ static void build_y_mask(const loop_filter_info_n *const lfi_n,
               left_64x64_txform_mask[tx_size_y]) << shift_y;
 
   if (tx_size_y == TX_4X4)
-    *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y;
+    *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffffULL) << shift_y;
 }
 
 // This function sets up the bit masks for the entire 64x64 region represented
@@ -1021,7 +1021,7 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
 
     // Each pixel inside the border gets a 1, the multiply copies the border
     // to where we need it.
-    const uint64_t mask_y  = (((1 << columns) - 1)) * 0x0101010101010101;
+    const uint64_t mask_y  = (((1 << columns) - 1)) * 0x0101010101010101ULL;
     const uint16_t mask_uv = ((1 << ((columns + 1) >> 1)) - 1) * 0x1111;
 
     // Internal edges are not applied on the last column of the image so
@@ -1053,7 +1053,7 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
   // out.
   if (mi_col == 0) {
     for (i = 0; i < TX_32X32; i++) {
-      lfm->left_y[i] &= 0xfefefefefefefefe;
+      lfm->left_y[i] &= 0xfefefefefefefefeULL;
       lfm->left_uv[i] &= 0xeeee;
     }
   }
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index cfb0a98e5..7ca24a56e 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -110,12 +110,7 @@ typedef struct {
 
 typedef struct VP9Common {
   struct vpx_internal_error_info  error;
-
-  DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]);
-  DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]);
-
   vpx_color_space_t color_space;
-
   int width;
   int height;
   int display_width;
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 18e0bf53f..d345a0578 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -741,6 +741,7 @@ static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
   pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x;
   pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
   pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
+  pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space;
 }
 
 static INLINE int valid_ref_frame_img_fmt(vpx_bit_depth_t ref_bit_depth,
@@ -821,6 +822,7 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm,
   pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x;
   pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
   pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
+  pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space;
 }
 
 static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
@@ -1436,6 +1438,7 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
 #if CONFIG_VP9_HIGHBITDEPTH
   get_frame_new_buffer(cm)->bit_depth = cm->bit_depth;
 #endif
+  get_frame_new_buffer(cm)->color_space = cm->color_space;
 
   if (pbi->need_resync) {
     vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index 5a3671d84..ff08f04e6 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -96,30 +96,35 @@ static int apply_cyclic_refresh_bitrate(const VP9_COMMON *cm,
 static int candidate_refresh_aq(const CYCLIC_REFRESH *cr,
                                 const MB_MODE_INFO *mbmi,
                                 int64_t rate,
-                                int64_t dist) {
+                                int64_t dist,
+                                int bsize) {
   MV mv = mbmi->mv[0].as_mv;
-  // If projected rate is below the thresh_rate accept it for lower-qp coding.
-  // Otherwise, reject the block for lower-qp coding if projected distortion
+  // Reject the block for lower-qp coding if projected distortion
   // is above the threshold, and any of the following is true:
   // 1) mode uses large mv
   // 2) mode is an intra-mode
-  if (rate < cr->thresh_rate_sb)
-    return 1;
-  else if (dist > cr->thresh_dist_sb &&
-          (mv.row > cr->motion_thresh || mv.row < -cr->motion_thresh ||
-           mv.col > cr->motion_thresh || mv.col < -cr->motion_thresh ||
-           !is_inter_block(mbmi)))
-    return 0;
+  // Otherwise accept for refresh.
+  if (dist > cr->thresh_dist_sb &&
+      (mv.row > cr->motion_thresh || mv.row < -cr->motion_thresh ||
+       mv.col > cr->motion_thresh || mv.col < -cr->motion_thresh ||
+       !is_inter_block(mbmi)))
+    return CR_SEGMENT_ID_BASE;
+  else  if (bsize >= BLOCK_32X32 &&
+            rate < cr->thresh_rate_sb &&
+            is_inter_block(mbmi) &&
+            mbmi->mv[0].as_int == 0)
+    // More aggressive delta-q for bigger blocks with zero motion.
+    return CR_SEGMENT_ID_BOOST2;
   else
-    return 1;
+    return CR_SEGMENT_ID_BOOST1;
 }
 
 // Compute delta-q for the segment.
-static int compute_deltaq(const VP9_COMP *cpi, int q) {
+static int compute_deltaq(const VP9_COMP *cpi, int q, double rate_factor) {
   const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   const RATE_CONTROL *const rc = &cpi->rc;
   int deltaq = vp9_compute_qdelta_by_rate(rc, cpi->common.frame_type,
-                                          q, cr->rate_ratio_qdelta,
+                                          q, rate_factor,
                                           cpi->common.bit_depth);
   if ((-deltaq) > cr->max_qdelta_perc * q / 100) {
     deltaq = -cr->max_qdelta_perc * q / 100;
@@ -128,8 +133,9 @@ static int compute_deltaq(const VP9_COMP *cpi, int q) {
 }
 
 // For the just encoded frame, estimate the bits, incorporating the delta-q
-// from segment 1. This function is called in the postencode (called from
-// rc_update_rate_correction_factors()).
+// from non-base segment. For now ignore effect of multiple segments
+// (with different delta-q). Note this function is called in the postencode
+// (called from rc_update_rate_correction_factors()).
 int vp9_cyclic_refresh_estimate_bits_at_q(const VP9_COMP *cpi,
                                           double correction_factor) {
   const VP9_COMMON *const cm = &cpi->common;
@@ -137,11 +143,11 @@ int vp9_cyclic_refresh_estimate_bits_at_q(const VP9_COMP *cpi,
   int estimated_bits;
   int mbs = cm->MBs;
   int num8x8bl = mbs << 2;
-  // Weight for segment 1: use actual number of blocks refreshed in
+  // Weight for non-base segments: use actual number of blocks refreshed in
   // previous/just encoded frame. Note number of blocks here is in 8x8 units.
   double weight_segment = (double)cr->actual_num_seg_blocks / num8x8bl;
   // Compute delta-q that was used in the just encoded frame.
-  int deltaq = compute_deltaq(cpi, cm->base_qindex);
+  int deltaq = compute_deltaq(cpi, cm->base_qindex, cr->rate_ratio_qdelta);
   // Take segment weighted average for estimated bits.
   estimated_bits = (int)((1.0 - weight_segment) *
       vp9_estimate_bits_at_q(cm->frame_type, cm->base_qindex, mbs,
@@ -155,6 +161,8 @@ int vp9_cyclic_refresh_estimate_bits_at_q(const VP9_COMP *cpi,
 // Prior to encoding the frame, estimate the bits per mb, for a given q = i and
 // a corresponding delta-q (for segment 1). This function is called in the
 // rc_regulate_q() to set the base qp index.
+// Note: the segment map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or
+// to 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock, prior to encoding.
 int vp9_cyclic_refresh_rc_bits_per_mb(const VP9_COMP *cpi, int i,
                                       double correction_factor) {
   const VP9_COMMON *const cm = &cpi->common;
@@ -171,7 +179,7 @@ int vp9_cyclic_refresh_rc_bits_per_mb(const VP9_COMP *cpi, int i,
   // does not occur/is very small.
   double weight_segment = (double)cr->target_num_seg_blocks / num8x8bl;
   // Compute delta-q corresponding to qindex i.
-  int deltaq = compute_deltaq(cpi, i);
+  int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta);
   // Take segment weighted average for bits per mb.
   bits_per_mb = (int)((1.0 - weight_segment) *
       vp9_rc_bits_per_mb(cm->frame_type, i, correction_factor, cm->bit_depth) +
@@ -197,20 +205,22 @@ void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi,
   const int xmis = MIN(cm->mi_cols - mi_col, bw);
   const int ymis = MIN(cm->mi_rows - mi_row, bh);
   const int block_index = mi_row * cm->mi_cols + mi_col;
-  const int refresh_this_block = candidate_refresh_aq(cr, mbmi, rate, dist);
+  const int refresh_this_block = candidate_refresh_aq(cr, mbmi, rate, dist,
+                                                      bsize);
   // Default is to not update the refresh map.
   int new_map_value = cr->map[block_index];
   int x = 0; int y = 0;
 
-  // Check if we should reset the segment_id for this block.
-  if (mbmi->segment_id > 0 && !refresh_this_block)
-    mbmi->segment_id = 0;
+  // If this block is labeled for refresh, check if we should reset the
+  // segment_id.
+  if (mbmi->segment_id != CR_SEGMENT_ID_BASE)
+    mbmi->segment_id = refresh_this_block;
 
   // Update the cyclic refresh map, to be used for setting segmentation map
   // for the next frame. If the block  will be refreshed this frame, mark it
   // as clean. The magnitude of the -ve influences how long before we consider
   // it for refresh again.
-  if (mbmi->segment_id == 1) {
+  if (mbmi->segment_id != CR_SEGMENT_ID_BASE) {
     new_map_value = -cr->time_for_refresh;
   } else if (refresh_this_block) {
     // Else if it is accepted as candidate for refresh, and has not already
@@ -242,20 +252,24 @@ void vp9_cyclic_refresh_update_actual_count(struct VP9_COMP *const cpi) {
   cr->actual_num_seg_blocks = 0;
   for (mi_row = 0; mi_row < cm->mi_rows; mi_row++)
   for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
-    if (seg_map[mi_row * cm->mi_cols + mi_col] == 1)
+    if (seg_map[mi_row * cm->mi_cols + mi_col] != CR_SEGMENT_ID_BASE)
       cr->actual_num_seg_blocks++;
   }
 }
 
 // Update the segmentation map, and related quantities: cyclic refresh map,
 // refresh sb_index, and target number of blocks to be refreshed.
+// The map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or to
+// 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock.
+// Blocks labeled as BOOST1 may later get set to BOOST2 (during the
+// encoding of the superblock).
 void vp9_cyclic_refresh_update_map(VP9_COMP *const cpi) {
   VP9_COMMON *const cm = &cpi->common;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   unsigned char *const seg_map = cpi->segmentation_map;
   int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame;
   int xmis, ymis, x, y;
-  vpx_memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
+  vpx_memset(seg_map, CR_SEGMENT_ID_BASE, cm->mi_rows * cm->mi_cols);
   sb_cols = (cm->mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
   sb_rows = (cm->mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
   sbs_in_frame = sb_cols * sb_rows;
@@ -300,7 +314,7 @@ void vp9_cyclic_refresh_update_map(VP9_COMP *const cpi) {
     if (sum_map >= xmis * ymis / 2) {
       for (y = 0; y < ymis; y++)
         for (x = 0; x < xmis; x++) {
-          seg_map[bl_index + y * cm->mi_cols + x] = 1;
+          seg_map[bl_index + y * cm->mi_cols + x] = CR_SEGMENT_ID_BOOST1;
         }
       cr->target_num_seg_blocks += xmis * ymis;
     }
@@ -352,11 +366,11 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) {
     vp9_clear_system_state();
     cr->max_qdelta_perc = 50;
     cr->time_for_refresh = 0;
-    // Set rate threshold to some fraction (set to 1 for now) of the target
+    // Set rate threshold to some multiple (set to 2 for now) of the target
     // rate (target is given by sb64_target_rate and scaled by 256).
-    cr->thresh_rate_sb = (rc->sb64_target_rate << 8);
+    cr->thresh_rate_sb = (rc->sb64_target_rate << 8) << 1;
     // Distortion threshold, quadratic in Q, scale factor to be adjusted.
-    cr->thresh_dist_sb = (int)(q * q) << 5;
+    cr->thresh_dist_sb = (int)(q * q) << 2;
     cr->motion_thresh = 32;
     // Set up segmentation.
     // Clear down the segment map.
@@ -372,19 +386,27 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) {
     // relative to 0 previous map.
     // seg->temporal_update = 0;
 
-    // Segment 0 "Q" feature is disabled so it defaults to the baseline Q.
-    vp9_disable_segfeature(seg, 0, SEG_LVL_ALT_Q);
-    // Use segment 1 for in-frame Q adjustment.
-    vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
+    // Segment BASE "Q" feature is disabled so it defaults to the baseline Q.
+    vp9_disable_segfeature(seg, CR_SEGMENT_ID_BASE, SEG_LVL_ALT_Q);
+    // Use segment BOOST1 for in-frame Q adjustment.
+    vp9_enable_segfeature(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q);
+    // Use segment BOOST2 for more aggressive in-frame Q adjustment.
+    vp9_enable_segfeature(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q);
 
-    // Set the q delta for segment 1.
-    qindex_delta = compute_deltaq(cpi, cm->base_qindex);
+    // Set the q delta for segment BOOST1.
+    qindex_delta = compute_deltaq(cpi, cm->base_qindex, cr->rate_ratio_qdelta);
 
-    // Compute rd-mult for segment 1.
+    // Compute rd-mult for segment BOOST1.
     qindex2 = clamp(cm->base_qindex + cm->y_dc_delta_q + qindex_delta, 0, MAXQ);
     cr->rdmult = vp9_compute_rd_mult(cpi, qindex2);
 
-    vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, qindex_delta);
+    vp9_set_segdata(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q, qindex_delta);
+
+    // Set a more aggressive (higher) q delta for segment BOOST2.
+    qindex_delta = compute_deltaq(cpi, cm->base_qindex,
+                                  MIN(CR_MAX_RATE_TARGET_RATIO,
+                                      CR_BOOST2_FAC * cr->rate_ratio_qdelta));
+    vp9_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta);
 
     // Update the segmentation and refresh map.
     vp9_cyclic_refresh_update_map(cpi);
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.h b/vp9/encoder/vp9_aq_cyclicrefresh.h
index 1ed07c2c2..dc1b968a2 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.h
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.h
@@ -18,6 +18,18 @@
 extern "C" {
 #endif
 
+// The segment ids used in cyclic refresh: from base (no boost) to increasing
+// boost (higher delta-qp).
+#define CR_SEGMENT_ID_BASE    0
+#define CR_SEGMENT_ID_BOOST1  1
+#define CR_SEGMENT_ID_BOOST2  2
+
+// Maximum rate target ratio for setting segment delta-qp.
+#define CR_MAX_RATE_TARGET_RATIO 4.0
+
+// Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2.
+#define CR_BOOST2_FAC 1.7
+
 struct VP9_COMP;
 
 struct CYCLIC_REFRESH;
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index 4f245e249..cf67e115e 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -45,7 +45,7 @@ static int delta_thresh(BLOCK_SIZE bs, int increase_denoising) {
 static int noise_motion_thresh(BLOCK_SIZE bs, int increase_denoising) {
   (void)bs;
   (void)increase_denoising;
-  return 25 * 25;
+  return 625;
 }
 
 static unsigned int sse_thresh(BLOCK_SIZE bs, int increase_denoising) {
@@ -53,8 +53,8 @@ static unsigned int sse_thresh(BLOCK_SIZE bs, int increase_denoising) {
 }
 
 static int sse_diff_thresh(BLOCK_SIZE bs, int increase_denoising,
-                           int mv_row, int mv_col) {
-  if (mv_row * mv_row + mv_col * mv_col >
+                           int motion_magnitude) {
+  if (motion_magnitude >
       noise_motion_thresh(bs, increase_denoising)) {
     return 0;
   } else {
@@ -219,7 +219,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
   // If the best reference frame uses inter-prediction and there is enough of a
   // difference in sum-squared-error, use it.
   if (frame != INTRA_FRAME &&
-      sse_diff > sse_diff_thresh(bs, increase_denoising, mv_row, mv_col)) {
+      sse_diff > sse_diff_thresh(bs, increase_denoising, *motion_magnitude)) {
     mbmi->ref_frame[0] = ctx->best_reference_frame;
     mbmi->mode = ctx->best_sse_inter_mode;
     mbmi->mv[0] = ctx->best_sse_mv;
@@ -241,8 +241,8 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
     *mbmi = saved_mbmi;
     return COPY_BLOCK;
   }
-  if (mv_row * mv_row + mv_col * mv_col >
-      8 * noise_motion_thresh(bs, increase_denoising)) {
+  if (*motion_magnitude >
+     (noise_motion_thresh(bs, increase_denoising) << 3)) {
     // Restore everything to its original state
     *mbmi = saved_mbmi;
     return COPY_BLOCK;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 3bdaa043f..ff12bf8a2 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -403,7 +403,8 @@ static int set_vt_partitioning(VP9_COMP *cpi,
                                int mi_row,
                                int mi_col,
                                int64_t threshold,
-                               BLOCK_SIZE bsize_min) {
+                               BLOCK_SIZE bsize_min,
+                               int segment_id) {
   VP9_COMMON * const cm = &cpi->common;
   variance_node vt;
   const int block_width = num_8x8_blocks_wide_lookup[bsize];
@@ -412,6 +413,10 @@ static int set_vt_partitioning(VP9_COMP *cpi,
   assert(block_height == block_width);
   tree_to_node(data, bsize, &vt);
 
+  // No 64x64 blocks on segments other than base (un-boosted) segment.
+  if (segment_id != CR_SEGMENT_ID_BASE && bsize == BLOCK_64X64)
+    return 0;
+
   // For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select if
   // variance is below threshold, otherwise split will be selected.
   // No check for vert/horiz split as too few samples for variance.
@@ -528,10 +533,10 @@ static int vector_match(int16_t *ref, int16_t *src) {
   }
   center = offset;
 
-  for (d = -8; d <= 8; d += 4) {
+  for (d = -8; d <= 8; d += 16) {
     int this_pos = offset + d;
     // check limit
-    if (this_pos < 0 || this_pos > 64 || this_pos == 32)
+    if (this_pos < 0 || this_pos > 64)
       continue;
     this_sad = vp9_vector_sad(&ref[this_pos], src, 64);
     if (this_sad < best_sad) {
@@ -541,10 +546,10 @@ static int vector_match(int16_t *ref, int16_t *src) {
   }
   offset = center;
 
-  for (d = -4; d <= 4; d += 2) {
+  for (d = -4; d <= 4; d += 8) {
     int this_pos = offset + d;
     // check limit
-    if (this_pos < 0 || this_pos > 64 || this_pos == 32)
+    if (this_pos < 0 || this_pos > 64)
       continue;
     this_sad = vp9_vector_sad(&ref[this_pos], src, 64);
     if (this_sad < best_sad) {
@@ -554,10 +559,23 @@ static int vector_match(int16_t *ref, int16_t *src) {
   }
   offset = center;
 
-  for (d = -2; d <= 2; d += 1) {
+  for (d = -2; d <= 2; d += 4) {
     int this_pos = offset + d;
     // check limit
-    if (this_pos < 0 || this_pos > 64 || this_pos == 32)
+    if (this_pos < 0 || this_pos > 64)
+      continue;
+    this_sad = vp9_vector_sad(&ref[this_pos], src, 64);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      center = this_pos;
+    }
+  }
+  offset = center;
+
+  for (d = -1; d <= 1; d += 2) {
+    int this_pos = offset + d;
+    // check limit
+    if (this_pos < 0 || this_pos > 64)
       continue;
     this_sad = vp9_vector_sad(&ref[this_pos], src, 64);
     if (this_sad < best_sad) {
@@ -670,6 +688,13 @@ static void choose_partitioning(VP9_COMP *cpi,
   const int low_res = (cm->width <= 352 && cm->height <= 288);
   int variance4x4downsample[16];
 
+  int segment_id = CR_SEGMENT_ID_BASE;
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
+    const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map :
+                                                    cm->last_frame_seg_map;
+    segment_id = vp9_get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
+  }
+
   set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
 
   if (xd->mb_to_right_edge < 0)
@@ -843,7 +868,8 @@ static void choose_partitioning(VP9_COMP *cpi,
   // we get to one that's got a variance lower than our threshold.
   if ( mi_col + 8 > cm->mi_cols || mi_row + 8 > cm->mi_rows ||
       !set_vt_partitioning(cpi, xd, &vt, BLOCK_64X64, mi_row, mi_col,
-                           cpi->vbp_threshold_bsize_max, BLOCK_16X16)) {
+                           cpi->vbp_threshold_bsize_max, BLOCK_16X16,
+                           segment_id)) {
     for (i = 0; i < 4; ++i) {
       const int x32_idx = ((i & 1) << 2);
       const int y32_idx = ((i >> 1) << 2);
@@ -851,7 +877,7 @@ static void choose_partitioning(VP9_COMP *cpi,
       if (!set_vt_partitioning(cpi, xd, &vt.split[i], BLOCK_32X32,
                                (mi_row + y32_idx), (mi_col + x32_idx),
                                cpi->vbp_threshold,
-                               BLOCK_16X16)) {
+                               BLOCK_16X16, segment_id)) {
         for (j = 0; j < 4; ++j) {
           const int x16_idx = ((j & 1) << 1);
           const int y16_idx = ((j >> 1) << 1);
@@ -865,7 +891,7 @@ static void choose_partitioning(VP9_COMP *cpi,
                                    mi_row + y32_idx + y16_idx,
                                    mi_col + x32_idx + x16_idx,
                                    cpi->vbp_threshold_16x16,
-                                   cpi->vbp_bsize_min)) {
+                                   cpi->vbp_bsize_min, segment_id)) {
             for (k = 0; k < 4; ++k) {
               const int x8_idx = (k & 1);
               const int y8_idx = (k >> 1);
@@ -875,7 +901,7 @@ static void choose_partitioning(VP9_COMP *cpi,
                                          mi_row + y32_idx + y16_idx + y8_idx,
                                          mi_col + x32_idx + x16_idx + x8_idx,
                                          cpi->vbp_threshold_bsize_min,
-                                         BLOCK_8X8)) {
+                                         BLOCK_8X8, segment_id)) {
                   set_block_size(cpi, xd,
                                  (mi_row + y32_idx + y16_idx + y8_idx),
                                  (mi_col + x32_idx + x16_idx + x8_idx),
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 35c5a487b..69edfded1 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -269,6 +269,8 @@ struct EncWorkerData;
 typedef struct VP9_COMP {
   QUANTS quants;
   ThreadData td;
+  DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]);
   VP9_COMMON common;
   VP9EncoderConfig oxcf;
   struct lookahead_ctx    *lookahead;
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 71cea0e45..c18687b55 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -242,9 +242,13 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
     else
       xd->mi[0].src_mi->mbmi.tx_size = TX_8X8;
 
-    if (cpi->sf.partition_search_type == VAR_BASED_PARTITION &&
-        xd->mi[0].src_mi->mbmi.tx_size > TX_16X16)
-      xd->mi[0].src_mi->mbmi.tx_size = TX_16X16;
+    if (cpi->sf.partition_search_type == VAR_BASED_PARTITION) {
+      if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+          xd->mi[0].src_mi->mbmi.segment_id != CR_SEGMENT_ID_BASE)
+        xd->mi[0].src_mi->mbmi.tx_size = TX_8X8;
+      else if (xd->mi[0].src_mi->mbmi.tx_size > TX_16X16)
+        xd->mi[0].src_mi->mbmi.tx_size = TX_16X16;
+    }
   } else {
     xd->mi[0].src_mi->mbmi.tx_size =
         MIN(max_txsize_lookup[bsize],
@@ -1228,7 +1232,9 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
                                           [INTER_OFFSET(this_mode)];
           }
 
-          vp9_build_inter_predictor(pd->pre[0].buf, pd->pre[0].stride,
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+            vp9_highbd_build_inter_predictor(pd->pre[0].buf, pd->pre[0].stride,
                                     pd->dst.buf, pd->dst.stride,
                                     &xd->mi[0].bmi[i].as_mv[0].as_mv,
                                     &xd->block_refs[0]->sf,
@@ -1237,7 +1243,24 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
                                     vp9_get_interp_kernel(mbmi->interp_filter),
                                     MV_PRECISION_Q3,
                                     mi_col * MI_SIZE + 4 * (i & 0x01),
-                                    mi_row * MI_SIZE + 4 * (i >> 1));
+                                    mi_row * MI_SIZE + 4 * (i >> 1), xd->bd);
+          } else {
+#endif
+            vp9_build_inter_predictor(pd->pre[0].buf, pd->pre[0].stride,
+                                     pd->dst.buf, pd->dst.stride,
+                                     &xd->mi[0].bmi[i].as_mv[0].as_mv,
+                                     &xd->block_refs[0]->sf,
+                                     4 * num_4x4_blocks_wide,
+                                     4 * num_4x4_blocks_high, 0,
+                                     vp9_get_interp_kernel(mbmi->interp_filter),
+                                     MV_PRECISION_Q3,
+                                     mi_col * MI_SIZE + 4 * (i & 0x01),
+                                     mi_row * MI_SIZE + 4 * (i >> 1));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+          }
+#endif
+
           model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
                             &var_y, &sse_y);
 
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 389dc87e0..7143987d4 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -591,7 +591,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
       quants->y_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;
       quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
       quants->y_round[q][i] = (qrounding_factor * quant) >> 7;
-      cm->y_dequant[q][i] = quant;
+      cpi->y_dequant[q][i] = quant;
 
       // uv
       quant = i == 0 ? vp9_dc_quant(q, cm->uv_dc_delta_q, cm->bit_depth)
@@ -602,7 +602,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
       quants->uv_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;
       quants->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
       quants->uv_round[q][i] = (qrounding_factor * quant) >> 7;
-      cm->uv_dequant[q][i] = quant;
+      cpi->uv_dequant[q][i] = quant;
     }
 
     for (i = 2; i < 8; i++) {
@@ -612,7 +612,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
       quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1];
       quants->y_zbin[q][i] = quants->y_zbin[q][1];
       quants->y_round[q][i] = quants->y_round[q][1];
-      cm->y_dequant[q][i] = cm->y_dequant[q][1];
+      cpi->y_dequant[q][i] = cpi->y_dequant[q][1];
 
       quants->uv_quant[q][i] = quants->uv_quant[q][1];
       quants->uv_quant_fp[q][i] = quants->uv_quant_fp[q][1];
@@ -620,7 +620,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
       quants->uv_quant_shift[q][i] = quants->uv_quant_shift[q][1];
       quants->uv_zbin[q][i] = quants->uv_zbin[q][1];
       quants->uv_round[q][i] = quants->uv_round[q][1];
-      cm->uv_dequant[q][i] = cm->uv_dequant[q][1];
+      cpi->uv_dequant[q][i] = cpi->uv_dequant[q][1];
     }
   }
 }
@@ -641,7 +641,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
   x->plane[0].quant_shift = quants->y_quant_shift[qindex];
   x->plane[0].zbin = quants->y_zbin[qindex];
   x->plane[0].round = quants->y_round[qindex];
-  xd->plane[0].dequant = cm->y_dequant[qindex];
+  xd->plane[0].dequant = cpi->y_dequant[qindex];
 
   x->plane[0].quant_thred[0] = x->plane[0].zbin[0] * x->plane[0].zbin[0];
   x->plane[0].quant_thred[1] = x->plane[0].zbin[1] * x->plane[0].zbin[1];
@@ -654,7 +654,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
     x->plane[i].quant_shift = quants->uv_quant_shift[qindex];
     x->plane[i].zbin = quants->uv_zbin[qindex];
     x->plane[i].round = quants->uv_round[qindex];
-    xd->plane[i].dequant = cm->uv_dequant[qindex];
+    xd->plane[i].dequant = cpi->uv_dequant[qindex];
 
     x->plane[i].quant_thred[0] = x->plane[i].zbin[0] * x->plane[i].zbin[0];
     x->plane[i].quant_thred[1] = x->plane[i].zbin[1] * x->plane[i].zbin[1];
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index ae22a0b32..279074ce3 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@@ -712,9 +712,7 @@ static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
 }
 
 // right shift and rounding
-static INLINE void right_shift_8x8(__m128i *res, int const bit) {
-  const __m128i kOne = _mm_set1_epi16(1);
-  const int bit_m02 = bit - 2;
+static INLINE void right_shift_8x8(__m128i *res, const int bit) {
   __m128i sign0 = _mm_srai_epi16(res[0], 15);
   __m128i sign1 = _mm_srai_epi16(res[1], 15);
   __m128i sign2 = _mm_srai_epi16(res[2], 15);
@@ -724,16 +722,16 @@ static INLINE void right_shift_8x8(__m128i *res, int const bit) {
   __m128i sign6 = _mm_srai_epi16(res[6], 15);
   __m128i sign7 = _mm_srai_epi16(res[7], 15);
 
-  if (bit_m02 >= 0) {
-    __m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02);
-    res[0] = _mm_add_epi16(res[0], k_const_rounding);
-    res[1] = _mm_add_epi16(res[1], k_const_rounding);
-    res[2] = _mm_add_epi16(res[2], k_const_rounding);
-    res[3] = _mm_add_epi16(res[3], k_const_rounding);
-    res[4] = _mm_add_epi16(res[4], k_const_rounding);
-    res[5] = _mm_add_epi16(res[5], k_const_rounding);
-    res[6] = _mm_add_epi16(res[6], k_const_rounding);
-    res[7] = _mm_add_epi16(res[7], k_const_rounding);
+  if (bit == 2) {
+    const __m128i const_rounding = _mm_set1_epi16(1);
+    res[0] = _mm_add_epi16(res[0], const_rounding);
+    res[1] = _mm_add_epi16(res[1], const_rounding);
+    res[2] = _mm_add_epi16(res[2], const_rounding);
+    res[3] = _mm_add_epi16(res[3], const_rounding);
+    res[4] = _mm_add_epi16(res[4], const_rounding);
+    res[5] = _mm_add_epi16(res[5], const_rounding);
+    res[6] = _mm_add_epi16(res[6], const_rounding);
+    res[7] = _mm_add_epi16(res[7], const_rounding);
   }
 
   res[0] = _mm_sub_epi16(res[0], sign0);
@@ -745,14 +743,25 @@ static INLINE void right_shift_8x8(__m128i *res, int const bit) {
   res[6] = _mm_sub_epi16(res[6], sign6);
   res[7] = _mm_sub_epi16(res[7], sign7);
 
-  res[0] = _mm_srai_epi16(res[0], bit);
-  res[1] = _mm_srai_epi16(res[1], bit);
-  res[2] = _mm_srai_epi16(res[2], bit);
-  res[3] = _mm_srai_epi16(res[3], bit);
-  res[4] = _mm_srai_epi16(res[4], bit);
-  res[5] = _mm_srai_epi16(res[5], bit);
-  res[6] = _mm_srai_epi16(res[6], bit);
-  res[7] = _mm_srai_epi16(res[7], bit);
+  if (bit == 1) {
+    res[0] = _mm_srai_epi16(res[0], 1);
+    res[1] = _mm_srai_epi16(res[1], 1);
+    res[2] = _mm_srai_epi16(res[2], 1);
+    res[3] = _mm_srai_epi16(res[3], 1);
+    res[4] = _mm_srai_epi16(res[4], 1);
+    res[5] = _mm_srai_epi16(res[5], 1);
+    res[6] = _mm_srai_epi16(res[6], 1);
+    res[7] = _mm_srai_epi16(res[7], 1);
+  } else {
+    res[0] = _mm_srai_epi16(res[0], 2);
+    res[1] = _mm_srai_epi16(res[1], 2);
+    res[2] = _mm_srai_epi16(res[2], 2);
+    res[3] = _mm_srai_epi16(res[3], 2);
+    res[4] = _mm_srai_epi16(res[4], 2);
+    res[5] = _mm_srai_epi16(res[5], 2);
+    res[6] = _mm_srai_epi16(res[6], 2);
+    res[7] = _mm_srai_epi16(res[7], 2);
+  }
 }
 
 // write 8x8 array
diff --git a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
index 72e01d646..c35eb3603 100644
--- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
+++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
@@ -15,6 +15,7 @@ pw_1: times 8 dw 1
 
 SECTION .text
 
+; TODO(yunqingwang)fix quantize_b code for skip=1 case.
 %macro QUANTIZE_FN 2
 cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
                                 shift, qcoeff, dqcoeff, dequant, \
@@ -244,11 +245,11 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   psllw                           m2, 1
 %endif
   pxor                            m5, m5                   ; m5 = dedicated zero
-  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
+
   lea                         coeffq, [  coeffq+ncoeffq*2]
-  lea                         iscanq, [  iscanq+ncoeffq*2]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
+  lea                            r5q, [  r5q+ncoeffq*2]
+  lea                            r3q, [ r3q+ncoeffq*2]
+  lea                            r4q, [r4q+ncoeffq*2]
   neg                        ncoeffq
 
   ; get DC and first 15 AC coeffs
@@ -266,15 +267,15 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
   psignw                          m8, m9                   ; m8 = reinsert sign
   psignw                         m13, m10                  ; m13 = reinsert sign
-  mova        [qcoeffq+ncoeffq*2+ 0], m8
-  mova        [qcoeffq+ncoeffq*2+16], m13
+  mova            [r3q+ncoeffq*2+ 0], m8
+  mova            [r3q+ncoeffq*2+16], m13
 %ifidn %1, fp_32x32
   pabsw                           m8, m8
   pabsw                          m13, m13
 %endif
-  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
+  pmullw                          m8, m3                   ; r4[i] = r3[i] * q
   punpckhqdq                      m3, m3
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+  pmullw                         m13, m3                   ; r4[i] = r3[i] * q
 %ifidn %1, fp_32x32
   psrlw                           m8, 1
   psrlw                          m13, 1
@@ -282,12 +283,12 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   psignw                         m13, m10
   psrlw                           m0, m3, 2
 %endif
-  mova       [dqcoeffq+ncoeffq*2+ 0], m8
-  mova       [dqcoeffq+ncoeffq*2+16], m13
+  mova            [r4q+ncoeffq*2+ 0], m8
+  mova            [r4q+ncoeffq*2+16], m13
   pcmpeqw                         m8, m5                   ; m8 = c[i] == 0
   pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
-  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
-  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
+  mova                            m6, [  r5q+ncoeffq*2+ 0] ; m6 = scan[i]
+  mova                           m11, [  r5q+ncoeffq*2+16] ; m11 = scan[i]
   psubw                           m6, m7                   ; m6 = scan[i] + 1
   psubw                          m11, m7                   ; m11 = scan[i] + 1
   pandn                           m8, m6                   ; m8 = max(eob)
@@ -318,26 +319,26 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
   psignw                         m14, m9                   ; m14 = reinsert sign
   psignw                         m13, m10                  ; m13 = reinsert sign
-  mova        [qcoeffq+ncoeffq*2+ 0], m14
-  mova        [qcoeffq+ncoeffq*2+16], m13
+  mova            [r3q+ncoeffq*2+ 0], m14
+  mova            [r3q+ncoeffq*2+16], m13
 %ifidn %1, fp_32x32
   pabsw                          m14, m14
   pabsw                          m13, m13
 %endif
-  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+  pmullw                         m14, m3                   ; r4[i] = r3[i] * q
+  pmullw                         m13, m3                   ; r4[i] = r3[i] * q
 %ifidn %1, fp_32x32
   psrlw                          m14, 1
   psrlw                          m13, 1
   psignw                         m14, m9
   psignw                         m13, m10
 %endif
-  mova       [dqcoeffq+ncoeffq*2+ 0], m14
-  mova       [dqcoeffq+ncoeffq*2+16], m13
+  mova            [r4q+ncoeffq*2+ 0], m14
+  mova            [r4q+ncoeffq*2+16], m13
   pcmpeqw                        m14, m5                   ; m14 = c[i] == 0
   pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
-  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
-  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
+  mova                            m6, [  r5q+ncoeffq*2+ 0] ; m6 = scan[i]
+  mova                           m11, [  r5q+ncoeffq*2+16] ; m11 = scan[i]
   psubw                           m6, m7                   ; m6 = scan[i] + 1
   psubw                          m11, m7                   ; m11 = scan[i] + 1
   pandn                          m14, m6                   ; m14 = max(eob)
@@ -350,10 +351,10 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
 %ifidn %1, fp_32x32
   jmp .accumulate_eob
 .skip_iter:
-  mova        [qcoeffq+ncoeffq*2+ 0], m5
-  mova        [qcoeffq+ncoeffq*2+16], m5
-  mova       [dqcoeffq+ncoeffq*2+ 0], m5
-  mova       [dqcoeffq+ncoeffq*2+16], m5
+  mova            [r3q+ncoeffq*2+ 0], m5
+  mova            [r3q+ncoeffq*2+16], m5
+  mova            [r4q+ncoeffq*2+ 0], m5
+  mova            [r4q+ncoeffq*2+16], m5
   add                        ncoeffq, mmsize
   jl .ac_only_loop
 %endif
@@ -368,7 +369,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   pshuflw                         m7, m8, 0x1
   pmaxsw                          m8, m7
   pextrw                          r6, m8, 0
-  mov                             [r2], r6
+  mov                           [r2], r6
   RET
 
   ; skip-block, i.e. just write all zeroes
@@ -377,19 +378,19 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   movifnidn                  ncoeffq, ncoeffmp
   mov                             r2, qcoeffmp
   mov                             r3, eobmp
-  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
+
+  lea                            r0q, [r0q+ncoeffq*2]
+  lea                            r2q, [r2q+ncoeffq*2]
   neg                        ncoeffq
   pxor                            m7, m7
 .blank_loop:
-  mova       [dqcoeffq+ncoeffq*2+ 0], m7
-  mova       [dqcoeffq+ncoeffq*2+16], m7
-  mova        [qcoeffq+ncoeffq*2+ 0], m7
-  mova        [qcoeffq+ncoeffq*2+16], m7
+  mova            [r0q+ncoeffq*2+ 0], m7
+  mova            [r0q+ncoeffq*2+16], m7
+  mova            [r2q+ncoeffq*2+ 0], m7
+  mova            [r2q+ncoeffq*2+16], m7
   add                        ncoeffq, mmsize
   jl .blank_loop
-  mov                    word [eobq], 0
+  mov                     word [r3q], 0
   RET
 %endmacro