9 files changed, 260 insertions, 37 deletions
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index ed3ea7e1f..1be358e87 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -264,11 +264,18 @@ void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
                                     BLOCK_SIZE bsize) {
   build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0, 0);
 }
+
+void vp9_build_inter_predictors_sbp(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                    BLOCK_SIZE bsize, int plane) {
+  build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, plane, plane);
+}
+
 void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
                                      BLOCK_SIZE bsize) {
   build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 1,
                                     MAX_MB_PLANE - 1);
 }
+
 void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
                                    BLOCK_SIZE bsize) {
   build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0,
diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h
index d5ecf85b4..e7057445a 100644
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -52,6 +52,9 @@ void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
 void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
                                     BLOCK_SIZE bsize);
 
+void vp9_build_inter_predictors_sbp(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                    BLOCK_SIZE bsize, int plane);
+
 void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
                                      BLOCK_SIZE bsize);
 
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 07249d092..31dd7ebcd 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -1064,8 +1064,7 @@ static int tile_worker_hook(TileWorkerData *const tile_data,
     vp9_zero(tile_data->xd.left_seg_context);
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
          mi_col += MI_BLOCK_SIZE) {
-      decode_partition(tile_data->pbi, &tile_data->xd,
-                       &tile_data->pbi->common.counts,
+      decode_partition(tile_data->pbi, &tile_data->xd, &tile_data->counts,
                        tile, mi_row, mi_col, &tile_data->bit_reader,
                        BLOCK_64X64);
     }
@@ -1086,6 +1085,105 @@ static int compare_tile_buffers(const void *a, const void *b) {
   }
 }
 
+// Accumulate frame counts.
+static void accumulate_frame_counts(VP9_COMMON *cm, FRAME_COUNTS *counts) {
+  int i, j, k, l, m, n;
+
+  for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
+    for (j = 0; j < INTRA_MODES; j++)
+      cm->counts.y_mode[i][j] += counts->y_mode[i][j];
+
+  for (i = 0; i < INTRA_MODES; i++)
+    for (j = 0; j < INTRA_MODES; j++)
+      cm->counts.uv_mode[i][j] += counts->uv_mode[i][j];
+
+  for (i = 0; i < PARTITION_CONTEXTS; i++)
+    for (j = 0; j < PARTITION_TYPES; j++)
+      cm->counts.partition[i][j] += counts->partition[i][j];
+
+  for (i = 0; i < TX_SIZES; i++)
+    for (j = 0; j < PLANE_TYPES; j++)
+      for (k = 0; k < REF_TYPES; k++)
+        for (l = 0; l < COEF_BANDS; l++)
+          for (m = 0; m < COEFF_CONTEXTS; m++) {
+            cm->counts.eob_branch[i][j][k][l][m] +=
+                counts->eob_branch[i][j][k][l][m];
+            for (n = 0; n < UNCONSTRAINED_NODES + 1; n++)
+              cm->counts.coef[i][j][k][l][m][n] +=
+                  counts->coef[i][j][k][l][m][n];
+          }
+
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+    for (j = 0; j < SWITCHABLE_FILTERS; j++)
+      cm->counts.switchable_interp[i][j] += counts->switchable_interp[i][j];
+
+  for (i = 0; i < INTER_MODE_CONTEXTS; i++)
+    for (j = 0; j < INTER_MODES; j++)
+      cm->counts.inter_mode[i][j] += counts->inter_mode[i][j];
+
+  for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
+    for (j = 0; j < 2; j++)
+      cm->counts.intra_inter[i][j] += counts->intra_inter[i][j];
+
+  for (i = 0; i < COMP_INTER_CONTEXTS; i++)
+    for (j = 0; j < 2; j++)
+      cm->counts.comp_inter[i][j] += counts->comp_inter[i][j];
+
+  for (i = 0; i < REF_CONTEXTS; i++)
+    for (j = 0; j < 2; j++)
+      for (k = 0; k < 2; k++)
+      cm->counts.single_ref[i][j][k] += counts->single_ref[i][j][k];
+
+  for (i = 0; i < REF_CONTEXTS; i++)
+    for (j = 0; j < 2; j++)
+      cm->counts.comp_ref[i][j] += counts->comp_ref[i][j];
+
+  for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
+    for (j = 0; j < TX_SIZES; j++)
+      cm->counts.tx.p32x32[i][j] += counts->tx.p32x32[i][j];
+
+    for (j = 0; j < TX_SIZES - 1; j++)
+      cm->counts.tx.p16x16[i][j] += counts->tx.p16x16[i][j];
+
+    for (j = 0; j < TX_SIZES - 2; j++)
+      cm->counts.tx.p8x8[i][j] += counts->tx.p8x8[i][j];
+  }
+
+  for (i = 0; i < SKIP_CONTEXTS; i++)
+    for (j = 0; j < 2; j++)
+      cm->counts.skip[i][j] += counts->skip[i][j];
+
+  for (i = 0; i < MV_JOINTS; i++)
+    cm->counts.mv.joints[i] += counts->mv.joints[i];
+
+  for (k = 0; k < 2; k++) {
+    nmv_component_counts *comps = &cm->counts.mv.comps[k];
+    nmv_component_counts *comps_t = &counts->mv.comps[k];
+
+    for (i = 0; i < 2; i++) {
+      comps->sign[i] += comps_t->sign[i];
+      comps->class0_hp[i] += comps_t->class0_hp[i];
+      comps->hp[i] += comps_t->hp[i];
+    }
+
+    for (i = 0; i < MV_CLASSES; i++)
+      comps->classes[i] += comps_t->classes[i];
+
+    for (i = 0; i < CLASS0_SIZE; i++) {
+      comps->class0[i] += comps_t->class0[i];
+      for (j = 0; j < MV_FP_SIZE; j++)
+        comps->class0_fp[i][j] += comps_t->class0_fp[i][j];
+    }
+
+    for (i = 0; i < MV_OFFSET_BITS; i++)
+      for (j = 0; j < 2; j++)
+        comps->bits[i][j] += comps_t->bits[i][j];
+
+    for (i = 0; i < MV_FP_SIZE; i++)
+      comps->fp[i] += comps_t->fp[i];
+  }
+}
+
 static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
                                       const uint8_t *data,
                                       const uint8_t *data_end) {
@@ -1172,6 +1270,17 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
     }
   }
 
+  // Initialize thread frame counts.
+  if (!cm->frame_parallel_decoding_mode) {
+    int i;
+
+    for (i = 0; i < num_workers; ++i) {
+      TileWorkerData *const tile_data =
+          (TileWorkerData*)pbi->tile_workers[i].data1;
+      vp9_zero(tile_data->counts);
+    }
+  }
+
   n = 0;
   while (n < tile_cols) {
     int i;
@@ -1184,7 +1293,7 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
       tile_data->pbi = pbi;
       tile_data->xd = pbi->mb;
       tile_data->xd.corrupted = 0;
-      vp9_tile_init(tile, &pbi->common, 0, buf->col);
+      vp9_tile_init(tile, cm, 0, buf->col);
       setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
                           &tile_data->bit_reader, pbi->decrypt_cb,
                           pbi->decrypt_state);
@@ -1218,6 +1327,15 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
       bit_reader_end = vp9_reader_find_end(&tile_data->bit_reader);
       final_worker = -1;
     }
+
+    // Accumulate thread frame counts.
+    if (n >= tile_cols && !cm->frame_parallel_decoding_mode) {
+      for (i = 0; i < num_workers; ++i) {
+        TileWorkerData *const tile_data =
+            (TileWorkerData*)pbi->tile_workers[i].data1;
+        accumulate_frame_counts(cm, &tile_data->counts);
+      }
+    }
   }
 
   return bit_reader_end;
@@ -1673,10 +1791,8 @@ void vp9_decode_frame(VP9Decoder *pbi,
     vp9_frameworker_unlock_stats(worker);
   }
 
-  // TODO(jzern): remove frame_parallel_decoding_mode restriction for
-  // single-frame tile decoding.
-  if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1 &&
-      cm->frame_parallel_decoding_mode) {
+  if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1) {
+    // Multi-threaded tile decoder
     *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end);
     if (!xd->corrupted) {
       // If multiple threads are used to decode tiles, then we use those threads
diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index 47cce068f..4dfbe8171 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -36,6 +36,7 @@ typedef struct TileData {
 typedef struct TileWorkerData {
   struct VP9Decoder *pbi;
   vp9_reader bit_reader;
+  FRAME_COUNTS counts;
   DECLARE_ALIGNED(16, MACROBLOCKD, xd);
   struct vpx_internal_error_info error_info;
 } TileWorkerData;
diff --git a/vp9/decoder/vp9_dthread.c b/vp9/decoder/vp9_dthread.c
index 7aa888848..d2a2b819c 100644
--- a/vp9/decoder/vp9_dthread.c
+++ b/vp9/decoder/vp9_dthread.c
@@ -45,6 +45,13 @@ void vp9_frameworker_signal_stats(VP9Worker *const worker) {
 #endif
 }
 
+// This macro prevents thread_sanitizer from reporting known concurrent writes.
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#define BUILDING_WITH_TSAN
+#endif
+#endif
+
 // TODO(hkuang): Remove worker parameter as it is only used in debug code.
 void vp9_frameworker_wait(VP9Worker *const worker, RefCntBuffer *const ref_buf,
                           int row) {
@@ -52,9 +59,11 @@ void vp9_frameworker_wait(VP9Worker *const worker, RefCntBuffer *const ref_buf,
   if (!ref_buf)
     return;
 
-  // Enabling the following line of code will get harmless tsan error but
-  // will get best performance.
-  // if (ref_buf->row >= row && ref_buf->buf.corrupted != 1) return;
+#ifndef BUILDING_WITH_TSAN
+  // The following line of code will get harmless tsan error but it is the key
+  // to get best performance.
+  if (ref_buf->row >= row && ref_buf->buf.corrupted != 1) return;
+#endif
 
   {
     // Find the worker thread that owns the reference frame. If the reference
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 68174a6cc..04a1b8f3c 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -118,6 +118,10 @@ struct macroblock {
   // Used to store sub partition's choices.
   MV pred_mv[MAX_REF_FRAMES];
 
+  // Strong color activity detection. Used in RTC coding mode to enhance
+  // the visual quality at the boundary of moving color objects.
+  uint8_t color_sensitivity[2];
+
   void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride);
   void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob);
 #if CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 091013060..d17b051ec 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -504,7 +504,7 @@ static void choose_partitioning(VP9_COMP *cpi,
   threshold_base = (int64_t)(threshold_multiplier *
       vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth));
   threshold = threshold_base;
-  threshold_bsize_min = threshold_base << 6;
+  threshold_bsize_min = threshold_base << cpi->oxcf.speed;
   threshold_bsize_max = threshold_base;
 
   // Modify thresholds for key frame and for low-resolutions (set lower
@@ -529,12 +529,23 @@ static void choose_partitioning(VP9_COMP *cpi,
 
   if (cm->frame_type != KEY_FRAME) {
     MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
+    unsigned int var = 0, sse;
     vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, sf);
     mbmi->ref_frame[0] = LAST_FRAME;
     mbmi->ref_frame[1] = NONE;
     mbmi->sb_type = BLOCK_64X64;
     mbmi->mv[0].as_int = 0;
-    vp9_build_inter_predictors_sby(xd, mi_row, mi_col, BLOCK_64X64);
+    vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
+
+    for (i = 1; i <= 2; ++i) {
+      struct macroblock_plane  *p = &x->plane[i];
+      struct macroblockd_plane *pd = &xd->plane[i];
+      const BLOCK_SIZE bs = get_plane_block_size(BLOCK_64X64, pd);
+      var += cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
+                                pd->dst.buf, pd->dst.stride, &sse);
+      if (sse > 2048)
+        x->color_sensitivity[i - 1] = 1;
+    }
 
     d = xd->plane[0].dst.buf;
     dp = xd->plane[0].dst.stride;
@@ -2738,6 +2749,10 @@ static MV_REFERENCE_FRAME get_frame_type(const VP9_COMP *cpi) {
 static TX_MODE select_tx_mode(const VP9_COMP *cpi, MACROBLOCKD *const xd) {
   if (xd->lossless)
     return ONLY_4X4;
+  if (cpi->common.frame_type == KEY_FRAME &&
+      cpi->sf.use_nonrd_pick_mode &&
+      cpi->sf.partition_search_type == VAR_BASED_PARTITION)
+    return ALLOW_16X16;
   if (cpi->sf.tx_size_search_method == USE_LARGESTALL)
     return ALLOW_32X32;
   else if (cpi->sf.tx_size_search_method == USE_FULL_RD||
@@ -3382,6 +3397,8 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi,
     x->source_variance = UINT_MAX;
     vp9_zero(x->pred_mv);
     vp9_rd_cost_init(&dummy_rdc);
+    x->color_sensitivity[0] = 0;
+    x->color_sensitivity[1] = 0;
 
     // Set the partition type of the 64X64 block
     switch (sf->partition_search_type) {
@@ -3671,14 +3688,6 @@ static void encode_frame_internal(VP9_COMP *cpi) {
                  cm->uv_dc_delta_q == 0 &&
                  cm->uv_ac_delta_q == 0;
 
-  cm->tx_mode = select_tx_mode(cpi, xd);
-  if (cm->frame_type == KEY_FRAME &&
-      cpi->sf.use_nonrd_pick_mode &&
-      cpi->sf.partition_search_type == VAR_BASED_PARTITION) {
-    cm->tx_mode = ALLOW_16X16;
-  }
-
-
 #if CONFIG_VP9_HIGHBITDEPTH
   if (cm->use_highbitdepth)
     x->fwd_txm4x4 = xd->lossless ? vp9_highbd_fwht4x4 : vp9_highbd_fdct4x4;
@@ -3691,10 +3700,10 @@ static void encode_frame_internal(VP9_COMP *cpi) {
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   x->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
 
-  if (xd->lossless) {
+  if (xd->lossless)
     x->optimize = 0;
-    cm->lf.filter_level = 0;
-  }
+
+  cm->tx_mode = select_tx_mode(cpi, xd);
 
   vp9_frame_init_quantizer(cpi);
 
@@ -3782,9 +3791,6 @@ static INTERP_FILTER get_interp_filter(
 
 void vp9_encode_frame(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
-  RD_OPT *const rd_opt = &cpi->rd;
-  FRAME_COUNTS *counts = cpi->td.counts;
-  RD_COUNTS *const rdc = &cpi->td.rd_counts;
 
   // In the longer term the encoder should be generalized to match the
   // decoder such that we allow compound where one of the 3 buffers has a
@@ -3806,11 +3812,11 @@ void vp9_encode_frame(VP9_COMP *cpi) {
     }
   }
 
-  vpx_memset(cpi->td.counts->tx.tx_totals, 0,
-             sizeof(cpi->td.counts->tx.tx_totals));
-
   if (cpi->sf.frame_parameter_update) {
     int i;
+    RD_OPT *const rd_opt = &cpi->rd;
+    FRAME_COUNTS *counts = cpi->td.counts;
+    RD_COUNTS *const rdc = &cpi->td.rd_counts;
 
     // This code does a single RD pass over the whole frame assuming
     // either compound, single or hybrid prediction as per whatever has
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index a428f1a2d..159e0fc0c 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -90,13 +90,10 @@ static int mv_err_cost(const MV *mv, const MV *ref,
 
 static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref,
                           int error_per_bit) {
-  if (x->nmvsadcost) {
-    const MV diff = { mv->row - ref->row,
-                      mv->col - ref->col };
-    return ROUND_POWER_OF_TWO(mv_cost(&diff, x->nmvjointsadcost,
-                                      x->nmvsadcost) * error_per_bit, 8);
-  }
-  return 0;
+  const MV diff = { mv->row - ref->row,
+                    mv->col - ref->col };
+  return ROUND_POWER_OF_TWO(mv_cost(&diff, x->nmvjointsadcost,
+                                    x->nmvsadcost) * error_per_bit, 8);
 }
 
 void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride) {
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index e239c008f..a34b12258 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -283,6 +283,71 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
     x->skip_txfm[0] = 1;
 }
 
+static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE bsize,
+                               MACROBLOCK *x, MACROBLOCKD *xd,
+                               int *out_rate_sum, int64_t *out_dist_sum,
+                               unsigned int *var_y, unsigned int *sse_y) {
+  // Note our transform coeffs are 8 times an orthogonal transform.
+  // Hence quantizer step is also 8 times. To get effective quantizer
+  // we need to divide by 8 before sending to modeling function.
+  unsigned int sse;
+  int rate;
+  int64_t dist;
+  int i;
+
+  *out_rate_sum = 0;
+  *out_dist_sum = 0;
+
+  for (i = 1; i <= 2; ++i) {
+    struct macroblock_plane *const p = &x->plane[i];
+    struct macroblockd_plane *const pd = &xd->plane[i];
+    const uint32_t dc_quant = pd->dequant[0];
+    const uint32_t ac_quant = pd->dequant[1];
+    const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
+    unsigned int var;
+
+    if (!x->color_sensitivity[i - 1])
+      continue;
+
+    var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
+                             pd->dst.buf, pd->dst.stride, &sse);
+    *var_y += var;
+    *sse_y += sse;
+
+  #if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs],
+                                   dc_quant >> (xd->bd - 5), &rate, &dist);
+    } else {
+      vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs],
+                                   dc_quant >> 3, &rate, &dist);
+    }
+  #else
+    vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs],
+                                 dc_quant >> 3, &rate, &dist);
+  #endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    *out_rate_sum += rate >> 1;
+    *out_dist_sum += dist << 3;
+
+  #if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs],
+                                   ac_quant >> (xd->bd - 5), &rate, &dist);
+    } else {
+      vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs],
+                                   ac_quant >> 3, &rate, &dist);
+    }
+  #else
+    vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs],
+                                 ac_quant >> 3, &rate, &dist);
+  #endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    *out_rate_sum += rate;
+    *out_dist_sum += dist << 4;
+  }
+}
+
 static int get_pred_buffer(PRED_BUFFER *p, int len) {
   int i;
 
@@ -658,7 +723,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       continue;
 
     // Select prediction reference frames.
-    xd->plane[0].pre[0] = yv12_mb[ref_frame][0];
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
 
     clamp_mv2(&frame_mv[NEARESTMV][ref_frame].as_mv, xd);
     clamp_mv2(&frame_mv[NEARMV][ref_frame].as_mv, xd);
@@ -776,6 +842,20 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                           &var_y, &sse_y);
       }
 
+      // chroma component rate-distortion cost modeling
+      if (x->color_sensitivity[0] || x->color_sensitivity[1]) {
+        int uv_rate = 0;
+        int64_t uv_dist = 0;
+        if (x->color_sensitivity[0])
+          vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 1);
+        if (x->color_sensitivity[1])
+          vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 2);
+        model_rd_for_sb_uv(cpi, bsize, x, xd, &uv_rate, &uv_dist,
+                           &var_y, &sse_y);
+        this_rdc.rate += uv_rate;
+        this_rdc.dist += uv_dist;
+      }
+
       this_rdc.rate += rate_mv;
       this_rdc.rate += cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
                                   [INTER_OFFSET(this_mode)];