11 files changed, 116 insertions, 68 deletions
diff --git a/vp9/common/vp9_debugmodes.c b/vp9/common/vp9_debugmodes.c
index d9dace6ac..e96bc4f2b 100644
--- a/vp9/common/vp9_debugmodes.c
+++ b/vp9/common/vp9_debugmodes.c
@@ -25,55 +25,65 @@ static void log_frame_info(VP9_COMMON *cm, const char *str, FILE *f) {
 static void print_mi_data(VP9_COMMON *cm, FILE *file, const char *descriptor,
                           size_t member_offset) {
   int mi_row, mi_col;
-  int mi_index = 0;
-  // TODO(hkuang): Fix this debug function.
-  MODE_INFO **mi = &cm->mi;
+  MODE_INFO *mi = cm->mi;
   int rows = cm->mi_rows;
   int cols = cm->mi_cols;
   char prefix = descriptor[0];
 
   log_frame_info(cm, descriptor, file);
-  mi_index = 0;
   for (mi_row = 0; mi_row < rows; mi_row++) {
     fprintf(file, "%c ", prefix);
     for (mi_col = 0; mi_col < cols; mi_col++) {
       fprintf(file, "%2d ",
-              *((int*) ((char *) (&mi[mi_index]->mbmi) +
-                        member_offset)));
-      mi_index++;
+              *((int*) ((char *) (&mi->src_mi->mbmi) +
+                                  member_offset)));
+      mi++;
     }
     fprintf(file, "\n");
-    mi_index += 8;
+    mi += 8;
   }
   fprintf(file, "\n");
 }
+
 void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, const char *file) {
   int mi_row;
   int mi_col;
-  int mi_index = 0;
   FILE *mvs = fopen(file, "a");
-  // TODO(hkuang): Fix this debug function.
-  MODE_INFO **mi = &cm->mi;
+  MODE_INFO *mi = cm->mi;
   int rows = cm->mi_rows;
   int cols = cm->mi_cols;
 
   print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type));
   print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode));
-  print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, skip));
   print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0]));
   print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size));
   print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode));
 
+  // output skip infomation.
+  log_frame_info(cm, "Skips:", mvs);
+  for (mi_row = 0; mi_row < rows; mi_row++) {
+    fprintf(mvs, "S ");
+    for (mi_col = 0; mi_col < cols; mi_col++) {
+      fprintf(mvs, "%2d ", mi->src_mi->mbmi.skip);
+      mi++;
+    }
+    fprintf(mvs, "\n");
+    mi += 8;
+  }
+  fprintf(mvs, "\n");
+
+  // output motion vectors.
   log_frame_info(cm, "Vectors ", mvs);
+  mi = cm->mi;
   for (mi_row = 0; mi_row < rows; mi_row++) {
     fprintf(mvs, "V ");
     for (mi_col = 0; mi_col < cols; mi_col++) {
-      fprintf(mvs, "%4d:%4d ", mi[mi_index]->mbmi.mv[0].as_mv.row,
-                               mi[mi_index]->mbmi.mv[0].as_mv.col);
-      mi_index++;
+      fprintf(mvs, "%4d:%4d ", mi->src_mi->mbmi.mv[0].as_mv.row,
+                               mi->src_mi->mbmi.mv[0].as_mv.col);
+      mi++;
     }
     fprintf(mvs, "\n");
-    mi_index += 8;
+    mi += 8;
   }
   fprintf(mvs, "\n");
 
diff --git a/vp9/common/vp9_thread.h b/vp9/common/vp9_thread.h
index c24ef5fac..12848fede 100644
--- a/vp9/common/vp9_thread.h
+++ b/vp9/common/vp9_thread.h
@@ -28,7 +28,7 @@ extern "C" {
 
 #if CONFIG_MULTITHREAD
 
-#if defined(_WIN32)
+#if defined(_WIN32) && !HAVE_PTHREAD_H
 #include <errno.h>  // NOLINT
 #include <process.h>  // NOLINT
 #include <windows.h>  // NOLINT
diff --git a/vp9/common/x86/vp9_high_intrapred_sse2.asm b/vp9/common/x86/vp9_high_intrapred_sse2.asm
index 721126c78..b12d29c0a 100644
--- a/vp9/common/x86/vp9_high_intrapred_sse2.asm
+++ b/vp9/common/x86/vp9_high_intrapred_sse2.asm
@@ -345,7 +345,7 @@ cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
 
 %if ARCH_X86_64
 INIT_XMM sse2
-cglobal highbd_tm_predictor_16x16, 5, 6, 8, dst, stride, above, left, bps, one
+cglobal highbd_tm_predictor_16x16, 5, 6, 9, dst, stride, above, left, bps, one
   movd                  m2, [aboveq-2]
   mova                  m0, [aboveq]
   mova                  m1, [aboveq+16]
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 0885909cd..99bb9300e 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -471,6 +471,43 @@ static int set_vt_partitioning(VP9_COMP *cpi,
   return 0;
 }
 
+
+void vp9_set_vbp_thresholds(VP9_COMP *cpi, int q) {
+  SPEED_FEATURES *const sf = &cpi->sf;
+  if (sf->partition_search_type != VAR_BASED_PARTITION) {
+    return;
+  } else {
+    VP9_COMMON *const cm = &cpi->common;
+    const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+    const int is_key_frame = (cm->frame_type == KEY_FRAME);
+    const int use_4x4_partition = is_key_frame;
+    const int low_res = (cm->width <= 352 && cm->height <= 288);
+    const int threshold_multiplier = is_key_frame ? 80 : 4;
+    const int64_t threshold_base = (int64_t)(threshold_multiplier *
+        vp9_convert_qindex_to_q(q, cm->bit_depth));
+    cpi->vbp_threshold = threshold_base;
+    cpi->vbp_threshold_bsize_min = threshold_base << oxcf->speed;
+    cpi->vbp_threshold_bsize_max = threshold_base;
+
+    if (is_key_frame) {
+      cpi->vbp_threshold = threshold_base >> 2;
+      cpi->vbp_threshold_bsize_min = threshold_base << 2;
+    } else if (low_res) {
+      cpi->vbp_threshold_bsize_min = threshold_base << 3;
+      cpi->vbp_threshold_bsize_max = threshold_base >> 2;
+    }
+    // TODO(marpan): Allow 4x4 partitions for inter-frames.
+    // use_4x4_partition = (variance4x4downsample[i2 + j] == 1);
+    // If 4x4 partition is not used, then 8x8 partition will be selected
+    // if variance of 16x16 block is very high, so use larger threshold
+    // for 16x16 (threshold_bsize_min) in that case.
+    cpi->vbp_threshold_16x16 = (use_4x4_partition) ?
+      cpi->vbp_threshold : cpi->vbp_threshold_bsize_min;
+    cpi->vbp_bsize_min = (use_4x4_partition) ? BLOCK_8X8 : BLOCK_16X16;
+  }
+}
+
+
 // This function chooses partitioning based on the variance between source and
 // reconstructed last, where variance is computed for downs-sampled inputs.
 static void choose_partitioning(VP9_COMP *cpi,
@@ -479,7 +516,6 @@ static void choose_partitioning(VP9_COMP *cpi,
                                 int mi_row, int mi_col) {
   VP9_COMMON * const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
-
   int i, j, k, m;
   v64x64 vt;
   v16x16 vt2[16];
@@ -489,34 +525,12 @@ static void choose_partitioning(VP9_COMP *cpi,
   int dp;
   int pixels_wide = 64, pixels_high = 64;
   const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
-  const struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf;
+
   // Always use 4x4 partition for key frame.
   const int is_key_frame = (cm->frame_type == KEY_FRAME);
   const int use_4x4_partition = is_key_frame;
+  const int low_res = (cm->width <= 352 && cm->height <= 288);
   int variance4x4downsample[16];
-  int low_res = (cm->width <= 352 && cm->height <= 288) ? 1 : 0;
-  const int threshold_multiplier = is_key_frame ? 80 : 4;
-  int64_t threshold_base;
-  int64_t threshold;
-  int64_t threshold_bsize_min;
-  int64_t threshold_bsize_max;
-
-  vp9_clear_system_state();
-  threshold_base = (int64_t)(threshold_multiplier *
-      vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth));
-  threshold = threshold_base;
-  threshold_bsize_min = threshold_base << cpi->oxcf.speed;
-  threshold_bsize_max = threshold_base;
-
-  // Modify thresholds for key frame and for low-resolutions (set lower
-  // thresholds to favor split).
-  if (is_key_frame) {
-    threshold = threshold_base >> 2;
-    threshold_bsize_min = threshold_base << 2;
-  } else if (low_res) {
-    threshold_bsize_min = threshold_base << 3;
-    threshold_bsize_max = threshold_base >> 2;
-  }
 
   set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
 
@@ -531,7 +545,8 @@ static void choose_partitioning(VP9_COMP *cpi,
   if (!is_key_frame) {
     MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
     unsigned int var = 0, sse;
-    vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, sf);
+    vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+        &cm->frame_refs[LAST_FRAME - 1].sf);
     mbmi->ref_frame[0] = LAST_FRAME;
     mbmi->ref_frame[1] = NONE;
     mbmi->sb_type = BLOCK_64X64;
@@ -619,7 +634,7 @@ static void choose_partitioning(VP9_COMP *cpi,
       }
       if (is_key_frame || (low_res &&
           vt.split[i].split[j].part_variances.none.variance >
-          (threshold << 1))) {
+          (cpi->vbp_threshold << 1))) {
         // Go down to 4x4 down-sampling for variance.
         variance4x4downsample[i2 + j] = 1;
         for (k = 0; k < 4; k++) {
@@ -680,30 +695,22 @@ static void choose_partitioning(VP9_COMP *cpi,
   }
   fill_variance_tree(&vt, BLOCK_64X64);
 
-
   // Now go through the entire structure,  splitting every block size until
   // we get to one that's got a variance lower than our threshold.
   if ( mi_col + 8 > cm->mi_cols || mi_row + 8 > cm->mi_rows ||
       !set_vt_partitioning(cpi, xd, &vt, BLOCK_64X64, mi_row, mi_col,
-                           threshold_bsize_max, BLOCK_16X16)) {
+                           cpi->vbp_threshold_bsize_max, BLOCK_16X16)) {
     for (i = 0; i < 4; ++i) {
       const int x32_idx = ((i & 1) << 2);
       const int y32_idx = ((i >> 1) << 2);
       const int i2 = i << 2;
       if (!set_vt_partitioning(cpi, xd, &vt.split[i], BLOCK_32X32,
                                (mi_row + y32_idx), (mi_col + x32_idx),
-                               threshold, BLOCK_16X16)) {
+                               cpi->vbp_threshold,
+                               BLOCK_16X16)) {
         for (j = 0; j < 4; ++j) {
           const int x16_idx = ((j & 1) << 1);
           const int y16_idx = ((j >> 1) << 1);
-          // TODO(marpan): Allow 4x4 partitions for inter-frames.
-          // use_4x4_partition = (variance4x4downsample[i2 + j] == 1);
-          // If 4x4 partition is not used, then 8x8 partition will be selected
-          // if variance of 16x16 block is very high, so use larger threshold
-          // for 16x16 (threshold_bsize_min) in that case.
-          uint64_t threshold_16x16 = (use_4x4_partition) ? threshold :
-                                      threshold_bsize_min;
-          BLOCK_SIZE bsize_min = (use_4x4_partition) ? BLOCK_8X8 : BLOCK_16X16;
           // For inter frames: if variance4x4downsample[] == 1 for this 16x16
           // block, then the variance is based on 4x4 down-sampling, so use vt2
           // in set_vt_partioning(), otherwise use vt.
@@ -713,7 +720,8 @@ static void choose_partitioning(VP9_COMP *cpi,
           if (!set_vt_partitioning(cpi, xd, vtemp, BLOCK_16X16,
                                    mi_row + y32_idx + y16_idx,
                                    mi_col + x32_idx + x16_idx,
-                                   threshold_16x16, bsize_min)) {
+                                   cpi->vbp_threshold_16x16,
+                                   cpi->vbp_bsize_min)) {
             for (k = 0; k < 4; ++k) {
               const int x8_idx = (k & 1);
               const int y8_idx = (k >> 1);
@@ -722,7 +730,8 @@ static void choose_partitioning(VP9_COMP *cpi,
                                          BLOCK_8X8,
                                          mi_row + y32_idx + y16_idx + y8_idx,
                                          mi_col + x32_idx + x16_idx + x8_idx,
-                                         threshold_bsize_min, BLOCK_8X8)) {
+                                         cpi->vbp_threshold_bsize_min,
+                                         BLOCK_8X8)) {
                   set_block_size(cpi, xd,
                                  (mi_row + y32_idx + y16_idx + y8_idx),
                                  (mi_col + x32_idx + x16_idx + x8_idx),
diff --git a/vp9/encoder/vp9_encodeframe.h b/vp9/encoder/vp9_encodeframe.h
index 556f3a5a1..8d545b671 100644
--- a/vp9/encoder/vp9_encodeframe.h
+++ b/vp9/encoder/vp9_encodeframe.h
@@ -38,6 +38,8 @@ void vp9_init_tile_data(struct VP9_COMP *cpi);
 void vp9_encode_tile(struct VP9_COMP *cpi, struct ThreadData *td,
                      int tile_row, int tile_col);
 
+void vp9_set_vbp_thresholds(struct VP9_COMP *cpi, int q);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 65b660528..eda38ff3d 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -2800,7 +2800,10 @@ static void encode_without_recode_loop(VP9_COMP *cpi) {
   set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
 
   vp9_set_quantizer(cm, q);
+  vp9_set_vbp_thresholds(cpi, q);
+
   setup_frame(cpi);
+
   // Variance adaptive and in frame q adjustment experiments are mutually
   // exclusive.
   if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 1e4c982ff..35c5a487b 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -450,6 +450,13 @@ typedef struct VP9_COMP {
 
   int resize_pending;
 
+  // VAR_BASED_PARTITION thresholds
+  int64_t vbp_threshold;
+  int64_t vbp_threshold_bsize_min;
+  int64_t vbp_threshold_bsize_max;
+  int64_t vbp_threshold_16x16;
+  BLOCK_SIZE vbp_bsize_min;
+
   // Multi-threading
   int num_workers;
   VP9Worker *workers;
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index d24e4c7e6..071747e17 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -810,9 +810,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
           vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
           model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter],
                             &pf_dist[filter], &pf_var[filter], &pf_sse[filter]);
-          cost = RDCOST(x->rdmult, x->rddiv,
-                        vp9_get_switchable_rate(cpi, xd) + pf_rate[filter],
-                        pf_dist[filter]);
+          pf_rate[filter] += vp9_get_switchable_rate(cpi, xd);
+          cost = RDCOST(x->rdmult, x->rddiv, pf_rate[filter], pf_dist[filter]);
           pf_tx_size[filter] = mbmi->tx_size;
           if (cost < best_cost) {
             best_filter = filter;
@@ -849,6 +848,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
         model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
                           &var_y, &sse_y);
+        this_rdc.rate += cm->interp_filter == SWITCHABLE ?
+            vp9_get_switchable_rate(cpi, xd) : 0;
       }
 
       // chroma component rate-distortion cost modeling
@@ -1129,8 +1130,6 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
         int64_t b_best_rd = INT64_MAX;
         const int i = idy * 2 + idx;
         PREDICTION_MODE this_mode;
-        int b_rate = 0;
-        int64_t b_dist = 0;
         RD_COST this_rdc;
         unsigned int var_y, sse_y;
 
@@ -1158,6 +1157,7 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
                                       &b_mv[NEARMV]);
 
         for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+          int b_rate = 0;
           xd->mi[0].bmi[i].as_mv[0].as_int = b_mv[this_mode].as_int;
 
           if (this_mode == NEWMV) {
@@ -1219,6 +1219,9 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
                                          &x->pred_sse[ref_frame], NULL, 0, 0);
 
             xd->mi[0].bmi[i].as_mv[0].as_mv = tmp_mv;
+          } else {
+            b_rate += cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
+                                          [INTER_OFFSET(this_mode)];
           }
 
           vp9_build_inter_predictor(pd->pre[0].buf, pd->pre[0].stride,
@@ -1235,7 +1238,6 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
                             &var_y, &sse_y);
 
           this_rdc.rate += b_rate;
-          this_rdc.dist += b_dist;
           this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
                                    this_rdc.rate, this_rdc.dist);
           if (this_rdc.rdcost < b_best_rd) {
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 8d316d6bc..69751379f 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -1173,8 +1173,8 @@ void vp9_rc_set_frame_target(VP9_COMP *cpi, int target) {
   // Modify frame size target when down-scaling.
   if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC &&
       rc->frame_size_selector != UNSCALED)
-    rc->this_frame_target =
-        rc->this_frame_target * rate_thresh_mult[rc->frame_size_selector];
+    rc->this_frame_target = (int)(rc->this_frame_target
+        * rate_thresh_mult[rc->frame_size_selector]);
 
   // Target rate per SB64 (including partial SB64s.
   rc->sb64_target_rate = ((int64_t)rc->this_frame_target * 64 * 64) /
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index eaa0accdb..8722d9cec 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -421,4 +421,3 @@ void vp9_set_speed_features_framesize_dependent(struct VP9_COMP *cpi);
 #endif
 
 #endif  // VP9_ENCODER_VP9_SPEED_FEATURES_H_
-
diff --git a/vpxenc.c b/vpxenc.c
index f42e02cb0..46af6008a 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -398,6 +398,22 @@ static const arg_def_t frame_periodic_boost = ARG_DEF(
     NULL, "frame-boost", 1,
     "Enable frame periodic boost (0: off (default), 1: on)");
 
+static const struct arg_enum_list color_space_enum[] = {
+  { "unknown", VPX_CS_UNKNOWN },
+  { "bt601", VPX_CS_BT_601 },
+  { "bt709", VPX_CS_BT_709 },
+  { "smpte170", VPX_CS_SMPTE_170 },
+  { "smpte240", VPX_CS_SMPTE_240 },
+  { "bt2020", VPX_CS_BT_2020 },
+  { "reserved", VPX_CS_RESERVED },
+  { "sRGB", VPX_CS_SRGB },
+  { NULL, 0 }
+};
+
+static const arg_def_t input_color_space = ARG_DEF_ENUM(
+    NULL, "color-space", 1,
+    "The color space of input content:", color_space_enum);
+
 #if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
 static const struct arg_enum_list bitdepth_enum[] = {
   {"8",  VPX_BITS_8},
@@ -429,7 +445,7 @@ static const arg_def_t *vp9_args[] = {
   &tune_ssim, &cq_level, &max_intra_rate_pct, &max_inter_rate_pct,
   &gf_cbr_boost_pct, &lossless,
   &frame_parallel_decoding, &aq_mode, &frame_periodic_boost,
-  &noise_sens, &tune_content,
+  &noise_sens, &tune_content, &input_color_space,
 #if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
   &bitdeptharg, &inbitdeptharg,
 #endif