13 files changed, 244 insertions, 190 deletions
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index 4f689c4bc..2a0c2987b 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -518,7 +518,6 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi)
 
     cpi->b_multi_threaded = 0;
     cpi->encoding_thread_count = 0;
-    cpi->b_lpf_running = 0;
 
     pthread_mutex_init(&cpi->mt_mutex, NULL);
 
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 93c457008..88c191e94 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -1531,15 +1531,6 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
     if (!oxcf)
         return;
 
-#if CONFIG_MULTITHREAD
-    /*  wait for the last picture loopfilter thread done */
-    if (cpi->b_lpf_running)
-    {
-        sem_wait(&cpi->h_event_end_lpf);
-        cpi->b_lpf_running = 0;
-    }
-#endif
-
     if (cm->version != oxcf->Version)
     {
         cm->version = oxcf->Version;
@@ -3638,15 +3629,6 @@ static void encode_frame_to_data_rate
     /* Clear down mmx registers to allow floating point in what follows */
     vp8_clear_system_state();
 
-#if CONFIG_MULTITHREAD
-    /*  wait for the last picture loopfilter thread done */
-    if (cpi->b_lpf_running)
-    {
-        sem_wait(&cpi->h_event_end_lpf);
-        cpi->b_lpf_running = 0;
-    }
-#endif
-
     if(cpi->force_next_frame_intra)
     {
         cm->frame_type = KEY_FRAME;  /* delayed intra frame */
@@ -4375,8 +4357,6 @@ static void encode_frame_to_data_rate
             vp8_setup_key_frame(cpi);
         }
 
-
-
 #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
         {
             if(cpi->oxcf.error_resilient_mode)
@@ -4842,7 +4822,6 @@ static void encode_frame_to_data_rate
     {
         /* start loopfilter in separate thread */
         sem_post(&cpi->h_event_start_lpf);
-        cpi->b_lpf_running = 1;
     }
     else
 #endif
@@ -4874,11 +4853,10 @@ static void encode_frame_to_data_rate
     vp8_pack_bitstream(cpi, dest, dest_end, size);
 
 #if CONFIG_MULTITHREAD
-    /* if PSNR packets are generated we have to wait for the lpf */
-    if (cpi->b_lpf_running && cpi->b_calculate_psnr)
+    /* wait for the loopfilter thread to finish */
+    if (cpi->b_multi_threaded)
     {
         sem_wait(&cpi->h_event_end_lpf);
-        cpi->b_lpf_running = 0;
     }
 #endif
 
@@ -5838,14 +5816,6 @@ int vp8_get_preview_raw_frame(VP8_COMP *cpi, YV12_BUFFER_CONFIG *dest, vp8_ppfla
     {
         int ret;
 
-#if CONFIG_MULTITHREAD
-        if(cpi->b_lpf_running)
-        {
-            sem_wait(&cpi->h_event_end_lpf);
-            cpi->b_lpf_running = 0;
-        }
-#endif
-
 #if CONFIG_POSTPROC
         cpi->common.show_frame_mi = cpi->common.mi;
         ret = vp8_post_proc_frame(&cpi->common, dest, flags);
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 2b2f7a0a9..6ede9b95a 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -536,7 +536,6 @@ typedef struct VP8_COMP
     int mt_sync_range;
     int b_multi_threaded;
     int encoding_thread_count;
-    int b_lpf_running;
 
     pthread_t *h_encoding_thread;
     pthread_t h_filter_thread;
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 0ea063291..51fbe541c 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -50,7 +50,8 @@ extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
 static const int skin_mean[5][2] =
     {{7463, 9614}, {6400, 10240}, {7040, 10240}, {8320, 9280}, {6800, 9614}};
 static const int skin_inv_cov[4] = {4107, 1663, 1663, 2157};  // q16
-static const int skin_threshold[2] = {1570636, 800000};       // q18
+static const int skin_threshold[6] = {1570636, 1400000, 800000, 800000, 800000,
+    800000};  // q18
 
 // Evaluates the Mahalanobis distance measure for the input CbCr values.
 static int evaluate_skin_color_difference(int cb, int cr, int idx) {
@@ -73,7 +74,7 @@ static int evaluate_skin_color_difference(int cb, int cr, int idx) {
 }
 
 // Checks if the input yCbCr values corresponds to skin color.
-static int is_skin_color(int y, int cb, int cr)
+static int is_skin_color(int y, int cb, int cr, int consec_zeromv)
 {
   if (y < 40 || y > 220)
   {
@@ -88,13 +89,31 @@ static int is_skin_color(int y, int cb, int cr)
     else
     {
       int i = 0;
-      for (; i < 5; i++)
-      {
-        if (evaluate_skin_color_difference(cb, cr, i) < skin_threshold[1])
-        {
-          return 1;
-        }
-      }
+      // No skin if the block has had zero motion for a long consecutive time.
+      if (consec_zeromv > 80)
+        return 0;
+      // Exit on grey.
+       if (cb == 128 && cr == 128)
+         return 0;
+       // Exit on very strong cb.
+       if (cb > 150 && cr < 110)
+         return 0;
+       for (; i < 5; i++) {
+         int skin_color_diff = evaluate_skin_color_difference(cb, cr, i);
+         if (skin_color_diff < skin_threshold[i + 1]) {
+            if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2))
+              return 0;
+            else if (consec_zeromv > 30 &&
+                     skin_color_diff > (skin_threshold[i + 1] >> 1))
+              return 0;
+            else
+             return 1;
+         }
+         // Exit if difference is much larger than the threshold.
+         if (skin_color_diff > (skin_threshold[i + 1] << 3)) {
+           return 0;
+         }
+       }
       return 0;
     }
   }
@@ -851,8 +870,10 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
         x->src.v_buffer[4 * x->src.uv_stride + 3] +
         x->src.v_buffer[4 * x->src.uv_stride + 4]) >> 2;
     x->is_skin = 0;
-    if (!cpi->oxcf.screen_content_mode)
-      x->is_skin = is_skin_color(y, cb, cr);
+    if (!cpi->oxcf.screen_content_mode) {
+      int block_index = mb_row * cpi->common.mb_cols + mb_col;
+      x->is_skin = is_skin_color(y, cb, cr, cpi->consec_zero_last[block_index]);
+    }
     }
 #if CONFIG_TEMPORAL_DENOISING
     if (cpi->oxcf.noise_sensitivity) {
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 7fc573333..32c72194d 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -189,54 +189,31 @@ static void inverse_transform_block_inter(MACROBLOCKD* xd, int plane,
                                           uint8_t *dst, int stride,
                                           int eob) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  if (eob > 0) {
-    tran_low_t *const dqcoeff = pd->dqcoeff;
+  tran_low_t *const dqcoeff = pd->dqcoeff;
+  assert(eob > 0);
 #if CONFIG_VP9_HIGHBITDEPTH
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      if (xd->lossless) {
-        vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
-      } else {
-        switch (tx_size) {
-          case TX_4X4:
-            vp9_highbd_idct4x4_add(dqcoeff, dst, stride, eob, xd->bd);
-            break;
-          case TX_8X8:
-            vp9_highbd_idct8x8_add(dqcoeff, dst, stride, eob, xd->bd);
-            break;
-          case TX_16X16:
-            vp9_highbd_idct16x16_add(dqcoeff, dst, stride, eob, xd->bd);
-            break;
-          case TX_32X32:
-            vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
-            break;
-          default:
-            assert(0 && "Invalid transform size");
-        }
-      }
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    if (xd->lossless) {
+      vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
     } else {
-      if (xd->lossless) {
-        vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
-      } else {
-        switch (tx_size) {
-          case TX_4X4:
-            vp9_idct4x4_add(dqcoeff, dst, stride, eob);
-            break;
-          case TX_8X8:
-            vp9_idct8x8_add(dqcoeff, dst, stride, eob);
-            break;
-          case TX_16X16:
-            vp9_idct16x16_add(dqcoeff, dst, stride, eob);
-            break;
-          case TX_32X32:
-            vp9_idct32x32_add(dqcoeff, dst, stride, eob);
-            break;
-          default:
-            assert(0 && "Invalid transform size");
-            return;
-        }
+      switch (tx_size) {
+        case TX_4X4:
+          vp9_highbd_idct4x4_add(dqcoeff, dst, stride, eob, xd->bd);
+          break;
+        case TX_8X8:
+          vp9_highbd_idct8x8_add(dqcoeff, dst, stride, eob, xd->bd);
+          break;
+        case TX_16X16:
+          vp9_highbd_idct16x16_add(dqcoeff, dst, stride, eob, xd->bd);
+          break;
+        case TX_32X32:
+          vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
+          break;
+        default:
+          assert(0 && "Invalid transform size");
       }
     }
-#else
+  } else {
     if (xd->lossless) {
       vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
     } else {
@@ -258,18 +235,40 @@ static void inverse_transform_block_inter(MACROBLOCKD* xd, int plane,
           return;
       }
     }
+  }
+#else
+  if (xd->lossless) {
+    vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
+  } else {
+    switch (tx_size) {
+      case TX_4X4:
+        vp9_idct4x4_add(dqcoeff, dst, stride, eob);
+        break;
+      case TX_8X8:
+        vp9_idct8x8_add(dqcoeff, dst, stride, eob);
+        break;
+      case TX_16X16:
+        vp9_idct16x16_add(dqcoeff, dst, stride, eob);
+        break;
+      case TX_32X32:
+        vp9_idct32x32_add(dqcoeff, dst, stride, eob);
+        break;
+      default:
+        assert(0 && "Invalid transform size");
+        return;
+    }
+  }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-    if (eob == 1) {
-      dqcoeff[0] = 0;
-    } else {
-      if (tx_size <= TX_16X16 && eob <= 10)
-        memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
-      else if (tx_size == TX_32X32 && eob <= 34)
-        memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
-      else
-        memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
-    }
+  if (eob == 1) {
+    dqcoeff[0] = 0;
+  } else {
+    if (tx_size <= TX_16X16 && eob <= 10)
+      memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
+    else if (tx_size == TX_32X32 && eob <= 34)
+      memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
+    else
+      memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
   }
 }
 
@@ -279,54 +278,31 @@ static void inverse_transform_block_intra(MACROBLOCKD* xd, int plane,
                                           uint8_t *dst, int stride,
                                           int eob) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  if (eob > 0) {
-    tran_low_t *const dqcoeff = pd->dqcoeff;
+  tran_low_t *const dqcoeff = pd->dqcoeff;
+  assert(eob > 0);
 #if CONFIG_VP9_HIGHBITDEPTH
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      if (xd->lossless) {
-        vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
-      } else {
-        switch (tx_size) {
-          case TX_4X4:
-            vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
-            break;
-          case TX_8X8:
-            vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
-            break;
-          case TX_16X16:
-            vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
-            break;
-          case TX_32X32:
-            vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
-            break;
-          default:
-            assert(0 && "Invalid transform size");
-        }
-      }
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    if (xd->lossless) {
+      vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
     } else {
-      if (xd->lossless) {
-        vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
-      } else {
-        switch (tx_size) {
-          case TX_4X4:
-            vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);
-            break;
-          case TX_8X8:
-            vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
-            break;
-          case TX_16X16:
-            vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
-            break;
-          case TX_32X32:
-            vp9_idct32x32_add(dqcoeff, dst, stride, eob);
-            break;
-          default:
-            assert(0 && "Invalid transform size");
-            return;
-        }
+      switch (tx_size) {
+        case TX_4X4:
+          vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+          break;
+        case TX_8X8:
+          vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+          break;
+        case TX_16X16:
+          vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+          break;
+        case TX_32X32:
+          vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
+          break;
+        default:
+          assert(0 && "Invalid transform size");
       }
     }
-#else
+  } else {
     if (xd->lossless) {
       vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
     } else {
@@ -348,18 +324,40 @@ static void inverse_transform_block_intra(MACROBLOCKD* xd, int plane,
           return;
       }
     }
+  }
+#else
+  if (xd->lossless) {
+    vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
+  } else {
+    switch (tx_size) {
+      case TX_4X4:
+        vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);
+        break;
+      case TX_8X8:
+        vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
+        break;
+      case TX_16X16:
+        vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
+        break;
+      case TX_32X32:
+        vp9_idct32x32_add(dqcoeff, dst, stride, eob);
+        break;
+      default:
+        assert(0 && "Invalid transform size");
+        return;
+    }
+  }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-    if (eob == 1) {
-      dqcoeff[0] = 0;
-    } else {
-      if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
-        memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
-      else if (tx_size == TX_32X32 && eob <= 34)
-        memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
-      else
-        memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
-    }
+  if (eob == 1) {
+    dqcoeff[0] = 0;
+  } else {
+    if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
+      memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
+    else if (tx_size == TX_32X32 && eob <= 34)
+      memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
+    else
+      memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
   }
 }
 
@@ -389,8 +387,10 @@ static void predict_and_reconstruct_intra_block(MACROBLOCKD *const xd,
         &vp9_default_scan_orders[tx_size] : &vp9_scan_orders[tx_size][tx_type];
     const int eob = vp9_decode_block_tokens(xd, plane, sc, col, row, tx_size,
                                             r, mi->segment_id);
-    inverse_transform_block_intra(xd, plane, tx_type, tx_size,
-                                  dst, pd->dst.stride, eob);
+    if (eob > 0) {
+      inverse_transform_block_intra(xd, plane, tx_type, tx_size,
+                                    dst, pd->dst.stride, eob);
+    }
   }
 }
 
@@ -402,9 +402,11 @@ static int reconstruct_inter_block(MACROBLOCKD *const xd, vpx_reader *r,
   const int eob = vp9_decode_block_tokens(xd, plane, sc, col, row, tx_size, r,
                                           mi->segment_id);
 
-  inverse_transform_block_inter(xd, plane, tx_size,
-                            &pd->dst.buf[4 * row * pd->dst.stride + 4 * col],
-                            pd->dst.stride, eob);
+  if (eob > 0) {
+    inverse_transform_block_inter(
+        xd, plane, tx_size, &pd->dst.buf[4 * row * pd->dst.stride + 4 * col],
+        pd->dst.stride, eob);
+  }
   return eob;
 }
 
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 3bd42ece6..596427c1e 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -463,14 +463,13 @@ static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
   }
 }
 
-static void dec_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, int_mv *mvlist,
-                                  int_mv *best_mv, int refmv_count) {
+static void dec_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *best_mv,
+                                  int refmv_count) {
   int i;
 
   // Make sure all the candidates are properly clamped etc
   for (i = 0; i < refmv_count; ++i) {
     lower_mv_precision(&mvlist[i].as_mv, allow_hp);
-    clamp_mv2(&mvlist[i].as_mv, xd);
     *best_mv = mvlist[i];
   }
 }
@@ -778,7 +777,7 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi,
                                        tmp_mvs, mi_row, mi_col, -1, 0,
                                        fpm_sync, (void *)pbi);
 
-        dec_find_best_ref_mvs(xd, allow_hp, tmp_mvs, &best_ref_mvs[ref],
+        dec_find_best_ref_mvs(allow_hp, tmp_mvs, &best_ref_mvs[ref],
                               refmv_count);
       }
     }
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index 9bc9f26c1..9eca2a229 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -316,13 +316,14 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
   return FILTER_BLOCK;
 }
 
-void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
+void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb,
                           int mi_row, int mi_col, BLOCK_SIZE bs,
                           PICK_MODE_CONTEXT *ctx,
                           VP9_DENOISER_DECISION *denoiser_decision) {
   int mv_col, mv_row;
   int motion_magnitude = 0;
   int zeromv_filter = 0;
+  VP9_DENOISER *denoiser = &cpi->denoiser;
   VP9_DENOISER_DECISION decision = COPY_BLOCK;
   YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME];
   YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y;
@@ -331,21 +332,53 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
                                           mi_row, mi_col);
   struct buf_2d src = mb->plane[0].src;
   int is_skin = 0;
+  mv_col = ctx->best_sse_mv.as_mv.col;
+  mv_row = ctx->best_sse_mv.as_mv.row;
+  motion_magnitude = mv_row * mv_row + mv_col * mv_col;
 
-  if (bs <= BLOCK_32X32 && denoiser->denoising_level >= kDenLow) {
+  if (cpi->use_skin_detection &&
+      bs <= BLOCK_32X32 &&
+      denoiser->denoising_level >= kDenLow) {
+    int motion_level = (motion_magnitude < 16) ? 0 : 1;
+    // If motion for current block is small/zero, compute consec_zeromv for
+    // skin detection (early exit in skin detection is done for large
+    // consec_zeromv when current block has small/zero motion).
+    int consec_zeromv = 0;
+    if (motion_level == 0) {
+      CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+      VP9_COMMON * const cm = &cpi->common;
+      int j, i;
+      // Loop through the 8x8 sub-blocks.
+      const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
+      const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
+      const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
+      const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
+      const int block_index = mi_row * cm->mi_cols + mi_col;
+      consec_zeromv = 100;
+      for (i = 0; i < ymis; i++) {
+        for (j = 0; j < xmis; j++) {
+          int bl_index = block_index + i * cm->mi_cols + j;
+          consec_zeromv = VPXMIN(cr->consec_zero_mv[bl_index], consec_zeromv);
+          // No need to keep checking 8x8 blocks if any of the sub-blocks
+          // has small consec_zeromv (since threshold for no_skin based on
+          // zero/small motion in skin detection is high, i.e., > 5).
+          if (consec_zeromv < 5) {
+            i = ymis;
+            j = xmis;
+          }
+        }
+      }
+    }
+    // TODO(marpan): Compute skin detection over sub-blocks.
     is_skin = vp9_compute_skin_block(mb->plane[0].src.buf,
                                      mb->plane[1].src.buf,
                                      mb->plane[2].src.buf,
                                      mb->plane[0].src.stride,
                                      mb->plane[1].src.stride,
                                      bs,
-                                     0,
-                                     0);
+                                     consec_zeromv,
+                                     motion_level);
   }
-
-  mv_col = ctx->best_sse_mv.as_mv.col;
-  mv_row = ctx->best_sse_mv.as_mv.row;
-  motion_magnitude = mv_row * mv_row + mv_col * mv_col;
   if (!is_skin &&
       denoiser->denoising_level == kDenHigh &&
       motion_magnitude < 16) {
diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h
index 9f13bd533..a0e201781 100644
--- a/vp9/encoder/vp9_denoiser.h
+++ b/vp9/encoder/vp9_denoiser.h
@@ -53,7 +53,7 @@ void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
                                     int refresh_last_frame,
                                     int resized);
 
-void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
+void vp9_denoiser_denoise(struct VP9_COMP *cpi, MACROBLOCK *mb,
                           int mi_row, int mi_col, BLOCK_SIZE bs,
                           PICK_MODE_CONTEXT *ctx ,
                           VP9_DENOISER_DECISION *denoiser_decision);
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 84593836c..40b332ac8 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -772,36 +772,55 @@ static int choose_partitioning(VP9_COMP *cpi,
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
 
     // Check if most of the superblock is skin content, and if so, force split
-    // to 32x32. Avoid checking superblocks on/near boundary and avoid low
-    // resolutons for now.
+    // to 32x32, and set x->sb_is_skin for use in mode selection.
+    // Avoid checking superblocks on/near boundary and avoid low resolutions.
     // Note superblock may still pick 64X64 if y_sad is very small
     // (i.e., y_sad < cpi->vbp_threshold_sad) below. For now leave this as is.
     x->sb_is_skin = 0;
 #if !CONFIG_VP9_HIGHBITDEPTH
     if (cpi->use_skin_detection && !low_res && (mi_col >= 8 &&
         mi_col + 8 < cm->mi_cols && mi_row >= 8 && mi_row + 8 < cm->mi_rows)) {
+      CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+      int bl_index1, bl_index2, bl_index3;
       int num_16x16_skin = 0;
       int num_16x16_nonskin = 0;
+      int is_skin = 0;
+      int consec_zeromv = 0;
       uint8_t *ysignal = x->plane[0].src.buf;
       uint8_t *usignal = x->plane[1].src.buf;
       uint8_t *vsignal = x->plane[2].src.buf;
       int spuv = x->plane[1].src.stride;
-      for (i = 0; i < 4; i++) {
-        for (j = 0; j < 4; j++) {
-          int is_skin = vp9_compute_skin_block(ysignal,
-                                               usignal,
-                                               vsignal,
-                                               sp,
-                                               spuv,
-                                               BLOCK_16X16,
-                                               0,
-                                               0);
+      const int block_index = mi_row * cm->mi_cols + mi_col;
+      const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
+      const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
+      const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
+      const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
+      // Loop through the 16x16 sub-blocks.
+      int j, i;
+      for (i = 0; i < ymis; i+=2) {
+        for (j = 0; j < xmis; j+=2) {
+          int bl_index = block_index + i * cm->mi_cols + j;
+          bl_index1 = bl_index + 1;
+          bl_index2 = bl_index + cm->mi_cols;
+          bl_index3 = bl_index2 + 1;
+          consec_zeromv = VPXMIN(cr->consec_zero_mv[bl_index],
+                                 VPXMIN(cr->consec_zero_mv[bl_index1],
+                                 VPXMIN(cr->consec_zero_mv[bl_index2],
+                                 cr->consec_zero_mv[bl_index3])));
+          is_skin = vp9_compute_skin_block(ysignal,
+                                           usignal,
+                                           vsignal,
+                                           sp,
+                                           spuv,
+                                           BLOCK_16X16,
+                                           consec_zeromv,
+                                           0);
           num_16x16_skin += is_skin;
           num_16x16_nonskin += (1 - is_skin);
           if (num_16x16_nonskin > 3) {
             // Exit loop if at least 4 of the 16x16 blocks are not skin.
-            i = 4;
-            j = 4;
+            i = ymis;
+            j = xmis;
           }
           ysignal += 16;
           usignal += 8;
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 3ea2ccd88..8b2e98549 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -1825,8 +1825,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   if (cpi->oxcf.noise_sensitivity > 0 &&
       cpi->resize_pending == 0) {
     VP9_DENOISER_DECISION decision = COPY_BLOCK;
-    vp9_denoiser_denoise(&cpi->denoiser, x, mi_row, mi_col,
-                         VPXMAX(BLOCK_8X8, bsize), ctx, &decision);
+    vp9_denoiser_denoise(cpi, x, mi_row, mi_col, VPXMAX(BLOCK_8X8, bsize),
+                         ctx, &decision);
     // If INTRA or GOLDEN reference was selected, re-evaluate ZEROMV on denoised
     // result. Only do this under noise conditions, and if rdcost of ZEROMV on
     // original source is not significantly higher than rdcost of best mode.
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index c014ca174..02be3c3f9 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -400,6 +400,8 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
     sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH;
     sf->tx_size_search_method = is_keyframe ? USE_LARGESTALL : USE_TX_8X8;
     sf->simple_model_rd_from_var = 1;
+    if (cpi->oxcf.rc_mode == VPX_VBR)
+      sf->mv.search_method = NSTEP;
 
     if (!is_keyframe) {
       int i;
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 624d5c9fc..5921636d3 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -158,7 +158,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
   RANGE_CHECK(cfg, g_w,                   1, 65535);  // 16 bits available
   RANGE_CHECK(cfg, g_h,                   1, 65535);  // 16 bits available
   RANGE_CHECK(cfg, g_timebase.den,        1, 1000000000);
-  RANGE_CHECK(cfg, g_timebase.num,        1, cfg->g_timebase.den);
+  RANGE_CHECK(cfg, g_timebase.num,        1, 1000000000);
   RANGE_CHECK_HI(cfg, g_profile,          3);
 
   RANGE_CHECK_HI(cfg, rc_max_quantizer,   63);
@@ -486,7 +486,16 @@ static vpx_codec_err_t set_encoder_config(
   oxcf->content = extra_cfg->content;
 
   oxcf->tile_columns = extra_cfg->tile_columns;
-  oxcf->tile_rows    = extra_cfg->tile_rows;
+
+  // TODO(yunqing): The dependencies between row tiles cause errors in multi-
+  // threaded encoding. For now, tile_rows is forced to 0 in this case.
+  // A further fix could add synchronization after each tile row is encoded,
+  // but that would hurt multi-threaded encoder performance. So, it is
+  // recommended to use tile-rows=0 when encoding with threads > 1.
+  if (oxcf->max_threads > 1 && oxcf->tile_columns > 0)
+    oxcf->tile_rows  = 0;
+  else
+    oxcf->tile_rows  = extra_cfg->tile_rows;
 
   oxcf->error_resilient_mode         = cfg->g_error_resilient;
   oxcf->frame_parallel_decoding_mode = extra_cfg->frame_parallel_decoding_mode;
diff --git a/vpxenc.c b/vpxenc.c
index f24b1805b..50e7c7fc7 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -380,7 +380,8 @@ static const arg_def_t cpu_used_vp9 = ARG_DEF(
 static const arg_def_t tile_cols = ARG_DEF(
     NULL, "tile-columns", 1, "Number of tile columns to use, log2");
 static const arg_def_t tile_rows = ARG_DEF(
-    NULL, "tile-rows", 1, "Number of tile rows to use, log2");
+    NULL, "tile-rows", 1,
+    "Number of tile rows to use, log2 (set to 0 while threads > 1)");
 static const arg_def_t lossless = ARG_DEF(
     NULL, "lossless", 1, "Lossless mode (0: false (default), 1: true)");
 static const arg_def_t frame_parallel_decoding = ARG_DEF(