15 files changed, 139 insertions, 50 deletions
diff --git a/test/resize_test.cc b/test/resize_test.cc
index bc91fe226..c5f05f310 100644
--- a/test/resize_test.cc
+++ b/test/resize_test.cc
@@ -94,13 +94,53 @@ unsigned int ScaleForFrameNumber(unsigned int frame, unsigned int val) {
   if (frame < 10)
     return val;
   if (frame < 20)
-    return val / 2;
+    return val * 3 / 4;
   if (frame < 30)
-    return val * 2 / 3;
+    return val / 2;
   if (frame < 40)
-    return val / 4;
+    return val;
   if (frame < 50)
-    return val * 7 / 8;
+    return val * 3 / 4;
+  if (frame < 60)
+    return val / 2;
+  if (frame < 70)
+    return val * 3 / 4;
+  if (frame < 80)
+    return val;
+  if (frame < 90)
+    return val * 3 / 4;
+  if (frame < 100)
+    return val / 2;
+  if (frame < 110)
+    return val * 3 / 4;
+  if (frame < 120)
+    return val;
+  if (frame < 130)
+    return val * 3 / 4;
+  if (frame < 140)
+    return val / 2;
+  if (frame < 150)
+    return val * 3 / 4;
+  if (frame < 160)
+    return val;
+  if (frame < 170)
+    return val / 2;
+  if (frame < 180)
+    return val * 3 / 4;
+  if (frame < 190)
+    return val;
+  if (frame < 200)
+    return val * 3 / 4;
+  if (frame < 210)
+    return val / 2;
+  if (frame < 220)
+    return val * 3 / 4;
+  if (frame < 230)
+    return val;
+  if (frame < 240)
+    return val / 2;
+  if (frame < 250)
+    return val * 3 / 4;
   return val;
 }
 
@@ -108,7 +148,7 @@ class ResizingVideoSource : public ::libvpx_test::DummyVideoSource {
  public:
   ResizingVideoSource() {
     SetSize(kInitialWidth, kInitialHeight);
-    limit_ = 60;
+    limit_ = 300;
   }
 
   virtual ~ResizingVideoSource() {}
@@ -347,6 +387,8 @@ class ResizeRealtimeTest : public ::libvpx_test::EncoderTest,
 TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) {
   ResizingVideoSource video;
   DefaultConfig();
+  // Disable internal resize for this test.
+  cfg_.rc_resize_allowed = 0;
   change_bitrate_ = false;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index b1077cb21..c62da964a 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -1073,6 +1073,12 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP10_COMP *cpi, MACROBLOCK *mb,
   memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
   memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
 
+  // TODO(any): Add search of the tx_type to improve rd performance at the
+  // expense of speed.
+  mic->mbmi.tx_type = DCT_DCT;
+
+  // Later we can add search of the tx_type to improve results.
+  // For now just set it to DCT_DCT
   // Pick modes for each sub-block (of size 4x4, 4x8, or 8x4) in an 8x8 block.
   for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
     for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
@@ -3940,6 +3946,10 @@ void vp10_rd_pick_inter_mode_sub8x8(VP10_COMP *cpi,
       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
         filter_cache[i] = INT64_MAX;
 
+      // TODO(any): Add search of the tx_type to improve rd performance at the
+      // expense of speed.
+      mbmi->tx_type = DCT_DCT;
+
       if (cm->interp_filter != BILINEAR) {
         tmp_best_filter = EIGHTTAP;
         if (x->source_variance < sf->disable_filter_search_var_thresh) {
diff --git a/vp8/common/threading.h b/vp8/common/threading.h
index a433d03a0..c00e517a7 100644
--- a/vp8/common/threading.h
+++ b/vp8/common/threading.h
@@ -12,6 +12,7 @@
 #ifndef VP8_COMMON_THREADING_H_
 #define VP8_COMMON_THREADING_H_
 
+#include "./vpx_config.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -20,7 +21,7 @@ extern "C" {
 #if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD
 
 /* Thread management macros */
-#ifdef _WIN32
+#if defined(_WIN32) && !HAVE_PTHREAD_H
 /* Win32 */
 #include <process.h>
 #include <windows.h>
@@ -77,8 +78,8 @@ extern "C" {
 #define ts_key_create(ts_key, destructor) pthread_key_create (&(ts_key), destructor);
 #endif
 
-/* Syncrhronization macros: Win32 and Pthreads */
-#ifdef _WIN32
+/* Synchronization macros: Win32 and Pthreads */
+#if defined(_WIN32) && !HAVE_PTHREAD_H
 #define sem_t HANDLE
 #define pause(voidpara) __asm PAUSE
 #define sem_init(sem, sem_attr1, sem_init_value) (int)((*sem = CreateSemaphore(NULL,0,32768,NULL))==NULL)
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index a2e391841..b7cfdf6bf 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -608,4 +608,5 @@ void vp9_cyclic_refresh_reset_resize(VP9_COMP *const cpi) {
   memset(cr->consec_zero_mv, 0, cm->mi_rows * cm->mi_cols);
   cr->sb_index = 0;
   cpi->refresh_golden_frame = 1;
+  cpi->refresh_alt_ref_frame = 1;
 }
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 3eaa9deb8..147743e8d 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -65,8 +65,14 @@ struct macroblock {
   int skip_optimize;
   int q_index;
 
+  // The equivalent error at the current rdmult of one whole bit (not one
+  // bitcost unit).
   int errorperbit;
+  // The equivalend SAD error of one (whole) bit at the current quantizer
+  // for large blocks.
   int sadperbit16;
+  // The equivalend SAD error of one (whole) bit at the current quantizer
+  // for sub-8x8 blocks.
   int sadperbit4;
   int rddiv;
   int rdmult;
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index 99118f5df..e419cffd8 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -332,7 +332,7 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
   struct buf_2d src = mb->plane[0].src;
   int is_skin = 0;
 
-  if (bs <= BLOCK_16X16 && denoiser->denoising_level >= kDenLow) {
+  if (bs <= BLOCK_32X32 && denoiser->denoising_level >= kDenLow) {
     is_skin = vp9_compute_skin_block(mb->plane[0].src.buf,
                                      mb->plane[1].src.buf,
                                      mb->plane[2].src.buf,
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 8a46738cd..bd9813a77 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -1540,6 +1540,10 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
   }
   update_frame_size(cpi);
 
+  if ((last_w != cpi->oxcf.width || last_h != cpi->oxcf.height) &&
+      cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+    vp9_cyclic_refresh_reset_resize(cpi);
+
   if ((cpi->svc.number_temporal_layers > 1 &&
       cpi->oxcf.rc_mode == VPX_CBR) ||
       ((cpi->svc.number_temporal_layers > 1 ||
@@ -2971,8 +2975,19 @@ void vp9_scale_references(VP9_COMP *cpi) {
         }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
       } else {
-        const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
-        RefCntBuffer *const buf = &pool->frame_bufs[buf_idx];
+        int buf_idx;
+        RefCntBuffer *buf = NULL;
+        if (cpi->oxcf.pass == 0 && !cpi->use_svc) {
+          // Check for release of scaled reference.
+          buf_idx = cpi->scaled_ref_idx[ref_frame - 1];
+          buf = (buf_idx != INVALID_IDX) ? &pool->frame_bufs[buf_idx] : NULL;
+          if (buf != NULL) {
+            --buf->ref_count;
+            cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
+          }
+        }
+        buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+        buf = &pool->frame_bufs[buf_idx];
         buf->buf.y_crop_width = ref->y_crop_width;
         buf->buf.y_crop_height = ref->y_crop_height;
         cpi->scaled_ref_idx[ref_frame - 1] = buf_idx;
@@ -4141,7 +4156,7 @@ int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags,
   const int subsampling_x = sd->subsampling_x;
   const int subsampling_y = sd->subsampling_y;
 #if CONFIG_VP9_HIGHBITDEPTH
-  const int use_highbitdepth = sd->flags & YV12_FLAG_HIGHBITDEPTH;
+  const int use_highbitdepth = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
   check_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y);
 #else
   check_initial_width(cpi, subsampling_x, subsampling_y);
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 607941cfa..8b7825e7b 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -80,27 +80,29 @@ int vp9_mv_bit_cost(const MV *mv, const MV *ref,
   return ROUND_POWER_OF_TWO(mv_cost(&diff, mvjcost, mvcost) * weight, 7);
 }
 
-static int mv_err_cost(const MV *mv, const MV *ref,
-                       const int *mvjcost, int *mvcost[2],
-                       int error_per_bit) {
+#define PIXEL_TRANSFORM_ERROR_SCALE 4
+static int mv_err_cost(const MV *mv, const MV *ref, const int *mvjcost,
+                       int *mvcost[2], int error_per_bit) {
   if (mvcost) {
-    const MV diff = { mv->row - ref->row,
-                      mv->col - ref->col };
-    // TODO(aconverse): See if this shift needs to be tied to
-    // VP9_PROB_COST_SHIFT.
-    return ROUND_POWER_OF_TWO((unsigned)mv_cost(&diff, mvjcost, mvcost) *
-                                  error_per_bit, 13);
+    const MV diff = {mv->row - ref->row, mv->col - ref->col};
+    // This product sits at a 32-bit ceiling right now and any additional
+    // accuracy in either bit cost or error cost will cause it to overflow.
+    return ROUND_POWER_OF_TWO(
+        (unsigned)mv_cost(&diff, mvjcost, mvcost) * error_per_bit,
+        RDDIV_BITS + VP9_PROB_COST_SHIFT - RD_EPB_SHIFT +
+            PIXEL_TRANSFORM_ERROR_SCALE);
   }
   return 0;
 }
 
 static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref,
-                          int error_per_bit) {
+                          int sad_per_bit) {
   const MV diff = { mv->row - ref->row,
                     mv->col - ref->col };
-  // TODO(aconverse): See if this shift needs to be tied to VP9_PROB_COST_SHIFT.
-  return ROUND_POWER_OF_TWO((unsigned)mv_cost(&diff, x->nmvjointsadcost,
-                                      x->nmvsadcost) * error_per_bit, 8);
+  return ROUND_POWER_OF_TWO(
+      (unsigned)mv_cost(&diff, x->nmvjointsadcost, x->nmvsadcost) *
+          sad_per_bit,
+      VP9_PROB_COST_SHIFT);
 }
 
 void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride) {
@@ -152,12 +154,13 @@ void vp9_init3smotion_compensation(search_site_config *cfg, int stride) {
  * could reduce the area.
  */
 
-/* estimated cost of a motion vector (r,c) */
+/* Estimated (square) error cost of a motion vector (r,c). The 14 scale comes
+ * from the same math as in mv_err_cost(). */
 #define MVC(r, c)                                              \
     (mvcost ?                                                  \
      ((unsigned)(mvjcost[((r) != rr) * 2 + ((c) != rc)] +      \
        mvcost[0][((r) - rr)] + mvcost[1][((c) - rc)]) *        \
-      error_per_bit + 4096) >> 13 : 0)
+      error_per_bit + 8192) >> 14 : 0)
 
 
 // convert motion vector component to offset for sv[a]f calc
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 829066c9f..d861f8096 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -949,7 +949,8 @@ static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
     // TODO(jingning): Skip is signalled per prediciton block not per tx block.
     rate += vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), is_skippable);
   } else {
-    unsigned int var, sse;
+    unsigned int var = 0;
+    unsigned int sse = 0;
     model_rd_for_sb_uv(cpi, plane_bsize, x, xd, &rate, &dist, &var, &sse,
                        plane, plane);
   }
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 980a49f0a..91f877ed7 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -342,8 +342,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
   x->skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
   x->q_index = qindex;
 
-  x->errorperbit = rdmult >> 6;
-  x->errorperbit += (x->errorperbit == 0);
+  set_error_per_bit(x, rdmult);
 
   vp9_initialize_me_consts(cpi, x, x->q_index);
 }
diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index a8a939ee4..fc32d1911 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -41,7 +41,6 @@
 #include "vp9/encoder/vp9_tokenize.h"
 
 #define RD_THRESH_POW      1.25
-#define RD_MULT_EPB_RATIO  64
 
 // Factor to weigh the rate for switchable interp filters.
 #define SWITCHABLE_INTERP_RATE_FACTOR 1
@@ -279,8 +278,7 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) {
   rd->RDDIV = RDDIV_BITS;  // In bits (to multiply D by 128).
   rd->RDMULT = vp9_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q);
 
-  x->errorperbit = rd->RDMULT / RD_MULT_EPB_RATIO;
-  x->errorperbit += (x->errorperbit == 0);
+  set_error_per_bit(x, rd->RDMULT);
 
   x->select_tx_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
                        cm->frame_type != KEY_FRAME) ? 0 : 1;
diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h
index a92b14edf..9b8e2732c 100644
--- a/vp9/encoder/vp9_rd.h
+++ b/vp9/encoder/vp9_rd.h
@@ -24,6 +24,7 @@ extern "C" {
 #endif
 
 #define RDDIV_BITS          7
+#define RD_EPB_SHIFT        6
 
 #define RDCOST(RM, DM, R, D) \
   (ROUND_POWER_OF_TWO(((int64_t)R) * (RM), VP9_PROB_COST_SHIFT) + (D << DM))
@@ -168,6 +169,11 @@ static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
     return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX;
 }
 
+static INLINE void set_error_per_bit(MACROBLOCK *x, int rdmult) {
+  x->errorperbit = rdmult >> RD_EPB_SHIFT;
+  x->errorperbit += (x->errorperbit == 0);
+}
+
 void vp9_mv_pred(struct VP9_COMP *cpi, MACROBLOCK *x,
                  uint8_t *ref_y_buffer, int ref_y_stride,
                  int ref_frame, BLOCK_SIZE block_size);
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index f00a58ce2..1480ea418 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -3355,24 +3355,25 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
     }
 
     if (!disable_skip) {
-      vpx_prob skip_prob = vp9_get_skip_prob(cm, xd);
+      const vpx_prob skip_prob = vp9_get_skip_prob(cm, xd);
+      const int skip_cost0 = vp9_cost_bit(skip_prob, 0);
+      const int skip_cost1 = vp9_cost_bit(skip_prob, 1);
+
       if (skippable) {
         // Back out the coefficient coding costs
         rate2 -= (rate_y + rate_uv);
 
         // Cost the skip mb case
-        rate2 += vp9_cost_bit(skip_prob, 1);
+        rate2 += skip_cost1;
       } else if (ref_frame != INTRA_FRAME && !xd->lossless) {
         if (RDCOST(x->rdmult, x->rddiv,
-                   rate_y + rate_uv + vp9_cost_bit(skip_prob, 0),
-                   distortion2) <
-            RDCOST(x->rdmult, x->rddiv,
-                   vp9_cost_bit(skip_prob, 1), total_sse)) {
+                   rate_y + rate_uv + skip_cost0, distortion2) <
+            RDCOST(x->rdmult, x->rddiv, skip_cost1, total_sse)) {
           // Add in the cost of the no skip flag.
-          rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
+          rate2 += skip_cost0;
         } else {
           // FIXME(rbultje) make this work for splitmv also
-          rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+          rate2 += skip_cost1;
           distortion2 = total_sse;
           assert(total_sse >= 0);
           rate2 -= (rate_y + rate_uv);
@@ -3380,7 +3381,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
         }
       } else {
         // Add in the cost of the no skip flag.
-        rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
+        rate2 += skip_cost0;
       }
 
       // Calculate the final RD estimate for this mode.
@@ -4152,17 +4153,21 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
     }
 
     if (!disable_skip) {
+      const vpx_prob skip_prob = vp9_get_skip_prob(cm, xd);
+      const int skip_cost0 = vp9_cost_bit(skip_prob, 0);
+      const int skip_cost1 = vp9_cost_bit(skip_prob, 1);
+
       // Skip is never coded at the segment level for sub8x8 blocks and instead
       // always coded in the bitstream at the mode info level.
-
       if (ref_frame != INTRA_FRAME && !xd->lossless) {
-        if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
-            RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
+        if (RDCOST(x->rdmult, x->rddiv,
+                   rate_y + rate_uv + skip_cost0, distortion2) <
+            RDCOST(x->rdmult, x->rddiv, skip_cost1, total_sse)) {
           // Add in the cost of the no skip flag.
-          rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
+          rate2 += skip_cost0;
         } else {
           // FIXME(rbultje) make this work for splitmv also
-          rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+          rate2 += skip_cost1;
           distortion2 = total_sse;
           assert(total_sse >= 0);
           rate2 -= (rate_y + rate_uv);
@@ -4172,7 +4177,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
         }
       } else {
         // Add in the cost of the no skip flag.
-        rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
+        rate2 += skip_cost0;
       }
 
       // Calculate the final RD estimate for this mode.
diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
index b475f8db1..0bc417fc1 100644
--- a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
+++ b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
@@ -47,12 +47,12 @@ static INLINE int mv_cost(const int_mv mv,
 }
 
 static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref,
-                          int error_per_bit) {
+                          int sad_per_bit) {
   const int_mv diff = pack_int_mv(mv.as_mv.row - ref->row,
                                   mv.as_mv.col - ref->col);
   return ROUND_POWER_OF_TWO((unsigned)mv_cost(diff, x->nmvjointsadcost,
                                               x->nmvsadcost) *
-                                              error_per_bit, 8);
+                                              sad_per_bit, VP9_PROB_COST_SHIFT);
 }
 
 /*****************************************************************************
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 83a91e870..2930c23dd 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -119,7 +119,9 @@ endif
 
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3.c
+ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_frame_scale_ssse3.c
+endif
 
 ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_denoiser_sse2.c