9 files changed, 54 insertions, 35 deletions
diff --git a/test/resize_test.cc b/test/resize_test.cc
index 21916ad51..bc91fe226 100644
--- a/test/resize_test.cc
+++ b/test/resize_test.cc
@@ -407,7 +407,7 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
   // Disable dropped frames.
   cfg_.rc_dropframe_thresh = 0;
   // Starting bitrate low.
-  cfg_.rc_target_bitrate = 90;
+  cfg_.rc_target_bitrate = 80;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
   unsigned int last_w = cfg_.g_w;
@@ -432,7 +432,7 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
   }
 
   // Verify that we get 2 resize events in this test.
-  ASSERT_EQ(2, resize_count) << "Resizing should occur twice.";
+  ASSERT_EQ(resize_count, 2) << "Resizing should occur twice.";
 }
 
 vpx_img_fmt_t CspForFrameNumber(int frame) {
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 5d59e83f7..d44a64a0b 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -191,9 +191,14 @@ INTRA_PRED_TEST(C, TestIntraPred4, vpx_dc_predictor_4x4_c,
 INTRA_PRED_TEST(SSE, TestIntraPred4, vpx_dc_predictor_4x4_sse,
                 vpx_dc_left_predictor_4x4_sse, vpx_dc_top_predictor_4x4_sse,
                 vpx_dc_128_predictor_4x4_sse, vpx_v_predictor_4x4_sse, NULL,
-                NULL, NULL, NULL, NULL, NULL, NULL, vpx_tm_predictor_4x4_sse)
+                NULL, NULL, NULL, NULL, NULL, NULL, NULL)
 #endif  // HAVE_SSE && CONFIG_USE_X86INC
 
+#if HAVE_SSE2 && CONFIG_USE_X86INC
+INTRA_PRED_TEST(SSE2, TestIntraPred4, NULL, NULL, NULL, NULL, NULL, NULL,
+                NULL, NULL, NULL, NULL, NULL, NULL, vpx_tm_predictor_4x4_sse2)
+#endif  // HAVE_SSE2 && CONFIG_USE_X86INC
+
 #if HAVE_SSSE3 && CONFIG_USE_X86INC
 INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL,
                 vpx_h_predictor_4x4_ssse3, vpx_d45_predictor_4x4_ssse3, NULL,
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 4e88819b1..f5da07ea0 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -243,7 +243,7 @@ static void swap_frame_buffers(VP9Decoder *pbi) {
     decrease_ref_count(old_idx, frame_bufs, pool);
 
     // Release the reference frame in reference map.
-    if ((mask & 1) && old_idx >= 0) {
+    if (mask & 1) {
       decrease_ref_count(old_idx, frame_bufs, pool);
     }
     cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
@@ -350,7 +350,7 @@ int vp9_receive_compressed_data(VP9Decoder *pbi,
         decrease_ref_count(old_idx, frame_bufs, pool);
 
         // Release the reference frame in reference map.
-        if ((mask & 1) && old_idx >= 0) {
+        if (mask & 1) {
           decrease_ref_count(old_idx, frame_bufs, pool);
         }
         ++ref_index;
diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index 4a5188f8f..afa400941 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -128,7 +128,7 @@ void vp9_decoder_remove(struct VP9Decoder *pbi);
 
 static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs,
                                       BufferPool *const pool) {
-  if (idx >= 0) {
+  if (idx >= 0 && frame_bufs[idx].ref_count > 0) {
     --frame_bufs[idx].ref_count;
     // A worker may only get a free framebuffer index when calling get_free_fb.
     // But the private buffer is not set up until finish decoding header.
diff --git a/vp9/encoder/vp9_noise_estimate.c b/vp9/encoder/vp9_noise_estimate.c
index ddf081c86..b41ffd0a3 100644
--- a/vp9/encoder/vp9_noise_estimate.c
+++ b/vp9/encoder/vp9_noise_estimate.c
@@ -36,6 +36,7 @@ void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne,
   } else if (width * height >= 1280 * 720) {
     ne->thresh = 130;
   }
+  ne->num_frames_estimate = 20;
 }
 
 int enable_noise_estimation(VP9_COMP *const cpi) {
@@ -88,10 +89,9 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
   // Estimate of noise level every frame_period frames.
   int frame_period = 10;
   int thresh_consec_zeromv = 8;
-  unsigned int thresh_sum_diff = 128;
+  unsigned int thresh_sum_diff = 100;
   unsigned int thresh_sum_spatial = (200 * 200) << 8;
   unsigned int thresh_spatial_var = (32 * 32) << 8;
-  int num_frames_estimate = 20;
   int min_blocks_estimate = cm->mi_rows * cm->mi_cols >> 7;
   // Estimate is between current source and last source.
   YV12_BUFFER_CONFIG *last_source = cpi->Last_Source;
@@ -135,6 +135,17 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
     const int uv_width_shift = y_width_shift >> 1;
     const int uv_height_shift = y_height_shift >> 1;
     int mi_row, mi_col;
+    int num_low_motion = 0;
+    int frame_low_motion = 1;
+    for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
+      for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
+        int bl_index = mi_row * cm->mi_cols + mi_col;
+        if (cr->consec_zero_mv[bl_index] > thresh_consec_zeromv)
+          num_low_motion++;
+      }
+    }
+    if (num_low_motion < ((3 * cm->mi_rows * cm->mi_cols) >> 3))
+      frame_low_motion = 0;
     for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
       for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
         // 16x16 blocks, 1/4 sample of frame.
@@ -154,7 +165,8 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
           const uint8_t vsource =
             src_v[uv_height_shift * src_uvstride + uv_width_shift];
           int is_skin = vp9_skin_pixel(ysource, usource, vsource);
-          if (cr->consec_zero_mv[bl_index] > thresh_consec_zeromv &&
+          if (frame_low_motion &&
+              cr->consec_zero_mv[bl_index] > thresh_consec_zeromv &&
               cr->consec_zero_mv[bl_index1] > thresh_consec_zeromv &&
               cr->consec_zero_mv[bl_index2] > thresh_consec_zeromv &&
               cr->consec_zero_mv[bl_index3] > thresh_consec_zeromv &&
@@ -202,10 +214,11 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
       // Normalize.
       avg_est = avg_est / num_samples;
       // Update noise estimate.
-      ne->value = (int)((3 * ne->value + avg_est) >> 2);
+      ne->value = (int)((15 * ne->value + avg_est) >> 4);
       ne->count++;
-      if (ne->count == num_frames_estimate) {
+      if (ne->count == ne->num_frames_estimate) {
         // Reset counter and check noise level condition.
+        ne->num_frames_estimate = 30;
         ne->count = 0;
         if (ne->value > (ne->thresh << 1))
           ne->level = kHigh;
diff --git a/vp9/encoder/vp9_noise_estimate.h b/vp9/encoder/vp9_noise_estimate.h
index b5dded9ef..0d22ef042 100644
--- a/vp9/encoder/vp9_noise_estimate.h
+++ b/vp9/encoder/vp9_noise_estimate.h
@@ -38,6 +38,7 @@ typedef struct noise_estimate {
   int count;
   int last_w;
   int last_h;
+  int num_frames_estimate;
 } NOISE_ESTIMATE;
 
 struct VP9_COMP;
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index 1dfc45cf6..13da155c7 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -278,7 +278,8 @@ void vp9_restore_layer_context(VP9_COMP *const cpi) {
   cpi->alt_ref_source = lc->alt_ref_source;
   // Reset the frames_since_key and frames_to_key counters to their values
   // before the layer restore. Keep these defined for the stream (not layer).
-  if (cpi->svc.number_temporal_layers > 1) {
+  if (cpi->svc.number_temporal_layers > 1 ||
+      cpi->svc.number_spatial_layers > 1) {
     cpi->rc.frames_since_key = old_frame_since_key;
     cpi->rc.frames_to_key = old_frame_to_key;
   }
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index b369b0548..03dcd4eeb 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -97,7 +97,7 @@ add_proto qw/void vpx_ve_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, cons
 specialize qw/vpx_ve_predictor_4x4/;
 
 add_proto qw/void vpx_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_tm_predictor_4x4 neon dspr2 msa/, "$sse_x86inc";
+specialize qw/vpx_tm_predictor_4x4 neon dspr2 msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_predictor_4x4 dspr2 msa neon/, "$sse_x86inc";
diff --git a/vpx_dsp/x86/intrapred_sse2.asm b/vpx_dsp/x86/intrapred_sse2.asm
index 22b573188..04b39a583 100644
--- a/vpx_dsp/x86/intrapred_sse2.asm
+++ b/vpx_dsp/x86/intrapred_sse2.asm
@@ -515,35 +515,34 @@ cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
   jnz .loop
   REP_RET
 
-INIT_MMX sse
-cglobal tm_predictor_4x4, 4, 4, 4, dst, stride, above, left
+INIT_XMM sse2
+cglobal tm_predictor_4x4, 4, 4, 5, dst, stride, above, left
   pxor                  m1, m1
+  movq                  m0, [aboveq-1]; [63:0] tl t1 t2 t3 t4 x x x
   punpcklbw             m0, m1
-  pshufw                m2, m2, 0x0
-  DEFINE_ARGS dst, stride, line, left
-  mov                lineq, -2
-  add                leftq, 4
-  psubw                 m0, m2
-.loop:
-  movd                  m2, [leftq+lineq*2]
-  movd                  m3, [leftq+lineq*2+1]
+  pshuflw               m2, m0, 0x0   ; [63:0] tl tl tl tl [word]
+  psrldq                m0, 2         ; shift out tl: [63:0] t1 t2 t3 t4 [word]
+  psubw                 m0, m2        ; [63:0] t1-tl t2-tl t3-tl t4-tl [word]
+  movd                  m2, [leftq]   ; [31:0] l1 l2 l3 l4 [byte]
   punpcklbw             m2, m1
-  punpcklbw             m3, m1
-  pshufw                m2, m2, 0x0
-  pshufw                m3, m3, 0x0
-  paddw                 m2, m0
+  pshuflw               m4, m2, 0x0   ; [63:0] l1 l1 l1 l1 [word]
+  pshuflw               m3, m2, 0x55  ; [63:0] l2 l2 l2 l2 [word]
+  paddw                 m4, m0        ; row 0: l1 + (t - tl) [word]
   paddw                 m3, m0
-  packuswb              m2, m2
+  packuswb              m4, m4        ; row 0: saturate words to unsigned bytes
   packuswb              m3, m3
-  movd      [dstq        ], m2
+  movd      [dstq        ], m4        ; store row 0 (4 bytes)
   movd      [dstq+strideq], m3
   lea                 dstq, [dstq+strideq*2]
-  inc                lineq
-  jnz .loop
-  REP_RET
+  pshuflw               m4, m2, 0xaa  ; [63:0] l3 l3 l3 l3 [word]
+  pshuflw               m3, m2, 0xff  ; [63:0] l4 l4 l4 l4 [word]
+  paddw                 m4, m0        ; row 2: l3 + (t - tl) [word]
+  paddw                 m3, m0        ; row 3: l4 + (t - tl) [word]
+  packuswb              m4, m4        ; row 2: saturate words to unsigned bytes
+  packuswb              m3, m3        ; row 3: saturate words to unsigned bytes
+  movd      [dstq        ], m4        ; store row 2 (4 bytes)
+  movd      [dstq+strideq], m3        ; store row 3 (4 bytes)
+  RET
 
 INIT_XMM sse2
 cglobal tm_predictor_8x8, 4, 4, 4, dst, stride, above, left