3 files changed, 121 insertions, 3 deletions
diff --git a/test/yuv_temporal_filter_test.cc b/test/yuv_temporal_filter_test.cc
index 2febe9e90..dcb5990c5 100644
--- a/test/yuv_temporal_filter_test.cc
+++ b/test/yuv_temporal_filter_test.cc
@@ -480,6 +480,123 @@ TEST_P(YUVTemporalFilterTest, Use16x16) {
   }
 }
 
+TEST_P(YUVTemporalFilterTest, SaturationTest) {
+  const int width = 32, height = 32;
+  const int use_32x32 = 1;
+
+  Buffer<uint8_t> y_src = Buffer<uint8_t>(width, height, 8);
+  Buffer<uint8_t> y_pre = Buffer<uint8_t>(width, height, 0);
+  Buffer<uint16_t> y_count_ref = Buffer<uint16_t>(width, height, 0);
+  Buffer<uint32_t> y_accum_ref = Buffer<uint32_t>(width, height, 0);
+  Buffer<uint16_t> y_count_tst = Buffer<uint16_t>(width, height, 0);
+  Buffer<uint32_t> y_accum_tst = Buffer<uint32_t>(width, height, 0);
+  ASSERT_TRUE(y_src.Init());
+  ASSERT_TRUE(y_pre.Init());
+  ASSERT_TRUE(y_count_ref.Init());
+  ASSERT_TRUE(y_accum_ref.Init());
+  ASSERT_TRUE(y_count_tst.Init());
+  ASSERT_TRUE(y_accum_tst.Init());
+
+  for (int ss_x = 0; ss_x <= 1; ss_x++) {
+    for (int ss_y = 0; ss_y <= 1; ss_y++) {
+      for (int filter_strength = 0; filter_strength <= 6;
+           filter_strength += 2) {
+        for (int filter_weight = 0; filter_weight <= 2; filter_weight++) {
+          const int uv_width = width >> ss_x, uv_height = height >> ss_y;
+          Buffer<uint8_t> u_src = Buffer<uint8_t>(uv_width, uv_height, 8);
+          Buffer<uint8_t> u_pre = Buffer<uint8_t>(uv_width, uv_height, 0);
+          Buffer<uint16_t> u_count_ref =
+              Buffer<uint16_t>(uv_width, uv_height, 0);
+          Buffer<uint32_t> u_accum_ref =
+              Buffer<uint32_t>(uv_width, uv_height, 0);
+          Buffer<uint16_t> u_count_tst =
+              Buffer<uint16_t>(uv_width, uv_height, 0);
+          Buffer<uint32_t> u_accum_tst =
+              Buffer<uint32_t>(uv_width, uv_height, 0);
+          ASSERT_TRUE(u_src.Init());
+          ASSERT_TRUE(u_pre.Init());
+          ASSERT_TRUE(u_count_ref.Init());
+          ASSERT_TRUE(u_accum_ref.Init());
+          ASSERT_TRUE(u_count_tst.Init());
+          ASSERT_TRUE(u_accum_tst.Init());
+          Buffer<uint8_t> v_src = Buffer<uint8_t>(uv_width, uv_height, 8);
+          Buffer<uint8_t> v_pre = Buffer<uint8_t>(uv_width, uv_height, 0);
+          Buffer<uint16_t> v_count_ref =
+              Buffer<uint16_t>(uv_width, uv_height, 0);
+          Buffer<uint32_t> v_accum_ref =
+              Buffer<uint32_t>(uv_width, uv_height, 0);
+          Buffer<uint16_t> v_count_tst =
+              Buffer<uint16_t>(uv_width, uv_height, 0);
+          Buffer<uint32_t> v_accum_tst =
+              Buffer<uint32_t>(uv_width, uv_height, 0);
+          ASSERT_TRUE(v_src.Init());
+          ASSERT_TRUE(v_pre.Init());
+          ASSERT_TRUE(v_count_ref.Init());
+          ASSERT_TRUE(v_accum_ref.Init());
+          ASSERT_TRUE(v_count_tst.Init());
+          ASSERT_TRUE(v_accum_tst.Init());
+
+          // Use the maximum possible difference between the buffers (255 vs.
+          // 0) to exercise the saturation handling in the filter.
+          y_src.Set(255);
+          y_pre.Set(0);
+          u_src.Set(255);
+          u_pre.Set(0);
+          v_src.Set(255);
+          v_pre.Set(0);
+
+          y_accum_ref.Set(rnd_.Rand8());
+          y_accum_tst.CopyFrom(y_accum_ref);
+          y_count_ref.Set(rnd_.Rand8());
+          y_count_tst.CopyFrom(y_count_ref);
+          u_accum_ref.Set(rnd_.Rand8());
+          u_accum_tst.CopyFrom(u_accum_ref);
+          u_count_ref.Set(rnd_.Rand8());
+          u_count_tst.CopyFrom(u_count_ref);
+          v_accum_ref.Set(rnd_.Rand8());
+          v_accum_tst.CopyFrom(v_accum_ref);
+          v_count_ref.Set(rnd_.Rand8());
+          v_count_tst.CopyFrom(v_count_ref);
+
+          ApplyReferenceFilter(y_src, y_pre, u_src, v_src, u_pre, v_pre, width,
+                               height, ss_x, ss_y, filter_strength,
+                               &filter_weight, use_32x32, &y_accum_ref,
+                               &y_count_ref, &u_accum_ref, &u_count_ref,
+                               &v_accum_ref, &v_count_ref);
+          ASM_REGISTER_STATE_CHECK(filter_func_(
+              y_src.TopLeftPixel(), y_src.stride(), y_pre.TopLeftPixel(),
+              y_pre.stride(), u_src.TopLeftPixel(), v_src.TopLeftPixel(),
+              u_src.stride(), u_pre.TopLeftPixel(), v_pre.TopLeftPixel(),
+              u_pre.stride(), width, height, ss_x, ss_y, filter_strength,
+              &filter_weight, use_32x32, y_accum_tst.TopLeftPixel(),
+              y_count_tst.TopLeftPixel(), u_accum_tst.TopLeftPixel(),
+              u_count_tst.TopLeftPixel(), v_accum_tst.TopLeftPixel(),
+              v_count_tst.TopLeftPixel()));
+
+          EXPECT_TRUE(y_accum_tst.CheckValues(y_accum_ref));
+          EXPECT_TRUE(y_count_tst.CheckValues(y_count_ref));
+          EXPECT_TRUE(u_accum_tst.CheckValues(u_accum_ref));
+          EXPECT_TRUE(u_count_tst.CheckValues(u_count_ref));
+          EXPECT_TRUE(v_accum_tst.CheckValues(v_accum_ref));
+          EXPECT_TRUE(v_count_tst.CheckValues(v_count_ref));
+
+          if (HasFailure()) {
+            printf("SS_X: %d, SS_Y: %d, Weight: %d, Strength: %d\n", ss_x, ss_y,
+                   filter_weight, filter_strength);
+            y_accum_tst.PrintDifference(y_accum_ref);
+            y_count_tst.PrintDifference(y_count_ref);
+            u_accum_tst.PrintDifference(u_accum_ref);
+            u_count_tst.PrintDifference(u_count_ref);
+            v_accum_tst.PrintDifference(v_accum_ref);
+            v_count_tst.PrintDifference(v_count_ref);
+            return;
+          }
+        }
+      }
+    }
+  }
+}
+
 TEST_P(YUVTemporalFilterTest, DISABLED_Speed) {
   const int width = 32, height = 32;
   Buffer<uint8_t> y_src = Buffer<uint8_t>(width, height, 8);
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 04b589ac3..e10946aca 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -192,7 +192,8 @@ static INLINE int mod_index(int sum_dist, int index, int rounding, int strength,
   assert(index >= 0 && index <= 13);
   assert(index_mult[index] != 0);
 
-  mod = (clamp(sum_dist, 0, UINT16_MAX) * index_mult[index]) >> 16;
+  mod =
+      ((unsigned int)clamp(sum_dist, 0, UINT16_MAX) * index_mult[index]) >> 16;
   mod += rounding;
   mod >>= strength;
 
diff --git a/vp9/encoder/x86/temporal_filter_sse4.c b/vp9/encoder/x86/temporal_filter_sse4.c
index 18c4f02d5..b560e2218 100644
--- a/vp9/encoder/x86/temporal_filter_sse4.c
+++ b/vp9/encoder/x86/temporal_filter_sse4.c
@@ -967,8 +967,8 @@ static void vp9_apply_temporal_filter_chroma_8(
   v_sum_row_2 = v_sum_row_3;
 
   // Add chroma values
-  u_sum_row = _mm_adds_epu8(u_sum_row_1, u_sum_row_2);
-  v_sum_row = _mm_adds_epu8(v_sum_row_1, v_sum_row_2);
+  u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2);
+  v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2);
 
   // Add luma values
   add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);