diff options
-rw-r--r-- | test/temporal_filter_test.cc | 280 | ||||
-rw-r--r-- | test/test.mk | 1 | ||||
-rw-r--r-- | test/yuv_temporal_filter_test.cc | 887 | ||||
-rw-r--r-- | vp9/common/vp9_rtcd_defs.pl | 3 | ||||
-rw-r--r-- | vp9/encoder/vp9_temporal_filter.c | 130 | ||||
-rw-r--r-- | vp9/encoder/x86/temporal_filter_sse4.c | 207 |
6 files changed, 430 insertions, 1078 deletions
diff --git a/test/temporal_filter_test.cc b/test/temporal_filter_test.cc deleted file mode 100644 index d14a4826e..000000000 --- a/test/temporal_filter_test.cc +++ /dev/null @@ -1,280 +0,0 @@ -/* - * Copyright (c) 2016 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <limits> - -#include "third_party/googletest/src/include/gtest/gtest.h" - -#include "./vp9_rtcd.h" -#include "test/acm_random.h" -#include "test/buffer.h" -#include "test/register_state_check.h" -#include "vpx_ports/vpx_timer.h" - -namespace { - -using ::libvpx_test::ACMRandom; -using ::libvpx_test::Buffer; - -typedef void (*TemporalFilterFunc)(const uint8_t *a, unsigned int stride, - const uint8_t *b, unsigned int w, - unsigned int h, int filter_strength, - int filter_weight, unsigned int *accumulator, - uint16_t *count); - -// Calculate the difference between 'a' and 'b', sum in blocks of 9, and apply -// filter based on strength and weight. Store the resulting filter amount in -// 'count' and apply it to 'b' and store it in 'accumulator'. -void reference_filter(const Buffer<uint8_t> &a, const Buffer<uint8_t> &b, int w, - int h, int filter_strength, int filter_weight, - Buffer<unsigned int> *accumulator, - Buffer<uint16_t> *count) { - Buffer<int> diff_sq = Buffer<int>(w, h, 0); - ASSERT_TRUE(diff_sq.Init()); - diff_sq.Set(0); - - int rounding = 0; - if (filter_strength > 0) { - rounding = 1 << (filter_strength - 1); - } - - ASSERT_TRUE(a.TopLeftPixel() != NULL); - ASSERT_TRUE(b.TopLeftPixel() != NULL); - ASSERT_TRUE(diff_sq.TopLeftPixel() != NULL); - // Calculate all the differences. Avoids re-calculating a bunch of extra - // values. - for (int height = 0; height < h; ++height) { - for (int width = 0; width < w; ++width) { - int diff = a.TopLeftPixel()[height * a.stride() + width] - - b.TopLeftPixel()[height * b.stride() + width]; - diff_sq.TopLeftPixel()[height * diff_sq.stride() + width] = diff * diff; - } - } - - // For any given point, sum the neighboring values and calculate the - // modifier. - for (int height = 0; height < h; ++height) { - for (int width = 0; width < w; ++width) { - // Determine how many values are being summed. - int summed_values = 9; - - if (height == 0 || height == (h - 1)) { - summed_values -= 3; - } - - if (width == 0 || width == (w - 1)) { - if (summed_values == 6) { // corner - summed_values -= 2; - } else { - summed_values -= 3; - } - } - - // Sum the diff_sq of the surrounding values. - int sum = 0; - for (int idy = -1; idy <= 1; ++idy) { - for (int idx = -1; idx <= 1; ++idx) { - const int y = height + idy; - const int x = width + idx; - - // If inside the border. - if (y >= 0 && y < h && x >= 0 && x < w) { - sum += diff_sq.TopLeftPixel()[y * diff_sq.stride() + x]; - } - } - } - - sum *= 3; - sum /= summed_values; - sum += rounding; - sum >>= filter_strength; - - // Clamp the value and invert it. - if (sum > 16) sum = 16; - sum = 16 - sum; - - sum *= filter_weight; - - count->TopLeftPixel()[height * count->stride() + width] += sum; - accumulator->TopLeftPixel()[height * accumulator->stride() + width] += - sum * b.TopLeftPixel()[height * b.stride() + width]; - } - } -} - -class TemporalFilterTest : public ::testing::TestWithParam<TemporalFilterFunc> { - public: - virtual void SetUp() { - filter_func_ = GetParam(); - rnd_.Reset(ACMRandom::DeterministicSeed()); - } - - protected: - TemporalFilterFunc filter_func_; - ACMRandom rnd_; -}; - -TEST_P(TemporalFilterTest, SizeCombinations) { - // Depending on subsampling this function may be called with values of 8 or 16 - // for width and height, in any combination. - Buffer<uint8_t> a = Buffer<uint8_t>(16, 16, 8); - ASSERT_TRUE(a.Init()); - - const int filter_weight = 2; - const int filter_strength = 6; - - for (int width = 8; width <= 16; width += 8) { - for (int height = 8; height <= 16; height += 8) { - // The second buffer must not have any border. - Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0); - ASSERT_TRUE(b.Init()); - Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0); - ASSERT_TRUE(accum_ref.Init()); - Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0); - ASSERT_TRUE(accum_chk.Init()); - Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0); - ASSERT_TRUE(count_ref.Init()); - Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0); - ASSERT_TRUE(count_chk.Init()); - - // The difference between the buffers must be small to pass the threshold - // to apply the filter. - a.Set(&rnd_, 0, 7); - b.Set(&rnd_, 0, 7); - - accum_ref.Set(rnd_.Rand8()); - accum_chk.CopyFrom(accum_ref); - count_ref.Set(rnd_.Rand8()); - count_chk.CopyFrom(count_ref); - reference_filter(a, b, width, height, filter_strength, filter_weight, - &accum_ref, &count_ref); - ASM_REGISTER_STATE_CHECK( - filter_func_(a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width, - height, filter_strength, filter_weight, - accum_chk.TopLeftPixel(), count_chk.TopLeftPixel())); - EXPECT_TRUE(accum_chk.CheckValues(accum_ref)); - EXPECT_TRUE(count_chk.CheckValues(count_ref)); - if (HasFailure()) { - printf("Width: %d Height: %d\n", width, height); - count_chk.PrintDifference(count_ref); - accum_chk.PrintDifference(accum_ref); - return; - } - } - } -} - -TEST_P(TemporalFilterTest, CompareReferenceRandom) { - for (int width = 8; width <= 16; width += 8) { - for (int height = 8; height <= 16; height += 8) { - Buffer<uint8_t> a = Buffer<uint8_t>(width, height, 8); - ASSERT_TRUE(a.Init()); - // The second buffer must not have any border. - Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0); - ASSERT_TRUE(b.Init()); - Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0); - ASSERT_TRUE(accum_ref.Init()); - Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0); - ASSERT_TRUE(accum_chk.Init()); - Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0); - ASSERT_TRUE(count_ref.Init()); - Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0); - ASSERT_TRUE(count_chk.Init()); - - for (int filter_strength = 0; filter_strength <= 6; ++filter_strength) { - for (int filter_weight = 0; filter_weight <= 2; ++filter_weight) { - for (int repeat = 0; repeat < 100; ++repeat) { - if (repeat < 50) { - a.Set(&rnd_, 0, 7); - b.Set(&rnd_, 0, 7); - } else { - // Check large (but close) values as well. - a.Set(&rnd_, std::numeric_limits<uint8_t>::max() - 7, - std::numeric_limits<uint8_t>::max()); - b.Set(&rnd_, std::numeric_limits<uint8_t>::max() - 7, - std::numeric_limits<uint8_t>::max()); - } - - accum_ref.Set(rnd_.Rand8()); - accum_chk.CopyFrom(accum_ref); - count_ref.Set(rnd_.Rand8()); - count_chk.CopyFrom(count_ref); - reference_filter(a, b, width, height, filter_strength, - filter_weight, &accum_ref, &count_ref); - ASM_REGISTER_STATE_CHECK(filter_func_( - a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width, height, - filter_strength, filter_weight, accum_chk.TopLeftPixel(), - count_chk.TopLeftPixel())); - EXPECT_TRUE(accum_chk.CheckValues(accum_ref)); - EXPECT_TRUE(count_chk.CheckValues(count_ref)); - if (HasFailure()) { - printf("Weight: %d Strength: %d\n", filter_weight, - filter_strength); - count_chk.PrintDifference(count_ref); - accum_chk.PrintDifference(accum_ref); - return; - } - } - } - } - } - } -} - -TEST_P(TemporalFilterTest, DISABLED_Speed) { - Buffer<uint8_t> a = Buffer<uint8_t>(16, 16, 8); - ASSERT_TRUE(a.Init()); - - const int filter_weight = 2; - const int filter_strength = 6; - - for (int width = 8; width <= 16; width += 8) { - for (int height = 8; height <= 16; height += 8) { - // The second buffer must not have any border. - Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0); - ASSERT_TRUE(b.Init()); - Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0); - ASSERT_TRUE(accum_ref.Init()); - Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0); - ASSERT_TRUE(accum_chk.Init()); - Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0); - ASSERT_TRUE(count_ref.Init()); - Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0); - ASSERT_TRUE(count_chk.Init()); - - a.Set(&rnd_, 0, 7); - b.Set(&rnd_, 0, 7); - - accum_chk.Set(0); - count_chk.Set(0); - - vpx_usec_timer timer; - vpx_usec_timer_start(&timer); - for (int i = 0; i < 10000; ++i) { - filter_func_(a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width, - height, filter_strength, filter_weight, - accum_chk.TopLeftPixel(), count_chk.TopLeftPixel()); - } - vpx_usec_timer_mark(&timer); - const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer)); - printf("Temporal filter %dx%d time: %5d us\n", width, height, - elapsed_time); - } - } -} - -INSTANTIATE_TEST_CASE_P(C, TemporalFilterTest, - ::testing::Values(&vp9_temporal_filter_apply_c)); - -#if HAVE_SSE4_1 -INSTANTIATE_TEST_CASE_P(SSE4_1, TemporalFilterTest, - ::testing::Values(&vp9_temporal_filter_apply_sse4_1)); -#endif // HAVE_SSE4_1 -} // namespace diff --git a/test/test.mk b/test/test.mk index 2b7636185..61eb6060f 100644 --- a/test/test.mk +++ b/test/test.mk @@ -170,7 +170,6 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += hadamard_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += minmax_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_scale_test.cc ifneq ($(CONFIG_REALTIME_ONLY),yes) -LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += temporal_filter_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += yuv_temporal_filter_test.cc endif LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc diff --git a/test/yuv_temporal_filter_test.cc b/test/yuv_temporal_filter_test.cc index 8d68e4abe..e8d19672d 100644 --- a/test/yuv_temporal_filter_test.cc +++ b/test/yuv_temporal_filter_test.cc @@ -30,6 +30,14 @@ typedef void (*YUVTemporalFilterFunc)( uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +struct TemporalFilterWithBd { + TemporalFilterWithBd(YUVTemporalFilterFunc func, int bitdepth) + : temporal_filter(func), bd(bitdepth) {} + + YUVTemporalFilterFunc temporal_filter; + int bd; +}; + int GetFilterWeight(unsigned int row, unsigned int col, unsigned int block_height, unsigned int block_width, const int *const blk_fw, int use_32x32) { @@ -40,8 +48,24 @@ int GetFilterWeight(unsigned int row, unsigned int col, return blk_fw[2 * (row >= block_height / 2) + (col >= block_width / 2)]; } +template <typename PixelType> int GetModIndex(int sum_dist, int index, int rounding, int strength, int filter_weight) { + int mod = sum_dist * 3 / index; + mod += rounding; + mod >>= strength; + + mod = VPXMIN(16, mod); + + mod = 16 - mod; + mod *= filter_weight; + + return mod; +} + +template <> +int GetModIndex<uint8_t>(int sum_dist, int index, int rounding, int strength, + int filter_weight) { unsigned int index_mult[14] = { 0, 0, 0, 0, 49152, 39322, 32768, 28087, 24576, 21846, 19661, 17874, 0, 15124 }; @@ -61,22 +85,33 @@ int GetModIndex(int sum_dist, int index, int rounding, int strength, return mod; } +template <typename PixelType> void ApplyReferenceFilter( - const Buffer<uint8_t> &y_src, const Buffer<uint8_t> &y_pre, - const Buffer<uint8_t> &u_src, const Buffer<uint8_t> &v_src, - const Buffer<uint8_t> &u_pre, const Buffer<uint8_t> &v_pre, + const Buffer<PixelType> &y_src, const Buffer<PixelType> &y_pre, + const Buffer<PixelType> &u_src, const Buffer<PixelType> &v_src, + const Buffer<PixelType> &u_pre, const Buffer<PixelType> &v_pre, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, - Buffer<uint32_t> *y_accumulator, Buffer<uint16_t> *y_count, - Buffer<uint32_t> *u_accumulator, Buffer<uint16_t> *u_count, - Buffer<uint32_t> *v_accumulator, Buffer<uint16_t> *v_count) { - // blk_fw means block_filter_weight - // Set up buffer to store squared_diffs + Buffer<uint32_t> *y_accumulator, Buffer<uint16_t> *y_counter, + Buffer<uint32_t> *u_accumulator, Buffer<uint16_t> *u_counter, + Buffer<uint32_t> *v_accumulator, Buffer<uint16_t> *v_counter) { + const PixelType *y_src_ptr = y_src.TopLeftPixel(); + const PixelType *y_pre_ptr = y_pre.TopLeftPixel(); + const PixelType *u_src_ptr = u_src.TopLeftPixel(); + const PixelType *u_pre_ptr = u_pre.TopLeftPixel(); + const PixelType *v_src_ptr = v_src.TopLeftPixel(); + const PixelType *v_pre_ptr = v_pre.TopLeftPixel(); + + const int uv_block_width = block_width >> ss_x, + uv_block_height = block_height >> ss_y; + const int y_src_stride = y_src.stride(), y_pre_stride = y_pre.stride(); + const int uv_src_stride = u_src.stride(), uv_pre_stride = u_pre.stride(); + const int y_diff_stride = block_width, uv_diff_stride = uv_block_width; + Buffer<int> y_dif = Buffer<int>(block_width, block_height, 0); - const int uv_block_width = block_width >> ss_x; - const int uv_block_height = block_height >> ss_y; Buffer<int> u_dif = Buffer<int>(uv_block_width, uv_block_height, 0); Buffer<int> v_dif = Buffer<int>(uv_block_width, uv_block_height, 0); + ASSERT_TRUE(y_dif.Init()); ASSERT_TRUE(u_dif.Init()); ASSERT_TRUE(v_dif.Init()); @@ -84,55 +119,56 @@ void ApplyReferenceFilter( u_dif.Set(0); v_dif.Set(0); - // How many bits do we want to round - ASSERT_GE(strength, 0); - ASSERT_LE(strength, 6); - int rounding = 0; - if (strength > 0) { - rounding = 1 << (strength - 1); - } + int *y_diff_ptr = y_dif.TopLeftPixel(); + int *u_diff_ptr = u_dif.TopLeftPixel(); + int *v_diff_ptr = v_dif.TopLeftPixel(); - // Check that the buffers are valid - ASSERT_TRUE(y_src.TopLeftPixel() != NULL); - ASSERT_TRUE(y_pre.TopLeftPixel() != NULL); - ASSERT_TRUE(y_dif.TopLeftPixel() != NULL); - ASSERT_TRUE(u_src.TopLeftPixel() != NULL); - ASSERT_TRUE(u_pre.TopLeftPixel() != NULL); - ASSERT_TRUE(u_dif.TopLeftPixel() != NULL); - ASSERT_TRUE(v_src.TopLeftPixel() != NULL); - ASSERT_TRUE(v_pre.TopLeftPixel() != NULL); - ASSERT_TRUE(v_dif.TopLeftPixel() != NULL); + uint32_t *y_accum = y_accumulator->TopLeftPixel(); + uint32_t *u_accum = u_accumulator->TopLeftPixel(); + uint32_t *v_accum = v_accumulator->TopLeftPixel(); + uint16_t *y_count = y_counter->TopLeftPixel(); + uint16_t *u_count = u_counter->TopLeftPixel(); + uint16_t *v_count = v_counter->TopLeftPixel(); + + const int y_accum_stride = y_accumulator->stride(); + const int u_accum_stride = u_accumulator->stride(); + const int v_accum_stride = v_accumulator->stride(); + const int y_count_stride = y_counter->stride(); + const int u_count_stride = u_counter->stride(); + const int v_count_stride = v_counter->stride(); + + const int rounding = (1 << strength) >> 1; // Get the square diffs - for (int row = 0; row < static_cast<int>(block_height); row++) { - for (int col = 0; col < static_cast<int>(block_width); col++) { - const int diff = y_src.TopLeftPixel()[row * y_src.stride() + col] - - y_pre.TopLeftPixel()[row * y_pre.stride() + col]; - y_dif.TopLeftPixel()[row * y_dif.stride() + col] = diff * diff; + for (int row = 0; row < (int)block_height; row++) { + for (int col = 0; col < (int)block_width; col++) { + const int diff = y_src_ptr[row * y_src_stride + col] - + y_pre_ptr[row * y_pre_stride + col]; + y_diff_ptr[row * y_diff_stride + col] = diff * diff; } } - for (int row = 0; row < uv_block_height; row++) { - for (int col = 0; col < uv_block_width; col++) { - const int u_diff = u_src.TopLeftPixel()[row * u_src.stride() + col] - - u_pre.TopLeftPixel()[row * u_pre.stride() + col]; - const int v_diff = v_src.TopLeftPixel()[row * v_src.stride() + col] - - v_pre.TopLeftPixel()[row * v_pre.stride() + col]; - u_dif.TopLeftPixel()[row * u_dif.stride() + col] = u_diff * u_diff; - v_dif.TopLeftPixel()[row * v_dif.stride() + col] = v_diff * v_diff; + for (int row = 0; row < (int)uv_block_height; row++) { + for (int col = 0; col < (int)uv_block_width; col++) { + const int u_diff = u_src_ptr[row * uv_src_stride + col] - + u_pre_ptr[row * uv_pre_stride + col]; + const int v_diff = v_src_ptr[row * uv_src_stride + col] - + v_pre_ptr[row * uv_pre_stride + col]; + u_diff_ptr[row * uv_diff_stride + col] = u_diff * u_diff; + v_diff_ptr[row * uv_diff_stride + col] = v_diff * v_diff; } } - // Apply the filter - for (int row = 0; row < static_cast<int>(block_height); row++) { - for (int col = 0; col < static_cast<int>(block_width); col++) { - const int uv_r = row >> ss_y; - const int uv_c = col >> ss_x; + // Apply the filter to luma + for (int row = 0; row < (int)block_height; row++) { + for (int col = 0; col < (int)block_width; col++) { + const int uv_row = row >> ss_y; + const int uv_col = col >> ss_x; const int filter_weight = GetFilterWeight(row, col, block_height, block_width, blk_fw, use_32x32); // First we get the modifier for the current y pixel - const int y_pixel = y_pre.TopLeftPixel()[row * y_pre.stride() + col]; + const int y_pixel = y_pre_ptr[row * y_pre_stride + col]; int y_num_used = 0; int y_mod = 0; @@ -142,116 +178,316 @@ void ApplyReferenceFilter( const int sub_row = row + row_step; const int sub_col = col + col_step; - if (sub_row >= 0 && sub_row < static_cast<int>(block_height) && - sub_col >= 0 && sub_col < static_cast<int>(block_width)) { - y_mod += y_dif.TopLeftPixel()[sub_row * y_dif.stride() + sub_col]; + if (sub_row >= 0 && sub_row < (int)block_height && sub_col >= 0 && + sub_col < (int)block_width) { + y_mod += y_diff_ptr[sub_row * y_diff_stride + sub_col]; y_num_used++; } } } - ASSERT_GE(y_num_used, 0); - // Sum the corresponding uv pixels to the current y modifier // Note we are rounding down instead of rounding to the nearest pixel. - y_mod += u_dif.TopLeftPixel()[uv_r * uv_block_width + uv_c]; - y_mod += v_dif.TopLeftPixel()[uv_r * uv_block_width + uv_c]; + y_mod += u_diff_ptr[uv_row * uv_diff_stride + uv_col]; + y_mod += v_diff_ptr[uv_row * uv_diff_stride + uv_col]; y_num_used += 2; // Set the modifier - y_mod = GetModIndex(y_mod, y_num_used, rounding, strength, filter_weight); + y_mod = GetModIndex<PixelType>(y_mod, y_num_used, rounding, strength, + filter_weight); // Accumulate the result - y_count->TopLeftPixel()[row * y_count->stride() + col] += y_mod; - y_accumulator->TopLeftPixel()[row * y_accumulator->stride() + col] += - y_mod * y_pixel; - - // Get the modifier for chroma components - if (!(row & ss_y) && !(col & ss_x)) { - const int u_pixel = u_pre.TopLeftPixel()[uv_r * u_pre.stride() + uv_c]; - const int v_pixel = v_pre.TopLeftPixel()[uv_r * v_pre.stride() + uv_c]; - - int uv_num_used = 0; - int u_mod = 0, v_mod = 0; - - // Sum the neighboring 3x3 chromal pixels to the chroma modifier - for (int row_step = -1; row_step <= 1; row_step++) { - for (int col_step = -1; col_step <= 1; col_step++) { - const int sub_row = uv_r + row_step; - const int sub_col = uv_c + col_step; - - if (sub_row >= 0 && sub_row < uv_block_height && sub_col >= 0 && - sub_col < uv_block_width) { - u_mod += u_dif.TopLeftPixel()[sub_row * uv_block_width + sub_col]; - v_mod += v_dif.TopLeftPixel()[sub_row * uv_block_width + sub_col]; - uv_num_used++; - } - } - } + y_count[row * y_count_stride + col] += y_mod; + y_accum[row * y_accum_stride + col] += y_mod * y_pixel; + } + } + + // Apply the filter to chroma + for (int uv_row = 0; uv_row < (int)uv_block_height; uv_row++) { + for (int uv_col = 0; uv_col < (int)uv_block_width; uv_col++) { + const int y_row = uv_row << ss_y; + const int y_col = uv_col << ss_x; + const int filter_weight = GetFilterWeight( + uv_row, uv_col, uv_block_height, uv_block_width, blk_fw, use_32x32); - ASSERT_GT(uv_num_used, 0); + const int u_pixel = u_pre_ptr[uv_row * uv_pre_stride + uv_col]; + const int v_pixel = v_pre_ptr[uv_row * uv_pre_stride + uv_col]; - // Sum all the luma pixels associated with the current luma pixel - for (int row_step = 0; row_step < 1 + ss_y; row_step++) { - for (int col_step = 0; col_step < 1 + ss_x; col_step++) { - const int sub_row = (uv_r << ss_y) + row_step; - const int sub_col = (uv_c << ss_x) + col_step; - const int y_diff = - y_dif.TopLeftPixel()[sub_row * y_dif.stride() + sub_col]; + int uv_num_used = 0; + int u_mod = 0, v_mod = 0; - u_mod += y_diff; - v_mod += y_diff; + // Sum the neighboring 3x3 chromal pixels to the chroma modifier + for (int row_step = -1; row_step <= 1; row_step++) { + for (int col_step = -1; col_step <= 1; col_step++) { + const int sub_row = uv_row + row_step; + const int sub_col = uv_col + col_step; + + if (sub_row >= 0 && sub_row < uv_block_height && sub_col >= 0 && + sub_col < uv_block_width) { + u_mod += u_diff_ptr[sub_row * uv_diff_stride + sub_col]; + v_mod += v_diff_ptr[sub_row * uv_diff_stride + sub_col]; uv_num_used++; } } + } - // Set the modifier - u_mod = - GetModIndex(u_mod, uv_num_used, rounding, strength, filter_weight); - v_mod = - GetModIndex(v_mod, uv_num_used, rounding, strength, filter_weight); - - // Accumulate the result - u_count->TopLeftPixel()[uv_r * u_count->stride() + uv_c] += u_mod; - u_accumulator->TopLeftPixel()[uv_r * u_accumulator->stride() + uv_c] += - u_mod * u_pixel; - v_count->TopLeftPixel()[uv_r * u_count->stride() + uv_c] += v_mod; - v_accumulator->TopLeftPixel()[uv_r * v_accumulator->stride() + uv_c] += - v_mod * v_pixel; + // Sum all the luma pixels associated with the current luma pixel + for (int row_step = 0; row_step < 1 + ss_y; row_step++) { + for (int col_step = 0; col_step < 1 + ss_x; col_step++) { + const int sub_row = y_row + row_step; + const int sub_col = y_col + col_step; + const int y_diff = y_diff_ptr[sub_row * y_diff_stride + sub_col]; + + u_mod += y_diff; + v_mod += y_diff; + uv_num_used++; + } } + + // Set the modifier + u_mod = GetModIndex<PixelType>(u_mod, uv_num_used, rounding, strength, + filter_weight); + v_mod = GetModIndex<PixelType>(v_mod, uv_num_used, rounding, strength, + filter_weight); + + // Accumulate the result + u_count[uv_row * u_count_stride + uv_col] += u_mod; + u_accum[uv_row * u_accum_stride + uv_col] += u_mod * u_pixel; + v_count[uv_row * v_count_stride + uv_col] += v_mod; + v_accum[uv_row * v_accum_stride + uv_col] += v_mod * v_pixel; } } } class YUVTemporalFilterTest - : public ::testing::TestWithParam<YUVTemporalFilterFunc> { + : public ::testing::TestWithParam<TemporalFilterWithBd> { public: virtual void SetUp() { - filter_func_ = GetParam(); + filter_func_ = GetParam().temporal_filter; + bd_ = GetParam().bd; + use_highbd_ = (bd_ != 8); + rnd_.Reset(ACMRandom::DeterministicSeed()); + saturate_test_ = 0; + num_repeats_ = 10; + + ASSERT_TRUE(bd_ == 8 || bd_ == 10 || bd_ == 12); } protected: + template <typename PixelType> + void CompareTestWithParam(int width, int height, int ss_x, int ss_y, + int filter_strength, int use_32x32, + const int *filter_weight); + template <typename PixelType> + void RunTestFilterWithParam(int width, int height, int ss_x, int ss_y, + int filter_strength, int use_32x32, + const int *filter_weight); YUVTemporalFilterFunc filter_func_; ACMRandom rnd_; + int saturate_test_; + int num_repeats_; + int use_highbd_; + int bd_; }; -TEST_P(YUVTemporalFilterTest, Use32x32) { - const int width = 32, height = 32; - Buffer<uint8_t> y_src = Buffer<uint8_t>(width, height, 8); - Buffer<uint8_t> y_pre = Buffer<uint8_t>(width, height, 0); +template <typename PixelType> +void YUVTemporalFilterTest::CompareTestWithParam(int width, int height, + int ss_x, int ss_y, + int filter_strength, + int use_32x32, + const int *filter_weight) { + const int uv_width = width >> ss_x, uv_height = height >> ss_y; + + Buffer<PixelType> y_src = Buffer<PixelType>(width, height, 0); + Buffer<PixelType> y_pre = Buffer<PixelType>(width, height, 0); Buffer<uint16_t> y_count_ref = Buffer<uint16_t>(width, height, 0); Buffer<uint32_t> y_accum_ref = Buffer<uint32_t>(width, height, 0); Buffer<uint16_t> y_count_tst = Buffer<uint16_t>(width, height, 0); Buffer<uint32_t> y_accum_tst = Buffer<uint32_t>(width, height, 0); + + Buffer<PixelType> u_src = Buffer<PixelType>(uv_width, uv_height, 0); + Buffer<PixelType> u_pre = Buffer<PixelType>(uv_width, uv_height, 0); + Buffer<uint16_t> u_count_ref = Buffer<uint16_t>(uv_width, uv_height, 0); + Buffer<uint32_t> u_accum_ref = Buffer<uint32_t>(uv_width, uv_height, 0); + Buffer<uint16_t> u_count_tst = Buffer<uint16_t>(uv_width, uv_height, 0); + Buffer<uint32_t> u_accum_tst = Buffer<uint32_t>(uv_width, uv_height, 0); + + Buffer<PixelType> v_src = Buffer<PixelType>(uv_width, uv_height, 0); + Buffer<PixelType> v_pre = Buffer<PixelType>(uv_width, uv_height, 0); + Buffer<uint16_t> v_count_ref = Buffer<uint16_t>(uv_width, uv_height, 0); + Buffer<uint32_t> v_accum_ref = Buffer<uint32_t>(uv_width, uv_height, 0); + Buffer<uint16_t> v_count_tst = Buffer<uint16_t>(uv_width, uv_height, 0); + Buffer<uint32_t> v_accum_tst = Buffer<uint32_t>(uv_width, uv_height, 0); + ASSERT_TRUE(y_src.Init()); ASSERT_TRUE(y_pre.Init()); ASSERT_TRUE(y_count_ref.Init()); ASSERT_TRUE(y_accum_ref.Init()); ASSERT_TRUE(y_count_tst.Init()); ASSERT_TRUE(y_accum_tst.Init()); + ASSERT_TRUE(u_src.Init()); + ASSERT_TRUE(u_pre.Init()); + ASSERT_TRUE(u_count_ref.Init()); + ASSERT_TRUE(u_accum_ref.Init()); + ASSERT_TRUE(u_count_tst.Init()); + ASSERT_TRUE(u_accum_tst.Init()); + + ASSERT_TRUE(v_src.Init()); + ASSERT_TRUE(v_pre.Init()); + ASSERT_TRUE(v_count_ref.Init()); + ASSERT_TRUE(v_accum_ref.Init()); + ASSERT_TRUE(v_count_tst.Init()); + ASSERT_TRUE(v_accum_tst.Init()); + + y_accum_ref.Set(0); + y_accum_tst.Set(0); + y_count_ref.Set(0); + y_count_tst.Set(0); + u_accum_ref.Set(0); + u_accum_tst.Set(0); + u_count_ref.Set(0); + u_count_tst.Set(0); + v_accum_ref.Set(0); + v_accum_tst.Set(0); + v_count_ref.Set(0); + v_count_tst.Set(0); + + for (int repeats = 0; repeats < num_repeats_; repeats++) { + if (saturate_test_) { + const int max_val = (1 << bd_) - 1; + y_src.Set(max_val); + y_pre.Set(0); + u_src.Set(max_val); + u_pre.Set(0); + v_src.Set(max_val); + v_pre.Set(0); + } else { + y_src.Set(&rnd_, 0, 7 << (bd_ - 8)); + y_pre.Set(&rnd_, 0, 7 << (bd_ - 8)); + u_src.Set(&rnd_, 0, 7 << (bd_ - 8)); + u_pre.Set(&rnd_, 0, 7 << (bd_ - 8)); + v_src.Set(&rnd_, 0, 7 << (bd_ - 8)); + v_pre.Set(&rnd_, 0, 7 << (bd_ - 8)); + } + ApplyReferenceFilter<PixelType>( + y_src, y_pre, u_src, v_src, u_pre, v_pre, width, height, ss_x, ss_y, + filter_strength, filter_weight, use_32x32, &y_accum_ref, &y_count_ref, + &u_accum_ref, &u_count_ref, &v_accum_ref, &v_count_ref); + + ASM_REGISTER_STATE_CHECK(filter_func_( + reinterpret_cast<const uint8_t *>(y_src.TopLeftPixel()), y_src.stride(), + reinterpret_cast<const uint8_t *>(y_pre.TopLeftPixel()), y_pre.stride(), + reinterpret_cast<const uint8_t *>(u_src.TopLeftPixel()), + reinterpret_cast<const uint8_t *>(v_src.TopLeftPixel()), u_src.stride(), + reinterpret_cast<const uint8_t *>(u_pre.TopLeftPixel()), + reinterpret_cast<const uint8_t *>(v_pre.TopLeftPixel()), u_pre.stride(), + width, height, ss_x, ss_y, filter_strength, filter_weight, use_32x32, + y_accum_tst.TopLeftPixel(), y_count_tst.TopLeftPixel(), + u_accum_tst.TopLeftPixel(), u_count_tst.TopLeftPixel(), + v_accum_tst.TopLeftPixel(), v_count_tst.TopLeftPixel())); + + EXPECT_TRUE(y_accum_tst.CheckValues(y_accum_ref)); + EXPECT_TRUE(y_count_tst.CheckValues(y_count_ref)); + EXPECT_TRUE(u_accum_tst.CheckValues(u_accum_ref)); + EXPECT_TRUE(u_count_tst.CheckValues(u_count_ref)); + EXPECT_TRUE(v_accum_tst.CheckValues(v_accum_ref)); + EXPECT_TRUE(v_count_tst.CheckValues(v_count_ref)); + + if (HasFailure()) { + if (use_32x32) { + printf("SS_X: %d, SS_Y: %d, Strength: %d, Weight: %d\n", ss_x, ss_y, + filter_strength, *filter_weight); + } else { + printf("SS_X: %d, SS_Y: %d, Strength: %d, Weights: %d,%d,%d,%d\n", ss_x, + ss_y, filter_strength, filter_weight[0], filter_weight[1], + filter_weight[2], filter_weight[3]); + } + y_accum_tst.PrintDifference(y_accum_ref); + y_count_tst.PrintDifference(y_count_ref); + u_accum_tst.PrintDifference(u_accum_ref); + u_count_tst.PrintDifference(u_count_ref); + v_accum_tst.PrintDifference(v_accum_ref); + v_count_tst.PrintDifference(v_count_ref); + + return; + } + } +} + +template <typename PixelType> +void YUVTemporalFilterTest::RunTestFilterWithParam(int width, int height, + int ss_x, int ss_y, + int filter_strength, + int use_32x32, + const int *filter_weight) { + const int uv_width = width >> ss_x, uv_height = height >> ss_y; + + Buffer<PixelType> y_src = Buffer<PixelType>(width, height, 0); + Buffer<PixelType> y_pre = Buffer<PixelType>(width, height, 0); + Buffer<uint16_t> y_count = Buffer<uint16_t>(width, height, 0); + Buffer<uint32_t> y_accum = Buffer<uint32_t>(width, height, 0); + + Buffer<PixelType> u_src = Buffer<PixelType>(uv_width, uv_height, 0); + Buffer<PixelType> u_pre = Buffer<PixelType>(uv_width, uv_height, 0); + Buffer<uint16_t> u_count = Buffer<uint16_t>(uv_width, uv_height, 0); + Buffer<uint32_t> u_accum = Buffer<uint32_t>(uv_width, uv_height, 0); + + Buffer<PixelType> v_src = Buffer<PixelType>(uv_width, uv_height, 0); + Buffer<PixelType> v_pre = Buffer<PixelType>(uv_width, uv_height, 0); + Buffer<uint16_t> v_count = Buffer<uint16_t>(uv_width, uv_height, 0); + Buffer<uint32_t> v_accum = Buffer<uint32_t>(uv_width, uv_height, 0); + + ASSERT_TRUE(y_src.Init()); + ASSERT_TRUE(y_pre.Init()); + ASSERT_TRUE(y_count.Init()); + ASSERT_TRUE(y_accum.Init()); + + ASSERT_TRUE(u_src.Init()); + ASSERT_TRUE(u_pre.Init()); + ASSERT_TRUE(u_count.Init()); + ASSERT_TRUE(u_accum.Init()); + + ASSERT_TRUE(v_src.Init()); + ASSERT_TRUE(v_pre.Init()); + ASSERT_TRUE(v_count.Init()); + ASSERT_TRUE(v_accum.Init()); + + y_accum.Set(0); + y_count.Set(0); + + u_accum.Set(0); + u_count.Set(0); + + v_accum.Set(0); + v_count.Set(0); + + y_src.Set(&rnd_, 0, 7 << (bd_ - 8)); + y_pre.Set(&rnd_, 0, 7 << (bd_ - 8)); + u_src.Set(&rnd_, 0, 7 << (bd_ - 8)); + u_pre.Set(&rnd_, 0, 7 << (bd_ - 8)); + v_src.Set(&rnd_, 0, 7 << (bd_ - 8)); + v_pre.Set(&rnd_, 0, 7 << (bd_ - 8)); + + for (int repeats = 0; repeats < num_repeats_; repeats++) { + ASM_REGISTER_STATE_CHECK(filter_func_( + reinterpret_cast<const uint8_t *>(y_src.TopLeftPixel()), y_src.stride(), + reinterpret_cast<const uint8_t *>(y_pre.TopLeftPixel()), y_pre.stride(), + reinterpret_cast<const uint8_t *>(u_src.TopLeftPixel()), + reinterpret_cast<const uint8_t *>(v_src.TopLeftPixel()), u_src.stride(), + reinterpret_cast<const uint8_t *>(u_pre.TopLeftPixel()), + reinterpret_cast<const uint8_t *>(v_pre.TopLeftPixel()), u_pre.stride(), + width, height, ss_x, ss_y, filter_strength, filter_weight, use_32x32, + y_accum.TopLeftPixel(), y_count.TopLeftPixel(), u_accum.TopLeftPixel(), + u_count.TopLeftPixel(), v_accum.TopLeftPixel(), + v_count.TopLeftPixel())); + } +} + +TEST_P(YUVTemporalFilterTest, Use32x32) { + const int width = 32, height = 32; const int use_32x32 = 1; for (int ss_x = 0; ss_x <= 1; ss_x++) { @@ -259,95 +495,17 @@ TEST_P(YUVTemporalFilterTest, Use32x32) { for (int filter_strength = 0; filter_strength <= 6; filter_strength += 2) { for (int filter_weight = 0; filter_weight <= 2; filter_weight++) { - const int uv_width = width >> ss_x, uv_height = height >> ss_y; - Buffer<uint8_t> u_src = Buffer<uint8_t>(uv_width, uv_height, 8); - Buffer<uint8_t> u_pre = Buffer<uint8_t>(uv_width, uv_height, 0); - Buffer<uint16_t> u_count_ref = - Buffer<uint16_t>(uv_width, uv_height, 0); - Buffer<uint32_t> u_accum_ref = - Buffer<uint32_t>(uv_width, uv_height, 0); - Buffer<uint16_t> u_count_tst = - Buffer<uint16_t>(uv_width, uv_height, 0); - Buffer<uint32_t> u_accum_tst = - Buffer<uint32_t>(uv_width, uv_height, 0); - ASSERT_TRUE(u_src.Init()); - ASSERT_TRUE(u_pre.Init()); - ASSERT_TRUE(u_count_ref.Init()); - ASSERT_TRUE(u_accum_ref.Init()); - ASSERT_TRUE(u_count_tst.Init()); - ASSERT_TRUE(u_accum_tst.Init()); - Buffer<uint8_t> v_src = Buffer<uint8_t>(uv_width, uv_height, 8); - Buffer<uint8_t> v_pre = Buffer<uint8_t>(uv_width, uv_height, 0); - Buffer<uint16_t> v_count_ref = - Buffer<uint16_t>(uv_width, uv_height, 0); - Buffer<uint32_t> v_accum_ref = - Buffer<uint32_t>(uv_width, uv_height, 0); - Buffer<uint16_t> v_count_tst = - Buffer<uint16_t>(uv_width, uv_height, 0); - Buffer<uint32_t> v_accum_tst = - Buffer<uint32_t>(uv_width, uv_height, 0); - ASSERT_TRUE(v_src.Init()); - ASSERT_TRUE(v_pre.Init()); - ASSERT_TRUE(v_count_ref.Init()); - ASSERT_TRUE(v_accum_ref.Init()); - ASSERT_TRUE(v_count_tst.Init()); - ASSERT_TRUE(v_accum_tst.Init()); - - // The difference between the buffers must be small to pass the - // threshold to apply the filter. - y_src.Set(&rnd_, 0, 7); - y_pre.Set(&rnd_, 0, 7); - u_src.Set(&rnd_, 0, 7); - u_pre.Set(&rnd_, 0, 7); - v_src.Set(&rnd_, 0, 7); - v_pre.Set(&rnd_, 0, 7); - - y_accum_ref.Set(rnd_.Rand8()); - y_accum_tst.CopyFrom(y_accum_ref); - y_count_ref.Set(rnd_.Rand8()); - y_count_tst.CopyFrom(y_count_ref); - u_accum_ref.Set(rnd_.Rand8()); - u_accum_tst.CopyFrom(u_accum_ref); - u_count_ref.Set(rnd_.Rand8()); - u_count_tst.CopyFrom(u_count_ref); - v_accum_ref.Set(rnd_.Rand8()); - v_accum_tst.CopyFrom(v_accum_ref); - v_count_ref.Set(rnd_.Rand8()); - v_count_tst.CopyFrom(v_count_ref); - - ApplyReferenceFilter(y_src, y_pre, u_src, v_src, u_pre, v_pre, width, - height, ss_x, ss_y, filter_strength, - &filter_weight, use_32x32, &y_accum_ref, - &y_count_ref, &u_accum_ref, &u_count_ref, - &v_accum_ref, &v_count_ref); - ASM_REGISTER_STATE_CHECK(filter_func_( - y_src.TopLeftPixel(), y_src.stride(), y_pre.TopLeftPixel(), - y_pre.stride(), u_src.TopLeftPixel(), v_src.TopLeftPixel(), - u_src.stride(), u_pre.TopLeftPixel(), v_pre.TopLeftPixel(), - u_pre.stride(), width, height, ss_x, ss_y, filter_strength, - &filter_weight, use_32x32, y_accum_tst.TopLeftPixel(), - y_count_tst.TopLeftPixel(), u_accum_tst.TopLeftPixel(), - u_count_tst.TopLeftPixel(), v_accum_tst.TopLeftPixel(), - v_count_tst.TopLeftPixel())); - - EXPECT_TRUE(y_accum_tst.CheckValues(y_accum_ref)); - EXPECT_TRUE(y_count_tst.CheckValues(y_count_ref)); - EXPECT_TRUE(u_accum_tst.CheckValues(u_accum_ref)); - EXPECT_TRUE(u_count_tst.CheckValues(u_count_ref)); - EXPECT_TRUE(v_accum_tst.CheckValues(v_accum_ref)); - EXPECT_TRUE(v_count_tst.CheckValues(v_count_ref)); - - if (HasFailure()) { - printf("SS_X: %d, SS_Y: %d, Weight: %d, Strength: %d\n", ss_x, ss_y, - filter_weight, filter_strength); - y_accum_tst.PrintDifference(y_accum_ref); - y_count_tst.PrintDifference(y_count_ref); - u_accum_tst.PrintDifference(u_accum_ref); - u_count_tst.PrintDifference(u_count_ref); - v_accum_tst.PrintDifference(v_accum_ref); - v_count_tst.PrintDifference(v_count_ref); - return; + if (use_highbd_) { + const int adjusted_strength = filter_strength + 2 * (bd_ - 8); + CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y, + adjusted_strength, use_32x32, + &filter_weight); + } else { + CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y, + filter_strength, use_32x32, + &filter_weight); } + ASSERT_FALSE(HasFailure()); } } } @@ -356,19 +514,6 @@ TEST_P(YUVTemporalFilterTest, Use32x32) { TEST_P(YUVTemporalFilterTest, Use16x16) { const int width = 32, height = 32; - Buffer<uint8_t> y_src = Buffer<uint8_t>(width, height, 8); - Buffer<uint8_t> y_pre = Buffer<uint8_t>(width, height, 0); - Buffer<uint16_t> y_count_ref = Buffer<uint16_t>(width, height, 0); - Buffer<uint32_t> y_accum_ref = Buffer<uint32_t>(width, height, 0); - Buffer<uint16_t> y_count_tst = Buffer<uint16_t>(width, height, 0); - Buffer<uint32_t> y_accum_tst = Buffer<uint32_t>(width, height, 0); - ASSERT_TRUE(y_src.Init()); - ASSERT_TRUE(y_pre.Init()); - ASSERT_TRUE(y_count_ref.Init()); - ASSERT_TRUE(y_accum_ref.Init()); - ASSERT_TRUE(y_count_tst.Init()); - ASSERT_TRUE(y_accum_tst.Init()); - const int use_32x32 = 0; for (int ss_x = 0; ss_x <= 1; ss_x++) { @@ -385,95 +530,18 @@ TEST_P(YUVTemporalFilterTest, Use16x16) { // Test each parameter for (int filter_strength = 0; filter_strength <= 6; filter_strength += 2) { - const int uv_width = width >> ss_x, uv_height = height >> ss_y; - Buffer<uint8_t> u_src = Buffer<uint8_t>(uv_width, uv_height, 8); - Buffer<uint8_t> u_pre = Buffer<uint8_t>(uv_width, uv_height, 0); - Buffer<uint16_t> u_count_ref = - Buffer<uint16_t>(uv_width, uv_height, 0); - Buffer<uint32_t> u_accum_ref = - Buffer<uint32_t>(uv_width, uv_height, 0); - Buffer<uint16_t> u_count_tst = - Buffer<uint16_t>(uv_width, uv_height, 0); - Buffer<uint32_t> u_accum_tst = - Buffer<uint32_t>(uv_width, uv_height, 0); - ASSERT_TRUE(u_src.Init()); - ASSERT_TRUE(u_pre.Init()); - ASSERT_TRUE(u_count_ref.Init()); - ASSERT_TRUE(u_accum_ref.Init()); - ASSERT_TRUE(u_count_tst.Init()); - ASSERT_TRUE(u_accum_tst.Init()); - Buffer<uint8_t> v_src = Buffer<uint8_t>(uv_width, uv_height, 8); - Buffer<uint8_t> v_pre = Buffer<uint8_t>(uv_width, uv_height, 0); - Buffer<uint16_t> v_count_ref = - Buffer<uint16_t>(uv_width, uv_height, 0); - Buffer<uint32_t> v_accum_ref = - Buffer<uint32_t>(uv_width, uv_height, 0); - Buffer<uint16_t> v_count_tst = - Buffer<uint16_t>(uv_width, uv_height, 0); - Buffer<uint32_t> v_accum_tst = - Buffer<uint32_t>(uv_width, uv_height, 0); - ASSERT_TRUE(v_src.Init()); - ASSERT_TRUE(v_pre.Init()); - ASSERT_TRUE(v_count_ref.Init()); - ASSERT_TRUE(v_accum_ref.Init()); - ASSERT_TRUE(v_count_tst.Init()); - ASSERT_TRUE(v_accum_tst.Init()); - - // The difference between the buffers must be small to pass the - // threshold to apply the filter. - y_src.Set(&rnd_, 0, 7); - y_pre.Set(&rnd_, 0, 7); - u_src.Set(&rnd_, 0, 7); - u_pre.Set(&rnd_, 0, 7); - v_src.Set(&rnd_, 0, 7); - v_pre.Set(&rnd_, 0, 7); - - y_accum_ref.Set(rnd_.Rand8()); - y_accum_tst.CopyFrom(y_accum_ref); - y_count_ref.Set(rnd_.Rand8()); - y_count_tst.CopyFrom(y_count_ref); - u_accum_ref.Set(rnd_.Rand8()); - u_accum_tst.CopyFrom(u_accum_ref); - u_count_ref.Set(rnd_.Rand8()); - u_count_tst.CopyFrom(u_count_ref); - v_accum_ref.Set(rnd_.Rand8()); - v_accum_tst.CopyFrom(v_accum_ref); - v_count_ref.Set(rnd_.Rand8()); - v_count_tst.CopyFrom(v_count_ref); - - ApplyReferenceFilter(y_src, y_pre, u_src, v_src, u_pre, v_pre, width, - height, ss_x, ss_y, filter_strength, - filter_weight, use_32x32, &y_accum_ref, - &y_count_ref, &u_accum_ref, &u_count_ref, - &v_accum_ref, &v_count_ref); - ASM_REGISTER_STATE_CHECK(filter_func_( - y_src.TopLeftPixel(), y_src.stride(), y_pre.TopLeftPixel(), - y_pre.stride(), u_src.TopLeftPixel(), v_src.TopLeftPixel(), - u_src.stride(), u_pre.TopLeftPixel(), v_pre.TopLeftPixel(), - u_pre.stride(), width, height, ss_x, ss_y, filter_strength, - filter_weight, use_32x32, y_accum_tst.TopLeftPixel(), - y_count_tst.TopLeftPixel(), u_accum_tst.TopLeftPixel(), - u_count_tst.TopLeftPixel(), v_accum_tst.TopLeftPixel(), - v_count_tst.TopLeftPixel())); - - EXPECT_TRUE(y_accum_tst.CheckValues(y_accum_ref)); - EXPECT_TRUE(y_count_tst.CheckValues(y_count_ref)); - EXPECT_TRUE(u_accum_tst.CheckValues(u_accum_ref)); - EXPECT_TRUE(u_count_tst.CheckValues(u_count_ref)); - EXPECT_TRUE(v_accum_tst.CheckValues(v_accum_ref)); - EXPECT_TRUE(v_count_tst.CheckValues(v_count_ref)); - - if (HasFailure()) { - printf("SS_X: %d, SS_Y: %d, Weight Idx: %d, Strength: %d\n", ss_x, - ss_y, filter_idx, filter_strength); - y_accum_tst.PrintDifference(y_accum_ref); - y_count_tst.PrintDifference(y_count_ref); - u_accum_tst.PrintDifference(u_accum_ref); - u_count_tst.PrintDifference(u_count_ref); - v_accum_tst.PrintDifference(v_accum_ref); - v_count_tst.PrintDifference(v_count_ref); - return; + if (use_highbd_) { + const int adjusted_strength = filter_strength + 2 * (bd_ - 8); + CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y, + adjusted_strength, use_32x32, + filter_weight); + } else { + CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y, + filter_strength, use_32x32, + filter_weight); } + + ASSERT_FALSE(HasFailure()); } } } @@ -483,115 +551,25 @@ TEST_P(YUVTemporalFilterTest, Use16x16) { TEST_P(YUVTemporalFilterTest, SaturationTest) { const int width = 32, height = 32; const int use_32x32 = 1; - - Buffer<uint8_t> y_src = Buffer<uint8_t>(width, height, 8); - Buffer<uint8_t> y_pre = Buffer<uint8_t>(width, height, 0); - Buffer<uint16_t> y_count_ref = Buffer<uint16_t>(width, height, 0); - Buffer<uint32_t> y_accum_ref = Buffer<uint32_t>(width, height, 0); - Buffer<uint16_t> y_count_tst = Buffer<uint16_t>(width, height, 0); - Buffer<uint32_t> y_accum_tst = Buffer<uint32_t>(width, height, 0); - ASSERT_TRUE(y_src.Init()); - ASSERT_TRUE(y_pre.Init()); - ASSERT_TRUE(y_count_ref.Init()); - ASSERT_TRUE(y_accum_ref.Init()); - ASSERT_TRUE(y_count_tst.Init()); - ASSERT_TRUE(y_accum_tst.Init()); + const int filter_weight = 1; + saturate_test_ = 1; for (int ss_x = 0; ss_x <= 1; ss_x++) { for (int ss_y = 0; ss_y <= 1; ss_y++) { for (int filter_strength = 0; filter_strength <= 6; filter_strength += 2) { - for (int filter_weight = 0; filter_weight <= 2; filter_weight++) { - const int uv_width = width >> ss_x, uv_height = height >> ss_y; - Buffer<uint8_t> u_src = Buffer<uint8_t>(uv_width, uv_height, 8); - Buffer<uint8_t> u_pre = Buffer<uint8_t>(uv_width, uv_height, 0); - Buffer<uint16_t> u_count_ref = - Buffer<uint16_t>(uv_width, uv_height, 0); - Buffer<uint32_t> u_accum_ref = - Buffer<uint32_t>(uv_width, uv_height, 0); - Buffer<uint16_t> u_count_tst = - Buffer<uint16_t>(uv_width, uv_height, 0); - Buffer<uint32_t> u_accum_tst = - Buffer<uint32_t>(uv_width, uv_height, 0); - ASSERT_TRUE(u_src.Init()); - ASSERT_TRUE(u_pre.Init()); - ASSERT_TRUE(u_count_ref.Init()); - ASSERT_TRUE(u_accum_ref.Init()); - ASSERT_TRUE(u_count_tst.Init()); - ASSERT_TRUE(u_accum_tst.Init()); - Buffer<uint8_t> v_src = Buffer<uint8_t>(uv_width, uv_height, 8); - Buffer<uint8_t> v_pre = Buffer<uint8_t>(uv_width, uv_height, 0); - Buffer<uint16_t> v_count_ref = - Buffer<uint16_t>(uv_width, uv_height, 0); - Buffer<uint32_t> v_accum_ref = - Buffer<uint32_t>(uv_width, uv_height, 0); - Buffer<uint16_t> v_count_tst = - Buffer<uint16_t>(uv_width, uv_height, 0); - Buffer<uint32_t> v_accum_tst = - Buffer<uint32_t>(uv_width, uv_height, 0); - ASSERT_TRUE(v_src.Init()); - ASSERT_TRUE(v_pre.Init()); - ASSERT_TRUE(v_count_ref.Init()); - ASSERT_TRUE(v_accum_ref.Init()); - ASSERT_TRUE(v_count_tst.Init()); - ASSERT_TRUE(v_accum_tst.Init()); - - // The difference between the buffers must be small to pass the - // threshold to apply the filter. - y_src.Set(255); - y_pre.Set(0); - u_src.Set(255); - u_pre.Set(0); - v_src.Set(255); - v_pre.Set(0); - - y_accum_ref.Set(rnd_.Rand8()); - y_accum_tst.CopyFrom(y_accum_ref); - y_count_ref.Set(rnd_.Rand8()); - y_count_tst.CopyFrom(y_count_ref); - u_accum_ref.Set(rnd_.Rand8()); - u_accum_tst.CopyFrom(u_accum_ref); - u_count_ref.Set(rnd_.Rand8()); - u_count_tst.CopyFrom(u_count_ref); - v_accum_ref.Set(rnd_.Rand8()); - v_accum_tst.CopyFrom(v_accum_ref); - v_count_ref.Set(rnd_.Rand8()); - v_count_tst.CopyFrom(v_count_ref); - - ApplyReferenceFilter(y_src, y_pre, u_src, v_src, u_pre, v_pre, width, - height, ss_x, ss_y, filter_strength, - &filter_weight, use_32x32, &y_accum_ref, - &y_count_ref, &u_accum_ref, &u_count_ref, - &v_accum_ref, &v_count_ref); - ASM_REGISTER_STATE_CHECK(filter_func_( - y_src.TopLeftPixel(), y_src.stride(), y_pre.TopLeftPixel(), - y_pre.stride(), u_src.TopLeftPixel(), v_src.TopLeftPixel(), - u_src.stride(), u_pre.TopLeftPixel(), v_pre.TopLeftPixel(), - u_pre.stride(), width, height, ss_x, ss_y, filter_strength, - &filter_weight, use_32x32, y_accum_tst.TopLeftPixel(), - y_count_tst.TopLeftPixel(), u_accum_tst.TopLeftPixel(), - u_count_tst.TopLeftPixel(), v_accum_tst.TopLeftPixel(), - v_count_tst.TopLeftPixel())); - - EXPECT_TRUE(y_accum_tst.CheckValues(y_accum_ref)); - EXPECT_TRUE(y_count_tst.CheckValues(y_count_ref)); - EXPECT_TRUE(u_accum_tst.CheckValues(u_accum_ref)); - EXPECT_TRUE(u_count_tst.CheckValues(u_count_ref)); - EXPECT_TRUE(v_accum_tst.CheckValues(v_accum_ref)); - EXPECT_TRUE(v_count_tst.CheckValues(v_count_ref)); - - if (HasFailure()) { - printf("SS_X: %d, SS_Y: %d, Weight: %d, Strength: %d\n", ss_x, ss_y, - filter_weight, filter_strength); - y_accum_tst.PrintDifference(y_accum_ref); - y_count_tst.PrintDifference(y_count_ref); - u_accum_tst.PrintDifference(u_accum_ref); - u_count_tst.PrintDifference(u_count_ref); - v_accum_tst.PrintDifference(v_accum_ref); - v_count_tst.PrintDifference(v_count_ref); - return; - } + if (use_highbd_) { + const int adjusted_strength = filter_strength + 2 * (bd_ - 8); + CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y, + adjusted_strength, use_32x32, + &filter_weight); + } else { + CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y, + filter_strength, use_32x32, + &filter_weight); } + + ASSERT_FALSE(HasFailure()); } } } @@ -599,14 +577,7 @@ TEST_P(YUVTemporalFilterTest, SaturationTest) { TEST_P(YUVTemporalFilterTest, DISABLED_Speed) { const int width = 32, height = 32; - Buffer<uint8_t> y_src = Buffer<uint8_t>(width, height, 8); - Buffer<uint8_t> y_pre = Buffer<uint8_t>(width, height, 0); - Buffer<uint16_t> y_count = Buffer<uint16_t>(width, height, 0); - Buffer<uint32_t> y_accum = Buffer<uint32_t>(width, height, 0); - ASSERT_TRUE(y_src.Init()); - ASSERT_TRUE(y_pre.Init()); - ASSERT_TRUE(y_count.Init()); - ASSERT_TRUE(y_accum.Init()); + num_repeats_ = 1000; for (int use_32x32 = 0; use_32x32 <= 1; use_32x32++) { const int num_filter_weights = use_32x32 ? 3 : 3 * 3 * 3 * 3; @@ -625,50 +596,17 @@ TEST_P(YUVTemporalFilterTest, DISABLED_Speed) { // Test each parameter for (int filter_strength = 0; filter_strength <= 6; filter_strength += 2) { - const int uv_width = width >> ss_x, uv_height = height >> ss_y; - Buffer<uint8_t> u_src = Buffer<uint8_t>(uv_width, uv_height, 8); - Buffer<uint8_t> u_pre = Buffer<uint8_t>(uv_width, uv_height, 0); - Buffer<uint16_t> u_count = Buffer<uint16_t>(uv_width, uv_height, 0); - Buffer<uint32_t> u_accum = Buffer<uint32_t>(uv_width, uv_height, 0); - ASSERT_TRUE(u_src.Init()); - ASSERT_TRUE(u_pre.Init()); - ASSERT_TRUE(u_count.Init()); - ASSERT_TRUE(u_accum.Init()); - Buffer<uint8_t> v_src = Buffer<uint8_t>(uv_width, uv_height, 8); - Buffer<uint8_t> v_pre = Buffer<uint8_t>(uv_width, uv_height, 0); - Buffer<uint16_t> v_count = Buffer<uint16_t>(uv_width, uv_height, 0); - Buffer<uint32_t> v_accum = Buffer<uint32_t>(uv_width, uv_height, 0); - ASSERT_TRUE(v_src.Init()); - ASSERT_TRUE(v_pre.Init()); - ASSERT_TRUE(v_count.Init()); - ASSERT_TRUE(v_accum.Init()); - - y_src.Set(&rnd_, 0, 7); - y_pre.Set(&rnd_, 0, 7); - u_src.Set(&rnd_, 0, 7); - u_pre.Set(&rnd_, 0, 7); - v_src.Set(&rnd_, 0, 7); - v_pre.Set(&rnd_, 0, 7); - - y_accum.Set(0); - y_count.Set(0); - u_accum.Set(0); - u_count.Set(0); - v_accum.Set(0); - v_count.Set(0); - vpx_usec_timer timer; vpx_usec_timer_start(&timer); - for (int num_calls = 0; num_calls < 1000; num_calls++) { - filter_func_( - y_src.TopLeftPixel(), y_src.stride(), y_pre.TopLeftPixel(), - y_pre.stride(), u_src.TopLeftPixel(), v_src.TopLeftPixel(), - u_src.stride(), u_pre.TopLeftPixel(), v_pre.TopLeftPixel(), - u_pre.stride(), width, height, ss_x, ss_y, filter_strength, - filter_weight, use_32x32, y_accum.TopLeftPixel(), - y_count.TopLeftPixel(), u_accum.TopLeftPixel(), - u_count.TopLeftPixel(), v_accum.TopLeftPixel(), - v_count.TopLeftPixel()); + + if (use_highbd_) { + RunTestFilterWithParam<uint16_t>(width, height, ss_x, ss_y, + filter_strength, use_32x32, + filter_weight); + } else { + RunTestFilterWithParam<uint8_t>(width, height, ss_x, ss_y, + filter_strength, use_32x32, + filter_weight); } vpx_usec_timer_mark(&timer); @@ -676,9 +614,9 @@ TEST_P(YUVTemporalFilterTest, DISABLED_Speed) { static_cast<int>(vpx_usec_timer_elapsed(&timer)); printf( - "Use 32X32: %d, SS_X: %d, SS_Y: %d, Weight Idx: %d, Strength: " - "%d, Time: %5d\n", - use_32x32, ss_x, ss_y, filter_idx, filter_strength, + "Bitdepth: %d, Use 32X32: %d, SS_X: %d, SS_Y: %d, Weight Idx: " + "%d, Strength: %d, Time: %5d\n", + bd_, use_32x32, ss_x, ss_y, filter_idx, filter_strength, elapsed_time); } } @@ -687,11 +625,46 @@ TEST_P(YUVTemporalFilterTest, DISABLED_Speed) { } } -INSTANTIATE_TEST_CASE_P(C, YUVTemporalFilterTest, - ::testing::Values(&vp9_apply_temporal_filter_c)); +#if CONFIG_VP9_HIGHBITDEPTH +#define WRAP_HIGHBD_FUNC(func, bd) \ + void wrap_##func##_##bd( \ + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, \ + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, \ + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, \ + int uv_pre_stride, unsigned int block_width, unsigned int block_height, \ + int ss_x, int ss_y, int strength, const int *const blk_fw, \ + int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, \ + uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, \ + uint16_t *v_count) { \ + func(reinterpret_cast<const uint16_t *>(y_src), y_src_stride, \ + reinterpret_cast<const uint16_t *>(y_pre), y_pre_stride, \ + reinterpret_cast<const uint16_t *>(u_src), \ + reinterpret_cast<const uint16_t *>(v_src), uv_src_stride, \ + reinterpret_cast<const uint16_t *>(u_pre), \ + reinterpret_cast<const uint16_t *>(v_pre), uv_pre_stride, \ + block_width, block_height, ss_x, ss_y, strength, blk_fw, use_32x32, \ + y_accumulator, y_count, u_accumulator, u_count, v_accumulator, \ + v_count); \ + } + +WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_c, 10); +WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_c, 12); + +INSTANTIATE_TEST_CASE_P( + C, YUVTemporalFilterTest, + ::testing::Values( + TemporalFilterWithBd(&vp9_apply_temporal_filter_c, 8), + TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_c_10, 10), + TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_c_12, 12))); +#else +INSTANTIATE_TEST_CASE_P( + C, YUVTemporalFilterTest, + ::testing::Values(TemporalFilterWithBd(&vp9_apply_temporal_filter_c, 8))); #if HAVE_SSE4_1 INSTANTIATE_TEST_CASE_P(SSE4_1, YUVTemporalFilterTest, - ::testing::Values(&vp9_apply_temporal_filter_sse4_1)); + ::testing::Values(TemporalFilterWithBd( + &vp9_apply_temporal_filter_sse4_1, 8))); #endif // HAVE_SSE4_1 +#endif // CONFIG_VP9_HIGHBITDEPTH } // namespace diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 42705dd26..00c4414ad 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -187,9 +187,6 @@ specialize qw/vp9_diamond_search_sad avx/; # Apply temporal filter # if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") { -add_proto qw/void vp9_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count"; -specialize qw/vp9_temporal_filter_apply sse4_1/; - add_proto qw/void vp9_apply_temporal_filter/, "const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count"; specialize qw/vp9_apply_temporal_filter sse4_1/; diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index c622fd88b..d02603615 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -386,137 +386,7 @@ void vp9_apply_temporal_filter_c( } } -// TODO(any): This function is not used anymore. Should be removed. -void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, - const uint8_t *frame2, - unsigned int block_width, - unsigned int block_height, int strength, - int filter_weight, uint32_t *accumulator, - uint16_t *count) { - unsigned int i, j, k; - int modifier; - int byte = 0; - const int rounding = (1 << strength) >> 1; - - assert(strength >= 0); - assert(strength <= 6); - - assert(filter_weight >= 0); - assert(filter_weight <= 2); - - for (i = 0, k = 0; i < block_height; i++) { - for (j = 0; j < block_width; j++, k++) { - int pixel_value = *frame2; - - // non-local mean approach - int diff_sse[9] = { 0 }; - int idx, idy, index = 0; - - for (idy = -1; idy <= 1; ++idy) { - for (idx = -1; idx <= 1; ++idx) { - int row = (int)i + idy; - int col = (int)j + idx; - - if (row >= 0 && row < (int)block_height && col >= 0 && - col < (int)block_width) { - int diff = frame1[byte + idy * (int)stride + idx] - - frame2[idy * (int)block_width + idx]; - diff_sse[index] = diff * diff; - ++index; - } - } - } - - assert(index > 0); - - modifier = 0; - for (idx = 0; idx < 9; ++idx) modifier += diff_sse[idx]; - - modifier *= 3; - modifier /= index; - - ++frame2; - - modifier += rounding; - modifier >>= strength; - - if (modifier > 16) modifier = 16; - - modifier = 16 - modifier; - modifier *= filter_weight; - - count[k] += modifier; - accumulator[k] += modifier * pixel_value; - - byte++; - } - - byte += stride - block_width; - } -} - #if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_temporal_filter_apply_c( - const uint8_t *frame1_8, unsigned int stride, const uint8_t *frame2_8, - unsigned int block_width, unsigned int block_height, int strength, - int *blk_fw, int use_32x32, uint32_t *accumulator, uint16_t *count) { - const uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8); - const uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8); - unsigned int i, j, k; - int modifier; - const int rounding = strength > 0 ? 1 << (strength - 1) : 0; - - int diff_sse[BLK_PELS] = { 0 }; - int this_idx = 0; - - for (i = 0; i < block_height; i++) { - for (j = 0; j < block_width; j++) { - const int diff = - frame1[i * (int)stride + j] - frame2[i * (int)block_width + j]; - diff_sse[this_idx++] = diff * diff; - } - } - - modifier = 0; - for (i = 0, k = 0; i < block_height; i++) { - for (j = 0; j < block_width; j++, k++) { - int pixel_value = frame2[i * (int)block_width + j]; - int filter_weight = - get_filter_weight(i, j, block_height, block_width, blk_fw, use_32x32); - - int idx, idy, index = 0; - - for (idy = -1; idy <= 1; ++idy) { - for (idx = -1; idx <= 1; ++idx) { - int row = (int)i + idy; - int col = (int)j + idx; - - if (row >= 0 && row < (int)block_height && col >= 0 && - col < (int)block_width) { - modifier += diff_sse[row * (int)block_width + col]; - ++index; - } - } - } - assert(index > 0); - - modifier *= 3; - modifier /= index; - - modifier += rounding; - modifier >>= strength; - - if (modifier > 16) modifier = 16; - - modifier = 16 - modifier; - modifier *= filter_weight; - - count[k] += modifier; - accumulator[k] += modifier * pixel_value; - } - } -} - void vp9_highbd_apply_temporal_filter_c( const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, diff --git a/vp9/encoder/x86/temporal_filter_sse4.c b/vp9/encoder/x86/temporal_filter_sse4.c index b560e2218..9f9483a9b 100644 --- a/vp9/encoder/x86/temporal_filter_sse4.c +++ b/vp9/encoder/x86/temporal_filter_sse4.c @@ -18,71 +18,6 @@ #include "vp9/encoder/vp9_temporal_filter.h" #include "vp9/encoder/x86/temporal_filter_constants.h" -// Load values from 'a' and 'b'. Compute the difference squared and sum -// neighboring values such that: -// sum[1] = (a[0]-b[0])^2 + (a[1]-b[1])^2 + (a[2]-b[2])^2 -// Values to the left and right of the row are set to 0. -// The values are returned in sum_0 and sum_1 as *unsigned* 16 bit values. -static void sum_8(const uint8_t *a, const uint8_t *b, __m128i *sum) { - const __m128i a_u8 = _mm_loadl_epi64((const __m128i *)a); - const __m128i b_u8 = _mm_loadl_epi64((const __m128i *)b); - - const __m128i a_u16 = _mm_cvtepu8_epi16(a_u8); - const __m128i b_u16 = _mm_cvtepu8_epi16(b_u8); - - const __m128i diff_s16 = _mm_sub_epi16(a_u16, b_u16); - const __m128i diff_sq_u16 = _mm_mullo_epi16(diff_s16, diff_s16); - - // Shift all the values one place to the left/right so we can efficiently sum - // diff_sq_u16[i - 1] + diff_sq_u16[i] + diff_sq_u16[i + 1]. - const __m128i shift_left = _mm_slli_si128(diff_sq_u16, 2); - const __m128i shift_right = _mm_srli_si128(diff_sq_u16, 2); - - // It becomes necessary to treat the values as unsigned at this point. The - // 255^2 fits in uint16_t but not int16_t. Use saturating adds from this point - // forward since the filter is only applied to smooth small pixel changes. - // Once the value has saturated to uint16_t it is well outside the useful - // range. - __m128i sum_u16 = _mm_adds_epu16(diff_sq_u16, shift_left); - sum_u16 = _mm_adds_epu16(sum_u16, shift_right); - - *sum = sum_u16; -} - -static void sum_16(const uint8_t *a, const uint8_t *b, __m128i *sum_0, - __m128i *sum_1) { - const __m128i zero = _mm_setzero_si128(); - const __m128i a_u8 = _mm_loadu_si128((const __m128i *)a); - const __m128i b_u8 = _mm_loadu_si128((const __m128i *)b); - - const __m128i a_0_u16 = _mm_cvtepu8_epi16(a_u8); - const __m128i a_1_u16 = _mm_unpackhi_epi8(a_u8, zero); - const __m128i b_0_u16 = _mm_cvtepu8_epi16(b_u8); - const __m128i b_1_u16 = _mm_unpackhi_epi8(b_u8, zero); - - const __m128i diff_0_s16 = _mm_sub_epi16(a_0_u16, b_0_u16); - const __m128i diff_1_s16 = _mm_sub_epi16(a_1_u16, b_1_u16); - const __m128i diff_sq_0_u16 = _mm_mullo_epi16(diff_0_s16, diff_0_s16); - const __m128i diff_sq_1_u16 = _mm_mullo_epi16(diff_1_s16, diff_1_s16); - - __m128i shift_left = _mm_slli_si128(diff_sq_0_u16, 2); - // Use _mm_alignr_epi8() to "shift in" diff_sq_u16[8]. - __m128i shift_right = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 2); - - __m128i sum_u16 = _mm_adds_epu16(diff_sq_0_u16, shift_left); - sum_u16 = _mm_adds_epu16(sum_u16, shift_right); - - *sum_0 = sum_u16; - - shift_left = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 14); - shift_right = _mm_srli_si128(diff_sq_1_u16, 2); - - sum_u16 = _mm_adds_epu16(diff_sq_1_u16, shift_left); - sum_u16 = _mm_adds_epu16(sum_u16, shift_right); - - *sum_1 = sum_u16; -} - // Read in 8 pixels from a and b as 8-bit unsigned integers, compute the // difference squared, and store as unsigned 16-bit integer to dst. static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b, @@ -312,148 +247,6 @@ static INLINE void get_sum_16(const uint16_t *y_dist, __m128i *sum_first, get_sum_8(y_dist + 8, sum_second); } -void vp9_temporal_filter_apply_sse4_1(const uint8_t *a, unsigned int stride, - const uint8_t *b, unsigned int width, - unsigned int height, int strength, - int weight, uint32_t *accumulator, - uint16_t *count) { - unsigned int h; - const int rounding = (1 << strength) >> 1; - - assert(strength >= 0); - assert(strength <= 6); - - assert(weight >= 0); - assert(weight <= 2); - - assert(width == 8 || width == 16); - - if (width == 8) { - __m128i sum_row_a, sum_row_b, sum_row_c; - __m128i mul_constants = _mm_setr_epi16( - NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4); - - sum_8(a, b, &sum_row_a); - sum_8(a + stride, b + width, &sum_row_b); - sum_row_c = _mm_adds_epu16(sum_row_a, sum_row_b); - sum_row_c = - average_8(sum_row_c, &mul_constants, strength, rounding, weight); - accumulate_and_store_8(sum_row_c, b, count, accumulator); - - a += stride + stride; - b += width; - count += width; - accumulator += width; - - mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6); - - for (h = 0; h < height - 2; ++h) { - sum_8(a, b + width, &sum_row_c); - sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b); - sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_c); - sum_row_a = - average_8(sum_row_a, &mul_constants, strength, rounding, weight); - accumulate_and_store_8(sum_row_a, b, count, accumulator); - - a += stride; - b += width; - count += width; - accumulator += width; - - sum_row_a = sum_row_b; - sum_row_b = sum_row_c; - } - - mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4); - sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b); - sum_row_a = - average_8(sum_row_a, &mul_constants, strength, rounding, weight); - accumulate_and_store_8(sum_row_a, b, count, accumulator); - - } else { // width == 16 - __m128i sum_row_a_0, sum_row_a_1; - __m128i sum_row_b_0, sum_row_b_1; - __m128i sum_row_c_0, sum_row_c_1; - __m128i mul_constants_0 = _mm_setr_epi16( - NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6), - mul_constants_1 = _mm_setr_epi16( - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4); - - sum_16(a, b, &sum_row_a_0, &sum_row_a_1); - sum_16(a + stride, b + width, &sum_row_b_0, &sum_row_b_1); - - sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0); - sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1); - - average_16(&sum_row_c_0, &sum_row_c_1, &mul_constants_0, &mul_constants_1, - strength, rounding, weight); - accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator); - - a += stride + stride; - b += width; - count += width; - accumulator += width; - - mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9); - mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6); - for (h = 0; h < height - 2; ++h) { - sum_16(a, b + width, &sum_row_c_0, &sum_row_c_1); - - sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0); - sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_c_0); - sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1); - sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_c_1); - - average_16(&sum_row_a_0, &sum_row_a_1, &mul_constants_0, &mul_constants_1, - strength, rounding, weight); - accumulate_and_store_16(sum_row_a_0, sum_row_a_1, b, count, accumulator); - - a += stride; - b += width; - count += width; - accumulator += width; - - sum_row_a_0 = sum_row_b_0; - sum_row_a_1 = sum_row_b_1; - sum_row_b_0 = sum_row_c_0; - sum_row_b_1 = sum_row_c_1; - } - - mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6); - mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4); - sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0); - sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1); - - average_16(&sum_row_c_0, &sum_row_c_1, &mul_constants_0, &mul_constants_1, - strength, rounding, weight); - accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator); - } -} - // Read in a row of chroma values corresponds to a row of 16 luma values. static INLINE void read_chroma_dist_row_16(int ss_x, const uint16_t *u_dist, const uint16_t *v_dist, |