summary refs log tree commit diff
diff options
context:
space:
mode:
author chiyotsai <chiyotsai@google.com> 2019-01-31 12:38:44 -0800
committer chiyotsai <chiyotsai@google.com> 2019-02-01 09:20:19 -0800
commit c447a1d51f9b2912e63c3a1143957db09a7924a0 (patch)
tree b42f05a01c850811eb037d59e54cddb42d33fd6c
parent 16b02219433156915d8e7c265e44e7a54dcd21ef (diff)
download libvpx-c447a1d51f9b2912e63c3a1143957db09a7924a0.tar
libvpx-c447a1d51f9b2912e63c3a1143957db09a7924a0.tar.gz
libvpx-c447a1d51f9b2912e63c3a1143957db09a7924a0.tar.bz2
libvpx-c447a1d51f9b2912e63c3a1143957db09a7924a0.zip
Remove old version of temporal_filter_apply

BUG=webm:1591
Change-Id: I926566ac1bf4bac8cb1ce1c6ded9ba940109283e
-rw-r--r-- test/temporal_filter_test.cc 280
-rw-r--r-- test/test.mk 1
-rw-r--r-- vp9/common/vp9_rtcd_defs.pl 3
-rw-r--r-- vp9/encoder/vp9_temporal_filter.c 130
-rw-r--r-- vp9/encoder/x86/temporal_filter_sse4.c 207
5 files changed, 0 insertions, 621 deletions
diff --git a/test/temporal_filter_test.cc b/test/temporal_filter_test.cc
deleted file mode 100644
index d14a4826e..000000000
--- a/test/temporal_filter_test.cc
+++ /dev/null
@@ -1,280 +0,0 @@
-/*
- * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <limits>
-
-#include "third_party/googletest/src/include/gtest/gtest.h"
-
-#include "./vp9_rtcd.h"
-#include "test/acm_random.h"
-#include "test/buffer.h"
-#include "test/register_state_check.h"
-#include "vpx_ports/vpx_timer.h"
-
-namespace {
-
-using ::libvpx_test::ACMRandom;
-using ::libvpx_test::Buffer;
-
-typedef void (*TemporalFilterFunc)(const uint8_t *a, unsigned int stride,
- const uint8_t *b, unsigned int w,
- unsigned int h, int filter_strength,
- int filter_weight, unsigned int *accumulator,
- uint16_t *count);
-
-// Calculate the difference between 'a' and 'b', sum in blocks of 9, and apply
-// filter based on strength and weight. Store the resulting filter amount in
-// 'count' and apply it to 'b' and store it in 'accumulator'.
-void reference_filter(const Buffer<uint8_t> &a, const Buffer<uint8_t> &b, int w,
- int h, int filter_strength, int filter_weight,
- Buffer<unsigned int> *accumulator,
- Buffer<uint16_t> *count) {
- Buffer<int> diff_sq = Buffer<int>(w, h, 0);
- ASSERT_TRUE(diff_sq.Init());
- diff_sq.Set(0);
-
- int rounding = 0;
- if (filter_strength > 0) {
- rounding = 1 << (filter_strength - 1);
- }
-
- ASSERT_TRUE(a.TopLeftPixel() != NULL);
- ASSERT_TRUE(b.TopLeftPixel() != NULL);
- ASSERT_TRUE(diff_sq.TopLeftPixel() != NULL);
- // Calculate all the differences. Avoids re-calculating a bunch of extra
- // values.
- for (int height = 0; height < h; ++height) {
- for (int width = 0; width < w; ++width) {
- int diff = a.TopLeftPixel()[height * a.stride() + width] -
- b.TopLeftPixel()[height * b.stride() + width];
- diff_sq.TopLeftPixel()[height * diff_sq.stride() + width] = diff * diff;
- }
- }
-
- // For any given point, sum the neighboring values and calculate the
- // modifier.
- for (int height = 0; height < h; ++height) {
- for (int width = 0; width < w; ++width) {
- // Determine how many values are being summed.
- int summed_values = 9;
-
- if (height == 0 || height == (h - 1)) {
- summed_values -= 3;
- }
-
- if (width == 0 || width == (w - 1)) {
- if (summed_values == 6) { // corner
- summed_values -= 2;
- } else {
- summed_values -= 3;
- }
- }
-
- // Sum the diff_sq of the surrounding values.
- int sum = 0;
- for (int idy = -1; idy <= 1; ++idy) {
- for (int idx = -1; idx <= 1; ++idx) {
- const int y = height + idy;
- const int x = width + idx;
-
- // If inside the border.
- if (y >= 0 && y < h && x >= 0 && x < w) {
- sum += diff_sq.TopLeftPixel()[y * diff_sq.stride() + x];
- }
- }
- }
-
- sum *= 3;
- sum /= summed_values;
- sum += rounding;
- sum >>= filter_strength;
-
- // Clamp the value and invert it.
- if (sum > 16) sum = 16;
- sum = 16 - sum;
-
- sum *= filter_weight;
-
- count->TopLeftPixel()[height * count->stride() + width] += sum;
- accumulator->TopLeftPixel()[height * accumulator->stride() + width] +=
- sum * b.TopLeftPixel()[height * b.stride() + width];
- }
- }
-}
-
-class TemporalFilterTest : public ::testing::TestWithParam<TemporalFilterFunc> {
- public:
- virtual void SetUp() {
- filter_func_ = GetParam();
- rnd_.Reset(ACMRandom::DeterministicSeed());
- }
-
- protected:
- TemporalFilterFunc filter_func_;
- ACMRandom rnd_;
-};
-
-TEST_P(TemporalFilterTest, SizeCombinations) {
- // Depending on subsampling this function may be called with values of 8 or 16
- // for width and height, in any combination.
- Buffer<uint8_t> a = Buffer<uint8_t>(16, 16, 8);
- ASSERT_TRUE(a.Init());
-
- const int filter_weight = 2;
- const int filter_strength = 6;
-
- for (int width = 8; width <= 16; width += 8) {
- for (int height = 8; height <= 16; height += 8) {
- // The second buffer must not have any border.
- Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
- ASSERT_TRUE(b.Init());
- Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
- ASSERT_TRUE(accum_ref.Init());
- Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
- ASSERT_TRUE(accum_chk.Init());
- Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
- ASSERT_TRUE(count_ref.Init());
- Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
- ASSERT_TRUE(count_chk.Init());
-
- // The difference between the buffers must be small to pass the threshold
- // to apply the filter.
- a.Set(&rnd_, 0, 7);
- b.Set(&rnd_, 0, 7);
-
- accum_ref.Set(rnd_.Rand8());
- accum_chk.CopyFrom(accum_ref);
- count_ref.Set(rnd_.Rand8());
- count_chk.CopyFrom(count_ref);
- reference_filter(a, b, width, height, filter_strength, filter_weight,
- &accum_ref, &count_ref);
- ASM_REGISTER_STATE_CHECK(
- filter_func_(a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width,
- height, filter_strength, filter_weight,
- accum_chk.TopLeftPixel(), count_chk.TopLeftPixel()));
- EXPECT_TRUE(accum_chk.CheckValues(accum_ref));
- EXPECT_TRUE(count_chk.CheckValues(count_ref));
- if (HasFailure()) {
- printf("Width: %d Height: %d\n", width, height);
- count_chk.PrintDifference(count_ref);
- accum_chk.PrintDifference(accum_ref);
- return;
- }
- }
- }
-}
-
-TEST_P(TemporalFilterTest, CompareReferenceRandom) {
- for (int width = 8; width <= 16; width += 8) {
- for (int height = 8; height <= 16; height += 8) {
- Buffer<uint8_t> a = Buffer<uint8_t>(width, height, 8);
- ASSERT_TRUE(a.Init());
- // The second buffer must not have any border.
- Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
- ASSERT_TRUE(b.Init());
- Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
- ASSERT_TRUE(accum_ref.Init());
- Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
- ASSERT_TRUE(accum_chk.Init());
- Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
- ASSERT_TRUE(count_ref.Init());
- Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
- ASSERT_TRUE(count_chk.Init());
-
- for (int filter_strength = 0; filter_strength <= 6; ++filter_strength) {
- for (int filter_weight = 0; filter_weight <= 2; ++filter_weight) {
- for (int repeat = 0; repeat < 100; ++repeat) {
- if (repeat < 50) {
- a.Set(&rnd_, 0, 7);
- b.Set(&rnd_, 0, 7);
- } else {
- // Check large (but close) values as well.
- a.Set(&rnd_, std::numeric_limits<uint8_t>::max() - 7,
- std::numeric_limits<uint8_t>::max());
- b.Set(&rnd_, std::numeric_limits<uint8_t>::max() - 7,
- std::numeric_limits<uint8_t>::max());
- }
-
- accum_ref.Set(rnd_.Rand8());
- accum_chk.CopyFrom(accum_ref);
- count_ref.Set(rnd_.Rand8());
- count_chk.CopyFrom(count_ref);
- reference_filter(a, b, width, height, filter_strength,
- filter_weight, &accum_ref, &count_ref);
- ASM_REGISTER_STATE_CHECK(filter_func_(
- a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width, height,
- filter_strength, filter_weight, accum_chk.TopLeftPixel(),
- count_chk.TopLeftPixel()));
- EXPECT_TRUE(accum_chk.CheckValues(accum_ref));
- EXPECT_TRUE(count_chk.CheckValues(count_ref));
- if (HasFailure()) {
- printf("Weight: %d Strength: %d\n", filter_weight,
- filter_strength);
- count_chk.PrintDifference(count_ref);
- accum_chk.PrintDifference(accum_ref);
- return;
- }
- }
- }
- }
- }
- }
-}
-
-TEST_P(TemporalFilterTest, DISABLED_Speed) {
- Buffer<uint8_t> a = Buffer<uint8_t>(16, 16, 8);
- ASSERT_TRUE(a.Init());
-
- const int filter_weight = 2;
- const int filter_strength = 6;
-
- for (int width = 8; width <= 16; width += 8) {
- for (int height = 8; height <= 16; height += 8) {
- // The second buffer must not have any border.
- Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
- ASSERT_TRUE(b.Init());
- Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
- ASSERT_TRUE(accum_ref.Init());
- Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
- ASSERT_TRUE(accum_chk.Init());
- Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
- ASSERT_TRUE(count_ref.Init());
- Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
- ASSERT_TRUE(count_chk.Init());
-
- a.Set(&rnd_, 0, 7);
- b.Set(&rnd_, 0, 7);
-
- accum_chk.Set(0);
- count_chk.Set(0);
-
- vpx_usec_timer timer;
- vpx_usec_timer_start(&timer);
- for (int i = 0; i < 10000; ++i) {
- filter_func_(a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width,
- height, filter_strength, filter_weight,
- accum_chk.TopLeftPixel(), count_chk.TopLeftPixel());
- }
- vpx_usec_timer_mark(&timer);
- const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
- printf("Temporal filter %dx%d time: %5d us\n", width, height,
- elapsed_time);
- }
- }
-}
-
-INSTANTIATE_TEST_CASE_P(C, TemporalFilterTest,
- ::testing::Values(&vp9_temporal_filter_apply_c));
-
-#if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(SSE4_1, TemporalFilterTest,
- ::testing::Values(&vp9_temporal_filter_apply_sse4_1));
-#endif // HAVE_SSE4_1
-} // namespace
diff --git a/test/test.mk b/test/test.mk
index 2b7636185..61eb6060f 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -170,7 +170,6 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += hadamard_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += minmax_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_scale_test.cc
ifneq ($(CONFIG_REALTIME_ONLY),yes)
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += temporal_filter_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += yuv_temporal_filter_test.cc
endif
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 42705dd26..00c4414ad 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -187,9 +187,6 @@ specialize qw/vp9_diamond_search_sad avx/;
# Apply temporal filter
#
if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") {
-add_proto qw/void vp9_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count";
-specialize qw/vp9_temporal_filter_apply sse4_1/;
-
add_proto qw/void vp9_apply_temporal_filter/, "const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count";
specialize qw/vp9_apply_temporal_filter sse4_1/;
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index c622fd88b..d02603615 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -386,137 +386,7 @@ void vp9_apply_temporal_filter_c(
}
}
-// TODO(any): This function is not used anymore. Should be removed.
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride,
- const uint8_t *frame2,
- unsigned int block_width,
- unsigned int block_height, int strength,
- int filter_weight, uint32_t *accumulator,
- uint16_t *count) {
- unsigned int i, j, k;
- int modifier;
- int byte = 0;
- const int rounding = (1 << strength) >> 1;
-
- assert(strength >= 0);
- assert(strength <= 6);
-
- assert(filter_weight >= 0);
- assert(filter_weight <= 2);
-
- for (i = 0, k = 0; i < block_height; i++) {
- for (j = 0; j < block_width; j++, k++) {
- int pixel_value = *frame2;
-
- // non-local mean approach
- int diff_sse[9] = { 0 };
- int idx, idy, index = 0;
-
- for (idy = -1; idy <= 1; ++idy) {
- for (idx = -1; idx <= 1; ++idx) {
- int row = (int)i + idy;
- int col = (int)j + idx;
-
- if (row >= 0 && row < (int)block_height && col >= 0 &&
- col < (int)block_width) {
- int diff = frame1[byte + idy * (int)stride + idx] -
- frame2[idy * (int)block_width + idx];
- diff_sse[index] = diff * diff;
- ++index;
- }
- }
- }
-
- assert(index > 0);
-
- modifier = 0;
- for (idx = 0; idx < 9; ++idx) modifier += diff_sse[idx];
-
- modifier *= 3;
- modifier /= index;
-
- ++frame2;
-
- modifier += rounding;
- modifier >>= strength;
-
- if (modifier > 16) modifier = 16;
-
- modifier = 16 - modifier;
- modifier *= filter_weight;
-
- count[k] += modifier;
- accumulator[k] += modifier * pixel_value;
-
- byte++;
- }
-
- byte += stride - block_width;
- }
-}
-
#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_temporal_filter_apply_c(
- const uint8_t *frame1_8, unsigned int stride, const uint8_t *frame2_8,
- unsigned int block_width, unsigned int block_height, int strength,
- int *blk_fw, int use_32x32, uint32_t *accumulator, uint16_t *count) {
- const uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8);
- const uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8);
- unsigned int i, j, k;
- int modifier;
- const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
-
- int diff_sse[BLK_PELS] = { 0 };
- int this_idx = 0;
-
- for (i = 0; i < block_height; i++) {
- for (j = 0; j < block_width; j++) {
- const int diff =
- frame1[i * (int)stride + j] - frame2[i * (int)block_width + j];
- diff_sse[this_idx++] = diff * diff;
- }
- }
-
- modifier = 0;
- for (i = 0, k = 0; i < block_height; i++) {
- for (j = 0; j < block_width; j++, k++) {
- int pixel_value = frame2[i * (int)block_width + j];
- int filter_weight =
- get_filter_weight(i, j, block_height, block_width, blk_fw, use_32x32);
-
- int idx, idy, index = 0;
-
- for (idy = -1; idy <= 1; ++idy) {
- for (idx = -1; idx <= 1; ++idx) {
- int row = (int)i + idy;
- int col = (int)j + idx;
-
- if (row >= 0 && row < (int)block_height && col >= 0 &&
- col < (int)block_width) {
- modifier += diff_sse[row * (int)block_width + col];
- ++index;
- }
- }
- }
- assert(index > 0);
-
- modifier *= 3;
- modifier /= index;
-
- modifier += rounding;
- modifier >>= strength;
-
- if (modifier > 16) modifier = 16;
-
- modifier = 16 - modifier;
- modifier *= filter_weight;
-
- count[k] += modifier;
- accumulator[k] += modifier * pixel_value;
- }
- }
-}
-
void vp9_highbd_apply_temporal_filter_c(
const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
diff --git a/vp9/encoder/x86/temporal_filter_sse4.c b/vp9/encoder/x86/temporal_filter_sse4.c
index b560e2218..9f9483a9b 100644
--- a/vp9/encoder/x86/temporal_filter_sse4.c
+++ b/vp9/encoder/x86/temporal_filter_sse4.c
@@ -18,71 +18,6 @@
#include "vp9/encoder/vp9_temporal_filter.h"
#include "vp9/encoder/x86/temporal_filter_constants.h"
-// Load values from 'a' and 'b'. Compute the difference squared and sum
-// neighboring values such that:
-// sum[1] = (a[0]-b[0])^2 + (a[1]-b[1])^2 + (a[2]-b[2])^2
-// Values to the left and right of the row are set to 0.
-// The values are returned in sum_0 and sum_1 as *unsigned* 16 bit values.
-static void sum_8(const uint8_t *a, const uint8_t *b, __m128i *sum) {
- const __m128i a_u8 = _mm_loadl_epi64((const __m128i *)a);
- const __m128i b_u8 = _mm_loadl_epi64((const __m128i *)b);
-
- const __m128i a_u16 = _mm_cvtepu8_epi16(a_u8);
- const __m128i b_u16 = _mm_cvtepu8_epi16(b_u8);
-
- const __m128i diff_s16 = _mm_sub_epi16(a_u16, b_u16);
- const __m128i diff_sq_u16 = _mm_mullo_epi16(diff_s16, diff_s16);
-
- // Shift all the values one place to the left/right so we can efficiently sum
- // diff_sq_u16[i - 1] + diff_sq_u16[i] + diff_sq_u16[i + 1].
- const __m128i shift_left = _mm_slli_si128(diff_sq_u16, 2);
- const __m128i shift_right = _mm_srli_si128(diff_sq_u16, 2);
-
- // It becomes necessary to treat the values as unsigned at this point. The
- // 255^2 fits in uint16_t but not int16_t. Use saturating adds from this point
- // forward since the filter is only applied to smooth small pixel changes.
- // Once the value has saturated to uint16_t it is well outside the useful
- // range.
- __m128i sum_u16 = _mm_adds_epu16(diff_sq_u16, shift_left);
- sum_u16 = _mm_adds_epu16(sum_u16, shift_right);
-
- *sum = sum_u16;
-}
-
-static void sum_16(const uint8_t *a, const uint8_t *b, __m128i *sum_0,
- __m128i *sum_1) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i a_u8 = _mm_loadu_si128((const __m128i *)a);
- const __m128i b_u8 = _mm_loadu_si128((const __m128i *)b);
-
- const __m128i a_0_u16 = _mm_cvtepu8_epi16(a_u8);
- const __m128i a_1_u16 = _mm_unpackhi_epi8(a_u8, zero);
- const __m128i b_0_u16 = _mm_cvtepu8_epi16(b_u8);
- const __m128i b_1_u16 = _mm_unpackhi_epi8(b_u8, zero);
-
- const __m128i diff_0_s16 = _mm_sub_epi16(a_0_u16, b_0_u16);
- const __m128i diff_1_s16 = _mm_sub_epi16(a_1_u16, b_1_u16);
- const __m128i diff_sq_0_u16 = _mm_mullo_epi16(diff_0_s16, diff_0_s16);
- const __m128i diff_sq_1_u16 = _mm_mullo_epi16(diff_1_s16, diff_1_s16);
-
- __m128i shift_left = _mm_slli_si128(diff_sq_0_u16, 2);
- // Use _mm_alignr_epi8() to "shift in" diff_sq_u16[8].
- __m128i shift_right = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 2);
-
- __m128i sum_u16 = _mm_adds_epu16(diff_sq_0_u16, shift_left);
- sum_u16 = _mm_adds_epu16(sum_u16, shift_right);
-
- *sum_0 = sum_u16;
-
- shift_left = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 14);
- shift_right = _mm_srli_si128(diff_sq_1_u16, 2);
-
- sum_u16 = _mm_adds_epu16(diff_sq_1_u16, shift_left);
- sum_u16 = _mm_adds_epu16(sum_u16, shift_right);
-
- *sum_1 = sum_u16;
-}
-
// Read in 8 pixels from a and b as 8-bit unsigned integers, compute the
// difference squared, and store as unsigned 16-bit integer to dst.
static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b,
@@ -312,148 +247,6 @@ static INLINE void get_sum_16(const uint16_t *y_dist, __m128i *sum_first,
get_sum_8(y_dist + 8, sum_second);
}
-void vp9_temporal_filter_apply_sse4_1(const uint8_t *a, unsigned int stride,
- const uint8_t *b, unsigned int width,
- unsigned int height, int strength,
- int weight, uint32_t *accumulator,
- uint16_t *count) {
- unsigned int h;
- const int rounding = (1 << strength) >> 1;
-
- assert(strength >= 0);
- assert(strength <= 6);
-
- assert(weight >= 0);
- assert(weight <= 2);
-
- assert(width == 8 || width == 16);
-
- if (width == 8) {
- __m128i sum_row_a, sum_row_b, sum_row_c;
- __m128i mul_constants = _mm_setr_epi16(
- NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
- NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
- NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
-
- sum_8(a, b, &sum_row_a);
- sum_8(a + stride, b + width, &sum_row_b);
- sum_row_c = _mm_adds_epu16(sum_row_a, sum_row_b);
- sum_row_c =
- average_8(sum_row_c, &mul_constants, strength, rounding, weight);
- accumulate_and_store_8(sum_row_c, b, count, accumulator);
-
- a += stride + stride;
- b += width;
- count += width;
- accumulator += width;
-
- mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9,
- NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
- NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
- NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6);
-
- for (h = 0; h < height - 2; ++h) {
- sum_8(a, b + width, &sum_row_c);
- sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b);
- sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_c);
- sum_row_a =
- average_8(sum_row_a, &mul_constants, strength, rounding, weight);
- accumulate_and_store_8(sum_row_a, b, count, accumulator);
-
- a += stride;
- b += width;
- count += width;
- accumulator += width;
-
- sum_row_a = sum_row_b;
- sum_row_b = sum_row_c;
- }
-
- mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6,
- NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
- NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
- NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
- sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b);
- sum_row_a =
- average_8(sum_row_a, &mul_constants, strength, rounding, weight);
- accumulate_and_store_8(sum_row_a, b, count, accumulator);
-
- } else { // width == 16
- __m128i sum_row_a_0, sum_row_a_1;
- __m128i sum_row_b_0, sum_row_b_1;
- __m128i sum_row_c_0, sum_row_c_1;
- __m128i mul_constants_0 = _mm_setr_epi16(
- NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
- NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
- NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6),
- mul_constants_1 = _mm_setr_epi16(
- NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
- NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
- NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
-
- sum_16(a, b, &sum_row_a_0, &sum_row_a_1);
- sum_16(a + stride, b + width, &sum_row_b_0, &sum_row_b_1);
-
- sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
- sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
-
- average_16(&sum_row_c_0, &sum_row_c_1, &mul_constants_0, &mul_constants_1,
- strength, rounding, weight);
- accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator);
-
- a += stride + stride;
- b += width;
- count += width;
- accumulator += width;
-
- mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9,
- NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
- NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
- NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9);
- mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
- NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
- NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
- NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6);
- for (h = 0; h < height - 2; ++h) {
- sum_16(a, b + width, &sum_row_c_0, &sum_row_c_1);
-
- sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
- sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_c_0);
- sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
- sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_c_1);
-
- average_16(&sum_row_a_0, &sum_row_a_1, &mul_constants_0, &mul_constants_1,
- strength, rounding, weight);
- accumulate_and_store_16(sum_row_a_0, sum_row_a_1, b, count, accumulator);
-
- a += stride;
- b += width;
- count += width;
- accumulator += width;
-
- sum_row_a_0 = sum_row_b_0;
- sum_row_a_1 = sum_row_b_1;
- sum_row_b_0 = sum_row_c_0;
- sum_row_b_1 = sum_row_c_1;
- }
-
- mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6,
- NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
- NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
- NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6);
- mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
- NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
- NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
- NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
- sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
- sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
-
- average_16(&sum_row_c_0, &sum_row_c_1, &mul_constants_0, &mul_constants_1,
- strength, rounding, weight);
- accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator);
- }
-}
-
// Read in a row of chroma values corresponds to a row of 16 luma values.
static INLINE void read_chroma_dist_row_16(int ss_x, const uint16_t *u_dist,
const uint16_t *v_dist,