From 8c308aefea7c58a1a979b81f4aa6d68908e379ee Mon Sep 17 00:00:00 2001
From: Anupam Pandey
Date: Tue, 6 Jun 2023 12:27:34 +0530
Subject: Fix C vs intrinsic mismatch of vpx_hadamard_32x32() function

This CL resolves the mismatch between the C and intrinsic
implementations of the vpx_hadamard_32x32() function. The mismatch
was due to integer overflow during the addition operation in the
intrinsic functions: the addition was performed at the 16-bit
level, while the calculation of a0 + a1 produces a 17-bit value.

This change addresses the problem by performing the addition at
the 32-bit level (with sign extension) in both SSE2 and AVX2, and
then converting the results back to 16 bits after the right shift.

STATS_CHANGED

Change-Id: I576ca64e3b9ebb31d143fcd2da64322790bc5853
---
 test/hadamard_test.cc         | 27 ++++++++++++++++++++++
 vpx_dsp/avg.c                 |  8 +++----
 vpx_dsp/x86/avg_intrin_avx2.c | 53 ++++++++++++++++++++++++++++++++++++-------
 vpx_dsp/x86/avg_intrin_sse2.c | 53 ++++++++++++++++++++++++++++++++++++-------
 4 files changed, 121 insertions(+), 20 deletions(-)

diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc
index 9f6c99f3c..0de6622e2 100644
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@@ -170,6 +170,31 @@ class HadamardTestBase : public ::testing::TestWithParam<HadamardFuncWithSize> {
     EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
   }
 
+  void ExtremeValuesTest() {
+    const int kMaxBlockSize = 32 * 32;
+    DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxBlockSize]);
+    DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]);
+    memset(b, 0, sizeof(b));
+
+    tran_low_t b_ref[kMaxBlockSize];
+    memset(b_ref, 0, sizeof(b_ref));
+
+    for (int i = 0; i < 2; ++i) {
+      // Initialize a test block with input range [-255, 255].
+      const int sign = (i == 0) ? 1 : -1;
+      for (int j = 0; j < kMaxBlockSize; ++j)
+        input_extreme_block[j] = sign * 255;
+
+      ReferenceHadamard(input_extreme_block, bwh_, b_ref, bwh_);
+      ASM_REGISTER_STATE_CHECK(h_func_(input_extreme_block, bwh_, b));
+
+      // The order of the output is not important. Sort before checking.
+      std::sort(b, b + block_size_);
+      std::sort(b_ref, b_ref + block_size_);
+      EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
+    }
+  }
+
   void VaryStride() {
     const int kMaxBlockSize = 32 * 32;
     DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize * 8]);
@@ -225,6 +250,8 @@ class HadamardLowbdTest : public HadamardTestBase {
 
 TEST_P(HadamardLowbdTest, CompareReferenceRandom) { CompareReferenceRandom(); }
 
+TEST_P(HadamardLowbdTest, ExtremeValuesTest) { ExtremeValuesTest(); }
+
 TEST_P(HadamardLowbdTest, VaryStride) { VaryStride(); }
 
 TEST_P(HadamardLowbdTest, DISABLED_Speed) {
diff --git a/vpx_dsp/avg.c b/vpx_dsp/avg.c
index 391e9eb14..a8dcab7da 100644
--- a/vpx_dsp/avg.c
+++ b/vpx_dsp/avg.c
@@ -295,19 +295,19 @@ void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride,
     vpx_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256);
   }
 
-  // coeff: 15 bit, dynamic range [-16320, 16320]
+  // coeff: 16 bit, dynamic range [-32768, 32767]
   for (idx = 0; idx < 256; ++idx) {
     tran_low_t a0 = coeff[0];
     tran_low_t a1 = coeff[256];
     tran_low_t a2 = coeff[512];
     tran_low_t a3 = coeff[768];
 
-    tran_low_t b0 = (a0 + a1) >> 2;  // (a0 + a1): 16 bit, [-32640, 32640]
+    tran_low_t b0 = (a0 + a1) >> 2;  // (a0 + a1): 17 bit, [-65536, 65535]
     tran_low_t b1 = (a0 - a1) >> 2;  // b0-b3: 15 bit, dynamic range
-    tran_low_t b2 = (a2 + a3) >> 2;  // [-16320, 16320]
+    tran_low_t b2 = (a2 + a3) >> 2;  // [-16384, 16383]
     tran_low_t b3 = (a2 - a3) >> 2;
 
-    coeff[0] = b0 + b2;  // 16 bit, [-32640, 32640]
+    coeff[0] = b0 + b2;  // 16 bit, [-32768, 32767]
     coeff[256] = b1 + b3;
     coeff[512] = b0 - b2;
     coeff[768] = b1 - b3;
diff --git a/vpx_dsp/x86/avg_intrin_avx2.c b/vpx_dsp/x86/avg_intrin_avx2.c
index b2e01319d..61e4e73c5 100644
--- a/vpx_dsp/x86/avg_intrin_avx2.c
+++ b/vpx_dsp/x86/avg_intrin_avx2.c
@@ -218,6 +218,14 @@ void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff,
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
+static INLINE void sign_extend_16bit_to_32bit_avx2(__m256i in, __m256i zero,
+                                                   __m256i *out_lo,
+                                                   __m256i *out_hi) {
+  const __m256i sign_bits = _mm256_cmpgt_epi16(zero, in);
+  *out_lo = _mm256_unpacklo_epi16(in, sign_bits);
+  *out_hi = _mm256_unpackhi_epi16(in, sign_bits);
+}
+
 static void hadamard_col8x2_avx2(__m256i *in, int iter) {
   __m256i a0 = in[0];
   __m256i a1 = in[1];
@@ -400,6 +408,12 @@ void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
   int16_t *t_coeff = coeff;
 #endif
   int idx;
+  __m256i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo,
+      b3_lo;
+  __m256i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi,
+      b3_hi;
+  __m256i b0, b1, b2, b3;
+  const __m256i zero = _mm256_setzero_si256();
   for (idx = 0; idx < 4; ++idx) {
     // src_diff: 9 bit, dynamic range [-255, 255]
     const int16_t *src_ptr =
@@ -414,15 +428,38 @@ void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
     const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
     const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));
 
-    __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
-    __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
-    __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
-    __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);
+    // Sign extend 16 bit to 32 bit.
+    sign_extend_16bit_to_32bit_avx2(coeff0, zero, &coeff0_lo, &coeff0_hi);
+    sign_extend_16bit_to_32bit_avx2(coeff1, zero, &coeff1_lo, &coeff1_hi);
+    sign_extend_16bit_to_32bit_avx2(coeff2, zero, &coeff2_lo, &coeff2_hi);
+    sign_extend_16bit_to_32bit_avx2(coeff3, zero, &coeff3_lo, &coeff3_hi);
+
+    b0_lo = _mm256_add_epi32(coeff0_lo, coeff1_lo);
+    b0_hi = _mm256_add_epi32(coeff0_hi, coeff1_hi);
+
+    b1_lo = _mm256_sub_epi32(coeff0_lo, coeff1_lo);
+    b1_hi = _mm256_sub_epi32(coeff0_hi, coeff1_hi);
+
+    b2_lo = _mm256_add_epi32(coeff2_lo, coeff3_lo);
+    b2_hi = _mm256_add_epi32(coeff2_hi, coeff3_hi);
+
+    b3_lo = _mm256_sub_epi32(coeff2_lo, coeff3_lo);
+    b3_hi = _mm256_sub_epi32(coeff2_hi, coeff3_hi);
+
+    b0_lo = _mm256_srai_epi32(b0_lo, 2);
+    b1_lo = _mm256_srai_epi32(b1_lo, 2);
+    b2_lo = _mm256_srai_epi32(b2_lo, 2);
+    b3_lo = _mm256_srai_epi32(b3_lo, 2);
+
+    b0_hi = _mm256_srai_epi32(b0_hi, 2);
+    b1_hi = _mm256_srai_epi32(b1_hi, 2);
+    b2_hi = _mm256_srai_epi32(b2_hi, 2);
+    b3_hi = _mm256_srai_epi32(b3_hi, 2);
 
-    b0 = _mm256_srai_epi16(b0, 2);
-    b1 = _mm256_srai_epi16(b1, 2);
-    b2 = _mm256_srai_epi16(b2, 2);
-    b3 = _mm256_srai_epi16(b3, 2);
+    b0 = _mm256_packs_epi32(b0_lo, b0_hi);
+    b1 = _mm256_packs_epi32(b1_lo, b1_hi);
+    b2 = _mm256_packs_epi32(b2_lo, b2_hi);
+    b3 = _mm256_packs_epi32(b3_lo, b3_hi);
 
     store_tran_low(_mm256_add_epi16(b0, b2), coeff);
     store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256);
diff --git a/vpx_dsp/x86/avg_intrin_sse2.c b/vpx_dsp/x86/avg_intrin_sse2.c
index 015c11a1f..4447dfab7 100644
--- a/vpx_dsp/x86/avg_intrin_sse2.c
+++ b/vpx_dsp/x86/avg_intrin_sse2.c
@@ -15,6 +15,14 @@
 #include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
 #include "vpx_ports/mem.h"
 
+static INLINE void sign_extend_16bit_to_32bit_sse2(__m128i in, __m128i zero,
+                                                   __m128i *out_lo,
+                                                   __m128i *out_hi) {
+  const __m128i sign_bits = _mm_cmplt_epi16(in, zero);
+  *out_lo = _mm_unpacklo_epi16(in, sign_bits);
+  *out_hi = _mm_unpackhi_epi16(in, sign_bits);
+}
+
 void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
                          int *min, int *max) {
   __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
@@ -400,6 +408,12 @@ void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
   int16_t *t_coeff = coeff;
 #endif
   int idx;
+  __m128i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo,
+      b3_lo;
+  __m128i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi,
+      b3_hi;
+  __m128i b0, b1, b2, b3;
+  const __m128i zero = _mm_setzero_si128();
   for (idx = 0; idx < 4; ++idx) {
     const int16_t *src_ptr =
         src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
@@ -413,15 +427,38 @@ void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
     __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512));
     __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768));
 
-    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
-    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
-    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
-    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
+    // Sign extend 16 bit to 32 bit.
+    sign_extend_16bit_to_32bit_sse2(coeff0, zero, &coeff0_lo, &coeff0_hi);
+    sign_extend_16bit_to_32bit_sse2(coeff1, zero, &coeff1_lo, &coeff1_hi);
+    sign_extend_16bit_to_32bit_sse2(coeff2, zero, &coeff2_lo, &coeff2_hi);
+    sign_extend_16bit_to_32bit_sse2(coeff3, zero, &coeff3_lo, &coeff3_hi);
+
+    b0_lo = _mm_add_epi32(coeff0_lo, coeff1_lo);
+    b0_hi = _mm_add_epi32(coeff0_hi, coeff1_hi);
+
+    b1_lo = _mm_sub_epi32(coeff0_lo, coeff1_lo);
+    b1_hi = _mm_sub_epi32(coeff0_hi, coeff1_hi);
+
+    b2_lo = _mm_add_epi32(coeff2_lo, coeff3_lo);
+    b2_hi = _mm_add_epi32(coeff2_hi, coeff3_hi);
+
+    b3_lo = _mm_sub_epi32(coeff2_lo, coeff3_lo);
+    b3_hi = _mm_sub_epi32(coeff2_hi, coeff3_hi);
+
+    b0_lo = _mm_srai_epi32(b0_lo, 2);
+    b1_lo = _mm_srai_epi32(b1_lo, 2);
+    b2_lo = _mm_srai_epi32(b2_lo, 2);
+    b3_lo = _mm_srai_epi32(b3_lo, 2);
+
+    b0_hi = _mm_srai_epi32(b0_hi, 2);
+    b1_hi = _mm_srai_epi32(b1_hi, 2);
+    b2_hi = _mm_srai_epi32(b2_hi, 2);
+    b3_hi = _mm_srai_epi32(b3_hi, 2);
 
-    b0 = _mm_srai_epi16(b0, 2);
-    b1 = _mm_srai_epi16(b1, 2);
-    b2 = _mm_srai_epi16(b2, 2);
-    b3 = _mm_srai_epi16(b3, 2);
+    b0 = _mm_packs_epi32(b0_lo, b0_hi);
+    b1 = _mm_packs_epi32(b1_lo, b1_hi);
+    b2 = _mm_packs_epi32(b2_lo, b2_hi);
+    b3 = _mm_packs_epi32(b3_lo, b3_hi);
 
     coeff0 = _mm_add_epi16(b0, b2);
     coeff1 = _mm_add_epi16(b1, b3);
-- 
cgit v1.2.3
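
The overflow is easy to reproduce in scalar C. The following standalone
sketch is not part of the patch, and the coefficient values are
hypothetical, chosen so that the sum needs 17 bits. It shows how the old
16-bit addition wraps before the >> 2 shift, while the widened 32-bit
addition used in the fix preserves the value:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  // Two 16-bit Hadamard coefficients whose sum (40000) needs 17 bits.
  const int16_t a0 = 20000;
  const int16_t a1 = 20000;

  // 16-bit addition, as the old epi16 intrinsics performed it:
  // the sum wraps modulo 2^16 to -25536 before the shift.
  const int16_t wrapped = (int16_t)(a0 + a1);
  const int16_t bad = (int16_t)(wrapped >> 2);  // -6384

  // 32-bit addition, as the C code and the fixed intrinsics perform it:
  // the 17-bit sum survives, and after >> 2 the result fits in 16 bits.
  const int32_t widened = (int32_t)a0 + (int32_t)a1;
  const int16_t good = (int16_t)(widened >> 2);  // 10000

  printf("16-bit add: %d, 32-bit add: %d\n", bad, good);
  return 0;
}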
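
The sign_extend_16bit_to_32bit_{sse2,avx2} helpers above build a mask that
is all ones in every negative 16-bit lane and interleave the input with it,
so each lane becomes its 32-bit sign extension. Below is a self-contained
usage sketch of the SSE2 variant; the driver in main() and its test values
are illustrative only:

#include <emmintrin.h>
#include <stdio.h>

static void sign_extend_16bit_to_32bit(__m128i in, __m128i *out_lo,
                                       __m128i *out_hi) {
  // 0xFFFF in lanes where in < 0, 0x0000 elsewhere.
  const __m128i sign_bits = _mm_cmplt_epi16(in, _mm_setzero_si128());
  // Interleave each 16-bit value (low half) with its sign mask (high half)
  // to form the 32-bit sign extension of every lane.
  *out_lo = _mm_unpacklo_epi16(in, sign_bits);
  *out_hi = _mm_unpackhi_epi16(in, sign_bits);
}

int main(void) {
  const int16_t src[8] = { -3, 4, -32768, 32767, 0, -1, 300, -300 };
  int32_t dst[8];
  __m128i lo, hi;

  sign_extend_16bit_to_32bit(_mm_loadu_si128((const __m128i *)src), &lo, &hi);
  _mm_storeu_si128((__m128i *)dst, lo);
  _mm_storeu_si128((__m128i *)(dst + 4), hi);

  // Prints the source values unchanged: -3 4 -32768 32767 0 -1 300 -300
  for (int i = 0; i < 8; ++i) printf("%d ", (int)dst[i]);
  printf("\n");
  return 0;
}

Packing back down with _mm_packs_epi32/_mm256_packs_epi32 saturates to the
16-bit range, which is safe here because each value has already been shifted
right by 2 and fits in 15 bits; the AVX2 pack also interleaves its 128-bit
lanes, but as the test comment above notes, the order of the Hadamard output
is not significant.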