author     Yunqing Wang <yunqingwang@google.com>    2023-06-12 16:44:21 +0000
committer  Gerrit Code Review <noreply-gerritcodereview@google.com>    2023-06-12 16:44:21 +0000
commit     8789421bf3ed15cd86b18b4bb8f0917fda0cccd7 (patch)
tree       9a3145b990b8b0d21c098eee0bb22ffd85d1b52c
parent     bdb8ccc0af49a87c9f4ee08f1d363ceec347ab6e (diff)
parent     8c308aefea7c58a1a979b81f4aa6d68908e379ee (diff)
Merge "Fix c vs intrinsic mismatch of vpx_hadamard_32x32() function" into main
-rw-r--r--  test/hadamard_test.cc           27
-rw-r--r--  vpx_dsp/avg.c                    8
-rw-r--r--  vpx_dsp/x86/avg_intrin_avx2.c   53
-rw-r--r--  vpx_dsp/x86/avg_intrin_sse2.c   53
4 files changed, 121 insertions(+), 20 deletions(-)
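The root cause: the C reference forms the final 32x32 combine with intermediates in (at least) 32-bit arithmetic, while the SSE2/AVX2 paths added the 16x16 Hadamard outputs with plain 16-bit adds (_mm_add_epi16 / _mm256_add_epi16), which wrap modulo 2^16 once the sum leaves the int16_t range. A minimal scalar sketch of the two behaviours, using illustrative values rather than code taken from the library:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* 16x16 Hadamard outputs can reach +/-32640 for all-255 residuals. */
  const int16_t a0 = 32640, a1 = 32640;

  /* C reference path: the sum is formed in (at least) 32-bit arithmetic. */
  const int32_t ref = ((int32_t)a0 + a1) >> 2;            /* 65280 >> 2 = 16320 */

  /* Old intrinsic path: _mm_add_epi16 wraps modulo 2^16 before the shift. */
  const int16_t wrapped = (int16_t)((a0 + a1) & 0xFFFF);  /* 65280 -> -256 */
  const int32_t old_simd = wrapped >> 2;                   /* -64 */

  printf("ref=%d old_simd=%d\n", (int)ref, (int)old_simd);
  return 0;
}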
diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc
index 9f6c99f3c..0de6622e2 100644
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@@ -170,6 +170,31 @@ class HadamardTestBase : public ::testing::TestWithParam<HadamardFuncWithSize> {
EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
}
+ void ExtremeValuesTest() {
+ const int kMaxBlockSize = 32 * 32;
+ DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxBlockSize]);
+ DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]);
+ memset(b, 0, sizeof(b));
+
+ tran_low_t b_ref[kMaxBlockSize];
+ memset(b_ref, 0, sizeof(b_ref));
+
+ for (int i = 0; i < 2; ++i) {
+ // Initialize a test block with every sample at the extreme value (+255 or -255).
+ const int sign = (i == 0) ? 1 : -1;
+ for (int j = 0; j < kMaxBlockSize; ++j)
+ input_extreme_block[j] = sign * 255;
+
+ ReferenceHadamard(input_extreme_block, bwh_, b_ref, bwh_);
+ ASM_REGISTER_STATE_CHECK(h_func_(input_extreme_block, bwh_, b));
+
+ // The order of the output is not important. Sort before checking.
+ std::sort(b, b + block_size_);
+ std::sort(b_ref, b_ref + block_size_);
+ EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
+ }
+ }
+
void VaryStride() {
const int kMaxBlockSize = 32 * 32;
DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize * 8]);
@@ -225,6 +250,8 @@ class HadamardLowbdTest : public HadamardTestBase {
TEST_P(HadamardLowbdTest, CompareReferenceRandom) { CompareReferenceRandom(); }
+TEST_P(HadamardLowbdTest, ExtremeValuesTest) { ExtremeValuesTest(); }
+
TEST_P(HadamardLowbdTest, VaryStride) { VaryStride(); }
TEST_P(HadamardLowbdTest, DISABLED_Speed) {
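Why the new test uses all-extreme inputs: with every residual at 255, the 8x8 Hadamard DC term reaches 64 * 255 = 16320, the 16x16 combine doubles it to 32640, and the first add of the 32x32 combine hits 65280, beyond INT16_MAX, so the old 16-bit intrinsic adds wrap exactly at this stage. CompareReferenceRandom rarely drives every coefficient that far, so ExtremeValuesTest pins the worst case down deterministically for both signs.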
diff --git a/vpx_dsp/avg.c b/vpx_dsp/avg.c
index 391e9eb14..a8dcab7da 100644
--- a/vpx_dsp/avg.c
+++ b/vpx_dsp/avg.c
@@ -295,19 +295,19 @@ void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride,
vpx_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256);
}
- // coeff: 15 bit, dynamic range [-16320, 16320]
+ // coeff: 16 bit, dynamic range [-32768, 32767]
for (idx = 0; idx < 256; ++idx) {
tran_low_t a0 = coeff[0];
tran_low_t a1 = coeff[256];
tran_low_t a2 = coeff[512];
tran_low_t a3 = coeff[768];
- tran_low_t b0 = (a0 + a1) >> 2; // (a0 + a1): 16 bit, [-32640, 32640]
+ tran_low_t b0 = (a0 + a1) >> 2; // (a0 + a1): 17 bit, [-65536, 65535]
tran_low_t b1 = (a0 - a1) >> 2; // b0-b3: 15 bit, dynamic range
- tran_low_t b2 = (a2 + a3) >> 2; // [-16320, 16320]
+ tran_low_t b2 = (a2 + a3) >> 2; // [-16384, 16383]
tran_low_t b3 = (a2 - a3) >> 2;
- coeff[0] = b0 + b2; // 16 bit, [-32640, 32640]
+ coeff[0] = b0 + b2; // 16 bit, [-32768, 32767]
coeff[256] = b1 + b3;
coeff[512] = b0 - b2;
coeff[768] = b1 - b3;
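Note that vpx_dsp/avg.c only needs its dynamic-range comments corrected: the C combine already computes a0 + a1 with the operands promoted to at least 32 bits before the shift, so it never wraps, and the values written back to coeff stay within int16_t range. The functional change is confined to the SSE2 and AVX2 files below.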
diff --git a/vpx_dsp/x86/avg_intrin_avx2.c b/vpx_dsp/x86/avg_intrin_avx2.c
index b2e01319d..61e4e73c5 100644
--- a/vpx_dsp/x86/avg_intrin_avx2.c
+++ b/vpx_dsp/x86/avg_intrin_avx2.c
@@ -218,6 +218,14 @@ void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff,
}
#endif // CONFIG_VP9_HIGHBITDEPTH
+static INLINE void sign_extend_16bit_to_32bit_avx2(__m256i in, __m256i zero,
+ __m256i *out_lo,
+ __m256i *out_hi) {
+ const __m256i sign_bits = _mm256_cmpgt_epi16(zero, in);
+ *out_lo = _mm256_unpacklo_epi16(in, sign_bits);
+ *out_hi = _mm256_unpackhi_epi16(in, sign_bits);
+}
+
static void hadamard_col8x2_avx2(__m256i *in, int iter) {
__m256i a0 = in[0];
__m256i a1 = in[1];
@@ -400,6 +408,12 @@ void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
int16_t *t_coeff = coeff;
#endif
int idx;
+ __m256i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo,
+ b3_lo;
+ __m256i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi,
+ b3_hi;
+ __m256i b0, b1, b2, b3;
+ const __m256i zero = _mm256_setzero_si256();
for (idx = 0; idx < 4; ++idx) {
// src_diff: 9 bit, dynamic range [-255, 255]
const int16_t *src_ptr =
@@ -414,15 +428,38 @@ void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));
- __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
- __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
- __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
- __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);
+ // Sign extend 16 bit to 32 bit.
+ sign_extend_16bit_to_32bit_avx2(coeff0, zero, &coeff0_lo, &coeff0_hi);
+ sign_extend_16bit_to_32bit_avx2(coeff1, zero, &coeff1_lo, &coeff1_hi);
+ sign_extend_16bit_to_32bit_avx2(coeff2, zero, &coeff2_lo, &coeff2_hi);
+ sign_extend_16bit_to_32bit_avx2(coeff3, zero, &coeff3_lo, &coeff3_hi);
+
+ b0_lo = _mm256_add_epi32(coeff0_lo, coeff1_lo);
+ b0_hi = _mm256_add_epi32(coeff0_hi, coeff1_hi);
+
+ b1_lo = _mm256_sub_epi32(coeff0_lo, coeff1_lo);
+ b1_hi = _mm256_sub_epi32(coeff0_hi, coeff1_hi);
+
+ b2_lo = _mm256_add_epi32(coeff2_lo, coeff3_lo);
+ b2_hi = _mm256_add_epi32(coeff2_hi, coeff3_hi);
+
+ b3_lo = _mm256_sub_epi32(coeff2_lo, coeff3_lo);
+ b3_hi = _mm256_sub_epi32(coeff2_hi, coeff3_hi);
+
+ b0_lo = _mm256_srai_epi32(b0_lo, 2);
+ b1_lo = _mm256_srai_epi32(b1_lo, 2);
+ b2_lo = _mm256_srai_epi32(b2_lo, 2);
+ b3_lo = _mm256_srai_epi32(b3_lo, 2);
+
+ b0_hi = _mm256_srai_epi32(b0_hi, 2);
+ b1_hi = _mm256_srai_epi32(b1_hi, 2);
+ b2_hi = _mm256_srai_epi32(b2_hi, 2);
+ b3_hi = _mm256_srai_epi32(b3_hi, 2);
- b0 = _mm256_srai_epi16(b0, 2);
- b1 = _mm256_srai_epi16(b1, 2);
- b2 = _mm256_srai_epi16(b2, 2);
- b3 = _mm256_srai_epi16(b3, 2);
+ b0 = _mm256_packs_epi32(b0_lo, b0_hi);
+ b1 = _mm256_packs_epi32(b1_lo, b1_hi);
+ b2 = _mm256_packs_epi32(b2_lo, b2_hi);
+ b3 = _mm256_packs_epi32(b3_lo, b3_hi);
store_tran_low(_mm256_add_epi16(b0, b2), coeff);
store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256);
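In scalar terms, the corrected SIMD flow for each group of four 16x16 coefficients is: sign-extend to 32 bits, add/subtract, arithmetic-shift right by 2, then pack back to 16 bits with signed saturation. After the shift the values are back within +/-16384, so neither the saturating pack nor the final 16-bit adds can overflow. A rough scalar equivalent, as a sketch only (combine_stage3 is a hypothetical helper, not part of the patch):

#include <stdint.h>

/* Scalar sketch of the widened third combine stage used by the fixed
 * SSE2/AVX2 paths. */
static void combine_stage3(int16_t a0, int16_t a1, int16_t a2, int16_t a3,
                           int16_t out[4]) {
  /* Widen to 32 bits so the 17-bit intermediate sums cannot wrap. */
  const int32_t b0 = ((int32_t)a0 + a1) >> 2;
  const int32_t b1 = ((int32_t)a0 - a1) >> 2;
  const int32_t b2 = ((int32_t)a2 + a3) >> 2;
  const int32_t b3 = ((int32_t)a2 - a3) >> 2;

  /* Narrow back to 16 bits; the final sums stay within +/-32640. */
  out[0] = (int16_t)(b0 + b2);
  out[1] = (int16_t)(b1 + b3);
  out[2] = (int16_t)(b0 - b2);
  out[3] = (int16_t)(b1 - b3);
}

Because the 16-to-32-bit widening is done with unpacklo/unpackhi inside each 128-bit lane, the in-lane _mm256_packs_epi32 (and _mm_packs_epi32) restores the original element order, so no extra permute is needed before the final 16-bit adds and stores.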
diff --git a/vpx_dsp/x86/avg_intrin_sse2.c b/vpx_dsp/x86/avg_intrin_sse2.c
index 015c11a1f..4447dfab7 100644
--- a/vpx_dsp/x86/avg_intrin_sse2.c
+++ b/vpx_dsp/x86/avg_intrin_sse2.c
@@ -15,6 +15,14 @@
#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
#include "vpx_ports/mem.h"
+static INLINE void sign_extend_16bit_to_32bit_sse2(__m128i in, __m128i zero,
+ __m128i *out_lo,
+ __m128i *out_hi) {
+ const __m128i sign_bits = _mm_cmplt_epi16(in, zero);
+ *out_lo = _mm_unpacklo_epi16(in, sign_bits);
+ *out_hi = _mm_unpackhi_epi16(in, sign_bits);
+}
+
void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
int *min, int *max) {
__m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
@@ -400,6 +408,12 @@ void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
int16_t *t_coeff = coeff;
#endif
int idx;
+ __m128i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo,
+ b3_lo;
+ __m128i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi,
+ b3_hi;
+ __m128i b0, b1, b2, b3;
+ const __m128i zero = _mm_setzero_si128();
for (idx = 0; idx < 4; ++idx) {
const int16_t *src_ptr =
src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
@@ -413,15 +427,38 @@ void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
__m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512));
__m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768));
- __m128i b0 = _mm_add_epi16(coeff0, coeff1);
- __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
- __m128i b2 = _mm_add_epi16(coeff2, coeff3);
- __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
+ // Sign extend 16 bit to 32 bit.
+ sign_extend_16bit_to_32bit_sse2(coeff0, zero, &coeff0_lo, &coeff0_hi);
+ sign_extend_16bit_to_32bit_sse2(coeff1, zero, &coeff1_lo, &coeff1_hi);
+ sign_extend_16bit_to_32bit_sse2(coeff2, zero, &coeff2_lo, &coeff2_hi);
+ sign_extend_16bit_to_32bit_sse2(coeff3, zero, &coeff3_lo, &coeff3_hi);
+
+ b0_lo = _mm_add_epi32(coeff0_lo, coeff1_lo);
+ b0_hi = _mm_add_epi32(coeff0_hi, coeff1_hi);
+
+ b1_lo = _mm_sub_epi32(coeff0_lo, coeff1_lo);
+ b1_hi = _mm_sub_epi32(coeff0_hi, coeff1_hi);
+
+ b2_lo = _mm_add_epi32(coeff2_lo, coeff3_lo);
+ b2_hi = _mm_add_epi32(coeff2_hi, coeff3_hi);
+
+ b3_lo = _mm_sub_epi32(coeff2_lo, coeff3_lo);
+ b3_hi = _mm_sub_epi32(coeff2_hi, coeff3_hi);
+
+ b0_lo = _mm_srai_epi32(b0_lo, 2);
+ b1_lo = _mm_srai_epi32(b1_lo, 2);
+ b2_lo = _mm_srai_epi32(b2_lo, 2);
+ b3_lo = _mm_srai_epi32(b3_lo, 2);
+
+ b0_hi = _mm_srai_epi32(b0_hi, 2);
+ b1_hi = _mm_srai_epi32(b1_hi, 2);
+ b2_hi = _mm_srai_epi32(b2_hi, 2);
+ b3_hi = _mm_srai_epi32(b3_hi, 2);
- b0 = _mm_srai_epi16(b0, 2);
- b1 = _mm_srai_epi16(b1, 2);
- b2 = _mm_srai_epi16(b2, 2);
- b3 = _mm_srai_epi16(b3, 2);
+ b0 = _mm_packs_epi32(b0_lo, b0_hi);
+ b1 = _mm_packs_epi32(b1_lo, b1_hi);
+ b2 = _mm_packs_epi32(b2_lo, b2_hi);
+ b3 = _mm_packs_epi32(b3_lo, b3_hi);
coeff0 = _mm_add_epi16(b0, b2);
coeff1 = _mm_add_epi16(b1, b3);