diff options
author | Johann <johann.koenig@duck.com> | 2018-11-21 13:30:45 -0500 |
---|---|---|
committer | Johann <johann.koenig@duck.com> | 2018-11-28 11:30:37 -0500 |
commit | d566160f32580e5ef8fccbce8dde8495756a0620 (patch) | |
tree | f38b10bc839ba540085ff5a37501fc898d0e69b6 /vpx_dsp | |
parent | 0eeb7975128997b3f978d3a6c81d6e25b4d4a8c9 (diff) | |
download | libvpx-d566160f32580e5ef8fccbce8dde8495756a0620.tar libvpx-d566160f32580e5ef8fccbce8dde8495756a0620.tar.gz libvpx-d566160f32580e5ef8fccbce8dde8495756a0620.tar.bz2 libvpx-d566160f32580e5ef8fccbce8dde8495756a0620.zip |
quantize 32x32: fix dqcoeff
Calculate the high bits of dqcoeff and store them appropriately in high
bit depth builds.
Low bit depth builds still do not pass. C truncates the results after
division. X86 only supports packing with saturation at this step.
BUG=webm:1448
Change-Id: Ic80def575136c7ca37edf18d21e26925b475da98
Diffstat (limited to 'vpx_dsp')
-rw-r--r-- | vpx_dsp/vpx_dsp.mk | 1 | ||||
-rw-r--r-- | vpx_dsp/x86/quantize_avx.c | 44 | ||||
-rw-r--r-- | vpx_dsp/x86/quantize_sse2.h | 4 | ||||
-rw-r--r-- | vpx_dsp/x86/quantize_ssse3.c | 44 | ||||
-rw-r--r-- | vpx_dsp/x86/quantize_ssse3.h | 57 |
5 files changed, 78 insertions, 72 deletions
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 5ee3bfec7..2495db3f4 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -299,6 +299,7 @@ DSP_SRCS-yes += quantize.h DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.h DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.c +DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.h DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx.c DSP_SRCS-$(HAVE_NEON) += arm/quantize_neon.c DSP_SRCS-$(HAVE_VSX) += ppc/quantize_vsx.c diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c index 36345a6d2..0a91d36ea 100644 --- a/vpx_dsp/x86/quantize_avx.c +++ b/vpx_dsp/x86/quantize_avx.c @@ -18,6 +18,7 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" +#include "vpx_dsp/x86/quantize_ssse3.h" void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, @@ -229,27 +230,12 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(qcoeff0, qcoeff_ptr); store_tran_low(qcoeff1, qcoeff_ptr + 8); - // Un-sign to bias rounding like C. - // dequant is almost always negative, so this is probably the backwards way - // to handle the sign. However, it matches the previous assembly. - coeff0 = _mm_abs_epi16(qcoeff0); - coeff1 = _mm_abs_epi16(qcoeff1); - - coeff0 = calculate_dqcoeff(coeff0, dequant); + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, dqcoeff_ptr); dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = calculate_dqcoeff(coeff1, dequant); - - // "Divide" by 2. - coeff0 = _mm_srli_epi16(coeff0, 1); - coeff1 = _mm_srli_epi16(coeff1, 1); - - coeff0 = _mm_sign_epi16(coeff0, qcoeff0); - coeff1 = _mm_sign_epi16(coeff1, qcoeff1); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8); - store_tran_low(coeff0, dqcoeff_ptr); - store_tran_low(coeff1, dqcoeff_ptr + 8); - - eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = + scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); } // AC only loop. @@ -286,22 +272,12 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(qcoeff0, qcoeff_ptr + index); store_tran_low(qcoeff1, qcoeff_ptr + index + 8); - coeff0 = _mm_abs_epi16(qcoeff0); - coeff1 = _mm_abs_epi16(qcoeff1); + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, + dqcoeff_ptr + index); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, + dqcoeff_ptr + index + 8); - coeff0 = calculate_dqcoeff(coeff0, dequant); - coeff1 = calculate_dqcoeff(coeff1, dequant); - - coeff0 = _mm_srli_epi16(coeff0, 1); - coeff1 = _mm_srli_epi16(coeff1, 1); - - coeff0 = _mm_sign_epi16(coeff0, qcoeff0); - coeff1 = _mm_sign_epi16(coeff1, qcoeff1); - - store_tran_low(coeff0, dqcoeff_ptr + index); - store_tran_low(coeff1, dqcoeff_ptr + index + 8); - - eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index, + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); } diff --git a/vpx_dsp/x86/quantize_sse2.h b/vpx_dsp/x86/quantize_sse2.h index d93096700..afe2f924b 100644 --- a/vpx_dsp/x86/quantize_sse2.h +++ b/vpx_dsp/x86/quantize_sse2.h @@ -44,10 +44,6 @@ static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round, *coeff = _mm_mulhi_epi16(qcoeff, shift); } -static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) { - return _mm_mullo_epi16(qcoeff, dequant); -} - static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant, tran_low_t *dqcoeff) { #if CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index 225e33ed9..fc1d91959 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -15,6 +15,7 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" +#include "vpx_dsp/x86/quantize_ssse3.h" void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, @@ -201,27 +202,12 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(qcoeff0, qcoeff_ptr); store_tran_low(qcoeff1, qcoeff_ptr + 8); - // Un-sign to bias rounding like C. - // dequant is almost always negative, so this is probably the backwards way - // to handle the sign. However, it matches the previous assembly. - coeff0 = _mm_abs_epi16(qcoeff0); - coeff1 = _mm_abs_epi16(qcoeff1); - - coeff0 = calculate_dqcoeff(coeff0, dequant); + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, dqcoeff_ptr); dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = calculate_dqcoeff(coeff1, dequant); - - // "Divide" by 2. - coeff0 = _mm_srli_epi16(coeff0, 1); - coeff1 = _mm_srli_epi16(coeff1, 1); - - coeff0 = _mm_sign_epi16(coeff0, qcoeff0); - coeff1 = _mm_sign_epi16(coeff1, qcoeff1); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8); - store_tran_low(coeff0, dqcoeff_ptr); - store_tran_low(coeff1, dqcoeff_ptr + 8); - - eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = + scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); } // AC only loop. @@ -262,22 +248,12 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(qcoeff0, qcoeff_ptr + index); store_tran_low(qcoeff1, qcoeff_ptr + index + 8); - coeff0 = _mm_abs_epi16(qcoeff0); - coeff1 = _mm_abs_epi16(qcoeff1); - - coeff0 = calculate_dqcoeff(coeff0, dequant); - coeff1 = calculate_dqcoeff(coeff1, dequant); + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, + dqcoeff_ptr + index); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, + dqcoeff_ptr + 8 + index); - coeff0 = _mm_srli_epi16(coeff0, 1); - coeff1 = _mm_srli_epi16(coeff1, 1); - - coeff0 = _mm_sign_epi16(coeff0, qcoeff0); - coeff1 = _mm_sign_epi16(coeff1, qcoeff1); - - store_tran_low(coeff0, dqcoeff_ptr + index); - store_tran_low(coeff1, dqcoeff_ptr + index + 8); - - eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index, + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); } diff --git a/vpx_dsp/x86/quantize_ssse3.h b/vpx_dsp/x86/quantize_ssse3.h new file mode 100644 index 000000000..35223d7b4 --- /dev/null +++ b/vpx_dsp/x86/quantize_ssse3.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_X86_QUANTIZE_SSSE3_H_ +#define VPX_VPX_DSP_X86_QUANTIZE_SSSE3_H_ + +#include <emmintrin.h> + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/quantize_sse2.h" + +static INLINE void calculate_dqcoeff_and_store_32x32(const __m128i qcoeff, + const __m128i dequant, + const __m128i zero, + tran_low_t *dqcoeff) { + // Un-sign to bias rounding like C. + const __m128i coeff = _mm_abs_epi16(qcoeff); + +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i sign_0 = _mm_unpacklo_epi16(zero, qcoeff); + const __m128i sign_1 = _mm_unpackhi_epi16(zero, qcoeff); + + const __m128i low = _mm_mullo_epi16(coeff, dequant); + const __m128i high = _mm_mulhi_epi16(coeff, dequant); + __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high); + __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high); + + // "Divide" by 2. + dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, 1); + dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, 1); + + dqcoeff32_0 = _mm_sign_epi32(dqcoeff32_0, sign_0); + dqcoeff32_1 = _mm_sign_epi32(dqcoeff32_1, sign_1); + + _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0); + _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1); +#else + __m128i dqcoeff16 = _mm_mullo_epi16(coeff, dequant); + (void)zero; + + dqcoeff16 = _mm_srli_epi16(dqcoeff16, 1); + + dqcoeff16 = _mm_sign_epi16(dqcoeff16, qcoeff); + + _mm_store_si128((__m128i *)(dqcoeff), dqcoeff16); +#endif // CONFIG_VP9_HIGHBITDEPTH +} + +#endif // VPX_VPX_DSP_X86_QUANTIZE_SSSE3_H_ |