From cf35ffc025086588db5ccab977a94a325a50f949 Mon Sep 17 00:00:00 2001 From: Johann Date: Thu, 3 Nov 2016 15:21:32 -0700 Subject: Extract high bit depth helper functions These can be used in the vp9 fdct as well. Change-Id: I4f3875e0cba1b8cad209c3a0581e121deba7675e --- vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/x86/fdct.h | 46 +++++++++++++++++++++++++++++++++++ vpx_dsp/x86/quantize_sse2.c | 59 +++++++++++++-------------------------------- 3 files changed, 64 insertions(+), 42 deletions(-) create mode 100644 vpx_dsp/x86/fdct.h diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 8c91b141f..2909beb0f 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -252,6 +252,7 @@ ifeq ($(CONFIG_VP9_ENCODER),yes) DSP_SRCS-yes += quantize.c DSP_SRCS-yes += quantize.h +DSP_SRCS-$(HAVE_SSE2) += x86/fdct.h DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c diff --git a/vpx_dsp/x86/fdct.h b/vpx_dsp/x86/fdct.h new file mode 100644 index 000000000..cad73f82a --- /dev/null +++ b/vpx_dsp/x86/fdct.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_DSP_X86_FDCT_H_ +#define VPX_DSP_X86_FDCT_H_ + +#include <emmintrin.h> + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" + +// Load 8 16 bit values. If the source is 32 bits then cast down. +// This does not saturate values. It only truncates. 
+static INLINE __m128i load_tran_low(const tran_low_t *a) { +#if CONFIG_VP9_HIGHBITDEPTH + return _mm_setr_epi16((int16_t)a[0], (int16_t)a[1], (int16_t)a[2], + (int16_t)a[3], (int16_t)a[4], (int16_t)a[5], + (int16_t)a[6], (int16_t)a[7]); +#else + return _mm_load_si128((const __m128i *)a); +#endif +} + +// Store 8 16 bit values. If the destination is 32 bits then sign extend the +// values by multiplying by 1. +static INLINE void store_tran_low(__m128i a, tran_low_t *b) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i one = _mm_set1_epi16(1); + const __m128i a_hi = _mm_mulhi_epi16(a, one); + const __m128i a_lo = _mm_mullo_epi16(a, one); + const __m128i a_1 = _mm_unpacklo_epi16(a_lo, a_hi); + const __m128i a_2 = _mm_unpackhi_epi16(a_lo, a_hi); + _mm_store_si128((__m128i *)(b), a_1); + _mm_store_si128((__m128i *)(b + 4), a_2); +#else + _mm_store_si128((__m128i *)(b), a); +#endif +} +#endif // VPX_DSP_X86_FDCT_H_ diff --git a/vpx_dsp/x86/quantize_sse2.c b/vpx_dsp/x86/quantize_sse2.c index 2c7e431c7..0580a7bd7 100644 --- a/vpx_dsp/x86/quantize_sse2.c +++ b/vpx_dsp/x86/quantize_sse2.c @@ -13,32 +13,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" - -static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) { -#if CONFIG_VP9_HIGHBITDEPTH - return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1], - (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], - (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5], - (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]); -#else - return _mm_load_si128((const __m128i *)coeff_ptr); -#endif -} - -static INLINE void store_coefficients(__m128i coeff_vals, - tran_low_t *coeff_ptr) { -#if CONFIG_VP9_HIGHBITDEPTH - __m128i one = _mm_set1_epi16(1); - __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); - __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); - __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); - __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); - 
_mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); - _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); -#else - _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals); -#endif -} +#include "vpx_dsp/x86/fdct.h" void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, @@ -81,8 +56,8 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i qtmp0, qtmp1; __m128i cmp_mask0, cmp_mask1; // Do DC and first 15 AC - coeff0 = load_coefficients(coeff_ptr + n_coeffs); - coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8); + coeff0 = load_tran_low(coeff_ptr + n_coeffs); + coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); // Poor man's sign extract coeff0_sign = _mm_srai_epi16(coeff0, 15); @@ -117,15 +92,15 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); - store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs); - store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8); + store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); + store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); dequant = _mm_unpackhi_epi64(dequant, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - store_coefficients(coeff0, dqcoeff_ptr + n_coeffs); - store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8); + store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); + store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); } { @@ -159,8 +134,8 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i qtmp0, qtmp1; __m128i cmp_mask0, cmp_mask1; - coeff0 = load_coefficients(coeff_ptr + n_coeffs); - coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8); + coeff0 = load_tran_low(coeff_ptr + n_coeffs); + coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); // Poor man's sign extract coeff0_sign = _mm_srai_epi16(coeff0, 15); @@ -191,14 +166,14 @@ void 
vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); - store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs); - store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8); + store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); + store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - store_coefficients(coeff0, dqcoeff_ptr + n_coeffs); - store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8); + store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); + store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); } { @@ -237,10 +212,10 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } } else { do { - store_coefficients(zero, dqcoeff_ptr + n_coeffs); - store_coefficients(zero, dqcoeff_ptr + n_coeffs + 8); - store_coefficients(zero, qcoeff_ptr + n_coeffs); - store_coefficients(zero, qcoeff_ptr + n_coeffs + 8); + store_tran_low(zero, dqcoeff_ptr + n_coeffs); + store_tran_low(zero, dqcoeff_ptr + n_coeffs + 8); + store_tran_low(zero, qcoeff_ptr + n_coeffs); + store_tran_low(zero, qcoeff_ptr + n_coeffs + 8); n_coeffs += 8 * 2; } while (n_coeffs < 0); *eob_ptr = 0; -- cgit v1.2.3