Diffstat (limited to 'vpx_dsp')
-rw-r--r--   vpx_dsp/arm/quantize_neon.c   |  73
-rw-r--r--   vpx_dsp/quantize.c            |   9
-rw-r--r--   vpx_dsp/ssim.c                |  14
-rw-r--r--   vpx_dsp/vpx_dsp_rtcd_defs.pl  |   6
-rw-r--r--   vpx_dsp/x86/avg_intrin_avx2.c | 203
-rw-r--r--   vpx_dsp/x86/quantize_ssse3.h  |  12
6 files changed, 267 insertions, 50 deletions
diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c
index b5d1e7ecb..adef5f6e1 100644
--- a/vpx_dsp/arm/quantize_neon.c
+++ b/vpx_dsp/arm/quantize_neon.c
@@ -15,6 +15,22 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/arm/mem_neon.h"
 
+static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff,
+                                               const int16x8_t dequant,
+                                               tran_low_t *dqcoeff) {
+  const int32x4_t dqcoeff_0 =
+      vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
+  const int32x4_t dqcoeff_1 =
+      vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  vst1q_s32(dqcoeff, dqcoeff_0);
+  vst1q_s32(dqcoeff + 4, dqcoeff_1);
+#else
+  vst1q_s16(dqcoeff, vcombine_s16(vmovn_s32(dqcoeff_0), vmovn_s32(dqcoeff_1)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}
+
 void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                          int skip_block, const int16_t *zbin_ptr,
                          const int16_t *round_ptr, const int16_t *quant_ptr,
@@ -73,9 +89,7 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
     qcoeff_ptr += 8;
 
-    qcoeff = vmulq_s16(qcoeff, dequant);
-
-    store_s16q_to_tran_low(dqcoeff_ptr, qcoeff);
+    calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr);
     dqcoeff_ptr += 8;
   }
 
@@ -126,9 +140,7 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
       store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
       qcoeff_ptr += 8;
 
-      qcoeff = vmulq_s16(qcoeff, dequant);
-
-      store_s16q_to_tran_low(dqcoeff_ptr, qcoeff);
+      calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr);
       dqcoeff_ptr += 8;
 
       n_coeffs -= 8;
@@ -152,6 +164,28 @@ static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
   return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31));
 }
 
+static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff,
+                                                     const int16x8_t dequant,
+                                                     tran_low_t *dqcoeff) {
+  int32x4_t dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
+  int32x4_t dqcoeff_1 =
+      vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
+
+  // Add 1 if negative to round towards zero because the C uses division.
+  dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
+  dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1);
+  dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1);
+  vst1q_s32(dqcoeff, dqcoeff_0);
+  vst1q_s32(dqcoeff + 4, dqcoeff_1);
+#else
+  vst1q_s16(dqcoeff,
+            vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}
+
 // Main difference is that zbin values are halved before comparison and dqcoeff
 // values are divided by 2. zbin is rounded but dqcoeff is not.
 void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
@@ -194,8 +228,6 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
     int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
 
-    int16x8_t dqcoeff;
-    int32x4_t dqcoeff_0, dqcoeff_1;
 
     qcoeff = vaddq_s16(qcoeff, rounded);
 
@@ -217,17 +249,7 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
     qcoeff_ptr += 8;
 
-    dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
-    dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
-
-    // Add 1 if negative to round towards zero because the C uses division.
-    dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
-    dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
-
-    dqcoeff =
-        vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1));
-
-    store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff);
+    calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr);
     dqcoeff_ptr += 8;
   }
 
@@ -254,8 +276,6 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
       // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
       int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
 
-      int16x8_t dqcoeff;
-      int32x4_t dqcoeff_0, dqcoeff_1;
 
      qcoeff = vaddq_s16(qcoeff, rounded);
 
@@ -278,16 +298,7 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
       store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
       qcoeff_ptr += 8;
 
-      dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
-      dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
-
-      dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
-      dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
-
-      dqcoeff =
-          vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1));
-
-      store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff);
+      calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr);
       dqcoeff_ptr += 8;
     }
   }
 }
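A note on the NEON 32x32 path above: the C reference computes dqcoeff with integer division, which truncates toward zero, while a plain arithmetic right shift rounds toward negative infinity. Adding the extracted sign bit before shifting (extract_sign_bit(), vaddq_s32(), then the narrowing shift) reconciles the two. A minimal scalar sketch of the identity; the helper name is illustrative, not part of the patch:

#include <assert.h>
#include <stdint.h>

// Halve x, rounding toward zero, with an add and an arithmetic shift.
// This is the scalar form of the extract_sign_bit()/vaddq_s32()/
// vshrn_n_s32() sequence in calculate_dqcoeff_and_store_32x32().
static int32_t half_toward_zero(int32_t x) {
  const int32_t sign_bit = (int32_t)((uint32_t)x >> 31);  // 1 iff x < 0
  return (x + sign_bit) >> 1;  // assumes arithmetic >> on negative values
}

int main(void) {
  assert(half_toward_zero(7) == 7 / 2);    // 3
  assert(half_toward_zero(-7) == -7 / 2);  // -3, whereas -7 >> 1 == -4
  return 0;
}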
diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c
index 82a659592..0e6a0b83f 100644
--- a/vpx_dsp/quantize.c
+++ b/vpx_dsp/quantize.c
@@ -12,6 +12,7 @@
 
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/quantize.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 
 void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
@@ -259,7 +260,15 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
           15;
       qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
 
+#if (ARCH_X86 || ARCH_X86_64) && !CONFIG_VP9_HIGHBITDEPTH
+      // When tran_low_t is only 16 bits dqcoeff can outrange it. Rather than
+      // truncating with a cast, saturate the value. This is easier to implement
+      // on x86 and preserves the sign of the value.
+      dqcoeff_ptr[rc] =
+          clamp(qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2, INT16_MIN, INT16_MAX);
+#else
       dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+#endif  // (ARCH_X86 || ARCH_X86_64) && !CONFIG_VP9_HIGHBITDEPTH
 
       if (tmp) eob = idx_arr[i];
     }
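Context for the clamp() above: when CONFIG_VP9_HIGHBITDEPTH is off, tran_low_t is a 16-bit type, and qcoeff * dequant / 2 can exceed it, so a bare cast would silently truncate the value and can even flip its sign. A hedged scalar sketch of the saturating store; the helper name is illustrative, and the clamp is written out instead of taken from vpx_dsp_common.h:

#include <stdint.h>

// Saturating dqcoeff = qcoeff * dequant / 2 for a 16-bit tran_low_t,
// mirroring the (ARCH_X86 || ARCH_X86_64) branch added above. The product
// is widened to 32 bits so it cannot overflow before the clamp.
static int16_t dqcoeff_32x32_sat(int16_t qcoeff, int16_t dequant) {
  const int32_t dq = (int32_t)qcoeff * dequant / 2;
  if (dq > INT16_MAX) return INT16_MAX;
  if (dq < INT16_MIN) return INT16_MIN;
  return (int16_t)dq;
}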
diff --git a/vpx_dsp/ssim.c b/vpx_dsp/ssim.c
index ba73eb293..7c3c31bad 100644
--- a/vpx_dsp/ssim.c
+++ b/vpx_dsp/ssim.c
@@ -73,7 +73,7 @@ static const int64_t cc2_12 = 61817334;  // (64^2*(.03*4095)^2
 static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s,
                          uint32_t sum_sq_r, uint32_t sum_sxr, int count,
                          uint32_t bd) {
-  int64_t ssim_n, ssim_d;
+  double ssim_n, ssim_d;
   int64_t c1, c2;
 
   if (bd == 8) {
     // scale the constants by number of pixels
@@ -90,14 +90,14 @@ static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s,
     assert(0);
   }
 
-  ssim_n = (2 * sum_s * sum_r + c1) *
-           ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2);
+  ssim_n = (2.0 * sum_s * sum_r + c1) *
+           (2.0 * count * sum_sxr - 2.0 * sum_s * sum_r + c2);
 
-  ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) *
-           ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s +
-            (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2);
+  ssim_d = ((double)sum_s * sum_s + (double)sum_r * sum_r + c1) *
+           ((double)count * sum_sq_s - (double)sum_s * sum_s +
+            (double)count * sum_sq_r - (double)sum_r * sum_r + c2);
 
-  return ssim_n * 1.0 / ssim_d;
+  return ssim_n / ssim_d;
 }
 
 static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) {
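The ssim.c change is an overflow fix, not just a style change: sum_s and sum_r are uint32_t, so products such as sum_s * sum_s in the old ssim_d expression were evaluated in 32-bit unsigned arithmetic and wrapped before any widening cast took effect. With 12-bit input, sum_s can reach 64 * 4095 = 262080, whose square does not fit in 32 bits. A small self-contained demonstration of the wrap that the conversion to double avoids:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint32_t sum_s = 64 * 4095;  // largest sum of 64 12-bit samples
  const uint32_t wrapped = sum_s * sum_s;      // 4261416960 after mod-2^32 wrap
  const double exact = (double)sum_s * sum_s;  // 68685926400
  printf("wrapped: %u\nexact: %.0f\n", wrapped, exact);
  return 0;
}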
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 182503810..6dc317630 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -786,13 +786,13 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
   specialize qw/vpx_hadamard_32x32 sse2 avx2/;
 
   add_proto qw/void vpx_highbd_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
-  specialize qw/vpx_highbd_hadamard_8x8/;
+  specialize qw/vpx_highbd_hadamard_8x8 avx2/;
 
   add_proto qw/void vpx_highbd_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
-  specialize qw/vpx_highbd_hadamard_16x16/;
+  specialize qw/vpx_highbd_hadamard_16x16 avx2/;
 
   add_proto qw/void vpx_highbd_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
-  specialize qw/vpx_highbd_hadamard_32x32/;
+  specialize qw/vpx_highbd_hadamard_32x32 avx2/;
 
   add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";
   specialize qw/vpx_satd avx2 sse2 neon/;
diff --git a/vpx_dsp/x86/avg_intrin_avx2.c b/vpx_dsp/x86/avg_intrin_avx2.c
index 63a69e582..f39210b6a 100644
--- a/vpx_dsp/x86/avg_intrin_avx2.c
+++ b/vpx_dsp/x86/avg_intrin_avx2.c
@@ -15,6 +15,209 @@
 #include "vpx_dsp/x86/bitdepth_conversion_avx2.h"
 #include "vpx_ports/mem.h"
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_hadamard_col8_avx2(__m256i *in, int iter) {
+  __m256i a0 = in[0];
+  __m256i a1 = in[1];
+  __m256i a2 = in[2];
+  __m256i a3 = in[3];
+  __m256i a4 = in[4];
+  __m256i a5 = in[5];
+  __m256i a6 = in[6];
+  __m256i a7 = in[7];
+
+  __m256i b0 = _mm256_add_epi32(a0, a1);
+  __m256i b1 = _mm256_sub_epi32(a0, a1);
+  __m256i b2 = _mm256_add_epi32(a2, a3);
+  __m256i b3 = _mm256_sub_epi32(a2, a3);
+  __m256i b4 = _mm256_add_epi32(a4, a5);
+  __m256i b5 = _mm256_sub_epi32(a4, a5);
+  __m256i b6 = _mm256_add_epi32(a6, a7);
+  __m256i b7 = _mm256_sub_epi32(a6, a7);
+
+  a0 = _mm256_add_epi32(b0, b2);
+  a1 = _mm256_add_epi32(b1, b3);
+  a2 = _mm256_sub_epi32(b0, b2);
+  a3 = _mm256_sub_epi32(b1, b3);
+  a4 = _mm256_add_epi32(b4, b6);
+  a5 = _mm256_add_epi32(b5, b7);
+  a6 = _mm256_sub_epi32(b4, b6);
+  a7 = _mm256_sub_epi32(b5, b7);
+
+  if (iter == 0) {
+    b0 = _mm256_add_epi32(a0, a4);
+    b7 = _mm256_add_epi32(a1, a5);
+    b3 = _mm256_add_epi32(a2, a6);
+    b4 = _mm256_add_epi32(a3, a7);
+    b2 = _mm256_sub_epi32(a0, a4);
+    b6 = _mm256_sub_epi32(a1, a5);
+    b1 = _mm256_sub_epi32(a2, a6);
+    b5 = _mm256_sub_epi32(a3, a7);
+
+    a0 = _mm256_unpacklo_epi32(b0, b1);
+    a1 = _mm256_unpacklo_epi32(b2, b3);
+    a2 = _mm256_unpackhi_epi32(b0, b1);
+    a3 = _mm256_unpackhi_epi32(b2, b3);
+    a4 = _mm256_unpacklo_epi32(b4, b5);
+    a5 = _mm256_unpacklo_epi32(b6, b7);
+    a6 = _mm256_unpackhi_epi32(b4, b5);
+    a7 = _mm256_unpackhi_epi32(b6, b7);
+
+    b0 = _mm256_unpacklo_epi64(a0, a1);
+    b1 = _mm256_unpacklo_epi64(a4, a5);
+    b2 = _mm256_unpackhi_epi64(a0, a1);
+    b3 = _mm256_unpackhi_epi64(a4, a5);
+    b4 = _mm256_unpacklo_epi64(a2, a3);
+    b5 = _mm256_unpacklo_epi64(a6, a7);
+    b6 = _mm256_unpackhi_epi64(a2, a3);
+    b7 = _mm256_unpackhi_epi64(a6, a7);
+
+    in[0] = _mm256_permute2x128_si256(b0, b1, 0x20);
+    in[1] = _mm256_permute2x128_si256(b0, b1, 0x31);
+    in[2] = _mm256_permute2x128_si256(b2, b3, 0x20);
+    in[3] = _mm256_permute2x128_si256(b2, b3, 0x31);
+    in[4] = _mm256_permute2x128_si256(b4, b5, 0x20);
+    in[5] = _mm256_permute2x128_si256(b4, b5, 0x31);
+    in[6] = _mm256_permute2x128_si256(b6, b7, 0x20);
+    in[7] = _mm256_permute2x128_si256(b6, b7, 0x31);
+  } else {
+    in[0] = _mm256_add_epi32(a0, a4);
+    in[7] = _mm256_add_epi32(a1, a5);
+    in[3] = _mm256_add_epi32(a2, a6);
+    in[4] = _mm256_add_epi32(a3, a7);
+    in[2] = _mm256_sub_epi32(a0, a4);
+    in[6] = _mm256_sub_epi32(a1, a5);
+    in[1] = _mm256_sub_epi32(a2, a6);
+    in[5] = _mm256_sub_epi32(a3, a7);
+  }
+}
+
+void vpx_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+                                  tran_low_t *coeff) {
+  __m128i src16[8];
+  __m256i src32[8];
+
+  src16[0] = _mm_loadu_si128((const __m128i *)src_diff);
+  src16[1] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+  src16[2] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+  src16[3] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+  src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+  src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+  src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+  src16[7] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+
+  src32[0] = _mm256_cvtepi16_epi32(src16[0]);
+  src32[1] = _mm256_cvtepi16_epi32(src16[1]);
+  src32[2] = _mm256_cvtepi16_epi32(src16[2]);
+  src32[3] = _mm256_cvtepi16_epi32(src16[3]);
+  src32[4] = _mm256_cvtepi16_epi32(src16[4]);
+  src32[5] = _mm256_cvtepi16_epi32(src16[5]);
+  src32[6] = _mm256_cvtepi16_epi32(src16[6]);
+  src32[7] = _mm256_cvtepi16_epi32(src16[7]);
+
+  highbd_hadamard_col8_avx2(src32, 0);
+  highbd_hadamard_col8_avx2(src32, 1);
+
+  _mm256_storeu_si256((__m256i *)coeff, src32[0]);
+  coeff += 8;
+  _mm256_storeu_si256((__m256i *)coeff, src32[1]);
+  coeff += 8;
+  _mm256_storeu_si256((__m256i *)coeff, src32[2]);
+  coeff += 8;
+  _mm256_storeu_si256((__m256i *)coeff, src32[3]);
+  coeff += 8;
+  _mm256_storeu_si256((__m256i *)coeff, src32[4]);
+  coeff += 8;
+  _mm256_storeu_si256((__m256i *)coeff, src32[5]);
+  coeff += 8;
+  _mm256_storeu_si256((__m256i *)coeff, src32[6]);
+  coeff += 8;
+  _mm256_storeu_si256((__m256i *)coeff, src32[7]);
+}
+
+void vpx_highbd_hadamard_16x16_avx2(const int16_t *src_diff,
+                                    ptrdiff_t src_stride, tran_low_t *coeff) {
+  int idx;
+  tran_low_t *t_coeff = coeff;
+  for (idx = 0; idx < 4; ++idx) {
+    const int16_t *src_ptr =
+        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+    vpx_highbd_hadamard_8x8_avx2(src_ptr, src_stride, t_coeff + idx * 64);
+  }
+
+  for (idx = 0; idx < 64; idx += 8) {
+    __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+    __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
+    __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
+    __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));
+
+    __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
+    __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
+    __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
+    __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);
+
+    b0 = _mm256_srai_epi32(b0, 1);
+    b1 = _mm256_srai_epi32(b1, 1);
+    b2 = _mm256_srai_epi32(b2, 1);
+    b3 = _mm256_srai_epi32(b3, 1);
+
+    coeff0 = _mm256_add_epi32(b0, b2);
+    coeff1 = _mm256_add_epi32(b1, b3);
+    coeff2 = _mm256_sub_epi32(b0, b2);
+    coeff3 = _mm256_sub_epi32(b1, b3);
+
+    _mm256_storeu_si256((__m256i *)coeff, coeff0);
+    _mm256_storeu_si256((__m256i *)(coeff + 64), coeff1);
+    _mm256_storeu_si256((__m256i *)(coeff + 128), coeff2);
+    _mm256_storeu_si256((__m256i *)(coeff + 192), coeff3);
+
+    coeff += 8;
+    t_coeff += 8;
+  }
+}
+
+void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff,
+                                    ptrdiff_t src_stride, tran_low_t *coeff) {
+  int idx;
+  tran_low_t *t_coeff = coeff;
+  for (idx = 0; idx < 4; ++idx) {
+    const int16_t *src_ptr =
+        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+    vpx_highbd_hadamard_16x16_avx2(src_ptr, src_stride, t_coeff + idx * 256);
+  }
+
+  for (idx = 0; idx < 256; idx += 8) {
+    __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+    __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
+    __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
+    __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));
+
+    __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
+    __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
+    __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
+    __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);
+
+    b0 = _mm256_srai_epi32(b0, 2);
+    b1 = _mm256_srai_epi32(b1, 2);
+    b2 = _mm256_srai_epi32(b2, 2);
+    b3 = _mm256_srai_epi32(b3, 2);
+
+    coeff0 = _mm256_add_epi32(b0, b2);
+    coeff1 = _mm256_add_epi32(b1, b3);
+    coeff2 = _mm256_sub_epi32(b0, b2);
+    coeff3 = _mm256_sub_epi32(b1, b3);
+
+    _mm256_storeu_si256((__m256i *)coeff, coeff0);
+    _mm256_storeu_si256((__m256i *)(coeff + 256), coeff1);
+    _mm256_storeu_si256((__m256i *)(coeff + 512), coeff2);
+    _mm256_storeu_si256((__m256i *)(coeff + 768), coeff3);
+
+    coeff += 8;
+    t_coeff += 8;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 static void hadamard_col8x2_avx2(__m256i *in, int iter) {
   __m256i a0 = in[0];
   __m256i a1 = in[1];
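For readers tracing the AVX2 code above: the combine loops in vpx_highbd_hadamard_16x16_avx2() and vpx_highbd_hadamard_32x32_avx2() take one coefficient from each of the four sub-blocks, apply a 4-point butterfly, and halve after the first stage (shift by 1 for 16x16, by 2 for 32x32). A scalar sketch of the 16x16 combine, one coefficient per iteration instead of eight; the function name is illustrative:

// Fold four consecutive 8x8 Hadamard blocks (64 coefficients each) into a
// 16x16 block in place. tran_low_t is int32_t in high-bit-depth builds,
// and the >> 1 matches _mm256_srai_epi32(..., 1) in the vector loop.
static void highbd_combine_16x16(int32_t *coeff /* 256 entries */) {
  int i;
  for (i = 0; i < 64; ++i) {
    const int32_t b0 = (coeff[i] + coeff[i + 64]) >> 1;
    const int32_t b1 = (coeff[i] - coeff[i + 64]) >> 1;
    const int32_t b2 = (coeff[i + 128] + coeff[i + 192]) >> 1;
    const int32_t b3 = (coeff[i + 128] - coeff[i + 192]) >> 1;
    coeff[i] = b0 + b2;
    coeff[i + 64] = b1 + b3;
    coeff[i + 128] = b0 - b2;
    coeff[i + 192] = b1 - b3;
  }
}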
diff --git a/vpx_dsp/x86/quantize_ssse3.h b/vpx_dsp/x86/quantize_ssse3.h
index 35223d7b4..e8d2a0577 100644
--- a/vpx_dsp/x86/quantize_ssse3.h
+++ b/vpx_dsp/x86/quantize_ssse3.h
@@ -24,7 +24,6 @@ static INLINE void calculate_dqcoeff_and_store_32x32(const __m128i qcoeff,
   // Un-sign to bias rounding like C.
   const __m128i coeff = _mm_abs_epi16(qcoeff);
 
-#if CONFIG_VP9_HIGHBITDEPTH
   const __m128i sign_0 = _mm_unpacklo_epi16(zero, qcoeff);
   const __m128i sign_1 = _mm_unpackhi_epi16(zero, qcoeff);
 
@@ -40,17 +39,12 @@
   dqcoeff32_0 = _mm_sign_epi32(dqcoeff32_0, sign_0);
   dqcoeff32_1 = _mm_sign_epi32(dqcoeff32_1, sign_1);
 
+#if CONFIG_VP9_HIGHBITDEPTH
   _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0);
   _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1);
 #else
-  __m128i dqcoeff16 = _mm_mullo_epi16(coeff, dequant);
-  (void)zero;
-
-  dqcoeff16 = _mm_srli_epi16(dqcoeff16, 1);
-
-  dqcoeff16 = _mm_sign_epi16(dqcoeff16, qcoeff);
-
-  _mm_store_si128((__m128i *)(dqcoeff), dqcoeff16);
+  _mm_store_si128((__m128i *)(dqcoeff),
+                  _mm_packs_epi32(dqcoeff32_0, dqcoeff32_1));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }