Diffstat (limited to 'vpx_dsp')
-rw-r--r--  vpx_dsp/arm/quantize_neon.c    |  73
-rw-r--r--  vpx_dsp/quantize.c             |   9
-rw-r--r--  vpx_dsp/ssim.c                 |  14
-rw-r--r--  vpx_dsp/vpx_dsp_rtcd_defs.pl   |   6
-rw-r--r--  vpx_dsp/x86/avg_intrin_avx2.c  | 203
-rw-r--r--  vpx_dsp/x86/quantize_ssse3.h   |  12
6 files changed, 267 insertions(+), 50 deletions(-)
diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c
index b5d1e7ecb..adef5f6e1 100644
--- a/vpx_dsp/arm/quantize_neon.c
+++ b/vpx_dsp/arm/quantize_neon.c
@@ -15,6 +15,22 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/mem_neon.h"
+static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff,
+ const int16x8_t dequant,
+ tran_low_t *dqcoeff) {
+ const int32x4_t dqcoeff_0 =
+ vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
+ const int32x4_t dqcoeff_1 =
+ vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ vst1q_s32(dqcoeff, dqcoeff_0);
+ vst1q_s32(dqcoeff + 4, dqcoeff_1);
+#else
+ vst1q_s16(dqcoeff, vcombine_s16(vmovn_s32(dqcoeff_0), vmovn_s32(dqcoeff_1)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+}
+
void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block, const int16_t *zbin_ptr,
const int16_t *round_ptr, const int16_t *quant_ptr,
@@ -73,9 +89,7 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
qcoeff_ptr += 8;
- qcoeff = vmulq_s16(qcoeff, dequant);
-
- store_s16q_to_tran_low(dqcoeff_ptr, qcoeff);
+ calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr);
dqcoeff_ptr += 8;
}
@@ -126,9 +140,7 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
qcoeff_ptr += 8;
- qcoeff = vmulq_s16(qcoeff, dequant);
-
- store_s16q_to_tran_low(dqcoeff_ptr, qcoeff);
+ calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr);
dqcoeff_ptr += 8;
n_coeffs -= 8;
@@ -152,6 +164,28 @@ static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31));
}
+static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff,
+ const int16x8_t dequant,
+ tran_low_t *dqcoeff) {
+ int32x4_t dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
+ int32x4_t dqcoeff_1 =
+ vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
+
+ // Add 1 if negative to round towards zero, since division in the C reference truncates toward zero.
+ dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
+ dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1);
+ dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1);
+ vst1q_s32(dqcoeff, dqcoeff_0);
+ vst1q_s32(dqcoeff + 4, dqcoeff_1);
+#else
+ vst1q_s16(dqcoeff,
+ vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+}
+
// Main difference is that zbin values are halved before comparison and dqcoeff
// values are divided by 2. zbin is rounded but dqcoeff is not.
void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
@@ -194,8 +228,6 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
// (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
- int16x8_t dqcoeff;
- int32x4_t dqcoeff_0, dqcoeff_1;
qcoeff = vaddq_s16(qcoeff, rounded);
@@ -217,17 +249,7 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
qcoeff_ptr += 8;
- dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
- dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
-
- // Add 1 if negative to round towards zero because the C uses division.
- dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
- dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
-
- dqcoeff =
- vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1));
-
- store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff);
+ calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr);
dqcoeff_ptr += 8;
}
@@ -254,8 +276,6 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
// (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
- int16x8_t dqcoeff;
- int32x4_t dqcoeff_0, dqcoeff_1;
qcoeff = vaddq_s16(qcoeff, rounded);
@@ -278,16 +298,7 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
qcoeff_ptr += 8;
- dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
- dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
-
- dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
- dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
-
- dqcoeff =
- vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1));
-
- store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff);
+ calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr);
dqcoeff_ptr += 8;
}
}
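
The calculate_dqcoeff_and_store_32x32() helper above mirrors the scalar expression dqcoeff = qcoeff * dequant / 2 from vpx_quantize_b_32x32_c. A minimal scalar sketch (not part of the patch; it ignores the zbin/eob logic and scan order, uses int32_t in place of tran_low_t, i.e. the high bit depth layout, and the function name is made up here):

#include <stdint.h>

static void dqcoeff_32x32_scalar_sketch(const int16_t *qcoeff,
                                        const int16_t *dequant, /* {DC, AC} */
                                        int32_t *dqcoeff, int n) {
  int i;
  for (i = 0; i < n; ++i) {
    /* i == 0 is the DC coefficient; all others use the AC dequant value. */
    const int32_t prod = (int32_t)qcoeff[i] * dequant[i != 0];
    /* Adding the sign bit before the arithmetic shift makes >> 1 behave
       like C's / 2, which truncates toward zero for negative products;
       extract_sign_bit() does the same per lane in the NEON code. */
    dqcoeff[i] = (prod + (prod < 0)) >> 1;
  }
}

On non-highbitdepth builds the NEON helper narrows the same 32-bit result back to 16 bits with vshrn_n_s32/vmovn_s32, which drops the upper bits just as a cast to int16_t would.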
diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c
index 82a659592..0e6a0b83f 100644
--- a/vpx_dsp/quantize.c
+++ b/vpx_dsp/quantize.c
@@ -12,6 +12,7 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/quantize.h"
+#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_mem/vpx_mem.h"
void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
@@ -259,7 +260,15 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
15;
qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+#if (ARCH_X86 || ARCH_X86_64) && !CONFIG_VP9_HIGHBITDEPTH
+ // When tran_low_t is only 16 bits, dqcoeff can exceed its range. Rather than
+ // truncating with a cast, saturate the value. This is easier to implement
+ // on x86 and preserves the sign of the value.
+ dqcoeff_ptr[rc] =
+ clamp(qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2, INT16_MIN, INT16_MAX);
+#else
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+#endif // (ARCH_X86 || ARCH_X86_64) && !CONFIG_VP9_HIGHBITDEPTH
if (tmp) eob = idx_arr[i];
}
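
The new x86-only branch saturates instead of truncating when the 32-bit product does not fit in a 16-bit tran_low_t. A standalone illustration (the coefficient values are hypothetical, and clamp() from vpx_dsp_common.h is spelled out as a ternary so the snippet builds without libvpx headers):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int32_t qcoeff = 1200, dequant = 1864; /* hypothetical values */
  const int32_t wide = qcoeff * dequant / 2;   /* 1118400: outside int16_t */
  const int16_t truncated = (int16_t)wide;     /* impl.-defined; typically wraps to 4288 */
  const int16_t saturated = (int16_t)(wide > INT16_MAX   ? INT16_MAX
                                      : wide < INT16_MIN ? INT16_MIN
                                                         : wide);
  printf("wide=%d truncated=%d saturated=%d\n", (int)wide, truncated, saturated);
  return 0;
}

Saturation keeps the sign and magnitude as close as possible to the wide result, which is also what the SSSE3 code now produces with _mm_packs_epi32 (see quantize_ssse3.h below).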
diff --git a/vpx_dsp/ssim.c b/vpx_dsp/ssim.c
index ba73eb293..7c3c31bad 100644
--- a/vpx_dsp/ssim.c
+++ b/vpx_dsp/ssim.c
@@ -73,7 +73,7 @@ static const int64_t cc2_12 = 61817334; // (64^2*(.03*4095)^2
static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s,
uint32_t sum_sq_r, uint32_t sum_sxr, int count,
uint32_t bd) {
- int64_t ssim_n, ssim_d;
+ double ssim_n, ssim_d;
int64_t c1, c2;
if (bd == 8) {
// scale the constants by number of pixels
@@ -90,14 +90,14 @@ static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s,
assert(0);
}
- ssim_n = (2 * sum_s * sum_r + c1) *
- ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2);
+ ssim_n = (2.0 * sum_s * sum_r + c1) *
+ (2.0 * count * sum_sxr - 2.0 * sum_s * sum_r + c2);
- ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) *
- ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s +
- (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2);
+ ssim_d = ((double)sum_s * sum_s + (double)sum_r * sum_r + c1) *
+ ((double)count * sum_sq_s - (double)sum_s * sum_s +
+ (double)count * sum_sq_r - (double)sum_r * sum_r + c2);
- return ssim_n * 1.0 / ssim_d;
+ return ssim_n / ssim_d;
}
static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) {
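
Switching similarity() to double arithmetic avoids a possible int64_t overflow in the high-bit-depth path. A rough worst-case bound, ignoring the c1/c2 constants and any cancellation between terms (illustrative only):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const double max_pix = 4095.0;          /* largest 12-bit sample */
  const double count = 64.0;              /* 8x8 SSIM window */
  const double max_sum = count * max_pix; /* bound on sum_s and sum_r */
  /* Each factor of ssim_n is bounded by roughly 2 * max_sum^2, so the
     product can approach ~1.9e22, well beyond INT64_MAX (~9.2e18). */
  const double factor = 2.0 * max_sum * max_sum;
  printf("worst case ~%.3e vs INT64_MAX %.3e\n", factor * factor,
         (double)INT64_MAX);
  return 0;
}

Doubles lose some low-order precision at that magnitude but do not overflow, and the result is returned as a double ratio anyway.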
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 182503810..6dc317630 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -786,13 +786,13 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
specialize qw/vpx_hadamard_32x32 sse2 avx2/;
add_proto qw/void vpx_highbd_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
- specialize qw/vpx_highbd_hadamard_8x8/;
+ specialize qw/vpx_highbd_hadamard_8x8 avx2/;
add_proto qw/void vpx_highbd_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
- specialize qw/vpx_highbd_hadamard_16x16/;
+ specialize qw/vpx_highbd_hadamard_16x16 avx2/;
add_proto qw/void vpx_highbd_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
- specialize qw/vpx_highbd_hadamard_32x32/;
+ specialize qw/vpx_highbd_hadamard_32x32 avx2/;
add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";
specialize qw/vpx_satd avx2 sse2 neon/;
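
Adding avx2 to a specialize line makes the generated vpx_dsp_rtcd.h dispatch the symbol through a function pointer that is redirected to the _avx2 variant when the CPU reports AVX2 support. The sketch below only shows the usual shape of that generated code; the real header is produced by rtcd.pl, and setup_rtcd_sketch / have_avx2 are made-up stand-ins:

#include <stddef.h>
#include <stdint.h>

typedef int32_t tran_low_t; /* stand-in for the libvpx typedef (highbd layout) */

/* Implementations provided by vpx_dsp (declarations only in this sketch). */
void vpx_highbd_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,
                               tran_low_t *coeff);
void vpx_highbd_hadamard_8x8_avx2(const int16_t *src_diff,
                                  ptrdiff_t src_stride, tran_low_t *coeff);

/* The public name becomes a function pointer in the generated header. */
void (*vpx_highbd_hadamard_8x8)(const int16_t *src_diff, ptrdiff_t src_stride,
                                tran_low_t *coeff);

static void setup_rtcd_sketch(int have_avx2) {
  vpx_highbd_hadamard_8x8 = vpx_highbd_hadamard_8x8_c; /* portable default */
  if (have_avx2) vpx_highbd_hadamard_8x8 = vpx_highbd_hadamard_8x8_avx2;
}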
diff --git a/vpx_dsp/x86/avg_intrin_avx2.c b/vpx_dsp/x86/avg_intrin_avx2.c
index 63a69e582..f39210b6a 100644
--- a/vpx_dsp/x86/avg_intrin_avx2.c
+++ b/vpx_dsp/x86/avg_intrin_avx2.c
@@ -15,6 +15,209 @@
#include "vpx_dsp/x86/bitdepth_conversion_avx2.h"
#include "vpx_ports/mem.h"
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_hadamard_col8_avx2(__m256i *in, int iter) {
+ __m256i a0 = in[0];
+ __m256i a1 = in[1];
+ __m256i a2 = in[2];
+ __m256i a3 = in[3];
+ __m256i a4 = in[4];
+ __m256i a5 = in[5];
+ __m256i a6 = in[6];
+ __m256i a7 = in[7];
+
+ __m256i b0 = _mm256_add_epi32(a0, a1);
+ __m256i b1 = _mm256_sub_epi32(a0, a1);
+ __m256i b2 = _mm256_add_epi32(a2, a3);
+ __m256i b3 = _mm256_sub_epi32(a2, a3);
+ __m256i b4 = _mm256_add_epi32(a4, a5);
+ __m256i b5 = _mm256_sub_epi32(a4, a5);
+ __m256i b6 = _mm256_add_epi32(a6, a7);
+ __m256i b7 = _mm256_sub_epi32(a6, a7);
+
+ a0 = _mm256_add_epi32(b0, b2);
+ a1 = _mm256_add_epi32(b1, b3);
+ a2 = _mm256_sub_epi32(b0, b2);
+ a3 = _mm256_sub_epi32(b1, b3);
+ a4 = _mm256_add_epi32(b4, b6);
+ a5 = _mm256_add_epi32(b5, b7);
+ a6 = _mm256_sub_epi32(b4, b6);
+ a7 = _mm256_sub_epi32(b5, b7);
+
+ if (iter == 0) {
+ b0 = _mm256_add_epi32(a0, a4);
+ b7 = _mm256_add_epi32(a1, a5);
+ b3 = _mm256_add_epi32(a2, a6);
+ b4 = _mm256_add_epi32(a3, a7);
+ b2 = _mm256_sub_epi32(a0, a4);
+ b6 = _mm256_sub_epi32(a1, a5);
+ b1 = _mm256_sub_epi32(a2, a6);
+ b5 = _mm256_sub_epi32(a3, a7);
+
+ a0 = _mm256_unpacklo_epi32(b0, b1);
+ a1 = _mm256_unpacklo_epi32(b2, b3);
+ a2 = _mm256_unpackhi_epi32(b0, b1);
+ a3 = _mm256_unpackhi_epi32(b2, b3);
+ a4 = _mm256_unpacklo_epi32(b4, b5);
+ a5 = _mm256_unpacklo_epi32(b6, b7);
+ a6 = _mm256_unpackhi_epi32(b4, b5);
+ a7 = _mm256_unpackhi_epi32(b6, b7);
+
+ b0 = _mm256_unpacklo_epi64(a0, a1);
+ b1 = _mm256_unpacklo_epi64(a4, a5);
+ b2 = _mm256_unpackhi_epi64(a0, a1);
+ b3 = _mm256_unpackhi_epi64(a4, a5);
+ b4 = _mm256_unpacklo_epi64(a2, a3);
+ b5 = _mm256_unpacklo_epi64(a6, a7);
+ b6 = _mm256_unpackhi_epi64(a2, a3);
+ b7 = _mm256_unpackhi_epi64(a6, a7);
+
+ in[0] = _mm256_permute2x128_si256(b0, b1, 0x20);
+ in[1] = _mm256_permute2x128_si256(b0, b1, 0x31);
+ in[2] = _mm256_permute2x128_si256(b2, b3, 0x20);
+ in[3] = _mm256_permute2x128_si256(b2, b3, 0x31);
+ in[4] = _mm256_permute2x128_si256(b4, b5, 0x20);
+ in[5] = _mm256_permute2x128_si256(b4, b5, 0x31);
+ in[6] = _mm256_permute2x128_si256(b6, b7, 0x20);
+ in[7] = _mm256_permute2x128_si256(b6, b7, 0x31);
+ } else {
+ in[0] = _mm256_add_epi32(a0, a4);
+ in[7] = _mm256_add_epi32(a1, a5);
+ in[3] = _mm256_add_epi32(a2, a6);
+ in[4] = _mm256_add_epi32(a3, a7);
+ in[2] = _mm256_sub_epi32(a0, a4);
+ in[6] = _mm256_sub_epi32(a1, a5);
+ in[1] = _mm256_sub_epi32(a2, a6);
+ in[5] = _mm256_sub_epi32(a3, a7);
+ }
+}
+
+void vpx_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ __m128i src16[8];
+ __m256i src32[8];
+
+ src16[0] = _mm_loadu_si128((const __m128i *)src_diff);
+ src16[1] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[2] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[3] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[7] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+
+ src32[0] = _mm256_cvtepi16_epi32(src16[0]);
+ src32[1] = _mm256_cvtepi16_epi32(src16[1]);
+ src32[2] = _mm256_cvtepi16_epi32(src16[2]);
+ src32[3] = _mm256_cvtepi16_epi32(src16[3]);
+ src32[4] = _mm256_cvtepi16_epi32(src16[4]);
+ src32[5] = _mm256_cvtepi16_epi32(src16[5]);
+ src32[6] = _mm256_cvtepi16_epi32(src16[6]);
+ src32[7] = _mm256_cvtepi16_epi32(src16[7]);
+
+ highbd_hadamard_col8_avx2(src32, 0);
+ highbd_hadamard_col8_avx2(src32, 1);
+
+ _mm256_storeu_si256((__m256i *)coeff, src32[0]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[1]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[2]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[3]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[4]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[5]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[6]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[7]);
+}
+
+void vpx_highbd_hadamard_16x16_avx2(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff) {
+ int idx;
+ tran_low_t *t_coeff = coeff;
+ for (idx = 0; idx < 4; ++idx) {
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+ vpx_highbd_hadamard_8x8_avx2(src_ptr, src_stride, t_coeff + idx * 64);
+ }
+
+ for (idx = 0; idx < 64; idx += 8) {
+ __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+ __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
+ __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
+ __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));
+
+ __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
+ __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
+ __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
+ __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);
+
+ b0 = _mm256_srai_epi32(b0, 1);
+ b1 = _mm256_srai_epi32(b1, 1);
+ b2 = _mm256_srai_epi32(b2, 1);
+ b3 = _mm256_srai_epi32(b3, 1);
+
+ coeff0 = _mm256_add_epi32(b0, b2);
+ coeff1 = _mm256_add_epi32(b1, b3);
+ coeff2 = _mm256_sub_epi32(b0, b2);
+ coeff3 = _mm256_sub_epi32(b1, b3);
+
+ _mm256_storeu_si256((__m256i *)coeff, coeff0);
+ _mm256_storeu_si256((__m256i *)(coeff + 64), coeff1);
+ _mm256_storeu_si256((__m256i *)(coeff + 128), coeff2);
+ _mm256_storeu_si256((__m256i *)(coeff + 192), coeff3);
+
+ coeff += 8;
+ t_coeff += 8;
+ }
+}
+
+void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff) {
+ int idx;
+ tran_low_t *t_coeff = coeff;
+ for (idx = 0; idx < 4; ++idx) {
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+ vpx_highbd_hadamard_16x16_avx2(src_ptr, src_stride, t_coeff + idx * 256);
+ }
+
+ for (idx = 0; idx < 256; idx += 8) {
+ __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+ __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
+ __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
+ __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));
+
+ __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
+ __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
+ __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
+ __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);
+
+ b0 = _mm256_srai_epi32(b0, 2);
+ b1 = _mm256_srai_epi32(b1, 2);
+ b2 = _mm256_srai_epi32(b2, 2);
+ b3 = _mm256_srai_epi32(b3, 2);
+
+ coeff0 = _mm256_add_epi32(b0, b2);
+ coeff1 = _mm256_add_epi32(b1, b3);
+ coeff2 = _mm256_sub_epi32(b0, b2);
+ coeff3 = _mm256_sub_epi32(b1, b3);
+
+ _mm256_storeu_si256((__m256i *)coeff, coeff0);
+ _mm256_storeu_si256((__m256i *)(coeff + 256), coeff1);
+ _mm256_storeu_si256((__m256i *)(coeff + 512), coeff2);
+ _mm256_storeu_si256((__m256i *)(coeff + 768), coeff3);
+
+ coeff += 8;
+ t_coeff += 8;
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
static void hadamard_col8x2_avx2(__m256i *in, int iter) {
__m256i a0 = in[0];
__m256i a1 = in[1];
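
The 16x16 and 32x32 highbd transforms build on the 8x8 kernel: four sub-transforms are written into t_coeff, then combined with one more butterfly stage and normalized by >> 1 (16x16, sub-blocks at offsets 0/64/128/192) or >> 2 (32x32, offsets 0/256/512/768). A scalar view of the 16x16 combining loop that the AVX2 code above runs eight lanes at a time (sketch only, not part of the patch; int32_t stands in for tran_low_t):

#include <stdint.h>

typedef int32_t tran_low_t; /* high bit depth coefficient layout */

static void highbd_hadamard_16x16_combine_sketch(const tran_low_t *t_coeff,
                                                 tran_low_t *coeff) {
  int idx;
  for (idx = 0; idx < 64; ++idx) {
    const tran_low_t a0 = t_coeff[idx];
    const tran_low_t a1 = t_coeff[idx + 64];
    const tran_low_t a2 = t_coeff[idx + 128];
    const tran_low_t a3 = t_coeff[idx + 192];

    /* Butterfly across the four 8x8 sub-blocks, with a >> 1 normalization. */
    const tran_low_t b0 = (a0 + a1) >> 1;
    const tran_low_t b1 = (a0 - a1) >> 1;
    const tran_low_t b2 = (a2 + a3) >> 1;
    const tran_low_t b3 = (a2 - a3) >> 1;

    coeff[idx] = b0 + b2;
    coeff[idx + 64] = b1 + b3;
    coeff[idx + 128] = b0 - b2;
    coeff[idx + 192] = b1 - b3;
  }
}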
diff --git a/vpx_dsp/x86/quantize_ssse3.h b/vpx_dsp/x86/quantize_ssse3.h
index 35223d7b4..e8d2a0577 100644
--- a/vpx_dsp/x86/quantize_ssse3.h
+++ b/vpx_dsp/x86/quantize_ssse3.h
@@ -24,7 +24,6 @@ static INLINE void calculate_dqcoeff_and_store_32x32(const __m128i qcoeff,
// Un-sign to bias rounding like C.
const __m128i coeff = _mm_abs_epi16(qcoeff);
-#if CONFIG_VP9_HIGHBITDEPTH
const __m128i sign_0 = _mm_unpacklo_epi16(zero, qcoeff);
const __m128i sign_1 = _mm_unpackhi_epi16(zero, qcoeff);
@@ -40,17 +39,12 @@ static INLINE void calculate_dqcoeff_and_store_32x32(const __m128i qcoeff,
dqcoeff32_0 = _mm_sign_epi32(dqcoeff32_0, sign_0);
dqcoeff32_1 = _mm_sign_epi32(dqcoeff32_1, sign_1);
+#if CONFIG_VP9_HIGHBITDEPTH
_mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0);
_mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1);
#else
- __m128i dqcoeff16 = _mm_mullo_epi16(coeff, dequant);
- (void)zero;
-
- dqcoeff16 = _mm_srli_epi16(dqcoeff16, 1);
-
- dqcoeff16 = _mm_sign_epi16(dqcoeff16, qcoeff);
-
- _mm_store_si128((__m128i *)(dqcoeff), dqcoeff16);
+ _mm_store_si128((__m128i *)(dqcoeff),
+ _mm_packs_epi32(dqcoeff32_0, dqcoeff32_1));
#endif // CONFIG_VP9_HIGHBITDEPTH
}
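
With the 16-bit multiply path removed, both configurations of calculate_dqcoeff_and_store_32x32 now compute 32-bit products and sign-correct them with _mm_sign_epi32; the non-highbd path then narrows with _mm_packs_epi32, whose signed saturation matches the clamp() added to vpx_quantize_b_32x32_c. A standalone illustration of that saturating pack (arbitrary input values):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* 32-bit lanes, some outside the int16_t range. */
  const __m128i wide0 = _mm_setr_epi32(1118400, -70000, 123, -123);
  const __m128i wide1 = _mm_setr_epi32(32768, -32769, 32767, -32768);
  const __m128i packed = _mm_packs_epi32(wide0, wide1); /* saturating narrow */
  int16_t out[8];
  int i;
  _mm_storeu_si128((__m128i *)out, packed);
  for (i = 0; i < 8; ++i) printf("%d ", out[i]);
  /* Prints: 32767 -32768 123 -123 32767 -32768 32767 -32768 */
  printf("\n");
  return 0;
}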