Diffstat (limited to 'vpx_dsp')
-rw-r--r--  vpx_dsp/arm/quantize_neon.c    |  73
-rw-r--r--  vpx_dsp/quantize.c             |   9
-rw-r--r--  vpx_dsp/ssim.c                 |  14
-rw-r--r--  vpx_dsp/vpx_dsp_rtcd_defs.pl   |   6
-rw-r--r--  vpx_dsp/x86/avg_intrin_avx2.c  | 203
-rw-r--r--  vpx_dsp/x86/quantize_ssse3.h   |  12
6 files changed, 267 insertions(+), 50 deletions(-)
diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c
index b5d1e7ecb..adef5f6e1 100644
--- a/vpx_dsp/arm/quantize_neon.c
+++ b/vpx_dsp/arm/quantize_neon.c
@@ -15,6 +15,22 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/mem_neon.h"
+static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff,
+ const int16x8_t dequant,
+ tran_low_t *dqcoeff) {
+ const int32x4_t dqcoeff_0 =
+ vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
+ const int32x4_t dqcoeff_1 =
+ vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ vst1q_s32(dqcoeff, dqcoeff_0);
+ vst1q_s32(dqcoeff + 4, dqcoeff_1);
+#else
+ vst1q_s16(dqcoeff, vcombine_s16(vmovn_s32(dqcoeff_0), vmovn_s32(dqcoeff_1)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+}
+
void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block, const int16_t *zbin_ptr,
const int16_t *round_ptr, const int16_t *quant_ptr,
@@ -73,9 +89,7 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
qcoeff_ptr += 8;
- qcoeff = vmulq_s16(qcoeff, dequant);
-
- store_s16q_to_tran_low(dqcoeff_ptr, qcoeff);
+ calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr);
dqcoeff_ptr += 8;
}
@@ -126,9 +140,7 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
qcoeff_ptr += 8;
- qcoeff = vmulq_s16(qcoeff, dequant);
-
- store_s16q_to_tran_low(dqcoeff_ptr, qcoeff);
+ calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr);
dqcoeff_ptr += 8;
n_coeffs -= 8;
@@ -152,6 +164,28 @@ static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31));
}
+static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff,
+ const int16x8_t dequant,
+ tran_low_t *dqcoeff) {
+ int32x4_t dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
+ int32x4_t dqcoeff_1 =
+ vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
+
+ // Add 1 if negative to round towards zero, since division in the C reference truncates toward zero.
+ dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
+ dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1);
+ dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1);
+ vst1q_s32(dqcoeff, dqcoeff_0);
+ vst1q_s32(dqcoeff + 4, dqcoeff_1);
+#else
+ vst1q_s16(dqcoeff,
+ vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+}
+
// Main difference is that zbin values are halved before comparison and dqcoeff
// values are divided by 2. zbin is rounded but dqcoeff is not.
void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
@@ -194,8 +228,6 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
// (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
- int16x8_t dqcoeff;
- int32x4_t dqcoeff_0, dqcoeff_1;
qcoeff = vaddq_s16(qcoeff, rounded);
@@ -217,17 +249,7 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
qcoeff_ptr += 8;
- dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
- dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
-
- // Add 1 if negative to round towards zero because the C uses division.
- dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
- dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
-
- dqcoeff =
- vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1));
-
- store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff);
+ calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr);
dqcoeff_ptr += 8;
}
@@ -254,8 +276,6 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
// (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
- int16x8_t dqcoeff;
- int32x4_t dqcoeff_0, dqcoeff_1;
qcoeff = vaddq_s16(qcoeff, rounded);
@@ -278,16 +298,7 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
qcoeff_ptr += 8;
- dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
- dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
-
- dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
- dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
-
- dqcoeff =
- vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1));
-
- store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff);
+ calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr);
dqcoeff_ptr += 8;
}
}
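
The calculate_dqcoeff_and_store_32x32() helper above mirrors the scalar expression dqcoeff = qcoeff * dequant / 2 from vpx_quantize_b_32x32_c. A minimal scalar sketch (not part of the patch; it ignores the zbin/eob logic and scan order, uses int32_t in place of tran_low_t, i.e. the high bit depth layout, and the function name is made up here):

#include <stdint.h>

static void dqcoeff_32x32_scalar_sketch(const int16_t *qcoeff,
                                        const int16_t *dequant, /* {DC, AC} */
                                        int32_t *dqcoeff, int n) {
  int i;
  for (i = 0; i < n; ++i) {
    /* i == 0 is the DC coefficient; all others use the AC dequant value. */
    const int32_t prod = (int32_t)qcoeff[i] * dequant[i != 0];
    /* Adding the sign bit before the arithmetic shift makes >> 1 behave
       like C's / 2, which truncates toward zero for negative products;
       extract_sign_bit() does the same per lane in the NEON code. */
    dqcoeff[i] = (prod + (prod < 0)) >> 1;
  }
}

On non-highbitdepth builds the NEON helper narrows the same 32-bit result back to 16 bits with vshrn_n_s32/vmovn_s32, which drops the upper bits just as a cast to int16_t would.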
diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c
index 82a659592..0e6a0b83f 100644
--- a/vpx_dsp/quantize.c
+++ b/vpx_dsp/quantize.c
@@ -12,6 +12,7 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/quantize.h"
+#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_mem/vpx_mem.h"
void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
@@ -259,7 +260,15 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
15;
qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+#if (ARCH_X86 || ARCH_X86_64) && !CONFIG_VP9_HIGHBITDEPTH
+ // When tran_low_t is only 16 bits, dqcoeff can exceed its range. Rather than
+ // truncating with a cast, saturate the value. This is easier to implement
+ // on x86 and preserves the sign of the value.
+ dqcoeff_ptr[rc] =
+ clamp(qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2, INT16_MIN, INT16_MAX);
+#else
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+#endif // (ARCH_X86 || ARCH_X86_64) && !CONFIG_VP9_HIGHBITDEPTH
if (tmp) eob = idx_arr[i];
}
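
The new x86-only branch saturates instead of truncating when the 32-bit product does not fit in a 16-bit tran_low_t. A standalone illustration (the coefficient values are hypothetical, and clamp() from vpx_dsp_common.h is spelled out as a ternary so the snippet builds without libvpx headers):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int32_t qcoeff = 1200, dequant = 1864; /* hypothetical values */
  const int32_t wide = qcoeff * dequant / 2;   /* 1118400: outside int16_t */
  const int16_t truncated = (int16_t)wide;     /* impl.-defined; typically wraps to 4288 */
  const int16_t saturated = (int16_t)(wide > INT16_MAX   ? INT16_MAX
                                      : wide < INT16_MIN ? INT16_MIN
                                                         : wide);
  printf("wide=%d truncated=%d saturated=%d\n", (int)wide, truncated, saturated);
  return 0;
}

Saturation keeps the sign and magnitude as close as possible to the wide result, which is also what the SSSE3 code now produces with _mm_packs_epi32 (see quantize_ssse3.h below).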
diff --git a/vpx_dsp/ssim.c b/vpx_dsp/ssim.c
index ba73eb293..7c3c31bad 100644
--- a/vpx_dsp/ssim.c
+++ b/vpx_dsp/ssim.c
@@ -73,7 +73,7 @@ static const int64_t cc2_12 = 61817334; // (64^2*(.03*4095)^2
static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s,
uint32_t sum_sq_r, uint32_t sum_sxr, int count,
uint32_t bd) {
- int64_t ssim_n, ssim_d;
+ double ssim_n, ssim_d;
int64_t c1, c2;
if (bd == 8) {
// scale the constants by number of pixels
@@ -90,14 +90,14 @@ static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s,
assert(0);
}
- ssim_n = (2 * sum_s * sum_r + c1) *
- ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2);
+ ssim_n = (2.0 * sum_s * sum_r + c1) *
+ (2.0 * count * sum_sxr - 2.0 * sum_s * sum_r + c2);
- ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) *
- ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s +
- (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2);
+ ssim_d = ((double)sum_s * sum_s + (double)sum_r * sum_r + c1) *
+ ((double)count * sum_sq_s - (double)sum_s * sum_s +
+ (double)count * sum_sq_r - (double)sum_r * sum_r + c2);
- return ssim_n * 1.0 / ssim_d;
+ return ssim_n / ssim_d;
}
static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) {
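
Switching similarity() to double arithmetic avoids a possible int64_t overflow in the high-bit-depth path. A rough worst-case bound, ignoring the c1/c2 constants and any cancellation between terms (illustrative only):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const double max_pix = 4095.0;          /* largest 12-bit sample */
  const double count = 64.0;              /* 8x8 SSIM window */
  const double max_sum = count * max_pix; /* bound on sum_s and sum_r */
  /* Each factor of ssim_n is bounded by roughly 2 * max_sum^2, so the
     product can approach ~1.9e22, well beyond INT64_MAX (~9.2e18). */
  const double factor = 2.0 * max_sum * max_sum;
  printf("worst case ~%.3e vs INT64_MAX %.3e\n", factor * factor,
         (double)INT64_MAX);
  return 0;
}

Doubles lose some low-order precision at that magnitude but do not overflow, and the result is returned as a double ratio anyway.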
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 182503810..6dc317630 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -786,13 +786,13 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
specialize qw/vpx_hadamard_32x32 sse2 avx2/;
add_proto qw/void vpx_highbd_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
- specialize qw/vpx_highbd_hadamard_8x8/;
+ specialize qw/vpx_highbd_hadamard_8x8 avx2/;
add_proto qw/void vpx_highbd_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
- specialize qw/vpx_highbd_hadamard_16x16/;
+ specialize qw/vpx_highbd_hadamard_16x16 avx2/;
add_proto qw/void vpx_highbd_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
- specialize qw/vpx_highbd_hadamard_32x32/;
+ specialize qw/vpx_highbd_hadamard_32x32 avx2/;
add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";
specialize qw/vpx_satd avx2 sse2 neon/;
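
Adding avx2 to a specialize line makes the generated vpx_dsp_rtcd.h dispatch the symbol through a function pointer that is redirected to the _avx2 variant when the CPU reports AVX2 support. The sketch below only shows the usual shape of that generated code; the real header is produced by rtcd.pl, and setup_rtcd_sketch / have_avx2 are made-up stand-ins:

#include <stddef.h>
#include <stdint.h>

typedef int32_t tran_low_t; /* stand-in for the libvpx typedef (highbd layout) */

/* Implementations provided by vpx_dsp (declarations only in this sketch). */
void vpx_highbd_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,
                               tran_low_t *coeff);
void vpx_highbd_hadamard_8x8_avx2(const int16_t *src_diff,
                                  ptrdiff_t src_stride, tran_low_t *coeff);

/* The public name becomes a function pointer in the generated header. */
void (*vpx_highbd_hadamard_8x8)(const int16_t *src_diff, ptrdiff_t src_stride,
                                tran_low_t *coeff);

static void setup_rtcd_sketch(int have_avx2) {
  vpx_highbd_hadamard_8x8 = vpx_highbd_hadamard_8x8_c; /* portable default */
  if (have_avx2) vpx_highbd_hadamard_8x8 = vpx_highbd_hadamard_8x8_avx2;
}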
diff --git a/vpx_dsp/x86/avg_intrin_avx2.c b/vpx_dsp/x86/avg_intrin_avx2.c
index 63a69e582..f39210b6a 100644
--- a/vpx_dsp/x86/avg_intrin_avx2.c
+++ b/vpx_dsp/x86/avg_intrin_avx2.c
@@ -15,6 +15,209 @@
#include "vpx_dsp/x86/bitdepth_conversion_avx2.h"
#include "vpx_ports/mem.h"
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_hadamard_col8_avx2(__m256i *in, int iter) {
+ __m256i a0 = in[0];
+ __m256i a1 = in[1];
+ __m256i a2 = in[2];
+ __m256i a3 = in[3];
+ __m256i a4 = in[4];
+ __m256i a5 = in[5];
+ __m256i a6 = in[6];
+ __m256i a7 = in[7];
+
+ __m256i b0 = _mm256_add_epi32(a0, a1);
+ __m256i b1 = _mm256_sub_epi32(a0, a1);
+ __m256i b2 = _mm256_add_epi32(a2, a3);
+ __m256i b3 = _mm256_sub_epi32(a2, a3);
+ __m256i b4 = _mm256_add_epi32(a4, a5);
+ __m256i b5 = _mm256_sub_epi32(a4, a5);
+ __m256i b6 = _mm256_add_epi32(a6, a7);
+ __m256i b7 = _mm256_sub_epi32(a6, a7);
+
+ a0 = _mm256_add_epi32(b0, b2);
+ a1 = _mm256_add_epi32(b1, b3);
+ a2 = _mm256_sub_epi32(b0, b2);
+ a3 = _mm256_sub_epi32(b1, b3);
+ a4 = _mm256_add_epi32(b4, b6);
+ a5 = _mm256_add_epi32(b5, b7);
+ a6 = _mm256_sub_epi32(b4, b6);
+ a7 = _mm256_sub_epi32(b5, b7);
+
+ if (iter == 0) {
+ b0 = _mm256_add_epi32(a0, a4);
+ b7 = _mm256_add_epi32(a1, a5);
+ b3 = _mm256_add_epi32(a2, a6);
+ b4 = _mm256_add_epi32(a3, a7);
+ b2 = _mm256_sub_epi32(a0, a4);
+ b6 = _mm256_sub_epi32(a1, a5);
+ b1 = _mm256_sub_epi32(a2, a6);
+ b5 = _mm256_sub_epi32(a3, a7);
+
+ a0 = _mm256_unpacklo_epi32(b0, b1);
+ a1 = _mm256_unpacklo_epi32(b2, b3);
+ a2 = _mm256_unpackhi_epi32(b0, b1);
+ a3 = _mm256_unpackhi_epi32(b2, b3);
+ a4 = _mm256_unpacklo_epi32(b4, b5);
+ a5 = _mm256_unpacklo_epi32(b6, b7);
+ a6 = _mm256_unpackhi_epi32(b4, b5);
+ a7 = _mm256_unpackhi_epi32(b6, b7);
+
+ b0 = _mm256_unpacklo_epi64(a0, a1);
+ b1 = _mm256_unpacklo_epi64(a4, a5);
+ b2 = _mm256_unpackhi_epi64(a0, a1);
+ b3 = _mm256_unpackhi_epi64(a4, a5);
+ b4 = _mm256_unpacklo_epi64(a2, a3);
+ b5 = _mm256_unpacklo_epi64(a6, a7);
+ b6 = _mm256_unpackhi_epi64(a2, a3);
+ b7 = _mm256_unpackhi_epi64(a6, a7);
+
+ in[0] = _mm256_permute2x128_si256(b0, b1, 0x20);
+ in[1] = _mm256_permute2x128_si256(b0, b1, 0x31);
+ in[2] = _mm256_permute2x128_si256(b2, b3, 0x20);
+ in[3] = _mm256_permute2x128_si256(b2, b3, 0x31);
+ in[4] = _mm256_permute2x128_si256(b4, b5, 0x20);
+ in[5] = _mm256_permute2x128_si256(b4, b5, 0x31);
+ in[6] = _mm256_permute2x128_si256(b6, b7, 0x20);
+ in[7] = _mm256_permute2x128_si256(b6, b7, 0x31);
+ } else {
+ in[0] = _mm256_add_epi32(a0, a4);
+ in[7] = _mm256_add_epi32(a1, a5);
+ in[3] = _mm256_add_epi32(a2, a6);
+ in[4] = _mm256_add_epi32(a3, a7);
+ in[2] = _mm256_sub_epi32(a0, a4);
+ in[6] = _mm256_sub_epi32(a1, a5);
+ in[1] = _mm256_sub_epi32(a2, a6);
+ in[5] = _mm256_sub_epi32(a3, a7);
+ }
+}
+
+void vpx_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ __m128i src16[8];
+ __m256i src32[8];
+
+ src16[0] = _mm_loadu_si128((const __m128i *)src_diff);
+ src16[1] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[2] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[3] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[7] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+
+ src32[0] = _mm256_cvtepi16_epi32(src16[0]);
+ src32[1] = _mm256_cvtepi16_epi32(src16[1]);
+ src32[2] = _mm256_cvtepi16_epi32(src16[2]);
+ src32[3] = _mm256_cvtepi16_epi32(src16[3]);
+ src32[4] = _mm256_cvtepi16_epi32(src16[4]);
+ src32[5] = _mm256_cvtepi16_epi32(src16[5]);
+ src32[6] = _mm256_cvtepi16_epi32(src16[6]);
+ src32[7] = _mm256_cvtepi16_epi32(src16[7]);
+
+ highbd_hadamard_col8_avx2(src32, 0);
+ highbd_hadamard_col8_avx2(src32, 1);
+
+ _mm256_storeu_si256((__m256i *)coeff, src32[0]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[1]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[2]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[3]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[4]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[5]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[6]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[7]);
+}
+
+void vpx_highbd_hadamard_16x16_avx2(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff) {
+ int idx;
+ tran_low_t *t_coeff = coeff;
+ for (idx = 0; idx < 4; ++idx) {
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+ vpx_highbd_hadamard_8x8_avx2(src_ptr, src_stride, t_coeff + idx * 64);
+ }
+
+ for (idx = 0; idx < 64; idx += 8) {
+ __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+ __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
+ __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
+ __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));
+
+ __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
+ __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
+ __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
+ __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);
+
+ b0 = _mm256_srai_epi32(b0, 1);
+ b1 = _mm256_srai_epi32(b1, 1);
+ b2 = _mm256_srai_epi32(b2, 1);
+ b3 = _mm256_srai_epi32(b3, 1);
+
+ coeff0 = _mm256_add_epi32(b0, b2);
+ coeff1 = _mm256_add_epi32(b1, b3);
+ coeff2 = _mm256_sub_epi32(b0, b2);
+ coeff3 = _mm256_sub_epi32(b1, b3);
+
+ _mm256_storeu_si256((__m256i *)coeff, coeff0);
+ _mm256_storeu_si256((__m256i *)(coeff + 64), coeff1);
+ _mm256_storeu_si256((__m256i *)(coeff + 128), coeff2);
+ _mm256_storeu_si256((__m256i *)(coeff + 192), coeff3);
+
+ coeff += 8;
+ t_coeff += 8;
+ }
+}
+
+void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff) {
+ int idx;
+ tran_low_t *t_coeff = coeff;
+ for (idx = 0; idx < 4; ++idx) {
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+ vpx_highbd_hadamard_16x16_avx2(src_ptr, src_stride, t_coeff + idx * 256);
+ }
+
+ for (idx = 0; idx < 256; idx += 8) {
+ __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+ __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
+ __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
+ __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));
+
+ __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
+ __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
+ __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
+ __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);
+
+ b0 = _mm256_srai_epi32(b0, 2);
+ b1 = _mm256_srai_epi32(b1, 2);
+ b2 = _mm256_srai_epi32(b2, 2);
+ b3 = _mm256_srai_epi32(b3, 2);
+
+ coeff0 = _mm256_add_epi32(b0, b2);
+ coeff1 = _mm256_add_epi32(b1, b3);
+ coeff2 = _mm256_sub_epi32(b0, b2);
+ coeff3 = _mm256_sub_epi32(b1, b3);
+
+ _mm256_storeu_si256((__m256i *)coeff, coeff0);
+ _mm256_storeu_si256((__m256i *)(coeff + 256), coeff1);
+ _mm256_storeu_si256((__m256i *)(coeff + 512), coeff2);
+ _mm256_storeu_si256((__m256i *)(coeff + 768), coeff3);
+
+ coeff += 8;
+ t_coeff += 8;
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
static void hadamard_col8x2_avx2(__m256i *in, int iter) {
__m256i a0 = in[0];
__m256i a1 = in[1];
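
The 16x16 and 32x32 highbd transforms build on the 8x8 kernel: four sub-transforms are written into t_coeff, then combined with one more butterfly stage and normalized by >> 1 (16x16, sub-blocks at offsets 0/64/128/192) or >> 2 (32x32, offsets 0/256/512/768). A scalar view of the 16x16 combining loop that the AVX2 code above runs eight lanes at a time (sketch only, not part of the patch; int32_t stands in for tran_low_t):

#include <stdint.h>

typedef int32_t tran_low_t; /* high bit depth coefficient layout */

static void highbd_hadamard_16x16_combine_sketch(const tran_low_t *t_coeff,
                                                 tran_low_t *coeff) {
  int idx;
  for (idx = 0; idx < 64; ++idx) {
    const tran_low_t a0 = t_coeff[idx];
    const tran_low_t a1 = t_coeff[idx + 64];
    const tran_low_t a2 = t_coeff[idx + 128];
    const tran_low_t a3 = t_coeff[idx + 192];

    /* Butterfly across the four 8x8 sub-blocks, with a >> 1 normalization. */
    const tran_low_t b0 = (a0 + a1) >> 1;
    const tran_low_t b1 = (a0 - a1) >> 1;
    const tran_low_t b2 = (a2 + a3) >> 1;
    const tran_low_t b3 = (a2 - a3) >> 1;

    coeff[idx] = b0 + b2;
    coeff[idx + 64] = b1 + b3;
    coeff[idx + 128] = b0 - b2;
    coeff[idx + 192] = b1 - b3;
  }
}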
diff --git a/vpx_dsp/x86/quantize_ssse3.h b/vpx_dsp/x86/quantize_ssse3.h
index 35223d7b4..e8d2a0577 100644
--- a/vpx_dsp/x86/quantize_ssse3.h
+++ b/vpx_dsp/x86/quantize_ssse3.h
@@ -24,7 +24,6 @@ static INLINE void calculate_dqcoeff_and_store_32x32(const __m128i qcoeff,
// Un-sign to bias rounding like C.
const __m128i coeff = _mm_abs_epi16(qcoeff);
-#if CONFIG_VP9_HIGHBITDEPTH
const __m128i sign_0 = _mm_unpacklo_epi16(zero, qcoeff);
const __m128i sign_1 = _mm_unpackhi_epi16(zero, qcoeff);
@@ -40,17 +39,12 @@ static INLINE void calculate_dqcoeff_and_store_32x32(const __m128i qcoeff,
dqcoeff32_0 = _mm_sign_epi32(dqcoeff32_0, sign_0);
dqcoeff32_1 = _mm_sign_epi32(dqcoeff32_1, sign_1);
+#if CONFIG_VP9_HIGHBITDEPTH
_mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0);
_mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1);
#else
- __m128i dqcoeff16 = _mm_mullo_epi16(coeff, dequant);
- (void)zero;
-
- dqcoeff16 = _mm_srli_epi16(dqcoeff16, 1);
-
- dqcoeff16 = _mm_sign_epi16(dqcoeff16, qcoeff);
-
- _mm_store_si128((__m128i *)(dqcoeff), dqcoeff16);
+ _mm_store_si128((__m128i *)(dqcoeff),
+ _mm_packs_epi32(dqcoeff32_0, dqcoeff32_1));
#endif // CONFIG_VP9_HIGHBITDEPTH
}
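
With the 16-bit multiply path removed, both configurations of calculate_dqcoeff_and_store_32x32 now compute 32-bit products and sign-correct them with _mm_sign_epi32; the non-highbd path then narrows with _mm_packs_epi32, whose signed saturation matches the clamp() added to vpx_quantize_b_32x32_c. A standalone illustration of that saturating pack (arbitrary input values):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* 32-bit lanes, some outside the int16_t range. */
  const __m128i wide0 = _mm_setr_epi32(1118400, -70000, 123, -123);
  const __m128i wide1 = _mm_setr_epi32(32768, -32769, 32767, -32768);
  const __m128i packed = _mm_packs_epi32(wide0, wide1); /* saturating narrow */
  int16_t out[8];
  int i;
  _mm_storeu_si128((__m128i *)out, packed);
  for (i = 0; i < 8; ++i) printf("%d ", out[i]);
  /* Prints: 32767 -32768 123 -123 32767 -32768 32767 -32768 */
  printf("\n");
  return 0;
}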