summaryrefslogtreecommitdiff
path: root/vpx_dsp
diff options
context:
space:
mode:
authorJulia Robson <juliamrobson@gmail.com>2015-10-02 10:20:06 +0100
committerDebargha Mukherjee <debargha@google.com>2015-10-05 10:59:16 -0700
commit5e6533e70730e0798b4b00d795d400bc87f8927c (patch)
tree0f0e821f65c87a314727f691cebd9b6c0e729507 /vpx_dsp
parent7777e7a8d5d4993825f441abd1dd9eb35680bf60 (diff)
downloadlibvpx-5e6533e70730e0798b4b00d795d400bc87f8927c.tar
libvpx-5e6533e70730e0798b4b00d795d400bc87f8927c.tar.gz
libvpx-5e6533e70730e0798b4b00d795d400bc87f8927c.tar.bz2
libvpx-5e6533e70730e0798b4b00d795d400bc87f8927c.zip
SSE2 optimisation for quantize in high bit depth
When configured with high bit depth enabled, the 8bit quantize function stopped using optimised code. This made 8bit content decode slowly. This commit re-enables the SSE2 optimisation (but not the SSSE3 optimisation). Change-Id: Id015fe3c1c44580a4bff3f4bd985170f2806a9d9
Diffstat (limited to 'vpx_dsp')
-rw-r--r--vpx_dsp/vpx_dsp_rtcd_defs.pl2
-rw-r--r--vpx_dsp/x86/quantize_sse2.c63
2 files changed, 45 insertions, 20 deletions
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 535944e29..7f9b882e7 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -849,7 +849,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
if ((vpx_config("CONFIG_VP9_ENCODER") eq "yes") || (vpx_config("CONFIG_VP10_ENCODER") eq "yes")) {
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vpx_quantize_b/;
+ specialize qw/vpx_quantize_b sse2/;
add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vpx_quantize_b_32x32/;
diff --git a/vpx_dsp/x86/quantize_sse2.c b/vpx_dsp/x86/quantize_sse2.c
index c2a804e15..8aa4568d6 100644
--- a/vpx_dsp/x86/quantize_sse2.c
+++ b/vpx_dsp/x86/quantize_sse2.c
@@ -14,11 +14,36 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
-void vpx_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
+static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
+ (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], (int16_t)coeff_ptr[4],
+ (int16_t)coeff_ptr[5], (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
+#else
+ return _mm_load_si128((const __m128i *)coeff_ptr);
+#endif
+}
+
+static INLINE void store_coefficients(__m128i coeff_vals,
+ tran_low_t *coeff_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ __m128i one = _mm_set1_epi16(1);
+ __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
+ __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
+ __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
+ __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
+ _mm_store_si128((__m128i*)(coeff_ptr), coeff_vals_1);
+ _mm_store_si128((__m128i*)(coeff_ptr + 4), coeff_vals_2);
+#else
+ _mm_store_si128((__m128i*)(coeff_ptr), coeff_vals);
+#endif
+}
+
+void vpx_quantize_b_sse2(const tran_low_t* coeff_ptr, intptr_t n_coeffs,
int skip_block, const int16_t* zbin_ptr,
const int16_t* round_ptr, const int16_t* quant_ptr,
- const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
- int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
+ const int16_t* quant_shift_ptr, tran_low_t* qcoeff_ptr,
+ tran_low_t* dqcoeff_ptr, const int16_t* dequant_ptr,
uint16_t* eob_ptr,
const int16_t* scan_ptr,
const int16_t* iscan_ptr) {
@@ -56,8 +81,8 @@ void vpx_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
__m128i qtmp0, qtmp1;
__m128i cmp_mask0, cmp_mask1;
// Do DC and first 15 AC
- coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
- coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
+ coeff0 = load_coefficients(coeff_ptr + n_coeffs);
+ coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);
// Poor man's sign extract
coeff0_sign = _mm_srai_epi16(coeff0, 15);
@@ -92,15 +117,15 @@ void vpx_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+ store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
+ store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);
coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
dequant = _mm_unpackhi_epi64(dequant, dequant);
coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
+ store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
}
{
@@ -134,8 +159,8 @@ void vpx_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
__m128i qtmp0, qtmp1;
__m128i cmp_mask0, cmp_mask1;
- coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
- coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
+ coeff0 = load_coefficients(coeff_ptr + n_coeffs);
+ coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);
// Poor man's sign extract
coeff0_sign = _mm_srai_epi16(coeff0, 15);
@@ -166,14 +191,14 @@ void vpx_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+ store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
+ store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);
coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
+ store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
}
{
@@ -212,10 +237,10 @@ void vpx_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
}
} else {
do {
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+ store_coefficients(zero, dqcoeff_ptr + n_coeffs);
+ store_coefficients(zero, dqcoeff_ptr + n_coeffs + 8);
+ store_coefficients(zero, qcoeff_ptr + n_coeffs);
+ store_coefficients(zero, qcoeff_ptr + n_coeffs + 8);
n_coeffs += 8 * 2;
} while (n_coeffs < 0);
*eob_ptr = 0;