Merge "Update quantize SSSE3 SIMD to cover 32x32 transform case also."

author: Ronald S. Bultje <rbultje@google.com> 2013-07-02 09:38:08 -0700
committer: Gerrit Code Review <gerrit@gerrit.golo.chromium.org> 2013-07-02 09:38:08 -0700
commit: 9df24b41ca16353acb123acae7c70813cfffafdd (patch)
tree: ac08ef6d484b5db6e24eae5d193d4f167d62cda1
parent: b7cd01ed7375b1e5b6dc67f7427d07298f244471 (diff)
parent: c8defcfdeea614a780af9a2405f59c60cab876ad (diff)
download: libvpx-9df24b41ca16353acb123acae7c70813cfffafdd.tar
libvpx-9df24b41ca16353acb123acae7c70813cfffafdd.tar.gz
libvpx-9df24b41ca16353acb123acae7c70813cfffafdd.tar.bz2
libvpx-9df24b41ca16353acb123acae7c70813cfffafdd.zip
3 files changed, 55 insertions, 20 deletions
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index e7cefa57c..330c60f6d 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -569,6 +569,9 @@ specialize vp9_subtract_block sse2
 prototype void vp9_quantize_b "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
 specialize vp9_quantize_b $ssse3_x86_64
 
+prototype void vp9_quantize_b_32x32 "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
+specialize vp9_quantize_b_32x32 $ssse3_x86_64
+
 #
 # Structured Similarity (SSIM)
 #
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 6f2e13a0e..862923fd4 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -85,18 +85,19 @@ void vp9_quantize_b_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
 }
 
 // This function works well for large transform size.
-static void quantize_sparse(int16_t *coeff_ptr, intptr_t n_coeffs,
+void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs,
                             int skip_block,
                             int16_t *zbin_ptr, int16_t *round_ptr,
                             int16_t *quant_ptr, int16_t *quant_shift_ptr,
                             int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                             int16_t *dequant_ptr, int zbin_oq_value,
                             uint16_t *eob_ptr, const int16_t *scan,
-                            int *idx_arr) {
+                            const int16_t *iscan) {
   int i, rc, eob;
   int zbins[2], nzbins[2], zbin;
   int x, y, z, sz;
   int idx = 0;
+  int idx_arr[1024];
 
   vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
   vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
@@ -179,20 +180,18 @@ void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
   // Call different quantization for different transform size.
   if (n_coeffs >= 1024) {
     // Save index of picked coefficient in pre-scan pass.
-    int idx_arr[1024];
-
-    quantize_sparse(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
-                    n_coeffs, mb->skip_block,
-                    mb->plane[plane].zbin,
-                    mb->plane[plane].round,
-                    mb->plane[plane].quant,
-                    mb->plane[plane].quant_shift,
-                    BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
-                    BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
-                    xd->plane[plane].dequant,
-                    mb->plane[plane].zbin_extra,
-                    &xd->plane[plane].eobs[block],
-                    scan, idx_arr);
+    vp9_quantize_b_32x32(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
+                         n_coeffs, mb->skip_block,
+                         mb->plane[plane].zbin,
+                         mb->plane[plane].round,
+                         mb->plane[plane].quant,
+                         mb->plane[plane].quant_shift,
+                         BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
+                         BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+                         xd->plane[plane].dequant,
+                         mb->plane[plane].zbin_extra,
+                         &xd->plane[plane].eobs[block],
+                         scan, iscan);
   }
   else {
     vp9_quantize_b(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
diff --git a/vp9/encoder/x86/vp9_quantize_ssse3.asm b/vp9/encoder/x86/vp9_quantize_ssse3.asm
index 665bafacb..b666abbd9 100644
--- a/vp9/encoder/x86/vp9_quantize_ssse3.asm
+++ b/vp9/encoder/x86/vp9_quantize_ssse3.asm
@@ -15,10 +15,10 @@ pw_1: times 8 dw 1
 
 SECTION .text
 
-INIT_XMM ssse3
-cglobal quantize_b, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
-                              shift, qcoeff, dqcoeff, dequant, zbin_oq, \
-                              eob, scan, iscan
+%macro QUANTIZE_FN 1
+cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
+                               shift, qcoeff, dqcoeff, dequant, zbin_oq, \
+                               eob, scan, iscan
   cmp                    dword skipm, 0
   jne .blank
 
@@ -57,6 +57,10 @@ cglobal quantize_b, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
   mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
   pabsw                           m6, m9                   ; m6 = abs(m9)
   pabsw                          m11, m10                  ; m11 = abs(m10)
+%ifidn %1, b_32x32
+  paddw                           m6, m6
+  paddw                          m11, m11
+%endif
   pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
   punpckhqdq                      m0, m0
   pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
@@ -77,9 +81,19 @@ cglobal quantize_b, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
   pand                           m13, m12
   mova        [qcoeffq+ncoeffq*2+ 0], m8
   mova        [qcoeffq+ncoeffq*2+16], m13
+%ifidn %1, b_32x32
+  pabsw                           m8, m8
+  pabsw                          m13, m13
+%endif
   pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
   punpckhqdq                      m3, m3
   pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+  psrlw                           m8, 1
+  psrlw                          m13, 1
+  psignw                          m8, m9
+  psignw                         m13, m10
+%endif
   mova       [dqcoeffq+ncoeffq*2+ 0], m8
   mova       [dqcoeffq+ncoeffq*2+16], m13
   pcmpeqw                         m8, m5                   ; m8 = c[i] == 0
@@ -99,6 +113,10 @@ cglobal quantize_b, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
   mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
   pabsw                           m6, m9                   ; m6 = abs(m9)
   pabsw                          m11, m10                  ; m11 = abs(m10)
+%ifidn %1, b_32x32
+  paddw                           m6, m6
+  paddw                          m11, m11
+%endif
   pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
   pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
   paddw                           m6, m1                   ; m6 += round
@@ -115,8 +133,18 @@ cglobal quantize_b, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
   pand                           m13, m12
   mova        [qcoeffq+ncoeffq*2+ 0], m14
   mova        [qcoeffq+ncoeffq*2+16], m13
+%ifidn %1, b_32x32
+  pabsw                          m14, m14
+  pabsw                          m13, m13
+%endif
   pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
   pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+  psrlw                          m14, 1
+  psrlw                          m13, 1
+  psignw                         m14, m9
+  psignw                         m13, m10
+%endif
   mova       [dqcoeffq+ncoeffq*2+ 0], m14
   mova       [dqcoeffq+ncoeffq*2+16], m13
   pcmpeqw                        m14, m5                   ; m14 = c[i] == 0
@@ -163,3 +191,8 @@ cglobal quantize_b, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
   jl .blank_loop
   mov                    word [eobq], 0
   RET
+%endmacro
+
+INIT_XMM ssse3
+QUANTIZE_FN b
+QUANTIZE_FN b_32x32
author	Ronald S. Bultje <rbultje@google.com>	2013-07-02 09:38:08 -0700
committer	Gerrit Code Review <gerrit@gerrit.golo.chromium.org>	2013-07-02 09:38:08 -0700
commit	9df24b41ca16353acb123acae7c70813cfffafdd (patch)
tree	ac08ef6d484b5db6e24eae5d193d4f167d62cda1
parent	b7cd01ed7375b1e5b6dc67f7427d07298f244471 (diff)
parent	c8defcfdeea614a780af9a2405f59c60cab876ad (diff)
download	libvpx-9df24b41ca16353acb123acae7c70813cfffafdd.tar libvpx-9df24b41ca16353acb123acae7c70813cfffafdd.tar.gz libvpx-9df24b41ca16353acb123acae7c70813cfffafdd.tar.bz2 libvpx-9df24b41ca16353acb123acae7c70813cfffafdd.zip