Use pmovmskb to skip quantize loops over empty coefficients.

If none of the 16 coefficients that we quantize per loop iteration are larger than the zbin, directly skip to the next round of coeffs, rather than doing a full quantize loop that will eventually result in 16 zeroes. This incurs a jump cost, but saves a lot of other work. 32x32 quant goes from 1349 -> 1184 cycles. The same approach yielded no significantly positive results for smaller transforms, so is not used there (8x8: 103 -> 101 cycles; 16x16: 302 -> 306 cycles). Change-Id: I8fca17dc2543fc8eed1dbcd5100145e3c3a9b647
author: Ronald S. Bultje <rbultje@google.com> 2013-07-01 12:03:20 -0700
committer: Ronald S. Bultje <rbultje@google.com> 2013-07-02 16:34:24 -0700
commit: e5fb4b61b66d188b3afed56f1e2548dd6e1a2074 (patch)
tree: 805ad61f17ae17d50e4f9668576dc7f072d29da0 /vp9
parent: 5b872402307b6a62ee4dbe93021c003ccf99a547 (diff)
download: libvpx-e5fb4b61b66d188b3afed56f1e2548dd6e1a2074.tar
libvpx-e5fb4b61b66d188b3afed56f1e2548dd6e1a2074.tar.gz
libvpx-e5fb4b61b66d188b3afed56f1e2548dd6e1a2074.tar.bz2
libvpx-e5fb4b61b66d188b3afed56f1e2548dd6e1a2074.zip
1 files changed, 25 insertions, 9 deletions
diff --git a/vp9/encoder/x86/vp9_quantize_ssse3.asm b/vp9/encoder/x86/vp9_quantize_ssse3.asm
index b666abbd9..60f799195 100644
--- a/vp9/encoder/x86/vp9_quantize_ssse3.asm
+++ b/vp9/encoder/x86/vp9_quantize_ssse3.asm
@@ -15,10 +15,10 @@ pw_1: times 8 dw 1
 
 SECTION .text
 
-%macro QUANTIZE_FN 1
-cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
-                               shift, qcoeff, dqcoeff, dequant, zbin_oq, \
-                               eob, scan, iscan
+%macro QUANTIZE_FN 2
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+                                shift, qcoeff, dqcoeff, dequant, zbin_oq, \
+                                eob, scan, iscan
   cmp                    dword skipm, 0
   jne .blank
 
@@ -43,9 +43,8 @@ cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
   mova                            m4, [r2]                 ; m4 = shift
   mov                             r4, dqcoeffmp
   mov                             r5, iscanmp
-  mov                             r2, eobmp
   pxor                            m5, m5                   ; m5 = dedicated zero
-  DEFINE_ARGS coeff, ncoeff, eob, qcoeff, dqcoeff, iscan
+  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
   lea                         coeffq, [  coeffq+ncoeffq*2]
   lea                         iscanq, [  iscanq+ncoeffq*2]
   lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
@@ -119,6 +118,12 @@ cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
 %endif
   pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
   pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
+%ifidn %1, b_32x32
+  pmovmskb                        r6, m7
+  pmovmskb                        r2, m12
+  or                              r6, r2
+  jz .skip_iter
+%endif
   paddw                           m6, m1                   ; m6 += round
   paddw                          m11, m1                   ; m11 += round
   pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
@@ -159,16 +164,27 @@ cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
   pmaxsw                          m8, m13
   add                        ncoeffq, mmsize
   jl .ac_only_loop
+%ifidn %1, b_32x32
+  jmp .accumulate_eob
+.skip_iter:
+  mova        [qcoeffq+ncoeffq*2+ 0], m5
+  mova        [qcoeffq+ncoeffq*2+16], m5
+  mova       [dqcoeffq+ncoeffq*2+ 0], m5
+  mova       [dqcoeffq+ncoeffq*2+16], m5
+  add                        ncoeffq, mmsize
+  jl .ac_only_loop
+%endif
 
 .accumulate_eob:
   ; horizontally accumulate/max eobs and write into [eob] memory pointer
+  mov                             r2, eobmp
   pshufd                          m7, m8, 0xe
   pmaxsw                          m8, m7
   pshuflw                         m7, m8, 0xe
   pmaxsw                          m8, m7
   pshuflw                         m7, m8, 0x1
   pmaxsw                          m8, m7
-  pextrw                      [eobq], m8, 0
+  pextrw                        [r2], m8, 0
   RET
 
   ; skip-block, i.e. just write all zeroes
@@ -194,5 +210,5 @@ cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
 %endmacro
 
 INIT_XMM ssse3
-QUANTIZE_FN b
-QUANTIZE_FN b_32x32
+QUANTIZE_FN b, 6
+QUANTIZE_FN b_32x32, 7
author	Ronald S. Bultje <rbultje@google.com>	2013-07-01 12:03:20 -0700
committer	Ronald S. Bultje <rbultje@google.com>	2013-07-02 16:34:24 -0700
commit	e5fb4b61b66d188b3afed56f1e2548dd6e1a2074 (patch)
tree	805ad61f17ae17d50e4f9668576dc7f072d29da0 /vp9
parent	5b872402307b6a62ee4dbe93021c003ccf99a547 (diff)
download	libvpx-e5fb4b61b66d188b3afed56f1e2548dd6e1a2074.tar libvpx-e5fb4b61b66d188b3afed56f1e2548dd6e1a2074.tar.gz libvpx-e5fb4b61b66d188b3afed56f1e2548dd6e1a2074.tar.bz2 libvpx-e5fb4b61b66d188b3afed56f1e2548dd6e1a2074.zip