diff options
author | Ronald S. Bultje <rbultje@google.com> | 2013-07-01 12:03:20 -0700 |
---|---|---|
committer | Ronald S. Bultje <rbultje@google.com> | 2013-07-02 16:34:24 -0700 |
commit | e5fb4b61b66d188b3afed56f1e2548dd6e1a2074 (patch) | |
tree | 805ad61f17ae17d50e4f9668576dc7f072d29da0 /vp9 | |
parent | 5b872402307b6a62ee4dbe93021c003ccf99a547 (diff) | |
download | libvpx-e5fb4b61b66d188b3afed56f1e2548dd6e1a2074.tar libvpx-e5fb4b61b66d188b3afed56f1e2548dd6e1a2074.tar.gz libvpx-e5fb4b61b66d188b3afed56f1e2548dd6e1a2074.tar.bz2 libvpx-e5fb4b61b66d188b3afed56f1e2548dd6e1a2074.zip |
Use pmovmskb to skip quantize loops over empty coefficients.
If none of the 16 coefficients that we quantize per loop iteration
are larger than the zbin, directly skip to the next round of coeffs,
rather than doing a full quantize loop that will eventually result
in 16 zeroes. This incurs a jump cost, but saves a lot of other work.
32x32 quant goes from 1349 -> 1184 cycles. The same approach yielded
no significantly positive results for smaller transforms, so it is not
used there (8x8: 103 -> 101 cycles; 16x16: 302 -> 306 cycles).
Change-Id: I8fca17dc2543fc8eed1dbcd5100145e3c3a9b647
Diffstat (limited to 'vp9')
-rw-r--r-- | vp9/encoder/x86/vp9_quantize_ssse3.asm | 34 |
1 file changed, 25 insertions, 9 deletions
diff --git a/vp9/encoder/x86/vp9_quantize_ssse3.asm b/vp9/encoder/x86/vp9_quantize_ssse3.asm index b666abbd9..60f799195 100644 --- a/vp9/encoder/x86/vp9_quantize_ssse3.asm +++ b/vp9/encoder/x86/vp9_quantize_ssse3.asm @@ -15,10 +15,10 @@ pw_1: times 8 dw 1 SECTION .text -%macro QUANTIZE_FN 1 -cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \ - shift, qcoeff, dqcoeff, dequant, zbin_oq, \ - eob, scan, iscan +%macro QUANTIZE_FN 2 +cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ + shift, qcoeff, dqcoeff, dequant, zbin_oq, \ + eob, scan, iscan cmp dword skipm, 0 jne .blank @@ -43,9 +43,8 @@ cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \ mova m4, [r2] ; m4 = shift mov r4, dqcoeffmp mov r5, iscanmp - mov r2, eobmp pxor m5, m5 ; m5 = dedicated zero - DEFINE_ARGS coeff, ncoeff, eob, qcoeff, dqcoeff, iscan + DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob lea coeffq, [ coeffq+ncoeffq*2] lea iscanq, [ iscanq+ncoeffq*2] lea qcoeffq, [ qcoeffq+ncoeffq*2] @@ -119,6 +118,12 @@ cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \ %endif pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin +%ifidn %1, b_32x32 + pmovmskb r6, m7 + pmovmskb r2, m12 + or r6, r2 + jz .skip_iter +%endif paddw m6, m1 ; m6 += round paddw m11, m1 ; m11 += round pmulhw m14, m6, m2 ; m14 = m6*q>>16 @@ -159,16 +164,27 @@ cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \ pmaxsw m8, m13 add ncoeffq, mmsize jl .ac_only_loop +%ifidn %1, b_32x32 + jmp .accumulate_eob +.skip_iter: + mova [qcoeffq+ncoeffq*2+ 0], m5 + mova [qcoeffq+ncoeffq*2+16], m5 + mova [dqcoeffq+ncoeffq*2+ 0], m5 + mova [dqcoeffq+ncoeffq*2+16], m5 + add ncoeffq, mmsize + jl .ac_only_loop +%endif .accumulate_eob: ; horizontally accumulate/max eobs and write into [eob] memory pointer + mov r2, eobmp pshufd m7, m8, 0xe pmaxsw m8, m7 pshuflw m7, m8, 0xe pmaxsw m8, m7 
pshuflw m7, m8, 0x1 pmaxsw m8, m7 - pextrw [eobq], m8, 0 + pextrw [r2], m8, 0 RET ; skip-block, i.e. just write all zeroes @@ -194,5 +210,5 @@ cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \ %endmacro INIT_XMM ssse3 -QUANTIZE_FN b -QUANTIZE_FN b_32x32 +QUANTIZE_FN b, 6 +QUANTIZE_FN b_32x32, 7 |