diff options
author | Kyle Siefring <kylesiefring@gmail.com> | 2017-05-01 09:15:29 -0700 |
---|---|---|
committer | Johann <johannkoenig@google.com> | 2017-05-01 09:59:18 -0700 |
commit | 8394990b2749608ea710a9fbfe82bb4bba1529c9 (patch) | |
tree | 14fc96045dbc15df111e1b1c865a5f5f93e6e867 /vp9/encoder/x86 | |
parent | ef5918098d5c7f8ffda960274e3f8e38f02cb487 (diff) | |
download | libvpx-8394990b2749608ea710a9fbfe82bb4bba1529c9.tar libvpx-8394990b2749608ea710a9fbfe82bb4bba1529c9.tar.gz libvpx-8394990b2749608ea710a9fbfe82bb4bba1529c9.tar.bz2 libvpx-8394990b2749608ea710a9fbfe82bb4bba1529c9.zip |
block error sse2: sum in 32 bits when possible
Add pairs of 31-bit sums as 32-bit values before unpacking them to 64 bits in the x86 block error code.
BUG=webm:1210
Change-Id: I5ca8c7f7775585a17fe09d6bbfc25e1f2955eb0a
Diffstat (limited to 'vp9/encoder/x86')
-rw-r--r-- | vp9/encoder/x86/vp9_error_sse2.asm | 19 |
1 file changed, 6 insertions, 13 deletions
diff --git a/vp9/encoder/x86/vp9_error_sse2.asm b/vp9/encoder/x86/vp9_error_sse2.asm index 0a472ec74..11d473b2d 100644 --- a/vp9/encoder/x86/vp9_error_sse2.asm +++ b/vp9/encoder/x86/vp9_error_sse2.asm @@ -39,23 +39,18 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz pmaddwd m1, m1 pmaddwd m2, m2 pmaddwd m3, m3 + ; the sum of 2 31bit integers will fit in a 32bit unsigned integer + paddd m0, m1 + paddd m2, m3 ; accumulate in 64bit punpckldq m7, m0, m5 punpckhdq m0, m5 paddq m4, m7 - punpckldq m7, m1, m5 - paddq m4, m0 - punpckhdq m1, m5 - paddq m4, m7 punpckldq m7, m2, m5 - paddq m4, m1 + paddq m4, m0 punpckhdq m2, m5 paddq m6, m7 - punpckldq m7, m3, m5 paddq m6, m2 - punpckhdq m3, m5 - paddq m6, m7 - paddq m6, m3 jg .loop ; accumulate horizontally and store in return value @@ -98,15 +93,13 @@ cglobal block_error_fp, 3, 3, 6, uqc, dqc, size ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit) pmaddwd m0, m0 pmaddwd m1, m1 + ; the sum of 2 31bit integers will fit in a 32bit unsigned integer + paddd m0, m1 ; accumulate in 64bit punpckldq m3, m0, m5 punpckhdq m0, m5 paddq m4, m3 - punpckldq m3, m1, m5 paddq m4, m0 - punpckhdq m1, m5 - paddq m4, m3 - paddq m4, m1 jnz .loop ; accumulate horizontally and store in return value |