summaryrefslogtreecommitdiff
path: root/vp9/encoder/x86
diff options
context:
space:
mode:
author: Kyle Siefring <kylesiefring@gmail.com> 2017-05-01 09:15:29 -0700
committer: Johann <johannkoenig@google.com> 2017-05-01 09:59:18 -0700
commit: 8394990b2749608ea710a9fbfe82bb4bba1529c9 (patch)
tree: 14fc96045dbc15df111e1b1c865a5f5f93e6e867 /vp9/encoder/x86
parent: ef5918098d5c7f8ffda960274e3f8e38f02cb487 (diff)
downloadlibvpx-8394990b2749608ea710a9fbfe82bb4bba1529c9.tar
libvpx-8394990b2749608ea710a9fbfe82bb4bba1529c9.tar.gz
libvpx-8394990b2749608ea710a9fbfe82bb4bba1529c9.tar.bz2
libvpx-8394990b2749608ea710a9fbfe82bb4bba1529c9.zip
block error sse2: sum in 32 bits when possible
Add 31-bit pairs before unpacking in x86 block error code.

BUG=webm:1210
Change-Id: I5ca8c7f7775585a17fe09d6bbfc25e1f2955eb0a
Diffstat (limited to 'vp9/encoder/x86')
-rw-r--r-- vp9/encoder/x86/vp9_error_sse2.asm 19
1 files changed, 6 insertions, 13 deletions
diff --git a/vp9/encoder/x86/vp9_error_sse2.asm b/vp9/encoder/x86/vp9_error_sse2.asm
index 0a472ec74..11d473b2d 100644
--- a/vp9/encoder/x86/vp9_error_sse2.asm
+++ b/vp9/encoder/x86/vp9_error_sse2.asm
@@ -39,23 +39,18 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
pmaddwd m1, m1
pmaddwd m2, m2
pmaddwd m3, m3
+ ; the sum of 2 31bit integers will fit in a 32bit unsigned integer
+ paddd m0, m1
+ paddd m2, m3
; accumulate in 64bit
punpckldq m7, m0, m5
punpckhdq m0, m5
paddq m4, m7
- punpckldq m7, m1, m5
- paddq m4, m0
- punpckhdq m1, m5
- paddq m4, m7
punpckldq m7, m2, m5
- paddq m4, m1
+ paddq m4, m0
punpckhdq m2, m5
paddq m6, m7
- punpckldq m7, m3, m5
paddq m6, m2
- punpckhdq m3, m5
- paddq m6, m7
- paddq m6, m3
jg .loop
; accumulate horizontally and store in return value
@@ -98,15 +93,13 @@ cglobal block_error_fp, 3, 3, 6, uqc, dqc, size
; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
pmaddwd m0, m0
pmaddwd m1, m1
+ ; the sum of 2 31bit integers will fit in a 32bit unsigned integer
+ paddd m0, m1
; accumulate in 64bit
punpckldq m3, m0, m5
punpckhdq m0, m5
paddq m4, m3
- punpckldq m3, m1, m5
paddq m4, m0
- punpckhdq m1, m5
- paddq m4, m3
- paddq m4, m1
jnz .loop
; accumulate horizontally and store in return value