diff options
author | Jim Bankoski <jimbankoski@google.com> | 2013-02-28 08:32:14 -0800 |
---|---|---|
committer | Jim Bankoski <jimbankoski@google.com> | 2013-02-28 08:46:35 -0800 |
commit | 714aa9f3c072624186df161589bacbb778369312 (patch) | |
tree | 22563b7ebd98666e4c29430196f2fdcde37e6b0d /vp9/encoder/x86 | |
parent | b715e371c05324c84b3a58ca19f5348caa2ff695 (diff) | |
download | libvpx-714aa9f3c072624186df161589bacbb778369312.tar libvpx-714aa9f3c072624186df161589bacbb778369312.tar.gz libvpx-714aa9f3c072624186df161589bacbb778369312.tar.bz2 libvpx-714aa9f3c072624186df161589bacbb778369312.zip |
This commit converts all SAD result pointers to uint32.
The sse4_1 code used uint16_t for returning SAD values, but that
won't work for 32x32 or 64x64 blocks. This commit fixes the
assembly so that it writes 32-bit results, and also re-enables sse4_1 on Linux.
Change-Id: I5ce7288d581db870a148e5f7c5092826f59edd81
Diffstat (limited to 'vp9/encoder/x86')
-rw-r--r-- | vp9/encoder/x86/vp9_sad_sse4.asm | 118 |
1 files changed, 62 insertions, 56 deletions
```diff
diff --git a/vp9/encoder/x86/vp9_sad_sse4.asm b/vp9/encoder/x86/vp9_sad_sse4.asm
index b42982a1f..faf1768a9 100644
--- a/vp9/encoder/x86/vp9_sad_sse4.asm
+++ b/vp9/encoder/x86/vp9_sad_sse4.asm
@@ -154,6 +154,16 @@
 paddw xmm1, xmm5
 %endmacro
 
+%macro WRITE_AS_INTS 0
+ mov rdi, arg(4) ;Results
+ pxor xmm0, xmm0
+ movdqa xmm2, xmm1
+ punpcklwd xmm1, xmm0
+ punpckhwd xmm2, xmm0
+
+ movdqa [rdi], xmm1
+ movdqa [rdi + 16], xmm2
+%endmacro
 
 ;void vp9_sad16x16x8_sse4(
 ; const unsigned char *src_ptr,
@@ -170,23 +180,22 @@ sym(vp9_sad16x16x8_sse4):
 push rdi
 ; end prolog
 
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
 
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
 
- PROCESS_16X2X8 1
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
+ PROCESS_16X2X8 1
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
 
- mov rdi, arg(4) ;Results
- movdqa XMMWORD PTR [rdi], xmm1
+ WRITE_AS_INTS
 
 ; begin epilog
 pop rdi
@@ -212,19 +221,18 @@ sym(vp9_sad16x8x8_sse4):
 push rdi
 ; end prolog
 
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
 
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
 
- PROCESS_16X2X8 1
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
+ PROCESS_16X2X8 1
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
 
- mov rdi, arg(4) ;Results
- movdqa XMMWORD PTR [rdi], xmm1
+ WRITE_AS_INTS
 
 ; begin epilog
 pop rdi
@@ -250,19 +258,18 @@ sym(vp9_sad8x8x8_sse4):
 push rdi
 ; end prolog
 
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
 
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
 
- PROCESS_8X2X8 1
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
+ PROCESS_8X2X8 1
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
 
- mov rdi, arg(4) ;Results
- movdqa XMMWORD PTR [rdi], xmm1
+ WRITE_AS_INTS
 
 ; begin epilog
 pop rdi
@@ -288,22 +295,22 @@ sym(vp9_sad8x16x8_sse4):
 push rdi
 ; end prolog
 
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
 
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
+ PROCESS_8X2X8 1
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
 
- PROCESS_8X2X8 1
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- mov rdi, arg(4) ;Results
- movdqa XMMWORD PTR [rdi], xmm1
+ WRITE_AS_INTS
 
 ; begin epilog
 pop rdi
@@ -329,17 +336,16 @@ sym(vp9_sad4x4x8_sse4):
 push rdi
 ; end prolog
 
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
 
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
 
- PROCESS_4X2X8 1
- PROCESS_4X2X8 0
+ PROCESS_4X2X8 1
+ PROCESS_4X2X8 0
 
- mov rdi, arg(4) ;Results
- movdqa XMMWORD PTR [rdi], xmm1
+ WRITE_AS_INTS
 
 ; begin epilog
 pop rdi
```