diff options
author | John Koleszar <jkoleszar@google.com> | 2010-05-18 11:58:33 -0400 |
---|---|---|
committer | John Koleszar <jkoleszar@google.com> | 2010-05-18 11:58:33 -0400 |
commit | 0ea50ce9cb4b65eee6afa1d041fe8beb5abda667 (patch) | |
tree | 1f3b9019f28bc56fd3156f96e5a9653a983ee61b /vp8/encoder/x86/fwalsh_sse2.asm | |
download | libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.tar libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.tar.gz libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.tar.bz2 libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.zip |
Initial WebM release
Diffstat (limited to 'vp8/encoder/x86/fwalsh_sse2.asm')
-rw-r--r-- | vp8/encoder/x86/fwalsh_sse2.asm | 117 |
1 files changed, 117 insertions, 0 deletions
diff --git a/vp8/encoder/x86/fwalsh_sse2.asm b/vp8/encoder/x86/fwalsh_sse2.asm
new file mode 100644
index 000000000..7d8620178
--- /dev/null
+++ b/vp8/encoder/x86/fwalsh_sse2.asm
@@ -0,0 +1,117 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+; Forward 4x4 Walsh transform on 16-bit words: butterfly each row, butterfly each column, then x = (x + (x > 0)) >> 1.
+;void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch)
+global sym(vp8_short_walsh4x4_sse2)
+sym(vp8_short_walsh4x4_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3          ; macro from x86_abi_support.asm; 3 matches the prototype's argument count
+    push        rsi
+    push        rdi
+    ; end prolog
+    ; NOTE: pitch (arg(2)) is never read; input and output are each accessed as 32 contiguous bytes.
+    mov         rsi, arg(0)         ; rsi = input
+    mov         rdi, arg(1)         ; rdi = output
+
+    movdqu      xmm4, [rsi + 0]     ; ip[4..7]   | ip[0..3]  = rows 1|0 (unaligned load)
+    movdqu      xmm0, [rsi + 16]    ; ip[12..15] | ip[8..11] = rows 3|2
+
+    pxor        xmm7, xmm7          ; xmm7 = 0, the comparand for the (x > 0) test in the rounding step
+    ;~~~~ transpose 4x4 words; "rc" = row r, column c; lists run word 7 -> word 0 ~~~~
+    ; xmm4 = 13 12 11 10 03 02 01 00   (rows 1|0, written high qword | low qword)
+    ; xmm0 = 33 32 31 30 23 22 21 20   (rows 3|2)
+    ; after: xmm4 = columns 1|0, xmm1 = columns 3|2, so lane j within a qword is row j
+    ;
+    movdqa      xmm3, xmm4          ; 13 12 11 10 03 02 01 00
+    punpcklwd   xmm4, xmm0          ; 23 03 22 02 21 01 20 00
+    punpckhwd   xmm3, xmm0          ; 33 13 32 12 31 11 30 10
+    movdqa      xmm1, xmm4          ; 23 03 22 02 21 01 20 00
+    punpcklwd   xmm4, xmm3          ; 31 21 11 01 30 20 10 00
+    punpckhwd   xmm1, xmm3          ; 33 23 13 03 32 22 12 02
+    ;~~~~ pass 1: butterfly along each row, four rows at once; paddw/psubw wrap at 16 bits ~~~~
+    pshufd      xmm2, xmm1, 4eh     ; swap qwords: col2 | col3
+    movdqa      xmm3, xmm4          ; col1 | col0
+
+    paddw       xmm4, xmm2          ; col1+col2 | col0+col3  aka b1 a1
+    psubw       xmm3, xmm2          ; col1-col2 | col0-col3  aka c1 d1
+
+    movdqa      xmm5, xmm4          ; b1 a1
+    punpcklqdq  xmm4, xmm3          ; d1 a1
+    punpckhqdq  xmm5, xmm3          ; c1 b1
+
+    movdqa      xmm1, xmm5          ; c1 b1
+    paddw       xmm5, xmm4          ; d1+c1 | a1+b1  = pass-1 output columns 1|0
+    psubw       xmm4, xmm1          ; d1-c1 | a1-b1  = pass-1 output columns 3|2
+    ;~~~~ pass-1 results are held column-major; transpose them back to row-major ~~~~
+    ; xmm5 = 13 12 11 10 03 02 01 00   (labels are positions in the matrix being transposed)
+    ; xmm4 = 33 32 31 30 23 22 21 20
+    ; after: xmm5 = pass-1 rows 1|0, xmm1 = pass-1 rows 3|2
+    ;
+    movdqa      xmm0, xmm5          ; 13 12 11 10 03 02 01 00
+    punpcklwd   xmm5, xmm4          ; 23 03 22 02 21 01 20 00
+    punpckhwd   xmm0, xmm4          ; 33 13 32 12 31 11 30 10
+    movdqa      xmm1, xmm5          ; 23 03 22 02 21 01 20 00
+    punpcklwd   xmm5, xmm0          ; 31 21 11 01 30 20 10 00
+    punpckhwd   xmm1, xmm0          ; 33 23 13 03 32 22 12 02
+    ;~~~~ pass 2: butterfly down each column, four columns at once (ip = pass-1 output) ~~~~
+    pshufd      xmm2, xmm1, 4eh     ; swap qwords: ip[8] ip[12]
+    movdqa      xmm3, xmm5          ; ip[4] ip[0]
+
+    paddw       xmm5, xmm2          ; ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+    psubw       xmm3, xmm2          ; ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+
+    movdqa      xmm6, xmm5          ; b1 a1
+    punpcklqdq  xmm5, xmm3          ; d1 a1
+    punpckhqdq  xmm6, xmm3          ; c1 b1
+
+    movdqa      xmm1, xmm6          ; c1 b1
+    paddw       xmm6, xmm5          ; d1+c1 a1+b1 aka op[4] op[0]  (b2 a2)
+    psubw       xmm5, xmm1          ; d1-c1 a1-b1 aka op[12] op[8] (d2 c2)
+
+    movdqa      xmm0, xmm6          ; aka b2 a2
+    movdqa      xmm1, xmm5          ; aka d2 c2
+
+    pcmpgtw     xmm0, xmm7          ; signed compare with 0: 0FFFFh in lanes where b2/a2 > 0
+    pcmpgtw     xmm1, xmm7          ; same for d2/c2
+
+    psrlw       xmm0, 15            ; logical shift turns the mask into 1 where > 0, else 0
+    psrlw       xmm1, 15
+
+    paddw       xmm6, xmm0          ; b2 a2 += (x > 0)
+    paddw       xmm5, xmm1          ; d2 c2 += (x > 0)
+
+    psraw       xmm6, 1             ; arithmetic >> 1 (sign preserved)
+    psraw       xmm5, 1
+    ; Scalar form of pass 2 and the rounding above, per column:
+    ; a2 = a1 + b1;
+    ; b2 = c1 + d1;
+    ; c2 = a1 - b1;
+    ; d2 = d1 - c1;
+    ; a2 += (a2>0);
+    ; b2 += (b2>0);
+    ; c2 += (c2>0);
+    ; d2 += (d2>0);
+    ; op[0] = (a2)>>1;
+    ; op[4] = (b2)>>1;
+    ; op[8] = (c2)>>1;
+    ; op[12]= (d2)>>1;
+
+    movdqu      [rdi + 0], xmm6     ; op[4..7]   | op[0..3]   (unaligned store)
+    movdqu      [rdi + 16], xmm5    ; op[12..15] | op[8..11]
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret |