summaryrefslogtreecommitdiff
path: root/vp8/encoder/x86/subtract_sse2.asm
diff options
context:
space:
mode:
Diffstat (limited to 'vp8/encoder/x86/subtract_sse2.asm')
-rw-r--r--vp8/encoder/x86/subtract_sse2.asm348
1 files changed, 348 insertions, 0 deletions
diff --git a/vp8/encoder/x86/subtract_sse2.asm b/vp8/encoder/x86/subtract_sse2.asm
new file mode 100644
index 000000000..ef329de71
--- /dev/null
+++ b/vp8/encoder/x86/subtract_sse2.asm
@@ -0,0 +1,348 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride,
+; short *diff, unsigned char *Predictor,
+; int pitch);
+global sym(vp8_subtract_b_sse2_impl)
+sym(vp8_subtract_b_sse2_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdi, arg(2) ;diff
+ mov rax, arg(3) ;Predictor
+ mov rsi, arg(0) ;z
+ movsxd rdx, dword ptr arg(1);src_stride;
+ movsxd rcx, dword ptr arg(4);pitch
+ pxor mm7, mm7
+
+ movd mm0, [rsi]
+ movd mm1, [rax]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq [rdi], mm0
+
+ movd mm0, [rsi+rdx]
+ movd mm1, [rax+rcx]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq [rdi+rcx*2],mm0
+
+ movd mm0, [rsi+rdx*2]
+ movd mm1, [rax+rcx*2]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq [rdi+rcx*4], mm0
+
+ lea rsi, [rsi+rdx*2]
+ lea rcx, [rcx+rcx*2]
+
+ movd mm0, [rsi+rdx]
+ movd mm1, [rax+rcx]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq [rdi+rcx*2], mm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, unsigned char *pred, int stride)
+global sym(vp8_subtract_mby_sse2)
+sym(vp8_subtract_mby_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(1) ;src
+ mov rdi, arg(0) ;diff
+
+ mov rax, arg(2) ;pred
+ movsxd rdx, dword ptr arg(3) ;stride
+
+ mov rcx, 8 ; do two lines at one time
+
+submby_loop:
+ movdqa xmm0, [rsi] ; src
+ movdqa xmm1, [rax] ; pred
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa [rdi], xmm0
+ movdqa [rdi +16], xmm2
+
+ movdqa xmm4, [rsi + rdx]
+ movdqa xmm5, [rax + 16]
+
+ movdqa xmm6, xmm4
+ psubb xmm4, xmm5
+
+ pxor xmm5, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm6, [GLOBAL(t80)]
+ pcmpgtb xmm5, xmm6 ; obtain sign information
+
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm5
+ punpcklbw xmm4, xmm5 ; put sign back to subtraction
+ punpckhbw xmm6, xmm7 ; put sign back to subtraction
+
+ movdqa [rdi +32], xmm4
+ movdqa [rdi +48], xmm6
+
+ add rdi, 64
+ add rax, 32
+ lea rsi, [rsi+rdx*2]
+
+ sub rcx, 1
+ jnz submby_loop
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
+global sym(vp8_subtract_mbuv_sse2)
+sym(vp8_subtract_mbuv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdi, arg(0) ;diff
+ mov rax, arg(3) ;pred
+ mov rsi, arg(1) ;z = usrc
+ add rdi, 256*2 ;diff = diff + 256 (shorts)
+ add rax, 256 ;Predictor = pred + 256
+ movsxd rdx, dword ptr arg(4) ;stride;
+ lea rcx, [rdx + rdx*2]
+
+ ;u
+ ;line 0 1
+ movq xmm0, [rsi] ; src
+ movq xmm2, [rsi+rdx]
+ movdqa xmm1, [rax] ; pred
+ punpcklqdq xmm0, xmm2
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa [rdi], xmm0
+ movdqa [rdi +16], xmm2
+
+ ;line 2 3
+ movq xmm0, [rsi+rdx*2] ; src
+ movq xmm2, [rsi+rcx]
+ movdqa xmm1, [rax+16] ; pred
+ punpcklqdq xmm0, xmm2
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa [rdi + 32], xmm0
+ movdqa [rdi + 48], xmm2
+
+ ;line 4 5
+ lea rsi, [rsi + rdx*4]
+
+ movq xmm0, [rsi] ; src
+ movq xmm2, [rsi+rdx]
+ movdqa xmm1, [rax + 32] ; pred
+ punpcklqdq xmm0, xmm2
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa [rdi + 64], xmm0
+ movdqa [rdi + 80], xmm2
+
+ ;line 6 7
+ movq xmm0, [rsi+rdx*2] ; src
+ movq xmm2, [rsi+rcx]
+ movdqa xmm1, [rax+ 48] ; pred
+ punpcklqdq xmm0, xmm2
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa [rdi + 96], xmm0
+ movdqa [rdi + 112], xmm2
+
+ ;v
+ mov rsi, arg(2) ;z = vsrc
+ add rdi, 64*2 ;diff = diff + 320 (shorts)
+ add rax, 64 ;Predictor = pred + 320
+
+ ;line 0 1
+ movq xmm0, [rsi] ; src
+ movq xmm2, [rsi+rdx]
+ movdqa xmm1, [rax] ; pred
+ punpcklqdq xmm0, xmm2
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa [rdi], xmm0
+ movdqa [rdi +16], xmm2
+
+ ;line 2 3
+ movq xmm0, [rsi+rdx*2] ; src
+ movq xmm2, [rsi+rcx]
+ movdqa xmm1, [rax+16] ; pred
+ punpcklqdq xmm0, xmm2
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa [rdi + 32], xmm0
+ movdqa [rdi + 48], xmm2
+
+ ;line 4 5
+ lea rsi, [rsi + rdx*4]
+
+ movq xmm0, [rsi] ; src
+ movq xmm2, [rsi+rdx]
+ movdqa xmm1, [rax + 32] ; pred
+ punpcklqdq xmm0, xmm2
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa [rdi + 64], xmm0
+ movdqa [rdi + 80], xmm2
+
+ ;line 6 7
+ movq xmm0, [rsi+rdx*2] ; src
+ movq xmm2, [rsi+rcx]
+ movdqa xmm1, [rax+ 48] ; pred
+ punpcklqdq xmm0, xmm2
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa [rdi + 96], xmm0
+ movdqa [rdi + 112], xmm2
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+t80:
+ times 16 db 0x80