summaryrefslogtreecommitdiff
path: root/vp8/common
diff options
context:
space:
mode:
authorJohann <johannkoenig@google.com>2011-04-19 06:32:10 -0700
committerCode Review <code-review@webmproject.org>2011-04-19 06:32:10 -0700
commita9b465c5c9a35cb80e38d612bb63ba8d09c6855a (patch)
treed0969f252f0f008384ef5b1530511e8b486c0982 /vp8/common
parent48438d60162cdf0ff16706b243724bc698febf83 (diff)
parentc7cfde42a9ec05b72d15ebaa9a59cefed4cd323a (diff)
downloadlibvpx-a9b465c5c9a35cb80e38d612bb63ba8d09c6855a.tar
libvpx-a9b465c5c9a35cb80e38d612bb63ba8d09c6855a.tar.gz
libvpx-a9b465c5c9a35cb80e38d612bb63ba8d09c6855a.tar.bz2
libvpx-a9b465c5c9a35cb80e38d612bb63ba8d09c6855a.zip
Merge "Add save/restore xmm registers in x86 assembly code"
Diffstat (limited to 'vp8/common')
-rw-r--r--vp8/common/x86/idctllm_sse2.asm50
-rw-r--r--vp8/common/x86/subpixel_ssse3.asm12
2 files changed, 39 insertions, 23 deletions
diff --git a/vp8/common/x86/idctllm_sse2.asm b/vp8/common/x86/idctllm_sse2.asm
index edee1578e..c873869ab 100644
--- a/vp8/common/x86/idctllm_sse2.asm
+++ b/vp8/common/x86/idctllm_sse2.asm
@@ -32,9 +32,6 @@ sym(idct_dequant_0_2x_sse2):
mov rdx, arg(1) ; dequant
mov rax, arg(0) ; qcoeff
- ; Zero out xmm7, for use unpacking
- pxor xmm7, xmm7
-
movd xmm4, [rax]
movd xmm5, [rdx]
@@ -43,9 +40,12 @@ sym(idct_dequant_0_2x_sse2):
pmullw xmm4, xmm5
+ ; Zero out xmm5, for use unpacking
+ pxor xmm5, xmm5
+
; clear coeffs
- movd [rax], xmm7
- movd [rax+32], xmm7
+ movd [rax], xmm5
+ movd [rax+32], xmm5
;pshufb
pshuflw xmm4, xmm4, 00000000b
pshufhw xmm4, xmm4, 00000000b
@@ -62,10 +62,10 @@ sym(idct_dequant_0_2x_sse2):
lea rcx, [3*rcx]
movq xmm3, [rax+rcx]
- punpcklbw xmm0, xmm7
- punpcklbw xmm1, xmm7
- punpcklbw xmm2, xmm7
- punpcklbw xmm3, xmm7
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm3, xmm5
mov rax, arg(3) ; dst
movsxd rdx, dword ptr arg(4) ; dst_stride
@@ -77,10 +77,10 @@ sym(idct_dequant_0_2x_sse2):
paddw xmm3, xmm4
; pack up before storing
- packuswb xmm0, xmm7
- packuswb xmm1, xmm7
- packuswb xmm2, xmm7
- packuswb xmm3, xmm7
+ packuswb xmm0, xmm5
+ packuswb xmm1, xmm5
+ packuswb xmm2, xmm5
+ packuswb xmm3, xmm5
; store blocks back out
movq [rax], xmm0
@@ -102,6 +102,7 @@ sym(idct_dequant_full_2x_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@@ -347,6 +348,7 @@ sym(idct_dequant_full_2x_sse2):
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -377,8 +379,8 @@ sym(idct_dequant_dc_0_2x_sse2):
mov rdi, arg(3) ; dst
mov rdx, arg(5) ; dc
- ; Zero out xmm7, for use unpacking
- pxor xmm7, xmm7
+ ; Zero out xmm5, for use unpacking
+ pxor xmm5, xmm5
; load up 2 dc words here == 2*16 = doubleword
movd xmm4, [rdx]
@@ -398,10 +400,10 @@ sym(idct_dequant_dc_0_2x_sse2):
psraw xmm4, 3
; Predict buffer needs to be expanded from bytes to words
- punpcklbw xmm0, xmm7
- punpcklbw xmm1, xmm7
- punpcklbw xmm2, xmm7
- punpcklbw xmm3, xmm7
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm3, xmm5
; Add to predict buffer
paddw xmm0, xmm4
@@ -410,10 +412,10 @@ sym(idct_dequant_dc_0_2x_sse2):
paddw xmm3, xmm4
; pack up before storing
- packuswb xmm0, xmm7
- packuswb xmm1, xmm7
- packuswb xmm2, xmm7
- packuswb xmm3, xmm7
+ packuswb xmm0, xmm5
+ packuswb xmm1, xmm5
+ packuswb xmm2, xmm5
+ packuswb xmm3, xmm5
; Load destination stride before writing out,
; doesn't need to persist
@@ -441,6 +443,7 @@ sym(idct_dequant_dc_full_2x_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@@ -692,6 +695,7 @@ sym(idct_dequant_dc_full_2x_sse2):
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
diff --git a/vp8/common/x86/subpixel_ssse3.asm b/vp8/common/x86/subpixel_ssse3.asm
index 0ec18de76..1db3d629c 100644
--- a/vp8/common/x86/subpixel_ssse3.asm
+++ b/vp8/common/x86/subpixel_ssse3.asm
@@ -39,6 +39,7 @@ sym(vp8_filter_block1d8_h6_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@@ -107,6 +108,7 @@ filter_block1d8_h6_rowloop_ssse3:
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -162,6 +164,7 @@ filter_block1d8_h4_rowloop_ssse3:
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -286,6 +289,7 @@ sym(vp8_filter_block1d4_h6_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@@ -393,6 +397,7 @@ filter_block1d4_h4_rowloop_ssse3:
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -413,6 +418,7 @@ sym(vp8_filter_block1d16_v6_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@@ -508,6 +514,7 @@ vp8_filter_block1d16_v6_ssse3_loop:
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -580,6 +587,7 @@ vp8_filter_block1d16_v4_ssse3_loop:
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -598,6 +606,7 @@ sym(vp8_filter_block1d8_v6_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@@ -670,6 +679,7 @@ vp8_filter_block1d8_v6_ssse3_loop:
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -718,6 +728,7 @@ vp8_filter_block1d8_v4_ssse3_loop:
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -808,6 +819,7 @@ vp8_filter_block1d4_v6_ssse3_loop:
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret