summaryrefslogtreecommitdiff
path: root/vp8
diff options
context:
space:
mode:
authorJohn Koleszar <jkoleszar@google.com>2011-04-20 00:05:11 -0400
committerJohn Koleszar <jkoleszar@google.com>2011-04-20 00:05:12 -0400
commit65b44c2911a8625e0152f8d6c893d597780e94fd (patch)
tree0f91b78a77af433c193bbcddabeeb89472157363 /vp8
parenta5d3febc1377275518a3c8cf6595ff4b7ffa7021 (diff)
parenta9b465c5c9a35cb80e38d612bb63ba8d09c6855a (diff)
downloadlibvpx-65b44c2911a8625e0152f8d6c893d597780e94fd.tar
libvpx-65b44c2911a8625e0152f8d6c893d597780e94fd.tar.gz
libvpx-65b44c2911a8625e0152f8d6c893d597780e94fd.tar.bz2
libvpx-65b44c2911a8625e0152f8d6c893d597780e94fd.zip
Merge remote branch 'origin/master' into experimental
Change-Id: I9e9ece0424b2f4b6861e9c7c0986f6eccc9159d6
Diffstat (limited to 'vp8')
-rw-r--r--vp8/common/x86/idctllm_sse2.asm50
-rw-r--r--vp8/common/x86/subpixel_ssse3.asm12
-rw-r--r--vp8/encoder/x86/dct_sse2.asm2
-rw-r--r--vp8/encoder/x86/encodeopt.asm82
-rw-r--r--vp8/encoder/x86/sad_sse2.asm18
-rw-r--r--vp8/encoder/x86/sad_sse3.asm10
-rw-r--r--vp8/encoder/x86/sad_ssse3.asm4
-rw-r--r--vp8/encoder/x86/variance_impl_sse2.asm12
8 files changed, 116 insertions, 74 deletions
diff --git a/vp8/common/x86/idctllm_sse2.asm b/vp8/common/x86/idctllm_sse2.asm
index edee1578e..c873869ab 100644
--- a/vp8/common/x86/idctllm_sse2.asm
+++ b/vp8/common/x86/idctllm_sse2.asm
@@ -32,9 +32,6 @@ sym(idct_dequant_0_2x_sse2):
mov rdx, arg(1) ; dequant
mov rax, arg(0) ; qcoeff
- ; Zero out xmm7, for use unpacking
- pxor xmm7, xmm7
-
movd xmm4, [rax]
movd xmm5, [rdx]
@@ -43,9 +40,12 @@ sym(idct_dequant_0_2x_sse2):
pmullw xmm4, xmm5
+ ; Zero out xmm5, for use unpacking
+ pxor xmm5, xmm5
+
; clear coeffs
- movd [rax], xmm7
- movd [rax+32], xmm7
+ movd [rax], xmm5
+ movd [rax+32], xmm5
;pshufb
pshuflw xmm4, xmm4, 00000000b
pshufhw xmm4, xmm4, 00000000b
@@ -62,10 +62,10 @@ sym(idct_dequant_0_2x_sse2):
lea rcx, [3*rcx]
movq xmm3, [rax+rcx]
- punpcklbw xmm0, xmm7
- punpcklbw xmm1, xmm7
- punpcklbw xmm2, xmm7
- punpcklbw xmm3, xmm7
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm3, xmm5
mov rax, arg(3) ; dst
movsxd rdx, dword ptr arg(4) ; dst_stride
@@ -77,10 +77,10 @@ sym(idct_dequant_0_2x_sse2):
paddw xmm3, xmm4
; pack up before storing
- packuswb xmm0, xmm7
- packuswb xmm1, xmm7
- packuswb xmm2, xmm7
- packuswb xmm3, xmm7
+ packuswb xmm0, xmm5
+ packuswb xmm1, xmm5
+ packuswb xmm2, xmm5
+ packuswb xmm3, xmm5
; store blocks back out
movq [rax], xmm0
@@ -102,6 +102,7 @@ sym(idct_dequant_full_2x_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@@ -347,6 +348,7 @@ sym(idct_dequant_full_2x_sse2):
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -377,8 +379,8 @@ sym(idct_dequant_dc_0_2x_sse2):
mov rdi, arg(3) ; dst
mov rdx, arg(5) ; dc
- ; Zero out xmm7, for use unpacking
- pxor xmm7, xmm7
+ ; Zero out xmm5, for use unpacking
+ pxor xmm5, xmm5
; load up 2 dc words here == 2*16 = doubleword
movd xmm4, [rdx]
@@ -398,10 +400,10 @@ sym(idct_dequant_dc_0_2x_sse2):
psraw xmm4, 3
; Predict buffer needs to be expanded from bytes to words
- punpcklbw xmm0, xmm7
- punpcklbw xmm1, xmm7
- punpcklbw xmm2, xmm7
- punpcklbw xmm3, xmm7
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm3, xmm5
; Add to predict buffer
paddw xmm0, xmm4
@@ -410,10 +412,10 @@ sym(idct_dequant_dc_0_2x_sse2):
paddw xmm3, xmm4
; pack up before storing
- packuswb xmm0, xmm7
- packuswb xmm1, xmm7
- packuswb xmm2, xmm7
- packuswb xmm3, xmm7
+ packuswb xmm0, xmm5
+ packuswb xmm1, xmm5
+ packuswb xmm2, xmm5
+ packuswb xmm3, xmm5
; Load destination stride before writing out,
; doesn't need to persist
@@ -441,6 +443,7 @@ sym(idct_dequant_dc_full_2x_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@@ -692,6 +695,7 @@ sym(idct_dequant_dc_full_2x_sse2):
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
diff --git a/vp8/common/x86/subpixel_ssse3.asm b/vp8/common/x86/subpixel_ssse3.asm
index 0ec18de76..1db3d629c 100644
--- a/vp8/common/x86/subpixel_ssse3.asm
+++ b/vp8/common/x86/subpixel_ssse3.asm
@@ -39,6 +39,7 @@ sym(vp8_filter_block1d8_h6_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@@ -107,6 +108,7 @@ filter_block1d8_h6_rowloop_ssse3:
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -162,6 +164,7 @@ filter_block1d8_h4_rowloop_ssse3:
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -286,6 +289,7 @@ sym(vp8_filter_block1d4_h6_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@@ -393,6 +397,7 @@ filter_block1d4_h4_rowloop_ssse3:
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -413,6 +418,7 @@ sym(vp8_filter_block1d16_v6_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@@ -508,6 +514,7 @@ vp8_filter_block1d16_v6_ssse3_loop:
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -580,6 +587,7 @@ vp8_filter_block1d16_v4_ssse3_loop:
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -598,6 +606,7 @@ sym(vp8_filter_block1d8_v6_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@@ -670,6 +679,7 @@ vp8_filter_block1d8_v6_ssse3_loop:
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -718,6 +728,7 @@ vp8_filter_block1d8_v4_ssse3_loop:
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -808,6 +819,7 @@ vp8_filter_block1d4_v6_ssse3_loop:
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
diff --git a/vp8/encoder/x86/dct_sse2.asm b/vp8/encoder/x86/dct_sse2.asm
index 652dd9804..287ad482f 100644
--- a/vp8/encoder/x86/dct_sse2.asm
+++ b/vp8/encoder/x86/dct_sse2.asm
@@ -33,6 +33,7 @@
%define input rcx
%define output rdx
%define pitch r8
+ SAVE_XMM
%else
%define input rdi
%define output rsi
@@ -53,6 +54,7 @@
pop rbp
%else
%ifidn __OUTPUT_FORMAT__,x64
+ RESTORE_XMM
%endif
%endif
ret
diff --git a/vp8/encoder/x86/encodeopt.asm b/vp8/encoder/x86/encodeopt.asm
index c0f06bbbb..e142a7573 100644
--- a/vp8/encoder/x86/encodeopt.asm
+++ b/vp8/encoder/x86/encodeopt.asm
@@ -22,33 +22,33 @@ sym(vp8_block_error_xmm):
; end prologue
mov rsi, arg(0) ;coeff_ptr
-
mov rdi, arg(1) ;dcoef_ptr
- movdqa xmm3, [rsi]
- movdqa xmm4, [rdi]
- movdqa xmm5, [rsi+16]
+ movdqa xmm0, [rsi]
+ movdqa xmm1, [rdi]
+
+ movdqa xmm2, [rsi+16]
+ movdqa xmm3, [rdi+16]
- movdqa xmm6, [rdi+16]
- psubw xmm3, xmm4
+ psubw xmm0, xmm1
+ psubw xmm2, xmm3
- psubw xmm5, xmm6
- pmaddwd xmm3, xmm3
- pmaddwd xmm5, xmm5
+ pmaddwd xmm0, xmm0
+ pmaddwd xmm2, xmm2
- paddd xmm3, xmm5
+ paddd xmm0, xmm2
- pxor xmm7, xmm7
- movdqa xmm0, xmm3
+ pxor xmm5, xmm5
+ movdqa xmm1, xmm0
- punpckldq xmm0, xmm7
- punpckhdq xmm3, xmm7
+ punpckldq xmm0, xmm5
+ punpckhdq xmm1, xmm5
- paddd xmm0, xmm3
- movdqa xmm3, xmm0
+ paddd xmm0, xmm1
+ movdqa xmm1, xmm0
psrldq xmm0, 8
- paddd xmm0, xmm3
+ paddd xmm0, xmm1
movq rax, xmm0
@@ -208,53 +208,54 @@ sym(vp8_mbblock_error_xmm_impl):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 3
+ SAVE_XMM ; 6
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;coeff_ptr
- pxor xmm7, xmm7
+ pxor xmm6, xmm6
mov rdi, arg(1) ;dcoef_ptr
- pxor xmm2, xmm2
+ pxor xmm4, xmm4
- movd xmm1, dword ptr arg(2) ;dc
- por xmm1, xmm2
+ movd xmm5, dword ptr arg(2) ;dc
+ por xmm5, xmm4
- pcmpeqw xmm1, xmm7
+ pcmpeqw xmm5, xmm6
mov rcx, 16
mberror_loop:
- movdqa xmm3, [rsi]
- movdqa xmm4, [rdi]
+ movdqa xmm0, [rsi]
+ movdqa xmm1, [rdi]
- movdqa xmm5, [rsi+16]
- movdqa xmm6, [rdi+16]
+ movdqa xmm2, [rsi+16]
+ movdqa xmm3, [rdi+16]
- psubw xmm5, xmm6
- pmaddwd xmm5, xmm5
+ psubw xmm2, xmm3
+ pmaddwd xmm2, xmm2
- psubw xmm3, xmm4
- pand xmm3, xmm1
+ psubw xmm0, xmm1
+ pand xmm0, xmm5
- pmaddwd xmm3, xmm3
+ pmaddwd xmm0, xmm0
add rsi, 32
add rdi, 32
sub rcx, 1
- paddd xmm2, xmm5
+ paddd xmm4, xmm2
- paddd xmm2, xmm3
+ paddd xmm4, xmm0
jnz mberror_loop
- movdqa xmm0, xmm2
- punpckldq xmm0, xmm7
+ movdqa xmm0, xmm4
+ punpckldq xmm0, xmm6
- punpckhdq xmm2, xmm7
- paddd xmm0, xmm2
+ punpckhdq xmm4, xmm6
+ paddd xmm0, xmm4
movdqa xmm1, xmm0
psrldq xmm0, 8
@@ -265,6 +266,7 @@ mberror_loop:
pop rdi
pop rsi
; begin epilog
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -342,7 +344,7 @@ sym(vp8_mbuverror_xmm_impl):
mov rdi, arg(1) ;d_ptr
mov rcx, 16
- pxor xmm7, xmm7
+ pxor xmm3, xmm3
mbuverror_loop:
@@ -352,7 +354,7 @@ mbuverror_loop:
psubw xmm1, xmm2
pmaddwd xmm1, xmm1
- paddd xmm7, xmm1
+ paddd xmm3, xmm1
add rsi, 16
add rdi, 16
@@ -361,7 +363,7 @@ mbuverror_loop:
jnz mbuverror_loop
pxor xmm0, xmm0
- movdqa xmm1, xmm7
+ movdqa xmm1, xmm3
movdqa xmm2, xmm1
punpckldq xmm1, xmm0
diff --git a/vp8/encoder/x86/sad_sse2.asm b/vp8/encoder/x86/sad_sse2.asm
index cc6bc3cd9..d9ac3ff4f 100644
--- a/vp8/encoder/x86/sad_sse2.asm
+++ b/vp8/encoder/x86/sad_sse2.asm
@@ -21,6 +21,7 @@ sym(vp8_sad16x16_wmt):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
+ SAVE_XMM ; 6
push rsi
push rdi
; end prolog
@@ -34,7 +35,7 @@ sym(vp8_sad16x16_wmt):
lea rcx, [rsi+rax*8]
lea rcx, [rcx+rax*8]
- pxor xmm7, xmm7
+ pxor xmm6, xmm6
x16x16sad_wmt_loop:
@@ -52,32 +53,33 @@ x16x16sad_wmt_loop:
punpcklbw xmm1, xmm3
psadbw xmm0, xmm1
- movq xmm6, QWORD PTR [rsi+rax+8]
+ movq xmm2, QWORD PTR [rsi+rax+8]
movq xmm3, QWORD PTR [rdi+rdx+8]
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rdx*2]
- punpcklbw xmm4, xmm6
+ punpcklbw xmm4, xmm2
punpcklbw xmm5, xmm3
psadbw xmm4, xmm5
- paddw xmm7, xmm0
- paddw xmm7, xmm4
+ paddw xmm6, xmm0
+ paddw xmm6, xmm4
cmp rsi, rcx
jne x16x16sad_wmt_loop
- movq xmm0, xmm7
- psrldq xmm7, 8
+ movq xmm0, xmm6
+ psrldq xmm6, 8
- paddw xmm0, xmm7
+ paddw xmm0, xmm6
movq rax, xmm0
; begin epilog
pop rdi
pop rsi
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
diff --git a/vp8/encoder/x86/sad_sse3.asm b/vp8/encoder/x86/sad_sse3.asm
index f0336ab17..666879267 100644
--- a/vp8/encoder/x86/sad_sse3.asm
+++ b/vp8/encoder/x86/sad_sse3.asm
@@ -39,8 +39,9 @@
%define ref_stride r9
%define end_ptr r10
%define ret_var r11
- %define result_ptr [rsp+8+4*8]
- %define max_err [rsp+8+4*8]
+ %define result_ptr [rsp+40+4*8]
+ %define max_err [rsp+40+4*8]
+ SAVE_XMM
%else
%define src_ptr rdi
%define src_stride rsi
@@ -72,6 +73,7 @@
pop rbp
%else
%ifidn __OUTPUT_FORMAT__,x64
+ RESTORE_XMM
%endif
%endif
ret
@@ -113,7 +115,8 @@
%define r2_ptr r11
%define r3_ptr r8
%define ref_stride r9
- %define result_ptr [rsp+16+4*8]
+ %define result_ptr [rsp+48+4*8]
+ SAVE_XMM
push rsi
LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
@@ -151,6 +154,7 @@
%else
%ifidn __OUTPUT_FORMAT__,x64
pop rsi
+ RESTORE_XMM
%endif
%endif
ret
diff --git a/vp8/encoder/x86/sad_ssse3.asm b/vp8/encoder/x86/sad_ssse3.asm
index 69c5eaedc..7c7cd0ade 100644
--- a/vp8/encoder/x86/sad_ssse3.asm
+++ b/vp8/encoder/x86/sad_ssse3.asm
@@ -157,6 +157,7 @@ sym(vp8_sad16x16x3_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM
push rsi
push rdi
push rcx
@@ -253,6 +254,7 @@ vp8_sad16x16x3_ssse3_store_off:
pop rcx
pop rdi
pop rsi
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -268,6 +270,7 @@ sym(vp8_sad16x8x3_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM
push rsi
push rdi
push rcx
@@ -361,6 +364,7 @@ vp8_sad16x8x3_ssse3_store_off:
pop rcx
pop rdi
pop rsi
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm
index c2c30deb2..2c0e170d8 100644
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -85,6 +85,7 @@ sym(vp8_get16x16var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM
push rbx
push rsi
push rdi
@@ -206,6 +207,7 @@ var16loop:
pop rdi
pop rsi
pop rbx
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -223,6 +225,7 @@ sym(vp8_get16x16pred_error_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
+ SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@@ -321,6 +324,7 @@ var16peloop:
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -341,6 +345,7 @@ sym(vp8_get8x8var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@@ -506,6 +511,7 @@ sym(vp8_get8x8var_sse2):
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -805,6 +811,7 @@ sym(vp8_half_horiz_vert_variance8x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@@ -906,6 +913,7 @@ vp8_half_horiz_vert_variance8x_h_1:
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -1041,6 +1049,7 @@ sym(vp8_half_vert_variance8x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@@ -1127,6 +1136,7 @@ vp8_half_vert_variance8x_h_1:
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@@ -1254,6 +1264,7 @@ sym(vp8_half_horiz_variance8x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@@ -1338,6 +1349,7 @@ vp8_half_horiz_variance8x_h_1:
pop rdi
pop rsi
RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret