summary | refs | log | tree | commit | diff
path: root/vp8/common/x86/recon_sse2.asm
diff options
context:
space:
mode:
author: Scott LaVarnway <slavarnway@google.com> 2012-03-29 14:24:53 -0400
committer: Scott LaVarnway <slavarnway@google.com> 2012-03-29 14:24:53 -0400
commit: a337725625207758e56c1820c3a06bc00ca22b1a (patch)
tree: 61447d4962f83059fc0577f1531c567d86b461ea /vp8/common/x86/recon_sse2.asm
parent: b3151c80fc33419694e19c8ca028c4d06c69353a (diff)
download: libvpx-a337725625207758e56c1820c3a06bc00ca22b1a.tar
libvpx-a337725625207758e56c1820c3a06bc00ca22b1a.tar.gz
libvpx-a337725625207758e56c1820c3a06bc00ca22b1a.tar.bz2
libvpx-a337725625207758e56c1820c3a06bc00ca22b1a.zip
Updated vp8_build_intra_predictors_mby_s(sse2/ssse3) to work with the latest code.
Patch Set 2: aligned the above_row buffers to fix a crash.
Change-Id: I7a6992a20ed079ccd302f8c26215cf3057f8b70c
Diffstat (limited to 'vp8/common/x86/recon_sse2.asm')
-rw-r--r-- vp8/common/x86/recon_sse2.asm 138
1 files changed, 76 insertions, 62 deletions
diff --git a/vp8/common/x86/recon_sse2.asm b/vp8/common/x86/recon_sse2.asm
index d371ebd74..7b6e3cffe 100644
--- a/vp8/common/x86/recon_sse2.asm
+++ b/vp8/common/x86/recon_sse2.asm
@@ -133,22 +133,20 @@ sym(vp8_intra_pred_uv_dc_mmx2):
; end prolog
; from top
- mov rsi, arg(2) ;above;
- pxor mm0, mm0
- movq mm1, [rsi]
- psadbw mm1, mm0
-
- ; from left
+ mov rdi, arg(2) ;above;
mov rsi, arg(3) ;left;
movsxd rax, dword ptr arg(4) ;left_stride;
+ pxor mm0, mm0
+ movq mm1, [rdi]
lea rdi, [rax*3]
+ psadbw mm1, mm0
+ ; from left
movzx ecx, byte [rsi]
movzx edx, byte [rsi+rax*1]
add ecx, edx
movzx edx, byte [rsi+rax*2]
add ecx, edx
-
movzx edx, byte [rsi+rdi]
lea rsi, [rsi+rax*4]
add ecx, edx
@@ -166,23 +164,23 @@ sym(vp8_intra_pred_uv_dc_mmx2):
lea edx, [edx+ecx+8]
sar edx, 4
movd mm1, edx
+ movsxd rcx, dword ptr arg(1) ;dst_stride
pshufw mm1, mm1, 0x0
+ mov rdi, arg(0) ;dst;
packuswb mm1, mm1
; write out
- mov rdi, arg(0) ;dst;
- movsxd rcx, dword ptr arg(1) ;dst_stride
lea rax, [rcx*3]
+ lea rdx, [rdi+rcx*4]
movq [rdi ], mm1
movq [rdi+rcx ], mm1
movq [rdi+rcx*2], mm1
movq [rdi+rax ], mm1
- lea rdi, [rdi+rcx*4]
- movq [rdi ], mm1
- movq [rdi+rcx ], mm1
- movq [rdi+rcx*2], mm1
- movq [rdi+rax ], mm1
+ movq [rdx ], mm1
+ movq [rdx+rcx ], mm1
+ movq [rdx+rcx*2], mm1
+ movq [rdx+rax ], mm1
; begin epilog
pop rdi
@@ -478,7 +476,7 @@ sym(vp8_intra_pred_uv_ve_mmx):
; int dst_stride
; unsigned char *above,
; unsigned char *left,
-; int left_stride,
+; int left_stride
; )
%macro vp8_intra_pred_uv_ho 1
global sym(vp8_intra_pred_uv_ho_%1)
@@ -575,38 +573,43 @@ vp8_intra_pred_uv_ho ssse3
;void vp8_intra_pred_y_dc_sse2(
; unsigned char *dst,
; int dst_stride
-; unsigned char *src,
-; int src_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
; )
global sym(vp8_intra_pred_y_dc_sse2)
sym(vp8_intra_pred_y_dc_sse2):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
+ SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
; end prolog
; from top
- mov rsi, arg(2) ;src;
- movsxd rax, dword ptr arg(3) ;src_stride;
- sub rsi, rax
+ mov rdi, arg(2) ;above
+ mov rsi, arg(3) ;left
+ movsxd rax, dword ptr arg(4) ;left_stride;
+
pxor xmm0, xmm0
- movdqa xmm1, [rsi]
+ movdqa xmm1, [rdi]
psadbw xmm1, xmm0
movq xmm2, xmm1
punpckhqdq xmm1, xmm1
paddw xmm1, xmm2
; from left
- dec rsi
lea rdi, [rax*3]
- movzx ecx, byte [rsi+rax]
+
+ movzx ecx, byte [rsi]
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
movzx edx, byte [rsi+rax*2]
add ecx, edx
movzx edx, byte [rsi+rdi]
add ecx, edx
lea rsi, [rsi+rax*4]
+
movzx edx, byte [rsi]
add ecx, edx
movzx edx, byte [rsi+rax]
@@ -616,6 +619,7 @@ sym(vp8_intra_pred_y_dc_sse2):
movzx edx, byte [rsi+rdi]
add ecx, edx
lea rsi, [rsi+rax*4]
+
movzx edx, byte [rsi]
add ecx, edx
movzx edx, byte [rsi+rax]
@@ -625,6 +629,7 @@ sym(vp8_intra_pred_y_dc_sse2):
movzx edx, byte [rsi+rdi]
add ecx, edx
lea rsi, [rsi+rax*4]
+
movzx edx, byte [rsi]
add ecx, edx
movzx edx, byte [rsi+rax]
@@ -633,8 +638,6 @@ sym(vp8_intra_pred_y_dc_sse2):
add ecx, edx
movzx edx, byte [rsi+rdi]
add ecx, edx
- movzx edx, byte [rsi+rax*4]
- add ecx, edx
; add up
pextrw edx, xmm1, 0x0
@@ -676,22 +679,23 @@ sym(vp8_intra_pred_y_dc_sse2):
;void vp8_intra_pred_y_dctop_sse2(
; unsigned char *dst,
; int dst_stride
-; unsigned char *src,
-; int src_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
; )
global sym(vp8_intra_pred_y_dctop_sse2)
sym(vp8_intra_pred_y_dctop_sse2):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
+ SHADOW_ARGS_TO_STACK 5
push rsi
GET_GOT rbx
; end prolog
+ ;arg(3), arg(4) not used
+
; from top
- mov rcx, arg(2) ;src;
- movsxd rax, dword ptr arg(3) ;src_stride;
- sub rcx, rax
+ mov rcx, arg(2) ;above;
pxor xmm0, xmm0
movdqa xmm1, [rcx]
psadbw xmm1, xmm0
@@ -737,22 +741,25 @@ sym(vp8_intra_pred_y_dctop_sse2):
;void vp8_intra_pred_y_dcleft_sse2(
; unsigned char *dst,
; int dst_stride
-; unsigned char *src,
-; int src_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
; )
global sym(vp8_intra_pred_y_dcleft_sse2)
sym(vp8_intra_pred_y_dcleft_sse2):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
+ SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
; end prolog
+ ;arg(2) not used
+
; from left
- mov rsi, arg(2) ;src;
- movsxd rax, dword ptr arg(3) ;src_stride;
- dec rsi
+ mov rsi, arg(3) ;left;
+ movsxd rax, dword ptr arg(4) ;left_stride;
+
lea rdi, [rax*3]
movzx ecx, byte [rsi]
movzx edx, byte [rsi+rax]
@@ -827,18 +834,21 @@ sym(vp8_intra_pred_y_dcleft_sse2):
;void vp8_intra_pred_y_dc128_sse2(
; unsigned char *dst,
; int dst_stride
-; unsigned char *src,
-; int src_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
; )
global sym(vp8_intra_pred_y_dc128_sse2)
sym(vp8_intra_pred_y_dc128_sse2):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
+ SHADOW_ARGS_TO_STACK 5
push rsi
GET_GOT rbx
; end prolog
+ ;arg(2), arg(3), arg(4) not used
+
; write out
mov rsi, 2
movdqa xmm1, [GLOBAL(dc_128)]
@@ -870,15 +880,16 @@ sym(vp8_intra_pred_y_dc128_sse2):
;void vp8_intra_pred_y_tm_sse2(
; unsigned char *dst,
; int dst_stride
-; unsigned char *src,
-; int src_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
; )
%macro vp8_intra_pred_y_tm 1
global sym(vp8_intra_pred_y_tm_%1)
sym(vp8_intra_pred_y_tm_%1):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
+ SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
GET_GOT rbx
@@ -886,9 +897,8 @@ sym(vp8_intra_pred_y_tm_%1):
; read top row
mov edx, 8
- mov rsi, arg(2) ;src;
- movsxd rax, dword ptr arg(3) ;src_stride;
- sub rsi, rax
+ mov rsi, arg(2) ;above
+ movsxd rax, dword ptr arg(4) ;left_stride;
pxor xmm0, xmm0
%ifidn %1, ssse3
movdqa xmm3, [GLOBAL(dc_1024)]
@@ -900,7 +910,7 @@ sym(vp8_intra_pred_y_tm_%1):
; set up left ptrs and subtract topleft
movd xmm4, [rsi-1]
- lea rsi, [rsi+rax-1]
+ mov rsi, arg(3) ;left
%ifidn %1, sse2
punpcklbw xmm4, xmm0
pshuflw xmm4, xmm4, 0x0
@@ -958,27 +968,29 @@ vp8_intra_pred_y_tm ssse3
;void vp8_intra_pred_y_ve_sse2(
; unsigned char *dst,
; int dst_stride
-; unsigned char *src,
-; int src_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
; )
global sym(vp8_intra_pred_y_ve_sse2)
sym(vp8_intra_pred_y_ve_sse2):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
+ SHADOW_ARGS_TO_STACK 5
push rsi
; end prolog
+ ;arg(3), arg(4) not used
+
+ mov rax, arg(2) ;above;
+ mov rsi, 2
+ movsxd rdx, dword ptr arg(1) ;dst_stride
+
; read from top
- mov rax, arg(2) ;src;
- movsxd rdx, dword ptr arg(3) ;src_stride;
- sub rax, rdx
movdqa xmm1, [rax]
; write out
- mov rsi, 2
mov rax, arg(0) ;dst;
- movsxd rdx, dword ptr arg(1) ;dst_stride
lea rcx, [rdx*3]
.label
@@ -1004,25 +1016,27 @@ sym(vp8_intra_pred_y_ve_sse2):
;void vp8_intra_pred_y_ho_sse2(
; unsigned char *dst,
; int dst_stride
-; unsigned char *src,
-; int src_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
; )
global sym(vp8_intra_pred_y_ho_sse2)
sym(vp8_intra_pred_y_ho_sse2):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
+ SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
; end prolog
+ ;arg(2) not used
+
; read from left and write out
mov edx, 8
- mov rsi, arg(2) ;src;
- movsxd rax, dword ptr arg(3) ;src_stride;
+ mov rsi, arg(3) ;left;
+ movsxd rax, dword ptr arg(4) ;left_stride;
mov rdi, arg(0) ;dst;
movsxd rcx, dword ptr arg(1) ;dst_stride
- dec rsi
vp8_intra_pred_y_ho_sse2_loop:
movd xmm0, [rsi]