summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- vp8/common/rtcd_defs.sh 2
-rw-r--r-- vp8/common/x86/recon_sse2.asm 101
-rw-r--r-- vp8/common/x86/recon_wrapper_sse2.c 81
3 files changed, 100 insertions, 84 deletions
diff --git a/vp8/common/rtcd_defs.sh b/vp8/common/rtcd_defs.sh
index fee896595..c069a21f1 100644
--- a/vp8/common/rtcd_defs.sh
+++ b/vp8/common/rtcd_defs.sh
@@ -126,7 +126,7 @@ prototype void vp8_build_intra_predictors_mby_s "struct macroblockd *x, unsigned
#TODO: fix assembly --- specialize vp8_build_intra_predictors_mby_s sse2 ssse3 neon
prototype void vp8_build_intra_predictors_mbuv_s "struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride"
-#TODO: fix assembly --- specialize vp8_build_intra_predictors_mbuv_s sse2 ssse3
+specialize vp8_build_intra_predictors_mbuv_s sse2 ssse3
prototype void vp8_intra4x4_predict "unsigned char *src, int src_stride, int b_mode, unsigned char *dst, int dst_stride"
specialize vp8_intra4x4_predict media
diff --git a/vp8/common/x86/recon_sse2.asm b/vp8/common/x86/recon_sse2.asm
index 4b68ef5f2..d371ebd74 100644
--- a/vp8/common/x86/recon_sse2.asm
+++ b/vp8/common/x86/recon_sse2.asm
@@ -119,35 +119,39 @@ sym(vp8_copy_mem16x16_sse2):
;void vp8_intra_pred_uv_dc_mmx2(
; unsigned char *dst,
; int dst_stride
-; unsigned char *src,
-; int src_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride,
; )
global sym(vp8_intra_pred_uv_dc_mmx2)
sym(vp8_intra_pred_uv_dc_mmx2):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
+ SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
; end prolog
; from top
- mov rsi, arg(2) ;src;
- movsxd rax, dword ptr arg(3) ;src_stride;
- sub rsi, rax
+ mov rsi, arg(2) ;above;
pxor mm0, mm0
movq mm1, [rsi]
psadbw mm1, mm0
; from left
- dec rsi
+ mov rsi, arg(3) ;left;
+ movsxd rax, dword ptr arg(4) ;left_stride;
lea rdi, [rax*3]
- movzx ecx, byte [rsi+rax]
+ movzx ecx, byte [rsi]
+ movzx edx, byte [rsi+rax*1]
+ add ecx, edx
movzx edx, byte [rsi+rax*2]
add ecx, edx
+
+
movzx edx, byte [rsi+rdi]
- add ecx, edx
lea rsi, [rsi+rax*4]
+ add ecx, edx
movzx edx, byte [rsi]
add ecx, edx
movzx edx, byte [rsi+rax]
@@ -156,8 +160,6 @@ sym(vp8_intra_pred_uv_dc_mmx2):
add ecx, edx
movzx edx, byte [rsi+rdi]
add ecx, edx
- movzx edx, byte [rsi+rax*4]
- add ecx, edx
; add up
pextrw edx, mm1, 0x0
@@ -192,23 +194,24 @@ sym(vp8_intra_pred_uv_dc_mmx2):
;void vp8_intra_pred_uv_dctop_mmx2(
; unsigned char *dst,
; int dst_stride
-; unsigned char *src,
-; int src_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride,
; )
global sym(vp8_intra_pred_uv_dctop_mmx2)
sym(vp8_intra_pred_uv_dctop_mmx2):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
+ SHADOW_ARGS_TO_STACK 5
GET_GOT rbx
push rsi
push rdi
; end prolog
+ ;arg(3), arg(4) not used
+
; from top
- mov rsi, arg(2) ;src;
- movsxd rax, dword ptr arg(3) ;src_stride;
- sub rsi, rax
+ mov rsi, arg(2) ;above;
pxor mm0, mm0
movq mm1, [rsi]
psadbw mm1, mm0
@@ -245,22 +248,24 @@ sym(vp8_intra_pred_uv_dctop_mmx2):
;void vp8_intra_pred_uv_dcleft_mmx2(
; unsigned char *dst,
; int dst_stride
-; unsigned char *src,
-; int src_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride,
; )
global sym(vp8_intra_pred_uv_dcleft_mmx2)
sym(vp8_intra_pred_uv_dcleft_mmx2):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
+ SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
; end prolog
+ ;arg(2) not used
+
; from left
- mov rsi, arg(2) ;src;
- movsxd rax, dword ptr arg(3) ;src_stride;
- dec rsi
+ mov rsi, arg(3) ;left;
+ movsxd rax, dword ptr arg(4) ;left_stride;
lea rdi, [rax*3]
movzx ecx, byte [rsi]
movzx edx, byte [rsi+rax]
@@ -310,17 +315,20 @@ sym(vp8_intra_pred_uv_dcleft_mmx2):
;void vp8_intra_pred_uv_dc128_mmx(
; unsigned char *dst,
; int dst_stride
-; unsigned char *src,
-; int src_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride,
; )
global sym(vp8_intra_pred_uv_dc128_mmx)
sym(vp8_intra_pred_uv_dc128_mmx):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
+ SHADOW_ARGS_TO_STACK 5
GET_GOT rbx
; end prolog
+ ;arg(2), arg(3), arg(4) not used
+
; write out
movq mm1, [GLOBAL(dc_128)]
mov rax, arg(0) ;dst;
@@ -346,15 +354,16 @@ sym(vp8_intra_pred_uv_dc128_mmx):
;void vp8_intra_pred_uv_tm_sse2(
; unsigned char *dst,
; int dst_stride
-; unsigned char *src,
-; int src_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride,
; )
%macro vp8_intra_pred_uv_tm 1
global sym(vp8_intra_pred_uv_tm_%1)
sym(vp8_intra_pred_uv_tm_%1):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
+ SHADOW_ARGS_TO_STACK 5
GET_GOT rbx
push rsi
push rdi
@@ -362,9 +371,8 @@ sym(vp8_intra_pred_uv_tm_%1):
; read top row
mov edx, 4
- mov rsi, arg(2) ;src;
- movsxd rax, dword ptr arg(3) ;src_stride;
- sub rsi, rax
+ mov rsi, arg(2) ;above
+ movsxd rax, dword ptr arg(4) ;left_stride;
pxor xmm0, xmm0
%ifidn %1, ssse3
movdqa xmm2, [GLOBAL(dc_1024)]
@@ -374,7 +382,7 @@ sym(vp8_intra_pred_uv_tm_%1):
; set up left ptrs ans subtract topleft
movd xmm3, [rsi-1]
- lea rsi, [rsi+rax-1]
+ mov rsi, arg(3) ;left;
%ifidn %1, sse2
punpcklbw xmm3, xmm0
pshuflw xmm3, xmm3, 0x0
@@ -427,20 +435,22 @@ vp8_intra_pred_uv_tm ssse3
;void vp8_intra_pred_uv_ve_mmx(
; unsigned char *dst,
; int dst_stride
-; unsigned char *src,
-; int src_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride,
; )
global sym(vp8_intra_pred_uv_ve_mmx)
sym(vp8_intra_pred_uv_ve_mmx):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
+ SHADOW_ARGS_TO_STACK 5
; end prolog
+ ; arg(3), arg(4) not used
+
; read from top
mov rax, arg(2) ;src;
- movsxd rdx, dword ptr arg(3) ;src_stride;
- sub rax, rdx
+
movq mm1, [rax]
; write out
@@ -466,15 +476,16 @@ sym(vp8_intra_pred_uv_ve_mmx):
;void vp8_intra_pred_uv_ho_mmx2(
; unsigned char *dst,
; int dst_stride
-; unsigned char *src,
-; int src_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride,
; )
%macro vp8_intra_pred_uv_ho 1
global sym(vp8_intra_pred_uv_ho_%1)
sym(vp8_intra_pred_uv_ho_%1):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
+ SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
%ifidn %1, ssse3
@@ -485,12 +496,14 @@ sym(vp8_intra_pred_uv_ho_%1):
%endif
; end prolog
+ ;arg(2) not used
+
; read from left and write out
%ifidn %1, mmx2
mov edx, 4
%endif
- mov rsi, arg(2) ;src;
- movsxd rax, dword ptr arg(3) ;src_stride;
+ mov rsi, arg(3) ;left
+ movsxd rax, dword ptr arg(4) ;left_stride;
mov rdi, arg(0) ;dst;
movsxd rcx, dword ptr arg(1) ;dst_stride
%ifidn %1, ssse3
@@ -498,7 +511,7 @@ sym(vp8_intra_pred_uv_ho_%1):
movdqa xmm2, [GLOBAL(dc_00001111)]
lea rbx, [rax*3]
%endif
- dec rsi
+
%ifidn %1, mmx2
.vp8_intra_pred_uv_ho_%1_loop:
movd mm0, [rsi]
diff --git a/vp8/common/x86/recon_wrapper_sse2.c b/vp8/common/x86/recon_wrapper_sse2.c
index cb9ab80b9..949b2fb0e 100644
--- a/vp8/common/x86/recon_wrapper_sse2.c
+++ b/vp8/common/x86/recon_wrapper_sse2.c
@@ -15,7 +15,8 @@
#define build_intra_predictors_mbuv_prototype(sym) \
void sym(unsigned char *dst, int dst_stride, \
- const unsigned char *src, int src_stride)
+ const unsigned char *above, \
+ const unsigned char *left, int left_stride)
typedef build_intra_predictors_mbuv_prototype((*build_intra_predictors_mbuv_fn_t));
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc_mmx2);
@@ -29,15 +30,19 @@ extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_sse2);
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_ssse3);
static void vp8_build_intra_predictors_mbuv_x86(MACROBLOCKD *x,
+ unsigned char * uabove_row,
+ unsigned char * vabove_row,
unsigned char *dst_u,
unsigned char *dst_v,
int dst_stride,
+ unsigned char * uleft,
+ unsigned char * vleft,
+ int left_stride,
build_intra_predictors_mbuv_fn_t tm_func,
build_intra_predictors_mbuv_fn_t ho_func)
{
int mode = x->mode_info_context->mbmi.uv_mode;
build_intra_predictors_mbuv_fn_t fn;
- int src_stride = x->dst.uv_stride;
switch (mode) {
case V_PRED: fn = vp8_intra_pred_uv_ve_mmx; break;
@@ -59,38 +64,48 @@ static void vp8_build_intra_predictors_mbuv_x86(MACROBLOCKD *x,
default: return;
}
- fn(dst_u, dst_stride, x->dst.u_buffer, src_stride);
- fn(dst_v, dst_stride, x->dst.v_buffer, src_stride);
+ fn(dst_u, dst_stride, uabove_row, uleft, left_stride);
+ fn(dst_v, dst_stride, vabove_row, vleft, left_stride);
}
-void vp8_build_intra_predictors_mbuv_sse2(MACROBLOCKD *x)
+void vp8_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *x,
+ unsigned char * uabove_row,
+ unsigned char * vabove_row,
+ unsigned char * uleft,
+ unsigned char * vleft,
+ int left_stride,
+ unsigned char * upred_ptr,
+ unsigned char * vpred_ptr,
+ int pred_stride)
{
- vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256],
- &x->predictor[320], 8,
+ vp8_build_intra_predictors_mbuv_x86(x,
+ uabove_row, vabove_row,
+ upred_ptr,
+ vpred_ptr, pred_stride,
+ uleft,
+ vleft,
+ left_stride,
vp8_intra_pred_uv_tm_sse2,
vp8_intra_pred_uv_ho_mmx2);
}
-void vp8_build_intra_predictors_mbuv_ssse3(MACROBLOCKD *x)
+void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x,
+ unsigned char * uabove_row,
+ unsigned char * vabove_row,
+ unsigned char * uleft,
+ unsigned char * vleft,
+ int left_stride,
+ unsigned char * upred_ptr,
+ unsigned char * vpred_ptr,
+ int pred_stride)
{
- vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256],
- &x->predictor[320], 8,
- vp8_intra_pred_uv_tm_ssse3,
- vp8_intra_pred_uv_ho_ssse3);
-}
-
-void vp8_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *x)
-{
- vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer,
- x->dst.v_buffer, x->dst.uv_stride,
- vp8_intra_pred_uv_tm_sse2,
- vp8_intra_pred_uv_ho_mmx2);
-}
-
-void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x)
-{
- vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer,
- x->dst.v_buffer, x->dst.uv_stride,
+ vp8_build_intra_predictors_mbuv_x86(x,
+ uabove_row, vabove_row,
+ upred_ptr,
+ vpred_ptr, pred_stride,
+ uleft,
+ vleft,
+ left_stride,
vp8_intra_pred_uv_tm_ssse3,
vp8_intra_pred_uv_ho_ssse3);
}
@@ -132,22 +147,10 @@ static void vp8_build_intra_predictors_mby_x86(MACROBLOCKD *x,
default: return;
}
- fn(dst_y, dst_stride, x->dst.y_buffer, src_stride);
+// fn(dst_y, dst_stride, x->dst.y_buffer, src_stride);
return;
}
-void vp8_build_intra_predictors_mby_sse2(MACROBLOCKD *x)
-{
- vp8_build_intra_predictors_mby_x86(x, x->predictor, 16,
- vp8_intra_pred_y_tm_sse2);
-}
-
-void vp8_build_intra_predictors_mby_ssse3(MACROBLOCKD *x)
-{
- vp8_build_intra_predictors_mby_x86(x, x->predictor, 16,
- vp8_intra_pred_y_tm_ssse3);
-}
-
void vp8_build_intra_predictors_mby_s_sse2(MACROBLOCKD *x)
{
vp8_build_intra_predictors_mby_x86(x, x->dst.y_buffer, x->dst.y_stride,