summaryrefslogtreecommitdiff
path: root/vp8/common
diff options
context:
space:
mode:
authorRonald S. Bultje <rbultje@google.com>2011-04-29 11:51:37 -0700
committerRonald S. Bultje <rbultje@google.com>2011-04-29 11:52:09 -0700
commit5a23352c030d2b190976ea55a9a759c734bd9eaa (patch)
tree2b7f31efe7035493ff80fcf95376ceb99a86b49c /vp8/common
parentdfa9e2c5ea4a4282d931e382c327fc2d149ebcf8 (diff)
downloadlibvpx-5a23352c030d2b190976ea55a9a759c734bd9eaa.tar
libvpx-5a23352c030d2b190976ea55a9a759c734bd9eaa.tar.gz
libvpx-5a23352c030d2b190976ea55a9a759c734bd9eaa.tar.bz2
libvpx-5a23352c030d2b190976ea55a9a759c734bd9eaa.zip
Make hor UV predict ~2x faster (73 vs 132 cycles) using SSSE3.
Change-Id: I658a1df7d825f820573cb2d11ad402f9d2791035
Diffstat (limited to 'vp8/common')
-rw-r--r--vp8/common/x86/recon_sse2.asm59
-rw-r--r--vp8/common/x86/recon_wrapper_sse2.c18
2 files changed, 67 insertions, 10 deletions
diff --git a/vp8/common/x86/recon_sse2.asm b/vp8/common/x86/recon_sse2.asm
index 97dc4f686..aaa6a8fb9 100644
--- a/vp8/common/x86/recon_sse2.asm
+++ b/vp8/common/x86/recon_sse2.asm
@@ -578,23 +578,35 @@ sym(vp8_intra_pred_uv_ve_mmx):
; unsigned char *src,
; int src_stride,
; )
-global sym(vp8_intra_pred_uv_ho_mmx2)
-sym(vp8_intra_pred_uv_ho_mmx2):
+%macro vp8_intra_pred_uv_ho 1
+global sym(vp8_intra_pred_uv_ho_%1)
+sym(vp8_intra_pred_uv_ho_%1):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
push rsi
push rdi
+%ifidn %1, ssse3
+ push rbx
+%endif
; end prolog
; read from left and write out
+%ifidn %1, mmx2
mov edx, 4
+%endif
mov rsi, arg(2) ;src;
movsxd rax, dword ptr arg(3) ;src_stride;
mov rdi, arg(0) ;dst;
movsxd rcx, dword ptr arg(1) ;dst_stride
+%ifidn %1, ssse3
+ lea rbx, [rax*3]
+ lea rdx, [rcx*3]
+ movdqa xmm2, [GLOBAL(dc_00001111)]
+%endif
dec rsi
-vp8_intra_pred_uv_ho_mmx2_loop:
+%ifidn %1, mmx2
+vp8_intra_pred_uv_ho_%1_loop:
movd mm0, [rsi]
movd mm1, [rsi+rax]
punpcklbw mm0, mm0
@@ -606,14 +618,49 @@ vp8_intra_pred_uv_ho_mmx2_loop:
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rcx*2]
dec edx
- jnz vp8_intra_pred_uv_ho_mmx2_loop
+ jnz vp8_intra_pred_uv_ho_%1_loop
+%else
+ movd xmm0, [rsi]
+ movd xmm3, [rsi+rax]
+ movd xmm1, [rsi+rax*2]
+ movd xmm4, [rsi+rbx]
+ punpcklbw xmm0, xmm3
+ punpcklbw xmm1, xmm4
+ pshufb xmm0, xmm2
+ pshufb xmm1, xmm2
+ movq [rdi ], xmm0
+ movhps [rdi+rcx], xmm0
+ movq [rdi+rcx*2], xmm1
+ movhps [rdi+rdx], xmm1
+ lea rsi, [rsi+rax*4]
+ lea rdi, [rdi+rcx*4]
+ movd xmm0, [rsi]
+ movd xmm3, [rsi+rax]
+ movd xmm1, [rsi+rax*2]
+ movd xmm4, [rsi+rbx]
+ punpcklbw xmm0, xmm3
+ punpcklbw xmm1, xmm4
+ pshufb xmm0, xmm2
+ pshufb xmm1, xmm2
+ movq [rdi ], xmm0
+ movhps [rdi+rcx], xmm0
+ movq [rdi+rcx*2], xmm1
+ movhps [rdi+rdx], xmm1
+%endif
; begin epilog
+%ifidn %1, ssse3
+ pop rbx
+%endif
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
+%endmacro
+
+vp8_intra_pred_uv_ho mmx2
+vp8_intra_pred_uv_ho ssse3
SECTION_RODATA
dc_128:
@@ -623,3 +670,7 @@ dc_4:
align 16
dc_1024:
times 8 dw 0x400
+align 16
+dc_00001111:
+ times 8 db 0
+ times 8 db 1
diff --git a/vp8/common/x86/recon_wrapper_sse2.c b/vp8/common/x86/recon_wrapper_sse2.c
index 86b4da2c2..cb7b69c08 100644
--- a/vp8/common/x86/recon_wrapper_sse2.c
+++ b/vp8/common/x86/recon_wrapper_sse2.c
@@ -23,6 +23,7 @@ extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dctop_mmx2);
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dcleft_mmx2);
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc128_mmx);
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ho_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ho_ssse3);
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ve_mmx);
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_sse2);
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_ssse3);
@@ -31,7 +32,8 @@ static void vp8_build_intra_predictors_mbuv_x86(MACROBLOCKD *x,
unsigned char *dst_u,
unsigned char *dst_v,
int dst_stride,
- build_intra_predictors_mbuv_fn_t tm_func)
+ build_intra_predictors_mbuv_fn_t tm_func,
+ build_intra_predictors_mbuv_fn_t ho_func)
{
int mode = x->mode_info_context->mbmi.uv_mode;
build_intra_predictors_mbuv_fn_t fn;
@@ -39,7 +41,7 @@ static void vp8_build_intra_predictors_mbuv_x86(MACROBLOCKD *x,
switch (mode) {
case V_PRED: fn = vp8_intra_pred_uv_ve_mmx; break;
- case H_PRED: fn = vp8_intra_pred_uv_ho_mmx2; break;
+ case H_PRED: fn = ho_func; break;
case TM_PRED: fn = tm_func; break;
case DC_PRED:
if (x->up_available) {
@@ -65,26 +67,30 @@ void vp8_build_intra_predictors_mbuv_sse2(MACROBLOCKD *x)
{
vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256],
&x->predictor[320], 8,
- vp8_intra_pred_uv_tm_sse2);
+ vp8_intra_pred_uv_tm_sse2,
+ vp8_intra_pred_uv_ho_mmx2);
}
void vp8_build_intra_predictors_mbuv_ssse3(MACROBLOCKD *x)
{
vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256],
&x->predictor[320], 8,
- vp8_intra_pred_uv_tm_ssse3);
+ vp8_intra_pred_uv_tm_ssse3,
+ vp8_intra_pred_uv_ho_ssse3);
}
void vp8_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *x)
{
vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer,
x->dst.v_buffer, x->dst.uv_stride,
- vp8_intra_pred_uv_tm_sse2);
+ vp8_intra_pred_uv_tm_sse2,
+ vp8_intra_pred_uv_ho_mmx2);
}
void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x)
{
vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer,
x->dst.v_buffer, x->dst.uv_stride,
- vp8_intra_pred_uv_tm_ssse3);
+ vp8_intra_pred_uv_tm_ssse3,
+ vp8_intra_pred_uv_ho_ssse3);
}