summaryrefslogtreecommitdiff
path: root/vp8/common
diff options
context:
space:
mode:
authorJohn Koleszar <jkoleszar@google.com>2011-04-28 00:05:07 -0400
committerJohn Koleszar <jkoleszar@google.com>2011-04-28 00:05:07 -0400
commitc26bb0fe8f84d6e9b67f9304fc2b1b980e153a37 (patch)
tree092fd3e570fab64ddf6ac9716ca912095557400b /vp8/common
parentb93faff5a017500f131bdce3777b4774d47e6641 (diff)
parent2e102855f4f69148d17771f584c26e1498ec82e2 (diff)
downloadlibvpx-c26bb0fe8f84d6e9b67f9304fc2b1b980e153a37.tar
libvpx-c26bb0fe8f84d6e9b67f9304fc2b1b980e153a37.tar.gz
libvpx-c26bb0fe8f84d6e9b67f9304fc2b1b980e153a37.tar.bz2
libvpx-c26bb0fe8f84d6e9b67f9304fc2b1b980e153a37.zip
Merge remote branch 'origin/master' into experimental
Change-Id: I7d91efbc3662c86d6efa2d7495eb4689ccdb0ced
Diffstat (limited to 'vp8/common')
-rw-r--r--vp8/common/generic/systemdependent.c6
-rw-r--r--vp8/common/recon.h25
-rw-r--r--vp8/common/reconinter.c104
-rw-r--r--vp8/common/reconinter.h1
-rw-r--r--vp8/common/reconintra.h5
-rw-r--r--vp8/common/reconintra4x4.c2
-rw-r--r--vp8/common/x86/recon_sse2.asm408
-rw-r--r--vp8/common/x86/recon_wrapper_sse2.c90
-rw-r--r--vp8/common/x86/recon_x86.h22
-rw-r--r--vp8/common/x86/x86_systemdependent.c9
10 files changed, 561 insertions, 111 deletions
diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c
index 2f8997953..fea6dcd23 100644
--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c
@@ -43,6 +43,12 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
vp8_build_intra_predictors_mby;
rtcd->recon.build_intra_predictors_mby_s =
vp8_build_intra_predictors_mby_s;
+ rtcd->recon.build_intra_predictors_mbuv =
+ vp8_build_intra_predictors_mbuv;
+ rtcd->recon.build_intra_predictors_mbuv_s =
+ vp8_build_intra_predictors_mbuv_s;
+ rtcd->recon.intra4x4_predict =
+ vp8_intra4x4_predict;
rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_c;
rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_c;
diff --git a/vp8/common/recon.h b/vp8/common/recon.h
index e608f218c..7cfc779cd 100644
--- a/vp8/common/recon.h
+++ b/vp8/common/recon.h
@@ -26,6 +26,9 @@
#define prototype_build_intra_predictors(sym) \
void sym(MACROBLOCKD *x)
+#define prototype_intra4x4_predict(sym) \
+ void sym(BLOCKD *x, int b_mode, unsigned char *predictor)
+
struct vp8_recon_rtcd_vtable;
#if ARCH_X86 || ARCH_X86_64
@@ -88,11 +91,30 @@ extern prototype_build_intra_predictors\
extern prototype_build_intra_predictors\
(vp8_recon_build_intra_predictors_mby_s);
+#ifndef vp8_recon_build_intra_predictors_mbuv
+#define vp8_recon_build_intra_predictors_mbuv vp8_build_intra_predictors_mbuv
+#endif
+extern prototype_build_intra_predictors\
+ (vp8_recon_build_intra_predictors_mbuv);
+
+#ifndef vp8_recon_build_intra_predictors_mbuv_s
+#define vp8_recon_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s
+#endif
+extern prototype_build_intra_predictors\
+ (vp8_recon_build_intra_predictors_mbuv_s);
+
+#ifndef vp8_recon_intra4x4_predict
+#define vp8_recon_intra4x4_predict vp8_intra4x4_predict
+#endif
+extern prototype_intra4x4_predict\
+ (vp8_recon_intra4x4_predict);
+
typedef prototype_copy_block((*vp8_copy_block_fn_t));
typedef prototype_recon_block((*vp8_recon_fn_t));
typedef prototype_recon_macroblock((*vp8_recon_mb_fn_t));
typedef prototype_build_intra_predictors((*vp8_build_intra_pred_fn_t));
+typedef prototype_intra4x4_predict((*vp8_intra4x4_pred_fn_t));
typedef struct vp8_recon_rtcd_vtable
{
vp8_copy_block_fn_t copy16x16;
@@ -105,6 +127,9 @@ typedef struct vp8_recon_rtcd_vtable
vp8_recon_mb_fn_t recon_mby;
vp8_build_intra_pred_fn_t build_intra_predictors_mby_s;
vp8_build_intra_pred_fn_t build_intra_predictors_mby;
+ vp8_build_intra_pred_fn_t build_intra_predictors_mbuv_s;
+ vp8_build_intra_pred_fn_t build_intra_predictors_mbuv;
+ vp8_intra4x4_pred_fn_t intra4x4_predict;
} vp8_recon_rtcd_vtable_t;
#if CONFIG_RUNTIME_CPU_DETECT
diff --git a/vp8/common/reconinter.c b/vp8/common/reconinter.c
index 6862bae11..80b17acb6 100644
--- a/vp8/common/reconinter.c
+++ b/vp8/common/reconinter.c
@@ -543,107 +543,3 @@ void vp8_build_inter16x16_predictors_mb_s(MACROBLOCKD *x)
RECON_INVOKE(&x->rtcd->recon, copy8x8)(vptr, pre_stride, vdst_ptr, x->dst.uv_stride);
}
}
-void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x)
-{
- unsigned char *dst_ptr = x->dst.y_buffer;
-
- if (x->mode_info_context->mbmi.mode != SPLITMV)
- {
- vp8_build_inter16x16_predictors_mb_s(x);
- }
- else
- {
- /* note: this whole ELSE part is not executed at all. So, no way to test the correctness of my modification. Later,
- * if sth is wrong, go back to what it is in build_inter_predictors_mb.
- */
- int i;
-
- if (x->mode_info_context->mbmi.partitioning < 3)
- {
- for (i = 0; i < 4; i++)
- {
- unsigned char *ptr_base;
- unsigned char *ptr;
- BLOCKD *d = &x->block[bbb[i]];
-
- ptr_base = *(d->base_pre);
- ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
-
- if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
- {
- x->subpixel_predict8x8(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/
- }
- else
- {
- RECON_INVOKE(&x->rtcd->recon, copy8x8)(ptr, d->pre_stride, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/
- }
- }
- }
- else
- {
- for (i = 0; i < 16; i += 2)
- {
- BLOCKD *d0 = &x->block[i];
- BLOCKD *d1 = &x->block[i+1];
-
- if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
- {
- /*build_inter_predictors2b(x, d0, 16);*/
- unsigned char *ptr_base;
- unsigned char *ptr;
-
- ptr_base = *(d0->base_pre);
- ptr = ptr_base + d0->pre + (d0->bmi.mv.as_mv.row >> 3) * d0->pre_stride + (d0->bmi.mv.as_mv.col >> 3);
-
- if (d0->bmi.mv.as_mv.row & 7 || d0->bmi.mv.as_mv.col & 7)
- {
- x->subpixel_predict8x4(ptr, d0->pre_stride, d0->bmi.mv.as_mv.col & 7, d0->bmi.mv.as_mv.row & 7, dst_ptr, x->dst.y_stride);
- }
- else
- {
- RECON_INVOKE(&x->rtcd->recon, copy8x4)(ptr, d0->pre_stride, dst_ptr, x->dst.y_stride);
- }
- }
- else
- {
- vp8_build_inter_predictors_b_s(d0, dst_ptr, x->subpixel_predict);
- vp8_build_inter_predictors_b_s(d1, dst_ptr, x->subpixel_predict);
- }
- }
- }
-
- for (i = 16; i < 24; i += 2)
- {
- BLOCKD *d0 = &x->block[i];
- BLOCKD *d1 = &x->block[i+1];
-
- if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
- {
- /*build_inter_predictors2b(x, d0, 8);*/
- unsigned char *ptr_base;
- unsigned char *ptr;
-
- ptr_base = *(d0->base_pre);
- ptr = ptr_base + d0->pre + (d0->bmi.mv.as_mv.row >> 3) * d0->pre_stride + (d0->bmi.mv.as_mv.col >> 3);
-
- if (d0->bmi.mv.as_mv.row & 7 || d0->bmi.mv.as_mv.col & 7)
- {
- x->subpixel_predict8x4(ptr, d0->pre_stride,
- d0->bmi.mv.as_mv.col & 7,
- d0->bmi.mv.as_mv.row & 7,
- dst_ptr, x->dst.uv_stride);
- }
- else
- {
- RECON_INVOKE(&x->rtcd->recon, copy8x4)(ptr,
- d0->pre_stride, dst_ptr, x->dst.uv_stride);
- }
- }
- else
- {
- vp8_build_inter_predictors_b_s(d0, dst_ptr, x->subpixel_predict);
- vp8_build_inter_predictors_b_s(d1, dst_ptr, x->subpixel_predict);
- }
- }
- }
-}
diff --git a/vp8/common/reconinter.h b/vp8/common/reconinter.h
index 688bebe96..bdf49c9d0 100644
--- a/vp8/common/reconinter.h
+++ b/vp8/common/reconinter.h
@@ -13,7 +13,6 @@
#define __INC_RECONINTER_H
extern void vp8_build_inter_predictors_mb(MACROBLOCKD *x);
-extern void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x);
extern void vp8_build_inter16x16_predictors_mb_s(MACROBLOCKD *x);
extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x);
diff --git a/vp8/common/reconintra.h b/vp8/common/reconintra.h
index 4025a5307..47e479285 100644
--- a/vp8/common/reconintra.h
+++ b/vp8/common/reconintra.h
@@ -14,9 +14,4 @@
extern void init_intra_left_above_pixels(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mbuv(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x);
-
-extern void vp8_predict_intra4x4(BLOCKD *x, int b_mode, unsigned char *Predictor);
-
#endif
diff --git a/vp8/common/reconintra4x4.c b/vp8/common/reconintra4x4.c
index 8ddae0059..12e2e60c7 100644
--- a/vp8/common/reconintra4x4.c
+++ b/vp8/common/reconintra4x4.c
@@ -14,7 +14,7 @@
#include "vpx_mem/vpx_mem.h"
#include "reconintra.h"
-void vp8_predict_intra4x4(BLOCKD *x,
+void vp8_intra4x4_predict(BLOCKD *x,
int b_mode,
unsigned char *predictor)
{
diff --git a/vp8/common/x86/recon_sse2.asm b/vp8/common/x86/recon_sse2.asm
index 67b6420a9..a1cc2b1fe 100644
--- a/vp8/common/x86/recon_sse2.asm
+++ b/vp8/common/x86/recon_sse2.asm
@@ -229,3 +229,411 @@ sym(vp8_copy_mem16x16_sse2):
UNSHADOW_ARGS
pop rbp
ret
+
+
+;void vp8_intra_pred_uv_dc_mmx2(
+; unsigned char *dst,
+; int dst_stride
+; unsigned char *src,
+; int src_stride,
+; )
+global sym(vp8_intra_pred_uv_dc_mmx2)
+sym(vp8_intra_pred_uv_dc_mmx2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; from top
+ mov rsi, arg(2) ;src;
+ movsxd rax, dword ptr arg(3) ;src_stride;
+ sub rsi, rax
+ pxor mm0, mm0
+ movd mm1, [rsi]
+ movd mm2, [rsi+4]
+ punpcklbw mm1, mm0
+ punpcklbw mm2, mm0
+ paddw mm1, mm2
+ pshufw mm2, mm1, 0x0e
+ paddw mm1, mm2
+ pshufw mm2, mm1, 0x01
+ paddw mm1, mm2
+
+ ; from left
+ dec rsi
+ lea rdi, [rax*3]
+ movzx ecx, byte [rsi+rax]
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+ lea rsi, [rsi+rax*4]
+ movzx edx, byte [rsi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*4]
+ add ecx, edx
+
+ ; add up
+ pextrw edx, mm1, 0x0
+ lea edx, [edx+ecx+8]
+ sar edx, 4
+ movd mm1, edx
+ pshufw mm1, mm1, 0x0
+ packuswb mm1, mm1
+
+ ; write out
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+ lea rax, [rcx*3]
+
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+ lea rdi, [rdi+rcx*4]
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_uv_dctop_mmx2(
+; unsigned char *dst,
+; int dst_stride
+; unsigned char *src,
+; int src_stride,
+; )
+global sym(vp8_intra_pred_uv_dctop_mmx2)
+sym(vp8_intra_pred_uv_dctop_mmx2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; from top
+ mov rsi, arg(2) ;src;
+ movsxd rax, dword ptr arg(3) ;src_stride;
+ sub rsi, rax
+ pxor mm0, mm0
+ movd mm1, [rsi]
+ movd mm2, [rsi+4]
+ punpcklbw mm1, mm0
+ punpcklbw mm2, mm0
+ paddw mm1, mm2
+ pshufw mm2, mm1, 0x0e
+ paddw mm1, mm2
+ pshufw mm2, mm1, 0x01
+ paddw mm1, mm2
+
+ ; add up
+ paddw mm1, [GLOBAL(dc_4)]
+ psraw mm1, 3
+ pshufw mm1, mm1, 0x0
+ packuswb mm1, mm1
+
+ ; write out
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+ lea rax, [rcx*3]
+
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+ lea rdi, [rdi+rcx*4]
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_uv_dcleft_mmx2(
+; unsigned char *dst,
+; int dst_stride
+; unsigned char *src,
+; int src_stride,
+; )
+global sym(vp8_intra_pred_uv_dcleft_mmx2)
+sym(vp8_intra_pred_uv_dcleft_mmx2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; from left
+ mov rsi, arg(2) ;src;
+ movsxd rax, dword ptr arg(3) ;src_stride;
+ dec rsi
+ lea rdi, [rax*3]
+ movzx ecx, byte [rsi]
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+ lea rsi, [rsi+rax*4]
+ movzx edx, byte [rsi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ lea edx, [ecx+edx+4]
+
+ ; add up
+ shr edx, 3
+ movd mm1, edx
+ pshufw mm1, mm1, 0x0
+ packuswb mm1, mm1
+
+ ; write out
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+ lea rax, [rcx*3]
+
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+ lea rdi, [rdi+rcx*4]
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_uv_dc128_mmx(
+; unsigned char *dst,
+; int dst_stride
+; unsigned char *src,
+; int src_stride,
+; )
+global sym(vp8_intra_pred_uv_dc128_mmx)
+sym(vp8_intra_pred_uv_dc128_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ ; end prolog
+
+ ; write out
+ movq mm1, [GLOBAL(dc_128)]
+ mov rax, arg(0) ;dst;
+ movsxd rdx, dword ptr arg(1) ;dst_stride
+ lea rcx, [rdx*3]
+
+ movq [rax ], mm1
+ movq [rax+rdx ], mm1
+ movq [rax+rdx*2], mm1
+ movq [rax+rcx ], mm1
+ lea rax, [rax+rdx*4]
+ movq [rax ], mm1
+ movq [rax+rdx ], mm1
+ movq [rax+rdx*2], mm1
+ movq [rax+rcx ], mm1
+
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_uv_tm_sse2(
+; unsigned char *dst,
+; int dst_stride
+; unsigned char *src,
+; int src_stride,
+; )
+%macro vp8_intra_pred_uv_tm 1
+global sym(vp8_intra_pred_uv_tm_%1)
+sym(vp8_intra_pred_uv_tm_%1):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; read top row
+ mov edx, 4
+ mov rsi, arg(2) ;src;
+ movsxd rax, dword ptr arg(3) ;src_stride;
+ sub rsi, rax
+ pxor xmm0, xmm0
+%ifidn %1, ssse3
+ movdqa xmm2, [GLOBAL(dc_1024)]
+%endif
+ movq xmm1, [rsi]
+ punpcklbw xmm1, xmm0
+
+ ; set up left ptrs ans subtract topleft
+ movd xmm3, [rsi-1]
+ lea rsi, [rsi+rax-1]
+%ifidn %1, sse2
+ punpcklbw xmm3, xmm0
+ pshuflw xmm3, xmm3, 0x0
+ punpcklqdq xmm3, xmm3
+%else
+ pshufb xmm3, xmm2
+%endif
+ psubw xmm1, xmm3
+
+ ; set up dest ptrs
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+
+vp8_intra_pred_uv_tm_%1_loop:
+ movd xmm3, [rsi]
+ movd xmm5, [rsi+rax]
+%ifidn %1, sse2
+ punpcklbw xmm3, xmm0
+ punpcklbw xmm5, xmm0
+ pshuflw xmm3, xmm3, 0x0
+ pshuflw xmm5, xmm5, 0x0
+ punpcklqdq xmm3, xmm3
+ punpcklqdq xmm5, xmm5
+%else
+ pshufb xmm3, xmm2
+ pshufb xmm5, xmm2
+%endif
+ paddw xmm3, xmm1
+ paddw xmm5, xmm1
+ packuswb xmm3, xmm5
+ movq [rdi ], xmm3
+ movhps[rdi+rcx], xmm3
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rcx*2]
+ dec edx
+ jnz vp8_intra_pred_uv_tm_%1_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%endmacro
+
+vp8_intra_pred_uv_tm sse2
+vp8_intra_pred_uv_tm ssse3
+
+;void vp8_intra_pred_uv_ve_mmx(
+; unsigned char *dst,
+; int dst_stride
+; unsigned char *src,
+; int src_stride,
+; )
+global sym(vp8_intra_pred_uv_ve_mmx)
+sym(vp8_intra_pred_uv_ve_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ ; end prolog
+
+ ; read from top
+ mov rax, arg(2) ;src;
+ movsxd rdx, dword ptr arg(3) ;src_stride;
+ sub rax, rdx
+ movq mm1, [rax]
+
+ ; write out
+ mov rax, arg(0) ;dst;
+ movsxd rdx, dword ptr arg(1) ;dst_stride
+ lea rcx, [rdx*3]
+
+ movq [rax ], mm1
+ movq [rax+rdx ], mm1
+ movq [rax+rdx*2], mm1
+ movq [rax+rcx ], mm1
+ lea rax, [rax+rdx*4]
+ movq [rax ], mm1
+ movq [rax+rdx ], mm1
+ movq [rax+rdx*2], mm1
+ movq [rax+rcx ], mm1
+
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_uv_ho_mmx2(
+; unsigned char *dst,
+; int dst_stride
+; unsigned char *src,
+; int src_stride,
+; )
+global sym(vp8_intra_pred_uv_ho_mmx2)
+sym(vp8_intra_pred_uv_ho_mmx2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; read from left and write out
+ mov edx, 4
+ mov rsi, arg(2) ;src;
+ movsxd rax, dword ptr arg(3) ;src_stride;
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+ dec rsi
+vp8_intra_pred_uv_ho_mmx2_loop:
+ movd mm0, [rsi]
+ movd mm1, [rsi+rax]
+ punpcklbw mm0, mm0
+ punpcklbw mm1, mm1
+ pshufw mm0, mm0, 0x0
+ pshufw mm1, mm1, 0x0
+ movq [rdi ], mm0
+ movq [rdi+rcx], mm1
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rcx*2]
+ dec edx
+ jnz vp8_intra_pred_uv_ho_mmx2_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+dc_128:
+ times 8 db 128
+dc_4:
+ times 4 dw 4
+align 16
+dc_1024:
+ times 8 dw 0x400
diff --git a/vp8/common/x86/recon_wrapper_sse2.c b/vp8/common/x86/recon_wrapper_sse2.c
new file mode 100644
index 000000000..7b17851b5
--- /dev/null
+++ b/vp8/common/x86/recon_wrapper_sse2.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "vp8/common/recon.h"
+#include "recon_x86.h"
+#include "vpx_mem/vpx_mem.h"
+
+#define build_intra_predictors_mbuv_prototype(sym) \
+ void sym(unsigned char *dst, int dst_stride, \
+ const unsigned char *src, int src_stride)
+typedef build_intra_predictors_mbuv_prototype((*build_intra_predictors_mbuv_fn_t));
+
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dctop_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dcleft_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc128_mmx);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ho_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ve_mmx);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_sse2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_ssse3);
+
+static inline void vp8_build_intra_predictors_mbuv_x86(MACROBLOCKD *x,
+ unsigned char *dst_u,
+ unsigned char *dst_v,
+ int dst_stride,
+ build_intra_predictors_mbuv_fn_t tm_func)
+{
+ int mode = x->mode_info_context->mbmi.uv_mode;
+ build_intra_predictors_mbuv_fn_t fn;
+ int src_stride = x->dst.uv_stride;
+
+ switch (mode) {
+ case V_PRED: fn = vp8_intra_pred_uv_ve_mmx; break;
+ case H_PRED: fn = vp8_intra_pred_uv_ho_mmx2; break;
+ case TM_PRED: fn = tm_func; break;
+ case DC_PRED:
+ if (x->up_available) {
+ if (x->left_available) {
+ fn = vp8_intra_pred_uv_dc_mmx2; break;
+ } else {
+ fn = vp8_intra_pred_uv_dctop_mmx2; break;
+ }
+ } else if (x->left_available) {
+ fn = vp8_intra_pred_uv_dcleft_mmx2; break;
+ } else {
+ fn = vp8_intra_pred_uv_dc128_mmx; break;
+ }
+ break;
+ default: return;
+ }
+
+ fn(dst_u, dst_stride, x->dst.u_buffer, src_stride);
+ fn(dst_v, dst_stride, x->dst.v_buffer, src_stride);
+}
+
+void vp8_build_intra_predictors_mbuv_sse2(MACROBLOCKD *x)
+{
+ vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256],
+ &x->predictor[320], 8,
+ vp8_intra_pred_uv_tm_sse2);
+}
+
+void vp8_build_intra_predictors_mbuv_ssse3(MACROBLOCKD *x)
+{
+ vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256],
+ &x->predictor[320], 8,
+ vp8_intra_pred_uv_tm_ssse3);
+}
+
+void vp8_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *x)
+{
+ vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer,
+ x->dst.v_buffer, x->dst.uv_stride,
+ vp8_intra_pred_uv_tm_sse2);
+}
+
+void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x)
+{
+ vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer,
+ x->dst.v_buffer, x->dst.uv_stride,
+ vp8_intra_pred_uv_tm_ssse3);
+}
diff --git a/vp8/common/x86/recon_x86.h b/vp8/common/x86/recon_x86.h
index 40ee65a12..fe0f8f0bc 100644
--- a/vp8/common/x86/recon_x86.h
+++ b/vp8/common/x86/recon_x86.h
@@ -46,6 +46,8 @@ extern prototype_copy_block(vp8_copy_mem16x16_mmx);
extern prototype_recon_block(vp8_recon2b_sse2);
extern prototype_recon_block(vp8_recon4b_sse2);
extern prototype_copy_block(vp8_copy_mem16x16_sse2);
+extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_sse2);
+extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_s_sse2);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_recon_recon2
@@ -57,6 +59,26 @@ extern prototype_copy_block(vp8_copy_mem16x16_sse2);
#undef vp8_recon_copy16x16
#define vp8_recon_copy16x16 vp8_copy_mem16x16_sse2
+#undef vp8_recon_build_intra_predictors_mbuv
+#define vp8_recon_build_intra_predictors_mbuv vp8_build_intra_predictors_mbuv_sse2
+
+#undef vp8_recon_build_intra_predictors_mbuv_s
+#define vp8_recon_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_sse2
+
+#endif
+#endif
+
+#if HAVE_SSSE3
+extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_ssse3);
+extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_s_ssse3);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_recon_build_intra_predictors_mbuv
+#define vp8_recon_build_intra_predictors_mbuv vp8_build_intra_predictors_mbuv_ssse3
+
+#undef vp8_recon_build_intra_predictors_mbuv_s
+#define vp8_recon_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_ssse3
+
#endif
#endif
#endif
diff --git a/vp8/common/x86/x86_systemdependent.c b/vp8/common/x86/x86_systemdependent.c
index e89c07a4f..17667330a 100644
--- a/vp8/common/x86/x86_systemdependent.c
+++ b/vp8/common/x86/x86_systemdependent.c
@@ -88,6 +88,10 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
rtcd->recon.recon2 = vp8_recon2b_sse2;
rtcd->recon.recon4 = vp8_recon4b_sse2;
rtcd->recon.copy16x16 = vp8_copy_mem16x16_sse2;
+ rtcd->recon.build_intra_predictors_mbuv =
+ vp8_build_intra_predictors_mbuv_sse2;
+ rtcd->recon.build_intra_predictors_mbuv_s =
+ vp8_build_intra_predictors_mbuv_s_sse2;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_sse2;
@@ -126,6 +130,11 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
rtcd->subpix.sixtap4x4 = vp8_sixtap_predict4x4_ssse3;
rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_ssse3;
rtcd->subpix.bilinear8x8 = vp8_bilinear_predict8x8_ssse3;
+
+ rtcd->recon.build_intra_predictors_mbuv =
+ vp8_build_intra_predictors_mbuv_ssse3;
+ rtcd->recon.build_intra_predictors_mbuv_s =
+ vp8_build_intra_predictors_mbuv_s_ssse3;
}
#endif