summaryrefslogtreecommitdiff
path: root/vp8/common
diff options
context:
space:
mode:
Diffstat (limited to 'vp8/common')
-rw-r--r--vp8/common/generic/systemdependent.c6
-rw-r--r--vp8/common/postproc.c2
-rw-r--r--vp8/common/recon.h25
-rw-r--r--vp8/common/reconinter.c337
-rw-r--r--vp8/common/reconinter.h9
-rw-r--r--vp8/common/reconintra.h5
-rw-r--r--vp8/common/reconintra4x4.c2
-rw-r--r--vp8/common/x86/idctllm_mmx.asm12
-rw-r--r--vp8/common/x86/recon_sse2.asm394
-rw-r--r--vp8/common/x86/recon_wrapper_sse2.c90
-rw-r--r--vp8/common/x86/recon_x86.h22
-rw-r--r--vp8/common/x86/x86_systemdependent.c9
12 files changed, 640 insertions, 273 deletions
diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c
index 5c6464772..c36ae2119 100644
--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c
@@ -43,6 +43,12 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
vp8_build_intra_predictors_mby;
rtcd->recon.build_intra_predictors_mby_s =
vp8_build_intra_predictors_mby_s;
+ rtcd->recon.build_intra_predictors_mbuv =
+ vp8_build_intra_predictors_mbuv;
+ rtcd->recon.build_intra_predictors_mbuv_s =
+ vp8_build_intra_predictors_mbuv_s;
+ rtcd->recon.intra4x4_predict =
+ vp8_intra4x4_predict;
rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_c;
rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_c;
diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c
index 660880b52..7dbe96649 100644
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -806,7 +806,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
char zz[4];
int dc_diff = !(mi[mb_index].mbmi.mode != B_PRED &&
mi[mb_index].mbmi.mode != SPLITMV &&
- mi[mb_index].mbmi.mb_skip_coeff));
+ mi[mb_index].mbmi.mb_skip_coeff);
if (oci->frame_type == KEY_FRAME)
sprintf(zz, "a");
diff --git a/vp8/common/recon.h b/vp8/common/recon.h
index e608f218c..7cfc779cd 100644
--- a/vp8/common/recon.h
+++ b/vp8/common/recon.h
@@ -26,6 +26,9 @@
#define prototype_build_intra_predictors(sym) \
void sym(MACROBLOCKD *x)
+#define prototype_intra4x4_predict(sym) \
+ void sym(BLOCKD *x, int b_mode, unsigned char *predictor)
+
struct vp8_recon_rtcd_vtable;
#if ARCH_X86 || ARCH_X86_64
@@ -88,11 +91,30 @@ extern prototype_build_intra_predictors\
extern prototype_build_intra_predictors\
(vp8_recon_build_intra_predictors_mby_s);
+#ifndef vp8_recon_build_intra_predictors_mbuv
+#define vp8_recon_build_intra_predictors_mbuv vp8_build_intra_predictors_mbuv
+#endif
+extern prototype_build_intra_predictors\
+ (vp8_recon_build_intra_predictors_mbuv);
+
+#ifndef vp8_recon_build_intra_predictors_mbuv_s
+#define vp8_recon_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s
+#endif
+extern prototype_build_intra_predictors\
+ (vp8_recon_build_intra_predictors_mbuv_s);
+
+#ifndef vp8_recon_intra4x4_predict
+#define vp8_recon_intra4x4_predict vp8_intra4x4_predict
+#endif
+extern prototype_intra4x4_predict\
+ (vp8_recon_intra4x4_predict);
+
typedef prototype_copy_block((*vp8_copy_block_fn_t));
typedef prototype_recon_block((*vp8_recon_fn_t));
typedef prototype_recon_macroblock((*vp8_recon_mb_fn_t));
typedef prototype_build_intra_predictors((*vp8_build_intra_pred_fn_t));
+typedef prototype_intra4x4_predict((*vp8_intra4x4_pred_fn_t));
typedef struct vp8_recon_rtcd_vtable
{
vp8_copy_block_fn_t copy16x16;
@@ -105,6 +127,9 @@ typedef struct vp8_recon_rtcd_vtable
vp8_recon_mb_fn_t recon_mby;
vp8_build_intra_pred_fn_t build_intra_predictors_mby_s;
vp8_build_intra_pred_fn_t build_intra_predictors_mby;
+ vp8_build_intra_pred_fn_t build_intra_predictors_mbuv_s;
+ vp8_build_intra_pred_fn_t build_intra_predictors_mbuv;
+ vp8_intra4x4_pred_fn_t intra4x4_predict;
} vp8_recon_rtcd_vtable_t;
#if CONFIG_RUNTIME_CPU_DETECT
diff --git a/vp8/common/reconinter.c b/vp8/common/reconinter.c
index 6862bae11..3b0405ca1 100644
--- a/vp8/common/reconinter.c
+++ b/vp8/common/reconinter.c
@@ -279,100 +279,111 @@ void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x)
}
}
-void vp8_build_inter_predictors_mb(MACROBLOCKD *x)
+void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x,
+ unsigned char *dst_y,
+ unsigned char *dst_u,
+ unsigned char *dst_v,
+ int dst_ystride,
+ int dst_uvstride)
{
+ int offset;
+ unsigned char *ptr;
+ unsigned char *uptr, *vptr;
- if (x->mode_info_context->mbmi.mode != SPLITMV)
- {
- int offset;
- unsigned char *ptr_base;
- unsigned char *ptr;
- unsigned char *uptr, *vptr;
- unsigned char *pred_ptr = x->predictor;
- unsigned char *upred_ptr = &x->predictor[256];
- unsigned char *vpred_ptr = &x->predictor[320];
+ int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
+ int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
- int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
- int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
- int pre_stride = x->block[0].pre_stride;
+ unsigned char *ptr_base = x->pre.y_buffer;
+ int pre_stride = x->block[0].pre_stride;
- ptr_base = x->pre.y_buffer;
- ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
+ ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
- if ((mv_row | mv_col) & 7)
- {
- x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, pred_ptr, 16);
- }
- else
- {
- RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, pred_ptr, 16);
- }
+ if ((mv_row | mv_col) & 7)
+ {
+ x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, dst_y, dst_ystride);
+ }
+ else
+ {
+ RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, dst_y, dst_ystride);
+ }
- mv_row = x->block[16].bmi.mv.as_mv.row;
- mv_col = x->block[16].bmi.mv.as_mv.col;
- pre_stride >>= 1;
- offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
- uptr = x->pre.u_buffer + offset;
- vptr = x->pre.v_buffer + offset;
+ mv_row = x->block[16].bmi.mv.as_mv.row;
+ mv_col = x->block[16].bmi.mv.as_mv.col;
+ pre_stride >>= 1;
+ offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+ uptr = x->pre.u_buffer + offset;
+ vptr = x->pre.v_buffer + offset;
- if ((mv_row | mv_col) & 7)
- {
- x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, 8);
- x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, 8);
- }
- else
- {
- RECON_INVOKE(&x->rtcd->recon, copy8x8)(uptr, pre_stride, upred_ptr, 8);
- RECON_INVOKE(&x->rtcd->recon, copy8x8)(vptr, pre_stride, vpred_ptr, 8);
- }
+ if ((mv_row | mv_col) & 7)
+ {
+ x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, dst_u, dst_uvstride);
+ x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, dst_v, dst_uvstride);
}
else
{
- int i;
-
- if (x->mode_info_context->mbmi.partitioning < 3)
- {
- for (i = 0; i < 4; i++)
- {
- BLOCKD *d = &x->block[bbb[i]];
- build_inter_predictors4b(x, d, 16);
- }
- }
- else
- {
- for (i = 0; i < 16; i += 2)
- {
- BLOCKD *d0 = &x->block[i];
- BLOCKD *d1 = &x->block[i+1];
+ RECON_INVOKE(&x->rtcd->recon, copy8x8)(uptr, pre_stride, dst_u, dst_uvstride);
+ RECON_INVOKE(&x->rtcd->recon, copy8x8)(vptr, pre_stride, dst_v, dst_uvstride);
+ }
- if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
- build_inter_predictors2b(x, d0, 16);
- else
- {
- vp8_build_inter_predictors_b(d0, 16, x->subpixel_predict);
- vp8_build_inter_predictors_b(d1, 16, x->subpixel_predict);
- }
+}
- }
+void vp8_build_inter4x4_predictors_mb(MACROBLOCKD *x)
+{
+ int i;
+ if (x->mode_info_context->mbmi.partitioning < 3)
+ {
+ for (i = 0; i < 4; i++)
+ {
+ BLOCKD *d = &x->block[bbb[i]];
+ build_inter_predictors4b(x, d, 16);
}
-
- for (i = 16; i < 24; i += 2)
+ }
+ else
+ {
+ for (i = 0; i < 16; i += 2)
{
BLOCKD *d0 = &x->block[i];
BLOCKD *d1 = &x->block[i+1];
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
- build_inter_predictors2b(x, d0, 8);
+ build_inter_predictors2b(x, d0, 16);
else
{
- vp8_build_inter_predictors_b(d0, 8, x->subpixel_predict);
- vp8_build_inter_predictors_b(d1, 8, x->subpixel_predict);
+ vp8_build_inter_predictors_b(d0, 16, x->subpixel_predict);
+ vp8_build_inter_predictors_b(d1, 16, x->subpixel_predict);
}
}
}
+
+ for (i = 16; i < 24; i += 2)
+ {
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i+1];
+
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+ build_inter_predictors2b(x, d0, 8);
+ else
+ {
+ vp8_build_inter_predictors_b(d0, 8, x->subpixel_predict);
+ vp8_build_inter_predictors_b(d1, 8, x->subpixel_predict);
+ }
+ }
+}
+
+void vp8_build_inter_predictors_mb(MACROBLOCKD *x)
+{
+ if (x->mode_info_context->mbmi.mode != SPLITMV)
+ {
+ vp8_build_inter16x16_predictors_mb(x, x->predictor, &x->predictor[256],
+ &x->predictor[320], 16, 8);
+ }
+ else
+ {
+ vp8_build_inter4x4_predictors_mb(x);
+ }
}
void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel)
@@ -455,195 +466,5 @@ void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel)
}
-/* The following functions are wriiten for skip_recon_mb() to call. Since there is no recon in this
- * situation, we can write the result directly to dst buffer instead of writing it to predictor
- * buffer and then copying it to dst buffer.
- */
-static void vp8_build_inter_predictors_b_s(BLOCKD *d, unsigned char *dst_ptr, vp8_subpix_fn_t sppf)
-{
- int r;
- unsigned char *ptr_base;
- unsigned char *ptr;
- /*unsigned char *pred_ptr = d->predictor;*/
- int dst_stride = d->dst_stride;
- int pre_stride = d->pre_stride;
-
- ptr_base = *(d->base_pre);
-
- if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
- {
- ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
- sppf(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst_ptr, dst_stride);
- }
- else
- {
- ptr_base += d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
- ptr = ptr_base;
-
- for (r = 0; r < 4; r++)
- {
-#ifdef MUST_BE_ALIGNED
- dst_ptr[0] = ptr[0];
- dst_ptr[1] = ptr[1];
- dst_ptr[2] = ptr[2];
- dst_ptr[3] = ptr[3];
-#else
- *(int *)dst_ptr = *(int *)ptr ;
-#endif
- dst_ptr += dst_stride;
- ptr += pre_stride;
- }
- }
-}
-
-void vp8_build_inter16x16_predictors_mb_s(MACROBLOCKD *x)
-{
- unsigned char *dst_ptr = x->dst.y_buffer;
-
- int offset;
- unsigned char *ptr_base;
- unsigned char *ptr;
- unsigned char *uptr, *vptr;
- unsigned char *udst_ptr = x->dst.u_buffer;
- unsigned char *vdst_ptr = x->dst.v_buffer;
-
- int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
- int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
- int pre_stride = x->dst.y_stride; /*x->block[0].pre_stride;*/
-
- ptr_base = x->pre.y_buffer;
- ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
-
- if ((mv_row | mv_col) & 7)
- {
- x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/
- }
- else
- {
- RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/
- }
-
- mv_row = x->block[16].bmi.mv.as_mv.row;
- mv_col = x->block[16].bmi.mv.as_mv.col;
- pre_stride >>= 1;
- offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
- uptr = x->pre.u_buffer + offset;
- vptr = x->pre.v_buffer + offset;
-
- if ((mv_row | mv_col) & 7)
- {
- x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, udst_ptr, x->dst.uv_stride);
- x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vdst_ptr, x->dst.uv_stride);
- }
- else
- {
- RECON_INVOKE(&x->rtcd->recon, copy8x8)(uptr, pre_stride, udst_ptr, x->dst.uv_stride);
- RECON_INVOKE(&x->rtcd->recon, copy8x8)(vptr, pre_stride, vdst_ptr, x->dst.uv_stride);
- }
-}
-void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x)
-{
- unsigned char *dst_ptr = x->dst.y_buffer;
-
- if (x->mode_info_context->mbmi.mode != SPLITMV)
- {
- vp8_build_inter16x16_predictors_mb_s(x);
- }
- else
- {
- /* note: this whole ELSE part is not executed at all. So, no way to test the correctness of my modification. Later,
- * if sth is wrong, go back to what it is in build_inter_predictors_mb.
- */
- int i;
-
- if (x->mode_info_context->mbmi.partitioning < 3)
- {
- for (i = 0; i < 4; i++)
- {
- unsigned char *ptr_base;
- unsigned char *ptr;
- BLOCKD *d = &x->block[bbb[i]];
-
- ptr_base = *(d->base_pre);
- ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
-
- if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
- {
- x->subpixel_predict8x8(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/
- }
- else
- {
- RECON_INVOKE(&x->rtcd->recon, copy8x8)(ptr, d->pre_stride, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/
- }
- }
- }
- else
- {
- for (i = 0; i < 16; i += 2)
- {
- BLOCKD *d0 = &x->block[i];
- BLOCKD *d1 = &x->block[i+1];
-
- if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
- {
- /*build_inter_predictors2b(x, d0, 16);*/
- unsigned char *ptr_base;
- unsigned char *ptr;
-
- ptr_base = *(d0->base_pre);
- ptr = ptr_base + d0->pre + (d0->bmi.mv.as_mv.row >> 3) * d0->pre_stride + (d0->bmi.mv.as_mv.col >> 3);
-
- if (d0->bmi.mv.as_mv.row & 7 || d0->bmi.mv.as_mv.col & 7)
- {
- x->subpixel_predict8x4(ptr, d0->pre_stride, d0->bmi.mv.as_mv.col & 7, d0->bmi.mv.as_mv.row & 7, dst_ptr, x->dst.y_stride);
- }
- else
- {
- RECON_INVOKE(&x->rtcd->recon, copy8x4)(ptr, d0->pre_stride, dst_ptr, x->dst.y_stride);
- }
- }
- else
- {
- vp8_build_inter_predictors_b_s(d0, dst_ptr, x->subpixel_predict);
- vp8_build_inter_predictors_b_s(d1, dst_ptr, x->subpixel_predict);
- }
- }
- }
-
- for (i = 16; i < 24; i += 2)
- {
- BLOCKD *d0 = &x->block[i];
- BLOCKD *d1 = &x->block[i+1];
-
- if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
- {
- /*build_inter_predictors2b(x, d0, 8);*/
- unsigned char *ptr_base;
- unsigned char *ptr;
-
- ptr_base = *(d0->base_pre);
- ptr = ptr_base + d0->pre + (d0->bmi.mv.as_mv.row >> 3) * d0->pre_stride + (d0->bmi.mv.as_mv.col >> 3);
-
- if (d0->bmi.mv.as_mv.row & 7 || d0->bmi.mv.as_mv.col & 7)
- {
- x->subpixel_predict8x4(ptr, d0->pre_stride,
- d0->bmi.mv.as_mv.col & 7,
- d0->bmi.mv.as_mv.row & 7,
- dst_ptr, x->dst.uv_stride);
- }
- else
- {
- RECON_INVOKE(&x->rtcd->recon, copy8x4)(ptr,
- d0->pre_stride, dst_ptr, x->dst.uv_stride);
- }
- }
- else
- {
- vp8_build_inter_predictors_b_s(d0, dst_ptr, x->subpixel_predict);
- vp8_build_inter_predictors_b_s(d1, dst_ptr, x->subpixel_predict);
- }
- }
- }
-}
diff --git a/vp8/common/reconinter.h b/vp8/common/reconinter.h
index 688bebe96..a68e4aaba 100644
--- a/vp8/common/reconinter.h
+++ b/vp8/common/reconinter.h
@@ -13,8 +13,13 @@
#define __INC_RECONINTER_H
extern void vp8_build_inter_predictors_mb(MACROBLOCKD *x);
-extern void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x);
-extern void vp8_build_inter16x16_predictors_mb_s(MACROBLOCKD *x);
+extern void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x,
+ unsigned char *dst_y,
+ unsigned char *dst_u,
+ unsigned char *dst_v,
+ int dst_ystride,
+ int dst_uvstride);
+
extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x);
extern void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel);
diff --git a/vp8/common/reconintra.h b/vp8/common/reconintra.h
index 4025a5307..47e479285 100644
--- a/vp8/common/reconintra.h
+++ b/vp8/common/reconintra.h
@@ -14,9 +14,4 @@
extern void init_intra_left_above_pixels(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mbuv(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x);
-
-extern void vp8_predict_intra4x4(BLOCKD *x, int b_mode, unsigned char *Predictor);
-
#endif
diff --git a/vp8/common/reconintra4x4.c b/vp8/common/reconintra4x4.c
index cd70dca73..18c514541 100644
--- a/vp8/common/reconintra4x4.c
+++ b/vp8/common/reconintra4x4.c
@@ -14,7 +14,7 @@
#include "vpx_mem/vpx_mem.h"
#include "reconintra.h"
-void vp8_predict_intra4x4(BLOCKD *x,
+void vp8_intra4x4_predict(BLOCKD *x,
int b_mode,
unsigned char *predictor)
{
diff --git a/vp8/common/x86/idctllm_mmx.asm b/vp8/common/x86/idctllm_mmx.asm
index 43735bc4b..465626b8f 100644
--- a/vp8/common/x86/idctllm_mmx.asm
+++ b/vp8/common/x86/idctllm_mmx.asm
@@ -14,18 +14,18 @@
; /****************************************************************************
; * Notes:
; *
-; * This implementation makes use of 16 bit fixed point verio of two multiply
+; * This implementation makes use of 16 bit fixed point version of two multiply
; * constants:
; * 1. sqrt(2) * cos (pi/8)
-; * 2. sqrt(2) * sin (pi/8)
-; * Becuase the first constant is bigger than 1, to maintain the same 16 bit
-; * fixed point prrcision as the second one, we use a trick of
+; * 2. sqrt(2) * sin (pi/8)
+; * Because the first constant is bigger than 1, to maintain the same 16 bit
+; * fixed point precision as the second one, we use a trick of
; * x * a = x + x*(a-1)
; * so
; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
; *
-; * For the second constant, becuase of the 16bit version is 35468, which
-; * is bigger than 32768, in signed 16 bit multiply, it become a negative
+; * For the second constant, because of the 16bit version is 35468, which
+; * is bigger than 32768, in signed 16 bit multiply, it becomes a negative
; * number.
; * (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x
; *
diff --git a/vp8/common/x86/recon_sse2.asm b/vp8/common/x86/recon_sse2.asm
index 67b6420a9..97dc4f686 100644
--- a/vp8/common/x86/recon_sse2.asm
+++ b/vp8/common/x86/recon_sse2.asm
@@ -229,3 +229,397 @@ sym(vp8_copy_mem16x16_sse2):
UNSHADOW_ARGS
pop rbp
ret
+
+
+;void vp8_intra_pred_uv_dc_mmx2(
+; unsigned char *dst,
+; int dst_stride
+; unsigned char *src,
+; int src_stride,
+; )
+global sym(vp8_intra_pred_uv_dc_mmx2)
+sym(vp8_intra_pred_uv_dc_mmx2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; from top
+ mov rsi, arg(2) ;src;
+ movsxd rax, dword ptr arg(3) ;src_stride;
+ sub rsi, rax
+ pxor mm0, mm0
+ movq mm1, [rsi]
+ psadbw mm1, mm0
+
+ ; from left
+ dec rsi
+ lea rdi, [rax*3]
+ movzx ecx, byte [rsi+rax]
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+ lea rsi, [rsi+rax*4]
+ movzx edx, byte [rsi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*4]
+ add ecx, edx
+
+ ; add up
+ pextrw edx, mm1, 0x0
+ lea edx, [edx+ecx+8]
+ sar edx, 4
+ movd mm1, edx
+ pshufw mm1, mm1, 0x0
+ packuswb mm1, mm1
+
+ ; write out
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+ lea rax, [rcx*3]
+
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+ lea rdi, [rdi+rcx*4]
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_uv_dctop_mmx2(
+; unsigned char *dst,
+; int dst_stride
+; unsigned char *src,
+; int src_stride,
+; )
+global sym(vp8_intra_pred_uv_dctop_mmx2)
+sym(vp8_intra_pred_uv_dctop_mmx2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; from top
+ mov rsi, arg(2) ;src;
+ movsxd rax, dword ptr arg(3) ;src_stride;
+ sub rsi, rax
+ pxor mm0, mm0
+ movq mm1, [rsi]
+ psadbw mm1, mm0
+
+ ; add up
+ paddw mm1, [GLOBAL(dc_4)]
+ psraw mm1, 3
+ pshufw mm1, mm1, 0x0
+ packuswb mm1, mm1
+
+ ; write out
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+ lea rax, [rcx*3]
+
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+ lea rdi, [rdi+rcx*4]
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_uv_dcleft_mmx2(
+; unsigned char *dst,
+; int dst_stride
+; unsigned char *src,
+; int src_stride,
+; )
+global sym(vp8_intra_pred_uv_dcleft_mmx2)
+sym(vp8_intra_pred_uv_dcleft_mmx2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; from left
+ mov rsi, arg(2) ;src;
+ movsxd rax, dword ptr arg(3) ;src_stride;
+ dec rsi
+ lea rdi, [rax*3]
+ movzx ecx, byte [rsi]
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+ lea rsi, [rsi+rax*4]
+ movzx edx, byte [rsi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ lea edx, [ecx+edx+4]
+
+ ; add up
+ shr edx, 3
+ movd mm1, edx
+ pshufw mm1, mm1, 0x0
+ packuswb mm1, mm1
+
+ ; write out
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+ lea rax, [rcx*3]
+
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+ lea rdi, [rdi+rcx*4]
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_uv_dc128_mmx(
+; unsigned char *dst,
+; int dst_stride
+; unsigned char *src,
+; int src_stride,
+; )
+global sym(vp8_intra_pred_uv_dc128_mmx)
+sym(vp8_intra_pred_uv_dc128_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ ; end prolog
+
+ ; write out
+ movq mm1, [GLOBAL(dc_128)]
+ mov rax, arg(0) ;dst;
+ movsxd rdx, dword ptr arg(1) ;dst_stride
+ lea rcx, [rdx*3]
+
+ movq [rax ], mm1
+ movq [rax+rdx ], mm1
+ movq [rax+rdx*2], mm1
+ movq [rax+rcx ], mm1
+ lea rax, [rax+rdx*4]
+ movq [rax ], mm1
+ movq [rax+rdx ], mm1
+ movq [rax+rdx*2], mm1
+ movq [rax+rcx ], mm1
+
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_uv_tm_sse2(
+; unsigned char *dst,
+; int dst_stride
+; unsigned char *src,
+; int src_stride,
+; )
+%macro vp8_intra_pred_uv_tm 1
+global sym(vp8_intra_pred_uv_tm_%1)
+sym(vp8_intra_pred_uv_tm_%1):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; read top row
+ mov edx, 4
+ mov rsi, arg(2) ;src;
+ movsxd rax, dword ptr arg(3) ;src_stride;
+ sub rsi, rax
+ pxor xmm0, xmm0
+%ifidn %1, ssse3
+ movdqa xmm2, [GLOBAL(dc_1024)]
+%endif
+ movq xmm1, [rsi]
+ punpcklbw xmm1, xmm0
+
+ ; set up left ptrs ans subtract topleft
+ movd xmm3, [rsi-1]
+ lea rsi, [rsi+rax-1]
+%ifidn %1, sse2
+ punpcklbw xmm3, xmm0
+ pshuflw xmm3, xmm3, 0x0
+ punpcklqdq xmm3, xmm3
+%else
+ pshufb xmm3, xmm2
+%endif
+ psubw xmm1, xmm3
+
+ ; set up dest ptrs
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+
+vp8_intra_pred_uv_tm_%1_loop:
+ movd xmm3, [rsi]
+ movd xmm5, [rsi+rax]
+%ifidn %1, sse2
+ punpcklbw xmm3, xmm0
+ punpcklbw xmm5, xmm0
+ pshuflw xmm3, xmm3, 0x0
+ pshuflw xmm5, xmm5, 0x0
+ punpcklqdq xmm3, xmm3
+ punpcklqdq xmm5, xmm5
+%else
+ pshufb xmm3, xmm2
+ pshufb xmm5, xmm2
+%endif
+ paddw xmm3, xmm1
+ paddw xmm5, xmm1
+ packuswb xmm3, xmm5
+ movq [rdi ], xmm3
+ movhps[rdi+rcx], xmm3
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rcx*2]
+ dec edx
+ jnz vp8_intra_pred_uv_tm_%1_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%endmacro
+
+vp8_intra_pred_uv_tm sse2
+vp8_intra_pred_uv_tm ssse3
+
+;void vp8_intra_pred_uv_ve_mmx(
+; unsigned char *dst,
+; int dst_stride
+; unsigned char *src,
+; int src_stride,
+; )
+global sym(vp8_intra_pred_uv_ve_mmx)
+sym(vp8_intra_pred_uv_ve_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ ; end prolog
+
+ ; read from top
+ mov rax, arg(2) ;src;
+ movsxd rdx, dword ptr arg(3) ;src_stride;
+ sub rax, rdx
+ movq mm1, [rax]
+
+ ; write out
+ mov rax, arg(0) ;dst;
+ movsxd rdx, dword ptr arg(1) ;dst_stride
+ lea rcx, [rdx*3]
+
+ movq [rax ], mm1
+ movq [rax+rdx ], mm1
+ movq [rax+rdx*2], mm1
+ movq [rax+rcx ], mm1
+ lea rax, [rax+rdx*4]
+ movq [rax ], mm1
+ movq [rax+rdx ], mm1
+ movq [rax+rdx*2], mm1
+ movq [rax+rcx ], mm1
+
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_uv_ho_mmx2(
+; unsigned char *dst,
+; int dst_stride
+; unsigned char *src,
+; int src_stride,
+; )
+global sym(vp8_intra_pred_uv_ho_mmx2)
+sym(vp8_intra_pred_uv_ho_mmx2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; read from left and write out
+ mov edx, 4
+ mov rsi, arg(2) ;src;
+ movsxd rax, dword ptr arg(3) ;src_stride;
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+ dec rsi
+vp8_intra_pred_uv_ho_mmx2_loop:
+ movd mm0, [rsi]
+ movd mm1, [rsi+rax]
+ punpcklbw mm0, mm0
+ punpcklbw mm1, mm1
+ pshufw mm0, mm0, 0x0
+ pshufw mm1, mm1, 0x0
+ movq [rdi ], mm0
+ movq [rdi+rcx], mm1
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rcx*2]
+ dec edx
+ jnz vp8_intra_pred_uv_ho_mmx2_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+dc_128:
+ times 8 db 128
+dc_4:
+ times 4 dw 4
+align 16
+dc_1024:
+ times 8 dw 0x400
diff --git a/vp8/common/x86/recon_wrapper_sse2.c b/vp8/common/x86/recon_wrapper_sse2.c
new file mode 100644
index 000000000..86b4da2c2
--- /dev/null
+++ b/vp8/common/x86/recon_wrapper_sse2.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "vp8/common/recon.h"
+#include "recon_x86.h"
+#include "vpx_mem/vpx_mem.h"
+
+#define build_intra_predictors_mbuv_prototype(sym) \
+ void sym(unsigned char *dst, int dst_stride, \
+ const unsigned char *src, int src_stride)
+typedef build_intra_predictors_mbuv_prototype((*build_intra_predictors_mbuv_fn_t));
+
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dctop_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dcleft_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc128_mmx);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ho_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ve_mmx);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_sse2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_ssse3);
+
+static void vp8_build_intra_predictors_mbuv_x86(MACROBLOCKD *x,
+ unsigned char *dst_u,
+ unsigned char *dst_v,
+ int dst_stride,
+ build_intra_predictors_mbuv_fn_t tm_func)
+{
+ int mode = x->mode_info_context->mbmi.uv_mode;
+ build_intra_predictors_mbuv_fn_t fn;
+ int src_stride = x->dst.uv_stride;
+
+ switch (mode) {
+ case V_PRED: fn = vp8_intra_pred_uv_ve_mmx; break;
+ case H_PRED: fn = vp8_intra_pred_uv_ho_mmx2; break;
+ case TM_PRED: fn = tm_func; break;
+ case DC_PRED:
+ if (x->up_available) {
+ if (x->left_available) {
+ fn = vp8_intra_pred_uv_dc_mmx2; break;
+ } else {
+ fn = vp8_intra_pred_uv_dctop_mmx2; break;
+ }
+ } else if (x->left_available) {
+ fn = vp8_intra_pred_uv_dcleft_mmx2; break;
+ } else {
+ fn = vp8_intra_pred_uv_dc128_mmx; break;
+ }
+ break;
+ default: return;
+ }
+
+ fn(dst_u, dst_stride, x->dst.u_buffer, src_stride);
+ fn(dst_v, dst_stride, x->dst.v_buffer, src_stride);
+}
+
+void vp8_build_intra_predictors_mbuv_sse2(MACROBLOCKD *x)
+{
+ vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256],
+ &x->predictor[320], 8,
+ vp8_intra_pred_uv_tm_sse2);
+}
+
+void vp8_build_intra_predictors_mbuv_ssse3(MACROBLOCKD *x)
+{
+ vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256],
+ &x->predictor[320], 8,
+ vp8_intra_pred_uv_tm_ssse3);
+}
+
+void vp8_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *x)
+{
+ vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer,
+ x->dst.v_buffer, x->dst.uv_stride,
+ vp8_intra_pred_uv_tm_sse2);
+}
+
+void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x)
+{
+ vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer,
+ x->dst.v_buffer, x->dst.uv_stride,
+ vp8_intra_pred_uv_tm_ssse3);
+}
diff --git a/vp8/common/x86/recon_x86.h b/vp8/common/x86/recon_x86.h
index 40ee65a12..fe0f8f0bc 100644
--- a/vp8/common/x86/recon_x86.h
+++ b/vp8/common/x86/recon_x86.h
@@ -46,6 +46,8 @@ extern prototype_copy_block(vp8_copy_mem16x16_mmx);
extern prototype_recon_block(vp8_recon2b_sse2);
extern prototype_recon_block(vp8_recon4b_sse2);
extern prototype_copy_block(vp8_copy_mem16x16_sse2);
+extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_sse2);
+extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_s_sse2);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_recon_recon2
@@ -57,6 +59,26 @@ extern prototype_copy_block(vp8_copy_mem16x16_sse2);
#undef vp8_recon_copy16x16
#define vp8_recon_copy16x16 vp8_copy_mem16x16_sse2
+#undef vp8_recon_build_intra_predictors_mbuv
+#define vp8_recon_build_intra_predictors_mbuv vp8_build_intra_predictors_mbuv_sse2
+
+#undef vp8_recon_build_intra_predictors_mbuv_s
+#define vp8_recon_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_sse2
+
+#endif
+#endif
+
+#if HAVE_SSSE3
+extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_ssse3);
+extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_s_ssse3);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_recon_build_intra_predictors_mbuv
+#define vp8_recon_build_intra_predictors_mbuv vp8_build_intra_predictors_mbuv_ssse3
+
+#undef vp8_recon_build_intra_predictors_mbuv_s
+#define vp8_recon_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_ssse3
+
#endif
#endif
#endif
diff --git a/vp8/common/x86/x86_systemdependent.c b/vp8/common/x86/x86_systemdependent.c
index e89c07a4f..17667330a 100644
--- a/vp8/common/x86/x86_systemdependent.c
+++ b/vp8/common/x86/x86_systemdependent.c
@@ -88,6 +88,10 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
rtcd->recon.recon2 = vp8_recon2b_sse2;
rtcd->recon.recon4 = vp8_recon4b_sse2;
rtcd->recon.copy16x16 = vp8_copy_mem16x16_sse2;
+ rtcd->recon.build_intra_predictors_mbuv =
+ vp8_build_intra_predictors_mbuv_sse2;
+ rtcd->recon.build_intra_predictors_mbuv_s =
+ vp8_build_intra_predictors_mbuv_s_sse2;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_sse2;
@@ -126,6 +130,11 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
rtcd->subpix.sixtap4x4 = vp8_sixtap_predict4x4_ssse3;
rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_ssse3;
rtcd->subpix.bilinear8x8 = vp8_bilinear_predict8x8_ssse3;
+
+ rtcd->recon.build_intra_predictors_mbuv =
+ vp8_build_intra_predictors_mbuv_ssse3;
+ rtcd->recon.build_intra_predictors_mbuv_s =
+ vp8_build_intra_predictors_mbuv_s_ssse3;
}
#endif