summary | refs | log | tree | commit | diff
path: root/vp8/encoder/x86
diff options
context:
space:
mode:
author: John Koleszar <jkoleszar@google.com> 2011-06-17 15:36:43 -0400
committer: John Koleszar <jkoleszar@google.com> 2011-06-17 15:36:43 -0400
commit: deb2e9cf620087b4dc5b78423b6ff9bf8971c1bc (patch)
tree: 592d5eeb0cac290da19d6e660135a0f41dfd1dc3 /vp8/encoder/x86
parent: cefb915ab62a7cd91b347cc22449b2eb9b8c1df9 (diff)
parent: a60fc419f50eefbef568ceecb93233471e894fa5 (diff)
download: libvpx-deb2e9cf620087b4dc5b78423b6ff9bf8971c1bc.tar
libvpx-deb2e9cf620087b4dc5b78423b6ff9bf8971c1bc.tar.gz
libvpx-deb2e9cf620087b4dc5b78423b6ff9bf8971c1bc.tar.bz2
libvpx-deb2e9cf620087b4dc5b78423b6ff9bf8971c1bc.zip
Merge remote branch 'internal/upstream' into HEAD
Conflicts:
    vp8/encoder/encodeframe.c
    vp8/encoder/rdopt.c

Change-Id: I183fd3ce9e94617ec888c9f891055b9f1f8ca6c5
Diffstat (limited to 'vp8/encoder/x86')
-rw-r--r-- vp8/encoder/x86/dct_x86.h | 6
-rw-r--r-- vp8/encoder/x86/variance_impl_mmx.asm | 130
-rw-r--r-- vp8/encoder/x86/variance_impl_sse2.asm | 116
-rw-r--r-- vp8/encoder/x86/variance_mmx.c | 37
-rw-r--r-- vp8/encoder/x86/variance_sse2.c | 7
-rw-r--r-- vp8/encoder/x86/variance_x86.h | 21
-rw-r--r-- vp8/encoder/x86/x86_csystemdependent.c | 42
7 files changed, 23 insertions, 336 deletions
diff --git a/vp8/encoder/x86/dct_x86.h b/vp8/encoder/x86/dct_x86.h
index 59a5cb1d7..19f6c1686 100644
--- a/vp8/encoder/x86/dct_x86.h
+++ b/vp8/encoder/x86/dct_x86.h
@@ -31,6 +31,12 @@ extern prototype_fdct(vp8_short_fdct8x4_mmx);
#undef vp8_fdct_short8x4
#define vp8_fdct_short8x4 vp8_short_fdct8x4_mmx
+#undef vp8_fdct_fast4x4
+#define vp8_fdct_fast4x4 vp8_short_fdct4x4_mmx
+
+#undef vp8_fdct_fast8x4
+#define vp8_fdct_fast8x4 vp8_short_fdct8x4_mmx
+
#endif
#endif
diff --git a/vp8/encoder/x86/variance_impl_mmx.asm b/vp8/encoder/x86/variance_impl_mmx.asm
index 67a9b4d3e..13b76ea91 100644
--- a/vp8/encoder/x86/variance_impl_mmx.asm
+++ b/vp8/encoder/x86/variance_impl_mmx.asm
@@ -843,136 +843,6 @@ filter_block2d_bil_var_mmx_loop:
pop rbp
ret
-;unsigned int vp8_get16x16pred_error_mmx
-;(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride
-;)
-global sym(vp8_get16x16pred_error_mmx)
-sym(vp8_get16x16pred_error_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- GET_GOT rbx
- push rsi
- push rdi
- sub rsp, 16
- ; end prolog
-
- mov rsi, arg(0) ;DWORD PTR [src_ptr]
- mov rdi, arg(2) ;DWORD PTR [ref_ptr]
-
- movsxd rax, DWORD PTR arg(1) ;[src_stride]
- movsxd rdx, DWORD PTR arg(3) ;[ref_stride]
-
- pxor mm0, mm0 ; clear xmm0 for unpack
- pxor mm7, mm7 ; clear xmm7 for accumulating diffs
-
- pxor mm6, mm6 ; clear xmm6 for accumulating sse
- mov rcx, 16
-
-var16loop:
-
- movq mm1, [rsi]
- movq mm2, [rdi]
-
- movq mm3, mm1
- movq mm4, mm2
-
- punpcklbw mm1, mm0
- punpckhbw mm3, mm0
-
- punpcklbw mm2, mm0
- punpckhbw mm4, mm0
-
- psubw mm1, mm2
- psubw mm3, mm4
-
- paddw mm7, mm1
- pmaddwd mm1, mm1
-
- paddw mm7, mm3
- pmaddwd mm3, mm3
-
- paddd mm6, mm1
- paddd mm6, mm3
-
-
- movq mm1, [rsi+8]
- movq mm2, [rdi+8]
-
- movq mm3, mm1
- movq mm4, mm2
-
- punpcklbw mm1, mm0
- punpckhbw mm3, mm0
-
- punpcklbw mm2, mm0
- punpckhbw mm4, mm0
-
- psubw mm1, mm2
- psubw mm3, mm4
-
- paddw mm7, mm1
- pmaddwd mm1, mm1
-
- paddw mm7, mm3
- pmaddwd mm3, mm3
-
- paddd mm6, mm1
- paddd mm6, mm3
-
- add rsi, rax
- add rdi, rdx
-
- sub rcx, 1
- jnz var16loop
-
-
- movq mm1, mm6
- pxor mm6, mm6
-
- pxor mm5, mm5
- punpcklwd mm6, mm7
-
- punpckhwd mm5, mm7
- psrad mm5, 16
-
- psrad mm6, 16
- paddd mm6, mm5
-
- movq mm2, mm1
- psrlq mm1, 32
-
- paddd mm2, mm1
- movq mm7, mm6
-
- psrlq mm6, 32
- paddd mm6, mm7
-
- movd DWORD PTR [rsp], mm6 ;Sum
- movd DWORD PTR [rsp+4], mm2 ;SSE
-
- ; return (SSE-((Sum*Sum)>>8));
- movsxd rdx, dword ptr [rsp]
- imul rdx, rdx
- sar rdx, 8
- movsxd rax, dword ptr [rsp + 4]
- sub rax, rdx
-
-
- ; begin epilog
- add rsp, 16
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
SECTION_RODATA
;short mmx_bi_rd[4] = { 64, 64, 64, 64};
diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm
index 5becc7344..b7a6b3286 100644
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -213,122 +213,6 @@ var16loop:
ret
-;unsigned int vp8_get16x16pred_error_sse2
-;(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride
-;)
-global sym(vp8_get16x16pred_error_sse2)
-sym(vp8_get16x16pred_error_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- sub rsp, 16
- ; end prolog
-
- mov rsi, arg(0) ;[src_ptr]
- mov rdi, arg(2) ;[ref_ptr]
-
- movsxd rax, DWORD PTR arg(1) ;[src_stride]
- movsxd rdx, DWORD PTR arg(3) ;[ref_stride]
-
- pxor xmm0, xmm0 ; clear xmm0 for unpack
- pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
-
- pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
- mov rcx, 16
-
-var16peloop:
- movdqu xmm1, XMMWORD PTR [rsi]
- movdqu xmm2, XMMWORD PTR [rdi]
-
- movdqa xmm3, xmm1
- movdqa xmm4, xmm2
-
- punpcklbw xmm1, xmm0
- punpckhbw xmm3, xmm0
-
- punpcklbw xmm2, xmm0
- punpckhbw xmm4, xmm0
-
- psubw xmm1, xmm2
- psubw xmm3, xmm4
-
- paddw xmm7, xmm1
- pmaddwd xmm1, xmm1
-
- paddw xmm7, xmm3
- pmaddwd xmm3, xmm3
-
- paddd xmm6, xmm1
- paddd xmm6, xmm3
-
- add rsi, rax
- add rdi, rdx
-
- sub rcx, 1
- jnz var16peloop
-
-
- movdqa xmm1, xmm6
- pxor xmm6, xmm6
-
- pxor xmm5, xmm5
- punpcklwd xmm6, xmm7
-
- punpckhwd xmm5, xmm7
- psrad xmm5, 16
-
- psrad xmm6, 16
- paddd xmm6, xmm5
-
- movdqa xmm2, xmm1
- punpckldq xmm1, xmm0
-
- punpckhdq xmm2, xmm0
- movdqa xmm7, xmm6
-
- paddd xmm1, xmm2
- punpckldq xmm6, xmm0
-
- punpckhdq xmm7, xmm0
- paddd xmm6, xmm7
-
- movdqa xmm2, xmm1
- movdqa xmm7, xmm6
-
- psrldq xmm1, 8
- psrldq xmm6, 8
-
- paddd xmm7, xmm6
- paddd xmm1, xmm2
-
- movd DWORD PTR [rsp], xmm7 ;Sum
- movd DWORD PTR [rsp+4], xmm1 ;SSE
-
- ; return (SSE-((Sum*Sum)>>8));
- movsxd rdx, dword ptr [rsp]
- imul rdx, rdx
- sar rdx, 8
- movsxd rax, dword ptr [rsp + 4]
- sub rax, rdx
-
- ; begin epilog
- add rsp, 16
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
;unsigned int vp8_get8x8var_sse2
diff --git a/vp8/encoder/x86/variance_mmx.c b/vp8/encoder/x86/variance_mmx.c
index 4a89868c2..92b695f17 100644
--- a/vp8/encoder/x86/variance_mmx.c
+++ b/vp8/encoder/x86/variance_mmx.c
@@ -76,43 +76,6 @@ extern void vp8_filter_block2d_bil_var_mmx
int *sum,
unsigned int *sumsquared
);
-extern unsigned int vp8_get16x16pred_error_mmx
-(
- const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride
-);
-
-unsigned int vp8_get16x16var_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *SSE,
- int *SUM
-)
-{
- unsigned int sse0, sse1, sse2, sse3, var;
- int sum0, sum1, sum2, sum3, avg;
-
-
- vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
- vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
- vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
- vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
-
- var = sse0 + sse1 + sse2 + sse3;
- avg = sum0 + sum1 + sum2 + sum3;
-
- *SSE = var;
- *SUM = avg;
- return (var - ((avg * avg) >> 8));
-
-}
-
-
-
unsigned int vp8_variance4x4_mmx(
diff --git a/vp8/encoder/x86/variance_sse2.c b/vp8/encoder/x86/variance_sse2.c
index dfc0915b9..24062eb9b 100644
--- a/vp8/encoder/x86/variance_sse2.c
+++ b/vp8/encoder/x86/variance_sse2.c
@@ -53,13 +53,6 @@ unsigned int vp8_get16x16var_sse2
unsigned int *SSE,
int *Sum
);
-unsigned int vp8_get16x16pred_error_sse2
-(
- const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride
-);
unsigned int vp8_get8x8var_sse2
(
const unsigned char *src_ptr,
diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h
index 77e05e1e8..0ee8eb7e5 100644
--- a/vp8/encoder/x86/variance_x86.h
+++ b/vp8/encoder/x86/variance_x86.h
@@ -41,9 +41,7 @@ extern prototype_variance(vp8_variance_halfpixvar16x16_hv_mmx);
extern prototype_subpixvariance(vp8_sub_pixel_mse16x16_mmx);
extern prototype_getmbss(vp8_get_mb_ss_mmx);
extern prototype_variance(vp8_mse16x16_mmx);
-extern prototype_get16x16prederror(vp8_get16x16pred_error_mmx);
extern prototype_variance2(vp8_get8x8var_mmx);
-extern prototype_variance2(vp8_get16x16var_mmx);
extern prototype_get16x16prederror(vp8_get4x4sse_cs_mmx);
#if !CONFIG_RUNTIME_CPU_DETECT
@@ -110,15 +108,6 @@ extern prototype_get16x16prederror(vp8_get4x4sse_cs_mmx);
#undef vp8_variance_mse16x16
#define vp8_variance_mse16x16 vp8_mse16x16_mmx
-#undef vp8_variance_get16x16prederror
-#define vp8_variance_get16x16prederror vp8_get16x16pred_error_mmx
-
-#undef vp8_variance_get8x8var
-#define vp8_variance_get8x8var vp8_get8x8var_mmx
-
-#undef vp8_variance_get16x16var
-#define vp8_variance_get16x16var vp8_get16x16var_mmx
-
#undef vp8_variance_get4x4sse_cs
#define vp8_variance_get4x4sse_cs vp8_get4x4sse_cs_mmx
@@ -148,7 +137,6 @@ extern prototype_variance(vp8_variance_halfpixvar16x16_hv_wmt);
extern prototype_subpixvariance(vp8_sub_pixel_mse16x16_wmt);
extern prototype_getmbss(vp8_get_mb_ss_sse2);
extern prototype_variance(vp8_mse16x16_wmt);
-extern prototype_get16x16prederror(vp8_get16x16pred_error_sse2);
extern prototype_variance2(vp8_get8x8var_sse2);
extern prototype_variance2(vp8_get16x16var_sse2);
@@ -216,15 +204,6 @@ extern prototype_variance2(vp8_get16x16var_sse2);
#undef vp8_variance_mse16x16
#define vp8_variance_mse16x16 vp8_mse16x16_wmt
-#undef vp8_variance_get16x16prederror
-#define vp8_variance_get16x16prederror vp8_get16x16pred_error_sse2
-
-#undef vp8_variance_get8x8var
-#define vp8_variance_get8x8var vp8_get8x8var_sse2
-
-#undef vp8_variance_get16x16var
-#define vp8_variance_get16x16var vp8_get16x16var_sse2
-
#endif
#endif
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index 378b14066..9a324ec12 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -16,7 +16,7 @@
#if HAVE_MMX
-static void short_fdct8x4_mmx(short *input, short *output, int pitch)
+void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch)
{
vp8_short_fdct4x4_mmx(input, output, pitch);
vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch);
@@ -26,7 +26,7 @@ int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
short *qcoeff_ptr, short *dequant_ptr,
short *scan_mask, short *round_ptr,
short *quant_ptr, short *dqcoeff_ptr);
-static void fast_quantize_b_mmx(BLOCK *b, BLOCKD *d)
+void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d)
{
short *scan_mask = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;
short *coeff_ptr = b->coeff;
@@ -51,7 +51,7 @@ static void fast_quantize_b_mmx(BLOCK *b, BLOCKD *d)
}
int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
-static int mbblock_error_mmx(MACROBLOCK *mb, int dc)
+int vp8_mbblock_error_mmx(MACROBLOCK *mb, int dc)
{
short *coeff_ptr = mb->block[0].coeff;
short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
@@ -59,7 +59,7 @@ static int mbblock_error_mmx(MACROBLOCK *mb, int dc)
}
int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
-static int mbuverror_mmx(MACROBLOCK *mb)
+int vp8_mbuverror_mmx(MACROBLOCK *mb)
{
short *s_ptr = &mb->coeff[256];
short *d_ptr = &mb->e_mbd.dqcoeff[256];
@@ -69,7 +69,7 @@ static int mbuverror_mmx(MACROBLOCK *mb)
void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
short *diff, unsigned char *predictor,
int pitch);
-static void subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
+void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
{
unsigned char *z = *(be->base_src) + be->src;
unsigned int src_stride = be->src_stride;
@@ -82,7 +82,7 @@ static void subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
#if HAVE_SSE2
int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
-static int mbblock_error_xmm(MACROBLOCK *mb, int dc)
+int vp8_mbblock_error_xmm(MACROBLOCK *mb, int dc)
{
short *coeff_ptr = mb->block[0].coeff;
short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
@@ -90,7 +90,7 @@ static int mbblock_error_xmm(MACROBLOCK *mb, int dc)
}
int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
-static int mbuverror_xmm(MACROBLOCK *mb)
+int vp8_mbuverror_xmm(MACROBLOCK *mb)
{
short *s_ptr = &mb->coeff[256];
short *d_ptr = &mb->e_mbd.dqcoeff[256];
@@ -100,7 +100,7 @@ static int mbuverror_xmm(MACROBLOCK *mb)
void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride,
short *diff, unsigned char *predictor,
int pitch);
-static void subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)
+void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)
{
unsigned char *z = *(be->base_src) + be->src;
unsigned int src_stride = be->src_stride;
@@ -175,26 +175,23 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.mse16x16 = vp8_mse16x16_mmx;
cpi->rtcd.variance.getmbss = vp8_get_mb_ss_mmx;
- cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_mmx;
- cpi->rtcd.variance.get8x8var = vp8_get8x8var_mmx;
- cpi->rtcd.variance.get16x16var = vp8_get16x16var_mmx;
cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_mmx;
cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_mmx;
- cpi->rtcd.fdct.short8x4 = short_fdct8x4_mmx;
+ cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_mmx;
cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_mmx;
- cpi->rtcd.fdct.fast8x4 = short_fdct8x4_mmx;
+ cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_mmx;
cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c;
cpi->rtcd.encodemb.berr = vp8_block_error_mmx;
- cpi->rtcd.encodemb.mberr = mbblock_error_mmx;
- cpi->rtcd.encodemb.mbuverr = mbuverror_mmx;
- cpi->rtcd.encodemb.subb = subtract_b_mmx;
+ cpi->rtcd.encodemb.mberr = vp8_mbblock_error_mmx;
+ cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_mmx;
+ cpi->rtcd.encodemb.subb = vp8_subtract_b_mmx;
cpi->rtcd.encodemb.submby = vp8_subtract_mby_mmx;
cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_mmx;
- /*cpi->rtcd.quantize.fastquantb = fast_quantize_b_mmx;*/
+ /*cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_mmx;*/
}
#endif
@@ -226,11 +223,6 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.mse16x16 = vp8_mse16x16_wmt;
cpi->rtcd.variance.getmbss = vp8_get_mb_ss_sse2;
- cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_sse2;
- cpi->rtcd.variance.get8x8var = vp8_get8x8var_sse2;
- cpi->rtcd.variance.get16x16var = vp8_get16x16var_sse2;
-
-
/* cpi->rtcd.variance.get4x4sse_cs not implemented for wmt */;
cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_sse2;
@@ -241,9 +233,9 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_sse2 ;
cpi->rtcd.encodemb.berr = vp8_block_error_xmm;
- cpi->rtcd.encodemb.mberr = mbblock_error_xmm;
- cpi->rtcd.encodemb.mbuverr = mbuverror_xmm;
- cpi->rtcd.encodemb.subb = subtract_b_sse2;
+ cpi->rtcd.encodemb.mberr = vp8_mbblock_error_xmm;
+ cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_xmm;
+ cpi->rtcd.encodemb.subb = vp8_subtract_b_sse2;
cpi->rtcd.encodemb.submby = vp8_subtract_mby_sse2;
cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_sse2;