diff options
author | John Koleszar <jkoleszar@google.com> | 2011-06-17 15:36:43 -0400 |
---|---|---|
committer | John Koleszar <jkoleszar@google.com> | 2011-06-17 15:36:43 -0400 |
commit | deb2e9cf620087b4dc5b78423b6ff9bf8971c1bc (patch) | |
tree | 592d5eeb0cac290da19d6e660135a0f41dfd1dc3 /vp8/encoder/x86 | |
parent | cefb915ab62a7cd91b347cc22449b2eb9b8c1df9 (diff) | |
parent | a60fc419f50eefbef568ceecb93233471e894fa5 (diff) | |
download | libvpx-deb2e9cf620087b4dc5b78423b6ff9bf8971c1bc.tar libvpx-deb2e9cf620087b4dc5b78423b6ff9bf8971c1bc.tar.gz libvpx-deb2e9cf620087b4dc5b78423b6ff9bf8971c1bc.tar.bz2 libvpx-deb2e9cf620087b4dc5b78423b6ff9bf8971c1bc.zip |
Merge remote branch 'internal/upstream' into HEAD
Conflicts:
vp8/encoder/encodeframe.c
vp8/encoder/rdopt.c
Change-Id: I183fd3ce9e94617ec888c9f891055b9f1f8ca6c5
Diffstat (limited to 'vp8/encoder/x86')
-rw-r--r-- | vp8/encoder/x86/dct_x86.h | 6 | ||||
-rw-r--r-- | vp8/encoder/x86/variance_impl_mmx.asm | 130 | ||||
-rw-r--r-- | vp8/encoder/x86/variance_impl_sse2.asm | 116 | ||||
-rw-r--r-- | vp8/encoder/x86/variance_mmx.c | 37 | ||||
-rw-r--r-- | vp8/encoder/x86/variance_sse2.c | 7 | ||||
-rw-r--r-- | vp8/encoder/x86/variance_x86.h | 21 | ||||
-rw-r--r-- | vp8/encoder/x86/x86_csystemdependent.c | 42 |
7 files changed, 23 insertions, 336 deletions
diff --git a/vp8/encoder/x86/dct_x86.h b/vp8/encoder/x86/dct_x86.h index 59a5cb1d7..19f6c1686 100644 --- a/vp8/encoder/x86/dct_x86.h +++ b/vp8/encoder/x86/dct_x86.h @@ -31,6 +31,12 @@ extern prototype_fdct(vp8_short_fdct8x4_mmx); #undef vp8_fdct_short8x4 #define vp8_fdct_short8x4 vp8_short_fdct8x4_mmx +#undef vp8_fdct_fast4x4 +#define vp8_fdct_fast4x4 vp8_short_fdct4x4_mmx + +#undef vp8_fdct_fast8x4 +#define vp8_fdct_fast8x4 vp8_short_fdct8x4_mmx + #endif #endif diff --git a/vp8/encoder/x86/variance_impl_mmx.asm b/vp8/encoder/x86/variance_impl_mmx.asm index 67a9b4d3e..13b76ea91 100644 --- a/vp8/encoder/x86/variance_impl_mmx.asm +++ b/vp8/encoder/x86/variance_impl_mmx.asm @@ -843,136 +843,6 @@ filter_block2d_bil_var_mmx_loop: pop rbp ret -;unsigned int vp8_get16x16pred_error_mmx -;( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride -;) -global sym(vp8_get16x16pred_error_mmx) -sym(vp8_get16x16pred_error_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - GET_GOT rbx - push rsi - push rdi - sub rsp, 16 - ; end prolog - - mov rsi, arg(0) ;DWORD PTR [src_ptr] - mov rdi, arg(2) ;DWORD PTR [ref_ptr] - - movsxd rax, DWORD PTR arg(1) ;[src_stride] - movsxd rdx, DWORD PTR arg(3) ;[ref_stride] - - pxor mm0, mm0 ; clear xmm0 for unpack - pxor mm7, mm7 ; clear xmm7 for accumulating diffs - - pxor mm6, mm6 ; clear xmm6 for accumulating sse - mov rcx, 16 - -var16loop: - - movq mm1, [rsi] - movq mm2, [rdi] - - movq mm3, mm1 - movq mm4, mm2 - - punpcklbw mm1, mm0 - punpckhbw mm3, mm0 - - punpcklbw mm2, mm0 - punpckhbw mm4, mm0 - - psubw mm1, mm2 - psubw mm3, mm4 - - paddw mm7, mm1 - pmaddwd mm1, mm1 - - paddw mm7, mm3 - pmaddwd mm3, mm3 - - paddd mm6, mm1 - paddd mm6, mm3 - - - movq mm1, [rsi+8] - movq mm2, [rdi+8] - - movq mm3, mm1 - movq mm4, mm2 - - punpcklbw mm1, mm0 - punpckhbw mm3, mm0 - - punpcklbw mm2, mm0 - punpckhbw mm4, mm0 - - psubw mm1, mm2 - psubw mm3, mm4 - - paddw mm7, mm1 - pmaddwd mm1, mm1 - - paddw mm7, mm3 - pmaddwd mm3, mm3 - - paddd mm6, mm1 - paddd mm6, mm3 - - add rsi, rax - add rdi, rdx - - sub rcx, 1 - jnz var16loop - - - movq mm1, mm6 - pxor mm6, mm6 - - pxor mm5, mm5 - punpcklwd mm6, mm7 - - punpckhwd mm5, mm7 - psrad mm5, 16 - - psrad mm6, 16 - paddd mm6, mm5 - - movq mm2, mm1 - psrlq mm1, 32 - - paddd mm2, mm1 - movq mm7, mm6 - - psrlq mm6, 32 - paddd mm6, mm7 - - movd DWORD PTR [rsp], mm6 ;Sum - movd DWORD PTR [rsp+4], mm2 ;SSE - - ; return (SSE-((Sum*Sum)>>8)); - movsxd rdx, dword ptr [rsp] - imul rdx, rdx - sar rdx, 8 - movsxd rax, dword ptr [rsp + 4] - sub rax, rdx - - - ; begin epilog - add rsp, 16 - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - SECTION_RODATA ;short mmx_bi_rd[4] = { 64, 64, 64, 64}; diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm index 5becc7344..b7a6b3286 100644 --- a/vp8/encoder/x86/variance_impl_sse2.asm +++ b/vp8/encoder/x86/variance_impl_sse2.asm @@ -213,122 +213,6 @@ var16loop: ret -;unsigned int vp8_get16x16pred_error_sse2 -;( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride -;) -global sym(vp8_get16x16pred_error_sse2) -sym(vp8_get16x16pred_error_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - sub rsp, 16 - ; end prolog - - mov rsi, arg(0) ;[src_ptr] - mov rdi, arg(2) ;[ref_ptr] - - movsxd rax, DWORD PTR arg(1) ;[src_stride] - movsxd rdx, DWORD PTR arg(3) ;[ref_stride] - - pxor xmm0, xmm0 ; clear xmm0 for unpack - pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs - - pxor xmm6, xmm6 ; clear xmm6 for accumulating sse - mov rcx, 16 - -var16peloop: - movdqu xmm1, XMMWORD PTR [rsi] - movdqu xmm2, XMMWORD PTR [rdi] - - movdqa xmm3, xmm1 - movdqa xmm4, xmm2 - - punpcklbw xmm1, xmm0 - punpckhbw xmm3, xmm0 - - punpcklbw xmm2, xmm0 - punpckhbw xmm4, xmm0 - - psubw xmm1, xmm2 - psubw xmm3, xmm4 - - paddw xmm7, xmm1 - pmaddwd xmm1, xmm1 - - paddw xmm7, xmm3 - pmaddwd xmm3, xmm3 - - paddd xmm6, xmm1 - paddd xmm6, xmm3 - - add rsi, rax - add rdi, rdx - - sub rcx, 1 - jnz var16peloop - - - movdqa xmm1, xmm6 - pxor xmm6, xmm6 - - pxor xmm5, xmm5 - punpcklwd xmm6, xmm7 - - punpckhwd xmm5, xmm7 - psrad xmm5, 16 - - psrad xmm6, 16 - paddd xmm6, xmm5 - - movdqa xmm2, xmm1 - punpckldq xmm1, xmm0 - - punpckhdq xmm2, xmm0 - movdqa xmm7, xmm6 - - paddd xmm1, xmm2 - punpckldq xmm6, xmm0 - - punpckhdq xmm7, xmm0 - paddd xmm6, xmm7 - - movdqa xmm2, xmm1 - movdqa xmm7, xmm6 - - psrldq xmm1, 8 - psrldq xmm6, 8 - - paddd xmm7, xmm6 - paddd xmm1, xmm2 - - movd DWORD PTR [rsp], xmm7 ;Sum - movd DWORD PTR [rsp+4], xmm1 ;SSE - - ; return (SSE-((Sum*Sum)>>8)); - movsxd rdx, dword ptr [rsp] - imul rdx, rdx - sar rdx, 8 - movsxd rax, dword ptr [rsp + 4] - sub rax, rdx - - ; begin epilog - add rsp, 16 - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - ;unsigned int vp8_get8x8var_sse2 diff --git a/vp8/encoder/x86/variance_mmx.c b/vp8/encoder/x86/variance_mmx.c index 4a89868c2..92b695f17 100644 --- a/vp8/encoder/x86/variance_mmx.c +++ b/vp8/encoder/x86/variance_mmx.c @@ -76,43 +76,6 @@ extern void vp8_filter_block2d_bil_var_mmx int *sum, unsigned int *sumsquared ); -extern unsigned int vp8_get16x16pred_error_mmx -( - const unsigned char *src_ptr, - int src_stride, - const unsigned char *ref_ptr, - int ref_stride -); - -unsigned int vp8_get16x16var_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *SUM -) -{ - unsigned int sse0, sse1, sse2, sse3, var; - int sum0, sum1, sum2, sum3, avg; - - - vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); - vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ; - vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3); - - var = sse0 + sse1 + sse2 + sse3; - avg = sum0 + sum1 + sum2 + sum3; - - *SSE = var; - *SUM = avg; - return (var - ((avg * avg) >> 8)); - -} - - - unsigned int vp8_variance4x4_mmx( diff --git a/vp8/encoder/x86/variance_sse2.c b/vp8/encoder/x86/variance_sse2.c index dfc0915b9..24062eb9b 100644 --- a/vp8/encoder/x86/variance_sse2.c +++ b/vp8/encoder/x86/variance_sse2.c @@ -53,13 +53,6 @@ unsigned int vp8_get16x16var_sse2 unsigned int *SSE, int *Sum ); -unsigned int vp8_get16x16pred_error_sse2 -( - const unsigned char *src_ptr, - int src_stride, - const unsigned char *ref_ptr, - int ref_stride -); unsigned int vp8_get8x8var_sse2 ( const unsigned char *src_ptr, diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h index 77e05e1e8..0ee8eb7e5 100644 --- a/vp8/encoder/x86/variance_x86.h +++ b/vp8/encoder/x86/variance_x86.h @@ -41,9 +41,7 @@ extern prototype_variance(vp8_variance_halfpixvar16x16_hv_mmx); extern prototype_subpixvariance(vp8_sub_pixel_mse16x16_mmx); extern prototype_getmbss(vp8_get_mb_ss_mmx); extern prototype_variance(vp8_mse16x16_mmx); -extern prototype_get16x16prederror(vp8_get16x16pred_error_mmx); extern prototype_variance2(vp8_get8x8var_mmx); -extern prototype_variance2(vp8_get16x16var_mmx); extern prototype_get16x16prederror(vp8_get4x4sse_cs_mmx); #if !CONFIG_RUNTIME_CPU_DETECT @@ -110,15 +108,6 @@ extern prototype_get16x16prederror(vp8_get4x4sse_cs_mmx); #undef vp8_variance_mse16x16 #define vp8_variance_mse16x16 vp8_mse16x16_mmx -#undef vp8_variance_get16x16prederror -#define vp8_variance_get16x16prederror vp8_get16x16pred_error_mmx - -#undef vp8_variance_get8x8var -#define vp8_variance_get8x8var vp8_get8x8var_mmx - -#undef vp8_variance_get16x16var -#define vp8_variance_get16x16var vp8_get16x16var_mmx - #undef vp8_variance_get4x4sse_cs #define vp8_variance_get4x4sse_cs vp8_get4x4sse_cs_mmx @@ -148,7 +137,6 @@ extern prototype_variance(vp8_variance_halfpixvar16x16_hv_wmt); extern prototype_subpixvariance(vp8_sub_pixel_mse16x16_wmt); extern prototype_getmbss(vp8_get_mb_ss_sse2); extern prototype_variance(vp8_mse16x16_wmt); -extern prototype_get16x16prederror(vp8_get16x16pred_error_sse2); extern prototype_variance2(vp8_get8x8var_sse2); extern prototype_variance2(vp8_get16x16var_sse2); @@ -216,15 +204,6 @@ extern prototype_variance2(vp8_get16x16var_sse2); #undef vp8_variance_mse16x16 #define vp8_variance_mse16x16 vp8_mse16x16_wmt -#undef vp8_variance_get16x16prederror -#define vp8_variance_get16x16prederror vp8_get16x16pred_error_sse2 - -#undef vp8_variance_get8x8var -#define vp8_variance_get8x8var vp8_get8x8var_sse2 - -#undef vp8_variance_get16x16var -#define vp8_variance_get16x16var vp8_get16x16var_sse2 - #endif #endif diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c index 378b14066..9a324ec12 100644 --- a/vp8/encoder/x86/x86_csystemdependent.c +++ b/vp8/encoder/x86/x86_csystemdependent.c @@ -16,7 +16,7 @@ #if HAVE_MMX -static void short_fdct8x4_mmx(short *input, short *output, int pitch) +void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch) { vp8_short_fdct4x4_mmx(input, output, pitch); vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch); @@ -26,7 +26,7 @@ int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr, short *qcoeff_ptr, short *dequant_ptr, short *scan_mask, short *round_ptr, short *quant_ptr, short *dqcoeff_ptr); -static void fast_quantize_b_mmx(BLOCK *b, BLOCKD *d) +void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d) { short *scan_mask = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr; short *coeff_ptr = b->coeff; @@ -51,7 +51,7 @@ static void fast_quantize_b_mmx(BLOCK *b, BLOCKD *d) } int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc); -static int mbblock_error_mmx(MACROBLOCK *mb, int dc) +int vp8_mbblock_error_mmx(MACROBLOCK *mb, int dc) { short *coeff_ptr = mb->block[0].coeff; short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff; @@ -59,7 +59,7 @@ static int mbblock_error_mmx(MACROBLOCK *mb, int dc) } int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr); -static int mbuverror_mmx(MACROBLOCK *mb) +int vp8_mbuverror_mmx(MACROBLOCK *mb) { short *s_ptr = &mb->coeff[256]; short *d_ptr = &mb->e_mbd.dqcoeff[256]; @@ -69,7 +69,7 @@ static int mbuverror_mmx(MACROBLOCK *mb) void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride, short *diff, unsigned char *predictor, int pitch); -static void subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) +void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) { unsigned char *z = *(be->base_src) + be->src; unsigned int src_stride = be->src_stride; @@ -82,7 +82,7 @@ static void subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) #if HAVE_SSE2 int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc); -static int mbblock_error_xmm(MACROBLOCK *mb, int dc) +int vp8_mbblock_error_xmm(MACROBLOCK *mb, int dc) { short *coeff_ptr = mb->block[0].coeff; short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff; @@ -90,7 +90,7 @@ static int mbblock_error_xmm(MACROBLOCK *mb, int dc) } int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr); -static int mbuverror_xmm(MACROBLOCK *mb) +int vp8_mbuverror_xmm(MACROBLOCK *mb) { short *s_ptr = &mb->coeff[256]; short *d_ptr = &mb->e_mbd.dqcoeff[256]; @@ -100,7 +100,7 @@ static int mbuverror_xmm(MACROBLOCK *mb) void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride, short *diff, unsigned char *predictor, int pitch); -static void subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch) +void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch) { unsigned char *z = *(be->base_src) + be->src; unsigned int src_stride = be->src_stride; @@ -175,26 +175,23 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.variance.mse16x16 = vp8_mse16x16_mmx; cpi->rtcd.variance.getmbss = vp8_get_mb_ss_mmx; - cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_mmx; - cpi->rtcd.variance.get8x8var = vp8_get8x8var_mmx; - cpi->rtcd.variance.get16x16var = vp8_get16x16var_mmx; cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_mmx; cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_mmx; - cpi->rtcd.fdct.short8x4 = short_fdct8x4_mmx; + cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_mmx; cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_mmx; - cpi->rtcd.fdct.fast8x4 = short_fdct8x4_mmx; + cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_mmx; cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c; cpi->rtcd.encodemb.berr = vp8_block_error_mmx; - cpi->rtcd.encodemb.mberr = mbblock_error_mmx; - cpi->rtcd.encodemb.mbuverr = mbuverror_mmx; - cpi->rtcd.encodemb.subb = subtract_b_mmx; + cpi->rtcd.encodemb.mberr = vp8_mbblock_error_mmx; + cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_mmx; + cpi->rtcd.encodemb.subb = vp8_subtract_b_mmx; cpi->rtcd.encodemb.submby = vp8_subtract_mby_mmx; cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_mmx; - /*cpi->rtcd.quantize.fastquantb = fast_quantize_b_mmx;*/ + /*cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_mmx;*/ } #endif @@ -226,11 +223,6 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.variance.mse16x16 = vp8_mse16x16_wmt; cpi->rtcd.variance.getmbss = vp8_get_mb_ss_sse2; - cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_sse2; - cpi->rtcd.variance.get8x8var = vp8_get8x8var_sse2; - cpi->rtcd.variance.get16x16var = vp8_get16x16var_sse2; - - /* cpi->rtcd.variance.get4x4sse_cs not implemented for wmt */; cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_sse2; @@ -241,9 +233,9 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_sse2 ; cpi->rtcd.encodemb.berr = vp8_block_error_xmm; - cpi->rtcd.encodemb.mberr = mbblock_error_xmm; - cpi->rtcd.encodemb.mbuverr = mbuverror_xmm; - cpi->rtcd.encodemb.subb = subtract_b_sse2; + cpi->rtcd.encodemb.mberr = vp8_mbblock_error_xmm; + cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_xmm; + cpi->rtcd.encodemb.subb = vp8_subtract_b_sse2; cpi->rtcd.encodemb.submby = vp8_subtract_mby_sse2; cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_sse2; |