diff options
-rwxr-xr-x | build/make/ads2gas.pl | 26 | ||||
-rw-r--r-- | vp8/common/blockd.h | 8 | ||||
-rw-r--r-- | vp8/encoder/generic/csystemdependent.c | 4 | ||||
-rw-r--r-- | vp8/encoder/lookahead.c | 3 | ||||
-rw-r--r-- | vp8/encoder/mcomp.c | 48 | ||||
-rw-r--r-- | vp8/encoder/onyx_if.c | 17 | ||||
-rw-r--r-- | vp8/encoder/sad_c.c | 63 | ||||
-rw-r--r-- | vp8/encoder/variance.h | 15 | ||||
-rw-r--r-- | vp8/encoder/x86/sad_sse2.asm | 80 | ||||
-rw-r--r-- | vp8/encoder/x86/sad_sse3.asm | 66 | ||||
-rw-r--r-- | vp8/encoder/x86/variance_x86.h | 8 | ||||
-rw-r--r-- | vp8/encoder/x86/x86_csystemdependent.c | 2 | ||||
-rw-r--r-- | vp8/vp8_dx_iface.c | 45 | ||||
-rw-r--r-- | vpx_scale/generic/yv12config.c | 28 |
14 files changed, 373 insertions, 40 deletions
diff --git a/build/make/ads2gas.pl b/build/make/ads2gas.pl index be4658253..388133aa2 100755 --- a/build/make/ads2gas.pl +++ b/build/make/ads2gas.pl @@ -21,6 +21,9 @@ print "@ This file was created from a .asm file\n"; print "@ using the ads2gas.pl script.\n"; print "\t.equ DO1STROUNDING, 0\n"; +# Stack of procedure names. +@proc_stack = (); + while (<STDIN>) { # Load and store alignment @@ -133,9 +136,23 @@ while (<STDIN>) # Strip PRESERVE8 s/\sPRESERVE8/@ PRESERVE8/g; - # Strip PROC and ENDPROC - s/\sPROC/@/g; - s/\sENDP/@/g; + # Use PROC and ENDP to give the symbols a .size directive. + # This makes them show up properly in debugging tools like gdb and valgrind. + if (/\bPROC\b/) + { + my $proc; + /^_([\.0-9A-Z_a-z]\w+)\b/; + $proc = $1; + push(@proc_stack, $proc) if ($proc); + s/\bPROC\b/@ $&/; + } + if (/\bENDP\b/) + { + my $proc; + s/\bENDP\b/@ $&/; + $proc = pop(@proc_stack); + $_ = "\t.size $proc, .-$proc".$_ if ($proc); + } # EQU directive s/(.*)EQU(.*)/.equ $1, $2/; @@ -154,3 +171,6 @@ while (<STDIN>) next if /^\s*END\s*$/; print; } + +# Mark that this object doesn't need an executable stack. +printf ("\t.section\t.note.GNU-stack,\"\",\%\%progbits\n"); diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h index 16b7281e0..96155237a 100644 --- a/vp8/common/blockd.h +++ b/vp8/common/blockd.h @@ -266,6 +266,14 @@ typedef struct MacroBlockD int corrupted; +#if ARCH_X86 || ARCH_X86_64 + /* This is an intermediate buffer currently used in sub-pixel motion search + * to keep a copy of the reference area. This buffer can be used for other + * purpose. 
+ */ + DECLARE_ALIGNED(32, unsigned char, y_buf[22*32]); +#endif + #if CONFIG_RUNTIME_CPU_DETECT struct VP8_COMMON_RTCD *rtcd; #endif diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c index 536636fd1..990610554 100644 --- a/vp8/encoder/generic/csystemdependent.c +++ b/vp8/encoder/generic/csystemdependent.c @@ -47,7 +47,9 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi) cpi->rtcd.variance.sad8x16x4d = vp8_sad8x16x4d_c; cpi->rtcd.variance.sad8x8x4d = vp8_sad8x8x4d_c; cpi->rtcd.variance.sad4x4x4d = vp8_sad4x4x4d_c; - +#if ARCH_X86 || ARCH_X86_64 + cpi->rtcd.variance.copy32xn = vp8_copy32xn_c; +#endif cpi->rtcd.variance.var4x4 = vp8_variance4x4_c; cpi->rtcd.variance.var8x8 = vp8_variance8x8_c; cpi->rtcd.variance.var8x16 = vp8_variance8x16_c; diff --git a/vp8/encoder/lookahead.c b/vp8/encoder/lookahead.c index 3b86d4094..d7f85cba1 100644 --- a/vp8/encoder/lookahead.c +++ b/vp8/encoder/lookahead.c @@ -86,7 +86,8 @@ vp8_lookahead_init(unsigned int width, if(!ctx->buf) goto bail; for(i=0; i<depth; i++) - if (vp8_yv12_alloc_frame_buffer(&ctx->buf[i].img, width, height, 16)) + if (vp8_yv12_alloc_frame_buffer(&ctx->buf[i].img, + width, height, VP8BORDERINPIXELS)) goto bail; } return ctx; diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c index 4b55aceba..58b524f82 100644 --- a/vp8/encoder/mcomp.c +++ b/vp8/encoder/mcomp.c @@ -11,7 +11,7 @@ #include "mcomp.h" #include "vpx_mem/vpx_mem.h" - +#include "vpx_ports/config.h" #include <stdio.h> #include <limits.h> #include <math.h> @@ -165,19 +165,25 @@ void vp8_init3smotion_compensation(MACROBLOCK *x, int stride) x->searches_per_step = 8; } - +/* + * To avoid the penalty for crossing cache-line read, preload the reference + * area in a small buffer, which is aligned to make sure there won't be crossing + * cache-line read while reading from this buffer. This reduced the cpu + * cycles spent on reading ref data in sub-pixel filter functions. 
+ * TODO: Currently, since sub-pixel search range here is -3 ~ 3, copy 22 rows x + * 32 cols area that is enough for 16x16 macroblock. Later, for SPLITMV, we + * could reduce the area. + */ #define MVC(r,c) (((mvcost[0][(r)-rr] + mvcost[1][(c) - rc]) * error_per_bit + 128 )>>8 ) // estimated cost of a motion vector (r,c) -#define PRE(r,c) (*(d->base_pre) + d->pre + ((r)>>2) * d->pre_stride + ((c)>>2)) // pointer to predictor base of a motionvector +#define PRE(r,c) (y + (((r)>>2) * y_stride + ((c)>>2) -(offset))) // pointer to predictor base of a motionvector #define SP(x) (((x)&3)<<1) // convert motion vector component to offset for svf calc -#define DIST(r,c) vfp->svf( PRE(r,c), d->pre_stride, SP(c),SP(r), z,b->src_stride,&sse) // returns subpixel variance error function. +#define DIST(r,c) vfp->svf( PRE(r,c), y_stride, SP(c),SP(r), z,b->src_stride,&sse) // returns subpixel variance error function. #define IFMVCV(r,c,s,e) if ( c >= minc && c <= maxc && r >= minr && r <= maxr) s else e; #define ERR(r,c) (MVC(r,c)+DIST(r,c)) // returns distortion + motion vector cost #define CHECK_BETTER(v,r,c) IFMVCV(r,c,{thismse = DIST(r,c); if((v = (MVC(r,c)+thismse)) < besterr) { besterr = v; br=r; bc=c; *distortion = thismse; *sse1 = sse; }}, v=INT_MAX;)// checks if (r,c) has better score than previous best #define MIN(x,y) (((x)<(y))?(x):(y)) #define MAX(x,y) (((x)>(y))?(x):(y)) -//#define CHECK_BETTER(v,r,c) if((v = ERR(r,c)) < besterr) { besterr = v; br=r; bc=c; } - int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *bestmv, int_mv *ref_mv, int error_per_bit, @@ -185,7 +191,6 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int *mvcost[2], int *distortion, unsigned int *sse1) { - unsigned char *y = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col; unsigned char *z = (*(b->base_src) + b->src); int rr = ref_mv->as_mv.row >> 1, rc = ref_mv->as_mv.col >> 1; @@ -204,12 
+209,38 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int minr = MAX(x->mv_row_min << 2, (ref_mv->as_mv.row >> 1) - ((1 << mvlong_width) - 1)); int maxr = MIN(x->mv_row_max << 2, (ref_mv->as_mv.row >> 1) + ((1 << mvlong_width) - 1)); + int y_stride; + int offset; + +#if ARCH_X86 || ARCH_X86_64 + MACROBLOCKD *xd = &x->e_mbd; + unsigned char *y0 = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col; + unsigned char *y; + int buf_r1, buf_r2, buf_c1, buf_c2; + + // Clamping to avoid out-of-range data access + buf_r1 = ((bestmv->as_mv.row - 3) < x->mv_row_min)?(bestmv->as_mv.row - x->mv_row_min):3; + buf_r2 = ((bestmv->as_mv.row + 3) > x->mv_row_max)?(x->mv_row_max - bestmv->as_mv.row):3; + buf_c1 = ((bestmv->as_mv.col - 3) < x->mv_col_min)?(bestmv->as_mv.col - x->mv_col_min):3; + buf_c2 = ((bestmv->as_mv.col + 3) > x->mv_col_max)?(x->mv_col_max - bestmv->as_mv.col):3; + y_stride = 32; + + /* Copy to intermediate buffer before searching. 
*/ + vfp->copymem(y0 - buf_c1 - d->pre_stride*buf_r1, d->pre_stride, xd->y_buf, y_stride, 16+buf_r1+buf_r2); + y = xd->y_buf + y_stride*buf_r1 +buf_c1; +#else + unsigned char *y = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col; + y_stride = d->pre_stride; +#endif + + offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col; + // central mv bestmv->as_mv.row <<= 3; bestmv->as_mv.col <<= 3; // calculate central point error - besterr = vfp->vf(y, d->pre_stride, z, b->src_stride, sse1); + besterr = vfp->vf(y, y_stride, z, b->src_stride, sse1); *distortion = besterr; besterr += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit); @@ -296,6 +327,7 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, #undef PRE #undef SP #undef DIST +#undef IFMVCV #undef ERR #undef CHECK_BETTER #undef MIN diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 5fda5c791..de045b48e 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -879,6 +879,10 @@ void vp8_set_speed_features(VP8_COMP *cpi) sf->improved_quant = 0; sf->improved_dct = 0; + + sf->use_fastquant_for_pick = 1; + sf->no_skip_block4x4_search = 0; + sf->first_step = 1; } if (Speed > 1) @@ -1237,7 +1241,7 @@ static void alloc_raw_frame_buffers(VP8_COMP *cpi) #if VP8_TEMPORAL_ALT_REF if (vp8_yv12_alloc_frame_buffer(&cpi->alt_ref_buffer, - width, height, 16)) + width, height, VP8BORDERINPIXELS)) vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, "Failed to allocate altref buffer"); @@ -1287,7 +1291,8 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, "Failed to allocate last frame buffer"); - if (vp8_yv12_alloc_frame_buffer(&cpi->scaled_source, width, height, 16)) + if (vp8_yv12_alloc_frame_buffer(&cpi->scaled_source, + width, height, VP8BORDERINPIXELS)) vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, "Failed to allocate scaled source buffer"); @@ -2070,6 +2075,14 
@@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) cpi->fn_ptr[BLOCK_4X4].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x8); cpi->fn_ptr[BLOCK_4X4].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x4d); +#if ARCH_X86 || ARCH_X86_64 + cpi->fn_ptr[BLOCK_16X16].copymem = VARIANCE_INVOKE(&cpi->rtcd.variance, copy32xn); + cpi->fn_ptr[BLOCK_16X8].copymem = VARIANCE_INVOKE(&cpi->rtcd.variance, copy32xn); + cpi->fn_ptr[BLOCK_8X16].copymem = VARIANCE_INVOKE(&cpi->rtcd.variance, copy32xn); + cpi->fn_ptr[BLOCK_8X8].copymem = VARIANCE_INVOKE(&cpi->rtcd.variance, copy32xn); + cpi->fn_ptr[BLOCK_4X4].copymem = VARIANCE_INVOKE(&cpi->rtcd.variance, copy32xn); +#endif + cpi->full_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, full_search); cpi->diamond_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, diamond_search); cpi->refining_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, refining_search); diff --git a/vp8/encoder/sad_c.c b/vp8/encoder/sad_c.c index 5eaca5935..c734458a9 100644 --- a/vp8/encoder/sad_c.c +++ b/vp8/encoder/sad_c.c @@ -10,6 +10,8 @@ #include <stdlib.h> +#include "vpx_ports/config.h" +#include "vpx/vpx_integer.h" unsigned int vp8_sad16x16_c( const unsigned char *src_ptr, @@ -337,3 +339,64 @@ void vp8_sad4x4x4d_c( sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[2], ref_stride, 0x7fffffff); sad_array[3] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff); } + +/* Copy 2 macroblocks to a buffer */ +void vp8_copy32xn_c( + unsigned char *src_ptr, + int src_stride, + unsigned char *dst_ptr, + int dst_stride, + int height) +{ + int r; + + for (r = 0; r < height; r++) + { +#if !(CONFIG_FAST_UNALIGNED) + dst_ptr[0] = src_ptr[0]; + dst_ptr[1] = src_ptr[1]; + dst_ptr[2] = src_ptr[2]; + dst_ptr[3] = src_ptr[3]; + dst_ptr[4] = src_ptr[4]; + dst_ptr[5] = src_ptr[5]; + dst_ptr[6] = src_ptr[6]; + dst_ptr[7] = src_ptr[7]; + dst_ptr[8] = src_ptr[8]; + dst_ptr[9] = src_ptr[9]; + dst_ptr[10] = src_ptr[10]; + dst_ptr[11] = src_ptr[11]; + dst_ptr[12] = 
src_ptr[12]; + dst_ptr[13] = src_ptr[13]; + dst_ptr[14] = src_ptr[14]; + dst_ptr[15] = src_ptr[15]; + dst_ptr[16] = src_ptr[16]; + dst_ptr[17] = src_ptr[17]; + dst_ptr[18] = src_ptr[18]; + dst_ptr[19] = src_ptr[19]; + dst_ptr[20] = src_ptr[20]; + dst_ptr[21] = src_ptr[21]; + dst_ptr[22] = src_ptr[22]; + dst_ptr[23] = src_ptr[23]; + dst_ptr[24] = src_ptr[24]; + dst_ptr[25] = src_ptr[25]; + dst_ptr[26] = src_ptr[26]; + dst_ptr[27] = src_ptr[27]; + dst_ptr[28] = src_ptr[28]; + dst_ptr[29] = src_ptr[29]; + dst_ptr[30] = src_ptr[30]; + dst_ptr[31] = src_ptr[31]; +#else + ((uint32_t *)dst_ptr)[0] = ((uint32_t *)src_ptr)[0] ; + ((uint32_t *)dst_ptr)[1] = ((uint32_t *)src_ptr)[1] ; + ((uint32_t *)dst_ptr)[2] = ((uint32_t *)src_ptr)[2] ; + ((uint32_t *)dst_ptr)[3] = ((uint32_t *)src_ptr)[3] ; + ((uint32_t *)dst_ptr)[4] = ((uint32_t *)src_ptr)[4] ; + ((uint32_t *)dst_ptr)[5] = ((uint32_t *)src_ptr)[5] ; + ((uint32_t *)dst_ptr)[6] = ((uint32_t *)src_ptr)[6] ; + ((uint32_t *)dst_ptr)[7] = ((uint32_t *)src_ptr)[7] ; +#endif + src_ptr += src_stride; + dst_ptr += dst_stride; + + } +} diff --git a/vp8/encoder/variance.h b/vp8/encoder/variance.h index 894b4f9e4..5fd6d3ae0 100644 --- a/vp8/encoder/variance.h +++ b/vp8/encoder/variance.h @@ -222,6 +222,13 @@ extern prototype_sad_multi_dif_address(vp8_variance_sad8x16x4d); #endif extern prototype_sad_multi_dif_address(vp8_variance_sad4x4x4d); +#if ARCH_X86 || ARCH_X86_64 +#ifndef vp8_variance_copy32xn +#define vp8_variance_copy32xn vp8_copy32xn_c +#endif +extern prototype_sad(vp8_variance_copy32xn); +#endif + //-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- #ifndef vp8_variance_var4x4 @@ -381,6 +388,10 @@ typedef struct vp8_sad_multi_d_fn_t sad8x8x4d; vp8_sad_multi_d_fn_t sad4x4x4d; +#if ARCH_X86 || ARCH_X86_64 + vp8_sad_fn_t copy32xn; +#endif + #if CONFIG_INTERNAL_STATS vp8_ssimpf_fn_t ssimpf_8x8; vp8_ssimpf_fn_t ssimpf; @@ -399,7 +410,9 @@ typedef struct vp8_sad_multi_fn_t sdx3f; vp8_sad_multi1_fn_t sdx8f; vp8_sad_multi_d_fn_t 
sdx4df; - +#if ARCH_X86 || ARCH_X86_64 + vp8_sad_fn_t copymem; +#endif } vp8_variance_fn_ptr_t; #if CONFIG_RUNTIME_CPU_DETECT diff --git a/vp8/encoder/x86/sad_sse2.asm b/vp8/encoder/x86/sad_sse2.asm index 04ee72f72..1011c9553 100644 --- a/vp8/encoder/x86/sad_sse2.asm +++ b/vp8/encoder/x86/sad_sse2.asm @@ -328,3 +328,83 @@ x16x8sad_wmt_early_exit: UNSHADOW_ARGS pop rbp ret + +;void vp8_copy32xn_sse2( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *dst_ptr, +; int dst_stride, +; int height); +global sym(vp8_copy32xn_sse2) +sym(vp8_copy32xn_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;dst_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;dst_stride + movsxd rcx, dword ptr arg(4) ;height + +block_copy_sse2_loopx4: + movdqu xmm0, XMMWORD PTR [rsi] + movdqu xmm1, XMMWORD PTR [rsi + 16] + movdqu xmm2, XMMWORD PTR [rsi + rax] + movdqu xmm3, XMMWORD PTR [rsi + rax + 16] + + lea rsi, [rsi+rax*2] + + movdqu xmm4, XMMWORD PTR [rsi] + movdqu xmm5, XMMWORD PTR [rsi + 16] + movdqu xmm6, XMMWORD PTR [rsi + rax] + movdqu xmm7, XMMWORD PTR [rsi + rax + 16] + + lea rsi, [rsi+rax*2] + + movdqa XMMWORD PTR [rdi], xmm0 + movdqa XMMWORD PTR [rdi + 16], xmm1 + movdqa XMMWORD PTR [rdi + rdx], xmm2 + movdqa XMMWORD PTR [rdi + rdx + 16], xmm3 + + lea rdi, [rdi+rdx*2] + + movdqa XMMWORD PTR [rdi], xmm4 + movdqa XMMWORD PTR [rdi + 16], xmm5 + movdqa XMMWORD PTR [rdi + rdx], xmm6 + movdqa XMMWORD PTR [rdi + rdx + 16], xmm7 + + lea rdi, [rdi+rdx*2] + + sub rcx, 4 + cmp rcx, 4 + jge block_copy_sse2_loopx4 + + cmp rcx, 0 + je copy_is_done + +block_copy_sse2_loop: + movdqu xmm0, XMMWORD PTR [rsi] + movdqu xmm1, XMMWORD PTR [rsi + 16] + lea rsi, [rsi+rax] + + movdqa XMMWORD PTR [rdi], xmm0 + movdqa XMMWORD PTR [rdi + 16], xmm1 + lea rdi, [rdi+rdx] + + sub rcx, 1 + jne block_copy_sse2_loop + +copy_is_done: + ; begin epilog + pop rdi + pop rsi + 
RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/vp8/encoder/x86/sad_sse3.asm b/vp8/encoder/x86/sad_sse3.asm index 2dbcc7dc9..1c41c322a 100644 --- a/vp8/encoder/x86/sad_sse3.asm +++ b/vp8/encoder/x86/sad_sse3.asm @@ -20,6 +20,7 @@ %define ret_var rbx %define result_ptr arg(4) %define max_err arg(4) + %define height dword ptr arg(4) push rbp mov rbp, rsp push rsi @@ -42,6 +43,7 @@ %define ret_var r11 %define result_ptr [rsp+xmm_stack_space+8+4*8] %define max_err [rsp+xmm_stack_space+8+4*8] + %define height [rsp+xmm_stack_space+8+4*8] %else %define src_ptr rdi %define src_stride rsi @@ -51,6 +53,7 @@ %define ret_var r10 %define result_ptr r8 %define max_err r8 + %define height r8 %endif %endif @@ -65,6 +68,7 @@ %define ret_var %define result_ptr %define max_err + %define height %if ABI_IS_32BIT pop rbx @@ -632,6 +636,67 @@ sym(vp8_sad16x16_sse3): STACK_FRAME_DESTROY_X3 +;void vp8_copy32xn_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *dst_ptr, +; int dst_stride, +; int height); +global sym(vp8_copy32xn_sse3) +sym(vp8_copy32xn_sse3): + + STACK_FRAME_CREATE_X3 + +block_copy_sse3_loopx4: + lea end_ptr, [src_ptr+src_stride*2] + + movdqu xmm0, XMMWORD PTR [src_ptr] + movdqu xmm1, XMMWORD PTR [src_ptr + 16] + movdqu xmm2, XMMWORD PTR [src_ptr + src_stride] + movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16] + movdqu xmm4, XMMWORD PTR [end_ptr] + movdqu xmm5, XMMWORD PTR [end_ptr + 16] + movdqu xmm6, XMMWORD PTR [end_ptr + src_stride] + movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16] + + lea src_ptr, [src_ptr+src_stride*4] + + lea end_ptr, [ref_ptr+ref_stride*2] + + movdqa XMMWORD PTR [ref_ptr], xmm0 + movdqa XMMWORD PTR [ref_ptr + 16], xmm1 + movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2 + movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3 + movdqa XMMWORD PTR [end_ptr], xmm4 + movdqa XMMWORD PTR [end_ptr + 16], xmm5 + movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6 + movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7 + + lea 
ref_ptr, [ref_ptr+ref_stride*4] + + sub height, 4 + cmp height, 4 + jge block_copy_sse3_loopx4 + + ;Check to see if there are more rows that need to be copied. + cmp height, 0 + je copy_is_done + +block_copy_sse3_loop: + movdqu xmm0, XMMWORD PTR [src_ptr] + movdqu xmm1, XMMWORD PTR [src_ptr + 16] + lea src_ptr, [src_ptr+src_stride] + + movdqa XMMWORD PTR [ref_ptr], xmm0 + movdqa XMMWORD PTR [ref_ptr + 16], xmm1 + lea ref_ptr, [ref_ptr+ref_stride] + + sub height, 1 + jne block_copy_sse3_loop + +copy_is_done: + STACK_FRAME_DESTROY_X3 + ;void vp8_sad16x16x4d_sse3( ; unsigned char *src_ptr, ; int src_stride, @@ -892,3 +957,4 @@ sym(vp8_sad4x4x4d_sse3): STACK_FRAME_DESTROY_X4 + diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h index 0ee8eb7e5..af6c4d27e 100644 --- a/vp8/encoder/x86/variance_x86.h +++ b/vp8/encoder/x86/variance_x86.h @@ -121,6 +121,7 @@ extern prototype_sad(vp8_sad8x8_wmt); extern prototype_sad(vp8_sad8x16_wmt); extern prototype_sad(vp8_sad16x8_wmt); extern prototype_sad(vp8_sad16x16_wmt); +extern prototype_sad(vp8_copy32xn_sse2); extern prototype_variance(vp8_variance4x4_wmt); extern prototype_variance(vp8_variance8x8_wmt); extern prototype_variance(vp8_variance8x16_wmt); @@ -156,6 +157,9 @@ extern prototype_variance2(vp8_get16x16var_sse2); #undef vp8_variance_sad16x16 #define vp8_variance_sad16x16 vp8_sad16x16_wmt +#undef vp8_variance_copy32xn +#define vp8_variance_copy32xn vp8_copy32xn_sse2 + #undef vp8_variance_var4x4 #define vp8_variance_var4x4 vp8_variance4x4_wmt @@ -222,6 +226,7 @@ extern prototype_sad_multi_dif_address(vp8_sad16x8x4d_sse3); extern prototype_sad_multi_dif_address(vp8_sad8x16x4d_sse3); extern prototype_sad_multi_dif_address(vp8_sad8x8x4d_sse3); extern prototype_sad_multi_dif_address(vp8_sad4x4x4d_sse3); +extern prototype_sad(vp8_copy32xn_sse3); #if !CONFIG_RUNTIME_CPU_DETECT @@ -258,6 +263,9 @@ extern prototype_sad_multi_dif_address(vp8_sad4x4x4d_sse3); #undef vp8_variance_sad4x4x4d #define 
vp8_variance_sad4x4x4d vp8_sad4x4x4d_sse3 +#undef vp8_variance_copy32xn +#define vp8_variance_copy32xn vp8_copy32xn_sse3 + #endif #endif diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c index 9a324ec12..badb9f044 100644 --- a/vp8/encoder/x86/x86_csystemdependent.c +++ b/vp8/encoder/x86/x86_csystemdependent.c @@ -203,6 +203,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.variance.sad8x16 = vp8_sad8x16_wmt; cpi->rtcd.variance.sad8x8 = vp8_sad8x8_wmt; cpi->rtcd.variance.sad4x4 = vp8_sad4x4_wmt; + cpi->rtcd.variance.copy32xn = vp8_copy32xn_sse2; cpi->rtcd.variance.var4x4 = vp8_variance4x4_wmt; cpi->rtcd.variance.var8x8 = vp8_variance8x8_wmt; @@ -263,6 +264,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.variance.sad8x16x4d = vp8_sad8x16x4d_sse3; cpi->rtcd.variance.sad8x8x4d = vp8_sad8x8x4d_sse3; cpi->rtcd.variance.sad4x4x4d = vp8_sad4x4x4d_sse3; + cpi->rtcd.variance.copy32xn = vp8_copy32xn_sse3; cpi->rtcd.search.diamond_search = vp8_diamond_search_sadx4; cpi->rtcd.search.refining_search = vp8_refining_search_sadx4; } diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index 58dc486de..13a072bff 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -301,6 +301,36 @@ update_error_state(vpx_codec_alg_priv_t *ctx, return res; } +static void yuvconfig2image(vpx_image_t *img, + const YV12_BUFFER_CONFIG *yv12, + void *user_priv) +{ + /** vpx_img_wrap() doesn't allow specifying independent strides for + * the Y, U, and V planes, nor other alignment adjustments that + * might be representable by a YV12_BUFFER_CONFIG, so we just + * initialize all the fields.*/ + img->fmt = yv12->clrtype == REG_YUV ? 
+ VPX_IMG_FMT_I420 : VPX_IMG_FMT_VPXI420; + img->w = yv12->y_stride; + img->h = (yv12->y_height + 2 * VP8BORDERINPIXELS + 15) & ~15; + img->d_w = yv12->y_width; + img->d_h = yv12->y_height; + img->x_chroma_shift = 1; + img->y_chroma_shift = 1; + img->planes[VPX_PLANE_Y] = yv12->y_buffer; + img->planes[VPX_PLANE_U] = yv12->u_buffer; + img->planes[VPX_PLANE_V] = yv12->v_buffer; + img->planes[VPX_PLANE_ALPHA] = NULL; + img->stride[VPX_PLANE_Y] = yv12->y_stride; + img->stride[VPX_PLANE_U] = yv12->uv_stride; + img->stride[VPX_PLANE_V] = yv12->uv_stride; + img->stride[VPX_PLANE_ALPHA] = yv12->y_stride; + img->bps = 12; + img->user_priv = user_priv; + img->img_data = yv12->buffer_alloc; + img->img_data_owner = 0; + img->self_allocd = 0; +} static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, const uint8_t *data, @@ -429,21 +459,8 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, if (!res && 0 == vp8dx_get_raw_frame(ctx->pbi, &sd, &time_stamp, &time_end_stamp, &flags)) { - /* Align width/height */ - unsigned int a_w = (sd.y_width + 15) & ~15; - unsigned int a_h = (sd.y_height + 15) & ~15; - - vpx_img_wrap(&ctx->img, VPX_IMG_FMT_I420, - a_w + 2 * VP8BORDERINPIXELS, - a_h + 2 * VP8BORDERINPIXELS, - 1, - sd.buffer_alloc); - vpx_img_set_rect(&ctx->img, - VP8BORDERINPIXELS, VP8BORDERINPIXELS, - sd.y_width, sd.y_height); - ctx->img.user_priv = user_priv; + yuvconfig2image(&ctx->img, &sd, user_priv); ctx->img_avail = 1; - } } diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c index d02cde28f..eff594e2d 100644 --- a/vpx_scale/generic/yv12config.c +++ b/vpx_scale/generic/yv12config.c @@ -49,25 +49,33 @@ vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int if (ybf) { + int y_stride = ((width + 2 * border) + 31) & ~31; + int yplane_size = (height + 2 * border) * y_stride; int uv_width = width >> 1; int uv_height = height >> 1; - int yplane_size = (height + 2 * border) * (width + 2 * border); - int uvplane_size 
= (uv_height + border) * (uv_width + border); + /** There is currently a bunch of code which assumes + * uv_stride == y_stride/2, so enforce this here. */ + int uv_stride = y_stride >> 1; + int uvplane_size = (uv_height + border) * uv_stride; vp8_yv12_de_alloc_frame_buffer(ybf); - /* only support allocating buffers that have - a height and width that are multiples of 16 */ - if ((width & 0xf) | (height & 0xf)) + /** Only support allocating buffers that have a height and width that + * are multiples of 16, and a border that's a multiple of 32. + * The border restriction is required to get 16-byte alignment of the + * start of the chroma rows without introducing an arbitrary gap + * between planes, which would break the semantics of things like + * vpx_img_set_rect(). */ + if ((width & 0xf) | (height & 0xf) | (border & 0x1f)) return -3; ybf->y_width = width; ybf->y_height = height; - ybf->y_stride = width + 2 * border; + ybf->y_stride = y_stride; ybf->uv_width = uv_width; ybf->uv_height = uv_height; - ybf->uv_stride = uv_width + border; + ybf->uv_stride = uv_stride; ybf->border = border; ybf->frame_size = yplane_size + 2 * uvplane_size; @@ -77,9 +85,9 @@ vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int if (ybf->buffer_alloc == NULL) return -1; - ybf->y_buffer = ybf->buffer_alloc + (border * ybf->y_stride) + border; - ybf->u_buffer = ybf->buffer_alloc + yplane_size + (border / 2 * ybf->uv_stride) + border / 2; - ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size + (border / 2 * ybf->uv_stride) + border / 2; + ybf->y_buffer = ybf->buffer_alloc + (border * y_stride) + border; + ybf->u_buffer = ybf->buffer_alloc + yplane_size + (border / 2 * uv_stride) + border / 2; + ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size + (border / 2 * uv_stride) + border / 2; ybf->corrupted = 0; /* assume not currupted by errors */ } |