14 files changed, 373 insertions, 40 deletions
diff --git a/build/make/ads2gas.pl b/build/make/ads2gas.pl
index be4658253..388133aa2 100755
--- a/build/make/ads2gas.pl
+++ b/build/make/ads2gas.pl
@@ -21,6 +21,9 @@ print "@ This file was created from a .asm file\n";
 print "@  using the ads2gas.pl script.\n";
 print "\t.equ DO1STROUNDING, 0\n";
 
+# Stack of procedure names.
+@proc_stack = ();
+
 while (<STDIN>)
 {
     # Load and store alignment
@@ -133,9 +136,23 @@ while (<STDIN>)
     # Strip PRESERVE8
     s/\sPRESERVE8/@ PRESERVE8/g;
 
-    # Strip PROC and ENDPROC
-    s/\sPROC/@/g;
-    s/\sENDP/@/g;
+    # Use PROC and ENDP to give the symbols a .size directive.
+    # This makes them show up properly in debugging tools like gdb and valgrind.
+    if (/\bPROC\b/)
+    {
+        my $proc;
+        /^_([\.0-9A-Z_a-z]\w+)\b/;
+        $proc = $1;
+        push(@proc_stack, $proc) if ($proc);
+        s/\bPROC\b/@ $&/;
+    }
+    if (/\bENDP\b/)
+    {
+        my $proc;
+        s/\bENDP\b/@ $&/;
+        $proc = pop(@proc_stack);
+        $_ = "\t.size $proc, .-$proc".$_ if ($proc);
+    }
 
     # EQU directive
     s/(.*)EQU(.*)/.equ $1, $2/;
@@ -154,3 +171,6 @@ while (<STDIN>)
     next if /^\s*END\s*$/;
     print;
 }
+
+# Mark that this object doesn't need an executable stack.
+printf ("\t.section\t.note.GNU-stack,\"\",\%\%progbits\n");
diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h
index 16b7281e0..96155237a 100644
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -266,6 +266,14 @@ typedef struct MacroBlockD
 
     int corrupted;
 
+#if ARCH_X86 || ARCH_X86_64
+    /* This is an intermediate buffer currently used in sub-pixel motion search
+     * to keep a copy of the reference area. This buffer may also be used for
+     * other purposes.
+     */
+    DECLARE_ALIGNED(32, unsigned char, y_buf[22*32]);
+#endif
+
 #if CONFIG_RUNTIME_CPU_DETECT
     struct VP8_COMMON_RTCD  *rtcd;
 #endif
diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c
index 536636fd1..990610554 100644
--- a/vp8/encoder/generic/csystemdependent.c
+++ b/vp8/encoder/generic/csystemdependent.c
@@ -47,7 +47,9 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)
     cpi->rtcd.variance.sad8x16x4d            = vp8_sad8x16x4d_c;
     cpi->rtcd.variance.sad8x8x4d             = vp8_sad8x8x4d_c;
     cpi->rtcd.variance.sad4x4x4d             = vp8_sad4x4x4d_c;
-
+#if ARCH_X86 || ARCH_X86_64
+    cpi->rtcd.variance.copy32xn              = vp8_copy32xn_c;
+#endif
     cpi->rtcd.variance.var4x4                = vp8_variance4x4_c;
     cpi->rtcd.variance.var8x8                = vp8_variance8x8_c;
     cpi->rtcd.variance.var8x16               = vp8_variance8x16_c;
diff --git a/vp8/encoder/lookahead.c b/vp8/encoder/lookahead.c
index 3b86d4094..d7f85cba1 100644
--- a/vp8/encoder/lookahead.c
+++ b/vp8/encoder/lookahead.c
@@ -86,7 +86,8 @@ vp8_lookahead_init(unsigned int width,
         if(!ctx->buf)
             goto bail;
         for(i=0; i<depth; i++)
-            if (vp8_yv12_alloc_frame_buffer(&ctx->buf[i].img, width, height, 16))
+            if (vp8_yv12_alloc_frame_buffer(&ctx->buf[i].img,
+                                            width, height, VP8BORDERINPIXELS))
                 goto bail;
     }
     return ctx;
diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c
index 4b55aceba..58b524f82 100644
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -11,7 +11,7 @@
 
 #include "mcomp.h"
 #include "vpx_mem/vpx_mem.h"
-
+#include "vpx_ports/config.h"
 #include <stdio.h>
 #include <limits.h>
 #include <math.h>
@@ -165,19 +165,25 @@ void vp8_init3smotion_compensation(MACROBLOCK *x, int stride)
     x->searches_per_step = 8;
 }
 
-
+/*
+ * To avoid the penalty of reads that cross a cache line, preload the reference
+ * area into a small aligned buffer, so that no read from this buffer crosses
+ * a cache line. This reduces the CPU cycles spent reading reference data in
+ * the sub-pixel filter functions.
+ * TODO: Currently, since the sub-pixel search range here is -3 ~ 3, we copy a
+ * 22 rows x 32 cols area, which is enough for a 16x16 macroblock. Later, for
+ * SPLITMV, we could reduce the area.
+ */
 #define MVC(r,c) (((mvcost[0][(r)-rr] + mvcost[1][(c) - rc]) * error_per_bit + 128 )>>8 ) // estimated cost of a motion vector (r,c)
-#define PRE(r,c) (*(d->base_pre) + d->pre + ((r)>>2) * d->pre_stride + ((c)>>2)) // pointer to predictor base of a motionvector
+#define PRE(r,c) (y + (((r)>>2) * y_stride + ((c)>>2) -(offset))) // pointer to predictor base of a motionvector
 #define SP(x) (((x)&3)<<1) // convert motion vector component to offset for svf calc
-#define DIST(r,c) vfp->svf( PRE(r,c), d->pre_stride, SP(c),SP(r), z,b->src_stride,&sse) // returns subpixel variance error function.
+#define DIST(r,c) vfp->svf( PRE(r,c), y_stride, SP(c),SP(r), z,b->src_stride,&sse) // returns subpixel variance error function.
 #define IFMVCV(r,c,s,e) if ( c >= minc && c <= maxc && r >= minr && r <= maxr) s else e;
 #define ERR(r,c) (MVC(r,c)+DIST(r,c)) // returns distortion + motion vector cost
 #define CHECK_BETTER(v,r,c) IFMVCV(r,c,{thismse = DIST(r,c); if((v = (MVC(r,c)+thismse)) < besterr) { besterr = v; br=r; bc=c; *distortion = thismse; *sse1 = sse; }}, v=INT_MAX;)// checks if (r,c) has better score than previous best
 #define MIN(x,y) (((x)<(y))?(x):(y))
 #define MAX(x,y) (((x)>(y))?(x):(y))
 
-//#define CHECK_BETTER(v,r,c) if((v = ERR(r,c)) < besterr) { besterr = v; br=r; bc=c; }
-
 int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
                                              int_mv *bestmv, int_mv *ref_mv,
                                              int error_per_bit,
@@ -185,7 +191,6 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
                                              int *mvcost[2], int *distortion,
                                              unsigned int *sse1)
 {
-    unsigned char *y = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
     unsigned char *z = (*(b->base_src) + b->src);
 
     int rr = ref_mv->as_mv.row >> 1, rc = ref_mv->as_mv.col >> 1;
@@ -204,12 +209,38 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
     int minr = MAX(x->mv_row_min << 2, (ref_mv->as_mv.row >> 1) - ((1 << mvlong_width) - 1));
     int maxr = MIN(x->mv_row_max << 2, (ref_mv->as_mv.row >> 1) + ((1 << mvlong_width) - 1));
 
+    int y_stride;
+    int offset;
+
+#if ARCH_X86 || ARCH_X86_64
+    MACROBLOCKD *xd = &x->e_mbd;
+    unsigned char *y0 = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
+    unsigned char *y;
+    int buf_r1, buf_r2, buf_c1, buf_c2;
+
+    // Clamping to avoid out-of-range data access
+    buf_r1 = ((bestmv->as_mv.row - 3) < x->mv_row_min)?(bestmv->as_mv.row - x->mv_row_min):3;
+    buf_r2 = ((bestmv->as_mv.row + 3) > x->mv_row_max)?(x->mv_row_max - bestmv->as_mv.row):3;
+    buf_c1 = ((bestmv->as_mv.col - 3) < x->mv_col_min)?(bestmv->as_mv.col - x->mv_col_min):3;
+    buf_c2 = ((bestmv->as_mv.col + 3) > x->mv_col_max)?(x->mv_col_max - bestmv->as_mv.col):3;
+    y_stride = 32;
+
+    /* Copy to intermediate buffer before searching. */
+    vfp->copymem(y0 - buf_c1 - d->pre_stride*buf_r1, d->pre_stride, xd->y_buf, y_stride, 16+buf_r1+buf_r2);
+    y = xd->y_buf + y_stride*buf_r1 +buf_c1;
+#else
+    unsigned char *y = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
+    y_stride = d->pre_stride;
+#endif
+
+    offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
+
     // central mv
     bestmv->as_mv.row <<= 3;
     bestmv->as_mv.col <<= 3;
 
     // calculate central point error
-    besterr = vfp->vf(y, d->pre_stride, z, b->src_stride, sse1);
+    besterr = vfp->vf(y, y_stride, z, b->src_stride, sse1);
     *distortion = besterr;
     besterr += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
 
@@ -296,6 +327,7 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
 #undef PRE
 #undef SP
 #undef DIST
+#undef IFMVCV
 #undef ERR
 #undef CHECK_BETTER
 #undef MIN
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 5fda5c791..de045b48e 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -879,6 +879,10 @@ void vp8_set_speed_features(VP8_COMP *cpi)
 
             sf->improved_quant = 0;
             sf->improved_dct = 0;
+
+            sf->use_fastquant_for_pick = 1;
+            sf->no_skip_block4x4_search = 0;
+            sf->first_step = 1;
         }
 
         if (Speed > 1)
@@ -1237,7 +1241,7 @@ static void alloc_raw_frame_buffers(VP8_COMP *cpi)
 #if VP8_TEMPORAL_ALT_REF
 
     if (vp8_yv12_alloc_frame_buffer(&cpi->alt_ref_buffer,
-                                    width, height, 16))
+                                    width, height, VP8BORDERINPIXELS))
         vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                            "Failed to allocate altref buffer");
 
@@ -1287,7 +1291,8 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi)
         vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                            "Failed to allocate last frame buffer");
 
-    if (vp8_yv12_alloc_frame_buffer(&cpi->scaled_source, width, height, 16))
+    if (vp8_yv12_alloc_frame_buffer(&cpi->scaled_source,
+                                    width, height, VP8BORDERINPIXELS))
         vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                            "Failed to allocate scaled source buffer");
 
@@ -2070,6 +2075,14 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
     cpi->fn_ptr[BLOCK_4X4].sdx8f          = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x8);
     cpi->fn_ptr[BLOCK_4X4].sdx4df         = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x4d);
 
+#if ARCH_X86 || ARCH_X86_64
+    cpi->fn_ptr[BLOCK_16X16].copymem        = VARIANCE_INVOKE(&cpi->rtcd.variance, copy32xn);
+    cpi->fn_ptr[BLOCK_16X8].copymem        = VARIANCE_INVOKE(&cpi->rtcd.variance, copy32xn);
+    cpi->fn_ptr[BLOCK_8X16].copymem        = VARIANCE_INVOKE(&cpi->rtcd.variance, copy32xn);
+    cpi->fn_ptr[BLOCK_8X8].copymem        = VARIANCE_INVOKE(&cpi->rtcd.variance, copy32xn);
+    cpi->fn_ptr[BLOCK_4X4].copymem        = VARIANCE_INVOKE(&cpi->rtcd.variance, copy32xn);
+#endif
+
     cpi->full_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, full_search);
     cpi->diamond_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, diamond_search);
     cpi->refining_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, refining_search);
diff --git a/vp8/encoder/sad_c.c b/vp8/encoder/sad_c.c
index 5eaca5935..c734458a9 100644
--- a/vp8/encoder/sad_c.c
+++ b/vp8/encoder/sad_c.c
@@ -10,6 +10,8 @@
 
 
 #include <stdlib.h>
+#include "vpx_ports/config.h"
+#include "vpx/vpx_integer.h"
 
 unsigned int vp8_sad16x16_c(
     const unsigned char *src_ptr,
@@ -337,3 +339,64 @@ void vp8_sad4x4x4d_c(
     sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[2], ref_stride, 0x7fffffff);
     sad_array[3] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff);
 }
+
+/* Copy a strip 32 bytes wide and `height` rows tall from src_ptr to dst_ptr. */
+void vp8_copy32xn_c(
+    unsigned char *src_ptr,   /* first byte of the first source row (only read) */
+    int  src_stride,          /* added to src_ptr after every row */
+    unsigned char *dst_ptr,   /* first byte of the first destination row */
+    int  dst_stride,          /* added to dst_ptr after every row */
+    int height)               /* number of rows; nothing is copied when <= 0 */
+{
+    int r;
+
+    for (r = 0; r < height; r++)
+    {
+#if !(CONFIG_FAST_UNALIGNED)
+        dst_ptr[0] = src_ptr[0];   /* portable path: copy the row byte by byte */
+        dst_ptr[1] = src_ptr[1];
+        dst_ptr[2] = src_ptr[2];
+        dst_ptr[3] = src_ptr[3];
+        dst_ptr[4] = src_ptr[4];
+        dst_ptr[5] = src_ptr[5];
+        dst_ptr[6] = src_ptr[6];
+        dst_ptr[7] = src_ptr[7];
+        dst_ptr[8] = src_ptr[8];
+        dst_ptr[9] = src_ptr[9];
+        dst_ptr[10] = src_ptr[10];
+        dst_ptr[11] = src_ptr[11];
+        dst_ptr[12] = src_ptr[12];
+        dst_ptr[13] = src_ptr[13];
+        dst_ptr[14] = src_ptr[14];
+        dst_ptr[15] = src_ptr[15];
+        dst_ptr[16] = src_ptr[16];
+        dst_ptr[17] = src_ptr[17];
+        dst_ptr[18] = src_ptr[18];
+        dst_ptr[19] = src_ptr[19];
+        dst_ptr[20] = src_ptr[20];
+        dst_ptr[21] = src_ptr[21];
+        dst_ptr[22] = src_ptr[22];
+        dst_ptr[23] = src_ptr[23];
+        dst_ptr[24] = src_ptr[24];
+        dst_ptr[25] = src_ptr[25];
+        dst_ptr[26] = src_ptr[26];
+        dst_ptr[27] = src_ptr[27];
+        dst_ptr[28] = src_ptr[28];
+        dst_ptr[29] = src_ptr[29];
+        dst_ptr[30] = src_ptr[30];
+        dst_ptr[31] = src_ptr[31];   /* last of the 32 bytes in this row */
+#else
+        ((uint32_t *)dst_ptr)[0] = ((uint32_t *)src_ptr)[0] ;   /* same 32 bytes moved as eight 32-bit words */
+        ((uint32_t *)dst_ptr)[1] = ((uint32_t *)src_ptr)[1] ;   /* NOTE(review): nothing here guarantees 4-byte alignment */
+        ((uint32_t *)dst_ptr)[2] = ((uint32_t *)src_ptr)[2] ;   /* of either pointer, and the casts type-pun unsigned char */
+        ((uint32_t *)dst_ptr)[3] = ((uint32_t *)src_ptr)[3] ;   /* storage; presumably CONFIG_FAST_UNALIGNED is only set */
+        ((uint32_t *)dst_ptr)[4] = ((uint32_t *)src_ptr)[4] ;   /* where that is tolerated - confirm. */
+        ((uint32_t *)dst_ptr)[5] = ((uint32_t *)src_ptr)[5] ;
+        ((uint32_t *)dst_ptr)[6] = ((uint32_t *)src_ptr)[6] ;
+        ((uint32_t *)dst_ptr)[7] = ((uint32_t *)src_ptr)[7] ;
+#endif
+        src_ptr += src_stride;   /* step both pointers to the next row */
+        dst_ptr += dst_stride;
+
+    }
+}
diff --git a/vp8/encoder/variance.h b/vp8/encoder/variance.h
index 894b4f9e4..5fd6d3ae0 100644
--- a/vp8/encoder/variance.h
+++ b/vp8/encoder/variance.h
@@ -222,6 +222,13 @@ extern prototype_sad_multi_dif_address(vp8_variance_sad8x16x4d);
 #endif
 extern prototype_sad_multi_dif_address(vp8_variance_sad4x4x4d);
 
+#if ARCH_X86 || ARCH_X86_64
+#ifndef vp8_variance_copy32xn
+#define vp8_variance_copy32xn vp8_copy32xn_c
+#endif
+extern prototype_sad(vp8_variance_copy32xn);
+#endif
+
 //-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
 
 #ifndef vp8_variance_var4x4
@@ -381,6 +388,10 @@ typedef struct
     vp8_sad_multi_d_fn_t     sad8x8x4d;
     vp8_sad_multi_d_fn_t     sad4x4x4d;
 
+#if ARCH_X86 || ARCH_X86_64
+    vp8_sad_fn_t             copy32xn;
+#endif
+
 #if CONFIG_INTERNAL_STATS
     vp8_ssimpf_fn_t          ssimpf_8x8;
     vp8_ssimpf_fn_t          ssimpf;
@@ -399,7 +410,9 @@ typedef struct
     vp8_sad_multi_fn_t      sdx3f;
     vp8_sad_multi1_fn_t     sdx8f;
     vp8_sad_multi_d_fn_t    sdx4df;
-
+#if ARCH_X86 || ARCH_X86_64
+    vp8_sad_fn_t            copymem;
+#endif
 } vp8_variance_fn_ptr_t;
 
 #if CONFIG_RUNTIME_CPU_DETECT
diff --git a/vp8/encoder/x86/sad_sse2.asm b/vp8/encoder/x86/sad_sse2.asm
index 04ee72f72..1011c9553 100644
--- a/vp8/encoder/x86/sad_sse2.asm
+++ b/vp8/encoder/x86/sad_sse2.asm
@@ -328,3 +328,83 @@ x16x8sad_wmt_early_exit:
     UNSHADOW_ARGS
     pop         rbp
     ret
+
+;void vp8_copy32xn_sse2(
+;    unsigned char *src_ptr,   ; source rows; read with movdqu, so any alignment
+;    int  src_stride,
+;    unsigned char *dst_ptr,   ; destination rows; written with movdqa (16-byte aligned)
+;    int  dst_stride,          ; must keep every destination row 16-byte aligned
+;    int height);              ; rows of 32 bytes to copy; see NOTE at the x4 loop
+global sym(vp8_copy32xn_sse2)
+sym(vp8_copy32xn_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;dst_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;dst_stride
+        movsxd          rcx,        dword ptr arg(4) ;height
+
+block_copy_sse2_loopx4:
+        movdqu          xmm0,       XMMWORD PTR [rsi]              ; NOTE: this loop is entered before height
+        movdqu          xmm1,       XMMWORD PTR [rsi + 16]         ; is tested, so 4 rows are always copied;
+        movdqu          xmm2,       XMMWORD PTR [rsi + rax]        ; callers must pass height >= 4.
+        movdqu          xmm3,       XMMWORD PTR [rsi + rax + 16]   ; xmm0-3 = source rows 0 and 1
+
+        lea             rsi,        [rsi+rax*2]   ; advance source by two rows
+
+        movdqu          xmm4,       XMMWORD PTR [rsi]              ; xmm4-7 = source rows 2 and 3
+        movdqu          xmm5,       XMMWORD PTR [rsi + 16]
+        movdqu          xmm6,       XMMWORD PTR [rsi + rax]
+        movdqu          xmm7,       XMMWORD PTR [rsi + rax + 16]
+
+        lea             rsi,    [rsi+rax*2]   ; source now points at the next group of rows
+
+        movdqa          XMMWORD PTR [rdi], xmm0              ; store rows 0 and 1 (aligned stores)
+        movdqa          XMMWORD PTR [rdi + 16], xmm1
+        movdqa          XMMWORD PTR [rdi + rdx], xmm2
+        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm3
+
+        lea             rdi,    [rdi+rdx*2]   ; advance destination by two rows
+
+        movdqa          XMMWORD PTR [rdi], xmm4              ; store rows 2 and 3
+        movdqa          XMMWORD PTR [rdi + 16], xmm5
+        movdqa          XMMWORD PTR [rdi + rdx], xmm6
+        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm7
+
+        lea             rdi,    [rdi+rdx*2]   ; destination now points at the next group of rows
+
+        sub             rcx,     4   ; four rows done
+        cmp             rcx,     4   ; at least four more rows left?
+        jge             block_copy_sse2_loopx4
+
+        cmp             rcx, 0       ; no leftover rows: skip the single-row loop
+        je              copy_is_done
+
+block_copy_sse2_loop:
+        movdqu          xmm0,       XMMWORD PTR [rsi]        ; copy one leftover row of 32 bytes
+        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
+        lea             rsi,    [rsi+rax]
+
+        movdqa          XMMWORD PTR [rdi], xmm0
+        movdqa          XMMWORD PTR [rdi + 16], xmm1
+        lea             rdi,    [rdi+rdx]
+
+        sub             rcx,     1   ; sets ZF once the last leftover row is done
+        jne             block_copy_sse2_loop
+
+copy_is_done:
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/vp8/encoder/x86/sad_sse3.asm b/vp8/encoder/x86/sad_sse3.asm
index 2dbcc7dc9..1c41c322a 100644
--- a/vp8/encoder/x86/sad_sse3.asm
+++ b/vp8/encoder/x86/sad_sse3.asm
@@ -20,6 +20,7 @@
   %define     ret_var       rbx
   %define     result_ptr    arg(4)
   %define     max_err       arg(4)
+  %define     height        dword ptr arg(4)
     push        rbp
     mov         rbp,        rsp
     push        rsi
@@ -42,6 +43,7 @@
     %define     ret_var     r11
     %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
     %define     max_err     [rsp+xmm_stack_space+8+4*8]
+    %define     height      [rsp+xmm_stack_space+8+4*8]
   %else
     %define     src_ptr     rdi
     %define     src_stride  rsi
@@ -51,6 +53,7 @@
     %define     ret_var     r10
     %define     result_ptr  r8
     %define     max_err     r8
+    %define     height      r8
   %endif
 %endif
 
@@ -65,6 +68,7 @@
   %define     ret_var
   %define     result_ptr
   %define     max_err
+  %define     height
 
 %if ABI_IS_32BIT
     pop         rbx
@@ -632,6 +636,67 @@ sym(vp8_sad16x16_sse3):
 
     STACK_FRAME_DESTROY_X3
 
+;void vp8_copy32xn_sse3(
+;    unsigned char *src_ptr,   ; source rows; read with movdqu, so any alignment
+;    int  src_stride,
+;    unsigned char *dst_ptr,   ; destination rows; written with movdqa (16-byte aligned)
+;    int  dst_stride,          ; must keep every destination row 16-byte aligned
+;    int height);              ; rows of 32 bytes to copy; see NOTE at the x4 loop
+global sym(vp8_copy32xn_sse3)
+sym(vp8_copy32xn_sse3):
+
+    STACK_FRAME_CREATE_X3
+
+block_copy_sse3_loopx4:
+        lea             end_ptr,    [src_ptr+src_stride*2]   ; end_ptr = source row 2 (scratch pointer)
+
+        movdqu          xmm0,       XMMWORD PTR [src_ptr]                    ; NOTE: this loop is entered before
+        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]               ; height is tested, so 4 rows are
+        movdqu          xmm2,       XMMWORD PTR [src_ptr + src_stride]       ; always copied; callers must pass
+        movdqu          xmm3,       XMMWORD PTR [src_ptr + src_stride + 16]  ; height >= 4.
+        movdqu          xmm4,       XMMWORD PTR [end_ptr]
+        movdqu          xmm5,       XMMWORD PTR [end_ptr + 16]
+        movdqu          xmm6,       XMMWORD PTR [end_ptr + src_stride]
+        movdqu          xmm7,       XMMWORD PTR [end_ptr + src_stride + 16]
+
+        lea             src_ptr,    [src_ptr+src_stride*4]   ; advance source by four rows
+
+        lea             end_ptr,    [ref_ptr+ref_stride*2]   ; end_ptr = destination row 2
+
+        movdqa          XMMWORD PTR [ref_ptr], xmm0                    ; ref_ptr/ref_stride act as the destination
+        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1               ; pointer and stride in this function;
+        movdqa          XMMWORD PTR [ref_ptr + ref_stride], xmm2       ; presumably they map to dst_ptr/dst_stride
+        movdqa          XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3  ; via the stack-frame macro - confirm.
+        movdqa          XMMWORD PTR [end_ptr], xmm4
+        movdqa          XMMWORD PTR [end_ptr + 16], xmm5
+        movdqa          XMMWORD PTR [end_ptr + ref_stride], xmm6
+        movdqa          XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
+
+        lea             ref_ptr,    [ref_ptr+ref_stride*4]   ; advance destination by four rows
+
+        sub             height,     4   ; four rows done
+        cmp             height,     4   ; at least four more rows left?
+        jge             block_copy_sse3_loopx4
+
+        ;Check whether any leftover rows (fewer than 4) still need to be copied.
+        cmp             height, 0
+        je              copy_is_done
+
+block_copy_sse3_loop:
+        movdqu          xmm0,       XMMWORD PTR [src_ptr]        ; copy one leftover row of 32 bytes
+        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
+        lea             src_ptr,    [src_ptr+src_stride]
+
+        movdqa          XMMWORD PTR [ref_ptr], xmm0
+        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
+        lea             ref_ptr,    [ref_ptr+ref_stride]
+
+        sub             height,     1   ; sets ZF once the last leftover row is done
+        jne             block_copy_sse3_loop
+
+copy_is_done:
+    STACK_FRAME_DESTROY_X3
+
 ;void vp8_sad16x16x4d_sse3(
 ;    unsigned char *src_ptr,
 ;    int  src_stride,
@@ -892,3 +957,4 @@ sym(vp8_sad4x4x4d_sse3):
 
 
     STACK_FRAME_DESTROY_X4
+
diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h
index 0ee8eb7e5..af6c4d27e 100644
--- a/vp8/encoder/x86/variance_x86.h
+++ b/vp8/encoder/x86/variance_x86.h
@@ -121,6 +121,7 @@ extern prototype_sad(vp8_sad8x8_wmt);
 extern prototype_sad(vp8_sad8x16_wmt);
 extern prototype_sad(vp8_sad16x8_wmt);
 extern prototype_sad(vp8_sad16x16_wmt);
+extern prototype_sad(vp8_copy32xn_sse2);
 extern prototype_variance(vp8_variance4x4_wmt);
 extern prototype_variance(vp8_variance8x8_wmt);
 extern prototype_variance(vp8_variance8x16_wmt);
@@ -156,6 +157,9 @@ extern prototype_variance2(vp8_get16x16var_sse2);
 #undef  vp8_variance_sad16x16
 #define vp8_variance_sad16x16 vp8_sad16x16_wmt
 
+#undef  vp8_variance_copy32xn
+#define vp8_variance_copy32xn vp8_copy32xn_sse2
+
 #undef  vp8_variance_var4x4
 #define vp8_variance_var4x4 vp8_variance4x4_wmt
 
@@ -222,6 +226,7 @@ extern prototype_sad_multi_dif_address(vp8_sad16x8x4d_sse3);
 extern prototype_sad_multi_dif_address(vp8_sad8x16x4d_sse3);
 extern prototype_sad_multi_dif_address(vp8_sad8x8x4d_sse3);
 extern prototype_sad_multi_dif_address(vp8_sad4x4x4d_sse3);
+extern prototype_sad(vp8_copy32xn_sse3);
 
 #if !CONFIG_RUNTIME_CPU_DETECT
 
@@ -258,6 +263,9 @@ extern prototype_sad_multi_dif_address(vp8_sad4x4x4d_sse3);
 #undef  vp8_variance_sad4x4x4d
 #define vp8_variance_sad4x4x4d vp8_sad4x4x4d_sse3
 
+#undef  vp8_variance_copy32xn
+#define vp8_variance_copy32xn vp8_copy32xn_sse3
+
 #endif
 #endif
 
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index 9a324ec12..badb9f044 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -203,6 +203,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
         cpi->rtcd.variance.sad8x16               = vp8_sad8x16_wmt;
         cpi->rtcd.variance.sad8x8                = vp8_sad8x8_wmt;
         cpi->rtcd.variance.sad4x4                = vp8_sad4x4_wmt;
+        cpi->rtcd.variance.copy32xn              = vp8_copy32xn_sse2;
 
         cpi->rtcd.variance.var4x4                = vp8_variance4x4_wmt;
         cpi->rtcd.variance.var8x8                = vp8_variance8x8_wmt;
@@ -263,6 +264,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
         cpi->rtcd.variance.sad8x16x4d            = vp8_sad8x16x4d_sse3;
         cpi->rtcd.variance.sad8x8x4d             = vp8_sad8x8x4d_sse3;
         cpi->rtcd.variance.sad4x4x4d             = vp8_sad4x4x4d_sse3;
+        cpi->rtcd.variance.copy32xn              = vp8_copy32xn_sse3;
         cpi->rtcd.search.diamond_search          = vp8_diamond_search_sadx4;
         cpi->rtcd.search.refining_search         = vp8_refining_search_sadx4;
     }
diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index 58dc486de..13a072bff 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -301,6 +301,36 @@ update_error_state(vpx_codec_alg_priv_t                 *ctx,
     return res;
 }
 
+static void yuvconfig2image(vpx_image_t               *img,
+                            const YV12_BUFFER_CONFIG  *yv12,
+                            void                      *user_priv)
+{
+    /** Fill in every field of *img so that it describes the frame held in
+      * *yv12; the plane pointers alias yv12's buffers and no pixels are
+      * copied. Done by hand because vpx_img_wrap() cannot express independent
+      * Y/U/V strides or the other alignment adjustments of a YV12_BUFFER_CONFIG. */
+    img->fmt = yv12->clrtype == REG_YUV ?
+        VPX_IMG_FMT_I420 : VPX_IMG_FMT_VPXI420;
+    img->w = yv12->y_stride;   /* stored width is the luma stride, not the visible width */
+    img->h = (yv12->y_height + 2 * VP8BORDERINPIXELS + 15) & ~15;   /* height plus two borders, rounded up to a multiple of 16 */
+    img->d_w = yv12->y_width;    /* d_w / d_h carry the luma plane's own width and height */
+    img->d_h = yv12->y_height;
+    img->x_chroma_shift = 1;
+    img->y_chroma_shift = 1;
+    img->planes[VPX_PLANE_Y] = yv12->y_buffer;
+    img->planes[VPX_PLANE_U] = yv12->u_buffer;
+    img->planes[VPX_PLANE_V] = yv12->v_buffer;
+    img->planes[VPX_PLANE_ALPHA] = NULL;   /* no alpha plane is provided */
+    img->stride[VPX_PLANE_Y] = yv12->y_stride;
+    img->stride[VPX_PLANE_U] = yv12->uv_stride;   /* U and V share one stride */
+    img->stride[VPX_PLANE_V] = yv12->uv_stride;
+    img->stride[VPX_PLANE_ALPHA] = yv12->y_stride;   /* set although the alpha plane pointer is NULL */
+    img->bps = 12;
+    img->user_priv = user_priv;   /* caller-supplied pointer, stored unchanged */
+    img->img_data = yv12->buffer_alloc;   /* base of yv12's allocation, not the first visible pixel */
+    img->img_data_owner = 0;   /* presumably marks img as not owning img_data - confirm in vpx_image.h */
+    img->self_allocd = 0;
+}
 
 static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
                                   const uint8_t         *data,
@@ -429,21 +459,8 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
 
         if (!res && 0 == vp8dx_get_raw_frame(ctx->pbi, &sd, &time_stamp, &time_end_stamp, &flags))
         {
-            /* Align width/height */
-            unsigned int a_w = (sd.y_width + 15) & ~15;
-            unsigned int a_h = (sd.y_height + 15) & ~15;
-
-            vpx_img_wrap(&ctx->img, VPX_IMG_FMT_I420,
-                         a_w + 2 * VP8BORDERINPIXELS,
-                         a_h + 2 * VP8BORDERINPIXELS,
-                         1,
-                         sd.buffer_alloc);
-            vpx_img_set_rect(&ctx->img,
-                             VP8BORDERINPIXELS, VP8BORDERINPIXELS,
-                             sd.y_width, sd.y_height);
-            ctx->img.user_priv = user_priv;
+            yuvconfig2image(&ctx->img, &sd, user_priv);
             ctx->img_avail = 1;
-
         }
     }
 
diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c
index d02cde28f..eff594e2d 100644
--- a/vpx_scale/generic/yv12config.c
+++ b/vpx_scale/generic/yv12config.c
@@ -49,25 +49,33 @@ vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int
 
     if (ybf)
     {
+        int y_stride = ((width + 2 * border) + 31) & ~31;
+        int yplane_size = (height + 2 * border) * y_stride;
         int uv_width = width >> 1;
         int uv_height = height >> 1;
-        int yplane_size = (height + 2 * border) * (width + 2 * border);
-        int uvplane_size = (uv_height + border) * (uv_width + border);
+        /** There is currently a bunch of code which assumes
+          *  uv_stride == y_stride/2, so enforce this here. */
+        int uv_stride = y_stride >> 1;
+        int uvplane_size = (uv_height + border) * uv_stride;
 
         vp8_yv12_de_alloc_frame_buffer(ybf);
 
-        /* only support allocating buffers that have
-          a height and width that are multiples of 16 */
-        if ((width & 0xf) | (height & 0xf))
+        /** Only support allocating buffers that have a height and width that
+          *  are multiples of 16, and a border that's a multiple of 32.
+          * The border restriction is required to get 16-byte alignment of the
+          *  start of the chroma rows without introducing an arbitrary gap
+          *  between planes, which would break the semantics of things like
+          *  vpx_img_set_rect(). */
+        if ((width & 0xf) | (height & 0xf) | (border & 0x1f))
             return -3;
 
         ybf->y_width  = width;
         ybf->y_height = height;
-        ybf->y_stride = width + 2 * border;
+        ybf->y_stride = y_stride;
 
         ybf->uv_width = uv_width;
         ybf->uv_height = uv_height;
-        ybf->uv_stride = uv_width + border;
+        ybf->uv_stride = uv_stride;
 
         ybf->border = border;
         ybf->frame_size = yplane_size + 2 * uvplane_size;
@@ -77,9 +85,9 @@ vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int
         if (ybf->buffer_alloc == NULL)
             return -1;
 
-        ybf->y_buffer = ybf->buffer_alloc + (border * ybf->y_stride) + border;
-        ybf->u_buffer = ybf->buffer_alloc + yplane_size + (border / 2  * ybf->uv_stride) + border / 2;
-        ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size + (border / 2  * ybf->uv_stride) + border / 2;
+        ybf->y_buffer = ybf->buffer_alloc + (border * y_stride) + border;
+        ybf->u_buffer = ybf->buffer_alloc + yplane_size + (border / 2  * uv_stride) + border / 2;
+        ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size + (border / 2  * uv_stride) + border / 2;
 
         ybf->corrupted = 0; /* assume not currupted by errors */
     }