path: root/vp8/encoder/x86
Diffstat (limited to 'vp8/encoder/x86')
-rw-r--r--  vp8/encoder/x86/csystemdependent.c      289
-rw-r--r--  vp8/encoder/x86/dct_mmx.asm             846
-rw-r--r--  vp8/encoder/x86/dct_sse2.asm            260
-rw-r--r--  vp8/encoder/x86/dct_x86.h                73
-rw-r--r--  vp8/encoder/x86/encodemb_x86.h           73
-rw-r--r--  vp8/encoder/x86/encodeopt.asm           393
-rw-r--r--  vp8/encoder/x86/fwalsh_sse2.asm         117
-rw-r--r--  vp8/encoder/x86/mcomp_x86.h              27
-rw-r--r--  vp8/encoder/x86/preproc_mmx.c           297
-rw-r--r--  vp8/encoder/x86/quantize_mmx.asm        438
-rw-r--r--  vp8/encoder/x86/sad_mmx.asm             428
-rw-r--r--  vp8/encoder/x86/sad_sse2.asm            329
-rw-r--r--  vp8/encoder/x86/sad_sse3.asm            939
-rw-r--r--  vp8/encoder/x86/sad_ssse3.asm           367
-rw-r--r--  vp8/encoder/x86/subtract_mmx.asm        431
-rw-r--r--  vp8/encoder/x86/variance_impl_mmx.asm   980
-rw-r--r--  vp8/encoder/x86/variance_impl_sse2.asm  975
-rw-r--r--  vp8/encoder/x86/variance_mmx.c          596
-rw-r--r--  vp8/encoder/x86/variance_sse2.c         514
-rw-r--r--  vp8/encoder/x86/variance_x86.h          275
-rw-r--r--  vp8/encoder/x86/x86_csystemdependent.c  287
21 files changed, 8934 insertions, 0 deletions
diff --git a/vp8/encoder/x86/csystemdependent.c b/vp8/encoder/x86/csystemdependent.c
new file mode 100644
index 000000000..186ee6856
--- /dev/null
+++ b/vp8/encoder/x86/csystemdependent.c
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "variance.h"
+#include "onyx_int.h"
+
+SADFunction *vp8_sad16x16;
+SADFunction *vp8_sad16x8;
+SADFunction *vp8_sad8x16;
+SADFunction *vp8_sad8x8;
+SADFunction *vp8_sad4x4;
+
+variance_function *vp8_variance4x4;
+variance_function *vp8_variance8x8;
+variance_function *vp8_variance8x16;
+variance_function *vp8_variance16x8;
+variance_function *vp8_variance16x16;
+
+
+variance_function *vp8_mse16x16;
+
+sub_pixel_variance_function *vp8_sub_pixel_variance4x4;
+sub_pixel_variance_function *vp8_sub_pixel_variance8x8;
+sub_pixel_variance_function *vp8_sub_pixel_variance8x16;
+sub_pixel_variance_function *vp8_sub_pixel_variance16x8;
+sub_pixel_variance_function *vp8_sub_pixel_variance16x16;
+
+int (*vp8_block_error)(short *, short *);
+int (*vp8_mbblock_error)(MACROBLOCK *mb, int dc);
+void (*vp8_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, int stride);
+
+extern void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride);
+extern void vp8_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride);
+
+extern int vp8_block_error_c(short *, short *);
+extern int vp8_mbblock_error_c(MACROBLOCK *x, int dc);
+
+extern int vp8_block_error_mmx(short *, short *);
+extern int vp8_mbblock_error_mmx(MACROBLOCK *x, int dc);
+
+extern int vp8_block_error_xmm(short *, short *);
+extern int vp8_mbblock_error_xmm(MACROBLOCK *x, int dc);
+
+
+
+int (*vp8_mbuverror)(MACROBLOCK *mb);
+unsigned int (*vp8_get_mb_ss)(short *);
+void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
+void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
+void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
+void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);
+
+void (*vp8_subtract_b)(BLOCK *be, BLOCKD *bd, int pitch);
+void (*vp8_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
+unsigned int (*vp8_get16x16pred_error)(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
+unsigned int (*vp8_get8x8var)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+unsigned int (*vp8_get16x16var)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
+
+// c imports
+extern int vp8_mbuverror_c(MACROBLOCK *mb);
+extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+extern void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
+extern void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
+extern void vp8_fast_fdct4x4_c(short *input, short *output, int pitch);
+extern void vp8_fast_fdct8x4_c(short *input, short *output, int pitch);
+
+
+extern void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch);
+extern void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
+
+extern SADFunction vp8_sad16x16_c;
+extern SADFunction vp8_sad16x8_c;
+extern SADFunction vp8_sad8x16_c;
+extern SADFunction vp8_sad8x8_c;
+extern SADFunction vp8_sad4x4_c;
+
+extern SADFunction vp8_sad16x16_wmt;
+extern SADFunction vp8_sad16x8_wmt;
+extern SADFunction vp8_sad8x16_wmt;
+extern SADFunction vp8_sad8x8_wmt;
+extern SADFunction vp8_sad4x4_wmt;
+
+extern SADFunction vp8_sad16x16_mmx;
+extern SADFunction vp8_sad16x8_mmx;
+extern SADFunction vp8_sad8x16_mmx;
+extern SADFunction vp8_sad8x8_mmx;
+extern SADFunction vp8_sad4x4_mmx;
+
+extern variance_function vp8_variance16x16_c;
+extern variance_function vp8_variance8x16_c;
+extern variance_function vp8_variance16x8_c;
+extern variance_function vp8_variance8x8_c;
+extern variance_function vp8_variance4x4_c;
+extern variance_function vp8_mse16x16_c;
+
+extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_c;
+extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_c;
+extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_c;
+extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_c;
+extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_c;
+
+extern unsigned int vp8_get_mb_ss_c(short *);
+extern unsigned int vp8_get16x16pred_error_c(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
+extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+extern unsigned int vp8_get16x16var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+extern unsigned int vp8_get4x4sse_cs_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
+
+// mmx imports
+extern int vp8_mbuverror_mmx(MACROBLOCK *mb);
+extern void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d);
+extern void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch);
+extern void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+extern void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch);
+extern void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch);
+extern void vp8_fast_fdct8x4_mmx(short *input, short *output, int pitch);
+extern void vp8_fast_fdct4x4_mmx(short *input, short *output, int pitch);
+extern variance_function vp8_variance4x4_mmx;
+extern variance_function vp8_variance8x8_mmx;
+extern variance_function vp8_variance8x16_mmx;
+extern variance_function vp8_variance16x8_mmx;
+extern variance_function vp8_variance16x16_mmx;
+
+extern variance_function vp8_mse16x16_mmx;
+extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_mmx;
+extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_mmx;
+extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_mmx;
+extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_mmx;
+extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_mmx;
+
+extern unsigned int vp8_get16x16pred_error_mmx(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
+extern unsigned int vp8_get_mb_ss_mmx(short *);
+extern unsigned int vp8_get8x8var_mmx(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+extern unsigned int vp8_get16x16var_mmx(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+extern unsigned int vp8_get4x4sse_cs_mmx(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
+
+
+// wmt imports
+extern int vp8_mbuverror_xmm(MACROBLOCK *mb);
+extern void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d);
+extern void vp8_fast_fdct8x4_wmt(short *input, short *output, int pitch);
+extern variance_function vp8_variance4x4_wmt;
+extern variance_function vp8_variance8x8_wmt;
+extern variance_function vp8_variance8x16_wmt;
+extern variance_function vp8_variance16x8_wmt;
+extern variance_function vp8_variance16x16_wmt;
+
+extern variance_function vp8_mse16x16_wmt;
+extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_wmt;
+extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_wmt;
+extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_wmt;
+extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_wmt;
+extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_wmt;
+extern unsigned int vp8_get16x16pred_error_sse2(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
+extern unsigned int vp8_get_mb_ss_sse2(short *src_ptr);
+extern unsigned int vp8_get8x8var_sse2(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+extern unsigned int vp8_get16x16var_sse2(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+
+extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled);
+
+void vp8_cmachine_specific_config(void)
+{
+ int mmx_enabled;
+ int xmm_enabled;
+ int wmt_enabled;
+
+ vpx_get_processor_flags(&mmx_enabled, &xmm_enabled, &wmt_enabled);
+
+ if (wmt_enabled) // Willamette
+ {
+ // Willamette instruction set available:
+ vp8_mbuverror = vp8_mbuverror_xmm;
+ vp8_fast_quantize_b = vp8_fast_quantize_b_sse;
+ vp8_short_fdct4x4 = vp8_short_fdct4x4_mmx;
+ vp8_short_fdct8x4 = vp8_short_fdct8x4_mmx;
+ vp8_fast_fdct4x4 = vp8_fast_fdct4x4_mmx;
+ vp8_fast_fdct8x4 = vp8_fast_fdct8x4_wmt;
+ vp8_subtract_b = vp8_subtract_b_mmx;
+ vp8_subtract_mbuv = vp8_subtract_mbuv_mmx;
+ vp8_variance4x4 = vp8_variance4x4_mmx;
+ vp8_variance8x8 = vp8_variance8x8_mmx;
+ vp8_variance8x16 = vp8_variance8x16_wmt;
+ vp8_variance16x8 = vp8_variance16x8_wmt;
+ vp8_variance16x16 = vp8_variance16x16_wmt;
+ vp8_mse16x16 = vp8_mse16x16_wmt;
+ vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_wmt;
+ vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_wmt;
+ vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_wmt;
+ vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_wmt;
+ vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_wmt;
+ vp8_get_mb_ss = vp8_get_mb_ss_sse2;
+ vp8_get16x16pred_error = vp8_get16x16pred_error_sse2;
+ vp8_get8x8var = vp8_get8x8var_sse2;
+ vp8_get16x16var = vp8_get16x16var_sse2;
+ vp8_get4x4sse_cs = vp8_get4x4sse_cs_mmx;
+ vp8_sad16x16 = vp8_sad16x16_wmt;
+ vp8_sad16x8 = vp8_sad16x8_wmt;
+ vp8_sad8x16 = vp8_sad8x16_wmt;
+ vp8_sad8x8 = vp8_sad8x8_wmt;
+ vp8_sad4x4 = vp8_sad4x4_wmt;
+ vp8_block_error = vp8_block_error_xmm;
+ vp8_mbblock_error = vp8_mbblock_error_xmm;
+ vp8_subtract_mby = vp8_subtract_mby_mmx;
+
+ }
+ else if (mmx_enabled)
+ {
+ // MMX instruction set available:
+ vp8_mbuverror = vp8_mbuverror_mmx;
+ vp8_fast_quantize_b = vp8_fast_quantize_b_mmx;
+ vp8_short_fdct4x4 = vp8_short_fdct4x4_mmx;
+ vp8_short_fdct8x4 = vp8_short_fdct8x4_mmx;
+ vp8_fast_fdct4x4 = vp8_fast_fdct4x4_mmx;
+ vp8_fast_fdct8x4 = vp8_fast_fdct8x4_mmx;
+ vp8_subtract_b = vp8_subtract_b_mmx;
+ vp8_subtract_mbuv = vp8_subtract_mbuv_mmx;
+ vp8_variance4x4 = vp8_variance4x4_mmx;
+ vp8_variance8x8 = vp8_variance8x8_mmx;
+ vp8_variance8x16 = vp8_variance8x16_mmx;
+ vp8_variance16x8 = vp8_variance16x8_mmx;
+ vp8_variance16x16 = vp8_variance16x16_mmx;
+ vp8_mse16x16 = vp8_mse16x16_mmx;
+ vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_mmx;
+ vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_mmx;
+ vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_mmx;
+ vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_mmx;
+ vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_mmx;
+ vp8_get_mb_ss = vp8_get_mb_ss_mmx;
+ vp8_get16x16pred_error = vp8_get16x16pred_error_mmx;
+ vp8_get8x8var = vp8_get8x8var_mmx;
+ vp8_get16x16var = vp8_get16x16var_mmx;
+ vp8_get4x4sse_cs = vp8_get4x4sse_cs_mmx;
+ vp8_sad16x16 = vp8_sad16x16_mmx;
+ vp8_sad16x8 = vp8_sad16x8_mmx;
+ vp8_sad8x16 = vp8_sad8x16_mmx;
+ vp8_sad8x8 = vp8_sad8x8_mmx;
+ vp8_sad4x4 = vp8_sad4x4_mmx;
+ vp8_block_error = vp8_block_error_mmx;
+ vp8_mbblock_error = vp8_mbblock_error_mmx;
+ vp8_subtract_mby = vp8_subtract_mby_mmx;
+
+ }
+ else
+ {
+ // Pure C:
+ vp8_mbuverror = vp8_mbuverror_c;
+ vp8_fast_quantize_b = vp8_fast_quantize_b_c;
+ vp8_short_fdct4x4 = vp8_short_fdct4x4_c;
+ vp8_short_fdct8x4 = vp8_short_fdct8x4_c;
+ vp8_fast_fdct4x4 = vp8_fast_fdct4x4_c;
+ vp8_fast_fdct8x4 = vp8_fast_fdct8x4_c;
+ vp8_subtract_b = vp8_subtract_b_c;
+ vp8_subtract_mbuv = vp8_subtract_mbuv_c;
+ vp8_variance4x4 = vp8_variance4x4_c;
+ vp8_variance8x8 = vp8_variance8x8_c;
+ vp8_variance8x16 = vp8_variance8x16_c;
+ vp8_variance16x8 = vp8_variance16x8_c;
+ vp8_variance16x16 = vp8_variance16x16_c;
+ vp8_mse16x16 = vp8_mse16x16_c;
+ vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_c;
+ vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_c;
+ vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_c;
+ vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_c;
+ vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_c;
+ vp8_get_mb_ss = vp8_get_mb_ss_c;
+ vp8_get16x16pred_error = vp8_get16x16pred_error_c;
+ vp8_get8x8var = vp8_get8x8var_c;
+ vp8_get16x16var = vp8_get16x16var_c;
+ vp8_get4x4sse_cs = vp8_get4x4sse_cs_c;
+ vp8_sad16x16 = vp8_sad16x16_c;
+ vp8_sad16x8 = vp8_sad16x8_c;
+ vp8_sad8x16 = vp8_sad8x16_c;
+ vp8_sad8x8 = vp8_sad8x8_c;
+ vp8_sad4x4 = vp8_sad4x4_c;
+ vp8_block_error = vp8_block_error_c;
+ vp8_mbblock_error = vp8_mbblock_error_c;
+ vp8_subtract_mby = vp8_subtract_mby_c;
+ }
+
+}
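
csystemdependent.c is the runtime-dispatch glue for the encoder: every hot primitive is a global function pointer, and vp8_cmachine_specific_config() points each one at the best implementation the CPU flags allow (note that even the Willamette/SSE2 tier keeps the MMX fdct, subtract and 4x4 variance routines, since no wider versions existed yet). A minimal C sketch of the same pattern, with illustrative names rather than the libvpx API:

typedef unsigned int (*my_sad_fn)(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride);

/* Portable reference version, equivalent in spirit to vp8_sad16x16_c. */
static unsigned int sad16x16_c(const unsigned char *src, int src_stride,
                               const unsigned char *ref, int ref_stride)
{
    unsigned int sad = 0;
    int r, c;

    for (r = 0; r < 16; r++)
        for (c = 0; c < 16; c++)
        {
            int d = src[r * src_stride + c] - ref[r * ref_stride + c];
            sad += (d < 0) ? -d : d;
        }

    return sad;
}

/* Global pointer, defaulted to the C version... */
static my_sad_fn my_sad16x16 = sad16x16_c;

/* ...and re-pointed once at init time, as vp8_cmachine_specific_config()
 * does for the real tables. */
static void pick_sad_16x16(int have_simd)
{
    if (have_simd)
    {
        /* my_sad16x16 = sad16x16_simd;  (hand-written assembly version) */
    }
    else
    {
        my_sad16x16 = sad16x16_c;
    }
}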
diff --git a/vp8/encoder/x86/dct_mmx.asm b/vp8/encoder/x86/dct_mmx.asm
new file mode 100644
index 000000000..e13423796
--- /dev/null
+++ b/vp8/encoder/x86/dct_mmx.asm
@@ -0,0 +1,846 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+section .text
+ global sym(vp8_short_fdct4x4_mmx)
+ global sym(vp8_fast_fdct4x4_mmx)
+ global sym(vp8_fast_fdct8x4_wmt)
+
+
+%define DCTCONSTANTSBITS (16)
+%define DCTROUNDINGVALUE (1<< (DCTCONSTANTSBITS-1))
+%define x_c1 (60547) ; cos(pi /8) * (1<<16)
+%define x_c2 (46341) ; cos(pi*2/8) * (1<<16)
+%define x_c3 (25080) ; cos(pi*3/8) * (1<<16)
+
+
+%define _1STSTAGESHIFT 14
+%define _2NDSTAGESHIFT 16
+
+; using matrix multiply; the source and destination buffers have a pitch
+;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
+sym(vp8_short_fdct4x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;input
+ mov rdi, arg(1) ;output
+
+ movsxd rax, dword ptr arg(2) ;pitch
+ lea rdx, [dct_matrix GLOBAL]
+
+ movq mm0, [rsi ]
+ movq mm1, [rsi + rax]
+
+ movq mm2, [rsi + rax*2]
+ lea rsi, [rsi + rax*2]
+
+ movq mm3, [rsi + rax]
+
+ ; first column
+ movq mm4, mm0
+ movq mm7, [rdx]
+
+ pmaddwd mm4, mm7
+ movq mm5, mm1
+
+ pmaddwd mm5, mm7
+ movq mm6, mm4
+
+ punpckldq mm4, mm5
+ punpckhdq mm6, mm5
+
+ paddd mm4, mm6
+ movq mm5, mm2
+
+
+ pmaddwd mm5, mm7
+ movq mm6, mm3
+
+ pmaddwd mm6, mm7
+ movq mm7, mm5
+
+ punpckldq mm5, mm6
+ punpckhdq mm7, mm6
+
+ paddd mm5, mm7
+ movq mm6, [dct1st_stage_rounding_mmx GLOBAL]
+
+ paddd mm4, mm6
+ paddd mm5, mm6
+
+ psrad mm4, _1STSTAGESHIFT
+ psrad mm5, _1STSTAGESHIFT
+
+ packssdw mm4, mm5
+ movq [rdi], mm4
+
+ ;second column
+ movq mm4, mm0
+
+ pmaddwd mm4, [rdx+8]
+ movq mm5, mm1
+
+ pmaddwd mm5, [rdx+8]
+ movq mm6, mm4
+
+ punpckldq mm4, mm5
+ punpckhdq mm6, mm5
+
+ paddd mm4, mm6
+ movq mm5, mm2
+
+ pmaddwd mm5, [rdx+8]
+ movq mm6, mm3
+
+ pmaddwd mm6, [rdx+8]
+ movq mm7, mm5
+
+ punpckldq mm5, mm6
+ punpckhdq mm7, mm6
+
+ paddd mm5, mm7
+ movq mm6, [dct1st_stage_rounding_mmx GLOBAL]
+
+ paddd mm4, mm6
+ paddd mm5, mm6
+
+ psrad mm4, _1STSTAGESHIFT
+ psrad mm5, _1STSTAGESHIFT
+
+ packssdw mm4, mm5
+ movq [rdi+8], mm4
+
+
+ ;third column
+ movq mm4, mm0
+
+ pmaddwd mm4, [rdx+16]
+ movq mm5, mm1
+
+ pmaddwd mm5, [rdx+16]
+ movq mm6, mm4
+
+ punpckldq mm4, mm5
+ punpckhdq mm6, mm5
+
+ paddd mm4, mm6
+ movq mm5, mm2
+
+ pmaddwd mm5, [rdx+16]
+ movq mm6, mm3
+
+ pmaddwd mm6, [rdx+16]
+ movq mm7, mm5
+
+ punpckldq mm5, mm6
+ punpckhdq mm7, mm6
+
+ paddd mm5, mm7
+ movq mm6, [dct1st_stage_rounding_mmx GLOBAL]
+
+ paddd mm4, mm6
+ paddd mm5, mm6
+
+ psrad mm4, _1STSTAGESHIFT
+ psrad mm5, _1STSTAGESHIFT
+
+ packssdw mm4, mm5
+ movq [rdi+16], mm4
+
+        ;fourth column (this is the last column, so we do not have to save the source any more)
+
+ pmaddwd mm0, [rdx+24]
+
+ pmaddwd mm1, [rdx+24]
+ movq mm6, mm0
+
+ punpckldq mm0, mm1
+ punpckhdq mm6, mm1
+
+ paddd mm0, mm6
+
+ pmaddwd mm2, [rdx+24]
+
+ pmaddwd mm3, [rdx+24]
+ movq mm7, mm2
+
+ punpckldq mm2, mm3
+ punpckhdq mm7, mm3
+
+ paddd mm2, mm7
+ movq mm6, [dct1st_stage_rounding_mmx GLOBAL]
+
+ paddd mm0, mm6
+ paddd mm2, mm6
+
+ psrad mm0, _1STSTAGESHIFT
+ psrad mm2, _1STSTAGESHIFT
+
+ packssdw mm0, mm2
+
+ movq mm3, mm0
+
+ ; done with one pass
+ ; now start second pass
+ movq mm0, [rdi ]
+ movq mm1, [rdi+ 8]
+ movq mm2, [rdi+ 16]
+
+ movq mm4, mm0
+
+ pmaddwd mm4, [rdx]
+ movq mm5, mm1
+
+ pmaddwd mm5, [rdx]
+ movq mm6, mm4
+
+ punpckldq mm4, mm5
+ punpckhdq mm6, mm5
+
+ paddd mm4, mm6
+ movq mm5, mm2
+
+ pmaddwd mm5, [rdx]
+ movq mm6, mm3
+
+ pmaddwd mm6, [rdx]
+ movq mm7, mm5
+
+ punpckldq mm5, mm6
+ punpckhdq mm7, mm6
+
+ paddd mm5, mm7
+ movq mm6, [dct2nd_stage_rounding_mmx GLOBAL]
+
+ paddd mm4, mm6
+ paddd mm5, mm6
+
+ psrad mm4, _2NDSTAGESHIFT
+ psrad mm5, _2NDSTAGESHIFT
+
+ packssdw mm4, mm5
+ movq [rdi], mm4
+
+ ;second column
+ movq mm4, mm0
+
+ pmaddwd mm4, [rdx+8]
+ movq mm5, mm1
+
+ pmaddwd mm5, [rdx+8]
+ movq mm6, mm4
+
+ punpckldq mm4, mm5
+ punpckhdq mm6, mm5
+
+ paddd mm4, mm6
+ movq mm5, mm2
+
+ pmaddwd mm5, [rdx+8]
+ movq mm6, mm3
+
+ pmaddwd mm6, [rdx+8]
+ movq mm7, mm5
+
+ punpckldq mm5, mm6
+ punpckhdq mm7, mm6
+
+ paddd mm5, mm7
+ movq mm6, [dct2nd_stage_rounding_mmx GLOBAL]
+
+ paddd mm4, mm6
+ paddd mm5, mm6
+
+ psrad mm4, _2NDSTAGESHIFT
+ psrad mm5, _2NDSTAGESHIFT
+
+ packssdw mm4, mm5
+ movq [rdi+8], mm4
+
+
+ ;third column
+ movq mm4, mm0
+
+ pmaddwd mm4, [rdx+16]
+ movq mm5, mm1
+
+ pmaddwd mm5, [rdx+16]
+ movq mm6, mm4
+
+ punpckldq mm4, mm5
+ punpckhdq mm6, mm5
+
+ paddd mm4, mm6
+ movq mm5, mm2
+
+ pmaddwd mm5, [rdx+16]
+ movq mm6, mm3
+
+ pmaddwd mm6, [rdx+16]
+ movq mm7, mm5
+
+ punpckldq mm5, mm6
+ punpckhdq mm7, mm6
+
+ paddd mm5, mm7
+ movq mm6, [dct2nd_stage_rounding_mmx GLOBAL]
+
+ paddd mm4, mm6
+ paddd mm5, mm6
+
+ psrad mm4, _2NDSTAGESHIFT
+ psrad mm5, _2NDSTAGESHIFT
+
+ packssdw mm4, mm5
+ movq [rdi+16], mm4
+
+ ;fourth column
+ movq mm4, mm0
+
+ pmaddwd mm4, [rdx+24]
+ movq mm5, mm1
+
+ pmaddwd mm5, [rdx+24]
+ movq mm6, mm4
+
+ punpckldq mm4, mm5
+ punpckhdq mm6, mm5
+
+ paddd mm4, mm6
+ movq mm5, mm2
+
+ pmaddwd mm5, [rdx+24]
+ movq mm6, mm3
+
+ pmaddwd mm6, [rdx+24]
+ movq mm7, mm5
+
+ punpckldq mm5, mm6
+ punpckhdq mm7, mm6
+
+ paddd mm5, mm7
+ movq mm6, [dct2nd_stage_rounding_mmx GLOBAL]
+
+ paddd mm4, mm6
+ paddd mm5, mm6
+
+ psrad mm4, _2NDSTAGESHIFT
+ psrad mm5, _2NDSTAGESHIFT
+
+ packssdw mm4, mm5
+ movq [rdi+24], mm4
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_fast_fdct4x4_mmx(short *input, short *output, int pitch)
+sym(vp8_fast_fdct4x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+ mov rsi, arg(0) ;input
+ mov rdi, arg(1) ;output
+
+ lea rdx, [dct_const_mmx GLOBAL]
+ movsxd rax, dword ptr arg(2) ;pitch
+
+ lea rcx, [rsi + rax*2]
+ ; read the input data
+ movq mm0, [rsi]
+ movq mm1, [rsi + rax ]
+
+ movq mm2, [rcx]
+ movq mm3, [rcx + rax]
+ ; get the constants
+        ; shift left by 1 for precision
+ paddw mm0, mm0
+ paddw mm1, mm1
+
+ psllw mm2, 1
+ psllw mm3, 1
+
+ ; transpose for the second stage
+ movq mm4, mm0 ; 00 01 02 03
+        movq        mm5, mm2            ; 20 21 22 23
+
+ punpcklwd mm0, mm1 ; 00 10 01 11
+ punpckhwd mm4, mm1 ; 02 12 03 13
+
+ punpcklwd mm2, mm3 ; 20 30 21 31
+ punpckhwd mm5, mm3 ; 22 32 23 33
+
+
+ movq mm1, mm0 ; 00 10 01 11
+ punpckldq mm0, mm2 ; 00 10 20 30
+
+ punpckhdq mm1, mm2 ; 01 11 21 31
+
+ movq mm2, mm4 ; 02 12 03 13
+ punpckldq mm2, mm5 ; 02 12 22 32
+
+ punpckhdq mm4, mm5 ; 03 13 23 33
+ movq mm3, mm4
+
+
+ ; first stage
+ movq mm5, mm0
+ movq mm4, mm1
+
+ paddw mm0, mm3 ; a = 0 + 3
+ paddw mm1, mm2 ; b = 1 + 2
+
+ psubw mm4, mm2 ; c = 1 - 2
+ psubw mm5, mm3 ; d = 0 - 3
+
+
+ ; output 0 and 2
+ movq mm6, [rdx + 16] ; c2
+ movq mm2, mm0 ; a
+
+ paddw mm0, mm1 ; a + b
+ psubw mm2, mm1 ; a - b
+
+ movq mm1, mm0 ; a + b
+ pmulhw mm0, mm6 ; 00 01 02 03
+
+ paddw mm0, mm1 ; output 00 01 02 03
+ pmulhw mm6, mm2 ; 20 21 22 23
+
+ paddw mm2, mm6 ; output 20 21 22 23
+
+ ; output 1 and 3
+ movq mm6, [rdx + 8] ; c1
+ movq mm7, [rdx + 24] ; c3
+
+ movq mm1, mm4 ; c
+ movq mm3, mm5 ; d
+
+ pmulhw mm1, mm7 ; c * c3
+ pmulhw mm3, mm6 ; d * c1
+
+ paddw mm3, mm5 ; d * c1 rounded
+ paddw mm1, mm3 ; output 10 11 12 13
+
+ movq mm3, mm4 ; c
+ pmulhw mm5, mm7 ; d * c3
+
+ pmulhw mm4, mm6 ; c * c1
+ paddw mm3, mm4 ; round c* c1
+
+ psubw mm5, mm3 ; output 30 31 32 33
+ movq mm3, mm5
+
+
+ ; done with vertical
+ ; transpose for the second stage
+ movq mm4, mm0 ; 00 01 02 03
+        movq        mm5, mm2            ; 20 21 22 23
+
+ punpcklwd mm0, mm1 ; 00 10 01 11
+ punpckhwd mm4, mm1 ; 02 12 03 13
+
+ punpcklwd mm2, mm3 ; 20 30 21 31
+ punpckhwd mm5, mm3 ; 22 32 23 33
+
+
+ movq mm1, mm0 ; 00 10 01 11
+ punpckldq mm0, mm2 ; 00 10 20 30
+
+ punpckhdq mm1, mm2 ; 01 11 21 31
+
+ movq mm2, mm4 ; 02 12 03 13
+ punpckldq mm2, mm5 ; 02 12 22 32
+
+ punpckhdq mm4, mm5 ; 03 13 23 33
+ movq mm3, mm4
+
+
+ ; first stage
+ movq mm5, mm0
+ movq mm4, mm1
+
+ paddw mm0, mm3 ; a = 0 + 3
+ paddw mm1, mm2 ; b = 1 + 2
+
+ psubw mm4, mm2 ; c = 1 - 2
+ psubw mm5, mm3 ; d = 0 - 3
+
+
+ ; output 0 and 2
+ movq mm6, [rdx + 16] ; c2
+ movq mm2, mm0 ; a
+ paddw mm0, mm1 ; a + b
+
+ psubw mm2, mm1 ; a - b
+
+ movq mm1, mm0 ; a + b
+ pmulhw mm0, mm6 ; 00 01 02 03
+
+ paddw mm0, mm1 ; output 00 01 02 03
+ pmulhw mm6, mm2 ; 20 21 22 23
+
+ paddw mm2, mm6 ; output 20 21 22 23
+
+
+ ; output 1 and 3
+ movq mm6, [rdx + 8] ; c1
+ movq mm7, [rdx + 24] ; c3
+
+ movq mm1, mm4 ; c
+ movq mm3, mm5 ; d
+
+ pmulhw mm1, mm7 ; c * c3
+ pmulhw mm3, mm6 ; d * c1
+
+ paddw mm3, mm5 ; d * c1 rounded
+ paddw mm1, mm3 ; output 10 11 12 13
+
+ movq mm3, mm4 ; c
+ pmulhw mm5, mm7 ; d * c3
+
+ pmulhw mm4, mm6 ; c * c1
+ paddw mm3, mm4 ; round c* c1
+
+ psubw mm5, mm3 ; output 30 31 32 33
+ movq mm3, mm5
+ ; done with vertical
+
+ pcmpeqw mm4, mm4
+ pcmpeqw mm5, mm5
+ psrlw mm4, 15
+ psrlw mm5, 15
+
+ paddw mm0, mm4
+ paddw mm1, mm5
+ paddw mm2, mm4
+ paddw mm3, mm5
+
+ psraw mm0, 1
+ psraw mm1, 1
+ psraw mm2, 1
+ psraw mm3, 1
+
+ movq [rdi ], mm0
+ movq [rdi+ 8], mm1
+ movq [rdi+16], mm2
+ movq [rdi+24], mm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_fast_fdct8x4_wmt(short *input, short *output, int pitch)
+sym(vp8_fast_fdct8x4_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+ mov rsi, arg(0) ;input
+ mov rdi, arg(1) ;output
+
+ lea rdx, [dct_const_xmm GLOBAL]
+ movsxd rax, dword ptr arg(2) ;pitch
+
+ lea rcx, [rsi + rax*2]
+ ; read the input data
+ movdqa xmm0, [rsi]
+ movdqa xmm2, [rsi + rax]
+
+ movdqa xmm4, [rcx]
+ movdqa xmm3, [rcx + rax]
+ ; get the constants
+        ; shift left by 1 for precision
+ psllw xmm0, 1
+ psllw xmm2, 1
+
+ psllw xmm4, 1
+ psllw xmm3, 1
+
+ ; transpose for the second stage
+ movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07
+ movdqa xmm5, xmm4 ; 20 21 22 23 24 25 26 27
+
+ punpcklwd xmm0, xmm2 ; 00 10 01 11 02 12 03 13
+ punpckhwd xmm1, xmm2 ; 04 14 05 15 06 16 07 17
+
+ punpcklwd xmm4, xmm3 ; 20 30 21 31 22 32 23 33
+ punpckhwd xmm5, xmm3 ; 24 34 25 35 26 36 27 37
+
+ movdqa xmm2, xmm0 ; 00 10 01 11 02 12 03 13
+ punpckldq xmm0, xmm4 ; 00 10 20 30 01 11 21 31
+
+ punpckhdq xmm2, xmm4 ; 02 12 22 32 03 13 23 33
+
+
+ movdqa xmm4, xmm1 ; 04 14 05 15 06 16 07 17
+ punpckldq xmm4, xmm5 ; 04 14 24 34 05 15 25 35
+
+ punpckhdq xmm1, xmm5 ; 06 16 26 36 07 17 27 37
+ movdqa xmm3, xmm2 ; 02 12 22 32 03 13 23 33
+
+ punpckhqdq xmm3, xmm1 ; 03 13 23 33 07 17 27 37
+ punpcklqdq xmm2, xmm1 ; 02 12 22 32 06 16 26 36
+
+ movdqa xmm1, xmm0 ; 00 10 20 30 01 11 21 31
+ punpcklqdq xmm0, xmm4 ; 00 10 20 30 04 14 24 34
+
+        punpckhqdq  xmm1, xmm4          ; 01 11 21 31 05 15 25 35
+
+ ; xmm0 0
+ ; xmm1 1
+ ; xmm2 2
+ ; xmm3 3
+
+ ; first stage
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm1
+
+ paddw xmm0, xmm3 ; a = 0 + 3
+ paddw xmm1, xmm2 ; b = 1 + 2
+
+ psubw xmm4, xmm2 ; c = 1 - 2
+ psubw xmm5, xmm3 ; d = 0 - 3
+
+
+ ; output 0 and 2
+ movdqa xmm6, [rdx + 32] ; c2
+ movdqa xmm2, xmm0 ; a
+
+ paddw xmm0, xmm1 ; a + b
+ psubw xmm2, xmm1 ; a - b
+
+ movdqa xmm1, xmm0 ; a + b
+ pmulhw xmm0, xmm6 ; 00 01 02 03
+
+ paddw xmm0, xmm1 ; output 00 01 02 03
+ pmulhw xmm6, xmm2 ; 20 21 22 23
+
+ paddw xmm2, xmm6 ; output 20 21 22 23
+
+ ; output 1 and 3
+ movdqa xmm6, [rdx + 16] ; c1
+ movdqa xmm7, [rdx + 48] ; c3
+
+ movdqa xmm1, xmm4 ; c
+ movdqa xmm3, xmm5 ; d
+
+ pmulhw xmm1, xmm7 ; c * c3
+ pmulhw xmm3, xmm6 ; d * c1
+
+ paddw xmm3, xmm5 ; d * c1 rounded
+ paddw xmm1, xmm3 ; output 10 11 12 13
+
+ movdqa xmm3, xmm4 ; c
+ pmulhw xmm5, xmm7 ; d * c3
+
+ pmulhw xmm4, xmm6 ; c * c1
+ paddw xmm3, xmm4 ; round c* c1
+
+ psubw xmm5, xmm3 ; output 30 31 32 33
+ movdqa xmm3, xmm5
+
+
+ ; done with vertical
+ ; transpose for the second stage
+ movdqa xmm4, xmm2 ; 02 12 22 32 06 16 26 36
+ movdqa xmm2, xmm1 ; 01 11 21 31 05 15 25 35
+
+ movdqa xmm1, xmm0 ; 00 10 20 30 04 14 24 34
+ movdqa xmm5, xmm4 ; 02 12 22 32 06 16 26 36
+
+ punpcklwd xmm0, xmm2 ; 00 01 10 11 20 21 30 31
+ punpckhwd xmm1, xmm2 ; 04 05 14 15 24 25 34 35
+
+ punpcklwd xmm4, xmm3 ; 02 03 12 13 22 23 32 33
+ punpckhwd xmm5, xmm3 ; 06 07 16 17 26 27 36 37
+
+ movdqa xmm2, xmm0 ; 00 01 10 11 20 21 30 31
+ punpckldq xmm0, xmm4 ; 00 01 02 03 10 11 12 13
+
+ punpckhdq xmm2, xmm4 ; 20 21 22 23 30 31 32 33
+
+
+ movdqa xmm4, xmm1 ; 04 05 14 15 24 25 34 35
+ punpckldq xmm4, xmm5 ; 04 05 06 07 14 15 16 17
+
+ punpckhdq xmm1, xmm5 ; 24 25 26 27 34 35 36 37
+ movdqa xmm3, xmm2 ; 20 21 22 23 30 31 32 33
+
+ punpckhqdq xmm3, xmm1 ; 30 31 32 33 34 35 36 37
+ punpcklqdq xmm2, xmm1 ; 20 21 22 23 24 25 26 27
+
+ movdqa xmm1, xmm0 ; 00 01 02 03 10 11 12 13
+ punpcklqdq xmm0, xmm4 ; 00 01 02 03 04 05 06 07
+
+ punpckhqdq xmm1, xmm4 ; 10 11 12 13 14 15 16 17
+
+ ; first stage
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm1
+
+ paddw xmm0, xmm3 ; a = 0 + 3
+ paddw xmm1, xmm2 ; b = 1 + 2
+
+ psubw xmm4, xmm2 ; c = 1 - 2
+ psubw xmm5, xmm3 ; d = 0 - 3
+
+
+ ; output 0 and 2
+ movdqa xmm6, [rdx + 32] ; c2
+ movdqa xmm2, xmm0 ; a
+
+ paddw xmm0, xmm1 ; a + b
+ psubw xmm2, xmm1 ; a - b
+
+ movdqa xmm1, xmm0 ; a + b
+ pmulhw xmm0, xmm6 ; 00 01 02 03
+
+ paddw xmm0, xmm1 ; output 00 01 02 03
+ pmulhw xmm6, xmm2 ; 20 21 22 23
+
+ paddw xmm2, xmm6 ; output 20 21 22 23
+
+ ; output 1 and 3
+ movdqa xmm6, [rdx + 16] ; c1
+ movdqa xmm7, [rdx + 48] ; c3
+
+ movdqa xmm1, xmm4 ; c
+ movdqa xmm3, xmm5 ; d
+
+ pmulhw xmm1, xmm7 ; c * c3
+ pmulhw xmm3, xmm6 ; d * c1
+
+ paddw xmm3, xmm5 ; d * c1 rounded
+ paddw xmm1, xmm3 ; output 10 11 12 13
+
+ movdqa xmm3, xmm4 ; c
+ pmulhw xmm5, xmm7 ; d * c3
+
+ pmulhw xmm4, xmm6 ; c * c1
+ paddw xmm3, xmm4 ; round c* c1
+
+ psubw xmm5, xmm3 ; output 30 31 32 33
+ movdqa xmm3, xmm5
+ ; done with vertical
+
+
+ pcmpeqw xmm4, xmm4
+ pcmpeqw xmm5, xmm5;
+ psrlw xmm4, 15
+ psrlw xmm5, 15
+
+ paddw xmm0, xmm4
+ paddw xmm1, xmm5
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+
+ psraw xmm0, 1
+ psraw xmm1, 1
+ psraw xmm2, 1
+ psraw xmm3, 1
+
+ movq QWORD PTR[rdi ], xmm0
+ movq QWORD PTR[rdi+ 8], xmm1
+ movq QWORD PTR[rdi+16], xmm2
+ movq QWORD PTR[rdi+24], xmm3
+
+ psrldq xmm0, 8
+ psrldq xmm1, 8
+ psrldq xmm2, 8
+ psrldq xmm3, 8
+
+ movq QWORD PTR[rdi+32], xmm0
+ movq QWORD PTR[rdi+40], xmm1
+ movq QWORD PTR[rdi+48], xmm2
+ movq QWORD PTR[rdi+56], xmm3
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+;static const unsigned int dct1st_stage_rounding_mmx[2] =
+align 16
+dct1st_stage_rounding_mmx:
+ times 2 dd 8192
+
+
+;static const unsigned int dct2nd_stage_rounding_mmx[2] =
+align 16
+dct2nd_stage_rounding_mmx:
+ times 2 dd 32768
+
+
+;static const short dct_matrix[4][4]=
+align 16
+dct_matrix:
+ times 4 dw 23170
+
+ dw 30274
+ dw 12540
+ dw -12540
+ dw -30274
+
+ dw 23170
+ times 2 dw -23170
+ dw 23170
+
+ dw 12540
+ dw -30274
+ dw 30274
+ dw -12540
+
+
+;static const unsigned short dct_const_mmx[4 * 4]=
+align 16
+dct_const_mmx:
+ times 4 dw 0
+ times 4 dw 60547
+ times 4 dw 46341
+ times 4 dw 25080
+
+
+;static const unsigned short dct_const_xmm[8 * 4]=
+align 16
+dct_const_xmm:
+ times 8 dw 0
+ times 8 dw 60547
+ times 8 dw 46341
+ times 8 dw 25080
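
The fast fdct paths above keep their constants scaled by 2^16: 60547, 46341 and 25080 are round(cos(pi/8) * 65536), round(cos(pi/4) * 65536) and round(cos(3*pi/8) * 65536). pmulhw is a signed multiply returning the high 16 bits, so a constant at or above 32768 wraps to c - 65536; the paddw of the original operand that follows each such pmulhw restores the intended product. The final pcmpeqw/psrlw/paddw/psraw block then adds 1 and shifts right by one, rounding away the extra bit of precision gained by the initial doubling. A small C check of the multiply identity (my reading of the code, not libvpx source):

#include <stdio.h>

/* What pmulhw does per 16-bit lane: signed multiply, keep the high half
 * (assuming the compiler's right shift of a negative int is arithmetic,
 * which is what the hardware instruction effectively does). */
static short mulhi(short a, short b)
{
    return (short)(((int)a * (int)b) >> 16);
}

int main(void)
{
    const int c2 = 46341;                    /* round(cos(pi/4) * 65536)          */
    short x = 2 * 100;                       /* inputs are pre-doubled in the asm */
    short wrapped = (short)(c2 - 65536);     /* the 16-bit word the table holds   */

    /* pmulhw with the wrapped constant, then paddw of the original value,
     * approximates (x * c2) >> 16. */
    short approx = (short)(mulhi(x, wrapped) + x);

    printf("approx=%d exact=%d\n", approx, (x * c2) >> 16);
    return 0;
}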
diff --git a/vp8/encoder/x86/dct_sse2.asm b/vp8/encoder/x86/dct_sse2.asm
new file mode 100644
index 000000000..3e5e9a70c
--- /dev/null
+++ b/vp8/encoder/x86/dct_sse2.asm
@@ -0,0 +1,260 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+global sym(vp8_short_fdct4x4_wmt)
+
+%define DCTCONSTANTSBITS (16)
+%define DCTROUNDINGVALUE (1<< (DCTCONSTANTSBITS-1))
+%define x_c1 (60547) ; cos(pi /8) * (1<<16)
+%define x_c2 (46341) ; cos(pi*2/8) * (1<<16)
+%define x_c3 (25080) ; cos(pi*3/8) * (1<<16)
+
+%define _1STSTAGESHIFT 14
+%define _2NDSTAGESHIFT 16
+
+
+;; using matrix multiply
+;void vp8_short_fdct4x4_wmt(short *input, short *output)
+sym(vp8_short_fdct4x4_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ GET_GOT rbx
+ ; end prolog
+
+ mov rax, arg(0) ;input
+ mov rcx, arg(1) ;output
+
+ lea rdx, [dct_matrix_sse2 GLOBAL]
+
+ movdqu xmm0, [rax ]
+ movdqu xmm1, [rax+16]
+
+ ; first column
+ movdqa xmm2, xmm0
+ movdqa xmm7, [rdx]
+
+ pmaddwd xmm2, xmm7
+ movdqa xmm3, xmm1
+
+ pmaddwd xmm3, xmm7
+ movdqa xmm4, xmm2
+
+ punpckldq xmm2, xmm3
+ punpckhdq xmm4, xmm3
+
+ movdqa xmm3, xmm2
+ punpckldq xmm2, xmm4
+
+ punpckhdq xmm3, xmm4
+ paddd xmm2, xmm3
+
+
+ paddd xmm2, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
+ psrad xmm2, _1STSTAGESHIFT
+ ;second column
+ movdqa xmm3, xmm0
+ pmaddwd xmm3, [rdx+16]
+
+ movdqa xmm4, xmm1
+ pmaddwd xmm4, [rdx+16]
+
+ movdqa xmm5, xmm3
+ punpckldq xmm3, xmm4
+
+ punpckhdq xmm5, xmm4
+ movdqa xmm4, xmm3
+
+ punpckldq xmm3, xmm5
+ punpckhdq xmm4, xmm5
+
+ paddd xmm3, xmm4
+ paddd xmm3, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
+
+
+ psrad xmm3, _1STSTAGESHIFT
+ packssdw xmm2, xmm3
+
+ ;third column
+ movdqa xmm3, xmm0
+ pmaddwd xmm3, [rdx+32]
+
+ movdqa xmm4, xmm1
+ pmaddwd xmm4, [rdx+32]
+
+ movdqa xmm5, xmm3
+ punpckldq xmm3, xmm4
+
+ punpckhdq xmm5, xmm4
+ movdqa xmm4, xmm3
+
+ punpckldq xmm3, xmm5
+ punpckhdq xmm4, xmm5
+
+ paddd xmm3, xmm4
+ paddd xmm3, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
+
+ psrad xmm3, _1STSTAGESHIFT
+
+    ;fourth column (this is the last column, so we do not have to save the source any more)
+ pmaddwd xmm0, [rdx+48]
+ pmaddwd xmm1, [rdx+48]
+
+ movdqa xmm4, xmm0
+ punpckldq xmm0, xmm1
+
+ punpckhdq xmm4, xmm1
+ movdqa xmm1, xmm0
+
+ punpckldq xmm0, xmm4
+ punpckhdq xmm1, xmm4
+
+ paddd xmm0, xmm1
+ paddd xmm0, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
+
+
+ psrad xmm0, _1STSTAGESHIFT
+ packssdw xmm3, xmm0
+ ; done with one pass
+ ; now start second pass
+ movdqa xmm0, xmm2
+ movdqa xmm1, xmm3
+
+ pmaddwd xmm2, xmm7
+ pmaddwd xmm3, xmm7
+
+ movdqa xmm4, xmm2
+ punpckldq xmm2, xmm3
+
+ punpckhdq xmm4, xmm3
+ movdqa xmm3, xmm2
+
+ punpckldq xmm2, xmm4
+ punpckhdq xmm3, xmm4
+
+ paddd xmm2, xmm3
+ paddd xmm2, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
+
+ psrad xmm2, _2NDSTAGESHIFT
+
+ ;second column
+ movdqa xmm3, xmm0
+ pmaddwd xmm3, [rdx+16]
+
+ movdqa xmm4, xmm1
+ pmaddwd xmm4, [rdx+16]
+
+ movdqa xmm5, xmm3
+ punpckldq xmm3, xmm4
+
+ punpckhdq xmm5, xmm4
+ movdqa xmm4, xmm3
+
+ punpckldq xmm3, xmm5
+ punpckhdq xmm4, xmm5
+
+ paddd xmm3, xmm4
+ paddd xmm3, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
+
+ psrad xmm3, _2NDSTAGESHIFT
+ packssdw xmm2, xmm3
+
+ movdqu [rcx], xmm2
+ ;third column
+ movdqa xmm3, xmm0
+ pmaddwd xmm3, [rdx+32]
+
+ movdqa xmm4, xmm1
+ pmaddwd xmm4, [rdx+32]
+
+ movdqa xmm5, xmm3
+ punpckldq xmm3, xmm4
+
+ punpckhdq xmm5, xmm4
+ movdqa xmm4, xmm3
+
+ punpckldq xmm3, xmm5
+ punpckhdq xmm4, xmm5
+
+ paddd xmm3, xmm4
+ paddd xmm3, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
+
+ psrad xmm3, _2NDSTAGESHIFT
+ ;fourth column
+ pmaddwd xmm0, [rdx+48]
+ pmaddwd xmm1, [rdx+48]
+
+ movdqa xmm4, xmm0
+ punpckldq xmm0, xmm1
+
+ punpckhdq xmm4, xmm1
+ movdqa xmm1, xmm0
+
+ punpckldq xmm0, xmm4
+ punpckhdq xmm1, xmm4
+
+ paddd xmm0, xmm1
+ paddd xmm0, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
+
+ psrad xmm0, _2NDSTAGESHIFT
+ packssdw xmm3, xmm0
+
+ movdqu [rcx+16], xmm3
+
+ mov rsp, rbp
+ ; begin epilog
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+;static unsigned int dct1st_stage_rounding_sse2[4] =
+align 16
+dct1st_stage_rounding_sse2:
+ times 4 dd 8192
+
+
+;static unsigned int dct2nd_stage_rounding_sse2[4] =
+align 16
+dct2nd_stage_rounding_sse2:
+ times 4 dd 32768
+
+;static short dct_matrix_sse2[4][8]=
+align 16
+dct_matrix_sse2:
+ times 8 dw 23170
+
+ dw 30274
+ dw 12540
+ dw -12540
+ dw -30274
+ dw 30274
+ dw 12540
+ dw -12540
+ dw -30274
+
+ dw 23170
+ times 2 dw -23170
+ times 2 dw 23170
+ times 2 dw -23170
+ dw 23170
+
+ dw 12540
+ dw -30274
+ dw 30274
+ dw -12540
+ dw 12540
+ dw -30274
+ dw 30274
+ dw -12540
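
vp8_short_fdct4x4_mmx and vp8_short_fdct4x4_wmt instead take the matrix-multiply route: each output coefficient is a pmaddwd dot product of an input row with one of the basis vectors in dct_matrix / dct_matrix_sse2, rounded with 8192 and shifted by 14 in the first pass, then rounded with 32768 and shifted by 16 in the second (the first pass stores its results in transposed order, so the second pass can reuse the same basis vectors). A scalar paraphrase of one pass, offered as a reference sketch rather than as the libvpx C code:

/* Basis vectors as laid out in the dct_matrix table above
 * (2^15-scaled DCT-II rows). */
static const short dct_basis[4][4] =
{
    { 23170,  23170,  23170,  23170 },
    { 30274,  12540, -12540, -30274 },
    { 23170, -23170, -23170,  23170 },
    { 12540, -30274,  30274, -12540 },
};

/* One pass: every output is a dot product plus rounding and a shift.
 * Pass 1 uses round_val = 8192, shift = 14; pass 2 uses 32768 and 16.
 * Between passes the intermediate must be transposed, which the assembly
 * does implicitly by the way it stores the pass-1 results. */
static void fdct4x4_pass(const short in[4][4], short out[4][4],
                         int round_val, int shift)
{
    int r, j, k;

    for (r = 0; r < 4; r++)
        for (j = 0; j < 4; j++)
        {
            int sum = round_val;
            for (k = 0; k < 4; k++)
                sum += in[r][k] * dct_basis[j][k];
            out[r][j] = (short)(sum >> shift);
        }
}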
diff --git a/vp8/encoder/x86/dct_x86.h b/vp8/encoder/x86/dct_x86.h
new file mode 100644
index 000000000..bc80e64ef
--- /dev/null
+++ b/vp8/encoder/x86/dct_x86.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef DCT_X86_H
+#define DCT_X86_H
+
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code
+ */
+#if HAVE_MMX
+extern prototype_fdct(vp8_short_fdct4x4_mmx);
+extern prototype_fdct(vp8_short_fdct8x4_mmx);
+extern prototype_fdct(vp8_fast_fdct4x4_mmx);
+extern prototype_fdct(vp8_fast_fdct8x4_mmx);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_fdct_short4x4
+#define vp8_fdct_short4x4 vp8_short_fdct4x4_mmx
+
+#undef vp8_fdct_short8x4
+#define vp8_fdct_short8x4 vp8_short_fdct8x4_mmx
+
+#undef vp8_fdct_fast4x4
+#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_mmx
+
+#undef vp8_fdct_fast8x4
+#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_mmx
+
+#endif
+#endif
+
+
+#if HAVE_SSE2
+extern prototype_fdct(vp8_short_fdct4x4_wmt);
+extern prototype_fdct(vp8_short_fdct8x4_wmt);
+extern prototype_fdct(vp8_fast_fdct8x4_wmt);
+
+extern prototype_fdct(vp8_short_walsh4x4_sse2);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#if 0
+/* short SSE2 DCT currently disabled, does not match the MMX version */
+#undef vp8_fdct_short4x4
+#define vp8_fdct_short4x4 vp8_short_fdct4x4_wmt
+
+#undef vp8_fdct_short8x4
+#define vp8_fdct_short8x4 vp8_short_fdct8x4_wmt
+#endif
+
+#undef vp8_fdct_fast8x4
+#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_wmt
+
+#undef vp8_fdct_walsh_short4x4
+#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_sse2
+
+#endif
+
+
+#endif
+
+#endif
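
These headers are the static half of the dispatch story: when CONFIG_RUNTIME_CPU_DETECT is off, the #undef/#define pairs re-point the generic RTCD hooks at the best compiled-in symbol, and the function-pointer tables in csystemdependent.c are bypassed entirely. The shape of the pattern, using illustrative names rather than the real vp8 hook macros:

/* Illustrative only: MY_HAVE_SIMD, my_fdct4x4 and friends are not vp8 names. */
#define MY_HAVE_SIMD 1
#define MY_RUNTIME_CPU_DETECT 0

void my_fdct4x4_c(short *input, short *output, int pitch);
void my_fdct4x4_simd(short *input, short *output, int pitch);

/* Generic default used by portable builds. */
#define my_fdct4x4 my_fdct4x4_c

#if MY_HAVE_SIMD && !MY_RUNTIME_CPU_DETECT
/* Static builds simply rebind the hook; runtime-detect builds keep the
 * default and switch function pointers at init time instead. */
#undef  my_fdct4x4
#define my_fdct4x4 my_fdct4x4_simd
#endif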
diff --git a/vp8/encoder/x86/encodemb_x86.h b/vp8/encoder/x86/encodemb_x86.h
new file mode 100644
index 000000000..9397a6cca
--- /dev/null
+++ b/vp8/encoder/x86/encodemb_x86.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef ENCODEMB_X86_H
+#define ENCODEMB_X86_H
+
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code
+ */
+#if HAVE_MMX
+extern prototype_berr(vp8_block_error_mmx);
+extern prototype_mberr(vp8_mbblock_error_mmx);
+extern prototype_mbuverr(vp8_mbuverror_mmx);
+extern prototype_subb(vp8_subtract_b_mmx);
+extern prototype_submby(vp8_subtract_mby_mmx);
+extern prototype_submbuv(vp8_subtract_mbuv_mmx);
+
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_encodemb_berr
+#define vp8_encodemb_berr vp8_block_error_mmx
+
+#undef vp8_encodemb_mberr
+#define vp8_encodemb_mberr vp8_mbblock_error_mmx
+
+#undef vp8_encodemb_mbuverr
+#define vp8_encodemb_mbuverr vp8_mbuverror_mmx
+
+#undef vp8_encodemb_subb
+#define vp8_encodemb_subb vp8_subtract_b_mmx
+
+#undef vp8_encodemb_submby
+#define vp8_encodemb_submby vp8_subtract_mby_mmx
+
+#undef vp8_encodemb_submbuv
+#define vp8_encodemb_submbuv vp8_subtract_mbuv_mmx
+
+#endif
+#endif
+
+
+#if HAVE_SSE2
+extern prototype_berr(vp8_block_error_xmm);
+extern prototype_mberr(vp8_mbblock_error_xmm);
+extern prototype_mbuverr(vp8_mbuverror_xmm);
+
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_encodemb_berr
+#define vp8_encodemb_berr vp8_block_error_xmm
+
+#undef vp8_encodemb_mberr
+#define vp8_encodemb_mberr vp8_mbblock_error_xmm
+
+#undef vp8_encodemb_mbuverr
+#define vp8_encodemb_mbuverr vp8_mbuverror_xmm
+
+#endif
+#endif
+
+
+#endif
diff --git a/vp8/encoder/x86/encodeopt.asm b/vp8/encoder/x86/encodeopt.asm
new file mode 100644
index 000000000..194047155
--- /dev/null
+++ b/vp8/encoder/x86/encodeopt.asm
@@ -0,0 +1,393 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;int vp8_block_error_xmm(short *coeff_ptr, short *dcoef_ptr)
+global sym(vp8_block_error_xmm)
+sym(vp8_block_error_xmm):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;coeff_ptr
+ pxor xmm7, xmm7
+
+ mov rdi, arg(1) ;dcoef_ptr
+ movdqa xmm3, [rsi]
+
+ movdqa xmm4, [rdi]
+ movdqa xmm5, [rsi+16]
+
+ movdqa xmm6, [rdi+16]
+ pxor xmm1, xmm1 ; from movd xmm1, dc; dc=0
+
+ movdqa xmm2, xmm7
+ psubw xmm5, xmm6
+
+ por xmm1, xmm2
+ pmaddwd xmm5, xmm5
+
+ pcmpeqw xmm1, xmm7
+ psubw xmm3, xmm4
+
+ pand xmm1, xmm3
+ pmaddwd xmm1, xmm1
+
+ paddd xmm1, xmm5
+ movdqa xmm0, xmm1
+
+ punpckldq xmm0, xmm7
+ punpckhdq xmm1, xmm7
+
+ paddd xmm0, xmm1
+ movdqa xmm1, xmm0
+
+ psrldq xmm0, 8
+ paddd xmm0, xmm1
+
+ movd rax, xmm0
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;int vp8_block_error_mmx(short *coeff_ptr, short *dcoef_ptr)
+global sym(vp8_block_error_mmx)
+sym(vp8_block_error_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;coeff_ptr
+ pxor mm7, mm7
+
+ mov rdi, arg(1) ;dcoef_ptr
+ movq mm3, [rsi]
+
+ movq mm4, [rdi]
+ movq mm5, [rsi+8]
+
+ movq mm6, [rdi+8]
+ pxor mm1, mm1 ; from movd mm1, dc ; dc =0
+
+ movq mm2, mm7
+ psubw mm5, mm6
+
+ por mm1, mm2
+ pmaddwd mm5, mm5
+
+ pcmpeqw mm1, mm7
+ psubw mm3, mm4
+
+ pand mm1, mm3
+ pmaddwd mm1, mm1
+
+ paddd mm1, mm5
+ movq mm3, [rsi+16]
+
+ movq mm4, [rdi+16]
+ movq mm5, [rsi+24]
+
+ movq mm6, [rdi+24]
+ psubw mm5, mm6
+
+ pmaddwd mm5, mm5
+ psubw mm3, mm4
+
+ pmaddwd mm3, mm3
+ paddd mm3, mm5
+
+ paddd mm1, mm3
+ movq mm0, mm1
+
+ psrlq mm1, 32
+ paddd mm0, mm1
+
+ movd rax, mm0
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+global sym(vp8_mbblock_error_mmx_impl)
+sym(vp8_mbblock_error_mmx_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;coeff_ptr
+ pxor mm7, mm7
+
+ mov rdi, arg(1) ;dcoef_ptr
+ pxor mm2, mm2
+
+ movd mm1, dword ptr arg(2) ;dc
+ por mm1, mm2
+
+ pcmpeqw mm1, mm7
+ mov rcx, 16
+
+mberror_loop_mmx:
+ movq mm3, [rsi]
+ movq mm4, [rdi]
+
+ movq mm5, [rsi+8]
+ movq mm6, [rdi+8]
+
+
+ psubw mm5, mm6
+ pmaddwd mm5, mm5
+
+ psubw mm3, mm4
+ pand mm3, mm1
+
+ pmaddwd mm3, mm3
+ paddd mm2, mm5
+
+ paddd mm2, mm3
+ movq mm3, [rsi+16]
+
+ movq mm4, [rdi+16]
+ movq mm5, [rsi+24]
+
+ movq mm6, [rdi+24]
+ psubw mm5, mm6
+
+ pmaddwd mm5, mm5
+ psubw mm3, mm4
+
+ pmaddwd mm3, mm3
+ paddd mm2, mm5
+
+ paddd mm2, mm3
+ add rsi, 32
+
+ add rdi, 32
+ sub rcx, 1
+
+ jnz mberror_loop_mmx
+
+ movq mm0, mm2
+ psrlq mm2, 32
+
+ paddd mm0, mm2
+ movd rax, mm0
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+global sym(vp8_mbblock_error_xmm_impl)
+sym(vp8_mbblock_error_xmm_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;coeff_ptr
+ pxor xmm7, xmm7
+
+ mov rdi, arg(1) ;dcoef_ptr
+ pxor xmm2, xmm2
+
+ movd xmm1, dword ptr arg(2) ;dc
+ por xmm1, xmm2
+
+ pcmpeqw xmm1, xmm7
+ mov rcx, 16
+
+mberror_loop:
+ movdqa xmm3, [rsi]
+ movdqa xmm4, [rdi]
+
+ movdqa xmm5, [rsi+16]
+ movdqa xmm6, [rdi+16]
+
+
+ psubw xmm5, xmm6
+ pmaddwd xmm5, xmm5
+
+ psubw xmm3, xmm4
+ pand xmm3, xmm1
+
+ pmaddwd xmm3, xmm3
+ add rsi, 32
+
+ add rdi, 32
+
+ sub rcx, 1
+ paddd xmm2, xmm5
+
+ paddd xmm2, xmm3
+ jnz mberror_loop
+
+ movdqa xmm0, xmm2
+ punpckldq xmm0, xmm7
+
+ punpckhdq xmm2, xmm7
+ paddd xmm0, xmm2
+
+ movdqa xmm1, xmm0
+ psrldq xmm0, 8
+
+ paddd xmm0, xmm1
+ movd rax, xmm0
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
+global sym(vp8_mbuverror_mmx_impl)
+sym(vp8_mbuverror_mmx_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;s_ptr
+ mov rdi, arg(1) ;d_ptr
+
+ mov rcx, 16
+ pxor mm7, mm7
+
+mbuverror_loop_mmx:
+
+ movq mm1, [rsi]
+ movq mm2, [rdi]
+
+ psubw mm1, mm2
+ pmaddwd mm1, mm1
+
+
+ movq mm3, [rsi+8]
+ movq mm4, [rdi+8]
+
+ psubw mm3, mm4
+ pmaddwd mm3, mm3
+
+
+ paddd mm7, mm1
+ paddd mm7, mm3
+
+
+ add rsi, 16
+ add rdi, 16
+
+ dec rcx
+ jnz mbuverror_loop_mmx
+
+ movq mm0, mm7
+ psrlq mm7, 32
+
+ paddd mm0, mm7
+ movd rax, mm0
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
+global sym(vp8_mbuverror_xmm_impl)
+sym(vp8_mbuverror_xmm_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;s_ptr
+ mov rdi, arg(1) ;d_ptr
+
+ mov rcx, 16
+ pxor xmm7, xmm7
+
+mbuverror_loop:
+
+ movdqa xmm1, [rsi]
+ movdqa xmm2, [rdi]
+
+ psubw xmm1, xmm2
+ pmaddwd xmm1, xmm1
+
+ paddd xmm7, xmm1
+
+ add rsi, 16
+ add rdi, 16
+
+ dec rcx
+ jnz mbuverror_loop
+
+ pxor xmm0, xmm0
+ movdqa xmm1, xmm7
+
+ movdqa xmm2, xmm1
+ punpckldq xmm1, xmm0
+
+ punpckhdq xmm2, xmm0
+ paddd xmm1, xmm2
+
+ movdqa xmm2, xmm1
+
+ psrldq xmm1, 8
+ paddd xmm1, xmm2
+
+ movd rax, xmm1
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
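
Every routine in this file computes a sum of squared differences between original and dequantized transform coefficients; in the *_impl variants the por/pcmpeqw sequence turns the dc argument into a word mask, so that a nonzero dc drops coefficient 0 of each 4x4 block from the sum. A plain-C paraphrase of that behaviour as I read the masking (a sketch, not the libvpx reference code):

/* coeff/dqcoeff hold ncoeffs 16-bit transform coefficients.  With dc != 0
 * the DC term (index 0) is excluded, mirroring the pand mask in the asm. */
static int block_error_ref(const short *coeff, const short *dqcoeff,
                           int ncoeffs, int dc)
{
    int i, err = 0;

    for (i = 0; i < ncoeffs; i++)
    {
        int d;

        if (i == 0 && dc)
            continue;

        d = coeff[i] - dqcoeff[i];
        err += d * d;
    }

    return err;
}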
diff --git a/vp8/encoder/x86/fwalsh_sse2.asm b/vp8/encoder/x86/fwalsh_sse2.asm
new file mode 100644
index 000000000..7d8620178
--- /dev/null
+++ b/vp8/encoder/x86/fwalsh_sse2.asm
@@ -0,0 +1,117 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch)
+global sym(vp8_short_walsh4x4_sse2)
+sym(vp8_short_walsh4x4_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0)
+ mov rdi, arg(1)
+
+ movdqu xmm4, [rsi + 0] ;ip[4] ip[0]
+ movdqu xmm0, [rsi + 16] ;ip[12] ip[8]
+
+ pxor xmm7, xmm7
+ ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ ; 13 12 11 10 03 02 01 00
+ ;
+ ; 33 32 31 30 23 22 21 20
+ ;
+ movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00
+ punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00
+ punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10
+ movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00
+ punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00
+ punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02
+ ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
+ movdqa xmm3, xmm4 ;ip[4] ip[0]
+
+ paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+ psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+
+ movdqa xmm5, xmm4
+ punpcklqdq xmm4, xmm3 ;d1 a1
+ punpckhqdq xmm5, xmm3 ;c1 b1
+
+ movdqa xmm1, xmm5 ;c1 b1
+    paddw       xmm5, xmm4          ;d1+c1 a1+b1 aka op[4] op[0]
+ psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
+ ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ ; 13 12 11 10 03 02 01 00
+ ;
+ ; 33 32 31 30 23 22 21 20
+ ;
+ movdqa xmm0, xmm5 ; 13 12 11 10 03 02 01 00
+ punpcklwd xmm5, xmm4 ; 23 03 22 02 21 01 20 00
+ punpckhwd xmm0, xmm4 ; 33 13 32 12 31 11 30 10
+ movdqa xmm1, xmm5 ; 23 03 22 02 21 01 20 00
+ punpcklwd xmm5, xmm0 ; 31 21 11 01 30 20 10 00
+ punpckhwd xmm1, xmm0 ; 33 23 13 03 32 22 12 02
+ ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
+ movdqa xmm3, xmm5 ;ip[4] ip[0]
+
+ paddw xmm5, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+ psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+
+ movdqa xmm6, xmm5
+ punpcklqdq xmm5, xmm3 ;d1 a1
+ punpckhqdq xmm6, xmm3 ;c1 b1
+
+ movdqa xmm1, xmm6 ;c1 b1
+    paddw       xmm6, xmm5          ;d1+c1 a1+b1 aka op[4] op[0]
+ psubw xmm5, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
+
+ movdqa xmm0, xmm6 ;aka b2 a2
+ movdqa xmm1, xmm5 ;aka d2 c2
+
+ pcmpgtw xmm0, xmm7
+ pcmpgtw xmm1, xmm7
+
+ psrlw xmm0, 15
+ psrlw xmm1, 15
+
+ paddw xmm6, xmm0
+ paddw xmm5, xmm1
+
+ psraw xmm6, 1
+ psraw xmm5, 1
+
+ ; a2 = a1 + b1;
+ ; b2 = c1 + d1;
+ ; c2 = a1 - b1;
+ ; d2 = d1 - c1;
+ ; a2 += (a2>0);
+ ; b2 += (b2>0);
+ ; c2 += (c2>0);
+ ; d2 += (d2>0);
+ ; op[0] = (a2)>>1;
+ ; op[4] = (b2)>>1;
+ ; op[8] = (c2)>>1;
+ ; op[12]= (d2)>>1;
+
+ movdqu [rdi + 0], xmm6
+ movdqu [rdi + 16], xmm5
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
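
The commented block near the end spells out what the last few packed instructions do: the pcmpgtw/psrlw pair turns the sign of each result into a 0/1 bias, so the arithmetic shift by one rounds symmetrically instead of always toward minus infinity. The same second-stage butterfly and rounding in C, as a reference sketch:

/* a1..d1 are the first-stage sums/differences named in the asm comments. */
static void walsh_stage2(int a1, int b1, int c1, int d1, short op[4])
{
    int a2 = a1 + b1;
    int b2 = c1 + d1;
    int c2 = a1 - b1;
    int d2 = d1 - c1;

    a2 += (a2 > 0);             /* bias positives by one before the shift */
    b2 += (b2 > 0);
    c2 += (c2 > 0);
    d2 += (d2 > 0);

    op[0] = (short)(a2 >> 1);   /* written to op[0], op[4], op[8], op[12] */
    op[1] = (short)(b2 >> 1);   /* in the real stride-4 output layout     */
    op[2] = (short)(c2 >> 1);
    op[3] = (short)(d2 >> 1);
}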
diff --git a/vp8/encoder/x86/mcomp_x86.h b/vp8/encoder/x86/mcomp_x86.h
new file mode 100644
index 000000000..5661491ad
--- /dev/null
+++ b/vp8/encoder/x86/mcomp_x86.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef MCOMP_X86_H
+#define MCOMP_X86_H
+
+#if HAVE_SSE3
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef vp8_search_full_search
+#define vp8_search_full_search vp8_full_search_sadx3
+
+#undef vp8_search_diamond_search
+#define vp8_search_diamond_search vp8_diamond_search_sadx4
+
+#endif
+#endif
+
+#endif
+
diff --git a/vp8/encoder/x86/preproc_mmx.c b/vp8/encoder/x86/preproc_mmx.c
new file mode 100644
index 000000000..69617ca47
--- /dev/null
+++ b/vp8/encoder/x86/preproc_mmx.c
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "memory.h"
+#include "preproc.h"
+#include "pragmas.h"
+
+/****************************************************************************
+* Macros
+****************************************************************************/
+#define FRAMECOUNT 7
+#define ROUNDUP32(X) ( ( ( (unsigned long) X ) + 31 )&( 0xFFFFFFE0 ) )
+
+/****************************************************************************
+* Imports
+****************************************************************************/
+extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled);
+
+/****************************************************************************
+* Exported Global Variables
+****************************************************************************/
+void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength);
+
+/****************************************************************************
+ *
+ * ROUTINE : temp_filter_wmt
+ *
+ * INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance.
+ * unsigned char *s : Pointer to source frame.
+ * unsigned char *d : Pointer to destination frame.
+ * int bytes : Number of bytes to filter.
+ * int strength : Strength of filter to apply.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ *  FUNCTION      : Performs a closeness-adjusted temporal blur
+ *
+ * SPECIAL NOTES : Destination frame can be same as source frame.
+ *
+ ****************************************************************************/
+void temp_filter_wmt
+(
+ pre_proc_instance *ppi,
+ unsigned char *s,
+ unsigned char *d,
+ int bytes,
+ int strength
+)
+{
+ int byte = 0;
+ unsigned char *frameptr = ppi->frame_buffer;
+
+ __declspec(align(16)) unsigned short threes[] = { 3, 3, 3, 3, 3, 3, 3, 3};
+ __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16, 16, 16, 16, 16};
+
+ if (ppi->frame == 0)
+ {
+ do
+ {
+ int i;
+ int frame = 0;
+
+ do
+ {
+ for (i = 0; i < 8; i++)
+ {
+ *frameptr = s[byte+i];
+ ++frameptr;
+ }
+
+ ++frame;
+ }
+ while (frame < FRAMECOUNT);
+
+ for (i = 0; i < 8; i++)
+ d[byte+i] = s[byte+i];
+
+ byte += 8;
+
+ }
+ while (byte < bytes);
+ }
+ else
+ {
+ int i;
+ int offset2 = (ppi->frame % FRAMECOUNT);
+
+ do
+ {
+ __declspec(align(16)) unsigned short counts[8];
+ __declspec(align(16)) unsigned short sums[8];
+ __asm
+ {
+ mov eax, offset2
+ mov edi, s // source pixels
+ pxor xmm1, xmm1 // accumulator
+
+ pxor xmm7, xmm7
+
+ mov esi, frameptr // accumulator
+ pxor xmm2, xmm2 // count
+
+ movq xmm3, QWORD PTR [edi]
+
+ movq QWORD PTR [esi+8*eax], xmm3
+
+ punpcklbw xmm3, xmm2 // xmm3 source pixels
+ mov ecx, FRAMECOUNT
+
+ next_frame:
+ movq xmm4, QWORD PTR [esi] // get frame buffer values
+ punpcklbw xmm4, xmm7 // xmm4 frame buffer pixels
+ movdqa xmm6, xmm4 // save the pixel values
+ psubsw xmm4, xmm3 // subtracted pixel values
+ pmullw xmm4, xmm4 // square xmm4
+ movd xmm5, strength
+ psrlw xmm4, xmm5 // should be strength
+ pmullw xmm4, threes // 3 * modifier
+ movdqa xmm5, sixteens // 16s
+ psubusw xmm5, xmm4 // 16 - modifiers
+ movdqa xmm4, xmm5 // save the modifiers
+ pmullw xmm4, xmm6 // multiplier values
+ paddusw xmm1, xmm4 // accumulator
+ paddusw xmm2, xmm5 // count
+ add esi, 8 // next frame
+ dec ecx // next set of eight pixels
+ jnz next_frame
+
+ movdqa counts, xmm2
+ psrlw xmm2, 1 // divide count by 2 for rounding
+ paddusw xmm1, xmm2 // rounding added in
+
+ mov frameptr, esi
+
+ movdqa sums, xmm1
+ }
+
+ for (i = 0; i < 8; i++)
+ {
+ int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];
+ blurvalue >>= 16;
+ d[i] = blurvalue;
+ }
+
+ s += 8;
+ d += 8;
+ byte += 8;
+ }
+ while (byte < bytes);
+ }
+
+ ++ppi->frame;
+ __asm emms
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : temp_filter_mmx
+ *
+ * INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance.
+ * unsigned char *s : Pointer to source frame.
+ * unsigned char *d : Pointer to destination frame.
+ * int bytes : Number of bytes to filter.
+ * int strength : Strength of filter to apply.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ *  FUNCTION      : Performs a closeness-adjusted temporal blur
+ *
+ * SPECIAL NOTES : Destination frame can be same as source frame.
+ *
+ ****************************************************************************/
+void temp_filter_mmx
+(
+ pre_proc_instance *ppi,
+ unsigned char *s,
+ unsigned char *d,
+ int bytes,
+ int strength
+)
+{
+ int byte = 0;
+ unsigned char *frameptr = ppi->frame_buffer;
+
+ __declspec(align(16)) unsigned short threes[] = { 3, 3, 3, 3};
+ __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16};
+
+ if (ppi->frame == 0)
+ {
+ do
+ {
+ int i;
+ int frame = 0;
+
+ do
+ {
+ for (i = 0; i < 4; i++)
+ {
+ *frameptr = s[byte+i];
+ ++frameptr;
+ }
+
+ ++frame;
+ }
+ while (frame < FRAMECOUNT);
+
+ for (i = 0; i < 4; i++)
+ d[byte+i] = s[byte+i];
+
+ byte += 4;
+
+ }
+ while (byte < bytes);
+ }
+ else
+ {
+ int i;
+ int offset2 = (ppi->frame % FRAMECOUNT);
+
+ do
+ {
+ __declspec(align(16)) unsigned short counts[8];
+ __declspec(align(16)) unsigned short sums[8];
+ __asm
+ {
+
+ mov eax, offset2
+ mov edi, s // source pixels
+ pxor mm1, mm1 // accumulator
+ pxor mm7, mm7
+
+ mov esi, frameptr // accumulator
+ pxor mm2, mm2 // count
+
+ movd mm3, DWORD PTR [edi]
+ movd DWORD PTR [esi+4*eax], mm3
+
+ punpcklbw mm3, mm2 // mm3 source pixels
+ mov ecx, FRAMECOUNT
+
+ next_frame:
+ movd mm4, DWORD PTR [esi] // get frame buffer values
+ punpcklbw mm4, mm7 // mm4 frame buffer pixels
+ movq mm6, mm4 // save the pixel values
+ psubsw mm4, mm3 // subtracted pixel values
+ pmullw mm4, mm4 // square mm4
+ movd mm5, strength
+ psrlw mm4, mm5 // should be strength
+ pmullw mm4, threes // 3 * modifier
+ movq mm5, sixteens // 16s
+ psubusw mm5, mm4 // 16 - modifiers
+ movq mm4, mm5 // save the modifiers
+ pmullw mm4, mm6 // multiplier values
+ paddusw mm1, mm4 // accumulator
+ paddusw mm2, mm5 // count
+ add esi, 4 // next frame
+            dec         ecx             // next set of four pixels
+ jnz next_frame
+
+ movq counts, mm2
+ psrlw mm2, 1 // divide count by 2 for rounding
+ paddusw mm1, mm2 // rounding added in
+
+ mov frameptr, esi
+
+ movq sums, mm1
+
+ }
+
+ for (i = 0; i < 4; i++)
+ {
+ int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];
+ blurvalue >>= 16;
+ d[i] = blurvalue;
+ }
+
+ s += 4;
+ d += 4;
+ byte += 4;
+ }
+ while (byte < bytes);
+ }
+
+ ++ppi->frame;
+ __asm emms
+}
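
Both filters apply the same per-pixel weighting across the FRAMECOUNT stored frames: square the difference from the current pixel, shift it down by strength, multiply by 3 and subtract the result from 16 with saturation to get a weight; weighted pixel values and weights are accumulated, half the weight total is added for rounding, and the division is done with the ppi->fixed_divide reciprocal table (assumed here to hold 65536/count entries). A portable-C paraphrase of that inner loop, ignoring the 16-bit packed-arithmetic wraparound details:

static unsigned char temp_filter_pixel(const unsigned char *history, int frames,
                                       unsigned char src, int strength,
                                       const unsigned int *fixed_divide)
{
    unsigned int sum = 0, count = 0;
    int f;

    for (f = 0; f < frames; f++)
    {
        int diff     = history[f] - src;
        int modifier = 3 * ((diff * diff) >> strength);
        int weight   = 16 - modifier;            /* psubusw saturates at 0 */

        if (weight < 0)
            weight = 0;

        sum   += (unsigned int)weight * history[f];
        count += (unsigned int)weight;
    }

    sum += count >> 1;                           /* rounding */
    return (unsigned char)((sum * fixed_divide[count]) >> 16);
}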
diff --git a/vp8/encoder/x86/quantize_mmx.asm b/vp8/encoder/x86/quantize_mmx.asm
new file mode 100644
index 000000000..847fc6e37
--- /dev/null
+++ b/vp8/encoder/x86/quantize_mmx.asm
@@ -0,0 +1,438 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
+; short *qcoeff_ptr,short *dequant_ptr,
+; short *scan_mask, short *round_ptr,
+; short *quant_ptr, short *dqcoeff_ptr);
+global sym(vp8_fast_quantize_b_impl_mmx)
+sym(vp8_fast_quantize_b_impl_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;coeff_ptr
+ movq mm0, [rsi]
+
+ mov rax, arg(1) ;zbin_ptr
+ movq mm1, [rax]
+
+ movq mm3, mm0
+ psraw mm0, 15
+
+ pxor mm3, mm0
+ psubw mm3, mm0 ; abs
+
+ movq mm2, mm3
+ pcmpgtw mm1, mm2
+
+ pandn mm1, mm2
+ movq mm3, mm1
+
+ mov rdx, arg(6) ;quant_ptr
+ movq mm1, [rdx]
+
+ mov rcx, arg(5) ;round_ptr
+ movq mm2, [rcx]
+
+ paddw mm3, mm2
+ pmulhuw mm3, mm1
+
+ pxor mm3, mm0
+ psubw mm3, mm0 ;gain the sign back
+
+ mov rdi, arg(2) ;qcoeff_ptr
+ movq mm0, mm3
+
+ movq [rdi], mm3
+
+ mov rax, arg(3) ;dequant_ptr
+ movq mm2, [rax]
+
+ pmullw mm3, mm2
+ mov rax, arg(7) ;dqcoeff_ptr
+
+ movq [rax], mm3
+
+ ; next 8
+ movq mm4, [rsi+8]
+
+ mov rax, arg(1) ;zbin_ptr
+ movq mm5, [rax+8]
+
+ movq mm7, mm4
+ psraw mm4, 15
+
+ pxor mm7, mm4
+ psubw mm7, mm4 ; abs
+
+ movq mm6, mm7
+ pcmpgtw mm5, mm6
+
+ pandn mm5, mm6
+ movq mm7, mm5
+
+ movq mm5, [rdx+8]
+ movq mm6, [rcx+8]
+
+ paddw mm7, mm6
+ pmulhuw mm7, mm5
+
+ pxor mm7, mm4
+ psubw mm7, mm4;gain the sign back
+
+ mov rdi, arg(2) ;qcoeff_ptr
+
+ movq mm1, mm7
+ movq [rdi+8], mm7
+
+ mov rax, arg(3) ;dequant_ptr
+ movq mm6, [rax+8]
+
+ pmullw mm7, mm6
+ mov rax, arg(7) ;dqcoeff_ptr
+
+ movq [rax+8], mm7
+
+
+ ; next 8
+ movq mm4, [rsi+16]
+
+ mov rax, arg(1) ;zbin_ptr
+ movq mm5, [rax+16]
+
+ movq mm7, mm4
+ psraw mm4, 15
+
+ pxor mm7, mm4
+ psubw mm7, mm4 ; abs
+
+ movq mm6, mm7
+ pcmpgtw mm5, mm6
+
+ pandn mm5, mm6
+ movq mm7, mm5
+
+ movq mm5, [rdx+16]
+ movq mm6, [rcx+16]
+
+ paddw mm7, mm6
+ pmulhuw mm7, mm5
+
+ pxor mm7, mm4
+ psubw mm7, mm4;gain the sign back
+
+ mov rdi, arg(2) ;qcoeff_ptr
+
+ movq mm1, mm7
+ movq [rdi+16], mm7
+
+ mov rax, arg(3) ;dequant_ptr
+ movq mm6, [rax+16]
+
+ pmullw mm7, mm6
+ mov rax, arg(7) ;dqcoeff_ptr
+
+ movq [rax+16], mm7
+
+
+ ; next 8
+ movq mm4, [rsi+24]
+
+ mov rax, arg(1) ;zbin_ptr
+ movq mm5, [rax+24]
+
+ movq mm7, mm4
+ psraw mm4, 15
+
+ pxor mm7, mm4
+ psubw mm7, mm4 ; abs
+
+ movq mm6, mm7
+ pcmpgtw mm5, mm6
+
+ pandn mm5, mm6
+ movq mm7, mm5
+
+ movq mm5, [rdx+24]
+ movq mm6, [rcx+24]
+
+ paddw mm7, mm6
+ pmulhuw mm7, mm5
+
+ pxor mm7, mm4
+ psubw mm7, mm4;gain the sign back
+
+ mov rdi, arg(2) ;qcoeff_ptr
+
+ movq mm1, mm7
+ movq [rdi+24], mm7
+
+ mov rax, arg(3) ;dequant_ptr
+ movq mm6, [rax+24]
+
+ pmullw mm7, mm6
+ mov rax, arg(7) ;dqcoeff_ptr
+
+ movq [rax+24], mm7
+
+
+
+ mov rdi, arg(4) ;scan_mask
+ mov rsi, arg(2) ;qcoeff_ptr
+
+ pxor mm5, mm5
+ pxor mm7, mm7
+
+ movq mm0, [rsi]
+ movq mm1, [rsi+8]
+
+ movq mm2, [rdi]
+ movq mm3, [rdi+8];
+
+ pcmpeqw mm0, mm7
+ pcmpeqw mm1, mm7
+
+ pcmpeqw mm6, mm6
+ pxor mm0, mm6
+
+ pxor mm1, mm6
+ psrlw mm0, 15
+
+ psrlw mm1, 15
+ pmaddwd mm0, mm2
+
+ pmaddwd mm1, mm3
+ movq mm5, mm0
+
+ paddd mm5, mm1
+
+ movq mm0, [rsi+16]
+ movq mm1, [rsi+24]
+
+ movq mm2, [rdi+16]
+ movq mm3, [rdi+24];
+
+ pcmpeqw mm0, mm7
+ pcmpeqw mm1, mm7
+
+ pcmpeqw mm6, mm6
+ pxor mm0, mm6
+
+ pxor mm1, mm6
+ psrlw mm0, 15
+
+ psrlw mm1, 15
+ pmaddwd mm0, mm2
+
+ pmaddwd mm1, mm3
+ paddd mm5, mm0
+
+ paddd mm5, mm1
+ movq mm0, mm5
+
+ psrlq mm5, 32
+ paddd mm0, mm5
+
+ ; eob adjustment begins here
+ movd rcx, mm0
+ and rcx, 0xffff
+
+ xor rdx, rdx
+ sub rdx, rcx ; rdx=-rcx
+
+ bsr rax, rcx
+ inc rax
+
+ sar rdx, 31
+ and rax, rdx
+    ; This pure-assembly eob computation replaces the old mixed assembly/C
+    ; version; the original C logic is kept below as reference
+ ; movd rcx, mm0
+ ; bsr rax, rcx
+ ;
+ ; mov eob, rax
+ ; mov eee, rcx
+ ;
+ ;if(eee==0)
+ ;{
+ ; eob=-1;
+ ;}
+ ;else if(eee<0)
+ ;{
+ ; eob=15;
+ ;}
+ ;d->eob = eob+1;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr,
+; short *qcoeff_ptr,short *dequant_ptr,
+; short *scan_mask, short *round_ptr,
+; short *quant_ptr, short *dqcoeff_ptr);
+global sym(vp8_fast_quantize_b_impl_sse)
+sym(vp8_fast_quantize_b_impl_sse):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;coeff_ptr
+ movdqa xmm0, [rsi]
+
+ mov rax, arg(1) ;zbin_ptr
+ movdqa xmm1, [rax]
+
+ movdqa xmm3, xmm0
+ psraw xmm0, 15
+
+ pxor xmm3, xmm0
+ psubw xmm3, xmm0 ; abs
+
+ movdqa xmm2, xmm3
+ pcmpgtw xmm1, xmm2
+
+ pandn xmm1, xmm2
+ movdqa xmm3, xmm1
+
+ mov rdx, arg(6) ; quant_ptr
+ movdqa xmm1, [rdx]
+
+ mov rcx, arg(5) ; round_ptr
+ movdqa xmm2, [rcx]
+
+ paddw xmm3, xmm2
+ pmulhuw xmm3, xmm1
+
+ pxor xmm3, xmm0
+ psubw xmm3, xmm0 ;gain the sign back
+
+ mov rdi, arg(2) ;qcoeff_ptr
+ movdqa xmm0, xmm3
+
+ movdqa [rdi], xmm3
+
+ mov rax, arg(3) ;dequant_ptr
+ movdqa xmm2, [rax]
+
+ pmullw xmm3, xmm2
+ mov rax, arg(7) ;dqcoeff_ptr
+
+ movdqa [rax], xmm3
+
+ ; next 8
+ movdqa xmm4, [rsi+16]
+
+ mov rax, arg(1) ;zbin_ptr
+ movdqa xmm5, [rax+16]
+
+ movdqa xmm7, xmm4
+ psraw xmm4, 15
+
+ pxor xmm7, xmm4
+ psubw xmm7, xmm4 ; abs
+
+ movdqa xmm6, xmm7
+ pcmpgtw xmm5, xmm6
+
+ pandn xmm5, xmm6
+ movdqa xmm7, xmm5
+
+ movdqa xmm5, [rdx+16]
+ movdqa xmm6, [rcx+16]
+
+
+ paddw xmm7, xmm6
+ pmulhuw xmm7, xmm5
+
+ pxor xmm7, xmm4
+ psubw xmm7, xmm4;gain the sign back
+
+ mov rdi, arg(2) ;qcoeff_ptr
+
+ movdqa xmm1, xmm7
+ movdqa [rdi+16], xmm7
+
+ mov rax, arg(3) ;dequant_ptr
+ movdqa xmm6, [rax+16]
+
+ pmullw xmm7, xmm6
+ mov rax, arg(7) ;dqcoeff_ptr
+
+ movdqa [rax+16], xmm7
+ mov rdi, arg(4) ;scan_mask
+
+ pxor xmm7, xmm7
+ movdqa xmm2, [rdi]
+
+ movdqa xmm3, [rdi+16];
+ pcmpeqw xmm0, xmm7
+
+ pcmpeqw xmm1, xmm7
+ pcmpeqw xmm6, xmm6
+
+ pxor xmm0, xmm6
+ pxor xmm1, xmm6
+
+ psrlw xmm0, 15
+ psrlw xmm1, 15
+
+ pmaddwd xmm0, xmm2
+ pmaddwd xmm1, xmm3
+
+ movq xmm2, xmm0
+ movq xmm3, xmm1
+
+ psrldq xmm0, 8
+ psrldq xmm1, 8
+
+ paddd xmm0, xmm1
+ paddd xmm2, xmm3
+
+ paddd xmm0, xmm2
+ movq xmm1, xmm0
+
+ psrldq xmm0, 4
+ paddd xmm1, xmm0
+
+ movd rcx, xmm1
+ and rcx, 0xffff
+
+ xor rdx, rdx
+ sub rdx, rcx
+
+ bsr rax, rcx
+ inc rax
+
+ sar rdx, 31
+ and rax, rdx
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
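
Both routines in this file implement the same per-coefficient logic, the MMX version in 4-coefficient chunks and the SSE2 version in 8-coefficient chunks. A rough scalar rendering is given below; it is an illustrative sketch with hypothetical names rather than the encoder's C reference, and it returns the eob value that the assembly computes branchlessly with bsr / sar / and.

/* Illustrative scalar rendering of the fast quantizer above. */
static int fast_quantize_b_sketch(const short *coeff, const short *zbin,
                                  short *qcoeff, const short *dequant,
                                  const short *scan_mask, const short *round,
                                  const short *quant, short *dqcoeff)
{
    unsigned int mask = 0;
    int eob = 0;
    int i;

    for (i = 0; i < 16; i++)
    {
        int sign = coeff[i] >> 15;                       /* 0 or -1, as psraw 15 */
        int x    = (coeff[i] ^ sign) - sign;             /* abs(coeff[i])        */

        x = (x < zbin[i]) ? 0 : x;                       /* zero bin: pcmpgtw / pandn */
        x = (int)(((unsigned short)(x + round[i]) *
                   (unsigned int)(unsigned short)quant[i]) >> 16);  /* pmulhuw       */
        x = (x ^ sign) - sign;                           /* gain the sign back        */

        qcoeff[i]  = (short)x;
        dqcoeff[i] = (short)(x * dequant[i]);

        if (x)
            mask += (unsigned short)scan_mask[i];        /* pmaddwd over the scan mask */
    }

    mask &= 0xffff;
    while (mask > 1)                                     /* bsr: index of the highest set bit */
    {
        mask >>= 1;
        eob++;
    }
    return mask ? eob + 1 : 0;                           /* 0 when every coefficient is zero  */
}
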
diff --git a/vp8/encoder/x86/sad_mmx.asm b/vp8/encoder/x86/sad_mmx.asm
new file mode 100644
index 000000000..a825698e7
--- /dev/null
+++ b/vp8/encoder/x86/sad_mmx.asm
@@ -0,0 +1,428 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+global sym(vp8_sad16x16_mmx)
+global sym(vp8_sad8x16_mmx)
+global sym(vp8_sad8x8_mmx)
+global sym(vp8_sad4x4_mmx)
+global sym(vp8_sad16x8_mmx)
+
+%idefine QWORD
+
+;unsigned int vp8_sad16x16_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+sym(vp8_sad16x16_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+
+ lea rcx, [rcx+rax*8]
+ pxor mm7, mm7
+
+ pxor mm6, mm6
+
+x16x16sad_mmx_loop:
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm2, QWORD PTR [rsi+8]
+
+ movq mm1, QWORD PTR [rdi]
+ movq mm3, QWORD PTR [rdi+8]
+
+ movq mm4, mm0
+ movq mm5, mm2
+
+ psubusb mm0, mm1
+ psubusb mm1, mm4
+
+ psubusb mm2, mm3
+ psubusb mm3, mm5
+
+ por mm0, mm1
+ por mm2, mm3
+
+ movq mm1, mm0
+ movq mm3, mm2
+
+ punpcklbw mm0, mm6
+ punpcklbw mm2, mm6
+
+ punpckhbw mm1, mm6
+ punpckhbw mm3, mm6
+
+ paddw mm0, mm2
+ paddw mm1, mm3
+
+
+ lea rsi, [rsi+rax]
+ add rdi, rdx
+
+ paddw mm7, mm0
+ paddw mm7, mm1
+
+ cmp rsi, rcx
+ jne x16x16sad_mmx_loop
+
+
+ movq mm0, mm7
+
+ punpcklwd mm0, mm6
+ punpckhwd mm7, mm6
+
+ paddw mm0, mm7
+ movq mm7, mm0
+
+
+ psrlq mm0, 32
+ paddw mm7, mm0
+
+ movd rax, mm7
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad8x16_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+sym(vp8_sad8x16_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+
+ lea rcx, [rcx+rax*8]
+ pxor mm7, mm7
+
+ pxor mm6, mm6
+
+x8x16sad_mmx_loop:
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm1, QWORD PTR [rdi]
+
+ movq mm2, mm0
+ psubusb mm0, mm1
+
+ psubusb mm1, mm2
+ por mm0, mm1
+
+ movq mm2, mm0
+ punpcklbw mm0, mm6
+
+ punpckhbw mm2, mm6
+ lea rsi, [rsi+rax]
+
+ add rdi, rdx
+ paddw mm7, mm0
+
+ paddw mm7, mm2
+ cmp rsi, rcx
+
+ jne x8x16sad_mmx_loop
+
+ movq mm0, mm7
+ punpcklwd mm0, mm6
+
+ punpckhwd mm7, mm6
+ paddw mm0, mm7
+
+ movq mm7, mm0
+ psrlq mm0, 32
+
+ paddw mm7, mm0
+ movd rax, mm7
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad8x8_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+sym(vp8_sad8x8_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+ pxor mm7, mm7
+
+ pxor mm6, mm6
+
+x8x8sad_mmx_loop:
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm1, QWORD PTR [rdi]
+
+ movq mm2, mm0
+ psubusb mm0, mm1
+
+ psubusb mm1, mm2
+ por mm0, mm1
+
+ movq mm2, mm0
+ punpcklbw mm0, mm6
+
+ punpckhbw mm2, mm6
+ paddw mm0, mm2
+
+ lea rsi, [rsi+rax]
+ add rdi, rdx
+
+ paddw mm7, mm0
+ cmp rsi, rcx
+
+ jne x8x8sad_mmx_loop
+
+ movq mm0, mm7
+ punpcklwd mm0, mm6
+
+ punpckhwd mm7, mm6
+ paddw mm0, mm7
+
+ movq mm7, mm0
+ psrlq mm0, 32
+
+ paddw mm7, mm0
+ movd rax, mm7
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad4x4_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+sym(vp8_sad4x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ movd mm0, QWORD PTR [rsi]
+ movd mm1, QWORD PTR [rdi]
+
+ movd mm2, QWORD PTR [rsi+rax]
+ movd mm3, QWORD PTR [rdi+rdx]
+
+ punpcklbw mm0, mm2
+ punpcklbw mm1, mm3
+
+ movq mm2, mm0
+ psubusb mm0, mm1
+
+ psubusb mm1, mm2
+ por mm0, mm1
+
+ movq mm2, mm0
+ pxor mm3, mm3
+
+ punpcklbw mm0, mm3
+ punpckhbw mm2, mm3
+
+ paddw mm0, mm2
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ movd mm4, QWORD PTR [rsi]
+ movd mm5, QWORD PTR [rdi]
+
+ movd mm6, QWORD PTR [rsi+rax]
+ movd mm7, QWORD PTR [rdi+rdx]
+
+ punpcklbw mm4, mm6
+ punpcklbw mm5, mm7
+
+ movq mm6, mm4
+ psubusb mm4, mm5
+
+ psubusb mm5, mm6
+ por mm4, mm5
+
+ movq mm5, mm4
+ punpcklbw mm4, mm3
+
+ punpckhbw mm5, mm3
+ paddw mm4, mm5
+
+ paddw mm0, mm4
+ movq mm1, mm0
+
+ punpcklwd mm0, mm3
+ punpckhwd mm1, mm3
+
+ paddw mm0, mm1
+ movq mm1, mm0
+
+ psrlq mm0, 32
+ paddw mm0, mm1
+
+ movd rax, mm0
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad16x8_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+sym(vp8_sad16x8_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+ pxor mm7, mm7
+
+ pxor mm6, mm6
+
+x16x8sad_mmx_loop:
+
+ movq mm0, [rsi]
+ movq mm1, [rdi]
+
+ movq mm2, [rsi+8]
+ movq mm3, [rdi+8]
+
+ movq mm4, mm0
+ movq mm5, mm2
+
+ psubusb mm0, mm1
+ psubusb mm1, mm4
+
+ psubusb mm2, mm3
+ psubusb mm3, mm5
+
+ por mm0, mm1
+ por mm2, mm3
+
+ movq mm1, mm0
+ movq mm3, mm2
+
+ punpcklbw mm0, mm6
+ punpckhbw mm1, mm6
+
+ punpcklbw mm2, mm6
+ punpckhbw mm3, mm6
+
+
+ paddw mm0, mm2
+ paddw mm1, mm3
+
+ paddw mm0, mm1
+ lea rsi, [rsi+rax]
+
+ add rdi, rdx
+ paddw mm7, mm0
+
+ cmp rsi, rcx
+ jne x16x8sad_mmx_loop
+
+ movq mm0, mm7
+ punpcklwd mm0, mm6
+
+ punpckhwd mm7, mm6
+ paddw mm0, mm7
+
+ movq mm7, mm0
+ psrlq mm0, 32
+
+ paddw mm7, mm0
+ movd rax, mm7
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
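
Each routine in this file computes a plain sum of absolute differences for the block size named in its symbol, getting the absolute value from two saturating byte subtractions (psubusb in both directions, then por) so no branch is needed. The scalar equivalent is roughly the sketch below, with illustrative names rather than the project's own C reference.

/* Illustrative scalar SAD over a width x height block; the routines above
 * unroll and vectorize this loop for their fixed block sizes. */
unsigned int sad_mxn_sketch(const unsigned char *src_ptr, int src_stride,
                            const unsigned char *ref_ptr, int ref_stride,
                            int width, int height)
{
    unsigned int sad = 0;
    int r, c;

    for (r = 0; r < height; r++)
    {
        for (c = 0; c < width; c++)
        {
            int diff = src_ptr[c] - ref_ptr[c];
            sad += (unsigned int)(diff < 0 ? -diff : diff);
        }
        src_ptr += src_stride;
        ref_ptr += ref_stride;
    }
    return sad;
}
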
diff --git a/vp8/encoder/x86/sad_sse2.asm b/vp8/encoder/x86/sad_sse2.asm
new file mode 100644
index 000000000..53240bbf1
--- /dev/null
+++ b/vp8/encoder/x86/sad_sse2.asm
@@ -0,0 +1,329 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%idefine QWORD
+
+;unsigned int vp8_sad16x16_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+global sym(vp8_sad16x16_wmt)
+sym(vp8_sad16x16_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+
+ lea rcx, [rcx+rax*8]
+ pxor xmm7, xmm7
+
+x16x16sad_wmt_loop:
+
+ movq xmm0, QWORD PTR [rsi]
+ movq xmm2, QWORD PTR [rsi+8]
+
+ movq xmm1, QWORD PTR [rdi]
+ movq xmm3, QWORD PTR [rdi+8]
+
+ movq xmm4, QWORD PTR [rsi+rax]
+ movq xmm5, QWORD PTR [rdi+rdx]
+
+
+ punpcklbw xmm0, xmm2
+ punpcklbw xmm1, xmm3
+
+ psadbw xmm0, xmm1
+ movq xmm6, QWORD PTR [rsi+rax+8]
+
+ movq xmm3, QWORD PTR [rdi+rdx+8]
+ lea rsi, [rsi+rax*2]
+
+ lea rdi, [rdi+rdx*2]
+ punpcklbw xmm4, xmm6
+
+ punpcklbw xmm5, xmm3
+ psadbw xmm4, xmm5
+
+ paddw xmm7, xmm0
+ paddw xmm7, xmm4
+
+ cmp rsi, rcx
+ jne x16x16sad_wmt_loop
+
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd rax, xmm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;unsigned int vp8_sad8x16_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int max_err)
+global sym(vp8_sad8x16_wmt)
+sym(vp8_sad8x16_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rbx*8]
+
+ lea rcx, [rcx+rbx*8]
+ pxor mm7, mm7
+
+x8x16sad_wmt_loop:
+
+ movd rax, mm7
+ cmp rax, arg(4)
+ jg x8x16sad_wmt_early_exit
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm1, QWORD PTR [rdi]
+
+ movq mm2, QWORD PTR [rsi+rbx]
+ movq mm3, QWORD PTR [rdi+rdx]
+
+ psadbw mm0, mm1
+ psadbw mm2, mm3
+
+ lea rsi, [rsi+rbx*2]
+ lea rdi, [rdi+rdx*2]
+
+ paddw mm7, mm0
+ paddw mm7, mm2
+
+ cmp rsi, rcx
+ jne x8x16sad_wmt_loop
+
+ movd rax, mm7
+
+x8x16sad_wmt_early_exit:
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad8x8_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+global sym(vp8_sad8x8_wmt)
+sym(vp8_sad8x8_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rbx*8]
+ pxor mm7, mm7
+
+x8x8sad_wmt_loop:
+
+ movd rax, mm7
+ cmp rax, arg(4)
+ jg x8x8sad_wmt_early_exit
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm1, QWORD PTR [rdi]
+
+ psadbw mm0, mm1
+ lea rsi, [rsi+rbx]
+
+ add rdi, rdx
+ paddw mm7, mm0
+
+ cmp rsi, rcx
+ jne x8x8sad_wmt_loop
+
+ movd rax, mm7
+x8x8sad_wmt_early_exit:
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;unsigned int vp8_sad4x4_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+global sym(vp8_sad4x4_wmt)
+sym(vp8_sad4x4_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ movd mm0, QWORD PTR [rsi]
+ movd mm1, QWORD PTR [rdi]
+
+ movd mm2, QWORD PTR [rsi+rax]
+ movd mm3, QWORD PTR [rdi+rdx]
+
+ punpcklbw mm0, mm2
+ punpcklbw mm1, mm3
+
+ psadbw mm0, mm1
+ lea rsi, [rsi+rax*2]
+
+ lea rdi, [rdi+rdx*2]
+ movd mm4, QWORD PTR [rsi]
+
+ movd mm5, QWORD PTR [rdi]
+ movd mm6, QWORD PTR [rsi+rax]
+
+ movd mm7, QWORD PTR [rdi+rdx]
+ punpcklbw mm4, mm6
+
+ punpcklbw mm5, mm7
+ psadbw mm4, mm5
+
+ paddw mm0, mm4
+ movd rax, mm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad16x8_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+global sym(vp8_sad16x8_wmt)
+sym(vp8_sad16x8_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rbx*8]
+ pxor mm7, mm7
+
+x16x8sad_wmt_loop:
+
+ movd rax, mm7
+ cmp rax, arg(4)
+ jg x16x8sad_wmt_early_exit
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm2, QWORD PTR [rsi+8]
+
+ movq mm1, QWORD PTR [rdi]
+ movq mm3, QWORD PTR [rdi+8]
+
+ movq mm4, QWORD PTR [rsi+rbx]
+ movq mm5, QWORD PTR [rdi+rdx]
+
+ psadbw mm0, mm1
+ psadbw mm2, mm3
+
+ movq mm1, QWORD PTR [rsi+rbx+8]
+ movq mm3, QWORD PTR [rdi+rdx+8]
+
+ psadbw mm4, mm5
+ psadbw mm1, mm3
+
+ lea rsi, [rsi+rbx*2]
+ lea rdi, [rdi+rdx*2]
+
+ paddw mm0, mm2
+ paddw mm4, mm1
+
+ paddw mm7, mm0
+ paddw mm7, mm4
+
+ cmp rsi, rcx
+ jne x16x8sad_wmt_loop
+
+ movd rax, mm7
+
+x16x8sad_wmt_early_exit:
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ UNSHADOW_ARGS
+ pop rbp
+ ret
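
The 16x16 routine here uses psadbw on full rows, while the 8x16, 8x8 and 16x8 routines additionally take a fifth max_err argument and compare the running SAD against it at the top of each loop pass, returning the partial sum as soon as the bound is exceeded. In scalar terms the early exit looks like the following sketch (illustrative names only).

/* Sketch of the early-exit SAD used by the *_wmt routines that take max_err.
 * Once the partial SAD exceeds the bound, the exact total no longer matters
 * to the caller, so the routine returns immediately. */
static unsigned int sad_early_exit_sketch(const unsigned char *src_ptr, int src_stride,
                                          const unsigned char *ref_ptr, int ref_stride,
                                          int width, int height, int max_err)
{
    unsigned int sad = 0;
    int r, c;

    for (r = 0; r < height; r++)
    {
        if ((int)sad > max_err)
            return sad;                       /* partial value, already too large */

        for (c = 0; c < width; c++)
        {
            int diff = src_ptr[c] - ref_ptr[c];
            sad += (unsigned int)(diff < 0 ? -diff : diff);
        }
        src_ptr += src_stride;
        ref_ptr += ref_stride;
    }
    return sad;
}
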
diff --git a/vp8/encoder/x86/sad_sse3.asm b/vp8/encoder/x86/sad_sse3.asm
new file mode 100644
index 000000000..38cc02957
--- /dev/null
+++ b/vp8/encoder/x86/sad_sse3.asm
@@ -0,0 +1,939 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%idefine QWORD
+
+%macro PROCESS_16X2X3 1
+%if %1
+ movdqa xmm0, [rsi]
+ lddqu xmm5, [rdi]
+ lddqu xmm6, [rdi+1]
+ lddqu xmm7, [rdi+2]
+
+ psadbw xmm5, xmm0
+ psadbw xmm6, xmm0
+ psadbw xmm7, xmm0
+%else
+ movdqa xmm0, [rsi]
+ lddqu xmm1, [rdi]
+ lddqu xmm2, [rdi+1]
+ lddqu xmm3, [rdi+2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endif
+ movdqa xmm0, QWORD PTR [rsi+rax]
+ lddqu xmm1, QWORD PTR [rdi+rdx]
+ lddqu xmm2, QWORD PTR [rdi+rdx+1]
+ lddqu xmm3, QWORD PTR [rdi+rdx+2]
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endmacro
+
+%macro PROCESS_8X2X3 1
+%if %1
+ movq mm0, [rsi]
+ movq mm5, [rdi]
+ movq mm6, [rdi+1]
+ movq mm7, [rdi+2]
+
+ psadbw mm5, mm0
+ psadbw mm6, mm0
+ psadbw mm7, mm0
+%else
+ movq mm0, [rsi]
+ movq mm1, [rdi]
+ movq mm2, [rdi+1]
+ movq mm3, [rdi+2]
+
+ psadbw mm1, mm0
+ psadbw mm2, mm0
+ psadbw mm3, mm0
+
+ paddw mm5, mm1
+ paddw mm6, mm2
+ paddw mm7, mm3
+%endif
+ movq mm0, QWORD PTR [rsi+rax]
+ movq mm1, QWORD PTR [rdi+rdx]
+ movq mm2, QWORD PTR [rdi+rdx+1]
+ movq mm3, QWORD PTR [rdi+rdx+2]
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ psadbw mm1, mm0
+ psadbw mm2, mm0
+ psadbw mm3, mm0
+
+ paddw mm5, mm1
+ paddw mm6, mm2
+ paddw mm7, mm3
+%endmacro
+
+%macro LOAD_X4_ADDRESSES 5
+ mov %2, [%1+REG_SZ_BYTES*0]
+ mov %3, [%1+REG_SZ_BYTES*1]
+
+ mov %4, [%1+REG_SZ_BYTES*2]
+ mov %5, [%1+REG_SZ_BYTES*3]
+%endmacro
+
+%macro PROCESS_16X2X4 1
+%if %1
+ movdqa xmm0, [rsi]
+ lddqu xmm4, [rcx]
+ lddqu xmm5, [rdx]
+ lddqu xmm6, [rbx]
+ lddqu xmm7, [rdi]
+
+ psadbw xmm4, xmm0
+ psadbw xmm5, xmm0
+ psadbw xmm6, xmm0
+ psadbw xmm7, xmm0
+%else
+ movdqa xmm0, [rsi]
+ lddqu xmm1, [rcx]
+ lddqu xmm2, [rdx]
+ lddqu xmm3, [rbx]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm4, xmm1
+ lddqu xmm1, [rdi]
+ paddw xmm5, xmm2
+ paddw xmm6, xmm3
+
+ psadbw xmm1, xmm0
+ paddw xmm7, xmm1
+%endif
+ movdqa xmm0, QWORD PTR [rsi+rax]
+ lddqu xmm1, QWORD PTR [rcx+rbp]
+ lddqu xmm2, QWORD PTR [rdx+rbp]
+ lddqu xmm3, QWORD PTR [rbx+rbp]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm4, xmm1
+ lddqu xmm1, QWORD PTR [rdi+rbp]
+ paddw xmm5, xmm2
+ paddw xmm6, xmm3
+
+ lea rsi, [rsi+rax*2]
+ lea rcx, [rcx+rbp*2]
+
+ lea rdx, [rdx+rbp*2]
+ lea rbx, [rbx+rbp*2]
+
+ lea rdi, [rdi+rbp*2]
+
+ psadbw xmm1, xmm0
+ paddw xmm7, xmm1
+
+%endmacro
+
+%macro PROCESS_8X2X4 1
+%if %1
+ movq mm0, [rsi]
+ movq mm4, [rcx]
+ movq mm5, [rdx]
+ movq mm6, [rbx]
+ movq mm7, [rdi]
+
+ psadbw mm4, mm0
+ psadbw mm5, mm0
+ psadbw mm6, mm0
+ psadbw mm7, mm0
+%else
+ movq mm0, [rsi]
+ movq mm1, [rcx]
+ movq mm2, [rdx]
+ movq mm3, [rbx]
+
+ psadbw mm1, mm0
+ psadbw mm2, mm0
+ psadbw mm3, mm0
+
+ paddw mm4, mm1
+ movq mm1, [rdi]
+ paddw mm5, mm2
+ paddw mm6, mm3
+
+ psadbw mm1, mm0
+ paddw mm7, mm1
+%endif
+ movq mm0, QWORD PTR [rsi+rax]
+ movq mm1, QWORD PTR [rcx+rbp]
+ movq mm2, QWORD PTR [rdx+rbp]
+ movq mm3, QWORD PTR [rbx+rbp]
+
+ psadbw mm1, mm0
+ psadbw mm2, mm0
+ psadbw mm3, mm0
+
+ paddw mm4, mm1
+ movq mm1, QWORD PTR [rdi+rbp]
+ paddw mm5, mm2
+ paddw mm6, mm3
+
+ lea rsi, [rsi+rax*2]
+ lea rcx, [rcx+rbp*2]
+
+ lea rdx, [rdx+rbp*2]
+ lea rbx, [rbx+rbp*2]
+
+ lea rdi, [rdi+rbp*2]
+
+ psadbw mm1, mm0
+ paddw mm7, mm1
+
+%endmacro
+
+;void vp8_sad16x16x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad16x16x3_sse3)
+sym(vp8_sad16x16x3_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_16X2X3 1
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+
+ mov rdi, arg(4) ;Results
+
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rdi], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rdi+4], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rdi+8], xmm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_sad16x8x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad16x8x3_sse3)
+sym(vp8_sad16x8x3_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_16X2X3 1
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+
+ mov rdi, arg(4) ;Results
+
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rdi], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rdi+4], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rdi+8], xmm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_sad8x16x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad8x16x3_sse3)
+sym(vp8_sad8x16x3_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_8X2X3 1
+ PROCESS_8X2X3 0
+ PROCESS_8X2X3 0
+ PROCESS_8X2X3 0
+ PROCESS_8X2X3 0
+ PROCESS_8X2X3 0
+ PROCESS_8X2X3 0
+ PROCESS_8X2X3 0
+
+ mov rdi, arg(4) ;Results
+
+ movd [rdi], mm5
+ movd [rdi+4], mm6
+ movd [rdi+8], mm7
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_sad8x8x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad8x8x3_sse3)
+sym(vp8_sad8x8x3_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_8X2X3 1
+ PROCESS_8X2X3 0
+ PROCESS_8X2X3 0
+ PROCESS_8X2X3 0
+
+ mov rdi, arg(4) ;Results
+
+ movd [rdi], mm5
+ movd [rdi+4], mm6
+ movd [rdi+8], mm7
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_sad4x4x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad4x4x3_sse3)
+sym(vp8_sad4x4x3_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ movd mm0, QWORD PTR [rsi]
+ movd mm1, QWORD PTR [rdi]
+
+ movd mm2, QWORD PTR [rsi+rax]
+ movd mm3, QWORD PTR [rdi+rdx]
+
+ punpcklbw mm0, mm2
+ punpcklbw mm1, mm3
+
+ movd mm4, QWORD PTR [rdi+1]
+ movd mm5, QWORD PTR [rdi+2]
+
+ movd mm2, QWORD PTR [rdi+rdx+1]
+ movd mm3, QWORD PTR [rdi+rdx+2]
+
+ psadbw mm1, mm0
+
+ punpcklbw mm4, mm2
+ punpcklbw mm5, mm3
+
+ psadbw mm4, mm0
+ psadbw mm5, mm0
+
+
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ movd mm0, QWORD PTR [rsi]
+ movd mm2, QWORD PTR [rdi]
+
+ movd mm3, QWORD PTR [rsi+rax]
+ movd mm6, QWORD PTR [rdi+rdx]
+
+ punpcklbw mm0, mm3
+ punpcklbw mm2, mm6
+
+ movd mm3, QWORD PTR [rdi+1]
+ movd mm7, QWORD PTR [rdi+2]
+
+ psadbw mm2, mm0
+
+ paddw mm1, mm2
+
+ movd mm2, QWORD PTR [rdi+rdx+1]
+ movd mm6, QWORD PTR [rdi+rdx+2]
+
+ punpcklbw mm3, mm2
+ punpcklbw mm7, mm6
+
+ psadbw mm3, mm0
+ psadbw mm7, mm0
+
+ paddw mm3, mm4
+ paddw mm7, mm5
+
+ mov rdi, arg(4) ;Results
+ movd [rdi], mm1
+
+ movd [rdi+4], mm3
+ movd [rdi+8], mm7
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;unsigned int vp8_sad16x16_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int max_err)
+;%define lddqu movdqu
+global sym(vp8_sad16x16_sse3)
+sym(vp8_sad16x16_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rbx*8]
+
+ lea rcx, [rcx+rbx*8]
+ pxor mm7, mm7
+
+vp8_sad16x16_sse3_loop:
+
+ movd rax, mm7
+ cmp rax, arg(4)
+ jg vp8_sad16x16_early_exit
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm2, QWORD PTR [rsi+8]
+
+ movq mm1, QWORD PTR [rdi]
+ movq mm3, QWORD PTR [rdi+8]
+
+ movq mm4, QWORD PTR [rsi+rbx]
+ movq mm5, QWORD PTR [rdi+rdx]
+
+ psadbw mm0, mm1
+ psadbw mm2, mm3
+
+ movq mm1, QWORD PTR [rsi+rbx+8]
+ movq mm3, QWORD PTR [rdi+rdx+8]
+
+ psadbw mm4, mm5
+ psadbw mm1, mm3
+
+ lea rsi, [rsi+rbx*2]
+ lea rdi, [rdi+rdx*2]
+
+ paddw mm0, mm2
+ paddw mm4, mm1
+
+ paddw mm7, mm0
+ paddw mm7, mm4
+
+ cmp rsi, rcx
+ jne vp8_sad16x16_sse3_loop
+
+ movd rax, mm7
+
+vp8_sad16x16_early_exit:
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_sad16x16x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr_base,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad16x16x4d_sse3)
+sym(vp8_sad16x16x4d_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ push rbp
+ mov rdi, arg(2) ; ref_ptr_base
+
+ LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
+
+ mov rsi, arg(0) ;src_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rbp, dword ptr arg(3) ;ref_stride
+
+ xchg rbx, rax
+
+ PROCESS_16X2X4 1
+ PROCESS_16X2X4 0
+ PROCESS_16X2X4 0
+ PROCESS_16X2X4 0
+ PROCESS_16X2X4 0
+ PROCESS_16X2X4 0
+ PROCESS_16X2X4 0
+ PROCESS_16X2X4 0
+
+ pop rbp
+ mov rdi, arg(4) ;Results
+
+ movq xmm0, xmm4
+ psrldq xmm4, 8
+
+ paddw xmm0, xmm4
+ movd [rdi], xmm0
+;-
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rdi+4], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rdi+8], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rdi+12], xmm0
+
+ ; begin epilog
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_sad16x8x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr_base,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad16x8x4d_sse3)
+sym(vp8_sad16x8x4d_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ push rbp
+ mov rdi, arg(2) ; ref_ptr_base
+
+ LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
+
+ mov rsi, arg(0) ;src_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rbp, dword ptr arg(3) ;ref_stride
+
+ xchg rbx, rax
+
+ PROCESS_16X2X4 1
+ PROCESS_16X2X4 0
+ PROCESS_16X2X4 0
+ PROCESS_16X2X4 0
+
+ pop rbp
+ mov rdi, arg(4) ;Results
+
+ movq xmm0, xmm4
+ psrldq xmm4, 8
+
+ paddw xmm0, xmm4
+ movd [rdi], xmm0
+;-
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rdi+4], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rdi+8], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rdi+12], xmm0
+
+ ; begin epilog
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_sad8x16x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad8x16x4d_sse3)
+sym(vp8_sad8x16x4d_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ push rbp
+ mov rdi, arg(2) ; ref_ptr_base
+
+ LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
+
+ mov rsi, arg(0) ;src_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rbp, dword ptr arg(3) ;ref_stride
+
+ xchg rbx, rax
+
+ PROCESS_8X2X4 1
+ PROCESS_8X2X4 0
+ PROCESS_8X2X4 0
+ PROCESS_8X2X4 0
+ PROCESS_8X2X4 0
+ PROCESS_8X2X4 0
+ PROCESS_8X2X4 0
+ PROCESS_8X2X4 0
+
+ pop rbp
+ mov rdi, arg(4) ;Results
+
+ movd [rdi], mm4
+ movd [rdi+4], mm5
+ movd [rdi+8], mm6
+ movd [rdi+12], mm7
+
+ ; begin epilog
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_sad8x8x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad8x8x4d_sse3)
+sym(vp8_sad8x8x4d_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ push rbp
+ mov rdi, arg(2) ; ref_ptr_base
+
+ LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
+
+ mov rsi, arg(0) ;src_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rbp, dword ptr arg(3) ;ref_stride
+
+ xchg rbx, rax
+
+ PROCESS_8X2X4 1
+ PROCESS_8X2X4 0
+ PROCESS_8X2X4 0
+ PROCESS_8X2X4 0
+
+ pop rbp
+ mov rdi, arg(4) ;Results
+
+ movd [rdi], mm4
+ movd [rdi+4], mm5
+ movd [rdi+8], mm6
+ movd [rdi+12], mm7
+
+ ; begin epilog
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_sad4x4x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad4x4x4d_sse3)
+sym(vp8_sad4x4x4d_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ push rbp
+ mov rdi, arg(2) ; ref_ptr_base
+
+ LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
+
+ mov rsi, arg(0) ;src_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rbp, dword ptr arg(3) ;ref_stride
+
+ xchg rbx, rax
+
+ movd mm0, QWORD PTR [rsi]
+ movd mm1, QWORD PTR [rcx]
+
+ movd mm2, QWORD PTR [rsi+rax]
+ movd mm3, QWORD PTR [rcx+rbp]
+
+ punpcklbw mm0, mm2
+ punpcklbw mm1, mm3
+
+ movd mm4, QWORD PTR [rdx]
+ movd mm5, QWORD PTR [rbx]
+
+ movd mm6, QWORD PTR [rdi]
+ movd mm2, QWORD PTR [rdx+rbp]
+
+ movd mm3, QWORD PTR [rbx+rbp]
+ movd mm7, QWORD PTR [rdi+rbp]
+
+ psadbw mm1, mm0
+
+ punpcklbw mm4, mm2
+ punpcklbw mm5, mm3
+
+ punpcklbw mm6, mm7
+ psadbw mm4, mm0
+
+ psadbw mm5, mm0
+ psadbw mm6, mm0
+
+
+
+ lea rsi, [rsi+rax*2]
+ lea rcx, [rcx+rbp*2]
+
+ lea rdx, [rdx+rbp*2]
+ lea rbx, [rbx+rbp*2]
+
+ lea rdi, [rdi+rbp*2]
+
+ movd mm0, QWORD PTR [rsi]
+ movd mm2, QWORD PTR [rcx]
+
+ movd mm3, QWORD PTR [rsi+rax]
+ movd mm7, QWORD PTR [rcx+rbp]
+
+ punpcklbw mm0, mm3
+ punpcklbw mm2, mm7
+
+ movd mm3, QWORD PTR [rdx]
+ movd mm7, QWORD PTR [rbx]
+
+ psadbw mm2, mm0
+ mov rax, rbp
+
+ pop rbp
+ mov rsi, arg(4) ;Results
+
+ paddw mm1, mm2
+ movd [rsi], mm1
+
+ movd mm2, QWORD PTR [rdx+rax]
+ movd mm1, QWORD PTR [rbx+rax]
+
+ punpcklbw mm3, mm2
+ punpcklbw mm7, mm1
+
+ psadbw mm3, mm0
+ psadbw mm7, mm0
+
+ movd mm2, QWORD PTR [rdi]
+ movd mm1, QWORD PTR [rdi+rax]
+
+ paddw mm3, mm4
+ paddw mm7, mm5
+
+ movd [rsi+4], mm3
+ punpcklbw mm2, mm1
+
+ movd [rsi+8], mm7
+ psadbw mm2, mm0
+
+ paddw mm2, mm6
+ movd [rsi+12], mm2
+
+
+ ; begin epilog
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
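
The x3 entry points in this file score three horizontally adjacent reference candidates (ref_ptr, ref_ptr + 1, ref_ptr + 2) in one pass, and the x4d entry points score four independent candidates whose pointers are read from ref_ptr_base. Their behaviour matches the sketch below, which reuses the sad_mxn_sketch routine shown after sad_mmx.asm; all names are illustrative.

/* Defined in the earlier sketch after sad_mmx.asm. */
unsigned int sad_mxn_sketch(const unsigned char *src_ptr, int src_stride,
                            const unsigned char *ref_ptr, int ref_stride,
                            int width, int height);

/* Three neighbouring candidates in the same row: ref, ref+1, ref+2. */
static void sad_x3_sketch(const unsigned char *src_ptr, int src_stride,
                          const unsigned char *ref_ptr, int ref_stride,
                          int width, int height, int *results)
{
    int i;
    for (i = 0; i < 3; i++)
        results[i] = (int)sad_mxn_sketch(src_ptr, src_stride,
                                         ref_ptr + i, ref_stride, width, height);
}

/* Four independent candidate pointers, as loaded by LOAD_X4_ADDRESSES. */
static void sad_x4d_sketch(const unsigned char *src_ptr, int src_stride,
                           unsigned char *const ref_ptr_base[4], int ref_stride,
                           int width, int height, int *results)
{
    int i;
    for (i = 0; i < 4; i++)
        results[i] = (int)sad_mxn_sketch(src_ptr, src_stride,
                                         ref_ptr_base[i], ref_stride, width, height);
}
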
diff --git a/vp8/encoder/x86/sad_ssse3.asm b/vp8/encoder/x86/sad_ssse3.asm
new file mode 100644
index 000000000..1bb956121
--- /dev/null
+++ b/vp8/encoder/x86/sad_ssse3.asm
@@ -0,0 +1,367 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%idefine QWORD
+
+%macro PROCESS_16X2X3 1
+%if %1
+ movdqa xmm0, [rsi]
+ lddqu xmm5, [rdi]
+ lddqu xmm6, [rdi+1]
+ lddqu xmm7, [rdi+2]
+
+ psadbw xmm5, xmm0
+ psadbw xmm6, xmm0
+ psadbw xmm7, xmm0
+%else
+ movdqa xmm0, [rsi]
+ lddqu xmm1, [rdi]
+ lddqu xmm2, [rdi+1]
+ lddqu xmm3, [rdi+2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endif
+ movdqa xmm0, QWORD PTR [rsi+rax]
+ lddqu xmm1, QWORD PTR [rdi+rdx]
+ lddqu xmm2, QWORD PTR [rdi+rdx+1]
+ lddqu xmm3, QWORD PTR [rdi+rdx+2]
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endmacro
+
+%macro PROCESS_16X2X3_OFFSET 2
+%if %1
+ movdqa xmm0, [rsi]
+ movdqa xmm4, [rdi]
+ movdqa xmm7, [rdi+16]
+
+ movdqa xmm5, xmm7
+ palignr xmm5, xmm4, %2
+
+ movdqa xmm6, xmm7
+ palignr xmm6, xmm4, (%2+1)
+
+ palignr xmm7, xmm4, (%2+2)
+
+ psadbw xmm5, xmm0
+ psadbw xmm6, xmm0
+ psadbw xmm7, xmm0
+%else
+ movdqa xmm0, [rsi]
+ movdqa xmm4, [rdi]
+ movdqa xmm3, [rdi+16]
+
+ movdqa xmm1, xmm3
+ palignr xmm1, xmm4, %2
+
+ movdqa xmm2, xmm3
+ palignr xmm2, xmm4, (%2+1)
+
+ palignr xmm3, xmm4, (%2+2)
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endif
+ movdqa xmm0, QWORD PTR [rsi+rax]
+ movdqa xmm4, QWORD PTR [rdi+rdx]
+ movdqa xmm3, QWORD PTR [rdi+rdx+16]
+
+ movdqa xmm1, xmm3
+ palignr xmm1, xmm4, %2
+
+ movdqa xmm2, xmm3
+ palignr xmm2, xmm4, (%2+1)
+
+ palignr xmm3, xmm4, (%2+2)
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endmacro
+
+%macro PROCESS_16X16X3_OFFSET 2
+%2_aligned_by_%1:
+
+ sub rdi, %1
+
+ PROCESS_16X2X3_OFFSET 1, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+
+ jmp %2_store_off
+
+%endmacro
+
+%macro PROCESS_16X8X3_OFFSET 2
+%2_aligned_by_%1:
+
+ sub rdi, %1
+
+ PROCESS_16X2X3_OFFSET 1, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+
+ jmp %2_store_off
+
+%endmacro
+
+;void vp8_sad16x16x3_ssse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad16x16x3_ssse3)
+sym(vp8_sad16x16x3_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ push rcx
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ mov rdx, 0xf
+ and rdx, rdi
+
+ jmp vp8_sad16x16x3_ssse3_skiptable
+vp8_sad16x16x3_ssse3_jumptable:
+ dd vp8_sad16x16x3_ssse3_aligned_by_0 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_1 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_2 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_3 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_4 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_5 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_6 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_7 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_8 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_9 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_10 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_11 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_12 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_13 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_14 - vp8_sad16x16x3_ssse3_do_jump
+ dd vp8_sad16x16x3_ssse3_aligned_by_15 - vp8_sad16x16x3_ssse3_do_jump
+vp8_sad16x16x3_ssse3_skiptable:
+
+ call vp8_sad16x16x3_ssse3_do_jump
+vp8_sad16x16x3_ssse3_do_jump:
+ pop rcx ; get the address of do_jump
+ mov rax, vp8_sad16x16x3_ssse3_jumptable - vp8_sad16x16x3_ssse3_do_jump
+ add rax, rcx ; get the absolute address of vp8_sad16x16x3_ssse3_jumptable
+
+ movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
+ add rcx, rax
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ jmp rcx
+
+ PROCESS_16X16X3_OFFSET 0, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 1, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 2, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 3, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 4, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 5, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 6, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 7, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 8, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 9, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 10, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 11, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 12, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 13, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 14, vp8_sad16x16x3_ssse3
+
+vp8_sad16x16x3_ssse3_aligned_by_15:
+ PROCESS_16X2X3 1
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+
+vp8_sad16x16x3_ssse3_store_off:
+ mov rdi, arg(4) ;Results
+
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rdi], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rdi+4], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rdi+8], xmm0
+
+ ; begin epilog
+ pop rcx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_sad16x8x3_ssse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad16x8x3_ssse3)
+sym(vp8_sad16x8x3_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ push rcx
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ mov rdx, 0xf
+ and rdx, rdi
+
+ jmp vp8_sad16x8x3_ssse3_skiptable
+vp8_sad16x8x3_ssse3_jumptable:
+ dd vp8_sad16x8x3_ssse3_aligned_by_0 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_1 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_2 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_3 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_4 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_5 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_6 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_7 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_8 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_9 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_10 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_11 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_12 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_13 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_14 - vp8_sad16x8x3_ssse3_do_jump
+ dd vp8_sad16x8x3_ssse3_aligned_by_15 - vp8_sad16x8x3_ssse3_do_jump
+vp8_sad16x8x3_ssse3_skiptable:
+
+ call vp8_sad16x8x3_ssse3_do_jump
+vp8_sad16x8x3_ssse3_do_jump:
+ pop rcx ; get the address of do_jump
+ mov rax, vp8_sad16x8x3_ssse3_jumptable - vp8_sad16x8x3_ssse3_do_jump
+ add rax, rcx ; get the absolute address of vp8_sad16x8x3_ssse3_jumptable
+
+ movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
+ add rcx, rax
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ jmp rcx
+
+ PROCESS_16X8X3_OFFSET 0, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 1, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 2, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 3, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 4, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 5, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 6, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 7, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 8, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 9, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 10, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 11, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 12, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 13, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 14, vp8_sad16x8x3_ssse3
+
+vp8_sad16x8x3_ssse3_aligned_by_15:
+
+ PROCESS_16X2X3 1
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+
+vp8_sad16x8x3_ssse3_store_off:
+ mov rdi, arg(4) ;Results
+
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rdi], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rdi+4], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rdi+8], xmm0
+
+ ; begin epilog
+ pop rcx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
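
These SSSE3 versions avoid unaligned lddqu loads: the entry code measures ref_ptr & 15 and, through a position-independent jump table, branches to a variant built around two aligned loads plus palignr with the matching shift. Conceptually the dispatch behaves like the sketch below, where aligned_by[] is a hypothetical table of per-alignment workers standing in for the aligned_by_0 .. aligned_by_15 labels generated by the macros above.

#include <stdint.h>     /* uintptr_t */

typedef void (*sad16x3_fn)(const unsigned char *src_ptr, int src_stride,
                           const unsigned char *ref_ptr, int ref_stride,
                           int *results);

/* Pick a worker from the low four bits of the reference pointer, the same
 * quantity "mov rdx, 0xf / and rdx, rdi" computes in the assembly. */
static void sad16x16x3_dispatch_sketch(const unsigned char *src_ptr, int src_stride,
                                       const unsigned char *ref_ptr, int ref_stride,
                                       int *results,
                                       const sad16x3_fn aligned_by[16])
{
    aligned_by[(uintptr_t)ref_ptr & 15](src_ptr, src_stride,
                                        ref_ptr, ref_stride, results);
}
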
diff --git a/vp8/encoder/x86/subtract_mmx.asm b/vp8/encoder/x86/subtract_mmx.asm
new file mode 100644
index 000000000..ce3e61066
--- /dev/null
+++ b/vp8/encoder/x86/subtract_mmx.asm
@@ -0,0 +1,431 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
+; unsigned short *diff, unsigned char *Predictor,
+; int pitch);
+global sym(vp8_subtract_b_mmx_impl)
+sym(vp8_subtract_b_mmx_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rdi, arg(2) ;diff
+ mov rax, arg(3) ;Predictor
+ mov rsi, arg(0) ;z
+ movsxd rdx, dword ptr arg(1);src_stride;
+ movsxd rcx, dword ptr arg(4);pitch
+ pxor mm7, mm7
+
+ movd mm0, [rsi]
+ movd mm1, [rax]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq [rdi], mm0
+
+
+ movd mm0, [rsi+rdx]
+ movd mm1, [rax+rcx]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq [rdi+rcx*2],mm0
+
+
+ movd mm0, [rsi+rdx*2]
+ movd mm1, [rax+rcx*2]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq [rdi+rcx*4], mm0
+
+ lea rsi, [rsi+rdx*2]
+ lea rcx, [rcx+rcx*2]
+
+
+
+ movd mm0, [rsi+rdx]
+ movd mm1, [rax+rcx]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq [rdi+rcx*2], mm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride)
+global sym(vp8_subtract_mby_mmx)
+sym(vp8_subtract_mby_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(1) ;src
+ mov rdi, arg(0) ;diff
+
+ mov rax, arg(2) ;pred
+ movsxd rdx, dword ptr arg(3) ;stride
+
+ mov rcx, 16
+ pxor mm0, mm0
+
+submby_loop:
+
+ movq mm1, [rsi]
+ movq mm3, [rax]
+
+ movq mm2, mm1
+ movq mm4, mm3
+
+ punpcklbw mm1, mm0
+ punpcklbw mm3, mm0
+
+ punpckhbw mm2, mm0
+ punpckhbw mm4, mm0
+
+ psubw mm1, mm3
+ psubw mm2, mm4
+
+ movq [rdi], mm1
+ movq [rdi+8], mm2
+
+
+ movq mm1, [rsi+8]
+ movq mm3, [rax+8]
+
+ movq mm2, mm1
+ movq mm4, mm3
+
+ punpcklbw mm1, mm0
+ punpcklbw mm3, mm0
+
+ punpckhbw mm2, mm0
+ punpckhbw mm4, mm0
+
+ psubw mm1, mm3
+ psubw mm2, mm4
+
+ movq [rdi+16], mm1
+ movq [rdi+24], mm2
+
+
+ add rdi, 32
+ add rax, 16
+
+ lea rsi, [rsi+rdx]
+
+ sub rcx, 1
+ jnz submby_loop
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
+global sym(vp8_subtract_mbuv_mmx)
+sym(vp8_subtract_mbuv_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;short *udiff = diff + 256;
+ ;short *vdiff = diff + 320;
+ ;unsigned char *upred = pred + 256;
+ ;unsigned char *vpred = pred + 320;
+
+ ;unsigned char *z = usrc;
+ ;unsigned short *diff = udiff;
+ ;unsigned char *Predictor= upred;
+
+ mov rdi, arg(0) ;diff
+ mov rax, arg(3) ;pred
+ mov rsi, arg(1) ;z = usrc
+ add rdi, 256*2 ;diff = diff + 256 (shorts)
+ add rax, 256 ;Predictor = pred + 256
+ movsxd rdx, dword ptr arg(4) ;stride;
+ pxor mm7, mm7
+
+ movq mm0, [rsi]
+ movq mm1, [rax]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+
+ movq mm0, [rsi+rdx]
+ movq mm1, [rax+8]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi+16], mm0
+ movq [rdi+24], mm3
+
+ movq mm0, [rsi+rdx*2]
+ movq mm1, [rax+16]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi+32], mm0
+ movq [rdi+40], mm3
+ lea rsi, [rsi+rdx*2]
+
+
+ movq mm0, [rsi+rdx]
+ movq mm1, [rax+24]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+
+ movq [rdi+48], mm0
+ movq [rdi+56], mm3
+
+
+ add rdi, 64
+ add rax, 32
+ lea rsi, [rsi+rdx*2]
+
+
+ movq mm0, [rsi]
+ movq mm1, [rax]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+
+ movq mm0, [rsi+rdx]
+ movq mm1, [rax+8]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi+16], mm0
+ movq [rdi+24], mm3
+
+ movq mm0, [rsi+rdx*2]
+ movq mm1, [rax+16]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi+32], mm0
+ movq [rdi+40], mm3
+ lea rsi, [rsi+rdx*2]
+
+
+ movq mm0, [rsi+rdx]
+ movq mm1, [rax+24]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+
+ movq [rdi+48], mm0
+ movq [rdi+56], mm3
+
+ ;unsigned char *z = vsrc;
+ ;unsigned short *diff = vdiff;
+ ;unsigned char *Predictor= vpred;
+
+ mov rdi, arg(0) ;diff
+ mov rax, arg(3) ;pred
+        mov     rsi,        arg(2) ;z = vsrc
+ add rdi, 320*2 ;diff = diff + 320 (shorts)
+ add rax, 320 ;Predictor = pred + 320
+ movsxd rdx, dword ptr arg(4) ;stride;
+ pxor mm7, mm7
+
+ movq mm0, [rsi]
+ movq mm1, [rax]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+
+ movq mm0, [rsi+rdx]
+ movq mm1, [rax+8]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi+16], mm0
+ movq [rdi+24], mm3
+
+ movq mm0, [rsi+rdx*2]
+ movq mm1, [rax+16]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi+32], mm0
+ movq [rdi+40], mm3
+ lea rsi, [rsi+rdx*2]
+
+
+ movq mm0, [rsi+rdx]
+ movq mm1, [rax+24]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+
+ movq [rdi+48], mm0
+ movq [rdi+56], mm3
+
+
+ add rdi, 64
+ add rax, 32
+ lea rsi, [rsi+rdx*2]
+
+
+ movq mm0, [rsi]
+ movq mm1, [rax]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+
+ movq mm0, [rsi+rdx]
+ movq mm1, [rax+8]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi+16], mm0
+ movq [rdi+24], mm3
+
+ movq mm0, [rsi+rdx*2]
+ movq mm1, [rax+16]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi+32], mm0
+ movq [rdi+40], mm3
+ lea rsi, [rsi+rdx*2]
+
+
+ movq mm0, [rsi+rdx]
+ movq mm1, [rax+24]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+
+ movq [rdi+48], mm0
+ movq [rdi+56], mm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
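
All three routines in this file produce a 16-bit residual for every pixel, differing only in block size and in how the prediction buffer is laid out (the chroma predictors sit at offsets 256 and 320, as the commented-out C fragments note). The per-pixel operation is simply the following sketch, shown with a generic prediction stride and illustrative names.

/* Illustrative scalar subtraction: diff = source - prediction, widened to
 * 16 bits per sample. */
static void subtract_block_sketch(short *diff, const unsigned char *src, int src_stride,
                                  const unsigned char *pred, int pred_stride,
                                  int width, int height)
{
    int r, c;

    for (r = 0; r < height; r++)
    {
        for (c = 0; c < width; c++)
            diff[c] = (short)(src[c] - pred[c]);

        diff += width;
        src  += src_stride;
        pred += pred_stride;
    }
}
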
diff --git a/vp8/encoder/x86/variance_impl_mmx.asm b/vp8/encoder/x86/variance_impl_mmx.asm
new file mode 100644
index 000000000..d0da82ad4
--- /dev/null
+++ b/vp8/encoder/x86/variance_impl_mmx.asm
@@ -0,0 +1,980 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
+global sym(vp8_get_mb_ss_mmx)
+sym(vp8_get_mb_ss_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 8
+ ; end prolog
+
+ mov rax, arg(0) ;src_ptr
+ mov rcx, 16
+ pxor mm4, mm4
+
+NEXTROW:
+ movq mm0, [rax]
+ movq mm1, [rax+8]
+ movq mm2, [rax+16]
+ movq mm3, [rax+24]
+ pmaddwd mm0, mm0
+ pmaddwd mm1, mm1
+ pmaddwd mm2, mm2
+ pmaddwd mm3, mm3
+
+ paddd mm4, mm0
+ paddd mm4, mm1
+ paddd mm4, mm2
+ paddd mm4, mm3
+
+ add rax, 32
+ dec rcx
+ ja NEXTROW
+ movq QWORD PTR [rsp], mm4
+
+ ;return sum[0]+sum[1];
+ movsxd rax, dword ptr [rsp]
+ movsxd rcx, dword ptr [rsp+4]
+ add rax, rcx
+
+
+ ; begin epilog
+ add rsp, 8
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_get8x8var_mmx
+;(
+; unsigned char *src_ptr,
+; int source_stride,
+; unsigned char *ref_ptr,
+; int recon_stride,
+; unsigned int *SSE,
+; int *Sum
+;)
+global sym(vp8_get8x8var_mmx)
+sym(vp8_get8x8var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ push rbx
+ sub rsp, 16
+ ; end prolog
+
+
+        pxor        mm5, mm5                    ; Blank mm5
+        pxor        mm6, mm6                    ; Blank mm6
+        pxor        mm7, mm7                    ; Blank mm7
+
+ mov rax, arg(0) ;[src_ptr] ; Load base addresses
+ mov rbx, arg(2) ;[ref_ptr]
+ movsxd rcx, dword ptr arg(1) ;[source_stride]
+ movsxd rdx, dword ptr arg(3) ;[recon_stride]
+
+ ; Row 1
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+        punpcklbw       mm1, mm6
+        punpckhbw       mm2, mm6                ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+
+ ; Row 2
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+        punpcklbw       mm1, mm6
+        punpckhbw       mm2, mm6                ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 3
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+        punpcklbw       mm1, mm6
+        punpckhbw       mm2, mm6                ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 4
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+        punpcklbw       mm1, mm6
+        punpckhbw       mm2, mm6                ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 5
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+        punpcklbw       mm1, mm6
+        punpckhbw       mm2, mm6                ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ ; movq mm4, [rbx + rdx]
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 6
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+            punpcklbw       mm0, mm6            ; unpack to higher precision
+            punpcklbw       mm1, mm6
+            punpckhbw       mm2, mm6            ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 7
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+            punpcklbw       mm0, mm6            ; unpack to higher precision
+            punpcklbw       mm1, mm6
+            punpckhbw       mm2, mm6            ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 8
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+            punpcklbw       mm0, mm6            ; unpack to higher precision
+            punpcklbw       mm1, mm6
+            punpckhbw       mm2, mm6            ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Now accumulate the final results.
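+        ; mm5 holds four signed 16-bit partial sums of the differences and mm7
+        ; holds two 32-bit partial sums of squares; spill both to the stack and
+        ; fold the pieces together with scalar adds below.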
+ movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
+ movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
+ movsx rdx, WORD PTR [rsp+8]
+ movsx rcx, WORD PTR [rsp+10]
+ movsx rbx, WORD PTR [rsp+12]
+ movsx rax, WORD PTR [rsp+14]
+ add rdx, rcx
+ add rbx, rax
+ add rdx, rbx ;XSum
+ movsxd rax, DWORD PTR [rsp]
+ movsxd rcx, DWORD PTR [rsp+4]
+ add rax, rcx ;XXSum
+ mov rsi, arg(4) ;SSE
+ mov rdi, arg(5) ;Sum
+ mov dword ptr [rsi], eax
+ mov dword ptr [rdi], edx
+ xor rax, rax ; return 0
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+;unsigned int
+;vp8_get4x4var_mmx
+;(
+; unsigned char *src_ptr,
+; int source_stride,
+; unsigned char *ref_ptr,
+; int recon_stride,
+; unsigned int *SSE,
+; int *Sum
+;)
+global sym(vp8_get4x4var_mmx)
+sym(vp8_get4x4var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ push rbx
+ sub rsp, 16
+ ; end prolog
+
+
+        pxor            mm5, mm5                ; Clear mm5 (difference accumulator)
+        pxor            mm6, mm6                ; Clear mm6 (zero for unpacking)
+        pxor            mm7, mm7                ; Clear mm7 (SSE accumulator)
+
+ mov rax, arg(0) ;[src_ptr] ; Load base addresses
+ mov rbx, arg(2) ;[ref_ptr]
+ movsxd rcx, dword ptr arg(1) ;[source_stride]
+ movsxd rdx, dword ptr arg(3) ;[recon_stride]
+
+ ; Row 1
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+            punpcklbw       mm0, mm6            ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ paddw mm5, mm0 ; accumulate differences in mm5
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+
+ ; Row 2
+ movq mm0, [rax] ; Copy eight bytes to mm0
+            punpcklbw       mm0, mm6            ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ paddw mm5, mm0 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 3
+ movq mm0, [rax] ; Copy eight bytes to mm0
+            punpcklbw       mm0, mm6            ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ paddw mm5, mm0 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 4
+ movq mm0, [rax] ; Copy eight bytes to mm0
+
+            punpcklbw       mm0, mm6            ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ paddd mm7, mm0 ; accumulate in mm7
+
+
+ ; Now accumulate the final results.
+ movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
+ movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
+ movsx rdx, WORD PTR [rsp+8]
+ movsx rcx, WORD PTR [rsp+10]
+ movsx rbx, WORD PTR [rsp+12]
+ movsx rax, WORD PTR [rsp+14]
+ add rdx, rcx
+ add rbx, rax
+ add rdx, rbx ;XSum
+ movsxd rax, DWORD PTR [rsp]
+ movsxd rcx, DWORD PTR [rsp+4]
+ add rax, rcx ;XXSum
+ mov rsi, arg(4) ;SSE
+ mov rdi, arg(5) ;Sum
+ mov dword ptr [rsi], eax
+ mov dword ptr [rdi], edx
+ xor rax, rax ; return 0
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+;unsigned int
+;vp8_get4x4sse_cs_mmx
+;(
+; unsigned char *src_ptr,
+; int source_stride,
+; unsigned char *ref_ptr,
+; int recon_stride
+;)
+global sym(vp8_get4x4sse_cs_mmx)
+sym(vp8_get4x4sse_cs_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+
+        pxor            mm6, mm6                ; Clear mm6 (zero for unpacking)
+        pxor            mm7, mm7                ; Clear mm7 (SSE accumulator)
+
+ mov rax, arg(0) ;[src_ptr] ; Load base addresses
+ mov rbx, arg(2) ;[ref_ptr]
+ movsxd rcx, dword ptr arg(1) ;[source_stride]
+ movsxd rdx, dword ptr arg(3) ;[recon_stride]
+ ; Row 1
+        movd            mm0, [rax]              ; Copy four bytes to mm0
+        movd            mm1, [rbx]              ; Copy four bytes to mm1
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+        movd            mm1, [rbx]              ; Copy four bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 2
+        movd            mm0, [rax]              ; Copy four bytes to mm0
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+        movd            mm1, [rbx]              ; Copy four bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 3
+        movd            mm0, [rax]              ; Copy four bytes to mm0
+        punpcklbw       mm1, mm6
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+        movd            mm1, [rbx]              ; Copy four bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 4
+        movd            mm0, [rax]              ; Copy four bytes to mm0
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ pmaddwd mm0, mm0 ; square and accumulate
+ paddd mm7, mm0 ; accumulate in mm7
+
+ movq mm0, mm7 ;
+ psrlq mm7, 32
+
+ paddd mm0, mm7
+ movd rax, mm0
+
+
+ ; begin epilog
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%define mmx_filter_shift 7
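+; The bilinear tap pairs used with these routines sum to 128 (see
+; vp8_vp7_bilinear_filters_mmx in variance_mmx.c), so each filtered pixel is
+; effectively (a*F0 + b*F1 + 64) >> 7, where 64 = 1 << (mmx_filter_shift - 1)
+; is the rounding constant mmx_bi_rd defined at the end of this file.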
+
+;void vp8_filter_block2d_bil4x4_var_mmx
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned short *HFilter,
+; unsigned short *VFilter,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_filter_block2d_bil4x4_var_mmx)
+sym(vp8_filter_block2d_bil4x4_var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+
+ pxor mm6, mm6 ;
+ pxor mm7, mm7 ;
+
+ mov rax, arg(4) ;HFilter ;
+ mov rdx, arg(5) ;VFilter ;
+
+ mov rsi, arg(0) ;ref_ptr ;
+ mov rdi, arg(2) ;src_ptr ;
+
+ mov rcx, 4 ;
+ pxor mm0, mm0 ;
+
+ movd mm1, [rsi] ;
+ movd mm3, [rsi+1] ;
+
+ punpcklbw mm1, mm0 ;
+ pmullw mm1, [rax] ;
+
+ punpcklbw mm3, mm0 ;
+ pmullw mm3, [rax+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm1, [mmx_bi_rd GLOBAL] ;
+
+ psraw mm1, mmx_filter_shift ;
+ movq mm5, mm1
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rsi, r8
+%endif
+
+filter_block2d_bil4x4_var_mmx_loop:
+
+ movd mm1, [rsi] ;
+ movd mm3, [rsi+1] ;
+
+ punpcklbw mm1, mm0 ;
+ pmullw mm1, [rax] ;
+
+ punpcklbw mm3, mm0 ;
+ pmullw mm3, [rax+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm1, [mmx_bi_rd GLOBAL] ;
+
+ psraw mm1, mmx_filter_shift ;
+ movq mm3, mm5 ;
+
+ movq mm5, mm1 ;
+ pmullw mm3, [rdx] ;
+
+ pmullw mm1, [rdx+8] ;
+ paddw mm1, mm3 ;
+
+
+ paddw mm1, [mmx_bi_rd GLOBAL] ;
+ psraw mm1, mmx_filter_shift ;
+
+ movd mm3, [rdi] ;
+ punpcklbw mm3, mm0 ;
+
+ psubw mm1, mm3 ;
+ paddw mm6, mm1 ;
+
+ pmaddwd mm1, mm1 ;
+ paddd mm7, mm1 ;
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rdi, dword ptr arg(3) ;src_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+ add rsi, r8
+ add rdi, r9
+%endif
+ sub rcx, 1 ;
+ jnz filter_block2d_bil4x4_var_mmx_loop ;
+
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rdi, arg(6) ;sum
+ mov rsi, arg(7) ;sumsquared
+
+ movd dword ptr [rdi], mm2 ;
+ movd dword ptr [rsi], mm4 ;
+
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+
+;void vp8_filter_block2d_bil_var_mmx
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; unsigned short *HFilter,
+; unsigned short *VFilter,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_filter_block2d_bil_var_mmx)
+sym(vp8_filter_block2d_bil_var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+ pxor mm6, mm6 ;
+ pxor mm7, mm7 ;
+ mov rax, arg(5) ;HFilter ;
+
+ mov rdx, arg(6) ;VFilter ;
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+
+ pxor mm0, mm0 ;
+ movq mm1, [rsi] ;
+
+ movq mm3, [rsi+1] ;
+ movq mm2, mm1 ;
+
+ movq mm4, mm3 ;
+ punpcklbw mm1, mm0 ;
+
+ punpckhbw mm2, mm0 ;
+ pmullw mm1, [rax] ;
+
+ pmullw mm2, [rax] ;
+ punpcklbw mm3, mm0 ;
+
+ punpckhbw mm4, mm0 ;
+ pmullw mm3, [rax+8] ;
+
+ pmullw mm4, [rax+8] ;
+ paddw mm1, mm3 ;
+
+ paddw mm2, mm4 ;
+ paddw mm1, [mmx_bi_rd GLOBAL] ;
+
+ psraw mm1, mmx_filter_shift ;
+ paddw mm2, [mmx_bi_rd GLOBAL] ;
+
+ psraw mm2, mmx_filter_shift ;
+ movq mm5, mm1
+
+ packuswb mm5, mm2 ;
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ add rsi, r8
+%endif
+
+filter_block2d_bil_var_mmx_loop:
+
+ movq mm1, [rsi] ;
+ movq mm3, [rsi+1] ;
+
+ movq mm2, mm1 ;
+ movq mm4, mm3 ;
+
+ punpcklbw mm1, mm0 ;
+ punpckhbw mm2, mm0 ;
+
+ pmullw mm1, [rax] ;
+ pmullw mm2, [rax] ;
+
+ punpcklbw mm3, mm0 ;
+ punpckhbw mm4, mm0 ;
+
+ pmullw mm3, [rax+8] ;
+ pmullw mm4, [rax+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm2, mm4 ;
+
+ paddw mm1, [mmx_bi_rd GLOBAL] ;
+ psraw mm1, mmx_filter_shift ;
+
+ paddw mm2, [mmx_bi_rd GLOBAL] ;
+ psraw mm2, mmx_filter_shift ;
+
+ movq mm3, mm5 ;
+ movq mm4, mm5 ;
+
+ punpcklbw mm3, mm0 ;
+ punpckhbw mm4, mm0 ;
+
+ movq mm5, mm1 ;
+ packuswb mm5, mm2 ;
+
+ pmullw mm3, [rdx] ;
+ pmullw mm4, [rdx] ;
+
+ pmullw mm1, [rdx+8] ;
+ pmullw mm2, [rdx+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm2, mm4 ;
+
+ paddw mm1, [mmx_bi_rd GLOBAL] ;
+ paddw mm2, [mmx_bi_rd GLOBAL] ;
+
+ psraw mm1, mmx_filter_shift ;
+ psraw mm2, mmx_filter_shift ;
+
+ movq mm3, [rdi] ;
+ movq mm4, mm3 ;
+
+ punpcklbw mm3, mm0 ;
+ punpckhbw mm4, mm0 ;
+
+ psubw mm1, mm3 ;
+ psubw mm2, mm4 ;
+
+ paddw mm6, mm1 ;
+ pmaddwd mm1, mm1 ;
+
+ paddw mm6, mm2 ;
+ pmaddwd mm2, mm2 ;
+
+ paddd mm7, mm1 ;
+ paddd mm7, mm2 ;
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rdi, dword ptr arg(3) ;src_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
+ add rsi, r8
+ add rdi, r9
+%endif
+ sub rcx, 1 ;
+ jnz filter_block2d_bil_var_mmx_loop ;
+
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rdi, arg(7) ;sum
+ mov rsi, arg(8) ;sumsquared
+
+ movd dword ptr [rdi], mm2 ;
+ movd dword ptr [rsi], mm4 ;
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;unsigned int vp8_get16x16pred_error_mmx
+;(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride
+;)
+global sym(vp8_get16x16pred_error_mmx)
+sym(vp8_get16x16pred_error_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+ mov rsi, arg(0) ;DWORD PTR [src_ptr]
+ mov rdi, arg(2) ;DWORD PTR [ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[src_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[ref_stride]
+
+        pxor        mm0, mm0                    ; clear mm0 for unpack
+        pxor        mm7, mm7                    ; clear mm7 for accumulating diffs
+
+        pxor        mm6, mm6                    ; clear mm6 for accumulating sse
+ mov rcx, 16
+
+var16loop:
+
+ movq mm1, [rsi]
+ movq mm2, [rdi]
+
+ movq mm3, mm1
+ movq mm4, mm2
+
+ punpcklbw mm1, mm0
+ punpckhbw mm3, mm0
+
+ punpcklbw mm2, mm0
+ punpckhbw mm4, mm0
+
+ psubw mm1, mm2
+ psubw mm3, mm4
+
+ paddw mm7, mm1
+ pmaddwd mm1, mm1
+
+ paddw mm7, mm3
+ pmaddwd mm3, mm3
+
+ paddd mm6, mm1
+ paddd mm6, mm3
+
+
+ movq mm1, [rsi+8]
+ movq mm2, [rdi+8]
+
+ movq mm3, mm1
+ movq mm4, mm2
+
+ punpcklbw mm1, mm0
+ punpckhbw mm3, mm0
+
+ punpcklbw mm2, mm0
+ punpckhbw mm4, mm0
+
+ psubw mm1, mm2
+ psubw mm3, mm4
+
+ paddw mm7, mm1
+ pmaddwd mm1, mm1
+
+ paddw mm7, mm3
+ pmaddwd mm3, mm3
+
+ paddd mm6, mm1
+ paddd mm6, mm3
+
+ add rsi, rax
+ add rdi, rdx
+
+ sub rcx, 1
+ jnz var16loop
+
+
+ movq mm1, mm6
+ pxor mm6, mm6
+
+ pxor mm5, mm5
+ punpcklwd mm6, mm7
+
+ punpckhwd mm5, mm7
+ psrad mm5, 16
+
+ psrad mm6, 16
+ paddd mm6, mm5
+
+ movq mm2, mm1
+ psrlq mm1, 32
+
+ paddd mm2, mm1
+ movq mm7, mm6
+
+ psrlq mm6, 32
+ paddd mm6, mm7
+
+ movd DWORD PTR [rsp], mm6 ;Sum
+ movd DWORD PTR [rsp+4], mm2 ;SSE
+
+ ; return (SSE-((Sum*Sum)>>8));
+ movsxd rdx, dword ptr [rsp]
+ imul rdx, rdx
+ sar rdx, 8
+ movsxd rax, dword ptr [rsp + 4]
+ sub rax, rdx
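+    ; (the shift by 8 divides Sum*Sum by 256, the pixel count of the 16x16 block)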
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+SECTION_RODATA
+;short mmx_bi_rd[4] = { 64, 64, 64, 64};
+align 16
+mmx_bi_rd:
+ times 4 dw 64
diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm
new file mode 100644
index 000000000..7e5ee284b
--- /dev/null
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -0,0 +1,975 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define xmm_filter_shift 7
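+; As in the MMX version, the bilinear tap pairs sum to 128 and results are
+; rounded with xmm_bi_rd (64 = 1 << (xmm_filter_shift - 1)) before the shift by 7.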
+
+;unsigned int vp8_get_mb_ss_sse2
+;(
+; short *src_ptr
+;)
+global sym(vp8_get_mb_ss_sse2)
+sym(vp8_get_mb_ss_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 1
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+
+ mov rax, arg(0) ;[src_ptr]
+ mov rcx, 8
+ pxor xmm4, xmm4
+
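+    ; each iteration squares and accumulates 32 coefficients (64 bytes), so 8
+    ; iterations cover the 256 short coefficients of a macroblock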
+NEXTROW:
+ movdqa xmm0, [rax]
+ movdqa xmm1, [rax+16]
+ movdqa xmm2, [rax+32]
+ movdqa xmm3, [rax+48]
+ pmaddwd xmm0, xmm0
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ pmaddwd xmm3, xmm3
+
+ paddd xmm0, xmm1
+ paddd xmm2, xmm3
+ paddd xmm4, xmm0
+ paddd xmm4, xmm2
+
+ add rax, 0x40
+ dec rcx
+ ja NEXTROW
+
+ movdqa xmm3,xmm4
+ psrldq xmm4,8
+ paddd xmm4,xmm3
+ movdqa xmm3,xmm4
+ psrldq xmm4,4
+ paddd xmm4,xmm3
+ movd rax,xmm4
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_get16x16var_sse2
+;(
+; unsigned char * src_ptr,
+; int source_stride,
+; unsigned char * ref_ptr,
+; int recon_stride,
+; unsigned int * SSE,
+; int * Sum
+;)
+global sym(vp8_get16x16var_sse2)
+sym(vp8_get16x16var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[source_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
+ mov rcx, 16
+
+var16loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rdi]
+
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
+
+
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm3, xmm0
+
+ punpcklbw xmm2, xmm0
+ punpckhbw xmm4, xmm0
+
+
+ psubw xmm1, xmm2
+ psubw xmm3, xmm4
+
+ paddw xmm7, xmm1
+ pmaddwd xmm1, xmm1
+
+ paddw xmm7, xmm3
+ pmaddwd xmm3, xmm3
+
+ paddd xmm6, xmm1
+ paddd xmm6, xmm3
+
+ add rsi, rax
+ add rdi, rdx
+
+ sub rcx, 1
+ jnz var16loop
+
+
+ movdqa xmm1, xmm6
+ pxor xmm6, xmm6
+
+ pxor xmm5, xmm5
+ punpcklwd xmm6, xmm7
+
+ punpckhwd xmm5, xmm7
+ psrad xmm5, 16
+
+ psrad xmm6, 16
+ paddd xmm6, xmm5
+
+ movdqa xmm2, xmm1
+ punpckldq xmm1, xmm0
+
+ punpckhdq xmm2, xmm0
+ movdqa xmm7, xmm6
+
+ paddd xmm1, xmm2
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm7, xmm0
+ paddd xmm6, xmm7
+
+ movdqa xmm2, xmm1
+ movdqa xmm7, xmm6
+
+ psrldq xmm1, 8
+ psrldq xmm6, 8
+
+ paddd xmm7, xmm6
+ paddd xmm1, xmm2
+
+ mov rax, arg(5) ;[Sum]
+ mov rdi, arg(4) ;[SSE]
+
+ movd DWORD PTR [rax], xmm7
+ movd DWORD PTR [rdi], xmm1
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_get16x16pred_error_sse2
+;(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride
+;)
+global sym(vp8_get16x16pred_error_sse2)
+sym(vp8_get16x16pred_error_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[src_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[ref_stride]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
+ mov rcx, 16
+
+var16peloop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rdi]
+
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
+
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm3, xmm0
+
+ punpcklbw xmm2, xmm0
+ punpckhbw xmm4, xmm0
+
+ psubw xmm1, xmm2
+ psubw xmm3, xmm4
+
+ paddw xmm7, xmm1
+ pmaddwd xmm1, xmm1
+
+ paddw xmm7, xmm3
+ pmaddwd xmm3, xmm3
+
+ paddd xmm6, xmm1
+ paddd xmm6, xmm3
+
+ add rsi, rax
+ add rdi, rdx
+
+ sub rcx, 1
+ jnz var16peloop
+
+
+ movdqa xmm1, xmm6
+ pxor xmm6, xmm6
+
+ pxor xmm5, xmm5
+ punpcklwd xmm6, xmm7
+
+ punpckhwd xmm5, xmm7
+ psrad xmm5, 16
+
+ psrad xmm6, 16
+ paddd xmm6, xmm5
+
+ movdqa xmm2, xmm1
+ punpckldq xmm1, xmm0
+
+ punpckhdq xmm2, xmm0
+ movdqa xmm7, xmm6
+
+ paddd xmm1, xmm2
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm7, xmm0
+ paddd xmm6, xmm7
+
+ movdqa xmm2, xmm1
+ movdqa xmm7, xmm6
+
+ psrldq xmm1, 8
+ psrldq xmm6, 8
+
+ paddd xmm7, xmm6
+ paddd xmm1, xmm2
+
+ movd DWORD PTR [rsp], xmm7 ;Sum
+ movd DWORD PTR [rsp+4], xmm1 ;SSE
+
+ ; return (SSE-((Sum*Sum)>>8));
+ movsxd rdx, dword ptr [rsp]
+ imul rdx, rdx
+ sar rdx, 8
+ movsxd rax, dword ptr [rsp + 4]
+ sub rax, rdx
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+;unsigned int vp8_get8x8var_sse2
+;(
+; unsigned char * src_ptr,
+; int source_stride,
+; unsigned char * ref_ptr,
+; int recon_stride,
+; unsigned int * SSE,
+; int * Sum
+;)
+global sym(vp8_get8x8var_sse2)
+sym(vp8_get8x8var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[source_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ movq xmm1, QWORD PTR [rsi]
+ movq xmm2, QWORD PTR [rdi]
+
+ punpcklbw xmm1, xmm0
+ punpcklbw xmm2, xmm0
+
+ psubsw xmm1, xmm2
+ paddw xmm7, xmm1
+
+ pmaddwd xmm1, xmm1
+
+ movq xmm2, QWORD PTR[rsi + rax]
+ movq xmm3, QWORD PTR[rdi + rdx]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ movq xmm2, QWORD PTR[rsi + rax * 2]
+ movq xmm3, QWORD PTR[rdi + rdx * 2]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ lea rsi, [rsi + rax * 2]
+ lea rdi, [rdi + rdx * 2]
+ movq xmm2, QWORD PTR[rsi + rax]
+ movq xmm3, QWORD PTR[rdi + rdx]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+ movq xmm2, QWORD PTR[rsi + rax *2]
+ movq xmm3, QWORD PTR[rdi + rdx *2]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ lea rsi, [rsi + rax * 2]
+ lea rdi, [rdi + rdx * 2]
+
+
+ movq xmm2, QWORD PTR[rsi + rax]
+ movq xmm3, QWORD PTR[rdi + rdx]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+ movq xmm2, QWORD PTR[rsi + rax *2]
+ movq xmm3, QWORD PTR[rdi + rdx *2]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ lea rsi, [rsi + rax * 2]
+ lea rdi, [rdi + rdx * 2]
+
+ movq xmm2, QWORD PTR[rsi + rax]
+ movq xmm3, QWORD PTR[rdi + rdx]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ movdqa xmm6, xmm7
+ punpcklwd xmm6, xmm0
+
+ punpckhwd xmm7, xmm0
+ movdqa xmm2, xmm1
+
+ paddw xmm6, xmm7
+ punpckldq xmm1, xmm0
+
+ punpckhdq xmm2, xmm0
+ movdqa xmm7, xmm6
+
+ paddd xmm1, xmm2
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm7, xmm0
+ paddw xmm6, xmm7
+
+ movdqa xmm2, xmm1
+ movdqa xmm7, xmm6
+
+ psrldq xmm1, 8
+ psrldq xmm6, 8
+
+ paddw xmm7, xmm6
+ paddd xmm1, xmm2
+
+ mov rax, arg(5) ;[Sum]
+ mov rdi, arg(4) ;[SSE]
+
+ movd rdx, xmm7
+ movsx rcx, dx
+
+ mov dword ptr [rax], ecx
+ movd DWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_filter_block2d_bil_var_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; unsigned short *HFilter,
+; unsigned short *VFilter,
+; int *sum,
+;    unsigned int *sumsquared
+;)
+global sym(vp8_filter_block2d_bil_var_sse2)
+sym(vp8_filter_block2d_bil_var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+ pxor xmm6, xmm6 ;
+ pxor xmm7, xmm7 ;
+ mov rax, arg(5) ;HFilter ;
+
+ mov rdx, arg(6) ;VFilter ;
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+
+ pxor xmm0, xmm0 ;
+ movq xmm1, QWORD PTR [rsi] ;
+
+ movq xmm3, QWORD PTR [rsi+1] ;
+ punpcklbw xmm1, xmm0 ;
+
+ pmullw xmm1, [rax] ;
+        punpcklbw   xmm3, xmm0                  ;
+        pmullw      xmm3, [rax+16]              ;
+ paddw xmm1, xmm3 ;
+
+ paddw xmm1, [xmm_bi_rd GLOBAL] ;
+ psraw xmm1, xmm_filter_shift ;
+
+ movdqa xmm5, xmm1
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rsi, r8
+%endif
+filter_block2d_bil_var_sse2_loop:
+
+ movq xmm1, QWORD PTR [rsi] ;
+ movq xmm3, QWORD PTR [rsi+1] ;
+
+ punpcklbw xmm1, xmm0 ;
+ pmullw xmm1, [rax] ;
+
+ punpcklbw xmm3, xmm0 ;
+ pmullw xmm3, [rax+16] ;
+
+ paddw xmm1, xmm3 ;
+ paddw xmm1, [xmm_bi_rd GLOBAL] ;
+
+ psraw xmm1, xmm_filter_shift ;
+ movdqa xmm3, xmm5 ;
+
+ movdqa xmm5, xmm1 ;
+ pmullw xmm3, [rdx] ;
+
+ pmullw xmm1, [rdx+16] ;
+ paddw xmm1, xmm3 ;
+
+ paddw xmm1, [xmm_bi_rd GLOBAL] ;
+ psraw xmm1, xmm_filter_shift ;
+
+ movq xmm3, QWORD PTR [rdi] ;
+ punpcklbw xmm3, xmm0 ;
+
+ psubw xmm1, xmm3 ;
+ paddw xmm6, xmm1 ;
+
+ pmaddwd xmm1, xmm1 ;
+ paddd xmm7, xmm1 ;
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rdi, dword ptr arg(3) ;src_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
+ add rsi, r8
+ add rdi, r9
+%endif
+
+ sub rcx, 1 ;
+ jnz filter_block2d_bil_var_sse2_loop ;
+
+
+ movdq2q mm6, xmm6 ;
+ movdq2q mm7, xmm7 ;
+
+ psrldq xmm6, 8
+ psrldq xmm7, 8
+
+ movdq2q mm2, xmm6
+ movdq2q mm3, xmm7
+
+ paddw mm6, mm2
+ paddd mm7, mm3
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rsi, arg(7) ; sum
+ mov rdi, arg(8) ; sumsquared
+
+ movd [rsi], mm2 ; xsum
+ movd [rdi], mm4 ; xxsum
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_half_horiz_vert_variance16x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_horiz_vert_variance16x_h_sse2)
+sym(vp8_half_horiz_vert_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+ pxor xmm6, xmm6 ; error accumulator
+        pxor            xmm7, xmm7                  ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+
+ pxor xmm0, xmm0 ;
+
+ movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
+ movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9
+        pavgb           xmm5, xmm3              ; xmm5 = avg(xmm5,xmm3) horizontal line 1
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ; next source
+%else
+ add rsi, r8
+%endif
+
+vp8_half_horiz_vert_variance16x_h_1:
+
+ movq xmm1, QWORD PTR [rsi] ;
+ movq xmm2, QWORD PTR [rsi+1] ;
+        pavgb           xmm1, xmm2              ; xmm1 = avg(xmm1,xmm2) horizontal line i+1
+
+        pavgb           xmm5, xmm1              ; xmm5 = vertical average of the above
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+
+ movdqa xmm5, xmm1 ; save xmm1 for use on the next row
+
+%if ABI_IS_32BIT
+ add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
+ add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
+%else
+ add rsi, r8
+ add rdi, r9
+%endif
+
+ sub rcx, 1 ;
+ jnz vp8_half_horiz_vert_variance16x_h_1 ;
+
+ movdq2q mm6, xmm6 ;
+ movdq2q mm7, xmm7 ;
+
+ psrldq xmm6, 8
+ psrldq xmm7, 8
+
+ movdq2q mm2, xmm6
+ movdq2q mm3, xmm7
+
+ paddw mm6, mm2
+ paddd mm7, mm3
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rsi, arg(5) ; sum
+ mov rdi, arg(6) ; sumsquared
+
+ movd [rsi], mm2 ;
+ movd [rdi], mm4 ;
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_half_vert_variance16x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_vert_variance16x_h_sse2)
+sym(vp8_half_vert_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+ pxor xmm6, xmm6 ; error accumulator
+        pxor            xmm7, xmm7                  ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+
+ pxor xmm0, xmm0 ;
+vp8_half_vert_variance16x_h_1:
+        movq            xmm5, QWORD PTR [rsi]       ; xmm5 = s0,s1,s2..s8 of the current row
+        movq            xmm3, QWORD PTR [rsi+rax]   ; xmm3 = s0,s1,s2..s8 of the row below
+
+        pavgb           xmm5, xmm3              ; xmm5 = avg(xmm5,xmm3)
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+
+%if ABI_IS_32BIT
+ add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
+ add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
+%else
+ add rsi, r8
+ add rdi, r9
+%endif
+
+ sub rcx, 1 ;
+ jnz vp8_half_vert_variance16x_h_1 ;
+
+ movdq2q mm6, xmm6 ;
+ movdq2q mm7, xmm7 ;
+
+ psrldq xmm6, 8
+ psrldq xmm7, 8
+
+ movdq2q mm2, xmm6
+ movdq2q mm3, xmm7
+
+ paddw mm6, mm2
+ paddd mm7, mm3
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rsi, arg(5) ; sum
+ mov rdi, arg(6) ; sumsquared
+
+ movd [rsi], mm2 ;
+ movd [rdi], mm4 ;
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_half_horiz_variance16x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_horiz_variance16x_h_sse2)
+sym(vp8_half_horiz_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+ pxor xmm6, xmm6 ; error accumulator
+        pxor            xmm7, xmm7                  ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+
+ pxor xmm0, xmm0 ;
+vp8_half_horiz_variance16x16_1:
+ movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
+ movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9
+
+        pavgb           xmm5, xmm3              ; xmm5 = avg(xmm5,xmm3)
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+
+%if ABI_IS_32BIT
+ add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
+ add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
+%else
+ add rsi, r8
+ add rdi, r9
+%endif
+ sub rcx, 1 ;
+ jnz vp8_half_horiz_variance16x16_1 ;
+
+ movdq2q mm6, xmm6 ;
+ movdq2q mm7, xmm7 ;
+
+ psrldq xmm6, 8
+ psrldq xmm7, 8
+
+ movdq2q mm2, xmm6
+ movdq2q mm3, xmm7
+
+ paddw mm6, mm2
+ paddd mm7, mm3
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rsi, arg(5) ; sum
+ mov rdi, arg(6) ; sumsquared
+
+ movd [rsi], mm2 ;
+ movd [rdi], mm4 ;
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+; short xmm_bi_rd[8] = { 64, 64, 64, 64, 64, 64, 64, 64 };
+align 16
+xmm_bi_rd:
+ times 8 dw 64
diff --git a/vp8/encoder/x86/variance_mmx.c b/vp8/encoder/x86/variance_mmx.c
new file mode 100644
index 000000000..4a5b25b0d
--- /dev/null
+++ b/vp8/encoder/x86/variance_mmx.c
@@ -0,0 +1,596 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "variance.h"
+#include "pragmas.h"
+#include "vpx_ports/mem.h"
+
+extern void filter_block1d_h6_mmx
+(
+ unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ short *vp7_filter
+);
+extern void filter_block1d_v6_mmx
+(
+ short *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ short *vp7_filter
+);
+
+extern unsigned int vp8_get_mb_ss_mmx(short *src_ptr);
+extern unsigned int vp8_get8x8var_mmx
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+extern unsigned int vp8_get4x4var_mmx
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+extern unsigned int vp8_get4x4sse_cs_mmx
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride
+);
+extern void vp8_filter_block2d_bil4x4_var_mmx
+(
+ unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ const short *HFilter,
+ const short *VFilter,
+ int *sum,
+ unsigned int *sumsquared
+);
+extern void vp8_filter_block2d_bil_var_mmx
+(
+ unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ const short *HFilter,
+ const short *VFilter,
+ int *sum,
+ unsigned int *sumsquared
+);
+extern unsigned int vp8_get16x16pred_error_mmx
+(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride
+);
+
+
+void vp8_test_get_mb_ss(void)
+{
+ short zz[] =
+ {
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -2, -2, -2, -2, 2, 2, 2, 2, -2, -2, -2, -2, 2, 2, 2, 2,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -3, -3, -3, -3, 3, 3, 3, 3, -3, -3, -3, -3, 3, 3, 3, 3,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+ };
+ int s = 0, x = vp8_get_mb_ss_mmx(zz);
+ {
+ int y;
+
+ for (y = 0; y < 256; y++)
+ s += (zz[y] * zz[y]);
+ }
+
+ x += 0;
+}
+
+
+unsigned int vp8_get16x16var_mmx(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned *SSE,
+ unsigned *SUM
+)
+{
+ unsigned int sse0, sse1, sse2, sse3, var;
+ int sum0, sum1, sum2, sum3, avg;
+
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+ vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
+ vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+
+ var = sse0 + sse1 + sse2 + sse3;
+ avg = sum0 + sum1 + sum2 + sum3;
+
+ *SSE = var;
+ *SUM = avg;
+ return (var - ((avg * avg) >> 8));
+
+}
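+
+/* The (avg * avg) >> N term above is Sum*Sum / pixel_count: N is 8 for a
+ * 16x16 block (256 pixels), 7 for 16x8 and 8x16 (128 pixels), 6 for 8x8
+ * (64 pixels) and 4 for 4x4 (16 pixels) in the helpers below.
+ */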
+
+
+
+
+
+unsigned int vp8_variance4x4_mmx(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int var;
+ int avg;
+
+ vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
+ *sse = var;
+ return (var - ((avg * avg) >> 4));
+
+}
+
+unsigned int vp8_variance8x8_mmx(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int var;
+ int avg;
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
+ *sse = var;
+
+ return (var - ((avg * avg) >> 6));
+
+}
+
+unsigned int vp8_mse16x16_mmx(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, sse2, sse3, var;
+ int sum0, sum1, sum2, sum3;
+
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+ vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
+ vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+
+ var = sse0 + sse1 + sse2 + sse3;
+ *sse = var;
+ return var;
+}
+
+
+unsigned int vp8_variance16x16_mmx(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ int *sse)
+{
+ unsigned int sse0, sse1, sse2, sse3, var;
+ int sum0, sum1, sum2, sum3, avg;
+
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+ vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
+ vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+
+ var = sse0 + sse1 + sse2 + sse3;
+ avg = sum0 + sum1 + sum2 + sum3;
+ *sse = var;
+ return (var - ((avg * avg) >> 8));
+}
+
+unsigned int vp8_variance16x8_mmx(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+ *sse = var;
+ return (var - ((avg * avg) >> 7));
+
+}
+
+
+unsigned int vp8_variance8x16_mmx(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+ *sse = var;
+
+ return (var - ((avg * avg) >> 7));
+
+}
+
+
+
+
+///////////////////////////////////////////////////////////////////////////
+// the mmx function that does the bilinear filtering and var calculation //
+// in one pass                                                           //
+///////////////////////////////////////////////////////////////////////////
+DECLARE_ALIGNED(16, const short, vp8_vp7_bilinear_filters_mmx[8][8]) =
+{
+ { 128, 128, 128, 128, 0, 0, 0, 0 },
+ { 112, 112, 112, 112, 16, 16, 16, 16 },
+ { 96, 96, 96, 96, 32, 32, 32, 32 },
+ { 80, 80, 80, 80, 48, 48, 48, 48 },
+ { 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 48, 48, 48, 48, 80, 80, 80, 80 },
+ { 32, 32, 32, 32, 96, 96, 96, 96 },
+ { 16, 16, 16, 16, 112, 112, 112, 112 }
+};
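+
+/* Illustrative sketch, not part of this change: each row stores the first
+ * bilinear tap in elements 0-3 and the second tap in elements 4-7, and the
+ * two taps always sum to 128.  A scalar model of what the assembly computes
+ * per pixel (a and b being the two neighbouring source samples) would be:
+ *
+ *     filtered = (a * filter[0] + b * filter[4] + 64) >> 7;
+ *
+ * xoffset selects the horizontal filter row and yoffset the vertical one.
+ */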
+
+unsigned int vp8_sub_pixel_variance4x4_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse)
+
+{
+ int xsum;
+ unsigned int xxsum;
+ vp8_filter_block2d_bil4x4_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum, &xxsum
+ );
+ *sse = xxsum;
+ return (xxsum - ((xsum * xsum) >> 4));
+}
+
+
+unsigned int vp8_sub_pixel_variance8x8_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+
+ int xsum;
+ unsigned int xxsum;
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum, &xxsum
+ );
+ *sse = xxsum;
+ return (xxsum - ((xsum * xsum) >> 6));
+}
+
+unsigned int vp8_sub_pixel_variance16x16_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum0, &xxsum0
+ );
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum1, &xxsum1
+ );
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 8));
+
+
+}
+
+unsigned int vp8_sub_pixel_mse16x16_mmx(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ vp8_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+ return *sse;
+}
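+
+/* MSE is the raw sum of squared errors, so the sub-pixel MSE above simply
+ * reuses the sub-pixel variance helper and returns the SSE it stored in *sse,
+ * ignoring the mean-corrected return value.
+ */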
+
+unsigned int vp8_sub_pixel_variance16x8_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum0, &xxsum0
+ );
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 8,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum1, &xxsum1
+ );
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 7));
+}
+
+unsigned int vp8_sub_pixel_variance8x16_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ int *sse
+)
+{
+ int xsum;
+ unsigned int xxsum;
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum, &xxsum
+ );
+ *sse = xxsum;
+ return (xxsum - ((xsum * xsum) >> 7));
+}
+
+unsigned int vp8_i_variance16x16_mmx(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, sse2, sse3, var;
+ int sum0, sum1, sum2, sum3, avg;
+
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+ vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2) ;
+ vp8_get8x8var_mmx(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3);
+
+ var = sse0 + sse1 + sse2 + sse3;
+ avg = sum0 + sum1 + sum2 + sum3;
+ *sse = var;
+ return (var - ((avg * avg) >> 8));
+
+}
+
+unsigned int vp8_i_variance8x16_mmx(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1) ;
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+
+ *sse = var;
+ return (var - ((avg * avg) >> 7));
+
+}
+
+unsigned int vp8_i_sub_pixel_variance16x16_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+ int f2soffset = (src_pixels_per_line >> 1);
+ int f2doffset = (dst_pixels_per_line >> 1);
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum0, &xxsum0
+ );
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 8,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum1, &xxsum1
+ );
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr + f2soffset, src_pixels_per_line,
+ dst_ptr + f2doffset, dst_pixels_per_line, 8,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum1, &xxsum1
+ );
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr + f2soffset + 8, src_pixels_per_line,
+ dst_ptr + f2doffset + 8, dst_pixels_per_line, 8,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum1, &xxsum1
+ );
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
+
+
+unsigned int vp8_i_sub_pixel_variance8x16_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+ int f2soffset = (src_pixels_per_line >> 1);
+ int f2doffset = (dst_pixels_per_line >> 1);
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum0, &xxsum0
+ );
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr + f2soffset, src_pixels_per_line,
+ dst_ptr + f2doffset, dst_pixels_per_line, 8,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum1, &xxsum1
+ );
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 7));
+}
diff --git a/vp8/encoder/x86/variance_sse2.c b/vp8/encoder/x86/variance_sse2.c
new file mode 100644
index 000000000..ea80753bd
--- /dev/null
+++ b/vp8/encoder/x86/variance_sse2.c
@@ -0,0 +1,514 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "variance.h"
+#include "pragmas.h"
+#include "vpx_ports/mem.h"
+
+extern void filter_block1d_h6_mmx(unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d_v6_mmx(short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d8_h6_sse2(unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d8_v6_sse2(short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+
+extern void vp8_filter_block2d_bil4x4_var_mmx
+(
+ unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ const short *HFilter,
+ const short *VFilter,
+ int *sum,
+ unsigned int *sumsquared
+);
+
+extern unsigned int vp8_get4x4var_mmx
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+
+unsigned int vp8_get_mb_ss_sse2
+(
+ short *src_ptr
+);
+unsigned int vp8_get16x16var_sse2
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+unsigned int vp8_get16x16pred_error_sse2
+(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride
+);
+unsigned int vp8_get8x8var_sse2
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+void vp8_filter_block2d_bil_var_sse2
+(
+ unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ const short *HFilter,
+ const short *VFilter,
+ int *sum,
+ unsigned int *sumsquared
+);
+void vp8_half_horiz_vert_variance16x_h_sse2
+(
+ unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+void vp8_half_horiz_variance16x_h_sse2
+(
+ unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+void vp8_half_vert_variance16x_h_sse2
+(
+ unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+
+DECLARE_ALIGNED(16, extern short, vp8_vp7_bilinear_filters_mmx[8][8]);
+
+unsigned int vp8_variance4x4_wmt(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride)
+{
+ unsigned int var;
+ int avg;
+
+ vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
+ return (var - ((avg * avg) >> 4));
+
+}
+
+
+
+unsigned int vp8_variance8x8_wmt
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride)
+{
+ unsigned int var;
+ int avg;
+
+ vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
+
+ return (var - ((avg * avg) >> 6));
+
+}
+
+
+unsigned int vp8_variance16x16_wmt
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0;
+ int sum0;
+
+
+ vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ *sse = sse0;
+ return (sse0 - ((sum0 * sum0) >> 8));
+}
+unsigned int vp8_mse16x16_wmt(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+
+ unsigned int sse0;
+ int sum0;
+ vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ *sse = sse0;
+ return sse0;
+
+}
+
+
+unsigned int vp8_variance16x8_wmt
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+
+ vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+ *sse = var;
+ return (var - ((avg * avg) >> 7));
+
+}
+
+unsigned int vp8_variance8x16_wmt
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+
+ vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+ *sse = var;
+ return (var - ((avg * avg) >> 7));
+
+}
+
+///////////////////////////////////////////////////////////////////////////
+// the sse2 functions that do the bilinear filtering and var calculation  //
+// in one pass                                                            //
+///////////////////////////////////////////////////////////////////////////
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_xmm[8][16]) =
+{
+ { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 },
+ { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
+ { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 },
+ { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 },
+ { 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 }
+};
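+/* Same coefficients as vp8_vp7_bilinear_filters_mmx, but each tap is repeated
+ * eight times so that one row fills the two full XMM loads ([x] and [x+16])
+ * used by vp8_filter_block2d_bil_var_sse2.
+ */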
+unsigned int vp8_sub_pixel_variance4x4_wmt
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum;
+ unsigned int xxsum;
+ vp8_filter_block2d_bil4x4_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum, &xxsum
+ );
+ *sse = xxsum;
+ return (xxsum - ((xsum * xsum) >> 4));
+}
+
+
+unsigned int vp8_sub_pixel_variance8x8_wmt
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+
+ int xsum;
+ unsigned int xxsum;
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+ &xsum, &xxsum
+ );
+
+ *sse = xxsum;
+ return (xxsum - ((xsum * xsum) >> 6));
+}
+
+unsigned int vp8_sub_pixel_variance16x16_wmt
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+
+ // note we could avoid these if statements if the calling function
+ // just called the appropriate functions inside.
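+    // xoffset/yoffset == 4 selects the {64, 64} half-pixel filter, which pavgb
+    // computes exactly, hence the cheaper vp8_half_* helpers for those cases.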
+ if (xoffset == 4 && yoffset == 0)
+ {
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ &xsum1, &xxsum1);
+ }
+ else if (xoffset == 0 && yoffset == 4)
+ {
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ &xsum1, &xxsum1);
+ }
+ else if (xoffset == 4 && yoffset == 4)
+ {
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ &xsum1, &xxsum1);
+ }
+ else
+ {
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+ &xsum0, &xxsum0
+ );
+
+
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+ &xsum1, &xxsum1
+ );
+ }
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
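+
+/* An offset of 4 selects the { 64, 64 } filter row, i.e. an exact half-pixel
+ * average, so those cases are routed to the dedicated horizontal / vertical /
+ * diagonal averaging kernels; every other offset pair takes the general
+ * two-tap filter path.
+ */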
+
+unsigned int vp8_sub_pixel_mse16x16_wmt(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+ return *sse;
+}
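+
+/* The sub-pixel MSE reuses the 16x16 sub-pixel variance routine for its SSE
+ * side effect and discards the mean-removed return value.
+ */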
+
+unsigned int vp8_sub_pixel_variance16x8_wmt
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+
+)
+{
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+ &xsum0, &xxsum0
+ );
+
+
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 8,
+ vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+ &xsum1, &xxsum1
+ );
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 7));
+}
+
+unsigned int vp8_sub_pixel_variance8x16_wmt
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum;
+ unsigned int xxsum;
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+ &xsum, &xxsum
+ );
+
+ *sse = xxsum;
+ return (xxsum - ((xsum * xsum) >> 7));
+}
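+
+/* The vp8_i_* variants below appear to address a half-stride (field style)
+ * layout: judging from the offsets, the second 8x8 block is taken half a
+ * stride into the row rather than eight rows down, and the sub-pixel
+ * wrappers forward a halved stride. This reading is inferred from the code
+ * here, not from a separate spec.
+ */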
+
+unsigned int vp8_i_variance16x16_wmt(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, sse2, sse3, var;
+ int sum0, sum1, sum2, sum3, avg;
+
+
+ vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
+ vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+ vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2);
+ vp8_get8x8var_sse2(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3);
+
+ var = sse0 + sse1 + sse2 + sse3;
+ avg = sum0 + sum1 + sum2 + sum3;
+
+ *sse = var;
+ return (var - ((avg * avg) >> 8));
+
+}
+
+unsigned int vp8_i_variance8x16_wmt(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+ vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
+ vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1);
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+
+ *sse = var;
+ return (var - ((avg * avg) >> 7));
+
+}
+
+
+unsigned int vp8_i_sub_pixel_variance16x16_wmt
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ return vp8_sub_pixel_variance16x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse);
+}
+
+
+unsigned int vp8_i_sub_pixel_variance8x16_wmt
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+
+ return vp8_sub_pixel_variance8x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse);
+}
diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h
new file mode 100644
index 000000000..35fc90c48
--- /dev/null
+++ b/vp8/encoder/x86/variance_x86.h
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#ifndef VARIANCE_X86_H
+#define VARIANCE_X86_H
+
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code (x86_csystemdependent.c).
+ */
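+
+/* Illustrative sketch only: with CONFIG_RUNTIME_CPU_DETECT disabled and
+ * HAVE_SSE2 set, a call written against the generic dispatch macro from
+ * variance.h (assumed here to be VARIANCE_INVOKE), e.g.
+ *
+ *   unsigned int sse;
+ *   unsigned int var = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)
+ *                          (src, src_stride, ref, ref_stride, &sse);
+ *
+ * resolves at compile time to vp8_variance16x16_wmt() through the remappings
+ * below. With runtime detection enabled the same call goes through the
+ * function pointers filled in by vp8_arch_x86_encoder_init().
+ */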
+#if HAVE_MMX
+extern prototype_sad(vp8_sad4x4_mmx);
+extern prototype_sad(vp8_sad8x8_mmx);
+extern prototype_sad(vp8_sad8x16_mmx);
+extern prototype_sad(vp8_sad16x8_mmx);
+extern prototype_sad(vp8_sad16x16_mmx);
+extern prototype_variance(vp8_variance4x4_mmx);
+extern prototype_variance(vp8_variance8x8_mmx);
+extern prototype_variance(vp8_variance8x16_mmx);
+extern prototype_variance(vp8_variance16x8_mmx);
+extern prototype_variance(vp8_variance16x16_mmx);
+extern prototype_subpixvariance(vp8_sub_pixel_variance4x4_mmx);
+extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_mmx);
+extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_mmx);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_mmx);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_mmx);
+extern prototype_subpixvariance(vp8_sub_pixel_mse16x16_mmx);
+extern prototype_getmbss(vp8_get_mb_ss_mmx);
+extern prototype_variance(vp8_mse16x16_mmx);
+extern prototype_sad(vp8_get16x16pred_error_mmx);
+extern prototype_variance2(vp8_get8x8var_mmx);
+extern prototype_variance2(vp8_get16x16var_mmx);
+extern prototype_sad(vp8_get4x4sse_cs_mmx);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_variance_sad4x4
+#define vp8_variance_sad4x4 vp8_sad4x4_mmx
+
+#undef vp8_variance_sad8x8
+#define vp8_variance_sad8x8 vp8_sad8x8_mmx
+
+#undef vp8_variance_sad8x16
+#define vp8_variance_sad8x16 vp8_sad8x16_mmx
+
+#undef vp8_variance_sad16x8
+#define vp8_variance_sad16x8 vp8_sad16x8_mmx
+
+#undef vp8_variance_sad16x16
+#define vp8_variance_sad16x16 vp8_sad16x16_mmx
+
+#undef vp8_variance_var4x4
+#define vp8_variance_var4x4 vp8_variance4x4_mmx
+
+#undef vp8_variance_var8x8
+#define vp8_variance_var8x8 vp8_variance8x8_mmx
+
+#undef vp8_variance_var8x16
+#define vp8_variance_var8x16 vp8_variance8x16_mmx
+
+#undef vp8_variance_var16x8
+#define vp8_variance_var16x8 vp8_variance16x8_mmx
+
+#undef vp8_variance_var16x16
+#define vp8_variance_var16x16 vp8_variance16x16_mmx
+
+#undef vp8_variance_subpixvar4x4
+#define vp8_variance_subpixvar4x4 vp8_sub_pixel_variance4x4_mmx
+
+#undef vp8_variance_subpixvar8x8
+#define vp8_variance_subpixvar8x8 vp8_sub_pixel_variance8x8_mmx
+
+#undef vp8_variance_subpixvar8x16
+#define vp8_variance_subpixvar8x16 vp8_sub_pixel_variance8x16_mmx
+
+#undef vp8_variance_subpixvar16x8
+#define vp8_variance_subpixvar16x8 vp8_sub_pixel_variance16x8_mmx
+
+#undef vp8_variance_subpixvar16x16
+#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_mmx
+
+#undef vp8_variance_subpixmse16x16
+#define vp8_variance_subpixmse16x16 vp8_sub_pixel_mse16x16_mmx
+
+#undef vp8_variance_getmbss
+#define vp8_variance_getmbss vp8_get_mb_ss_mmx
+
+#undef vp8_variance_mse16x16
+#define vp8_variance_mse16x16 vp8_mse16x16_mmx
+
+#undef vp8_variance_get16x16prederror
+#define vp8_variance_get16x16prederror vp8_get16x16pred_error_mmx
+
+#undef vp8_variance_get8x8var
+#define vp8_variance_get8x8var vp8_get8x8var_mmx
+
+#undef vp8_variance_get16x16var
+#define vp8_variance_get16x16var vp8_get16x16var_mmx
+
+#undef vp8_variance_get4x4sse_cs
+#define vp8_variance_get4x4sse_cs vp8_get4x4sse_cs_mmx
+
+#endif
+#endif
+
+
+#if HAVE_SSE2
+extern prototype_sad(vp8_sad4x4_wmt);
+extern prototype_sad(vp8_sad8x8_wmt);
+extern prototype_sad(vp8_sad8x16_wmt);
+extern prototype_sad(vp8_sad16x8_wmt);
+extern prototype_sad(vp8_sad16x16_wmt);
+extern prototype_variance(vp8_variance4x4_wmt);
+extern prototype_variance(vp8_variance8x8_wmt);
+extern prototype_variance(vp8_variance8x16_wmt);
+extern prototype_variance(vp8_variance16x8_wmt);
+extern prototype_variance(vp8_variance16x16_wmt);
+extern prototype_subpixvariance(vp8_sub_pixel_variance4x4_wmt);
+extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_wmt);
+extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_wmt);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_wmt);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_wmt);
+extern prototype_subpixvariance(vp8_sub_pixel_mse16x16_wmt);
+extern prototype_getmbss(vp8_get_mb_ss_sse2);
+extern prototype_variance(vp8_mse16x16_wmt);
+extern prototype_sad(vp8_get16x16pred_error_sse2);
+extern prototype_variance2(vp8_get8x8var_sse2);
+extern prototype_variance2(vp8_get16x16var_sse2);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_variance_sad4x4
+#define vp8_variance_sad4x4 vp8_sad4x4_wmt
+
+#undef vp8_variance_sad8x8
+#define vp8_variance_sad8x8 vp8_sad8x8_wmt
+
+#undef vp8_variance_sad8x16
+#define vp8_variance_sad8x16 vp8_sad8x16_wmt
+
+#undef vp8_variance_sad16x8
+#define vp8_variance_sad16x8 vp8_sad16x8_wmt
+
+#undef vp8_variance_sad16x16
+#define vp8_variance_sad16x16 vp8_sad16x16_wmt
+
+#undef vp8_variance_var4x4
+#define vp8_variance_var4x4 vp8_variance4x4_wmt
+
+#undef vp8_variance_var8x8
+#define vp8_variance_var8x8 vp8_variance8x8_wmt
+
+#undef vp8_variance_var8x16
+#define vp8_variance_var8x16 vp8_variance8x16_wmt
+
+#undef vp8_variance_var16x8
+#define vp8_variance_var16x8 vp8_variance16x8_wmt
+
+#undef vp8_variance_var16x16
+#define vp8_variance_var16x16 vp8_variance16x16_wmt
+
+#undef vp8_variance_subpixvar4x4
+#define vp8_variance_subpixvar4x4 vp8_sub_pixel_variance4x4_wmt
+
+#undef vp8_variance_subpixvar8x8
+#define vp8_variance_subpixvar8x8 vp8_sub_pixel_variance8x8_wmt
+
+#undef vp8_variance_subpixvar8x16
+#define vp8_variance_subpixvar8x16 vp8_sub_pixel_variance8x16_wmt
+
+#undef vp8_variance_subpixvar16x8
+#define vp8_variance_subpixvar16x8 vp8_sub_pixel_variance16x8_wmt
+
+#undef vp8_variance_subpixvar16x16
+#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_wmt
+
+#undef vp8_variance_subpixmse16x16
+#define vp8_variance_subpixmse16x16 vp8_sub_pixel_mse16x16_wmt
+
+#undef vp8_variance_getmbss
+#define vp8_variance_getmbss vp8_get_mb_ss_sse2
+
+#undef vp8_variance_mse16x16
+#define vp8_variance_mse16x16 vp8_mse16x16_wmt
+
+#undef vp8_variance_get16x16prederror
+#define vp8_variance_get16x16prederror vp8_get16x16pred_error_sse2
+
+#undef vp8_variance_get8x8var
+#define vp8_variance_get8x8var vp8_get8x8var_sse2
+
+#undef vp8_variance_get16x16var
+#define vp8_variance_get16x16var vp8_get16x16var_sse2
+
+#endif
+#endif
+
+
+#if HAVE_SSE3
+extern prototype_sad(vp8_sad16x16_sse3);
+extern prototype_sad(vp8_sad16x8_sse3);
+extern prototype_sad_multi_same_address(vp8_sad16x16x3_sse3);
+extern prototype_sad_multi_same_address(vp8_sad16x8x3_sse3);
+extern prototype_sad_multi_same_address(vp8_sad8x16x3_sse3);
+extern prototype_sad_multi_same_address(vp8_sad8x8x3_sse3);
+extern prototype_sad_multi_same_address(vp8_sad4x4x3_sse3);
+
+extern prototype_sad_multi_dif_address(vp8_sad16x16x4d_sse3);
+extern prototype_sad_multi_dif_address(vp8_sad16x8x4d_sse3);
+extern prototype_sad_multi_dif_address(vp8_sad8x16x4d_sse3);
+extern prototype_sad_multi_dif_address(vp8_sad8x8x4d_sse3);
+extern prototype_sad_multi_dif_address(vp8_sad4x4x4d_sse3);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef vp8_variance_sad16x16
+#define vp8_variance_sad16x16 vp8_sad16x16_sse3
+
+#undef vp8_variance_sad16x16x3
+#define vp8_variance_sad16x16x3 vp8_sad16x16x3_sse3
+
+#undef vp8_variance_sad16x8x3
+#define vp8_variance_sad16x8x3 vp8_sad16x8x3_sse3
+
+#undef vp8_variance_sad8x16x3
+#define vp8_variance_sad8x16x3 vp8_sad8x16x3_sse3
+
+#undef vp8_variance_sad8x8x3
+#define vp8_variance_sad8x8x3 vp8_sad8x8x3_sse3
+
+#undef vp8_variance_sad4x4x3
+#define vp8_variance_sad4x4x3 vp8_sad4x4x3_sse3
+
+#undef vp8_variance_sad16x16x4d
+#define vp8_variance_sad16x16x4d vp8_sad16x16x4d_sse3
+
+#undef vp8_variance_sad16x8x4d
+#define vp8_variance_sad16x8x4d vp8_sad16x8x4d_sse3
+
+#undef vp8_variance_sad8x16x4d
+#define vp8_variance_sad8x16x4d vp8_sad8x16x4d_sse3
+
+#undef vp8_variance_sad8x8x4d
+#define vp8_variance_sad8x8x4d vp8_sad8x8x4d_sse3
+
+#undef vp8_variance_sad4x4x4d
+#define vp8_variance_sad4x4x4d vp8_sad4x4x4d_sse3
+
+#endif
+#endif
+
+
+#if HAVE_SSSE3
+extern prototype_sad_multi_same_address(vp8_sad16x16x3_ssse3);
+extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_variance_sad16x16x3
+#define vp8_variance_sad16x16x3 vp8_sad16x16x3_ssse3
+
+#undef vp8_variance_sad16x8x3
+#define vp8_variance_sad16x8x3 vp8_sad16x8x3_ssse3
+
+#endif
+#endif
+
+#endif
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
new file mode 100644
index 000000000..f1391ba8c
--- /dev/null
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/x86.h"
+#include "variance.h"
+#include "onyx_int.h"
+
+
+#if HAVE_MMX
+void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch)
+{
+ vp8_short_fdct4x4_mmx(input, output, pitch);
+ vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch);
+}
+
+void vp8_fast_fdct8x4_mmx(short *input, short *output, int pitch)
+{
+ vp8_fast_fdct4x4_mmx(input, output, pitch);
+ vp8_fast_fdct4x4_mmx(input + 4, output + 16, pitch);
+}
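+
+/* Both 8x4 forward transforms are assembled from two 4x4 MMX transforms: the
+ * second call starts four columns into the input block and writes its 16
+ * coefficients immediately after the first block's output.
+ */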
+
+int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
+ short *qcoeff_ptr, short *dequant_ptr,
+ short *scan_mask, short *round_ptr,
+ short *quant_ptr, short *dqcoeff_ptr);
+void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d)
+{
+ short *scan_mask = vp8_default_zig_zag_mask; /* d->scan_order_mask_ptr */
+ short *coeff_ptr = &b->coeff[0];
+ short *zbin_ptr = &b->zbin[0][0];
+ short *round_ptr = &b->round[0][0];
+ short *quant_ptr = &b->quant[0][0];
+ short *qcoeff_ptr = d->qcoeff;
+ short *dqcoeff_ptr = d->dqcoeff;
+ short *dequant_ptr = &d->dequant[0][0];
+
+ d->eob = vp8_fast_quantize_b_impl_mmx(
+ coeff_ptr,
+ zbin_ptr,
+ qcoeff_ptr,
+ dequant_ptr,
+ scan_mask,
+
+ round_ptr,
+ quant_ptr,
+ dqcoeff_ptr
+ );
+}
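+
+/* Wrapper pattern used throughout this file: the C function unpacks the
+ * encoder's BLOCK / BLOCKD / MACROBLOCK structures into raw pointers and
+ * forwards them to the assembly *_impl routine; here the returned value is
+ * the end-of-block index stored in d->eob.
+ */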
+
+int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+int vp8_mbblock_error_mmx(MACROBLOCK *mb, int dc)
+{
+ short *coeff_ptr = mb->block[0].coeff;
+ short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
+ return vp8_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr, dc);
+}
+
+int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
+int vp8_mbuverror_mmx(MACROBLOCK *mb)
+{
+ short *s_ptr = &mb->coeff[256];
+ short *d_ptr = &mb->e_mbd.dqcoeff[256];
+ return vp8_mbuverror_mmx_impl(s_ptr, d_ptr);
+}
+
+void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
+ short *diff, unsigned char *predictor,
+ int pitch);
+void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
+{
+ unsigned char *z = *(be->base_src) + be->src;
+ unsigned int src_stride = be->src_stride;
+ short *diff = &be->src_diff[0];
+ unsigned char *predictor = &bd->predictor[0];
+ vp8_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch);
+}
+
+#endif
+
+#if HAVE_SSE2
+void vp8_short_fdct8x4_wmt(short *input, short *output, int pitch)
+{
+ vp8_short_fdct4x4_wmt(input, output, pitch);
+ vp8_short_fdct4x4_wmt(input + 4, output + 16, pitch);
+}
+
+int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr,
+ short *qcoeff_ptr, short *dequant_ptr,
+ short *scan_mask, short *round_ptr,
+ short *quant_ptr, short *dqcoeff_ptr);
+void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d)
+{
+ short *scan_mask = vp8_default_zig_zag_mask; /* d->scan_order_mask_ptr */
+ short *coeff_ptr = &b->coeff[0];
+ short *zbin_ptr = &b->zbin[0][0];
+ short *round_ptr = &b->round[0][0];
+ short *quant_ptr = &b->quant[0][0];
+ short *qcoeff_ptr = d->qcoeff;
+ short *dqcoeff_ptr = d->dqcoeff;
+ short *dequant_ptr = &d->dequant[0][0];
+
+ d->eob = vp8_fast_quantize_b_impl_sse(
+ coeff_ptr,
+ zbin_ptr,
+ qcoeff_ptr,
+ dequant_ptr,
+ scan_mask,
+
+ round_ptr,
+ quant_ptr,
+ dqcoeff_ptr
+ );
+}
+
+int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+int vp8_mbblock_error_xmm(MACROBLOCK *mb, int dc)
+{
+ short *coeff_ptr = mb->block[0].coeff;
+ short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
+ return vp8_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr, dc);
+}
+
+int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
+int vp8_mbuverror_xmm(MACROBLOCK *mb)
+{
+ short *s_ptr = &mb->coeff[256];
+ short *d_ptr = &mb->e_mbd.dqcoeff[256];
+ return vp8_mbuverror_xmm_impl(s_ptr, d_ptr);
+}
+
+#endif
+
+void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
+{
+#if CONFIG_RUNTIME_CPU_DETECT
+ int flags = x86_simd_caps();
+ int mmx_enabled = flags & HAS_MMX;
+ int xmm_enabled = flags & HAS_SSE;
+ int wmt_enabled = flags & HAS_SSE2;
+ int SSE3Enabled = flags & HAS_SSE3;
+ int SSSE3Enabled = flags & HAS_SSSE3;
+
+ /* Note:
+ *
+ * This platform can be built without runtime CPU detection as well. If
+ * you modify any of the function mappings present in this file, be sure
+ * to also update them in the static mappings (<arch>/filename_<arch>.h).
+ */
+
+ /* Override default functions with fastest ones for this CPU. */
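+ /* The blocks below run in order of increasing capability (MMX, then SSE2,
+  * SSE3, SSSE3), so a later block overrides any pointer already set by an
+  * earlier one and the fastest supported extension wins.
+  */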
+#if HAVE_MMX
+
+ if (mmx_enabled)
+ {
+ cpi->rtcd.variance.sad16x16 = vp8_sad16x16_mmx;
+ cpi->rtcd.variance.sad16x8 = vp8_sad16x8_mmx;
+ cpi->rtcd.variance.sad8x16 = vp8_sad8x16_mmx;
+ cpi->rtcd.variance.sad8x8 = vp8_sad8x8_mmx;
+ cpi->rtcd.variance.sad4x4 = vp8_sad4x4_mmx;
+
+ cpi->rtcd.variance.var4x4 = vp8_variance4x4_mmx;
+ cpi->rtcd.variance.var8x8 = vp8_variance8x8_mmx;
+ cpi->rtcd.variance.var8x16 = vp8_variance8x16_mmx;
+ cpi->rtcd.variance.var16x8 = vp8_variance16x8_mmx;
+ cpi->rtcd.variance.var16x16 = vp8_variance16x16_mmx;
+
+ cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_mmx;
+ cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_mmx;
+ cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_mmx;
+ cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_mmx;
+ cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_mmx;
+ cpi->rtcd.variance.subpixmse16x16 = vp8_sub_pixel_mse16x16_mmx;
+
+ cpi->rtcd.variance.mse16x16 = vp8_mse16x16_mmx;
+ cpi->rtcd.variance.getmbss = vp8_get_mb_ss_mmx;
+
+ cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_mmx;
+ cpi->rtcd.variance.get8x8var = vp8_get8x8var_mmx;
+ cpi->rtcd.variance.get16x16var = vp8_get16x16var_mmx;
+ cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_mmx;
+
+ cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_mmx;
+ cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_mmx;
+ cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_mmx;
+ cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_mmx;
+ cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c;
+
+ cpi->rtcd.encodemb.berr = vp8_block_error_mmx;
+ cpi->rtcd.encodemb.mberr = vp8_mbblock_error_mmx;
+ cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_mmx;
+ cpi->rtcd.encodemb.subb = vp8_subtract_b_mmx;
+ cpi->rtcd.encodemb.submby = vp8_subtract_mby_mmx;
+ cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_mmx;
+
+ cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_mmx;
+ }
+
+#endif
+#if HAVE_SSE2
+
+ if (wmt_enabled)
+ {
+ cpi->rtcd.variance.sad16x16 = vp8_sad16x16_wmt;
+ cpi->rtcd.variance.sad16x8 = vp8_sad16x8_wmt;
+ cpi->rtcd.variance.sad8x16 = vp8_sad8x16_wmt;
+ cpi->rtcd.variance.sad8x8 = vp8_sad8x8_wmt;
+ cpi->rtcd.variance.sad4x4 = vp8_sad4x4_wmt;
+
+ cpi->rtcd.variance.var4x4 = vp8_variance4x4_wmt;
+ cpi->rtcd.variance.var8x8 = vp8_variance8x8_wmt;
+ cpi->rtcd.variance.var8x16 = vp8_variance8x16_wmt;
+ cpi->rtcd.variance.var16x8 = vp8_variance16x8_wmt;
+ cpi->rtcd.variance.var16x16 = vp8_variance16x16_wmt;
+
+ cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_wmt;
+ cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_wmt;
+ cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_wmt;
+ cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_wmt;
+ cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_wmt;
+ cpi->rtcd.variance.subpixmse16x16 = vp8_sub_pixel_mse16x16_wmt;
+
+ cpi->rtcd.variance.mse16x16 = vp8_mse16x16_wmt;
+ cpi->rtcd.variance.getmbss = vp8_get_mb_ss_sse2;
+
+ cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_sse2;
+ cpi->rtcd.variance.get8x8var = vp8_get8x8var_sse2;
+ cpi->rtcd.variance.get16x16var = vp8_get16x16var_sse2;
+ /* cpi->rtcd.variance.get4x4sse_cs not implemented for wmt */
+
+#if 0
+ /* short SSE2 DCT currently disabled, does not match the MMX version */
+ cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_wmt;
+ cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_wmt;
+#endif
+ /* cpi->rtcd.fdct.fast4x4 not implemented for wmt */
+ cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_wmt;
+ cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_sse2;
+
+ cpi->rtcd.encodemb.berr = vp8_block_error_xmm;
+ cpi->rtcd.encodemb.mberr = vp8_mbblock_error_xmm;
+ cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_xmm;
+ /* cpi->rtcd.encodemb.sub* not implemented for wmt */
+
+ cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse;
+ }
+
+#endif
+#if HAVE_SSE3
+
+ if (SSE3Enabled)
+ {
+ cpi->rtcd.variance.sad16x16 = vp8_sad16x16_sse3;
+ cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_sse3;
+ cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_sse3;
+ cpi->rtcd.variance.sad8x16x3 = vp8_sad8x16x3_sse3;
+ cpi->rtcd.variance.sad8x8x3 = vp8_sad8x8x3_sse3;
+ cpi->rtcd.variance.sad4x4x3 = vp8_sad4x4x3_sse3;
+ cpi->rtcd.search.full_search = vp8_full_search_sadx3;
+
+ cpi->rtcd.variance.sad16x16x4d = vp8_sad16x16x4d_sse3;
+ cpi->rtcd.variance.sad16x8x4d = vp8_sad16x8x4d_sse3;
+ cpi->rtcd.variance.sad8x16x4d = vp8_sad8x16x4d_sse3;
+ cpi->rtcd.variance.sad8x8x4d = vp8_sad8x8x4d_sse3;
+ cpi->rtcd.variance.sad4x4x4d = vp8_sad4x4x4d_sse3;
+ cpi->rtcd.search.diamond_search = vp8_diamond_search_sadx4;
+ }
+
+#endif
+#if HAVE_SSSE3
+
+ if (SSSE3Enabled)
+ {
+ cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_ssse3;
+ cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_ssse3;
+ }
+
+#endif
+#endif
+}