path: root/vp8/encoder/x86
Diffstat (limited to 'vp8/encoder/x86')
-rw-r--r--  vp8/encoder/x86/csystemdependent.c      | 289
-rw-r--r--  vp8/encoder/x86/dct_mmx.asm             | 407
-rw-r--r--  vp8/encoder/x86/dct_sse2.asm            | 401
-rw-r--r--  vp8/encoder/x86/dct_x86.h               |  38
-rw-r--r--  vp8/encoder/x86/encodemb_x86.h          |  24
-rw-r--r--  vp8/encoder/x86/encodeopt.asm           |  53
-rw-r--r--  vp8/encoder/x86/fwalsh_sse2.asm         | 231
-rw-r--r--  vp8/encoder/x86/mcomp_x86.h             |  20
-rw-r--r--  vp8/encoder/x86/preproc_mmx.c           |  11
-rw-r--r--  vp8/encoder/x86/quantize_mmx.asm        | 168
-rw-r--r--  vp8/encoder/x86/quantize_sse2.asm       | 388
-rwxr-xr-x  vp8/encoder/x86/quantize_ssse3.asm      | 114
-rw-r--r--  vp8/encoder/x86/quantize_x86.h          |  41
-rw-r--r--  vp8/encoder/x86/sad_mmx.asm             |  39
-rw-r--r--  vp8/encoder/x86/sad_sse2.asm            |  45
-rw-r--r--  vp8/encoder/x86/sad_sse3.asm            | 179
-rw-r--r--  vp8/encoder/x86/sad_sse4.asm            | 353
-rw-r--r--  vp8/encoder/x86/sad_ssse3.asm           |  55
-rw-r--r--  vp8/encoder/x86/subtract_mmx.asm        |  17
-rw-r--r--  vp8/encoder/x86/subtract_sse2.asm       | 356
-rw-r--r--  vp8/encoder/x86/variance_impl_mmx.asm   |  31
-rw-r--r--  vp8/encoder/x86/variance_impl_sse2.asm  |  21
-rw-r--r--  vp8/encoder/x86/variance_mmx.c          | 139
-rw-r--r--  vp8/encoder/x86/variance_sse2.c         | 218
-rw-r--r--  vp8/encoder/x86/variance_x86.h          |  64
-rw-r--r--  vp8/encoder/x86/x86_csystemdependent.c  | 195
26 files changed, 2313 insertions, 1584 deletions
diff --git a/vp8/encoder/x86/csystemdependent.c b/vp8/encoder/x86/csystemdependent.c
deleted file mode 100644
index 186ee6856..000000000
--- a/vp8/encoder/x86/csystemdependent.c
+++ /dev/null
@@ -1,289 +0,0 @@
-/*
- * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license and patent
- * grant that can be found in the LICENSE file in the root of the source
- * tree. All contributing project authors may be found in the AUTHORS
- * file in the root of the source tree.
- */
-
-
-#include "variance.h"
-#include "onyx_int.h"
-
-SADFunction *vp8_sad16x16;
-SADFunction *vp8_sad16x8;
-SADFunction *vp8_sad8x16;
-SADFunction *vp8_sad8x8;
-SADFunction *vp8_sad4x4;
-
-variance_function *vp8_variance4x4;
-variance_function *vp8_variance8x8;
-variance_function *vp8_variance8x16;
-variance_function *vp8_variance16x8;
-variance_function *vp8_variance16x16;
-
-
-variance_function *vp8_mse16x16;
-
-sub_pixel_variance_function *vp8_sub_pixel_variance4x4;
-sub_pixel_variance_function *vp8_sub_pixel_variance8x8;
-sub_pixel_variance_function *vp8_sub_pixel_variance8x16;
-sub_pixel_variance_function *vp8_sub_pixel_variance16x8;
-sub_pixel_variance_function *vp8_sub_pixel_variance16x16;
-
-int (*vp8_block_error)(short *, short *);
-int (*vp8_mbblock_error)(MACROBLOCK *mb, int dc);
-void (*vp8_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, int stride);
-
-extern void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride);
-extern void vp8_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride);
-
-extern int vp8_block_error_c(short *, short *);
-extern int vp8_mbblock_error_c(MACROBLOCK *x, int dc);
-
-extern int vp8_block_error_mmx(short *, short *);
-extern int vp8_mbblock_error_mmx(MACROBLOCK *x, int dc);
-
-extern int vp8_block_error_xmm(short *, short *);
-extern int vp8_mbblock_error_xmm(MACROBLOCK *x, int dc);
-
-
-
-int (*vp8_mbuverror)(MACROBLOCK *mb);
-unsigned int (*vp8_get_mb_ss)(short *);
-void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
-void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
-void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
-void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);
-
-void (*vp8_subtract_b)(BLOCK *be, BLOCKD *bd, int pitch);
-void (*vp8_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
-unsigned int (*vp8_get16x16pred_error)(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
-unsigned int (*vp8_get8x8var)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-unsigned int (*vp8_get16x16var)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
-
-// c imports
-extern int vp8_mbuverror_c(MACROBLOCK *mb);
-extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-extern void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
-extern void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
-extern void vp8_fast_fdct4x4_c(short *input, short *output, int pitch);
-extern void vp8_fast_fdct8x4_c(short *input, short *output, int pitch);
-
-
-extern void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch);
-extern void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
-
-extern SADFunction vp8_sad16x16_c;
-extern SADFunction vp8_sad16x8_c;
-extern SADFunction vp8_sad8x16_c;
-extern SADFunction vp8_sad8x8_c;
-extern SADFunction vp8_sad4x4_c;
-
-extern SADFunction vp8_sad16x16_wmt;
-extern SADFunction vp8_sad16x8_wmt;
-extern SADFunction vp8_sad8x16_wmt;
-extern SADFunction vp8_sad8x8_wmt;
-extern SADFunction vp8_sad4x4_wmt;
-
-extern SADFunction vp8_sad16x16_mmx;
-extern SADFunction vp8_sad16x8_mmx;
-extern SADFunction vp8_sad8x16_mmx;
-extern SADFunction vp8_sad8x8_mmx;
-extern SADFunction vp8_sad4x4_mmx;
-
-extern variance_function vp8_variance16x16_c;
-extern variance_function vp8_variance8x16_c;
-extern variance_function vp8_variance16x8_c;
-extern variance_function vp8_variance8x8_c;
-extern variance_function vp8_variance4x4_c;
-extern variance_function vp8_mse16x16_c;
-
-extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_c;
-extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_c;
-extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_c;
-extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_c;
-extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_c;
-
-extern unsigned int vp8_get_mb_ss_c(short *);
-extern unsigned int vp8_get16x16pred_error_c(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
-extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-extern unsigned int vp8_get16x16var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-extern unsigned int vp8_get4x4sse_cs_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
-
-// mmx imports
-extern int vp8_mbuverror_mmx(MACROBLOCK *mb);
-extern void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d);
-extern void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch);
-extern void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-extern void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch);
-extern void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch);
-extern void vp8_fast_fdct8x4_mmx(short *input, short *output, int pitch);
-extern void vp8_fast_fdct4x4_mmx(short *input, short *output, int pitch);
-extern variance_function vp8_variance4x4_mmx;
-extern variance_function vp8_variance8x8_mmx;
-extern variance_function vp8_variance8x16_mmx;
-extern variance_function vp8_variance16x8_mmx;
-extern variance_function vp8_variance16x16_mmx;
-
-extern variance_function vp8_mse16x16_mmx;
-extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_mmx;
-extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_mmx;
-extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_mmx;
-extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_mmx;
-extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_mmx;
-
-extern unsigned int vp8_get16x16pred_error_mmx(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
-extern unsigned int vp8_get_mb_ss_mmx(short *);
-extern unsigned int vp8_get8x8var_mmx(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-extern unsigned int vp8_get16x16var_mmx(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-extern unsigned int vp8_get4x4sse_cs_mmx(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
-
-
-// wmt imports
-extern int vp8_mbuverror_xmm(MACROBLOCK *mb);
-extern void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d);
-extern void vp8_fast_fdct8x4_wmt(short *input, short *output, int pitch);
-extern variance_function vp8_variance4x4_wmt;
-extern variance_function vp8_variance8x8_wmt;
-extern variance_function vp8_variance8x16_wmt;
-extern variance_function vp8_variance16x8_wmt;
-extern variance_function vp8_variance16x16_wmt;
-
-extern variance_function vp8_mse16x16_wmt;
-extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_wmt;
-extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_wmt;
-extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_wmt;
-extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_wmt;
-extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_wmt;
-extern unsigned int vp8_get16x16pred_error_sse2(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
-extern unsigned int vp8_get_mb_ss_sse2(short *src_ptr);
-extern unsigned int vp8_get8x8var_sse2(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-extern unsigned int vp8_get16x16var_sse2(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-
-extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled);
-
-void vp8_cmachine_specific_config(void)
-{
- int mmx_enabled;
- int xmm_enabled;
- int wmt_enabled;
-
- vpx_get_processor_flags(&mmx_enabled, &xmm_enabled, &wmt_enabled);
-
- if (wmt_enabled) // Willamette
- {
- // Willamette instruction set available:
- vp8_mbuverror = vp8_mbuverror_xmm;
- vp8_fast_quantize_b = vp8_fast_quantize_b_sse;
- vp8_short_fdct4x4 = vp8_short_fdct4x4_mmx;
- vp8_short_fdct8x4 = vp8_short_fdct8x4_mmx;
- vp8_fast_fdct4x4 = vp8_fast_fdct4x4_mmx;
- vp8_fast_fdct8x4 = vp8_fast_fdct8x4_wmt;
- vp8_subtract_b = vp8_subtract_b_mmx;
- vp8_subtract_mbuv = vp8_subtract_mbuv_mmx;
- vp8_variance4x4 = vp8_variance4x4_mmx;
- vp8_variance8x8 = vp8_variance8x8_mmx;
- vp8_variance8x16 = vp8_variance8x16_wmt;
- vp8_variance16x8 = vp8_variance16x8_wmt;
- vp8_variance16x16 = vp8_variance16x16_wmt;
- vp8_mse16x16 = vp8_mse16x16_wmt;
- vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_wmt;
- vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_wmt;
- vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_wmt;
- vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_wmt;
- vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_wmt;
- vp8_get_mb_ss = vp8_get_mb_ss_sse2;
- vp8_get16x16pred_error = vp8_get16x16pred_error_sse2;
- vp8_get8x8var = vp8_get8x8var_sse2;
- vp8_get16x16var = vp8_get16x16var_sse2;
- vp8_get4x4sse_cs = vp8_get4x4sse_cs_mmx;
- vp8_sad16x16 = vp8_sad16x16_wmt;
- vp8_sad16x8 = vp8_sad16x8_wmt;
- vp8_sad8x16 = vp8_sad8x16_wmt;
- vp8_sad8x8 = vp8_sad8x8_wmt;
- vp8_sad4x4 = vp8_sad4x4_wmt;
- vp8_block_error = vp8_block_error_xmm;
- vp8_mbblock_error = vp8_mbblock_error_xmm;
- vp8_subtract_mby = vp8_subtract_mby_mmx;
-
- }
- else if (mmx_enabled)
- {
- // MMX instruction set available:
- vp8_mbuverror = vp8_mbuverror_mmx;
- vp8_fast_quantize_b = vp8_fast_quantize_b_mmx;
- vp8_short_fdct4x4 = vp8_short_fdct4x4_mmx;
- vp8_short_fdct8x4 = vp8_short_fdct8x4_mmx;
- vp8_fast_fdct4x4 = vp8_fast_fdct4x4_mmx;
- vp8_fast_fdct8x4 = vp8_fast_fdct8x4_mmx;
- vp8_subtract_b = vp8_subtract_b_mmx;
- vp8_subtract_mbuv = vp8_subtract_mbuv_mmx;
- vp8_variance4x4 = vp8_variance4x4_mmx;
- vp8_variance8x8 = vp8_variance8x8_mmx;
- vp8_variance8x16 = vp8_variance8x16_mmx;
- vp8_variance16x8 = vp8_variance16x8_mmx;
- vp8_variance16x16 = vp8_variance16x16_mmx;
- vp8_mse16x16 = vp8_mse16x16_mmx;
- vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_mmx;
- vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_mmx;
- vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_mmx;
- vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_mmx;
- vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_mmx;
- vp8_get_mb_ss = vp8_get_mb_ss_mmx;
- vp8_get16x16pred_error = vp8_get16x16pred_error_mmx;
- vp8_get8x8var = vp8_get8x8var_mmx;
- vp8_get16x16var = vp8_get16x16var_mmx;
- vp8_get4x4sse_cs = vp8_get4x4sse_cs_mmx;
- vp8_sad16x16 = vp8_sad16x16_mmx;
- vp8_sad16x8 = vp8_sad16x8_mmx;
- vp8_sad8x16 = vp8_sad8x16_mmx;
- vp8_sad8x8 = vp8_sad8x8_mmx;
- vp8_sad4x4 = vp8_sad4x4_mmx;
- vp8_block_error = vp8_block_error_mmx;
- vp8_mbblock_error = vp8_mbblock_error_mmx;
- vp8_subtract_mby = vp8_subtract_mby_mmx;
-
- }
- else
- {
- // Pure C:
- vp8_mbuverror = vp8_mbuverror_c;
- vp8_fast_quantize_b = vp8_fast_quantize_b_c;
- vp8_short_fdct4x4 = vp8_short_fdct4x4_c;
- vp8_short_fdct8x4 = vp8_short_fdct8x4_c;
- vp8_fast_fdct4x4 = vp8_fast_fdct4x4_c;
- vp8_fast_fdct8x4 = vp8_fast_fdct8x4_c;
- vp8_subtract_b = vp8_subtract_b_c;
- vp8_subtract_mbuv = vp8_subtract_mbuv_c;
- vp8_variance4x4 = vp8_variance4x4_c;
- vp8_variance8x8 = vp8_variance8x8_c;
- vp8_variance8x16 = vp8_variance8x16_c;
- vp8_variance16x8 = vp8_variance16x8_c;
- vp8_variance16x16 = vp8_variance16x16_c;
- vp8_mse16x16 = vp8_mse16x16_c;
- vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_c;
- vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_c;
- vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_c;
- vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_c;
- vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_c;
- vp8_get_mb_ss = vp8_get_mb_ss_c;
- vp8_get16x16pred_error = vp8_get16x16pred_error_c;
- vp8_get8x8var = vp8_get8x8var_c;
- vp8_get16x16var = vp8_get16x16var_c;
- vp8_get4x4sse_cs = vp8_get4x4sse_cs_c;
- vp8_sad16x16 = vp8_sad16x16_c;
- vp8_sad16x8 = vp8_sad16x8_c;
- vp8_sad8x16 = vp8_sad8x16_c;
- vp8_sad8x8 = vp8_sad8x8_c;
- vp8_sad4x4 = vp8_sad4x4_c;
- vp8_block_error = vp8_block_error_c;
- vp8_mbblock_error = vp8_mbblock_error_c;
- vp8_subtract_mby = vp8_subtract_mby_c;
- }
-
-}
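
The file deleted above selected every encoder kernel once at startup and stored the chosen implementation in a global function pointer, keyed off CPU feature flags. A minimal sketch of that dispatch pattern, reduced to a single SAD pointer (the init_sad_dispatch wrapper name is illustrative; SADFunction, vpx_get_processor_flags and the _c/_mmx/_wmt kernels are as declared in the deleted file):

/* Runtime dispatch through a global function pointer, as the deleted
 * csystemdependent.c did for every kernel.  Sketch only. */
#include "variance.h"                 /* provides the SADFunction typedef */

SADFunction *vp8_sad16x16;            /* set once at init, used everywhere */

extern SADFunction vp8_sad16x16_c;    /* portable C fallback        */
extern SADFunction vp8_sad16x16_mmx;  /* MMX kernel                 */
extern SADFunction vp8_sad16x16_wmt;  /* SSE2 ("Willamette") kernel */

extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled,
                                    int *wmt_enabled);

static void init_sad_dispatch(void)   /* illustrative wrapper name */
{
    int mmx_enabled, xmm_enabled, wmt_enabled;

    vpx_get_processor_flags(&mmx_enabled, &xmm_enabled, &wmt_enabled);

    if (wmt_enabled)
        vp8_sad16x16 = vp8_sad16x16_wmt;
    else if (mmx_enabled)
        vp8_sad16x16 = vp8_sad16x16_mmx;
    else
        vp8_sad16x16 = vp8_sad16x16_c;
}
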
diff --git a/vp8/encoder/x86/dct_mmx.asm b/vp8/encoder/x86/dct_mmx.asm
index e13423796..5acaca875 100644
--- a/vp8/encoder/x86/dct_mmx.asm
+++ b/vp8/encoder/x86/dct_mmx.asm
@@ -1,10 +1,11 @@
;
-; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
;
@@ -12,8 +13,7 @@
section .text
global sym(vp8_short_fdct4x4_mmx)
- global sym(vp8_fast_fdct4x4_mmx)
- global sym(vp8_fast_fdct8x4_wmt)
+ global sym(vp8_short_fdct8x4_wmt)
%define DCTCONSTANTSBITS (16)
@@ -23,10 +23,6 @@ section .text
%define x_c3 (25080) ; cos(pi*3/8) * (1<<15)
-%define _1STSTAGESHIFT 14
-%define _2NDSTAGESHIFT 16
-
-; using matrix multiply with source and destbuffer has a pitch
;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
sym(vp8_short_fdct4x4_mmx):
push rbp
@@ -36,337 +32,10 @@ sym(vp8_short_fdct4x4_mmx):
push rsi
push rdi
; end prolog
-
- mov rsi, arg(0) ;input
- mov rdi, arg(1) ;output
-
- movsxd rax, dword ptr arg(2) ;pitch
- lea rdx, [dct_matrix GLOBAL]
-
- movq mm0, [rsi ]
- movq mm1, [rsi + rax]
-
- movq mm2, [rsi + rax*2]
- lea rsi, [rsi + rax*2]
-
- movq mm3, [rsi + rax]
-
- ; first column
- movq mm4, mm0
- movq mm7, [rdx]
-
- pmaddwd mm4, mm7
- movq mm5, mm1
-
- pmaddwd mm5, mm7
- movq mm6, mm4
-
- punpckldq mm4, mm5
- punpckhdq mm6, mm5
-
- paddd mm4, mm6
- movq mm5, mm2
-
-
- pmaddwd mm5, mm7
- movq mm6, mm3
-
- pmaddwd mm6, mm7
- movq mm7, mm5
-
- punpckldq mm5, mm6
- punpckhdq mm7, mm6
-
- paddd mm5, mm7
- movq mm6, [dct1st_stage_rounding_mmx GLOBAL]
-
- paddd mm4, mm6
- paddd mm5, mm6
-
- psrad mm4, _1STSTAGESHIFT
- psrad mm5, _1STSTAGESHIFT
-
- packssdw mm4, mm5
- movq [rdi], mm4
-
- ;second column
- movq mm4, mm0
-
- pmaddwd mm4, [rdx+8]
- movq mm5, mm1
-
- pmaddwd mm5, [rdx+8]
- movq mm6, mm4
-
- punpckldq mm4, mm5
- punpckhdq mm6, mm5
-
- paddd mm4, mm6
- movq mm5, mm2
-
- pmaddwd mm5, [rdx+8]
- movq mm6, mm3
-
- pmaddwd mm6, [rdx+8]
- movq mm7, mm5
-
- punpckldq mm5, mm6
- punpckhdq mm7, mm6
-
- paddd mm5, mm7
- movq mm6, [dct1st_stage_rounding_mmx GLOBAL]
-
- paddd mm4, mm6
- paddd mm5, mm6
-
- psrad mm4, _1STSTAGESHIFT
- psrad mm5, _1STSTAGESHIFT
-
- packssdw mm4, mm5
- movq [rdi+8], mm4
-
-
- ;third column
- movq mm4, mm0
-
- pmaddwd mm4, [rdx+16]
- movq mm5, mm1
-
- pmaddwd mm5, [rdx+16]
- movq mm6, mm4
-
- punpckldq mm4, mm5
- punpckhdq mm6, mm5
-
- paddd mm4, mm6
- movq mm5, mm2
-
- pmaddwd mm5, [rdx+16]
- movq mm6, mm3
-
- pmaddwd mm6, [rdx+16]
- movq mm7, mm5
-
- punpckldq mm5, mm6
- punpckhdq mm7, mm6
-
- paddd mm5, mm7
- movq mm6, [dct1st_stage_rounding_mmx GLOBAL]
-
- paddd mm4, mm6
- paddd mm5, mm6
-
- psrad mm4, _1STSTAGESHIFT
- psrad mm5, _1STSTAGESHIFT
-
- packssdw mm4, mm5
- movq [rdi+16], mm4
-
- ;fourth column (this is the last column, so we do not have save the source any more)
-
- pmaddwd mm0, [rdx+24]
-
- pmaddwd mm1, [rdx+24]
- movq mm6, mm0
-
- punpckldq mm0, mm1
- punpckhdq mm6, mm1
-
- paddd mm0, mm6
-
- pmaddwd mm2, [rdx+24]
-
- pmaddwd mm3, [rdx+24]
- movq mm7, mm2
-
- punpckldq mm2, mm3
- punpckhdq mm7, mm3
-
- paddd mm2, mm7
- movq mm6, [dct1st_stage_rounding_mmx GLOBAL]
-
- paddd mm0, mm6
- paddd mm2, mm6
-
- psrad mm0, _1STSTAGESHIFT
- psrad mm2, _1STSTAGESHIFT
-
- packssdw mm0, mm2
-
- movq mm3, mm0
-
- ; done with one pass
- ; now start second pass
- movq mm0, [rdi ]
- movq mm1, [rdi+ 8]
- movq mm2, [rdi+ 16]
-
- movq mm4, mm0
-
- pmaddwd mm4, [rdx]
- movq mm5, mm1
-
- pmaddwd mm5, [rdx]
- movq mm6, mm4
-
- punpckldq mm4, mm5
- punpckhdq mm6, mm5
-
- paddd mm4, mm6
- movq mm5, mm2
-
- pmaddwd mm5, [rdx]
- movq mm6, mm3
-
- pmaddwd mm6, [rdx]
- movq mm7, mm5
-
- punpckldq mm5, mm6
- punpckhdq mm7, mm6
-
- paddd mm5, mm7
- movq mm6, [dct2nd_stage_rounding_mmx GLOBAL]
-
- paddd mm4, mm6
- paddd mm5, mm6
-
- psrad mm4, _2NDSTAGESHIFT
- psrad mm5, _2NDSTAGESHIFT
-
- packssdw mm4, mm5
- movq [rdi], mm4
-
- ;second column
- movq mm4, mm0
-
- pmaddwd mm4, [rdx+8]
- movq mm5, mm1
-
- pmaddwd mm5, [rdx+8]
- movq mm6, mm4
-
- punpckldq mm4, mm5
- punpckhdq mm6, mm5
-
- paddd mm4, mm6
- movq mm5, mm2
-
- pmaddwd mm5, [rdx+8]
- movq mm6, mm3
-
- pmaddwd mm6, [rdx+8]
- movq mm7, mm5
-
- punpckldq mm5, mm6
- punpckhdq mm7, mm6
-
- paddd mm5, mm7
- movq mm6, [dct2nd_stage_rounding_mmx GLOBAL]
-
- paddd mm4, mm6
- paddd mm5, mm6
-
- psrad mm4, _2NDSTAGESHIFT
- psrad mm5, _2NDSTAGESHIFT
-
- packssdw mm4, mm5
- movq [rdi+8], mm4
-
-
- ;third column
- movq mm4, mm0
-
- pmaddwd mm4, [rdx+16]
- movq mm5, mm1
-
- pmaddwd mm5, [rdx+16]
- movq mm6, mm4
-
- punpckldq mm4, mm5
- punpckhdq mm6, mm5
-
- paddd mm4, mm6
- movq mm5, mm2
-
- pmaddwd mm5, [rdx+16]
- movq mm6, mm3
-
- pmaddwd mm6, [rdx+16]
- movq mm7, mm5
-
- punpckldq mm5, mm6
- punpckhdq mm7, mm6
-
- paddd mm5, mm7
- movq mm6, [dct2nd_stage_rounding_mmx GLOBAL]
-
- paddd mm4, mm6
- paddd mm5, mm6
-
- psrad mm4, _2NDSTAGESHIFT
- psrad mm5, _2NDSTAGESHIFT
-
- packssdw mm4, mm5
- movq [rdi+16], mm4
-
- ;fourth column
- movq mm4, mm0
-
- pmaddwd mm4, [rdx+24]
- movq mm5, mm1
-
- pmaddwd mm5, [rdx+24]
- movq mm6, mm4
-
- punpckldq mm4, mm5
- punpckhdq mm6, mm5
-
- paddd mm4, mm6
- movq mm5, mm2
-
- pmaddwd mm5, [rdx+24]
- movq mm6, mm3
-
- pmaddwd mm6, [rdx+24]
- movq mm7, mm5
-
- punpckldq mm5, mm6
- punpckhdq mm7, mm6
-
- paddd mm5, mm7
- movq mm6, [dct2nd_stage_rounding_mmx GLOBAL]
-
- paddd mm4, mm6
- paddd mm5, mm6
-
- psrad mm4, _2NDSTAGESHIFT
- psrad mm5, _2NDSTAGESHIFT
-
- packssdw mm4, mm5
- movq [rdi+24], mm4
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_fast_fdct4x4_mmx(short *input, short *output, int pitch)
-sym(vp8_fast_fdct4x4_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 3
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
mov rsi, arg(0) ;input
mov rdi, arg(1) ;output
- lea rdx, [dct_const_mmx GLOBAL]
+ lea rdx, [GLOBAL(dct_const_mmx)]
movsxd rax, dword ptr arg(2) ;pitch
lea rcx, [rsi + rax*2]
@@ -378,11 +47,11 @@ sym(vp8_fast_fdct4x4_mmx):
movq mm3, [rcx + rax]
; get the constants
;shift to left by 1 for prescision
- paddw mm0, mm0
- paddw mm1, mm1
+ psllw mm0, 3
+ psllw mm1, 3
- psllw mm2, 1
- psllw mm3, 1
+ psllw mm2, 3
+ psllw mm3, 3
; transpose for the second stage
movq mm4, mm0 ; 00 01 02 03
@@ -530,20 +199,23 @@ sym(vp8_fast_fdct4x4_mmx):
movq mm3, mm5
; done with vertical
- pcmpeqw mm4, mm4
- pcmpeqw mm5, mm5
- psrlw mm4, 15
- psrlw mm5, 15
+ pcmpeqw mm4, mm4
+ pcmpeqw mm5, mm5
+ psrlw mm4, 15
+ psrlw mm5, 15
+
+ psllw mm4, 2
+ psllw mm5, 2
paddw mm0, mm4
paddw mm1, mm5
paddw mm2, mm4
paddw mm3, mm5
- psraw mm0, 1
- psraw mm1, 1
- psraw mm2, 1
- psraw mm3, 1
+ psraw mm0, 3
+ psraw mm1, 3
+ psraw mm2, 3
+ psraw mm3, 3
movq [rdi ], mm0
movq [rdi+ 8], mm1
@@ -559,8 +231,8 @@ sym(vp8_fast_fdct4x4_mmx):
ret
-;void vp8_fast_fdct8x4_wmt(short *input, short *output, int pitch)
-sym(vp8_fast_fdct8x4_wmt):
+;void vp8_short_fdct8x4_wmt(short *input, short *output, int pitch)
+sym(vp8_short_fdct8x4_wmt):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 3
@@ -571,7 +243,7 @@ sym(vp8_fast_fdct8x4_wmt):
mov rsi, arg(0) ;input
mov rdi, arg(1) ;output
- lea rdx, [dct_const_xmm GLOBAL]
+ lea rdx, [GLOBAL(dct_const_xmm)]
movsxd rax, dword ptr arg(2) ;pitch
lea rcx, [rsi + rax*2]
@@ -583,11 +255,11 @@ sym(vp8_fast_fdct8x4_wmt):
movdqa xmm3, [rcx + rax]
; get the constants
;shift to left by 1 for prescision
- psllw xmm0, 1
- psllw xmm2, 1
+ psllw xmm0, 3
+ psllw xmm2, 3
- psllw xmm4, 1
- psllw xmm3, 1
+ psllw xmm4, 3
+ psllw xmm3, 3
; transpose for the second stage
movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07
@@ -757,20 +429,23 @@ sym(vp8_fast_fdct8x4_wmt):
; done with vertical
- pcmpeqw xmm4, xmm4
- pcmpeqw xmm5, xmm5;
- psrlw xmm4, 15
- psrlw xmm5, 15
+ pcmpeqw xmm4, xmm4
+ pcmpeqw xmm5, xmm5;
+ psrlw xmm4, 15
+ psrlw xmm5, 15
+
+ psllw xmm4, 2
+ psllw xmm5, 2
paddw xmm0, xmm4
paddw xmm1, xmm5
paddw xmm2, xmm4
paddw xmm3, xmm5
- psraw xmm0, 1
- psraw xmm1, 1
- psraw xmm2, 1
- psraw xmm3, 1
+ psraw xmm0, 3
+ psraw xmm1, 3
+ psraw xmm2, 3
+ psraw xmm3, 3
movq QWORD PTR[rdi ], xmm0
movq QWORD PTR[rdi+ 8], xmm1
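
The hunks above drop the matrix-multiply body of vp8_short_fdct4x4_mmx, reuse the former "fast" butterfly code in its place, and rename vp8_fast_fdct8x4_wmt to vp8_short_fdct8x4_wmt. The arithmetic change is a precision bump: inputs are now pre-scaled by 8 (psllw 3) instead of 2, and the final descale becomes (x + 4) >> 3 instead of (x + 1) >> 1 (the all-ones rounding word built with pcmpeqw/psrlw 15 is shifted left by 2 to become 4). A scalar sketch of that round-to-nearest shift, assuming nothing beyond what the hunks show:

/* Round-to-nearest arithmetic right shift applied to the transform output.
 * Old path: pre-scale << 1, descale with bits == 1.
 * New path: pre-scale << 3, descale with bits == 3, keeping two extra
 * fractional bits through the intermediate butterfly stages. */
static inline short descale(int y, int bits)
{
    return (short)((y + (1 << (bits - 1))) >> bits);
}
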
diff --git a/vp8/encoder/x86/dct_sse2.asm b/vp8/encoder/x86/dct_sse2.asm
index 3e5e9a70c..723a78d76 100644
--- a/vp8/encoder/x86/dct_sse2.asm
+++ b/vp8/encoder/x86/dct_sse2.asm
@@ -1,260 +1,189 @@
;
-; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
-global sym(vp8_short_fdct4x4_wmt)
-
-%define DCTCONSTANTSBITS (16)
-%define DCTROUNDINGVALUE (1<< (DCTCONSTANTSBITS-1))
-%define x_c1 (60547) ; cos(pi /8) * (1<<15)
-%define x_c2 (46341) ; cos(pi*2/8) * (1<<15)
-%define x_c3 (25080) ; cos(pi*3/8) * (1<<15)
-
-%define _1STSTAGESHIFT 14
-%define _2NDSTAGESHIFT 16
-
-
-;; using matrix multiply
-;void vp8_short_fdct4x4_wmt(short *input, short *output)
-sym(vp8_short_fdct4x4_wmt):
+;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
+global sym(vp8_short_fdct4x4_sse2)
+sym(vp8_short_fdct4x4_sse2):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 2
+ SHADOW_ARGS_TO_STACK 3
+;; SAVE_XMM
GET_GOT rbx
+ push rsi
+ push rdi
; end prolog
- mov rax, arg(0) ;input
- mov rcx, arg(1) ;output
-
- lea rdx, [dct_matrix_sse2 GLOBAL]
-
- movdqu xmm0, [rax ]
- movdqu xmm1, [rax+16]
-
- ; first column
- movdqa xmm2, xmm0
- movdqa xmm7, [rdx]
-
- pmaddwd xmm2, xmm7
- movdqa xmm3, xmm1
-
- pmaddwd xmm3, xmm7
- movdqa xmm4, xmm2
-
- punpckldq xmm2, xmm3
- punpckhdq xmm4, xmm3
-
- movdqa xmm3, xmm2
- punpckldq xmm2, xmm4
-
- punpckhdq xmm3, xmm4
- paddd xmm2, xmm3
-
-
- paddd xmm2, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
- psrad xmm2, _1STSTAGESHIFT
- ;second column
- movdqa xmm3, xmm0
- pmaddwd xmm3, [rdx+16]
-
- movdqa xmm4, xmm1
- pmaddwd xmm4, [rdx+16]
-
- movdqa xmm5, xmm3
- punpckldq xmm3, xmm4
-
- punpckhdq xmm5, xmm4
- movdqa xmm4, xmm3
-
- punpckldq xmm3, xmm5
- punpckhdq xmm4, xmm5
-
- paddd xmm3, xmm4
- paddd xmm3, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
-
-
- psrad xmm3, _1STSTAGESHIFT
- packssdw xmm2, xmm3
-
- ;third column
- movdqa xmm3, xmm0
- pmaddwd xmm3, [rdx+32]
-
- movdqa xmm4, xmm1
- pmaddwd xmm4, [rdx+32]
-
- movdqa xmm5, xmm3
- punpckldq xmm3, xmm4
-
- punpckhdq xmm5, xmm4
- movdqa xmm4, xmm3
-
- punpckldq xmm3, xmm5
- punpckhdq xmm4, xmm5
-
- paddd xmm3, xmm4
- paddd xmm3, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
-
- psrad xmm3, _1STSTAGESHIFT
-
- ;fourth column (this is the last column, so we do not have save the source any more)
- pmaddwd xmm0, [rdx+48]
- pmaddwd xmm1, [rdx+48]
-
- movdqa xmm4, xmm0
- punpckldq xmm0, xmm1
-
- punpckhdq xmm4, xmm1
- movdqa xmm1, xmm0
-
- punpckldq xmm0, xmm4
- punpckhdq xmm1, xmm4
-
- paddd xmm0, xmm1
- paddd xmm0, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
-
-
- psrad xmm0, _1STSTAGESHIFT
- packssdw xmm3, xmm0
- ; done with one pass
- ; now start second pass
- movdqa xmm0, xmm2
- movdqa xmm1, xmm3
-
- pmaddwd xmm2, xmm7
- pmaddwd xmm3, xmm7
-
- movdqa xmm4, xmm2
- punpckldq xmm2, xmm3
+ mov rsi, arg(0)
+ movsxd rax, DWORD PTR arg(2)
+ lea rdi, [rsi + rax*2]
+
+ movq xmm0, MMWORD PTR[rsi ] ;03 02 01 00
+ movq xmm2, MMWORD PTR[rsi + rax] ;13 12 11 10
+ movq xmm1, MMWORD PTR[rsi + rax*2] ;23 22 21 20
+ movq xmm3, MMWORD PTR[rdi + rax] ;33 32 31 30
+
+ punpcklqdq xmm0, xmm2 ;13 12 11 10 03 02 01 00
+ punpcklqdq xmm1, xmm3 ;33 32 31 30 23 22 21 20
+
+ mov rdi, arg(1)
+
+ movdqa xmm2, xmm0
+ punpckldq xmm0, xmm1 ;23 22 03 02 21 20 01 00
+ punpckhdq xmm2, xmm1 ;33 32 13 12 31 30 11 10
+ movdqa xmm1, xmm0
+ punpckldq xmm0, xmm2 ;31 21 30 20 11 10 01 00
+ pshufhw xmm1, xmm1, 0b1h ;22 23 02 03 xx xx xx xx
+ pshufhw xmm2, xmm2, 0b1h ;32 33 12 13 xx xx xx xx
+
+ punpckhdq xmm1, xmm2 ;32 33 22 23 12 13 02 03
+ movdqa xmm3, xmm0
+ paddw xmm0, xmm1 ;b1 a1 b1 a1 b1 a1 b1 a1
+ psubw xmm3, xmm1 ;c1 d1 c1 d1 c1 d1 c1 d1
+ psllw xmm0, 3 ;b1 <<= 3 a1 <<= 3
+ psllw xmm3, 3 ;c1 <<= 3 d1 <<= 3
+ movdqa xmm1, xmm0
+ pmaddwd xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
+ pmaddwd xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
+ movdqa xmm4, xmm3
+ pmaddwd xmm3, XMMWORD PTR[GLOBAL(_5352_2217)] ;c1*2217 + d1*5352
+ pmaddwd xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)];d1*2217 - c1*5352
+
+ paddd xmm3, XMMWORD PTR[GLOBAL(_14500)]
+ paddd xmm4, XMMWORD PTR[GLOBAL(_7500)]
+ psrad xmm3, 12 ;(c1 * 2217 + d1 * 5352 + 14500)>>12
+ psrad xmm4, 12 ;(d1 * 2217 - c1 * 5352 + 7500)>>12
+
+ packssdw xmm0, xmm1 ;op[2] op[0]
+ packssdw xmm3, xmm4 ;op[3] op[1]
+ ; 23 22 21 20 03 02 01 00
+ ;
+ ; 33 32 31 30 13 12 11 10
+ ;
+ movdqa xmm2, xmm0
+ punpcklqdq xmm0, xmm3 ;13 12 11 10 03 02 01 00
+ punpckhqdq xmm2, xmm3 ;23 22 21 20 33 32 31 30
+
+ movdqa xmm3, xmm0
+ punpcklwd xmm0, xmm2 ;32 30 22 20 12 10 02 00
+ punpckhwd xmm3, xmm2 ;33 31 23 21 13 11 03 01
+ movdqa xmm2, xmm0
+ punpcklwd xmm0, xmm3 ;13 12 11 10 03 02 01 00
+ punpckhwd xmm2, xmm3 ;33 32 31 30 23 22 21 20
+
+ movdqa xmm5, XMMWORD PTR[GLOBAL(_7)]
+ pshufd xmm2, xmm2, 04eh
+ movdqa xmm3, xmm0
+ paddw xmm0, xmm2 ;b1 b1 b1 b1 a1 a1 a1 a1
+ psubw xmm3, xmm2 ;c1 c1 c1 c1 d1 d1 d1 d1
+
+ pshufd xmm0, xmm0, 0d8h ;b1 b1 a1 a1 b1 b1 a1 a1
+ movdqa xmm2, xmm3 ;save d1 for compare
+ pshufd xmm3, xmm3, 0d8h ;c1 c1 d1 d1 c1 c1 d1 d1
+ pshuflw xmm0, xmm0, 0d8h ;b1 b1 a1 a1 b1 a1 b1 a1
+ pshuflw xmm3, xmm3, 0d8h ;c1 c1 d1 d1 c1 d1 c1 d1
+ pshufhw xmm0, xmm0, 0d8h ;b1 a1 b1 a1 b1 a1 b1 a1
+ pshufhw xmm3, xmm3, 0d8h ;c1 d1 c1 d1 c1 d1 c1 d1
+ movdqa xmm1, xmm0
+ pmaddwd xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
+ pmaddwd xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
+
+ pxor xmm4, xmm4 ;zero out for compare
+ paddd xmm0, xmm5
+ paddd xmm1, xmm5
+ pcmpeqw xmm2, xmm4
+ psrad xmm0, 4 ;(a1 + b1 + 7)>>4
+ psrad xmm1, 4 ;(a1 - b1 + 7)>>4
+ pandn xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper,
+ ;and keep bit 0 of lower
+
+ movdqa xmm4, xmm3
+ pmaddwd xmm3, XMMWORD PTR[GLOBAL(_5352_2217)] ;c1*2217 + d1*5352
+ pmaddwd xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352
+ paddd xmm3, XMMWORD PTR[GLOBAL(_12000)]
+ paddd xmm4, XMMWORD PTR[GLOBAL(_51000)]
+ packssdw xmm0, xmm1 ;op[8] op[0]
+ psrad xmm3, 16 ;(c1 * 2217 + d1 * 5352 + 12000)>>16
+ psrad xmm4, 16 ;(d1 * 2217 - c1 * 5352 + 51000)>>16
+
+ packssdw xmm3, xmm4 ;op[12] op[4]
+ movdqa xmm1, xmm0
+ paddw xmm3, xmm2 ;op[4] += (d1!=0)
+ punpcklqdq xmm0, xmm3 ;op[4] op[0]
+ punpckhqdq xmm1, xmm3 ;op[12] op[8]
+
+ movdqa XMMWORD PTR[rdi + 0], xmm0
+ movdqa XMMWORD PTR[rdi + 16], xmm1
- punpckhdq xmm4, xmm3
- movdqa xmm3, xmm2
-
- punpckldq xmm2, xmm4
- punpckhdq xmm3, xmm4
-
- paddd xmm2, xmm3
- paddd xmm2, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
-
- psrad xmm2, _2NDSTAGESHIFT
-
- ;second column
- movdqa xmm3, xmm0
- pmaddwd xmm3, [rdx+16]
-
- movdqa xmm4, xmm1
- pmaddwd xmm4, [rdx+16]
-
- movdqa xmm5, xmm3
- punpckldq xmm3, xmm4
-
- punpckhdq xmm5, xmm4
- movdqa xmm4, xmm3
-
- punpckldq xmm3, xmm5
- punpckhdq xmm4, xmm5
-
- paddd xmm3, xmm4
- paddd xmm3, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
-
- psrad xmm3, _2NDSTAGESHIFT
- packssdw xmm2, xmm3
-
- movdqu [rcx], xmm2
- ;third column
- movdqa xmm3, xmm0
- pmaddwd xmm3, [rdx+32]
-
- movdqa xmm4, xmm1
- pmaddwd xmm4, [rdx+32]
-
- movdqa xmm5, xmm3
- punpckldq xmm3, xmm4
-
- punpckhdq xmm5, xmm4
- movdqa xmm4, xmm3
-
- punpckldq xmm3, xmm5
- punpckhdq xmm4, xmm5
-
- paddd xmm3, xmm4
- paddd xmm3, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
-
- psrad xmm3, _2NDSTAGESHIFT
- ;fourth column
- pmaddwd xmm0, [rdx+48]
- pmaddwd xmm1, [rdx+48]
-
- movdqa xmm4, xmm0
- punpckldq xmm0, xmm1
-
- punpckhdq xmm4, xmm1
- movdqa xmm1, xmm0
-
- punpckldq xmm0, xmm4
- punpckhdq xmm1, xmm4
-
- paddd xmm0, xmm1
- paddd xmm0, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
-
- psrad xmm0, _2NDSTAGESHIFT
- packssdw xmm3, xmm0
-
- movdqu [rcx+16], xmm3
-
- mov rsp, rbp
; begin epilog
+ pop rdi
+ pop rsi
RESTORE_GOT
+;; RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
-
SECTION_RODATA
-;static unsigned int dct1st_stage_rounding_sse2[4] =
align 16
-dct1st_stage_rounding_sse2:
- times 4 dd 8192
-
-
-;static unsigned int dct2nd_stage_rounding_sse2[4] =
+_5352_2217:
+ dw 5352
+ dw 2217
+ dw 5352
+ dw 2217
+ dw 5352
+ dw 2217
+ dw 5352
+ dw 2217
align 16
-dct2nd_stage_rounding_sse2:
- times 4 dd 32768
-
-;static short dct_matrix_sse2[4][8]=
+_2217_neg5352:
+ dw 2217
+ dw -5352
+ dw 2217
+ dw -5352
+ dw 2217
+ dw -5352
+ dw 2217
+ dw -5352
align 16
-dct_matrix_sse2:
- times 8 dw 23170
-
- dw 30274
- dw 12540
- dw -12540
- dw -30274
- dw 30274
- dw 12540
- dw -12540
- dw -30274
-
- dw 23170
- times 2 dw -23170
- times 2 dw 23170
- times 2 dw -23170
- dw 23170
+_mult_add:
+ times 8 dw 1
+align 16
+_cmp_mask:
+ times 4 dw 1
+ times 4 dw 0
- dw 12540
- dw -30274
- dw 30274
- dw -12540
- dw 12540
- dw -30274
- dw 30274
- dw -12540
+align 16
+_mult_sub:
+ dw 1
+ dw -1
+ dw 1
+ dw -1
+ dw 1
+ dw -1
+ dw 1
+ dw -1
+align 16
+_7:
+ times 4 dd 7
+align 16
+_14500:
+ times 4 dd 14500
+align 16
+_7500:
+ times 4 dd 7500
+align 16
+_12000:
+ times 4 dd 12000
+align 16
+_51000:
+ times 4 dd 51000
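
The new vp8_short_fdct4x4_sse2 processes all four rows in registers at once. A scalar sketch of the transform it implements, reconstructed from the constants and comments above (the 5352/2217 rotation, the 14500/7500 and 12000/51000 rounding terms, the +7 >> 4 descale, and the op[4] += (d1 != 0) correction); treat it as illustrative rather than as the canonical C reference:

/* 4x4 forward DCT sketch matching the constants in the SSE2 kernel above. */
void short_fdct4x4_sketch(short *input, short *output, int pitch)
{
    int i, a1, b1, c1, d1;
    short *ip = input;
    short *op = output;

    for (i = 0; i < 4; i++) {                   /* horizontal pass */
        a1 = (ip[0] + ip[3]) << 3;
        b1 = (ip[1] + ip[2]) << 3;
        c1 = (ip[1] - ip[2]) << 3;
        d1 = (ip[0] - ip[3]) << 3;

        op[0] = (short)(a1 + b1);
        op[2] = (short)(a1 - b1);
        op[1] = (short)((c1 * 2217 + d1 * 5352 + 14500) >> 12);
        op[3] = (short)((d1 * 2217 - c1 * 5352 + 7500) >> 12);

        ip += pitch / 2;                        /* pitch is in bytes */
        op += 4;
    }

    ip = output;
    op = output;

    for (i = 0; i < 4; i++) {                   /* vertical pass */
        a1 = ip[0] + ip[12];
        b1 = ip[4] + ip[8];
        c1 = ip[4] - ip[8];
        d1 = ip[0] - ip[12];

        op[0]  = (short)((a1 + b1 + 7) >> 4);
        op[8]  = (short)((a1 - b1 + 7) >> 4);
        op[4]  = (short)(((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0));
        op[12] = (short)((d1 * 2217 - c1 * 5352 + 51000) >> 16);

        ip++;
        op++;
    }
}
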
diff --git a/vp8/encoder/x86/dct_x86.h b/vp8/encoder/x86/dct_x86.h
index bc80e64ef..05824c684 100644
--- a/vp8/encoder/x86/dct_x86.h
+++ b/vp8/encoder/x86/dct_x86.h
@@ -1,10 +1,11 @@
/*
- * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
- * Use of this source code is governed by a BSD-style license and patent
- * grant that can be found in the LICENSE file in the root of the source
- * tree. All contributing project authors may be found in the AUTHORS
- * file in the root of the source tree.
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
*/
@@ -21,46 +22,41 @@
#if HAVE_MMX
extern prototype_fdct(vp8_short_fdct4x4_mmx);
extern prototype_fdct(vp8_short_fdct8x4_mmx);
-extern prototype_fdct(vp8_fast_fdct4x4_mmx);
-extern prototype_fdct(vp8_fast_fdct8x4_mmx);
#if !CONFIG_RUNTIME_CPU_DETECT
+#if 0
#undef vp8_fdct_short4x4
#define vp8_fdct_short4x4 vp8_short_fdct4x4_mmx
#undef vp8_fdct_short8x4
#define vp8_fdct_short8x4 vp8_short_fdct8x4_mmx
-
-#undef vp8_fdct_fast4x4
-#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_mmx
-
-#undef vp8_fdct_fast8x4
-#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_mmx
+#endif
#endif
#endif
#if HAVE_SSE2
-extern prototype_fdct(vp8_short_fdct4x4_wmt);
extern prototype_fdct(vp8_short_fdct8x4_wmt);
-extern prototype_fdct(vp8_fast_fdct8x4_wmt);
-
extern prototype_fdct(vp8_short_walsh4x4_sse2);
-#if !CONFIG_RUNTIME_CPU_DETECT
+extern prototype_fdct(vp8_short_fdct4x4_sse2);
-#if 0
+#if !CONFIG_RUNTIME_CPU_DETECT
+#if 1
/* short SSE2 DCT currently disabled, does not match the MMX version */
#undef vp8_fdct_short4x4
-#define vp8_fdct_short4x4 vp8_short_fdct4x4_wmt
+#define vp8_fdct_short4x4 vp8_short_fdct4x4_sse2
#undef vp8_fdct_short8x4
-#define vp8_fdct_short8x4 vp8_short_fdct8x4_wmt
+#define vp8_fdct_short8x4 vp8_short_fdct8x4_sse2
#endif
+#undef vp8_fdct_fast4x4
+#define vp8_fdct_fast4x4 vp8_short_fdct4x4_sse2
+
#undef vp8_fdct_fast8x4
-#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_wmt
+#define vp8_fdct_fast8x4 vp8_short_fdct8x4_sse2
#undef vp8_fdct_walsh_short4x4
#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_sse2
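
These #undef/#define pairs are the compile-time half of the dispatch: with CONFIG_RUNTIME_CPU_DETECT off, call sites that use the generic macro name resolve directly to the SSE2 kernels. A sketch of how that name is declared and consumed; the prototype_fdct expansion and the default binding below are modeled on the surrounding encoder headers, not quoted from them:

/* Assumed shape of the generic fdct hook (modeled on vp8/encoder/dct.h). */
#define prototype_fdct(sym) void (sym)(short *input, short *output, int pitch)

#ifndef vp8_fdct_short4x4
#define vp8_fdct_short4x4 vp8_short_fdct4x4_c   /* portable default */
#endif
extern prototype_fdct(vp8_fdct_short4x4);

/* A call site writes vp8_fdct_short4x4(src_diff, coeff, pitch); after the
 * redefinition above it compiles straight to vp8_short_fdct4x4_sse2. */
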
diff --git a/vp8/encoder/x86/encodemb_x86.h b/vp8/encoder/x86/encodemb_x86.h
index 9397a6cca..69b3edd66 100644
--- a/vp8/encoder/x86/encodemb_x86.h
+++ b/vp8/encoder/x86/encodemb_x86.h
@@ -1,10 +1,11 @@
/*
- * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
- * Use of this source code is governed by a BSD-style license and patent
- * grant that can be found in the LICENSE file in the root of the source
- * tree. All contributing project authors may be found in the AUTHORS
- * file in the root of the source tree.
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
*/
@@ -54,7 +55,9 @@ extern prototype_submbuv(vp8_subtract_mbuv_mmx);
extern prototype_berr(vp8_block_error_xmm);
extern prototype_mberr(vp8_mbblock_error_xmm);
extern prototype_mbuverr(vp8_mbuverror_xmm);
-
+extern prototype_subb(vp8_subtract_b_sse2);
+extern prototype_submby(vp8_subtract_mby_sse2);
+extern prototype_submbuv(vp8_subtract_mbuv_sse2);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_encodemb_berr
@@ -66,6 +69,15 @@ extern prototype_mbuverr(vp8_mbuverror_xmm);
#undef vp8_encodemb_mbuverr
#define vp8_encodemb_mbuverr vp8_mbuverror_xmm
+#undef vp8_encodemb_subb
+#define vp8_encodemb_subb vp8_subtract_b_sse2
+
+#undef vp8_encodemb_submby
+#define vp8_encodemb_submby vp8_subtract_mby_sse2
+
+#undef vp8_encodemb_submbuv
+#define vp8_encodemb_submbuv vp8_subtract_mbuv_sse2
+
#endif
#endif
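
The new prototypes bind the SSE2 residual-subtraction kernels. What those kernels compute is simply diff = source - prediction over a block; a generic scalar sketch in pointer/stride form only (the real functions take BLOCK/BLOCKD and whole macroblock planes):

/* Residual subtraction sketch: diff[r][c] = src[r][c] - pred[r][c]. */
static void subtract_block_sketch(short *diff, const unsigned char *src,
                                  int src_stride, const unsigned char *pred,
                                  int pred_stride, int rows, int cols)
{
    int r, c;

    for (r = 0; r < rows; r++) {
        for (c = 0; c < cols; c++)
            diff[c] = (short)src[c] - (short)pred[c];

        diff += cols;
        src  += src_stride;
        pred += pred_stride;
    }
}
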
diff --git a/vp8/encoder/x86/encodeopt.asm b/vp8/encoder/x86/encodeopt.asm
index 194047155..c0f06bbbb 100644
--- a/vp8/encoder/x86/encodeopt.asm
+++ b/vp8/encoder/x86/encodeopt.asm
@@ -1,16 +1,16 @@
;
-; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
-
;int vp8_block_error_xmm(short *coeff_ptr, short *dcoef_ptr)
global sym(vp8_block_error_xmm)
sym(vp8_block_error_xmm):
@@ -19,11 +19,9 @@ sym(vp8_block_error_xmm):
SHADOW_ARGS_TO_STACK 2
push rsi
push rdi
- ; end prolog
-
+ ; end prologue
mov rsi, arg(0) ;coeff_ptr
- pxor xmm7, xmm7
mov rdi, arg(1) ;dcoef_ptr
movdqa xmm3, [rsi]
@@ -32,33 +30,27 @@ sym(vp8_block_error_xmm):
movdqa xmm5, [rsi+16]
movdqa xmm6, [rdi+16]
- pxor xmm1, xmm1 ; from movd xmm1, dc; dc=0
+ psubw xmm3, xmm4
- movdqa xmm2, xmm7
psubw xmm5, xmm6
-
- por xmm1, xmm2
+ pmaddwd xmm3, xmm3
pmaddwd xmm5, xmm5
- pcmpeqw xmm1, xmm7
- psubw xmm3, xmm4
+ paddd xmm3, xmm5
- pand xmm1, xmm3
- pmaddwd xmm1, xmm1
-
- paddd xmm1, xmm5
- movdqa xmm0, xmm1
+ pxor xmm7, xmm7
+ movdqa xmm0, xmm3
punpckldq xmm0, xmm7
- punpckhdq xmm1, xmm7
+ punpckhdq xmm3, xmm7
- paddd xmm0, xmm1
- movdqa xmm1, xmm0
+ paddd xmm0, xmm3
+ movdqa xmm3, xmm0
psrldq xmm0, 8
- paddd xmm0, xmm1
+ paddd xmm0, xmm3
- movd rax, xmm0
+ movq rax, xmm0
pop rdi
pop rsi
@@ -67,7 +59,6 @@ sym(vp8_block_error_xmm):
pop rbp
ret
-
;int vp8_block_error_mmx(short *coeff_ptr, short *dcoef_ptr)
global sym(vp8_block_error_mmx)
sym(vp8_block_error_mmx):
@@ -124,7 +115,7 @@ sym(vp8_block_error_mmx):
psrlq mm1, 32
paddd mm0, mm1
- movd rax, mm0
+ movq rax, mm0
pop rdi
pop rsi
@@ -201,7 +192,7 @@ mberror_loop_mmx:
psrlq mm2, 32
paddd mm0, mm2
- movd rax, mm0
+ movq rax, mm0
pop rdi
pop rsi
@@ -269,7 +260,7 @@ mberror_loop:
psrldq xmm0, 8
paddd xmm0, xmm1
- movd rax, xmm0
+ movq rax, xmm0
pop rdi
pop rsi
@@ -326,7 +317,7 @@ mbuverror_loop_mmx:
psrlq mm7, 32
paddd mm0, mm7
- movd rax, mm0
+ movq rax, mm0
pop rdi
pop rsi
@@ -383,7 +374,7 @@ mbuverror_loop:
psrldq xmm1, 8
paddd xmm1, xmm2
- movd rax, xmm1
+ movq rax, xmm1
pop rdi
pop rsi
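
The block-error rewrite above simplifies away the leftover dc-masking (the old comment notes dc=0), and the movd -> movq edits switch to the 64-bit form of the move into rax. What the kernel computes is the sum of squared differences between the original and dequantized coefficients of one 4x4 block; a scalar sketch:

/* Sum of squared coefficient differences for one 4x4 block (16 shorts). */
static int block_error_sketch(const short *coeff, const short *dqcoeff)
{
    int i, error = 0;

    for (i = 0; i < 16; i++) {
        int d = coeff[i] - dqcoeff[i];
        error += d * d;
    }
    return error;
}
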
diff --git a/vp8/encoder/x86/fwalsh_sse2.asm b/vp8/encoder/x86/fwalsh_sse2.asm
index 7d8620178..39439f0d8 100644
--- a/vp8/encoder/x86/fwalsh_sse2.asm
+++ b/vp8/encoder/x86/fwalsh_sse2.asm
@@ -1,10 +1,11 @@
;
-; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
;
@@ -16,102 +17,148 @@ sym(vp8_short_walsh4x4_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 3
+ SAVE_XMM
+ GET_GOT rbx
push rsi
push rdi
; end prolog
- mov rsi, arg(0)
- mov rdi, arg(1)
-
- movdqu xmm4, [rsi + 0] ;ip[4] ip[0]
- movdqu xmm0, [rsi + 16] ;ip[12] ip[8]
-
- pxor xmm7, xmm7
- ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ; 13 12 11 10 03 02 01 00
- ;
- ; 33 32 31 30 23 22 21 20
- ;
- movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00
- punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00
- punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10
- movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00
- punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00
- punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02
- ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
- movdqa xmm3, xmm4 ;ip[4] ip[0]
-
- paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
- psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
-
+ mov rsi, arg(0) ; input
+ mov rdi, arg(1) ; output
+ movsxd rdx, dword ptr arg(2) ; pitch
+
+ ; first for loop
+ movq xmm0, MMWORD PTR [rsi] ; load input
+ movq xmm1, MMWORD PTR [rsi + rdx]
+ lea rsi, [rsi + rdx*2]
+ movq xmm2, MMWORD PTR [rsi]
+ movq xmm3, MMWORD PTR [rsi + rdx]
+
+ punpcklwd xmm0, xmm1
+ punpcklwd xmm2, xmm3
+
+ movdqa xmm1, xmm0
+ punpckldq xmm0, xmm2 ; ip[1] ip[0]
+ punpckhdq xmm1, xmm2 ; ip[3] ip[2]
+
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1
+ psubw xmm2, xmm1
+
+ psllw xmm0, 2 ; d1 a1
+ psllw xmm2, 2 ; c1 b1
+
+ movdqa xmm1, xmm0
+ punpcklqdq xmm0, xmm2 ; b1 a1
+ punpckhqdq xmm1, xmm2 ; c1 d1
+
+ pxor xmm6, xmm6
+ movq xmm6, xmm0
+ pxor xmm7, xmm7
+ pcmpeqw xmm7, xmm6
+ paddw xmm7, [GLOBAL(c1)]
+
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1 ; b1+c1 a1+d1
+ psubw xmm2, xmm1 ; b1-c1 a1-d1
+ paddw xmm0, xmm7 ; b1+c1 a1+d1+(a1!=0)
+
+ ; second for loop
+ ; input: 13 9 5 1 12 8 4 0 (xmm0)
+ ; 14 10 6 2 15 11 7 3 (xmm2)
+ ; after shuffle:
+ ; 13 5 9 1 12 4 8 0 (xmm0)
+ ; 14 6 10 2 15 7 11 3 (xmm1)
+ pshuflw xmm3, xmm0, 0xd8
+ pshufhw xmm0, xmm3, 0xd8
+ pshuflw xmm3, xmm2, 0xd8
+ pshufhw xmm1, xmm3, 0xd8
+
+ movdqa xmm2, xmm0
+ pmaddwd xmm0, [GLOBAL(c1)] ; d11 a11 d10 a10
+ pmaddwd xmm2, [GLOBAL(cn1)] ; c11 b11 c10 b10
+ movdqa xmm3, xmm1
+ pmaddwd xmm1, [GLOBAL(c1)] ; d12 a12 d13 a13
+ pmaddwd xmm3, [GLOBAL(cn1)] ; c12 b12 c13 b13
+
+ pshufd xmm4, xmm0, 0xd8 ; d11 d10 a11 a10
+ pshufd xmm5, xmm2, 0xd8 ; c11 c10 b11 b10
+ pshufd xmm6, xmm1, 0x72 ; d13 d12 a13 a12
+ pshufd xmm7, xmm3, 0x72 ; c13 c12 b13 b12
+
+ movdqa xmm0, xmm4
+ punpcklqdq xmm0, xmm5 ; b11 b10 a11 a10
+ punpckhqdq xmm4, xmm5 ; c11 c10 d11 d10
+ movdqa xmm1, xmm6
+ punpcklqdq xmm1, xmm7 ; b13 b12 a13 a12
+ punpckhqdq xmm6, xmm7 ; c13 c12 d13 d12
+
+ movdqa xmm2, xmm0
+ paddd xmm0, xmm4 ; b21 b20 a21 a20
+ psubd xmm2, xmm4 ; c21 c20 d21 d20
+ movdqa xmm3, xmm1
+ paddd xmm1, xmm6 ; b23 b22 a23 a22
+ psubd xmm3, xmm6 ; c23 c22 d23 d22
+
+ pxor xmm4, xmm4
movdqa xmm5, xmm4
- punpcklqdq xmm4, xmm3 ;d1 a1
- punpckhqdq xmm5, xmm3 ;c1 b1
-
- movdqa xmm1, xmm5 ;c1 b1
- paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0]
- psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
- ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ; 13 12 11 10 03 02 01 00
- ;
- ; 33 32 31 30 23 22 21 20
- ;
- movdqa xmm0, xmm5 ; 13 12 11 10 03 02 01 00
- punpcklwd xmm5, xmm4 ; 23 03 22 02 21 01 20 00
- punpckhwd xmm0, xmm4 ; 33 13 32 12 31 11 30 10
- movdqa xmm1, xmm5 ; 23 03 22 02 21 01 20 00
- punpcklwd xmm5, xmm0 ; 31 21 11 01 30 20 10 00
- punpckhwd xmm1, xmm0 ; 33 23 13 03 32 22 12 02
- ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
- movdqa xmm3, xmm5 ;ip[4] ip[0]
-
- paddw xmm5, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
- psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
-
- movdqa xmm6, xmm5
- punpcklqdq xmm5, xmm3 ;d1 a1
- punpckhqdq xmm6, xmm3 ;c1 b1
-
- movdqa xmm1, xmm6 ;c1 b1
- paddw xmm6, xmm5 ;dl+cl a1+b1 aka op[4] op[0]
- psubw xmm5, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
-
- movdqa xmm0, xmm6 ;aka b2 a2
- movdqa xmm1, xmm5 ;aka d2 c2
-
- pcmpgtw xmm0, xmm7
- pcmpgtw xmm1, xmm7
-
- psrlw xmm0, 15
- psrlw xmm1, 15
-
- paddw xmm6, xmm0
- paddw xmm5, xmm1
-
- psraw xmm6, 1
- psraw xmm5, 1
-
- ; a2 = a1 + b1;
- ; b2 = c1 + d1;
- ; c2 = a1 - b1;
- ; d2 = d1 - c1;
- ; a2 += (a2>0);
- ; b2 += (b2>0);
- ; c2 += (c2>0);
- ; d2 += (d2>0);
- ; op[0] = (a2)>>1;
- ; op[4] = (b2)>>1;
- ; op[8] = (c2)>>1;
- ; op[12]= (d2)>>1;
-
- movdqu [rdi + 0], xmm6
- movdqu [rdi + 16], xmm5
+ pcmpgtd xmm4, xmm0
+ pcmpgtd xmm5, xmm2
+ pand xmm4, [GLOBAL(cd1)]
+ pand xmm5, [GLOBAL(cd1)]
+
+ pxor xmm6, xmm6
+ movdqa xmm7, xmm6
+ pcmpgtd xmm6, xmm1
+ pcmpgtd xmm7, xmm3
+ pand xmm6, [GLOBAL(cd1)]
+ pand xmm7, [GLOBAL(cd1)]
+
+ paddd xmm0, xmm4
+ paddd xmm2, xmm5
+ paddd xmm0, [GLOBAL(cd3)]
+ paddd xmm2, [GLOBAL(cd3)]
+ paddd xmm1, xmm6
+ paddd xmm3, xmm7
+ paddd xmm1, [GLOBAL(cd3)]
+ paddd xmm3, [GLOBAL(cd3)]
+
+ psrad xmm0, 3
+ psrad xmm1, 3
+ psrad xmm2, 3
+ psrad xmm3, 3
+ movdqa xmm4, xmm0
+ punpcklqdq xmm0, xmm1 ; a23 a22 a21 a20
+ punpckhqdq xmm4, xmm1 ; b23 b22 b21 b20
+ movdqa xmm5, xmm2
+ punpckhqdq xmm2, xmm3 ; c23 c22 c21 c20
+ punpcklqdq xmm5, xmm3 ; d23 d22 d21 d20
+
+ packssdw xmm0, xmm4 ; b23 b22 b21 b20 a23 a22 a21 a20
+ packssdw xmm2, xmm5 ; d23 d22 d21 d20 c23 c22 c21 c20
+
+ movdqa XMMWORD PTR [rdi], xmm0
+ movdqa XMMWORD PTR [rdi + 16], xmm2
; begin epilog
pop rdi
pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
+
+SECTION_RODATA
+align 16
+c1:
+ dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001
+align 16
+cn1:
+ dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff
+align 16
+cd1:
+ dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
+align 16
+cd3:
+ dd 0x00000003, 0x00000003, 0x00000003, 0x00000003
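
The rewritten vp8_short_walsh4x4_sse2 switches to the higher-precision form: inputs pre-scaled by 4 (psllw 2), the (a1 != 0) correction folded in via the pcmpeqw/paddw trick, +1 added to negative values, then an (x + 3) >> 3 descale. A scalar sketch modeled on the VP8 scalar Walsh-Hadamard transform those constants suggest; the lane ordering inside the SSE2 code differs, so treat the exact output indexing as illustrative:

/* 4x4 Walsh-Hadamard transform sketch (scalar form). */
void short_walsh4x4_sketch(short *input, short *output, int pitch)
{
    int i, a1, b1, c1, d1;
    int a2, b2, c2, d2;
    short *ip = input;
    short *op = output;

    for (i = 0; i < 4; i++) {                 /* horizontal pass */
        a1 = (ip[0] + ip[2]) << 2;
        d1 = (ip[1] + ip[3]) << 2;
        c1 = (ip[1] - ip[3]) << 2;
        b1 = (ip[0] - ip[2]) << 2;

        op[0] = (short)(a1 + d1 + (a1 != 0));
        op[1] = (short)(b1 + c1);
        op[2] = (short)(b1 - c1);
        op[3] = (short)(d1 - a1);

        ip += pitch / 2;
        op += 4;
    }

    ip = output;
    op = output;

    for (i = 0; i < 4; i++) {                 /* vertical pass */
        a1 = ip[0] + ip[8];
        d1 = ip[4] + ip[12];
        c1 = ip[4] - ip[12];
        b1 = ip[0] - ip[8];

        a2 = a1 + d1;
        b2 = b1 + c1;
        c2 = b1 - c1;
        d2 = a1 - d1;

        a2 += a2 < 0;                         /* bias negatives before >>3 */
        b2 += b2 < 0;
        c2 += c2 < 0;
        d2 += d2 < 0;

        op[0]  = (short)((a2 + 3) >> 3);
        op[4]  = (short)((b2 + 3) >> 3);
        op[8]  = (short)((c2 + 3) >> 3);
        op[12] = (short)((d2 + 3) >> 3);

        ip++;
        op++;
    }
}
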
diff --git a/vp8/encoder/x86/mcomp_x86.h b/vp8/encoder/x86/mcomp_x86.h
index 5661491ad..3b7b29c21 100644
--- a/vp8/encoder/x86/mcomp_x86.h
+++ b/vp8/encoder/x86/mcomp_x86.h
@@ -1,10 +1,11 @@
/*
- * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
- * Use of this source code is governed by a BSD-style license and patent
- * grant that can be found in the LICENSE file in the root of the source
- * tree. All contributing project authors may be found in the AUTHORS
- * file in the root of the source tree.
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
*/
@@ -23,5 +24,14 @@
#endif
#endif
+#if HAVE_SSE4_1
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef vp8_search_full_search
+#define vp8_search_full_search vp8_full_search_sadx8
+
+#endif
+#endif
+
#endif
diff --git a/vp8/encoder/x86/preproc_mmx.c b/vp8/encoder/x86/preproc_mmx.c
index 69617ca47..a182c8856 100644
--- a/vp8/encoder/x86/preproc_mmx.c
+++ b/vp8/encoder/x86/preproc_mmx.c
@@ -1,10 +1,11 @@
/*
- * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
- * Use of this source code is governed by a BSD-style license and patent
- * grant that can be found in the LICENSE file in the root of the source
- * tree. All contributing project authors may be found in the AUTHORS
- * file in the root of the source tree.
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
*/
diff --git a/vp8/encoder/x86/quantize_mmx.asm b/vp8/encoder/x86/quantize_mmx.asm
index 847fc6e37..f29a54ecd 100644
--- a/vp8/encoder/x86/quantize_mmx.asm
+++ b/vp8/encoder/x86/quantize_mmx.asm
@@ -1,10 +1,11 @@
;
-; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
;
@@ -248,7 +249,7 @@ sym(vp8_fast_quantize_b_impl_mmx):
paddd mm0, mm5
; eob adjustment begins here
- movd rcx, mm0
+ movq rcx, mm0
and rcx, 0xffff
xor rdx, rdx
@@ -261,7 +262,7 @@ sym(vp8_fast_quantize_b_impl_mmx):
and rax, rdx
; Substitute the sse assembly for the old mmx mixed assembly/C. The
; following is kept as reference
- ; movd rcx, mm0
+ ; movq rcx, mm0
; bsr rax, rcx
;
; mov eob, rax
@@ -283,156 +284,3 @@ sym(vp8_fast_quantize_b_impl_mmx):
UNSHADOW_ARGS
pop rbp
ret
-
-
-;int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr,
-; short *qcoeff_ptr,short *dequant_ptr,
-; short *scan_mask, short *round_ptr,
-; short *quant_ptr, short *dqcoeff_ptr);
-global sym(vp8_fast_quantize_b_impl_sse)
-sym(vp8_fast_quantize_b_impl_sse):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- push rsi
- push rdi
- ; end prolog
-
-
- mov rsi, arg(0) ;coeff_ptr
- movdqa xmm0, [rsi]
-
- mov rax, arg(1) ;zbin_ptr
- movdqa xmm1, [rax]
-
- movdqa xmm3, xmm0
- psraw xmm0, 15
-
- pxor xmm3, xmm0
- psubw xmm3, xmm0 ; abs
-
- movdqa xmm2, xmm3
- pcmpgtw xmm1, xmm2
-
- pandn xmm1, xmm2
- movdqa xmm3, xmm1
-
- mov rdx, arg(6) ; quant_ptr
- movdqa xmm1, [rdx]
-
- mov rcx, arg(5) ; round_ptr
- movdqa xmm2, [rcx]
-
- paddw xmm3, xmm2
- pmulhuw xmm3, xmm1
-
- pxor xmm3, xmm0
- psubw xmm3, xmm0 ;gain the sign back
-
- mov rdi, arg(2) ;qcoeff_ptr
- movdqa xmm0, xmm3
-
- movdqa [rdi], xmm3
-
- mov rax, arg(3) ;dequant_ptr
- movdqa xmm2, [rax]
-
- pmullw xmm3, xmm2
- mov rax, arg(7) ;dqcoeff_ptr
-
- movdqa [rax], xmm3
-
- ; next 8
- movdqa xmm4, [rsi+16]
-
- mov rax, arg(1) ;zbin_ptr
- movdqa xmm5, [rax+16]
-
- movdqa xmm7, xmm4
- psraw xmm4, 15
-
- pxor xmm7, xmm4
- psubw xmm7, xmm4 ; abs
-
- movdqa xmm6, xmm7
- pcmpgtw xmm5, xmm6
-
- pandn xmm5, xmm6
- movdqa xmm7, xmm5
-
- movdqa xmm5, [rdx+16]
- movdqa xmm6, [rcx+16]
-
-
- paddw xmm7, xmm6
- pmulhuw xmm7, xmm5
-
- pxor xmm7, xmm4
- psubw xmm7, xmm4;gain the sign back
-
- mov rdi, arg(2) ;qcoeff_ptr
-
- movdqa xmm1, xmm7
- movdqa [rdi+16], xmm7
-
- mov rax, arg(3) ;dequant_ptr
- movdqa xmm6, [rax+16]
-
- pmullw xmm7, xmm6
- mov rax, arg(7) ;dqcoeff_ptr
-
- movdqa [rax+16], xmm7
- mov rdi, arg(4) ;scan_mask
-
- pxor xmm7, xmm7
- movdqa xmm2, [rdi]
-
- movdqa xmm3, [rdi+16];
- pcmpeqw xmm0, xmm7
-
- pcmpeqw xmm1, xmm7
- pcmpeqw xmm6, xmm6
-
- pxor xmm0, xmm6
- pxor xmm1, xmm6
-
- psrlw xmm0, 15
- psrlw xmm1, 15
-
- pmaddwd xmm0, xmm2
- pmaddwd xmm1, xmm3
-
- movq xmm2, xmm0
- movq xmm3, xmm1
-
- psrldq xmm0, 8
- psrldq xmm1, 8
-
- paddd xmm0, xmm1
- paddd xmm2, xmm3
-
- paddd xmm0, xmm2
- movq xmm1, xmm0
-
- psrldq xmm0, 4
- paddd xmm1, xmm0
-
- movd rcx, xmm1
- and rcx, 0xffff
-
- xor rdx, rdx
- sub rdx, rcx
-
- bsr rax, rcx
- inc rax
-
- sar rdx, 31
- and rax, rdx
-
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
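
The removed vp8_fast_quantize_b_impl_sse reappears below as an SSE2 version in the new quantize_sse2.asm; the remaining movd -> movq edits again just widen the result moves to 64-bit registers. The underlying "fast" quantizer is the same in all variants; a scalar sketch in plain pointer form (the real entry points take BLOCK/BLOCKD, and the vectorized eob search uses a scan mask plus bsr instead of this loop):

/* Fast quantize sketch for one 4x4 block: quantize magnitudes, restore the
 * sign, dequantize, and return the end-of-block count in zig-zag order. */
static int fast_quantize_sketch(const short *coeff, const short *round,
                                const short *quant, const short *dequant,
                                const int *zig_zag, short *qcoeff,
                                short *dqcoeff)
{
    int i, eob = -1;

    for (i = 0; i < 16; i++) {
        int rc = zig_zag[i];
        int z  = coeff[rc];
        int sz = z >> 31;                       /* 0 or -1           */
        int x  = (z ^ sz) - sz;                 /* abs(z)            */
        int y  = ((x + round[rc]) * quant[rc]) >> 16;

        x = (y ^ sz) - sz;                      /* re-apply the sign */
        qcoeff[rc]  = (short)x;
        dqcoeff[rc] = (short)(x * dequant[rc]);

        if (y)
            eob = i;                            /* last nonzero seen */
    }
    return eob + 1;
}
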
diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm
new file mode 100644
index 000000000..1e0bd5c48
--- /dev/null
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -0,0 +1,388 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr,
+; short *qcoeff_ptr,short *dequant_ptr,
+; const int *default_zig_zag, short *round_ptr,
+; short *quant_ptr, short *dqcoeff_ptr,
+; unsigned short zbin_oq_value,
+; short *zbin_boost_ptr);
+;
+global sym(vp8_regular_quantize_b_impl_sse2)
+sym(vp8_regular_quantize_b_impl_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 10
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+
+ %define abs_minus_zbin_lo 0
+ %define abs_minus_zbin_hi 16
+ %define temp_qcoeff_lo 32
+ %define temp_qcoeff_hi 48
+ %define save_xmm6 64
+ %define save_xmm7 80
+ %define eob 96
+
+ %define vp8_regularquantizeb_stack_size eob + 16
+
+ sub rsp, vp8_regularquantizeb_stack_size
+
+ movdqa OWORD PTR[rsp + save_xmm6], xmm6
+ movdqa OWORD PTR[rsp + save_xmm7], xmm7
+
+ mov rdx, arg(0) ;coeff_ptr
+ mov eax, arg(8) ;zbin_oq_value
+
+ mov rcx, arg(1) ;zbin_ptr
+ movd xmm7, eax
+
+ movdqa xmm0, OWORD PTR[rdx]
+ movdqa xmm4, OWORD PTR[rdx + 16]
+
+ movdqa xmm1, xmm0
+ movdqa xmm5, xmm4
+
+ psraw xmm0, 15 ;sign of z (aka sz)
+ psraw xmm4, 15 ;sign of z (aka sz)
+
+ pxor xmm1, xmm0
+ pxor xmm5, xmm4
+
+ movdqa xmm2, OWORD PTR[rcx] ;load zbin_ptr
+ movdqa xmm3, OWORD PTR[rcx + 16] ;load zbin_ptr
+
+ pshuflw xmm7, xmm7, 0
+ psubw xmm1, xmm0 ;x = abs(z)
+
+ punpcklwd xmm7, xmm7 ;duplicated zbin_oq_value
+ psubw xmm5, xmm4 ;x = abs(z)
+
+ paddw xmm2, xmm7
+ paddw xmm3, xmm7
+
+ psubw xmm1, xmm2 ;sub (zbin_ptr + zbin_oq_value)
+ psubw xmm5, xmm3 ;sub (zbin_ptr + zbin_oq_value)
+
+ mov rdi, arg(5) ;round_ptr
+ mov rsi, arg(6) ;quant_ptr
+
+ movdqa OWORD PTR[rsp + abs_minus_zbin_lo], xmm1
+ movdqa OWORD PTR[rsp + abs_minus_zbin_hi], xmm5
+
+ paddw xmm1, xmm2 ;add (zbin_ptr + zbin_oq_value) back
+ paddw xmm5, xmm3 ;add (zbin_ptr + zbin_oq_value) back
+
+ movdqa xmm2, OWORD PTR[rdi]
+ movdqa xmm3, OWORD PTR[rsi]
+
+ movdqa xmm6, OWORD PTR[rdi + 16]
+ movdqa xmm7, OWORD PTR[rsi + 16]
+
+ paddw xmm1, xmm2
+ paddw xmm5, xmm6
+
+ pmulhw xmm1, xmm3
+ pmulhw xmm5, xmm7
+
+ mov rsi, arg(2) ;qcoeff_ptr
+ pxor xmm6, xmm6
+
+ pxor xmm1, xmm0
+ pxor xmm5, xmm4
+
+ psubw xmm1, xmm0
+ psubw xmm5, xmm4
+
+ movdqa OWORD PTR[rsp + temp_qcoeff_lo], xmm1
+ movdqa OWORD PTR[rsp + temp_qcoeff_hi], xmm5
+
+ movdqa OWORD PTR[rsi], xmm6 ;zero qcoeff
+ movdqa OWORD PTR[rsi + 16], xmm6 ;zero qcoeff
+
+ xor rax, rax
+ mov rcx, -1
+
+ mov [rsp + eob], rcx
+ mov rsi, arg(9) ;zbin_boost_ptr
+
+ mov rbx, arg(4) ;default_zig_zag
+
+rq_zigzag_loop:
+ movsxd rcx, DWORD PTR[rbx + rax*4] ;now we have rc
+ movsx edi, WORD PTR [rsi] ;*zbin_boost_ptr aka zbin
+ lea rsi, [rsi + 2] ;zbin_boost_ptr++
+
+ movsx edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
+
+ sub edx, edi ;x - zbin
+ jl rq_zigzag_1
+
+ mov rdi, arg(2) ;qcoeff_ptr
+
+ movsx edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]
+
+ cmp edx, 0
+ je rq_zigzag_1
+
+ mov WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]
+
+ mov rsi, arg(9) ;zbin_boost_ptr
+ mov [rsp + eob], rax ;eob = i
+
+rq_zigzag_1:
+ movsxd rcx, DWORD PTR[rbx + rax*4 + 4]
+ movsx edi, WORD PTR [rsi] ;*zbin_boost_ptr aka zbin
+ lea rsi, [rsi + 2] ;zbin_boost_ptr++
+
+ movsx edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
+ lea rax, [rax + 1]
+
+ sub edx, edi ;x - zbin
+ jl rq_zigzag_1a
+
+ mov rdi, arg(2) ;qcoeff_ptr
+
+ movsx edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]
+
+ cmp edx, 0
+ je rq_zigzag_1a
+
+ mov WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]
+
+ mov rsi, arg(9) ;zbin_boost_ptr
+ mov [rsp + eob], rax ;eob = i
+
+rq_zigzag_1a:
+ movsxd rcx, DWORD PTR[rbx + rax*4 + 4]
+ movsx edi, WORD PTR [rsi] ;*zbin_boost_ptr aka zbin
+ lea rsi, [rsi + 2] ;zbin_boost_ptr++
+
+ movsx edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
+ lea rax, [rax + 1]
+
+ sub edx, edi ;x - zbin
+ jl rq_zigzag_1b
+
+ mov rdi, arg(2) ;qcoeff_ptr
+
+ movsx edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]
+
+ cmp edx, 0
+ je rq_zigzag_1b
+
+ mov WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]
+
+ mov rsi, arg(9) ;zbin_boost_ptr
+ mov [rsp + eob], rax ;eob = i
+
+rq_zigzag_1b:
+ movsxd rcx, DWORD PTR[rbx + rax*4 + 4]
+ movsx edi, WORD PTR [rsi] ;*zbin_boost_ptr aka zbin
+ lea rsi, [rsi + 2] ;zbin_boost_ptr++
+
+ movsx edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
+ lea rax, [rax + 1]
+
+ sub edx, edi ;x - zbin
+ jl rq_zigzag_1c
+
+ mov rdi, arg(2) ;qcoeff_ptr
+
+ movsx edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]
+
+ cmp edx, 0
+ je rq_zigzag_1c
+
+ mov WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]
+
+ mov rsi, arg(9) ;zbin_boost_ptr
+ mov [rsp + eob], rax ;eob = i
+
+rq_zigzag_1c:
+ lea rax, [rax + 1]
+
+ cmp rax, 16
+ jl rq_zigzag_loop
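+
+    ; Editor's note (illustrative, not part of the original patch): for each
+    ; coefficient the unrolled loop above does, in rough C terms,
+    ;     rc = default_zig_zag[i];
+    ;     if (abs_minus_zbin[rc] >= *zbin_boost_ptr++ && temp_qcoeff[rc] != 0) {
+    ;         qcoeff_ptr[rc] = temp_qcoeff[rc];
+    ;         eob = i;                           /* last written scan position */
+    ;         zbin_boost_ptr = zbin_boost_start; /* reset the boost after a hit */
+    ;     }
+    ; Names not taken from the comments above (e.g. zbin_boost_start) are
+    ; assumptions for illustration.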
+
+ mov rdi, arg(2) ;qcoeff_ptr
+ mov rcx, arg(3) ;dequant_ptr
+ mov rsi, arg(7) ;dqcoeff_ptr
+
+ movdqa xmm2, OWORD PTR[rdi]
+ movdqa xmm3, OWORD PTR[rdi + 16]
+
+ movdqa xmm0, OWORD PTR[rcx]
+ movdqa xmm1, OWORD PTR[rcx + 16]
+
+ pmullw xmm0, xmm2
+ pmullw xmm1, xmm3
+
+ movdqa OWORD PTR[rsi], xmm0 ;store dqcoeff
+ movdqa OWORD PTR[rsi + 16], xmm1 ;store dqcoeff
+
+ mov rax, [rsp + eob]
+
+ movdqa xmm6, OWORD PTR[rsp + save_xmm6]
+ movdqa xmm7, OWORD PTR[rsp + save_xmm7]
+
+ add rax, 1
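+    ; rax = eob + 1: the count of coefficients up to and including the last
+    ; one kept (0 if none survived), returned to the caller in rax.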
+
+ add rsp, vp8_regularquantizeb_stack_size
+ pop rsp
+
+ ; begin epilog
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
+;                           short *qcoeff_ptr, short *dequant_ptr,
+; short *scan_mask, short *round_ptr,
+; short *quant_ptr, short *dqcoeff_ptr);
+global sym(vp8_fast_quantize_b_impl_sse2)
+sym(vp8_fast_quantize_b_impl_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+
+ %define save_xmm6 0
+ %define save_xmm7 16
+
+ %define vp8_fastquantizeb_stack_size save_xmm7 + 16
+
+ sub rsp, vp8_fastquantizeb_stack_size
+
+ movdqa XMMWORD PTR[rsp + save_xmm6], xmm6
+ movdqa XMMWORD PTR[rsp + save_xmm7], xmm7
+
+ mov rdx, arg(0) ;coeff_ptr
+ mov rcx, arg(2) ;dequant_ptr
+ mov rax, arg(3) ;scan_mask
+ mov rdi, arg(4) ;round_ptr
+ mov rsi, arg(5) ;quant_ptr
+
+ movdqa xmm0, XMMWORD PTR[rdx]
+ movdqa xmm4, XMMWORD PTR[rdx + 16]
+
+ movdqa xmm6, XMMWORD PTR[rdi] ;round lo
+ movdqa xmm7, XMMWORD PTR[rdi + 16] ;round hi
+
+ movdqa xmm1, xmm0
+ movdqa xmm5, xmm4
+
+ psraw xmm0, 15 ;sign of z (aka sz)
+ psraw xmm4, 15 ;sign of z (aka sz)
+
+ pxor xmm1, xmm0
+ pxor xmm5, xmm4
+ psubw xmm1, xmm0 ;x = abs(z)
+ psubw xmm5, xmm4 ;x = abs(z)
+
+ paddw xmm1, xmm6
+ paddw xmm5, xmm7
+
+ pmulhw xmm1, XMMWORD PTR[rsi]
+ pmulhw xmm5, XMMWORD PTR[rsi + 16]
+
+ mov rdi, arg(1) ;qcoeff_ptr
+ mov rsi, arg(6) ;dqcoeff_ptr
+
+ movdqa xmm6, XMMWORD PTR[rcx]
+ movdqa xmm7, XMMWORD PTR[rcx + 16]
+
+ pxor xmm1, xmm0
+ pxor xmm5, xmm4
+ psubw xmm1, xmm0
+ psubw xmm5, xmm4
+
+ movdqa XMMWORD PTR[rdi], xmm1
+ movdqa XMMWORD PTR[rdi + 16], xmm5
+
+ pmullw xmm6, xmm1
+ pmullw xmm7, xmm5
+
+ movdqa xmm2, XMMWORD PTR[rax]
+ movdqa xmm3, XMMWORD PTR[rax+16];
+
+ pxor xmm4, xmm4 ;clear all bits
+ pcmpeqw xmm1, xmm4
+ pcmpeqw xmm5, xmm4
+
+ pcmpeqw xmm4, xmm4 ;set all bits
+ pxor xmm1, xmm4
+ pxor xmm5, xmm4
+
+ psrlw xmm1, 15
+ psrlw xmm5, 15
+
+ pmaddwd xmm1, xmm2
+ pmaddwd xmm5, xmm3
+
+ movq xmm2, xmm1
+ movq xmm3, xmm5
+
+ psrldq xmm1, 8
+ psrldq xmm5, 8
+
+ paddd xmm1, xmm5
+ paddd xmm2, xmm3
+
+ paddd xmm1, xmm2
+ movq xmm5, xmm1
+
+ psrldq xmm1, 4
+ paddd xmm5, xmm1
+
+ movq rcx, xmm5
+ and rcx, 0xffff
+
+ xor rdx, rdx
+ sub rdx, rcx
+
+ bsr rax, rcx
+ inc rax
+
+ sar rdx, 31
+ and rax, rdx
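+
+    ; Editor's note (illustrative, not part of the original patch): the code
+    ; above reduces each coefficient to a 0/1 nonzero flag, weights it with
+    ; the matching scan_mask entry via pmaddwd and sums everything into rcx.
+    ; Assuming scan_mask holds (1 << zig_zag_position) per coefficient, rcx is
+    ; a bitmask of occupied scan positions and the bsr/sar/and sequence is a
+    ; branch-free form of
+    ;     eob = mask ? bsr(mask) + 1 : 0;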
+
+ movdqa XMMWORD PTR[rsi], xmm6 ;store dqcoeff
+ movdqa XMMWORD PTR[rsi + 16], xmm7 ;store dqcoeff
+
+ movdqa xmm6, XMMWORD PTR[rsp + save_xmm6]
+ movdqa xmm7, XMMWORD PTR[rsp + save_xmm7]
+
+ add rsp, vp8_fastquantizeb_stack_size
+ pop rsp
+
+ ; begin epilog
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/vp8/encoder/x86/quantize_ssse3.asm b/vp8/encoder/x86/quantize_ssse3.asm
new file mode 100755
index 000000000..2f33199e5
--- /dev/null
+++ b/vp8/encoder/x86/quantize_ssse3.asm
@@ -0,0 +1,114 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;int vp8_fast_quantize_b_impl_ssse3(short *coeff_ptr,
+;                           short *qcoeff_ptr, short *dequant_ptr,
+; short *round_ptr,
+; short *quant_ptr, short *dqcoeff_ptr);
+;
+global sym(vp8_fast_quantize_b_impl_ssse3)
+sym(vp8_fast_quantize_b_impl_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(0) ;coeff_ptr
+ mov rdi, arg(3) ;round_ptr
+ mov rsi, arg(4) ;quant_ptr
+
+ movdqa xmm0, [rdx]
+ movdqa xmm4, [rdx + 16]
+
+ movdqa xmm2, [rdi] ;round lo
+ movdqa xmm3, [rdi + 16] ;round hi
+
+ movdqa xmm1, xmm0
+ movdqa xmm5, xmm4
+
+ psraw xmm0, 15 ;sign of z (aka sz)
+ psraw xmm4, 15 ;sign of z (aka sz)
+
+ pabsw xmm1, xmm1
+ pabsw xmm5, xmm5
+
+ paddw xmm1, xmm2
+ paddw xmm5, xmm3
+
+ pmulhw xmm1, [rsi]
+ pmulhw xmm5, [rsi + 16]
+
+ mov rdi, arg(1) ;qcoeff_ptr
+ mov rcx, arg(2) ;dequant_ptr
+ mov rsi, arg(5) ;dqcoeff_ptr
+
+ pxor xmm1, xmm0
+ pxor xmm5, xmm4
+ psubw xmm1, xmm0
+ psubw xmm5, xmm4
+
+ movdqa [rdi], xmm1
+ movdqa [rdi + 16], xmm5
+
+ movdqa xmm2, [rcx]
+ movdqa xmm3, [rcx + 16]
+
+ pxor xmm4, xmm4
+ pmullw xmm2, xmm1
+ pmullw xmm3, xmm5
+
+    pcmpeqw xmm1, xmm4                  ;FFFF where coeff == 0 (inverted below)
+    pcmpeqw xmm5, xmm4                  ;FFFF where coeff == 0 (inverted below)
+ packsswb xmm1, xmm5
+ pshufb xmm1, [ GLOBAL(zz_shuf)]
+
+ pmovmskb edx, xmm1
+
+; xor ecx, ecx
+; mov eax, -1
+;find_eob_loop:
+; shr edx, 1
+; jc fq_skip
+; mov eax, ecx
+;fq_skip:
+; inc ecx
+; cmp ecx, 16
+; jne find_eob_loop
+ xor rdi, rdi
+ mov eax, -1
+ xor dx, ax ;flip the bits for bsr
+ bsr eax, edx
+
+ movdqa [rsi], xmm2 ;store dqcoeff
+ movdqa [rsi + 16], xmm3 ;store dqcoeff
+
+ sub edi, edx ;check for all zeros in bit mask
+ sar edi, 31 ;0 or -1
+ add eax, 1
+ and eax, edi ;if the bit mask was all zero,
+ ;then eob = 0
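+
+    ; Editor's note (illustrative, not part of the original patch): pshufb
+    ; with zz_shuf reorders the per-coefficient zero flags into zig-zag scan
+    ; order, so after pmovmskb and the bit flip edx has one bit set per
+    ; nonzero coefficient in scan order.  The bsr/sar/and sequence above is
+    ; then a branch-free replacement for the commented-out search loop:
+    ;     eob = mask ? bsr(mask) + 1 : 0;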
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+zz_shuf:
+ db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
diff --git a/vp8/encoder/x86/quantize_x86.h b/vp8/encoder/x86/quantize_x86.h
new file mode 100644
index 000000000..b5b22c022
--- /dev/null
+++ b/vp8/encoder/x86/quantize_x86.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+#ifndef QUANTIZE_X86_H
+#define QUANTIZE_X86_H
+
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code
+ */
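+
+/* Editor's sketch (not part of the original patch): when runtime CPU
+ * detection is enabled, the mappings below are normally mirrored by an
+ * assignment in the x86 function pointer initialization code, roughly:
+ *
+ *     if (flags & HAS_SSE2)
+ *         cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2;
+ *
+ * The structure and field names in this sketch are assumptions for
+ * illustration only.
+ */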
+#if HAVE_MMX
+
+#endif
+
+
+#if HAVE_SSE2
+extern prototype_quantize_block(vp8_regular_quantize_b_sse2);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+/* The sse2 quantizer has not been updated to match the new exact
+ * quantizer introduced in commit e04e2935
+ *#undef vp8_quantize_quantb
+ *#define vp8_quantize_quantb vp8_regular_quantize_b_sse2
+ */
+
+#endif
+
+#endif
+
+
+#endif
diff --git a/vp8/encoder/x86/sad_mmx.asm b/vp8/encoder/x86/sad_mmx.asm
index a825698e7..85cb023a4 100644
--- a/vp8/encoder/x86/sad_mmx.asm
+++ b/vp8/encoder/x86/sad_mmx.asm
@@ -1,10 +1,11 @@
;
-; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
;
@@ -16,8 +17,6 @@ global sym(vp8_sad8x8_mmx)
global sym(vp8_sad4x4_mmx)
global sym(vp8_sad16x8_mmx)
-%idefine QWORD
-
;unsigned int vp8_sad16x16_mmx(
; unsigned char *src_ptr,
; int src_stride,
@@ -99,7 +98,7 @@ x16x16sad_mmx_loop:
psrlq mm0, 32
paddw mm7, mm0
- movd rax, mm7
+ movq rax, mm7
pop rdi
pop rsi
@@ -171,7 +170,7 @@ x8x16sad_mmx_loop:
psrlq mm0, 32
paddw mm7, mm0
- movd rax, mm7
+ movq rax, mm7
pop rdi
pop rsi
@@ -241,7 +240,7 @@ x8x8sad_mmx_loop:
psrlq mm0, 32
paddw mm7, mm0
- movd rax, mm7
+ movq rax, mm7
pop rdi
pop rsi
@@ -271,11 +270,11 @@ sym(vp8_sad4x4_mmx):
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
- movd mm0, QWORD PTR [rsi]
- movd mm1, QWORD PTR [rdi]
+ movd mm0, DWORD PTR [rsi]
+ movd mm1, DWORD PTR [rdi]
- movd mm2, QWORD PTR [rsi+rax]
- movd mm3, QWORD PTR [rdi+rdx]
+ movd mm2, DWORD PTR [rsi+rax]
+ movd mm3, DWORD PTR [rdi+rdx]
punpcklbw mm0, mm2
punpcklbw mm1, mm3
@@ -297,11 +296,11 @@ sym(vp8_sad4x4_mmx):
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rdx*2]
- movd mm4, QWORD PTR [rsi]
- movd mm5, QWORD PTR [rdi]
+ movd mm4, DWORD PTR [rsi]
+ movd mm5, DWORD PTR [rdi]
- movd mm6, QWORD PTR [rsi+rax]
- movd mm7, QWORD PTR [rdi+rdx]
+ movd mm6, DWORD PTR [rsi+rax]
+ movd mm7, DWORD PTR [rdi+rdx]
punpcklbw mm4, mm6
punpcklbw mm5, mm7
@@ -330,7 +329,7 @@ sym(vp8_sad4x4_mmx):
psrlq mm0, 32
paddw mm0, mm1
- movd rax, mm0
+ movq rax, mm0
pop rdi
pop rsi
@@ -417,7 +416,7 @@ x16x8sad_mmx_loop:
psrlq mm0, 32
paddw mm7, mm0
- movd rax, mm7
+ movq rax, mm7
pop rdi
pop rsi
diff --git a/vp8/encoder/x86/sad_sse2.asm b/vp8/encoder/x86/sad_sse2.asm
index 53240bbf1..39ed79604 100644
--- a/vp8/encoder/x86/sad_sse2.asm
+++ b/vp8/encoder/x86/sad_sse2.asm
@@ -1,17 +1,16 @@
;
-; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
-%idefine QWORD
-
;unsigned int vp8_sad16x16_wmt(
; unsigned char *src_ptr,
; int src_stride,
@@ -74,7 +73,7 @@ x16x16sad_wmt_loop:
psrldq xmm7, 8
paddw xmm0, xmm7
- movd rax, xmm0
+ movq rax, xmm0
; begin epilog
pop rdi
@@ -112,7 +111,7 @@ sym(vp8_sad8x16_wmt):
x8x16sad_wmt_loop:
- movd rax, mm7
+ movq rax, mm7
cmp rax, arg(4)
jg x8x16sad_wmt_early_exit
@@ -134,7 +133,7 @@ x8x16sad_wmt_loop:
cmp rsi, rcx
jne x8x16sad_wmt_loop
- movd rax, mm7
+ movq rax, mm7
x8x16sad_wmt_early_exit:
@@ -173,7 +172,7 @@ sym(vp8_sad8x8_wmt):
x8x8sad_wmt_loop:
- movd rax, mm7
+ movq rax, mm7
cmp rax, arg(4)
jg x8x8sad_wmt_early_exit
@@ -189,7 +188,7 @@ x8x8sad_wmt_loop:
cmp rsi, rcx
jne x8x8sad_wmt_loop
- movd rax, mm7
+ movq rax, mm7
x8x8sad_wmt_early_exit:
; begin epilog
@@ -220,11 +219,11 @@ sym(vp8_sad4x4_wmt):
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
- movd mm0, QWORD PTR [rsi]
- movd mm1, QWORD PTR [rdi]
+ movd mm0, DWORD PTR [rsi]
+ movd mm1, DWORD PTR [rdi]
- movd mm2, QWORD PTR [rsi+rax]
- movd mm3, QWORD PTR [rdi+rdx]
+ movd mm2, DWORD PTR [rsi+rax]
+ movd mm3, DWORD PTR [rdi+rdx]
punpcklbw mm0, mm2
punpcklbw mm1, mm3
@@ -233,19 +232,19 @@ sym(vp8_sad4x4_wmt):
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rdx*2]
- movd mm4, QWORD PTR [rsi]
+ movd mm4, DWORD PTR [rsi]
- movd mm5, QWORD PTR [rdi]
- movd mm6, QWORD PTR [rsi+rax]
+ movd mm5, DWORD PTR [rdi]
+ movd mm6, DWORD PTR [rsi+rax]
- movd mm7, QWORD PTR [rdi+rdx]
+ movd mm7, DWORD PTR [rdi+rdx]
punpcklbw mm4, mm6
punpcklbw mm5, mm7
psadbw mm4, mm5
paddw mm0, mm4
- movd rax, mm0
+ movq rax, mm0
; begin epilog
pop rdi
@@ -282,7 +281,7 @@ sym(vp8_sad16x8_wmt):
x16x8sad_wmt_loop:
- movd rax, mm7
+ movq rax, mm7
cmp rax, arg(4)
jg x16x8sad_wmt_early_exit
@@ -316,7 +315,7 @@ x16x8sad_wmt_loop:
cmp rsi, rcx
jne x16x8sad_wmt_loop
- movd rax, mm7
+ movq rax, mm7
x16x8sad_wmt_early_exit:
diff --git a/vp8/encoder/x86/sad_sse3.asm b/vp8/encoder/x86/sad_sse3.asm
index 38cc02957..1b7293c20 100644
--- a/vp8/encoder/x86/sad_sse3.asm
+++ b/vp8/encoder/x86/sad_sse3.asm
@@ -1,32 +1,31 @@
;
-; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
-%idefine QWORD
-
%macro PROCESS_16X2X3 1
%if %1
- movdqa xmm0, [rsi]
- lddqu xmm5, [rdi]
- lddqu xmm6, [rdi+1]
- lddqu xmm7, [rdi+2]
+ movdqa xmm0, XMMWORD PTR [rsi]
+ lddqu xmm5, XMMWORD PTR [rdi]
+ lddqu xmm6, XMMWORD PTR [rdi+1]
+ lddqu xmm7, XMMWORD PTR [rdi+2]
psadbw xmm5, xmm0
psadbw xmm6, xmm0
psadbw xmm7, xmm0
%else
- movdqa xmm0, [rsi]
- lddqu xmm1, [rdi]
- lddqu xmm2, [rdi+1]
- lddqu xmm3, [rdi+2]
+ movdqa xmm0, XMMWORD PTR [rsi]
+ lddqu xmm1, XMMWORD PTR [rdi]
+ lddqu xmm2, XMMWORD PTR [rdi+1]
+ lddqu xmm3, XMMWORD PTR [rdi+2]
psadbw xmm1, xmm0
psadbw xmm2, xmm0
@@ -36,10 +35,10 @@
paddw xmm6, xmm2
paddw xmm7, xmm3
%endif
- movdqa xmm0, QWORD PTR [rsi+rax]
- lddqu xmm1, QWORD PTR [rdi+rdx]
- lddqu xmm2, QWORD PTR [rdi+rdx+1]
- lddqu xmm3, QWORD PTR [rdi+rdx+2]
+ movdqa xmm0, XMMWORD PTR [rsi+rax]
+ lddqu xmm1, XMMWORD PTR [rdi+rdx]
+ lddqu xmm2, XMMWORD PTR [rdi+rdx+1]
+ lddqu xmm3, XMMWORD PTR [rdi+rdx+2]
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rdx*2]
@@ -55,19 +54,19 @@
%macro PROCESS_8X2X3 1
%if %1
- movq mm0, [rsi]
- movq mm5, [rdi]
- movq mm6, [rdi+1]
- movq mm7, [rdi+2]
+ movq mm0, QWORD PTR [rsi]
+ movq mm5, QWORD PTR [rdi]
+ movq mm6, QWORD PTR [rdi+1]
+ movq mm7, QWORD PTR [rdi+2]
psadbw mm5, mm0
psadbw mm6, mm0
psadbw mm7, mm0
%else
- movq mm0, [rsi]
- movq mm1, [rdi]
- movq mm2, [rdi+1]
- movq mm3, [rdi+2]
+ movq mm0, QWORD PTR [rsi]
+ movq mm1, QWORD PTR [rdi]
+ movq mm2, QWORD PTR [rdi+1]
+ movq mm3, QWORD PTR [rdi+2]
psadbw mm1, mm0
psadbw mm2, mm0
@@ -104,45 +103,45 @@
%macro PROCESS_16X2X4 1
%if %1
- movdqa xmm0, [rsi]
- lddqu xmm4, [rcx]
- lddqu xmm5, [rdx]
- lddqu xmm6, [rbx]
- lddqu xmm7, [rdi]
+ movdqa xmm0, XMMWORD PTR [rsi]
+ lddqu xmm4, XMMWORD PTR [rcx]
+ lddqu xmm5, XMMWORD PTR [rdx]
+ lddqu xmm6, XMMWORD PTR [rbx]
+ lddqu xmm7, XMMWORD PTR [rdi]
psadbw xmm4, xmm0
psadbw xmm5, xmm0
psadbw xmm6, xmm0
psadbw xmm7, xmm0
%else
- movdqa xmm0, [rsi]
- lddqu xmm1, [rcx]
- lddqu xmm2, [rdx]
- lddqu xmm3, [rbx]
+ movdqa xmm0, XMMWORD PTR [rsi]
+ lddqu xmm1, XMMWORD PTR [rcx]
+ lddqu xmm2, XMMWORD PTR [rdx]
+ lddqu xmm3, XMMWORD PTR [rbx]
psadbw xmm1, xmm0
psadbw xmm2, xmm0
psadbw xmm3, xmm0
paddw xmm4, xmm1
- lddqu xmm1, [rdi]
+ lddqu xmm1, XMMWORD PTR [rdi]
paddw xmm5, xmm2
paddw xmm6, xmm3
psadbw xmm1, xmm0
paddw xmm7, xmm1
%endif
- movdqa xmm0, QWORD PTR [rsi+rax]
- lddqu xmm1, QWORD PTR [rcx+rbp]
- lddqu xmm2, QWORD PTR [rdx+rbp]
- lddqu xmm3, QWORD PTR [rbx+rbp]
+ movdqa xmm0, XMMWORD PTR [rsi+rax]
+ lddqu xmm1, XMMWORD PTR [rcx+rbp]
+ lddqu xmm2, XMMWORD PTR [rdx+rbp]
+ lddqu xmm3, XMMWORD PTR [rbx+rbp]
psadbw xmm1, xmm0
psadbw xmm2, xmm0
psadbw xmm3, xmm0
paddw xmm4, xmm1
- lddqu xmm1, QWORD PTR [rdi+rbp]
+ lddqu xmm1, XMMWORD PTR [rdi+rbp]
paddw xmm5, xmm2
paddw xmm6, xmm3
@@ -161,28 +160,28 @@
%macro PROCESS_8X2X4 1
%if %1
- movq mm0, [rsi]
- movq mm4, [rcx]
- movq mm5, [rdx]
- movq mm6, [rbx]
- movq mm7, [rdi]
+ movq mm0, QWORD PTR [rsi]
+ movq mm4, QWORD PTR [rcx]
+ movq mm5, QWORD PTR [rdx]
+ movq mm6, QWORD PTR [rbx]
+ movq mm7, QWORD PTR [rdi]
psadbw mm4, mm0
psadbw mm5, mm0
psadbw mm6, mm0
psadbw mm7, mm0
%else
- movq mm0, [rsi]
- movq mm1, [rcx]
- movq mm2, [rdx]
- movq mm3, [rbx]
+ movq mm0, QWORD PTR [rsi]
+ movq mm1, QWORD PTR [rcx]
+ movq mm2, QWORD PTR [rdx]
+ movq mm3, QWORD PTR [rbx]
psadbw mm1, mm0
psadbw mm2, mm0
psadbw mm3, mm0
paddw mm4, mm1
- movq mm1, [rdi]
+ movq mm1, QWORD PTR [rdi]
paddw mm5, mm2
paddw mm6, mm3
@@ -429,20 +428,20 @@ sym(vp8_sad4x4x3_sse3):
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
- movd mm0, QWORD PTR [rsi]
- movd mm1, QWORD PTR [rdi]
+ movd mm0, DWORD PTR [rsi]
+ movd mm1, DWORD PTR [rdi]
- movd mm2, QWORD PTR [rsi+rax]
- movd mm3, QWORD PTR [rdi+rdx]
+ movd mm2, DWORD PTR [rsi+rax]
+ movd mm3, DWORD PTR [rdi+rdx]
punpcklbw mm0, mm2
punpcklbw mm1, mm3
- movd mm4, QWORD PTR [rdi+1]
- movd mm5, QWORD PTR [rdi+2]
+ movd mm4, DWORD PTR [rdi+1]
+ movd mm5, DWORD PTR [rdi+2]
- movd mm2, QWORD PTR [rdi+rdx+1]
- movd mm3, QWORD PTR [rdi+rdx+2]
+ movd mm2, DWORD PTR [rdi+rdx+1]
+ movd mm3, DWORD PTR [rdi+rdx+2]
psadbw mm1, mm0
@@ -457,24 +456,24 @@ sym(vp8_sad4x4x3_sse3):
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rdx*2]
- movd mm0, QWORD PTR [rsi]
- movd mm2, QWORD PTR [rdi]
+ movd mm0, DWORD PTR [rsi]
+ movd mm2, DWORD PTR [rdi]
- movd mm3, QWORD PTR [rsi+rax]
- movd mm6, QWORD PTR [rdi+rdx]
+ movd mm3, DWORD PTR [rsi+rax]
+ movd mm6, DWORD PTR [rdi+rdx]
punpcklbw mm0, mm3
punpcklbw mm2, mm6
- movd mm3, QWORD PTR [rdi+1]
- movd mm7, QWORD PTR [rdi+2]
+ movd mm3, DWORD PTR [rdi+1]
+ movd mm7, DWORD PTR [rdi+2]
psadbw mm2, mm0
paddw mm1, mm2
- movd mm2, QWORD PTR [rdi+rdx+1]
- movd mm6, QWORD PTR [rdi+rdx+2]
+ movd mm2, DWORD PTR [rdi+rdx+1]
+ movd mm6, DWORD PTR [rdi+rdx+2]
punpcklbw mm3, mm2
punpcklbw mm7, mm6
@@ -529,7 +528,7 @@ sym(vp8_sad16x16_sse3):
vp8_sad16x16_sse3_loop:
- movd rax, mm7
+ movq rax, mm7
cmp rax, arg(4)
jg vp8_sad16x16_early_exit
@@ -563,7 +562,7 @@ vp8_sad16x16_sse3_loop:
cmp rsi, rcx
jne vp8_sad16x16_sse3_loop
- movd rax, mm7
+ movq rax, mm7
vp8_sad16x16_early_exit:
@@ -845,23 +844,23 @@ sym(vp8_sad4x4x4d_sse3):
xchg rbx, rax
- movd mm0, QWORD PTR [rsi]
- movd mm1, QWORD PTR [rcx]
+ movd mm0, DWORD PTR [rsi]
+ movd mm1, DWORD PTR [rcx]
- movd mm2, QWORD PTR [rsi+rax]
- movd mm3, QWORD PTR [rcx+rbp]
+ movd mm2, DWORD PTR [rsi+rax]
+ movd mm3, DWORD PTR [rcx+rbp]
punpcklbw mm0, mm2
punpcklbw mm1, mm3
- movd mm4, QWORD PTR [rdx]
- movd mm5, QWORD PTR [rbx]
+ movd mm4, DWORD PTR [rdx]
+ movd mm5, DWORD PTR [rbx]
- movd mm6, QWORD PTR [rdi]
- movd mm2, QWORD PTR [rdx+rbp]
+ movd mm6, DWORD PTR [rdi]
+ movd mm2, DWORD PTR [rdx+rbp]
- movd mm3, QWORD PTR [rbx+rbp]
- movd mm7, QWORD PTR [rdi+rbp]
+ movd mm3, DWORD PTR [rbx+rbp]
+ movd mm7, DWORD PTR [rdi+rbp]
psadbw mm1, mm0
@@ -884,17 +883,17 @@ sym(vp8_sad4x4x4d_sse3):
lea rdi, [rdi+rbp*2]
- movd mm0, QWORD PTR [rsi]
- movd mm2, QWORD PTR [rcx]
+ movd mm0, DWORD PTR [rsi]
+ movd mm2, DWORD PTR [rcx]
- movd mm3, QWORD PTR [rsi+rax]
- movd mm7, QWORD PTR [rcx+rbp]
+ movd mm3, DWORD PTR [rsi+rax]
+ movd mm7, DWORD PTR [rcx+rbp]
punpcklbw mm0, mm3
punpcklbw mm2, mm7
- movd mm3, QWORD PTR [rdx]
- movd mm7, QWORD PTR [rbx]
+ movd mm3, DWORD PTR [rdx]
+ movd mm7, DWORD PTR [rbx]
psadbw mm2, mm0
mov rax, rbp
@@ -905,8 +904,8 @@ sym(vp8_sad4x4x4d_sse3):
paddw mm1, mm2
movd [rsi], mm1
- movd mm2, QWORD PTR [rdx+rax]
- movd mm1, QWORD PTR [rbx+rax]
+ movd mm2, DWORD PTR [rdx+rax]
+ movd mm1, DWORD PTR [rbx+rax]
punpcklbw mm3, mm2
punpcklbw mm7, mm1
@@ -914,8 +913,8 @@ sym(vp8_sad4x4x4d_sse3):
psadbw mm3, mm0
psadbw mm7, mm0
- movd mm2, QWORD PTR [rdi]
- movd mm1, QWORD PTR [rdi+rax]
+ movd mm2, DWORD PTR [rdi]
+ movd mm1, DWORD PTR [rdi+rax]
paddw mm3, mm4
paddw mm7, mm5
diff --git a/vp8/encoder/x86/sad_sse4.asm b/vp8/encoder/x86/sad_sse4.asm
new file mode 100644
index 000000000..21e2e5007
--- /dev/null
+++ b/vp8/encoder/x86/sad_sse4.asm
@@ -0,0 +1,353 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro PROCESS_16X2X8 1
+%if %1
+ movdqa xmm0, XMMWORD PTR [rsi]
+ movq xmm1, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ movq xmm2, MMWORD PTR [rdi+16]
+ punpcklqdq xmm1, xmm3
+ punpcklqdq xmm3, xmm2
+
+ movdqa xmm2, xmm1
+ mpsadbw xmm1, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+
+ psrldq xmm0, 8
+
+ movdqa xmm4, xmm3
+ mpsadbw xmm3, xmm0, 0x0
+ mpsadbw xmm4, xmm0, 0x5
+
+ paddw xmm1, xmm2
+ paddw xmm1, xmm3
+ paddw xmm1, xmm4
+%else
+ movdqa xmm0, XMMWORD PTR [rsi]
+ movq xmm5, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ movq xmm2, MMWORD PTR [rdi+16]
+ punpcklqdq xmm5, xmm3
+ punpcklqdq xmm3, xmm2
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+
+ psrldq xmm0, 8
+
+ movdqa xmm4, xmm3
+ mpsadbw xmm3, xmm0, 0x0
+ mpsadbw xmm4, xmm0, 0x5
+
+ paddw xmm5, xmm2
+ paddw xmm5, xmm3
+ paddw xmm5, xmm4
+
+ paddw xmm1, xmm5
+%endif
+ movdqa xmm0, XMMWORD PTR [rsi + rax]
+ movq xmm5, MMWORD PTR [rdi+ rdx]
+ movq xmm3, MMWORD PTR [rdi+ rdx+8]
+ movq xmm2, MMWORD PTR [rdi+ rdx+16]
+ punpcklqdq xmm5, xmm3
+ punpcklqdq xmm3, xmm2
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+
+ psrldq xmm0, 8
+ movdqa xmm4, xmm3
+ mpsadbw xmm3, xmm0, 0x0
+ mpsadbw xmm4, xmm0, 0x5
+
+ paddw xmm5, xmm2
+ paddw xmm5, xmm3
+ paddw xmm5, xmm4
+
+ paddw xmm1, xmm5
+%endmacro
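+
+; Editor's note (illustrative, not part of the original patch): mpsadbw with
+; selector 0x0 yields eight word SADs of a 4-byte source group against eight
+; consecutive reference offsets, and selector 0x5 does the same for the next
+; 4-byte group shifted by four; summing the pairs (and the high-quadword pass
+; after psrldq) leaves xmm1 holding eight running SADs of the full 16-pixel
+; rows, one per reference displacement 0..7.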
+
+%macro PROCESS_8X2X8 1
+%if %1
+ movq xmm0, MMWORD PTR [rsi]
+ movq xmm1, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ punpcklqdq xmm1, xmm3
+
+ movdqa xmm2, xmm1
+ mpsadbw xmm1, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+ paddw xmm1, xmm2
+%else
+ movq xmm0, MMWORD PTR [rsi]
+ movq xmm5, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ punpcklqdq xmm5, xmm3
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+ paddw xmm5, xmm2
+
+ paddw xmm1, xmm5
+%endif
+ movq xmm0, MMWORD PTR [rsi + rax]
+ movq xmm5, MMWORD PTR [rdi+ rdx]
+ movq xmm3, MMWORD PTR [rdi+ rdx+8]
+ punpcklqdq xmm5, xmm3
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+ paddw xmm5, xmm2
+
+ paddw xmm1, xmm5
+%endmacro
+
+%macro PROCESS_4X2X8 1
+%if %1
+ movd xmm0, [rsi]
+ movq xmm1, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ punpcklqdq xmm1, xmm3
+
+ mpsadbw xmm1, xmm0, 0x0
+%else
+ movd xmm0, [rsi]
+ movq xmm5, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ punpcklqdq xmm5, xmm3
+
+ mpsadbw xmm5, xmm0, 0x0
+
+ paddw xmm1, xmm5
+%endif
+ movd xmm0, [rsi + rax]
+ movq xmm5, MMWORD PTR [rdi+ rdx]
+ movq xmm3, MMWORD PTR [rdi+ rdx+8]
+ punpcklqdq xmm5, xmm3
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ mpsadbw xmm5, xmm0, 0x0
+
+ paddw xmm1, xmm5
+%endmacro
+
+
+;void vp8_sad16x16x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array);
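+;
+; Editor's sketch (not part of the original patch) of a typical call site,
+; with caller-side names assumed for illustration:
+;     unsigned short sad[8];
+;     vp8_sad16x16x8_sse4(src, src_stride, ref + x, ref_stride, sad);
+;     /* sad[k] is the 16x16 SAD at horizontal offset x + k in the ref */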
+global sym(vp8_sad16x16x8_sse4)
+sym(vp8_sad16x16x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_16X2X8 1
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+
+ mov rdi, arg(4) ;Results
+ movdqu XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_sad16x8x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array
+;);
+global sym(vp8_sad16x8x8_sse4)
+sym(vp8_sad16x8x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_16X2X8 1
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+
+ mov rdi, arg(4) ;Results
+ movdqu XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_sad8x8x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array
+;);
+global sym(vp8_sad8x8x8_sse4)
+sym(vp8_sad8x8x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_8X2X8 1
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+
+ mov rdi, arg(4) ;Results
+ movdqu XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_sad8x16x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array
+;);
+global sym(vp8_sad8x16x8_sse4)
+sym(vp8_sad8x16x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_8X2X8 1
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ mov rdi, arg(4) ;Results
+ movdqu XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_sad4x4x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array
+;);
+global sym(vp8_sad4x4x8_sse4)
+sym(vp8_sad4x4x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_4X2X8 1
+ PROCESS_4X2X8 0
+
+ mov rdi, arg(4) ;Results
+ movdqu XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+
diff --git a/vp8/encoder/x86/sad_ssse3.asm b/vp8/encoder/x86/sad_ssse3.asm
index 1bb956121..69c5eaedc 100644
--- a/vp8/encoder/x86/sad_ssse3.asm
+++ b/vp8/encoder/x86/sad_ssse3.asm
@@ -1,32 +1,31 @@
;
-; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
-%idefine QWORD
-
%macro PROCESS_16X2X3 1
%if %1
- movdqa xmm0, [rsi]
- lddqu xmm5, [rdi]
- lddqu xmm6, [rdi+1]
- lddqu xmm7, [rdi+2]
+ movdqa xmm0, XMMWORD PTR [rsi]
+ lddqu xmm5, XMMWORD PTR [rdi]
+ lddqu xmm6, XMMWORD PTR [rdi+1]
+ lddqu xmm7, XMMWORD PTR [rdi+2]
psadbw xmm5, xmm0
psadbw xmm6, xmm0
psadbw xmm7, xmm0
%else
- movdqa xmm0, [rsi]
- lddqu xmm1, [rdi]
- lddqu xmm2, [rdi+1]
- lddqu xmm3, [rdi+2]
+ movdqa xmm0, XMMWORD PTR [rsi]
+ lddqu xmm1, XMMWORD PTR [rdi]
+ lddqu xmm2, XMMWORD PTR [rdi+1]
+ lddqu xmm3, XMMWORD PTR [rdi+2]
psadbw xmm1, xmm0
psadbw xmm2, xmm0
@@ -36,10 +35,10 @@
paddw xmm6, xmm2
paddw xmm7, xmm3
%endif
- movdqa xmm0, QWORD PTR [rsi+rax]
- lddqu xmm1, QWORD PTR [rdi+rdx]
- lddqu xmm2, QWORD PTR [rdi+rdx+1]
- lddqu xmm3, QWORD PTR [rdi+rdx+2]
+ movdqa xmm0, XMMWORD PTR [rsi+rax]
+ lddqu xmm1, XMMWORD PTR [rdi+rdx]
+ lddqu xmm2, XMMWORD PTR [rdi+rdx+1]
+ lddqu xmm3, XMMWORD PTR [rdi+rdx+2]
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rdx*2]
@@ -55,9 +54,9 @@
%macro PROCESS_16X2X3_OFFSET 2
%if %1
- movdqa xmm0, [rsi]
- movdqa xmm4, [rdi]
- movdqa xmm7, [rdi+16]
+ movdqa xmm0, XMMWORD PTR [rsi]
+ movdqa xmm4, XMMWORD PTR [rdi]
+ movdqa xmm7, XMMWORD PTR [rdi+16]
movdqa xmm5, xmm7
palignr xmm5, xmm4, %2
@@ -71,9 +70,9 @@
psadbw xmm6, xmm0
psadbw xmm7, xmm0
%else
- movdqa xmm0, [rsi]
- movdqa xmm4, [rdi]
- movdqa xmm3, [rdi+16]
+ movdqa xmm0, XMMWORD PTR [rsi]
+ movdqa xmm4, XMMWORD PTR [rdi]
+ movdqa xmm3, XMMWORD PTR [rdi+16]
movdqa xmm1, xmm3
palignr xmm1, xmm4, %2
@@ -91,9 +90,9 @@
paddw xmm6, xmm2
paddw xmm7, xmm3
%endif
- movdqa xmm0, QWORD PTR [rsi+rax]
- movdqa xmm4, QWORD PTR [rdi+rdx]
- movdqa xmm3, QWORD PTR [rdi+rdx+16]
+ movdqa xmm0, XMMWORD PTR [rsi+rax]
+ movdqa xmm4, XMMWORD PTR [rdi+rdx]
+ movdqa xmm3, XMMWORD PTR [rdi+rdx+16]
movdqa xmm1, xmm3
palignr xmm1, xmm4, %2
diff --git a/vp8/encoder/x86/subtract_mmx.asm b/vp8/encoder/x86/subtract_mmx.asm
index ce3e61066..a47e1f0d6 100644
--- a/vp8/encoder/x86/subtract_mmx.asm
+++ b/vp8/encoder/x86/subtract_mmx.asm
@@ -1,20 +1,21 @@
;
-; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
-; unsigned short *diff, unsigned char *Predictor,
+; short *diff, unsigned char *Predictor,
; int pitch);
global sym(vp8_subtract_b_mmx_impl)
-sym(vp8_subtract_b_mmx_impl)
+sym(vp8_subtract_b_mmx_impl):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
@@ -150,7 +151,7 @@ submby_loop:
;void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
global sym(vp8_subtract_mbuv_mmx)
-sym(vp8_subtract_mbuv_mmx)
+sym(vp8_subtract_mbuv_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
diff --git a/vp8/encoder/x86/subtract_sse2.asm b/vp8/encoder/x86/subtract_sse2.asm
new file mode 100644
index 000000000..3fb23d097
--- /dev/null
+++ b/vp8/encoder/x86/subtract_sse2.asm
@@ -0,0 +1,356 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride,
+; short *diff, unsigned char *Predictor,
+; int pitch);
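+;
+; Editor's note (illustrative, not part of the original patch): this computes
+; the 4x4 residual block, roughly
+;     diff[r * pitch + c] = (short)z[r * src_stride + c]
+;                         - (short)Predictor[r * pitch + c];
+; widening the unsigned bytes to 16-bit differences via punpcklbw with zero.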
+global sym(vp8_subtract_b_sse2_impl)
+sym(vp8_subtract_b_sse2_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdi, arg(2) ;diff
+ mov rax, arg(3) ;Predictor
+ mov rsi, arg(0) ;z
+ movsxd rdx, dword ptr arg(1);src_stride;
+ movsxd rcx, dword ptr arg(4);pitch
+ pxor mm7, mm7
+
+ movd mm0, [rsi]
+ movd mm1, [rax]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq MMWORD PTR [rdi], mm0
+
+ movd mm0, [rsi+rdx]
+ movd mm1, [rax+rcx]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq MMWORD PTR [rdi+rcx*2], mm0
+
+ movd mm0, [rsi+rdx*2]
+ movd mm1, [rax+rcx*2]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq MMWORD PTR [rdi+rcx*4], mm0
+
+ lea rsi, [rsi+rdx*2]
+ lea rcx, [rcx+rcx*2]
+
+ movd mm0, [rsi+rdx]
+ movd mm1, [rax+rcx]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq MMWORD PTR [rdi+rcx*2], mm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, unsigned char *pred, int stride)
+global sym(vp8_subtract_mby_sse2)
+sym(vp8_subtract_mby_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ SAVE_XMM
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(1) ;src
+ mov rdi, arg(0) ;diff
+
+ mov rax, arg(2) ;pred
+ movsxd rdx, dword ptr arg(3) ;stride
+
+ mov rcx, 8 ; do two lines at one time
+
+submby_loop:
+ movdqa xmm0, XMMWORD PTR [rsi] ; src
+ movdqa xmm1, XMMWORD PTR [rax] ; pred
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa XMMWORD PTR [rdi], xmm0
+ movdqa XMMWORD PTR [rdi +16], xmm2
+
+ movdqa xmm4, XMMWORD PTR [rsi + rdx]
+ movdqa xmm5, XMMWORD PTR [rax + 16]
+
+ movdqa xmm6, xmm4
+ psubb xmm4, xmm5
+
+ pxor xmm5, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm6, [GLOBAL(t80)]
+ pcmpgtb xmm5, xmm6 ; obtain sign information
+
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm5
+ punpcklbw xmm4, xmm5 ; put sign back to subtraction
+ punpckhbw xmm6, xmm7 ; put sign back to subtraction
+
+ movdqa XMMWORD PTR [rdi +32], xmm4
+ movdqa XMMWORD PTR [rdi +48], xmm6
+
+ add rdi, 64
+ add rax, 32
+ lea rsi, [rsi+rdx*2]
+
+ sub rcx, 1
+ jnz submby_loop
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
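+
+; Editor's note (illustrative, not part of the original patch): psubb only
+; produces the low 8 bits of src - pred, so the loop recovers the sign
+; separately: both inputs are biased by 0x80 (t80) so that pcmpgtb gives 0xFF
+; wherever pred > src, and punpck{l,h}bw with that mask sign-extends each byte
+; difference to the stored 16-bit residual.  Per pixel this is simply
+;     diff[i] = (short)src[i] - (short)pred[i];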
+
+
+;void vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
+global sym(vp8_subtract_mbuv_sse2)
+sym(vp8_subtract_mbuv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdi, arg(0) ;diff
+ mov rax, arg(3) ;pred
+ mov rsi, arg(1) ;z = usrc
+ add rdi, 256*2 ;diff = diff + 256 (shorts)
+ add rax, 256 ;Predictor = pred + 256
+ movsxd rdx, dword ptr arg(4) ;stride;
+ lea rcx, [rdx + rdx*2]
+
+ ;u
+ ;line 0 1
+ movq xmm0, MMWORD PTR [rsi] ; src
+ movq xmm2, MMWORD PTR [rsi+rdx]
+ movdqa xmm1, XMMWORD PTR [rax] ; pred
+ punpcklqdq xmm0, xmm2
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa XMMWORD PTR [rdi], xmm0
+ movdqa XMMWORD PTR [rdi +16], xmm2
+
+ ;line 2 3
+ movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
+ movq xmm2, MMWORD PTR [rsi+rcx]
+ movdqa xmm1, XMMWORD PTR [rax+16] ; pred
+ punpcklqdq xmm0, xmm2
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa XMMWORD PTR [rdi + 32], xmm0
+ movdqa XMMWORD PTR [rdi + 48], xmm2
+
+ ;line 4 5
+ lea rsi, [rsi + rdx*4]
+
+ movq xmm0, MMWORD PTR [rsi] ; src
+ movq xmm2, MMWORD PTR [rsi+rdx]
+ movdqa xmm1, XMMWORD PTR [rax + 32] ; pred
+ punpcklqdq xmm0, xmm2
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa XMMWORD PTR [rdi + 64], xmm0
+ movdqa XMMWORD PTR [rdi + 80], xmm2
+
+ ;line 6 7
+ movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
+ movq xmm2, MMWORD PTR [rsi+rcx]
+ movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred
+ punpcklqdq xmm0, xmm2
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa XMMWORD PTR [rdi + 96], xmm0
+ movdqa XMMWORD PTR [rdi + 112], xmm2
+
+ ;v
+ mov rsi, arg(2) ;z = vsrc
+ add rdi, 64*2 ;diff = diff + 320 (shorts)
+ add rax, 64 ;Predictor = pred + 320
+
+ ;line 0 1
+ movq xmm0, MMWORD PTR [rsi] ; src
+ movq xmm2, MMWORD PTR [rsi+rdx]
+ movdqa xmm1, XMMWORD PTR [rax] ; pred
+ punpcklqdq xmm0, xmm2
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa XMMWORD PTR [rdi], xmm0
+ movdqa XMMWORD PTR [rdi +16], xmm2
+
+ ;line 2 3
+ movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
+ movq xmm2, MMWORD PTR [rsi+rcx]
+ movdqa xmm1, XMMWORD PTR [rax+16] ; pred
+ punpcklqdq xmm0, xmm2
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa XMMWORD PTR [rdi + 32], xmm0
+ movdqa XMMWORD PTR [rdi + 48], xmm2
+
+ ;line 4 5
+ lea rsi, [rsi + rdx*4]
+
+ movq xmm0, MMWORD PTR [rsi] ; src
+ movq xmm2, MMWORD PTR [rsi+rdx]
+ movdqa xmm1, XMMWORD PTR [rax + 32] ; pred
+ punpcklqdq xmm0, xmm2
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa XMMWORD PTR [rdi + 64], xmm0
+ movdqa XMMWORD PTR [rdi + 80], xmm2
+
+ ;line 6 7
+ movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
+ movq xmm2, MMWORD PTR [rsi+rcx]
+ movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred
+ punpcklqdq xmm0, xmm2
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa XMMWORD PTR [rdi + 96], xmm0
+ movdqa XMMWORD PTR [rdi + 112], xmm2
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+t80:
+ times 16 db 0x80
diff --git a/vp8/encoder/x86/variance_impl_mmx.asm b/vp8/encoder/x86/variance_impl_mmx.asm
index d0da82ad4..67a9b4d3e 100644
--- a/vp8/encoder/x86/variance_impl_mmx.asm
+++ b/vp8/encoder/x86/variance_impl_mmx.asm
@@ -1,10 +1,11 @@
;
-; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
;
@@ -497,7 +498,7 @@ sym(vp8_get4x4sse_cs_mmx):
psrlq mm7, 32
paddd mm0, mm7
- movd rax, mm0
+ movq rax, mm0
; begin epilog
@@ -555,7 +556,7 @@ sym(vp8_filter_block2d_bil4x4_var_mmx):
pmullw mm3, [rax+8] ;
paddw mm1, mm3 ;
- paddw mm1, [mmx_bi_rd GLOBAL] ;
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
movq mm5, mm1
@@ -579,7 +580,7 @@ filter_block2d_bil4x4_var_mmx_loop:
pmullw mm3, [rax+8] ;
paddw mm1, mm3 ;
- paddw mm1, [mmx_bi_rd GLOBAL] ;
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
movq mm3, mm5 ;
@@ -591,7 +592,7 @@ filter_block2d_bil4x4_var_mmx_loop:
paddw mm1, mm3 ;
- paddw mm1, [mmx_bi_rd GLOBAL] ;
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
movd mm3, [rdi] ;
@@ -709,10 +710,10 @@ sym(vp8_filter_block2d_bil_var_mmx):
paddw mm1, mm3 ;
paddw mm2, mm4 ;
- paddw mm1, [mmx_bi_rd GLOBAL] ;
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
- paddw mm2, [mmx_bi_rd GLOBAL] ;
+ paddw mm2, [GLOBAL(mmx_bi_rd)] ;
psraw mm2, mmx_filter_shift ;
movq mm5, mm1
@@ -748,10 +749,10 @@ filter_block2d_bil_var_mmx_loop:
paddw mm1, mm3 ;
paddw mm2, mm4 ;
- paddw mm1, [mmx_bi_rd GLOBAL] ;
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
- paddw mm2, [mmx_bi_rd GLOBAL] ;
+ paddw mm2, [GLOBAL(mmx_bi_rd)] ;
psraw mm2, mmx_filter_shift ;
movq mm3, mm5 ;
@@ -772,8 +773,8 @@ filter_block2d_bil_var_mmx_loop:
paddw mm1, mm3 ;
paddw mm2, mm4 ;
- paddw mm1, [mmx_bi_rd GLOBAL] ;
- paddw mm2, [mmx_bi_rd GLOBAL] ;
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+ paddw mm2, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
psraw mm2, mmx_filter_shift ;
diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm
index 7e5ee284b..cefa0a956 100644
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -1,10 +1,11 @@
;
-; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
;
@@ -57,7 +58,7 @@ NEXTROW:
movdqa xmm3,xmm4
psrldq xmm4,4
paddd xmm4,xmm3
- movd rax,xmm4
+ movq rax,xmm4
; begin epilog
@@ -470,7 +471,7 @@ sym(vp8_get8x8var_sse2):
mov rax, arg(5) ;[Sum]
mov rdi, arg(4) ;[SSE]
- movd rdx, xmm7
+ movq rdx, xmm7
movsx rcx, dx
mov dword ptr [rax], ecx
@@ -531,7 +532,7 @@ sym(vp8_filter_block2d_bil_var_sse2):
pmullw xmm3, [rax+16] ;
paddw xmm1, xmm3 ;
- paddw xmm1, [xmm_bi_rd GLOBAL] ;
+ paddw xmm1, [GLOBAL(xmm_bi_rd)] ;
psraw xmm1, xmm_filter_shift ;
movdqa xmm5, xmm1
@@ -553,7 +554,7 @@ filter_block2d_bil_var_sse2_loop:
pmullw xmm3, [rax+16] ;
paddw xmm1, xmm3 ;
- paddw xmm1, [xmm_bi_rd GLOBAL] ;
+ paddw xmm1, [GLOBAL(xmm_bi_rd)] ;
psraw xmm1, xmm_filter_shift ;
movdqa xmm3, xmm5 ;
@@ -564,7 +565,7 @@ filter_block2d_bil_var_sse2_loop:
pmullw xmm1, [rdx+16] ;
paddw xmm1, xmm3 ;
- paddw xmm1, [xmm_bi_rd GLOBAL] ;
+ paddw xmm1, [GLOBAL(xmm_bi_rd)] ;
psraw xmm1, xmm_filter_shift ;
movq xmm3, QWORD PTR [rdi] ;
diff --git a/vp8/encoder/x86/variance_mmx.c b/vp8/encoder/x86/variance_mmx.c
index 4a5b25b0d..2df73a635 100644
--- a/vp8/encoder/x86/variance_mmx.c
+++ b/vp8/encoder/x86/variance_mmx.c
@@ -1,10 +1,11 @@
/*
- * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
- * Use of this source code is governed by a BSD-style license and patent
- * grant that can be found in the LICENSE file in the root of the source
- * tree. All contributing project authors may be found in the AUTHORS
- * file in the root of the source tree.
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
*/
@@ -14,7 +15,7 @@
extern void filter_block1d_h6_mmx
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
unsigned short *output_ptr,
unsigned int src_pixels_per_line,
unsigned int pixel_step,
@@ -24,7 +25,7 @@ extern void filter_block1d_h6_mmx
);
extern void filter_block1d_v6_mmx
(
- short *src_ptr,
+ const short *src_ptr,
unsigned char *output_ptr,
unsigned int pixels_per_line,
unsigned int pixel_step,
@@ -36,34 +37,34 @@ extern void filter_block1d_v6_mmx
extern unsigned int vp8_get_mb_ss_mmx(short *src_ptr);
extern unsigned int vp8_get8x8var_mmx
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *SSE,
int *Sum
);
extern unsigned int vp8_get4x4var_mmx
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *SSE,
int *Sum
);
extern unsigned int vp8_get4x4sse_cs_mmx
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride
);
extern void vp8_filter_block2d_bil4x4_var_mmx
(
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int ref_pixels_per_line,
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
const short *HFilter,
const short *VFilter,
@@ -72,9 +73,9 @@ extern void vp8_filter_block2d_bil4x4_var_mmx
);
extern void vp8_filter_block2d_bil_var_mmx
(
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int ref_pixels_per_line,
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
const short *HFilter,
@@ -125,9 +126,9 @@ void vp8_test_get_mb_ss(void)
unsigned int vp8_get16x16var_mmx(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned *SSE,
unsigned *SUM
@@ -156,9 +157,9 @@ unsigned int vp8_get16x16var_mmx(
unsigned int vp8_variance4x4_mmx(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
@@ -172,9 +173,9 @@ unsigned int vp8_variance4x4_mmx(
}
unsigned int vp8_variance8x8_mmx(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
@@ -189,9 +190,9 @@ unsigned int vp8_variance8x8_mmx(
}
unsigned int vp8_mse16x16_mmx(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
@@ -211,9 +212,9 @@ unsigned int vp8_mse16x16_mmx(
unsigned int vp8_variance16x16_mmx(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
int *sse)
{
@@ -233,9 +234,9 @@ unsigned int vp8_variance16x16_mmx(
}
unsigned int vp8_variance16x8_mmx(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
@@ -254,9 +255,9 @@ unsigned int vp8_variance16x8_mmx(
unsigned int vp8_variance8x16_mmx(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
@@ -295,11 +296,11 @@ DECLARE_ALIGNED(16, const short, vp8_vp7_bilinear_filters_mmx[8][8]) =
unsigned int vp8_sub_pixel_variance4x4_mmx
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse)
@@ -319,11 +320,11 @@ unsigned int vp8_sub_pixel_variance4x4_mmx
unsigned int vp8_sub_pixel_variance8x8_mmx
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
@@ -343,11 +344,11 @@ unsigned int vp8_sub_pixel_variance8x8_mmx
unsigned int vp8_sub_pixel_variance16x16_mmx
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
@@ -382,11 +383,11 @@ unsigned int vp8_sub_pixel_variance16x16_mmx
}
unsigned int vp8_sub_pixel_mse16x16_mmx(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
@@ -397,11 +398,11 @@ unsigned int vp8_sub_pixel_mse16x16_mmx(
unsigned int vp8_sub_pixel_variance16x8_mmx
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
@@ -434,11 +435,11 @@ unsigned int vp8_sub_pixel_variance16x8_mmx
unsigned int vp8_sub_pixel_variance8x16_mmx
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
int *sse
)
@@ -456,9 +457,9 @@ unsigned int vp8_sub_pixel_variance8x16_mmx
}
unsigned int vp8_i_variance16x16_mmx(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
@@ -479,9 +480,9 @@ unsigned int vp8_i_variance16x16_mmx(
}
unsigned int vp8_i_variance8x16_mmx(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
@@ -500,11 +501,11 @@ unsigned int vp8_i_variance8x16_mmx(
unsigned int vp8_i_sub_pixel_variance16x16_mmx
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
@@ -559,11 +560,11 @@ unsigned int vp8_i_sub_pixel_variance16x16_mmx
unsigned int vp8_i_sub_pixel_variance8x16_mmx
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
@@ -594,3 +595,39 @@ unsigned int vp8_i_sub_pixel_variance8x16_mmx
*sse = xxsum0;
return (xxsum0 - ((xsum0 * xsum0) >> 7));
}
+
+
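+/* The half-pixel variants below reuse the generic sub-pixel variance with
+ * filter offsets (4, 0), (0, 4) and (4, 4); offset 4 of the eighth-pel
+ * bilinear filter table corresponds to a half-pixel shift.
+ */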
+unsigned int vp8_variance_halfpixvar16x16_h_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 0,
+ ref_ptr, recon_stride, sse);
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_v_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 4,
+ ref_ptr, recon_stride, sse);
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_hv_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 4,
+ ref_ptr, recon_stride, sse);
+}
diff --git a/vp8/encoder/x86/variance_sse2.c b/vp8/encoder/x86/variance_sse2.c
index ea80753bd..006e0a24a 100644
--- a/vp8/encoder/x86/variance_sse2.c
+++ b/vp8/encoder/x86/variance_sse2.c
@@ -1,10 +1,11 @@
/*
- * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
- * Use of this source code is governed by a BSD-style license and patent
- * grant that can be found in the LICENSE file in the root of the source
- * tree. All contributing project authors may be found in the AUTHORS
- * file in the root of the source tree.
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
*/
@@ -12,16 +13,16 @@
#include "pragmas.h"
#include "vpx_ports/mem.h"
-extern void filter_block1d_h6_mmx(unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d_v6_mmx(short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d8_h6_sse2(unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d8_v6_sse2(short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
extern void vp8_filter_block2d_bil4x4_var_mmx
(
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int ref_pixels_per_line,
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
const short *HFilter,
const short *VFilter,
@@ -31,9 +32,9 @@ extern void vp8_filter_block2d_bil4x4_var_mmx
extern unsigned int vp8_get4x4var_mmx
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *SSE,
int *Sum
@@ -41,38 +42,38 @@ extern unsigned int vp8_get4x4var_mmx
unsigned int vp8_get_mb_ss_sse2
(
- short *src_ptr
+ const short *src_ptr
);
unsigned int vp8_get16x16var_sse2
(
- unsigned char *src_ptr,
- int source_stride,
- unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *SSE,
- int *Sum
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
);
unsigned int vp8_get16x16pred_error_sse2
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int ref_stride
);
unsigned int vp8_get8x8var_sse2
(
- unsigned char *src_ptr,
- int source_stride,
- unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *SSE,
- int *Sum
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
);
void vp8_filter_block2d_bil_var_sse2
(
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int ref_pixels_per_line,
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
const short *HFilter,
@@ -82,9 +83,9 @@ void vp8_filter_block2d_bil_var_sse2
);
void vp8_half_horiz_vert_variance16x_h_sse2
(
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int ref_pixels_per_line,
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
@@ -92,9 +93,9 @@ void vp8_half_horiz_vert_variance16x_h_sse2
);
void vp8_half_horiz_variance16x_h_sse2
(
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int ref_pixels_per_line,
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
@@ -102,9 +103,9 @@ void vp8_half_horiz_variance16x_h_sse2
);
void vp8_half_vert_variance16x_h_sse2
(
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int ref_pixels_per_line,
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
@@ -114,9 +115,9 @@ void vp8_half_vert_variance16x_h_sse2
DECLARE_ALIGNED(16, extern short, vp8_vp7_bilinear_filters_mmx[8][8]);
unsigned int vp8_variance4x4_wmt(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride)
{
unsigned int var;
@@ -131,9 +132,9 @@ unsigned int vp8_variance4x4_wmt(
unsigned int vp8_variance8x8_wmt
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride)
{
unsigned int var;
@@ -148,9 +149,9 @@ unsigned int vp8_variance8x8_wmt
unsigned int vp8_variance16x16_wmt
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
@@ -163,9 +164,9 @@ unsigned int vp8_variance16x16_wmt
return (sse0 - ((sum0 * sum0) >> 8));
}
unsigned int vp8_mse16x16_wmt(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
@@ -181,9 +182,9 @@ unsigned int vp8_mse16x16_wmt(
unsigned int vp8_variance16x8_wmt
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
@@ -202,9 +203,9 @@ unsigned int vp8_variance16x8_wmt
unsigned int vp8_variance8x16_wmt
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
@@ -238,11 +239,11 @@ DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_xmm[8][16]) =
};
unsigned int vp8_sub_pixel_variance4x4_wmt
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
@@ -262,11 +263,11 @@ unsigned int vp8_sub_pixel_variance4x4_wmt
unsigned int vp8_sub_pixel_variance8x8_wmt
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
@@ -287,11 +288,11 @@ unsigned int vp8_sub_pixel_variance8x8_wmt
unsigned int vp8_sub_pixel_variance16x16_wmt
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
@@ -363,11 +364,11 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
}
unsigned int vp8_sub_pixel_mse16x16_wmt(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
@@ -378,11 +379,11 @@ unsigned int vp8_sub_pixel_mse16x16_wmt(
unsigned int vp8_sub_pixel_variance16x8_wmt
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
@@ -416,11 +417,11 @@ unsigned int vp8_sub_pixel_variance16x8_wmt
unsigned int vp8_sub_pixel_variance8x16_wmt
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
@@ -439,9 +440,9 @@ unsigned int vp8_sub_pixel_variance8x16_wmt
}
unsigned int vp8_i_variance16x16_wmt(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
@@ -463,9 +464,9 @@ unsigned int vp8_i_variance16x16_wmt(
}
unsigned int vp8_i_variance8x16_wmt(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
@@ -485,11 +486,11 @@ unsigned int vp8_i_variance8x16_wmt(
unsigned int vp8_i_sub_pixel_variance16x16_wmt
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
@@ -500,11 +501,11 @@ unsigned int vp8_i_sub_pixel_variance16x16_wmt
unsigned int vp8_i_sub_pixel_variance8x16_wmt
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
@@ -512,3 +513,84 @@ unsigned int vp8_i_sub_pixel_variance8x16_wmt
return vp8_sub_pixel_variance8x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse);
}
+
+
+unsigned int vp8_variance_halfpixvar16x16_h_wmt(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse)
+{
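+ /* The 16x16 block is covered by two calls, the second starting 8 pixels to
+  * the right; the partial sums are merged and the variance is computed as
+  * SSE - sum^2 / 256 (the >> 8 divides by the pixel count of the block).
+  */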
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ &xsum1, &xxsum1);
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_v_wmt(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse)
+{
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ &xsum1, &xxsum1);
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_hv_wmt(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse)
+{
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ &xsum1, &xxsum1);
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h
index 35fc90c48..6bea15ebc 100644
--- a/vp8/encoder/x86/variance_x86.h
+++ b/vp8/encoder/x86/variance_x86.h
@@ -1,10 +1,11 @@
/*
- * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
- * Use of this source code is governed by a BSD-style license and patent
- * grant that can be found in the LICENSE file in the root of the source
- * tree. All contributing project authors may be found in the AUTHORS
- * file in the root of the source tree.
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
*/
@@ -34,6 +35,9 @@ extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_mmx);
extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_mmx);
extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_mmx);
extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_mmx);
+extern prototype_variance(vp8_variance_halfpixvar16x16_h_mmx);
+extern prototype_variance(vp8_variance_halfpixvar16x16_v_mmx);
+extern prototype_variance(vp8_variance_halfpixvar16x16_hv_mmx);
extern prototype_subpixvariance(vp8_sub_pixel_mse16x16_mmx);
extern prototype_getmbss(vp8_get_mb_ss_mmx);
extern prototype_variance(vp8_mse16x16_mmx);
@@ -88,6 +92,15 @@ extern prototype_sad(vp8_get4x4sse_cs_mmx);
#undef vp8_variance_subpixvar16x16
#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_mmx
+#undef vp8_variance_halfpixvar16x16_h
+#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_mmx
+
+#undef vp8_variance_halfpixvar16x16_v
+#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_mmx
+
+#undef vp8_variance_halfpixvar16x16_hv
+#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_mmx
+
#undef vp8_variance_subpixmse16x16
#define vp8_variance_subpixmse16x16 vp8_sub_pixel_mse16x16_mmx
@@ -129,6 +142,9 @@ extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_wmt);
extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_wmt);
extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_wmt);
extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_wmt);
+extern prototype_variance(vp8_variance_halfpixvar16x16_h_wmt);
+extern prototype_variance(vp8_variance_halfpixvar16x16_v_wmt);
+extern prototype_variance(vp8_variance_halfpixvar16x16_hv_wmt);
extern prototype_subpixvariance(vp8_sub_pixel_mse16x16_wmt);
extern prototype_getmbss(vp8_get_mb_ss_sse2);
extern prototype_variance(vp8_mse16x16_wmt);
@@ -182,6 +198,15 @@ extern prototype_variance2(vp8_get16x16var_sse2);
#undef vp8_variance_subpixvar16x16
#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_wmt
+#undef vp8_variance_halfpixvar16x16_h
+#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_wmt
+
+#undef vp8_variance_halfpixvar16x16_v
+#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_wmt
+
+#undef vp8_variance_halfpixvar16x16_hv
+#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_wmt
+
#undef vp8_variance_subpixmse16x16
#define vp8_variance_subpixmse16x16 vp8_sub_pixel_mse16x16_wmt
@@ -240,7 +265,7 @@ extern prototype_sad_multi_dif_address(vp8_sad4x4x4d_sse3);
#define vp8_variance_sad4x4x3 vp8_sad4x4x3_sse3
#undef vp8_variance_sad16x16x4d
-#define vp8_variance_sad16x16x4 vp8_sad16x16x4d_sse3
+#define vp8_variance_sad16x16x4d vp8_sad16x16x4d_sse3
#undef vp8_variance_sad16x8x4d
#define vp8_variance_sad16x8x4d vp8_sad16x8x4d_sse3
@@ -272,4 +297,31 @@ extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3);
#endif
#endif
+
+#if HAVE_SSE4_1
+extern prototype_sad_multi_same_address_1(vp8_sad16x16x8_sse4);
+extern prototype_sad_multi_same_address_1(vp8_sad16x8x8_sse4);
+extern prototype_sad_multi_same_address_1(vp8_sad8x16x8_sse4);
+extern prototype_sad_multi_same_address_1(vp8_sad8x8x8_sse4);
+extern prototype_sad_multi_same_address_1(vp8_sad4x4x8_sse4);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_variance_sad16x16x8
+#define vp8_variance_sad16x16x8 vp8_sad16x16x8_sse4
+
+#undef vp8_variance_sad16x8x8
+#define vp8_variance_sad16x8x8 vp8_sad16x8x8_sse4
+
+#undef vp8_variance_sad8x16x8
+#define vp8_variance_sad8x16x8 vp8_sad8x16x8_sse4
+
+#undef vp8_variance_sad8x8x8
+#define vp8_variance_sad8x8x8 vp8_sad8x8x8_sse4
+
+#undef vp8_variance_sad4x4x8
+#define vp8_variance_sad4x4x8 vp8_sad4x4x8_sse4
+
+#endif
+#endif
+
#endif
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index f1391ba8c..fb1b37ccb 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -1,10 +1,11 @@
/*
- * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
- * Use of this source code is governed by a BSD-style license and patent
- * grant that can be found in the LICENSE file in the root of the source
- * tree. All contributing project authors may be found in the AUTHORS
- * file in the root of the source tree.
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
*/
@@ -17,15 +18,10 @@
#if HAVE_MMX
void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch)
{
- vp8_short_fdct4x4_mmx(input, output, pitch);
- vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch);
+ vp8_short_fdct4x4_c(input, output, pitch);
+ vp8_short_fdct4x4_c(input + 4, output + 16, pitch);
}
-void vp8_fast_fdct8x4_mmx(short *input, short *output, int pitch)
-{
- vp8_fast_fdct4x4_mmx(input, output , pitch);
- vp8_fast_fdct4x4_mmx(input + 4, output + 16, pitch);
-}
int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
short *qcoeff_ptr, short *dequant_ptr,
@@ -33,14 +29,14 @@ int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
short *quant_ptr, short *dqcoeff_ptr);
void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d)
{
- short *scan_mask = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;
- short *coeff_ptr = &b->coeff[0];
- short *zbin_ptr = &b->zbin[0][0];
- short *round_ptr = &b->round[0][0];
- short *quant_ptr = &b->quant[0][0];
- short *qcoeff_ptr = d->qcoeff;
+ short *scan_mask = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;
+ short *coeff_ptr = b->coeff;
+ short *zbin_ptr = b->zbin;
+ short *round_ptr = b->round;
+ short *quant_ptr = b->quant;
+ short *qcoeff_ptr = d->qcoeff;
short *dqcoeff_ptr = d->dqcoeff;
- short *dequant_ptr = &d->dequant[0][0];
+ short *dequant_ptr = d->dequant;
d->eob = vp8_fast_quantize_b_impl_mmx(
coeff_ptr,
@@ -86,30 +82,28 @@ void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
#endif
#if HAVE_SSE2
-void vp8_short_fdct8x4_wmt(short *input, short *output, int pitch)
+void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
{
- vp8_short_fdct4x4_wmt(input, output, pitch);
- vp8_short_fdct4x4_wmt(input + 4, output + 16, pitch);
+ vp8_short_fdct4x4_sse2(input, output, pitch);
+ vp8_short_fdct4x4_sse2(input + 4, output + 16, pitch);
}
-int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr,
+int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
short *qcoeff_ptr, short *dequant_ptr,
short *scan_mask, short *round_ptr,
short *quant_ptr, short *dqcoeff_ptr);
-void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d)
+void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
{
- short *scan_mask = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;
- short *coeff_ptr = &b->coeff[0];
- short *zbin_ptr = &b->zbin[0][0];
- short *round_ptr = &b->round[0][0];
- short *quant_ptr = &b->quant[0][0];
- short *qcoeff_ptr = d->qcoeff;
+ short *scan_mask = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;
+ short *coeff_ptr = b->coeff;
+ short *round_ptr = b->round;
+ short *quant_ptr = b->quant;
+ short *qcoeff_ptr = d->qcoeff;
short *dqcoeff_ptr = d->dqcoeff;
- short *dequant_ptr = &d->dequant[0][0];
+ short *dequant_ptr = d->dequant;
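+ /* The fast quantizer applies no adaptive zero bin, so zbin is no longer
+  * passed down to the SSE2 implementation.
+  */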
- d->eob = vp8_fast_quantize_b_impl_sse(
+ d->eob = vp8_fast_quantize_b_impl_sse2(
coeff_ptr,
- zbin_ptr,
qcoeff_ptr,
dequant_ptr,
scan_mask,
@@ -120,6 +114,41 @@ void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d)
);
}
+
+int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr,
+ short *qcoeff_ptr,short *dequant_ptr,
+ const int *default_zig_zag, short *round_ptr,
+ short *quant_ptr, short *dqcoeff_ptr,
+ unsigned short zbin_oq_value,
+ short *zbin_boost_ptr);
+
+void vp8_regular_quantize_b_sse2(BLOCK *b,BLOCKD *d)
+{
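+ /* Gather the block's quantizer tables and coefficient buffers, then run the
+  * SSE2 quantizer over the default zig-zag scan; the implementation returns
+  * the end-of-block index, which is stored in d->eob.
+  */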
+ short *zbin_boost_ptr = b->zrun_zbin_boost;
+ short *coeff_ptr = b->coeff;
+ short *zbin_ptr = b->zbin;
+ short *round_ptr = b->round;
+ short *quant_ptr = b->quant;
+ short *qcoeff_ptr = d->qcoeff;
+ short *dqcoeff_ptr = d->dqcoeff;
+ short *dequant_ptr = d->dequant;
+ short zbin_oq_value = b->zbin_extra;
+
+ d->eob = vp8_regular_quantize_b_impl_sse2(
+ coeff_ptr,
+ zbin_ptr,
+ qcoeff_ptr,
+ dequant_ptr,
+ vp8_default_zig_zag1d,
+ round_ptr,
+ quant_ptr,
+ dqcoeff_ptr,
+ zbin_oq_value,
+ zbin_boost_ptr
+ );
+}
+
int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
int vp8_mbblock_error_xmm(MACROBLOCK *mb, int dc)
{
@@ -136,8 +165,39 @@ int vp8_mbuverror_xmm(MACROBLOCK *mb)
return vp8_mbuverror_xmm_impl(s_ptr, d_ptr);
}
+void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride,
+ short *diff, unsigned char *predictor,
+ int pitch);
+void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)
+{
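+ /* Locate the source block inside the source buffer (base pointer plus the
+  * block's offset) and subtract the predictor to form the residual.
+  */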
+ unsigned char *z = *(be->base_src) + be->src;
+ unsigned int src_stride = be->src_stride;
+ short *diff = &be->src_diff[0];
+ unsigned char *predictor = &bd->predictor[0];
+ vp8_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch);
+}
+
+#endif
+
+#if HAVE_SSSE3
+int vp8_fast_quantize_b_impl_ssse3(short *coeff_ptr,
+ short *qcoeff_ptr, short *dequant_ptr,
+ short *round_ptr,
+ short *quant_ptr, short *dqcoeff_ptr);
+void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d)
+{
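+ /* The SSSE3 fast quantizer takes neither a zero-bin nor a scan-mask
+  * argument; it returns the end-of-block index directly.
+  */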
+ d->eob = vp8_fast_quantize_b_impl_ssse3(
+ b->coeff,
+ d->qcoeff,
+ d->dequant,
+ b->round,
+ b->quant,
+ d->dqcoeff
+ );
+}
#endif
+
void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
{
#if CONFIG_RUNTIME_CPU_DETECT
@@ -147,6 +207,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
int wmt_enabled = flags & HAS_SSE2;
int SSE3Enabled = flags & HAS_SSE3;
int SSSE3Enabled = flags & HAS_SSSE3;
+ int SSE4_1Enabled = flags & HAS_SSE4_1;
/* Note:
*
@@ -157,7 +218,6 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
/* Override default functions with fastest ones for this CPU. */
#if HAVE_MMX
-
if (mmx_enabled)
{
cpi->rtcd.variance.sad16x16 = vp8_sad16x16_mmx;
@@ -177,6 +237,9 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_mmx;
cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_mmx;
cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_mmx;
+ cpi->rtcd.variance.halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_mmx;
+ cpi->rtcd.variance.halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_mmx;
+ cpi->rtcd.variance.halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_mmx;
cpi->rtcd.variance.subpixmse16x16 = vp8_sub_pixel_mse16x16_mmx;
cpi->rtcd.variance.mse16x16 = vp8_mse16x16_mmx;
@@ -186,11 +249,19 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.get8x8var = vp8_get8x8var_mmx;
cpi->rtcd.variance.get16x16var = vp8_get16x16var_mmx;
cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_mmx;
-
+#if 0 // new fdct
cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_mmx;
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_mmx;
- cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_mmx;
- cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_mmx;
+ cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_mmx;
+ cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_mmx;
+#else
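+ /* new fdct: the MMX transforms above are compiled out; use the C versions */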
+ cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c;
+ cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;
+ cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_c;
+ cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_c;
+
+#endif
+
cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c;
cpi->rtcd.encodemb.berr = vp8_block_error_mmx;
@@ -200,12 +271,11 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.encodemb.submby = vp8_subtract_mby_mmx;
cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_mmx;
- cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_mmx;
+ /*cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_mmx;*/
}
-
#endif
-#if HAVE_SSE2
+#if HAVE_SSE2
if (wmt_enabled)
{
cpi->rtcd.variance.sad16x16 = vp8_sad16x16_wmt;
@@ -225,6 +295,9 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_wmt;
cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_wmt;
cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_wmt;
+ cpi->rtcd.variance.halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_wmt;
+ cpi->rtcd.variance.halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_wmt;
+ cpi->rtcd.variance.halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_wmt;
cpi->rtcd.variance.subpixmse16x16 = vp8_sub_pixel_mse16x16_wmt;
cpi->rtcd.variance.mse16x16 = vp8_mse16x16_wmt;
@@ -235,26 +308,26 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.get16x16var = vp8_get16x16var_sse2;
/* cpi->rtcd.variance.get4x4sse_cs not implemented for wmt */;
-#if 0
- /* short SSE2 DCT currently disabled, does not match the MMX version */
- cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_wmt;
- cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_wmt;
-#endif
- /* cpi->rtcd.fdct.fast4x4 not implemented for wmt */;
- cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_wmt;
- cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_sse2;
+ cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_sse2;
+ cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_sse2;
+ cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_sse2;
+ cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_sse2;
+
+ cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_sse2;
cpi->rtcd.encodemb.berr = vp8_block_error_xmm;
cpi->rtcd.encodemb.mberr = vp8_mbblock_error_xmm;
cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_xmm;
- /* cpi->rtcd.encodemb.sub* not implemented for wmt */
+ cpi->rtcd.encodemb.subb = vp8_subtract_b_sse2;
+ cpi->rtcd.encodemb.submby = vp8_subtract_mby_sse2;
+ cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_sse2;
- cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse;
+ /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2;*/
+ cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse2;
}
-
#endif
-#if HAVE_SSE3
+#if HAVE_SSE3
if (SSE3Enabled)
{
cpi->rtcd.variance.sad16x16 = vp8_sad16x16_sse3;
@@ -272,16 +345,30 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.sad4x4x4d = vp8_sad4x4x4d_sse3;
cpi->rtcd.search.diamond_search = vp8_diamond_search_sadx4;
}
-
#endif
-#if HAVE_SSSE3
+#if HAVE_SSSE3
if (SSSE3Enabled)
{
cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_ssse3;
cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_ssse3;
+
+ cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_ssse3;
+
}
+#endif
+#if HAVE_SSE4_1
+ if (SSE4_1Enabled)
+ {
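+ /* The x8 SAD routines evaluate eight candidate positions per call, letting
+  * full search step through the window eight offsets at a time. */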
+ cpi->rtcd.variance.sad16x16x8 = vp8_sad16x16x8_sse4;
+ cpi->rtcd.variance.sad16x8x8 = vp8_sad16x8x8_sse4;
+ cpi->rtcd.variance.sad8x16x8 = vp8_sad8x16x8_sse4;
+ cpi->rtcd.variance.sad8x8x8 = vp8_sad8x8x8_sse4;
+ cpi->rtcd.variance.sad4x4x8 = vp8_sad4x4x8_sse4;
+ cpi->rtcd.search.full_search = vp8_full_search_sadx8;
+ }
#endif
+
#endif
}