Diffstat (limited to 'vp8/encoder/x86')
-rw-r--r-- | vp8/encoder/x86/csystemdependent.c | 289
-rw-r--r-- | vp8/encoder/x86/dct_mmx.asm | 846
-rw-r--r-- | vp8/encoder/x86/dct_sse2.asm | 260
-rw-r--r-- | vp8/encoder/x86/dct_x86.h | 73
-rw-r--r-- | vp8/encoder/x86/encodemb_x86.h | 73
-rw-r--r-- | vp8/encoder/x86/encodeopt.asm | 393
-rw-r--r-- | vp8/encoder/x86/fwalsh_sse2.asm | 117
-rw-r--r-- | vp8/encoder/x86/mcomp_x86.h | 27
-rw-r--r-- | vp8/encoder/x86/preproc_mmx.c | 297
-rw-r--r-- | vp8/encoder/x86/quantize_mmx.asm | 438
-rw-r--r-- | vp8/encoder/x86/sad_mmx.asm | 428
-rw-r--r-- | vp8/encoder/x86/sad_sse2.asm | 329
-rw-r--r-- | vp8/encoder/x86/sad_sse3.asm | 939
-rw-r--r-- | vp8/encoder/x86/sad_ssse3.asm | 367
-rw-r--r-- | vp8/encoder/x86/subtract_mmx.asm | 431
-rw-r--r-- | vp8/encoder/x86/variance_impl_mmx.asm | 980
-rw-r--r-- | vp8/encoder/x86/variance_impl_sse2.asm | 975
-rw-r--r-- | vp8/encoder/x86/variance_mmx.c | 596
-rw-r--r-- | vp8/encoder/x86/variance_sse2.c | 514
-rw-r--r-- | vp8/encoder/x86/variance_x86.h | 275
-rw-r--r-- | vp8/encoder/x86/x86_csystemdependent.c | 287 |
21 files changed, 8934 insertions, 0 deletions
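
The first file in the change, csystemdependent.c, publishes every hot encoder primitive (SAD, variance, DCT, quantize, subtract, error) as a writable global function pointer and binds each one exactly once at startup, from the flags returned by vpx_get_processor_flags(). A condensed C sketch of the pattern with one representative pointer (this sketch is not part of the diff):

/* Each primitive is a global function pointer, bound at startup to
 * the best implementation the detected CPU supports. */
typedef unsigned int sad_fn(unsigned char *src_ptr, int src_stride,
                            unsigned char *ref_ptr, int ref_stride);
extern sad_fn vp8_sad16x16_c, vp8_sad16x16_mmx, vp8_sad16x16_wmt;
extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled,
                                    int *wmt_enabled);

sad_fn *vp8_sad16x16;

void config_sketch(void)
{
    int mmx_enabled, xmm_enabled, wmt_enabled;
    vpx_get_processor_flags(&mmx_enabled, &xmm_enabled, &wmt_enabled);

    if (wmt_enabled)            /* "Willamette" == SSE2 */
        vp8_sad16x16 = vp8_sad16x16_wmt;
    else if (mmx_enabled)
        vp8_sad16x16 = vp8_sad16x16_mmx;
    else                        /* portable C fallback */
        vp8_sad16x16 = vp8_sad16x16_c;
}

The real function repeats this for roughly thirty pointers; note that even on the SSE2 path several assignments (the fDCTs, subtract, 4x4 variance, get4x4sse_cs) intentionally stay on their MMX versions.
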
diff --git a/vp8/encoder/x86/csystemdependent.c b/vp8/encoder/x86/csystemdependent.c new file mode 100644 index 000000000..186ee6856 --- /dev/null +++ b/vp8/encoder/x86/csystemdependent.c @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "variance.h" +#include "onyx_int.h" + +SADFunction *vp8_sad16x16; +SADFunction *vp8_sad16x8; +SADFunction *vp8_sad8x16; +SADFunction *vp8_sad8x8; +SADFunction *vp8_sad4x4; + +variance_function *vp8_variance4x4; +variance_function *vp8_variance8x8; +variance_function *vp8_variance8x16; +variance_function *vp8_variance16x8; +variance_function *vp8_variance16x16; + + +variance_function *vp8_mse16x16; + +sub_pixel_variance_function *vp8_sub_pixel_variance4x4; +sub_pixel_variance_function *vp8_sub_pixel_variance8x8; +sub_pixel_variance_function *vp8_sub_pixel_variance8x16; +sub_pixel_variance_function *vp8_sub_pixel_variance16x8; +sub_pixel_variance_function *vp8_sub_pixel_variance16x16; + +int (*vp8_block_error)(short *, short *); +int (*vp8_mbblock_error)(MACROBLOCK *mb, int dc); +void (*vp8_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, int stride); + +extern void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride); +extern void vp8_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride); + +extern int vp8_block_error_c(short *, short *); +extern int vp8_mbblock_error_c(MACROBLOCK *x, int dc); + +extern int vp8_block_error_mmx(short *, short *); +extern int vp8_mbblock_error_mmx(MACROBLOCK *x, int dc); + +extern int vp8_block_error_xmm(short *, short *); +extern int vp8_mbblock_error_xmm(MACROBLOCK *x, int dc); + + + +int (*vp8_mbuverror)(MACROBLOCK *mb); +unsigned int (*vp8_get_mb_ss)(short *); +void (*vp8_short_fdct4x4)(short *input, short *output, int pitch); +void (*vp8_short_fdct8x4)(short *input, short *output, int pitch); +void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch); +void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch); + +void (*vp8_subtract_b)(BLOCK *be, BLOCKD *bd, int pitch); +void (*vp8_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride); +void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d); +unsigned int (*vp8_get16x16pred_error)(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride); +unsigned int (*vp8_get8x8var)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); +unsigned int (*vp8_get16x16var)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); +unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride); + +// c imports +extern int vp8_mbuverror_c(MACROBLOCK *mb); +extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); +extern void vp8_short_fdct4x4_c(short *input, short *output, int pitch); +extern void vp8_short_fdct8x4_c(short *input, short *output, int pitch); +extern void vp8_fast_fdct4x4_c(short *input, short *output, int pitch); +extern void 
vp8_fast_fdct8x4_c(short *input, short *output, int pitch); + + +extern void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch); +extern void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride); +extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d); + +extern SADFunction vp8_sad16x16_c; +extern SADFunction vp8_sad16x8_c; +extern SADFunction vp8_sad8x16_c; +extern SADFunction vp8_sad8x8_c; +extern SADFunction vp8_sad4x4_c; + +extern SADFunction vp8_sad16x16_wmt; +extern SADFunction vp8_sad16x8_wmt; +extern SADFunction vp8_sad8x16_wmt; +extern SADFunction vp8_sad8x8_wmt; +extern SADFunction vp8_sad4x4_wmt; + +extern SADFunction vp8_sad16x16_mmx; +extern SADFunction vp8_sad16x8_mmx; +extern SADFunction vp8_sad8x16_mmx; +extern SADFunction vp8_sad8x8_mmx; +extern SADFunction vp8_sad4x4_mmx; + +extern variance_function vp8_variance16x16_c; +extern variance_function vp8_variance8x16_c; +extern variance_function vp8_variance16x8_c; +extern variance_function vp8_variance8x8_c; +extern variance_function vp8_variance4x4_c; +extern variance_function vp8_mse16x16_c; + +extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_c; +extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_c; +extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_c; +extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_c; +extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_c; + +extern unsigned int vp8_get_mb_ss_c(short *); +extern unsigned int vp8_get16x16pred_error_c(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride); +extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); +extern unsigned int vp8_get16x16var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); +extern unsigned int vp8_get4x4sse_cs_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride); + +// mmx imports +extern int vp8_mbuverror_mmx(MACROBLOCK *mb); +extern void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d); +extern void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch); +extern void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride); +extern void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch); +extern void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch); +extern void vp8_fast_fdct8x4_mmx(short *input, short *output, int pitch); +extern void vp8_fast_fdct4x4_mmx(short *input, short *output, int pitch); +extern variance_function vp8_variance4x4_mmx; +extern variance_function vp8_variance8x8_mmx; +extern variance_function vp8_variance8x16_mmx; +extern variance_function vp8_variance16x8_mmx; +extern variance_function vp8_variance16x16_mmx; + +extern variance_function vp8_mse16x16_mmx; +extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_mmx; +extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_mmx; +extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_mmx; +extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_mmx; +extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_mmx; + +extern unsigned int vp8_get16x16pred_error_mmx(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride); +extern unsigned int vp8_get_mb_ss_mmx(short *); +extern unsigned int vp8_get8x8var_mmx(unsigned char *src_ptr, 
int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); +extern unsigned int vp8_get16x16var_mmx(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); +extern unsigned int vp8_get4x4sse_cs_mmx(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride); + + +// wmt imports +extern int vp8_mbuverror_xmm(MACROBLOCK *mb); +extern void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d); +extern void vp8_fast_fdct8x4_wmt(short *input, short *output, int pitch); +extern variance_function vp8_variance4x4_wmt; +extern variance_function vp8_variance8x8_wmt; +extern variance_function vp8_variance8x16_wmt; +extern variance_function vp8_variance16x8_wmt; +extern variance_function vp8_variance16x16_wmt; + +extern variance_function vp8_mse16x16_wmt; +extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_wmt; +extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_wmt; +extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_wmt; +extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_wmt; +extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_wmt; +extern unsigned int vp8_get16x16pred_error_sse2(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride); +extern unsigned int vp8_get_mb_ss_sse2(short *src_ptr); +extern unsigned int vp8_get8x8var_sse2(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); +extern unsigned int vp8_get16x16var_sse2(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); + +extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled); + +void vp8_cmachine_specific_config(void) +{ + int mmx_enabled; + int xmm_enabled; + int wmt_enabled; + + vpx_get_processor_flags(&mmx_enabled, &xmm_enabled, &wmt_enabled); + + if (wmt_enabled) // Willamette + { + // Willamette instruction set available: + vp8_mbuverror = vp8_mbuverror_xmm; + vp8_fast_quantize_b = vp8_fast_quantize_b_sse; + vp8_short_fdct4x4 = vp8_short_fdct4x4_mmx; + vp8_short_fdct8x4 = vp8_short_fdct8x4_mmx; + vp8_fast_fdct4x4 = vp8_fast_fdct4x4_mmx; + vp8_fast_fdct8x4 = vp8_fast_fdct8x4_wmt; + vp8_subtract_b = vp8_subtract_b_mmx; + vp8_subtract_mbuv = vp8_subtract_mbuv_mmx; + vp8_variance4x4 = vp8_variance4x4_mmx; + vp8_variance8x8 = vp8_variance8x8_mmx; + vp8_variance8x16 = vp8_variance8x16_wmt; + vp8_variance16x8 = vp8_variance16x8_wmt; + vp8_variance16x16 = vp8_variance16x16_wmt; + vp8_mse16x16 = vp8_mse16x16_wmt; + vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_wmt; + vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_wmt; + vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_wmt; + vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_wmt; + vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_wmt; + vp8_get_mb_ss = vp8_get_mb_ss_sse2; + vp8_get16x16pred_error = vp8_get16x16pred_error_sse2; + vp8_get8x8var = vp8_get8x8var_sse2; + vp8_get16x16var = vp8_get16x16var_sse2; + vp8_get4x4sse_cs = vp8_get4x4sse_cs_mmx; + vp8_sad16x16 = vp8_sad16x16_wmt; + vp8_sad16x8 = vp8_sad16x8_wmt; + vp8_sad8x16 = vp8_sad8x16_wmt; + vp8_sad8x8 = vp8_sad8x8_wmt; + vp8_sad4x4 = vp8_sad4x4_wmt; + vp8_block_error = vp8_block_error_xmm; + vp8_mbblock_error = vp8_mbblock_error_xmm; + vp8_subtract_mby = vp8_subtract_mby_mmx; + + } + else if (mmx_enabled) + { + // MMX instruction set available: + vp8_mbuverror = 
vp8_mbuverror_mmx; + vp8_fast_quantize_b = vp8_fast_quantize_b_mmx; + vp8_short_fdct4x4 = vp8_short_fdct4x4_mmx; + vp8_short_fdct8x4 = vp8_short_fdct8x4_mmx; + vp8_fast_fdct4x4 = vp8_fast_fdct4x4_mmx; + vp8_fast_fdct8x4 = vp8_fast_fdct8x4_mmx; + vp8_subtract_b = vp8_subtract_b_mmx; + vp8_subtract_mbuv = vp8_subtract_mbuv_mmx; + vp8_variance4x4 = vp8_variance4x4_mmx; + vp8_variance8x8 = vp8_variance8x8_mmx; + vp8_variance8x16 = vp8_variance8x16_mmx; + vp8_variance16x8 = vp8_variance16x8_mmx; + vp8_variance16x16 = vp8_variance16x16_mmx; + vp8_mse16x16 = vp8_mse16x16_mmx; + vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_mmx; + vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_mmx; + vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_mmx; + vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_mmx; + vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_mmx; + vp8_get_mb_ss = vp8_get_mb_ss_mmx; + vp8_get16x16pred_error = vp8_get16x16pred_error_mmx; + vp8_get8x8var = vp8_get8x8var_mmx; + vp8_get16x16var = vp8_get16x16var_mmx; + vp8_get4x4sse_cs = vp8_get4x4sse_cs_mmx; + vp8_sad16x16 = vp8_sad16x16_mmx; + vp8_sad16x8 = vp8_sad16x8_mmx; + vp8_sad8x16 = vp8_sad8x16_mmx; + vp8_sad8x8 = vp8_sad8x8_mmx; + vp8_sad4x4 = vp8_sad4x4_mmx; + vp8_block_error = vp8_block_error_mmx; + vp8_mbblock_error = vp8_mbblock_error_mmx; + vp8_subtract_mby = vp8_subtract_mby_mmx; + + } + else + { + // Pure C: + vp8_mbuverror = vp8_mbuverror_c; + vp8_fast_quantize_b = vp8_fast_quantize_b_c; + vp8_short_fdct4x4 = vp8_short_fdct4x4_c; + vp8_short_fdct8x4 = vp8_short_fdct8x4_c; + vp8_fast_fdct4x4 = vp8_fast_fdct4x4_c; + vp8_fast_fdct8x4 = vp8_fast_fdct8x4_c; + vp8_subtract_b = vp8_subtract_b_c; + vp8_subtract_mbuv = vp8_subtract_mbuv_c; + vp8_variance4x4 = vp8_variance4x4_c; + vp8_variance8x8 = vp8_variance8x8_c; + vp8_variance8x16 = vp8_variance8x16_c; + vp8_variance16x8 = vp8_variance16x8_c; + vp8_variance16x16 = vp8_variance16x16_c; + vp8_mse16x16 = vp8_mse16x16_c; + vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_c; + vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_c; + vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_c; + vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_c; + vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_c; + vp8_get_mb_ss = vp8_get_mb_ss_c; + vp8_get16x16pred_error = vp8_get16x16pred_error_c; + vp8_get8x8var = vp8_get8x8var_c; + vp8_get16x16var = vp8_get16x16var_c; + vp8_get4x4sse_cs = vp8_get4x4sse_cs_c; + vp8_sad16x16 = vp8_sad16x16_c; + vp8_sad16x8 = vp8_sad16x8_c; + vp8_sad8x16 = vp8_sad8x16_c; + vp8_sad8x8 = vp8_sad8x8_c; + vp8_sad4x4 = vp8_sad4x4_c; + vp8_block_error = vp8_block_error_c; + vp8_mbblock_error = vp8_mbblock_error_c; + vp8_subtract_mby = vp8_subtract_mby_c; + } + +} diff --git a/vp8/encoder/x86/dct_mmx.asm b/vp8/encoder/x86/dct_mmx.asm new file mode 100644 index 000000000..e13423796 --- /dev/null +++ b/vp8/encoder/x86/dct_mmx.asm @@ -0,0 +1,846 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
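
Both DCT files in this change (dct_mmx.asm, which begins here, and dct_sse2.asm after it) implement the 4x4 forward transform as two fixed-point matrix multiplies against the dct_matrix table in their rodata: pass 1 rounds with 8192 and shifts right by _1STSTAGESHIFT (14), pass 2 rounds with 32768 and shifts by _2NDSTAGESHIFT (16). A scalar sketch of one 4-point stage, not part of the diff (the SIMD register interleaving and the exact row/column ordering between passes are not modeled):

/* One 4-point stage of the matrix-multiply fDCT.  Constants are the
 * rodata dct_matrix rows, i.e. scaled cosines: 23170 ~= cos(pi/4)*2^15,
 * 30274 ~= cos(pi/8)*2^15, 12540 ~= cos(3*pi/8)*2^15. */
static const short dct_matrix[4][4] =
{
    { 23170,  23170,  23170,  23170 },
    { 30274,  12540, -12540, -30274 },
    { 23170, -23170, -23170,  23170 },
    { 12540, -30274,  30274, -12540 },
};

/* pass 1: rounding = 8192,  shift = 14
 * pass 2: rounding = 32768, shift = 16 */
static void fdct4_stage(const short in[4], short out[4],
                        int rounding, int shift)
{
    int j, k, acc;

    for (j = 0; j < 4; j++)
    {
        acc = rounding;
        for (k = 0; k < 4; k++)
            acc += in[k] * dct_matrix[j][k];
        out[j] = (short)(acc >> shift);
    }
}
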
+; + + +%include "vpx_ports/x86_abi_support.asm" + +section .text + global sym(vp8_short_fdct4x4_mmx) + global sym(vp8_fast_fdct4x4_mmx) + global sym(vp8_fast_fdct8x4_wmt) + + +%define DCTCONSTANTSBITS (16) +%define DCTROUNDINGVALUE (1<< (DCTCONSTANTSBITS-1)) +%define x_c1 (60547) ; cos(pi /8) * (1<<15) +%define x_c2 (46341) ; cos(pi*2/8) * (1<<15) +%define x_c3 (25080) ; cos(pi*3/8) * (1<<15) + + +%define _1STSTAGESHIFT 14 +%define _2NDSTAGESHIFT 16 + +; using matrix multiply with source and destbuffer has a pitch +;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch) +sym(vp8_short_fdct4x4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;input + mov rdi, arg(1) ;output + + movsxd rax, dword ptr arg(2) ;pitch + lea rdx, [dct_matrix GLOBAL] + + movq mm0, [rsi ] + movq mm1, [rsi + rax] + + movq mm2, [rsi + rax*2] + lea rsi, [rsi + rax*2] + + movq mm3, [rsi + rax] + + ; first column + movq mm4, mm0 + movq mm7, [rdx] + + pmaddwd mm4, mm7 + movq mm5, mm1 + + pmaddwd mm5, mm7 + movq mm6, mm4 + + punpckldq mm4, mm5 + punpckhdq mm6, mm5 + + paddd mm4, mm6 + movq mm5, mm2 + + + pmaddwd mm5, mm7 + movq mm6, mm3 + + pmaddwd mm6, mm7 + movq mm7, mm5 + + punpckldq mm5, mm6 + punpckhdq mm7, mm6 + + paddd mm5, mm7 + movq mm6, [dct1st_stage_rounding_mmx GLOBAL] + + paddd mm4, mm6 + paddd mm5, mm6 + + psrad mm4, _1STSTAGESHIFT + psrad mm5, _1STSTAGESHIFT + + packssdw mm4, mm5 + movq [rdi], mm4 + + ;second column + movq mm4, mm0 + + pmaddwd mm4, [rdx+8] + movq mm5, mm1 + + pmaddwd mm5, [rdx+8] + movq mm6, mm4 + + punpckldq mm4, mm5 + punpckhdq mm6, mm5 + + paddd mm4, mm6 + movq mm5, mm2 + + pmaddwd mm5, [rdx+8] + movq mm6, mm3 + + pmaddwd mm6, [rdx+8] + movq mm7, mm5 + + punpckldq mm5, mm6 + punpckhdq mm7, mm6 + + paddd mm5, mm7 + movq mm6, [dct1st_stage_rounding_mmx GLOBAL] + + paddd mm4, mm6 + paddd mm5, mm6 + + psrad mm4, _1STSTAGESHIFT + psrad mm5, _1STSTAGESHIFT + + packssdw mm4, mm5 + movq [rdi+8], mm4 + + + ;third column + movq mm4, mm0 + + pmaddwd mm4, [rdx+16] + movq mm5, mm1 + + pmaddwd mm5, [rdx+16] + movq mm6, mm4 + + punpckldq mm4, mm5 + punpckhdq mm6, mm5 + + paddd mm4, mm6 + movq mm5, mm2 + + pmaddwd mm5, [rdx+16] + movq mm6, mm3 + + pmaddwd mm6, [rdx+16] + movq mm7, mm5 + + punpckldq mm5, mm6 + punpckhdq mm7, mm6 + + paddd mm5, mm7 + movq mm6, [dct1st_stage_rounding_mmx GLOBAL] + + paddd mm4, mm6 + paddd mm5, mm6 + + psrad mm4, _1STSTAGESHIFT + psrad mm5, _1STSTAGESHIFT + + packssdw mm4, mm5 + movq [rdi+16], mm4 + + ;fourth column (this is the last column, so we do not have save the source any more) + + pmaddwd mm0, [rdx+24] + + pmaddwd mm1, [rdx+24] + movq mm6, mm0 + + punpckldq mm0, mm1 + punpckhdq mm6, mm1 + + paddd mm0, mm6 + + pmaddwd mm2, [rdx+24] + + pmaddwd mm3, [rdx+24] + movq mm7, mm2 + + punpckldq mm2, mm3 + punpckhdq mm7, mm3 + + paddd mm2, mm7 + movq mm6, [dct1st_stage_rounding_mmx GLOBAL] + + paddd mm0, mm6 + paddd mm2, mm6 + + psrad mm0, _1STSTAGESHIFT + psrad mm2, _1STSTAGESHIFT + + packssdw mm0, mm2 + + movq mm3, mm0 + + ; done with one pass + ; now start second pass + movq mm0, [rdi ] + movq mm1, [rdi+ 8] + movq mm2, [rdi+ 16] + + movq mm4, mm0 + + pmaddwd mm4, [rdx] + movq mm5, mm1 + + pmaddwd mm5, [rdx] + movq mm6, mm4 + + punpckldq mm4, mm5 + punpckhdq mm6, mm5 + + paddd mm4, mm6 + movq mm5, mm2 + + pmaddwd mm5, [rdx] + movq mm6, mm3 + + pmaddwd mm6, [rdx] + movq mm7, mm5 + + punpckldq mm5, mm6 + punpckhdq mm7, mm6 + + paddd mm5, mm7 + movq mm6, [dct2nd_stage_rounding_mmx 
GLOBAL] + + paddd mm4, mm6 + paddd mm5, mm6 + + psrad mm4, _2NDSTAGESHIFT + psrad mm5, _2NDSTAGESHIFT + + packssdw mm4, mm5 + movq [rdi], mm4 + + ;second column + movq mm4, mm0 + + pmaddwd mm4, [rdx+8] + movq mm5, mm1 + + pmaddwd mm5, [rdx+8] + movq mm6, mm4 + + punpckldq mm4, mm5 + punpckhdq mm6, mm5 + + paddd mm4, mm6 + movq mm5, mm2 + + pmaddwd mm5, [rdx+8] + movq mm6, mm3 + + pmaddwd mm6, [rdx+8] + movq mm7, mm5 + + punpckldq mm5, mm6 + punpckhdq mm7, mm6 + + paddd mm5, mm7 + movq mm6, [dct2nd_stage_rounding_mmx GLOBAL] + + paddd mm4, mm6 + paddd mm5, mm6 + + psrad mm4, _2NDSTAGESHIFT + psrad mm5, _2NDSTAGESHIFT + + packssdw mm4, mm5 + movq [rdi+8], mm4 + + + ;third column + movq mm4, mm0 + + pmaddwd mm4, [rdx+16] + movq mm5, mm1 + + pmaddwd mm5, [rdx+16] + movq mm6, mm4 + + punpckldq mm4, mm5 + punpckhdq mm6, mm5 + + paddd mm4, mm6 + movq mm5, mm2 + + pmaddwd mm5, [rdx+16] + movq mm6, mm3 + + pmaddwd mm6, [rdx+16] + movq mm7, mm5 + + punpckldq mm5, mm6 + punpckhdq mm7, mm6 + + paddd mm5, mm7 + movq mm6, [dct2nd_stage_rounding_mmx GLOBAL] + + paddd mm4, mm6 + paddd mm5, mm6 + + psrad mm4, _2NDSTAGESHIFT + psrad mm5, _2NDSTAGESHIFT + + packssdw mm4, mm5 + movq [rdi+16], mm4 + + ;fourth column + movq mm4, mm0 + + pmaddwd mm4, [rdx+24] + movq mm5, mm1 + + pmaddwd mm5, [rdx+24] + movq mm6, mm4 + + punpckldq mm4, mm5 + punpckhdq mm6, mm5 + + paddd mm4, mm6 + movq mm5, mm2 + + pmaddwd mm5, [rdx+24] + movq mm6, mm3 + + pmaddwd mm6, [rdx+24] + movq mm7, mm5 + + punpckldq mm5, mm6 + punpckhdq mm7, mm6 + + paddd mm5, mm7 + movq mm6, [dct2nd_stage_rounding_mmx GLOBAL] + + paddd mm4, mm6 + paddd mm5, mm6 + + psrad mm4, _2NDSTAGESHIFT + psrad mm5, _2NDSTAGESHIFT + + packssdw mm4, mm5 + movq [rdi+24], mm4 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_fast_fdct4x4_mmx(short *input, short *output, int pitch) +sym(vp8_fast_fdct4x4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + GET_GOT rbx + push rsi + push rdi + ; end prolog + mov rsi, arg(0) ;input + mov rdi, arg(1) ;output + + lea rdx, [dct_const_mmx GLOBAL] + movsxd rax, dword ptr arg(2) ;pitch + + lea rcx, [rsi + rax*2] + ; read the input data + movq mm0, [rsi] + movq mm1, [rsi + rax ] + + movq mm2, [rcx] + movq mm3, [rcx + rax] + ; get the constants + ;shift to left by 1 for prescision + paddw mm0, mm0 + paddw mm1, mm1 + + psllw mm2, 1 + psllw mm3, 1 + + ; transpose for the second stage + movq mm4, mm0 ; 00 01 02 03 + movq mm5, mm2 ; 10 11 12 03 + + punpcklwd mm0, mm1 ; 00 10 01 11 + punpckhwd mm4, mm1 ; 02 12 03 13 + + punpcklwd mm2, mm3 ; 20 30 21 31 + punpckhwd mm5, mm3 ; 22 32 23 33 + + + movq mm1, mm0 ; 00 10 01 11 + punpckldq mm0, mm2 ; 00 10 20 30 + + punpckhdq mm1, mm2 ; 01 11 21 31 + + movq mm2, mm4 ; 02 12 03 13 + punpckldq mm2, mm5 ; 02 12 22 32 + + punpckhdq mm4, mm5 ; 03 13 23 33 + movq mm3, mm4 + + + ; first stage + movq mm5, mm0 + movq mm4, mm1 + + paddw mm0, mm3 ; a = 0 + 3 + paddw mm1, mm2 ; b = 1 + 2 + + psubw mm4, mm2 ; c = 1 - 2 + psubw mm5, mm3 ; d = 0 - 3 + + + ; output 0 and 2 + movq mm6, [rdx + 16] ; c2 + movq mm2, mm0 ; a + + paddw mm0, mm1 ; a + b + psubw mm2, mm1 ; a - b + + movq mm1, mm0 ; a + b + pmulhw mm0, mm6 ; 00 01 02 03 + + paddw mm0, mm1 ; output 00 01 02 03 + pmulhw mm6, mm2 ; 20 21 22 23 + + paddw mm2, mm6 ; output 20 21 22 23 + + ; output 1 and 3 + movq mm6, [rdx + 8] ; c1 + movq mm7, [rdx + 24] ; c3 + + movq mm1, mm4 ; c + movq mm3, mm5 ; d + + pmulhw mm1, mm7 ; c * c3 + pmulhw mm3, mm6 ; d * c1 + + paddw mm3, mm5 ; d * c1 rounded + paddw mm1, 
mm3 ; output 10 11 12 13 + + movq mm3, mm4 ; c + pmulhw mm5, mm7 ; d * c3 + + pmulhw mm4, mm6 ; c * c1 + paddw mm3, mm4 ; round c* c1 + + psubw mm5, mm3 ; output 30 31 32 33 + movq mm3, mm5 + + + ; done with vertical + ; transpose for the second stage + movq mm4, mm0 ; 00 01 02 03 + movq mm5, mm2 ; 10 11 12 03 + + punpcklwd mm0, mm1 ; 00 10 01 11 + punpckhwd mm4, mm1 ; 02 12 03 13 + + punpcklwd mm2, mm3 ; 20 30 21 31 + punpckhwd mm5, mm3 ; 22 32 23 33 + + + movq mm1, mm0 ; 00 10 01 11 + punpckldq mm0, mm2 ; 00 10 20 30 + + punpckhdq mm1, mm2 ; 01 11 21 31 + + movq mm2, mm4 ; 02 12 03 13 + punpckldq mm2, mm5 ; 02 12 22 32 + + punpckhdq mm4, mm5 ; 03 13 23 33 + movq mm3, mm4 + + + ; first stage + movq mm5, mm0 + movq mm4, mm1 + + paddw mm0, mm3 ; a = 0 + 3 + paddw mm1, mm2 ; b = 1 + 2 + + psubw mm4, mm2 ; c = 1 - 2 + psubw mm5, mm3 ; d = 0 - 3 + + + ; output 0 and 2 + movq mm6, [rdx + 16] ; c2 + movq mm2, mm0 ; a + paddw mm0, mm1 ; a + b + + psubw mm2, mm1 ; a - b + + movq mm1, mm0 ; a + b + pmulhw mm0, mm6 ; 00 01 02 03 + + paddw mm0, mm1 ; output 00 01 02 03 + pmulhw mm6, mm2 ; 20 21 22 23 + + paddw mm2, mm6 ; output 20 21 22 23 + + + ; output 1 and 3 + movq mm6, [rdx + 8] ; c1 + movq mm7, [rdx + 24] ; c3 + + movq mm1, mm4 ; c + movq mm3, mm5 ; d + + pmulhw mm1, mm7 ; c * c3 + pmulhw mm3, mm6 ; d * c1 + + paddw mm3, mm5 ; d * c1 rounded + paddw mm1, mm3 ; output 10 11 12 13 + + movq mm3, mm4 ; c + pmulhw mm5, mm7 ; d * c3 + + pmulhw mm4, mm6 ; c * c1 + paddw mm3, mm4 ; round c* c1 + + psubw mm5, mm3 ; output 30 31 32 33 + movq mm3, mm5 + ; done with vertical + + pcmpeqw mm4, mm4 + pcmpeqw mm5, mm5 + psrlw mm4, 15 + psrlw mm5, 15 + + paddw mm0, mm4 + paddw mm1, mm5 + paddw mm2, mm4 + paddw mm3, mm5 + + psraw mm0, 1 + psraw mm1, 1 + psraw mm2, 1 + psraw mm3, 1 + + movq [rdi ], mm0 + movq [rdi+ 8], mm1 + movq [rdi+16], mm2 + movq [rdi+24], mm3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_fast_fdct8x4_wmt(short *input, short *output, int pitch) +sym(vp8_fast_fdct8x4_wmt): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + GET_GOT rbx + push rsi + push rdi + ; end prolog + mov rsi, arg(0) ;input + mov rdi, arg(1) ;output + + lea rdx, [dct_const_xmm GLOBAL] + movsxd rax, dword ptr arg(2) ;pitch + + lea rcx, [rsi + rax*2] + ; read the input data + movdqa xmm0, [rsi] + movdqa xmm2, [rsi + rax] + + movdqa xmm4, [rcx] + movdqa xmm3, [rcx + rax] + ; get the constants + ;shift to left by 1 for prescision + psllw xmm0, 1 + psllw xmm2, 1 + + psllw xmm4, 1 + psllw xmm3, 1 + + ; transpose for the second stage + movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07 + movdqa xmm5, xmm4 ; 20 21 22 23 24 25 26 27 + + punpcklwd xmm0, xmm2 ; 00 10 01 11 02 12 03 13 + punpckhwd xmm1, xmm2 ; 04 14 05 15 06 16 07 17 + + punpcklwd xmm4, xmm3 ; 20 30 21 31 22 32 23 33 + punpckhwd xmm5, xmm3 ; 24 34 25 35 26 36 27 37 + + movdqa xmm2, xmm0 ; 00 10 01 11 02 12 03 13 + punpckldq xmm0, xmm4 ; 00 10 20 30 01 11 21 31 + + punpckhdq xmm2, xmm4 ; 02 12 22 32 03 13 23 33 + + + movdqa xmm4, xmm1 ; 04 14 05 15 06 16 07 17 + punpckldq xmm4, xmm5 ; 04 14 24 34 05 15 25 35 + + punpckhdq xmm1, xmm5 ; 06 16 26 36 07 17 27 37 + movdqa xmm3, xmm2 ; 02 12 22 32 03 13 23 33 + + punpckhqdq xmm3, xmm1 ; 03 13 23 33 07 17 27 37 + punpcklqdq xmm2, xmm1 ; 02 12 22 32 06 16 26 36 + + movdqa xmm1, xmm0 ; 00 10 20 30 01 11 21 31 + punpcklqdq xmm0, xmm4 ; 00 10 20 30 04 14 24 34 + + punpckhqdq xmm1, xmm4 ; 01 11 21 32 05 15 25 35 + + ; xmm0 0 + ; xmm1 1 + ; xmm2 2 + ; xmm3 3 + + ; first stage + 
movdqa xmm5, xmm0 + movdqa xmm4, xmm1 + + paddw xmm0, xmm3 ; a = 0 + 3 + paddw xmm1, xmm2 ; b = 1 + 2 + + psubw xmm4, xmm2 ; c = 1 - 2 + psubw xmm5, xmm3 ; d = 0 - 3 + + + ; output 0 and 2 + movdqa xmm6, [rdx + 32] ; c2 + movdqa xmm2, xmm0 ; a + + paddw xmm0, xmm1 ; a + b + psubw xmm2, xmm1 ; a - b + + movdqa xmm1, xmm0 ; a + b + pmulhw xmm0, xmm6 ; 00 01 02 03 + + paddw xmm0, xmm1 ; output 00 01 02 03 + pmulhw xmm6, xmm2 ; 20 21 22 23 + + paddw xmm2, xmm6 ; output 20 21 22 23 + + ; output 1 and 3 + movdqa xmm6, [rdx + 16] ; c1 + movdqa xmm7, [rdx + 48] ; c3 + + movdqa xmm1, xmm4 ; c + movdqa xmm3, xmm5 ; d + + pmulhw xmm1, xmm7 ; c * c3 + pmulhw xmm3, xmm6 ; d * c1 + + paddw xmm3, xmm5 ; d * c1 rounded + paddw xmm1, xmm3 ; output 10 11 12 13 + + movdqa xmm3, xmm4 ; c + pmulhw xmm5, xmm7 ; d * c3 + + pmulhw xmm4, xmm6 ; c * c1 + paddw xmm3, xmm4 ; round c* c1 + + psubw xmm5, xmm3 ; output 30 31 32 33 + movdqa xmm3, xmm5 + + + ; done with vertical + ; transpose for the second stage + movdqa xmm4, xmm2 ; 02 12 22 32 06 16 26 36 + movdqa xmm2, xmm1 ; 01 11 21 31 05 15 25 35 + + movdqa xmm1, xmm0 ; 00 10 20 30 04 14 24 34 + movdqa xmm5, xmm4 ; 02 12 22 32 06 16 26 36 + + punpcklwd xmm0, xmm2 ; 00 01 10 11 20 21 30 31 + punpckhwd xmm1, xmm2 ; 04 05 14 15 24 25 34 35 + + punpcklwd xmm4, xmm3 ; 02 03 12 13 22 23 32 33 + punpckhwd xmm5, xmm3 ; 06 07 16 17 26 27 36 37 + + movdqa xmm2, xmm0 ; 00 01 10 11 20 21 30 31 + punpckldq xmm0, xmm4 ; 00 01 02 03 10 11 12 13 + + punpckhdq xmm2, xmm4 ; 20 21 22 23 30 31 32 33 + + + movdqa xmm4, xmm1 ; 04 05 14 15 24 25 34 35 + punpckldq xmm4, xmm5 ; 04 05 06 07 14 15 16 17 + + punpckhdq xmm1, xmm5 ; 24 25 26 27 34 35 36 37 + movdqa xmm3, xmm2 ; 20 21 22 23 30 31 32 33 + + punpckhqdq xmm3, xmm1 ; 30 31 32 33 34 35 36 37 + punpcklqdq xmm2, xmm1 ; 20 21 22 23 24 25 26 27 + + movdqa xmm1, xmm0 ; 00 01 02 03 10 11 12 13 + punpcklqdq xmm0, xmm4 ; 00 01 02 03 04 05 06 07 + + punpckhqdq xmm1, xmm4 ; 10 11 12 13 14 15 16 17 + + ; first stage + movdqa xmm5, xmm0 + movdqa xmm4, xmm1 + + paddw xmm0, xmm3 ; a = 0 + 3 + paddw xmm1, xmm2 ; b = 1 + 2 + + psubw xmm4, xmm2 ; c = 1 - 2 + psubw xmm5, xmm3 ; d = 0 - 3 + + + ; output 0 and 2 + movdqa xmm6, [rdx + 32] ; c2 + movdqa xmm2, xmm0 ; a + + paddw xmm0, xmm1 ; a + b + psubw xmm2, xmm1 ; a - b + + movdqa xmm1, xmm0 ; a + b + pmulhw xmm0, xmm6 ; 00 01 02 03 + + paddw xmm0, xmm1 ; output 00 01 02 03 + pmulhw xmm6, xmm2 ; 20 21 22 23 + + paddw xmm2, xmm6 ; output 20 21 22 23 + + ; output 1 and 3 + movdqa xmm6, [rdx + 16] ; c1 + movdqa xmm7, [rdx + 48] ; c3 + + movdqa xmm1, xmm4 ; c + movdqa xmm3, xmm5 ; d + + pmulhw xmm1, xmm7 ; c * c3 + pmulhw xmm3, xmm6 ; d * c1 + + paddw xmm3, xmm5 ; d * c1 rounded + paddw xmm1, xmm3 ; output 10 11 12 13 + + movdqa xmm3, xmm4 ; c + pmulhw xmm5, xmm7 ; d * c3 + + pmulhw xmm4, xmm6 ; c * c1 + paddw xmm3, xmm4 ; round c* c1 + + psubw xmm5, xmm3 ; output 30 31 32 33 + movdqa xmm3, xmm5 + ; done with vertical + + + pcmpeqw xmm4, xmm4 + pcmpeqw xmm5, xmm5; + psrlw xmm4, 15 + psrlw xmm5, 15 + + paddw xmm0, xmm4 + paddw xmm1, xmm5 + paddw xmm2, xmm4 + paddw xmm3, xmm5 + + psraw xmm0, 1 + psraw xmm1, 1 + psraw xmm2, 1 + psraw xmm3, 1 + + movq QWORD PTR[rdi ], xmm0 + movq QWORD PTR[rdi+ 8], xmm1 + movq QWORD PTR[rdi+16], xmm2 + movq QWORD PTR[rdi+24], xmm3 + + psrldq xmm0, 8 + psrldq xmm1, 8 + psrldq xmm2, 8 + psrldq xmm3, 8 + + movq QWORD PTR[rdi+32], xmm0 + movq QWORD PTR[rdi+40], xmm1 + movq QWORD PTR[rdi+48], xmm2 + movq QWORD PTR[rdi+56], xmm3 + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + 
UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +;static const unsigned int dct1st_stage_rounding_mmx[2] = +align 16 +dct1st_stage_rounding_mmx: + times 2 dd 8192 + + +;static const unsigned int dct2nd_stage_rounding_mmx[2] = +align 16 +dct2nd_stage_rounding_mmx: + times 2 dd 32768 + + +;static const short dct_matrix[4][4]= +align 16 +dct_matrix: + times 4 dw 23170 + + dw 30274 + dw 12540 + dw -12540 + dw -30274 + + dw 23170 + times 2 dw -23170 + dw 23170 + + dw 12540 + dw -30274 + dw 30274 + dw -12540 + + +;static const unsigned short dct_const_mmx[4 * 4]= +align 16 +dct_const_mmx: + times 4 dw 0 + times 4 dw 60547 + times 4 dw 46341 + times 4 dw 25080 + + +;static const unsigned short dct_const_xmm[8 * 4]= +align 16 +dct_const_xmm: + times 8 dw 0 + times 8 dw 60547 + times 8 dw 46341 + times 8 dw 25080 diff --git a/vp8/encoder/x86/dct_sse2.asm b/vp8/encoder/x86/dct_sse2.asm new file mode 100644 index 000000000..3e5e9a70c --- /dev/null +++ b/vp8/encoder/x86/dct_sse2.asm @@ -0,0 +1,260 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +global sym(vp8_short_fdct4x4_wmt) + +%define DCTCONSTANTSBITS (16) +%define DCTROUNDINGVALUE (1<< (DCTCONSTANTSBITS-1)) +%define x_c1 (60547) ; cos(pi /8) * (1<<15) +%define x_c2 (46341) ; cos(pi*2/8) * (1<<15) +%define x_c3 (25080) ; cos(pi*3/8) * (1<<15) + +%define _1STSTAGESHIFT 14 +%define _2NDSTAGESHIFT 16 + + +;; using matrix multiply +;void vp8_short_fdct4x4_wmt(short *input, short *output) +sym(vp8_short_fdct4x4_wmt): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + GET_GOT rbx + ; end prolog + + mov rax, arg(0) ;input + mov rcx, arg(1) ;output + + lea rdx, [dct_matrix_sse2 GLOBAL] + + movdqu xmm0, [rax ] + movdqu xmm1, [rax+16] + + ; first column + movdqa xmm2, xmm0 + movdqa xmm7, [rdx] + + pmaddwd xmm2, xmm7 + movdqa xmm3, xmm1 + + pmaddwd xmm3, xmm7 + movdqa xmm4, xmm2 + + punpckldq xmm2, xmm3 + punpckhdq xmm4, xmm3 + + movdqa xmm3, xmm2 + punpckldq xmm2, xmm4 + + punpckhdq xmm3, xmm4 + paddd xmm2, xmm3 + + + paddd xmm2, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL] + psrad xmm2, _1STSTAGESHIFT + ;second column + movdqa xmm3, xmm0 + pmaddwd xmm3, [rdx+16] + + movdqa xmm4, xmm1 + pmaddwd xmm4, [rdx+16] + + movdqa xmm5, xmm3 + punpckldq xmm3, xmm4 + + punpckhdq xmm5, xmm4 + movdqa xmm4, xmm3 + + punpckldq xmm3, xmm5 + punpckhdq xmm4, xmm5 + + paddd xmm3, xmm4 + paddd xmm3, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL] + + + psrad xmm3, _1STSTAGESHIFT + packssdw xmm2, xmm3 + + ;third column + movdqa xmm3, xmm0 + pmaddwd xmm3, [rdx+32] + + movdqa xmm4, xmm1 + pmaddwd xmm4, [rdx+32] + + movdqa xmm5, xmm3 + punpckldq xmm3, xmm4 + + punpckhdq xmm5, xmm4 + movdqa xmm4, xmm3 + + punpckldq xmm3, xmm5 + punpckhdq xmm4, xmm5 + + paddd xmm3, xmm4 + paddd xmm3, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL] + + psrad xmm3, _1STSTAGESHIFT + + ;fourth column (this is the last column, so we do not have save the source any more) + pmaddwd xmm0, [rdx+48] + pmaddwd xmm1, [rdx+48] + + movdqa xmm4, xmm0 + punpckldq xmm0, xmm1 + + punpckhdq xmm4, xmm1 + movdqa xmm1, xmm0 + + punpckldq xmm0, xmm4 + punpckhdq xmm1, xmm4 + + paddd xmm0, xmm1 + paddd xmm0, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL] + + + psrad xmm0, _1STSTAGESHIFT 
+ packssdw xmm3, xmm0 + ; done with one pass + ; now start second pass + movdqa xmm0, xmm2 + movdqa xmm1, xmm3 + + pmaddwd xmm2, xmm7 + pmaddwd xmm3, xmm7 + + movdqa xmm4, xmm2 + punpckldq xmm2, xmm3 + + punpckhdq xmm4, xmm3 + movdqa xmm3, xmm2 + + punpckldq xmm2, xmm4 + punpckhdq xmm3, xmm4 + + paddd xmm2, xmm3 + paddd xmm2, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL] + + psrad xmm2, _2NDSTAGESHIFT + + ;second column + movdqa xmm3, xmm0 + pmaddwd xmm3, [rdx+16] + + movdqa xmm4, xmm1 + pmaddwd xmm4, [rdx+16] + + movdqa xmm5, xmm3 + punpckldq xmm3, xmm4 + + punpckhdq xmm5, xmm4 + movdqa xmm4, xmm3 + + punpckldq xmm3, xmm5 + punpckhdq xmm4, xmm5 + + paddd xmm3, xmm4 + paddd xmm3, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL] + + psrad xmm3, _2NDSTAGESHIFT + packssdw xmm2, xmm3 + + movdqu [rcx], xmm2 + ;third column + movdqa xmm3, xmm0 + pmaddwd xmm3, [rdx+32] + + movdqa xmm4, xmm1 + pmaddwd xmm4, [rdx+32] + + movdqa xmm5, xmm3 + punpckldq xmm3, xmm4 + + punpckhdq xmm5, xmm4 + movdqa xmm4, xmm3 + + punpckldq xmm3, xmm5 + punpckhdq xmm4, xmm5 + + paddd xmm3, xmm4 + paddd xmm3, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL] + + psrad xmm3, _2NDSTAGESHIFT + ;fourth column + pmaddwd xmm0, [rdx+48] + pmaddwd xmm1, [rdx+48] + + movdqa xmm4, xmm0 + punpckldq xmm0, xmm1 + + punpckhdq xmm4, xmm1 + movdqa xmm1, xmm0 + + punpckldq xmm0, xmm4 + punpckhdq xmm1, xmm4 + + paddd xmm0, xmm1 + paddd xmm0, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL] + + psrad xmm0, _2NDSTAGESHIFT + packssdw xmm3, xmm0 + + movdqu [rcx+16], xmm3 + + mov rsp, rbp + ; begin epilog + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +;static unsigned int dct1st_stage_rounding_sse2[4] = +align 16 +dct1st_stage_rounding_sse2: + times 4 dd 8192 + + +;static unsigned int dct2nd_stage_rounding_sse2[4] = +align 16 +dct2nd_stage_rounding_sse2: + times 4 dd 32768 + +;static short dct_matrix_sse2[4][8]= +align 16 +dct_matrix_sse2: + times 8 dw 23170 + + dw 30274 + dw 12540 + dw -12540 + dw -30274 + dw 30274 + dw 12540 + dw -12540 + dw -30274 + + dw 23170 + times 2 dw -23170 + times 2 dw 23170 + times 2 dw -23170 + dw 23170 + + dw 12540 + dw -30274 + dw 30274 + dw -12540 + dw 12540 + dw -30274 + dw 30274 + dw -12540 diff --git a/vp8/encoder/x86/dct_x86.h b/vp8/encoder/x86/dct_x86.h new file mode 100644 index 000000000..bc80e64ef --- /dev/null +++ b/vp8/encoder/x86/dct_x86.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef DCT_X86_H +#define DCT_X86_H + + +/* Note: + * + * This platform is commonly built for runtime CPU detection. 
If you modify + * any of the function mappings present in this file, be sure to also update + * them in the function pointer initialization code + */ +#if HAVE_MMX +extern prototype_fdct(vp8_short_fdct4x4_mmx); +extern prototype_fdct(vp8_short_fdct8x4_mmx); +extern prototype_fdct(vp8_fast_fdct4x4_mmx); +extern prototype_fdct(vp8_fast_fdct8x4_mmx); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_fdct_short4x4 +#define vp8_fdct_short4x4 vp8_short_fdct4x4_mmx + +#undef vp8_fdct_short8x4 +#define vp8_fdct_short8x4 vp8_short_fdct8x4_mmx + +#undef vp8_fdct_fast4x4 +#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_mmx + +#undef vp8_fdct_fast8x4 +#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_mmx + +#endif +#endif + + +#if HAVE_SSE2 +extern prototype_fdct(vp8_short_fdct4x4_wmt); +extern prototype_fdct(vp8_short_fdct8x4_wmt); +extern prototype_fdct(vp8_fast_fdct8x4_wmt); + +extern prototype_fdct(vp8_short_walsh4x4_sse2); + +#if !CONFIG_RUNTIME_CPU_DETECT + +#if 0 +/* short SSE2 DCT currently disabled, does not match the MMX version */ +#undef vp8_fdct_short4x4 +#define vp8_fdct_short4x4 vp8_short_fdct4x4_wmt + +#undef vp8_fdct_short8x4 +#define vp8_fdct_short8x4 vp8_short_fdct8x4_wmt +#endif + +#undef vp8_fdct_fast8x4 +#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_wmt + +#undef vp8_fdct_walsh_short4x4 +#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_sse2 + +#endif + + +#endif + +#endif diff --git a/vp8/encoder/x86/encodemb_x86.h b/vp8/encoder/x86/encodemb_x86.h new file mode 100644 index 000000000..9397a6cca --- /dev/null +++ b/vp8/encoder/x86/encodemb_x86.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef ENCODEMB_X86_H +#define ENCODEMB_X86_H + + +/* Note: + * + * This platform is commonly built for runtime CPU detection. 
If you modify + * any of the function mappings present in this file, be sure to also update + * them in the function pointer initialization code + */ +#if HAVE_MMX +extern prototype_berr(vp8_block_error_mmx); +extern prototype_mberr(vp8_mbblock_error_mmx); +extern prototype_mbuverr(vp8_mbuverror_mmx); +extern prototype_subb(vp8_subtract_b_mmx); +extern prototype_submby(vp8_subtract_mby_mmx); +extern prototype_submbuv(vp8_subtract_mbuv_mmx); + + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_encodemb_berr +#define vp8_encodemb_berr vp8_block_error_mmx + +#undef vp8_encodemb_mberr +#define vp8_encodemb_mberr vp8_mbblock_error_mmx + +#undef vp8_encodemb_mbuverr +#define vp8_encodemb_mbuverr vp8_mbuverror_mmx + +#undef vp8_encodemb_subb +#define vp8_encodemb_subb vp8_subtract_b_mmx + +#undef vp8_encodemb_submby +#define vp8_encodemb_submby vp8_subtract_mby_mmx + +#undef vp8_encodemb_submbuv +#define vp8_encodemb_submbuv vp8_subtract_mbuv_mmx + +#endif +#endif + + +#if HAVE_SSE2 +extern prototype_berr(vp8_block_error_xmm); +extern prototype_mberr(vp8_mbblock_error_xmm); +extern prototype_mbuverr(vp8_mbuverror_xmm); + + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_encodemb_berr +#define vp8_encodemb_berr vp8_block_error_xmm + +#undef vp8_encodemb_mberr +#define vp8_encodemb_mberr vp8_mbblock_error_xmm + +#undef vp8_encodemb_mbuverr +#define vp8_encodemb_mbuverr vp8_mbuverror_xmm + +#endif +#endif + + +#endif diff --git a/vp8/encoder/x86/encodeopt.asm b/vp8/encoder/x86/encodeopt.asm new file mode 100644 index 000000000..194047155 --- /dev/null +++ b/vp8/encoder/x86/encodeopt.asm @@ -0,0 +1,393 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
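
The error routines in encodeopt.asm below all compute the same quantity: a sum of squared differences between the transform coefficients and their dequantized reconstruction, accumulated with psubw/pmaddwd. A scalar model of the per-block kernel, not part of the diff:

/* Per-block kernel behind vp8_block_error_*: SSD between coeff and
 * dqcoeff.  vp8_mbblock_error loops this over the 16 luma blocks,
 * skipping coefficient 0 when its dc argument is nonzero (the asm
 * builds that mask with movd/pcmpeqw); vp8_mbuverror loops it over
 * the 8 chroma blocks with no mask. */
static int block_error_model(const short *coeff, const short *dqcoeff,
                             int skip_dc)
{
    int i, err = 0;

    for (i = skip_dc ? 1 : 0; i < 16; i++)
    {
        int d = coeff[i] - dqcoeff[i];
        err += d * d;
    }
    return err;
}
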
+; + + +%include "vpx_ports/x86_abi_support.asm" + + +;int vp8_block_error_xmm(short *coeff_ptr, short *dcoef_ptr) +global sym(vp8_block_error_xmm) +sym(vp8_block_error_xmm): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + push rsi + push rdi + ; end prolog + + + mov rsi, arg(0) ;coeff_ptr + pxor xmm7, xmm7 + + mov rdi, arg(1) ;dcoef_ptr + movdqa xmm3, [rsi] + + movdqa xmm4, [rdi] + movdqa xmm5, [rsi+16] + + movdqa xmm6, [rdi+16] + pxor xmm1, xmm1 ; from movd xmm1, dc; dc=0 + + movdqa xmm2, xmm7 + psubw xmm5, xmm6 + + por xmm1, xmm2 + pmaddwd xmm5, xmm5 + + pcmpeqw xmm1, xmm7 + psubw xmm3, xmm4 + + pand xmm1, xmm3 + pmaddwd xmm1, xmm1 + + paddd xmm1, xmm5 + movdqa xmm0, xmm1 + + punpckldq xmm0, xmm7 + punpckhdq xmm1, xmm7 + + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + + psrldq xmm0, 8 + paddd xmm0, xmm1 + + movd rax, xmm0 + + pop rdi + pop rsi + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;int vp8_block_error_mmx(short *coeff_ptr, short *dcoef_ptr) +global sym(vp8_block_error_mmx) +sym(vp8_block_error_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + push rsi + push rdi + ; end prolog + + + mov rsi, arg(0) ;coeff_ptr + pxor mm7, mm7 + + mov rdi, arg(1) ;dcoef_ptr + movq mm3, [rsi] + + movq mm4, [rdi] + movq mm5, [rsi+8] + + movq mm6, [rdi+8] + pxor mm1, mm1 ; from movd mm1, dc ; dc =0 + + movq mm2, mm7 + psubw mm5, mm6 + + por mm1, mm2 + pmaddwd mm5, mm5 + + pcmpeqw mm1, mm7 + psubw mm3, mm4 + + pand mm1, mm3 + pmaddwd mm1, mm1 + + paddd mm1, mm5 + movq mm3, [rsi+16] + + movq mm4, [rdi+16] + movq mm5, [rsi+24] + + movq mm6, [rdi+24] + psubw mm5, mm6 + + pmaddwd mm5, mm5 + psubw mm3, mm4 + + pmaddwd mm3, mm3 + paddd mm3, mm5 + + paddd mm1, mm3 + movq mm0, mm1 + + psrlq mm1, 32 + paddd mm0, mm1 + + movd rax, mm0 + + pop rdi + pop rsi + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc); +global sym(vp8_mbblock_error_mmx_impl) +sym(vp8_mbblock_error_mmx_impl): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + push rsi + push rdi + ; end prolog + + + mov rsi, arg(0) ;coeff_ptr + pxor mm7, mm7 + + mov rdi, arg(1) ;dcoef_ptr + pxor mm2, mm2 + + movd mm1, dword ptr arg(2) ;dc + por mm1, mm2 + + pcmpeqw mm1, mm7 + mov rcx, 16 + +mberror_loop_mmx: + movq mm3, [rsi] + movq mm4, [rdi] + + movq mm5, [rsi+8] + movq mm6, [rdi+8] + + + psubw mm5, mm6 + pmaddwd mm5, mm5 + + psubw mm3, mm4 + pand mm3, mm1 + + pmaddwd mm3, mm3 + paddd mm2, mm5 + + paddd mm2, mm3 + movq mm3, [rsi+16] + + movq mm4, [rdi+16] + movq mm5, [rsi+24] + + movq mm6, [rdi+24] + psubw mm5, mm6 + + pmaddwd mm5, mm5 + psubw mm3, mm4 + + pmaddwd mm3, mm3 + paddd mm2, mm5 + + paddd mm2, mm3 + add rsi, 32 + + add rdi, 32 + sub rcx, 1 + + jnz mberror_loop_mmx + + movq mm0, mm2 + psrlq mm2, 32 + + paddd mm0, mm2 + movd rax, mm0 + + pop rdi + pop rsi + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc); +global sym(vp8_mbblock_error_xmm_impl) +sym(vp8_mbblock_error_xmm_impl): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + push rsi + push rdi + ; end prolog + + + mov rsi, arg(0) ;coeff_ptr + pxor xmm7, xmm7 + + mov rdi, arg(1) ;dcoef_ptr + pxor xmm2, xmm2 + + movd xmm1, dword ptr arg(2) ;dc + por xmm1, xmm2 + + pcmpeqw xmm1, xmm7 + mov rcx, 16 + +mberror_loop: + movdqa xmm3, [rsi] + movdqa xmm4, [rdi] + + movdqa xmm5, [rsi+16] + movdqa xmm6, [rdi+16] + + + psubw xmm5, xmm6 + pmaddwd xmm5, xmm5 + + psubw xmm3, xmm4 + pand xmm3, xmm1 + + pmaddwd xmm3, xmm3 + add rsi, 
32 + + add rdi, 32 + + sub rcx, 1 + paddd xmm2, xmm5 + + paddd xmm2, xmm3 + jnz mberror_loop + + movdqa xmm0, xmm2 + punpckldq xmm0, xmm7 + + punpckhdq xmm2, xmm7 + paddd xmm0, xmm2 + + movdqa xmm1, xmm0 + psrldq xmm0, 8 + + paddd xmm0, xmm1 + movd rax, xmm0 + + pop rdi + pop rsi + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr); +global sym(vp8_mbuverror_mmx_impl) +sym(vp8_mbuverror_mmx_impl): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + push rsi + push rdi + ; end prolog + + + mov rsi, arg(0) ;s_ptr + mov rdi, arg(1) ;d_ptr + + mov rcx, 16 + pxor mm7, mm7 + +mbuverror_loop_mmx: + + movq mm1, [rsi] + movq mm2, [rdi] + + psubw mm1, mm2 + pmaddwd mm1, mm1 + + + movq mm3, [rsi+8] + movq mm4, [rdi+8] + + psubw mm3, mm4 + pmaddwd mm3, mm3 + + + paddd mm7, mm1 + paddd mm7, mm3 + + + add rsi, 16 + add rdi, 16 + + dec rcx + jnz mbuverror_loop_mmx + + movq mm0, mm7 + psrlq mm7, 32 + + paddd mm0, mm7 + movd rax, mm0 + + pop rdi + pop rsi + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr); +global sym(vp8_mbuverror_xmm_impl) +sym(vp8_mbuverror_xmm_impl): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + push rsi + push rdi + ; end prolog + + + mov rsi, arg(0) ;s_ptr + mov rdi, arg(1) ;d_ptr + + mov rcx, 16 + pxor xmm7, xmm7 + +mbuverror_loop: + + movdqa xmm1, [rsi] + movdqa xmm2, [rdi] + + psubw xmm1, xmm2 + pmaddwd xmm1, xmm1 + + paddd xmm7, xmm1 + + add rsi, 16 + add rdi, 16 + + dec rcx + jnz mbuverror_loop + + pxor xmm0, xmm0 + movdqa xmm1, xmm7 + + movdqa xmm2, xmm1 + punpckldq xmm1, xmm0 + + punpckhdq xmm2, xmm0 + paddd xmm1, xmm2 + + movdqa xmm2, xmm1 + + psrldq xmm1, 8 + paddd xmm1, xmm2 + + movd rax, xmm1 + + pop rdi + pop rsi + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret diff --git a/vp8/encoder/x86/fwalsh_sse2.asm b/vp8/encoder/x86/fwalsh_sse2.asm new file mode 100644 index 000000000..7d8620178 --- /dev/null +++ b/vp8/encoder/x86/fwalsh_sse2.asm @@ -0,0 +1,117 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
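
fwalsh_sse2.asm, which begins here, is the 4x4 Walsh-Hadamard transform applied to the second-order DC block. Its butterfly is spelled out in the scalar comments embedded in the routine; below is a C model reconstructed from those comments, not part of the diff (the column indexing ip[0]/ip[4]/ip[8]/ip[12] is this sketch's assumption, and the transpose between passes is elided):

/* One Walsh pass per the asm's own comment block.  The final pass
 * rounds positive values up before halving, which the asm does with
 * pcmpgtw/psrlw 15. */
static void walsh4x4_pass(const short *ip, short *op, int final)
{
    int i;

    for (i = 0; i < 4; i++, ip++, op++)
    {
        int a1 = ip[0] + ip[12];
        int b1 = ip[4] + ip[8];
        int c1 = ip[4] - ip[8];
        int d1 = ip[0] - ip[12];

        int a2 = a1 + b1;
        int b2 = c1 + d1;
        int c2 = a1 - b1;
        int d2 = d1 - c1;

        if (final)
        {
            a2 += (a2 > 0);  b2 += (b2 > 0);
            c2 += (c2 > 0);  d2 += (d2 > 0);
            a2 >>= 1;  b2 >>= 1;  c2 >>= 1;  d2 >>= 1;
        }
        op[0]  = (short)a2;
        op[4]  = (short)b2;
        op[8]  = (short)c2;
        op[12] = (short)d2;
    }
}
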
+; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch) +global sym(vp8_short_walsh4x4_sse2) +sym(vp8_short_walsh4x4_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) + mov rdi, arg(1) + + movdqu xmm4, [rsi + 0] ;ip[4] ip[0] + movdqu xmm0, [rsi + 16] ;ip[12] ip[8] + + pxor xmm7, xmm7 + ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ; 13 12 11 10 03 02 01 00 + ; + ; 33 32 31 30 23 22 21 20 + ; + movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00 + punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00 + punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10 + movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00 + punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00 + punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02 + ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] + movdqa xmm3, xmm4 ;ip[4] ip[0] + + paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 + psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 + + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm3 ;d1 a1 + punpckhqdq xmm5, xmm3 ;c1 b1 + + movdqa xmm1, xmm5 ;c1 b1 + paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0] + psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] + ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ; 13 12 11 10 03 02 01 00 + ; + ; 33 32 31 30 23 22 21 20 + ; + movdqa xmm0, xmm5 ; 13 12 11 10 03 02 01 00 + punpcklwd xmm5, xmm4 ; 23 03 22 02 21 01 20 00 + punpckhwd xmm0, xmm4 ; 33 13 32 12 31 11 30 10 + movdqa xmm1, xmm5 ; 23 03 22 02 21 01 20 00 + punpcklwd xmm5, xmm0 ; 31 21 11 01 30 20 10 00 + punpckhwd xmm1, xmm0 ; 33 23 13 03 32 22 12 02 + ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] + movdqa xmm3, xmm5 ;ip[4] ip[0] + + paddw xmm5, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 + psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 + + movdqa xmm6, xmm5 + punpcklqdq xmm5, xmm3 ;d1 a1 + punpckhqdq xmm6, xmm3 ;c1 b1 + + movdqa xmm1, xmm6 ;c1 b1 + paddw xmm6, xmm5 ;dl+cl a1+b1 aka op[4] op[0] + psubw xmm5, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] + + movdqa xmm0, xmm6 ;aka b2 a2 + movdqa xmm1, xmm5 ;aka d2 c2 + + pcmpgtw xmm0, xmm7 + pcmpgtw xmm1, xmm7 + + psrlw xmm0, 15 + psrlw xmm1, 15 + + paddw xmm6, xmm0 + paddw xmm5, xmm1 + + psraw xmm6, 1 + psraw xmm5, 1 + + ; a2 = a1 + b1; + ; b2 = c1 + d1; + ; c2 = a1 - b1; + ; d2 = d1 - c1; + ; a2 += (a2>0); + ; b2 += (b2>0); + ; c2 += (c2>0); + ; d2 += (d2>0); + ; op[0] = (a2)>>1; + ; op[4] = (b2)>>1; + ; op[8] = (c2)>>1; + ; op[12]= (d2)>>1; + + movdqu [rdi + 0], xmm6 + movdqu [rdi + 16], xmm5 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret diff --git a/vp8/encoder/x86/mcomp_x86.h b/vp8/encoder/x86/mcomp_x86.h new file mode 100644 index 000000000..5661491ad --- /dev/null +++ b/vp8/encoder/x86/mcomp_x86.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#ifndef MCOMP_X86_H +#define MCOMP_X86_H + +#if HAVE_SSE3 +#if !CONFIG_RUNTIME_CPU_DETECT + +#undef vp8_search_full_search +#define vp8_search_full_search vp8_full_search_sadx3 + +#undef vp8_search_diamond_search +#define vp8_search_diamond_search vp8_diamond_search_sadx4 + +#endif +#endif + +#endif + diff --git a/vp8/encoder/x86/preproc_mmx.c b/vp8/encoder/x86/preproc_mmx.c new file mode 100644 index 000000000..69617ca47 --- /dev/null +++ b/vp8/encoder/x86/preproc_mmx.c @@ -0,0 +1,297 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "memory.h" +#include "preproc.h" +#include "pragmas.h" + +/**************************************************************************** +* Macros +****************************************************************************/ +#define FRAMECOUNT 7 +#define ROUNDUP32(X) ( ( ( (unsigned long) X ) + 31 )&( 0xFFFFFFE0 ) ) + +/**************************************************************************** +* Imports +****************************************************************************/ +extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled); + +/**************************************************************************** +* Exported Global Variables +****************************************************************************/ +void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength); + +/**************************************************************************** + * + * ROUTINE : temp_filter_wmt + * + * INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance. + * unsigned char *s : Pointer to source frame. + * unsigned char *d : Pointer to destination frame. + * int bytes : Number of bytes to filter. + * int strength : Strength of filter to apply. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Performs a closesness adjusted temporarl blur + * + * SPECIAL NOTES : Destination frame can be same as source frame. 
+ * + ****************************************************************************/ +void temp_filter_wmt +( + pre_proc_instance *ppi, + unsigned char *s, + unsigned char *d, + int bytes, + int strength +) +{ + int byte = 0; + unsigned char *frameptr = ppi->frame_buffer; + + __declspec(align(16)) unsigned short threes[] = { 3, 3, 3, 3, 3, 3, 3, 3}; + __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16, 16, 16, 16, 16}; + + if (ppi->frame == 0) + { + do + { + int i; + int frame = 0; + + do + { + for (i = 0; i < 8; i++) + { + *frameptr = s[byte+i]; + ++frameptr; + } + + ++frame; + } + while (frame < FRAMECOUNT); + + for (i = 0; i < 8; i++) + d[byte+i] = s[byte+i]; + + byte += 8; + + } + while (byte < bytes); + } + else + { + int i; + int offset2 = (ppi->frame % FRAMECOUNT); + + do + { + __declspec(align(16)) unsigned short counts[8]; + __declspec(align(16)) unsigned short sums[8]; + __asm + { + mov eax, offset2 + mov edi, s // source pixels + pxor xmm1, xmm1 // accumulator + + pxor xmm7, xmm7 + + mov esi, frameptr // accumulator + pxor xmm2, xmm2 // count + + movq xmm3, QWORD PTR [edi] + + movq QWORD PTR [esi+8*eax], xmm3 + + punpcklbw xmm3, xmm2 // xmm3 source pixels + mov ecx, FRAMECOUNT + + next_frame: + movq xmm4, QWORD PTR [esi] // get frame buffer values + punpcklbw xmm4, xmm7 // xmm4 frame buffer pixels + movdqa xmm6, xmm4 // save the pixel values + psubsw xmm4, xmm3 // subtracted pixel values + pmullw xmm4, xmm4 // square xmm4 + movd xmm5, strength + psrlw xmm4, xmm5 // should be strength + pmullw xmm4, threes // 3 * modifier + movdqa xmm5, sixteens // 16s + psubusw xmm5, xmm4 // 16 - modifiers + movdqa xmm4, xmm5 // save the modifiers + pmullw xmm4, xmm6 // multiplier values + paddusw xmm1, xmm4 // accumulator + paddusw xmm2, xmm5 // count + add esi, 8 // next frame + dec ecx // next set of eight pixels + jnz next_frame + + movdqa counts, xmm2 + psrlw xmm2, 1 // divide count by 2 for rounding + paddusw xmm1, xmm2 // rounding added in + + mov frameptr, esi + + movdqa sums, xmm1 + } + + for (i = 0; i < 8; i++) + { + int blurvalue = sums[i] * ppi->fixed_divide[counts[i]]; + blurvalue >>= 16; + d[i] = blurvalue; + } + + s += 8; + d += 8; + byte += 8; + } + while (byte < bytes); + } + + ++ppi->frame; + __asm emms +} + +/**************************************************************************** + * + * ROUTINE : temp_filter_mmx + * + * INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance. + * unsigned char *s : Pointer to source frame. + * unsigned char *d : Pointer to destination frame. + * int bytes : Number of bytes to filter. + * int strength : Strength of filter to apply. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Performs a closesness adjusted temporarl blur + * + * SPECIAL NOTES : Destination frame can be same as source frame. 
+ * + ****************************************************************************/ +void temp_filter_mmx +( + pre_proc_instance *ppi, + unsigned char *s, + unsigned char *d, + int bytes, + int strength +) +{ + int byte = 0; + unsigned char *frameptr = ppi->frame_buffer; + + __declspec(align(16)) unsigned short threes[] = { 3, 3, 3, 3}; + __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16}; + + if (ppi->frame == 0) + { + do + { + int i; + int frame = 0; + + do + { + for (i = 0; i < 4; i++) + { + *frameptr = s[byte+i]; + ++frameptr; + } + + ++frame; + } + while (frame < FRAMECOUNT); + + for (i = 0; i < 4; i++) + d[byte+i] = s[byte+i]; + + byte += 4; + + } + while (byte < bytes); + } + else + { + int i; + int offset2 = (ppi->frame % FRAMECOUNT); + + do + { + __declspec(align(16)) unsigned short counts[8]; + __declspec(align(16)) unsigned short sums[8]; + __asm + { + + mov eax, offset2 + mov edi, s // source pixels + pxor mm1, mm1 // accumulator + pxor mm7, mm7 + + mov esi, frameptr // accumulator + pxor mm2, mm2 // count + + movd mm3, DWORD PTR [edi] + movd DWORD PTR [esi+4*eax], mm3 + + punpcklbw mm3, mm2 // mm3 source pixels + mov ecx, FRAMECOUNT + + next_frame: + movd mm4, DWORD PTR [esi] // get frame buffer values + punpcklbw mm4, mm7 // mm4 frame buffer pixels + movq mm6, mm4 // save the pixel values + psubsw mm4, mm3 // subtracted pixel values + pmullw mm4, mm4 // square mm4 + movd mm5, strength + psrlw mm4, mm5 // should be strength + pmullw mm4, threes // 3 * modifier + movq mm5, sixteens // 16s + psubusw mm5, mm4 // 16 - modifiers + movq mm4, mm5 // save the modifiers + pmullw mm4, mm6 // multiplier values + paddusw mm1, mm4 // accumulator + paddusw mm2, mm5 // count + add esi, 4 // next frame + dec ecx // next set of eight pixels + jnz next_frame + + movq counts, mm2 + psrlw mm2, 1 // divide count by 2 for rounding + paddusw mm1, mm2 // rounding added in + + mov frameptr, esi + + movq sums, mm1 + + } + + for (i = 0; i < 4; i++) + { + int blurvalue = sums[i] * ppi->fixed_divide[counts[i]]; + blurvalue >>= 16; + d[i] = blurvalue; + } + + s += 4; + d += 4; + byte += 4; + } + while (byte < bytes); + } + + ++ppi->frame; + __asm emms +} diff --git a/vp8/encoder/x86/quantize_mmx.asm b/vp8/encoder/x86/quantize_mmx.asm new file mode 100644 index 000000000..847fc6e37 --- /dev/null +++ b/vp8/encoder/x86/quantize_mmx.asm @@ -0,0 +1,438 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
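
The temp_filter_wmt/temp_filter_mmx routines above implement the pre-processor's closeness-adjusted temporal blur: each output pixel is a weighted average of the same pixel across the FRAMECOUNT (7) stored frames, with a frame's weight falling off with its squared difference from the current pixel. A scalar model of one pixel following the instruction comments, not part of the diff (fixed_divide is ppi's precomputed 16.16 reciprocal table; its exact element type here is an assumption):

/* weight = saturate_at_0(16 - 3 * ((d*d) >> strength)), then a
 * rounded weighted average using the reciprocal table instead of a
 * hardware divide. */
static unsigned char temp_filter_pixel(const unsigned char *history,
                                       unsigned char src, int strength,
                                       const unsigned int *fixed_divide)
{
    unsigned int sum = 0, count = 0;
    int f;

    for (f = 0; f < 7; f++)   /* FRAMECOUNT */
    {
        int d = history[f] - src;
        int modifier = 16 - 3 * ((d * d) >> strength);

        if (modifier < 0)     /* psubusw saturates at zero */
            modifier = 0;
        sum   += (unsigned int)modifier * history[f];
        count += (unsigned int)modifier;
    }
    sum += count >> 1;        /* round */
    return (unsigned char)((sum * fixed_divide[count]) >> 16);
}
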
+; + + +%include "vpx_ports/x86_abi_support.asm" + +;int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr, +; short *qcoeff_ptr,short *dequant_ptr, +; short *scan_mask, short *round_ptr, +; short *quant_ptr, short *dqcoeff_ptr); +global sym(vp8_fast_quantize_b_impl_mmx) +sym(vp8_fast_quantize_b_impl_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + push rsi + push rdi + ; end prolog + + + mov rsi, arg(0) ;coeff_ptr + movq mm0, [rsi] + + mov rax, arg(1) ;zbin_ptr + movq mm1, [rax] + + movq mm3, mm0 + psraw mm0, 15 + + pxor mm3, mm0 + psubw mm3, mm0 ; abs + + movq mm2, mm3 + pcmpgtw mm1, mm2 + + pandn mm1, mm2 + movq mm3, mm1 + + mov rdx, arg(6) ;quant_ptr + movq mm1, [rdx] + + mov rcx, arg(5) ;round_ptr + movq mm2, [rcx] + + paddw mm3, mm2 + pmulhuw mm3, mm1 + + pxor mm3, mm0 + psubw mm3, mm0 ;gain the sign back + + mov rdi, arg(2) ;qcoeff_ptr + movq mm0, mm3 + + movq [rdi], mm3 + + mov rax, arg(3) ;dequant_ptr + movq mm2, [rax] + + pmullw mm3, mm2 + mov rax, arg(7) ;dqcoeff_ptr + + movq [rax], mm3 + + ; next 8 + movq mm4, [rsi+8] + + mov rax, arg(1) ;zbin_ptr + movq mm5, [rax+8] + + movq mm7, mm4 + psraw mm4, 15 + + pxor mm7, mm4 + psubw mm7, mm4 ; abs + + movq mm6, mm7 + pcmpgtw mm5, mm6 + + pandn mm5, mm6 + movq mm7, mm5 + + movq mm5, [rdx+8] + movq mm6, [rcx+8] + + paddw mm7, mm6 + pmulhuw mm7, mm5 + + pxor mm7, mm4 + psubw mm7, mm4;gain the sign back + + mov rdi, arg(2) ;qcoeff_ptr + + movq mm1, mm7 + movq [rdi+8], mm7 + + mov rax, arg(3) ;dequant_ptr + movq mm6, [rax+8] + + pmullw mm7, mm6 + mov rax, arg(7) ;dqcoeff_ptr + + movq [rax+8], mm7 + + + ; next 8 + movq mm4, [rsi+16] + + mov rax, arg(1) ;zbin_ptr + movq mm5, [rax+16] + + movq mm7, mm4 + psraw mm4, 15 + + pxor mm7, mm4 + psubw mm7, mm4 ; abs + + movq mm6, mm7 + pcmpgtw mm5, mm6 + + pandn mm5, mm6 + movq mm7, mm5 + + movq mm5, [rdx+16] + movq mm6, [rcx+16] + + paddw mm7, mm6 + pmulhuw mm7, mm5 + + pxor mm7, mm4 + psubw mm7, mm4;gain the sign back + + mov rdi, arg(2) ;qcoeff_ptr + + movq mm1, mm7 + movq [rdi+16], mm7 + + mov rax, arg(3) ;dequant_ptr + movq mm6, [rax+16] + + pmullw mm7, mm6 + mov rax, arg(7) ;dqcoeff_ptr + + movq [rax+16], mm7 + + + ; next 8 + movq mm4, [rsi+24] + + mov rax, arg(1) ;zbin_ptr + movq mm5, [rax+24] + + movq mm7, mm4 + psraw mm4, 15 + + pxor mm7, mm4 + psubw mm7, mm4 ; abs + + movq mm6, mm7 + pcmpgtw mm5, mm6 + + pandn mm5, mm6 + movq mm7, mm5 + + movq mm5, [rdx+24] + movq mm6, [rcx+24] + + paddw mm7, mm6 + pmulhuw mm7, mm5 + + pxor mm7, mm4 + psubw mm7, mm4;gain the sign back + + mov rdi, arg(2) ;qcoeff_ptr + + movq mm1, mm7 + movq [rdi+24], mm7 + + mov rax, arg(3) ;dequant_ptr + movq mm6, [rax+24] + + pmullw mm7, mm6 + mov rax, arg(7) ;dqcoeff_ptr + + movq [rax+24], mm7 + + + + mov rdi, arg(4) ;scan_mask + mov rsi, arg(2) ;qcoeff_ptr + + pxor mm5, mm5 + pxor mm7, mm7 + + movq mm0, [rsi] + movq mm1, [rsi+8] + + movq mm2, [rdi] + movq mm3, [rdi+8]; + + pcmpeqw mm0, mm7 + pcmpeqw mm1, mm7 + + pcmpeqw mm6, mm6 + pxor mm0, mm6 + + pxor mm1, mm6 + psrlw mm0, 15 + + psrlw mm1, 15 + pmaddwd mm0, mm2 + + pmaddwd mm1, mm3 + movq mm5, mm0 + + paddd mm5, mm1 + + movq mm0, [rsi+16] + movq mm1, [rsi+24] + + movq mm2, [rdi+16] + movq mm3, [rdi+24]; + + pcmpeqw mm0, mm7 + pcmpeqw mm1, mm7 + + pcmpeqw mm6, mm6 + pxor mm0, mm6 + + pxor mm1, mm6 + psrlw mm0, 15 + + psrlw mm1, 15 + pmaddwd mm0, mm2 + + pmaddwd mm1, mm3 + paddd mm5, mm0 + + paddd mm5, mm1 + movq mm0, mm5 + + psrlq mm5, 32 + paddd mm0, mm5 + + ; eob adjustment begins here + movd rcx, mm0 + and rcx, 0xffff + + xor rdx, rdx + sub 
rdx, rcx ; rdx=-rcx + + bsr rax, rcx + inc rax + + sar rdx, 31 + and rax, rdx + ; Substitute the sse assembly for the old mmx mixed assembly/C. The + ; following is kept as reference + ; movd rcx, mm0 + ; bsr rax, rcx + ; + ; mov eob, rax + ; mov eee, rcx + ; + ;if(eee==0) + ;{ + ; eob=-1; + ;} + ;else if(eee<0) + ;{ + ; eob=15; + ;} + ;d->eob = eob+1; + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr, +; short *qcoeff_ptr,short *dequant_ptr, +; short *scan_mask, short *round_ptr, +; short *quant_ptr, short *dqcoeff_ptr); +global sym(vp8_fast_quantize_b_impl_sse) +sym(vp8_fast_quantize_b_impl_sse): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + push rsi + push rdi + ; end prolog + + + mov rsi, arg(0) ;coeff_ptr + movdqa xmm0, [rsi] + + mov rax, arg(1) ;zbin_ptr + movdqa xmm1, [rax] + + movdqa xmm3, xmm0 + psraw xmm0, 15 + + pxor xmm3, xmm0 + psubw xmm3, xmm0 ; abs + + movdqa xmm2, xmm3 + pcmpgtw xmm1, xmm2 + + pandn xmm1, xmm2 + movdqa xmm3, xmm1 + + mov rdx, arg(6) ; quant_ptr + movdqa xmm1, [rdx] + + mov rcx, arg(5) ; round_ptr + movdqa xmm2, [rcx] + + paddw xmm3, xmm2 + pmulhuw xmm3, xmm1 + + pxor xmm3, xmm0 + psubw xmm3, xmm0 ;gain the sign back + + mov rdi, arg(2) ;qcoeff_ptr + movdqa xmm0, xmm3 + + movdqa [rdi], xmm3 + + mov rax, arg(3) ;dequant_ptr + movdqa xmm2, [rax] + + pmullw xmm3, xmm2 + mov rax, arg(7) ;dqcoeff_ptr + + movdqa [rax], xmm3 + + ; next 8 + movdqa xmm4, [rsi+16] + + mov rax, arg(1) ;zbin_ptr + movdqa xmm5, [rax+16] + + movdqa xmm7, xmm4 + psraw xmm4, 15 + + pxor xmm7, xmm4 + psubw xmm7, xmm4 ; abs + + movdqa xmm6, xmm7 + pcmpgtw xmm5, xmm6 + + pandn xmm5, xmm6 + movdqa xmm7, xmm5 + + movdqa xmm5, [rdx+16] + movdqa xmm6, [rcx+16] + + + paddw xmm7, xmm6 + pmulhuw xmm7, xmm5 + + pxor xmm7, xmm4 + psubw xmm7, xmm4;gain the sign back + + mov rdi, arg(2) ;qcoeff_ptr + + movdqa xmm1, xmm7 + movdqa [rdi+16], xmm7 + + mov rax, arg(3) ;dequant_ptr + movdqa xmm6, [rax+16] + + pmullw xmm7, xmm6 + mov rax, arg(7) ;dqcoeff_ptr + + movdqa [rax+16], xmm7 + mov rdi, arg(4) ;scan_mask + + pxor xmm7, xmm7 + movdqa xmm2, [rdi] + + movdqa xmm3, [rdi+16]; + pcmpeqw xmm0, xmm7 + + pcmpeqw xmm1, xmm7 + pcmpeqw xmm6, xmm6 + + pxor xmm0, xmm6 + pxor xmm1, xmm6 + + psrlw xmm0, 15 + psrlw xmm1, 15 + + pmaddwd xmm0, xmm2 + pmaddwd xmm1, xmm3 + + movq xmm2, xmm0 + movq xmm3, xmm1 + + psrldq xmm0, 8 + psrldq xmm1, 8 + + paddd xmm0, xmm1 + paddd xmm2, xmm3 + + paddd xmm0, xmm2 + movq xmm1, xmm0 + + psrldq xmm0, 4 + paddd xmm1, xmm0 + + movd rcx, xmm1 + and rcx, 0xffff + + xor rdx, rdx + sub rdx, rcx + + bsr rax, rcx + inc rax + + sar rdx, 31 + and rax, rdx + + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret diff --git a/vp8/encoder/x86/sad_mmx.asm b/vp8/encoder/x86/sad_mmx.asm new file mode 100644 index 000000000..a825698e7 --- /dev/null +++ b/vp8/encoder/x86/sad_mmx.asm @@ -0,0 +1,428 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
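vp8_fast_quantize_b_impl_mmx and _sse above run the same per-coefficient pipeline; only the register width differs (four coefficients per step versus eight). A scalar sketch of one 16-coefficient block follows; fast_quantize_b_ref is an illustrative name, not the library's C path, and scan_mask is assumed to carry one bit per zig-zag position, which is what the pcmpeqw/pmaddwd/bsr sequence implies:

/* Scalar model of vp8_fast_quantize_b_impl_{mmx,sse}; illustrative only. */
static int fast_quantize_b_ref(const short *coeff, const short *zbin,
                               short *qcoeff, const short *dequant,
                               const unsigned short *scan_mask,
                               const short *round, const short *quant,
                               short *dqcoeff)
{
    unsigned int nzmask = 0;
    int eob = 0;
    int i;

    for (i = 0; i < 16; i++)
    {
        int sign = coeff[i] >> 15;                    /* psraw reg, 15     */
        int x = (coeff[i] ^ sign) - sign;             /* pxor/psubw: abs() */

        x = (x < zbin[i]) ? 0 : x;                    /* pcmpgtw + pandn   */
        x = (int)(((unsigned)(x + round[i]) *
                   (unsigned short)quant[i]) >> 16);  /* pmulhuw           */
        x = (x ^ sign) - sign;                        /* restore the sign  */

        qcoeff[i]  = (short)x;
        dqcoeff[i] = (short)(x * dequant[i]);         /* pmullw            */

        if (x)
            nzmask |= scan_mask[i];                   /* pcmpeqw + pmaddwd */
    }

    while (nzmask)                                    /* bsr rcx; inc      */
    {
        eob++;
        nzmask >>= 1;
    }

    return eob;                                       /* 0 if all zero     */
}

The bsr/inc/sar/and tail in the asm is the branchless form of the final loop: it yields bsr(mask)+1 when the mask is nonzero and 0 otherwise, replacing the commented-out mixed assembly/C version kept above for reference.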
+; + + +%include "vpx_ports/x86_abi_support.asm" + +global sym(vp8_sad16x16_mmx) +global sym(vp8_sad8x16_mmx) +global sym(vp8_sad8x8_mmx) +global sym(vp8_sad4x4_mmx) +global sym(vp8_sad16x8_mmx) + +%idefine QWORD + +;unsigned int vp8_sad16x16_mmx( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +sym(vp8_sad16x16_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + lea rcx, [rsi+rax*8] + + lea rcx, [rcx+rax*8] + pxor mm7, mm7 + + pxor mm6, mm6 + +x16x16sad_mmx_loop: + + movq mm0, QWORD PTR [rsi] + movq mm2, QWORD PTR [rsi+8] + + movq mm1, QWORD PTR [rdi] + movq mm3, QWORD PTR [rdi+8] + + movq mm4, mm0 + movq mm5, mm2 + + psubusb mm0, mm1 + psubusb mm1, mm4 + + psubusb mm2, mm3 + psubusb mm3, mm5 + + por mm0, mm1 + por mm2, mm3 + + movq mm1, mm0 + movq mm3, mm2 + + punpcklbw mm0, mm6 + punpcklbw mm2, mm6 + + punpckhbw mm1, mm6 + punpckhbw mm3, mm6 + + paddw mm0, mm2 + paddw mm1, mm3 + + + lea rsi, [rsi+rax] + add rdi, rdx + + paddw mm7, mm0 + paddw mm7, mm1 + + cmp rsi, rcx + jne x16x16sad_mmx_loop + + + movq mm0, mm7 + + punpcklwd mm0, mm6 + punpckhwd mm7, mm6 + + paddw mm0, mm7 + movq mm7, mm0 + + + psrlq mm0, 32 + paddw mm7, mm0 + + movd rax, mm7 + + pop rdi + pop rsi + mov rsp, rbp + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vp8_sad8x16_mmx( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +sym(vp8_sad8x16_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + lea rcx, [rsi+rax*8] + + lea rcx, [rcx+rax*8] + pxor mm7, mm7 + + pxor mm6, mm6 + +x8x16sad_mmx_loop: + + movq mm0, QWORD PTR [rsi] + movq mm1, QWORD PTR [rdi] + + movq mm2, mm0 + psubusb mm0, mm1 + + psubusb mm1, mm2 + por mm0, mm1 + + movq mm2, mm0 + punpcklbw mm0, mm6 + + punpckhbw mm2, mm6 + lea rsi, [rsi+rax] + + add rdi, rdx + paddw mm7, mm0 + + paddw mm7, mm2 + cmp rsi, rcx + + jne x8x16sad_mmx_loop + + movq mm0, mm7 + punpcklwd mm0, mm6 + + punpckhwd mm7, mm6 + paddw mm0, mm7 + + movq mm7, mm0 + psrlq mm0, 32 + + paddw mm7, mm0 + movd rax, mm7 + + pop rdi + pop rsi + mov rsp, rbp + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vp8_sad8x8_mmx( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +sym(vp8_sad8x8_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + lea rcx, [rsi+rax*8] + pxor mm7, mm7 + + pxor mm6, mm6 + +x8x8sad_mmx_loop: + + movq mm0, QWORD PTR [rsi] + movq mm1, QWORD PTR [rdi] + + movq mm2, mm0 + psubusb mm0, mm1 + + psubusb mm1, mm2 + por mm0, mm1 + + movq mm2, mm0 + punpcklbw mm0, mm6 + + punpckhbw mm2, mm6 + paddw mm0, mm2 + + lea rsi, [rsi+rax] + add rdi, rdx + + paddw mm7, mm0 + cmp rsi, rcx + + jne x8x8sad_mmx_loop + + movq mm0, mm7 + punpcklwd mm0, mm6 + + punpckhwd mm7, mm6 + paddw mm0, mm7 + + movq mm7, mm0 + psrlq mm0, 32 + + paddw mm7, mm0 + movd rax, mm7 + + pop rdi + pop rsi + mov rsp, rbp + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vp8_sad4x4_mmx( 
+; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +sym(vp8_sad4x4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + movd mm0, QWORD PTR [rsi] + movd mm1, QWORD PTR [rdi] + + movd mm2, QWORD PTR [rsi+rax] + movd mm3, QWORD PTR [rdi+rdx] + + punpcklbw mm0, mm2 + punpcklbw mm1, mm3 + + movq mm2, mm0 + psubusb mm0, mm1 + + psubusb mm1, mm2 + por mm0, mm1 + + movq mm2, mm0 + pxor mm3, mm3 + + punpcklbw mm0, mm3 + punpckhbw mm2, mm3 + + paddw mm0, mm2 + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + movd mm4, QWORD PTR [rsi] + movd mm5, QWORD PTR [rdi] + + movd mm6, QWORD PTR [rsi+rax] + movd mm7, QWORD PTR [rdi+rdx] + + punpcklbw mm4, mm6 + punpcklbw mm5, mm7 + + movq mm6, mm4 + psubusb mm4, mm5 + + psubusb mm5, mm6 + por mm4, mm5 + + movq mm5, mm4 + punpcklbw mm4, mm3 + + punpckhbw mm5, mm3 + paddw mm4, mm5 + + paddw mm0, mm4 + movq mm1, mm0 + + punpcklwd mm0, mm3 + punpckhwd mm1, mm3 + + paddw mm0, mm1 + movq mm1, mm0 + + psrlq mm0, 32 + paddw mm0, mm1 + + movd rax, mm0 + + pop rdi + pop rsi + mov rsp, rbp + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vp8_sad16x8_mmx( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +sym(vp8_sad16x8_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + lea rcx, [rsi+rax*8] + pxor mm7, mm7 + + pxor mm6, mm6 + +x16x8sad_mmx_loop: + + movq mm0, [rsi] + movq mm1, [rdi] + + movq mm2, [rsi+8] + movq mm3, [rdi+8] + + movq mm4, mm0 + movq mm5, mm2 + + psubusb mm0, mm1 + psubusb mm1, mm4 + + psubusb mm2, mm3 + psubusb mm3, mm5 + + por mm0, mm1 + por mm2, mm3 + + movq mm1, mm0 + movq mm3, mm2 + + punpcklbw mm0, mm6 + punpckhbw mm1, mm6 + + punpcklbw mm2, mm6 + punpckhbw mm3, mm6 + + + paddw mm0, mm2 + paddw mm1, mm3 + + paddw mm0, mm1 + lea rsi, [rsi+rax] + + add rdi, rdx + paddw mm7, mm0 + + cmp rsi, rcx + jne x16x8sad_mmx_loop + + movq mm0, mm7 + punpcklwd mm0, mm6 + + punpckhwd mm7, mm6 + paddw mm0, mm7 + + movq mm7, mm0 + psrlq mm0, 32 + + paddw mm7, mm0 + movd rax, mm7 + + pop rdi + pop rsi + mov rsp, rbp + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret diff --git a/vp8/encoder/x86/sad_sse2.asm b/vp8/encoder/x86/sad_sse2.asm new file mode 100644 index 000000000..53240bbf1 --- /dev/null +++ b/vp8/encoder/x86/sad_sse2.asm @@ -0,0 +1,329 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
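All five vp8_sadWxH_mmx kernels above reduce to one scalar definition. Since MMX has no psadbw, each computes |a-b| as (a -sat b) | (b -sat a) with unsigned saturating subtracts, then widens the bytes with punpck and accumulates words. A reference sketch (sad_ref is an illustrative name):

/* Reference definition shared by the five MMX SAD kernels above. */
static unsigned int sad_ref(const unsigned char *src, int src_stride,
                            const unsigned char *ref, int ref_stride,
                            int width, int height)
{
    unsigned int sad = 0;
    int r, c;

    for (r = 0; r < height; r++)
    {
        for (c = 0; c < width; c++)
        {
            int d = src[c] - ref[c];          /* psubusb a,b / psubusb b,a */
            sad += (unsigned int)(d < 0 ? -d : d);   /* por of the two     */
        }

        src += src_stride;
        ref += ref_stride;
    }

    return sad;
}

For example, vp8_sad16x16_mmx(src, ss, ref, rs) corresponds to sad_ref(src, ss, ref, rs, 16, 16), and likewise for the other four block sizes.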
+; + + +%include "vpx_ports/x86_abi_support.asm" + +%idefine QWORD + +;unsigned int vp8_sad16x16_wmt( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +global sym(vp8_sad16x16_wmt) +sym(vp8_sad16x16_wmt): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + lea rcx, [rsi+rax*8] + + lea rcx, [rcx+rax*8] + pxor xmm7, xmm7 + +x16x16sad_wmt_loop: + + movq xmm0, QWORD PTR [rsi] + movq xmm2, QWORD PTR [rsi+8] + + movq xmm1, QWORD PTR [rdi] + movq xmm3, QWORD PTR [rdi+8] + + movq xmm4, QWORD PTR [rsi+rax] + movq xmm5, QWORD PTR [rdi+rdx] + + + punpcklbw xmm0, xmm2 + punpcklbw xmm1, xmm3 + + psadbw xmm0, xmm1 + movq xmm6, QWORD PTR [rsi+rax+8] + + movq xmm3, QWORD PTR [rdi+rdx+8] + lea rsi, [rsi+rax*2] + + lea rdi, [rdi+rdx*2] + punpcklbw xmm4, xmm6 + + punpcklbw xmm5, xmm3 + psadbw xmm4, xmm5 + + paddw xmm7, xmm0 + paddw xmm7, xmm4 + + cmp rsi, rcx + jne x16x16sad_wmt_loop + + movq xmm0, xmm7 + psrldq xmm7, 8 + + paddw xmm0, xmm7 + movd rax, xmm0 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;unsigned int vp8_sad8x16_wmt( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int max_err) +global sym(vp8_sad8x16_wmt) +sym(vp8_sad8x16_wmt): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rbx, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + lea rcx, [rsi+rbx*8] + + lea rcx, [rcx+rbx*8] + pxor mm7, mm7 + +x8x16sad_wmt_loop: + + movd rax, mm7 + cmp rax, arg(4) + jg x8x16sad_wmt_early_exit + + movq mm0, QWORD PTR [rsi] + movq mm1, QWORD PTR [rdi] + + movq mm2, QWORD PTR [rsi+rbx] + movq mm3, QWORD PTR [rdi+rdx] + + psadbw mm0, mm1 + psadbw mm2, mm3 + + lea rsi, [rsi+rbx*2] + lea rdi, [rdi+rdx*2] + + paddw mm7, mm0 + paddw mm7, mm2 + + cmp rsi, rcx + jne x8x16sad_wmt_loop + + movd rax, mm7 + +x8x16sad_wmt_early_exit: + + ; begin epilog + pop rdi + pop rsi + pop rbx + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vp8_sad8x8_wmt( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +global sym(vp8_sad8x8_wmt) +sym(vp8_sad8x8_wmt): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rbx, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + lea rcx, [rsi+rbx*8] + pxor mm7, mm7 + +x8x8sad_wmt_loop: + + movd rax, mm7 + cmp rax, arg(4) + jg x8x8sad_wmt_early_exit + + movq mm0, QWORD PTR [rsi] + movq mm1, QWORD PTR [rdi] + + psadbw mm0, mm1 + lea rsi, [rsi+rbx] + + add rdi, rdx + paddw mm7, mm0 + + cmp rsi, rcx + jne x8x8sad_wmt_loop + + movd rax, mm7 +x8x8sad_wmt_early_exit: + + ; begin epilog + pop rdi + pop rsi + pop rbx + UNSHADOW_ARGS + pop rbp + ret + +;unsigned int vp8_sad4x4_wmt( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +global sym(vp8_sad4x4_wmt) +sym(vp8_sad4x4_wmt): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + movd mm0, QWORD PTR [rsi] + movd mm1, QWORD 
PTR [rdi] + + movd mm2, QWORD PTR [rsi+rax] + movd mm3, QWORD PTR [rdi+rdx] + + punpcklbw mm0, mm2 + punpcklbw mm1, mm3 + + psadbw mm0, mm1 + lea rsi, [rsi+rax*2] + + lea rdi, [rdi+rdx*2] + movd mm4, QWORD PTR [rsi] + + movd mm5, QWORD PTR [rdi] + movd mm6, QWORD PTR [rsi+rax] + + movd mm7, QWORD PTR [rdi+rdx] + punpcklbw mm4, mm6 + + punpcklbw mm5, mm7 + psadbw mm4, mm5 + + paddw mm0, mm4 + movd rax, mm0 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vp8_sad16x8_wmt( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride) +global sym(vp8_sad16x8_wmt) +sym(vp8_sad16x8_wmt): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rbx + push rsi + push rdi + ; end prolog + + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rbx, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + lea rcx, [rsi+rbx*8] + pxor mm7, mm7 + +x16x8sad_wmt_loop: + + movd rax, mm7 + cmp rax, arg(4) + jg x16x8sad_wmt_early_exit + + movq mm0, QWORD PTR [rsi] + movq mm2, QWORD PTR [rsi+8] + + movq mm1, QWORD PTR [rdi] + movq mm3, QWORD PTR [rdi+8] + + movq mm4, QWORD PTR [rsi+rbx] + movq mm5, QWORD PTR [rdi+rdx] + + psadbw mm0, mm1 + psadbw mm2, mm3 + + movq mm1, QWORD PTR [rsi+rbx+8] + movq mm3, QWORD PTR [rdi+rdx+8] + + psadbw mm4, mm5 + psadbw mm1, mm3 + + lea rsi, [rsi+rbx*2] + lea rdi, [rdi+rdx*2] + + paddw mm0, mm2 + paddw mm4, mm1 + + paddw mm7, mm0 + paddw mm7, mm4 + + cmp rsi, rcx + jne x16x8sad_wmt_loop + + movd rax, mm7 + +x16x8sad_wmt_early_exit: + + ; begin epilog + pop rdi + pop rsi + pop rbx + UNSHADOW_ARGS + pop rbp + ret diff --git a/vp8/encoder/x86/sad_sse3.asm b/vp8/encoder/x86/sad_sse3.asm new file mode 100644 index 000000000..38cc02957 --- /dev/null +++ b/vp8/encoder/x86/sad_sse3.asm @@ -0,0 +1,939 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
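The *_wmt kernels above use the SSE psadbw instruction, and the sizes that take a fifth max_err argument (8x16, 8x8, 16x8) bail out as soon as the running SAD exceeds it. Sketched in scalar form, with an illustrative function name:

/* Scalar model of the early-exit pattern in the max_err _wmt kernels.
 * max_err is the best SAD found so far; once the running total exceeds
 * it, the exact value no longer matters to the caller. */
static unsigned int sad_early_exit_ref(const unsigned char *src, int src_stride,
                                       const unsigned char *ref, int ref_stride,
                                       int width, int height,
                                       unsigned int max_err)
{
    unsigned int sad = 0;
    int r, c;

    for (r = 0; r < height; r++)
    {
        if (sad > max_err)            /* movd rax, mm7 / cmp / jg early_exit */
            return sad;               /* any value above max_err is enough   */

        for (c = 0; c < width; c++)
        {
            int d = src[c] - ref[c];  /* psadbw covers a whole row at once   */
            sad += (unsigned int)(d < 0 ? -d : d);
        }

        src += src_stride;
        ref += ref_stride;
    }

    return sad;
}

Checking the partial sum once per row pair costs a few cycles on good candidates but abandons most of the work on bad ones, which dominate a motion search.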
+; + + +%include "vpx_ports/x86_abi_support.asm" + +%idefine QWORD + +%macro PROCESS_16X2X3 1 +%if %1 + movdqa xmm0, [rsi] + lddqu xmm5, [rdi] + lddqu xmm6, [rdi+1] + lddqu xmm7, [rdi+2] + + psadbw xmm5, xmm0 + psadbw xmm6, xmm0 + psadbw xmm7, xmm0 +%else + movdqa xmm0, [rsi] + lddqu xmm1, [rdi] + lddqu xmm2, [rdi+1] + lddqu xmm3, [rdi+2] + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endif + movdqa xmm0, QWORD PTR [rsi+rax] + lddqu xmm1, QWORD PTR [rdi+rdx] + lddqu xmm2, QWORD PTR [rdi+rdx+1] + lddqu xmm3, QWORD PTR [rdi+rdx+2] + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endmacro + +%macro PROCESS_8X2X3 1 +%if %1 + movq mm0, [rsi] + movq mm5, [rdi] + movq mm6, [rdi+1] + movq mm7, [rdi+2] + + psadbw mm5, mm0 + psadbw mm6, mm0 + psadbw mm7, mm0 +%else + movq mm0, [rsi] + movq mm1, [rdi] + movq mm2, [rdi+1] + movq mm3, [rdi+2] + + psadbw mm1, mm0 + psadbw mm2, mm0 + psadbw mm3, mm0 + + paddw mm5, mm1 + paddw mm6, mm2 + paddw mm7, mm3 +%endif + movq mm0, QWORD PTR [rsi+rax] + movq mm1, QWORD PTR [rdi+rdx] + movq mm2, QWORD PTR [rdi+rdx+1] + movq mm3, QWORD PTR [rdi+rdx+2] + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + psadbw mm1, mm0 + psadbw mm2, mm0 + psadbw mm3, mm0 + + paddw mm5, mm1 + paddw mm6, mm2 + paddw mm7, mm3 +%endmacro + +%macro LOAD_X4_ADDRESSES 5 + mov %2, [%1+REG_SZ_BYTES*0] + mov %3, [%1+REG_SZ_BYTES*1] + + mov %4, [%1+REG_SZ_BYTES*2] + mov %5, [%1+REG_SZ_BYTES*3] +%endmacro + +%macro PROCESS_16X2X4 1 +%if %1 + movdqa xmm0, [rsi] + lddqu xmm4, [rcx] + lddqu xmm5, [rdx] + lddqu xmm6, [rbx] + lddqu xmm7, [rdi] + + psadbw xmm4, xmm0 + psadbw xmm5, xmm0 + psadbw xmm6, xmm0 + psadbw xmm7, xmm0 +%else + movdqa xmm0, [rsi] + lddqu xmm1, [rcx] + lddqu xmm2, [rdx] + lddqu xmm3, [rbx] + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm4, xmm1 + lddqu xmm1, [rdi] + paddw xmm5, xmm2 + paddw xmm6, xmm3 + + psadbw xmm1, xmm0 + paddw xmm7, xmm1 +%endif + movdqa xmm0, QWORD PTR [rsi+rax] + lddqu xmm1, QWORD PTR [rcx+rbp] + lddqu xmm2, QWORD PTR [rdx+rbp] + lddqu xmm3, QWORD PTR [rbx+rbp] + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm4, xmm1 + lddqu xmm1, QWORD PTR [rdi+rbp] + paddw xmm5, xmm2 + paddw xmm6, xmm3 + + lea rsi, [rsi+rax*2] + lea rcx, [rcx+rbp*2] + + lea rdx, [rdx+rbp*2] + lea rbx, [rbx+rbp*2] + + lea rdi, [rdi+rbp*2] + + psadbw xmm1, xmm0 + paddw xmm7, xmm1 + +%endmacro + +%macro PROCESS_8X2X4 1 +%if %1 + movq mm0, [rsi] + movq mm4, [rcx] + movq mm5, [rdx] + movq mm6, [rbx] + movq mm7, [rdi] + + psadbw mm4, mm0 + psadbw mm5, mm0 + psadbw mm6, mm0 + psadbw mm7, mm0 +%else + movq mm0, [rsi] + movq mm1, [rcx] + movq mm2, [rdx] + movq mm3, [rbx] + + psadbw mm1, mm0 + psadbw mm2, mm0 + psadbw mm3, mm0 + + paddw mm4, mm1 + movq mm1, [rdi] + paddw mm5, mm2 + paddw mm6, mm3 + + psadbw mm1, mm0 + paddw mm7, mm1 +%endif + movq mm0, QWORD PTR [rsi+rax] + movq mm1, QWORD PTR [rcx+rbp] + movq mm2, QWORD PTR [rdx+rbp] + movq mm3, QWORD PTR [rbx+rbp] + + psadbw mm1, mm0 + psadbw mm2, mm0 + psadbw mm3, mm0 + + paddw mm4, mm1 + movq mm1, QWORD PTR [rdi+rbp] + paddw mm5, mm2 + paddw mm6, mm3 + + lea rsi, [rsi+rax*2] + lea rcx, [rcx+rbp*2] + + lea rdx, [rdx+rbp*2] + lea rbx, [rbx+rbp*2] + + lea rdi, [rdi+rbp*2] + + psadbw mm1, mm0 + paddw mm7, mm1 + +%endmacro + +;void int vp8_sad16x16x3_sse3( +; unsigned char *src_ptr, +; int src_stride, +; 
unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vp8_sad16x16x3_sse3) +sym(vp8_sad16x16x3_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_16X2X3 1 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + + mov rdi, arg(4) ;Results + + movq xmm0, xmm5 + psrldq xmm5, 8 + + paddw xmm0, xmm5 + movd [rdi], xmm0 +;- + movq xmm0, xmm6 + psrldq xmm6, 8 + + paddw xmm0, xmm6 + movd [rdi+4], xmm0 +;- + movq xmm0, xmm7 + psrldq xmm7, 8 + + paddw xmm0, xmm7 + movd [rdi+8], xmm0 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void int vp8_sad16x8x3_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vp8_sad16x8x3_sse3) +sym(vp8_sad16x8x3_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_16X2X3 1 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + + mov rdi, arg(4) ;Results + + movq xmm0, xmm5 + psrldq xmm5, 8 + + paddw xmm0, xmm5 + movd [rdi], xmm0 +;- + movq xmm0, xmm6 + psrldq xmm6, 8 + + paddw xmm0, xmm6 + movd [rdi+4], xmm0 +;- + movq xmm0, xmm7 + psrldq xmm7, 8 + + paddw xmm0, xmm7 + movd [rdi+8], xmm0 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void int vp8_sad8x16x3_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vp8_sad8x16x3_sse3) +sym(vp8_sad8x16x3_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_8X2X3 1 + PROCESS_8X2X3 0 + PROCESS_8X2X3 0 + PROCESS_8X2X3 0 + PROCESS_8X2X3 0 + PROCESS_8X2X3 0 + PROCESS_8X2X3 0 + PROCESS_8X2X3 0 + + mov rdi, arg(4) ;Results + + movd [rdi], mm5 + movd [rdi+4], mm6 + movd [rdi+8], mm7 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void int vp8_sad8x8x3_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vp8_sad8x8x3_sse3) +sym(vp8_sad8x8x3_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_8X2X3 1 + PROCESS_8X2X3 0 + PROCESS_8X2X3 0 + PROCESS_8X2X3 0 + + mov rdi, arg(4) ;Results + + movd [rdi], mm5 + movd [rdi+4], mm6 + movd [rdi+8], mm7 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void int vp8_sad4x4x3_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vp8_sad4x4x3_sse3) +sym(vp8_sad4x4x3_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + movd mm0, QWORD PTR [rsi] 
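+    ; the 4x4 case needs no loop: punpcklbw packs two rows into one register
+    ; so a single psadbw scores them, and the same source rows are compared
+    ; against ref, ref+1 and ref+2 for the three horizontal candidates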
+ movd mm1, QWORD PTR [rdi] + + movd mm2, QWORD PTR [rsi+rax] + movd mm3, QWORD PTR [rdi+rdx] + + punpcklbw mm0, mm2 + punpcklbw mm1, mm3 + + movd mm4, QWORD PTR [rdi+1] + movd mm5, QWORD PTR [rdi+2] + + movd mm2, QWORD PTR [rdi+rdx+1] + movd mm3, QWORD PTR [rdi+rdx+2] + + psadbw mm1, mm0 + + punpcklbw mm4, mm2 + punpcklbw mm5, mm3 + + psadbw mm4, mm0 + psadbw mm5, mm0 + + + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + movd mm0, QWORD PTR [rsi] + movd mm2, QWORD PTR [rdi] + + movd mm3, QWORD PTR [rsi+rax] + movd mm6, QWORD PTR [rdi+rdx] + + punpcklbw mm0, mm3 + punpcklbw mm2, mm6 + + movd mm3, QWORD PTR [rdi+1] + movd mm7, QWORD PTR [rdi+2] + + psadbw mm2, mm0 + + paddw mm1, mm2 + + movd mm2, QWORD PTR [rdi+rdx+1] + movd mm6, QWORD PTR [rdi+rdx+2] + + punpcklbw mm3, mm2 + punpcklbw mm7, mm6 + + psadbw mm3, mm0 + psadbw mm7, mm0 + + paddw mm3, mm4 + paddw mm7, mm5 + + mov rdi, arg(4) ;Results + movd [rdi], mm1 + + movd [rdi+4], mm3 + movd [rdi+8], mm7 + + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;unsigned int vp8_sad16x16_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int max_err) +;%define lddqu movdqu +global sym(vp8_sad16x16_sse3) +sym(vp8_sad16x16_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rbx, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + lea rcx, [rsi+rbx*8] + + lea rcx, [rcx+rbx*8] + pxor mm7, mm7 + +vp8_sad16x16_sse3_loop: + + movd rax, mm7 + cmp rax, arg(4) + jg vp8_sad16x16_early_exit + + movq mm0, QWORD PTR [rsi] + movq mm2, QWORD PTR [rsi+8] + + movq mm1, QWORD PTR [rdi] + movq mm3, QWORD PTR [rdi+8] + + movq mm4, QWORD PTR [rsi+rbx] + movq mm5, QWORD PTR [rdi+rdx] + + psadbw mm0, mm1 + psadbw mm2, mm3 + + movq mm1, QWORD PTR [rsi+rbx+8] + movq mm3, QWORD PTR [rdi+rdx+8] + + psadbw mm4, mm5 + psadbw mm1, mm3 + + lea rsi, [rsi+rbx*2] + lea rdi, [rdi+rdx*2] + + paddw mm0, mm2 + paddw mm4, mm1 + + paddw mm7, mm0 + paddw mm7, mm4 + + cmp rsi, rcx + jne vp8_sad16x16_sse3_loop + + movd rax, mm7 + +vp8_sad16x16_early_exit: + + ; begin epilog + pop rdi + pop rsi + pop rbx + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_sad16x16x4d_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr_base, +; int ref_stride, +; int *results) +global sym(vp8_sad16x16x4d_sse3) +sym(vp8_sad16x16x4d_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + push rbx + ; end prolog + + push rbp + mov rdi, arg(2) ; ref_ptr_base + + LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi + + mov rsi, arg(0) ;src_ptr + + movsxd rbx, dword ptr arg(1) ;src_stride + movsxd rbp, dword ptr arg(3) ;ref_stride + + xchg rbx, rax + + PROCESS_16X2X4 1 + PROCESS_16X2X4 0 + PROCESS_16X2X4 0 + PROCESS_16X2X4 0 + PROCESS_16X2X4 0 + PROCESS_16X2X4 0 + PROCESS_16X2X4 0 + PROCESS_16X2X4 0 + + pop rbp + mov rdi, arg(4) ;Results + + movq xmm0, xmm4 + psrldq xmm4, 8 + + paddw xmm0, xmm4 + movd [rdi], xmm0 +;- + movq xmm0, xmm5 + psrldq xmm5, 8 + + paddw xmm0, xmm5 + movd [rdi+4], xmm0 +;- + movq xmm0, xmm6 + psrldq xmm6, 8 + + paddw xmm0, xmm6 + movd [rdi+8], xmm0 +;- + movq xmm0, xmm7 + psrldq xmm7, 8 + + paddw xmm0, xmm7 + movd [rdi+12], xmm0 + + ; begin epilog + pop rbx + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_sad16x8x4d_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr_base, +; int ref_stride, +; int 
*results) +global sym(vp8_sad16x8x4d_sse3) +sym(vp8_sad16x8x4d_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + push rbx + ; end prolog + + push rbp + mov rdi, arg(2) ; ref_ptr_base + + LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi + + mov rsi, arg(0) ;src_ptr + + movsxd rbx, dword ptr arg(1) ;src_stride + movsxd rbp, dword ptr arg(3) ;ref_stride + + xchg rbx, rax + + PROCESS_16X2X4 1 + PROCESS_16X2X4 0 + PROCESS_16X2X4 0 + PROCESS_16X2X4 0 + + pop rbp + mov rdi, arg(4) ;Results + + movq xmm0, xmm4 + psrldq xmm4, 8 + + paddw xmm0, xmm4 + movd [rdi], xmm0 +;- + movq xmm0, xmm5 + psrldq xmm5, 8 + + paddw xmm0, xmm5 + movd [rdi+4], xmm0 +;- + movq xmm0, xmm6 + psrldq xmm6, 8 + + paddw xmm0, xmm6 + movd [rdi+8], xmm0 +;- + movq xmm0, xmm7 + psrldq xmm7, 8 + + paddw xmm0, xmm7 + movd [rdi+12], xmm0 + + ; begin epilog + pop rbx + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void int vp8_sad8x16x4d_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vp8_sad8x16x4d_sse3) +sym(vp8_sad8x16x4d_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + push rbx + ; end prolog + + push rbp + mov rdi, arg(2) ; ref_ptr_base + + LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi + + mov rsi, arg(0) ;src_ptr + + movsxd rbx, dword ptr arg(1) ;src_stride + movsxd rbp, dword ptr arg(3) ;ref_stride + + xchg rbx, rax + + PROCESS_8X2X4 1 + PROCESS_8X2X4 0 + PROCESS_8X2X4 0 + PROCESS_8X2X4 0 + PROCESS_8X2X4 0 + PROCESS_8X2X4 0 + PROCESS_8X2X4 0 + PROCESS_8X2X4 0 + + pop rbp + mov rdi, arg(4) ;Results + + movd [rdi], mm4 + movd [rdi+4], mm5 + movd [rdi+8], mm6 + movd [rdi+12], mm7 + + ; begin epilog + pop rbx + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void int vp8_sad8x8x4d_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vp8_sad8x8x4d_sse3) +sym(vp8_sad8x8x4d_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + push rbx + ; end prolog + + push rbp + mov rdi, arg(2) ; ref_ptr_base + + LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi + + mov rsi, arg(0) ;src_ptr + + movsxd rbx, dword ptr arg(1) ;src_stride + movsxd rbp, dword ptr arg(3) ;ref_stride + + xchg rbx, rax + + PROCESS_8X2X4 1 + PROCESS_8X2X4 0 + PROCESS_8X2X4 0 + PROCESS_8X2X4 0 + + pop rbp + mov rdi, arg(4) ;Results + + movd [rdi], mm4 + movd [rdi+4], mm5 + movd [rdi+8], mm6 + movd [rdi+12], mm7 + + ; begin epilog + pop rbx + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void int vp8_sad4x4x4d_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vp8_sad4x4x4d_sse3) +sym(vp8_sad4x4x4d_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + push rbx + ; end prolog + + push rbp + mov rdi, arg(2) ; ref_ptr_base + + LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi + + mov rsi, arg(0) ;src_ptr + + movsxd rbx, dword ptr arg(1) ;src_stride + movsxd rbp, dword ptr arg(3) ;ref_stride + + xchg rbx, rax + + movd mm0, QWORD PTR [rsi] + movd mm1, QWORD PTR [rcx] + + movd mm2, QWORD PTR [rsi+rax] + movd mm3, QWORD PTR [rcx+rbp] + + punpcklbw mm0, mm2 + punpcklbw mm1, mm3 + + movd mm4, QWORD PTR [rdx] + movd mm5, QWORD PTR [rbx] + + movd mm6, QWORD PTR [rdi] + movd mm2, QWORD PTR [rdx+rbp] + + movd mm3, QWORD PTR [rbx+rbp] + movd mm7, QWORD PTR [rdi+rbp] + + psadbw mm1, mm0 + + punpcklbw mm4, mm2 + punpcklbw mm5, mm3 + + punpcklbw mm6, 
mm7 + psadbw mm4, mm0 + + psadbw mm5, mm0 + psadbw mm6, mm0 + + + + lea rsi, [rsi+rax*2] + lea rcx, [rcx+rbp*2] + + lea rdx, [rdx+rbp*2] + lea rbx, [rbx+rbp*2] + + lea rdi, [rdi+rbp*2] + + movd mm0, QWORD PTR [rsi] + movd mm2, QWORD PTR [rcx] + + movd mm3, QWORD PTR [rsi+rax] + movd mm7, QWORD PTR [rcx+rbp] + + punpcklbw mm0, mm3 + punpcklbw mm2, mm7 + + movd mm3, QWORD PTR [rdx] + movd mm7, QWORD PTR [rbx] + + psadbw mm2, mm0 + mov rax, rbp + + pop rbp + mov rsi, arg(4) ;Results + + paddw mm1, mm2 + movd [rsi], mm1 + + movd mm2, QWORD PTR [rdx+rax] + movd mm1, QWORD PTR [rbx+rax] + + punpcklbw mm3, mm2 + punpcklbw mm7, mm1 + + psadbw mm3, mm0 + psadbw mm7, mm0 + + movd mm2, QWORD PTR [rdi] + movd mm1, QWORD PTR [rdi+rax] + + paddw mm3, mm4 + paddw mm7, mm5 + + movd [rsi+4], mm3 + punpcklbw mm2, mm1 + + movd [rsi+8], mm7 + psadbw mm2, mm0 + + paddw mm2, mm6 + movd [rsi+12], mm2 + + + ; begin epilog + pop rbx + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret diff --git a/vp8/encoder/x86/sad_ssse3.asm b/vp8/encoder/x86/sad_ssse3.asm new file mode 100644 index 000000000..1bb956121 --- /dev/null +++ b/vp8/encoder/x86/sad_ssse3.asm @@ -0,0 +1,367 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%idefine QWORD + +%macro PROCESS_16X2X3 1 +%if %1 + movdqa xmm0, [rsi] + lddqu xmm5, [rdi] + lddqu xmm6, [rdi+1] + lddqu xmm7, [rdi+2] + + psadbw xmm5, xmm0 + psadbw xmm6, xmm0 + psadbw xmm7, xmm0 +%else + movdqa xmm0, [rsi] + lddqu xmm1, [rdi] + lddqu xmm2, [rdi+1] + lddqu xmm3, [rdi+2] + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endif + movdqa xmm0, QWORD PTR [rsi+rax] + lddqu xmm1, QWORD PTR [rdi+rdx] + lddqu xmm2, QWORD PTR [rdi+rdx+1] + lddqu xmm3, QWORD PTR [rdi+rdx+2] + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endmacro + +%macro PROCESS_16X2X3_OFFSET 2 +%if %1 + movdqa xmm0, [rsi] + movdqa xmm4, [rdi] + movdqa xmm7, [rdi+16] + + movdqa xmm5, xmm7 + palignr xmm5, xmm4, %2 + + movdqa xmm6, xmm7 + palignr xmm6, xmm4, (%2+1) + + palignr xmm7, xmm4, (%2+2) + + psadbw xmm5, xmm0 + psadbw xmm6, xmm0 + psadbw xmm7, xmm0 +%else + movdqa xmm0, [rsi] + movdqa xmm4, [rdi] + movdqa xmm3, [rdi+16] + + movdqa xmm1, xmm3 + palignr xmm1, xmm4, %2 + + movdqa xmm2, xmm3 + palignr xmm2, xmm4, (%2+1) + + palignr xmm3, xmm4, (%2+2) + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endif + movdqa xmm0, QWORD PTR [rsi+rax] + movdqa xmm4, QWORD PTR [rdi+rdx] + movdqa xmm3, QWORD PTR [rdi+rdx+16] + + movdqa xmm1, xmm3 + palignr xmm1, xmm4, %2 + + movdqa xmm2, xmm3 + palignr xmm2, xmm4, (%2+1) + + palignr xmm3, xmm4, (%2+2) + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endmacro + +%macro PROCESS_16X16X3_OFFSET 2 +%2_aligned_by_%1: + + sub rdi, %1 + + PROCESS_16X2X3_OFFSET 1, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 
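+    ; three more row pairs below complete the 16 rows; each step reads two
+    ; aligned 16-byte rows and builds the ref, ref+1 and ref+2 candidates
+    ; with palignr rather than unaligned lddqu loads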
+ PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + + jmp %2_store_off + +%endmacro + +%macro PROCESS_16X8X3_OFFSET 2 +%2_aligned_by_%1: + + sub rdi, %1 + + PROCESS_16X2X3_OFFSET 1, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + + jmp %2_store_off + +%endmacro + +;void int vp8_sad16x16x3_ssse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vp8_sad16x16x3_ssse3) +sym(vp8_sad16x16x3_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + push rcx + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + mov rdx, 0xf + and rdx, rdi + + jmp vp8_sad16x16x3_ssse3_skiptable +vp8_sad16x16x3_ssse3_jumptable: + dd vp8_sad16x16x3_ssse3_aligned_by_0 - vp8_sad16x16x3_ssse3_do_jump + dd vp8_sad16x16x3_ssse3_aligned_by_1 - vp8_sad16x16x3_ssse3_do_jump + dd vp8_sad16x16x3_ssse3_aligned_by_2 - vp8_sad16x16x3_ssse3_do_jump + dd vp8_sad16x16x3_ssse3_aligned_by_3 - vp8_sad16x16x3_ssse3_do_jump + dd vp8_sad16x16x3_ssse3_aligned_by_4 - vp8_sad16x16x3_ssse3_do_jump + dd vp8_sad16x16x3_ssse3_aligned_by_5 - vp8_sad16x16x3_ssse3_do_jump + dd vp8_sad16x16x3_ssse3_aligned_by_6 - vp8_sad16x16x3_ssse3_do_jump + dd vp8_sad16x16x3_ssse3_aligned_by_7 - vp8_sad16x16x3_ssse3_do_jump + dd vp8_sad16x16x3_ssse3_aligned_by_8 - vp8_sad16x16x3_ssse3_do_jump + dd vp8_sad16x16x3_ssse3_aligned_by_9 - vp8_sad16x16x3_ssse3_do_jump + dd vp8_sad16x16x3_ssse3_aligned_by_10 - vp8_sad16x16x3_ssse3_do_jump + dd vp8_sad16x16x3_ssse3_aligned_by_11 - vp8_sad16x16x3_ssse3_do_jump + dd vp8_sad16x16x3_ssse3_aligned_by_12 - vp8_sad16x16x3_ssse3_do_jump + dd vp8_sad16x16x3_ssse3_aligned_by_13 - vp8_sad16x16x3_ssse3_do_jump + dd vp8_sad16x16x3_ssse3_aligned_by_14 - vp8_sad16x16x3_ssse3_do_jump + dd vp8_sad16x16x3_ssse3_aligned_by_15 - vp8_sad16x16x3_ssse3_do_jump +vp8_sad16x16x3_ssse3_skiptable: + + call vp8_sad16x16x3_ssse3_do_jump +vp8_sad16x16x3_ssse3_do_jump: + pop rcx ; get the address of do_jump + mov rax, vp8_sad16x16x3_ssse3_jumptable - vp8_sad16x16x3_ssse3_do_jump + add rax, rcx ; get the absolute address of vp8_sad16x16x3_ssse3_jumptable + + movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable + add rcx, rax + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + jmp rcx + + PROCESS_16X16X3_OFFSET 0, vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 1, vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 2, vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 3, vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 4, vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 5, vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 6, vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 7, vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 8, vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 9, vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 10, vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 11, vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 12, vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 13, vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 14, vp8_sad16x16x3_ssse3 + +vp8_sad16x16x3_ssse3_aligned_by_15: + PROCESS_16X2X3 1 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + +vp8_sad16x16x3_ssse3_store_off: + mov rdi, arg(4) ;Results + + movq xmm0, xmm5 + psrldq xmm5, 8 + + paddw xmm0, xmm5 + movd [rdi], xmm0 +;- + movq xmm0, xmm6 + psrldq xmm6, 8 + + paddw xmm0, 
xmm6 + movd [rdi+4], xmm0 +;- + movq xmm0, xmm7 + psrldq xmm7, 8 + + paddw xmm0, xmm7 + movd [rdi+8], xmm0 + + ; begin epilog + pop rcx + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void int vp8_sad16x8x3_ssse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(vp8_sad16x8x3_ssse3) +sym(vp8_sad16x8x3_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + push rcx + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + mov rdx, 0xf + and rdx, rdi + + jmp vp8_sad16x8x3_ssse3_skiptable +vp8_sad16x8x3_ssse3_jumptable: + dd vp8_sad16x8x3_ssse3_aligned_by_0 - vp8_sad16x8x3_ssse3_do_jump + dd vp8_sad16x8x3_ssse3_aligned_by_1 - vp8_sad16x8x3_ssse3_do_jump + dd vp8_sad16x8x3_ssse3_aligned_by_2 - vp8_sad16x8x3_ssse3_do_jump + dd vp8_sad16x8x3_ssse3_aligned_by_3 - vp8_sad16x8x3_ssse3_do_jump + dd vp8_sad16x8x3_ssse3_aligned_by_4 - vp8_sad16x8x3_ssse3_do_jump + dd vp8_sad16x8x3_ssse3_aligned_by_5 - vp8_sad16x8x3_ssse3_do_jump + dd vp8_sad16x8x3_ssse3_aligned_by_6 - vp8_sad16x8x3_ssse3_do_jump + dd vp8_sad16x8x3_ssse3_aligned_by_7 - vp8_sad16x8x3_ssse3_do_jump + dd vp8_sad16x8x3_ssse3_aligned_by_8 - vp8_sad16x8x3_ssse3_do_jump + dd vp8_sad16x8x3_ssse3_aligned_by_9 - vp8_sad16x8x3_ssse3_do_jump + dd vp8_sad16x8x3_ssse3_aligned_by_10 - vp8_sad16x8x3_ssse3_do_jump + dd vp8_sad16x8x3_ssse3_aligned_by_11 - vp8_sad16x8x3_ssse3_do_jump + dd vp8_sad16x8x3_ssse3_aligned_by_12 - vp8_sad16x8x3_ssse3_do_jump + dd vp8_sad16x8x3_ssse3_aligned_by_13 - vp8_sad16x8x3_ssse3_do_jump + dd vp8_sad16x8x3_ssse3_aligned_by_14 - vp8_sad16x8x3_ssse3_do_jump + dd vp8_sad16x8x3_ssse3_aligned_by_15 - vp8_sad16x8x3_ssse3_do_jump +vp8_sad16x8x3_ssse3_skiptable: + + call vp8_sad16x8x3_ssse3_do_jump +vp8_sad16x8x3_ssse3_do_jump: + pop rcx ; get the address of do_jump + mov rax, vp8_sad16x8x3_ssse3_jumptable - vp8_sad16x8x3_ssse3_do_jump + add rax, rcx ; get the absolute address of vp8_sad16x8x3_ssse3_jumptable + + movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable + add rcx, rax + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + jmp rcx + + PROCESS_16X8X3_OFFSET 0, vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 1, vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 2, vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 3, vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 4, vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 5, vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 6, vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 7, vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 8, vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 9, vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 10, vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 11, vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 12, vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 13, vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 14, vp8_sad16x8x3_ssse3 + +vp8_sad16x8x3_ssse3_aligned_by_15: + + PROCESS_16X2X3 1 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + +vp8_sad16x8x3_ssse3_store_off: + mov rdi, arg(4) ;Results + + movq xmm0, xmm5 + psrldq xmm5, 8 + + paddw xmm0, xmm5 + movd [rdi], xmm0 +;- + movq xmm0, xmm6 + psrldq xmm6, 8 + + paddw xmm0, xmm6 + movd [rdi+4], xmm0 +;- + movq xmm0, xmm7 + psrldq xmm7, 8 + + paddw xmm0, xmm7 + movd [rdi+8], xmm0 + + ; begin epilog + pop rcx + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret diff --git a/vp8/encoder/x86/subtract_mmx.asm b/vp8/encoder/x86/subtract_mmx.asm new file mode 100644 index 
000000000..ce3e61066 --- /dev/null +++ b/vp8/encoder/x86/subtract_mmx.asm @@ -0,0 +1,431 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride, +; unsigned short *diff, unsigned char *Predictor, +; int pitch); +global sym(vp8_subtract_b_mmx_impl) +sym(vp8_subtract_b_mmx_impl) + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + + mov rdi, arg(2) ;diff + mov rax, arg(3) ;Predictor + mov rsi, arg(0) ;z + movsxd rdx, dword ptr arg(1);src_stride; + movsxd rcx, dword ptr arg(4);pitch + pxor mm7, mm7 + + movd mm0, [rsi] + movd mm1, [rax] + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + psubw mm0, mm1 + movq [rdi], mm0 + + + movd mm0, [rsi+rdx] + movd mm1, [rax+rcx] + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + psubw mm0, mm1 + movq [rdi+rcx*2],mm0 + + + movd mm0, [rsi+rdx*2] + movd mm1, [rax+rcx*2] + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + psubw mm0, mm1 + movq [rdi+rcx*4], mm0 + + lea rsi, [rsi+rdx*2] + lea rcx, [rcx+rcx*2] + + + + movd mm0, [rsi+rdx] + movd mm1, [rax+rcx] + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + psubw mm0, mm1 + movq [rdi+rcx*2], mm0 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride) +global sym(vp8_subtract_mby_mmx) +sym(vp8_subtract_mby_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + + mov rsi, arg(1) ;src + mov rdi, arg(0) ;diff + + mov rax, arg(2) ;pred + movsxd rdx, dword ptr arg(3) ;stride + + mov rcx, 16 + pxor mm0, mm0 + +submby_loop: + + movq mm1, [rsi] + movq mm3, [rax] + + movq mm2, mm1 + movq mm4, mm3 + + punpcklbw mm1, mm0 + punpcklbw mm3, mm0 + + punpckhbw mm2, mm0 + punpckhbw mm4, mm0 + + psubw mm1, mm3 + psubw mm2, mm4 + + movq [rdi], mm1 + movq [rdi+8], mm2 + + + movq mm1, [rsi+8] + movq mm3, [rax+8] + + movq mm2, mm1 + movq mm4, mm3 + + punpcklbw mm1, mm0 + punpcklbw mm3, mm0 + + punpckhbw mm2, mm0 + punpckhbw mm4, mm0 + + psubw mm1, mm3 + psubw mm2, mm4 + + movq [rdi+16], mm1 + movq [rdi+24], mm2 + + + add rdi, 32 + add rax, 16 + + lea rsi, [rsi+rdx] + + sub rcx, 1 + jnz submby_loop + + pop rdi + pop rsi + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) +global sym(vp8_subtract_mbuv_mmx) +sym(vp8_subtract_mbuv_mmx) + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + ;short *udiff = diff + 256; + ;short *vdiff = diff + 320; + ;unsigned char *upred = pred + 256; + ;unsigned char *vpred = pred + 320; + + ;unsigned char *z = usrc; + ;unsigned short *diff = udiff; + ;unsigned char *Predictor= upred; + + mov rdi, arg(0) ;diff + mov rax, arg(3) ;pred + mov rsi, arg(1) ;z = usrc + add rdi, 256*2 ;diff = diff + 256 (shorts) + add rax, 256 ;Predictor = pred + 256 + movsxd rdx, dword ptr arg(4) ;stride; + pxor mm7, mm7 + + movq mm0, [rsi] + movq mm1, [rax] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + movq [rdi], mm0 + 
movq [rdi+8], mm3 + + + movq mm0, [rsi+rdx] + movq mm1, [rax+8] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + movq [rdi+16], mm0 + movq [rdi+24], mm3 + + movq mm0, [rsi+rdx*2] + movq mm1, [rax+16] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + movq [rdi+32], mm0 + movq [rdi+40], mm3 + lea rsi, [rsi+rdx*2] + + + movq mm0, [rsi+rdx] + movq mm1, [rax+24] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + + movq [rdi+48], mm0 + movq [rdi+56], mm3 + + + add rdi, 64 + add rax, 32 + lea rsi, [rsi+rdx*2] + + + movq mm0, [rsi] + movq mm1, [rax] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + movq [rdi], mm0 + movq [rdi+8], mm3 + + + movq mm0, [rsi+rdx] + movq mm1, [rax+8] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + movq [rdi+16], mm0 + movq [rdi+24], mm3 + + movq mm0, [rsi+rdx*2] + movq mm1, [rax+16] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + movq [rdi+32], mm0 + movq [rdi+40], mm3 + lea rsi, [rsi+rdx*2] + + + movq mm0, [rsi+rdx] + movq mm1, [rax+24] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + + movq [rdi+48], mm0 + movq [rdi+56], mm3 + + ;unsigned char *z = vsrc; + ;unsigned short *diff = vdiff; + ;unsigned char *Predictor= vpred; + + mov rdi, arg(0) ;diff + mov rax, arg(3) ;pred + mov rsi, arg(2) ;z = usrc + add rdi, 320*2 ;diff = diff + 320 (shorts) + add rax, 320 ;Predictor = pred + 320 + movsxd rdx, dword ptr arg(4) ;stride; + pxor mm7, mm7 + + movq mm0, [rsi] + movq mm1, [rax] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + movq [rdi], mm0 + movq [rdi+8], mm3 + + + movq mm0, [rsi+rdx] + movq mm1, [rax+8] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + movq [rdi+16], mm0 + movq [rdi+24], mm3 + + movq mm0, [rsi+rdx*2] + movq mm1, [rax+16] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + movq [rdi+32], mm0 + movq [rdi+40], mm3 + lea rsi, [rsi+rdx*2] + + + movq mm0, [rsi+rdx] + movq mm1, [rax+24] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + + movq [rdi+48], mm0 + movq [rdi+56], mm3 + + + add rdi, 64 + add rax, 32 + lea rsi, [rsi+rdx*2] + + + movq mm0, [rsi] + movq mm1, [rax] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + movq [rdi], mm0 + movq [rdi+8], mm3 + + + movq mm0, [rsi+rdx] + movq mm1, [rax+8] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + movq 
[rdi+16], mm0 + movq [rdi+24], mm3 + + movq mm0, [rsi+rdx*2] + movq mm1, [rax+16] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + movq [rdi+32], mm0 + movq [rdi+40], mm3 + lea rsi, [rsi+rdx*2] + + + movq mm0, [rsi+rdx] + movq mm1, [rax+24] + movq mm3, mm0 + movq mm4, mm1 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + punpckhbw mm3, mm7 + punpckhbw mm4, mm7 + psubw mm0, mm1 + psubw mm3, mm4 + + movq [rdi+48], mm0 + movq [rdi+56], mm3 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret diff --git a/vp8/encoder/x86/variance_impl_mmx.asm b/vp8/encoder/x86/variance_impl_mmx.asm new file mode 100644 index 000000000..d0da82ad4 --- /dev/null +++ b/vp8/encoder/x86/variance_impl_mmx.asm @@ -0,0 +1,980 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;unsigned int vp8_get_mb_ss_mmx( short *src_ptr ) +global sym(vp8_get_mb_ss_mmx) +sym(vp8_get_mb_ss_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + GET_GOT rbx + push rsi + push rdi + sub rsp, 8 + ; end prolog + + mov rax, arg(0) ;src_ptr + mov rcx, 16 + pxor mm4, mm4 + +NEXTROW: + movq mm0, [rax] + movq mm1, [rax+8] + movq mm2, [rax+16] + movq mm3, [rax+24] + pmaddwd mm0, mm0 + pmaddwd mm1, mm1 + pmaddwd mm2, mm2 + pmaddwd mm3, mm3 + + paddd mm4, mm0 + paddd mm4, mm1 + paddd mm4, mm2 + paddd mm4, mm3 + + add rax, 32 + dec rcx + ja NEXTROW + movq QWORD PTR [rsp], mm4 + + ;return sum[0]+sum[1]; + movsxd rax, dword ptr [rsp] + movsxd rcx, dword ptr [rsp+4] + add rax, rcx + + + ; begin epilog + add rsp, 8 + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vp8_get8x8var_mmx +;( +; unsigned char *src_ptr, +; int source_stride, +; unsigned char *ref_ptr, +; int recon_stride, +; unsigned int *SSE, +; int *Sum +;) +global sym(vp8_get8x8var_mmx) +sym(vp8_get8x8var_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + push rbx + sub rsp, 16 + ; end prolog + + + pxor mm5, mm5 ; Blank mmx6 + pxor mm6, mm6 ; Blank mmx7 + pxor mm7, mm7 ; Blank mmx7 + + mov rax, arg(0) ;[src_ptr] ; Load base addresses + mov rbx, arg(2) ;[ref_ptr] + movsxd rcx, dword ptr arg(1) ;[source_stride] + movsxd rdx, dword ptr arg(3) ;[recon_stride] + + ; Row 1 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm1, [rbx] ; Copy eight bytes to mm1 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + + ; Row 2 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher 
prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Row 3 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Row 4 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Row 5 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + ; movq mm4, [rbx + rdx] + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Row 6 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; 
accumulate in mm7 + + ; Row 7 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Row 8 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Now accumulate the final results. + movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory + movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory + movsx rdx, WORD PTR [rsp+8] + movsx rcx, WORD PTR [rsp+10] + movsx rbx, WORD PTR [rsp+12] + movsx rax, WORD PTR [rsp+14] + add rdx, rcx + add rbx, rax + add rdx, rbx ;XSum + movsxd rax, DWORD PTR [rsp] + movsxd rcx, DWORD PTR [rsp+4] + add rax, rcx ;XXSum + mov rsi, arg(4) ;SSE + mov rdi, arg(5) ;Sum + mov dword ptr [rsi], eax + mov dword ptr [rdi], edx + xor rax, rax ; return 0 + + + ; begin epilog + add rsp, 16 + pop rbx + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + + +;unsigned int +;vp8_get4x4var_mmx +;( +; unsigned char *src_ptr, +; int source_stride, +; unsigned char *ref_ptr, +; int recon_stride, +; unsigned int *SSE, +; int *Sum +;) +global sym(vp8_get4x4var_mmx) +sym(vp8_get4x4var_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + push rbx + sub rsp, 16 + ; end prolog + + + pxor mm5, mm5 ; Blank mmx6 + pxor mm6, mm6 ; Blank mmx7 + pxor mm7, mm7 ; Blank mmx7 + + mov rax, arg(0) ;[src_ptr] ; Load base addresses + mov rbx, arg(2) ;[ref_ptr] + movsxd rcx, dword ptr arg(1) ;[source_stride] + movsxd rdx, dword ptr arg(3) ;[recon_stride] + + ; Row 1 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm1, [rbx] ; Copy eight bytes to mm1 + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + paddw mm5, mm0 ; accumulate differences in mm5 + pmaddwd mm0, mm0 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + + + ; Row 2 + movq mm0, [rax] ; Copy eight bytes to mm0 + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + paddw mm5, mm0 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; 
+
+ ; Row 3
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ paddw mm5, mm0 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 4
+ movq mm0, [rax] ; Copy eight bytes to mm0
+
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ paddd mm7, mm0 ; accumulate in mm7
+
+
+ ; Now accumulate the final results.
+ movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
+ movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
+ movsx rdx, WORD PTR [rsp+8]
+ movsx rcx, WORD PTR [rsp+10]
+ movsx rbx, WORD PTR [rsp+12]
+ movsx rax, WORD PTR [rsp+14]
+ add rdx, rcx
+ add rbx, rax
+ add rdx, rbx ;XSum
+ movsxd rax, DWORD PTR [rsp]
+ movsxd rcx, DWORD PTR [rsp+4]
+ add rax, rcx ;XXSum
+ mov rsi, arg(4) ;SSE
+ mov rdi, arg(5) ;Sum
+ mov dword ptr [rsi], eax
+ mov dword ptr [rdi], edx
+ xor rax, rax ; return 0
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+;unsigned int
+;vp8_get4x4sse_cs_mmx
+;(
+; unsigned char *src_ptr,
+; int source_stride,
+; unsigned char *ref_ptr,
+; int recon_stride
+;)
+global sym(vp8_get4x4sse_cs_mmx)
+sym(vp8_get4x4sse_cs_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+
+ pxor mm6, mm6 ; Blank mm6
+ pxor mm7, mm7 ; Blank mm7
+
+ mov rax, arg(0) ;[src_ptr] ; Load base addresses
+ mov rbx, arg(2) ;[ref_ptr]
+ movsxd rcx, dword ptr arg(1) ;[source_stride]
+ movsxd rdx, dword ptr arg(3) ;[recon_stride]
+ ; Row 1
+ movd mm0, [rax] ; Copy four bytes to mm0
+ movd mm1, [rbx] ; Copy four bytes to mm1
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movd mm1, [rbx] ; Copy four bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 2
+ movd mm0, [rax] ; Copy four bytes to mm0
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movd mm1, [rbx] ; Copy four bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 3
+ movd mm0, [rax] ; Copy four bytes to mm0
+ punpcklbw mm1, mm6
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movd mm1, [rbx] ; Copy four bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 4
+ movd mm0, [rax] ; Copy four bytes to mm0
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ pmaddwd mm0, mm0 ; square and accumulate
+ paddd mm7, mm0 ; accumulate in mm7
+
+ movq mm0, mm7 ;
+ psrlq mm7,
32 + + paddd mm0, mm7 + movd rax, mm0 + + + ; begin epilog + pop rbx + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +%define mmx_filter_shift 7 + +;void vp8_filter_block2d_bil4x4_var_mmx +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned short *HFilter, +; unsigned short *VFilter, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp8_filter_block2d_bil4x4_var_mmx) +sym(vp8_filter_block2d_bil4x4_var_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + GET_GOT rbx + push rsi + push rdi + sub rsp, 16 + ; end prolog + + + pxor mm6, mm6 ; + pxor mm7, mm7 ; + + mov rax, arg(4) ;HFilter ; + mov rdx, arg(5) ;VFilter ; + + mov rsi, arg(0) ;ref_ptr ; + mov rdi, arg(2) ;src_ptr ; + + mov rcx, 4 ; + pxor mm0, mm0 ; + + movd mm1, [rsi] ; + movd mm3, [rsi+1] ; + + punpcklbw mm1, mm0 ; + pmullw mm1, [rax] ; + + punpcklbw mm3, mm0 ; + pmullw mm3, [rax+8] ; + + paddw mm1, mm3 ; + paddw mm1, [mmx_bi_rd GLOBAL] ; + + psraw mm1, mmx_filter_shift ; + movq mm5, mm1 + +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line ; +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; + add rsi, r8 +%endif + +filter_block2d_bil4x4_var_mmx_loop: + + movd mm1, [rsi] ; + movd mm3, [rsi+1] ; + + punpcklbw mm1, mm0 ; + pmullw mm1, [rax] ; + + punpcklbw mm3, mm0 ; + pmullw mm3, [rax+8] ; + + paddw mm1, mm3 ; + paddw mm1, [mmx_bi_rd GLOBAL] ; + + psraw mm1, mmx_filter_shift ; + movq mm3, mm5 ; + + movq mm5, mm1 ; + pmullw mm3, [rdx] ; + + pmullw mm1, [rdx+8] ; + paddw mm1, mm3 ; + + + paddw mm1, [mmx_bi_rd GLOBAL] ; + psraw mm1, mmx_filter_shift ; + + movd mm3, [rdi] ; + punpcklbw mm3, mm0 ; + + psubw mm1, mm3 ; + paddw mm6, mm1 ; + + pmaddwd mm1, mm1 ; + paddd mm7, mm1 ; + +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line ; + add rdi, dword ptr arg(3) ;src_pixels_per_line ; +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line + movsxd r9, dword ptr arg(3) ;src_pixels_per_line + add rsi, r8 + add rdi, r9 +%endif + sub rcx, 1 ; + jnz filter_block2d_bil4x4_var_mmx_loop ; + + + pxor mm3, mm3 ; + pxor mm2, mm2 ; + + punpcklwd mm2, mm6 ; + punpckhwd mm3, mm6 ; + + paddd mm2, mm3 ; + movq mm6, mm2 ; + + psrlq mm6, 32 ; + paddd mm2, mm6 ; + + psrad mm2, 16 ; + movq mm4, mm7 ; + + psrlq mm4, 32 ; + paddd mm4, mm7 ; + + mov rdi, arg(6) ;sum + mov rsi, arg(7) ;sumsquared + + movd dword ptr [rdi], mm2 ; + movd dword ptr [rsi], mm4 ; + + + + ; begin epilog + add rsp, 16 + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + + + +;void vp8_filter_block2d_bil_var_mmx +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; unsigned short *HFilter, +; unsigned short *VFilter, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp8_filter_block2d_bil_var_mmx) +sym(vp8_filter_block2d_bil_var_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + GET_GOT rbx + push rsi + push rdi + sub rsp, 16 + ; end prolog + + pxor mm6, mm6 ; + pxor mm7, mm7 ; + mov rax, arg(5) ;HFilter ; + + mov rdx, arg(6) ;VFilter ; + mov rsi, arg(0) ;ref_ptr ; + + mov rdi, arg(2) ;src_ptr ; + movsxd rcx, dword ptr arg(4) ;Height ; + + pxor mm0, mm0 ; + movq mm1, [rsi] ; + + movq mm3, [rsi+1] ; + movq mm2, mm1 ; + + movq mm4, mm3 ; + punpcklbw mm1, mm0 ; + + punpckhbw mm2, mm0 ; + pmullw mm1, [rax] ; + + pmullw mm2, [rax] ; + punpcklbw mm3, mm0 ; + + punpckhbw mm4, mm0 ; + pmullw mm3, [rax+8] ; + + pmullw mm4, [rax+8] ; + paddw mm1, 
mm3 ; + + paddw mm2, mm4 ; + paddw mm1, [mmx_bi_rd GLOBAL] ; + + psraw mm1, mmx_filter_shift ; + paddw mm2, [mmx_bi_rd GLOBAL] ; + + psraw mm2, mmx_filter_shift ; + movq mm5, mm1 + + packuswb mm5, mm2 ; +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line + add rsi, r8 +%endif + +filter_block2d_bil_var_mmx_loop: + + movq mm1, [rsi] ; + movq mm3, [rsi+1] ; + + movq mm2, mm1 ; + movq mm4, mm3 ; + + punpcklbw mm1, mm0 ; + punpckhbw mm2, mm0 ; + + pmullw mm1, [rax] ; + pmullw mm2, [rax] ; + + punpcklbw mm3, mm0 ; + punpckhbw mm4, mm0 ; + + pmullw mm3, [rax+8] ; + pmullw mm4, [rax+8] ; + + paddw mm1, mm3 ; + paddw mm2, mm4 ; + + paddw mm1, [mmx_bi_rd GLOBAL] ; + psraw mm1, mmx_filter_shift ; + + paddw mm2, [mmx_bi_rd GLOBAL] ; + psraw mm2, mmx_filter_shift ; + + movq mm3, mm5 ; + movq mm4, mm5 ; + + punpcklbw mm3, mm0 ; + punpckhbw mm4, mm0 ; + + movq mm5, mm1 ; + packuswb mm5, mm2 ; + + pmullw mm3, [rdx] ; + pmullw mm4, [rdx] ; + + pmullw mm1, [rdx+8] ; + pmullw mm2, [rdx+8] ; + + paddw mm1, mm3 ; + paddw mm2, mm4 ; + + paddw mm1, [mmx_bi_rd GLOBAL] ; + paddw mm2, [mmx_bi_rd GLOBAL] ; + + psraw mm1, mmx_filter_shift ; + psraw mm2, mmx_filter_shift ; + + movq mm3, [rdi] ; + movq mm4, mm3 ; + + punpcklbw mm3, mm0 ; + punpckhbw mm4, mm0 ; + + psubw mm1, mm3 ; + psubw mm2, mm4 ; + + paddw mm6, mm1 ; + pmaddwd mm1, mm1 ; + + paddw mm6, mm2 ; + pmaddwd mm2, mm2 ; + + paddd mm7, mm1 ; + paddd mm7, mm2 ; + +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line ; + add rdi, dword ptr arg(3) ;src_pixels_per_line ; +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; + movsxd r9, dword ptr arg(3) ;src_pixels_per_line ; + add rsi, r8 + add rdi, r9 +%endif + sub rcx, 1 ; + jnz filter_block2d_bil_var_mmx_loop ; + + + pxor mm3, mm3 ; + pxor mm2, mm2 ; + + punpcklwd mm2, mm6 ; + punpckhwd mm3, mm6 ; + + paddd mm2, mm3 ; + movq mm6, mm2 ; + + psrlq mm6, 32 ; + paddd mm2, mm6 ; + + psrad mm2, 16 ; + movq mm4, mm7 ; + + psrlq mm4, 32 ; + paddd mm4, mm7 ; + + mov rdi, arg(7) ;sum + mov rsi, arg(8) ;sumsquared + + movd dword ptr [rdi], mm2 ; + movd dword ptr [rsi], mm4 ; + + ; begin epilog + add rsp, 16 + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +;unsigned int vp8_get16x16pred_error_mmx +;( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride +;) +global sym(vp8_get16x16pred_error_mmx) +sym(vp8_get16x16pred_error_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + GET_GOT rbx + push rsi + push rdi + sub rsp, 16 + ; end prolog + + mov rsi, arg(0) ;DWORD PTR [src_ptr] + mov rdi, arg(2) ;DWORD PTR [ref_ptr] + + movsxd rax, DWORD PTR arg(1) ;[src_stride] + movsxd rdx, DWORD PTR arg(3) ;[ref_stride] + + pxor mm0, mm0 ; clear xmm0 for unpack + pxor mm7, mm7 ; clear xmm7 for accumulating diffs + + pxor mm6, mm6 ; clear xmm6 for accumulating sse + mov rcx, 16 + +var16loop: + + movq mm1, [rsi] + movq mm2, [rdi] + + movq mm3, mm1 + movq mm4, mm2 + + punpcklbw mm1, mm0 + punpckhbw mm3, mm0 + + punpcklbw mm2, mm0 + punpckhbw mm4, mm0 + + psubw mm1, mm2 + psubw mm3, mm4 + + paddw mm7, mm1 + pmaddwd mm1, mm1 + + paddw mm7, mm3 + pmaddwd mm3, mm3 + + paddd mm6, mm1 + paddd mm6, mm3 + + + movq mm1, [rsi+8] + movq mm2, [rdi+8] + + movq mm3, mm1 + movq mm4, mm2 + + punpcklbw mm1, mm0 + punpckhbw mm3, mm0 + + punpcklbw mm2, mm0 + punpckhbw mm4, mm0 + + psubw mm1, mm2 + psubw mm3, mm4 + + paddw mm7, mm1 + pmaddwd mm1, mm1 + + paddw mm7, mm3 + pmaddwd mm3, mm3 + + paddd mm6, 
mm1 + paddd mm6, mm3 + + add rsi, rax + add rdi, rdx + + sub rcx, 1 + jnz var16loop + + + movq mm1, mm6 + pxor mm6, mm6 + + pxor mm5, mm5 + punpcklwd mm6, mm7 + + punpckhwd mm5, mm7 + psrad mm5, 16 + + psrad mm6, 16 + paddd mm6, mm5 + + movq mm2, mm1 + psrlq mm1, 32 + + paddd mm2, mm1 + movq mm7, mm6 + + psrlq mm6, 32 + paddd mm6, mm7 + + movd DWORD PTR [rsp], mm6 ;Sum + movd DWORD PTR [rsp+4], mm2 ;SSE + + ; return (SSE-((Sum*Sum)>>8)); + movsxd rdx, dword ptr [rsp] + imul rdx, rdx + sar rdx, 8 + movsxd rax, dword ptr [rsp + 4] + sub rax, rdx + + + ; begin epilog + add rsp, 16 + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + + +SECTION_RODATA +;short mmx_bi_rd[4] = { 64, 64, 64, 64}; +align 16 +mmx_bi_rd: + times 4 dw 64 diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm new file mode 100644 index 000000000..7e5ee284b --- /dev/null +++ b/vp8/encoder/x86/variance_impl_sse2.asm @@ -0,0 +1,975 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%define xmm_filter_shift 7 + +;unsigned int vp8_get_mb_ss_sse2 +;( +; short *src_ptr +;) +global sym(vp8_get_mb_ss_sse2) +sym(vp8_get_mb_ss_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 1 + GET_GOT rbx + push rsi + push rdi + sub rsp, 16 + ; end prolog + + + mov rax, arg(0) ;[src_ptr] + mov rcx, 8 + pxor xmm4, xmm4 + +NEXTROW: + movdqa xmm0, [rax] + movdqa xmm1, [rax+16] + movdqa xmm2, [rax+32] + movdqa xmm3, [rax+48] + pmaddwd xmm0, xmm0 + pmaddwd xmm1, xmm1 + pmaddwd xmm2, xmm2 + pmaddwd xmm3, xmm3 + + paddd xmm0, xmm1 + paddd xmm2, xmm3 + paddd xmm4, xmm0 + paddd xmm4, xmm2 + + add rax, 0x40 + dec rcx + ja NEXTROW + + movdqa xmm3,xmm4 + psrldq xmm4,8 + paddd xmm4,xmm3 + movdqa xmm3,xmm4 + psrldq xmm4,4 + paddd xmm4,xmm3 + movd rax,xmm4 + + + ; begin epilog + add rsp, 16 + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vp8_get16x16var_sse2 +;( +; unsigned char * src_ptr, +; int source_stride, +; unsigned char * ref_ptr, +; int recon_stride, +; unsigned int * SSE, +; int * Sum +;) +global sym(vp8_get16x16var_sse2) +sym(vp8_get16x16var_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + sub rsp, 16 + ; end prolog + + mov rsi, arg(0) ;[src_ptr] + mov rdi, arg(2) ;[ref_ptr] + + movsxd rax, DWORD PTR arg(1) ;[source_stride] + movsxd rdx, DWORD PTR arg(3) ;[recon_stride] + + pxor xmm0, xmm0 ; clear xmm0 for unpack + pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs + + pxor xmm6, xmm6 ; clear xmm6 for accumulating sse + mov rcx, 16 + +var16loop: + movdqu xmm1, XMMWORD PTR [rsi] + movdqu xmm2, XMMWORD PTR [rdi] + + movdqa xmm3, xmm1 + movdqa xmm4, xmm2 + + + punpcklbw xmm1, xmm0 + punpckhbw xmm3, xmm0 + + punpcklbw xmm2, xmm0 + punpckhbw xmm4, xmm0 + + + psubw xmm1, xmm2 + psubw xmm3, xmm4 + + paddw xmm7, xmm1 + pmaddwd xmm1, xmm1 + + paddw xmm7, xmm3 + pmaddwd xmm3, xmm3 + + paddd xmm6, xmm1 + paddd xmm6, xmm3 + + add rsi, rax + add rdi, rdx + + sub rcx, 1 + jnz var16loop + + + movdqa xmm1, xmm6 + pxor xmm6, xmm6 + + pxor xmm5, xmm5 + punpcklwd xmm6, xmm7 + + punpckhwd xmm5, xmm7 + psrad xmm5, 16 + + psrad xmm6, 16 + paddd xmm6, xmm5 + + movdqa xmm2, xmm1 + punpckldq xmm1, 
xmm0 + + punpckhdq xmm2, xmm0 + movdqa xmm7, xmm6 + + paddd xmm1, xmm2 + punpckldq xmm6, xmm0 + + punpckhdq xmm7, xmm0 + paddd xmm6, xmm7 + + movdqa xmm2, xmm1 + movdqa xmm7, xmm6 + + psrldq xmm1, 8 + psrldq xmm6, 8 + + paddd xmm7, xmm6 + paddd xmm1, xmm2 + + mov rax, arg(5) ;[Sum] + mov rdi, arg(4) ;[SSE] + + movd DWORD PTR [rax], xmm7 + movd DWORD PTR [rdi], xmm1 + + + ; begin epilog + add rsp, 16 + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vp8_get16x16pred_error_sse2 +;( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride +;) +global sym(vp8_get16x16pred_error_sse2) +sym(vp8_get16x16pred_error_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + GET_GOT rbx + push rsi + push rdi + sub rsp, 16 + ; end prolog + + mov rsi, arg(0) ;[src_ptr] + mov rdi, arg(2) ;[ref_ptr] + + movsxd rax, DWORD PTR arg(1) ;[src_stride] + movsxd rdx, DWORD PTR arg(3) ;[ref_stride] + + pxor xmm0, xmm0 ; clear xmm0 for unpack + pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs + + pxor xmm6, xmm6 ; clear xmm6 for accumulating sse + mov rcx, 16 + +var16peloop: + movdqu xmm1, XMMWORD PTR [rsi] + movdqu xmm2, XMMWORD PTR [rdi] + + movdqa xmm3, xmm1 + movdqa xmm4, xmm2 + + punpcklbw xmm1, xmm0 + punpckhbw xmm3, xmm0 + + punpcklbw xmm2, xmm0 + punpckhbw xmm4, xmm0 + + psubw xmm1, xmm2 + psubw xmm3, xmm4 + + paddw xmm7, xmm1 + pmaddwd xmm1, xmm1 + + paddw xmm7, xmm3 + pmaddwd xmm3, xmm3 + + paddd xmm6, xmm1 + paddd xmm6, xmm3 + + add rsi, rax + add rdi, rdx + + sub rcx, 1 + jnz var16peloop + + + movdqa xmm1, xmm6 + pxor xmm6, xmm6 + + pxor xmm5, xmm5 + punpcklwd xmm6, xmm7 + + punpckhwd xmm5, xmm7 + psrad xmm5, 16 + + psrad xmm6, 16 + paddd xmm6, xmm5 + + movdqa xmm2, xmm1 + punpckldq xmm1, xmm0 + + punpckhdq xmm2, xmm0 + movdqa xmm7, xmm6 + + paddd xmm1, xmm2 + punpckldq xmm6, xmm0 + + punpckhdq xmm7, xmm0 + paddd xmm6, xmm7 + + movdqa xmm2, xmm1 + movdqa xmm7, xmm6 + + psrldq xmm1, 8 + psrldq xmm6, 8 + + paddd xmm7, xmm6 + paddd xmm1, xmm2 + + movd DWORD PTR [rsp], xmm7 ;Sum + movd DWORD PTR [rsp+4], xmm1 ;SSE + + ; return (SSE-((Sum*Sum)>>8)); + movsxd rdx, dword ptr [rsp] + imul rdx, rdx + sar rdx, 8 + movsxd rax, dword ptr [rsp + 4] + sub rax, rdx + + ; begin epilog + add rsp, 16 + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + + +;unsigned int vp8_get8x8var_sse2 +;( +; unsigned char * src_ptr, +; int source_stride, +; unsigned char * ref_ptr, +; int recon_stride, +; unsigned int * SSE, +; int * Sum +;) +global sym(vp8_get8x8var_sse2) +sym(vp8_get8x8var_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + sub rsp, 16 + ; end prolog + + mov rsi, arg(0) ;[src_ptr] + mov rdi, arg(2) ;[ref_ptr] + + movsxd rax, DWORD PTR arg(1) ;[source_stride] + movsxd rdx, DWORD PTR arg(3) ;[recon_stride] + + pxor xmm0, xmm0 ; clear xmm0 for unpack + pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs + + movq xmm1, QWORD PTR [rsi] + movq xmm2, QWORD PTR [rdi] + + punpcklbw xmm1, xmm0 + punpcklbw xmm2, xmm0 + + psubsw xmm1, xmm2 + paddw xmm7, xmm1 + + pmaddwd xmm1, xmm1 + + movq xmm2, QWORD PTR[rsi + rax] + movq xmm3, QWORD PTR[rdi + rdx] + + punpcklbw xmm2, xmm0 + punpcklbw xmm3, xmm0 + + psubsw xmm2, xmm3 + paddw xmm7, xmm2 + + pmaddwd xmm2, xmm2 + paddd xmm1, xmm2 + + + movq xmm2, QWORD PTR[rsi + rax * 2] + movq xmm3, QWORD PTR[rdi + rdx * 2] + + punpcklbw xmm2, xmm0 + punpcklbw xmm3, xmm0 + + psubsw xmm2, xmm3 + paddw xmm7, xmm2 + + pmaddwd xmm2, xmm2 + paddd xmm1, xmm2 + + + lea 
rsi, [rsi + rax * 2] + lea rdi, [rdi + rdx * 2] + movq xmm2, QWORD PTR[rsi + rax] + movq xmm3, QWORD PTR[rdi + rdx] + + punpcklbw xmm2, xmm0 + punpcklbw xmm3, xmm0 + + psubsw xmm2, xmm3 + paddw xmm7, xmm2 + + pmaddwd xmm2, xmm2 + paddd xmm1, xmm2 + + movq xmm2, QWORD PTR[rsi + rax *2] + movq xmm3, QWORD PTR[rdi + rdx *2] + + punpcklbw xmm2, xmm0 + punpcklbw xmm3, xmm0 + + psubsw xmm2, xmm3 + paddw xmm7, xmm2 + + pmaddwd xmm2, xmm2 + paddd xmm1, xmm2 + + + lea rsi, [rsi + rax * 2] + lea rdi, [rdi + rdx * 2] + + + movq xmm2, QWORD PTR[rsi + rax] + movq xmm3, QWORD PTR[rdi + rdx] + + punpcklbw xmm2, xmm0 + punpcklbw xmm3, xmm0 + + psubsw xmm2, xmm3 + paddw xmm7, xmm2 + + pmaddwd xmm2, xmm2 + paddd xmm1, xmm2 + + movq xmm2, QWORD PTR[rsi + rax *2] + movq xmm3, QWORD PTR[rdi + rdx *2] + + punpcklbw xmm2, xmm0 + punpcklbw xmm3, xmm0 + + psubsw xmm2, xmm3 + paddw xmm7, xmm2 + + pmaddwd xmm2, xmm2 + paddd xmm1, xmm2 + + + lea rsi, [rsi + rax * 2] + lea rdi, [rdi + rdx * 2] + + movq xmm2, QWORD PTR[rsi + rax] + movq xmm3, QWORD PTR[rdi + rdx] + + punpcklbw xmm2, xmm0 + punpcklbw xmm3, xmm0 + + psubsw xmm2, xmm3 + paddw xmm7, xmm2 + + pmaddwd xmm2, xmm2 + paddd xmm1, xmm2 + + + movdqa xmm6, xmm7 + punpcklwd xmm6, xmm0 + + punpckhwd xmm7, xmm0 + movdqa xmm2, xmm1 + + paddw xmm6, xmm7 + punpckldq xmm1, xmm0 + + punpckhdq xmm2, xmm0 + movdqa xmm7, xmm6 + + paddd xmm1, xmm2 + punpckldq xmm6, xmm0 + + punpckhdq xmm7, xmm0 + paddw xmm6, xmm7 + + movdqa xmm2, xmm1 + movdqa xmm7, xmm6 + + psrldq xmm1, 8 + psrldq xmm6, 8 + + paddw xmm7, xmm6 + paddd xmm1, xmm2 + + mov rax, arg(5) ;[Sum] + mov rdi, arg(4) ;[SSE] + + movd rdx, xmm7 + movsx rcx, dx + + mov dword ptr [rax], ecx + movd DWORD PTR [rdi], xmm1 + + ; begin epilog + add rsp, 16 + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_filter_block2d_bil_var_sse2 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; unsigned short *HFilter, +; unsigned short *VFilter, +; int *sum, +; unsigned int *sumsquared;; +; +;) +global sym(vp8_filter_block2d_bil_var_sse2) +sym(vp8_filter_block2d_bil_var_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + GET_GOT rbx + push rsi + push rdi + sub rsp, 16 + ; end prolog + + pxor xmm6, xmm6 ; + pxor xmm7, xmm7 ; + mov rax, arg(5) ;HFilter ; + + mov rdx, arg(6) ;VFilter ; + mov rsi, arg(0) ;ref_ptr ; + + mov rdi, arg(2) ;src_ptr ; + movsxd rcx, dword ptr arg(4) ;Height ; + + pxor xmm0, xmm0 ; + movq xmm1, QWORD PTR [rsi] ; + + movq xmm3, QWORD PTR [rsi+1] ; + punpcklbw xmm1, xmm0 ; + + pmullw xmm1, [rax] ; + punpcklbw xmm3, xmm0 + ; + pmullw xmm3, [rax+16] ; + paddw xmm1, xmm3 ; + + paddw xmm1, [xmm_bi_rd GLOBAL] ; + psraw xmm1, xmm_filter_shift ; + + movdqa xmm5, xmm1 +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line ; +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; + add rsi, r8 +%endif +filter_block2d_bil_var_sse2_loop: + + movq xmm1, QWORD PTR [rsi] ; + movq xmm3, QWORD PTR [rsi+1] ; + + punpcklbw xmm1, xmm0 ; + pmullw xmm1, [rax] ; + + punpcklbw xmm3, xmm0 ; + pmullw xmm3, [rax+16] ; + + paddw xmm1, xmm3 ; + paddw xmm1, [xmm_bi_rd GLOBAL] ; + + psraw xmm1, xmm_filter_shift ; + movdqa xmm3, xmm5 ; + + movdqa xmm5, xmm1 ; + pmullw xmm3, [rdx] ; + + pmullw xmm1, [rdx+16] ; + paddw xmm1, xmm3 ; + + paddw xmm1, [xmm_bi_rd GLOBAL] ; + psraw xmm1, xmm_filter_shift ; + + movq xmm3, QWORD PTR [rdi] ; + punpcklbw xmm3, xmm0 ; + + psubw xmm1, xmm3 ; + paddw xmm6, xmm1 ; + + pmaddwd 
xmm1, xmm1 ;
+ paddd xmm7, xmm1 ;
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rdi, dword ptr arg(3) ;src_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
+ add rsi, r8
+ add rdi, r9
+%endif
+
+ sub rcx, 1 ;
+ jnz filter_block2d_bil_var_sse2_loop ;
+
+
+ movdq2q mm6, xmm6 ;
+ movdq2q mm7, xmm7 ;
+
+ psrldq xmm6, 8
+ psrldq xmm7, 8
+
+ movdq2q mm2, xmm6
+ movdq2q mm3, xmm7
+
+ paddw mm6, mm2
+ paddd mm7, mm3
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rsi, arg(7) ; sum
+ mov rdi, arg(8) ; sumsquared
+
+ movd [rsi], mm2 ; xsum
+ movd [rdi], mm4 ; xxsum
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_half_horiz_vert_variance16x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_horiz_vert_variance16x_h_sse2)
+sym(vp8_half_horiz_vert_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+ pxor xmm6, xmm6 ; error accumulator
+ pxor xmm7, xmm7 ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+
+ pxor xmm0, xmm0 ;
+
+ movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s7
+ movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s8
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3) horizontal line 1
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ; next source
+%else
+ add rsi, r8
+%endif
+
+vp8_half_horiz_vert_variance16x_h_1:
+
+ movq xmm1, QWORD PTR [rsi] ;
+ movq xmm2, QWORD PTR [rsi+1] ;
+ pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm2) horizontal line i+1
+
+ pavgb xmm5, xmm1 ; xmm5 = vertical average of the above
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+
+ movdqa xmm5, xmm1 ; save xmm1 for use on the next row
+
+%if ABI_IS_32BIT
+ add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
+ add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
+%else
+ add rsi, r8
+ add rdi, r9
+%endif
+
+ sub rcx, 1 ;
+ jnz vp8_half_horiz_vert_variance16x_h_1 ;
+
+ movdq2q mm6, xmm6 ;
+ movdq2q mm7, xmm7 ;
+
+ psrldq xmm6, 8
+ psrldq xmm7, 8
+
+ movdq2q mm2, xmm6
+ movdq2q mm3, xmm7
+
+ paddw mm6, mm2
+ paddd mm7, mm3
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rsi, arg(5) ; sum
+ mov rdi, arg(6) ; sumsquared
+
+ movd [rsi], mm2 ;
+ movd [rdi], mm4 ;
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
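The three half-pel kernels in this file (horizontal+vertical above, vertical and horizontal below) replace the generic bilinear filter with pavgb, which computes (a + b + 1) >> 1. As a reading aid only, here is a scalar C sketch of what the horizontal+vertical kernel computes on one 8-pixel-wide strip; this sketch is not part of the patch and the helper name is illustrative:

    static void half_horiz_vert_variance8w_c(const unsigned char *ref, int ref_stride,
                                             const unsigned char *src, int src_stride,
                                             unsigned int height, int *sum, unsigned int *sse)
    {
        unsigned int i, j;
        int s = 0;
        unsigned int ss = 0;

        for (i = 0; i < height; i++)
        {
            for (j = 0; j < 8; j++)
            {
                /* pavgb semantics: round-half-up byte average */
                int a = (ref[j] + ref[j + 1] + 1) >> 1;                           /* half-pel in x, row i   */
                int b = (ref[ref_stride + j] + ref[ref_stride + j + 1] + 1) >> 1; /* half-pel in x, row i+1 */
                int d = ((a + b + 1) >> 1) - src[j];                              /* half-pel in y, then diff */

                s += d;
                ss += d * d;
            }

            ref += ref_stride;
            src += src_stride;
        }

        *sum = s;
        *sse = ss;
    }

Each QWORD load covers eight pixels, so the 16-wide wrappers in variance_sse2.c call these kernels twice, at column offsets 0 and 8, and then form the variance as sse - sum*sum/256 (the >> 8 seen there).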
+;void vp8_half_vert_variance16x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_vert_variance16x_h_sse2)
+sym(vp8_half_vert_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+ pxor xmm6, xmm6 ; error accumulator
+ pxor xmm7, xmm7 ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+
+ pxor xmm0, xmm0 ;
+vp8_half_vert_variance16x_h_1:
+ movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s7 of row i
+ movq xmm3, QWORD PTR [rsi+rax] ; xmm3 = s0,s1,s2..s7 of row i+1
+
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3)
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+
+%if ABI_IS_32BIT
+ add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
+ add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
+%else
+ add rsi, r8
+ add rdi, r9
+%endif
+
+ sub rcx, 1 ;
+ jnz vp8_half_vert_variance16x_h_1 ;
+
+ movdq2q mm6, xmm6 ;
+ movdq2q mm7, xmm7 ;
+
+ psrldq xmm6, 8
+ psrldq xmm7, 8
+
+ movdq2q mm2, xmm6
+ movdq2q mm3, xmm7
+
+ paddw mm6, mm2
+ paddd mm7, mm3
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rsi, arg(5) ; sum
+ mov rdi, arg(6) ; sumsquared
+
+ movd [rsi], mm2 ;
+ movd [rdi], mm4 ;
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_half_horiz_variance16x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_horiz_variance16x_h_sse2)
+sym(vp8_half_horiz_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+ pxor xmm6, xmm6 ; error accumulator
+ pxor xmm7, xmm7 ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+
+ pxor xmm0, xmm0 ;
+vp8_half_horiz_variance16x16_1:
+ movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s7
+ movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s8
+
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3)
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+
+%if ABI_IS_32BIT
+ add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
+ add edi, dword ptr arg(3)
;src_pixels_per_line ; next destination +%else + add rsi, r8 + add rdi, r9 +%endif + sub rcx, 1 ; + jnz vp8_half_horiz_variance16x16_1 ; + + movdq2q mm6, xmm6 ; + movdq2q mm7, xmm7 ; + + psrldq xmm6, 8 + psrldq xmm7, 8 + + movdq2q mm2, xmm6 + movdq2q mm3, xmm7 + + paddw mm6, mm2 + paddd mm7, mm3 + + pxor mm3, mm3 ; + pxor mm2, mm2 ; + + punpcklwd mm2, mm6 ; + punpckhwd mm3, mm6 ; + + paddd mm2, mm3 ; + movq mm6, mm2 ; + + psrlq mm6, 32 ; + paddd mm2, mm6 ; + + psrad mm2, 16 ; + movq mm4, mm7 ; + + psrlq mm4, 32 ; + paddd mm4, mm7 ; + + mov rsi, arg(5) ; sum + mov rdi, arg(6) ; sumsquared + + movd [rsi], mm2 ; + movd [rdi], mm4 ; + + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64}; +align 16 +xmm_bi_rd: + times 8 dw 64 diff --git a/vp8/encoder/x86/variance_mmx.c b/vp8/encoder/x86/variance_mmx.c new file mode 100644 index 000000000..4a5b25b0d --- /dev/null +++ b/vp8/encoder/x86/variance_mmx.c @@ -0,0 +1,596 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "variance.h" +#include "pragmas.h" +#include "vpx_ports/mem.h" + +extern void filter_block1d_h6_mmx +( + unsigned char *src_ptr, + unsigned short *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + short *vp7_filter +); +extern void filter_block1d_v6_mmx +( + short *src_ptr, + unsigned char *output_ptr, + unsigned int pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + short *vp7_filter +); + +extern unsigned int vp8_get_mb_ss_mmx(short *src_ptr); +extern unsigned int vp8_get8x8var_mmx +( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *SSE, + int *Sum +); +extern unsigned int vp8_get4x4var_mmx +( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *SSE, + int *Sum +); +extern unsigned int vp8_get4x4sse_cs_mmx +( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride +); +extern void vp8_filter_block2d_bil4x4_var_mmx +( + unsigned char *ref_ptr, + int ref_pixels_per_line, + unsigned char *src_ptr, + int src_pixels_per_line, + const short *HFilter, + const short *VFilter, + int *sum, + unsigned int *sumsquared +); +extern void vp8_filter_block2d_bil_var_mmx +( + unsigned char *ref_ptr, + int ref_pixels_per_line, + unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + const short *HFilter, + const short *VFilter, + int *sum, + unsigned int *sumsquared +); +extern unsigned int vp8_get16x16pred_error_mmx +( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride +); + + +void vp8_test_get_mb_ss(void) +{ + short zz[] = + { + -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4, + -2, -2, -2, -2, 2, 2, 2, 2, -2, -2, -2, -2, 2, 2, 2, 2, + -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4, + -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4, + -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4, + -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4, + -4, -4, -4, -4, 4, 4, 4, 4, 
-4, -4, -4, -4, 4, 4, 4, 4, + -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4, + -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4, + -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4, + -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4, + -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4, + -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4, + -3, -3, -3, -3, 3, 3, 3, 3, -3, -3, -3, -3, 3, 3, 3, 3, + -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4, + -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4, + }; + int s = 0, x = vp8_get_mb_ss_mmx(zz); + { + int y; + + for (y = 0; y < 256; y++) + s += (zz[y] * zz[y]); + } + + x += 0; +} + + +unsigned int vp8_get16x16var_mmx( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned *SSE, + unsigned *SUM +) +{ + unsigned int sse0, sse1, sse2, sse3, var; + int sum0, sum1, sum2, sum3, avg; + + + vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; + vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); + vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ; + vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3); + + var = sse0 + sse1 + sse2 + sse3; + avg = sum0 + sum1 + sum2 + sum3; + + *SSE = var; + *SUM = avg; + return (var - ((avg * avg) >> 8)); + +} + + + + + +unsigned int vp8_variance4x4_mmx( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + unsigned int var; + int avg; + + vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; + *sse = var; + return (var - ((avg * avg) >> 4)); + +} + +unsigned int vp8_variance8x8_mmx( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + unsigned int var; + int avg; + + vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; + *sse = var; + + return (var - ((avg * avg) >> 6)); + +} + +unsigned int vp8_mse16x16_mmx( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + unsigned int sse0, sse1, sse2, sse3, var; + int sum0, sum1, sum2, sum3; + + + vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; + vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); + vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ; + vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3); + + var = sse0 + sse1 + sse2 + sse3; + *sse = var; + return var; +} + + +unsigned int vp8_variance16x16_mmx( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + int *sse) +{ + unsigned int sse0, sse1, sse2, sse3, var; + int sum0, sum1, sum2, sum3, avg; + + + vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; + vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); + vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ; + vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3); + + var = sse0 + 
sse1 + sse2 + sse3;
+ avg = sum0 + sum1 + sum2 + sum3;
+ *sse = var;
+ return (var - ((avg * avg) >> 8));
+}
+
+unsigned int vp8_variance16x8_mmx(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+ *sse = var;
+ return (var - ((avg * avg) >> 7));
+
+}
+
+
+unsigned int vp8_variance8x16_mmx(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+ *sse = var;
+
+ return (var - ((avg * avg) >> 7));
+
+}
+
+
+
+
+///////////////////////////////////////////////////////////////////////////
+// the mmx function that does the bilinear filtering and var calculation //
+// in one pass                                                            //
+///////////////////////////////////////////////////////////////////////////
+DECLARE_ALIGNED(16, const short, vp8_vp7_bilinear_filters_mmx[8][8]) =
+{
+ { 128, 128, 128, 128, 0, 0, 0, 0 },
+ { 112, 112, 112, 112, 16, 16, 16, 16 },
+ { 96, 96, 96, 96, 32, 32, 32, 32 },
+ { 80, 80, 80, 80, 48, 48, 48, 48 },
+ { 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 48, 48, 48, 48, 80, 80, 80, 80 },
+ { 32, 32, 32, 32, 96, 96, 96, 96 },
+ { 16, 16, 16, 16, 112, 112, 112, 112 }
+};
+
+unsigned int vp8_sub_pixel_variance4x4_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse)
+
+{
+ int xsum;
+ unsigned int xxsum;
+ vp8_filter_block2d_bil4x4_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum, &xxsum
+ );
+ *sse = xxsum;
+ return (xxsum - ((xsum * xsum) >> 4));
+}
+
+
+unsigned int vp8_sub_pixel_variance8x8_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+
+ int xsum;
+ unsigned int xxsum;
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum, &xxsum
+ );
+ *sse = xxsum;
+ return (xxsum - ((xsum * xsum) >> 6));
+}
+
+unsigned int vp8_sub_pixel_variance16x16_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum0, &xxsum0
+ );
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum1, &xxsum1
+ );
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+
+ *sse = xxsum0;
+ return (xxsum0 -
((xsum0 * xsum0) >> 8)); + + +} + +unsigned int vp8_sub_pixel_mse16x16_mmx( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + vp8_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); + return *sse; +} + +unsigned int vp8_sub_pixel_variance16x8_mmx +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + int xsum0, xsum1; + unsigned int xxsum0, xxsum1; + + + vp8_filter_block2d_bil_var_mmx( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + &xsum0, &xxsum0 + ); + + + vp8_filter_block2d_bil_var_mmx( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 8, + vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + &xsum1, &xxsum1 + ); + + xsum0 += xsum1; + xxsum0 += xxsum1; + + *sse = xxsum0; + return (xxsum0 - ((xsum0 * xsum0) >> 7)); +} + +unsigned int vp8_sub_pixel_variance8x16_mmx +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + int *sse +) +{ + int xsum; + unsigned int xxsum; + vp8_filter_block2d_bil_var_mmx( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + &xsum, &xxsum + ); + *sse = xxsum; + return (xxsum - ((xsum * xsum) >> 7)); +} + +unsigned int vp8_i_variance16x16_mmx( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + unsigned int sse0, sse1, sse2, sse3, var; + int sum0, sum1, sum2, sum3, avg; + + + vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; + vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); + vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2) ; + vp8_get8x8var_mmx(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3); + + var = sse0 + sse1 + sse2 + sse3; + avg = sum0 + sum1 + sum2 + sum3; + *sse = var; + return (var - ((avg * avg) >> 8)); + +} + +unsigned int vp8_i_variance8x16_mmx( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + unsigned int sse0, sse1, var; + int sum0, sum1, avg; + vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; + vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1) ; + + var = sse0 + sse1; + avg = sum0 + sum1; + + *sse = var; + return (var - ((avg * avg) >> 7)); + +} + +unsigned int vp8_i_sub_pixel_variance16x16_mmx +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + int xsum0, xsum1; + unsigned int xxsum0, xxsum1; + int f2soffset = (src_pixels_per_line >> 1); + int f2doffset = (dst_pixels_per_line >> 1); + + + vp8_filter_block2d_bil_var_mmx( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + &xsum0, 
&xxsum0 + ); + + + vp8_filter_block2d_bil_var_mmx( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 8, + vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + &xsum1, &xxsum1 + ); + + xsum0 += xsum1; + xxsum0 += xxsum1; + + vp8_filter_block2d_bil_var_mmx( + src_ptr + f2soffset, src_pixels_per_line, + dst_ptr + f2doffset, dst_pixels_per_line, 8, + vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + &xsum1, &xxsum1 + ); + + xsum0 += xsum1; + xxsum0 += xxsum1; + + vp8_filter_block2d_bil_var_mmx( + src_ptr + f2soffset + 8, src_pixels_per_line, + dst_ptr + f2doffset + 8, dst_pixels_per_line, 8, + vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + &xsum1, &xxsum1 + ); + + xsum0 += xsum1; + xxsum0 += xxsum1; + *sse = xxsum0; + return (xxsum0 - ((xsum0 * xsum0) >> 8)); +} + + +unsigned int vp8_i_sub_pixel_variance8x16_mmx +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + int xsum0, xsum1; + unsigned int xxsum0, xxsum1; + int f2soffset = (src_pixels_per_line >> 1); + int f2doffset = (dst_pixels_per_line >> 1); + + + vp8_filter_block2d_bil_var_mmx( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + &xsum0, &xxsum0 + ); + + + vp8_filter_block2d_bil_var_mmx( + src_ptr + f2soffset, src_pixels_per_line, + dst_ptr + f2doffset, dst_pixels_per_line, 8, + vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], + &xsum1, &xxsum1 + ); + + xsum0 += xsum1; + xxsum0 += xxsum1; + *sse = xxsum0; + return (xxsum0 - ((xsum0 * xsum0) >> 7)); +} diff --git a/vp8/encoder/x86/variance_sse2.c b/vp8/encoder/x86/variance_sse2.c new file mode 100644 index 000000000..ea80753bd --- /dev/null +++ b/vp8/encoder/x86/variance_sse2.c @@ -0,0 +1,514 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include "variance.h" +#include "pragmas.h" +#include "vpx_ports/mem.h" + +extern void filter_block1d_h6_mmx(unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); +extern void filter_block1d_v6_mmx(short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); +extern void filter_block1d8_h6_sse2(unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); +extern void filter_block1d8_v6_sse2(short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); + +extern void vp8_filter_block2d_bil4x4_var_mmx +( + unsigned char *ref_ptr, + int ref_pixels_per_line, + unsigned char *src_ptr, + int src_pixels_per_line, + const short *HFilter, + const short *VFilter, + int *sum, + unsigned int *sumsquared +); + +extern unsigned int vp8_get4x4var_mmx +( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *SSE, + int *Sum +); + +unsigned int vp8_get_mb_ss_sse2 +( + short *src_ptr +); +unsigned int vp8_get16x16var_sse2 +( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *SSE, + int *Sum +); +unsigned int vp8_get16x16pred_error_sse2 +( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride +); +unsigned int vp8_get8x8var_sse2 +( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *SSE, + int *Sum +); +void vp8_filter_block2d_bil_var_sse2 +( + unsigned char *ref_ptr, + int ref_pixels_per_line, + unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + const short *HFilter, + const short *VFilter, + int *sum, + unsigned int *sumsquared +); +void vp8_half_horiz_vert_variance16x_h_sse2 +( + unsigned char *ref_ptr, + int ref_pixels_per_line, + unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); +void vp8_half_horiz_variance16x_h_sse2 +( + unsigned char *ref_ptr, + int ref_pixels_per_line, + unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); +void vp8_half_vert_variance16x_h_sse2 +( + unsigned char *ref_ptr, + int ref_pixels_per_line, + unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); + +DECLARE_ALIGNED(16, extern short, vp8_vp7_bilinear_filters_mmx[8][8]); + +unsigned int vp8_variance4x4_wmt( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride) +{ + unsigned int var; + int avg; + + vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; + return (var - ((avg * avg) >> 4)); + +} + + + +unsigned int vp8_variance8x8_wmt +( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride) +{ + unsigned int var; + int avg; + + vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; + + return (var - ((avg * avg) >> 6)); + +} + + +unsigned int vp8_variance16x16_wmt +( + unsigned char *src_ptr, + int 
source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0;
+ int sum0;
+
+
+ vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ *sse = sse0;
+ return (sse0 - ((sum0 * sum0) >> 8));
+}
+unsigned int vp8_mse16x16_wmt(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+
+ unsigned int sse0;
+ int sum0;
+ vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ *sse = sse0;
+ return sse0;
+
+}
+
+
+unsigned int vp8_variance16x8_wmt
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+
+ vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+ *sse = var;
+ return (var - ((avg * avg) >> 7));
+
+}
+
+unsigned int vp8_variance8x16_wmt
+(
+ unsigned char *src_ptr,
+ int source_stride,
+ unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+
+ vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+ *sse = var;
+ return (var - ((avg * avg) >> 7));
+
+}
+
+///////////////////////////////////////////////////////////////////////////
+// the mmx function that does the bilinear filtering and var calculation //
+// in one pass                                                            //
+///////////////////////////////////////////////////////////////////////////
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_xmm[8][16]) =
+{
+ { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 },
+ { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
+ { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 },
+ { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 },
+ { 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 }
+};
+unsigned int vp8_sub_pixel_variance4x4_wmt
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum;
+ unsigned int xxsum;
+ vp8_filter_block2d_bil4x4_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line,
+ vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+ &xsum, &xxsum
+ );
+ *sse = xxsum;
+ return (xxsum - ((xsum * xsum) >> 4));
+}
+
+
+unsigned int vp8_sub_pixel_variance8x8_wmt
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+
+ int xsum;
+ unsigned int xxsum;
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+ &xsum, &xxsum
+ );
+
+ *sse = xxsum;
+ return (xxsum - ((xsum * xsum) >> 6));
+}
+
+unsigned int vp8_sub_pixel_variance16x16_wmt
+(
unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + int xsum0, xsum1; + unsigned int xxsum0, xxsum1; + + + // note we could avoid these if statements if the calling function + // just called the appropriate functions inside. + if (xoffset == 4 && yoffset == 0) + { + vp8_half_horiz_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + + vp8_half_horiz_variance16x_h_sse2( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 16, + &xsum1, &xxsum1); + } + else if (xoffset == 0 && yoffset == 4) + { + vp8_half_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + + vp8_half_vert_variance16x_h_sse2( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 16, + &xsum1, &xxsum1); + } + else if (xoffset == 4 && yoffset == 4) + { + vp8_half_horiz_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + + vp8_half_horiz_vert_variance16x_h_sse2( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 16, + &xsum1, &xxsum1); + } + else + { + vp8_filter_block2d_bil_var_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset], + &xsum0, &xxsum0 + ); + + + vp8_filter_block2d_bil_var_sse2( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 16, + vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset], + &xsum1, &xxsum1 + ); + } + + xsum0 += xsum1; + xxsum0 += xxsum1; + *sse = xxsum0; + return (xxsum0 - ((xsum0 * xsum0) >> 8)); +} + +unsigned int vp8_sub_pixel_mse16x16_wmt( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); + return *sse; +} + +unsigned int vp8_sub_pixel_variance16x8_wmt +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse + +) +{ + int xsum0, xsum1; + unsigned int xxsum0, xxsum1; + + + vp8_filter_block2d_bil_var_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset], + &xsum0, &xxsum0 + ); + + + vp8_filter_block2d_bil_var_sse2( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 8, + vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset], + &xsum1, &xxsum1 + ); + + xsum0 += xsum1; + xxsum0 += xxsum1; + + *sse = xxsum0; + return (xxsum0 - ((xsum0 * xsum0) >> 7)); +} + +unsigned int vp8_sub_pixel_variance8x16_wmt +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + int xsum; + unsigned int xxsum; + vp8_filter_block2d_bil_var_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset], + &xsum, &xxsum + ); + + *sse = xxsum; + return (xxsum - ((xsum * xsum) >> 7)); +} + +unsigned int vp8_i_variance16x16_wmt( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + 
unsigned int *sse) +{ + unsigned int sse0, sse1, sse2, sse3, var; + int sum0, sum1, sum2, sum3, avg; + + + vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; + vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); + vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2) ; + vp8_get8x8var_sse2(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3); + + var = sse0 + sse1 + sse2 + sse3; + avg = sum0 + sum1 + sum2 + sum3; + + *sse = var; + return (var - ((avg * avg) >> 8)); + +} + +unsigned int vp8_i_variance8x16_wmt( + unsigned char *src_ptr, + int source_stride, + unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + unsigned int sse0, sse1, var; + int sum0, sum1, avg; + vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; + vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1) ; + + var = sse0 + sse1; + avg = sum0 + sum1; + + *sse = var; + return (var - ((avg * avg) >> 7)); + +} + + +unsigned int vp8_i_sub_pixel_variance16x16_wmt +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + return vp8_sub_pixel_variance16x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse); +} + + +unsigned int vp8_i_sub_pixel_variance8x16_wmt +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + + return vp8_sub_pixel_variance8x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse); +} diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h new file mode 100644 index 000000000..35fc90c48 --- /dev/null +++ b/vp8/encoder/x86/variance_x86.h @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef VARIANCE_X86_H +#define VARIANCE_X86_H + + +/* Note: + * + * This platform is commonly built for runtime CPU detection. 
If you modify + * any of the function mappings present in this file, be sure to also update + * them in the function pointer initialization code + */ +#if HAVE_MMX +extern prototype_sad(vp8_sad4x4_mmx); +extern prototype_sad(vp8_sad8x8_mmx); +extern prototype_sad(vp8_sad8x16_mmx); +extern prototype_sad(vp8_sad16x8_mmx); +extern prototype_sad(vp8_sad16x16_mmx); +extern prototype_variance(vp8_variance4x4_mmx); +extern prototype_variance(vp8_variance8x8_mmx); +extern prototype_variance(vp8_variance8x16_mmx); +extern prototype_variance(vp8_variance16x8_mmx); +extern prototype_variance(vp8_variance16x16_mmx); +extern prototype_subpixvariance(vp8_sub_pixel_variance4x4_mmx); +extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_mmx); +extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_mmx); +extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_mmx); +extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_mmx); +extern prototype_subpixvariance(vp8_sub_pixel_mse16x16_mmx); +extern prototype_getmbss(vp8_get_mb_ss_mmx); +extern prototype_variance(vp8_mse16x16_mmx); +extern prototype_sad(vp8_get16x16pred_error_mmx); +extern prototype_variance2(vp8_get8x8var_mmx); +extern prototype_variance2(vp8_get16x16var_mmx); +extern prototype_sad(vp8_get4x4sse_cs_mmx); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_variance_sad4x4 +#define vp8_variance_sad4x4 vp8_sad4x4_mmx + +#undef vp8_variance_sad8x8 +#define vp8_variance_sad8x8 vp8_sad8x8_mmx + +#undef vp8_variance_sad8x16 +#define vp8_variance_sad8x16 vp8_sad8x16_mmx + +#undef vp8_variance_sad16x8 +#define vp8_variance_sad16x8 vp8_sad16x8_mmx + +#undef vp8_variance_sad16x16 +#define vp8_variance_sad16x16 vp8_sad16x16_mmx + +#undef vp8_variance_var4x4 +#define vp8_variance_var4x4 vp8_variance4x4_mmx + +#undef vp8_variance_var8x8 +#define vp8_variance_var8x8 vp8_variance8x8_mmx + +#undef vp8_variance_var8x16 +#define vp8_variance_var8x16 vp8_variance8x16_mmx + +#undef vp8_variance_var16x8 +#define vp8_variance_var16x8 vp8_variance16x8_mmx + +#undef vp8_variance_var16x16 +#define vp8_variance_var16x16 vp8_variance16x16_mmx + +#undef vp8_variance_subpixvar4x4 +#define vp8_variance_subpixvar4x4 vp8_sub_pixel_variance4x4_mmx + +#undef vp8_variance_subpixvar8x8 +#define vp8_variance_subpixvar8x8 vp8_sub_pixel_variance8x8_mmx + +#undef vp8_variance_subpixvar8x16 +#define vp8_variance_subpixvar8x16 vp8_sub_pixel_variance8x16_mmx + +#undef vp8_variance_subpixvar16x8 +#define vp8_variance_subpixvar16x8 vp8_sub_pixel_variance16x8_mmx + +#undef vp8_variance_subpixvar16x16 +#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_mmx + +#undef vp8_variance_subpixmse16x16 +#define vp8_variance_subpixmse16x16 vp8_sub_pixel_mse16x16_mmx + +#undef vp8_variance_getmbss +#define vp8_variance_getmbss vp8_get_mb_ss_mmx + +#undef vp8_variance_mse16x16 +#define vp8_variance_mse16x16 vp8_mse16x16_mmx + +#undef vp8_variance_get16x16prederror +#define vp8_variance_get16x16prederror vp8_get16x16pred_error_mmx + +#undef vp8_variance_get8x8var +#define vp8_variance_get8x8var vp8_get8x8var_mmx + +#undef vp8_variance_get16x16var +#define vp8_variance_get16x16var vp8_get16x16var_mmx + +#undef vp8_variance_get4x4sse_cs +#define vp8_variance_get4x4sse_cs vp8_get4x4sse_cs_mmx + +#endif +#endif + + +#if HAVE_SSE2 +extern prototype_sad(vp8_sad4x4_wmt); +extern prototype_sad(vp8_sad8x8_wmt); +extern prototype_sad(vp8_sad8x16_wmt); +extern prototype_sad(vp8_sad16x8_wmt); +extern prototype_sad(vp8_sad16x16_wmt); +extern prototype_variance(vp8_variance4x4_wmt); +extern 
prototype_variance(vp8_variance8x8_wmt); +extern prototype_variance(vp8_variance8x16_wmt); +extern prototype_variance(vp8_variance16x8_wmt); +extern prototype_variance(vp8_variance16x16_wmt); +extern prototype_subpixvariance(vp8_sub_pixel_variance4x4_wmt); +extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_wmt); +extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_wmt); +extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_wmt); +extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_wmt); +extern prototype_subpixvariance(vp8_sub_pixel_mse16x16_wmt); +extern prototype_getmbss(vp8_get_mb_ss_sse2); +extern prototype_variance(vp8_mse16x16_wmt); +extern prototype_sad(vp8_get16x16pred_error_sse2); +extern prototype_variance2(vp8_get8x8var_sse2); +extern prototype_variance2(vp8_get16x16var_sse2); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_variance_sad4x4 +#define vp8_variance_sad4x4 vp8_sad4x4_wmt + +#undef vp8_variance_sad8x8 +#define vp8_variance_sad8x8 vp8_sad8x8_wmt + +#undef vp8_variance_sad8x16 +#define vp8_variance_sad8x16 vp8_sad8x16_wmt + +#undef vp8_variance_sad16x8 +#define vp8_variance_sad16x8 vp8_sad16x8_wmt + +#undef vp8_variance_sad16x16 +#define vp8_variance_sad16x16 vp8_sad16x16_wmt + +#undef vp8_variance_var4x4 +#define vp8_variance_var4x4 vp8_variance4x4_wmt + +#undef vp8_variance_var8x8 +#define vp8_variance_var8x8 vp8_variance8x8_wmt + +#undef vp8_variance_var8x16 +#define vp8_variance_var8x16 vp8_variance8x16_wmt + +#undef vp8_variance_var16x8 +#define vp8_variance_var16x8 vp8_variance16x8_wmt + +#undef vp8_variance_var16x16 +#define vp8_variance_var16x16 vp8_variance16x16_wmt + +#undef vp8_variance_subpixvar4x4 +#define vp8_variance_subpixvar4x4 vp8_sub_pixel_variance4x4_wmt + +#undef vp8_variance_subpixvar8x8 +#define vp8_variance_subpixvar8x8 vp8_sub_pixel_variance8x8_wmt + +#undef vp8_variance_subpixvar8x16 +#define vp8_variance_subpixvar8x16 vp8_sub_pixel_variance8x16_wmt + +#undef vp8_variance_subpixvar16x8 +#define vp8_variance_subpixvar16x8 vp8_sub_pixel_variance16x8_wmt + +#undef vp8_variance_subpixvar16x16 +#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_wmt + +#undef vp8_variance_subpixmse16x16 +#define vp8_variance_subpixmse16x16 vp8_sub_pixel_mse16x16_wmt + +#undef vp8_variance_getmbss +#define vp8_variance_getmbss vp8_get_mb_ss_sse2 + +#undef vp8_variance_mse16x16 +#define vp8_variance_mse16x16 vp8_mse16x16_wmt + +#undef vp8_variance_get16x16prederror +#define vp8_variance_get16x16prederror vp8_get16x16pred_error_sse2 + +#undef vp8_variance_get8x8var +#define vp8_variance_get8x8var vp8_get8x8var_sse2 + +#undef vp8_variance_get16x16var +#define vp8_variance_get16x16var vp8_get16x16var_sse2 + +#endif +#endif + + +#if HAVE_SSE3 +extern prototype_sad(vp8_sad16x16_sse3); +extern prototype_sad(vp8_sad16x8_sse3); +extern prototype_sad_multi_same_address(vp8_sad16x16x3_sse3); +extern prototype_sad_multi_same_address(vp8_sad16x8x3_sse3); +extern prototype_sad_multi_same_address(vp8_sad8x16x3_sse3); +extern prototype_sad_multi_same_address(vp8_sad8x8x3_sse3); +extern prototype_sad_multi_same_address(vp8_sad4x4x3_sse3); + +extern prototype_sad_multi_dif_address(vp8_sad16x16x4d_sse3); +extern prototype_sad_multi_dif_address(vp8_sad16x8x4d_sse3); +extern prototype_sad_multi_dif_address(vp8_sad8x16x4d_sse3); +extern prototype_sad_multi_dif_address(vp8_sad8x8x4d_sse3); +extern prototype_sad_multi_dif_address(vp8_sad4x4x4d_sse3); + +#if !CONFIG_RUNTIME_CPU_DETECT + +#undef vp8_variance_sad16x16 +#define vp8_variance_sad16x16 
vp8_sad16x16_sse3 + +#undef vp8_variance_sad16x16x3 +#define vp8_variance_sad16x16x3 vp8_sad16x16x3_sse3 + +#undef vp8_variance_sad16x8x3 +#define vp8_variance_sad16x8x3 vp8_sad16x8x3_sse3 + +#undef vp8_variance_sad8x16x3 +#define vp8_variance_sad8x16x3 vp8_sad8x16x3_sse3 + +#undef vp8_variance_sad8x8x3 +#define vp8_variance_sad8x8x3 vp8_sad8x8x3_sse3 + +#undef vp8_variance_sad4x4x3 +#define vp8_variance_sad4x4x3 vp8_sad4x4x3_sse3 + +#undef vp8_variance_sad16x16x4d +#define vp8_variance_sad16x16x4d vp8_sad16x16x4d_sse3 + +#undef vp8_variance_sad16x8x4d +#define vp8_variance_sad16x8x4d vp8_sad16x8x4d_sse3 + +#undef vp8_variance_sad8x16x4d +#define vp8_variance_sad8x16x4d vp8_sad8x16x4d_sse3 + +#undef vp8_variance_sad8x8x4d +#define vp8_variance_sad8x8x4d vp8_sad8x8x4d_sse3 + +#undef vp8_variance_sad4x4x4d +#define vp8_variance_sad4x4x4d vp8_sad4x4x4d_sse3 + +#endif +#endif + + +#if HAVE_SSSE3 +extern prototype_sad_multi_same_address(vp8_sad16x16x3_ssse3); +extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_variance_sad16x16x3 +#define vp8_variance_sad16x16x3 vp8_sad16x16x3_ssse3 + +#undef vp8_variance_sad16x8x3 +#define vp8_variance_sad16x8x3 vp8_sad16x8x3_ssse3 + +#endif +#endif + +#endif diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c new file mode 100644 index 000000000..f1391ba8c --- /dev/null +++ b/vp8/encoder/x86/x86_csystemdependent.c @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree.
+ */ + + +#include "vpx_ports/config.h" +#include "vpx_ports/x86.h" +#include "variance.h" +#include "onyx_int.h" + + +#if HAVE_MMX +/* an 8x4 fdct is two 4x4 transforms side by side: the second block starts + * four samples in, and its 16 coefficients follow the first block's in the + * output */ +void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch) +{ + vp8_short_fdct4x4_mmx(input, output, pitch); + vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch); +} + +void vp8_fast_fdct8x4_mmx(short *input, short *output, int pitch) +{ + vp8_fast_fdct4x4_mmx(input, output, pitch); + vp8_fast_fdct4x4_mmx(input + 4, output + 16, pitch); +} + +int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr, + short *qcoeff_ptr, short *dequant_ptr, + short *scan_mask, short *round_ptr, + short *quant_ptr, short *dqcoeff_ptr); +void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d) +{ + short *scan_mask = vp8_default_zig_zag_mask; // d->scan_order_mask_ptr + short *coeff_ptr = &b->coeff[0]; + short *zbin_ptr = &b->zbin[0][0]; + short *round_ptr = &b->round[0][0]; + short *quant_ptr = &b->quant[0][0]; + short *qcoeff_ptr = d->qcoeff; + short *dqcoeff_ptr = d->dqcoeff; + short *dequant_ptr = &d->dequant[0][0]; + + d->eob = vp8_fast_quantize_b_impl_mmx( + coeff_ptr, + zbin_ptr, + qcoeff_ptr, + dequant_ptr, + scan_mask, + round_ptr, + quant_ptr, + dqcoeff_ptr + ); +} + +int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc); +int vp8_mbblock_error_mmx(MACROBLOCK *mb, int dc) +{ + short *coeff_ptr = mb->block[0].coeff; + short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff; + return vp8_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr, dc); +} + +int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr); +int vp8_mbuverror_mmx(MACROBLOCK *mb) +{ + /* chroma coefficients start after the 16 luma blocks (16 * 16 = 256) */ + short *s_ptr = &mb->coeff[256]; + short *d_ptr = &mb->e_mbd.dqcoeff[256]; + return vp8_mbuverror_mmx_impl(s_ptr, d_ptr); +} + +void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride, + short *diff, unsigned char *predictor, + int pitch); +void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) +{ + unsigned char *z = *(be->base_src) + be->src; + int src_stride = be->src_stride; + short *diff = &be->src_diff[0]; + unsigned char *predictor = &bd->predictor[0]; + vp8_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch); +} + +#endif + +#if HAVE_SSE2 +void vp8_short_fdct8x4_wmt(short *input, short *output, int pitch) +{ + vp8_short_fdct4x4_wmt(input, output, pitch); + vp8_short_fdct4x4_wmt(input + 4, output + 16, pitch); +} + +int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr, + short *qcoeff_ptr, short *dequant_ptr, + short *scan_mask, short *round_ptr, + short *quant_ptr, short *dqcoeff_ptr); +void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d) +{ + short *scan_mask = vp8_default_zig_zag_mask; // d->scan_order_mask_ptr + short *coeff_ptr = &b->coeff[0]; + short *zbin_ptr = &b->zbin[0][0]; + short *round_ptr = &b->round[0][0]; + short *quant_ptr = &b->quant[0][0]; + short *qcoeff_ptr = d->qcoeff; + short *dqcoeff_ptr = d->dqcoeff; + short *dequant_ptr = &d->dequant[0][0]; + + d->eob = vp8_fast_quantize_b_impl_sse( + coeff_ptr, + zbin_ptr, + qcoeff_ptr, + dequant_ptr, + scan_mask, + round_ptr, + quant_ptr, + dqcoeff_ptr + ); +} + +int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc); +int vp8_mbblock_error_xmm(MACROBLOCK *mb, int dc) +{ + short *coeff_ptr = mb->block[0].coeff; + short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff; + return vp8_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr, dc); +} + +int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr); +int vp8_mbuverror_xmm(MACROBLOCK *mb) +{ + short *s_ptr = &mb->coeff[256]; + short *d_ptr =
&mb->e_mbd.dqcoeff[256]; + return vp8_mbuverror_xmm_impl(s_ptr, d_ptr); +} + +#endif + +void vp8_arch_x86_encoder_init(VP8_COMP *cpi) +{ +#if CONFIG_RUNTIME_CPU_DETECT + int flags = x86_simd_caps(); + int mmx_enabled = flags & HAS_MMX; + int xmm_enabled = flags & HAS_SSE; + int wmt_enabled = flags & HAS_SSE2; + int SSE3Enabled = flags & HAS_SSE3; + int SSSE3Enabled = flags & HAS_SSSE3; + + /* Note: + * + * This platform can be built without runtime CPU detection as well. If + * you modify any of the function mappings present in this file, be sure + * to also update them in the static mappings (<arch>/filename_<arch>.h); + * both dispatch modes are sketched after this diff. + */ + + /* Override default functions with fastest ones for this CPU. */ +#if HAVE_MMX + + if (mmx_enabled) + { + cpi->rtcd.variance.sad16x16 = vp8_sad16x16_mmx; + cpi->rtcd.variance.sad16x8 = vp8_sad16x8_mmx; + cpi->rtcd.variance.sad8x16 = vp8_sad8x16_mmx; + cpi->rtcd.variance.sad8x8 = vp8_sad8x8_mmx; + cpi->rtcd.variance.sad4x4 = vp8_sad4x4_mmx; + + cpi->rtcd.variance.var4x4 = vp8_variance4x4_mmx; + cpi->rtcd.variance.var8x8 = vp8_variance8x8_mmx; + cpi->rtcd.variance.var8x16 = vp8_variance8x16_mmx; + cpi->rtcd.variance.var16x8 = vp8_variance16x8_mmx; + cpi->rtcd.variance.var16x16 = vp8_variance16x16_mmx; + + cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_mmx; + cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_mmx; + cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_mmx; + cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_mmx; + cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_mmx; + cpi->rtcd.variance.subpixmse16x16 = vp8_sub_pixel_mse16x16_mmx; + + cpi->rtcd.variance.mse16x16 = vp8_mse16x16_mmx; + cpi->rtcd.variance.getmbss = vp8_get_mb_ss_mmx; + + cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_mmx; + cpi->rtcd.variance.get8x8var = vp8_get8x8var_mmx; + cpi->rtcd.variance.get16x16var = vp8_get16x16var_mmx; + cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_mmx; + + cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_mmx; + cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_mmx; + cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_mmx; + cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_mmx; + cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c; + + cpi->rtcd.encodemb.berr = vp8_block_error_mmx; + cpi->rtcd.encodemb.mberr = vp8_mbblock_error_mmx; + cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_mmx; + cpi->rtcd.encodemb.subb = vp8_subtract_b_mmx; + cpi->rtcd.encodemb.submby = vp8_subtract_mby_mmx; + cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_mmx; + + cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_mmx; + } + +#endif +#if HAVE_SSE2 + + if (wmt_enabled) + { + cpi->rtcd.variance.sad16x16 = vp8_sad16x16_wmt; + cpi->rtcd.variance.sad16x8 = vp8_sad16x8_wmt; + cpi->rtcd.variance.sad8x16 = vp8_sad8x16_wmt; + cpi->rtcd.variance.sad8x8 = vp8_sad8x8_wmt; + cpi->rtcd.variance.sad4x4 = vp8_sad4x4_wmt; + + cpi->rtcd.variance.var4x4 = vp8_variance4x4_wmt; + cpi->rtcd.variance.var8x8 = vp8_variance8x8_wmt; + cpi->rtcd.variance.var8x16 = vp8_variance8x16_wmt; + cpi->rtcd.variance.var16x8 = vp8_variance16x8_wmt; + cpi->rtcd.variance.var16x16 = vp8_variance16x16_wmt; + + cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_wmt; + cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_wmt; + cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_wmt; + cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_wmt; + cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_wmt; + cpi->rtcd.variance.subpixmse16x16 =
vp8_sub_pixel_mse16x16_wmt; + + cpi->rtcd.variance.mse16x16 = vp8_mse16x16_wmt; + cpi->rtcd.variance.getmbss = vp8_get_mb_ss_sse2; + + cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_sse2; + cpi->rtcd.variance.get8x8var = vp8_get8x8var_sse2; + cpi->rtcd.variance.get16x16var = vp8_get16x16var_sse2; + /* cpi->rtcd.variance.get4x4sse_cs not implemented for wmt */ + +#if 0 + /* short SSE2 DCT currently disabled, does not match the MMX version */ + cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_wmt; + cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_wmt; +#endif + /* cpi->rtcd.fdct.fast4x4 not implemented for wmt */ + cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_wmt; + cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_sse2; + + cpi->rtcd.encodemb.berr = vp8_block_error_xmm; + cpi->rtcd.encodemb.mberr = vp8_mbblock_error_xmm; + cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_xmm; + /* cpi->rtcd.encodemb.sub* not implemented for wmt */ + + cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse; + } + +#endif +#if HAVE_SSE3 + + if (SSE3Enabled) + { + cpi->rtcd.variance.sad16x16 = vp8_sad16x16_sse3; + cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_sse3; + cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_sse3; + cpi->rtcd.variance.sad8x16x3 = vp8_sad8x16x3_sse3; + cpi->rtcd.variance.sad8x8x3 = vp8_sad8x8x3_sse3; + cpi->rtcd.variance.sad4x4x3 = vp8_sad4x4x3_sse3; + cpi->rtcd.search.full_search = vp8_full_search_sadx3; + + cpi->rtcd.variance.sad16x16x4d = vp8_sad16x16x4d_sse3; + cpi->rtcd.variance.sad16x8x4d = vp8_sad16x8x4d_sse3; + cpi->rtcd.variance.sad8x16x4d = vp8_sad8x16x4d_sse3; + cpi->rtcd.variance.sad8x8x4d = vp8_sad8x8x4d_sse3; + cpi->rtcd.variance.sad4x4x4d = vp8_sad4x4x4d_sse3; + cpi->rtcd.search.diamond_search = vp8_diamond_search_sadx4; + } + +#endif +#if HAVE_SSSE3 + + if (SSSE3Enabled) + { + cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_ssse3; + cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_ssse3; + } + +#endif +#endif +}
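
Note on the half-pel special cases in vp8_sub_pixel_variance16x16_wmt above: offset 4 is the half-pel entry of VP8's eighth-pel bilinear filter table, with equal taps (64, 64) at 7-bit precision, which is why those paths can skip the general bilinear kernel and use the cheaper vp8_half_* variants. A scalar sketch of what the half-pel prediction reduces to (the helper name is illustrative, not part of the patch):

    /* with taps (64, 64) and 7-bit precision, the bilinear filter
     * (a*64 + b*64 + 64) >> 7 collapses to a rounded two-pixel average */
    static unsigned char half_pel_avg(unsigned char a, unsigned char b)
    {
        return (unsigned char)((a + b + 1) >> 1);
    }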
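
Note on the return expressions in the variance functions above: each computes the integer identity variance = SSE - Sum^2 / N, where N is the block's pixel count, so 16x16 blocks shift by 8 (N = 256) while 16x8 and 8x16 blocks shift by 7 (N = 128). A minimal scalar reference with the division written out and the sum widened to avoid overflow (the helper is illustrative only, not part of the patch):

    /* scalar variance over a w x h block: SSE minus squared-sum / N.
     * mirrors the arithmetic of the SIMD paths above, which shift
     * instead of dividing because N is a power of two. */
    static unsigned int variance_ref(const unsigned char *src, int src_stride,
                                     const unsigned char *ref, int ref_stride,
                                     int w, int h, unsigned int *sse)
    {
        long long sum = 0;
        unsigned int sq = 0;
        int r, c;

        for (r = 0; r < h; r++)
        {
            for (c = 0; c < w; c++)
            {
                int d = src[c] - ref[c];  /* per-pixel difference */
                sum += d;
                sq += d * d;
            }

            src += src_stride;
            ref += ref_stride;
        }

        *sse = sq;
        return sq - (unsigned int)((sum * sum) / (w * h));
    }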
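
Note on the two dispatch modes referenced by the comments in variance_x86.h and vp8_arch_x86_encoder_init above, which must be kept in sync: without CONFIG_RUNTIME_CPU_DETECT the generic names are remapped at compile time with #undef/#define, and with it the cpi->rtcd function pointers are overridden at init, in increasing order of capability, from the x86_simd_caps() flags. A condensed sketch of both, assuming the same headers as the file above (the init function name is illustrative; the other symbols are the ones used in this patch):

    /* static build: the generic symbol simply becomes the specialized one */
    #if !CONFIG_RUNTIME_CPU_DETECT
    #undef  vp8_variance_sad16x16
    #define vp8_variance_sad16x16 vp8_sad16x16_wmt
    #endif

    /* runtime build: later, more capable overrides win */
    static void example_rtcd_init(VP8_COMP *cpi)   /* illustrative only */
    {
    #if CONFIG_RUNTIME_CPU_DETECT
        int flags = x86_simd_caps();

        if (flags & HAS_MMX)
            cpi->rtcd.variance.sad16x16 = vp8_sad16x16_mmx;

        if (flags & HAS_SSE2)
            cpi->rtcd.variance.sad16x16 = vp8_sad16x16_wmt;

        if (flags & HAS_SSE3)
            cpi->rtcd.variance.sad16x16 = vp8_sad16x16_sse3;
    #endif
    }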