diff options
Diffstat (limited to 'vp8/encoder/x86')
26 files changed, 2313 insertions, 1584 deletions
diff --git a/vp8/encoder/x86/csystemdependent.c b/vp8/encoder/x86/csystemdependent.c deleted file mode 100644 index 186ee6856..000000000 --- a/vp8/encoder/x86/csystemdependent.c +++ /dev/null @@ -1,289 +0,0 @@ -/* - * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license and patent - * grant that can be found in the LICENSE file in the root of the source - * tree. All contributing project authors may be found in the AUTHORS - * file in the root of the source tree. - */ - - -#include "variance.h" -#include "onyx_int.h" - -SADFunction *vp8_sad16x16; -SADFunction *vp8_sad16x8; -SADFunction *vp8_sad8x16; -SADFunction *vp8_sad8x8; -SADFunction *vp8_sad4x4; - -variance_function *vp8_variance4x4; -variance_function *vp8_variance8x8; -variance_function *vp8_variance8x16; -variance_function *vp8_variance16x8; -variance_function *vp8_variance16x16; - - -variance_function *vp8_mse16x16; - -sub_pixel_variance_function *vp8_sub_pixel_variance4x4; -sub_pixel_variance_function *vp8_sub_pixel_variance8x8; -sub_pixel_variance_function *vp8_sub_pixel_variance8x16; -sub_pixel_variance_function *vp8_sub_pixel_variance16x8; -sub_pixel_variance_function *vp8_sub_pixel_variance16x16; - -int (*vp8_block_error)(short *, short *); -int (*vp8_mbblock_error)(MACROBLOCK *mb, int dc); -void (*vp8_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, int stride); - -extern void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride); -extern void vp8_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride); - -extern int vp8_block_error_c(short *, short *); -extern int vp8_mbblock_error_c(MACROBLOCK *x, int dc); - -extern int vp8_block_error_mmx(short *, short *); -extern int vp8_mbblock_error_mmx(MACROBLOCK *x, int dc); - -extern int vp8_block_error_xmm(short *, short *); -extern int vp8_mbblock_error_xmm(MACROBLOCK *x, int dc); - - - -int (*vp8_mbuverror)(MACROBLOCK *mb); -unsigned int (*vp8_get_mb_ss)(short *); -void (*vp8_short_fdct4x4)(short *input, short *output, int pitch); -void (*vp8_short_fdct8x4)(short *input, short *output, int pitch); -void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch); -void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch); - -void (*vp8_subtract_b)(BLOCK *be, BLOCKD *bd, int pitch); -void (*vp8_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride); -void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d); -unsigned int (*vp8_get16x16pred_error)(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride); -unsigned int (*vp8_get8x8var)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); -unsigned int (*vp8_get16x16var)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); -unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride); - -// c imports -extern int vp8_mbuverror_c(MACROBLOCK *mb); -extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); -extern void vp8_short_fdct4x4_c(short *input, short *output, int pitch); -extern void vp8_short_fdct8x4_c(short *input, short *output, int pitch); -extern void vp8_fast_fdct4x4_c(short *input, short *output, int pitch); -extern void vp8_fast_fdct8x4_c(short *input, short *output, int pitch); - - -extern void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch); -extern void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride); -extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d); - -extern SADFunction vp8_sad16x16_c; -extern SADFunction vp8_sad16x8_c; -extern SADFunction vp8_sad8x16_c; -extern SADFunction vp8_sad8x8_c; -extern SADFunction vp8_sad4x4_c; - -extern SADFunction vp8_sad16x16_wmt; -extern SADFunction vp8_sad16x8_wmt; -extern SADFunction vp8_sad8x16_wmt; -extern SADFunction vp8_sad8x8_wmt; -extern SADFunction vp8_sad4x4_wmt; - -extern SADFunction vp8_sad16x16_mmx; -extern SADFunction vp8_sad16x8_mmx; -extern SADFunction vp8_sad8x16_mmx; -extern SADFunction vp8_sad8x8_mmx; -extern SADFunction vp8_sad4x4_mmx; - -extern variance_function vp8_variance16x16_c; -extern variance_function vp8_variance8x16_c; -extern variance_function vp8_variance16x8_c; -extern variance_function vp8_variance8x8_c; -extern variance_function vp8_variance4x4_c; -extern variance_function vp8_mse16x16_c; - -extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_c; -extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_c; -extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_c; -extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_c; -extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_c; - -extern unsigned int vp8_get_mb_ss_c(short *); -extern unsigned int vp8_get16x16pred_error_c(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride); -extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); -extern unsigned int vp8_get16x16var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); -extern unsigned int vp8_get4x4sse_cs_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride); - -// mmx imports -extern int vp8_mbuverror_mmx(MACROBLOCK *mb); -extern void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d); -extern void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch); -extern void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride); -extern void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch); -extern void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch); -extern void vp8_fast_fdct8x4_mmx(short *input, short *output, int pitch); -extern void vp8_fast_fdct4x4_mmx(short *input, short *output, int pitch); -extern variance_function vp8_variance4x4_mmx; -extern variance_function vp8_variance8x8_mmx; -extern variance_function vp8_variance8x16_mmx; -extern variance_function vp8_variance16x8_mmx; -extern variance_function vp8_variance16x16_mmx; - -extern variance_function vp8_mse16x16_mmx; -extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_mmx; -extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_mmx; -extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_mmx; -extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_mmx; -extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_mmx; - -extern unsigned int vp8_get16x16pred_error_mmx(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride); -extern unsigned int vp8_get_mb_ss_mmx(short *); -extern unsigned int vp8_get8x8var_mmx(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); -extern unsigned int vp8_get16x16var_mmx(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); -extern unsigned int vp8_get4x4sse_cs_mmx(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride); - - -// wmt imports -extern int vp8_mbuverror_xmm(MACROBLOCK *mb); -extern void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d); -extern void vp8_fast_fdct8x4_wmt(short *input, short *output, int pitch); -extern variance_function vp8_variance4x4_wmt; -extern variance_function vp8_variance8x8_wmt; -extern variance_function vp8_variance8x16_wmt; -extern variance_function vp8_variance16x8_wmt; -extern variance_function vp8_variance16x16_wmt; - -extern variance_function vp8_mse16x16_wmt; -extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_wmt; -extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_wmt; -extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_wmt; -extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_wmt; -extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_wmt; -extern unsigned int vp8_get16x16pred_error_sse2(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride); -extern unsigned int vp8_get_mb_ss_sse2(short *src_ptr); -extern unsigned int vp8_get8x8var_sse2(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); -extern unsigned int vp8_get16x16var_sse2(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); - -extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled); - -void vp8_cmachine_specific_config(void) -{ - int mmx_enabled; - int xmm_enabled; - int wmt_enabled; - - vpx_get_processor_flags(&mmx_enabled, &xmm_enabled, &wmt_enabled); - - if (wmt_enabled) // Willamette - { - // Willamette instruction set available: - vp8_mbuverror = vp8_mbuverror_xmm; - vp8_fast_quantize_b = vp8_fast_quantize_b_sse; - vp8_short_fdct4x4 = vp8_short_fdct4x4_mmx; - vp8_short_fdct8x4 = vp8_short_fdct8x4_mmx; - vp8_fast_fdct4x4 = vp8_fast_fdct4x4_mmx; - vp8_fast_fdct8x4 = vp8_fast_fdct8x4_wmt; - vp8_subtract_b = vp8_subtract_b_mmx; - vp8_subtract_mbuv = vp8_subtract_mbuv_mmx; - vp8_variance4x4 = vp8_variance4x4_mmx; - vp8_variance8x8 = vp8_variance8x8_mmx; - vp8_variance8x16 = vp8_variance8x16_wmt; - vp8_variance16x8 = vp8_variance16x8_wmt; - vp8_variance16x16 = vp8_variance16x16_wmt; - vp8_mse16x16 = vp8_mse16x16_wmt; - vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_wmt; - vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_wmt; - vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_wmt; - vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_wmt; - vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_wmt; - vp8_get_mb_ss = vp8_get_mb_ss_sse2; - vp8_get16x16pred_error = vp8_get16x16pred_error_sse2; - vp8_get8x8var = vp8_get8x8var_sse2; - vp8_get16x16var = vp8_get16x16var_sse2; - vp8_get4x4sse_cs = vp8_get4x4sse_cs_mmx; - vp8_sad16x16 = vp8_sad16x16_wmt; - vp8_sad16x8 = vp8_sad16x8_wmt; - vp8_sad8x16 = vp8_sad8x16_wmt; - vp8_sad8x8 = vp8_sad8x8_wmt; - vp8_sad4x4 = vp8_sad4x4_wmt; - vp8_block_error = vp8_block_error_xmm; - vp8_mbblock_error = vp8_mbblock_error_xmm; - vp8_subtract_mby = vp8_subtract_mby_mmx; - - } - else if (mmx_enabled) - { - // MMX instruction set available: - vp8_mbuverror = vp8_mbuverror_mmx; - vp8_fast_quantize_b = vp8_fast_quantize_b_mmx; - vp8_short_fdct4x4 = vp8_short_fdct4x4_mmx; - vp8_short_fdct8x4 = vp8_short_fdct8x4_mmx; - vp8_fast_fdct4x4 = vp8_fast_fdct4x4_mmx; - vp8_fast_fdct8x4 = vp8_fast_fdct8x4_mmx; - vp8_subtract_b = vp8_subtract_b_mmx; - vp8_subtract_mbuv = vp8_subtract_mbuv_mmx; - vp8_variance4x4 = vp8_variance4x4_mmx; - vp8_variance8x8 = vp8_variance8x8_mmx; - vp8_variance8x16 = vp8_variance8x16_mmx; - vp8_variance16x8 = vp8_variance16x8_mmx; - vp8_variance16x16 = vp8_variance16x16_mmx; - vp8_mse16x16 = vp8_mse16x16_mmx; - vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_mmx; - vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_mmx; - vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_mmx; - vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_mmx; - vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_mmx; - vp8_get_mb_ss = vp8_get_mb_ss_mmx; - vp8_get16x16pred_error = vp8_get16x16pred_error_mmx; - vp8_get8x8var = vp8_get8x8var_mmx; - vp8_get16x16var = vp8_get16x16var_mmx; - vp8_get4x4sse_cs = vp8_get4x4sse_cs_mmx; - vp8_sad16x16 = vp8_sad16x16_mmx; - vp8_sad16x8 = vp8_sad16x8_mmx; - vp8_sad8x16 = vp8_sad8x16_mmx; - vp8_sad8x8 = vp8_sad8x8_mmx; - vp8_sad4x4 = vp8_sad4x4_mmx; - vp8_block_error = vp8_block_error_mmx; - vp8_mbblock_error = vp8_mbblock_error_mmx; - vp8_subtract_mby = vp8_subtract_mby_mmx; - - } - else - { - // Pure C: - vp8_mbuverror = vp8_mbuverror_c; - vp8_fast_quantize_b = vp8_fast_quantize_b_c; - vp8_short_fdct4x4 = vp8_short_fdct4x4_c; - vp8_short_fdct8x4 = vp8_short_fdct8x4_c; - vp8_fast_fdct4x4 = vp8_fast_fdct4x4_c; - vp8_fast_fdct8x4 = vp8_fast_fdct8x4_c; - vp8_subtract_b = vp8_subtract_b_c; - vp8_subtract_mbuv = vp8_subtract_mbuv_c; - vp8_variance4x4 = vp8_variance4x4_c; - vp8_variance8x8 = vp8_variance8x8_c; - vp8_variance8x16 = vp8_variance8x16_c; - vp8_variance16x8 = vp8_variance16x8_c; - vp8_variance16x16 = vp8_variance16x16_c; - vp8_mse16x16 = vp8_mse16x16_c; - vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_c; - vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_c; - vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_c; - vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_c; - vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_c; - vp8_get_mb_ss = vp8_get_mb_ss_c; - vp8_get16x16pred_error = vp8_get16x16pred_error_c; - vp8_get8x8var = vp8_get8x8var_c; - vp8_get16x16var = vp8_get16x16var_c; - vp8_get4x4sse_cs = vp8_get4x4sse_cs_c; - vp8_sad16x16 = vp8_sad16x16_c; - vp8_sad16x8 = vp8_sad16x8_c; - vp8_sad8x16 = vp8_sad8x16_c; - vp8_sad8x8 = vp8_sad8x8_c; - vp8_sad4x4 = vp8_sad4x4_c; - vp8_block_error = vp8_block_error_c; - vp8_mbblock_error = vp8_mbblock_error_c; - vp8_subtract_mby = vp8_subtract_mby_c; - } - -} diff --git a/vp8/encoder/x86/dct_mmx.asm b/vp8/encoder/x86/dct_mmx.asm index e13423796..5acaca875 100644 --- a/vp8/encoder/x86/dct_mmx.asm +++ b/vp8/encoder/x86/dct_mmx.asm @@ -1,10 +1,11 @@ ; -; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. ; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. ; @@ -12,8 +13,7 @@ section .text global sym(vp8_short_fdct4x4_mmx) - global sym(vp8_fast_fdct4x4_mmx) - global sym(vp8_fast_fdct8x4_wmt) + global sym(vp8_short_fdct8x4_wmt) %define DCTCONSTANTSBITS (16) @@ -23,10 +23,6 @@ section .text %define x_c3 (25080) ; cos(pi*3/8) * (1<<15) -%define _1STSTAGESHIFT 14 -%define _2NDSTAGESHIFT 16 - -; using matrix multiply with source and destbuffer has a pitch ;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch) sym(vp8_short_fdct4x4_mmx): push rbp @@ -36,337 +32,10 @@ sym(vp8_short_fdct4x4_mmx): push rsi push rdi ; end prolog - - mov rsi, arg(0) ;input - mov rdi, arg(1) ;output - - movsxd rax, dword ptr arg(2) ;pitch - lea rdx, [dct_matrix GLOBAL] - - movq mm0, [rsi ] - movq mm1, [rsi + rax] - - movq mm2, [rsi + rax*2] - lea rsi, [rsi + rax*2] - - movq mm3, [rsi + rax] - - ; first column - movq mm4, mm0 - movq mm7, [rdx] - - pmaddwd mm4, mm7 - movq mm5, mm1 - - pmaddwd mm5, mm7 - movq mm6, mm4 - - punpckldq mm4, mm5 - punpckhdq mm6, mm5 - - paddd mm4, mm6 - movq mm5, mm2 - - - pmaddwd mm5, mm7 - movq mm6, mm3 - - pmaddwd mm6, mm7 - movq mm7, mm5 - - punpckldq mm5, mm6 - punpckhdq mm7, mm6 - - paddd mm5, mm7 - movq mm6, [dct1st_stage_rounding_mmx GLOBAL] - - paddd mm4, mm6 - paddd mm5, mm6 - - psrad mm4, _1STSTAGESHIFT - psrad mm5, _1STSTAGESHIFT - - packssdw mm4, mm5 - movq [rdi], mm4 - - ;second column - movq mm4, mm0 - - pmaddwd mm4, [rdx+8] - movq mm5, mm1 - - pmaddwd mm5, [rdx+8] - movq mm6, mm4 - - punpckldq mm4, mm5 - punpckhdq mm6, mm5 - - paddd mm4, mm6 - movq mm5, mm2 - - pmaddwd mm5, [rdx+8] - movq mm6, mm3 - - pmaddwd mm6, [rdx+8] - movq mm7, mm5 - - punpckldq mm5, mm6 - punpckhdq mm7, mm6 - - paddd mm5, mm7 - movq mm6, [dct1st_stage_rounding_mmx GLOBAL] - - paddd mm4, mm6 - paddd mm5, mm6 - - psrad mm4, _1STSTAGESHIFT - psrad mm5, _1STSTAGESHIFT - - packssdw mm4, mm5 - movq [rdi+8], mm4 - - - ;third column - movq mm4, mm0 - - pmaddwd mm4, [rdx+16] - movq mm5, mm1 - - pmaddwd mm5, [rdx+16] - movq mm6, mm4 - - punpckldq mm4, mm5 - punpckhdq mm6, mm5 - - paddd mm4, mm6 - movq mm5, mm2 - - pmaddwd mm5, [rdx+16] - movq mm6, mm3 - - pmaddwd mm6, [rdx+16] - movq mm7, mm5 - - punpckldq mm5, mm6 - punpckhdq mm7, mm6 - - paddd mm5, mm7 - movq mm6, [dct1st_stage_rounding_mmx GLOBAL] - - paddd mm4, mm6 - paddd mm5, mm6 - - psrad mm4, _1STSTAGESHIFT - psrad mm5, _1STSTAGESHIFT - - packssdw mm4, mm5 - movq [rdi+16], mm4 - - ;fourth column (this is the last column, so we do not have save the source any more) - - pmaddwd mm0, [rdx+24] - - pmaddwd mm1, [rdx+24] - movq mm6, mm0 - - punpckldq mm0, mm1 - punpckhdq mm6, mm1 - - paddd mm0, mm6 - - pmaddwd mm2, [rdx+24] - - pmaddwd mm3, [rdx+24] - movq mm7, mm2 - - punpckldq mm2, mm3 - punpckhdq mm7, mm3 - - paddd mm2, mm7 - movq mm6, [dct1st_stage_rounding_mmx GLOBAL] - - paddd mm0, mm6 - paddd mm2, mm6 - - psrad mm0, _1STSTAGESHIFT - psrad mm2, _1STSTAGESHIFT - - packssdw mm0, mm2 - - movq mm3, mm0 - - ; done with one pass - ; now start second pass - movq mm0, [rdi ] - movq mm1, [rdi+ 8] - movq mm2, [rdi+ 16] - - movq mm4, mm0 - - pmaddwd mm4, [rdx] - movq mm5, mm1 - - pmaddwd mm5, [rdx] - movq mm6, mm4 - - punpckldq mm4, mm5 - punpckhdq mm6, mm5 - - paddd mm4, mm6 - movq mm5, mm2 - - pmaddwd mm5, [rdx] - movq mm6, mm3 - - pmaddwd mm6, [rdx] - movq mm7, mm5 - - punpckldq mm5, mm6 - punpckhdq mm7, mm6 - - paddd mm5, mm7 - movq mm6, [dct2nd_stage_rounding_mmx GLOBAL] - - paddd mm4, mm6 - paddd mm5, mm6 - - psrad mm4, _2NDSTAGESHIFT - psrad mm5, _2NDSTAGESHIFT - - packssdw mm4, mm5 - movq [rdi], mm4 - - ;second column - movq mm4, mm0 - - pmaddwd mm4, [rdx+8] - movq mm5, mm1 - - pmaddwd mm5, [rdx+8] - movq mm6, mm4 - - punpckldq mm4, mm5 - punpckhdq mm6, mm5 - - paddd mm4, mm6 - movq mm5, mm2 - - pmaddwd mm5, [rdx+8] - movq mm6, mm3 - - pmaddwd mm6, [rdx+8] - movq mm7, mm5 - - punpckldq mm5, mm6 - punpckhdq mm7, mm6 - - paddd mm5, mm7 - movq mm6, [dct2nd_stage_rounding_mmx GLOBAL] - - paddd mm4, mm6 - paddd mm5, mm6 - - psrad mm4, _2NDSTAGESHIFT - psrad mm5, _2NDSTAGESHIFT - - packssdw mm4, mm5 - movq [rdi+8], mm4 - - - ;third column - movq mm4, mm0 - - pmaddwd mm4, [rdx+16] - movq mm5, mm1 - - pmaddwd mm5, [rdx+16] - movq mm6, mm4 - - punpckldq mm4, mm5 - punpckhdq mm6, mm5 - - paddd mm4, mm6 - movq mm5, mm2 - - pmaddwd mm5, [rdx+16] - movq mm6, mm3 - - pmaddwd mm6, [rdx+16] - movq mm7, mm5 - - punpckldq mm5, mm6 - punpckhdq mm7, mm6 - - paddd mm5, mm7 - movq mm6, [dct2nd_stage_rounding_mmx GLOBAL] - - paddd mm4, mm6 - paddd mm5, mm6 - - psrad mm4, _2NDSTAGESHIFT - psrad mm5, _2NDSTAGESHIFT - - packssdw mm4, mm5 - movq [rdi+16], mm4 - - ;fourth column - movq mm4, mm0 - - pmaddwd mm4, [rdx+24] - movq mm5, mm1 - - pmaddwd mm5, [rdx+24] - movq mm6, mm4 - - punpckldq mm4, mm5 - punpckhdq mm6, mm5 - - paddd mm4, mm6 - movq mm5, mm2 - - pmaddwd mm5, [rdx+24] - movq mm6, mm3 - - pmaddwd mm6, [rdx+24] - movq mm7, mm5 - - punpckldq mm5, mm6 - punpckhdq mm7, mm6 - - paddd mm5, mm7 - movq mm6, [dct2nd_stage_rounding_mmx GLOBAL] - - paddd mm4, mm6 - paddd mm5, mm6 - - psrad mm4, _2NDSTAGESHIFT - psrad mm5, _2NDSTAGESHIFT - - packssdw mm4, mm5 - movq [rdi+24], mm4 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_fast_fdct4x4_mmx(short *input, short *output, int pitch) -sym(vp8_fast_fdct4x4_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - GET_GOT rbx - push rsi - push rdi - ; end prolog mov rsi, arg(0) ;input mov rdi, arg(1) ;output - lea rdx, [dct_const_mmx GLOBAL] + lea rdx, [GLOBAL(dct_const_mmx)] movsxd rax, dword ptr arg(2) ;pitch lea rcx, [rsi + rax*2] @@ -378,11 +47,11 @@ sym(vp8_fast_fdct4x4_mmx): movq mm3, [rcx + rax] ; get the constants ;shift to left by 1 for prescision - paddw mm0, mm0 - paddw mm1, mm1 + psllw mm0, 3 + psllw mm1, 3 - psllw mm2, 1 - psllw mm3, 1 + psllw mm2, 3 + psllw mm3, 3 ; transpose for the second stage movq mm4, mm0 ; 00 01 02 03 @@ -530,20 +199,23 @@ sym(vp8_fast_fdct4x4_mmx): movq mm3, mm5 ; done with vertical - pcmpeqw mm4, mm4 - pcmpeqw mm5, mm5 - psrlw mm4, 15 - psrlw mm5, 15 + pcmpeqw mm4, mm4 + pcmpeqw mm5, mm5 + psrlw mm4, 15 + psrlw mm5, 15 + + psllw mm4, 2 + psllw mm5, 2 paddw mm0, mm4 paddw mm1, mm5 paddw mm2, mm4 paddw mm3, mm5 - psraw mm0, 1 - psraw mm1, 1 - psraw mm2, 1 - psraw mm3, 1 + psraw mm0, 3 + psraw mm1, 3 + psraw mm2, 3 + psraw mm3, 3 movq [rdi ], mm0 movq [rdi+ 8], mm1 @@ -559,8 +231,8 @@ sym(vp8_fast_fdct4x4_mmx): ret -;void vp8_fast_fdct8x4_wmt(short *input, short *output, int pitch) -sym(vp8_fast_fdct8x4_wmt): +;void vp8_short_fdct8x4_wmt(short *input, short *output, int pitch) +sym(vp8_short_fdct8x4_wmt): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 3 @@ -571,7 +243,7 @@ sym(vp8_fast_fdct8x4_wmt): mov rsi, arg(0) ;input mov rdi, arg(1) ;output - lea rdx, [dct_const_xmm GLOBAL] + lea rdx, [GLOBAL(dct_const_xmm)] movsxd rax, dword ptr arg(2) ;pitch lea rcx, [rsi + rax*2] @@ -583,11 +255,11 @@ sym(vp8_fast_fdct8x4_wmt): movdqa xmm3, [rcx + rax] ; get the constants ;shift to left by 1 for prescision - psllw xmm0, 1 - psllw xmm2, 1 + psllw xmm0, 3 + psllw xmm2, 3 - psllw xmm4, 1 - psllw xmm3, 1 + psllw xmm4, 3 + psllw xmm3, 3 ; transpose for the second stage movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07 @@ -757,20 +429,23 @@ sym(vp8_fast_fdct8x4_wmt): ; done with vertical - pcmpeqw xmm4, xmm4 - pcmpeqw xmm5, xmm5; - psrlw xmm4, 15 - psrlw xmm5, 15 + pcmpeqw xmm4, xmm4 + pcmpeqw xmm5, xmm5; + psrlw xmm4, 15 + psrlw xmm5, 15 + + psllw xmm4, 2 + psllw xmm5, 2 paddw xmm0, xmm4 paddw xmm1, xmm5 paddw xmm2, xmm4 paddw xmm3, xmm5 - psraw xmm0, 1 - psraw xmm1, 1 - psraw xmm2, 1 - psraw xmm3, 1 + psraw xmm0, 3 + psraw xmm1, 3 + psraw xmm2, 3 + psraw xmm3, 3 movq QWORD PTR[rdi ], xmm0 movq QWORD PTR[rdi+ 8], xmm1 diff --git a/vp8/encoder/x86/dct_sse2.asm b/vp8/encoder/x86/dct_sse2.asm index 3e5e9a70c..723a78d76 100644 --- a/vp8/encoder/x86/dct_sse2.asm +++ b/vp8/encoder/x86/dct_sse2.asm @@ -1,260 +1,189 @@ ; -; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. ; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. ; %include "vpx_ports/x86_abi_support.asm" -global sym(vp8_short_fdct4x4_wmt) - -%define DCTCONSTANTSBITS (16) -%define DCTROUNDINGVALUE (1<< (DCTCONSTANTSBITS-1)) -%define x_c1 (60547) ; cos(pi /8) * (1<<15) -%define x_c2 (46341) ; cos(pi*2/8) * (1<<15) -%define x_c3 (25080) ; cos(pi*3/8) * (1<<15) - -%define _1STSTAGESHIFT 14 -%define _2NDSTAGESHIFT 16 - - -;; using matrix multiply -;void vp8_short_fdct4x4_wmt(short *input, short *output) -sym(vp8_short_fdct4x4_wmt): +;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch) +global sym(vp8_short_fdct4x4_sse2) +sym(vp8_short_fdct4x4_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 2 + SHADOW_ARGS_TO_STACK 3 +;; SAVE_XMM GET_GOT rbx + push rsi + push rdi ; end prolog - mov rax, arg(0) ;input - mov rcx, arg(1) ;output - - lea rdx, [dct_matrix_sse2 GLOBAL] - - movdqu xmm0, [rax ] - movdqu xmm1, [rax+16] - - ; first column - movdqa xmm2, xmm0 - movdqa xmm7, [rdx] - - pmaddwd xmm2, xmm7 - movdqa xmm3, xmm1 - - pmaddwd xmm3, xmm7 - movdqa xmm4, xmm2 - - punpckldq xmm2, xmm3 - punpckhdq xmm4, xmm3 - - movdqa xmm3, xmm2 - punpckldq xmm2, xmm4 - - punpckhdq xmm3, xmm4 - paddd xmm2, xmm3 - - - paddd xmm2, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL] - psrad xmm2, _1STSTAGESHIFT - ;second column - movdqa xmm3, xmm0 - pmaddwd xmm3, [rdx+16] - - movdqa xmm4, xmm1 - pmaddwd xmm4, [rdx+16] - - movdqa xmm5, xmm3 - punpckldq xmm3, xmm4 - - punpckhdq xmm5, xmm4 - movdqa xmm4, xmm3 - - punpckldq xmm3, xmm5 - punpckhdq xmm4, xmm5 - - paddd xmm3, xmm4 - paddd xmm3, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL] - - - psrad xmm3, _1STSTAGESHIFT - packssdw xmm2, xmm3 - - ;third column - movdqa xmm3, xmm0 - pmaddwd xmm3, [rdx+32] - - movdqa xmm4, xmm1 - pmaddwd xmm4, [rdx+32] - - movdqa xmm5, xmm3 - punpckldq xmm3, xmm4 - - punpckhdq xmm5, xmm4 - movdqa xmm4, xmm3 - - punpckldq xmm3, xmm5 - punpckhdq xmm4, xmm5 - - paddd xmm3, xmm4 - paddd xmm3, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL] - - psrad xmm3, _1STSTAGESHIFT - - ;fourth column (this is the last column, so we do not have save the source any more) - pmaddwd xmm0, [rdx+48] - pmaddwd xmm1, [rdx+48] - - movdqa xmm4, xmm0 - punpckldq xmm0, xmm1 - - punpckhdq xmm4, xmm1 - movdqa xmm1, xmm0 - - punpckldq xmm0, xmm4 - punpckhdq xmm1, xmm4 - - paddd xmm0, xmm1 - paddd xmm0, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL] - - - psrad xmm0, _1STSTAGESHIFT - packssdw xmm3, xmm0 - ; done with one pass - ; now start second pass - movdqa xmm0, xmm2 - movdqa xmm1, xmm3 - - pmaddwd xmm2, xmm7 - pmaddwd xmm3, xmm7 - - movdqa xmm4, xmm2 - punpckldq xmm2, xmm3 + mov rsi, arg(0) + movsxd rax, DWORD PTR arg(2) + lea rdi, [rsi + rax*2] + + movq xmm0, MMWORD PTR[rsi ] ;03 02 01 00 + movq xmm2, MMWORD PTR[rsi + rax] ;13 12 11 10 + movq xmm1, MMWORD PTR[rsi + rax*2] ;23 22 21 20 + movq xmm3, MMWORD PTR[rdi + rax] ;33 32 31 30 + + punpcklqdq xmm0, xmm2 ;13 12 11 10 03 02 01 00 + punpcklqdq xmm1, xmm3 ;33 32 31 30 23 22 21 20 + + mov rdi, arg(1) + + movdqa xmm2, xmm0 + punpckldq xmm0, xmm1 ;23 22 03 02 21 20 01 00 + punpckhdq xmm2, xmm1 ;33 32 13 12 31 30 11 10 + movdqa xmm1, xmm0 + punpckldq xmm0, xmm2 ;31 21 30 20 11 10 01 00 + pshufhw xmm1, xmm1, 0b1h ;22 23 02 03 xx xx xx xx + pshufhw xmm2, xmm2, 0b1h ;32 33 12 13 xx xx xx xx + + punpckhdq xmm1, xmm2 ;32 33 22 23 12 13 02 03 + movdqa xmm3, xmm0 + paddw xmm0, xmm1 ;b1 a1 b1 a1 b1 a1 b1 a1 + psubw xmm3, xmm1 ;c1 d1 c1 d1 c1 d1 c1 d1 + psllw xmm0, 3 ;b1 <<= 3 a1 <<= 3 + psllw xmm3, 3 ;c1 <<= 3 d1 <<= 3 + movdqa xmm1, xmm0 + pmaddwd xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1 + pmaddwd xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1 + movdqa xmm4, xmm3 + pmaddwd xmm3, XMMWORD PTR[GLOBAL(_5352_2217)] ;c1*2217 + d1*5352 + pmaddwd xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)];d1*2217 - c1*5352 + + paddd xmm3, XMMWORD PTR[GLOBAL(_14500)] + paddd xmm4, XMMWORD PTR[GLOBAL(_7500)] + psrad xmm3, 12 ;(c1 * 2217 + d1 * 5352 + 14500)>>12 + psrad xmm4, 12 ;(d1 * 2217 - c1 * 5352 + 7500)>>12 + + packssdw xmm0, xmm1 ;op[2] op[0] + packssdw xmm3, xmm4 ;op[3] op[1] + ; 23 22 21 20 03 02 01 00 + ; + ; 33 32 31 30 13 12 11 10 + ; + movdqa xmm2, xmm0 + punpcklqdq xmm0, xmm3 ;13 12 11 10 03 02 01 00 + punpckhqdq xmm2, xmm3 ;23 22 21 20 33 32 31 30 + + movdqa xmm3, xmm0 + punpcklwd xmm0, xmm2 ;32 30 22 20 12 10 02 00 + punpckhwd xmm3, xmm2 ;33 31 23 21 13 11 03 01 + movdqa xmm2, xmm0 + punpcklwd xmm0, xmm3 ;13 12 11 10 03 02 01 00 + punpckhwd xmm2, xmm3 ;33 32 31 30 23 22 21 20 + + movdqa xmm5, XMMWORD PTR[GLOBAL(_7)] + pshufd xmm2, xmm2, 04eh + movdqa xmm3, xmm0 + paddw xmm0, xmm2 ;b1 b1 b1 b1 a1 a1 a1 a1 + psubw xmm3, xmm2 ;c1 c1 c1 c1 d1 d1 d1 d1 + + pshufd xmm0, xmm0, 0d8h ;b1 b1 a1 a1 b1 b1 a1 a1 + movdqa xmm2, xmm3 ;save d1 for compare + pshufd xmm3, xmm3, 0d8h ;c1 c1 d1 d1 c1 c1 d1 d1 + pshuflw xmm0, xmm0, 0d8h ;b1 b1 a1 a1 b1 a1 b1 a1 + pshuflw xmm3, xmm3, 0d8h ;c1 c1 d1 d1 c1 d1 c1 d1 + pshufhw xmm0, xmm0, 0d8h ;b1 a1 b1 a1 b1 a1 b1 a1 + pshufhw xmm3, xmm3, 0d8h ;c1 d1 c1 d1 c1 d1 c1 d1 + movdqa xmm1, xmm0 + pmaddwd xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1 + pmaddwd xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1 + + pxor xmm4, xmm4 ;zero out for compare + paddd xmm0, xmm5 + paddd xmm1, xmm5 + pcmpeqw xmm2, xmm4 + psrad xmm0, 4 ;(a1 + b1 + 7)>>4 + psrad xmm1, 4 ;(a1 - b1 + 7)>>4 + pandn xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper, + ;and keep bit 0 of lower + + movdqa xmm4, xmm3 + pmaddwd xmm3, XMMWORD PTR[GLOBAL(_5352_2217)] ;c1*2217 + d1*5352 + pmaddwd xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352 + paddd xmm3, XMMWORD PTR[GLOBAL(_12000)] + paddd xmm4, XMMWORD PTR[GLOBAL(_51000)] + packssdw xmm0, xmm1 ;op[8] op[0] + psrad xmm3, 16 ;(c1 * 2217 + d1 * 5352 + 12000)>>16 + psrad xmm4, 16 ;(d1 * 2217 - c1 * 5352 + 51000)>>16 + + packssdw xmm3, xmm4 ;op[12] op[4] + movdqa xmm1, xmm0 + paddw xmm3, xmm2 ;op[4] += (d1!=0) + punpcklqdq xmm0, xmm3 ;op[4] op[0] + punpckhqdq xmm1, xmm3 ;op[12] op[8] + + movdqa XMMWORD PTR[rdi + 0], xmm0 + movdqa XMMWORD PTR[rdi + 16], xmm1 - punpckhdq xmm4, xmm3 - movdqa xmm3, xmm2 - - punpckldq xmm2, xmm4 - punpckhdq xmm3, xmm4 - - paddd xmm2, xmm3 - paddd xmm2, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL] - - psrad xmm2, _2NDSTAGESHIFT - - ;second column - movdqa xmm3, xmm0 - pmaddwd xmm3, [rdx+16] - - movdqa xmm4, xmm1 - pmaddwd xmm4, [rdx+16] - - movdqa xmm5, xmm3 - punpckldq xmm3, xmm4 - - punpckhdq xmm5, xmm4 - movdqa xmm4, xmm3 - - punpckldq xmm3, xmm5 - punpckhdq xmm4, xmm5 - - paddd xmm3, xmm4 - paddd xmm3, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL] - - psrad xmm3, _2NDSTAGESHIFT - packssdw xmm2, xmm3 - - movdqu [rcx], xmm2 - ;third column - movdqa xmm3, xmm0 - pmaddwd xmm3, [rdx+32] - - movdqa xmm4, xmm1 - pmaddwd xmm4, [rdx+32] - - movdqa xmm5, xmm3 - punpckldq xmm3, xmm4 - - punpckhdq xmm5, xmm4 - movdqa xmm4, xmm3 - - punpckldq xmm3, xmm5 - punpckhdq xmm4, xmm5 - - paddd xmm3, xmm4 - paddd xmm3, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL] - - psrad xmm3, _2NDSTAGESHIFT - ;fourth column - pmaddwd xmm0, [rdx+48] - pmaddwd xmm1, [rdx+48] - - movdqa xmm4, xmm0 - punpckldq xmm0, xmm1 - - punpckhdq xmm4, xmm1 - movdqa xmm1, xmm0 - - punpckldq xmm0, xmm4 - punpckhdq xmm1, xmm4 - - paddd xmm0, xmm1 - paddd xmm0, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL] - - psrad xmm0, _2NDSTAGESHIFT - packssdw xmm3, xmm0 - - movdqu [rcx+16], xmm3 - - mov rsp, rbp ; begin epilog + pop rdi + pop rsi RESTORE_GOT +;; RESTORE_XMM UNSHADOW_ARGS pop rbp ret - SECTION_RODATA -;static unsigned int dct1st_stage_rounding_sse2[4] = align 16 -dct1st_stage_rounding_sse2: - times 4 dd 8192 - - -;static unsigned int dct2nd_stage_rounding_sse2[4] = +_5352_2217: + dw 5352 + dw 2217 + dw 5352 + dw 2217 + dw 5352 + dw 2217 + dw 5352 + dw 2217 align 16 -dct2nd_stage_rounding_sse2: - times 4 dd 32768 - -;static short dct_matrix_sse2[4][8]= +_2217_neg5352: + dw 2217 + dw -5352 + dw 2217 + dw -5352 + dw 2217 + dw -5352 + dw 2217 + dw -5352 align 16 -dct_matrix_sse2: - times 8 dw 23170 - - dw 30274 - dw 12540 - dw -12540 - dw -30274 - dw 30274 - dw 12540 - dw -12540 - dw -30274 - - dw 23170 - times 2 dw -23170 - times 2 dw 23170 - times 2 dw -23170 - dw 23170 +_mult_add: + times 8 dw 1 +align 16 +_cmp_mask: + times 4 dw 1 + times 4 dw 0 - dw 12540 - dw -30274 - dw 30274 - dw -12540 - dw 12540 - dw -30274 - dw 30274 - dw -12540 +align 16 +_mult_sub: + dw 1 + dw -1 + dw 1 + dw -1 + dw 1 + dw -1 + dw 1 + dw -1 +align 16 +_7: + times 4 dd 7 +align 16 +_14500: + times 4 dd 14500 +align 16 +_7500: + times 4 dd 7500 +align 16 +_12000: + times 4 dd 12000 +align 16 +_51000: + times 4 dd 51000 diff --git a/vp8/encoder/x86/dct_x86.h b/vp8/encoder/x86/dct_x86.h index bc80e64ef..05824c684 100644 --- a/vp8/encoder/x86/dct_x86.h +++ b/vp8/encoder/x86/dct_x86.h @@ -1,10 +1,11 @@ /* - * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. * - * Use of this source code is governed by a BSD-style license and patent - * grant that can be found in the LICENSE file in the root of the source - * tree. All contributing project authors may be found in the AUTHORS - * file in the root of the source tree. + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. */ @@ -21,46 +22,41 @@ #if HAVE_MMX extern prototype_fdct(vp8_short_fdct4x4_mmx); extern prototype_fdct(vp8_short_fdct8x4_mmx); -extern prototype_fdct(vp8_fast_fdct4x4_mmx); -extern prototype_fdct(vp8_fast_fdct8x4_mmx); #if !CONFIG_RUNTIME_CPU_DETECT +#if 0 #undef vp8_fdct_short4x4 #define vp8_fdct_short4x4 vp8_short_fdct4x4_mmx #undef vp8_fdct_short8x4 #define vp8_fdct_short8x4 vp8_short_fdct8x4_mmx - -#undef vp8_fdct_fast4x4 -#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_mmx - -#undef vp8_fdct_fast8x4 -#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_mmx +#endif #endif #endif #if HAVE_SSE2 -extern prototype_fdct(vp8_short_fdct4x4_wmt); extern prototype_fdct(vp8_short_fdct8x4_wmt); -extern prototype_fdct(vp8_fast_fdct8x4_wmt); - extern prototype_fdct(vp8_short_walsh4x4_sse2); -#if !CONFIG_RUNTIME_CPU_DETECT +extern prototype_fdct(vp8_short_fdct4x4_sse2); -#if 0 +#if !CONFIG_RUNTIME_CPU_DETECT +#if 1 /* short SSE2 DCT currently disabled, does not match the MMX version */ #undef vp8_fdct_short4x4 -#define vp8_fdct_short4x4 vp8_short_fdct4x4_wmt +#define vp8_fdct_short4x4 vp8_short_fdct4x4_sse2 #undef vp8_fdct_short8x4 -#define vp8_fdct_short8x4 vp8_short_fdct8x4_wmt +#define vp8_fdct_short8x4 vp8_short_fdct8x4_sse2 #endif +#undef vp8_fdct_fast4x4 +#define vp8_fdct_fast4x4 vp8_short_fdct4x4_sse2 + #undef vp8_fdct_fast8x4 -#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_wmt +#define vp8_fdct_fast8x4 vp8_short_fdct8x4_sse2 #undef vp8_fdct_walsh_short4x4 #define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_sse2 diff --git a/vp8/encoder/x86/encodemb_x86.h b/vp8/encoder/x86/encodemb_x86.h index 9397a6cca..69b3edd66 100644 --- a/vp8/encoder/x86/encodemb_x86.h +++ b/vp8/encoder/x86/encodemb_x86.h @@ -1,10 +1,11 @@ /* - * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. * - * Use of this source code is governed by a BSD-style license and patent - * grant that can be found in the LICENSE file in the root of the source - * tree. All contributing project authors may be found in the AUTHORS - * file in the root of the source tree. + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. */ @@ -54,7 +55,9 @@ extern prototype_submbuv(vp8_subtract_mbuv_mmx); extern prototype_berr(vp8_block_error_xmm); extern prototype_mberr(vp8_mbblock_error_xmm); extern prototype_mbuverr(vp8_mbuverror_xmm); - +extern prototype_subb(vp8_subtract_b_sse2); +extern prototype_submby(vp8_subtract_mby_sse2); +extern prototype_submbuv(vp8_subtract_mbuv_sse2); #if !CONFIG_RUNTIME_CPU_DETECT #undef vp8_encodemb_berr @@ -66,6 +69,15 @@ extern prototype_mbuverr(vp8_mbuverror_xmm); #undef vp8_encodemb_mbuverr #define vp8_encodemb_mbuverr vp8_mbuverror_xmm +#undef vp8_encodemb_subb +#define vp8_encodemb_subb vp8_subtract_b_sse2 + +#undef vp8_encodemb_submby +#define vp8_encodemb_submby vp8_subtract_mby_sse2 + +#undef vp8_encodemb_submbuv +#define vp8_encodemb_submbuv vp8_subtract_mbuv_sse2 + #endif #endif diff --git a/vp8/encoder/x86/encodeopt.asm b/vp8/encoder/x86/encodeopt.asm index 194047155..c0f06bbbb 100644 --- a/vp8/encoder/x86/encodeopt.asm +++ b/vp8/encoder/x86/encodeopt.asm @@ -1,16 +1,16 @@ ; -; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. ; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. ; %include "vpx_ports/x86_abi_support.asm" - ;int vp8_block_error_xmm(short *coeff_ptr, short *dcoef_ptr) global sym(vp8_block_error_xmm) sym(vp8_block_error_xmm): @@ -19,11 +19,9 @@ sym(vp8_block_error_xmm): SHADOW_ARGS_TO_STACK 2 push rsi push rdi - ; end prolog - + ; end prologue mov rsi, arg(0) ;coeff_ptr - pxor xmm7, xmm7 mov rdi, arg(1) ;dcoef_ptr movdqa xmm3, [rsi] @@ -32,33 +30,27 @@ sym(vp8_block_error_xmm): movdqa xmm5, [rsi+16] movdqa xmm6, [rdi+16] - pxor xmm1, xmm1 ; from movd xmm1, dc; dc=0 + psubw xmm3, xmm4 - movdqa xmm2, xmm7 psubw xmm5, xmm6 - - por xmm1, xmm2 + pmaddwd xmm3, xmm3 pmaddwd xmm5, xmm5 - pcmpeqw xmm1, xmm7 - psubw xmm3, xmm4 + paddd xmm3, xmm5 - pand xmm1, xmm3 - pmaddwd xmm1, xmm1 - - paddd xmm1, xmm5 - movdqa xmm0, xmm1 + pxor xmm7, xmm7 + movdqa xmm0, xmm3 punpckldq xmm0, xmm7 - punpckhdq xmm1, xmm7 + punpckhdq xmm3, xmm7 - paddd xmm0, xmm1 - movdqa xmm1, xmm0 + paddd xmm0, xmm3 + movdqa xmm3, xmm0 psrldq xmm0, 8 - paddd xmm0, xmm1 + paddd xmm0, xmm3 - movd rax, xmm0 + movq rax, xmm0 pop rdi pop rsi @@ -67,7 +59,6 @@ sym(vp8_block_error_xmm): pop rbp ret - ;int vp8_block_error_mmx(short *coeff_ptr, short *dcoef_ptr) global sym(vp8_block_error_mmx) sym(vp8_block_error_mmx): @@ -124,7 +115,7 @@ sym(vp8_block_error_mmx): psrlq mm1, 32 paddd mm0, mm1 - movd rax, mm0 + movq rax, mm0 pop rdi pop rsi @@ -201,7 +192,7 @@ mberror_loop_mmx: psrlq mm2, 32 paddd mm0, mm2 - movd rax, mm0 + movq rax, mm0 pop rdi pop rsi @@ -269,7 +260,7 @@ mberror_loop: psrldq xmm0, 8 paddd xmm0, xmm1 - movd rax, xmm0 + movq rax, xmm0 pop rdi pop rsi @@ -326,7 +317,7 @@ mbuverror_loop_mmx: psrlq mm7, 32 paddd mm0, mm7 - movd rax, mm0 + movq rax, mm0 pop rdi pop rsi @@ -383,7 +374,7 @@ mbuverror_loop: psrldq xmm1, 8 paddd xmm1, xmm2 - movd rax, xmm1 + movq rax, xmm1 pop rdi pop rsi diff --git a/vp8/encoder/x86/fwalsh_sse2.asm b/vp8/encoder/x86/fwalsh_sse2.asm index 7d8620178..39439f0d8 100644 --- a/vp8/encoder/x86/fwalsh_sse2.asm +++ b/vp8/encoder/x86/fwalsh_sse2.asm @@ -1,10 +1,11 @@ ; -; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. ; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. ; @@ -16,102 +17,148 @@ sym(vp8_short_walsh4x4_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 3 + SAVE_XMM + GET_GOT rbx push rsi push rdi ; end prolog - mov rsi, arg(0) - mov rdi, arg(1) - - movdqu xmm4, [rsi + 0] ;ip[4] ip[0] - movdqu xmm0, [rsi + 16] ;ip[12] ip[8] - - pxor xmm7, xmm7 - ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; 13 12 11 10 03 02 01 00 - ; - ; 33 32 31 30 23 22 21 20 - ; - movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00 - punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00 - punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10 - movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00 - punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00 - punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02 - ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] - movdqa xmm3, xmm4 ;ip[4] ip[0] - - paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 - psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 - + mov rsi, arg(0) ; input + mov rdi, arg(1) ; output + movsxd rdx, dword ptr arg(2) ; pitch + + ; first for loop + movq xmm0, MMWORD PTR [rsi] ; load input + movq xmm1, MMWORD PTR [rsi + rdx] + lea rsi, [rsi + rdx*2] + movq xmm2, MMWORD PTR [rsi] + movq xmm3, MMWORD PTR [rsi + rdx] + + punpcklwd xmm0, xmm1 + punpcklwd xmm2, xmm3 + + movdqa xmm1, xmm0 + punpckldq xmm0, xmm2 ; ip[1] ip[0] + punpckhdq xmm1, xmm2 ; ip[3] ip[2] + + movdqa xmm2, xmm0 + paddw xmm0, xmm1 + psubw xmm2, xmm1 + + psllw xmm0, 2 ; d1 a1 + psllw xmm2, 2 ; c1 b1 + + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 ; b1 a1 + punpckhqdq xmm1, xmm2 ; c1 d1 + + pxor xmm6, xmm6 + movq xmm6, xmm0 + pxor xmm7, xmm7 + pcmpeqw xmm7, xmm6 + paddw xmm7, [GLOBAL(c1)] + + movdqa xmm2, xmm0 + paddw xmm0, xmm1 ; b1+c1 a1+d1 + psubw xmm2, xmm1 ; b1-c1 a1-d1 + paddw xmm0, xmm7 ; b1+c1 a1+d1+(a1!=0) + + ; second for loop + ; input: 13 9 5 1 12 8 4 0 (xmm0) + ; 14 10 6 2 15 11 7 3 (xmm2) + ; after shuffle: + ; 13 5 9 1 12 4 8 0 (xmm0) + ; 14 6 10 2 15 7 11 3 (xmm1) + pshuflw xmm3, xmm0, 0xd8 + pshufhw xmm0, xmm3, 0xd8 + pshuflw xmm3, xmm2, 0xd8 + pshufhw xmm1, xmm3, 0xd8 + + movdqa xmm2, xmm0 + pmaddwd xmm0, [GLOBAL(c1)] ; d11 a11 d10 a10 + pmaddwd xmm2, [GLOBAL(cn1)] ; c11 b11 c10 b10 + movdqa xmm3, xmm1 + pmaddwd xmm1, [GLOBAL(c1)] ; d12 a12 d13 a13 + pmaddwd xmm3, [GLOBAL(cn1)] ; c12 b12 c13 b13 + + pshufd xmm4, xmm0, 0xd8 ; d11 d10 a11 a10 + pshufd xmm5, xmm2, 0xd8 ; c11 c10 b11 b10 + pshufd xmm6, xmm1, 0x72 ; d13 d12 a13 a12 + pshufd xmm7, xmm3, 0x72 ; c13 c12 b13 b12 + + movdqa xmm0, xmm4 + punpcklqdq xmm0, xmm5 ; b11 b10 a11 a10 + punpckhqdq xmm4, xmm5 ; c11 c10 d11 d10 + movdqa xmm1, xmm6 + punpcklqdq xmm1, xmm7 ; b13 b12 a13 a12 + punpckhqdq xmm6, xmm7 ; c13 c12 d13 d12 + + movdqa xmm2, xmm0 + paddd xmm0, xmm4 ; b21 b20 a21 a20 + psubd xmm2, xmm4 ; c21 c20 d21 d20 + movdqa xmm3, xmm1 + paddd xmm1, xmm6 ; b23 b22 a23 a22 + psubd xmm3, xmm6 ; c23 c22 d23 d22 + + pxor xmm4, xmm4 movdqa xmm5, xmm4 - punpcklqdq xmm4, xmm3 ;d1 a1 - punpckhqdq xmm5, xmm3 ;c1 b1 - - movdqa xmm1, xmm5 ;c1 b1 - paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0] - psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] - ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; 13 12 11 10 03 02 01 00 - ; - ; 33 32 31 30 23 22 21 20 - ; - movdqa xmm0, xmm5 ; 13 12 11 10 03 02 01 00 - punpcklwd xmm5, xmm4 ; 23 03 22 02 21 01 20 00 - punpckhwd xmm0, xmm4 ; 33 13 32 12 31 11 30 10 - movdqa xmm1, xmm5 ; 23 03 22 02 21 01 20 00 - punpcklwd xmm5, xmm0 ; 31 21 11 01 30 20 10 00 - punpckhwd xmm1, xmm0 ; 33 23 13 03 32 22 12 02 - ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] - movdqa xmm3, xmm5 ;ip[4] ip[0] - - paddw xmm5, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 - psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 - - movdqa xmm6, xmm5 - punpcklqdq xmm5, xmm3 ;d1 a1 - punpckhqdq xmm6, xmm3 ;c1 b1 - - movdqa xmm1, xmm6 ;c1 b1 - paddw xmm6, xmm5 ;dl+cl a1+b1 aka op[4] op[0] - psubw xmm5, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] - - movdqa xmm0, xmm6 ;aka b2 a2 - movdqa xmm1, xmm5 ;aka d2 c2 - - pcmpgtw xmm0, xmm7 - pcmpgtw xmm1, xmm7 - - psrlw xmm0, 15 - psrlw xmm1, 15 - - paddw xmm6, xmm0 - paddw xmm5, xmm1 - - psraw xmm6, 1 - psraw xmm5, 1 - - ; a2 = a1 + b1; - ; b2 = c1 + d1; - ; c2 = a1 - b1; - ; d2 = d1 - c1; - ; a2 += (a2>0); - ; b2 += (b2>0); - ; c2 += (c2>0); - ; d2 += (d2>0); - ; op[0] = (a2)>>1; - ; op[4] = (b2)>>1; - ; op[8] = (c2)>>1; - ; op[12]= (d2)>>1; - - movdqu [rdi + 0], xmm6 - movdqu [rdi + 16], xmm5 + pcmpgtd xmm4, xmm0 + pcmpgtd xmm5, xmm2 + pand xmm4, [GLOBAL(cd1)] + pand xmm5, [GLOBAL(cd1)] + + pxor xmm6, xmm6 + movdqa xmm7, xmm6 + pcmpgtd xmm6, xmm1 + pcmpgtd xmm7, xmm3 + pand xmm6, [GLOBAL(cd1)] + pand xmm7, [GLOBAL(cd1)] + + paddd xmm0, xmm4 + paddd xmm2, xmm5 + paddd xmm0, [GLOBAL(cd3)] + paddd xmm2, [GLOBAL(cd3)] + paddd xmm1, xmm6 + paddd xmm3, xmm7 + paddd xmm1, [GLOBAL(cd3)] + paddd xmm3, [GLOBAL(cd3)] + + psrad xmm0, 3 + psrad xmm1, 3 + psrad xmm2, 3 + psrad xmm3, 3 + movdqa xmm4, xmm0 + punpcklqdq xmm0, xmm1 ; a23 a22 a21 a20 + punpckhqdq xmm4, xmm1 ; b23 b22 b21 b20 + movdqa xmm5, xmm2 + punpckhqdq xmm2, xmm3 ; c23 c22 c21 c20 + punpcklqdq xmm5, xmm3 ; d23 d22 d21 d20 + + packssdw xmm0, xmm4 ; b23 b22 b21 b20 a23 a22 a21 a20 + packssdw xmm2, xmm5 ; d23 d22 d21 d20 c23 c22 c21 c20 + + movdqa XMMWORD PTR [rdi], xmm0 + movdqa XMMWORD PTR [rdi + 16], xmm2 ; begin epilog pop rdi pop rsi + RESTORE_GOT + RESTORE_XMM UNSHADOW_ARGS pop rbp ret + +SECTION_RODATA +align 16 +c1: + dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001 +align 16 +cn1: + dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff +align 16 +cd1: + dd 0x00000001, 0x00000001, 0x00000001, 0x00000001 +align 16 +cd3: + dd 0x00000003, 0x00000003, 0x00000003, 0x00000003 diff --git a/vp8/encoder/x86/mcomp_x86.h b/vp8/encoder/x86/mcomp_x86.h index 5661491ad..3b7b29c21 100644 --- a/vp8/encoder/x86/mcomp_x86.h +++ b/vp8/encoder/x86/mcomp_x86.h @@ -1,10 +1,11 @@ /* - * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. * - * Use of this source code is governed by a BSD-style license and patent - * grant that can be found in the LICENSE file in the root of the source - * tree. All contributing project authors may be found in the AUTHORS - * file in the root of the source tree. + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. */ @@ -23,5 +24,14 @@ #endif #endif +#if HAVE_SSE4_1 +#if !CONFIG_RUNTIME_CPU_DETECT + +#undef vp8_search_full_search +#define vp8_search_full_search vp8_full_search_sadx8 + +#endif +#endif + #endif diff --git a/vp8/encoder/x86/preproc_mmx.c b/vp8/encoder/x86/preproc_mmx.c index 69617ca47..a182c8856 100644 --- a/vp8/encoder/x86/preproc_mmx.c +++ b/vp8/encoder/x86/preproc_mmx.c @@ -1,10 +1,11 @@ /* - * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. * - * Use of this source code is governed by a BSD-style license and patent - * grant that can be found in the LICENSE file in the root of the source - * tree. All contributing project authors may be found in the AUTHORS - * file in the root of the source tree. + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. */ diff --git a/vp8/encoder/x86/quantize_mmx.asm b/vp8/encoder/x86/quantize_mmx.asm index 847fc6e37..f29a54ecd 100644 --- a/vp8/encoder/x86/quantize_mmx.asm +++ b/vp8/encoder/x86/quantize_mmx.asm @@ -1,10 +1,11 @@ ; -; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. ; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. ; @@ -248,7 +249,7 @@ sym(vp8_fast_quantize_b_impl_mmx): paddd mm0, mm5 ; eob adjustment begins here - movd rcx, mm0 + movq rcx, mm0 and rcx, 0xffff xor rdx, rdx @@ -261,7 +262,7 @@ sym(vp8_fast_quantize_b_impl_mmx): and rax, rdx ; Substitute the sse assembly for the old mmx mixed assembly/C. The ; following is kept as reference - ; movd rcx, mm0 + ; movq rcx, mm0 ; bsr rax, rcx ; ; mov eob, rax @@ -283,156 +284,3 @@ sym(vp8_fast_quantize_b_impl_mmx): UNSHADOW_ARGS pop rbp ret - - -;int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr, -; short *qcoeff_ptr,short *dequant_ptr, -; short *scan_mask, short *round_ptr, -; short *quant_ptr, short *dqcoeff_ptr); -global sym(vp8_fast_quantize_b_impl_sse) -sym(vp8_fast_quantize_b_impl_sse): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - push rsi - push rdi - ; end prolog - - - mov rsi, arg(0) ;coeff_ptr - movdqa xmm0, [rsi] - - mov rax, arg(1) ;zbin_ptr - movdqa xmm1, [rax] - - movdqa xmm3, xmm0 - psraw xmm0, 15 - - pxor xmm3, xmm0 - psubw xmm3, xmm0 ; abs - - movdqa xmm2, xmm3 - pcmpgtw xmm1, xmm2 - - pandn xmm1, xmm2 - movdqa xmm3, xmm1 - - mov rdx, arg(6) ; quant_ptr - movdqa xmm1, [rdx] - - mov rcx, arg(5) ; round_ptr - movdqa xmm2, [rcx] - - paddw xmm3, xmm2 - pmulhuw xmm3, xmm1 - - pxor xmm3, xmm0 - psubw xmm3, xmm0 ;gain the sign back - - mov rdi, arg(2) ;qcoeff_ptr - movdqa xmm0, xmm3 - - movdqa [rdi], xmm3 - - mov rax, arg(3) ;dequant_ptr - movdqa xmm2, [rax] - - pmullw xmm3, xmm2 - mov rax, arg(7) ;dqcoeff_ptr - - movdqa [rax], xmm3 - - ; next 8 - movdqa xmm4, [rsi+16] - - mov rax, arg(1) ;zbin_ptr - movdqa xmm5, [rax+16] - - movdqa xmm7, xmm4 - psraw xmm4, 15 - - pxor xmm7, xmm4 - psubw xmm7, xmm4 ; abs - - movdqa xmm6, xmm7 - pcmpgtw xmm5, xmm6 - - pandn xmm5, xmm6 - movdqa xmm7, xmm5 - - movdqa xmm5, [rdx+16] - movdqa xmm6, [rcx+16] - - - paddw xmm7, xmm6 - pmulhuw xmm7, xmm5 - - pxor xmm7, xmm4 - psubw xmm7, xmm4;gain the sign back - - mov rdi, arg(2) ;qcoeff_ptr - - movdqa xmm1, xmm7 - movdqa [rdi+16], xmm7 - - mov rax, arg(3) ;dequant_ptr - movdqa xmm6, [rax+16] - - pmullw xmm7, xmm6 - mov rax, arg(7) ;dqcoeff_ptr - - movdqa [rax+16], xmm7 - mov rdi, arg(4) ;scan_mask - - pxor xmm7, xmm7 - movdqa xmm2, [rdi] - - movdqa xmm3, [rdi+16]; - pcmpeqw xmm0, xmm7 - - pcmpeqw xmm1, xmm7 - pcmpeqw xmm6, xmm6 - - pxor xmm0, xmm6 - pxor xmm1, xmm6 - - psrlw xmm0, 15 - psrlw xmm1, 15 - - pmaddwd xmm0, xmm2 - pmaddwd xmm1, xmm3 - - movq xmm2, xmm0 - movq xmm3, xmm1 - - psrldq xmm0, 8 - psrldq xmm1, 8 - - paddd xmm0, xmm1 - paddd xmm2, xmm3 - - paddd xmm0, xmm2 - movq xmm1, xmm0 - - psrldq xmm0, 4 - paddd xmm1, xmm0 - - movd rcx, xmm1 - and rcx, 0xffff - - xor rdx, rdx - sub rdx, rcx - - bsr rax, rcx - inc rax - - sar rdx, 31 - and rax, rdx - - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm new file mode 100644 index 000000000..1e0bd5c48 --- /dev/null +++ b/vp8/encoder/x86/quantize_sse2.asm @@ -0,0 +1,388 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + + +;int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr, +; short *qcoeff_ptr,short *dequant_ptr, +; const int *default_zig_zag, short *round_ptr, +; short *quant_ptr, short *dqcoeff_ptr, +; unsigned short zbin_oq_value, +; short *zbin_boost_ptr); +; +global sym(vp8_regular_quantize_b_impl_sse2) +sym(vp8_regular_quantize_b_impl_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 10 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + + %define abs_minus_zbin_lo 0 + %define abs_minus_zbin_hi 16 + %define temp_qcoeff_lo 32 + %define temp_qcoeff_hi 48 + %define save_xmm6 64 + %define save_xmm7 80 + %define eob 96 + + %define vp8_regularquantizeb_stack_size eob + 16 + + sub rsp, vp8_regularquantizeb_stack_size + + movdqa OWORD PTR[rsp + save_xmm6], xmm6 + movdqa OWORD PTR[rsp + save_xmm7], xmm7 + + mov rdx, arg(0) ;coeff_ptr + mov eax, arg(8) ;zbin_oq_value + + mov rcx, arg(1) ;zbin_ptr + movd xmm7, eax + + movdqa xmm0, OWORD PTR[rdx] + movdqa xmm4, OWORD PTR[rdx + 16] + + movdqa xmm1, xmm0 + movdqa xmm5, xmm4 + + psraw xmm0, 15 ;sign of z (aka sz) + psraw xmm4, 15 ;sign of z (aka sz) + + pxor xmm1, xmm0 + pxor xmm5, xmm4 + + movdqa xmm2, OWORD PTR[rcx] ;load zbin_ptr + movdqa xmm3, OWORD PTR[rcx + 16] ;load zbin_ptr + + pshuflw xmm7, xmm7, 0 + psubw xmm1, xmm0 ;x = abs(z) + + punpcklwd xmm7, xmm7 ;duplicated zbin_oq_value + psubw xmm5, xmm4 ;x = abs(z) + + paddw xmm2, xmm7 + paddw xmm3, xmm7 + + psubw xmm1, xmm2 ;sub (zbin_ptr + zbin_oq_value) + psubw xmm5, xmm3 ;sub (zbin_ptr + zbin_oq_value) + + mov rdi, arg(5) ;round_ptr + mov rsi, arg(6) ;quant_ptr + + movdqa OWORD PTR[rsp + abs_minus_zbin_lo], xmm1 + movdqa OWORD PTR[rsp + abs_minus_zbin_hi], xmm5 + + paddw xmm1, xmm2 ;add (zbin_ptr + zbin_oq_value) back + paddw xmm5, xmm3 ;add (zbin_ptr + zbin_oq_value) back + + movdqa xmm2, OWORD PTR[rdi] + movdqa xmm3, OWORD PTR[rsi] + + movdqa xmm6, OWORD PTR[rdi + 16] + movdqa xmm7, OWORD PTR[rsi + 16] + + paddw xmm1, xmm2 + paddw xmm5, xmm6 + + pmulhw xmm1, xmm3 + pmulhw xmm5, xmm7 + + mov rsi, arg(2) ;qcoeff_ptr + pxor xmm6, xmm6 + + pxor xmm1, xmm0 + pxor xmm5, xmm4 + + psubw xmm1, xmm0 + psubw xmm5, xmm4 + + movdqa OWORD PTR[rsp + temp_qcoeff_lo], xmm1 + movdqa OWORD PTR[rsp + temp_qcoeff_hi], xmm5 + + movdqa OWORD PTR[rsi], xmm6 ;zero qcoeff + movdqa OWORD PTR[rsi + 16], xmm6 ;zero qcoeff + + xor rax, rax + mov rcx, -1 + + mov [rsp + eob], rcx + mov rsi, arg(9) ;zbin_boost_ptr + + mov rbx, arg(4) ;default_zig_zag + +rq_zigzag_loop: + movsxd rcx, DWORD PTR[rbx + rax*4] ;now we have rc + movsx edi, WORD PTR [rsi] ;*zbin_boost_ptr aka zbin + lea rsi, [rsi + 2] ;zbin_boost_ptr++ + + movsx edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2] + + sub edx, edi ;x - zbin + jl rq_zigzag_1 + + mov rdi, arg(2) ;qcoeff_ptr + + movsx edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2] + + cmp edx, 0 + je rq_zigzag_1 + + mov WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc] + + mov rsi, arg(9) ;zbin_boost_ptr + mov [rsp + eob], rax ;eob = i + +rq_zigzag_1: + movsxd rcx, DWORD PTR[rbx + rax*4 + 4] + movsx edi, WORD PTR [rsi] ;*zbin_boost_ptr aka zbin + lea rsi, [rsi + 2] ;zbin_boost_ptr++ + + movsx edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2] + lea rax, [rax + 1] + + sub edx, edi ;x - zbin + jl rq_zigzag_1a + + mov rdi, arg(2) ;qcoeff_ptr + + movsx edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2] + + cmp edx, 0 + je rq_zigzag_1a + + mov WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc] + + mov rsi, arg(9) ;zbin_boost_ptr + mov [rsp + eob], rax ;eob = i + +rq_zigzag_1a: + movsxd rcx, DWORD PTR[rbx + rax*4 + 4] + movsx edi, WORD PTR [rsi] ;*zbin_boost_ptr aka zbin + lea rsi, [rsi + 2] ;zbin_boost_ptr++ + + movsx edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2] + lea rax, [rax + 1] + + sub edx, edi ;x - zbin + jl rq_zigzag_1b + + mov rdi, arg(2) ;qcoeff_ptr + + movsx edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2] + + cmp edx, 0 + je rq_zigzag_1b + + mov WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc] + + mov rsi, arg(9) ;zbin_boost_ptr + mov [rsp + eob], rax ;eob = i + +rq_zigzag_1b: + movsxd rcx, DWORD PTR[rbx + rax*4 + 4] + movsx edi, WORD PTR [rsi] ;*zbin_boost_ptr aka zbin + lea rsi, [rsi + 2] ;zbin_boost_ptr++ + + movsx edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2] + lea rax, [rax + 1] + + sub edx, edi ;x - zbin + jl rq_zigzag_1c + + mov rdi, arg(2) ;qcoeff_ptr + + movsx edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2] + + cmp edx, 0 + je rq_zigzag_1c + + mov WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc] + + mov rsi, arg(9) ;zbin_boost_ptr + mov [rsp + eob], rax ;eob = i + +rq_zigzag_1c: + lea rax, [rax + 1] + + cmp rax, 16 + jl rq_zigzag_loop + + mov rdi, arg(2) ;qcoeff_ptr + mov rcx, arg(3) ;dequant_ptr + mov rsi, arg(7) ;dqcoeff_ptr + + movdqa xmm2, OWORD PTR[rdi] + movdqa xmm3, OWORD PTR[rdi + 16] + + movdqa xmm0, OWORD PTR[rcx] + movdqa xmm1, OWORD PTR[rcx + 16] + + pmullw xmm0, xmm2 + pmullw xmm1, xmm3 + + movdqa OWORD PTR[rsi], xmm0 ;store dqcoeff + movdqa OWORD PTR[rsi + 16], xmm1 ;store dqcoeff + + mov rax, [rsp + eob] + + movdqa xmm6, OWORD PTR[rsp + save_xmm6] + movdqa xmm7, OWORD PTR[rsp + save_xmm7] + + add rax, 1 + + add rsp, vp8_regularquantizeb_stack_size + pop rsp + + ; begin epilog + pop rbx + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr, +; short *qcoeff_ptr,short *dequant_ptr, +; short *scan_mask, short *round_ptr, +; short *quant_ptr, short *dqcoeff_ptr); +global sym(vp8_fast_quantize_b_impl_sse2) +sym(vp8_fast_quantize_b_impl_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + + %define save_xmm6 0 + %define save_xmm7 16 + + %define vp8_fastquantizeb_stack_size save_xmm7 + 16 + + sub rsp, vp8_fastquantizeb_stack_size + + movdqa XMMWORD PTR[rsp + save_xmm6], xmm6 + movdqa XMMWORD PTR[rsp + save_xmm7], xmm7 + + mov rdx, arg(0) ;coeff_ptr + mov rcx, arg(2) ;dequant_ptr + mov rax, arg(3) ;scan_mask + mov rdi, arg(4) ;round_ptr + mov rsi, arg(5) ;quant_ptr + + movdqa xmm0, XMMWORD PTR[rdx] + movdqa xmm4, XMMWORD PTR[rdx + 16] + + movdqa xmm6, XMMWORD PTR[rdi] ;round lo + movdqa xmm7, XMMWORD PTR[rdi + 16] ;round hi + + movdqa xmm1, xmm0 + movdqa xmm5, xmm4 + + psraw xmm0, 15 ;sign of z (aka sz) + psraw xmm4, 15 ;sign of z (aka sz) + + pxor xmm1, xmm0 + pxor xmm5, xmm4 + psubw xmm1, xmm0 ;x = abs(z) + psubw xmm5, xmm4 ;x = abs(z) + + paddw xmm1, xmm6 + paddw xmm5, xmm7 + + pmulhw xmm1, XMMWORD PTR[rsi] + pmulhw xmm5, XMMWORD PTR[rsi + 16] + + mov rdi, arg(1) ;qcoeff_ptr + mov rsi, arg(6) ;dqcoeff_ptr + + movdqa xmm6, XMMWORD PTR[rcx] + movdqa xmm7, XMMWORD PTR[rcx + 16] + + pxor xmm1, xmm0 + pxor xmm5, xmm4 + psubw xmm1, xmm0 + psubw xmm5, xmm4 + + movdqa XMMWORD PTR[rdi], xmm1 + movdqa XMMWORD PTR[rdi + 16], xmm5 + + pmullw xmm6, xmm1 + pmullw xmm7, xmm5 + + movdqa xmm2, XMMWORD PTR[rax] + movdqa xmm3, XMMWORD PTR[rax+16]; + + pxor xmm4, xmm4 ;clear all bits + pcmpeqw xmm1, xmm4 + pcmpeqw xmm5, xmm4 + + pcmpeqw xmm4, xmm4 ;set all bits + pxor xmm1, xmm4 + pxor xmm5, xmm4 + + psrlw xmm1, 15 + psrlw xmm5, 15 + + pmaddwd xmm1, xmm2 + pmaddwd xmm5, xmm3 + + movq xmm2, xmm1 + movq xmm3, xmm5 + + psrldq xmm1, 8 + psrldq xmm5, 8 + + paddd xmm1, xmm5 + paddd xmm2, xmm3 + + paddd xmm1, xmm2 + movq xmm5, xmm1 + + psrldq xmm1, 4 + paddd xmm5, xmm1 + + movq rcx, xmm5 + and rcx, 0xffff + + xor rdx, rdx + sub rdx, rcx + + bsr rax, rcx + inc rax + + sar rdx, 31 + and rax, rdx + + movdqa XMMWORD PTR[rsi], xmm6 ;store dqcoeff + movdqa XMMWORD PTR[rsi + 16], xmm7 ;store dqcoeff + + movdqa xmm6, XMMWORD PTR[rsp + save_xmm6] + movdqa xmm7, XMMWORD PTR[rsp + save_xmm7] + + add rsp, vp8_fastquantizeb_stack_size + pop rsp + + ; begin epilog + pop rbx + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret diff --git a/vp8/encoder/x86/quantize_ssse3.asm b/vp8/encoder/x86/quantize_ssse3.asm new file mode 100755 index 000000000..2f33199e5 --- /dev/null +++ b/vp8/encoder/x86/quantize_ssse3.asm @@ -0,0 +1,114 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + + +;int vp8_fast_quantize_b_impl_ssse3(short *coeff_ptr +; short *qcoeff_ptr,short *dequant_ptr, +; short *round_ptr, +; short *quant_ptr, short *dqcoeff_ptr); +; +global sym(vp8_fast_quantize_b_impl_ssse3) +sym(vp8_fast_quantize_b_impl_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdx, arg(0) ;coeff_ptr + mov rdi, arg(3) ;round_ptr + mov rsi, arg(4) ;quant_ptr + + movdqa xmm0, [rdx] + movdqa xmm4, [rdx + 16] + + movdqa xmm2, [rdi] ;round lo + movdqa xmm3, [rdi + 16] ;round hi + + movdqa xmm1, xmm0 + movdqa xmm5, xmm4 + + psraw xmm0, 15 ;sign of z (aka sz) + psraw xmm4, 15 ;sign of z (aka sz) + + pabsw xmm1, xmm1 + pabsw xmm5, xmm5 + + paddw xmm1, xmm2 + paddw xmm5, xmm3 + + pmulhw xmm1, [rsi] + pmulhw xmm5, [rsi + 16] + + mov rdi, arg(1) ;qcoeff_ptr + mov rcx, arg(2) ;dequant_ptr + mov rsi, arg(5) ;dqcoeff_ptr + + pxor xmm1, xmm0 + pxor xmm5, xmm4 + psubw xmm1, xmm0 + psubw xmm5, xmm4 + + movdqa [rdi], xmm1 + movdqa [rdi + 16], xmm5 + + movdqa xmm2, [rcx] + movdqa xmm3, [rcx + 16] + + pxor xmm4, xmm4 + pmullw xmm2, xmm1 + pmullw xmm3, xmm5 + + pcmpeqw xmm1, xmm4 ;non zero mask + pcmpeqw xmm5, xmm4 ;non zero mask + packsswb xmm1, xmm5 + pshufb xmm1, [ GLOBAL(zz_shuf)] + + pmovmskb edx, xmm1 + +; xor ecx, ecx +; mov eax, -1 +;find_eob_loop: +; shr edx, 1 +; jc fq_skip +; mov eax, ecx +;fq_skip: +; inc ecx +; cmp ecx, 16 +; jne find_eob_loop + xor rdi, rdi + mov eax, -1 + xor dx, ax ;flip the bits for bsr + bsr eax, edx + + movdqa [rsi], xmm2 ;store dqcoeff + movdqa [rsi + 16], xmm3 ;store dqcoeff + + sub edi, edx ;check for all zeros in bit mask + sar edi, 31 ;0 or -1 + add eax, 1 + and eax, edi ;if the bit mask was all zero, + ;then eob = 0 + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +zz_shuf: + db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 diff --git a/vp8/encoder/x86/quantize_x86.h b/vp8/encoder/x86/quantize_x86.h new file mode 100644 index 000000000..b5b22c022 --- /dev/null +++ b/vp8/encoder/x86/quantize_x86.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + +#ifndef QUANTIZE_X86_H +#define QUANTIZE_X86_H + + +/* Note: + * + * This platform is commonly built for runtime CPU detection. If you modify + * any of the function mappings present in this file, be sure to also update + * them in the function pointer initialization code + */ +#if HAVE_MMX + +#endif + + +#if HAVE_SSE2 +extern prototype_quantize_block(vp8_regular_quantize_b_sse2); + +#if !CONFIG_RUNTIME_CPU_DETECT + +/* The sse2 quantizer has not been updated to match the new exact + * quantizer introduced in commit e04e2935 + *#undef vp8_quantize_quantb + *#define vp8_quantize_quantb vp8_regular_quantize_b_sse2 + */ + +#endif + +#endif + + +#endif diff --git a/vp8/encoder/x86/sad_mmx.asm b/vp8/encoder/x86/sad_mmx.asm index a825698e7..85cb023a4 100644 --- a/vp8/encoder/x86/sad_mmx.asm +++ b/vp8/encoder/x86/sad_mmx.asm @@ -1,10 +1,11 @@ ; -; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. ; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. ; @@ -16,8 +17,6 @@ global sym(vp8_sad8x8_mmx) global sym(vp8_sad4x4_mmx) global sym(vp8_sad16x8_mmx) -%idefine QWORD - ;unsigned int vp8_sad16x16_mmx( ; unsigned char *src_ptr, ; int src_stride, @@ -99,7 +98,7 @@ x16x16sad_mmx_loop: psrlq mm0, 32 paddw mm7, mm0 - movd rax, mm7 + movq rax, mm7 pop rdi pop rsi @@ -171,7 +170,7 @@ x8x16sad_mmx_loop: psrlq mm0, 32 paddw mm7, mm0 - movd rax, mm7 + movq rax, mm7 pop rdi pop rsi @@ -241,7 +240,7 @@ x8x8sad_mmx_loop: psrlq mm0, 32 paddw mm7, mm0 - movd rax, mm7 + movq rax, mm7 pop rdi pop rsi @@ -271,11 +270,11 @@ sym(vp8_sad4x4_mmx): movsxd rax, dword ptr arg(1) ;src_stride movsxd rdx, dword ptr arg(3) ;ref_stride - movd mm0, QWORD PTR [rsi] - movd mm1, QWORD PTR [rdi] + movd mm0, DWORD PTR [rsi] + movd mm1, DWORD PTR [rdi] - movd mm2, QWORD PTR [rsi+rax] - movd mm3, QWORD PTR [rdi+rdx] + movd mm2, DWORD PTR [rsi+rax] + movd mm3, DWORD PTR [rdi+rdx] punpcklbw mm0, mm2 punpcklbw mm1, mm3 @@ -297,11 +296,11 @@ sym(vp8_sad4x4_mmx): lea rsi, [rsi+rax*2] lea rdi, [rdi+rdx*2] - movd mm4, QWORD PTR [rsi] - movd mm5, QWORD PTR [rdi] + movd mm4, DWORD PTR [rsi] + movd mm5, DWORD PTR [rdi] - movd mm6, QWORD PTR [rsi+rax] - movd mm7, QWORD PTR [rdi+rdx] + movd mm6, DWORD PTR [rsi+rax] + movd mm7, DWORD PTR [rdi+rdx] punpcklbw mm4, mm6 punpcklbw mm5, mm7 @@ -330,7 +329,7 @@ sym(vp8_sad4x4_mmx): psrlq mm0, 32 paddw mm0, mm1 - movd rax, mm0 + movq rax, mm0 pop rdi pop rsi @@ -417,7 +416,7 @@ x16x8sad_mmx_loop: psrlq mm0, 32 paddw mm7, mm0 - movd rax, mm7 + movq rax, mm7 pop rdi pop rsi diff --git a/vp8/encoder/x86/sad_sse2.asm b/vp8/encoder/x86/sad_sse2.asm index 53240bbf1..39ed79604 100644 --- a/vp8/encoder/x86/sad_sse2.asm +++ b/vp8/encoder/x86/sad_sse2.asm @@ -1,17 +1,16 @@ ; -; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. ; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. ; %include "vpx_ports/x86_abi_support.asm" -%idefine QWORD - ;unsigned int vp8_sad16x16_wmt( ; unsigned char *src_ptr, ; int src_stride, @@ -74,7 +73,7 @@ x16x16sad_wmt_loop: psrldq xmm7, 8 paddw xmm0, xmm7 - movd rax, xmm0 + movq rax, xmm0 ; begin epilog pop rdi @@ -112,7 +111,7 @@ sym(vp8_sad8x16_wmt): x8x16sad_wmt_loop: - movd rax, mm7 + movq rax, mm7 cmp rax, arg(4) jg x8x16sad_wmt_early_exit @@ -134,7 +133,7 @@ x8x16sad_wmt_loop: cmp rsi, rcx jne x8x16sad_wmt_loop - movd rax, mm7 + movq rax, mm7 x8x16sad_wmt_early_exit: @@ -173,7 +172,7 @@ sym(vp8_sad8x8_wmt): x8x8sad_wmt_loop: - movd rax, mm7 + movq rax, mm7 cmp rax, arg(4) jg x8x8sad_wmt_early_exit @@ -189,7 +188,7 @@ x8x8sad_wmt_loop: cmp rsi, rcx jne x8x8sad_wmt_loop - movd rax, mm7 + movq rax, mm7 x8x8sad_wmt_early_exit: ; begin epilog @@ -220,11 +219,11 @@ sym(vp8_sad4x4_wmt): movsxd rax, dword ptr arg(1) ;src_stride movsxd rdx, dword ptr arg(3) ;ref_stride - movd mm0, QWORD PTR [rsi] - movd mm1, QWORD PTR [rdi] + movd mm0, DWORD PTR [rsi] + movd mm1, DWORD PTR [rdi] - movd mm2, QWORD PTR [rsi+rax] - movd mm3, QWORD PTR [rdi+rdx] + movd mm2, DWORD PTR [rsi+rax] + movd mm3, DWORD PTR [rdi+rdx] punpcklbw mm0, mm2 punpcklbw mm1, mm3 @@ -233,19 +232,19 @@ sym(vp8_sad4x4_wmt): lea rsi, [rsi+rax*2] lea rdi, [rdi+rdx*2] - movd mm4, QWORD PTR [rsi] + movd mm4, DWORD PTR [rsi] - movd mm5, QWORD PTR [rdi] - movd mm6, QWORD PTR [rsi+rax] + movd mm5, DWORD PTR [rdi] + movd mm6, DWORD PTR [rsi+rax] - movd mm7, QWORD PTR [rdi+rdx] + movd mm7, DWORD PTR [rdi+rdx] punpcklbw mm4, mm6 punpcklbw mm5, mm7 psadbw mm4, mm5 paddw mm0, mm4 - movd rax, mm0 + movq rax, mm0 ; begin epilog pop rdi @@ -282,7 +281,7 @@ sym(vp8_sad16x8_wmt): x16x8sad_wmt_loop: - movd rax, mm7 + movq rax, mm7 cmp rax, arg(4) jg x16x8sad_wmt_early_exit @@ -316,7 +315,7 @@ x16x8sad_wmt_loop: cmp rsi, rcx jne x16x8sad_wmt_loop - movd rax, mm7 + movq rax, mm7 x16x8sad_wmt_early_exit: diff --git a/vp8/encoder/x86/sad_sse3.asm b/vp8/encoder/x86/sad_sse3.asm index 38cc02957..1b7293c20 100644 --- a/vp8/encoder/x86/sad_sse3.asm +++ b/vp8/encoder/x86/sad_sse3.asm @@ -1,32 +1,31 @@ ; -; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. ; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. ; %include "vpx_ports/x86_abi_support.asm" -%idefine QWORD - %macro PROCESS_16X2X3 1 %if %1 - movdqa xmm0, [rsi] - lddqu xmm5, [rdi] - lddqu xmm6, [rdi+1] - lddqu xmm7, [rdi+2] + movdqa xmm0, XMMWORD PTR [rsi] + lddqu xmm5, XMMWORD PTR [rdi] + lddqu xmm6, XMMWORD PTR [rdi+1] + lddqu xmm7, XMMWORD PTR [rdi+2] psadbw xmm5, xmm0 psadbw xmm6, xmm0 psadbw xmm7, xmm0 %else - movdqa xmm0, [rsi] - lddqu xmm1, [rdi] - lddqu xmm2, [rdi+1] - lddqu xmm3, [rdi+2] + movdqa xmm0, XMMWORD PTR [rsi] + lddqu xmm1, XMMWORD PTR [rdi] + lddqu xmm2, XMMWORD PTR [rdi+1] + lddqu xmm3, XMMWORD PTR [rdi+2] psadbw xmm1, xmm0 psadbw xmm2, xmm0 @@ -36,10 +35,10 @@ paddw xmm6, xmm2 paddw xmm7, xmm3 %endif - movdqa xmm0, QWORD PTR [rsi+rax] - lddqu xmm1, QWORD PTR [rdi+rdx] - lddqu xmm2, QWORD PTR [rdi+rdx+1] - lddqu xmm3, QWORD PTR [rdi+rdx+2] + movdqa xmm0, XMMWORD PTR [rsi+rax] + lddqu xmm1, XMMWORD PTR [rdi+rdx] + lddqu xmm2, XMMWORD PTR [rdi+rdx+1] + lddqu xmm3, XMMWORD PTR [rdi+rdx+2] lea rsi, [rsi+rax*2] lea rdi, [rdi+rdx*2] @@ -55,19 +54,19 @@ %macro PROCESS_8X2X3 1 %if %1 - movq mm0, [rsi] - movq mm5, [rdi] - movq mm6, [rdi+1] - movq mm7, [rdi+2] + movq mm0, QWORD PTR [rsi] + movq mm5, QWORD PTR [rdi] + movq mm6, QWORD PTR [rdi+1] + movq mm7, QWORD PTR [rdi+2] psadbw mm5, mm0 psadbw mm6, mm0 psadbw mm7, mm0 %else - movq mm0, [rsi] - movq mm1, [rdi] - movq mm2, [rdi+1] - movq mm3, [rdi+2] + movq mm0, QWORD PTR [rsi] + movq mm1, QWORD PTR [rdi] + movq mm2, QWORD PTR [rdi+1] + movq mm3, QWORD PTR [rdi+2] psadbw mm1, mm0 psadbw mm2, mm0 @@ -104,45 +103,45 @@ %macro PROCESS_16X2X4 1 %if %1 - movdqa xmm0, [rsi] - lddqu xmm4, [rcx] - lddqu xmm5, [rdx] - lddqu xmm6, [rbx] - lddqu xmm7, [rdi] + movdqa xmm0, XMMWORD PTR [rsi] + lddqu xmm4, XMMWORD PTR [rcx] + lddqu xmm5, XMMWORD PTR [rdx] + lddqu xmm6, XMMWORD PTR [rbx] + lddqu xmm7, XMMWORD PTR [rdi] psadbw xmm4, xmm0 psadbw xmm5, xmm0 psadbw xmm6, xmm0 psadbw xmm7, xmm0 %else - movdqa xmm0, [rsi] - lddqu xmm1, [rcx] - lddqu xmm2, [rdx] - lddqu xmm3, [rbx] + movdqa xmm0, XMMWORD PTR [rsi] + lddqu xmm1, XMMWORD PTR [rcx] + lddqu xmm2, XMMWORD PTR [rdx] + lddqu xmm3, XMMWORD PTR [rbx] psadbw xmm1, xmm0 psadbw xmm2, xmm0 psadbw xmm3, xmm0 paddw xmm4, xmm1 - lddqu xmm1, [rdi] + lddqu xmm1, XMMWORD PTR [rdi] paddw xmm5, xmm2 paddw xmm6, xmm3 psadbw xmm1, xmm0 paddw xmm7, xmm1 %endif - movdqa xmm0, QWORD PTR [rsi+rax] - lddqu xmm1, QWORD PTR [rcx+rbp] - lddqu xmm2, QWORD PTR [rdx+rbp] - lddqu xmm3, QWORD PTR [rbx+rbp] + movdqa xmm0, XMMWORD PTR [rsi+rax] + lddqu xmm1, XMMWORD PTR [rcx+rbp] + lddqu xmm2, XMMWORD PTR [rdx+rbp] + lddqu xmm3, XMMWORD PTR [rbx+rbp] psadbw xmm1, xmm0 psadbw xmm2, xmm0 psadbw xmm3, xmm0 paddw xmm4, xmm1 - lddqu xmm1, QWORD PTR [rdi+rbp] + lddqu xmm1, XMMWORD PTR [rdi+rbp] paddw xmm5, xmm2 paddw xmm6, xmm3 @@ -161,28 +160,28 @@ %macro PROCESS_8X2X4 1 %if %1 - movq mm0, [rsi] - movq mm4, [rcx] - movq mm5, [rdx] - movq mm6, [rbx] - movq mm7, [rdi] + movq mm0, QWORD PTR [rsi] + movq mm4, QWORD PTR [rcx] + movq mm5, QWORD PTR [rdx] + movq mm6, QWORD PTR [rbx] + movq mm7, QWORD PTR [rdi] psadbw mm4, mm0 psadbw mm5, mm0 psadbw mm6, mm0 psadbw mm7, mm0 %else - movq mm0, [rsi] - movq mm1, [rcx] - movq mm2, [rdx] - movq mm3, [rbx] + movq mm0, QWORD PTR [rsi] + movq mm1, QWORD PTR [rcx] + movq mm2, QWORD PTR [rdx] + movq mm3, QWORD PTR [rbx] psadbw mm1, mm0 psadbw mm2, mm0 psadbw mm3, mm0 paddw mm4, mm1 - movq mm1, [rdi] + movq mm1, QWORD PTR [rdi] paddw mm5, mm2 paddw mm6, mm3 @@ -429,20 +428,20 @@ sym(vp8_sad4x4x3_sse3): movsxd rax, dword ptr arg(1) ;src_stride movsxd rdx, dword ptr arg(3) ;ref_stride - movd mm0, QWORD PTR [rsi] - movd mm1, QWORD PTR [rdi] + movd mm0, DWORD PTR [rsi] + movd mm1, DWORD PTR [rdi] - movd mm2, QWORD PTR [rsi+rax] - movd mm3, QWORD PTR [rdi+rdx] + movd mm2, DWORD PTR [rsi+rax] + movd mm3, DWORD PTR [rdi+rdx] punpcklbw mm0, mm2 punpcklbw mm1, mm3 - movd mm4, QWORD PTR [rdi+1] - movd mm5, QWORD PTR [rdi+2] + movd mm4, DWORD PTR [rdi+1] + movd mm5, DWORD PTR [rdi+2] - movd mm2, QWORD PTR [rdi+rdx+1] - movd mm3, QWORD PTR [rdi+rdx+2] + movd mm2, DWORD PTR [rdi+rdx+1] + movd mm3, DWORD PTR [rdi+rdx+2] psadbw mm1, mm0 @@ -457,24 +456,24 @@ sym(vp8_sad4x4x3_sse3): lea rsi, [rsi+rax*2] lea rdi, [rdi+rdx*2] - movd mm0, QWORD PTR [rsi] - movd mm2, QWORD PTR [rdi] + movd mm0, DWORD PTR [rsi] + movd mm2, DWORD PTR [rdi] - movd mm3, QWORD PTR [rsi+rax] - movd mm6, QWORD PTR [rdi+rdx] + movd mm3, DWORD PTR [rsi+rax] + movd mm6, DWORD PTR [rdi+rdx] punpcklbw mm0, mm3 punpcklbw mm2, mm6 - movd mm3, QWORD PTR [rdi+1] - movd mm7, QWORD PTR [rdi+2] + movd mm3, DWORD PTR [rdi+1] + movd mm7, DWORD PTR [rdi+2] psadbw mm2, mm0 paddw mm1, mm2 - movd mm2, QWORD PTR [rdi+rdx+1] - movd mm6, QWORD PTR [rdi+rdx+2] + movd mm2, DWORD PTR [rdi+rdx+1] + movd mm6, DWORD PTR [rdi+rdx+2] punpcklbw mm3, mm2 punpcklbw mm7, mm6 @@ -529,7 +528,7 @@ sym(vp8_sad16x16_sse3): vp8_sad16x16_sse3_loop: - movd rax, mm7 + movq rax, mm7 cmp rax, arg(4) jg vp8_sad16x16_early_exit @@ -563,7 +562,7 @@ vp8_sad16x16_sse3_loop: cmp rsi, rcx jne vp8_sad16x16_sse3_loop - movd rax, mm7 + movq rax, mm7 vp8_sad16x16_early_exit: @@ -845,23 +844,23 @@ sym(vp8_sad4x4x4d_sse3): xchg rbx, rax - movd mm0, QWORD PTR [rsi] - movd mm1, QWORD PTR [rcx] + movd mm0, DWORD PTR [rsi] + movd mm1, DWORD PTR [rcx] - movd mm2, QWORD PTR [rsi+rax] - movd mm3, QWORD PTR [rcx+rbp] + movd mm2, DWORD PTR [rsi+rax] + movd mm3, DWORD PTR [rcx+rbp] punpcklbw mm0, mm2 punpcklbw mm1, mm3 - movd mm4, QWORD PTR [rdx] - movd mm5, QWORD PTR [rbx] + movd mm4, DWORD PTR [rdx] + movd mm5, DWORD PTR [rbx] - movd mm6, QWORD PTR [rdi] - movd mm2, QWORD PTR [rdx+rbp] + movd mm6, DWORD PTR [rdi] + movd mm2, DWORD PTR [rdx+rbp] - movd mm3, QWORD PTR [rbx+rbp] - movd mm7, QWORD PTR [rdi+rbp] + movd mm3, DWORD PTR [rbx+rbp] + movd mm7, DWORD PTR [rdi+rbp] psadbw mm1, mm0 @@ -884,17 +883,17 @@ sym(vp8_sad4x4x4d_sse3): lea rdi, [rdi+rbp*2] - movd mm0, QWORD PTR [rsi] - movd mm2, QWORD PTR [rcx] + movd mm0, DWORD PTR [rsi] + movd mm2, DWORD PTR [rcx] - movd mm3, QWORD PTR [rsi+rax] - movd mm7, QWORD PTR [rcx+rbp] + movd mm3, DWORD PTR [rsi+rax] + movd mm7, DWORD PTR [rcx+rbp] punpcklbw mm0, mm3 punpcklbw mm2, mm7 - movd mm3, QWORD PTR [rdx] - movd mm7, QWORD PTR [rbx] + movd mm3, DWORD PTR [rdx] + movd mm7, DWORD PTR [rbx] psadbw mm2, mm0 mov rax, rbp @@ -905,8 +904,8 @@ sym(vp8_sad4x4x4d_sse3): paddw mm1, mm2 movd [rsi], mm1 - movd mm2, QWORD PTR [rdx+rax] - movd mm1, QWORD PTR [rbx+rax] + movd mm2, DWORD PTR [rdx+rax] + movd mm1, DWORD PTR [rbx+rax] punpcklbw mm3, mm2 punpcklbw mm7, mm1 @@ -914,8 +913,8 @@ sym(vp8_sad4x4x4d_sse3): psadbw mm3, mm0 psadbw mm7, mm0 - movd mm2, QWORD PTR [rdi] - movd mm1, QWORD PTR [rdi+rax] + movd mm2, DWORD PTR [rdi] + movd mm1, DWORD PTR [rdi+rax] paddw mm3, mm4 paddw mm7, mm5 diff --git a/vp8/encoder/x86/sad_sse4.asm b/vp8/encoder/x86/sad_sse4.asm new file mode 100644 index 000000000..21e2e5007 --- /dev/null +++ b/vp8/encoder/x86/sad_sse4.asm @@ -0,0 +1,353 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%macro PROCESS_16X2X8 1 +%if %1 + movdqa xmm0, XMMWORD PTR [rsi] + movq xmm1, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + movq xmm2, MMWORD PTR [rdi+16] + punpcklqdq xmm1, xmm3 + punpcklqdq xmm3, xmm2 + + movdqa xmm2, xmm1 + mpsadbw xmm1, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + + psrldq xmm0, 8 + + movdqa xmm4, xmm3 + mpsadbw xmm3, xmm0, 0x0 + mpsadbw xmm4, xmm0, 0x5 + + paddw xmm1, xmm2 + paddw xmm1, xmm3 + paddw xmm1, xmm4 +%else + movdqa xmm0, XMMWORD PTR [rsi] + movq xmm5, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + movq xmm2, MMWORD PTR [rdi+16] + punpcklqdq xmm5, xmm3 + punpcklqdq xmm3, xmm2 + + movdqa xmm2, xmm5 + mpsadbw xmm5, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + + psrldq xmm0, 8 + + movdqa xmm4, xmm3 + mpsadbw xmm3, xmm0, 0x0 + mpsadbw xmm4, xmm0, 0x5 + + paddw xmm5, xmm2 + paddw xmm5, xmm3 + paddw xmm5, xmm4 + + paddw xmm1, xmm5 +%endif + movdqa xmm0, XMMWORD PTR [rsi + rax] + movq xmm5, MMWORD PTR [rdi+ rdx] + movq xmm3, MMWORD PTR [rdi+ rdx+8] + movq xmm2, MMWORD PTR [rdi+ rdx+16] + punpcklqdq xmm5, xmm3 + punpcklqdq xmm3, xmm2 + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + movdqa xmm2, xmm5 + mpsadbw xmm5, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + + psrldq xmm0, 8 + movdqa xmm4, xmm3 + mpsadbw xmm3, xmm0, 0x0 + mpsadbw xmm4, xmm0, 0x5 + + paddw xmm5, xmm2 + paddw xmm5, xmm3 + paddw xmm5, xmm4 + + paddw xmm1, xmm5 +%endmacro + +%macro PROCESS_8X2X8 1 +%if %1 + movq xmm0, MMWORD PTR [rsi] + movq xmm1, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + punpcklqdq xmm1, xmm3 + + movdqa xmm2, xmm1 + mpsadbw xmm1, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + paddw xmm1, xmm2 +%else + movq xmm0, MMWORD PTR [rsi] + movq xmm5, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + punpcklqdq xmm5, xmm3 + + movdqa xmm2, xmm5 + mpsadbw xmm5, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + paddw xmm5, xmm2 + + paddw xmm1, xmm5 +%endif + movq xmm0, MMWORD PTR [rsi + rax] + movq xmm5, MMWORD PTR [rdi+ rdx] + movq xmm3, MMWORD PTR [rdi+ rdx+8] + punpcklqdq xmm5, xmm3 + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + movdqa xmm2, xmm5 + mpsadbw xmm5, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + paddw xmm5, xmm2 + + paddw xmm1, xmm5 +%endmacro + +%macro PROCESS_4X2X8 1 +%if %1 + movd xmm0, [rsi] + movq xmm1, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + punpcklqdq xmm1, xmm3 + + mpsadbw xmm1, xmm0, 0x0 +%else + movd xmm0, [rsi] + movq xmm5, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + punpcklqdq xmm5, xmm3 + + mpsadbw xmm5, xmm0, 0x0 + + paddw xmm1, xmm5 +%endif + movd xmm0, [rsi + rax] + movq xmm5, MMWORD PTR [rdi+ rdx] + movq xmm3, MMWORD PTR [rdi+ rdx+8] + punpcklqdq xmm5, xmm3 + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + mpsadbw xmm5, xmm0, 0x0 + + paddw xmm1, xmm5 +%endmacro + + +;void vp8_sad16x16x8_sse4( +; const unsigned char *src_ptr, +; int src_stride, +; const unsigned char *ref_ptr, +; int ref_stride, +; unsigned short *sad_array); +global sym(vp8_sad16x16x8_sse4) +sym(vp8_sad16x16x8_sse4): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_16X2X8 1 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + + mov rdi, arg(4) ;Results + movdqu XMMWORD PTR [rdi], xmm1 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_sad16x8x8_sse4( +; const unsigned char *src_ptr, +; int src_stride, +; const unsigned char *ref_ptr, +; int ref_stride, +; unsigned short *sad_array +;); +global sym(vp8_sad16x8x8_sse4) +sym(vp8_sad16x8x8_sse4): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_16X2X8 1 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + + mov rdi, arg(4) ;Results + movdqu XMMWORD PTR [rdi], xmm1 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_sad8x8x8_sse4( +; const unsigned char *src_ptr, +; int src_stride, +; const unsigned char *ref_ptr, +; int ref_stride, +; unsigned short *sad_array +;); +global sym(vp8_sad8x8x8_sse4) +sym(vp8_sad8x8x8_sse4): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_8X2X8 1 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + + mov rdi, arg(4) ;Results + movdqu XMMWORD PTR [rdi], xmm1 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_sad8x16x8_sse4( +; const unsigned char *src_ptr, +; int src_stride, +; const unsigned char *ref_ptr, +; int ref_stride, +; unsigned short *sad_array +;); +global sym(vp8_sad8x16x8_sse4) +sym(vp8_sad8x16x8_sse4): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_8X2X8 1 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + mov rdi, arg(4) ;Results + movdqu XMMWORD PTR [rdi], xmm1 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_sad4x4x8_c( +; const unsigned char *src_ptr, +; int src_stride, +; const unsigned char *ref_ptr, +; int ref_stride, +; unsigned short *sad_array +;); +global sym(vp8_sad4x4x8_sse4) +sym(vp8_sad4x4x8_sse4): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_4X2X8 1 + PROCESS_4X2X8 0 + + mov rdi, arg(4) ;Results + movdqu XMMWORD PTR [rdi], xmm1 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + + + diff --git a/vp8/encoder/x86/sad_ssse3.asm b/vp8/encoder/x86/sad_ssse3.asm index 1bb956121..69c5eaedc 100644 --- a/vp8/encoder/x86/sad_ssse3.asm +++ b/vp8/encoder/x86/sad_ssse3.asm @@ -1,32 +1,31 @@ ; -; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. ; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. ; %include "vpx_ports/x86_abi_support.asm" -%idefine QWORD - %macro PROCESS_16X2X3 1 %if %1 - movdqa xmm0, [rsi] - lddqu xmm5, [rdi] - lddqu xmm6, [rdi+1] - lddqu xmm7, [rdi+2] + movdqa xmm0, XMMWORD PTR [rsi] + lddqu xmm5, XMMWORD PTR [rdi] + lddqu xmm6, XMMWORD PTR [rdi+1] + lddqu xmm7, XMMWORD PTR [rdi+2] psadbw xmm5, xmm0 psadbw xmm6, xmm0 psadbw xmm7, xmm0 %else - movdqa xmm0, [rsi] - lddqu xmm1, [rdi] - lddqu xmm2, [rdi+1] - lddqu xmm3, [rdi+2] + movdqa xmm0, XMMWORD PTR [rsi] + lddqu xmm1, XMMWORD PTR [rdi] + lddqu xmm2, XMMWORD PTR [rdi+1] + lddqu xmm3, XMMWORD PTR [rdi+2] psadbw xmm1, xmm0 psadbw xmm2, xmm0 @@ -36,10 +35,10 @@ paddw xmm6, xmm2 paddw xmm7, xmm3 %endif - movdqa xmm0, QWORD PTR [rsi+rax] - lddqu xmm1, QWORD PTR [rdi+rdx] - lddqu xmm2, QWORD PTR [rdi+rdx+1] - lddqu xmm3, QWORD PTR [rdi+rdx+2] + movdqa xmm0, XMMWORD PTR [rsi+rax] + lddqu xmm1, XMMWORD PTR [rdi+rdx] + lddqu xmm2, XMMWORD PTR [rdi+rdx+1] + lddqu xmm3, XMMWORD PTR [rdi+rdx+2] lea rsi, [rsi+rax*2] lea rdi, [rdi+rdx*2] @@ -55,9 +54,9 @@ %macro PROCESS_16X2X3_OFFSET 2 %if %1 - movdqa xmm0, [rsi] - movdqa xmm4, [rdi] - movdqa xmm7, [rdi+16] + movdqa xmm0, XMMWORD PTR [rsi] + movdqa xmm4, XMMWORD PTR [rdi] + movdqa xmm7, XMMWORD PTR [rdi+16] movdqa xmm5, xmm7 palignr xmm5, xmm4, %2 @@ -71,9 +70,9 @@ psadbw xmm6, xmm0 psadbw xmm7, xmm0 %else - movdqa xmm0, [rsi] - movdqa xmm4, [rdi] - movdqa xmm3, [rdi+16] + movdqa xmm0, XMMWORD PTR [rsi] + movdqa xmm4, XMMWORD PTR [rdi] + movdqa xmm3, XMMWORD PTR [rdi+16] movdqa xmm1, xmm3 palignr xmm1, xmm4, %2 @@ -91,9 +90,9 @@ paddw xmm6, xmm2 paddw xmm7, xmm3 %endif - movdqa xmm0, QWORD PTR [rsi+rax] - movdqa xmm4, QWORD PTR [rdi+rdx] - movdqa xmm3, QWORD PTR [rdi+rdx+16] + movdqa xmm0, XMMWORD PTR [rsi+rax] + movdqa xmm4, XMMWORD PTR [rdi+rdx] + movdqa xmm3, XMMWORD PTR [rdi+rdx+16] movdqa xmm1, xmm3 palignr xmm1, xmm4, %2 diff --git a/vp8/encoder/x86/subtract_mmx.asm b/vp8/encoder/x86/subtract_mmx.asm index ce3e61066..a47e1f0d6 100644 --- a/vp8/encoder/x86/subtract_mmx.asm +++ b/vp8/encoder/x86/subtract_mmx.asm @@ -1,20 +1,21 @@ ; -; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. ; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. ; %include "vpx_ports/x86_abi_support.asm" ;void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride, -; unsigned short *diff, unsigned char *Predictor, +; short *diff, unsigned char *Predictor, ; int pitch); global sym(vp8_subtract_b_mmx_impl) -sym(vp8_subtract_b_mmx_impl) +sym(vp8_subtract_b_mmx_impl): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 @@ -150,7 +151,7 @@ submby_loop: ;void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) global sym(vp8_subtract_mbuv_mmx) -sym(vp8_subtract_mbuv_mmx) +sym(vp8_subtract_mbuv_mmx): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 diff --git a/vp8/encoder/x86/subtract_sse2.asm b/vp8/encoder/x86/subtract_sse2.asm new file mode 100644 index 000000000..3fb23d097 --- /dev/null +++ b/vp8/encoder/x86/subtract_sse2.asm @@ -0,0 +1,356 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride, +; short *diff, unsigned char *Predictor, +; int pitch); +global sym(vp8_subtract_b_sse2_impl) +sym(vp8_subtract_b_sse2_impl): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdi, arg(2) ;diff + mov rax, arg(3) ;Predictor + mov rsi, arg(0) ;z + movsxd rdx, dword ptr arg(1);src_stride; + movsxd rcx, dword ptr arg(4);pitch + pxor mm7, mm7 + + movd mm0, [rsi] + movd mm1, [rax] + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + psubw mm0, mm1 + movq MMWORD PTR [rdi], mm0 + + movd mm0, [rsi+rdx] + movd mm1, [rax+rcx] + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + psubw mm0, mm1 + movq MMWORD PTR [rdi+rcx*2], mm0 + + movd mm0, [rsi+rdx*2] + movd mm1, [rax+rcx*2] + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + psubw mm0, mm1 + movq MMWORD PTR [rdi+rcx*4], mm0 + + lea rsi, [rsi+rdx*2] + lea rcx, [rcx+rcx*2] + + movd mm0, [rsi+rdx] + movd mm1, [rax+rcx] + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + psubw mm0, mm1 + movq MMWORD PTR [rdi+rcx*2], mm0 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, unsigned char *pred, int stride) +global sym(vp8_subtract_mby_sse2) +sym(vp8_subtract_mby_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + SAVE_XMM + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(1) ;src + mov rdi, arg(0) ;diff + + mov rax, arg(2) ;pred + movsxd rdx, dword ptr arg(3) ;stride + + mov rcx, 8 ; do two lines at one time + +submby_loop: + movdqa xmm0, XMMWORD PTR [rsi] ; src + movdqa xmm1, XMMWORD PTR [rax] ; pred + + movdqa xmm2, xmm0 + psubb xmm0, xmm1 + + pxor xmm1, [GLOBAL(t80)] ;convert to signed values + pxor xmm2, [GLOBAL(t80)] + pcmpgtb xmm1, xmm2 ; obtain sign information + + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + punpcklbw xmm0, xmm1 ; put sign back to subtraction + punpckhbw xmm2, xmm3 ; put sign back to subtraction + + movdqa XMMWORD PTR [rdi], xmm0 + movdqa XMMWORD PTR [rdi +16], xmm2 + + movdqa xmm4, XMMWORD PTR [rsi + rdx] + movdqa xmm5, XMMWORD PTR [rax + 16] + + movdqa xmm6, xmm4 + psubb xmm4, xmm5 + + pxor xmm5, [GLOBAL(t80)] ;convert to signed values + pxor xmm6, [GLOBAL(t80)] + pcmpgtb xmm5, xmm6 ; obtain sign information + + movdqa xmm6, xmm4 + movdqa xmm7, xmm5 + punpcklbw xmm4, xmm5 ; put sign back to subtraction + punpckhbw xmm6, xmm7 ; put sign back to subtraction + + movdqa XMMWORD PTR [rdi +32], xmm4 + movdqa XMMWORD PTR [rdi +48], xmm6 + + add rdi, 64 + add rax, 32 + lea rsi, [rsi+rdx*2] + + sub rcx, 1 + jnz submby_loop + + pop rdi + pop rsi + ; begin epilog + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) +global sym(vp8_subtract_mbuv_sse2) +sym(vp8_subtract_mbuv_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdi, arg(0) ;diff + mov rax, arg(3) ;pred + mov rsi, arg(1) ;z = usrc + add rdi, 256*2 ;diff = diff + 256 (shorts) + add rax, 256 ;Predictor = pred + 256 + movsxd rdx, dword ptr arg(4) ;stride; + lea rcx, [rdx + rdx*2] + + ;u + ;line 0 1 + movq xmm0, MMWORD PTR [rsi] ; src + movq xmm2, MMWORD PTR [rsi+rdx] + movdqa xmm1, XMMWORD PTR [rax] ; pred + punpcklqdq xmm0, xmm2 + + movdqa xmm2, xmm0 + psubb xmm0, xmm1 ; subtraction with sign missed + + pxor xmm1, [GLOBAL(t80)] ;convert to signed values + pxor xmm2, [GLOBAL(t80)] + pcmpgtb xmm1, xmm2 ; obtain sign information + + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + punpcklbw xmm0, xmm1 ; put sign back to subtraction + punpckhbw xmm2, xmm3 ; put sign back to subtraction + + movdqa XMMWORD PTR [rdi], xmm0 + movdqa XMMWORD PTR [rdi +16], xmm2 + + ;line 2 3 + movq xmm0, MMWORD PTR [rsi+rdx*2] ; src + movq xmm2, MMWORD PTR [rsi+rcx] + movdqa xmm1, XMMWORD PTR [rax+16] ; pred + punpcklqdq xmm0, xmm2 + + movdqa xmm2, xmm0 + psubb xmm0, xmm1 ; subtraction with sign missed + + pxor xmm1, [GLOBAL(t80)] ;convert to signed values + pxor xmm2, [GLOBAL(t80)] + pcmpgtb xmm1, xmm2 ; obtain sign information + + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + punpcklbw xmm0, xmm1 ; put sign back to subtraction + punpckhbw xmm2, xmm3 ; put sign back to subtraction + + movdqa XMMWORD PTR [rdi + 32], xmm0 + movdqa XMMWORD PTR [rdi + 48], xmm2 + + ;line 4 5 + lea rsi, [rsi + rdx*4] + + movq xmm0, MMWORD PTR [rsi] ; src + movq xmm2, MMWORD PTR [rsi+rdx] + movdqa xmm1, XMMWORD PTR [rax + 32] ; pred + punpcklqdq xmm0, xmm2 + + movdqa xmm2, xmm0 + psubb xmm0, xmm1 ; subtraction with sign missed + + pxor xmm1, [GLOBAL(t80)] ;convert to signed values + pxor xmm2, [GLOBAL(t80)] + pcmpgtb xmm1, xmm2 ; obtain sign information + + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + punpcklbw xmm0, xmm1 ; put sign back to subtraction + punpckhbw xmm2, xmm3 ; put sign back to subtraction + + movdqa XMMWORD PTR [rdi + 64], xmm0 + movdqa XMMWORD PTR [rdi + 80], xmm2 + + ;line 6 7 + movq xmm0, MMWORD PTR [rsi+rdx*2] ; src + movq xmm2, MMWORD PTR [rsi+rcx] + movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred + punpcklqdq xmm0, xmm2 + + movdqa xmm2, xmm0 + psubb xmm0, xmm1 ; subtraction with sign missed + + pxor xmm1, [GLOBAL(t80)] ;convert to signed values + pxor xmm2, [GLOBAL(t80)] + pcmpgtb xmm1, xmm2 ; obtain sign information + + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + punpcklbw xmm0, xmm1 ; put sign back to subtraction + punpckhbw xmm2, xmm3 ; put sign back to subtraction + + movdqa XMMWORD PTR [rdi + 96], xmm0 + movdqa XMMWORD PTR [rdi + 112], xmm2 + + ;v + mov rsi, arg(2) ;z = vsrc + add rdi, 64*2 ;diff = diff + 320 (shorts) + add rax, 64 ;Predictor = pred + 320 + + ;line 0 1 + movq xmm0, MMWORD PTR [rsi] ; src + movq xmm2, MMWORD PTR [rsi+rdx] + movdqa xmm1, XMMWORD PTR [rax] ; pred + punpcklqdq xmm0, xmm2 + + movdqa xmm2, xmm0 + psubb xmm0, xmm1 ; subtraction with sign missed + + pxor xmm1, [GLOBAL(t80)] ;convert to signed values + pxor xmm2, [GLOBAL(t80)] + pcmpgtb xmm1, xmm2 ; obtain sign information + + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + punpcklbw xmm0, xmm1 ; put sign back to subtraction + punpckhbw xmm2, xmm3 ; put sign back to subtraction + + movdqa XMMWORD PTR [rdi], xmm0 + movdqa XMMWORD PTR [rdi +16], xmm2 + + ;line 2 3 + movq xmm0, MMWORD PTR [rsi+rdx*2] ; src + movq xmm2, MMWORD PTR [rsi+rcx] + movdqa xmm1, XMMWORD PTR [rax+16] ; pred + punpcklqdq xmm0, xmm2 + + movdqa xmm2, xmm0 + psubb xmm0, xmm1 ; subtraction with sign missed + + pxor xmm1, [GLOBAL(t80)] ;convert to signed values + pxor xmm2, [GLOBAL(t80)] + pcmpgtb xmm1, xmm2 ; obtain sign information + + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + punpcklbw xmm0, xmm1 ; put sign back to subtraction + punpckhbw xmm2, xmm3 ; put sign back to subtraction + + movdqa XMMWORD PTR [rdi + 32], xmm0 + movdqa XMMWORD PTR [rdi + 48], xmm2 + + ;line 4 5 + lea rsi, [rsi + rdx*4] + + movq xmm0, MMWORD PTR [rsi] ; src + movq xmm2, MMWORD PTR [rsi+rdx] + movdqa xmm1, XMMWORD PTR [rax + 32] ; pred + punpcklqdq xmm0, xmm2 + + movdqa xmm2, xmm0 + psubb xmm0, xmm1 ; subtraction with sign missed + + pxor xmm1, [GLOBAL(t80)] ;convert to signed values + pxor xmm2, [GLOBAL(t80)] + pcmpgtb xmm1, xmm2 ; obtain sign information + + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + punpcklbw xmm0, xmm1 ; put sign back to subtraction + punpckhbw xmm2, xmm3 ; put sign back to subtraction + + movdqa XMMWORD PTR [rdi + 64], xmm0 + movdqa XMMWORD PTR [rdi + 80], xmm2 + + ;line 6 7 + movq xmm0, MMWORD PTR [rsi+rdx*2] ; src + movq xmm2, MMWORD PTR [rsi+rcx] + movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred + punpcklqdq xmm0, xmm2 + + movdqa xmm2, xmm0 + psubb xmm0, xmm1 ; subtraction with sign missed + + pxor xmm1, [GLOBAL(t80)] ;convert to signed values + pxor xmm2, [GLOBAL(t80)] + pcmpgtb xmm1, xmm2 ; obtain sign information + + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + punpcklbw xmm0, xmm1 ; put sign back to subtraction + punpckhbw xmm2, xmm3 ; put sign back to subtraction + + movdqa XMMWORD PTR [rdi + 96], xmm0 + movdqa XMMWORD PTR [rdi + 112], xmm2 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +t80: + times 16 db 0x80 diff --git a/vp8/encoder/x86/variance_impl_mmx.asm b/vp8/encoder/x86/variance_impl_mmx.asm index d0da82ad4..67a9b4d3e 100644 --- a/vp8/encoder/x86/variance_impl_mmx.asm +++ b/vp8/encoder/x86/variance_impl_mmx.asm @@ -1,10 +1,11 @@ ; -; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. ; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. ; @@ -497,7 +498,7 @@ sym(vp8_get4x4sse_cs_mmx): psrlq mm7, 32 paddd mm0, mm7 - movd rax, mm0 + movq rax, mm0 ; begin epilog @@ -555,7 +556,7 @@ sym(vp8_filter_block2d_bil4x4_var_mmx): pmullw mm3, [rax+8] ; paddw mm1, mm3 ; - paddw mm1, [mmx_bi_rd GLOBAL] ; + paddw mm1, [GLOBAL(mmx_bi_rd)] ; psraw mm1, mmx_filter_shift ; movq mm5, mm1 @@ -579,7 +580,7 @@ filter_block2d_bil4x4_var_mmx_loop: pmullw mm3, [rax+8] ; paddw mm1, mm3 ; - paddw mm1, [mmx_bi_rd GLOBAL] ; + paddw mm1, [GLOBAL(mmx_bi_rd)] ; psraw mm1, mmx_filter_shift ; movq mm3, mm5 ; @@ -591,7 +592,7 @@ filter_block2d_bil4x4_var_mmx_loop: paddw mm1, mm3 ; - paddw mm1, [mmx_bi_rd GLOBAL] ; + paddw mm1, [GLOBAL(mmx_bi_rd)] ; psraw mm1, mmx_filter_shift ; movd mm3, [rdi] ; @@ -709,10 +710,10 @@ sym(vp8_filter_block2d_bil_var_mmx): paddw mm1, mm3 ; paddw mm2, mm4 ; - paddw mm1, [mmx_bi_rd GLOBAL] ; + paddw mm1, [GLOBAL(mmx_bi_rd)] ; psraw mm1, mmx_filter_shift ; - paddw mm2, [mmx_bi_rd GLOBAL] ; + paddw mm2, [GLOBAL(mmx_bi_rd)] ; psraw mm2, mmx_filter_shift ; movq mm5, mm1 @@ -748,10 +749,10 @@ filter_block2d_bil_var_mmx_loop: paddw mm1, mm3 ; paddw mm2, mm4 ; - paddw mm1, [mmx_bi_rd GLOBAL] ; + paddw mm1, [GLOBAL(mmx_bi_rd)] ; psraw mm1, mmx_filter_shift ; - paddw mm2, [mmx_bi_rd GLOBAL] ; + paddw mm2, [GLOBAL(mmx_bi_rd)] ; psraw mm2, mmx_filter_shift ; movq mm3, mm5 ; @@ -772,8 +773,8 @@ filter_block2d_bil_var_mmx_loop: paddw mm1, mm3 ; paddw mm2, mm4 ; - paddw mm1, [mmx_bi_rd GLOBAL] ; - paddw mm2, [mmx_bi_rd GLOBAL] ; + paddw mm1, [GLOBAL(mmx_bi_rd)] ; + paddw mm2, [GLOBAL(mmx_bi_rd)] ; psraw mm1, mmx_filter_shift ; psraw mm2, mmx_filter_shift ; diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm index 7e5ee284b..cefa0a956 100644 --- a/vp8/encoder/x86/variance_impl_sse2.asm +++ b/vp8/encoder/x86/variance_impl_sse2.asm @@ -1,10 +1,11 @@ ; -; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. ; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. ; @@ -57,7 +58,7 @@ NEXTROW: movdqa xmm3,xmm4 psrldq xmm4,4 paddd xmm4,xmm3 - movd rax,xmm4 + movq rax,xmm4 ; begin epilog @@ -470,7 +471,7 @@ sym(vp8_get8x8var_sse2): mov rax, arg(5) ;[Sum] mov rdi, arg(4) ;[SSE] - movd rdx, xmm7 + movq rdx, xmm7 movsx rcx, dx mov dword ptr [rax], ecx @@ -531,7 +532,7 @@ sym(vp8_filter_block2d_bil_var_sse2): pmullw xmm3, [rax+16] ; paddw xmm1, xmm3 ; - paddw xmm1, [xmm_bi_rd GLOBAL] ; + paddw xmm1, [GLOBAL(xmm_bi_rd)] ; psraw xmm1, xmm_filter_shift ; movdqa xmm5, xmm1 @@ -553,7 +554,7 @@ filter_block2d_bil_var_sse2_loop: pmullw xmm3, [rax+16] ; paddw xmm1, xmm3 ; - paddw xmm1, [xmm_bi_rd GLOBAL] ; + paddw xmm1, [GLOBAL(xmm_bi_rd)] ; psraw xmm1, xmm_filter_shift ; movdqa xmm3, xmm5 ; @@ -564,7 +565,7 @@ filter_block2d_bil_var_sse2_loop: pmullw xmm1, [rdx+16] ; paddw xmm1, xmm3 ; - paddw xmm1, [xmm_bi_rd GLOBAL] ; + paddw xmm1, [GLOBAL(xmm_bi_rd)] ; psraw xmm1, xmm_filter_shift ; movq xmm3, QWORD PTR [rdi] ; diff --git a/vp8/encoder/x86/variance_mmx.c b/vp8/encoder/x86/variance_mmx.c index 4a5b25b0d..2df73a635 100644 --- a/vp8/encoder/x86/variance_mmx.c +++ b/vp8/encoder/x86/variance_mmx.c @@ -1,10 +1,11 @@ /* - * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. * - * Use of this source code is governed by a BSD-style license and patent - * grant that can be found in the LICENSE file in the root of the source - * tree. All contributing project authors may be found in the AUTHORS - * file in the root of the source tree. + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. */ @@ -14,7 +15,7 @@ extern void filter_block1d_h6_mmx ( - unsigned char *src_ptr, + const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, @@ -24,7 +25,7 @@ extern void filter_block1d_h6_mmx ); extern void filter_block1d_v6_mmx ( - short *src_ptr, + const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, @@ -36,34 +37,34 @@ extern void filter_block1d_v6_mmx extern unsigned int vp8_get_mb_ss_mmx(short *src_ptr); extern unsigned int vp8_get8x8var_mmx ( - unsigned char *src_ptr, + const unsigned char *src_ptr, int source_stride, - unsigned char *ref_ptr, + const unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum ); extern unsigned int vp8_get4x4var_mmx ( - unsigned char *src_ptr, + const unsigned char *src_ptr, int source_stride, - unsigned char *ref_ptr, + const unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum ); extern unsigned int vp8_get4x4sse_cs_mmx ( - unsigned char *src_ptr, + const unsigned char *src_ptr, int source_stride, - unsigned char *ref_ptr, + const unsigned char *ref_ptr, int recon_stride ); extern void vp8_filter_block2d_bil4x4_var_mmx ( - unsigned char *ref_ptr, + const unsigned char *ref_ptr, int ref_pixels_per_line, - unsigned char *src_ptr, + const unsigned char *src_ptr, int src_pixels_per_line, const short *HFilter, const short *VFilter, @@ -72,9 +73,9 @@ extern void vp8_filter_block2d_bil4x4_var_mmx ); extern void vp8_filter_block2d_bil_var_mmx ( - unsigned char *ref_ptr, + const unsigned char *ref_ptr, int ref_pixels_per_line, - unsigned char *src_ptr, + const unsigned char *src_ptr, int src_pixels_per_line, unsigned int Height, const short *HFilter, @@ -125,9 +126,9 @@ void vp8_test_get_mb_ss(void) unsigned int vp8_get16x16var_mmx( - unsigned char *src_ptr, + const unsigned char *src_ptr, int source_stride, - unsigned char *ref_ptr, + const unsigned char *ref_ptr, int recon_stride, unsigned *SSE, unsigned *SUM @@ -156,9 +157,9 @@ unsigned int vp8_get16x16var_mmx( unsigned int vp8_variance4x4_mmx( - unsigned char *src_ptr, + const unsigned char *src_ptr, int source_stride, - unsigned char *ref_ptr, + const unsigned char *ref_ptr, int recon_stride, unsigned int *sse) { @@ -172,9 +173,9 @@ unsigned int vp8_variance4x4_mmx( } unsigned int vp8_variance8x8_mmx( - unsigned char *src_ptr, + const unsigned char *src_ptr, int source_stride, - unsigned char *ref_ptr, + const unsigned char *ref_ptr, int recon_stride, unsigned int *sse) { @@ -189,9 +190,9 @@ unsigned int vp8_variance8x8_mmx( } unsigned int vp8_mse16x16_mmx( - unsigned char *src_ptr, + const unsigned char *src_ptr, int source_stride, - unsigned char *ref_ptr, + const unsigned char *ref_ptr, int recon_stride, unsigned int *sse) { @@ -211,9 +212,9 @@ unsigned int vp8_mse16x16_mmx( unsigned int vp8_variance16x16_mmx( - unsigned char *src_ptr, + const unsigned char *src_ptr, int source_stride, - unsigned char *ref_ptr, + const unsigned char *ref_ptr, int recon_stride, int *sse) { @@ -233,9 +234,9 @@ unsigned int vp8_variance16x16_mmx( } unsigned int vp8_variance16x8_mmx( - unsigned char *src_ptr, + const unsigned char *src_ptr, int source_stride, - unsigned char *ref_ptr, + const unsigned char *ref_ptr, int recon_stride, unsigned int *sse) { @@ -254,9 +255,9 @@ unsigned int vp8_variance16x8_mmx( unsigned int vp8_variance8x16_mmx( - unsigned char *src_ptr, + const unsigned char *src_ptr, int source_stride, - unsigned char *ref_ptr, + const unsigned char *ref_ptr, int recon_stride, unsigned int *sse) { @@ -295,11 +296,11 @@ DECLARE_ALIGNED(16, const short, vp8_vp7_bilinear_filters_mmx[8][8]) = unsigned int vp8_sub_pixel_variance4x4_mmx ( - unsigned char *src_ptr, + const unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, - unsigned char *dst_ptr, + const unsigned char *dst_ptr, int dst_pixels_per_line, unsigned int *sse) @@ -319,11 +320,11 @@ unsigned int vp8_sub_pixel_variance4x4_mmx unsigned int vp8_sub_pixel_variance8x8_mmx ( - unsigned char *src_ptr, + const unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, - unsigned char *dst_ptr, + const unsigned char *dst_ptr, int dst_pixels_per_line, unsigned int *sse ) @@ -343,11 +344,11 @@ unsigned int vp8_sub_pixel_variance8x8_mmx unsigned int vp8_sub_pixel_variance16x16_mmx ( - unsigned char *src_ptr, + const unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, - unsigned char *dst_ptr, + const unsigned char *dst_ptr, int dst_pixels_per_line, unsigned int *sse ) @@ -382,11 +383,11 @@ unsigned int vp8_sub_pixel_variance16x16_mmx } unsigned int vp8_sub_pixel_mse16x16_mmx( - unsigned char *src_ptr, + const unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, - unsigned char *dst_ptr, + const unsigned char *dst_ptr, int dst_pixels_per_line, unsigned int *sse ) @@ -397,11 +398,11 @@ unsigned int vp8_sub_pixel_mse16x16_mmx( unsigned int vp8_sub_pixel_variance16x8_mmx ( - unsigned char *src_ptr, + const unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, - unsigned char *dst_ptr, + const unsigned char *dst_ptr, int dst_pixels_per_line, unsigned int *sse ) @@ -434,11 +435,11 @@ unsigned int vp8_sub_pixel_variance16x8_mmx unsigned int vp8_sub_pixel_variance8x16_mmx ( - unsigned char *src_ptr, + const unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, - unsigned char *dst_ptr, + const unsigned char *dst_ptr, int dst_pixels_per_line, int *sse ) @@ -456,9 +457,9 @@ unsigned int vp8_sub_pixel_variance8x16_mmx } unsigned int vp8_i_variance16x16_mmx( - unsigned char *src_ptr, + const unsigned char *src_ptr, int source_stride, - unsigned char *ref_ptr, + const unsigned char *ref_ptr, int recon_stride, unsigned int *sse) { @@ -479,9 +480,9 @@ unsigned int vp8_i_variance16x16_mmx( } unsigned int vp8_i_variance8x16_mmx( - unsigned char *src_ptr, + const unsigned char *src_ptr, int source_stride, - unsigned char *ref_ptr, + const unsigned char *ref_ptr, int recon_stride, unsigned int *sse) { @@ -500,11 +501,11 @@ unsigned int vp8_i_variance8x16_mmx( unsigned int vp8_i_sub_pixel_variance16x16_mmx ( - unsigned char *src_ptr, + const unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, - unsigned char *dst_ptr, + const unsigned char *dst_ptr, int dst_pixels_per_line, unsigned int *sse ) @@ -559,11 +560,11 @@ unsigned int vp8_i_sub_pixel_variance16x16_mmx unsigned int vp8_i_sub_pixel_variance8x16_mmx ( - unsigned char *src_ptr, + const unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, - unsigned char *dst_ptr, + const unsigned char *dst_ptr, int dst_pixels_per_line, unsigned int *sse ) @@ -594,3 +595,39 @@ unsigned int vp8_i_sub_pixel_variance8x16_mmx *sse = xxsum0; return (xxsum0 - ((xsum0 * xsum0) >> 7)); } + + +unsigned int vp8_variance_halfpixvar16x16_h_mmx( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 0, + ref_ptr, recon_stride, sse); +} + + +unsigned int vp8_variance_halfpixvar16x16_v_mmx( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 4, + ref_ptr, recon_stride, sse); +} + + +unsigned int vp8_variance_halfpixvar16x16_hv_mmx( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) +{ + return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 4, + ref_ptr, recon_stride, sse); +} diff --git a/vp8/encoder/x86/variance_sse2.c b/vp8/encoder/x86/variance_sse2.c index ea80753bd..006e0a24a 100644 --- a/vp8/encoder/x86/variance_sse2.c +++ b/vp8/encoder/x86/variance_sse2.c @@ -1,10 +1,11 @@ /* - * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. * - * Use of this source code is governed by a BSD-style license and patent - * grant that can be found in the LICENSE file in the root of the source - * tree. All contributing project authors may be found in the AUTHORS - * file in the root of the source tree. + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. */ @@ -12,16 +13,16 @@ #include "pragmas.h" #include "vpx_ports/mem.h" -extern void filter_block1d_h6_mmx(unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); -extern void filter_block1d_v6_mmx(short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); -extern void filter_block1d8_h6_sse2(unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); -extern void filter_block1d8_v6_sse2(short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); +extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); +extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); +extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); +extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); extern void vp8_filter_block2d_bil4x4_var_mmx ( - unsigned char *ref_ptr, + const unsigned char *ref_ptr, int ref_pixels_per_line, - unsigned char *src_ptr, + const unsigned char *src_ptr, int src_pixels_per_line, const short *HFilter, const short *VFilter, @@ -31,9 +32,9 @@ extern void vp8_filter_block2d_bil4x4_var_mmx extern unsigned int vp8_get4x4var_mmx ( - unsigned char *src_ptr, + const unsigned char *src_ptr, int source_stride, - unsigned char *ref_ptr, + const unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum @@ -41,38 +42,38 @@ extern unsigned int vp8_get4x4var_mmx unsigned int vp8_get_mb_ss_sse2 ( - short *src_ptr + const short *src_ptr ); unsigned int vp8_get16x16var_sse2 ( - unsigned char *src_ptr, - int source_stride, - unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *SSE, + int *Sum ); unsigned int vp8_get16x16pred_error_sse2 ( - unsigned char *src_ptr, + const unsigned char *src_ptr, int src_stride, - unsigned char *ref_ptr, + const unsigned char *ref_ptr, int ref_stride ); unsigned int vp8_get8x8var_sse2 ( - unsigned char *src_ptr, - int source_stride, - unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *SSE, + int *Sum ); void vp8_filter_block2d_bil_var_sse2 ( - unsigned char *ref_ptr, + const unsigned char *ref_ptr, int ref_pixels_per_line, - unsigned char *src_ptr, + const unsigned char *src_ptr, int src_pixels_per_line, unsigned int Height, const short *HFilter, @@ -82,9 +83,9 @@ void vp8_filter_block2d_bil_var_sse2 ); void vp8_half_horiz_vert_variance16x_h_sse2 ( - unsigned char *ref_ptr, + const unsigned char *ref_ptr, int ref_pixels_per_line, - unsigned char *src_ptr, + const unsigned char *src_ptr, int src_pixels_per_line, unsigned int Height, int *sum, @@ -92,9 +93,9 @@ void vp8_half_horiz_vert_variance16x_h_sse2 ); void vp8_half_horiz_variance16x_h_sse2 ( - unsigned char *ref_ptr, + const unsigned char *ref_ptr, int ref_pixels_per_line, - unsigned char *src_ptr, + const unsigned char *src_ptr, int src_pixels_per_line, unsigned int Height, int *sum, @@ -102,9 +103,9 @@ void vp8_half_horiz_variance16x_h_sse2 ); void vp8_half_vert_variance16x_h_sse2 ( - unsigned char *ref_ptr, + const unsigned char *ref_ptr, int ref_pixels_per_line, - unsigned char *src_ptr, + const unsigned char *src_ptr, int src_pixels_per_line, unsigned int Height, int *sum, @@ -114,9 +115,9 @@ void vp8_half_vert_variance16x_h_sse2 DECLARE_ALIGNED(16, extern short, vp8_vp7_bilinear_filters_mmx[8][8]); unsigned int vp8_variance4x4_wmt( - unsigned char *src_ptr, + const unsigned char *src_ptr, int source_stride, - unsigned char *ref_ptr, + const unsigned char *ref_ptr, int recon_stride) { unsigned int var; @@ -131,9 +132,9 @@ unsigned int vp8_variance4x4_wmt( unsigned int vp8_variance8x8_wmt ( - unsigned char *src_ptr, + const unsigned char *src_ptr, int source_stride, - unsigned char *ref_ptr, + const unsigned char *ref_ptr, int recon_stride) { unsigned int var; @@ -148,9 +149,9 @@ unsigned int vp8_variance8x8_wmt unsigned int vp8_variance16x16_wmt ( - unsigned char *src_ptr, + const unsigned char *src_ptr, int source_stride, - unsigned char *ref_ptr, + const unsigned char *ref_ptr, int recon_stride, unsigned int *sse) { @@ -163,9 +164,9 @@ unsigned int vp8_variance16x16_wmt return (sse0 - ((sum0 * sum0) >> 8)); } unsigned int vp8_mse16x16_wmt( - unsigned char *src_ptr, + const unsigned char *src_ptr, int source_stride, - unsigned char *ref_ptr, + const unsigned char *ref_ptr, int recon_stride, unsigned int *sse) { @@ -181,9 +182,9 @@ unsigned int vp8_mse16x16_wmt( unsigned int vp8_variance16x8_wmt ( - unsigned char *src_ptr, + const unsigned char *src_ptr, int source_stride, - unsigned char *ref_ptr, + const unsigned char *ref_ptr, int recon_stride, unsigned int *sse) { @@ -202,9 +203,9 @@ unsigned int vp8_variance16x8_wmt unsigned int vp8_variance8x16_wmt ( - unsigned char *src_ptr, + const unsigned char *src_ptr, int source_stride, - unsigned char *ref_ptr, + const unsigned char *ref_ptr, int recon_stride, unsigned int *sse) { @@ -238,11 +239,11 @@ DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_xmm[8][16]) = }; unsigned int vp8_sub_pixel_variance4x4_wmt ( - unsigned char *src_ptr, + const unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, - unsigned char *dst_ptr, + const unsigned char *dst_ptr, int dst_pixels_per_line, unsigned int *sse ) @@ -262,11 +263,11 @@ unsigned int vp8_sub_pixel_variance4x4_wmt unsigned int vp8_sub_pixel_variance8x8_wmt ( - unsigned char *src_ptr, + const unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, - unsigned char *dst_ptr, + const unsigned char *dst_ptr, int dst_pixels_per_line, unsigned int *sse ) @@ -287,11 +288,11 @@ unsigned int vp8_sub_pixel_variance8x8_wmt unsigned int vp8_sub_pixel_variance16x16_wmt ( - unsigned char *src_ptr, + const unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, - unsigned char *dst_ptr, + const unsigned char *dst_ptr, int dst_pixels_per_line, unsigned int *sse ) @@ -363,11 +364,11 @@ unsigned int vp8_sub_pixel_variance16x16_wmt } unsigned int vp8_sub_pixel_mse16x16_wmt( - unsigned char *src_ptr, + const unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, - unsigned char *dst_ptr, + const unsigned char *dst_ptr, int dst_pixels_per_line, unsigned int *sse ) @@ -378,11 +379,11 @@ unsigned int vp8_sub_pixel_mse16x16_wmt( unsigned int vp8_sub_pixel_variance16x8_wmt ( - unsigned char *src_ptr, + const unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, - unsigned char *dst_ptr, + const unsigned char *dst_ptr, int dst_pixels_per_line, unsigned int *sse @@ -416,11 +417,11 @@ unsigned int vp8_sub_pixel_variance16x8_wmt unsigned int vp8_sub_pixel_variance8x16_wmt ( - unsigned char *src_ptr, + const unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, - unsigned char *dst_ptr, + const unsigned char *dst_ptr, int dst_pixels_per_line, unsigned int *sse ) @@ -439,9 +440,9 @@ unsigned int vp8_sub_pixel_variance8x16_wmt } unsigned int vp8_i_variance16x16_wmt( - unsigned char *src_ptr, + const unsigned char *src_ptr, int source_stride, - unsigned char *ref_ptr, + const unsigned char *ref_ptr, int recon_stride, unsigned int *sse) { @@ -463,9 +464,9 @@ unsigned int vp8_i_variance16x16_wmt( } unsigned int vp8_i_variance8x16_wmt( - unsigned char *src_ptr, + const unsigned char *src_ptr, int source_stride, - unsigned char *ref_ptr, + const unsigned char *ref_ptr, int recon_stride, unsigned int *sse) { @@ -485,11 +486,11 @@ unsigned int vp8_i_variance8x16_wmt( unsigned int vp8_i_sub_pixel_variance16x16_wmt ( - unsigned char *src_ptr, + const unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, - unsigned char *dst_ptr, + const unsigned char *dst_ptr, int dst_pixels_per_line, unsigned int *sse ) @@ -500,11 +501,11 @@ unsigned int vp8_i_sub_pixel_variance16x16_wmt unsigned int vp8_i_sub_pixel_variance8x16_wmt ( - unsigned char *src_ptr, + const unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, - unsigned char *dst_ptr, + const unsigned char *dst_ptr, int dst_pixels_per_line, unsigned int *sse ) @@ -512,3 +513,84 @@ unsigned int vp8_i_sub_pixel_variance8x16_wmt return vp8_sub_pixel_variance8x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse); } + + +unsigned int vp8_variance_halfpixvar16x16_h_wmt( + const unsigned char *src_ptr, + int src_pixels_per_line, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse) +{ + int xsum0, xsum1; + unsigned int xxsum0, xxsum1; + + vp8_half_horiz_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + + vp8_half_horiz_variance16x_h_sse2( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 16, + &xsum1, &xxsum1); + + xsum0 += xsum1; + xxsum0 += xxsum1; + *sse = xxsum0; + return (xxsum0 - ((xsum0 * xsum0) >> 8)); +} + + +unsigned int vp8_variance_halfpixvar16x16_v_wmt( + const unsigned char *src_ptr, + int src_pixels_per_line, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse) +{ + int xsum0, xsum1; + unsigned int xxsum0, xxsum1; + + vp8_half_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + + vp8_half_vert_variance16x_h_sse2( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 16, + &xsum1, &xxsum1); + + xsum0 += xsum1; + xxsum0 += xxsum1; + *sse = xxsum0; + return (xxsum0 - ((xsum0 * xsum0) >> 8)); +} + + +unsigned int vp8_variance_halfpixvar16x16_hv_wmt( + const unsigned char *src_ptr, + int src_pixels_per_line, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse) +{ + int xsum0, xsum1; + unsigned int xxsum0, xxsum1; + + vp8_half_horiz_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum0, &xxsum0); + + vp8_half_horiz_vert_variance16x_h_sse2( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 16, + &xsum1, &xxsum1); + + xsum0 += xsum1; + xxsum0 += xxsum1; + *sse = xxsum0; + return (xxsum0 - ((xsum0 * xsum0) >> 8)); +} diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h index 35fc90c48..6bea15ebc 100644 --- a/vp8/encoder/x86/variance_x86.h +++ b/vp8/encoder/x86/variance_x86.h @@ -1,10 +1,11 @@ /* - * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. * - * Use of this source code is governed by a BSD-style license and patent - * grant that can be found in the LICENSE file in the root of the source - * tree. All contributing project authors may be found in the AUTHORS - * file in the root of the source tree. + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. */ @@ -34,6 +35,9 @@ extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_mmx); extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_mmx); extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_mmx); extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_mmx); +extern prototype_variance(vp8_variance_halfpixvar16x16_h_mmx); +extern prototype_variance(vp8_variance_halfpixvar16x16_v_mmx); +extern prototype_variance(vp8_variance_halfpixvar16x16_hv_mmx); extern prototype_subpixvariance(vp8_sub_pixel_mse16x16_mmx); extern prototype_getmbss(vp8_get_mb_ss_mmx); extern prototype_variance(vp8_mse16x16_mmx); @@ -88,6 +92,15 @@ extern prototype_sad(vp8_get4x4sse_cs_mmx); #undef vp8_variance_subpixvar16x16 #define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_mmx +#undef vp8_variance_halfpixvar16x16_h +#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_mmx + +#undef vp8_variance_halfpixvar16x16_v +#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_mmx + +#undef vp8_variance_halfpixvar16x16_hv +#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_mmx + #undef vp8_variance_subpixmse16x16 #define vp8_variance_subpixmse16x16 vp8_sub_pixel_mse16x16_mmx @@ -129,6 +142,9 @@ extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_wmt); extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_wmt); extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_wmt); extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_wmt); +extern prototype_variance(vp8_variance_halfpixvar16x16_h_wmt); +extern prototype_variance(vp8_variance_halfpixvar16x16_v_wmt); +extern prototype_variance(vp8_variance_halfpixvar16x16_hv_wmt); extern prototype_subpixvariance(vp8_sub_pixel_mse16x16_wmt); extern prototype_getmbss(vp8_get_mb_ss_sse2); extern prototype_variance(vp8_mse16x16_wmt); @@ -182,6 +198,15 @@ extern prototype_variance2(vp8_get16x16var_sse2); #undef vp8_variance_subpixvar16x16 #define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_wmt +#undef vp8_variance_halfpixvar16x16_h +#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_wmt + +#undef vp8_variance_halfpixvar16x16_v +#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_wmt + +#undef vp8_variance_halfpixvar16x16_hv +#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_wmt + #undef vp8_variance_subpixmse16x16 #define vp8_variance_subpixmse16x16 vp8_sub_pixel_mse16x16_wmt @@ -240,7 +265,7 @@ extern prototype_sad_multi_dif_address(vp8_sad4x4x4d_sse3); #define vp8_variance_sad4x4x3 vp8_sad4x4x3_sse3 #undef vp8_variance_sad16x16x4d -#define vp8_variance_sad16x16x4 vp8_sad16x16x4d_sse3 +#define vp8_variance_sad16x16x4d vp8_sad16x16x4d_sse3 #undef vp8_variance_sad16x8x4d #define vp8_variance_sad16x8x4d vp8_sad16x8x4d_sse3 @@ -272,4 +297,31 @@ extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3); #endif #endif + +#if HAVE_SSE4_1 +extern prototype_sad_multi_same_address_1(vp8_sad16x16x8_sse4); +extern prototype_sad_multi_same_address_1(vp8_sad16x8x8_sse4); +extern prototype_sad_multi_same_address_1(vp8_sad8x16x8_sse4); +extern prototype_sad_multi_same_address_1(vp8_sad8x8x8_sse4); +extern prototype_sad_multi_same_address_1(vp8_sad4x4x8_sse4); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_variance_sad16x16x8 +#define vp8_variance_sad16x16x8 vp8_sad16x16x8_sse4 + +#undef vp8_variance_sad16x8x8 +#define vp8_variance_sad16x8x8 vp8_sad16x8x8_sse4 + +#undef vp8_variance_sad8x16x8 +#define vp8_variance_sad8x16x8 vp8_sad8x16x8_sse4 + +#undef vp8_variance_sad8x8x8 +#define vp8_variance_sad8x8x8 vp8_sad8x8x8_sse4 + +#undef vp8_variance_sad4x4x8 +#define vp8_variance_sad4x4x8 vp8_sad4x4x8_sse4 + +#endif +#endif + #endif diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c index f1391ba8c..fb1b37ccb 100644 --- a/vp8/encoder/x86/x86_csystemdependent.c +++ b/vp8/encoder/x86/x86_csystemdependent.c @@ -1,10 +1,11 @@ /* - * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. * - * Use of this source code is governed by a BSD-style license and patent - * grant that can be found in the LICENSE file in the root of the source - * tree. All contributing project authors may be found in the AUTHORS - * file in the root of the source tree. + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. */ @@ -17,15 +18,10 @@ #if HAVE_MMX void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch) { - vp8_short_fdct4x4_mmx(input, output, pitch); - vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch); + vp8_short_fdct4x4_c(input, output, pitch); + vp8_short_fdct4x4_c(input + 4, output + 16, pitch); } -void vp8_fast_fdct8x4_mmx(short *input, short *output, int pitch) -{ - vp8_fast_fdct4x4_mmx(input, output , pitch); - vp8_fast_fdct4x4_mmx(input + 4, output + 16, pitch); -} int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr, short *qcoeff_ptr, short *dequant_ptr, @@ -33,14 +29,14 @@ int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr, short *quant_ptr, short *dqcoeff_ptr); void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d) { - short *scan_mask = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr; - short *coeff_ptr = &b->coeff[0]; - short *zbin_ptr = &b->zbin[0][0]; - short *round_ptr = &b->round[0][0]; - short *quant_ptr = &b->quant[0][0]; - short *qcoeff_ptr = d->qcoeff; + short *scan_mask = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr; + short *coeff_ptr = b->coeff; + short *zbin_ptr = b->zbin; + short *round_ptr = b->round; + short *quant_ptr = b->quant; + short *qcoeff_ptr = d->qcoeff; short *dqcoeff_ptr = d->dqcoeff; - short *dequant_ptr = &d->dequant[0][0]; + short *dequant_ptr = d->dequant; d->eob = vp8_fast_quantize_b_impl_mmx( coeff_ptr, @@ -86,30 +82,28 @@ void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) #endif #if HAVE_SSE2 -void vp8_short_fdct8x4_wmt(short *input, short *output, int pitch) +void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch) { - vp8_short_fdct4x4_wmt(input, output, pitch); - vp8_short_fdct4x4_wmt(input + 4, output + 16, pitch); + vp8_short_fdct4x4_sse2(input, output, pitch); + vp8_short_fdct4x4_sse2(input + 4, output + 16, pitch); } -int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr, +int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr, short *qcoeff_ptr, short *dequant_ptr, short *scan_mask, short *round_ptr, short *quant_ptr, short *dqcoeff_ptr); -void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d) +void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d) { - short *scan_mask = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr; - short *coeff_ptr = &b->coeff[0]; - short *zbin_ptr = &b->zbin[0][0]; - short *round_ptr = &b->round[0][0]; - short *quant_ptr = &b->quant[0][0]; - short *qcoeff_ptr = d->qcoeff; + short *scan_mask = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr; + short *coeff_ptr = b->coeff; + short *round_ptr = b->round; + short *quant_ptr = b->quant; + short *qcoeff_ptr = d->qcoeff; short *dqcoeff_ptr = d->dqcoeff; - short *dequant_ptr = &d->dequant[0][0]; + short *dequant_ptr = d->dequant; - d->eob = vp8_fast_quantize_b_impl_sse( + d->eob = vp8_fast_quantize_b_impl_sse2( coeff_ptr, - zbin_ptr, qcoeff_ptr, dequant_ptr, scan_mask, @@ -120,6 +114,41 @@ void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d) ); } + +int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr, + short *qcoeff_ptr,short *dequant_ptr, + const int *default_zig_zag, short *round_ptr, + short *quant_ptr, short *dqcoeff_ptr, + unsigned short zbin_oq_value, + short *zbin_boost_ptr); + +void vp8_regular_quantize_b_sse2(BLOCK *b,BLOCKD *d) +{ + short *zbin_boost_ptr = b->zrun_zbin_boost; + short *coeff_ptr = b->coeff; + short *zbin_ptr = b->zbin; + short *round_ptr = b->round; + short *quant_ptr = b->quant; + short *qcoeff_ptr = d->qcoeff; + short *dqcoeff_ptr = d->dqcoeff; + short *dequant_ptr = d->dequant; + short zbin_oq_value = b->zbin_extra; + + d->eob = vp8_regular_quantize_b_impl_sse2( + coeff_ptr, + zbin_ptr, + qcoeff_ptr, + dequant_ptr, + vp8_default_zig_zag1d, + + round_ptr, + quant_ptr, + dqcoeff_ptr, + zbin_oq_value, + zbin_boost_ptr + ); +} + int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc); int vp8_mbblock_error_xmm(MACROBLOCK *mb, int dc) { @@ -136,8 +165,39 @@ int vp8_mbuverror_xmm(MACROBLOCK *mb) return vp8_mbuverror_xmm_impl(s_ptr, d_ptr); } +void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride, + short *diff, unsigned char *predictor, + int pitch); +void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch) +{ + unsigned char *z = *(be->base_src) + be->src; + unsigned int src_stride = be->src_stride; + short *diff = &be->src_diff[0]; + unsigned char *predictor = &bd->predictor[0]; + vp8_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch); +} + +#endif + +#if HAVE_SSSE3 +int vp8_fast_quantize_b_impl_ssse3(short *coeff_ptr, + short *qcoeff_ptr, short *dequant_ptr, + short *round_ptr, + short *quant_ptr, short *dqcoeff_ptr); +void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) +{ + d->eob = vp8_fast_quantize_b_impl_ssse3( + b->coeff, + d->qcoeff, + d->dequant, + b->round, + b->quant, + d->dqcoeff + ); +} #endif + void vp8_arch_x86_encoder_init(VP8_COMP *cpi) { #if CONFIG_RUNTIME_CPU_DETECT @@ -147,6 +207,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) int wmt_enabled = flags & HAS_SSE2; int SSE3Enabled = flags & HAS_SSE3; int SSSE3Enabled = flags & HAS_SSSE3; + int SSE4_1Enabled = flags & HAS_SSE4_1; /* Note: * @@ -157,7 +218,6 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) /* Override default functions with fastest ones for this CPU. */ #if HAVE_MMX - if (mmx_enabled) { cpi->rtcd.variance.sad16x16 = vp8_sad16x16_mmx; @@ -177,6 +237,9 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_mmx; cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_mmx; cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_mmx; + cpi->rtcd.variance.halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_mmx; + cpi->rtcd.variance.halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_mmx; + cpi->rtcd.variance.halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_mmx; cpi->rtcd.variance.subpixmse16x16 = vp8_sub_pixel_mse16x16_mmx; cpi->rtcd.variance.mse16x16 = vp8_mse16x16_mmx; @@ -186,11 +249,19 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.variance.get8x8var = vp8_get8x8var_mmx; cpi->rtcd.variance.get16x16var = vp8_get16x16var_mmx; cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_mmx; - +#if 0 // new fdct cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_mmx; cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_mmx; - cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_mmx; - cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_mmx; + cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_mmx; + cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_mmx; +#else + cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c; + cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c; + cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_c; + cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_c; + +#endif + cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c; cpi->rtcd.encodemb.berr = vp8_block_error_mmx; @@ -200,12 +271,11 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.encodemb.submby = vp8_subtract_mby_mmx; cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_mmx; - cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_mmx; + /*cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_mmx;*/ } - #endif -#if HAVE_SSE2 +#if HAVE_SSE2 if (wmt_enabled) { cpi->rtcd.variance.sad16x16 = vp8_sad16x16_wmt; @@ -225,6 +295,9 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_wmt; cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_wmt; cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_wmt; + cpi->rtcd.variance.halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_wmt; + cpi->rtcd.variance.halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_wmt; + cpi->rtcd.variance.halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_wmt; cpi->rtcd.variance.subpixmse16x16 = vp8_sub_pixel_mse16x16_wmt; cpi->rtcd.variance.mse16x16 = vp8_mse16x16_wmt; @@ -235,26 +308,26 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.variance.get16x16var = vp8_get16x16var_sse2; /* cpi->rtcd.variance.get4x4sse_cs not implemented for wmt */; -#if 0 - /* short SSE2 DCT currently disabled, does not match the MMX version */ - cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_wmt; - cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_wmt; -#endif - /* cpi->rtcd.fdct.fast4x4 not implemented for wmt */; - cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_wmt; - cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_sse2; + cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_sse2; + cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_sse2; + cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_sse2; + cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_sse2; + + cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_sse2 ; cpi->rtcd.encodemb.berr = vp8_block_error_xmm; cpi->rtcd.encodemb.mberr = vp8_mbblock_error_xmm; cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_xmm; - /* cpi->rtcd.encodemb.sub* not implemented for wmt */ + cpi->rtcd.encodemb.subb = vp8_subtract_b_sse2; + cpi->rtcd.encodemb.submby = vp8_subtract_mby_sse2; + cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_sse2; - cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse; + /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2;*/ + cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse2; } - #endif -#if HAVE_SSE3 +#if HAVE_SSE3 if (SSE3Enabled) { cpi->rtcd.variance.sad16x16 = vp8_sad16x16_sse3; @@ -272,16 +345,30 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.variance.sad4x4x4d = vp8_sad4x4x4d_sse3; cpi->rtcd.search.diamond_search = vp8_diamond_search_sadx4; } - #endif -#if HAVE_SSSE3 +#if HAVE_SSSE3 if (SSSE3Enabled) { cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_ssse3; cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_ssse3; + + cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_ssse3; + } +#endif +#if HAVE_SSE4_1 + if (SSE4_1Enabled) + { + cpi->rtcd.variance.sad16x16x8 = vp8_sad16x16x8_sse4; + cpi->rtcd.variance.sad16x8x8 = vp8_sad16x8x8_sse4; + cpi->rtcd.variance.sad8x16x8 = vp8_sad8x16x8_sse4; + cpi->rtcd.variance.sad8x8x8 = vp8_sad8x8x8_sse4; + cpi->rtcd.variance.sad4x4x8 = vp8_sad4x4x8_sse4; + cpi->rtcd.search.full_search = vp8_full_search_sadx8; + } #endif + #endif } |