diff options
Diffstat (limited to 'vp8/common/x86')
-rw-r--r-- | vp8/common/x86/mask_sse3.asm | 484 | ||||
-rw-r--r-- | vp8/common/x86/recon_wrapper_sse2.c | 2 | ||||
-rw-r--r-- | vp8/common/x86/subpixel_ssse3.asm | 20 | ||||
-rw-r--r-- | vp8/common/x86/vp8_asm_stubs.c | 43 | ||||
-rw-r--r-- | vp8/common/x86/x86_systemdependent.c | 8 |
5 files changed, 552 insertions, 5 deletions
diff --git a/vp8/common/x86/mask_sse3.asm b/vp8/common/x86/mask_sse3.asm new file mode 100644 index 000000000..0d90cfa86 --- /dev/null +++ b/vp8/common/x86/mask_sse3.asm @@ -0,0 +1,484 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;void int vp8_makemask_sse3( +; unsigned char *y, +; unsigned char *u, +; unsigned char *v, +; unsigned char *ym, +; unsigned char *uvm, +; int yp, +; int uvp, +; int ys, +; int us, +; int vs, +; int yt, +; int ut, +; int vt) +global sym(vp8_makemask_sse3) +sym(vp8_makemask_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 14 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;y + mov rdi, arg(1) ;u + mov rcx, arg(2) ;v + mov rax, arg(3) ;ym + movsxd rbx, dword arg(4) ;yp + movsxd rdx, dword arg(5) ;uvp + + pxor xmm0,xmm0 + + ;make 16 copies of the center y value + movd xmm1, arg(6) + pshufb xmm1, xmm0 + + ; make 16 copies of the center u value + movd xmm2, arg(7) + pshufb xmm2, xmm0 + + ; make 16 copies of the center v value + movd xmm3, arg(8) + pshufb xmm3, xmm0 + unpcklpd xmm2, xmm3 + + ;make 16 copies of the y tolerance + movd xmm3, arg(9) + pshufb xmm3, xmm0 + + ;make 16 copies of the u tolerance + movd xmm4, arg(10) + pshufb xmm4, xmm0 + + ;make 16 copies of the v tolerance + movd xmm5, arg(11) + pshufb xmm5, xmm0 + unpckhpd xmm4, xmm5 + + mov r8,8 + +NextPairOfRows: + + ;grab the y source values + movdqu xmm0, [rsi] + + ;compute abs difference between source and y target + movdqa xmm6, xmm1 + movdqa xmm7, xmm0 + psubusb xmm0, xmm1 + psubusb xmm6, xmm7 + por xmm0, xmm6 + + ;compute abs difference between + movdqa xmm6, xmm3 + pcmpgtb xmm6, xmm0 + + ;grab the y source values + add rsi, rbx + movdqu xmm0, [rsi] + + ;compute abs difference between source and y target + movdqa xmm11, xmm1 + movdqa xmm7, xmm0 + psubusb xmm0, xmm1 + psubusb xmm11, xmm7 + por xmm0, xmm11 + + ;compute abs difference between + movdqa xmm11, xmm3 + pcmpgtb xmm11, xmm0 + + + ;grab the u and v source values + movdqu xmm7, [rdi] + movdqu xmm8, [rcx] + unpcklpd xmm7, xmm8 + + ;compute abs difference between source and uv targets + movdqa xmm9, xmm2 + movdqa xmm10, xmm7 + psubusb xmm7, xmm2 + psubusb xmm9, xmm10 + por xmm7, xmm9 + + ;check whether the number is < tolerance + movdqa xmm0, xmm4 + pcmpgtb xmm0, xmm7 + + ;double u and v masks + movdqa xmm8, xmm0 + punpckhbw xmm0, xmm0 + punpcklbw xmm8, xmm8 + + ;mask row 0 and output + pand xmm6, xmm8 + pand xmm6, xmm0 + movdqa [rax],xmm6 + + ;mask row 1 and output + pand xmm11, xmm8 + pand xmm11, xmm0 + movdqa [rax+16],xmm11 + + + ; to the next row or set of rows + add rsi, rbx + add rdi, rdx + add rcx, rdx + add rax,32 + dec r8 + jnz NextPairOfRows + + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;GROW_HORIZ (register for result, source register or mem local) +; takes source and shifts left and ors with source +; then shifts right and ors with source +%macro GROW_HORIZ 2 + movdqa %1, %2 + movdqa xmm14, %1 + movdqa xmm15, %1 + pslldq xmm14, 1 + psrldq xmm15, 1 + por %1,xmm14 + por %1,xmm15 +%endmacro +;GROW_VERT (result, center row, above row, below row) +%macro GROW_VERT 4 + movdqa %1,%2 + por %1,%3 + por %1,%4 +%endmacro + +;GROW_NEXTLINE (new line to grow, new source, line to write) +%macro GROW_NEXTLINE 3 + GROW_HORIZ %1, %2 + GROW_VERT xmm3, xmm0, xmm1, xmm2 + movdqa %3,xmm3 +%endmacro + + +;void int vp8_growmaskmb_sse3( +; unsigned char *om, +; unsigned char *nm, +global sym(vp8_growmaskmb_sse3) +sym(vp8_growmaskmb_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src + mov rdi, arg(1) ;rst + + GROW_HORIZ xmm0, [rsi] + GROW_HORIZ xmm1, [rsi+16] + GROW_HORIZ xmm2, [rsi+32] + + GROW_VERT xmm3, xmm0, xmm1, xmm2 + por xmm0,xmm1 + movdqa [rdi], xmm0 + movdqa [rdi+16],xmm3 + + GROW_NEXTLINE xmm0,[rsi+48],[rdi+32] + GROW_NEXTLINE xmm1,[rsi+64],[rdi+48] + GROW_NEXTLINE xmm2,[rsi+80],[rdi+64] + GROW_NEXTLINE xmm0,[rsi+96],[rdi+80] + GROW_NEXTLINE xmm1,[rsi+112],[rdi+96] + GROW_NEXTLINE xmm2,[rsi+128],[rdi+112] + GROW_NEXTLINE xmm0,[rsi+144],[rdi+128] + GROW_NEXTLINE xmm1,[rsi+160],[rdi+144] + GROW_NEXTLINE xmm2,[rsi+176],[rdi+160] + GROW_NEXTLINE xmm0,[rsi+192],[rdi+176] + GROW_NEXTLINE xmm1,[rsi+208],[rdi+192] + GROW_NEXTLINE xmm2,[rsi+224],[rdi+208] + GROW_NEXTLINE xmm0,[rsi+240],[rdi+224] + + por xmm0,xmm2 + movdqa [rdi+240], xmm0 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + + +;unsigned int vp8_sad16x16_masked_wmt( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; unsigned char *mask) +global sym(vp8_sad16x16_masked_wmt) +sym(vp8_sad16x16_masked_wmt): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + mov rbx, arg(4) ;mask + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + mov rcx, 16 + + pxor xmm3, xmm3 + +NextSadRow: + movdqu xmm0, [rsi] + movdqu xmm1, [rdi] + movdqu xmm2, [rbx] + pand xmm0, xmm2 + pand xmm1, xmm2 + + psadbw xmm0, xmm1 + paddw xmm3, xmm0 + + add rsi, rax + add rdi, rdx + add rbx, 16 + + dec rcx + jnz NextSadRow + + movdqa xmm4 , xmm3 + psrldq xmm4, 8 + paddw xmm3, xmm4 + movq rax, xmm3 + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vp8_sad16x16_unmasked_wmt( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; unsigned char *mask) +global sym(vp8_sad16x16_unmasked_wmt) +sym(vp8_sad16x16_unmasked_wmt): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + mov rbx, arg(4) ;mask + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + mov rcx, 16 + + pxor xmm3, xmm3 + +next_vp8_sad16x16_unmasked_wmt: + movdqu xmm0, [rsi] + movdqu xmm1, [rdi] + movdqu xmm2, [rbx] + por xmm0, xmm2 + por xmm1, xmm2 + + psadbw xmm0, xmm1 + paddw xmm3, xmm0 + + add rsi, rax + add rdi, rdx + add rbx, 16 + + dec rcx + jnz next_vp8_sad16x16_unmasked_wmt + + movdqa xmm4 , xmm3 + psrldq xmm4, 8 + paddw xmm3, xmm4 + movq rax, xmm3 + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vp8_masked_predictor_wmt( +; unsigned char *masked, +; unsigned char *unmasked, +; int src_stride, +; unsigned char *dst_ptr, +; int dst_stride, +; unsigned char *mask) +global sym(vp8_masked_predictor_wmt) +sym(vp8_masked_predictor_wmt): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + mov rsi, arg(0) ;src_ptr + mov rdi, arg(1) ;ref_ptr + + mov rbx, arg(5) ;mask + movsxd rax, dword ptr arg(2) ;src_stride + mov r11, arg(3) ; destination + movsxd rdx, dword ptr arg(4) ;dst_stride + + mov rcx, 16 + + pxor xmm3, xmm3 + +next_vp8_masked_predictor_wmt: + movdqu xmm0, [rsi] + movdqu xmm1, [rdi] + movdqu xmm2, [rbx] + + pand xmm0, xmm2 + pandn xmm2, xmm1 + por xmm0, xmm2 + movdqu [r11], xmm0 + + add r11, rdx + add rsi, rax + add rdi, rdx + add rbx, 16 + + dec rcx + jnz next_vp8_masked_predictor_wmt + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;unsigned int vp8_masked_predictor_uv_wmt( +; unsigned char *masked, +; unsigned char *unmasked, +; int src_stride, +; unsigned char *dst_ptr, +; int dst_stride, +; unsigned char *mask) +global sym(vp8_masked_predictor_uv_wmt) +sym(vp8_masked_predictor_uv_wmt): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + mov rsi, arg(0) ;src_ptr + mov rdi, arg(1) ;ref_ptr + + mov rbx, arg(5) ;mask + movsxd rax, dword ptr arg(2) ;src_stride + mov r11, arg(3) ; destination + movsxd rdx, dword ptr arg(4) ;dst_stride + + mov rcx, 8 + + pxor xmm3, xmm3 + +next_vp8_masked_predictor_uv_wmt: + movq xmm0, [rsi] + movq xmm1, [rdi] + movq xmm2, [rbx] + + pand xmm0, xmm2 + pandn xmm2, xmm1 + por xmm0, xmm2 + movq [r11], xmm0 + + add r11, rdx + add rsi, rax + add rdi, rax + add rbx, 8 + + dec rcx + jnz next_vp8_masked_predictor_uv_wmt + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vp8_uv_from_y_mask( +; unsigned char *ymask, +; unsigned char *uvmask) +global sym(vp8_uv_from_y_mask) +sym(vp8_uv_from_y_mask): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + mov rsi, arg(0) ;src_ptr + mov rdi, arg(1) ;dst_ptr + + + mov rcx, 8 + + pxor xmm3, xmm3 + +next_p8_uv_from_y_mask: + movdqu xmm0, [rsi] + pshufb xmm0, [shuf1b] ;[GLOBAL(shuf1b)] + movq [rdi],xmm0 + add rdi, 8 + add rsi,32 + + dec rcx + jnz next_p8_uv_from_y_mask + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +shuf1b: + db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0 + diff --git a/vp8/common/x86/recon_wrapper_sse2.c b/vp8/common/x86/recon_wrapper_sse2.c index fcc75a901..cb7b69c08 100644 --- a/vp8/common/x86/recon_wrapper_sse2.c +++ b/vp8/common/x86/recon_wrapper_sse2.c @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "vpx_config.h" +#include "vpx_ports/config.h" #include "vp8/common/recon.h" #include "recon_x86.h" #include "vpx_mem/vpx_mem.h" diff --git a/vp8/common/x86/subpixel_ssse3.asm b/vp8/common/x86/subpixel_ssse3.asm index 6bca82bfb..39f4f7b88 100644 --- a/vp8/common/x86/subpixel_ssse3.asm +++ b/vp8/common/x86/subpixel_ssse3.asm @@ -1495,13 +1495,33 @@ k2_k4: times 8 db 36, -11 times 8 db 12, -6 align 16 +%if CONFIG_SIXTEENTH_SUBPEL_UV vp8_bilinear_filters_ssse3: times 8 db 128, 0 + times 8 db 120, 8 times 8 db 112, 16 + times 8 db 104, 24 times 8 db 96, 32 + times 8 db 88, 40 times 8 db 80, 48 + times 8 db 72, 56 times 8 db 64, 64 + times 8 db 56, 72 times 8 db 48, 80 + times 8 db 40, 88 times 8 db 32, 96 + times 8 db 24, 104 times 8 db 16, 112 + times 8 db 8, 120 +%else +vp8_bilinear_filters_ssse3: + times 8 db 128, 0 + times 8 db 112, 16 + times 8 db 96, 32 + times 8 db 80, 48 + times 8 db 64, 64 + times 8 db 48, 80 + times 8 db 32, 96 + times 8 db 16, 112 +%endif diff --git a/vp8/common/x86/vp8_asm_stubs.c b/vp8/common/x86/vp8_asm_stubs.c index bce7bc38e..458b3f638 100644 --- a/vp8/common/x86/vp8_asm_stubs.c +++ b/vp8/common/x86/vp8_asm_stubs.c @@ -9,12 +9,19 @@ */ -#include "vpx_config.h" +#include "vpx_ports/config.h" #include "vpx_ports/mem.h" #include "vp8/common/subpixel.h" +#if CONFIG_SIXTEENTH_SUBPEL_UV +extern const short vp8_six_tap_mmx[16][6*8]; +extern const short vp8_bilinear_filters_mmx[16][2*8]; +#else extern const short vp8_six_tap_mmx[8][6*8]; extern const short vp8_bilinear_filters_mmx[8][2*8]; +#endif + +//#define ANNOUNCE_FUNCTION extern void vp8_filter_block1d_h6_mmx ( @@ -128,6 +135,9 @@ void vp8_sixtap_predict4x4_mmx int dst_pitch ) { +#ifdef ANNOUNCE_FUNCTION + printf("vp8_sixtap_predict4x4_mmx\n"); +#endif DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 16*16); /* Temp data bufffer used in filtering */ const short *HFilter, *VFilter; HFilter = vp8_six_tap_mmx[xoffset]; @@ -149,6 +159,9 @@ void vp8_sixtap_predict16x16_mmx ) { +#ifdef ANNOUNCE_FUNCTION + printf("vp8_sixtap_predict16x16_mmx\n"); +#endif DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24); /* Temp data bufffer used in filtering */ const short *HFilter, *VFilter; @@ -181,6 +194,9 @@ void vp8_sixtap_predict8x8_mmx ) { +#ifdef ANNOUNCE_FUNCTION + printf("vp8_sixtap_predict8x8_mmx\n"); +#endif DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data bufffer used in filtering */ const short *HFilter, *VFilter; @@ -206,7 +222,9 @@ void vp8_sixtap_predict8x4_mmx int dst_pitch ) { - +#ifdef ANNOUNCE_FUNCTION + printf("vp8_sixtap_predict8x4_mmx\n"); +#endif DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data bufffer used in filtering */ const short *HFilter, *VFilter; @@ -256,6 +274,9 @@ void vp8_sixtap_predict16x16_sse2 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24); /* Temp data bufffer used in filtering */ const short *HFilter, *VFilter; +#ifdef ANNOUNCE_FUNCTION + printf("vp8_sixtap_predict16x16_sse2\n"); +#endif if (xoffset) { @@ -295,6 +316,9 @@ void vp8_sixtap_predict8x8_sse2 { DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data bufffer used in filtering */ const short *HFilter, *VFilter; +#ifdef ANNOUNCE_FUNCTION + printf("vp8_sixtap_predict8x8_sse2\n"); +#endif if (xoffset) { @@ -333,6 +357,9 @@ void vp8_sixtap_predict8x4_sse2 { DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data bufffer used in filtering */ const short *HFilter, *VFilter; +#ifdef ANNOUNCE_FUNCTION + printf("vp8_sixtap_predict8x4_sse2\n"); +#endif if (xoffset) { @@ -434,6 +461,9 @@ void vp8_sixtap_predict16x16_ssse3 ) { DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 24*24); +#ifdef ANNOUNCE_FUNCTION + printf("vp8_sixtap_predict16x16_ssse3\n"); +#endif if (xoffset) { @@ -466,6 +496,9 @@ void vp8_sixtap_predict8x8_ssse3 ) { DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256); +#ifdef ANNOUNCE_FUNCTION + printf("vp8_sixtap_predict8x8_ssse3\n"); +#endif if (xoffset) { @@ -498,6 +531,9 @@ void vp8_sixtap_predict8x4_ssse3 ) { DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256); +#ifdef ANNOUNCE_FUNCTION + printf("vp8_sixtap_predict8x4_ssse3\n"); +#endif if (xoffset) { @@ -530,6 +566,9 @@ void vp8_sixtap_predict4x4_ssse3 ) { DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 4*9); +#ifdef ANNOUNCE_FUNCTION + printf("vp8_sixtap_predict4x4_ssse3\n"); +#endif if (xoffset) { diff --git a/vp8/common/x86/x86_systemdependent.c b/vp8/common/x86/x86_systemdependent.c index 33a984b79..53009502c 100644 --- a/vp8/common/x86/x86_systemdependent.c +++ b/vp8/common/x86/x86_systemdependent.c @@ -43,17 +43,17 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx) rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_mmx; rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_mmx; - - rtcd->recon.recon = vp8_recon_b_mmx; rtcd->recon.copy8x8 = vp8_copy_mem8x8_mmx; rtcd->recon.copy8x4 = vp8_copy_mem8x4_mmx; rtcd->recon.copy16x16 = vp8_copy_mem16x16_mmx; +#if CONFIG_ENHANCED_INTERP == 0 && CONFIG_HIGH_PRECISION_MV == 0 && CONFIG_SIXTEENTH_SUBPEL_UV == 0 rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_mmx; rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_mmx; rtcd->subpix.sixtap8x4 = vp8_sixtap_predict8x4_mmx; rtcd->subpix.sixtap4x4 = vp8_sixtap_predict4x4_mmx; +#endif rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_mmx; rtcd->subpix.bilinear8x8 = vp8_bilinear_predict8x8_mmx; rtcd->subpix.bilinear8x4 = vp8_bilinear_predict8x4_mmx; @@ -91,9 +91,11 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx) rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_sse2; +#if CONFIG_ENHANCED_INTERP == 0 && CONFIG_HIGH_PRECISION_MV == 0 && CONFIG_SIXTEENTH_SUBPEL_UV == 0 rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_sse2; rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_sse2; rtcd->subpix.sixtap8x4 = vp8_sixtap_predict8x4_sse2; +#endif rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_sse2; rtcd->subpix.bilinear8x8 = vp8_bilinear_predict8x8_sse2; @@ -120,12 +122,14 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx) if (flags & HAS_SSSE3) { +#if CONFIG_ENHANCED_INTERP == 0 && CONFIG_HIGH_PRECISION_MV == 0 rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_ssse3; rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_ssse3; rtcd->subpix.sixtap8x4 = vp8_sixtap_predict8x4_ssse3; rtcd->subpix.sixtap4x4 = vp8_sixtap_predict4x4_ssse3; rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_ssse3; rtcd->subpix.bilinear8x8 = vp8_bilinear_predict8x8_ssse3; +#endif rtcd->recon.build_intra_predictors_mbuv = vp8_build_intra_predictors_mbuv_ssse3; |