diff options
-rw-r--r-- | vp9/encoder/vp9_pickmode.c | 10 | ||||
-rw-r--r-- | vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm | 944 |
2 files changed, 397 insertions, 557 deletions
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index fa0fea0c1..ba6a0c6e1 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -1723,11 +1723,19 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, pd->dst.stride = this_mode_pred->stride; } } else { + // TODO(jackychen): the low-bitdepth condition causes a segfault in + // high-bitdepth builds. + // https://bugs.chromium.org/p/webm/issues/detail?id=1250 +#if CONFIG_VP9_HIGHBITDEPTH + const int large_block = bsize > BLOCK_32X32; +#else + const int large_block = bsize >= BLOCK_32X32; +#endif mi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP : filter_ref; vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); // For large partition blocks, extra testing is done. - if (cpi->oxcf.rc_mode == VPX_CBR && bsize >= BLOCK_32X32 && + if (cpi->oxcf.rc_mode == VPX_CBR && large_block && !cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) && cm->base_qindex) { model_rd_for_sb_y_large(cpi, bsize, x, xd, &this_rdc.rate, diff --git a/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm b/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm index 17da26889..d2cb8ea29 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm +++ b/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm @@ -23,7 +23,11 @@ pw_64: times 8 dw 64 ; z = signed SAT(x + y) SECTION .text -%define LOCAL_VARS_SIZE 16*6 +%if ARCH_X86_64 + %define LOCAL_VARS_SIZE 16*4 +%else + %define LOCAL_VARS_SIZE 16*6 +%endif %macro SETUP_LOCAL_VARS 0 ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 + @@ -50,11 +54,11 @@ SECTION .text mova k6k7, m3 %if ARCH_X86_64 %define krd m12 - %define tmp0 [rsp + 16*4] - %define tmp1 [rsp + 16*5] + %define tmp m13 mova krd, [GLOBAL(pw_64)] %else - %define krd [rsp + 16*4] + %define tmp [rsp + 16*4] + %define krd [rsp + 16*5] %if CONFIG_PIC=0 mova m6, [GLOBAL(pw_64)] %else @@ -67,31 +71,50 @@ SECTION .text %endif %endm -;------------------------------------------------------------------------------- -%if ARCH_X86_64 - %define LOCAL_VARS_SIZE_H4 0 -%else - %define LOCAL_VARS_SIZE_H4 16*4 -%endif +%macro HORIZx4_ROW 2 + mova %2, %1 + punpcklbw %1, %1 + punpckhbw %2, %2 + + mova m3, %2 + palignr %2, %1, 1 + palignr m3, %1, 5 + + pmaddubsw %2, k0k1k4k5 + pmaddubsw m3, k2k3k6k7 + mova m4, %2 ;k0k1 + mova m5, m3 ;k2k3 + psrldq %2, 8 ;k4k5 + psrldq m3, 8 ;k6k7 + paddsw %2, m4 + paddsw m5, m3 + paddsw %2, m5 + paddsw %2, krd + psraw %2, 7 + packuswb %2, %2 +%endm +;------------------------------------------------------------------------------- %macro SUBPIX_HFILTER4 1 -cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \ +cglobal filter_block1d4_%1, 6, 6+(ARCH_X86_64*2), 11, LOCAL_VARS_SIZE, \ src, sstride, dst, dstride, height, filter mova m4, [filterq] packsswb m4, m4 %if ARCH_X86_64 - %define k0k1k4k5 m8 - %define k2k3k6k7 m9 - %define krd m10 + %define k0k1k4k5 m8 + %define k2k3k6k7 m9 + %define krd m10 + %define orig_height r7d mova krd, [GLOBAL(pw_64)] pshuflw k0k1k4k5, m4, 0b ;k0_k1 pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5 pshuflw k2k3k6k7, m4, 01010101b ;k2_k3 pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7 %else - %define k0k1k4k5 [rsp + 16*0] - %define k2k3k6k7 [rsp + 16*1] - %define krd [rsp + 16*2] + %define k0k1k4k5 [rsp + 16*0] + %define k2k3k6k7 [rsp + 16*1] + %define krd [rsp + 16*2] + %define orig_height [rsp + 16*3] pshuflw m6, m4, 0b ;k0_k1 pshufhw m6, m6, 10101010b ;k0_k1_k4_k5 pshuflw m7, m4, 01010101b ;k2_k3 @@ -108,46 +131,61 @@ cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \ mova k2k3k6k7, m7 mova krd, m1 %endif - dec heightd - + mov orig_height, heightd + shr heightd, 1 .loop: ;Do two rows at once - movu m4, [srcq - 3] - movu m5, [srcq + sstrideq - 3] - punpckhbw m1, m4, m4 - punpcklbw m4, m4 - punpckhbw m3, m5, m5 - punpcklbw m5, m5 - palignr m0, m1, m4, 1 - pmaddubsw m0, k0k1k4k5 - palignr m1, m4, 5 + movh m0, [srcq - 3] + movh m1, [srcq + 5] + punpcklqdq m0, m1 + mova m1, m0 + movh m2, [srcq + sstrideq - 3] + movh m3, [srcq + sstrideq + 5] + punpcklqdq m2, m3 + mova m3, m2 + punpcklbw m0, m0 + punpckhbw m1, m1 + punpcklbw m2, m2 + punpckhbw m3, m3 + mova m4, m1 + palignr m4, m0, 1 + pmaddubsw m4, k0k1k4k5 + palignr m1, m0, 5 pmaddubsw m1, k2k3k6k7 - palignr m2, m3, m5, 1 - pmaddubsw m2, k0k1k4k5 - palignr m3, m5, 5 + mova m7, m3 + palignr m7, m2, 1 + pmaddubsw m7, k0k1k4k5 + palignr m3, m2, 5 pmaddubsw m3, k2k3k6k7 - punpckhqdq m4, m0, m2 - punpcklqdq m0, m2 - punpckhqdq m5, m1, m3 - punpcklqdq m1, m3 - paddsw m0, m4 - paddsw m1, m5 -%ifidn %1, h8_avg - movd m4, [dstq] - movd m5, [dstq + dstrideq] -%endif - paddsw m0, m1 - paddsw m0, krd - psraw m0, 7 - packuswb m0, m0 - psrldq m1, m0, 4 + mova m0, m4 ;k0k1 + mova m5, m1 ;k2k3 + mova m2, m7 ;k0k1 upper + psrldq m4, 8 ;k4k5 + psrldq m1, 8 ;k6k7 + paddsw m4, m0 + paddsw m5, m1 + mova m1, m3 ;k2k3 upper + psrldq m7, 8 ;k4k5 upper + psrldq m3, 8 ;k6k7 upper + paddsw m7, m2 + paddsw m4, m5 + paddsw m1, m3 + paddsw m7, m1 + paddsw m4, krd + psraw m4, 7 + packuswb m4, m4 + paddsw m7, krd + psraw m7, 7 + packuswb m7, m7 %ifidn %1, h8_avg - pavgb m0, m4 - pavgb m1, m5 + movd m0, [dstq] + pavgb m4, m0 + movd m2, [dstq + dstrideq] + pavgb m7, m2 %endif - movd [dstq], m0 - movd [dstq + dstrideq], m1 + movd [dstq], m4 + movd [dstq + dstrideq], m7 lea srcq, [srcq + sstrideq ] prefetcht0 [srcq + 4 * sstrideq - 3] @@ -155,156 +193,205 @@ cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \ lea dstq, [dstq + 2 * dstrideq ] prefetcht0 [srcq + 2 * sstrideq - 3] - sub heightd, 2 - jg .loop + dec heightd + jnz .loop ; Do last row if output_height is odd - jne .done - - movu m4, [srcq - 3] - punpckhbw m1, m4, m4 - punpcklbw m4, m4 - palignr m0, m1, m4, 1 - palignr m1, m4, 5 - pmaddubsw m0, k0k1k4k5 - pmaddubsw m1, k2k3k6k7 - psrldq m2, m0, 8 - psrldq m3, m1, 8 - paddsw m0, m2 - paddsw m1, m3 - paddsw m0, m1 - paddsw m0, krd - psraw m0, 7 - packuswb m0, m0 + mov heightd, orig_height + and heightd, 1 + je .done + + movh m0, [srcq - 3] ; load src + movh m1, [srcq + 5] + punpcklqdq m0, m1 + + HORIZx4_ROW m0, m1 %ifidn %1, h8_avg - movd m4, [dstq] - pavgb m0, m4 + movd m0, [dstq] + pavgb m1, m0 %endif - movd [dstq], m0 + movd [dstq], m1 .done - REP_RET + RET +%endm + +%macro HORIZx8_ROW 5 + mova %2, %1 + punpcklbw %1, %1 + punpckhbw %2, %2 + + mova %3, %2 + mova %4, %2 + mova %5, %2 + + palignr %2, %1, 1 + palignr %3, %1, 5 + palignr %4, %1, 9 + palignr %5, %1, 13 + + pmaddubsw %2, k0k1 + pmaddubsw %3, k2k3 + pmaddubsw %4, k4k5 + pmaddubsw %5, k6k7 + paddsw %2, %4 + paddsw %5, %3 + paddsw %2, %5 + paddsw %2, krd + psraw %2, 7 + packuswb %2, %2 + SWAP %1, %2 %endm ;------------------------------------------------------------------------------- %macro SUBPIX_HFILTER8 1 -cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ +cglobal filter_block1d8_%1, 6, 6+(ARCH_X86_64*1), 14, LOCAL_VARS_SIZE, \ src, sstride, dst, dstride, height, filter mova m4, [filterq] SETUP_LOCAL_VARS - dec heightd +%if ARCH_X86_64 + %define orig_height r7d +%else + %define orig_height heightmp +%endif + mov orig_height, heightd + shr heightd, 1 .loop: - ;Do two rows at once - movu m0, [srcq - 3] - movu m4, [srcq + sstrideq - 3] - punpckhbw m1, m0, m0 + movh m0, [srcq - 3] + movh m3, [srcq + 5] + movh m4, [srcq + sstrideq - 3] + movh m7, [srcq + sstrideq + 5] + punpcklqdq m0, m3 + mova m1, m0 punpcklbw m0, m0 - palignr m5, m1, m0, 13 + punpckhbw m1, m1 + mova m5, m1 + palignr m5, m0, 13 pmaddubsw m5, k6k7 - palignr m2, m1, m0, 5 - palignr m3, m1, m0, 9 + mova m2, m1 + mova m3, m1 palignr m1, m0, 1 pmaddubsw m1, k0k1 - punpckhbw m6, m4, m4 + punpcklqdq m4, m7 + mova m6, m4 punpcklbw m4, m4 + palignr m2, m0, 5 + punpckhbw m6, m6 + palignr m3, m0, 9 + mova m7, m6 pmaddubsw m2, k2k3 pmaddubsw m3, k4k5 - palignr m7, m6, m4, 13 - palignr m0, m6, m4, 5 + palignr m7, m4, 13 + mova m0, m6 + palignr m0, m4, 5 pmaddubsw m7, k6k7 paddsw m1, m3 paddsw m2, m5 paddsw m1, m2 -%ifidn %1, h8_avg - movh m2, [dstq] - movhps m2, [dstq + dstrideq] -%endif - palignr m5, m6, m4, 9 + mova m5, m6 palignr m6, m4, 1 pmaddubsw m0, k2k3 pmaddubsw m6, k0k1 + palignr m5, m4, 9 paddsw m1, krd pmaddubsw m5, k4k5 psraw m1, 7 paddsw m0, m7 +%ifidn %1, h8_avg + movh m7, [dstq] + movh m2, [dstq + dstrideq] +%endif + packuswb m1, m1 paddsw m6, m5 paddsw m6, m0 paddsw m6, krd psraw m6, 7 - packuswb m1, m6 + packuswb m6, m6 %ifidn %1, h8_avg - pavgb m1, m2 + pavgb m1, m7 + pavgb m6, m2 %endif - movh [dstq], m1 - movhps [dstq + dstrideq], m1 + movh [dstq], m1 + movh [dstq + dstrideq], m6 lea srcq, [srcq + sstrideq ] prefetcht0 [srcq + 4 * sstrideq - 3] lea srcq, [srcq + sstrideq ] lea dstq, [dstq + 2 * dstrideq ] prefetcht0 [srcq + 2 * sstrideq - 3] - sub heightd, 2 - jg .loop + dec heightd + jnz .loop - ; Do last row if output_height is odd - jne .done + ;Do last row if output_height is odd + mov heightd, orig_height + and heightd, 1 + je .done + + movh m0, [srcq - 3] + movh m3, [srcq + 5] + punpcklqdq m0, m3 + + HORIZx8_ROW m0, m1, m2, m3, m4 - movu m0, [srcq - 3] - punpckhbw m3, m0, m0 - punpcklbw m0, m0 - palignr m1, m3, m0, 1 - palignr m2, m3, m0, 5 - palignr m4, m3, m0, 13 - palignr m3, m0, 9 - pmaddubsw m1, k0k1 - pmaddubsw m2, k2k3 - pmaddubsw m3, k4k5 - pmaddubsw m4, k6k7 - paddsw m1, m3 - paddsw m4, m2 - paddsw m1, m4 - paddsw m1, krd - psraw m1, 7 - packuswb m1, m1 %ifidn %1, h8_avg - movh m0, [dstq] - pavgb m1, m0 + movh m1, [dstq] + pavgb m0, m1 %endif - movh [dstq], m1 + movh [dstq], m0 .done: - REP_RET + RET %endm ;------------------------------------------------------------------------------- %macro SUBPIX_HFILTER16 1 -cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ +cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*0), 14, LOCAL_VARS_SIZE, \ src, sstride, dst, dstride, height, filter mova m4, [filterq] SETUP_LOCAL_VARS - .loop: prefetcht0 [srcq + 2 * sstrideq -3] - movu m0, [srcq - 3] - movu m4, [srcq - 2] + movh m0, [srcq - 3] + movh m4, [srcq + 5] + movh m6, [srcq + 13] + punpcklqdq m0, m4 + mova m7, m0 + punpckhbw m0, m0 + mova m1, m0 + punpcklqdq m4, m6 + mova m3, m0 + punpcklbw m7, m7 + + palignr m3, m7, 13 + mova m2, m0 + pmaddubsw m3, k6k7 + palignr m0, m7, 1 pmaddubsw m0, k0k1 - pmaddubsw m4, k0k1 - movu m1, [srcq - 1] - movu m5, [srcq + 0] + palignr m1, m7, 5 pmaddubsw m1, k2k3 - pmaddubsw m5, k2k3 - movu m2, [srcq + 1] - movu m6, [srcq + 2] + palignr m2, m7, 9 pmaddubsw m2, k4k5 + paddsw m1, m3 + mova m3, m4 + punpckhbw m4, m4 + mova m5, m4 + punpcklbw m3, m3 + mova m7, m4 + palignr m5, m3, 5 + mova m6, m4 + palignr m4, m3, 1 + pmaddubsw m4, k0k1 + pmaddubsw m5, k2k3 + palignr m6, m3, 9 pmaddubsw m6, k4k5 - movu m3, [srcq + 3] - movu m7, [srcq + 4] - pmaddubsw m3, k6k7 + palignr m7, m3, 13 pmaddubsw m7, k6k7 paddsw m0, m2 - paddsw m1, m3 paddsw m0, m1 +%ifidn %1, h8_avg + mova m1, [dstq] +%endif paddsw m4, m6 paddsw m5, m7 paddsw m4, m5 @@ -312,18 +399,16 @@ cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ paddsw m4, krd psraw m0, 7 psraw m4, 7 - packuswb m0, m0 - packuswb m4, m4 - punpcklbw m0, m4 + packuswb m0, m4 %ifidn %1, h8_avg - pavgb m0, [dstq] + pavgb m0, m1 %endif lea srcq, [srcq + sstrideq] mova [dstq], m0 lea dstq, [dstq + dstrideq] dec heightd jnz .loop - REP_RET + RET %endm INIT_XMM ssse3 @@ -335,457 +420,204 @@ SUBPIX_HFILTER4 h8 SUBPIX_HFILTER4 h8_avg ;------------------------------------------------------------------------------- - -; TODO(Linfeng): Detect cpu type and choose the code with better performance. -%define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1 - %macro SUBPIX_VFILTER 2 -cglobal filter_block1d%2_%1, 6, 6, 15, LOCAL_VARS_SIZE, \ +cglobal filter_block1d%2_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \ src, sstride, dst, dstride, height, filter mova m4, [filterq] SETUP_LOCAL_VARS - -%ifidn %2, 8 - %define movx movh +%if ARCH_X86_64 + %define src1q r7 + %define sstride6q r8 + %define dst_stride dstrideq %else - %define movx movd + %define src1q filterq + %define sstride6q dstrideq + %define dst_stride dstridemp %endif + mov src1q, srcq + add src1q, sstrideq + lea sstride6q, [sstrideq + sstrideq * 4] + add sstride6q, sstrideq ;pitch * 6 - dec heightd - -%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON - -%if ARCH_X86_64 - %define src1q r7 - %define sstride6q r8 - %define dst_stride dstrideq +%ifidn %2, 8 + %define movx movh %else - %define src1q filterq - %define sstride6q dstrideq - %define dst_stride dstridemp + %define movx movd %endif - mov src1q, srcq - add src1q, sstrideq - lea sstride6q, [sstrideq + sstrideq * 4] - add sstride6q, sstrideq ;pitch * 6 - .loop: - ;Do two rows at once - movx m0, [srcq ] ;A - movx m1, [src1q ] ;B - punpcklbw m0, m1 ;A B - movx m2, [srcq + sstrideq * 2 ] ;C - pmaddubsw m0, k0k1 - mova m6, m2 - movx m3, [src1q + sstrideq * 2] ;D - punpcklbw m2, m3 ;C D - pmaddubsw m2, k2k3 - movx m4, [srcq + sstrideq * 4 ] ;E - mova m7, m4 - movx m5, [src1q + sstrideq * 4] ;F - punpcklbw m4, m5 ;E F - pmaddubsw m4, k4k5 - punpcklbw m1, m6 ;A B next iter - movx m6, [srcq + sstride6q ] ;G - punpcklbw m5, m6 ;E F next iter - punpcklbw m3, m7 ;C D next iter - pmaddubsw m5, k4k5 - movx m7, [src1q + sstride6q ] ;H - punpcklbw m6, m7 ;G H - pmaddubsw m6, k6k7 - pmaddubsw m3, k2k3 - pmaddubsw m1, k0k1 - paddsw m0, m4 - paddsw m2, m6 - movx m6, [srcq + sstrideq * 8 ] ;H next iter - punpcklbw m7, m6 - pmaddubsw m7, k6k7 - paddsw m0, m2 - paddsw m0, krd - psraw m0, 7 - paddsw m1, m5 - packuswb m0, m0 - - paddsw m3, m7 - paddsw m1, m3 - paddsw m1, krd - psraw m1, 7 - lea srcq, [srcq + sstrideq * 2 ] - lea src1q, [src1q + sstrideq * 2] - packuswb m1, m1 + movx m0, [srcq ] ;A + movx m1, [srcq + sstrideq ] ;B + punpcklbw m0, m1 ;A B + movx m2, [srcq + sstrideq * 2 ] ;C + pmaddubsw m0, k0k1 + mova m6, m2 + movx m3, [src1q + sstrideq * 2] ;D + punpcklbw m2, m3 ;C D + pmaddubsw m2, k2k3 + movx m4, [srcq + sstrideq * 4 ] ;E + mova m7, m4 + movx m5, [src1q + sstrideq * 4] ;F + punpcklbw m4, m5 ;E F + pmaddubsw m4, k4k5 + punpcklbw m1, m6 ;A B next iter + movx m6, [srcq + sstride6q ] ;G + punpcklbw m5, m6 ;E F next iter + punpcklbw m3, m7 ;C D next iter + pmaddubsw m5, k4k5 + movx m7, [src1q + sstride6q ] ;H + punpcklbw m6, m7 ;G H + pmaddubsw m6, k6k7 + pmaddubsw m3, k2k3 + pmaddubsw m1, k0k1 + paddsw m0, m4 + paddsw m2, m6 + movx m6, [srcq + sstrideq * 8 ] ;H next iter + punpcklbw m7, m6 + pmaddubsw m7, k6k7 + paddsw m0, m2 + paddsw m0, krd + psraw m0, 7 + paddsw m1, m5 + packuswb m0, m0 + + paddsw m3, m7 + paddsw m1, m3 + paddsw m1, krd + psraw m1, 7 + lea srcq, [srcq + sstrideq * 2 ] + lea src1q, [src1q + sstrideq * 2] + packuswb m1, m1 %ifidn %1, v8_avg - movx m2, [dstq] - pavgb m0, m2 + movx m2, [dstq] + pavgb m0, m2 %endif - movx [dstq], m0 - add dstq, dst_stride + movx [dstq], m0 + add dstq, dst_stride %ifidn %1, v8_avg - movx m3, [dstq] - pavgb m1, m3 -%endif - movx [dstq], m1 - add dstq, dst_stride - sub heightd, 2 - jg .loop - - ; Do last row if output_height is odd - jne .done - - movx m0, [srcq ] ;A - movx m1, [srcq + sstrideq ] ;B - movx m6, [srcq + sstride6q ] ;G - punpcklbw m0, m1 ;A B - movx m7, [src1q + sstride6q ] ;H - pmaddubsw m0, k0k1 - movx m2, [srcq + sstrideq * 2 ] ;C - punpcklbw m6, m7 ;G H - movx m3, [src1q + sstrideq * 2] ;D - pmaddubsw m6, k6k7 - movx m4, [srcq + sstrideq * 4 ] ;E - punpcklbw m2, m3 ;C D - movx m5, [src1q + sstrideq * 4] ;F - punpcklbw m4, m5 ;E F - pmaddubsw m2, k2k3 - pmaddubsw m4, k4k5 - paddsw m2, m6 - paddsw m0, m4 - paddsw m0, m2 - paddsw m0, krd - psraw m0, 7 - packuswb m0, m0 -%ifidn %1, v8_avg - movx m1, [dstq] - pavgb m0, m1 -%endif - movx [dstq], m0 - -%else - ; ARCH_X86_64 - - movx m0, [srcq ] ;A - movx m1, [srcq + sstrideq ] ;B - lea srcq, [srcq + sstrideq * 2 ] - movx m2, [srcq] ;C - movx m3, [srcq + sstrideq] ;D - lea srcq, [srcq + sstrideq * 2 ] - movx m4, [srcq] ;E - movx m5, [srcq + sstrideq] ;F - lea srcq, [srcq + sstrideq * 2 ] - movx m6, [srcq] ;G - punpcklbw m0, m1 ;A B - punpcklbw m1, m2 ;A B next iter - punpcklbw m2, m3 ;C D - punpcklbw m3, m4 ;C D next iter - punpcklbw m4, m5 ;E F - punpcklbw m5, m6 ;E F next iter - -.loop: - ;Do two rows at once - movx m7, [srcq + sstrideq] ;H - lea srcq, [srcq + sstrideq * 2 ] - movx m14, [srcq] ;H next iter - punpcklbw m6, m7 ;G H - punpcklbw m7, m14 ;G H next iter - pmaddubsw m8, m0, k0k1 - pmaddubsw m9, m1, k0k1 - mova m0, m2 - mova m1, m3 - pmaddubsw m10, m2, k2k3 - pmaddubsw m11, m3, k2k3 - mova m2, m4 - mova m3, m5 - pmaddubsw m4, k4k5 - pmaddubsw m5, k4k5 - paddsw m8, m4 - paddsw m9, m5 - mova m4, m6 - mova m5, m7 - pmaddubsw m6, k6k7 - pmaddubsw m7, k6k7 - paddsw m10, m6 - paddsw m11, m7 - paddsw m8, m10 - paddsw m9, m11 - mova m6, m14 - paddsw m8, krd - paddsw m9, krd - psraw m8, 7 - psraw m9, 7 -%ifidn %2, 4 - packuswb m8, m8 - packuswb m9, m9 -%else - packuswb m8, m9 + movx m3, [dstq] + pavgb m1, m3 %endif - + movx [dstq], m1 + add dstq, dst_stride + sub heightd, 2 + cmp heightd, 1 + jg .loop + + cmp heightd, 0 + je .done + + movx m0, [srcq ] ;A + movx m1, [srcq + sstrideq ] ;B + movx m6, [srcq + sstride6q ] ;G + punpcklbw m0, m1 ;A B + movx m7, [src1q + sstride6q ] ;H + pmaddubsw m0, k0k1 + movx m2, [srcq + sstrideq * 2 ] ;C + punpcklbw m6, m7 ;G H + movx m3, [src1q + sstrideq * 2] ;D + pmaddubsw m6, k6k7 + movx m4, [srcq + sstrideq * 4 ] ;E + punpcklbw m2, m3 ;C D + movx m5, [src1q + sstrideq * 4] ;F + punpcklbw m4, m5 ;E F + pmaddubsw m2, k2k3 + pmaddubsw m4, k4k5 + paddsw m2, m6 + paddsw m0, m4 + paddsw m0, m2 + paddsw m0, krd + psraw m0, 7 + packuswb m0, m0 %ifidn %1, v8_avg - movx m7, [dstq] -%ifidn %2, 4 - movx m10, [dstq + dstrideq] - pavgb m9, m10 -%else - movhpd m7, [dstq + dstrideq] + movx m1, [dstq] + pavgb m0, m1 %endif - pavgb m8, m7 -%endif - movx [dstq], m8 -%ifidn %2, 4 - movx [dstq + dstrideq], m9 -%else - movhpd [dstq + dstrideq], m8 -%endif - - lea dstq, [dstq + dstrideq * 2 ] - sub heightd, 2 - jg .loop - - ; Do last row if output_height is odd - jne .done - - movx m7, [srcq + sstrideq] ;H - punpcklbw m6, m7 ;G H - pmaddubsw m0, k0k1 - pmaddubsw m2, k2k3 - pmaddubsw m4, k4k5 - pmaddubsw m6, k6k7 - paddsw m0, m4 - paddsw m2, m6 - paddsw m0, m2 - paddsw m0, krd - psraw m0, 7 - packuswb m0, m0 -%ifidn %1, v8_avg - movx m1, [dstq] - pavgb m0, m1 -%endif - movx [dstq], m0 - -%endif ; ARCH_X86_64 - + movx [dstq], m0 .done: - REP_RET - + RET %endm ;------------------------------------------------------------------------------- %macro SUBPIX_VFILTER16 1 -cglobal filter_block1d16_%1, 6, 6, 16, LOCAL_VARS_SIZE, \ +cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \ src, sstride, dst, dstride, height, filter - mova m4, [filterq] + mova m4, [filterq] SETUP_LOCAL_VARS - -%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON - %if ARCH_X86_64 - %define src1q r7 - %define sstride6q r8 - %define dst_stride dstrideq + %define src1q r7 + %define sstride6q r8 + %define dst_stride dstrideq %else - %define src1q filterq - %define sstride6q dstrideq - %define dst_stride dstridemp + %define src1q filterq + %define sstride6q dstrideq + %define dst_stride dstridemp %endif - lea src1q, [srcq + sstrideq] - lea sstride6q, [sstrideq + sstrideq * 4] - add sstride6q, sstrideq ;pitch * 6 + mov src1q, srcq + add src1q, sstrideq + lea sstride6q, [sstrideq + sstrideq * 4] + add sstride6q, sstrideq ;pitch * 6 .loop: - movh m0, [srcq ] ;A - movh m1, [src1q ] ;B - movh m2, [srcq + sstrideq * 2 ] ;C - movh m3, [src1q + sstrideq * 2] ;D - movh m4, [srcq + sstrideq * 4 ] ;E - movh m5, [src1q + sstrideq * 4] ;F - - punpcklbw m0, m1 ;A B - movh m6, [srcq + sstride6q] ;G - punpcklbw m2, m3 ;C D - movh m7, [src1q + sstride6q] ;H - punpcklbw m4, m5 ;E F - pmaddubsw m0, k0k1 - movh m3, [srcq + 8] ;A - pmaddubsw m2, k2k3 - punpcklbw m6, m7 ;G H - movh m5, [srcq + sstrideq + 8] ;B - pmaddubsw m4, k4k5 - punpcklbw m3, m5 ;A B - movh m7, [srcq + sstrideq * 2 + 8] ;C - pmaddubsw m6, k6k7 - movh m5, [src1q + sstrideq * 2 + 8] ;D - punpcklbw m7, m5 ;C D - paddsw m2, m6 - pmaddubsw m3, k0k1 - movh m1, [srcq + sstrideq * 4 + 8] ;E - paddsw m0, m4 - pmaddubsw m7, k2k3 - movh m6, [src1q + sstrideq * 4 + 8] ;F - punpcklbw m1, m6 ;E F - paddsw m0, m2 - paddsw m0, krd - movh m2, [srcq + sstride6q + 8] ;G - pmaddubsw m1, k4k5 - movh m5, [src1q + sstride6q + 8] ;H - psraw m0, 7 - punpcklbw m2, m5 ;G H - pmaddubsw m2, k6k7 - paddsw m7, m2 - paddsw m3, m1 - paddsw m3, m7 - paddsw m3, krd - psraw m3, 7 - packuswb m0, m3 - - add srcq, sstrideq - add src1q, sstrideq -%ifidn %1, v8_avg - pavgb m0, [dstq] -%endif - mova [dstq], m0 - add dstq, dst_stride - dec heightd - jnz .loop - REP_RET - -%else - ; ARCH_X86_64 - dec heightd - - movu m1, [srcq ] ;A - movu m3, [srcq + sstrideq ] ;B - lea srcq, [srcq + sstrideq * 2] - punpcklbw m0, m1, m3 ;A B - punpckhbw m1, m3 ;A B - movu m5, [srcq] ;C - punpcklbw m2, m3, m5 ;A B next iter - punpckhbw m3, m5 ;A B next iter - mova tmp0, m2 ;store to stack - mova tmp1, m3 ;store to stack - movu m7, [srcq + sstrideq] ;D - lea srcq, [srcq + sstrideq * 2] - punpcklbw m4, m5, m7 ;C D - punpckhbw m5, m7 ;C D - movu m9, [srcq] ;E - punpcklbw m6, m7, m9 ;C D next iter - punpckhbw m7, m9 ;C D next iter - movu m11, [srcq + sstrideq] ;F - lea srcq, [srcq + sstrideq * 2] - punpcklbw m8, m9, m11 ;E F - punpckhbw m9, m11 ;E F - movu m2, [srcq] ;G - punpcklbw m10, m11, m2 ;E F next iter - punpckhbw m11, m2 ;E F next iter - -.loop: - ;Do two rows at once - pmaddubsw m13, m0, k0k1 - mova m0, m4 - pmaddubsw m14, m8, k4k5 - pmaddubsw m15, m4, k2k3 - mova m4, m8 - paddsw m13, m14 - movu m3, [srcq + sstrideq] ;H - lea srcq, [srcq + sstrideq * 2] - punpcklbw m14, m2, m3 ;G H - mova m8, m14 - pmaddubsw m14, k6k7 - paddsw m15, m14 - paddsw m13, m15 - paddsw m13, krd - psraw m13, 7 - - pmaddubsw m14, m1, k0k1 - pmaddubsw m1, m9, k4k5 - pmaddubsw m15, m5, k2k3 - paddsw m14, m1 - mova m1, m5 - mova m5, m9 - punpckhbw m2, m3 ;G H - mova m9, m2 - pmaddubsw m2, k6k7 - paddsw m15, m2 - paddsw m14, m15 - paddsw m14, krd - psraw m14, 7 - packuswb m13, m14 -%ifidn %1, v8_avg - pavgb m13, [dstq] -%endif - mova [dstq], m13 - - ; next iter - pmaddubsw m15, tmp0, k0k1 - pmaddubsw m14, m10, k4k5 - pmaddubsw m13, m6, k2k3 - paddsw m15, m14 - mova tmp0, m6 - mova m6, m10 - movu m2, [srcq] ;G next iter - punpcklbw m14, m3, m2 ;G H next iter - mova m10, m14 - pmaddubsw m14, k6k7 - paddsw m13, m14 - paddsw m15, m13 - paddsw m15, krd - psraw m15, 7 - - pmaddubsw m14, tmp1, k0k1 - mova tmp1, m7 - pmaddubsw m13, m7, k2k3 - mova m7, m11 - pmaddubsw m11, k4k5 - paddsw m14, m11 - punpckhbw m3, m2 ;G H next iter - mova m11, m3 - pmaddubsw m3, k6k7 - paddsw m13, m3 - paddsw m14, m13 - paddsw m14, krd - psraw m14, 7 - packuswb m15, m14 + movh m0, [srcq ] ;A + movh m1, [srcq + sstrideq ] ;B + movh m2, [srcq + sstrideq * 2 ] ;C + movh m3, [src1q + sstrideq * 2] ;D + movh m4, [srcq + sstrideq * 4 ] ;E + movh m5, [src1q + sstrideq * 4] ;F + + punpcklbw m0, m1 ;A B + movh m6, [srcq + sstride6q] ;G + punpcklbw m2, m3 ;C D + movh m7, [src1q + sstride6q] ;H + punpcklbw m4, m5 ;E F + pmaddubsw m0, k0k1 + movh m3, [srcq + 8] ;A + pmaddubsw m2, k2k3 + punpcklbw m6, m7 ;G H + movh m5, [srcq + sstrideq + 8] ;B + pmaddubsw m4, k4k5 + punpcklbw m3, m5 ;A B + movh m7, [srcq + sstrideq * 2 + 8] ;C + pmaddubsw m6, k6k7 + movh m5, [src1q + sstrideq * 2 + 8] ;D + punpcklbw m7, m5 ;C D + paddsw m2, m6 + pmaddubsw m3, k0k1 + movh m1, [srcq + sstrideq * 4 + 8] ;E + paddsw m0, m4 + pmaddubsw m7, k2k3 + movh m6, [src1q + sstrideq * 4 + 8] ;F + punpcklbw m1, m6 ;E F + paddsw m0, m2 + paddsw m0, krd + movh m2, [srcq + sstride6q + 8] ;G + pmaddubsw m1, k4k5 + movh m5, [src1q + sstride6q + 8] ;H + psraw m0, 7 + punpcklbw m2, m5 ;G H + pmaddubsw m2, k6k7 %ifidn %1, v8_avg - pavgb m15, [dstq + dstrideq] + mova m4, [dstq] %endif - mova [dstq + dstrideq], m15 - lea dstq, [dstq + dstrideq * 2] - sub heightd, 2 - jg .loop - - ; Do last row if output_height is odd - jne .done - - movu m3, [srcq + sstrideq] ;H - punpcklbw m6, m2, m3 ;G H - punpckhbw m2, m3 ;G H - pmaddubsw m0, k0k1 - pmaddubsw m1, k0k1 - pmaddubsw m4, k2k3 - pmaddubsw m5, k2k3 - pmaddubsw m8, k4k5 - pmaddubsw m9, k4k5 - pmaddubsw m6, k6k7 - pmaddubsw m2, k6k7 - paddsw m0, m8 - paddsw m1, m9 - paddsw m4, m6 - paddsw m5, m2 - paddsw m0, m4 - paddsw m1, m5 - paddsw m0, krd - paddsw m1, krd - psraw m0, 7 - psraw m1, 7 - packuswb m0, m1 + movh [dstq], m0 + paddsw m7, m2 + paddsw m3, m1 + paddsw m3, m7 + paddsw m3, krd + psraw m3, 7 + packuswb m0, m3 + + add srcq, sstrideq + add src1q, sstrideq %ifidn %1, v8_avg - pavgb m0, [dstq] + pavgb m0, m4 %endif - mova [dstq], m0 - -.done: - REP_RET - -%endif ; ARCH_X86_64 - + mova [dstq], m0 + add dstq, dst_stride + dec heightd + jnz .loop + RET %endm INIT_XMM ssse3 |