diff options
author | Ronald S. Bultje <rbultje@google.com> | 2013-06-21 12:49:50 -0700 |
---|---|---|
committer | Gerrit Code Review <gerrit@gerrit.golo.chromium.org> | 2013-06-21 12:49:50 -0700 |
commit | 7756e9892b312e668fc3134bad2513806d609f73 (patch) | |
tree | cb9d543877f3dd653e8c57b83cc3b62c60bdff1b /vp9/encoder/x86/vp9_subtract_sse2.asm | |
parent | 9a480482cbc7f0d359d959bb2cfe097d0a672d6b (diff) | |
parent | 25c588b1e49deb70a06549a8c843c9a3bc19ea1a (diff) | |
download | libvpx-7756e9892b312e668fc3134bad2513806d609f73.tar libvpx-7756e9892b312e668fc3134bad2513806d609f73.tar.gz libvpx-7756e9892b312e668fc3134bad2513806d609f73.tar.bz2 libvpx-7756e9892b312e668fc3134bad2513806d609f73.zip |
Merge "Add subtract_block SSE2 version and unit test."
Diffstat (limited to 'vp9/encoder/x86/vp9_subtract_sse2.asm')
-rw-r--r-- | vp9/encoder/x86/vp9_subtract_sse2.asm | 464 |
1 files changed, 118 insertions, 346 deletions
; be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

;-----------------------------------------------------------------------------
; void vp9_subtract_block(int rows, int cols,
;                         int16_t *diff, ptrdiff_t diff_stride,
;                         const uint8_t *src, ptrdiff_t src_stride,
;                         const uint8_t *pred, ptrdiff_t pred_stride)
;
; Syntax: NASM/yasm with x86inc abstraction macros (m0-m7, mova, movh, argq).
; The exported symbol is generated by `cglobal` from the name `subtract_block`
; and the `sse2` cpu suffix.
;
; For every row, widens the unsigned 8-bit src and pred pixels to 16 bits
; (interleave with a zero register) and stores src - pred as 16-bit words
; into diff.
;
; Strides:
;   src_stride / pred_stride are added to the byte pointers directly.
;   diff_stride is scaled by 2 before being added to diff, i.e. it is counted
;   in int16_t elements.
;
; Width dispatch:
;   cols is compared against 4, 8, 16 and 32; any other value falls through
;   to the 64-wide loop.
;
; Row handling:
;   The 64- and 32-wide loops consume one row per iteration.
;   The 16-, 8- and 4-wide loops consume two rows per iteration, so rows is
;   expected to be even on those paths.
;
; Register roles:
;   m7        all-zero vector used as the high byte when widening
;   m0-m5     scratch
;   pred_str  alias for the cols register; only seven arguments are loaded
;             into registers, so once cols has been tested its register is
;             reused to hold pred_stride (fetched from its argument slot
;             via pred_stridemp).
;
; NOTE(review): mova is the aligned move of the active vector size. In the
; XMM paths this presumably requires 16-byte aligned src/pred rows (widths
; >= 16) and 16-byte aligned diff rows (widths >= 8) - confirm against the
; callers.
;-----------------------------------------------------------------------------

INIT_XMM sse2
cglobal subtract_block, 7, 7, 8, \
                        rows, cols, diff, diff_stride, src, src_stride, \
                        pred, pred_stride
%define pred_str colsq
  pxor                  m7, m7         ; dedicated zero register (xmm7 here)
  cmp                colsd, 4
  je .case_4
  cmp                colsd, 8
  je .case_8
  cmp                colsd, 16
  je .case_16
  cmp                colsd, 32
  je .case_32
  ; anything else is handled as a 64-wide block (falls through to .loop_64)

; loop16 src_off0, src_off1, pred_off0, pred_off1, diff_off0, diff_off1
;
; Subtracts two full vectors of pixels.
;   %1, %2  byte offsets from srcq of the two source vectors
;   %3, %4  byte offsets from predq of the two prediction vectors
;   %5, %6  byte offsets from diffq of the two results; each result occupies
;           2*mmsize bytes (low half of the pixels first, then the high half)
; Requires m7 == 0. Clobbers m0-m5.
%macro loop16 6
  mova                  m0, [srcq+%1]
  mova                  m4, [srcq+%2]
  mova                  m1, [predq+%3]
  mova                  m5, [predq+%4]
  punpckhbw             m2, m0, m7     ; m2 = high pixels of src vector 0
  punpckhbw             m3, m1, m7     ; m3 = high pixels of pred vector 0
  punpcklbw             m0, m7         ; m0 = low pixels of src vector 0
  punpcklbw             m1, m7         ; m1 = low pixels of pred vector 0
  psubw                 m2, m3
  psubw                 m0, m1
  punpckhbw             m1, m4, m7     ; m1 = high pixels of src vector 1
  punpckhbw             m3, m5, m7     ; m3 = high pixels of pred vector 1
  punpcklbw             m4, m7         ; m4 = low pixels of src vector 1
  punpcklbw             m5, m7         ; m5 = low pixels of pred vector 1
  psubw                 m1, m3
  psubw                 m4, m5
  mova [diffq+mmsize*0+%5], m0
  mova [diffq+mmsize*1+%5], m2
  mova [diffq+mmsize*0+%6], m4
  mova [diffq+mmsize*1+%6], m1
%endmacro

  ; ---- 64 pixels wide: four source vectors per row, one row per pass ----
  mov             pred_str, pred_stridemp
.loop_64:
  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
  lea                diffq, [diffq+diff_strideq*2]   ; next diff row
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd
  jg .loop_64
  RET

  ; ---- 32 pixels wide: two source vectors per row, one row per pass ----
.case_32:
  mov             pred_str, pred_stridemp
.loop_32:
  loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
  lea                diffq, [diffq+diff_strideq*2]   ; next diff row
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd
  jg .loop_32
  RET

  ; ---- 16 pixels wide: the second vector is the following row, so each
  ;      pass covers two rows ----
.case_16:
  mov             pred_str, pred_stridemp
.loop_16:
  loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
  lea                diffq, [diffq+diff_strideq*4]   ; skip two diff rows
  lea                predq, [predq+pred_str*2]
  lea                 srcq, [srcq+src_strideq*2]
  sub                rowsd, 2
  jg .loop_16
  RET

; loop_h
;
; Subtracts half a vector of pixels (movh) on two consecutive rows and stores
; one full vector of 16-bit differences per row. The pixel count follows the
; active register set: mmsize/2 pixels per row.
; Requires m7 == 0 in the *active* register set. Clobbers m0-m3.
%macro loop_h 0
  movh                  m0, [srcq]
  movh                  m2, [srcq+src_strideq]
  movh                  m1, [predq]
  movh                  m3, [predq+pred_str]
  punpcklbw             m0, m7
  punpcklbw             m1, m7
  punpcklbw             m2, m7
  punpcklbw             m3, m7
  psubw                 m0, m1
  psubw                 m2, m3
  mova             [diffq], m0
  mova [diffq+diff_strideq*2], m2
%endmacro

  ; ---- 8 pixels wide: XMM registers, two rows per pass ----
.case_8:
  mov             pred_str, pred_stridemp
.loop_8:
  loop_h
  lea                diffq, [diffq+diff_strideq*4]   ; skip two diff rows
  lea                 srcq, [srcq+src_strideq*2]
  lea                predq, [predq+pred_str*2]
  sub                rowsd, 2
  jg .loop_8
  RET

  ; ---- 4 pixels wide: MMX registers, two rows per pass ----
  ; From INIT_MMX onwards m0-m7 name mm0-mm7. The zeroing done at function
  ; entry was assembled for xmm7, and the MMX registers are a separate
  ; register file, so the zero register has to be cleared again here before
  ; loop_h uses it for widening.
INIT_MMX
.case_4:
  pxor                  m7, m7         ; dedicated zero register (mm7 here)
  mov             pred_str, pred_stridemp
.loop_4:
  loop_h
  lea                diffq, [diffq+diff_strideq*4]   ; skip two diff rows
  lea                 srcq, [srcq+src_strideq*2]
  lea                predq, [predq+pred_str*2]
  sub                rowsd, 2
  jg .loop_4
  emms                                 ; leave MMX state before returning
  RET