diff options
author | Johann <johannkoenig@google.com> | 2011-01-11 09:41:57 -0500 |
---|---|---|
committer | Johann <johannkoenig@google.com> | 2011-01-14 14:26:10 -0500 |
commit | 15f9bea73b136df73ee5efd1589e19924162e8fe (patch) | |
tree | 5927e66487cfc09d4711a3586cd1750a818d6178 /vp8/encoder/x86/quantize_sse2.asm | |
parent | a1a4d23797b2537f986859c57a73fa9330fa0ed5 (diff) | |
download | libvpx-15f9bea73b136df73ee5efd1589e19924162e8fe.tar libvpx-15f9bea73b136df73ee5efd1589e19924162e8fe.tar.gz libvpx-15f9bea73b136df73ee5efd1589e19924162e8fe.tar.bz2 libvpx-15f9bea73b136df73ee5efd1589e19924162e8fe.zip |
update sse2 regular quantizer
Roughly 5% gain on 32-bit. Disabled for 64-bit.
Unset the executable bit on the SSSE3 version (cosmetic).
Change-Id: I1a5860839eb294ce4261f819caea2dcfa78e57ca
Diffstat (limited to 'vp8/encoder/x86/quantize_sse2.asm')
-rw-r--r-- | vp8/encoder/x86/quantize_sse2.asm | 316 |
1 files changed, 131 insertions, 185 deletions
diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm index 57bf3c93a..45e1a2ad3 100644 --- a/vp8/encoder/x86/quantize_sse2.asm +++ b/vp8/encoder/x86/quantize_sse2.asm @@ -11,220 +11,169 @@ %include "vpx_ports/x86_abi_support.asm" -;int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr, -; short *qcoeff_ptr,short *dequant_ptr, -; const int *default_zig_zag, short *round_ptr, -; short *quant_ptr, short *dqcoeff_ptr, +;int vp8_regular_quantize_b_impl_sse2( +; short *coeff_ptr, +; short *zbin_ptr, +; short *qcoeff_ptr, +; short *dequant_ptr, +; const int *default_zig_zag, +; short *round_ptr, +; short *quant_ptr, +; short *dqcoeff_ptr, ; unsigned short zbin_oq_value, -; short *zbin_boost_ptr); +; short *zbin_boost_ptr, +; short *quant_shift); ; global sym(vp8_regular_quantize_b_impl_sse2) sym(vp8_regular_quantize_b_impl_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 10 + SHADOW_ARGS_TO_STACK 11 + SAVE_XMM push rsi push rdi push rbx - ; end prolog - ALIGN_STACK 16, rax + %define abs_minus_zbin 0 + %define temp_qcoeff 32 + %define qcoeff 64 + %define eob_tmp 96 + %define stack_size 112 + sub rsp, stack_size + ; end prolog - %define abs_minus_zbin_lo 0 - %define abs_minus_zbin_hi 16 - %define temp_qcoeff_lo 32 - %define temp_qcoeff_hi 48 - %define save_xmm6 64 - %define save_xmm7 80 - %define eob 96 - - %define vp8_regularquantizeb_stack_size eob + 16 - - sub rsp, vp8_regularquantizeb_stack_size - - movdqa OWORD PTR[rsp + save_xmm6], xmm6 - movdqa OWORD PTR[rsp + save_xmm7], xmm7 - - mov rdx, arg(0) ;coeff_ptr - mov eax, arg(8) ;zbin_oq_value - - mov rcx, arg(1) ;zbin_ptr - movd xmm7, eax + mov rdx, arg(0) ; coeff_ptr + mov rcx, arg(1) ; zbin_ptr + movd xmm7, arg(8) ; zbin_oq_value + mov rdi, arg(5) ; round_ptr + mov rsi, arg(6) ; quant_ptr + ; z movdqa xmm0, OWORD PTR[rdx] movdqa xmm4, OWORD PTR[rdx + 16] + pshuflw xmm7, xmm7, 0 + punpcklwd xmm7, xmm7 ; duplicated zbin_oq_value + movdqa xmm1, xmm0 movdqa xmm5, xmm4 - 
psraw xmm0, 15 ;sign of z (aka sz) - psraw xmm4, 15 ;sign of z (aka sz) + ; sz + psraw xmm0, 15 + psraw xmm4, 15 + ; (z ^ sz) pxor xmm1, xmm0 pxor xmm5, xmm4 - movdqa xmm2, OWORD PTR[rcx] ;load zbin_ptr - movdqa xmm3, OWORD PTR[rcx + 16] ;load zbin_ptr - - pshuflw xmm7, xmm7, 0 - psubw xmm1, xmm0 ;x = abs(z) + ; x = abs(z) + psubw xmm1, xmm0 + psubw xmm5, xmm4 - punpcklwd xmm7, xmm7 ;duplicated zbin_oq_value - psubw xmm5, xmm4 ;x = abs(z) + movdqa xmm2, OWORD PTR[rcx] + movdqa xmm3, OWORD PTR[rcx + 16] + ; *zbin_ptr + zbin_oq_value paddw xmm2, xmm7 paddw xmm3, xmm7 - psubw xmm1, xmm2 ;sub (zbin_ptr + zbin_oq_value) - psubw xmm5, xmm3 ;sub (zbin_ptr + zbin_oq_value) - - mov rdi, arg(5) ;round_ptr - mov rsi, arg(6) ;quant_ptr + ; x - (*zbin_ptr + zbin_oq_value) + psubw xmm1, xmm2 + psubw xmm5, xmm3 + movdqa OWORD PTR[rsp + abs_minus_zbin], xmm1 + movdqa OWORD PTR[rsp + abs_minus_zbin + 16], xmm5 - movdqa OWORD PTR[rsp + abs_minus_zbin_lo], xmm1 - movdqa OWORD PTR[rsp + abs_minus_zbin_hi], xmm5 - - paddw xmm1, xmm2 ;add (zbin_ptr + zbin_oq_value) back - paddw xmm5, xmm3 ;add (zbin_ptr + zbin_oq_value) back + ; add (zbin_ptr + zbin_oq_value) back + paddw xmm1, xmm2 + paddw xmm5, xmm3 movdqa xmm2, OWORD PTR[rdi] - movdqa xmm3, OWORD PTR[rsi] - movdqa xmm6, OWORD PTR[rdi + 16] + + movdqa xmm3, OWORD PTR[rsi] movdqa xmm7, OWORD PTR[rsi + 16] + ; x + round paddw xmm1, xmm2 paddw xmm5, xmm6 - pmulhw xmm1, xmm3 - pmulhw xmm5, xmm7 - - mov rsi, arg(2) ;qcoeff_ptr - pxor xmm6, xmm6 - - pxor xmm1, xmm0 - pxor xmm5, xmm4 - - psubw xmm1, xmm0 - psubw xmm5, xmm4 - - movdqa OWORD PTR[rsp + temp_qcoeff_lo], xmm1 - movdqa OWORD PTR[rsp + temp_qcoeff_hi], xmm5 - - movdqa OWORD PTR[rsi], xmm6 ;zero qcoeff - movdqa OWORD PTR[rsi + 16], xmm6 ;zero qcoeff - - xor rax, rax - mov rcx, -1 - - mov [rsp + eob], rcx - mov rsi, arg(9) ;zbin_boost_ptr - - mov rbx, arg(4) ;default_zig_zag + ; y = x * quant_ptr >> 16 + pmulhw xmm3, xmm1 + pmulhw xmm7, xmm5 -rq_zigzag_loop: - movsxd rcx, DWORD 
PTR[rbx + rax*4] ;now we have rc - movsx edi, WORD PTR [rsi] ;*zbin_boost_ptr aka zbin - lea rsi, [rsi + 2] ;zbin_boost_ptr++ + ; y += x + paddw xmm1, xmm3 + paddw xmm5, xmm7 - movsx edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2] + movdqa OWORD PTR[rsp + temp_qcoeff], xmm1 + movdqa OWORD PTR[rsp + temp_qcoeff + 16], xmm5 - sub edx, edi ;x - zbin - jl rq_zigzag_1 - - mov rdi, arg(2) ;qcoeff_ptr - - movsx edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2] - - cmp edx, 0 - je rq_zigzag_1 - - mov WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc] - - mov rsi, arg(9) ;zbin_boost_ptr - mov [rsp + eob], rax ;eob = i - -rq_zigzag_1: - movsxd rcx, DWORD PTR[rbx + rax*4 + 4] - movsx edi, WORD PTR [rsi] ;*zbin_boost_ptr aka zbin - lea rsi, [rsi + 2] ;zbin_boost_ptr++ - - movsx edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2] - lea rax, [rax + 1] - - sub edx, edi ;x - zbin - jl rq_zigzag_1a - - mov rdi, arg(2) ;qcoeff_ptr - - movsx edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2] - - cmp edx, 0 - je rq_zigzag_1a - - mov WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc] - - mov rsi, arg(9) ;zbin_boost_ptr - mov [rsp + eob], rax ;eob = i - -rq_zigzag_1a: - movsxd rcx, DWORD PTR[rbx + rax*4 + 4] - movsx edi, WORD PTR [rsi] ;*zbin_boost_ptr aka zbin - lea rsi, [rsi + 2] ;zbin_boost_ptr++ - - movsx edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2] - lea rax, [rax + 1] - - sub edx, edi ;x - zbin - jl rq_zigzag_1b - - mov rdi, arg(2) ;qcoeff_ptr - - movsx edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2] - - cmp edx, 0 - je rq_zigzag_1b - - mov WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc] - - mov rsi, arg(9) ;zbin_boost_ptr - mov [rsp + eob], rax ;eob = i - -rq_zigzag_1b: - movsxd rcx, DWORD PTR[rbx + rax*4 + 4] - movsx edi, WORD PTR [rsi] ;*zbin_boost_ptr aka zbin - lea rsi, [rsi + 2] ;zbin_boost_ptr++ - - movsx edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2] - lea rax, [rax + 1] - - sub edx, edi ;x - zbin - jl rq_zigzag_1c - - mov rdi, arg(2) ;qcoeff_ptr - - 
movsx edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2] - - cmp edx, 0 - je rq_zigzag_1c - - mov WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc] - - mov rsi, arg(9) ;zbin_boost_ptr - mov [rsp + eob], rax ;eob = i - -rq_zigzag_1c: - lea rax, [rax + 1] - - cmp rax, 16 - jl rq_zigzag_loop - - mov rdi, arg(2) ;qcoeff_ptr - mov rcx, arg(3) ;dequant_ptr - mov rsi, arg(7) ;dqcoeff_ptr - - movdqa xmm2, OWORD PTR[rdi] - movdqa xmm3, OWORD PTR[rdi + 16] + pxor xmm6, xmm6 + ; zero qcoeff + movdqa OWORD PTR[rsp + qcoeff], xmm6 + movdqa OWORD PTR[rsp + qcoeff + 16], xmm6 + + mov [rsp + eob_tmp], DWORD -1 ; eob + mov rsi, arg(9) ; zbin_boost_ptr + mov rdi, arg(4) ; default_zig_zag + mov rax, arg(10) ; quant_shift_ptr + +%macro ZIGZAG_LOOP 2 +rq_zigzag_loop_%1: + movsxd rdx, DWORD PTR[rdi + (%1 * 4)] ; rc + movsx ebx, WORD PTR [rsi] ; *zbin_boost_ptr + lea rsi, [rsi + 2] ; zbin_boost_ptr++ + + ; x + movsx ecx, WORD PTR[rsp + abs_minus_zbin + rdx *2] + + ; if (x >= zbin) + sub ecx, ebx ; x - zbin + jl rq_zigzag_loop_%2 ; x < zbin + + movsx ebx, WORD PTR[rsp + temp_qcoeff + rdx *2] + + ; downshift by quant_shift[rdx] + movsx ecx, WORD PTR[rax + rdx*2] ; quant_shift_ptr[rc] + sar ebx, cl ; also sets Z bit + je rq_zigzag_loop_%2 ; !y + mov WORD PTR[rsp + qcoeff + rdx * 2], bx ;qcoeff_ptr[rc] = temp_qcoeff[rc] + + mov rsi, arg(9) ; reset to b->zrun_zbin_boost + mov [rsp + eob_tmp], DWORD %1 ; eob = i +%endmacro +ZIGZAG_LOOP 0, 1 +ZIGZAG_LOOP 1, 2 +ZIGZAG_LOOP 2, 3 +ZIGZAG_LOOP 3, 4 +ZIGZAG_LOOP 4, 5 +ZIGZAG_LOOP 5, 6 +ZIGZAG_LOOP 6, 7 +ZIGZAG_LOOP 7, 8 +ZIGZAG_LOOP 8, 9 +ZIGZAG_LOOP 9, 10 +ZIGZAG_LOOP 10, 11 +ZIGZAG_LOOP 11, 12 +ZIGZAG_LOOP 12, 13 +ZIGZAG_LOOP 13, 14 +ZIGZAG_LOOP 14, 15 +ZIGZAG_LOOP 15, end +rq_zigzag_loop_end: + + mov rbx, arg(2) ; qcoeff_ptr + mov rcx, arg(3) ; dequant_ptr + mov rsi, arg(7) ; dqcoeff_ptr + mov rax, [rsp + eob_tmp] ; eob + + movdqa xmm2, OWORD PTR[rsp + qcoeff] + movdqa xmm3, OWORD PTR[rsp + qcoeff + 16] + + ; y ^ sz + pxor xmm2, xmm0 + pxor 
xmm3, xmm4 + ; x = (y ^ sz) - sz + psubw xmm2, xmm0 + psubw xmm3, xmm4 movdqa xmm0, OWORD PTR[rcx] movdqa xmm1, OWORD PTR[rcx + 16] @@ -232,23 +181,20 @@ rq_zigzag_1c: pmullw xmm0, xmm2 pmullw xmm1, xmm3 - movdqa OWORD PTR[rsi], xmm0 ;store dqcoeff - movdqa OWORD PTR[rsi + 16], xmm1 ;store dqcoeff - - mov rax, [rsp + eob] - - movdqa xmm6, OWORD PTR[rsp + save_xmm6] - movdqa xmm7, OWORD PTR[rsp + save_xmm7] + movdqa OWORD PTR[rbx], xmm2 + movdqa OWORD PTR[rbx + 16], xmm3 + movdqa OWORD PTR[rsi], xmm0 ; store dqcoeff + movdqa OWORD PTR[rsi + 16], xmm1 ; store dqcoeff add rax, 1 - add rsp, vp8_regularquantizeb_stack_size - pop rsp - ; begin epilog + add rsp, stack_size + pop rsp pop rbx pop rdi pop rsi + RESTORE_XMM UNSHADOW_ARGS pop rbp ret |