From 8edaf6e2f232f00de3e3aac1657a31a3effe9a11 Mon Sep 17 00:00:00 2001
From: Johann
Date: Thu, 10 Feb 2011 14:57:43 -0500
Subject: use asm_offsets with vp8_regular_quantize_b_sse2

remove helper function and avoid shadowing all the arguments to the
stack on 64bit systems

when running with --good --cpu-used=0:
~2% on linux x86 and x86_64
~2% on win32 x86 msys and visual studio
more on darwin10 x86_64
significantly more on x86_64-win64-vs9

Change-Id: Ib7be12edf511fbf2922f191afd5b33b19a0c4ae6
---
 build/make/Makefile                    |   7 +-
 libs.mk                                |   4 +
 vp8/encoder/asm_enc_offsets.c          |  30 ++--
 vp8/encoder/x86/quantize_sse2.asm      | 251 ++++++++++++++++++++-------------
 vp8/encoder/x86/quantize_x86.h         |   3 -
 vp8/encoder/x86/x86_csystemdependent.c |  28 +---
 6 files changed, 179 insertions(+), 144 deletions(-)

diff --git a/build/make/Makefile b/build/make/Makefile
index 5c90463be..64d3c934b 100755
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -331,11 +331,8 @@ ifneq ($(call enabled,DIST-SRCS),)
     DIST-SRCS-$(CONFIG_MSVS)  += build/make/gen_msvs_sln.sh
     DIST-SRCS-$(CONFIG_MSVS)  += build/x86-msvs/yasm.rules
     DIST-SRCS-$(CONFIG_RVCT)  += build/make/armlink_adapter.sh
-    #
-    # This isn't really ARCH_ARM dependent, it's dependent on whether we're
-    # using assembly code or not (CONFIG_OPTIMIZATIONS maybe). Just use
-    # this for now.
-    DIST-SRCS-$(ARCH_ARM)     += build/make/obj_int_extract.c
+    # Include obj_int_extract if we use offsets from asm_*_offsets
+    DIST-SRCS-$(ARCH_ARM)$(ARCH_X86)$(ARCH_X86_64) += build/make/obj_int_extract.c
     DIST-SRCS-$(ARCH_ARM)     += build/make/ads2gas.pl
     DIST-SRCS-yes             += $(target:-$(TOOLCHAIN)=).mk
 endif

diff --git a/libs.mk b/libs.mk
index 350b31077..6a5dc1886 100644
--- a/libs.mk
+++ b/libs.mk
@@ -245,7 +245,9 @@ ifeq ($(CONFIG_EXTERNAL_BUILD),) # Visual Studio uses obj_int_extract.bat
 OBJS-yes += $(VP8_PREFIX)common/asm_com_offsets.c.o
 CLEAN-OBJS += asm_com_offsets.asm
 $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_com_offsets.asm
+endif
 
+ifeq ($(ARCH_ARM)$(ARCH_X86)$(ARCH_X86_64), yes)
 ifeq ($(CONFIG_VP8_ENCODER), yes)
     asm_enc_offsets.asm: obj_int_extract
     asm_enc_offsets.asm: $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
@@ -254,7 +256,9 @@ ifeq ($(CONFIG_EXTERNAL_BUILD),) # Visual Studio uses obj_int_extract.bat
     CLEAN-OBJS += asm_enc_offsets.asm
     $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_enc_offsets.asm
 endif
+endif
 
+ifeq ($(ARCH_ARM), yes)
 ifeq ($(CONFIG_VP8_DECODER), yes)
     asm_dec_offsets.asm: obj_int_extract
     asm_dec_offsets.asm: $(VP8_PREFIX)decoder/asm_dec_offsets.c.o

diff --git a/vp8/encoder/asm_enc_offsets.c b/vp8/encoder/asm_enc_offsets.c
index fcf77756a..c7983c1b0 100644
--- a/vp8/encoder/asm_enc_offsets.c
+++ b/vp8/encoder/asm_enc_offsets.c
@@ -12,9 +12,11 @@
 #include "vpx_ports/config.h"
 #include <stddef.h>
 
+#include "block.h"
+#include "vp8/common/blockd.h"
+#include "onyx_int.h"
 #include "treewriter.h"
 #include "tokenize.h"
-#include "onyx_int.h"
 
 #define ct_assert(name,cond) \
     static void assert_##name(void) UNUSED;\
@@ -31,6 +33,21 @@
  * {
  */
 
+//regular quantize
+DEFINE(vp8_block_coeff, offsetof(BLOCK, coeff));
+DEFINE(vp8_block_zbin, offsetof(BLOCK, zbin));
+DEFINE(vp8_block_round, offsetof(BLOCK, round));
+DEFINE(vp8_block_quant, offsetof(BLOCK, quant));
+DEFINE(vp8_block_quant_fast, offsetof(BLOCK, quant_fast));
+DEFINE(vp8_block_zbin_extra, offsetof(BLOCK, zbin_extra));
+DEFINE(vp8_block_zrun_zbin_boost, offsetof(BLOCK, zrun_zbin_boost));
+DEFINE(vp8_block_quant_shift, offsetof(BLOCK, quant_shift));
+
+DEFINE(vp8_blockd_qcoeff, offsetof(BLOCKD, qcoeff));
+DEFINE(vp8_blockd_dequant, offsetof(BLOCKD, dequant));
+DEFINE(vp8_blockd_dqcoeff, offsetof(BLOCKD, dqcoeff));
+DEFINE(vp8_blockd_eob, offsetof(BLOCKD, eob));
+
 //pack tokens
 DEFINE(vp8_writer_lowvalue, offsetof(vp8_writer, lowvalue));
 DEFINE(vp8_writer_range, offsetof(vp8_writer, range));
@@ -65,17 +82,6 @@ DEFINE(TOKENLIST_SZ, sizeof(TOKENLIST));
 
 DEFINE(vp8_common_mb_rows, offsetof(VP8_COMMON, mb_rows));
 
-// offsets from BLOCK structure
-DEFINE(vp8_block_coeff, offsetof(BLOCK, coeff));
-DEFINE(vp8_block_quant_fast, offsetof(BLOCK, quant_fast));
-DEFINE(vp8_block_round, offsetof(BLOCK, round));
-
-// offsets from BLOCKD structure
-DEFINE(vp8_blockd_qcoeff, offsetof(BLOCKD, qcoeff));
-DEFINE(vp8_blockd_dqcoeff, offsetof(BLOCKD, dqcoeff));
-DEFINE(vp8_blockd_dequant, offsetof(BLOCKD, dequant));
-DEFINE(vp8_blockd_eob, offsetof(BLOCKD, eob));
-
 // These two sizes are used in vp8cx_pack_tokens. They are hard coded
 // so if the size changes this will have to be adjusted.
 #if HAVE_ARMV5TE

diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm
index 45e1a2ad3..bc70b68a9 100644
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -9,48 +9,59 @@
 
 %include "vpx_ports/x86_abi_support.asm"
+%include "asm_enc_offsets.asm"
 
-;int vp8_regular_quantize_b_impl_sse2(
-;               short *coeff_ptr,
-;               short *zbin_ptr,
-;               short *qcoeff_ptr,
-;               short *dequant_ptr,
-;               const int *default_zig_zag,
-;               short *round_ptr,
-;               short *quant_ptr,
-;               short *dqcoeff_ptr,
-;               unsigned short zbin_oq_value,
-;               short *zbin_boost_ptr,
-;               short *quant_shift);
-;
-global sym(vp8_regular_quantize_b_impl_sse2)
-sym(vp8_regular_quantize_b_impl_sse2):
+; void vp8_regular_quantize_b_sse2 | arg
+;  (BLOCK  *b,                     |  0
+;   BLOCKD *d)                     |  1
+
+global sym(vp8_regular_quantize_b_sse2)
+sym(vp8_regular_quantize_b_sse2):
     push rbp
     mov rbp, rsp
-    SHADOW_ARGS_TO_STACK 11
     SAVE_XMM
+    GET_GOT rbx
     push rsi
+
+%if ABI_IS_32BIT
+    push rdi
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
     push rdi
-    push rbx
+  %endif
+%endif
+
     ALIGN_STACK 16, rax
 
-    %define abs_minus_zbin 0
-    %define temp_qcoeff 32
-    %define qcoeff 64
-    %define eob_tmp 96
+    %define BLOCKD_d        0   ;  8
+    %define zrun_zbin_boost 8   ;  8
+    %define abs_minus_zbin  16  ; 32
+    %define temp_qcoeff     48  ; 32
+    %define qcoeff          80  ; 32
     %define stack_size 112
     sub rsp, stack_size
     ; end prolog
 
-    mov rdx, arg(0) ; coeff_ptr
-    mov rcx, arg(1) ; zbin_ptr
-    movd xmm7, arg(8) ; zbin_oq_value
-    mov rdi, arg(5) ; round_ptr
-    mov rsi, arg(6) ; quant_ptr
+%if ABI_IS_32BIT
+    mov rdi, arg(0)
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    mov rdi, rcx                ; BLOCK *b
+    mov [rsp + BLOCKD_d], rdx
+  %else
+    ;mov rdi, rdi               ; BLOCK *b
+    mov [rsp + BLOCKD_d], rsi
+  %endif
+%endif
+
+    mov rdx, [rdi + vp8_block_coeff] ; coeff_ptr
+    mov rcx, [rdi + vp8_block_zbin] ; zbin_ptr
+    movd xmm7, [rdi + vp8_block_zbin_extra] ; zbin_oq_value
 
     ; z
-    movdqa xmm0, OWORD PTR[rdx]
-    movdqa xmm4, OWORD PTR[rdx + 16]
+    movdqa xmm0, [rdx]
+    movdqa xmm4, [rdx + 16]
+    mov rdx, [rdi + vp8_block_round] ; round_ptr
 
     pshuflw xmm7, xmm7, 0
     punpcklwd xmm7, xmm7 ; duplicated zbin_oq_value
@@ -70,8 +81,9 @@ sym(vp8_regular_quantize_b_impl_sse2):
     psubw xmm1, xmm0
     psubw xmm5, xmm4
 
-    movdqa xmm2, OWORD PTR[rcx]
-    movdqa xmm3, OWORD PTR[rcx + 16]
+    movdqa xmm2, [rcx]
+    movdqa xmm3, [rcx + 16]
+    mov rcx, [rdi + vp8_block_quant] ; quant_ptr
 
     ; *zbin_ptr + zbin_oq_value
     paddw xmm2, xmm7
@@ -80,18 +92,18 @@ sym(vp8_regular_quantize_b_impl_sse2):
     ; x - (*zbin_ptr + zbin_oq_value)
     psubw xmm1, xmm2
     psubw xmm5, xmm3
-    movdqa OWORD PTR[rsp + abs_minus_zbin], xmm1
-    movdqa OWORD PTR[rsp + abs_minus_zbin + 16], xmm5
+    movdqa [rsp + abs_minus_zbin], xmm1
+    movdqa [rsp + abs_minus_zbin + 16], xmm5
 
     ; add (zbin_ptr + zbin_oq_value) back
     paddw xmm1, xmm2
     paddw xmm5, xmm3
 
-    movdqa xmm2, OWORD PTR[rdi]
-    movdqa xmm6, OWORD PTR[rdi + 16]
+    movdqa xmm2, [rdx]
+    movdqa xmm6, [rdx + 16]
 
-    movdqa xmm3, OWORD PTR[rsi]
-    movdqa xmm7, OWORD PTR[rsi + 16]
+    movdqa xmm3, [rcx]
+    movdqa xmm7, [rcx + 16]
 
     ; x + round
     paddw xmm1, xmm2
@@ -105,68 +117,67 @@ sym(vp8_regular_quantize_b_impl_sse2):
     paddw xmm1, xmm3
     paddw xmm5, xmm7
 
-    movdqa OWORD PTR[rsp + temp_qcoeff], xmm1
-    movdqa OWORD PTR[rsp + temp_qcoeff + 16], xmm5
+    movdqa [rsp + temp_qcoeff], xmm1
+    movdqa [rsp + temp_qcoeff + 16], xmm5
 
     pxor xmm6, xmm6
     ; zero qcoeff
-    movdqa OWORD PTR[rsp + qcoeff], xmm6
-    movdqa OWORD PTR[rsp + qcoeff + 16], xmm6
+    movdqa [rsp + qcoeff], xmm6
+    movdqa [rsp + qcoeff + 16], xmm6
 
-    mov [rsp + eob_tmp], DWORD -1 ; eob
-    mov rsi, arg(9) ; zbin_boost_ptr
-    mov rdi, arg(4) ; default_zig_zag
-    mov rax, arg(10) ; quant_shift_ptr
+    mov rsi, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr
+    mov rax, [rdi + vp8_block_quant_shift] ; quant_shift_ptr
+    mov [rsp + zrun_zbin_boost], rsi
 
-%macro ZIGZAG_LOOP 2
-rq_zigzag_loop_%1:
-    movsxd rdx, DWORD PTR[rdi + (%1 * 4)] ; rc
-    movsx ebx, WORD PTR [rsi] ; *zbin_boost_ptr
-    lea rsi, [rsi + 2] ; zbin_boost_ptr++
+%macro ZIGZAG_LOOP 1
+    movsx edx, WORD PTR[GLOBAL(zig_zag) + (%1 * 2)] ; rc
 
     ; x
     movsx ecx, WORD PTR[rsp + abs_minus_zbin + rdx *2]
 
     ; if (x >= zbin)
-    sub ecx, ebx ; x - zbin
-    jl rq_zigzag_loop_%2 ; x < zbin
+    sub cx, WORD PTR[rsi] ; x - zbin
+    lea rsi, [rsi + 2] ; zbin_boost_ptr++
+    jl rq_zigzag_loop_%1 ; x < zbin
 
-    movsx ebx, WORD PTR[rsp + temp_qcoeff + rdx *2]
+    movsx edi, WORD PTR[rsp + temp_qcoeff + rdx *2]
 
     ; downshift by quant_shift[rdx]
     movsx ecx, WORD PTR[rax + rdx*2] ; quant_shift_ptr[rc]
-    sar ebx, cl ; also sets Z bit
-    je rq_zigzag_loop_%2 ; !y
-    mov WORD PTR[rsp + qcoeff + rdx * 2], bx ;qcoeff_ptr[rc] = temp_qcoeff[rc]
-
-    mov rsi, arg(9) ; reset to b->zrun_zbin_boost
-    mov [rsp + eob_tmp], DWORD %1 ; eob = i
+    sar edi, cl ; also sets Z bit
+    je rq_zigzag_loop_%1 ; !y
+    mov WORD PTR[rsp + qcoeff + rdx*2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
+    mov rsi, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
+rq_zigzag_loop_%1:
 %endmacro
 
-ZIGZAG_LOOP 0, 1
-ZIGZAG_LOOP 1, 2
-ZIGZAG_LOOP 2, 3
-ZIGZAG_LOOP 3, 4
-ZIGZAG_LOOP 4, 5
-ZIGZAG_LOOP 5, 6
-ZIGZAG_LOOP 6, 7
-ZIGZAG_LOOP 7, 8
-ZIGZAG_LOOP 8, 9
-ZIGZAG_LOOP 9, 10
-ZIGZAG_LOOP 10, 11
-ZIGZAG_LOOP 11, 12
-ZIGZAG_LOOP 12, 13
-ZIGZAG_LOOP 13, 14
-ZIGZAG_LOOP 14, 15
-ZIGZAG_LOOP 15, end
-rq_zigzag_loop_end:
-
-    mov rbx, arg(2) ; qcoeff_ptr
-    mov rcx, arg(3) ; dequant_ptr
-    mov rsi, arg(7) ; dqcoeff_ptr
-    mov rax, [rsp + eob_tmp] ; eob
-
-    movdqa xmm2, OWORD PTR[rsp + qcoeff]
-    movdqa xmm3, OWORD PTR[rsp + qcoeff + 16]
+ZIGZAG_LOOP 0
+ZIGZAG_LOOP 1
+ZIGZAG_LOOP 2
+ZIGZAG_LOOP 3
+ZIGZAG_LOOP 4
+ZIGZAG_LOOP 5
+ZIGZAG_LOOP 6
+ZIGZAG_LOOP 7
+ZIGZAG_LOOP 8
+ZIGZAG_LOOP 9
+ZIGZAG_LOOP 10
+ZIGZAG_LOOP 11
+ZIGZAG_LOOP 12
+ZIGZAG_LOOP 13
+ZIGZAG_LOOP 14
+ZIGZAG_LOOP 15
+
+    movdqa xmm2, [rsp + qcoeff]
+    movdqa xmm3, [rsp + qcoeff + 16]
+
+%if ABI_IS_32BIT
+    mov rdi, arg(1)
+%else
+    mov rdi, [rsp + BLOCKD_d]
+%endif
+
+    mov rcx, [rdi + vp8_blockd_dequant] ; dequant_ptr
+    mov rsi, [rdi + vp8_blockd_dqcoeff] ; dqcoeff_ptr
 
     ; y ^ sz
     pxor xmm2, xmm0
@@ -175,34 +186,67 @@ sym(vp8_regular_quantize_b_impl_sse2):
     psubw xmm2, xmm0
     psubw xmm3, xmm4
 
-    movdqa xmm0, OWORD PTR[rcx]
-    movdqa xmm1, OWORD PTR[rcx + 16]
+    ; dequant
+    movdqa xmm0, [rcx]
+    movdqa xmm1, [rcx + 16]
+
+    mov rcx, [rdi + vp8_blockd_qcoeff] ; qcoeff_ptr
 
     pmullw xmm0, xmm2
     pmullw xmm1, xmm3
 
-    movdqa OWORD PTR[rbx], xmm2
-    movdqa OWORD PTR[rbx + 16], xmm3
-    movdqa OWORD PTR[rsi], xmm0 ; store dqcoeff
-    movdqa OWORD PTR[rsi + 16], xmm1 ; store dqcoeff
-
-    add rax, 1
+    movdqa [rcx], xmm2 ; store qcoeff
+    movdqa [rcx + 16], xmm3
+    movdqa [rsi], xmm0 ; store dqcoeff
+    movdqa [rsi + 16], xmm1
+
+    ; select the last value (in zig_zag order) for EOB
+    pcmpeqw xmm2, xmm6
+    pcmpeqw xmm3, xmm6
+    ; !
+    pcmpeqw xmm6, xmm6
+    pxor xmm2, xmm6
+    pxor xmm3, xmm6
+    ; mask inv_zig_zag
+    pand xmm2, [GLOBAL(inv_zig_zag)]
+    pand xmm3, [GLOBAL(inv_zig_zag) + 16]
+    ; select the max value
+    pmaxsw xmm2, xmm3
+    pshufd xmm3, xmm2, 00001110b
+    pmaxsw xmm2, xmm3
+    pshuflw xmm3, xmm2, 00001110b
+    pmaxsw xmm2, xmm3
+    pshuflw xmm3, xmm2, 00000001b
+    pmaxsw xmm2, xmm3
+    movd eax, xmm2
+    and eax, 0xff
+    mov [rdi + vp8_blockd_eob], eax
 
     ; begin epilog
     add rsp, stack_size
     pop rsp
-    pop rbx
+%if ABI_IS_32BIT
     pop rdi
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    pop rdi
+  %endif
+%endif
     pop rsi
+    RESTORE_GOT
     RESTORE_XMM
-    UNSHADOW_ARGS
     pop rbp
     ret
 
-;int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
-;                           short *qcoeff_ptr,short *dequant_ptr,
-;                           short *inv_scan_order, short *round_ptr,
-;                           short *quant_ptr, short *dqcoeff_ptr);
+; int vp8_fast_quantize_b_impl_sse2 | arg
+;  (short *coeff_ptr,               |  0
+;   short *qcoeff_ptr,              |  1
+;   short *dequant_ptr,             |  2
+;   short *inv_scan_order,          |  3
+;   short *round_ptr,               |  4
+;   short *quant_ptr,               |  5
+;   short *dqcoeff_ptr)             |  6
+
 global sym(vp8_fast_quantize_b_impl_sse2)
 sym(vp8_fast_quantize_b_impl_sse2):
     push rbp
@@ -300,3 +344,16 @@ sym(vp8_fast_quantize_b_impl_sse2):
     UNSHADOW_ARGS
     pop rbp
     ret
+
+SECTION_RODATA
+align 16
+zig_zag:
+  dw 0x0000, 0x0001, 0x0004, 0x0008
+  dw 0x0005, 0x0002, 0x0003, 0x0006
+  dw 0x0009, 0x000c, 0x000d, 0x000a
+  dw 0x0007, 0x000b, 0x000e, 0x000f
+inv_zig_zag:
+  dw 0x0001, 0x0002, 0x0006, 0x0007
+  dw 0x0003, 0x0005, 0x0008, 0x000d
+  dw 0x0004, 0x0009, 0x000c, 0x000e
+  dw 0x000a, 0x000b, 0x000f, 0x0010

diff --git a/vp8/encoder/x86/quantize_x86.h b/vp8/encoder/x86/quantize_x86.h
index 266efb446..6f54bec31 100644
--- a/vp8/encoder/x86/quantize_x86.h
+++ b/vp8/encoder/x86/quantize_x86.h
@@ -27,11 +27,8 @@ extern prototype_quantize_block(vp8_regular_quantize_b_sse2);
 
 #if !CONFIG_RUNTIME_CPU_DETECT
 
-// Currently, this function realizes a gain on x86 and a loss on x86_64
-#if ARCH_X86
 #undef vp8_quantize_quantb
 #define vp8_quantize_quantb vp8_regular_quantize_b_sse2
-#endif
 
 #endif
 

diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index 8327fad60..2b6bd98eb 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -106,30 +106,6 @@ static void fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
     );
 }
 
-
-int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr,
-                                     short *qcoeff_ptr,short *dequant_ptr,
-                                     const int *default_zig_zag, short *round_ptr,
-                                     short *quant_ptr, short *dqcoeff_ptr,
-                                     unsigned short zbin_oq_value,
-                                     short *zbin_boost_ptr,
-                                     short *quant_shift_ptr);
-
-static void regular_quantize_b_sse2(BLOCK *b,BLOCKD *d)
-{
-    d->eob = vp8_regular_quantize_b_impl_sse2(b->coeff,
-                                              b->zbin,
-                                              d->qcoeff,
-                                              d->dequant,
-                                              vp8_default_zig_zag1d,
-                                              b->round,
-                                              b->quant,
-                                              d->dqcoeff,
-                                              b->zbin_extra,
-                                              b->zrun_zbin_boost,
-                                              b->quant_shift);
-}
-
 int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
 static int mbblock_error_xmm(MACROBLOCK *mb, int dc)
 {
@@ -317,9 +293,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
     cpi->rtcd.encodemb.submby = vp8_subtract_mby_sse2;
     cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_sse2;
 
-#if ARCH_X86
-    cpi->rtcd.quantize.quantb = regular_quantize_b_sse2;
-#endif
+    cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2;
     cpi->rtcd.quantize.fastquantb = fast_quantize_b_sse2;
 
 #if !(CONFIG_REALTIME_ONLY)
-- 
cgit v1.2.3
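
Two notes on the mechanics of this change.

First, the offset plumbing: asm_enc_offsets.c is compiled like any other C file, obj_int_extract reads the DEFINE()d constants back out of the resulting object file and emits asm_enc_offsets.asm, and the new %include at the top of quantize_sse2.asm makes names such as vp8_block_coeff available to the assembler. This is the same scheme the ARM code already used, which is why the obj_int_extract rules in libs.mk and the Makefile are now gated on ARCH_ARM, ARCH_X86, and ARCH_X86_64 instead of ARCH_ARM alone.

Second, the end-of-block (EOB) computation: the old code tracked the EOB inside the scalar zig-zag loop, rewriting eob_tmp every time a coefficient survived quantization. The new code derives it after the fact from the finished qcoeff block: compare each coefficient against zero, invert the mask, AND it with inv_zig_zag (whose entry for each raster position is that position's zig-zag order plus one), and reduce with pmaxsw. The surviving maximum is the count of coefficients up to and including the last nonzero one in zig-zag order, which is exactly the value the old helper stored in d->eob. Below is a scalar C model of that reduction, for illustration only; the helper name and the test values are not part of the patch.

#include <stdio.h>

/* Scalar model of the pcmpeqw/pand/pmaxsw EOB selection in
 * quantize_sse2.asm. qcoeff holds the 16 quantized coefficients in
 * raster order; inv_zig_zag matches the table appended to the file:
 * entry i is (zig-zag position of raster index i) + 1. */
static int eob_from_qcoeff(const short qcoeff[16])
{
    static const short inv_zig_zag[16] =
    {
         1,  2,  6,  7,
         3,  5,  8, 13,
         4,  9, 12, 14,
        10, 11, 15, 16
    };
    int i, eob = 0;

    for (i = 0; i < 16; i++)
    {
        /* pcmpeqw + pxor build an all-ones mask for each nonzero
         * coefficient; pand keeps inv_zig_zag[i] there, 0 elsewhere. */
        short masked = qcoeff[i] ? inv_zig_zag[i] : 0;

        /* the pmaxsw/pshufd/pshuflw sequence is this horizontal max */
        if (masked > eob)
            eob = masked;
    }

    /* number of coefficients through the last nonzero one, in
     * zig-zag order; matches the old loop's (last index + 1) */
    return eob;
}

int main(void)
{
    short qcoeff[16] = { 5, 0, 0, 0, 0, 0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

    /* raster index 6 sits at zig-zag position 7, so eob = 8 */
    printf("eob = %d\n", eob_from_qcoeff(qcoeff));
    return 0;
}

Doing the selection branchlessly in SSE2 also explains why the loop macro no longer needs a second argument: with no eob bookkeeping inside the loop, each ZIGZAG_LOOP iteration only needs its own skip label rather than the label of the next iteration.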