From eca59cad0b76d86f8ca8288afa074ee47046e7c7 Mon Sep 17 00:00:00 2001
From: Johann
Date: Wed, 27 Feb 2013 17:27:59 -0800
Subject: Use intrinsics for sse2 regular quantize

Remove dependency of this function on asm_offsets. ssse3/sse4 next.

Change quant_shift calculation so it can be done using SIMD. Pre-calculate
as much as possible to simplify EOB selection.

Take advantage of qcoeff being zero'd by tying the if statements together.

Speed parity with previous implementation with gcc x86_64 linux

Change-Id: Ife97556a1eca3a74b09def1a3d04084974dff1fb
---
 vp8/common/rtcd_defs.sh           |   5 +-
 vp8/encoder/block.h               |   2 +-
 vp8/encoder/onyx_int.h            |   6 +-
 vp8/encoder/quantize.c            |  20 ++--
 vp8/encoder/x86/quantize_sse2.asm | 245 --------------------------------------
 vp8/encoder/x86/quantize_sse2.c   | 139 ++++++++++++++++++++-
 vp8/vp8cx.mk                      |   1 -
 7 files changed, 152 insertions(+), 266 deletions(-)
 delete mode 100644 vp8/encoder/x86/quantize_sse2.asm

diff --git a/vp8/common/rtcd_defs.sh b/vp8/common/rtcd_defs.sh
index 4eb96b743..ee892ded2 100644
--- a/vp8/common/rtcd_defs.sh
+++ b/vp8/common/rtcd_defs.sh
@@ -444,8 +444,9 @@ vp8_short_walsh4x4_media=vp8_short_walsh4x4_armv6
 # Quantizer
 #
 prototype void vp8_regular_quantize_b "struct block *, struct blockd *"
-specialize vp8_regular_quantize_b sse2 sse4_1
-vp8_regular_quantize_b_sse4_1=vp8_regular_quantize_b_sse4
+specialize vp8_regular_quantize_b sse2 #sse4_1
+# TODO(johann) Update sse4 implementation and re-enable
+#vp8_regular_quantize_b_sse4_1=vp8_regular_quantize_b_sse4
 
 prototype void vp8_fast_quantize_b "struct block *, struct blockd *"
 specialize vp8_fast_quantize_b sse2 ssse3 media neon
diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h
index a30f88816..cf74c7aaf 100644
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -37,7 +37,7 @@ typedef struct block
     /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
     short *quant;
     short *quant_fast;
-    unsigned char *quant_shift;
+    short *quant_shift;
     short *zbin;
     short *zrun_zbin_boost;
     short *round;
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index fb8ad357c..c8ec826c4 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -282,17 +282,17 @@ typedef struct VP8_COMP
 {
 
     DECLARE_ALIGNED(16, short, Y1quant[QINDEX_RANGE][16]);
-    DECLARE_ALIGNED(16, unsigned char, Y1quant_shift[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, Y1quant_shift[QINDEX_RANGE][16]);
     DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][16]);
     DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][16]);
 
     DECLARE_ALIGNED(16, short, Y2quant[QINDEX_RANGE][16]);
-    DECLARE_ALIGNED(16, unsigned char, Y2quant_shift[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, Y2quant_shift[QINDEX_RANGE][16]);
     DECLARE_ALIGNED(16, short, Y2zbin[QINDEX_RANGE][16]);
     DECLARE_ALIGNED(16, short, Y2round[QINDEX_RANGE][16]);
 
     DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][16]);
-    DECLARE_ALIGNED(16, unsigned char, UVquant_shift[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, UVquant_shift[QINDEX_RANGE][16]);
     DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][16]);
     DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][16]);
 
diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c
index 33c8ef055..4e2fef793 100644
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@@ -50,8 +50,8 @@ void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
         if (x >= zbin)
         {
             x += round_ptr[rc];
-            y = (((x * quant_ptr[rc]) >> 16) + x)
-                >> quant_shift_ptr[rc];                 /* quantize (x) */
+            y = ((((x * quant_ptr[rc]) >> 16) + x)
+                 * quant_shift_ptr[rc]) >> 16;          /* quantize (x) */
             x  = (y ^ sz) - sz;                         /* get the sign back */
             qcoeff_ptr[rc] = x;                         /* write to destination */
             dqcoeff_ptr[rc] = x * dequant_ptr[rc];      /* dequantized value */
@@ -113,7 +113,7 @@ void vp8_regular_quantize_b_c(BLOCK *b, BLOCKD *d)
     short *zbin_ptr        = b->zbin;
     short *round_ptr       = b->round;
     short *quant_ptr       = b->quant;
-    unsigned char *quant_shift_ptr = b->quant_shift;
+    short *quant_shift_ptr = b->quant_shift;
     short *qcoeff_ptr      = d->qcoeff;
     short *dqcoeff_ptr     = d->dqcoeff;
     short *dequant_ptr     = d->dequant;
@@ -138,8 +138,8 @@ void vp8_regular_quantize_b_c(BLOCK *b, BLOCKD *d)
         if (x >= zbin)
         {
             x += round_ptr[rc];
-            y = (((x * quant_ptr[rc]) >> 16) + x)
-                >> quant_shift_ptr[rc];                 /* quantize (x) */
+            y = ((((x * quant_ptr[rc]) >> 16) + x)
+                 * quant_shift_ptr[rc]) >> 16;          /* quantize (x) */
             x = (y ^ sz) - sz;                          /* get the sign back */
             qcoeff_ptr[rc] = x;                         /* write to destination */
             dqcoeff_ptr[rc] = x * dequant_ptr[rc];      /* dequantized value */
@@ -167,7 +167,7 @@ void vp8_strict_quantize_b_c(BLOCK *b, BLOCKD *d)
     int sz;
     short *coeff_ptr;
     short *quant_ptr;
-    unsigned char *quant_shift_ptr;
+    short *quant_shift_ptr;
     short *qcoeff_ptr;
     short *dqcoeff_ptr;
     short *dequant_ptr;
@@ -198,7 +198,7 @@ void vp8_strict_quantize_b_c(BLOCK *b, BLOCKD *d)
         if (x >= dq)
         {
             /* Quantize x. */
-            y  = (((x * quant_ptr[rc]) >> 16) + x) >> quant_shift_ptr[rc];
+            y  = ((((x * quant_ptr[rc]) >> 16) + x) * quant_shift_ptr[rc]) >> 16;
             /* Put the sign back. */
             x = (y + sz) ^ sz;
             /* Save the coefficient and its dequantized value. */
@@ -406,7 +406,7 @@ static const int qzbin_factors_y2[129] =
 #define EXACT_QUANT
 #ifdef EXACT_QUANT
 static void invert_quant(int improved_quant, short *quant,
-                         unsigned char *shift, short d)
+                         short *shift, short d)
 {
     if(improved_quant)
     {
@@ -418,11 +418,15 @@ static void invert_quant(int improved_quant, short *quant,
         t = 1 + (1<<(16+l))/d;
         *quant = (short)(t - (1<<16));
         *shift = l;
+        /* use multiplication and constant shift by 16 */
+        *shift = 1 << (16 - *shift);
     }
     else
     {
         *quant = (1 << 16) / d;
         *shift = 0;
+        /* use multiplication and constant shift by 16 */
+        *shift = 1 << (16 - *shift);
     }
 }
 
diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm
deleted file mode 100644
index b41768ce0..000000000
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ /dev/null
@@ -1,245 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-%include "vp8_asm_enc_offsets.asm"
-
-
-; void vp8_regular_quantize_b_sse2 | arg
-;  (BLOCK  *b,                     |  0
-;   BLOCKD *d)                     |  1
-
-global sym(vp8_regular_quantize_b_sse2) PRIVATE
-sym(vp8_regular_quantize_b_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SAVE_XMM 7
-    GET_GOT     rbx
-
-%if ABI_IS_32BIT
-    push        rdi
-    push        rsi
-%else
-  %if LIBVPX_YASM_WIN64
-    push        rdi
-    push        rsi
-  %endif
-%endif
-
-    ALIGN_STACK 16, rax
-    %define zrun_zbin_boost   0  ;  8
-    %define abs_minus_zbin    8  ; 32
-    %define temp_qcoeff       40 ; 32
-    %define qcoeff            72 ; 32
-    %define stack_size        104
-    sub         rsp, stack_size
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %if LIBVPX_YASM_WIN64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov        rdi, rdi                    ; BLOCK *b
-    ;mov        rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rdx, [rdi + vp8_block_coeff]        ; coeff_ptr
-    mov         rcx, [rdi + vp8_block_zbin]         ; zbin_ptr
-    movd        xmm7, [rdi + vp8_block_zbin_extra]  ; zbin_oq_value
-
-    ; z
-    movdqa      xmm0, [rdx]
-    movdqa      xmm4, [rdx + 16]
-    mov         rdx, [rdi + vp8_block_round]        ; round_ptr
-
-    pshuflw     xmm7, xmm7, 0
-    punpcklwd   xmm7, xmm7                  ; duplicated zbin_oq_value
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm5, xmm4
-
-    ; sz
-    psraw       xmm0, 15
-    psraw       xmm4, 15
-
-    ; (z ^ sz)
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-
-    ; x = abs(z)
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    movdqa      xmm2, [rcx]
-    movdqa      xmm3, [rcx + 16]
-    mov         rcx, [rdi + vp8_block_quant]        ; quant_ptr
-
-    ; *zbin_ptr + zbin_oq_value
-    paddw       xmm2, xmm7
-    paddw       xmm3, xmm7
-
-    ; x - (*zbin_ptr + zbin_oq_value)
-    psubw       xmm1, xmm2
-    psubw       xmm5, xmm3
-    movdqa      [rsp + abs_minus_zbin], xmm1
-    movdqa      [rsp + abs_minus_zbin + 16], xmm5
-
-    ; add (zbin_ptr + zbin_oq_value) back
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm3
-
-    movdqa      xmm2, [rdx]
-    movdqa      xmm6, [rdx + 16]
-
-    movdqa      xmm3, [rcx]
-    movdqa      xmm7, [rcx + 16]
-
-    ; x + round
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm6
-
-    ; y = x * quant_ptr >> 16
-    pmulhw      xmm3, xmm1
-    pmulhw      xmm7, xmm5
-
-    ; y += x
-    paddw       xmm1, xmm3
-    paddw       xmm5, xmm7
-
-    movdqa      [rsp + temp_qcoeff], xmm1
-    movdqa      [rsp + temp_qcoeff + 16], xmm5
-
-    pxor        xmm6, xmm6
-    ; zero qcoeff
-    movdqa      [rsp + qcoeff], xmm6
-    movdqa      [rsp + qcoeff + 16], xmm6
-
-    mov         rdx, [rdi + vp8_block_zrun_zbin_boost]  ; zbin_boost_ptr
-    mov         rax, [rdi + vp8_block_quant_shift]      ; quant_shift_ptr
-    mov         [rsp + zrun_zbin_boost], rdx
-
-%macro ZIGZAG_LOOP 1
-    ; x
-    movsx       ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2]
-
-    ; if (x >= zbin)
-    sub         cx, WORD PTR[rdx]           ; x - zbin
-    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
-    jl          .rq_zigzag_loop_%1          ; x < zbin
-
-    movsx       edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]
-
-    ; downshift by quant_shift[rc]
-    movsx       cx, BYTE PTR[rax + %1]      ; quant_shift_ptr[rc]
-    sar         edi, cl                     ; also sets Z bit
-    je          .rq_zigzag_loop_%1          ; !y
-    mov         WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
-    mov         rdx, [rsp + zrun_zbin_boost]        ; reset to b->zrun_zbin_boost
-.rq_zigzag_loop_%1:
-%endmacro
-; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
-ZIGZAG_LOOP  0
-ZIGZAG_LOOP  1
-ZIGZAG_LOOP  4
-ZIGZAG_LOOP  8
-ZIGZAG_LOOP  5
-ZIGZAG_LOOP  2
-ZIGZAG_LOOP  3
-ZIGZAG_LOOP  6
-ZIGZAG_LOOP  9
-ZIGZAG_LOOP 12
-ZIGZAG_LOOP 13
-ZIGZAG_LOOP 10
-ZIGZAG_LOOP  7
-ZIGZAG_LOOP 11
-ZIGZAG_LOOP 14
-ZIGZAG_LOOP 15
-
-    movdqa      xmm2, [rsp + qcoeff]
-    movdqa      xmm3, [rsp + qcoeff + 16]
-
-    mov         rcx, [rsi + vp8_blockd_dequant]     ; dequant_ptr
-    mov         rdi, [rsi + vp8_blockd_dqcoeff]     ; dqcoeff_ptr
-
-    ; y ^ sz
-    pxor        xmm2, xmm0
-    pxor        xmm3, xmm4
-    ; x = (y ^ sz) - sz
-    psubw       xmm2, xmm0
-    psubw       xmm3, xmm4
-
-    ; dequant
-    movdqa      xmm0, [rcx]
-    movdqa      xmm1, [rcx + 16]
-
-    mov         rcx, [rsi + vp8_blockd_qcoeff]      ; qcoeff_ptr
-
-    pmullw      xmm0, xmm2
-    pmullw      xmm1, xmm3
-
-    movdqa      [rcx], xmm2                 ; store qcoeff
-    movdqa      [rcx + 16], xmm3
-    movdqa      [rdi], xmm0                 ; store dqcoeff
-    movdqa      [rdi + 16], xmm1
-
-    mov         rcx, [rsi + vp8_blockd_eob]
-
-    ; select the last value (in zig_zag order) for EOB
-    pcmpeqw     xmm2, xmm6
-    pcmpeqw     xmm3, xmm6
-    ; !
-    pcmpeqw     xmm6, xmm6
-    pxor        xmm2, xmm6
-    pxor        xmm3, xmm6
-    ; mask inv_zig_zag
-    pand        xmm2, [GLOBAL(inv_zig_zag)]
-    pand        xmm3, [GLOBAL(inv_zig_zag + 16)]
-    ; select the max value
-    pmaxsw      xmm2, xmm3
-    pshufd      xmm3, xmm2, 00001110b
-    pmaxsw      xmm2, xmm3
-    pshuflw     xmm3, xmm2, 00001110b
-    pmaxsw      xmm2, xmm3
-    pshuflw     xmm3, xmm2, 00000001b
-    pmaxsw      xmm2, xmm3
-    movd        eax, xmm2
-    and         eax, 0xff
-
-    mov         BYTE PTR [rcx], al          ; store eob
-
-    ; begin epilog
-    add         rsp, stack_size
-    pop         rsp
-%if ABI_IS_32BIT
-    pop         rsi
-    pop         rdi
-%else
-  %if LIBVPX_YASM_WIN64
-    pop         rsi
-    pop         rdi
-  %endif
-%endif
-    RESTORE_GOT
-    RESTORE_XMM
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-inv_zig_zag:
-  dw 0x0001, 0x0002, 0x0006, 0x0007
-  dw 0x0003, 0x0005, 0x0008, 0x000d
-  dw 0x0004, 0x0009, 0x000c, 0x000e
-  dw 0x000a, 0x000b, 0x000f, 0x0010
diff --git a/vp8/encoder/x86/quantize_sse2.c b/vp8/encoder/x86/quantize_sse2.c
index 55d57ad62..aa2315a58 100644
--- a/vp8/encoder/x86/quantize_sse2.c
+++ b/vp8/encoder/x86/quantize_sse2.c
@@ -9,13 +9,140 @@
  */
 
 
-#include "vp8/common/blockd.h"
-#include "vp8/common/entropy.h"
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vpx_ports/x86.h"
+#include "vpx_mem/vpx_mem.h"
 #include "vp8/encoder/block.h"
-
-#include <mmintrin.h> //MMX
-#include <xmmintrin.h> //SSE
-#include <emmintrin.h> //SSE2
+#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */
+
+#include <mmintrin.h> /* MMX */
+#include <xmmintrin.h> /* SSE */
+#include <emmintrin.h> /* SSE2 */
+
+#define SELECT_EOB(i, z) \
+    do { \
+        __label__ select_eob_end; \
+        short boost = *zbin_boost_ptr; \
+        int cmp = (x[z] < boost) | (y[z] == 0); \
+        zbin_boost_ptr++; \
+        if (cmp) \
+            goto select_eob_end; \
+        qcoeff_ptr[z] = y[z]; \
+        eob = i; \
+        zbin_boost_ptr = b->zrun_zbin_boost; \
+        select_eob_end:; \
+    } while (0)
+
+void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d)
+{
+    char eob = 0;
+    short *zbin_boost_ptr  = b->zrun_zbin_boost;
+    short *qcoeff_ptr      = d->qcoeff;
+    DECLARE_ALIGNED_ARRAY(16, short, x, 16);
+    DECLARE_ALIGNED_ARRAY(16, short, y, 16);
+
+    __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1;
+    __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
+    __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
+    __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
+    __m128i z1 = _mm_load_si128((__m128i *)(b->coeff+8));
+    __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra);
+    __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin));
+    __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8));
+    __m128i round0 = _mm_load_si128((__m128i *)(b->round));
+    __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
+    __m128i quant0 = _mm_load_si128((__m128i *)(b->quant));
+    __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));
+    __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
+    __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
+
+    vpx_memset(qcoeff_ptr, 0, 32);
+
+    /* Duplicate to all lanes. */
+    zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
+    zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);
+
+    /* Sign of z: z >> 15 */
+    sz0 = _mm_srai_epi16(z0, 15);
+    sz1 = _mm_srai_epi16(z1, 15);
+
+    /* x = abs(z): (z ^ sz) - sz */
+    x0 = _mm_xor_si128(z0, sz0);
+    x1 = _mm_xor_si128(z1, sz1);
+    x0 = _mm_sub_epi16(x0, sz0);
+    x1 = _mm_sub_epi16(x1, sz1);
+
+    /* zbin[] + zbin_extra */
+    zbin0 = _mm_add_epi16(zbin0, zbin_extra);
+    zbin1 = _mm_add_epi16(zbin1, zbin_extra);
+
+    /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance
+     * the equation because boost is the only value which can change:
+     * x - (zbin[] + extra) >= boost */
+    x_minus_zbin0 = _mm_sub_epi16(x0, zbin0);
+    x_minus_zbin1 = _mm_sub_epi16(x1, zbin1);
+
+    _mm_store_si128((__m128i *)(x), x_minus_zbin0);
+    _mm_store_si128((__m128i *)(x + 8), x_minus_zbin1);
+
+    /* All the remaining calculations are valid whether they are done now with
+     * simd or later inside the loop one at a time. */
+    x0 = _mm_add_epi16(x0, round0);
+    x1 = _mm_add_epi16(x1, round1);
+
+    y0 = _mm_mulhi_epi16(x0, quant0);
+    y1 = _mm_mulhi_epi16(x1, quant1);
+
+    y0 = _mm_add_epi16(y0, x0);
+    y1 = _mm_add_epi16(y1, x1);
+
+    /* Instead of shifting each value independently we convert the scaling
+     * factor with 1 << (16 - shift) so we can use multiply/return high half. */
+    y0 = _mm_mulhi_epi16(y0, quant_shift0);
+    y1 = _mm_mulhi_epi16(y1, quant_shift1);
+
+    /* Return the sign: (y ^ sz) - sz */
+    y0 = _mm_xor_si128(y0, sz0);
+    y1 = _mm_xor_si128(y1, sz1);
+    y0 = _mm_sub_epi16(y0, sz0);
+    y1 = _mm_sub_epi16(y1, sz1);
+
+    _mm_store_si128((__m128i *)(y), y0);
+    _mm_store_si128((__m128i *)(y + 8), y1);
+
+    zbin_boost_ptr = b->zrun_zbin_boost;
+
+    /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */
+    SELECT_EOB(1, 0);
+    SELECT_EOB(2, 1);
+    SELECT_EOB(3, 4);
+    SELECT_EOB(4, 8);
+    SELECT_EOB(5, 5);
+    SELECT_EOB(6, 2);
+    SELECT_EOB(7, 3);
+    SELECT_EOB(8, 6);
+    SELECT_EOB(9, 9);
+    SELECT_EOB(10, 12);
+    SELECT_EOB(11, 13);
+    SELECT_EOB(12, 10);
+    SELECT_EOB(13, 7);
+    SELECT_EOB(14, 11);
+    SELECT_EOB(15, 14);
+    SELECT_EOB(16, 15);
+
+    y0 = _mm_load_si128((__m128i *)(d->qcoeff));
+    y1 = _mm_load_si128((__m128i *)(d->qcoeff + 8));
+
+    /* dqcoeff = qcoeff * dequant */
+    y0 = _mm_mullo_epi16(y0, dequant0);
+    y1 = _mm_mullo_epi16(y1, dequant1);
+
+    _mm_store_si128((__m128i *)(d->dqcoeff), y0);
+    _mm_store_si128((__m128i *)(d->dqcoeff + 8), y1);
+
+    *d->eob = eob;
+}
 
 void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
 {
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index f6feafb6e..a77971eb5 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -90,7 +90,6 @@ VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm
 
 # TODO(johann) make this generic
 ifeq ($(HAVE_SSE2),yes)
-- 
cgit v1.2.3
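
Editor's note (not part of the patch): the core of this change is replacing the per-coefficient
variable shift `y >> quant_shift[rc]` with a multiply by the pre-computed factor
`1 << (16 - shift)` followed by taking the high 16 bits, which is what `_mm_mulhi_epi16`
(pmulhw) returns. The standalone scalar sketch below illustrates why the two forms agree.
The value ranges it assumes (shift >= 2, 0 <= y < 32768) are inferred from what invert_quant
produces for VP8's dequant values, not something the patch states explicitly.

    /* Standalone sketch: check that multiplying by 1 << (16 - shift) and
     * keeping the high 16 bits matches a plain right shift, for the
     * assumed ranges shift in [2, 8] and 0 <= y < 32768. */
    #include <assert.h>
    #include <stdio.h>

    /* Mimics pmulhw: signed 16x16 -> 32 multiply, keep the high 16 bits. */
    static short mulhi16(short a, short b)
    {
        return (short)(((int)a * (int)b) >> 16);
    }

    int main(void)
    {
        short y, shift;

        for (shift = 2; shift <= 8; shift++)
        {
            /* Pre-computed factor, as stored in b->quant_shift by invert_quant. */
            short factor = (short)(1 << (16 - shift));

            for (y = 0; y < 32767; y++)
                assert(mulhi16(y, factor) == (y >> shift));
        }

        printf("multiply/high-half matches variable shift\n");
        return 0;
    }

Because the factor is the same width as the other quantizer tables, quant_shift can be
loaded and applied with the same 16-bit SIMD operations as quant and round, which is why
the patch also widens quant_shift from unsigned char to short.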