diff options
author | John Koleszar <jkoleszar@google.com> | 2013-02-28 16:25:38 -0800 |
---|---|---|
committer | John Koleszar <jkoleszar@google.com> | 2013-03-01 11:06:05 -0800 |
commit | 69c67c9531b08c1b15985b351677a162739af43e (patch) | |
tree | 5e07512832085e972ce4bdc7e96472d2c4f5c545 /vp8/encoder/x86 | |
parent | db4dc6f0c0c30f56d675a489c93b22517eedb9f8 (diff) | |
parent | a07bed2b2b2ca84fdb145ee8e4f1ca100d39915e (diff) | |
download | libvpx-69c67c9531b08c1b15985b351677a162739af43e.tar libvpx-69c67c9531b08c1b15985b351677a162739af43e.tar.gz libvpx-69c67c9531b08c1b15985b351677a162739af43e.tar.bz2 libvpx-69c67c9531b08c1b15985b351677a162739af43e.zip |
Merge master branch into experimental
Picks up some build system changes, compiler warning fixes, etc.
Change-Id: I2712f99e653502818a101a72696ad54018152d4e
Diffstat (limited to 'vp8/encoder/x86')
-rw-r--r-- | vp8/encoder/x86/dct_sse2.asm | 4 | ||||
-rw-r--r-- | vp8/encoder/x86/quantize_sse2.asm | 147 | ||||
-rw-r--r-- | vp8/encoder/x86/quantize_sse2.c | 103 | ||||
-rw-r--r-- | vp8/encoder/x86/quantize_sse4.asm | 6 | ||||
-rw-r--r-- | vp8/encoder/x86/quantize_ssse3.asm | 6 | ||||
-rw-r--r-- | vp8/encoder/x86/temporal_filter_apply_sse2.asm | 2 |
6 files changed, 115 insertions, 153 deletions
diff --git a/vp8/encoder/x86/dct_sse2.asm b/vp8/encoder/x86/dct_sse2.asm index d880ce0c4..d06bca592 100644 --- a/vp8/encoder/x86/dct_sse2.asm +++ b/vp8/encoder/x86/dct_sse2.asm @@ -29,7 +29,7 @@ movsxd rax, dword ptr arg(2) lea rcx, [rsi + rax*2] %else - %ifidn __OUTPUT_FORMAT__,x64 + %if LIBVPX_YASM_WIN64 %define input rcx %define output rdx %define pitch r8 @@ -53,7 +53,7 @@ RESTORE_GOT pop rbp %else - %ifidn __OUTPUT_FORMAT__,x64 + %if LIBVPX_YASM_WIN64 RESTORE_XMM %endif %endif diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm index fe9464b3d..b41768ce0 100644 --- a/vp8/encoder/x86/quantize_sse2.asm +++ b/vp8/encoder/x86/quantize_sse2.asm @@ -27,7 +27,7 @@ sym(vp8_regular_quantize_b_sse2): push rdi push rsi %else - %ifidn __OUTPUT_FORMAT__,x64 + %if LIBVPX_YASM_WIN64 push rdi push rsi %endif @@ -46,7 +46,7 @@ sym(vp8_regular_quantize_b_sse2): mov rdi, arg(0) ; BLOCK *b mov rsi, arg(1) ; BLOCKD *d %else - %ifidn __OUTPUT_FORMAT__,x64 + %if LIBVPX_YASM_WIN64 mov rdi, rcx ; BLOCK *b mov rsi, rdx ; BLOCKD *d %else @@ -226,7 +226,7 @@ ZIGZAG_LOOP 15 pop rsi pop rdi %else - %ifidn __OUTPUT_FORMAT__,x64 + %if LIBVPX_YASM_WIN64 pop rsi pop rdi %endif @@ -236,147 +236,6 @@ ZIGZAG_LOOP 15 pop rbp ret -; void vp8_fast_quantize_b_sse2 | arg -; (BLOCK *b, | 0 -; BLOCKD *d) | 1 - -global sym(vp8_fast_quantize_b_sse2) PRIVATE -sym(vp8_fast_quantize_b_sse2): - push rbp - mov rbp, rsp - GET_GOT rbx - -%if ABI_IS_32BIT - push rdi - push rsi -%else - %ifidn __OUTPUT_FORMAT__,x64 - push rdi - push rsi - %else - ; these registers are used for passing arguments - %endif -%endif - - ; end prolog - -%if ABI_IS_32BIT - mov rdi, arg(0) ; BLOCK *b - mov rsi, arg(1) ; BLOCKD *d -%else - %ifidn __OUTPUT_FORMAT__,x64 - mov rdi, rcx ; BLOCK *b - mov rsi, rdx ; BLOCKD *d - %else - ;mov rdi, rdi ; BLOCK *b - ;mov rsi, rsi ; BLOCKD *d - %endif -%endif - - mov rax, [rdi + vp8_block_coeff] - mov rcx, [rdi + vp8_block_round] - mov rdx, [rdi + vp8_block_quant_fast] - - ; z = coeff - movdqa xmm0, [rax] - movdqa xmm4, [rax + 16] - - ; dup z so we can save sz - movdqa xmm1, xmm0 - movdqa xmm5, xmm4 - - ; sz = z >> 15 - psraw xmm0, 15 - psraw xmm4, 15 - - ; x = abs(z) = (z ^ sz) - sz - pxor xmm1, xmm0 - pxor xmm5, xmm4 - psubw xmm1, xmm0 - psubw xmm5, xmm4 - - ; x += round - paddw xmm1, [rcx] - paddw xmm5, [rcx + 16] - - mov rax, [rsi + vp8_blockd_qcoeff] - mov rcx, [rsi + vp8_blockd_dequant] - mov rdi, [rsi + vp8_blockd_dqcoeff] - - ; y = x * quant >> 16 - pmulhw xmm1, [rdx] - pmulhw xmm5, [rdx + 16] - - ; x = (y ^ sz) - sz - pxor xmm1, xmm0 - pxor xmm5, xmm4 - psubw xmm1, xmm0 - psubw xmm5, xmm4 - - ; qcoeff = x - movdqa [rax], xmm1 - movdqa [rax + 16], xmm5 - - ; x * dequant - movdqa xmm2, xmm1 - movdqa xmm3, xmm5 - pmullw xmm2, [rcx] - pmullw xmm3, [rcx + 16] - - ; dqcoeff = x * dequant - movdqa [rdi], xmm2 - movdqa [rdi + 16], xmm3 - - pxor xmm4, xmm4 ;clear all bits - pcmpeqw xmm1, xmm4 - pcmpeqw xmm5, xmm4 - - pcmpeqw xmm4, xmm4 ;set all bits - pxor xmm1, xmm4 - pxor xmm5, xmm4 - - pand xmm1, [GLOBAL(inv_zig_zag)] - pand xmm5, [GLOBAL(inv_zig_zag + 16)] - - pmaxsw xmm1, xmm5 - - mov rcx, [rsi + vp8_blockd_eob] - - ; now down to 8 - pshufd xmm5, xmm1, 00001110b - - pmaxsw xmm1, xmm5 - - ; only 4 left - pshuflw xmm5, xmm1, 00001110b - - pmaxsw xmm1, xmm5 - - ; okay, just 2! - pshuflw xmm5, xmm1, 00000001b - - pmaxsw xmm1, xmm5 - - movd eax, xmm1 - and eax, 0xff - - mov BYTE PTR [rcx], al ; store eob - - ; begin epilog -%if ABI_IS_32BIT - pop rsi - pop rdi -%else - %ifidn __OUTPUT_FORMAT__,x64 - pop rsi - pop rdi - %endif -%endif - - RESTORE_GOT - pop rbp - ret - SECTION_RODATA align 16 inv_zig_zag: diff --git a/vp8/encoder/x86/quantize_sse2.c b/vp8/encoder/x86/quantize_sse2.c new file mode 100644 index 000000000..55d57ad62 --- /dev/null +++ b/vp8/encoder/x86/quantize_sse2.c @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vp8/common/blockd.h" +#include "vp8/common/entropy.h" +#include "vp8/encoder/block.h" + +#include <mmintrin.h> //MMX +#include <xmmintrin.h> //SSE +#include <emmintrin.h> //SSE2 + +void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d) +{ + __m128i z0 = _mm_load_si128((__m128i *)(b->coeff)); + __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8)); + __m128i round0 = _mm_load_si128((__m128i *)(b->round)); + __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8)); + __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast)); + __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8)); + __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant)); + __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8)); + __m128i inv_zig_zag0 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag)); + __m128i inv_zig_zag1 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag + 8)); + + __m128i sz0, sz1, x0, x1, y0, y1, xdq0, xdq1, zeros, ones; + + /* sign of z: z >> 15 */ + sz0 = _mm_srai_epi16(z0, 15); + sz1 = _mm_srai_epi16(z1, 15); + + /* x = abs(z): (z ^ sz) - sz */ + x0 = _mm_xor_si128(z0, sz0); + x1 = _mm_xor_si128(z1, sz1); + x0 = _mm_sub_epi16(x0, sz0); + x1 = _mm_sub_epi16(x1, sz1); + + /* x += round */ + x0 = _mm_add_epi16(x0, round0); + x1 = _mm_add_epi16(x1, round1); + + /* y = (x * quant) >> 16 */ + y0 = _mm_mulhi_epi16(x0, quant_fast0); + y1 = _mm_mulhi_epi16(x1, quant_fast1); + + /* x = abs(y) = (y ^ sz) - sz */ + y0 = _mm_xor_si128(y0, sz0); + y1 = _mm_xor_si128(y1, sz1); + x0 = _mm_sub_epi16(y0, sz0); + x1 = _mm_sub_epi16(y1, sz1); + + /* qcoeff = x */ + _mm_store_si128((__m128i *)(d->qcoeff), x0); + _mm_store_si128((__m128i *)(d->qcoeff + 8), x1); + + /* x * dequant */ + xdq0 = _mm_mullo_epi16(x0, dequant0); + xdq1 = _mm_mullo_epi16(x1, dequant1); + + /* dqcoeff = x * dequant */ + _mm_store_si128((__m128i *)(d->dqcoeff), xdq0); + _mm_store_si128((__m128i *)(d->dqcoeff + 8), xdq1); + + /* build a mask for the zig zag */ + zeros = _mm_setzero_si128(); + + x0 = _mm_cmpeq_epi16(x0, zeros); + x1 = _mm_cmpeq_epi16(x1, zeros); + + ones = _mm_cmpeq_epi16(zeros, zeros); + + x0 = _mm_xor_si128(x0, ones); + x1 = _mm_xor_si128(x1, ones); + + x0 = _mm_and_si128(x0, inv_zig_zag0); + x1 = _mm_and_si128(x1, inv_zig_zag1); + + x0 = _mm_max_epi16(x0, x1); + + /* now down to 8 */ + x1 = _mm_shuffle_epi32(x0, 0xE); // 0b00001110 + + x0 = _mm_max_epi16(x0, x1); + + /* only 4 left */ + x1 = _mm_shufflelo_epi16(x0, 0xE); // 0b00001110 + + x0 = _mm_max_epi16(x0, x1); + + /* okay, just 2! */ + x1 = _mm_shufflelo_epi16(x0, 0x1); // 0b00000001 + + x0 = _mm_max_epi16(x0, x1); + + *d->eob = 0xFF & _mm_cvtsi128_si32(x0); +} diff --git a/vp8/encoder/x86/quantize_sse4.asm b/vp8/encoder/x86/quantize_sse4.asm index f21146457..dbd171bfc 100644 --- a/vp8/encoder/x86/quantize_sse4.asm +++ b/vp8/encoder/x86/quantize_sse4.asm @@ -31,7 +31,7 @@ sym(vp8_regular_quantize_b_sse4): %define stack_size 32 sub rsp, stack_size %else - %ifidn __OUTPUT_FORMAT__,x64 + %if LIBVPX_YASM_WIN64 SAVE_XMM 8, u push rdi push rsi @@ -43,7 +43,7 @@ sym(vp8_regular_quantize_b_sse4): mov rdi, arg(0) ; BLOCK *b mov rsi, arg(1) ; BLOCKD *d %else - %ifidn __OUTPUT_FORMAT__,x64 + %if LIBVPX_YASM_WIN64 mov rdi, rcx ; BLOCK *b mov rsi, rdx ; BLOCKD *d %else @@ -240,7 +240,7 @@ ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8 pop rbp %else %undef xmm5 - %ifidn __OUTPUT_FORMAT__,x64 + %if LIBVPX_YASM_WIN64 pop rsi pop rdi RESTORE_XMM diff --git a/vp8/encoder/x86/quantize_ssse3.asm b/vp8/encoder/x86/quantize_ssse3.asm index 35368894d..7b1dc119f 100644 --- a/vp8/encoder/x86/quantize_ssse3.asm +++ b/vp8/encoder/x86/quantize_ssse3.asm @@ -27,7 +27,7 @@ sym(vp8_fast_quantize_b_ssse3): push rdi push rsi %else - %ifidn __OUTPUT_FORMAT__,x64 + %if LIBVPX_YASM_WIN64 push rdi push rsi %endif @@ -38,7 +38,7 @@ sym(vp8_fast_quantize_b_ssse3): mov rdi, arg(0) ; BLOCK *b mov rsi, arg(1) ; BLOCKD *d %else - %ifidn __OUTPUT_FORMAT__,x64 + %if LIBVPX_YASM_WIN64 mov rdi, rcx ; BLOCK *b mov rsi, rdx ; BLOCKD *d %else @@ -122,7 +122,7 @@ sym(vp8_fast_quantize_b_ssse3): pop rsi pop rdi %else - %ifidn __OUTPUT_FORMAT__,x64 + %if LIBVPX_YASM_WIN64 pop rsi pop rdi %endif diff --git a/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/vp8/encoder/x86/temporal_filter_apply_sse2.asm index ce9d9836b..bd92b398a 100644 --- a/vp8/encoder/x86/temporal_filter_apply_sse2.asm +++ b/vp8/encoder/x86/temporal_filter_apply_sse2.asm @@ -50,7 +50,7 @@ sym(vp8_temporal_filter_apply_sse2): ; 0x8000 >> (16 - strength) mov rdx, 16 sub rdx, arg(4) ; 16 - strength - movd xmm4, rdx ; can't use rdx w/ shift + movq xmm4, rdx ; can't use rdx w/ shift movdqa xmm5, [GLOBAL(_const_top_bit)] psrlw xmm5, xmm4 movdqa [rsp + rounding_bit], xmm5 |