summary | refs | log | tree | commit | diff
path: root/vp8/encoder
diff options
context:
space:
mode:
author: Johann <johannkoenig@google.com>, 2011-04-06 10:53:55 -0700
committer: Code Review <code-review@webmproject.org>, 2011-04-06 10:53:55 -0700
commit: 2de858b9fcdc1b8c0db265047c5bf387fe31de3a (patch)
tree: 5e342824d83b48cf2176b7df9d5d9bf38104b019 /vp8/encoder
parent: 9e9f61a317daccb7ad3a7106b4177ddf14e07992 (diff)
parent: c32e0ecc592d12573199c992f0fb710b7785c5eb (diff)
download: libvpx-2de858b9fcdc1b8c0db265047c5bf387fe31de3a.tar
libvpx-2de858b9fcdc1b8c0db265047c5bf387fe31de3a.tar.gz
libvpx-2de858b9fcdc1b8c0db265047c5bf387fe31de3a.tar.bz2
libvpx-2de858b9fcdc1b8c0db265047c5bf387fe31de3a.zip
Merge "use asm_offsets with vp8_fast_quantize_b_sse2"
Diffstat (limited to 'vp8/encoder')
-rw-r--r--  vp8/encoder/x86/quantize_sse2.asm       125
-rw-r--r--  vp8/encoder/x86/quantize_x86.h            4
-rw-r--r--  vp8/encoder/x86/x86_csystemdependent.c   27
3 files changed, 83 insertions, 73 deletions
diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm
index e00faebd1..5e40dc7de 100644
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -233,72 +233,97 @@ ZIGZAG_LOOP 15
pop rbp
ret
-; int vp8_fast_quantize_b_impl_sse2 | arg
-; (short *coeff_ptr, | 0
-; short *qcoeff_ptr, | 1
-; short *dequant_ptr, | 2
-; short *inv_scan_order, | 3
-; short *round_ptr, | 4
-; short *quant_ptr, | 5
-; short *dqcoeff_ptr) | 6
-
-global sym(vp8_fast_quantize_b_impl_sse2)
-sym(vp8_fast_quantize_b_impl_sse2):
+; void vp8_fast_quantize_b_sse2 | arg
+; (BLOCK *b, | 0
+; BLOCKD *d) | 1
+
+global sym(vp8_fast_quantize_b_sse2)
+sym(vp8_fast_quantize_b_sse2):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+
+%if ABI_IS_32BIT
+ push rdi
push rsi
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
push rdi
+ push rsi
+ %else
+ ; these registers are used for passing arguments
+ %endif
+%endif
+
; end prolog
- mov rdx, arg(0) ;coeff_ptr
- mov rcx, arg(2) ;dequant_ptr
- mov rdi, arg(4) ;round_ptr
- mov rsi, arg(5) ;quant_ptr
+%if ABI_IS_32BIT
+ mov rdi, arg(0) ; BLOCK *b
+ mov rsi, arg(1) ; BLOCKD *d
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ mov rdi, rcx ; BLOCK *b
+ mov rsi, rdx ; BLOCKD *d
+ %else
+ ;mov rdi, rdi ; BLOCK *b
+ ;mov rsi, rsi ; BLOCKD *d
+ %endif
+%endif
- movdqa xmm0, XMMWORD PTR[rdx]
- movdqa xmm4, XMMWORD PTR[rdx + 16]
+ mov rax, [rdi + vp8_block_coeff]
+ mov rcx, [rdi + vp8_block_round]
+ mov rdx, [rdi + vp8_block_quant_fast]
- movdqa xmm2, XMMWORD PTR[rdi] ;round lo
- movdqa xmm3, XMMWORD PTR[rdi + 16] ;round hi
+ ; z = coeff
+ movdqa xmm0, [rax]
+ movdqa xmm4, [rax + 16]
+ ; dup z so we can save sz
movdqa xmm1, xmm0
movdqa xmm5, xmm4
- psraw xmm0, 15 ;sign of z (aka sz)
- psraw xmm4, 15 ;sign of z (aka sz)
+ ; sz = z >> 15
+ psraw xmm0, 15
+ psraw xmm4, 15
+ ; x = abs(z) = (z ^ sz) - sz
pxor xmm1, xmm0
pxor xmm5, xmm4
- psubw xmm1, xmm0 ;x = abs(z)
- psubw xmm5, xmm4 ;x = abs(z)
-
- paddw xmm1, xmm2
- paddw xmm5, xmm3
+ psubw xmm1, xmm0
+ psubw xmm5, xmm4
- pmulhw xmm1, XMMWORD PTR[rsi]
- pmulhw xmm5, XMMWORD PTR[rsi + 16]
+ ; x += round
+ paddw xmm1, [rcx]
+ paddw xmm5, [rcx + 16]
- mov rdi, arg(1) ;qcoeff_ptr
- mov rsi, arg(6) ;dqcoeff_ptr
+ mov rax, [rsi + vp8_blockd_qcoeff]
+ mov rcx, [rsi + vp8_blockd_dequant]
+ mov rdi, [rsi + vp8_blockd_dqcoeff]
- movdqa xmm2, XMMWORD PTR[rcx]
- movdqa xmm3, XMMWORD PTR[rcx + 16]
+ ; y = x * quant >> 16
+ pmulhw xmm1, [rdx]
+ pmulhw xmm5, [rdx + 16]
+ ; x = (y ^ sz) - sz
pxor xmm1, xmm0
pxor xmm5, xmm4
psubw xmm1, xmm0
psubw xmm5, xmm4
- movdqa XMMWORD PTR[rdi], xmm1
- movdqa XMMWORD PTR[rdi + 16], xmm5
+ ; qcoeff = x
+ movdqa [rax], xmm1
+ movdqa [rax + 16], xmm5
- pmullw xmm2, xmm1
- pmullw xmm3, xmm5
+ ; x * dequant
+ movdqa xmm2, xmm1
+ movdqa xmm3, xmm5
+ pmullw xmm2, [rcx]
+ pmullw xmm3, [rcx + 16]
- mov rdi, arg(3) ;inv_scan_order
+ ; dqcoeff = x * dequant
+ movdqa [rdi], xmm2
+ movdqa [rdi + 16], xmm3
- ; Start with 16
pxor xmm4, xmm4 ;clear all bits
pcmpeqw xmm1, xmm4
pcmpeqw xmm5, xmm4
@@ -307,8 +332,8 @@ sym(vp8_fast_quantize_b_impl_sse2):
pxor xmm1, xmm4
pxor xmm5, xmm4
- pand xmm1, XMMWORD PTR[rdi]
- pand xmm5, XMMWORD PTR[rdi+16]
+ pand xmm1, [GLOBAL(inv_zig_zag)]
+ pand xmm5, [GLOBAL(inv_zig_zag + 16)]
pmaxsw xmm1, xmm5
@@ -327,16 +352,22 @@ sym(vp8_fast_quantize_b_impl_sse2):
pmaxsw xmm1, xmm5
- movd rax, xmm1
- and rax, 0xff
-
- movdqa XMMWORD PTR[rsi], xmm2 ;store dqcoeff
- movdqa XMMWORD PTR[rsi + 16], xmm3 ;store dqcoeff
+ movd eax, xmm1
+ and eax, 0xff
+ mov [rsi + vp8_blockd_eob], eax
; begin epilog
+%if ABI_IS_32BIT
+ pop rsi
pop rdi
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
pop rsi
- UNSHADOW_ARGS
+ pop rdi
+ %endif
+%endif
+
+ RESTORE_GOT
pop rbp
ret
diff --git a/vp8/encoder/x86/quantize_x86.h b/vp8/encoder/x86/quantize_x86.h
index 6f54bec31..df2e0bc39 100644
--- a/vp8/encoder/x86/quantize_x86.h
+++ b/vp8/encoder/x86/quantize_x86.h
@@ -24,12 +24,16 @@
#if HAVE_SSE2
extern prototype_quantize_block(vp8_regular_quantize_b_sse2);
+extern prototype_quantize_block(vp8_fast_quantize_b_sse2);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_quantize_quantb
#define vp8_quantize_quantb vp8_regular_quantize_b_sse2
+#undef vp8_quantize_fastquantb
+#define vp8_quantize_fastquantb vp8_fast_quantize_b_sse2
+
#endif
#endif
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index 2b6bd98eb..8bceecec4 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -81,31 +81,6 @@ static void subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
#endif
#if HAVE_SSE2
-int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
- short *qcoeff_ptr, short *dequant_ptr,
- const short *inv_scan_order, short *round_ptr,
- short *quant_ptr, short *dqcoeff_ptr);
-static void fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
-{
- short *scan_mask = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;
- short *coeff_ptr = b->coeff;
- short *round_ptr = b->round;
- short *quant_ptr = b->quant_fast;
- short *qcoeff_ptr = d->qcoeff;
- short *dqcoeff_ptr = d->dqcoeff;
- short *dequant_ptr = d->dequant;
-
- d->eob = vp8_fast_quantize_b_impl_sse2(
- coeff_ptr,
- qcoeff_ptr,
- dequant_ptr,
- vp8_default_inv_zig_zag,
- round_ptr,
- quant_ptr,
- dqcoeff_ptr
- );
-}
-
int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
static int mbblock_error_xmm(MACROBLOCK *mb, int dc)
{
@@ -294,7 +269,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_sse2;
cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2;
- cpi->rtcd.quantize.fastquantb = fast_quantize_b_sse2;
+ cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse2;
#if !(CONFIG_REALTIME_ONLY)
cpi->rtcd.temporal.apply = vp8_temporal_filter_apply_sse2;