summary | refs | log | tree | commit | diff
path: root/vp8
diff options
context:
space:
mode:
author Scott LaVarnway <slavarnway@google.com> 2010-10-11 09:34:48 -0700
committer Code Review <code-review@webmproject.org> 2010-10-11 09:34:48 -0700
commit 6b1b28a83c3aa4eb25173a95708b55b722190e99 (patch)
tree 8f577c28d17a21ae3a3b27b57a71bbd5d53b5a52 /vp8
parent 4d2b178a22d11ae029ab85134399dfbd7c362d7c (diff)
parent d860f685b85ffafb32dfc20da53aaa81cb62c5c5 (diff)
download libvpx-6b1b28a83c3aa4eb25173a95708b55b722190e99.tar
libvpx-6b1b28a83c3aa4eb25173a95708b55b722190e99.tar.gz
libvpx-6b1b28a83c3aa4eb25173a95708b55b722190e99.tar.bz2
libvpx-6b1b28a83c3aa4eb25173a95708b55b722190e99.zip
Merge "Added vp8_fast_quantize_b_sse2"
Diffstat (limited to 'vp8')
-rw-r--r-- vp8/encoder/x86/quantize_mmx.asm 153
-rw-r--r-- vp8/encoder/x86/quantize_sse2.asm 134
-rw-r--r-- vp8/encoder/x86/x86_csystemdependent.c 15
3 files changed, 142 insertions, 160 deletions
diff --git a/vp8/encoder/x86/quantize_mmx.asm b/vp8/encoder/x86/quantize_mmx.asm
index 51cd94078..f29a54ecd 100644
--- a/vp8/encoder/x86/quantize_mmx.asm
+++ b/vp8/encoder/x86/quantize_mmx.asm
@@ -284,156 +284,3 @@ sym(vp8_fast_quantize_b_impl_mmx):
UNSHADOW_ARGS
pop rbp
ret
-
-
-;int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr,
-; short *qcoeff_ptr,short *dequant_ptr,
-; short *scan_mask, short *round_ptr,
-; short *quant_ptr, short *dqcoeff_ptr);
-global sym(vp8_fast_quantize_b_impl_sse)
-sym(vp8_fast_quantize_b_impl_sse):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- push rsi
- push rdi
- ; end prolog
-
-
- mov rsi, arg(0) ;coeff_ptr
- movdqa xmm0, [rsi]
-
- mov rax, arg(1) ;zbin_ptr
- movdqa xmm1, [rax]
-
- movdqa xmm3, xmm0
- psraw xmm0, 15
-
- pxor xmm3, xmm0
- psubw xmm3, xmm0 ; abs
-
- movdqa xmm2, xmm3
- pcmpgtw xmm1, xmm2
-
- pandn xmm1, xmm2
- movdqa xmm3, xmm1
-
- mov rdx, arg(6) ; quant_ptr
- movdqa xmm1, [rdx]
-
- mov rcx, arg(5) ; round_ptr
- movdqa xmm2, [rcx]
-
- paddw xmm3, xmm2
- pmulhuw xmm3, xmm1
-
- pxor xmm3, xmm0
- psubw xmm3, xmm0 ;gain the sign back
-
- mov rdi, arg(2) ;qcoeff_ptr
- movdqa xmm0, xmm3
-
- movdqa [rdi], xmm3
-
- mov rax, arg(3) ;dequant_ptr
- movdqa xmm2, [rax]
-
- pmullw xmm3, xmm2
- mov rax, arg(7) ;dqcoeff_ptr
-
- movdqa [rax], xmm3
-
- ; next 8
- movdqa xmm4, [rsi+16]
-
- mov rax, arg(1) ;zbin_ptr
- movdqa xmm5, [rax+16]
-
- movdqa xmm7, xmm4
- psraw xmm4, 15
-
- pxor xmm7, xmm4
- psubw xmm7, xmm4 ; abs
-
- movdqa xmm6, xmm7
- pcmpgtw xmm5, xmm6
-
- pandn xmm5, xmm6
- movdqa xmm7, xmm5
-
- movdqa xmm5, [rdx+16]
- movdqa xmm6, [rcx+16]
-
-
- paddw xmm7, xmm6
- pmulhuw xmm7, xmm5
-
- pxor xmm7, xmm4
- psubw xmm7, xmm4;gain the sign back
-
- mov rdi, arg(2) ;qcoeff_ptr
-
- movdqa xmm1, xmm7
- movdqa [rdi+16], xmm7
-
- mov rax, arg(3) ;dequant_ptr
- movdqa xmm6, [rax+16]
-
- pmullw xmm7, xmm6
- mov rax, arg(7) ;dqcoeff_ptr
-
- movdqa [rax+16], xmm7
- mov rdi, arg(4) ;scan_mask
-
- pxor xmm7, xmm7
- movdqa xmm2, [rdi]
-
- movdqa xmm3, [rdi+16];
- pcmpeqw xmm0, xmm7
-
- pcmpeqw xmm1, xmm7
- pcmpeqw xmm6, xmm6
-
- pxor xmm0, xmm6
- pxor xmm1, xmm6
-
- psrlw xmm0, 15
- psrlw xmm1, 15
-
- pmaddwd xmm0, xmm2
- pmaddwd xmm1, xmm3
-
- movq xmm2, xmm0
- movq xmm3, xmm1
-
- psrldq xmm0, 8
- psrldq xmm1, 8
-
- paddd xmm0, xmm1
- paddd xmm2, xmm3
-
- paddd xmm0, xmm2
- movq xmm1, xmm0
-
- psrldq xmm0, 4
- paddd xmm1, xmm0
-
- movq rcx, xmm1
- and rcx, 0xffff
-
- xor rdx, rdx
- sub rdx, rcx
-
- bsr rax, rcx
- inc rax
-
- sar rdx, 31
- and rax, rdx
-
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm
index a1b1c40cb..324881337 100644
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -252,3 +252,137 @@ rq_zigzag_1c:
UNSHADOW_ARGS
pop rbp
ret
+
+
+;int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
+;                           short *qcoeff_ptr,short *dequant_ptr,
+;                           short *scan_mask, short *round_ptr,
+;                           short *quant_ptr, short *dqcoeff_ptr);
+; For 16 shorts z: q = sign(z) * hi16((|z| + round) * quant); stores q to
+; qcoeff and q * dequant to dqcoeff. Returns 0 when the scan_mask-weighted
+; sum of the q != 0 flags is zero, else 1 + index of its highest set bit.
+global sym(vp8_fast_quantize_b_impl_sse2)
+sym(vp8_fast_quantize_b_impl_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+
+    %define save_xmm6 0
+    %define save_xmm7 16
+    %define vp8_fastquantizeb_stack_size save_xmm7 + 16
+    ; xmm6/xmm7 are saved in this scratch area and restored before returning
+    sub         rsp, vp8_fastquantizeb_stack_size
+
+    movdqa      XMMWORD PTR[rsp + save_xmm6], xmm6
+    movdqa      XMMWORD PTR[rsp + save_xmm7], xmm7
+
+    mov         rdx, arg(0)                 ;coeff_ptr
+    mov         rcx, arg(2)                 ;dequant_ptr
+    mov         rax, arg(3)                 ;scan_mask
+    mov         rdi, arg(4)                 ;round_ptr
+    mov         rsi, arg(5)                 ;quant_ptr
+
+    movdqa      xmm0, XMMWORD PTR[rdx]      ;z lo (first 8 coefficients)
+    movdqa      xmm4, XMMWORD PTR[rdx + 16] ;z hi (next 8 coefficients)
+
+    movdqa      xmm6, XMMWORD PTR[rdi]      ;round lo
+    movdqa      xmm7, XMMWORD PTR[rdi + 16] ;round hi
+
+    movdqa      xmm1, xmm0
+    movdqa      xmm5, xmm4
+
+    psraw       xmm0, 15                    ;sign of z (aka sz)
+    psraw       xmm4, 15                    ;sign of z (aka sz)
+
+    pxor        xmm1, xmm0
+    pxor        xmm5, xmm4
+    psubw       xmm1, xmm0                  ;x = abs(z)
+    psubw       xmm5, xmm4                  ;x = abs(z)
+
+    paddw       xmm1, xmm6                  ;x += round
+    paddw       xmm5, xmm7                  ;x += round
+
+    pmulhw      xmm1, XMMWORD PTR[rsi]      ;y = high word of signed x * quant
+    pmulhw      xmm5, XMMWORD PTR[rsi + 16] ;y = high word of signed x * quant
+
+    mov         rdi, arg(1)                 ;qcoeff_ptr
+    mov         rsi, arg(6)                 ;dqcoeff_ptr
+
+    movdqa      xmm6, XMMWORD PTR[rcx]      ;dequant lo
+    movdqa      xmm7, XMMWORD PTR[rcx + 16] ;dequant hi
+
+    pxor        xmm1, xmm0
+    pxor        xmm5, xmm4
+    psubw       xmm1, xmm0                  ;q = (y ^ sz) - sz, sign restored
+    psubw       xmm5, xmm4                  ;q = (y ^ sz) - sz, sign restored
+
+    movdqa      XMMWORD PTR[rdi], xmm1      ;store qcoeff
+    movdqa      XMMWORD PTR[rdi + 16], xmm5 ;store qcoeff
+
+    pmullw      xmm6, xmm1                  ;dqcoeff = q * dequant (low word)
+    pmullw      xmm7, xmm5                  ;dqcoeff = q * dequant (low word)
+
+    movdqa      xmm2, XMMWORD PTR[rax]      ;scan_mask lo
+    movdqa      xmm3, XMMWORD PTR[rax+16]   ;scan_mask hi
+
+    pxor        xmm4, xmm4                  ;clear all bits
+    pcmpeqw     xmm1, xmm4                  ;0xffff where q == 0
+    pcmpeqw     xmm5, xmm4
+
+    pcmpeqw     xmm4, xmm4                  ;set all bits
+    pxor        xmm1, xmm4                  ;invert: 0xffff where q != 0
+    pxor        xmm5, xmm4
+
+    psrlw       xmm1, 15                    ;1 where q != 0, else 0
+    psrlw       xmm5, 15
+
+    pmaddwd     xmm1, xmm2                  ;flag * scan_mask, adjacent pairs summed
+    pmaddwd     xmm5, xmm3
+
+    movq        xmm2, xmm1                  ;horizontal add of the 8 dwords starts here
+    movq        xmm3, xmm5
+
+    psrldq      xmm1, 8
+    psrldq      xmm5, 8
+
+    paddd       xmm1, xmm5
+    paddd       xmm2, xmm3
+
+    paddd       xmm1, xmm2
+    movq        xmm5, xmm1
+
+    psrldq      xmm1, 4
+    paddd       xmm5, xmm1                  ;low dword = sum of all products
+
+    movq        rcx, xmm5
+    and         rcx, 0xffff                 ;keep the low 16 bits of the sum
+
+    xor         rdx, rdx
+    sub         rdx, rcx                    ;rdx = -sum
+
+    bsr         rax, rcx                    ;rax is undefined when rcx == 0
+    inc         rax
+
+    sar         rdx, 31                     ;all ones if sum != 0, else 0
+    and         rax, rdx                    ;force the result to 0 when sum == 0
+
+    movdqa      XMMWORD PTR[rsi], xmm6      ;store dqcoeff
+    movdqa      XMMWORD PTR[rsi + 16], xmm7 ;store dqcoeff
+
+    movdqa      xmm6, XMMWORD PTR[rsp + save_xmm6]
+    movdqa      xmm7, XMMWORD PTR[rsp + save_xmm7]
+
+    add         rsp, vp8_fastquantizeb_stack_size
+    pop         rsp                         ;presumably undoes ALIGN_STACK (macro not shown here)
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index 18dc49cd4..7490a8add 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -88,24 +88,22 @@ void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
     vp8_short_fdct4x4_sse2(input + 4, output + 16, pitch);
 }
-int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr,
+int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
                                  short *qcoeff_ptr, short *dequant_ptr,
                                  short *scan_mask, short *round_ptr,
                                  short *quant_ptr, short *dqcoeff_ptr);
-void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d)
+void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
 {
     short *scan_mask = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;
     short *coeff_ptr = &b->coeff[0];
-    short *zbin_ptr = &b->zbin[0][0];
     short *round_ptr = &b->round[0][0];
     short *quant_ptr = &b->quant[0][0];
     short *qcoeff_ptr = d->qcoeff;
     short *dqcoeff_ptr = d->dqcoeff;
     short *dequant_ptr = &d->dequant[0][0];
-    d->eob = vp8_fast_quantize_b_impl_sse(
+    d->eob = vp8_fast_quantize_b_impl_sse2( /* must match the prototype above and the symbol exported by quantize_sse2.asm */
                  coeff_ptr,
-                 zbin_ptr,
                  qcoeff_ptr,
                  dequant_ptr,
                  scan_mask,
@@ -116,6 +114,7 @@ void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d)
              );
 }
+
int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr,
short *qcoeff_ptr,short *dequant_ptr,
const int *default_zig_zag, short *round_ptr,
@@ -285,8 +284,10 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_xmm;
/* cpi->rtcd.encodemb.sub* not implemented for wmt */
- /*cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse;
- cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2;*/
+ /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2;*/
+
+ cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse2;
+
}
#endif