Diffstat (limited to 'vp8/encoder')
-rwxr-xr-x  vp8/encoder/x86/quantize_ssse3.asm      | 114
-rw-r--r--  vp8/encoder/x86/x86_csystemdependent.c  |  22
2 files changed, 136 insertions(+), 0 deletions(-)
diff --git a/vp8/encoder/x86/quantize_ssse3.asm b/vp8/encoder/x86/quantize_ssse3.asm
new file mode 100755
index 000000000..2f33199e5
--- /dev/null
+++ b/vp8/encoder/x86/quantize_ssse3.asm
@@ -0,0 +1,114 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;int vp8_fast_quantize_b_impl_ssse3(short *coeff_ptr,
+;                                   short *qcoeff_ptr, short *dequant_ptr,
+;                                   short *round_ptr,
+;                                   short *quant_ptr, short *dqcoeff_ptr);
+;
+global sym(vp8_fast_quantize_b_impl_ssse3)
+sym(vp8_fast_quantize_b_impl_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(0) ;coeff_ptr
+ mov rdi, arg(3) ;round_ptr
+ mov rsi, arg(4) ;quant_ptr
+
+ movdqa xmm0, [rdx]
+ movdqa xmm4, [rdx + 16]
+
+ movdqa xmm2, [rdi] ;round lo
+ movdqa xmm3, [rdi + 16] ;round hi
+
+ movdqa xmm1, xmm0
+ movdqa xmm5, xmm4
+
+ psraw xmm0, 15 ;sign of z (aka sz)
+ psraw xmm4, 15 ;sign of z (aka sz)
+
+ pabsw xmm1, xmm1
+ pabsw xmm5, xmm5
+
+ paddw xmm1, xmm2
+ paddw xmm5, xmm3
+
+ pmulhw xmm1, [rsi]
+ pmulhw xmm5, [rsi + 16]
+
+ mov rdi, arg(1) ;qcoeff_ptr
+ mov rcx, arg(2) ;dequant_ptr
+ mov rsi, arg(5) ;dqcoeff_ptr
+
+ pxor xmm1, xmm0
+ pxor xmm5, xmm4
+ psubw xmm1, xmm0
+ psubw xmm5, xmm4
+
+ movdqa [rdi], xmm1
+ movdqa [rdi + 16], xmm5
+
+ movdqa xmm2, [rcx]
+ movdqa xmm3, [rcx + 16]
+
+ pxor xmm4, xmm4
+ pmullw xmm2, xmm1
+ pmullw xmm3, xmm5
+
+ pcmpeqw xmm1, xmm4 ;zero mask: FFFF where qcoeff == 0
+ pcmpeqw xmm5, xmm4 ;zero mask: FFFF where qcoeff == 0
+ packsswb xmm1, xmm5
+ pshufb xmm1, [GLOBAL(zz_shuf)] ;reorder mask bytes into zigzag scan order
+
+ pmovmskb edx, xmm1
+
+; xor ecx, ecx
+; mov eax, -1
+;find_eob_loop:
+; shr edx, 1
+; jc fq_skip
+; mov eax, ecx
+;fq_skip:
+; inc ecx
+; cmp ecx, 16
+; jne find_eob_loop
+ xor rdi, rdi ;rdi = 0, for the all-zero check below
+ mov eax, -1
+ xor dx, ax ;flip the bits so set bits mark nonzero coeffs
+ bsr eax, edx ;index of last nonzero coeff, zigzag order
+
+ movdqa [rsi], xmm2 ;store dqcoeff
+ movdqa [rsi + 16], xmm3 ;store dqcoeff
+
+ sub edi, edx ;check for all zeros in bit mask
+ sar edi, 31 ;0 or -1
+ add eax, 1
+ and eax, edi ;if the bit mask was all zero,
+ ;then eob = 0
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+zz_shuf:
+ db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
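
Note: the routine above is the SSSE3 version of VP8's fast quantizer. Below is a minimal scalar C sketch of the same computation, assuming one 4x4 block of 16 16-bit coefficients; the function name is illustrative, not an identifier from the tree. The sign is stripped (pabsw), the high 16 bits of the rounded product form the quantized value (pmulhw), the sign is restored (the pxor/psubw pair), and eob is the 1-based zigzag position of the last nonzero coefficient.

#include <stdlib.h>

/* VP8 4x4 zigzag scan; same byte order as zz_shuf above. */
static const int zigzag[16] = {
    0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};

/* Illustrative scalar equivalent of vp8_fast_quantize_b_impl_ssse3.
 * Assumes the 16-bit value ranges VP8 feeds pmulhw. */
int fast_quantize_b_sketch(const short *coeff, short *qcoeff,
                           const short *dequant, const short *round,
                           const short *quant, short *dqcoeff)
{
    int eob = 0;

    for (int i = 0; i < 16; i++)
    {
        int rc = zigzag[i];              /* raster position of scan index i */
        int z  = coeff[rc];

        /* (|z| + round) * quant >> 16: the pabsw/paddw/pmulhw sequence */
        int x = ((abs(z) + round[rc]) * quant[rc]) >> 16;

        if (z < 0)
            x = -x;                      /* branchless in the asm: pxor/psubw */

        qcoeff[rc]  = (short)x;
        dqcoeff[rc] = (short)(x * dequant[rc]);

        if (x)
            eob = i + 1;                 /* last nonzero, zigzag order */
    }

    return eob;
}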
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index ed0e71ed0..fb1b37ccb 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -179,6 +179,25 @@ void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)
#endif
+#if HAVE_SSSE3
+int vp8_fast_quantize_b_impl_ssse3(short *coeff_ptr,
+ short *qcoeff_ptr, short *dequant_ptr,
+ short *round_ptr,
+ short *quant_ptr, short *dqcoeff_ptr);
+void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d)
+{
+ d->eob = vp8_fast_quantize_b_impl_ssse3(
+ b->coeff,
+ d->qcoeff,
+ d->dequant,
+ b->round,
+ b->quant,
+ d->dqcoeff
+ );
+}
+#endif
+
+
void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
{
#if CONFIG_RUNTIME_CPU_DETECT
@@ -333,6 +352,9 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
{
cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_ssse3;
cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_ssse3;
+
+ cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_ssse3;
+
}
#endif
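
Note: the eob computation in quantize_ssse3.asm replaces the commented-out 16-iteration loop. pcmpeqw/packsswb/pshufb/pmovmskb condense the block into a 16-bit zero mask in zigzag order, and the bsr tail converts that mask to an eob without branching. A standalone C sketch of the tail follows, using a GCC-style __builtin_clz in place of bsr; the function name is made up for illustration.

/* mask: bit i set when the coefficient at zigzag position i is zero
 * (the pmovmskb result). Returns eob, or 0 for an all-zero block. */
static int eob_from_zero_mask(unsigned int mask)
{
    unsigned int nz = (mask ^ 0xFFFF) & 0xFFFF; /* xor dx, ax: set bits now mark nonzero */
    int last = 31 - __builtin_clz(nz | 1);      /* bsr eax, edx (nz|1 dodges clz(0)) */
    int any  = -(int)(nz != 0);                 /* sub/sar pair: -1 if any nonzero, else 0 */

    return (last + 1) & any;                    /* and eax, edi: eob, forced to 0 if empty */
}

For example, a block whose only nonzero quantized coefficient sits at zigzag position 3 yields mask 0xFFF7, so nz = 0x0008, last = 3, and the function returns 4; an all-zero block yields mask 0xFFFF and returns 0.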