summary | refs | log | tree | commit | diff
path: root/vp8
diff options
context:
space:
mode:
Diffstat (limited to 'vp8')
-rw-r--r-- vp8/common/entropy.c 8
-rw-r--r-- vp8/common/entropy.h 1
-rw-r--r-- vp8/encoder/x86/quantize_sse2.asm 90
-rw-r--r-- vp8/encoder/x86/x86_csystemdependent.c 5
4 files changed, 40 insertions, 64 deletions
diff --git a/vp8/common/entropy.c b/vp8/common/entropy.c
index 1438e7e0f..61dbe4af7 100644
--- a/vp8/common/entropy.c
+++ b/vp8/common/entropy.c
@@ -36,6 +36,14 @@ DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]) =
7, 11, 14, 15,
};
+DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]) =
+{
+ 1, 2, 6, 7,
+ 3, 5, 8, 13,
+ 4, 9, 12, 14,
+ 10, 11, 15, 16
+};
+
DECLARE_ALIGNED(16, short, vp8_default_zig_zag_mask[16]);
const int vp8_mb_feature_data_bits[MB_LVL_MAX] = {7, 6};
diff --git a/vp8/common/entropy.h b/vp8/common/entropy.h
index 70e2ae675..fa2fce47b 100644
--- a/vp8/common/entropy.h
+++ b/vp8/common/entropy.h
@@ -95,6 +95,7 @@ struct VP8Common;
void vp8_default_coef_probs(struct VP8Common *);
extern DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]);
+extern DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]);
extern short vp8_default_zig_zag_mask[16];
extern const int vp8_mb_feature_data_bits[MB_LVL_MAX];
diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm
index 1e0bd5c48..57bf3c93a 100644
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -253,10 +253,9 @@ rq_zigzag_1c:
pop rbp
ret
-
;int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
; short *qcoeff_ptr,short *dequant_ptr,
-; short *scan_mask, short *round_ptr,
+; short *inv_scan_order, short *round_ptr,
; short *quant_ptr, short *dqcoeff_ptr);
global sym(vp8_fast_quantize_b_impl_sse2)
sym(vp8_fast_quantize_b_impl_sse2):
@@ -265,32 +264,18 @@ sym(vp8_fast_quantize_b_impl_sse2):
SHADOW_ARGS_TO_STACK 7
push rsi
push rdi
- push rbx
; end prolog
- ALIGN_STACK 16, rax
-
- %define save_xmm6 0
- %define save_xmm7 16
-
- %define vp8_fastquantizeb_stack_size save_xmm7 + 16
-
- sub rsp, vp8_fastquantizeb_stack_size
-
- movdqa XMMWORD PTR[rsp + save_xmm6], xmm6
- movdqa XMMWORD PTR[rsp + save_xmm7], xmm7
-
mov rdx, arg(0) ;coeff_ptr
mov rcx, arg(2) ;dequant_ptr
- mov rax, arg(3) ;scan_mask
mov rdi, arg(4) ;round_ptr
mov rsi, arg(5) ;quant_ptr
movdqa xmm0, XMMWORD PTR[rdx]
movdqa xmm4, XMMWORD PTR[rdx + 16]
- movdqa xmm6, XMMWORD PTR[rdi] ;round lo
- movdqa xmm7, XMMWORD PTR[rdi + 16] ;round hi
+ movdqa xmm2, XMMWORD PTR[rdi] ;round lo
+ movdqa xmm3, XMMWORD PTR[rdi + 16] ;round hi
movdqa xmm1, xmm0
movdqa xmm5, xmm4
@@ -303,8 +288,8 @@ sym(vp8_fast_quantize_b_impl_sse2):
psubw xmm1, xmm0 ;x = abs(z)
psubw xmm5, xmm4 ;x = abs(z)
- paddw xmm1, xmm6
- paddw xmm5, xmm7
+ paddw xmm1, xmm2
+ paddw xmm5, xmm3
pmulhw xmm1, XMMWORD PTR[rsi]
pmulhw xmm5, XMMWORD PTR[rsi + 16]
@@ -312,8 +297,8 @@ sym(vp8_fast_quantize_b_impl_sse2):
mov rdi, arg(1) ;qcoeff_ptr
mov rsi, arg(6) ;dqcoeff_ptr
- movdqa xmm6, XMMWORD PTR[rcx]
- movdqa xmm7, XMMWORD PTR[rcx + 16]
+ movdqa xmm2, XMMWORD PTR[rcx]
+ movdqa xmm3, XMMWORD PTR[rcx + 16]
pxor xmm1, xmm0
pxor xmm5, xmm4
@@ -323,64 +308,47 @@ sym(vp8_fast_quantize_b_impl_sse2):
movdqa XMMWORD PTR[rdi], xmm1
movdqa XMMWORD PTR[rdi + 16], xmm5
- pmullw xmm6, xmm1
- pmullw xmm7, xmm5
+ pmullw xmm2, xmm1
+ pmullw xmm3, xmm5
- movdqa xmm2, XMMWORD PTR[rax]
- movdqa xmm3, XMMWORD PTR[rax+16];
+ mov rdi, arg(3) ;inv_scan_order
- pxor xmm4, xmm4 ;clear all bits
+ ; Start with 16
+ pxor xmm4, xmm4 ;clear all bits
pcmpeqw xmm1, xmm4
pcmpeqw xmm5, xmm4
- pcmpeqw xmm4, xmm4 ;set all bits
+ pcmpeqw xmm4, xmm4 ;set all bits
pxor xmm1, xmm4
pxor xmm5, xmm4
- psrlw xmm1, 15
- psrlw xmm5, 15
-
- pmaddwd xmm1, xmm2
- pmaddwd xmm5, xmm3
-
- movq xmm2, xmm1
- movq xmm3, xmm5
-
- psrldq xmm1, 8
- psrldq xmm5, 8
+ pand xmm1, XMMWORD PTR[rdi]
+ pand xmm5, XMMWORD PTR[rdi+16]
- paddd xmm1, xmm5
- paddd xmm2, xmm3
+ pmaxsw xmm1, xmm5
- paddd xmm1, xmm2
- movq xmm5, xmm1
+ ; now down to 8
+ pshufd xmm5, xmm1, 00001110b
- psrldq xmm1, 4
- paddd xmm5, xmm1
+ pmaxsw xmm1, xmm5
- movq rcx, xmm5
- and rcx, 0xffff
+ ; only 4 left
+ pshuflw xmm5, xmm1, 00001110b
- xor rdx, rdx
- sub rdx, rcx
+ pmaxsw xmm1, xmm5
- bsr rax, rcx
- inc rax
+ ; okay, just 2!
+ pshuflw xmm5, xmm1, 00000001b
- sar rdx, 31
- and rax, rdx
+ pmaxsw xmm1, xmm5
- movdqa XMMWORD PTR[rsi], xmm6 ;store dqcoeff
- movdqa XMMWORD PTR[rsi + 16], xmm7 ;store dqcoeff
+ movd rax, xmm1
+ and rax, 0xff
- movdqa xmm6, XMMWORD PTR[rsp + save_xmm6]
- movdqa xmm7, XMMWORD PTR[rsp + save_xmm7]
-
- add rsp, vp8_fastquantizeb_stack_size
- pop rsp
+ movdqa XMMWORD PTR[rsi], xmm2 ;store dqcoeff
+ movdqa XMMWORD PTR[rsi + 16], xmm3 ;store dqcoeff
; begin epilog
- pop rbx
pop rdi
pop rsi
UNSHADOW_ARGS
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index 781079849..d2199a499 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -83,7 +83,7 @@ void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
#if HAVE_SSE2
int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
short *qcoeff_ptr, short *dequant_ptr,
- short *scan_mask, short *round_ptr,
+ const short *inv_scan_order, short *round_ptr,
short *quant_ptr, short *dqcoeff_ptr);
void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
{
@@ -99,8 +99,7 @@ void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
coeff_ptr,
qcoeff_ptr,
dequant_ptr,
- scan_mask,
-
+ vp8_default_inv_zig_zag,
round_ptr,
quant_ptr,
dqcoeff_ptr