summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorScott LaVarnway <slavarnway@google.com>2010-06-24 13:11:30 -0400
committerJohn Koleszar <jkoleszar@google.com>2010-06-24 16:40:56 -0400
commitf1a3b1e0d94dec2d40008f36fdfad99338484b9a (patch)
treef3ea5144c61222512f03775247a1a2d285fd8288
parentd0dd01b8ce8bc5f477d70f1c127d795418c5efb5 (diff)
downloadlibvpx-f1a3b1e0d94dec2d40008f36fdfad99338484b9a.tar
libvpx-f1a3b1e0d94dec2d40008f36fdfad99338484b9a.tar.gz
libvpx-f1a3b1e0d94dec2d40008f36fdfad99338484b9a.tar.bz2
libvpx-f1a3b1e0d94dec2d40008f36fdfad99338484b9a.zip
Added first-pass sse2 version of Yaowu's new fdct.
Change-Id: Ib479210067510162879c368428b92690591120b2
-rw-r--r--vp8/encoder/x86/dct_sse2.asm390
-rw-r--r--vp8/encoder/x86/dct_x86.h16
-rw-r--r--vp8/encoder/x86/x86_csystemdependent.c17
-rw-r--r--vp8/vp8cx.mk1
4 files changed, 180 insertions, 244 deletions
diff --git a/vp8/encoder/x86/dct_sse2.asm b/vp8/encoder/x86/dct_sse2.asm
index 1cd137d15..0e8cfcfc3 100644
--- a/vp8/encoder/x86/dct_sse2.asm
+++ b/vp8/encoder/x86/dct_sse2.asm
@@ -11,251 +11,179 @@
%include "vpx_ports/x86_abi_support.asm"
-global sym(vp8_short_fdct4x4_wmt)
-
-%define DCTCONSTANTSBITS (16)
-%define DCTROUNDINGVALUE (1<< (DCTCONSTANTSBITS-1))
-%define x_c1 (60547) ; cos(pi /8) * (1<<15)
-%define x_c2 (46341) ; cos(pi*2/8) * (1<<15)
-%define x_c3 (25080) ; cos(pi*3/8) * (1<<15)
-
-%define _1STSTAGESHIFT 14
-%define _2NDSTAGESHIFT 16
-
-
-;; using matrix multiply
-;void vp8_short_fdct4x4_wmt(short *input, short *output)
-sym(vp8_short_fdct4x4_wmt):
+;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
+global sym(vp8_short_fdct4x4_sse2)
+sym(vp8_short_fdct4x4_sse2):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 2
+ SHADOW_ARGS_TO_STACK 3
+;; SAVE_XMM
GET_GOT rbx
+ push rsi
+ push rdi
; end prolog
- mov rax, arg(0) ;input
- mov rcx, arg(1) ;output
-
- lea rdx, [dct_matrix_sse2 GLOBAL]
-
- movdqu xmm0, [rax ]
- movdqu xmm1, [rax+16]
-
- ; first column
- movdqa xmm2, xmm0
- movdqa xmm7, [rdx]
-
- pmaddwd xmm2, xmm7
- movdqa xmm3, xmm1
-
- pmaddwd xmm3, xmm7
- movdqa xmm4, xmm2
-
- punpckldq xmm2, xmm3
- punpckhdq xmm4, xmm3
-
- movdqa xmm3, xmm2
- punpckldq xmm2, xmm4
-
- punpckhdq xmm3, xmm4
- paddd xmm2, xmm3
-
-
- paddd xmm2, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
- psrad xmm2, _1STSTAGESHIFT
- ;second column
- movdqa xmm3, xmm0
- pmaddwd xmm3, [rdx+16]
-
- movdqa xmm4, xmm1
- pmaddwd xmm4, [rdx+16]
-
- movdqa xmm5, xmm3
- punpckldq xmm3, xmm4
-
- punpckhdq xmm5, xmm4
- movdqa xmm4, xmm3
-
- punpckldq xmm3, xmm5
- punpckhdq xmm4, xmm5
-
- paddd xmm3, xmm4
- paddd xmm3, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
-
-
- psrad xmm3, _1STSTAGESHIFT
- packssdw xmm2, xmm3
-
- ;third column
- movdqa xmm3, xmm0
- pmaddwd xmm3, [rdx+32]
-
- movdqa xmm4, xmm1
- pmaddwd xmm4, [rdx+32]
-
- movdqa xmm5, xmm3
- punpckldq xmm3, xmm4
-
- punpckhdq xmm5, xmm4
- movdqa xmm4, xmm3
-
- punpckldq xmm3, xmm5
- punpckhdq xmm4, xmm5
-
- paddd xmm3, xmm4
- paddd xmm3, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
-
- psrad xmm3, _1STSTAGESHIFT
-
- ;fourth column (this is the last column, so we do not have save the source any more)
- pmaddwd xmm0, [rdx+48]
- pmaddwd xmm1, [rdx+48]
-
- movdqa xmm4, xmm0
- punpckldq xmm0, xmm1
-
- punpckhdq xmm4, xmm1
- movdqa xmm1, xmm0
-
- punpckldq xmm0, xmm4
- punpckhdq xmm1, xmm4
-
- paddd xmm0, xmm1
- paddd xmm0, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
-
-
- psrad xmm0, _1STSTAGESHIFT
- packssdw xmm3, xmm0
- ; done with one pass
- ; now start second pass
- movdqa xmm0, xmm2
- movdqa xmm1, xmm3
-
- pmaddwd xmm2, xmm7
- pmaddwd xmm3, xmm7
-
- movdqa xmm4, xmm2
- punpckldq xmm2, xmm3
+ mov rsi, arg(0)
+ movsxd rax, DWORD PTR arg(2)
+ lea rdi, [rsi + rax*2]
+
+ movq xmm0, MMWORD PTR[rsi ] ;03 02 01 00
+ movq xmm2, MMWORD PTR[rsi + rax] ;13 12 11 10
+ movq xmm1, MMWORD PTR[rsi + rax*2] ;23 22 21 20
+ movq xmm3, MMWORD PTR[rdi + rax] ;33 32 31 30
+
+ punpcklqdq xmm0, xmm2 ;13 12 11 10 03 02 01 00
+ punpcklqdq xmm1, xmm3 ;33 32 31 30 23 22 21 20
+
+ mov rdi, arg(1)
+
+ movdqa xmm2, xmm0
+ punpckldq xmm0, xmm1 ;23 22 03 02 21 20 01 00
+ punpckhdq xmm2, xmm1 ;33 32 13 12 31 30 11 10
+ movdqa xmm1, xmm0
+ punpckldq xmm0, xmm2 ;31 21 30 20 11 10 01 00
+ pshufhw xmm1, xmm1, 0b1h ;22 23 02 03 xx xx xx xx
+ pshufhw xmm2, xmm2, 0b1h ;32 33 12 13 xx xx xx xx
+
+ punpckhdq xmm1, xmm2 ;32 33 22 23 12 13 02 03
+ movdqa xmm3, xmm0
+ paddw xmm0, xmm1 ;b1 a1 b1 a1 b1 a1 b1 a1
+ psubw xmm3, xmm1 ;c1 d1 c1 d1 c1 d1 c1 d1
+ psllw xmm0, 3 ;b1 <<= 3 a1 <<= 3
+ psllw xmm3, 3 ;c1 <<= 3 d1 <<= 3
+ movdqa xmm1, xmm0
+ pmaddwd xmm0, XMMWORD PTR[_mult_add GLOBAL] ;a1 + b1
+ pmaddwd xmm1, XMMWORD PTR[_mult_sub GLOBAL] ;a1 - b1
+ movdqa xmm4, xmm3
+ pmaddwd xmm3, XMMWORD PTR[_5352_2217 GLOBAL] ;c1*2217 + d1*5352
+ pmaddwd xmm4, XMMWORD PTR[_2217_neg5352 GLOBAL] ;d1*2217 - c1*5352
+
+ paddd xmm3, XMMWORD PTR[_14500 GLOBAL]
+ paddd xmm4, XMMWORD PTR[_7500 GLOBAL]
+ psrad xmm3, 12 ;(c1 * 2217 + d1 * 5352 + 14500)>>12
+ psrad xmm4, 12 ;(d1 * 2217 - c1 * 5352 + 7500)>>12
+
+ packssdw xmm0, xmm1 ;op[2] op[0]
+ packssdw xmm3, xmm4 ;op[3] op[1]
+ ; 23 22 21 20 03 02 01 00
+ ;
+ ; 33 32 31 30 13 12 11 10
+ ;
+ movdqa xmm2, xmm0
+ punpcklqdq xmm0, xmm3 ;13 12 11 10 03 02 01 00
+ punpckhqdq xmm2, xmm3 ;23 22 21 20 33 32 31 30
+
+ movdqa xmm3, xmm0
+ punpcklwd xmm0, xmm2 ;32 30 22 20 12 10 02 00
+ punpckhwd xmm3, xmm2 ;33 31 23 21 13 11 03 01
+ movdqa xmm2, xmm0
+ punpcklwd xmm0, xmm3 ;13 12 11 10 03 02 01 00
+ punpckhwd xmm2, xmm3 ;33 32 31 30 23 22 21 20
+
+ movdqa xmm5, XMMWORD PTR[_7 GLOBAL]
+ pshufd xmm2, xmm2, 04eh
+ movdqa xmm3, xmm0
+ paddw xmm0, xmm2 ;b1 b1 b1 b1 a1 a1 a1 a1
+ psubw xmm3, xmm2 ;c1 c1 c1 c1 d1 d1 d1 d1
+
+ pshufd xmm0, xmm0, 0d8h ;b1 b1 a1 a1 b1 b1 a1 a1
+ movdqa xmm2, xmm3 ;save d1 for compare
+ pshufd xmm3, xmm3, 0d8h ;c1 c1 d1 d1 c1 c1 d1 d1
+ pshuflw xmm0, xmm0, 0d8h ;b1 b1 a1 a1 b1 a1 b1 a1
+ pshuflw xmm3, xmm3, 0d8h ;c1 c1 d1 d1 c1 d1 c1 d1
+ pshufhw xmm0, xmm0, 0d8h ;b1 a1 b1 a1 b1 a1 b1 a1
+ pshufhw xmm3, xmm3, 0d8h ;c1 d1 c1 d1 c1 d1 c1 d1
+ movdqa xmm1, xmm0
+ pmaddwd xmm0, XMMWORD PTR[_mult_add GLOBAL] ;a1 + b1
+ pmaddwd xmm1, XMMWORD PTR[_mult_sub GLOBAL] ;a1 - b1
+
+ pxor xmm4, xmm4 ;zero out for compare
+ paddd xmm0, xmm5
+ paddd xmm1, xmm5
+ pcmpeqw xmm2, xmm4
+ psrad xmm0, 4 ;(a1 + b1 + 7)>>4
+ psrad xmm1, 4 ;(a1 - b1 + 7)>>4
+ pandn xmm2, XMMWORD PTR[_cmp_mask GLOBAL] ;clear upper,
+ ;and keep bit 0 of lower
+
+ movdqa xmm4, xmm3
+ pmaddwd xmm3, XMMWORD PTR[_5352_2217 GLOBAL] ;c1*2217 + d1*5352
+ pmaddwd xmm4, XMMWORD PTR[_2217_neg5352 GLOBAL] ;d1*2217 - c1*5352
+ paddd xmm3, XMMWORD PTR[_12000 GLOBAL]
+ paddd xmm4, XMMWORD PTR[_51000 GLOBAL]
+ packssdw xmm0, xmm1 ;op[8] op[0]
+ psrad xmm3, 16 ;(c1 * 2217 + d1 * 5352 + 12000)>>16
+ psrad xmm4, 16 ;(d1 * 2217 - c1 * 5352 + 51000)>>16
+
+ packssdw xmm3, xmm4 ;op[12] op[4]
+ movdqa xmm1, xmm0
+ paddw xmm3, xmm2 ;op[4] += (d1!=0)
+ punpcklqdq xmm0, xmm3 ;op[4] op[0]
+ punpckhqdq xmm1, xmm3 ;op[12] op[8]
+
+ movdqa XMMWORD PTR[rdi + 0], xmm0
+ movdqa XMMWORD PTR[rdi + 16], xmm1
- punpckhdq xmm4, xmm3
- movdqa xmm3, xmm2
-
- punpckldq xmm2, xmm4
- punpckhdq xmm3, xmm4
-
- paddd xmm2, xmm3
- paddd xmm2, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
-
- psrad xmm2, _2NDSTAGESHIFT
-
- ;second column
- movdqa xmm3, xmm0
- pmaddwd xmm3, [rdx+16]
-
- movdqa xmm4, xmm1
- pmaddwd xmm4, [rdx+16]
-
- movdqa xmm5, xmm3
- punpckldq xmm3, xmm4
-
- punpckhdq xmm5, xmm4
- movdqa xmm4, xmm3
-
- punpckldq xmm3, xmm5
- punpckhdq xmm4, xmm5
-
- paddd xmm3, xmm4
- paddd xmm3, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
-
- psrad xmm3, _2NDSTAGESHIFT
- packssdw xmm2, xmm3
-
- movdqu [rcx], xmm2
- ;third column
- movdqa xmm3, xmm0
- pmaddwd xmm3, [rdx+32]
-
- movdqa xmm4, xmm1
- pmaddwd xmm4, [rdx+32]
-
- movdqa xmm5, xmm3
- punpckldq xmm3, xmm4
-
- punpckhdq xmm5, xmm4
- movdqa xmm4, xmm3
-
- punpckldq xmm3, xmm5
- punpckhdq xmm4, xmm5
-
- paddd xmm3, xmm4
- paddd xmm3, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
-
- psrad xmm3, _2NDSTAGESHIFT
- ;fourth column
- pmaddwd xmm0, [rdx+48]
- pmaddwd xmm1, [rdx+48]
-
- movdqa xmm4, xmm0
- punpckldq xmm0, xmm1
-
- punpckhdq xmm4, xmm1
- movdqa xmm1, xmm0
-
- punpckldq xmm0, xmm4
- punpckhdq xmm1, xmm4
-
- paddd xmm0, xmm1
- paddd xmm0, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
-
- psrad xmm0, _2NDSTAGESHIFT
- packssdw xmm3, xmm0
-
- movdqu [rcx+16], xmm3
-
- mov rsp, rbp
; begin epilog
+ pop rdi
+ pop rsi
RESTORE_GOT
+;; RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
-
SECTION_RODATA
-;static unsigned int dct1st_stage_rounding_sse2[4] =
align 16
-dct1st_stage_rounding_sse2:
- times 4 dd 8192
-
-
-;static unsigned int dct2nd_stage_rounding_sse2[4] =
+_5352_2217:
+ dw 5352
+ dw 2217
+ dw 5352
+ dw 2217
+ dw 5352
+ dw 2217
+ dw 5352
+ dw 2217
align 16
-dct2nd_stage_rounding_sse2:
- times 4 dd 32768
-
-;static short dct_matrix_sse2[4][8]=
+_2217_neg5352:
+ dw 2217
+ dw -5352
+ dw 2217
+ dw -5352
+ dw 2217
+ dw -5352
+ dw 2217
+ dw -5352
align 16
-dct_matrix_sse2:
- times 8 dw 23170
-
- dw 30274
- dw 12540
- dw -12540
- dw -30274
- dw 30274
- dw 12540
- dw -12540
- dw -30274
-
- dw 23170
- times 2 dw -23170
- times 2 dw 23170
- times 2 dw -23170
- dw 23170
+_mult_add:
+ times 8 dw 1
+align 16
+_cmp_mask:
+ times 4 dw 1
+ times 4 dw 0
- dw 12540
- dw -30274
- dw 30274
- dw -12540
- dw 12540
- dw -30274
- dw 30274
- dw -12540
+align 16
+_mult_sub:
+ dw 1
+ dw -1
+ dw 1
+ dw -1
+ dw 1
+ dw -1
+ dw 1
+ dw -1
+align 16
+_7:
+ times 4 dd 7
+align 16
+_14500:
+ times 4 dd 14500
+align 16
+_7500:
+ times 4 dd 7500
+align 16
+_12000:
+ times 4 dd 12000
+align 16
+_51000:
+ times 4 dd 51000
diff --git a/vp8/encoder/x86/dct_x86.h b/vp8/encoder/x86/dct_x86.h
index ada16d34f..bff52e129 100644
--- a/vp8/encoder/x86/dct_x86.h
+++ b/vp8/encoder/x86/dct_x86.h
@@ -24,7 +24,7 @@ extern prototype_fdct(vp8_short_fdct4x4_mmx);
extern prototype_fdct(vp8_short_fdct8x4_mmx);
#if !CONFIG_RUNTIME_CPU_DETECT
-#if 0 new c version,
+#if 0
#undef vp8_fdct_short4x4
#define vp8_fdct_short4x4 vp8_short_fdct4x4_mmx
@@ -40,19 +40,23 @@ extern prototype_fdct(vp8_short_fdct8x4_mmx);
extern prototype_fdct(vp8_short_fdct8x4_wmt);
extern prototype_fdct(vp8_short_walsh4x4_sse2);
-#if !CONFIG_RUNTIME_CPU_DETECT
+extern prototype_fdct(vp8_short_fdct4x4_sse2);
-#if 0
+#if !CONFIG_RUNTIME_CPU_DETECT
+#if 1
/* short SSE2 DCT currently disabled, does not match the MMX version */
#undef vp8_fdct_short4x4
-#define vp8_fdct_short4x4 vp8_short_fdct4x4_wmt
+#define vp8_fdct_short4x4 vp8_short_fdct4x4_sse2
#undef vp8_fdct_short8x4
-#define vp8_fdct_short8x4 vp8_short_fdct8x4_wmt
+#define vp8_fdct_short8x4 vp8_short_fdct8x4_sse2
#endif
+#undef vp8_fdct_fast4x4
+#define vp8_fdct_fast4x4 vp8_short_fdct4x4_sse2
+
#undef vp8_fdct_fast8x4
-#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_wmt
+#define vp8_fdct_fast8x4 vp8_short_fdct8x4_sse2
#undef vp8_fdct_walsh_short4x4
#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_sse2
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index 0fb82e60e..4d0515662 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -82,6 +82,11 @@ void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
#endif
#if HAVE_SSE2
+void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
+{
+ vp8_short_fdct4x4_sse2(input, output, pitch);
+ vp8_short_fdct4x4_sse2(input + 4, output + 16, pitch);
+}
int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr,
short *qcoeff_ptr, short *dequant_ptr,
@@ -268,13 +273,11 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.get16x16var = vp8_get16x16var_sse2;
/* cpi->rtcd.variance.get4x4sse_cs not implemented for wmt */;
-#if 0 //new fdct
- /* short SSE2 DCT currently disabled, does not match the MMX version */
- cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_wmt;
- cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_wmt;
- /* cpi->rtcd.fdct.fast4x4 not implemented for wmt */;
- cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_wmt;
-#endif
+ cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_sse2;
+ cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_sse2;
+ cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_sse2;
+ cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_sse2;
+
cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_sse2;
cpi->rtcd.encodemb.berr = vp8_block_error_xmm;
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index f86a0b2aa..c88df4705 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -93,6 +93,7 @@ VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/variance_impl_mmx.asm
VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/sad_mmx.asm
VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/dct_mmx.asm
VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/subtract_mmx.asm
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/variance_sse2.c
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/variance_impl_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/sad_sse2.asm