summaryrefslogtreecommitdiff
path: root/vp8/encoder/x86/dct_sse2.asm
diff options
context:
space:
mode:
authorJohn Koleszar <jkoleszar@google.com>2010-05-18 11:58:33 -0400
committerJohn Koleszar <jkoleszar@google.com>2010-05-18 11:58:33 -0400
commit0ea50ce9cb4b65eee6afa1d041fe8beb5abda667 (patch)
tree1f3b9019f28bc56fd3156f96e5a9653a983ee61b /vp8/encoder/x86/dct_sse2.asm
downloadlibvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.tar
libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.tar.gz
libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.tar.bz2
libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.zip
Initial WebM release
Diffstat (limited to 'vp8/encoder/x86/dct_sse2.asm')
-rw-r--r--vp8/encoder/x86/dct_sse2.asm260
1 files changed, 260 insertions, 0 deletions
diff --git a/vp8/encoder/x86/dct_sse2.asm b/vp8/encoder/x86/dct_sse2.asm
new file mode 100644
index 000000000..3e5e9a70c
--- /dev/null
+++ b/vp8/encoder/x86/dct_sse2.asm
@@ -0,0 +1,260 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+global sym(vp8_short_fdct4x4_wmt)
+
+%define DCTCONSTANTSBITS (16)
+%define DCTROUNDINGVALUE (1<< (DCTCONSTANTSBITS-1))
+%define x_c1 (60547) ; cos(pi /8) * (1<<15)
+%define x_c2 (46341) ; cos(pi*2/8) * (1<<15)
+%define x_c3 (25080) ; cos(pi*3/8) * (1<<15)
+
+%define _1STSTAGESHIFT 14
+%define _2NDSTAGESHIFT 16
+
+
+;; using matrix multiply
+;void vp8_short_fdct4x4_wmt(short *input, short *output)
+sym(vp8_short_fdct4x4_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ GET_GOT rbx
+ ; end prolog
+
+ mov rax, arg(0) ;input
+ mov rcx, arg(1) ;output
+
+ lea rdx, [dct_matrix_sse2 GLOBAL]
+
+ movdqu xmm0, [rax ]
+ movdqu xmm1, [rax+16]
+
+ ; first column
+ movdqa xmm2, xmm0
+ movdqa xmm7, [rdx]
+
+ pmaddwd xmm2, xmm7
+ movdqa xmm3, xmm1
+
+ pmaddwd xmm3, xmm7
+ movdqa xmm4, xmm2
+
+ punpckldq xmm2, xmm3
+ punpckhdq xmm4, xmm3
+
+ movdqa xmm3, xmm2
+ punpckldq xmm2, xmm4
+
+ punpckhdq xmm3, xmm4
+ paddd xmm2, xmm3
+
+
+ paddd xmm2, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
+ psrad xmm2, _1STSTAGESHIFT
+ ;second column
+ movdqa xmm3, xmm0
+ pmaddwd xmm3, [rdx+16]
+
+ movdqa xmm4, xmm1
+ pmaddwd xmm4, [rdx+16]
+
+ movdqa xmm5, xmm3
+ punpckldq xmm3, xmm4
+
+ punpckhdq xmm5, xmm4
+ movdqa xmm4, xmm3
+
+ punpckldq xmm3, xmm5
+ punpckhdq xmm4, xmm5
+
+ paddd xmm3, xmm4
+ paddd xmm3, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
+
+
+ psrad xmm3, _1STSTAGESHIFT
+ packssdw xmm2, xmm3
+
+ ;third column
+ movdqa xmm3, xmm0
+ pmaddwd xmm3, [rdx+32]
+
+ movdqa xmm4, xmm1
+ pmaddwd xmm4, [rdx+32]
+
+ movdqa xmm5, xmm3
+ punpckldq xmm3, xmm4
+
+ punpckhdq xmm5, xmm4
+ movdqa xmm4, xmm3
+
+ punpckldq xmm3, xmm5
+ punpckhdq xmm4, xmm5
+
+ paddd xmm3, xmm4
+ paddd xmm3, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
+
+ psrad xmm3, _1STSTAGESHIFT
+
+ ;fourth column (this is the last column, so we do not have save the source any more)
+ pmaddwd xmm0, [rdx+48]
+ pmaddwd xmm1, [rdx+48]
+
+ movdqa xmm4, xmm0
+ punpckldq xmm0, xmm1
+
+ punpckhdq xmm4, xmm1
+ movdqa xmm1, xmm0
+
+ punpckldq xmm0, xmm4
+ punpckhdq xmm1, xmm4
+
+ paddd xmm0, xmm1
+ paddd xmm0, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
+
+
+ psrad xmm0, _1STSTAGESHIFT
+ packssdw xmm3, xmm0
+ ; done with one pass
+ ; now start second pass
+ movdqa xmm0, xmm2
+ movdqa xmm1, xmm3
+
+ pmaddwd xmm2, xmm7
+ pmaddwd xmm3, xmm7
+
+ movdqa xmm4, xmm2
+ punpckldq xmm2, xmm3
+
+ punpckhdq xmm4, xmm3
+ movdqa xmm3, xmm2
+
+ punpckldq xmm2, xmm4
+ punpckhdq xmm3, xmm4
+
+ paddd xmm2, xmm3
+ paddd xmm2, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
+
+ psrad xmm2, _2NDSTAGESHIFT
+
+ ;second column
+ movdqa xmm3, xmm0
+ pmaddwd xmm3, [rdx+16]
+
+ movdqa xmm4, xmm1
+ pmaddwd xmm4, [rdx+16]
+
+ movdqa xmm5, xmm3
+ punpckldq xmm3, xmm4
+
+ punpckhdq xmm5, xmm4
+ movdqa xmm4, xmm3
+
+ punpckldq xmm3, xmm5
+ punpckhdq xmm4, xmm5
+
+ paddd xmm3, xmm4
+ paddd xmm3, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
+
+ psrad xmm3, _2NDSTAGESHIFT
+ packssdw xmm2, xmm3
+
+ movdqu [rcx], xmm2
+ ;third column
+ movdqa xmm3, xmm0
+ pmaddwd xmm3, [rdx+32]
+
+ movdqa xmm4, xmm1
+ pmaddwd xmm4, [rdx+32]
+
+ movdqa xmm5, xmm3
+ punpckldq xmm3, xmm4
+
+ punpckhdq xmm5, xmm4
+ movdqa xmm4, xmm3
+
+ punpckldq xmm3, xmm5
+ punpckhdq xmm4, xmm5
+
+ paddd xmm3, xmm4
+ paddd xmm3, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
+
+ psrad xmm3, _2NDSTAGESHIFT
+ ;fourth column
+ pmaddwd xmm0, [rdx+48]
+ pmaddwd xmm1, [rdx+48]
+
+ movdqa xmm4, xmm0
+ punpckldq xmm0, xmm1
+
+ punpckhdq xmm4, xmm1
+ movdqa xmm1, xmm0
+
+ punpckldq xmm0, xmm4
+ punpckhdq xmm1, xmm4
+
+ paddd xmm0, xmm1
+ paddd xmm0, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
+
+ psrad xmm0, _2NDSTAGESHIFT
+ packssdw xmm3, xmm0
+
+ movdqu [rcx+16], xmm3
+
+ mov rsp, rbp
+ ; begin epilog
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+;static unsigned int dct1st_stage_rounding_sse2[4] =
+align 16
+dct1st_stage_rounding_sse2:
+ times 4 dd 8192
+
+
+;static unsigned int dct2nd_stage_rounding_sse2[4] =
+align 16
+dct2nd_stage_rounding_sse2:
+ times 4 dd 32768
+
+;static short dct_matrix_sse2[4][8]=
+align 16
+dct_matrix_sse2:
+ times 8 dw 23170
+
+ dw 30274
+ dw 12540
+ dw -12540
+ dw -30274
+ dw 30274
+ dw 12540
+ dw -12540
+ dw -30274
+
+ dw 23170
+ times 2 dw -23170
+ times 2 dw 23170
+ times 2 dw -23170
+ dw 23170
+
+ dw 12540
+ dw -30274
+ dw 30274
+ dw -12540
+ dw 12540
+ dw -30274
+ dw 30274
+ dw -12540