From 0ea50ce9cb4b65eee6afa1d041fe8beb5abda667 Mon Sep 17 00:00:00 2001
From: John Koleszar
Date: Tue, 18 May 2010 11:58:33 -0400
Subject: Initial WebM release

---
 vp8/encoder/x86/dct_mmx.asm | 846 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 846 insertions(+)
 create mode 100644 vp8/encoder/x86/dct_mmx.asm

diff --git a/vp8/encoder/x86/dct_mmx.asm b/vp8/encoder/x86/dct_mmx.asm
new file mode 100644
index 000000000..e13423796
--- /dev/null
+++ b/vp8/encoder/x86/dct_mmx.asm
@@ -0,0 +1,846 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+section .text
+        global sym(vp8_short_fdct4x4_mmx)
+        global sym(vp8_fast_fdct4x4_mmx)
+        global sym(vp8_fast_fdct8x4_wmt)
+
+
+%define DCTCONSTANTSBITS         (16)
+%define DCTROUNDINGVALUE         (1 << (DCTCONSTANTSBITS - 1))
+%define x_c1                     (60547)          ; cos(pi  /8) * (1<<16)
+%define x_c2                     (46341)          ; cos(pi*2/8) * (1<<16)
+%define x_c3                     (25080)          ; cos(pi*3/8) * (1<<16)
+
+
+%define _1STSTAGESHIFT           14
+%define _2NDSTAGESHIFT           16
+
+; matrix-multiply implementation; the source buffer has a pitch, the
+; destination buffer is contiguous
+;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
+sym(vp8_short_fdct4x4_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov     rsi, arg(0)                     ;input
+        mov     rdi, arg(1)                     ;output
+
+        movsxd  rax, dword ptr arg(2)           ;pitch
+        lea     rdx, [dct_matrix GLOBAL]
+
+        movq    mm0, [rsi]
+        movq    mm1, [rsi + rax]
+
+        movq    mm2, [rsi + rax*2]
+        lea     rsi, [rsi + rax*2]
+
+        movq    mm3, [rsi + rax]
+
+        ; first column
+        movq    mm4, mm0
+        movq    mm7, [rdx]
+
+        pmaddwd mm4, mm7
+        movq    mm5, mm1
+
+        pmaddwd mm5, mm7
+        movq    mm6, mm4
+
+        punpckldq mm4, mm5
+        punpckhdq mm6, mm5
+
+        paddd   mm4, mm6
+        movq    mm5, mm2
+
+        pmaddwd mm5, mm7
+        movq    mm6, mm3
+
+        pmaddwd mm6, mm7
+        movq    mm7, mm5
+
+        punpckldq mm5, mm6
+        punpckhdq mm7, mm6
+
+        paddd   mm5, mm7
+        movq    mm6, [dct1st_stage_rounding_mmx GLOBAL]
+
+        paddd   mm4, mm6
+        paddd   mm5, mm6
+
+        psrad   mm4, _1STSTAGESHIFT
+        psrad   mm5, _1STSTAGESHIFT
+
+        packssdw mm4, mm5
+        movq    [rdi], mm4
+
+        ;second column
+        movq    mm4, mm0
+
+        pmaddwd mm4, [rdx+8]
+        movq    mm5, mm1
+
+        pmaddwd mm5, [rdx+8]
+        movq    mm6, mm4
+
+        punpckldq mm4, mm5
+        punpckhdq mm6, mm5
+
+        paddd   mm4, mm6
+        movq    mm5, mm2
+
+        pmaddwd mm5, [rdx+8]
+        movq    mm6, mm3
+
+        pmaddwd mm6, [rdx+8]
+        movq    mm7, mm5
+
+        punpckldq mm5, mm6
+        punpckhdq mm7, mm6
+
+        paddd   mm5, mm7
+        movq    mm6, [dct1st_stage_rounding_mmx GLOBAL]
+
+        paddd   mm4, mm6
+        paddd   mm5, mm6
+
+        psrad   mm4, _1STSTAGESHIFT
+        psrad   mm5, _1STSTAGESHIFT
+
+        packssdw mm4, mm5
+        movq    [rdi+8], mm4
+
+        ;third column
+        movq    mm4, mm0
+
+        pmaddwd mm4, [rdx+16]
+        movq    mm5, mm1
+
+        pmaddwd mm5, [rdx+16]
+        movq    mm6, mm4
+
+        punpckldq mm4, mm5
+        punpckhdq mm6, mm5
+
+        paddd   mm4, mm6
+        movq    mm5, mm2
+
+        pmaddwd mm5, [rdx+16]
+        movq    mm6, mm3
+
+        pmaddwd mm6, [rdx+16]
+        movq    mm7, mm5
+
+        punpckldq mm5, mm6
+        punpckhdq mm7, mm6
+
+        paddd   mm5, mm7
+        movq    mm6, [dct1st_stage_rounding_mmx GLOBAL]
+
+        paddd   mm4, mm6
+        paddd   mm5, mm6
+
+        psrad   mm4, _1STSTAGESHIFT
+        psrad   mm5, _1STSTAGESHIFT
+
+        packssdw mm4, mm5
+        movq    [rdi+16], mm4
+
+        ;fourth column (this is the last column, so we do not have to save the source any more)
+
+        pmaddwd mm0, [rdx+24]
+
+        pmaddwd mm1, [rdx+24]
+        movq    mm6, mm0
+
+        punpckldq mm0, mm1
+        punpckhdq mm6, mm1
+
+        paddd   mm0, mm6
+
+        pmaddwd mm2, [rdx+24]
+
+        pmaddwd mm3, [rdx+24]
+        movq    mm7, mm2
+
+        punpckldq mm2, mm3
+        punpckhdq mm7, mm3
+
+        paddd   mm2, mm7
+        movq    mm6, [dct1st_stage_rounding_mmx GLOBAL]
+
+        paddd   mm0, mm6
+        paddd   mm2, mm6
+
+        psrad   mm0, _1STSTAGESHIFT
+        psrad   mm2, _1STSTAGESHIFT
+
+        packssdw mm0, mm2
+
+        movq    mm3, mm0
+
+        ; done with one pass
+        ; now start second pass; mm3 still holds the fourth row
+        movq    mm0, [rdi]
+        movq    mm1, [rdi + 8]
+        movq    mm2, [rdi + 16]
+
+        movq    mm4, mm0
+
+        pmaddwd mm4, [rdx]
+        movq    mm5, mm1
+
+        pmaddwd mm5, [rdx]
+        movq    mm6, mm4
+
+        punpckldq mm4, mm5
+        punpckhdq mm6, mm5
+
+        paddd   mm4, mm6
+        movq    mm5, mm2
+
+        pmaddwd mm5, [rdx]
+        movq    mm6, mm3
+
+        pmaddwd mm6, [rdx]
+        movq    mm7, mm5
+
+        punpckldq mm5, mm6
+        punpckhdq mm7, mm6
+
+        paddd   mm5, mm7
+        movq    mm6, [dct2nd_stage_rounding_mmx GLOBAL]
+
+        paddd   mm4, mm6
+        paddd   mm5, mm6
+
+        psrad   mm4, _2NDSTAGESHIFT
+        psrad   mm5, _2NDSTAGESHIFT
+
+        packssdw mm4, mm5
+        movq    [rdi], mm4
+
+        ;second column
+        movq    mm4, mm0
+
+        pmaddwd mm4, [rdx+8]
+        movq    mm5, mm1
+
+        pmaddwd mm5, [rdx+8]
+        movq    mm6, mm4
+
+        punpckldq mm4, mm5
+        punpckhdq mm6, mm5
+
+        paddd   mm4, mm6
+        movq    mm5, mm2
+
+        pmaddwd mm5, [rdx+8]
+        movq    mm6, mm3
+
+        pmaddwd mm6, [rdx+8]
+        movq    mm7, mm5
+
+        punpckldq mm5, mm6
+        punpckhdq mm7, mm6
+
+        paddd   mm5, mm7
+        movq    mm6, [dct2nd_stage_rounding_mmx GLOBAL]
+
+        paddd   mm4, mm6
+        paddd   mm5, mm6
+
+        psrad   mm4, _2NDSTAGESHIFT
+        psrad   mm5, _2NDSTAGESHIFT
+
+        packssdw mm4, mm5
+        movq    [rdi+8], mm4
+
+        ;third column
+        movq    mm4, mm0
+
+        pmaddwd mm4, [rdx+16]
+        movq    mm5, mm1
+
+        pmaddwd mm5, [rdx+16]
+        movq    mm6, mm4
+
+        punpckldq mm4, mm5
+        punpckhdq mm6, mm5
+
+        paddd   mm4, mm6
+        movq    mm5, mm2
+
+        pmaddwd mm5, [rdx+16]
+        movq    mm6, mm3
+
+        pmaddwd mm6, [rdx+16]
+        movq    mm7, mm5
+
+        punpckldq mm5, mm6
+        punpckhdq mm7, mm6
+
+        paddd   mm5, mm7
+        movq    mm6, [dct2nd_stage_rounding_mmx GLOBAL]
+
+        paddd   mm4, mm6
+        paddd   mm5, mm6
+
+        psrad   mm4, _2NDSTAGESHIFT
+        psrad   mm5, _2NDSTAGESHIFT
+
+        packssdw mm4, mm5
+        movq    [rdi+16], mm4
+
+        ;fourth column
+        movq    mm4, mm0
+
+        pmaddwd mm4, [rdx+24]
+        movq    mm5, mm1
+
+        pmaddwd mm5, [rdx+24]
+        movq    mm6, mm4
+
+        punpckldq mm4, mm5
+        punpckhdq mm6, mm5
+
+        paddd   mm4, mm6
+        movq    mm5, mm2
+
+        pmaddwd mm5, [rdx+24]
+        movq    mm6, mm3
+
+        pmaddwd mm6, [rdx+24]
+        movq    mm7, mm5
+
+        punpckldq mm5, mm6
+        punpckhdq mm7, mm6
+
+        paddd   mm5, mm7
+        movq    mm6, [dct2nd_stage_rounding_mmx GLOBAL]
+
+        paddd   mm4, mm6
+        paddd   mm5, mm6
+
+        psrad   mm4, _2NDSTAGESHIFT
+        psrad   mm5, _2NDSTAGESHIFT
+
+        packssdw mm4, mm5
+        movq    [rdi+24], mm4
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
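For readers following the MMX dataflow, here is a scalar C model of the routine above. It is an illustrative reconstruction with invented names, not code from this patch, and it omits the 16-bit saturation that packssdw applies to the first-pass intermediates. Each pass is a multiply by dct_matrix with the stage rounding constant added before the shift (>> is assumed arithmetic here, matching psrad).

    /* scalar model of vp8_short_fdct4x4_mmx (illustrative; names invented) */
    static const short dct_matrix_c[4][4] =
    {
        { 23170,  23170,  23170,  23170 },
        { 30274,  12540, -12540, -30274 },
        { 23170, -23170, -23170,  23170 },
        { 12540, -30274,  30274, -12540 },
    };

    static void short_fdct4x4_model(const short *input, short *output, int pitch)
    {
        int stride = pitch / 2;   /* pitch is in bytes, the data is shorts */
        int t[4][4], i, j, k, acc;

        /* first pass: t[j][i] = (input row i) . (dct_matrix row j) */
        for (j = 0; j < 4; j++)
            for (i = 0; i < 4; i++)
            {
                acc = 8192;                       /* dct1st_stage_rounding_mmx */
                for (k = 0; k < 4; k++)
                    acc += input[i * stride + k] * dct_matrix_c[j][k];
                t[j][i] = acc >> 14;              /* _1STSTAGESHIFT */
            }

        /* second pass: the same multiply against the rows produced above */
        for (j = 0; j < 4; j++)
            for (i = 0; i < 4; i++)
            {
                acc = 32768;                      /* dct2nd_stage_rounding_mmx */
                for (k = 0; k < 4; k++)
                    acc += t[i][k] * dct_matrix_c[j][k];
                output[j * 4 + i] = (short)(acc >> 16); /* _2NDSTAGESHIFT */
            }
    }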
+;void vp8_fast_fdct4x4_mmx(short *input, short *output, int pitch)
+sym(vp8_fast_fdct4x4_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov     rsi, arg(0)                     ;input
+        mov     rdi, arg(1)                     ;output
+
+        lea     rdx, [dct_const_mmx GLOBAL]
+        movsxd  rax, dword ptr arg(2)           ;pitch
+
+        lea     rcx, [rsi + rax*2]
+
+        ; read the input data
+        movq    mm0, [rsi]
+        movq    mm1, [rsi + rax]
+
+        movq    mm2, [rcx]
+        movq    mm3, [rcx + rax]
+
+        ; shift left by 1 bit for precision
+        paddw   mm0, mm0
+        paddw   mm1, mm1
+
+        psllw   mm2, 1
+        psllw   mm3, 1
+
+        ; transpose the input
+        movq    mm4, mm0                        ; 00 01 02 03
+        movq    mm5, mm2                        ; 20 21 22 23
+
+        punpcklwd mm0, mm1                      ; 00 10 01 11
+        punpckhwd mm4, mm1                      ; 02 12 03 13
+
+        punpcklwd mm2, mm3                      ; 20 30 21 31
+        punpckhwd mm5, mm3                      ; 22 32 23 33
+
+        movq    mm1, mm0                        ; 00 10 01 11
+        punpckldq mm0, mm2                      ; 00 10 20 30
+
+        punpckhdq mm1, mm2                      ; 01 11 21 31
+
+        movq    mm2, mm4                        ; 02 12 03 13
+        punpckldq mm2, mm5                      ; 02 12 22 32
+
+        punpckhdq mm4, mm5                      ; 03 13 23 33
+        movq    mm3, mm4
+
+        ; first stage
+        movq    mm5, mm0
+        movq    mm4, mm1
+
+        paddw   mm0, mm3                        ; a = 0 + 3
+        paddw   mm1, mm2                        ; b = 1 + 2
+
+        psubw   mm4, mm2                        ; c = 1 - 2
+        psubw   mm5, mm3                        ; d = 0 - 3
+
+        ; output 0 and 2
+        movq    mm6, [rdx + 16]                 ; c2
+        movq    mm2, mm0                        ; a
+
+        paddw   mm0, mm1                        ; a + b
+        psubw   mm2, mm1                        ; a - b
+
+        movq    mm1, mm0                        ; a + b
+        pmulhw  mm0, mm6                        ; 00 01 02 03
+
+        paddw   mm0, mm1                        ; output 00 01 02 03
+        pmulhw  mm6, mm2                        ; 20 21 22 23
+
+        paddw   mm2, mm6                        ; output 20 21 22 23
+
+        ; output 1 and 3
+        movq    mm6, [rdx + 8]                  ; c1
+        movq    mm7, [rdx + 24]                 ; c3
+
+        movq    mm1, mm4                        ; c
+        movq    mm3, mm5                        ; d
+
+        pmulhw  mm1, mm7                        ; c * c3
+        pmulhw  mm3, mm6                        ; d * c1
+
+        paddw   mm3, mm5                        ; d * c1 rounded
+        paddw   mm1, mm3                        ; output 10 11 12 13
+
+        movq    mm3, mm4                        ; c
+        pmulhw  mm5, mm7                        ; d * c3
+
+        pmulhw  mm4, mm6                        ; c * c1
+        paddw   mm3, mm4                        ; c * c1 rounded
+
+        psubw   mm5, mm3                        ; output 30 31 32 33
+        movq    mm3, mm5
+
+        ; done with vertical
+        ; transpose for the second stage
+        movq    mm4, mm0                        ; 00 01 02 03
+        movq    mm5, mm2                        ; 20 21 22 23
+
+        punpcklwd mm0, mm1                      ; 00 10 01 11
+        punpckhwd mm4, mm1                      ; 02 12 03 13
+
+        punpcklwd mm2, mm3                      ; 20 30 21 31
+        punpckhwd mm5, mm3                      ; 22 32 23 33
+
+        movq    mm1, mm0                        ; 00 10 01 11
+        punpckldq mm0, mm2                      ; 00 10 20 30
+
+        punpckhdq mm1, mm2                      ; 01 11 21 31
+
+        movq    mm2, mm4                        ; 02 12 03 13
+        punpckldq mm2, mm5                      ; 02 12 22 32
+
+        punpckhdq mm4, mm5                      ; 03 13 23 33
+        movq    mm3, mm4
+
+        ; first stage
+        movq    mm5, mm0
+        movq    mm4, mm1
+
+        paddw   mm0, mm3                        ; a = 0 + 3
+        paddw   mm1, mm2                        ; b = 1 + 2
+
+        psubw   mm4, mm2                        ; c = 1 - 2
+        psubw   mm5, mm3                        ; d = 0 - 3
+
+        ; output 0 and 2
+        movq    mm6, [rdx + 16]                 ; c2
+        movq    mm2, mm0                        ; a
+        paddw   mm0, mm1                        ; a + b
+
+        psubw   mm2, mm1                        ; a - b
+
+        movq    mm1, mm0                        ; a + b
+        pmulhw  mm0, mm6                        ; 00 01 02 03
+
+        paddw   mm0, mm1                        ; output 00 01 02 03
+        pmulhw  mm6, mm2                        ; 20 21 22 23
+
+        paddw   mm2, mm6                        ; output 20 21 22 23
+
+        ; output 1 and 3
+        movq    mm6, [rdx + 8]                  ; c1
+        movq    mm7, [rdx + 24]                 ; c3
+
+        movq    mm1, mm4                        ; c
+        movq    mm3, mm5                        ; d
+
+        pmulhw  mm1, mm7                        ; c * c3
+        pmulhw  mm3, mm6                        ; d * c1
+
+        paddw   mm3, mm5                        ; d * c1 rounded
+        paddw   mm1, mm3                        ; output 10 11 12 13
+
+        movq    mm3, mm4                        ; c
+        pmulhw  mm5, mm7                        ; d * c3
+
+        pmulhw  mm4, mm6                        ; c * c1
+        paddw   mm3, mm4                        ; c * c1 rounded
+
+        psubw   mm5, mm3                        ; output 30 31 32 33
+        movq    mm3, mm5
+        ; done with vertical
+
+        pcmpeqw mm4, mm4
+        pcmpeqw mm5, mm5
+        psrlw   mm4, 15                         ; all lanes = 1
+        psrlw   mm5, 15
+
+        paddw   mm0, mm4                        ; round ...
+        paddw   mm1, mm5
+        paddw   mm2, mm4
+        paddw   mm3, mm5
+
+        psraw   mm0, 1                          ; ... and drop the extra precision bit
+        psraw   mm1, 1
+        psraw   mm2, 1
+        psraw   mm3, 1
+
+        movq    [rdi], mm0
+        movq    [rdi +  8], mm1
+        movq    [rdi + 16], mm2
+        movq    [rdi + 24], mm3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
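The "fast" version above replaces the matrix multiply with a butterfly: a = x0+x3, b = x1+x2, c = x1-x2, d = x0-x3, then out0 = (a+b)*c2, out2 = (a-b)*c2, out1 = c*c3 + d*c1, out3 = d*c3 - c*c1. The multiplies use pmulhw, which keeps the high 16 bits of a signed 16x16-bit product. Because x_c1 (60547) and x_c2 (46341) do not fit in a signed short, pmulhw effectively multiplies by (c - 65536), and the asm adds the operand back afterwards (the "d * c1 rounded" paddw); x_c3 (25080) fits, so no correction is needed. A scalar model (illustrative, names invented, lane wraparound ignored):

    /* x * c / 65536, mimicking pmulhw plus the add-back for large constants */
    static short mul_q16(int x, int c)
    {
        if (c >= 32768)
            return (short)(((x * (c - 65536)) >> 16) + x); /* pmulhw + paddw */
        return (short)((x * c) >> 16);                     /* pmulhw alone   */
    }

    /* one 4-point pass, as applied to each row and then each column */
    static void fast_fdct4_pass(const short in[4], short out[4])
    {
        short a = in[0] + in[3], b = in[1] + in[2];
        short c = in[1] - in[2], d = in[0] - in[3];

        out[0] = mul_q16(a + b, 46341);                   /* (a+b) * cos(pi/4) */
        out[2] = mul_q16(a - b, 46341);                   /* (a-b) * cos(pi/4) */
        out[1] = (short)(mul_q16(c, 25080) + mul_q16(d, 60547)); /* c*c3 + d*c1 */
        out[3] = (short)(mul_q16(d, 25080) - mul_q16(c, 60547)); /* d*c3 - c*c1 */
    }

    static void fast_fdct4x4_model(const short *input, short *output, int pitch)
    {
        short h[4][4], in[4], out[4];
        int i, u, v, stride = pitch / 2;

        for (i = 0; i < 4; i++)                  /* rows, scaled up 1 bit */
        {
            for (v = 0; v < 4; v++)
                in[v] = (short)(input[i * stride + v] * 2);
            fast_fdct4_pass(in, h[i]);
        }
        for (v = 0; v < 4; v++)                  /* then columns */
        {
            for (i = 0; i < 4; i++)
                in[i] = h[i][v];
            fast_fdct4_pass(in, out);
            for (u = 0; u < 4; u++)              /* round away the extra bit */
                output[u * 4 + v] = (short)((out[u] + 1) >> 1);
        }
    }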
+;void vp8_fast_fdct8x4_wmt(short *input, short *output, int pitch)
+sym(vp8_fast_fdct8x4_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov     rsi, arg(0)                     ;input
+        mov     rdi, arg(1)                     ;output
+
+        lea     rdx, [dct_const_xmm GLOBAL]
+        movsxd  rax, dword ptr arg(2)           ;pitch
+
+        lea     rcx, [rsi + rax*2]
+
+        ; read the input data
+        movdqa  xmm0, [rsi]
+        movdqa  xmm2, [rsi + rax]
+
+        movdqa  xmm4, [rcx]
+        movdqa  xmm3, [rcx + rax]
+
+        ; shift left by 1 bit for precision
+        psllw   xmm0, 1
+        psllw   xmm2, 1
+
+        psllw   xmm4, 1
+        psllw   xmm3, 1
+
+        ; transpose the input
+        movdqa  xmm1, xmm0                      ; 00 01 02 03 04 05 06 07
+        movdqa  xmm5, xmm4                      ; 20 21 22 23 24 25 26 27
+
+        punpcklwd xmm0, xmm2                    ; 00 10 01 11 02 12 03 13
+        punpckhwd xmm1, xmm2                    ; 04 14 05 15 06 16 07 17
+
+        punpcklwd xmm4, xmm3                    ; 20 30 21 31 22 32 23 33
+        punpckhwd xmm5, xmm3                    ; 24 34 25 35 26 36 27 37
+
+        movdqa  xmm2, xmm0                      ; 00 10 01 11 02 12 03 13
+        punpckldq xmm0, xmm4                    ; 00 10 20 30 01 11 21 31
+
+        punpckhdq xmm2, xmm4                    ; 02 12 22 32 03 13 23 33
+
+        movdqa  xmm4, xmm1                      ; 04 14 05 15 06 16 07 17
+        punpckldq xmm4, xmm5                    ; 04 14 24 34 05 15 25 35
+
+        punpckhdq xmm1, xmm5                    ; 06 16 26 36 07 17 27 37
+        movdqa  xmm3, xmm2                      ; 02 12 22 32 03 13 23 33
+
+        punpckhqdq xmm3, xmm1                   ; 03 13 23 33 07 17 27 37
+        punpcklqdq xmm2, xmm1                   ; 02 12 22 32 06 16 26 36
+
+        movdqa  xmm1, xmm0                      ; 00 10 20 30 01 11 21 31
+        punpcklqdq xmm0, xmm4                   ; 00 10 20 30 04 14 24 34
+
+        punpckhqdq xmm1, xmm4                   ; 01 11 21 31 05 15 25 35
+
+        ; xmm0 0
+        ; xmm1 1
+        ; xmm2 2
+        ; xmm3 3
+
+        ; first stage
+        movdqa  xmm5, xmm0
+        movdqa  xmm4, xmm1
+
+        paddw   xmm0, xmm3                      ; a = 0 + 3
+        paddw   xmm1, xmm2                      ; b = 1 + 2
+
+        psubw   xmm4, xmm2                      ; c = 1 - 2
+        psubw   xmm5, xmm3                      ; d = 0 - 3
+
+        ; output 0 and 2
+        movdqa  xmm6, [rdx + 32]                ; c2
+        movdqa  xmm2, xmm0                      ; a
+
+        paddw   xmm0, xmm1                      ; a + b
+        psubw   xmm2, xmm1                      ; a - b
+
+        movdqa  xmm1, xmm0                      ; a + b
+        pmulhw  xmm0, xmm6                      ; 00 01 02 03
+
+        paddw   xmm0, xmm1                      ; output 00 01 02 03
+        pmulhw  xmm6, xmm2                      ; 20 21 22 23
+
+        paddw   xmm2, xmm6                      ; output 20 21 22 23
+
+        ; output 1 and 3
+        movdqa  xmm6, [rdx + 16]                ; c1
+        movdqa  xmm7, [rdx + 48]                ; c3
+
+        movdqa  xmm1, xmm4                      ; c
+        movdqa  xmm3, xmm5                      ; d
+
+        pmulhw  xmm1, xmm7                      ; c * c3
+        pmulhw  xmm3, xmm6                      ; d * c1
+
+        paddw   xmm3, xmm5                      ; d * c1 rounded
+        paddw   xmm1, xmm3                      ; output 10 11 12 13
+
+        movdqa  xmm3, xmm4                      ; c
+        pmulhw  xmm5, xmm7                      ; d * c3
+
+        pmulhw  xmm4, xmm6                      ; c * c1
+        paddw   xmm3, xmm4                      ; c * c1 rounded
+
+        psubw   xmm5, xmm3                      ; output 30 31 32 33
+        movdqa  xmm3, xmm5
+
+        ; done with vertical
+        ; transpose for the second stage
+        movdqa  xmm4, xmm2                      ; 02 12 22 32 06 16 26 36
+        movdqa  xmm2, xmm1                      ; 01 11 21 31 05 15 25 35
+
+        movdqa  xmm1, xmm0                      ; 00 10 20 30 04 14 24 34
+        movdqa  xmm5, xmm4                      ; 02 12 22 32 06 16 26 36
+
+        punpcklwd xmm0, xmm2                    ; 00 01 10 11 20 21 30 31
+        punpckhwd xmm1, xmm2                    ; 04 05 14 15 24 25 34 35
+
+        punpcklwd xmm4, xmm3                    ; 02 03 12 13 22 23 32 33
+        punpckhwd xmm5, xmm3                    ; 06 07 16 17 26 27 36 37
+
+        movdqa  xmm2, xmm0                      ; 00 01 10 11 20 21 30 31
+        punpckldq xmm0, xmm4                    ; 00 01 02 03 10 11 12 13
+
+        punpckhdq xmm2, xmm4                    ; 20 21 22 23 30 31 32 33
+
+        movdqa  xmm4, xmm1                      ; 04 05 14 15 24 25 34 35
+        punpckldq xmm4, xmm5                    ; 04 05 06 07 14 15 16 17
+
+        punpckhdq xmm1, xmm5                    ; 24 25 26 27 34 35 36 37
+        movdqa  xmm3, xmm2                      ; 20 21 22 23 30 31 32 33
+
+        punpckhqdq xmm3, xmm1                   ; 30 31 32 33 34 35 36 37
+        punpcklqdq xmm2, xmm1                   ; 20 21 22 23 24 25 26 27
+
+        movdqa  xmm1, xmm0                      ; 00 01 02 03 10 11 12 13
+        punpcklqdq xmm0, xmm4                   ; 00 01 02 03 04 05 06 07
+
+        punpckhqdq xmm1, xmm4                   ; 10 11 12 13 14 15 16 17
+
+        ; first stage
+        movdqa  xmm5, xmm0
+        movdqa  xmm4, xmm1
+
+        paddw   xmm0, xmm3                      ; a = 0 + 3
+        paddw   xmm1, xmm2                      ; b = 1 + 2
+
+        psubw   xmm4, xmm2                      ; c = 1 - 2
+        psubw   xmm5, xmm3                      ; d = 0 - 3
+
+        ; output 0 and 2
+        movdqa  xmm6, [rdx + 32]                ; c2
+        movdqa  xmm2, xmm0                      ; a
+
+        paddw   xmm0, xmm1                      ; a + b
+        psubw   xmm2, xmm1                      ; a - b
+
+        movdqa  xmm1, xmm0                      ; a + b
+        pmulhw  xmm0, xmm6                      ; 00 01 02 03
+
+        paddw   xmm0, xmm1                      ; output 00 01 02 03
+        pmulhw  xmm6, xmm2                      ; 20 21 22 23
+
+        paddw   xmm2, xmm6                      ; output 20 21 22 23
+
+        ; output 1 and 3
+        movdqa  xmm6, [rdx + 16]                ; c1
+        movdqa  xmm7, [rdx + 48]                ; c3
+
+        movdqa  xmm1, xmm4                      ; c
+        movdqa  xmm3, xmm5                      ; d
+
+        pmulhw  xmm1, xmm7                      ; c * c3
+        pmulhw  xmm3, xmm6                      ; d * c1
+
+        paddw   xmm3, xmm5                      ; d * c1 rounded
+        paddw   xmm1, xmm3                      ; output 10 11 12 13
+
+        movdqa  xmm3, xmm4                      ; c
+        pmulhw  xmm5, xmm7                      ; d * c3
+
+        pmulhw  xmm4, xmm6                      ; c * c1
+        paddw   xmm3, xmm4                      ; c * c1 rounded
+
+        psubw   xmm5, xmm3                      ; output 30 31 32 33
+        movdqa  xmm3, xmm5
+        ; done with vertical
+
+        pcmpeqw xmm4, xmm4
+        pcmpeqw xmm5, xmm5
+        psrlw   xmm4, 15                        ; all lanes = 1
+        psrlw   xmm5, 15
+
+        paddw   xmm0, xmm4                      ; round ...
+        paddw   xmm1, xmm5
+        paddw   xmm2, xmm4
+        paddw   xmm3, xmm5
+
+        psraw   xmm0, 1                         ; ... and drop the extra precision bit
+        psraw   xmm1, 1
+        psraw   xmm2, 1
+        psraw   xmm3, 1
+
+        movq    QWORD PTR[rdi],      xmm0
+        movq    QWORD PTR[rdi +  8], xmm1
+        movq    QWORD PTR[rdi + 16], xmm2
+        movq    QWORD PTR[rdi + 24], xmm3
+
+        psrldq  xmm0, 8
+        psrldq  xmm1, 8
+        psrldq  xmm2, 8
+        psrldq  xmm3, 8
+
+        movq    QWORD PTR[rdi + 32], xmm0
+        movq    QWORD PTR[rdi + 40], xmm1
+        movq    QWORD PTR[rdi + 48], xmm2
+        movq    QWORD PTR[rdi + 56], xmm3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
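The SSE2 ("wmt") routine above runs the same dataflow on 128-bit registers, carrying two horizontally adjacent 4x4 blocks at once: the low halves of the xmm registers hold columns 0-3 and the high halves columns 4-7, and the epilogue stores the left block's 16 coefficients followed by the right block's. Functionally it should match two calls to the 4x4 routine (sketch, reusing the hypothetical scalar model above):

    void fast_fdct8x4_model(const short *input, short *output, int pitch)
    {
        fast_fdct4x4_model(input,     output,      pitch);  /* left 4x4  */
        fast_fdct4x4_model(input + 4, output + 16, pitch);  /* right 4x4 */
    }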
+
+SECTION_RODATA
+;static const unsigned int dct1st_stage_rounding_mmx[2] =
+align 16
+dct1st_stage_rounding_mmx:
+    times 2 dd 8192
+
+
+;static const unsigned int dct2nd_stage_rounding_mmx[2] =
+align 16
+dct2nd_stage_rounding_mmx:
+    times 2 dd 32768
+
+
+;static const short dct_matrix[4][4] =
+align 16
+dct_matrix:
+    times 4 dw 23170
+
+    dw  30274
+    dw  12540
+    dw -12540
+    dw -30274
+
+    dw  23170
+    times 2 dw -23170
+    dw  23170
+
+    dw  12540
+    dw -30274
+    dw  30274
+    dw -12540
+
+
+;static const unsigned short dct_const_mmx[4 * 4] =
+align 16
+dct_const_mmx:
+    times 4 dw 0
+    times 4 dw 60547
+    times 4 dw 46341
+    times 4 dw 25080
+
+
+;static const unsigned short dct_const_xmm[8 * 4] =
+align 16
+dct_const_xmm:
+    times 8 dw 0
+    times 8 dw 60547
+    times 8 dw 46341
+    times 8 dw 25080
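The tables store the same three cosines at two fixed-point scales: dct_matrix holds round(cos(k*pi/8) * 2^15) for the matrix-multiply path, while dct_const_mmx and dct_const_xmm hold round(cos(k*pi/8) * 2^16) for the pmulhw path, declared unsigned so that 60547 and 46341 deliberately wrap negative when pmulhw reads them as signed words. A quick check (illustrative, not part of the patch):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        const double pi = 3.14159265358979323846;
        printf("%5.0f %5.0f %5.0f\n",             /* 23170 30274 12540 */
               floor(cos(pi / 4) * 32768 + 0.5),
               floor(cos(pi / 8) * 32768 + 0.5),
               floor(cos(3 * pi / 8) * 32768 + 0.5));
        printf("%5.0f %5.0f %5.0f\n",             /* 46341 60547 25080 */
               floor(cos(pi / 4) * 65536 + 0.5),
               floor(cos(pi / 8) * 65536 + 0.5),
               floor(cos(3 * pi / 8) * 65536 + 0.5));
        return 0;
    }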
--
cgit v1.2.3