;
;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA

; 16-bit fixed-point multipliers (scaled by 65536), replicated over 4 lanes.
align 16
x_s1sqr2:       times 4 dw 0x8A8C       ; sqrt(2) * sin(pi/8)     = 35468
align 16
x_c1sqr2less1:  times 4 dw 0x4E7B       ; sqrt(2) * cos(pi/8) - 1 = 20091
align 16
pw_16:          times 4 dw 16           ; rounding bias applied before ">> 5"

SECTION .text

; /****************************************************************************
; * Notes:
; *
; * This implementation makes use of 16 bit fixed point versions of two
; * multiply constants:
; *        1.   sqrt(2) * cos (pi/8)
; *        2.   sqrt(2) * sin (pi/8)
; * Because the first constant is bigger than 1, to maintain the same 16 bit
; * fixed point precision as the second one, we use the trick
; *        x * a = x + x*(a-1)
; * so
; *        x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) * cos(pi/8) - 1).
; *
; * For the second constant, its 16 bit version is 35468, which is bigger
; * than 32768, so in a signed 16 bit multiply it becomes a negative number:
; *        (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x
; *
; * In both cases the multiply is therefore "pmulhw" followed by "paddw x".
; *
; **************************************************************************/

;-----------------------------------------------------------------------------
; IDCT4_ROTATE
;   First half of a 4-point 1-D inverse transform, four lanes at a time.
;   in:       m0, m1, m2, m3 = ip0, ip1, ip2, ip3
;   out:      m2 = a1 = ip0 + ip2
;             m0 = b1 = ip0 - ip2
;             m7 = c1 = ip3*sqrt(2)*cos(pi/8) - ip1*sqrt(2)*sin(pi/8)
;             m3 = d1 = ip1*sqrt(2)*cos(pi/8) + ip3*sqrt(2)*sin(pi/8)
;   clobbers: m4, m5 (m1 is only read)
;-----------------------------------------------------------------------------
%macro IDCT4_ROTATE 0
    psubw           m0, m2                  ; b1 = ip0 - ip2
    paddw           m2, m2                  ; 2 * ip2
    mova            m5, m1
    paddw           m2, m0                  ; a1 = 2*ip2 + (ip0 - ip2) = ip0 + ip2

    pmulhw          m5, [x_s1sqr2]
    paddw           m5, m1                  ; ip1 * sin(pi/8) * sqrt(2)
    mova            m7, m3
    pmulhw          m7, [x_c1sqr2less1]
    paddw           m7, m3                  ; ip3 * cos(pi/8) * sqrt(2)
    psubw           m7, m5                  ; c1

    mova            m5, m1
    mova            m4, m3                  ; keep ip3 for the "+ x" fix-up
    pmulhw          m5, [x_c1sqr2less1]
    paddw           m5, m1                  ; ip1 * cos(pi/8) * sqrt(2)
    pmulhw          m3, [x_s1sqr2]
    paddw           m3, m4                  ; ip3 * sin(pi/8) * sqrt(2)
    paddw           m3, m5                  ; d1
%endmacro

;-----------------------------------------------------------------------------
; IDCT4_BUTTERFLY
;   Second half of the 1-D transform.
;   in:       m2 = a1, m0 = b1, m7 = c1, m3 = d1
;   out:      m2 = a1 + d1
;             m0 = b1 - c1
;             m4 = b1 + c1
;             m6 = a1 - d1
;   IDCT4_TRANSPOSE consumes these as rows 0..3 in the order m2, m0, m4, m6.
;-----------------------------------------------------------------------------
%macro IDCT4_BUTTERFLY 0
    mova            m6, m2                  ; a1
    mova            m4, m0                  ; b1
    paddw           m2, m3                  ; a1 + d1
    paddw           m4, m7                  ; b1 + c1
    psubw           m0, m7                  ; b1 - c1
    psubw           m6, m3                  ; a1 - d1
%endmacro

;-----------------------------------------------------------------------------
; IDCT4_TRANSPOSE
;   Transposes a 4x4 block of 16-bit words.
;   in:       rows 0..3 in m2, m0, m4, m6
;   out:      rows 0..3 in m0, m1, m2, m5
;   clobbers: m3, m4
;   Lane comments read "<row><col>", most significant word first.
;-----------------------------------------------------------------------------
%macro IDCT4_TRANSPOSE 0
    mova            m1, m2                  ; 03 02 01 00
    mova            m3, m4                  ; 23 22 21 20
    punpcklwd       m1, m0                  ; 11 01 10 00
    punpckhwd       m2, m0                  ; 13 03 12 02
    punpcklwd       m3, m6                  ; 31 21 30 20
    punpckhwd       m4, m6                  ; 33 23 32 22

    mova            m0, m1                  ; 11 01 10 00
    mova            m5, m2                  ; 13 03 12 02
    punpckldq       m0, m3                  ; 30 20 10 00
    punpckhdq       m1, m3                  ; 31 21 11 01
    punpckldq       m2, m4                  ; 32 22 12 02
    punpckhdq       m5, m4                  ; 33 23 13 03
%endmacro

INIT_MMX

;-----------------------------------------------------------------------------
; void short_idct4x4llm_mmx(short *input, short *output, int pitch)
;
; Full 4x4 inverse transform.
;   input  : 16 contiguous 16-bit coefficients (read as four 8-byte rows).
;   output : four rows of four 16-bit results; row k is stored at
;            output + k * pitch, so pitch is a byte offset.
;   pitch  : 32-bit int; sign-extended here before it is used in addressing.
;
; The 1-D transform runs twice with a transpose after each run. The second
; run adds 16 and shifts right by 5 to round the result.
; Uses mm0-mm7. No emms is issued in this function body.
;-----------------------------------------------------------------------------
cglobal short_idct4x4llm_mmx, 3,3,0, inp, out, pit
%if ARCH_X86_64
    ; An int argument only defines the low 32 bits of its register.
    movsxd          pitq, pitd
%endif
    mova            m0, [inpq+ 0]
    mova            m1, [inpq+ 8]
    mova            m2, [inpq+16]
    mova            m3, [inpq+24]

    ; ---- first pass ----
    IDCT4_ROTATE
    IDCT4_BUTTERFLY
    IDCT4_TRANSPOSE
    mova            m3, m5                  ; 33 23 13 03 -> ip3 for next pass

    ; ---- second pass, with rounding ----
    IDCT4_ROTATE
    paddw           m0, [pw_16]             ; bias b1 and a1 so that all four
    paddw           m2, [pw_16]             ; outputs carry the +16 rounding
    IDCT4_BUTTERFLY
    psraw           m2, 5
    psraw           m0, 5
    psraw           m4, 5
    psraw           m6, 5
    IDCT4_TRANSPOSE

    mova            [outq], m0
    mova            [outq+pitq], m1
    mova            [outq+pitq*2], m2
    add             outq, pitq
    mova            [outq+pitq*2], m5       ; output + 3 * pitch
    RET

;-----------------------------------------------------------------------------
; void short_idct4x4llm_1_mmx(short *input, short *output, int pitch)
;
; DC-only inverse transform: computes (input[0] + 16) >> 5 and writes that
; value to all four 16-bit lanes of four output rows (row k at
; output + k * pitch, pitch in bytes).
;-----------------------------------------------------------------------------
cglobal short_idct4x4llm_1_mmx, 3,3,0, inp, out, pit
%if ARCH_X86_64
    ; An int argument only defines the low 32 bits of its register.
    movsxd          pitq, pitd
%endif
    movh            m0, [inpq]              ; only the low word is kept below
    paddw           m0, [pw_16]
    psraw           m0, 5
    punpcklwd       m0, m0                  ; broadcast word 0 ...
    punpckldq       m0, m0                  ; ... to all four lanes

    mova            [outq], m0
    mova            [outq+pitq], m0
    mova            [outq+pitq*2], m0
    add             outq, pitq
    mova            [outq+pitq*2], m0       ; output + 3 * pitch
    RET

;-----------------------------------------------------------------------------
; void dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr,
;                           unsigned char *dst_ptr, int pitch, int stride)
;
; Adds the rounded DC term (input_dc + 16) >> 5 to a 4x4 block of predictor
; bytes and stores the result, clamped to 0..255, into dst.
;   pred_ptr / pitch  : source rows, row k at pred_ptr + k * pitch
;   dst_ptr  / stride : destination rows, row k at dst_ptr + k * stride
; Each row is moved with movh, as in the other 4-wide accesses of this file.
;-----------------------------------------------------------------------------
cglobal dc_only_idct_add_mmx, 4,5,0,in_dc,pred,dst,pit,stride
%if ARCH_X86_64
    ; Both int arguments are used in addressing and so need sign extension.
    movsxd          pitq, pitd
    movsxd          strideq, dword stridem
%else
    mov             strideq, stridem
%endif
    pxor            m0, m0                  ; zero, for byte<->word conversion

    movh            m5, in_dcq              ; dc (only the low word is kept)
    paddw           m5, [pw_16]
    psraw           m5, 5
    punpcklwd       m5, m5
    punpckldq       m5, m5                  ; dc in all four lanes

    ; Per row: widen bytes to words, add dc with signed saturation, then
    ; pack back to bytes with unsigned saturation (clamps to 0..255).
    movh            m1, [predq]
    punpcklbw       m1, m0
    paddsw          m1, m5
    packuswb        m1, m0
    movh            [dstq], m1

    movh            m2, [predq+pitq]
    punpcklbw       m2, m0
    paddsw          m2, m5
    packuswb        m2, m0
    movh            [dstq+strideq], m2

    movh            m3, [predq+2*pitq]
    punpcklbw       m3, m0
    paddsw          m3, m5
    packuswb        m3, m0
    movh            [dstq+2*strideq], m3

    add             dstq, strideq
    add             predq, pitq
    movh            m4, [predq+2*pitq]      ; pred + 3 * pitch
    punpcklbw       m4, m0
    paddsw          m4, m5
    packuswb        m4, m0
    movh            [dstq+2*strideq], m4    ; dst + 3 * stride
    RET