; ; Copyright (c) 2012 The WebM project authors. All Rights Reserved. ; ; Use of this source code is governed by a BSD-style license ; that can be found in the LICENSE file in the root of the source ; tree. An additional intellectual property rights grant can be found ; in the file PATENTS. All contributing project authors may ; be found in the AUTHORS file in the root of the source tree. ; %include "vpx_ports/x86inc.asm" SECTION_RODATA align 16 x_s1sqr2: times 4 dw 0x8A8C align 16 x_c1sqr2less1: times 4 dw 0x4E7B align 16 pw_16: times 4 dw 16 SECTION .text INIT_MMX ;void dequantize_b_impl_mmx(short *sq, short *dq, short *q) cglobal dequantize_b_impl_mmx, 3,3,0,sq,dq,arg3 mova m1, [sqq] pmullw m1, [arg3q+0] ; mm4 *= kernel 0 modifiers. mova [dqq+ 0], m1 mova m1, [sqq+8] pmullw m1, [arg3q+8] ; mm4 *= kernel 0 modifiers. mova [dqq+ 8], m1 mova m1, [sqq+16] pmullw m1, [arg3q+16] ; mm4 *= kernel 0 modifiers. mova [dqq+16], m1 mova m1, [sqq+24] pmullw m1, [arg3q+24] ; mm4 *= kernel 0 modifiers. mova [dqq+24], m1 RET ;void dequant_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride) cglobal dequant_idct_add_mmx, 6,6,0,inp,dq,pred,dest,pit,stride mova m0, [inpq+ 0] pmullw m0, [dqq] mova m1, [inpq+ 8] pmullw m1, [dqq+ 8] mova m2, [inpq+16] pmullw m2, [dqq+16] mova m3, [inpq+24] pmullw m3, [dqq+24] pxor m7, m7 mova [inpq], m7 mova [inpq+8], m7 mova [inpq+16], m7 mova [inpq+24], m7 psubw m0, m2 ; b1= 0-2 paddw m2, m2 ; mova m5, m1 paddw m2, m0 ; a1 =0+2 pmulhw m5, [x_s1sqr2]; paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2) mova m7, m3 ; pmulhw m7, [x_c1sqr2less1]; paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2) psubw m7, m5 ; c1 mova m5, m1 mova m4, m3 pmulhw m5, [x_c1sqr2less1] paddw m5, m1 pmulhw m3, [x_s1sqr2] paddw m3, m4 paddw m3, m5 ; d1 mova m6, m2 ; a1 mova m4, m0 ; b1 paddw m2, m3 ;0 paddw m4, m7 ;1 psubw m0, m7 ;2 psubw m6, m3 ;3 mova m1, m2 ; 03 02 01 00 mova m3, m4 ; 23 22 21 20 punpcklwd m1, m0 ; 11 01 10 00 punpckhwd m2, m0 ; 13 03 12 02 punpcklwd m3, m6 ; 31 21 30 20 punpckhwd m4, m6 ; 33 23 32 22 mova m0, m1 ; 11 01 10 00 mova m5, m2 ; 13 03 12 02 punpckldq m0, m3 ; 30 20 10 00 punpckhdq m1, m3 ; 31 21 11 01 punpckldq m2, m4 ; 32 22 12 02 punpckhdq m5, m4 ; 33 23 13 03 mova m3, m5 ; 33 23 13 03 psubw m0, m2 ; b1= 0-2 paddw m2, m2 ; mova m5, m1 paddw m2, m0 ; a1 =0+2 pmulhw m5, [x_s1sqr2]; paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2) mova m7, m3 ; pmulhw m7, [x_c1sqr2less1]; paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2) psubw m7, m5 ; c1 mova m5, m1 mova m4, m3 pmulhw m5, [x_c1sqr2less1] paddw m5, m1 pmulhw m3, [x_s1sqr2] paddw m3, m4 paddw m3, m5 ; d1 paddw m0, [pw_16] paddw m2, [pw_16] mova m6, m2 ; a1 mova m4, m0 ; b1 paddw m2, m3 ;0 paddw m4, m7 ;1 psubw m0, m7 ;2 psubw m6, m3 ;3 psraw m2, 5 psraw m0, 5 psraw m4, 5 psraw m6, 5 mova m1, m2 ; 03 02 01 00 mova m3, m4 ; 23 22 21 20 punpcklwd m1, m0 ; 11 01 10 00 punpckhwd m2, m0 ; 13 03 12 02 punpcklwd m3, m6 ; 31 21 30 20 punpckhwd m4, m6 ; 33 23 32 22 mova m0, m1 ; 11 01 10 00 mova m5, m2 ; 13 03 12 02 punpckldq m0, m3 ; 30 20 10 00 punpckhdq m1, m3 ; 31 21 11 01 punpckldq m2, m4 ; 32 22 12 02 punpckhdq m5, m4 ; 33 23 13 03 pxor m7, m7 movh m4, [predq] punpcklbw m4, m7 paddsw m0, m4 packuswb m0, m7 movh [destq], m0 movh m4, [predq+pitq] punpcklbw m4, m7 paddsw m1, m4 packuswb m1, m7 movh [destq+strideq], m1 movh m4, [predq+2*pitq] punpcklbw m4, m7 paddsw m2, m4 packuswb m2, m7 movh [destq+strideq*2], m2 add destq, strideq add predq, pitq movh m4, [predq+2*pitq] punpcklbw m4, m7 paddsw m5, m4 packuswb m5, m7 movh [destq+strideq*2], m5 RET ;void dequant_dc_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc) cglobal dequant_dc_idct_add_mmx, 7,7,0,inp,dq,pred,dest,pit,stride,Dc mova m0, [inpq+ 0] pmullw m0, [dqq+ 0] mova m1, [inpq+ 8] pmullw m1, [dqq+ 8] mova m2, [inpq+16] pmullw m2, [dqq+16] mova m3, [inpq+24] pmullw m3, [dqq+24] pxor m7, m7 mova [inpq+ 0], m7 mova [inpq+ 8], m7 mova [inpq+16], m7 mova [inpq+24], m7 ; move lower word of Dc to lower word of m0 psrlq m0, 16 psllq m0, 16 and Dcq, 0xFFFF ; If Dc < 0, we don't want the full dword precision. movh m7, Dcq por m0, m7 psubw m0, m2 ; b1= 0-2 paddw m2, m2 ; mova m5, m1 paddw m2, m0 ; a1 =0+2 pmulhw m5, [x_s1sqr2]; paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2) mova m7, m3 ; pmulhw m7, [x_c1sqr2less1]; paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2) psubw m7, m5 ; c1 mova m5, m1 mova m4, m3 pmulhw m5, [x_c1sqr2less1] paddw m5, m1 pmulhw m3, [x_s1sqr2] paddw m3, m4 paddw m3, m5 ; d1 mova m6, m2 ; a1 mova m4, m0 ; b1 paddw m2, m3 ;0 paddw m4, m7 ;1 psubw m0, m7 ;2 psubw m6, m3 ;3 mova m1, m2 ; 03 02 01 00 mova m3, m4 ; 23 22 21 20 punpcklwd m1, m0 ; 11 01 10 00 punpckhwd m2, m0 ; 13 03 12 02 punpcklwd m3, m6 ; 31 21 30 20 punpckhwd m4, m6 ; 33 23 32 22 mova m0, m1 ; 11 01 10 00 mova m5, m2 ; 13 03 12 02 punpckldq m0, m3 ; 30 20 10 00 punpckhdq m1, m3 ; 31 21 11 01 punpckldq m2, m4 ; 32 22 12 02 punpckhdq m5, m4 ; 33 23 13 03 mova m3, m5 ; 33 23 13 03 psubw m0, m2 ; b1= 0-2 paddw m2, m2 ; mova m5, m1 paddw m2, m0 ; a1 =0+2 pmulhw m5, [x_s1sqr2]; paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2) mova m7, m3 ; pmulhw m7, [x_c1sqr2less1]; paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2) psubw m7, m5 ; c1 mova m5, m1 mova m4, m3 pmulhw m5, [x_c1sqr2less1] paddw m5, m1 pmulhw m3, [x_s1sqr2] paddw m3, m4 paddw m3, m5 ; d1 paddw m0, [pw_16] paddw m2, [pw_16] mova m6, m2 ; a1 mova m4, m0 ; b1 paddw m2, m3 ;0 paddw m4, m7 ;1 psubw m0, m7 ;2 psubw m6, m3 ;3 psraw m2, 5 psraw m0, 5 psraw m4, 5 psraw m6, 5 mova m1, m2 ; 03 02 01 00 mova m3, m4 ; 23 22 21 20 punpcklwd m1, m0 ; 11 01 10 00 punpckhwd m2, m0 ; 13 03 12 02 punpcklwd m3, m6 ; 31 21 30 20 punpckhwd m4, m6 ; 33 23 32 22 mova m0, m1 ; 11 01 10 00 mova m5, m2 ; 13 03 12 02 punpckldq m0, m3 ; 30 20 10 00 punpckhdq m1, m3 ; 31 21 11 01 punpckldq m2, m4 ; 32 22 12 02 punpckhdq m5, m4 ; 33 23 13 03 pxor m7, m7 movh m4, [predq] punpcklbw m4, m7 paddsw m0, m4 packuswb m0, m7 movh [destq], m0 movh m4, [predq+pitq] punpcklbw m4, m7 paddsw m1, m4 packuswb m1, m7 movh [destq+strideq], m1 movh m4, [predq+2*pitq] punpcklbw m4, m7 paddsw m2, m4 packuswb m2, m7 movh [destq+strideq*2], m2 add destq, strideq add predq, pitq movh m4, [predq+2*pitq] punpcklbw m4, m7 paddsw m5, m4 packuswb m5, m7 movh [destq+strideq*2], m5 RET