summaryrefslogtreecommitdiff
path: root/vp9/common/x86/vp9_idctllm_mmx.asm
diff options
context:
space:
mode:
authorJohn Koleszar <jkoleszar@google.com>2012-11-27 13:59:17 -0800
committerJohn Koleszar <jkoleszar@google.com>2012-11-27 14:12:30 -0800
commitfcccbcbb395ce4cf31b54ce1245cc28e5e3ef4c1 (patch)
tree68e128e48e3f5ab1de1c163fa3a12ea47f5d8d51 /vp9/common/x86/vp9_idctllm_mmx.asm
parent3bf7b131c8ebb6b4d63a8b70d69066dcbc4ed896 (diff)
downloadlibvpx-fcccbcbb395ce4cf31b54ce1245cc28e5e3ef4c1.tar
libvpx-fcccbcbb395ce4cf31b54ce1245cc28e5e3ef4c1.tar.gz
libvpx-fcccbcbb395ce4cf31b54ce1245cc28e5e3ef4c1.tar.bz2
libvpx-fcccbcbb395ce4cf31b54ce1245cc28e5e3ef4c1.zip
Add vp9_ prefix to all vp9 files
Support for gyp which doesn't support multiple objects in the same static library having the same basename. Change-Id: Ib947eefbaf68f8b177a796d23f875ccdfa6bc9dc
Diffstat (limited to 'vp9/common/x86/vp9_idctllm_mmx.asm')
-rw-r--r--vp9/common/x86/vp9_idctllm_mmx.asm241
1 files changed, 241 insertions, 0 deletions
diff --git a/vp9/common/x86/vp9_idctllm_mmx.asm b/vp9/common/x86/vp9_idctllm_mmx.asm
new file mode 100644
index 000000000..15e81addb
--- /dev/null
+++ b/vp9/common/x86/vp9_idctllm_mmx.asm
@@ -0,0 +1,241 @@
+;
+; Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+align 16
+x_s1sqr2: times 4 dw 0x8A8C ; 35468 ~ sqrt(2) * sin(pi/8) * 65536; negative when read as a signed word
+align 16
+x_c1sqr2less1: times 4 dw 0x4E7B ; 20091 ~ (sqrt(2) * cos(pi/8) - 1) * 65536
+align 16
+pw_16: times 4 dw 16 ; rounding term added before the arithmetic shift right by 5
+
+SECTION .text
+
+
+; /****************************************************************************
+; * Notes:
+; *
+; * This implementation makes use of 16 bit fixed point versions of two
+; * multiply constants:
+; * 1. sqrt(2) * cos (pi/8)
+; * 2. sqrt(2) * sin (pi/8)
+; * Because the first constant is bigger than 1, to maintain the same 16 bit
+; * fixed point precision as the second one, we use a trick of
+; * x * a = x + x*(a-1)
+; * so
+; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
+; *
+; * For the second constant, the 16 bit version is 35468, which is bigger
+; * than 32767, so in a signed 16 bit multiply it acts as the negative number
+; * 35468 - 65536. Adding x back after the multiply compensates for that:
+; * (x * (unsigned)35468 >> 16) = (x * (signed)35468 >> 16) + x
+; *
+; **************************************************************************/
+
+INIT_MMX
+
+;void short_idct4x4llm_mmx(short *input, short *output, int pitch)
+; 4x4 inverse DCT. Pass 1 runs down the columns; after a transpose, pass 2
+; runs along the rows and rounds with (x + 16) >> 5. The result is transposed
+; back and stored as four rows of 4 shorts. Uses m0-m7.
+;   input  - 16 coefficients, read as four 8 byte rows
+;   output - address of the first output row
+;   pitch  - distance in bytes between output rows
+cglobal short_idct4x4llm_mmx, 3,3,0, inp, out, pit
+%if ARCH_X86_64
+    movsxd      pitq, pitd              ; int arg: upper 32 bits of the register are unspecified
+%endif
+
+    ; ---- pass 1: m0..m3 = input rows 0..3, one column per 16 bit lane ----
+    mova        m0, [inpq +0]
+    mova        m1, [inpq +8]
+    mova        m2, [inpq+16]
+    mova        m3, [inpq+24]
+
+    ; even part: a1 is rebuilt from b1 as (ip0 - ip2) + 2 * ip2
+    psubw       m0, m2                  ; b1 = ip0 - ip2
+    paddw       m2, m2                  ; 2 * ip2
+    mova        m5, m1
+    paddw       m2, m0                  ; a1 = ip0 + ip2
+
+    ; odd part: x_s1sqr2 is negative as a signed word and x_c1sqr2less1 holds
+    ; (constant - 1), so in both cases the operand is added back after pmulhw
+    pmulhw      m5, [x_s1sqr2]
+    paddw       m5, m1                  ; ip1 * sin(pi/8) * sqrt(2)
+
+    mova        m7, m3
+    pmulhw      m7, [x_c1sqr2less1]
+    paddw       m7, m3                  ; ip3 * cos(pi/8) * sqrt(2)
+    psubw       m7, m5                  ; m7 = sqrt(2) * (ip3 * cos(pi/8) - ip1 * sin(pi/8))
+
+    ; d1 uses the two constants the other way round
+    mova        m5, m1
+    mova        m4, m3
+    pmulhw      m5, [x_c1sqr2less1]
+    paddw       m5, m1                  ; ip1 * cos(pi/8) * sqrt(2)
+    pmulhw      m3, [x_s1sqr2]
+    paddw       m3, m4                  ; ip3 * sin(pi/8) * sqrt(2)
+    paddw       m3, m5                  ; d1 = sqrt(2) * (ip1 * cos(pi/8) + ip3 * sin(pi/8))
+
+    ; combine; row numbers follow the order the transpose below consumes them
+    mova        m6, m2                  ; a1
+    mova        m4, m0                  ; b1
+    paddw       m2, m3                  ; row 0 = a1 + d1
+    paddw       m4, m7                  ; row 2 = b1 + m7
+    psubw       m0, m7                  ; row 1 = b1 - m7
+    psubw       m6, m3                  ; row 3 = a1 - d1
+
+    ; ---- transpose (words labelled <row><column>, highest word first) ----
+    mova        m1, m2                  ; 03 02 01 00
+    mova        m3, m4                  ; 23 22 21 20
+    punpcklwd   m1, m0                  ; 11 01 10 00
+    punpckhwd   m2, m0                  ; 13 03 12 02
+    punpcklwd   m3, m6                  ; 31 21 30 20
+    punpckhwd   m4, m6                  ; 33 23 32 22
+
+    mova        m0, m1                  ; 11 01 10 00
+    mova        m5, m2                  ; 13 03 12 02
+    punpckldq   m0, m3                  ; 30 20 10 00
+    punpckhdq   m1, m3                  ; 31 21 11 01
+    punpckldq   m2, m4                  ; 32 22 12 02
+    punpckhdq   m5, m4                  ; 33 23 13 03
+    mova        m3, m5                  ; 33 23 13 03
+
+    ; ---- pass 2: same butterfly; m0..m3 now hold columns 0..3 of pass 1 ----
+    ; even part
+    psubw       m0, m2                  ; b1 = 0 - 2
+    paddw       m2, m2
+    mova        m5, m1
+    paddw       m2, m0                  ; a1 = 0 + 2
+
+    ; odd part
+    pmulhw      m5, [x_s1sqr2]
+    paddw       m5, m1                  ; ip1 * sin(pi/8) * sqrt(2)
+
+    mova        m7, m3
+    pmulhw      m7, [x_c1sqr2less1]
+    paddw       m7, m3                  ; ip3 * cos(pi/8) * sqrt(2)
+    psubw       m7, m5                  ; m7 = sqrt(2) * (ip3 * cos(pi/8) - ip1 * sin(pi/8))
+
+    ; d1 uses the two constants the other way round
+    mova        m5, m1
+    mova        m4, m3
+    pmulhw      m5, [x_c1sqr2less1]
+    paddw       m5, m1                  ; ip1 * cos(pi/8) * sqrt(2)
+    pmulhw      m3, [x_s1sqr2]
+    paddw       m3, m4                  ; ip3 * sin(pi/8) * sqrt(2)
+    paddw       m3, m5                  ; d1
+
+    ; the rounding term goes into a1 and b1, so all four sums below carry it
+    paddw       m0, [pw_16]
+    paddw       m2, [pw_16]
+
+    ; combine as in pass 1
+    mova        m6, m2                  ; a1
+    mova        m4, m0                  ; b1
+    paddw       m2, m3                  ; 0 = a1 + d1
+    paddw       m4, m7                  ; 2 = b1 + m7
+    psubw       m0, m7                  ; 1 = b1 - m7
+    psubw       m6, m3                  ; 3 = a1 - d1
+
+    ; arithmetic shift completes (x + 16) >> 5
+    psraw       m2, 5
+    psraw       m0, 5
+    psraw       m4, 5
+    psraw       m6, 5
+
+    ; ---- transpose back so that each register holds one output row ----
+    mova        m1, m2                  ; 03 02 01 00
+    mova        m3, m4                  ; 23 22 21 20
+    punpcklwd   m1, m0                  ; 11 01 10 00
+    punpckhwd   m2, m0                  ; 13 03 12 02
+    punpcklwd   m3, m6                  ; 31 21 30 20
+    punpckhwd   m4, m6                  ; 33 23 32 22
+
+    mova        m0, m1                  ; 11 01 10 00
+    mova        m5, m2                  ; 13 03 12 02
+    punpckldq   m0, m3                  ; 30 20 10 00
+    punpckhdq   m1, m3                  ; 31 21 11 01
+    punpckldq   m2, m4                  ; 32 22 12 02
+    punpckhdq   m5, m4                  ; 33 23 13 03
+
+    ; ---- store rows 0..3 ----
+    mova        [outq], m0
+    mova        [outq+pitq], m1
+    mova        [outq+pitq*2], m2
+
+    add         outq, pitq              ; so that [outq+pitq*2] addresses row 3
+    mova        [outq+pitq*2], m5
+    RET
+
+;void short_idct4x4llm_1_mmx(short *input, short *output, int pitch)
+; DC only: writes (input[0] + 16) >> 5 to all 16 outputs, rows pitch bytes apart.
+cglobal short_idct4x4llm_1_mmx,3,3,0,inp,out,pit
+%if ARCH_X86_64
+    movsxd      pitq, pitd              ; int arg: upper 32 bits of the register are unspecified
+%endif
+    movh        m0, [inpq]              ; only word 0 survives the broadcast below
+    paddw       m0, [pw_16]
+    psraw       m0, 5
+    punpcklwd   m0, m0
+    punpckldq   m0, m0                  ; word 0 copied to all four lanes
+    mova        [outq], m0              ; row 0
+    mova        [outq+pitq], m0         ; row 1
+    mova        [outq+pitq*2], m0       ; row 2
+    add         outq, pitq              ; so that [outq+pitq*2] addresses row 3
+    mova        [outq+pitq*2], m0       ; row 3
+    RET
+
+;void dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride)
+; Adds (input_dc + 16) >> 5 to a 4x4 block of bytes read from pred (rows pitch bytes
+; apart) and writes the sums, saturated to 0..255, to dst (rows stride bytes apart).
+cglobal dc_only_idct_add_mmx, 4,5,0,in_dc,pred,dst,pit,stride
+%if ARCH_X86_64
+    movsxd      pitq, pitd              ; int arg: upper 32 bits of the register are unspecified
+    movsxd      strideq, dword stridem  ; fifth argument is not loaded by cglobal (4 of 5 requested)
+%else
+    mov         strideq, stridem
+%endif
+    pxor        m0, m0                  ; zero, used to widen bytes to words and as pack filler
+    movh        m5, in_dcq              ; dc
+    paddw       m5, [pw_16]
+    psraw       m5, 5
+    punpcklwd   m5, m5
+    punpckldq   m5, m5                  ; rounded dc in all four words
+
+    movh        m1, [predq]             ; row 0
+    punpcklbw   m1, m0
+    paddsw      m1, m5
+    packuswb    m1, m0                  ; saturate to 0..255 while packing to bytes
+    movh        [dstq], m1
+
+    movh        m2, [predq+pitq]        ; row 1
+    punpcklbw   m2, m0
+    paddsw      m2, m5
+    packuswb    m2, m0                  ; saturate to 0..255 while packing to bytes
+    movh        [dstq+strideq], m2
+
+    movh        m3, [predq+2*pitq]      ; row 2
+    punpcklbw   m3, m0
+    paddsw      m3, m5
+    packuswb    m3, m0                  ; saturate to 0..255 while packing to bytes
+    movh        [dstq+2*strideq], m3
+
+    add         dstq, strideq           ; advance both pointers one row so that
+    add         predq, pitq             ; base + 2 * step addresses row 3
+    movh        m4, [predq+2*pitq]      ; row 3
+    punpcklbw   m4, m0
+    paddsw      m4, m5
+    packuswb    m4, m0                  ; saturate to 0..255 while packing to bytes
+    movh        [dstq+2*strideq], m4
+    RET
+