diff options
author | John Koleszar <jkoleszar@google.com> | 2012-11-27 13:59:17 -0800 |
---|---|---|
committer | John Koleszar <jkoleszar@google.com> | 2012-11-27 14:12:30 -0800 |
commit | fcccbcbb395ce4cf31b54ce1245cc28e5e3ef4c1 (patch) | |
tree | 68e128e48e3f5ab1de1c163fa3a12ea47f5d8d51 /vp9/common/x86/vp9_idctllm_mmx.asm | |
parent | 3bf7b131c8ebb6b4d63a8b70d69066dcbc4ed896 (diff) | |
download | libvpx-fcccbcbb395ce4cf31b54ce1245cc28e5e3ef4c1.tar libvpx-fcccbcbb395ce4cf31b54ce1245cc28e5e3ef4c1.tar.gz libvpx-fcccbcbb395ce4cf31b54ce1245cc28e5e3ef4c1.tar.bz2 libvpx-fcccbcbb395ce4cf31b54ce1245cc28e5e3ef4c1.zip |
Add vp9_ prefix to all vp9 files
Support for gyp which doesn't support multiple objects in the same
static library having the same basename.
Change-Id: Ib947eefbaf68f8b177a796d23f875ccdfa6bc9dc
Diffstat (limited to 'vp9/common/x86/vp9_idctllm_mmx.asm')
-rw-r--r-- | vp9/common/x86/vp9_idctllm_mmx.asm | 241 |
1 files changed, 241 insertions, 0 deletions
diff --git a/vp9/common/x86/vp9_idctllm_mmx.asm b/vp9/common/x86/vp9_idctllm_mmx.asm
new file mode 100644
index 000000000..15e81addb
--- /dev/null
+++ b/vp9/common/x86/vp9_idctllm_mmx.asm
@@ -0,0 +1,273 @@
+;
+;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+align 16
+x_s1sqr2:      times 4 dw 0x8A8C    ; 35468: sqrt(2) * sin(pi/8), 16 bit fixed point
+align 16
+x_c1sqr2less1: times 4 dw 0x4E7B    ; 20091: sqrt(2) * cos(pi/8) - 1, 16 bit fixed point
+align 16
+pw_16:         times 4 dw 16        ; rounding term added before the final >> 5
+
+SECTION .text
+
+
+; /****************************************************************************
+; * Notes:
+; *
+; * This implementation makes use of 16 bit fixed point version of two multiply
+; * constants:
+; *        1.   sqrt(2) * cos (pi/8)
+; *        2.   sqrt(2) * sin (pi/8)
+; * Because the first constant is bigger than 1, to maintain the same 16 bit
+; * fixed point precision as the second one, we use a trick of
+; *        x * a = x + x*(a-1)
+; * so
+; *        x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
+; *
+; * For the second constant, the 16 bit version is 35468, which is bigger
+; * than 32768, so in a signed 16 bit multiply it becomes a negative number
+; * (35468 - 65536). Adding x back after the multiply compensates for that:
+; *        (x * (unsigned)35468 >> 16) = (x * (signed)35468 >> 16) + x
+; *
+; * Both multiplies are therefore coded as pmulhw followed by a paddw of the
+; * unscaled value.
+; *
+; * The int pitch/stride arguments are sign extended to 64 bits before being
+; * used in address arithmetic: the upper half of a register holding a 32 bit
+; * argument is not guaranteed to be defined on x86-64.
+; **************************************************************************/
+
+INIT_MMX
+
+;void short_idct4x4llm_mmx(short *input, short *output, int pitch)
+;
+; Runs the same 4-point butterfly twice over a 4x4 block of 16 bit values,
+; transposing the block after each pass. The second pass adds 16 to every
+; result and shifts it right arithmetically by 5. input is read as four
+; consecutive 8 byte rows; the four 8 byte result rows are written to output,
+; pitch bytes apart.
+cglobal short_idct4x4llm_mmx, 3,3,0, inp, out, pit
+%if ARCH_X86_64
+    movsxd      pitq, pitd              ; pitch is an int: sign extend it
+%endif
+    ; Load the four input rows, four words each.
+    mova        m0, [inpq +0]
+    mova        m1, [inpq +8]
+
+    mova        m2, [inpq+16]
+    mova        m3, [inpq+24]
+
+    ; First pass.
+    psubw       m0, m2                  ; b1 = ip0 - ip2
+    paddw       m2, m2                  ; 2 * ip2
+
+    mova        m5, m1
+    paddw       m2, m0                  ; a1 = ip0 + ip2
+
+    pmulhw      m5, [x_s1sqr2]
+    paddw       m5, m1                  ; ip1 * sin(pi/8) * sqrt(2)
+
+    mova        m7, m3
+    pmulhw      m7, [x_c1sqr2less1]
+
+    paddw       m7, m3                  ; ip3 * cos(pi/8) * sqrt(2)
+    psubw       m7, m5                  ; c1 = ip3 * cos - ip1 * sin
+
+    mova        m5, m1
+    mova        m4, m3
+
+    pmulhw      m5, [x_c1sqr2less1]
+    paddw       m5, m1                  ; ip1 * cos(pi/8) * sqrt(2)
+
+    pmulhw      m3, [x_s1sqr2]
+    paddw       m3, m4                  ; ip3 * sin(pi/8) * sqrt(2)
+
+    paddw       m3, m5                  ; d1
+    mova        m6, m2                  ; a1
+
+    mova        m4, m0                  ; b1
+    paddw       m2, m3                  ; a1 + d1, row 0 of the transpose below
+
+    paddw       m4, m7                  ; b1 + c1, row 2 of the transpose below
+    psubw       m0, m7                  ; b1 - c1, row 1 of the transpose below
+
+    psubw       m6, m3                  ; a1 - d1, row 3 of the transpose below
+
+    ; Transpose the 4x4 block of words (lanes listed high to low).
+    mova        m1, m2                  ; 03 02 01 00
+    mova        m3, m4                  ; 23 22 21 20
+
+    punpcklwd   m1, m0                  ; 11 01 10 00
+    punpckhwd   m2, m0                  ; 13 03 12 02
+
+    punpcklwd   m3, m6                  ; 31 21 30 20
+    punpckhwd   m4, m6                  ; 33 23 32 22
+
+    mova        m0, m1                  ; 11 01 10 00
+    mova        m5, m2                  ; 13 03 12 02
+
+    punpckldq   m0, m3                  ; 30 20 10 00
+    punpckhdq   m1, m3                  ; 31 21 11 01
+
+    punpckldq   m2, m4                  ; 32 22 12 02
+    punpckhdq   m5, m4                  ; 33 23 13 03
+
+    mova        m3, m5                  ; 33 23 13 03
+
+    ; Second pass: the same butterfly on the transposed data, plus rounding.
+    psubw       m0, m2                  ; b1 = 0 - 2
+    paddw       m2, m2
+
+    mova        m5, m1
+    paddw       m2, m0                  ; a1 = 0 + 2
+
+    pmulhw      m5, [x_s1sqr2]
+    paddw       m5, m1                  ; ip1 * sin(pi/8) * sqrt(2)
+
+    mova        m7, m3
+    pmulhw      m7, [x_c1sqr2less1]
+
+    paddw       m7, m3                  ; ip3 * cos(pi/8) * sqrt(2)
+    psubw       m7, m5                  ; c1 = ip3 * cos - ip1 * sin
+
+    mova        m5, m1
+    mova        m4, m3
+
+    pmulhw      m5, [x_c1sqr2less1]
+    paddw       m5, m1                  ; ip1 * cos(pi/8) * sqrt(2)
+
+    pmulhw      m3, [x_s1sqr2]
+    paddw       m3, m4                  ; ip3 * sin(pi/8) * sqrt(2)
+
+    paddw       m3, m5                  ; d1
+    paddw       m0, [pw_16]             ; b1 + 16
+
+    paddw       m2, [pw_16]             ; a1 + 16
+    mova        m6, m2                  ; a1
+
+    mova        m4, m0                  ; b1
+    paddw       m2, m3                  ; a1 + d1, row 0 of the transpose below
+
+    paddw       m4, m7                  ; b1 + c1, row 2 of the transpose below
+    psubw       m0, m7                  ; b1 - c1, row 1 of the transpose below
+
+    psubw       m6, m3                  ; a1 - d1, row 3 of the transpose below
+    psraw       m2, 5
+
+    psraw       m0, 5
+    psraw       m4, 5
+
+    psraw       m6, 5
+
+    ; Transpose again (lanes listed high to low).
+    mova        m1, m2                  ; 03 02 01 00
+    mova        m3, m4                  ; 23 22 21 20
+
+    punpcklwd   m1, m0                  ; 11 01 10 00
+    punpckhwd   m2, m0                  ; 13 03 12 02
+
+    punpcklwd   m3, m6                  ; 31 21 30 20
+    punpckhwd   m4, m6                  ; 33 23 32 22
+
+    mova        m0, m1                  ; 11 01 10 00
+    mova        m5, m2                  ; 13 03 12 02
+
+    punpckldq   m0, m3                  ; 30 20 10 00
+    punpckhdq   m1, m3                  ; 31 21 11 01
+
+    punpckldq   m2, m4                  ; 32 22 12 02
+    punpckhdq   m5, m4                  ; 33 23 13 03
+
+    ; Store the four result rows at output + {0, 1, 2, 3} * pitch.
+    mova        [outq], m0
+
+    mova        [outq+pitq], m1
+    mova        [outq+pitq*2], m2
+
+    add         outq, pitq              ; so that outq + 2 * pitq is row 3
+    mova        [outq+pitq*2], m5
+    RET
+
+;void short_idct4x4llm_1_mmx(short *input, short *output, int pitch)
+;
+; DC-only variant: computes (input[0] + 16) >> 5 and stores that value in all
+; 16 positions of the 4x4 output block (four 8 byte rows, pitch bytes apart).
+cglobal short_idct4x4llm_1_mmx,3,3,0,inp,out,pit
+%if ARCH_X86_64
+    movsxd      pitq, pitd              ; pitch is an int: sign extend it
+%endif
+    movh        m0, [inpq]              ; only word 0 survives the broadcast
+    paddw       m0, [pw_16]
+    psraw       m0, 5
+    punpcklwd   m0, m0
+    punpckldq   m0, m0                  ; word 0 replicated into all four lanes
+
+    mova        [outq], m0
+    mova        [outq+pitq], m0
+
+    mova        [outq+pitq*2], m0
+    add         outq, pitq              ; so that outq + 2 * pitq is row 3
+
+    mova        [outq+pitq*2], m0
+    RET
+
+
+;void dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride)
+;
+; Adds (input_dc + 16) >> 5 to a 4x4 block of prediction bytes and stores the
+; sums, saturated to unsigned bytes, in dst. Prediction rows are pitch bytes
+; apart and destination rows are stride bytes apart.
+cglobal dc_only_idct_add_mmx, 4,5,0,in_dc,pred,dst,pit,stride
+%if ARCH_X86_64
+    movsxd      pitq, pitd              ; pitch is an int: sign extend it
+    movsxd      strideq, dword stridem
+%else
+    mov         strideq, stridem
+%endif
+    pxor        m0, m0                  ; zero, used to widen and narrow bytes
+
+    movh        m5, in_dcq              ; dc
+    paddw       m5, [pw_16]
+
+    psraw       m5, 5
+
+    punpcklwd   m5, m5
+    punpckldq   m5, m5                  ; word 0 replicated into all four lanes
+
+    movh        m1, [predq]
+    punpcklbw   m1, m0
+    paddsw      m1, m5
+    packuswb    m1, m0                  ; pack and unpack to saturate
+    movh        [dstq], m1
+
+    movh        m2, [predq+pitq]
+    punpcklbw   m2, m0
+    paddsw      m2, m5
+    packuswb    m2, m0                  ; pack and unpack to saturate
+    movh        [dstq+strideq], m2
+
+    movh        m3, [predq+2*pitq]
+    punpcklbw   m3, m0
+    paddsw      m3, m5
+    packuswb    m3, m0                  ; pack and unpack to saturate
+    movh        [dstq+2*strideq], m3
+
+    add         dstq, strideq           ; advance one row so 2 * step is row 3
+    add         predq, pitq
+    movh        m4, [predq+2*pitq]
+    punpcklbw   m4, m0
+    paddsw      m4, m5
+    packuswb    m4, m0                  ; pack and unpack to saturate
+    movh        [dstq+2*strideq], m4
+    RET
+