path: root/vp9/common/ppc/vp9_idct_altivec.asm
author    Yaowu Xu <yaowu@google.com>  2013-03-12 11:24:04 -0700
committer Yaowu Xu <yaowu@google.com>  2013-03-13 08:35:46 -0700
commit    005552639b276709868a4a0f86f27a737c7c6917 (patch)
tree      0430483c13b8d3cffaa6efeef734313f0faa17c9 /vp9/common/ppc/vp9_idct_altivec.asm
parent    bd9cd9a1859aa464b3092f2023b3a4040166572d (diff)
removed reference to "LLM" and "x8"
The commit changed the names of files and functions to remove obsolete references to LLM and x8.

Change-Id: I973b20fc1a55149ed68b5408b3874768e6f88516
Diffstat (limited to 'vp9/common/ppc/vp9_idct_altivec.asm')
-rw-r--r--  vp9/common/ppc/vp9_idct_altivec.asm  189
1 file changed, 189 insertions, 0 deletions
diff --git a/vp9/common/ppc/vp9_idct_altivec.asm b/vp9/common/ppc/vp9_idct_altivec.asm
new file mode 100644
index 000000000..b87aa4200
--- /dev/null
+++ b/vp9/common/ppc/vp9_idct_altivec.asm
@@ -0,0 +1,189 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl short_idct4x4_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+ lis \R0, \LABEL@ha
+ la \R1, \LABEL@l(\R0)
+ lvx \V, \OFF, \R1
+.endm
+
+;# r3 short *input
+;# r4 short *output
+;# r5 int pitch
+ .align 2
+short_idct4x4_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ load_c v8, sinpi8sqrt2, 0, r9, r10
+ load_c v9, cospi8sqrt2minus1, 0, r9, r10
+ load_c v10, hi_hi, 0, r9, r10
+ load_c v11, lo_lo, 0, r9, r10
+ load_c v12, shift_16, 0, r9, r10
+
+ li r10, 16
+ lvx v0, 0, r3 ;# input ip[0], ip[ 4]
+ lvx v1, r10, r3 ;# input ip[8], ip[12]
+
+ ;# first pass
+ vupkhsh v2, v0
+ vupkhsh v3, v1
+ vaddsws v6, v2, v3 ;# a1 = ip[0]+ip[8]
+ vsubsws v7, v2, v3 ;# b1 = ip[0]-ip[8]
+
+ vupklsh v0, v0
+ vmulosh v4, v0, v8
+ vsraw v4, v4, v12
+ vaddsws v4, v4, v0 ;# ip[ 4] * sin(pi/8) * sqrt(2)
+
+ vupklsh v1, v1
+ vmulosh v5, v1, v9
+ vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2)
+ vaddsws v5, v5, v1
+
+ vsubsws v4, v4, v5 ;# c1
+
+ vmulosh v3, v1, v8
+ vsraw v3, v3, v12
+ vaddsws v3, v3, v1 ;# ip[12] * sin(pi/8) * sqrt(2)
+
+ vmulosh v5, v0, v9
+ vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2)
+ vaddsws v5, v5, v0
+
+ vaddsws v3, v3, v5 ;# d1
+
+ vaddsws v0, v6, v3 ;# a1 + d1
+ vsubsws v3, v6, v3 ;# a1 - d1
+
+ vaddsws v1, v7, v4 ;# b1 + c1
+ vsubsws v2, v7, v4 ;# b1 - c1
+
+ ;# transpose input
+ vmrghw v4, v0, v1 ;# a0 b0 a1 b1
+ vmrghw v5, v2, v3 ;# c0 d0 c1 d1
+
+ vmrglw v6, v0, v1 ;# a2 b2 a3 b3
+ vmrglw v7, v2, v3 ;# c2 d2 c3 d3
+
+ vperm v0, v4, v5, v10 ;# a0 b0 c0 d0
+ vperm v1, v4, v5, v11 ;# a1 b1 c1 d1
+
+ vperm v2, v6, v7, v10 ;# a2 b2 c2 d2
+ vperm v3, v6, v7, v11 ;# a3 b3 c3 d3
+
+ ;# second pass
+ vaddsws v6, v0, v2 ;# a1 = ip[0]+ip[8]
+ vsubsws v7, v0, v2 ;# b1 = ip[0]-ip[8]
+
+ vmulosh v4, v1, v8
+ vsraw v4, v4, v12
+ vaddsws v4, v4, v1 ;# ip[ 4] * sin(pi/8) * sqrt(2)
+
+ vmulosh v5, v3, v9
+ vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2)
+ vaddsws v5, v5, v3
+
+ vsubsws v4, v4, v5 ;# c1
+
+ vmulosh v2, v3, v8
+ vsraw v2, v2, v12
+ vaddsws v2, v2, v3 ;# ip[12] * sin(pi/8) * sqrt(2)
+
+ vmulosh v5, v1, v9
+ vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2)
+ vaddsws v5, v5, v1
+
+ vaddsws v3, v2, v5 ;# d1
+
+ vaddsws v0, v6, v3 ;# a1 + d1
+ vsubsws v3, v6, v3 ;# a1 - d1
+
+ vaddsws v1, v7, v4 ;# b1 + c1
+ vsubsws v2, v7, v4 ;# b1 - c1
+
+ vspltish v6, 4
+ vspltish v7, 3
+
+ vpkswss v0, v0, v1
+ vpkswss v1, v2, v3
+
+ vaddshs v0, v0, v6
+ vaddshs v1, v1, v6
+
+ vsrah v0, v0, v7
+ vsrah v1, v1, v7
+
+ ;# transpose output
+ vmrghh v2, v0, v1 ;# a0 c0 a1 c1 a2 c2 a3 c3
+ vmrglh v3, v0, v1 ;# b0 d0 b1 d1 b2 d2 b3 d3
+
+ vmrghh v0, v2, v3 ;# a0 b0 c0 d0 a1 b1 c1 d1
+ vmrglh v1, v2, v3 ;# a2 b2 c2 d2 a3 b3 c3 d3
+
+ stwu r1,-416(r1) ;# create space on the stack
+
+ stvx v0, 0, r1
+ lwz r6, 0(r1)
+ stw r6, 0(r4)
+ lwz r6, 4(r1)
+ stw r6, 4(r4)
+
+ add r4, r4, r5
+
+ lwz r6, 8(r1)
+ stw r6, 0(r4)
+ lwz r6, 12(r1)
+ stw r6, 4(r4)
+
+ add r4, r4, r5
+
+ stvx v1, 0, r1
+ lwz r6, 0(r1)
+ stw r6, 0(r4)
+ lwz r6, 4(r1)
+ stw r6, 4(r4)
+
+ add r4, r4, r5
+
+ lwz r6, 8(r1)
+ stw r6, 0(r4)
+ lwz r6, 12(r1)
+ stw r6, 4(r4)
+
+ addi r1, r1, 416 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 4
+sinpi8sqrt2:
+ .short 35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468
+
+ .align 4
+cospi8sqrt2minus1:
+ .short 20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091
+
+ .align 4
+shift_16:
+ .long 16, 16, 16, 16
+
+ .align 4
+hi_hi:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
+
+ .align 4
+lo_lo:
+ .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
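
For reference, the routine added above is a two-pass 4x4 inverse DCT in Q16 fixed point: 35468/65536 ≈ sin(pi/8)·sqrt(2) and 20091/65536 ≈ cos(pi/8)·sqrt(2) − 1. In the vector code each 16-bit multiply is followed by an add of the input: 20091 is stored minus one so it fits a signed 16-bit lane, and 35468 wraps negative when treated as a signed 16-bit value, so the add restores the intended product in both cases. The following is a minimal scalar C sketch of the same transform, assuming raster-order 16-bit input and a byte pitch for the output; the function name, the scratch array, and the non-saturating plain-int arithmetic are illustrative assumptions, not libvpx's actual C reference.

#include <stdint.h>

/* Q16 fixed-point constants from the diff above:
 * 35468/65536 ~ sin(pi/8)*sqrt(2), 20091/65536 ~ cos(pi/8)*sqrt(2) - 1. */
static const int sinpi8sqrt2 = 35468;
static const int cospi8sqrt2minus1 = 20091;

/* Hypothetical scalar equivalent of short_idct4x4_ppc: input is a 4x4 block
 * of shorts in raster order, output rows are 'pitch' bytes apart. */
static void short_idct4x4_scalar(const int16_t *ip, int16_t *op, int pitch) {
  int32_t tmp[16];
  int shortpitch = pitch >> 1;  /* pitch is in bytes, elements are 16-bit */
  int i;

  /* Column pass: 1-D IDCT down each of the four columns. */
  for (i = 0; i < 4; i++) {
    int a1 = ip[i] + ip[i + 8];
    int b1 = ip[i] - ip[i + 8];
    /* cospi8sqrt2minus1 stores cos(pi/8)*sqrt(2) - 1, so the input is added
     * back after the Q16 multiply to get the full cos(pi/8)*sqrt(2) term. */
    int c1 = ((ip[i + 4] * sinpi8sqrt2) >> 16)
           - (ip[i + 12] + ((ip[i + 12] * cospi8sqrt2minus1) >> 16));
    int d1 = (ip[i + 4] + ((ip[i + 4] * cospi8sqrt2minus1) >> 16))
           + ((ip[i + 12] * sinpi8sqrt2) >> 16);

    tmp[i + 0]  = a1 + d1;
    tmp[i + 4]  = b1 + c1;
    tmp[i + 8]  = b1 - c1;
    tmp[i + 12] = a1 - d1;
  }

  /* Row pass, then round by 4 and shift right by 3, matching the
   * vaddshs/vsrah step in the vector code before the final store. */
  for (i = 0; i < 4; i++) {
    const int32_t *r = &tmp[i * 4];
    int a1 = r[0] + r[2];
    int b1 = r[0] - r[2];
    int c1 = ((r[1] * sinpi8sqrt2) >> 16)
           - (r[3] + ((r[3] * cospi8sqrt2minus1) >> 16));
    int d1 = (r[1] + ((r[1] * cospi8sqrt2minus1) >> 16))
           + ((r[3] * sinpi8sqrt2) >> 16);

    op[0] = (int16_t)((a1 + d1 + 4) >> 3);
    op[1] = (int16_t)((b1 + c1 + 4) >> 3);
    op[2] = (int16_t)((b1 - c1 + 4) >> 3);
    op[3] = (int16_t)((a1 - d1 + 4) >> 3);
    op += shortpitch;
  }
}

The AltiVec version performs the same arithmetic on whole rows at a time, using saturating adds and packs instead of plain integer arithmetic, and uses vperm/vmrg to transpose between the two passes rather than a scratch array.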