Initial WebM release

author: John Koleszar <jkoleszar@google.com> 2010-05-18 11:58:33 -0400
committer: John Koleszar <jkoleszar@google.com> 2010-05-18 11:58:33 -0400
commit: 0ea50ce9cb4b65eee6afa1d041fe8beb5abda667 (patch)
tree: 1f3b9019f28bc56fd3156f96e5a9653a983ee61b /vp8/encoder/ppc/fdct_altivec.asm
download: libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.tar
libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.tar.gz
libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.tar.bz2
libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.zip
1 files changed, 204 insertions, 0 deletions
diff --git a/vp8/encoder/ppc/fdct_altivec.asm b/vp8/encoder/ppc/fdct_altivec.asm
new file mode 100644
index 000000000..eaab14c79
--- /dev/null
+++ b/vp8/encoder/ppc/fdct_altivec.asm
@@ -0,0 +1,204 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    .globl vp8_short_fdct4x4_ppc
+    .globl vp8_short_fdct8x4_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+    lis     \R0, \LABEL@ha
+    la      \R1, \LABEL@l(\R0)
+    lvx     \V, \OFF, \R1
+.endm
+
+;# Forward and inverse DCTs are nearly identical; only differences are
+;#   in normalization (fwd is twice unitary, inv is half unitary)
+;#   and that they are of course transposes of each other.
+;#
+;#   The following three accomplish most of implementation and
+;#   are used only by ppc_idct.c and ppc_fdct.c.
+.macro prologue
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xfffc
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1,-32(r1)          ;# create space on the stack
+
+    li      r6, 16
+
+    load_c v0, dct_tab, 0, r9, r10
+    lvx     v1,   r6, r10
+    addi    r10, r10, 32
+    lvx     v2,    0, r10
+    lvx     v3,   r6, r10
+
+    load_c v4, ppc_dctperm_tab,  0, r9, r10
+    load_c v5, ppc_dctperm_tab, r6, r9, r10
+
+    load_c v6, round_tab, 0, r10, r9
+.endm
+
+.macro epilogue
+    addi    r1, r1, 32          ;# recover stack
+
+    mtspr   256, r11            ;# reset old VRSAVE
+.endm
+
+;# Do horiz xf on two rows of coeffs  v8 = a0 a1 a2 a3  b0 b1 b2 b3.
+;#   a/A are the even rows 0,2   b/B are the odd rows 1,3
+;#   For fwd transform, indices are horizontal positions, then frequencies.
+;#   For inverse transform, frequencies then positions.
+;#   The two resulting  A0..A3  B0..B3  are later combined
+;#   and vertically transformed.
+
+.macro two_rows_horiz Dst
+    vperm   v9, v8, v8, v4      ;# v9 = a2 a3 a0 a1  b2 b3 b0 b1
+
+    vmsumshm v10, v0, v8, v6
+    vmsumshm v10, v1, v9, v10
+    vsraw   v10, v10, v7        ;# v10 = A0 A1  B0 B1
+
+    vmsumshm v11, v2, v8, v6
+    vmsumshm v11, v3, v9, v11
+    vsraw   v11, v11, v7        ;# v11 = A2 A3  B2 B3
+
+    vpkuwum v10, v10, v11       ;# v10  = A0 A1  B0 B1  A2 A3  B2 B3
+    vperm   \Dst, v10, v10, v5  ;# Dest = A0 B0  A1 B1  A2 B2  A3 B3
+.endm
+
+;# Vertical xf on two rows. DCT values in comments are for inverse transform;
+;#   forward transform uses transpose.
+
+.macro two_rows_vert Ceven, Codd
+    vspltw  v8, \Ceven, 0       ;# v8 = c00 c10  or  c02 c12 four times
+    vspltw  v9, \Codd,  0       ;# v9 = c20 c30  or  c22 c32 ""
+    vmsumshm v8, v8, v12, v6
+    vmsumshm v8, v9, v13, v8
+    vsraw   v10, v8, v7
+
+    vspltw  v8, \Codd,  1       ;# v8 = c01 c11  or  c03 c13
+    vspltw  v9, \Ceven, 1       ;# v9 = c21 c31  or  c23 c33
+    vmsumshm v8, v8, v12, v6
+    vmsumshm v8, v9, v13, v8
+    vsraw   v8, v8, v7
+
+    vpkuwum v8, v10, v8         ;# v8 = rows 0,1  or 2,3
+.endm
+
+.macro two_rows_h Dest
+    stw     r0,  0(r8)
+    lwz     r0,  4(r3)
+    stw     r0,  4(r8)
+    lwzux   r0, r3,r5
+    stw     r0,  8(r8)
+    lwz     r0,  4(r3)
+    stw     r0, 12(r8)
+    lvx     v8,  0,r8
+    two_rows_horiz \Dest
+.endm
+
+    .align 2
+;# r3 short *input
+;# r4 short *output
+;# r5 int pitch
+vp8_short_fdct4x4_ppc:
+
+    prologue
+
+    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
+    addi    r8, r1, 0
+
+
+    lwz     r0, 0(r3)
+    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13
+
+    lwzux   r0, r3, r5
+    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33
+
+    lvx     v6, r6, r9          ;# v6 = Vround
+    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter
+
+    two_rows_vert v0, v1
+    stvx    v8, 0, r4
+    two_rows_vert v2, v3
+    stvx    v8, r6, r4
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 short *input
+;# r4 short *output
+;# r5 int pitch
+vp8_short_fdct8x4_ppc:
+    prologue
+
+    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
+    addi    r8,  r1, 0
+    addi    r10, r3, 0
+
+    lwz     r0, 0(r3)
+    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13
+
+    lwzux   r0, r3, r5
+    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33
+
+    lvx     v6, r6, r9          ;# v6 = Vround
+    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter
+
+    two_rows_vert v0, v1
+    stvx    v8, 0, r4
+    two_rows_vert v2, v3
+    stvx    v8, r6, r4
+
+    ;# Next block
+    addi    r3, r10, 8
+    addi    r4, r4, 32
+    lvx     v6, 0, r9           ;# v6 = Hround
+
+    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
+    addi    r8, r1, 0
+
+    lwz     r0, 0(r3)
+    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13
+
+    lwzux   r0, r3, r5
+    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33
+
+    lvx     v6, r6, r9          ;# v6 = Vround
+    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter
+
+    two_rows_vert v0, v1
+    stvx    v8, 0, r4
+    two_rows_vert v2, v3
+    stvx    v8, r6, r4
+
+    epilogue
+
+    blr
+
+    .data
+    .align 4
+ppc_dctperm_tab:
+    .byte 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
+    .byte 0,1,4,5, 2,3,6,7, 8,9,12,13, 10,11,14,15
+
+    .align 4
+dct_tab:
+    .short  23170, 23170,-12540,-30274, 23170, 23170,-12540,-30274
+    .short  23170, 23170, 30274, 12540, 23170, 23170, 30274, 12540
+
+    .short  23170,-23170, 30274,-12540, 23170,-23170, 30274,-12540
+    .short -23170, 23170, 12540,-30274,-23170, 23170, 12540,-30274
+
+    .align 4
+round_tab:
+    .long (1 << (14-1)), (1 << (14-1)), (1 << (14-1)), (1 << (14-1))
+    .long (1 << (16-1)), (1 << (16-1)), (1 << (16-1)), (1 << (16-1))
author	John Koleszar <jkoleszar@google.com>	2010-05-18 11:58:33 -0400
committer	John Koleszar <jkoleszar@google.com>	2010-05-18 11:58:33 -0400
commit	0ea50ce9cb4b65eee6afa1d041fe8beb5abda667 (patch)
tree	1f3b9019f28bc56fd3156f96e5a9653a983ee61b /vp8/encoder/ppc/fdct_altivec.asm
download	libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.tar libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.tar.gz libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.tar.bz2 libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.zip