diff options
author | John Koleszar <jkoleszar@google.com> | 2010-05-18 11:58:33 -0400 |
---|---|---|
committer | John Koleszar <jkoleszar@google.com> | 2010-05-18 11:58:33 -0400 |
commit | 0ea50ce9cb4b65eee6afa1d041fe8beb5abda667 (patch) | |
tree | 1f3b9019f28bc56fd3156f96e5a9653a983ee61b /vp8/encoder/ppc/fdct_altivec.asm | |
download | libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.tar libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.tar.gz libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.tar.bz2 libvpx-0ea50ce9cb4b65eee6afa1d041fe8beb5abda667.zip |
Initial WebM release
Diffstat (limited to 'vp8/encoder/ppc/fdct_altivec.asm')
-rw-r--r-- | vp8/encoder/ppc/fdct_altivec.asm | 204 |
1 files changed, 204 insertions, 0 deletions
diff --git a/vp8/encoder/ppc/fdct_altivec.asm b/vp8/encoder/ppc/fdct_altivec.asm new file mode 100644 index 000000000..eaab14c79 --- /dev/null +++ b/vp8/encoder/ppc/fdct_altivec.asm @@ -0,0 +1,204 @@
;
;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;

;# ---------------------------------------------------------------------------
;# Forward 4x4 / 8x4 DCT for VP8, PowerPC AltiVec.
;#
;# Register roles shared by every routine in this file (set up by `prologue`):
;#   r3   input pointer (advanced row by row while loading)
;#   r4   output pointer
;#   r5   input pitch; added directly to the byte address in r3
;#   r6   constant 16 (index of the second quadword of a table / of the output)
;#   r8   address of the 16-byte gather buffer in the local stack frame
;#   r9   address of round_tab
;#   r10  scratch in prologue; saved input pointer in the 8x4 routine
;#   r11  caller's VRSAVE, must stay untouched until `epilogue`
;#   r12  VRSAVE value used while this code runs
;#   v0-v3   the four quadwords of dct_tab
;#   v4, v5  the two permute controls of ppc_dctperm_tab
;#   v6      rounding constant of the current pass
;#   v7      shift count of the current pass
;#   v8-v11  temporaries
;#   v12,v13 results of the horizontal pass (rows 0/1 and rows 2/3)
;# ---------------------------------------------------------------------------

    .globl vp8_short_fdct4x4_ppc
    .globl vp8_short_fdct8x4_ppc

;# Local stack frame created by `prologue`.
;#   0(r1)                back chain word written by stwu - never overwritten
;#   SCRATCH_OFF(r1)..+15 gather buffer used by two_rows_h
;# SCRATCH_OFF is a multiple of 16 so the buffer is 16-byte aligned whenever
;# r1 is (lvx ignores the low four bits of its effective address).
    .set FRAME_SIZE,  32
    .set SCRATCH_OFF, 16

;# load_c V, LABEL, OFF, R0, R1
;#   Load the quadword at LABEL + OFF into vector register V.
;#   OFF is a GPR holding a byte offset, or the literal 0.
;#   On exit R0 holds the high-adjusted half of LABEL's address and R1 holds
;#   the full address of LABEL; both are clobbered.
.macro load_c V, LABEL, OFF, R0, R1
    lis     \R0, \LABEL@ha
    la      \R1, \LABEL@l(\R0)
    lvx     \V, \OFF, \R1
.endm

;# Forward and inverse DCTs are nearly identical; only differences are
;# in normalization (fwd is twice unitary, inv is half unitary)
;# and that they are of course transposes of each other.
;#
;# The following three accomplish most of implementation and
;# are used only by ppc_idct.c and ppc_fdct.c.

;# prologue
;#   Saves VRSAVE in r11, marks v0-v13 as in use, allocates the local frame
;#   and loads every constant the transform needs (see the register roles
;#   listed at the top of the file).
;#   Clobbers r6, r9, r10, r11, r12, v0-v6.
.macro prologue
    mfspr   r11, 256                ;# get old VRSAVE
    oris    r12, r11, 0xfffc        ;# top 14 bits = v0..v13
    mtspr   256, r12                ;# set VRSAVE

    stwu    r1, -FRAME_SIZE(r1)     ;# create space on the stack

    li      r6, 16

    load_c  v0, dct_tab, 0, r9, r10 ;# r10 = &dct_tab
    lvx     v1, r6, r10
    addi    r10, r10, 32
    lvx     v2, 0, r10
    lvx     v3, r6, r10

    load_c  v4, ppc_dctperm_tab, 0, r9, r10
    lvx     v5, r6, r10             ;# second permute control; r10 already
                                    ;# holds &ppc_dctperm_tab

    load_c  v6, round_tab, 0, r10, r9   ;# v6 = Hround, r9 = &round_tab
.endm

;# epilogue
;#   Releases the local frame and restores the VRSAVE value that `prologue`
;#   left in r11.
.macro epilogue
    addi    r1, r1, FRAME_SIZE      ;# recover stack

    mtspr   256, r11                ;# reset old VRSAVE
.endm

;# Do horiz xf on two rows of coeffs  v8 = a0 a1 a2 a3  b0 b1 b2 b3.
;#  a/A are the even rows 0,2   b/B are the odd rows 1,3
;#  For fwd transform, indices are horizontal positions, then frequencies.
;#  For inverse transform, frequencies then positions.
;#  The two resulting  A0..A3  B0..B3  are later combined
;#  and vertically transformed.
;#
;# two_rows_horiz Dst
;#   In:  v8 = two input rows, v0-v3 = dct_tab, v4/v5 = permute controls,
;#        v6 = rounding constant, v7 = shift count.
;#   Out: Dst = A0 B0 A1 B1 A2 B2 A3 B3 (16-bit each).
;#   Clobbers v9, v10, v11.
.macro two_rows_horiz Dst
    vperm   v9, v8, v8, v4          ;# v9 = a2 a3 a0 a1  b2 b3 b0 b1

    vmsumshm v10, v0, v8, v6        ;# products of v0 and v8, summed in pairs,
                                    ;# plus rounding
    vmsumshm v10, v1, v9, v10       ;# accumulate the swapped-pair products
    vsraw   v10, v10, v7            ;# v10 = A0 A1 B0 B1

    vmsumshm v11, v2, v8, v6
    vmsumshm v11, v3, v9, v11
    vsraw   v11, v11, v7            ;# v11 = A2 A3 B2 B3

    vpkuwum v10, v10, v11           ;# v10 = A0 A1 B0 B1 A2 A3 B2 B3
    vperm   \Dst, v10, v10, v5      ;# Dest = A0 B0 A1 B1 A2 B2 A3 B3
.endm

;# Vertical xf on two rows. DCT values in comments are for inverse transform;
;#  forward transform uses transpose.
;#
;# two_rows_vert Ceven, Codd
;#   In:  v12, v13 = horizontal results, v6 = rounding constant,
;#        v7 = shift count, Ceven/Codd = two quadwords of dct_tab.
;#   Out: v8 = two output rows (16-bit coefficients).
;#   Clobbers v9, v10.
.macro two_rows_vert Ceven, Codd
    vspltw  v8, \Ceven, 0           ;# v8 = c00 c10  or  c02 c12 four times
    vspltw  v9, \Codd, 0            ;# v9 = c20 c30  or  c22 c32 ""
    vmsumshm v8, v8, v12, v6
    vmsumshm v8, v9, v13, v8
    vsraw   v10, v8, v7

    vspltw  v8, \Codd, 1            ;# v8 = c01 c11  or  c03 c13
    vspltw  v9, \Ceven, 1           ;# v9 = c21 c31  or  c23 c33
    vmsumshm v8, v8, v12, v6
    vmsumshm v8, v9, v13, v8
    vsraw   v8, v8, v7

    vpkuwum v8, v10, v8             ;# v8 = rows 0,1  or 2,3
.endm

;# two_rows_h Dest
;#   Gathers two consecutive 8-byte input rows into the aligned buffer at r8,
;#   loads them as one vector and runs the horizontal transform on them.
;#   In:  r0 = first word of the current row (already loaded by the caller),
;#        r3 = address of the current row, r5 = pitch, r8 = gather buffer.
;#   Out: \Dest = horizontal result; r3 = address of the second row of the
;#        pair (lwzux updates it).
;#   Clobbers r0, v8-v11.
.macro two_rows_h Dest
    stw     r0, 0(r8)
    lwz     r0, 4(r3)
    stw     r0, 4(r8)
    lwzux   r0, r3, r5              ;# r3 += pitch, r0 = first word of next row
    stw     r0, 8(r8)
    lwz     r0, 4(r3)
    stw     r0, 12(r8)
    lvx     v8, 0, r8
    two_rows_horiz \Dest
.endm

;# fdct4x4_block
;#   Transforms one 4x4 block: horizontal pass (shift 14) on rows 0/1 and 2/3,
;#   then vertical pass (shift 16), storing 32 bytes at r4.
;#   In:  r3 = first input row, r4 = output (stvx ignores the low four address
;#        bits, so r4 must be 16-byte aligned), r5 = pitch, r6 = 16,
;#        r9 = &round_tab, v6 = Hround, v0-v5 as loaded by `prologue`.
;#   Out: r3 = address of the block's last row, v6 = Vround.
;#   Clobbers r0, r8, v7-v13.
.macro fdct4x4_block
    vspltisw v7, 14                 ;# == 14, fits in 5 signed bits
    addi    r8, r1, SCRATCH_OFF     ;# gather buffer; keeps 0(r1) intact

    lwz     r0, 0(r3)
    two_rows_h v12                  ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13

    lwzux   r0, r3, r5
    two_rows_h v13                  ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33

    lvx     v6, r6, r9              ;# v6 = Vround
    vspltisw v7, -16                ;# == 16 == -16, only low 5 bits matter

    two_rows_vert v0, v1
    stvx    v8, 0, r4
    two_rows_vert v2, v3
    stvx    v8, r6, r4
.endm

    .align 2
;# vp8_short_fdct4x4_ppc
;#   Forward DCT of one 4x4 block of 16-bit samples.
;# r3 short *input
;# r4 short *output     (16 coefficients, 32 bytes)
;# r5 int    pitch
;# Clobbers r0, r6, r8-r12, v0-v13; VRSAVE is restored on exit.
vp8_short_fdct4x4_ppc:

    prologue

    fdct4x4_block

    epilogue

    blr

    .align 2
;# vp8_short_fdct8x4_ppc
;#   Forward DCT of two horizontally adjacent 4x4 blocks. The second block is
;#   read 8 bytes after the first and written 32 bytes after the first.
;# r3 short *input
;# r4 short *output     (2 x 16 coefficients, 64 bytes)
;# r5 int    pitch
;# Clobbers r0, r6, r8-r12, v0-v13; VRSAVE is restored on exit.
vp8_short_fdct8x4_ppc:
    prologue

    mr      r10, r3                 ;# remember the start of the first block

    fdct4x4_block

    ;# Next block
    addi    r3, r10, 8
    addi    r4, r4, 32
    lvx     v6, 0, r9               ;# v6 = Hround

    fdct4x4_block

    epilogue

    blr

    .data
    .align 4
;# Permute controls for vperm.
;#   row 0 (v4): swap the two 32-bit words inside each 64-bit half
;#   row 1 (v5): swap halfwords 1<->2 and 5<->6 (interleave A and B results)
ppc_dctperm_tab:
    .byte 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
    .byte 0,1,4,5, 2,3,6,7, 8,9,12,13, 10,11,14,15

    .align 4
;# Transform coefficients, scaled by 2^15:
;#   23170 ~= 2^15 * cos(pi/4), 30274 ~= 2^15 * cos(pi/8),
;#   12540 ~= 2^15 * sin(pi/8).
;# Quadwords 0..3 are loaded into v0..v3 by `prologue`.
dct_tab:
    .short  23170, 23170,-12540,-30274, 23170, 23170,-12540,-30274
    .short  23170, 23170, 30274, 12540, 23170, 23170, 30274, 12540

    .short  23170,-23170, 30274,-12540, 23170,-23170, 30274,-12540
    .short -23170, 23170, 12540,-30274,-23170, 23170, 12540,-30274

    .align 4
;# Rounding constants (half of the divisor of each pass):
;#   quadword 0 = Hround, used with the shift by 14
;#   quadword 1 = Vround, used with the shift by 16
round_tab:
    .long (1 << (14-1)), (1 << (14-1)), (1 << (14-1)), (1 << (14-1))
    .long (1 << (16-1)), (1 << (16-1)), (1 << (16-1)), (1 << (16-1))