Diffstat (limited to 'vp8/common/ppc')
-rw-r--r-- | vp8/common/ppc/copy_altivec.asm | 47
-rw-r--r-- | vp8/common/ppc/filter_altivec.asm | 1013
-rw-r--r-- | vp8/common/ppc/filter_bilinear_altivec.asm | 677
-rw-r--r-- | vp8/common/ppc/idctllm_altivec.asm | 189
-rw-r--r-- | vp8/common/ppc/loopfilter_altivec.c | 135
-rw-r--r-- | vp8/common/ppc/loopfilter_filters_altivec.asm | 1253
-rw-r--r-- | vp8/common/ppc/platform_altivec.asm | 59
-rw-r--r-- | vp8/common/ppc/recon_altivec.asm | 175
-rw-r--r-- | vp8/common/ppc/sad_altivec.asm | 277
-rw-r--r-- | vp8/common/ppc/systemdependent.c | 170
-rw-r--r-- | vp8/common/ppc/variance_altivec.asm | 375
-rw-r--r-- | vp8/common/ppc/variance_subpixel_altivec.asm | 865 |
12 files changed, 5235 insertions(+), 0 deletions(-)
diff --git a/vp8/common/ppc/copy_altivec.asm b/vp8/common/ppc/copy_altivec.asm new file mode 100644 index 000000000..a4ce91583 --- /dev/null +++ b/vp8/common/ppc/copy_altivec.asm @@ -0,0 +1,47 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + .globl copy_mem16x16_ppc + +;# r3 unsigned char *src +;# r4 int src_stride +;# r5 unsigned char *dst +;# r6 int dst_stride + +;# Make the assumption that input will not be aligned, +;# but the output will be. So two reads and a perm +;# for the input, but only one store for the output. +copy_mem16x16_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xe000 + mtspr 256, r12 ;# set VRSAVE + + li r10, 16 + mtctr r10 + +cp_16x16_loop: + lvsl v0, 0, r3 ;# permutate value for alignment + + lvx v1, 0, r3 + lvx v2, r10, r3 + + vperm v1, v1, v2, v0 + + stvx v1, 0, r5 + + add r3, r3, r4 ;# increment source pointer + add r5, r5, r6 ;# increment destination pointer + + bdnz cp_16x16_loop + + mtspr 256, r11 ;# reset old VRSAVE + + blr diff --git a/vp8/common/ppc/filter_altivec.asm b/vp8/common/ppc/filter_altivec.asm new file mode 100644 index 000000000..4da2e94f9 --- /dev/null +++ b/vp8/common/ppc/filter_altivec.asm @@ -0,0 +1,1013 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + .globl sixtap_predict_ppc + .globl sixtap_predict8x4_ppc + .globl sixtap_predict8x8_ppc + .globl sixtap_predict16x16_ppc + +.macro load_c V, LABEL, OFF, R0, R1 + lis \R0, \LABEL@ha + la \R1, \LABEL@l(\R0) + lvx \V, \OFF, \R1 +.endm + +.macro load_hfilter V0, V1 + load_c \V0, HFilter, r5, r9, r10 + + addi r5, r5, 16 + lvx \V1, r5, r10 +.endm + +;# Vertical filtering +.macro Vprolog + load_c v0, VFilter, r6, r3, r10 + + vspltish v5, 8 + vspltish v6, 3 + vslh v6, v5, v6 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + vspltb v1, v0, 1 + vspltb v2, v0, 2 + vspltb v3, v0, 3 + vspltb v4, v0, 4 + vspltb v5, v0, 5 + vspltb v0, v0, 0 +.endm + +.macro vpre_load + Vprolog + li r10, 16 + lvx v10, 0, r9 ;# v10..v14 = first 5 rows + lvx v11, r10, r9 + addi r9, r9, 32 + lvx v12, 0, r9 + lvx v13, r10, r9 + addi r9, r9, 32 + lvx v14, 0, r9 +.endm + +.macro Msum Re, Ro, V, T, TMP + ;# (Re,Ro) += (V*T) + vmuleub \TMP, \V, \T ;# trashes v8 + vadduhm \Re, \Re, \TMP ;# Re = evens, saturation unnecessary + vmuloub \TMP, \V, \T + vadduhm \Ro, \Ro, \TMP ;# Ro = odds +.endm + +.macro vinterp_no_store P0 P1 P2 P3 P4 P5 + vmuleub v8, \P0, v0 ;# 64 + 4 positive taps + vadduhm v16, v6, v8 + vmuloub v8, \P0, v0 + vadduhm v17, v6, v8 + Msum v16, v17, \P2, v2, v8 + Msum v16, v17, \P3, v3, v8 + Msum v16, v17, \P5, v5, v8 + + vmuleub v18, \P1, v1 ;# 2 negative taps + vmuloub v19, \P1, v1 + Msum v18, v19, \P4, v4, v8 + + vsubuhs v16, v16, v18 ;# subtract neg from pos + vsubuhs v17, v17, v19 + vsrh v16, v16, v7 ;# divide by 128 + vsrh v17, v17, v7 ;# v16 v17 = evens, odds + vmrghh v18, v16, v17 ;# v18 v19 = 16-bit result in order + vmrglh v19, v16, v17 + vpkuhus \P0, v18, v19 ;# P0 = 8-bit result +.endm + +.macro vinterp_no_store_8x8 P0 P1 P2 P3 P4 P5 + vmuleub v24, \P0, v13 ;# 64 + 4 positive taps + vadduhm v21, v20, v24 + vmuloub v24, \P0, v13 + vadduhm v22, v20, v24 + Msum v21, v22, \P2, v15, v25 + Msum v21, v22, \P3, v16, v25 + Msum v21, v22, \P5, v18, v25 + + vmuleub v23, \P1, v14 ;# 2 negative taps + vmuloub v24, \P1, v14 + Msum v23, v24, \P4, v17, v25 + + vsubuhs v21, v21, v23 ;# subtract neg from pos + vsubuhs v22, v22, v24 + vsrh v21, v21, v19 ;# divide by 128 + vsrh v22, v22, v19 ;# v16 v17 = evens, odds + vmrghh v23, v21, v22 ;# v18 v19 = 16-bit result in order + vmrglh v24, v21, v22 + vpkuhus \P0, v23, v24 ;# P0 = 8-bit result +.endm + + +.macro Vinterp P0 P1 P2 P3 P4 P5 + vinterp_no_store \P0, \P1, \P2, \P3, \P4, \P5 + stvx \P0, 0, r7 + add r7, r7, r8 ;# 33 ops per 16 pels +.endm + + +.macro luma_v P0, P1, P2, P3, P4, P5 + addi r9, r9, 16 ;# P5 = newest input row + lvx \P5, 0, r9 + Vinterp \P0, \P1, \P2, \P3, \P4, \P5 +.endm + +.macro luma_vtwo + luma_v v10, v11, v12, v13, v14, v15 + luma_v v11, v12, v13, v14, v15, v10 +.endm + +.macro luma_vfour + luma_vtwo + luma_v v12, v13, v14, v15, v10, v11 + luma_v v13, v14, v15, v10, v11, v12 +.endm + +.macro luma_vsix + luma_vfour + luma_v v14, v15, v10, v11, v12, v13 + luma_v v15, v10, v11, v12, v13, v14 +.endm + +.macro Interp4 R I I4 + vmsummbm \R, v13, \I, v15 + vmsummbm \R, v14, \I4, \R +.endm + +.macro Read8x8 VD, RS, RP, increment_counter + lvsl v21, 0, \RS ;# permutate value for alignment + + ;# input to filter is 21 bytes wide, output is 16 bytes. + ;# input will can span three vectors if not aligned correctly. 
+ lvx \VD, 0, \RS + lvx v20, r10, \RS + +.if \increment_counter + add \RS, \RS, \RP +.endif + + vperm \VD, \VD, v20, v21 +.endm + +.macro interp_8x8 R + vperm v20, \R, \R, v16 ;# v20 = 0123 1234 2345 3456 + vperm v21, \R, \R, v17 ;# v21 = 4567 5678 6789 789A + Interp4 v20, v20, v21 ;# v20 = result 0 1 2 3 + vperm \R, \R, \R, v18 ;# R = 89AB 9ABC ABCx BCxx + Interp4 v21, v21, \R ;# v21 = result 4 5 6 7 + + vpkswus \R, v20, v21 ;# R = 0 1 2 3 4 5 6 7 + vsrh \R, \R, v19 + + vpkuhus \R, \R, \R ;# saturate and pack + +.endm + +.macro Read4x4 VD, RS, RP, increment_counter + lvsl v21, 0, \RS ;# permutate value for alignment + + ;# input to filter is 21 bytes wide, output is 16 bytes. + ;# input will can span three vectors if not aligned correctly. + lvx v20, 0, \RS + +.if \increment_counter + add \RS, \RS, \RP +.endif + + vperm \VD, v20, v20, v21 +.endm + .text + + .align 2 +;# r3 unsigned char * src +;# r4 int src_pitch +;# r5 int x_offset +;# r6 int y_offset +;# r7 unsigned char * dst +;# r8 int dst_pitch +sixtap_predict_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xff87 + ori r12, r12, 0xffc0 + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + slwi. r5, r5, 5 ;# index into horizontal filter array + + vspltish v19, 7 + + ;# If there isn't any filtering to be done for the horizontal, then + ;# just skip to the second pass. + beq- vertical_only_4x4 + + ;# load up horizontal filter + load_hfilter v13, v14 + + ;# rounding added in on the multiply + vspltisw v16, 8 + vspltisw v15, 3 + vslw v15, v16, v15 ;# 0x00000040000000400000004000000040 + + ;# Load up permutation constants + load_c v16, B_0123, 0, r9, r10 + load_c v17, B_4567, 0, r9, r10 + load_c v18, B_89AB, 0, r9, r10 + + ;# Back off input buffer by 2 bytes. Need 2 before and 3 after + addi r3, r3, -2 + + addi r9, r3, 0 + li r10, 16 + Read8x8 v2, r3, r4, 1 + Read8x8 v3, r3, r4, 1 + Read8x8 v4, r3, r4, 1 + Read8x8 v5, r3, r4, 1 + + slwi. r6, r6, 4 ;# index into vertical filter array + + ;# filter a line + interp_8x8 v2 + interp_8x8 v3 + interp_8x8 v4 + interp_8x8 v5 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional 5 lines that are needed + ;# for the vertical filter. + beq- store_4x4 + + ;# only needed if there is a vertical filter present + ;# if the second filter is not null then need to back off by 2*pitch + sub r9, r9, r4 + sub r9, r9, r4 + + Read8x8 v0, r9, r4, 1 + Read8x8 v1, r9, r4, 0 + Read8x8 v6, r3, r4, 1 + Read8x8 v7, r3, r4, 1 + Read8x8 v8, r3, r4, 0 + + interp_8x8 v0 + interp_8x8 v1 + interp_8x8 v6 + interp_8x8 v7 + interp_8x8 v8 + + b second_pass_4x4 + +vertical_only_4x4: + ;# only needed if there is a vertical filter present + ;# if the second filter is not null then need to back off by 2*pitch + sub r3, r3, r4 + sub r3, r3, r4 + li r10, 16 + + Read8x8 v0, r3, r4, 1 + Read8x8 v1, r3, r4, 1 + Read8x8 v2, r3, r4, 1 + Read8x8 v3, r3, r4, 1 + Read8x8 v4, r3, r4, 1 + Read8x8 v5, r3, r4, 1 + Read8x8 v6, r3, r4, 1 + Read8x8 v7, r3, r4, 1 + Read8x8 v8, r3, r4, 0 + + slwi r6, r6, 4 ;# index into vertical filter array + +second_pass_4x4: + load_c v20, b_hilo_4x4, 0, r9, r10 + load_c v21, b_hilo, 0, r9, r10 + + ;# reposition input so that it can go through the + ;# filtering phase with one pass. 
+ vperm v0, v0, v1, v20 ;# 0 1 x x + vperm v2, v2, v3, v20 ;# 2 3 x x + vperm v4, v4, v5, v20 ;# 4 5 x x + vperm v6, v6, v7, v20 ;# 6 7 x x + + vperm v0, v0, v2, v21 ;# 0 1 2 3 + vperm v4, v4, v6, v21 ;# 4 5 6 7 + + vsldoi v1, v0, v4, 4 + vsldoi v2, v0, v4, 8 + vsldoi v3, v0, v4, 12 + + vsldoi v5, v4, v8, 4 + + load_c v13, VFilter, r6, r9, r10 + + vspltish v15, 8 + vspltish v20, 3 + vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + vspltb v14, v13, 1 + vspltb v15, v13, 2 + vspltb v16, v13, 3 + vspltb v17, v13, 4 + vspltb v18, v13, 5 + vspltb v13, v13, 0 + + vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5 + + stvx v0, 0, r1 + + lwz r0, 0(r1) + stw r0, 0(r7) + add r7, r7, r8 + + lwz r0, 4(r1) + stw r0, 0(r7) + add r7, r7, r8 + + lwz r0, 8(r1) + stw r0, 0(r7) + add r7, r7, r8 + + lwz r0, 12(r1) + stw r0, 0(r7) + + b exit_4x4 + +store_4x4: + + stvx v2, 0, r1 + lwz r0, 0(r1) + stw r0, 0(r7) + add r7, r7, r8 + + stvx v3, 0, r1 + lwz r0, 0(r1) + stw r0, 0(r7) + add r7, r7, r8 + + stvx v4, 0, r1 + lwz r0, 0(r1) + stw r0, 0(r7) + add r7, r7, r8 + + stvx v5, 0, r1 + lwz r0, 0(r1) + stw r0, 0(r7) + +exit_4x4: + + addi r1, r1, 32 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE + + blr + +.macro w_8x8 V, D, R, P + stvx \V, 0, r1 + lwz \R, 0(r1) + stw \R, 0(r7) + lwz \R, 4(r1) + stw \R, 4(r7) + add \D, \D, \P +.endm + + .align 2 +;# r3 unsigned char * src +;# r4 int src_pitch +;# r5 int x_offset +;# r6 int y_offset +;# r7 unsigned char * dst +;# r8 int dst_pitch + +sixtap_predict8x4_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xffc0 + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + slwi. r5, r5, 5 ;# index into horizontal filter array + + vspltish v19, 7 + + ;# If there isn't any filtering to be done for the horizontal, then + ;# just skip to the second pass. + beq- second_pass_pre_copy_8x4 + + load_hfilter v13, v14 + + ;# rounding added in on the multiply + vspltisw v16, 8 + vspltisw v15, 3 + vslw v15, v16, v15 ;# 0x00000040000000400000004000000040 + + ;# Load up permutation constants + load_c v16, B_0123, 0, r9, r10 + load_c v17, B_4567, 0, r9, r10 + load_c v18, B_89AB, 0, r9, r10 + + ;# Back off input buffer by 2 bytes. Need 2 before and 3 after + addi r3, r3, -2 + + addi r9, r3, 0 + li r10, 16 + Read8x8 v2, r3, r4, 1 + Read8x8 v3, r3, r4, 1 + Read8x8 v4, r3, r4, 1 + Read8x8 v5, r3, r4, 1 + + slwi. r6, r6, 4 ;# index into vertical filter array + + ;# filter a line + interp_8x8 v2 + interp_8x8 v3 + interp_8x8 v4 + interp_8x8 v5 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional 5 lines that are needed + ;# for the vertical filter. 
+ beq- store_8x4 + + ;# only needed if there is a vertical filter present + ;# if the second filter is not null then need to back off by 2*pitch + sub r9, r9, r4 + sub r9, r9, r4 + + Read8x8 v0, r9, r4, 1 + Read8x8 v1, r9, r4, 0 + Read8x8 v6, r3, r4, 1 + Read8x8 v7, r3, r4, 1 + Read8x8 v8, r3, r4, 0 + + interp_8x8 v0 + interp_8x8 v1 + interp_8x8 v6 + interp_8x8 v7 + interp_8x8 v8 + + b second_pass_8x4 + +second_pass_pre_copy_8x4: + ;# only needed if there is a vertical filter present + ;# if the second filter is not null then need to back off by 2*pitch + sub r3, r3, r4 + sub r3, r3, r4 + li r10, 16 + + Read8x8 v0, r3, r4, 1 + Read8x8 v1, r3, r4, 1 + Read8x8 v2, r3, r4, 1 + Read8x8 v3, r3, r4, 1 + Read8x8 v4, r3, r4, 1 + Read8x8 v5, r3, r4, 1 + Read8x8 v6, r3, r4, 1 + Read8x8 v7, r3, r4, 1 + Read8x8 v8, r3, r4, 1 + + slwi r6, r6, 4 ;# index into vertical filter array + +second_pass_8x4: + load_c v13, VFilter, r6, r9, r10 + + vspltish v15, 8 + vspltish v20, 3 + vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + vspltb v14, v13, 1 + vspltb v15, v13, 2 + vspltb v16, v13, 3 + vspltb v17, v13, 4 + vspltb v18, v13, 5 + vspltb v13, v13, 0 + + vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5 + vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6 + vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7 + vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8 + + cmpi cr0, r8, 8 + beq cr0, store_aligned_8x4 + + w_8x8 v0, r7, r0, r8 + w_8x8 v1, r7, r0, r8 + w_8x8 v2, r7, r0, r8 + w_8x8 v3, r7, r0, r8 + + b exit_8x4 + +store_aligned_8x4: + + load_c v10, b_hilo, 0, r9, r10 + + vperm v0, v0, v1, v10 + vperm v2, v2, v3, v10 + + stvx v0, 0, r7 + addi r7, r7, 16 + stvx v2, 0, r7 + + b exit_8x4 + +store_8x4: + cmpi cr0, r8, 8 + beq cr0, store_aligned2_8x4 + + w_8x8 v2, r7, r0, r8 + w_8x8 v3, r7, r0, r8 + w_8x8 v4, r7, r0, r8 + w_8x8 v5, r7, r0, r8 + + b exit_8x4 + +store_aligned2_8x4: + load_c v10, b_hilo, 0, r9, r10 + + vperm v2, v2, v3, v10 + vperm v4, v4, v5, v10 + + stvx v2, 0, r7 + addi r7, r7, 16 + stvx v4, 0, r7 + +exit_8x4: + + addi r1, r1, 32 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE + + + blr + + .align 2 +;# r3 unsigned char * src +;# r4 int src_pitch +;# r5 int x_offset +;# r6 int y_offset +;# r7 unsigned char * dst +;# r8 int dst_pitch + +;# Because the width that needs to be filtered will fit in a single altivec +;# register there is no need to loop. Everything can stay in registers. +sixtap_predict8x8_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xffc0 + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + slwi. r5, r5, 5 ;# index into horizontal filter array + + vspltish v19, 7 + + ;# If there isn't any filtering to be done for the horizontal, then + ;# just skip to the second pass. + beq- second_pass_pre_copy_8x8 + + load_hfilter v13, v14 + + ;# rounding added in on the multiply + vspltisw v16, 8 + vspltisw v15, 3 + vslw v15, v16, v15 ;# 0x00000040000000400000004000000040 + + ;# Load up permutation constants + load_c v16, B_0123, 0, r9, r10 + load_c v17, B_4567, 0, r9, r10 + load_c v18, B_89AB, 0, r9, r10 + + ;# Back off input buffer by 2 bytes. Need 2 before and 3 after + addi r3, r3, -2 + + addi r9, r3, 0 + li r10, 16 + Read8x8 v2, r3, r4, 1 + Read8x8 v3, r3, r4, 1 + Read8x8 v4, r3, r4, 1 + Read8x8 v5, r3, r4, 1 + Read8x8 v6, r3, r4, 1 + Read8x8 v7, r3, r4, 1 + Read8x8 v8, r3, r4, 1 + Read8x8 v9, r3, r4, 1 + + slwi. 
r6, r6, 4 ;# index into vertical filter array + + ;# filter a line + interp_8x8 v2 + interp_8x8 v3 + interp_8x8 v4 + interp_8x8 v5 + interp_8x8 v6 + interp_8x8 v7 + interp_8x8 v8 + interp_8x8 v9 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional 5 lines that are needed + ;# for the vertical filter. + beq- store_8x8 + + ;# only needed if there is a vertical filter present + ;# if the second filter is not null then need to back off by 2*pitch + sub r9, r9, r4 + sub r9, r9, r4 + + Read8x8 v0, r9, r4, 1 + Read8x8 v1, r9, r4, 0 + Read8x8 v10, r3, r4, 1 + Read8x8 v11, r3, r4, 1 + Read8x8 v12, r3, r4, 0 + + interp_8x8 v0 + interp_8x8 v1 + interp_8x8 v10 + interp_8x8 v11 + interp_8x8 v12 + + b second_pass_8x8 + +second_pass_pre_copy_8x8: + ;# only needed if there is a vertical filter present + ;# if the second filter is not null then need to back off by 2*pitch + sub r3, r3, r4 + sub r3, r3, r4 + li r10, 16 + + Read8x8 v0, r3, r4, 1 + Read8x8 v1, r3, r4, 1 + Read8x8 v2, r3, r4, 1 + Read8x8 v3, r3, r4, 1 + Read8x8 v4, r3, r4, 1 + Read8x8 v5, r3, r4, 1 + Read8x8 v6, r3, r4, 1 + Read8x8 v7, r3, r4, 1 + Read8x8 v8, r3, r4, 1 + Read8x8 v9, r3, r4, 1 + Read8x8 v10, r3, r4, 1 + Read8x8 v11, r3, r4, 1 + Read8x8 v12, r3, r4, 0 + + slwi r6, r6, 4 ;# index into vertical filter array + +second_pass_8x8: + load_c v13, VFilter, r6, r9, r10 + + vspltish v15, 8 + vspltish v20, 3 + vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + vspltb v14, v13, 1 + vspltb v15, v13, 2 + vspltb v16, v13, 3 + vspltb v17, v13, 4 + vspltb v18, v13, 5 + vspltb v13, v13, 0 + + vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5 + vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6 + vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7 + vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8 + vinterp_no_store_8x8 v4, v5, v6, v7, v8, v9 + vinterp_no_store_8x8 v5, v6, v7, v8, v9, v10 + vinterp_no_store_8x8 v6, v7, v8, v9, v10, v11 + vinterp_no_store_8x8 v7, v8, v9, v10, v11, v12 + + cmpi cr0, r8, 8 + beq cr0, store_aligned_8x8 + + w_8x8 v0, r7, r0, r8 + w_8x8 v1, r7, r0, r8 + w_8x8 v2, r7, r0, r8 + w_8x8 v3, r7, r0, r8 + w_8x8 v4, r7, r0, r8 + w_8x8 v5, r7, r0, r8 + w_8x8 v6, r7, r0, r8 + w_8x8 v7, r7, r0, r8 + + b exit_8x8 + +store_aligned_8x8: + + load_c v10, b_hilo, 0, r9, r10 + + vperm v0, v0, v1, v10 + vperm v2, v2, v3, v10 + vperm v4, v4, v5, v10 + vperm v6, v6, v7, v10 + + stvx v0, 0, r7 + addi r7, r7, 16 + stvx v2, 0, r7 + addi r7, r7, 16 + stvx v4, 0, r7 + addi r7, r7, 16 + stvx v6, 0, r7 + + b exit_8x8 + +store_8x8: + cmpi cr0, r8, 8 + beq cr0, store_aligned2_8x8 + + w_8x8 v2, r7, r0, r8 + w_8x8 v3, r7, r0, r8 + w_8x8 v4, r7, r0, r8 + w_8x8 v5, r7, r0, r8 + w_8x8 v6, r7, r0, r8 + w_8x8 v7, r7, r0, r8 + w_8x8 v8, r7, r0, r8 + w_8x8 v9, r7, r0, r8 + + b exit_8x8 + +store_aligned2_8x8: + load_c v10, b_hilo, 0, r9, r10 + + vperm v2, v2, v3, v10 + vperm v4, v4, v5, v10 + vperm v6, v6, v7, v10 + vperm v8, v8, v9, v10 + + stvx v2, 0, r7 + addi r7, r7, 16 + stvx v4, 0, r7 + addi r7, r7, 16 + stvx v6, 0, r7 + addi r7, r7, 16 + stvx v8, 0, r7 + +exit_8x8: + + addi r1, r1, 32 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 2 +;# r3 unsigned char * src +;# r4 int src_pitch +;# r5 int x_offset +;# r6 int y_offset +;# r7 unsigned char * dst +;# r8 int dst_pitch + +;# Two pass filtering. First pass is Horizontal edges, second pass is vertical +;# edges. One of the filters can be null, but both won't be. 
Needs to use a +;# temporary buffer because the source buffer can't be modified and the buffer +;# for the destination is not large enough to hold the temporary data. +sixtap_predict16x16_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xf000 + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-416(r1) ;# create space on the stack + + ;# Three possiblities + ;# 1. First filter is null. Don't use a temp buffer. + ;# 2. Second filter is null. Don't use a temp buffer. + ;# 3. Neither are null, use temp buffer. + + ;# First Pass (horizontal edge) + ;# setup pointers for src + ;# if possiblity (1) then setup the src pointer to be the orginal and jump + ;# to second pass. this is based on if x_offset is 0. + + ;# load up horizontal filter + slwi. r5, r5, 5 ;# index into horizontal filter array + + load_hfilter v4, v5 + + beq- copy_horizontal_16x21 + + ;# Back off input buffer by 2 bytes. Need 2 before and 3 after + addi r3, r3, -2 + + slwi. r6, r6, 4 ;# index into vertical filter array + + ;# setup constants + ;# v14 permutation value for alignment + load_c v14, b_hperm, 0, r9, r10 + + ;# These statements are guessing that there won't be a second pass, + ;# but if there is then inside the bypass they need to be set + li r0, 16 ;# prepare for no vertical filter + + ;# Change the output pointer and pitch to be the actual + ;# desination instead of a temporary buffer. + addi r9, r7, 0 + addi r5, r8, 0 + + ;# no vertical filter, so write the output from the first pass + ;# directly into the output buffer. + beq- no_vertical_filter_bypass + + ;# if the second filter is not null then need to back off by 2*pitch + sub r3, r3, r4 + sub r3, r3, r4 + + ;# setup counter for the number of lines that are going to be filtered + li r0, 21 + + ;# use the stack as temporary storage + la r9, 48(r1) + li r5, 16 + +no_vertical_filter_bypass: + + mtctr r0 + + ;# rounding added in on the multiply + vspltisw v10, 8 + vspltisw v12, 3 + vslw v12, v10, v12 ;# 0x00000040000000400000004000000040 + + ;# downshift by 7 ( divide by 128 ) at the end + vspltish v13, 7 + + ;# index to the next set of vectors in the row. + li r10, 16 + li r12, 32 + +horizontal_loop_16x16: + + lvsl v15, 0, r3 ;# permutate value for alignment + + ;# input to filter is 21 bytes wide, output is 16 bytes. + ;# input will can span three vectors if not aligned correctly. + lvx v1, 0, r3 + lvx v2, r10, r3 + lvx v3, r12, r3 + + vperm v8, v1, v2, v15 + vperm v9, v2, v3, v15 ;# v8 v9 = 21 input pixels left-justified + + vsldoi v11, v8, v9, 4 + + ;# set 0 + vmsummbm v6, v4, v8, v12 ;# taps times elements + vmsummbm v0, v5, v11, v6 + + ;# set 1 + vsldoi v10, v8, v9, 1 + vsldoi v11, v8, v9, 5 + + vmsummbm v6, v4, v10, v12 + vmsummbm v1, v5, v11, v6 + + ;# set 2 + vsldoi v10, v8, v9, 2 + vsldoi v11, v8, v9, 6 + + vmsummbm v6, v4, v10, v12 + vmsummbm v2, v5, v11, v6 + + ;# set 3 + vsldoi v10, v8, v9, 3 + vsldoi v11, v8, v9, 7 + + vmsummbm v6, v4, v10, v12 + vmsummbm v3, v5, v11, v6 + + vpkswus v0, v0, v1 ;# v0 = 0 4 8 C 1 5 9 D (16-bit) + vpkswus v1, v2, v3 ;# v1 = 2 6 A E 3 7 B F + + vsrh v0, v0, v13 ;# divide v0, v1 by 128 + vsrh v1, v1, v13 + + vpkuhus v0, v0, v1 ;# v0 = scrambled 8-bit result + vperm v0, v0, v0, v14 ;# v0 = correctly-ordered result + + stvx v0, 0, r9 + add r9, r9, r5 + + add r3, r3, r4 + + bdnz horizontal_loop_16x16 + + ;# check again to see if vertical filter needs to be done. 
+ cmpi cr0, r6, 0 + beq cr0, end_16x16 + + ;# yes there is, so go to the second pass + b second_pass_16x16 + +copy_horizontal_16x21: + li r10, 21 + mtctr r10 + + li r10, 16 + + sub r3, r3, r4 + sub r3, r3, r4 + + ;# this is done above if there is a horizontal filter, + ;# if not it needs to be done down here. + slwi r6, r6, 4 ;# index into vertical filter array + + ;# always write to the stack when doing a horizontal copy + la r9, 48(r1) + +copy_horizontal_loop_16x21: + lvsl v15, 0, r3 ;# permutate value for alignment + + lvx v1, 0, r3 + lvx v2, r10, r3 + + vperm v8, v1, v2, v15 + + stvx v8, 0, r9 + addi r9, r9, 16 + + add r3, r3, r4 + + bdnz copy_horizontal_loop_16x21 + +second_pass_16x16: + + ;# always read from the stack when doing a vertical filter + la r9, 48(r1) + + ;# downshift by 7 ( divide by 128 ) at the end + vspltish v7, 7 + + vpre_load + + luma_vsix + luma_vsix + luma_vfour + +end_16x16: + + addi r1, r1, 416 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .data + + .align 4 +HFilter: + .byte 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12 + .byte -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0 + .byte 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36 + .byte -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0 + .byte 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50 + .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0 + .byte 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77 + .byte -16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0 + .byte 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93 + .byte -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0 + .byte 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108 + .byte -11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0 + .byte 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123 + .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0 + + .align 4 +VFilter: + .byte 0, 0,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 6,123, 12, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 2, 11,108, 36, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 9, 93, 50, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 3, 16, 77, 77, 16, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 6, 50, 93, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 1, 8, 36,108, 11, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 1, 12,123, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + + .align 4 +b_hperm: + .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 + + .align 4 +B_0123: + .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 + + .align 4 +B_4567: + .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 + + .align 4 +B_89AB: + .byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 + + .align 4 +b_hilo: + .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 + + .align 4 +b_hilo_4x4: + .byte 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0 diff --git a/vp8/common/ppc/filter_bilinear_altivec.asm b/vp8/common/ppc/filter_bilinear_altivec.asm new file mode 100644 index 000000000..fd8aa665f --- /dev/null +++ b/vp8/common/ppc/filter_bilinear_altivec.asm @@ -0,0 +1,677 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. 
All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + .globl bilinear_predict4x4_ppc + .globl bilinear_predict8x4_ppc + .globl bilinear_predict8x8_ppc + .globl bilinear_predict16x16_ppc + +.macro load_c V, LABEL, OFF, R0, R1 + lis \R0, \LABEL@ha + la \R1, \LABEL@l(\R0) + lvx \V, \OFF, \R1 +.endm + +.macro load_vfilter V0, V1 + load_c \V0, vfilter_b, r6, r9, r10 + + addi r6, r6, 16 + lvx \V1, r6, r10 +.endm + +.macro HProlog jump_label + ;# load up horizontal filter + slwi. r5, r5, 4 ;# index into horizontal filter array + + ;# index to the next set of vectors in the row. + li r10, 16 + li r12, 32 + + ;# downshift by 7 ( divide by 128 ) at the end + vspltish v19, 7 + + ;# If there isn't any filtering to be done for the horizontal, then + ;# just skip to the second pass. + beq \jump_label + + load_c v20, hfilter_b, r5, r9, r0 + + ;# setup constants + ;# v14 permutation value for alignment + load_c v28, b_hperm_b, 0, r9, r0 + + ;# rounding added in on the multiply + vspltisw v21, 8 + vspltisw v18, 3 + vslw v18, v21, v18 ;# 0x00000040000000400000004000000040 + + slwi. r6, r6, 5 ;# index into vertical filter array +.endm + +;# Filters a horizontal line +;# expects: +;# r3 src_ptr +;# r4 pitch +;# r10 16 +;# r12 32 +;# v17 perm intput +;# v18 rounding +;# v19 shift +;# v20 filter taps +;# v21 tmp +;# v22 tmp +;# v23 tmp +;# v24 tmp +;# v25 tmp +;# v26 tmp +;# v27 tmp +;# v28 perm output +;# +.macro HFilter V + vperm v24, v21, v21, v10 ;# v20 = 0123 1234 2345 3456 + vperm v25, v21, v21, v11 ;# v21 = 4567 5678 6789 789A + + vmsummbm v24, v20, v24, v18 + vmsummbm v25, v20, v25, v18 + + vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit) + + vsrh v24, v24, v19 ;# divide v0, v1 by 128 + + vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result +.endm + +.macro hfilter_8 V, increment_counter + lvsl v17, 0, r3 ;# permutate value for alignment + + ;# input to filter is 9 bytes wide, output is 8 bytes. + lvx v21, 0, r3 + lvx v22, r10, r3 + +.if \increment_counter + add r3, r3, r4 +.endif + vperm v21, v21, v22, v17 + + HFilter \V +.endm + + +.macro load_and_align_8 V, increment_counter + lvsl v17, 0, r3 ;# permutate value for alignment + + ;# input to filter is 21 bytes wide, output is 16 bytes. + ;# input will can span three vectors if not aligned correctly. 
+ lvx v21, 0, r3 + lvx v22, r10, r3 + +.if \increment_counter + add r3, r3, r4 +.endif + + vperm \V, v21, v22, v17 +.endm + +.macro write_aligned_8 V, increment_counter + stvx \V, 0, r7 + +.if \increment_counter + add r7, r7, r8 +.endif +.endm + +.macro vfilter_16 P0 P1 + vmuleub v22, \P0, v20 ;# 64 + 4 positive taps + vadduhm v22, v18, v22 + vmuloub v23, \P0, v20 + vadduhm v23, v18, v23 + + vmuleub v24, \P1, v21 + vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary + vmuloub v25, \P1, v21 + vadduhm v23, v23, v25 ;# Ro = odds + + vsrh v22, v22, v19 ;# divide by 128 + vsrh v23, v23, v19 ;# v16 v17 = evens, odds + vmrghh \P0, v22, v23 ;# v18 v19 = 16-bit result in order + vmrglh v23, v22, v23 + vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result +.endm + + +.macro w_8x8 V, D, R, P + stvx \V, 0, r1 + lwz \R, 0(r1) + stw \R, 0(r7) + lwz \R, 4(r1) + stw \R, 4(r7) + add \D, \D, \P +.endm + + + .align 2 +;# r3 unsigned char * src +;# r4 int src_pitch +;# r5 int x_offset +;# r6 int y_offset +;# r7 unsigned char * dst +;# r8 int dst_pitch +bilinear_predict4x4_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xf830 + ori r12, r12, 0xfff8 + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + HProlog second_pass_4x4_pre_copy_b + + ;# Load up permutation constants + load_c v10, b_0123_b, 0, r9, r12 + load_c v11, b_4567_b, 0, r9, r12 + + hfilter_8 v0, 1 + hfilter_8 v1, 1 + hfilter_8 v2, 1 + hfilter_8 v3, 1 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional line that is needed + ;# for the vertical filter. + beq store_out_4x4_b + + hfilter_8 v4, 0 + + b second_pass_4x4_b + +second_pass_4x4_pre_copy_b: + slwi r6, r6, 5 ;# index into vertical filter array + + load_and_align_8 v0, 1 + load_and_align_8 v1, 1 + load_and_align_8 v2, 1 + load_and_align_8 v3, 1 + load_and_align_8 v4, 1 + +second_pass_4x4_b: + vspltish v20, 8 + vspltish v18, 3 + vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + load_vfilter v20, v21 + + vfilter_16 v0, v1 + vfilter_16 v1, v2 + vfilter_16 v2, v3 + vfilter_16 v3, v4 + +store_out_4x4_b: + + stvx v0, 0, r1 + lwz r0, 0(r1) + stw r0, 0(r7) + add r7, r7, r8 + + stvx v1, 0, r1 + lwz r0, 0(r1) + stw r0, 0(r7) + add r7, r7, r8 + + stvx v2, 0, r1 + lwz r0, 0(r1) + stw r0, 0(r7) + add r7, r7, r8 + + stvx v3, 0, r1 + lwz r0, 0(r1) + stw r0, 0(r7) + +exit_4x4: + + addi r1, r1, 32 ;# recover stack + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 2 +;# r3 unsigned char * src +;# r4 int src_pitch +;# r5 int x_offset +;# r6 int y_offset +;# r7 unsigned char * dst +;# r8 int dst_pitch +bilinear_predict8x4_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xf830 + ori r12, r12, 0xfff8 + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + HProlog second_pass_8x4_pre_copy_b + + ;# Load up permutation constants + load_c v10, b_0123_b, 0, r9, r12 + load_c v11, b_4567_b, 0, r9, r12 + + hfilter_8 v0, 1 + hfilter_8 v1, 1 + hfilter_8 v2, 1 + hfilter_8 v3, 1 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional line that is needed + ;# for the vertical filter. 
+ beq store_out_8x4_b + + hfilter_8 v4, 0 + + b second_pass_8x4_b + +second_pass_8x4_pre_copy_b: + slwi r6, r6, 5 ;# index into vertical filter array + + load_and_align_8 v0, 1 + load_and_align_8 v1, 1 + load_and_align_8 v2, 1 + load_and_align_8 v3, 1 + load_and_align_8 v4, 1 + +second_pass_8x4_b: + vspltish v20, 8 + vspltish v18, 3 + vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + load_vfilter v20, v21 + + vfilter_16 v0, v1 + vfilter_16 v1, v2 + vfilter_16 v2, v3 + vfilter_16 v3, v4 + +store_out_8x4_b: + + cmpi cr0, r8, 8 + beq cr0, store_aligned_8x4_b + + w_8x8 v0, r7, r0, r8 + w_8x8 v1, r7, r0, r8 + w_8x8 v2, r7, r0, r8 + w_8x8 v3, r7, r0, r8 + + b exit_8x4 + +store_aligned_8x4_b: + load_c v10, b_hilo_b, 0, r9, r10 + + vperm v0, v0, v1, v10 + vperm v2, v2, v3, v10 + + stvx v0, 0, r7 + addi r7, r7, 16 + stvx v2, 0, r7 + +exit_8x4: + + addi r1, r1, 32 ;# recover stack + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 2 +;# r3 unsigned char * src +;# r4 int src_pitch +;# r5 int x_offset +;# r6 int y_offset +;# r7 unsigned char * dst +;# r8 int dst_pitch +bilinear_predict8x8_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xfff0 + ori r12, r12, 0xffff + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + HProlog second_pass_8x8_pre_copy_b + + ;# Load up permutation constants + load_c v10, b_0123_b, 0, r9, r12 + load_c v11, b_4567_b, 0, r9, r12 + + hfilter_8 v0, 1 + hfilter_8 v1, 1 + hfilter_8 v2, 1 + hfilter_8 v3, 1 + hfilter_8 v4, 1 + hfilter_8 v5, 1 + hfilter_8 v6, 1 + hfilter_8 v7, 1 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional line that is needed + ;# for the vertical filter. + beq store_out_8x8_b + + hfilter_8 v8, 0 + + b second_pass_8x8_b + +second_pass_8x8_pre_copy_b: + slwi r6, r6, 5 ;# index into vertical filter array + + load_and_align_8 v0, 1 + load_and_align_8 v1, 1 + load_and_align_8 v2, 1 + load_and_align_8 v3, 1 + load_and_align_8 v4, 1 + load_and_align_8 v5, 1 + load_and_align_8 v6, 1 + load_and_align_8 v7, 1 + load_and_align_8 v8, 0 + +second_pass_8x8_b: + vspltish v20, 8 + vspltish v18, 3 + vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + load_vfilter v20, v21 + + vfilter_16 v0, v1 + vfilter_16 v1, v2 + vfilter_16 v2, v3 + vfilter_16 v3, v4 + vfilter_16 v4, v5 + vfilter_16 v5, v6 + vfilter_16 v6, v7 + vfilter_16 v7, v8 + +store_out_8x8_b: + + cmpi cr0, r8, 8 + beq cr0, store_aligned_8x8_b + + w_8x8 v0, r7, r0, r8 + w_8x8 v1, r7, r0, r8 + w_8x8 v2, r7, r0, r8 + w_8x8 v3, r7, r0, r8 + w_8x8 v4, r7, r0, r8 + w_8x8 v5, r7, r0, r8 + w_8x8 v6, r7, r0, r8 + w_8x8 v7, r7, r0, r8 + + b exit_8x8 + +store_aligned_8x8_b: + load_c v10, b_hilo_b, 0, r9, r10 + + vperm v0, v0, v1, v10 + vperm v2, v2, v3, v10 + vperm v4, v4, v5, v10 + vperm v6, v6, v7, v10 + + stvx v0, 0, r7 + addi r7, r7, 16 + stvx v2, 0, r7 + addi r7, r7, 16 + stvx v4, 0, r7 + addi r7, r7, 16 + stvx v6, 0, r7 + +exit_8x8: + + addi r1, r1, 32 ;# recover stack + mtspr 256, r11 ;# reset old VRSAVE + + blr + +;# Filters a horizontal line +;# expects: +;# r3 src_ptr +;# r4 pitch +;# r10 16 +;# r12 32 +;# v17 perm intput +;# v18 rounding +;# v19 shift +;# v20 filter taps +;# v21 tmp +;# v22 tmp +;# v23 tmp +;# v24 tmp +;# v25 tmp +;# v26 tmp +;# v27 tmp +;# v28 perm output +;# +.macro hfilter_16 V, increment_counter + + lvsl v17, 0, r3 ;# permutate value for alignment + + ;# input to filter is 21 bytes wide, output is 16 bytes. 
+ ;# input will can span three vectors if not aligned correctly. + lvx v21, 0, r3 + lvx v22, r10, r3 + lvx v23, r12, r3 + +.if \increment_counter + add r3, r3, r4 +.endif + vperm v21, v21, v22, v17 + vperm v22, v22, v23, v17 ;# v8 v9 = 21 input pixels left-justified + + ;# set 0 + vmsummbm v24, v20, v21, v18 ;# taps times elements + + ;# set 1 + vsldoi v23, v21, v22, 1 + vmsummbm v25, v20, v23, v18 + + ;# set 2 + vsldoi v23, v21, v22, 2 + vmsummbm v26, v20, v23, v18 + + ;# set 3 + vsldoi v23, v21, v22, 3 + vmsummbm v27, v20, v23, v18 + + vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit) + vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F + + vsrh v24, v24, v19 ;# divide v0, v1 by 128 + vsrh v25, v25, v19 + + vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result + vperm \V, \V, v0, v28 ;# \V = correctly-ordered result +.endm + +.macro load_and_align_16 V, increment_counter + lvsl v17, 0, r3 ;# permutate value for alignment + + ;# input to filter is 21 bytes wide, output is 16 bytes. + ;# input will can span three vectors if not aligned correctly. + lvx v21, 0, r3 + lvx v22, r10, r3 + +.if \increment_counter + add r3, r3, r4 +.endif + + vperm \V, v21, v22, v17 +.endm + +.macro write_16 V, increment_counter + stvx \V, 0, r7 + +.if \increment_counter + add r7, r7, r8 +.endif +.endm + + .align 2 +;# r3 unsigned char * src +;# r4 int src_pitch +;# r5 int x_offset +;# r6 int y_offset +;# r7 unsigned char * dst +;# r8 int dst_pitch +bilinear_predict16x16_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xfff8 + mtspr 256, r12 ;# set VRSAVE + + HProlog second_pass_16x16_pre_copy_b + + hfilter_16 v0, 1 + hfilter_16 v1, 1 + hfilter_16 v2, 1 + hfilter_16 v3, 1 + hfilter_16 v4, 1 + hfilter_16 v5, 1 + hfilter_16 v6, 1 + hfilter_16 v7, 1 + hfilter_16 v8, 1 + hfilter_16 v9, 1 + hfilter_16 v10, 1 + hfilter_16 v11, 1 + hfilter_16 v12, 1 + hfilter_16 v13, 1 + hfilter_16 v14, 1 + hfilter_16 v15, 1 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional line that is needed + ;# for the vertical filter. 
+ beq store_out_16x16_b + + hfilter_16 v16, 0 + + b second_pass_16x16_b + +second_pass_16x16_pre_copy_b: + slwi r6, r6, 5 ;# index into vertical filter array + + load_and_align_16 v0, 1 + load_and_align_16 v1, 1 + load_and_align_16 v2, 1 + load_and_align_16 v3, 1 + load_and_align_16 v4, 1 + load_and_align_16 v5, 1 + load_and_align_16 v6, 1 + load_and_align_16 v7, 1 + load_and_align_16 v8, 1 + load_and_align_16 v9, 1 + load_and_align_16 v10, 1 + load_and_align_16 v11, 1 + load_and_align_16 v12, 1 + load_and_align_16 v13, 1 + load_and_align_16 v14, 1 + load_and_align_16 v15, 1 + load_and_align_16 v16, 0 + +second_pass_16x16_b: + vspltish v20, 8 + vspltish v18, 3 + vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + load_vfilter v20, v21 + + vfilter_16 v0, v1 + vfilter_16 v1, v2 + vfilter_16 v2, v3 + vfilter_16 v3, v4 + vfilter_16 v4, v5 + vfilter_16 v5, v6 + vfilter_16 v6, v7 + vfilter_16 v7, v8 + vfilter_16 v8, v9 + vfilter_16 v9, v10 + vfilter_16 v10, v11 + vfilter_16 v11, v12 + vfilter_16 v12, v13 + vfilter_16 v13, v14 + vfilter_16 v14, v15 + vfilter_16 v15, v16 + +store_out_16x16_b: + + write_16 v0, 1 + write_16 v1, 1 + write_16 v2, 1 + write_16 v3, 1 + write_16 v4, 1 + write_16 v5, 1 + write_16 v6, 1 + write_16 v7, 1 + write_16 v8, 1 + write_16 v9, 1 + write_16 v10, 1 + write_16 v11, 1 + write_16 v12, 1 + write_16 v13, 1 + write_16 v14, 1 + write_16 v15, 0 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .data + + .align 4 +hfilter_b: + .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0 + .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0 + .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0 + .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0 + .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0 + .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0 + .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0 + .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0 + + .align 4 +vfilter_b: + .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112 + .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96 + .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 + .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80 + .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48 + .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 + .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 + .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48 + .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80 + .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 + .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96 + .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112 + + .align 4 +b_hperm_b: + .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 + + .align 4 +b_0123_b: + .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 + + .align 4 +b_4567_b: + .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 + +b_hilo_b: + .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 diff --git a/vp8/common/ppc/idctllm_altivec.asm b/vp8/common/ppc/idctllm_altivec.asm new file mode 
100644 index 000000000..117d9cfc8 --- /dev/null +++ b/vp8/common/ppc/idctllm_altivec.asm @@ -0,0 +1,189 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + .globl short_idct4x4llm_ppc + +.macro load_c V, LABEL, OFF, R0, R1 + lis \R0, \LABEL@ha + la \R1, \LABEL@l(\R0) + lvx \V, \OFF, \R1 +.endm + +;# r3 short *input +;# r4 short *output +;# r5 int pitch + .align 2 +short_idct4x4llm_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xfff8 + mtspr 256, r12 ;# set VRSAVE + + load_c v8, sinpi8sqrt2, 0, r9, r10 + load_c v9, cospi8sqrt2minus1, 0, r9, r10 + load_c v10, hi_hi, 0, r9, r10 + load_c v11, lo_lo, 0, r9, r10 + load_c v12, shift_16, 0, r9, r10 + + li r10, 16 + lvx v0, 0, r3 ;# input ip[0], ip[ 4] + lvx v1, r10, r3 ;# input ip[8], ip[12] + + ;# first pass + vupkhsh v2, v0 + vupkhsh v3, v1 + vaddsws v6, v2, v3 ;# a1 = ip[0]+ip[8] + vsubsws v7, v2, v3 ;# b1 = ip[0]-ip[8] + + vupklsh v0, v0 + vmulosh v4, v0, v8 + vsraw v4, v4, v12 + vaddsws v4, v4, v0 ;# ip[ 4] * sin(pi/8) * sqrt(2) + + vupklsh v1, v1 + vmulosh v5, v1, v9 + vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2) + vaddsws v5, v5, v1 + + vsubsws v4, v4, v5 ;# c1 + + vmulosh v3, v1, v8 + vsraw v3, v3, v12 + vaddsws v3, v3, v1 ;# ip[12] * sin(pi/8) * sqrt(2) + + vmulosh v5, v0, v9 + vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2) + vaddsws v5, v5, v0 + + vaddsws v3, v3, v5 ;# d1 + + vaddsws v0, v6, v3 ;# a1 + d1 + vsubsws v3, v6, v3 ;# a1 - d1 + + vaddsws v1, v7, v4 ;# b1 + c1 + vsubsws v2, v7, v4 ;# b1 - c1 + + ;# transpose input + vmrghw v4, v0, v1 ;# a0 b0 a1 b1 + vmrghw v5, v2, v3 ;# c0 d0 c1 d1 + + vmrglw v6, v0, v1 ;# a2 b2 a3 b3 + vmrglw v7, v2, v3 ;# c2 d2 c3 d3 + + vperm v0, v4, v5, v10 ;# a0 b0 c0 d0 + vperm v1, v4, v5, v11 ;# a1 b1 c1 d1 + + vperm v2, v6, v7, v10 ;# a2 b2 c2 d2 + vperm v3, v6, v7, v11 ;# a3 b3 c3 d3 + + ;# second pass + vaddsws v6, v0, v2 ;# a1 = ip[0]+ip[8] + vsubsws v7, v0, v2 ;# b1 = ip[0]-ip[8] + + vmulosh v4, v1, v8 + vsraw v4, v4, v12 + vaddsws v4, v4, v1 ;# ip[ 4] * sin(pi/8) * sqrt(2) + + vmulosh v5, v3, v9 + vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2) + vaddsws v5, v5, v3 + + vsubsws v4, v4, v5 ;# c1 + + vmulosh v2, v3, v8 + vsraw v2, v2, v12 + vaddsws v2, v2, v3 ;# ip[12] * sin(pi/8) * sqrt(2) + + vmulosh v5, v1, v9 + vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2) + vaddsws v5, v5, v1 + + vaddsws v3, v2, v5 ;# d1 + + vaddsws v0, v6, v3 ;# a1 + d1 + vsubsws v3, v6, v3 ;# a1 - d1 + + vaddsws v1, v7, v4 ;# b1 + c1 + vsubsws v2, v7, v4 ;# b1 - c1 + + vspltish v6, 4 + vspltish v7, 3 + + vpkswss v0, v0, v1 + vpkswss v1, v2, v3 + + vaddshs v0, v0, v6 + vaddshs v1, v1, v6 + + vsrah v0, v0, v7 + vsrah v1, v1, v7 + + ;# transpose output + vmrghh v2, v0, v1 ;# a0 c0 a1 c1 a2 c2 a3 c3 + vmrglh v3, v0, v1 ;# b0 d0 b1 d1 b2 d2 b3 d3 + + vmrghh v0, v2, v3 ;# a0 b0 c0 d0 a1 b1 c1 d1 + vmrglh v1, v2, v3 ;# a2 b2 c2 d2 a3 b3 c3 d3 + + stwu r1,-416(r1) ;# create space on the stack + + stvx v0, 0, r1 + lwz r6, 0(r1) + stw r6, 0(r4) + lwz r6, 4(r1) + stw r6, 4(r4) + + add r4, r4, r5 + + lwz r6, 8(r1) + stw r6, 0(r4) + lwz r6, 12(r1) + stw r6, 4(r4) + + add r4, r4, r5 + + stvx v1, 0, r1 + lwz r6, 0(r1) + stw r6, 0(r4) + lwz r6, 4(r1) + stw r6, 
4(r4) + + add r4, r4, r5 + + lwz r6, 8(r1) + stw r6, 0(r4) + lwz r6, 12(r1) + stw r6, 4(r4) + + addi r1, r1, 416 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 4 +sinpi8sqrt2: + .short 35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468 + + .align 4 +cospi8sqrt2minus1: + .short 20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091 + + .align 4 +shift_16: + .long 16, 16, 16, 16 + + .align 4 +hi_hi: + .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 + + .align 4 +lo_lo: + .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 diff --git a/vp8/common/ppc/loopfilter_altivec.c b/vp8/common/ppc/loopfilter_altivec.c new file mode 100644 index 000000000..71bf6e2d7 --- /dev/null +++ b/vp8/common/ppc/loopfilter_altivec.c @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "loopfilter.h" +#include "onyxc_int.h" + +typedef void loop_filter_function_y_ppc +( + unsigned char *s, // source pointer + int p, // pitch + const signed char *flimit, + const signed char *limit, + const signed char *thresh +); + +typedef void loop_filter_function_uv_ppc +( + unsigned char *u, // source pointer + unsigned char *v, // source pointer + int p, // pitch + const signed char *flimit, + const signed char *limit, + const signed char *thresh +); + +typedef void loop_filter_function_s_ppc +( + unsigned char *s, // source pointer + int p, // pitch + const signed char *flimit +); + +loop_filter_function_y_ppc mbloop_filter_horizontal_edge_y_ppc; +loop_filter_function_y_ppc mbloop_filter_vertical_edge_y_ppc; +loop_filter_function_y_ppc loop_filter_horizontal_edge_y_ppc; +loop_filter_function_y_ppc loop_filter_vertical_edge_y_ppc; + +loop_filter_function_uv_ppc mbloop_filter_horizontal_edge_uv_ppc; +loop_filter_function_uv_ppc mbloop_filter_vertical_edge_uv_ppc; +loop_filter_function_uv_ppc loop_filter_horizontal_edge_uv_ppc; +loop_filter_function_uv_ppc loop_filter_vertical_edge_uv_ppc; + +loop_filter_function_s_ppc loop_filter_simple_horizontal_edge_ppc; +loop_filter_function_s_ppc loop_filter_simple_vertical_edge_ppc; + +// Horizontal MB filtering +void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr); + + if (u_ptr) + mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr); +} + +void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + (void)u_ptr; + (void)v_ptr; + (void)uv_stride; + loop_filter_simple_horizontal_edge_ppc(y_ptr, y_stride, lfi->mbflim); +} + +// Vertical MB Filtering +void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr); + + if (u_ptr) + mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr); +} + +void loop_filter_mbvs_ppc(unsigned char 
*y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + (void)u_ptr; + (void)v_ptr; + (void)uv_stride; + loop_filter_simple_vertical_edge_ppc(y_ptr, y_stride, lfi->mbflim); +} + +// Horizontal B Filtering +void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + // These should all be done at once with one call, instead of 3 + loop_filter_horizontal_edge_y_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr); + loop_filter_horizontal_edge_y_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr); + loop_filter_horizontal_edge_y_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr); + + if (u_ptr) + loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr); +} + +void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + (void)u_ptr; + (void)v_ptr; + (void)uv_stride; + loop_filter_simple_horizontal_edge_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim); + loop_filter_simple_horizontal_edge_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim); + loop_filter_simple_horizontal_edge_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim); +} + +// Vertical B Filtering +void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr); + + if (u_ptr) + loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr); +} + +void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + (void)u_ptr; + (void)v_ptr; + (void)uv_stride; + loop_filter_simple_vertical_edge_ppc(y_ptr + 4, y_stride, lfi->flim); + loop_filter_simple_vertical_edge_ppc(y_ptr + 8, y_stride, lfi->flim); + loop_filter_simple_vertical_edge_ppc(y_ptr + 12, y_stride, lfi->flim); +} diff --git a/vp8/common/ppc/loopfilter_filters_altivec.asm b/vp8/common/ppc/loopfilter_filters_altivec.asm new file mode 100644 index 000000000..61df4e976 --- /dev/null +++ b/vp8/common/ppc/loopfilter_filters_altivec.asm @@ -0,0 +1,1253 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + .globl mbloop_filter_horizontal_edge_y_ppc + .globl loop_filter_horizontal_edge_y_ppc + .globl mbloop_filter_vertical_edge_y_ppc + .globl loop_filter_vertical_edge_y_ppc + + .globl mbloop_filter_horizontal_edge_uv_ppc + .globl loop_filter_horizontal_edge_uv_ppc + .globl mbloop_filter_vertical_edge_uv_ppc + .globl loop_filter_vertical_edge_uv_ppc + + .globl loop_filter_simple_horizontal_edge_ppc + .globl loop_filter_simple_vertical_edge_ppc + + .text +;# We often need to perform transposes (and other transpose-like operations) +;# on matrices of data. This is simplified by the fact that we usually +;# operate on hunks of data whose dimensions are powers of 2, or at least +;# divisible by highish powers of 2. 
+;# +;# These operations can be very confusing. They become more straightforward +;# when we think of them as permutations of address bits: Concatenate a +;# group of vector registers and think of it as occupying a block of +;# memory beginning at address zero. The low four bits 0...3 of the +;# address then correspond to position within a register, the higher-order +;# address bits select the register. +;# +;# Although register selection, at the code level, is arbitrary, things +;# are simpler if we use contiguous ranges of register numbers, simpler +;# still if the low-order bits of the register number correspond to +;# conceptual address bits. We do this whenever reasonable. +;# +;# A 16x16 transpose can then be thought of as an operation on +;# a 256-element block of memory. It takes 8 bits 0...7 to address this +;# memory and the effect of a transpose is to interchange address bit +;# 0 with 4, 1 with 5, 2 with 6, and 3 with 7. Bits 0...3 index the +;# column, which is interchanged with the row addressed by bits 4..7. +;# +;# The altivec merge instructions provide a rapid means of effecting +;# many of these transforms. They operate at three widths (8,16,32). +;# Writing V(x) for vector register #x, paired merges permute address +;# indices as follows. +;# +;# 0->1 1->2 2->3 3->(4+d) (4+s)->0: +;# +;# vmrghb V( x), V( y), V( y + (1<<s)) +;# vmrglb V( x + (1<<d)), V( y), V( y + (1<<s)) +;# +;# +;# =0= 1->2 2->3 3->(4+d) (4+s)->1: +;# +;# vmrghh V( x), V( y), V( y + (1<<s)) +;# vmrglh V( x + (1<<d)), V( y), V( y + (1<<s)) +;# +;# +;# =0= =1= 2->3 3->(4+d) (4+s)->2: +;# +;# vmrghw V( x), V( y), V( y + (1<<s)) +;# vmrglw V( x + (1<<d)), V( y), V( y + (1<<s)) +;# +;# +;# Unfortunately, there is no doubleword merge instruction. +;# The following sequence uses "vperm" is a substitute. +;# Assuming that the selection masks b_hihi and b_lolo (defined in LFppc.c) +;# are in registers Vhihi and Vlolo, we can also effect the permutation +;# +;# =0= =1= =2= 3->(4+d) (4+s)->3 by the sequence: +;# +;# vperm V( x), V( y), V( y + (1<<s)), Vhihi +;# vperm V( x + (1<<d)), V( y), V( y + (1<<s)), Vlolo +;# +;# +;# Except for bits s and d, the other relationships between register +;# number (= high-order part of address) bits are at the disposal of +;# the programmer. +;# + +;# To avoid excess transposes, we filter all 3 vertical luma subblock +;# edges together. This requires a single 16x16 transpose, which, in +;# the above language, amounts to the following permutation of address +;# indices: 0<->4 1<->5 2<->6 3<->7, which we accomplish by +;# 4 iterations of the cyclic transform 0->1->2->3->4->5->6->7->0. +;# +;# Except for the fact that the destination registers get written +;# before we are done referencing the old contents, the cyclic transform +;# is effected by +;# +;# x = 0; do { +;# vmrghb V(2x), V(x), V(x+8); +;# vmrghb V(2x+1), V(x), V(x+8); +;# } while( ++x < 8); +;# +;# For clarity, and because we can afford it, we do this transpose +;# using all 32 registers, alternating the banks 0..15 and 16 .. 31, +;# leaving the final result in 16 .. 31, as the lower registers are +;# used in the filtering itself. 
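The address-bit picture above lends itself to a quick scalar check. The C sketch below is illustrative only: it is not part of this change, and the merge_pass helper is a made-up name. It models sixteen 16-byte registers as one 256-byte array, applies the vmrghb/vmrglb pairing from the pseudocode four times, and confirms that the net effect is the full 16x16 transpose, i.e. that each pass rotates every address bit up by one position so four passes swap bits 0<->4, 1<->5, 2<->6, 3<->7.

#include <stdio.h>
#include <string.h>

/* One pass of the cyclic step:
 *   vmrghb V(2x),   V(x), V(x+8)
 *   vmrglb V(2x+1), V(x), V(x+8)   for x = 0..7
 * Byte i of register r lives at address 16*r + i, so the register number
 * is address bits 4..7 and the byte position is bits 0..3. */
static void merge_pass(unsigned char dst[256], const unsigned char src[256])
{
    int x, k;
    for (x = 0; x < 8; x++) {
        const unsigned char *a = src + 16 * x;        /* V(x)    */
        const unsigned char *b = src + 16 * (x + 8);  /* V(x+8)  */
        unsigned char *hi = dst + 16 * (2 * x);       /* V(2x)   */
        unsigned char *lo = dst + 16 * (2 * x + 1);   /* V(2x+1) */
        for (k = 0; k < 8; k++) {
            hi[2 * k]     = a[k];      /* vmrghb: interleave upper halves */
            hi[2 * k + 1] = b[k];
            lo[2 * k]     = a[k + 8];  /* vmrglb: interleave lower halves */
            lo[2 * k + 1] = b[k + 8];
        }
    }
}

int main(void)
{
    unsigned char m[256], t[2][256];
    int r, c, pass, ok = 1;

    for (r = 0; r < 256; r++)
        m[r] = (unsigned char)r;          /* m[16*row + col] = pixel (row,col) */

    memcpy(t[0], m, 256);
    for (pass = 0; pass < 4; pass++)      /* four cyclic passes = full transpose */
        merge_pass(t[(pass + 1) & 1], t[pass & 1]);

    for (r = 0; r < 16; r++)
        for (c = 0; c < 16; c++)
            ok &= (t[0][16 * r + c] == m[16 * c + r]);

    printf("merge-based 16x16 transpose %s\n", ok ? "matches" : "differs");
    return 0;
}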
+;# +.macro Tpair A, B, X, Y + vmrghb \A, \X, \Y + vmrglb \B, \X, \Y +.endm + +;# Each step takes 8*2 = 16 instructions + +.macro t16_even + Tpair v16,v17, v0,v8 + Tpair v18,v19, v1,v9 + Tpair v20,v21, v2,v10 + Tpair v22,v23, v3,v11 + Tpair v24,v25, v4,v12 + Tpair v26,v27, v5,v13 + Tpair v28,v29, v6,v14 + Tpair v30,v31, v7,v15 +.endm + +.macro t16_odd + Tpair v0,v1, v16,v24 + Tpair v2,v3, v17,v25 + Tpair v4,v5, v18,v26 + Tpair v6,v7, v19,v27 + Tpair v8,v9, v20,v28 + Tpair v10,v11, v21,v29 + Tpair v12,v13, v22,v30 + Tpair v14,v15, v23,v31 +.endm + +;# Whole transpose takes 4*16 = 64 instructions + +.macro t16_full + t16_odd + t16_even + t16_odd + t16_even +.endm + +;# Vertical edge filtering requires transposes. For the simple filter, +;# we need to convert 16 rows of 4 pels each into 4 registers of 16 pels +;# each. Writing 0 ... 63 for the pixel indices, the desired result is: +;# +;# v0 = 0 1 ... 14 15 +;# v1 = 16 17 ... 30 31 +;# v2 = 32 33 ... 47 48 +;# v3 = 49 50 ... 62 63 +;# +;# In frame-buffer memory, the layout is: +;# +;# 0 16 32 48 +;# 1 17 33 49 +;# ... +;# 15 31 47 63. +;# +;# We begin by reading the data 32 bits at a time (using scalar operations) +;# into a temporary array, reading the rows of the array into vector registers, +;# with the following layout: +;# +;# v0 = 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60 +;# v1 = 1 17 33 49 5 21 ... 45 61 +;# v2 = 2 18 ... 46 62 +;# v3 = 3 19 ... 47 63 +;# +;# From the "address-bit" perspective discussed above, we simply need to +;# interchange bits 0 <-> 4 and 1 <-> 5, leaving bits 2 and 3 alone. +;# In other words, we transpose each of the four 4x4 submatrices. +;# +;# This transformation is its own inverse, and we need to perform it +;# again before writing the pixels back into the frame buffer. +;# +;# It acts in place on registers v0...v3, uses v4...v7 as temporaries, +;# and assumes that v14/v15 contain the b_hihi/b_lolo selectors +;# defined above. We think of both groups of 4 registers as having +;# "addresses" {0,1,2,3} * 16. +;# +.macro Transpose4times4x4 Vlo, Vhi + + ;# d=s=0 0->1 1->2 2->3 3->4 4->0 =5= + + vmrghb v4, v0, v1 + vmrglb v5, v0, v1 + vmrghb v6, v2, v3 + vmrglb v7, v2, v3 + + ;# d=0 s=1 =0= 1->2 2->3 3->4 4->5 5->1 + + vmrghh v0, v4, v6 + vmrglh v1, v4, v6 + vmrghh v2, v5, v7 + vmrglh v3, v5, v7 + + ;# d=s=0 =0= =1= 2->3 3->4 4->2 =5= + + vmrghw v4, v0, v1 + vmrglw v5, v0, v1 + vmrghw v6, v2, v3 + vmrglw v7, v2, v3 + + ;# d=0 s=1 =0= =1= =2= 3->4 4->5 5->3 + + vperm v0, v4, v6, \Vlo + vperm v1, v4, v6, \Vhi + vperm v2, v5, v7, \Vlo + vperm v3, v5, v7, \Vhi +.endm +;# end Transpose4times4x4 + + +;# Normal mb vertical edge filter transpose. +;# +;# We read 8 columns of data, initially in the following pattern: +;# +;# (0,0) (1,0) ... (7,0) (0,1) (1,1) ... (7,1) +;# (0,2) (1,2) ... (7,2) (0,3) (1,3) ... (7,3) +;# ... +;# (0,14) (1,14) .. (7,14) (0,15) (1,15) .. (7,15) +;# +;# and wish to convert to: +;# +;# (0,0) ... (0,15) +;# (1,0) ... (1,15) +;# ... +;# (7,0) ... (7,15). +;# +;# In "address bit" language, we wish to map +;# +;# 0->4 1->5 2->6 3->0 4->1 5->2 6->3, i.e., I -> (I+4) mod 7. +;# +;# This can be accomplished by 4 iterations of the cyclic transform +;# +;# I -> (I+1) mod 7; +;# +;# each iteration can be realized by (d=0, s=2): +;# +;# x = 0; do Tpair( V(2x),V(2x+1), V(x),V(x+4)) while( ++x < 4); +;# +;# The input/output is in registers v0...v7. We use v10...v17 as mirrors; +;# preserving v8 = sign converter. 
+;# +;# Inverse transpose is similar, except here I -> (I+3) mod 7 and the +;# result lands in the "mirror" registers v10...v17 +;# +.macro t8x16_odd + Tpair v10, v11, v0, v4 + Tpair v12, v13, v1, v5 + Tpair v14, v15, v2, v6 + Tpair v16, v17, v3, v7 +.endm + +.macro t8x16_even + Tpair v0, v1, v10, v14 + Tpair v2, v3, v11, v15 + Tpair v4, v5, v12, v16 + Tpair v6, v7, v13, v17 +.endm + +.macro transpose8x16_fwd + t8x16_odd + t8x16_even + t8x16_odd + t8x16_even +.endm + +.macro transpose8x16_inv + t8x16_odd + t8x16_even + t8x16_odd +.endm + +.macro Transpose16x16 + vmrghb v0, v16, v24 + vmrglb v1, v16, v24 + vmrghb v2, v17, v25 + vmrglb v3, v17, v25 + vmrghb v4, v18, v26 + vmrglb v5, v18, v26 + vmrghb v6, v19, v27 + vmrglb v7, v19, v27 + vmrghb v8, v20, v28 + vmrglb v9, v20, v28 + vmrghb v10, v21, v29 + vmrglb v11, v21, v29 + vmrghb v12, v22, v30 + vmrglb v13, v22, v30 + vmrghb v14, v23, v31 + vmrglb v15, v23, v31 + vmrghb v16, v0, v8 + vmrglb v17, v0, v8 + vmrghb v18, v1, v9 + vmrglb v19, v1, v9 + vmrghb v20, v2, v10 + vmrglb v21, v2, v10 + vmrghb v22, v3, v11 + vmrglb v23, v3, v11 + vmrghb v24, v4, v12 + vmrglb v25, v4, v12 + vmrghb v26, v5, v13 + vmrglb v27, v5, v13 + vmrghb v28, v6, v14 + vmrglb v29, v6, v14 + vmrghb v30, v7, v15 + vmrglb v31, v7, v15 + vmrghb v0, v16, v24 + vmrglb v1, v16, v24 + vmrghb v2, v17, v25 + vmrglb v3, v17, v25 + vmrghb v4, v18, v26 + vmrglb v5, v18, v26 + vmrghb v6, v19, v27 + vmrglb v7, v19, v27 + vmrghb v8, v20, v28 + vmrglb v9, v20, v28 + vmrghb v10, v21, v29 + vmrglb v11, v21, v29 + vmrghb v12, v22, v30 + vmrglb v13, v22, v30 + vmrghb v14, v23, v31 + vmrglb v15, v23, v31 + vmrghb v16, v0, v8 + vmrglb v17, v0, v8 + vmrghb v18, v1, v9 + vmrglb v19, v1, v9 + vmrghb v20, v2, v10 + vmrglb v21, v2, v10 + vmrghb v22, v3, v11 + vmrglb v23, v3, v11 + vmrghb v24, v4, v12 + vmrglb v25, v4, v12 + vmrghb v26, v5, v13 + vmrglb v27, v5, v13 + vmrghb v28, v6, v14 + vmrglb v29, v6, v14 + vmrghb v30, v7, v15 + vmrglb v31, v7, v15 +.endm + +;# load_g loads a global vector (whose address is in the local variable Gptr) +;# into vector register Vreg. Trashes r0 +.macro load_g Vreg, Gptr + lwz r0, \Gptr + lvx \Vreg, 0, r0 +.endm + +;# exploit the saturation here. if the answer is negative +;# it will be clamped to 0. orring 0 with a positive +;# number will be the positive number (abs) +;# RES = abs( A-B), trashes TMP +.macro Abs RES, TMP, A, B + vsububs \RES, \A, \B + vsububs \TMP, \B, \A + vor \RES, \RES, \TMP +.endm + +;# RES = Max( RES, abs( A-B)), trashes TMP +.macro max_abs RES, TMP, A, B + vsububs \TMP, \A, \B + vmaxub \RES, \RES, \TMP + vsububs \TMP, \B, \A + vmaxub \RES, \RES, \TMP +.endm + +.macro Masks + ;# build masks + ;# input is all 8 bit unsigned (0-255). need to + ;# do abs(vala-valb) > limit. but no need to compare each + ;# value to the limit. find the max of the absolute differences + ;# and compare that to the limit. 
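+;#
+;# A hypothetical scalar restatement of what this macro computes per pel
+;# (names are illustrative only; the vector code keeps everything as
+;# "true if exceeded" masks so it can vand/vandc them later):
+;#
+;#   static unsigned char absdiff(unsigned char a, unsigned char b)
+;#   {
+;#       return a > b ? a - b : b - a;
+;#   }
+;#
+;#   static void build_masks(unsigned char p3, unsigned char p2,
+;#                           unsigned char p1, unsigned char p0,
+;#                           unsigned char q0, unsigned char q1,
+;#                           unsigned char q2, unsigned char q3,
+;#                           unsigned char flimit, unsigned char limit,
+;#                           unsigned char thresh,
+;#                           int *hev, int *filter_off)
+;#   {
+;#       unsigned char m = absdiff(p1, p0);
+;#       if (absdiff(q1, q0) > m) m = absdiff(q1, q0);
+;#       *hev = m > thresh;                    /* high edge variance */
+;#
+;#       if (absdiff(p3, p2) > m) m = absdiff(p3, p2);
+;#       if (absdiff(p2, p1) > m) m = absdiff(p2, p1);
+;#       if (absdiff(q2, q1) > m) m = absdiff(q2, q1);
+;#       if (absdiff(q3, q2) > m) m = absdiff(q3, q2);
+;#
+;#       /* filtering is switched off when either limit is exceeded */
+;#       *filter_off = (m > limit) || (absdiff(p0, q0) > flimit);
+;#   }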
+ ;# First hev + Abs v14, v13, v2, v3 ;# |P1 - P0| + max_abs v14, v13, v5, v4 ;# |Q1 - Q0| + + vcmpgtub v10, v14, v10 ;# HEV = true if thresh exceeded + + ;# Next limit + max_abs v14, v13, v0, v1 ;# |P3 - P2| + max_abs v14, v13, v1, v2 ;# |P2 - P1| + max_abs v14, v13, v6, v5 ;# |Q2 - Q1| + max_abs v14, v13, v7, v6 ;# |Q3 - Q2| + + vcmpgtub v9, v14, v9 ;# R = true if limit exceeded + + ;# flimit + Abs v14, v13, v3, v4 ;# |P0 - Q0| + + vcmpgtub v8, v14, v8 ;# X = true if flimit exceeded + + vor v8, v8, v9 ;# R = true if flimit or limit exceeded + ;# done building masks +.endm + +.macro build_constants RFL, RLI, RTH, FL, LI, TH + ;# build constants + lvx \FL, 0, \RFL ;# flimit + lvx \LI, 0, \RLI ;# limit + lvx \TH, 0, \RTH ;# thresh + + vspltisb v11, 8 + vspltisb v12, 4 + vslb v11, v11, v12 ;# 0x80808080808080808080808080808080 +.endm + +.macro load_data_y + ;# setup strides/pointers to be able to access + ;# all of the data + add r5, r4, r4 ;# r5 = 2 * stride + sub r6, r3, r5 ;# r6 -> 2 rows back + neg r7, r4 ;# r7 = -stride + + ;# load 16 pixels worth of data to work on + sub r0, r6, r5 ;# r0 -> 4 rows back (temp) + lvx v0, 0, r0 ;# P3 (read only) + lvx v1, r7, r6 ;# P2 + lvx v2, 0, r6 ;# P1 + lvx v3, r7, r3 ;# P0 + lvx v4, 0, r3 ;# Q0 + lvx v5, r4, r3 ;# Q1 + lvx v6, r5, r3 ;# Q2 + add r0, r3, r5 ;# r0 -> 2 rows fwd (temp) + lvx v7, r4, r0 ;# Q3 (read only) +.endm + +;# Expects +;# v10 == HEV +;# v13 == tmp +;# v14 == tmp +.macro common_adjust P0, Q0, P1, Q1, HEV_PRESENT + vxor \P1, \P1, v11 ;# SP1 + vxor \P0, \P0, v11 ;# SP0 + vxor \Q0, \Q0, v11 ;# SQ0 + vxor \Q1, \Q1, v11 ;# SQ1 + + vsubsbs v13, \P1, \Q1 ;# f = c (P1 - Q1) +.if \HEV_PRESENT + vand v13, v13, v10 ;# f &= hev +.endif + vsubsbs v14, \Q0, \P0 ;# -126 <= X = Q0-P0 <= +126 + vaddsbs v13, v13, v14 + vaddsbs v13, v13, v14 + vaddsbs v13, v13, v14 ;# A = c( c(P1-Q1) + 3*(Q0-P0)) + + vandc v13, v13, v8 ;# f &= mask + + vspltisb v8, 3 + vspltisb v9, 4 + + vaddsbs v14, v13, v9 ;# f1 = c (f+4) + vaddsbs v15, v13, v8 ;# f2 = c (f+3) + + vsrab v13, v14, v8 ;# f1 >>= 3 + vsrab v15, v15, v8 ;# f2 >>= 3 + + vsubsbs \Q0, \Q0, v13 ;# u1 = c (SQ0 - f1) + vaddsbs \P0, \P0, v15 ;# u2 = c (SP0 + f2) +.endm + +.macro vp8_mbfilter + Masks + + ;# start the fitering here + vxor v1, v1, v11 ;# SP2 + vxor v2, v2, v11 ;# SP1 + vxor v3, v3, v11 ;# SP0 + vxor v4, v4, v11 ;# SQ0 + vxor v5, v5, v11 ;# SQ1 + vxor v6, v6, v11 ;# SQ2 + + ;# add outer taps if we have high edge variance + vsubsbs v13, v2, v5 ;# f = c (SP1-SQ1) + + vsubsbs v14, v4, v3 ;# SQ0-SP0 + vaddsbs v13, v13, v14 + vaddsbs v13, v13, v14 + vaddsbs v13, v13, v14 ;# f = c( c(SP1-SQ1) + 3*(SQ0-SP0)) + + vandc v13, v13, v8 ;# f &= mask + vand v15, v13, v10 ;# f2 = f & hev + + ;# save bottom 3 bits so that we round one side +4 and the other +3 + vspltisb v8, 3 + vspltisb v9, 4 + + vaddsbs v14, v15, v9 ;# f1 = c (f+4) + vaddsbs v15, v15, v8 ;# f2 = c (f+3) + + vsrab v14, v14, v8 ;# f1 >>= 3 + vsrab v15, v15, v8 ;# f2 >>= 3 + + vsubsbs v4, v4, v14 ;# u1 = c (SQ0 - f1) + vaddsbs v3, v3, v15 ;# u2 = c (SP0 + f2) + + ;# only apply wider filter if not high edge variance + vandc v13, v13, v10 ;# f &= ~hev + + vspltisb v9, 2 + vnor v8, v8, v8 + vsrb v9, v8, v9 ;# 0x3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f + vupkhsb v9, v9 ;# 0x003f003f003f003f003f003f003f003f + vspltisb v8, 9 + + ;# roughly 1/7th difference across boundary + vspltish v10, 7 + vmulosb v14, v8, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0)) + vmulesb v15, v8, v13 + vaddshs v14, v14, v9 ;# += 63 + vaddshs v15, v15, v9 + vsrah v14, v14, v10 ;# >>= 7 + vsrah 
v15, v15, v10 + vmrglh v10, v15, v14 + vmrghh v15, v15, v14 + + vpkshss v10, v15, v10 ;# X = saturated down to bytes + + vsubsbs v6, v6, v10 ;# subtract from Q and add to P + vaddsbs v1, v1, v10 + + vxor v6, v6, v11 + vxor v1, v1, v11 + + ;# roughly 2/7th difference across boundary + vspltish v10, 7 + vaddubm v12, v8, v8 + vmulosb v14, v12, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0)) + vmulesb v15, v12, v13 + vaddshs v14, v14, v9 + vaddshs v15, v15, v9 + vsrah v14, v14, v10 ;# >>= 7 + vsrah v15, v15, v10 + vmrglh v10, v15, v14 + vmrghh v15, v15, v14 + + vpkshss v10, v15, v10 ;# X = saturated down to bytes + + vsubsbs v5, v5, v10 ;# subtract from Q and add to P + vaddsbs v2, v2, v10 + + vxor v5, v5, v11 + vxor v2, v2, v11 + + ;# roughly 3/7th difference across boundary + vspltish v10, 7 + vaddubm v12, v12, v8 + vmulosb v14, v12, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0)) + vmulesb v15, v12, v13 + vaddshs v14, v14, v9 + vaddshs v15, v15, v9 + vsrah v14, v14, v10 ;# >>= 7 + vsrah v15, v15, v10 + vmrglh v10, v15, v14 + vmrghh v15, v15, v14 + + vpkshss v10, v15, v10 ;# X = saturated down to bytes + + vsubsbs v4, v4, v10 ;# subtract from Q and add to P + vaddsbs v3, v3, v10 + + vxor v4, v4, v11 + vxor v3, v3, v11 +.endm + +.macro SBFilter + Masks + + common_adjust v3, v4, v2, v5, 1 + + ;# outer tap adjustments + vspltisb v8, 1 + + vaddubm v13, v13, v8 ;# f += 1 + vsrab v13, v13, v8 ;# f >>= 1 + + vandc v13, v13, v10 ;# f &= ~hev + + vsubsbs v5, v5, v13 ;# u1 = c (SQ1 - f) + vaddsbs v2, v2, v13 ;# u2 = c (SP1 + f) + + vxor v2, v2, v11 + vxor v3, v3, v11 + vxor v4, v4, v11 + vxor v5, v5, v11 +.endm + + .align 2 +mbloop_filter_horizontal_edge_y_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + mtspr 256, r12 ;# set VRSAVE + + build_constants r5, r6, r7, v8, v9, v10 + + load_data_y + + vp8_mbfilter + + stvx v1, r7, r6 ;# P2 + stvx v2, 0, r6 ;# P1 + stvx v3, r7, r3 ;# P0 + stvx v4, 0, r3 ;# Q0 + stvx v5, r4, r3 ;# Q1 + stvx v6, r5, r3 ;# Q2 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 2 +;# r3 unsigned char *s +;# r4 int p +;# r5 const signed char *flimit +;# r6 const signed char *limit +;# r7 const signed char *thresh +loop_filter_horizontal_edge_y_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + mtspr 256, r12 ;# set VRSAVE + + build_constants r5, r6, r7, v8, v9, v10 + + load_data_y + + SBFilter + + stvx v2, 0, r6 ;# P1 + stvx v3, r7, r3 ;# P0 + stvx v4, 0, r3 ;# Q0 + stvx v5, r4, r3 ;# Q1 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + +;# Filtering a vertical mb. Each mb is aligned on a 16 byte boundary. +;# So we can read in an entire mb aligned. However if we want to filter the mb +;# edge we run into problems. For the loopfilter we require 4 bytes before the mb +;# and 4 after for a total of 8 bytes. Reading 16 bytes inorder to get 4 is a bit +;# of a waste. So this is an even uglier way to get around that. +;# Using the regular register file words are read in and then saved back out to +;# memory to align and order them up. Then they are read in using the +;# vector register file. 
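+;#
+;# A rough C sketch of that staging trick (hypothetical helper, not part
+;# of the build): two rows of 4+4 pels straddling the edge are copied with
+;# scalar stores into a 16-byte aligned scratch buffer, which the vector
+;# unit then picks up with a single aligned lvx:
+;#
+;#   #include <string.h>
+;#
+;#   /* s points at the edge; buf must be 16-byte aligned */
+;#   static void stage_two_rows(const unsigned char *s, int pitch,
+;#                              unsigned char buf[16])
+;#   {
+;#       memcpy(buf +  0, s - 4,         4);   /* 4 pels before the edge */
+;#       memcpy(buf +  4, s,             4);   /* 4 pels after the edge  */
+;#       memcpy(buf +  8, s - 4 + pitch, 4);   /* next row               */
+;#       memcpy(buf + 12, s + pitch,     4);
+;#   }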
+.macro RLVmb V, R + lwzux r0, r3, r4 + stw r0, 4(\R) + lwz r0,-4(r3) + stw r0, 0(\R) + lwzux r0, r3, r4 + stw r0,12(\R) + lwz r0,-4(r3) + stw r0, 8(\R) + lvx \V, 0, \R +.endm + +.macro WLVmb V, R + stvx \V, 0, \R + lwz r0,12(\R) + stwux r0, r3, r4 + lwz r0, 8(\R) + stw r0,-4(r3) + lwz r0, 4(\R) + stwux r0, r3, r4 + lwz r0, 0(\R) + stw r0,-4(r3) +.endm + + .align 2 +;# r3 unsigned char *s +;# r4 int p +;# r5 const signed char *flimit +;# r6 const signed char *limit +;# r7 const signed char *thresh +mbloop_filter_vertical_edge_y_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xc000 + mtspr 256, r12 ;# set VRSAVE + + la r9, -48(r1) ;# temporary space for reading in vectors + sub r3, r3, r4 + + RLVmb v0, r9 + RLVmb v1, r9 + RLVmb v2, r9 + RLVmb v3, r9 + RLVmb v4, r9 + RLVmb v5, r9 + RLVmb v6, r9 + RLVmb v7, r9 + + transpose8x16_fwd + + build_constants r5, r6, r7, v8, v9, v10 + + vp8_mbfilter + + transpose8x16_inv + + add r3, r3, r4 + neg r4, r4 + + WLVmb v17, r9 + WLVmb v16, r9 + WLVmb v15, r9 + WLVmb v14, r9 + WLVmb v13, r9 + WLVmb v12, r9 + WLVmb v11, r9 + WLVmb v10, r9 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + +.macro RL V, R, P + lvx \V, 0, \R + add \R, \R, \P +.endm + +.macro WL V, R, P + stvx \V, 0, \R + add \R, \R, \P +.endm + +.macro Fil P3, P2, P1, P0, Q0, Q1, Q2, Q3 + ;# K = |P0-P1| already + Abs v14, v13, \Q0, \Q1 ;# M = |Q0-Q1| + vmaxub v14, v14, v4 ;# M = max( |P0-P1|, |Q0-Q1|) + vcmpgtub v10, v14, v0 + + Abs v4, v5, \Q2, \Q3 ;# K = |Q2-Q3| = next |P0-P1] + + max_abs v14, v13, \Q1, \Q2 ;# M = max( M, |Q1-Q2|) + max_abs v14, v13, \P1, \P2 ;# M = max( M, |P1-P2|) + max_abs v14, v13, \P2, \P3 ;# M = max( M, |P2-P3|) + + vmaxub v14, v14, v4 ;# M = max interior abs diff + vcmpgtub v9, v14, v2 ;# M = true if int_l exceeded + + Abs v14, v13, \P0, \Q0 ;# X = Abs( P0-Q0) + vcmpgtub v8, v14, v3 ;# X = true if edge_l exceeded + vor v8, v8, v9 ;# M = true if edge_l or int_l exceeded + + ;# replace P1,Q1 w/signed versions + common_adjust \P0, \Q0, \P1, \Q1, 1 + + vaddubm v13, v13, v1 ;# -16 <= M <= 15, saturation irrelevant + vsrab v13, v13, v1 + vandc v13, v13, v10 ;# adjust P1,Q1 by (M+1)>>1 if ! 
hev + vsubsbs \Q1, \Q1, v13 + vaddsbs \P1, \P1, v13 + + vxor \P1, \P1, v11 ;# P1 + vxor \P0, \P0, v11 ;# P0 + vxor \Q0, \Q0, v11 ;# Q0 + vxor \Q1, \Q1, v11 ;# Q1 +.endm + + + .align 2 +;# r3 unsigned char *s +;# r4 int p +;# r5 const signed char *flimit +;# r6 const signed char *limit +;# r7 const signed char *thresh +loop_filter_vertical_edge_y_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xffff + mtspr 256, r12 ;# set VRSAVE + + addi r9, r3, 0 + RL v16, r9, r4 + RL v17, r9, r4 + RL v18, r9, r4 + RL v19, r9, r4 + RL v20, r9, r4 + RL v21, r9, r4 + RL v22, r9, r4 + RL v23, r9, r4 + RL v24, r9, r4 + RL v25, r9, r4 + RL v26, r9, r4 + RL v27, r9, r4 + RL v28, r9, r4 + RL v29, r9, r4 + RL v30, r9, r4 + lvx v31, 0, r9 + + Transpose16x16 + + vspltisb v1, 1 + + build_constants r5, r6, r7, v3, v2, v0 + + Abs v4, v5, v19, v18 ;# K(v14) = first |P0-P1| + + Fil v16, v17, v18, v19, v20, v21, v22, v23 + Fil v20, v21, v22, v23, v24, v25, v26, v27 + Fil v24, v25, v26, v27, v28, v29, v30, v31 + + Transpose16x16 + + addi r9, r3, 0 + WL v16, r9, r4 + WL v17, r9, r4 + WL v18, r9, r4 + WL v19, r9, r4 + WL v20, r9, r4 + WL v21, r9, r4 + WL v22, r9, r4 + WL v23, r9, r4 + WL v24, r9, r4 + WL v25, r9, r4 + WL v26, r9, r4 + WL v27, r9, r4 + WL v28, r9, r4 + WL v29, r9, r4 + WL v30, r9, r4 + stvx v31, 0, r9 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + +;# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- UV FILTERING -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +.macro active_chroma_sel V + andi. r7, r3, 8 ;# row origin modulo 16 + add r7, r7, r7 ;# selects selectors + lis r12, _chromaSelectors@ha + la r0, _chromaSelectors@l(r12) + lwzux r0, r7, r0 ;# leave selector addr in r7 + + lvx \V, 0, r0 ;# mask to concatenate active U,V pels +.endm + +.macro hread_uv Dest, U, V, Offs, VMask + lvx \U, \Offs, r3 + lvx \V, \Offs, r4 + vperm \Dest, \U, \V, \VMask ;# Dest = active part of U then V +.endm + +.macro hwrite_uv New, U, V, Offs, Umask, Vmask + vperm \U, \New, \U, \Umask ;# Combine new pels with siblings + vperm \V, \New, \V, \Vmask + stvx \U, \Offs, r3 ;# Write to frame buffer + stvx \V, \Offs, r4 +.endm + +;# Process U,V in parallel. 
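+;#
+;# A hypothetical scalar sketch of the packing idea (illustrative names
+;# only): the active 8 pels of a U row and the active 8 pels of the same
+;# V row are concatenated into one 16-byte lane, so a single 16-wide
+;# filter pass covers both chroma planes; the selectors above decide which
+;# half of each 16-byte row is "active" and scatter the results back.
+;#
+;#   static void pack_uv(const unsigned char *u, const unsigned char *v,
+;#                       unsigned char both[16])
+;#   {
+;#       int i;
+;#       for (i = 0; i < 8; i++) {
+;#           both[i]     = u[i];   /* active half of the U row */
+;#           both[8 + i] = v[i];   /* active half of the V row */
+;#       }
+;#   }
+;#
+;#   static void unpack_uv(const unsigned char both[16],
+;#                         unsigned char *u, unsigned char *v)
+;#   {
+;#       int i;
+;#       for (i = 0; i < 8; i++) {
+;#           u[i] = both[i];
+;#           v[i] = both[8 + i];
+;#       }
+;#   }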
+.macro load_chroma_h + neg r9, r5 ;# r9 = -1 * stride + add r8, r9, r9 ;# r8 = -2 * stride + add r10, r5, r5 ;# r10 = 2 * stride + + active_chroma_sel v12 + + ;# P3, Q3 are read-only; need not save addresses or sibling pels + add r6, r8, r8 ;# r6 = -4 * stride + hread_uv v0, v14, v15, r6, v12 + add r6, r10, r5 ;# r6 = 3 * stride + hread_uv v7, v14, v15, r6, v12 + + ;# Others are read/write; save addresses and sibling pels + + add r6, r8, r9 ;# r6 = -3 * stride + hread_uv v1, v16, v17, r6, v12 + hread_uv v2, v18, v19, r8, v12 + hread_uv v3, v20, v21, r9, v12 + hread_uv v4, v22, v23, 0, v12 + hread_uv v5, v24, v25, r5, v12 + hread_uv v6, v26, v27, r10, v12 +.endm + +.macro uresult_sel V + load_g \V, 4(r7) +.endm + +.macro vresult_sel V + load_g \V, 8(r7) +.endm + +;# always write P1,P0,Q0,Q1 +.macro store_chroma_h + uresult_sel v11 + vresult_sel v12 + hwrite_uv v2, v18, v19, r8, v11, v12 + hwrite_uv v3, v20, v21, r9, v11, v12 + hwrite_uv v4, v22, v23, 0, v11, v12 + hwrite_uv v5, v24, v25, r5, v11, v12 +.endm + + .align 2 +;# r3 unsigned char *u +;# r4 unsigned char *v +;# r5 int p +;# r6 const signed char *flimit +;# r7 const signed char *limit +;# r8 const signed char *thresh +mbloop_filter_horizontal_edge_uv_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xffff + mtspr 256, r12 ;# set VRSAVE + + build_constants r6, r7, r8, v8, v9, v10 + + load_chroma_h + + vp8_mbfilter + + store_chroma_h + + hwrite_uv v1, v16, v17, r6, v11, v12 ;# v1 == P2 + hwrite_uv v6, v26, v27, r10, v11, v12 ;# v6 == Q2 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 2 +;# r3 unsigned char *u +;# r4 unsigned char *v +;# r5 int p +;# r6 const signed char *flimit +;# r7 const signed char *limit +;# r8 const signed char *thresh +loop_filter_horizontal_edge_uv_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xffff + mtspr 256, r12 ;# set VRSAVE + + build_constants r6, r7, r8, v8, v9, v10 + + load_chroma_h + + SBFilter + + store_chroma_h + + mtspr 256, r11 ;# reset old VRSAVE + + blr + +.macro R V, R + lwzux r0, r3, r5 + stw r0, 4(\R) + lwz r0,-4(r3) + stw r0, 0(\R) + lwzux r0, r4, r5 + stw r0,12(\R) + lwz r0,-4(r4) + stw r0, 8(\R) + lvx \V, 0, \R +.endm + + +.macro W V, R + stvx \V, 0, \R + lwz r0,12(\R) + stwux r0, r4, r5 + lwz r0, 8(\R) + stw r0,-4(r4) + lwz r0, 4(\R) + stwux r0, r3, r5 + lwz r0, 0(\R) + stw r0,-4(r3) +.endm + +.macro chroma_vread R + sub r3, r3, r5 ;# back up one line for simplicity + sub r4, r4, r5 + + R v0, \R + R v1, \R + R v2, \R + R v3, \R + R v4, \R + R v5, \R + R v6, \R + R v7, \R + + transpose8x16_fwd +.endm + +.macro chroma_vwrite R + + transpose8x16_inv + + add r3, r3, r5 + add r4, r4, r5 + neg r5, r5 ;# Write rows back in reverse order + + W v17, \R + W v16, \R + W v15, \R + W v14, \R + W v13, \R + W v12, \R + W v11, \R + W v10, \R +.endm + + .align 2 +;# r3 unsigned char *u +;# r4 unsigned char *v +;# r5 int p +;# r6 const signed char *flimit +;# r7 const signed char *limit +;# r8 const signed char *thresh +mbloop_filter_vertical_edge_uv_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xc000 + mtspr 256, r12 ;# set VRSAVE + + la r9, -48(r1) ;# temporary space for reading in vectors + + chroma_vread r9 + + build_constants r6, r7, r8, v8, v9, v10 + + vp8_mbfilter + + chroma_vwrite r9 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 2 +;# r3 unsigned char *u +;# r4 unsigned char *v +;# r5 int p +;# r6 const signed char *flimit +;# r7 const signed char *limit +;# r8 const signed 
char *thresh +loop_filter_vertical_edge_uv_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xc000 + mtspr 256, r12 ;# set VRSAVE + + la r9, -48(r1) ;# temporary space for reading in vectors + + chroma_vread r9 + + build_constants r6, r7, r8, v8, v9, v10 + + SBFilter + + chroma_vwrite r9 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + +;# -=-=-=-=-=-=-=-=-=-=-=-=-=-= SIMPLE LOOP FILTER =-=-=-=-=-=-=-=-=-=-=-=-=-=- + +.macro vp8_simple_filter + Abs v14, v13, v1, v2 ;# M = abs( P0 - Q0) + vcmpgtub v8, v14, v8 ;# v5 = true if _over_ limit + + ;# preserve unsigned v0 and v3 + common_adjust v1, v2, v0, v3, 0 + + vxor v1, v1, v11 + vxor v2, v2, v11 ;# cvt Q0, P0 back to pels +.endm + +.macro simple_vertical + addi r8, 0, 16 + addi r7, r5, 32 + + lvx v0, 0, r5 + lvx v1, r8, r5 + lvx v2, 0, r7 + lvx v3, r8, r7 + + lis r12, _B_hihi@ha + la r0, _B_hihi@l(r12) + lvx v16, 0, r0 + + lis r12, _B_lolo@ha + la r0, _B_lolo@l(r12) + lvx v17, 0, r0 + + Transpose4times4x4 v16, v17 + vp8_simple_filter + + vxor v0, v0, v11 + vxor v3, v3, v11 ;# cvt Q0, P0 back to pels + + Transpose4times4x4 v16, v17 + + stvx v0, 0, r5 + stvx v1, r8, r5 + stvx v2, 0, r7 + stvx v3, r8, r7 +.endm + + .align 2 +;# r3 unsigned char *s +;# r4 int p +;# r5 const signed char *flimit +loop_filter_simple_horizontal_edge_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + mtspr 256, r12 ;# set VRSAVE + + ;# build constants + lvx v8, 0, r5 ;# flimit + + vspltisb v11, 8 + vspltisb v12, 4 + vslb v11, v11, v12 ;# 0x80808080808080808080808080808080 + + neg r5, r4 ;# r5 = -1 * stride + add r6, r5, r5 ;# r6 = -2 * stride + + lvx v0, r6, r3 ;# v0 = P1 = 16 pels two rows above edge + lvx v1, r5, r3 ;# v1 = P0 = 16 pels one row above edge + lvx v2, 0, r3 ;# v2 = Q0 = 16 pels one row below edge + lvx v3, r4, r3 ;# v3 = Q1 = 16 pels two rows below edge + + vp8_simple_filter + + stvx v1, r5, r3 ;# store P0 + stvx v2, 0, r3 ;# store Q0 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + +.macro RLV Offs + stw r0, (\Offs*4)(r5) + lwzux r0, r7, r4 +.endm + +.macro WLV Offs + lwz r0, (\Offs*4)(r5) + stwux r0, r7, r4 +.endm + + .align 2 +;# r3 unsigned char *s +;# r4 int p +;# r5 const signed char *flimit +loop_filter_simple_vertical_edge_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xc000 + mtspr 256, r12 ;# set VRSAVE + + ;# build constants + lvx v8, 0, r5 ;# flimit + + vspltisb v11, 8 + vspltisb v12, 4 + vslb v11, v11, v12 ;# 0x80808080808080808080808080808080 + + la r5, -96(r1) ;# temporary space for reading in vectors + + ;# Store 4 pels at word "Offs" in temp array, then advance r7 + ;# to next row and read another 4 pels from the frame buffer. + + subi r7, r3, 2 ;# r7 -> 2 pels before start + lwzx r0, 0, r7 ;# read first 4 pels + + ;# 16 unaligned word accesses + RLV 0 + RLV 4 + RLV 8 + RLV 12 + RLV 1 + RLV 5 + RLV 9 + RLV 13 + RLV 2 + RLV 6 + RLV 10 + RLV 14 + RLV 3 + RLV 7 + RLV 11 + + stw r0, (15*4)(r5) ;# write last 4 pels + + simple_vertical + + ;# Read temp array, write frame buffer. 
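+;#
+;# A hypothetical C model of the RLV/WLV word ordering (helper names are
+;# illustrative): row n of the frame buffer lands at temp word
+;# (n % 4) * 4 + n / 4, so each 16-byte vector collects one 4-pel word
+;# from four different rows, which is exactly the layout expected by
+;# Transpose4times4x4.  The mapping is its own inverse, so the same
+;# ordering serves the write-back below.
+;#
+;#   #include <string.h>
+;#
+;#   static int temp_word_for_row(int row)       /* row in 0..15 */
+;#   {
+;#       return (row % 4) * 4 + row / 4;         /* self-inverse */
+;#   }
+;#
+;#   static void gather_rows(const unsigned char *s, int pitch,
+;#                           unsigned char temp[64])
+;#   {
+;#       int row;
+;#       for (row = 0; row < 16; row++)          /* 2 pels before the edge */
+;#           memcpy(temp + 4 * temp_word_for_row(row),
+;#                  s - 2 + row * pitch, 4);
+;#   }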
+ subi r7, r3, 2 ;# r7 -> 2 pels before start + lwzx r0, 0, r5 ;# read/write first 4 pels + stwx r0, 0, r7 + + WLV 4 + WLV 8 + WLV 12 + WLV 1 + WLV 5 + WLV 9 + WLV 13 + WLV 2 + WLV 6 + WLV 10 + WLV 14 + WLV 3 + WLV 7 + WLV 11 + WLV 15 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .data + +_chromaSelectors: + .long _B_hihi + .long _B_Ures0 + .long _B_Vres0 + .long 0 + .long _B_lolo + .long _B_Ures8 + .long _B_Vres8 + .long 0 + + .align 4 +_B_Vres8: + .byte 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 + + .align 4 +_B_Ures8: + .byte 16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7 + + .align 4 +_B_lolo: + .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 + + .align 4 +_B_Vres0: + .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 + .align 4 +_B_Ures0: + .byte 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 + + .align 4 +_B_hihi: + .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 diff --git a/vp8/common/ppc/platform_altivec.asm b/vp8/common/ppc/platform_altivec.asm new file mode 100644 index 000000000..f81d86f74 --- /dev/null +++ b/vp8/common/ppc/platform_altivec.asm @@ -0,0 +1,59 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + .globl save_platform_context + .globl restore_platform_context + +.macro W V P + stvx \V, 0, \P + addi \P, \P, 16 +.endm + +.macro R V P + lvx \V, 0, \P + addi \P, \P, 16 +.endm + +;# r3 context_ptr + .align 2 +save_platform_contex: + W v20, r3 + W v21, r3 + W v22, r3 + W v23, r3 + W v24, r3 + W v25, r3 + W v26, r3 + W v27, r3 + W v28, r3 + W v29, r3 + W v30, r3 + W v31, r3 + + blr + +;# r3 context_ptr + .align 2 +restore_platform_context: + R v20, r3 + R v21, r3 + R v22, r3 + R v23, r3 + R v24, r3 + R v25, r3 + R v26, r3 + R v27, r3 + R v28, r3 + R v29, r3 + R v30, r3 + R v31, r3 + + blr diff --git a/vp8/common/ppc/recon_altivec.asm b/vp8/common/ppc/recon_altivec.asm new file mode 100644 index 000000000..dd39e05a8 --- /dev/null +++ b/vp8/common/ppc/recon_altivec.asm @@ -0,0 +1,175 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + .globl recon4b_ppc + .globl recon2b_ppc + .globl recon_b_ppc + +.macro row_of16 Diff Pred Dst Stride + lvx v1, 0, \Pred ;# v1 = pred = p0..p15 + addi \Pred, \Pred, 16 ;# next pred + vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7 + lvx v3, 0, \Diff ;# v3 = d0..d7 + vaddshs v2, v2, v3 ;# v2 = r0..r7 + vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15 + lvx v3, r8, \Diff ;# v3 = d8..d15 + addi \Diff, \Diff, 32 ;# next diff + vaddshs v3, v3, v1 ;# v3 = r8..r15 + vpkshus v2, v2, v3 ;# v2 = 8-bit r0..r15 + stvx v2, 0, \Dst ;# to dst + add \Dst, \Dst, \Stride ;# next dst +.endm + + .text + .align 2 +;# r3 = short *diff_ptr, +;# r4 = unsigned char *pred_ptr, +;# r5 = unsigned char *dst_ptr, +;# r6 = int stride +recon4b_ppc: + mfspr r0, 256 ;# get old VRSAVE + stw r0, -8(r1) ;# save old VRSAVE to stack + oris r0, r0, 0xf000 + mtspr 256,r0 ;# set VRSAVE + + vxor v0, v0, v0 + li r8, 16 + + row_of16 r3, r4, r5, r6 + row_of16 r3, r4, r5, r6 + row_of16 r3, r4, r5, r6 + row_of16 r3, r4, r5, r6 + + lwz r12, -8(r1) ;# restore old VRSAVE from stack + mtspr 256, r12 ;# reset old VRSAVE + + blr + +.macro two_rows_of8 Diff Pred Dst Stride write_first_four_pels + lvx v1, 0, \Pred ;# v1 = pred = p0..p15 + vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7 + lvx v3, 0, \Diff ;# v3 = d0..d7 + vaddshs v2, v2, v3 ;# v2 = r0..r7 + vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15 + lvx v3, r8, \Diff ;# v2 = d8..d15 + vaddshs v3, v3, v1 ;# v3 = r8..r15 + vpkshus v2, v2, v3 ;# v3 = 8-bit r0..r15 + stvx v2, 0, r10 ;# 2 rows to dst from buf + lwz r0, 0(r10) +.if \write_first_four_pels + stw r0, 0(\Dst) + .else + stwux r0, \Dst, \Stride +.endif + lwz r0, 4(r10) + stw r0, 4(\Dst) + lwz r0, 8(r10) + stwux r0, \Dst, \Stride ;# advance dst to next row + lwz r0, 12(r10) + stw r0, 4(\Dst) +.endm + + .align 2 +;# r3 = short *diff_ptr, +;# r4 = unsigned char *pred_ptr, +;# r5 = unsigned char *dst_ptr, +;# r6 = int stride + +recon2b_ppc: + mfspr r0, 256 ;# get old VRSAVE + stw r0, -8(r1) ;# save old VRSAVE to stack + oris r0, r0, 0xf000 + mtspr 256,r0 ;# set VRSAVE + + vxor v0, v0, v0 + li r8, 16 + + la r10, -48(r1) ;# buf + + two_rows_of8 r3, r4, r5, r6, 1 + + addi r4, r4, 16; ;# next pred + addi r3, r3, 32; ;# next diff + + two_rows_of8 r3, r4, r5, r6, 0 + + lwz r12, -8(r1) ;# restore old VRSAVE from stack + mtspr 256, r12 ;# reset old VRSAVE + + blr + +.macro get_two_diff_rows + stw r0, 0(r10) + lwz r0, 4(r3) + stw r0, 4(r10) + lwzu r0, 32(r3) + stw r0, 8(r10) + lwz r0, 4(r3) + stw r0, 12(r10) + lvx v3, 0, r10 +.endm + + .align 2 +;# r3 = short *diff_ptr, +;# r4 = unsigned char *pred_ptr, +;# r5 = unsigned char *dst_ptr, +;# r6 = int stride +recon_b_ppc: + mfspr r0, 256 ;# get old VRSAVE + stw r0, -8(r1) ;# save old VRSAVE to stack + oris r0, r0, 0xf000 + mtspr 256,r0 ;# set VRSAVE + + vxor v0, v0, v0 + + la r10, -48(r1) ;# buf + + lwz r0, 0(r4) + stw r0, 0(r10) + lwz r0, 16(r4) + stw r0, 4(r10) + lwz r0, 32(r4) + stw r0, 8(r10) + lwz r0, 48(r4) + stw r0, 12(r10) + + lvx v1, 0, r10; ;# v1 = pred = p0..p15 + + lwz r0, 0(r3) ;# v3 = d0..d7 + + get_two_diff_rows + + vmrghb v2, v0, v1; ;# v2 = 16-bit p0..p7 + vaddshs v2, v2, v3; ;# v2 = r0..r7 + + lwzu r0, 32(r3) ;# v3 = d8..d15 + + get_two_diff_rows + + vmrglb v1, v0, v1; ;# v1 = 16-bit p8..p15 + vaddshs v3, v3, v1; ;# v3 = r8..r15 + + vpkshus v2, v2, v3; ;# v2 = 8-bit r0..r15 + stvx v2, 0, r10; ;# 16 pels to dst from buf + + lwz r0, 0(r10) + stw r0, 0(r5) + lwz r0, 4(r10) + stwux r0, r5, r6 + lwz r0, 8(r10) + stwux r0, r5, r6 + lwz r0, 12(r10) + stwx r0, r5, r6 + + lwz r12, -8(r1) ;# restore old VRSAVE 
from stack + mtspr 256, r12 ;# reset old VRSAVE + + blr diff --git a/vp8/common/ppc/sad_altivec.asm b/vp8/common/ppc/sad_altivec.asm new file mode 100644 index 000000000..e5f26380f --- /dev/null +++ b/vp8/common/ppc/sad_altivec.asm @@ -0,0 +1,277 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + .globl vp8_sad16x16_ppc + .globl vp8_sad16x8_ppc + .globl vp8_sad8x16_ppc + .globl vp8_sad8x8_ppc + .globl vp8_sad4x4_ppc + +.macro load_aligned_16 V R O + lvsl v3, 0, \R ;# permutate value for alignment + + lvx v1, 0, \R + lvx v2, \O, \R + + vperm \V, v1, v2, v3 +.endm + +.macro prologue + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffc0 + mtspr 256, r12 ;# set VRSAVE + + stwu r1, -32(r1) ;# create space on the stack + + li r10, 16 ;# load offset and loop counter + + vspltisw v8, 0 ;# zero out total to start +.endm + +.macro epilogue + addi r1, r1, 32 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE +.endm + +.macro SAD_16 + ;# v6 = abs (v4 - v5) + vsububs v6, v4, v5 + vsububs v7, v5, v4 + vor v6, v6, v7 + + ;# v8 += abs (v4 - v5) + vsum4ubs v8, v6, v8 +.endm + +.macro sad_16_loop loop_label + lvsl v3, 0, r5 ;# only needs to be done once per block + + ;# preload a line of data before getting into the loop + lvx v4, 0, r3 + lvx v1, 0, r5 + lvx v2, r10, r5 + + add r5, r5, r6 + add r3, r3, r4 + + vperm v5, v1, v2, v3 + + .align 4 +\loop_label: + ;# compute difference on first row + vsububs v6, v4, v5 + vsububs v7, v5, v4 + + ;# load up next set of data + lvx v9, 0, r3 + lvx v1, 0, r5 + lvx v2, r10, r5 + + ;# perform abs() of difference + vor v6, v6, v7 + add r3, r3, r4 + + ;# add to the running tally + vsum4ubs v8, v6, v8 + + ;# now onto the next line + vperm v5, v1, v2, v3 + add r5, r5, r6 + lvx v4, 0, r3 + + ;# compute difference on second row + vsububs v6, v9, v5 + lvx v1, 0, r5 + vsububs v7, v5, v9 + lvx v2, r10, r5 + vor v6, v6, v7 + add r3, r3, r4 + vsum4ubs v8, v6, v8 + vperm v5, v1, v2, v3 + add r5, r5, r6 + + bdnz \loop_label + + vspltisw v7, 0 + + vsumsws v8, v8, v7 + + stvx v8, 0, r1 + lwz r3, 12(r1) +.endm + +.macro sad_8_loop loop_label + .align 4 +\loop_label: + ;# only one of the inputs should need to be aligned. + load_aligned_16 v4, r3, r10 + load_aligned_16 v5, r5, r10 + + ;# move onto the next line + add r3, r3, r4 + add r5, r5, r6 + + ;# only one of the inputs should need to be aligned. 
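+;#
+;# (For reference, a hypothetical scalar version of what SAD_16 and these
+;# loops accumulate per row; the vector code does the same 16 bytes at a
+;# time with vsububs/vor/vsum4ubs:)
+;#
+;#   static unsigned int sad_row(const unsigned char *src,
+;#                               const unsigned char *ref, int width)
+;#   {
+;#       unsigned int sad = 0;
+;#       int i;
+;#       for (i = 0; i < width; i++)
+;#           sad += src[i] > ref[i] ? src[i] - ref[i] : ref[i] - src[i];
+;#       return sad;
+;#   }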
+ load_aligned_16 v6, r3, r10 + load_aligned_16 v7, r5, r10 + + ;# move onto the next line + add r3, r3, r4 + add r5, r5, r6 + + vmrghb v4, v4, v6 + vmrghb v5, v5, v7 + + SAD_16 + + bdnz \loop_label + + vspltisw v7, 0 + + vsumsws v8, v8, v7 + + stvx v8, 0, r1 + lwz r3, 12(r1) +.endm + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int src_stride +;# r5 unsigned char *ref_ptr +;# r6 int ref_stride +;# +;# r3 return value +vp8_sad16x16_ppc: + + prologue + + li r9, 8 + mtctr r9 + + sad_16_loop sad16x16_loop + + epilogue + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int src_stride +;# r5 unsigned char *ref_ptr +;# r6 int ref_stride +;# +;# r3 return value +vp8_sad16x8_ppc: + + prologue + + li r9, 4 + mtctr r9 + + sad_16_loop sad16x8_loop + + epilogue + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int src_stride +;# r5 unsigned char *ref_ptr +;# r6 int ref_stride +;# +;# r3 return value +vp8_sad8x16_ppc: + + prologue + + li r9, 8 + mtctr r9 + + sad_8_loop sad8x16_loop + + epilogue + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int src_stride +;# r5 unsigned char *ref_ptr +;# r6 int ref_stride +;# +;# r3 return value +vp8_sad8x8_ppc: + + prologue + + li r9, 4 + mtctr r9 + + sad_8_loop sad8x8_loop + + epilogue + + blr + +.macro transfer_4x4 I P + lwz r0, 0(\I) + add \I, \I, \P + + lwz r7, 0(\I) + add \I, \I, \P + + lwz r8, 0(\I) + add \I, \I, \P + + lwz r9, 0(\I) + + stw r0, 0(r1) + stw r7, 4(r1) + stw r8, 8(r1) + stw r9, 12(r1) +.endm + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int src_stride +;# r5 unsigned char *ref_ptr +;# r6 int ref_stride +;# +;# r3 return value +vp8_sad4x4_ppc: + + prologue + + transfer_4x4 r3, r4 + lvx v4, 0, r1 + + transfer_4x4 r5, r6 + lvx v5, 0, r1 + + vspltisw v8, 0 ;# zero out total to start + + ;# v6 = abs (v4 - v5) + vsububs v6, v4, v5 + vsububs v7, v5, v4 + vor v6, v6, v7 + + ;# v8 += abs (v4 - v5) + vsum4ubs v7, v6, v8 + vsumsws v7, v7, v8 + + stvx v7, 0, r1 + lwz r3, 12(r1) + + epilogue + + blr diff --git a/vp8/common/ppc/systemdependent.c b/vp8/common/ppc/systemdependent.c new file mode 100644 index 000000000..87f4cac72 --- /dev/null +++ b/vp8/common/ppc/systemdependent.c @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#include "subpixel.h" +#include "loopfilter.h" +#include "recon.h" +#include "idct.h" +#include "onyxc_int.h" + +void (*vp8_short_idct4x4)(short *input, short *output, int pitch); +void (*vp8_short_idct4x4_1)(short *input, short *output, int pitch); +void (*vp8_dc_only_idct)(short input_dc, short *output, int pitch); + +extern void (*vp8_post_proc_down_and_across_mb_row)( + unsigned char *src_ptr, + unsigned char *dst_ptr, + int src_pixels_per_line, + int dst_pixels_per_line, + int cols, + unsigned char *f, + int size +); + +extern void (*vp8_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols, int flimit); +extern void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit); +extern void (*vp8_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols, int flimit); +extern void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit); + +extern void vp8_post_proc_down_and_across_mb_row_c +( + unsigned char *src_ptr, + unsigned char *dst_ptr, + int src_pixels_per_line, + int dst_pixels_per_line, + int cols, + unsigned char *f, + int size +); +void vp8_plane_add_noise_c(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a); + +extern copy_mem_block_function *vp8_copy_mem16x16; +extern copy_mem_block_function *vp8_copy_mem8x8; +extern copy_mem_block_function *vp8_copy_mem8x4; + +// PPC +extern subpixel_predict_function sixtap_predict_ppc; +extern subpixel_predict_function sixtap_predict8x4_ppc; +extern subpixel_predict_function sixtap_predict8x8_ppc; +extern subpixel_predict_function sixtap_predict16x16_ppc; +extern subpixel_predict_function bilinear_predict4x4_ppc; +extern subpixel_predict_function bilinear_predict8x4_ppc; +extern subpixel_predict_function bilinear_predict8x8_ppc; +extern subpixel_predict_function bilinear_predict16x16_ppc; + +extern copy_mem_block_function copy_mem16x16_ppc; + +void recon_b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); +void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); +void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); + +extern void short_idct4x4llm_ppc(short *input, short *output, int pitch); + +// Generic C +extern subpixel_predict_function vp8_sixtap_predict_c; +extern subpixel_predict_function vp8_sixtap_predict8x4_c; +extern subpixel_predict_function vp8_sixtap_predict8x8_c; +extern subpixel_predict_function vp8_sixtap_predict16x16_c; +extern subpixel_predict_function vp8_bilinear_predict4x4_c; +extern subpixel_predict_function vp8_bilinear_predict8x4_c; +extern subpixel_predict_function vp8_bilinear_predict8x8_c; +extern subpixel_predict_function vp8_bilinear_predict16x16_c; + +extern copy_mem_block_function vp8_copy_mem16x16_c; +extern copy_mem_block_function vp8_copy_mem8x8_c; +extern copy_mem_block_function vp8_copy_mem8x4_c; + +void vp8_recon_b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); +void vp8_recon2b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); +void vp8_recon4b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); + +extern void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch); +extern void vp8_short_idct4x4llm_c(short *input, short *output, int pitch); +extern void vp8_dc_only_idct_c(short input_dc, short *output, int pitch); + +// PPC +extern 
loop_filter_block_function loop_filter_mbv_ppc; +extern loop_filter_block_function loop_filter_bv_ppc; +extern loop_filter_block_function loop_filter_mbh_ppc; +extern loop_filter_block_function loop_filter_bh_ppc; + +extern loop_filter_block_function loop_filter_mbvs_ppc; +extern loop_filter_block_function loop_filter_bvs_ppc; +extern loop_filter_block_function loop_filter_mbhs_ppc; +extern loop_filter_block_function loop_filter_bhs_ppc; + +// Generic C +extern loop_filter_block_function vp8_loop_filter_mbv_c; +extern loop_filter_block_function vp8_loop_filter_bv_c; +extern loop_filter_block_function vp8_loop_filter_mbh_c; +extern loop_filter_block_function vp8_loop_filter_bh_c; + +extern loop_filter_block_function vp8_loop_filter_mbvs_c; +extern loop_filter_block_function vp8_loop_filter_bvs_c; +extern loop_filter_block_function vp8_loop_filter_mbhs_c; +extern loop_filter_block_function vp8_loop_filter_bhs_c; + +extern loop_filter_block_function *vp8_lf_mbvfull; +extern loop_filter_block_function *vp8_lf_mbhfull; +extern loop_filter_block_function *vp8_lf_bvfull; +extern loop_filter_block_function *vp8_lf_bhfull; + +extern loop_filter_block_function *vp8_lf_mbvsimple; +extern loop_filter_block_function *vp8_lf_mbhsimple; +extern loop_filter_block_function *vp8_lf_bvsimple; +extern loop_filter_block_function *vp8_lf_bhsimple; + +void vp8_clear_c(void) +{ +} + +void vp8_machine_specific_config(void) +{ + // Pure C: + vp8_clear_system_state = vp8_clear_c; + vp8_recon_b = vp8_recon_b_c; + vp8_recon4b = vp8_recon4b_c; + vp8_recon2b = vp8_recon2b_c; + + vp8_bilinear_predict16x16 = bilinear_predict16x16_ppc; + vp8_bilinear_predict8x8 = bilinear_predict8x8_ppc; + vp8_bilinear_predict8x4 = bilinear_predict8x4_ppc; + vp8_bilinear_predict = bilinear_predict4x4_ppc; + + vp8_sixtap_predict16x16 = sixtap_predict16x16_ppc; + vp8_sixtap_predict8x8 = sixtap_predict8x8_ppc; + vp8_sixtap_predict8x4 = sixtap_predict8x4_ppc; + vp8_sixtap_predict = sixtap_predict_ppc; + + vp8_short_idct4x4_1 = vp8_short_idct4x4llm_1_c; + vp8_short_idct4x4 = short_idct4x4llm_ppc; + vp8_dc_only_idct = vp8_dc_only_idct_c; + + vp8_lf_mbvfull = loop_filter_mbv_ppc; + vp8_lf_bvfull = loop_filter_bv_ppc; + vp8_lf_mbhfull = loop_filter_mbh_ppc; + vp8_lf_bhfull = loop_filter_bh_ppc; + + vp8_lf_mbvsimple = loop_filter_mbvs_ppc; + vp8_lf_bvsimple = loop_filter_bvs_ppc; + vp8_lf_mbhsimple = loop_filter_mbhs_ppc; + vp8_lf_bhsimple = loop_filter_bhs_ppc; + + vp8_post_proc_down_and_across_mb_row = vp8_post_proc_down_and_across_mb_row_c; + vp8_mbpost_proc_down = vp8_mbpost_proc_down_c; + vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_c; + vp8_plane_add_noise = vp8_plane_add_noise_c; + + vp8_copy_mem16x16 = copy_mem16x16_ppc; + vp8_copy_mem8x8 = vp8_copy_mem8x8_c; + vp8_copy_mem8x4 = vp8_copy_mem8x4_c; + +} diff --git a/vp8/common/ppc/variance_altivec.asm b/vp8/common/ppc/variance_altivec.asm new file mode 100644 index 000000000..fb8d5bb1d --- /dev/null +++ b/vp8/common/ppc/variance_altivec.asm @@ -0,0 +1,375 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + .globl vp8_get8x8var_ppc + .globl vp8_get16x16var_ppc + .globl vp8_mse16x16_ppc + .globl vp8_variance16x16_ppc + .globl vp8_variance16x8_ppc + .globl vp8_variance8x16_ppc + .globl vp8_variance8x8_ppc + .globl vp8_variance4x4_ppc + +.macro load_aligned_16 V R O + lvsl v3, 0, \R ;# permutate value for alignment + + lvx v1, 0, \R + lvx v2, \O, \R + + vperm \V, v1, v2, v3 +.endm + +.macro prologue + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffc0 + mtspr 256, r12 ;# set VRSAVE + + stwu r1, -32(r1) ;# create space on the stack + + li r10, 16 ;# load offset and loop counter + + vspltisw v7, 0 ;# zero for merging + vspltisw v8, 0 ;# zero out total to start + vspltisw v9, 0 ;# zero out total for dif^2 +.endm + +.macro epilogue + addi r1, r1, 32 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE +.endm + +.macro compute_sum_sse + ;# Compute sum first. Unpack to so signed subract + ;# can be used. Only have a half word signed + ;# subract. Do high, then low. + vmrghb v2, v7, v4 + vmrghb v3, v7, v5 + vsubshs v2, v2, v3 + vsum4shs v8, v2, v8 + + vmrglb v2, v7, v4 + vmrglb v3, v7, v5 + vsubshs v2, v2, v3 + vsum4shs v8, v2, v8 + + ;# Now compute sse. + vsububs v2, v4, v5 + vsububs v3, v5, v4 + vor v2, v2, v3 + + vmsumubm v9, v2, v2, v9 +.endm + +.macro variance_16 DS loop_label store_sum +\loop_label: + ;# only one of the inputs should need to be aligned. + load_aligned_16 v4, r3, r10 + load_aligned_16 v5, r5, r10 + + ;# move onto the next line + add r3, r3, r4 + add r5, r5, r6 + + compute_sum_sse + + bdnz \loop_label + + vsumsws v8, v8, v7 + vsumsws v9, v9, v7 + + stvx v8, 0, r1 + lwz r3, 12(r1) + + stvx v9, 0, r1 + lwz r4, 12(r1) + +.if \store_sum + stw r3, 0(r8) ;# sum +.endif + stw r4, 0(r7) ;# sse + + mullw r3, r3, r3 ;# sum*sum + srlwi r3, r3, \DS ;# (sum*sum) >> DS + subf r3, r3, r4 ;# sse - ((sum*sum) >> DS) +.endm + +.macro variance_8 DS loop_label store_sum +\loop_label: + ;# only one of the inputs should need to be aligned. + load_aligned_16 v4, r3, r10 + load_aligned_16 v5, r5, r10 + + ;# move onto the next line + add r3, r3, r4 + add r5, r5, r6 + + ;# only one of the inputs should need to be aligned. 
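+;#
+;# (For reference, a hypothetical scalar model of what the variance_8 and
+;# variance_16 macros compute; DS is log2 of the pel count -- 4 for 4x4,
+;# 6 for 8x8, 7 for 8x16/16x8 and 8 for 16x16:)
+;#
+;#   static unsigned int variance_ref(const unsigned char *src, int sstride,
+;#                                    const unsigned char *ref, int rstride,
+;#                                    int w, int h, int DS,
+;#                                    unsigned int *sse_out)
+;#   {
+;#       int sum = 0;
+;#       unsigned int sse = 0;
+;#       int x, y;
+;#       for (y = 0; y < h; y++)
+;#           for (x = 0; x < w; x++) {
+;#               int d = src[y * sstride + x] - ref[y * rstride + x];
+;#               sum += d;
+;#               sse += (unsigned int)(d * d);
+;#           }
+;#       *sse_out = sse;
+;#       /* low 32 bits of sum*sum, logically shifted, as mullw/srlwi do */
+;#       return sse - (((unsigned int)sum * (unsigned int)sum) >> DS);
+;#   }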
+ load_aligned_16 v6, r3, r10 + load_aligned_16 v0, r5, r10 + + ;# move onto the next line + add r3, r3, r4 + add r5, r5, r6 + + vmrghb v4, v4, v6 + vmrghb v5, v5, v0 + + compute_sum_sse + + bdnz \loop_label + + vsumsws v8, v8, v7 + vsumsws v9, v9, v7 + + stvx v8, 0, r1 + lwz r3, 12(r1) + + stvx v9, 0, r1 + lwz r4, 12(r1) + +.if \store_sum + stw r3, 0(r8) ;# sum +.endif + stw r4, 0(r7) ;# sse + + mullw r3, r3, r3 ;# sum*sum + srlwi r3, r3, \DS ;# (sum*sum) >> 8 + subf r3, r3, r4 ;# sse - ((sum*sum) >> 8) +.endm + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int source_stride +;# r5 unsigned char *ref_ptr +;# r6 int recon_stride +;# r7 unsigned int *SSE +;# r8 int *Sum +;# +;# r3 return value +vp8_get8x8var_ppc: + + prologue + + li r9, 4 + mtctr r9 + + variance_8 6, get8x8var_loop, 1 + + epilogue + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int source_stride +;# r5 unsigned char *ref_ptr +;# r6 int recon_stride +;# r7 unsigned int *SSE +;# r8 int *Sum +;# +;# r3 return value +vp8_get16x16var_ppc: + + prologue + + mtctr r10 + + variance_16 8, get16x16var_loop, 1 + + epilogue + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int source_stride +;# r5 unsigned char *ref_ptr +;# r6 int recon_stride +;# r7 unsigned int *sse +;# +;# r 3 return value +vp8_mse16x16_ppc: + prologue + + mtctr r10 + +mse16x16_loop: + ;# only one of the inputs should need to be aligned. + load_aligned_16 v4, r3, r10 + load_aligned_16 v5, r5, r10 + + ;# move onto the next line + add r3, r3, r4 + add r5, r5, r6 + + ;# Now compute sse. + vsububs v2, v4, v5 + vsububs v3, v5, v4 + vor v2, v2, v3 + + vmsumubm v9, v2, v2, v9 + + bdnz mse16x16_loop + + vsumsws v9, v9, v7 + + stvx v9, 0, r1 + lwz r3, 12(r1) + + stvx v9, 0, r1 + lwz r3, 12(r1) + + stw r3, 0(r7) ;# sse + + epilogue + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int source_stride +;# r5 unsigned char *ref_ptr +;# r6 int recon_stride +;# r7 unsigned int *sse +;# +;# r3 return value +vp8_variance16x16_ppc: + + prologue + + mtctr r10 + + variance_16 8, variance16x16_loop, 0 + + epilogue + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int source_stride +;# r5 unsigned char *ref_ptr +;# r6 int recon_stride +;# r7 unsigned int *sse +;# +;# r3 return value +vp8_variance16x8_ppc: + + prologue + + li r9, 8 + mtctr r9 + + variance_16 7, variance16x8_loop, 0 + + epilogue + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int source_stride +;# r5 unsigned char *ref_ptr +;# r6 int recon_stride +;# r7 unsigned int *sse +;# +;# r3 return value +vp8_variance8x16_ppc: + + prologue + + li r9, 8 + mtctr r9 + + variance_8 7, variance8x16_loop, 0 + + epilogue + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int source_stride +;# r5 unsigned char *ref_ptr +;# r6 int recon_stride +;# r7 unsigned int *sse +;# +;# r3 return value +vp8_variance8x8_ppc: + + prologue + + li r9, 4 + mtctr r9 + + variance_8 6, variance8x8_loop, 0 + + epilogue + + blr + +.macro transfer_4x4 I P + lwz r0, 0(\I) + add \I, \I, \P + + lwz r10,0(\I) + add \I, \I, \P + + lwz r8, 0(\I) + add \I, \I, \P + + lwz r9, 0(\I) + + stw r0, 0(r1) + stw r10, 4(r1) + stw r8, 8(r1) + stw r9, 12(r1) +.endm + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int source_stride +;# r5 unsigned char *ref_ptr +;# r6 int recon_stride +;# r7 unsigned int *sse +;# +;# r3 return value +vp8_variance4x4_ppc: + + prologue + + transfer_4x4 r3, r4 + lvx v4, 0, r1 + + transfer_4x4 r5, r6 + lvx v5, 0, r1 + + compute_sum_sse + + vsumsws v8, v8, v7 + vsumsws v9, v9, v7 + + stvx v8, 0, r1 + 
lwz r3, 12(r1) + + stvx v9, 0, r1 + lwz r4, 12(r1) + + stw r4, 0(r7) ;# sse + + mullw r3, r3, r3 ;# sum*sum + srlwi r3, r3, 4 ;# (sum*sum) >> 4 + subf r3, r3, r4 ;# sse - ((sum*sum) >> 4) + + epilogue + + blr diff --git a/vp8/common/ppc/variance_subpixel_altivec.asm b/vp8/common/ppc/variance_subpixel_altivec.asm new file mode 100644 index 000000000..2308373a1 --- /dev/null +++ b/vp8/common/ppc/variance_subpixel_altivec.asm @@ -0,0 +1,865 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + .globl vp8_sub_pixel_variance4x4_ppc + .globl vp8_sub_pixel_variance8x8_ppc + .globl vp8_sub_pixel_variance8x16_ppc + .globl vp8_sub_pixel_variance16x8_ppc + .globl vp8_sub_pixel_variance16x16_ppc + +.macro load_c V, LABEL, OFF, R0, R1 + lis \R0, \LABEL@ha + la \R1, \LABEL@l(\R0) + lvx \V, \OFF, \R1 +.endm + +.macro load_vfilter V0, V1 + load_c \V0, vfilter_b, r6, r12, r10 + + addi r6, r6, 16 + lvx \V1, r6, r10 +.endm + +.macro HProlog jump_label + ;# load up horizontal filter + slwi. r5, r5, 4 ;# index into horizontal filter array + + ;# index to the next set of vectors in the row. + li r10, 16 + + ;# downshift by 7 ( divide by 128 ) at the end + vspltish v19, 7 + + ;# If there isn't any filtering to be done for the horizontal, then + ;# just skip to the second pass. + beq \jump_label + + load_c v20, hfilter_b, r5, r12, r0 + + ;# setup constants + ;# v14 permutation value for alignment + load_c v28, b_hperm_b, 0, r12, r0 + + ;# index to the next set of vectors in the row. + li r12, 32 + + ;# rounding added in on the multiply + vspltisw v21, 8 + vspltisw v18, 3 + vslw v18, v21, v18 ;# 0x00000040000000400000004000000040 + + slwi. r6, r6, 5 ;# index into vertical filter array +.endm + +;# Filters a horizontal line +;# expects: +;# r3 src_ptr +;# r4 pitch +;# r10 16 +;# r12 32 +;# v17 perm intput +;# v18 rounding +;# v19 shift +;# v20 filter taps +;# v21 tmp +;# v22 tmp +;# v23 tmp +;# v24 tmp +;# v25 tmp +;# v26 tmp +;# v27 tmp +;# v28 perm output +;# + +.macro hfilter_8 V, hp, lp, increment_counter + lvsl v17, 0, r3 ;# permutate value for alignment + + ;# input to filter is 9 bytes wide, output is 8 bytes. + lvx v21, 0, r3 + lvx v22, r10, r3 + +.if \increment_counter + add r3, r3, r4 +.endif + vperm v21, v21, v22, v17 + + vperm v24, v21, v21, \hp ;# v20 = 0123 1234 2345 3456 + vperm v25, v21, v21, \lp ;# v21 = 4567 5678 6789 789A + + vmsummbm v24, v20, v24, v18 + vmsummbm v25, v20, v25, v18 + + vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit) + + vsrh v24, v24, v19 ;# divide v0, v1 by 128 + + vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result +.endm + +.macro vfilter_16 P0 P1 + vmuleub v22, \P0, v20 ;# 64 + 4 positive taps + vadduhm v22, v18, v22 + vmuloub v23, \P0, v20 + vadduhm v23, v18, v23 + + vmuleub v24, \P1, v21 + vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary + vmuloub v25, \P1, v21 + vadduhm v23, v23, v25 ;# Ro = odds + + vsrh v22, v22, v19 ;# divide by 128 + vsrh v23, v23, v19 ;# v16 v17 = evens, odds + vmrghh \P0, v22, v23 ;# v18 v19 = 16-bit result in order + vmrglh v23, v22, v23 + vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result +.endm + +.macro compute_sum_sse src, ref, sum, sse, t1, t2, z0 + ;# Compute sum first. 
Unpack to so signed subract + ;# can be used. Only have a half word signed + ;# subract. Do high, then low. + vmrghb \t1, \z0, \src + vmrghb \t2, \z0, \ref + vsubshs \t1, \t1, \t2 + vsum4shs \sum, \t1, \sum + + vmrglb \t1, \z0, \src + vmrglb \t2, \z0, \ref + vsubshs \t1, \t1, \t2 + vsum4shs \sum, \t1, \sum + + ;# Now compute sse. + vsububs \t1, \src, \ref + vsububs \t2, \ref, \src + vor \t1, \t1, \t2 + + vmsumubm \sse, \t1, \t1, \sse +.endm + +.macro variance_final sum, sse, z0, DS + vsumsws \sum, \sum, \z0 + vsumsws \sse, \sse, \z0 + + stvx \sum, 0, r1 + lwz r3, 12(r1) + + stvx \sse, 0, r1 + lwz r4, 12(r1) + + stw r4, 0(r9) ;# sse + + mullw r3, r3, r3 ;# sum*sum + srlwi r3, r3, \DS ;# (sum*sum) >> 8 + subf r3, r3, r4 ;# sse - ((sum*sum) >> 8) +.endm + +.macro compute_sum_sse_16 V, increment_counter + load_and_align_16 v16, r7, r8, \increment_counter + compute_sum_sse \V, v16, v18, v19, v20, v21, v23 +.endm + +.macro load_and_align_16 V, R, P, increment_counter + lvsl v17, 0, \R ;# permutate value for alignment + + ;# input to filter is 21 bytes wide, output is 16 bytes. + ;# input will can span three vectors if not aligned correctly. + lvx v21, 0, \R + lvx v22, r10, \R + +.if \increment_counter + add \R, \R, \P +.endif + + vperm \V, v21, v22, v17 +.endm + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int src_pixels_per_line +;# r5 int xoffset +;# r6 int yoffset +;# r7 unsigned char *dst_ptr +;# r8 int dst_pixels_per_line +;# r9 unsigned int *sse +;# +;# r3 return value +vp8_sub_pixel_variance4x4_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xf830 + ori r12, r12, 0xfff8 + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + HProlog second_pass_4x4_pre_copy_b + + ;# Load up permutation constants + load_c v10, b_0123_b, 0, r12, r0 + load_c v11, b_4567_b, 0, r12, r0 + + hfilter_8 v0, v10, v11, 1 + hfilter_8 v1, v10, v11, 1 + hfilter_8 v2, v10, v11, 1 + hfilter_8 v3, v10, v11, 1 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional line that is needed + ;# for the vertical filter. 
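+;#
+;# (Both passes are simple bilinear taps.  A hypothetical scalar model of
+;# one tap, matching the hfilter_b/vfilter_b tables at the end of this
+;# file -- taps (128 - 16*offset, 16*offset), rounded and shifted down
+;# by 7:)
+;#
+;#   static unsigned char bilinear_tap(unsigned char a, unsigned char b,
+;#                                     int offset)    /* offset in 0..7 */
+;#   {
+;#       int t0 = 128 - 16 * offset;
+;#       int t1 = 16 * offset;
+;#       return (unsigned char)((t0 * a + t1 * b + 64) >> 7);
+;#   }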
+ beq compute_sum_sse_4x4_b + + hfilter_8 v4, v10, v11, 0 + + b second_pass_4x4_b + +second_pass_4x4_pre_copy_b: + slwi r6, r6, 5 ;# index into vertical filter array + + load_and_align_16 v0, r3, r4, 1 + load_and_align_16 v1, r3, r4, 1 + load_and_align_16 v2, r3, r4, 1 + load_and_align_16 v3, r3, r4, 1 + load_and_align_16 v4, r3, r4, 0 + +second_pass_4x4_b: + vspltish v20, 8 + vspltish v18, 3 + vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + load_vfilter v20, v21 + + vfilter_16 v0, v1 + vfilter_16 v1, v2 + vfilter_16 v2, v3 + vfilter_16 v3, v4 + +compute_sum_sse_4x4_b: + vspltish v18, 0 ;# sum + vspltish v19, 0 ;# sse + vspltish v23, 0 ;# unpack + li r10, 16 + + load_and_align_16 v4, r7, r8, 1 + load_and_align_16 v5, r7, r8, 1 + load_and_align_16 v6, r7, r8, 1 + load_and_align_16 v7, r7, r8, 1 + + vmrghb v0, v0, v1 + vmrghb v1, v2, v3 + + vmrghb v2, v4, v5 + vmrghb v3, v6, v7 + + load_c v10, b_hilo_b, 0, r12, r0 + + vperm v0, v0, v1, v10 + vperm v1, v2, v3, v10 + + compute_sum_sse v0, v1, v18, v19, v20, v21, v23 + + variance_final v18, v19, v23, 4 + + addi r1, r1, 32 ;# recover stack + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int src_pixels_per_line +;# r5 int xoffset +;# r6 int yoffset +;# r7 unsigned char *dst_ptr +;# r8 int dst_pixels_per_line +;# r9 unsigned int *sse +;# +;# r3 return value +vp8_sub_pixel_variance8x8_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xfff0 + ori r12, r12, 0xffff + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + HProlog second_pass_8x8_pre_copy_b + + ;# Load up permutation constants + load_c v10, b_0123_b, 0, r12, r0 + load_c v11, b_4567_b, 0, r12, r0 + + hfilter_8 v0, v10, v11, 1 + hfilter_8 v1, v10, v11, 1 + hfilter_8 v2, v10, v11, 1 + hfilter_8 v3, v10, v11, 1 + hfilter_8 v4, v10, v11, 1 + hfilter_8 v5, v10, v11, 1 + hfilter_8 v6, v10, v11, 1 + hfilter_8 v7, v10, v11, 1 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional line that is needed + ;# for the vertical filter. + beq compute_sum_sse_8x8_b + + hfilter_8 v8, v10, v11, 0 + + b second_pass_8x8_b + +second_pass_8x8_pre_copy_b: + slwi. 
r6, r6, 5 ;# index into vertical filter array + + load_and_align_16 v0, r3, r4, 1 + load_and_align_16 v1, r3, r4, 1 + load_and_align_16 v2, r3, r4, 1 + load_and_align_16 v3, r3, r4, 1 + load_and_align_16 v4, r3, r4, 1 + load_and_align_16 v5, r3, r4, 1 + load_and_align_16 v6, r3, r4, 1 + load_and_align_16 v7, r3, r4, 1 + load_and_align_16 v8, r3, r4, 0 + + beq compute_sum_sse_8x8_b + +second_pass_8x8_b: + vspltish v20, 8 + vspltish v18, 3 + vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + load_vfilter v20, v21 + + vfilter_16 v0, v1 + vfilter_16 v1, v2 + vfilter_16 v2, v3 + vfilter_16 v3, v4 + vfilter_16 v4, v5 + vfilter_16 v5, v6 + vfilter_16 v6, v7 + vfilter_16 v7, v8 + +compute_sum_sse_8x8_b: + vspltish v18, 0 ;# sum + vspltish v19, 0 ;# sse + vspltish v23, 0 ;# unpack + li r10, 16 + + vmrghb v0, v0, v1 + vmrghb v1, v2, v3 + vmrghb v2, v4, v5 + vmrghb v3, v6, v7 + + load_and_align_16 v4, r7, r8, 1 + load_and_align_16 v5, r7, r8, 1 + load_and_align_16 v6, r7, r8, 1 + load_and_align_16 v7, r7, r8, 1 + load_and_align_16 v8, r7, r8, 1 + load_and_align_16 v9, r7, r8, 1 + load_and_align_16 v10, r7, r8, 1 + load_and_align_16 v11, r7, r8, 0 + + vmrghb v4, v4, v5 + vmrghb v5, v6, v7 + vmrghb v6, v8, v9 + vmrghb v7, v10, v11 + + compute_sum_sse v0, v4, v18, v19, v20, v21, v23 + compute_sum_sse v1, v5, v18, v19, v20, v21, v23 + compute_sum_sse v2, v6, v18, v19, v20, v21, v23 + compute_sum_sse v3, v7, v18, v19, v20, v21, v23 + + variance_final v18, v19, v23, 6 + + addi r1, r1, 32 ;# recover stack + mtspr 256, r11 ;# reset old VRSAVE + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int src_pixels_per_line +;# r5 int xoffset +;# r6 int yoffset +;# r7 unsigned char *dst_ptr +;# r8 int dst_pixels_per_line +;# r9 unsigned int *sse +;# +;# r3 return value +vp8_sub_pixel_variance8x16_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xfffc + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + HProlog second_pass_8x16_pre_copy_b + + ;# Load up permutation constants + load_c v29, b_0123_b, 0, r12, r0 + load_c v30, b_4567_b, 0, r12, r0 + + hfilter_8 v0, v29, v30, 1 + hfilter_8 v1, v29, v30, 1 + hfilter_8 v2, v29, v30, 1 + hfilter_8 v3, v29, v30, 1 + hfilter_8 v4, v29, v30, 1 + hfilter_8 v5, v29, v30, 1 + hfilter_8 v6, v29, v30, 1 + hfilter_8 v7, v29, v30, 1 + hfilter_8 v8, v29, v30, 1 + hfilter_8 v9, v29, v30, 1 + hfilter_8 v10, v29, v30, 1 + hfilter_8 v11, v29, v30, 1 + hfilter_8 v12, v29, v30, 1 + hfilter_8 v13, v29, v30, 1 + hfilter_8 v14, v29, v30, 1 + hfilter_8 v15, v29, v30, 1 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional line that is needed + ;# for the vertical filter. + beq compute_sum_sse_8x16_b + + hfilter_8 v16, v29, v30, 0 + + b second_pass_8x16_b + +second_pass_8x16_pre_copy_b: + slwi. 
r6, r6, 5 ;# index into vertical filter array + + load_and_align_16 v0, r3, r4, 1 + load_and_align_16 v1, r3, r4, 1 + load_and_align_16 v2, r3, r4, 1 + load_and_align_16 v3, r3, r4, 1 + load_and_align_16 v4, r3, r4, 1 + load_and_align_16 v5, r3, r4, 1 + load_and_align_16 v6, r3, r4, 1 + load_and_align_16 v7, r3, r4, 1 + load_and_align_16 v8, r3, r4, 1 + load_and_align_16 v9, r3, r4, 1 + load_and_align_16 v10, r3, r4, 1 + load_and_align_16 v11, r3, r4, 1 + load_and_align_16 v12, r3, r4, 1 + load_and_align_16 v13, r3, r4, 1 + load_and_align_16 v14, r3, r4, 1 + load_and_align_16 v15, r3, r4, 1 + load_and_align_16 v16, r3, r4, 0 + + beq compute_sum_sse_8x16_b + +second_pass_8x16_b: + vspltish v20, 8 + vspltish v18, 3 + vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + load_vfilter v20, v21 + + vfilter_16 v0, v1 + vfilter_16 v1, v2 + vfilter_16 v2, v3 + vfilter_16 v3, v4 + vfilter_16 v4, v5 + vfilter_16 v5, v6 + vfilter_16 v6, v7 + vfilter_16 v7, v8 + vfilter_16 v8, v9 + vfilter_16 v9, v10 + vfilter_16 v10, v11 + vfilter_16 v11, v12 + vfilter_16 v12, v13 + vfilter_16 v13, v14 + vfilter_16 v14, v15 + vfilter_16 v15, v16 + +compute_sum_sse_8x16_b: + vspltish v18, 0 ;# sum + vspltish v19, 0 ;# sse + vspltish v23, 0 ;# unpack + li r10, 16 + + vmrghb v0, v0, v1 + vmrghb v1, v2, v3 + vmrghb v2, v4, v5 + vmrghb v3, v6, v7 + vmrghb v4, v8, v9 + vmrghb v5, v10, v11 + vmrghb v6, v12, v13 + vmrghb v7, v14, v15 + + load_and_align_16 v8, r7, r8, 1 + load_and_align_16 v9, r7, r8, 1 + load_and_align_16 v10, r7, r8, 1 + load_and_align_16 v11, r7, r8, 1 + load_and_align_16 v12, r7, r8, 1 + load_and_align_16 v13, r7, r8, 1 + load_and_align_16 v14, r7, r8, 1 + load_and_align_16 v15, r7, r8, 1 + + vmrghb v8, v8, v9 + vmrghb v9, v10, v11 + vmrghb v10, v12, v13 + vmrghb v11, v14, v15 + + compute_sum_sse v0, v8, v18, v19, v20, v21, v23 + compute_sum_sse v1, v9, v18, v19, v20, v21, v23 + compute_sum_sse v2, v10, v18, v19, v20, v21, v23 + compute_sum_sse v3, v11, v18, v19, v20, v21, v23 + + load_and_align_16 v8, r7, r8, 1 + load_and_align_16 v9, r7, r8, 1 + load_and_align_16 v10, r7, r8, 1 + load_and_align_16 v11, r7, r8, 1 + load_and_align_16 v12, r7, r8, 1 + load_and_align_16 v13, r7, r8, 1 + load_and_align_16 v14, r7, r8, 1 + load_and_align_16 v15, r7, r8, 0 + + vmrghb v8, v8, v9 + vmrghb v9, v10, v11 + vmrghb v10, v12, v13 + vmrghb v11, v14, v15 + + compute_sum_sse v4, v8, v18, v19, v20, v21, v23 + compute_sum_sse v5, v9, v18, v19, v20, v21, v23 + compute_sum_sse v6, v10, v18, v19, v20, v21, v23 + compute_sum_sse v7, v11, v18, v19, v20, v21, v23 + + variance_final v18, v19, v23, 7 + + addi r1, r1, 32 ;# recover stack + mtspr 256, r11 ;# reset old VRSAVE + blr + +;# Filters a horizontal line +;# expects: +;# r3 src_ptr +;# r4 pitch +;# r10 16 +;# r12 32 +;# v17 perm intput +;# v18 rounding +;# v19 shift +;# v20 filter taps +;# v21 tmp +;# v22 tmp +;# v23 tmp +;# v24 tmp +;# v25 tmp +;# v26 tmp +;# v27 tmp +;# v28 perm output +;# +.macro hfilter_16 V, increment_counter + + lvsl v17, 0, r3 ;# permutate value for alignment + + ;# input to filter is 21 bytes wide, output is 16 bytes. + ;# input will can span three vectors if not aligned correctly. 
+ lvx v21, 0, r3 + lvx v22, r10, r3 + lvx v23, r12, r3 + +.if \increment_counter + add r3, r3, r4 +.endif + vperm v21, v21, v22, v17 + vperm v22, v22, v23, v17 ;# v8 v9 = 21 input pixels left-justified + + ;# set 0 + vmsummbm v24, v20, v21, v18 ;# taps times elements + + ;# set 1 + vsldoi v23, v21, v22, 1 + vmsummbm v25, v20, v23, v18 + + ;# set 2 + vsldoi v23, v21, v22, 2 + vmsummbm v26, v20, v23, v18 + + ;# set 3 + vsldoi v23, v21, v22, 3 + vmsummbm v27, v20, v23, v18 + + vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit) + vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F + + vsrh v24, v24, v19 ;# divide v0, v1 by 128 + vsrh v25, v25, v19 + + vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result + vperm \V, \V, v0, v28 ;# \V = correctly-ordered result +.endm + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int src_pixels_per_line +;# r5 int xoffset +;# r6 int yoffset +;# r7 unsigned char *dst_ptr +;# r8 int dst_pixels_per_line +;# r9 unsigned int *sse +;# +;# r3 return value +vp8_sub_pixel_variance16x8_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xfff8 + mtspr 256, r12 ;# set VRSAVE + + stwu r1, -32(r1) ;# create space on the stack + + HProlog second_pass_16x8_pre_copy_b + + hfilter_16 v0, 1 + hfilter_16 v1, 1 + hfilter_16 v2, 1 + hfilter_16 v3, 1 + hfilter_16 v4, 1 + hfilter_16 v5, 1 + hfilter_16 v6, 1 + hfilter_16 v7, 1 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional line that is needed + ;# for the vertical filter. + beq compute_sum_sse_16x8_b + + hfilter_16 v8, 0 + + b second_pass_16x8_b + +second_pass_16x8_pre_copy_b: + slwi. r6, r6, 5 ;# index into vertical filter array + + load_and_align_16 v0, r3, r4, 1 + load_and_align_16 v1, r3, r4, 1 + load_and_align_16 v2, r3, r4, 1 + load_and_align_16 v3, r3, r4, 1 + load_and_align_16 v4, r3, r4, 1 + load_and_align_16 v5, r3, r4, 1 + load_and_align_16 v6, r3, r4, 1 + load_and_align_16 v7, r3, r4, 1 + load_and_align_16 v8, r3, r4, 1 + + beq compute_sum_sse_16x8_b + +second_pass_16x8_b: + vspltish v20, 8 + vspltish v18, 3 + vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + load_vfilter v20, v21 + + vfilter_16 v0, v1 + vfilter_16 v1, v2 + vfilter_16 v2, v3 + vfilter_16 v3, v4 + vfilter_16 v4, v5 + vfilter_16 v5, v6 + vfilter_16 v6, v7 + vfilter_16 v7, v8 + +compute_sum_sse_16x8_b: + vspltish v18, 0 ;# sum + vspltish v19, 0 ;# sse + vspltish v23, 0 ;# unpack + li r10, 16 + + compute_sum_sse_16 v0, 1 + compute_sum_sse_16 v1, 1 + compute_sum_sse_16 v2, 1 + compute_sum_sse_16 v3, 1 + compute_sum_sse_16 v4, 1 + compute_sum_sse_16 v5, 1 + compute_sum_sse_16 v6, 1 + compute_sum_sse_16 v7, 0 + + variance_final v18, v19, v23, 7 + + addi r1, r1, 32 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int src_pixels_per_line +;# r5 int xoffset +;# r6 int yoffset +;# r7 unsigned char *dst_ptr +;# r8 int dst_pixels_per_line +;# r9 unsigned int *sse +;# +;# r3 return value +vp8_sub_pixel_variance16x16_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xfff8 + mtspr 256, r12 ;# set VRSAVE + + stwu r1, -32(r1) ;# create space on the stack + + HProlog second_pass_16x16_pre_copy_b + + hfilter_16 v0, 1 + hfilter_16 v1, 1 + hfilter_16 v2, 1 + hfilter_16 v3, 1 + hfilter_16 v4, 1 + hfilter_16 v5, 1 + hfilter_16 v6, 1 + hfilter_16 v7, 1 + hfilter_16 v8, 1 + hfilter_16 v9, 1 + hfilter_16 v10, 1 + hfilter_16 
v11, 1 + hfilter_16 v12, 1 + hfilter_16 v13, 1 + hfilter_16 v14, 1 + hfilter_16 v15, 1 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional line that is needed + ;# for the vertical filter. + beq compute_sum_sse_16x16_b + + hfilter_16 v16, 0 + + b second_pass_16x16_b + +second_pass_16x16_pre_copy_b: + slwi. r6, r6, 5 ;# index into vertical filter array + + load_and_align_16 v0, r3, r4, 1 + load_and_align_16 v1, r3, r4, 1 + load_and_align_16 v2, r3, r4, 1 + load_and_align_16 v3, r3, r4, 1 + load_and_align_16 v4, r3, r4, 1 + load_and_align_16 v5, r3, r4, 1 + load_and_align_16 v6, r3, r4, 1 + load_and_align_16 v7, r3, r4, 1 + load_and_align_16 v8, r3, r4, 1 + load_and_align_16 v9, r3, r4, 1 + load_and_align_16 v10, r3, r4, 1 + load_and_align_16 v11, r3, r4, 1 + load_and_align_16 v12, r3, r4, 1 + load_and_align_16 v13, r3, r4, 1 + load_and_align_16 v14, r3, r4, 1 + load_and_align_16 v15, r3, r4, 1 + load_and_align_16 v16, r3, r4, 0 + + beq compute_sum_sse_16x16_b + +second_pass_16x16_b: + vspltish v20, 8 + vspltish v18, 3 + vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + load_vfilter v20, v21 + + vfilter_16 v0, v1 + vfilter_16 v1, v2 + vfilter_16 v2, v3 + vfilter_16 v3, v4 + vfilter_16 v4, v5 + vfilter_16 v5, v6 + vfilter_16 v6, v7 + vfilter_16 v7, v8 + vfilter_16 v8, v9 + vfilter_16 v9, v10 + vfilter_16 v10, v11 + vfilter_16 v11, v12 + vfilter_16 v12, v13 + vfilter_16 v13, v14 + vfilter_16 v14, v15 + vfilter_16 v15, v16 + +compute_sum_sse_16x16_b: + vspltish v18, 0 ;# sum + vspltish v19, 0 ;# sse + vspltish v23, 0 ;# unpack + li r10, 16 + + compute_sum_sse_16 v0, 1 + compute_sum_sse_16 v1, 1 + compute_sum_sse_16 v2, 1 + compute_sum_sse_16 v3, 1 + compute_sum_sse_16 v4, 1 + compute_sum_sse_16 v5, 1 + compute_sum_sse_16 v6, 1 + compute_sum_sse_16 v7, 1 + compute_sum_sse_16 v8, 1 + compute_sum_sse_16 v9, 1 + compute_sum_sse_16 v10, 1 + compute_sum_sse_16 v11, 1 + compute_sum_sse_16 v12, 1 + compute_sum_sse_16 v13, 1 + compute_sum_sse_16 v14, 1 + compute_sum_sse_16 v15, 0 + + variance_final v18, v19, v23, 8 + + addi r1, r1, 32 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .data + + .align 4 +hfilter_b: + .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0 + .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0 + .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0 + .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0 + .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0 + .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0 + .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0 + .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0 + + .align 4 +vfilter_b: + .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112 + .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96 + .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 + .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80 + .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48 + .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 + .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 + .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 
48, 48, 48, 48, 48, 48, 48
+ .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
+ .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+ .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
+ .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+ .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
+
+ .align 4
+b_hperm_b:
+ .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+
+ .align 4
+b_0123_b:
+ .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+
+ .align 4
+b_4567_b:
+ .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+
+b_hilo_b:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
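
The hfilter_b and vfilter_b tables above hold the two bilinear taps for each eighth-pel offset: 128 - 16*offset and 16*offset, which always sum to 128, so a filtered pixel is rounded with +64 and shifted right by 7 (the "divide by 128" in the comments). The scalar C sketch below shows the per-pixel arithmetic that the hfilter_16 and vfilter_16 macros vectorize; the names bilinear_taps and bilinear_pass are illustrative only and are not part of this patch or of the library's API.

#include <stdint.h>

/* Two-tap bilinear filters per eighth-pel offset; each pair sums to 128,
   mirroring the hfilter_b/vfilter_b tables in the patch. */
static const int bilinear_taps[8][2] = {
    {128,   0}, {112,  16}, { 96,  32}, { 80,  48},
    { 64,  64}, { 48,  80}, { 32,  96}, { 16, 112}
};

/* One bilinear pass: out = (a*F0 + b*F1 + 64) >> 7, where b is the pixel
   one 'step' away (step = 1 for the horizontal pass, step = src_pitch for
   the vertical pass).  Offset 0 degenerates to a plain copy, which is why
   the assembly branches past a pass when its offset is zero. */
static void bilinear_pass(const uint8_t *src, int src_pitch, int step,
                          uint8_t *dst, int width, int height, int offset)
{
    const int f0 = bilinear_taps[offset][0];
    const int f1 = bilinear_taps[offset][1];

    for (int r = 0; r < height; r++) {
        for (int c = 0; c < width; c++)
            dst[c] = (uint8_t)((src[c] * f0 + src[c + step] * f1 + 64) >> 7);
        src += src_pitch;
        dst += width;
    }
}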
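
After both passes, compute_sum_sse accumulates the signed differences and squared differences between the filtered predictor and the block at dst_ptr, and variance_final reduces the vector accumulators and applies variance = SSE - sum^2 / N; the last argument to variance_final (6 for 8x8, 7 for 8x16 and 16x8, 8 for 16x16) is log2 of the block's pixel count. Continuing the sketch above, a scalar equivalent of that final step, again with illustrative names rather than this file's API:

/* Sum/SSE accumulation plus the variance_final step: N = 1 << shift pixels,
   variance = SSE - sum*sum / N.  sse_out mirrors the unsigned int *sse
   argument passed in r9; the return value is what the routines return in r3. */
static unsigned int block_variance(const uint8_t *src, int src_pitch,
                                   const uint8_t *ref, int ref_pitch,
                                   int width, int height, int shift,
                                   unsigned int *sse_out)
{
    int sum = 0;
    unsigned int sse = 0;

    for (int r = 0; r < height; r++) {
        for (int c = 0; c < width; c++) {
            int d = src[c] - ref[c];
            sum += d;
            sse += (unsigned int)(d * d);
        }
        src += src_pitch;
        ref += ref_pitch;
    }

    *sse_out = sse;
    return sse - (unsigned int)(((int64_t)sum * sum) >> shift);
}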