path: root/vp8/common/ppc
Diffstat (limited to 'vp8/common/ppc')
-rw-r--r--  vp8/common/ppc/copy_altivec.asm                    47
-rw-r--r--  vp8/common/ppc/filter_altivec.asm                1013
-rw-r--r--  vp8/common/ppc/filter_bilinear_altivec.asm        677
-rw-r--r--  vp8/common/ppc/idctllm_altivec.asm                189
-rw-r--r--  vp8/common/ppc/loopfilter_altivec.c               135
-rw-r--r--  vp8/common/ppc/loopfilter_filters_altivec.asm    1253
-rw-r--r--  vp8/common/ppc/platform_altivec.asm                59
-rw-r--r--  vp8/common/ppc/recon_altivec.asm                  175
-rw-r--r--  vp8/common/ppc/sad_altivec.asm                    277
-rw-r--r--  vp8/common/ppc/systemdependent.c                  170
-rw-r--r--  vp8/common/ppc/variance_altivec.asm               375
-rw-r--r--  vp8/common/ppc/variance_subpixel_altivec.asm      865
12 files changed, 5235 insertions, 0 deletions
diff --git a/vp8/common/ppc/copy_altivec.asm b/vp8/common/ppc/copy_altivec.asm
new file mode 100644
index 000000000..a4ce91583
--- /dev/null
+++ b/vp8/common/ppc/copy_altivec.asm
@@ -0,0 +1,47 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl copy_mem16x16_ppc
+
+;# r3 unsigned char *src
+;# r4 int src_stride
+;# r5 unsigned char *dst
+;# r6 int dst_stride
+
+;# Make the assumption that input will not be aligned,
+;# but the output will be. So two reads and a perm
+;# for the input, but only one store for the output.
+copy_mem16x16_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xe000
+ mtspr 256, r12 ;# set VRSAVE
+
+ li r10, 16
+ mtctr r10
+
+cp_16x16_loop:
+ lvsl v0, 0, r3 ;# permutate value for alignment
+
+ lvx v1, 0, r3
+ lvx v2, r10, r3
+
+ vperm v1, v1, v2, v0
+
+ stvx v1, 0, r5
+
+ add r3, r3, r4 ;# increment source pointer
+ add r5, r5, r6 ;# increment destination pointer
+
+ bdnz cp_16x16_loop
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
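
The routine above is the standard AltiVec unaligned-load idiom: lvsl builds a permute control from the source address, two lvx loads cover the possibly misaligned 16 bytes, and vperm left-justifies them, while the aligned destination needs only one stvx. A rough C-intrinsics equivalent, offered as an illustration only (the function name is made up and is not part of this patch):

    #include <altivec.h>

    /* sketch of the copy above: unaligned source, aligned destination */
    static void copy16x16_sketch(const unsigned char *src, int src_stride,
                                 unsigned char *dst, int dst_stride)
    {
        int i;
        for (i = 0; i < 16; i++) {
            vector unsigned char perm = vec_lvsl(0, src);        /* lvsl  */
            vector unsigned char lo   = vec_ld(0, src);          /* lvx   */
            vector unsigned char hi   = vec_ld(16, src);         /* lvx   */
            vec_st(vec_perm(lo, hi, perm), 0, dst);              /* vperm + stvx */
            src += src_stride;
            dst += dst_stride;
        }
    }
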
diff --git a/vp8/common/ppc/filter_altivec.asm b/vp8/common/ppc/filter_altivec.asm
new file mode 100644
index 000000000..4da2e94f9
--- /dev/null
+++ b/vp8/common/ppc/filter_altivec.asm
@@ -0,0 +1,1013 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl sixtap_predict_ppc
+ .globl sixtap_predict8x4_ppc
+ .globl sixtap_predict8x8_ppc
+ .globl sixtap_predict16x16_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+ lis \R0, \LABEL@ha
+ la \R1, \LABEL@l(\R0)
+ lvx \V, \OFF, \R1
+.endm
+
+.macro load_hfilter V0, V1
+ load_c \V0, HFilter, r5, r9, r10
+
+ addi r5, r5, 16
+ lvx \V1, r5, r10
+.endm
+
+;# Vertical filtering
+.macro Vprolog
+ load_c v0, VFilter, r6, r3, r10
+
+ vspltish v5, 8
+ vspltish v6, 3
+ vslh v6, v5, v6 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ vspltb v1, v0, 1
+ vspltb v2, v0, 2
+ vspltb v3, v0, 3
+ vspltb v4, v0, 4
+ vspltb v5, v0, 5
+ vspltb v0, v0, 0
+.endm
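
vspltish can only splat immediates in the range -16..15, so the per-lane rounding constant 64 (0x0040) is built by splatting 8 and shifting left by 3; the six taps are then broadcast byte-by-byte from the loaded VFilter row. A minimal intrinsics sketch of the rounding-constant trick (illustrative only, not part of the patch):

    #include <altivec.h>

    /* 0x0040 in every halfword: splat 8 (fits the 5-bit immediate), shift left by 3 */
    static inline vector signed short round_64_per_lane(void)
    {
        vector signed short   eight = vec_splat_s16(8);
        vector unsigned short three = vec_splat_u16(3);
        return vec_sl(eight, three);
    }
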
+
+.macro vpre_load
+ Vprolog
+ li r10, 16
+ lvx v10, 0, r9 ;# v10..v14 = first 5 rows
+ lvx v11, r10, r9
+ addi r9, r9, 32
+ lvx v12, 0, r9
+ lvx v13, r10, r9
+ addi r9, r9, 32
+ lvx v14, 0, r9
+.endm
+
+.macro Msum Re, Ro, V, T, TMP
+ ;# (Re,Ro) += (V*T)
+ vmuleub \TMP, \V, \T ;# trashes v8
+ vadduhm \Re, \Re, \TMP ;# Re = evens, saturation unnecessary
+ vmuloub \TMP, \V, \T
+ vadduhm \Ro, \Ro, \TMP ;# Ro = odds
+.endm
+
+.macro vinterp_no_store P0 P1 P2 P3 P4 P5
+ vmuleub v8, \P0, v0 ;# 64 + 4 positive taps
+ vadduhm v16, v6, v8
+ vmuloub v8, \P0, v0
+ vadduhm v17, v6, v8
+ Msum v16, v17, \P2, v2, v8
+ Msum v16, v17, \P3, v3, v8
+ Msum v16, v17, \P5, v5, v8
+
+ vmuleub v18, \P1, v1 ;# 2 negative taps
+ vmuloub v19, \P1, v1
+ Msum v18, v19, \P4, v4, v8
+
+ vsubuhs v16, v16, v18 ;# subtract neg from pos
+ vsubuhs v17, v17, v19
+ vsrh v16, v16, v7 ;# divide by 128
+ vsrh v17, v17, v7 ;# v16 v17 = evens, odds
+ vmrghh v18, v16, v17 ;# v18 v19 = 16-bit result in order
+ vmrglh v19, v16, v17
+ vpkuhus \P0, v18, v19 ;# P0 = 8-bit result
+.endm
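
The macro above keeps the positive-tap and negative-tap contributions in separate unsigned accumulators (the VFilter table further down stores the magnitudes of taps 1 and 4), subtracts with saturation, and shifts right by 7. A scalar model of one output sample, assuming the usual VP8 six-tap window src[-2..+3] (the helper and its signature are illustrative):

    /* one six-tap sample: signed taps k[0..5] (k[1], k[4] <= 0), +64 rounding, >>7, clamp */
    static unsigned char sixtap_sample(const unsigned char *p, int step, const int k[6])
    {
        int i, sum = 64;                      /* rounding added before the shift */
        for (i = 0; i < 6; i++)
            sum += p[(i - 2) * step] * k[i];
        sum >>= 7;                            /* divide by 128 */
        if (sum < 0)   sum = 0;
        if (sum > 255) sum = 255;             /* saturate, as vpkuhus does */
        return (unsigned char)sum;
    }
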
+
+.macro vinterp_no_store_8x8 P0 P1 P2 P3 P4 P5
+ vmuleub v24, \P0, v13 ;# 64 + 4 positive taps
+ vadduhm v21, v20, v24
+ vmuloub v24, \P0, v13
+ vadduhm v22, v20, v24
+ Msum v21, v22, \P2, v15, v25
+ Msum v21, v22, \P3, v16, v25
+ Msum v21, v22, \P5, v18, v25
+
+ vmuleub v23, \P1, v14 ;# 2 negative taps
+ vmuloub v24, \P1, v14
+ Msum v23, v24, \P4, v17, v25
+
+ vsubuhs v21, v21, v23 ;# subtract neg from pos
+ vsubuhs v22, v22, v24
+ vsrh v21, v21, v19 ;# divide by 128
+    vsrh v22, v22, v19 ;# v21 v22 = evens, odds
+    vmrghh v23, v21, v22 ;# v23 v24 = 16-bit result in order
+ vmrglh v24, v21, v22
+ vpkuhus \P0, v23, v24 ;# P0 = 8-bit result
+.endm
+
+
+.macro Vinterp P0 P1 P2 P3 P4 P5
+ vinterp_no_store \P0, \P1, \P2, \P3, \P4, \P5
+ stvx \P0, 0, r7
+ add r7, r7, r8 ;# 33 ops per 16 pels
+.endm
+
+
+.macro luma_v P0, P1, P2, P3, P4, P5
+ addi r9, r9, 16 ;# P5 = newest input row
+ lvx \P5, 0, r9
+ Vinterp \P0, \P1, \P2, \P3, \P4, \P5
+.endm
+
+.macro luma_vtwo
+ luma_v v10, v11, v12, v13, v14, v15
+ luma_v v11, v12, v13, v14, v15, v10
+.endm
+
+.macro luma_vfour
+ luma_vtwo
+ luma_v v12, v13, v14, v15, v10, v11
+ luma_v v13, v14, v15, v10, v11, v12
+.endm
+
+.macro luma_vsix
+ luma_vfour
+ luma_v v14, v15, v10, v11, v12, v13
+ luma_v v15, v10, v11, v12, v13, v14
+.endm
+
+.macro Interp4 R I I4
+ vmsummbm \R, v13, \I, v15
+ vmsummbm \R, v14, \I4, \R
+.endm
+
+.macro Read8x8 VD, RS, RP, increment_counter
+ lvsl v21, 0, \RS ;# permutate value for alignment
+
+ ;# input to filter is 21 bytes wide, output is 16 bytes.
+    ;# input can span three vectors if not aligned correctly.
+ lvx \VD, 0, \RS
+ lvx v20, r10, \RS
+
+.if \increment_counter
+ add \RS, \RS, \RP
+.endif
+
+ vperm \VD, \VD, v20, v21
+.endm
+
+.macro interp_8x8 R
+ vperm v20, \R, \R, v16 ;# v20 = 0123 1234 2345 3456
+ vperm v21, \R, \R, v17 ;# v21 = 4567 5678 6789 789A
+ Interp4 v20, v20, v21 ;# v20 = result 0 1 2 3
+ vperm \R, \R, \R, v18 ;# R = 89AB 9ABC ABCx BCxx
+ Interp4 v21, v21, \R ;# v21 = result 4 5 6 7
+
+ vpkswus \R, v20, v21 ;# R = 0 1 2 3 4 5 6 7
+ vsrh \R, \R, v19
+
+ vpkuhus \R, \R, \R ;# saturate and pack
+
+.endm
+
+.macro Read4x4 VD, RS, RP, increment_counter
+ lvsl v21, 0, \RS ;# permutate value for alignment
+
+ ;# input to filter is 21 bytes wide, output is 16 bytes.
+    ;# input can span three vectors if not aligned correctly.
+ lvx v20, 0, \RS
+
+.if \increment_counter
+ add \RS, \RS, \RP
+.endif
+
+ vperm \VD, v20, v20, v21
+.endm
+ .text
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+sixtap_predict_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xff87
+ ori r12, r12, 0xffc0
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ slwi. r5, r5, 5 ;# index into horizontal filter array
+
+ vspltish v19, 7
+
+ ;# If there isn't any filtering to be done for the horizontal, then
+ ;# just skip to the second pass.
+ beq- vertical_only_4x4
+
+ ;# load up horizontal filter
+ load_hfilter v13, v14
+
+ ;# rounding added in on the multiply
+ vspltisw v16, 8
+ vspltisw v15, 3
+ vslw v15, v16, v15 ;# 0x00000040000000400000004000000040
+
+ ;# Load up permutation constants
+ load_c v16, B_0123, 0, r9, r10
+ load_c v17, B_4567, 0, r9, r10
+ load_c v18, B_89AB, 0, r9, r10
+
+ ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
+ addi r3, r3, -2
+
+ addi r9, r3, 0
+ li r10, 16
+ Read8x8 v2, r3, r4, 1
+ Read8x8 v3, r3, r4, 1
+ Read8x8 v4, r3, r4, 1
+ Read8x8 v5, r3, r4, 1
+
+ slwi. r6, r6, 4 ;# index into vertical filter array
+
+ ;# filter a line
+ interp_8x8 v2
+ interp_8x8 v3
+ interp_8x8 v4
+ interp_8x8 v5
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional 5 lines that are needed
+ ;# for the vertical filter.
+ beq- store_4x4
+
+ ;# only needed if there is a vertical filter present
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r9, r9, r4
+ sub r9, r9, r4
+
+ Read8x8 v0, r9, r4, 1
+ Read8x8 v1, r9, r4, 0
+ Read8x8 v6, r3, r4, 1
+ Read8x8 v7, r3, r4, 1
+ Read8x8 v8, r3, r4, 0
+
+ interp_8x8 v0
+ interp_8x8 v1
+ interp_8x8 v6
+ interp_8x8 v7
+ interp_8x8 v8
+
+ b second_pass_4x4
+
+vertical_only_4x4:
+ ;# only needed if there is a vertical filter present
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r3, r3, r4
+ sub r3, r3, r4
+ li r10, 16
+
+ Read8x8 v0, r3, r4, 1
+ Read8x8 v1, r3, r4, 1
+ Read8x8 v2, r3, r4, 1
+ Read8x8 v3, r3, r4, 1
+ Read8x8 v4, r3, r4, 1
+ Read8x8 v5, r3, r4, 1
+ Read8x8 v6, r3, r4, 1
+ Read8x8 v7, r3, r4, 1
+ Read8x8 v8, r3, r4, 0
+
+ slwi r6, r6, 4 ;# index into vertical filter array
+
+second_pass_4x4:
+ load_c v20, b_hilo_4x4, 0, r9, r10
+ load_c v21, b_hilo, 0, r9, r10
+
+ ;# reposition input so that it can go through the
+ ;# filtering phase with one pass.
+ vperm v0, v0, v1, v20 ;# 0 1 x x
+ vperm v2, v2, v3, v20 ;# 2 3 x x
+ vperm v4, v4, v5, v20 ;# 4 5 x x
+ vperm v6, v6, v7, v20 ;# 6 7 x x
+
+ vperm v0, v0, v2, v21 ;# 0 1 2 3
+ vperm v4, v4, v6, v21 ;# 4 5 6 7
+
+ vsldoi v1, v0, v4, 4
+ vsldoi v2, v0, v4, 8
+ vsldoi v3, v0, v4, 12
+
+ vsldoi v5, v4, v8, 4
+
+ load_c v13, VFilter, r6, r9, r10
+
+ vspltish v15, 8
+ vspltish v20, 3
+ vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ vspltb v14, v13, 1
+ vspltb v15, v13, 2
+ vspltb v16, v13, 3
+ vspltb v17, v13, 4
+ vspltb v18, v13, 5
+ vspltb v13, v13, 0
+
+ vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
+
+ stvx v0, 0, r1
+
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ lwz r0, 4(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ lwz r0, 8(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ lwz r0, 12(r1)
+ stw r0, 0(r7)
+
+ b exit_4x4
+
+store_4x4:
+
+ stvx v2, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ stvx v3, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ stvx v4, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ stvx v5, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+
+exit_4x4:
+
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+.macro w_8x8 V, D, R, P
+ stvx \V, 0, r1
+ lwz \R, 0(r1)
+ stw \R, 0(r7)
+ lwz \R, 4(r1)
+ stw \R, 4(r7)
+ add \D, \D, \P
+.endm
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+
+sixtap_predict8x4_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xffc0
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ slwi. r5, r5, 5 ;# index into horizontal filter array
+
+ vspltish v19, 7
+
+ ;# If there isn't any filtering to be done for the horizontal, then
+ ;# just skip to the second pass.
+ beq- second_pass_pre_copy_8x4
+
+ load_hfilter v13, v14
+
+ ;# rounding added in on the multiply
+ vspltisw v16, 8
+ vspltisw v15, 3
+ vslw v15, v16, v15 ;# 0x00000040000000400000004000000040
+
+ ;# Load up permutation constants
+ load_c v16, B_0123, 0, r9, r10
+ load_c v17, B_4567, 0, r9, r10
+ load_c v18, B_89AB, 0, r9, r10
+
+ ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
+ addi r3, r3, -2
+
+ addi r9, r3, 0
+ li r10, 16
+ Read8x8 v2, r3, r4, 1
+ Read8x8 v3, r3, r4, 1
+ Read8x8 v4, r3, r4, 1
+ Read8x8 v5, r3, r4, 1
+
+ slwi. r6, r6, 4 ;# index into vertical filter array
+
+ ;# filter a line
+ interp_8x8 v2
+ interp_8x8 v3
+ interp_8x8 v4
+ interp_8x8 v5
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional 5 lines that are needed
+ ;# for the vertical filter.
+ beq- store_8x4
+
+ ;# only needed if there is a vertical filter present
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r9, r9, r4
+ sub r9, r9, r4
+
+ Read8x8 v0, r9, r4, 1
+ Read8x8 v1, r9, r4, 0
+ Read8x8 v6, r3, r4, 1
+ Read8x8 v7, r3, r4, 1
+ Read8x8 v8, r3, r4, 0
+
+ interp_8x8 v0
+ interp_8x8 v1
+ interp_8x8 v6
+ interp_8x8 v7
+ interp_8x8 v8
+
+ b second_pass_8x4
+
+second_pass_pre_copy_8x4:
+ ;# only needed if there is a vertical filter present
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r3, r3, r4
+ sub r3, r3, r4
+ li r10, 16
+
+ Read8x8 v0, r3, r4, 1
+ Read8x8 v1, r3, r4, 1
+ Read8x8 v2, r3, r4, 1
+ Read8x8 v3, r3, r4, 1
+ Read8x8 v4, r3, r4, 1
+ Read8x8 v5, r3, r4, 1
+ Read8x8 v6, r3, r4, 1
+ Read8x8 v7, r3, r4, 1
+ Read8x8 v8, r3, r4, 1
+
+ slwi r6, r6, 4 ;# index into vertical filter array
+
+second_pass_8x4:
+ load_c v13, VFilter, r6, r9, r10
+
+ vspltish v15, 8
+ vspltish v20, 3
+ vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ vspltb v14, v13, 1
+ vspltb v15, v13, 2
+ vspltb v16, v13, 3
+ vspltb v17, v13, 4
+ vspltb v18, v13, 5
+ vspltb v13, v13, 0
+
+ vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
+ vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6
+ vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7
+ vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8
+
+ cmpi cr0, r8, 8
+ beq cr0, store_aligned_8x4
+
+ w_8x8 v0, r7, r0, r8
+ w_8x8 v1, r7, r0, r8
+ w_8x8 v2, r7, r0, r8
+ w_8x8 v3, r7, r0, r8
+
+ b exit_8x4
+
+store_aligned_8x4:
+
+ load_c v10, b_hilo, 0, r9, r10
+
+ vperm v0, v0, v1, v10
+ vperm v2, v2, v3, v10
+
+ stvx v0, 0, r7
+ addi r7, r7, 16
+ stvx v2, 0, r7
+
+ b exit_8x4
+
+store_8x4:
+ cmpi cr0, r8, 8
+ beq cr0, store_aligned2_8x4
+
+ w_8x8 v2, r7, r0, r8
+ w_8x8 v3, r7, r0, r8
+ w_8x8 v4, r7, r0, r8
+ w_8x8 v5, r7, r0, r8
+
+ b exit_8x4
+
+store_aligned2_8x4:
+ load_c v10, b_hilo, 0, r9, r10
+
+ vperm v2, v2, v3, v10
+ vperm v4, v4, v5, v10
+
+ stvx v2, 0, r7
+ addi r7, r7, 16
+ stvx v4, 0, r7
+
+exit_8x4:
+
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+
+ blr
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+
+;# Because the width that needs to be filtered fits in a single AltiVec
+;# register, there is no need to loop. Everything can stay in registers.
+sixtap_predict8x8_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xffc0
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ slwi. r5, r5, 5 ;# index into horizontal filter array
+
+ vspltish v19, 7
+
+ ;# If there isn't any filtering to be done for the horizontal, then
+ ;# just skip to the second pass.
+ beq- second_pass_pre_copy_8x8
+
+ load_hfilter v13, v14
+
+ ;# rounding added in on the multiply
+ vspltisw v16, 8
+ vspltisw v15, 3
+ vslw v15, v16, v15 ;# 0x00000040000000400000004000000040
+
+ ;# Load up permutation constants
+ load_c v16, B_0123, 0, r9, r10
+ load_c v17, B_4567, 0, r9, r10
+ load_c v18, B_89AB, 0, r9, r10
+
+ ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
+ addi r3, r3, -2
+
+ addi r9, r3, 0
+ li r10, 16
+ Read8x8 v2, r3, r4, 1
+ Read8x8 v3, r3, r4, 1
+ Read8x8 v4, r3, r4, 1
+ Read8x8 v5, r3, r4, 1
+ Read8x8 v6, r3, r4, 1
+ Read8x8 v7, r3, r4, 1
+ Read8x8 v8, r3, r4, 1
+ Read8x8 v9, r3, r4, 1
+
+ slwi. r6, r6, 4 ;# index into vertical filter array
+
+ ;# filter a line
+ interp_8x8 v2
+ interp_8x8 v3
+ interp_8x8 v4
+ interp_8x8 v5
+ interp_8x8 v6
+ interp_8x8 v7
+ interp_8x8 v8
+ interp_8x8 v9
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional 5 lines that are needed
+ ;# for the vertical filter.
+ beq- store_8x8
+
+ ;# only needed if there is a vertical filter present
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r9, r9, r4
+ sub r9, r9, r4
+
+ Read8x8 v0, r9, r4, 1
+ Read8x8 v1, r9, r4, 0
+ Read8x8 v10, r3, r4, 1
+ Read8x8 v11, r3, r4, 1
+ Read8x8 v12, r3, r4, 0
+
+ interp_8x8 v0
+ interp_8x8 v1
+ interp_8x8 v10
+ interp_8x8 v11
+ interp_8x8 v12
+
+ b second_pass_8x8
+
+second_pass_pre_copy_8x8:
+ ;# only needed if there is a vertical filter present
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r3, r3, r4
+ sub r3, r3, r4
+ li r10, 16
+
+ Read8x8 v0, r3, r4, 1
+ Read8x8 v1, r3, r4, 1
+ Read8x8 v2, r3, r4, 1
+ Read8x8 v3, r3, r4, 1
+ Read8x8 v4, r3, r4, 1
+ Read8x8 v5, r3, r4, 1
+ Read8x8 v6, r3, r4, 1
+ Read8x8 v7, r3, r4, 1
+ Read8x8 v8, r3, r4, 1
+ Read8x8 v9, r3, r4, 1
+ Read8x8 v10, r3, r4, 1
+ Read8x8 v11, r3, r4, 1
+ Read8x8 v12, r3, r4, 0
+
+ slwi r6, r6, 4 ;# index into vertical filter array
+
+second_pass_8x8:
+ load_c v13, VFilter, r6, r9, r10
+
+ vspltish v15, 8
+ vspltish v20, 3
+ vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ vspltb v14, v13, 1
+ vspltb v15, v13, 2
+ vspltb v16, v13, 3
+ vspltb v17, v13, 4
+ vspltb v18, v13, 5
+ vspltb v13, v13, 0
+
+ vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
+ vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6
+ vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7
+ vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8
+ vinterp_no_store_8x8 v4, v5, v6, v7, v8, v9
+ vinterp_no_store_8x8 v5, v6, v7, v8, v9, v10
+ vinterp_no_store_8x8 v6, v7, v8, v9, v10, v11
+ vinterp_no_store_8x8 v7, v8, v9, v10, v11, v12
+
+ cmpi cr0, r8, 8
+ beq cr0, store_aligned_8x8
+
+ w_8x8 v0, r7, r0, r8
+ w_8x8 v1, r7, r0, r8
+ w_8x8 v2, r7, r0, r8
+ w_8x8 v3, r7, r0, r8
+ w_8x8 v4, r7, r0, r8
+ w_8x8 v5, r7, r0, r8
+ w_8x8 v6, r7, r0, r8
+ w_8x8 v7, r7, r0, r8
+
+ b exit_8x8
+
+store_aligned_8x8:
+
+ load_c v10, b_hilo, 0, r9, r10
+
+ vperm v0, v0, v1, v10
+ vperm v2, v2, v3, v10
+ vperm v4, v4, v5, v10
+ vperm v6, v6, v7, v10
+
+ stvx v0, 0, r7
+ addi r7, r7, 16
+ stvx v2, 0, r7
+ addi r7, r7, 16
+ stvx v4, 0, r7
+ addi r7, r7, 16
+ stvx v6, 0, r7
+
+ b exit_8x8
+
+store_8x8:
+ cmpi cr0, r8, 8
+ beq cr0, store_aligned2_8x8
+
+ w_8x8 v2, r7, r0, r8
+ w_8x8 v3, r7, r0, r8
+ w_8x8 v4, r7, r0, r8
+ w_8x8 v5, r7, r0, r8
+ w_8x8 v6, r7, r0, r8
+ w_8x8 v7, r7, r0, r8
+ w_8x8 v8, r7, r0, r8
+ w_8x8 v9, r7, r0, r8
+
+ b exit_8x8
+
+store_aligned2_8x8:
+ load_c v10, b_hilo, 0, r9, r10
+
+ vperm v2, v2, v3, v10
+ vperm v4, v4, v5, v10
+ vperm v6, v6, v7, v10
+ vperm v8, v8, v9, v10
+
+ stvx v2, 0, r7
+ addi r7, r7, 16
+ stvx v4, 0, r7
+ addi r7, r7, 16
+ stvx v6, 0, r7
+ addi r7, r7, 16
+ stvx v8, 0, r7
+
+exit_8x8:
+
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+
+;# Two-pass filtering. The first pass filters horizontally, the second pass
+;# filters vertically. Either filter can be null, but not both. Needs to use a
+;# temporary buffer because the source buffer can't be modified and the buffer
+;# for the destination is not large enough to hold the temporary data.
+sixtap_predict16x16_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xf000
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-416(r1) ;# create space on the stack
+
+    ;# Three possibilities
+ ;# 1. First filter is null. Don't use a temp buffer.
+ ;# 2. Second filter is null. Don't use a temp buffer.
+ ;# 3. Neither are null, use temp buffer.
+
+ ;# First Pass (horizontal edge)
+ ;# setup pointers for src
+    ;#  if possibility (1) then set the src pointer to the original and jump
+    ;#  to the second pass. this is based on whether x_offset is 0.
+
+ ;# load up horizontal filter
+ slwi. r5, r5, 5 ;# index into horizontal filter array
+
+ load_hfilter v4, v5
+
+ beq- copy_horizontal_16x21
+
+ ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
+ addi r3, r3, -2
+
+ slwi. r6, r6, 4 ;# index into vertical filter array
+
+ ;# setup constants
+ ;# v14 permutation value for alignment
+ load_c v14, b_hperm, 0, r9, r10
+
+ ;# These statements are guessing that there won't be a second pass,
+    ;#  but if there is one then they need to be set again inside the bypass
+ li r0, 16 ;# prepare for no vertical filter
+
+ ;# Change the output pointer and pitch to be the actual
+    ;#  destination instead of a temporary buffer.
+ addi r9, r7, 0
+ addi r5, r8, 0
+
+ ;# no vertical filter, so write the output from the first pass
+ ;# directly into the output buffer.
+ beq- no_vertical_filter_bypass
+
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r3, r3, r4
+ sub r3, r3, r4
+
+ ;# setup counter for the number of lines that are going to be filtered
+ li r0, 21
+
+ ;# use the stack as temporary storage
+ la r9, 48(r1)
+ li r5, 16
+
+no_vertical_filter_bypass:
+
+ mtctr r0
+
+ ;# rounding added in on the multiply
+ vspltisw v10, 8
+ vspltisw v12, 3
+ vslw v12, v10, v12 ;# 0x00000040000000400000004000000040
+
+ ;# downshift by 7 ( divide by 128 ) at the end
+ vspltish v13, 7
+
+ ;# index to the next set of vectors in the row.
+ li r10, 16
+ li r12, 32
+
+horizontal_loop_16x16:
+
+ lvsl v15, 0, r3 ;# permutate value for alignment
+
+ ;# input to filter is 21 bytes wide, output is 16 bytes.
+    ;# input can span three vectors if not aligned correctly.
+ lvx v1, 0, r3
+ lvx v2, r10, r3
+ lvx v3, r12, r3
+
+ vperm v8, v1, v2, v15
+ vperm v9, v2, v3, v15 ;# v8 v9 = 21 input pixels left-justified
+
+ vsldoi v11, v8, v9, 4
+
+ ;# set 0
+ vmsummbm v6, v4, v8, v12 ;# taps times elements
+ vmsummbm v0, v5, v11, v6
+
+ ;# set 1
+ vsldoi v10, v8, v9, 1
+ vsldoi v11, v8, v9, 5
+
+ vmsummbm v6, v4, v10, v12
+ vmsummbm v1, v5, v11, v6
+
+ ;# set 2
+ vsldoi v10, v8, v9, 2
+ vsldoi v11, v8, v9, 6
+
+ vmsummbm v6, v4, v10, v12
+ vmsummbm v2, v5, v11, v6
+
+ ;# set 3
+ vsldoi v10, v8, v9, 3
+ vsldoi v11, v8, v9, 7
+
+ vmsummbm v6, v4, v10, v12
+ vmsummbm v3, v5, v11, v6
+
+ vpkswus v0, v0, v1 ;# v0 = 0 4 8 C 1 5 9 D (16-bit)
+ vpkswus v1, v2, v3 ;# v1 = 2 6 A E 3 7 B F
+
+ vsrh v0, v0, v13 ;# divide v0, v1 by 128
+ vsrh v1, v1, v13
+
+ vpkuhus v0, v0, v1 ;# v0 = scrambled 8-bit result
+ vperm v0, v0, v0, v14 ;# v0 = correctly-ordered result
+
+ stvx v0, 0, r9
+ add r9, r9, r5
+
+ add r3, r3, r4
+
+ bdnz horizontal_loop_16x16
+
+ ;# check again to see if vertical filter needs to be done.
+ cmpi cr0, r6, 0
+ beq cr0, end_16x16
+
+ ;# yes there is, so go to the second pass
+ b second_pass_16x16
+
+copy_horizontal_16x21:
+ li r10, 21
+ mtctr r10
+
+ li r10, 16
+
+ sub r3, r3, r4
+ sub r3, r3, r4
+
+ ;# this is done above if there is a horizontal filter,
+ ;# if not it needs to be done down here.
+ slwi r6, r6, 4 ;# index into vertical filter array
+
+ ;# always write to the stack when doing a horizontal copy
+ la r9, 48(r1)
+
+copy_horizontal_loop_16x21:
+ lvsl v15, 0, r3 ;# permutate value for alignment
+
+ lvx v1, 0, r3
+ lvx v2, r10, r3
+
+ vperm v8, v1, v2, v15
+
+ stvx v8, 0, r9
+ addi r9, r9, 16
+
+ add r3, r3, r4
+
+ bdnz copy_horizontal_loop_16x21
+
+second_pass_16x16:
+
+ ;# always read from the stack when doing a vertical filter
+ la r9, 48(r1)
+
+ ;# downshift by 7 ( divide by 128 ) at the end
+ vspltish v7, 7
+
+ vpre_load
+
+ luma_vsix
+ luma_vsix
+ luma_vfour
+
+end_16x16:
+
+ addi r1, r1, 416 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
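
The three possibilities listed at the top of this function map onto the code above: horizontal-only writes the first pass straight into dst, vertical-only copies 21 unaligned rows onto the stack, and the combined case filters 21 rows into the stack buffer before the vertical pass. A scalar sketch of the combined (case 3) flow, with signed tap arrays standing in for the HFilter/VFilter rows below (an illustration, not the code the patch installs):

    static unsigned char clamp_u8(int v)
    {
        return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    /* both filters non-null: 21 rows through the horizontal filter into a
     * 16x21 temp buffer, then 16 rows through the vertical filter into dst */
    static void sixtap16x16_two_pass(const unsigned char *src, int src_pitch,
                                     const int hk[6], const int vk[6],
                                     unsigned char *dst, int dst_pitch)
    {
        unsigned char tmp[21 * 16];
        int r, c, i, sum;

        for (r = 0; r < 21; r++) {                   /* first pass: src rows -2..18 */
            const unsigned char *s = src + (r - 2) * src_pitch;
            for (c = 0; c < 16; c++) {
                for (sum = 64, i = 0; i < 6; i++)
                    sum += s[c + i - 2] * hk[i];
                tmp[r * 16 + c] = clamp_u8(sum >> 7);
            }
        }
        for (r = 0; r < 16; r++)                     /* second pass: vertical */
            for (c = 0; c < 16; c++) {
                for (sum = 64, i = 0; i < 6; i++)
                    sum += tmp[(r + i) * 16 + c] * vk[i];
                dst[r * dst_pitch + c] = clamp_u8(sum >> 7);
            }
    }
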
+ .data
+
+ .align 4
+HFilter:
+ .byte 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12
+ .byte -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0
+ .byte 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36
+ .byte -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0
+ .byte 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50
+ .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0
+ .byte 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77
+ .byte -16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0
+ .byte 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93
+ .byte -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0
+ .byte 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108
+ .byte -11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0
+ .byte 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123
+ .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0
+
+ .align 4
+VFilter:
+ .byte 0, 0,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 0, 6,123, 12, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 2, 11,108, 36, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 0, 9, 93, 50, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 3, 16, 77, 77, 16, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 0, 6, 50, 93, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 1, 8, 36,108, 11, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 0, 1, 12,123, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+
+ .align 4
+b_hperm:
+ .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+
+ .align 4
+B_0123:
+ .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+
+ .align 4
+B_4567:
+ .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+
+ .align 4
+B_89AB:
+ .byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+
+ .align 4
+b_hilo:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
+
+ .align 4
+b_hilo_4x4:
+ .byte 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0
diff --git a/vp8/common/ppc/filter_bilinear_altivec.asm b/vp8/common/ppc/filter_bilinear_altivec.asm
new file mode 100644
index 000000000..fd8aa665f
--- /dev/null
+++ b/vp8/common/ppc/filter_bilinear_altivec.asm
@@ -0,0 +1,677 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl bilinear_predict4x4_ppc
+ .globl bilinear_predict8x4_ppc
+ .globl bilinear_predict8x8_ppc
+ .globl bilinear_predict16x16_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+ lis \R0, \LABEL@ha
+ la \R1, \LABEL@l(\R0)
+ lvx \V, \OFF, \R1
+.endm
+
+.macro load_vfilter V0, V1
+ load_c \V0, vfilter_b, r6, r9, r10
+
+ addi r6, r6, 16
+ lvx \V1, r6, r10
+.endm
+
+.macro HProlog jump_label
+ ;# load up horizontal filter
+ slwi. r5, r5, 4 ;# index into horizontal filter array
+
+ ;# index to the next set of vectors in the row.
+ li r10, 16
+ li r12, 32
+
+ ;# downshift by 7 ( divide by 128 ) at the end
+ vspltish v19, 7
+
+ ;# If there isn't any filtering to be done for the horizontal, then
+ ;# just skip to the second pass.
+ beq \jump_label
+
+ load_c v20, hfilter_b, r5, r9, r0
+
+ ;# setup constants
+ ;# v14 permutation value for alignment
+ load_c v28, b_hperm_b, 0, r9, r0
+
+ ;# rounding added in on the multiply
+ vspltisw v21, 8
+ vspltisw v18, 3
+ vslw v18, v21, v18 ;# 0x00000040000000400000004000000040
+
+ slwi. r6, r6, 5 ;# index into vertical filter array
+.endm
+
+;# Filters a horizontal line
+;# expects:
+;# r3 src_ptr
+;# r4 pitch
+;# r10 16
+;# r12 32
+;# v17 perm input
+;# v18 rounding
+;# v19 shift
+;# v20 filter taps
+;# v21 tmp
+;# v22 tmp
+;# v23 tmp
+;# v24 tmp
+;# v25 tmp
+;# v26 tmp
+;# v27 tmp
+;# v28 perm output
+;#
+.macro HFilter V
+    vperm v24, v21, v21, v10 ;# v24 = 0123 1234 2345 3456
+    vperm v25, v21, v21, v11 ;# v25 = 4567 5678 6789 789A
+
+ vmsummbm v24, v20, v24, v18
+ vmsummbm v25, v20, v25, v18
+
+ vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
+
+ vsrh v24, v24, v19 ;# divide v0, v1 by 128
+
+ vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result
+.endm
+
+.macro hfilter_8 V, increment_counter
+ lvsl v17, 0, r3 ;# permutate value for alignment
+
+ ;# input to filter is 9 bytes wide, output is 8 bytes.
+ lvx v21, 0, r3
+ lvx v22, r10, r3
+
+.if \increment_counter
+ add r3, r3, r4
+.endif
+ vperm v21, v21, v22, v17
+
+ HFilter \V
+.endm
+
+
+.macro load_and_align_8 V, increment_counter
+ lvsl v17, 0, r3 ;# permutate value for alignment
+
+ ;# input to filter is 21 bytes wide, output is 16 bytes.
+    ;# input can span three vectors if not aligned correctly.
+ lvx v21, 0, r3
+ lvx v22, r10, r3
+
+.if \increment_counter
+ add r3, r3, r4
+.endif
+
+ vperm \V, v21, v22, v17
+.endm
+
+.macro write_aligned_8 V, increment_counter
+ stvx \V, 0, r7
+
+.if \increment_counter
+ add r7, r7, r8
+.endif
+.endm
+
+.macro vfilter_16 P0 P1
+ vmuleub v22, \P0, v20 ;# 64 + 4 positive taps
+ vadduhm v22, v18, v22
+ vmuloub v23, \P0, v20
+ vadduhm v23, v18, v23
+
+ vmuleub v24, \P1, v21
+ vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary
+ vmuloub v25, \P1, v21
+ vadduhm v23, v23, v25 ;# Ro = odds
+
+ vsrh v22, v22, v19 ;# divide by 128
+    vsrh v23, v23, v19 ;# v22 v23 = evens, odds
+    vmrghh \P0, v22, v23 ;# \P0 v23 = 16-bit result in order
+ vmrglh v23, v22, v23
+ vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result
+.endm
+
+
+.macro w_8x8 V, D, R, P
+ stvx \V, 0, r1
+ lwz \R, 0(r1)
+ stw \R, 0(r7)
+ lwz \R, 4(r1)
+ stw \R, 4(r7)
+ add \D, \D, \P
+.endm
+
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+bilinear_predict4x4_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xf830
+ ori r12, r12, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ HProlog second_pass_4x4_pre_copy_b
+
+ ;# Load up permutation constants
+ load_c v10, b_0123_b, 0, r9, r12
+ load_c v11, b_4567_b, 0, r9, r12
+
+ hfilter_8 v0, 1
+ hfilter_8 v1, 1
+ hfilter_8 v2, 1
+ hfilter_8 v3, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq store_out_4x4_b
+
+ hfilter_8 v4, 0
+
+ b second_pass_4x4_b
+
+second_pass_4x4_pre_copy_b:
+ slwi r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_8 v0, 1
+ load_and_align_8 v1, 1
+ load_and_align_8 v2, 1
+ load_and_align_8 v3, 1
+ load_and_align_8 v4, 1
+
+second_pass_4x4_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+
+store_out_4x4_b:
+
+ stvx v0, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ stvx v1, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ stvx v2, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ stvx v3, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+
+exit_4x4:
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+bilinear_predict8x4_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xf830
+ ori r12, r12, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ HProlog second_pass_8x4_pre_copy_b
+
+ ;# Load up permutation constants
+ load_c v10, b_0123_b, 0, r9, r12
+ load_c v11, b_4567_b, 0, r9, r12
+
+ hfilter_8 v0, 1
+ hfilter_8 v1, 1
+ hfilter_8 v2, 1
+ hfilter_8 v3, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq store_out_8x4_b
+
+ hfilter_8 v4, 0
+
+ b second_pass_8x4_b
+
+second_pass_8x4_pre_copy_b:
+ slwi r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_8 v0, 1
+ load_and_align_8 v1, 1
+ load_and_align_8 v2, 1
+ load_and_align_8 v3, 1
+ load_and_align_8 v4, 1
+
+second_pass_8x4_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+
+store_out_8x4_b:
+
+ cmpi cr0, r8, 8
+ beq cr0, store_aligned_8x4_b
+
+ w_8x8 v0, r7, r0, r8
+ w_8x8 v1, r7, r0, r8
+ w_8x8 v2, r7, r0, r8
+ w_8x8 v3, r7, r0, r8
+
+ b exit_8x4
+
+store_aligned_8x4_b:
+ load_c v10, b_hilo_b, 0, r9, r10
+
+ vperm v0, v0, v1, v10
+ vperm v2, v2, v3, v10
+
+ stvx v0, 0, r7
+ addi r7, r7, 16
+ stvx v2, 0, r7
+
+exit_8x4:
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+bilinear_predict8x8_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xfff0
+ ori r12, r12, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ HProlog second_pass_8x8_pre_copy_b
+
+ ;# Load up permutation constants
+ load_c v10, b_0123_b, 0, r9, r12
+ load_c v11, b_4567_b, 0, r9, r12
+
+ hfilter_8 v0, 1
+ hfilter_8 v1, 1
+ hfilter_8 v2, 1
+ hfilter_8 v3, 1
+ hfilter_8 v4, 1
+ hfilter_8 v5, 1
+ hfilter_8 v6, 1
+ hfilter_8 v7, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq store_out_8x8_b
+
+ hfilter_8 v8, 0
+
+ b second_pass_8x8_b
+
+second_pass_8x8_pre_copy_b:
+ slwi r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_8 v0, 1
+ load_and_align_8 v1, 1
+ load_and_align_8 v2, 1
+ load_and_align_8 v3, 1
+ load_and_align_8 v4, 1
+ load_and_align_8 v5, 1
+ load_and_align_8 v6, 1
+ load_and_align_8 v7, 1
+ load_and_align_8 v8, 0
+
+second_pass_8x8_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+ vfilter_16 v4, v5
+ vfilter_16 v5, v6
+ vfilter_16 v6, v7
+ vfilter_16 v7, v8
+
+store_out_8x8_b:
+
+ cmpi cr0, r8, 8
+ beq cr0, store_aligned_8x8_b
+
+ w_8x8 v0, r7, r0, r8
+ w_8x8 v1, r7, r0, r8
+ w_8x8 v2, r7, r0, r8
+ w_8x8 v3, r7, r0, r8
+ w_8x8 v4, r7, r0, r8
+ w_8x8 v5, r7, r0, r8
+ w_8x8 v6, r7, r0, r8
+ w_8x8 v7, r7, r0, r8
+
+ b exit_8x8
+
+store_aligned_8x8_b:
+ load_c v10, b_hilo_b, 0, r9, r10
+
+ vperm v0, v0, v1, v10
+ vperm v2, v2, v3, v10
+ vperm v4, v4, v5, v10
+ vperm v6, v6, v7, v10
+
+ stvx v0, 0, r7
+ addi r7, r7, 16
+ stvx v2, 0, r7
+ addi r7, r7, 16
+ stvx v4, 0, r7
+ addi r7, r7, 16
+ stvx v6, 0, r7
+
+exit_8x8:
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+;# Filters a horizontal line
+;# expects:
+;# r3 src_ptr
+;# r4 pitch
+;# r10 16
+;# r12 32
+;# v17 perm input
+;# v18 rounding
+;# v19 shift
+;# v20 filter taps
+;# v21 tmp
+;# v22 tmp
+;# v23 tmp
+;# v24 tmp
+;# v25 tmp
+;# v26 tmp
+;# v27 tmp
+;# v28 perm output
+;#
+.macro hfilter_16 V, increment_counter
+
+ lvsl v17, 0, r3 ;# permutate value for alignment
+
+ ;# input to filter is 21 bytes wide, output is 16 bytes.
+    ;# input can span three vectors if not aligned correctly.
+ lvx v21, 0, r3
+ lvx v22, r10, r3
+ lvx v23, r12, r3
+
+.if \increment_counter
+ add r3, r3, r4
+.endif
+ vperm v21, v21, v22, v17
+ vperm v22, v22, v23, v17 ;# v8 v9 = 21 input pixels left-justified
+
+ ;# set 0
+ vmsummbm v24, v20, v21, v18 ;# taps times elements
+
+ ;# set 1
+ vsldoi v23, v21, v22, 1
+ vmsummbm v25, v20, v23, v18
+
+ ;# set 2
+ vsldoi v23, v21, v22, 2
+ vmsummbm v26, v20, v23, v18
+
+ ;# set 3
+ vsldoi v23, v21, v22, 3
+ vmsummbm v27, v20, v23, v18
+
+ vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
+ vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F
+
+ vsrh v24, v24, v19 ;# divide v0, v1 by 128
+ vsrh v25, v25, v19
+
+ vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result
+ vperm \V, \V, v0, v28 ;# \V = correctly-ordered result
+.endm
+
+.macro load_and_align_16 V, increment_counter
+ lvsl v17, 0, r3 ;# permutate value for alignment
+
+ ;# input to filter is 21 bytes wide, output is 16 bytes.
+    ;# input can span three vectors if not aligned correctly.
+ lvx v21, 0, r3
+ lvx v22, r10, r3
+
+.if \increment_counter
+ add r3, r3, r4
+.endif
+
+ vperm \V, v21, v22, v17
+.endm
+
+.macro write_16 V, increment_counter
+ stvx \V, 0, r7
+
+.if \increment_counter
+ add r7, r7, r8
+.endif
+.endm
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+bilinear_predict16x16_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ HProlog second_pass_16x16_pre_copy_b
+
+ hfilter_16 v0, 1
+ hfilter_16 v1, 1
+ hfilter_16 v2, 1
+ hfilter_16 v3, 1
+ hfilter_16 v4, 1
+ hfilter_16 v5, 1
+ hfilter_16 v6, 1
+ hfilter_16 v7, 1
+ hfilter_16 v8, 1
+ hfilter_16 v9, 1
+ hfilter_16 v10, 1
+ hfilter_16 v11, 1
+ hfilter_16 v12, 1
+ hfilter_16 v13, 1
+ hfilter_16 v14, 1
+ hfilter_16 v15, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq store_out_16x16_b
+
+ hfilter_16 v16, 0
+
+ b second_pass_16x16_b
+
+second_pass_16x16_pre_copy_b:
+ slwi r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_16 v0, 1
+ load_and_align_16 v1, 1
+ load_and_align_16 v2, 1
+ load_and_align_16 v3, 1
+ load_and_align_16 v4, 1
+ load_and_align_16 v5, 1
+ load_and_align_16 v6, 1
+ load_and_align_16 v7, 1
+ load_and_align_16 v8, 1
+ load_and_align_16 v9, 1
+ load_and_align_16 v10, 1
+ load_and_align_16 v11, 1
+ load_and_align_16 v12, 1
+ load_and_align_16 v13, 1
+ load_and_align_16 v14, 1
+ load_and_align_16 v15, 1
+ load_and_align_16 v16, 0
+
+second_pass_16x16_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+ vfilter_16 v4, v5
+ vfilter_16 v5, v6
+ vfilter_16 v6, v7
+ vfilter_16 v7, v8
+ vfilter_16 v8, v9
+ vfilter_16 v9, v10
+ vfilter_16 v10, v11
+ vfilter_16 v11, v12
+ vfilter_16 v12, v13
+ vfilter_16 v13, v14
+ vfilter_16 v14, v15
+ vfilter_16 v15, v16
+
+store_out_16x16_b:
+
+ write_16 v0, 1
+ write_16 v1, 1
+ write_16 v2, 1
+ write_16 v3, 1
+ write_16 v4, 1
+ write_16 v5, 1
+ write_16 v6, 1
+ write_16 v7, 1
+ write_16 v8, 1
+ write_16 v9, 1
+ write_16 v10, 1
+ write_16 v11, 1
+ write_16 v12, 1
+ write_16 v13, 1
+ write_16 v14, 1
+ write_16 v15, 0
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .data
+
+ .align 4
+hfilter_b:
+ .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0
+ .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0
+ .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0
+ .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0
+ .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0
+ .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0
+ .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0
+ .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0
+
+ .align 4
+vfilter_b:
+ .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
+ .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+ .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
+ .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+ .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
+ .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
+ .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+ .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+ .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
+ .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
+ .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+ .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
+ .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+ .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
+
+ .align 4
+b_hperm_b:
+ .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+
+ .align 4
+b_0123_b:
+ .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+
+ .align 4
+b_4567_b:
+ .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+
+b_hilo_b:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
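
Each first-pass row of hfilter_b above holds the tap pair (128 - 16*offset, 16*offset) replicated across the vector, and vfilter_b holds the same pairs splatted per byte; both passes round by 64 and shift right by 7. A scalar model of a single bilinear sample (the helper name is made up):

    /* one bilinear sample from neighbours a and b, offset in 0..7 */
    static unsigned char bilinear_sample(unsigned char a, unsigned char b, int offset)
    {
        int t0 = 128 - (offset << 4);     /* matches the hfilter_b / vfilter_b rows */
        int t1 = offset << 4;
        return (unsigned char)((a * t0 + b * t1 + 64) >> 7);
    }
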
diff --git a/vp8/common/ppc/idctllm_altivec.asm b/vp8/common/ppc/idctllm_altivec.asm
new file mode 100644
index 000000000..117d9cfc8
--- /dev/null
+++ b/vp8/common/ppc/idctllm_altivec.asm
@@ -0,0 +1,189 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl short_idct4x4llm_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+ lis \R0, \LABEL@ha
+ la \R1, \LABEL@l(\R0)
+ lvx \V, \OFF, \R1
+.endm
+
+;# r3 short *input
+;# r4 short *output
+;# r5 int pitch
+ .align 2
+short_idct4x4llm_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ load_c v8, sinpi8sqrt2, 0, r9, r10
+ load_c v9, cospi8sqrt2minus1, 0, r9, r10
+ load_c v10, hi_hi, 0, r9, r10
+ load_c v11, lo_lo, 0, r9, r10
+ load_c v12, shift_16, 0, r9, r10
+
+ li r10, 16
+ lvx v0, 0, r3 ;# input ip[0], ip[ 4]
+ lvx v1, r10, r3 ;# input ip[8], ip[12]
+
+ ;# first pass
+ vupkhsh v2, v0
+ vupkhsh v3, v1
+ vaddsws v6, v2, v3 ;# a1 = ip[0]+ip[8]
+ vsubsws v7, v2, v3 ;# b1 = ip[0]-ip[8]
+
+ vupklsh v0, v0
+ vmulosh v4, v0, v8
+ vsraw v4, v4, v12
+ vaddsws v4, v4, v0 ;# ip[ 4] * sin(pi/8) * sqrt(2)
+
+ vupklsh v1, v1
+ vmulosh v5, v1, v9
+ vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2)
+ vaddsws v5, v5, v1
+
+ vsubsws v4, v4, v5 ;# c1
+
+ vmulosh v3, v1, v8
+ vsraw v3, v3, v12
+ vaddsws v3, v3, v1 ;# ip[12] * sin(pi/8) * sqrt(2)
+
+ vmulosh v5, v0, v9
+ vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2)
+ vaddsws v5, v5, v0
+
+ vaddsws v3, v3, v5 ;# d1
+
+ vaddsws v0, v6, v3 ;# a1 + d1
+ vsubsws v3, v6, v3 ;# a1 - d1
+
+ vaddsws v1, v7, v4 ;# b1 + c1
+ vsubsws v2, v7, v4 ;# b1 - c1
+
+ ;# transpose input
+ vmrghw v4, v0, v1 ;# a0 b0 a1 b1
+ vmrghw v5, v2, v3 ;# c0 d0 c1 d1
+
+ vmrglw v6, v0, v1 ;# a2 b2 a3 b3
+ vmrglw v7, v2, v3 ;# c2 d2 c3 d3
+
+ vperm v0, v4, v5, v10 ;# a0 b0 c0 d0
+ vperm v1, v4, v5, v11 ;# a1 b1 c1 d1
+
+ vperm v2, v6, v7, v10 ;# a2 b2 c2 d2
+ vperm v3, v6, v7, v11 ;# a3 b3 c3 d3
+
+ ;# second pass
+ vaddsws v6, v0, v2 ;# a1 = ip[0]+ip[8]
+ vsubsws v7, v0, v2 ;# b1 = ip[0]-ip[8]
+
+ vmulosh v4, v1, v8
+ vsraw v4, v4, v12
+ vaddsws v4, v4, v1 ;# ip[ 4] * sin(pi/8) * sqrt(2)
+
+ vmulosh v5, v3, v9
+ vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2)
+ vaddsws v5, v5, v3
+
+ vsubsws v4, v4, v5 ;# c1
+
+ vmulosh v2, v3, v8
+ vsraw v2, v2, v12
+ vaddsws v2, v2, v3 ;# ip[12] * sin(pi/8) * sqrt(2)
+
+ vmulosh v5, v1, v9
+ vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2)
+ vaddsws v5, v5, v1
+
+ vaddsws v3, v2, v5 ;# d1
+
+ vaddsws v0, v6, v3 ;# a1 + d1
+ vsubsws v3, v6, v3 ;# a1 - d1
+
+ vaddsws v1, v7, v4 ;# b1 + c1
+ vsubsws v2, v7, v4 ;# b1 - c1
+
+ vspltish v6, 4
+ vspltish v7, 3
+
+ vpkswss v0, v0, v1
+ vpkswss v1, v2, v3
+
+ vaddshs v0, v0, v6
+ vaddshs v1, v1, v6
+
+ vsrah v0, v0, v7
+ vsrah v1, v1, v7
+
+ ;# transpose output
+ vmrghh v2, v0, v1 ;# a0 c0 a1 c1 a2 c2 a3 c3
+ vmrglh v3, v0, v1 ;# b0 d0 b1 d1 b2 d2 b3 d3
+
+ vmrghh v0, v2, v3 ;# a0 b0 c0 d0 a1 b1 c1 d1
+ vmrglh v1, v2, v3 ;# a2 b2 c2 d2 a3 b3 c3 d3
+
+ stwu r1,-416(r1) ;# create space on the stack
+
+ stvx v0, 0, r1
+ lwz r6, 0(r1)
+ stw r6, 0(r4)
+ lwz r6, 4(r1)
+ stw r6, 4(r4)
+
+ add r4, r4, r5
+
+ lwz r6, 8(r1)
+ stw r6, 0(r4)
+ lwz r6, 12(r1)
+ stw r6, 4(r4)
+
+ add r4, r4, r5
+
+ stvx v1, 0, r1
+ lwz r6, 0(r1)
+ stw r6, 0(r4)
+ lwz r6, 4(r1)
+ stw r6, 4(r4)
+
+ add r4, r4, r5
+
+ lwz r6, 8(r1)
+ stw r6, 0(r4)
+ lwz r6, 12(r1)
+ stw r6, 4(r4)
+
+ addi r1, r1, 416 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 4
+sinpi8sqrt2:
+ .short 35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468
+
+ .align 4
+cospi8sqrt2minus1:
+ .short 20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091
+
+ .align 4
+shift_16:
+ .long 16, 16, 16, 16
+
+ .align 4
+hi_hi:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
+
+ .align 4
+lo_lo:
+ .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
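
The two constants above are Q16 fixed-point factors: 35468/65536 is approximately sin(pi/8)*sqrt(2) and 20091/65536 is approximately cos(pi/8)*sqrt(2) - 1. Because 35468 does not fit in a signed halfword, the vector code multiplies by 35468 - 65536 and adds the input back; the cosine constant is stored as "minus 1", so its product also gets the input added back, which is why each vmulosh/vsraw pair is followed by a vaddsws. The second pass then adds 4 and shifts right by 3. A scalar sketch of the two multiplies (helper names are made up):

    /* Q16 models of the constant multiplies used by the IDCT above */
    static int mul_sin_pi8_sqrt2(int x)          /* ~ x * 0.5412 */
    {
        /* 35468 overflows a signed short, so multiply by (35468 - 65536) and add x back */
        return ((x * (35468 - 65536)) >> 16) + x;
    }

    static int mul_cos_pi8_sqrt2(int x)          /* ~ x * 1.3066 */
    {
        return x + ((x * 20091) >> 16);
    }
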
diff --git a/vp8/common/ppc/loopfilter_altivec.c b/vp8/common/ppc/loopfilter_altivec.c
new file mode 100644
index 000000000..71bf6e2d7
--- /dev/null
+++ b/vp8/common/ppc/loopfilter_altivec.c
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "loopfilter.h"
+#include "onyxc_int.h"
+
+typedef void loop_filter_function_y_ppc
+(
+ unsigned char *s, // source pointer
+ int p, // pitch
+ const signed char *flimit,
+ const signed char *limit,
+ const signed char *thresh
+);
+
+typedef void loop_filter_function_uv_ppc
+(
+ unsigned char *u, // source pointer
+ unsigned char *v, // source pointer
+ int p, // pitch
+ const signed char *flimit,
+ const signed char *limit,
+ const signed char *thresh
+);
+
+typedef void loop_filter_function_s_ppc
+(
+ unsigned char *s, // source pointer
+ int p, // pitch
+ const signed char *flimit
+);
+
+loop_filter_function_y_ppc mbloop_filter_horizontal_edge_y_ppc;
+loop_filter_function_y_ppc mbloop_filter_vertical_edge_y_ppc;
+loop_filter_function_y_ppc loop_filter_horizontal_edge_y_ppc;
+loop_filter_function_y_ppc loop_filter_vertical_edge_y_ppc;
+
+loop_filter_function_uv_ppc mbloop_filter_horizontal_edge_uv_ppc;
+loop_filter_function_uv_ppc mbloop_filter_vertical_edge_uv_ppc;
+loop_filter_function_uv_ppc loop_filter_horizontal_edge_uv_ppc;
+loop_filter_function_uv_ppc loop_filter_vertical_edge_uv_ppc;
+
+loop_filter_function_s_ppc loop_filter_simple_horizontal_edge_ppc;
+loop_filter_function_s_ppc loop_filter_simple_vertical_edge_ppc;
+
+// Horizontal MB filtering
+void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
+
+ if (u_ptr)
+ mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
+}
+
+void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ (void)u_ptr;
+ (void)v_ptr;
+ (void)uv_stride;
+ loop_filter_simple_horizontal_edge_ppc(y_ptr, y_stride, lfi->mbflim);
+}
+
+// Vertical MB Filtering
+void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
+
+ if (u_ptr)
+ mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
+}
+
+void loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ (void)u_ptr;
+ (void)v_ptr;
+ (void)uv_stride;
+ loop_filter_simple_vertical_edge_ppc(y_ptr, y_stride, lfi->mbflim);
+}
+
+// Horizontal B Filtering
+void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ // These should all be done at once with one call, instead of 3
+ loop_filter_horizontal_edge_y_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
+ loop_filter_horizontal_edge_y_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
+ loop_filter_horizontal_edge_y_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
+
+ if (u_ptr)
+ loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr);
+}
+
+void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ (void)u_ptr;
+ (void)v_ptr;
+ (void)uv_stride;
+ loop_filter_simple_horizontal_edge_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim);
+ loop_filter_simple_horizontal_edge_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim);
+ loop_filter_simple_horizontal_edge_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim);
+}
+
+// Vertical B Filtering
+void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr);
+
+ if (u_ptr)
+ loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr);
+}
+
+void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ (void)u_ptr;
+ (void)v_ptr;
+ (void)uv_stride;
+ loop_filter_simple_vertical_edge_ppc(y_ptr + 4, y_stride, lfi->flim);
+ loop_filter_simple_vertical_edge_ppc(y_ptr + 8, y_stride, lfi->flim);
+ loop_filter_simple_vertical_edge_ppc(y_ptr + 12, y_stride, lfi->flim);
+}
diff --git a/vp8/common/ppc/loopfilter_filters_altivec.asm b/vp8/common/ppc/loopfilter_filters_altivec.asm
new file mode 100644
index 000000000..61df4e976
--- /dev/null
+++ b/vp8/common/ppc/loopfilter_filters_altivec.asm
@@ -0,0 +1,1253 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl mbloop_filter_horizontal_edge_y_ppc
+ .globl loop_filter_horizontal_edge_y_ppc
+ .globl mbloop_filter_vertical_edge_y_ppc
+ .globl loop_filter_vertical_edge_y_ppc
+
+ .globl mbloop_filter_horizontal_edge_uv_ppc
+ .globl loop_filter_horizontal_edge_uv_ppc
+ .globl mbloop_filter_vertical_edge_uv_ppc
+ .globl loop_filter_vertical_edge_uv_ppc
+
+ .globl loop_filter_simple_horizontal_edge_ppc
+ .globl loop_filter_simple_vertical_edge_ppc
+
+ .text
+;# We often need to perform transposes (and other transpose-like operations)
+;# on matrices of data. This is simplified by the fact that we usually
+;# operate on hunks of data whose dimensions are powers of 2, or at least
+;# divisible by highish powers of 2.
+;#
+;# These operations can be very confusing. They become more straightforward
+;# when we think of them as permutations of address bits: Concatenate a
+;# group of vector registers and think of it as occupying a block of
+;# memory beginning at address zero. The low four bits 0...3 of the
+;# address then correspond to position within a register, the higher-order
+;# address bits select the register.
+;#
+;# Although register selection, at the code level, is arbitrary, things
+;# are simpler if we use contiguous ranges of register numbers, simpler
+;# still if the low-order bits of the register number correspond to
+;# conceptual address bits. We do this whenever reasonable.
+;#
+;# A 16x16 transpose can then be thought of as an operation on
+;# a 256-element block of memory. It takes 8 bits 0...7 to address this
+;# memory and the effect of a transpose is to interchange address bit
+;# 0 with 4, 1 with 5, 2 with 6, and 3 with 7. Bits 0...3 index the
+;# column, which is interchanged with the row addressed by bits 4..7.
+;#
+;# The altivec merge instructions provide a rapid means of effecting
+;# many of these transforms. They operate at three widths (8,16,32).
+;# Writing V(x) for vector register #x, paired merges permute address
+;# indices as follows.
+;#
+;# 0->1 1->2 2->3 3->(4+d) (4+s)->0:
+;#
+;# vmrghb V( x), V( y), V( y + (1<<s))
+;# vmrglb V( x + (1<<d)), V( y), V( y + (1<<s))
+;#
+;#
+;# =0= 1->2 2->3 3->(4+d) (4+s)->1:
+;#
+;# vmrghh V( x), V( y), V( y + (1<<s))
+;# vmrglh V( x + (1<<d)), V( y), V( y + (1<<s))
+;#
+;#
+;# =0= =1= 2->3 3->(4+d) (4+s)->2:
+;#
+;# vmrghw V( x), V( y), V( y + (1<<s))
+;# vmrglw V( x + (1<<d)), V( y), V( y + (1<<s))
+;#
+;#
+;# Unfortunately, there is no doubleword merge instruction.
+;# The following sequence uses "vperm" as a substitute.
+;# Assuming that the selection masks b_hihi and b_lolo (defined in LFppc.c)
+;# are in registers Vhihi and Vlolo, we can also effect the permutation
+;#
+;# =0= =1= =2= 3->(4+d) (4+s)->3 by the sequence:
+;#
+;# vperm V( x), V( y), V( y + (1<<s)), Vhihi
+;# vperm V( x + (1<<d)), V( y), V( y + (1<<s)), Vlolo
+;#
+;#
+;# Except for bits s and d, the other relationships between register
+;# number (= high-order part of address) bits are at the disposal of
+;# the programmer.
+;#
+
+;# To avoid excess transposes, we filter all 3 vertical luma subblock
+;# edges together. This requires a single 16x16 transpose, which, in
+;# the above language, amounts to the following permutation of address
+;# indices: 0<->4 1<->5 2<->6 3<->7, which we accomplish by
+;# 4 iterations of the cyclic transform 0->1->2->3->4->5->6->7->0.
+;#
+;# Except for the fact that the destination registers get written
+;# before we are done referencing the old contents, the cyclic transform
+;# is effected by
+;#
+;# x = 0; do {
+;# vmrghb V(2x), V(x), V(x+8);
+;#     vmrglb V(2x+1), V(x), V(x+8);
+;# } while( ++x < 8);
+;#
+;# For clarity, and because we can afford it, we do this transpose
+;# using all 32 registers, alternating the banks 0..15 and 16 .. 31,
+;# leaving the final result in 16 .. 31, as the lower registers are
+;# used in the filtering itself.
+;#
+.macro Tpair A, B, X, Y
+ vmrghb \A, \X, \Y
+ vmrglb \B, \X, \Y
+.endm
+
+;# Each step takes 8*2 = 16 instructions
+
+.macro t16_even
+ Tpair v16,v17, v0,v8
+ Tpair v18,v19, v1,v9
+ Tpair v20,v21, v2,v10
+ Tpair v22,v23, v3,v11
+ Tpair v24,v25, v4,v12
+ Tpair v26,v27, v5,v13
+ Tpair v28,v29, v6,v14
+ Tpair v30,v31, v7,v15
+.endm
+
+.macro t16_odd
+ Tpair v0,v1, v16,v24
+ Tpair v2,v3, v17,v25
+ Tpair v4,v5, v18,v26
+ Tpair v6,v7, v19,v27
+ Tpair v8,v9, v20,v28
+ Tpair v10,v11, v21,v29
+ Tpair v12,v13, v22,v30
+ Tpair v14,v15, v23,v31
+.endm
+
+;# Whole transpose takes 4*16 = 64 instructions
+
+.macro t16_full
+ t16_odd
+ t16_even
+ t16_odd
+ t16_even
+.endm
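
As a plain-C illustration of the address-bit argument above: each merge round rotates the 8-bit (register, byte) address left by one position, so four rounds swap the register and byte nibbles, which is the 16x16 transpose. The array layout and function name below are just for this sketch:

    #include <string.h>

    /* scalar model of one round of the pattern
     *   vmrghb V(2x),   V(x), V(x+8)
     *   vmrglb V(2x+1), V(x), V(x+8)
     * four rounds leave v[r][c] equal to the original v[c][r]. */
    static void transpose16x16_by_merges(unsigned char v[16][16])
    {
        unsigned char t[16][16];
        int round, x, i;

        for (round = 0; round < 4; round++) {
            for (x = 0; x < 8; x++)
                for (i = 0; i < 8; i++) {
                    t[2 * x][2 * i]         = v[x][i];         /* vmrghb */
                    t[2 * x][2 * i + 1]     = v[x + 8][i];
                    t[2 * x + 1][2 * i]     = v[x][i + 8];     /* vmrglb */
                    t[2 * x + 1][2 * i + 1] = v[x + 8][i + 8];
                }
            memcpy(v, t, sizeof(t));
        }
    }
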
+
+;# Vertical edge filtering requires transposes. For the simple filter,
+;# we need to convert 16 rows of 4 pels each into 4 registers of 16 pels
+;# each. Writing 0 ... 63 for the pixel indices, the desired result is:
+;#
+;# v0 = 0 1 ... 14 15
+;# v1 = 16 17 ... 30 31
+;#    v2 = 32 33 ... 46 47
+;#    v3 = 48 49 ... 62 63
+;#
+;# In frame-buffer memory, the layout is:
+;#
+;# 0 16 32 48
+;# 1 17 33 49
+;# ...
+;# 15 31 47 63.
+;#
+;# We begin by reading the data 32 bits at a time (using scalar operations)
+;# into a temporary array, reading the rows of the array into vector registers,
+;# with the following layout:
+;#
+;# v0 = 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60
+;# v1 = 1 17 33 49 5 21 ... 45 61
+;# v2 = 2 18 ... 46 62
+;# v3 = 3 19 ... 47 63
+;#
+;# From the "address-bit" perspective discussed above, we simply need to
+;# interchange bits 0 <-> 4 and 1 <-> 5, leaving bits 2 and 3 alone.
+;# In other words, we transpose each of the four 4x4 submatrices.
+;#
+;# This transformation is its own inverse, and we need to perform it
+;# again before writing the pixels back into the frame buffer.
+;#
+;# It acts in place on registers v0...v3, uses v4...v7 as temporaries,
+;# and assumes that v14/v15 contain the b_hihi/b_lolo selectors
+;# defined above. We think of both groups of 4 registers as having
+;# "addresses" {0,1,2,3} * 16.
+;#
+.macro Transpose4times4x4 Vlo, Vhi
+
+ ;# d=s=0 0->1 1->2 2->3 3->4 4->0 =5=
+
+ vmrghb v4, v0, v1
+ vmrglb v5, v0, v1
+ vmrghb v6, v2, v3
+ vmrglb v7, v2, v3
+
+ ;# d=0 s=1 =0= 1->2 2->3 3->4 4->5 5->1
+
+ vmrghh v0, v4, v6
+ vmrglh v1, v4, v6
+ vmrghh v2, v5, v7
+ vmrglh v3, v5, v7
+
+ ;# d=s=0 =0= =1= 2->3 3->4 4->2 =5=
+
+ vmrghw v4, v0, v1
+ vmrglw v5, v0, v1
+ vmrghw v6, v2, v3
+ vmrglw v7, v2, v3
+
+ ;# d=0 s=1 =0= =1= =2= 3->4 4->5 5->3
+
+ vperm v0, v4, v6, \Vlo
+ vperm v1, v4, v6, \Vhi
+ vperm v2, v5, v7, \Vlo
+ vperm v3, v5, v7, \Vhi
+.endm
+;# end Transpose4times4x4
+
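+;# As a scalar cross-check (hypothetical helper, not part of this file), the
+;# bit interchange above is the same as transposing each 4x4 sub-block of a
+;# 4x16 byte array in place:
+;#
+;#   static void transpose_4x4x4(unsigned char m[4][16]) {
+;#       int b, i, j;
+;#       for (b = 0; b < 4; b++)              /* the four sub-blocks */
+;#           for (i = 0; i < 4; i++)
+;#               for (j = i + 1; j < 4; j++) {
+;#                   unsigned char t = m[i][4*b + j];
+;#                   m[i][4*b + j] = m[j][4*b + i];
+;#                   m[j][4*b + i] = t;
+;#               }
+;#   }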
+
+;# Normal mb vertical edge filter transpose.
+;#
+;# We read 8 columns of data, initially in the following pattern:
+;#
+;# (0,0) (1,0) ... (7,0) (0,1) (1,1) ... (7,1)
+;# (0,2) (1,2) ... (7,2) (0,3) (1,3) ... (7,3)
+;# ...
+;# (0,14) (1,14) .. (7,14) (0,15) (1,15) .. (7,15)
+;#
+;# and wish to convert to:
+;#
+;# (0,0) ... (0,15)
+;# (1,0) ... (1,15)
+;# ...
+;# (7,0) ... (7,15).
+;#
+;# In "address bit" language, we wish to map
+;#
+;# 0->4 1->5 2->6 3->0 4->1 5->2 6->3, i.e., I -> (I+4) mod 7.
+;#
+;# This can be accomplished by 4 iterations of the cyclic transform
+;#
+;# I -> (I+1) mod 7;
+;#
+;# each iteration can be realized by (d=0, s=2):
+;#
+;# x = 0; do Tpair( V(2x),V(2x+1), V(x),V(x+4)) while( ++x < 4);
+;#
+;# The input/output is in registers v0...v7. We use v10...v17 as mirrors;
+;# preserving v8 = sign converter.
+;#
+;# Inverse transpose is similar, except here I -> (I+3) mod 7 and the
+;# result lands in the "mirror" registers v10...v17
+;#
+.macro t8x16_odd
+ Tpair v10, v11, v0, v4
+ Tpair v12, v13, v1, v5
+ Tpair v14, v15, v2, v6
+ Tpair v16, v17, v3, v7
+.endm
+
+.macro t8x16_even
+ Tpair v0, v1, v10, v14
+ Tpair v2, v3, v11, v15
+ Tpair v4, v5, v12, v16
+ Tpair v6, v7, v13, v17
+.endm
+
+.macro transpose8x16_fwd
+ t8x16_odd
+ t8x16_even
+ t8x16_odd
+ t8x16_even
+.endm
+
+.macro transpose8x16_inv
+ t8x16_odd
+ t8x16_even
+ t8x16_odd
+.endm
+
+.macro Transpose16x16
+ vmrghb v0, v16, v24
+ vmrglb v1, v16, v24
+ vmrghb v2, v17, v25
+ vmrglb v3, v17, v25
+ vmrghb v4, v18, v26
+ vmrglb v5, v18, v26
+ vmrghb v6, v19, v27
+ vmrglb v7, v19, v27
+ vmrghb v8, v20, v28
+ vmrglb v9, v20, v28
+ vmrghb v10, v21, v29
+ vmrglb v11, v21, v29
+ vmrghb v12, v22, v30
+ vmrglb v13, v22, v30
+ vmrghb v14, v23, v31
+ vmrglb v15, v23, v31
+ vmrghb v16, v0, v8
+ vmrglb v17, v0, v8
+ vmrghb v18, v1, v9
+ vmrglb v19, v1, v9
+ vmrghb v20, v2, v10
+ vmrglb v21, v2, v10
+ vmrghb v22, v3, v11
+ vmrglb v23, v3, v11
+ vmrghb v24, v4, v12
+ vmrglb v25, v4, v12
+ vmrghb v26, v5, v13
+ vmrglb v27, v5, v13
+ vmrghb v28, v6, v14
+ vmrglb v29, v6, v14
+ vmrghb v30, v7, v15
+ vmrglb v31, v7, v15
+ vmrghb v0, v16, v24
+ vmrglb v1, v16, v24
+ vmrghb v2, v17, v25
+ vmrglb v3, v17, v25
+ vmrghb v4, v18, v26
+ vmrglb v5, v18, v26
+ vmrghb v6, v19, v27
+ vmrglb v7, v19, v27
+ vmrghb v8, v20, v28
+ vmrglb v9, v20, v28
+ vmrghb v10, v21, v29
+ vmrglb v11, v21, v29
+ vmrghb v12, v22, v30
+ vmrglb v13, v22, v30
+ vmrghb v14, v23, v31
+ vmrglb v15, v23, v31
+ vmrghb v16, v0, v8
+ vmrglb v17, v0, v8
+ vmrghb v18, v1, v9
+ vmrglb v19, v1, v9
+ vmrghb v20, v2, v10
+ vmrglb v21, v2, v10
+ vmrghb v22, v3, v11
+ vmrglb v23, v3, v11
+ vmrghb v24, v4, v12
+ vmrglb v25, v4, v12
+ vmrghb v26, v5, v13
+ vmrglb v27, v5, v13
+ vmrghb v28, v6, v14
+ vmrglb v29, v6, v14
+ vmrghb v30, v7, v15
+ vmrglb v31, v7, v15
+.endm
+
+;# load_g loads a global vector (whose address is in the local variable Gptr)
+;# into vector register Vreg. Trashes r0
+.macro load_g Vreg, Gptr
+ lwz r0, \Gptr
+ lvx \Vreg, 0, r0
+.endm
+
+;# Exploit the saturation here: if the difference is negative it will be
+;# clamped to 0, and OR-ing 0 with the positive difference gives the
+;# absolute value.
+;# RES = abs( A-B), trashes TMP
+.macro Abs RES, TMP, A, B
+ vsububs \RES, \A, \B
+ vsububs \TMP, \B, \A
+ vor \RES, \RES, \TMP
+.endm
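+
+;# A scalar C restatement of the trick (illustrative only): each saturating
+;# subtract clamps the "wrong sign" direction to 0, so OR-ing the two
+;# one-sided results yields |a - b| without a compare or branch:
+;#
+;#   static unsigned char abs_diff_u8(unsigned char a, unsigned char b) {
+;#       unsigned char d1 = a > b ? (unsigned char)(a - b) : 0;  /* vsububs a,b */
+;#       unsigned char d2 = b > a ? (unsigned char)(b - a) : 0;  /* vsububs b,a */
+;#       return d1 | d2;
+;#   }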
+
+;# RES = Max( RES, abs( A-B)), trashes TMP
+.macro max_abs RES, TMP, A, B
+ vsububs \TMP, \A, \B
+ vmaxub \RES, \RES, \TMP
+ vsububs \TMP, \B, \A
+ vmaxub \RES, \RES, \TMP
+.endm
+
+.macro Masks
+ ;# build masks
+    ;# input is all 8-bit unsigned (0-255). We need to test
+    ;# abs(vala-valb) > limit, but there is no need to compare each
+    ;# value to the limit: find the max of the absolute differences
+    ;# and compare that single maximum to the limit.
+ ;# First hev
+ Abs v14, v13, v2, v3 ;# |P1 - P0|
+ max_abs v14, v13, v5, v4 ;# |Q1 - Q0|
+
+ vcmpgtub v10, v14, v10 ;# HEV = true if thresh exceeded
+
+ ;# Next limit
+ max_abs v14, v13, v0, v1 ;# |P3 - P2|
+ max_abs v14, v13, v1, v2 ;# |P2 - P1|
+ max_abs v14, v13, v6, v5 ;# |Q2 - Q1|
+ max_abs v14, v13, v7, v6 ;# |Q3 - Q2|
+
+ vcmpgtub v9, v14, v9 ;# R = true if limit exceeded
+
+ ;# flimit
+ Abs v14, v13, v3, v4 ;# |P0 - Q0|
+
+ vcmpgtub v8, v14, v8 ;# X = true if flimit exceeded
+
+ vor v8, v8, v9 ;# R = true if flimit or limit exceeded
+ ;# done building masks
+.endm
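+
+;# Per-pel scalar model of the mask logic (hypothetical helpers, assuming
+;# thresh/limit/flimit are single byte values rather than splatted vectors).
+;# Note that mask==1 here means "a limit was exceeded", i.e. filtering will
+;# be suppressed for that pel:
+;#
+;#   static unsigned char ad(unsigned char a, unsigned char b) {
+;#       return a > b ? (unsigned char)(a - b) : (unsigned char)(b - a);
+;#   }
+;#
+;#   static void build_masks(unsigned char p3, unsigned char p2,
+;#                           unsigned char p1, unsigned char p0,
+;#                           unsigned char q0, unsigned char q1,
+;#                           unsigned char q2, unsigned char q3,
+;#                           unsigned char thresh, unsigned char limit,
+;#                           unsigned char flimit, int *hev, int *mask) {
+;#       unsigned char m = ad(p1, p0);          /* running maximum       */
+;#       if (ad(q1, q0) > m) m = ad(q1, q0);
+;#       *hev = m > thresh;                     /* high edge variance    */
+;#       if (ad(p3, p2) > m) m = ad(p3, p2);
+;#       if (ad(p2, p1) > m) m = ad(p2, p1);
+;#       if (ad(q2, q1) > m) m = ad(q2, q1);
+;#       if (ad(q3, q2) > m) m = ad(q3, q2);
+;#       *mask = (m > limit) || (ad(p0, q0) > flimit);
+;#   }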
+
+.macro build_constants RFL, RLI, RTH, FL, LI, TH
+ ;# build constants
+ lvx \FL, 0, \RFL ;# flimit
+ lvx \LI, 0, \RLI ;# limit
+ lvx \TH, 0, \RTH ;# thresh
+
+ vspltisb v11, 8
+ vspltisb v12, 4
+ vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
+.endm
+
+.macro load_data_y
+ ;# setup strides/pointers to be able to access
+ ;# all of the data
+ add r5, r4, r4 ;# r5 = 2 * stride
+ sub r6, r3, r5 ;# r6 -> 2 rows back
+ neg r7, r4 ;# r7 = -stride
+
+ ;# load 16 pixels worth of data to work on
+ sub r0, r6, r5 ;# r0 -> 4 rows back (temp)
+ lvx v0, 0, r0 ;# P3 (read only)
+ lvx v1, r7, r6 ;# P2
+ lvx v2, 0, r6 ;# P1
+ lvx v3, r7, r3 ;# P0
+ lvx v4, 0, r3 ;# Q0
+ lvx v5, r4, r3 ;# Q1
+ lvx v6, r5, r3 ;# Q2
+ add r0, r3, r5 ;# r0 -> 2 rows fwd (temp)
+ lvx v7, r4, r0 ;# Q3 (read only)
+.endm
+
+;# Expects
+;# v10 == HEV
+;# v13 == tmp
+;# v14 == tmp
+.macro common_adjust P0, Q0, P1, Q1, HEV_PRESENT
+ vxor \P1, \P1, v11 ;# SP1
+ vxor \P0, \P0, v11 ;# SP0
+ vxor \Q0, \Q0, v11 ;# SQ0
+ vxor \Q1, \Q1, v11 ;# SQ1
+
+ vsubsbs v13, \P1, \Q1 ;# f = c (P1 - Q1)
+.if \HEV_PRESENT
+ vand v13, v13, v10 ;# f &= hev
+.endif
+ vsubsbs v14, \Q0, \P0 ;# -126 <= X = Q0-P0 <= +126
+ vaddsbs v13, v13, v14
+ vaddsbs v13, v13, v14
+ vaddsbs v13, v13, v14 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
+
+ vandc v13, v13, v8 ;# f &= mask
+
+ vspltisb v8, 3
+ vspltisb v9, 4
+
+ vaddsbs v14, v13, v9 ;# f1 = c (f+4)
+ vaddsbs v15, v13, v8 ;# f2 = c (f+3)
+
+ vsrab v13, v14, v8 ;# f1 >>= 3
+ vsrab v15, v15, v8 ;# f2 >>= 3
+
+ vsubsbs \Q0, \Q0, v13 ;# u1 = c (SQ0 - f1)
+ vaddsbs \P0, \P0, v15 ;# u2 = c (SP0 + f2)
+.endm
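+
+;# Scalar sketch of the adjustment (illustrative only; sat8 and the 0/1
+;# mask/hev flags stand in for the saturating byte ops and 0x00/0xff byte
+;# masks of the vector code, and the pels are assumed to be in the signed
+;# domain already, i.e. after the XOR with 0x80):
+;#
+;#   static signed char sat8(int v) {
+;#       return v < -128 ? -128 : v > 127 ? 127 : (signed char)v;
+;#   }
+;#
+;#   static void common_adjust(signed char *p0, signed char *q0,
+;#                             signed char p1, signed char q1,
+;#                             int use_hev, int hev, int mask) {
+;#       int f = sat8(p1 - q1);                 /* c(P1 - Q1)                  */
+;#       if (use_hev && !hev) f = 0;            /* f &= hev                    */
+;#       f = sat8(f + sat8(*q0 - *p0));
+;#       f = sat8(f + sat8(*q0 - *p0));
+;#       f = sat8(f + sat8(*q0 - *p0));         /* c( c(P1-Q1) + 3*(Q0-P0) )   */
+;#       if (mask) f = 0;                       /* zero f where limits fired   */
+;#       *q0 = sat8(*q0 - (sat8(f + 4) >> 3));  /* u1 = c(SQ0 - f1)            */
+;#       *p0 = sat8(*p0 + (sat8(f + 3) >> 3));  /* u2 = c(SP0 + f2)            */
+;#   }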
+
+.macro vp8_mbfilter
+ Masks
+
+    ;# start the filtering here
+ vxor v1, v1, v11 ;# SP2
+ vxor v2, v2, v11 ;# SP1
+ vxor v3, v3, v11 ;# SP0
+ vxor v4, v4, v11 ;# SQ0
+ vxor v5, v5, v11 ;# SQ1
+ vxor v6, v6, v11 ;# SQ2
+
+ ;# add outer taps if we have high edge variance
+ vsubsbs v13, v2, v5 ;# f = c (SP1-SQ1)
+
+ vsubsbs v14, v4, v3 ;# SQ0-SP0
+ vaddsbs v13, v13, v14
+ vaddsbs v13, v13, v14
+ vaddsbs v13, v13, v14 ;# f = c( c(SP1-SQ1) + 3*(SQ0-SP0))
+
+ vandc v13, v13, v8 ;# f &= mask
+ vand v15, v13, v10 ;# f2 = f & hev
+
+ ;# save bottom 3 bits so that we round one side +4 and the other +3
+ vspltisb v8, 3
+ vspltisb v9, 4
+
+ vaddsbs v14, v15, v9 ;# f1 = c (f+4)
+ vaddsbs v15, v15, v8 ;# f2 = c (f+3)
+
+ vsrab v14, v14, v8 ;# f1 >>= 3
+ vsrab v15, v15, v8 ;# f2 >>= 3
+
+ vsubsbs v4, v4, v14 ;# u1 = c (SQ0 - f1)
+ vaddsbs v3, v3, v15 ;# u2 = c (SP0 + f2)
+
+ ;# only apply wider filter if not high edge variance
+ vandc v13, v13, v10 ;# f &= ~hev
+
+ vspltisb v9, 2
+ vnor v8, v8, v8
+ vsrb v9, v8, v9 ;# 0x3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f
+ vupkhsb v9, v9 ;# 0x003f003f003f003f003f003f003f003f
+ vspltisb v8, 9
+
+ ;# roughly 1/7th difference across boundary
+ vspltish v10, 7
+    vmulosb v14, v8, v13        ;# 9 * f (odd bytes)
+ vmulesb v15, v8, v13
+ vaddshs v14, v14, v9 ;# += 63
+ vaddshs v15, v15, v9
+ vsrah v14, v14, v10 ;# >>= 7
+ vsrah v15, v15, v10
+ vmrglh v10, v15, v14
+ vmrghh v15, v15, v14
+
+ vpkshss v10, v15, v10 ;# X = saturated down to bytes
+
+ vsubsbs v6, v6, v10 ;# subtract from Q and add to P
+ vaddsbs v1, v1, v10
+
+ vxor v6, v6, v11
+ vxor v1, v1, v11
+
+ ;# roughly 2/7th difference across boundary
+ vspltish v10, 7
+ vaddubm v12, v8, v8
+    vmulosb v14, v12, v13       ;# 18 * f (odd bytes)
+ vmulesb v15, v12, v13
+ vaddshs v14, v14, v9
+ vaddshs v15, v15, v9
+ vsrah v14, v14, v10 ;# >>= 7
+ vsrah v15, v15, v10
+ vmrglh v10, v15, v14
+ vmrghh v15, v15, v14
+
+ vpkshss v10, v15, v10 ;# X = saturated down to bytes
+
+ vsubsbs v5, v5, v10 ;# subtract from Q and add to P
+ vaddsbs v2, v2, v10
+
+ vxor v5, v5, v11
+ vxor v2, v2, v11
+
+ ;# roughly 3/7th difference across boundary
+ vspltish v10, 7
+ vaddubm v12, v12, v8
+    vmulosb v14, v12, v13       ;# 27 * f (odd bytes)
+ vmulesb v15, v12, v13
+ vaddshs v14, v14, v9
+ vaddshs v15, v15, v9
+ vsrah v14, v14, v10 ;# >>= 7
+ vsrah v15, v15, v10
+ vmrglh v10, v15, v14
+ vmrghh v15, v15, v14
+
+ vpkshss v10, v15, v10 ;# X = saturated down to bytes
+
+ vsubsbs v4, v4, v10 ;# subtract from Q and add to P
+ vaddsbs v3, v3, v10
+
+ vxor v4, v4, v11
+ vxor v3, v3, v11
+.endm
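+
+;# Each of the three "n/7th" blocks above applies the same scalar operation,
+;# with k = 9, 18 and 27 for the pel pairs Q2/P2, Q1/P1 and Q0/P0.  A sketch
+;# (hypothetical helper, with f already masked by ~hev and the pels in the
+;# signed domain):
+;#
+;#   static signed char clamp8(int v) {
+;#       return v < -128 ? -128 : v > 127 ? 127 : (signed char)v;
+;#   }
+;#
+;#   static void wide_filter_step(signed char *p, signed char *q,
+;#                                int f, int k) {
+;#       int u = clamp8((k * f + 63) >> 7);  /* rounded, then packed with  */
+;#       *q = clamp8(*q - u);                /* saturation (vpkshss)       */
+;#       *p = clamp8(*p + u);
+;#   }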
+
+.macro SBFilter
+ Masks
+
+ common_adjust v3, v4, v2, v5, 1
+
+ ;# outer tap adjustments
+ vspltisb v8, 1
+
+ vaddubm v13, v13, v8 ;# f += 1
+ vsrab v13, v13, v8 ;# f >>= 1
+
+ vandc v13, v13, v10 ;# f &= ~hev
+
+ vsubsbs v5, v5, v13 ;# u1 = c (SQ1 - f)
+ vaddsbs v2, v2, v13 ;# u2 = c (SP1 + f)
+
+ vxor v2, v2, v11
+ vxor v3, v3, v11
+ vxor v4, v4, v11
+ vxor v5, v5, v11
+.endm
+
+ .align 2
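+;# r3 unsigned char *s
+;# r4 int p
+;# r5 const signed char *flimit
+;# r6 const signed char *limit
+;# r7 const signed char *thresh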
+mbloop_filter_horizontal_edge_y_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ build_constants r5, r6, r7, v8, v9, v10
+
+ load_data_y
+
+ vp8_mbfilter
+
+ stvx v1, r7, r6 ;# P2
+ stvx v2, 0, r6 ;# P1
+ stvx v3, r7, r3 ;# P0
+ stvx v4, 0, r3 ;# Q0
+ stvx v5, r4, r3 ;# Q1
+ stvx v6, r5, r3 ;# Q2
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char *s
+;# r4 int p
+;# r5 const signed char *flimit
+;# r6 const signed char *limit
+;# r7 const signed char *thresh
+loop_filter_horizontal_edge_y_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ build_constants r5, r6, r7, v8, v9, v10
+
+ load_data_y
+
+ SBFilter
+
+ stvx v2, 0, r6 ;# P1
+ stvx v3, r7, r3 ;# P0
+ stvx v4, 0, r3 ;# Q0
+ stvx v5, r4, r3 ;# Q1
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+;# Filtering a vertical mb. Each mb is aligned on a 16 byte boundary.
+;# So we can read in an entire mb aligned. However if we want to filter the mb
+;# edge we run into problems. For the loopfilter we require 4 bytes before the mb
+;# and 4 after, for a total of 8 bytes. Reading 16 bytes just to get 4 is a
+;# bit of a waste, so an admittedly ugly workaround is used instead:
+;# words are read in through the general-purpose register file and saved
+;# back out to memory to align and order them, then they are reloaded
+;# through the vector register file.
+.macro RLVmb V, R
+ lwzux r0, r3, r4
+ stw r0, 4(\R)
+ lwz r0,-4(r3)
+ stw r0, 0(\R)
+ lwzux r0, r3, r4
+ stw r0,12(\R)
+ lwz r0,-4(r3)
+ stw r0, 8(\R)
+ lvx \V, 0, \R
+.endm
+
+.macro WLVmb V, R
+ stvx \V, 0, \R
+ lwz r0,12(\R)
+ stwux r0, r3, r4
+ lwz r0, 8(\R)
+ stw r0,-4(r3)
+ lwz r0, 4(\R)
+ stwux r0, r3, r4
+ lwz r0, 0(\R)
+ stw r0,-4(r3)
+.endm
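+
+;# What one RLVmb amounts to, written as scalar C (illustrative only;
+;# 'edge' points at the edge pel of the first of the two rows, and the
+;# 32-bit copies preserve byte order because the target is big-endian):
+;#
+;#   static void gather_two_rows(unsigned char buf[16],
+;#                               const unsigned char *edge, int pitch) {
+;#       int row, i;
+;#       for (row = 0; row < 2; row++)
+;#           for (i = 0; i < 8; i++)   /* 4 pels each side of the edge */
+;#               buf[8*row + i] = edge[row*pitch + i - 4];
+;#   }
+;#
+;# The 16-byte buffer is then picked up with a single aligned lvx, and
+;# WLVmb performs the mirror-image scatter on the way back out.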
+
+ .align 2
+;# r3 unsigned char *s
+;# r4 int p
+;# r5 const signed char *flimit
+;# r6 const signed char *limit
+;# r7 const signed char *thresh
+mbloop_filter_vertical_edge_y_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xc000
+ mtspr 256, r12 ;# set VRSAVE
+
+ la r9, -48(r1) ;# temporary space for reading in vectors
+ sub r3, r3, r4
+
+ RLVmb v0, r9
+ RLVmb v1, r9
+ RLVmb v2, r9
+ RLVmb v3, r9
+ RLVmb v4, r9
+ RLVmb v5, r9
+ RLVmb v6, r9
+ RLVmb v7, r9
+
+ transpose8x16_fwd
+
+ build_constants r5, r6, r7, v8, v9, v10
+
+ vp8_mbfilter
+
+ transpose8x16_inv
+
+ add r3, r3, r4
+ neg r4, r4
+
+ WLVmb v17, r9
+ WLVmb v16, r9
+ WLVmb v15, r9
+ WLVmb v14, r9
+ WLVmb v13, r9
+ WLVmb v12, r9
+ WLVmb v11, r9
+ WLVmb v10, r9
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+.macro RL V, R, P
+ lvx \V, 0, \R
+ add \R, \R, \P
+.endm
+
+.macro WL V, R, P
+ stvx \V, 0, \R
+ add \R, \R, \P
+.endm
+
+.macro Fil P3, P2, P1, P0, Q0, Q1, Q2, Q3
+ ;# K = |P0-P1| already
+ Abs v14, v13, \Q0, \Q1 ;# M = |Q0-Q1|
+ vmaxub v14, v14, v4 ;# M = max( |P0-P1|, |Q0-Q1|)
+ vcmpgtub v10, v14, v0
+
+    Abs     v4, v5, \Q2, \Q3        ;# K = |Q2-Q3| = next |P0-P1|
+
+ max_abs v14, v13, \Q1, \Q2 ;# M = max( M, |Q1-Q2|)
+ max_abs v14, v13, \P1, \P2 ;# M = max( M, |P1-P2|)
+ max_abs v14, v13, \P2, \P3 ;# M = max( M, |P2-P3|)
+
+ vmaxub v14, v14, v4 ;# M = max interior abs diff
+ vcmpgtub v9, v14, v2 ;# M = true if int_l exceeded
+
+ Abs v14, v13, \P0, \Q0 ;# X = Abs( P0-Q0)
+ vcmpgtub v8, v14, v3 ;# X = true if edge_l exceeded
+ vor v8, v8, v9 ;# M = true if edge_l or int_l exceeded
+
+ ;# replace P1,Q1 w/signed versions
+ common_adjust \P0, \Q0, \P1, \Q1, 1
+
+ vaddubm v13, v13, v1 ;# -16 <= M <= 15, saturation irrelevant
+ vsrab v13, v13, v1
+ vandc v13, v13, v10 ;# adjust P1,Q1 by (M+1)>>1 if ! hev
+ vsubsbs \Q1, \Q1, v13
+ vaddsbs \P1, \P1, v13
+
+ vxor \P1, \P1, v11 ;# P1
+ vxor \P0, \P0, v11 ;# P0
+ vxor \Q0, \Q0, v11 ;# Q0
+ vxor \Q1, \Q1, v11 ;# Q1
+.endm
+
+
+ .align 2
+;# r3 unsigned char *s
+;# r4 int p
+;# r5 const signed char *flimit
+;# r6 const signed char *limit
+;# r7 const signed char *thresh
+loop_filter_vertical_edge_y_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ addi r9, r3, 0
+ RL v16, r9, r4
+ RL v17, r9, r4
+ RL v18, r9, r4
+ RL v19, r9, r4
+ RL v20, r9, r4
+ RL v21, r9, r4
+ RL v22, r9, r4
+ RL v23, r9, r4
+ RL v24, r9, r4
+ RL v25, r9, r4
+ RL v26, r9, r4
+ RL v27, r9, r4
+ RL v28, r9, r4
+ RL v29, r9, r4
+ RL v30, r9, r4
+ lvx v31, 0, r9
+
+ Transpose16x16
+
+ vspltisb v1, 1
+
+ build_constants r5, r6, r7, v3, v2, v0
+
+    Abs     v4, v5, v19, v18        ;# K(v4) = first |P0-P1|
+
+ Fil v16, v17, v18, v19, v20, v21, v22, v23
+ Fil v20, v21, v22, v23, v24, v25, v26, v27
+ Fil v24, v25, v26, v27, v28, v29, v30, v31
+
+ Transpose16x16
+
+ addi r9, r3, 0
+ WL v16, r9, r4
+ WL v17, r9, r4
+ WL v18, r9, r4
+ WL v19, r9, r4
+ WL v20, r9, r4
+ WL v21, r9, r4
+ WL v22, r9, r4
+ WL v23, r9, r4
+ WL v24, r9, r4
+ WL v25, r9, r4
+ WL v26, r9, r4
+ WL v27, r9, r4
+ WL v28, r9, r4
+ WL v29, r9, r4
+ WL v30, r9, r4
+ stvx v31, 0, r9
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+;# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- UV FILTERING -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+.macro active_chroma_sel V
+ andi. r7, r3, 8 ;# row origin modulo 16
+ add r7, r7, r7 ;# selects selectors
+ lis r12, _chromaSelectors@ha
+ la r0, _chromaSelectors@l(r12)
+ lwzux r0, r7, r0 ;# leave selector addr in r7
+
+ lvx \V, 0, r0 ;# mask to concatenate active U,V pels
+.endm
+
+.macro hread_uv Dest, U, V, Offs, VMask
+ lvx \U, \Offs, r3
+ lvx \V, \Offs, r4
+ vperm \Dest, \U, \V, \VMask ;# Dest = active part of U then V
+.endm
+
+.macro hwrite_uv New, U, V, Offs, Umask, Vmask
+ vperm \U, \New, \U, \Umask ;# Combine new pels with siblings
+ vperm \V, \New, \V, \Vmask
+ stvx \U, \Offs, r3 ;# Write to frame buffer
+ stvx \V, \Offs, r4
+.endm
+
+;# Process U,V in parallel.
+.macro load_chroma_h
+ neg r9, r5 ;# r9 = -1 * stride
+ add r8, r9, r9 ;# r8 = -2 * stride
+ add r10, r5, r5 ;# r10 = 2 * stride
+
+ active_chroma_sel v12
+
+ ;# P3, Q3 are read-only; need not save addresses or sibling pels
+ add r6, r8, r8 ;# r6 = -4 * stride
+ hread_uv v0, v14, v15, r6, v12
+ add r6, r10, r5 ;# r6 = 3 * stride
+ hread_uv v7, v14, v15, r6, v12
+
+ ;# Others are read/write; save addresses and sibling pels
+
+ add r6, r8, r9 ;# r6 = -3 * stride
+ hread_uv v1, v16, v17, r6, v12
+ hread_uv v2, v18, v19, r8, v12
+ hread_uv v3, v20, v21, r9, v12
+ hread_uv v4, v22, v23, 0, v12
+ hread_uv v5, v24, v25, r5, v12
+ hread_uv v6, v26, v27, r10, v12
+.endm
+
+.macro uresult_sel V
+ load_g \V, 4(r7)
+.endm
+
+.macro vresult_sel V
+ load_g \V, 8(r7)
+.endm
+
+;# always write P1,P0,Q0,Q1
+.macro store_chroma_h
+ uresult_sel v11
+ vresult_sel v12
+ hwrite_uv v2, v18, v19, r8, v11, v12
+ hwrite_uv v3, v20, v21, r9, v11, v12
+ hwrite_uv v4, v22, v23, 0, v11, v12
+ hwrite_uv v5, v24, v25, r5, v11, v12
+.endm
+
+ .align 2
+;# r3 unsigned char *u
+;# r4 unsigned char *v
+;# r5 int p
+;# r6 const signed char *flimit
+;# r7 const signed char *limit
+;# r8 const signed char *thresh
+mbloop_filter_horizontal_edge_uv_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ build_constants r6, r7, r8, v8, v9, v10
+
+ load_chroma_h
+
+ vp8_mbfilter
+
+ store_chroma_h
+
+ hwrite_uv v1, v16, v17, r6, v11, v12 ;# v1 == P2
+ hwrite_uv v6, v26, v27, r10, v11, v12 ;# v6 == Q2
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char *u
+;# r4 unsigned char *v
+;# r5 int p
+;# r6 const signed char *flimit
+;# r7 const signed char *limit
+;# r8 const signed char *thresh
+loop_filter_horizontal_edge_uv_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ build_constants r6, r7, r8, v8, v9, v10
+
+ load_chroma_h
+
+ SBFilter
+
+ store_chroma_h
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+.macro R V, R
+ lwzux r0, r3, r5
+ stw r0, 4(\R)
+ lwz r0,-4(r3)
+ stw r0, 0(\R)
+ lwzux r0, r4, r5
+ stw r0,12(\R)
+ lwz r0,-4(r4)
+ stw r0, 8(\R)
+ lvx \V, 0, \R
+.endm
+
+
+.macro W V, R
+ stvx \V, 0, \R
+ lwz r0,12(\R)
+ stwux r0, r4, r5
+ lwz r0, 8(\R)
+ stw r0,-4(r4)
+ lwz r0, 4(\R)
+ stwux r0, r3, r5
+ lwz r0, 0(\R)
+ stw r0,-4(r3)
+.endm
+
+.macro chroma_vread R
+ sub r3, r3, r5 ;# back up one line for simplicity
+ sub r4, r4, r5
+
+ R v0, \R
+ R v1, \R
+ R v2, \R
+ R v3, \R
+ R v4, \R
+ R v5, \R
+ R v6, \R
+ R v7, \R
+
+ transpose8x16_fwd
+.endm
+
+.macro chroma_vwrite R
+
+ transpose8x16_inv
+
+ add r3, r3, r5
+ add r4, r4, r5
+ neg r5, r5 ;# Write rows back in reverse order
+
+ W v17, \R
+ W v16, \R
+ W v15, \R
+ W v14, \R
+ W v13, \R
+ W v12, \R
+ W v11, \R
+ W v10, \R
+.endm
+
+ .align 2
+;# r3 unsigned char *u
+;# r4 unsigned char *v
+;# r5 int p
+;# r6 const signed char *flimit
+;# r7 const signed char *limit
+;# r8 const signed char *thresh
+mbloop_filter_vertical_edge_uv_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xc000
+ mtspr 256, r12 ;# set VRSAVE
+
+ la r9, -48(r1) ;# temporary space for reading in vectors
+
+ chroma_vread r9
+
+ build_constants r6, r7, r8, v8, v9, v10
+
+ vp8_mbfilter
+
+ chroma_vwrite r9
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char *u
+;# r4 unsigned char *v
+;# r5 int p
+;# r6 const signed char *flimit
+;# r7 const signed char *limit
+;# r8 const signed char *thresh
+loop_filter_vertical_edge_uv_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xc000
+ mtspr 256, r12 ;# set VRSAVE
+
+ la r9, -48(r1) ;# temporary space for reading in vectors
+
+ chroma_vread r9
+
+ build_constants r6, r7, r8, v8, v9, v10
+
+ SBFilter
+
+ chroma_vwrite r9
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+;# -=-=-=-=-=-=-=-=-=-=-=-=-=-= SIMPLE LOOP FILTER =-=-=-=-=-=-=-=-=-=-=-=-=-=-
+
+.macro vp8_simple_filter
+ Abs v14, v13, v1, v2 ;# M = abs( P0 - Q0)
+    vcmpgtub v8, v14, v8            ;# v8 = true if _over_ limit
+
+ ;# preserve unsigned v0 and v3
+ common_adjust v1, v2, v0, v3, 0
+
+ vxor v1, v1, v11
+ vxor v2, v2, v11 ;# cvt Q0, P0 back to pels
+.endm
+
+.macro simple_vertical
+ addi r8, 0, 16
+ addi r7, r5, 32
+
+ lvx v0, 0, r5
+ lvx v1, r8, r5
+ lvx v2, 0, r7
+ lvx v3, r8, r7
+
+ lis r12, _B_hihi@ha
+ la r0, _B_hihi@l(r12)
+ lvx v16, 0, r0
+
+ lis r12, _B_lolo@ha
+ la r0, _B_lolo@l(r12)
+ lvx v17, 0, r0
+
+ Transpose4times4x4 v16, v17
+ vp8_simple_filter
+
+ vxor v0, v0, v11
+    vxor    v3, v3, v11             ;# cvt P1, Q1 back to pels
+
+ Transpose4times4x4 v16, v17
+
+ stvx v0, 0, r5
+ stvx v1, r8, r5
+ stvx v2, 0, r7
+ stvx v3, r8, r7
+.endm
+
+ .align 2
+;# r3 unsigned char *s
+;# r4 int p
+;# r5 const signed char *flimit
+loop_filter_simple_horizontal_edge_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ ;# build constants
+ lvx v8, 0, r5 ;# flimit
+
+ vspltisb v11, 8
+ vspltisb v12, 4
+ vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
+
+ neg r5, r4 ;# r5 = -1 * stride
+ add r6, r5, r5 ;# r6 = -2 * stride
+
+ lvx v0, r6, r3 ;# v0 = P1 = 16 pels two rows above edge
+ lvx v1, r5, r3 ;# v1 = P0 = 16 pels one row above edge
+ lvx v2, 0, r3 ;# v2 = Q0 = 16 pels one row below edge
+ lvx v3, r4, r3 ;# v3 = Q1 = 16 pels two rows below edge
+
+ vp8_simple_filter
+
+ stvx v1, r5, r3 ;# store P0
+ stvx v2, 0, r3 ;# store Q0
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+.macro RLV Offs
+ stw r0, (\Offs*4)(r5)
+ lwzux r0, r7, r4
+.endm
+
+.macro WLV Offs
+ lwz r0, (\Offs*4)(r5)
+ stwux r0, r7, r4
+.endm
+
+ .align 2
+;# r3 unsigned char *s
+;# r4 int p
+;# r5 const signed char *flimit
+loop_filter_simple_vertical_edge_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xc000
+ mtspr 256, r12 ;# set VRSAVE
+
+ ;# build constants
+ lvx v8, 0, r5 ;# flimit
+
+ vspltisb v11, 8
+ vspltisb v12, 4
+ vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
+
+ la r5, -96(r1) ;# temporary space for reading in vectors
+
+ ;# Store 4 pels at word "Offs" in temp array, then advance r7
+ ;# to next row and read another 4 pels from the frame buffer.
+
+ subi r7, r3, 2 ;# r7 -> 2 pels before start
+ lwzx r0, 0, r7 ;# read first 4 pels
+
+ ;# 16 unaligned word accesses
+ RLV 0
+ RLV 4
+ RLV 8
+ RLV 12
+ RLV 1
+ RLV 5
+ RLV 9
+ RLV 13
+ RLV 2
+ RLV 6
+ RLV 10
+ RLV 14
+ RLV 3
+ RLV 7
+ RLV 11
+
+ stw r0, (15*4)(r5) ;# write last 4 pels
+
+ simple_vertical
+
+ ;# Read temp array, write frame buffer.
+ subi r7, r3, 2 ;# r7 -> 2 pels before start
+ lwzx r0, 0, r5 ;# read/write first 4 pels
+ stwx r0, 0, r7
+
+ WLV 4
+ WLV 8
+ WLV 12
+ WLV 1
+ WLV 5
+ WLV 9
+ WLV 13
+ WLV 2
+ WLV 6
+ WLV 10
+ WLV 14
+ WLV 3
+ WLV 7
+ WLV 11
+ WLV 15
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .data
+
+_chromaSelectors:
+ .long _B_hihi
+ .long _B_Ures0
+ .long _B_Vres0
+ .long 0
+ .long _B_lolo
+ .long _B_Ures8
+ .long _B_Vres8
+ .long 0
+
+ .align 4
+_B_Vres8:
+ .byte 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15
+
+ .align 4
+_B_Ures8:
+ .byte 16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7
+
+ .align 4
+_B_lolo:
+ .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
+
+ .align 4
+_B_Vres0:
+ .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
+ .align 4
+_B_Ures0:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+
+ .align 4
+_B_hihi:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
diff --git a/vp8/common/ppc/platform_altivec.asm b/vp8/common/ppc/platform_altivec.asm
new file mode 100644
index 000000000..f81d86f74
--- /dev/null
+++ b/vp8/common/ppc/platform_altivec.asm
@@ -0,0 +1,59 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl save_platform_context
+ .globl restore_platform_context
+
+.macro W V P
+ stvx \V, 0, \P
+ addi \P, \P, 16
+.endm
+
+.macro R V P
+ lvx \V, 0, \P
+ addi \P, \P, 16
+.endm
+
+;# r3 context_ptr
+ .align 2
+save_platform_context:
+ W v20, r3
+ W v21, r3
+ W v22, r3
+ W v23, r3
+ W v24, r3
+ W v25, r3
+ W v26, r3
+ W v27, r3
+ W v28, r3
+ W v29, r3
+ W v30, r3
+ W v31, r3
+
+ blr
+
+;# r3 context_ptr
+ .align 2
+restore_platform_context:
+ R v20, r3
+ R v21, r3
+ R v22, r3
+ R v23, r3
+ R v24, r3
+ R v25, r3
+ R v26, r3
+ R v27, r3
+ R v28, r3
+ R v29, r3
+ R v30, r3
+ R v31, r3
+
+ blr
diff --git a/vp8/common/ppc/recon_altivec.asm b/vp8/common/ppc/recon_altivec.asm
new file mode 100644
index 000000000..dd39e05a8
--- /dev/null
+++ b/vp8/common/ppc/recon_altivec.asm
@@ -0,0 +1,175 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl recon4b_ppc
+ .globl recon2b_ppc
+ .globl recon_b_ppc
+
+.macro row_of16 Diff Pred Dst Stride
+ lvx v1, 0, \Pred ;# v1 = pred = p0..p15
+ addi \Pred, \Pred, 16 ;# next pred
+ vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7
+ lvx v3, 0, \Diff ;# v3 = d0..d7
+ vaddshs v2, v2, v3 ;# v2 = r0..r7
+ vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15
+ lvx v3, r8, \Diff ;# v3 = d8..d15
+ addi \Diff, \Diff, 32 ;# next diff
+ vaddshs v3, v3, v1 ;# v3 = r8..r15
+ vpkshus v2, v2, v3 ;# v2 = 8-bit r0..r15
+ stvx v2, 0, \Dst ;# to dst
+ add \Dst, \Dst, \Stride ;# next dst
+.endm
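+
+;# Scalar equivalent of one row_of16 (illustrative only): widen the 8-bit
+;# predictor, add the 16-bit residual, and saturate back to 0..255 just as
+;# vpkshus does:
+;#
+;#   static void recon_row16(const short *diff, const unsigned char *pred,
+;#                           unsigned char *dst) {
+;#       int i, v;
+;#       for (i = 0; i < 16; i++) {
+;#           v = pred[i] + diff[i];
+;#           dst[i] = v < 0 ? 0 : v > 255 ? 255 : (unsigned char)v;
+;#       }
+;#   }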
+
+ .text
+ .align 2
+;# r3 = short *diff_ptr,
+;# r4 = unsigned char *pred_ptr,
+;# r5 = unsigned char *dst_ptr,
+;# r6 = int stride
+recon4b_ppc:
+ mfspr r0, 256 ;# get old VRSAVE
+ stw r0, -8(r1) ;# save old VRSAVE to stack
+ oris r0, r0, 0xf000
+ mtspr 256,r0 ;# set VRSAVE
+
+ vxor v0, v0, v0
+ li r8, 16
+
+ row_of16 r3, r4, r5, r6
+ row_of16 r3, r4, r5, r6
+ row_of16 r3, r4, r5, r6
+ row_of16 r3, r4, r5, r6
+
+ lwz r12, -8(r1) ;# restore old VRSAVE from stack
+ mtspr 256, r12 ;# reset old VRSAVE
+
+ blr
+
+.macro two_rows_of8 Diff Pred Dst Stride write_first_four_pels
+ lvx v1, 0, \Pred ;# v1 = pred = p0..p15
+ vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7
+ lvx v3, 0, \Diff ;# v3 = d0..d7
+ vaddshs v2, v2, v3 ;# v2 = r0..r7
+ vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15
+    lvx     v3, r8, \Diff       ;# v3 = d8..d15
+ vaddshs v3, v3, v1 ;# v3 = r8..r15
+    vpkshus v2, v2, v3          ;# v2 = 8-bit r0..r15
+ stvx v2, 0, r10 ;# 2 rows to dst from buf
+ lwz r0, 0(r10)
+.if \write_first_four_pels
+ stw r0, 0(\Dst)
+.else
+ stwux r0, \Dst, \Stride
+.endif
+ lwz r0, 4(r10)
+ stw r0, 4(\Dst)
+ lwz r0, 8(r10)
+ stwux r0, \Dst, \Stride ;# advance dst to next row
+ lwz r0, 12(r10)
+ stw r0, 4(\Dst)
+.endm
+
+ .align 2
+;# r3 = short *diff_ptr,
+;# r4 = unsigned char *pred_ptr,
+;# r5 = unsigned char *dst_ptr,
+;# r6 = int stride
+
+recon2b_ppc:
+ mfspr r0, 256 ;# get old VRSAVE
+ stw r0, -8(r1) ;# save old VRSAVE to stack
+ oris r0, r0, 0xf000
+ mtspr 256,r0 ;# set VRSAVE
+
+ vxor v0, v0, v0
+ li r8, 16
+
+ la r10, -48(r1) ;# buf
+
+ two_rows_of8 r3, r4, r5, r6, 1
+
+ addi r4, r4, 16; ;# next pred
+ addi r3, r3, 32; ;# next diff
+
+ two_rows_of8 r3, r4, r5, r6, 0
+
+ lwz r12, -8(r1) ;# restore old VRSAVE from stack
+ mtspr 256, r12 ;# reset old VRSAVE
+
+ blr
+
+.macro get_two_diff_rows
+ stw r0, 0(r10)
+ lwz r0, 4(r3)
+ stw r0, 4(r10)
+ lwzu r0, 32(r3)
+ stw r0, 8(r10)
+ lwz r0, 4(r3)
+ stw r0, 12(r10)
+ lvx v3, 0, r10
+.endm
+
+ .align 2
+;# r3 = short *diff_ptr,
+;# r4 = unsigned char *pred_ptr,
+;# r5 = unsigned char *dst_ptr,
+;# r6 = int stride
+recon_b_ppc:
+ mfspr r0, 256 ;# get old VRSAVE
+ stw r0, -8(r1) ;# save old VRSAVE to stack
+ oris r0, r0, 0xf000
+ mtspr 256,r0 ;# set VRSAVE
+
+ vxor v0, v0, v0
+
+ la r10, -48(r1) ;# buf
+
+ lwz r0, 0(r4)
+ stw r0, 0(r10)
+ lwz r0, 16(r4)
+ stw r0, 4(r10)
+ lwz r0, 32(r4)
+ stw r0, 8(r10)
+ lwz r0, 48(r4)
+ stw r0, 12(r10)
+
+ lvx v1, 0, r10; ;# v1 = pred = p0..p15
+
+ lwz r0, 0(r3) ;# v3 = d0..d7
+
+ get_two_diff_rows
+
+ vmrghb v2, v0, v1; ;# v2 = 16-bit p0..p7
+ vaddshs v2, v2, v3; ;# v2 = r0..r7
+
+ lwzu r0, 32(r3) ;# v3 = d8..d15
+
+ get_two_diff_rows
+
+ vmrglb v1, v0, v1; ;# v1 = 16-bit p8..p15
+ vaddshs v3, v3, v1; ;# v3 = r8..r15
+
+ vpkshus v2, v2, v3; ;# v2 = 8-bit r0..r15
+ stvx v2, 0, r10; ;# 16 pels to dst from buf
+
+ lwz r0, 0(r10)
+ stw r0, 0(r5)
+ lwz r0, 4(r10)
+ stwux r0, r5, r6
+ lwz r0, 8(r10)
+ stwux r0, r5, r6
+ lwz r0, 12(r10)
+ stwx r0, r5, r6
+
+ lwz r12, -8(r1) ;# restore old VRSAVE from stack
+ mtspr 256, r12 ;# reset old VRSAVE
+
+ blr
diff --git a/vp8/common/ppc/sad_altivec.asm b/vp8/common/ppc/sad_altivec.asm
new file mode 100644
index 000000000..e5f26380f
--- /dev/null
+++ b/vp8/common/ppc/sad_altivec.asm
@@ -0,0 +1,277 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl vp8_sad16x16_ppc
+ .globl vp8_sad16x8_ppc
+ .globl vp8_sad8x16_ppc
+ .globl vp8_sad8x8_ppc
+ .globl vp8_sad4x4_ppc
+
+.macro load_aligned_16 V R O
+ lvsl v3, 0, \R ;# permutate value for alignment
+
+ lvx v1, 0, \R
+ lvx v2, \O, \R
+
+ vperm \V, v1, v2, v3
+.endm
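+
+;# What the lvsl/lvx/lvx/vperm idiom computes, stated as scalar C
+;# (illustrative only): lvsl turns (address & 15) into a permute control,
+;# the two lvx read the aligned quadwords covering the address, and vperm
+;# selects the 16 consecutive bytes starting at the address:
+;#
+;#   static void load_unaligned_16(unsigned char out[16],
+;#                                 const unsigned char *p) {
+;#       unsigned long shift = (unsigned long)p & 15;  /* what lvsl encodes */
+;#       const unsigned char *base = p - shift;        /* 16-byte aligned   */
+;#       int i;
+;#       for (i = 0; i < 16; i++)
+;#           out[i] = base[shift + i];                 /* the vperm select  */
+;#   }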
+
+.macro prologue
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffc0
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1, -32(r1) ;# create space on the stack
+
+ li r10, 16 ;# load offset and loop counter
+
+ vspltisw v8, 0 ;# zero out total to start
+.endm
+
+.macro epilogue
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+.endm
+
+.macro SAD_16
+ ;# v6 = abs (v4 - v5)
+ vsububs v6, v4, v5
+ vsububs v7, v5, v4
+ vor v6, v6, v7
+
+ ;# v8 += abs (v4 - v5)
+ vsum4ubs v8, v6, v8
+.endm
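+
+;# Scalar equivalent of one SAD_16 step (illustrative only): a row's sum of
+;# absolute differences, which vsum4ubs accumulates into the running total:
+;#
+;#   static unsigned int sad_row16(const unsigned char *src,
+;#                                 const unsigned char *ref) {
+;#       unsigned int sad = 0;
+;#       int i;
+;#       for (i = 0; i < 16; i++)
+;#           sad += src[i] > ref[i] ? src[i] - ref[i] : ref[i] - src[i];
+;#       return sad;
+;#   }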
+
+.macro sad_16_loop loop_label
+ lvsl v3, 0, r5 ;# only needs to be done once per block
+
+ ;# preload a line of data before getting into the loop
+ lvx v4, 0, r3
+ lvx v1, 0, r5
+ lvx v2, r10, r5
+
+ add r5, r5, r6
+ add r3, r3, r4
+
+ vperm v5, v1, v2, v3
+
+ .align 4
+\loop_label:
+ ;# compute difference on first row
+ vsububs v6, v4, v5
+ vsububs v7, v5, v4
+
+ ;# load up next set of data
+ lvx v9, 0, r3
+ lvx v1, 0, r5
+ lvx v2, r10, r5
+
+ ;# perform abs() of difference
+ vor v6, v6, v7
+ add r3, r3, r4
+
+ ;# add to the running tally
+ vsum4ubs v8, v6, v8
+
+ ;# now onto the next line
+ vperm v5, v1, v2, v3
+ add r5, r5, r6
+ lvx v4, 0, r3
+
+ ;# compute difference on second row
+ vsububs v6, v9, v5
+ lvx v1, 0, r5
+ vsububs v7, v5, v9
+ lvx v2, r10, r5
+ vor v6, v6, v7
+ add r3, r3, r4
+ vsum4ubs v8, v6, v8
+ vperm v5, v1, v2, v3
+ add r5, r5, r6
+
+ bdnz \loop_label
+
+ vspltisw v7, 0
+
+ vsumsws v8, v8, v7
+
+ stvx v8, 0, r1
+ lwz r3, 12(r1)
+.endm
+
+.macro sad_8_loop loop_label
+ .align 4
+\loop_label:
+ ;# only one of the inputs should need to be aligned.
+ load_aligned_16 v4, r3, r10
+ load_aligned_16 v5, r5, r10
+
+ ;# move onto the next line
+ add r3, r3, r4
+ add r5, r5, r6
+
+ ;# only one of the inputs should need to be aligned.
+ load_aligned_16 v6, r3, r10
+ load_aligned_16 v7, r5, r10
+
+ ;# move onto the next line
+ add r3, r3, r4
+ add r5, r5, r6
+
+ vmrghb v4, v4, v6
+ vmrghb v5, v5, v7
+
+ SAD_16
+
+ bdnz \loop_label
+
+ vspltisw v7, 0
+
+ vsumsws v8, v8, v7
+
+ stvx v8, 0, r1
+ lwz r3, 12(r1)
+.endm
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int ref_stride
+;#
+;# r3 return value
+vp8_sad16x16_ppc:
+
+ prologue
+
+ li r9, 8
+ mtctr r9
+
+ sad_16_loop sad16x16_loop
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int ref_stride
+;#
+;# r3 return value
+vp8_sad16x8_ppc:
+
+ prologue
+
+ li r9, 4
+ mtctr r9
+
+ sad_16_loop sad16x8_loop
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int ref_stride
+;#
+;# r3 return value
+vp8_sad8x16_ppc:
+
+ prologue
+
+ li r9, 8
+ mtctr r9
+
+ sad_8_loop sad8x16_loop
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int ref_stride
+;#
+;# r3 return value
+vp8_sad8x8_ppc:
+
+ prologue
+
+ li r9, 4
+ mtctr r9
+
+ sad_8_loop sad8x8_loop
+
+ epilogue
+
+ blr
+
+.macro transfer_4x4 I P
+ lwz r0, 0(\I)
+ add \I, \I, \P
+
+ lwz r7, 0(\I)
+ add \I, \I, \P
+
+ lwz r8, 0(\I)
+ add \I, \I, \P
+
+ lwz r9, 0(\I)
+
+ stw r0, 0(r1)
+ stw r7, 4(r1)
+ stw r8, 8(r1)
+ stw r9, 12(r1)
+.endm
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int ref_stride
+;#
+;# r3 return value
+vp8_sad4x4_ppc:
+
+ prologue
+
+ transfer_4x4 r3, r4
+ lvx v4, 0, r1
+
+ transfer_4x4 r5, r6
+ lvx v5, 0, r1
+
+ vspltisw v8, 0 ;# zero out total to start
+
+ ;# v6 = abs (v4 - v5)
+ vsububs v6, v4, v5
+ vsububs v7, v5, v4
+ vor v6, v6, v7
+
+ ;# v8 += abs (v4 - v5)
+ vsum4ubs v7, v6, v8
+ vsumsws v7, v7, v8
+
+ stvx v7, 0, r1
+ lwz r3, 12(r1)
+
+ epilogue
+
+ blr
diff --git a/vp8/common/ppc/systemdependent.c b/vp8/common/ppc/systemdependent.c
new file mode 100644
index 000000000..87f4cac72
--- /dev/null
+++ b/vp8/common/ppc/systemdependent.c
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "subpixel.h"
+#include "loopfilter.h"
+#include "recon.h"
+#include "idct.h"
+#include "onyxc_int.h"
+
+void (*vp8_short_idct4x4)(short *input, short *output, int pitch);
+void (*vp8_short_idct4x4_1)(short *input, short *output, int pitch);
+void (*vp8_dc_only_idct)(short input_dc, short *output, int pitch);
+
+extern void (*vp8_post_proc_down_and_across_mb_row)(
+ unsigned char *src_ptr,
+ unsigned char *dst_ptr,
+ int src_pixels_per_line,
+ int dst_pixels_per_line,
+ int cols,
+ unsigned char *f,
+ int size
+);
+
+extern void (*vp8_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols, int flimit);
+extern void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit);
+extern void (*vp8_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols, int flimit);
+extern void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit);
+
+extern void vp8_post_proc_down_and_across_mb_row_c
+(
+ unsigned char *src_ptr,
+ unsigned char *dst_ptr,
+ int src_pixels_per_line,
+ int dst_pixels_per_line,
+ int cols,
+ unsigned char *f,
+ int size
+);
+void vp8_plane_add_noise_c(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a);
+
+extern copy_mem_block_function *vp8_copy_mem16x16;
+extern copy_mem_block_function *vp8_copy_mem8x8;
+extern copy_mem_block_function *vp8_copy_mem8x4;
+
+// PPC
+extern subpixel_predict_function sixtap_predict_ppc;
+extern subpixel_predict_function sixtap_predict8x4_ppc;
+extern subpixel_predict_function sixtap_predict8x8_ppc;
+extern subpixel_predict_function sixtap_predict16x16_ppc;
+extern subpixel_predict_function bilinear_predict4x4_ppc;
+extern subpixel_predict_function bilinear_predict8x4_ppc;
+extern subpixel_predict_function bilinear_predict8x8_ppc;
+extern subpixel_predict_function bilinear_predict16x16_ppc;
+
+extern copy_mem_block_function copy_mem16x16_ppc;
+
+void recon_b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+
+extern void short_idct4x4llm_ppc(short *input, short *output, int pitch);
+
+// Generic C
+extern subpixel_predict_function vp8_sixtap_predict_c;
+extern subpixel_predict_function vp8_sixtap_predict8x4_c;
+extern subpixel_predict_function vp8_sixtap_predict8x8_c;
+extern subpixel_predict_function vp8_sixtap_predict16x16_c;
+extern subpixel_predict_function vp8_bilinear_predict4x4_c;
+extern subpixel_predict_function vp8_bilinear_predict8x4_c;
+extern subpixel_predict_function vp8_bilinear_predict8x8_c;
+extern subpixel_predict_function vp8_bilinear_predict16x16_c;
+
+extern copy_mem_block_function vp8_copy_mem16x16_c;
+extern copy_mem_block_function vp8_copy_mem8x8_c;
+extern copy_mem_block_function vp8_copy_mem8x4_c;
+
+void vp8_recon_b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+void vp8_recon2b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+void vp8_recon4b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+
+extern void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
+extern void vp8_short_idct4x4llm_c(short *input, short *output, int pitch);
+extern void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
+
+// PPC
+extern loop_filter_block_function loop_filter_mbv_ppc;
+extern loop_filter_block_function loop_filter_bv_ppc;
+extern loop_filter_block_function loop_filter_mbh_ppc;
+extern loop_filter_block_function loop_filter_bh_ppc;
+
+extern loop_filter_block_function loop_filter_mbvs_ppc;
+extern loop_filter_block_function loop_filter_bvs_ppc;
+extern loop_filter_block_function loop_filter_mbhs_ppc;
+extern loop_filter_block_function loop_filter_bhs_ppc;
+
+// Generic C
+extern loop_filter_block_function vp8_loop_filter_mbv_c;
+extern loop_filter_block_function vp8_loop_filter_bv_c;
+extern loop_filter_block_function vp8_loop_filter_mbh_c;
+extern loop_filter_block_function vp8_loop_filter_bh_c;
+
+extern loop_filter_block_function vp8_loop_filter_mbvs_c;
+extern loop_filter_block_function vp8_loop_filter_bvs_c;
+extern loop_filter_block_function vp8_loop_filter_mbhs_c;
+extern loop_filter_block_function vp8_loop_filter_bhs_c;
+
+extern loop_filter_block_function *vp8_lf_mbvfull;
+extern loop_filter_block_function *vp8_lf_mbhfull;
+extern loop_filter_block_function *vp8_lf_bvfull;
+extern loop_filter_block_function *vp8_lf_bhfull;
+
+extern loop_filter_block_function *vp8_lf_mbvsimple;
+extern loop_filter_block_function *vp8_lf_mbhsimple;
+extern loop_filter_block_function *vp8_lf_bvsimple;
+extern loop_filter_block_function *vp8_lf_bhsimple;
+
+void vp8_clear_c(void)
+{
+}
+
+void vp8_machine_specific_config(void)
+{
+ // Pure C:
+ vp8_clear_system_state = vp8_clear_c;
+ vp8_recon_b = vp8_recon_b_c;
+ vp8_recon4b = vp8_recon4b_c;
+ vp8_recon2b = vp8_recon2b_c;
+
+ vp8_bilinear_predict16x16 = bilinear_predict16x16_ppc;
+ vp8_bilinear_predict8x8 = bilinear_predict8x8_ppc;
+ vp8_bilinear_predict8x4 = bilinear_predict8x4_ppc;
+ vp8_bilinear_predict = bilinear_predict4x4_ppc;
+
+ vp8_sixtap_predict16x16 = sixtap_predict16x16_ppc;
+ vp8_sixtap_predict8x8 = sixtap_predict8x8_ppc;
+ vp8_sixtap_predict8x4 = sixtap_predict8x4_ppc;
+ vp8_sixtap_predict = sixtap_predict_ppc;
+
+ vp8_short_idct4x4_1 = vp8_short_idct4x4llm_1_c;
+ vp8_short_idct4x4 = short_idct4x4llm_ppc;
+ vp8_dc_only_idct = vp8_dc_only_idct_c;
+
+ vp8_lf_mbvfull = loop_filter_mbv_ppc;
+ vp8_lf_bvfull = loop_filter_bv_ppc;
+ vp8_lf_mbhfull = loop_filter_mbh_ppc;
+ vp8_lf_bhfull = loop_filter_bh_ppc;
+
+ vp8_lf_mbvsimple = loop_filter_mbvs_ppc;
+ vp8_lf_bvsimple = loop_filter_bvs_ppc;
+ vp8_lf_mbhsimple = loop_filter_mbhs_ppc;
+ vp8_lf_bhsimple = loop_filter_bhs_ppc;
+
+ vp8_post_proc_down_and_across_mb_row = vp8_post_proc_down_and_across_mb_row_c;
+ vp8_mbpost_proc_down = vp8_mbpost_proc_down_c;
+ vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_c;
+ vp8_plane_add_noise = vp8_plane_add_noise_c;
+
+ vp8_copy_mem16x16 = copy_mem16x16_ppc;
+ vp8_copy_mem8x8 = vp8_copy_mem8x8_c;
+ vp8_copy_mem8x4 = vp8_copy_mem8x4_c;
+
+}
diff --git a/vp8/common/ppc/variance_altivec.asm b/vp8/common/ppc/variance_altivec.asm
new file mode 100644
index 000000000..fb8d5bb1d
--- /dev/null
+++ b/vp8/common/ppc/variance_altivec.asm
@@ -0,0 +1,375 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl vp8_get8x8var_ppc
+ .globl vp8_get16x16var_ppc
+ .globl vp8_mse16x16_ppc
+ .globl vp8_variance16x16_ppc
+ .globl vp8_variance16x8_ppc
+ .globl vp8_variance8x16_ppc
+ .globl vp8_variance8x8_ppc
+ .globl vp8_variance4x4_ppc
+
+.macro load_aligned_16 V R O
+ lvsl v3, 0, \R ;# permutate value for alignment
+
+ lvx v1, 0, \R
+ lvx v2, \O, \R
+
+ vperm \V, v1, v2, v3
+.endm
+
+.macro prologue
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffc0
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1, -32(r1) ;# create space on the stack
+
+ li r10, 16 ;# load offset and loop counter
+
+ vspltisw v7, 0 ;# zero for merging
+ vspltisw v8, 0 ;# zero out total to start
+ vspltisw v9, 0 ;# zero out total for dif^2
+.endm
+
+.macro epilogue
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+.endm
+
+.macro compute_sum_sse
+    ;# Compute sum first. Unpack so a signed subtract
+    ;# can be used. We only have a halfword signed
+    ;# subtract. Do high, then low.
+ vmrghb v2, v7, v4
+ vmrghb v3, v7, v5
+ vsubshs v2, v2, v3
+ vsum4shs v8, v2, v8
+
+ vmrglb v2, v7, v4
+ vmrglb v3, v7, v5
+ vsubshs v2, v2, v3
+ vsum4shs v8, v2, v8
+
+ ;# Now compute sse.
+ vsububs v2, v4, v5
+ vsububs v3, v5, v4
+ vor v2, v2, v3
+
+ vmsumubm v9, v2, v2, v9
+.endm
+
+.macro variance_16 DS loop_label store_sum
+\loop_label:
+ ;# only one of the inputs should need to be aligned.
+ load_aligned_16 v4, r3, r10
+ load_aligned_16 v5, r5, r10
+
+ ;# move onto the next line
+ add r3, r3, r4
+ add r5, r5, r6
+
+ compute_sum_sse
+
+ bdnz \loop_label
+
+ vsumsws v8, v8, v7
+ vsumsws v9, v9, v7
+
+ stvx v8, 0, r1
+ lwz r3, 12(r1)
+
+ stvx v9, 0, r1
+ lwz r4, 12(r1)
+
+.if \store_sum
+ stw r3, 0(r8) ;# sum
+.endif
+ stw r4, 0(r7) ;# sse
+
+ mullw r3, r3, r3 ;# sum*sum
+ srlwi r3, r3, \DS ;# (sum*sum) >> DS
+ subf r3, r3, r4 ;# sse - ((sum*sum) >> DS)
+.endm
+
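+;# Scalar model of the whole reduction the variance macros implement
+;# (illustrative only): accumulate sum and sse over the w*h pels, then
+;# variance = sse - ((sum*sum) >> DS) with DS = log2(w*h):
+;#
+;#   static unsigned int variance_wxh(const unsigned char *src, int src_stride,
+;#                                    const unsigned char *ref, int ref_stride,
+;#                                    int w, int h, int DS,
+;#                                    unsigned int *sse_out) {
+;#       int r, c, d, sum = 0;
+;#       unsigned int sse = 0;
+;#       for (r = 0; r < h; r++)
+;#           for (c = 0; c < w; c++) {
+;#               d = src[r*src_stride + c] - ref[r*ref_stride + c];
+;#               sum += d;
+;#               sse += (unsigned int)(d * d);
+;#           }
+;#       *sse_out = sse;
+;#       return sse - (((unsigned int)sum * (unsigned int)sum) >> DS);
+;#   }
+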
+.macro variance_8 DS loop_label store_sum
+\loop_label:
+ ;# only one of the inputs should need to be aligned.
+ load_aligned_16 v4, r3, r10
+ load_aligned_16 v5, r5, r10
+
+ ;# move onto the next line
+ add r3, r3, r4
+ add r5, r5, r6
+
+ ;# only one of the inputs should need to be aligned.
+ load_aligned_16 v6, r3, r10
+ load_aligned_16 v0, r5, r10
+
+ ;# move onto the next line
+ add r3, r3, r4
+ add r5, r5, r6
+
+ vmrghb v4, v4, v6
+ vmrghb v5, v5, v0
+
+ compute_sum_sse
+
+ bdnz \loop_label
+
+ vsumsws v8, v8, v7
+ vsumsws v9, v9, v7
+
+ stvx v8, 0, r1
+ lwz r3, 12(r1)
+
+ stvx v9, 0, r1
+ lwz r4, 12(r1)
+
+.if \store_sum
+ stw r3, 0(r8) ;# sum
+.endif
+ stw r4, 0(r7) ;# sse
+
+ mullw r3, r3, r3 ;# sum*sum
+    srlwi   r3, r3, \DS         ;# (sum*sum) >> DS
+    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
+.endm
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *SSE
+;# r8 int *Sum
+;#
+;# r3 return value
+vp8_get8x8var_ppc:
+
+ prologue
+
+ li r9, 4
+ mtctr r9
+
+ variance_8 6, get8x8var_loop, 1
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *SSE
+;# r8 int *Sum
+;#
+;# r3 return value
+vp8_get16x16var_ppc:
+
+ prologue
+
+ mtctr r10
+
+ variance_16 8, get16x16var_loop, 1
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_mse16x16_ppc:
+ prologue
+
+ mtctr r10
+
+mse16x16_loop:
+ ;# only one of the inputs should need to be aligned.
+ load_aligned_16 v4, r3, r10
+ load_aligned_16 v5, r5, r10
+
+ ;# move onto the next line
+ add r3, r3, r4
+ add r5, r5, r6
+
+ ;# Now compute sse.
+ vsububs v2, v4, v5
+ vsububs v3, v5, v4
+ vor v2, v2, v3
+
+ vmsumubm v9, v2, v2, v9
+
+ bdnz mse16x16_loop
+
+ vsumsws v9, v9, v7
+
+ stvx v9, 0, r1
+ lwz r3, 12(r1)
+
+ stvx v9, 0, r1
+ lwz r3, 12(r1)
+
+ stw r3, 0(r7) ;# sse
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_variance16x16_ppc:
+
+ prologue
+
+ mtctr r10
+
+ variance_16 8, variance16x16_loop, 0
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_variance16x8_ppc:
+
+ prologue
+
+ li r9, 8
+ mtctr r9
+
+ variance_16 7, variance16x8_loop, 0
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_variance8x16_ppc:
+
+ prologue
+
+ li r9, 8
+ mtctr r9
+
+ variance_8 7, variance8x16_loop, 0
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_variance8x8_ppc:
+
+ prologue
+
+ li r9, 4
+ mtctr r9
+
+ variance_8 6, variance8x8_loop, 0
+
+ epilogue
+
+ blr
+
+.macro transfer_4x4 I P
+ lwz r0, 0(\I)
+ add \I, \I, \P
+
+ lwz r10,0(\I)
+ add \I, \I, \P
+
+ lwz r8, 0(\I)
+ add \I, \I, \P
+
+ lwz r9, 0(\I)
+
+ stw r0, 0(r1)
+ stw r10, 4(r1)
+ stw r8, 8(r1)
+ stw r9, 12(r1)
+.endm
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_variance4x4_ppc:
+
+ prologue
+
+ transfer_4x4 r3, r4
+ lvx v4, 0, r1
+
+ transfer_4x4 r5, r6
+ lvx v5, 0, r1
+
+ compute_sum_sse
+
+ vsumsws v8, v8, v7
+ vsumsws v9, v9, v7
+
+ stvx v8, 0, r1
+ lwz r3, 12(r1)
+
+ stvx v9, 0, r1
+ lwz r4, 12(r1)
+
+ stw r4, 0(r7) ;# sse
+
+ mullw r3, r3, r3 ;# sum*sum
+ srlwi r3, r3, 4 ;# (sum*sum) >> 4
+ subf r3, r3, r4 ;# sse - ((sum*sum) >> 4)
+
+ epilogue
+
+ blr
diff --git a/vp8/common/ppc/variance_subpixel_altivec.asm b/vp8/common/ppc/variance_subpixel_altivec.asm
new file mode 100644
index 000000000..2308373a1
--- /dev/null
+++ b/vp8/common/ppc/variance_subpixel_altivec.asm
@@ -0,0 +1,865 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl vp8_sub_pixel_variance4x4_ppc
+ .globl vp8_sub_pixel_variance8x8_ppc
+ .globl vp8_sub_pixel_variance8x16_ppc
+ .globl vp8_sub_pixel_variance16x8_ppc
+ .globl vp8_sub_pixel_variance16x16_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+ lis \R0, \LABEL@ha
+ la \R1, \LABEL@l(\R0)
+ lvx \V, \OFF, \R1
+.endm
+
+.macro load_vfilter V0, V1
+ load_c \V0, vfilter_b, r6, r12, r10
+
+ addi r6, r6, 16
+ lvx \V1, r6, r10
+.endm
+
+.macro HProlog jump_label
+ ;# load up horizontal filter
+ slwi. r5, r5, 4 ;# index into horizontal filter array
+
+ ;# index to the next set of vectors in the row.
+ li r10, 16
+
+ ;# downshift by 7 ( divide by 128 ) at the end
+ vspltish v19, 7
+
+ ;# If there isn't any filtering to be done for the horizontal, then
+ ;# just skip to the second pass.
+ beq \jump_label
+
+ load_c v20, hfilter_b, r5, r12, r0
+
+ ;# setup constants
+    ;# v28 = output permutation
+ load_c v28, b_hperm_b, 0, r12, r0
+
+ ;# index to the next set of vectors in the row.
+ li r12, 32
+
+ ;# rounding added in on the multiply
+ vspltisw v21, 8
+ vspltisw v18, 3
+ vslw v18, v21, v18 ;# 0x00000040000000400000004000000040
+
+ slwi. r6, r6, 5 ;# index into vertical filter array
+.endm
+
+;# Filters a horizontal line
+;# expects:
+;# r3 src_ptr
+;# r4 pitch
+;# r10 16
+;# r12 32
+;# v17 perm input
+;# v18 rounding
+;# v19 shift
+;# v20 filter taps
+;# v21 tmp
+;# v22 tmp
+;# v23 tmp
+;# v24 tmp
+;# v25 tmp
+;# v26 tmp
+;# v27 tmp
+;# v28 perm output
+;#
+
+.macro hfilter_8 V, hp, lp, increment_counter
+ lvsl v17, 0, r3 ;# permutate value for alignment
+
+ ;# input to filter is 9 bytes wide, output is 8 bytes.
+ lvx v21, 0, r3
+ lvx v22, r10, r3
+
+.if \increment_counter
+ add r3, r3, r4
+.endif
+ vperm v21, v21, v22, v17
+
+    vperm   v24, v21, v21, \hp  ;# v24 = 0123 1234 2345 3456
+    vperm   v25, v21, v21, \lp  ;# v25 = 4567 5678 6789 789A
+
+ vmsummbm v24, v20, v24, v18
+ vmsummbm v25, v20, v25, v18
+
+ vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
+
+ vsrh v24, v24, v19 ;# divide v0, v1 by 128
+
+ vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result
+.endm
+
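+;# Assuming the usual 2-tap bilinear taps (f0 + f1 = 128) stored in
+;# hfilter_b, the net effect of one hfilter_8 on a row is the following
+;# scalar first pass (illustrative only; the vector code folds the +64
+;# rounding into the multiply-sum and scrambles/unscrambles the lane order):
+;#
+;#   static void hfilter8(const unsigned char *src, unsigned char *dst,
+;#                        const short f[2]) {
+;#       int i, v;
+;#       for (i = 0; i < 8; i++) {
+;#           v = (src[i] * f[0] + src[i + 1] * f[1] + 64) >> 7;
+;#           dst[i] = v < 0 ? 0 : v > 255 ? 255 : (unsigned char)v;
+;#       }
+;#   }
+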
+.macro vfilter_16 P0 P1
+ vmuleub v22, \P0, v20 ;# 64 + 4 positive taps
+ vadduhm v22, v18, v22
+ vmuloub v23, \P0, v20
+ vadduhm v23, v18, v23
+
+ vmuleub v24, \P1, v21
+ vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary
+ vmuloub v25, \P1, v21
+ vadduhm v23, v23, v25 ;# Ro = odds
+
+ vsrh v22, v22, v19 ;# divide by 128
+    vsrh    v23, v23, v19       ;# v22 v23 = evens, odds
+    vmrghh  \P0, v22, v23       ;# 16-bit result in order
+ vmrglh v23, v22, v23
+ vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result
+.endm
+
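+;# The matching scalar second pass (illustrative only), again assuming
+;# 2-tap bilinear taps that sum to 128 in vfilter_b: each output pel blends
+;# the same column of two successive first-pass rows:
+;#
+;#   static void vfilter16(const unsigned char *row0, const unsigned char *row1,
+;#                         unsigned char *dst, const short f[2]) {
+;#       int i, v;
+;#       for (i = 0; i < 16; i++) {
+;#           v = (row0[i] * f[0] + row1[i] * f[1] + 64) >> 7;
+;#           dst[i] = v < 0 ? 0 : v > 255 ? 255 : (unsigned char)v;
+;#       }
+;#   }
+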
+.macro compute_sum_sse src, ref, sum, sse, t1, t2, z0
+    ;# Compute sum first. Unpack so a signed subtract
+    ;# can be used. We only have a halfword signed
+    ;# subtract. Do high, then low.
+ vmrghb \t1, \z0, \src
+ vmrghb \t2, \z0, \ref
+ vsubshs \t1, \t1, \t2
+ vsum4shs \sum, \t1, \sum
+
+ vmrglb \t1, \z0, \src
+ vmrglb \t2, \z0, \ref
+ vsubshs \t1, \t1, \t2
+ vsum4shs \sum, \t1, \sum
+
+ ;# Now compute sse.
+ vsububs \t1, \src, \ref
+ vsububs \t2, \ref, \src
+ vor \t1, \t1, \t2
+
+ vmsumubm \sse, \t1, \t1, \sse
+.endm
+
+.macro variance_final sum, sse, z0, DS
+ vsumsws \sum, \sum, \z0
+ vsumsws \sse, \sse, \z0
+
+ stvx \sum, 0, r1
+ lwz r3, 12(r1)
+
+ stvx \sse, 0, r1
+ lwz r4, 12(r1)
+
+ stw r4, 0(r9) ;# sse
+
+ mullw r3, r3, r3 ;# sum*sum
+    srlwi   r3, r3, \DS         ;# (sum*sum) >> DS
+    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
+.endm
+
+.macro compute_sum_sse_16 V, increment_counter
+ load_and_align_16 v16, r7, r8, \increment_counter
+ compute_sum_sse \V, v16, v18, v19, v20, v21, v23
+.endm
+
+.macro load_and_align_16 V, R, P, increment_counter
+ lvsl v17, 0, \R ;# permutate value for alignment
+
+    ;# the 16 bytes of output may come from two source vectors
+    ;# if the input is not aligned correctly.
+ lvx v21, 0, \R
+ lvx v22, r10, \R
+
+.if \increment_counter
+ add \R, \R, \P
+.endif
+
+ vperm \V, v21, v22, v17
+.endm
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_pixels_per_line
+;# r5 int xoffset
+;# r6 int yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp8_sub_pixel_variance4x4_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xf830
+ ori r12, r12, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ HProlog second_pass_4x4_pre_copy_b
+
+ ;# Load up permutation constants
+ load_c v10, b_0123_b, 0, r12, r0
+ load_c v11, b_4567_b, 0, r12, r0
+
+ hfilter_8 v0, v10, v11, 1
+ hfilter_8 v1, v10, v11, 1
+ hfilter_8 v2, v10, v11, 1
+ hfilter_8 v3, v10, v11, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq compute_sum_sse_4x4_b
+
+ hfilter_8 v4, v10, v11, 0
+
+ b second_pass_4x4_b
+
+second_pass_4x4_pre_copy_b:
+ slwi r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_16 v0, r3, r4, 1
+ load_and_align_16 v1, r3, r4, 1
+ load_and_align_16 v2, r3, r4, 1
+ load_and_align_16 v3, r3, r4, 1
+ load_and_align_16 v4, r3, r4, 0
+
+second_pass_4x4_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+
+compute_sum_sse_4x4_b:
+ vspltish v18, 0 ;# sum
+ vspltish v19, 0 ;# sse
+ vspltish v23, 0 ;# unpack
+ li r10, 16
+
+ load_and_align_16 v4, r7, r8, 1
+ load_and_align_16 v5, r7, r8, 1
+ load_and_align_16 v6, r7, r8, 1
+ load_and_align_16 v7, r7, r8, 1
+
+ vmrghb v0, v0, v1
+ vmrghb v1, v2, v3
+
+ vmrghb v2, v4, v5
+ vmrghb v3, v6, v7
+
+ load_c v10, b_hilo_b, 0, r12, r0
+
+ vperm v0, v0, v1, v10
+ vperm v1, v2, v3, v10
+
+ compute_sum_sse v0, v1, v18, v19, v20, v21, v23
+
+ variance_final v18, v19, v23, 4
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_pixels_per_line
+;# r5 int xoffset
+;# r6 int yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp8_sub_pixel_variance8x8_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xfff0
+ ori r12, r12, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ HProlog second_pass_8x8_pre_copy_b
+
+ ;# Load up permutation constants
+ load_c v10, b_0123_b, 0, r12, r0
+ load_c v11, b_4567_b, 0, r12, r0
+
+ hfilter_8 v0, v10, v11, 1
+ hfilter_8 v1, v10, v11, 1
+ hfilter_8 v2, v10, v11, 1
+ hfilter_8 v3, v10, v11, 1
+ hfilter_8 v4, v10, v11, 1
+ hfilter_8 v5, v10, v11, 1
+ hfilter_8 v6, v10, v11, 1
+ hfilter_8 v7, v10, v11, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq compute_sum_sse_8x8_b
+
+ hfilter_8 v8, v10, v11, 0
+
+ b second_pass_8x8_b
+
+second_pass_8x8_pre_copy_b:
+ slwi. r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_16 v0, r3, r4, 1
+ load_and_align_16 v1, r3, r4, 1
+ load_and_align_16 v2, r3, r4, 1
+ load_and_align_16 v3, r3, r4, 1
+ load_and_align_16 v4, r3, r4, 1
+ load_and_align_16 v5, r3, r4, 1
+ load_and_align_16 v6, r3, r4, 1
+ load_and_align_16 v7, r3, r4, 1
+ load_and_align_16 v8, r3, r4, 0
+
+ beq compute_sum_sse_8x8_b
+
+second_pass_8x8_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+ vfilter_16 v4, v5
+ vfilter_16 v5, v6
+ vfilter_16 v6, v7
+ vfilter_16 v7, v8
+
+compute_sum_sse_8x8_b:
+ vspltish v18, 0 ;# sum
+ vspltish v19, 0 ;# sse
+ vspltish v23, 0 ;# unpack
+ li r10, 16
+
+ vmrghb v0, v0, v1
+ vmrghb v1, v2, v3
+ vmrghb v2, v4, v5
+ vmrghb v3, v6, v7
+
+ load_and_align_16 v4, r7, r8, 1
+ load_and_align_16 v5, r7, r8, 1
+ load_and_align_16 v6, r7, r8, 1
+ load_and_align_16 v7, r7, r8, 1
+ load_and_align_16 v8, r7, r8, 1
+ load_and_align_16 v9, r7, r8, 1
+ load_and_align_16 v10, r7, r8, 1
+ load_and_align_16 v11, r7, r8, 0
+
+ vmrghb v4, v4, v5
+ vmrghb v5, v6, v7
+ vmrghb v6, v8, v9
+ vmrghb v7, v10, v11
+
+ compute_sum_sse v0, v4, v18, v19, v20, v21, v23
+ compute_sum_sse v1, v5, v18, v19, v20, v21, v23
+ compute_sum_sse v2, v6, v18, v19, v20, v21, v23
+ compute_sum_sse v3, v7, v18, v19, v20, v21, v23
+
+ variance_final v18, v19, v23, 6
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_pixels_per_line
+;# r5 int xoffset
+;# r6 int yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp8_sub_pixel_variance8x16_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xfffc
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ HProlog second_pass_8x16_pre_copy_b
+
+ ;# Load up permutation constants
+ load_c v29, b_0123_b, 0, r12, r0
+ load_c v30, b_4567_b, 0, r12, r0
+
+ hfilter_8 v0, v29, v30, 1
+ hfilter_8 v1, v29, v30, 1
+ hfilter_8 v2, v29, v30, 1
+ hfilter_8 v3, v29, v30, 1
+ hfilter_8 v4, v29, v30, 1
+ hfilter_8 v5, v29, v30, 1
+ hfilter_8 v6, v29, v30, 1
+ hfilter_8 v7, v29, v30, 1
+ hfilter_8 v8, v29, v30, 1
+ hfilter_8 v9, v29, v30, 1
+ hfilter_8 v10, v29, v30, 1
+ hfilter_8 v11, v29, v30, 1
+ hfilter_8 v12, v29, v30, 1
+ hfilter_8 v13, v29, v30, 1
+ hfilter_8 v14, v29, v30, 1
+ hfilter_8 v15, v29, v30, 1
+
+    ;# Finished filtering main horizontal block. If there is no
+    ;# vertical filtering, jump straight to computing the sum and sse.
+    ;# Otherwise load up and filter the additional line that is needed
+    ;# for the vertical filter.
+ beq compute_sum_sse_8x16_b
+
+ hfilter_8 v16, v29, v30, 0
+
+ b second_pass_8x16_b
+
+second_pass_8x16_pre_copy_b:
+ slwi. r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_16 v0, r3, r4, 1
+ load_and_align_16 v1, r3, r4, 1
+ load_and_align_16 v2, r3, r4, 1
+ load_and_align_16 v3, r3, r4, 1
+ load_and_align_16 v4, r3, r4, 1
+ load_and_align_16 v5, r3, r4, 1
+ load_and_align_16 v6, r3, r4, 1
+ load_and_align_16 v7, r3, r4, 1
+ load_and_align_16 v8, r3, r4, 1
+ load_and_align_16 v9, r3, r4, 1
+ load_and_align_16 v10, r3, r4, 1
+ load_and_align_16 v11, r3, r4, 1
+ load_and_align_16 v12, r3, r4, 1
+ load_and_align_16 v13, r3, r4, 1
+ load_and_align_16 v14, r3, r4, 1
+ load_and_align_16 v15, r3, r4, 1
+ load_and_align_16 v16, r3, r4, 0
+
+ beq compute_sum_sse_8x16_b
+
+second_pass_8x16_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+ vfilter_16 v4, v5
+ vfilter_16 v5, v6
+ vfilter_16 v6, v7
+ vfilter_16 v7, v8
+ vfilter_16 v8, v9
+ vfilter_16 v9, v10
+ vfilter_16 v10, v11
+ vfilter_16 v11, v12
+ vfilter_16 v12, v13
+ vfilter_16 v13, v14
+ vfilter_16 v14, v15
+ vfilter_16 v15, v16
+
+compute_sum_sse_8x16_b:
+ vspltish v18, 0 ;# sum
+ vspltish v19, 0 ;# sse
+ vspltish v23, 0 ;# unpack
+ li r10, 16
+
+ vmrghb v0, v0, v1
+ vmrghb v1, v2, v3
+ vmrghb v2, v4, v5
+ vmrghb v3, v6, v7
+ vmrghb v4, v8, v9
+ vmrghb v5, v10, v11
+ vmrghb v6, v12, v13
+ vmrghb v7, v14, v15
+
+ load_and_align_16 v8, r7, r8, 1
+ load_and_align_16 v9, r7, r8, 1
+ load_and_align_16 v10, r7, r8, 1
+ load_and_align_16 v11, r7, r8, 1
+ load_and_align_16 v12, r7, r8, 1
+ load_and_align_16 v13, r7, r8, 1
+ load_and_align_16 v14, r7, r8, 1
+ load_and_align_16 v15, r7, r8, 1
+
+ vmrghb v8, v8, v9
+ vmrghb v9, v10, v11
+ vmrghb v10, v12, v13
+ vmrghb v11, v14, v15
+
+ compute_sum_sse v0, v8, v18, v19, v20, v21, v23
+ compute_sum_sse v1, v9, v18, v19, v20, v21, v23
+ compute_sum_sse v2, v10, v18, v19, v20, v21, v23
+ compute_sum_sse v3, v11, v18, v19, v20, v21, v23
+
+ load_and_align_16 v8, r7, r8, 1
+ load_and_align_16 v9, r7, r8, 1
+ load_and_align_16 v10, r7, r8, 1
+ load_and_align_16 v11, r7, r8, 1
+ load_and_align_16 v12, r7, r8, 1
+ load_and_align_16 v13, r7, r8, 1
+ load_and_align_16 v14, r7, r8, 1
+ load_and_align_16 v15, r7, r8, 0
+
+ vmrghb v8, v8, v9
+ vmrghb v9, v10, v11
+ vmrghb v10, v12, v13
+ vmrghb v11, v14, v15
+
+ compute_sum_sse v4, v8, v18, v19, v20, v21, v23
+ compute_sum_sse v5, v9, v18, v19, v20, v21, v23
+ compute_sum_sse v6, v10, v18, v19, v20, v21, v23
+ compute_sum_sse v7, v11, v18, v19, v20, v21, v23
+
+ variance_final v18, v19, v23, 7
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+ blr
+
+;# Filters a horizontal line
+;# expects:
+;# r3 src_ptr
+;# r4 pitch
+;# r10 16
+;# r12 32
+;# v17 perm input
+;# v18 rounding
+;# v19 shift
+;# v20 filter taps
+;# v21 tmp
+;# v22 tmp
+;# v23 tmp
+;# v24 tmp
+;# v25 tmp
+;# v26 tmp
+;# v27 tmp
+;# v28 perm output
+;#
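+;# This is the first-pass bilinear filter: each output pixel is
+;# (128 - 16*xoffset)*src[i] + 16*xoffset*src[i+1], rounded and then
+;# divided by 128 (the tap pairs in hfilter_b sum to 128). vmsummbm
+;# computes four outputs per instruction from 4-byte tap groups whose
+;# last two taps are zero; the four shifted "sets" below cover outputs
+;# 0 4 8 C, 1 5 9 D, 2 6 A E and 3 7 B F, and the final vperm through
+;# v28 (presumably the b_hperm_b pattern) restores sequential order.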
+.macro hfilter_16 V, increment_counter
+
+ lvsl v17, 0, r3 ;# permutate value for alignment
+
+ ;# input to filter is 21 bytes wide, output is 16 bytes.
+    ;# The input can span three vectors if it is not aligned correctly.
+ lvx v21, 0, r3
+ lvx v22, r10, r3
+ lvx v23, r12, r3
+
+.if \increment_counter
+ add r3, r3, r4
+.endif
+ vperm v21, v21, v22, v17
+    vperm v22, v22, v23, v17 ;# v21 v22 = 21 input pixels left-justified
+
+ ;# set 0
+ vmsummbm v24, v20, v21, v18 ;# taps times elements
+
+ ;# set 1
+ vsldoi v23, v21, v22, 1
+ vmsummbm v25, v20, v23, v18
+
+ ;# set 2
+ vsldoi v23, v21, v22, 2
+ vmsummbm v26, v20, v23, v18
+
+ ;# set 3
+ vsldoi v23, v21, v22, 3
+ vmsummbm v27, v20, v23, v18
+
+ vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
+ vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F
+
+    vsrh v24, v24, v19 ;# divide v24, v25 by 128
+ vsrh v25, v25, v19
+
+ vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result
+ vperm \V, \V, v0, v28 ;# \V = correctly-ordered result
+.endm
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_pixels_per_line
+;# r5 int xoffset
+;# r6 int yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp8_sub_pixel_variance16x8_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1, -32(r1) ;# create space on the stack
+
+ HProlog second_pass_16x8_pre_copy_b
+
+ hfilter_16 v0, 1
+ hfilter_16 v1, 1
+ hfilter_16 v2, 1
+ hfilter_16 v3, 1
+ hfilter_16 v4, 1
+ hfilter_16 v5, 1
+ hfilter_16 v6, 1
+ hfilter_16 v7, 1
+
+    ;# Finished filtering main horizontal block. If there is no
+    ;# vertical filtering, jump straight to computing the sum and sse.
+    ;# Otherwise load up and filter the additional line that is needed
+    ;# for the vertical filter.
+ beq compute_sum_sse_16x8_b
+
+ hfilter_16 v8, 0
+
+ b second_pass_16x8_b
+
+second_pass_16x8_pre_copy_b:
+ slwi. r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_16 v0, r3, r4, 1
+ load_and_align_16 v1, r3, r4, 1
+ load_and_align_16 v2, r3, r4, 1
+ load_and_align_16 v3, r3, r4, 1
+ load_and_align_16 v4, r3, r4, 1
+ load_and_align_16 v5, r3, r4, 1
+ load_and_align_16 v6, r3, r4, 1
+ load_and_align_16 v7, r3, r4, 1
+ load_and_align_16 v8, r3, r4, 1
+
+ beq compute_sum_sse_16x8_b
+
+second_pass_16x8_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+ vfilter_16 v4, v5
+ vfilter_16 v5, v6
+ vfilter_16 v6, v7
+ vfilter_16 v7, v8
+
+compute_sum_sse_16x8_b:
+ vspltish v18, 0 ;# sum
+ vspltish v19, 0 ;# sse
+ vspltish v23, 0 ;# unpack
+ li r10, 16
+
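+    ;# compute_sum_sse_16 consumes one filtered 16-pixel row per call;
+    ;# the trailing flag presumably advances the reference pointer in
+    ;# r7, with 0 on the final row, mirroring increment_counter above.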
+ compute_sum_sse_16 v0, 1
+ compute_sum_sse_16 v1, 1
+ compute_sum_sse_16 v2, 1
+ compute_sum_sse_16 v3, 1
+ compute_sum_sse_16 v4, 1
+ compute_sum_sse_16 v5, 1
+ compute_sum_sse_16 v6, 1
+ compute_sum_sse_16 v7, 0
+
+ variance_final v18, v19, v23, 7
+
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_pixels_per_line
+;# r5 int xoffset
+;# r6 int yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp8_sub_pixel_variance16x16_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1, -32(r1) ;# create space on the stack
+
+ HProlog second_pass_16x16_pre_copy_b
+
+ hfilter_16 v0, 1
+ hfilter_16 v1, 1
+ hfilter_16 v2, 1
+ hfilter_16 v3, 1
+ hfilter_16 v4, 1
+ hfilter_16 v5, 1
+ hfilter_16 v6, 1
+ hfilter_16 v7, 1
+ hfilter_16 v8, 1
+ hfilter_16 v9, 1
+ hfilter_16 v10, 1
+ hfilter_16 v11, 1
+ hfilter_16 v12, 1
+ hfilter_16 v13, 1
+ hfilter_16 v14, 1
+ hfilter_16 v15, 1
+
+    ;# Finished filtering main horizontal block. If there is no
+    ;# vertical filtering, jump straight to computing the sum and sse.
+    ;# Otherwise load up and filter the additional line that is needed
+    ;# for the vertical filter.
+ beq compute_sum_sse_16x16_b
+
+ hfilter_16 v16, 0
+
+ b second_pass_16x16_b
+
+second_pass_16x16_pre_copy_b:
+ slwi. r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_16 v0, r3, r4, 1
+ load_and_align_16 v1, r3, r4, 1
+ load_and_align_16 v2, r3, r4, 1
+ load_and_align_16 v3, r3, r4, 1
+ load_and_align_16 v4, r3, r4, 1
+ load_and_align_16 v5, r3, r4, 1
+ load_and_align_16 v6, r3, r4, 1
+ load_and_align_16 v7, r3, r4, 1
+ load_and_align_16 v8, r3, r4, 1
+ load_and_align_16 v9, r3, r4, 1
+ load_and_align_16 v10, r3, r4, 1
+ load_and_align_16 v11, r3, r4, 1
+ load_and_align_16 v12, r3, r4, 1
+ load_and_align_16 v13, r3, r4, 1
+ load_and_align_16 v14, r3, r4, 1
+ load_and_align_16 v15, r3, r4, 1
+ load_and_align_16 v16, r3, r4, 0
+
+ beq compute_sum_sse_16x16_b
+
+second_pass_16x16_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+ vfilter_16 v4, v5
+ vfilter_16 v5, v6
+ vfilter_16 v6, v7
+ vfilter_16 v7, v8
+ vfilter_16 v8, v9
+ vfilter_16 v9, v10
+ vfilter_16 v10, v11
+ vfilter_16 v11, v12
+ vfilter_16 v12, v13
+ vfilter_16 v13, v14
+ vfilter_16 v14, v15
+ vfilter_16 v15, v16
+
+compute_sum_sse_16x16_b:
+ vspltish v18, 0 ;# sum
+ vspltish v19, 0 ;# sse
+ vspltish v23, 0 ;# unpack
+ li r10, 16
+
+ compute_sum_sse_16 v0, 1
+ compute_sum_sse_16 v1, 1
+ compute_sum_sse_16 v2, 1
+ compute_sum_sse_16 v3, 1
+ compute_sum_sse_16 v4, 1
+ compute_sum_sse_16 v5, 1
+ compute_sum_sse_16 v6, 1
+ compute_sum_sse_16 v7, 1
+ compute_sum_sse_16 v8, 1
+ compute_sum_sse_16 v9, 1
+ compute_sum_sse_16 v10, 1
+ compute_sum_sse_16 v11, 1
+ compute_sum_sse_16 v12, 1
+ compute_sum_sse_16 v13, 1
+ compute_sum_sse_16 v14, 1
+ compute_sum_sse_16 v15, 0
+
+ variance_final v18, v19, v23, 8
+
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .data
+
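+;# First-pass (horizontal) bilinear taps, one 16-byte row per xoffset
+;# 0-7: row x repeats the group (128 - 16*x, 16*x, 0, 0) four times so
+;# a row can be fed directly to vmsummbm as per-pixel tap groups.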
+ .align 4
+hfilter_b:
+ .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0
+ .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0
+ .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0
+ .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0
+ .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0
+ .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0
+ .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0
+ .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0
+
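+;# Second-pass (vertical) bilinear taps: each yoffset owns a 32-byte
+;# pair of rows, the first splatting 128 - 16*y and the second 16*y,
+;# which is why the second pass scales yoffset by 32 (slwi r6, r6, 5).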
+ .align 4
+vfilter_b:
+ .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
+ .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+ .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
+ .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+ .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
+ .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
+ .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+ .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+ .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
+ .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
+ .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+ .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
+ .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+ .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
+
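+;# Permute pattern that restores sequential pixel order after the packs
+;# in hfilter_16 leave bytes in 0 4 8 C 1 5 9 D ... order; presumably
+;# loaded into v28, the "perm output" register, by the prolog.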
+ .align 4
+b_hperm_b:
+ .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+
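+;# Together with b_4567_b below, gathers the overlapping 4-byte source
+;# groups for the 8-pixel-wide first pass (hfilter_8): groups starting
+;# at bytes 0-3 here and at bytes 4-7 in b_4567_b.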
+ .align 4
+b_0123_b:
+ .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+
+ .align 4
+b_4567_b:
+ .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+
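+;# Concatenates the first eight bytes of each of two source operands
+;# (vperm indices 16-23 select from the second vector); presumably used
+;# to pair two 8-pixel results into one 16-byte vector.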
+b_hilo_b:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23