Diffstat (limited to 'vp8/common/ppc')
-rw-r--r-- | vp8/common/ppc/copy_altivec.asm | 47
-rw-r--r-- | vp8/common/ppc/filter_altivec.asm | 1013
-rw-r--r-- | vp8/common/ppc/filter_bilinear_altivec.asm | 677
-rw-r--r-- | vp8/common/ppc/idctllm_altivec.asm | 189
-rw-r--r-- | vp8/common/ppc/loopfilter_altivec.c | 135
-rw-r--r-- | vp8/common/ppc/loopfilter_filters_altivec.asm | 1253
-rw-r--r-- | vp8/common/ppc/platform_altivec.asm | 59
-rw-r--r-- | vp8/common/ppc/recon_altivec.asm | 175
-rw-r--r-- | vp8/common/ppc/sad_altivec.asm | 277
-rw-r--r-- | vp8/common/ppc/systemdependent.c | 170
-rw-r--r-- | vp8/common/ppc/variance_altivec.asm | 375
-rw-r--r-- | vp8/common/ppc/variance_subpixel_altivec.asm | 865 |
12 files changed, 5235 insertions(+), 0 deletions(-)
diff --git a/vp8/common/ppc/copy_altivec.asm b/vp8/common/ppc/copy_altivec.asm new file mode 100644 index 000000000..a4ce91583 --- /dev/null +++ b/vp8/common/ppc/copy_altivec.asm @@ -0,0 +1,47 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + .globl copy_mem16x16_ppc + +;# r3 unsigned char *src +;# r4 int src_stride +;# r5 unsigned char *dst +;# r6 int dst_stride + +;# Make the assumption that input will not be aligned, +;# but the output will be. So two reads and a perm +;# for the input, but only one store for the output. +copy_mem16x16_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xe000 + mtspr 256, r12 ;# set VRSAVE + + li r10, 16 + mtctr r10 + +cp_16x16_loop: + lvsl v0, 0, r3 ;# permutate value for alignment + + lvx v1, 0, r3 + lvx v2, r10, r3 + + vperm v1, v1, v2, v0 + + stvx v1, 0, r5 + + add r3, r3, r4 ;# increment source pointer + add r5, r5, r6 ;# increment destination pointer + + bdnz cp_16x16_loop + + mtspr 256, r11 ;# reset old VRSAVE + + blr diff --git a/vp8/common/ppc/filter_altivec.asm b/vp8/common/ppc/filter_altivec.asm new file mode 100644 index 000000000..4da2e94f9 --- /dev/null +++ b/vp8/common/ppc/filter_altivec.asm @@ -0,0 +1,1013 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + .globl sixtap_predict_ppc + .globl sixtap_predict8x4_ppc + .globl sixtap_predict8x8_ppc + .globl sixtap_predict16x16_ppc + +.macro load_c V, LABEL, OFF, R0, R1 + lis \R0, \LABEL@ha + la \R1, \LABEL@l(\R0) + lvx \V, \OFF, \R1 +.endm + +.macro load_hfilter V0, V1 + load_c \V0, HFilter, r5, r9, r10 + + addi r5, r5, 16 + lvx \V1, r5, r10 +.endm + +;# Vertical filtering +.macro Vprolog + load_c v0, VFilter, r6, r3, r10 + + vspltish v5, 8 + vspltish v6, 3 + vslh v6, v5, v6 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + vspltb v1, v0, 1 + vspltb v2, v0, 2 + vspltb v3, v0, 3 + vspltb v4, v0, 4 + vspltb v5, v0, 5 + vspltb v0, v0, 0 +.endm + +.macro vpre_load + Vprolog + li r10, 16 + lvx v10, 0, r9 ;# v10..v14 = first 5 rows + lvx v11, r10, r9 + addi r9, r9, 32 + lvx v12, 0, r9 + lvx v13, r10, r9 + addi r9, r9, 32 + lvx v14, 0, r9 +.endm + +.macro Msum Re, Ro, V, T, TMP + ;# (Re,Ro) += (V*T) + vmuleub \TMP, \V, \T ;# trashes v8 + vadduhm \Re, \Re, \TMP ;# Re = evens, saturation unnecessary + vmuloub \TMP, \V, \T + vadduhm \Ro, \Ro, \TMP ;# Ro = odds +.endm + +.macro vinterp_no_store P0 P1 P2 P3 P4 P5 + vmuleub v8, \P0, v0 ;# 64 + 4 positive taps + vadduhm v16, v6, v8 + vmuloub v8, \P0, v0 + vadduhm v17, v6, v8 + Msum v16, v17, \P2, v2, v8 + Msum v16, v17, \P3, v3, v8 + Msum v16, v17, \P5, v5, v8 + + vmuleub v18, \P1, v1 ;# 2 negative taps + vmuloub v19, \P1, v1 + Msum v18, v19, \P4, v4, v8 + + vsubuhs v16, v16, v18 ;# subtract neg from pos + vsubuhs v17, v17, v19 + vsrh v16, v16, v7 ;# divide by 128 + vsrh v17, v17, v7 ;# v16 v17 = evens, odds + vmrghh v18, v16, v17 ;# v18 v19 = 16-bit result in order + vmrglh v19, v16, v17 + vpkuhus \P0, v18, v19 ;# P0 = 8-bit result +.endm + +.macro vinterp_no_store_8x8 P0 P1 P2 P3 P4 P5 + vmuleub v24, \P0, v13 ;# 64 + 4 positive taps + vadduhm v21, v20, v24 + vmuloub v24, \P0, v13 + vadduhm v22, v20, v24 + Msum v21, v22, \P2, v15, v25 + Msum v21, v22, \P3, v16, v25 + Msum v21, v22, \P5, v18, v25 + + vmuleub v23, \P1, v14 ;# 2 negative taps + vmuloub v24, \P1, v14 + Msum v23, v24, \P4, v17, v25 + + vsubuhs v21, v21, v23 ;# subtract neg from pos + vsubuhs v22, v22, v24 + vsrh v21, v21, v19 ;# divide by 128 + vsrh v22, v22, v19 ;# v16 v17 = evens, odds + vmrghh v23, v21, v22 ;# v18 v19 = 16-bit result in order + vmrglh v24, v21, v22 + vpkuhus \P0, v23, v24 ;# P0 = 8-bit result +.endm + + +.macro Vinterp P0 P1 P2 P3 P4 P5 + vinterp_no_store \P0, \P1, \P2, \P3, \P4, \P5 + stvx \P0, 0, r7 + add r7, r7, r8 ;# 33 ops per 16 pels +.endm + + +.macro luma_v P0, P1, P2, P3, P4, P5 + addi r9, r9, 16 ;# P5 = newest input row + lvx \P5, 0, r9 + Vinterp \P0, \P1, \P2, \P3, \P4, \P5 +.endm + +.macro luma_vtwo + luma_v v10, v11, v12, v13, v14, v15 + luma_v v11, v12, v13, v14, v15, v10 +.endm + +.macro luma_vfour + luma_vtwo + luma_v v12, v13, v14, v15, v10, v11 + luma_v v13, v14, v15, v10, v11, v12 +.endm + +.macro luma_vsix + luma_vfour + luma_v v14, v15, v10, v11, v12, v13 + luma_v v15, v10, v11, v12, v13, v14 +.endm + +.macro Interp4 R I I4 + vmsummbm \R, v13, \I, v15 + vmsummbm \R, v14, \I4, \R +.endm + +.macro Read8x8 VD, RS, RP, increment_counter + lvsl v21, 0, \RS ;# permutate value for alignment + + ;# input to filter is 21 bytes wide, output is 16 bytes. + ;# input will can span three vectors if not aligned correctly. 
+ lvx \VD, 0, \RS + lvx v20, r10, \RS + +.if \increment_counter + add \RS, \RS, \RP +.endif + + vperm \VD, \VD, v20, v21 +.endm + +.macro interp_8x8 R + vperm v20, \R, \R, v16 ;# v20 = 0123 1234 2345 3456 + vperm v21, \R, \R, v17 ;# v21 = 4567 5678 6789 789A + Interp4 v20, v20, v21 ;# v20 = result 0 1 2 3 + vperm \R, \R, \R, v18 ;# R = 89AB 9ABC ABCx BCxx + Interp4 v21, v21, \R ;# v21 = result 4 5 6 7 + + vpkswus \R, v20, v21 ;# R = 0 1 2 3 4 5 6 7 + vsrh \R, \R, v19 + + vpkuhus \R, \R, \R ;# saturate and pack + +.endm + +.macro Read4x4 VD, RS, RP, increment_counter + lvsl v21, 0, \RS ;# permutate value for alignment + + ;# input to filter is 21 bytes wide, output is 16 bytes. + ;# input will can span three vectors if not aligned correctly. + lvx v20, 0, \RS + +.if \increment_counter + add \RS, \RS, \RP +.endif + + vperm \VD, v20, v20, v21 +.endm + .text + + .align 2 +;# r3 unsigned char * src +;# r4 int src_pitch +;# r5 int x_offset +;# r6 int y_offset +;# r7 unsigned char * dst +;# r8 int dst_pitch +sixtap_predict_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xff87 + ori r12, r12, 0xffc0 + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + slwi. r5, r5, 5 ;# index into horizontal filter array + + vspltish v19, 7 + + ;# If there isn't any filtering to be done for the horizontal, then + ;# just skip to the second pass. + beq- vertical_only_4x4 + + ;# load up horizontal filter + load_hfilter v13, v14 + + ;# rounding added in on the multiply + vspltisw v16, 8 + vspltisw v15, 3 + vslw v15, v16, v15 ;# 0x00000040000000400000004000000040 + + ;# Load up permutation constants + load_c v16, B_0123, 0, r9, r10 + load_c v17, B_4567, 0, r9, r10 + load_c v18, B_89AB, 0, r9, r10 + + ;# Back off input buffer by 2 bytes. Need 2 before and 3 after + addi r3, r3, -2 + + addi r9, r3, 0 + li r10, 16 + Read8x8 v2, r3, r4, 1 + Read8x8 v3, r3, r4, 1 + Read8x8 v4, r3, r4, 1 + Read8x8 v5, r3, r4, 1 + + slwi. r6, r6, 4 ;# index into vertical filter array + + ;# filter a line + interp_8x8 v2 + interp_8x8 v3 + interp_8x8 v4 + interp_8x8 v5 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional 5 lines that are needed + ;# for the vertical filter. + beq- store_4x4 + + ;# only needed if there is a vertical filter present + ;# if the second filter is not null then need to back off by 2*pitch + sub r9, r9, r4 + sub r9, r9, r4 + + Read8x8 v0, r9, r4, 1 + Read8x8 v1, r9, r4, 0 + Read8x8 v6, r3, r4, 1 + Read8x8 v7, r3, r4, 1 + Read8x8 v8, r3, r4, 0 + + interp_8x8 v0 + interp_8x8 v1 + interp_8x8 v6 + interp_8x8 v7 + interp_8x8 v8 + + b second_pass_4x4 + +vertical_only_4x4: + ;# only needed if there is a vertical filter present + ;# if the second filter is not null then need to back off by 2*pitch + sub r3, r3, r4 + sub r3, r3, r4 + li r10, 16 + + Read8x8 v0, r3, r4, 1 + Read8x8 v1, r3, r4, 1 + Read8x8 v2, r3, r4, 1 + Read8x8 v3, r3, r4, 1 + Read8x8 v4, r3, r4, 1 + Read8x8 v5, r3, r4, 1 + Read8x8 v6, r3, r4, 1 + Read8x8 v7, r3, r4, 1 + Read8x8 v8, r3, r4, 0 + + slwi r6, r6, 4 ;# index into vertical filter array + +second_pass_4x4: + load_c v20, b_hilo_4x4, 0, r9, r10 + load_c v21, b_hilo, 0, r9, r10 + + ;# reposition input so that it can go through the + ;# filtering phase with one pass. 
+ vperm v0, v0, v1, v20 ;# 0 1 x x + vperm v2, v2, v3, v20 ;# 2 3 x x + vperm v4, v4, v5, v20 ;# 4 5 x x + vperm v6, v6, v7, v20 ;# 6 7 x x + + vperm v0, v0, v2, v21 ;# 0 1 2 3 + vperm v4, v4, v6, v21 ;# 4 5 6 7 + + vsldoi v1, v0, v4, 4 + vsldoi v2, v0, v4, 8 + vsldoi v3, v0, v4, 12 + + vsldoi v5, v4, v8, 4 + + load_c v13, VFilter, r6, r9, r10 + + vspltish v15, 8 + vspltish v20, 3 + vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + vspltb v14, v13, 1 + vspltb v15, v13, 2 + vspltb v16, v13, 3 + vspltb v17, v13, 4 + vspltb v18, v13, 5 + vspltb v13, v13, 0 + + vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5 + + stvx v0, 0, r1 + + lwz r0, 0(r1) + stw r0, 0(r7) + add r7, r7, r8 + + lwz r0, 4(r1) + stw r0, 0(r7) + add r7, r7, r8 + + lwz r0, 8(r1) + stw r0, 0(r7) + add r7, r7, r8 + + lwz r0, 12(r1) + stw r0, 0(r7) + + b exit_4x4 + +store_4x4: + + stvx v2, 0, r1 + lwz r0, 0(r1) + stw r0, 0(r7) + add r7, r7, r8 + + stvx v3, 0, r1 + lwz r0, 0(r1) + stw r0, 0(r7) + add r7, r7, r8 + + stvx v4, 0, r1 + lwz r0, 0(r1) + stw r0, 0(r7) + add r7, r7, r8 + + stvx v5, 0, r1 + lwz r0, 0(r1) + stw r0, 0(r7) + +exit_4x4: + + addi r1, r1, 32 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE + + blr + +.macro w_8x8 V, D, R, P + stvx \V, 0, r1 + lwz \R, 0(r1) + stw \R, 0(r7) + lwz \R, 4(r1) + stw \R, 4(r7) + add \D, \D, \P +.endm + + .align 2 +;# r3 unsigned char * src +;# r4 int src_pitch +;# r5 int x_offset +;# r6 int y_offset +;# r7 unsigned char * dst +;# r8 int dst_pitch + +sixtap_predict8x4_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xffc0 + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + slwi. r5, r5, 5 ;# index into horizontal filter array + + vspltish v19, 7 + + ;# If there isn't any filtering to be done for the horizontal, then + ;# just skip to the second pass. + beq- second_pass_pre_copy_8x4 + + load_hfilter v13, v14 + + ;# rounding added in on the multiply + vspltisw v16, 8 + vspltisw v15, 3 + vslw v15, v16, v15 ;# 0x00000040000000400000004000000040 + + ;# Load up permutation constants + load_c v16, B_0123, 0, r9, r10 + load_c v17, B_4567, 0, r9, r10 + load_c v18, B_89AB, 0, r9, r10 + + ;# Back off input buffer by 2 bytes. Need 2 before and 3 after + addi r3, r3, -2 + + addi r9, r3, 0 + li r10, 16 + Read8x8 v2, r3, r4, 1 + Read8x8 v3, r3, r4, 1 + Read8x8 v4, r3, r4, 1 + Read8x8 v5, r3, r4, 1 + + slwi. r6, r6, 4 ;# index into vertical filter array + + ;# filter a line + interp_8x8 v2 + interp_8x8 v3 + interp_8x8 v4 + interp_8x8 v5 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional 5 lines that are needed + ;# for the vertical filter. 
+ beq- store_8x4 + + ;# only needed if there is a vertical filter present + ;# if the second filter is not null then need to back off by 2*pitch + sub r9, r9, r4 + sub r9, r9, r4 + + Read8x8 v0, r9, r4, 1 + Read8x8 v1, r9, r4, 0 + Read8x8 v6, r3, r4, 1 + Read8x8 v7, r3, r4, 1 + Read8x8 v8, r3, r4, 0 + + interp_8x8 v0 + interp_8x8 v1 + interp_8x8 v6 + interp_8x8 v7 + interp_8x8 v8 + + b second_pass_8x4 + +second_pass_pre_copy_8x4: + ;# only needed if there is a vertical filter present + ;# if the second filter is not null then need to back off by 2*pitch + sub r3, r3, r4 + sub r3, r3, r4 + li r10, 16 + + Read8x8 v0, r3, r4, 1 + Read8x8 v1, r3, r4, 1 + Read8x8 v2, r3, r4, 1 + Read8x8 v3, r3, r4, 1 + Read8x8 v4, r3, r4, 1 + Read8x8 v5, r3, r4, 1 + Read8x8 v6, r3, r4, 1 + Read8x8 v7, r3, r4, 1 + Read8x8 v8, r3, r4, 1 + + slwi r6, r6, 4 ;# index into vertical filter array + +second_pass_8x4: + load_c v13, VFilter, r6, r9, r10 + + vspltish v15, 8 + vspltish v20, 3 + vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + vspltb v14, v13, 1 + vspltb v15, v13, 2 + vspltb v16, v13, 3 + vspltb v17, v13, 4 + vspltb v18, v13, 5 + vspltb v13, v13, 0 + + vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5 + vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6 + vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7 + vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8 + + cmpi cr0, r8, 8 + beq cr0, store_aligned_8x4 + + w_8x8 v0, r7, r0, r8 + w_8x8 v1, r7, r0, r8 + w_8x8 v2, r7, r0, r8 + w_8x8 v3, r7, r0, r8 + + b exit_8x4 + +store_aligned_8x4: + + load_c v10, b_hilo, 0, r9, r10 + + vperm v0, v0, v1, v10 + vperm v2, v2, v3, v10 + + stvx v0, 0, r7 + addi r7, r7, 16 + stvx v2, 0, r7 + + b exit_8x4 + +store_8x4: + cmpi cr0, r8, 8 + beq cr0, store_aligned2_8x4 + + w_8x8 v2, r7, r0, r8 + w_8x8 v3, r7, r0, r8 + w_8x8 v4, r7, r0, r8 + w_8x8 v5, r7, r0, r8 + + b exit_8x4 + +store_aligned2_8x4: + load_c v10, b_hilo, 0, r9, r10 + + vperm v2, v2, v3, v10 + vperm v4, v4, v5, v10 + + stvx v2, 0, r7 + addi r7, r7, 16 + stvx v4, 0, r7 + +exit_8x4: + + addi r1, r1, 32 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE + + + blr + + .align 2 +;# r3 unsigned char * src +;# r4 int src_pitch +;# r5 int x_offset +;# r6 int y_offset +;# r7 unsigned char * dst +;# r8 int dst_pitch + +;# Because the width that needs to be filtered will fit in a single altivec +;# register there is no need to loop. Everything can stay in registers. +sixtap_predict8x8_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xffc0 + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + slwi. r5, r5, 5 ;# index into horizontal filter array + + vspltish v19, 7 + + ;# If there isn't any filtering to be done for the horizontal, then + ;# just skip to the second pass. + beq- second_pass_pre_copy_8x8 + + load_hfilter v13, v14 + + ;# rounding added in on the multiply + vspltisw v16, 8 + vspltisw v15, 3 + vslw v15, v16, v15 ;# 0x00000040000000400000004000000040 + + ;# Load up permutation constants + load_c v16, B_0123, 0, r9, r10 + load_c v17, B_4567, 0, r9, r10 + load_c v18, B_89AB, 0, r9, r10 + + ;# Back off input buffer by 2 bytes. Need 2 before and 3 after + addi r3, r3, -2 + + addi r9, r3, 0 + li r10, 16 + Read8x8 v2, r3, r4, 1 + Read8x8 v3, r3, r4, 1 + Read8x8 v4, r3, r4, 1 + Read8x8 v5, r3, r4, 1 + Read8x8 v6, r3, r4, 1 + Read8x8 v7, r3, r4, 1 + Read8x8 v8, r3, r4, 1 + Read8x8 v9, r3, r4, 1 + + slwi. 
r6, r6, 4 ;# index into vertical filter array + + ;# filter a line + interp_8x8 v2 + interp_8x8 v3 + interp_8x8 v4 + interp_8x8 v5 + interp_8x8 v6 + interp_8x8 v7 + interp_8x8 v8 + interp_8x8 v9 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional 5 lines that are needed + ;# for the vertical filter. + beq- store_8x8 + + ;# only needed if there is a vertical filter present + ;# if the second filter is not null then need to back off by 2*pitch + sub r9, r9, r4 + sub r9, r9, r4 + + Read8x8 v0, r9, r4, 1 + Read8x8 v1, r9, r4, 0 + Read8x8 v10, r3, r4, 1 + Read8x8 v11, r3, r4, 1 + Read8x8 v12, r3, r4, 0 + + interp_8x8 v0 + interp_8x8 v1 + interp_8x8 v10 + interp_8x8 v11 + interp_8x8 v12 + + b second_pass_8x8 + +second_pass_pre_copy_8x8: + ;# only needed if there is a vertical filter present + ;# if the second filter is not null then need to back off by 2*pitch + sub r3, r3, r4 + sub r3, r3, r4 + li r10, 16 + + Read8x8 v0, r3, r4, 1 + Read8x8 v1, r3, r4, 1 + Read8x8 v2, r3, r4, 1 + Read8x8 v3, r3, r4, 1 + Read8x8 v4, r3, r4, 1 + Read8x8 v5, r3, r4, 1 + Read8x8 v6, r3, r4, 1 + Read8x8 v7, r3, r4, 1 + Read8x8 v8, r3, r4, 1 + Read8x8 v9, r3, r4, 1 + Read8x8 v10, r3, r4, 1 + Read8x8 v11, r3, r4, 1 + Read8x8 v12, r3, r4, 0 + + slwi r6, r6, 4 ;# index into vertical filter array + +second_pass_8x8: + load_c v13, VFilter, r6, r9, r10 + + vspltish v15, 8 + vspltish v20, 3 + vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + vspltb v14, v13, 1 + vspltb v15, v13, 2 + vspltb v16, v13, 3 + vspltb v17, v13, 4 + vspltb v18, v13, 5 + vspltb v13, v13, 0 + + vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5 + vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6 + vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7 + vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8 + vinterp_no_store_8x8 v4, v5, v6, v7, v8, v9 + vinterp_no_store_8x8 v5, v6, v7, v8, v9, v10 + vinterp_no_store_8x8 v6, v7, v8, v9, v10, v11 + vinterp_no_store_8x8 v7, v8, v9, v10, v11, v12 + + cmpi cr0, r8, 8 + beq cr0, store_aligned_8x8 + + w_8x8 v0, r7, r0, r8 + w_8x8 v1, r7, r0, r8 + w_8x8 v2, r7, r0, r8 + w_8x8 v3, r7, r0, r8 + w_8x8 v4, r7, r0, r8 + w_8x8 v5, r7, r0, r8 + w_8x8 v6, r7, r0, r8 + w_8x8 v7, r7, r0, r8 + + b exit_8x8 + +store_aligned_8x8: + + load_c v10, b_hilo, 0, r9, r10 + + vperm v0, v0, v1, v10 + vperm v2, v2, v3, v10 + vperm v4, v4, v5, v10 + vperm v6, v6, v7, v10 + + stvx v0, 0, r7 + addi r7, r7, 16 + stvx v2, 0, r7 + addi r7, r7, 16 + stvx v4, 0, r7 + addi r7, r7, 16 + stvx v6, 0, r7 + + b exit_8x8 + +store_8x8: + cmpi cr0, r8, 8 + beq cr0, store_aligned2_8x8 + + w_8x8 v2, r7, r0, r8 + w_8x8 v3, r7, r0, r8 + w_8x8 v4, r7, r0, r8 + w_8x8 v5, r7, r0, r8 + w_8x8 v6, r7, r0, r8 + w_8x8 v7, r7, r0, r8 + w_8x8 v8, r7, r0, r8 + w_8x8 v9, r7, r0, r8 + + b exit_8x8 + +store_aligned2_8x8: + load_c v10, b_hilo, 0, r9, r10 + + vperm v2, v2, v3, v10 + vperm v4, v4, v5, v10 + vperm v6, v6, v7, v10 + vperm v8, v8, v9, v10 + + stvx v2, 0, r7 + addi r7, r7, 16 + stvx v4, 0, r7 + addi r7, r7, 16 + stvx v6, 0, r7 + addi r7, r7, 16 + stvx v8, 0, r7 + +exit_8x8: + + addi r1, r1, 32 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 2 +;# r3 unsigned char * src +;# r4 int src_pitch +;# r5 int x_offset +;# r6 int y_offset +;# r7 unsigned char * dst +;# r8 int dst_pitch + +;# Two pass filtering. First pass is Horizontal edges, second pass is vertical +;# edges. One of the filters can be null, but both won't be. 
Needs to use a +;# temporary buffer because the source buffer can't be modified and the buffer +;# for the destination is not large enough to hold the temporary data. +sixtap_predict16x16_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xf000 + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-416(r1) ;# create space on the stack + + ;# Three possiblities + ;# 1. First filter is null. Don't use a temp buffer. + ;# 2. Second filter is null. Don't use a temp buffer. + ;# 3. Neither are null, use temp buffer. + + ;# First Pass (horizontal edge) + ;# setup pointers for src + ;# if possiblity (1) then setup the src pointer to be the orginal and jump + ;# to second pass. this is based on if x_offset is 0. + + ;# load up horizontal filter + slwi. r5, r5, 5 ;# index into horizontal filter array + + load_hfilter v4, v5 + + beq- copy_horizontal_16x21 + + ;# Back off input buffer by 2 bytes. Need 2 before and 3 after + addi r3, r3, -2 + + slwi. r6, r6, 4 ;# index into vertical filter array + + ;# setup constants + ;# v14 permutation value for alignment + load_c v14, b_hperm, 0, r9, r10 + + ;# These statements are guessing that there won't be a second pass, + ;# but if there is then inside the bypass they need to be set + li r0, 16 ;# prepare for no vertical filter + + ;# Change the output pointer and pitch to be the actual + ;# desination instead of a temporary buffer. + addi r9, r7, 0 + addi r5, r8, 0 + + ;# no vertical filter, so write the output from the first pass + ;# directly into the output buffer. + beq- no_vertical_filter_bypass + + ;# if the second filter is not null then need to back off by 2*pitch + sub r3, r3, r4 + sub r3, r3, r4 + + ;# setup counter for the number of lines that are going to be filtered + li r0, 21 + + ;# use the stack as temporary storage + la r9, 48(r1) + li r5, 16 + +no_vertical_filter_bypass: + + mtctr r0 + + ;# rounding added in on the multiply + vspltisw v10, 8 + vspltisw v12, 3 + vslw v12, v10, v12 ;# 0x00000040000000400000004000000040 + + ;# downshift by 7 ( divide by 128 ) at the end + vspltish v13, 7 + + ;# index to the next set of vectors in the row. + li r10, 16 + li r12, 32 + +horizontal_loop_16x16: + + lvsl v15, 0, r3 ;# permutate value for alignment + + ;# input to filter is 21 bytes wide, output is 16 bytes. + ;# input will can span three vectors if not aligned correctly. + lvx v1, 0, r3 + lvx v2, r10, r3 + lvx v3, r12, r3 + + vperm v8, v1, v2, v15 + vperm v9, v2, v3, v15 ;# v8 v9 = 21 input pixels left-justified + + vsldoi v11, v8, v9, 4 + + ;# set 0 + vmsummbm v6, v4, v8, v12 ;# taps times elements + vmsummbm v0, v5, v11, v6 + + ;# set 1 + vsldoi v10, v8, v9, 1 + vsldoi v11, v8, v9, 5 + + vmsummbm v6, v4, v10, v12 + vmsummbm v1, v5, v11, v6 + + ;# set 2 + vsldoi v10, v8, v9, 2 + vsldoi v11, v8, v9, 6 + + vmsummbm v6, v4, v10, v12 + vmsummbm v2, v5, v11, v6 + + ;# set 3 + vsldoi v10, v8, v9, 3 + vsldoi v11, v8, v9, 7 + + vmsummbm v6, v4, v10, v12 + vmsummbm v3, v5, v11, v6 + + vpkswus v0, v0, v1 ;# v0 = 0 4 8 C 1 5 9 D (16-bit) + vpkswus v1, v2, v3 ;# v1 = 2 6 A E 3 7 B F + + vsrh v0, v0, v13 ;# divide v0, v1 by 128 + vsrh v1, v1, v13 + + vpkuhus v0, v0, v1 ;# v0 = scrambled 8-bit result + vperm v0, v0, v0, v14 ;# v0 = correctly-ordered result + + stvx v0, 0, r9 + add r9, r9, r5 + + add r3, r3, r4 + + bdnz horizontal_loop_16x16 + + ;# check again to see if vertical filter needs to be done. 
+ cmpi cr0, r6, 0 + beq cr0, end_16x16 + + ;# yes there is, so go to the second pass + b second_pass_16x16 + +copy_horizontal_16x21: + li r10, 21 + mtctr r10 + + li r10, 16 + + sub r3, r3, r4 + sub r3, r3, r4 + + ;# this is done above if there is a horizontal filter, + ;# if not it needs to be done down here. + slwi r6, r6, 4 ;# index into vertical filter array + + ;# always write to the stack when doing a horizontal copy + la r9, 48(r1) + +copy_horizontal_loop_16x21: + lvsl v15, 0, r3 ;# permutate value for alignment + + lvx v1, 0, r3 + lvx v2, r10, r3 + + vperm v8, v1, v2, v15 + + stvx v8, 0, r9 + addi r9, r9, 16 + + add r3, r3, r4 + + bdnz copy_horizontal_loop_16x21 + +second_pass_16x16: + + ;# always read from the stack when doing a vertical filter + la r9, 48(r1) + + ;# downshift by 7 ( divide by 128 ) at the end + vspltish v7, 7 + + vpre_load + + luma_vsix + luma_vsix + luma_vfour + +end_16x16: + + addi r1, r1, 416 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .data + + .align 4 +HFilter: + .byte 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12 + .byte -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0 + .byte 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36 + .byte -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0 + .byte 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50 + .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0 + .byte 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77 + .byte -16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0 + .byte 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93 + .byte -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0 + .byte 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108 + .byte -11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0 + .byte 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123 + .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0 + + .align 4 +VFilter: + .byte 0, 0,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 6,123, 12, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 2, 11,108, 36, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 9, 93, 50, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 3, 16, 77, 77, 16, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 6, 50, 93, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 1, 8, 36,108, 11, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 1, 12,123, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + + .align 4 +b_hperm: + .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 + + .align 4 +B_0123: + .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 + + .align 4 +B_4567: + .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 + + .align 4 +B_89AB: + .byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 + + .align 4 +b_hilo: + .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 + + .align 4 +b_hilo_4x4: + .byte 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0 diff --git a/vp8/common/ppc/filter_bilinear_altivec.asm b/vp8/common/ppc/filter_bilinear_altivec.asm new file mode 100644 index 000000000..fd8aa665f --- /dev/null +++ b/vp8/common/ppc/filter_bilinear_altivec.asm @@ -0,0 +1,677 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. 
All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + .globl bilinear_predict4x4_ppc + .globl bilinear_predict8x4_ppc + .globl bilinear_predict8x8_ppc + .globl bilinear_predict16x16_ppc + +.macro load_c V, LABEL, OFF, R0, R1 + lis \R0, \LABEL@ha + la \R1, \LABEL@l(\R0) + lvx \V, \OFF, \R1 +.endm + +.macro load_vfilter V0, V1 + load_c \V0, vfilter_b, r6, r9, r10 + + addi r6, r6, 16 + lvx \V1, r6, r10 +.endm + +.macro HProlog jump_label + ;# load up horizontal filter + slwi. r5, r5, 4 ;# index into horizontal filter array + + ;# index to the next set of vectors in the row. + li r10, 16 + li r12, 32 + + ;# downshift by 7 ( divide by 128 ) at the end + vspltish v19, 7 + + ;# If there isn't any filtering to be done for the horizontal, then + ;# just skip to the second pass. + beq \jump_label + + load_c v20, hfilter_b, r5, r9, r0 + + ;# setup constants + ;# v14 permutation value for alignment + load_c v28, b_hperm_b, 0, r9, r0 + + ;# rounding added in on the multiply + vspltisw v21, 8 + vspltisw v18, 3 + vslw v18, v21, v18 ;# 0x00000040000000400000004000000040 + + slwi. r6, r6, 5 ;# index into vertical filter array +.endm + +;# Filters a horizontal line +;# expects: +;# r3 src_ptr +;# r4 pitch +;# r10 16 +;# r12 32 +;# v17 perm intput +;# v18 rounding +;# v19 shift +;# v20 filter taps +;# v21 tmp +;# v22 tmp +;# v23 tmp +;# v24 tmp +;# v25 tmp +;# v26 tmp +;# v27 tmp +;# v28 perm output +;# +.macro HFilter V + vperm v24, v21, v21, v10 ;# v20 = 0123 1234 2345 3456 + vperm v25, v21, v21, v11 ;# v21 = 4567 5678 6789 789A + + vmsummbm v24, v20, v24, v18 + vmsummbm v25, v20, v25, v18 + + vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit) + + vsrh v24, v24, v19 ;# divide v0, v1 by 128 + + vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result +.endm + +.macro hfilter_8 V, increment_counter + lvsl v17, 0, r3 ;# permutate value for alignment + + ;# input to filter is 9 bytes wide, output is 8 bytes. + lvx v21, 0, r3 + lvx v22, r10, r3 + +.if \increment_counter + add r3, r3, r4 +.endif + vperm v21, v21, v22, v17 + + HFilter \V +.endm + + +.macro load_and_align_8 V, increment_counter + lvsl v17, 0, r3 ;# permutate value for alignment + + ;# input to filter is 21 bytes wide, output is 16 bytes. + ;# input will can span three vectors if not aligned correctly. 
+ lvx v21, 0, r3 + lvx v22, r10, r3 + +.if \increment_counter + add r3, r3, r4 +.endif + + vperm \V, v21, v22, v17 +.endm + +.macro write_aligned_8 V, increment_counter + stvx \V, 0, r7 + +.if \increment_counter + add r7, r7, r8 +.endif +.endm + +.macro vfilter_16 P0 P1 + vmuleub v22, \P0, v20 ;# 64 + 4 positive taps + vadduhm v22, v18, v22 + vmuloub v23, \P0, v20 + vadduhm v23, v18, v23 + + vmuleub v24, \P1, v21 + vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary + vmuloub v25, \P1, v21 + vadduhm v23, v23, v25 ;# Ro = odds + + vsrh v22, v22, v19 ;# divide by 128 + vsrh v23, v23, v19 ;# v16 v17 = evens, odds + vmrghh \P0, v22, v23 ;# v18 v19 = 16-bit result in order + vmrglh v23, v22, v23 + vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result +.endm + + +.macro w_8x8 V, D, R, P + stvx \V, 0, r1 + lwz \R, 0(r1) + stw \R, 0(r7) + lwz \R, 4(r1) + stw \R, 4(r7) + add \D, \D, \P +.endm + + + .align 2 +;# r3 unsigned char * src +;# r4 int src_pitch +;# r5 int x_offset +;# r6 int y_offset +;# r7 unsigned char * dst +;# r8 int dst_pitch +bilinear_predict4x4_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xf830 + ori r12, r12, 0xfff8 + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + HProlog second_pass_4x4_pre_copy_b + + ;# Load up permutation constants + load_c v10, b_0123_b, 0, r9, r12 + load_c v11, b_4567_b, 0, r9, r12 + + hfilter_8 v0, 1 + hfilter_8 v1, 1 + hfilter_8 v2, 1 + hfilter_8 v3, 1 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional line that is needed + ;# for the vertical filter. + beq store_out_4x4_b + + hfilter_8 v4, 0 + + b second_pass_4x4_b + +second_pass_4x4_pre_copy_b: + slwi r6, r6, 5 ;# index into vertical filter array + + load_and_align_8 v0, 1 + load_and_align_8 v1, 1 + load_and_align_8 v2, 1 + load_and_align_8 v3, 1 + load_and_align_8 v4, 1 + +second_pass_4x4_b: + vspltish v20, 8 + vspltish v18, 3 + vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + load_vfilter v20, v21 + + vfilter_16 v0, v1 + vfilter_16 v1, v2 + vfilter_16 v2, v3 + vfilter_16 v3, v4 + +store_out_4x4_b: + + stvx v0, 0, r1 + lwz r0, 0(r1) + stw r0, 0(r7) + add r7, r7, r8 + + stvx v1, 0, r1 + lwz r0, 0(r1) + stw r0, 0(r7) + add r7, r7, r8 + + stvx v2, 0, r1 + lwz r0, 0(r1) + stw r0, 0(r7) + add r7, r7, r8 + + stvx v3, 0, r1 + lwz r0, 0(r1) + stw r0, 0(r7) + +exit_4x4: + + addi r1, r1, 32 ;# recover stack + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 2 +;# r3 unsigned char * src +;# r4 int src_pitch +;# r5 int x_offset +;# r6 int y_offset +;# r7 unsigned char * dst +;# r8 int dst_pitch +bilinear_predict8x4_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xf830 + ori r12, r12, 0xfff8 + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + HProlog second_pass_8x4_pre_copy_b + + ;# Load up permutation constants + load_c v10, b_0123_b, 0, r9, r12 + load_c v11, b_4567_b, 0, r9, r12 + + hfilter_8 v0, 1 + hfilter_8 v1, 1 + hfilter_8 v2, 1 + hfilter_8 v3, 1 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional line that is needed + ;# for the vertical filter. 
+ beq store_out_8x4_b + + hfilter_8 v4, 0 + + b second_pass_8x4_b + +second_pass_8x4_pre_copy_b: + slwi r6, r6, 5 ;# index into vertical filter array + + load_and_align_8 v0, 1 + load_and_align_8 v1, 1 + load_and_align_8 v2, 1 + load_and_align_8 v3, 1 + load_and_align_8 v4, 1 + +second_pass_8x4_b: + vspltish v20, 8 + vspltish v18, 3 + vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + load_vfilter v20, v21 + + vfilter_16 v0, v1 + vfilter_16 v1, v2 + vfilter_16 v2, v3 + vfilter_16 v3, v4 + +store_out_8x4_b: + + cmpi cr0, r8, 8 + beq cr0, store_aligned_8x4_b + + w_8x8 v0, r7, r0, r8 + w_8x8 v1, r7, r0, r8 + w_8x8 v2, r7, r0, r8 + w_8x8 v3, r7, r0, r8 + + b exit_8x4 + +store_aligned_8x4_b: + load_c v10, b_hilo_b, 0, r9, r10 + + vperm v0, v0, v1, v10 + vperm v2, v2, v3, v10 + + stvx v0, 0, r7 + addi r7, r7, 16 + stvx v2, 0, r7 + +exit_8x4: + + addi r1, r1, 32 ;# recover stack + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 2 +;# r3 unsigned char * src +;# r4 int src_pitch +;# r5 int x_offset +;# r6 int y_offset +;# r7 unsigned char * dst +;# r8 int dst_pitch +bilinear_predict8x8_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xfff0 + ori r12, r12, 0xffff + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + HProlog second_pass_8x8_pre_copy_b + + ;# Load up permutation constants + load_c v10, b_0123_b, 0, r9, r12 + load_c v11, b_4567_b, 0, r9, r12 + + hfilter_8 v0, 1 + hfilter_8 v1, 1 + hfilter_8 v2, 1 + hfilter_8 v3, 1 + hfilter_8 v4, 1 + hfilter_8 v5, 1 + hfilter_8 v6, 1 + hfilter_8 v7, 1 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional line that is needed + ;# for the vertical filter. + beq store_out_8x8_b + + hfilter_8 v8, 0 + + b second_pass_8x8_b + +second_pass_8x8_pre_copy_b: + slwi r6, r6, 5 ;# index into vertical filter array + + load_and_align_8 v0, 1 + load_and_align_8 v1, 1 + load_and_align_8 v2, 1 + load_and_align_8 v3, 1 + load_and_align_8 v4, 1 + load_and_align_8 v5, 1 + load_and_align_8 v6, 1 + load_and_align_8 v7, 1 + load_and_align_8 v8, 0 + +second_pass_8x8_b: + vspltish v20, 8 + vspltish v18, 3 + vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + load_vfilter v20, v21 + + vfilter_16 v0, v1 + vfilter_16 v1, v2 + vfilter_16 v2, v3 + vfilter_16 v3, v4 + vfilter_16 v4, v5 + vfilter_16 v5, v6 + vfilter_16 v6, v7 + vfilter_16 v7, v8 + +store_out_8x8_b: + + cmpi cr0, r8, 8 + beq cr0, store_aligned_8x8_b + + w_8x8 v0, r7, r0, r8 + w_8x8 v1, r7, r0, r8 + w_8x8 v2, r7, r0, r8 + w_8x8 v3, r7, r0, r8 + w_8x8 v4, r7, r0, r8 + w_8x8 v5, r7, r0, r8 + w_8x8 v6, r7, r0, r8 + w_8x8 v7, r7, r0, r8 + + b exit_8x8 + +store_aligned_8x8_b: + load_c v10, b_hilo_b, 0, r9, r10 + + vperm v0, v0, v1, v10 + vperm v2, v2, v3, v10 + vperm v4, v4, v5, v10 + vperm v6, v6, v7, v10 + + stvx v0, 0, r7 + addi r7, r7, 16 + stvx v2, 0, r7 + addi r7, r7, 16 + stvx v4, 0, r7 + addi r7, r7, 16 + stvx v6, 0, r7 + +exit_8x8: + + addi r1, r1, 32 ;# recover stack + mtspr 256, r11 ;# reset old VRSAVE + + blr + +;# Filters a horizontal line +;# expects: +;# r3 src_ptr +;# r4 pitch +;# r10 16 +;# r12 32 +;# v17 perm intput +;# v18 rounding +;# v19 shift +;# v20 filter taps +;# v21 tmp +;# v22 tmp +;# v23 tmp +;# v24 tmp +;# v25 tmp +;# v26 tmp +;# v27 tmp +;# v28 perm output +;# +.macro hfilter_16 V, increment_counter + + lvsl v17, 0, r3 ;# permutate value for alignment + + ;# input to filter is 21 bytes wide, output is 16 bytes. 
+ ;# input will can span three vectors if not aligned correctly. + lvx v21, 0, r3 + lvx v22, r10, r3 + lvx v23, r12, r3 + +.if \increment_counter + add r3, r3, r4 +.endif + vperm v21, v21, v22, v17 + vperm v22, v22, v23, v17 ;# v8 v9 = 21 input pixels left-justified + + ;# set 0 + vmsummbm v24, v20, v21, v18 ;# taps times elements + + ;# set 1 + vsldoi v23, v21, v22, 1 + vmsummbm v25, v20, v23, v18 + + ;# set 2 + vsldoi v23, v21, v22, 2 + vmsummbm v26, v20, v23, v18 + + ;# set 3 + vsldoi v23, v21, v22, 3 + vmsummbm v27, v20, v23, v18 + + vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit) + vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F + + vsrh v24, v24, v19 ;# divide v0, v1 by 128 + vsrh v25, v25, v19 + + vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result + vperm \V, \V, v0, v28 ;# \V = correctly-ordered result +.endm + +.macro load_and_align_16 V, increment_counter + lvsl v17, 0, r3 ;# permutate value for alignment + + ;# input to filter is 21 bytes wide, output is 16 bytes. + ;# input will can span three vectors if not aligned correctly. + lvx v21, 0, r3 + lvx v22, r10, r3 + +.if \increment_counter + add r3, r3, r4 +.endif + + vperm \V, v21, v22, v17 +.endm + +.macro write_16 V, increment_counter + stvx \V, 0, r7 + +.if \increment_counter + add r7, r7, r8 +.endif +.endm + + .align 2 +;# r3 unsigned char * src +;# r4 int src_pitch +;# r5 int x_offset +;# r6 int y_offset +;# r7 unsigned char * dst +;# r8 int dst_pitch +bilinear_predict16x16_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xfff8 + mtspr 256, r12 ;# set VRSAVE + + HProlog second_pass_16x16_pre_copy_b + + hfilter_16 v0, 1 + hfilter_16 v1, 1 + hfilter_16 v2, 1 + hfilter_16 v3, 1 + hfilter_16 v4, 1 + hfilter_16 v5, 1 + hfilter_16 v6, 1 + hfilter_16 v7, 1 + hfilter_16 v8, 1 + hfilter_16 v9, 1 + hfilter_16 v10, 1 + hfilter_16 v11, 1 + hfilter_16 v12, 1 + hfilter_16 v13, 1 + hfilter_16 v14, 1 + hfilter_16 v15, 1 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional line that is needed + ;# for the vertical filter. 
+ beq store_out_16x16_b + + hfilter_16 v16, 0 + + b second_pass_16x16_b + +second_pass_16x16_pre_copy_b: + slwi r6, r6, 5 ;# index into vertical filter array + + load_and_align_16 v0, 1 + load_and_align_16 v1, 1 + load_and_align_16 v2, 1 + load_and_align_16 v3, 1 + load_and_align_16 v4, 1 + load_and_align_16 v5, 1 + load_and_align_16 v6, 1 + load_and_align_16 v7, 1 + load_and_align_16 v8, 1 + load_and_align_16 v9, 1 + load_and_align_16 v10, 1 + load_and_align_16 v11, 1 + load_and_align_16 v12, 1 + load_and_align_16 v13, 1 + load_and_align_16 v14, 1 + load_and_align_16 v15, 1 + load_and_align_16 v16, 0 + +second_pass_16x16_b: + vspltish v20, 8 + vspltish v18, 3 + vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + load_vfilter v20, v21 + + vfilter_16 v0, v1 + vfilter_16 v1, v2 + vfilter_16 v2, v3 + vfilter_16 v3, v4 + vfilter_16 v4, v5 + vfilter_16 v5, v6 + vfilter_16 v6, v7 + vfilter_16 v7, v8 + vfilter_16 v8, v9 + vfilter_16 v9, v10 + vfilter_16 v10, v11 + vfilter_16 v11, v12 + vfilter_16 v12, v13 + vfilter_16 v13, v14 + vfilter_16 v14, v15 + vfilter_16 v15, v16 + +store_out_16x16_b: + + write_16 v0, 1 + write_16 v1, 1 + write_16 v2, 1 + write_16 v3, 1 + write_16 v4, 1 + write_16 v5, 1 + write_16 v6, 1 + write_16 v7, 1 + write_16 v8, 1 + write_16 v9, 1 + write_16 v10, 1 + write_16 v11, 1 + write_16 v12, 1 + write_16 v13, 1 + write_16 v14, 1 + write_16 v15, 0 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .data + + .align 4 +hfilter_b: + .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0 + .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0 + .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0 + .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0 + .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0 + .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0 + .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0 + .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0 + + .align 4 +vfilter_b: + .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112 + .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96 + .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 + .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80 + .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48 + .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 + .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 + .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48 + .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80 + .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 + .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96 + .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112 + + .align 4 +b_hperm_b: + .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 + + .align 4 +b_0123_b: + .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 + + .align 4 +b_4567_b: + .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 + +b_hilo_b: + .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 diff --git a/vp8/common/ppc/idctllm_altivec.asm b/vp8/common/ppc/idctllm_altivec.asm new file mode 
100644 index 000000000..117d9cfc8 --- /dev/null +++ b/vp8/common/ppc/idctllm_altivec.asm @@ -0,0 +1,189 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + .globl short_idct4x4llm_ppc + +.macro load_c V, LABEL, OFF, R0, R1 + lis \R0, \LABEL@ha + la \R1, \LABEL@l(\R0) + lvx \V, \OFF, \R1 +.endm + +;# r3 short *input +;# r4 short *output +;# r5 int pitch + .align 2 +short_idct4x4llm_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xfff8 + mtspr 256, r12 ;# set VRSAVE + + load_c v8, sinpi8sqrt2, 0, r9, r10 + load_c v9, cospi8sqrt2minus1, 0, r9, r10 + load_c v10, hi_hi, 0, r9, r10 + load_c v11, lo_lo, 0, r9, r10 + load_c v12, shift_16, 0, r9, r10 + + li r10, 16 + lvx v0, 0, r3 ;# input ip[0], ip[ 4] + lvx v1, r10, r3 ;# input ip[8], ip[12] + + ;# first pass + vupkhsh v2, v0 + vupkhsh v3, v1 + vaddsws v6, v2, v3 ;# a1 = ip[0]+ip[8] + vsubsws v7, v2, v3 ;# b1 = ip[0]-ip[8] + + vupklsh v0, v0 + vmulosh v4, v0, v8 + vsraw v4, v4, v12 + vaddsws v4, v4, v0 ;# ip[ 4] * sin(pi/8) * sqrt(2) + + vupklsh v1, v1 + vmulosh v5, v1, v9 + vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2) + vaddsws v5, v5, v1 + + vsubsws v4, v4, v5 ;# c1 + + vmulosh v3, v1, v8 + vsraw v3, v3, v12 + vaddsws v3, v3, v1 ;# ip[12] * sin(pi/8) * sqrt(2) + + vmulosh v5, v0, v9 + vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2) + vaddsws v5, v5, v0 + + vaddsws v3, v3, v5 ;# d1 + + vaddsws v0, v6, v3 ;# a1 + d1 + vsubsws v3, v6, v3 ;# a1 - d1 + + vaddsws v1, v7, v4 ;# b1 + c1 + vsubsws v2, v7, v4 ;# b1 - c1 + + ;# transpose input + vmrghw v4, v0, v1 ;# a0 b0 a1 b1 + vmrghw v5, v2, v3 ;# c0 d0 c1 d1 + + vmrglw v6, v0, v1 ;# a2 b2 a3 b3 + vmrglw v7, v2, v3 ;# c2 d2 c3 d3 + + vperm v0, v4, v5, v10 ;# a0 b0 c0 d0 + vperm v1, v4, v5, v11 ;# a1 b1 c1 d1 + + vperm v2, v6, v7, v10 ;# a2 b2 c2 d2 + vperm v3, v6, v7, v11 ;# a3 b3 c3 d3 + + ;# second pass + vaddsws v6, v0, v2 ;# a1 = ip[0]+ip[8] + vsubsws v7, v0, v2 ;# b1 = ip[0]-ip[8] + + vmulosh v4, v1, v8 + vsraw v4, v4, v12 + vaddsws v4, v4, v1 ;# ip[ 4] * sin(pi/8) * sqrt(2) + + vmulosh v5, v3, v9 + vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2) + vaddsws v5, v5, v3 + + vsubsws v4, v4, v5 ;# c1 + + vmulosh v2, v3, v8 + vsraw v2, v2, v12 + vaddsws v2, v2, v3 ;# ip[12] * sin(pi/8) * sqrt(2) + + vmulosh v5, v1, v9 + vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2) + vaddsws v5, v5, v1 + + vaddsws v3, v2, v5 ;# d1 + + vaddsws v0, v6, v3 ;# a1 + d1 + vsubsws v3, v6, v3 ;# a1 - d1 + + vaddsws v1, v7, v4 ;# b1 + c1 + vsubsws v2, v7, v4 ;# b1 - c1 + + vspltish v6, 4 + vspltish v7, 3 + + vpkswss v0, v0, v1 + vpkswss v1, v2, v3 + + vaddshs v0, v0, v6 + vaddshs v1, v1, v6 + + vsrah v0, v0, v7 + vsrah v1, v1, v7 + + ;# transpose output + vmrghh v2, v0, v1 ;# a0 c0 a1 c1 a2 c2 a3 c3 + vmrglh v3, v0, v1 ;# b0 d0 b1 d1 b2 d2 b3 d3 + + vmrghh v0, v2, v3 ;# a0 b0 c0 d0 a1 b1 c1 d1 + vmrglh v1, v2, v3 ;# a2 b2 c2 d2 a3 b3 c3 d3 + + stwu r1,-416(r1) ;# create space on the stack + + stvx v0, 0, r1 + lwz r6, 0(r1) + stw r6, 0(r4) + lwz r6, 4(r1) + stw r6, 4(r4) + + add r4, r4, r5 + + lwz r6, 8(r1) + stw r6, 0(r4) + lwz r6, 12(r1) + stw r6, 4(r4) + + add r4, r4, r5 + + stvx v1, 0, r1 + lwz r6, 0(r1) + stw r6, 0(r4) + lwz r6, 4(r1) + stw r6, 
4(r4) + + add r4, r4, r5 + + lwz r6, 8(r1) + stw r6, 0(r4) + lwz r6, 12(r1) + stw r6, 4(r4) + + addi r1, r1, 416 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 4 +sinpi8sqrt2: + .short 35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468 + + .align 4 +cospi8sqrt2minus1: + .short 20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091 + + .align 4 +shift_16: + .long 16, 16, 16, 16 + + .align 4 +hi_hi: + .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 + + .align 4 +lo_lo: + .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 diff --git a/vp8/common/ppc/loopfilter_altivec.c b/vp8/common/ppc/loopfilter_altivec.c new file mode 100644 index 000000000..71bf6e2d7 --- /dev/null +++ b/vp8/common/ppc/loopfilter_altivec.c @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "loopfilter.h" +#include "onyxc_int.h" + +typedef void loop_filter_function_y_ppc +( + unsigned char *s, // source pointer + int p, // pitch + const signed char *flimit, + const signed char *limit, + const signed char *thresh +); + +typedef void loop_filter_function_uv_ppc +( + unsigned char *u, // source pointer + unsigned char *v, // source pointer + int p, // pitch + const signed char *flimit, + const signed char *limit, + const signed char *thresh +); + +typedef void loop_filter_function_s_ppc +( + unsigned char *s, // source pointer + int p, // pitch + const signed char *flimit +); + +loop_filter_function_y_ppc mbloop_filter_horizontal_edge_y_ppc; +loop_filter_function_y_ppc mbloop_filter_vertical_edge_y_ppc; +loop_filter_function_y_ppc loop_filter_horizontal_edge_y_ppc; +loop_filter_function_y_ppc loop_filter_vertical_edge_y_ppc; + +loop_filter_function_uv_ppc mbloop_filter_horizontal_edge_uv_ppc; +loop_filter_function_uv_ppc mbloop_filter_vertical_edge_uv_ppc; +loop_filter_function_uv_ppc loop_filter_horizontal_edge_uv_ppc; +loop_filter_function_uv_ppc loop_filter_vertical_edge_uv_ppc; + +loop_filter_function_s_ppc loop_filter_simple_horizontal_edge_ppc; +loop_filter_function_s_ppc loop_filter_simple_vertical_edge_ppc; + +// Horizontal MB filtering +void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr); + + if (u_ptr) + mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr); +} + +void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + (void)u_ptr; + (void)v_ptr; + (void)uv_stride; + loop_filter_simple_horizontal_edge_ppc(y_ptr, y_stride, lfi->mbflim); +} + +// Vertical MB Filtering +void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr); + + if (u_ptr) + mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr); +} + +void loop_filter_mbvs_ppc(unsigned char 
*y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + (void)u_ptr; + (void)v_ptr; + (void)uv_stride; + loop_filter_simple_vertical_edge_ppc(y_ptr, y_stride, lfi->mbflim); +} + +// Horizontal B Filtering +void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + // These should all be done at once with one call, instead of 3 + loop_filter_horizontal_edge_y_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr); + loop_filter_horizontal_edge_y_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr); + loop_filter_horizontal_edge_y_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr); + + if (u_ptr) + loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr); +} + +void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + (void)u_ptr; + (void)v_ptr; + (void)uv_stride; + loop_filter_simple_horizontal_edge_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim); + loop_filter_simple_horizontal_edge_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim); + loop_filter_simple_horizontal_edge_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim); +} + +// Vertical B Filtering +void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr); + + if (u_ptr) + loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr); +} + +void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + (void)u_ptr; + (void)v_ptr; + (void)uv_stride; + loop_filter_simple_vertical_edge_ppc(y_ptr + 4, y_stride, lfi->flim); + loop_filter_simple_vertical_edge_ppc(y_ptr + 8, y_stride, lfi->flim); + loop_filter_simple_vertical_edge_ppc(y_ptr + 12, y_stride, lfi->flim); +} diff --git a/vp8/common/ppc/loopfilter_filters_altivec.asm b/vp8/common/ppc/loopfilter_filters_altivec.asm new file mode 100644 index 000000000..61df4e976 --- /dev/null +++ b/vp8/common/ppc/loopfilter_filters_altivec.asm @@ -0,0 +1,1253 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + .globl mbloop_filter_horizontal_edge_y_ppc + .globl loop_filter_horizontal_edge_y_ppc + .globl mbloop_filter_vertical_edge_y_ppc + .globl loop_filter_vertical_edge_y_ppc + + .globl mbloop_filter_horizontal_edge_uv_ppc + .globl loop_filter_horizontal_edge_uv_ppc + .globl mbloop_filter_vertical_edge_uv_ppc + .globl loop_filter_vertical_edge_uv_ppc + + .globl loop_filter_simple_horizontal_edge_ppc + .globl loop_filter_simple_vertical_edge_ppc + + .text +;# We often need to perform transposes (and other transpose-like operations) +;# on matrices of data. This is simplified by the fact that we usually +;# operate on hunks of data whose dimensions are powers of 2, or at least +;# divisible by highish powers of 2. 
+;# +;# These operations can be very confusing. They become more straightforward +;# when we think of them as permutations of address bits: Concatenate a +;# group of vector registers and think of it as occupying a block of +;# memory beginning at address zero. The low four bits 0...3 of the +;# address then correspond to position within a register, the higher-order +;# address bits select the register. +;# +;# Although register selection, at the code level, is arbitrary, things +;# are simpler if we use contiguous ranges of register numbers, simpler +;# still if the low-order bits of the register number correspond to +;# conceptual address bits. We do this whenever reasonable. +;# +;# A 16x16 transpose can then be thought of as an operation on +;# a 256-element block of memory. It takes 8 bits 0...7 to address this +;# memory and the effect of a transpose is to interchange address bit +;# 0 with 4, 1 with 5, 2 with 6, and 3 with 7. Bits 0...3 index the +;# column, which is interchanged with the row addressed by bits 4..7. +;# +;# The altivec merge instructions provide a rapid means of effecting +;# many of these transforms. They operate at three widths (8,16,32). +;# Writing V(x) for vector register #x, paired merges permute address +;# indices as follows. +;# +;# 0->1 1->2 2->3 3->(4+d) (4+s)->0: +;# +;# vmrghb V( x), V( y), V( y + (1<<s)) +;# vmrglb V( x + (1<<d)), V( y), V( y + (1<<s)) +;# +;# +;# =0= 1->2 2->3 3->(4+d) (4+s)->1: +;# +;# vmrghh V( x), V( y), V( y + (1<<s)) +;# vmrglh V( x + (1<<d)), V( y), V( y + (1<<s)) +;# +;# +;# =0= =1= 2->3 3->(4+d) (4+s)->2: +;# +;# vmrghw V( x), V( y), V( y + (1<<s)) +;# vmrglw V( x + (1<<d)), V( y), V( y + (1<<s)) +;# +;# +;# Unfortunately, there is no doubleword merge instruction. +;# The following sequence uses "vperm" is a substitute. +;# Assuming that the selection masks b_hihi and b_lolo (defined in LFppc.c) +;# are in registers Vhihi and Vlolo, we can also effect the permutation +;# +;# =0= =1= =2= 3->(4+d) (4+s)->3 by the sequence: +;# +;# vperm V( x), V( y), V( y + (1<<s)), Vhihi +;# vperm V( x + (1<<d)), V( y), V( y + (1<<s)), Vlolo +;# +;# +;# Except for bits s and d, the other relationships between register +;# number (= high-order part of address) bits are at the disposal of +;# the programmer. +;# + +;# To avoid excess transposes, we filter all 3 vertical luma subblock +;# edges together. This requires a single 16x16 transpose, which, in +;# the above language, amounts to the following permutation of address +;# indices: 0<->4 1<->5 2<->6 3<->7, which we accomplish by +;# 4 iterations of the cyclic transform 0->1->2->3->4->5->6->7->0. +;# +;# Except for the fact that the destination registers get written +;# before we are done referencing the old contents, the cyclic transform +;# is effected by +;# +;# x = 0; do { +;# vmrghb V(2x), V(x), V(x+8); +;# vmrghb V(2x+1), V(x), V(x+8); +;# } while( ++x < 8); +;# +;# For clarity, and because we can afford it, we do this transpose +;# using all 32 registers, alternating the banks 0..15 and 16 .. 31, +;# leaving the final result in 16 .. 31, as the lower registers are +;# used in the filtering itself. 
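The address-bit picture above lends itself to a quick scalar check. The C sketch below is illustrative only: it is not part of this change, and the merge_pass helper is a made-up name. It models sixteen 16-byte registers as one 256-byte array, applies the vmrghb/vmrglb pairing from the pseudocode four times, and confirms that the net effect is the full 16x16 transpose, i.e. that each pass rotates every address bit up by one position so four passes swap bits 0<->4, 1<->5, 2<->6, 3<->7.

#include <stdio.h>
#include <string.h>

/* One pass of the cyclic step:
 *   vmrghb V(2x),   V(x), V(x+8)
 *   vmrglb V(2x+1), V(x), V(x+8)   for x = 0..7
 * Byte i of register r lives at address 16*r + i, so the register number
 * is address bits 4..7 and the byte position is bits 0..3. */
static void merge_pass(unsigned char dst[256], const unsigned char src[256])
{
    int x, k;
    for (x = 0; x < 8; x++) {
        const unsigned char *a = src + 16 * x;        /* V(x)    */
        const unsigned char *b = src + 16 * (x + 8);  /* V(x+8)  */
        unsigned char *hi = dst + 16 * (2 * x);       /* V(2x)   */
        unsigned char *lo = dst + 16 * (2 * x + 1);   /* V(2x+1) */
        for (k = 0; k < 8; k++) {
            hi[2 * k]     = a[k];      /* vmrghb: interleave upper halves */
            hi[2 * k + 1] = b[k];
            lo[2 * k]     = a[k + 8];  /* vmrglb: interleave lower halves */
            lo[2 * k + 1] = b[k + 8];
        }
    }
}

int main(void)
{
    unsigned char m[256], t[2][256];
    int r, c, pass, ok = 1;

    for (r = 0; r < 256; r++)
        m[r] = (unsigned char)r;          /* m[16*row + col] = pixel (row,col) */

    memcpy(t[0], m, 256);
    for (pass = 0; pass < 4; pass++)      /* four cyclic passes = full transpose */
        merge_pass(t[(pass + 1) & 1], t[pass & 1]);

    for (r = 0; r < 16; r++)
        for (c = 0; c < 16; c++)
            ok &= (t[0][16 * r + c] == m[16 * c + r]);

    printf("merge-based 16x16 transpose %s\n", ok ? "matches" : "differs");
    return 0;
}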
+;# +.macro Tpair A, B, X, Y + vmrghb \A, \X, \Y + vmrglb \B, \X, \Y +.endm + +;# Each step takes 8*2 = 16 instructions + +.macro t16_even + Tpair v16,v17, v0,v8 + Tpair v18,v19, v1,v9 + Tpair v20,v21, v2,v10 + Tpair v22,v23, v3,v11 + Tpair v24,v25, v4,v12 + Tpair v26,v27, v5,v13 + Tpair v28,v29, v6,v14 + Tpair v30,v31, v7,v15 +.endm + +.macro t16_odd + Tpair v0,v1, v16,v24 + Tpair v2,v3, v17,v25 + Tpair v4,v5, v18,v26 + Tpair v6,v7, v19,v27 + Tpair v8,v9, v20,v28 + Tpair v10,v11, v21,v29 + Tpair v12,v13, v22,v30 + Tpair v14,v15, v23,v31 +.endm + +;# Whole transpose takes 4*16 = 64 instructions + +.macro t16_full + t16_odd + t16_even + t16_odd + t16_even +.endm + +;# Vertical edge filtering requires transposes. For the simple filter, +;# we need to convert 16 rows of 4 pels each into 4 registers of 16 pels +;# each. Writing 0 ... 63 for the pixel indices, the desired result is: +;# +;# v0 = 0 1 ... 14 15 +;# v1 = 16 17 ... 30 31 +;# v2 = 32 33 ... 47 48 +;# v3 = 49 50 ... 62 63 +;# +;# In frame-buffer memory, the layout is: +;# +;# 0 16 32 48 +;# 1 17 33 49 +;# ... +;# 15 31 47 63. +;# +;# We begin by reading the data 32 bits at a time (using scalar operations) +;# into a temporary array, reading the rows of the array into vector registers, +;# with the following layout: +;# +;# v0 = 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60 +;# v1 = 1 17 33 49 5 21 ... 45 61 +;# v2 = 2 18 ... 46 62 +;# v3 = 3 19 ... 47 63 +;# +;# From the "address-bit" perspective discussed above, we simply need to +;# interchange bits 0 <-> 4 and 1 <-> 5, leaving bits 2 and 3 alone. +;# In other words, we transpose each of the four 4x4 submatrices. +;# +;# This transformation is its own inverse, and we need to perform it +;# again before writing the pixels back into the frame buffer. +;# +;# It acts in place on registers v0...v3, uses v4...v7 as temporaries, +;# and assumes that v14/v15 contain the b_hihi/b_lolo selectors +;# defined above. We think of both groups of 4 registers as having +;# "addresses" {0,1,2,3} * 16. +;# +.macro Transpose4times4x4 Vlo, Vhi + + ;# d=s=0 0->1 1->2 2->3 3->4 4->0 =5= + + vmrghb v4, v0, v1 + vmrglb v5, v0, v1 + vmrghb v6, v2, v3 + vmrglb v7, v2, v3 + + ;# d=0 s=1 =0= 1->2 2->3 3->4 4->5 5->1 + + vmrghh v0, v4, v6 + vmrglh v1, v4, v6 + vmrghh v2, v5, v7 + vmrglh v3, v5, v7 + + ;# d=s=0 =0= =1= 2->3 3->4 4->2 =5= + + vmrghw v4, v0, v1 + vmrglw v5, v0, v1 + vmrghw v6, v2, v3 + vmrglw v7, v2, v3 + + ;# d=0 s=1 =0= =1= =2= 3->4 4->5 5->3 + + vperm v0, v4, v6, \Vlo + vperm v1, v4, v6, \Vhi + vperm v2, v5, v7, \Vlo + vperm v3, v5, v7, \Vhi +.endm +;# end Transpose4times4x4 + + +;# Normal mb vertical edge filter transpose. +;# +;# We read 8 columns of data, initially in the following pattern: +;# +;# (0,0) (1,0) ... (7,0) (0,1) (1,1) ... (7,1) +;# (0,2) (1,2) ... (7,2) (0,3) (1,3) ... (7,3) +;# ... +;# (0,14) (1,14) .. (7,14) (0,15) (1,15) .. (7,15) +;# +;# and wish to convert to: +;# +;# (0,0) ... (0,15) +;# (1,0) ... (1,15) +;# ... +;# (7,0) ... (7,15). +;# +;# In "address bit" language, we wish to map +;# +;# 0->4 1->5 2->6 3->0 4->1 5->2 6->3, i.e., I -> (I+4) mod 7. +;# +;# This can be accomplished by 4 iterations of the cyclic transform +;# +;# I -> (I+1) mod 7; +;# +;# each iteration can be realized by (d=0, s=2): +;# +;# x = 0; do Tpair( V(2x),V(2x+1), V(x),V(x+4)) while( ++x < 4); +;# +;# The input/output is in registers v0...v7. We use v10...v17 as mirrors; +;# preserving v8 = sign converter. 
+;# +;# Inverse transpose is similar, except here I -> (I+3) mod 7 and the +;# result lands in the "mirror" registers v10...v17 +;# +.macro t8x16_odd + Tpair v10, v11, v0, v4 + Tpair v12, v13, v1, v5 + Tpair v14, v15, v2, v6 + Tpair v16, v17, v3, v7 +.endm + +.macro t8x16_even + Tpair v0, v1, v10, v14 + Tpair v2, v3, v11, v15 + Tpair v4, v5, v12, v16 + Tpair v6, v7, v13, v17 +.endm + +.macro transpose8x16_fwd + t8x16_odd + t8x16_even + t8x16_odd + t8x16_even +.endm + +.macro transpose8x16_inv + t8x16_odd + t8x16_even + t8x16_odd +.endm + +.macro Transpose16x16 + vmrghb v0, v16, v24 + vmrglb v1, v16, v24 + vmrghb v2, v17, v25 + vmrglb v3, v17, v25 + vmrghb v4, v18, v26 + vmrglb v5, v18, v26 + vmrghb v6, v19, v27 + vmrglb v7, v19, v27 + vmrghb v8, v20, v28 + vmrglb v9, v20, v28 + vmrghb v10, v21, v29 + vmrglb v11, v21, v29 + vmrghb v12, v22, v30 + vmrglb v13, v22, v30 + vmrghb v14, v23, v31 + vmrglb v15, v23, v31 + vmrghb v16, v0, v8 + vmrglb v17, v0, v8 + vmrghb v18, v1, v9 + vmrglb v19, v1, v9 + vmrghb v20, v2, v10 + vmrglb v21, v2, v10 + vmrghb v22, v3, v11 + vmrglb v23, v3, v11 + vmrghb v24, v4, v12 + vmrglb v25, v4, v12 + vmrghb v26, v5, v13 + vmrglb v27, v5, v13 + vmrghb v28, v6, v14 + vmrglb v29, v6, v14 + vmrghb v30, v7, v15 + vmrglb v31, v7, v15 + vmrghb v0, v16, v24 + vmrglb v1, v16, v24 + vmrghb v2, v17, v25 + vmrglb v3, v17, v25 + vmrghb v4, v18, v26 + vmrglb v5, v18, v26 + vmrghb v6, v19, v27 + vmrglb v7, v19, v27 + vmrghb v8, v20, v28 + vmrglb v9, v20, v28 + vmrghb v10, v21, v29 + vmrglb v11, v21, v29 + vmrghb v12, v22, v30 + vmrglb v13, v22, v30 + vmrghb v14, v23, v31 + vmrglb v15, v23, v31 + vmrghb v16, v0, v8 + vmrglb v17, v0, v8 + vmrghb v18, v1, v9 + vmrglb v19, v1, v9 + vmrghb v20, v2, v10 + vmrglb v21, v2, v10 + vmrghb v22, v3, v11 + vmrglb v23, v3, v11 + vmrghb v24, v4, v12 + vmrglb v25, v4, v12 + vmrghb v26, v5, v13 + vmrglb v27, v5, v13 + vmrghb v28, v6, v14 + vmrglb v29, v6, v14 + vmrghb v30, v7, v15 + vmrglb v31, v7, v15 +.endm + +;# load_g loads a global vector (whose address is in the local variable Gptr) +;# into vector register Vreg. Trashes r0 +.macro load_g Vreg, Gptr + lwz r0, \Gptr + lvx \Vreg, 0, r0 +.endm + +;# exploit the saturation here. if the answer is negative +;# it will be clamped to 0. orring 0 with a positive +;# number will be the positive number (abs) +;# RES = abs( A-B), trashes TMP +.macro Abs RES, TMP, A, B + vsububs \RES, \A, \B + vsububs \TMP, \B, \A + vor \RES, \RES, \TMP +.endm + +;# RES = Max( RES, abs( A-B)), trashes TMP +.macro max_abs RES, TMP, A, B + vsububs \TMP, \A, \B + vmaxub \RES, \RES, \TMP + vsububs \TMP, \B, \A + vmaxub \RES, \RES, \TMP +.endm + +.macro Masks + ;# build masks + ;# input is all 8 bit unsigned (0-255). need to + ;# do abs(vala-valb) > limit. but no need to compare each + ;# value to the limit. find the max of the absolute differences + ;# and compare that to the limit. 
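+;#
+;# A hypothetical scalar restatement of what this macro computes per pel
+;# (names are illustrative only; the vector code keeps everything as
+;# "true if exceeded" masks so it can vand/vandc them later):
+;#
+;#   static unsigned char absdiff(unsigned char a, unsigned char b)
+;#   {
+;#       return a > b ? a - b : b - a;
+;#   }
+;#
+;#   static void build_masks(unsigned char p3, unsigned char p2,
+;#                           unsigned char p1, unsigned char p0,
+;#                           unsigned char q0, unsigned char q1,
+;#                           unsigned char q2, unsigned char q3,
+;#                           unsigned char flimit, unsigned char limit,
+;#                           unsigned char thresh,
+;#                           int *hev, int *filter_off)
+;#   {
+;#       unsigned char m = absdiff(p1, p0);
+;#       if (absdiff(q1, q0) > m) m = absdiff(q1, q0);
+;#       *hev = m > thresh;                    /* high edge variance */
+;#
+;#       if (absdiff(p3, p2) > m) m = absdiff(p3, p2);
+;#       if (absdiff(p2, p1) > m) m = absdiff(p2, p1);
+;#       if (absdiff(q2, q1) > m) m = absdiff(q2, q1);
+;#       if (absdiff(q3, q2) > m) m = absdiff(q3, q2);
+;#
+;#       /* filtering is switched off when either limit is exceeded */
+;#       *filter_off = (m > limit) || (absdiff(p0, q0) > flimit);
+;#   }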
+ ;# First hev + Abs v14, v13, v2, v3 ;# |P1 - P0| + max_abs v14, v13, v5, v4 ;# |Q1 - Q0| + + vcmpgtub v10, v14, v10 ;# HEV = true if thresh exceeded + + ;# Next limit + max_abs v14, v13, v0, v1 ;# |P3 - P2| + max_abs v14, v13, v1, v2 ;# |P2 - P1| + max_abs v14, v13, v6, v5 ;# |Q2 - Q1| + max_abs v14, v13, v7, v6 ;# |Q3 - Q2| + + vcmpgtub v9, v14, v9 ;# R = true if limit exceeded + + ;# flimit + Abs v14, v13, v3, v4 ;# |P0 - Q0| + + vcmpgtub v8, v14, v8 ;# X = true if flimit exceeded + + vor v8, v8, v9 ;# R = true if flimit or limit exceeded + ;# done building masks +.endm + +.macro build_constants RFL, RLI, RTH, FL, LI, TH + ;# build constants + lvx \FL, 0, \RFL ;# flimit + lvx \LI, 0, \RLI ;# limit + lvx \TH, 0, \RTH ;# thresh + + vspltisb v11, 8 + vspltisb v12, 4 + vslb v11, v11, v12 ;# 0x80808080808080808080808080808080 +.endm + +.macro load_data_y + ;# setup strides/pointers to be able to access + ;# all of the data + add r5, r4, r4 ;# r5 = 2 * stride + sub r6, r3, r5 ;# r6 -> 2 rows back + neg r7, r4 ;# r7 = -stride + + ;# load 16 pixels worth of data to work on + sub r0, r6, r5 ;# r0 -> 4 rows back (temp) + lvx v0, 0, r0 ;# P3 (read only) + lvx v1, r7, r6 ;# P2 + lvx v2, 0, r6 ;# P1 + lvx v3, r7, r3 ;# P0 + lvx v4, 0, r3 ;# Q0 + lvx v5, r4, r3 ;# Q1 + lvx v6, r5, r3 ;# Q2 + add r0, r3, r5 ;# r0 -> 2 rows fwd (temp) + lvx v7, r4, r0 ;# Q3 (read only) +.endm + +;# Expects +;# v10 == HEV +;# v13 == tmp +;# v14 == tmp +.macro common_adjust P0, Q0, P1, Q1, HEV_PRESENT + vxor \P1, \P1, v11 ;# SP1 + vxor \P0, \P0, v11 ;# SP0 + vxor \Q0, \Q0, v11 ;# SQ0 + vxor \Q1, \Q1, v11 ;# SQ1 + + vsubsbs v13, \P1, \Q1 ;# f = c (P1 - Q1) +.if \HEV_PRESENT + vand v13, v13, v10 ;# f &= hev +.endif + vsubsbs v14, \Q0, \P0 ;# -126 <= X = Q0-P0 <= +126 + vaddsbs v13, v13, v14 + vaddsbs v13, v13, v14 + vaddsbs v13, v13, v14 ;# A = c( c(P1-Q1) + 3*(Q0-P0)) + + vandc v13, v13, v8 ;# f &= mask + + vspltisb v8, 3 + vspltisb v9, 4 + + vaddsbs v14, v13, v9 ;# f1 = c (f+4) + vaddsbs v15, v13, v8 ;# f2 = c (f+3) + + vsrab v13, v14, v8 ;# f1 >>= 3 + vsrab v15, v15, v8 ;# f2 >>= 3 + + vsubsbs \Q0, \Q0, v13 ;# u1 = c (SQ0 - f1) + vaddsbs \P0, \P0, v15 ;# u2 = c (SP0 + f2) +.endm + +.macro vp8_mbfilter + Masks + + ;# start the fitering here + vxor v1, v1, v11 ;# SP2 + vxor v2, v2, v11 ;# SP1 + vxor v3, v3, v11 ;# SP0 + vxor v4, v4, v11 ;# SQ0 + vxor v5, v5, v11 ;# SQ1 + vxor v6, v6, v11 ;# SQ2 + + ;# add outer taps if we have high edge variance + vsubsbs v13, v2, v5 ;# f = c (SP1-SQ1) + + vsubsbs v14, v4, v3 ;# SQ0-SP0 + vaddsbs v13, v13, v14 + vaddsbs v13, v13, v14 + vaddsbs v13, v13, v14 ;# f = c( c(SP1-SQ1) + 3*(SQ0-SP0)) + + vandc v13, v13, v8 ;# f &= mask + vand v15, v13, v10 ;# f2 = f & hev + + ;# save bottom 3 bits so that we round one side +4 and the other +3 + vspltisb v8, 3 + vspltisb v9, 4 + + vaddsbs v14, v15, v9 ;# f1 = c (f+4) + vaddsbs v15, v15, v8 ;# f2 = c (f+3) + + vsrab v14, v14, v8 ;# f1 >>= 3 + vsrab v15, v15, v8 ;# f2 >>= 3 + + vsubsbs v4, v4, v14 ;# u1 = c (SQ0 - f1) + vaddsbs v3, v3, v15 ;# u2 = c (SP0 + f2) + + ;# only apply wider filter if not high edge variance + vandc v13, v13, v10 ;# f &= ~hev + + vspltisb v9, 2 + vnor v8, v8, v8 + vsrb v9, v8, v9 ;# 0x3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f + vupkhsb v9, v9 ;# 0x003f003f003f003f003f003f003f003f + vspltisb v8, 9 + + ;# roughly 1/7th difference across boundary + vspltish v10, 7 + vmulosb v14, v8, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0)) + vmulesb v15, v8, v13 + vaddshs v14, v14, v9 ;# += 63 + vaddshs v15, v15, v9 + vsrah v14, v14, v10 ;# >>= 7 + vsrah 
v15, v15, v10 + vmrglh v10, v15, v14 + vmrghh v15, v15, v14 + + vpkshss v10, v15, v10 ;# X = saturated down to bytes + + vsubsbs v6, v6, v10 ;# subtract from Q and add to P + vaddsbs v1, v1, v10 + + vxor v6, v6, v11 + vxor v1, v1, v11 + + ;# roughly 2/7th difference across boundary + vspltish v10, 7 + vaddubm v12, v8, v8 + vmulosb v14, v12, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0)) + vmulesb v15, v12, v13 + vaddshs v14, v14, v9 + vaddshs v15, v15, v9 + vsrah v14, v14, v10 ;# >>= 7 + vsrah v15, v15, v10 + vmrglh v10, v15, v14 + vmrghh v15, v15, v14 + + vpkshss v10, v15, v10 ;# X = saturated down to bytes + + vsubsbs v5, v5, v10 ;# subtract from Q and add to P + vaddsbs v2, v2, v10 + + vxor v5, v5, v11 + vxor v2, v2, v11 + + ;# roughly 3/7th difference across boundary + vspltish v10, 7 + vaddubm v12, v12, v8 + vmulosb v14, v12, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0)) + vmulesb v15, v12, v13 + vaddshs v14, v14, v9 + vaddshs v15, v15, v9 + vsrah v14, v14, v10 ;# >>= 7 + vsrah v15, v15, v10 + vmrglh v10, v15, v14 + vmrghh v15, v15, v14 + + vpkshss v10, v15, v10 ;# X = saturated down to bytes + + vsubsbs v4, v4, v10 ;# subtract from Q and add to P + vaddsbs v3, v3, v10 + + vxor v4, v4, v11 + vxor v3, v3, v11 +.endm + +.macro SBFilter + Masks + + common_adjust v3, v4, v2, v5, 1 + + ;# outer tap adjustments + vspltisb v8, 1 + + vaddubm v13, v13, v8 ;# f += 1 + vsrab v13, v13, v8 ;# f >>= 1 + + vandc v13, v13, v10 ;# f &= ~hev + + vsubsbs v5, v5, v13 ;# u1 = c (SQ1 - f) + vaddsbs v2, v2, v13 ;# u2 = c (SP1 + f) + + vxor v2, v2, v11 + vxor v3, v3, v11 + vxor v4, v4, v11 + vxor v5, v5, v11 +.endm + + .align 2 +mbloop_filter_horizontal_edge_y_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + mtspr 256, r12 ;# set VRSAVE + + build_constants r5, r6, r7, v8, v9, v10 + + load_data_y + + vp8_mbfilter + + stvx v1, r7, r6 ;# P2 + stvx v2, 0, r6 ;# P1 + stvx v3, r7, r3 ;# P0 + stvx v4, 0, r3 ;# Q0 + stvx v5, r4, r3 ;# Q1 + stvx v6, r5, r3 ;# Q2 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 2 +;# r3 unsigned char *s +;# r4 int p +;# r5 const signed char *flimit +;# r6 const signed char *limit +;# r7 const signed char *thresh +loop_filter_horizontal_edge_y_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + mtspr 256, r12 ;# set VRSAVE + + build_constants r5, r6, r7, v8, v9, v10 + + load_data_y + + SBFilter + + stvx v2, 0, r6 ;# P1 + stvx v3, r7, r3 ;# P0 + stvx v4, 0, r3 ;# Q0 + stvx v5, r4, r3 ;# Q1 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + +;# Filtering a vertical mb. Each mb is aligned on a 16 byte boundary. +;# So we can read in an entire mb aligned. However if we want to filter the mb +;# edge we run into problems. For the loopfilter we require 4 bytes before the mb +;# and 4 after for a total of 8 bytes. Reading 16 bytes inorder to get 4 is a bit +;# of a waste. So this is an even uglier way to get around that. +;# Using the regular register file words are read in and then saved back out to +;# memory to align and order them up. Then they are read in using the +;# vector register file. 
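+;#
+;# A rough C sketch of that staging trick (hypothetical helper, not part
+;# of the build): two rows of 4+4 pels straddling the edge are copied with
+;# scalar stores into a 16-byte aligned scratch buffer, which the vector
+;# unit then picks up with a single aligned lvx:
+;#
+;#   #include <string.h>
+;#
+;#   /* s points at the edge; buf must be 16-byte aligned */
+;#   static void stage_two_rows(const unsigned char *s, int pitch,
+;#                              unsigned char buf[16])
+;#   {
+;#       memcpy(buf +  0, s - 4,         4);   /* 4 pels before the edge */
+;#       memcpy(buf +  4, s,             4);   /* 4 pels after the edge  */
+;#       memcpy(buf +  8, s - 4 + pitch, 4);   /* next row               */
+;#       memcpy(buf + 12, s + pitch,     4);
+;#   }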
+.macro RLVmb V, R + lwzux r0, r3, r4 + stw r0, 4(\R) + lwz r0,-4(r3) + stw r0, 0(\R) + lwzux r0, r3, r4 + stw r0,12(\R) + lwz r0,-4(r3) + stw r0, 8(\R) + lvx \V, 0, \R +.endm + +.macro WLVmb V, R + stvx \V, 0, \R + lwz r0,12(\R) + stwux r0, r3, r4 + lwz r0, 8(\R) + stw r0,-4(r3) + lwz r0, 4(\R) + stwux r0, r3, r4 + lwz r0, 0(\R) + stw r0,-4(r3) +.endm + + .align 2 +;# r3 unsigned char *s +;# r4 int p +;# r5 const signed char *flimit +;# r6 const signed char *limit +;# r7 const signed char *thresh +mbloop_filter_vertical_edge_y_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xc000 + mtspr 256, r12 ;# set VRSAVE + + la r9, -48(r1) ;# temporary space for reading in vectors + sub r3, r3, r4 + + RLVmb v0, r9 + RLVmb v1, r9 + RLVmb v2, r9 + RLVmb v3, r9 + RLVmb v4, r9 + RLVmb v5, r9 + RLVmb v6, r9 + RLVmb v7, r9 + + transpose8x16_fwd + + build_constants r5, r6, r7, v8, v9, v10 + + vp8_mbfilter + + transpose8x16_inv + + add r3, r3, r4 + neg r4, r4 + + WLVmb v17, r9 + WLVmb v16, r9 + WLVmb v15, r9 + WLVmb v14, r9 + WLVmb v13, r9 + WLVmb v12, r9 + WLVmb v11, r9 + WLVmb v10, r9 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + +.macro RL V, R, P + lvx \V, 0, \R + add \R, \R, \P +.endm + +.macro WL V, R, P + stvx \V, 0, \R + add \R, \R, \P +.endm + +.macro Fil P3, P2, P1, P0, Q0, Q1, Q2, Q3 + ;# K = |P0-P1| already + Abs v14, v13, \Q0, \Q1 ;# M = |Q0-Q1| + vmaxub v14, v14, v4 ;# M = max( |P0-P1|, |Q0-Q1|) + vcmpgtub v10, v14, v0 + + Abs v4, v5, \Q2, \Q3 ;# K = |Q2-Q3| = next |P0-P1] + + max_abs v14, v13, \Q1, \Q2 ;# M = max( M, |Q1-Q2|) + max_abs v14, v13, \P1, \P2 ;# M = max( M, |P1-P2|) + max_abs v14, v13, \P2, \P3 ;# M = max( M, |P2-P3|) + + vmaxub v14, v14, v4 ;# M = max interior abs diff + vcmpgtub v9, v14, v2 ;# M = true if int_l exceeded + + Abs v14, v13, \P0, \Q0 ;# X = Abs( P0-Q0) + vcmpgtub v8, v14, v3 ;# X = true if edge_l exceeded + vor v8, v8, v9 ;# M = true if edge_l or int_l exceeded + + ;# replace P1,Q1 w/signed versions + common_adjust \P0, \Q0, \P1, \Q1, 1 + + vaddubm v13, v13, v1 ;# -16 <= M <= 15, saturation irrelevant + vsrab v13, v13, v1 + vandc v13, v13, v10 ;# adjust P1,Q1 by (M+1)>>1 if ! 
hev + vsubsbs \Q1, \Q1, v13 + vaddsbs \P1, \P1, v13 + + vxor \P1, \P1, v11 ;# P1 + vxor \P0, \P0, v11 ;# P0 + vxor \Q0, \Q0, v11 ;# Q0 + vxor \Q1, \Q1, v11 ;# Q1 +.endm + + + .align 2 +;# r3 unsigned char *s +;# r4 int p +;# r5 const signed char *flimit +;# r6 const signed char *limit +;# r7 const signed char *thresh +loop_filter_vertical_edge_y_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xffff + mtspr 256, r12 ;# set VRSAVE + + addi r9, r3, 0 + RL v16, r9, r4 + RL v17, r9, r4 + RL v18, r9, r4 + RL v19, r9, r4 + RL v20, r9, r4 + RL v21, r9, r4 + RL v22, r9, r4 + RL v23, r9, r4 + RL v24, r9, r4 + RL v25, r9, r4 + RL v26, r9, r4 + RL v27, r9, r4 + RL v28, r9, r4 + RL v29, r9, r4 + RL v30, r9, r4 + lvx v31, 0, r9 + + Transpose16x16 + + vspltisb v1, 1 + + build_constants r5, r6, r7, v3, v2, v0 + + Abs v4, v5, v19, v18 ;# K(v14) = first |P0-P1| + + Fil v16, v17, v18, v19, v20, v21, v22, v23 + Fil v20, v21, v22, v23, v24, v25, v26, v27 + Fil v24, v25, v26, v27, v28, v29, v30, v31 + + Transpose16x16 + + addi r9, r3, 0 + WL v16, r9, r4 + WL v17, r9, r4 + WL v18, r9, r4 + WL v19, r9, r4 + WL v20, r9, r4 + WL v21, r9, r4 + WL v22, r9, r4 + WL v23, r9, r4 + WL v24, r9, r4 + WL v25, r9, r4 + WL v26, r9, r4 + WL v27, r9, r4 + WL v28, r9, r4 + WL v29, r9, r4 + WL v30, r9, r4 + stvx v31, 0, r9 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + +;# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- UV FILTERING -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +.macro active_chroma_sel V + andi. r7, r3, 8 ;# row origin modulo 16 + add r7, r7, r7 ;# selects selectors + lis r12, _chromaSelectors@ha + la r0, _chromaSelectors@l(r12) + lwzux r0, r7, r0 ;# leave selector addr in r7 + + lvx \V, 0, r0 ;# mask to concatenate active U,V pels +.endm + +.macro hread_uv Dest, U, V, Offs, VMask + lvx \U, \Offs, r3 + lvx \V, \Offs, r4 + vperm \Dest, \U, \V, \VMask ;# Dest = active part of U then V +.endm + +.macro hwrite_uv New, U, V, Offs, Umask, Vmask + vperm \U, \New, \U, \Umask ;# Combine new pels with siblings + vperm \V, \New, \V, \Vmask + stvx \U, \Offs, r3 ;# Write to frame buffer + stvx \V, \Offs, r4 +.endm + +;# Process U,V in parallel. 
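+;#
+;# A hypothetical scalar sketch of the packing idea (illustrative names
+;# only): the active 8 pels of a U row and the active 8 pels of the same
+;# V row are concatenated into one 16-byte lane, so a single 16-wide
+;# filter pass covers both chroma planes; the selectors above decide which
+;# half of each 16-byte row is "active" and scatter the results back.
+;#
+;#   static void pack_uv(const unsigned char *u, const unsigned char *v,
+;#                       unsigned char both[16])
+;#   {
+;#       int i;
+;#       for (i = 0; i < 8; i++) {
+;#           both[i]     = u[i];   /* active half of the U row */
+;#           both[8 + i] = v[i];   /* active half of the V row */
+;#       }
+;#   }
+;#
+;#   static void unpack_uv(const unsigned char both[16],
+;#                         unsigned char *u, unsigned char *v)
+;#   {
+;#       int i;
+;#       for (i = 0; i < 8; i++) {
+;#           u[i] = both[i];
+;#           v[i] = both[8 + i];
+;#       }
+;#   }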
+.macro load_chroma_h + neg r9, r5 ;# r9 = -1 * stride + add r8, r9, r9 ;# r8 = -2 * stride + add r10, r5, r5 ;# r10 = 2 * stride + + active_chroma_sel v12 + + ;# P3, Q3 are read-only; need not save addresses or sibling pels + add r6, r8, r8 ;# r6 = -4 * stride + hread_uv v0, v14, v15, r6, v12 + add r6, r10, r5 ;# r6 = 3 * stride + hread_uv v7, v14, v15, r6, v12 + + ;# Others are read/write; save addresses and sibling pels + + add r6, r8, r9 ;# r6 = -3 * stride + hread_uv v1, v16, v17, r6, v12 + hread_uv v2, v18, v19, r8, v12 + hread_uv v3, v20, v21, r9, v12 + hread_uv v4, v22, v23, 0, v12 + hread_uv v5, v24, v25, r5, v12 + hread_uv v6, v26, v27, r10, v12 +.endm + +.macro uresult_sel V + load_g \V, 4(r7) +.endm + +.macro vresult_sel V + load_g \V, 8(r7) +.endm + +;# always write P1,P0,Q0,Q1 +.macro store_chroma_h + uresult_sel v11 + vresult_sel v12 + hwrite_uv v2, v18, v19, r8, v11, v12 + hwrite_uv v3, v20, v21, r9, v11, v12 + hwrite_uv v4, v22, v23, 0, v11, v12 + hwrite_uv v5, v24, v25, r5, v11, v12 +.endm + + .align 2 +;# r3 unsigned char *u +;# r4 unsigned char *v +;# r5 int p +;# r6 const signed char *flimit +;# r7 const signed char *limit +;# r8 const signed char *thresh +mbloop_filter_horizontal_edge_uv_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xffff + mtspr 256, r12 ;# set VRSAVE + + build_constants r6, r7, r8, v8, v9, v10 + + load_chroma_h + + vp8_mbfilter + + store_chroma_h + + hwrite_uv v1, v16, v17, r6, v11, v12 ;# v1 == P2 + hwrite_uv v6, v26, v27, r10, v11, v12 ;# v6 == Q2 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 2 +;# r3 unsigned char *u +;# r4 unsigned char *v +;# r5 int p +;# r6 const signed char *flimit +;# r7 const signed char *limit +;# r8 const signed char *thresh +loop_filter_horizontal_edge_uv_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xffff + mtspr 256, r12 ;# set VRSAVE + + build_constants r6, r7, r8, v8, v9, v10 + + load_chroma_h + + SBFilter + + store_chroma_h + + mtspr 256, r11 ;# reset old VRSAVE + + blr + +.macro R V, R + lwzux r0, r3, r5 + stw r0, 4(\R) + lwz r0,-4(r3) + stw r0, 0(\R) + lwzux r0, r4, r5 + stw r0,12(\R) + lwz r0,-4(r4) + stw r0, 8(\R) + lvx \V, 0, \R +.endm + + +.macro W V, R + stvx \V, 0, \R + lwz r0,12(\R) + stwux r0, r4, r5 + lwz r0, 8(\R) + stw r0,-4(r4) + lwz r0, 4(\R) + stwux r0, r3, r5 + lwz r0, 0(\R) + stw r0,-4(r3) +.endm + +.macro chroma_vread R + sub r3, r3, r5 ;# back up one line for simplicity + sub r4, r4, r5 + + R v0, \R + R v1, \R + R v2, \R + R v3, \R + R v4, \R + R v5, \R + R v6, \R + R v7, \R + + transpose8x16_fwd +.endm + +.macro chroma_vwrite R + + transpose8x16_inv + + add r3, r3, r5 + add r4, r4, r5 + neg r5, r5 ;# Write rows back in reverse order + + W v17, \R + W v16, \R + W v15, \R + W v14, \R + W v13, \R + W v12, \R + W v11, \R + W v10, \R +.endm + + .align 2 +;# r3 unsigned char *u +;# r4 unsigned char *v +;# r5 int p +;# r6 const signed char *flimit +;# r7 const signed char *limit +;# r8 const signed char *thresh +mbloop_filter_vertical_edge_uv_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xc000 + mtspr 256, r12 ;# set VRSAVE + + la r9, -48(r1) ;# temporary space for reading in vectors + + chroma_vread r9 + + build_constants r6, r7, r8, v8, v9, v10 + + vp8_mbfilter + + chroma_vwrite r9 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 2 +;# r3 unsigned char *u +;# r4 unsigned char *v +;# r5 int p +;# r6 const signed char *flimit +;# r7 const signed char *limit +;# r8 const signed 
char *thresh +loop_filter_vertical_edge_uv_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xc000 + mtspr 256, r12 ;# set VRSAVE + + la r9, -48(r1) ;# temporary space for reading in vectors + + chroma_vread r9 + + build_constants r6, r7, r8, v8, v9, v10 + + SBFilter + + chroma_vwrite r9 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + +;# -=-=-=-=-=-=-=-=-=-=-=-=-=-= SIMPLE LOOP FILTER =-=-=-=-=-=-=-=-=-=-=-=-=-=- + +.macro vp8_simple_filter + Abs v14, v13, v1, v2 ;# M = abs( P0 - Q0) + vcmpgtub v8, v14, v8 ;# v5 = true if _over_ limit + + ;# preserve unsigned v0 and v3 + common_adjust v1, v2, v0, v3, 0 + + vxor v1, v1, v11 + vxor v2, v2, v11 ;# cvt Q0, P0 back to pels +.endm + +.macro simple_vertical + addi r8, 0, 16 + addi r7, r5, 32 + + lvx v0, 0, r5 + lvx v1, r8, r5 + lvx v2, 0, r7 + lvx v3, r8, r7 + + lis r12, _B_hihi@ha + la r0, _B_hihi@l(r12) + lvx v16, 0, r0 + + lis r12, _B_lolo@ha + la r0, _B_lolo@l(r12) + lvx v17, 0, r0 + + Transpose4times4x4 v16, v17 + vp8_simple_filter + + vxor v0, v0, v11 + vxor v3, v3, v11 ;# cvt Q0, P0 back to pels + + Transpose4times4x4 v16, v17 + + stvx v0, 0, r5 + stvx v1, r8, r5 + stvx v2, 0, r7 + stvx v3, r8, r7 +.endm + + .align 2 +;# r3 unsigned char *s +;# r4 int p +;# r5 const signed char *flimit +loop_filter_simple_horizontal_edge_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + mtspr 256, r12 ;# set VRSAVE + + ;# build constants + lvx v8, 0, r5 ;# flimit + + vspltisb v11, 8 + vspltisb v12, 4 + vslb v11, v11, v12 ;# 0x80808080808080808080808080808080 + + neg r5, r4 ;# r5 = -1 * stride + add r6, r5, r5 ;# r6 = -2 * stride + + lvx v0, r6, r3 ;# v0 = P1 = 16 pels two rows above edge + lvx v1, r5, r3 ;# v1 = P0 = 16 pels one row above edge + lvx v2, 0, r3 ;# v2 = Q0 = 16 pels one row below edge + lvx v3, r4, r3 ;# v3 = Q1 = 16 pels two rows below edge + + vp8_simple_filter + + stvx v1, r5, r3 ;# store P0 + stvx v2, 0, r3 ;# store Q0 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + +.macro RLV Offs + stw r0, (\Offs*4)(r5) + lwzux r0, r7, r4 +.endm + +.macro WLV Offs + lwz r0, (\Offs*4)(r5) + stwux r0, r7, r4 +.endm + + .align 2 +;# r3 unsigned char *s +;# r4 int p +;# r5 const signed char *flimit +loop_filter_simple_vertical_edge_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xc000 + mtspr 256, r12 ;# set VRSAVE + + ;# build constants + lvx v8, 0, r5 ;# flimit + + vspltisb v11, 8 + vspltisb v12, 4 + vslb v11, v11, v12 ;# 0x80808080808080808080808080808080 + + la r5, -96(r1) ;# temporary space for reading in vectors + + ;# Store 4 pels at word "Offs" in temp array, then advance r7 + ;# to next row and read another 4 pels from the frame buffer. + + subi r7, r3, 2 ;# r7 -> 2 pels before start + lwzx r0, 0, r7 ;# read first 4 pels + + ;# 16 unaligned word accesses + RLV 0 + RLV 4 + RLV 8 + RLV 12 + RLV 1 + RLV 5 + RLV 9 + RLV 13 + RLV 2 + RLV 6 + RLV 10 + RLV 14 + RLV 3 + RLV 7 + RLV 11 + + stw r0, (15*4)(r5) ;# write last 4 pels + + simple_vertical + + ;# Read temp array, write frame buffer. 
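+;#
+;# A hypothetical C model of the RLV/WLV word ordering (helper names are
+;# illustrative): row n of the frame buffer lands at temp word
+;# (n % 4) * 4 + n / 4, so each 16-byte vector collects one 4-pel word
+;# from four different rows, which is exactly the layout expected by
+;# Transpose4times4x4.  The mapping is its own inverse, so the same
+;# ordering serves the write-back below.
+;#
+;#   #include <string.h>
+;#
+;#   static int temp_word_for_row(int row)       /* row in 0..15 */
+;#   {
+;#       return (row % 4) * 4 + row / 4;         /* self-inverse */
+;#   }
+;#
+;#   static void gather_rows(const unsigned char *s, int pitch,
+;#                           unsigned char temp[64])
+;#   {
+;#       int row;
+;#       for (row = 0; row < 16; row++)          /* 2 pels before the edge */
+;#           memcpy(temp + 4 * temp_word_for_row(row),
+;#                  s - 2 + row * pitch, 4);
+;#   }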
+ subi r7, r3, 2 ;# r7 -> 2 pels before start + lwzx r0, 0, r5 ;# read/write first 4 pels + stwx r0, 0, r7 + + WLV 4 + WLV 8 + WLV 12 + WLV 1 + WLV 5 + WLV 9 + WLV 13 + WLV 2 + WLV 6 + WLV 10 + WLV 14 + WLV 3 + WLV 7 + WLV 11 + WLV 15 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .data + +_chromaSelectors: + .long _B_hihi + .long _B_Ures0 + .long _B_Vres0 + .long 0 + .long _B_lolo + .long _B_Ures8 + .long _B_Vres8 + .long 0 + + .align 4 +_B_Vres8: + .byte 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 + + .align 4 +_B_Ures8: + .byte 16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7 + + .align 4 +_B_lolo: + .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 + + .align 4 +_B_Vres0: + .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 + .align 4 +_B_Ures0: + .byte 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 + + .align 4 +_B_hihi: + .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 diff --git a/vp8/common/ppc/platform_altivec.asm b/vp8/common/ppc/platform_altivec.asm new file mode 100644 index 000000000..f81d86f74 --- /dev/null +++ b/vp8/common/ppc/platform_altivec.asm @@ -0,0 +1,59 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + .globl save_platform_context + .globl restore_platform_context + +.macro W V P + stvx \V, 0, \P + addi \P, \P, 16 +.endm + +.macro R V P + lvx \V, 0, \P + addi \P, \P, 16 +.endm + +;# r3 context_ptr + .align 2 +save_platform_contex: + W v20, r3 + W v21, r3 + W v22, r3 + W v23, r3 + W v24, r3 + W v25, r3 + W v26, r3 + W v27, r3 + W v28, r3 + W v29, r3 + W v30, r3 + W v31, r3 + + blr + +;# r3 context_ptr + .align 2 +restore_platform_context: + R v20, r3 + R v21, r3 + R v22, r3 + R v23, r3 + R v24, r3 + R v25, r3 + R v26, r3 + R v27, r3 + R v28, r3 + R v29, r3 + R v30, r3 + R v31, r3 + + blr diff --git a/vp8/common/ppc/recon_altivec.asm b/vp8/common/ppc/recon_altivec.asm new file mode 100644 index 000000000..dd39e05a8 --- /dev/null +++ b/vp8/common/ppc/recon_altivec.asm @@ -0,0 +1,175 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + .globl recon4b_ppc + .globl recon2b_ppc + .globl recon_b_ppc + +.macro row_of16 Diff Pred Dst Stride + lvx v1, 0, \Pred ;# v1 = pred = p0..p15 + addi \Pred, \Pred, 16 ;# next pred + vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7 + lvx v3, 0, \Diff ;# v3 = d0..d7 + vaddshs v2, v2, v3 ;# v2 = r0..r7 + vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15 + lvx v3, r8, \Diff ;# v3 = d8..d15 + addi \Diff, \Diff, 32 ;# next diff + vaddshs v3, v3, v1 ;# v3 = r8..r15 + vpkshus v2, v2, v3 ;# v2 = 8-bit r0..r15 + stvx v2, 0, \Dst ;# to dst + add \Dst, \Dst, \Stride ;# next dst +.endm + + .text + .align 2 +;# r3 = short *diff_ptr, +;# r4 = unsigned char *pred_ptr, +;# r5 = unsigned char *dst_ptr, +;# r6 = int stride +recon4b_ppc: + mfspr r0, 256 ;# get old VRSAVE + stw r0, -8(r1) ;# save old VRSAVE to stack + oris r0, r0, 0xf000 + mtspr 256,r0 ;# set VRSAVE + + vxor v0, v0, v0 + li r8, 16 + + row_of16 r3, r4, r5, r6 + row_of16 r3, r4, r5, r6 + row_of16 r3, r4, r5, r6 + row_of16 r3, r4, r5, r6 + + lwz r12, -8(r1) ;# restore old VRSAVE from stack + mtspr 256, r12 ;# reset old VRSAVE + + blr + +.macro two_rows_of8 Diff Pred Dst Stride write_first_four_pels + lvx v1, 0, \Pred ;# v1 = pred = p0..p15 + vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7 + lvx v3, 0, \Diff ;# v3 = d0..d7 + vaddshs v2, v2, v3 ;# v2 = r0..r7 + vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15 + lvx v3, r8, \Diff ;# v2 = d8..d15 + vaddshs v3, v3, v1 ;# v3 = r8..r15 + vpkshus v2, v2, v3 ;# v3 = 8-bit r0..r15 + stvx v2, 0, r10 ;# 2 rows to dst from buf + lwz r0, 0(r10) +.if \write_first_four_pels + stw r0, 0(\Dst) + .else + stwux r0, \Dst, \Stride +.endif + lwz r0, 4(r10) + stw r0, 4(\Dst) + lwz r0, 8(r10) + stwux r0, \Dst, \Stride ;# advance dst to next row + lwz r0, 12(r10) + stw r0, 4(\Dst) +.endm + + .align 2 +;# r3 = short *diff_ptr, +;# r4 = unsigned char *pred_ptr, +;# r5 = unsigned char *dst_ptr, +;# r6 = int stride + +recon2b_ppc: + mfspr r0, 256 ;# get old VRSAVE + stw r0, -8(r1) ;# save old VRSAVE to stack + oris r0, r0, 0xf000 + mtspr 256,r0 ;# set VRSAVE + + vxor v0, v0, v0 + li r8, 16 + + la r10, -48(r1) ;# buf + + two_rows_of8 r3, r4, r5, r6, 1 + + addi r4, r4, 16; ;# next pred + addi r3, r3, 32; ;# next diff + + two_rows_of8 r3, r4, r5, r6, 0 + + lwz r12, -8(r1) ;# restore old VRSAVE from stack + mtspr 256, r12 ;# reset old VRSAVE + + blr + +.macro get_two_diff_rows + stw r0, 0(r10) + lwz r0, 4(r3) + stw r0, 4(r10) + lwzu r0, 32(r3) + stw r0, 8(r10) + lwz r0, 4(r3) + stw r0, 12(r10) + lvx v3, 0, r10 +.endm + + .align 2 +;# r3 = short *diff_ptr, +;# r4 = unsigned char *pred_ptr, +;# r5 = unsigned char *dst_ptr, +;# r6 = int stride +recon_b_ppc: + mfspr r0, 256 ;# get old VRSAVE + stw r0, -8(r1) ;# save old VRSAVE to stack + oris r0, r0, 0xf000 + mtspr 256,r0 ;# set VRSAVE + + vxor v0, v0, v0 + + la r10, -48(r1) ;# buf + + lwz r0, 0(r4) + stw r0, 0(r10) + lwz r0, 16(r4) + stw r0, 4(r10) + lwz r0, 32(r4) + stw r0, 8(r10) + lwz r0, 48(r4) + stw r0, 12(r10) + + lvx v1, 0, r10; ;# v1 = pred = p0..p15 + + lwz r0, 0(r3) ;# v3 = d0..d7 + + get_two_diff_rows + + vmrghb v2, v0, v1; ;# v2 = 16-bit p0..p7 + vaddshs v2, v2, v3; ;# v2 = r0..r7 + + lwzu r0, 32(r3) ;# v3 = d8..d15 + + get_two_diff_rows + + vmrglb v1, v0, v1; ;# v1 = 16-bit p8..p15 + vaddshs v3, v3, v1; ;# v3 = r8..r15 + + vpkshus v2, v2, v3; ;# v2 = 8-bit r0..r15 + stvx v2, 0, r10; ;# 16 pels to dst from buf + + lwz r0, 0(r10) + stw r0, 0(r5) + lwz r0, 4(r10) + stwux r0, r5, r6 + lwz r0, 8(r10) + stwux r0, r5, r6 + lwz r0, 12(r10) + stwx r0, r5, r6 + + lwz r12, -8(r1) ;# restore old VRSAVE 
from stack + mtspr 256, r12 ;# reset old VRSAVE + + blr diff --git a/vp8/common/ppc/sad_altivec.asm b/vp8/common/ppc/sad_altivec.asm new file mode 100644 index 000000000..e5f26380f --- /dev/null +++ b/vp8/common/ppc/sad_altivec.asm @@ -0,0 +1,277 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + .globl vp8_sad16x16_ppc + .globl vp8_sad16x8_ppc + .globl vp8_sad8x16_ppc + .globl vp8_sad8x8_ppc + .globl vp8_sad4x4_ppc + +.macro load_aligned_16 V R O + lvsl v3, 0, \R ;# permutate value for alignment + + lvx v1, 0, \R + lvx v2, \O, \R + + vperm \V, v1, v2, v3 +.endm + +.macro prologue + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffc0 + mtspr 256, r12 ;# set VRSAVE + + stwu r1, -32(r1) ;# create space on the stack + + li r10, 16 ;# load offset and loop counter + + vspltisw v8, 0 ;# zero out total to start +.endm + +.macro epilogue + addi r1, r1, 32 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE +.endm + +.macro SAD_16 + ;# v6 = abs (v4 - v5) + vsububs v6, v4, v5 + vsububs v7, v5, v4 + vor v6, v6, v7 + + ;# v8 += abs (v4 - v5) + vsum4ubs v8, v6, v8 +.endm + +.macro sad_16_loop loop_label + lvsl v3, 0, r5 ;# only needs to be done once per block + + ;# preload a line of data before getting into the loop + lvx v4, 0, r3 + lvx v1, 0, r5 + lvx v2, r10, r5 + + add r5, r5, r6 + add r3, r3, r4 + + vperm v5, v1, v2, v3 + + .align 4 +\loop_label: + ;# compute difference on first row + vsububs v6, v4, v5 + vsububs v7, v5, v4 + + ;# load up next set of data + lvx v9, 0, r3 + lvx v1, 0, r5 + lvx v2, r10, r5 + + ;# perform abs() of difference + vor v6, v6, v7 + add r3, r3, r4 + + ;# add to the running tally + vsum4ubs v8, v6, v8 + + ;# now onto the next line + vperm v5, v1, v2, v3 + add r5, r5, r6 + lvx v4, 0, r3 + + ;# compute difference on second row + vsububs v6, v9, v5 + lvx v1, 0, r5 + vsububs v7, v5, v9 + lvx v2, r10, r5 + vor v6, v6, v7 + add r3, r3, r4 + vsum4ubs v8, v6, v8 + vperm v5, v1, v2, v3 + add r5, r5, r6 + + bdnz \loop_label + + vspltisw v7, 0 + + vsumsws v8, v8, v7 + + stvx v8, 0, r1 + lwz r3, 12(r1) +.endm + +.macro sad_8_loop loop_label + .align 4 +\loop_label: + ;# only one of the inputs should need to be aligned. + load_aligned_16 v4, r3, r10 + load_aligned_16 v5, r5, r10 + + ;# move onto the next line + add r3, r3, r4 + add r5, r5, r6 + + ;# only one of the inputs should need to be aligned. 
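+;#
+;# (For reference, a hypothetical scalar version of what SAD_16 and these
+;# loops accumulate per row; the vector code does the same 16 bytes at a
+;# time with vsububs/vor/vsum4ubs:)
+;#
+;#   static unsigned int sad_row(const unsigned char *src,
+;#                               const unsigned char *ref, int width)
+;#   {
+;#       unsigned int sad = 0;
+;#       int i;
+;#       for (i = 0; i < width; i++)
+;#           sad += src[i] > ref[i] ? src[i] - ref[i] : ref[i] - src[i];
+;#       return sad;
+;#   }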
+ load_aligned_16 v6, r3, r10 + load_aligned_16 v7, r5, r10 + + ;# move onto the next line + add r3, r3, r4 + add r5, r5, r6 + + vmrghb v4, v4, v6 + vmrghb v5, v5, v7 + + SAD_16 + + bdnz \loop_label + + vspltisw v7, 0 + + vsumsws v8, v8, v7 + + stvx v8, 0, r1 + lwz r3, 12(r1) +.endm + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int src_stride +;# r5 unsigned char *ref_ptr +;# r6 int ref_stride +;# +;# r3 return value +vp8_sad16x16_ppc: + + prologue + + li r9, 8 + mtctr r9 + + sad_16_loop sad16x16_loop + + epilogue + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int src_stride +;# r5 unsigned char *ref_ptr +;# r6 int ref_stride +;# +;# r3 return value +vp8_sad16x8_ppc: + + prologue + + li r9, 4 + mtctr r9 + + sad_16_loop sad16x8_loop + + epilogue + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int src_stride +;# r5 unsigned char *ref_ptr +;# r6 int ref_stride +;# +;# r3 return value +vp8_sad8x16_ppc: + + prologue + + li r9, 8 + mtctr r9 + + sad_8_loop sad8x16_loop + + epilogue + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int src_stride +;# r5 unsigned char *ref_ptr +;# r6 int ref_stride +;# +;# r3 return value +vp8_sad8x8_ppc: + + prologue + + li r9, 4 + mtctr r9 + + sad_8_loop sad8x8_loop + + epilogue + + blr + +.macro transfer_4x4 I P + lwz r0, 0(\I) + add \I, \I, \P + + lwz r7, 0(\I) + add \I, \I, \P + + lwz r8, 0(\I) + add \I, \I, \P + + lwz r9, 0(\I) + + stw r0, 0(r1) + stw r7, 4(r1) + stw r8, 8(r1) + stw r9, 12(r1) +.endm + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int src_stride +;# r5 unsigned char *ref_ptr +;# r6 int ref_stride +;# +;# r3 return value +vp8_sad4x4_ppc: + + prologue + + transfer_4x4 r3, r4 + lvx v4, 0, r1 + + transfer_4x4 r5, r6 + lvx v5, 0, r1 + + vspltisw v8, 0 ;# zero out total to start + + ;# v6 = abs (v4 - v5) + vsububs v6, v4, v5 + vsububs v7, v5, v4 + vor v6, v6, v7 + + ;# v8 += abs (v4 - v5) + vsum4ubs v7, v6, v8 + vsumsws v7, v7, v8 + + stvx v7, 0, r1 + lwz r3, 12(r1) + + epilogue + + blr diff --git a/vp8/common/ppc/systemdependent.c b/vp8/common/ppc/systemdependent.c new file mode 100644 index 000000000..87f4cac72 --- /dev/null +++ b/vp8/common/ppc/systemdependent.c @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#include "subpixel.h" +#include "loopfilter.h" +#include "recon.h" +#include "idct.h" +#include "onyxc_int.h" + +void (*vp8_short_idct4x4)(short *input, short *output, int pitch); +void (*vp8_short_idct4x4_1)(short *input, short *output, int pitch); +void (*vp8_dc_only_idct)(short input_dc, short *output, int pitch); + +extern void (*vp8_post_proc_down_and_across_mb_row)( + unsigned char *src_ptr, + unsigned char *dst_ptr, + int src_pixels_per_line, + int dst_pixels_per_line, + int cols, + unsigned char *f, + int size +); + +extern void (*vp8_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols, int flimit); +extern void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit); +extern void (*vp8_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols, int flimit); +extern void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit); + +extern void vp8_post_proc_down_and_across_mb_row_c +( + unsigned char *src_ptr, + unsigned char *dst_ptr, + int src_pixels_per_line, + int dst_pixels_per_line, + int cols, + unsigned char *f, + int size +); +void vp8_plane_add_noise_c(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a); + +extern copy_mem_block_function *vp8_copy_mem16x16; +extern copy_mem_block_function *vp8_copy_mem8x8; +extern copy_mem_block_function *vp8_copy_mem8x4; + +// PPC +extern subpixel_predict_function sixtap_predict_ppc; +extern subpixel_predict_function sixtap_predict8x4_ppc; +extern subpixel_predict_function sixtap_predict8x8_ppc; +extern subpixel_predict_function sixtap_predict16x16_ppc; +extern subpixel_predict_function bilinear_predict4x4_ppc; +extern subpixel_predict_function bilinear_predict8x4_ppc; +extern subpixel_predict_function bilinear_predict8x8_ppc; +extern subpixel_predict_function bilinear_predict16x16_ppc; + +extern copy_mem_block_function copy_mem16x16_ppc; + +void recon_b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); +void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); +void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); + +extern void short_idct4x4llm_ppc(short *input, short *output, int pitch); + +// Generic C +extern subpixel_predict_function vp8_sixtap_predict_c; +extern subpixel_predict_function vp8_sixtap_predict8x4_c; +extern subpixel_predict_function vp8_sixtap_predict8x8_c; +extern subpixel_predict_function vp8_sixtap_predict16x16_c; +extern subpixel_predict_function vp8_bilinear_predict4x4_c; +extern subpixel_predict_function vp8_bilinear_predict8x4_c; +extern subpixel_predict_function vp8_bilinear_predict8x8_c; +extern subpixel_predict_function vp8_bilinear_predict16x16_c; + +extern copy_mem_block_function vp8_copy_mem16x16_c; +extern copy_mem_block_function vp8_copy_mem8x8_c; +extern copy_mem_block_function vp8_copy_mem8x4_c; + +void vp8_recon_b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); +void vp8_recon2b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); +void vp8_recon4b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); + +extern void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch); +extern void vp8_short_idct4x4llm_c(short *input, short *output, int pitch); +extern void vp8_dc_only_idct_c(short input_dc, short *output, int pitch); + +// PPC +extern 
loop_filter_block_function loop_filter_mbv_ppc; +extern loop_filter_block_function loop_filter_bv_ppc; +extern loop_filter_block_function loop_filter_mbh_ppc; +extern loop_filter_block_function loop_filter_bh_ppc; + +extern loop_filter_block_function loop_filter_mbvs_ppc; +extern loop_filter_block_function loop_filter_bvs_ppc; +extern loop_filter_block_function loop_filter_mbhs_ppc; +extern loop_filter_block_function loop_filter_bhs_ppc; + +// Generic C +extern loop_filter_block_function vp8_loop_filter_mbv_c; +extern loop_filter_block_function vp8_loop_filter_bv_c; +extern loop_filter_block_function vp8_loop_filter_mbh_c; +extern loop_filter_block_function vp8_loop_filter_bh_c; + +extern loop_filter_block_function vp8_loop_filter_mbvs_c; +extern loop_filter_block_function vp8_loop_filter_bvs_c; +extern loop_filter_block_function vp8_loop_filter_mbhs_c; +extern loop_filter_block_function vp8_loop_filter_bhs_c; + +extern loop_filter_block_function *vp8_lf_mbvfull; +extern loop_filter_block_function *vp8_lf_mbhfull; +extern loop_filter_block_function *vp8_lf_bvfull; +extern loop_filter_block_function *vp8_lf_bhfull; + +extern loop_filter_block_function *vp8_lf_mbvsimple; +extern loop_filter_block_function *vp8_lf_mbhsimple; +extern loop_filter_block_function *vp8_lf_bvsimple; +extern loop_filter_block_function *vp8_lf_bhsimple; + +void vp8_clear_c(void) +{ +} + +void vp8_machine_specific_config(void) +{ + // Pure C: + vp8_clear_system_state = vp8_clear_c; + vp8_recon_b = vp8_recon_b_c; + vp8_recon4b = vp8_recon4b_c; + vp8_recon2b = vp8_recon2b_c; + + vp8_bilinear_predict16x16 = bilinear_predict16x16_ppc; + vp8_bilinear_predict8x8 = bilinear_predict8x8_ppc; + vp8_bilinear_predict8x4 = bilinear_predict8x4_ppc; + vp8_bilinear_predict = bilinear_predict4x4_ppc; + + vp8_sixtap_predict16x16 = sixtap_predict16x16_ppc; + vp8_sixtap_predict8x8 = sixtap_predict8x8_ppc; + vp8_sixtap_predict8x4 = sixtap_predict8x4_ppc; + vp8_sixtap_predict = sixtap_predict_ppc; + + vp8_short_idct4x4_1 = vp8_short_idct4x4llm_1_c; + vp8_short_idct4x4 = short_idct4x4llm_ppc; + vp8_dc_only_idct = vp8_dc_only_idct_c; + + vp8_lf_mbvfull = loop_filter_mbv_ppc; + vp8_lf_bvfull = loop_filter_bv_ppc; + vp8_lf_mbhfull = loop_filter_mbh_ppc; + vp8_lf_bhfull = loop_filter_bh_ppc; + + vp8_lf_mbvsimple = loop_filter_mbvs_ppc; + vp8_lf_bvsimple = loop_filter_bvs_ppc; + vp8_lf_mbhsimple = loop_filter_mbhs_ppc; + vp8_lf_bhsimple = loop_filter_bhs_ppc; + + vp8_post_proc_down_and_across_mb_row = vp8_post_proc_down_and_across_mb_row_c; + vp8_mbpost_proc_down = vp8_mbpost_proc_down_c; + vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_c; + vp8_plane_add_noise = vp8_plane_add_noise_c; + + vp8_copy_mem16x16 = copy_mem16x16_ppc; + vp8_copy_mem8x8 = vp8_copy_mem8x8_c; + vp8_copy_mem8x4 = vp8_copy_mem8x4_c; + +} diff --git a/vp8/common/ppc/variance_altivec.asm b/vp8/common/ppc/variance_altivec.asm new file mode 100644 index 000000000..fb8d5bb1d --- /dev/null +++ b/vp8/common/ppc/variance_altivec.asm @@ -0,0 +1,375 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + .globl vp8_get8x8var_ppc + .globl vp8_get16x16var_ppc + .globl vp8_mse16x16_ppc + .globl vp8_variance16x16_ppc + .globl vp8_variance16x8_ppc + .globl vp8_variance8x16_ppc + .globl vp8_variance8x8_ppc + .globl vp8_variance4x4_ppc + +.macro load_aligned_16 V R O + lvsl v3, 0, \R ;# permutate value for alignment + + lvx v1, 0, \R + lvx v2, \O, \R + + vperm \V, v1, v2, v3 +.endm + +.macro prologue + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffc0 + mtspr 256, r12 ;# set VRSAVE + + stwu r1, -32(r1) ;# create space on the stack + + li r10, 16 ;# load offset and loop counter + + vspltisw v7, 0 ;# zero for merging + vspltisw v8, 0 ;# zero out total to start + vspltisw v9, 0 ;# zero out total for dif^2 +.endm + +.macro epilogue + addi r1, r1, 32 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE +.endm + +.macro compute_sum_sse + ;# Compute sum first. Unpack to so signed subract + ;# can be used. Only have a half word signed + ;# subract. Do high, then low. + vmrghb v2, v7, v4 + vmrghb v3, v7, v5 + vsubshs v2, v2, v3 + vsum4shs v8, v2, v8 + + vmrglb v2, v7, v4 + vmrglb v3, v7, v5 + vsubshs v2, v2, v3 + vsum4shs v8, v2, v8 + + ;# Now compute sse. + vsububs v2, v4, v5 + vsububs v3, v5, v4 + vor v2, v2, v3 + + vmsumubm v9, v2, v2, v9 +.endm + +.macro variance_16 DS loop_label store_sum +\loop_label: + ;# only one of the inputs should need to be aligned. + load_aligned_16 v4, r3, r10 + load_aligned_16 v5, r5, r10 + + ;# move onto the next line + add r3, r3, r4 + add r5, r5, r6 + + compute_sum_sse + + bdnz \loop_label + + vsumsws v8, v8, v7 + vsumsws v9, v9, v7 + + stvx v8, 0, r1 + lwz r3, 12(r1) + + stvx v9, 0, r1 + lwz r4, 12(r1) + +.if \store_sum + stw r3, 0(r8) ;# sum +.endif + stw r4, 0(r7) ;# sse + + mullw r3, r3, r3 ;# sum*sum + srlwi r3, r3, \DS ;# (sum*sum) >> DS + subf r3, r3, r4 ;# sse - ((sum*sum) >> DS) +.endm + +.macro variance_8 DS loop_label store_sum +\loop_label: + ;# only one of the inputs should need to be aligned. + load_aligned_16 v4, r3, r10 + load_aligned_16 v5, r5, r10 + + ;# move onto the next line + add r3, r3, r4 + add r5, r5, r6 + + ;# only one of the inputs should need to be aligned. 
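+;#
+;# (For reference, a hypothetical scalar model of what the variance_8 and
+;# variance_16 macros compute; DS is log2 of the pel count -- 4 for 4x4,
+;# 6 for 8x8, 7 for 8x16/16x8 and 8 for 16x16:)
+;#
+;#   static unsigned int variance_ref(const unsigned char *src, int sstride,
+;#                                    const unsigned char *ref, int rstride,
+;#                                    int w, int h, int DS,
+;#                                    unsigned int *sse_out)
+;#   {
+;#       int sum = 0;
+;#       unsigned int sse = 0;
+;#       int x, y;
+;#       for (y = 0; y < h; y++)
+;#           for (x = 0; x < w; x++) {
+;#               int d = src[y * sstride + x] - ref[y * rstride + x];
+;#               sum += d;
+;#               sse += (unsigned int)(d * d);
+;#           }
+;#       *sse_out = sse;
+;#       /* low 32 bits of sum*sum, logically shifted, as mullw/srlwi do */
+;#       return sse - (((unsigned int)sum * (unsigned int)sum) >> DS);
+;#   }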
+ load_aligned_16 v6, r3, r10 + load_aligned_16 v0, r5, r10 + + ;# move onto the next line + add r3, r3, r4 + add r5, r5, r6 + + vmrghb v4, v4, v6 + vmrghb v5, v5, v0 + + compute_sum_sse + + bdnz \loop_label + + vsumsws v8, v8, v7 + vsumsws v9, v9, v7 + + stvx v8, 0, r1 + lwz r3, 12(r1) + + stvx v9, 0, r1 + lwz r4, 12(r1) + +.if \store_sum + stw r3, 0(r8) ;# sum +.endif + stw r4, 0(r7) ;# sse + + mullw r3, r3, r3 ;# sum*sum + srlwi r3, r3, \DS ;# (sum*sum) >> 8 + subf r3, r3, r4 ;# sse - ((sum*sum) >> 8) +.endm + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int source_stride +;# r5 unsigned char *ref_ptr +;# r6 int recon_stride +;# r7 unsigned int *SSE +;# r8 int *Sum +;# +;# r3 return value +vp8_get8x8var_ppc: + + prologue + + li r9, 4 + mtctr r9 + + variance_8 6, get8x8var_loop, 1 + + epilogue + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int source_stride +;# r5 unsigned char *ref_ptr +;# r6 int recon_stride +;# r7 unsigned int *SSE +;# r8 int *Sum +;# +;# r3 return value +vp8_get16x16var_ppc: + + prologue + + mtctr r10 + + variance_16 8, get16x16var_loop, 1 + + epilogue + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int source_stride +;# r5 unsigned char *ref_ptr +;# r6 int recon_stride +;# r7 unsigned int *sse +;# +;# r 3 return value +vp8_mse16x16_ppc: + prologue + + mtctr r10 + +mse16x16_loop: + ;# only one of the inputs should need to be aligned. + load_aligned_16 v4, r3, r10 + load_aligned_16 v5, r5, r10 + + ;# move onto the next line + add r3, r3, r4 + add r5, r5, r6 + + ;# Now compute sse. + vsububs v2, v4, v5 + vsububs v3, v5, v4 + vor v2, v2, v3 + + vmsumubm v9, v2, v2, v9 + + bdnz mse16x16_loop + + vsumsws v9, v9, v7 + + stvx v9, 0, r1 + lwz r3, 12(r1) + + stvx v9, 0, r1 + lwz r3, 12(r1) + + stw r3, 0(r7) ;# sse + + epilogue + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int source_stride +;# r5 unsigned char *ref_ptr +;# r6 int recon_stride +;# r7 unsigned int *sse +;# +;# r3 return value +vp8_variance16x16_ppc: + + prologue + + mtctr r10 + + variance_16 8, variance16x16_loop, 0 + + epilogue + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int source_stride +;# r5 unsigned char *ref_ptr +;# r6 int recon_stride +;# r7 unsigned int *sse +;# +;# r3 return value +vp8_variance16x8_ppc: + + prologue + + li r9, 8 + mtctr r9 + + variance_16 7, variance16x8_loop, 0 + + epilogue + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int source_stride +;# r5 unsigned char *ref_ptr +;# r6 int recon_stride +;# r7 unsigned int *sse +;# +;# r3 return value +vp8_variance8x16_ppc: + + prologue + + li r9, 8 + mtctr r9 + + variance_8 7, variance8x16_loop, 0 + + epilogue + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int source_stride +;# r5 unsigned char *ref_ptr +;# r6 int recon_stride +;# r7 unsigned int *sse +;# +;# r3 return value +vp8_variance8x8_ppc: + + prologue + + li r9, 4 + mtctr r9 + + variance_8 6, variance8x8_loop, 0 + + epilogue + + blr + +.macro transfer_4x4 I P + lwz r0, 0(\I) + add \I, \I, \P + + lwz r10,0(\I) + add \I, \I, \P + + lwz r8, 0(\I) + add \I, \I, \P + + lwz r9, 0(\I) + + stw r0, 0(r1) + stw r10, 4(r1) + stw r8, 8(r1) + stw r9, 12(r1) +.endm + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int source_stride +;# r5 unsigned char *ref_ptr +;# r6 int recon_stride +;# r7 unsigned int *sse +;# +;# r3 return value +vp8_variance4x4_ppc: + + prologue + + transfer_4x4 r3, r4 + lvx v4, 0, r1 + + transfer_4x4 r5, r6 + lvx v5, 0, r1 + + compute_sum_sse + + vsumsws v8, v8, v7 + vsumsws v9, v9, v7 + + stvx v8, 0, r1 + 
lwz r3, 12(r1) + + stvx v9, 0, r1 + lwz r4, 12(r1) + + stw r4, 0(r7) ;# sse + + mullw r3, r3, r3 ;# sum*sum + srlwi r3, r3, 4 ;# (sum*sum) >> 4 + subf r3, r3, r4 ;# sse - ((sum*sum) >> 4) + + epilogue + + blr diff --git a/vp8/common/ppc/variance_subpixel_altivec.asm b/vp8/common/ppc/variance_subpixel_altivec.asm new file mode 100644 index 000000000..2308373a1 --- /dev/null +++ b/vp8/common/ppc/variance_subpixel_altivec.asm @@ -0,0 +1,865 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + .globl vp8_sub_pixel_variance4x4_ppc + .globl vp8_sub_pixel_variance8x8_ppc + .globl vp8_sub_pixel_variance8x16_ppc + .globl vp8_sub_pixel_variance16x8_ppc + .globl vp8_sub_pixel_variance16x16_ppc + +.macro load_c V, LABEL, OFF, R0, R1 + lis \R0, \LABEL@ha + la \R1, \LABEL@l(\R0) + lvx \V, \OFF, \R1 +.endm + +.macro load_vfilter V0, V1 + load_c \V0, vfilter_b, r6, r12, r10 + + addi r6, r6, 16 + lvx \V1, r6, r10 +.endm + +.macro HProlog jump_label + ;# load up horizontal filter + slwi. r5, r5, 4 ;# index into horizontal filter array + + ;# index to the next set of vectors in the row. + li r10, 16 + + ;# downshift by 7 ( divide by 128 ) at the end + vspltish v19, 7 + + ;# If there isn't any filtering to be done for the horizontal, then + ;# just skip to the second pass. + beq \jump_label + + load_c v20, hfilter_b, r5, r12, r0 + + ;# setup constants + ;# v14 permutation value for alignment + load_c v28, b_hperm_b, 0, r12, r0 + + ;# index to the next set of vectors in the row. + li r12, 32 + + ;# rounding added in on the multiply + vspltisw v21, 8 + vspltisw v18, 3 + vslw v18, v21, v18 ;# 0x00000040000000400000004000000040 + + slwi. r6, r6, 5 ;# index into vertical filter array +.endm + +;# Filters a horizontal line +;# expects: +;# r3 src_ptr +;# r4 pitch +;# r10 16 +;# r12 32 +;# v17 perm intput +;# v18 rounding +;# v19 shift +;# v20 filter taps +;# v21 tmp +;# v22 tmp +;# v23 tmp +;# v24 tmp +;# v25 tmp +;# v26 tmp +;# v27 tmp +;# v28 perm output +;# + +.macro hfilter_8 V, hp, lp, increment_counter + lvsl v17, 0, r3 ;# permutate value for alignment + + ;# input to filter is 9 bytes wide, output is 8 bytes. + lvx v21, 0, r3 + lvx v22, r10, r3 + +.if \increment_counter + add r3, r3, r4 +.endif + vperm v21, v21, v22, v17 + + vperm v24, v21, v21, \hp ;# v20 = 0123 1234 2345 3456 + vperm v25, v21, v21, \lp ;# v21 = 4567 5678 6789 789A + + vmsummbm v24, v20, v24, v18 + vmsummbm v25, v20, v25, v18 + + vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit) + + vsrh v24, v24, v19 ;# divide v0, v1 by 128 + + vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result +.endm + +.macro vfilter_16 P0 P1 + vmuleub v22, \P0, v20 ;# 64 + 4 positive taps + vadduhm v22, v18, v22 + vmuloub v23, \P0, v20 + vadduhm v23, v18, v23 + + vmuleub v24, \P1, v21 + vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary + vmuloub v25, \P1, v21 + vadduhm v23, v23, v25 ;# Ro = odds + + vsrh v22, v22, v19 ;# divide by 128 + vsrh v23, v23, v19 ;# v16 v17 = evens, odds + vmrghh \P0, v22, v23 ;# v18 v19 = 16-bit result in order + vmrglh v23, v22, v23 + vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result +.endm + +.macro compute_sum_sse src, ref, sum, sse, t1, t2, z0 + ;# Compute sum first. 
Unpack to so signed subract + ;# can be used. Only have a half word signed + ;# subract. Do high, then low. + vmrghb \t1, \z0, \src + vmrghb \t2, \z0, \ref + vsubshs \t1, \t1, \t2 + vsum4shs \sum, \t1, \sum + + vmrglb \t1, \z0, \src + vmrglb \t2, \z0, \ref + vsubshs \t1, \t1, \t2 + vsum4shs \sum, \t1, \sum + + ;# Now compute sse. + vsububs \t1, \src, \ref + vsububs \t2, \ref, \src + vor \t1, \t1, \t2 + + vmsumubm \sse, \t1, \t1, \sse +.endm + +.macro variance_final sum, sse, z0, DS + vsumsws \sum, \sum, \z0 + vsumsws \sse, \sse, \z0 + + stvx \sum, 0, r1 + lwz r3, 12(r1) + + stvx \sse, 0, r1 + lwz r4, 12(r1) + + stw r4, 0(r9) ;# sse + + mullw r3, r3, r3 ;# sum*sum + srlwi r3, r3, \DS ;# (sum*sum) >> 8 + subf r3, r3, r4 ;# sse - ((sum*sum) >> 8) +.endm + +.macro compute_sum_sse_16 V, increment_counter + load_and_align_16 v16, r7, r8, \increment_counter + compute_sum_sse \V, v16, v18, v19, v20, v21, v23 +.endm + +.macro load_and_align_16 V, R, P, increment_counter + lvsl v17, 0, \R ;# permutate value for alignment + + ;# input to filter is 21 bytes wide, output is 16 bytes. + ;# input will can span three vectors if not aligned correctly. + lvx v21, 0, \R + lvx v22, r10, \R + +.if \increment_counter + add \R, \R, \P +.endif + + vperm \V, v21, v22, v17 +.endm + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int src_pixels_per_line +;# r5 int xoffset +;# r6 int yoffset +;# r7 unsigned char *dst_ptr +;# r8 int dst_pixels_per_line +;# r9 unsigned int *sse +;# +;# r3 return value +vp8_sub_pixel_variance4x4_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xf830 + ori r12, r12, 0xfff8 + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + HProlog second_pass_4x4_pre_copy_b + + ;# Load up permutation constants + load_c v10, b_0123_b, 0, r12, r0 + load_c v11, b_4567_b, 0, r12, r0 + + hfilter_8 v0, v10, v11, 1 + hfilter_8 v1, v10, v11, 1 + hfilter_8 v2, v10, v11, 1 + hfilter_8 v3, v10, v11, 1 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional line that is needed + ;# for the vertical filter. 
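+;#
+;# (Both passes are simple bilinear taps.  A hypothetical scalar model of
+;# one tap, matching the hfilter_b/vfilter_b tables at the end of this
+;# file -- taps (128 - 16*offset, 16*offset), rounded and shifted down
+;# by 7:)
+;#
+;#   static unsigned char bilinear_tap(unsigned char a, unsigned char b,
+;#                                     int offset)    /* offset in 0..7 */
+;#   {
+;#       int t0 = 128 - 16 * offset;
+;#       int t1 = 16 * offset;
+;#       return (unsigned char)((t0 * a + t1 * b + 64) >> 7);
+;#   }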
+ beq compute_sum_sse_4x4_b + + hfilter_8 v4, v10, v11, 0 + + b second_pass_4x4_b + +second_pass_4x4_pre_copy_b: + slwi r6, r6, 5 ;# index into vertical filter array + + load_and_align_16 v0, r3, r4, 1 + load_and_align_16 v1, r3, r4, 1 + load_and_align_16 v2, r3, r4, 1 + load_and_align_16 v3, r3, r4, 1 + load_and_align_16 v4, r3, r4, 0 + +second_pass_4x4_b: + vspltish v20, 8 + vspltish v18, 3 + vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + load_vfilter v20, v21 + + vfilter_16 v0, v1 + vfilter_16 v1, v2 + vfilter_16 v2, v3 + vfilter_16 v3, v4 + +compute_sum_sse_4x4_b: + vspltish v18, 0 ;# sum + vspltish v19, 0 ;# sse + vspltish v23, 0 ;# unpack + li r10, 16 + + load_and_align_16 v4, r7, r8, 1 + load_and_align_16 v5, r7, r8, 1 + load_and_align_16 v6, r7, r8, 1 + load_and_align_16 v7, r7, r8, 1 + + vmrghb v0, v0, v1 + vmrghb v1, v2, v3 + + vmrghb v2, v4, v5 + vmrghb v3, v6, v7 + + load_c v10, b_hilo_b, 0, r12, r0 + + vperm v0, v0, v1, v10 + vperm v1, v2, v3, v10 + + compute_sum_sse v0, v1, v18, v19, v20, v21, v23 + + variance_final v18, v19, v23, 4 + + addi r1, r1, 32 ;# recover stack + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int src_pixels_per_line +;# r5 int xoffset +;# r6 int yoffset +;# r7 unsigned char *dst_ptr +;# r8 int dst_pixels_per_line +;# r9 unsigned int *sse +;# +;# r3 return value +vp8_sub_pixel_variance8x8_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xfff0 + ori r12, r12, 0xffff + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + HProlog second_pass_8x8_pre_copy_b + + ;# Load up permutation constants + load_c v10, b_0123_b, 0, r12, r0 + load_c v11, b_4567_b, 0, r12, r0 + + hfilter_8 v0, v10, v11, 1 + hfilter_8 v1, v10, v11, 1 + hfilter_8 v2, v10, v11, 1 + hfilter_8 v3, v10, v11, 1 + hfilter_8 v4, v10, v11, 1 + hfilter_8 v5, v10, v11, 1 + hfilter_8 v6, v10, v11, 1 + hfilter_8 v7, v10, v11, 1 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional line that is needed + ;# for the vertical filter. + beq compute_sum_sse_8x8_b + + hfilter_8 v8, v10, v11, 0 + + b second_pass_8x8_b + +second_pass_8x8_pre_copy_b: + slwi. 
r6, r6, 5 ;# index into vertical filter array + + load_and_align_16 v0, r3, r4, 1 + load_and_align_16 v1, r3, r4, 1 + load_and_align_16 v2, r3, r4, 1 + load_and_align_16 v3, r3, r4, 1 + load_and_align_16 v4, r3, r4, 1 + load_and_align_16 v5, r3, r4, 1 + load_and_align_16 v6, r3, r4, 1 + load_and_align_16 v7, r3, r4, 1 + load_and_align_16 v8, r3, r4, 0 + + beq compute_sum_sse_8x8_b + +second_pass_8x8_b: + vspltish v20, 8 + vspltish v18, 3 + vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + load_vfilter v20, v21 + + vfilter_16 v0, v1 + vfilter_16 v1, v2 + vfilter_16 v2, v3 + vfilter_16 v3, v4 + vfilter_16 v4, v5 + vfilter_16 v5, v6 + vfilter_16 v6, v7 + vfilter_16 v7, v8 + +compute_sum_sse_8x8_b: + vspltish v18, 0 ;# sum + vspltish v19, 0 ;# sse + vspltish v23, 0 ;# unpack + li r10, 16 + + vmrghb v0, v0, v1 + vmrghb v1, v2, v3 + vmrghb v2, v4, v5 + vmrghb v3, v6, v7 + + load_and_align_16 v4, r7, r8, 1 + load_and_align_16 v5, r7, r8, 1 + load_and_align_16 v6, r7, r8, 1 + load_and_align_16 v7, r7, r8, 1 + load_and_align_16 v8, r7, r8, 1 + load_and_align_16 v9, r7, r8, 1 + load_and_align_16 v10, r7, r8, 1 + load_and_align_16 v11, r7, r8, 0 + + vmrghb v4, v4, v5 + vmrghb v5, v6, v7 + vmrghb v6, v8, v9 + vmrghb v7, v10, v11 + + compute_sum_sse v0, v4, v18, v19, v20, v21, v23 + compute_sum_sse v1, v5, v18, v19, v20, v21, v23 + compute_sum_sse v2, v6, v18, v19, v20, v21, v23 + compute_sum_sse v3, v7, v18, v19, v20, v21, v23 + + variance_final v18, v19, v23, 6 + + addi r1, r1, 32 ;# recover stack + mtspr 256, r11 ;# reset old VRSAVE + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int src_pixels_per_line +;# r5 int xoffset +;# r6 int yoffset +;# r7 unsigned char *dst_ptr +;# r8 int dst_pixels_per_line +;# r9 unsigned int *sse +;# +;# r3 return value +vp8_sub_pixel_variance8x16_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xfffc + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + HProlog second_pass_8x16_pre_copy_b + + ;# Load up permutation constants + load_c v29, b_0123_b, 0, r12, r0 + load_c v30, b_4567_b, 0, r12, r0 + + hfilter_8 v0, v29, v30, 1 + hfilter_8 v1, v29, v30, 1 + hfilter_8 v2, v29, v30, 1 + hfilter_8 v3, v29, v30, 1 + hfilter_8 v4, v29, v30, 1 + hfilter_8 v5, v29, v30, 1 + hfilter_8 v6, v29, v30, 1 + hfilter_8 v7, v29, v30, 1 + hfilter_8 v8, v29, v30, 1 + hfilter_8 v9, v29, v30, 1 + hfilter_8 v10, v29, v30, 1 + hfilter_8 v11, v29, v30, 1 + hfilter_8 v12, v29, v30, 1 + hfilter_8 v13, v29, v30, 1 + hfilter_8 v14, v29, v30, 1 + hfilter_8 v15, v29, v30, 1 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional line that is needed + ;# for the vertical filter. + beq compute_sum_sse_8x16_b + + hfilter_8 v16, v29, v30, 0 + + b second_pass_8x16_b + +second_pass_8x16_pre_copy_b: + slwi. 
r6, r6, 5 ;# index into vertical filter array + + load_and_align_16 v0, r3, r4, 1 + load_and_align_16 v1, r3, r4, 1 + load_and_align_16 v2, r3, r4, 1 + load_and_align_16 v3, r3, r4, 1 + load_and_align_16 v4, r3, r4, 1 + load_and_align_16 v5, r3, r4, 1 + load_and_align_16 v6, r3, r4, 1 + load_and_align_16 v7, r3, r4, 1 + load_and_align_16 v8, r3, r4, 1 + load_and_align_16 v9, r3, r4, 1 + load_and_align_16 v10, r3, r4, 1 + load_and_align_16 v11, r3, r4, 1 + load_and_align_16 v12, r3, r4, 1 + load_and_align_16 v13, r3, r4, 1 + load_and_align_16 v14, r3, r4, 1 + load_and_align_16 v15, r3, r4, 1 + load_and_align_16 v16, r3, r4, 0 + + beq compute_sum_sse_8x16_b + +second_pass_8x16_b: + vspltish v20, 8 + vspltish v18, 3 + vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + load_vfilter v20, v21 + + vfilter_16 v0, v1 + vfilter_16 v1, v2 + vfilter_16 v2, v3 + vfilter_16 v3, v4 + vfilter_16 v4, v5 + vfilter_16 v5, v6 + vfilter_16 v6, v7 + vfilter_16 v7, v8 + vfilter_16 v8, v9 + vfilter_16 v9, v10 + vfilter_16 v10, v11 + vfilter_16 v11, v12 + vfilter_16 v12, v13 + vfilter_16 v13, v14 + vfilter_16 v14, v15 + vfilter_16 v15, v16 + +compute_sum_sse_8x16_b: + vspltish v18, 0 ;# sum + vspltish v19, 0 ;# sse + vspltish v23, 0 ;# unpack + li r10, 16 + + vmrghb v0, v0, v1 + vmrghb v1, v2, v3 + vmrghb v2, v4, v5 + vmrghb v3, v6, v7 + vmrghb v4, v8, v9 + vmrghb v5, v10, v11 + vmrghb v6, v12, v13 + vmrghb v7, v14, v15 + + load_and_align_16 v8, r7, r8, 1 + load_and_align_16 v9, r7, r8, 1 + load_and_align_16 v10, r7, r8, 1 + load_and_align_16 v11, r7, r8, 1 + load_and_align_16 v12, r7, r8, 1 + load_and_align_16 v13, r7, r8, 1 + load_and_align_16 v14, r7, r8, 1 + load_and_align_16 v15, r7, r8, 1 + + vmrghb v8, v8, v9 + vmrghb v9, v10, v11 + vmrghb v10, v12, v13 + vmrghb v11, v14, v15 + + compute_sum_sse v0, v8, v18, v19, v20, v21, v23 + compute_sum_sse v1, v9, v18, v19, v20, v21, v23 + compute_sum_sse v2, v10, v18, v19, v20, v21, v23 + compute_sum_sse v3, v11, v18, v19, v20, v21, v23 + + load_and_align_16 v8, r7, r8, 1 + load_and_align_16 v9, r7, r8, 1 + load_and_align_16 v10, r7, r8, 1 + load_and_align_16 v11, r7, r8, 1 + load_and_align_16 v12, r7, r8, 1 + load_and_align_16 v13, r7, r8, 1 + load_and_align_16 v14, r7, r8, 1 + load_and_align_16 v15, r7, r8, 0 + + vmrghb v8, v8, v9 + vmrghb v9, v10, v11 + vmrghb v10, v12, v13 + vmrghb v11, v14, v15 + + compute_sum_sse v4, v8, v18, v19, v20, v21, v23 + compute_sum_sse v5, v9, v18, v19, v20, v21, v23 + compute_sum_sse v6, v10, v18, v19, v20, v21, v23 + compute_sum_sse v7, v11, v18, v19, v20, v21, v23 + + variance_final v18, v19, v23, 7 + + addi r1, r1, 32 ;# recover stack + mtspr 256, r11 ;# reset old VRSAVE + blr + +;# Filters a horizontal line +;# expects: +;# r3 src_ptr +;# r4 pitch +;# r10 16 +;# r12 32 +;# v17 perm intput +;# v18 rounding +;# v19 shift +;# v20 filter taps +;# v21 tmp +;# v22 tmp +;# v23 tmp +;# v24 tmp +;# v25 tmp +;# v26 tmp +;# v27 tmp +;# v28 perm output +;# +.macro hfilter_16 V, increment_counter + + lvsl v17, 0, r3 ;# permutate value for alignment + + ;# input to filter is 21 bytes wide, output is 16 bytes. + ;# input will can span three vectors if not aligned correctly. 
+ lvx v21, 0, r3 + lvx v22, r10, r3 + lvx v23, r12, r3 + +.if \increment_counter + add r3, r3, r4 +.endif + vperm v21, v21, v22, v17 + vperm v22, v22, v23, v17 ;# v8 v9 = 21 input pixels left-justified + + ;# set 0 + vmsummbm v24, v20, v21, v18 ;# taps times elements + + ;# set 1 + vsldoi v23, v21, v22, 1 + vmsummbm v25, v20, v23, v18 + + ;# set 2 + vsldoi v23, v21, v22, 2 + vmsummbm v26, v20, v23, v18 + + ;# set 3 + vsldoi v23, v21, v22, 3 + vmsummbm v27, v20, v23, v18 + + vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit) + vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F + + vsrh v24, v24, v19 ;# divide v0, v1 by 128 + vsrh v25, v25, v19 + + vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result + vperm \V, \V, v0, v28 ;# \V = correctly-ordered result +.endm + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int src_pixels_per_line +;# r5 int xoffset +;# r6 int yoffset +;# r7 unsigned char *dst_ptr +;# r8 int dst_pixels_per_line +;# r9 unsigned int *sse +;# +;# r3 return value +vp8_sub_pixel_variance16x8_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xfff8 + mtspr 256, r12 ;# set VRSAVE + + stwu r1, -32(r1) ;# create space on the stack + + HProlog second_pass_16x8_pre_copy_b + + hfilter_16 v0, 1 + hfilter_16 v1, 1 + hfilter_16 v2, 1 + hfilter_16 v3, 1 + hfilter_16 v4, 1 + hfilter_16 v5, 1 + hfilter_16 v6, 1 + hfilter_16 v7, 1 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional line that is needed + ;# for the vertical filter. + beq compute_sum_sse_16x8_b + + hfilter_16 v8, 0 + + b second_pass_16x8_b + +second_pass_16x8_pre_copy_b: + slwi. r6, r6, 5 ;# index into vertical filter array + + load_and_align_16 v0, r3, r4, 1 + load_and_align_16 v1, r3, r4, 1 + load_and_align_16 v2, r3, r4, 1 + load_and_align_16 v3, r3, r4, 1 + load_and_align_16 v4, r3, r4, 1 + load_and_align_16 v5, r3, r4, 1 + load_and_align_16 v6, r3, r4, 1 + load_and_align_16 v7, r3, r4, 1 + load_and_align_16 v8, r3, r4, 1 + + beq compute_sum_sse_16x8_b + +second_pass_16x8_b: + vspltish v20, 8 + vspltish v18, 3 + vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + load_vfilter v20, v21 + + vfilter_16 v0, v1 + vfilter_16 v1, v2 + vfilter_16 v2, v3 + vfilter_16 v3, v4 + vfilter_16 v4, v5 + vfilter_16 v5, v6 + vfilter_16 v6, v7 + vfilter_16 v7, v8 + +compute_sum_sse_16x8_b: + vspltish v18, 0 ;# sum + vspltish v19, 0 ;# sse + vspltish v23, 0 ;# unpack + li r10, 16 + + compute_sum_sse_16 v0, 1 + compute_sum_sse_16 v1, 1 + compute_sum_sse_16 v2, 1 + compute_sum_sse_16 v3, 1 + compute_sum_sse_16 v4, 1 + compute_sum_sse_16 v5, 1 + compute_sum_sse_16 v6, 1 + compute_sum_sse_16 v7, 0 + + variance_final v18, v19, v23, 7 + + addi r1, r1, 32 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 2 +;# r3 unsigned char *src_ptr +;# r4 int src_pixels_per_line +;# r5 int xoffset +;# r6 int yoffset +;# r7 unsigned char *dst_ptr +;# r8 int dst_pixels_per_line +;# r9 unsigned int *sse +;# +;# r3 return value +vp8_sub_pixel_variance16x16_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xfff8 + mtspr 256, r12 ;# set VRSAVE + + stwu r1, -32(r1) ;# create space on the stack + + HProlog second_pass_16x16_pre_copy_b + + hfilter_16 v0, 1 + hfilter_16 v1, 1 + hfilter_16 v2, 1 + hfilter_16 v3, 1 + hfilter_16 v4, 1 + hfilter_16 v5, 1 + hfilter_16 v6, 1 + hfilter_16 v7, 1 + hfilter_16 v8, 1 + hfilter_16 v9, 1 + hfilter_16 v10, 1 + hfilter_16 
v11, 1 + hfilter_16 v12, 1 + hfilter_16 v13, 1 + hfilter_16 v14, 1 + hfilter_16 v15, 1 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional line that is needed + ;# for the vertical filter. + beq compute_sum_sse_16x16_b + + hfilter_16 v16, 0 + + b second_pass_16x16_b + +second_pass_16x16_pre_copy_b: + slwi. r6, r6, 5 ;# index into vertical filter array + + load_and_align_16 v0, r3, r4, 1 + load_and_align_16 v1, r3, r4, 1 + load_and_align_16 v2, r3, r4, 1 + load_and_align_16 v3, r3, r4, 1 + load_and_align_16 v4, r3, r4, 1 + load_and_align_16 v5, r3, r4, 1 + load_and_align_16 v6, r3, r4, 1 + load_and_align_16 v7, r3, r4, 1 + load_and_align_16 v8, r3, r4, 1 + load_and_align_16 v9, r3, r4, 1 + load_and_align_16 v10, r3, r4, 1 + load_and_align_16 v11, r3, r4, 1 + load_and_align_16 v12, r3, r4, 1 + load_and_align_16 v13, r3, r4, 1 + load_and_align_16 v14, r3, r4, 1 + load_and_align_16 v15, r3, r4, 1 + load_and_align_16 v16, r3, r4, 0 + + beq compute_sum_sse_16x16_b + +second_pass_16x16_b: + vspltish v20, 8 + vspltish v18, 3 + vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + load_vfilter v20, v21 + + vfilter_16 v0, v1 + vfilter_16 v1, v2 + vfilter_16 v2, v3 + vfilter_16 v3, v4 + vfilter_16 v4, v5 + vfilter_16 v5, v6 + vfilter_16 v6, v7 + vfilter_16 v7, v8 + vfilter_16 v8, v9 + vfilter_16 v9, v10 + vfilter_16 v10, v11 + vfilter_16 v11, v12 + vfilter_16 v12, v13 + vfilter_16 v13, v14 + vfilter_16 v14, v15 + vfilter_16 v15, v16 + +compute_sum_sse_16x16_b: + vspltish v18, 0 ;# sum + vspltish v19, 0 ;# sse + vspltish v23, 0 ;# unpack + li r10, 16 + + compute_sum_sse_16 v0, 1 + compute_sum_sse_16 v1, 1 + compute_sum_sse_16 v2, 1 + compute_sum_sse_16 v3, 1 + compute_sum_sse_16 v4, 1 + compute_sum_sse_16 v5, 1 + compute_sum_sse_16 v6, 1 + compute_sum_sse_16 v7, 1 + compute_sum_sse_16 v8, 1 + compute_sum_sse_16 v9, 1 + compute_sum_sse_16 v10, 1 + compute_sum_sse_16 v11, 1 + compute_sum_sse_16 v12, 1 + compute_sum_sse_16 v13, 1 + compute_sum_sse_16 v14, 1 + compute_sum_sse_16 v15, 0 + + variance_final v18, v19, v23, 8 + + addi r1, r1, 32 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .data + + .align 4 +hfilter_b: + .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0 + .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0 + .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0 + .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0 + .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0 + .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0 + .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0 + .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0 + + .align 4 +vfilter_b: + .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112 + .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96 + .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 + .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80 + .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48 + .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 + .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 + .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 
48, 48, 48, 48, 48, 48, 48
+ .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
+ .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+ .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
+ .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+ .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
+
+ .align 4
+b_hperm_b:
+ .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+
+ .align 4
+b_0123_b:
+ .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+
+ .align 4
+b_4567_b:
+ .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+
+b_hilo_b:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
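
The hfilter_b and vfilter_b tables above hold the two bilinear taps for each eighth-pel offset: 128 - 16*offset and 16*offset, which always sum to 128, so a filtered pixel is rounded with +64 and shifted right by 7 (the "divide by 128" in the comments). The scalar C sketch below shows the per-pixel arithmetic that the hfilter_16 and vfilter_16 macros vectorize; the names bilinear_taps and bilinear_pass are illustrative only and are not part of this patch or of the library's API.

#include <stdint.h>

/* Two-tap bilinear filters per eighth-pel offset; each pair sums to 128,
   mirroring the hfilter_b/vfilter_b tables in the patch. */
static const int bilinear_taps[8][2] = {
    {128,   0}, {112,  16}, { 96,  32}, { 80,  48},
    { 64,  64}, { 48,  80}, { 32,  96}, { 16, 112}
};

/* One bilinear pass: out = (a*F0 + b*F1 + 64) >> 7, where b is the pixel
   one 'step' away (step = 1 for the horizontal pass, step = src_pitch for
   the vertical pass).  Offset 0 degenerates to a plain copy, which is why
   the assembly branches past a pass when its offset is zero. */
static void bilinear_pass(const uint8_t *src, int src_pitch, int step,
                          uint8_t *dst, int width, int height, int offset)
{
    const int f0 = bilinear_taps[offset][0];
    const int f1 = bilinear_taps[offset][1];

    for (int r = 0; r < height; r++) {
        for (int c = 0; c < width; c++)
            dst[c] = (uint8_t)((src[c] * f0 + src[c + step] * f1 + 64) >> 7);
        src += src_pitch;
        dst += width;
    }
}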
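
After both passes, compute_sum_sse accumulates the signed differences and squared differences between the filtered predictor and the block at dst_ptr, and variance_final reduces the vector accumulators and applies variance = SSE - sum^2 / N; the last argument to variance_final (6 for 8x8, 7 for 8x16 and 16x8, 8 for 16x16) is log2 of the block's pixel count. Continuing the sketch above, a scalar equivalent of that final step, again with illustrative names rather than this file's API:

/* Sum/SSE accumulation plus the variance_final step: N = 1 << shift pixels,
   variance = SSE - sum*sum / N.  sse_out mirrors the unsigned int *sse
   argument passed in r9; the return value is what the routines return in r3. */
static unsigned int block_variance(const uint8_t *src, int src_pitch,
                                   const uint8_t *ref, int ref_pitch,
                                   int width, int height, int shift,
                                   unsigned int *sse_out)
{
    int sum = 0;
    unsigned int sse = 0;

    for (int r = 0; r < height; r++) {
        for (int c = 0; c < width; c++) {
            int d = src[c] - ref[c];
            sum += d;
            sse += (unsigned int)(d * d);
        }
        src += src_pitch;
        ref += ref_pitch;
    }

    *sse_out = sse;
    return sse - (unsigned int)(((int64_t)sum * sum) >> shift);
}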