diff options
Diffstat (limited to 'vp9/common')
51 files changed, 5210 insertions, 744 deletions
diff --git a/vp9/common/arm/neon/vp9_idct16x16_neon.c b/vp9/common/arm/neon/vp9_idct16x16_neon.c index 3e3e400a4..fb7b5cdc4 100644 --- a/vp9/common/arm/neon/vp9_idct16x16_neon.c +++ b/vp9/common/arm/neon/vp9_idct16x16_neon.c @@ -20,26 +20,28 @@ extern void vp9_short_idct16x16_add_neon_pass2(int16_t *src, int16_t skip_adding, uint8_t *dest, int dest_stride); -extern void vp9_short_idct10_16x16_add_neon_pass1(int16_t *input, +extern void vp9_short_idct16x16_10_add_neon_pass1(int16_t *input, int16_t *output, int output_stride); -extern void vp9_short_idct10_16x16_add_neon_pass2(int16_t *src, +extern void vp9_short_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *output, int16_t *pass1Output, int16_t skip_adding, uint8_t *dest, int dest_stride); -extern void save_neon_registers(); -extern void restore_neon_registers(); +/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */ +extern void vp9_push_neon(int64_t *store); +extern void vp9_pop_neon(int64_t *store); void vp9_short_idct16x16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { + int64_t store_reg[8]; int16_t pass1_output[16*16] = {0}; int16_t row_idct_output[16*16] = {0}; // save d8-d15 register values. - save_neon_registers(); + vp9_push_neon(store_reg); /* Parallel idct on the upper 8 rows */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the @@ -102,28 +104,29 @@ void vp9_short_idct16x16_add_neon(int16_t *input, dest_stride); // restore d8-d15 register values. - restore_neon_registers(); + vp9_pop_neon(store_reg); return; } -void vp9_short_idct10_16x16_add_neon(int16_t *input, +void vp9_short_idct16x16_10_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { + int64_t store_reg[8]; int16_t pass1_output[16*16] = {0}; int16_t row_idct_output[16*16] = {0}; // save d8-d15 register values. - save_neon_registers(); + vp9_push_neon(store_reg); /* Parallel idct on the upper 8 rows */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vp9_short_idct10_16x16_add_neon_pass1(input, pass1_output, 8); + vp9_short_idct16x16_10_add_neon_pass1(input, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7 // which will be saved into row_idct_output. - vp9_short_idct10_16x16_add_neon_pass2(input+1, + vp9_short_idct16x16_10_add_neon_pass2(input+1, row_idct_output, pass1_output, 0, @@ -163,7 +166,7 @@ void vp9_short_idct10_16x16_add_neon(int16_t *input, dest_stride); // restore d8-d15 register values. - restore_neon_registers(); + vp9_pop_neon(store_reg); return; } diff --git a/vp9/common/arm/neon/vp9_save_reg_neon.asm b/vp9/common/arm/neon/vp9_save_reg_neon.asm new file mode 100644 index 000000000..71c3e7077 --- /dev/null +++ b/vp9/common/arm/neon/vp9_save_reg_neon.asm @@ -0,0 +1,36 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp9_push_neon| + EXPORT |vp9_pop_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vp9_push_neon| PROC + vst1.i64 {d8, d9, d10, d11}, [r0]! + vst1.i64 {d12, d13, d14, d15}, [r0]! + bx lr + + ENDP + +|vp9_pop_neon| PROC + vld1.i64 {d8, d9, d10, d11}, [r0]! + vld1.i64 {d12, d13, d14, d15}, [r0]! + bx lr + + ENDP + + END + diff --git a/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm index 7464e800f..df2a0526c 100644 --- a/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm +++ b/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm @@ -10,10 +10,8 @@ EXPORT |vp9_short_idct16x16_add_neon_pass1| EXPORT |vp9_short_idct16x16_add_neon_pass2| - EXPORT |vp9_short_idct10_16x16_add_neon_pass1| - EXPORT |vp9_short_idct10_16x16_add_neon_pass2| - EXPORT |save_neon_registers| - EXPORT |restore_neon_registers| + EXPORT |vp9_short_idct16x16_10_add_neon_pass1| + EXPORT |vp9_short_idct16x16_10_add_neon_pass2| ARM REQUIRE8 PRESERVE8 @@ -788,7 +786,7 @@ end_idct16x16_pass2 bx lr ENDP ; |vp9_short_idct16x16_add_neon_pass2| -;void |vp9_short_idct10_16x16_add_neon_pass1|(int16_t *input, +;void |vp9_short_idct16x16_10_add_neon_pass1|(int16_t *input, ; int16_t *output, int output_stride) ; ; r0 int16_t input @@ -798,7 +796,7 @@ end_idct16x16_pass2 ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output ; will be stored back into q8-q15 registers. This function will touch q0-q7 ; registers and use them as buffer during calculation. -|vp9_short_idct10_16x16_add_neon_pass1| PROC +|vp9_short_idct16x16_10_add_neon_pass1| PROC ; TODO(hkuang): Find a better way to load the elements. ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 @@ -907,9 +905,9 @@ end_idct16x16_pass2 vst1.64 {d31}, [r1], r2 bx lr - ENDP ; |vp9_short_idct10_16x16_add_neon_pass1| + ENDP ; |vp9_short_idct16x16_10_add_neon_pass1| -;void vp9_short_idct10_16x16_add_neon_pass2(int16_t *src, +;void vp9_short_idct16x16_10_add_neon_pass2(int16_t *src, ; int16_t *output, ; int16_t *pass1Output, ; int16_t skip_adding, @@ -926,7 +924,7 @@ end_idct16x16_pass2 ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output ; will be stored back into q8-q15 registers. This function will touch q0-q7 ; registers and use them as buffer during calculation. -|vp9_short_idct10_16x16_add_neon_pass2| PROC +|vp9_short_idct16x16_10_add_neon_pass2| PROC push {r3-r9} ; TODO(hkuang): Find a better way to load the elements. @@ -1177,15 +1175,5 @@ end_idct16x16_pass2 end_idct10_16x16_pass2 pop {r3-r9} bx lr - ENDP ; |vp9_short_idct10_16x16_add_neon_pass2| -;void |save_neon_registers|() -|save_neon_registers| PROC - vpush {d8-d15} - bx lr - ENDP ; |save_registers| -;void |restore_neon_registers|() -|restore_neon_registers| PROC - vpop {d8-d15} - bx lr - ENDP ; |restore_registers| + ENDP ; |vp9_short_idct16x16_10_add_neon_pass2| END diff --git a/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm index 869ee5f3f..0d4a721c4 100644 --- a/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm +++ b/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm @@ -8,21 +8,21 @@ ; - EXPORT |vp9_short_idct4x4_1_add_neon| + EXPORT |vp9_idct4x4_1_add_neon| ARM REQUIRE8 PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vp9_short_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, +;void vp9_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, ; int dest_stride) ; ; r0 int16_t input ; r1 uint8_t *dest ; r2 int dest_stride) -|vp9_short_idct4x4_1_add_neon| PROC +|vp9_idct4x4_1_add_neon| PROC ldrsh r0, [r0] ; generate cospi_16_64 = 11585 @@ -63,6 +63,6 @@ vst1.32 {d7[1]}, [r12] bx lr - ENDP ; |vp9_short_idct4x4_1_add_neon| + ENDP ; |vp9_idct4x4_1_add_neon| END diff --git a/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm index 640fb9356..00283fc8d 100644 --- a/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm +++ b/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm @@ -8,7 +8,7 @@ ; be found in the AUTHORS file in the root of the source tree. ; - EXPORT |vp9_short_idct4x4_add_neon| + EXPORT |vp9_idct4x4_16_add_neon| ARM REQUIRE8 PRESERVE8 @@ -16,13 +16,13 @@ AREA ||.text||, CODE, READONLY, ALIGN=2 AREA Block, CODE, READONLY ; name this block of code -;void vp9_short_idct4x4_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +;void vp9_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) ; ; r0 int16_t input ; r1 uint8_t *dest ; r2 int dest_stride) -|vp9_short_idct4x4_add_neon| PROC +|vp9_idct4x4_16_add_neon| PROC ; The 2D transform is done with two passes which are actually pretty ; similar. We first transform the rows. This is done by transposing @@ -185,6 +185,6 @@ vst1.32 {d26[1]}, [r1], r2 vst1.32 {d26[0]}, [r1] ; no post-increment bx lr - ENDP ; |vp9_short_idct4x4_add_neon| + ENDP ; |vp9_idct4x4_16_add_neon| END diff --git a/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm index a744f59db..c02251a3d 100644 --- a/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm +++ b/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm @@ -9,7 +9,7 @@ ; EXPORT |vp9_short_idct8x8_add_neon| - EXPORT |vp9_short_idct10_8x8_add_neon| + EXPORT |vp9_short_idct8x8_10_add_neon| ARM REQUIRE8 PRESERVE8 @@ -310,13 +310,13 @@ bx lr ENDP ; |vp9_short_idct8x8_add_neon| -;void vp9_short_idct10_8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +;void vp9_short_idct8x8_10_add_neon(int16_t *input, uint8_t *dest, int dest_stride) ; ; r0 int16_t input ; r1 uint8_t *dest ; r2 int dest_stride) -|vp9_short_idct10_8x8_add_neon| PROC +|vp9_short_idct8x8_10_add_neon| PROC push {r4-r9} vpush {d8-d15} vld1.s16 {q8,q9}, [r0]! @@ -514,6 +514,6 @@ vpop {d8-d15} pop {r4-r9} bx lr - ENDP ; |vp9_short_idct10_8x8_add_neon| + ENDP ; |vp9_short_idct8x8_10_add_neon| END diff --git a/vp9/common/generic/vp9_systemdependent.c b/vp9/common/generic/vp9_systemdependent.c index f14472113..536febb65 100644 --- a/vp9/common/generic/vp9_systemdependent.c +++ b/vp9/common/generic/vp9_systemdependent.c @@ -10,7 +10,7 @@ #include "./vpx_config.h" -#include "vp9_rtcd.h" +#include "./vp9_rtcd.h" #include "vp9/common/vp9_onyxc_int.h" void vp9_machine_specific_config(VP9_COMMON *cm) { diff --git a/vp9/common/mips/dspr2/vp9_common_dspr2.h b/vp9/common/mips/dspr2/vp9_common_dspr2.h new file mode 100644 index 000000000..d2fa4c1dc --- /dev/null +++ b/vp9/common/mips/dspr2/vp9_common_dspr2.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_COMMON_DSPR2_H_ +#define VP9_COMMON_VP9_COMMON_DSPR2_H_ + +#include <assert.h> + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" + +#if HAVE_DSPR2 +#define CROP_WIDTH 512 +extern uint8_t *vp9_ff_cropTbl; + +#define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input) ({ \ + \ + int32_t tmp, out; \ + int dct_cost_rounding = DCT_CONST_ROUNDING; \ + int in = input; \ + \ + __asm__ __volatile__ ( \ + /* out = dct_const_round_shift(input_dc * cospi_16_64); */ \ + "mtlo %[dct_cost_rounding], $ac1 \n\t"\ + "mthi $zero, $ac1 \n\t"\ + "madd $ac1, %[in], %[cospi_16_64] \n\t"\ + "extp %[tmp], $ac1, 31 \n\t"\ + \ + /* out = dct_const_round_shift(out * cospi_16_64); */ \ + "mtlo %[dct_cost_rounding], $ac2 \n\t"\ + "mthi $zero, $ac2 \n\t"\ + "madd $ac2, %[tmp], %[cospi_16_64] \n\t"\ + "extp %[out], $ac2, 31 \n\t"\ + \ + : [tmp] "=&r" (tmp), [out] "=r" (out) \ + : [in] "r" (in), \ + [dct_cost_rounding] "r" (dct_cost_rounding), \ + [cospi_16_64] "r" (cospi_16_64) \ + ); \ + out; }) + +static INLINE void vp9_prefetch_load(const unsigned char *src) { + __asm__ __volatile__ ( + "pref 0, 0(%[src]) \n\t" + : + : [src] "r" (src) + ); +} + +/* prefetch data for store */ +static INLINE void vp9_prefetch_store(unsigned char *dst) { + __asm__ __volatile__ ( + "pref 1, 0(%[dst]) \n\t" + : + : [dst] "r" (dst) + ); +} + +static INLINE void vp9_prefetch_load_streamed(const unsigned char *src) { + __asm__ __volatile__ ( + "pref 4, 0(%[src]) \n\t" + : + : [src] "r" (src) + ); +} + +/* prefetch data for store */ +static INLINE void vp9_prefetch_store_streamed(unsigned char *dst) { + __asm__ __volatile__ ( + "pref 5, 0(%[dst]) \n\t" + : + : [dst] "r" (dst) + ); +} + +#endif // #if HAVE_DSPR2 +#endif // VP9_COMMON_VP9_COMMON_DSPR2_H_ diff --git a/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c new file mode 100644 index 000000000..0930ad123 --- /dev/null +++ b/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c @@ -0,0 +1,689 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" +#include "vp9/common/vp9_convolve.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +static void convolve_avg_vert_4_dspr2(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_y, + int32_t w, + int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2, load3, load4; + uint32_t p1, p2; + uint32_t n1, n2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2; + + vector1b = ((const int32_t *)filter_y)[0]; + vector2b = ((const int32_t *)filter_y)[1]; + vector3b = ((const int32_t *)filter_y)[2]; + vector4b = ((const int32_t *)filter_y)[3]; + + src -= 3 * src_stride; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + vp9_prefetch_store(dst + dst_stride); + + for (x = 0; x < w; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__ ( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac0, 31 \n\t" + "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "lbu %[scratch1], 0(%[dst_ptr]) \n\t" + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + "lbu %[scratch2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */ + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */ + "extp %[Temp2], $ac3, 31 \n\t" + "lbu %[scratch1], 2(%[dst_ptr]) \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + "lbu %[scratch2], 3(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */ + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */ + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2), + [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [store1] "=&r" (store1), [store2] "=&r" (store2), + [src_ptr] "+r" (src_ptr) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), + [vector4a] "r" (vector4a), + [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr) + ); + } + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_avg_vert_64_dspr2(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_y, + int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2, load3, load4; + uint32_t p1, p2; + uint32_t n1, n2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2; + + vector1b = ((const int32_t *)filter_y)[0]; + vector2b = ((const int32_t *)filter_y)[1]; + vector3b = ((const int32_t *)filter_y)[2]; + vector4b = ((const int32_t *)filter_y)[3]; + + src -= 3 * src_stride; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + vp9_prefetch_store(dst + dst_stride); + vp9_prefetch_store(dst + dst_stride + 32); + + for (x = 0; x < 64; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__ ( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac0, 31 \n\t" + "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "lbu %[scratch1], 0(%[dst_ptr]) \n\t" + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + "lbu %[scratch2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */ + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */ + "extp %[Temp2], $ac3, 31 \n\t" + "lbu %[scratch1], 2(%[dst_ptr]) \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + "lbu %[scratch2], 3(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */ + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */ + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2), + [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [store1] "=&r" (store1), [store2] "=&r" (store2), + [src_ptr] "+r" (src_ptr) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), + [vector4a] "r" (vector4a), + [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr) + ); + } + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +void vp9_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if (((const int32_t *)filter_y)[1] == 0x800000) { + vp9_convolve_avg(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + } else { + if (16 == y_step_q4) { + uint32_t pos = 38; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + vp9_prefetch_store(dst); + + switch (w) { + case 4: + case 8: + case 16: + case 32: + convolve_avg_vert_4_dspr2(src, src_stride, + dst, dst_stride, + filter_y, w, h); + break; + case 64: + vp9_prefetch_store(dst + 32); + convolve_avg_vert_64_dspr2(src, src_stride, + dst, dst_stride, + filter_y, h); + break; + default: + vp9_convolve8_avg_vert_c(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + break; + } + } else { + vp9_convolve8_avg_vert_c(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + } + } +} + +void vp9_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + /* Fixed size intermediate buffer places limits on parameters. */ + DECLARE_ALIGNED_ARRAY(32, uint8_t, temp, 64 * 135); + int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; + + assert(w <= 64); + assert(h <= 64); + + if (intermediate_height < h) + intermediate_height = h; + + if (x_step_q4 != 16 || y_step_q4 != 16) + return vp9_convolve8_avg_c(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + + vp9_convolve8_horiz(src - (src_stride * 3), src_stride, + temp, 64, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, intermediate_height); + + vp9_convolve8_avg_vert(temp + (64*3), 64, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); +} + +void vp9_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h) { + int x, y; + uint32_t tp1, tp2, tn1; + uint32_t tp3, tp4, tn2; + + /* prefetch data to cache memory */ + vp9_prefetch_load(src); + vp9_prefetch_load(src + 32); + vp9_prefetch_store(dst); + + switch (w) { + case 4: + /* 1 word storage */ + for (y = h; y--; ) { + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + vp9_prefetch_store(dst + dst_stride); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 0(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + + : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1), + [tp2] "=&r" (tp2) + : [src] "r" (src), [dst] "r" (dst) + ); + + src += src_stride; + dst += dst_stride; + } + break; + case 8: + /* 2 word storage */ + for (y = h; y--; ) { + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + vp9_prefetch_store(dst + dst_stride); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 0(%[dst]) \n\t" + "ulw %[tp3], 4(%[src]) \n\t" + "ulw %[tp4], 4(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 4(%[dst]) \n\t" /* store */ + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), + [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), + [tn1] "=&r" (tn1), [tn2] "=&r" (tn2) + : [src] "r" (src), [dst] "r" (dst) + ); + + src += src_stride; + dst += dst_stride; + } + break; + case 16: + /* 4 word storage */ + for (y = h; y--; ) { + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + vp9_prefetch_store(dst + dst_stride); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 0(%[dst]) \n\t" + "ulw %[tp3], 4(%[src]) \n\t" + "ulw %[tp4], 4(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 8(%[src]) \n\t" + "ulw %[tp2], 8(%[dst]) \n\t" + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 4(%[dst]) \n\t" /* store */ + "ulw %[tp3], 12(%[src]) \n\t" + "ulw %[tp4], 12(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 8(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 12(%[dst]) \n\t" /* store */ + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), + [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), + [tn1] "=&r" (tn1), [tn2] "=&r" (tn2) + : [src] "r" (src), [dst] "r" (dst) + ); + + src += src_stride; + dst += dst_stride; + } + break; + case 32: + /* 8 word storage */ + for (y = h; y--; ) { + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + vp9_prefetch_store(dst + dst_stride); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 0(%[dst]) \n\t" + "ulw %[tp3], 4(%[src]) \n\t" + "ulw %[tp4], 4(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 8(%[src]) \n\t" + "ulw %[tp2], 8(%[dst]) \n\t" + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 4(%[dst]) \n\t" /* store */ + "ulw %[tp3], 12(%[src]) \n\t" + "ulw %[tp4], 12(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 16(%[src]) \n\t" + "ulw %[tp2], 16(%[dst]) \n\t" + "sw %[tn1], 8(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 12(%[dst]) \n\t" /* store */ + "ulw %[tp3], 20(%[src]) \n\t" + "ulw %[tp4], 20(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 24(%[src]) \n\t" + "ulw %[tp2], 24(%[dst]) \n\t" + "sw %[tn1], 16(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 20(%[dst]) \n\t" /* store */ + "ulw %[tp3], 28(%[src]) \n\t" + "ulw %[tp4], 28(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 24(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 28(%[dst]) \n\t" /* store */ + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), + [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), + [tn1] "=&r" (tn1), [tn2] "=&r" (tn2) + : [src] "r" (src), [dst] "r" (dst) + ); + + src += src_stride; + dst += dst_stride; + } + break; + case 64: + vp9_prefetch_load(src + 64); + vp9_prefetch_store(dst + 32); + + /* 16 word storage */ + for (y = h; y--; ) { + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + vp9_prefetch_load(src + src_stride + 64); + vp9_prefetch_store(dst + dst_stride); + vp9_prefetch_store(dst + dst_stride + 32); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 0(%[dst]) \n\t" + "ulw %[tp3], 4(%[src]) \n\t" + "ulw %[tp4], 4(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 8(%[src]) \n\t" + "ulw %[tp2], 8(%[dst]) \n\t" + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 4(%[dst]) \n\t" /* store */ + "ulw %[tp3], 12(%[src]) \n\t" + "ulw %[tp4], 12(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 16(%[src]) \n\t" + "ulw %[tp2], 16(%[dst]) \n\t" + "sw %[tn1], 8(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 12(%[dst]) \n\t" /* store */ + "ulw %[tp3], 20(%[src]) \n\t" + "ulw %[tp4], 20(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 24(%[src]) \n\t" + "ulw %[tp2], 24(%[dst]) \n\t" + "sw %[tn1], 16(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 20(%[dst]) \n\t" /* store */ + "ulw %[tp3], 28(%[src]) \n\t" + "ulw %[tp4], 28(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 32(%[src]) \n\t" + "ulw %[tp2], 32(%[dst]) \n\t" + "sw %[tn1], 24(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 28(%[dst]) \n\t" /* store */ + "ulw %[tp3], 36(%[src]) \n\t" + "ulw %[tp4], 36(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 40(%[src]) \n\t" + "ulw %[tp2], 40(%[dst]) \n\t" + "sw %[tn1], 32(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 36(%[dst]) \n\t" /* store */ + "ulw %[tp3], 44(%[src]) \n\t" + "ulw %[tp4], 44(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 48(%[src]) \n\t" + "ulw %[tp2], 48(%[dst]) \n\t" + "sw %[tn1], 40(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 44(%[dst]) \n\t" /* store */ + "ulw %[tp3], 52(%[src]) \n\t" + "ulw %[tp4], 52(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 56(%[src]) \n\t" + "ulw %[tp2], 56(%[dst]) \n\t" + "sw %[tn1], 48(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 52(%[dst]) \n\t" /* store */ + "ulw %[tp3], 60(%[src]) \n\t" + "ulw %[tp4], 60(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 56(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 60(%[dst]) \n\t" /* store */ + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), + [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), + [tn1] "=&r" (tn1), [tn2] "=&r" (tn2) + : [src] "r" (src), [dst] "r" (dst) + ); + + src += src_stride; + dst += dst_stride; + } + break; + default: + for (y = h; y > 0; --y) { + for (x = 0; x < w; ++x) { + dst[x] = (dst[x] + src[x] + 1) >> 1; + } + + src += src_stride; + dst += dst_stride; + } + break; + } +} +#endif diff --git a/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c new file mode 100644 index 000000000..37c665be9 --- /dev/null +++ b/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c @@ -0,0 +1,1032 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" +#include "vp9/common/vp9_convolve.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +static void convolve_avg_horiz_4_dspr2(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y; + uint8_t *cm = vp9_ff_cropTbl; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3, Temp4; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4; + uint32_t n1, n2, n3, n4; + uint32_t tn1, tn2; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + vp9_prefetch_store(dst + dst_stride); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */ + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */ + "preceu.ph.qbr %[n1], %[tp2] \n\t" + "preceu.ph.qbl %[n2], %[tp2] \n\t" + "preceu.ph.qbr %[n3], %[tn2] \n\t" + "preceu.ph.qbl %[n4], %[tn2] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */ + + /* odd 2. pixel */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[n1], %[tn1] \n\t" + "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */ + "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */ + "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t" + "extp %[Temp4], $ac2, 31 \n\t" + + "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */ + "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */ + + /* clamp */ + "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */ + "lbux %[n2], %[Temp4](%[cm]) \n\t" /* odd 2 */ + "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */ + + "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */ + "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */ + + "addqh_r.w %[p2], %[p2], %[n2] \n\t" /* average odd 2 */ + "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */ + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), + [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), + [vector4a] "r" (vector4a), + [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) + ); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_avg_horiz_8_dspr2(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector4a = 64; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4, n1; + uint32_t tn1, tn2, tn3; + uint32_t st0, st1; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + vp9_prefetch_store(dst + dst_stride); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + "lbu %[Temp2], 0(%[dst]) \n\t" + "lbu %[tn3], 2(%[dst]) \n\t" + + /* even 2. pixel */ + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "preceu.ph.qbl %[n1], %[tn2] \n\t" + "ulw %[tn1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. pixel */ + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[tn1] \n\t" + "lbux %[st1], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" + "addqh_r.w %[tn3], %[tn3], %[st1] \n\t" + "sb %[Temp2], 0(%[dst]) \n\t" + "sb %[tn3], 2(%[dst]) \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "balign %[tn3], %[tn1], 3 \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "lbu %[Temp2], 4(%[dst]) \n\t" + "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" + + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sb %[Temp2], 4(%[dst]) \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tn2] \n\t" + "preceu.ph.qbl %[p4], %[tn2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tp1], 6(%[dst]) \n\t" + + /* odd 2. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn1] \n\t" + "preceu.ph.qbl %[n1], %[tn1] \n\t" + "lbux %[st0], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + "lbu %[tp2], 1(%[dst]) \n\t" + "lbu %[tn2], 3(%[dst]) \n\t" + "addqh_r.w %[tp1], %[tp1], %[st0] \n\t" + + /* odd 3. pixel */ + "lbux %[st1], %[Temp2](%[cm]) \n\t" + "preceu.ph.qbr %[p2], %[tn3] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" + "addqh_r.w %[tp2], %[tp2], %[st1] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tn3], 5(%[dst]) \n\t" + + /* odd 4. pixel */ + "sb %[tp2], 1(%[dst]) \n\t" + "sb %[tp1], 6(%[dst]) \n\t" + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbu %[tn1], 7(%[dst]) \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "addqh_r.w %[tn2], %[tn2], %[p4] \n\t" + + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[tn3], %[tn3], %[p2] \n\t" + + "lbux %[n1], %[Temp1](%[cm]) \n\t" + "addqh_r.w %[tn1], %[tn1], %[n1] \n\t" + + /* store bytes */ + "sb %[tn2], 3(%[dst]) \n\t" + "sb %[tn3], 5(%[dst]) \n\t" + "sb %[tn1], 7(%[dst]) \n\t" + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), + [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3), + [st0] "=&r" (st0), [st1] "=&r" (st1), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [n1] "=&r" (n1), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), + [vector4a] "r" (vector4a), + [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) + ); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr, + int32_t src_stride, + uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h, + int32_t count) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + vp9_prefetch_load(src_ptr + src_stride); + vp9_prefetch_load(src_ptr + src_stride + 32); + vp9_prefetch_store(dst_ptr + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__ ( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ + "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ + "ulw %[qload2], 16(%[src]) \n\t" + "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ + "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ + "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ + "ulw %[qload3], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ + "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ + "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ + "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ + "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ + "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ + "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ + "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ + "ulw %[qload2], 17(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ + "ulw %[qload3], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ + "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ + "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ + + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ + + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ + + "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ + "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ + "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ + + : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), + [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [qload3] "=&r" (qload3), [p5] "=&r" (p5), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) + : [filter12] "r" (filter12), [filter34] "r" (filter34), + [filter56] "r" (filter56), [filter78] "r" (filter78), + [vector_64] "r" (vector_64), + [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) + ); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr, + int32_t src_stride, + uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + vp9_prefetch_load(src_ptr + src_stride); + vp9_prefetch_load(src_ptr + src_stride + 32); + vp9_prefetch_load(src_ptr + src_stride + 64); + vp9_prefetch_store(dst_ptr + dst_stride); + vp9_prefetch_store(dst_ptr + dst_stride + 32); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__ ( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ + "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ + "ulw %[qload2], 16(%[src]) \n\t" + "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ + "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ + "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ + "ulw %[qload3], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ + "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ + "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ + "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ + "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ + "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ + "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ + "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ + "ulw %[qload2], 17(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ + "ulw %[qload3], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ + "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ + "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ + + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ + + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ + + "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ + "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ + "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ + + : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), + [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [qload3] "=&r" (qload3), [p5] "=&r" (p5), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) + : [filter12] "r" (filter12), [filter34] "r" (filter34), + [filter56] "r" (filter56), [filter78] "r" (filter78), + [vector_64] "r" (vector_64), + [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) + ); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +void vp9_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if (((const int32_t *)filter_x)[1] == 0x800000) { + vp9_convolve_avg(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + } else { + if (16 == x_step_q4) { + uint32_t pos = 38; + + src -= 3; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + /* prefetch data to cache memory */ + vp9_prefetch_load(src); + vp9_prefetch_load(src + 32); + vp9_prefetch_store(dst); + + switch (w) { + case 4: + convolve_avg_horiz_4_dspr2(src, src_stride, + dst, dst_stride, + filter_x, h); + break; + case 8: + convolve_avg_horiz_8_dspr2(src, src_stride, + dst, dst_stride, + filter_x, h); + break; + case 16: + convolve_avg_horiz_16_dspr2(src, src_stride, + dst, dst_stride, + filter_x, h, 1); + break; + case 32: + convolve_avg_horiz_16_dspr2(src, src_stride, + dst, dst_stride, + filter_x, h, 2); + break; + case 64: + vp9_prefetch_load(src + 64); + vp9_prefetch_store(dst + 32); + + convolve_avg_horiz_64_dspr2(src, src_stride, + dst, dst_stride, + filter_x, h); + break; + default: + vp9_convolve8_avg_horiz_c(src + 3, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + break; + } + } else { + vp9_convolve8_avg_horiz_c(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + } + } +} +#endif diff --git a/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c new file mode 100644 index 000000000..2c48bd038 --- /dev/null +++ b/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c @@ -0,0 +1,1281 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" +#include "vp9/common/vp9_convolve.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +uint8_t vp9_ff_cropTbl_a[256 + 2 * CROP_WIDTH]; +uint8_t *vp9_ff_cropTbl; + +void vp9_dsputil_static_init(void) { + int i; + + for (i = 0; i < 256; i++) vp9_ff_cropTbl_a[i + CROP_WIDTH] = i; + + for (i = 0; i < CROP_WIDTH; i++) { + vp9_ff_cropTbl_a[i] = 0; + vp9_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255; + } + + vp9_ff_cropTbl = &vp9_ff_cropTbl_a[CROP_WIDTH]; +} + +static void convolve_horiz_4_transposed_dspr2(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y; + uint8_t *cm = vp9_ff_cropTbl; + uint8_t *dst_ptr; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3, Temp4; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4; + uint32_t tn1, tn2; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + dst_ptr = dst; + /* prefetch data to cache memory */ + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tn2] \n\t" + "preceu.ph.qbl %[p4], %[tn2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn1] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp4], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[tn1], %[Temp2](%[cm]) \n\t" + "lbux %[p2], %[Temp4](%[cm]) \n\t" + + /* store bytes */ + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[tn1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[tp2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[p2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4), + [dst_ptr] "+r" (dst_ptr) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), + [vector4a] "r" (vector4a), + [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride) + ); + + /* Next row... */ + src += src_stride; + dst += 1; + } +} + +static void convolve_horiz_8_transposed_dspr2(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y; + uint8_t *cm = vp9_ff_cropTbl; + uint8_t *dst_ptr; + uint32_t vector4a = 64; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2, tp3; + uint32_t p1, p2, p3, p4, n1; + uint8_t *odd_dst; + uint32_t dst_pitch_2 = (dst_stride << 1); + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + + dst_ptr = dst; + odd_dst = (dst_ptr + dst_stride); + + __asm__ __volatile__ ( + "ulw %[tp2], 0(%[src]) \n\t" + "ulw %[tp1], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tp1] \n\t" + "preceu.ph.qbl %[p4], %[tp1] \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "preceu.ph.qbr %[p1], %[tp3] \n\t" + "preceu.ph.qbl %[n1], %[tp3] \n\t" + "ulw %[tp2], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. pixel */ + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[tp2] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" + "lbux %[tp3], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" + "extp %[p3], $ac1, 31 \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sb %[Temp2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + "sb %[tp3], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + + "ulw %[tp1], 1(%[src]) \n\t" + "ulw %[tp3], 5(%[src]) \n\t" + + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbux %[tp2], %[p3](%[cm]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp3] \n\t" + "preceu.ph.qbl %[p4], %[tp3] \n\t" + "sb %[tp2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + "ulw %[tp2], 9(%[src]) \n\t" + + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "lbux %[tp1], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[n1], %[tp2] \n\t" + "ulw %[Temp1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + /* odd 3. pixel */ + "lbux %[tp3], %[Temp2](%[cm]) \n\t" + "preceu.ph.qbr %[p2], %[Temp1] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 4. pixel */ + "sb %[tp3], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "lbux %[n1], %[Temp1](%[cm]) \n\t" + + /* store bytes */ + "sb %[p4], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[p2], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[n1], 0(%[odd_dst]) \n\t" + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [n1] "=&r" (n1), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), + [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), + [vector4a] "r" (vector4a), [cm] "r" (cm), + [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) + ); + + /* Next row... */ + src += src_stride; + dst += 1; + } +} + +static void convolve_horiz_16_transposed_dspr2(const uint8_t *src_ptr, + int32_t src_stride, + uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h, + int32_t count) { + int32_t c, y; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + uint32_t dst_pitch_2 = (dst_stride << 1); + uint8_t *odd_dst; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + vp9_prefetch_load(src_ptr + src_stride); + vp9_prefetch_load(src_ptr + src_stride + 32); + + src = src_ptr; + dst = dst_ptr; + + odd_dst = (dst + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__ ( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "ulw %[qload2], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload2] \n\t" + "preceu.ph.qbl %[p5], %[qload2] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "ulw %[qload2], 16(%[src]) \n\t" + "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st3], 0(%[dst]) \n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "ulw %[qload1], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p5], %[qload1] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) \n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "ulw %[qload2], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload2] \n\t" + "preceu.ph.qbl %[p5], %[qload2] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "ulw %[qload2], 17(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "ulw %[qload1], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p5], %[qload1] \n\t" + "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5), + [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), + [dst] "+r" (dst), [odd_dst] "+r" (odd_dst) + : [filter12] "r" (filter12), [filter34] "r" (filter34), + [filter56] "r" (filter56), [filter78] "r" (filter78), + [vector_64] "r" (vector_64), [cm] "r" (cm), + [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) + ); + + src += 16; + dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); + odd_dst = (dst + dst_stride); + } + + /* Next row... */ + src_ptr += src_stride; + + dst_ptr += 1; + } +} + +static void convolve_horiz_64_transposed_dspr2(const uint8_t *src_ptr, + int32_t src_stride, + uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t c, y; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + uint32_t dst_pitch_2 = (dst_stride << 1); + uint8_t *odd_dst; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + vp9_prefetch_load(src_ptr + src_stride); + vp9_prefetch_load(src_ptr + src_stride + 32); + vp9_prefetch_load(src_ptr + src_stride + 64); + + src = src_ptr; + dst = dst_ptr; + + odd_dst = (dst + dst_stride); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__ ( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "ulw %[qload2], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload2] \n\t" + "preceu.ph.qbl %[p5], %[qload2] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "ulw %[qload2], 16(%[src]) \n\t" + "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st3], 0(%[dst]) \n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "ulw %[qload1], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p5], %[qload1] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) \n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "ulw %[qload2], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload2] \n\t" + "preceu.ph.qbl %[p5], %[qload2] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "ulw %[qload2], 17(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "ulw %[qload1], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p5], %[qload1] \n\t" + "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5), + [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), + [dst] "+r" (dst), [odd_dst] "+r" (odd_dst) + : [filter12] "r" (filter12), [filter34] "r" (filter34), + [filter56] "r" (filter56), [filter78] "r" (filter78), + [vector_64] "r" (vector_64), [cm] "r" (cm), + [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) + ); + + src += 16; + dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); + odd_dst = (dst + dst_stride); + } + + /* Next row... */ + src_ptr += src_stride; + + dst_ptr += 1; + } +} + +void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter, int w, int h) { + int x, y, k; + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + int sum = 0; + + for (k = 0; k < 8; ++k) + sum += src[x + k] * filter[k]; + + dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + + src += src_stride; + dst += 1; + } +} + +void vp9_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + DECLARE_ALIGNED_ARRAY(32, uint8_t, temp, 64 * 135); + int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; + uint32_t pos = 38; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + if (intermediate_height < h) + intermediate_height = h; + + if (x_step_q4 != 16 || y_step_q4 != 16) + return vp9_convolve8_c(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + + if ((((const int32_t *)filter_x)[1] == 0x800000) + && (((const int32_t *)filter_y)[1] == 0x800000)) + return vp9_convolve_copy(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + + /* copy the src to dst */ + if (filter_x[3] == 0x80) { + int32_t y; + int32_t c; + const uint8_t *src_ptr = src - src_stride * 3; + uint8_t *dst_ptr = temp; + + for (y = intermediate_height; y--;) { + for (c = 0; c < w; c++) { + dst_ptr[c * intermediate_height] = src_ptr[c]; + } + + /* next row... */ + src_ptr += src_stride; + dst_ptr += 1; + } + } else { + src -= (src_stride * 3 + 3); + + /* prefetch data to cache memory */ + vp9_prefetch_load(src); + vp9_prefetch_load(src + 32); + + switch (w) { + case 4: + convolve_horiz_4_transposed_dspr2(src, src_stride, + temp, intermediate_height, + filter_x, intermediate_height); + break; + case 8: + convolve_horiz_8_transposed_dspr2(src, src_stride, + temp, intermediate_height, + filter_x, intermediate_height); + break; + case 16: + case 32: + convolve_horiz_16_transposed_dspr2(src, src_stride, + temp, intermediate_height, + filter_x, intermediate_height, + (w/16)); + break; + case 64: + vp9_prefetch_load(src + 32); + convolve_horiz_64_transposed_dspr2(src, src_stride, + temp, intermediate_height, + filter_x, intermediate_height); + break; + default: + convolve_horiz_transposed(src, src_stride, + temp, intermediate_height, + filter_x, w, intermediate_height); + break; + } + } + + /* copy the src to dst */ + if (filter_y[3] == 0x80) { + int32_t y; + int32_t c; + uint8_t *src_ptr = temp + 3; + uint8_t *dst_ptr = dst; + + for (y = w; y--;) { + for (c = 0; c < h; c++) { + dst_ptr[c * dst_stride] = src_ptr[c]; + } + + /* next row... */ + src_ptr += intermediate_height; + dst_ptr += 1; + } + } else { + switch (h) { + case 4: + convolve_horiz_4_transposed_dspr2(temp, intermediate_height, + dst, dst_stride, + filter_y, w); + break; + case 8: + convolve_horiz_8_transposed_dspr2(temp, intermediate_height, + dst, dst_stride, + filter_y, w); + break; + case 16: + case 32: + convolve_horiz_16_transposed_dspr2(temp, intermediate_height, + dst, dst_stride, + filter_y, w, (h/16)); + break; + case 64: + convolve_horiz_64_transposed_dspr2(temp, intermediate_height, + dst, dst_stride, + filter_y, w); + break; + default: + convolve_horiz_transposed(temp, intermediate_height, + dst, dst_stride, + filter_y, h, w); + break; + } + } +} + +void vp9_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h) { + int x, y; + + /* prefetch data to cache memory */ + vp9_prefetch_load(src); + vp9_prefetch_load(src + 32); + vp9_prefetch_store(dst); + + switch (w) { + case 4: + { + uint32_t tp1; + + /* 1 word storage */ + for (y = h; y--; ) { + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + vp9_prefetch_store(dst + dst_stride); + + __asm__ __volatile__ ( + "ulw %[tp1], (%[src]) \n\t" + "sw %[tp1], (%[dst]) \n\t" /* store */ + + : [tp1] "=&r" (tp1) + : [src] "r" (src), [dst] "r" (dst) + ); + + src += src_stride; + dst += dst_stride; + } + } + break; + case 8: + { + uint32_t tp1, tp2; + + /* 2 word storage */ + for (y = h; y--; ) { + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + vp9_prefetch_store(dst + dst_stride); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2) + : [src] "r" (src), [dst] "r" (dst) + ); + + src += src_stride; + dst += dst_stride; + } + } + break; + case 16: + { + uint32_t tp1, tp2, tp3, tp4; + + /* 4 word storage */ + for (y = h; y--; ) { + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + vp9_prefetch_store(dst + dst_stride); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "ulw %[tp4], 12(%[src]) \n\t" + + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + "sw %[tp3], 8(%[dst]) \n\t" /* store */ + "sw %[tp4], 12(%[dst]) \n\t" /* store */ + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), + [tp3] "=&r" (tp3), [tp4] "=&r" (tp4) + : [src] "r" (src), [dst] "r" (dst) + ); + + src += src_stride; + dst += dst_stride; + } + } + break; + case 32: + { + uint32_t tp1, tp2, tp3, tp4; + uint32_t tp5, tp6, tp7, tp8; + + /* 8 word storage */ + for (y = h; y--; ) { + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + vp9_prefetch_store(dst + dst_stride); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "ulw %[tp4], 12(%[src]) \n\t" + "ulw %[tp5], 16(%[src]) \n\t" + "ulw %[tp6], 20(%[src]) \n\t" + "ulw %[tp7], 24(%[src]) \n\t" + "ulw %[tp8], 28(%[src]) \n\t" + + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + "sw %[tp3], 8(%[dst]) \n\t" /* store */ + "sw %[tp4], 12(%[dst]) \n\t" /* store */ + "sw %[tp5], 16(%[dst]) \n\t" /* store */ + "sw %[tp6], 20(%[dst]) \n\t" /* store */ + "sw %[tp7], 24(%[dst]) \n\t" /* store */ + "sw %[tp8], 28(%[dst]) \n\t" /* store */ + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), + [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), + [tp5] "=&r" (tp5), [tp6] "=&r" (tp6), + [tp7] "=&r" (tp7), [tp8] "=&r" (tp8) + : [src] "r" (src), [dst] "r" (dst) + ); + + src += src_stride; + dst += dst_stride; + } + } + break; + case 64: + { + uint32_t tp1, tp2, tp3, tp4; + uint32_t tp5, tp6, tp7, tp8; + + vp9_prefetch_load(src + 64); + vp9_prefetch_store(dst + 32); + + /* 16 word storage */ + for (y = h; y--; ) { + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + vp9_prefetch_load(src + src_stride + 64); + vp9_prefetch_store(dst + dst_stride); + vp9_prefetch_store(dst + dst_stride + 32); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "ulw %[tp4], 12(%[src]) \n\t" + "ulw %[tp5], 16(%[src]) \n\t" + "ulw %[tp6], 20(%[src]) \n\t" + "ulw %[tp7], 24(%[src]) \n\t" + "ulw %[tp8], 28(%[src]) \n\t" + + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + "sw %[tp3], 8(%[dst]) \n\t" /* store */ + "sw %[tp4], 12(%[dst]) \n\t" /* store */ + "sw %[tp5], 16(%[dst]) \n\t" /* store */ + "sw %[tp6], 20(%[dst]) \n\t" /* store */ + "sw %[tp7], 24(%[dst]) \n\t" /* store */ + "sw %[tp8], 28(%[dst]) \n\t" /* store */ + + "ulw %[tp1], 32(%[src]) \n\t" + "ulw %[tp2], 36(%[src]) \n\t" + "ulw %[tp3], 40(%[src]) \n\t" + "ulw %[tp4], 44(%[src]) \n\t" + "ulw %[tp5], 48(%[src]) \n\t" + "ulw %[tp6], 52(%[src]) \n\t" + "ulw %[tp7], 56(%[src]) \n\t" + "ulw %[tp8], 60(%[src]) \n\t" + + "sw %[tp1], 32(%[dst]) \n\t" /* store */ + "sw %[tp2], 36(%[dst]) \n\t" /* store */ + "sw %[tp3], 40(%[dst]) \n\t" /* store */ + "sw %[tp4], 44(%[dst]) \n\t" /* store */ + "sw %[tp5], 48(%[dst]) \n\t" /* store */ + "sw %[tp6], 52(%[dst]) \n\t" /* store */ + "sw %[tp7], 56(%[dst]) \n\t" /* store */ + "sw %[tp8], 60(%[dst]) \n\t" /* store */ + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), + [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), + [tp5] "=&r" (tp5), [tp6] "=&r" (tp6), + [tp7] "=&r" (tp7), [tp8] "=&r" (tp8) + : [src] "r" (src), [dst] "r" (dst) + ); + + src += src_stride; + dst += dst_stride; + } + } + break; + default: + for (y = h; y--; ) { + for (x = 0; x < w; ++x) { + dst[x] = src[x]; + } + + src += src_stride; + dst += dst_stride; + } + break; + } +} +#endif diff --git a/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c new file mode 100644 index 000000000..743d64116 --- /dev/null +++ b/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c @@ -0,0 +1,917 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" +#include "vp9/common/vp9_convolve.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +static void convolve_horiz_4_dspr2(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y; + uint8_t *cm = vp9_ff_cropTbl; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3, Temp4; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4; + uint32_t n1, n2, n3, n4; + uint32_t tn1, tn2; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + vp9_prefetch_store(dst + dst_stride); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[n1], %[tp2] \n\t" + "preceu.ph.qbl %[n2], %[tp2] \n\t" + "preceu.ph.qbr %[n3], %[tn2] \n\t" + "preceu.ph.qbl %[n4], %[tn2] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[n1], %[tn1] \n\t" + "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t" + "extp %[Temp4], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[tn1], %[Temp2](%[cm]) \n\t" + "lbux %[n2], %[Temp4](%[cm]) \n\t" + + /* store bytes */ + "sb %[tp1], 0(%[dst]) \n\t" + "sb %[tn1], 1(%[dst]) \n\t" + "sb %[tp2], 2(%[dst]) \n\t" + "sb %[n2], 3(%[dst]) \n\t" + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), + [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), + [vector4a] "r" (vector4a), + [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) + ); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_horiz_8_dspr2(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector4a = 64; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4, n1; + uint32_t tn1, tn2, tn3; + uint32_t st0, st1; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + vp9_prefetch_store(dst + dst_stride); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "preceu.ph.qbl %[n1], %[tn2] \n\t" + "ulw %[tn1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. pixel */ + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[tn1] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sb %[st0], 0(%[dst]) \n\t" + "lbux %[st1], %[Temp3](%[cm]) \n\t" + + "balign %[tn3], %[tn1], 3 \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbux %[st0], %[Temp1](%[cm]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sb %[st1], 2(%[dst]) \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tn2] \n\t" + "preceu.ph.qbl %[p4], %[tn2] \n\t" + "sb %[st0], 4(%[dst]) \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn1] \n\t" + "preceu.ph.qbl %[n1], %[tn1] \n\t" + "lbux %[st0], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + /* odd 3. pixel */ + "lbux %[st1], %[Temp2](%[cm]) \n\t" + "preceu.ph.qbr %[p2], %[tn3] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 4. pixel */ + "sb %[st1], 1(%[dst]) \n\t" + "sb %[st0], 6(%[dst]) \n\t" + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "lbux %[n1], %[Temp1](%[cm]) \n\t" + + /* store bytes */ + "sb %[p4], 3(%[dst]) \n\t" + "sb %[p2], 5(%[dst]) \n\t" + "sb %[n1], 7(%[dst]) \n\t" + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), + [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3), + [st0] "=&r" (st0), [st1] "=&r" (st1), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [n1] "=&r" (n1), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), + [vector4a] "r" (vector4a), + [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) + ); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, + int32_t src_stride, + uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h, + int32_t count) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + vp9_prefetch_load(src_ptr + src_stride); + vp9_prefetch_load(src_ptr + src_stride + 32); + vp9_prefetch_store(dst_ptr + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__ ( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ + "ulw %[qload2], 16(%[src]) \n\t" + "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ + "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ + "ulw %[qload3], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ + "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ + "ulw %[qload2], 17(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ + "ulw %[qload3], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ + "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ + "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ + "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3), + [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [p5] "=&r" (p5), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) + : [filter12] "r" (filter12), [filter34] "r" (filter34), + [filter56] "r" (filter56), [filter78] "r" (filter78), + [vector_64] "r" (vector_64), + [cm] "r" (cm), [dst] "r" (dst), + [src] "r" (src) + ); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, + int32_t src_stride, + uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + vp9_prefetch_load(src_ptr + src_stride); + vp9_prefetch_load(src_ptr + src_stride + 32); + vp9_prefetch_load(src_ptr + src_stride + 64); + vp9_prefetch_store(dst_ptr + dst_stride); + vp9_prefetch_store(dst_ptr + dst_stride + 32); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__ ( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ + "ulw %[qload2], 16(%[src]) \n\t" + "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ + "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ + "ulw %[qload3], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ + "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ + "ulw %[qload2], 17(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ + "ulw %[qload3], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ + "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ + "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ + "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3), + [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [p5] "=&r" (p5), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) + : [filter12] "r" (filter12), [filter34] "r" (filter34), + [filter56] "r" (filter56), [filter78] "r" (filter78), + [vector_64] "r" (vector_64), + [cm] "r" (cm), [dst] "r" (dst), + [src] "r" (src) + ); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +void vp9_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if (((const int32_t *)filter_x)[1] == 0x800000) { + vp9_convolve_copy(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + } else { + if (16 == x_step_q4) { + uint32_t pos = 38; + + vp9_prefetch_load((const uint8_t *)filter_x); + src -= 3; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + /* prefetch data to cache memory */ + vp9_prefetch_load(src); + vp9_prefetch_load(src + 32); + vp9_prefetch_store(dst); + + switch (w) { + case 4: + convolve_horiz_4_dspr2(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + filter_x, (int32_t)h); + break; + case 8: + convolve_horiz_8_dspr2(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + filter_x, (int32_t)h); + break; + case 16: + convolve_horiz_16_dspr2(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + filter_x, (int32_t)h, 1); + break; + case 32: + convolve_horiz_16_dspr2(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + filter_x, (int32_t)h, 2); + break; + case 64: + vp9_prefetch_load(src + 64); + vp9_prefetch_store(dst + 32); + + convolve_horiz_64_dspr2(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + filter_x, (int32_t)h); + break; + default: + vp9_convolve8_horiz_c(src + 3, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + break; + } + } else { + vp9_convolve8_horiz_c(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + } + } +} +#endif diff --git a/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c new file mode 100644 index 000000000..bdc7930b7 --- /dev/null +++ b/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c @@ -0,0 +1,390 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" +#include "vp9/common/vp9_convolve.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +static void convolve_vert_4_dspr2(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_y, + int32_t w, + int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2, load3, load4; + uint32_t p1, p2; + uint32_t n1, n2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2; + + vector1b = ((const int32_t *)filter_y)[0]; + vector2b = ((const int32_t *)filter_y)[1]; + vector3b = ((const int32_t *)filter_y)[2]; + vector4b = ((const int32_t *)filter_y)[3]; + + src -= 3 * src_stride; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + vp9_prefetch_store(dst + dst_stride); + + for (x = 0; x < w; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__ ( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac0, 31 \n\t" + "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [p1] "=&r" (p1), [p2] "=&r" (p2), + [n1] "=&r" (n1), [n2] "=&r" (n2), + [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [store1] "=&r" (store1), [store2] "=&r" (store2), + [src_ptr] "+r" (src_ptr) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), + [vector4a] "r" (vector4a), [src_stride] "r" (src_stride), + [cm] "r" (cm), [dst_ptr] "r" (dst_ptr) + ); + } + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_vert_64_dspr2(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_y, + int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2, load3, load4; + uint32_t p1, p2; + uint32_t n1, n2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2; + + vector1b = ((const int32_t *)filter_y)[0]; + vector2b = ((const int32_t *)filter_y)[1]; + vector3b = ((const int32_t *)filter_y)[2]; + vector4b = ((const int32_t *)filter_y)[3]; + + src -= 3 * src_stride; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + vp9_prefetch_store(dst + dst_stride); + vp9_prefetch_store(dst + dst_stride + 32); + + for (x = 0; x < 64; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__ ( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac0, 31 \n\t" + "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [p1] "=&r" (p1), [p2] "=&r" (p2), + [n1] "=&r" (n1), [n2] "=&r" (n2), + [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [store1] "=&r" (store1), [store2] "=&r" (store2), + [src_ptr] "+r" (src_ptr) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), + [vector4a] "r" (vector4a), [src_stride] "r" (src_stride), + [cm] "r" (cm), [dst_ptr] "r" (dst_ptr) + ); + } + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +void vp9_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if (((const int32_t *)filter_y)[1] == 0x800000) { + vp9_convolve_copy(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + } else { + if (16 == y_step_q4) { + uint32_t pos = 38; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + vp9_prefetch_store(dst); + + switch (w) { + case 4 : + case 8 : + case 16 : + case 32 : + convolve_vert_4_dspr2(src, src_stride, + dst, dst_stride, + filter_y, w, h); + break; + case 64 : + vp9_prefetch_store(dst + 32); + convolve_vert_64_dspr2(src, src_stride, + dst, dst_stride, + filter_y, h); + break; + default: + vp9_convolve8_vert_c(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + break; + } + } else { + vp9_convolve8_vert_c(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + } + } +} + +#endif diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index 864e27e98..f0c653f72 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -58,13 +58,13 @@ void vp9_free_frame_buffers(VP9_COMMON *cm) { } static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) { - cm->mb_cols = (aligned_width + 8) >> 4; - cm->mb_rows = (aligned_height + 8) >> 4; - cm->MBs = cm->mb_rows * cm->mb_cols; - cm->mi_cols = aligned_width >> MI_SIZE_LOG2; cm->mi_rows = aligned_height >> MI_SIZE_LOG2; cm->mode_info_stride = cm->mi_cols + MI_BLOCK_SIZE; + + cm->mb_cols = (cm->mi_cols + 1) >> 1; + cm->mb_rows = (cm->mi_rows + 1) >> 1; + cm->MBs = cm->mb_rows * cm->mb_cols; } static void setup_mi(VP9_COMMON *cm) { @@ -170,13 +170,8 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) { void vp9_create_common(VP9_COMMON *cm) { vp9_machine_specific_config(cm); - vp9_init_mbmode_probs(cm); - cm->tx_mode = ONLY_4X4; cm->comp_pred_mode = HYBRID_PREDICTION; - - // Initialize reference frame sign bias structure to defaults - vpx_memset(cm->ref_frame_sign_bias, 0, sizeof(cm->ref_frame_sign_bias)); } void vp9_remove_common(VP9_COMMON *cm) { diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index 9ab2cc31b..f116c0647 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -20,6 +20,7 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_common_data.h" #include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_mv.h" #include "vp9/common/vp9_scale.h" #include "vp9/common/vp9_seg_common.h" @@ -56,14 +57,6 @@ typedef enum { } FRAME_TYPE; typedef enum { - EIGHTTAP = 0, - EIGHTTAP_SMOOTH = 1, - EIGHTTAP_SHARP = 2, - BILINEAR = 3, - SWITCHABLE = 4 /* should be the last one */ -} INTERPOLATIONFILTERTYPE; - -typedef enum { DC_PRED, // Average of above and left pixels V_PRED, // Vertical H_PRED, // Horizontal @@ -101,10 +94,10 @@ static INLINE int inter_mode_offset(MB_PREDICTION_MODE mode) { modes for the Y blocks to the left and above us; for interframes, there is a single probability table. */ -union b_mode_info { +typedef struct { MB_PREDICTION_MODE as_mode; int_mv as_mv[2]; // first, second inter predictor motion vectors -}; +} b_mode_info; typedef enum { NONE = -1, @@ -154,7 +147,7 @@ typedef struct { typedef struct { MB_MODE_INFO mbmi; - union b_mode_info bmi[4]; + b_mode_info bmi[4]; } MODE_INFO; static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) { @@ -244,7 +237,6 @@ typedef struct macroblockd { unsigned char ab_index; // index of 4x4 block inside the 8x8 block int q_index; - } MACROBLOCKD; static INLINE uint8_t *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE subsize) { diff --git a/vp9/common/vp9_common_data.h b/vp9/common/vp9_common_data.h index 3822bfc08..2945cd203 100644 --- a/vp9/common/vp9_common_data.h +++ b/vp9/common/vp9_common_data.h @@ -29,4 +29,4 @@ extern const TX_SIZE max_txsize_lookup[BLOCK_SIZES]; extern const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZES]; extern const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2]; -#endif // VP9_COMMON_VP9_COMMON_DATA_H +#endif // VP9_COMMON_VP9_COMMON_DATA_H diff --git a/vp9/common/vp9_convolve.h b/vp9/common/vp9_convolve.h index 13220e97e..9a5caa662 100644 --- a/vp9/common/vp9_convolve.h +++ b/vp9/common/vp9_convolve.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_CONVOLVE_H_ -#define VP9_COMMON_CONVOLVE_H_ +#ifndef VP9_COMMON_VP9_CONVOLVE_H_ +#define VP9_COMMON_VP9_CONVOLVE_H_ #include "./vpx_config.h" #include "vpx/vpx_integer.h" @@ -21,9 +21,4 @@ typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h); -struct subpix_fn_table { - const int16_t (*filter_x)[8]; - const int16_t (*filter_y)[8]; -}; - -#endif // VP9_COMMON_CONVOLVE_H_ +#endif // VP9_COMMON_VP9_CONVOLVE_H_ diff --git a/vp9/common/vp9_debugmodes.c b/vp9/common/vp9_debugmodes.c index 79f769e40..355ac1a49 100644 --- a/vp9/common/vp9_debugmodes.c +++ b/vp9/common/vp9_debugmodes.c @@ -63,9 +63,9 @@ void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, char *file) { print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size)); print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode)); - log_frame_info(cm, "Vectors ",mvs); + log_frame_info(cm, "Vectors ", mvs); for (mi_row = 0; mi_row < rows; mi_row++) { - fprintf(mvs,"V "); + fprintf(mvs, "V "); for (mi_col = 0; mi_col < cols; mi_col++) { fprintf(mvs, "%4d:%4d ", mi_8x8[mi_index]->mbmi.mv[0].as_mv.row, mi_8x8[mi_index]->mbmi.mv[0].as_mv.col); diff --git a/vp9/common/vp9_default_coef_probs.h b/vp9/common/vp9_default_coef_probs.h index 185fcedbe..3b512beb9 100644 --- a/vp9/common/vp9_default_coef_probs.h +++ b/vp9/common/vp9_default_coef_probs.h @@ -7,6 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ +#ifndef VP9_COMMON_DEFAULT_COEF_PROBS_H_ +#define VP9_COMMON_DEFAULT_COEF_PROBS_H_ /*Generated file, included by vp9_entropy.c*/ static const vp9_coeff_probs_model default_coef_probs_4x4[BLOCK_TYPES] = { @@ -694,3 +696,4 @@ static const vp9_coeff_probs_model default_coef_probs_32x32[BLOCK_TYPES] = { } }; +#endif // VP9_COMMON_DEFAULT_COEF_PROBS_H_ diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c index 32d9e0cf7..72ea72e09 100644 --- a/vp9/common/vp9_entropy.c +++ b/vp9/common/vp9_entropy.c @@ -107,101 +107,171 @@ DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]) = { }; DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]) = { - 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34, 49, 19, 65, 80, - 50, 4, 35, 66, 20, 81, 96, 51, 5, 36, 82, 97, 67, 112, 21, 52, - 98, 37, 83, 113, 6, 68, 128, 53, 22, 99, 114, 84, 7, 129, 38, 69, - 100, 115, 144, 130, 85, 54, 23, 8, 145, 39, 70, 116, 101, 131, 160, 146, - 55, 86, 24, 71, 132, 117, 161, 40, 9, 102, 147, 176, 162, 87, 56, 25, - 133, 118, 177, 148, 72, 103, 41, 163, 10, 192, 178, 88, 57, 134, 149, 119, - 26, 164, 73, 104, 193, 42, 179, 208, 11, 135, 89, 165, 120, 150, 58, 194, - 180, 27, 74, 209, 105, 151, 136, 43, 90, 224, 166, 195, 181, 121, 210, 59, - 12, 152, 106, 167, 196, 75, 137, 225, 211, 240, 182, 122, 91, 28, 197, 13, - 226, 168, 183, 153, 44, 212, 138, 107, 241, 60, 29, 123, 198, 184, 227, 169, - 242, 76, 213, 154, 45, 92, 14, 199, 139, 61, 228, 214, 170, 185, 243, 108, - 77, 155, 30, 15, 200, 229, 124, 215, 244, 93, 46, 186, 171, 201, 109, 140, - 230, 62, 216, 245, 31, 125, 78, 156, 231, 47, 187, 202, 217, 94, 246, 141, - 63, 232, 172, 110, 247, 157, 79, 218, 203, 126, 233, 188, 248, 95, 173, 142, - 219, 111, 249, 234, 158, 127, 189, 204, 250, 235, 143, 174, 220, 205, 159, 251, - 190, 221, 175, 236, 237, 191, 206, 252, 222, 253, 207, 238, 223, 254, 239, 255, + 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34, 49, 19, 65, 80, + 50, 4, 35, 66, 20, 81, 96, 51, 5, 36, 82, 97, 67, 112, 21, 52, + 98, 37, 83, 113, 6, 68, 128, 53, 22, 99, 114, 84, 7, 129, 38, 69, + 100, 115, 144, 130, 85, 54, 23, 8, 145, 39, 70, 116, 101, 131, 160, 146, + 55, 86, 24, 71, 132, 117, 161, 40, 9, 102, 147, 176, 162, 87, 56, 25, + 133, 118, 177, 148, 72, 103, 41, 163, 10, 192, 178, 88, 57, 134, 149, 119, + 26, 164, 73, 104, 193, 42, 179, 208, 11, 135, 89, 165, 120, 150, 58, 194, + 180, 27, 74, 209, 105, 151, 136, 43, 90, 224, 166, 195, 181, 121, 210, 59, + 12, 152, 106, 167, 196, 75, 137, 225, 211, 240, 182, 122, 91, 28, 197, 13, + 226, 168, 183, 153, 44, 212, 138, 107, 241, 60, 29, 123, 198, 184, 227, 169, + 242, 76, 213, 154, 45, 92, 14, 199, 139, 61, 228, 214, 170, 185, 243, 108, + 77, 155, 30, 15, 200, 229, 124, 215, 244, 93, 46, 186, 171, 201, 109, 140, + 230, 62, 216, 245, 31, 125, 78, 156, 231, 47, 187, 202, 217, 94, 246, 141, + 63, 232, 172, 110, 247, 157, 79, 218, 203, 126, 233, 188, 248, 95, 173, 142, + 219, 111, 249, 234, 158, 127, 189, 204, 250, 235, 143, 174, 220, 205, 159, + 251, + 190, 221, 175, 236, 237, 191, 206, 252, 222, 253, 207, 238, 223, 254, 239, + 255, }; DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]) = { - 0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, 81, - 34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51, 129, 4, - 67, 176, 20, 114, 145, 83, 36, 99, 130, 52, 192, 5, 161, 68, 115, 21, - 146, 84, 208, 177, 37, 131, 100, 53, 162, 224, 69, 6, 116, 193, 147, 85, - 22, 240, 132, 38, 178, 101, 163, 54, 209, 117, 70, 7, 148, 194, 86, 179, - 225, 23, 133, 39, 164, 8, 102, 210, 241, 55, 195, 118, 149, 71, 180, 24, - 87, 226, 134, 165, 211, 40, 103, 56, 72, 150, 196, 242, 119, 9, 181, 227, - 88, 166, 25, 135, 41, 104, 212, 57, 151, 197, 120, 73, 243, 182, 136, 167, - 213, 89, 10, 228, 105, 152, 198, 26, 42, 121, 183, 244, 168, 58, 137, 229, - 74, 214, 90, 153, 199, 184, 11, 106, 245, 27, 122, 230, 169, 43, 215, 59, - 200, 138, 185, 246, 75, 12, 91, 154, 216, 231, 107, 28, 44, 201, 123, 170, - 60, 247, 232, 76, 139, 13, 92, 217, 186, 248, 155, 108, 29, 124, 45, 202, - 233, 171, 61, 14, 77, 140, 15, 249, 93, 30, 187, 156, 218, 46, 109, 125, - 62, 172, 78, 203, 31, 141, 234, 94, 47, 188, 63, 157, 110, 250, 219, 79, - 126, 204, 173, 142, 95, 189, 111, 235, 158, 220, 251, 127, 174, 143, 205, 236, - 159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239, 255, + 0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, 81, + 34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51, 129, 4, + 67, 176, 20, 114, 145, 83, 36, 99, 130, 52, 192, 5, 161, 68, 115, 21, + 146, 84, 208, 177, 37, 131, 100, 53, 162, 224, 69, 6, 116, 193, 147, 85, + 22, 240, 132, 38, 178, 101, 163, 54, 209, 117, 70, 7, 148, 194, 86, 179, + 225, 23, 133, 39, 164, 8, 102, 210, 241, 55, 195, 118, 149, 71, 180, 24, + 87, 226, 134, 165, 211, 40, 103, 56, 72, 150, 196, 242, 119, 9, 181, 227, + 88, 166, 25, 135, 41, 104, 212, 57, 151, 197, 120, 73, 243, 182, 136, 167, + 213, 89, 10, 228, 105, 152, 198, 26, 42, 121, 183, 244, 168, 58, 137, 229, + 74, 214, 90, 153, 199, 184, 11, 106, 245, 27, 122, 230, 169, 43, 215, 59, + 200, 138, 185, 246, 75, 12, 91, 154, 216, 231, 107, 28, 44, 201, 123, 170, + 60, 247, 232, 76, 139, 13, 92, 217, 186, 248, 155, 108, 29, 124, 45, 202, + 233, 171, 61, 14, 77, 140, 15, 249, 93, 30, 187, 156, 218, 46, 109, 125, + 62, 172, 78, 203, 31, 141, 234, 94, 47, 188, 63, 157, 110, 250, 219, 79, + 126, 204, 173, 142, 95, 189, 111, 235, 158, 220, 251, 127, 174, 143, 205, + 236, + 159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239, + 255, }; DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]) = { - 0, 1, 2, 16, 3, 17, 4, 18, 32, 5, 33, 19, 6, 34, 48, 20, - 49, 7, 35, 21, 50, 64, 8, 36, 65, 22, 51, 37, 80, 9, 66, 52, - 23, 38, 81, 67, 10, 53, 24, 82, 68, 96, 39, 11, 54, 83, 97, 69, - 25, 98, 84, 40, 112, 55, 12, 70, 99, 113, 85, 26, 41, 56, 114, 100, - 13, 71, 128, 86, 27, 115, 101, 129, 42, 57, 72, 116, 14, 87, 130, 102, - 144, 73, 131, 117, 28, 58, 15, 88, 43, 145, 103, 132, 146, 118, 74, 160, - 89, 133, 104, 29, 59, 147, 119, 44, 161, 148, 90, 105, 134, 162, 120, 176, - 75, 135, 149, 30, 60, 163, 177, 45, 121, 91, 106, 164, 178, 150, 192, 136, - 165, 179, 31, 151, 193, 76, 122, 61, 137, 194, 107, 152, 180, 208, 46, 166, - 167, 195, 92, 181, 138, 209, 123, 153, 224, 196, 77, 168, 210, 182, 240, 108, - 197, 62, 154, 225, 183, 169, 211, 47, 139, 93, 184, 226, 212, 241, 198, 170, - 124, 155, 199, 78, 213, 185, 109, 227, 200, 63, 228, 242, 140, 214, 171, 186, - 156, 229, 243, 125, 94, 201, 244, 215, 216, 230, 141, 187, 202, 79, 172, 110, - 157, 245, 217, 231, 95, 246, 232, 126, 203, 247, 233, 173, 218, 142, 111, 158, - 188, 248, 127, 234, 219, 249, 189, 204, 143, 174, 159, 250, 235, 205, 220, 175, - 190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254, 255, + 0, 1, 2, 16, 3, 17, 4, 18, 32, 5, 33, 19, 6, 34, 48, 20, + 49, 7, 35, 21, 50, 64, 8, 36, 65, 22, 51, 37, 80, 9, 66, 52, + 23, 38, 81, 67, 10, 53, 24, 82, 68, 96, 39, 11, 54, 83, 97, 69, + 25, 98, 84, 40, 112, 55, 12, 70, 99, 113, 85, 26, 41, 56, 114, 100, + 13, 71, 128, 86, 27, 115, 101, 129, 42, 57, 72, 116, 14, 87, 130, 102, + 144, 73, 131, 117, 28, 58, 15, 88, 43, 145, 103, 132, 146, 118, 74, 160, + 89, 133, 104, 29, 59, 147, 119, 44, 161, 148, 90, 105, 134, 162, 120, 176, + 75, 135, 149, 30, 60, 163, 177, 45, 121, 91, 106, 164, 178, 150, 192, 136, + 165, 179, 31, 151, 193, 76, 122, 61, 137, 194, 107, 152, 180, 208, 46, 166, + 167, 195, 92, 181, 138, 209, 123, 153, 224, 196, 77, 168, 210, 182, 240, 108, + 197, 62, 154, 225, 183, 169, 211, 47, 139, 93, 184, 226, 212, 241, 198, 170, + 124, 155, 199, 78, 213, 185, 109, 227, 200, 63, 228, 242, 140, 214, 171, 186, + 156, 229, 243, 125, 94, 201, 244, 215, 216, 230, 141, 187, 202, 79, 172, 110, + 157, 245, 217, 231, 95, 246, 232, 126, 203, 247, 233, 173, 218, 142, 111, + 158, + 188, 248, 127, 234, 219, 249, 189, 204, 143, 174, 159, 250, 235, 205, 220, + 175, + 190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254, + 255, }; DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]) = { - 0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, 160, 129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5, 162, 193, 68, 131, 37, 100, - 225, 194, 256, 163, 69, 132, 6, 226, 257, 288, 195, 101, 164, 38, 258, 7, 227, 289, 133, 320, 70, 196, 165, 290, 259, 228, 39, 321, 102, 352, 8, 197, - 71, 134, 322, 291, 260, 353, 384, 229, 166, 103, 40, 354, 323, 292, 135, 385, 198, 261, 72, 9, 416, 167, 386, 355, 230, 324, 104, 293, 41, 417, 199, 136, - 262, 387, 448, 325, 356, 10, 73, 418, 231, 168, 449, 294, 388, 105, 419, 263, 42, 200, 357, 450, 137, 480, 74, 326, 232, 11, 389, 169, 295, 420, 106, 451, - 481, 358, 264, 327, 201, 43, 138, 512, 482, 390, 296, 233, 170, 421, 75, 452, 359, 12, 513, 265, 483, 328, 107, 202, 514, 544, 422, 391, 453, 139, 44, 234, - 484, 297, 360, 171, 76, 515, 545, 266, 329, 454, 13, 423, 203, 108, 546, 485, 576, 298, 235, 140, 361, 330, 172, 547, 45, 455, 267, 577, 486, 77, 204, 362, - 608, 14, 299, 578, 109, 236, 487, 609, 331, 141, 579, 46, 15, 173, 610, 363, 78, 205, 16, 110, 237, 611, 142, 47, 174, 79, 206, 17, 111, 238, 48, 143, - 80, 175, 112, 207, 49, 18, 239, 81, 113, 19, 50, 82, 114, 51, 83, 115, 640, 516, 392, 268, 144, 20, 672, 641, 548, 517, 424, 393, 300, 269, 176, 145, - 52, 21, 704, 673, 642, 580, 549, 518, 456, 425, 394, 332, 301, 270, 208, 177, 146, 84, 53, 22, 736, 705, 674, 643, 612, 581, 550, 519, 488, 457, 426, 395, - 364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, 23, 737, 706, 675, 613, 582, 551, 489, 458, 427, 365, 334, 303, 241, 210, 179, 117, 86, 55, 738, 707, - 614, 583, 490, 459, 366, 335, 242, 211, 118, 87, 739, 615, 491, 367, 243, 119, 768, 644, 520, 396, 272, 148, 24, 800, 769, 676, 645, 552, 521, 428, 397, 304, - 273, 180, 149, 56, 25, 832, 801, 770, 708, 677, 646, 584, 553, 522, 460, 429, 398, 336, 305, 274, 212, 181, 150, 88, 57, 26, 864, 833, 802, 771, 740, 709, - 678, 647, 616, 585, 554, 523, 492, 461, 430, 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89, 58, 27, 865, 834, 803, 741, 710, 679, 617, 586, 555, 493, - 462, 431, 369, 338, 307, 245, 214, 183, 121, 90, 59, 866, 835, 742, 711, 618, 587, 494, 463, 370, 339, 246, 215, 122, 91, 867, 743, 619, 495, 371, 247, 123, - 896, 772, 648, 524, 400, 276, 152, 28, 928, 897, 804, 773, 680, 649, 556, 525, 432, 401, 308, 277, 184, 153, 60, 29, 960, 929, 898, 836, 805, 774, 712, 681, - 650, 588, 557, 526, 464, 433, 402, 340, 309, 278, 216, 185, 154, 92, 61, 30, 992, 961, 930, 899, 868, 837, 806, 775, 744, 713, 682, 651, 620, 589, 558, 527, - 496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124, 93, 62, 31, 993, 962, 931, 869, 838, 807, 745, 714, 683, 621, 590, 559, 497, 466, 435, 373, - 342, 311, 249, 218, 187, 125, 94, 63, 994, 963, 870, 839, 746, 715, 622, 591, 498, 467, 374, 343, 250, 219, 126, 95, 995, 871, 747, 623, 499, 375, 251, 127, - 900, 776, 652, 528, 404, 280, 156, 932, 901, 808, 777, 684, 653, 560, 529, 436, 405, 312, 281, 188, 157, 964, 933, 902, 840, 809, 778, 716, 685, 654, 592, 561, - 530, 468, 437, 406, 344, 313, 282, 220, 189, 158, 996, 965, 934, 903, 872, 841, 810, 779, 748, 717, 686, 655, 624, 593, 562, 531, 500, 469, 438, 407, 376, 345, - 314, 283, 252, 221, 190, 159, 997, 966, 935, 873, 842, 811, 749, 718, 687, 625, 594, 563, 501, 470, 439, 377, 346, 315, 253, 222, 191, 998, 967, 874, 843, 750, - 719, 626, 595, 502, 471, 378, 347, 254, 223, 999, 875, 751, 627, 503, 379, 255, 904, 780, 656, 532, 408, 284, 936, 905, 812, 781, 688, 657, 564, 533, 440, 409, - 316, 285, 968, 937, 906, 844, 813, 782, 720, 689, 658, 596, 565, 534, 472, 441, 410, 348, 317, 286, 1000, 969, 938, 907, 876, 845, 814, 783, 752, 721, 690, 659, - 628, 597, 566, 535, 504, 473, 442, 411, 380, 349, 318, 287, 1001, 970, 939, 877, 846, 815, 753, 722, 691, 629, 598, 567, 505, 474, 443, 381, 350, 319, 1002, 971, - 878, 847, 754, 723, 630, 599, 506, 475, 382, 351, 1003, 879, 755, 631, 507, 383, 908, 784, 660, 536, 412, 940, 909, 816, 785, 692, 661, 568, 537, 444, 413, 972, - 941, 910, 848, 817, 786, 724, 693, 662, 600, 569, 538, 476, 445, 414, 1004, 973, 942, 911, 880, 849, 818, 787, 756, 725, 694, 663, 632, 601, 570, 539, 508, 477, - 446, 415, 1005, 974, 943, 881, 850, 819, 757, 726, 695, 633, 602, 571, 509, 478, 447, 1006, 975, 882, 851, 758, 727, 634, 603, 510, 479, 1007, 883, 759, 635, 511, - 912, 788, 664, 540, 944, 913, 820, 789, 696, 665, 572, 541, 976, 945, 914, 852, 821, 790, 728, 697, 666, 604, 573, 542, 1008, 977, 946, 915, 884, 853, 822, 791, - 760, 729, 698, 667, 636, 605, 574, 543, 1009, 978, 947, 885, 854, 823, 761, 730, 699, 637, 606, 575, 1010, 979, 886, 855, 762, 731, 638, 607, 1011, 887, 763, 639, - 916, 792, 668, 948, 917, 824, 793, 700, 669, 980, 949, 918, 856, 825, 794, 732, 701, 670, 1012, 981, 950, 919, 888, 857, 826, 795, 764, 733, 702, 671, 1013, 982, - 951, 889, 858, 827, 765, 734, 703, 1014, 983, 890, 859, 766, 735, 1015, 891, 767, 920, 796, 952, 921, 828, 797, 984, 953, 922, 860, 829, 798, 1016, 985, 954, 923, - 892, 861, 830, 799, 1017, 986, 955, 893, 862, 831, 1018, 987, 894, 863, 1019, 895, 924, 956, 925, 988, 957, 926, 1020, 989, 958, 927, 1021, 990, 959, 1022, 991, 1023, + 0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, 160, + 129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5, 162, 193, + 68, 131, 37, 100, + 225, 194, 256, 163, 69, 132, 6, 226, 257, 288, 195, 101, 164, 38, + 258, 7, 227, 289, 133, 320, 70, 196, 165, 290, 259, 228, 39, 321, + 102, 352, 8, 197, + 71, 134, 322, 291, 260, 353, 384, 229, 166, 103, 40, 354, 323, 292, + 135, 385, 198, 261, 72, 9, 416, 167, 386, 355, 230, 324, 104, 293, + 41, 417, 199, 136, + 262, 387, 448, 325, 356, 10, 73, 418, 231, 168, 449, 294, 388, 105, + 419, 263, 42, 200, 357, 450, 137, 480, 74, 326, 232, 11, 389, 169, + 295, 420, 106, 451, + 481, 358, 264, 327, 201, 43, 138, 512, 482, 390, 296, 233, 170, 421, + 75, 452, 359, 12, 513, 265, 483, 328, 107, 202, 514, 544, 422, 391, + 453, 139, 44, 234, + 484, 297, 360, 171, 76, 515, 545, 266, 329, 454, 13, 423, 203, 108, + 546, 485, 576, 298, 235, 140, 361, 330, 172, 547, 45, 455, 267, 577, + 486, 77, 204, 362, + 608, 14, 299, 578, 109, 236, 487, 609, 331, 141, 579, 46, 15, 173, + 610, 363, 78, 205, 16, 110, 237, 611, 142, 47, 174, 79, 206, 17, + 111, 238, 48, 143, + 80, 175, 112, 207, 49, 18, 239, 81, 113, 19, 50, 82, 114, 51, + 83, 115, 640, 516, 392, 268, 144, 20, 672, 641, 548, 517, 424, + 393, 300, 269, 176, 145, + 52, 21, 704, 673, 642, 580, 549, 518, 456, 425, 394, 332, 301, + 270, 208, 177, 146, 84, 53, 22, 736, 705, 674, 643, 612, 581, + 550, 519, 488, 457, 426, 395, + 364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, 23, 737, + 706, 675, 613, 582, 551, 489, 458, 427, 365, 334, 303, 241, + 210, 179, 117, 86, 55, 738, 707, + 614, 583, 490, 459, 366, 335, 242, 211, 118, 87, 739, 615, 491, + 367, 243, 119, 768, 644, 520, 396, 272, 148, 24, 800, 769, 676, + 645, 552, 521, 428, 397, 304, + 273, 180, 149, 56, 25, 832, 801, 770, 708, 677, 646, 584, 553, + 522, 460, 429, 398, 336, 305, 274, 212, 181, 150, 88, 57, 26, + 864, 833, 802, 771, 740, 709, + 678, 647, 616, 585, 554, 523, 492, 461, 430, 399, 368, 337, 306, + 275, 244, 213, 182, 151, 120, 89, 58, 27, 865, 834, 803, 741, + 710, 679, 617, 586, 555, 493, + 462, 431, 369, 338, 307, 245, 214, 183, 121, 90, 59, 866, 835, + 742, 711, 618, 587, 494, 463, 370, 339, 246, 215, 122, 91, 867, + 743, 619, 495, 371, 247, 123, + 896, 772, 648, 524, 400, 276, 152, 28, 928, 897, 804, 773, 680, + 649, 556, 525, 432, 401, 308, 277, 184, 153, 60, 29, 960, 929, + 898, 836, 805, 774, 712, 681, + 650, 588, 557, 526, 464, 433, 402, 340, 309, 278, 216, 185, 154, + 92, 61, 30, 992, 961, 930, 899, 868, 837, 806, 775, 744, 713, 682, + 651, 620, 589, 558, 527, + 496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124, + 93, 62, 31, 993, 962, 931, 869, 838, 807, 745, 714, 683, 621, 590, + 559, 497, 466, 435, 373, + 342, 311, 249, 218, 187, 125, 94, 63, 994, 963, 870, 839, 746, 715, + 622, 591, 498, 467, 374, 343, 250, 219, 126, 95, 995, 871, 747, 623, + 499, 375, 251, 127, + 900, 776, 652, 528, 404, 280, 156, 932, 901, 808, 777, 684, 653, 560, + 529, 436, 405, 312, 281, 188, 157, 964, 933, 902, 840, 809, 778, 716, + 685, 654, 592, 561, + 530, 468, 437, 406, 344, 313, 282, 220, 189, 158, 996, 965, 934, 903, + 872, 841, 810, 779, 748, 717, 686, 655, 624, 593, 562, 531, 500, 469, + 438, 407, 376, 345, + 314, 283, 252, 221, 190, 159, 997, 966, 935, 873, 842, 811, 749, 718, + 687, 625, 594, 563, 501, 470, 439, 377, 346, 315, 253, 222, 191, 998, + 967, 874, 843, 750, + 719, 626, 595, 502, 471, 378, 347, 254, 223, 999, 875, 751, 627, 503, + 379, 255, 904, 780, 656, 532, 408, 284, 936, 905, 812, 781, 688, 657, + 564, 533, 440, 409, + 316, 285, 968, 937, 906, 844, 813, 782, 720, 689, 658, 596, 565, 534, + 472, 441, 410, 348, 317, 286, 1000, 969, 938, 907, 876, 845, 814, 783, + 752, 721, 690, 659, + 628, 597, 566, 535, 504, 473, 442, 411, 380, 349, 318, 287, 1001, 970, + 939, 877, 846, 815, 753, 722, 691, 629, 598, 567, 505, 474, 443, 381, + 350, 319, 1002, 971, + 878, 847, 754, 723, 630, 599, 506, 475, 382, 351, 1003, 879, 755, 631, + 507, 383, 908, 784, 660, 536, 412, 940, 909, 816, 785, 692, 661, 568, + 537, 444, 413, 972, + 941, 910, 848, 817, 786, 724, 693, 662, 600, 569, 538, 476, 445, 414, + 1004, 973, 942, 911, 880, 849, 818, 787, 756, 725, 694, 663, 632, 601, + 570, 539, 508, 477, + 446, 415, 1005, 974, 943, 881, 850, 819, 757, 726, 695, 633, 602, 571, + 509, 478, 447, 1006, 975, 882, 851, 758, 727, 634, 603, 510, 479, + 1007, 883, 759, 635, 511, + 912, 788, 664, 540, 944, 913, 820, 789, 696, 665, 572, 541, 976, 945, + 914, 852, 821, 790, 728, 697, 666, 604, 573, 542, 1008, 977, 946, 915, + 884, 853, 822, 791, + 760, 729, 698, 667, 636, 605, 574, 543, 1009, 978, 947, 885, 854, 823, + 761, 730, 699, 637, 606, 575, 1010, 979, 886, 855, 762, 731, 638, 607, + 1011, 887, 763, 639, + 916, 792, 668, 948, 917, 824, 793, 700, 669, 980, 949, 918, 856, 825, + 794, 732, 701, 670, 1012, 981, 950, 919, 888, 857, 826, 795, 764, 733, + 702, 671, 1013, 982, + 951, 889, 858, 827, 765, 734, 703, 1014, 983, 890, 859, 766, 735, 1015, + 891, 767, 920, 796, 952, 921, 828, 797, 984, 953, 922, 860, 829, 798, + 1016, 985, 954, 923, + 892, 861, 830, 799, 1017, 986, 955, 893, 862, 831, 1018, 987, 894, 863, + 1019, 895, 924, 956, 925, 988, 957, 926, 1020, 989, 958, 927, 1021, + 990, 959, 1022, 991, 1023, }; /* Array indices are identical to previously-existing CONTEXT_NODE indices */ -const vp9_tree_index vp9_coef_tree[ 22] = /* corresponding _CONTEXT_NODEs */ -{ +const vp9_tree_index vp9_coef_tree[ 22] = { -DCT_EOB_TOKEN, 2, /* 0 = EOB */ -ZERO_TOKEN, 4, /* 1 = ZERO */ -ONE_TOKEN, 6, /* 2 = ONE */ @@ -569,31 +639,6 @@ void vp9_init_neighbors() { vp9_default_scan_32x32_neighbors); } -const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan) { - if (scan == vp9_default_scan_4x4) { - return vp9_default_scan_4x4_neighbors; - } else if (scan == vp9_row_scan_4x4) { - return vp9_row_scan_4x4_neighbors; - } else if (scan == vp9_col_scan_4x4) { - return vp9_col_scan_4x4_neighbors; - } else if (scan == vp9_default_scan_8x8) { - return vp9_default_scan_8x8_neighbors; - } else if (scan == vp9_row_scan_8x8) { - return vp9_row_scan_8x8_neighbors; - } else if (scan == vp9_col_scan_8x8) { - return vp9_col_scan_8x8_neighbors; - } else if (scan == vp9_default_scan_16x16) { - return vp9_default_scan_16x16_neighbors; - } else if (scan == vp9_row_scan_16x16) { - return vp9_row_scan_16x16_neighbors; - } else if (scan == vp9_col_scan_16x16) { - return vp9_col_scan_16x16_neighbors; - } else { - assert(scan == vp9_default_scan_32x32); - return vp9_default_scan_32x32_neighbors; - } -} - void vp9_coef_tree_initialize() { vp9_init_neighbors(); init_bit_trees(); diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h index c1f2d782b..ef9ea46f5 100644 --- a/vp9/common/vp9_entropy.h +++ b/vp9/common/vp9_entropy.h @@ -190,9 +190,6 @@ static INLINE int get_coef_context(const int16_t *neighbors, token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1; } -const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan); - - // 128 lists of probabilities are stored for the following ONE node probs: // 1, 3, 5, 7, ..., 253, 255 // In between probabilities are interpolated linearly @@ -210,9 +207,6 @@ typedef vp9_prob vp9_coeff_probs_model[REF_TYPES][COEF_BANDS] typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS] [PREV_COEF_CONTEXTS] [UNCONSTRAINED_NODES + 1]; -typedef unsigned int vp9_coeff_stats_model[REF_TYPES][COEF_BANDS] - [PREV_COEF_CONTEXTS] - [UNCONSTRAINED_NODES][2]; void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full); @@ -367,22 +361,24 @@ static int get_entropy_context(TX_SIZE tx_size, static void get_scan_and_band(const MACROBLOCKD *xd, TX_SIZE tx_size, PLANE_TYPE type, int block_idx, const int16_t **scan, + const int16_t **scan_nb, const uint8_t **band_translate) { switch (tx_size) { case TX_4X4: - *scan = get_scan_4x4(get_tx_type_4x4(type, xd, block_idx)); + get_scan_nb_4x4(get_tx_type_4x4(type, xd, block_idx), scan, scan_nb); *band_translate = vp9_coefband_trans_4x4; break; case TX_8X8: - *scan = get_scan_8x8(get_tx_type_8x8(type, xd)); + get_scan_nb_8x8(get_tx_type_8x8(type, xd), scan, scan_nb); *band_translate = vp9_coefband_trans_8x8plus; break; case TX_16X16: - *scan = get_scan_16x16(get_tx_type_16x16(type, xd)); + get_scan_nb_16x16(get_tx_type_16x16(type, xd), scan, scan_nb); *band_translate = vp9_coefband_trans_8x8plus; break; case TX_32X32: *scan = vp9_default_scan_32x32; + *scan_nb = vp9_default_scan_32x32_neighbors; *band_translate = vp9_coefband_trans_8x8plus; break; default: diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c index 93c89b03a..e17679616 100644 --- a/vp9/common/vp9_entropymode.c +++ b/vp9/common/vp9_entropymode.c @@ -286,7 +286,7 @@ static const struct tx_probs default_tx_probs = { { 66 } } }; -void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p, +void tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p, unsigned int (*ct_32x32p)[2]) { ct_32x32p[0][0] = tx_count_32x32p[TX_4X4]; ct_32x32p[0][1] = tx_count_32x32p[TX_8X8] + @@ -299,7 +299,7 @@ void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p, ct_32x32p[2][1] = tx_count_32x32p[TX_32X32]; } -void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p, +void tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p, unsigned int (*ct_16x16p)[2]) { ct_16x16p[0][0] = tx_count_16x16p[TX_4X4]; ct_16x16p[0][1] = tx_count_16x16p[TX_8X8] + tx_count_16x16p[TX_16X16]; @@ -307,7 +307,7 @@ void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p, ct_16x16p[1][1] = tx_count_16x16p[TX_16X16]; } -void tx_counts_to_branch_counts_8x8(unsigned int *tx_count_8x8p, +void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p, unsigned int (*ct_8x8p)[2]) { ct_8x8p[0][0] = tx_count_8x8p[TX_4X4]; ct_8x8p[0][1] = tx_count_8x8p[TX_8X8]; @@ -356,17 +356,19 @@ void vp9_entropy_mode_init() { #define COUNT_SAT 20 #define MAX_UPDATE_FACTOR 128 -static int update_ct(vp9_prob pre_prob, vp9_prob prob, unsigned int ct[2]) { +static int update_ct(vp9_prob pre_prob, vp9_prob prob, + const unsigned int ct[2]) { return merge_probs(pre_prob, prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR); } -static int update_ct2(vp9_prob pre_prob, unsigned int ct[2]) { +static int update_ct2(vp9_prob pre_prob, const unsigned int ct[2]) { return merge_probs2(pre_prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR); } static void update_mode_probs(int n_modes, - const vp9_tree_index *tree, unsigned int *cnt, - vp9_prob *pre_probs, vp9_prob *dst_probs, + const vp9_tree_index *tree, + const unsigned int *cnt, + const vp9_prob *pre_probs, vp9_prob *dst_probs, unsigned int tok0_offset) { #define MAX_PROBS 32 vp9_prob probs[MAX_PROBS]; @@ -382,8 +384,8 @@ static void update_mode_probs(int n_modes, void vp9_adapt_mode_probs(VP9_COMMON *cm) { int i, j; FRAME_CONTEXT *fc = &cm->fc; - FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx]; - FRAME_COUNTS *counts = &cm->counts; + const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx]; + const FRAME_COUNTS *counts = &cm->counts; for (i = 0; i < INTRA_INTER_CONTEXTS; i++) fc->intra_inter_prob[i] = update_ct2(pre_fc->intra_inter_prob[i], diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h index 31537c7f7..ccade2752 100644 --- a/vp9/common/vp9_entropymode.h +++ b/vp9/common/vp9_entropymode.h @@ -61,11 +61,11 @@ void vp9_init_mbmode_probs(struct VP9Common *cm); void vp9_adapt_mode_probs(struct VP9Common *cm); -void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p, +void tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p, unsigned int (*ct_32x32p)[2]); -void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p, +void tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p, unsigned int (*ct_16x16p)[2]); -void tx_counts_to_branch_counts_8x8(unsigned int *tx_count_8x8p, +void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p, unsigned int (*ct_8x8p)[2]); #endif // VP9_COMMON_VP9_ENTROPYMODE_H_ diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c index 2e973e53f..a9e25b727 100644 --- a/vp9/common/vp9_entropymv.c +++ b/vp9/common/vp9_entropymv.c @@ -39,12 +39,12 @@ const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2] = { }; struct vp9_token vp9_mv_class_encodings[MV_CLASSES]; -const vp9_tree_index vp9_mv_class0_tree [2 * CLASS0_SIZE - 2] = { +const vp9_tree_index vp9_mv_class0_tree[2 * CLASS0_SIZE - 2] = { -0, -1, }; struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE]; -const vp9_tree_index vp9_mv_fp_tree [2 * 4 - 2] = { +const vp9_tree_index vp9_mv_fp_tree[2 * 4 - 2] = { -0, 2, -1, 4, -2, -3 @@ -53,8 +53,8 @@ struct vp9_token vp9_mv_fp_encodings[4]; static const nmv_context default_nmv_context = { {32, 64, 96}, - { - { /* vert component */ + { // NOLINT + { /* vert component */ // NOLINT 128, /* sign */ {224, 144, 192, 168, 192, 176, 192, 198, 198, 245}, /* class */ {216}, /* class0 */ @@ -64,7 +64,7 @@ static const nmv_context default_nmv_context = { 160, /* class0_hp bit */ 128, /* hp */ }, - { /* hor component */ + { /* hor component */ // NOLINT 128, /* sign */ {216, 128, 176, 160, 176, 176, 192, 198, 198, 208}, /* class */ {208}, /* class0 */ @@ -149,7 +149,7 @@ int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) { static void inc_mv_component(int v, nmv_component_counts *comp_counts, int incr, int usehp) { int s, z, c, o, d, e, f; - assert (v != 0); /* should not be zero */ + assert(v != 0); /* should not be zero */ s = v < 0; comp_counts->sign[s] += incr; z = (s ? -v : v) - 1; /* magnitude - 1 */ @@ -198,8 +198,6 @@ static unsigned int adapt_probs(unsigned int i, vp9_prob this_probs[], const vp9_prob last_probs[], const unsigned int num_events[]) { - - const unsigned int left = tree[i] <= 0 ? num_events[-tree[i]] : adapt_probs(tree[i], tree, this_probs, last_probs, num_events); diff --git a/vp9/common/vp9_entropymv.h b/vp9/common/vp9_entropymv.h index a10c933f6..3b782ab0a 100644 --- a/vp9/common/vp9_entropymv.h +++ b/vp9/common/vp9_entropymv.h @@ -13,7 +13,7 @@ #define VP9_COMMON_VP9_ENTROPYMV_H_ #include "vp9/common/vp9_treecoder.h" -#include "vpx_config.h" +#include "./vpx_config.h" #include "vp9/common/vp9_blockd.h" struct VP9Common; @@ -73,6 +73,10 @@ extern struct vp9_token vp9_mv_class_encodings[MV_CLASSES]; #define MV_MAX ((1 << MV_MAX_BITS) - 1) #define MV_VALS ((MV_MAX << 1) + 1) +#define MV_IN_USE_BITS 14 +#define MV_UPP ((1 << MV_IN_USE_BITS) - 1) +#define MV_LOW (-(1 << MV_IN_USE_BITS)) + extern const vp9_tree_index vp9_mv_class0_tree[2 * CLASS0_SIZE - 2]; extern struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE]; diff --git a/vp9/common/vp9_filter.c b/vp9/common/vp9_filter.c index 4ac2bc93f..cedd44cad 100644 --- a/vp9/common/vp9_filter.c +++ b/vp9/common/vp9_filter.c @@ -8,12 +8,14 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <assert.h> + #include "vpx_ports/mem.h" #include "vp9/common/vp9_filter.h" -DECLARE_ALIGNED(256, const int16_t, - vp9_bilinear_filters[SUBPEL_SHIFTS][SUBPEL_TAPS]) = { +DECLARE_ALIGNED(256, const subpel_kernel, + vp9_bilinear_filters[SUBPEL_SHIFTS]) = { { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 0, 120, 8, 0, 0, 0 }, { 0, 0, 0, 112, 16, 0, 0, 0 }, @@ -33,8 +35,8 @@ DECLARE_ALIGNED(256, const int16_t, }; // Lagrangian interpolation filter -DECLARE_ALIGNED(256, const int16_t, - vp9_sub_pel_filters_8[SUBPEL_SHIFTS][SUBPEL_TAPS]) = { +DECLARE_ALIGNED(256, const subpel_kernel, + vp9_sub_pel_filters_8[SUBPEL_SHIFTS]) = { { 0, 0, 0, 128, 0, 0, 0, 0}, { 0, 1, -5, 126, 8, -3, 1, 0}, { -1, 3, -10, 122, 18, -6, 2, 0}, @@ -54,8 +56,8 @@ DECLARE_ALIGNED(256, const int16_t, }; // DCT based filter -DECLARE_ALIGNED(256, const int16_t, - vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][SUBPEL_TAPS]) = { +DECLARE_ALIGNED(256, const subpel_kernel, + vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]) = { {0, 0, 0, 128, 0, 0, 0, 0}, {-1, 3, -7, 127, 8, -3, 1, 0}, {-2, 5, -13, 125, 17, -6, 3, -1}, @@ -75,8 +77,8 @@ DECLARE_ALIGNED(256, const int16_t, }; // freqmultiplier = 0.5 -DECLARE_ALIGNED(256, const int16_t, - vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][SUBPEL_TAPS]) = { +DECLARE_ALIGNED(256, const subpel_kernel, + vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]) = { { 0, 0, 0, 128, 0, 0, 0, 0}, {-3, -1, 32, 64, 38, 1, -3, 0}, {-2, -2, 29, 63, 41, 2, -3, 0}, @@ -94,3 +96,20 @@ DECLARE_ALIGNED(256, const int16_t, { 0, -3, 2, 41, 63, 29, -2, -2}, { 0, -3, 1, 38, 64, 32, -1, -3} }; + +const subpel_kernel *vp9_get_filter_kernel(INTERPOLATIONFILTERTYPE type) { + switch (type) { + case EIGHTTAP: + return vp9_sub_pel_filters_8; + case EIGHTTAP_SMOOTH: + return vp9_sub_pel_filters_8lp; + case EIGHTTAP_SHARP: + return vp9_sub_pel_filters_8s; + case BILINEAR: + return vp9_bilinear_filters; + default: + assert(!"Invalid filter type."); + return NULL; + } +} + diff --git a/vp9/common/vp9_filter.h b/vp9/common/vp9_filter.h index 7b1ffaeda..676b274b9 100644 --- a/vp9/common/vp9_filter.h +++ b/vp9/common/vp9_filter.h @@ -11,7 +11,7 @@ #ifndef VP9_COMMON_VP9_FILTER_H_ #define VP9_COMMON_VP9_FILTER_H_ -#include "vpx_config.h" +#include "./vpx_config.h" #include "vpx/vpx_integer.h" #define SUBPEL_BITS 4 @@ -19,11 +19,28 @@ #define SUBPEL_SHIFTS (1 << SUBPEL_BITS) #define SUBPEL_TAPS 8 -extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][SUBPEL_TAPS]; -extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][SUBPEL_TAPS]; -extern const int16_t vp9_sub_pel_filters_8[SUBPEL_SHIFTS][SUBPEL_TAPS]; -extern const int16_t vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][SUBPEL_TAPS]; -extern const int16_t vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][SUBPEL_TAPS]; +typedef enum { + EIGHTTAP = 0, + EIGHTTAP_SMOOTH = 1, + EIGHTTAP_SHARP = 2, + BILINEAR = 3, + SWITCHABLE = 4 /* should be the last one */ +} INTERPOLATIONFILTERTYPE; + +typedef const int16_t subpel_kernel[SUBPEL_TAPS]; + +struct subpix_fn_table { + const subpel_kernel *filter_x; + const subpel_kernel *filter_y; +}; + +const subpel_kernel *vp9_get_filter_kernel(INTERPOLATIONFILTERTYPE type); + +extern const subpel_kernel vp9_bilinear_filters[SUBPEL_SHIFTS]; +extern const subpel_kernel vp9_sub_pel_filters_6[SUBPEL_SHIFTS]; +extern const subpel_kernel vp9_sub_pel_filters_8[SUBPEL_SHIFTS]; +extern const subpel_kernel vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]; +extern const subpel_kernel vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]; // The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear // filter kernel as a 2 tap filter. diff --git a/vp9/common/vp9_findnearmv.c b/vp9/common/vp9_findnearmv.c index 73f6b4c19..b0c0c57ae 100644 --- a/vp9/common/vp9_findnearmv.c +++ b/vp9/common/vp9_findnearmv.c @@ -57,7 +57,7 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, vpx_memcpy(dst_list, mv_list, MAX_MV_REF_CANDIDATES * sizeof(int_mv)); } else if (block_idx == 1 || block_idx == 2) { int dst = 0, n; - union b_mode_info *bmi = mi->bmi; + b_mode_info *bmi = mi->bmi; dst_list[dst++].as_int = bmi[0].as_mv[ref_idx].as_int; for (n = 0; dst < MAX_MV_REF_CANDIDATES && @@ -66,7 +66,7 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, dst_list[dst++].as_int = mv_list[n].as_int; } else { int dst = 0, n; - union b_mode_info *bmi = mi->bmi; + b_mode_info *bmi = mi->bmi; assert(block_idx == 3); dst_list[dst++].as_int = bmi[2].as_mv[ref_idx].as_int; diff --git a/vp9/common/vp9_findnearmv.h b/vp9/common/vp9_findnearmv.h index ad0d882b9..50dfdc7fb 100644 --- a/vp9/common/vp9_findnearmv.h +++ b/vp9/common/vp9_findnearmv.h @@ -55,13 +55,11 @@ static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, if (!mi) return DC_PRED; - if (mi->mbmi.ref_frame[0] != INTRA_FRAME) { + if (is_inter_block(&mi->mbmi)) return DC_PRED; - } else if (mi->mbmi.sb_type < BLOCK_8X8) { - return ((mi->bmi + 1 + b)->as_mode); - } else { - return mi->mbmi.mode; - } + else + return mi->mbmi.sb_type < BLOCK_8X8 ? (mi->bmi + 1 + b)->as_mode + : mi->mbmi.mode; } assert(b == 1 || b == 3); return (mi->bmi + b - 1)->as_mode; @@ -77,13 +75,11 @@ static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, if (!mi) return DC_PRED; - if (mi->mbmi.ref_frame[0] != INTRA_FRAME) { + if (is_inter_block(&mi->mbmi)) return DC_PRED; - } else if (mi->mbmi.sb_type < BLOCK_8X8) { - return ((mi->bmi + 2 + b)->as_mode); - } else { - return mi->mbmi.mode; - } + else + return mi->mbmi.sb_type < BLOCK_8X8 ? (mi->bmi + 2 + b)->as_mode + : mi->mbmi.mode; } return (mi->bmi + b - 2)->as_mode; diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c index f06bf047b..463637007 100644 --- a/vp9/common/vp9_idct.c +++ b/vp9/common/vp9_idct.c @@ -18,7 +18,7 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_idct.h" -void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) { +void vp9_iwht4x4_16_add_c(int16_t *input, uint8_t *dest, int dest_stride) { /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, 0.5 shifts per pixel. */ int i; @@ -70,7 +70,7 @@ void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) { } } -void vp9_short_iwalsh4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) { +void vp9_iwht4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) { int i; int a1, e1; int16_t tmp[4]; @@ -116,7 +116,7 @@ void vp9_idct4_1d_c(int16_t *input, int16_t *output) { output[3] = step[0] - step[3]; } -void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) { +void vp9_idct4x4_16_add_c(int16_t *input, uint8_t *dest, int dest_stride) { int16_t out[4 * 4]; int16_t *outptr = out; int i, j; @@ -140,7 +140,7 @@ void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) { } } -void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) { +void vp9_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) { int i; int a1; int16_t out = dct_const_round_shift(input[0] * cospi_16_64); @@ -420,7 +420,7 @@ void vp9_short_iht8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride, + dest[j * dest_stride + i]); } } -void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest, +void vp9_short_idct8x8_10_add_c(int16_t *input, uint8_t *dest, int dest_stride) { int16_t out[8 * 8] = { 0 }; int16_t *outptr = out; @@ -838,7 +838,7 @@ void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride, + dest[j * dest_stride + i]); } } -void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest, +void vp9_short_idct16x16_10_add_c(int16_t *input, uint8_t *dest, int dest_stride) { int16_t out[16 * 16] = { 0 }; int16_t *outptr = out; @@ -1269,8 +1269,107 @@ void vp9_short_idct32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride) { } } -void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output) { +void vp9_short_idct32x32_1_add_c(int16_t *input, uint8_t *dest, + int dest_stride) { + int i, j; + int a1; + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); out = dct_const_round_shift(out * cospi_16_64); - output[0] = ROUND_POWER_OF_TWO(out, 6); + a1 = ROUND_POWER_OF_TWO(out, 6); + + for (j = 0; j < 32; ++j) { + for (i = 0; i < 32; ++i) + dest[i] = clip_pixel(dest[i] + a1); + dest += dest_stride; + } +} + +// idct +void vp9_idct4x4_add(int16_t *input, uint8_t *dest, int stride, int eob) { + if (eob > 1) + vp9_idct4x4_16_add(input, dest, stride); + else + vp9_idct4x4_1_add(input, dest, stride); +} + + +void vp9_iwht4x4_add(int16_t *input, uint8_t *dest, int stride, int eob) { + if (eob > 1) + vp9_iwht4x4_16_add(input, dest, stride); + else + vp9_iwht4x4_1_add(input, dest, stride); +} + +void vp9_idct_add_8x8(int16_t *input, uint8_t *dest, int stride, int eob) { + // If dc is 1, then input[0] is the reconstructed value, do not need + // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. + + // The calculation can be simplified if there are not many non-zero dct + // coefficients. Use eobs to decide what to do. + // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c. + // Combine that with code here. + if (eob) { + if (eob == 1) + // DC only DCT coefficient + vp9_short_idct8x8_1_add(input, dest, stride); + else if (eob <= 10) + vp9_short_idct8x8_10_add(input, dest, stride); + else + vp9_short_idct8x8_add(input, dest, stride); + } +} + +void vp9_idct_add_16x16(int16_t *input, uint8_t *dest, int stride, int eob) { + /* The calculation can be simplified if there are not many non-zero dct + * coefficients. Use eobs to separate different cases. */ + if (eob) { + if (eob == 1) + /* DC only DCT coefficient. */ + vp9_short_idct16x16_1_add(input, dest, stride); + else if (eob <= 10) + vp9_short_idct16x16_10_add(input, dest, stride); + else + vp9_short_idct16x16_add(input, dest, stride); + } +} + +void vp9_idct_add_32x32(int16_t *input, uint8_t *dest, int stride, int eob) { + if (eob) { + if (eob == 1) + vp9_short_idct32x32_1_add(input, dest, stride); + else + vp9_short_idct32x32_add(input, dest, stride); + } +} + +// iht +void vp9_iht_add(TX_TYPE tx_type, int16_t *input, uint8_t *dest, int stride, + int eob) { + if (tx_type == DCT_DCT) + vp9_idct4x4_add(input, dest, stride, eob); + else + vp9_short_iht4x4_add(input, dest, stride, tx_type); +} + +void vp9_iht_add_8x8(TX_TYPE tx_type, int16_t *input, uint8_t *dest, + int stride, int eob) { + if (tx_type == DCT_DCT) { + vp9_idct_add_8x8(input, dest, stride, eob); + } else { + if (eob > 0) { + vp9_short_iht8x8_add(input, dest, stride, tx_type); + } + } +} + +void vp9_iht_add_16x16(TX_TYPE tx_type, int16_t *input, uint8_t *dest, + int stride, int eob) { + if (tx_type == DCT_DCT) { + vp9_idct_add_16x16(input, dest, stride, eob); + } else { + if (eob > 0) { + vp9_short_iht16x16_add(input, dest, stride, tx_type); + } + } } diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h index 59892cd03..a15b6d36f 100644 --- a/vp9/common/vp9_idct.h +++ b/vp9/common/vp9_idct.h @@ -16,6 +16,7 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" #include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_enums.h" // Constants and Macros used by all idct/dct functions @@ -86,4 +87,21 @@ typedef struct { transform_1d cols, rows; // vertical and horizontal } transform_2d; + +void vp9_idct4x4_add(int16_t *input, uint8_t *dest, int stride, int eob); +void vp9_iwht4x4_add(int16_t *input, uint8_t *dest, int stride, int eob); +void vp9_idct_add_8x8(int16_t *input, uint8_t *dest, int stride, int eob); +void vp9_idct_add_16x16(int16_t *input, uint8_t *dest, int stride, int eob); +void vp9_idct_add_32x32(int16_t *input, uint8_t *dest, int stride, int eob); + +void vp9_iht_add(TX_TYPE tx_type, int16_t *input, uint8_t *dest, + int stride, int eob); + +void vp9_iht_add_8x8(TX_TYPE tx_type, int16_t *input, uint8_t *dest, + int stride, int eob); + +void vp9_iht_add_16x16(TX_TYPE tx_type, int16_t *input, uint8_t *dest, + int stride, int eob); + + #endif // VP9_COMMON_VP9_IDCT_H_ diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c index 6e425e8fb..85ac6d2bf 100644 --- a/vp9/common/vp9_loopfilter.c +++ b/vp9/common/vp9_loopfilter.c @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "vpx_config.h" +#include "./vpx_config.h" #include "vp9/common/vp9_loopfilter.h" #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_reconinter.h" diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h index 91d40ac97..c698090d8 100644 --- a/vp9/common/vp9_loopfilter.h +++ b/vp9/common/vp9_loopfilter.h @@ -12,7 +12,7 @@ #define VP9_COMMON_VP9_LOOPFILTER_H_ #include "vpx_ports/mem.h" -#include "vpx_config.h" +#include "./vpx_config.h" #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_seg_common.h" diff --git a/vp9/common/vp9_loopfilter_filters.c b/vp9/common/vp9_loopfilter_filters.c index 88130d801..2c4bf6cb2 100644 --- a/vp9/common/vp9_loopfilter_filters.c +++ b/vp9/common/vp9_loopfilter_filters.c @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "vpx_config.h" +#include "./vpx_config.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_loopfilter.h" #include "vp9/common/vp9_onyxc_int.h" diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c index a444b8555..659079639 100644 --- a/vp9/common/vp9_mvref_common.c +++ b/vp9/common/vp9_mvref_common.c @@ -108,7 +108,7 @@ static const int idx_n_column_to_subblock[4][2] = { }; // clamp_mv_ref -#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units +#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units static void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) { clamp_mv(mv, xd->mb_to_left_edge - MV_BORDER, diff --git a/vp9/common/vp9_onyx.h b/vp9/common/vp9_onyx.h index f424e6a12..acb4724e5 100644 --- a/vp9/common/vp9_onyx.h +++ b/vp9/common/vp9_onyx.h @@ -13,7 +13,7 @@ #ifdef __cplusplus extern "C" -{ +{ // NOLINT #endif #include "./vpx_config.h" @@ -33,7 +33,6 @@ extern "C" FOURFIVE = 1, THREEFIVE = 2, ONETWO = 3 - } VPX_SCALING; typedef enum { @@ -71,42 +70,48 @@ extern "C" // 3 - lowest quality/fastest decode int width; // width of data passed to the compressor int height; // height of data passed to the compressor - double framerate; // set to passed in framerate - int64_t target_bandwidth; // bandwidth to be used in kilobits per second + double framerate; // set to passed in framerate + int64_t target_bandwidth; // bandwidth to be used in kilobits per second - int noise_sensitivity; // parameter used for applying pre processing blur: recommendation 0 - int Sharpness; // parameter used for sharpening output: recommendation 0: + int noise_sensitivity; // pre processing blur: recommendation 0 + int Sharpness; // sharpening output: recommendation 0: int cpu_used; unsigned int rc_max_intra_bitrate_pct; // mode -> - // (0)=Realtime/Live Encoding. This mode is optimized for realtim encoding (for example, capturing - // a television signal or feed from a live camera). ( speed setting controls how fast ) - // (1)=Good Quality Fast Encoding. The encoder balances quality with the amount of time it takes to - // encode the output. ( speed setting controls how fast ) - // (2)=One Pass - Best Quality. The encoder places priority on the quality of the output over encoding - // speed. The output is compressed at the highest possible quality. This option takes the longest - // amount of time to encode. ( speed setting ignored ) - // (3)=Two Pass - First Pass. The encoder generates a file of statistics for use in the second encoding - // pass. ( speed setting controls how fast ) - // (4)=Two Pass - Second Pass. The encoder uses the statistics that were generated in the first encoding - // pass to create the compressed output. ( speed setting controls how fast ) - // (5)=Two Pass - Second Pass Best. The encoder uses the statistics that were generated in the first - // encoding pass to create the compressed output using the highest possible quality, and taking a + // (0)=Realtime/Live Encoding. This mode is optimized for realtime + // encoding (for example, capturing a television signal or feed from + // a live camera). ( speed setting controls how fast ) + // (1)=Good Quality Fast Encoding. The encoder balances quality with the + // amount of time it takes to encode the output. ( speed setting + // controls how fast ) + // (2)=One Pass - Best Quality. The encoder places priority on the + // quality of the output over encoding speed. The output is compressed + // at the highest possible quality. This option takes the longest + // amount of time to encode. ( speed setting ignored ) + // (3)=Two Pass - First Pass. The encoder generates a file of statistics + // for use in the second encoding pass. ( speed setting controls how + // fast ) + // (4)=Two Pass - Second Pass. The encoder uses the statistics that were + // generated in the first encoding pass to create the compressed + // output. ( speed setting controls how fast ) + // (5)=Two Pass - Second Pass Best. The encoder uses the statistics that + // were generated in the first encoding pass to create the compressed + // output using the highest possible quality, and taking a // longer amount of time to encode.. ( speed setting ignored ) - int Mode; // + int Mode; // Key Framing Operations - int auto_key; // automatically detect cut scenes and set the keyframes - int key_freq; // maximum distance to key frame. + int auto_key; // autodetect cut scenes and set the keyframes + int key_freq; // maximum distance to key frame. - int allow_lag; // allow lagged compression (if 0 lagin frames is ignored) - int lag_in_frames; // how many frames lag before we start encoding + int allow_lag; // allow lagged compression (if 0 lagin frames is ignored) + int lag_in_frames; // how many frames lag before we start encoding // ---------------------------------------------------------------- // DATARATE CONTROL OPTIONS - int end_usage; // vbr or cbr + int end_usage; // vbr or cbr // buffer targeting aggressiveness int under_shoot_pct; @@ -138,7 +143,7 @@ extern "C" int play_alternate; int alt_freq; - int encode_breakout; // early breakout encode threshold : for video conf recommend 800 + int encode_breakout; // early breakout : for video conf recommend 800 /* Bitfield defining the error resiliency features to enable. * Can provide decodable frames after losses in previous @@ -173,8 +178,8 @@ extern "C" void vp9_change_config(VP9_PTR onyx, VP9_CONFIG *oxcf); -// receive a frames worth of data caller can assume that a copy of this frame is made -// and not just a copy of the pointer.. + // receive a frames worth of data. caller can assume that a copy of this + // frame is made and not just a copy of the pointer.. int vp9_receive_raw_frame(VP9_PTR comp, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time_stamp); diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h index 44948ff4d..953764c85 100644 --- a/vp9/common/vp9_onyxc_int.h +++ b/vp9/common/vp9_onyxc_int.h @@ -11,9 +11,9 @@ #ifndef VP9_COMMON_VP9_ONYXC_INT_H_ #define VP9_COMMON_VP9_ONYXC_INT_H_ -#include "vpx_config.h" +#include "./vpx_config.h" #include "vpx/internal/vpx_codec_internal.h" -#include "vp9_rtcd.h" +#include "./vp9_rtcd.h" #include "vp9/common/vp9_loopfilter.h" #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_entropy.h" @@ -120,7 +120,7 @@ typedef struct VP9Common { YV12_BUFFER_CONFIG post_proc_buffer; - FRAME_TYPE last_frame_type; /* Save last frame's frame type for motion search. */ + FRAME_TYPE last_frame_type; /* last frame's frame type for motion search.*/ FRAME_TYPE frame_type; int show_frame; @@ -291,10 +291,6 @@ static void set_mi_row_col(VP9_COMMON *cm, MACROBLOCKD *xd, xd->right_available = (mi_col + bw < cm->cur_tile_mi_col_end); } -static int get_token_alloc(int mb_rows, int mb_cols) { - return mb_rows * mb_cols * (48 * 16 + 4); -} - static void set_prev_mi(VP9_COMMON *cm) { const int use_prev_in_find_mv_refs = cm->width == cm->last_width && cm->height == cm->last_height && diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c index 955e6766a..212a28ab9 100644 --- a/vp9/common/vp9_postproc.c +++ b/vp9/common/vp9_postproc.c @@ -8,6 +8,9 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <math.h> +#include <stdlib.h> +#include <stdio.h> #include "./vpx_config.h" #include "vpx_scale/yv12config.h" @@ -18,11 +21,6 @@ #include "./vp9_rtcd.h" #include "./vpx_scale_rtcd.h" - -#include <math.h> -#include <stdlib.h> -#include <stdio.h> - #define RGB_TO_YUV(t) \ ( (0.257*(float)(t >> 16)) + (0.504*(float)(t >> 8 & 0xff)) + \ (0.098*(float)(t & 0xff)) + 16), \ @@ -155,7 +153,6 @@ void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr, p_dst = dst_ptr; for (col = 0; col < cols; col++) { - int kernel = 4; int v = p_src[col]; @@ -257,7 +254,7 @@ void vp9_mbpost_proc_across_ip_c(uint8_t *src, int pitch, void vp9_mbpost_proc_down_c(uint8_t *dst, int pitch, int rows, int cols, int flimit) { int r, c, i; - const short *rv3 = &vp9_rv[63 & rand()]; + const short *rv3 = &vp9_rv[63 & rand()]; // NOLINT for (c = 0; c < cols; c++) { uint8_t *s = &dst[c]; @@ -408,7 +405,6 @@ static void fillrd(struct postproc_state *state, int q, int a) { next = next + j; } - } for (; next < 256; next++) @@ -416,7 +412,7 @@ static void fillrd(struct postproc_state *state, int q, int a) { } for (i = 0; i < 3072; i++) { - state->noise[i] = char_dist[rand() & 0xff]; + state->noise[i] = char_dist[rand() & 0xff]; // NOLINT } for (i = 0; i < 16; i++) { @@ -680,13 +676,14 @@ int vp9_post_proc_frame(struct VP9Common *cm, #if 0 && CONFIG_POSTPROC_VISUALIZER if (flags & VP9D_DEBUG_TXT_FRAME_INFO) { char message[512]; - sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d", - (cm->frame_type == KEY_FRAME), - cm->refresh_golden_frame, - cm->base_qindex, - cm->filter_level, - flags, - cm->mb_cols, cm->mb_rows); + snprintf(message, sizeof(message) -1, + "F%1dG%1dQ%3dF%3dP%d_s%dx%d", + (cm->frame_type == KEY_FRAME), + cm->refresh_golden_frame, + cm->base_qindex, + cm->filter_level, + flags, + cm->mb_cols, cm->mb_rows); vp9_blit_text(message, cm->post_proc_buffer.y_buffer, cm->post_proc_buffer.y_stride); } @@ -707,7 +704,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, for (j = 0; j < mb_cols; j++) { char zz[4]; - sprintf(zz, "%c", mi[mb_index].mbmi.mode + 'a'); + snprintf(zz, sizeof(zz) - 1, "%c", mi[mb_index].mbmi.mode + 'a'); vp9_blit_text(zz, y_ptr, post->y_stride); mb_index++; @@ -716,7 +713,6 @@ int vp9_post_proc_frame(struct VP9Common *cm, mb_index++; /* border */ y_ptr += post->y_stride * 16 - post->y_width; - } } @@ -740,9 +736,9 @@ int vp9_post_proc_frame(struct VP9Common *cm, mi[mb_index].mbmi.skip_coeff); if (cm->frame_type == KEY_FRAME) - sprintf(zz, "a"); + snprintf(zz, sizeof(zz) - 1, "a"); else - sprintf(zz, "%c", dc_diff + '0'); + snprintf(zz, sizeof(zz) - 1, "%c", dc_diff + '0'); vp9_blit_text(zz, y_ptr, post->y_stride); mb_index++; @@ -751,7 +747,6 @@ int vp9_post_proc_frame(struct VP9Common *cm, mb_index++; /* border */ y_ptr += post->y_stride * 16 - post->y_width; - } } @@ -894,8 +889,9 @@ int vp9_post_proc_frame(struct VP9Common *cm, constrain_line(lx0, &x1, ly0 + 1, &y1, width, height); vp9_blit_line(lx0, x1, ly0 + 1, y1, y_buffer, y_stride); - } else + } else { vp9_blit_line(lx0, x1, ly0, y1, y_buffer, y_stride); + } } mi++; diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c index 81fbf1f26..e89683150 100644 --- a/vp9/common/vp9_pred_common.c +++ b/vp9/common/vp9_pred_common.c @@ -392,11 +392,6 @@ void vp9_set_pred_flag_seg_id(MACROBLOCKD *xd, uint8_t pred_flag) { xd->this_mi->mbmi.seg_id_predicted = pred_flag; } -void vp9_set_pred_flag_mbskip(MACROBLOCKD *xd, BLOCK_SIZE bsize, - uint8_t pred_flag) { - xd->this_mi->mbmi.skip_coeff = pred_flag; -} - int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids, BLOCK_SIZE bsize, int mi_row, int mi_col) { const int mi_offset = mi_row * cm->mi_cols + mi_col; diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h index 47ca8abd8..9230c4531 100644 --- a/vp9/common/vp9_pred_common.h +++ b/vp9/common/vp9_pred_common.h @@ -52,9 +52,6 @@ static INLINE unsigned char vp9_get_pred_flag_mbskip(const MACROBLOCKD *xd) { return xd->this_mi->mbmi.skip_coeff; } -void vp9_set_pred_flag_mbskip(MACROBLOCKD *xd, BLOCK_SIZE bsize, - uint8_t pred_flag); - unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd); unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd); @@ -69,8 +66,9 @@ unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm, const MACROBLOCKD *xd); -static INLINE vp9_prob vp9_get_pred_prob_comp_inter_inter(const VP9_COMMON *cm, - const MACROBLOCKD *xd) { +static INLINE +vp9_prob vp9_get_pred_prob_comp_inter_inter(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { const int pred_context = vp9_get_pred_context_comp_inter_inter(cm, xd); return cm->fc.comp_inter_prob[pred_context]; } diff --git a/vp9/common/vp9_quant_common.c b/vp9/common/vp9_quant_common.c index bc40854a3..6dbdb4216 100644 --- a/vp9/common/vp9_quant_common.c +++ b/vp9/common/vp9_quant_common.c @@ -14,69 +14,69 @@ #if 1 static const int16_t dc_qlookup[QINDEX_RANGE] = { - 4, 8, 8, 9, 10, 11, 12, 12, - 13, 14, 15, 16, 17, 18, 19, 19, - 20, 21, 22, 23, 24, 25, 26, 26, - 27, 28, 29, 30, 31, 32, 32, 33, - 34, 35, 36, 37, 38, 38, 39, 40, - 41, 42, 43, 43, 44, 45, 46, 47, - 48, 48, 49, 50, 51, 52, 53, 53, - 54, 55, 56, 57, 57, 58, 59, 60, - 61, 62, 62, 63, 64, 65, 66, 66, - 67, 68, 69, 70, 70, 71, 72, 73, - 74, 74, 75, 76, 77, 78, 78, 79, - 80, 81, 81, 82, 83, 84, 85, 85, - 87, 88, 90, 92, 93, 95, 96, 98, - 99, 101, 102, 104, 105, 107, 108, 110, - 111, 113, 114, 116, 117, 118, 120, 121, - 123, 125, 127, 129, 131, 134, 136, 138, - 140, 142, 144, 146, 148, 150, 152, 154, - 156, 158, 161, 164, 166, 169, 172, 174, - 177, 180, 182, 185, 187, 190, 192, 195, - 199, 202, 205, 208, 211, 214, 217, 220, - 223, 226, 230, 233, 237, 240, 243, 247, - 250, 253, 257, 261, 265, 269, 272, 276, - 280, 284, 288, 292, 296, 300, 304, 309, - 313, 317, 322, 326, 330, 335, 340, 344, - 349, 354, 359, 364, 369, 374, 379, 384, - 389, 395, 400, 406, 411, 417, 423, 429, - 435, 441, 447, 454, 461, 467, 475, 482, - 489, 497, 505, 513, 522, 530, 539, 549, - 559, 569, 579, 590, 602, 614, 626, 640, - 654, 668, 684, 700, 717, 736, 755, 775, - 796, 819, 843, 869, 896, 925, 955, 988, + 4, 8, 8, 9, 10, 11, 12, 12, + 13, 14, 15, 16, 17, 18, 19, 19, + 20, 21, 22, 23, 24, 25, 26, 26, + 27, 28, 29, 30, 31, 32, 32, 33, + 34, 35, 36, 37, 38, 38, 39, 40, + 41, 42, 43, 43, 44, 45, 46, 47, + 48, 48, 49, 50, 51, 52, 53, 53, + 54, 55, 56, 57, 57, 58, 59, 60, + 61, 62, 62, 63, 64, 65, 66, 66, + 67, 68, 69, 70, 70, 71, 72, 73, + 74, 74, 75, 76, 77, 78, 78, 79, + 80, 81, 81, 82, 83, 84, 85, 85, + 87, 88, 90, 92, 93, 95, 96, 98, + 99, 101, 102, 104, 105, 107, 108, 110, + 111, 113, 114, 116, 117, 118, 120, 121, + 123, 125, 127, 129, 131, 134, 136, 138, + 140, 142, 144, 146, 148, 150, 152, 154, + 156, 158, 161, 164, 166, 169, 172, 174, + 177, 180, 182, 185, 187, 190, 192, 195, + 199, 202, 205, 208, 211, 214, 217, 220, + 223, 226, 230, 233, 237, 240, 243, 247, + 250, 253, 257, 261, 265, 269, 272, 276, + 280, 284, 288, 292, 296, 300, 304, 309, + 313, 317, 322, 326, 330, 335, 340, 344, + 349, 354, 359, 364, 369, 374, 379, 384, + 389, 395, 400, 406, 411, 417, 423, 429, + 435, 441, 447, 454, 461, 467, 475, 482, + 489, 497, 505, 513, 522, 530, 539, 549, + 559, 569, 579, 590, 602, 614, 626, 640, + 654, 668, 684, 700, 717, 736, 755, 775, + 796, 819, 843, 869, 896, 925, 955, 988, 1022, 1058, 1098, 1139, 1184, 1232, 1282, 1336, }; static const int16_t ac_qlookup[QINDEX_RANGE] = { - 4, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, - 23, 24, 25, 26, 27, 28, 29, 30, - 31, 32, 33, 34, 35, 36, 37, 38, - 39, 40, 41, 42, 43, 44, 45, 46, - 47, 48, 49, 50, 51, 52, 53, 54, - 55, 56, 57, 58, 59, 60, 61, 62, - 63, 64, 65, 66, 67, 68, 69, 70, - 71, 72, 73, 74, 75, 76, 77, 78, - 79, 80, 81, 82, 83, 84, 85, 86, - 87, 88, 89, 90, 91, 92, 93, 94, - 95, 96, 97, 98, 99, 100, 101, 102, - 104, 106, 108, 110, 112, 114, 116, 118, - 120, 122, 124, 126, 128, 130, 132, 134, - 136, 138, 140, 142, 144, 146, 148, 150, - 152, 155, 158, 161, 164, 167, 170, 173, - 176, 179, 182, 185, 188, 191, 194, 197, - 200, 203, 207, 211, 215, 219, 223, 227, - 231, 235, 239, 243, 247, 251, 255, 260, - 265, 270, 275, 280, 285, 290, 295, 300, - 305, 311, 317, 323, 329, 335, 341, 347, - 353, 359, 366, 373, 380, 387, 394, 401, - 408, 416, 424, 432, 440, 448, 456, 465, - 474, 483, 492, 501, 510, 520, 530, 540, - 550, 560, 571, 582, 593, 604, 615, 627, - 639, 651, 663, 676, 689, 702, 715, 729, - 743, 757, 771, 786, 801, 816, 832, 848, - 864, 881, 898, 915, 933, 951, 969, 988, + 4, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, + 39, 40, 41, 42, 43, 44, 45, 46, + 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, + 63, 64, 65, 66, 67, 68, 69, 70, + 71, 72, 73, 74, 75, 76, 77, 78, + 79, 80, 81, 82, 83, 84, 85, 86, + 87, 88, 89, 90, 91, 92, 93, 94, + 95, 96, 97, 98, 99, 100, 101, 102, + 104, 106, 108, 110, 112, 114, 116, 118, + 120, 122, 124, 126, 128, 130, 132, 134, + 136, 138, 140, 142, 144, 146, 148, 150, + 152, 155, 158, 161, 164, 167, 170, 173, + 176, 179, 182, 185, 188, 191, 194, 197, + 200, 203, 207, 211, 215, 219, 223, 227, + 231, 235, 239, 243, 247, 251, 255, 260, + 265, 270, 275, 280, 285, 290, 295, 300, + 305, 311, 317, 323, 329, 335, 341, 347, + 353, 359, 366, 373, 380, 387, 394, 401, + 408, 416, 424, 432, 440, 448, 456, 465, + 474, 483, 492, 501, 510, 520, 530, 540, + 550, 560, 571, 582, 593, 604, 615, 627, + 639, 651, 663, 676, 689, 702, 715, 729, + 743, 757, 771, 786, 801, 816, 832, 848, + 864, 881, 898, 915, 933, 951, 969, 988, 1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151, 1173, 1196, 1219, 1243, 1267, 1292, 1317, 1343, 1369, 1396, 1423, 1451, 1479, 1508, 1537, 1567, diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c index 0f2e4e999..b3b9e1d8a 100644 --- a/vp9/common/vp9_reconinter.c +++ b/vp9/common/vp9_reconinter.c @@ -20,34 +20,23 @@ #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" - void vp9_setup_interp_filters(MACROBLOCKD *xd, INTERPOLATIONFILTERTYPE mcomp_filter_type, VP9_COMMON *cm) { if (xd->mi_8x8 && xd->this_mi) { - MB_MODE_INFO * mbmi = &xd->this_mi->mbmi; + MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; - set_scale_factors(xd, mbmi->ref_frame[0] - 1, mbmi->ref_frame[1] - 1, - cm->active_ref_scale); + set_scale_factors(xd, mbmi->ref_frame[0] - LAST_FRAME, + mbmi->ref_frame[1] - LAST_FRAME, + cm->active_ref_scale); } else { set_scale_factors(xd, -1, -1, cm->active_ref_scale); } - switch (mcomp_filter_type) { - case EIGHTTAP: - case SWITCHABLE: - xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8; - break; - case EIGHTTAP_SMOOTH: - xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8lp; - break; - case EIGHTTAP_SHARP: - xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8s; - break; - case BILINEAR: - xd->subpix.filter_x = xd->subpix.filter_y = vp9_bilinear_filters; - break; - } + xd->subpix.filter_x = xd->subpix.filter_y = + vp9_get_filter_kernel(mcomp_filter_type == SWITCHABLE ? + EIGHTTAP : mcomp_filter_type); + assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0); } @@ -132,7 +121,7 @@ static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize, const int x = 4 * (block & ((1 << bwl) - 1)); const int y = 4 * (block >> bwl); const MODE_INFO *mi = xd->this_mi; - const int use_second_ref = mi->mbmi.ref_frame[1] > 0; + const int is_compound = has_second_ref(&mi->mbmi); int ref; assert(x < bw); @@ -140,7 +129,7 @@ static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize, assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_w == bw); assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_h == bh); - for (ref = 0; ref < 1 + use_second_ref; ++ref) { + for (ref = 0; ref < 1 + is_compound; ++ref) { struct scale_factors *const scale = &xd->scale_factor[ref]; struct buf_2d *const pre_buf = &pd->pre[ref]; struct buf_2d *const dst_buf = &pd->dst; diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c index 4a451b909..bd609dcf0 100644 --- a/vp9/common/vp9_reconintra.c +++ b/vp9/common/vp9_reconintra.c @@ -13,7 +13,7 @@ #include "vpx_mem/vpx_mem.h" #include "vpx_ports/vpx_once.h" -#include "vp9_rtcd.h" +#include "./vp9_rtcd.h" #include "vp9/common/vp9_reconintra.h" #include "vp9/common/vp9_onyxc_int.h" diff --git a/vp9/common/vp9_rtcd.c b/vp9/common/vp9_rtcd.c index 72613ae07..dc15a84ff 100644 --- a/vp9/common/vp9_rtcd.c +++ b/vp9/common/vp9_rtcd.c @@ -7,9 +7,9 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#include "vpx_config.h" +#include "./vpx_config.h" #define RTCD_C -#include "vp9_rtcd.h" +#include "./vp9_rtcd.h" #include "vpx_ports/vpx_once.h" void vpx_scale_rtcd(void); diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 864a84095..67dced210 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -28,22 +28,6 @@ forward_decls vp9_common_forward_decls [ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 && ssse3_x86_64=ssse3 # -# Dequant -# - -prototype void vp9_idct_add_16x16 "int16_t *input, uint8_t *dest, int stride, int eob" -specialize vp9_idct_add_16x16 - -prototype void vp9_idct_add_8x8 "int16_t *input, uint8_t *dest, int stride, int eob" -specialize vp9_idct_add_8x8 - -prototype void vp9_idct_add "int16_t *input, uint8_t *dest, int stride, int eob" -specialize vp9_idct_add - -prototype void vp9_idct_add_32x32 "int16_t *q, uint8_t *dst, int stride, int eob" -specialize vp9_idct_add_32x32 - -# # RECON # prototype void vp9_d207_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" @@ -202,17 +186,6 @@ specialize vp9_dc_left_predictor_32x32 prototype void vp9_dc_128_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_dc_128_predictor_32x32 -if [ "$CONFIG_VP9_DECODER" = "yes" ]; then -prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride" -specialize vp9_add_constant_residual_8x8 sse2 neon - -prototype void vp9_add_constant_residual_16x16 "const int16_t diff, uint8_t *dest, int stride" -specialize vp9_add_constant_residual_16x16 sse2 neon - -prototype void vp9_add_constant_residual_32x32 "const int16_t diff, uint8_t *dest, int stride" -specialize vp9_add_constant_residual_32x32 sse2 neon -fi - # # Loopfilter # @@ -268,37 +241,37 @@ specialize vp9_blend_b # Sub Pixel Filters # prototype void vp9_convolve_copy "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve_copy $sse2_x86inc neon +specialize vp9_convolve_copy $sse2_x86inc neon dspr2 prototype void vp9_convolve_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve_avg $sse2_x86inc neon +specialize vp9_convolve_avg $sse2_x86inc neon dspr2 prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8 ssse3 neon +specialize vp9_convolve8 ssse3 neon dspr2 prototype void vp9_convolve8_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_horiz ssse3 neon +specialize vp9_convolve8_horiz ssse3 neon dspr2 prototype void vp9_convolve8_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_vert ssse3 neon +specialize vp9_convolve8_vert ssse3 neon dspr2 prototype void vp9_convolve8_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_avg ssse3 neon +specialize vp9_convolve8_avg ssse3 neon dspr2 prototype void vp9_convolve8_avg_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_avg_horiz ssse3 neon +specialize vp9_convolve8_avg_horiz ssse3 neon dspr2 prototype void vp9_convolve8_avg_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_avg_vert ssse3 neon +specialize vp9_convolve8_avg_vert ssse3 neon dspr2 # # dct # -prototype void vp9_short_idct4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct4x4_1_add sse2 neon +prototype void vp9_idct4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_idct4x4_1_add sse2 neon -prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct4x4_add sse2 neon +prototype void vp9_idct4x4_16_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_idct4x4_16_add sse2 neon prototype void vp9_short_idct8x8_1_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct8x8_1_add sse2 neon @@ -306,8 +279,8 @@ specialize vp9_short_idct8x8_1_add sse2 neon prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct8x8_add sse2 neon -prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct10_8x8_add sse2 neon +prototype void vp9_short_idct8x8_10_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_short_idct8x8_10_add sse2 neon prototype void vp9_short_idct16x16_1_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct16x16_1_add sse2 neon @@ -315,14 +288,14 @@ specialize vp9_short_idct16x16_1_add sse2 neon prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct16x16_add sse2 neon -prototype void vp9_short_idct10_16x16_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct10_16x16_add sse2 neon +prototype void vp9_short_idct16x16_10_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_short_idct16x16_10_add sse2 neon prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct32x32_add sse2 neon -prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output" -specialize vp9_short_idct1_32x32 +prototype void vp9_short_idct32x32_1_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_short_idct32x32_1_add sse2 prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type" specialize vp9_short_iht4x4_add sse2 neon @@ -337,11 +310,11 @@ prototype void vp9_idct4_1d "int16_t *input, int16_t *output" specialize vp9_idct4_1d sse2 # dct and add -prototype void vp9_short_iwalsh4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_iwalsh4x4_1_add +prototype void vp9_iwht4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_iwht4x4_1_add -prototype void vp9_short_iwalsh4x4_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_iwalsh4x4_add +prototype void vp9_iwht4x4_16_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_iwht4x4_16_add # # Encoder functions below this point. diff --git a/vp9/common/vp9_scale.h b/vp9/common/vp9_scale.h index 7a720d035..ece011477 100644 --- a/vp9/common/vp9_scale.h +++ b/vp9/common/vp9_scale.h @@ -48,4 +48,4 @@ static int vp9_is_scaled(const struct scale_factors *sf) { sf->y_scale_fp != REF_NO_SCALE; } -#endif // VP9_COMMON_VP9_SCALE_H_ +#endif // VP9_COMMON_VP9_SCALE_H_ diff --git a/vp9/common/vp9_subpelvar.h b/vp9/common/vp9_subpelvar.h deleted file mode 100644 index fe75481f6..000000000 --- a/vp9/common/vp9_subpelvar.h +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VP9_COMMON_VP9_SUBPELVAR_H_ -#define VP9_COMMON_VP9_SUBPELVAR_H_ - -#include "vp9/common/vp9_common.h" -#include "vp9/common/vp9_convolve.h" - -static void variance(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - int w, - int h, - unsigned int *sse, - int *sum) { - int i, j; - int diff; - - *sum = 0; - *sse = 0; - - for (i = 0; i < h; i++) { - for (j = 0; j < w; j++) { - diff = src_ptr[j] - ref_ptr[j]; - *sum += diff; - *sse += diff * diff; - } - - src_ptr += source_stride; - ref_ptr += recon_stride; - } -} - -/**************************************************************************** - * - * ROUTINE : filter_block2d_bil_first_pass - * - * INPUTS : uint8_t *src_ptr : Pointer to source block. - * uint32_t src_pixels_per_line : Stride of input block. - * uint32_t pixel_step : Offset between filter input samples (see notes). - * uint32_t output_height : Input block height. - * uint32_t output_width : Input block width. - * int32_t *vp9_filter : Array of 2 bi-linear filter taps. - * - * OUTPUTS : int32_t *output_ptr : Pointer to filtered block. - * - * RETURNS : void - * - * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in - * either horizontal or vertical direction to produce the - * filtered output block. Used to implement first-pass - * of 2-D separable filter. - * - * SPECIAL NOTES : Produces int32_t output to retain precision for next pass. - * Two filter taps should sum to VP9_FILTER_WEIGHT. - * pixel_step defines whether the filter is applied - * horizontally (pixel_step=1) or vertically (pixel_step=stride). - * It defines the offset required to move from one input - * to the next. - * - ****************************************************************************/ -static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr, - uint16_t *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - unsigned int output_width, - const int16_t *vp9_filter) { - unsigned int i, j; - - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] + - (int)src_ptr[pixel_step] * vp9_filter[1], - FILTER_BITS); - - src_ptr++; - } - - // Next row... - src_ptr += src_pixels_per_line - output_width; - output_ptr += output_width; - } -} - -/**************************************************************************** - * - * ROUTINE : filter_block2d_bil_second_pass - * - * INPUTS : int32_t *src_ptr : Pointer to source block. - * uint32_t src_pixels_per_line : Stride of input block. - * uint32_t pixel_step : Offset between filter input samples (see notes). - * uint32_t output_height : Input block height. - * uint32_t output_width : Input block width. - * int32_t *vp9_filter : Array of 2 bi-linear filter taps. - * - * OUTPUTS : uint16_t *output_ptr : Pointer to filtered block. - * - * RETURNS : void - * - * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in - * either horizontal or vertical direction to produce the - * filtered output block. Used to implement second-pass - * of 2-D separable filter. - * - * SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass. - * Two filter taps should sum to VP9_FILTER_WEIGHT. - * pixel_step defines whether the filter is applied - * horizontally (pixel_step=1) or vertically (pixel_step=stride). - * It defines the offset required to move from one input - * to the next. - * - ****************************************************************************/ -static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr, - uint8_t *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const int16_t *vp9_filter) { - unsigned int i, j; - - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] + - (int)src_ptr[pixel_step] * vp9_filter[1], - FILTER_BITS); - src_ptr++; - } - - src_ptr += src_pixels_per_line - output_width; - output_ptr += output_width; - } -} - -#endif // VP9_COMMON_VP9_SUBPELVAR_H_ diff --git a/vp9/common/vp9_treecoder.c b/vp9/common/vp9_treecoder.c index 2e21a5b30..da1213d71 100644 --- a/vp9/common/vp9_treecoder.c +++ b/vp9/common/vp9_treecoder.c @@ -25,8 +25,9 @@ static void tree2tok(struct vp9_token *const p, vp9_tree t, if (j <= 0) { p[-j].value = v; p[-j].len = l; - } else + } else { tree2tok(p, t, j, v, l); + } } while (++v & 1); } @@ -65,11 +66,9 @@ static unsigned int convert_distribution(unsigned int i, return left + right; } -void vp9_tree_probs_from_distribution( - vp9_tree tree, - vp9_prob probs [ /* n-1 */ ], - unsigned int branch_ct [ /* n-1 */ ] [2], - const unsigned int num_events[ /* n */ ], - unsigned int tok0_offset) { +void vp9_tree_probs_from_distribution(vp9_tree tree, vp9_prob probs[/* n-1 */], + unsigned int branch_ct[/* n-1 */][2], + const unsigned int num_events[/* n */], + unsigned int tok0_offset) { convert_distribution(0, tree, probs, branch_ct, num_events, tok0_offset); } diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c index 8f740f412..f03af3378 100644 --- a/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -15,7 +15,7 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_idct.h" -void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) { +void vp9_idct4x4_16_add_sse2(int16_t *input, uint8_t *dest, int stride) { const __m128i zero = _mm_setzero_si128(); const __m128i eight = _mm_set1_epi16(8); const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64, @@ -148,7 +148,7 @@ void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) { RECON_AND_STORE4X4(dest, input3); } -void vp9_short_idct4x4_1_add_sse2(int16_t *input, uint8_t *dest, int stride) { +void vp9_idct4x4_1_add_sse2(int16_t *input, uint8_t *dest, int stride) { __m128i dc_value; const __m128i zero = _mm_setzero_si128(); int a; @@ -985,7 +985,7 @@ void vp9_short_iht8x8_add_sse2(int16_t *input, uint8_t *dest, int stride, RECON_AND_STORE(dest, in[7]); } -void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) { +void vp9_short_idct8x8_10_add_sse2(int16_t *input, uint8_t *dest, int stride) { const __m128i zero = _mm_setzero_si128(); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = _mm_set1_epi16(1<<4); @@ -1014,7 +1014,7 @@ void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) { TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3) // Stage1 - { + { //NOLINT const __m128i lo_17 = _mm_unpackhi_epi16(in0, in3); const __m128i lo_35 = _mm_unpackhi_epi16(in1, in2); @@ -1039,7 +1039,7 @@ void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) { } // Stage2 - { + { //NOLINT const __m128i lo_04 = _mm_unpacklo_epi16(in0, in2); const __m128i lo_26 = _mm_unpacklo_epi16(in1, in3); @@ -1069,7 +1069,7 @@ void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) { } // Stage3 - { + { //NOLINT const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); @@ -2456,7 +2456,7 @@ void vp9_short_iht16x16_add_sse2(int16_t *input, uint8_t *dest, int stride, write_buffer_8x16(dest, in1, stride); } -void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest, +void vp9_short_idct16x16_10_add_sse2(int16_t *input, uint8_t *dest, int stride) { const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = _mm_set1_epi16(1<<5); @@ -3548,4 +3548,52 @@ void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) { dest += 8 - (stride * 32); } } +} //NOLINT + +void vp9_short_idct32x32_1_add_sse2(int16_t *input, uint8_t *dest, int stride) { + __m128i dc_value; + const __m128i zero = _mm_setzero_si128(); + int a, i; + + a = dct_const_round_shift(input[0] * cospi_16_64); + a = dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 6); + + dc_value = _mm_set1_epi16(a); + + for (i = 0; i < 4; ++i) { + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + dest += 8 - (stride * 32); + } } diff --git a/vp9/common/x86/vp9_postproc_x86.h b/vp9/common/x86/vp9_postproc_x86.h index b0e8b181f..8870215a2 100644 --- a/vp9/common/x86/vp9_postproc_x86.h +++ b/vp9/common/x86/vp9_postproc_x86.h @@ -61,4 +61,4 @@ extern prototype_postproc_addnoise(vp9_plane_add_noise_wmt); #endif #endif -#endif +#endif // VP9_COMMON_X86_VP9_POSTPROC_X86_H_ diff --git a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm index bbf9888ca..7a5cca056 100644 --- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm +++ b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm @@ -534,6 +534,21 @@ sym(vp9_filter_block1d16_v8_avg_ssse3): ret ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +%macro HORIZx4_ROW 2 + movdqa %2, %1 + pshufb %1, [GLOBAL(shuf_t0t1)] + pshufb %2, [GLOBAL(shuf_t2t3)] + pmaddubsw %1, xmm6 + pmaddubsw %2, xmm7 + + paddsw %1, %2 + movdqa %2, %1 + psrldq %2, 8 + paddsw %1, %2 + paddsw %1, xmm5 + psraw %1, 7 + packuswb %1, %1 +%endm %macro HORIZx4 1 mov rdx, arg(5) ;filter ptr @@ -544,64 +559,84 @@ sym(vp9_filter_block1d16_v8_avg_ssse3): movdqa xmm4, [rdx] ;load filters movq xmm5, rcx packsswb xmm4, xmm4 - pshuflw xmm0, xmm4, 0b ;k0_k1 - pshuflw xmm1, xmm4, 01010101b ;k2_k3 - pshuflw xmm2, xmm4, 10101010b ;k4_k5 - pshuflw xmm3, xmm4, 11111111b ;k6_k7 - - punpcklqdq xmm0, xmm0 - punpcklqdq xmm1, xmm1 - punpcklqdq xmm2, xmm2 - punpcklqdq xmm3, xmm3 - - movdqa k0k1, xmm0 - movdqa k2k3, xmm1 - pshufd xmm5, xmm5, 0 - movdqa k4k5, xmm2 - movdqa k6k7, xmm3 - movdqa krd, xmm5 + pshuflw xmm6, xmm4, 0b ;k0_k1 + pshufhw xmm6, xmm6, 10101010b ;k0_k1_k4_k5 + pshuflw xmm7, xmm4, 01010101b ;k2_k3 + pshufhw xmm7, xmm7, 11111111b ;k2_k3_k6_k7 + pshufd xmm5, xmm5, 0 ;rounding movsxd rax, dword ptr arg(1) ;src_pixels_per_line movsxd rdx, dword ptr arg(3) ;output_pitch movsxd rcx, dword ptr arg(4) ;output_height - + shr rcx, 1 .loop: - movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 - - movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 - punpcklqdq xmm0, xmm3 + ;Do two rows once + movq xmm0, [rsi - 3] ;load src + movq xmm1, [rsi + 5] + movq xmm2, [rsi + rax - 3] + movq xmm3, [rsi + rax + 5] + punpcklqdq xmm0, xmm1 + punpcklqdq xmm2, xmm3 + + HORIZx4_ROW xmm0, xmm1 + HORIZx4_ROW xmm2, xmm3 +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 + movd xmm3, [rdi + rdx] + pavgb xmm2, xmm3 +%endif + movd [rdi], xmm0 + movd [rdi +rdx], xmm2 - movdqa xmm1, xmm0 - pshufb xmm0, [GLOBAL(shuf_t0t1)] - pmaddubsw xmm0, k0k1 + lea rsi, [rsi + rax] + prefetcht0 [rsi + 4 * rax - 3] + lea rsi, [rsi + rax] + lea rdi, [rdi + 2 * rdx] + prefetcht0 [rsi + 2 * rax - 3] - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf_t2t3)] - pmaddubsw xmm1, k2k3 + dec rcx + jnz .loop - movdqa xmm4, xmm2 - pshufb xmm2, [GLOBAL(shuf_t4t5)] - pmaddubsw xmm2, k4k5 + ; Do last row if output_height is odd + movsxd rcx, dword ptr arg(4) ;output_height + and rcx, 1 + je .done - pshufb xmm4, [GLOBAL(shuf_t6t7)] - pmaddubsw xmm4, k6k7 + movq xmm0, [rsi - 3] ; load src + movq xmm1, [rsi + 5] + punpcklqdq xmm0, xmm1 - paddsw xmm0, xmm1 - paddsw xmm0, xmm4 - paddsw xmm0, xmm2 - paddsw xmm0, krd - psraw xmm0, 7 - packuswb xmm0, xmm0 + HORIZx4_ROW xmm0, xmm1 %if %1 movd xmm1, [rdi] pavgb xmm0, xmm1 %endif - lea rsi, [rsi + rax] movd [rdi], xmm0 +.done +%endm - lea rdi, [rdi + rdx] - dec rcx - jnz .loop +%macro HORIZx8_ROW 4 + movdqa %2, %1 + movdqa %3, %1 + movdqa %4, %1 + + pshufb %1, [GLOBAL(shuf_t0t1)] + pshufb %2, [GLOBAL(shuf_t2t3)] + pshufb %3, [GLOBAL(shuf_t4t5)] + pshufb %4, [GLOBAL(shuf_t6t7)] + + pmaddubsw %1, k0k1 + pmaddubsw %2, k2k3 + pmaddubsw %3, k4k5 + pmaddubsw %4, k6k7 + + paddsw %1, %2 + paddsw %1, %4 + paddsw %1, %3 + paddsw %1, krd + psraw %1, 7 + packuswb %1, %1 %endm %macro HORIZx8 1 @@ -633,45 +668,51 @@ sym(vp9_filter_block1d16_v8_avg_ssse3): movsxd rax, dword ptr arg(1) ;src_pixels_per_line movsxd rdx, dword ptr arg(3) ;output_pitch movsxd rcx, dword ptr arg(4) ;output_height + shr rcx, 1 .loop: - movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 - - movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 + movq xmm0, [rsi - 3] ;load src + movq xmm3, [rsi + 5] + movq xmm4, [rsi + rax - 3] + movq xmm7, [rsi + rax + 5] punpcklqdq xmm0, xmm3 + punpcklqdq xmm4, xmm7 - movdqa xmm1, xmm0 - pshufb xmm0, [GLOBAL(shuf_t0t1)] - pmaddubsw xmm0, k0k1 + HORIZx8_ROW xmm0, xmm1, xmm2, xmm3 + HORIZx8_ROW xmm4, xmm5, xmm6, xmm7 +%if %1 + movq xmm1, [rdi] + movq xmm2, [rdi + rdx] + pavgb xmm0, xmm1 + pavgb xmm4, xmm2 +%endif + movq [rdi], xmm0 + movq [rdi + rdx], xmm4 - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf_t2t3)] - pmaddubsw xmm1, k2k3 + lea rsi, [rsi + rax] + prefetcht0 [rsi + 4 * rax - 3] + lea rsi, [rsi + rax] + lea rdi, [rdi + 2 * rdx] + prefetcht0 [rsi + 2 * rax - 3] + dec rcx + jnz .loop - movdqa xmm4, xmm2 - pshufb xmm2, [GLOBAL(shuf_t4t5)] - pmaddubsw xmm2, k4k5 + ;Do last row if output_height is odd + movsxd rcx, dword ptr arg(4) ;output_height + and rcx, 1 + je .done - pshufb xmm4, [GLOBAL(shuf_t6t7)] - pmaddubsw xmm4, k6k7 + movq xmm0, [rsi - 3] + movq xmm3, [rsi + 5] + punpcklqdq xmm0, xmm3 - paddsw xmm0, xmm1 - paddsw xmm0, xmm4 - paddsw xmm0, xmm2 - paddsw xmm0, krd - psraw xmm0, 7 - packuswb xmm0, xmm0 + HORIZx8_ROW xmm0, xmm1, xmm2, xmm3 %if %1 movq xmm1, [rdi] pavgb xmm0, xmm1 %endif - - lea rsi, [rsi + rax] movq [rdi], xmm0 - - lea rdi, [rdi + rdx] - dec rcx - jnz .loop +.done %endm %macro HORIZx16 1 @@ -705,60 +746,53 @@ sym(vp9_filter_block1d16_v8_avg_ssse3): movsxd rcx, dword ptr arg(4) ;output_height .loop: - movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 + prefetcht0 [rsi + 2 * rax -3] - movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 - punpcklqdq xmm0, xmm3 + movq xmm0, [rsi - 3] ;load src data + movq xmm4, [rsi + 5] + movq xmm7, [rsi + 13] + punpcklqdq xmm0, xmm4 + punpcklqdq xmm4, xmm7 movdqa xmm1, xmm0 - pshufb xmm0, [GLOBAL(shuf_t0t1)] - pmaddubsw xmm0, k0k1 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + movdqa xmm5, xmm4 + movdqa xmm6, xmm4 + movdqa xmm7, xmm4 - movdqa xmm2, xmm1 + pshufb xmm0, [GLOBAL(shuf_t0t1)] pshufb xmm1, [GLOBAL(shuf_t2t3)] - pmaddubsw xmm1, k2k3 - - movdqa xmm4, xmm2 pshufb xmm2, [GLOBAL(shuf_t4t5)] - pmaddubsw xmm2, k4k5 + pshufb xmm3, [GLOBAL(shuf_t6t7)] + pshufb xmm4, [GLOBAL(shuf_t0t1)] + pshufb xmm5, [GLOBAL(shuf_t2t3)] + pshufb xmm6, [GLOBAL(shuf_t4t5)] + pshufb xmm7, [GLOBAL(shuf_t6t7)] - pshufb xmm4, [GLOBAL(shuf_t6t7)] - pmaddubsw xmm4, k6k7 + pmaddubsw xmm0, k0k1 + pmaddubsw xmm1, k2k3 + pmaddubsw xmm2, k4k5 + pmaddubsw xmm3, k6k7 + pmaddubsw xmm4, k0k1 + pmaddubsw xmm5, k2k3 + pmaddubsw xmm6, k4k5 + pmaddubsw xmm7, k6k7 paddsw xmm0, xmm1 - paddsw xmm0, xmm4 + paddsw xmm0, xmm3 paddsw xmm0, xmm2 + paddsw xmm4, xmm5 + paddsw xmm4, xmm7 + paddsw xmm4, xmm6 + paddsw xmm0, krd + paddsw xmm4, krd psraw xmm0, 7 + psraw xmm4, 7 packuswb xmm0, xmm0 - - - movq xmm3, [rsi + 5] - movq xmm7, [rsi + 13] - punpcklqdq xmm3, xmm7 - - movdqa xmm1, xmm3 - pshufb xmm3, [GLOBAL(shuf_t0t1)] - pmaddubsw xmm3, k0k1 - - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf_t2t3)] - pmaddubsw xmm1, k2k3 - - movdqa xmm4, xmm2 - pshufb xmm2, [GLOBAL(shuf_t4t5)] - pmaddubsw xmm2, k4k5 - - pshufb xmm4, [GLOBAL(shuf_t6t7)] - pmaddubsw xmm4, k6k7 - - paddsw xmm3, xmm1 - paddsw xmm3, xmm4 - paddsw xmm3, xmm2 - paddsw xmm3, krd - psraw xmm3, 7 - packuswb xmm3, xmm3 - punpcklqdq xmm0, xmm3 + packuswb xmm4, xmm4 + punpcklqdq xmm0, xmm4 %if %1 movdqa xmm1, [rdi] pavgb xmm0, xmm1 @@ -792,19 +826,8 @@ sym(vp9_filter_block1d4_h8_ssse3): push rdi ; end prolog - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] - HORIZx4 0 - add rsp, 16*5 - pop rsp - ; begin epilog pop rdi pop rsi @@ -909,19 +932,8 @@ sym(vp9_filter_block1d4_h8_avg_ssse3): push rdi ; end prolog - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] - HORIZx4 1 - add rsp, 16*5 - pop rsp - ; begin epilog pop rdi pop rsi |