Diffstat (limited to 'vp9')
78 files changed, 7996 insertions, 1918 deletions
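A scalar sketch of the two inline-assembly idioms that recur throughout the new DSPr2 files below (the helper names are illustrative, not from the patch; the constants assume DCT_CONST_BITS == 14 and DCT_CONST_ROUNDING == 8192 as defined in vp9/common/vp9_idct.h):

#include <stdint.h>

/* mtlo/mthi seed the accumulator with const_2_power_13 (8192), madd/msub
 * accumulate coefficient products, and "extp %[r], $acN, 31" (with the
 * DSPControl position set to 45 by "wrdsp %[pos], 1") extracts bits
 * [45:14] of the 64-bit accumulator -- together this computes
 * dct_const_round_shift(a * c1 - b * c2). */
static int32_t madd_msub_extp(int32_t a, int32_t c1, int32_t b, int32_t c2) {
  int64_t acc = 8192;           /* mtlo %[const_2_power_13]; mthi $zero */
  acc += (int64_t)a * c1;       /* madd */
  acc -= (int64_t)b * c2;       /* msub */
  return (int32_t)(acc >> 14);  /* extp ..., 31 at DSP position 45 */
}

/* In the column passes, the lbu/addi/sra/lbux sequence through cm
 * (vp9_ff_cropTbl) is a table-driven clip_pixel(): cm points into the
 * middle of a clamp table, so reconstruction sums below 0 or above 255
 * still index valid entries. */
static uint8_t recon_clip(uint8_t pixel, int32_t residual, const uint8_t *cm) {
  return cm[pixel + ((residual + 32) >> 6)];  /* addi 32; sra 6; lbux */
}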
diff --git a/vp9/common/mips/dspr2/vp9_common_dspr2.h b/vp9/common/mips/dspr2/vp9_common_dspr2.h index dc88f1603..644264f65 100644 --- a/vp9/common/mips/dspr2/vp9_common_dspr2.h +++ b/vp9/common/mips/dspr2/vp9_common_dspr2.h @@ -81,6 +81,9 @@ static INLINE void vp9_prefetch_store_streamed(unsigned char *dst) { ); } +void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride); + void vp9_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, diff --git a/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c new file mode 100644 index 000000000..1b2f5506a --- /dev/null +++ b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c @@ -0,0 +1,1315 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +static void idct16_1d_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { + int i; + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + int step1_10, step1_11, step1_12, step1_13; + int step2_0, step2_1, step2_2, step2_3; + int step2_8, step2_9, step2_10, step2_11; + int step2_12, step2_13, step2_14, step2_15; + int load1, load2, load3, load4, load5, load6, load7, load8; + int result1, result2, result3, result4; + const int const_2_power_13 = 8192; + + for (i = no_rows; i--; ) { + /* prefetch row */ + vp9_prefetch_load((const uint8_t *)(input + 16)); + + __asm__ __volatile__ ( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 16(%[input]) \n\t" + "lh %[load3], 8(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[step2_0], $ac1, 31 \n\t" + "extp %[step2_1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[step2_2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[step2_3], $ac1, 31 \n\t" + + "add %[step1_0], %[step2_0], %[step2_3] \n\t" + "add %[step1_1], %[step2_1], %[step2_2] \n\t" + "sub %[step1_2], %[step2_1], %[step2_2] \n\t" + "sub %[step1_3], %[step2_0], %[step2_3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1), + [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3), + [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), + [step1_2] "=r" 
(step1_2), [step1_3] "=r" (step1_3) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + __asm__ __volatile__ ( + "lh %[load5], 2(%[input]) \n\t" + "lh %[load6], 30(%[input]) \n\t" + "lh %[load7], 18(%[input]) \n\t" + "lh %[load8], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_30_64] \n\t" + "msub $ac1, %[load6], %[cospi_2_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_14_64] \n\t" + "msub $ac3, %[load8], %[cospi_18_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_18_64] \n\t" + "madd $ac1, %[load8], %[cospi_14_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_2_64] \n\t" + "madd $ac2, %[load6], %[cospi_30_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "sub %[load5], %[result1], %[result2] \n\t" + "sub %[load6], %[result4], %[result3] \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load6], %[cospi_24_64] \n\t" + "msub $ac1, %[load5], %[cospi_8_64] \n\t" + "madd $ac3, %[load5], %[cospi_24_64] \n\t" + "madd $ac3, %[load6], %[cospi_8_64] \n\t" + + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[result1], %[result2] \n\t" + "add %[step2_15], %[result4], %[result3] \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), + [load7] "=&r" (load7), [load8] "=&r" (load8), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [result3] "=&r" (result3), [result4] "=&r" (result4), + [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15), + [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), + [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 22(%[input]) \n\t" + "lh %[load3], 26(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load3], %[cospi_6_64] \n\t" + "msub $ac3, %[load4], %[cospi_26_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load1], %[cospi_10_64] \n\t" + "madd $ac1, %[load2], %[cospi_22_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load3], %[cospi_26_64] \n\t" + "madd $ac2, %[load4], %[cospi_6_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[result2], %[result1] \n\t" + "sub %[load2], %[result4], %[result3] \n\t" + + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, 
%[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[result1], %[result2] \n\t" + "add %[step2_12], %[result4], %[result3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [result3] "=&r" (result3), [result4] "=&r" (result4), + [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), + [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), + [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + __asm__ __volatile__ ( + "lh %[load5], 4(%[input]) \n\t" + "lh %[load6], 28(%[input]) \n\t" + "lh %[load7], 20(%[input]) \n\t" + "lh %[load8], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_28_64] \n\t" + "msub $ac1, %[load6], %[cospi_4_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_12_64] \n\t" + "msub $ac3, %[load8], %[cospi_20_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_20_64] \n\t" + "madd $ac1, %[load8], %[cospi_12_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_4_64] \n\t" + "madd $ac2, %[load6], %[cospi_28_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[result4], %[result3] \n\t" + "sub %[load5], %[load5], %[result1] \n\t" + "add %[load5], %[load5], %[result2] \n\t" + + "sub %[load6], %[result1], %[result2] \n\t" + "sub %[load6], %[load6], %[result3] \n\t" + "add %[load6], %[load6], %[result4] \n\t" + + "madd $ac1, %[load5], %[cospi_16_64] \n\t" + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + "add %[step1_4], %[result1], %[result2] \n\t" + "add %[step1_7], %[result4], %[result3] \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), + [load7] "=&r" (load7), [load8] "=&r" (load8), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [result3] "=&r" (result3), [result4] "=&r" (result4), + [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), + [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "sub %[load5], %[step2_14], %[step2_13] \n\t" + "sub %[load5], %[load5], %[step2_9] \n\t" + "add %[load5], %[load5], %[step2_10] \n\t" + + "madd $ac0, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_14], %[step2_13] \n\t" + "sub %[load6], %[load6], %[step2_10] \n\t" + "add %[load6], %[load6], %[step2_9] \n\t" + + "madd 
$ac1, %[load6], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[step2_15], %[step2_12] \n\t" + "sub %[load5], %[load5], %[step2_8] \n\t" + "add %[load5], %[load5], %[step2_11] \n\t" + + "madd $ac2, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_15], %[step2_12] \n\t" + "sub %[load6], %[load6], %[step2_11] \n\t" + "add %[load6], %[load6], %[step2_8] \n\t" + + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_10], $ac0, 31 \n\t" + "extp %[step1_13], $ac1, 31 \n\t" + "extp %[step1_11], $ac2, 31 \n\t" + "extp %[step1_12], $ac3, 31 \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), + [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11), + [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13) + : [const_2_power_13] "r" (const_2_power_13), + [step2_14] "r" (step2_14), [step2_13] "r" (step2_13), + [step2_9] "r" (step2_9), [step2_10] "r" (step2_10), + [step2_15] "r" (step2_15), [step2_12] "r" (step2_12), + [step2_8] "r" (step2_8), [step2_11] "r" (step2_11), + [cospi_16_64] "r" (cospi_16_64) + ); + + __asm__ __volatile__ ( + "add %[load5], %[step1_0], %[step1_7] \n\t" + "add %[load5], %[load5], %[step2_12] \n\t" + "add %[load5], %[load5], %[step2_15] \n\t" + "add %[load6], %[step1_1], %[step1_6] \n\t" + "add %[load6], %[load6], %[step2_13] \n\t" + "add %[load6], %[load6], %[step2_14] \n\t" + "sh %[load5], 0(%[output]) \n\t" + "sh %[load6], 32(%[output]) \n\t" + "sub %[load5], %[step1_1], %[step1_6] \n\t" + "add %[load5], %[load5], %[step2_9] \n\t" + "add %[load5], %[load5], %[step2_10] \n\t" + "sub %[load6], %[step1_0], %[step1_7] \n\t" + "add %[load6], %[load6], %[step2_8] \n\t" + "add %[load6], %[load6], %[step2_11] \n\t" + "sh %[load5], 192(%[output]) \n\t" + "sh %[load6], 224(%[output]) \n\t" + "sub %[load5], %[step1_0], %[step1_7] \n\t" + "sub %[load5], %[load5], %[step2_8] \n\t" + "sub %[load5], %[load5], %[step2_11] \n\t" + "sub %[load6], %[step1_1], %[step1_6] \n\t" + "sub %[load6], %[load6], %[step2_9] \n\t" + "sub %[load6], %[load6], %[step2_10] \n\t" + "sh %[load5], 256(%[output]) \n\t" + "sh %[load6], 288(%[output]) \n\t" + "add %[load5], %[step1_1], %[step1_6] \n\t" + "sub %[load5], %[load5], %[step2_13] \n\t" + "sub %[load5], %[load5], %[step2_14] \n\t" + "add %[load6], %[step1_0], %[step1_7] \n\t" + "sub %[load6], %[load6], %[step2_12] \n\t" + "sub %[load6], %[load6], %[step2_15] \n\t" + "sh %[load5], 448(%[output]) \n\t" + "sh %[load6], 480(%[output]) \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6) + : [output] "r" (output), + [step1_0] "r" (step1_0), [step1_1] "r" (step1_1), + [step1_6] "r" (step1_6), [step1_7] "r" (step1_7), + [step2_8] "r" (step2_8), [step2_9] "r" (step2_9), + [step2_10] "r" (step2_10), [step2_11] "r" (step2_11), + [step2_12] "r" (step2_12), [step2_13] "r" (step2_13), + [step2_14] "r" (step2_14), [step2_15] "r" (step2_15) + ); + + __asm__ __volatile__ ( + "add %[load5], %[step1_2], %[step1_5] \n\t" + "add %[load5], %[load5], %[step1_13] \n\t" + "add %[load6], %[step1_3], %[step1_4] \n\t" + "add %[load6], %[load6], %[step1_12] \n\t" + "sh %[load5], 64(%[output]) \n\t" + "sh %[load6], 96(%[output]) \n\t" + "sub %[load5], %[step1_3], %[step1_4] \n\t" + "add %[load5], %[load5], %[step1_11] \n\t" + "sub %[load6], %[step1_2], %[step1_5] \n\t" + "add %[load6], %[load6], %[step1_10] \n\t" + "sh %[load5], 128(%[output]) \n\t" + "sh %[load6], 160(%[output]) \n\t" + "sub %[load5], %[step1_2], %[step1_5] \n\t" 
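+ /* load5 = step1_2 - step1_5 - step1_10 is stored at byte offset 320, i.e. out[10 * 16 + i]: results land 16 int16_t apart while output advances by one per row, so the row pass writes its output transposed for the column pass. */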
+ "sub %[load5], %[load5], %[step1_10] \n\t" + "sub %[load6], %[step1_3], %[step1_4] \n\t" + "sub %[load6], %[load6], %[step1_11] \n\t" + "sh %[load5], 320(%[output]) \n\t" + "sh %[load6], 352(%[output]) \n\t" + "add %[load5], %[step1_3], %[step1_4] \n\t" + "sub %[load5], %[load5], %[step1_12] \n\t" + "add %[load6], %[step1_2], %[step1_5] \n\t" + "sub %[load6], %[load6], %[step1_13] \n\t" + "sh %[load5], 384(%[output]) \n\t" + "sh %[load6], 416(%[output]) \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6) + : [output] "r" (output), + [step1_2] "r" (step1_2), [step1_3] "r" (step1_3), + [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), + [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), + [step1_12] "r" (step1_12), [step1_13] "r" (step1_13) + ); + + input += 16; + output += 1; + } +} + +static void idct16_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { + int i; + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + int step1_8, step1_9, step1_10, step1_11; + int step1_12, step1_13, step1_14, step1_15; + int step2_0, step2_1, step2_2, step2_3; + int step2_8, step2_9, step2_10, step2_11; + int step2_12, step2_13, step2_14, step2_15; + int load1, load2, load3, load4, load5, load6, load7, load8; + int result1, result2, result3, result4; + const int const_2_power_13 = 8192; + uint8_t *dest_pix; + uint8_t *cm = vp9_ff_cropTbl; + + /* prefetch vp9_ff_cropTbl */ + vp9_prefetch_load(vp9_ff_cropTbl); + vp9_prefetch_load(vp9_ff_cropTbl + 32); + vp9_prefetch_load(vp9_ff_cropTbl + 64); + vp9_prefetch_load(vp9_ff_cropTbl + 96); + vp9_prefetch_load(vp9_ff_cropTbl + 128); + vp9_prefetch_load(vp9_ff_cropTbl + 160); + vp9_prefetch_load(vp9_ff_cropTbl + 192); + vp9_prefetch_load(vp9_ff_cropTbl + 224); + + for (i = 0; i < 16; ++i) { + dest_pix = (dest + i); + __asm__ __volatile__ ( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 16(%[input]) \n\t" + "lh %[load3], 8(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[step2_0], $ac1, 31 \n\t" + "extp %[step2_1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[step2_2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[step2_3], $ac1, 31 \n\t" + + "add %[step1_0], %[step2_0], %[step2_3] \n\t" + "add %[step1_1], %[step2_1], %[step2_2] \n\t" + "sub %[step1_2], %[step2_1], %[step2_2] \n\t" + "sub %[step1_3], %[step2_0], %[step2_3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1), + [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3), + [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), + [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + __asm__ __volatile__ ( + "lh %[load5], 
2(%[input]) \n\t" + "lh %[load6], 30(%[input]) \n\t" + "lh %[load7], 18(%[input]) \n\t" + "lh %[load8], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_30_64] \n\t" + "msub $ac1, %[load6], %[cospi_2_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_14_64] \n\t" + "msub $ac3, %[load8], %[cospi_18_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_18_64] \n\t" + "madd $ac1, %[load8], %[cospi_14_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_2_64] \n\t" + "madd $ac2, %[load6], %[cospi_30_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "sub %[load5], %[result1], %[result2] \n\t" + "sub %[load6], %[result4], %[result3] \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load6], %[cospi_24_64] \n\t" + "msub $ac1, %[load5], %[cospi_8_64] \n\t" + "madd $ac3, %[load5], %[cospi_24_64] \n\t" + "madd $ac3, %[load6], %[cospi_8_64] \n\t" + + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[result1], %[result2] \n\t" + "add %[step2_15], %[result4], %[result3] \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), + [load7] "=&r" (load7), [load8] "=&r" (load8), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [result3] "=&r" (result3), [result4] "=&r" (result4), + [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15), + [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), + [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 22(%[input]) \n\t" + "lh %[load3], 26(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load3], %[cospi_6_64] \n\t" + "msub $ac3, %[load4], %[cospi_26_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load1], %[cospi_10_64] \n\t" + "madd $ac1, %[load2], %[cospi_22_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load3], %[cospi_26_64] \n\t" + "madd $ac2, %[load4], %[cospi_6_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[result2], %[result1] \n\t" + "sub %[load2], %[result4], %[result3] \n\t" + + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[result1], %[result2] \n\t" + "add 
%[step2_12], %[result4], %[result3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [result3] "=&r" (result3), [result4] "=&r" (result4), + [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), + [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), + [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + __asm__ __volatile__ ( + "lh %[load5], 4(%[input]) \n\t" + "lh %[load6], 28(%[input]) \n\t" + "lh %[load7], 20(%[input]) \n\t" + "lh %[load8], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_28_64] \n\t" + "msub $ac1, %[load6], %[cospi_4_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_12_64] \n\t" + "msub $ac3, %[load8], %[cospi_20_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_20_64] \n\t" + "madd $ac1, %[load8], %[cospi_12_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_4_64] \n\t" + "madd $ac2, %[load6], %[cospi_28_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[result4], %[result3] \n\t" + "sub %[load5], %[load5], %[result1] \n\t" + "add %[load5], %[load5], %[result2] \n\t" + + "sub %[load6], %[result1], %[result2] \n\t" + "sub %[load6], %[load6], %[result3] \n\t" + "add %[load6], %[load6], %[result4] \n\t" + + "madd $ac1, %[load5], %[cospi_16_64] \n\t" + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + + "add %[step1_4], %[result1], %[result2] \n\t" + "add %[step1_7], %[result4], %[result3] \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), + [load7] "=&r" (load7), [load8] "=&r" (load8), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [result3] "=&r" (result3), [result4] "=&r" (result4), + [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), + [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "sub %[load5], %[step2_14], %[step2_13] \n\t" + "sub %[load5], %[load5], %[step2_9] \n\t" + "add %[load5], %[load5], %[step2_10] \n\t" + + "madd $ac0, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_14], %[step2_13] \n\t" + "sub %[load6], %[load6], %[step2_10] \n\t" + "add %[load6], %[load6], %[step2_9] \n\t" + + "madd $ac1, %[load6], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[step2_15], %[step2_12] \n\t" + "sub %[load5], %[load5], 
%[step2_8] \n\t" + "add %[load5], %[load5], %[step2_11] \n\t" + + "madd $ac2, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_15], %[step2_12] \n\t" + "sub %[load6], %[load6], %[step2_11] \n\t" + "add %[load6], %[load6], %[step2_8] \n\t" + + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_10], $ac0, 31 \n\t" + "extp %[step1_13], $ac1, 31 \n\t" + "extp %[step1_11], $ac2, 31 \n\t" + "extp %[step1_12], $ac3, 31 \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), + [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11), + [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13) + : [const_2_power_13] "r" (const_2_power_13), + [step2_14] "r" (step2_14), [step2_13] "r" (step2_13), + [step2_9] "r" (step2_9), [step2_10] "r" (step2_10), + [step2_15] "r" (step2_15), [step2_12] "r" (step2_12), + [step2_8] "r" (step2_8), [step2_11] "r" (step2_11), + [cospi_16_64] "r" (cospi_16_64) + ); + + step1_8 = step2_8 + step2_11; + step1_9 = step2_9 + step2_10; + step1_14 = step2_13 + step2_14; + step1_15 = step2_12 + step2_15; + + __asm__ __volatile__ ( + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_0], %[step1_7] \n\t" + "add %[load5], %[load5], %[step1_15] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_1], %[step1_6] \n\t" + "add %[load6], %[load6], %[step1_14] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_2], %[step1_5] \n\t" + "add %[load5], %[load5], %[step1_13] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_3], %[step1_4] \n\t" + "add %[load6], %[load6], %[step1_12] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "sub %[load5], %[step1_3], %[step1_4] \n\t" + "add %[load5], %[load5], %[step1_11] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_2], %[step1_5] \n\t" + "add %[load6], %[load6], %[step1_10] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "sub %[load5], %[step1_1], %[step1_6] \n\t" + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[load5], %[step1_9] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], 
%[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_0], %[step1_7] \n\t" + "add %[load6], %[load6], %[step1_8] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "sub %[load5], %[step1_0], %[step1_7] \n\t" + "sub %[load5], %[load5], %[step1_8] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_1], %[step1_6] \n\t" + "sub %[load6], %[load6], %[step1_9] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "sub %[load5], %[step1_2], %[step1_5] \n\t" + "sub %[load5], %[load5], %[step1_10] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_3], %[step1_4] \n\t" + "sub %[load6], %[load6], %[step1_11] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_3], %[step1_4] \n\t" + "sub %[load5], %[load5], %[step1_12] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_2], %[step1_5] \n\t" + "sub %[load6], %[load6], %[step1_13] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_1], %[step1_6] \n\t" + "sub %[load5], %[load5], %[step1_14] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_0], %[step1_7] \n\t" + "sub %[load6], %[load6], %[step1_15] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), [load7] "=&r" 
(load7), + [load8] "=&r" (load8), [dest_pix] "+r" (dest_pix) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step1_0] "r" (step1_0), [step1_1] "r" (step1_1), + [step1_2] "r" (step1_2), [step1_3] "r" (step1_3), + [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), + [step1_6] "r" (step1_6), [step1_7] "r" (step1_7), + [step1_8] "r" (step1_8), [step1_9] "r" (step1_9), + [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), + [step1_12] "r" (step1_12), [step1_13] "r" (step1_13), + [step1_14] "r" (step1_14), [step1_15] "r" (step1_15) + ); + + input += 16; + } +} + +void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[16 * 16]); + uint32_t pos = 45; + + /* bit position for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + // First transform rows + idct16_1d_rows_dspr2(input, out, 16); + + // Then transform columns and add to dest + idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride); +} + +static void iadst16_1d(const int16_t *input, int16_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; + + int x0 = input[15]; + int x1 = input[0]; + int x2 = input[13]; + int x3 = input[2]; + int x4 = input[11]; + int x5 = input[4]; + int x6 = input[9]; + int x7 = input[6]; + int x8 = input[7]; + int x9 = input[8]; + int x10 = input[5]; + int x11 = input[10]; + int x12 = input[3]; + int x13 = input[12]; + int x14 = input[1]; + int x15 = input[14]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 + | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { + output[0] = output[1] = output[2] = output[3] = output[4] + = output[5] = output[6] = output[7] = output[8] + = output[9] = output[10] = output[11] = output[12] + = output[13] = output[14] = output[15] = 0; + return; + } + + // stage 1 + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s1 = x0 * cospi_31_64 - x1 * cospi_1_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s3 = x2 * cospi_27_64 - x3 * cospi_5_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s5 = x4 * cospi_23_64 - x5 * cospi_9_64; + s6 = x6 * cospi_13_64 + x7 * cospi_19_64; + s7 = x6 * cospi_19_64 - x7 * cospi_13_64; + s8 = x8 * cospi_17_64 + x9 * cospi_15_64; + s9 = x8 * cospi_15_64 - x9 * cospi_17_64; + s10 = x10 * cospi_21_64 + x11 * cospi_11_64; + s11 = x10 * cospi_11_64 - x11 * cospi_21_64; + s12 = x12 * cospi_25_64 + x13 * cospi_7_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s14 = x14 * cospi_29_64 + x15 * cospi_3_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + + x0 = dct_const_round_shift(s0 + s8); + x1 = dct_const_round_shift(s1 + s9); + x2 = dct_const_round_shift(s2 + s10); + x3 = dct_const_round_shift(s3 + s11); + x4 = dct_const_round_shift(s4 + s12); + x5 = dct_const_round_shift(s5 + s13); + x6 = dct_const_round_shift(s6 + s14); + x7 = dct_const_round_shift(s7 + s15); + x8 = dct_const_round_shift(s0 - s8); + x9 = dct_const_round_shift(s1 - s9); + x10 = dct_const_round_shift(s2 - s10); + x11 = dct_const_round_shift(s3 - s11); + x12 = dct_const_round_shift(s4 - s12); + x13 = dct_const_round_shift(s5 - s13); + x14 = dct_const_round_shift(s6 - s14); + x15 = dct_const_round_shift(s7 - s15); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; + s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; + s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + + x0 = s0 + s4; + x1 = s1 + s5; + x2 = s2 + s6; + x3 = s3 + s7; + x4 = s0 - s4; + x5 = s1 - s5; + x6 = s2 - s6; + x7 = s3 - s7; + x8 = dct_const_round_shift(s8 + s12); + x9 = dct_const_round_shift(s9 + s13); + x10 = dct_const_round_shift(s10 + s14); + x11 = dct_const_round_shift(s11 + s15); + x12 = dct_const_round_shift(s8 - s12); + x13 = dct_const_round_shift(s9 - s13); + x14 = dct_const_round_shift(s10 - s14); + x15 = dct_const_round_shift(s11 - s15); + + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + + x0 = s0 + s2; + x1 = s1 + s3; + x2 = s0 - s2; + x3 = s1 - s3; + x4 = dct_const_round_shift(s4 + s6); + x5 = dct_const_round_shift(s5 + s7); + x6 = dct_const_round_shift(s4 - s6); + x7 = dct_const_round_shift(s5 - s7); + x8 = s8 + s10; + x9 = s9 + s11; + x10 = s8 - s10; + x11 = s9 - s11; + x12 = dct_const_round_shift(s12 + s14); + x13 = dct_const_round_shift(s13 + s15); + x14 = dct_const_round_shift(s12 - s14); + x15 = dct_const_round_shift(s13 - s15); + + // stage 4 + s2 = (- cospi_16_64) * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (- x6 + x7); + s10 = cospi_16_64 * (x10 + x11); + s11 = cospi_16_64 * (- x10 + x11); + s14 = (- cospi_16_64) * (x14 + x15); + s15 = cospi_16_64 * (x14 - x15); + + x2 = dct_const_round_shift(s2); + x3 = dct_const_round_shift(s3); + x6 = dct_const_round_shift(s6); + x7 = dct_const_round_shift(s7); + x10 = dct_const_round_shift(s10); + x11 = dct_const_round_shift(s11); + x14 = dct_const_round_shift(s14); + x15 = dct_const_round_shift(s15); + + output[0] = x0; + output[1] = -x8; + output[2] = x12; + output[3] = -x4; + output[4] = x6; + output[5] = x14; + output[6] = x10; + output[7] = x2; + output[8] = x3; + output[9] = x11; + output[10] = x15; + output[11] = x7; + output[12] = x5; + output[13] = -x13; + output[14] = x9; + output[15] = -x1; +} + +void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, + int pitch, int tx_type) { + int i, j; + DECLARE_ALIGNED(32, int16_t, out[16 * 16]); + int16_t *outptr = out; + int16_t temp_out[16]; + uint32_t pos = 45; + + /* bit position for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + switch (tx_type) { + case DCT_DCT: // DCT in both horizontal and vertical + idct16_1d_rows_dspr2(input, outptr, 16); + idct16_1d_cols_add_blk_dspr2(out, dest, pitch); + break; + case ADST_DCT: // ADST in vertical, DCT in horizontal + idct16_1d_rows_dspr2(input, outptr, 16); + + outptr = out; + + for (i = 0; i < 16; ++i) { + iadst16_1d(outptr, temp_out); + + for (j = 0; j < 16; ++j) + dest[j * pitch + i] = + clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + + dest[j * pitch + i]); + outptr += 16; + } + break; + case DCT_ADST: // DCT in vertical, ADST in horizontal + { + int16_t temp_in[16 * 16]; + + for (i = 0; i < 16; ++i) { + /* prefetch row */ + vp9_prefetch_load((const uint8_t *)(input + 16)); + + iadst16_1d(input, outptr); + input += 16; + outptr += 16; + } + + for (i = 0; i < 16; ++i) + for (j = 0; j < 16; ++j) + temp_in[j * 16 + i] = out[i * 16 + j]; + + idct16_1d_cols_add_blk_dspr2(temp_in, dest, pitch); + } + break; + case ADST_ADST: // ADST in both directions + { + int16_t temp_in[16]; + + for (i = 0; i < 16; ++i) { + /* prefetch row */ + vp9_prefetch_load((const uint8_t *)(input + 16)); + + iadst16_1d(input, outptr); + input += 16; + outptr += 16; + } + + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j * 16 + i]; + iadst16_1d(temp_in, temp_out); + for (j = 0; j < 16; ++j) + dest[j * pitch + i] = + clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + + dest[j * pitch + i]); + } + } + break; + default: + printf("vp9_iht16x16_256_add_dspr2 : Invalid tx_type\n"); + break; + } +} + +void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[16 * 16]); + int16_t *outptr = out; + uint32_t i; + uint32_t pos = 45; + + /* bit position for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 4x4 area, we only need to calculate first 4 rows here. + idct16_1d_rows_dspr2(input, outptr, 4); + + outptr += 4; + for (i = 0; i < 6; ++i) { + __asm__ __volatile__ ( + "sw $zero, 0(%[outptr]) \n\t" + "sw $zero, 32(%[outptr]) \n\t" + "sw $zero, 64(%[outptr]) \n\t" + "sw $zero, 96(%[outptr]) \n\t" + "sw $zero, 128(%[outptr]) \n\t" + "sw $zero, 160(%[outptr]) \n\t" + "sw $zero, 192(%[outptr]) \n\t" + "sw $zero, 224(%[outptr]) \n\t" + "sw $zero, 256(%[outptr]) \n\t" + "sw $zero, 288(%[outptr]) \n\t" + "sw $zero, 320(%[outptr]) \n\t" + "sw $zero, 352(%[outptr]) \n\t" + "sw $zero, 384(%[outptr]) \n\t" + "sw $zero, 416(%[outptr]) \n\t" + "sw $zero, 448(%[outptr]) \n\t" + "sw $zero, 480(%[outptr]) \n\t" + + : + : [outptr] "r" (outptr) + ); + + outptr += 2; + } + + // Then transform columns + idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride); +} + +void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + uint32_t pos = 45; + int32_t out; + int32_t r; + int32_t a1, absa1; + int32_t vector_a1; + int32_t t1, t2, t3, t4; + int32_t vector_1, vector_2, vector_3, vector_4; + + /* bit position for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + + : + : [pos] "r" (pos) + ); + + out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); + __asm__ __volatile__ ( + "addi %[out], %[out], 32 \n\t" + "sra %[a1], %[out], 6 \n\t" + + : [out] "+r" (out), [a1] "=r" (a1) + : + ); + + if (a1 < 0) { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__ ( + "abs %[absa1], %[a1] \n\t" + "replv.qb %[vector_a1], %[absa1] \n\t" + + : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) + : [a1] "r" (a1) + ); + + for (r = 16; r--;) { + __asm__ __volatile__ ( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), + [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), +
[vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), + [dest] "+&r" (dest) + : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) + ); + } + } else { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__ ( + "replv.qb %[vector_a1], %[a1] \n\t" + + : [vector_a1] "=r" (vector_a1) + : [a1] "r" (a1) + ); + + for (r = 16; r--;) { + __asm__ __volatile__ ( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), + [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), + [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), + [dest] "+&r" (dest) + : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) + ); + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c new file mode 100644 index 000000000..5e92db3d2 --- /dev/null +++ b/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c @@ -0,0 +1,1073 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { + int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; + int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; + int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19; + int16_t step1_20, step1_21, step1_22, step1_23, step1_24, step1_25, step1_26; + int16_t step1_27, step1_28, step1_29, step1_30, step1_31; + int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; + int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; + int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; + int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; + int16_t step2_28, step2_29, step2_30, step2_31; + int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; + int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; + int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27; + int16_t step3_28, step3_29, step3_30, step3_31; + int temp0, temp1, temp2, temp3; + int load1, load2, load3, load4; + int result1, result2; + int i, temp21; + uint8_t *dest_pix, *dest_pix1; + const int const_2_power_13 = 8192; + uint8_t *cm = vp9_ff_cropTbl; + + /* prefetch vp9_ff_cropTbl */ + vp9_prefetch_load(vp9_ff_cropTbl); + vp9_prefetch_load(vp9_ff_cropTbl + 32); + vp9_prefetch_load(vp9_ff_cropTbl + 64); + vp9_prefetch_load(vp9_ff_cropTbl + 96); + vp9_prefetch_load(vp9_ff_cropTbl + 128); + vp9_prefetch_load(vp9_ff_cropTbl + 160); + vp9_prefetch_load(vp9_ff_cropTbl + 192); + vp9_prefetch_load(vp9_ff_cropTbl + 224); + + for (i = 0; i < 32; ++i) { + dest_pix = dest + i; + dest_pix1 = dest + i + 31 * dest_stride; + + __asm__ __volatile__ ( + "lh %[load1], 2(%[input]) \n\t" + "lh %[load2], 62(%[input]) \n\t" + "lh %[load3], 34(%[input]) \n\t" + "lh %[load4], 30(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_31_64] \n\t" + "msub $ac1, %[load2], %[cospi_1_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_1_64] \n\t" + "madd $ac3, %[load2], %[cospi_31_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_15_64] \n\t" + "msub $ac2, %[load4], %[cospi_17_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_17_64] \n\t" + "madd $ac1, %[load4], %[cospi_15_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load2], %[temp0], %[temp1] \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_17], $ac1, 31 \n\t" + "extp %[step1_30], $ac3, 31 \n\t" + "add %[step1_16], %[temp0], %[temp1] \n\t" + "add %[step1_31], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), 
[load2] "=&r" (load2), [load3] "=&r" (load3), + [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17), + [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64), + [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 18(%[input]) \n\t" + "lh %[load2], 46(%[input]) \n\t" + "lh %[load3], 50(%[input]) \n\t" + "lh %[load4], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_23_64] \n\t" + "msub $ac1, %[load2], %[cospi_9_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_9_64] \n\t" + "madd $ac3, %[load2], %[cospi_23_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_7_64] \n\t" + "msub $ac2, %[load4], %[cospi_25_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_25_64] \n\t" + "madd $ac1, %[load4], %[cospi_7_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "msub $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_18], $ac1, 31 \n\t" + "extp %[step1_29], $ac3, 31 \n\t" + "add %[step1_19], %[temp0], %[temp1] \n\t" + "add %[step1_28], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), + [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19), + [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64), + [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 54(%[input]) \n\t" + "lh %[load3], 42(%[input]) \n\t" + "lh %[load4], 22(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_27_64] \n\t" + "msub $ac1, %[load2], %[cospi_5_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_5_64] \n\t" + "madd $ac3, %[load2], %[cospi_27_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_11_64] \n\t" + "msub $ac2, %[load4], %[cospi_21_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_21_64] \n\t" + "madd $ac1, %[load4], %[cospi_11_64] \n\t" + "extp %[temp2], $ac1, 31 
\n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + + "madd $ac1, %[load2], %[cospi_12_64] \n\t" + "msub $ac1, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load1], %[cospi_12_64] \n\t" + "madd $ac3, %[load2], %[cospi_20_64] \n\t" + + "extp %[step1_21], $ac1, 31 \n\t" + "extp %[step1_26], $ac3, 31 \n\t" + "add %[step1_20], %[temp0], %[temp1] \n\t" + "add %[step1_27], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), + [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21), + [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64), + [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64), + [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 26(%[input]) \n\t" + "lh %[load2], 38(%[input]) \n\t" + "lh %[load3], 58(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_19_64] \n\t" + "msub $ac1, %[load2], %[cospi_13_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_13_64] \n\t" + "madd $ac3, %[load2], %[cospi_19_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_3_64] \n\t" + "msub $ac2, %[load4], %[cospi_29_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_29_64] \n\t" + "madd $ac1, %[load4], %[cospi_3_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + "msub $ac1, %[load1], %[cospi_12_64] \n\t" + "msub $ac1, %[load2], %[cospi_20_64] \n\t" + "msub $ac3, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load2], %[cospi_12_64] \n\t" + "extp %[step1_22], $ac1, 31 \n\t" + "extp %[step1_25], $ac3, 31 \n\t" + "add %[step1_23], %[temp0], %[temp1] \n\t" + "add %[step1_24], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), + [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23), + [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64), + [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64), + [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 4(%[input]) \n\t" + "lh %[load2], 60(%[input]) \n\t" + "lh %[load3], 36(%[input]) \n\t" + "lh %[load4], 28(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], 
%[cospi_30_64] \n\t" + "msub $ac1, %[load2], %[cospi_2_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_2_64] \n\t" + "madd $ac3, %[load2], %[cospi_30_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_14_64] \n\t" + "msub $ac2, %[load4], %[cospi_18_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_18_64] \n\t" + "madd $ac1, %[load4], %[cospi_14_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + "msub $ac1, %[load1], %[cospi_8_64] \n\t" + "madd $ac1, %[load2], %[cospi_24_64] \n\t" + "madd $ac3, %[load1], %[cospi_24_64] \n\t" + "madd $ac3, %[load2], %[cospi_8_64] \n\t" + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[temp0], %[temp1] \n\t" + "add %[step2_15], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), + [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9), + [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), + [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), + [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 20(%[input]) \n\t" + "lh %[load2], 44(%[input]) \n\t" + "lh %[load3], 52(%[input]) \n\t" + "lh %[load4], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_10_64] \n\t" + "madd $ac3, %[load2], %[cospi_22_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_6_64] \n\t" + "msub $ac2, %[load4], %[cospi_26_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_26_64] \n\t" + "madd $ac1, %[load4], %[cospi_6_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[temp0], %[temp1] \n\t" + "add %[step2_12], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), + [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), + [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) + : 
[const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), + [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), + [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) + ); + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "sub %[temp0], %[step2_14], %[step2_13] \n\t" + "sub %[temp0], %[temp0], %[step2_9] \n\t" + "add %[temp0], %[temp0], %[step2_10] \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp1], %[step2_14], %[step2_13] \n\t" + "add %[temp1], %[temp1], %[step2_9] \n\t" + "sub %[temp1], %[temp1], %[step2_10] \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "sub %[temp0], %[step2_15], %[step2_12] \n\t" + "sub %[temp0], %[temp0], %[step2_8] \n\t" + "add %[temp0], %[temp0], %[step2_11] \n\t" + "madd $ac2, %[temp0], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sub %[temp1], %[step2_15], %[step2_12] \n\t" + "add %[temp1], %[temp1], %[step2_8] \n\t" + "sub %[temp1], %[temp1], %[step2_11] \n\t" + "madd $ac3, %[temp1], %[cospi_16_64] \n\t" + + "add %[step3_8], %[step2_8], %[step2_11] \n\t" + "add %[step3_9], %[step2_9], %[step2_10] \n\t" + "add %[step3_14], %[step2_13], %[step2_14] \n\t" + "add %[step3_15], %[step2_12], %[step2_15] \n\t" + "extp %[step3_10], $ac0, 31 \n\t" + "extp %[step3_13], $ac1, 31 \n\t" + "extp %[step3_11], $ac2, 31 \n\t" + "extp %[step3_12], $ac3, 31 \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9), + [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11), + [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13), + [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15) + : [const_2_power_13] "r" (const_2_power_13), [step2_8] "r" (step2_8), + [step2_9] "r" (step2_9), [step2_10] "r" (step2_10), + [step2_11] "r" (step2_11), [step2_12] "r" (step2_12), + [step2_13] "r" (step2_13), [step2_14] "r" (step2_14), + [step2_15] "r" (step2_15), [cospi_16_64] "r" (cospi_16_64) + ); + + step2_18 = step1_17 - step1_18; + step2_29 = step1_30 - step1_29; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" + "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" + "extp %[step3_18], $ac0, 31 \n\t" + + : [step3_18] "=r" (step3_18) + : [const_2_power_13] "r" (const_2_power_13), + [step2_18] "r" (step2_18), [step2_29] "r" (step2_29), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; + step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step2_19 = step1_16 - step1_19; + step2_28 = step1_31 - step1_28; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" + "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" + "extp %[step3_19], $ac0, 31 \n\t" + + : [step3_19] "=r" (step3_19) + : [const_2_power_13] "r" (const_2_power_13), + [step2_19] "r" (step2_19), [step2_28] "r" (step2_28), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; + step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step3_16 = step1_16 + step1_19; + step3_17 = step1_17 + step1_18; + step3_30 = step1_29 
+ step1_30; + step3_31 = step1_28 + step1_31; + + step2_20 = step1_23 - step1_20; + step2_27 = step1_24 - step1_27; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" + "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" + "extp %[step3_20], $ac0, 31 \n\t" + + : [step3_20] "=r" (step3_20) + : [const_2_power_13] "r" (const_2_power_13), + [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; + step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step2_21 = step1_22 - step1_21; + step2_26 = step1_25 - step1_26; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" + "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" + "extp %[step3_21], $ac1, 31 \n\t" + + : [step3_21] "=r" (step3_21) + : [const_2_power_13] "r" (const_2_power_13), + [step2_21] "r" (step2_21), [step2_26] "r" (step2_26), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; + step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step3_22 = step1_21 + step1_22; + step3_23 = step1_20 + step1_23; + step3_24 = step1_24 + step1_27; + step3_25 = step1_25 + step1_26; + + step2_16 = step3_16 + step3_23; + step2_17 = step3_17 + step3_22; + step2_18 = step3_18 + step3_21; + step2_19 = step3_19 + step3_20; + step2_20 = step3_19 - step3_20; + step2_21 = step3_18 - step3_21; + step2_22 = step3_17 - step3_22; + step2_23 = step3_16 - step3_23; + + step2_24 = step3_31 - step3_24; + step2_25 = step3_30 - step3_25; + step2_26 = step3_29 - step3_26; + step2_27 = step3_28 - step3_27; + step2_28 = step3_28 + step3_27; + step2_29 = step3_29 + step3_26; + step2_30 = step3_30 + step3_25; + step2_31 = step3_31 + step3_24; + + __asm__ __volatile__ ( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 32(%[input]) \n\t" + "lh %[load3], 16(%[input]) \n\t" + "lh %[load4], 48(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[temp2], $ac3, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[temp3], $ac1, 31 \n\t" + "add %[step1_0], %[temp0], %[temp3] \n\t" + "add %[step1_1], %[temp1], %[temp2] \n\t" + "sub %[step1_2], %[temp1], %[temp2] \n\t" + "sub %[step1_3], %[temp0], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), + [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_24_64] "r" (cospi_24_64), 
[cospi_8_64] "r" (cospi_8_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 8(%[input]) \n\t" + "lh %[load2], 56(%[input]) \n\t" + "lh %[load3], 40(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_12_64] \n\t" + "msub $ac2, %[load4], %[cospi_20_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_20_64] \n\t" + "madd $ac1, %[load4], %[cospi_12_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load1], %[load1], %[temp0] \n\t" + "add %[load1], %[load1], %[temp1] \n\t" + "sub %[load2], %[temp0], %[temp1] \n\t" + "sub %[load2], %[load2], %[temp2] \n\t" + "add %[load2], %[load2], %[temp3] \n\t" + "madd $ac1, %[load1], %[cospi_16_64] \n\t" + "madd $ac3, %[load2], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + "add %[step1_4], %[temp0], %[temp1] \n\t" + "add %[step1_7], %[temp3], %[temp2] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), + [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + step2_0 = step1_0 + step1_7; + step2_1 = step1_1 + step1_6; + step2_2 = step1_2 + step1_5; + step2_3 = step1_3 + step1_4; + step2_4 = step1_3 - step1_4; + step2_5 = step1_2 - step1_5; + step2_6 = step1_1 - step1_6; + step2_7 = step1_0 - step1_7; + + // stage 7 + step1_0 = step2_0 + step3_15; + step1_1 = step2_1 + step3_14; + step1_2 = step2_2 + step3_13; + step1_3 = step2_3 + step3_12; + step1_4 = step2_4 + step3_11; + step1_5 = step2_5 + step3_10; + step1_6 = step2_6 + step3_9; + step1_7 = step2_7 + step3_8; + step1_8 = step2_7 - step3_8; + step1_9 = step2_6 - step3_9; + step1_10 = step2_5 - step3_10; + step1_11 = step2_4 - step3_11; + step1_12 = step2_3 - step3_12; + step1_13 = step2_2 - step3_13; + step1_14 = step2_1 - step3_14; + step1_15 = step2_0 - step3_15; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_27], %[step2_20] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_20], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20) + : [const_2_power_13] "r" (const_2_power_13), [step2_20] "r" (step2_20), + [step2_27] "r" (step2_27), [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_20 + step2_27) * cospi_16_64; + step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_26], 
%[step2_21] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_21], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21) + : [const_2_power_13] "r" (const_2_power_13), [step2_26] "r" (step2_26), + [step2_21] "r" (step2_21), [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_21 + step2_26) * cospi_16_64; + step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_25], %[step2_22] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_22], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22) + : [const_2_power_13] "r" (const_2_power_13), [step2_25] "r" (step2_25), + [step2_22] "r" (step2_22), [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_22 + step2_25) * cospi_16_64; + step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_24], %[step2_23] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_23], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23) + : [const_2_power_13] "r" (const_2_power_13), [step2_24] "r" (step2_24), + [step2_23] "r" (step2_23), [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_23 + step2_24) * cospi_16_64; + step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_0], %[step2_31] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_1], %[step2_30] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_2], %[step2_29] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_3], %[step2_28] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step1_0] "r" (step1_0), [step1_1] "r" (step1_1), + [step1_2] "r" (step1_2), [step1_3] "r" (step1_3), + [step2_28] "r" (step2_28), [step2_29] "r" (step2_29), + [step2_30] "r" (step2_30), [step2_31] "r" (step2_31) + ); + + step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6); + + __asm__ __volatile__ ( + "lbu 
%[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step3_12] "r" (step3_12), [step3_13] "r" (step3_13), + [step3_14] "r" (step3_14), [step3_15] "r" (step3_15) + ); + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_4], %[step1_27] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_5], %[step1_26] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_6], %[step1_25] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_7], %[step1_24] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), + [step1_6] "r" (step1_6), [step1_7] "r" (step1_7), + [step1_24] "r" (step1_24), [step1_25] "r" (step1_25), + [step1_26] "r" (step1_26), [step1_27] "r" (step1_27) + ); + + step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6); + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 
0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step3_12] "r" (step3_12), [step3_13] "r" (step3_13), + [step3_14] "r" (step3_14), [step3_15] "r" (step3_15) + ); + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_8], %[step1_23] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_9], %[step1_22] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_10], %[step1_21] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_11], %[step1_20] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step1_8] "r" (step1_8), [step1_9] "r" (step1_9), + [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), + [step1_20] "r" (step1_20), [step1_21] "r" (step1_21), + [step1_22] "r" (step1_22), [step1_23] "r" (step1_23) + ); + + step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6); + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], 
%[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step3_12] "r" (step3_12), [step3_13] "r" (step3_13), + [step3_14] "r" (step3_14), [step3_15] "r" (step3_15) + ); + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_12], %[step2_19] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_13], %[step2_18] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_14], %[step2_17] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_15], %[step2_16] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step1_12] "r" (step1_12), [step1_13] "r" (step1_13), + [step1_14] "r" (step1_14), [step1_15] "r" (step1_15), + [step2_16] "r" (step2_16), [step2_17] "r" (step2_17), + [step2_18] "r" (step2_18), [step2_19] "r" (step2_19) + ); + + step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6); + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step3_12] "r" (step3_12), [step3_13] "r" (step3_13), + [step3_14] "r" (step3_14), [step3_15] "r" (step3_15) + ); + 
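+      /* The interleaved blocks above perform the final per-column
+       * reconstruction, dest[j] = dest[j] + ROUND_POWER_OF_TWO(sum, 6)
+       * clamped to [0, 255]: dest_pix walks the top rows downwards (the
+       * addi/sra pair does the rounding in the asm), dest_pix1 walks the
+       * bottom rows upwards over sums pre-rounded in C, and the lbux
+       * lookup through the clip table cm performs the clamp. */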
+ input += 32; + } +} +#endif // #if HAVE_DSPR2 diff --git a/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c new file mode 100644 index 000000000..d3aee73cb --- /dev/null +++ b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c @@ -0,0 +1,1013 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output) { + int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; + int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; + int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; + int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27; + int16_t step1_28, step1_29, step1_30, step1_31; + int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; + int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; + int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; + int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; + int16_t step2_28, step2_29, step2_30, step2_31; + int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; + int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; + int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28; + int16_t step3_29, step3_30, step3_31; + int temp0, temp1, temp2, temp3; + int load1, load2, load3, load4; + int result1, result2; + int temp21; + int i; + const int const_2_power_13 = 8192; + const int32_t *input_int; + + for (i = 32; i--; ) { + input_int = (const int32_t *)input; + + if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] | + input_int[4] | input_int[5] | input_int[6] | input_int[7] | + input_int[8] | input_int[9] | input_int[10] | input_int[11] | + input_int[12] | input_int[13] | input_int[14] | input_int[15])) { + input += 32; + + __asm__ __volatile__ ( + "sh $zero, 0(%[output]) \n\t" + "sh $zero, 64(%[output]) \n\t" + "sh $zero, 128(%[output]) \n\t" + "sh $zero, 192(%[output]) \n\t" + "sh $zero, 256(%[output]) \n\t" + "sh $zero, 320(%[output]) \n\t" + "sh $zero, 384(%[output]) \n\t" + "sh $zero, 448(%[output]) \n\t" + "sh $zero, 512(%[output]) \n\t" + "sh $zero, 576(%[output]) \n\t" + "sh $zero, 640(%[output]) \n\t" + "sh $zero, 704(%[output]) \n\t" + "sh $zero, 768(%[output]) \n\t" + "sh $zero, 832(%[output]) \n\t" + "sh $zero, 896(%[output]) \n\t" + "sh $zero, 960(%[output]) \n\t" + "sh $zero, 1024(%[output]) \n\t" + "sh $zero, 1088(%[output]) \n\t" + "sh $zero, 1152(%[output]) \n\t" + "sh $zero, 1216(%[output]) \n\t" + "sh $zero, 1280(%[output]) \n\t" + "sh $zero, 1344(%[output]) \n\t" + "sh $zero, 1408(%[output]) \n\t" + "sh $zero, 1472(%[output]) \n\t" + "sh $zero, 1536(%[output]) \n\t" + "sh $zero, 1600(%[output]) \n\t" + "sh $zero, 1664(%[output]) \n\t" + "sh $zero, 1728(%[output]) \n\t" + 
"sh $zero, 1792(%[output]) \n\t" + "sh $zero, 1856(%[output]) \n\t" + "sh $zero, 1920(%[output]) \n\t" + "sh $zero, 1984(%[output]) \n\t" + + : + : [output] "r" (output) + ); + + output += 1; + + continue; + } + + /* prefetch row */ + vp9_prefetch_load((const uint8_t *)(input + 32)); + vp9_prefetch_load((const uint8_t *)(input + 48)); + + __asm__ __volatile__ ( + "lh %[load1], 2(%[input]) \n\t" + "lh %[load2], 62(%[input]) \n\t" + "lh %[load3], 34(%[input]) \n\t" + "lh %[load4], 30(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_31_64] \n\t" + "msub $ac1, %[load2], %[cospi_1_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_1_64] \n\t" + "madd $ac3, %[load2], %[cospi_31_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_15_64] \n\t" + "msub $ac2, %[load4], %[cospi_17_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_17_64] \n\t" + "madd $ac1, %[load4], %[cospi_15_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load2], %[temp0], %[temp1] \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_17], $ac1, 31 \n\t" + "extp %[step1_30], $ac3, 31 \n\t" + "add %[step1_16], %[temp0], %[temp1] \n\t" + "add %[step1_31], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17), + [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64), + [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 18(%[input]) \n\t" + "lh %[load2], 46(%[input]) \n\t" + "lh %[load3], 50(%[input]) \n\t" + "lh %[load4], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_23_64] \n\t" + "msub $ac1, %[load2], %[cospi_9_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_9_64] \n\t" + "madd $ac3, %[load2], %[cospi_23_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_7_64] \n\t" + "msub $ac2, %[load4], %[cospi_25_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_25_64] \n\t" + "madd $ac1, %[load4], %[cospi_7_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], 
%[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "msub $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_18], $ac1, 31 \n\t" + "extp %[step1_29], $ac3, 31 \n\t" + "add %[step1_19], %[temp0], %[temp1] \n\t" + "add %[step1_28], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19), + [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64), + [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 54(%[input]) \n\t" + "lh %[load3], 42(%[input]) \n\t" + "lh %[load4], 22(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_27_64] \n\t" + "msub $ac1, %[load2], %[cospi_5_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_5_64] \n\t" + "madd $ac3, %[load2], %[cospi_27_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_11_64] \n\t" + "msub $ac2, %[load4], %[cospi_21_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_21_64] \n\t" + "madd $ac1, %[load4], %[cospi_11_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + + "madd $ac1, %[load2], %[cospi_12_64] \n\t" + "msub $ac1, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load1], %[cospi_12_64] \n\t" + "madd $ac3, %[load2], %[cospi_20_64] \n\t" + + "extp %[step1_21], $ac1, 31 \n\t" + "extp %[step1_26], $ac3, 31 \n\t" + "add %[step1_20], %[temp0], %[temp1] \n\t" + "add %[step1_27], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21), + [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64), + [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64), + [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 26(%[input]) \n\t" + "lh %[load2], 38(%[input]) \n\t" + "lh %[load3], 58(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_19_64] \n\t" + "msub $ac1, %[load2], %[cospi_13_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_13_64] \n\t" + "madd $ac3, %[load2], %[cospi_19_64] \n\t" + "extp 
%[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_3_64] \n\t" + "msub $ac2, %[load4], %[cospi_29_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_29_64] \n\t" + "madd $ac1, %[load4], %[cospi_3_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_12_64] \n\t" + "msub $ac1, %[load2], %[cospi_20_64] \n\t" + "msub $ac3, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load2], %[cospi_12_64] \n\t" + + "extp %[step1_22], $ac1, 31 \n\t" + "extp %[step1_25], $ac3, 31 \n\t" + "add %[step1_23], %[temp0], %[temp1] \n\t" + "add %[step1_24], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23), + [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64), + [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64), + [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 4(%[input]) \n\t" + "lh %[load2], 60(%[input]) \n\t" + "lh %[load3], 36(%[input]) \n\t" + "lh %[load4], 28(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_30_64] \n\t" + "msub $ac1, %[load2], %[cospi_2_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_2_64] \n\t" + "madd $ac3, %[load2], %[cospi_30_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_14_64] \n\t" + "msub $ac2, %[load4], %[cospi_18_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_18_64] \n\t" + "madd $ac1, %[load4], %[cospi_14_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + + "msub $ac1, %[load1], %[cospi_8_64] \n\t" + "madd $ac1, %[load2], %[cospi_24_64] \n\t" + "madd $ac3, %[load1], %[cospi_24_64] \n\t" + "madd $ac3, %[load2], %[cospi_8_64] \n\t" + + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[temp0], %[temp1] \n\t" + "add %[step2_15], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9), + [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), + [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" 
(cospi_18_64), + [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 20(%[input]) \n\t" + "lh %[load2], 44(%[input]) \n\t" + "lh %[load3], 52(%[input]) \n\t" + "lh %[load4], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_10_64] \n\t" + "madd $ac3, %[load2], %[cospi_22_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_6_64] \n\t" + "msub $ac2, %[load4], %[cospi_26_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_26_64] \n\t" + "madd $ac1, %[load4], %[cospi_6_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[temp0], %[temp1] \n\t" + "add %[step2_12], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), + [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), + [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), + [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) + ); + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "sub %[temp0], %[step2_14], %[step2_13] \n\t" + "sub %[temp0], %[temp0], %[step2_9] \n\t" + "add %[temp0], %[temp0], %[step2_10] \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp1], %[step2_14], %[step2_13] \n\t" + "add %[temp1], %[temp1], %[step2_9] \n\t" + "sub %[temp1], %[temp1], %[step2_10] \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "sub %[temp0], %[step2_15], %[step2_12] \n\t" + "sub %[temp0], %[temp0], %[step2_8] \n\t" + "add %[temp0], %[temp0], %[step2_11] \n\t" + "madd $ac2, %[temp0], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sub %[temp1], %[step2_15], %[step2_12] \n\t" + "add %[temp1], %[temp1], %[step2_8] \n\t" + "sub %[temp1], %[temp1], %[step2_11] \n\t" + "madd $ac3, %[temp1], %[cospi_16_64] \n\t" + + "add %[step3_8], %[step2_8], %[step2_11] \n\t" + "add %[step3_9], %[step2_9], %[step2_10] \n\t" + "add %[step3_14], %[step2_13], %[step2_14] \n\t" + "add %[step3_15], %[step2_12], %[step2_15] \n\t" + + "extp %[step3_10], $ac0, 31 \n\t" + "extp %[step3_13], $ac1, 31 \n\t" + "extp %[step3_11], $ac2, 31 \n\t" + "extp %[step3_12], $ac3, 31 \n\t" + + : [temp0] 
"=&r" (temp0), [temp1] "=&r" (temp1), + [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9), + [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11), + [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13), + [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15) + : [const_2_power_13] "r" (const_2_power_13), + [step2_8] "r" (step2_8), [step2_9] "r" (step2_9), + [step2_10] "r" (step2_10), [step2_11] "r" (step2_11), + [step2_12] "r" (step2_12), [step2_13] "r" (step2_13), + [step2_14] "r" (step2_14), [step2_15] "r" (step2_15), + [cospi_16_64] "r" (cospi_16_64) + ); + + step2_18 = step1_17 - step1_18; + step2_29 = step1_30 - step1_29; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" + "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" + "extp %[step3_18], $ac0, 31 \n\t" + + : [step3_18] "=r" (step3_18) + : [const_2_power_13] "r" (const_2_power_13), + [step2_18] "r" (step2_18), [step2_29] "r" (step2_29), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; + step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step2_19 = step1_16 - step1_19; + step2_28 = step1_31 - step1_28; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" + "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" + "extp %[step3_19], $ac0, 31 \n\t" + + : [step3_19] "=r" (step3_19) + : [const_2_power_13] "r" (const_2_power_13), + [step2_19] "r" (step2_19), [step2_28] "r" (step2_28), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; + step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step3_16 = step1_16 + step1_19; + step3_17 = step1_17 + step1_18; + step3_30 = step1_29 + step1_30; + step3_31 = step1_28 + step1_31; + + step2_20 = step1_23 - step1_20; + step2_27 = step1_24 - step1_27; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" + "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" + "extp %[step3_20], $ac0, 31 \n\t" + + : [step3_20] "=r" (step3_20) + : [const_2_power_13] "r" (const_2_power_13), + [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; + step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step2_21 = step1_22 - step1_21; + step2_26 = step1_25 - step1_26; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" + "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" + "extp %[step3_21], $ac1, 31 \n\t" + + : [step3_21] "=r" (step3_21) + : [const_2_power_13] "r" (const_2_power_13), + [step2_21] "r" (step2_21), [step2_26] "r" (step2_26), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; + step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step3_22 = step1_21 + step1_22; + step3_23 = step1_20 + step1_23; + step3_24 = step1_24 + step1_27; + step3_25 = step1_25 + step1_26; + + step2_16 = step3_16 + step3_23; + step2_17 = step3_17 + step3_22; + step2_18 = step3_18 + step3_21; + step2_19 = step3_19 + step3_20; + step2_20 = step3_19 - step3_20; + step2_21 = step3_18 - step3_21; + 
step2_22 = step3_17 - step3_22; + step2_23 = step3_16 - step3_23; + + step2_24 = step3_31 - step3_24; + step2_25 = step3_30 - step3_25; + step2_26 = step3_29 - step3_26; + step2_27 = step3_28 - step3_27; + step2_28 = step3_28 + step3_27; + step2_29 = step3_29 + step3_26; + step2_30 = step3_30 + step3_25; + step2_31 = step3_31 + step3_24; + + __asm__ __volatile__ ( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 32(%[input]) \n\t" + "lh %[load3], 16(%[input]) \n\t" + "lh %[load4], 48(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[temp2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[temp3], $ac1, 31 \n\t" + + "add %[step1_0], %[temp0], %[temp3] \n\t" + "add %[step1_1], %[temp1], %[temp2] \n\t" + "sub %[step1_2], %[temp1], %[temp2] \n\t" + "sub %[step1_3], %[temp0], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), + [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_16_64] "r" (cospi_16_64), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + + ); + + __asm__ __volatile__ ( + "lh %[load1], 8(%[input]) \n\t" + "lh %[load2], 56(%[input]) \n\t" + "lh %[load3], 40(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_12_64] \n\t" + "msub $ac2, %[load4], %[cospi_20_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_20_64] \n\t" + "madd $ac1, %[load4], %[cospi_12_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load1], %[load1], %[temp0] \n\t" + "add %[load1], %[load1], %[temp1] \n\t" + + "sub %[load2], %[temp0], %[temp1] \n\t" + "sub %[load2], %[load2], %[temp2] \n\t" + "add %[load2], %[load2], %[temp3] \n\t" + + "madd $ac1, %[load1], %[cospi_16_64] \n\t" + "madd $ac3, %[load2], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + "add %[step1_4], %[temp0], %[temp1] \n\t" + "add %[step1_7], %[temp3], %[temp2] \n\t" 
+ + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), + [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + step2_0 = step1_0 + step1_7; + step2_1 = step1_1 + step1_6; + step2_2 = step1_2 + step1_5; + step2_3 = step1_3 + step1_4; + step2_4 = step1_3 - step1_4; + step2_5 = step1_2 - step1_5; + step2_6 = step1_1 - step1_6; + step2_7 = step1_0 - step1_7; + + step1_0 = step2_0 + step3_15; + step1_1 = step2_1 + step3_14; + step1_2 = step2_2 + step3_13; + step1_3 = step2_3 + step3_12; + step1_4 = step2_4 + step3_11; + step1_5 = step2_5 + step3_10; + step1_6 = step2_6 + step3_9; + step1_7 = step2_7 + step3_8; + step1_8 = step2_7 - step3_8; + step1_9 = step2_6 - step3_9; + step1_10 = step2_5 - step3_10; + step1_11 = step2_4 - step3_11; + step1_12 = step2_3 - step3_12; + step1_13 = step2_2 - step3_13; + step1_14 = step2_1 - step3_14; + step1_15 = step2_0 - step3_15; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_27], %[step2_20] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_20], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20) + : [const_2_power_13] "r" (const_2_power_13), + [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), + [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_20 + step2_27) * cospi_16_64; + step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_26], %[step2_21] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_21], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21) + : [const_2_power_13] "r" (const_2_power_13), + [step2_26] "r" (step2_26), [step2_21] "r" (step2_21), + [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_21 + step2_26) * cospi_16_64; + step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_25], %[step2_22] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_22], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22) + : [const_2_power_13] "r" (const_2_power_13), + [step2_25] "r" (step2_25), [step2_22] "r" (step2_22), + [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_22 + step2_25) * cospi_16_64; + step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_24], %[step2_23] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_23], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23) + : [const_2_power_13] "r" (const_2_power_13), + [step2_24] "r" (step2_24), [step2_23] "r" (step2_23), + [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_23 + step2_24) * cospi_16_64; + step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + // final stage + output[0 * 32] = step1_0 + step2_31; + output[1 * 32] = step1_1 + step2_30; + output[2 * 32] = 
step1_2 + step2_29; + output[3 * 32] = step1_3 + step2_28; + output[4 * 32] = step1_4 + step1_27; + output[5 * 32] = step1_5 + step1_26; + output[6 * 32] = step1_6 + step1_25; + output[7 * 32] = step1_7 + step1_24; + output[8 * 32] = step1_8 + step1_23; + output[9 * 32] = step1_9 + step1_22; + output[10 * 32] = step1_10 + step1_21; + output[11 * 32] = step1_11 + step1_20; + output[12 * 32] = step1_12 + step2_19; + output[13 * 32] = step1_13 + step2_18; + output[14 * 32] = step1_14 + step2_17; + output[15 * 32] = step1_15 + step2_16; + output[16 * 32] = step1_15 - step2_16; + output[17 * 32] = step1_14 - step2_17; + output[18 * 32] = step1_13 - step2_18; + output[19 * 32] = step1_12 - step2_19; + output[20 * 32] = step1_11 - step1_20; + output[21 * 32] = step1_10 - step1_21; + output[22 * 32] = step1_9 - step1_22; + output[23 * 32] = step1_8 - step1_23; + output[24 * 32] = step1_7 - step1_24; + output[25 * 32] = step1_6 - step1_25; + output[26 * 32] = step1_5 - step1_26; + output[27 * 32] = step1_4 - step1_27; + output[28 * 32] = step1_3 - step2_28; + output[29 * 32] = step1_2 - step2_29; + output[30 * 32] = step1_1 - step2_30; + output[31 * 32] = step1_0 - step2_31; + + input += 32; + output += 1; + } +} + +void vp9_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[32 * 32]); + int16_t *outptr = out; + uint32_t pos = 45; + + /* bit position for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + // Rows + idct32_1d_rows_dspr2(input, outptr); + + // Columns + vp9_idct32_1d_cols_add_blk_dspr2(out, dest, dest_stride); +} + +void vp9_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, + int stride) { + int r, out; + int32_t a1, absa1; + int32_t vector_a1; + int32_t t1, t2, t3, t4; + int32_t vector_1, vector_2, vector_3, vector_4; + uint32_t pos = 45; + + /* bit position for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + + : + : [pos] "r" (pos) + ); + + out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); + __asm__ __volatile__ ( + "addi %[out], %[out], 32 \n\t" + "sra %[a1], %[out], 6 \n\t" + + : [out] "+r" (out), [a1] "=r" (a1) + : + ); + + if (a1 < 0) { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__ ( + "abs %[absa1], %[a1] \n\t" + "replv.qb %[vector_a1], %[absa1] \n\t" + + : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) + : [a1] "r" (a1) + ); + + for (r = 32; r--;) { + __asm__ __volatile__ ( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + + "lw %[t1], 16(%[dest]) \n\t" + "lw %[t2], 20(%[dest]) \n\t" + "lw %[t3], 24(%[dest]) \n\t" + "lw %[t4], 28(%[dest]) \n\t" + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 16(%[dest]) \n\t" + "sw %[vector_2], 20(%[dest]) \n\t" + "sw %[vector_3], 24(%[dest]) \n\t" + "sw %[vector_4], 28(%[dest]) \n\t" + + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r" 
(t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), + [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), + [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), + [dest] "+&r" (dest) + : [stride] "r" (stride), [vector_a1] "r" (vector_a1) + ); + } + } else { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__ ( + "replv.qb %[vector_a1], %[a1] \n\t" + + : [vector_a1] "=r" (vector_a1) + : [a1] "r" (a1) + ); + + for (r = 32; r--;) { + __asm__ __volatile__ ( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + + "lw %[t1], 16(%[dest]) \n\t" + "lw %[t2], 20(%[dest]) \n\t" + "lw %[t3], 24(%[dest]) \n\t" + "lw %[t4], 28(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 16(%[dest]) \n\t" + "sw %[vector_2], 20(%[dest]) \n\t" + "sw %[vector_3], 24(%[dest]) \n\t" + "sw %[vector_4], 28(%[dest]) \n\t" + + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), + [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), + [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), + [dest] "+&r" (dest) + : [stride] "r" (stride), [vector_a1] "r" (vector_a1) + ); + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c new file mode 100644 index 000000000..5b7aa5e71 --- /dev/null +++ b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c @@ -0,0 +1,438 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +static void vp9_idct4_1d_rows_dspr2(const int16_t *input, int16_t *output) { + int16_t step_0, step_1, step_2, step_3; + int Temp0, Temp1, Temp2, Temp3; + const int const_2_power_13 = 8192; + int i; + + for (i = 4; i--; ) { + __asm__ __volatile__ ( + /* + temp_1 = (input[0] + input[2]) * cospi_16_64; + step_0 = dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[2]) * cospi_16_64; + step_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 4(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add %[Temp2], %[Temp0], %[Temp1] \n\t" + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "extp %[step_0], $ac0, 31 \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "extp %[step_1], $ac1, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + /* + temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; + step_2 = dct_const_round_shift(temp1); + */ + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "extp %[step_2], $ac0, 31 \n\t" + + /* + temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; + step_3 = dct_const_round_shift(temp2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[step_3], $ac1, 31 \n\t" + + /* + output[0] = step_0 + step_3; + output[4] = step_1 + step_2; + output[8] = step_1 - step_2; + output[12] = step_0 - step_3; + */ + "add %[Temp0], %[step_0], %[step_3] \n\t" + "sh %[Temp0], 0(%[output]) \n\t" + + "add %[Temp1], %[step_1], %[step_2] \n\t" + "sh %[Temp1], 8(%[output]) \n\t" + + "sub %[Temp2], %[step_1], %[step_2] \n\t" + "sh %[Temp2], 16(%[output]) \n\t" + + "sub %[Temp3], %[step_0], %[step_3] \n\t" + "sh %[Temp3], 24(%[output]) \n\t" + + : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), + [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), + [step_0] "=&r" (step_0), [step_1] "=&r" (step_1), + [step_2] "=&r" (step_2), [step_3] "=&r" (step_3), + [output] "+r" (output) + : [const_2_power_13] "r" (const_2_power_13), + [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64), + [cospi_24_64] "r" (cospi_24_64), + [input] "r" (input) + ); + + input += 4; + output += 1; + } +} + +static void vp9_idct4_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { + int16_t step_0, step_1, step_2, step_3; + int Temp0, Temp1, Temp2, Temp3; + const int const_2_power_13 = 8192; + int i; + uint8_t *dest_pix; + uint8_t *cm = vp9_ff_cropTbl; + + /* prefetch vp9_ff_cropTbl */ + vp9_prefetch_load(vp9_ff_cropTbl); + vp9_prefetch_load(vp9_ff_cropTbl + 32); + vp9_prefetch_load(vp9_ff_cropTbl + 64); + vp9_prefetch_load(vp9_ff_cropTbl + 96); + vp9_prefetch_load(vp9_ff_cropTbl + 128); + vp9_prefetch_load(vp9_ff_cropTbl + 160); + vp9_prefetch_load(vp9_ff_cropTbl + 192); + vp9_prefetch_load(vp9_ff_cropTbl + 224); + + for (i = 0; i < 4; ++i) { + dest_pix = (dest + i); + + __asm__ __volatile__ ( + /* + temp_1 = (input[0] + input[2]) * cospi_16_64; + step_0 = 
dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[2]) * cospi_16_64; + step_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 4(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add %[Temp2], %[Temp0], %[Temp1] \n\t" + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "extp %[step_0], $ac0, 31 \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "extp %[step_1], $ac1, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + /* + temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; + step_2 = dct_const_round_shift(temp1); + */ + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "extp %[step_2], $ac0, 31 \n\t" + + /* + temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; + step_3 = dct_const_round_shift(temp2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[step_3], $ac1, 31 \n\t" + + /* + output[0] = step_0 + step_3; + output[4] = step_1 + step_2; + output[8] = step_1 - step_2; + output[12] = step_0 - step_3; + */ + "add %[Temp0], %[step_0], %[step_3] \n\t" + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step_1], %[step_2] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step_1], %[step_2] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step_0], %[step_3] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + + : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), + [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), + [step_0] "=&r" (step_0), [step_1] "=&r" (step_1), + [step_2] "=&r" (step_2), [step_3] "=&r" (step_3), + [dest_pix] "+r" (dest_pix) + : [const_2_power_13] "r" (const_2_power_13), + [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64), + [cospi_24_64] "r" (cospi_24_64), + [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride) + ); + + input += 4; + } +} + +void vp9_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[4 * 4]); + int16_t *outptr = out; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + // Rows + vp9_idct4_1d_rows_dspr2(input, outptr); + + // Columns + vp9_idct4_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); 
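
A note on the accumulator idiom that all of these DSPr2 transforms share: the "wrdsp %[pos], 1" above writes pos = 45 into the DSPControl pos field, each butterfly preloads an accumulator with const_2_power_13 = 8192 = 1 << (DCT_CONST_BITS - 1) via mtlo, and "extp ..., $acN, 31" then extracts the 32 accumulator bits [45:14], which amounts to an arithmetic right shift by DCT_CONST_BITS = 14. Taken together this is vp9's dct_const_round_shift() done in hardware. A minimal C model of one madd/extp pair, under those assumptions (the helper name is illustrative, not part of the patch):

#include <stdint.h>

/* Models: mtlo 8192; madd $ac, a, coeff; extp dst, $ac, 31 (pos == 45). */
static int32_t acc_round_shift_model(int32_t a, int32_t coeff) {
  int64_t acc = 1 << 13;        /* rounding bias, 1 << (DCT_CONST_BITS - 1) */
  acc += (int64_t)a * coeff;    /* 32x32 -> 64 multiply-accumulate          */
  return (int32_t)(acc >> 14);  /* keep bits [45:14]: dct_const_round_shift */
}
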
+}
+
+void vp9_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,
+                             int dest_stride) {
+  int a1, absa1;
+  int r;
+  int32_t out;
+  int t2, vector_a1, vector_a;
+  uint32_t pos = 45;
+  int16_t input_dc = input[0];
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+
+    :
+    : [pos] "r" (pos)
+  );
+
+  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc);
+  __asm__ __volatile__ (
+    "addi     %[out],    %[out],    8       \n\t"
+    "sra      %[a1],     %[out],    4       \n\t"
+
+    : [out] "+r" (out), [a1] "=r" (a1)
+    :
+  );
+
+  if (a1 < 0) {
+    /* use quad-byte operations:
+     * input and output memory are four-byte aligned */
+    __asm__ __volatile__ (
+      "abs        %[absa1],     %[a1]         \n\t"
+      "replv.qb   %[vector_a1], %[absa1]      \n\t"
+
+      : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+      : [a1] "r" (a1)
+    );
+
+    for (r = 4; r--;) {
+      __asm__ __volatile__ (
+        "lw           %[t2],        0(%[dest])                      \n\t"
+        "subu_s.qb    %[vector_a],  %[t2],          %[vector_a1]    \n\t"
+        "sw           %[vector_a],  0(%[dest])                      \n\t"
+        "add          %[dest],      %[dest],        %[dest_stride]  \n\t"
+
+        : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
+          [dest] "+&r" (dest)
+        : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+      );
+    }
+  } else {
+    /* use quad-byte operations:
+     * input and output memory are four-byte aligned */
+    __asm__ __volatile__ (
+      "replv.qb   %[vector_a1], %[a1]     \n\t"
+      : [vector_a1] "=r" (vector_a1)
+      : [a1] "r" (a1)
+    );
+
+    for (r = 4; r--;) {
+      __asm__ __volatile__ (
+        "lw           %[t2],        0(%[dest])                      \n\t"
+        "addu_s.qb    %[vector_a],  %[t2],          %[vector_a1]    \n\t"
+        "sw           %[vector_a],  0(%[dest])                      \n\t"
+        "add          %[dest],      %[dest],        %[dest_stride]  \n\t"
+
+        : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
+          [dest] "+&r" (dest)
+        : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+      );
+    }
+  }
+}
+
+static void iadst4_1d_dspr2(const int16_t *input, int16_t *output) {
+  int s0, s1, s2, s3, s4, s5, s6, s7;
+  int x0, x1, x2, x3;
+
+  x0 = input[0];
+  x1 = input[1];
+  x2 = input[2];
+  x3 = input[3];
+
+  if (!(x0 | x1 | x2 | x3)) {
+    output[0] = output[1] = output[2] = output[3] = 0;
+    return;
+  }
+
+  s0 = sinpi_1_9 * x0;
+  s1 = sinpi_2_9 * x0;
+  s2 = sinpi_3_9 * x1;
+  s3 = sinpi_4_9 * x2;
+  s4 = sinpi_1_9 * x2;
+  s5 = sinpi_2_9 * x3;
+  s6 = sinpi_4_9 * x3;
+  s7 = x0 - x2 + x3;
+
+  x0 = s0 + s3 + s5;
+  x1 = s1 - s4 - s6;
+  x2 = sinpi_3_9 * s7;
+  x3 = s2;
+
+  s0 = x0 + x3;
+  s1 = x1 + x3;
+  s2 = x2;
+  s3 = x0 + x1 - x3;
+
+  // 1-D transform scaling factor is sqrt(2).
+  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+  // + 1b (addition) = 29b.
+  // Hence the output bit depth is 15b.
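
For reference, the DC-only fast path in vp9_idct4x4_1_add_dspr2 above is the quad-byte form of the scalar code below: replv.qb broadcasts the (absolute) DC offset into all four bytes of a word, and addu_s.qb / subu_s.qb apply it to four pixels at once with unsigned saturation, which is what lets the assembly drop the explicit clip_pixel(). A self-contained sketch, assuming DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64() expands to the same double multiply used by the generic vp9_idct4x4_1_add_c (the _ref name is illustrative):

#include <stdint.h>

#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

static const int cospi_16_64 = 11585;  /* value from vp9_idct.h */

static int dct_const_round_shift(int input) {
  return ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
}

static uint8_t clip_pixel(int val) {
  return (uint8_t)(val > 255 ? 255 : val < 0 ? 0 : val);
}

static void idct4x4_1_add_ref(const int16_t *input, uint8_t *dest,
                              int dest_stride) {
  int r, c;
  int out = dct_const_round_shift(input[0] * cospi_16_64);
  int a1;
  out = dct_const_round_shift(out * cospi_16_64);
  a1 = ROUND_POWER_OF_TWO(out, 4);         /* the "addi ..., 8; sra ..., 4" pair */
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c)
      dest[c] = clip_pixel(dest[c] + a1);  /* one addu_s.qb handles 4 bytes */
    dest += dest_stride;
  }
}
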
+  output[0] = dct_const_round_shift(s0);
+  output[1] = dct_const_round_shift(s1);
+  output[2] = dct_const_round_shift(s2);
+  output[3] = dct_const_round_shift(s3);
+}
+
+void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
+                             int dest_stride, int tx_type) {
+  int i, j;
+  DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
+  int16_t *outptr = out;
+  int16_t temp_in[4 * 4], temp_out[4];
+  uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  switch (tx_type) {
+    case DCT_DCT:   // DCT in both horizontal and vertical
+      vp9_idct4_1d_rows_dspr2(input, outptr);
+      vp9_idct4_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+      break;
+    case ADST_DCT:  // ADST in vertical, DCT in horizontal
+      vp9_idct4_1d_rows_dspr2(input, outptr);
+
+      outptr = out;
+
+      for (i = 0; i < 4; ++i) {
+        iadst4_1d_dspr2(outptr, temp_out);
+
+        for (j = 0; j < 4; ++j)
+          dest[j * dest_stride + i] =
+              clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+                         + dest[j * dest_stride + i]);
+
+        outptr += 4;
+      }
+      break;
+    case DCT_ADST:  // DCT in vertical, ADST in horizontal
+      for (i = 0; i < 4; ++i) {
+        iadst4_1d_dspr2(input, outptr);
+        input += 4;
+        outptr += 4;
+      }
+
+      for (i = 0; i < 4; ++i) {
+        for (j = 0; j < 4; ++j) {
+          temp_in[i * 4 + j] = out[j * 4 + i];
+        }
+      }
+      vp9_idct4_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
+      break;
+    case ADST_ADST:  // ADST in both directions
+      for (i = 0; i < 4; ++i) {
+        iadst4_1d_dspr2(input, outptr);
+        input += 4;
+        outptr += 4;
+      }
+
+      for (i = 0; i < 4; ++i) {
+        for (j = 0; j < 4; ++j)
+          temp_in[j] = out[j * 4 + i];
+        iadst4_1d_dspr2(temp_in, temp_out);
+
+        for (j = 0; j < 4; ++j)
+          dest[j * dest_stride + i] =
+              clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+                         + dest[j * dest_stride + i]);
+      }
+      break;
+    default:
+      printf("vp9_iht4x4_16_add_dspr2 : Invalid tx_type\n");
+      break;
+  }
+}
+#endif  // #if HAVE_DSPR2
diff --git a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
new file mode 100644
index 000000000..93a08401d
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
@@ -0,0 +1,745 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +static void idct8_1d_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + const int const_2_power_13 = 8192; + int Temp0, Temp1, Temp2, Temp3, Temp4; + int i; + + for (i = no_rows; i--; ) { + __asm__ __volatile__ ( + /* + temp_1 = (input[0] + input[4]) * cospi_16_64; + step2_0 = dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[4]) * cospi_16_64; + step2_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 8(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add %[Temp2], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "extp %[Temp4], $ac0, 31 \n\t" + + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + /* + temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64; + step2_2 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 4(%[input]) \n\t" + "lh %[Temp1], 12(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "extp %[Temp3], $ac0, 31 \n\t" + + /* + step1_1 = step2_1 + step2_2; + step1_2 = step2_1 - step2_2; + */ + "add %[step1_1], %[Temp2], %[Temp3] \n\t" + "sub %[step1_2], %[Temp2], %[Temp3] \n\t" + + /* + temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64; + step2_3 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + /* + step1_0 = step2_0 + step2_3; + step1_3 = step2_0 - step2_3; + */ + "add %[step1_0], %[Temp4], %[Temp1] \n\t" + "sub %[step1_3], %[Temp4], %[Temp1] \n\t" + + /* + temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; + step1_4 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 2(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_28_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp1], 14(%[input]) \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_4_64] \n\t" + "extp %[step1_4], $ac0, 31 \n\t" + + /* + temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; + step1_7 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_4_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_28_64] \n\t" + "extp %[step1_7], $ac1, 31 \n\t" + + /* + temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; + step1_5 = dct_const_round_shift(temp_1); + */ + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_12_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_20_64] \n\t" + "extp %[step1_5], $ac0, 31 \n\t" + + /* + temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; + step1_6 = dct_const_round_shift(temp_2); + */ + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac1, %[Temp0], 
%[cospi_20_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "madd $ac1, %[Temp1], %[cospi_12_64] \n\t" + "extp %[step1_6], $ac1, 31 \n\t" + + /* + temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64; + temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64; + */ + "sub %[Temp0], %[step1_7], %[step1_6] \n\t" + "sub %[Temp0], %[Temp0], %[step1_4] \n\t" + "add %[Temp0], %[Temp0], %[step1_5] \n\t" + "sub %[Temp1], %[step1_4], %[step1_5] \n\t" + "sub %[Temp1], %[Temp1], %[step1_6] \n\t" + "add %[Temp1], %[Temp1], %[step1_7] \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "madd $ac0, %[Temp0], %[cospi_16_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_16_64] \n\t" + + /* + step1_4 = step1_4 + step1_5; + step1_7 = step1_6 + step1_7; + */ + "add %[step1_4], %[step1_4], %[step1_5] \n\t" + "add %[step1_7], %[step1_7], %[step1_6] \n\t" + + "extp %[step1_5], $ac0, 31 \n\t" + "extp %[step1_6], $ac1, 31 \n\t" + + "add %[Temp0], %[step1_0], %[step1_7] \n\t" + "sh %[Temp0], 0(%[output]) \n\t" + "add %[Temp1], %[step1_1], %[step1_6] \n\t" + "sh %[Temp1], 16(%[output]) \n\t" + "add %[Temp0], %[step1_2], %[step1_5] \n\t" + "sh %[Temp0], 32(%[output]) \n\t" + "add %[Temp1], %[step1_3], %[step1_4] \n\t" + "sh %[Temp1], 48(%[output]) \n\t" + + "sub %[Temp0], %[step1_3], %[step1_4] \n\t" + "sh %[Temp0], 64(%[output]) \n\t" + "sub %[Temp1], %[step1_2], %[step1_5] \n\t" + "sh %[Temp1], 80(%[output]) \n\t" + "sub %[Temp0], %[step1_1], %[step1_6] \n\t" + "sh %[Temp0], 96(%[output]) \n\t" + "sub %[Temp1], %[step1_0], %[step1_7] \n\t" + "sh %[Temp1], 112(%[output]) \n\t" + + : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1), + [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3), + [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5), + [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7), + [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), + [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), + [Temp4] "=&r" (Temp4) + : [const_2_power_13] "r" (const_2_power_13), + [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64), + [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64), + [cospi_24_64] "r" (cospi_24_64), + [output] "r" (output), [input] "r" (input) + ); + + input += 8; + output += 1; + } +} + +static void idct8_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + int Temp0, Temp1, Temp2, Temp3; + int i; + const int const_2_power_13 = 8192; + uint8_t *dest_pix; + uint8_t *cm = vp9_ff_cropTbl; + + /* prefetch vp9_ff_cropTbl */ + vp9_prefetch_load(vp9_ff_cropTbl); + vp9_prefetch_load(vp9_ff_cropTbl + 32); + vp9_prefetch_load(vp9_ff_cropTbl + 64); + vp9_prefetch_load(vp9_ff_cropTbl + 96); + vp9_prefetch_load(vp9_ff_cropTbl + 128); + vp9_prefetch_load(vp9_ff_cropTbl + 160); + vp9_prefetch_load(vp9_ff_cropTbl + 192); + vp9_prefetch_load(vp9_ff_cropTbl + 224); + + for (i = 0; i < 8; ++i) { + dest_pix = (dest + i); + + __asm__ __volatile__ ( + /* + temp_1 = (input[0] + input[4]) * cospi_16_64; + step2_0 = dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[4]) * cospi_16_64; + step2_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 8(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add 
%[Temp2], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "extp %[step1_6], $ac0, 31 \n\t" + + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + /* + temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64; + step2_2 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 4(%[input]) \n\t" + "lh %[Temp1], 12(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "extp %[Temp3], $ac0, 31 \n\t" + + /* + step1_1 = step2_1 + step2_2; + step1_2 = step2_1 - step2_2; + */ + "add %[step1_1], %[Temp2], %[Temp3] \n\t" + "sub %[step1_2], %[Temp2], %[Temp3] \n\t" + + /* + temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64; + step2_3 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + /* + step1_0 = step2_0 + step2_3; + step1_3 = step2_0 - step2_3; + */ + "add %[step1_0], %[step1_6], %[Temp1] \n\t" + "sub %[step1_3], %[step1_6], %[Temp1] \n\t" + + /* + temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; + step1_4 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 2(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_28_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp1], 14(%[input]) \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_4_64] \n\t" + "extp %[step1_4], $ac0, 31 \n\t" + + /* + temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; + step1_7 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_4_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_28_64] \n\t" + "extp %[step1_7], $ac1, 31 \n\t" + + /* + temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; + step1_5 = dct_const_round_shift(temp_1); + */ + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_12_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_20_64] \n\t" + "extp %[step1_5], $ac0, 31 \n\t" + + /* + temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; + step1_6 = dct_const_round_shift(temp_2); + */ + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac1, %[Temp0], %[cospi_20_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "madd $ac1, %[Temp1], %[cospi_12_64] \n\t" + "extp %[step1_6], $ac1, 31 \n\t" + + /* + temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64; + temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64; + */ + "sub %[Temp0], %[step1_7], %[step1_6] \n\t" + "sub %[Temp0], %[Temp0], %[step1_4] \n\t" + "add %[Temp0], %[Temp0], %[step1_5] \n\t" + "sub %[Temp1], %[step1_4], %[step1_5] \n\t" + "sub %[Temp1], %[Temp1], %[step1_6] \n\t" + "add %[Temp1], %[Temp1], %[step1_7] \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "madd $ac0, %[Temp0], %[cospi_16_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_16_64] \n\t" + + /* + step1_4 = step1_4 + step1_5; + step1_7 = step1_6 + step1_7; + */ + "add %[step1_4], %[step1_4], %[step1_5] \n\t" + "add %[step1_7], %[step1_7], %[step1_6] \n\t" + + "extp %[step1_5], $ac0, 31 \n\t" + "extp 
%[step1_6], $ac1, 31 \n\t" + + /* add block */ + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp0], %[step1_0], %[step1_7] \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step1_1], %[step1_6] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step1_2], %[step1_5] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step1_3], %[step1_4] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_3], %[step1_4] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_2], %[step1_5] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_1], %[step1_6] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_0], %[step1_7] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + + : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1), + [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3), + [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5), + [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7), + [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), + [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), + [dest_pix] "+r" (dest_pix) + : [const_2_power_13] "r" (const_2_power_13), + [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64), + [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64), + [cospi_24_64] "r" (cospi_24_64), + [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride) + ); + + input += 8; + } +} + +void vp9_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[8 * 8]); + int16_t *outptr = out; + uint32_t pos 
= 45;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  // First transform rows
+  idct8_1d_rows_dspr2(input, outptr, 8);
+
+  // Then transform columns and add to dest
+  idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+}
+
+static void iadst8_1d_dspr2(const int16_t *input, int16_t *output) {
+  int s0, s1, s2, s3, s4, s5, s6, s7;
+  int x0, x1, x2, x3, x4, x5, x6, x7;
+
+  x0 = input[7];
+  x1 = input[0];
+  x2 = input[5];
+  x3 = input[2];
+  x4 = input[3];
+  x5 = input[4];
+  x6 = input[1];
+  x7 = input[6];
+
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
+    output[0] = output[1] = output[2] = output[3] = output[4]
+              = output[5] = output[6] = output[7] = 0;
+    return;
+  }
+
+  // stage 1
+  s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
+  s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
+  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+  s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
+  s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+
+  x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS);
+  x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS);
+  x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS);
+  x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS);
+  x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS);
+  x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS);
+  x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS);
+  x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS);
+
+  // stage 2
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
+  s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
+  s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
+  s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+
+  x0 = s0 + s2;
+  x1 = s1 + s3;
+  x2 = s0 - s2;
+  x3 = s1 - s3;
+  x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS);
+  x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS);
+  x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS);
+  x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS);
+
+  // stage 3
+  s2 = cospi_16_64 * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (x6 - x7);
+
+  x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS);
+  x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS);
+  x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS);
+  x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS);
+
+  output[0] = x0;
+  output[1] = -x4;
+  output[2] = x6;
+  output[3] = -x2;
+  output[4] = x3;
+  output[5] = -x7;
+  output[6] = x5;
+  output[7] = -x1;
+}
+
+void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
+                             int dest_stride, int tx_type) {
+  int i, j;
+  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
+  int16_t *outptr = out;
+  int16_t temp_in[8 * 8], temp_out[8];
+  uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  switch (tx_type) {
+    case DCT_DCT:   // DCT in both horizontal and vertical
+      idct8_1d_rows_dspr2(input, outptr, 8);
+      idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+      break;
+    case ADST_DCT:  // ADST in vertical, DCT in horizontal
+      idct8_1d_rows_dspr2(input, outptr, 8);
+
+      for (i = 0; i < 8; ++i) {
+        iadst8_1d_dspr2(&out[i * 8], temp_out);
+
+        for (j = 0; j < 8; ++j)
+          dest[j * dest_stride + i] =
+              clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+                         + dest[j * dest_stride + i]);
+      }
+      break;
+    case DCT_ADST:  // DCT in vertical, ADST in horizontal
+      for (i = 0; i < 8; ++i) {
+        iadst8_1d_dspr2(input, outptr);
+        input += 8;
+        outptr += 8;
+      }
+
+      for (i = 0; i < 8; ++i) {
+        for (j = 0; j < 8; ++j) {
+          temp_in[i * 8 + j] = out[j * 8 + i];
+        }
+      }
+      idct8_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
+      break;
+    case ADST_ADST:  // ADST in both directions
+      for (i = 0; i < 8; ++i) {
+        iadst8_1d_dspr2(input, outptr);
+        input += 8;
+        outptr += 8;
+      }
+
+      for (i = 0; i < 8; ++i) {
+        for (j = 0; j < 8; ++j)
+          temp_in[j] = out[j * 8 + i];
+
+        iadst8_1d_dspr2(temp_in, temp_out);
+
+        for (j = 0; j < 8; ++j)
+          dest[j * dest_stride + i] =
+              clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+                         + dest[j * dest_stride + i]);
+      }
+      break;
+    default:
+      printf("vp9_iht8x8_64_add_dspr2 : Invalid tx_type\n");
+      break;
+  }
+}
+
+void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest,
+                              int dest_stride) {
+  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
+  int16_t *outptr = out;
+  uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  // First transform rows
+  idct8_1d_rows_dspr2(input, outptr, 4);
+
+  outptr += 4;
+
+  __asm__ __volatile__ (
+      "sw  $zero,   0(%[outptr])  \n\t"
+      "sw  $zero,   4(%[outptr])  \n\t"
+      "sw  $zero,  16(%[outptr])  \n\t"
+      "sw  $zero,  20(%[outptr])  \n\t"
+      "sw  $zero,  32(%[outptr])  \n\t"
+      "sw  $zero,  36(%[outptr])  \n\t"
+      "sw  $zero,  48(%[outptr])  \n\t"
+      "sw  $zero,  52(%[outptr])  \n\t"
+      "sw  $zero,  64(%[outptr])  \n\t"
+      "sw  $zero,  68(%[outptr])  \n\t"
+      "sw  $zero,  80(%[outptr])  \n\t"
+      "sw  $zero,  84(%[outptr])  \n\t"
+      "sw  $zero,  96(%[outptr])  \n\t"
+      "sw  $zero, 100(%[outptr])  \n\t"
+      "sw  $zero, 112(%[outptr])  \n\t"
+      "sw  $zero, 116(%[outptr])  \n\t"
+
+      :
+      : [outptr] "r" (outptr)
+  );
+
+
+  // Then transform columns and add to dest
+  idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+}
+
+void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
+                             int dest_stride) {
+  uint32_t pos = 45;
+  int32_t out;
+  int32_t r;
+  int32_t a1, absa1;
+  int32_t t1, t2, vector_a1, vector_1, vector_2;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+
+    :
+    : [pos] "r" (pos)
+  );
+
+  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
+  __asm__ __volatile__ (
+    "addi     %[out],     %[out],     16      \n\t"
+    "sra      %[a1],      %[out],     5       \n\t"
+
+    : [out] "+r" (out), [a1] "=r" (a1)
+    :
+  );
+
+  if (a1 < 0) {
+    /* use quad-byte operations:
+     * input and output memory are four-byte aligned */
+    __asm__ __volatile__ (
+      "abs        %[absa1],     %[a1]         \n\t"
+      "replv.qb   %[vector_a1], %[absa1]      \n\t"
+
+      : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+      : [a1] "r" (a1)
+    );
+
+    for (r = 8; r--;) {
+      __asm__ __volatile__ (
+        "lw           %[t1],        0(%[dest])                      \n\t"
+        "lw           %[t2],        4(%[dest])                      \n\t"
+        "subu_s.qb    %[vector_1],  %[t1],          %[vector_a1]    \n\t"
+        "subu_s.qb    %[vector_2],  %[t2],          %[vector_a1]    \n\t"
+        "sw           %[vector_1],  0(%[dest])                      \n\t"
+        "sw           %[vector_2],  4(%[dest])                      \n\t"
+        "add          %[dest],      %[dest],        %[dest_stride]  \n\t"
+
+        : [t1] "=&r" (t1), [t2] "=&r" (t2),
+          [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+          [dest] "+&r" (dest)
+        : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+      );
+    }
+  } else {
+    /* use quad-byte operations:
+     * input and output memory are four-byte aligned */
+    __asm__ __volatile__ (
+      "replv.qb   %[vector_a1], %[a1]     \n\t"
+
+      : [vector_a1] "=r" (vector_a1)
+      : [a1] "r" (a1)
+    );
+
+    for (r = 8; r--;) {
+      __asm__ __volatile__ (
+        "lw           %[t1],        0(%[dest])                      \n\t"
+        "lw           %[t2],        4(%[dest])                      \n\t"
+        "addu_s.qb    %[vector_1],  %[t1],          %[vector_a1]    \n\t"
+        "addu_s.qb    %[vector_2],  %[t2],          %[vector_a1]
\n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t1] "=&r" (t1), [t2] "=&r" (t2), + [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), + [dest] "+r" (dest) + : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) + ); + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index 0f50f374d..0d65651f0 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -41,17 +41,12 @@ void vp9_free_frame_buffers(VP9_COMMON *cm) { vpx_free(cm->mip); vpx_free(cm->prev_mip); - vpx_free(cm->above_seg_context); vpx_free(cm->last_frame_seg_map); vpx_free(cm->mi_grid_base); vpx_free(cm->prev_mi_grid_base); - vpx_free(cm->above_context[0]); - for (i = 0; i < MAX_MB_PLANE; i++) - cm->above_context[i] = 0; cm->mip = NULL; cm->prev_mip = NULL; - cm->above_seg_context = NULL; cm->last_frame_seg_map = NULL; cm->mi_grid_base = NULL; cm->prev_mi_grid_base = NULL; @@ -85,7 +80,7 @@ static void setup_mi(VP9_COMMON *cm) { } int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) { - int i, mi_cols; + int i; const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2); const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2); @@ -140,21 +135,6 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) { setup_mi(cm); - // FIXME(jkoleszar): allocate subsampled arrays for U/V once subsampling - // information is exposed at this level - mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); - - // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm - // block where mi unit size is 8x8. - cm->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * MAX_MB_PLANE * - (2 * mi_cols), 1); - if (!cm->above_context[0]) - goto fail; - - cm->above_seg_context = vpx_calloc(sizeof(PARTITION_CONTEXT) * mi_cols, 1); - if (!cm->above_seg_context) - goto fail; - // Create the segmentation map structure and set to 0. cm->last_frame_seg_map = vpx_calloc(cm->mi_rows * cm->mi_cols, 1); if (!cm->last_frame_seg_map) @@ -186,18 +166,12 @@ void vp9_initialize_common() { } void vp9_update_frame_size(VP9_COMMON *cm) { - int i, mi_cols; const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, MI_SIZE_LOG2); const int aligned_height = ALIGN_POWER_OF_TWO(cm->height, MI_SIZE_LOG2); set_mb_mi(cm, aligned_width, aligned_height); setup_mi(cm); - mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); - for (i = 1; i < MAX_MB_PLANE; i++) - cm->above_context[i] = - cm->above_context[0] + i * sizeof(ENTROPY_CONTEXT) * 2 * mi_cols; - // Initialize the previous frame segment map to 0. 
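
Stepping out of the MIPS code for a moment: the vp9_alloccommon.c hunks above, paired with the vp9_blockd.h hunk below, move the above_context / above_seg_context arrays out of VP9_COMMON and into MACROBLOCKD, so the per-block entropy and partition contexts travel with the (per-tile) coding state instead of living in one global allocation. For reference, the allocation being deleted sized two ENTROPY_CONTEXTs per 8x8 mi unit (one per 4x4 transform block) per plane across an SB-aligned row. A sketch, assuming the usual vp9 headers for ENTROPY_CONTEXT, MAX_MB_PLANE and mi_cols_aligned_to_sb() are in scope (the helper name itself is illustrative):

#include <stddef.h>

/* Bytes the old VP9_COMMON above-context row occupied. */
static size_t above_context_bytes(int mi_cols) {
  const int aligned_mi_cols = mi_cols_aligned_to_sb(mi_cols);
  /* 2 contexts per mi unit: one per 4x4 txfm block in an 8x8 mi unit. */
  return sizeof(ENTROPY_CONTEXT) * MAX_MB_PLANE * 2 * aligned_mi_cols;
}
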
if (cm->last_frame_seg_map) vpx_memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols); diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index 1a03269fb..d0d485272 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -74,10 +74,6 @@ typedef enum { MB_MODE_COUNT } MB_PREDICTION_MODE; -static INLINE int is_intra_mode(MB_PREDICTION_MODE mode) { - return mode <= TM_PRED; -} - static INLINE int is_inter_mode(MB_PREDICTION_MODE mode) { return mode >= NEARESTMV && mode <= NEWMV; } @@ -140,7 +136,7 @@ typedef struct { // Flags used for prediction status of various bit-stream signals unsigned char seg_id_predicted; - INTERPOLATIONFILTERTYPE interp_filter; + INTERPOLATION_TYPE interp_filter; BLOCK_SIZE sb_type; } MB_MODE_INFO; @@ -226,6 +222,13 @@ typedef struct macroblockd { unsigned char ab_index; // index of 4x4 block inside the 8x8 block int q_index; + + /* Y,U,V,(A) */ + ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; + ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16]; + + PARTITION_CONTEXT *above_seg_context; + PARTITION_CONTEXT left_seg_context[8]; } MACROBLOCKD; @@ -414,7 +417,7 @@ static void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize, *y = (raster_mb >> tx_cols_log2) << tx_size; } -static void extend_for_intra(MACROBLOCKD* const xd, BLOCK_SIZE plane_bsize, +static void extend_for_intra(MACROBLOCKD *xd, BLOCK_SIZE plane_bsize, int plane, int block, TX_SIZE tx_size) { struct macroblockd_plane *const pd = &xd->plane[plane]; uint8_t *const buf = pd->dst.buf; @@ -439,19 +442,22 @@ static void extend_for_intra(MACROBLOCKD* const xd, BLOCK_SIZE plane_bsize, } if (xd->mb_to_bottom_edge < 0) { - const int bh = 4 << b_height_log2(plane_bsize); - const int umv_border_start = bh + (xd->mb_to_bottom_edge >> - (3 + pd->subsampling_y)); - int i; - const uint8_t c = buf[(umv_border_start - 1) * stride + x]; - uint8_t *d = &buf[umv_border_start * stride + x]; - - if (y + bh > umv_border_start) - for (i = 0; i < bh; ++i, d += stride) - *d = c; + if (xd->left_available || x >= 0) { + const int bh = 4 << b_height_log2(plane_bsize); + const int umv_border_start = + bh + (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)); + + if (y + bh > umv_border_start) { + const uint8_t c = buf[(umv_border_start - 1) * stride + x]; + uint8_t *d = &buf[umv_border_start * stride + x]; + int i; + for (i = 0; i < bh; ++i, d += stride) + *d = c; + } + } } } -static void set_contexts_on_border(MACROBLOCKD *xd, +static void set_contexts_on_border(const MACROBLOCKD *xd, struct macroblockd_plane *pd, BLOCK_SIZE plane_bsize, int tx_size_in_blocks, int has_eob, @@ -489,7 +495,7 @@ static void set_contexts_on_border(MACROBLOCKD *xd, L[pt] = 0; } -static void set_contexts(MACROBLOCKD *xd, struct macroblockd_plane *pd, +static void set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob, int aoff, int loff) { ENTROPY_CONTEXT *const A = pd->above_context + aoff; diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h index 3ac192b4a..36d1cdf14 100644 --- a/vp9/common/vp9_common.h +++ b/vp9/common/vp9_common.h @@ -84,9 +84,11 @@ static int get_unsigned_bits(unsigned int num_values) { } while (0) #endif -#define SYNC_CODE_0 0x49 -#define SYNC_CODE_1 0x83 -#define SYNC_CODE_2 0x42 +#define VP9_SYNC_CODE_0 0x49 +#define VP9_SYNC_CODE_1 0x83 +#define VP9_SYNC_CODE_2 0x42 + +#define VP9_FRAME_MARKER 0x2 #endif // VP9_COMMON_VP9_COMMON_H_ diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c index 2640ac72b..d3a867c3f 100644 --- 
a/vp9/common/vp9_entropy.c +++ b/vp9/common/vp9_entropy.c @@ -322,9 +322,8 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size, vp9_coeff_count_model *coef_counts = cm->counts.coef[tx_size]; unsigned int (*eob_branch_count)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] = cm->counts.eob_branch[tx_size]; - int t, i, j, k, l; + int i, j, k, l, m; unsigned int branch_ct[UNCONSTRAINED_NODES][2]; - vp9_prob coef_probs[UNCONSTRAINED_NODES]; for (i = 0; i < BLOCK_TYPES; ++i) for (j = 0; j < REF_TYPES; ++j) @@ -332,15 +331,14 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size, for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { if (l >= 3 && k == 0) continue; - vp9_tree_probs_from_distribution(vp9_coefmodel_tree, coef_probs, - branch_ct, coef_counts[i][j][k][l], - 0); + vp9_tree_probs_from_distribution(vp9_coefmodel_tree, branch_ct, + coef_counts[i][j][k][l], 0); branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0]; - coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]); - for (t = 0; t < UNCONSTRAINED_NODES; ++t) - dst_coef_probs[i][j][k][l][t] = merge_probs( - pre_coef_probs[i][j][k][l][t], coef_probs[t], - branch_ct[t], count_sat, update_factor); + for (m = 0; m < UNCONSTRAINED_NODES; ++m) + dst_coef_probs[i][j][k][l][m] = merge_probs( + pre_coef_probs[i][j][k][l][m], + branch_ct[m], + count_sat, update_factor); } } diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h index ec7d09a00..c58e852fe 100644 --- a/vp9/common/vp9_entropy.h +++ b/vp9/common/vp9_entropy.h @@ -153,8 +153,8 @@ typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS] void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full); -static int get_entropy_context(TX_SIZE tx_size, - ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) { +static int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, + const ENTROPY_CONTEXT *l) { ENTROPY_CONTEXT above_ec = 0, left_ec = 0; switch (tx_size) { @@ -163,16 +163,16 @@ static int get_entropy_context(TX_SIZE tx_size, left_ec = l[0] != 0; break; case TX_8X8: - above_ec = !!*(uint16_t *)a; - left_ec = !!*(uint16_t *)l; + above_ec = !!*(const uint16_t *)a; + left_ec = !!*(const uint16_t *)l; break; case TX_16X16: - above_ec = !!*(uint32_t *)a; - left_ec = !!*(uint32_t *)l; + above_ec = !!*(const uint32_t *)a; + left_ec = !!*(const uint32_t *)l; break; case TX_32X32: - above_ec = !!*(uint64_t *)a; - left_ec = !!*(uint64_t *)l; + above_ec = !!*(const uint64_t *)a; + left_ec = !!*(const uint64_t *)l; break; default: assert(!"Invalid transform size."); diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c index 3347b35de..a963d55e6 100644 --- a/vp9/common/vp9_entropymode.c +++ b/vp9/common/vp9_entropymode.c @@ -161,51 +161,52 @@ static const vp9_prob default_if_uv_probs[INTRA_MODES][INTRA_MODES - 1] = { { 101, 21, 107, 181, 192, 103, 19, 67, 125 } // y = tm }; -static const vp9_prob default_partition_probs[FRAME_TYPES][PARTITION_CONTEXTS] +const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS] + [PARTITION_TYPES - 1] = { + // 8x8 -> 4x4 + { 158, 97, 94 }, // a/l both not split + { 93, 24, 99 }, // a split, l not split + { 85, 119, 44 }, // l split, a not split + { 62, 59, 67 }, // a/l both split + // 16x16 -> 8x8 + { 149, 53, 53 }, // a/l both not split + { 94, 20, 48 }, // a split, l not split + { 83, 53, 24 }, // l split, a not split + { 52, 18, 18 }, // a/l both split + // 32x32 -> 16x16 + { 150, 40, 39 }, // a/l both not split + { 78, 12, 26 }, // a split, l not split + { 67, 33, 11 }, // l split, a not 
split + { 24, 7, 5 }, // a/l both split + // 64x64 -> 32x32 + { 174, 35, 49 }, // a/l both not split + { 68, 11, 27 }, // a split, l not split + { 57, 15, 9 }, // l split, a not split + { 12, 3, 3 }, // a/l both split +}; + +static const vp9_prob default_partition_probs[PARTITION_CONTEXTS] [PARTITION_TYPES - 1] = { - { // frame_type = keyframe - // 8x8 -> 4x4 - { 158, 97, 94 }, // a/l both not split - { 93, 24, 99 }, // a split, l not split - { 85, 119, 44 }, // l split, a not split - { 62, 59, 67 }, // a/l both split - // 16x16 -> 8x8 - { 149, 53, 53 }, // a/l both not split - { 94, 20, 48 }, // a split, l not split - { 83, 53, 24 }, // l split, a not split - { 52, 18, 18 }, // a/l both split - // 32x32 -> 16x16 - { 150, 40, 39 }, // a/l both not split - { 78, 12, 26 }, // a split, l not split - { 67, 33, 11 }, // l split, a not split - { 24, 7, 5 }, // a/l both split - // 64x64 -> 32x32 - { 174, 35, 49 }, // a/l both not split - { 68, 11, 27 }, // a split, l not split - { 57, 15, 9 }, // l split, a not split - { 12, 3, 3 }, // a/l both split - }, { // frame_type = interframe - // 8x8 -> 4x4 - { 199, 122, 141 }, // a/l both not split - { 147, 63, 159 }, // a split, l not split - { 148, 133, 118 }, // l split, a not split - { 121, 104, 114 }, // a/l both split - // 16x16 -> 8x8 - { 174, 73, 87 }, // a/l both not split - { 92, 41, 83 }, // a split, l not split - { 82, 99, 50 }, // l split, a not split - { 53, 39, 39 }, // a/l both split - // 32x32 -> 16x16 - { 177, 58, 59 }, // a/l both not split - { 68, 26, 63 }, // a split, l not split - { 52, 79, 25 }, // l split, a not split - { 17, 14, 12 }, // a/l both split - // 64x64 -> 32x32 - { 222, 34, 30 }, // a/l both not split - { 72, 16, 44 }, // a split, l not split - { 58, 32, 12 }, // l split, a not split - { 10, 7, 6 }, // a/l both split - } + // 8x8 -> 4x4 + { 199, 122, 141 }, // a/l both not split + { 147, 63, 159 }, // a split, l not split + { 148, 133, 118 }, // l split, a not split + { 121, 104, 114 }, // a/l both split + // 16x16 -> 8x8 + { 174, 73, 87 }, // a/l both not split + { 92, 41, 83 }, // a split, l not split + { 82, 99, 50 }, // l split, a not split + { 53, 39, 39 }, // a/l both split + // 32x32 -> 16x16 + { 177, 58, 59 }, // a/l both not split + { 68, 26, 63 }, // a split, l not split + { 52, 79, 25 }, // l split, a not split + { 17, 14, 12 }, // a/l both split + // 64x64 -> 32x32 + { 222, 34, 30 }, // a/l both not split + { 72, 16, 44 }, // a split, l not split + { 58, 32, 12 }, // l split, a not split + { 10, 7, 6 }, // a/l both split }; static const vp9_prob default_inter_mode_probs[INTER_MODE_CONTEXTS] @@ -309,8 +310,8 @@ static const vp9_prob default_mbskip_probs[MBSKIP_CONTEXTS] = { 192, 128, 64 }; -static const vp9_prob default_switchable_interp_prob[SWITCHABLE_FILTERS+1] - [SWITCHABLE_FILTERS-1] = { +static const vp9_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS] + [SWITCHABLE_FILTERS - 1] = { { 235, 162, }, { 36, 255, }, { 34, 3, }, @@ -349,29 +350,15 @@ void vp9_entropy_mode_init() { #define COUNT_SAT 20 #define MAX_UPDATE_FACTOR 128 -static int update_ct(vp9_prob pre_prob, vp9_prob prob, - const unsigned int ct[2]) { - return merge_probs(pre_prob, prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR); -} - -static int update_ct2(vp9_prob pre_prob, const unsigned int ct[2]) { - return merge_probs2(pre_prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR); +static int adapt_prob(vp9_prob pre_prob, const unsigned int ct[2]) { + return merge_probs(pre_prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR); } -static void 
update_mode_probs(int n_modes, - const vp9_tree_index *tree, - const unsigned int *cnt, - const vp9_prob *pre_probs, vp9_prob *dst_probs, - unsigned int tok0_offset) { -#define MAX_PROBS 32 - vp9_prob probs[MAX_PROBS]; - unsigned int branch_ct[MAX_PROBS][2]; - int t; - - assert(n_modes - 1 < MAX_PROBS); - vp9_tree_probs_from_distribution(tree, probs, branch_ct, cnt, tok0_offset); - for (t = 0; t < n_modes - 1; ++t) - dst_probs[t] = update_ct(pre_probs[t], probs[t], branch_ct[t]); +static void adapt_probs(const vp9_tree_index *tree, + const vp9_prob *pre_probs, const unsigned int *counts, + unsigned int offset, vp9_prob *probs) { + tree_merge_probs(tree, pre_probs, counts, offset, + COUNT_SAT, MAX_UPDATE_FACTOR, probs); } void vp9_adapt_mode_probs(VP9_COMMON *cm) { @@ -381,46 +368,40 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { const FRAME_COUNTS *counts = &cm->counts; for (i = 0; i < INTRA_INTER_CONTEXTS; i++) - fc->intra_inter_prob[i] = update_ct2(pre_fc->intra_inter_prob[i], + fc->intra_inter_prob[i] = adapt_prob(pre_fc->intra_inter_prob[i], counts->intra_inter[i]); for (i = 0; i < COMP_INTER_CONTEXTS; i++) - fc->comp_inter_prob[i] = update_ct2(pre_fc->comp_inter_prob[i], + fc->comp_inter_prob[i] = adapt_prob(pre_fc->comp_inter_prob[i], counts->comp_inter[i]); for (i = 0; i < REF_CONTEXTS; i++) - fc->comp_ref_prob[i] = update_ct2(pre_fc->comp_ref_prob[i], + fc->comp_ref_prob[i] = adapt_prob(pre_fc->comp_ref_prob[i], counts->comp_ref[i]); for (i = 0; i < REF_CONTEXTS; i++) for (j = 0; j < 2; j++) - fc->single_ref_prob[i][j] = update_ct2(pre_fc->single_ref_prob[i][j], + fc->single_ref_prob[i][j] = adapt_prob(pre_fc->single_ref_prob[i][j], counts->single_ref[i][j]); for (i = 0; i < INTER_MODE_CONTEXTS; i++) - update_mode_probs(INTER_MODES, vp9_inter_mode_tree, - counts->inter_mode[i], pre_fc->inter_mode_probs[i], - fc->inter_mode_probs[i], NEARESTMV); + adapt_probs(vp9_inter_mode_tree, pre_fc->inter_mode_probs[i], + counts->inter_mode[i], NEARESTMV, fc->inter_mode_probs[i]); for (i = 0; i < BLOCK_SIZE_GROUPS; i++) - update_mode_probs(INTRA_MODES, vp9_intra_mode_tree, - counts->y_mode[i], pre_fc->y_mode_prob[i], - fc->y_mode_prob[i], 0); + adapt_probs(vp9_intra_mode_tree, pre_fc->y_mode_prob[i], + counts->y_mode[i], 0, fc->y_mode_prob[i]); for (i = 0; i < INTRA_MODES; ++i) - update_mode_probs(INTRA_MODES, vp9_intra_mode_tree, - counts->uv_mode[i], pre_fc->uv_mode_prob[i], - fc->uv_mode_prob[i], 0); + adapt_probs(vp9_intra_mode_tree, pre_fc->uv_mode_prob[i], + counts->uv_mode[i], 0, fc->uv_mode_prob[i]); for (i = 0; i < PARTITION_CONTEXTS; i++) - update_mode_probs(PARTITION_TYPES, vp9_partition_tree, - counts->partition[i], - pre_fc->partition_prob[INTER_FRAME][i], - fc->partition_prob[INTER_FRAME][i], 0); + adapt_probs(vp9_partition_tree, pre_fc->partition_prob[i], + counts->partition[i], 0, fc->partition_prob[i]); if (cm->mcomp_filter_type == SWITCHABLE) { - for (i = 0; i <= SWITCHABLE_FILTERS; i++) - update_mode_probs(SWITCHABLE_FILTERS, vp9_switchable_interp_tree, - counts->switchable_interp[i], - pre_fc->switchable_interp_prob[i], - fc->switchable_interp_prob[i], 0); + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) + adapt_probs(vp9_switchable_interp_tree, pre_fc->switchable_interp_prob[i], + counts->switchable_interp[i], 0, + fc->switchable_interp_prob[i]); } if (cm->tx_mode == TX_MODE_SELECT) { @@ -432,23 +413,23 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], branch_ct_8x8p); for (j = 0; j < 
TX_SIZES - 3; ++j) - fc->tx_probs.p8x8[i][j] = update_ct2(pre_fc->tx_probs.p8x8[i][j], + fc->tx_probs.p8x8[i][j] = adapt_prob(pre_fc->tx_probs.p8x8[i][j], branch_ct_8x8p[j]); tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p); for (j = 0; j < TX_SIZES - 2; ++j) - fc->tx_probs.p16x16[i][j] = update_ct2(pre_fc->tx_probs.p16x16[i][j], + fc->tx_probs.p16x16[i][j] = adapt_prob(pre_fc->tx_probs.p16x16[i][j], branch_ct_16x16p[j]); tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p); for (j = 0; j < TX_SIZES - 1; ++j) - fc->tx_probs.p32x32[i][j] = update_ct2(pre_fc->tx_probs.p32x32[i][j], + fc->tx_probs.p32x32[i][j] = adapt_prob(pre_fc->tx_probs.p32x32[i][j], branch_ct_32x32p[j]); } } for (i = 0; i < MBSKIP_CONTEXTS; ++i) - fc->mbskip_probs[i] = update_ct2(pre_fc->mbskip_probs[i], + fc->mbskip_probs[i] = adapt_prob(pre_fc->mbskip_probs[i], counts->mbskip[i]); } diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h index ab37b75c6..38b419948 100644 --- a/vp9/common/vp9_entropymode.h +++ b/vp9/common/vp9_entropymode.h @@ -16,6 +16,7 @@ #define TX_SIZE_CONTEXTS 2 #define SWITCHABLE_FILTERS 3 // number of switchable filters +#define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1) // #define MODE_STATS @@ -37,6 +38,9 @@ extern const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1]; extern const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES] [INTRA_MODES - 1]; +extern const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS] + [PARTITION_TYPES - 1]; + extern const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)]; extern struct vp9_token vp9_intra_mode_encodings[INTRA_MODES]; diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c index f70b571ef..b061cdb38 100644 --- a/vp9/common/vp9_entropymv.c +++ b/vp9/common/vp9_entropymv.c @@ -191,60 +191,47 @@ void vp9_inc_mv(const MV *mv, nmv_context_counts *counts) { } static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) { - return merge_probs2(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR); + return merge_probs(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR); } -static unsigned int adapt_probs(unsigned int i, - vp9_tree tree, - vp9_prob this_probs[], - const vp9_prob last_probs[], - const unsigned int num_events[]) { - const unsigned int left = tree[i] <= 0 - ? num_events[-tree[i]] - : adapt_probs(tree[i], tree, this_probs, last_probs, num_events); - - const unsigned int right = tree[i + 1] <= 0 - ? 
num_events[-tree[i + 1]] - : adapt_probs(tree[i + 1], tree, this_probs, last_probs, num_events); - const unsigned int ct[2] = { left, right }; - this_probs[i >> 1] = adapt_prob(last_probs[i >> 1], ct); - return left + right; +static void adapt_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, + const unsigned int *counts, vp9_prob *probs) { + tree_merge_probs(tree, pre_probs, counts, 0, + MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR, probs); } - void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) { int i, j; - const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx]; - - nmv_context *ctx = &cm->fc.nmvc; - const nmv_context *pre_ctx = &pre_fc->nmvc; - const nmv_context_counts *cts = &cm->counts.mv; + nmv_context *fc = &cm->fc.nmvc; + const nmv_context *pre_fc = &cm->frame_contexts[cm->frame_context_idx].nmvc; + const nmv_context_counts *counts = &cm->counts.mv; - adapt_probs(0, vp9_mv_joint_tree, ctx->joints, pre_ctx->joints, cts->joints); + adapt_probs(vp9_mv_joint_tree, pre_fc->joints, counts->joints, + fc->joints); for (i = 0; i < 2; ++i) { - ctx->comps[i].sign = adapt_prob(pre_ctx->comps[i].sign, cts->comps[i].sign); - adapt_probs(0, vp9_mv_class_tree, ctx->comps[i].classes, - pre_ctx->comps[i].classes, cts->comps[i].classes); - adapt_probs(0, vp9_mv_class0_tree, ctx->comps[i].class0, - pre_ctx->comps[i].class0, cts->comps[i].class0); + nmv_component *comp = &fc->comps[i]; + const nmv_component *pre_comp = &pre_fc->comps[i]; + const nmv_component_counts *c = &counts->comps[i]; + + comp->sign = adapt_prob(pre_comp->sign, c->sign); + adapt_probs(vp9_mv_class_tree, pre_comp->classes, c->classes, + comp->classes); + adapt_probs(vp9_mv_class0_tree, pre_comp->class0, c->class0, comp->class0); for (j = 0; j < MV_OFFSET_BITS; ++j) - ctx->comps[i].bits[j] = adapt_prob(pre_ctx->comps[i].bits[j], - cts->comps[i].bits[j]); + comp->bits[j] = adapt_prob(pre_comp->bits[j], c->bits[j]); for (j = 0; j < CLASS0_SIZE; ++j) - adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].class0_fp[j], - pre_ctx->comps[i].class0_fp[j], cts->comps[i].class0_fp[j]); + adapt_probs(vp9_mv_fp_tree, pre_comp->class0_fp[j], c->class0_fp[j], + comp->class0_fp[j]); - adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].fp, pre_ctx->comps[i].fp, - cts->comps[i].fp); + adapt_probs(vp9_mv_fp_tree, pre_comp->fp, c->fp, comp->fp); if (allow_hp) { - ctx->comps[i].class0_hp = adapt_prob(pre_ctx->comps[i].class0_hp, - cts->comps[i].class0_hp); - ctx->comps[i].hp = adapt_prob(pre_ctx->comps[i].hp, cts->comps[i].hp); + comp->class0_hp = adapt_prob(pre_comp->class0_hp, c->class0_hp); + comp->hp = adapt_prob(pre_comp->hp, c->hp); } } } diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h index 768ff2c94..1651b9050 100644 --- a/vp9/common/vp9_enums.h +++ b/vp9/common/vp9_enums.h @@ -76,4 +76,15 @@ typedef enum { ADST_ADST = 3 // ADST in both directions } TX_TYPE; +typedef enum { + UNKNOWN = 0, + BT_601 = 1, // YUV + BT_709 = 2, // YUV + SMPTE_170 = 3, // YUV + SMPTE_240 = 4, // YUV + RESERVED_1 = 5, + RESERVED_2 = 6, + SRGB = 7 // RGB +} COLOR_SPACE; + #endif // VP9_COMMON_VP9_ENUMS_H_ diff --git a/vp9/common/vp9_filter.c b/vp9/common/vp9_filter.c index cedd44cad..79ace147c 100644 --- a/vp9/common/vp9_filter.c +++ b/vp9/common/vp9_filter.c @@ -97,19 +97,15 @@ DECLARE_ALIGNED(256, const subpel_kernel, { 0, -3, 1, 38, 64, 32, -1, -3} }; -const subpel_kernel *vp9_get_filter_kernel(INTERPOLATIONFILTERTYPE type) { - switch (type) { - case EIGHTTAP: - return vp9_sub_pel_filters_8; - case EIGHTTAP_SMOOTH: - return 
vp9_sub_pel_filters_8lp; - case EIGHTTAP_SHARP: - return vp9_sub_pel_filters_8s; - case BILINEAR: - return vp9_bilinear_filters; - default: - assert(!"Invalid filter type."); - return NULL; - } + +static const subpel_kernel* vp9_filter_kernels[4] = { + vp9_sub_pel_filters_8, + vp9_sub_pel_filters_8lp, + vp9_sub_pel_filters_8s, + vp9_bilinear_filters +}; + +const subpel_kernel *vp9_get_filter_kernel(INTERPOLATION_TYPE type) { + return vp9_filter_kernels[type]; } diff --git a/vp9/common/vp9_filter.h b/vp9/common/vp9_filter.h index 302945374..b1e7e6499 100644 --- a/vp9/common/vp9_filter.h +++ b/vp9/common/vp9_filter.h @@ -27,7 +27,7 @@ typedef enum { EIGHTTAP_SHARP = 2, BILINEAR = 3, SWITCHABLE = 4 /* should be the last one */ -} INTERPOLATIONFILTERTYPE; +} INTERPOLATION_TYPE; typedef int16_t subpel_kernel[SUBPEL_TAPS]; @@ -36,10 +36,9 @@ struct subpix_fn_table { const subpel_kernel *filter_y; }; -const subpel_kernel *vp9_get_filter_kernel(INTERPOLATIONFILTERTYPE type); +const subpel_kernel *vp9_get_filter_kernel(INTERPOLATION_TYPE type); extern const subpel_kernel vp9_bilinear_filters[SUBPEL_SHIFTS]; -extern const subpel_kernel vp9_sub_pel_filters_6[SUBPEL_SHIFTS]; extern const subpel_kernel vp9_sub_pel_filters_8[SUBPEL_SHIFTS]; extern const subpel_kernel vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]; extern const subpel_kernel vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]; diff --git a/vp9/common/vp9_findnearmv.c b/vp9/common/vp9_findnearmv.c index 592ef6afa..b91c50143 100644 --- a/vp9/common/vp9_findnearmv.c +++ b/vp9/common/vp9_findnearmv.c @@ -35,6 +35,7 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, } void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, + const TileInfo *const tile, int_mv *dst_nearest, int_mv *dst_near, int block_idx, int ref_idx, @@ -46,7 +47,7 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, assert(ref_idx == 0 || ref_idx == 1); assert(MAX_MV_REF_CANDIDATES == 2); // makes code here slightly easier - vp9_find_mv_refs_idx(cm, xd, mi, xd->last_mi, + vp9_find_mv_refs_idx(cm, xd, tile, mi, xd->last_mi, mi->mbmi.ref_frame[ref_idx], mv_list, block_idx, mi_row, mi_col); diff --git a/vp9/common/vp9_findnearmv.h b/vp9/common/vp9_findnearmv.h index d161d1b3e..2362caa41 100644 --- a/vp9/common/vp9_findnearmv.h +++ b/vp9/common/vp9_findnearmv.h @@ -34,8 +34,8 @@ static void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); } -void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, - MACROBLOCKD *xd, +void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, + const TileInfo *const tile, int_mv *dst_nearest, int_mv *dst_near, int block_idx, int ref_idx, @@ -43,42 +43,30 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mi, const MODE_INFO *left_mi, int b) { - // FIXME(rbultje, jingning): temporary hack because jenkins doesn't - // understand this condition. This will go away soon. - if (b == 0 || b == 2) { - /* On L edge, get from MB to left of us */ - if (!left_mi) + if (!left_mi || is_inter_block(&left_mi->mbmi)) return DC_PRED; - if (is_inter_block(&left_mi->mbmi)) - return DC_PRED; - else - return left_mi->mbmi.sb_type < BLOCK_8X8 ? left_mi->bmi[b + 1].as_mode - : left_mi->mbmi.mode; + return left_mi->mbmi.sb_type < BLOCK_8X8 ? 
left_mi->bmi[b + 1].as_mode + : left_mi->mbmi.mode; + } else { + assert(b == 1 || b == 3); + return cur_mi->bmi[b - 1].as_mode; } - assert(b == 1 || b == 3); - return cur_mi->bmi[b - 1].as_mode; } -static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, - const MODE_INFO *above_mb, int b) { - const MODE_INFO *mi = cur_mb; - - if (!(b >> 1)) { - /* On top edge, get from MB above us */ - mi = above_mb; - if (!mi) +static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mi, + const MODE_INFO *above_mi, int b) { + if (b == 0 || b == 1) { + if (!above_mi || is_inter_block(&above_mi->mbmi)) return DC_PRED; - if (is_inter_block(&mi->mbmi)) - return DC_PRED; - else - return mi->mbmi.sb_type < BLOCK_8X8 ? (mi->bmi + 2 + b)->as_mode - : mi->mbmi.mode; + return above_mi->mbmi.sb_type < BLOCK_8X8 ? above_mi->bmi[b + 2].as_mode + : above_mi->mbmi.mode; + } else { + assert(b == 2 || b == 3); + return cur_mi->bmi[b - 2].as_mode; } - - return (mi->bmi + b - 2)->as_mode; } #endif // VP9_COMMON_VP9_FINDNEARMV_H_ diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c index 52b039d99..ea8683ea1 100644 --- a/vp9/common/vp9_idct.c +++ b/vp9/common/vp9_idct.c @@ -1280,6 +1280,31 @@ void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) { } } +void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) { + int16_t out[32 * 32] = {0}; + int16_t *outptr = out; + int i, j; + int16_t temp_in[32], temp_out[32]; + + // Rows + // only upper-left 8x8 has non-zero coeff + for (i = 0; i < 8; ++i) { + idct32_1d(input, outptr); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) + temp_in[j] = out[j * 32 + i]; + idct32_1d(temp_in, temp_out); + for (j = 0; j < 32; ++j) + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + + dest[j * stride + i]); + } +} + void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) { int i, j; int a1; @@ -1350,6 +1375,9 @@ void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride, if (eob) { if (eob == 1) vp9_idct32x32_1_add(input, dest, stride); + else if (eob <= 34) + // non-zero coeff only in upper-left 8x8 + vp9_idct32x32_34_add(input, dest, stride); else vp9_idct32x32_1024_add(input, dest, stride); } diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c index 85ac6d2bf..218e12e62 100644 --- a/vp9/common/vp9_loopfilter.c +++ b/vp9/common/vp9_loopfilter.c @@ -16,12 +16,6 @@ #include "vp9/common/vp9_seg_common.h" -struct loop_filter_info { - const uint8_t *mblim; - const uint8_t *lim; - const uint8_t *hev_thr; -}; - // This structure holds bit masks for all 8x8 blocks in a 64x64 region. // Each 1 bit represents a position in which we want to apply the loop filter. 
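To make the mask layout described above concrete: a 64x64 superblock is an 8x8 grid of 8x8 blocks, so a single 64-bit word can flag every block position, and the filter pass simply walks the set bits. A minimal sketch of consuming such a row-major mask follows; the callback shape is illustrative only, not an actual libvpx API:

#include <stdint.h>

/* Invoke fn for every 8x8 block whose bit is set; bit (r * 8 + c)
 * corresponds to the block at row r, column c of the superblock. */
static void for_each_set_block(uint64_t mask,
                               void (*fn)(int r, int c, void *ctx),
                               void *ctx) {
  int bit;
  for (bit = 0; mask; ++bit, mask >>= 1) {
    if (mask & 1)
      fn(bit >> 3, bit & 7, ctx);  /* row = bit / 8, col = bit % 8 */
  }
}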
// Left_ entries refer to whether we apply a filter on the border to the @@ -259,8 +253,8 @@ static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) { if (block_inside_limit < 1) block_inside_limit = 1; - vpx_memset(lfi->lim[lvl], block_inside_limit, SIMD_WIDTH); - vpx_memset(lfi->mblim[lvl], (2 * (lvl + 2) + block_inside_limit), + vpx_memset(lfi->lfthr[lvl].lim, block_inside_limit, SIMD_WIDTH); + vpx_memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit), SIMD_WIDTH); } } @@ -268,7 +262,7 @@ static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) { void vp9_loop_filter_init(VP9_COMMON *cm) { loop_filter_info_n *lfi = &cm->lf_info; struct loopfilter *lf = &cm->lf; - int i; + int lvl; // init limits for given sharpness update_sharpness(lfi, lf->sharpness_level); @@ -278,8 +272,8 @@ void vp9_loop_filter_init(VP9_COMMON *cm) { lf_init_lut(lfi); // init hev threshold const vectors - for (i = 0; i < 4; i++) - vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH); + for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) + vpx_memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH); } void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) { @@ -330,16 +324,14 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) { static int build_lfi(const loop_filter_info_n *lfi_n, const MB_MODE_INFO *mbmi, - struct loop_filter_info *lfi) { + const loop_filter_thresh **lfi) { const int seg = mbmi->segment_id; const int ref = mbmi->ref_frame[0]; const int mode = lfi_n->mode_lf_lut[mbmi->mode]; const int filter_level = lfi_n->lvl[seg][ref][mode]; if (filter_level > 0) { - lfi->mblim = lfi_n->mblim[filter_level]; - lfi->lim = lfi_n->lim[filter_level]; - lfi->hev_thr = lfi_n->hev_thr[filter_level >> 4]; + *lfi = &lfi_n->lfthr[filter_level]; return 1; } else { return 0; @@ -351,11 +343,13 @@ static void filter_selectively_vert(uint8_t *s, int pitch, unsigned int mask_8x8, unsigned int mask_4x4, unsigned int mask_4x4_int, - const struct loop_filter_info *lfi) { + const loop_filter_thresh **p_lfi) { unsigned int mask; for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; mask; mask >>= 1) { + const loop_filter_thresh *lfi = *p_lfi; + if (mask & 1) { if (mask_16x16 & 1) { vp9_mb_lpf_vertical_edge_w(s, pitch, lfi->mblim, lfi->lim, @@ -379,7 +373,7 @@ static void filter_selectively_vert(uint8_t *s, int pitch, vp9_loop_filter_vertical_edge(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); s += 8; - lfi++; + p_lfi++; mask_16x16 >>= 1; mask_8x8 >>= 1; mask_4x4 >>= 1; @@ -393,12 +387,14 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, unsigned int mask_4x4, unsigned int mask_4x4_int, int only_4x4_1, - const struct loop_filter_info *lfi) { + const loop_filter_thresh **p_lfi) { unsigned int mask; int count; for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; mask; mask >>= count) { + const loop_filter_thresh *lfi = *p_lfi; + count = 1; if (mask & 1) { if (!only_4x4_1) { @@ -432,7 +428,7 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, lfi->lim, lfi->hev_thr, 1); } s += 8 * count; - lfi += count; + p_lfi += count; mask_16x16 >>= count; mask_8x8 >>= count; mask_4x4 >>= count; @@ -805,7 +801,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm, unsigned int mask_8x8[MI_BLOCK_SIZE] = {0}; unsigned int mask_4x4[MI_BLOCK_SIZE] = {0}; unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0}; - struct loop_filter_info lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE]; + const loop_filter_thresh *lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE]; int r, c; for (r = 
0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) { @@ -834,7 +830,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm, const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1; // Filter level can vary per MI - if (!build_lfi(&cm->lf_info, &mi[0].mbmi, lfi[r] + (c >> ss_x))) + if (!build_lfi(&cm->lf_info, &mi[0].mbmi, &lfi[r][c >> ss_x])) continue; // Build masks based on the transform size of each block @@ -925,7 +921,7 @@ static void filter_block_plane(VP9_COMMON *const cm, struct buf_2d *const dst = &plane->dst; uint8_t* const dst0 = dst->buf; unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0}; - struct loop_filter_info lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE]; + const loop_filter_thresh *lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE]; int r, c; int row_shift = 3 - ss_x; int row_mask = 0xff >> (ss_x << 2); @@ -938,8 +934,8 @@ static void filter_block_plane(VP9_COMMON *const cm, // Determine the vertical edges that need filtering for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) { const MODE_INFO *mi = mi_8x8[c]; - if (!build_lfi(&cm->lf_info, &mi[0].mbmi, lfi[r] + (c >> ss_x))) - continue; + + build_lfi(&cm->lf_info, &mi[0].mbmi, &lfi[r][c >> ss_x]); } if (!plane->plane_type) { mask_4x4_int[r] = MASK_ROW(lfm->int_4x4_y); diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h index c698090d8..62389ea5e 100644 --- a/vp9/common/vp9_loopfilter.h +++ b/vp9/common/vp9_loopfilter.h @@ -46,12 +46,13 @@ struct loopfilter { // Need to align this structure so when it is declared and // passed it can be loaded into vector registers. typedef struct { - DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, - mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); - DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, - lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); - DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, - hev_thr[4][SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, mblim[SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, lim[SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, hev_thr[SIMD_WIDTH]); +} loop_filter_thresh; + +typedef struct { + loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1]; uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS]; uint8_t mode_lf_lut[MB_MODE_COUNT]; } loop_filter_info_n; diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c index 659079639..8df8aec84 100644 --- a/vp9/common/vp9_mvref_common.c +++ b/vp9/common/vp9_mvref_common.c @@ -170,17 +170,19 @@ static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref, // Checks that the given mi_row, mi_col and search point // are inside the borders of the tile. -static INLINE int is_inside(const VP9_COMMON *cm, int mi_col, int mi_row, +static INLINE int is_inside(const TileInfo *const tile, + int mi_col, int mi_row, int mi_rows, const MV *mv) { return !(mi_row + mv->row < 0 || - mi_col + mv->col < cm->cur_tile_mi_col_start || - mi_row + mv->row >= cm->mi_rows || - mi_col + mv->col >= cm->cur_tile_mi_col_end); + mi_col + mv->col < tile->mi_col_start || + mi_row + mv->row >= mi_rows || + mi_col + mv->col >= tile->mi_col_end); } // This function searches the neighbourhood of a given MB/SB // to try and find candidate reference vectors. void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, + const TileInfo *const tile, MODE_INFO *mi, const MODE_INFO *prev_mi, MV_REFERENCE_FRAME ref_frame, int_mv *mv_ref_list, @@ -201,7 +203,7 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, // and we also need to keep a mode count. 
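The is_inside() predicate used by the candidate scans below now takes a TileInfo instead of reading cur_tile_mi_col_* out of VP9_COMMON: rows are clamped against the whole frame while columns are clamped against the tile, because VP9 tiles split the frame column-wise and prediction must not reach across a tile's vertical edges. A worked call, with hypothetical tile geometry and treating the static helper as directly callable:

/* Hypothetical geometry: the tile covers mi columns [256, 512) of a frame
 * 135 MI rows tall. A candidate two MI columns left of mi_col 256 lands at
 * 254: inside the frame, but outside the tile. */
TileInfo tile = { 0, 135, 256, 512 };  /* mi_row_start/end, mi_col_start/end */
MV mv = { 0, -2 };                     /* search offset in MI units */
int usable = is_inside(&tile, 256, 10, 135, &mv);  /* 254 < 256 -> returns 0 */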
for (i = 0; i < 2; ++i) { const MV *const mv_ref = &mv_ref_search[i]; - if (is_inside(cm, mi_col, mi_row, mv_ref)) { + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { const MODE_INFO *const candidate_mi = xd->mi_8x8[mv_ref->col + mv_ref->row * xd->mode_info_stride]; const MB_MODE_INFO *const candidate = &candidate_mi->mbmi; @@ -228,7 +230,7 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, // mode counts. for (; i < MVREF_NEIGHBOURS; ++i) { const MV *const mv_ref = &mv_ref_search[i]; - if (is_inside(cm, mi_col, mi_row, mv_ref)) { + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { const MB_MODE_INFO *const candidate = &xd->mi_8x8[mv_ref->col + mv_ref->row * xd->mode_info_stride]->mbmi; @@ -258,7 +260,7 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, if (different_ref_found) { for (i = 0; i < MVREF_NEIGHBOURS; ++i) { const MV *mv_ref = &mv_ref_search[i]; - if (is_inside(cm, mi_col, mi_row, mv_ref)) { + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { const MB_MODE_INFO *const candidate = &xd->mi_8x8[mv_ref->col + mv_ref->row * xd->mode_info_stride]->mbmi; diff --git a/vp9/common/vp9_mvref_common.h b/vp9/common/vp9_mvref_common.h index 39ebdb078..ce4c55983 100644 --- a/vp9/common/vp9_mvref_common.h +++ b/vp9/common/vp9_mvref_common.h @@ -15,6 +15,7 @@ #define VP9_COMMON_VP9_MVREF_COMMON_H_ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, + const TileInfo *const tile, MODE_INFO *mi, const MODE_INFO *prev_mi, MV_REFERENCE_FRAME ref_frame, int_mv *mv_ref_list, @@ -22,11 +23,12 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, int mi_row, int mi_col); static INLINE void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, + const TileInfo *const tile, MODE_INFO *mi, const MODE_INFO *prev_mi, MV_REFERENCE_FRAME ref_frame, int_mv *mv_ref_list, int mi_row, int mi_col) { - vp9_find_mv_refs_idx(cm, xd, mi, prev_mi, ref_frame, + vp9_find_mv_refs_idx(cm, xd, tile, mi, prev_mi, ref_frame, mv_ref_list, -1, mi_row, mi_col); } diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h index f2244e555..a2af57acf 100644 --- a/vp9/common/vp9_onyxc_int.h +++ b/vp9/common/vp9_onyxc_int.h @@ -19,6 +19,7 @@ #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_tile_common.h" #if CONFIG_VP9_POSTPROC #include "vp9/common/vp9_postproc.h" @@ -40,9 +41,9 @@ typedef struct frame_contexts { vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1]; vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1]; - vp9_prob partition_prob[FRAME_TYPES][PARTITION_CONTEXTS][PARTITION_TYPES - 1]; + vp9_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1]; vp9_coeff_probs_model coef_probs[TX_SIZES][BLOCK_TYPES]; - vp9_prob switchable_interp_prob[SWITCHABLE_FILTERS + 1] + vp9_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS] [SWITCHABLE_FILTERS - 1]; vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1]; vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS]; @@ -61,7 +62,7 @@ typedef struct { vp9_coeff_count_model coef[TX_SIZES][BLOCK_TYPES]; unsigned int eob_branch[TX_SIZES][BLOCK_TYPES][REF_TYPES] [COEF_BANDS][PREV_COEF_CONTEXTS]; - unsigned int switchable_interp[SWITCHABLE_FILTERS + 1] + unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS] [SWITCHABLE_FILTERS]; unsigned int inter_mode[INTER_MODE_CONTEXTS][INTER_MODES]; unsigned int intra_inter[INTRA_INTER_CONTEXTS][2]; @@ -90,6 
+91,8 @@ typedef struct VP9Common { DECLARE_ALIGNED(16, int16_t, a_dequant[QINDEX_RANGE][8]); #endif + COLOR_SPACE color_space; + int width; int height; int display_width; @@ -115,6 +118,7 @@ typedef struct VP9Common { // Each frame can reference ALLOWED_REFS_PER_FRAME buffers int active_ref_idx[ALLOWED_REFS_PER_FRAME]; struct scale_factors active_ref_scale[ALLOWED_REFS_PER_FRAME]; + struct scale_factors_common active_ref_scale_comm[ALLOWED_REFS_PER_FRAME]; int new_fb_idx; YV12_BUFFER_CONFIG post_proc_buffer; @@ -171,7 +175,7 @@ typedef struct VP9Common { // Persistent mb segment id map used in prediction. unsigned char *last_frame_seg_map; - INTERPOLATIONFILTERTYPE mcomp_filter_type; + INTERPOLATION_TYPE mcomp_filter_type; loop_filter_info_n lf_info; @@ -182,14 +186,6 @@ typedef struct VP9Common { struct loopfilter lf; struct segmentation seg; - /* Y,U,V */ - ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; - ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16]; - - // partition contexts - PARTITION_CONTEXT *above_seg_context; - PARTITION_CONTEXT left_seg_context[8]; - // Context probabilities for reference frame prediction int allow_comp_inter_inter; MV_REFERENCE_FRAME comp_fixed_ref; @@ -212,10 +208,19 @@ typedef struct VP9Common { int frame_parallel_decoding_mode; int log2_tile_cols, log2_tile_rows; - int cur_tile_mi_col_start, cur_tile_mi_col_end; - int cur_tile_mi_row_start, cur_tile_mi_row_end; } VP9_COMMON; +// ref == 0 => LAST_FRAME +// ref == 1 => GOLDEN_FRAME +// ref == 2 => ALTREF_FRAME +static YV12_BUFFER_CONFIG *get_frame_ref_buffer(VP9_COMMON *cm, int ref) { + return &cm->yv12_fb[cm->active_ref_idx[ref]]; +} + +static YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) { + return &cm->yv12_fb[cm->new_fb_idx]; +} + static int get_free_fb(VP9_COMMON *cm) { int i; for (i = 0; i < NUM_YV12_BUFFERS; i++) @@ -240,47 +245,38 @@ static int mi_cols_aligned_to_sb(int n_mis) { return ALIGN_POWER_OF_TWO(n_mis, MI_BLOCK_SIZE_LOG2); } -static INLINE void set_skip_context(VP9_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col) { +static INLINE const vp9_prob* get_partition_probs(VP9_COMMON *cm, int ctx) { + return cm->frame_type == KEY_FRAME ? 
vp9_kf_partition_probs[ctx] + : cm->fc.partition_prob[ctx]; +} + +static INLINE void set_skip_context( + MACROBLOCKD *xd, + ENTROPY_CONTEXT *above_context[MAX_MB_PLANE], + ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16], + int mi_row, int mi_col) { const int above_idx = mi_col * 2; const int left_idx = (mi_row * 2) & 15; int i; for (i = 0; i < MAX_MB_PLANE; i++) { struct macroblockd_plane *const pd = &xd->plane[i]; - pd->above_context = cm->above_context[i] + (above_idx >> pd->subsampling_x); - pd->left_context = cm->left_context[i] + (left_idx >> pd->subsampling_y); + pd->above_context = above_context[i] + (above_idx >> pd->subsampling_x); + pd->left_context = left_context[i] + (left_idx >> pd->subsampling_y); } } -// return the node index in the prob tree for binary coding -static int check_bsize_coverage(int bs, int mi_rows, int mi_cols, - int mi_row, int mi_col) { - const int r = (mi_row + bs < mi_rows); - const int c = (mi_col + bs < mi_cols); - - if (r && c) - return 0; - - if (c && !r) - return 1; // only allow horizontal/split partition types - - if (r && !c) - return 2; // only allow vertical/split partition types - - return -1; -} - -static void set_mi_row_col(VP9_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int bh, - int mi_col, int bw) { +static void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, + int mi_row, int bh, + int mi_col, int bw, + int mi_rows, int mi_cols) { xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); - xd->mb_to_bottom_edge = ((cm->mi_rows - bh - mi_row) * MI_SIZE) * 8; + xd->mb_to_bottom_edge = ((mi_rows - bh - mi_row) * MI_SIZE) * 8; xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); - xd->mb_to_right_edge = ((cm->mi_cols - bw - mi_col) * MI_SIZE) * 8; + xd->mb_to_right_edge = ((mi_cols - bw - mi_col) * MI_SIZE) * 8; // Are edges available for intra prediction? 
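The edge values computed in set_mi_row_col() are in 1/8-pel units: mi_row * MI_SIZE converts MI units to pixels (MI_SIZE is 8), and the trailing * 8 moves into the eighth-pel domain that motion vectors live in, so clamp_mv() can compare an MV against the edges with no further scaling. A worked example with illustrative block coordinates:

/* Block at mi_row = 4 with bh = 2 in a frame of 100 MI rows (MI_SIZE == 8): */
int mb_to_top_edge    = -((4 * 8) * 8);           /* -256: 32 px above, in 1/8 pel */
int mb_to_bottom_edge = ((100 - 2 - 4) * 8) * 8;  /*  6016: 752 px below */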
xd->up_available = (mi_row != 0); - xd->left_available = (mi_col > cm->cur_tile_mi_col_start); + xd->left_available = (mi_col > tile->mi_col_start); } static void set_prev_mi(VP9_COMMON *cm) { @@ -299,12 +295,14 @@ static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) { return cm->frame_type == KEY_FRAME || cm->intra_only; } -static INLINE void update_partition_context(VP9_COMMON *cm, - int mi_row, int mi_col, - BLOCK_SIZE sb_type, - BLOCK_SIZE sb_size) { - PARTITION_CONTEXT *above_ctx = cm->above_seg_context + mi_col; - PARTITION_CONTEXT *left_ctx = cm->left_seg_context + (mi_row & MI_MASK); +static INLINE void update_partition_context( + PARTITION_CONTEXT *above_seg_context, + PARTITION_CONTEXT left_seg_context[8], + int mi_row, int mi_col, + BLOCK_SIZE sb_type, + BLOCK_SIZE sb_size) { + PARTITION_CONTEXT *above_ctx = above_seg_context + mi_col; + PARTITION_CONTEXT *left_ctx = left_seg_context + (mi_row & MI_MASK); const int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2; const int bwl = b_width_log2(sb_type); @@ -323,11 +321,13 @@ static INLINE void update_partition_context(VP9_COMMON *cm, vpx_memset(left_ctx, pcvalue[bhl == bsl], bs); } -static INLINE int partition_plane_context(const VP9_COMMON *cm, - int mi_row, int mi_col, - BLOCK_SIZE sb_type) { - const PARTITION_CONTEXT *above_ctx = cm->above_seg_context + mi_col; - const PARTITION_CONTEXT *left_ctx = cm->left_seg_context + (mi_row & MI_MASK); +static INLINE int partition_plane_context( + const PARTITION_CONTEXT *above_seg_context, + const PARTITION_CONTEXT left_seg_context[8], + int mi_row, int mi_col, + BLOCK_SIZE sb_type) { + const PARTITION_CONTEXT *above_ctx = above_seg_context + mi_col; + const PARTITION_CONTEXT *left_ctx = left_seg_context + (mi_row & MI_MASK); int bsl = mi_width_log2(sb_type), bs = 1 << bsl; int above = 0, left = 0, i; diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c index be42c56b5..6018e1775 100644 --- a/vp9/common/vp9_pred_common.c +++ b/vp9/common/vp9_pred_common.c @@ -35,14 +35,14 @@ unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) { // left of the entries corresponding to real macroblocks. // The prediction flags in these dummy entries are initialised to 0. // left - const int left_mv_pred = left_in_image ? is_inter_mode(left_mi->mbmi.mode) + const int left_mv_pred = left_in_image ? is_inter_block(&left_mi->mbmi) : 0; const int left_interp = left_in_image && left_mv_pred ? left_mi->mbmi.interp_filter : SWITCHABLE_FILTERS; // above - const int above_mv_pred = above_in_image ? is_inter_mode(above_mi->mbmi.mode) + const int above_mv_pred = above_in_image ? is_inter_block(&above_mi->mbmi) : 0; const int above_interp = above_in_image && above_mv_pred ?
above_mi->mbmi.interp_filter @@ -403,8 +403,8 @@ void vp9_set_pred_flag_seg_id(MACROBLOCKD *xd, uint8_t pred_flag) { int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids, BLOCK_SIZE bsize, int mi_row, int mi_col) { const int mi_offset = mi_row * cm->mi_cols + mi_col; - const int bw = 1 << mi_width_log2(bsize); - const int bh = 1 << mi_height_log2(bsize); + const int bw = num_8x8_blocks_wide_lookup[bsize]; + const int bh = num_8x8_blocks_high_lookup[bsize]; const int xmis = MIN(cm->mi_cols - mi_col, bw); const int ymis = MIN(cm->mi_rows - mi_row, bh); int x, y, segment_id = INT_MAX; diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h index a869dc0a6..19032bf62 100644 --- a/vp9/common/vp9_pred_common.h +++ b/vp9/common/vp9_pred_common.h @@ -127,14 +127,14 @@ static const vp9_prob *get_tx_probs2(const MACROBLOCKD *xd, return get_tx_probs(bsize, context, tx_probs); } -static void update_tx_counts(BLOCK_SIZE bsize, uint8_t context, - TX_SIZE tx_size, struct tx_counts *tx_counts) { - if (bsize >= BLOCK_32X32) - tx_counts->p32x32[context][tx_size]++; - else if (bsize >= BLOCK_16X16) - tx_counts->p16x16[context][tx_size]++; +static unsigned int *get_tx_counts(BLOCK_SIZE bsize, uint8_t context, + struct tx_counts *tx_counts) { + if (bsize < BLOCK_16X16) + return tx_counts->p8x8[context]; + else if (bsize < BLOCK_32X32) + return tx_counts->p16x16[context]; else - tx_counts->p8x8[context][tx_size]++; + return tx_counts->p32x32[context]; } #endif // VP9_COMMON_VP9_PRED_COMMON_H_ diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c index 6f16ac70a..1c96788db 100644 --- a/vp9/common/vp9_reconinter.c +++ b/vp9/common/vp9_reconinter.c @@ -21,7 +21,7 @@ #include "vp9/common/vp9_reconintra.h" void vp9_setup_interp_filters(MACROBLOCKD *xd, - INTERPOLATIONFILTERTYPE mcomp_filter_type, + INTERPOLATION_TYPE mcomp_filter_type, VP9_COMMON *cm) { if (xd->mi_8x8 && xd->mi_8x8[0]) { MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; @@ -40,6 +40,24 @@ void vp9_setup_interp_filters(MACROBLOCKD *xd, assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0); } +static void inter_predictor(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const MV32 *mv, + const struct scale_factors *scale, + int w, int h, int ref, + const struct subpix_fn_table *subpix, + int xs, int ys) { + const int subpel_x = mv->col & SUBPEL_MASK; + const int subpel_y = mv->row & SUBPEL_MASK; + + src += (mv->row >> SUBPEL_BITS) * src_stride + (mv->col >> SUBPEL_BITS); + scale->sfc->predict[subpel_x != 0][subpel_y != 0][ref]( + src, src_stride, dst, dst_stride, + subpix->filter_x[subpel_x], xs, + subpix->filter_y[subpel_y], ys, + w, h); +} + void vp9_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const MV *src_mv, @@ -50,16 +68,11 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride, const int is_q4 = precision == MV_PRECISION_Q4; const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2, is_q4 ? 
src_mv->col : src_mv->col * 2 }; - const MV32 mv = scale->scale_mv(&mv_q4, scale); - const int subpel_x = mv.col & SUBPEL_MASK; - const int subpel_y = mv.row & SUBPEL_MASK; + const struct scale_factors_common *sfc = scale->sfc; + const MV32 mv = sfc->scale_mv(&mv_q4, scale); - src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS); - scale->predict[subpel_x != 0][subpel_y != 0][ref]( - src, src_stride, dst, dst_stride, - subpix->filter_x[subpel_x], scale->x_step_q4, - subpix->filter_y[subpel_y], scale->y_step_q4, - w, h); + inter_predictor(src, src_stride, dst, dst_stride, &mv, scale, + w, h, ref, subpix, sfc->x_step_q4, sfc->y_step_q4); } static INLINE int round_mv_comp_q4(int value) { @@ -133,10 +146,6 @@ static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize, struct scale_factors *const scale = &xd->scale_factor[ref]; struct buf_2d *const pre_buf = &pd->pre[ref]; struct buf_2d *const dst_buf = &pd->dst; - - const uint8_t *const pre = pre_buf->buf + scaled_buffer_offset(x, y, - pre_buf->stride, scale); - uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x; // TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the @@ -152,15 +161,32 @@ static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize, // scaling case. It needs to be done on the scaled MV, not the pre-scaling // MV. Note however that it performs the subsampling aware scaling so // that the result is always q4. - const MV res_mv = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh, - pd->subsampling_x, - pd->subsampling_y); - - scale->set_scaled_offsets(scale, arg->y + y, arg->x + x); - vp9_build_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride, - &res_mv, scale, - 4 << pred_w, 4 << pred_h, ref, - &xd->subpix, MV_PRECISION_Q4); + // mv_precision precision is MV_PRECISION_Q4. 
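inter_predictor() above splits each q4 motion-vector component into an integer pixel offset and a sub-pixel phase: with SUBPEL_BITS == 4 there are 16 phases per pixel, mv >> SUBPEL_BITS advances the source pointer, and mv & SUBPEL_MASK picks the interpolation kernel. A standalone illustration:

#include <stdio.h>

#define SUBPEL_BITS 4                        /* q4: 16 subpel positions per pixel */
#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)

int main(void) {
  const int mv_col = 37;                     /* q4 units: 2 full pixels + phase 5 */
  printf("pixel offset %d, subpel phase %d\n",
         mv_col >> SUBPEL_BITS, mv_col & SUBPEL_MASK);  /* prints 2, 5 */
  return 0;
}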
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh, + pd->subsampling_x, + pd->subsampling_y); + + uint8_t *pre; + MV32 scaled_mv; + int xs, ys; + + if (vp9_is_scaled(scale->sfc)) { + pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, scale); + scale->sfc->set_scaled_offsets(scale, arg->y + y, arg->x + x); + scaled_mv = scale->sfc->scale_mv(&mv_q4, scale); + xs = scale->sfc->x_step_q4; + ys = scale->sfc->y_step_q4; + } else { + pre = pre_buf->buf + (y * pre_buf->stride + x); + scaled_mv.row = mv_q4.row; + scaled_mv.col = mv_q4.col; + xs = ys = 16; + } + + inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride, + &scaled_mv, scale, + 4 << pred_w, 4 << pred_h, ref, + &xd->subpix, xs, ys); } } @@ -220,15 +246,17 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, void vp9_setup_scale_factors(VP9_COMMON *cm, int i) { const int ref = cm->active_ref_idx[i]; struct scale_factors *const sf = &cm->active_ref_scale[i]; + struct scale_factors_common *const sfc = &cm->active_ref_scale_comm[i]; if (ref >= NUM_YV12_BUFFERS) { vp9_zero(*sf); + vp9_zero(*sfc); } else { YV12_BUFFER_CONFIG *const fb = &cm->yv12_fb[ref]; - vp9_setup_scale_factors_for_frame(sf, + vp9_setup_scale_factors_for_frame(sf, sfc, fb->y_crop_width, fb->y_crop_height, cm->width, cm->height); - if (vp9_is_scaled(sf)) + if (vp9_is_scaled(sfc)) vp9_extend_frame_borders(fb, cm->subsampling_x, cm->subsampling_y); } } diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h index 504b79356..2c8a6e4d9 100644 --- a/vp9/common/vp9_reconinter.h +++ b/vp9/common/vp9_reconinter.h @@ -25,7 +25,7 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, BLOCK_SIZE bsize); void vp9_setup_interp_filters(MACROBLOCKD *xd, - INTERPOLATIONFILTERTYPE filter, + INTERPOLATION_TYPE filter, VP9_COMMON *cm); void vp9_build_inter_predictor(const uint8_t *src, int src_stride, @@ -38,8 +38,10 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride, static int scaled_buffer_offset(int x_offset, int y_offset, int stride, const struct scale_factors *scale) { - const int x = scale ? scale->scale_value_x(x_offset, scale) : x_offset; - const int y = scale ? scale->scale_value_y(y_offset, scale) : y_offset; + const int x = scale ? scale->sfc->scale_value_x(x_offset, scale->sfc) : + x_offset; + const int y = scale ? 
scale->sfc->scale_value_y(y_offset, scale->sfc) : + y_offset; return y * stride + x; } diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c index bd609dcf0..eb643b090 100644 --- a/vp9/common/vp9_reconintra.c +++ b/vp9/common/vp9_reconintra.c @@ -369,7 +369,7 @@ static void build_intra_predictors(const uint8_t *ref, int ref_stride, } } -void vp9_predict_intra_block(MACROBLOCKD *xd, int block_idx, int bwl_in, +void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in, TX_SIZE tx_size, int mode, const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride) { diff --git a/vp9/common/vp9_reconintra.h b/vp9/common/vp9_reconintra.h index e9d0dbf04..6e3f55c4d 100644 --- a/vp9/common/vp9_reconintra.h +++ b/vp9/common/vp9_reconintra.h @@ -14,8 +14,8 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_blockd.h" -void vp9_predict_intra_block(MACROBLOCKD *xd, int block_idx, int bwl_in, - TX_SIZE tx_size, int mode, - const uint8_t *ref, int ref_stride, - uint8_t *dst, int dst_stride); +void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in, + TX_SIZE tx_size, int mode, + const uint8_t *ref, int ref_stride, + uint8_t *dst, int dst_stride); #endif // VP9_COMMON_VP9_RECONINTRA_H_ diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index df92b5882..debec6154 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -22,10 +22,11 @@ forward_decls vp9_common_forward_decls # x86inc.asm doesn't work if pic is enabled on 32 bit platforms so no assembly. [ "$CONFIG_USE_X86INC" = "yes" ] && mmx_x86inc=mmx && sse_x86inc=sse && - sse2_x86inc=sse2 && ssse3_x86inc=ssse3 + sse2_x86inc=sse2 && ssse3_x86inc=ssse3 && avx_x86inc=avx && avx2_x86inc=avx2 # this variable is for functions that are 64 bit only. 
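In vp9_rtcd_defs.sh, each prototype line declares a function signature and the matching specialize line lists the SIMD flavors that have hand-optimized implementations; the build then generates a header with one declaration per flavor plus a pointer that vp9_rtcd() binds at startup from CPU feature detection. A rough, hand-written approximation of what gets generated for one entry (not the literal generator output):

void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest,
                             int dest_stride);

/* Bound once at runtime to the best implementation the CPU supports. */
extern void (*vp9_idct4x4_16_add)(const int16_t *input, uint8_t *dest,
                                  int dest_stride);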
-[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 && ssse3_x86_64=ssse3 +[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 && + ssse3_x86_64=ssse3 && avx_x86_64=avx && avx2_x86_64=avx2 # # RECON @@ -157,7 +158,7 @@ prototype void vp9_d63_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const specialize vp9_d63_predictor_32x32 $ssse3_x86inc prototype void vp9_h_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_h_predictor_32x32 $ssse3 x86inc +specialize vp9_h_predictor_32x32 $ssse3_x86inc prototype void vp9_d117_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d117_predictor_32x32 @@ -199,7 +200,7 @@ prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8 specialize vp9_loop_filter_vertical_edge mmx neon prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" -specialize vp9_mb_lpf_horizontal_edge_w sse2 neon +specialize vp9_mb_lpf_horizontal_edge_w sse2 avx2 neon prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" specialize vp9_mbloop_filter_horizontal_edge sse2 neon @@ -268,43 +269,46 @@ specialize vp9_convolve8_avg_vert sse2 ssse3 neon dspr2 # dct # prototype void vp9_idct4x4_1_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct4x4_1_add sse2 neon +specialize vp9_idct4x4_1_add sse2 neon dspr2 prototype void vp9_idct4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct4x4_16_add sse2 neon +specialize vp9_idct4x4_16_add sse2 neon dspr2 prototype void vp9_idct8x8_1_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct8x8_1_add sse2 neon +specialize vp9_idct8x8_1_add sse2 neon dspr2 prototype void vp9_idct8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct8x8_64_add sse2 neon +specialize vp9_idct8x8_64_add sse2 neon dspr2 prototype void vp9_idct8x8_10_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct8x8_10_add sse2 neon +specialize vp9_idct8x8_10_add sse2 neon dspr2 prototype void vp9_idct16x16_1_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct16x16_1_add sse2 neon +specialize vp9_idct16x16_1_add sse2 neon dspr2 prototype void vp9_idct16x16_256_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct16x16_256_add sse2 neon +specialize vp9_idct16x16_256_add sse2 neon dspr2 prototype void vp9_idct16x16_10_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct16x16_10_add sse2 neon +specialize vp9_idct16x16_10_add sse2 neon dspr2 prototype void vp9_idct32x32_1024_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct32x32_1024_add sse2 neon +specialize vp9_idct32x32_1024_add sse2 neon dspr2 + +prototype void vp9_idct32x32_34_add "const int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_idct32x32_34_add sse2 prototype void vp9_idct32x32_1_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct32x32_1_add sse2 +specialize vp9_idct32x32_1_add sse2 dspr2 prototype void vp9_iht4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type" -specialize vp9_iht4x4_16_add sse2 neon +specialize vp9_iht4x4_16_add sse2 neon dspr2 prototype void 
vp9_iht8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type" -specialize vp9_iht8x8_64_add sse2 neon +specialize vp9_iht8x8_64_add sse2 neon dspr2 prototype void vp9_iht16x16_256_add "const int16_t *input, uint8_t *output, int pitch, int tx_type" -specialize vp9_iht16x16_256_add sse2 +specialize vp9_iht16x16_256_add sse2 dspr2 # dct and add @@ -668,10 +672,10 @@ specialize vp9_block_error $sse2_x86inc prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride" specialize vp9_subtract_block $sse2_x86inc -prototype void vp9_quantize_b "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" +prototype void vp9_quantize_b "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" specialize vp9_quantize_b $ssse3_x86_64 -prototype void vp9_quantize_b_32x32 "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" +prototype void vp9_quantize_b_32x32 "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" specialize vp9_quantize_b_32x32 $ssse3_x86_64 # @@ -686,32 +690,32 @@ if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then fi # fdct functions -prototype void vp9_short_fht4x4 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type" +prototype void vp9_short_fht4x4 "const int16_t *input, int16_t *output, int stride, int tx_type" specialize vp9_short_fht4x4 sse2 -prototype void vp9_short_fht8x8 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type" +prototype void vp9_short_fht8x8 "const int16_t *input, int16_t *output, int stride, int tx_type" specialize vp9_short_fht8x8 sse2 -prototype void vp9_short_fht16x16 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type" +prototype void vp9_short_fht16x16 "const int16_t *input, int16_t *output, int stride, int tx_type" specialize vp9_short_fht16x16 sse2 -prototype void vp9_short_fdct8x8 "int16_t *InputData, int16_t *OutputData, int stride" -specialize vp9_short_fdct8x8 sse2 +prototype void vp9_fwht4x4 "const int16_t *input, int16_t *output, int stride" +specialize vp9_fwht4x4 -prototype void vp9_short_fdct4x4 "int16_t *InputData, int16_t *OutputData, int stride" -specialize vp9_short_fdct4x4 sse2 +prototype void vp9_fdct4x4 "const int16_t *input, int16_t *output, int stride" +specialize vp9_fdct4x4 sse2 -prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int stride" -specialize vp9_short_fdct32x32 sse2 +prototype void vp9_fdct8x8 "const int16_t *input, int16_t *output, int 
stride" +specialize vp9_fdct8x8 sse2 -prototype void vp9_short_fdct32x32_rd "int16_t *InputData, int16_t *OutputData, int stride" -specialize vp9_short_fdct32x32_rd sse2 +prototype void vp9_fdct16x16 "const int16_t *input, int16_t *output, int stride" +specialize vp9_fdct16x16 sse2 -prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int stride" -specialize vp9_short_fdct16x16 sse2 +prototype void vp9_fdct32x32 "const int16_t *input, int16_t *output, int stride" +specialize vp9_fdct32x32 sse2 -prototype void vp9_short_walsh4x4 "int16_t *InputData, int16_t *OutputData, int pitch" -specialize vp9_short_walsh4x4 +prototype void vp9_fdct32x32_rd "const int16_t *input, int16_t *output, int stride" +specialize vp9_fdct32x32_rd sse2 # # Motion search diff --git a/vp9/common/vp9_scale.c b/vp9/common/vp9_scale.c index 989206c60..3f0994f80 100644 --- a/vp9/common/vp9_scale.c +++ b/vp9/common/vp9_scale.c @@ -12,23 +12,23 @@ #include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_scale.h" -static INLINE int scaled_x(int val, const struct scale_factors *scale) { - return val * scale->x_scale_fp >> REF_SCALE_SHIFT; +static INLINE int scaled_x(int val, const struct scale_factors_common *sfc) { + return val * sfc->x_scale_fp >> REF_SCALE_SHIFT; } -static INLINE int scaled_y(int val, const struct scale_factors *scale) { - return val * scale->y_scale_fp >> REF_SCALE_SHIFT; +static INLINE int scaled_y(int val, const struct scale_factors_common *sfc) { + return val * sfc->y_scale_fp >> REF_SCALE_SHIFT; } -static int unscaled_value(int val, const struct scale_factors *scale) { - (void) scale; +static int unscaled_value(int val, const struct scale_factors_common *sfc) { + (void) sfc; return val; } static MV32 scaled_mv(const MV *mv, const struct scale_factors *scale) { const MV32 res = { - scaled_y(mv->row, scale) + scale->y_offset_q4, - scaled_x(mv->col, scale) + scale->x_offset_q4 + scaled_y(mv->row, scale->sfc) + scale->y_offset_q4, + scaled_x(mv->col, scale->sfc) + scale->x_offset_q4 }; return res; } @@ -43,8 +43,8 @@ static MV32 unscaled_mv(const MV *mv, const struct scale_factors *scale) { static void set_offsets_with_scaling(struct scale_factors *scale, int row, int col) { - scale->x_offset_q4 = scaled_x(col << SUBPEL_BITS, scale) & SUBPEL_MASK; - scale->y_offset_q4 = scaled_y(row << SUBPEL_BITS, scale) & SUBPEL_MASK; + scale->x_offset_q4 = scaled_x(col << SUBPEL_BITS, scale->sfc) & SUBPEL_MASK; + scale->y_offset_q4 = scaled_y(row << SUBPEL_BITS, scale->sfc) & SUBPEL_MASK; } static void set_offsets_without_scaling(struct scale_factors *scale, @@ -70,31 +70,30 @@ static int check_scale_factors(int other_w, int other_h, } void vp9_setup_scale_factors_for_frame(struct scale_factors *scale, + struct scale_factors_common *scale_comm, int other_w, int other_h, int this_w, int this_h) { if (!check_scale_factors(other_w, other_h, this_w, this_h)) { - scale->x_scale_fp = REF_INVALID_SCALE; - scale->y_scale_fp = REF_INVALID_SCALE; + scale_comm->x_scale_fp = REF_INVALID_SCALE; + scale_comm->y_scale_fp = REF_INVALID_SCALE; return; } - scale->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w); - scale->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h); - scale->x_step_q4 = scaled_x(16, scale); - scale->y_step_q4 = scaled_y(16, scale); - scale->x_offset_q4 = 0; // calculated per block - scale->y_offset_q4 = 0; // calculated per block + scale_comm->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w); + scale_comm->y_scale_fp = get_fixed_point_scale_factor(other_h, 
this_h); + scale_comm->x_step_q4 = scaled_x(16, scale_comm); + scale_comm->y_step_q4 = scaled_y(16, scale_comm); - if (vp9_is_scaled(scale)) { - scale->scale_value_x = scaled_x; - scale->scale_value_y = scaled_y; - scale->set_scaled_offsets = set_offsets_with_scaling; - scale->scale_mv = scaled_mv; + if (vp9_is_scaled(scale_comm)) { + scale_comm->scale_value_x = scaled_x; + scale_comm->scale_value_y = scaled_y; + scale_comm->set_scaled_offsets = set_offsets_with_scaling; + scale_comm->scale_mv = scaled_mv; } else { - scale->scale_value_x = unscaled_value; - scale->scale_value_y = unscaled_value; - scale->set_scaled_offsets = set_offsets_without_scaling; - scale->scale_mv = unscaled_mv; + scale_comm->scale_value_x = unscaled_value; + scale_comm->scale_value_y = unscaled_value; + scale_comm->set_scaled_offsets = set_offsets_without_scaling; + scale_comm->scale_mv = unscaled_mv; } // TODO(agrange): Investigate the best choice of functions to use here @@ -103,44 +102,48 @@ void vp9_setup_scale_factors_for_frame(struct scale_factors *scale, // applied in one direction only, and not at all for 0,0, seems to give the // best quality, but it may be worth trying an additional mode that does // do the filtering on full-pel. - if (scale->x_step_q4 == 16) { - if (scale->y_step_q4 == 16) { + if (scale_comm->x_step_q4 == 16) { + if (scale_comm->y_step_q4 == 16) { // No scaling in either direction. - scale->predict[0][0][0] = vp9_convolve_copy; - scale->predict[0][0][1] = vp9_convolve_avg; - scale->predict[0][1][0] = vp9_convolve8_vert; - scale->predict[0][1][1] = vp9_convolve8_avg_vert; - scale->predict[1][0][0] = vp9_convolve8_horiz; - scale->predict[1][0][1] = vp9_convolve8_avg_horiz; + scale_comm->predict[0][0][0] = vp9_convolve_copy; + scale_comm->predict[0][0][1] = vp9_convolve_avg; + scale_comm->predict[0][1][0] = vp9_convolve8_vert; + scale_comm->predict[0][1][1] = vp9_convolve8_avg_vert; + scale_comm->predict[1][0][0] = vp9_convolve8_horiz; + scale_comm->predict[1][0][1] = vp9_convolve8_avg_horiz; } else { // No scaling in x direction. Must always scale in the y direction. - scale->predict[0][0][0] = vp9_convolve8_vert; - scale->predict[0][0][1] = vp9_convolve8_avg_vert; - scale->predict[0][1][0] = vp9_convolve8_vert; - scale->predict[0][1][1] = vp9_convolve8_avg_vert; - scale->predict[1][0][0] = vp9_convolve8; - scale->predict[1][0][1] = vp9_convolve8_avg; + scale_comm->predict[0][0][0] = vp9_convolve8_vert; + scale_comm->predict[0][0][1] = vp9_convolve8_avg_vert; + scale_comm->predict[0][1][0] = vp9_convolve8_vert; + scale_comm->predict[0][1][1] = vp9_convolve8_avg_vert; + scale_comm->predict[1][0][0] = vp9_convolve8; + scale_comm->predict[1][0][1] = vp9_convolve8_avg; } } else { - if (scale->y_step_q4 == 16) { + if (scale_comm->y_step_q4 == 16) { // No scaling in the y direction. Must always scale in the x direction. - scale->predict[0][0][0] = vp9_convolve8_horiz; - scale->predict[0][0][1] = vp9_convolve8_avg_horiz; - scale->predict[0][1][0] = vp9_convolve8; - scale->predict[0][1][1] = vp9_convolve8_avg; - scale->predict[1][0][0] = vp9_convolve8_horiz; - scale->predict[1][0][1] = vp9_convolve8_avg_horiz; + scale_comm->predict[0][0][0] = vp9_convolve8_horiz; + scale_comm->predict[0][0][1] = vp9_convolve8_avg_horiz; + scale_comm->predict[0][1][0] = vp9_convolve8; + scale_comm->predict[0][1][1] = vp9_convolve8_avg; + scale_comm->predict[1][0][0] = vp9_convolve8_horiz; + scale_comm->predict[1][0][1] = vp9_convolve8_avg_horiz; } else { // Must always scale in both directions. 
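The predict[2][2][2] table being filled in here is indexed as predict[subpel_x != 0][subpel_y != 0][avg], which is why each branch only installs the convolve variants its step sizes make reachable: with a unit step (x_step_q4 == 16) and zero phase a direction needs no filtering, so the [0][0] entries can degenerate to a copy or an average. The dispatch at the call site in inter_predictor() then reduces to:

const int subpel_x = mv->col & SUBPEL_MASK;  /* phase selects the kernel */
const int subpel_y = mv->row & SUBPEL_MASK;
/* e.g. unscaled, zero phase, single ref: predict[0][0][0] == vp9_convolve_copy */
scale->sfc->predict[subpel_x != 0][subpel_y != 0][ref](
    src, src_stride, dst, dst_stride,
    subpix->filter_x[subpel_x], xs,
    subpix->filter_y[subpel_y], ys, w, h);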
- scale->predict[0][0][0] = vp9_convolve8; - scale->predict[0][0][1] = vp9_convolve8_avg; - scale->predict[0][1][0] = vp9_convolve8; - scale->predict[0][1][1] = vp9_convolve8_avg; - scale->predict[1][0][0] = vp9_convolve8; - scale->predict[1][0][1] = vp9_convolve8_avg; + scale_comm->predict[0][0][0] = vp9_convolve8; + scale_comm->predict[0][0][1] = vp9_convolve8_avg; + scale_comm->predict[0][1][0] = vp9_convolve8; + scale_comm->predict[0][1][1] = vp9_convolve8_avg; + scale_comm->predict[1][0][0] = vp9_convolve8; + scale_comm->predict[1][0][1] = vp9_convolve8_avg; } } // 2D subpel motion always gets filtered in both directions - scale->predict[1][1][0] = vp9_convolve8; - scale->predict[1][1][1] = vp9_convolve8_avg; + scale_comm->predict[1][1][0] = vp9_convolve8; + scale_comm->predict[1][1][1] = vp9_convolve8_avg; + + scale->sfc = scale_comm; + scale->x_offset_q4 = 0; // calculated per block + scale->y_offset_q4 = 0; // calculated per block } diff --git a/vp9/common/vp9_scale.h b/vp9/common/vp9_scale.h index ece011477..1437fcd9c 100644 --- a/vp9/common/vp9_scale.h +++ b/vp9/common/vp9_scale.h @@ -18,34 +18,40 @@ #define REF_NO_SCALE (1 << REF_SCALE_SHIFT) #define REF_INVALID_SCALE -1 -struct scale_factors { +struct scale_factors; +struct scale_factors_common { int x_scale_fp; // horizontal fixed point scale factor int y_scale_fp; // vertical fixed point scale factor - int x_offset_q4; int x_step_q4; - int y_offset_q4; int y_step_q4; - int (*scale_value_x)(int val, const struct scale_factors *scale); - int (*scale_value_y)(int val, const struct scale_factors *scale); + int (*scale_value_x)(int val, const struct scale_factors_common *sfc); + int (*scale_value_y)(int val, const struct scale_factors_common *sfc); void (*set_scaled_offsets)(struct scale_factors *scale, int row, int col); MV32 (*scale_mv)(const MV *mv, const struct scale_factors *scale); convolve_fn_t predict[2][2][2]; // horiz, vert, avg }; +struct scale_factors { + int x_offset_q4; + int y_offset_q4; + const struct scale_factors_common *sfc; +}; + void vp9_setup_scale_factors_for_frame(struct scale_factors *scale, + struct scale_factors_common *scale_comm, int other_w, int other_h, int this_w, int this_h); -static int vp9_is_valid_scale(const struct scale_factors *sf) { - return sf->x_scale_fp != REF_INVALID_SCALE && - sf->y_scale_fp != REF_INVALID_SCALE; +static int vp9_is_valid_scale(const struct scale_factors_common *sfc) { + return sfc->x_scale_fp != REF_INVALID_SCALE && + sfc->y_scale_fp != REF_INVALID_SCALE; } -static int vp9_is_scaled(const struct scale_factors *sf) { - return sf->x_scale_fp != REF_NO_SCALE || - sf->y_scale_fp != REF_NO_SCALE; +static int vp9_is_scaled(const struct scale_factors_common *sfc) { + return sfc->x_scale_fp != REF_NO_SCALE || + sfc->y_scale_fp != REF_NO_SCALE; } #endif // VP9_COMMON_VP9_SCALE_H_ diff --git a/vp9/common/vp9_scan.h b/vp9/common/vp9_scan.h index a5c8463d5..14a1a7eb0 100644 --- a/vp9/common/vp9_scan.h +++ b/vp9/common/vp9_scan.h @@ -191,8 +191,7 @@ static INLINE const int16_t* get_iscan_16x16(TX_TYPE tx_type) { } static INLINE int get_coef_context(const int16_t *neighbors, - uint8_t *token_cache, - int c) { + const uint8_t *token_cache, int c) { return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] + token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1; } diff --git a/vp9/common/vp9_tile_common.c b/vp9/common/vp9_tile_common.c index 1791c1a8f..e3035d076 100644 --- a/vp9/common/vp9_tile_common.c +++ b/vp9/common/vp9_tile_common.c @@ -10,6 +10,8 @@ #include 
"vp9/common/vp9_tile_common.h" +#include "vp9/common/vp9_onyxc_int.h" + #define MIN_TILE_WIDTH_B64 4 #define MAX_TILE_WIDTH_B64 64 @@ -17,8 +19,8 @@ static int to_sbs(n_mis) { return mi_cols_aligned_to_sb(n_mis) >> MI_BLOCK_SIZE_LOG2; } -static void vp9_get_tile_offsets(int *min_tile_off, int *max_tile_off, - int tile_idx, int log2_n_tiles, int n_mis) { +static void get_tile_offsets(int *min_tile_off, int *max_tile_off, + int tile_idx, int log2_n_tiles, int n_mis) { const int n_sbs = to_sbs(n_mis); const int sb_off1 = (tile_idx * n_sbs) >> log2_n_tiles; const int sb_off2 = ((tile_idx + 1) * n_sbs) >> log2_n_tiles; @@ -27,17 +29,14 @@ static void vp9_get_tile_offsets(int *min_tile_off, int *max_tile_off, *max_tile_off = MIN(sb_off2 << 3, n_mis); } -void vp9_get_tile_col_offsets(VP9_COMMON *cm, int tile_col_idx) { - vp9_get_tile_offsets(&cm->cur_tile_mi_col_start, &cm->cur_tile_mi_col_end, - tile_col_idx, cm->log2_tile_cols, cm->mi_cols); -} - -void vp9_get_tile_row_offsets(VP9_COMMON *cm, int tile_row_idx) { - vp9_get_tile_offsets(&cm->cur_tile_mi_row_start, &cm->cur_tile_mi_row_end, - tile_row_idx, cm->log2_tile_rows, cm->mi_rows); +void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm, + int row_idx, int col_idx) { + get_tile_offsets(&tile->mi_row_start, &tile->mi_row_end, + row_idx, cm->log2_tile_rows, cm->mi_rows); + get_tile_offsets(&tile->mi_col_start, &tile->mi_col_end, + col_idx, cm->log2_tile_cols, cm->mi_cols); } - void vp9_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols, int *max_log2_tile_cols) { const int sb_cols = to_sbs(mi_cols); diff --git a/vp9/common/vp9_tile_common.h b/vp9/common/vp9_tile_common.h index 6d14560b9..a110abbdb 100644 --- a/vp9/common/vp9_tile_common.h +++ b/vp9/common/vp9_tile_common.h @@ -11,11 +11,17 @@ #ifndef VP9_COMMON_VP9_TILE_COMMON_H_ #define VP9_COMMON_VP9_TILE_COMMON_H_ -#include "vp9/common/vp9_onyxc_int.h" +struct VP9Common; -void vp9_get_tile_col_offsets(VP9_COMMON *cm, int tile_col_idx); +typedef struct TileInfo { + int mi_row_start, mi_row_end; + int mi_col_start, mi_col_end; +} TileInfo; -void vp9_get_tile_row_offsets(VP9_COMMON *cm, int tile_row_idx); +// initializes 'tile->mi_(row|col)_(start|end)' for (row_idx, col_idx) based on +// 'cm->log2_tile_(rows|cols)' & 'cm->mi_(rows|cols)' +void vp9_tile_init(TileInfo *tile, const struct VP9Common *cm, + int row_idx, int col_idx); void vp9_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols, int *max_log2_tile_cols); diff --git a/vp9/common/vp9_treecoder.c b/vp9/common/vp9_treecoder.c index da1213d71..1805fb4d8 100644 --- a/vp9/common/vp9_treecoder.c +++ b/vp9/common/vp9_treecoder.c @@ -40,9 +40,7 @@ void vp9_tokens_from_tree_offset(struct vp9_token *p, vp9_tree t, tree2tok(p - offset, t, 0, 0, 0); } -static unsigned int convert_distribution(unsigned int i, - vp9_tree tree, - vp9_prob probs[], +static unsigned int convert_distribution(unsigned int i, vp9_tree tree, unsigned int branch_ct[][2], const unsigned int num_events[], unsigned int tok0_offset) { @@ -51,24 +49,25 @@ static unsigned int convert_distribution(unsigned int i, if (tree[i] <= 0) { left = num_events[-tree[i] - tok0_offset]; } else { - left = convert_distribution(tree[i], tree, probs, branch_ct, - num_events, tok0_offset); + left = convert_distribution(tree[i], tree, branch_ct, num_events, + tok0_offset); } if (tree[i + 1] <= 0) right = num_events[-tree[i + 1] - tok0_offset]; else - right = convert_distribution(tree[i + 1], tree, probs, branch_ct, - num_events, tok0_offset); + right = convert_distribution(tree[i + 1], 
tree, branch_ct, num_events, + tok0_offset); - probs[i>>1] = get_binary_prob(left, right); - branch_ct[i>>1][0] = left; - branch_ct[i>>1][1] = right; + branch_ct[i >> 1][0] = left; + branch_ct[i >> 1][1] = right; return left + right; } -void vp9_tree_probs_from_distribution(vp9_tree tree, vp9_prob probs[/* n-1 */], +void vp9_tree_probs_from_distribution(vp9_tree tree, unsigned int branch_ct[/* n-1 */][2], const unsigned int num_events[/* n */], unsigned int tok0_offset) { - convert_distribution(0, tree, probs, branch_ct, num_events, tok0_offset); + convert_distribution(0, tree, branch_ct, num_events, tok0_offset); } + + diff --git a/vp9/common/vp9_treecoder.h b/vp9/common/vp9_treecoder.h index 4ba171f46..9c776d61c 100644 --- a/vp9/common/vp9_treecoder.h +++ b/vp9/common/vp9_treecoder.h @@ -50,11 +50,11 @@ void vp9_tokens_from_tree_offset(struct vp9_token*, vp9_tree, int offset); probability updates. */ void vp9_tree_probs_from_distribution(vp9_tree tree, - vp9_prob probs[ /* n - 1 */ ], unsigned int branch_ct[ /* n - 1 */ ][2], const unsigned int num_events[ /* n */ ], unsigned int tok0_offset); + static INLINE vp9_prob clip_prob(int p) { return (p > 255) ? 255u : (p < 1) ? 1u : p; } @@ -81,21 +81,46 @@ static INLINE vp9_prob weighted_prob(int prob1, int prob2, int factor) { return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8); } -static INLINE vp9_prob merge_probs(vp9_prob pre_prob, vp9_prob prob, +static INLINE vp9_prob merge_probs(vp9_prob pre_prob, const unsigned int ct[2], unsigned int count_sat, unsigned int max_update_factor) { + const vp9_prob prob = get_binary_prob(ct[0], ct[1]); const unsigned int count = MIN(ct[0] + ct[1], count_sat); const unsigned int factor = max_update_factor * count / count_sat; return weighted_prob(pre_prob, prob, factor); } -static INLINE vp9_prob merge_probs2(vp9_prob pre_prob, - const unsigned int ct[2], - unsigned int count_sat, - unsigned int max_update_factor) { - return merge_probs(pre_prob, get_binary_prob(ct[0], ct[1]), ct, count_sat, - max_update_factor); +static unsigned int tree_merge_probs_impl(unsigned int i, + const vp9_tree_index *tree, + const vp9_prob *pre_probs, + const unsigned int *counts, + unsigned int count_sat, + unsigned int max_update_factor, + vp9_prob *probs) { + const int l = tree[i]; + const unsigned int left_count = (l <= 0) + ? counts[-l] + : tree_merge_probs_impl(l, tree, pre_probs, counts, + count_sat, max_update_factor, probs); + const int r = tree[i + 1]; + const unsigned int right_count = (r <= 0) + ? 
counts[-r] + : tree_merge_probs_impl(r, tree, pre_probs, counts, + count_sat, max_update_factor, probs); + const unsigned int ct[2] = { left_count, right_count }; + probs[i >> 1] = merge_probs(pre_probs[i >> 1], ct, + count_sat, max_update_factor); + return left_count + right_count; +} + +static void tree_merge_probs(const vp9_tree_index *tree, + const vp9_prob *pre_probs, + const unsigned int *counts, int offset, + unsigned int count_sat, + unsigned int max_update_factor, vp9_prob *probs) { + tree_merge_probs_impl(0, tree, pre_probs, &counts[-offset], + count_sat, max_update_factor, probs); } diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c index cfec36b42..ccf5aac17 100644 --- a/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -415,7 +415,7 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, res3 = _mm_packs_epi32(tmp6, tmp7); \ } -#define IDCT8x8_1D \ +#define IDCT8_1D \ /* Stage1 */ \ { \ const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ @@ -525,12 +525,12 @@ void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { // 2-D for (i = 0; i < 2; i++) { - // 8x8 Transpose is copied from vp9_short_fdct8x8_sse2() + // 8x8 Transpose is copied from vp9_fdct8x8_sse2() TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); // 4-stage 1D idct8x8 - IDCT8x8_1D + IDCT8_1D } // Final rounding and shift @@ -638,12 +638,12 @@ static void idct8_1d_sse2(__m128i *in) { in6 = in[6]; in7 = in[7]; - // 8x8 Transpose is copied from vp9_short_fdct8x8_sse2() + // 8x8 Transpose is copied from vp9_fdct8x8_sse2() TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); // 4-stage 1D idct8x8 - IDCT8x8_1D + IDCT8_1D in[0] = in0; in[1] = in1; in[2] = in2; @@ -1068,7 +1068,7 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { in4, in5, in6, in7) // 1D idct8x8 - IDCT8x8_1D + IDCT8_1D // Final rounding and shift in0 = _mm_adds_epi16(in0, final_rounding); @@ -1099,7 +1099,7 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { RECON_AND_STORE(dest, in7); } -#define IDCT16x16_1D \ +#define IDCT16_1D \ /* Stage2 */ \ { \ const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \ @@ -1321,7 +1321,7 @@ void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, in12, in13, in14, in15); } - IDCT16x16_1D + IDCT16_1D // Stage7 if (i == 0) { @@ -2703,7 +2703,7 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero; - IDCT16x16_1D + IDCT16_1D // Stage7 in0 = _mm_add_epi16(stp2_0, stp1_15); @@ -2785,6 +2785,698 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, input += 8; \ } \ +#define IDCT32_1D \ +/* Stage1 */ \ +{ \ + const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); \ + const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); \ + const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); \ + const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); \ + \ + const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); \ + const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); \ + const __m128i lo_25_7= _mm_unpacklo_epi16(in25, in7); \ + const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); \ + \ + const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); \ + const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); \ + const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); 
\ + const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); \ + \ + const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); \ + const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); \ + const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); \ + const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); \ + \ + MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ + stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \ + stp1_17, stp1_30) \ + MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \ + stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \ + stp1_19, stp1_28) \ + MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ + stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ + stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ + stp1_23, stp1_24) \ +} \ +\ +/* Stage2 */ \ +{ \ + const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); \ + const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); \ + const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); \ + const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); \ + \ + const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); \ + const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); \ + const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); \ + const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); \ + \ + MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ + stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ + stp2_14) \ + MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ + stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \ + stp2_11, stp2_12) \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ + stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ + stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ + stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ + \ + stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ + stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ + stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ + stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ + \ + stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ + stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ + stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ + stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ + \ + stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ + stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ + stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ +} \ +\ +/* Stage3 */ \ +{ \ + const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); \ + const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); \ + const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); \ + const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); \ + \ + const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ + const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ + \ + MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \ + stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \ + stp1_6) \ + \ + stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ + stp1_11 = 
_mm_add_epi16(stp2_11, stp2_10); \ + stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ + \ + MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ + stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ + stp1_18, stp1_29) \ + MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ + stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ + stp1_22, stp1_25) \ + \ + stp1_16 = stp2_16; \ + stp1_31 = stp2_31; \ + stp1_19 = stp2_19; \ + stp1_20 = stp2_20; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_27 = stp2_27; \ + stp1_28 = stp2_28; \ +} \ +\ +/* Stage4 */ \ +{ \ + const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); \ + const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); \ + const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); \ + const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + \ + MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \ + stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \ + stp2_2, stp2_3) \ + \ + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ + stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ + stp2_10, stp2_13) \ + \ + stp2_8 = stp1_8; \ + stp2_15 = stp1_15; \ + stp2_11 = stp1_11; \ + stp2_12 = stp1_12; \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ + stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ + stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ + stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ + stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ + stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ + \ + stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ + stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ + stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ + stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ + stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ +} \ +\ +/* Stage5 */ \ +{ \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ + \ + const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ + const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, 
stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_4 = stp2_4; \ + stp1_7 = stp2_7; \ + \ + stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + \ + MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ + stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ + stp1_19, stp1_28) \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ + stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + \ + stp1_22 = stp2_22; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_25 = stp2_25; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ +} \ +\ +/* Stage6 */ \ +{ \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ + \ + stp2_8 = stp1_8; \ + stp2_9 = stp1_9; \ + stp2_14 = stp1_14; \ + stp2_15 = stp1_15; \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ + stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ + stp2_13, stp2_11, stp2_12) \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ + stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ + stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ + stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ + stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ + stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ + \ + stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ + stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ + stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ + stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ + stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ +} \ +\ +/* Stage7 */ \ +{ \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ + const __m128i lo_23_24 = 
_mm_unpacklo_epi16(stp2_23, stp2_24); \ + const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ + stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ + stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ + stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ + stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ + stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ + stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ + stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ + stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ + stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + stp1_18 = stp2_18; \ + stp1_19 = stp2_19; \ + \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ + stp1_23, stp1_24) \ + \ + stp1_28 = stp2_28; \ + stp1_29 = stp2_29; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ +} + +// Only upper-left 8x8 has non-zero coeff +void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, + int stride) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1<<5); + + // idct constants for each stage + const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); + const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); + const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); + const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); + const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); + const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); + const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); + const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); + const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); 
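+  // Editorial note (sketch of the butterfly pattern, not part of the
+  // original change): each stgN constant packs two 16-bit cosines into
+  // every 32-bit lane, so a single _mm_madd_epi16 on an unpacked input
+  // pair computes one butterfly term per lane. For example, with stg1_0:
+  //   lo = _mm_unpacklo_epi16(in1, in31);   // in1[i], in31[i], ...
+  //   t  = _mm_madd_epi16(lo, stg1_0);      // in1*cospi_31_64 - in31*cospi_1_64
+  //   t  = _mm_srai_epi32(_mm_add_epi32(t, rounding), DCT_CONST_BITS);
+  // which is what MULTIPLICATION_AND_ADD in the IDCT32_1D stages above
+  // expands to for each output.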
+  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
+          in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
+          in24, in25, in26, in27, in28, in29, in30, in31;
+  __m128i col[128];
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
+          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
+          stp1_30, stp1_31;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
+          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
+          stp2_30, stp2_31;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int i, j, i32;
+
+  // We work on an 8x32 block each time, and loop 8 times for the 2-D
+  // 32x32 idct.
+  for (i = 0; i < 8; i++) {
+    i32 = (i << 5);
+    if (i == 0) {
+      // First 1-D idct: first 8 rows
+      // Load input data.
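+      // Editorial note (per the LOAD_DQCOEFF definition earlier in this
+      // file): each invocation pulls eight int16 coefficients into one xmm
+      // register and advances `input` by 8, so every group of four loads
+      // (inN, inN+8, inN+16, inN+24) consumes one complete 32-coefficient
+      // row. The eight groups below therefore cover rows 0-7, the only
+      // rows that can hold non-zero coefficients in the _34 case.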
+ LOAD_DQCOEFF(in0, input); + LOAD_DQCOEFF(in8, input); + LOAD_DQCOEFF(in16, input); + LOAD_DQCOEFF(in24, input); + LOAD_DQCOEFF(in1, input); + LOAD_DQCOEFF(in9, input); + LOAD_DQCOEFF(in17, input); + LOAD_DQCOEFF(in25, input); + LOAD_DQCOEFF(in2, input); + LOAD_DQCOEFF(in10, input); + LOAD_DQCOEFF(in18, input); + LOAD_DQCOEFF(in26, input); + LOAD_DQCOEFF(in3, input); + LOAD_DQCOEFF(in11, input); + LOAD_DQCOEFF(in19, input); + LOAD_DQCOEFF(in27, input); + + LOAD_DQCOEFF(in4, input); + LOAD_DQCOEFF(in12, input); + LOAD_DQCOEFF(in20, input); + LOAD_DQCOEFF(in28, input); + LOAD_DQCOEFF(in5, input); + LOAD_DQCOEFF(in13, input); + LOAD_DQCOEFF(in21, input); + LOAD_DQCOEFF(in29, input); + LOAD_DQCOEFF(in6, input); + LOAD_DQCOEFF(in14, input); + LOAD_DQCOEFF(in22, input); + LOAD_DQCOEFF(in30, input); + LOAD_DQCOEFF(in7, input); + LOAD_DQCOEFF(in15, input); + LOAD_DQCOEFF(in23, input); + LOAD_DQCOEFF(in31, input); + + // Transpose 32x8 block to 8x32 block + TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17, + in18, in19, in20, in21, in22, in23); + TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25, + in26, in27, in28, in29, in30, in31); + } else if (i < 4) { + // First 1-D idct: next 24 zero-coeff rows + col[i32 + 0] = _mm_setzero_si128(); + col[i32 + 1] = _mm_setzero_si128(); + col[i32 + 2] = _mm_setzero_si128(); + col[i32 + 3] = _mm_setzero_si128(); + col[i32 + 4] = _mm_setzero_si128(); + col[i32 + 5] = _mm_setzero_si128(); + col[i32 + 6] = _mm_setzero_si128(); + col[i32 + 7] = _mm_setzero_si128(); + col[i32 + 8] = _mm_setzero_si128(); + col[i32 + 9] = _mm_setzero_si128(); + col[i32 + 10] = _mm_setzero_si128(); + col[i32 + 11] = _mm_setzero_si128(); + col[i32 + 12] = _mm_setzero_si128(); + col[i32 + 13] = _mm_setzero_si128(); + col[i32 + 14] = _mm_setzero_si128(); + col[i32 + 15] = _mm_setzero_si128(); + col[i32 + 16] = _mm_setzero_si128(); + col[i32 + 17] = _mm_setzero_si128(); + col[i32 + 18] = _mm_setzero_si128(); + col[i32 + 19] = _mm_setzero_si128(); + col[i32 + 20] = _mm_setzero_si128(); + col[i32 + 21] = _mm_setzero_si128(); + col[i32 + 22] = _mm_setzero_si128(); + col[i32 + 23] = _mm_setzero_si128(); + col[i32 + 24] = _mm_setzero_si128(); + col[i32 + 25] = _mm_setzero_si128(); + col[i32 + 26] = _mm_setzero_si128(); + col[i32 + 27] = _mm_setzero_si128(); + col[i32 + 28] = _mm_setzero_si128(); + col[i32 + 29] = _mm_setzero_si128(); + col[i32 + 30] = _mm_setzero_si128(); + col[i32 + 31] = _mm_setzero_si128(); + continue; + } else { + // Second 1-D idct + j = i - 4; + + // Transpose 32x8 block to 8x32 block + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], + col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4, + in5, in6, in7); + j += 4; + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], + col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10, + in11, in12, in13, in14, in15); + j += 4; + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], + col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18, + in19, in20, in21, in22, in23); + j += 4; + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], + col[j * 8 + 6], 
col[j * 8 + 7], in24, in25, in26, in27,
+                    in28, in29, in30, in31);
+    }
+
+    IDCT32_1D
+
+    // final stage
+    if (i < 4) {
+      // 1-D: Store 32 intermediate results for each 8x32 block.
+      col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
+      col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
+      col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
+      col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
+      col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
+      col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
+      col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
+      col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
+      col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
+      col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
+      col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
+      col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
+      col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
+      col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
+      col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
+      col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
+      col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
+      col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
+      col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
+      col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
+      col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
+      col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
+      col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
+      col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
+      col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
+      col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
+      col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
+      col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
+      col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
+      col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
+      col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
+      col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
+    } else {
+      const __m128i zero = _mm_setzero_si128();
+
+      // 2-D: Calculate the results and store them to destination.
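+      // Editorial note (scalar equivalent, for reference): the adds/subs
+      // below are the last butterfly stage, and the round-and-shift that
+      // follows computes out = (x + 32) >> 6, matching the C reference's
+      // ROUND_POWER_OF_TWO(x, 6). RECON_AND_STORE then widens the
+      // destination pixels to 16 bits, adds the residual, and packs back
+      // with unsigned saturation.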
+ in0 = _mm_add_epi16(stp1_0, stp1_31); + in1 = _mm_add_epi16(stp1_1, stp1_30); + in2 = _mm_add_epi16(stp1_2, stp1_29); + in3 = _mm_add_epi16(stp1_3, stp1_28); + in4 = _mm_add_epi16(stp1_4, stp1_27); + in5 = _mm_add_epi16(stp1_5, stp1_26); + in6 = _mm_add_epi16(stp1_6, stp1_25); + in7 = _mm_add_epi16(stp1_7, stp1_24); + in8 = _mm_add_epi16(stp1_8, stp1_23); + in9 = _mm_add_epi16(stp1_9, stp1_22); + in10 = _mm_add_epi16(stp1_10, stp1_21); + in11 = _mm_add_epi16(stp1_11, stp1_20); + in12 = _mm_add_epi16(stp1_12, stp1_19); + in13 = _mm_add_epi16(stp1_13, stp1_18); + in14 = _mm_add_epi16(stp1_14, stp1_17); + in15 = _mm_add_epi16(stp1_15, stp1_16); + in16 = _mm_sub_epi16(stp1_15, stp1_16); + in17 = _mm_sub_epi16(stp1_14, stp1_17); + in18 = _mm_sub_epi16(stp1_13, stp1_18); + in19 = _mm_sub_epi16(stp1_12, stp1_19); + in20 = _mm_sub_epi16(stp1_11, stp1_20); + in21 = _mm_sub_epi16(stp1_10, stp1_21); + in22 = _mm_sub_epi16(stp1_9, stp1_22); + in23 = _mm_sub_epi16(stp1_8, stp1_23); + in24 = _mm_sub_epi16(stp1_7, stp1_24); + in25 = _mm_sub_epi16(stp1_6, stp1_25); + in26 = _mm_sub_epi16(stp1_5, stp1_26); + in27 = _mm_sub_epi16(stp1_4, stp1_27); + in28 = _mm_sub_epi16(stp1_3, stp1_28); + in29 = _mm_sub_epi16(stp1_2, stp1_29); + in30 = _mm_sub_epi16(stp1_1, stp1_30); + in31 = _mm_sub_epi16(stp1_0, stp1_31); + + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + in8 = _mm_adds_epi16(in8, final_rounding); + in9 = _mm_adds_epi16(in9, final_rounding); + in10 = _mm_adds_epi16(in10, final_rounding); + in11 = _mm_adds_epi16(in11, final_rounding); + in12 = _mm_adds_epi16(in12, final_rounding); + in13 = _mm_adds_epi16(in13, final_rounding); + in14 = _mm_adds_epi16(in14, final_rounding); + in15 = _mm_adds_epi16(in15, final_rounding); + in16 = _mm_adds_epi16(in16, final_rounding); + in17 = _mm_adds_epi16(in17, final_rounding); + in18 = _mm_adds_epi16(in18, final_rounding); + in19 = _mm_adds_epi16(in19, final_rounding); + in20 = _mm_adds_epi16(in20, final_rounding); + in21 = _mm_adds_epi16(in21, final_rounding); + in22 = _mm_adds_epi16(in22, final_rounding); + in23 = _mm_adds_epi16(in23, final_rounding); + in24 = _mm_adds_epi16(in24, final_rounding); + in25 = _mm_adds_epi16(in25, final_rounding); + in26 = _mm_adds_epi16(in26, final_rounding); + in27 = _mm_adds_epi16(in27, final_rounding); + in28 = _mm_adds_epi16(in28, final_rounding); + in29 = _mm_adds_epi16(in29, final_rounding); + in30 = _mm_adds_epi16(in30, final_rounding); + in31 = _mm_adds_epi16(in31, final_rounding); + + in0 = _mm_srai_epi16(in0, 6); + in1 = _mm_srai_epi16(in1, 6); + in2 = _mm_srai_epi16(in2, 6); + in3 = _mm_srai_epi16(in3, 6); + in4 = _mm_srai_epi16(in4, 6); + in5 = _mm_srai_epi16(in5, 6); + in6 = _mm_srai_epi16(in6, 6); + in7 = _mm_srai_epi16(in7, 6); + in8 = _mm_srai_epi16(in8, 6); + in9 = _mm_srai_epi16(in9, 6); + in10 = _mm_srai_epi16(in10, 6); + in11 = _mm_srai_epi16(in11, 6); + in12 = _mm_srai_epi16(in12, 6); + in13 = _mm_srai_epi16(in13, 6); + in14 = _mm_srai_epi16(in14, 6); + in15 = _mm_srai_epi16(in15, 6); + in16 = _mm_srai_epi16(in16, 6); + in17 = _mm_srai_epi16(in17, 6); + in18 = _mm_srai_epi16(in18, 6); + in19 = _mm_srai_epi16(in19, 6); + in20 = _mm_srai_epi16(in20, 6); + in21 = 
_mm_srai_epi16(in21, 6); + in22 = _mm_srai_epi16(in22, 6); + in23 = _mm_srai_epi16(in23, 6); + in24 = _mm_srai_epi16(in24, 6); + in25 = _mm_srai_epi16(in25, 6); + in26 = _mm_srai_epi16(in26, 6); + in27 = _mm_srai_epi16(in27, 6); + in28 = _mm_srai_epi16(in28, 6); + in29 = _mm_srai_epi16(in29, 6); + in30 = _mm_srai_epi16(in30, 6); + in31 = _mm_srai_epi16(in31, 6); + + RECON_AND_STORE(dest, in0); + RECON_AND_STORE(dest, in1); + RECON_AND_STORE(dest, in2); + RECON_AND_STORE(dest, in3); + RECON_AND_STORE(dest, in4); + RECON_AND_STORE(dest, in5); + RECON_AND_STORE(dest, in6); + RECON_AND_STORE(dest, in7); + RECON_AND_STORE(dest, in8); + RECON_AND_STORE(dest, in9); + RECON_AND_STORE(dest, in10); + RECON_AND_STORE(dest, in11); + RECON_AND_STORE(dest, in12); + RECON_AND_STORE(dest, in13); + RECON_AND_STORE(dest, in14); + RECON_AND_STORE(dest, in15); + RECON_AND_STORE(dest, in16); + RECON_AND_STORE(dest, in17); + RECON_AND_STORE(dest, in18); + RECON_AND_STORE(dest, in19); + RECON_AND_STORE(dest, in20); + RECON_AND_STORE(dest, in21); + RECON_AND_STORE(dest, in22); + RECON_AND_STORE(dest, in23); + RECON_AND_STORE(dest, in24); + RECON_AND_STORE(dest, in25); + RECON_AND_STORE(dest, in26); + RECON_AND_STORE(dest, in27); + RECON_AND_STORE(dest, in28); + RECON_AND_STORE(dest, in29); + RECON_AND_STORE(dest, in30); + RECON_AND_STORE(dest, in31); + + dest += 8 - (stride * 32); + } + } +} + void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, int stride) { const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); @@ -3009,336 +3701,7 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, in28, in29, in30, in31); } - // Stage1 - { - const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); - const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); - const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); - const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); - - const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); - const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); - const __m128i lo_25_7= _mm_unpacklo_epi16(in25, in7); - const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); - - const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); - const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); - const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); - const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); - - const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); - const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); - const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); - const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); - - MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, - stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, - stp1_17, stp1_30) - MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, - stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, - stp1_19, stp1_28) - MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, - stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, - stp1_21, stp1_26) - MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, - stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, - stp1_23, stp1_24) - } - - // Stage2 - { - const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); - const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); - const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); - const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); - - const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); - const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); - const __m128i lo_26_6 = 
_mm_unpacklo_epi16(in26, in6); - const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); - - MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, - stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, - stp2_14) - MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, - stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, - stp2_11, stp2_12) - - stp2_16 = _mm_add_epi16(stp1_16, stp1_17); - stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); - stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); - stp2_19 = _mm_add_epi16(stp1_19, stp1_18); - - stp2_20 = _mm_add_epi16(stp1_20, stp1_21); - stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); - stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); - stp2_23 = _mm_add_epi16(stp1_23, stp1_22); - - stp2_24 = _mm_add_epi16(stp1_24, stp1_25); - stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); - stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); - stp2_27 = _mm_add_epi16(stp1_27, stp1_26); - - stp2_28 = _mm_add_epi16(stp1_28, stp1_29); - stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); - stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); - stp2_31 = _mm_add_epi16(stp1_31, stp1_30); - } - - // Stage3 - { - const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); - const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); - const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); - const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); - - const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); - const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); - - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); - - MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, - stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, - stp1_6) - - stp1_8 = _mm_add_epi16(stp2_8, stp2_9); - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); - stp1_12 = _mm_add_epi16(stp2_12, stp2_13); - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); - - MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, - stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, - stp1_18, stp1_29) - MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, - stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, - stp1_22, stp1_25) - - stp1_16 = stp2_16; - stp1_31 = stp2_31; - stp1_19 = stp2_19; - stp1_20 = stp2_20; - stp1_23 = stp2_23; - stp1_24 = stp2_24; - stp1_27 = stp2_27; - stp1_28 = stp2_28; - } - - // Stage4 - { - const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); - const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); - const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); - const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); - - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); - - MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, - stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, - stp2_2, stp2_3) - - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); - stp2_6 = 
_mm_sub_epi16(stp1_7, stp1_6); - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); - - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, - stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, - stp2_10, stp2_13) - - stp2_8 = stp1_8; - stp2_15 = stp1_15; - stp2_11 = stp1_11; - stp2_12 = stp1_12; - - stp2_16 = _mm_add_epi16(stp1_16, stp1_19); - stp2_17 = _mm_add_epi16(stp1_17, stp1_18); - stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); - stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); - stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); - stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); - stp2_22 = _mm_add_epi16(stp1_22, stp1_21); - stp2_23 = _mm_add_epi16(stp1_23, stp1_20); - - stp2_24 = _mm_add_epi16(stp1_24, stp1_27); - stp2_25 = _mm_add_epi16(stp1_25, stp1_26); - stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); - stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); - stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); - stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); - stp2_30 = _mm_add_epi16(stp1_29, stp1_30); - stp2_31 = _mm_add_epi16(stp1_28, stp1_31); - } - - // Stage5 - { - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); - - const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); - const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); - - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); - - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); - - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - - stp1_5 = _mm_packs_epi32(tmp0, tmp1); - stp1_6 = _mm_packs_epi32(tmp2, tmp3); - - stp1_4 = stp2_4; - stp1_7 = stp2_7; - - stp1_8 = _mm_add_epi16(stp2_8, stp2_11); - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); - stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); - stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); - stp1_15 = _mm_add_epi16(stp2_15, stp2_12); - - stp1_16 = stp2_16; - stp1_17 = stp2_17; - - MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, - stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, - stp1_19, stp1_28) - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, - stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, - stp1_21, stp1_26) - - stp1_22 = stp2_22; - stp1_23 = stp2_23; - stp1_24 = stp2_24; - stp1_25 = stp2_25; - stp1_30 = stp2_30; - stp1_31 = stp2_31; - } - - // Stage6 - { - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); - const __m128i hi_11_12 = 
_mm_unpackhi_epi16(stp1_11, stp1_12); - - stp2_0 = _mm_add_epi16(stp1_0, stp1_7); - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); - stp2_3 = _mm_add_epi16(stp1_3, stp1_4); - stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); - stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); - - stp2_8 = stp1_8; - stp2_9 = stp1_9; - stp2_14 = stp1_14; - stp2_15 = stp1_15; - - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, - stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, - stp2_13, stp2_11, stp2_12) - - stp2_16 = _mm_add_epi16(stp1_16, stp1_23); - stp2_17 = _mm_add_epi16(stp1_17, stp1_22); - stp2_18 = _mm_add_epi16(stp1_18, stp1_21); - stp2_19 = _mm_add_epi16(stp1_19, stp1_20); - stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); - stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); - stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); - stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); - - stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); - stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); - stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); - stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); - stp2_28 = _mm_add_epi16(stp1_27, stp1_28); - stp2_29 = _mm_add_epi16(stp1_26, stp1_29); - stp2_30 = _mm_add_epi16(stp1_25, stp1_30); - stp2_31 = _mm_add_epi16(stp1_24, stp1_31); - } - - // Stage7 - { - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); - - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); - const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); - const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); - - stp1_0 = _mm_add_epi16(stp2_0, stp2_15); - stp1_1 = _mm_add_epi16(stp2_1, stp2_14); - stp1_2 = _mm_add_epi16(stp2_2, stp2_13); - stp1_3 = _mm_add_epi16(stp2_3, stp2_12); - stp1_4 = _mm_add_epi16(stp2_4, stp2_11); - stp1_5 = _mm_add_epi16(stp2_5, stp2_10); - stp1_6 = _mm_add_epi16(stp2_6, stp2_9); - stp1_7 = _mm_add_epi16(stp2_7, stp2_8); - stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); - stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); - stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); - stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); - stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); - stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); - stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); - stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); - - stp1_16 = stp2_16; - stp1_17 = stp2_17; - stp1_18 = stp2_18; - stp1_19 = stp2_19; - - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, - stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, - stp1_21, stp1_26) - MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, - stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, - stp1_23, stp1_24) - - stp1_28 = stp2_28; - stp1_29 = stp2_29; - stp1_30 = stp2_30; - stp1_31 = stp2_31; - } + IDCT32_1D // final stage if (i < 4) { diff --git a/vp9/common/x86/vp9_intrapred_ssse3.asm b/vp9/common/x86/vp9_intrapred_ssse3.asm index 568e2080e..88df9b2d1 100644 --- a/vp9/common/x86/vp9_intrapred_ssse3.asm +++ b/vp9/common/x86/vp9_intrapred_ssse3.asm @@ -991,7 +991,7 @@ cglobal d207_predictor_32x32, 4, 5, 8, dst, stride, stride3, left, goffset lea dst8q, [dst8q+strideq*4] ; output 2nd half of 3rd 8 lines and half of 4th 8 lines - mova m0, [sh_b23456789abcdefff] + mova m0, [GLOBAL(sh_b23456789abcdefff)] mova [dstq +16], m7 mova [dst8q 
], m7 pshufb m7, m0 diff --git a/vp9/common/x86/vp9_loopfilter_intrin_avx2.c b/vp9/common/x86/vp9_loopfilter_intrin_avx2.c new file mode 100644 index 000000000..3c5cb8ffd --- /dev/null +++ b/vp9/common/x86/vp9_loopfilter_intrin_avx2.c @@ -0,0 +1,943 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <immintrin.h> /* AVX2 */ + +static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p, + const unsigned char *_blimit, const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i mask, hev, flat, flat2; + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); + __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; + __m128i abs_p1p0; + + const __m128i thresh = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _thresh[0])); + const __m128i limit = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _limit[0])); + const __m128i blimit = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _blimit[0])); + + q4p4 = _mm_loadl_epi64((__m128i *) (s - 5 * p)); + q4p4 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *) (s + 4 * p))); + q3p3 = _mm_loadl_epi64((__m128i *) (s - 4 * p)); + q3p3 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *) (s + 3 * p))); + q2p2 = _mm_loadl_epi64((__m128i *) (s - 3 * p)); + q2p2 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *) (s + 2 * p))); + q1p1 = _mm_loadl_epi64((__m128i *) (s - 2 * p)); + q1p1 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *) (s + 1 * p))); + p1q1 = _mm_shuffle_epi32(q1p1, 78); + q0p0 = _mm_loadl_epi64((__m128i *) (s - 1 * p)); + q0p0 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *) (s - 0 * p))); + p0q0 = _mm_shuffle_epi32(q0p0, 78); + + { + __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; + abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), + _mm_subs_epu8(q0p0, q1p1)); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + fe = _mm_set1_epi8(0xfe); + ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), + _mm_subs_epu8(p0q0, q0p0)); + abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), + _mm_subs_epu8(p1q1, q1p1)); + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), + _mm_subs_epu8(q1p1, q2p2)), + _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), + _mm_subs_epu8(q2p2, q3p3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i 
t80 = _mm_set1_epi8(0x80); + const __m128i t1 = _mm_set1_epi16(0x1); + __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); + __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); + __m128i qs0 = _mm_xor_si128(p0q0, t80); + __m128i qs1 = _mm_xor_si128(p1q1, t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; + __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; + + filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, qs0ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + /* (vp9_filter + 3 * (qs0 - ps0)) & mask */ + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + filter1 = _mm_unpacklo_epi8(zero, filter1); + filter1 = _mm_srai_epi16(filter1, 0xB); + filter2 = _mm_unpacklo_epi8(zero, filter2); + filter2 = _mm_srai_epi16(filter2, 0xB); + + /* Filter1 >> 3 */ + filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); + qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); + + /* filt >> 1 */ + filt = _mm_adds_epi16(filter1, t1); + filt = _mm_srai_epi16(filt, 1); + filt = _mm_andnot_si128( + _mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), filt); + filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); + qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); + // loopfilter done + + { + __m128i work; + flat = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), + _mm_subs_epu8(q0p0, q2p2)), + _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), + _mm_subs_epu8(q0p0, q3p3))); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + q5p5 = _mm_loadl_epi64((__m128i *) (s - 6 * p)); + q5p5 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q5p5), + (__m64 *) (s + 5 * p))); + + q6p6 = _mm_loadl_epi64((__m128i *) (s - 7 * p)); + q6p6 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q6p6), + (__m64 *) (s + 6 * p))); + + flat2 = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), + _mm_subs_epu8(q0p0, q4p4)), + _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), + _mm_subs_epu8(q0p0, q5p5))); + + q7p7 = _mm_loadl_epi64((__m128i *) (s - 8 * p)); + q7p7 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q7p7), + (__m64 *) (s + 7 * p))); + + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), + _mm_subs_epu8(q0p0, q6p6)), + _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), + _mm_subs_epu8(q0p0, q7p7))); + + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + { + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; + __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; + __m128i pixelFilter_p, pixelFilter_q; + __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; + __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; + + p7_16 = _mm_unpacklo_epi8(q7p7, zero); + p6_16 = _mm_unpacklo_epi8(q6p6, zero); + p5_16 = _mm_unpacklo_epi8(q5p5, zero); + p4_16 = _mm_unpacklo_epi8(q4p4, zero); + p3_16 = _mm_unpacklo_epi8(q3p3, zero); + p2_16 = 
_mm_unpacklo_epi8(q2p2, zero); + p1_16 = _mm_unpacklo_epi8(q1p1, zero); + p0_16 = _mm_unpacklo_epi8(q0p0, zero); + q0_16 = _mm_unpackhi_epi8(q0p0, zero); + q1_16 = _mm_unpackhi_epi8(q1p1, zero); + q2_16 = _mm_unpackhi_epi8(q2p2, zero); + q3_16 = _mm_unpackhi_epi8(q3p3, zero); + q4_16 = _mm_unpackhi_epi8(q4p4, zero); + q5_16 = _mm_unpackhi_epi8(q5p5, zero); + q6_16 = _mm_unpackhi_epi8(q6p6, zero); + q7_16 = _mm_unpackhi_epi8(q7p7, zero); + + pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), + _mm_add_epi16(p4_16, p3_16)); + pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), + _mm_add_epi16(q4_16, q3_16)); + + pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, + _mm_add_epi16(p2_16, p1_16)); + pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, + _mm_add_epi16(q2_16, q1_16)); + pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + pixelFilter_p = _mm_add_epi16(eight, + _mm_add_epi16(pixelFilter_p, pixelFilter_q)); + pixetFilter_p2p1p0 = _mm_add_epi16(four, + _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), + 4); + flat2_q0p0 = _mm_packus_epi16(res_p, res_q); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(p3_16, p0_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(q3_16, q0_16)), 3); + + flat_q0p0 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(p7_16, p7_16); + sum_q7 = _mm_add_epi16(q7_16, q7_16); + sum_p3 = _mm_add_epi16(p3_16, p3_16); + sum_q3 = _mm_add_epi16(q3_16, q3_16); + + pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), + 4); + flat2_q1p1 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(sum_p3, p1_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, + _mm_add_epi16(sum_q3, q1_16)), 3); + flat_q1p1 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + sum_p3 = _mm_add_epi16(sum_p3, p3_16); + sum_q3 = _mm_add_epi16(sum_q3, q3_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), + 4); + flat2_q2p2 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(sum_p3, p2_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, + _mm_add_epi16(sum_q3, q2_16)), 3); + flat_q2p2 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, 
p4_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), + 4); + flat2_q3p3 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), + 4); + flat2_q4p4 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), + 4); + flat2_q5p5 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), + 4); + flat2_q6p6 = _mm_packus_epi16(res_p, res_q); + } + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + flat = _mm_shuffle_epi32(flat, 68); + flat2 = _mm_shuffle_epi32(flat2, 68); + + q2p2 = _mm_andnot_si128(flat, q2p2); + flat_q2p2 = _mm_and_si128(flat, flat_q2p2); + q2p2 = _mm_or_si128(q2p2, flat_q2p2); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_q1p1 = _mm_and_si128(flat, flat_q1p1); + q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); + + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_q0p0 = _mm_and_si128(flat, flat_q0p0); + q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); + + q6p6 = _mm_andnot_si128(flat2, q6p6); + flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); + q6p6 = _mm_or_si128(q6p6, flat2_q6p6); + _mm_storel_epi64((__m128i *) (s - 7 * p), q6p6); + _mm_storeh_pi((__m64 *) (s + 6 * p), _mm_castsi128_ps(q6p6)); + + q5p5 = _mm_andnot_si128(flat2, q5p5); + flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); + q5p5 = _mm_or_si128(q5p5, flat2_q5p5); + _mm_storel_epi64((__m128i *) (s - 6 * p), q5p5); + _mm_storeh_pi((__m64 *) (s + 5 * p), _mm_castsi128_ps(q5p5)); + + q4p4 = _mm_andnot_si128(flat2, q4p4); + flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); + q4p4 = _mm_or_si128(q4p4, flat2_q4p4); + _mm_storel_epi64((__m128i *) (s - 5 * p), q4p4); + _mm_storeh_pi((__m64 *) (s + 4 * p), _mm_castsi128_ps(q4p4)); + + q3p3 = _mm_andnot_si128(flat2, q3p3); + flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); + q3p3 = _mm_or_si128(q3p3, flat2_q3p3); + _mm_storel_epi64((__m128i *) (s - 4 * p), q3p3); + _mm_storeh_pi((__m64 *) (s + 3 * p), _mm_castsi128_ps(q3p3)); + + q2p2 = _mm_andnot_si128(flat2, q2p2); + flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); + q2p2 = _mm_or_si128(q2p2, flat2_q2p2); + _mm_storel_epi64((__m128i *) (s - 3 * p), q2p2); + _mm_storeh_pi((__m64 *) (s + 2 * p), _mm_castsi128_ps(q2p2)); + + q1p1 = _mm_andnot_si128(flat2, q1p1); + flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); + q1p1 = _mm_or_si128(q1p1, flat2_q1p1); + _mm_storel_epi64((__m128i *) (s - 2 * p), q1p1); + _mm_storeh_pi((__m64 *) (s + 1 * p), 
_mm_castsi128_ps(q1p1)); + + q0p0 = _mm_andnot_si128(flat2, q0p0); + flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); + q0p0 = _mm_or_si128(q0p0, flat2_q0p0); + _mm_storel_epi64((__m128i *) (s - 1 * p), q0p0); + _mm_storeh_pi((__m64 *) (s - 0 * p), _mm_castsi128_ps(q0p0)); + } +} + +static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p, + const unsigned char *_blimit, const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i mask, hev, flat, flat2; + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); + __m128i p7, p6, p5; + __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; + __m128i q5, q6, q7; + + const __m128i thresh = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _thresh[0])); + const __m128i limit = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _limit[0])); + const __m128i blimit = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _blimit[0])); + + p4 = _mm_loadu_si128((__m128i *) (s - 5 * p)); + p3 = _mm_loadu_si128((__m128i *) (s - 4 * p)); + p2 = _mm_loadu_si128((__m128i *) (s - 3 * p)); + p1 = _mm_loadu_si128((__m128i *) (s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *) (s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *) (s - 0 * p)); + q1 = _mm_loadu_si128((__m128i *) (s + 1 * p)); + q2 = _mm_loadu_si128((__m128i *) (s + 2 * p)); + q3 = _mm_loadu_si128((__m128i *) (s + 3 * p)); + q4 = _mm_loadu_si128((__m128i *) (s + 4 * p)); + + { + const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), + _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), + _mm_subs_epu8(q0, q1)); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), + _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), + _mm_subs_epu8(q1, p1)); + __m128i work; + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + + __m128i ps1 = _mm_xor_si128(p1, t80); + __m128i ps0 = _mm_xor_si128(p0, t80); + __m128i qs0 = _mm_xor_si128(q0, t80); + __m128i qs1 = _mm_xor_si128(q1, t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1, + flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, + flat2_q5, 
flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, + flat_q2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + /* (vp9_filter + 3 * (qs0 - ps0)) & mask */ + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + /* Filter1 >> 3 */ + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + + /* Filter2 >> 3 */ + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + + /* filt >> 1 */ + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + filt = _mm_andnot_si128(hev, filt); + ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + // loopfilter done + + { + __m128i work; + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), + _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)), + _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)), + _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4))); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + p5 = _mm_loadu_si128((__m128i *) (s - 6 * p)); + q5 = _mm_loadu_si128((__m128i *) (s + 5 * p)); + flat2 = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)), + _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5))); + + flat2 = _mm_max_epu8(work, flat2); + p6 = _mm_loadu_si128((__m128i *) (s - 7 * p)); + q6 = _mm_loadu_si128((__m128i *) (s + 6 * p)); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)), + _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6))); + + flat2 = _mm_max_epu8(work, flat2); + + p7 = _mm_loadu_si128((__m128i *) (s - 8 * p)); + q7 = _mm_loadu_si128((__m128i *) (s + 7 * p)); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)), + _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7))); + + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + { + const __m256i eight = _mm256_set1_epi16(8); + const __m256i four = _mm256_set1_epi16(4); + __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, + q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, + p256_0, q256_0; + __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0, + pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, + res_q; + + p256_7 = _mm256_cvtepu8_epi16(p7); + 
p256_6 = _mm256_cvtepu8_epi16(p6); + p256_5 = _mm256_cvtepu8_epi16(p5); + p256_4 = _mm256_cvtepu8_epi16(p4); + p256_3 = _mm256_cvtepu8_epi16(p3); + p256_2 = _mm256_cvtepu8_epi16(p2); + p256_1 = _mm256_cvtepu8_epi16(p1); + p256_0 = _mm256_cvtepu8_epi16(p0); + q256_0 = _mm256_cvtepu8_epi16(q0); + q256_1 = _mm256_cvtepu8_epi16(q1); + q256_2 = _mm256_cvtepu8_epi16(q2); + q256_3 = _mm256_cvtepu8_epi16(q3); + q256_4 = _mm256_cvtepu8_epi16(q4); + q256_5 = _mm256_cvtepu8_epi16(q5); + q256_6 = _mm256_cvtepu8_epi16(q6); + q256_7 = _mm256_cvtepu8_epi16(q7); + + pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5), + _mm256_add_epi16(p256_4, p256_3)); + pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5), + _mm256_add_epi16(q256_4, q256_3)); + + pixetFilter_p2p1p0 = _mm256_add_epi16(p256_0, + _mm256_add_epi16(p256_2, p256_1)); + pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm256_add_epi16(q256_0, + _mm256_add_epi16(q256_2, q256_1)); + pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + + pixelFilter_p = _mm256_add_epi16(eight, + _mm256_add_epi16(pixelFilter_p, pixelFilter_q)); + + pixetFilter_p2p1p0 = _mm256_add_epi16(four, + _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(p256_7, p256_0)), 4); + + flat2_p0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(q256_7, q256_0)), 4); + + flat2_q0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(p256_3, p256_0)), 3); + + flat_p0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(q256_3, q256_0)), 3); + + flat_q0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + sum_p7 = _mm256_add_epi16(p256_7, p256_7); + + sum_q7 = _mm256_add_epi16(q256_7, q256_7); + + sum_p3 = _mm256_add_epi16(p256_3, p256_3); + + sum_q3 = _mm256_add_epi16(q256_3, q256_3); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(sum_p7, p256_1)), 4); + + flat2_p1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, + _mm256_add_epi16(sum_q7, q256_1)), 4); + + flat2_q1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2); + + pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(sum_p3, p256_1)), 3); + + flat_p1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixetFilter_q2q1q0, + _mm256_add_epi16(sum_q3, q256_1)), 3); + + flat_q1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = 
_mm256_add_epi16(sum_q7, q256_7); + + sum_p3 = _mm256_add_epi16(sum_p3, p256_3); + + sum_q3 = _mm256_add_epi16(sum_q3, q256_3); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(sum_p7, p256_2)), 4); + + flat2_p2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, + _mm256_add_epi16(sum_q7, q256_2)), 4); + + flat2_q2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1); + + pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(sum_p3, p256_2)), 3); + + flat_p2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixetFilter_q2q1q0, + _mm256_add_epi16(sum_q3, q256_2)), 3); + + flat_q2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(sum_p7, p256_3)), 4); + + flat2_p3 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, + _mm256_add_epi16(sum_q7, q256_3)), 4); + + flat2_q3 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(sum_p7, p256_4)), 4); + + flat2_p4 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, + _mm256_add_epi16(sum_q7, q256_4)), 4); + + flat2_q4 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(sum_p7, p256_5)), 4); + + flat2_p5 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, + _mm256_add_epi16(sum_q7, q256_5)), 4); + + flat2_q5 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(sum_p7, p256_6)), 4); + + flat2_p6 = 
_mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, + _mm256_add_epi16(sum_q7, q256_6)), 4); + + flat2_q6 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + } + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + p2 = _mm_andnot_si128(flat, p2); + flat_p2 = _mm_and_si128(flat, flat_p2); + p2 = _mm_or_si128(flat_p2, p2); + + p1 = _mm_andnot_si128(flat, ps1); + flat_p1 = _mm_and_si128(flat, flat_p1); + p1 = _mm_or_si128(flat_p1, p1); + + p0 = _mm_andnot_si128(flat, ps0); + flat_p0 = _mm_and_si128(flat, flat_p0); + p0 = _mm_or_si128(flat_p0, p0); + + q0 = _mm_andnot_si128(flat, qs0); + flat_q0 = _mm_and_si128(flat, flat_q0); + q0 = _mm_or_si128(flat_q0, q0); + + q1 = _mm_andnot_si128(flat, qs1); + flat_q1 = _mm_and_si128(flat, flat_q1); + q1 = _mm_or_si128(flat_q1, q1); + + q2 = _mm_andnot_si128(flat, q2); + flat_q2 = _mm_and_si128(flat, flat_q2); + q2 = _mm_or_si128(flat_q2, q2); + + p6 = _mm_andnot_si128(flat2, p6); + flat2_p6 = _mm_and_si128(flat2, flat2_p6); + p6 = _mm_or_si128(flat2_p6, p6); + _mm_storeu_si128((__m128i *) (s - 7 * p), p6); + + p5 = _mm_andnot_si128(flat2, p5); + flat2_p5 = _mm_and_si128(flat2, flat2_p5); + p5 = _mm_or_si128(flat2_p5, p5); + _mm_storeu_si128((__m128i *) (s - 6 * p), p5); + + p4 = _mm_andnot_si128(flat2, p4); + flat2_p4 = _mm_and_si128(flat2, flat2_p4); + p4 = _mm_or_si128(flat2_p4, p4); + _mm_storeu_si128((__m128i *) (s - 5 * p), p4); + + p3 = _mm_andnot_si128(flat2, p3); + flat2_p3 = _mm_and_si128(flat2, flat2_p3); + p3 = _mm_or_si128(flat2_p3, p3); + _mm_storeu_si128((__m128i *) (s - 4 * p), p3); + + p2 = _mm_andnot_si128(flat2, p2); + flat2_p2 = _mm_and_si128(flat2, flat2_p2); + p2 = _mm_or_si128(flat2_p2, p2); + _mm_storeu_si128((__m128i *) (s - 3 * p), p2); + + p1 = _mm_andnot_si128(flat2, p1); + flat2_p1 = _mm_and_si128(flat2, flat2_p1); + p1 = _mm_or_si128(flat2_p1, p1); + _mm_storeu_si128((__m128i *) (s - 2 * p), p1); + + p0 = _mm_andnot_si128(flat2, p0); + flat2_p0 = _mm_and_si128(flat2, flat2_p0); + p0 = _mm_or_si128(flat2_p0, p0); + _mm_storeu_si128((__m128i *) (s - 1 * p), p0); + + q0 = _mm_andnot_si128(flat2, q0); + flat2_q0 = _mm_and_si128(flat2, flat2_q0); + q0 = _mm_or_si128(flat2_q0, q0); + _mm_storeu_si128((__m128i *) (s - 0 * p), q0); + + q1 = _mm_andnot_si128(flat2, q1); + flat2_q1 = _mm_and_si128(flat2, flat2_q1); + q1 = _mm_or_si128(flat2_q1, q1); + _mm_storeu_si128((__m128i *) (s + 1 * p), q1); + + q2 = _mm_andnot_si128(flat2, q2); + flat2_q2 = _mm_and_si128(flat2, flat2_q2); + q2 = _mm_or_si128(flat2_q2, q2); + _mm_storeu_si128((__m128i *) (s + 2 * p), q2); + + q3 = _mm_andnot_si128(flat2, q3); + flat2_q3 = _mm_and_si128(flat2, flat2_q3); + q3 = _mm_or_si128(flat2_q3, q3); + _mm_storeu_si128((__m128i *) (s + 3 * p), q3); + + q4 = _mm_andnot_si128(flat2, q4); + flat2_q4 = _mm_and_si128(flat2, flat2_q4); + q4 = _mm_or_si128(flat2_q4, q4); + _mm_storeu_si128((__m128i *) (s + 4 * p), q4); + + q5 = _mm_andnot_si128(flat2, q5); + flat2_q5 = _mm_and_si128(flat2, flat2_q5); + q5 = _mm_or_si128(flat2_q5, q5); + _mm_storeu_si128((__m128i *) (s + 5 * p), q5); + + q6 = _mm_andnot_si128(flat2, q6); + flat2_q6 = _mm_and_si128(flat2, flat2_q6); + q6 = _mm_or_si128(flat2_q6, q6); + _mm_storeu_si128((__m128i *) (s + 6 * p), q6); + } +} + +void vp9_mb_lpf_horizontal_edge_w_avx2(unsigned char *s, int p, + const unsigned char *_blimit, const unsigned char *_limit, + const 
unsigned char *_thresh, int count) { + if (count == 1) + mb_lpf_horizontal_edge_w_avx2_8(s, p, _blimit, _limit, _thresh); + else + mb_lpf_horizontal_edge_w_avx2_16(s, p, _blimit, _limit, _thresh); +} diff --git a/vp9/decoder/vp9_dboolhuff.h b/vp9/decoder/vp9_dboolhuff.h index c86451649..fd8e74ca4 100644 --- a/vp9/decoder/vp9_dboolhuff.h +++ b/vp9/decoder/vp9_dboolhuff.h @@ -44,7 +44,7 @@ static int vp9_read(vp9_reader *br, int probability) { VP9_BD_VALUE bigsplit; int count; unsigned int range; - unsigned int split = 1 + (((br->range - 1) * probability) >> 8); + unsigned int split = ((br->range * probability) + (256 - probability)) >> 8; if (br->count < 0) vp9_reader_fill(br); diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index 33793eee0..9792d2c6d 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -72,7 +72,7 @@ static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd, } if (!cm->frame_parallel_decoding_mode) - update_tx_counts(bsize, context, tx_size, &cm->counts.tx); + ++get_tx_counts(bsize, context, &cm->counts.tx)[tx_size]; return tx_size; } @@ -91,8 +91,8 @@ static TX_SIZE read_tx_size(VP9_COMMON *const cm, MACROBLOCKD *const xd, static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE bsize, int mi_row, int mi_col, int segment_id) { const int mi_offset = mi_row * cm->mi_cols + mi_col; - const int bw = 1 << mi_width_log2(bsize); - const int bh = 1 << mi_height_log2(bsize); + const int bw = num_8x8_blocks_wide_lookup[bsize]; + const int bh = num_8x8_blocks_high_lookup[bsize]; const int xmis = MIN(cm->mi_cols - mi_col, bw); const int ymis = MIN(cm->mi_rows - mi_row, bh); int x, y; @@ -149,16 +149,17 @@ static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd, return segment_id; } -static uint8_t read_skip_coeff(VP9_COMMON *const cm, MACROBLOCKD *const xd, - int segment_id, vp9_reader *r) { - int skip_coeff = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP); - if (!skip_coeff) { +static int read_skip_coeff(VP9_COMMON *cm, const MACROBLOCKD *xd, + int segment_id, vp9_reader *r) { + if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { + return 1; + } else { const int ctx = vp9_get_pred_context_mbskip(xd); - skip_coeff = vp9_read(r, vp9_get_pred_prob_mbskip(cm, xd)); + const int skip = vp9_read(r, cm->fc.mbskip_probs[ctx]); if (!cm->frame_parallel_decoding_mode) - ++cm->counts.mbskip[ctx][skip_coeff]; + ++cm->counts.mbskip[ctx][skip]; + return skip; } - return skip_coeff; } static void read_intra_frame_mode_info(VP9_COMMON *const cm, @@ -311,7 +312,7 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd, } -static INLINE INTERPOLATIONFILTERTYPE read_switchable_filter_type( +static INLINE INTERPOLATION_TYPE read_switchable_filter_type( VP9_COMMON *const cm, MACROBLOCKD *const xd, vp9_reader *r) { const int ctx = vp9_get_pred_context_switchable_interp(xd); const int type = treed_read(r, vp9_switchable_interp_tree, @@ -414,6 +415,7 @@ static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd, static void read_inter_block_mode_info(VP9_COMMON *const cm, MACROBLOCKD *const xd, + const TileInfo *const tile, MODE_INFO *const mi, int mi_row, int mi_col, vp9_reader *r) { MB_MODE_INFO *const mbmi = &mi->mbmi; @@ -430,7 +432,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm, ref0 = mbmi->ref_frame[0]; is_compound = has_second_ref(mbmi); - vp9_find_mv_refs(cm, xd, mi, xd->last_mi, ref0, mbmi->ref_mvs[ref0], + vp9_find_mv_refs(cm, xd, tile, mi, 
xd->last_mi, ref0, mbmi->ref_mvs[ref0], mi_row, mi_col); inter_mode_ctx = mbmi->mode_context[ref0]; @@ -456,7 +458,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm, if (is_compound) { const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1]; - vp9_find_mv_refs(cm, xd, mi, xd->last_mi, + vp9_find_mv_refs(cm, xd, tile, mi, xd->last_mi, ref1, mbmi->ref_mvs[ref1], mi_row, mi_col); if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) { @@ -482,12 +484,12 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm, b_mode = read_inter_mode(cm, r, inter_mode_ctx); if (b_mode == NEARESTMV || b_mode == NEARMV) { - vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest[0], + vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, &nearest[0], &nearmv[0], j, 0, mi_row, mi_col); if (is_compound) - vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest[1], + vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, &nearest[1], &nearmv[1], j, 1, mi_row, mi_col); } @@ -523,6 +525,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm, static void read_inter_frame_mode_info(VP9_COMMON *const cm, MACROBLOCKD *const xd, + const TileInfo *const tile, MODE_INFO *const mi, int mi_row, int mi_col, vp9_reader *r) { MB_MODE_INFO *const mbmi = &mi->mbmi; @@ -537,17 +540,18 @@ static void read_inter_frame_mode_info(VP9_COMMON *const cm, !mbmi->skip_coeff || !inter_block, r); if (inter_block) - read_inter_block_mode_info(cm, xd, mi, mi_row, mi_col, r); + read_inter_block_mode_info(cm, xd, tile, mi, mi_row, mi_col, r); else read_intra_block_mode_info(cm, mi, r); } void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd, + const TileInfo *const tile, int mi_row, int mi_col, vp9_reader *r) { MODE_INFO *const mi = xd->mi_8x8[0]; const BLOCK_SIZE bsize = mi->mbmi.sb_type; - const int bw = 1 << mi_width_log2(bsize); - const int bh = 1 << mi_height_log2(bsize); + const int bw = num_8x8_blocks_wide_lookup[bsize]; + const int bh = num_8x8_blocks_high_lookup[bsize]; const int y_mis = MIN(bh, cm->mi_rows - mi_row); const int x_mis = MIN(bw, cm->mi_cols - mi_col); int x, y, z; @@ -555,7 +559,7 @@ void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd, if (frame_is_intra_only(cm)) read_intra_frame_mode_info(cm, xd, mi, mi_row, mi_col, r); else - read_inter_frame_mode_info(cm, xd, mi, mi_row, mi_col, r); + read_inter_frame_mode_info(cm, xd, tile, mi, mi_row, mi_col, r); for (y = 0, z = 0; y < y_mis; y++, z += cm->mode_info_stride) { for (x = !y; x < x_mis; x++) { diff --git a/vp9/decoder/vp9_decodemv.h b/vp9/decoder/vp9_decodemv.h index cec99f253..8e9ae4a54 100644 --- a/vp9/decoder/vp9_decodemv.h +++ b/vp9/decoder/vp9_decodemv.h @@ -14,7 +14,10 @@ #include "vp9/decoder/vp9_onyxd_int.h" #include "vp9/decoder/vp9_dboolhuff.h" +struct TileInfo; + void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd, + const struct TileInfo *const tile, int mi_row, int mi_col, vp9_reader *r); #endif // VP9_DECODER_VP9_DECODEMV_H_ diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c index 3ee8ba41d..4746a3abd 100644 --- a/vp9/decoder/vp9_decodframe.c +++ b/vp9/decoder/vp9_decodframe.c @@ -37,10 +37,44 @@ #include "vp9/decoder/vp9_thread.h" #include "vp9/decoder/vp9_treereader.h" +typedef struct TileWorkerData { + VP9_COMMON *cm; + vp9_reader bit_reader; + DECLARE_ALIGNED(16, MACROBLOCKD, xd); + DECLARE_ALIGNED(16, unsigned char, token_cache[1024]); +} TileWorkerData; + static int read_be32(const uint8_t *p) { return (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]; } +static int is_compound_prediction_allowed(const VP9_COMMON *cm) { + int i; + for (i 
= 1; i < ALLOWED_REFS_PER_FRAME; ++i) + if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1]) + return 1; + + return 0; +} + +static void setup_compound_prediction(VP9_COMMON *cm) { + if (cm->ref_frame_sign_bias[LAST_FRAME] == + cm->ref_frame_sign_bias[GOLDEN_FRAME]) { + cm->comp_fixed_ref = ALTREF_FRAME; + cm->comp_var_ref[0] = LAST_FRAME; + cm->comp_var_ref[1] = GOLDEN_FRAME; + } else if (cm->ref_frame_sign_bias[LAST_FRAME] == + cm->ref_frame_sign_bias[ALTREF_FRAME]) { + cm->comp_fixed_ref = GOLDEN_FRAME; + cm->comp_var_ref[0] = LAST_FRAME; + cm->comp_var_ref[1] = ALTREF_FRAME; + } else { + cm->comp_fixed_ref = LAST_FRAME; + cm->comp_var_ref[0] = GOLDEN_FRAME; + cm->comp_var_ref[1] = ALTREF_FRAME; + } +} + // len == 0 is not allowed static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) { return start + len > start && start + len <= end; @@ -76,7 +110,7 @@ static void read_tx_probs(struct tx_probs *tx_probs, vp9_reader *r) { static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vp9_reader *r) { int i, j; - for (j = 0; j < SWITCHABLE_FILTERS + 1; ++j) + for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) vp9_diff_update_prob(r, &fc->switchable_interp_prob[j][i]); } @@ -98,8 +132,11 @@ static INLINE COMPPREDMODE_TYPE read_comp_pred_mode(vp9_reader *r) { static void read_comp_pred(VP9_COMMON *cm, vp9_reader *r) { int i; - cm->comp_pred_mode = cm->allow_comp_inter_inter ? read_comp_pred_mode(r) - : SINGLE_PREDICTION_ONLY; + const int compound_allowed = is_compound_prediction_allowed(cm); + cm->comp_pred_mode = compound_allowed ? read_comp_pred_mode(r) + : SINGLE_PREDICTION_ONLY; + if (compound_allowed) + setup_compound_prediction(cm); if (cm->comp_pred_mode == HYBRID_PREDICTION) for (i = 0; i < COMP_INTER_CONTEXTS; i++) @@ -169,11 +206,49 @@ static void setup_plane_dequants(VP9_COMMON *cm, MACROBLOCKD *xd, int q_index) { xd->plane[i].dequant = cm->uv_dequant[q_index]; } -static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *arg) { - MACROBLOCKD* const xd = arg; +// Allocate storage for each tile column. +// TODO(jzern): when max_threads <= 1 the same storage could be used for each +// tile. +static void alloc_tile_storage(VP9D_COMP *pbi, int tile_cols) { + VP9_COMMON *const cm = &pbi->common; + const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + int i, tile_col; + + CHECK_MEM_ERROR(cm, pbi->mi_streams, + vpx_realloc(pbi->mi_streams, tile_cols * + sizeof(*pbi->mi_streams))); + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileInfo tile; + + vp9_tile_init(&tile, cm, 0, tile_col); + pbi->mi_streams[tile_col] = + &cm->mi[cm->mi_rows * tile.mi_col_start]; + } + + // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm + // block where mi unit size is 8x8. + CHECK_MEM_ERROR(cm, pbi->above_context[0], + vpx_realloc(pbi->above_context[0], + sizeof(*pbi->above_context[0]) * MAX_MB_PLANE * + 2 * aligned_mi_cols)); + for (i = 1; i < MAX_MB_PLANE; ++i) { + pbi->above_context[i] = pbi->above_context[0] + + i * sizeof(*pbi->above_context[0]) * + 2 * aligned_mi_cols; + } + + // This is sized based on the entire frame. Each tile operates within its + // column bounds. 
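
setup_compound_prediction above encodes a simple rule: compound prediction is allowed only when the three references do not all share the same sign bias, and when it is, the two references that agree in temporal direction become the variable refs while the odd one out becomes the fixed ref. A toy harness, assuming the usual LAST/GOLDEN/ALTREF numbering (this is illustrative, not libvpx code):

#include <stdio.h>

enum { NONE = 0, LAST = 1, GOLDEN = 2, ALTREF = 3 };

/* Returns the frame whose sign bias differs from the other two. */
static int fixed_ref(const int sign_bias[4]) {
  if (sign_bias[LAST] == sign_bias[GOLDEN]) return ALTREF;
  if (sign_bias[LAST] == sign_bias[ALTREF]) return GOLDEN;
  return LAST;
}

int main(void) {
  const int sign_bias[4] = { 0, 0, 0, 1 };          /* only ALTREF inverted */
  printf("fixed ref = %d\n", fixed_ref(sign_bias)); /* prints 3 (ALTREF) */
  return 0;
}
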
+ CHECK_MEM_ERROR(cm, pbi->above_seg_context, + vpx_realloc(pbi->above_seg_context, + sizeof(*pbi->above_seg_context) * + aligned_mi_cols)); +} + +static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size) { struct macroblockd_plane *const pd = &xd->plane[plane]; - int16_t* const qcoeff = BLOCK_OFFSET(pd->qcoeff, block); + int16_t* const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); const int stride = pd->dst.stride; const int eob = pd->eobs[block]; if (eob > 0) { @@ -186,40 +261,53 @@ static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize, case TX_4X4: tx_type = get_tx_type_4x4(pd->plane_type, xd, raster_block); if (tx_type == DCT_DCT) - xd->itxm_add(qcoeff, dst, stride, eob); + xd->itxm_add(dqcoeff, dst, stride, eob); else - vp9_iht4x4_add(tx_type, qcoeff, dst, stride, eob); + vp9_iht4x4_16_add(dqcoeff, dst, stride, tx_type); break; case TX_8X8: tx_type = get_tx_type_8x8(pd->plane_type, xd); - vp9_iht8x8_add(tx_type, qcoeff, dst, stride, eob); + vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob); break; case TX_16X16: tx_type = get_tx_type_16x16(pd->plane_type, xd); - vp9_iht16x16_add(tx_type, qcoeff, dst, stride, eob); + vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob); break; case TX_32X32: tx_type = DCT_DCT; - vp9_idct32x32_add(qcoeff, dst, stride, eob); + vp9_idct32x32_add(dqcoeff, dst, stride, eob); break; default: assert(!"Invalid transform size"); } if (eob == 1) { - *((int32_t *)qcoeff) = 0; + vpx_memset(dqcoeff, 0, 2 * sizeof(dqcoeff[0])); } else { if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10) - vpx_memset(qcoeff, 0, 4 * (4 << tx_size) * sizeof(qcoeff[0])); + vpx_memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0])); + else if (tx_size == TX_32X32 && eob <= 34) + vpx_memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0])); else - vpx_memset(qcoeff, 0, (16 << (tx_size << 1)) * sizeof(qcoeff[0])); + vpx_memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0])); } } } -static void decode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *arg) { - MACROBLOCKD* const xd = arg; +struct intra_args { + VP9_COMMON *cm; + MACROBLOCKD *xd; + vp9_reader *r; + unsigned char* token_cache; +}; + +static void predict_and_reconstruct_intra_block(int plane, int block, + BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + struct intra_args *const args = arg; + VP9_COMMON *const cm = args->cm; + MACROBLOCKD *const xd = args->xd; + struct macroblockd_plane *const pd = &xd->plane[plane]; MODE_INFO *const mi = xd->mi_8x8[0]; const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size, @@ -238,31 +326,37 @@ static void decode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, b_width_log2(plane_bsize), tx_size, mode, dst, pd->dst.stride, dst, pd->dst.stride); - if (!mi->mbmi.skip_coeff) - decode_block(plane, block, plane_bsize, tx_size, arg); + if (!mi->mbmi.skip_coeff) { + vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, tx_size, + args->r, args->token_cache); + inverse_transform_block(xd, plane, block, plane_bsize, tx_size); + } } -static int decode_tokens(VP9_COMMON *const cm, MACROBLOCKD *const xd, - BLOCK_SIZE bsize, vp9_reader *r) { - MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; - - if (mbmi->skip_coeff) { - reset_skip_context(xd, bsize); - return -1; - } else { - if (cm->seg.enabled) - setup_plane_dequants(cm, xd, vp9_get_qindex(&cm->seg, mbmi->segment_id, - cm->base_qindex)); - - // TODO(dkovalev) if (!vp9_reader_has_error(r)) - 
return vp9_decode_tokens(cm, xd, &cm->seg, r, bsize); - } +struct inter_args { + VP9_COMMON *cm; + MACROBLOCKD *xd; + vp9_reader *r; + int *eobtotal; + unsigned char* token_cache; +}; + +static void reconstruct_inter_block(int plane, int block, + BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + struct inter_args *args = arg; + VP9_COMMON *const cm = args->cm; + MACROBLOCKD *const xd = args->xd; + + *args->eobtotal += vp9_decode_block_tokens(cm, xd, plane, block, + plane_bsize, tx_size, + args->r, args->token_cache); + inverse_transform_block(xd, plane, block, plane_bsize, tx_size); } -static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE bsize, - int mi_row, int mi_col) { - VP9_COMMON *const cm = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; +static void set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd, + const TileInfo *const tile, + BLOCK_SIZE bsize, int mi_row, int mi_col) { const int bh = num_8x8_blocks_high_lookup[bsize]; const int bw = num_8x8_blocks_wide_lookup[bsize]; const int offset = mi_row * cm->mode_info_stride + mi_col; @@ -281,143 +375,163 @@ static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE bsize, // cannot be used. xd->last_mi = cm->prev_mi ? xd->prev_mi_8x8[0] : NULL; - set_skip_context(cm, xd, mi_row, mi_col); + set_skip_context(xd, xd->above_context, xd->left_context, mi_row, mi_col); // Distance of Mb to the various image edges. These are specified to 8th pel // as they are always compared to values that are in 1/8th pel units - set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw); + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols); - setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], mi_row, mi_col); + setup_dst_planes(xd, get_frame_new_buffer(cm), mi_row, mi_col); } static void set_ref(VP9_COMMON *const cm, MACROBLOCKD *const xd, int idx, int mi_row, int mi_col) { MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; const int ref = mbmi->ref_frame[idx] - LAST_FRAME; - const YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->active_ref_idx[ref]]; - const struct scale_factors *sf = &cm->active_ref_scale[ref]; - if (!vp9_is_valid_scale(sf)) + const YV12_BUFFER_CONFIG *cfg = get_frame_ref_buffer(cm, ref); + const struct scale_factors_common *sfc = &cm->active_ref_scale_comm[ref]; + if (!vp9_is_valid_scale(sfc)) vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, "Invalid scale factors"); - xd->scale_factor[idx] = *sf; - setup_pre_planes(xd, idx, cfg, mi_row, mi_col, sf); + xd->scale_factor[idx].sfc = sfc; + setup_pre_planes(xd, idx, cfg, mi_row, mi_col, &xd->scale_factor[idx]); xd->corrupted |= cfg->corrupted; } -static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col, - vp9_reader *r, BLOCK_SIZE bsize, int index) { - VP9_COMMON *const cm = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; +static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd, + const TileInfo *const tile, + int mi_row, int mi_col, + vp9_reader *r, BLOCK_SIZE bsize, + unsigned char *token_cache) { const int less8x8 = bsize < BLOCK_8X8; MB_MODE_INFO *mbmi; - int eobtotal; - - if (less8x8) - if (index > 0) - return; - set_offsets(pbi, bsize, mi_row, mi_col); - vp9_read_mode_info(cm, xd, mi_row, mi_col, r); + set_offsets(cm, xd, tile, bsize, mi_row, mi_col); + vp9_read_mode_info(cm, xd, tile, mi_row, mi_col, r); if (less8x8) bsize = BLOCK_8X8; // Has to be called after set_offsets mbmi = &xd->mi_8x8[0]->mbmi; - eobtotal = decode_tokens(cm, xd, bsize, r); - if (!is_inter_block(mbmi)) { - // Intra reconstruction - foreach_transformed_block(xd, bsize, 
decode_block_intra, xd); + if (mbmi->skip_coeff) { + reset_skip_context(xd, bsize); } else { - // Inter reconstruction - const int decode_blocks = (eobtotal > 0); - - if (!less8x8) { - assert(mbmi->sb_type == bsize); - if (eobtotal == 0) - mbmi->skip_coeff = 1; // skip loopfilter - } + if (cm->seg.enabled) + setup_plane_dequants(cm, xd, vp9_get_qindex(&cm->seg, mbmi->segment_id, + cm->base_qindex)); + } + if (!is_inter_block(mbmi)) { + struct intra_args arg = { cm, xd, r, token_cache }; + foreach_transformed_block(xd, bsize, predict_and_reconstruct_intra_block, + &arg); + } else { + // Setup set_ref(cm, xd, 0, mi_row, mi_col); if (has_second_ref(mbmi)) set_ref(cm, xd, 1, mi_row, mi_col); xd->subpix.filter_x = xd->subpix.filter_y = vp9_get_filter_kernel(mbmi->interp_filter); + + // Prediction vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); - if (decode_blocks) - foreach_transformed_block(xd, bsize, decode_block, xd); + // Reconstruction + if (!mbmi->skip_coeff) { + int eobtotal = 0; + struct inter_args arg = { cm, xd, r, &eobtotal, token_cache }; + foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg); + if (!less8x8 && eobtotal == 0) + mbmi->skip_coeff = 1; // skip loopfilter + } } + xd->corrupted |= vp9_reader_has_error(r); } -static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col, - vp9_reader* r, BLOCK_SIZE bsize, int index) { - VP9_COMMON *const cm = &pbi->common; +static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd, int hbs, + int mi_row, int mi_col, BLOCK_SIZE bsize, + vp9_reader *r) { + const int ctx = partition_plane_context(xd->above_seg_context, + xd->left_seg_context, + mi_row, mi_col, bsize); + const vp9_prob *const probs = get_partition_probs(cm, ctx); + const int has_rows = (mi_row + hbs) < cm->mi_rows; + const int has_cols = (mi_col + hbs) < cm->mi_cols; + PARTITION_TYPE p; + + if (has_rows && has_cols) + p = treed_read(r, vp9_partition_tree, probs); + else if (!has_rows && has_cols) + p = vp9_read(r, probs[1]) ? PARTITION_SPLIT : PARTITION_HORZ; + else if (has_rows && !has_cols) + p = vp9_read(r, probs[2]) ? PARTITION_SPLIT : PARTITION_VERT; + else + p = PARTITION_SPLIT; + + if (!cm->frame_parallel_decoding_mode) + ++cm->counts.partition[ctx][p]; + + return p; +} + +static void decode_modes_sb(VP9_COMMON *const cm, MACROBLOCKD *const xd, + const TileInfo *const tile, + int mi_row, int mi_col, + vp9_reader* r, BLOCK_SIZE bsize, + unsigned char *token_cache) { const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2; - PARTITION_TYPE partition = PARTITION_NONE; + PARTITION_TYPE partition; BLOCK_SIZE subsize; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - if (bsize < BLOCK_8X8) { - if (index > 0) - return; - } else { - int pl; - const int idx = check_bsize_coverage(hbs, cm->mi_rows, cm->mi_cols, - mi_row, mi_col); - pl = partition_plane_context(cm, mi_row, mi_col, bsize); - - if (idx == 0) - partition = treed_read(r, vp9_partition_tree, - cm->fc.partition_prob[cm->frame_type][pl]); - else if (idx > 0 && - !vp9_read(r, cm->fc.partition_prob[cm->frame_type][pl][idx])) - partition = (idx == 1) ? 
PARTITION_HORZ : PARTITION_VERT; - else - partition = PARTITION_SPLIT; - - if (!cm->frame_parallel_decoding_mode) - ++cm->counts.partition[pl][partition]; - } - + partition = read_partition(cm, xd, hbs, mi_row, mi_col, bsize, r); subsize = get_subsize(bsize, partition); - - switch (partition) { - case PARTITION_NONE: - decode_modes_b(pbi, mi_row, mi_col, r, subsize, 0); - break; - case PARTITION_HORZ: - decode_modes_b(pbi, mi_row, mi_col, r, subsize, 0); - if (mi_row + hbs < cm->mi_rows) - decode_modes_b(pbi, mi_row + hbs, mi_col, r, subsize, 1); - break; - case PARTITION_VERT: - decode_modes_b(pbi, mi_row, mi_col, r, subsize, 0); - if (mi_col + hbs < cm->mi_cols) - decode_modes_b(pbi, mi_row, mi_col + hbs, r, subsize, 1); - break; - case PARTITION_SPLIT: { - int n; - for (n = 0; n < 4; n++) { - const int j = n >> 1, i = n & 1; - decode_modes_sb(pbi, mi_row + j * hbs, mi_col + i * hbs, - r, subsize, n); - } - } break; - default: - assert(!"Invalid partition type"); + if (subsize < BLOCK_8X8) { + decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache); + } else { + switch (partition) { + case PARTITION_NONE: + decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache); + break; + case PARTITION_HORZ: + decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache); + if (mi_row + hbs < cm->mi_rows) + decode_modes_b(cm, xd, tile, mi_row + hbs, mi_col, r, subsize, + token_cache); + break; + case PARTITION_VERT: + decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache); + if (mi_col + hbs < cm->mi_cols) + decode_modes_b(cm, xd, tile, mi_row, mi_col + hbs, r, subsize, + token_cache); + break; + case PARTITION_SPLIT: + decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, subsize, + token_cache); + decode_modes_sb(cm, xd, tile, mi_row, mi_col + hbs, r, subsize, + token_cache); + decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col, r, subsize, + token_cache); + decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize, + token_cache); + break; + default: + assert(!"Invalid partition type"); + } } // update partition context if (bsize >= BLOCK_8X8 && (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) - update_partition_context(cm, mi_row, mi_col, subsize, bsize); + update_partition_context(xd->above_seg_context, xd->left_seg_context, + mi_row, mi_col, subsize, bsize); } static void setup_token_decoder(const uint8_t *data, @@ -453,16 +567,10 @@ static void read_coef_probs_common(vp9_coeff_probs_model *coef_probs, static void read_coef_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode, vp9_reader *r) { - read_coef_probs_common(fc->coef_probs[TX_4X4], r); - - if (tx_mode > ONLY_4X4) - read_coef_probs_common(fc->coef_probs[TX_8X8], r); - - if (tx_mode > ALLOW_8X8) - read_coef_probs_common(fc->coef_probs[TX_16X16], r); - - if (tx_mode > ALLOW_16X16) - read_coef_probs_common(fc->coef_probs[TX_32X32], r); + const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode]; + TX_SIZE tx_size; + for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) + read_coef_probs_common(fc->coef_probs[tx_size], r); } static void setup_segmentation(struct segmentation *seg, @@ -549,9 +657,8 @@ static int read_delta_q(struct vp9_read_bit_buffer *rb, int *delta_q) { return old != *delta_q; } -static void setup_quantization(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) { - MACROBLOCKD *const xd = &pbi->mb; - VP9_COMMON *const cm = &pbi->common; +static void setup_quantization(VP9_COMMON *const cm, MACROBLOCKD *const xd, + struct vp9_read_bit_buffer *rb) { int update = 0; 
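
The read_coef_probs rewrite above replaces a cascade of "if (tx_mode > ...)" checks with a loop bounded by a per-mode maximum transform size. The lookup table it relies on looks like the following sketch (enum ordering mirrors vp9's; treat the reproduction as illustrative):

typedef enum { TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_SIZES } TX_SIZE;
typedef enum { ONLY_4X4, ALLOW_8X8, ALLOW_16X16, ALLOW_32X32,
               TX_MODE_SELECT, TX_MODES } TX_MODE;

static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = {
  TX_4X4,    /* ONLY_4X4 */
  TX_8X8,    /* ALLOW_8X8 */
  TX_16X16,  /* ALLOW_16X16 */
  TX_32X32,  /* ALLOW_32X32 */
  TX_32X32,  /* TX_MODE_SELECT: every size is available */
};
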
cm->base_qindex = vp9_rb_read_literal(rb, QINDEX_BITS); @@ -569,12 +676,12 @@ static void setup_quantization(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) { xd->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; } -static INTERPOLATIONFILTERTYPE read_interp_filter_type( - struct vp9_read_bit_buffer *rb) { - const INTERPOLATIONFILTERTYPE literal_to_type[] = { EIGHTTAP_SMOOTH, - EIGHTTAP, - EIGHTTAP_SHARP, - BILINEAR }; +static INTERPOLATION_TYPE read_interp_filter_type( + struct vp9_read_bit_buffer *rb) { + const INTERPOLATION_TYPE literal_to_type[] = { EIGHTTAP_SMOOTH, + EIGHTTAP, + EIGHTTAP_SHARP, + BILINEAR }; return vp9_rb_read_bit(rb) ? SWITCHABLE : literal_to_type[vp9_rb_read_literal(rb, 2)]; } @@ -620,7 +727,7 @@ static void apply_frame_size(VP9D_COMP *pbi, int width, int height) { vp9_update_frame_size(cm); } - vp9_realloc_frame_buffer(&cm->yv12_fb[cm->new_fb_idx], cm->width, cm->height, + vp9_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, VP9BORDERINPIXELS); } @@ -641,7 +748,7 @@ static void setup_frame_size_with_refs(VP9D_COMP *pbi, int found = 0, i; for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) { if (vp9_rb_read_bit(rb)) { - YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->active_ref_idx[i]]; + YV12_BUFFER_CONFIG *const cfg = get_frame_ref_buffer(cm, i); width = cfg->y_crop_width; height = cfg->y_crop_height; found = 1; @@ -660,18 +767,28 @@ static void setup_frame_size_with_refs(VP9D_COMP *pbi, setup_display_size(cm, rb); } -static void decode_tile(VP9D_COMP *pbi, vp9_reader *r, int tile_col) { +static void setup_tile_context(VP9D_COMP *const pbi, MACROBLOCKD *const xd, + int tile_col) { + int i; + xd->mi_stream = pbi->mi_streams[tile_col]; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + xd->above_context[i] = pbi->above_context[i]; + } + // see note in alloc_tile_storage(). 
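
read_interp_filter_type above is worth a second look: the 2-bit literal in the bitstream does not index the filter enum directly; a small table reorders it, with EIGHTTAP_SMOOTH at literal 0. A self-contained sketch in which read_bit/read_literal are stand-ins of mine for the vp9_rb_* readers:

typedef enum { EIGHTTAP, EIGHTTAP_SMOOTH, EIGHTTAP_SHARP, BILINEAR,
               SWITCHABLE } INTERPOLATION_TYPE;

static INTERPOLATION_TYPE read_interp(int (*read_bit)(void),
                                      int (*read_literal)(int bits)) {
  static const INTERPOLATION_TYPE literal_to_type[4] = {
    EIGHTTAP_SMOOTH, EIGHTTAP, EIGHTTAP_SHARP, BILINEAR
  };
  /* one bit selects SWITCHABLE; otherwise two bits pick a fixed filter */
  return read_bit() ? SWITCHABLE : literal_to_type[read_literal(2)];
}
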
+ xd->above_seg_context = pbi->above_seg_context; +} + +static void decode_tile(VP9D_COMP *pbi, const TileInfo *const tile, + vp9_reader *r) { const int num_threads = pbi->oxcf.max_threads; VP9_COMMON *const cm = &pbi->common; int mi_row, mi_col; - YV12_BUFFER_CONFIG *const fb = &cm->yv12_fb[cm->new_fb_idx]; MACROBLOCKD *xd = &pbi->mb; - xd->mi_stream = pbi->mi_streams[tile_col]; - if (pbi->do_loopfilter_inline) { LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; - lf_data->frame_buffer = fb; + lf_data->frame_buffer = get_frame_new_buffer(cm); lf_data->cm = cm; lf_data->xd = pbi->mb; lf_data->stop = 0; @@ -679,14 +796,15 @@ static void decode_tile(VP9D_COMP *pbi, vp9_reader *r, int tile_col) { vp9_loop_filter_frame_init(cm, cm->lf.filter_level); } - for (mi_row = cm->cur_tile_mi_row_start; mi_row < cm->cur_tile_mi_row_end; + for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; mi_row += MI_BLOCK_SIZE) { // For a SB there are 2 left contexts, each pertaining to a MB row within - vp9_zero(cm->left_context); - vp9_zero(cm->left_seg_context); - for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end; + vp9_zero(xd->left_context); + vp9_zero(xd->left_seg_context); + for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; mi_col += MI_BLOCK_SIZE) - decode_modes_sb(pbi, mi_row, mi_col, r, BLOCK_64X64, 0); + decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64, + pbi->token_cache); if (pbi->do_loopfilter_inline) { const int lf_start = mi_row - MI_BLOCK_SIZE; @@ -696,7 +814,7 @@ static void decode_tile(VP9D_COMP *pbi, vp9_reader *r, int tile_col) { if (lf_start < 0) continue; // decoding has completed: finish up the loop filter in this thread. - if (mi_row + MI_BLOCK_SIZE >= cm->cur_tile_mi_row_end) continue; + if (mi_row + MI_BLOCK_SIZE >= tile->mi_row_end) continue; vp9_worker_sync(&pbi->lf_worker); lf_data->start = lf_start; @@ -735,10 +853,32 @@ static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { cm->log2_tile_rows += vp9_rb_read_bit(rb); } +// Reads the next tile returning its size and adjusting '*data' accordingly +// based on 'is_last'. +static size_t get_tile(const uint8_t *const data_end, + int is_last, + struct vpx_internal_error_info *error_info, + const uint8_t **data) { + size_t size; + + if (!is_last) { + if (!read_is_valid(*data, 4, data_end)) + vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt tile length"); + + size = read_be32(*data); + *data += 4; + } else { + size = data_end - *data; + } + return size; +} + static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) { vp9_reader residual_bc; VP9_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; const uint8_t *const data_end = pbi->source + pbi->source_sz; const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); @@ -748,70 +888,57 @@ static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) { // Note: this memset assumes above_context[0], [1] and [2] // are allocated as part of the same buffer. 
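
The get_tile helper introduced above centralizes tile framing that used to be open-coded in two places: every tile except the last carries a 4-byte big-endian length prefix, and the last tile simply runs to the end of the packet. Equivalent standalone logic, with the error handling omitted:

#include <stddef.h>
#include <stdint.h>

static uint32_t be32(const uint8_t *p) {
  return ((uint32_t)p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
}

/* Returns the next tile's size and advances *data past the size field. */
static size_t next_tile_size(const uint8_t *data_end, int is_last,
                             const uint8_t **data) {
  size_t size;
  if (is_last) {
    size = (size_t)(data_end - *data);  /* implicit: rest of the packet */
  } else {
    size = be32(*data);                 /* explicit big-endian length */
    *data += 4;
  }
  return size;
}
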
- vpx_memset(cm->above_context[0], 0, - sizeof(ENTROPY_CONTEXT) * MAX_MB_PLANE * (2 * aligned_mi_cols)); + vpx_memset(pbi->above_context[0], 0, + sizeof(*pbi->above_context[0]) * MAX_MB_PLANE * + 2 * aligned_mi_cols); - vpx_memset(cm->above_seg_context, 0, - sizeof(PARTITION_CONTEXT) * aligned_mi_cols); + vpx_memset(pbi->above_seg_context, 0, + sizeof(*pbi->above_seg_context) * aligned_mi_cols); if (pbi->oxcf.inv_tile_order) { const uint8_t *data_ptr2[4][1 << 6]; vp9_reader bc_bak = {0}; - // pre-initialize the offsets, we're going to read in inverse order + // pre-initialize the offsets, we're going to decode in inverse order data_ptr2[0][0] = data; for (tile_row = 0; tile_row < tile_rows; tile_row++) { - if (tile_row) { - const int size = read_be32(data_ptr2[tile_row - 1][tile_cols - 1]); - data_ptr2[tile_row - 1][tile_cols - 1] += 4; - data_ptr2[tile_row][0] = data_ptr2[tile_row - 1][tile_cols - 1] + size; - } - - for (tile_col = 1; tile_col < tile_cols; tile_col++) { - const int size = read_be32(data_ptr2[tile_row][tile_col - 1]); - data_ptr2[tile_row][tile_col - 1] += 4; - data_ptr2[tile_row][tile_col] = - data_ptr2[tile_row][tile_col - 1] + size; + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + const int last_tile = + tile_row == tile_rows - 1 && tile_col == tile_cols - 1; + const size_t size = get_tile(data_end, last_tile, &cm->error, &data); + data_ptr2[tile_row][tile_col] = data; + data += size; } } for (tile_row = 0; tile_row < tile_rows; tile_row++) { - vp9_get_tile_row_offsets(cm, tile_row); for (tile_col = tile_cols - 1; tile_col >= 0; tile_col--) { - vp9_get_tile_col_offsets(cm, tile_col); + TileInfo tile; + + vp9_tile_init(&tile, cm, tile_row, tile_col); setup_token_decoder(data_ptr2[tile_row][tile_col], data_end, data_end - data_ptr2[tile_row][tile_col], &cm->error, &residual_bc); - decode_tile(pbi, &residual_bc, tile_col); + setup_tile_context(pbi, xd, tile_col); + decode_tile(pbi, &tile, &residual_bc); if (tile_row == tile_rows - 1 && tile_col == tile_cols - 1) bc_bak = residual_bc; } } residual_bc = bc_bak; } else { - int has_more; - for (tile_row = 0; tile_row < tile_rows; tile_row++) { - vp9_get_tile_row_offsets(cm, tile_row); for (tile_col = 0; tile_col < tile_cols; tile_col++) { - size_t size; + const int last_tile = + tile_row == tile_rows - 1 && tile_col == tile_cols - 1; + const size_t size = get_tile(data_end, last_tile, &cm->error, &data); + TileInfo tile; - vp9_get_tile_col_offsets(cm, tile_col); - - has_more = tile_col < tile_cols - 1 || tile_row < tile_rows - 1; - if (has_more) { - if (!read_is_valid(data, 4, data_end)) - vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, - "Truncated packet or corrupt tile length"); - - size = read_be32(data); - data += 4; - } else { - size = data_end - data; - } + vp9_tile_init(&tile, cm, tile_row, tile_col); setup_token_decoder(data, data_end, size, &cm->error, &residual_bc); - decode_tile(pbi, &residual_bc, tile_col); + setup_tile_context(pbi, xd, tile_col); + decode_tile(pbi, &tile, &residual_bc); data += size; } } @@ -820,10 +947,113 @@ static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) { return vp9_reader_find_end(&residual_bc); } +static int tile_worker_hook(void *arg1, void *arg2) { + TileWorkerData *tile_data = (TileWorkerData*)arg1; + const TileInfo *const tile = (TileInfo*)arg2; + int mi_row, mi_col; + + for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; + mi_row += MI_BLOCK_SIZE) { + vp9_zero(tile_data->xd.left_context); + vp9_zero(tile_data->xd.left_seg_context); + 
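
tile_worker_hook, whose body this hunk sits in, follows the vpx worker contract: a hook receives two opaque pointers (here the per-tile scratch and the TileInfo) and returns nonzero on success, which is how per-tile corruption is folded into a single frame status at sync time. A minimal sketch; FakeTileData is a placeholder type of mine:

typedef int (*VP9WorkerHook)(void *data1, void *data2);

typedef struct { int corrupted; } FakeTileData;

static int example_hook(void *arg1, void *arg2) {
  FakeTileData *const td = (FakeTileData *)arg1;
  (void)arg2;             /* the patch passes the TileInfo here */
  /* ... decode one tile, setting td->corrupted on bitstream errors ... */
  return !td->corrupted;  /* zero marks the whole frame corrupted */
}

In decode_tiles_mt below, the main thread runs the last job of each batch itself via vp9_worker_execute, launches the rest with vp9_worker_launch, then syncs and ORs each failure into pbi->mb.corrupted.
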
for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; + mi_col += MI_BLOCK_SIZE) { + decode_modes_sb(tile_data->cm, &tile_data->xd, tile, + mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64, + tile_data->token_cache); + } + } + return !tile_data->xd.corrupted; +} + +static const uint8_t *decode_tiles_mt(VP9D_COMP *pbi, const uint8_t *data) { + VP9_COMMON *const cm = &pbi->common; + const uint8_t *const data_end = pbi->source + pbi->source_sz; + const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + const int num_workers = MIN(pbi->oxcf.max_threads & ~1, tile_cols); + int tile_col = 0; + + assert(tile_rows == 1); + (void)tile_rows; + + if (num_workers > pbi->num_tile_workers) { + int i; + CHECK_MEM_ERROR(cm, pbi->tile_workers, + vpx_realloc(pbi->tile_workers, + num_workers * sizeof(*pbi->tile_workers))); + for (i = pbi->num_tile_workers; i < num_workers; ++i) { + VP9Worker *const worker = &pbi->tile_workers[i]; + ++pbi->num_tile_workers; + + vp9_worker_init(worker); + worker->hook = (VP9WorkerHook)tile_worker_hook; + CHECK_MEM_ERROR(cm, worker->data1, + vpx_memalign(32, sizeof(TileWorkerData))); + CHECK_MEM_ERROR(cm, worker->data2, vpx_malloc(sizeof(TileInfo))); + if (i < num_workers - 1 && !vp9_worker_reset(worker)) { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Tile decoder thread creation failed"); + } + } + } + + // Note: this memset assumes above_context[0], [1] and [2] + // are allocated as part of the same buffer. + vpx_memset(pbi->above_context[0], 0, + sizeof(*pbi->above_context[0]) * MAX_MB_PLANE * + 2 * aligned_mi_cols); + vpx_memset(pbi->above_seg_context, 0, + sizeof(*pbi->above_seg_context) * aligned_mi_cols); + + while (tile_col < tile_cols) { + int i; + for (i = 0; i < num_workers && tile_col < tile_cols; ++i) { + VP9Worker *const worker = &pbi->tile_workers[i]; + TileWorkerData *const tile_data = (TileWorkerData*)worker->data1; + TileInfo *const tile = (TileInfo*)worker->data2; + const size_t size = + get_tile(data_end, tile_col == tile_cols - 1, &cm->error, &data); + + tile_data->cm = cm; + tile_data->xd = pbi->mb; + tile_data->xd.corrupted = 0; + vp9_tile_init(tile, tile_data->cm, 0, tile_col); + + setup_token_decoder(data, data_end, size, &cm->error, + &tile_data->bit_reader); + setup_tile_context(pbi, &tile_data->xd, tile_col); + + worker->had_error = 0; + if (i == num_workers - 1 || tile_col == tile_cols - 1) { + vp9_worker_execute(worker); + } else { + vp9_worker_launch(worker); + } + + data += size; + ++tile_col; + } + + for (; i > 0; --i) { + VP9Worker *const worker = &pbi->tile_workers[i - 1]; + pbi->mb.corrupted |= !vp9_worker_sync(worker); + } + } + + { + const int final_worker = (tile_cols + num_workers - 1) % num_workers; + TileWorkerData *const tile_data = + (TileWorkerData*)pbi->tile_workers[final_worker].data1; + return vp9_reader_find_end(&tile_data->bit_reader); + } +} + static void check_sync_code(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { - if (vp9_rb_read_literal(rb, 8) != SYNC_CODE_0 || - vp9_rb_read_literal(rb, 8) != SYNC_CODE_1 || - vp9_rb_read_literal(rb, 8) != SYNC_CODE_2) { + if (vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_0 || + vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_1 || + vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_2) { vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, "Invalid frame sync code"); } @@ -834,34 +1064,6 @@ static void error_handler(void *data, size_t bit_offset) { 
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet"); } -static void setup_inter_inter(VP9_COMMON *cm) { - int i; - - cm->allow_comp_inter_inter = 0; - for (i = 1; i < ALLOWED_REFS_PER_FRAME; ++i) - cm->allow_comp_inter_inter |= - cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1]; - - if (cm->allow_comp_inter_inter) { - // which one is always-on in comp inter-inter? - if (cm->ref_frame_sign_bias[LAST_FRAME] == - cm->ref_frame_sign_bias[GOLDEN_FRAME]) { - cm->comp_fixed_ref = ALTREF_FRAME; - cm->comp_var_ref[0] = LAST_FRAME; - cm->comp_var_ref[1] = GOLDEN_FRAME; - } else if (cm->ref_frame_sign_bias[LAST_FRAME] == - cm->ref_frame_sign_bias[ALTREF_FRAME]) { - cm->comp_fixed_ref = GOLDEN_FRAME; - cm->comp_var_ref[0] = LAST_FRAME; - cm->comp_var_ref[1] = ALTREF_FRAME; - } else { - cm->comp_fixed_ref = LAST_FRAME; - cm->comp_var_ref[0] = GOLDEN_FRAME; - cm->comp_var_ref[1] = ALTREF_FRAME; - } - } -} - #define RESERVED \ if (vp9_rb_read_bit(rb)) \ vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, \ @@ -875,7 +1077,7 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi, cm->last_frame_type = cm->frame_type; - if (vp9_rb_read_literal(rb, 2) != 0x2) + if (vp9_rb_read_literal(rb, 2) != VP9_FRAME_MARKER) vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, "Invalid frame marker"); @@ -896,12 +1098,10 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi, cm->error_resilient_mode = vp9_rb_read_bit(rb); if (cm->frame_type == KEY_FRAME) { - int csp; - check_sync_code(cm, rb); - csp = vp9_rb_read_literal(rb, 3); // colorspace - if (csp != 7) { // != sRGB + cm->color_space = vp9_rb_read_literal(rb, 3); // colorspace + if (cm->color_space != SRGB) { vp9_rb_read_bit(rb); // [16,235] (including xvycc) vs [0,255] range if (cm->version == 1) { cm->subsampling_x = vp9_rb_read_bit(rb); @@ -953,8 +1153,6 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi, for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) vp9_setup_scale_factors(cm, i); - - setup_inter_inter(cm); } } @@ -974,13 +1172,17 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi, vp9_setup_past_independence(cm); setup_loopfilter(&cm->lf, rb); - setup_quantization(pbi, rb); + setup_quantization(cm, &pbi->mb, rb); setup_segmentation(&cm->seg, rb); setup_tile_info(cm, rb); sz = vp9_rb_read_literal(rb, 16); - return sz > 0 ? 
sz : -1; + if (sz == 0) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Invalid header size"); + + return sz; } static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data, @@ -1023,7 +1225,7 @@ static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data, for (j = 0; j < PARTITION_CONTEXTS; ++j) for (i = 0; i < PARTITION_TYPES - 1; ++i) - vp9_diff_update_prob(&r, &fc->partition_prob[INTER_FRAME][j][i]); + vp9_diff_update_prob(&r, &fc->partition_prob[j][i]); read_mv_probs(&r, nmvc, cm->allow_high_precision_mv); } @@ -1087,69 +1289,65 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { MACROBLOCKD *const xd = &pbi->mb; const uint8_t *data = pbi->source; - const uint8_t *data_end = pbi->source + pbi->source_sz; + const uint8_t *const data_end = pbi->source + pbi->source_sz; - struct vp9_read_bit_buffer rb = { data, data_end, 0, - cm, error_handler }; + struct vp9_read_bit_buffer rb = { data, data_end, 0, cm, error_handler }; const size_t first_partition_size = read_uncompressed_header(pbi, &rb); const int keyframe = cm->frame_type == KEY_FRAME; - YV12_BUFFER_CONFIG *new_fb = &cm->yv12_fb[cm->new_fb_idx]; + const int tile_rows = 1 << cm->log2_tile_rows; const int tile_cols = 1 << cm->log2_tile_cols; - int tile_col; + YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm); if (!first_partition_size) { - if (!keyframe) { // showing a frame directly *p_data_end = data + 1; return 0; - } else { - vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, - "Invalid key frame"); - return -1; - } } - data += vp9_rb_bytes_read(&rb); - xd->corrupted = 0; - new_fb->corrupted = 0; - pbi->do_loopfilter_inline = - (cm->log2_tile_rows | cm->log2_tile_cols) == 0 && cm->lf.filter_level; if (!pbi->decoded_key_frame && !keyframe) return -1; + data += vp9_rb_bytes_read(&rb); if (!read_is_valid(data, first_partition_size, data_end)) vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet or corrupt header length"); - setup_plane_dequants(cm, &pbi->mb, cm->base_qindex); + pbi->do_loopfilter_inline = + (cm->log2_tile_rows | cm->log2_tile_cols) == 0 && cm->lf.filter_level; + if (pbi->do_loopfilter_inline && pbi->lf_worker.data1 == NULL) { + CHECK_MEM_ERROR(cm, pbi->lf_worker.data1, vpx_malloc(sizeof(LFWorkerData))); + pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker; + if (pbi->oxcf.max_threads > 1 && !vp9_worker_reset(&pbi->lf_worker)) { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Loop filter thread creation failed"); + } + } + + alloc_tile_storage(pbi, tile_cols); xd->mi_8x8 = cm->mi_grid_visible; xd->mode_info_stride = cm->mode_info_stride; + set_prev_mi(cm); - CHECK_MEM_ERROR(cm, pbi->mi_streams, - vpx_realloc(pbi->mi_streams, tile_cols * - sizeof(*pbi->mi_streams))); - for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - vp9_get_tile_col_offsets(cm, tile_col); - pbi->mi_streams[tile_col] = - &cm->mi[cm->mi_rows * cm->cur_tile_mi_col_start]; - } + setup_plane_dequants(cm, xd, cm->base_qindex); + setup_block_dptrs(xd, cm->subsampling_x, cm->subsampling_y); cm->fc = cm->frame_contexts[cm->frame_context_idx]; - vp9_zero(cm->counts); - - new_fb->corrupted |= read_compressed_header(pbi, data, first_partition_size); - - setup_block_dptrs(xd, cm->subsampling_x, cm->subsampling_y); - - // clear out the coeff buffer for (i = 0; i < MAX_MB_PLANE; ++i) - vp9_zero(xd->plane[i].qcoeff); + vp9_zero(xd->plane[i].dqcoeff); - set_prev_mi(cm); + xd->corrupted = 0; + new_fb->corrupted = read_compressed_header(pbi, data, 
first_partition_size); - *p_data_end = decode_tiles(pbi, data + first_partition_size); + // TODO(jzern): remove frame_parallel_decoding_mode restriction for + // single-frame tile decoding. + if (pbi->oxcf.max_threads > 1 && tile_rows == 1 && tile_cols > 1 && + cm->frame_parallel_decoding_mode) { + *p_data_end = decode_tiles_mt(pbi, data + first_partition_size); + } else { + *p_data_end = decode_tiles(pbi, data + first_partition_size); + } cm->last_width = cm->width; cm->last_height = cm->height; diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c index 0d0f0dfe0..010b8fe33 100644 --- a/vp9/decoder/vp9_detokenize.c +++ b/vp9/decoder/vp9_detokenize.c @@ -70,28 +70,28 @@ static const vp9_prob cat6_prob[15] = { DCT_EOB_MODEL_TOKEN : TWO_TOKEN) : \ token]; \ } \ - token_cache[scan[c]] = vp9_pt_energy_class[token]; \ } while (0) #define WRITE_COEF_CONTINUE(val, token) \ { \ - qcoeff_ptr[scan[c]] = vp9_read_and_apply_sign(r, val) * \ + dqcoeff_ptr[scan[c]] = vp9_read_and_apply_sign(r, val) * \ dq[c > 0] / (1 + (tx_size == TX_32X32)); \ INCREMENT_COUNT(token); \ + token_cache[scan[c]] = vp9_pt_energy_class[token]; \ c++; \ continue; \ } -#define ADJUST_COEF(prob, bits_count) \ - do { \ - if (vp9_read(r, prob)) \ - val += 1 << bits_count; \ +#define ADJUST_COEF(prob, bits_count) \ + do { \ + val += (vp9_read(r, prob) << bits_count); \ } while (0); static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, vp9_reader *r, int block_idx, - PLANE_TYPE type, int seg_eob, int16_t *qcoeff_ptr, - TX_SIZE tx_size, const int16_t *dq, int pt) { + PLANE_TYPE type, int seg_eob, int16_t *dqcoeff_ptr, + TX_SIZE tx_size, const int16_t *dq, int pt, + uint8_t *token_cache) { const FRAME_CONTEXT *const fc = &cm->fc; FRAME_COUNTS *const counts = &cm->counts; const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi); @@ -104,7 +104,6 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, vp9_coeff_count_model *coef_counts = counts->coef[tx_size]; const int16_t *scan, *nb; const uint8_t *const band_translate = get_band_translate(tx_size); - uint8_t token_cache[1024]; get_scan(xd, tx_size, type, block_idx, &scan, &nb); while (1) { @@ -131,6 +130,7 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, if (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) { INCREMENT_COUNT(ZERO_TOKEN); + token_cache[scan[c]] = vp9_pt_energy_class[ZERO_TOKEN]; ++c; goto SKIP_START; } @@ -210,45 +210,26 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, return c; } -struct decode_block_args { - VP9_COMMON *cm; - MACROBLOCKD *xd; - struct segmentation *seg; - vp9_reader *r; - int *eobtotal; -}; - -static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *argv) { - const struct decode_block_args* const arg = argv; - - // find the maximum eob for this transform size, adjusted by segment - MACROBLOCKD *xd = arg->xd; - const struct segmentation *seg = arg->seg; - struct macroblockd_plane* pd = &xd->plane[plane]; - const int segment_id = xd->mi_8x8[0]->mbmi.segment_id; - const int seg_eob = get_tx_eob(seg, segment_id, tx_size); +int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd, + int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, vp9_reader *r, + uint8_t *token_cache) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int seg_eob = get_tx_eob(&cm->seg, xd->mi_8x8[0]->mbmi.segment_id, + tx_size); int aoff, loff, eob, pt; - txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff); pt = 
get_entropy_context(tx_size, pd->above_context + aoff, pd->left_context + loff); - eob = decode_coefs(arg->cm, xd, arg->r, block, - pd->plane_type, seg_eob, BLOCK_OFFSET(pd->qcoeff, block), - tx_size, pd->dequant, pt); + eob = decode_coefs(cm, xd, r, block, + pd->plane_type, seg_eob, BLOCK_OFFSET(pd->dqcoeff, block), + tx_size, pd->dequant, pt, token_cache); set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, aoff, loff); pd->eobs[block] = eob; - *arg->eobtotal += eob; + return eob; } -int vp9_decode_tokens(VP9_COMMON *cm, MACROBLOCKD *xd, - struct segmentation *seg, - vp9_reader *r, BLOCK_SIZE bsize) { - int eobtotal = 0; - struct decode_block_args args = {cm, xd, seg, r, &eobtotal}; - foreach_transformed_block(xd, bsize, decode_block, &args); - return eobtotal; -} + diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h index 0fb4c3cc9..04939ead3 100644 --- a/vp9/decoder/vp9_detokenize.h +++ b/vp9/decoder/vp9_detokenize.h @@ -15,8 +15,9 @@ #include "vp9/decoder/vp9_onyxd_int.h" #include "vp9/decoder/vp9_dboolhuff.h" -int vp9_decode_tokens(VP9_COMMON *cm, MACROBLOCKD *xd, - struct segmentation *seg, - vp9_reader *r, BLOCK_SIZE bsize); +int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd, + int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, vp9_reader *r, + uint8_t *token_cache); #endif // VP9_DECODER_VP9_DETOKENIZE_H_ diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c index 243dbef21..5f970a3d5 100644 --- a/vp9/decoder/vp9_onyxd_if.c +++ b/vp9/decoder/vp9_onyxd_if.c @@ -142,18 +142,12 @@ VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) { pbi->decoded_key_frame = 0; vp9_worker_init(&pbi->lf_worker); - pbi->lf_worker.data1 = vpx_malloc(sizeof(LFWorkerData)); - pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker; - if (pbi->lf_worker.data1 == NULL || - (pbi->oxcf.max_threads > 1 && !vp9_worker_reset(&pbi->lf_worker))) { - vp9_remove_decompressor(pbi); - return NULL; - } return pbi; } void vp9_remove_decompressor(VP9D_PTR ptr) { + int i; VP9D_COMP *const pbi = (VP9D_COMP *)ptr; if (!pbi) @@ -162,7 +156,16 @@ void vp9_remove_decompressor(VP9D_PTR ptr) { vp9_remove_common(&pbi->common); vp9_worker_end(&pbi->lf_worker); vpx_free(pbi->lf_worker.data1); + for (i = 0; i < pbi->num_tile_workers; ++i) { + VP9Worker *const worker = &pbi->tile_workers[i]; + vp9_worker_end(worker); + vpx_free(worker->data1); + vpx_free(worker->data2); + } + vpx_free(pbi->tile_workers); vpx_free(pbi->mi_streams); + vpx_free(pbi->above_context[0]); + vpx_free(pbi->above_seg_context); vpx_free(pbi); } @@ -176,7 +179,6 @@ vpx_codec_err_t vp9_copy_reference_dec(VP9D_PTR ptr, YV12_BUFFER_CONFIG *sd) { VP9D_COMP *pbi = (VP9D_COMP *) ptr; VP9_COMMON *cm = &pbi->common; - int ref_fb_idx; /* TODO(jkoleszar): The decoder doesn't have any real knowledge of what the * encoder is using the frame buffers for. This is just a stub to keep the @@ -184,18 +186,15 @@ vpx_codec_err_t vp9_copy_reference_dec(VP9D_PTR ptr, * later commit that adds VP9-specific controls for this functionality. 
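
The matching teardown for the tile workers (added to vp9_remove_decompressor a few hunks above) has one ordering requirement: each thread must be joined via vp9_worker_end before the two heap blocks it owns are freed. Condensed into a sketch with stand-in types of mine (the real declarations live in vp9/decoder/vp9_thread.h):

#include <stdlib.h>

typedef struct { void *data1, *data2; } VP9Worker;   /* stand-in */
static void vp9_worker_end(VP9Worker *w) { (void)w;  /* joins the thread */ }

static void free_tile_workers(VP9Worker *workers, int num_workers) {
  int i;
  for (i = 0; i < num_workers; ++i) {
    vp9_worker_end(&workers[i]);  /* join before freeing its data */
    free(workers[i].data1);       /* TileWorkerData */
    free(workers[i].data2);       /* TileInfo */
  }
  free(workers);
}
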
*/ if (ref_frame_flag == VP9_LAST_FLAG) { - ref_fb_idx = cm->ref_frame_map[0]; + YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->ref_frame_map[0]]; + if (!equal_dimensions(cfg, sd)) + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Incorrect buffer dimensions"); + else + vp8_yv12_copy_frame(cfg, sd); } else { vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Invalid reference frame"); - return cm->error.error_code; - } - - if (!equal_dimensions(&cm->yv12_fb[ref_fb_idx], sd)) { - vpx_internal_error(&cm->error, VPX_CODEC_ERROR, - "Incorrect buffer dimensions"); - } else { - vp8_yv12_copy_frame(&cm->yv12_fb[ref_fb_idx], sd); } return cm->error.error_code; @@ -267,7 +266,7 @@ static void swap_frame_buffers(VP9D_COMP *pbi) { ++ref_index; } - cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx]; + cm->frame_to_show = get_frame_new_buffer(cm); cm->fb_idx_ref_cnt[cm->new_fb_idx]--; // Invalidate these references until the next frame starts. @@ -305,7 +304,7 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, * thing to do here. */ if (cm->active_ref_idx[0] != INT_MAX) - cm->yv12_fb[cm->active_ref_idx[0]].corrupted = 1; + get_frame_ref_buffer(cm, 0)->corrupted = 1; } cm->new_fb_idx = get_free_fb(cm); @@ -322,7 +321,7 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, * thing to do here. */ if (cm->active_ref_idx[0] != INT_MAX) - cm->yv12_fb[cm->active_ref_idx[0]].corrupted = 1; + get_frame_ref_buffer(cm, 0)->corrupted = 1; if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0) cm->fb_idx_ref_cnt[cm->new_fb_idx]--; diff --git a/vp9/decoder/vp9_onyxd_int.h b/vp9/decoder/vp9_onyxd_int.h index 68b30347e..7c4c9db36 100644 --- a/vp9/decoder/vp9_onyxd_int.h +++ b/vp9/decoder/vp9_onyxd_int.h @@ -40,9 +40,17 @@ typedef struct VP9Decompressor { int do_loopfilter_inline; // apply loopfilter to available rows immediately VP9Worker lf_worker; + VP9Worker *tile_workers; + int num_tile_workers; + /* Each tile column has its own MODE_INFO stream. This array indexes them by tile column index. 
*/ MODE_INFO **mi_streams; + + ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; + PARTITION_CONTEXT *above_seg_context; + + DECLARE_ALIGNED(16, unsigned char, token_cache[1024]); } VP9D_COMP; #endif // VP9_DECODER_VP9_ONYXD_INT_H_ diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index 8378a78e1..87bd36c2b 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -53,8 +53,7 @@ extern unsigned int active_section; int64_t tx_count_32x32p_stats[TX_SIZE_CONTEXTS][TX_SIZES]; int64_t tx_count_16x16p_stats[TX_SIZE_CONTEXTS][TX_SIZES - 1]; int64_t tx_count_8x8p_stats[TX_SIZE_CONTEXTS][TX_SIZES - 2]; -int64_t switchable_interp_stats[SWITCHABLE_FILTERS+1] - [SWITCHABLE_FILTERS]; +int64_t switchable_interp_stats[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS]; void init_tx_count_stats() { vp9_zero(tx_count_32x32p_stats); @@ -87,10 +86,9 @@ static void update_tx_count_stats(VP9_COMMON *cm) { static void update_switchable_interp_stats(VP9_COMMON *cm) { int i, j; - for (i = 0; i < SWITCHABLE_FILTERS+1; ++i) - for (j = 0; j < SWITCHABLE_FILTERS; ++j) { + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) + for (j = 0; j < SWITCHABLE_FILTERS; ++j) switchable_interp_stats[i][j] += cm->fc.switchable_interp_count[i][j]; - } } void write_tx_count_stats() { @@ -140,9 +138,9 @@ void write_switchable_interp_stats() { fclose(fp); printf( - "vp9_default_switchable_filter_count[SWITCHABLE_FILTERS+1]" + "vp9_default_switchable_filter_count[SWITCHABLE_FILTER_CONTEXTS]" "[SWITCHABLE_FILTERS] = {\n"); - for (i = 0; i < SWITCHABLE_FILTERS+1; i++) { + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { printf(" { "); for (j = 0; j < SWITCHABLE_FILTERS; j++) { printf("%"PRId64", ", switchable_interp_stats[i][j]); @@ -165,18 +163,13 @@ void vp9_encode_unsigned_max(struct vp9_write_bit_buffer *wb, vp9_wb_write_literal(wb, data, get_unsigned_bits(max)); } -static void update_mode( - vp9_writer *w, - int n, - vp9_tree tree, - vp9_prob Pnew[/* n-1 */], - vp9_prob Pcur[/* n-1 */], - unsigned int bct[/* n-1 */] [2], - const unsigned int num_events[/* n */] -) { +static void update_mode(vp9_writer *w, int n, vp9_tree tree, + vp9_prob Pcur[/* n-1 */], + unsigned int bct[/* n-1 */][2], + const unsigned int num_events[/* n */]) { int i = 0; - vp9_tree_probs_from_distribution(tree, Pnew, bct, num_events, 0); + vp9_tree_probs_from_distribution(tree, bct, num_events, 0); n--; for (i = 0; i < n; ++i) @@ -187,11 +180,10 @@ static void update_mbintra_mode_probs(VP9_COMP* const cpi, vp9_writer* const bc) { VP9_COMMON *const cm = &cpi->common; int j; - vp9_prob pnew[INTRA_MODES - 1]; unsigned int bct[INTRA_MODES - 1][2]; for (j = 0; j < BLOCK_SIZE_GROUPS; j++) - update_mode(bc, INTRA_MODES, vp9_intra_mode_tree, pnew, + update_mode(bc, INTRA_MODES, vp9_intra_mode_tree, cm->fc.y_mode_prob[j], bct, (unsigned int *)cpi->y_mode_count[j]); } @@ -233,44 +225,35 @@ static void write_intra_mode(vp9_writer *bc, int m, const vp9_prob *p) { write_token(bc, vp9_intra_mode_tree, p, vp9_intra_mode_encodings + m); } -static void update_switchable_interp_probs(VP9_COMP *const cpi, - vp9_writer* const bc) { +static void update_switchable_interp_probs(VP9_COMP *cpi, vp9_writer *w) { VP9_COMMON *const cm = &cpi->common; - unsigned int branch_ct[SWITCHABLE_FILTERS + 1] - [SWITCHABLE_FILTERS - 1][2]; - vp9_prob new_prob[SWITCHABLE_FILTERS + 1][SWITCHABLE_FILTERS - 1]; + unsigned int branch_ct[SWITCHABLE_FILTERS - 1][2]; int i, j; - for (j = 0; j <= SWITCHABLE_FILTERS; ++j) { - vp9_tree_probs_from_distribution( - 
vp9_switchable_interp_tree, - new_prob[j], branch_ct[j], - cm->counts.switchable_interp[j], 0); - } - for (j = 0; j <= SWITCHABLE_FILTERS; ++j) { - for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) { - vp9_cond_prob_diff_update(bc, &cm->fc.switchable_interp_prob[j][i], - branch_ct[j][i]); - } + for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) { + vp9_tree_probs_from_distribution(vp9_switchable_interp_tree, branch_ct, + cm->counts.switchable_interp[j], 0); + + for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) + vp9_cond_prob_diff_update(w, &cm->fc.switchable_interp_prob[j][i], + branch_ct[i]); } + #ifdef MODE_STATS if (!cpi->dummy_packing) update_switchable_interp_stats(cm); #endif } -static void update_inter_mode_probs(VP9_COMMON *cm, vp9_writer* const bc) { +static void update_inter_mode_probs(VP9_COMMON *cm, vp9_writer *w) { int i, j; for (i = 0; i < INTER_MODE_CONTEXTS; ++i) { unsigned int branch_ct[INTER_MODES - 1][2]; - vp9_prob new_prob[INTER_MODES - 1]; - - vp9_tree_probs_from_distribution(vp9_inter_mode_tree, - new_prob, branch_ct, + vp9_tree_probs_from_distribution(vp9_inter_mode_tree, branch_ct, cm->counts.inter_mode[i], NEARESTMV); for (j = 0; j < INTER_MODES - 1; ++j) - vp9_cond_prob_diff_update(bc, &cm->fc.inter_mode_probs[i][j], + vp9_cond_prob_diff_update(w, &cm->fc.inter_mode_probs[i][j], branch_ct[j]); } } @@ -561,7 +544,8 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8, write_intra_mode(bc, m->mbmi.uv_mode, vp9_kf_uv_mode_prob[ym]); } -static void write_modes_b(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc, +static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile, + MODE_INFO **mi_8x8, vp9_writer *bc, TOKENEXTRA **tok, TOKENEXTRA *tok_end, int mi_row, int mi_col, int index) { VP9_COMMON *const cm = &cpi->common; @@ -574,9 +558,10 @@ static void write_modes_b(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc, xd->mi_8x8 = mi_8x8; - set_mi_row_col(&cpi->common, xd, + set_mi_row_col(xd, tile, mi_row, num_8x8_blocks_high_lookup[m->mbmi.sb_type], - mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type]); + mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type], + cm->mi_rows, cm->mi_cols); if (frame_is_intra_only(cm)) { write_mb_modes_kf(cpi, mi_8x8, bc); #ifdef ENTROPY_STATS @@ -593,7 +578,31 @@ static void write_modes_b(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc, pack_mb_tokens(bc, tok, tok_end); } -static void write_modes_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc, +static void write_partition(VP9_COMP *cpi, int hbs, int mi_row, int mi_col, + PARTITION_TYPE p, BLOCK_SIZE bsize, vp9_writer *w) { + VP9_COMMON *const cm = &cpi->common; + const int ctx = partition_plane_context(cpi->above_seg_context, + cpi->left_seg_context, + mi_row, mi_col, bsize); + const vp9_prob *const probs = get_partition_probs(cm, ctx); + const int has_rows = (mi_row + hbs) < cm->mi_rows; + const int has_cols = (mi_col + hbs) < cm->mi_cols; + + if (has_rows && has_cols) { + write_token(w, vp9_partition_tree, probs, &vp9_partition_encodings[p]); + } else if (!has_rows && has_cols) { + assert(p == PARTITION_SPLIT || p == PARTITION_HORZ); + vp9_write(w, p == PARTITION_SPLIT, probs[1]); + } else if (has_rows && !has_cols) { + assert(p == PARTITION_SPLIT || p == PARTITION_VERT); + vp9_write(w, p == PARTITION_SPLIT, probs[2]); + } else { + assert(p == PARTITION_SPLIT); + } +} + +static void write_modes_sb(VP9_COMP *cpi, const TileInfo *const tile, + MODE_INFO **mi_8x8, vp9_writer *bc, TOKENEXTRA **tok, TOKENEXTRA *tok_end, int mi_row, int mi_col, BLOCK_SIZE bsize, 
int index) { @@ -615,42 +624,32 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc, if (index > 0) return; } else { - int pl; - const int idx = check_bsize_coverage(bs, cm->mi_rows, cm->mi_cols, - mi_row, mi_col); - pl = partition_plane_context(cm, mi_row, mi_col, bsize); - // encode the partition information - if (idx == 0) - write_token(bc, vp9_partition_tree, - cm->fc.partition_prob[cm->frame_type][pl], - vp9_partition_encodings + partition); - else if (idx > 0) - vp9_write(bc, partition == PARTITION_SPLIT, - cm->fc.partition_prob[cm->frame_type][pl][idx]); + write_partition(cpi, bs, mi_row, mi_col, partition, bsize, bc); } subsize = get_subsize(bsize, partition); switch (partition) { case PARTITION_NONE: - write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0); + write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0); break; case PARTITION_HORZ: - write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0); + write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0); if ((mi_row + bs) < cm->mi_rows) - write_modes_b(cpi, mi_8x8 + bs * mis, bc, tok, tok_end, mi_row + bs, - mi_col, 1); + write_modes_b(cpi, tile, mi_8x8 + bs * mis, bc, tok, tok_end, + mi_row + bs, mi_col, 1); break; case PARTITION_VERT: - write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0); + write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0); if ((mi_col + bs) < cm->mi_cols) - write_modes_b(cpi, mi_8x8 + bs, bc, tok, tok_end, mi_row, mi_col + bs, - 1); + write_modes_b(cpi, tile, mi_8x8 + bs, bc, tok, tok_end, + mi_row, mi_col + bs, 1); break; case PARTITION_SPLIT: for (n = 0; n < 4; n++) { const int j = n >> 1, i = n & 1; - write_modes_sb(cpi, mi_8x8 + j * bs * mis + i * bs, bc, tok, tok_end, + write_modes_sb(cpi, tile, mi_8x8 + j * bs * mis + i * bs, bc, + tok, tok_end, mi_row + j * bs, mi_col + i * bs, subsize, n); } break; @@ -661,10 +660,12 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc, // update partition context if (bsize >= BLOCK_8X8 && (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) - update_partition_context(cm, mi_row, mi_col, subsize, bsize); + update_partition_context(cpi->above_seg_context, cpi->left_seg_context, + mi_row, mi_col, subsize, bsize); } -static void write_modes(VP9_COMP *cpi, vp9_writer* const bc, +static void write_modes(VP9_COMP *cpi, const TileInfo *const tile, + vp9_writer* const bc, TOKENEXTRA **tok, TOKENEXTRA *tok_end) { VP9_COMMON *const cm = &cpi->common; const int mis = cm->mode_info_stride; @@ -672,15 +673,15 @@ static void write_modes(VP9_COMP *cpi, vp9_writer* const bc, MODE_INFO **mi_8x8 = cm->mi_grid_visible; MODE_INFO **m_8x8; - mi_8x8 += cm->cur_tile_mi_col_start + cm->cur_tile_mi_row_start * mis; + mi_8x8 += tile->mi_col_start + tile->mi_row_start * mis; - for (mi_row = cm->cur_tile_mi_row_start; mi_row < cm->cur_tile_mi_row_end; + for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; mi_row += 8, mi_8x8 += 8 * mis) { m_8x8 = mi_8x8; - vp9_zero(cm->left_seg_context); - for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end; + vp9_zero(cpi->left_seg_context); + for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; mi_col += MI_BLOCK_SIZE, m_8x8 += MI_BLOCK_SIZE) { - write_modes_sb(cpi, m_8x8, bc, tok, tok_end, mi_row, mi_col, + write_modes_sb(cpi, tile, m_8x8, bc, tok, tok_end, mi_row, mi_col, BLOCK_64X64, 0); } } @@ -692,8 +693,7 @@ static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size) { 
unsigned int (*eob_branch_ct)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] = cpi->common.counts.eob_branch[tx_size]; vp9_coeff_stats *coef_branch_ct = cpi->frame_branch_ct[tx_size]; - vp9_prob full_probs[ENTROPY_NODES]; - int i, j, k, l; + int i, j, k, l, m; for (i = 0; i < BLOCK_TYPES; ++i) { for (j = 0; j < REF_TYPES; ++j) { @@ -702,16 +702,14 @@ static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size) { if (l >= 3 && k == 0) continue; vp9_tree_probs_from_distribution(vp9_coef_tree, - full_probs, coef_branch_ct[i][j][k][l], coef_counts[i][j][k][l], 0); - vpx_memcpy(coef_probs[i][j][k][l], full_probs, - sizeof(vp9_prob) * UNCONSTRAINED_NODES); coef_branch_ct[i][j][k][l][0][1] = eob_branch_ct[i][j][k][l] - coef_branch_ct[i][j][k][l][0][0]; - coef_probs[i][j][k][l][0] = - get_binary_prob(coef_branch_ct[i][j][k][l][0][0], - coef_branch_ct[i][j][k][l][0][1]); + for (m = 0; m < UNCONSTRAINED_NODES; ++m) + coef_probs[i][j][k][l][m] = get_binary_prob( + coef_branch_ct[i][j][k][l][m][0], + coef_branch_ct[i][j][k][l][m][1]); #ifdef ENTROPY_STATS if (!cpi->dummy_packing) { int t; @@ -1103,7 +1101,7 @@ static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) { } } -static void write_interp_filter_type(INTERPOLATIONFILTERTYPE type, +static void write_interp_filter_type(INTERPOLATION_TYPE type, struct vp9_write_bit_buffer *wb) { const int type_to_literal[] = { 1, 0, 2, 3 }; @@ -1121,7 +1119,7 @@ static void fix_mcomp_filter_type(VP9_COMP *cpi) { int i, j, c = 0; for (i = 0; i < SWITCHABLE_FILTERS; ++i) { count[i] = 0; - for (j = 0; j <= SWITCHABLE_FILTERS; ++j) + for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) count[i] += cm->counts.switchable_interp[j][i]; c += (count[i] > 0); } @@ -1201,7 +1199,7 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { const int tile_cols = 1 << cm->log2_tile_cols; const int tile_rows = 1 << cm->log2_tile_rows; - vpx_memset(cm->above_seg_context, 0, sizeof(PARTITION_CONTEXT) * + vpx_memset(cpi->above_seg_context, 0, sizeof(*cpi->above_seg_context) * mi_cols_aligned_to_sb(cm->mi_cols)); tok[0][0] = cpi->tok; @@ -1216,9 +1214,10 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { } for (tile_row = 0; tile_row < tile_rows; tile_row++) { - vp9_get_tile_row_offsets(cm, tile_row); for (tile_col = 0; tile_col < tile_cols; tile_col++) { - vp9_get_tile_col_offsets(cm, tile_col); + TileInfo tile; + + vp9_tile_init(&tile, cm, 0, tile_col); tok_end = tok[tile_row][tile_col] + cpi->tok_count[tile_row][tile_col]; if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) @@ -1226,7 +1225,7 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { else vp9_start_encode(&residual_bc, data_ptr + total_size); - write_modes(cpi, &residual_bc, &tok[tile_row][tile_col], tok_end); + write_modes(cpi, &tile, &residual_bc, &tok[tile_row][tile_col], tok_end); assert(tok[tile_row][tile_col] == tok_end); vp9_stop_encode(&residual_bc); if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) { @@ -1295,17 +1294,16 @@ static void write_frame_size_with_refs(VP9_COMP *cpi, } static void write_sync_code(struct vp9_write_bit_buffer *wb) { - vp9_wb_write_literal(wb, SYNC_CODE_0, 8); - vp9_wb_write_literal(wb, SYNC_CODE_1, 8); - vp9_wb_write_literal(wb, SYNC_CODE_2, 8); + vp9_wb_write_literal(wb, VP9_SYNC_CODE_0, 8); + vp9_wb_write_literal(wb, VP9_SYNC_CODE_1, 8); + vp9_wb_write_literal(wb, VP9_SYNC_CODE_2, 8); } static void write_uncompressed_header(VP9_COMP *cpi, struct vp9_write_bit_buffer *wb) { VP9_COMMON *const cm = &cpi->common; - // frame marker bits 
- vp9_wb_write_literal(wb, 0x2, 2); + vp9_wb_write_literal(wb, VP9_FRAME_MARKER, 2); // bitstream version. // 00 - profile 0. 4:2:0 only @@ -1319,18 +1317,10 @@ static void write_uncompressed_header(VP9_COMP *cpi, vp9_wb_write_bit(wb, cm->error_resilient_mode); if (cm->frame_type == KEY_FRAME) { + const COLOR_SPACE cs = UNKNOWN; write_sync_code(wb); - // colorspaces - // 000 - Unknown - // 001 - BT.601 - // 010 - BT.709 - // 011 - SMPTE-170 - // 100 - SMPTE-240 - // 101 - Reserved - // 110 - Reserved - // 111 - sRGB (RGB) - vp9_wb_write_literal(wb, 0, 3); - if (1 /* colorspace != sRGB */) { + vp9_wb_write_literal(wb, cs, 3); + if (cs != SRGB) { vp9_wb_write_bit(wb, 0); // 0: [16, 235] (i.e. xvYCC), 1: [0, 255] if (cm->version == 1) { vp9_wb_write_bit(wb, cm->subsampling_x); @@ -1457,11 +1447,9 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) { update_mbintra_mode_probs(cpi, &header_bc); for (i = 0; i < PARTITION_CONTEXTS; ++i) { - vp9_prob pnew[PARTITION_TYPES - 1]; unsigned int bct[PARTITION_TYPES - 1][2]; - update_mode(&header_bc, PARTITION_TYPES, - vp9_partition_tree, pnew, - fc->partition_prob[cm->frame_type][i], bct, + update_mode(&header_bc, PARTITION_TYPES, vp9_partition_tree, + fc->partition_prob[i], bct, (unsigned int *)cpi->partition_count[i]); } diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 12dad0311..8033a4d15 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -42,7 +42,7 @@ typedef struct { int comp_pred_diff; int single_pred_diff; int64_t tx_rd_diff[TX_MODES]; - int64_t best_filter_diff[SWITCHABLE_FILTERS + 1]; + int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]; // motion vector cache for adaptive motion search control in partition // search loop @@ -118,8 +118,7 @@ struct macroblock { unsigned inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES]; int intra_uv_mode_cost[2][MB_MODE_COUNT]; int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES]; - int switchable_interp_costs[SWITCHABLE_FILTERS + 1] - [SWITCHABLE_FILTERS]; + int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS]; // These define limits to motion vector components to prevent them // from extending outside the UMV borders @@ -137,7 +136,7 @@ struct macroblock { // note that token_costs is the cost when eob node is skipped vp9_coeff_cost token_costs[TX_SIZES]; - uint8_t token_cache[1024]; + DECLARE_ALIGNED(16, uint8_t, token_cache[1024]); int optimize; @@ -173,7 +172,7 @@ struct macroblock { BLOCK_SIZE sb_partitioning[4]; BLOCK_SIZE sb64_partitioning; - void (*fwd_txm4x4)(int16_t *input, int16_t *output, int pitch); + void (*fwd_txm4x4)(const int16_t *input, int16_t *output, int stride); }; // TODO(jingning): the variables used here are little complicated. need further diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c index 550cdee60..065992a25 100644 --- a/vp9/encoder/vp9_dct.c +++ b/vp9/encoder/vp9_dct.c @@ -8,14 +8,17 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ - #include <assert.h> #include <math.h> + #include "./vpx_config.h" -#include "vp9/common/vp9_systemdependent.h" +#include "./vp9_rtcd.h" #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_idct.h" +#include "vp9/common/vp9_systemdependent.h" + +#include "vp9/encoder/vp9_dct.h" static void fdct4(const int16_t *input, int16_t *output) { int16_t step[4]; @@ -36,7 +39,7 @@ static void fdct4(const int16_t *input, int16_t *output) { output[3] = dct_const_round_shift(temp2); } -void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int stride) { +void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, @@ -46,7 +49,7 @@ void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int stride) { int pass; // We need an intermediate buffer between passes. int16_t intermediate[4 * 4]; - int16_t *in = input; + const int16_t *in = input; int16_t *out = intermediate; // Do the two transform/transpose passes for (pass = 0; pass < 2; ++pass) { @@ -148,8 +151,8 @@ static const transform_2d FHT_4[] = { { fadst4, fadst4 } // ADST_ADST = 3 }; -void vp9_short_fht4x4_c(int16_t *input, int16_t *output, - int pitch, TX_TYPE tx_type) { +void vp9_short_fht4x4_c(const int16_t *input, int16_t *output, + int stride, int tx_type) { int16_t out[4 * 4]; int16_t *outptr = &out[0]; int i, j; @@ -159,7 +162,7 @@ void vp9_short_fht4x4_c(int16_t *input, int16_t *output, // Columns for (i = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) - temp_in[j] = input[j * pitch + i] * 16; + temp_in[j] = input[j * stride + i] * 16; if (i == 0 && temp_in[0]) temp_in[0] += 1; ht.cols(temp_in, temp_out); @@ -229,7 +232,7 @@ static void fdct8(const int16_t *input, int16_t *output) { output[7] = dct_const_round_shift(t3); } -void vp9_short_fdct8x8_c(int16_t *input, int16_t *final_output, int stride) { +void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) { int i, j; int16_t intermediate[64]; @@ -300,7 +303,7 @@ void vp9_short_fdct8x8_c(int16_t *input, int16_t *final_output, int stride) { } } -void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int stride) { +void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, @@ -310,7 +313,7 @@ void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int stride) { int pass; // We need an intermediate buffer between passes. 
int16_t intermediate[256]; - int16_t *in = input; + const int16_t *in = input; int16_t *out = intermediate; // Do the two transform/transpose passes for (pass = 0; pass < 2; ++pass) { @@ -556,8 +559,8 @@ static const transform_2d FHT_8[] = { { fadst8, fadst8 } // ADST_ADST = 3 }; -void vp9_short_fht8x8_c(int16_t *input, int16_t *output, - int pitch, TX_TYPE tx_type) { +void vp9_short_fht8x8_c(const int16_t *input, int16_t *output, + int stride, int tx_type) { int16_t out[64]; int16_t *outptr = &out[0]; int i, j; @@ -567,7 +570,7 @@ void vp9_short_fht8x8_c(int16_t *input, int16_t *output, // Columns for (i = 0; i < 8; ++i) { for (j = 0; j < 8; ++j) - temp_in[j] = input[j * pitch + i] * 4; + temp_in[j] = input[j * stride + i] * 4; ht.cols(temp_in, temp_out); for (j = 0; j < 8; ++j) outptr[j * 8 + i] = temp_out[j]; @@ -585,10 +588,10 @@ void vp9_short_fht8x8_c(int16_t *input, int16_t *output, /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per pixel. */ -void vp9_short_walsh4x4_c(int16_t *input, int16_t *output, int stride) { +void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) { int i; int a1, b1, c1, d1, e1; - int16_t *ip = input; + const int16_t *ip = input; int16_t *op = output; for (i = 0; i < 4; i++) { @@ -949,8 +952,8 @@ static const transform_2d FHT_16[] = { { fadst16, fadst16 } // ADST_ADST = 3 }; -void vp9_short_fht16x16_c(int16_t *input, int16_t *output, - int pitch, TX_TYPE tx_type) { +void vp9_short_fht16x16_c(const int16_t *input, int16_t *output, + int stride, int tx_type) { int16_t out[256]; int16_t *outptr = &out[0]; int i, j; @@ -960,7 +963,7 @@ void vp9_short_fht16x16_c(int16_t *input, int16_t *output, // Columns for (i = 0; i < 16; ++i) { for (j = 0; j < 16; ++j) - temp_in[j] = input[j * pitch + i] * 4; + temp_in[j] = input[j * stride + i] * 4; ht.cols(temp_in, temp_out); for (j = 0; j < 16; ++j) outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; @@ -1311,7 +1314,7 @@ static void dct32_1d(const int *input, int *output, int round) { output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); } -void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int stride) { +void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) { int i, j; int output[32 * 32]; @@ -1339,7 +1342,7 @@ void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int stride) { // Note that although we use dct_32_round in dct32_1d computation flow, // this 2d fdct32x32 for rate-distortion optimization loop is operating // within 16 bits precision. 
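Aside on the transform renames in this hunk: vp9_short_walsh4x4_c becomes vp9_fwht4x4_c, and later in this change the encoder points mb.fwd_txm4x4 at vp9_fwht4x4 whenever lossless mode is enabled. The Walsh-Hadamard kernel is chosen because it is exactly reversible on integers, which a rounded DCT is not. Below is a standalone toy demonstrating that property; it is not the libvpx kernel (vp9's version carries its own fixed-point scaling), just the underlying butterfly.

#include <stdio.h>

/* 1-D butterfly: out = M * in, where M has +/-1 entries and M * M == 4 * I. */
static void wht4(const int *in, int *out) {
  const int a = in[0] + in[1], b = in[0] - in[1];
  const int c = in[2] + in[3], d = in[2] - in[3];
  out[0] = a + c;
  out[1] = b + d;
  out[2] = a - c;
  out[3] = b - d;
}

/* Apply the butterfly to all rows, then all columns, of a 4x4 block. */
static void wht4x4(int in[4][4], int out[4][4]) {
  int tmp[4][4], col[4], res[4];
  int i, j;
  for (i = 0; i < 4; ++i) wht4(in[i], tmp[i]);
  for (j = 0; j < 4; ++j) {
    for (i = 0; i < 4; ++i) col[i] = tmp[i][j];
    wht4(col, res);
    for (i = 0; i < 4; ++i) out[i][j] = res[i];
  }
}

int main(void) {
  int block[4][4] = { { 1, 2, 3, 4 }, { 5, 6, 7, 8 },
                      { 9, 10, 11, 12 }, { 13, 14, 15, 16 } };
  int coeff[4][4], twice[4][4];
  int i, j, ok = 1;
  wht4x4(block, coeff);   /* forward transform */
  wht4x4(coeff, twice);   /* forward again: equals 16 * block exactly */
  for (i = 0; i < 4; ++i)
    for (j = 0; j < 4; ++j)
      ok &= (twice[i][j] / 16 == block[i][j]);
  puts(ok ? "exact round-trip" : "mismatch");
  return 0;
}

Because M * M == 4 * I, two forward passes per dimension scale a 4x4 block by exactly 16, so an integer divide recovers the input bit for bit.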
-void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int stride) { +void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) { int i, j; int output[32 * 32]; @@ -1366,3 +1369,27 @@ void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int stride) { out[j + i * 32] = temp_out[j]; } } + +void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output, + int stride) { + if (tx_type == DCT_DCT) + vp9_fdct4x4(input, output, stride); + else + vp9_short_fht4x4(input, output, stride, tx_type); +} + +void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output, + int stride) { + if (tx_type == DCT_DCT) + vp9_fdct8x8(input, output, stride); + else + vp9_short_fht8x8(input, output, stride, tx_type); +} + +void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output, + int stride) { + if (tx_type == DCT_DCT) + vp9_fdct16x16(input, output, stride); + else + vp9_short_fht16x16(input, output, stride, tx_type); +} diff --git a/vp9/encoder/vp9_dct.h b/vp9/encoder/vp9_dct.h new file mode 100644 index 000000000..aaf976d93 --- /dev/null +++ b/vp9/encoder/vp9_dct.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP9_ENCODER_VP9_DCT_H_ +#define VP9_ENCODER_VP9_DCT_H_ + +void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output, + int stride); + +void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output, + int stride); + +void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output, + int stride); + +#endif // VP9_ENCODER_VP9_DCT_H_ diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 98284a690..a45299b59 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -282,7 +282,7 @@ static void build_activity_map(VP9_COMP *cpi) { VP9_COMMON * const cm = &cpi->common; #if ALT_ACT_MEASURE - YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx]; + YV12_BUFFER_CONFIG *new_yv12 = get_frame_new_buffer(cm); int recon_yoffset; int recon_y_stride = new_yv12->y_stride; #endif @@ -465,7 +465,7 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff; cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff; - for (i = 0; i <= SWITCHABLE_FILTERS; i++) + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) cpi->rd_filter_diff[i] += ctx->best_filter_diff[i]; } } @@ -484,8 +484,8 @@ void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src, x->e_mbd.plane[i].subsampling_y); } -static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, - BLOCK_SIZE bsize) { +static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile, + int mi_row, int mi_col, BLOCK_SIZE bsize) { MACROBLOCK *const x = &cpi->mb; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; @@ -499,7 +499,7 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, const int idx_map = mb_row * cm->mb_cols + mb_col; const struct segmentation *const seg = &cm->seg; - set_skip_context(cm, xd, mi_row, mi_col); + set_skip_context(xd, cpi->above_context, cpi->left_context, mi_row, mi_col); // Activity map pointer 
x->mb_activity_ptr = &cpi->mb_activity_map[idx_map]; @@ -528,7 +528,8 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, // Set up distance of MB to edge of frame in 1/8th pel units assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1))); - set_mi_row_col(cm, xd, mi_row, mi_height, mi_col, mi_width); + set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width, + cm->mi_rows, cm->mi_cols); /* set up source buffers */ vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col); @@ -555,9 +556,8 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, const int x = mb_col & ~3; const int p16 = ((mb_row & 1) << 1) + (mb_col & 1); const int p32 = ((mb_row & 2) << 2) + ((mb_col & 2) << 1); - const int tile_progress = cm->cur_tile_mi_col_start * cm->mb_rows >> 1; - const int mb_cols = (cm->cur_tile_mi_col_end - cm->cur_tile_mi_col_start) - >> 1; + const int tile_progress = tile->mi_col_start * cm->mb_rows >> 1; + const int mb_cols = (tile->mi_col_end - tile->mi_col_start) >> 1; cpi->seg0_progress = ((y * mb_cols + x * 4 + p32 + p16 + tile_progress) << 16) / cm->MBs; @@ -570,7 +570,8 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, } } -static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col, +static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, + int mi_row, int mi_col, int *totalrate, int64_t *totaldist, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd) { @@ -596,7 +597,7 @@ static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col, } } - set_offsets(cpi, mi_row, mi_col, bsize); + set_offsets(cpi, tile, mi_row, mi_col, bsize); xd->mi_8x8[0]->mbmi.sb_type = bsize; // Set to zero to make sure we do not use the previous encoded frame stats @@ -632,10 +633,10 @@ static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col, best_rd); } else { if (bsize >= BLOCK_8X8) - vp9_rd_pick_inter_mode_sb(cpi, x, mi_row, mi_col, totalrate, totaldist, - bsize, ctx, best_rd); + vp9_rd_pick_inter_mode_sb(cpi, x, tile, mi_row, mi_col, + totalrate, totaldist, bsize, ctx, best_rd); else - vp9_rd_pick_inter_mode_sub8x8(cpi, x, mi_row, mi_col, totalrate, + vp9_rd_pick_inter_mode_sub8x8(cpi, x, tile, mi_row, mi_col, totalrate, totaldist, bsize, ctx, best_rd); } @@ -682,10 +683,6 @@ static void update_stats(VP9_COMP *cpi) { [mbmi->ref_frame[0] != GOLDEN_FRAME]++; } } - - // Count of last ref frame 0,0 usage - if (mbmi->mode == ZEROMV && mbmi->ref_frame[0] == LAST_FRAME) - cpi->inter_zz_count++; } } @@ -711,7 +708,6 @@ static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col, ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8], BLOCK_SIZE bsize) { - VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; int p; @@ -721,28 +717,27 @@ static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col, int mi_height = num_8x8_blocks_high_lookup[bsize]; for (p = 0; p < MAX_MB_PLANE; p++) { vpx_memcpy( - cm->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x), + cpi->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x), a + num_4x4_blocks_wide * p, (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >> xd->plane[p].subsampling_x); vpx_memcpy( - cm->left_context[p] + cpi->left_context[p] + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y), l + num_4x4_blocks_high * p, (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >> xd->plane[p].subsampling_y); } - vpx_memcpy(cm->above_seg_context + mi_col, sa, - sizeof(PARTITION_CONTEXT) * 
mi_width); - vpx_memcpy(cm->left_seg_context + (mi_row & MI_MASK), sl, - sizeof(PARTITION_CONTEXT) * mi_height); + vpx_memcpy(cpi->above_seg_context + mi_col, sa, + sizeof(*cpi->above_seg_context) * mi_width); + vpx_memcpy(cpi->left_seg_context + (mi_row & MI_MASK), sl, + sizeof(cpi->left_seg_context[0]) * mi_height); } static void save_context(VP9_COMP *cpi, int mi_row, int mi_col, ENTROPY_CONTEXT a[16 * MAX_MB_PLANE], ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8], BLOCK_SIZE bsize) { - const VP9_COMMON *const cm = &cpi->common; const MACROBLOCK *const x = &cpi->mb; const MACROBLOCKD *const xd = &x->e_mbd; int p; @@ -755,23 +750,24 @@ static void save_context(VP9_COMP *cpi, int mi_row, int mi_col, for (p = 0; p < MAX_MB_PLANE; ++p) { vpx_memcpy( a + num_4x4_blocks_wide * p, - cm->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x), + cpi->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x), (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >> xd->plane[p].subsampling_x); vpx_memcpy( l + num_4x4_blocks_high * p, - cm->left_context[p] + cpi->left_context[p] + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y), (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >> xd->plane[p].subsampling_y); } - vpx_memcpy(sa, cm->above_seg_context + mi_col, - sizeof(PARTITION_CONTEXT) * mi_width); - vpx_memcpy(sl, cm->left_seg_context + (mi_row & MI_MASK), - sizeof(PARTITION_CONTEXT) * mi_height); + vpx_memcpy(sa, cpi->above_seg_context + mi_col, + sizeof(*cpi->above_seg_context) * mi_width); + vpx_memcpy(sl, cpi->left_seg_context + (mi_row & MI_MASK), + sizeof(cpi->left_seg_context[0]) * mi_height); } -static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, +static void encode_b(VP9_COMP *cpi, const TileInfo *const tile, + TOKENEXTRA **tp, int mi_row, int mi_col, int output_enabled, BLOCK_SIZE bsize, int sub_index) { VP9_COMMON * const cm = &cpi->common; MACROBLOCK * const x = &cpi->mb; @@ -789,7 +785,7 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, if (xd->ab_index > 0) return; } - set_offsets(cpi, mi_row, mi_col, bsize); + set_offsets(cpi, tile, mi_row, mi_col, bsize); update_state(cpi, get_block_context(x, bsize), bsize, output_enabled); encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize); @@ -801,7 +797,8 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, } } -static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, +static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile, + TOKENEXTRA **tp, int mi_row, int mi_col, int output_enabled, BLOCK_SIZE bsize) { VP9_COMMON * const cm = &cpi->common; MACROBLOCK * const x = &cpi->mb; @@ -818,7 +815,8 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, c1 = BLOCK_4X4; if (bsize >= BLOCK_8X8) { - pl = partition_plane_context(cm, mi_row, mi_col, bsize); + pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, + mi_row, mi_col, bsize); c1 = *(get_sb_partitioning(x, bsize)); } partition = partition_lookup[bsl][c1]; @@ -827,19 +825,19 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, case PARTITION_NONE: if (output_enabled && bsize >= BLOCK_8X8) cpi->partition_count[pl][PARTITION_NONE]++; - encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1); + encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, c1, -1); break; case PARTITION_VERT: if (output_enabled) cpi->partition_count[pl][PARTITION_VERT]++; 
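Aside on the signature changes running through this file: encode_b, encode_sb and the pick/use-partition helpers now take a const TileInfo *const tile instead of reading cur_tile_mi_* offsets out of VP9_COMMON, so tile bounds stop being hidden mutable state. A compilable sketch of the resulting iteration pattern follows; the field names match the diff, but the struct layout is assumed rather than copied from the real tile header.

#include <stdio.h>

typedef struct {
  int mi_row_start, mi_row_end;  /* bounds in 8x8 mode-info units */
  int mi_col_start, mi_col_end;
} TileInfo;

static void visit_sb(const TileInfo *tile, int mi_row, int mi_col) {
  printf("SB (%2d,%2d), tile cols [%d,%d)\n",
         mi_row, mi_col, tile->mi_col_start, tile->mi_col_end);
}

int main(void) {
  /* Two column tiles of a 128x384-pixel strip; bounds travel with the
   * tile rather than living inside a shared VP9_COMMON. */
  TileInfo tiles[2] = { { 0, 16, 0, 24 }, { 0, 16, 24, 48 } };
  int t, mi_row, mi_col;
  for (t = 0; t < 2; ++t)
    for (mi_row = tiles[t].mi_row_start; mi_row < tiles[t].mi_row_end;
         mi_row += 8)  /* MI_BLOCK_SIZE step, as in encode_sb_row */
      for (mi_col = tiles[t].mi_col_start; mi_col < tiles[t].mi_col_end;
           mi_col += 8)
        visit_sb(&tiles[t], mi_row, mi_col);
  return 0;
}

With bounds carried per tile, two tiles can be walked concurrently without either walker writing into cm, which is what the decode_tiles_mt path added earlier in this change relies on.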
- encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0); - encode_b(cpi, tp, mi_row, mi_col + bs, output_enabled, c1, 1); + encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, c1, 0); + encode_b(cpi, tile, tp, mi_row, mi_col + bs, output_enabled, c1, 1); break; case PARTITION_HORZ: if (output_enabled) cpi->partition_count[pl][PARTITION_HORZ]++; - encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0); - encode_b(cpi, tp, mi_row + bs, mi_col, output_enabled, c1, 1); + encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, c1, 0); + encode_b(cpi, tile, tp, mi_row + bs, mi_col, output_enabled, c1, 1); break; case PARTITION_SPLIT: subsize = get_subsize(bsize, PARTITION_SPLIT); @@ -851,7 +849,7 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, const int x_idx = i & 1, y_idx = i >> 1; *get_sb_index(xd, subsize) = i; - encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs, + encode_sb(cpi, tile, tp, mi_row + y_idx * bs, mi_col + x_idx * bs, output_enabled, subsize); } break; @@ -861,7 +859,8 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, } if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) - update_partition_context(cm, mi_row, mi_col, c1, bsize); + update_partition_context(cpi->above_seg_context, cpi->left_seg_context, + mi_row, mi_col, c1, bsize); } // Check to see if the given partition size is allowed for a specified number @@ -889,13 +888,13 @@ static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, // However, at the bottom and right borders of the image the requested size // may not be allowed in which case this code attempts to choose the largest // allowable partition. -static void set_partitioning(VP9_COMP *cpi, MODE_INFO **mi_8x8, - int mi_row, int mi_col) { +static void set_partitioning(VP9_COMP *cpi, const TileInfo *const tile, + MODE_INFO **mi_8x8, int mi_row, int mi_col) { VP9_COMMON *const cm = &cpi->common; BLOCK_SIZE bsize = cpi->sf.always_this_block_size; const int mis = cm->mode_info_stride; - int row8x8_remaining = cm->cur_tile_mi_row_end - mi_row; - int col8x8_remaining = cm->cur_tile_mi_col_end - mi_col; + int row8x8_remaining = tile->mi_row_end - mi_row; + int col8x8_remaining = tile->mi_col_end - mi_col; int block_row, block_col; MODE_INFO * mi_upper_left = cm->mi + mi_row * mis + mi_col; int bh = num_8x8_blocks_high_lookup[bsize]; @@ -970,7 +969,9 @@ static int sb_has_motion(VP9_COMP *cpi, MODE_INFO **prev_mi_8x8) { return 0; } -static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, +static void rd_use_partition(VP9_COMP *cpi, + const TileInfo *const tile, + MODE_INFO **mi_8x8, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rate, int64_t *dist, int do_recon) { @@ -1022,7 +1023,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); if (bsize == BLOCK_16X16) { - set_offsets(cpi, mi_row, mi_col, bsize); + set_offsets(cpi, tile, mi_row, mi_col, bsize); x->mb_energy = vp9_block_energy(cpi, x, bsize); } @@ -1049,10 +1050,12 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, mi_row + (ms >> 1) < cm->mi_rows && mi_col + (ms >> 1) < cm->mi_cols) { *(get_sb_partitioning(x, bsize)) = bsize; - pick_sb_modes(cpi, mi_row, mi_col, &none_rate, &none_dist, bsize, + pick_sb_modes(cpi, tile, mi_row, mi_col, &none_rate, &none_dist, bsize, get_block_context(x, bsize), INT64_MAX); - pl = partition_plane_context(cm, mi_row, mi_col, bsize); + pl = partition_plane_context(cpi->above_seg_context, + 
cpi->left_seg_context, + mi_row, mi_col, bsize); none_rate += x->partition_cost[pl][PARTITION_NONE]; restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); @@ -1063,12 +1066,12 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, switch (partition) { case PARTITION_NONE: - pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist, + pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist, bsize, get_block_context(x, bsize), INT64_MAX); break; case PARTITION_HORZ: *get_sb_index(xd, subsize) = 0; - pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist, + pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist, subsize, get_block_context(x, subsize), INT64_MAX); if (last_part_rate != INT_MAX && bsize >= BLOCK_8X8 && mi_row + (mh >> 1) < cm->mi_rows) { @@ -1077,7 +1080,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); *get_sb_index(xd, subsize) = 1; - pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize, + pick_sb_modes(cpi, tile, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize, get_block_context(x, subsize), INT64_MAX); if (rt == INT_MAX || dt == INT_MAX) { last_part_rate = INT_MAX; @@ -1091,7 +1094,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, break; case PARTITION_VERT: *get_sb_index(xd, subsize) = 0; - pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist, + pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist, subsize, get_block_context(x, subsize), INT64_MAX); if (last_part_rate != INT_MAX && bsize >= BLOCK_8X8 && mi_col + (ms >> 1) < cm->mi_cols) { @@ -1100,7 +1103,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); *get_sb_index(xd, subsize) = 1; - pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize, + pick_sb_modes(cpi, tile, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize, get_block_context(x, subsize), INT64_MAX); if (rt == INT_MAX || dt == INT_MAX) { last_part_rate = INT_MAX; @@ -1127,7 +1130,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, *get_sb_index(xd, subsize) = i; - rd_use_partition(cpi, mi_8x8 + jj * bss * mis + ii * bss, tp, + rd_use_partition(cpi, tile, mi_8x8 + jj * bss * mis + ii * bss, tp, mi_row + y_idx, mi_col + x_idx, subsize, &rt, &dt, i != 3); if (rt == INT_MAX || dt == INT_MAX) { @@ -1143,7 +1146,8 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, assert(0); } - pl = partition_plane_context(cm, mi_row, mi_col, bsize); + pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, + mi_row, mi_col, bsize); if (last_part_rate < INT_MAX) last_part_rate += x->partition_cost[pl][partition]; @@ -1175,7 +1179,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx, &rt, &dt, + pick_sb_modes(cpi, tile, mi_row + y_idx, mi_col + x_idx, &rt, &dt, split_subsize, get_block_context(x, split_subsize), INT64_MAX); @@ -1188,15 +1192,18 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, } if (i != 3) - encode_sb(cpi, tp, mi_row + y_idx, mi_col + x_idx, 0, + encode_sb(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, 0, split_subsize); split_rate += rt; split_dist += dt; - pl = 
partition_plane_context(cm, mi_row + y_idx, mi_col + x_idx, bsize); + pl = partition_plane_context(cpi->above_seg_context, + cpi->left_seg_context, + mi_row + y_idx, mi_col + x_idx, bsize); split_rate += x->partition_cost[pl][PARTITION_NONE]; } - pl = partition_plane_context(cm, mi_row, mi_col, bsize); + pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, + mi_row, mi_col, bsize); if (split_rate < INT_MAX) { split_rate += x->partition_cost[pl][PARTITION_SPLIT]; @@ -1231,7 +1238,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, assert(chosen_rate < INT_MAX && chosen_dist < INT_MAX); if (do_recon) - encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize); + encode_sb(cpi, tile, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize); *rate = chosen_rate; *dist = chosen_dist; @@ -1279,7 +1286,8 @@ static void get_sb_partition_size_range(VP9_COMP *cpi, MODE_INFO ** mi_8x8, // Look at neighboring blocks and set a min and max partition size based on // what they chose. -static void rd_auto_partition_range(VP9_COMP *cpi, int row, int col, +static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile, + int row, int col, BLOCK_SIZE *min_block_size, BLOCK_SIZE *max_block_size) { VP9_COMMON * const cm = &cpi->common; @@ -1293,8 +1301,8 @@ static void rd_auto_partition_range(VP9_COMP *cpi, int row, int col, MODE_INFO ** above_sb64_mi_8x8; MODE_INFO ** left_sb64_mi_8x8; - int row8x8_remaining = cm->cur_tile_mi_row_end - row; - int col8x8_remaining = cm->cur_tile_mi_col_end - col; + int row8x8_remaining = tile->mi_row_end - row; + int col8x8_remaining = tile->mi_col_end - col; int bh, bw; // Trap case where we do not have a prediction. @@ -1444,7 +1452,8 @@ static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are // unlikely to be selected depending on previous rate-distortion optimization // results, for encoding speed-up. 
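One idiom in rd_pick_partition below deserves a gloss: each recursive call receives best_rd - sum_rd as its budget, and a callee that cannot beat that budget reports this_rate == INT_MAX, which aborts the partition type being accumulated. A minimal standalone model of that contract follows; the costs are toy numbers, and the real comparison goes through the encoder's fixed-point RDCOST, elided here.

#include <limits.h>
#include <stdio.h>

/* Stand-in for pick_sb_modes: reports INT_MAX when its own cost cannot
 * fit in the budget it was handed, exactly the signal rd_pick_partition
 * tests for before adding the result into sum_rd. */
static void pick_mode(long own_cost, long budget, int *rate, long *rd) {
  if (own_cost > budget) {
    *rate = INT_MAX;
    *rd = 0;
    return;
  }
  *rate = (int)own_cost;  /* toy: rate and rd collapse into one number */
  *rd = own_cost;
}

int main(void) {
  const long best_rd = 100;               /* best full-block cost so far */
  const long half_costs[2] = { 60, 70 };  /* two halves of PARTITION_HORZ */
  long sum_rd = 0, rd;
  int i, rate;
  for (i = 0; i < 2; ++i) {
    pick_mode(half_costs[i], best_rd - sum_rd, &rate, &rd);
    if (rate == INT_MAX) {   /* second half exceeds 100 - 60 = 40 */
      puts("PARTITION_HORZ pruned");
      return 0;
    }
    sum_rd += rd;
  }
  puts("PARTITION_HORZ kept");
  return 0;
}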
-static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, +static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, + TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rate, int64_t *dist, int do_recon, int64_t best_rd) { VP9_COMMON * const cm = &cpi->common; @@ -1481,10 +1490,11 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, return; } } - assert(mi_height_log2(bsize) == mi_width_log2(bsize)); + assert(num_8x8_blocks_wide_lookup[bsize] == + num_8x8_blocks_high_lookup[bsize]); if (bsize == BLOCK_16X16) { - set_offsets(cpi, mi_row, mi_col, bsize); + set_offsets(cpi, tile, mi_row, mi_col, bsize); x->mb_energy = vp9_block_energy(cpi, x, bsize); } @@ -1521,11 +1531,13 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, // PARTITION_NONE if (partition_none_allowed) { - pick_sb_modes(cpi, mi_row, mi_col, &this_rate, &this_dist, bsize, + pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, bsize, get_block_context(x, bsize), best_rd); if (this_rate != INT_MAX) { if (bsize >= BLOCK_8X8) { - pl = partition_plane_context(cm, mi_row, mi_col, bsize); + pl = partition_plane_context(cpi->above_seg_context, + cpi->left_seg_context, + mi_row, mi_col, bsize); this_rate += x->partition_cost[pl][PARTITION_NONE]; } sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist); @@ -1573,7 +1585,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, *get_sb_index(xd, subsize) = i; if (cpi->sf.adaptive_motion_search) load_pred_mv(x, get_block_context(x, bsize)); - rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize, + rd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, subsize, &this_rate, &this_dist, i != 3, best_rd - sum_rd); if (this_rate == INT_MAX) { @@ -1585,7 +1597,9 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, } } if (sum_rd < best_rd && i == 4) { - pl = partition_plane_context(cm, mi_row, mi_col, bsize); + pl = partition_plane_context(cpi->above_seg_context, + cpi->left_seg_context, + mi_row, mi_col, bsize); sum_rate += x->partition_cost[pl][PARTITION_SPLIT]; sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); if (sum_rd < best_rd) { @@ -1618,7 +1632,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, *get_sb_index(xd, subsize) = 0; if (cpi->sf.adaptive_motion_search) load_pred_mv(x, get_block_context(x, bsize)); - pick_sb_modes(cpi, mi_row, mi_col, &sum_rate, &sum_dist, subsize, + pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize, get_block_context(x, subsize), best_rd); sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); @@ -1629,7 +1643,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, *get_sb_index(xd, subsize) = 1; if (cpi->sf.adaptive_motion_search) load_pred_mv(x, get_block_context(x, bsize)); - pick_sb_modes(cpi, mi_row + ms, mi_col, &this_rate, + pick_sb_modes(cpi, tile, mi_row + ms, mi_col, &this_rate, &this_dist, subsize, get_block_context(x, subsize), best_rd - sum_rd); if (this_rate == INT_MAX) { @@ -1641,7 +1655,9 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, } } if (sum_rd < best_rd) { - pl = partition_plane_context(cm, mi_row, mi_col, bsize); + pl = partition_plane_context(cpi->above_seg_context, + cpi->left_seg_context, + mi_row, mi_col, bsize); sum_rate += x->partition_cost[pl][PARTITION_HORZ]; sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); if (sum_rd < best_rd) { @@ 
-1661,7 +1677,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, *get_sb_index(xd, subsize) = 0; if (cpi->sf.adaptive_motion_search) load_pred_mv(x, get_block_context(x, bsize)); - pick_sb_modes(cpi, mi_row, mi_col, &sum_rate, &sum_dist, subsize, + pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize, get_block_context(x, subsize), best_rd); sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) { @@ -1671,7 +1687,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, *get_sb_index(xd, subsize) = 1; if (cpi->sf.adaptive_motion_search) load_pred_mv(x, get_block_context(x, bsize)); - pick_sb_modes(cpi, mi_row, mi_col + ms, &this_rate, + pick_sb_modes(cpi, tile, mi_row, mi_col + ms, &this_rate, &this_dist, subsize, get_block_context(x, subsize), best_rd - sum_rd); if (this_rate == INT_MAX) { @@ -1683,7 +1699,9 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, } } if (sum_rd < best_rd) { - pl = partition_plane_context(cm, mi_row, mi_col, bsize); + pl = partition_plane_context(cpi->above_seg_context, + cpi->left_seg_context, + mi_row, mi_col, bsize); sum_rate += x->partition_cost[pl][PARTITION_VERT]; sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); if (sum_rd < best_rd) { @@ -1701,7 +1719,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, *dist = best_dist; if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon) - encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize); + encode_sb(cpi, tile, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize); if (bsize == BLOCK_64X64) { assert(tp_orig < *tp); assert(best_rate < INT_MAX); @@ -1712,7 +1730,8 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, } // Examines 64x64 block and chooses a best reference frame -static void rd_pick_reference_frame(VP9_COMP *cpi, int mi_row, int mi_col) { +static void rd_pick_reference_frame(VP9_COMP *cpi, const TileInfo *const tile, + int mi_row, int mi_col) { VP9_COMMON * const cm = &cpi->common; MACROBLOCK * const x = &cpi->mb; int bsl = b_width_log2(BLOCK_64X64), bs = 1 << bsl; @@ -1732,9 +1751,10 @@ static void rd_pick_reference_frame(VP9_COMP *cpi, int mi_row, int mi_col) { if ((mi_row + (ms >> 1) < cm->mi_rows) && (mi_col + (ms >> 1) < cm->mi_cols)) { cpi->set_ref_frame_mask = 1; - pick_sb_modes(cpi, mi_row, mi_col, &r, &d, BLOCK_64X64, + pick_sb_modes(cpi, tile, mi_row, mi_col, &r, &d, BLOCK_64X64, get_block_context(x, BLOCK_64X64), INT64_MAX); - pl = partition_plane_context(cm, mi_row, mi_col, BLOCK_64X64); + pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, + mi_row, mi_col, BLOCK_64X64); r += x->partition_cost[pl][PARTITION_NONE]; *(get_sb_partitioning(x, BLOCK_64X64)) = BLOCK_64X64; @@ -1744,17 +1764,17 @@ static void rd_pick_reference_frame(VP9_COMP *cpi, int mi_row, int mi_col) { restore_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_64X64); } -static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp, - int *totalrate) { +static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile, + int mi_row, TOKENEXTRA **tp, int *totalrate) { VP9_COMMON * const cm = &cpi->common; int mi_col; // Initialize the left context for the new SB row - vpx_memset(&cm->left_context, 0, sizeof(cm->left_context)); - vpx_memset(cm->left_seg_context, 0, sizeof(cm->left_seg_context)); + vpx_memset(&cpi->left_context, 0, sizeof(cpi->left_context)); + 
vpx_memset(cpi->left_seg_context, 0, sizeof(cpi->left_seg_context)); // Code each SB in the row - for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end; + for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; mi_col += MI_BLOCK_SIZE) { int dummy_rate; int64_t dummy_dist; @@ -1762,7 +1782,7 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp, vp9_zero(cpi->mb.pred_mv); if (cpi->sf.reference_masking) - rd_pick_reference_frame(cpi, mi_row, mi_col); + rd_pick_reference_frame(cpi, tile, mi_row, mi_col); if (cpi->sf.use_lastframe_partitioning || cpi->sf.use_one_partition_size_always ) { @@ -1772,9 +1792,9 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp, cpi->mb.source_variance = UINT_MAX; if (cpi->sf.use_one_partition_size_always) { - set_offsets(cpi, mi_row, mi_col, BLOCK_64X64); - set_partitioning(cpi, mi_8x8, mi_row, mi_col); - rd_use_partition(cpi, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, + set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64); + set_partitioning(cpi, tile, mi_8x8, mi_row, mi_col); + rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rate, &dummy_dist, 1); } else { if ((cpi->common.current_video_frame @@ -1788,28 +1808,28 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp, sb_has_motion(cpi, prev_mi_8x8))) { // If required set upper and lower partition size limits if (cpi->sf.auto_min_max_partition_size) { - set_offsets(cpi, mi_row, mi_col, BLOCK_64X64); - rd_auto_partition_range(cpi, mi_row, mi_col, + set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64); + rd_auto_partition_range(cpi, tile, mi_row, mi_col, &cpi->sf.min_partition_size, &cpi->sf.max_partition_size); } - rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_64X64, + rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rate, &dummy_dist, 1, INT64_MAX); } else { copy_partitioning(cpi, mi_8x8, prev_mi_8x8); - rd_use_partition(cpi, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, + rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rate, &dummy_dist, 1); } } } else { // If required set upper and lower partition size limits if (cpi->sf.auto_min_max_partition_size) { - set_offsets(cpi, mi_row, mi_col, BLOCK_64X64); - rd_auto_partition_range(cpi, mi_row, mi_col, + set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64); + rd_auto_partition_range(cpi, tile, mi_row, mi_col, &cpi->sf.min_partition_size, &cpi->sf.max_partition_size); } - rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_64X64, + rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rate, &dummy_dist, 1, INT64_MAX); } } @@ -1836,7 +1856,7 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) { // TODO(jkoleszar): are these initializations required? setup_pre_planes(xd, 0, &cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]], 0, 0, NULL); - setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], 0, 0); + setup_dst_planes(xd, get_frame_new_buffer(cm), 0, 0); setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); @@ -1856,16 +1876,17 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) { // Note: this memset assumes above_context[0], [1] and [2] // are allocated as part of the same buffer. 
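The comment above ("this memset assumes above_context[0], [1] and [2] are allocated as part of the same buffer") is the whole contract: one allocation backs all three plane rows, so above_context[0] doubles as the base pointer for the memset that follows and, on the decoder side of this change, for the single vpx_free(pbi->above_context[0]) in vp9_remove_decompressor. A sketch of that layout, with plain char standing in for ENTROPY_CONTEXT:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_MB_PLANE 3

/* char stands in for libvpx's ENTROPY_CONTEXT (a char-sized type). */
static char *above_context[MAX_MB_PLANE];

static int alloc_above_context(int aligned_mi_cols) {
  /* Two 4x4 context slots per 8x8 mode-info column, matching the
   * memset size used above. */
  const size_t per_plane = 2 * (size_t)aligned_mi_cols;
  char *const base = malloc(per_plane * MAX_MB_PLANE);
  int i;
  if (base == NULL) return -1;
  for (i = 0; i < MAX_MB_PLANE; ++i)
    above_context[i] = base + i * per_plane;
  /* One memset over plane 0 clears all three planes at once. */
  memset(above_context[0], 0, per_plane * MAX_MB_PLANE);
  return 0;
}

int main(void) {
  if (alloc_above_context(64)) return 1;
  printf("plane spacing: %ld\n",
         (long)(above_context[1] - above_context[0]));  /* prints 128 */
  free(above_context[0]);  /* one free releases every plane */
  return 0;
}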
- vpx_memset(cm->above_context[0], 0, - sizeof(ENTROPY_CONTEXT) * 2 * MAX_MB_PLANE * aligned_mi_cols); - vpx_memset(cm->above_seg_context, 0, - sizeof(PARTITION_CONTEXT) * aligned_mi_cols); + vpx_memset(cpi->above_context[0], 0, + sizeof(*cpi->above_context[0]) * + 2 * aligned_mi_cols * MAX_MB_PLANE); + vpx_memset(cpi->above_seg_context, 0, + sizeof(*cpi->above_seg_context) * aligned_mi_cols); } static void switch_lossless_mode(VP9_COMP *cpi, int lossless) { if (lossless) { // printf("Switching to lossless\n"); - cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4; + cpi->mb.fwd_txm4x4 = vp9_fwht4x4; cpi->mb.e_mbd.itxm_add = vp9_iwht4x4_add; cpi->mb.optimize = 0; cpi->common.lf.filter_level = 0; @@ -1873,7 +1894,7 @@ static void switch_lossless_mode(VP9_COMP *cpi, int lossless) { cpi->common.tx_mode = ONLY_4X4; } else { // printf("Not lossless\n"); - cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4; + cpi->mb.fwd_txm4x4 = vp9_fdct4x4; cpi->mb.e_mbd.itxm_add = vp9_idct4x4_add; } } @@ -1907,9 +1928,6 @@ static void encode_frame_internal(VP9_COMP *cpi) { totalrate = 0; - // Reset frame count of inter 0,0 motion vector usage. - cpi->inter_zz_count = 0; - vp9_zero(cm->counts.switchable_interp); vp9_zero(cpi->tx_stepdown_count); @@ -1963,16 +1981,15 @@ static void encode_frame_internal(VP9_COMP *cpi) { const int tile_rows = 1 << cm->log2_tile_rows; for (tile_row = 0; tile_row < tile_rows; tile_row++) { - vp9_get_tile_row_offsets(cm, tile_row); - for (tile_col = 0; tile_col < tile_cols; tile_col++) { + TileInfo tile; TOKENEXTRA *tp_old = tp; // For each row of SBs in the frame - vp9_get_tile_col_offsets(cm, tile_col); - for (mi_row = cm->cur_tile_mi_row_start; - mi_row < cm->cur_tile_mi_row_end; mi_row += 8) - encode_sb_row(cpi, mi_row, &tp, &totalrate); + vp9_tile_init(&tile, cm, tile_row, tile_col); + for (mi_row = tile.mi_row_start; + mi_row < tile.mi_row_end; mi_row += 8) + encode_sb_row(cpi, &tile, mi_row, &tp, &totalrate); cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old); assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols)); @@ -2188,7 +2205,7 @@ void vp9_encode_frame(VP9_COMP *cpi) { if (cpi->sf.RD) { int i, pred_type; - INTERPOLATIONFILTERTYPE filter_type; + INTERPOLATION_TYPE filter_type; /* * This code does a single RD pass over the whole frame assuming * either compound, single or hybrid prediction as per whatever has @@ -2256,7 +2273,7 @@ void vp9_encode_frame(VP9_COMP *cpi) { cpi->rd_prediction_type_threshes[frame_type][i] >>= 1; } - for (i = 0; i <= SWITCHABLE_FILTERS; i++) { + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { const int64_t diff = cpi->rd_filter_diff[i] / cpi->common.MBs; cpi->rd_filter_threshes[frame_type][i] = (cpi->rd_filter_threshes[frame_type][i] + diff) / 2; @@ -2470,7 +2487,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, (mbmi->skip_coeff || vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)))) { const uint8_t context = vp9_get_pred_context_tx_size(xd); - update_tx_counts(bsize, context, mbmi->tx_size, &cm->counts.tx); + ++get_tx_counts(bsize, context, &cm->counts.tx)[mbmi->tx_size]; } else { int x, y; TX_SIZE sz = tx_mode_to_biggest_tx_size[cm->tx_mode]; diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 3358fbbe9..e52e8ec1e 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -19,6 +19,7 @@ #include "vp9/common/vp9_reconintra.h" #include "vp9/common/vp9_systemdependent.h" +#include "vp9/encoder/vp9_dct.h" #include "vp9/encoder/vp9_encodemb.h" #include 
"vp9/encoder/vp9_quantize.h" #include "vp9/encoder/vp9_rdopt.h" @@ -365,9 +366,9 @@ void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize, yoff = 32 * (block >> twl); src_diff = p->src_diff + 4 * bw * yoff + xoff; if (x->use_lp32x32fdct) - vp9_short_fdct32x32_rd(src_diff, coeff, bw * 4); + vp9_fdct32x32_rd(src_diff, coeff, bw * 4); else - vp9_short_fdct32x32(src_diff, coeff, bw * 4); + vp9_fdct32x32(src_diff, coeff, bw * 4); vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan, iscan); @@ -379,7 +380,7 @@ void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize, xoff = 16 * (block & twmask); yoff = 16 * (block >> twl); src_diff = p->src_diff + 4 * bw * yoff + xoff; - vp9_short_fdct16x16(src_diff, coeff, bw * 4); + vp9_fdct16x16(src_diff, coeff, bw * 4); vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan, iscan); @@ -391,7 +392,7 @@ void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize, xoff = 8 * (block & twmask); yoff = 8 * (block >> twl); src_diff = p->src_diff + 4 * bw * yoff + xoff; - vp9_short_fdct8x8(src_diff, coeff, bw * 4); + vp9_fdct8x8(src_diff, coeff, bw * 4); vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan, iscan); @@ -417,6 +418,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, struct encode_b_args *const args = arg; MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; + struct optimize_ctx *const ctx = args->ctx; struct macroblockd_plane *const pd = &xd->plane[plane]; const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size, block); @@ -428,14 +430,18 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, // TODO(jingning): per transformed block zero forcing only enabled for // luma component. will integrate chroma components as well. 
if (x->zcoeff_blk[tx_size][block] && plane == 0) { + int x, y; pd->eobs[block] = 0; + txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y); + ctx->ta[plane][x] = 0; + ctx->tl[plane][y] = 0; return; } vp9_xform_quant(plane, block, plane_bsize, tx_size, arg); if (x->optimize) - vp9_optimize_b(plane, block, plane_bsize, tx_size, x, args->ctx); + vp9_optimize_b(plane, block, plane_bsize, tx_size, x, ctx); if (x->skip_encode || pd->eobs[block] == 0) return; @@ -461,6 +467,27 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, } } +static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + struct encode_b_args *const args = arg; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size, + block); + + int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint8_t *const dst = raster_block_offset_uint8(plane_bsize, raster_block, + pd->dst.buf, pd->dst.stride); + + vp9_xform_quant(plane, block, plane_bsize, tx_size, arg); + + if (pd->eobs[block] == 0) + return; + + xd->itxm_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]); +} + void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize) { MACROBLOCKD *const xd = &x->e_mbd; struct optimize_ctx ctx; @@ -470,7 +497,7 @@ void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize) { if (x->optimize) optimize_init_b(0, bsize, &arg); - foreach_transformed_block_in_plane(xd, bsize, 0, encode_block, &arg); + foreach_transformed_block_in_plane(xd, bsize, 0, encode_block_pass1, &arg); } void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) { @@ -532,9 +559,9 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, vp9_subtract_block(32, 32, src_diff, bw * 4, src, p->src.stride, dst, pd->dst.stride); if (x->use_lp32x32fdct) - vp9_short_fdct32x32_rd(src_diff, coeff, bw * 4); + vp9_fdct32x32_rd(src_diff, coeff, bw * 4); else - vp9_short_fdct32x32(src_diff, coeff, bw * 4); + vp9_fdct32x32(src_diff, coeff, bw * 4); vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan, iscan); @@ -556,10 +583,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, dst, pd->dst.stride, dst, pd->dst.stride); vp9_subtract_block(16, 16, src_diff, bw * 4, src, p->src.stride, dst, pd->dst.stride); - if (tx_type != DCT_DCT) - vp9_short_fht16x16(src_diff, coeff, bw * 4, tx_type); - else - vp9_short_fdct16x16(src_diff, coeff, bw * 4); + vp9_fht16x16(tx_type, src_diff, coeff, bw * 4); vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan, iscan); @@ -581,10 +605,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, dst, pd->dst.stride, dst, pd->dst.stride); vp9_subtract_block(8, 8, src_diff, bw * 4, src, p->src.stride, dst, pd->dst.stride); - if (tx_type != DCT_DCT) - vp9_short_fht8x8(src_diff, coeff, bw * 4, tx_type); - else - vp9_short_fdct8x8(src_diff, coeff, bw * 4); + vp9_fht8x8(tx_type, src_diff, coeff, bw * 4); vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan, iscan); diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c index 9ebcc4983..e2c6c4c0c 100644 --- a/vp9/encoder/vp9_encodemv.c +++ 
b/vp9/encoder/vp9_encodemv.c @@ -124,8 +124,9 @@ static void build_nmv_component_cost_table(int *mvcost, } } -static int update_mv(vp9_writer *w, const unsigned int ct[2], - vp9_prob *cur_p, vp9_prob new_p, vp9_prob upd_p) { +static int update_mv(vp9_writer *w, const unsigned int ct[2], vp9_prob *cur_p, + vp9_prob upd_p) { + const vp9_prob new_p = get_binary_prob(ct[0], ct[1]); vp9_prob mod_p = new_p | 1; const int cur_b = cost_branch256(ct, *cur_p); const int mod_b = cost_branch256(ct, mod_p); @@ -143,7 +144,6 @@ static int update_mv(vp9_writer *w, const unsigned int ct[2], static void counts_to_nmv_context( nmv_context_counts *nmv_count, - nmv_context *prob, int usehp, unsigned int (*branch_ct_joint)[2], unsigned int (*branch_ct_sign)[2], @@ -156,29 +156,24 @@ static void counts_to_nmv_context( unsigned int (*branch_ct_hp)[2]) { int i, j, k; vp9_tree_probs_from_distribution(vp9_mv_joint_tree, - prob->joints, branch_ct_joint, nmv_count->joints, 0); for (i = 0; i < 2; ++i) { const uint32_t s0 = nmv_count->comps[i].sign[0]; const uint32_t s1 = nmv_count->comps[i].sign[1]; - prob->comps[i].sign = get_binary_prob(s0, s1); branch_ct_sign[i][0] = s0; branch_ct_sign[i][1] = s1; vp9_tree_probs_from_distribution(vp9_mv_class_tree, - prob->comps[i].classes, - branch_ct_classes[i], - nmv_count->comps[i].classes, 0); + branch_ct_classes[i], + nmv_count->comps[i].classes, 0); vp9_tree_probs_from_distribution(vp9_mv_class0_tree, - prob->comps[i].class0, branch_ct_class0[i], nmv_count->comps[i].class0, 0); for (j = 0; j < MV_OFFSET_BITS; ++j) { const uint32_t b0 = nmv_count->comps[i].bits[j][0]; const uint32_t b1 = nmv_count->comps[i].bits[j][1]; - prob->comps[i].bits[j] = get_binary_prob(b0, b1); branch_ct_bits[i][j][0] = b0; branch_ct_bits[i][j][1] = b1; } @@ -186,12 +181,10 @@ static void counts_to_nmv_context( for (i = 0; i < 2; ++i) { for (k = 0; k < CLASS0_SIZE; ++k) { vp9_tree_probs_from_distribution(vp9_mv_fp_tree, - prob->comps[i].class0_fp[k], branch_ct_class0_fp[i][k], nmv_count->comps[i].class0_fp[k], 0); } vp9_tree_probs_from_distribution(vp9_mv_fp_tree, - prob->comps[i].fp, branch_ct_fp[i], nmv_count->comps[i].fp, 0); } @@ -202,11 +195,9 @@ static void counts_to_nmv_context( const uint32_t hp0 = nmv_count->comps[i].hp[0]; const uint32_t hp1 = nmv_count->comps[i].hp[1]; - prob->comps[i].class0_hp = get_binary_prob(c0_hp0, c0_hp1); branch_ct_class0_hp[i][0] = c0_hp0; branch_ct_class0_hp[i][1] = c0_hp1; - prob->comps[i].hp = get_binary_prob(hp0, hp1); branch_ct_hp[i][0] = hp0; branch_ct_hp[i][1] = hp1; } @@ -215,7 +206,6 @@ static void counts_to_nmv_context( void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) { int i, j; - nmv_context prob; unsigned int branch_ct_joint[MV_JOINTS - 1][2]; unsigned int branch_ct_sign[2][2]; unsigned int branch_ct_classes[2][MV_CLASSES - 1][2]; @@ -227,30 +217,28 @@ void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) { unsigned int branch_ct_hp[2][2]; nmv_context *mvc = &cpi->common.fc.nmvc; - counts_to_nmv_context(&cpi->NMVcount, &prob, usehp, + counts_to_nmv_context(&cpi->NMVcount, usehp, branch_ct_joint, branch_ct_sign, branch_ct_classes, branch_ct_class0, branch_ct_bits, branch_ct_class0_fp, branch_ct_fp, branch_ct_class0_hp, branch_ct_hp); for (j = 0; j < MV_JOINTS - 1; ++j) - update_mv(bc, branch_ct_joint[j], &mvc->joints[j], prob.joints[j], - NMV_UPDATE_PROB); + update_mv(bc, branch_ct_joint[j], &mvc->joints[j], NMV_UPDATE_PROB); for (i = 0; i < 2; ++i) { - update_mv(bc, branch_ct_sign[i], 
&mvc->comps[i].sign, - prob.comps[i].sign, NMV_UPDATE_PROB); + update_mv(bc, branch_ct_sign[i], &mvc->comps[i].sign, NMV_UPDATE_PROB); for (j = 0; j < MV_CLASSES - 1; ++j) update_mv(bc, branch_ct_classes[i][j], &mvc->comps[i].classes[j], - prob.comps[i].classes[j], NMV_UPDATE_PROB); + NMV_UPDATE_PROB); for (j = 0; j < CLASS0_SIZE - 1; ++j) update_mv(bc, branch_ct_class0[i][j], &mvc->comps[i].class0[j], - prob.comps[i].class0[j], NMV_UPDATE_PROB); + NMV_UPDATE_PROB); for (j = 0; j < MV_OFFSET_BITS; ++j) update_mv(bc, branch_ct_bits[i][j], &mvc->comps[i].bits[j], - prob.comps[i].bits[j], NMV_UPDATE_PROB); + NMV_UPDATE_PROB); } for (i = 0; i < 2; ++i) { @@ -258,21 +246,19 @@ void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) { int k; for (k = 0; k < 3; ++k) update_mv(bc, branch_ct_class0_fp[i][j][k], - &mvc->comps[i].class0_fp[j][k], - prob.comps[i].class0_fp[j][k], NMV_UPDATE_PROB); + &mvc->comps[i].class0_fp[j][k], NMV_UPDATE_PROB); } for (j = 0; j < 3; ++j) - update_mv(bc, branch_ct_fp[i][j], &mvc->comps[i].fp[j], - prob.comps[i].fp[j], NMV_UPDATE_PROB); + update_mv(bc, branch_ct_fp[i][j], &mvc->comps[i].fp[j], NMV_UPDATE_PROB); } if (usehp) { for (i = 0; i < 2; ++i) { update_mv(bc, branch_ct_class0_hp[i], &mvc->comps[i].class0_hp, - prob.comps[i].class0_hp, NMV_UPDATE_PROB); + NMV_UPDATE_PROB); update_mv(bc, branch_ct_hp[i], &mvc->comps[i].hp, - prob.comps[i].hp, NMV_UPDATE_PROB); + NMV_UPDATE_PROB); } } } diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index caf41625c..6a3555d68 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -481,13 +481,14 @@ void vp9_first_pass(VP9_COMP *cpi) { MACROBLOCK *const x = &cpi->mb; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; + TileInfo tile; int recon_yoffset, recon_uvoffset; const int lst_yv12_idx = cm->ref_frame_map[cpi->lst_fb_idx]; const int gld_yv12_idx = cm->ref_frame_map[cpi->gld_fb_idx]; YV12_BUFFER_CONFIG *const lst_yv12 = &cm->yv12_fb[lst_yv12_idx]; - YV12_BUFFER_CONFIG *const new_yv12 = &cm->yv12_fb[cm->new_fb_idx]; YV12_BUFFER_CONFIG *const gld_yv12 = &cm->yv12_fb[gld_yv12_idx]; + YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm); const int recon_y_stride = lst_yv12->y_stride; const int recon_uv_stride = lst_yv12->uv_stride; int64_t intra_error = 0; @@ -532,6 +533,9 @@ void vp9_first_pass(VP9_COMP *cpi) { vp9_initialize_rd_consts(cpi); } + // tiling is ignored in the first pass + vp9_tile_init(&tile, cm, 0, 0); + // for each macroblock row in image for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { int_mv best_ref_mv; @@ -578,11 +582,12 @@ void vp9_first_pass(VP9_COMP *cpi) { } } xd->mi_8x8[0]->mbmi.ref_frame[0] = INTRA_FRAME; - set_mi_row_col(cm, xd, + set_mi_row_col(xd, &tile, mb_row << 1, - 1 << mi_height_log2(xd->mi_8x8[0]->mbmi.sb_type), + num_8x8_blocks_high_lookup[xd->mi_8x8[0]->mbmi.sb_type], mb_col << 1, - 1 << mi_width_log2(xd->mi_8x8[0]->mbmi.sb_type)); + num_8x8_blocks_wide_lookup[xd->mi_8x8[0]->mbmi.sb_type], + cm->mi_rows, cm->mi_cols); if (cpi->sf.variance_adaptive_quantization) { int energy = vp9_block_energy(cpi, x, xd->mi_8x8[0]->mbmi.sb_type); @@ -2164,17 +2169,14 @@ void vp9_second_pass(VP9_COMP *cpi) { cpi->ni_av_qi = tmp_q; cpi->avg_q = vp9_convert_qindex_to_q(tmp_q); -#ifndef ONE_SHOT_Q_ESTIMATE // Limit the maxq value returned subsequently. 
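Returning to the vp9_encodemv.c rework above: update_mv() no longer receives a precomputed probability, it derives new_p from the branch counts on the spot, which is what let counts_to_nmv_context() shed its nmv_context output parameter. The hunk shows the setup but cuts off before the decision; a hedged reconstruction of the complete function, with the 7 * 256 term pricing the 7-bit literal of a signalled update in the usual 1/256-bit cost units:

/* Reconstruction: the signature and first four statements are verbatim
 * from the hunk above, the branch logic is inferred. */
static int update_mv(vp9_writer *w, const unsigned int ct[2], vp9_prob *cur_p,
                     vp9_prob upd_p) {
  const vp9_prob new_p = get_binary_prob(ct[0], ct[1]);  /* counts -> prob */
  vp9_prob mod_p = new_p | 1;        /* decoder reconstructs odd probs only */
  const int cur_b = cost_branch256(ct, *cur_p);  /* bits if old prob kept */
  const int mod_b = cost_branch256(ct, mod_p);   /* bits with the new prob */
  const int cost = 7 * 256 + vp9_cost_one(upd_p) - vp9_cost_zero(upd_p);
  if (cur_b - mod_b > cost) {        /* update only if it pays for itself */
    *cur_p = mod_p;
    vp9_write(w, 1, upd_p);
    vp9_write_literal(w, mod_p >> 1, 7);  /* low bit implicit, hence 7 bits */
    return 1;
  }
  vp9_write(w, 0, upd_p);
  return 0;
}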
// This increases the risk of overspend or underspend if the initial // estimate for the clip is bad, but helps prevent excessive // variation in Q, especially near the end of a clip // where for example a small overspend may cause Q to crash adjust_maxq_qrange(cpi); -#endif } -#ifndef ONE_SHOT_Q_ESTIMATE // The last few frames of a clip almost always have to few or too many // bits and for the sake of over exact rate control we dont want to make // radical adjustments to the allowed quantizer range just to use up a @@ -2197,7 +2199,6 @@ void vp9_second_pass(VP9_COMP *cpi) { cpi->active_worst_quality = adjust_active_maxq(cpi->active_worst_quality, tmp_q); } -#endif } vp9_zero(this_frame); if (EOF == input_stats(cpi, &this_frame)) diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c index 644363158..7b605b212 100644 --- a/vp9/encoder/vp9_mbgraph.c +++ b/vp9/encoder/vp9_mbgraph.c @@ -194,8 +194,8 @@ static void update_mbgraph_mb_stats x->plane[0].src.buf = buf->y_buffer + mb_y_offset; x->plane[0].src.stride = buf->y_stride; - xd->plane[0].dst.buf = cm->yv12_fb[cm->new_fb_idx].y_buffer + mb_y_offset; - xd->plane[0].dst.stride = cm->yv12_fb[cm->new_fb_idx].y_stride; + xd->plane[0].dst.buf = get_frame_new_buffer(cm)->y_buffer + mb_y_offset; + xd->plane[0].dst.stride = get_frame_new_buffer(cm)->y_stride; // do intra 16x16 prediction intra_error = find_best_16x16_intra(cpi, mb_y_offset, diff --git a/vp9/encoder/vp9_modecosts.c b/vp9/encoder/vp9_modecosts.c index b867d8b71..7eb659232 100644 --- a/vp9/encoder/vp9_modecosts.c +++ b/vp9/encoder/vp9_modecosts.c @@ -36,7 +36,7 @@ void vp9_init_mode_costs(VP9_COMP *c) { vp9_kf_uv_mode_prob[INTRA_MODES - 1], vp9_intra_mode_tree); - for (i = 0; i <= SWITCHABLE_FILTERS; ++i) + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i], cm->fc.switchable_interp_prob[i], vp9_switchable_interp_tree); diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c index accc338fb..f922f900a 100644 --- a/vp9/encoder/vp9_onyx_if.c +++ b/vp9/encoder/vp9_onyx_if.c @@ -312,6 +312,12 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { cpi->mb_activity_map = 0; vpx_free(cpi->mb_norm_activity_map); cpi->mb_norm_activity_map = 0; + + vpx_free(cpi->above_context[0]); + cpi->above_context[0] = NULL; + + vpx_free(cpi->above_seg_context); + cpi->above_seg_context = NULL; } // Computes a q delta (in "q index" terms) to get from a starting q value @@ -959,9 +965,9 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->optimize_coefficients = 0; } - cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4; + cpi->mb.fwd_txm4x4 = vp9_fdct4x4; if (cpi->oxcf.lossless || cpi->mb.e_mbd.lossless) { - cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4; + cpi->mb.fwd_txm4x4 = vp9_fwht4x4; } if (cpi->sf.subpel_search_method == SUBPEL_ITERATIVE) { @@ -1026,11 +1032,6 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) { CHECK_MEM_ERROR(cm, cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok))); } - // Data used for real time vc mode to see if gf needs refreshing - cpi->inter_zz_count = 0; - cpi->gf_bad_count = 0; - cpi->gf_update_recommended = 0; - vpx_free(cpi->mb_activity_map); CHECK_MEM_ERROR(cm, cpi->mb_activity_map, vpx_calloc(sizeof(unsigned int), @@ -1040,6 +1041,19 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) { CHECK_MEM_ERROR(cm, cpi->mb_norm_activity_map, vpx_calloc(sizeof(unsigned int), cm->mb_rows * cm->mb_cols)); + + // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm + // block where mi unit size is 8x8. 
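One recurring substitution across these files deserves a note before the allocation that the comment above announces: vp9_mbgraph.c earlier in this section, and vp9_onyx_if.c below, replace every open-coded &cm->yv12_fb[cm->new_fb_idx] with get_frame_new_buffer(cm). The accessor's definition is outside this diff, but the one-for-one substitution only holds if it is the obvious inline, presumably in vp9_onyxc_int.h:

/* Presumed definition (hedged): every call site in this patch swaps it
 * in for &cm->yv12_fb[cm->new_fb_idx] with no other change. */
static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) {
  return &cm->yv12_fb[cm->new_fb_idx];
}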
+ vpx_free(cpi->above_context[0]); + CHECK_MEM_ERROR(cm, cpi->above_context[0], + vpx_calloc(2 * mi_cols_aligned_to_sb(cm->mi_cols) * + MAX_MB_PLANE, + sizeof(*cpi->above_context[0]))); + + vpx_free(cpi->above_seg_context); + CHECK_MEM_ERROR(cm, cpi->above_seg_context, + vpx_calloc(mi_cols_aligned_to_sb(cm->mi_cols), + sizeof(*cpi->above_seg_context))); } @@ -1072,6 +1086,15 @@ static void update_frame_size(VP9_COMP *cpi) { vp9_init_dsmotion_compensation(&cpi->mb, y_stride); } } + + { + int i; + for (i = 1; i < MAX_MB_PLANE; ++i) { + cpi->above_context[i] = cpi->above_context[0] + + i * sizeof(*cpi->above_context[0]) * 2 * + mi_cols_aligned_to_sb(cm->mi_cols); + } + } } @@ -1157,7 +1180,6 @@ static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { int i; cpi->oxcf = *oxcf; - cpi->goldfreq = 7; cm->version = oxcf->version; @@ -2669,8 +2691,7 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { vp9_clear_system_state(); // __asm emms; - recon_err = vp9_calc_ss_err(cpi->Source, - &cm->yv12_fb[cm->new_fb_idx]); + recon_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm)); if (cpi->twopass.total_left_stats.coded_error != 0.0) fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d" @@ -2829,19 +2850,11 @@ static int pick_q_and_adjust_q_bounds(VP9_COMP *cpi, if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) { cpi->active_best_quality = cpi->cq_target_quality; } else { -#ifdef ONE_SHOT_Q_ESTIMATE -#ifdef STRICT_ONE_SHOT_Q - cpi->active_best_quality = q; -#else - cpi->active_best_quality = inter_minq[q]; -#endif -#else cpi->active_best_quality = inter_minq[q]; // 1-pass: for now, use the average Q for the active_best, if its lower // than active_worst. - if (cpi->pass == 0 && (cpi->avg_frame_qindex < cpi->active_worst_quality)) + if (cpi->pass == 0 && (cpi->avg_frame_qindex < q)) cpi->active_best_quality = inter_minq[cpi->avg_frame_qindex]; -#endif // For the constrained quality mode we don't want // q to fall below the cq level. @@ -2875,7 +2888,14 @@ static int pick_q_and_adjust_q_bounds(VP9_COMP *cpi, if (cm->frame_type == KEY_FRAME && !cpi->this_key_frame_forced) { *top_index = (cpi->active_worst_quality + cpi->active_best_quality * 3) / 4; + // If this is the first (key) frame in 1-pass, active best is the user + // best-allowed, and leave the top_index to active_worst. + if (cpi->pass == 0 && cpi->common.current_video_frame == 0) { + cpi->active_best_quality = cpi->oxcf.best_allowed_q; + *top_index = cpi->oxcf.worst_allowed_q; + } } else if (!cpi->is_src_frame_alt_ref && + (cpi->oxcf.end_usage != USAGE_STREAM_FROM_SERVER) && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { *top_index = (cpi->active_worst_quality + cpi->active_best_quality) / 2; @@ -3169,8 +3189,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // Special case handling for forced key frames if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) { int last_q = q; - int kf_err = vp9_calc_ss_err(cpi->Source, - &cm->yv12_fb[cm->new_fb_idx]); + int kf_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm)); int high_err_target = cpi->ambient_err; int low_err_target = cpi->ambient_err >> 1; @@ -3306,14 +3325,13 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // fixed interval. 
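Taken together, the calloc above and the pointer fix-up in update_frame_size() define the new per-encoder context layout: one allocation holding MAX_MB_PLANE planes, each plane getting two ENTROPY_CONTEXT entries per mi column, one for each 4x4 transform column inside an 8x8 mi unit. ENTROPY_CONTEXT is a char in this tree, so the extra sizeof(*cpi->above_context[0]) factor in the update_frame_size arithmetic evaluates to 1 and is harmless; stripped of it, the layout reduces to this hedged restatement:

/* Equivalent of the two hunks above (CHECK_MEM_ERROR elided):
 * plane i begins 2 * aligned_mi_cols entries after plane i - 1. */
static void alloc_above_contexts(VP9_COMP *cpi, VP9_COMMON *cm) {
  const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
  int i;
  cpi->above_context[0] = vpx_calloc(2 * aligned_mi_cols * MAX_MB_PLANE,
                                     sizeof(*cpi->above_context[0]));
  for (i = 1; i < MAX_MB_PLANE; ++i)
    cpi->above_context[i] = cpi->above_context[0] + i * 2 * aligned_mi_cols;
}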
Note the reconstruction error if it is the frame before // the force key frame if (cpi->next_key_frame_forced && (cpi->twopass.frames_to_key == 0)) { - cpi->ambient_err = vp9_calc_ss_err(cpi->Source, - &cm->yv12_fb[cm->new_fb_idx]); + cpi->ambient_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm)); } if (cm->frame_type == KEY_FRAME) cpi->refresh_last_frame = 1; - cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx]; + cm->frame_to_show = get_frame_new_buffer(cm); #if WRITE_RECON_BUFFER if (cm->show_frame) @@ -3912,7 +3930,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, cm->frame_flags = *frame_flags; // Reset the frame pointers to the current frame size - vp9_realloc_frame_buffer(&cm->yv12_fb[cm->new_fb_idx], + vp9_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, VP9BORDERINPIXELS); diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h index b1dfcbb9c..9429c7fed 100644 --- a/vp9/encoder/vp9_onyx_int.h +++ b/vp9/encoder/vp9_onyx_int.h @@ -29,11 +29,6 @@ #include "vp9/common/vp9_findnearmv.h" #include "vp9/encoder/vp9_lookahead.h" -// Experimental rate control switches -#if CONFIG_ONESHOTQ -#define ONE_SHOT_Q_ESTIMATE 0 -#define STRICT_ONE_SHOT_Q 0 -#endif #define DISABLE_RC_LONG_TERM_MEM 0 // #define MODE_TEST_HIT_STATS @@ -396,9 +391,9 @@ typedef struct VP9_COMP { // FIXME(rbultje) can this overflow? int rd_tx_select_threshes[4][TX_MODES]; - int64_t rd_filter_diff[SWITCHABLE_FILTERS + 1]; - int64_t rd_filter_threshes[4][SWITCHABLE_FILTERS + 1]; - int64_t rd_filter_cache[SWITCHABLE_FILTERS + 1]; + int64_t rd_filter_diff[SWITCHABLE_FILTER_CONTEXTS]; + int64_t rd_filter_threshes[4][SWITCHABLE_FILTER_CONTEXTS]; + int64_t rd_filter_cache[SWITCHABLE_FILTER_CONTEXTS]; int RDMULT; int RDDIV; @@ -506,14 +501,9 @@ typedef struct VP9_COMP { int decimation_count; // for real time encoding - int avg_encode_time; // microsecond - int avg_pick_mode_time; // microsecond int speed; - unsigned int cpu_freq; // Mhz int compressor_speed; - int interquantizer; - int goldfreq; int auto_worst_q; int cpu_used; int pass; @@ -529,12 +519,6 @@ typedef struct VP9_COMP { unsigned int max_mv_magnitude; int mv_step_param; - // Data used for real time conferencing mode to help determine if it - // would be good to update the gf - int inter_zz_count; - int gf_bad_count; - int gf_update_recommended; - unsigned char *segmentation_map; // segment threashold for encode breakout @@ -641,7 +625,7 @@ typedef struct VP9_COMP { int dummy_packing; /* flag to indicate if packing is dummy */ - unsigned int switchable_interp_count[SWITCHABLE_FILTERS + 1] + unsigned int switchable_interp_count[SWITCHABLE_FILTER_CONTEXTS] [SWITCHABLE_FILTERS]; unsigned int tx_stepdown_count[TX_SIZES]; @@ -675,6 +659,13 @@ typedef struct VP9_COMP { // Debug / test stats int64_t mode_test_hits[BLOCK_SIZES]; #endif + + /* Y,U,V,(A) */ + ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; + ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16]; + + PARTITION_CONTEXT *above_seg_context; + PARTITION_CONTEXT left_seg_context[8]; } VP9_COMP; static int get_ref_frame_idx(VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) { diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index 7ad8d1fb2..fca752524 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -22,12 +22,14 @@ extern int enc_debug; #endif -void vp9_quantize_b_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, - 
int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, - int16_t *dqcoeff_ptr, int16_t *dequant_ptr, - int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { +void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t *dequant_ptr, + int zbin_oq_value, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { int i, rc, eob; int zbins[2], nzbins[2], zbin; int x, y, z, sz; @@ -86,14 +88,15 @@ void vp9_quantize_b_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, *eob_ptr = eob + 1; } -void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs, +void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - int16_t *zbin_ptr, int16_t *round_ptr, - int16_t *quant_ptr, int16_t *quant_shift_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, - int16_t *dequant_ptr, int zbin_oq_value, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const int16_t *dequant_ptr, + int zbin_oq_value, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { int i, rc, eob; int zbins[2], nzbins[2]; int x, y, z, sz; @@ -174,25 +177,19 @@ static INLINE struct plane_block_idx plane_block_idx(int y_blocks, return res; } -void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type, - int y_blocks) { - MACROBLOCKD *const xd = &mb->e_mbd; +void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int y_blocks, int b_idx, + const int16_t *scan, const int16_t *iscan) { + MACROBLOCKD *const xd = &x->e_mbd; const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx); - const int16_t *scan = get_scan_4x4(tx_type); - const int16_t *iscan = get_iscan_4x4(tx_type); - - vp9_quantize_b(BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block), - 16, mb->skip_block, - mb->plane[pb_idx.plane].zbin, - mb->plane[pb_idx.plane].round, - mb->plane[pb_idx.plane].quant, - mb->plane[pb_idx.plane].quant_shift, - BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block), - BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block), - xd->plane[pb_idx.plane].dequant, - mb->plane[pb_idx.plane].zbin_extra, - &xd->plane[pb_idx.plane].eobs[pb_idx.block], - scan, iscan); + struct macroblock_plane* p = &x->plane[pb_idx.plane]; + struct macroblockd_plane* pd = &xd->plane[pb_idx.plane]; + + vp9_quantize_b(BLOCK_OFFSET(p->coeff, pb_idx.block), + 16, x->skip_block, + p->zbin, p->round, p->quant, p->quant_shift, + BLOCK_OFFSET(pd->qcoeff, pb_idx.block), + BLOCK_OFFSET(pd->dqcoeff, pb_idx.block), + pd->dequant, p->zbin_extra, &pd->eobs[pb_idx.block], scan, iscan); } static void invert_quant(int16_t *quant, int16_t *shift, int d) { diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h index 459aa3359..c078e1d41 100644 --- a/vp9/encoder/vp9_quantize.h +++ b/vp9/encoder/vp9_quantize.h @@ -13,8 +13,9 @@ #include "vp9/encoder/vp9_block.h" -void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type, - int y_blocks); +void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int y_blocks, int b_idx, + const int16_t *scan, const int16_t *iscan); + struct VP9_COMP; void vp9_set_quantizer(struct VP9_COMP *cpi, int q); diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index f166b10a1..993919e5b 100644 --- 
a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -251,8 +251,7 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) { fill_token_costs(cpi->mb.token_costs, cm->fc.coef_probs); for (i = 0; i < PARTITION_CONTEXTS; i++) - vp9_cost_tokens(cpi->mb.partition_cost[i], - cm->fc.partition_prob[cm->frame_type][i], + vp9_cost_tokens(cpi->mb.partition_cost[i], get_partition_probs(cm, i), vp9_partition_tree); /*rough estimate for costing*/ @@ -611,7 +610,7 @@ static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, // TODO(jingning): temporarily enabled only for luma component rd = MIN(rd1, rd2); if (plane == 0) - x->zcoeff_blk[tx_size][block] = rd1 > rd2; + x->zcoeff_blk[tx_size][block] = rd1 > rd2 || !xd->plane[plane].eobs[block]; args->this_rate += args->rate; args->this_dist += args->dist; @@ -933,14 +932,15 @@ static void super_block_yrd(VP9_COMP *cpi, MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; struct rdcost_block_args *rdcost_stack = &cpi->rdcost_stack; + const int b_inter_mode = is_inter_block(mbmi); assert(bs == mbmi->sb_type); - if (mbmi->ref_frame[0] > INTRA_FRAME) + if (b_inter_mode) vp9_subtract_sby(x, bs); if (cpi->sf.tx_size_search_method == USE_LARGESTALL || (cpi->sf.tx_size_search_method != USE_FULL_RD && - mbmi->ref_frame[0] == INTRA_FRAME)) { + !b_inter_mode)) { vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t)); choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse, ref_best_rd, bs); @@ -950,7 +950,7 @@ static void super_block_yrd(VP9_COMP *cpi, } if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER && - mbmi->ref_frame[0] > INTRA_FRAME) { + b_inter_mode) { if (bs >= BLOCK_32X32) model_rd_for_sb_y_tx(cpi, bs, TX_32X32, x, xd, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]); @@ -1031,10 +1031,10 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, ENTROPY_CONTEXT ta[2], tempa[2]; ENTROPY_CONTEXT tl[2], templ[2]; - TX_TYPE tx_type = DCT_DCT; + const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; - int idx, idy, block; + int idx, idy; uint8_t best_dst[8 * 8]; assert(ib < 4); @@ -1070,8 +1070,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, const int16_t *nb; uint8_t *src = src_init + idx * 4 + idy * 4 * src_stride; uint8_t *dst = dst_init + idx * 4 + idy * 4 * dst_stride; - - block = ib + idy * 2 + idx; + const int block = ib + idy * 2 + idx; + TX_TYPE tx_type; xd->mi_8x8[0]->bmi[block].as_mode = mode; src_diff = raster_block_offset_int16(BLOCK_8X8, block, p->src_diff); coeff = BLOCK_OFFSET(x->plane[0].coeff, block); @@ -1085,13 +1085,15 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, dst, dst_stride); tx_type = get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block); + get_scan_nb_4x4(tx_type, &scan, &nb); + if (tx_type != DCT_DCT) vp9_short_fht4x4(src_diff, coeff, 8, tx_type); else x->fwd_txm4x4(src_diff, coeff, 8); - vp9_regular_quantize_b_4x4(x, block, tx_type, 16); - get_scan_nb_4x4(tx_type, &scan, &nb); + vp9_regular_quantize_b_4x4(x, 4, block, scan, get_iscan_4x4(tx_type)); + ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4, scan, nb); distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block), @@ -1431,10 +1433,6 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES], int *rate_mv); -static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, - 
BLOCK_SIZE bsize, - int mi_row, int mi_col, - int_mv *tmp_mv, int *rate_mv); static int labels2mode(MACROBLOCK *x, int i, MB_PREDICTION_MODE this_mode, @@ -1561,7 +1559,8 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, coeff = BLOCK_OFFSET(p->coeff, k); x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff), coeff, 8); - vp9_regular_quantize_b_4x4(x, k, DCT_DCT, 16); + vp9_regular_quantize_b_4x4(x, 4, k, get_scan_4x4(DCT_DCT), + get_iscan_4x4(DCT_DCT)); thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz); thissse += ssz; @@ -1645,6 +1644,7 @@ static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src, } static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, + const TileInfo *const tile, BEST_SEG_INFO *bsi_buf, int filter_idx, int_mv seg_mvs[4][MAX_REF_FRAMES], int mi_row, int mi_col) { @@ -1653,6 +1653,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, MB_PREDICTION_MODE this_mode; MODE_INFO *mi = x->e_mbd.mi_8x8[0]; MB_MODE_INFO *const mbmi = &mi->mbmi; + struct macroblockd_plane *const pd = &x->e_mbd.plane[0]; const int label_count = 4; int64_t this_segment_rd = 0; int label_mv_thresh; @@ -1667,8 +1668,8 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, int subpelmv = 1, have_ref = 0; const int has_second_rf = has_second_ref(mbmi); - vpx_memcpy(t_above, x->e_mbd.plane[0].above_context, sizeof(t_above)); - vpx_memcpy(t_left, x->e_mbd.plane[0].left_context, sizeof(t_left)); + vpx_memcpy(t_above, pd->above_context, sizeof(t_above)); + vpx_memcpy(t_left, pd->left_context, sizeof(t_left)); v_fn_ptr = &cpi->fn_ptr[bsize]; @@ -1690,13 +1691,13 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, i = idy * 2 + idx; frame_mv[ZEROMV][mbmi->ref_frame[0]].as_int = 0; - vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd, + vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd, tile, &frame_mv[NEARESTMV][mbmi->ref_frame[0]], &frame_mv[NEARMV][mbmi->ref_frame[0]], i, 0, mi_row, mi_col); if (has_second_rf) { frame_mv[ZEROMV][mbmi->ref_frame[1]].as_int = 0; - vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd, + vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd, tile, &frame_mv[NEARESTMV][mbmi->ref_frame[1]], &frame_mv[NEARMV][mbmi->ref_frame[1]], i, 1, mi_row, mi_col); @@ -1746,7 +1747,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, } } - vpx_memcpy(orig_pre, x->e_mbd.plane[0].pre, sizeof(orig_pre)); + vpx_memcpy(orig_pre, pd->pre, sizeof(orig_pre)); vpx_memcpy(bsi->rdstat[i][mode_idx].ta, t_above, sizeof(bsi->rdstat[i][mode_idx].ta)); vpx_memcpy(bsi->rdstat[i][mode_idx].tl, t_left, @@ -1870,12 +1871,14 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, mi_buf_restore(x, orig_src, orig_pre); } - if (has_second_rf && this_mode == NEWMV && - mbmi->interp_filter == EIGHTTAP) { + if (has_second_rf) { if (seg_mvs[i][mbmi->ref_frame[1]].as_int == INVALID_MV || seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) continue; + } + if (has_second_rf && this_mode == NEWMV && + mbmi->interp_filter == EIGHTTAP) { // adjust src pointers mi_buf_shift(x, i); if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { @@ -1950,6 +1953,13 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) { vpx_memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx], sizeof(SEG_RDSTAT)); + if (num_4x4_blocks_wide > 1) + bsi->rdstat[i + 1][mode_idx].eobs = + ref_bsi->rdstat[i + 
1][mode_idx].eobs; + if (num_4x4_blocks_high > 1) + bsi->rdstat[i + 2][mode_idx].eobs = + ref_bsi->rdstat[i + 2][mode_idx].eobs; + if (bsi->rdstat[i][mode_idx].brdcost < best_rd) { mode_selected = this_mode; best_rd = bsi->rdstat[i][mode_idx].brdcost; @@ -1970,7 +1980,11 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, bsi->rdstat[i][mode_idx].brdcost += RDCOST(x->rdmult, x->rddiv, bsi->rdstat[i][mode_idx].brate, 0); bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate; - bsi->rdstat[i][mode_idx].eobs = x->e_mbd.plane[0].eobs[i]; + bsi->rdstat[i][mode_idx].eobs = pd->eobs[i]; + if (num_4x4_blocks_wide > 1) + bsi->rdstat[i + 1][mode_idx].eobs = pd->eobs[i + 1]; + if (num_4x4_blocks_high > 1) + bsi->rdstat[i + 2][mode_idx].eobs = pd->eobs[i + 2]; } if (bsi->rdstat[i][mode_idx].brdcost < best_rd) { @@ -2026,6 +2040,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, } static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, + const TileInfo *const tile, int_mv *best_ref_mv, int_mv *second_best_ref_mv, int64_t best_rd, @@ -2056,7 +2071,8 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, for (i = 0; i < 4; i++) bsi->modes[i] = ZEROMV; - rd_check_segment_txsize(cpi, x, bsi_buf, filter_idx, seg_mvs, mi_row, mi_col); + rd_check_segment_txsize(cpi, x, tile, bsi_buf, filter_idx, seg_mvs, + mi_row, mi_col); if (bsi->segment_rd > best_rd) return INT64_MAX; @@ -2204,7 +2220,7 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int_mv *second_ref_mv, int64_t comp_pred_diff[NB_PREDICTION_TYPES], int64_t tx_size_diff[TX_MODES], - int64_t best_filter_diff[SWITCHABLE_FILTERS + 1]) { + int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]) { MACROBLOCKD *const xd = &x->e_mbd; // Take a snapshot of the coding context so it can be @@ -2222,7 +2238,7 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, vpx_memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff)); vpx_memcpy(ctx->best_filter_diff, best_filter_diff, - sizeof(*best_filter_diff) * (SWITCHABLE_FILTERS + 1)); + sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS); } static void setup_pred_block(const MACROBLOCKD *xd, @@ -2252,6 +2268,7 @@ static void setup_pred_block(const MACROBLOCKD *xd, } static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, + const TileInfo *const tile, int idx, MV_REFERENCE_FRAME frame_type, BLOCK_SIZE block_size, int mi_row, int mi_col, @@ -2267,12 +2284,8 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, // set up scaling factors scale[frame_type] = cpi->common.active_ref_scale[frame_type - 1]; - scale[frame_type].x_offset_q4 = - ROUND_POWER_OF_TWO(mi_col * MI_SIZE * scale[frame_type].x_scale_fp, - REF_SCALE_SHIFT) & 0xf; - scale[frame_type].y_offset_q4 = - ROUND_POWER_OF_TWO(mi_row * MI_SIZE * scale[frame_type].y_scale_fp, - REF_SCALE_SHIFT) & 0xf; + scale[frame_type].sfc->set_scaled_offsets(&scale[frame_type], + mi_row * MI_SIZE, mi_col * MI_SIZE); // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this // use the UV scaling factors. 
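The setup_buffer_inter() hunk above deletes the open-coded sub-pel offsets and calls the sfc->set_scaled_offsets hook instead, now passing pixel coordinates (mi_row * MI_SIZE, mi_col * MI_SIZE). For the substitution to preserve behaviour, the hook must reduce to the deleted arithmetic; restated as a function (a hedged equivalence, with x_scale_fp and y_scale_fp assumed to have moved into the shared scale_factors_common):

/* What the deleted lines computed: the 1/16-pel phase of the scaled
 * reference at this block position.  row and col are pixel coordinates. */
static void set_offsets_with_scaling(struct scale_factors *scale,
                                     int row, int col) {
  scale->x_offset_q4 = ROUND_POWER_OF_TWO(col * scale->sfc->x_scale_fp,
                                          REF_SCALE_SHIFT) & 0xf;
  scale->y_offset_q4 = ROUND_POWER_OF_TWO(row * scale->sfc->y_scale_fp,
                                          REF_SCALE_SHIFT) & 0xf;
}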
@@ -2280,7 +2293,7 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, &scale[frame_type], &scale[frame_type]); // Gets an initial list of candidate vectors from neighbours and orders them - vp9_find_mv_refs(&cpi->common, xd, xd->mi_8x8[0], + vp9_find_mv_refs(cm, xd, tile, xd->mi_8x8[0], xd->last_mi, frame_type, mbmi->ref_mvs[frame_type], mi_row, mi_col); @@ -2294,7 +2307,7 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, // Further refinement that is encode side only to test the top few candidates // in full and choose the best as the centre point for subsequent searches. // The current implementation doesn't support scaling. - if (!vp9_is_scaled(&scale[frame_type]) && block_size >= BLOCK_8X8) + if (!vp9_is_scaled(scale[frame_type].sfc) && block_size >= BLOCK_8X8) mv_pred(cpi, x, yv12_mb[frame_type][0].buf, yv12->y_stride, frame_type, block_size); } @@ -2317,6 +2330,7 @@ static INLINE int get_switchable_rate(const MACROBLOCK *x) { } static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, + const TileInfo *const tile, BLOCK_SIZE bsize, int mi_row, int mi_col, int_mv *tmp_mv, int *rate_mv) { @@ -2501,9 +2515,9 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, setup_pre_planes(xd, 1, scaled_ref_frame[1], mi_row, mi_col, NULL); } - xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0], + xd->scale_factor[0].sfc->set_scaled_offsets(&xd->scale_factor[0], mi_row, mi_col); - xd->scale_factor[1].set_scaled_offsets(&xd->scale_factor[1], + xd->scale_factor[1].sfc->set_scaled_offsets(&xd->scale_factor[1], mi_row, mi_col); scaled_first_yv12 = xd->plane[0].pre[0]; @@ -2613,6 +2627,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, + const TileInfo *const tile, BLOCK_SIZE bsize, int64_t txfm_cache[], int *rate2, int64_t *distortion, @@ -2620,7 +2635,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate_y, int64_t *distortion_y, int *rate_uv, int64_t *distortion_uv, int *mode_excluded, int *disable_skip, - INTERPOLATIONFILTERTYPE *best_filter, + INTERPOLATION_TYPE *best_filter, int_mv (*mode_mv)[MAX_REF_FRAMES], int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES], @@ -2647,6 +2662,12 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int orig_dst_stride[MAX_MB_PLANE]; int rs = 0; + if (is_comp_pred) { + if (frame_mv[refs[0]].as_int == INVALID_MV || + frame_mv[refs[1]].as_int == INVALID_MV) + return INT64_MAX; + } + if (this_mode == NEWMV) { int rate_mv; if (is_comp_pred) { @@ -2665,13 +2686,11 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, &mbmi->ref_mvs[refs[1]][0].as_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); } - if (frame_mv[refs[0]].as_int == INVALID_MV || - frame_mv[refs[1]].as_int == INVALID_MV) - return INT64_MAX; *rate2 += rate_mv; } else { int_mv tmp_mv; - single_motion_search(cpi, x, bsize, mi_row, mi_col, &tmp_mv, &rate_mv); + single_motion_search(cpi, x, tile, bsize, mi_row, mi_col, + &tmp_mv, &rate_mv); *rate2 += rate_mv; frame_mv[refs[0]].as_int = xd->mi_8x8[0]->bmi[0].as_mv[0].as_int = tmp_mv.as_int; @@ -3082,6 +3101,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, + const TileInfo *const tile, int mi_row, int mi_col, int *returnrate, int64_t *returndistortion, @@ -3111,8 +3131,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int64_t best_tx_diff[TX_MODES]; int64_t 
best_pred_diff[NB_PREDICTION_TYPES]; int64_t best_pred_rd[NB_PREDICTION_TYPES]; - int64_t best_filter_rd[SWITCHABLE_FILTERS + 1]; - int64_t best_filter_diff[SWITCHABLE_FILTERS + 1]; + int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS]; + int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]; MB_MODE_INFO best_mbmode = { 0 }; int j; int mode_index, best_mode_index = 0; @@ -3122,7 +3142,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int64_t best_inter_rd = INT64_MAX; MB_PREDICTION_MODE best_intra_mode = DC_PRED; MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME; - INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE; + INTERPOLATION_TYPE tmp_best_filter = SWITCHABLE; int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES]; int64_t dist_uv[TX_SIZES]; int skip_uv[TX_SIZES]; @@ -3150,7 +3170,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, best_pred_rd[i] = INT64_MAX; for (i = 0; i < TX_MODES; i++) best_tx_rd[i] = INT64_MAX; - for (i = 0; i <= SWITCHABLE_FILTERS; i++) + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) best_filter_rd[i] = INT64_MAX; for (i = 0; i < TX_SIZES; i++) rate_uv_intra[i] = INT_MAX; @@ -3192,8 +3212,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { if (cpi->ref_frame_flags & flag_list[ref_frame]) { - setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame, block_size, - mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV], + setup_buffer_inter(cpi, x, tile, idx_list[ref_frame], ref_frame, + block_size, mi_row, mi_col, + frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb, scale_factor); } frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; @@ -3437,7 +3458,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } else { mbmi->mode = this_mode; compmode_cost = vp9_cost_bit(comp_mode_p, second_ref_frame > INTRA_FRAME); - this_rd = handle_inter_mode(cpi, x, bsize, + this_rd = handle_inter_mode(cpi, x, tile, bsize, tx_cache, &rate2, &distortion2, &skippable, &rate_y, &distortion_y, @@ -3520,17 +3541,16 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } // Keep record of best intra rd - if (xd->mi_8x8[0]->mbmi.ref_frame[0] == INTRA_FRAME && - is_intra_mode(xd->mi_8x8[0]->mbmi.mode) && + if (!is_inter_block(&xd->mi_8x8[0]->mbmi) && this_rd < best_intra_rd) { best_intra_rd = this_rd; best_intra_mode = xd->mi_8x8[0]->mbmi.mode; } + // Keep record of best inter rd with single reference - if (xd->mi_8x8[0]->mbmi.ref_frame[0] > INTRA_FRAME && - xd->mi_8x8[0]->mbmi.ref_frame[1] == NONE && - !mode_excluded && - this_rd < best_inter_rd) { + if (is_inter_block(&xd->mi_8x8[0]->mbmi) && + !has_second_ref(&xd->mi_8x8[0]->mbmi) && + !mode_excluded && this_rd < best_inter_rd) { best_inter_rd = this_rd; best_inter_ref_frame = ref_frame; } @@ -3538,7 +3558,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (!disable_skip && ref_frame == INTRA_FRAME) { for (i = 0; i < NB_PREDICTION_TYPES; ++i) best_pred_rd[i] = MIN(best_pred_rd[i], this_rd); - for (i = 0; i <= SWITCHABLE_FILTERS; i++) + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) best_filter_rd[i] = MIN(best_filter_rd[i], this_rd); } @@ -3621,7 +3641,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, cm->mcomp_filter_type != BILINEAR) { int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ? 
SWITCHABLE_FILTERS : cm->mcomp_filter_type]; - for (i = 0; i <= SWITCHABLE_FILTERS; i++) { + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { int64_t adj_rd; // In cases of poor prediction, filter_cache[] can contain really big // values, which actually are bigger than this_rd itself. This can @@ -3743,7 +3763,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } if (!x->skip) { - for (i = 0; i <= SWITCHABLE_FILTERS; i++) { + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { if (best_filter_rd[i] == INT64_MAX) best_filter_diff[i] = 0; else @@ -3779,6 +3799,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, + const TileInfo *const tile, int mi_row, int mi_col, int *returnrate, int64_t *returndistortion, @@ -3807,15 +3828,15 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int64_t best_tx_diff[TX_MODES]; int64_t best_pred_diff[NB_PREDICTION_TYPES]; int64_t best_pred_rd[NB_PREDICTION_TYPES]; - int64_t best_filter_rd[SWITCHABLE_FILTERS + 1]; - int64_t best_filter_diff[SWITCHABLE_FILTERS + 1]; + int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS]; + int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]; MB_MODE_INFO best_mbmode = { 0 }; int mode_index, best_mode_index = 0; unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES]; vp9_prob comp_mode_p; int64_t best_inter_rd = INT64_MAX; MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME; - INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE; + INTERPOLATION_TYPE tmp_best_filter = SWITCHABLE; int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES]; int64_t dist_uv[TX_SIZES]; int skip_uv[TX_SIZES]; @@ -3830,7 +3851,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int best_skip2 = 0; x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH; - vp9_zero(x->zcoeff_blk); + vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4); for (i = 0; i < 4; i++) { int j; @@ -3845,7 +3866,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, best_pred_rd[i] = INT64_MAX; for (i = 0; i < TX_MODES; i++) best_tx_rd[i] = INT64_MAX; - for (i = 0; i <= SWITCHABLE_FILTERS; i++) + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) best_filter_rd[i] = INT64_MAX; for (i = 0; i < TX_SIZES; i++) rate_uv_intra[i] = INT_MAX; @@ -3863,8 +3884,9 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { if (cpi->ref_frame_flags & flag_list[ref_frame]) { - setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame, block_size, - mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV], + setup_buffer_inter(cpi, x, tile, idx_list[ref_frame], ref_frame, + block_size, mi_row, mi_col, + frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb, scale_factor); } frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; @@ -3962,11 +3984,11 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, // TODO(jingning, jkoleszar): scaling reference frame not supported for // sub8x8 blocks. 
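The ref-frame checks that open the next hunk now hand vp9_is_scaled() the shared scale_factors_common rather than the per-instance wrapper. The predicate's body is outside this diff; assuming the usual fixed-point convention, where REF_NO_SCALE (1 << REF_SCALE_SHIFT) encodes a 1:1 ratio, it is presumably:

/* Presumed predicate (hedged; the real definition lives in vp9_scale.h).
 * A reference is "scaled" when either axis ratio differs from 1:1. */
static INLINE int vp9_is_scaled(const struct scale_factors_common *sfc) {
  return sfc->x_scale_fp != REF_NO_SCALE || sfc->y_scale_fp != REF_NO_SCALE;
}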
if (ref_frame > 0 && - vp9_is_scaled(&scale_factor[ref_frame])) + vp9_is_scaled(scale_factor[ref_frame].sfc)) continue; if (second_ref_frame > 0 && - vp9_is_scaled(&scale_factor[second_ref_frame])) + vp9_is_scaled(scale_factor[second_ref_frame].sfc)) continue; set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor); @@ -4094,7 +4116,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, mbmi->interp_filter = switchable_filter_index; vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); - tmp_rd = rd_pick_best_mbsegmentation(cpi, x, + tmp_rd = rd_pick_best_mbsegmentation(cpi, x, tile, &mbmi->ref_mvs[ref_frame][0], second_ref, best_yrd, @@ -4130,8 +4152,10 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, tmp_best_sse = total_sse; tmp_best_skippable = skippable; tmp_best_mbmode = *mbmi; - for (i = 0; i < 4; i++) + for (i = 0; i < 4; i++) { tmp_best_bmodes[i] = xd->mi_8x8[0]->bmi[i]; + x->zcoeff_blk[TX_4X4][i] = !xd->plane[0].eobs[i]; + } pred_exists = 1; if (switchable_filter_index == 0 && cpi->sf.use_rd_breakout && @@ -4158,7 +4182,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, if (!pred_exists) { // Handles the special case when a filter that is not in the // switchable list (bilinear, 6-tap) is indicated at the frame level - tmp_rd = rd_pick_best_mbsegmentation(cpi, x, + tmp_rd = rd_pick_best_mbsegmentation(cpi, x, tile, &mbmi->ref_mvs[ref_frame][0], second_ref, best_yrd, @@ -4286,7 +4310,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, if (!disable_skip && ref_frame == INTRA_FRAME) { for (i = 0; i < NB_PREDICTION_TYPES; ++i) best_pred_rd[i] = MIN(best_pred_rd[i], this_rd); - for (i = 0; i <= SWITCHABLE_FILTERS; i++) + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) best_filter_rd[i] = MIN(best_filter_rd[i], this_rd); } @@ -4364,7 +4388,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, cm->mcomp_filter_type != BILINEAR) { int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ? SWITCHABLE_FILTERS : cm->mcomp_filter_type]; - for (i = 0; i <= SWITCHABLE_FILTERS; i++) { + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { int64_t adj_rd; // In cases of poor prediction, filter_cache[] can contain really big // values, which actually are bigger than this_rd itself. 
This can @@ -4480,7 +4504,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, } if (!x->skip) { - for (i = 0; i <= SWITCHABLE_FILTERS; i++) { + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { if (best_filter_rd[i] == INT64_MAX) best_filter_diff[i] = 0; else diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h index 0b0bb18d7..92fb23548 100644 --- a/vp9/encoder/vp9_rdopt.h +++ b/vp9/encoder/vp9_rdopt.h @@ -18,6 +18,8 @@ (((128 + ((int64_t)R) * (RM)) >> 8) + (D << DM)) #define QIDX_SKIP_THRESH 115 +struct TileInfo; + int vp9_compute_rd_mult(VP9_COMP *cpi, int qindex); void vp9_initialize_rd_consts(VP9_COMP *cpi); @@ -29,14 +31,22 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int64_t best_rd); int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, + const struct TileInfo *const tile, int mi_row, int mi_col, - int *r, int64_t *d, BLOCK_SIZE bsize, - PICK_MODE_CONTEXT *ctx, int64_t best_rd); + int *returnrate, + int64_t *returndistortion, + BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, + int64_t best_rd_so_far); int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, + const struct TileInfo *const tile, int mi_row, int mi_col, - int *r, int64_t *d, BLOCK_SIZE bsize, - PICK_MODE_CONTEXT *ctx, int64_t best_rd); + int *returnrate, + int64_t *returndistortion, + BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, + int64_t best_rd_so_far); void vp9_init_me_luts(); diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c index 72e6be1e8..24f011f83 100644 --- a/vp9/encoder/vp9_segmentation.c +++ b/vp9/encoder/vp9_segmentation.c @@ -117,7 +117,8 @@ static int cost_segmap(int *segcounts, vp9_prob *probs) { return cost; } -static void count_segs(VP9_COMP *cpi, MODE_INFO **mi_8x8, +static void count_segs(VP9_COMP *cpi, const TileInfo *const tile, + MODE_INFO **mi_8x8, int *no_pred_segcounts, int (*temporal_predictor_count)[2], int *t_unpred_seg_counts, @@ -132,7 +133,7 @@ static void count_segs(VP9_COMP *cpi, MODE_INFO **mi_8x8, xd->mi_8x8 = mi_8x8; segment_id = xd->mi_8x8[0]->mbmi.segment_id; - set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw); + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols); // Count the number of hits on each segment with no prediction no_pred_segcounts[segment_id]++; @@ -157,7 +158,8 @@ static void count_segs(VP9_COMP *cpi, MODE_INFO **mi_8x8, } } -static void count_segs_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, +static void count_segs_sb(VP9_COMP *cpi, const TileInfo *const tile, + MODE_INFO **mi_8x8, int *no_pred_segcounts, int (*temporal_predictor_count)[2], int *t_unpred_seg_counts, @@ -175,19 +177,20 @@ static void count_segs_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, bh = num_8x8_blocks_high_lookup[mi_8x8[0]->mbmi.sb_type]; if (bw == bs && bh == bs) { - count_segs(cpi, mi_8x8, no_pred_segcounts, temporal_predictor_count, + count_segs(cpi, tile, mi_8x8, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, bs, bs, mi_row, mi_col); } else if (bw == bs && bh < bs) { - count_segs(cpi, mi_8x8, no_pred_segcounts, temporal_predictor_count, + count_segs(cpi, tile, mi_8x8, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, bs, hbs, mi_row, mi_col); - count_segs(cpi, mi_8x8 + hbs * mis, no_pred_segcounts, + count_segs(cpi, tile, mi_8x8 + hbs * mis, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, bs, hbs, mi_row + hbs, mi_col); } else if (bw < bs && bh == bs) { - count_segs(cpi, mi_8x8, no_pred_segcounts, 
temporal_predictor_count, + count_segs(cpi, tile, mi_8x8, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, hbs, bs, mi_row, mi_col); - count_segs(cpi, mi_8x8 + hbs, no_pred_segcounts, temporal_predictor_count, - t_unpred_seg_counts, hbs, bs, mi_row, mi_col + hbs); + count_segs(cpi, tile, mi_8x8 + hbs, + no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, + hbs, bs, mi_row, mi_col + hbs); } else { const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize]; int n; @@ -198,7 +201,7 @@ static void count_segs_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, const int mi_dc = hbs * (n & 1); const int mi_dr = hbs * (n >> 1); - count_segs_sb(cpi, &mi_8x8[mi_dr * mis + mi_dc], + count_segs_sb(cpi, tile, &mi_8x8[mi_dr * mis + mi_dc], no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, mi_row + mi_dr, mi_col + mi_dc, subsize); @@ -234,15 +237,18 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { // First of all generate stats regarding how well the last segment map // predicts this one for (tile_col = 0; tile_col < 1 << cm->log2_tile_cols; tile_col++) { - vp9_get_tile_col_offsets(cm, tile_col); - mi_ptr = cm->mi_grid_visible + cm->cur_tile_mi_col_start; + TileInfo tile; + + vp9_tile_init(&tile, cm, 0, tile_col); + mi_ptr = cm->mi_grid_visible + tile.mi_col_start; for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 8, mi_ptr += 8 * mis) { mi = mi_ptr; - for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end; + for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end; mi_col += 8, mi += 8) - count_segs_sb(cpi, mi, no_pred_segcounts, temporal_predictor_count, - t_unpred_seg_counts, mi_row, mi_col, BLOCK_64X64); + count_segs_sb(cpi, &tile, mi, no_pred_segcounts, + temporal_predictor_count, t_unpred_seg_counts, + mi_row, mi_col, BLOCK_64X64); } } diff --git a/vp9/encoder/vp9_subexp.c b/vp9/encoder/vp9_subexp.c index eb864d96c..387fc9056 100644 --- a/vp9/encoder/vp9_subexp.c +++ b/vp9/encoder/vp9_subexp.c @@ -221,7 +221,7 @@ int vp9_prob_diff_update_savings_search_model(const unsigned int *ct, } void vp9_cond_prob_diff_update(vp9_writer *w, vp9_prob *oldp, - unsigned int *ct) { + const unsigned int ct[2]) { const vp9_prob upd = DIFF_UPDATE_PROB; vp9_prob newp = get_binary_prob(ct[0], ct[1]); const int savings = vp9_prob_diff_update_savings_search(ct, *oldp, &newp, diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index 6ea05793d..2cace0378 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -38,14 +38,15 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd, int stride, int mv_row, int mv_col, - uint8_t *pred) { + uint8_t *pred, + struct scale_factors *scale) { const int which_mv = 0; MV mv = { mv_row, mv_col }; vp9_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, - &xd->scale_factor[which_mv], + scale, 16, 16, which_mv, &xd->subpix, MV_PRECISION_Q3); @@ -55,7 +56,7 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd, vp9_build_inter_predictor(u_mb_ptr, stride, &pred[256], 8, &mv, - &xd->scale_factor[which_mv], + scale, 8, 8, which_mv, &xd->subpix, MV_PRECISION_Q4); @@ -63,7 +64,7 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd, vp9_build_inter_predictor(v_mb_ptr, stride, &pred[320], 8, &mv, - &xd->scale_factor[which_mv], + scale, 8, 8, which_mv, &xd->subpix, MV_PRECISION_Q4); @@ -186,7 +187,8 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, static void temporal_filter_iterate_c(VP9_COMP *cpi, int frame_count, int 
alt_ref_index, - int strength) { + int strength, + struct scale_factors *scale) { int byte; int frame; int mb_col, mb_row; @@ -280,7 +282,7 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, cpi->frames[frame]->y_stride, mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.row, mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.col, - predictor); + predictor, scale); // Apply the filter (YUV) vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride, @@ -374,6 +376,9 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) { const int num_frames_forward = vp9_lookahead_depth(cpi->lookahead) - (num_frames_backward + 1); + struct scale_factors scale; + struct scale_factors_common scale_comm; + switch (blur_type) { case 1: // Backward Blur @@ -432,9 +437,9 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) { #endif // Setup scaling factors. Scaling on each of the arnr frames is not supported - vp9_setup_scale_factors_for_frame(&cpi->mb.e_mbd.scale_factor[0], - cm->yv12_fb[cm->new_fb_idx].y_crop_width, - cm->yv12_fb[cm->new_fb_idx].y_crop_height, + vp9_setup_scale_factors_for_frame(&scale, &scale_comm, + get_frame_new_buffer(cm)->y_crop_width, + get_frame_new_buffer(cm)->y_crop_height, cm->width, cm->height); // Setup frame pointers, NULL indicates frame not included in filter @@ -447,7 +452,7 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) { } temporal_filter_iterate_c(cpi, frames_to_blur, frames_to_blur_backward, - strength); + strength, &scale); } void configure_arnr_filter(VP9_COMP *cpi, const unsigned int this_frame, diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c index 550263aa8..7d4676e97 100644 --- a/vp9/encoder/vp9_tokenize.c +++ b/vp9/encoder/vp9_tokenize.c @@ -21,14 +21,6 @@ #include "vp9/common/vp9_seg_common.h" #include "vp9/common/vp9_entropy.h" -/* Global event counters used for accumulating statistics across several - compressions, then generating vp9_context.c = initial stats. 
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 6ea05793d..2cace0378 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -38,14 +38,15 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
                                             int stride,
                                             int mv_row,
                                             int mv_col,
-                                            uint8_t *pred) {
+                                            uint8_t *pred,
+                                            struct scale_factors *scale) {
   const int which_mv = 0;
   MV mv = { mv_row, mv_col };
 
   vp9_build_inter_predictor(y_mb_ptr, stride,
                             &pred[0], 16,
                             &mv,
-                            &xd->scale_factor[which_mv],
+                            scale,
                             16, 16,
                             which_mv,
                             &xd->subpix, MV_PRECISION_Q3);
@@ -55,7 +56,7 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
   vp9_build_inter_predictor(u_mb_ptr, stride,
                             &pred[256], 8,
                             &mv,
-                            &xd->scale_factor[which_mv],
+                            scale,
                             8, 8,
                             which_mv,
                             &xd->subpix, MV_PRECISION_Q4);
@@ -63,7 +64,7 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
   vp9_build_inter_predictor(v_mb_ptr, stride,
                             &pred[320], 8,
                             &mv,
-                            &xd->scale_factor[which_mv],
+                            scale,
                             8, 8,
                             which_mv,
                             &xd->subpix, MV_PRECISION_Q4);
@@ -186,7 +187,8 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
 static void temporal_filter_iterate_c(VP9_COMP *cpi,
                                       int frame_count,
                                       int alt_ref_index,
-                                      int strength) {
+                                      int strength,
+                                      struct scale_factors *scale) {
   int byte;
   int frame;
   int mb_col, mb_row;
@@ -280,7 +282,7 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
                                             cpi->frames[frame]->y_stride,
                                             mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.row,
                                             mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.col,
-                                            predictor);
+                                            predictor, scale);
 
         // Apply the filter (YUV)
         vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
@@ -374,6 +376,9 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
   const int num_frames_forward = vp9_lookahead_depth(cpi->lookahead)
                                - (num_frames_backward + 1);
 
+  struct scale_factors scale;
+  struct scale_factors_common scale_comm;
+
   switch (blur_type) {
     case 1:
       // Backward Blur
@@ -432,9 +437,9 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
 #endif
 
   // Setup scaling factors. Scaling on each of the arnr frames is not supported
-  vp9_setup_scale_factors_for_frame(&cpi->mb.e_mbd.scale_factor[0],
-                                    cm->yv12_fb[cm->new_fb_idx].y_crop_width,
-                                    cm->yv12_fb[cm->new_fb_idx].y_crop_height,
+  vp9_setup_scale_factors_for_frame(&scale, &scale_comm,
+                                    get_frame_new_buffer(cm)->y_crop_width,
+                                    get_frame_new_buffer(cm)->y_crop_height,
                                     cm->width, cm->height);
 
   // Setup frame pointers, NULL indicates frame not included in filter
@@ -447,7 +452,7 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
   }
 
   temporal_filter_iterate_c(cpi, frames_to_blur, frames_to_blur_backward,
-                            strength);
+                            strength, &scale);
 }
 
 void configure_arnr_filter(VP9_COMP *cpi, const unsigned int this_frame,
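Note: the temporal filter no longer reads scale factors out of xd->scale_factor[]; vp9_temporal_filter_prepare() builds one scale_factors/scale_factors_common pair on the stack and threads it down explicitly. A sketch of the plumbing, using only the calls visible in the hunks above:

    /* Caller-owned scaling state, set up once per filter invocation. */
    struct scale_factors scale;
    struct scale_factors_common scale_comm;

    /* Map the new frame buffer's crop size onto the coded frame size. */
    vp9_setup_scale_factors_for_frame(&scale, &scale_comm,
                                      get_frame_new_buffer(cm)->y_crop_width,
                                      get_frame_new_buffer(cm)->y_crop_height,
                                      cm->width, cm->height);

    /* The same object is handed through every level of the filter ... */
    temporal_filter_iterate_c(cpi, frames_to_blur, frames_to_blur_backward,
                              strength, &scale);
    /* ... which forwards it to temporal_filter_predictors_mb_c() and on
     * to vp9_build_inter_predictor(..., scale, ...). */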
"Inter" : "Intra"); - band = 0; - do { - fprintf(f, "%s\n { /* Coeff Band %d */", Comma(band), band); - pt = 0; - do { - fprintf(f, "%s\n {", Comma(pt)); - - t = 0; - do { - const int64_t x = context_counters[type][ref][band][pt][t]; - const int y = (int) x; - - assert(x == (int64_t) y); /* no overflow handling yet */ - fprintf(f, "%s %d", Comma(t), y); - } while (++t < 1 + MAX_ENTROPY_TOKENS); - fprintf(f, "}"); - } while (++pt < PREV_COEF_CONTEXTS); - fprintf(f, "\n }"); - } while (++band < COEF_BANDS); - fprintf(f, "\n }"); - } while (++ref < REF_TYPES); - fprintf(f, "\n }"); - } while (++type < block_types); - fprintf(f, "\n};\n"); -} - -static void print_probs(FILE *f, vp9_coeff_accum *context_counters, - int block_types, const char *header) { - int type, ref, band, pt, t; - - fprintf(f, "static const vp9_coeff_probs %s = {", header); - - type = 0; -#define Newline(x, spaces) (x ? " " : "\n" spaces) - do { - fprintf(f, "%s%s{ /* block Type %d */", - Comma(type), Newline(type, " "), type); - ref = 0; - do { - fprintf(f, "%s%s{ /* %s */", - Comma(band), Newline(band, " "), ref ? "Inter" : "Intra"); - band = 0; - do { - fprintf(f, "%s%s{ /* Coeff Band %d */", - Comma(band), Newline(band, " "), band); - pt = 0; - do { - unsigned int branch_ct[ENTROPY_NODES][2]; - unsigned int coef_counts[MAX_ENTROPY_TOKENS + 1]; - vp9_prob coef_probs[ENTROPY_NODES]; - - if (pt >= 3 && band == 0) - break; - for (t = 0; t < MAX_ENTROPY_TOKENS + 1; ++t) - coef_counts[t] = context_counters[type][ref][band][pt][t]; - vp9_tree_probs_from_distribution(vp9_coef_tree, coef_probs, - branch_ct, coef_counts, 0); - branch_ct[0][1] = coef_counts[MAX_ENTROPY_TOKENS] - branch_ct[0][0]; - coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]); - fprintf(f, "%s\n {", Comma(pt)); - - t = 0; - do { - fprintf(f, "%s %3d", Comma(t), coef_probs[t]); - } while (++t < ENTROPY_NODES); - - fprintf(f, " }"); - } while (++pt < PREV_COEF_CONTEXTS); - fprintf(f, "\n }"); - } while (++band < COEF_BANDS); - fprintf(f, "\n }"); - } while (++ref < REF_TYPES); - fprintf(f, "\n }"); - } while (++type < block_types); - fprintf(f, "\n};\n"); -} - -void print_context_counters() { - FILE *f = fopen("vp9_context.c", "w"); - - fprintf(f, "#include \"vp9_entropy.h\"\n"); - fprintf(f, "\n/* *** GENERATED FILE: DO NOT EDIT *** */\n\n"); - - /* print counts */ - print_counter(f, context_counters[TX_4X4], BLOCK_TYPES, - "vp9_default_coef_counts_4x4[BLOCK_TYPES]"); - print_counter(f, context_counters[TX_8X8], BLOCK_TYPES, - "vp9_default_coef_counts_8x8[BLOCK_TYPES]"); - print_counter(f, context_counters[TX_16X16], BLOCK_TYPES, - "vp9_default_coef_counts_16x16[BLOCK_TYPES]"); - print_counter(f, context_counters[TX_32X32], BLOCK_TYPES, - "vp9_default_coef_counts_32x32[BLOCK_TYPES]"); - - /* print coefficient probabilities */ - print_probs(f, context_counters[TX_4X4], BLOCK_TYPES, - "default_coef_probs_4x4[BLOCK_TYPES]"); - print_probs(f, context_counters[TX_8X8], BLOCK_TYPES, - "default_coef_probs_8x8[BLOCK_TYPES]"); - print_probs(f, context_counters[TX_16X16], BLOCK_TYPES, - "default_coef_probs_16x16[BLOCK_TYPES]"); - print_probs(f, context_counters[TX_32X32], BLOCK_TYPES, - "default_coef_probs_32x32[BLOCK_TYPES]"); - - fclose(f); - - f = fopen("context.bin", "wb"); - fwrite(context_counters, sizeof(context_counters), 1, f); - fclose(f); -} -#endif - void vp9_tokenize_initialize() { fill_value_tokens(); } diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h index b78e100ec..e24e31b80 100644 --- a/vp9/encoder/vp9_tokenize.h +++ 
diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h
index b78e100ec..e24e31b80 100644
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h
@@ -28,9 +28,6 @@ typedef struct {
   uint8_t skip_eob_node;
 } TOKENEXTRA;
 
-typedef int64_t vp9_coeff_accum[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
-                               [MAX_ENTROPY_TOKENS + 1];
-
 int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE bsize);
 int vp9_is_skippable_in_plane(MACROBLOCKD *xd, BLOCK_SIZE bsize,
                               int plane);
@@ -39,13 +36,6 @@ struct VP9_COMP;
 void vp9_tokenize_sb(struct VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
                      BLOCK_SIZE bsize);
 
-#ifdef ENTROPY_STATS
-void init_context_counters();
-void print_context_counters();
-
-extern vp9_coeff_accum context_counters[TX_SIZES][BLOCK_TYPES];
-#endif
-
 extern const int *vp9_dct_value_cost_ptr;
 /* TODO: The Token field should be broken out into a separate char array to
  *  improve cache locality, since it's needed for costing when the rest of the
diff --git a/vp9/encoder/vp9_vaq.c b/vp9/encoder/vp9_vaq.c
index 3179ae301..1f9cb8709 100644
--- a/vp9/encoder/vp9_vaq.c
+++ b/vp9/encoder/vp9_vaq.c
@@ -118,8 +118,8 @@ static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x,
                       ((-xd->mb_to_bottom_edge) >> 3) : 0;
 
   if (right_overflow || bottom_overflow) {
-    int bw = (1 << (mi_width_log2(bs) + 3)) - right_overflow;
-    int bh = (1 << (mi_height_log2(bs) + 3)) - bottom_overflow;
+    const int bw = 8 * num_8x8_blocks_wide_lookup[bs] - right_overflow;
+    const int bh = 8 * num_8x8_blocks_high_lookup[bs] - bottom_overflow;
     int avg;
     variance(x->plane[0].src.buf, x->plane[0].src.stride,
             vp9_64_zeros, 0, bw, bh, &sse, &avg);
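Note: in the vp9_vaq.c hunk, `8 * num_8x8_blocks_wide_lookup[bs]` and `1 << (mi_width_log2(bs) + 3)` compute the same pixel width; the lookup form avoids the function call and lets bw/bh be const. A standalone check of the equivalence for the square block sizes (the table here mirrors the lookup values for this test only; it is not the library's array):

    #include <assert.h>

    int main(void) {
      /* BLOCK_8X8, 16X16, 32X32, 64X64 */
      static const int num_8x8_wide[4] = { 1, 2, 4, 8 };
      static const int mi_width_log2[4] = { 0, 1, 2, 3 };
      int bs;
      for (bs = 0; bs < 4; ++bs)
        assert(8 * num_8x8_wide[bs] == (1 << (mi_width_log2[bs] + 3)));
      return 0;
    }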
diff --git a/vp9/encoder/x86/vp9_dct32x32_sse2.c b/vp9/encoder/x86/vp9_dct32x32_sse2.c
index de47a5bf1..2d59775ce 100644
--- a/vp9/encoder/x86/vp9_dct32x32_sse2.c
+++ b/vp9/encoder/x86/vp9_dct32x32_sse2.c
@@ -29,7 +29,7 @@ static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
 }
 #endif
 
-void FDCT32x32_2D(int16_t *input,
+void FDCT32x32_2D(const int16_t *input,
                   int16_t *output_org, int stride) {
   // Calculate pre-multiplied strides
   const int str1 = stride;
@@ -93,13 +93,13 @@ void FDCT32x32_2D(int16_t *input,
    // Note: even though all the loads below are aligned, using the aligned
    //       intrinsic make the code slightly slower.
    if (0 == pass) {
-      int16_t *in = &input[column_start];
+      const int16_t *in = &input[column_start];
      // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2;
      // Note: the next four blocks could be in a loop. That would help the
      //       instruction cache but is actually slower.
      {
-        int16_t *ina = in + 0 * str1;
-        int16_t *inb = in + 31 * str1;
+        const int16_t *ina = in + 0 * str1;
+        const int16_t *inb = in + 31 * str1;
        __m128i *step1a = &step1[ 0];
        __m128i *step1b = &step1[31];
        const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
@@ -128,8 +128,8 @@ void FDCT32x32_2D(int16_t *input,
        step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
      }
      {
-        int16_t *ina = in + 4 * str1;
-        int16_t *inb = in + 27 * str1;
+        const int16_t *ina = in + 4 * str1;
+        const int16_t *inb = in + 27 * str1;
        __m128i *step1a = &step1[ 4];
        __m128i *step1b = &step1[27];
        const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
@@ -158,8 +158,8 @@ void FDCT32x32_2D(int16_t *input,
        step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
      }
      {
-        int16_t *ina = in + 8 * str1;
-        int16_t *inb = in + 23 * str1;
+        const int16_t *ina = in + 8 * str1;
+        const int16_t *inb = in + 23 * str1;
        __m128i *step1a = &step1[ 8];
        __m128i *step1b = &step1[23];
        const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
@@ -188,8 +188,8 @@ void FDCT32x32_2D(int16_t *input,
        step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
      }
      {
-        int16_t *ina = in + 12 * str1;
-        int16_t *inb = in + 19 * str1;
+        const int16_t *ina = in + 12 * str1;
+        const int16_t *inb = in + 19 * str1;
        __m128i *step1a = &step1[12];
        __m128i *step1b = &step1[19];
        const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index fa60e80eb..dc115018e 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@@ -12,7 +12,7 @@
 #include "vp9/common/vp9_idct.h"  // for cospi constants
 #include "vpx_ports/mem.h"
 
-void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int stride) {
+void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
   // The 2D transform is done with two passes which are actually pretty
   // similar. In the first one, we transform the columns and transpose
   // the results. In the second one, we transform the rows. To achieve that,
@@ -111,7 +111,8 @@ void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int stride) {
   }
 }
 
-static INLINE void load_buffer_4x4(int16_t *input, __m128i *in, int stride) {
+static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
+                                   int stride) {
   const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
   const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
   __m128i mask;
@@ -242,7 +243,7 @@ void fadst4_1d_sse2(__m128i *in) {
   transpose_4x4(in);
 }
 
-void vp9_short_fht4x4_sse2(int16_t *input, int16_t *output,
+void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output,
                            int stride, int tx_type) {
   __m128i in[4];
   load_buffer_4x4(input, in, stride);
@@ -270,7 +271,7 @@ void vp9_short_fht4x4_sse2(int16_t *input, int16_t *output,
   write_buffer_4x4(output, in);
 }
 
-void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int stride) {
+void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
   int pass;
   // Constants
   //    When we use them, in one case, they are all the same. In all others
@@ -527,15 +528,16 @@ void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int stride) {
 }
 
 // load 8x8 array
-static INLINE void load_buffer_8x8(int16_t *input, __m128i *in, int stride) {
-  in[0] = _mm_load_si128((__m128i *)(input + 0 * stride));
-  in[1] = _mm_load_si128((__m128i *)(input + 1 * stride));
-  in[2] = _mm_load_si128((__m128i *)(input + 2 * stride));
-  in[3] = _mm_load_si128((__m128i *)(input + 3 * stride));
-  in[4] = _mm_load_si128((__m128i *)(input + 4 * stride));
-  in[5] = _mm_load_si128((__m128i *)(input + 5 * stride));
-  in[6] = _mm_load_si128((__m128i *)(input + 6 * stride));
-  in[7] = _mm_load_si128((__m128i *)(input + 7 * stride));
+static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
+                                   int stride) {
+  in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+  in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+  in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+  in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+  in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+  in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+  in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+  in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
 
   in[0] = _mm_slli_epi16(in[0], 2);
   in[1] = _mm_slli_epi16(in[1], 2);
@@ -1025,7 +1027,7 @@ void fadst8_1d_sse2(__m128i *in) {
   array_transpose_8x8(in, in);
 }
 
-void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output,
+void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output,
                            int stride, int tx_type) {
   __m128i in[8];
   load_buffer_8x8(input, in, stride);
@@ -1054,7 +1056,7 @@ void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output,
   write_buffer_8x8(output, in, 8);
 }
 
-void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int stride) {
+void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
   // The 2D transform is done with two passes which are actually pretty
   // similar. In the first one, we transform the columns and transpose
   // the results. In the second one, we transform the rows. To achieve that,
@@ -1064,7 +1066,7 @@ void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int stride) {
   int pass;
   // We need an intermediate buffer between passes.
   DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
-  int16_t *in = input;
+  const int16_t *in = input;
   int16_t *out = intermediate;
   // Constants
   //    When we use them, in one case, they are all the same. In all others
@@ -1679,7 +1681,7 @@ void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int stride) {
   }
 }
 
-static INLINE void load_buffer_16x16(int16_t* input, __m128i *in0,
+static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
                                      __m128i *in1, int stride) {
   // load first 8 columns
   load_buffer_8x8(input, in0, stride);
@@ -2531,7 +2533,7 @@ void fadst16_1d_sse2(__m128i *in0, __m128i *in1) {
   array_transpose_16x16(in0, in1);
 }
 
-void vp9_short_fht16x16_sse2(int16_t *input, int16_t *output,
+void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output,
                              int stride, int tx_type) {
   __m128i in0[16], in1[16];
   load_buffer_16x16(input, in0, in1, stride);
@@ -2563,13 +2565,13 @@ void vp9_short_fht16x16_sse2(int16_t *input, int16_t *output,
   write_buffer_16x16(output, in0, in1, 16);
 }
 
-#define FDCT32x32_2D vp9_short_fdct32x32_rd_sse2
+#define FDCT32x32_2D vp9_fdct32x32_rd_sse2
 #define FDCT32x32_HIGH_PRECISION 0
 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c"
 #undef FDCT32x32_2D
 #undef FDCT32x32_HIGH_PRECISION
 
-#define FDCT32x32_2D vp9_short_fdct32x32_sse2
+#define FDCT32x32_2D vp9_fdct32x32_sse2
 #define FDCT32x32_HIGH_PRECISION 1
 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c"  // NOLINT
 #undef FDCT32x32_2D
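Note: the #define/#include/#undef block closing vp9_dct_sse2.c compiles vp9_dct32x32_sse2.c twice, once per precision setting, so one source file yields both vp9_fdct32x32_sse2 and vp9_fdct32x32_rd_sse2. A generic illustration of that macro-templating idiom (hypothetical file and function names):

    /* kernel.inc -- compiled once per inclusion; the includer defines
     * KERNEL_NAME and KERNEL_HIGH_PRECISION before each #include. */
    void KERNEL_NAME(const short *in, short *out, int n) {
      int i;
      for (i = 0; i < n; ++i)
    #if KERNEL_HIGH_PRECISION
        out[i] = in[i];       /* full-precision variant */
    #else
        out[i] = in[i] / 2;   /* reduced-precision (rd) variant */
    #endif
    }

    /* kernels.c -- instantiate both variants from the one template. */
    #define KERNEL_NAME fdct_rd
    #define KERNEL_HIGH_PRECISION 0
    #include "kernel.inc"
    #undef KERNEL_NAME
    #undef KERNEL_HIGH_PRECISION

    #define KERNEL_NAME fdct
    #define KERNEL_HIGH_PRECISION 1
    #include "kernel.inc"
    #undef KERNEL_NAME
    #undef KERNEL_HIGH_PRECISION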
+ return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t vp9e_set_activemap(vpx_codec_alg_priv_t *ctx, int ctr_id, va_list args) { - vpx_active_map_t *data = va_arg(args, vpx_active_map_t *); - - if (data) { - vpx_active_map_t *map = (vpx_active_map_t *)data; - - if (!vp9_set_active_map(ctx->cpi, map->active_map, map->rows, map->cols)) - return VPX_CODEC_OK; - else - return VPX_CODEC_INVALID_PARAM; - } else { - return VPX_CODEC_INVALID_PARAM; - } + // TODO(yaowu): Need to re-implement and test for VP9. + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t vp9e_set_scalemode(vpx_codec_alg_priv_t *ctx, @@ -1014,8 +993,9 @@ static vpx_codec_err_t vp9e_set_scalemode(vpx_codec_alg_priv_t *ctx, if (data) { int res; vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data; - res = vp9_set_internal_size(ctx->cpi, scalemode.h_scaling_mode, - scalemode.v_scaling_mode); + res = vp9_set_internal_size(ctx->cpi, + (VPX_SCALING)scalemode.h_scaling_mode, + (VPX_SCALING)scalemode.v_scaling_mode); if (!res) { return VPX_CODEC_OK; diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index 6b923162f..5dacab454 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -172,9 +172,9 @@ static vpx_codec_err_t vp9_peek_si(const uint8_t *data, rb.bit_offset += 1; // show frame rb.bit_offset += 1; // error resilient - if (vp9_rb_read_literal(&rb, 8) != SYNC_CODE_0 || - vp9_rb_read_literal(&rb, 8) != SYNC_CODE_1 || - vp9_rb_read_literal(&rb, 8) != SYNC_CODE_2) { + if (vp9_rb_read_literal(&rb, 8) != VP9_SYNC_CODE_0 || + vp9_rb_read_literal(&rb, 8) != VP9_SYNC_CODE_1 || + vp9_rb_read_literal(&rb, 8) != VP9_SYNC_CODE_2) { return VPX_CODEC_UNSUP_BITSTREAM; } diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index b454eee02..0993c6ce6 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -20,6 +20,7 @@ VP9_CX_SRCS-yes += vp9_cx_iface.c VP9_CX_SRCS-yes += encoder/vp9_bitstream.c VP9_CX_SRCS-yes += encoder/vp9_boolhuff.c VP9_CX_SRCS-yes += encoder/vp9_dct.c +VP9_CX_SRCS-yes += encoder/vp9_dct.h VP9_CX_SRCS-yes += encoder/vp9_encodeframe.c VP9_CX_SRCS-yes += encoder/vp9_encodeframe.h VP9_CX_SRCS-yes += encoder/vp9_encodeintra.c |