Diffstat (limited to 'vp9')
96 files changed, 6473 insertions, 3676 deletions
diff --git a/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm b/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm index 388a7d719..72e933eee 100644 --- a/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm +++ b/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm @@ -72,7 +72,7 @@ cospi_31_64 EQU 804 ; reg1 = output[first_offset] ; reg2 = output[second_offset] ; for proper address calculation, the last offset used when manipulating - ; output, wethere reading or storing) must be passed in. use 0 for first + ; output, whether reading or storing) must be passed in. use 0 for first ; use. MACRO LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 @@ -88,7 +88,7 @@ cospi_31_64 EQU 804 ; output[first_offset] = reg1 ; output[second_offset] = reg2 ; for proper address calculation, the last offset used when manipulating - ; output, wethere reading or storing) must be passed in. use 0 for first + ; output, whether reading or storing) must be passed in. use 0 for first ; use. MACRO STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 @@ -242,7 +242,7 @@ cospi_31_64 EQU 804 ; TODO(cd): have special case to re-use constants when they are similar for ; consecutive butterflies ; TODO(cd): have special case when both constants are the same, do the - ; additions/substractions before the multiplies. + ; additions/subtractions before the multiplies. ; generate the constants ; generate scalar constants mov r8, #$first_constant & 0xFF00 @@ -260,7 +260,7 @@ cospi_31_64 EQU 804 vmull.s16 q11, $regB, d31 vmull.s16 q12, $regC, d31 ; (used) five for intermediate (q8-q12), one for constants (q15) - ; do some addition/substractions (to get back two register) + ; do some addition/subtractions (to get back two register) vsub.s32 q8, q8, q10 vsub.s32 q9, q9, q11 ; do more multiplications (ordered for maximum latency hiding) @@ -268,7 +268,7 @@ cospi_31_64 EQU 804 vmull.s16 q11, $regA, d30 vmull.s16 q15, $regB, d30 ; (used) six for intermediate (q8-q12, q15) - ; do more addition/substractions + ; do more addition/subtractions vadd.s32 q11, q12, q11 vadd.s32 q10, q10, q15 ; (used) four for intermediate (q8-q11) diff --git a/vp9/common/arm/neon/vp9_mb_lpf_neon.asm b/vp9/common/arm/neon/vp9_mb_lpf_neon.asm index 8cb913cb8..5fe2bba46 100644 --- a/vp9/common/arm/neon/vp9_mb_lpf_neon.asm +++ b/vp9/common/arm/neon/vp9_mb_lpf_neon.asm @@ -439,6 +439,9 @@ v_end tst r7, #1 bxne lr + orrs r5, r5, r6 ; Check for 0 + orreq r7, r7, #2 ; Only do mbfilter branch + ; mbfilter flat && mask branch ; TODO(fgalligan): Can I decrease the cycles shifting to consective d's ; and using vibt on the q's? 
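The TODO(cd) note in the vp9_idct32x32_add_neon.asm hunk above (do the additions/subtractions before the multiplies when both butterfly constants are the same) relies on a simple factorization. The sketch below shows that identity in C, assuming a generic two-constant idct butterfly; butterfly_same_constant is a hypothetical helper written only for illustration and does not appear in this diff.

#include <stdint.h>

/* Assumed generic butterfly: out0 = a*c1 + b*c2, out1 = a*c2 - b*c1.
 * When c1 == c2 == c, each output needs only one multiply if the
 * addition/subtraction is done first:
 *   out0 = (a + b) * c
 *   out1 = (a - b) * c
 * which is the saving the TODO proposes exploiting in the NEON macro. */
static void butterfly_same_constant(int16_t a, int16_t b, int16_t c,
                                    int32_t *out0, int32_t *out1) {
  *out0 = (int32_t)(a + b) * c;  /* replaces a*c + b*c */
  *out1 = (int32_t)(a - b) * c;  /* replaces a*c - b*c */
}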
diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.asm b/vp9/common/arm/neon/vp9_reconintra_neon.asm index 279f678b1..dc9856fa8 100644 --- a/vp9/common/arm/neon/vp9_reconintra_neon.asm +++ b/vp9/common/arm/neon/vp9_reconintra_neon.asm @@ -315,8 +315,8 @@ loop_h vdup.u16 q2, r2 vadd.s16 q1, q1, q3 vadd.s16 q2, q2, q3 - vqshrun.s16 d0, q1, #0 - vqshrun.s16 d1, q2, #0 + vqmovun.s16 d0, q1 + vqmovun.s16 d1, q2 vst1.32 {d0[0]}, [r0], r1 vst1.32 {d1[0]}, [r0], r1 @@ -327,8 +327,8 @@ loop_h vdup.u16 q2, r2 vadd.s16 q1, q1, q3 vadd.s16 q2, q2, q3 - vqshrun.s16 d0, q1, #0 - vqshrun.s16 d1, q2, #0 + vqmovun.s16 d0, q1 + vqmovun.s16 d1, q2 vst1.32 {d0[0]}, [r0], r1 vst1.32 {d1[0]}, [r0], r1 bx lr @@ -349,7 +349,7 @@ loop_h vdup.u8 d0, r12 ; preload 8 left - vld1.8 d30, [r3] + vld1.8 {d30}, [r3] ; Load above 8 pixels vld1.64 {d2}, [r2] @@ -372,10 +372,10 @@ loop_h vadd.s16 q8, q3, q8 vadd.s16 q9, q3, q9 - vqshrun.s16 d0, q0, #0 - vqshrun.s16 d1, q1, #0 - vqshrun.s16 d2, q8, #0 - vqshrun.s16 d3, q9, #0 + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q8 + vqmovun.s16 d3, q9 vst1.64 {d0}, [r0], r1 vst1.64 {d1}, [r0], r1 @@ -394,10 +394,10 @@ loop_h vadd.s16 q8, q3, q8 vadd.s16 q9, q3, q9 - vqshrun.s16 d0, q0, #0 - vqshrun.s16 d1, q1, #0 - vqshrun.s16 d2, q8, #0 - vqshrun.s16 d3, q9, #0 + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q8 + vqmovun.s16 d3, q9 vst1.64 {d0}, [r0], r1 vst1.64 {d1}, [r0], r1 @@ -422,10 +422,10 @@ loop_h vdup.u8 q0, r12 ; Load above 8 pixels - vld1.8 q1, [r2] + vld1.8 {q1}, [r2] ; preload 8 left into r12 - vld1.8 d18, [r3]! + vld1.8 {d18}, [r3]! ; Compute above - ytop_left vsubl.u8 q2, d2, d0 @@ -445,10 +445,10 @@ loop_16x16_neon vadd.s16 q0, q0, q3 vadd.s16 q11, q8, q2 vadd.s16 q8, q8, q3 - vqshrun.s16 d2, q1, #0 - vqshrun.s16 d3, q0, #0 - vqshrun.s16 d22, q11, #0 - vqshrun.s16 d23, q8, #0 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 vdup.16 q0, d20[2] ; proload next 2 rows data vdup.16 q8, d20[3] vst1.64 {d2,d3}, [r0], r1 @@ -459,10 +459,10 @@ loop_16x16_neon vadd.s16 q0, q0, q3 vadd.s16 q11, q8, q2 vadd.s16 q8, q8, q3 - vqshrun.s16 d2, q1, #0 - vqshrun.s16 d3, q0, #0 - vqshrun.s16 d22, q11, #0 - vqshrun.s16 d23, q8, #0 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 vdup.16 q0, d21[0] ; proload next 2 rows data vdup.16 q8, d21[1] vst1.64 {d2,d3}, [r0], r1 @@ -472,10 +472,10 @@ loop_16x16_neon vadd.s16 q0, q0, q3 vadd.s16 q11, q8, q2 vadd.s16 q8, q8, q3 - vqshrun.s16 d2, q1, #0 - vqshrun.s16 d3, q0, #0 - vqshrun.s16 d22, q11, #0 - vqshrun.s16 d23, q8, #0 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 vdup.16 q0, d21[2] ; proload next 2 rows data vdup.16 q8, d21[3] vst1.64 {d2,d3}, [r0], r1 @@ -486,13 +486,11 @@ loop_16x16_neon vadd.s16 q0, q0, q3 vadd.s16 q11, q8, q2 vadd.s16 q8, q8, q3 - vqshrun.s16 d2, q1, #0 - vqshrun.s16 d3, q0, #0 - vqshrun.s16 d22, q11, #0 - vqshrun.s16 d23, q8, #0 - vdup.16 q0, d20[2] - vdup.16 q8, d20[3] - vld1.8 d18, [r3]! ; preload 8 left into r12 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vld1.8 {d18}, [r3]! ; preload 8 left into r12 vmovl.u8 q10, d18 vst1.64 {d2,d3}, [r0], r1 vst1.64 {d22,d23}, [r0], r1 @@ -518,11 +516,11 @@ loop_16x16_neon vdup.u8 q0, r12 ; Load above 32 pixels - vld1.8 q1, [r2]! - vld1.8 q2, [r2] + vld1.8 {q1}, [r2]! + vld1.8 {q2}, [r2] ; preload 8 left pixels - vld1.8 d26, [r3]! + vld1.8 {d26}, [r3]! 
; Compute above - ytop_left vsubl.u8 q8, d2, d0 @@ -544,19 +542,19 @@ loop_32x32_neon vadd.s16 q13, q0, q9 vadd.s16 q14, q0, q10 vadd.s16 q15, q0, q11 - vqshrun.s16 d0, q12, #0 - vqshrun.s16 d1, q13, #0 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 vadd.s16 q12, q2, q8 vadd.s16 q13, q2, q9 - vqshrun.s16 d2, q14, #0 - vqshrun.s16 d3, q15, #0 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 vadd.s16 q14, q2, q10 vadd.s16 q15, q2, q11 vst1.64 {d0-d3}, [r0], r1 - vqshrun.s16 d24, q12, #0 - vqshrun.s16 d25, q13, #0 - vqshrun.s16 d26, q14, #0 - vqshrun.s16 d27, q15, #0 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 vdup.16 q1, d6[2] vdup.16 q2, d6[3] vst1.64 {d24-d27}, [r0], r1 @@ -566,19 +564,19 @@ loop_32x32_neon vadd.s16 q13, q1, q9 vadd.s16 q14, q1, q10 vadd.s16 q15, q1, q11 - vqshrun.s16 d0, q12, #0 - vqshrun.s16 d1, q13, #0 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 vadd.s16 q12, q2, q8 vadd.s16 q13, q2, q9 - vqshrun.s16 d2, q14, #0 - vqshrun.s16 d3, q15, #0 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 vadd.s16 q14, q2, q10 vadd.s16 q15, q2, q11 vst1.64 {d0-d3}, [r0], r1 - vqshrun.s16 d24, q12, #0 - vqshrun.s16 d25, q13, #0 - vqshrun.s16 d26, q14, #0 - vqshrun.s16 d27, q15, #0 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 vdup.16 q0, d7[0] vdup.16 q2, d7[1] vst1.64 {d24-d27}, [r0], r1 @@ -588,19 +586,19 @@ loop_32x32_neon vadd.s16 q13, q0, q9 vadd.s16 q14, q0, q10 vadd.s16 q15, q0, q11 - vqshrun.s16 d0, q12, #0 - vqshrun.s16 d1, q13, #0 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 vadd.s16 q12, q2, q8 vadd.s16 q13, q2, q9 - vqshrun.s16 d2, q14, #0 - vqshrun.s16 d3, q15, #0 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 vadd.s16 q14, q2, q10 vadd.s16 q15, q2, q11 vst1.64 {d0-d3}, [r0], r1 - vqshrun.s16 d24, q12, #0 - vqshrun.s16 d25, q13, #0 - vqshrun.s16 d26, q14, #0 - vqshrun.s16 d27, q15, #0 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 vdup.16 q0, d7[2] vdup.16 q2, d7[3] vst1.64 {d24-d27}, [r0], r1 @@ -610,20 +608,20 @@ loop_32x32_neon vadd.s16 q13, q0, q9 vadd.s16 q14, q0, q10 vadd.s16 q15, q0, q11 - vqshrun.s16 d0, q12, #0 - vqshrun.s16 d1, q13, #0 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 vadd.s16 q12, q2, q8 vadd.s16 q13, q2, q9 - vqshrun.s16 d2, q14, #0 - vqshrun.s16 d3, q15, #0 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 vadd.s16 q14, q2, q10 vadd.s16 q15, q2, q11 vst1.64 {d0-d3}, [r0], r1 - vqshrun.s16 d24, q12, #0 - vqshrun.s16 d25, q13, #0 - vld1.8 d0, [r3]! ; preload 8 left pixels - vqshrun.s16 d26, q14, #0 - vqshrun.s16 d27, q15, #0 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vld1.8 {d0}, [r3]! ; preload 8 left pixels + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 vmovl.u8 q3, d0 vst1.64 {d24-d27}, [r0], r1 diff --git a/vp9/common/generic/vp9_systemdependent.c b/vp9/common/generic/vp9_systemdependent.c deleted file mode 100644 index 536febb65..000000000 --- a/vp9/common/generic/vp9_systemdependent.c +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#include "./vpx_config.h" -#include "./vp9_rtcd.h" -#include "vp9/common/vp9_onyxc_int.h" - -void vp9_machine_specific_config(VP9_COMMON *cm) { - (void)cm; - vp9_rtcd(); -} diff --git a/vp9/common/mips/dspr2/vp9_common_dspr2.h b/vp9/common/mips/dspr2/vp9_common_dspr2.h index 991d3c2b3..6ebea9f2f 100644 --- a/vp9/common/mips/dspr2/vp9_common_dspr2.h +++ b/vp9/common/mips/dspr2/vp9_common_dspr2.h @@ -85,8 +85,8 @@ static INLINE void vp9_prefetch_store_streamed(unsigned char *dst) { ); } -void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride); +void vp9_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride); void vp9_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, diff --git a/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c index 1b2f5506a..19c582fd1 100644 --- a/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c +++ b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c @@ -19,8 +19,8 @@ #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" #if HAVE_DSPR2 -static void idct16_1d_rows_dspr2(const int16_t *input, int16_t *output, - uint32_t no_rows) { +static void idct16_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { int i; int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; int step1_10, step1_11, step1_12, step1_13; @@ -404,8 +404,8 @@ static void idct16_1d_rows_dspr2(const int16_t *input, int16_t *output, } } -static void idct16_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { +static void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { int i; int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; int step1_8, step1_9, step1_10, step1_11; @@ -905,13 +905,13 @@ void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, ); // First transform rows - idct16_1d_rows_dspr2(input, out, 16); + idct16_rows_dspr2(input, out, 16); // Then transform columns and add to dest - idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride); + idct16_cols_add_blk_dspr2(out, dest, dest_stride); } -static void iadst16_1d(const int16_t *input, int16_t *output) { +static void iadst16(const int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; int x0 = input[15]; @@ -1099,16 +1099,16 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, switch (tx_type) { case DCT_DCT: // DCT in both horizontal and vertical - idct16_1d_rows_dspr2(input, outptr, 16); - idct16_1d_cols_add_blk_dspr2(out, dest, pitch); + idct16_rows_dspr2(input, outptr, 16); + idct16_cols_add_blk_dspr2(out, dest, pitch); break; case ADST_DCT: // ADST in vertical, DCT in horizontal - idct16_1d_rows_dspr2(input, outptr, 16); + idct16_rows_dspr2(input, outptr, 16); outptr = out; for (i = 0; i < 16; ++i) { - iadst16_1d(outptr, temp_out); + iadst16(outptr, temp_out); for (j = 0; j < 16; ++j) dest[j * pitch + i] = @@ -1125,7 +1125,7 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, /* prefetch row */ vp9_prefetch_load((const uint8_t *)(input + 16)); - iadst16_1d(input, outptr); + iadst16(input, outptr); input += 16; outptr += 16; } @@ -1134,7 +1134,7 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, for (j = 0; j < 16; ++j) temp_in[j * 16 + i] = out[i * 16 + j]; - idct16_1d_cols_add_blk_dspr2(temp_in, dest, pitch); + idct16_cols_add_blk_dspr2(temp_in, dest, pitch); } break; case 
ADST_ADST: // ADST in both directions @@ -1145,7 +1145,7 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, /* prefetch row */ vp9_prefetch_load((const uint8_t *)(input + 16)); - iadst16_1d(input, outptr); + iadst16(input, outptr); input += 16; outptr += 16; } @@ -1153,7 +1153,7 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, for (i = 0; i < 16; ++i) { for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - iadst16_1d(temp_in, temp_out); + iadst16(temp_in, temp_out); for (j = 0; j < 16; ++j) dest[j * pitch + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) @@ -1183,7 +1183,7 @@ void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, // First transform rows. Since all non-zero dct coefficients are in // upper-left 4x4 area, we only need to calculate first 4 rows here. - idct16_1d_rows_dspr2(input, outptr, 4); + idct16_rows_dspr2(input, outptr, 4); outptr += 4; for (i = 0; i < 6; ++i) { @@ -1213,7 +1213,7 @@ void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, } // Then transform columns - idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride); + idct16_cols_add_blk_dspr2(out, dest, dest_stride); } void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, diff --git a/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c index 5e92db3d2..132d88ce5 100644 --- a/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c +++ b/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c @@ -18,8 +18,8 @@ #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" #if HAVE_DSPR2 -void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { +void vp9_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19; diff --git a/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c index bc6759400..74a90b02c 100644 --- a/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c +++ b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c @@ -19,8 +19,8 @@ #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" #if HAVE_DSPR2 -static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output, - uint32_t no_rows) { +static void idct32_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; @@ -882,10 +882,10 @@ void vp9_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, ); // Rows - idct32_1d_rows_dspr2(input, outptr, 32); + idct32_rows_dspr2(input, outptr, 32); // Columns - vp9_idct32_1d_cols_add_blk_dspr2(out, dest, dest_stride); + vp9_idct32_cols_add_blk_dspr2(out, dest, dest_stride); } void vp9_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, @@ -903,7 +903,7 @@ void vp9_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, ); // Rows - idct32_1d_rows_dspr2(input, outptr, 8); + idct32_rows_dspr2(input, outptr, 8); outptr += 8; __asm__ __volatile__ ( @@ -947,7 +947,7 @@ void vp9_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, } // Columns - vp9_idct32_1d_cols_add_blk_dspr2(out, dest, stride); + vp9_idct32_cols_add_blk_dspr2(out, dest, stride); } void 
vp9_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, diff --git a/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c index 5b7aa5e71..1990348b8 100644 --- a/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c +++ b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c @@ -19,7 +19,7 @@ #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" #if HAVE_DSPR2 -static void vp9_idct4_1d_rows_dspr2(const int16_t *input, int16_t *output) { +static void vp9_idct4_rows_dspr2(const int16_t *input, int16_t *output) { int16_t step_0, step_1, step_2, step_3; int Temp0, Temp1, Temp2, Temp3; const int const_2_power_13 = 8192; @@ -104,7 +104,7 @@ static void vp9_idct4_1d_rows_dspr2(const int16_t *input, int16_t *output) { } } -static void vp9_idct4_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, +static void vp9_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { int16_t step_0, step_1, step_2, step_3; int Temp0, Temp1, Temp2, Temp3; @@ -240,10 +240,10 @@ void vp9_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, ); // Rows - vp9_idct4_1d_rows_dspr2(input, outptr); + vp9_idct4_rows_dspr2(input, outptr); // Columns - vp9_idct4_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); + vp9_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride); } void vp9_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, @@ -319,7 +319,7 @@ void vp9_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, } } -static void iadst4_1d_dspr2(const int16_t *input, int16_t *output) { +static void iadst4_dspr2(const int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; int x0, x1, x2, x3; @@ -379,16 +379,16 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, switch (tx_type) { case DCT_DCT: // DCT in both horizontal and vertical - vp9_idct4_1d_rows_dspr2(input, outptr); - vp9_idct4_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); + vp9_idct4_rows_dspr2(input, outptr); + vp9_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride); break; case ADST_DCT: // ADST in vertical, DCT in horizontal - vp9_idct4_1d_rows_dspr2(input, outptr); + vp9_idct4_rows_dspr2(input, outptr); outptr = out; for (i = 0; i < 4; ++i) { - iadst4_1d_dspr2(outptr, temp_out); + iadst4_dspr2(outptr, temp_out); for (j = 0; j < 4; ++j) dest[j * dest_stride + i] = @@ -400,7 +400,7 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, break; case DCT_ADST: // DCT in vertical, ADST in horizontal for (i = 0; i < 4; ++i) { - iadst4_1d_dspr2(input, outptr); + iadst4_dspr2(input, outptr); input += 4; outptr += 4; } @@ -410,11 +410,11 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, temp_in[i * 4 + j] = out[j * 4 + i]; } } - vp9_idct4_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); + vp9_idct4_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); break; case ADST_ADST: // ADST in both directions for (i = 0; i < 4; ++i) { - iadst4_1d_dspr2(input, outptr); + iadst4_dspr2(input, outptr); input += 4; outptr += 4; } @@ -422,7 +422,7 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, for (i = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; - iadst4_1d_dspr2(temp_in, temp_out); + iadst4_dspr2(temp_in, temp_out); for (j = 0; j < 4; ++j) dest[j * dest_stride + i] = diff --git a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c index 93a08401d..acccaea6d 100644 --- a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c +++ b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c @@ 
-19,8 +19,8 @@ #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" #if HAVE_DSPR2 -static void idct8_1d_rows_dspr2(const int16_t *input, int16_t *output, - uint32_t no_rows) { +static void idct8_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; const int const_2_power_13 = 8192; int Temp0, Temp1, Temp2, Temp3, Temp4; @@ -200,8 +200,8 @@ static void idct8_1d_rows_dspr2(const int16_t *input, int16_t *output, } } -static void idct8_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { +static void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; int Temp0, Temp1, Temp2, Temp3; int i; @@ -462,13 +462,13 @@ void vp9_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, ); // First transform rows - idct8_1d_rows_dspr2(input, outptr, 8); + idct8_rows_dspr2(input, outptr, 8); // Then transform columns and add to dest - idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); + idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); } -static void iadst8_1d_dspr2(const int16_t *input, int16_t *output) { +static void iadst8_dspr2(const int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; int x0, x1, x2, x3, x4, x5, x6, x7; @@ -563,14 +563,14 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, switch (tx_type) { case DCT_DCT: // DCT in both horizontal and vertical - idct8_1d_rows_dspr2(input, outptr, 8); - idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); + idct8_rows_dspr2(input, outptr, 8); + idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); break; case ADST_DCT: // ADST in vertical, DCT in horizontal - idct8_1d_rows_dspr2(input, outptr, 8); + idct8_rows_dspr2(input, outptr, 8); for (i = 0; i < 8; ++i) { - iadst8_1d_dspr2(&out[i * 8], temp_out); + iadst8_dspr2(&out[i * 8], temp_out); for (j = 0; j < 8; ++j) dest[j * dest_stride + i] = @@ -580,7 +580,7 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, break; case DCT_ADST: // DCT in vertical, ADST in horizontal for (i = 0; i < 8; ++i) { - iadst8_1d_dspr2(input, outptr); + iadst8_dspr2(input, outptr); input += 8; outptr += 8; } @@ -590,11 +590,11 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, temp_in[i * 8 + j] = out[j * 8 + i]; } } - idct8_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); + idct8_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); break; case ADST_ADST: // ADST in both directions for (i = 0; i < 8; ++i) { - iadst8_1d_dspr2(input, outptr); + iadst8_dspr2(input, outptr); input += 8; outptr += 8; } @@ -603,7 +603,7 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - iadst8_1d_dspr2(temp_in, temp_out); + iadst8_dspr2(temp_in, temp_out); for (j = 0; j < 8; ++j) dest[j * dest_stride + i] = @@ -631,7 +631,7 @@ void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest, ); // First transform rows - idct8_1d_rows_dspr2(input, outptr, 4); + idct8_rows_dspr2(input, outptr, 4); outptr += 4; @@ -659,7 +659,7 @@ void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest, // Then transform columns and add to dest - idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); + idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); } void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, diff --git 
a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index e033fbb99..ff4b7c1f9 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -33,9 +33,16 @@ void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi) { void vp9_free_frame_buffers(VP9_COMMON *cm) { int i; - for (i = 0; i < FRAME_BUFFERS; i++) + for (i = 0; i < FRAME_BUFFERS; i++) { vp9_free_frame_buffer(&cm->frame_bufs[i].buf); + if (cm->frame_bufs[i].ref_count > 0 && + cm->frame_bufs[i].raw_frame_buffer.data != NULL) { + cm->release_fb_cb(cm->cb_priv, &cm->frame_bufs[i].raw_frame_buffer); + cm->frame_bufs[i].ref_count = 0; + } + } + vp9_free_frame_buffer(&cm->post_proc_buffer); vpx_free(cm->mip); @@ -85,7 +92,7 @@ int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height) { int mi_size; if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y, - VP9_DEC_BORDER_IN_PIXELS) < 0) + VP9_DEC_BORDER_IN_PIXELS, NULL, NULL, NULL) < 0) goto fail; set_mb_mi(cm, aligned_width, aligned_height); @@ -194,11 +201,12 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) { } void vp9_create_common(VP9_COMMON *cm) { - vp9_machine_specific_config(cm); + vp9_rtcd(); } void vp9_remove_common(VP9_COMMON *cm) { vp9_free_frame_buffers(cm); + vp9_free_internal_frame_buffers(&cm->int_frame_buffers); } void vp9_initialize_common() { diff --git a/vp9/common/vp9_blockd.c b/vp9/common/vp9_blockd.c index 8cc657114..d918bedc6 100644 --- a/vp9/common/vp9_blockd.c +++ b/vp9/common/vp9_blockd.c @@ -98,16 +98,6 @@ void vp9_foreach_transformed_block(const MACROBLOCKD* const xd, vp9_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg); } -void vp9_foreach_transformed_block_uv(const MACROBLOCKD* const xd, - BLOCK_SIZE bsize, - foreach_transformed_block_visitor visit, - void *arg) { - int plane; - - for (plane = 1; plane < MAX_MB_PLANE; plane++) - vp9_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg); -} - void vp9_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob, int aoff, int loff) { diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index 70b8ffa4e..6086323f6 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -89,7 +89,6 @@ static INLINE int is_inter_mode(MB_PREDICTION_MODE mode) { #define INTER_OFFSET(mode) ((mode) - NEARESTMV) - /* For keyframes, intra block modes are predicted by the (already decoded) modes for the Y blocks to the left and above us; for interframes, there is a single probability table. */ @@ -129,7 +128,7 @@ typedef struct { uint8_t mode_context[MAX_REF_FRAMES]; - unsigned char skip_coeff; // 0=need to decode coeffs, 1=no coefficients + unsigned char skip; // 0=need to decode coeffs, 1=no coefficients unsigned char segment_id; // Segment id for this block. 
// Flags used for prediction status of various bit-stream signals @@ -182,7 +181,7 @@ struct macroblockd_plane { int subsampling_y; struct buf_2d dst; struct buf_2d pre[2]; - int16_t *dequant; + const int16_t *dequant; ENTROPY_CONTEXT *above_context; ENTROPY_CONTEXT *left_context; }; @@ -314,11 +313,6 @@ void vp9_foreach_transformed_block( const MACROBLOCKD* const xd, BLOCK_SIZE bsize, foreach_transformed_block_visitor visit, void *arg); - -void vp9_foreach_transformed_block_uv( - const MACROBLOCKD* const xd, BLOCK_SIZE bsize, - foreach_transformed_block_visitor visit, void *arg); - static INLINE void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int block, int *x, int *y) { diff --git a/vp9/common/vp9_convolve.c b/vp9/common/vp9_convolve.c index 3807ccc87..d30e0b488 100644 --- a/vp9/common/vp9_convolve.c +++ b/vp9/common/vp9_convolve.c @@ -145,7 +145,7 @@ static const InterpKernel *get_filter_base(const int16_t *filter) { } static int get_filter_offset(const int16_t *f, const InterpKernel *base) { - return (const InterpKernel *)(intptr_t)f - base; + return (int)((const InterpKernel *)(intptr_t)f - base); } void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, diff --git a/vp9/common/vp9_debugmodes.c b/vp9/common/vp9_debugmodes.c index 355ac1a49..24c785f2a 100644 --- a/vp9/common/vp9_debugmodes.c +++ b/vp9/common/vp9_debugmodes.c @@ -58,7 +58,7 @@ void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, char *file) { print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type)); print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode)); - print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, skip_coeff)); + print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, skip)); print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0])); print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size)); print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode)); diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c index 13e954efe..bc12f9aa2 100644 --- a/vp9/common/vp9_entropy.c +++ b/vp9/common/vp9_entropy.c @@ -16,7 +16,7 @@ #include "vpx/vpx_integer.h" -DECLARE_ALIGNED(16, const uint8_t, vp9_coefband_trans_8x8plus[1024]) = { +const uint8_t vp9_coefband_trans_8x8plus[1024] = { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, // beyond MAXBAND_INDEX+1 all values are filled as 5 @@ -85,11 +85,11 @@ DECLARE_ALIGNED(16, const uint8_t, vp9_coefband_trans_8x8plus[1024]) = { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, }; -DECLARE_ALIGNED(16, const uint8_t, vp9_coefband_trans_4x4[16]) = { +const uint8_t vp9_coefband_trans_4x4[16] = { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, }; -DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[ENTROPY_TOKENS]) = { +const uint8_t vp9_pt_energy_class[ENTROPY_TOKENS] = { 0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5 }; diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h index e030d92ec..aab8b5388 100644 --- a/vp9/common/vp9_entropy.h +++ b/vp9/common/vp9_entropy.h @@ -42,7 +42,7 @@ extern "C" { #define ENTROPY_NODES 11 -extern DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[ENTROPY_TOKENS]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_pt_energy_class[ENTROPY_TOKENS]); #define EOB_MODEL_TOKEN 3 extern const vp9_tree_index vp9_coefmodel_tree[]; @@ -116,10 +116,10 @@ static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) { // This macro is currently unused but may be used by certain implementations #define MAXBAND_INDEX 21 
-extern DECLARE_ALIGNED(16, const uint8_t, vp9_coefband_trans_8x8plus[1024]); -extern DECLARE_ALIGNED(16, const uint8_t, vp9_coefband_trans_4x4[16]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_coefband_trans_8x8plus[1024]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_coefband_trans_4x4[16]); -static const uint8_t *get_band_translate(TX_SIZE tx_size) { +static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) { return tx_size == TX_4X4 ? vp9_coefband_trans_4x4 : vp9_coefband_trans_8x8plus; } @@ -146,8 +146,8 @@ typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS] void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full); -static int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, - const ENTROPY_CONTEXT *l) { +static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, + const ENTROPY_CONTEXT *l) { ENTROPY_CONTEXT above_ec = 0, left_ec = 0; switch (tx_size) { @@ -174,8 +174,8 @@ static int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, return combine_entropy_contexts(above_ec, left_ec); } -static const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, - PLANE_TYPE type, int block_idx) { +static const INLINE scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, + PLANE_TYPE type, int block_idx) { const MODE_INFO *const mi = xd->mi_8x8[0]; const MB_MODE_INFO *const mbmi = &mi->mbmi; diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c index 6def3c869..892153936 100644 --- a/vp9/common/vp9_entropymode.c +++ b/vp9/common/vp9_entropymode.c @@ -345,7 +345,7 @@ static int adapt_prob(vp9_prob pre_prob, const unsigned int ct[2]) { static void adapt_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, const unsigned int *counts, vp9_prob *probs) { - tree_merge_probs(tree, pre_probs, counts, COUNT_SAT, MAX_UPDATE_FACTOR, + vp9_tree_merge_probs(tree, pre_probs, counts, COUNT_SAT, MAX_UPDATE_FACTOR, probs); } @@ -465,8 +465,10 @@ void vp9_setup_past_independence(VP9_COMMON *cm) { cm->frame_contexts[cm->frame_context_idx] = cm->fc; } - vpx_memset(cm->prev_mip, 0, - cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO)); + if (frame_is_intra_only(cm)) + vpx_memset(cm->prev_mip, 0, + cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO)); + vpx_memset(cm->mip, 0, cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO)); diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c index 60ae79fdc..e1f5ef7b4 100644 --- a/vp9/common/vp9_entropymv.c +++ b/vp9/common/vp9_entropymv.c @@ -192,8 +192,8 @@ static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) { static void adapt_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, const unsigned int *counts, vp9_prob *probs) { - tree_merge_probs(tree, pre_probs, counts, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR, - probs); + vp9_tree_merge_probs(tree, pre_probs, counts, MV_COUNT_SAT, + MV_MAX_UPDATE_FACTOR, probs); } void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) { diff --git a/vp9/common/vp9_filter.c b/vp9/common/vp9_filter.c index 546f603b6..7474a88bc 100644 --- a/vp9/common/vp9_filter.c +++ b/vp9/common/vp9_filter.c @@ -10,12 +10,9 @@ #include <assert.h> -#include "vpx_ports/mem.h" - #include "vp9/common/vp9_filter.h" -DECLARE_ALIGNED(256, const InterpKernel, - vp9_bilinear_filters[SUBPEL_SHIFTS]) = { +const InterpKernel vp9_bilinear_filters[SUBPEL_SHIFTS] = { { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 0, 120, 8, 0, 0, 0 }, { 0, 0, 0, 112, 16, 0, 0, 0 }, @@ -35,8 +32,7 @@ 
DECLARE_ALIGNED(256, const InterpKernel, }; // Lagrangian interpolation filter -DECLARE_ALIGNED(256, const InterpKernel, - vp9_sub_pel_filters_8[SUBPEL_SHIFTS]) = { +const InterpKernel vp9_sub_pel_filters_8[SUBPEL_SHIFTS] = { { 0, 0, 0, 128, 0, 0, 0, 0}, { 0, 1, -5, 126, 8, -3, 1, 0}, { -1, 3, -10, 122, 18, -6, 2, 0}, @@ -56,8 +52,7 @@ DECLARE_ALIGNED(256, const InterpKernel, }; // DCT based filter -DECLARE_ALIGNED(256, const InterpKernel, - vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]) = { +const InterpKernel vp9_sub_pel_filters_8s[SUBPEL_SHIFTS] = { {0, 0, 0, 128, 0, 0, 0, 0}, {-1, 3, -7, 127, 8, -3, 1, 0}, {-2, 5, -13, 125, 17, -6, 3, -1}, @@ -77,8 +72,7 @@ DECLARE_ALIGNED(256, const InterpKernel, }; // freqmultiplier = 0.5 -DECLARE_ALIGNED(256, const InterpKernel, - vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]) = { +const InterpKernel vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS] = { { 0, 0, 0, 128, 0, 0, 0, 0}, {-3, -1, 32, 64, 38, 1, -3, 0}, {-2, -2, 29, 63, 41, 2, -3, 0}, diff --git a/vp9/common/vp9_filter.h b/vp9/common/vp9_filter.h index 15610d781..29d3867c9 100644 --- a/vp9/common/vp9_filter.h +++ b/vp9/common/vp9_filter.h @@ -13,6 +13,8 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + #ifdef __cplusplus extern "C" { @@ -37,10 +39,14 @@ typedef int16_t InterpKernel[SUBPEL_TAPS]; const InterpKernel *vp9_get_interp_kernel(INTERP_FILTER filter); -extern const InterpKernel vp9_bilinear_filters[SUBPEL_SHIFTS]; -extern const InterpKernel vp9_sub_pel_filters_8[SUBPEL_SHIFTS]; -extern const InterpKernel vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]; -extern const InterpKernel vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]; +DECLARE_ALIGNED(256, extern const InterpKernel, + vp9_bilinear_filters[SUBPEL_SHIFTS]); +DECLARE_ALIGNED(256, extern const InterpKernel, + vp9_sub_pel_filters_8[SUBPEL_SHIFTS]); +DECLARE_ALIGNED(256, extern const InterpKernel, + vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]); +DECLARE_ALIGNED(256, extern const InterpKernel, + vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]); // The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear // filter kernel as a 2 tap filter. diff --git a/vp9/common/vp9_frame_buffers.c b/vp9/common/vp9_frame_buffers.c new file mode 100644 index 000000000..dffeb8a22 --- /dev/null +++ b/vp9/common/vp9_frame_buffers.c @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> + +#include "vp9/common/vp9_frame_buffers.h" +#include "vpx_mem/vpx_mem.h" + +int vp9_alloc_internal_frame_buffers(InternalFrameBufferList *list) { + assert(list != NULL); + vp9_free_internal_frame_buffers(list); + + list->num_internal_frame_buffers = + VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS; + list->int_fb = vpx_calloc(list->num_internal_frame_buffers, + sizeof(*list->int_fb)); + return (list->int_fb == NULL); +} + +void vp9_free_internal_frame_buffers(InternalFrameBufferList *list) { + int i; + + assert(list != NULL); + + for (i = 0; i < list->num_internal_frame_buffers; ++i) { + vpx_free(list->int_fb[i].data); + list->int_fb[i].data = NULL; + } + vpx_free(list->int_fb); + list->int_fb = NULL; +} + +int vp9_get_frame_buffer(void *cb_priv, size_t min_size, + vpx_codec_frame_buffer_t *fb) { + int i; + InternalFrameBufferList *const int_fb_list = + (InternalFrameBufferList *)cb_priv; + if (int_fb_list == NULL) + return -1; + + // Find a free frame buffer. + for (i = 0; i < int_fb_list->num_internal_frame_buffers; ++i) { + if (!int_fb_list->int_fb[i].in_use) + break; + } + + if (i == int_fb_list->num_internal_frame_buffers) + return -1; + + if (int_fb_list->int_fb[i].size < min_size) { + int_fb_list->int_fb[i].data = + (uint8_t *)vpx_realloc(int_fb_list->int_fb[i].data, min_size); + if (!int_fb_list->int_fb[i].data) + return -1; + + int_fb_list->int_fb[i].size = min_size; + } + + fb->data = int_fb_list->int_fb[i].data; + fb->size = int_fb_list->int_fb[i].size; + int_fb_list->int_fb[i].in_use = 1; + + // Set the frame buffer's private data to point at the internal frame buffer. + fb->priv = &int_fb_list->int_fb[i]; + return 0; +} + +int vp9_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb) { + InternalFrameBuffer *const int_fb = (InternalFrameBuffer *)fb->priv; + (void)cb_priv; + int_fb->in_use = 0; + return 0; +} diff --git a/vp9/common/vp9_frame_buffers.h b/vp9/common/vp9_frame_buffers.h new file mode 100644 index 000000000..e2cfe61b6 --- /dev/null +++ b/vp9/common/vp9_frame_buffers.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_FRAME_BUFFERS_H_ +#define VP9_COMMON_VP9_FRAME_BUFFERS_H_ + +#include "vpx/vpx_frame_buffer.h" +#include "vpx/vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct InternalFrameBuffer { + uint8_t *data; + size_t size; + int in_use; +} InternalFrameBuffer; + +typedef struct InternalFrameBufferList { + int num_internal_frame_buffers; + InternalFrameBuffer *int_fb; +} InternalFrameBufferList; + +// Initializes |list|. Returns 0 on success. +int vp9_alloc_internal_frame_buffers(InternalFrameBufferList *list); + +// Free any data allocated to the frame buffers. +void vp9_free_internal_frame_buffers(InternalFrameBufferList *list); + +// Callback used by libvpx to request an external frame buffer. |cb_priv| +// Callback private data, which points to an InternalFrameBufferList. +// |min_size| is the minimum size in bytes needed to decode the next frame. +// |fb| pointer to the frame buffer. 
+int vp9_get_frame_buffer(void *cb_priv, size_t min_size, + vpx_codec_frame_buffer_t *fb); + +// Callback used by libvpx when there are no references to the frame buffer. +// |cb_priv| is not used. |fb| pointer to the frame buffer. +int vp9_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_FRAME_BUFFERS_H_ diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c index 07d7a92f6..868a66ae4 100644 --- a/vp9/common/vp9_loopfilter.c +++ b/vp9/common/vp9_loopfilter.c @@ -262,9 +262,9 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) { int lvl_seg = default_filt_lvl; if (vp9_segfeature_active(seg, seg_id, SEG_LVL_ALT_LF)) { const int data = vp9_get_segdata(seg, seg_id, SEG_LVL_ALT_LF); - lvl_seg = seg->abs_delta == SEGMENT_ABSDATA - ? data - : clamp(default_filt_lvl + data, 0, MAX_LOOP_FILTER); + lvl_seg = clamp(seg->abs_delta == SEGMENT_ABSDATA ? + data : default_filt_lvl + data, + 0, MAX_LOOP_FILTER); } if (!lf->mode_ref_delta_enabled) { @@ -496,7 +496,7 @@ static void build_masks(const loop_filter_info_n *const lfi_n, const BLOCK_SIZE block_size = mi->mbmi.sb_type; const TX_SIZE tx_size_y = mi->mbmi.tx_size; const TX_SIZE tx_size_uv = get_uv_tx_size(&mi->mbmi); - const int skip = mi->mbmi.skip_coeff; + const int skip = mi->mbmi.skip; const int seg = mi->mbmi.segment_id; const int ref = mi->mbmi.ref_frame[0]; const int filter_level = lfi_n->lvl[seg][ref][mode_lf_lut[mi->mbmi.mode]]; @@ -577,7 +577,7 @@ static void build_y_mask(const loop_filter_info_n *const lfi_n, LOOP_FILTER_MASK *lfm) { const BLOCK_SIZE block_size = mi->mbmi.sb_type; const TX_SIZE tx_size_y = mi->mbmi.tx_size; - const int skip = mi->mbmi.skip_coeff; + const int skip = mi->mbmi.skip; const int seg = mi->mbmi.segment_id; const int ref = mi->mbmi.ref_frame[0]; const int filter_level = lfi_n->lvl[seg][ref][mode_lf_lut[mi->mbmi.mode]]; @@ -868,7 +868,6 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, assert(!(lfm->int_4x4_uv & lfm->above_uv[TX_16X16])); } -#if CONFIG_NON420 static uint8_t build_lfi(const loop_filter_info_n *lfi_n, const MB_MODE_INFO *mbmi) { const int seg = mbmi->segment_id; @@ -937,8 +936,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm, for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) { const MODE_INFO *mi = mi_8x8[c]; const BLOCK_SIZE sb_type = mi[0].mbmi.sb_type; - const int skip_this = mi[0].mbmi.skip_coeff - && is_inter_block(&mi[0].mbmi); + const int skip_this = mi[0].mbmi.skip && is_inter_block(&mi[0].mbmi); // left edge of current unit is block/partition edge -> no skip const int block_edge_left = (num_4x4_blocks_wide_lookup[sb_type] > 1) ? !(c & (num_8x8_blocks_wide_lookup[sb_type] - 1)) : 1; @@ -1047,7 +1045,6 @@ static void filter_block_plane_non420(VP9_COMMON *cm, dst->buf += 8 * dst->stride; } } -#endif void vp9_filter_block_plane(VP9_COMMON *const cm, struct macroblockd_plane *const plane, @@ -1207,10 +1204,8 @@ void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer, const int num_planes = y_only ? 
1 : MAX_MB_PLANE; int mi_row, mi_col; LOOP_FILTER_MASK lfm; -#if CONFIG_NON420 int use_420 = y_only || (xd->plane[1].subsampling_y == 1 && xd->plane[1].subsampling_x == 1); -#endif for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) { MODE_INFO **mi_8x8 = cm->mi_grid_visible + mi_row * cm->mode_info_stride; @@ -1221,22 +1216,16 @@ void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer, setup_dst_planes(xd, frame_buffer, mi_row, mi_col); // TODO(JBB): Make setup_mask work for non 420. -#if CONFIG_NON420 if (use_420) -#endif vp9_setup_mask(cm, mi_row, mi_col, mi_8x8 + mi_col, cm->mode_info_stride, &lfm); for (plane = 0; plane < num_planes; ++plane) { -#if CONFIG_NON420 if (use_420) -#endif vp9_filter_block_plane(cm, &xd->plane[plane], mi_row, &lfm); -#if CONFIG_NON420 else filter_block_plane_non420(cm, &xd->plane[plane], mi_8x8 + mi_col, mi_row, mi_col); -#endif } } } diff --git a/vp9/common/vp9_mv.h b/vp9/common/vp9_mv.h index 98fd1d82f..3eb7f9d61 100644 --- a/vp9/common/vp9_mv.h +++ b/vp9/common/vp9_mv.h @@ -34,8 +34,8 @@ typedef struct mv32 { int32_t col; } MV32; -static void clamp_mv(MV *mv, int min_col, int max_col, - int min_row, int max_row) { +static INLINE void clamp_mv(MV *mv, int min_col, int max_col, + int min_row, int max_row) { mv->col = clamp(mv->col, min_col, max_col); mv->row = clamp(mv->row, min_row, max_row); } diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c index ff0262210..e5f3fed45 100644 --- a/vp9/common/vp9_mvref_common.c +++ b/vp9/common/vp9_mvref_common.c @@ -186,17 +186,17 @@ static INLINE int is_inside(const TileInfo *const tile, // This function searches the neighbourhood of a given MB/SB // to try and find candidate reference vectors. -void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, - const TileInfo *const tile, - MODE_INFO *mi, const MODE_INFO *prev_mi, - MV_REFERENCE_FRAME ref_frame, - int_mv *mv_ref_list, - int block_idx, - int mi_row, int mi_col) { +static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, + const TileInfo *const tile, + MODE_INFO *mi, const MODE_INFO *prev_mi, + MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, + int block_idx, int mi_row, int mi_col) { const int *ref_sign_bias = cm->ref_frame_sign_bias; int i, refmv_count = 0; const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type]; - const MB_MODE_INFO *const prev_mbmi = prev_mi ? &prev_mi->mbmi : NULL; + const MB_MODE_INFO *const prev_mbmi = cm->coding_use_prev_mi && prev_mi ? 
+ &prev_mi->mbmi : NULL; int different_ref_found = 0; int context_counter = 0; @@ -290,6 +290,16 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, clamp_mv_ref(&mv_ref_list[i].as_mv, xd); } +void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, + const TileInfo *const tile, + MODE_INFO *mi, const MODE_INFO *prev_mi, + MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, + int mi_row, int mi_col) { + find_mv_refs_idx(cm, xd, tile, mi, prev_mi, ref_frame, mv_ref_list, -1, + mi_row, mi_col); +} + static void lower_mv_precision(MV *mv, int allow_hp) { const int use_hp = allow_hp && vp9_use_mv_hp(mv); if (!use_hp) { @@ -324,8 +334,8 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, assert(MAX_MV_REF_CANDIDATES == 2); - vp9_find_mv_refs_idx(cm, xd, tile, mi, xd->last_mi, mi->mbmi.ref_frame[ref], - mv_list, block, mi_row, mi_col); + find_mv_refs_idx(cm, xd, tile, mi, xd->last_mi, mi->mbmi.ref_frame[ref], + mv_list, block, mi_row, mi_col); near->as_int = 0; switch (block) { diff --git a/vp9/common/vp9_mvref_common.h b/vp9/common/vp9_mvref_common.h index 0936abfcd..04cb000ef 100644 --- a/vp9/common/vp9_mvref_common.h +++ b/vp9/common/vp9_mvref_common.h @@ -17,29 +17,24 @@ extern "C" { #endif +#define LEFT_TOP_MARGIN ((VP9_ENC_BORDER_IN_PIXELS - VP9_INTERP_EXTEND) << 3) +#define RIGHT_BOTTOM_MARGIN ((VP9_ENC_BORDER_IN_PIXELS -\ + VP9_INTERP_EXTEND) << 3) -void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, - const TileInfo *const tile, - MODE_INFO *mi, const MODE_INFO *prev_mi, - MV_REFERENCE_FRAME ref_frame, - int_mv *mv_ref_list, - int block_idx, - int mi_row, int mi_col); - -static INLINE void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, - const TileInfo *const tile, - MODE_INFO *mi, const MODE_INFO *prev_mi, - MV_REFERENCE_FRAME ref_frame, - int_mv *mv_ref_list, - int mi_row, int mi_col) { - vp9_find_mv_refs_idx(cm, xd, tile, mi, prev_mi, ref_frame, - mv_ref_list, -1, mi_row, mi_col); +// TODO(jingning): this mv clamping function should be block size dependent. +static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { + clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN, + xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, + xd->mb_to_top_edge - LEFT_TOP_MARGIN, + xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); } -#define LEFT_TOP_MARGIN ((VP9_ENC_BORDER_IN_PIXELS \ - - VP9_INTERP_EXTEND) << 3) -#define RIGHT_BOTTOM_MARGIN ((VP9_ENC_BORDER_IN_PIXELS \ - - VP9_INTERP_EXTEND) << 3) +void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, + const TileInfo *const tile, + MODE_INFO *mi, const MODE_INFO *prev_mi, + MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, + int mi_row, int mi_col); // check a list of motion vectors by sad score using a number rows of pixels // above and a number cols of pixels in the left to select the one with best @@ -47,14 +42,6 @@ static INLINE void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, int_mv *mvlist, int_mv *nearest, int_mv *near); -// TODO(jingning): this mv clamping function should be block size dependent. 
-static void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { - clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN, - xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, - xd->mb_to_top_edge - LEFT_TOP_MARGIN, - xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); -} - void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, const TileInfo *const tile, int block, int ref, int mi_row, int mi_col, diff --git a/vp9/common/vp9_onyx.h b/vp9/common/vp9_onyx.h index 564e4195f..222086886 100644 --- a/vp9/common/vp9_onyx.h +++ b/vp9/common/vp9_onyx.h @@ -112,7 +112,6 @@ extern "C" { int auto_key; // autodetect cut scenes and set the keyframes int key_freq; // maximum distance to key frame. - int allow_lag; // allow lagged compression (if 0 lagin frames is ignored) int lag_in_frames; // how many frames lag before we start encoding // ---------------------------------------------------------------- @@ -147,8 +146,14 @@ extern "C" { // END DATARATE CONTROL OPTIONS // ---------------------------------------------------------------- - // Spatial scalability - int ss_number_layers; + // Spatial and temporal scalability. + int ss_number_layers; // Number of spatial layers. + int ts_number_layers; // Number of temporal layers. + // Bitrate allocation for spatial layers. + int ss_target_bitrate[VPX_SS_MAX_LAYERS]; + // Bitrate allocation (CBR mode) and framerate factor, for temporal layers. + int ts_target_bitrate[VPX_TS_MAX_LAYERS]; + int ts_rate_decimator[VPX_TS_MAX_LAYERS]; // these parameters aren't to be used in final build don't use!!! int play_alternate; diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h index d92a25b12..e6d6ea7f0 100644 --- a/vp9/common/vp9_onyxc_int.h +++ b/vp9/common/vp9_onyxc_int.h @@ -18,6 +18,7 @@ #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_frame_buffers.h" #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_tile_common.h" @@ -94,6 +95,7 @@ typedef enum { typedef struct { int ref_count; + vpx_codec_frame_buffer_t raw_frame_buffer; YV12_BUFFER_CONFIG buf; } RefCntBuffer; @@ -222,14 +224,27 @@ typedef struct VP9Common { int error_resilient_mode; int frame_parallel_decoding_mode; + // Flag indicates if prev_mi can be used in coding: + // 0: encoder assumes decoder does not have prev_mi + // 1: encoder assumes decoder has and uses prev_mi + unsigned int coding_use_prev_mi; + int log2_tile_cols, log2_tile_rows; + + // Private data associated with the frame buffer callbacks. + void *cb_priv; + vpx_get_frame_buffer_cb_fn_t get_fb_cb; + vpx_release_frame_buffer_cb_fn_t release_fb_cb; + + // Handles memory for the codec. 
+ InternalFrameBufferList int_frame_buffers; } VP9_COMMON; -static YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) { +static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) { return &cm->frame_bufs[cm->new_fb_idx].buf; } -static int get_free_fb(VP9_COMMON *cm) { +static INLINE int get_free_fb(VP9_COMMON *cm) { int i; for (i = 0; i < FRAME_BUFFERS; i++) if (cm->frame_bufs[i].ref_count == 0) @@ -240,7 +255,7 @@ static int get_free_fb(VP9_COMMON *cm) { return i; } -static void ref_cnt_fb(RefCntBuffer *bufs, int *idx, int new_idx) { +static INLINE void ref_cnt_fb(RefCntBuffer *bufs, int *idx, int new_idx) { const int ref_index = *idx; if (ref_index >= 0 && bufs[ref_index].ref_count > 0) @@ -251,7 +266,7 @@ static void ref_cnt_fb(RefCntBuffer *bufs, int *idx, int new_idx) { bufs[new_idx].ref_count++; } -static int mi_cols_aligned_to_sb(int n_mis) { +static INLINE int mi_cols_aligned_to_sb(int n_mis) { return ALIGN_POWER_OF_TWO(n_mis, MI_BLOCK_SIZE_LOG2); } @@ -275,10 +290,10 @@ static INLINE void set_skip_context( } } -static void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, - int mi_row, int bh, - int mi_col, int bw, - int mi_rows, int mi_cols) { +static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, + int mi_row, int bh, + int mi_col, int bw, + int mi_rows, int mi_cols) { xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); xd->mb_to_bottom_edge = ((mi_rows - bh - mi_row) * MI_SIZE) * 8; xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); @@ -292,7 +307,6 @@ static void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, static void set_prev_mi(VP9_COMMON *cm) { const int use_prev_in_find_mv_refs = cm->width == cm->last_width && cm->height == cm->last_height && - !cm->error_resilient_mode && !cm->intra_only && cm->last_show_frame; // Special case: set prev_mi to NULL when the previous mode info diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c index a172ba6a2..7baa9ee33 100644 --- a/vp9/common/vp9_postproc.c +++ b/vp9/common/vp9_postproc.c @@ -700,7 +700,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, char zz[4]; int dc_diff = !(mi[mb_index].mbmi.mode != I4X4_PRED && mi[mb_index].mbmi.mode != SPLITMV && - mi[mb_index].mbmi.skip_coeff); + mi[mb_index].mbmi.skip); if (cm->frame_type == KEY_FRAME) snprintf(zz, sizeof(zz) - 1, "a"); diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c index 11b6d93c1..197bcb643 100644 --- a/vp9/common/vp9_pred_common.c +++ b/vp9/common/vp9_pred_common.c @@ -218,27 +218,25 @@ int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) { } else { // inter/inter const int above_has_second = has_second_ref(above_mbmi); const int left_has_second = has_second_ref(left_mbmi); + const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0]; + const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1]; + const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0]; + const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1]; if (above_has_second && left_has_second) { - pred_context = 1 + (above_mbmi->ref_frame[0] == LAST_FRAME || - above_mbmi->ref_frame[1] == LAST_FRAME || - left_mbmi->ref_frame[0] == LAST_FRAME || - left_mbmi->ref_frame[1] == LAST_FRAME); + pred_context = 1 + (above0 == LAST_FRAME || above1 == LAST_FRAME || + left0 == LAST_FRAME || left1 == LAST_FRAME); } else if (above_has_second || left_has_second) { - const MV_REFERENCE_FRAME rfs = !above_has_second ? 
- above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME crf1 = above_has_second ? - above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME crf2 = above_has_second ? - above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1]; + const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0; + const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0; + const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1; if (rfs == LAST_FRAME) pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME); else pred_context = (crf1 == LAST_FRAME || crf2 == LAST_FRAME); } else { - pred_context = 2 * (above_mbmi->ref_frame[0] == LAST_FRAME) + - 2 * (left_mbmi->ref_frame[0] == LAST_FRAME); + pred_context = 2 * (above0 == LAST_FRAME) + 2 * (left0 == LAST_FRAME); } } } else if (has_above || has_left) { // one edge available @@ -291,23 +289,23 @@ int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { } else { // inter/inter const int above_has_second = has_second_ref(above_mbmi); const int left_has_second = has_second_ref(left_mbmi); + const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0]; + const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1]; + const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0]; + const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1]; if (above_has_second && left_has_second) { - if (above_mbmi->ref_frame[0] == left_mbmi->ref_frame[0] && - above_mbmi->ref_frame[1] == left_mbmi->ref_frame[1]) - pred_context = 3 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME || - above_mbmi->ref_frame[1] == GOLDEN_FRAME || - left_mbmi->ref_frame[0] == GOLDEN_FRAME || - left_mbmi->ref_frame[1] == GOLDEN_FRAME); + if (above0 == left0 && above1 == left1) + pred_context = 3 * (above0 == GOLDEN_FRAME || + above1 == GOLDEN_FRAME || + left0 == GOLDEN_FRAME || + left1 == GOLDEN_FRAME); else pred_context = 2; } else if (above_has_second || left_has_second) { - const MV_REFERENCE_FRAME rfs = !above_has_second ? - above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME crf1 = above_has_second ? - above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME crf2 = above_has_second ? - above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1]; + const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0; + const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0; + const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1; if (rfs == GOLDEN_FRAME) pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME); @@ -316,17 +314,15 @@ int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { else pred_context = 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME); } else { - if (above_mbmi->ref_frame[0] == LAST_FRAME && - left_mbmi->ref_frame[0] == LAST_FRAME) { + if (above0 == LAST_FRAME && left0 == LAST_FRAME) { pred_context = 3; - } else if (above_mbmi->ref_frame[0] == LAST_FRAME || - left_mbmi->ref_frame[0] == LAST_FRAME) { - const MB_MODE_INFO *edge_mbmi = - above_mbmi->ref_frame[0] == LAST_FRAME ? left_mbmi : above_mbmi; - pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME); + } else if (above0 == LAST_FRAME || left0 == LAST_FRAME) { + const MV_REFERENCE_FRAME edge0 = (above0 == LAST_FRAME) ? 
left0 + : above0; + pred_context = 4 * (edge0 == GOLDEN_FRAME); } else { - pred_context = 2 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME) + - 2 * (left_mbmi->ref_frame[0] == GOLDEN_FRAME); + pred_context = 2 * (above0 == GOLDEN_FRAME) + + 2 * (left0 == GOLDEN_FRAME); } } } @@ -357,10 +353,10 @@ int vp9_get_tx_size_context(const MACROBLOCKD *xd) { const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); const int has_above = above_mbmi != NULL; const int has_left = left_mbmi != NULL; - int above_ctx = (has_above && !above_mbmi->skip_coeff) ? above_mbmi->tx_size - : max_tx_size; - int left_ctx = (has_left && !left_mbmi->skip_coeff) ? left_mbmi->tx_size - : max_tx_size; + int above_ctx = (has_above && !above_mbmi->skip) ? above_mbmi->tx_size + : max_tx_size; + int left_ctx = (has_left && !left_mbmi->skip) ? left_mbmi->tx_size + : max_tx_size; if (!has_left) left_ctx = above_ctx; diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h index 0acee32f8..6c7a0d383 100644 --- a/vp9/common/vp9_pred_common.h +++ b/vp9/common/vp9_pred_common.h @@ -39,7 +39,7 @@ static INLINE int vp9_get_pred_context_seg_id(const MACROBLOCKD *xd) { return above_sip + left_sip; } -static INLINE vp9_prob vp9_get_pred_prob_seg_id(struct segmentation *seg, +static INLINE vp9_prob vp9_get_pred_prob_seg_id(const struct segmentation *seg, const MACROBLOCKD *xd) { return seg->pred_probs[vp9_get_pred_context_seg_id(xd)]; } @@ -47,8 +47,8 @@ static INLINE vp9_prob vp9_get_pred_prob_seg_id(struct segmentation *seg, static INLINE int vp9_get_skip_context(const MACROBLOCKD *xd) { const MODE_INFO *const above_mi = get_above_mi(xd); const MODE_INFO *const left_mi = get_left_mi(xd); - const int above_skip = (above_mi != NULL) ? above_mi->mbmi.skip_coeff : 0; - const int left_skip = (left_mi != NULL) ? left_mi->mbmi.skip_coeff : 0; + const int above_skip = (above_mi != NULL) ? above_mi->mbmi.skip : 0; + const int left_skip = (left_mi != NULL) ? 
left_mi->mbmi.skip : 0; return above_skip + left_skip; } @@ -98,8 +98,8 @@ static INLINE vp9_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm, int vp9_get_tx_size_context(const MACROBLOCKD *xd); -static const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx, - const struct tx_probs *tx_probs) { +static INLINE const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx, + const struct tx_probs *tx_probs) { switch (max_tx_size) { case TX_8X8: return tx_probs->p8x8[ctx]; @@ -113,13 +113,14 @@ static const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx, } } -static const vp9_prob *get_tx_probs2(TX_SIZE max_tx_size, const MACROBLOCKD *xd, - const struct tx_probs *tx_probs) { +static INLINE const vp9_prob *get_tx_probs2(TX_SIZE max_tx_size, + const MACROBLOCKD *xd, + const struct tx_probs *tx_probs) { return get_tx_probs(max_tx_size, vp9_get_tx_size_context(xd), tx_probs); } -static unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx, - struct tx_counts *tx_counts) { +static INLINE unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx, + struct tx_counts *tx_counts) { switch (max_tx_size) { case TX_8X8: return tx_counts->p8x8[ctx]; diff --git a/vp9/common/vp9_prob.c b/vp9/common/vp9_prob.c index 884884e0b..a1befc63e 100644 --- a/vp9/common/vp9_prob.c +++ b/vp9/common/vp9_prob.c @@ -10,7 +10,7 @@ #include "vp9/common/vp9_prob.h" -DECLARE_ALIGNED(16, const uint8_t, vp9_norm[256]) = { +const uint8_t vp9_norm[256] = { 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, @@ -28,3 +28,34 @@ DECLARE_ALIGNED(16, const uint8_t, vp9_norm[256]) = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + + +static unsigned int tree_merge_probs_impl(unsigned int i, + const vp9_tree_index *tree, + const vp9_prob *pre_probs, + const unsigned int *counts, + unsigned int count_sat, + unsigned int max_update, + vp9_prob *probs) { + const int l = tree[i]; + const unsigned int left_count = (l <= 0) + ? counts[-l] + : tree_merge_probs_impl(l, tree, pre_probs, counts, + count_sat, max_update, probs); + const int r = tree[i + 1]; + const unsigned int right_count = (r <= 0) + ? counts[-r] + : tree_merge_probs_impl(r, tree, pre_probs, counts, + count_sat, max_update, probs); + const unsigned int ct[2] = { left_count, right_count }; + probs[i >> 1] = merge_probs(pre_probs[i >> 1], ct, + count_sat, max_update); + return left_count + right_count; +} + +void vp9_tree_merge_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, + const unsigned int *counts, unsigned int count_sat, + unsigned int max_update_factor, vp9_prob *probs) { + tree_merge_probs_impl(0, tree, pre_probs, counts, count_sat, + max_update_factor, probs); +} diff --git a/vp9/common/vp9_prob.h b/vp9/common/vp9_prob.h index cc8d8ab38..f36148035 100644 --- a/vp9/common/vp9_prob.h +++ b/vp9/common/vp9_prob.h @@ -79,37 +79,10 @@ static INLINE vp9_prob merge_probs(vp9_prob pre_prob, return weighted_prob(pre_prob, prob, factor); } -static unsigned int tree_merge_probs_impl(unsigned int i, - const vp9_tree_index *tree, - const vp9_prob *pre_probs, - const unsigned int *counts, - unsigned int count_sat, - unsigned int max_update_factor, - vp9_prob *probs) { - const int l = tree[i]; - const unsigned int left_count = (l <= 0) - ? 
counts[-l] - : tree_merge_probs_impl(l, tree, pre_probs, counts, - count_sat, max_update_factor, probs); - const int r = tree[i + 1]; - const unsigned int right_count = (r <= 0) - ? counts[-r] - : tree_merge_probs_impl(r, tree, pre_probs, counts, - count_sat, max_update_factor, probs); - const unsigned int ct[2] = { left_count, right_count }; - probs[i >> 1] = merge_probs(pre_probs[i >> 1], ct, - count_sat, max_update_factor); - return left_count + right_count; -} +void vp9_tree_merge_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, + const unsigned int *counts, unsigned int count_sat, + unsigned int max_update_factor, vp9_prob *probs); -static void tree_merge_probs(const vp9_tree_index *tree, - const vp9_prob *pre_probs, - const unsigned int *counts, - unsigned int count_sat, - unsigned int max_update_factor, vp9_prob *probs) { - tree_merge_probs_impl(0, tree, pre_probs, counts, - count_sat, max_update_factor, probs); -} DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]); diff --git a/vp9/common/vp9_quant_common.c b/vp9/common/vp9_quant_common.c index 6dbdb4216..def12554d 100644 --- a/vp9/common/vp9_quant_common.c +++ b/vp9/common/vp9_quant_common.c @@ -130,12 +130,13 @@ int16_t vp9_ac_quant(int qindex, int delta) { } -int vp9_get_qindex(struct segmentation *seg, int segment_id, int base_qindex) { +int vp9_get_qindex(const struct segmentation *seg, int segment_id, + int base_qindex) { if (vp9_segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) { const int data = vp9_get_segdata(seg, segment_id, SEG_LVL_ALT_Q); - return seg->abs_delta == SEGMENT_ABSDATA ? - data : // Abs value - clamp(base_qindex + data, 0, MAXQ); // Delta value + const int seg_qindex = seg->abs_delta == SEGMENT_ABSDATA ? + data : base_qindex + data; + return clamp(seg_qindex, 0, MAXQ); } else { return base_qindex; } diff --git a/vp9/common/vp9_quant_common.h b/vp9/common/vp9_quant_common.h index af50e23cd..581104006 100644 --- a/vp9/common/vp9_quant_common.h +++ b/vp9/common/vp9_quant_common.h @@ -27,7 +27,8 @@ void vp9_init_quant_tables(); int16_t vp9_dc_quant(int qindex, int delta); int16_t vp9_ac_quant(int qindex, int delta); -int vp9_get_qindex(struct segmentation *seg, int segment_id, int base_qindex); +int vp9_get_qindex(const struct segmentation *seg, int segment_id, + int base_qindex); #ifdef __cplusplus } // extern "C" diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c index db20f19d9..df603ad70 100644 --- a/vp9/common/vp9_reconinter.c +++ b/vp9/common/vp9_reconinter.c @@ -139,9 +139,6 @@ MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv, return clamped_mv; } -// TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could -// calculate the subsampled BLOCK_SIZE, but that type isn't defined for -// sizes smaller than 16x16 yet. 
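The vp9_get_qindex() rewrite a few hunks above changes where the clamp is applied: with SEG_LVL_ALT_Q active, the segment value is first resolved (absolute value, or base_qindex plus delta) and the result is clamped to [0, MAXQ] in both cases, whereas the old code only clamped the delta path. A minimal scalar sketch of the resulting behaviour follows; MAXQ = 255 mirrors vp9_quant_common.h, and the function and parameter names here are illustrative only, not library API.

/* Sketch of the post-change vp9_get_qindex() logic: seg_qindex is clamped
 * regardless of whether the segment carries an absolute or a delta value.
 * 255 stands in for MAXQ; helper names are hypothetical. */
static int get_qindex_sketch(int seg_feature_active, int abs_delta_is_absdata,
                             int seg_data, int base_qindex) {
  if (seg_feature_active) {
    const int seg_qindex = abs_delta_is_absdata ? seg_data
                                                : base_qindex + seg_data;
    return seg_qindex < 0 ? 0 : (seg_qindex > 255 ? 255 : seg_qindex);
  }
  return base_qindex;
}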
static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block, int bw, int bh, int x, int y, int w, int h, @@ -270,8 +267,8 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block, : mi_mv_pred_q4(mi, ref)) : mi->mbmi.mv[ref].as_mv; MV32 scaled_mv; - int xs, ys, x0, y0, x0_16, y0_16, x1, y1, frame_width, - frame_height, subpel_x, subpel_y, buf_stride; + int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height, buf_stride, + subpel_x, subpel_y; uint8_t *ref_frame, *buf_ptr; const YV12_BUFFER_CONFIG *ref_buf = xd->block_refs[ref]->buf; const MV mv_q4 = { @@ -321,10 +318,6 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block, x0_16 += scaled_mv.col; y0_16 += scaled_mv.row; - // Get reference block bottom right coordinate. - x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1; - y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1; - // Get reference block pointer. buf_ptr = ref_frame + y0 * pre_buf->stride + x0; buf_stride = pre_buf->stride; @@ -333,6 +326,9 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block, // width/height is not a multiple of 8 pixels. if (scaled_mv.col || scaled_mv.row || (frame_width & 0x7) || (frame_height & 0x7)) { + // Get reference block bottom right coordinate. + int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1; + int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1; int x_pad = 0, y_pad = 0; if (subpel_x || (sf->x_step_q4 & SUBPEL_MASK)) { diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h index bf738c28b..dccd60938 100644 --- a/vp9/common/vp9_reconinter.h +++ b/vp9/common/vp9_reconinter.h @@ -39,18 +39,18 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride, enum mv_precision precision, int x, int y); -static int scaled_buffer_offset(int x_offset, int y_offset, int stride, - const struct scale_factors *sf) { +static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride, + const struct scale_factors *sf) { const int x = sf ? sf->scale_value_x(x_offset, sf) : x_offset; const int y = sf ? 
sf->scale_value_y(y_offset, sf) : y_offset; return y * stride + x; } -static void setup_pred_plane(struct buf_2d *dst, - uint8_t *src, int stride, - int mi_row, int mi_col, - const struct scale_factors *scale, - int subsampling_x, int subsampling_y) { +static INLINE void setup_pred_plane(struct buf_2d *dst, + uint8_t *src, int stride, + int mi_row, int mi_col, + const struct scale_factors *scale, + int subsampling_x, int subsampling_y) { const int x = (MI_SIZE * mi_col) >> subsampling_x; const int y = (MI_SIZE * mi_row) >> subsampling_y; dst->buf = src + scaled_buffer_offset(x, y, stride, scale); diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c index 96ba3e464..71a41a9de 100644 --- a/vp9/common/vp9_reconintra.c +++ b/vp9/common/vp9_reconintra.c @@ -382,34 +382,34 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, /* slower path if the block needs border extension */ if (x0 + 2 * bs <= frame_width) { if (right_available && bs == 4) { - vpx_memcpy(above_row - 1, above_ref - 1, 2 * bs + 1); + vpx_memcpy(above_row, above_ref, 2 * bs); } else { - vpx_memcpy(above_row - 1, above_ref - 1, bs + 1); + vpx_memcpy(above_row, above_ref, bs); vpx_memset(above_row + bs, above_row[bs - 1], bs); } } else if (x0 + bs <= frame_width) { const int r = frame_width - x0; if (right_available && bs == 4) { - vpx_memcpy(above_row - 1, above_ref - 1, r + 1); + vpx_memcpy(above_row, above_ref, r); vpx_memset(above_row + r, above_row[r - 1], x0 + 2 * bs - frame_width); } else { - vpx_memcpy(above_row - 1, above_ref - 1, bs + 1); + vpx_memcpy(above_row, above_ref, bs); vpx_memset(above_row + bs, above_row[bs - 1], bs); } } else if (x0 <= frame_width) { const int r = frame_width - x0; if (right_available && bs == 4) { - vpx_memcpy(above_row - 1, above_ref - 1, r + 1); + vpx_memcpy(above_row, above_ref, r); vpx_memset(above_row + r, above_row[r - 1], x0 + 2 * bs - frame_width); } else { - vpx_memcpy(above_row - 1, above_ref - 1, r + 1); + vpx_memcpy(above_row, above_ref, r); vpx_memset(above_row + r, above_row[r - 1], x0 + 2 * bs - frame_width); } - above_row[-1] = left_available ? above_ref[-1] : 129; } + above_row[-1] = left_available ? 
above_ref[-1] : 129; } else { /* faster path if the block does not need extension */ if (bs == 4 && right_available && left_available) { diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 04a40bd58..4031bda55 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -264,13 +264,13 @@ prototype void vp9_convolve_avg "const uint8_t *src, ptrdiff_t src_stride, uint8 specialize vp9_convolve_avg $sse2_x86inc neon dspr2 prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8 sse2 ssse3 neon dspr2 +specialize vp9_convolve8 sse2 ssse3 avx2 neon dspr2 prototype void vp9_convolve8_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_horiz sse2 ssse3 neon dspr2 +specialize vp9_convolve8_horiz sse2 ssse3 avx2 neon dspr2 prototype void vp9_convolve8_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_vert sse2 ssse3 neon dspr2 +specialize vp9_convolve8_vert sse2 ssse3 avx2 neon dspr2 prototype void vp9_convolve8_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" specialize vp9_convolve8_avg sse2 ssse3 neon dspr2 @@ -386,7 +386,7 @@ prototype unsigned int vp9_variance4x4 "const uint8_t *src_ptr, int source_strid specialize vp9_variance4x4 mmx $sse2_x86inc prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance64x64 $sse2_x86inc $ssse3_x86inc +specialize vp9_sub_pixel_variance64x64 $sse2_x86inc $ssse3_x86inc avx2 prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance64x64 $sse2_x86inc $ssse3_x86inc @@ -416,7 +416,7 @@ prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, specialize vp9_sub_pixel_avg_variance16x32 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x32 $sse2_x86inc $ssse3_x86inc +specialize vp9_sub_pixel_variance32x32 $sse2_x86inc $ssse3_x86inc avx2 prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance32x32 $sse2_x86inc $ssse3_x86inc @@ -707,14 +707,14 @@ if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then fi # fdct functions -prototype void vp9_short_fht4x4 "const int16_t *input, int16_t *output, int stride, int tx_type" -specialize vp9_short_fht4x4 sse2 avx2 +prototype void vp9_fht4x4 "const int16_t *input, int16_t *output, int stride, int tx_type" +specialize vp9_fht4x4 sse2 avx2 -prototype void 
vp9_short_fht8x8 "const int16_t *input, int16_t *output, int stride, int tx_type" -specialize vp9_short_fht8x8 sse2 avx2 +prototype void vp9_fht8x8 "const int16_t *input, int16_t *output, int stride, int tx_type" +specialize vp9_fht8x8 sse2 avx2 -prototype void vp9_short_fht16x16 "const int16_t *input, int16_t *output, int stride, int tx_type" -specialize vp9_short_fht16x16 sse2 avx2 +prototype void vp9_fht16x16 "const int16_t *input, int16_t *output, int stride, int tx_type" +specialize vp9_fht16x16 sse2 avx2 prototype void vp9_fwht4x4 "const int16_t *input, int16_t *output, int stride" specialize vp9_fwht4x4 @@ -737,20 +737,20 @@ specialize vp9_fdct32x32_rd sse2 avx2 # # Motion search # -prototype int vp9_full_search_sad "const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv, int n" +prototype int vp9_full_search_sad "const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv, struct mv *best_mv" specialize vp9_full_search_sad sse3 sse4_1 vp9_full_search_sad_sse3=vp9_full_search_sadx3 vp9_full_search_sad_sse4_1=vp9_full_search_sadx8 -prototype int vp9_refining_search_sad "const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv" +prototype int vp9_refining_search_sad "const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv" specialize vp9_refining_search_sad sse3 vp9_refining_search_sad_sse3=vp9_refining_search_sadx4 -prototype int vp9_diamond_search_sad "const struct macroblock *x, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv" +prototype int vp9_diamond_search_sad "const struct macroblock *x, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv" specialize vp9_diamond_search_sad sse3 vp9_diamond_search_sad_sse3=vp9_diamond_search_sadx4 -prototype int vp9_full_range_search "const struct macroblock *x, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv" +prototype int vp9_full_range_search "const struct macroblock *x, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv" specialize vp9_full_range_search prototype void vp9_temporal_filter_apply "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count" diff --git a/vp9/common/vp9_scale.h b/vp9/common/vp9_scale.h index 90b0d0bf9..a9dda1889 100644 --- a/vp9/common/vp9_scale.h +++ b/vp9/common/vp9_scale.h @@ -40,12 +40,12 @@ void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, int other_h, int this_w, int this_h); -static int vp9_is_valid_scale(const struct scale_factors *sf) { +static INLINE int vp9_is_valid_scale(const struct scale_factors *sf) { return sf->x_scale_fp != REF_INVALID_SCALE && sf->y_scale_fp != REF_INVALID_SCALE; } -static int vp9_is_scaled(const struct 
scale_factors *sf) { +static INLINE int vp9_is_scaled(const struct scale_factors *sf) { return sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE; } diff --git a/vp9/common/vp9_systemdependent.h b/vp9/common/vp9_systemdependent.h index ee9a4823b..72edbca55 100644 --- a/vp9/common/vp9_systemdependent.h +++ b/vp9/common/vp9_systemdependent.h @@ -11,13 +11,17 @@ #ifndef VP9_COMMON_VP9_SYSTEMDEPENDENT_H_ #define VP9_COMMON_VP9_SYSTEMDEPENDENT_H_ -#ifdef __cplusplus -extern "C" { +#ifdef _MSC_VER +# if _MSC_VER > 1310 && (defined(_M_X64) || defined(_M_IX86)) +# include <intrin.h> +# define USE_MSC_INTRIN +# endif +# include <math.h> +# define snprintf _snprintf #endif -#ifdef _MSC_VER -#include <math.h> -#define snprintf _snprintf +#ifdef __cplusplus +extern "C" { #endif #include "./vpx_config.h" @@ -30,7 +34,7 @@ void vpx_reset_mmx_state(void); #if defined(_MSC_VER) && _MSC_VER < 1800 // round is not defined in MSVC before VS2013. -static int round(double x) { +static INLINE int round(double x) { if (x < 0) return (int)ceil(x - 0.5); else @@ -44,9 +48,7 @@ static int round(double x) { static INLINE int get_msb(unsigned int n) { return 31 ^ __builtin_clz(n); } -#elif defined(_MSC_VER) && _MSC_VER > 1310 && \ - (defined(_M_X64) || defined(_M_IX86)) -#include <intrin.h> +#elif defined(USE_MSC_INTRIN) #pragma intrinsic(_BitScanReverse) static INLINE int get_msb(unsigned int n) { @@ -54,6 +56,7 @@ static INLINE int get_msb(unsigned int n) { _BitScanReverse(&first_set_bit, n); return first_set_bit; } +#undef USE_MSC_INTRIN #else // Returns (int)floor(log2(n)). n must be > 0. static INLINE int get_msb(unsigned int n) { @@ -73,9 +76,6 @@ static INLINE int get_msb(unsigned int n) { } #endif -struct VP9Common; -void vp9_machine_specific_config(struct VP9Common *cm); - #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c index 8ab5fb1bc..1b4904c39 100644 --- a/vp9/common/x86/vp9_asm_stubs.c +++ b/vp9/common/x86/vp9_asm_stubs.c @@ -16,15 +16,15 @@ typedef void filter8_1dfunction ( const unsigned char *src_ptr, - const unsigned int src_pitch, + const ptrdiff_t src_pitch, unsigned char *output_ptr, - unsigned int out_pitch, + ptrdiff_t out_pitch, unsigned int output_height, const short *filter ); -#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt1, opt2) \ -void vp9_convolve8_##name##_##opt1(const uint8_t *src, ptrdiff_t src_stride, \ +#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ + void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \ uint8_t *dst, ptrdiff_t dst_stride, \ const int16_t *filter_x, int x_step_q4, \ const int16_t *filter_y, int y_step_q4, \ @@ -32,50 +32,68 @@ void vp9_convolve8_##name##_##opt1(const uint8_t *src, ptrdiff_t src_stride, \ if (step_q4 == 16 && filter[3] != 128) { \ if (filter[0] || filter[1] || filter[2]) { \ while (w >= 16) { \ - vp9_filter_block1d16_##dir##8_##avg##opt1(src_start, src_stride, \ - dst, dst_stride, \ - h, filter); \ + vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ src += 16; \ dst += 16; \ w -= 16; \ } \ while (w >= 8) { \ - vp9_filter_block1d8_##dir##8_##avg##opt1(src_start, src_stride, \ - dst, dst_stride, \ - h, filter); \ + vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ src += 8; \ dst += 8; \ w -= 8; \ } \ while (w >= 4) { \ - vp9_filter_block1d4_##dir##8_##avg##opt1(src_start, 
src_stride, \ - dst, dst_stride, \ - h, filter); \ + vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ src += 4; \ dst += 4; \ w -= 4; \ } \ } else { \ while (w >= 16) { \ - vp9_filter_block1d16_##dir##2_##avg##opt2(src, src_stride, \ - dst, dst_stride, \ - h, filter); \ + vp9_filter_block1d16_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ src += 16; \ dst += 16; \ w -= 16; \ } \ while (w >= 8) { \ - vp9_filter_block1d8_##dir##2_##avg##opt2(src, src_stride, \ - dst, dst_stride, \ - h, filter); \ + vp9_filter_block1d8_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ src += 8; \ dst += 8; \ w -= 8; \ } \ while (w >= 4) { \ - vp9_filter_block1d4_##dir##2_##avg##opt2(src, src_stride, \ - dst, dst_stride, \ - h, filter); \ + vp9_filter_block1d4_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ src += 4; \ dst += 4; \ w -= 4; \ @@ -121,14 +139,79 @@ void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ filter_x, x_step_q4, filter_y, y_step_q4, w, h); \ } \ } +#if HAVE_AVX2 +filter8_1dfunction vp9_filter_block1d16_v8_avx2; +filter8_1dfunction vp9_filter_block1d16_h8_avx2; +filter8_1dfunction vp9_filter_block1d4_v8_ssse3; +#if (ARCH_X86_64) +filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; +#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3 +#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3 +#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3 +#else +filter8_1dfunction vp9_filter_block1d8_v8_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_ssse3; +#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3 +#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3 +#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3 +#endif +filter8_1dfunction vp9_filter_block1d16_v2_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_ssse3; +#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3 +#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3 +#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3 +#define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3 +#define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3 +#define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3 +#define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3 +// void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); +// void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t 
dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, avx2); +#endif #if HAVE_SSSE3 +#if (ARCH_X86_64) +filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d4_v8_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; +#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3 +#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3 +#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3 +#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3 +#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3 +#else filter8_1dfunction vp9_filter_block1d16_v8_ssse3; filter8_1dfunction vp9_filter_block1d16_h8_ssse3; filter8_1dfunction vp9_filter_block1d8_v8_ssse3; filter8_1dfunction vp9_filter_block1d8_h8_ssse3; filter8_1dfunction vp9_filter_block1d4_v8_ssse3; filter8_1dfunction vp9_filter_block1d4_h8_ssse3; +#endif filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3; filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3; filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3; @@ -136,18 +219,18 @@ filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; -filter8_1dfunction vp9_filter_block1d16_v2_sse2; -filter8_1dfunction vp9_filter_block1d16_h2_sse2; -filter8_1dfunction vp9_filter_block1d8_v2_sse2; -filter8_1dfunction vp9_filter_block1d8_h2_sse2; -filter8_1dfunction vp9_filter_block1d4_v2_sse2; -filter8_1dfunction vp9_filter_block1d4_h2_sse2; -filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2; -filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2; -filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2; -filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2; -filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2; -filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2; +filter8_1dfunction vp9_filter_block1d16_v2_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_ssse3; +filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3; // void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, @@ -169,11 +252,11 @@ filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2; // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, // int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3, sse2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3, sse2); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3, sse2); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); +FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src 
- src_stride * 3, avg_, - ssse3, sse2); + ssse3); // void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, @@ -236,11 +319,10 @@ filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2; // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, // int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2, sse2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2, sse2); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2, sse2); -FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2, - sse2); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); +FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); +FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); // void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, diff --git a/vp9/common/x86/vp9_loopfilter_mmx.asm b/vp9/common/x86/vp9_loopfilter_mmx.asm index a7f69307d..91055b9f9 100644 --- a/vp9/common/x86/vp9_loopfilter_mmx.asm +++ b/vp9/common/x86/vp9_loopfilter_mmx.asm @@ -527,7 +527,7 @@ sym(vp9_lpf_vertical_4_mmx): pxor mm7, [GLOBAL(t80)] ; unoffset ; mm7 = q1 - ; tranpose and write back + ; transpose and write back ; mm1 = 72 62 52 42 32 22 12 02 ; mm6 = 73 63 53 43 33 23 13 03 ; mm3 = 74 64 54 44 34 24 14 04 diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c new file mode 100644 index 000000000..efa960c66 --- /dev/null +++ b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c @@ -0,0 +1,545 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <immintrin.h> +#include "vpx_ports/mem.h" + +// filters for 16_h8 and 16_v8 +DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = { + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1, srcReg32b2, filtersReg32; + unsigned int i; + unsigned int src_stride, dst_stride; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register +#if defined (__GNUC__) +#if ( __GNUC__ < 4 || (__GNUC__ == 4 && \ +(__GNUC_MINOR__ < 6 || (__GNUC_MINOR__ == 6 && __GNUC_PATCHLEVEL__ > 0)))) + filtersReg32 = _mm_broadcastsi128_si256((__m128i const *)&filtersReg); +#elif(__GNUC__ == 4 && (__GNUC_MINOR__ == 7 && __GNUC_PATCHLEVEL__ > 0)) + filtersReg32 = _mm_broadcastsi128_si256(filtersReg); +#else + filtersReg32 = _mm256_broadcastsi128_si256(filtersReg); +#endif +#else + filtersReg32 = _mm256_broadcastsi128_si256(filtersReg); +#endif + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x706u)); + + filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i-=2) { + // load the 2 strides of source + srcReg32b1 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr-3))); + srcReg32b1 = 
_mm256_inserti128_si256(srcReg32b1, + _mm_loadu_si128((__m128i *) + (src_ptr+src_pixels_per_line-3)), 1); + + // filter the source buffer + srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + // filter the source buffer + srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); + srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, + _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + // reading 2 strides of the next 16 bytes + // (part of it was being read by earlier read) + srcReg32b2 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+5))); + srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, + _mm_loadu_si128((__m128i *) + (src_ptr+src_pixels_per_line+5)), 1); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, + _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + // filter the source buffer + srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); + + // filter the source buffer + srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt4Reg); + srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, + _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, + _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); + + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); + srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, + srcRegFilt32b2_1); + + src_ptr+=src_stride; + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, + _mm256_castsi256_si128(srcRegFilt32b1_1)); + + // save the next 16 bits + _mm_store_si128((__m128i*)(output_ptr+output_pitch), + _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); + output_ptr+=dst_stride; + } + + // if the 
number of strides is odd. + // process only 16 bytes + if (i > 0) { + __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; + __m128i srcRegFilt2, srcRegFilt3; + + srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, + _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, + _mm256_castsi256_si128(filt2Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, + _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, + _mm256_castsi256_si128(secondFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3= _mm_shuffle_epi8(srcReg1, + _mm256_castsi256_si128(filt4Reg)); + srcRegFilt2= _mm_shuffle_epi8(srcReg1, + _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, + _mm256_castsi256_si128(forthFilters)); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + + // reading the next 16 bytes + // (part of it was being read by earlier read) + srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + + // filter the source buffer + srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, + _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg2, + _mm256_castsi256_si128(filt2Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, + _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, + _mm256_castsi256_si128(secondFilters)); + + // add and saturate the results together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3 = _mm_shuffle_epi8(srcReg2, + _mm256_castsi256_si128(filt4Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg2, + _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, + _mm256_castsi256_si128(forthFilters)); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + + + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm256_castsi256_si128(addFilterReg64)); + + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm256_castsi256_si128(addFilterReg64)); + + // shift by 7 bit each 16 bit + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); + srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); + } +} + +void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr, + unsigned int src_pitch, + unsigned 
char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg64; + __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; + __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; + __m256i srcReg32b11, srcReg32b12, srcReg32b13, filtersReg32; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + unsigned int i; + unsigned int src_stride, dst_stride; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register +#if defined (__GNUC__) +#if ( __GNUC__ < 4 || (__GNUC__ == 4 && \ +(__GNUC_MINOR__ < 6 || (__GNUC_MINOR__ == 6 && __GNUC_PATCHLEVEL__ > 0)))) + filtersReg32 = _mm_broadcastsi128_si256((__m128i const *)&filtersReg); +#elif(__GNUC__ == 4 && (__GNUC_MINOR__ == 7 && __GNUC_PATCHLEVEL__ > 0)) + filtersReg32 = _mm_broadcastsi128_si256(filtersReg); +#else + filtersReg32 = _mm256_broadcastsi128_si256(filtersReg); +#endif +#else + filtersReg32 = _mm256_broadcastsi128_si256(filtersReg); +#endif + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x706u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + // load 16 bytes 7 times in stride of src_pitch + srcReg32b1 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr))); + srcReg32b2 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch))); + srcReg32b3 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2))); + srcReg32b4 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3))); + srcReg32b5 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4))); + srcReg32b6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5))); + srcReg32b7 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6))); + + // have each consecutive loads on the same 256 register + srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, + _mm256_castsi256_si128(srcReg32b2), 1); + srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, + _mm256_castsi256_si128(srcReg32b3), 1); + srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, + _mm256_castsi256_si128(srcReg32b4), 1); + srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, + _mm256_castsi256_si128(srcReg32b5), 1); + srcReg32b5 = _mm256_inserti128_si256(srcReg32b5, + _mm256_castsi256_si128(srcReg32b6), 1); + srcReg32b6 = _mm256_inserti128_si256(srcReg32b6, + _mm256_castsi256_si128(srcReg32b7), 1); + + // merge every two consecutive registers except the 
last one + srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); + srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); + + // save + srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); + + // save + srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); + + // save + srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); + + // save + srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); + + + for (i = output_height; i > 1; i-=2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg32b8 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7))); + srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, + _mm256_castsi256_si128(srcReg32b8), 1); + srcReg32b9 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*8))); + srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, + _mm256_castsi256_si128(srcReg32b9), 1); + + // merge every two consecutive registers + // save + srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); + srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); + srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b8); + + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); + srcReg32b13 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); + + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_min_epi16(srcReg32b8, srcReg32b12)); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, + _mm256_min_epi16(srcReg32b6, srcReg32b13)); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_max_epi16(srcReg32b8, srcReg32b12)); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, + _mm256_max_epi16(srcReg32b6, srcReg32b13)); + + + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); + srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); + + src_ptr+=src_stride; + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, + _mm256_castsi256_si128(srcReg32b1)); + + // save the next 16 bits + _mm_store_si128((__m128i*)(output_ptr+out_pitch), + _mm256_extractf128_si256(srcReg32b1, 1)); + + output_ptr+=dst_stride; + + // save part of the registers for next strides + srcReg32b10 = srcReg32b11; + srcReg32b1 = srcReg32b3; + srcReg32b11 = srcReg32b2; + srcReg32b3 = srcReg32b5; + srcReg32b2 = srcReg32b4; + srcReg32b5 = srcReg32b7; + srcReg32b7 = srcReg32b9; + } + if (i > 0) { + __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, 
srcRegFilt5; + __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; + // load the last 16 bytes + srcRegFilt8 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)); + + // merge the last 2 results together + srcRegFilt4 = _mm_unpacklo_epi8( + _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + srcRegFilt7 = _mm_unpackhi_epi8( + _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, + _mm256_castsi256_si128(forthFilters)); + srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, + _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7); + + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), + _mm256_castsi256_si128(secondFilters)); + srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3), + _mm256_castsi256_si128(secondFilters)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), + _mm256_castsi256_si128(thirdFilters)); + srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5), + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm_min_epi16(srcRegFilt4, srcRegFilt6)); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, + _mm_min_epi16(srcRegFilt5, srcRegFilt7)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm_max_epi16(srcRegFilt4, srcRegFilt6)); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, + _mm_max_epi16(srcRegFilt5, srcRegFilt7)); + + + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm256_castsi256_si128(addFilterReg64)); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, + _mm256_castsi256_si128(addFilterReg64)); + + // shift by 7 bit each 16 bit + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); + } +} diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c new file mode 100644 index 000000000..cf28d8d2b --- /dev/null +++ b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c @@ -0,0 +1,490 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <tmmintrin.h> +#include "vpx_ports/mem.h" +#include "vpx_ports/emmintrin_compat.h" + +// filters only for the 4_h8 convolution +DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 +}; + +// filters for 8_h8 and 16_h8 +DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = { + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i firstFilters, secondFilters, thirdFilters, forthFilters; + __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; + __m128i addFilterReg64, filtersReg, srcReg, minReg; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 =_mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits in the filter into the first lane + firstFilters = _mm_shufflelo_epi16(filtersReg, 0); + // duplicate only the third 16 bit in the filter into the first lane + secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); + // duplicate only the seconds 16 bits in the filter into the second lane + firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); + // duplicate only the forth 16 bits in the filter into the second lane + secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); + + // loading the local filters + thirdFilters =_mm_load_si128((__m128i const *)filt1_4_h8); + forthFilters = _mm_load_si128((__m128i const *)filt2_4_h8); + + for (i = 0; i < output_height; i++) { + srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); + + // filter the source buffer + srcRegFilt1= _mm_shuffle_epi8(srcReg, thirdFilters); + srcRegFilt2= _mm_shuffle_epi8(srcReg, forthFilters); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); + + // extract the higher half of the lane + srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); + srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); + + minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2); + + // add and saturate all the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bit each 16 bits + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + src_ptr+=src_pixels_per_line; + + // save only 4 bytes + 
*((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1); + + output_ptr+=output_pitch; + } +} + +void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; + __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; + __m128i addFilterReg64, filtersReg, minReg; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 128 bit register + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 128 bit register + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 128 bit register + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 128 bit register + forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + + filt1Reg = _mm_load_si128((__m128i const *)filt1_global); + filt2Reg = _mm_load_si128((__m128i const *)filt2_global); + filt3Reg = _mm_load_si128((__m128i const *)filt3_global); + filt4Reg = _mm_load_si128((__m128i const *)filt4_global); + + for (i = 0; i < output_height; i++) { + srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); + + // filter the source buffer + srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); + + // filter the source buffer + srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg); + srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); + srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); + + // add and saturate all the results together + minReg = _mm_min_epi16(srcRegFilt4, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); + + srcRegFilt4= _mm_max_epi16(srcRegFilt4, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bit each 16 bits + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + + src_ptr+=src_pixels_per_line; + + // save only 8 bytes + _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); + + output_ptr+=output_pitch; + } +} + +void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i addFilterReg64, filtersReg, srcReg1, srcReg2; + __m128i 
filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m128i firstFilters, secondFilters, thirdFilters, forthFilters; + __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 128 bit register + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 128 bit register + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 128 bit register + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 128 bit register + forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + + filt1Reg = _mm_load_si128((__m128i const *)filt1_global); + filt2Reg = _mm_load_si128((__m128i const *)filt2_global); + filt3Reg = _mm_load_si128((__m128i const *)filt3_global); + filt4Reg = _mm_load_si128((__m128i const *)filt4_global); + + for (i = 0; i < output_height; i++) { + srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); + + // filter the source buffer + srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt2Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt4Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + + // reading the next 16 bytes. 
+ // (part of it was being read by earlier read) + srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + + // filter the source buffer + srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt2Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); + + // add and saturate the results together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt4Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); + + // add and saturate the results together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64); + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); + srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); + + src_ptr+=src_pixels_per_line; + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); + + output_ptr+=output_pitch; + } +} + +void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, + unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i addFilterReg64, filtersReg, minReg, srcRegFilt6; + __m128i firstFilters, secondFilters, thirdFilters, forthFilters; + __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4, srcRegFilt5; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits in the filter + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); + // duplicate only the second 16 bits in the filter + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits in the filter + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + // duplicate only the forth 16 bits in the filter + forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + + for (i = 0; i < output_height; i++) { + // load the first 8 bytes + srcRegFilt1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]); + // load the next 8 bytes in stride of src_pitch + srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch)[0]); + srcRegFilt3 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*2)[0]); + srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*3)[0]); + + // merge the result together + srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2); + srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4); + + // load the next 8 bytes in stride of src_pitch + srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*4)[0]); + srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*5)[0]); + srcRegFilt5 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*6)[0]); + srcRegFilt6 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*7)[0]); + + // merge the result together + srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4); + srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); + srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters); + + // add and saturate the results together + minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5); + srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + + src_ptr+=src_pitch; + + // save only 8 bytes convolve result + _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); + + output_ptr+=out_pitch; + } +} + +void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, + unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3; + __m128i firstFilters, secondFilters, thirdFilters, forthFilters; + __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits in the filter + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); + // duplicate only the second 16 bits in the filter + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits in the filter + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + // duplicate only the forth 16 bits in the filter + forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + + for (i = 0; i < output_height; i++) { + // load the first 16 bytes + srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr)); + // load the next 16 bytes in stride of src_pitch + srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch)); + srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6)); + srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)); + + // merge the result together + srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2); + srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4); + srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2); + srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters); + srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters); + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); + + // add and saturate the results together + srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); + + // load the next 16 bytes in stride of two/three src_pitch + srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2)); + srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3)); + + // merge the result together + srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3); + srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters); + srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters); + + // load the next 16 bytes in stride of four/five src_pitch + srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4)); + srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5)); + + // merge the result together + srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3); + srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters); + srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters); + + // add and saturate the results together + srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, + _mm_min_epi16(srcRegFilt4, srcRegFilt7)); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm_min_epi16(srcRegFilt6, srcRegFilt8)); + + // add and saturate the results together + srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, + _mm_max_epi16(srcRegFilt4, srcRegFilt7)); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm_max_epi16(srcRegFilt6, srcRegFilt8)); + srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7); + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // 
convolve result and the second lane contain the second convolve + // result + srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1); + + src_ptr+=src_pitch; + + // save 16 bytes convolve result + _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); + + output_ptr+=out_pitch; + } +} diff --git a/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm b/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm new file mode 100644 index 000000000..b5e18fe6d --- /dev/null +++ b/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm @@ -0,0 +1,422 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "vpx_ports/x86_abi_support.asm" + +%macro GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm3, [rdx] ;load filters + psrldq xmm3, 6 + packsswb xmm3, xmm3 + pshuflw xmm3, xmm3, 0b ;k3_k4 + + movq xmm2, rcx ;rounding + pshufd xmm2, xmm2, 0 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_4 1 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm3 + + paddsw xmm0, xmm2 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movd [rdi], xmm0 + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm7, [rdx] ;load filters + psrldq xmm7, 6 + packsswb xmm7, xmm7 + pshuflw xmm7, xmm7, 0b ;k3_k4 + punpcklwd xmm7, xmm7 + + movq xmm6, rcx ;rounding + pshufd xmm6, xmm6, 0 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_8 1 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm7 + + paddsw xmm0, xmm6 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack back to byte + +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movq [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro APPLY_FILTER_16 1 + punpcklbw xmm0, xmm1 + punpckhbw xmm2, xmm1 + pmaddubsw xmm0, xmm7 + pmaddubsw xmm2, xmm7 + + paddsw xmm0, xmm6 ;rounding + paddsw xmm2, xmm6 + psraw xmm0, 7 ;shift + psraw xmm2, 7 + packuswb xmm0, xmm2 ;pack back to byte + +%if %1 + movdqu xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +global sym(vp9_filter_block1d4_v2_ssse3) PRIVATE +sym(vp9_filter_block1d4_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_v2_ssse3) PRIVATE +sym(vp9_filter_block1d8_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 0 + 
jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_v2_ssse3) PRIVATE +sym(vp9_filter_block1d16_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_v2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d4_v2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_v2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d8_v2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_v2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d16_v2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_h2_ssse3) PRIVATE +sym(vp9_filter_block1d4_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_h2_ssse3) PRIVATE +sym(vp9_filter_block1d8_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_h2_ssse3) PRIVATE +sym(vp9_filter_block1d16_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_h2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d4_h2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_h2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d8_h2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 1 + jnz .loop + + ; 
begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_h2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d16_h2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 93ef7503f..e52b3f759 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -15,6 +15,7 @@ #include "./vpx_scale_rtcd.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem_ops.h" #include "vpx_scale/vpx_scale.h" #include "vp9/common/vp9_alloccommon.h" @@ -39,20 +40,16 @@ #include "vp9/decoder/vp9_reader.h" #include "vp9/decoder/vp9_thread.h" -static int read_be32(const uint8_t *p) { - return (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]; -} - static int is_compound_reference_allowed(const VP9_COMMON *cm) { int i; for (i = 1; i < REFS_PER_FRAME; ++i) - if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1]) + if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1]) return 1; return 0; } -static void setup_compound_reference(VP9_COMMON *cm) { +static void setup_compound_reference_mode(VP9_COMMON *cm) { if (cm->ref_frame_sign_bias[LAST_FRAME] == cm->ref_frame_sign_bias[GOLDEN_FRAME]) { cm->comp_fixed_ref = ALTREF_FRAME; @@ -116,33 +113,34 @@ static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp9_reader *r) { vp9_diff_update_prob(r, &fc->inter_mode_probs[i][j]); } -static REFERENCE_MODE read_reference_mode(VP9_COMMON *cm, vp9_reader *r) { +static REFERENCE_MODE read_frame_reference_mode(const VP9_COMMON *cm, + vp9_reader *r) { if (is_compound_reference_allowed(cm)) { - REFERENCE_MODE mode = vp9_read_bit(r); - if (mode) - mode += vp9_read_bit(r); - setup_compound_reference(cm); - return mode; + return vp9_read_bit(r) ? (vp9_read_bit(r) ? 
REFERENCE_MODE_SELECT + : COMPOUND_REFERENCE) + : SINGLE_REFERENCE; } else { return SINGLE_REFERENCE; } } -static void read_reference_mode_probs(VP9_COMMON *cm, vp9_reader *r) { +static void read_frame_reference_mode_probs(VP9_COMMON *cm, vp9_reader *r) { + FRAME_CONTEXT *const fc = &cm->fc; int i; + if (cm->reference_mode == REFERENCE_MODE_SELECT) - for (i = 0; i < COMP_INTER_CONTEXTS; i++) - vp9_diff_update_prob(r, &cm->fc.comp_inter_prob[i]); + for (i = 0; i < COMP_INTER_CONTEXTS; ++i) + vp9_diff_update_prob(r, &fc->comp_inter_prob[i]); if (cm->reference_mode != COMPOUND_REFERENCE) - for (i = 0; i < REF_CONTEXTS; i++) { - vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][0]); - vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][1]); + for (i = 0; i < REF_CONTEXTS; ++i) { + vp9_diff_update_prob(r, &fc->single_ref_prob[i][0]); + vp9_diff_update_prob(r, &fc->single_ref_prob[i][1]); } if (cm->reference_mode != SINGLE_REFERENCE) - for (i = 0; i < REF_CONTEXTS; i++) - vp9_diff_update_prob(r, &cm->fc.comp_ref_prob[i]); + for (i = 0; i < REF_CONTEXTS; ++i) + vp9_diff_update_prob(r, &fc->comp_ref_prob[i]); } static void update_mv_probs(vp9_prob *p, int n, vp9_reader *r) { @@ -303,7 +301,7 @@ static void predict_and_reconstruct_intra_block(int plane, int block, dst, pd->dst.stride, dst, pd->dst.stride, x, y, plane); - if (!mi->mbmi.skip_coeff) { + if (!mi->mbmi.skip) { const int eob = vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, x, y, tx_size, args->r); @@ -350,9 +348,9 @@ static void set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd, xd->mi_8x8 = cm->mi_grid_visible + offset; xd->prev_mi_8x8 = cm->prev_mi_grid_visible + offset; - // Special case: if prev_mi is NULL, the previous mode info context - // cannot be used. - xd->last_mi = cm->prev_mi ? xd->prev_mi_8x8[0] : NULL; + + xd->last_mi = cm->coding_use_prev_mi && cm->prev_mi ? 
+ xd->prev_mi_8x8[0] : NULL; xd->mi_8x8[0] = xd->mi_stream + offset - tile_offset; xd->mi_8x8[0]->mbmi.sb_type = bsize; @@ -397,7 +395,7 @@ static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd, // Has to be called after set_offsets mbmi = &xd->mi_8x8[0]->mbmi; - if (mbmi->skip_coeff) { + if (mbmi->skip) { reset_skip_context(xd, bsize); } else { if (cm->seg.enabled) @@ -421,12 +419,12 @@ static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd, vp9_dec_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); // Reconstruction - if (!mbmi->skip_coeff) { + if (!mbmi->skip) { int eobtotal = 0; struct inter_args arg = { cm, xd, r, &eobtotal }; vp9_foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg); if (!less8x8 && eobtotal == 0) - mbmi->skip_coeff = 1; // skip loopfilter + mbmi->skip = 1; // skip loopfilter } } @@ -691,9 +689,14 @@ static void apply_frame_size(VP9D_COMP *pbi, int width, int height) { vp9_update_frame_size(cm); } - vp9_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, - VP9_DEC_BORDER_IN_PIXELS); + if (vp9_realloc_frame_buffer( + get_frame_new_buffer(cm), cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, VP9_DEC_BORDER_IN_PIXELS, + &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb, + cm->cb_priv)) { + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + } } static void setup_frame_size(VP9D_COMP *pbi, @@ -831,7 +834,7 @@ static size_t get_tile(const uint8_t *const data_end, vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME, "Truncated packet or corrupt tile length"); - size = read_be32(*data); + size = mem_get_be32(*data); *data += 4; if (size > (size_t)(data_end - *data)) @@ -1114,7 +1117,13 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi, cm->show_existing_frame = vp9_rb_read_bit(rb); if (cm->show_existing_frame) { // Show an existing frame directly. 
- int frame_to_show = cm->ref_frame_map[vp9_rb_read_literal(rb, 3)]; + const int frame_to_show = cm->ref_frame_map[vp9_rb_read_literal(rb, 3)]; + + if (cm->frame_bufs[frame_to_show].ref_count < 1) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Buffer %d does not contain a decoded frame", + frame_to_show); + ref_cnt_fb(cm->frame_bufs, &cm->new_fb_idx, frame_to_show); pbi->refresh_frame_flags = 0; cm->lf.filter_level = 0; @@ -1198,9 +1207,11 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi, } if (!cm->error_resilient_mode) { + cm->coding_use_prev_mi = 1; cm->refresh_frame_context = vp9_rb_read_bit(rb); cm->frame_parallel_decoding_mode = vp9_rb_read_bit(rb); } else { + cm->coding_use_prev_mi = 0; cm->refresh_frame_context = 0; cm->frame_parallel_decoding_mode = 1; } @@ -1258,8 +1269,10 @@ static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data, for (i = 0; i < INTRA_INTER_CONTEXTS; i++) vp9_diff_update_prob(&r, &fc->intra_inter_prob[i]); - cm->reference_mode = read_reference_mode(cm, &r); - read_reference_mode_probs(cm, &r); + cm->reference_mode = read_frame_reference_mode(cm, &r); + if (cm->reference_mode != SINGLE_REFERENCE) + setup_compound_reference_mode(cm); + read_frame_reference_mode_probs(cm, &r); for (j = 0; j < BLOCK_SIZE_GROUPS; j++) for (i = 0; i < INTRA_MODES - 1; ++i) @@ -1368,7 +1381,10 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { alloc_tile_storage(pbi, tile_rows, tile_cols); xd->mode_info_stride = cm->mode_info_stride; - set_prev_mi(cm); + if (cm->coding_use_prev_mi) + set_prev_mi(cm); + else + cm->prev_mi = NULL; setup_plane_dequants(cm, xd, cm->base_qindex); vp9_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y); diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index c7fb71ddf..0fb7a1580 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -146,8 +146,8 @@ static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd, return segment_id; } -static int read_skip_coeff(VP9_COMMON *cm, const MACROBLOCKD *xd, - int segment_id, vp9_reader *r) { +static int read_skip(VP9_COMMON *cm, const MACROBLOCKD *xd, + int segment_id, vp9_reader *r) { if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { return 1; } else { @@ -169,7 +169,7 @@ static void read_intra_frame_mode_info(VP9_COMMON *const cm, const BLOCK_SIZE bsize = mbmi->sb_type; mbmi->segment_id = read_intra_segment_id(cm, xd, mi_row, mi_col, r); - mbmi->skip_coeff = read_skip_coeff(cm, xd, mbmi->segment_id, r); + mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r); mbmi->tx_size = read_tx_size(cm, xd, cm->tx_mode, bsize, 1, r); mbmi->ref_frame[0] = INTRA_FRAME; mbmi->ref_frame[1] = NONE; @@ -257,13 +257,18 @@ static INLINE void read_mv(vp9_reader *r, MV *mv, const MV *ref, mv->col = ref->col + diff.col; } -static REFERENCE_MODE read_reference_mode(VP9_COMMON *cm, const MACROBLOCKD *xd, - vp9_reader *r) { - const int ctx = vp9_get_reference_mode_context(cm, xd); - const int mode = vp9_read(r, cm->fc.comp_inter_prob[ctx]); - if (!cm->frame_parallel_decoding_mode) - ++cm->counts.comp_inter[ctx][mode]; - return mode; // SINGLE_REFERENCE or COMPOUND_REFERENCE +static REFERENCE_MODE read_block_reference_mode(VP9_COMMON *cm, + const MACROBLOCKD *xd, + vp9_reader *r) { + if (cm->reference_mode == REFERENCE_MODE_SELECT) { + const int ctx = vp9_get_reference_mode_context(cm, xd); + const int mode = vp9_read(r, cm->fc.comp_inter_prob[ctx]); + if (!cm->frame_parallel_decoding_mode) + 
++cm->counts.comp_inter[ctx][mode]; + return mode; // SINGLE_REFERENCE or COMPOUND_REFERENCE + } else { + return cm->reference_mode; + } } // Read the referncence frame @@ -277,10 +282,7 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd, ref_frame[0] = vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME); ref_frame[1] = NONE; } else { - const REFERENCE_MODE mode = (cm->reference_mode == REFERENCE_MODE_SELECT) - ? read_reference_mode(cm, xd, r) - : cm->reference_mode; - + const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r); // FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding if (mode == COMPOUND_REFERENCE) { const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref]; @@ -356,6 +358,11 @@ static void read_intra_block_mode_info(VP9_COMMON *const cm, MODE_INFO *mi, mbmi->uv_mode = read_intra_mode_uv(cm, r, mbmi->mode); } +static INLINE int is_mv_valid(const MV *mv) { + return mv->row > MV_LOW && mv->row < MV_UPP && + mv->col > MV_LOW && mv->col < MV_UPP; +} + static INLINE int assign_mv(VP9_COMMON *cm, MB_PREDICTION_MODE mode, int_mv mv[2], int_mv ref_mv[2], int_mv nearest_mv[2], int_mv near_mv[2], @@ -367,14 +374,10 @@ static INLINE int assign_mv(VP9_COMMON *cm, MB_PREDICTION_MODE mode, case NEWMV: { nmv_context_counts *const mv_counts = cm->frame_parallel_decoding_mode ? NULL : &cm->counts.mv; - read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, - &cm->fc.nmvc, mv_counts, allow_hp); - if (is_compound) - read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, - &cm->fc.nmvc, mv_counts, allow_hp); for (i = 0; i < 1 + is_compound; ++i) { - ret = ret && mv[i].as_mv.row < MV_UPP && mv[i].as_mv.row > MV_LOW; - ret = ret && mv[i].as_mv.col < MV_UPP && mv[i].as_mv.col > MV_LOW; + read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, &cm->fc.nmvc, mv_counts, + allow_hp); + ret = ret && is_mv_valid(&mv[i].as_mv); } break; } @@ -520,10 +523,10 @@ static void read_inter_frame_mode_info(VP9_COMMON *const cm, mbmi->mv[0].as_int = 0; mbmi->mv[1].as_int = 0; mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, r); - mbmi->skip_coeff = read_skip_coeff(cm, xd, mbmi->segment_id, r); + mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r); inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r); mbmi->tx_size = read_tx_size(cm, xd, cm->tx_mode, mbmi->sb_type, - !mbmi->skip_coeff || !inter_block, r); + !mbmi->skip || !inter_block, r); if (inter_block) read_inter_block_mode_info(cm, xd, tile, mi, mi_row, mi_col, r); diff --git a/vp9/decoder/vp9_dthread.c b/vp9/decoder/vp9_dthread.c index 128b9f8af..542732aa0 100644 --- a/vp9/decoder/vp9_dthread.c +++ b/vp9/decoder/vp9_dthread.c @@ -220,11 +220,13 @@ void vp9_loop_filter_alloc(VP9_COMMON *cm, VP9LfSync *lf_sync, int rows, CHECK_MEM_ERROR(cm, lf_sync->mutex_, vpx_malloc(sizeof(*lf_sync->mutex_) * rows)); + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&lf_sync->mutex_[i], NULL); + } + CHECK_MEM_ERROR(cm, lf_sync->cond_, vpx_malloc(sizeof(*lf_sync->cond_) * rows)); - for (i = 0; i < rows; ++i) { - pthread_mutex_init(&lf_sync->mutex_[i], NULL); pthread_cond_init(&lf_sync->cond_[i], NULL); } #endif // CONFIG_MULTITHREAD @@ -242,18 +244,29 @@ void vp9_loop_filter_dealloc(VP9LfSync *lf_sync, int rows) { if (lf_sync != NULL) { int i; - for (i = 0; i < rows; ++i) { - pthread_mutex_destroy(&lf_sync->mutex_[i]); - pthread_cond_destroy(&lf_sync->cond_[i]); + if (lf_sync->mutex_ != NULL) { + for (i = 0; i < rows; ++i) { + pthread_mutex_destroy(&lf_sync->mutex_[i]); + } + vpx_free(lf_sync->mutex_); + } + if 
(lf_sync->cond_ != NULL) { + for (i = 0; i < rows; ++i) { + pthread_cond_destroy(&lf_sync->cond_[i]); + } + vpx_free(lf_sync->cond_); } - vpx_free(lf_sync->mutex_); - vpx_free(lf_sync->cond_); vpx_free(lf_sync->cur_sb_col); + // clear the structure as the source of this call may be a resize in which + // case this call will be followed by an _alloc() which may fail. + vpx_memset(lf_sync, 0, sizeof(*lf_sync)); } #else (void)rows; - if (lf_sync != NULL) + if (lf_sync != NULL) { vpx_free(lf_sync->cur_sb_col); + vpx_memset(lf_sync, 0, sizeof(*lf_sync)); + } #endif // CONFIG_MULTITHREAD } diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c index 803d536ba..1d3522e13 100644 --- a/vp9/decoder/vp9_onyxd_if.c +++ b/vp9/decoder/vp9_onyxd_if.c @@ -290,9 +290,14 @@ static void swap_frame_buffers(VP9D_COMP *pbi) { VP9_COMMON *const cm = &pbi->common; for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { - if (mask & 1) + if (mask & 1) { + const int old_idx = cm->ref_frame_map[ref_index]; ref_cnt_fb(cm->frame_bufs, &cm->ref_frame_map[ref_index], cm->new_fb_idx); + if (old_idx >= 0 && cm->frame_bufs[old_idx].ref_count == 0) + cm->release_fb_cb(cm->cb_priv, + &cm->frame_bufs[old_idx].raw_frame_buffer); + } ++ref_index; } @@ -337,6 +342,10 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, cm->frame_refs[0].buf->corrupted = 1; } + // Check if the previous frame was a frame without any references to it. + if (cm->new_fb_idx >= 0 && cm->frame_bufs[cm->new_fb_idx].ref_count == 0) + cm->release_fb_cb(cm->cb_priv, + &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer); cm->new_fb_idx = get_free_fb(cm); if (setjmp(cm->error.jmp)) { diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index dc64a107c..34d1da7bd 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -14,6 +14,7 @@ #include "vpx/vpx_encoder.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem_ops.h" #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_entropymv.h" @@ -33,13 +34,7 @@ #include "vp9/encoder/vp9_tokenize.h" #include "vp9/encoder/vp9_write_bit_buffer.h" - -#if defined(SECTIONBITS_OUTPUT) -unsigned __int64 Sectionbits[500]; -#endif - #ifdef ENTROPY_STATS -vp9_coeff_stats tree_update_hist[TX_SIZES][PLANE_TYPES]; extern unsigned int active_section; #endif @@ -67,13 +62,6 @@ static void write_inter_mode(vp9_writer *w, MB_PREDICTION_MODE mode, &inter_mode_encodings[INTER_OFFSET(mode)]); } -static INLINE void write_be32(uint8_t *p, int value) { - p[0] = value >> 24; - p[1] = value >> 16; - p[2] = value >> 8; - p[3] = value; -} - void vp9_encode_unsigned_max(struct vp9_write_bit_buffer *wb, int data, int max) { vp9_wb_write_literal(wb, data, get_unsigned_bits(max)); @@ -109,13 +97,13 @@ static void write_selected_tx_size(const VP9_COMP *cpi, MODE_INFO *m, } } -static int write_skip_coeff(const VP9_COMP *cpi, int segment_id, MODE_INFO *m, - vp9_writer *w) { +static int write_skip(const VP9_COMP *cpi, int segment_id, MODE_INFO *m, + vp9_writer *w) { const MACROBLOCKD *const xd = &cpi->mb.e_mbd; if (vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) { return 1; } else { - const int skip = m->mbmi.skip_coeff; + const int skip = m->mbmi.skip; vp9_write(w, skip, vp9_get_skip_prob(&cpi->common, xd)); return skip; } @@ -252,15 +240,15 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { const nmv_context *nmvc = &cm->fc.nmvc; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - struct segmentation *seg = &cm->seg; + 
const struct segmentation *const seg = &cm->seg; MB_MODE_INFO *const mi = &m->mbmi; - const MV_REFERENCE_FRAME rf = mi->ref_frame[0]; - const MV_REFERENCE_FRAME sec_rf = mi->ref_frame[1]; + const MV_REFERENCE_FRAME ref0 = mi->ref_frame[0]; + const MV_REFERENCE_FRAME ref1 = mi->ref_frame[1]; const MB_PREDICTION_MODE mode = mi->mode; const int segment_id = mi->segment_id; - int skip_coeff; const BLOCK_SIZE bsize = mi->sb_type; const int allow_hp = cm->allow_high_precision_mv; + int skip; #ifdef ENTROPY_STATS active_section = 9; @@ -278,18 +266,18 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { } } - skip_coeff = write_skip_coeff(cpi, segment_id, m, bc); + skip = write_skip(cpi, segment_id, m, bc); if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) - vp9_write(bc, rf != INTRA_FRAME, vp9_get_intra_inter_prob(cm, xd)); + vp9_write(bc, ref0 != INTRA_FRAME, vp9_get_intra_inter_prob(cm, xd)); if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT && - !(rf != INTRA_FRAME && - (skip_coeff || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) { + !(ref0 != INTRA_FRAME && + (skip || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) { write_selected_tx_size(cpi, m, mi->tx_size, bsize, bc); } - if (rf == INTRA_FRAME) { + if (ref0 == INTRA_FRAME) { #ifdef ENTROPY_STATS active_section = 6; #endif @@ -311,7 +299,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { } else { vp9_prob *mv_ref_p; encode_ref_frame(cpi, bc); - mv_ref_p = cpi->common.fc.inter_mode_probs[mi->mode_context[rf]]; + mv_ref_p = cm->fc.inter_mode_probs[mi->mode_context[ref0]]; #ifdef ENTROPY_STATS active_section = 3; @@ -321,7 +309,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) { if (bsize >= BLOCK_8X8) { write_inter_mode(bc, mode, mv_ref_p); - ++cm->counts.inter_mode[mi->mode_context[rf]][INTER_OFFSET(mode)]; + ++cm->counts.inter_mode[mi->mode_context[ref0]][INTER_OFFSET(mode)]; } } @@ -341,21 +329,19 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { const int j = idy * 2 + idx; - const MB_PREDICTION_MODE blockmode = m->bmi[j].as_mode; - write_inter_mode(bc, blockmode, mv_ref_p); - ++cm->counts.inter_mode[mi->mode_context[rf]] - [INTER_OFFSET(blockmode)]; - - if (blockmode == NEWMV) { + const MB_PREDICTION_MODE b_mode = m->bmi[j].as_mode; + write_inter_mode(bc, b_mode, mv_ref_p); + ++cm->counts.inter_mode[mi->mode_context[ref0]][INTER_OFFSET(b_mode)]; + if (b_mode == NEWMV) { #ifdef ENTROPY_STATS active_section = 11; #endif vp9_encode_mv(cpi, bc, &m->bmi[j].as_mv[0].as_mv, - &mi->ref_mvs[rf][0].as_mv, nmvc, allow_hp); + &mi->ref_mvs[ref0][0].as_mv, nmvc, allow_hp); if (has_second_ref(mi)) vp9_encode_mv(cpi, bc, &m->bmi[j].as_mv[1].as_mv, - &mi->ref_mvs[sec_rf][0].as_mv, nmvc, allow_hp); + &mi->ref_mvs[ref1][0].as_mv, nmvc, allow_hp); } } } @@ -364,11 +350,11 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { active_section = 5; #endif vp9_encode_mv(cpi, bc, &mi->mv[0].as_mv, - &mi->ref_mvs[rf][0].as_mv, nmvc, allow_hp); + &mi->ref_mvs[ref0][0].as_mv, nmvc, allow_hp); if (has_second_ref(mi)) vp9_encode_mv(cpi, bc, &mi->mv[1].as_mv, - &mi->ref_mvs[sec_rf][0].as_mv, nmvc, allow_hp); + &mi->ref_mvs[ref1][0].as_mv, nmvc, allow_hp); } } } @@ -387,7 +373,7 @@ static void 
write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8, if (seg->update_map) write_segment_id(bc, seg, m->mbmi.segment_id); - write_skip_coeff(cpi, segment_id, m, bc); + write_skip(cpi, segment_id, m, bc); if (m->mbmi.sb_type >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT) write_selected_tx_size(cpi, m, m->mbmi.tx_size, m->mbmi.sb_type, bc); @@ -555,16 +541,6 @@ static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size) { coef_probs[i][j][k][l][m] = get_binary_prob( coef_branch_ct[i][j][k][l][m][0], coef_branch_ct[i][j][k][l][m][1]); -#ifdef ENTROPY_STATS - if (!cpi->dummy_packing) { - int t; - for (t = 0; t < ENTROPY_TOKENS; ++t) - context_counters[tx_size][i][j][k][l][t] += - coef_counts[i][j][k][l][t]; - context_counters[tx_size][i][j][k][l][ENTROPY_TOKENS] += - eob_branch_ct[i][j][k][l]; - } -#endif } } } @@ -643,10 +619,6 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, if (s > 0 && newp != *oldp) u = 1; vp9_write(bc, u, upd); -#ifdef ENTROPY_STATS - if (!cpi->dummy_packing) - ++tree_update_hist[tx_size][i][j][k][l][t][u]; -#endif if (u) { /* send/use new probability */ vp9_write_prob_diff_update(bc, newp, *oldp); @@ -698,10 +670,6 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, updates += u; if (u == 0 && updates == 0) { noupdates_before_first++; -#ifdef ENTROPY_STATS - if (!cpi->dummy_packing) - ++tree_update_hist[tx_size][i][j][k][l][t][u]; -#endif continue; } if (u == 1 && updates == 1) { @@ -712,10 +680,6 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, vp9_write(bc, 0, upd); } vp9_write(bc, u, upd); -#ifdef ENTROPY_STATS - if (!cpi->dummy_packing) - ++tree_update_hist[tx_size][i][j][k][l][t][u]; -#endif if (u) { /* send/use new probability */ vp9_write_prob_diff_update(bc, newp, *oldp); @@ -1037,7 +1001,7 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { vp9_stop_encode(&residual_bc); if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) { // size of this tile - write_be32(data_ptr + total_size, residual_bc.pos); + mem_put_be32(data_ptr + total_size, residual_bc.pos); total_size += 4; } @@ -1287,11 +1251,12 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size) { active_section = 7; #endif - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); first_part_size = write_compressed_header(cpi, data); data += first_part_size; - vp9_wb_write_literal(&saved_wb, first_part_size, 16); + // TODO(jbb): Figure out what to do if first_part_size > 16 bits. + vp9_wb_write_literal(&saved_wb, (int)first_part_size, 16); data += encode_tiles(cpi, data); diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 713cc5132..85f6c97af 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -49,7 +49,6 @@ typedef struct { int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES]; int rate; int distortion; - int64_t intra_error; int best_mode_index; int rddiv; int rdmult; @@ -63,9 +62,6 @@ typedef struct { // search loop int_mv pred_mv[MAX_REF_FRAMES]; INTERP_FILTER pred_interp_filter; - - // Bit flag for each mode whether it has high error in comparison to others. - unsigned int modes_with_high_error; } PICK_MODE_CONTEXT; struct macroblock_plane { @@ -172,9 +168,7 @@ struct macroblock { int skip_encode; // Used to store sub partition's choices. - int fast_ms; int_mv pred_mv[MAX_REF_FRAMES]; - int subblock_ref; // TODO(jingning): Need to refactor the structure arrays that buffers the // coding mode decisions of each partition type. 
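Editor's note: both sides of this change drop their private byte-order helpers (the decoder's read_be32() removed above, the encoder's write_be32() removed in vp9_bitstream.c) in favor of the shared mem_get_be32()/mem_put_be32() from vpx_ports/mem_ops.h, which are used for the 4-byte tile-size fields. A minimal sketch of equivalent helpers is shown below to make the byte layout explicit; the names example_get_be32/example_put_be32 are illustrative stand-ins, not the mem_ops.h implementation, and the layout simply mirrors the removed read_be32() body quoted earlier in this diff.

/* Illustrative big-endian 32-bit accessors; the swap to mem_get_be32/
 * mem_put_be32 is behavior-preserving because all of these agree on
 * this byte order. */
#include <stdint.h>

static uint32_t example_get_be32(const uint8_t *p) {
  return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
         ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
}

static void example_put_be32(uint8_t *p, uint32_t v) {
  p[0] = (uint8_t)(v >> 24);  /* most significant byte first */
  p[1] = (uint8_t)(v >> 16);
  p[2] = (uint8_t)(v >> 8);
  p[3] = (uint8_t)v;
}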
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c index a840b480a..d5232393f 100644 --- a/vp9/encoder/vp9_dct.c +++ b/vp9/encoder/vp9_dct.c @@ -18,8 +18,6 @@ #include "vp9/common/vp9_idct.h" #include "vp9/common/vp9_systemdependent.h" -#include "vp9/encoder/vp9_dct.h" - static INLINE int fdct_round_shift(int input) { int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); assert(INT16_MIN <= rv && rv <= INT16_MAX); @@ -49,7 +47,7 @@ void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, - // as the first pass results are transposed, we tranpose the columns (that + // as the first pass results are transposed, we transpose the columns (that // is the transposed rows) and transpose the results (so that it goes back // in normal/row positions). int pass; @@ -157,32 +155,36 @@ static const transform_2d FHT_4[] = { { fadst4, fadst4 } // ADST_ADST = 3 }; -void vp9_short_fht4x4_c(const int16_t *input, int16_t *output, - int stride, int tx_type) { - int16_t out[4 * 4]; - int16_t *outptr = &out[0]; - int i, j; - int16_t temp_in[4], temp_out[4]; - const transform_2d ht = FHT_4[tx_type]; +void vp9_fht4x4_c(const int16_t *input, int16_t *output, + int stride, int tx_type) { + if (tx_type == DCT_DCT) { + vp9_fdct4x4_c(input, output, stride); + } else { + int16_t out[4 * 4]; + int16_t *outptr = &out[0]; + int i, j; + int16_t temp_in[4], temp_out[4]; + const transform_2d ht = FHT_4[tx_type]; - // Columns - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - temp_in[j] = input[j * stride + i] * 16; - if (i == 0 && temp_in[0]) - temp_in[0] += 1; - ht.cols(temp_in, temp_out); - for (j = 0; j < 4; ++j) - outptr[j * 4 + i] = temp_out[j]; - } + // Columns + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + temp_in[j] = input[j * stride + i] * 16; + if (i == 0 && temp_in[0]) + temp_in[0] += 1; + ht.cols(temp_in, temp_out); + for (j = 0; j < 4; ++j) + outptr[j * 4 + i] = temp_out[j]; + } - // Rows - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - temp_in[j] = out[j + i * 4]; - ht.rows(temp_in, temp_out); - for (j = 0; j < 4; ++j) - output[j + i * 4] = (temp_out[j] + 1) >> 2; + // Rows + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + temp_in[j] = out[j + i * 4]; + ht.rows(temp_in, temp_out); + for (j = 0; j < 4; ++j) + output[j + i * 4] = (temp_out[j] + 1) >> 2; + } } } @@ -313,7 +315,7 @@ void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, - // as the first pass results are transposed, we tranpose the columns (that + // as the first pass results are transposed, we transpose the columns (that // is the transposed rows) and transpose the results (so that it goes back // in normal/row positions). 
int pass; @@ -565,30 +567,34 @@ static const transform_2d FHT_8[] = { { fadst8, fadst8 } // ADST_ADST = 3 }; -void vp9_short_fht8x8_c(const int16_t *input, int16_t *output, - int stride, int tx_type) { - int16_t out[64]; - int16_t *outptr = &out[0]; - int i, j; - int16_t temp_in[8], temp_out[8]; - const transform_2d ht = FHT_8[tx_type]; - - // Columns - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = input[j * stride + i] * 4; - ht.cols(temp_in, temp_out); - for (j = 0; j < 8; ++j) - outptr[j * 8 + i] = temp_out[j]; - } +void vp9_fht8x8_c(const int16_t *input, int16_t *output, + int stride, int tx_type) { + if (tx_type == DCT_DCT) { + vp9_fdct8x8_c(input, output, stride); + } else { + int16_t out[64]; + int16_t *outptr = &out[0]; + int i, j; + int16_t temp_in[8], temp_out[8]; + const transform_2d ht = FHT_8[tx_type]; + + // Columns + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = input[j * stride + i] * 4; + ht.cols(temp_in, temp_out); + for (j = 0; j < 8; ++j) + outptr[j * 8 + i] = temp_out[j]; + } - // Rows - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = out[j + i * 8]; - ht.rows(temp_in, temp_out); - for (j = 0; j < 8; ++j) - output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; + // Rows + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j + i * 8]; + ht.rows(temp_in, temp_out); + for (j = 0; j < 8; ++j) + output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; + } } } @@ -958,31 +964,34 @@ static const transform_2d FHT_16[] = { { fadst16, fadst16 } // ADST_ADST = 3 }; -void vp9_short_fht16x16_c(const int16_t *input, int16_t *output, - int stride, int tx_type) { - int16_t out[256]; - int16_t *outptr = &out[0]; - int i, j; - int16_t temp_in[16], temp_out[16]; - const transform_2d ht = FHT_16[tx_type]; - - // Columns - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = input[j * stride + i] * 4; - ht.cols(temp_in, temp_out); - for (j = 0; j < 16; ++j) - outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; -// outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; - } +void vp9_fht16x16_c(const int16_t *input, int16_t *output, + int stride, int tx_type) { + if (tx_type == DCT_DCT) { + vp9_fdct16x16_c(input, output, stride); + } else { + int16_t out[256]; + int16_t *outptr = &out[0]; + int i, j; + int16_t temp_in[16], temp_out[16]; + const transform_2d ht = FHT_16[tx_type]; + + // Columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = input[j * stride + i] * 4; + ht.cols(temp_in, temp_out); + for (j = 0; j < 16; ++j) + outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; + } - // Rows - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j + i * 16]; - ht.rows(temp_in, temp_out); - for (j = 0; j < 16; ++j) - output[j + i * 16] = temp_out[j]; + // Rows + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j + i * 16]; + ht.rows(temp_in, temp_out); + for (j = 0; j < 16; ++j) + output[j + i * 16] = temp_out[j]; + } } } @@ -1375,27 +1384,3 @@ void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) { out[j + i * 32] = temp_out[j]; } } - -void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output, - int stride) { - if (tx_type == DCT_DCT) - vp9_fdct4x4(input, output, stride); - else - vp9_short_fht4x4(input, output, stride, tx_type); -} - -void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output, - int stride) { - if (tx_type == DCT_DCT) - 
vp9_fdct8x8(input, output, stride); - else - vp9_short_fht8x8(input, output, stride, tx_type); -} - -void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output, - int stride) { - if (tx_type == DCT_DCT) - vp9_fdct16x16(input, output, stride); - else - vp9_short_fht16x16(input, output, stride, tx_type); -} diff --git a/vp9/encoder/vp9_dct.h b/vp9/encoder/vp9_dct.h deleted file mode 100644 index cf5f001a9..000000000 --- a/vp9/encoder/vp9_dct.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2013 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef VP9_ENCODER_VP9_DCT_H_ -#define VP9_ENCODER_VP9_DCT_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output, - int stride); - -void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output, - int stride); - -void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output, - int stride); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // VP9_ENCODER_VP9_DCT_H_ diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 7fb5a03ba..57865138d 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -40,8 +40,6 @@ #include "vp9/encoder/vp9_tokenize.h" #include "vp9/encoder/vp9_vaq.h" -#define DBG_PRNT_SEGMAP 0 - static INLINE uint8_t *get_sb_index(MACROBLOCK *x, BLOCK_SIZE subsize) { switch (subsize) { case BLOCK_64X64: @@ -96,7 +94,8 @@ static const uint8_t VP9_VAR_OFFS[64] = { 128, 128, 128, 128, 128, 128, 128, 128 }; -static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi, MACROBLOCK *x, +static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi, + MACROBLOCK *x, BLOCK_SIZE bs) { unsigned int var, sse; var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride, @@ -104,6 +103,52 @@ static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi, MACROBLOCK *x, return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); } +static unsigned int get_sby_perpixel_diff_variance(VP9_COMP *cpi, + MACROBLOCK *x, + int mi_row, + int mi_col, + BLOCK_SIZE bs) { + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); + int offset = (mi_row * MI_SIZE) * yv12->y_stride + (mi_col * MI_SIZE); + unsigned int var, sse; + var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, + x->plane[0].src.stride, + yv12->y_buffer + offset, + yv12->y_stride, + &sse); + return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); +} + +static BLOCK_SIZE get_rd_var_based_fixed_partition(VP9_COMP *cpi, + int mi_row, + int mi_col) { + unsigned int var = get_sby_perpixel_diff_variance(cpi, &cpi->mb, + mi_row, mi_col, + BLOCK_64X64); + if (var < 8) + return BLOCK_64X64; + else if (var < 128) + return BLOCK_32X32; + else if (var < 2048) + return BLOCK_16X16; + else + return BLOCK_8X8; +} + +static BLOCK_SIZE get_nonrd_var_based_fixed_partition(VP9_COMP *cpi, + int mi_row, + int mi_col) { + unsigned int var = get_sby_perpixel_diff_variance(cpi, &cpi->mb, + mi_row, mi_col, + BLOCK_64X64); + if (var < 4) + return BLOCK_64X64; + else if (var < 10) + return BLOCK_32X32; + else + return BLOCK_16X16; +} + // Original activity measure from Tim T's code. 
static unsigned int tt_activity_measure(MACROBLOCK *x) { unsigned int sse; @@ -321,7 +366,7 @@ static void build_activity_map(VP9_COMP *cpi) { } // Macroblock activity masking -void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x) { +static void activity_masking(VP9_COMP *cpi, MACROBLOCK *x) { #if USE_ACT_INDEX x->rdmult += *(x->mb_activity_ptr) * (x->rdmult >> 2); x->errorperbit = x->rdmult * 100 / (110 * x->rddiv); @@ -347,7 +392,6 @@ static void select_in_frame_q_segment(VP9_COMP *cpi, int mi_row, int mi_col, int output_enabled, int projected_rate) { VP9_COMMON *const cm = &cpi->common; - int target_rate = cpi->rc.sb64_target_rate << 8; // convert to bits << 8 const int mi_offset = mi_row * cm->mi_cols + mi_col; const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64]; @@ -364,11 +408,10 @@ static void select_in_frame_q_segment(VP9_COMP *cpi, } else { // Rate depends on fraction of a SB64 in frame (xmis * ymis / bw * bh). // It is converted to bits * 256 units - target_rate = (cpi->rc.sb64_target_rate * xmis * ymis * 256) / (bw * bh); + const int target_rate = (cpi->rc.sb64_target_rate * xmis * ymis * 256) / + (bw * bh); if (projected_rate < (target_rate / 4)) { - segment = 2; - } else if (projected_rate < (target_rate / 2)) { segment = 1; } else { segment = 0; @@ -402,7 +445,6 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; MODE_INFO *mi_addr = xd->mi_8x8[0]; - const int mb_mode_index = ctx->best_mode_index; const int mis = cm->mode_info_stride; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const int mi_height = num_8x8_blocks_high_lookup[bsize]; @@ -474,8 +516,8 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, cpi->rd_tx_select_diff[i] += ctx->tx_rd_diff[i]; } - if (frame_is_intra_only(cm)) { #if CONFIG_INTERNAL_STATS + if (frame_is_intra_only(cm)) { static const int kf_mode_index[] = { THR_DC /*DC_PRED*/, THR_V_PRED /*V_PRED*/, @@ -488,29 +530,32 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, THR_D63_PRED /*D63_PRED*/, THR_TM /*TM_PRED*/, }; - cpi->mode_chosen_counts[kf_mode_index[mbmi->mode]]++; -#endif + ++cpi->mode_chosen_counts[kf_mode_index[mbmi->mode]]; } else { // Note how often each mode chosen as best - cpi->mode_chosen_counts[mb_mode_index]++; - if (is_inter_block(mbmi) && - (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV)) { - int_mv best_mv[2]; - for (i = 0; i < 1 + has_second_ref(mbmi); ++i) - best_mv[i].as_int = mbmi->ref_mvs[mbmi->ref_frame[i]][0].as_int; - vp9_update_mv_count(cpi, x, best_mv); - } + ++cpi->mode_chosen_counts[ctx->best_mode_index]; + } +#endif + if (!frame_is_intra_only(cm)) { + if (is_inter_block(mbmi)) { + if (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV) { + int_mv best_mv[2]; + for (i = 0; i < 1 + has_second_ref(mbmi); ++i) + best_mv[i].as_int = mbmi->ref_mvs[mbmi->ref_frame[i]][0].as_int; + vp9_update_mv_count(cpi, x, best_mv); + } - if (cm->interp_filter == SWITCHABLE && is_inter_mode(mbmi->mode)) { - const int ctx = vp9_get_pred_context_switchable_interp(xd); - ++cm->counts.switchable_interp[ctx][mbmi->interp_filter]; + if (cm->interp_filter == SWITCHABLE) { + const int ctx = vp9_get_pred_context_switchable_interp(xd); + ++cm->counts.switchable_interp[ctx][mbmi->interp_filter]; + } } cpi->rd_comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff; cpi->rd_comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff; cpi->rd_comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff; - for (i = 0; i < 
SWITCHABLE_FILTER_CONTEXTS; i++) + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) cpi->rd_filter_diff[i] += ctx->best_filter_diff[i]; } } @@ -555,8 +600,6 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile, xd->mi_8x8 = cm->mi_grid_visible + idx_str; xd->prev_mi_8x8 = cm->prev_mi_grid_visible + idx_str; - // Special case: if prev_mi is NULL, the previous mode info context - // cannot be used. xd->last_mi = cm->prev_mi ? xd->prev_mi_8x8[0] : NULL; xd->mi_8x8[0] = cm->mi + idx_str; @@ -613,7 +656,7 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile, x->encode_breakout = cpi->segment_encode_breakout[mbmi->segment_id]; } else { mbmi->segment_id = 0; - x->encode_breakout = cpi->oxcf.encode_breakout; + x->encode_breakout = cpi->encode_breakout; } } @@ -631,7 +674,7 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, int orig_rdmult = x->rdmult; double rdmult_ratio; - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); rdmult_ratio = 1.0; // avoid uninitialized warnings // Use the lower precision, but faster, 32x32 fdct for mode selection. @@ -660,32 +703,44 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, x->skip_recode = 0; // Set to zero to make sure we do not use the previous encoded frame stats - xd->mi_8x8[0]->mbmi.skip_coeff = 0; + xd->mi_8x8[0]->mbmi.skip = 0; x->source_variance = get_sby_perpixel_variance(cpi, x, bsize); if (cpi->oxcf.aq_mode == VARIANCE_AQ) { const int energy = bsize <= BLOCK_16X16 ? x->mb_energy : vp9_block_energy(cpi, x, bsize); - xd->mi_8x8[0]->mbmi.segment_id = vp9_vaq_segment_id(energy); + + if (cm->frame_type == KEY_FRAME || + cpi->refresh_alt_ref_frame || + (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { + xd->mi_8x8[0]->mbmi.segment_id = vp9_vaq_segment_id(energy); + } else { + const uint8_t *const map = cm->seg.update_map ? 
cpi->segmentation_map + : cm->last_frame_seg_map; + xd->mi_8x8[0]->mbmi.segment_id = + vp9_get_segment_id(cm, map, bsize, mi_row, mi_col); + } + rdmult_ratio = vp9_vaq_rdmult_ratio(energy); vp9_mb_init_quantizer(cpi, x); } if (cpi->oxcf.tuning == VP8_TUNE_SSIM) - vp9_activity_masking(cpi, x); + activity_masking(cpi, x); if (cpi->oxcf.aq_mode == VARIANCE_AQ) { - vp9_clear_system_state(); // __asm emms; - x->rdmult = round(x->rdmult * rdmult_ratio); + vp9_clear_system_state(); + x->rdmult = (int)round(x->rdmult * rdmult_ratio); } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { const int mi_offset = mi_row * cm->mi_cols + mi_col; unsigned char complexity = cpi->complexity_map[mi_offset]; - const int is_edge = (mi_row == 0) || (mi_row == (cm->mi_rows - 1)) || - (mi_col == 0) || (mi_col == (cm->mi_cols - 1)); + const int is_edge = (mi_row <= 1) || (mi_row >= (cm->mi_rows - 2)) || + (mi_col <= 1) || (mi_col >= (cm->mi_cols - 2)); - if (!is_edge && (complexity > 128)) + if (!is_edge && (complexity > 128)) { x->rdmult = x->rdmult + ((x->rdmult * (complexity - 128)) / 256); + } } // Find best coding mode & reconstruct the MB so it is available @@ -705,44 +760,51 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, if (cpi->oxcf.aq_mode == VARIANCE_AQ) { x->rdmult = orig_rdmult; if (*totalrate != INT_MAX) { - vp9_clear_system_state(); // __asm emms; - *totalrate = round(*totalrate * rdmult_ratio); + vp9_clear_system_state(); + *totalrate = (int)round(*totalrate * rdmult_ratio); } } + else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { + x->rdmult = orig_rdmult; + } } static void update_stats(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; - MODE_INFO *mi = xd->mi_8x8[0]; - MB_MODE_INFO *const mbmi = &mi->mbmi; + const MACROBLOCK *const x = &cpi->mb; + const MACROBLOCKD *const xd = &x->e_mbd; + const MODE_INFO *const mi = xd->mi_8x8[0]; + const MB_MODE_INFO *const mbmi = &mi->mbmi; if (!frame_is_intra_only(cm)) { const int seg_ref_active = vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME); + if (!seg_ref_active) { + FRAME_COUNTS *const counts = &cm->counts; + const int inter_block = is_inter_block(mbmi); - if (!seg_ref_active) - cm->counts.intra_inter[vp9_get_intra_inter_context(xd)] - [is_inter_block(mbmi)]++; - - // If the segment reference feature is enabled we have only a single - // reference frame allowed for the segment so exclude it from - // the reference frame counts used to work out probabilities. - if (is_inter_block(mbmi) && !seg_ref_active) { - if (cm->reference_mode == REFERENCE_MODE_SELECT) - cm->counts.comp_inter[vp9_get_reference_mode_context(cm, xd)] - [has_second_ref(mbmi)]++; - - if (has_second_ref(mbmi)) { - cm->counts.comp_ref[vp9_get_pred_context_comp_ref_p(cm, xd)] - [mbmi->ref_frame[0] == GOLDEN_FRAME]++; - } else { - cm->counts.single_ref[vp9_get_pred_context_single_ref_p1(xd)][0] - [mbmi->ref_frame[0] != LAST_FRAME]++; - if (mbmi->ref_frame[0] != LAST_FRAME) - cm->counts.single_ref[vp9_get_pred_context_single_ref_p2(xd)][1] - [mbmi->ref_frame[0] != GOLDEN_FRAME]++; + counts->intra_inter[vp9_get_intra_inter_context(xd)][inter_block]++; + + // If the segment reference feature is enabled we have only a single + // reference frame allowed for the segment so exclude it from + // the reference frame counts used to work out probabilities. 
+ if (inter_block) { + const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0]; + + if (cm->reference_mode == REFERENCE_MODE_SELECT) + counts->comp_inter[vp9_get_reference_mode_context(cm, xd)] + [has_second_ref(mbmi)]++; + + if (has_second_ref(mbmi)) { + counts->comp_ref[vp9_get_pred_context_comp_ref_p(cm, xd)] + [ref0 == GOLDEN_FRAME]++; + } else { + counts->single_ref[vp9_get_pred_context_single_ref_p1(xd)][0] + [ref0 != LAST_FRAME]++; + if (ref0 != LAST_FRAME) + counts->single_ref[vp9_get_pred_context_single_ref_p2(xd)][1] + [ref0 != GOLDEN_FRAME]++; + } } } } @@ -950,9 +1012,9 @@ static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, // may not be allowed in which case this code attempts to choose the largest // allowable partition. static void set_partitioning(VP9_COMP *cpi, const TileInfo *const tile, - MODE_INFO **mi_8x8, int mi_row, int mi_col) { + MODE_INFO **mi_8x8, int mi_row, int mi_col, + BLOCK_SIZE bsize) { VP9_COMMON *const cm = &cpi->common; - BLOCK_SIZE bsize = cpi->sf.always_this_block_size; const int mis = cm->mode_info_stride; int row8x8_remaining = tile->mi_row_end - mi_row; int col8x8_remaining = tile->mi_col_end - mi_col; @@ -979,7 +1041,7 @@ static void set_partitioning(VP9_COMP *cpi, const TileInfo *const tile, for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) { int index = block_row * mis + block_col; // Find a partition size that fits - bsize = find_partition_size(cpi->sf.always_this_block_size, + bsize = find_partition_size(bsize, (row8x8_remaining - block_row), (col8x8_remaining - block_col), &bh, &bw); mi_8x8[index] = mi_upper_left + index; @@ -1025,38 +1087,19 @@ static int sb_has_motion(const VP9_COMMON *cm, MODE_INFO **prev_mi_8x8) { } return 0; } + static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, BLOCK_SIZE bsize, int output_enabled) { int i; VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - struct macroblock_plane *const p = x->plane; - struct macroblockd_plane *const pd = xd->plane; MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; - const int mb_mode_index = ctx->best_mode_index; - int max_plane; - - max_plane = is_inter_block(mbmi) ? 
MAX_MB_PLANE : 1; - for (i = 0; i < max_plane; ++i) { - p[i].coeff = ctx->coeff_pbuf[i][1]; - p[i].qcoeff = ctx->qcoeff_pbuf[i][1]; - pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1]; - p[i].eobs = ctx->eobs_pbuf[i][1]; - } - - for (i = max_plane; i < MAX_MB_PLANE; ++i) { - p[i].coeff = ctx->coeff_pbuf[i][2]; - p[i].qcoeff = ctx->qcoeff_pbuf[i][2]; - pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2]; - p[i].eobs = ctx->eobs_pbuf[i][2]; - } - x->skip = ctx->skip; - if (frame_is_intra_only(cm)) { #if CONFIG_INTERNAL_STATS + if (frame_is_intra_only(cm)) { static const int kf_mode_index[] = { THR_DC /*DC_PRED*/, THR_V_PRED /*V_PRED*/, @@ -1070,21 +1113,24 @@ static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, THR_TM /*TM_PRED*/, }; ++cpi->mode_chosen_counts[kf_mode_index[mbmi->mode]]; -#endif } else { // Note how often each mode chosen as best - cpi->mode_chosen_counts[mb_mode_index]++; - if (is_inter_block(mbmi) && - (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV)) { - int_mv best_mv[2]; - for (i = 0; i < 1 + has_second_ref(mbmi); ++i) - best_mv[i].as_int = mbmi->ref_mvs[mbmi->ref_frame[i]][0].as_int; - vp9_update_mv_count(cpi, x, best_mv); - } + ++cpi->mode_chosen_counts[ctx->best_mode_index]; + } +#endif + if (!frame_is_intra_only(cm)) { + if (is_inter_block(mbmi)) { + if (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV) { + int_mv best_mv[2]; + for (i = 0; i < 1 + has_second_ref(mbmi); ++i) + best_mv[i].as_int = mbmi->ref_mvs[mbmi->ref_frame[i]][0].as_int; + vp9_update_mv_count(cpi, x, best_mv); + } - if (cm->interp_filter == SWITCHABLE && is_inter_mode(mbmi->mode)) { - const int ctx = vp9_get_pred_context_switchable_interp(xd); - ++cm->counts.switchable_interp[ctx][mbmi->interp_filter]; + if (cm->interp_filter == SWITCHABLE) { + const int ctx = vp9_get_pred_context_switchable_interp(xd); + ++cm->counts.switchable_interp[ctx][mbmi->interp_filter]; + } } } } @@ -1111,8 +1157,8 @@ static void encode_b_rt(VP9_COMP *cpi, const TileInfo *const tile, } static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile, - TOKENEXTRA **tp, int mi_row, int mi_col, - int output_enabled, BLOCK_SIZE bsize) { + TOKENEXTRA **tp, int mi_row, int mi_col, + int output_enabled, BLOCK_SIZE bsize) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; const int bsl = b_width_log2(bsize), hbs = (1 << bsl) / 4; @@ -1130,7 +1176,6 @@ static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile, ctx = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, mi_row, mi_col, bsize); subsize = mi_8x8[0]->mbmi.sb_type; - } else { ctx = 0; subsize = BLOCK_4X4; @@ -1181,7 +1226,7 @@ static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile, subsize); *get_sb_index(x, subsize) = 3; encode_sb_rt(cpi, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled, - subsize); + subsize); break; default: assert("Invalid partition type."); @@ -1213,13 +1258,14 @@ static void rd_use_partition(VP9_COMP *cpi, ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; PARTITION_CONTEXT sl[8], sa[8]; int last_part_rate = INT_MAX; - int64_t last_part_dist = INT_MAX; - int split_rate = INT_MAX; - int64_t split_dist = INT_MAX; + int64_t last_part_dist = INT64_MAX; + int64_t last_part_rd = INT64_MAX; int none_rate = INT_MAX; - int64_t none_dist = INT_MAX; + int64_t none_dist = INT64_MAX; + int64_t none_rd = INT64_MAX; int chosen_rate = INT_MAX; - int64_t chosen_dist = INT_MAX; + int64_t chosen_dist = INT64_MAX; + int64_t chosen_rd = INT64_MAX; BLOCK_SIZE sub_subsize = BLOCK_4X4; int splits_below = 
0; BLOCK_SIZE bs_type = mi_8x8[0]->mbmi.sb_type; @@ -1248,10 +1294,8 @@ static void rd_use_partition(VP9_COMP *cpi, x->mb_energy = vp9_block_energy(cpi, x, bsize); } - x->fast_ms = 0; - x->subblock_ref = 0; - - if (cpi->sf.adjust_partitioning_from_last_frame) { + if (cpi->sf.partition_search_type == SEARCH_PARTITION && + cpi->sf.adjust_partitioning_from_last_frame) { // Check if any of the sub blocks are further split. if (partition == PARTITION_SPLIT && subsize > BLOCK_8X8) { sub_subsize = get_subsize(subsize, PARTITION_SPLIT); @@ -1277,7 +1321,11 @@ static void rd_use_partition(VP9_COMP *cpi, pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, mi_row, mi_col, bsize); - none_rate += x->partition_cost[pl][PARTITION_NONE]; + + if (none_rate < INT_MAX) { + none_rate += x->partition_cost[pl][PARTITION_NONE]; + none_rd = RDCOST(x->rdmult, x->rddiv, none_rate, none_dist); + } restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); mi_8x8[0]->mbmi.sb_type = bs_type; @@ -1305,9 +1353,9 @@ static void rd_use_partition(VP9_COMP *cpi, *get_sb_index(x, subsize) = 1; rd_pick_sb_modes(cpi, tile, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize, get_block_context(x, subsize), INT64_MAX); - if (rt == INT_MAX || dt == INT_MAX) { + if (rt == INT_MAX || dt == INT64_MAX) { last_part_rate = INT_MAX; - last_part_dist = INT_MAX; + last_part_dist = INT64_MAX; break; } @@ -1329,9 +1377,9 @@ static void rd_use_partition(VP9_COMP *cpi, *get_sb_index(x, subsize) = 1; rd_pick_sb_modes(cpi, tile, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize, get_block_context(x, subsize), INT64_MAX); - if (rt == INT_MAX || dt == INT_MAX) { + if (rt == INT_MAX || dt == INT64_MAX) { last_part_rate = INT_MAX; - last_part_dist = INT_MAX; + last_part_dist = INT64_MAX; break; } last_part_rate += rt; @@ -1357,9 +1405,9 @@ static void rd_use_partition(VP9_COMP *cpi, rd_use_partition(cpi, tile, mi_8x8 + jj * bss * mis + ii * bss, tp, mi_row + y_idx, mi_col + x_idx, subsize, &rt, &dt, i != 3); - if (rt == INT_MAX || dt == INT_MAX) { + if (rt == INT_MAX || dt == INT64_MAX) { last_part_rate = INT_MAX; - last_part_dist = INT_MAX; + last_part_dist = INT64_MAX; break; } last_part_rate += rt; @@ -1372,16 +1420,19 @@ static void rd_use_partition(VP9_COMP *cpi, pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, mi_row, mi_col, bsize); - if (last_part_rate < INT_MAX) + if (last_part_rate < INT_MAX) { last_part_rate += x->partition_cost[pl][partition]; + last_part_rd = RDCOST(x->rdmult, x->rddiv, last_part_rate, last_part_dist); + } if (cpi->sf.adjust_partitioning_from_last_frame + && cpi->sf.partition_search_type == SEARCH_PARTITION && partition != PARTITION_SPLIT && bsize > BLOCK_8X8 && (mi_row + ms < cm->mi_rows || mi_row + (ms >> 1) == cm->mi_rows) && (mi_col + ms < cm->mi_cols || mi_col + (ms >> 1) == cm->mi_cols)) { BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT); - split_rate = 0; - split_dist = 0; + chosen_rate = 0; + chosen_dist = 0; restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); // Split partition. 
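rd_use_partition() in the hunks above and below now carries precomputed 64-bit rate-distortion totals (none_rd, last_part_rd, chosen_rd) instead of re-evaluating RDCOST(...) inline at every comparison, and its distortion sentinels move from INT_MAX to INT64_MAX. A minimal sketch of that selection pattern, assuming a stand-in cost combiner (the real RDCOST macro in vp9_rdopt.h weights rate and distortion by the encoder's rdmult/rddiv; its exact arithmetic is not reproduced here):

#include <stdint.h>
#include <stdio.h>

/* Illustrative rate/distortion combiner; stands in for RDCOST(). */
static int64_t rd_total(int rdmult, int rate, int64_t dist) {
  return (int64_t)rdmult * rate + dist;
}

int main(void) {
  /* INT64_MAX marks a candidate partitioning that was never evaluated. */
  int64_t none_rd      = INT64_MAX;                 /* PARTITION_NONE skipped   */
  int64_t last_part_rd = rd_total(100, 350, 9000);  /* reuse previous partition */
  int64_t chosen_rd    = rd_total(100, 500, 2500);  /* fresh split of the block */

  /* Same ordering as the function: prefer last_part, then none, if cheaper. */
  if (last_part_rd < chosen_rd)
    chosen_rd = last_part_rd;
  if (none_rd < chosen_rd)
    chosen_rd = none_rd;

  printf("best rd cost = %lld\n", (long long)chosen_rd);
  return 0;
}

Using INT64_MAX as the sentinel matches the int64_t type of the distortion totals, which is the substance of the INT_MAX to INT64_MAX changes in these hunks.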
@@ -1408,46 +1459,44 @@ static void rd_use_partition(VP9_COMP *cpi, restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - if (rt == INT_MAX || dt == INT_MAX) { - split_rate = INT_MAX; - split_dist = INT_MAX; + if (rt == INT_MAX || dt == INT64_MAX) { + chosen_rate = INT_MAX; + chosen_dist = INT64_MAX; break; } + chosen_rate += rt; + chosen_dist += dt; + if (i != 3) encode_sb(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, 0, split_subsize); - split_rate += rt; - split_dist += dt; pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, mi_row + y_idx, mi_col + x_idx, split_subsize); - split_rate += x->partition_cost[pl][PARTITION_NONE]; + chosen_rate += x->partition_cost[pl][PARTITION_NONE]; } pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, mi_row, mi_col, bsize); - if (split_rate < INT_MAX) { - split_rate += x->partition_cost[pl][PARTITION_SPLIT]; - - chosen_rate = split_rate; - chosen_dist = split_dist; + if (chosen_rate < INT_MAX) { + chosen_rate += x->partition_cost[pl][PARTITION_SPLIT]; + chosen_rd = RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist); } } // If last_part is better set the partitioning to that... - if (RDCOST(x->rdmult, x->rddiv, last_part_rate, last_part_dist) - < RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist)) { + if (last_part_rd < chosen_rd) { mi_8x8[0]->mbmi.sb_type = bsize; if (bsize >= BLOCK_8X8) *(get_sb_partitioning(x, bsize)) = subsize; chosen_rate = last_part_rate; chosen_dist = last_part_dist; + chosen_rd = last_part_rd; } // If none was better set the partitioning to that... - if (RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist) - > RDCOST(x->rdmult, x->rddiv, none_rate, none_dist)) { + if (none_rd < chosen_rd) { if (bsize >= BLOCK_8X8) *(get_sb_partitioning(x, bsize)) = bsize; chosen_rate = none_rate; @@ -1459,7 +1508,7 @@ static void rd_use_partition(VP9_COMP *cpi, // We must have chosen a partitioning and encoding or we'll fail later on. // No other opportunities for success. if ( bsize == BLOCK_64X64) - assert(chosen_rate < INT_MAX && chosen_dist < INT_MAX); + assert(chosen_rate < INT_MAX && chosen_dist < INT64_MAX); if (do_recon) { int output_enabled = (bsize == BLOCK_64X64); @@ -1523,6 +1572,15 @@ static void get_sb_partition_size_range(VP9_COMP *cpi, MODE_INFO ** mi_8x8, } } +// Next square block size less or equal than current block size. +static const BLOCK_SIZE next_square_size[BLOCK_SIZES] = { + BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, + BLOCK_8X8, BLOCK_8X8, BLOCK_8X8, + BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, + BLOCK_32X32, BLOCK_32X32, BLOCK_32X32, + BLOCK_64X64 +}; + // Look at neighboring blocks and set a min and max partition size based on // what they chose. static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile, @@ -1589,95 +1647,13 @@ static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile, row8x8_remaining, col8x8_remaining, &bh, &bw); *min_block_size = MIN(*min_block_size, *max_block_size); -} -static void compute_fast_motion_search_level(VP9_COMP *cpi, BLOCK_SIZE bsize) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; - - // Only use 8x8 result for non HD videos. - // int use_8x8 = (MIN(cpi->common.width, cpi->common.height) < 720) ? 
1 : 0; - int use_8x8 = 1; - - if (cm->frame_type && !cpi->rc.is_src_frame_alt_ref && - ((use_8x8 && bsize == BLOCK_16X16) || - bsize == BLOCK_32X32 || bsize == BLOCK_64X64)) { - int ref0 = 0, ref1 = 0, ref2 = 0, ref3 = 0; - PICK_MODE_CONTEXT *block_context = NULL; - - if (bsize == BLOCK_16X16) { - block_context = x->sb8x8_context[x->sb_index][x->mb_index]; - } else if (bsize == BLOCK_32X32) { - block_context = x->mb_context[x->sb_index]; - } else if (bsize == BLOCK_64X64) { - block_context = x->sb32_context; - } - - if (block_context) { - ref0 = block_context[0].mic.mbmi.ref_frame[0]; - ref1 = block_context[1].mic.mbmi.ref_frame[0]; - ref2 = block_context[2].mic.mbmi.ref_frame[0]; - ref3 = block_context[3].mic.mbmi.ref_frame[0]; - } - - // Currently, only consider 4 inter reference frames. - if (ref0 && ref1 && ref2 && ref3) { - int d01, d23, d02, d13; - - // Motion vectors for the four subblocks. - int16_t mvr0 = block_context[0].mic.mbmi.mv[0].as_mv.row; - int16_t mvc0 = block_context[0].mic.mbmi.mv[0].as_mv.col; - int16_t mvr1 = block_context[1].mic.mbmi.mv[0].as_mv.row; - int16_t mvc1 = block_context[1].mic.mbmi.mv[0].as_mv.col; - int16_t mvr2 = block_context[2].mic.mbmi.mv[0].as_mv.row; - int16_t mvc2 = block_context[2].mic.mbmi.mv[0].as_mv.col; - int16_t mvr3 = block_context[3].mic.mbmi.mv[0].as_mv.row; - int16_t mvc3 = block_context[3].mic.mbmi.mv[0].as_mv.col; - - // Adjust sign if ref is alt_ref. - if (cm->ref_frame_sign_bias[ref0]) { - mvr0 *= -1; - mvc0 *= -1; - } - - if (cm->ref_frame_sign_bias[ref1]) { - mvr1 *= -1; - mvc1 *= -1; - } - - if (cm->ref_frame_sign_bias[ref2]) { - mvr2 *= -1; - mvc2 *= -1; - } - - if (cm->ref_frame_sign_bias[ref3]) { - mvr3 *= -1; - mvc3 *= -1; - } - - // Calculate mv distances. - d01 = MAX(abs(mvr0 - mvr1), abs(mvc0 - mvc1)); - d23 = MAX(abs(mvr2 - mvr3), abs(mvc2 - mvc3)); - d02 = MAX(abs(mvr0 - mvr2), abs(mvc0 - mvc2)); - d13 = MAX(abs(mvr1 - mvr3), abs(mvc1 - mvc3)); - - if (d01 < FAST_MOTION_MV_THRESH && d23 < FAST_MOTION_MV_THRESH && - d02 < FAST_MOTION_MV_THRESH && d13 < FAST_MOTION_MV_THRESH) { - // Set fast motion search level. - x->fast_ms = 1; - - if (ref0 == ref1 && ref1 == ref2 && ref2 == ref3 && - d01 < 2 && d23 < 2 && d02 < 2 && d13 < 2) { - // Set fast motion search level. - x->fast_ms = 2; - - if (!d01 && !d23 && !d02 && !d13) { - x->fast_ms = 3; - x->subblock_ref = ref0; - } - } - } - } + // When use_square_partition_only is true, make sure at least one square + // partition is allowed by selecting the next smaller square size as + // *min_block_size. 
+ if (cpi->sf.use_square_partition_only && + (*max_block_size - *min_block_size) < 2) { + *min_block_size = next_square_size[*min_block_size]; } } @@ -1720,8 +1696,6 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, bsize >= BLOCK_8X8; int partition_vert_allowed = !force_horz_split && xss <= yss && bsize >= BLOCK_8X8; - - int partition_split_done = 0; (void) *tp_orig; if (bsize < BLOCK_8X8) { @@ -1863,18 +1837,9 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, if (cpi->sf.less_rectangular_check) do_rect &= !partition_none_allowed; } - partition_split_done = 1; restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); } - x->fast_ms = 0; - x->subblock_ref = 0; - - if (partition_split_done && - cpi->sf.using_small_partition_info) { - compute_fast_motion_search_level(cpi, bsize); - } - // PARTITION_HORZ if (partition_horz_allowed && do_rect) { subsize = get_subsize(bsize, PARTITION_HORZ); @@ -1979,7 +1944,11 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); } - + // TODO(jbb): This code added so that we avoid static analysis + // warning related to the fact that best_rd isn't used after this + // point. This code should be refactored so that the duplicate + // checks occur in some sub function and thus are used... + (void) best_rd; *rate = best_rate; *dist = best_dist; @@ -1997,49 +1966,14 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, if (bsize == BLOCK_64X64) { assert(tp_orig < *tp); assert(best_rate < INT_MAX); - assert(best_dist < INT_MAX); + assert(best_dist < INT64_MAX); } else { assert(tp_orig == *tp); } } -// Examines 64x64 block and chooses a best reference frame -static void rd_pick_reference_frame(VP9_COMP *cpi, const TileInfo *const tile, - int mi_row, int mi_col) { - VP9_COMMON * const cm = &cpi->common; - MACROBLOCK * const x = &cpi->mb; - int bsl = b_width_log2(BLOCK_64X64), bs = 1 << bsl; - int ms = bs / 2; - ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; - PARTITION_CONTEXT sl[8], sa[8]; - int pl; - int r; - int64_t d; - - save_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_64X64); - - // Default is non mask (all reference frames allowed. - cpi->ref_frame_mask = 0; - - // Do RD search for 64x64. 
- if ((mi_row + (ms >> 1) < cm->mi_rows) && - (mi_col + (ms >> 1) < cm->mi_cols)) { - cpi->set_ref_frame_mask = 1; - rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &r, &d, BLOCK_64X64, - get_block_context(x, BLOCK_64X64), INT64_MAX); - pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, - mi_row, mi_col, BLOCK_64X64); - r += x->partition_cost[pl][PARTITION_NONE]; - - *(get_sb_partitioning(x, BLOCK_64X64)) = BLOCK_64X64; - cpi->set_ref_frame_mask = 0; - } - - restore_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_64X64); -} - -static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile, - int mi_row, TOKENEXTRA **tp) { +static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, + int mi_row, TOKENEXTRA **tp) { VP9_COMMON *const cm = &cpi->common; int mi_col; @@ -2055,28 +1989,45 @@ static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile, BLOCK_SIZE i; MACROBLOCK *x = &cpi->mb; - for (i = BLOCK_4X4; i < BLOCK_8X8; ++i) { - const int num_4x4_w = num_4x4_blocks_wide_lookup[i]; - const int num_4x4_h = num_4x4_blocks_high_lookup[i]; - const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h); - for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) - for (x->mb_index = 0; x->mb_index < 4; ++x->mb_index) - for (x->b_index = 0; x->b_index < 16 / num_4x4_blk; ++x->b_index) - get_block_context(x, i)->pred_interp_filter = SWITCHABLE; + + if (cpi->sf.adaptive_pred_interp_filter) { + for (i = BLOCK_4X4; i < BLOCK_8X8; ++i) { + const int num_4x4_w = num_4x4_blocks_wide_lookup[i]; + const int num_4x4_h = num_4x4_blocks_high_lookup[i]; + const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h); + for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) + for (x->mb_index = 0; x->mb_index < 4; ++x->mb_index) + for (x->b_index = 0; x->b_index < 16 / num_4x4_blk; ++x->b_index) + get_block_context(x, i)->pred_interp_filter = SWITCHABLE; + } } vp9_zero(cpi->mb.pred_mv); - if (cpi->sf.use_lastframe_partitioning || - cpi->sf.use_one_partition_size_always ) { + if ((cpi->sf.partition_search_type == SEARCH_PARTITION && + cpi->sf.use_lastframe_partitioning) || + cpi->sf.partition_search_type == FIXED_PARTITION || + cpi->sf.partition_search_type == VAR_BASED_FIXED_PARTITION) { const int idx_str = cm->mode_info_stride * mi_row + mi_col; MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str; MODE_INFO **prev_mi_8x8 = cm->prev_mi_grid_visible + idx_str; cpi->mb.source_variance = UINT_MAX; - if (cpi->sf.use_one_partition_size_always) { + if (cpi->sf.partition_search_type == FIXED_PARTITION) { + set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64); + set_partitioning(cpi, tile, mi_8x8, mi_row, mi_col, + cpi->sf.always_this_block_size); + rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, + &dummy_rate, &dummy_dist, 1); + } else if (cpi->sf.partition_search_type == VAR_BASED_FIXED_PARTITION || + cpi->sf.partition_search_type == VAR_BASED_PARTITION) { + // TODO(debargha): Implement VAR_BASED_PARTITION as a separate case. + // Currently both VAR_BASED_FIXED_PARTITION/VAR_BASED_PARTITION + // map to the same thing. 
+ BLOCK_SIZE bsize; set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64); - set_partitioning(cpi, tile, mi_8x8, mi_row, mi_col); + bsize = get_rd_var_based_fixed_partition(cpi, mi_row, mi_col); + set_partitioning(cpi, tile, mi_8x8, mi_row, mi_col, bsize); rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rate, &dummy_dist, 1); } else { @@ -2183,118 +2134,6 @@ static void switch_tx_mode(VP9_COMP *cpi) { cpi->common.tx_mode = ALLOW_32X32; } -static void encode_frame_internal(VP9_COMP *cpi) { - int mi_row; - MACROBLOCK *const x = &cpi->mb; - VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &x->e_mbd; - -// fprintf(stderr, "encode_frame_internal frame %d (%d) type %d\n", -// cpi->common.current_video_frame, cpi->common.show_frame, -// cm->frame_type); - -// debug output -#if DBG_PRNT_SEGMAP - { - FILE *statsfile; - statsfile = fopen("segmap2.stt", "a"); - fprintf(statsfile, "\n"); - fclose(statsfile); - } -#endif - - vp9_zero(cm->counts.switchable_interp); - vp9_zero(cpi->tx_stepdown_count); - - xd->mi_8x8 = cm->mi_grid_visible; - // required for vp9_frame_init_quantizer - xd->mi_8x8[0] = cm->mi; - - xd->last_mi = cm->prev_mi; - - vp9_zero(cm->counts.mv); - vp9_zero(cpi->coef_counts); - vp9_zero(cm->counts.eob_branch); - - cpi->mb.e_mbd.lossless = cm->base_qindex == 0 && cm->y_dc_delta_q == 0 - && cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0; - switch_lossless_mode(cpi, cpi->mb.e_mbd.lossless); - - vp9_frame_init_quantizer(cpi); - - vp9_initialize_rd_consts(cpi); - vp9_initialize_me_consts(cpi, cm->base_qindex); - switch_tx_mode(cpi); - - if (cpi->oxcf.tuning == VP8_TUNE_SSIM) { - // Initialize encode frame context. - init_encode_frame_mb_context(cpi); - - // Build a frame level activity map - build_activity_map(cpi); - } - - // Re-initialize encode frame context. 
- init_encode_frame_mb_context(cpi); - - vp9_zero(cpi->rd_comp_pred_diff); - vp9_zero(cpi->rd_filter_diff); - vp9_zero(cpi->rd_tx_select_diff); - vp9_zero(cpi->rd_tx_select_threshes); - - set_prev_mi(cm); - - { - struct vpx_usec_timer emr_timer; - vpx_usec_timer_start(&emr_timer); - - { - // Take tiles into account and give start/end MB - int tile_col, tile_row; - TOKENEXTRA *tp = cpi->tok; - const int tile_cols = 1 << cm->log2_tile_cols; - const int tile_rows = 1 << cm->log2_tile_rows; - - for (tile_row = 0; tile_row < tile_rows; tile_row++) { - for (tile_col = 0; tile_col < tile_cols; tile_col++) { - TileInfo tile; - TOKENEXTRA *tp_old = tp; - - // For each row of SBs in the frame - vp9_tile_init(&tile, cm, tile_row, tile_col); - for (mi_row = tile.mi_row_start; - mi_row < tile.mi_row_end; mi_row += 8) - encode_sb_row(cpi, &tile, mi_row, &tp); - - cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old); - assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols)); - } - } - } - - vpx_usec_timer_mark(&emr_timer); - cpi->time_encode_sb_row += vpx_usec_timer_elapsed(&emr_timer); - } - - if (cpi->sf.skip_encode_sb) { - int j; - unsigned int intra_count = 0, inter_count = 0; - for (j = 0; j < INTRA_INTER_CONTEXTS; ++j) { - intra_count += cm->counts.intra_inter[j][0]; - inter_count += cm->counts.intra_inter[j][1]; - } - cpi->sf.skip_encode_frame = ((intra_count << 2) < inter_count); - cpi->sf.skip_encode_frame &= (cm->frame_type != KEY_FRAME); - cpi->sf.skip_encode_frame &= cm->show_frame; - } else { - cpi->sf.skip_encode_frame = 0; - } - -#if 0 - // Keep record of the total distortion this time around for future use - cpi->last_frame_distortion = cpi->frame_distortion; -#endif -} static int check_dual_ref_flags(VP9_COMP *cpi) { const int ref_flags = cpi->ref_frame_flags; @@ -2312,7 +2151,7 @@ static int get_skip_flag(MODE_INFO **mi_8x8, int mis, int ymbs, int xmbs) { for (y = 0; y < ymbs; y++) { for (x = 0; x < xmbs; x++) { - if (!mi_8x8[y * mis + x]->mbmi.skip_coeff) + if (!mi_8x8[y * mis + x]->mbmi.skip) return 0; } } @@ -2443,6 +2282,7 @@ static void select_tx_mode(VP9_COMP *cpi) { } } } + // Start RTC Exploration typedef enum { BOTH_ZERO = 0, @@ -2470,98 +2310,75 @@ static void set_mode_info(MB_MODE_INFO *mbmi, BLOCK_SIZE bsize, mbmi->ref_frame[1] = INTRA_FRAME; mbmi->tx_size = max_txsize_lookup[bsize]; mbmi->uv_mode = mode; - mbmi->skip_coeff = 0; + mbmi->skip = 0; mbmi->sb_type = bsize; mbmi->segment_id = 0; } + static INLINE int get_block_row(int b32i, int b16i, int b8i) { return ((b32i >> 1) << 2) + ((b16i >> 1) << 1) + (b8i >> 1); } + static INLINE int get_block_col(int b32i, int b16i, int b8i) { return ((b32i & 1) << 2) + ((b16i & 1) << 1) + (b8i & 1); } -static void rtc_use_partition(VP9_COMP *cpi, - const TileInfo *const tile, - MODE_INFO **mi_8x8, - TOKENEXTRA **tp, int mi_row, int mi_col, - BLOCK_SIZE bsize, int *rate, int64_t *dist, - int do_recon) { + +static void nonrd_use_partition(VP9_COMP *cpi, const TileInfo *const tile, + TOKENEXTRA **tp, int mi_row, int mi_col, + BLOCK_SIZE bsize, int *rate, int64_t *dist) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &cpi->mb.e_mbd; - const int mis = cm->mode_info_stride; - int mi_width = num_8x8_blocks_wide_lookup[cpi->sf.always_this_block_size]; - int mi_height = num_8x8_blocks_high_lookup[cpi->sf.always_this_block_size]; + int mis = cm->mode_info_stride; + int br, bc; int i, j; int chosen_rate = INT_MAX; - int64_t chosen_dist = INT_MAX; + int64_t chosen_dist = 
INT64_MAX; MB_PREDICTION_MODE mode = DC_PRED; - int row8x8_remaining = tile->mi_row_end - mi_row; - int col8x8_remaining = tile->mi_col_end - mi_col; - int b32i; - x->fast_ms = 0; - x->subblock_ref = 0; - for (b32i = 0; b32i < 4; b32i++) { - int b16i; - for (b16i = 0; b16i < 4; b16i++) { - int b8i; - int block_row = get_block_row(b32i, b16i, 0); - int block_col = get_block_col(b32i, b16i, 0); - int index = block_row * mis + block_col; - int rate; - int64_t dist; - - int_mv frame_nearest_mv[MAX_REF_FRAMES]; - int_mv frame_near_mv[MAX_REF_FRAMES]; - struct buf_2d yv12_mb[MAX_REF_FRAMES][MAX_MB_PLANE]; - - // Find a partition size that fits - bsize = find_partition_size(cpi->sf.always_this_block_size, - (row8x8_remaining - block_row), - (col8x8_remaining - block_col), - &mi_height, &mi_width); - mi_8x8[index] = mi_8x8[0] + index; - - set_mi_row_col(xd, tile, mi_row + block_row, mi_height, - mi_col + block_col, mi_width, cm->mi_rows, cm->mi_cols); - - xd->mi_8x8 = mi_8x8 + index; - - if (cm->frame_type != KEY_FRAME) { - set_offsets(cpi, tile, mi_row + block_row, mi_col + block_col, bsize); - - vp9_pick_inter_mode(cpi, x, tile, - mi_row + block_row, mi_col + block_col, - &rate, &dist, bsize); - } else { - set_mode_info(&mi_8x8[index]->mbmi, bsize, mode, - mi_row + block_row, mi_col + block_col); - vp9_setup_buffer_inter(cpi, x, tile, - LAST_FRAME, cpi->sf.always_this_block_size, - mi_row + block_row, mi_col + block_col, - frame_nearest_mv, frame_near_mv, yv12_mb); - } + int rows = MIN(MI_BLOCK_SIZE, tile->mi_row_end - mi_row); + int cols = MIN(MI_BLOCK_SIZE, tile->mi_col_end - mi_col); - for (j = 0; j < mi_height; j++) - for (i = 0; i < mi_width; i++) - if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > i - && (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > j) { - mi_8x8[index+ i + j * mis] = mi_8x8[index]; - } + int bw = num_8x8_blocks_wide_lookup[bsize]; + int bh = num_8x8_blocks_high_lookup[bsize]; - for (b8i = 0; b8i < 4; b8i++) { - } + int brate = 0; + int64_t bdist = 0; + *rate = 0; + *dist = 0; + + // find prediction mode for each 8x8 block + for (br = 0; br < rows; br += bh) { + for (bc = 0; bc < cols; bc += bw) { + int row = mi_row + br; + int col = mi_col + bc; + + BLOCK_SIZE bs = find_partition_size(bsize, rows - br, cols - bc, + &bh, &bw); + set_offsets(cpi, tile, row, col, bs); + + if (cm->frame_type != KEY_FRAME) + vp9_pick_inter_mode(cpi, x, tile, row, col, &brate, &bdist, bs); + else + set_mode_info(&xd->mi_8x8[0]->mbmi, bs, mode, row, col); + + *rate += brate; + *dist += bdist; + + for (j = 0; j < bh; ++j) + for (i = 0; i < bw; ++i) { + xd->mi_8x8[j * mis + i] = xd->mi_8x8[0]; + } } } - encode_sb_rt(cpi, tile, tp, mi_row, mi_col, 1, BLOCK_64X64); *rate = chosen_rate; *dist = chosen_dist; + + encode_sb_rt(cpi, tile, tp, mi_row, mi_col, 1, BLOCK_64X64); } -static void encode_rtc_sb_row(VP9_COMP *cpi, const TileInfo *const tile, - int mi_row, TOKENEXTRA **tp) { - VP9_COMMON * const cm = &cpi->common; +static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, + int mi_row, TOKENEXTRA **tp) { int mi_col; // Initialize the left context for the new SB row @@ -2574,38 +2391,39 @@ static void encode_rtc_sb_row(VP9_COMP *cpi, const TileInfo *const tile, int dummy_rate; int64_t dummy_dist; - const int idx_str = cm->mode_info_stride * mi_row + mi_col; - MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str; - cpi->mb.source_variance = UINT_MAX; - set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64); - set_partitioning(cpi, tile, mi_8x8, mi_row, mi_col); 
- rtc_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, - &dummy_rate, &dummy_dist, 1); + + if (cpi->sf.partition_search_type == FIXED_PARTITION) { + nonrd_use_partition(cpi, tile, tp, mi_row, mi_col, + cpi->sf.always_this_block_size, + &dummy_rate, &dummy_dist); + } else if (cpi->sf.partition_search_type == VAR_BASED_FIXED_PARTITION || + cpi->sf.partition_search_type == VAR_BASED_PARTITION) { + // TODO(debargha): Implement VAR_BASED_PARTITION as a separate case. + // Currently both VAR_BASED_FIXED_PARTITION/VAR_BASED_PARTITION + // map to the same thing. + BLOCK_SIZE bsize = get_nonrd_var_based_fixed_partition(cpi, + mi_row, + mi_col); + nonrd_use_partition(cpi, tile, tp, mi_row, mi_col, + bsize, &dummy_rate, &dummy_dist); + } else { + assert(0); + } } } +// end RTC play code - -static void encode_rtc_frame_internal(VP9_COMP *cpi) { +static void encode_frame_internal(VP9_COMP *cpi) { int mi_row; - MACROBLOCK * const x = &cpi->mb; - VP9_COMMON * const cm = &cpi->common; - MACROBLOCKD * const xd = &x->e_mbd; + MACROBLOCK *const x = &cpi->mb; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; // fprintf(stderr, "encode_frame_internal frame %d (%d) type %d\n", // cpi->common.current_video_frame, cpi->common.show_frame, // cm->frame_type); -// debug output -#if DBG_PRNT_SEGMAP - { - FILE *statsfile; - statsfile = fopen("segmap2.stt", "a"); - fprintf(statsfile, "\n"); - fclose(statsfile); - } -#endif - vp9_zero(cm->counts.switchable_interp); vp9_zero(cpi->tx_stepdown_count); @@ -2615,7 +2433,7 @@ static void encode_rtc_frame_internal(VP9_COMP *cpi) { xd->last_mi = cm->prev_mi; - vp9_zero(cpi->common.counts.mv); + vp9_zero(cm->counts.mv); vp9_zero(cpi->coef_counts); vp9_zero(cm->counts.eob_branch); @@ -2628,7 +2446,6 @@ static void encode_rtc_frame_internal(VP9_COMP *cpi) { vp9_initialize_rd_consts(cpi); vp9_initialize_me_consts(cpi, cm->base_qindex); switch_tx_mode(cpi); - cpi->sf.always_this_block_size = BLOCK_16X16; if (cpi->oxcf.tuning == VP8_TUNE_SSIM) { // Initialize encode frame context. @@ -2648,6 +2465,22 @@ static void encode_rtc_frame_internal(VP9_COMP *cpi) { set_prev_mi(cm); + if (cpi->sf.use_nonrd_pick_mode) { + // Initialize internal buffer pointers for rtc coding, where non-RD + // mode decision is used and hence no buffer pointer swap needed. 
+ int i; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = xd->plane; + PICK_MODE_CONTEXT *ctx = &cpi->mb.sb64_context; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + p[i].coeff = ctx->coeff_pbuf[i][0]; + p[i].qcoeff = ctx->qcoeff_pbuf[i][0]; + pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][0]; + p[i].eobs = ctx->eobs_pbuf[i][0]; + } + } + { struct vpx_usec_timer emr_timer; vpx_usec_timer_start(&emr_timer); @@ -2667,9 +2500,12 @@ static void encode_rtc_frame_internal(VP9_COMP *cpi) { // For each row of SBs in the frame vp9_tile_init(&tile, cm, tile_row, tile_col); for (mi_row = tile.mi_row_start; - mi_row < tile.mi_row_end; mi_row += 8) - encode_rtc_sb_row(cpi, &tile, mi_row, &tp); - + mi_row < tile.mi_row_end; mi_row += MI_BLOCK_SIZE) { + if (cpi->sf.use_nonrd_pick_mode) + encode_nonrd_sb_row(cpi, &tile, mi_row, &tp); + else + encode_rd_sb_row(cpi, &tile, mi_row, &tp); + } cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old); assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols)); } @@ -2699,8 +2535,6 @@ static void encode_rtc_frame_internal(VP9_COMP *cpi) { cpi->last_frame_distortion = cpi->frame_distortion; #endif } -// end RTC play code - void vp9_encode_frame(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; @@ -2725,7 +2559,7 @@ void vp9_encode_frame(VP9_COMP *cpi) { } } - if (cpi->sf.RD) { + if (cpi->sf.frame_parameter_update) { int i; REFERENCE_MODE reference_mode; /* @@ -2775,10 +2609,7 @@ void vp9_encode_frame(VP9_COMP *cpi) { select_tx_mode(cpi); cm->reference_mode = reference_mode; - if (cpi->sf.super_fast_rtc) - encode_rtc_frame_internal(cpi); - else - encode_frame_internal(cpi); + encode_frame_internal(cpi); for (i = 0; i < REFERENCE_MODES; ++i) { const int diff = (int) (cpi->rd_comp_pred_diff[i] / cm->MBs); @@ -2858,10 +2689,7 @@ void vp9_encode_frame(VP9_COMP *cpi) { } else { // Force the usage of the BILINEAR interp_filter. 
cm->interp_filter = BILINEAR; - if (cpi->sf.super_fast_rtc) - encode_rtc_frame_internal(cpi); - else - encode_frame_internal(cpi); + encode_frame_internal(cpi); } } @@ -2936,9 +2764,10 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, const int mis = cm->mode_info_stride; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const int mi_height = num_8x8_blocks_high_lookup[bsize]; + x->skip_recode = !x->select_txfm_size && mbmi->sb_type >= BLOCK_8X8 && (cpi->oxcf.aq_mode != COMPLEXITY_AQ) && - !cpi->sf.super_fast_rtc; + !cpi->sf.use_nonrd_pick_mode; x->skip_optimize = ctx->is_coded; ctx->is_coded = 1; x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct; @@ -2969,11 +2798,13 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, } if (!is_inter_block(mbmi)) { - mbmi->skip_coeff = 1; - vp9_encode_intra_block_y(x, MAX(bsize, BLOCK_8X8)); - vp9_encode_intra_block_uv(x, MAX(bsize, BLOCK_8X8)); + int plane; + mbmi->skip = 1; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) + vp9_encode_intra_block_plane(x, MAX(bsize, BLOCK_8X8), plane); if (output_enabled) sum_intra_stats(&cm->counts, mi); + vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8)); } else { int ref; const int is_compound = has_second_ref(mbmi); @@ -2983,26 +2814,24 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, setup_pre_planes(xd, ref, cfg, mi_row, mi_col, &xd->block_refs[ref]->sf); } vp9_build_inter_predictors_sb(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8)); - } - if (!is_inter_block(mbmi)) { - vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8)); - } else if (!x->skip) { - mbmi->skip_coeff = 1; - vp9_encode_sb(x, MAX(bsize, BLOCK_8X8)); - vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8)); - } else { - mbmi->skip_coeff = 1; - if (output_enabled) - cm->counts.skip[vp9_get_skip_context(xd)][1]++; - reset_skip_context(xd, MAX(bsize, BLOCK_8X8)); + if (!x->skip) { + mbmi->skip = 1; + vp9_encode_sb(x, MAX(bsize, BLOCK_8X8)); + vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8)); + } else { + mbmi->skip = 1; + if (output_enabled) + cm->counts.skip[vp9_get_skip_context(xd)][1]++; + reset_skip_context(xd, MAX(bsize, BLOCK_8X8)); + } } if (output_enabled) { if (cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8 && !(is_inter_block(mbmi) && - (mbmi->skip_coeff || + (mbmi->skip || vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)))) { ++get_tx_counts(max_txsize_lookup[bsize], vp9_get_tx_size_context(xd), &cm->counts.tx)[mbmi->tx_size]; diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 376a899e0..13eabe05d 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -19,29 +19,39 @@ #include "vp9/common/vp9_reconintra.h" #include "vp9/common/vp9_systemdependent.h" -#include "vp9/encoder/vp9_dct.h" #include "vp9/encoder/vp9_encodemb.h" #include "vp9/encoder/vp9_quantize.h" #include "vp9/encoder/vp9_rdopt.h" #include "vp9/encoder/vp9_tokenize.h" +struct optimize_ctx { + ENTROPY_CONTEXT ta[MAX_MB_PLANE][16]; + ENTROPY_CONTEXT tl[MAX_MB_PLANE][16]; +}; + +struct encode_b_args { + MACROBLOCK *x; + struct optimize_ctx *ctx; + unsigned char *skip; +}; + void vp9_subtract_block_c(int rows, int cols, - int16_t *diff_ptr, ptrdiff_t diff_stride, - const uint8_t *src_ptr, ptrdiff_t src_stride, - const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + int16_t *diff, ptrdiff_t diff_stride, + const uint8_t *src, ptrdiff_t src_stride, + const uint8_t *pred, ptrdiff_t 
pred_stride) { int r, c; for (r = 0; r < rows; r++) { for (c = 0; c < cols; c++) - diff_ptr[c] = src_ptr[c] - pred_ptr[c]; + diff[c] = src[c] - pred[c]; - diff_ptr += diff_stride; - pred_ptr += pred_stride; - src_ptr += src_stride; + diff += diff_stride; + pred += pred_stride; + src += src_stride; } } -static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { +void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane]; const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); @@ -52,22 +62,6 @@ static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { pd->dst.buf, pd->dst.stride); } -void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE bsize) { - subtract_plane(x, bsize, 0); -} - -void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE bsize) { - int i; - - for (i = 1; i < MAX_MB_PLANE; i++) - subtract_plane(x, bsize, i); -} - -void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE bsize) { - vp9_subtract_sby(x, bsize); - vp9_subtract_sbuv(x, bsize); -} - #define RDTRUNC(RM, DM, R, D) ((128 + (R) * (RM)) & 0xFF) typedef struct vp9_token_state vp9_token_state; @@ -111,19 +105,18 @@ static int trellis_get_coeff_context(const int16_t *scan, return pt; } -static void optimize_b(MACROBLOCK *mb, - int plane, int block, BLOCK_SIZE plane_bsize, - ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, - TX_SIZE tx_size) { +static void optimize_b(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, MACROBLOCK *mb, + struct optimize_ctx *ctx) { MACROBLOCKD *const xd = &mb->e_mbd; struct macroblock_plane *p = &mb->plane[plane]; struct macroblockd_plane *pd = &xd->plane[plane]; const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi); vp9_token_state tokens[1025][2]; unsigned best_index[1025][2]; - const int16_t *coeff_ptr = BLOCK_OFFSET(mb->plane[plane].coeff, block); - int16_t *qcoeff_ptr; - int16_t *dqcoeff_ptr; + const int16_t *coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block); + int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); + int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); int eob = p->eobs[block], final_eob, sz = 0; const int i0 = 0; int rc, x, next, i; @@ -133,7 +126,6 @@ static void optimize_b(MACROBLOCK *mb, PLANE_TYPE type = pd->plane_type; int err_mult = plane_rd_mult[type]; const int default_eob = 16 << (tx_size << 1); - const int mul = 1 + (tx_size == TX_32X32); uint8_t token_cache[1024]; const int16_t *dequant_ptr = pd->dequant; @@ -141,10 +133,13 @@ static void optimize_b(MACROBLOCK *mb, const scan_order *so = get_scan(xd, tx_size, type, block); const int16_t *scan = so->scan; const int16_t *nb = so->neighbors; + ENTROPY_CONTEXT *a, *l; + int tx_x, tx_y; + txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &tx_x, &tx_y); + a = &ctx->ta[plane][tx_x]; + l = &ctx->tl[plane][tx_y]; assert((!type && !plane) || (type && plane)); - dqcoeff_ptr = BLOCK_OFFSET(pd->dqcoeff, block); - qcoeff_ptr = BLOCK_OFFSET(p->qcoeff, block); assert(eob <= default_eob); /* Now set up a Viterbi trellis to evaluate alternative roundings. */ @@ -162,13 +157,13 @@ static void optimize_b(MACROBLOCK *mb, next = eob; for (i = 0; i < eob; i++) token_cache[scan[i]] = vp9_pt_energy_class[vp9_dct_value_tokens_ptr[ - qcoeff_ptr[scan[i]]].token]; + qcoeff[scan[i]]].token]; for (i = eob; i-- > i0;) { int base_bits, d2, dx; rc = scan[i]; - x = qcoeff_ptr[rc]; + x = qcoeff[rc]; /* Only add a trellis state for non-zero coefficients. 
*/ if (x) { int shortcut = 0; @@ -193,7 +188,7 @@ static void optimize_b(MACROBLOCK *mb, /* And pick the best. */ best = rd_cost1 < rd_cost0; base_bits = *(vp9_dct_value_cost_ptr + x); - dx = mul * (dqcoeff_ptr[rc] - coeff_ptr[rc]); + dx = mul * (dqcoeff[rc] - coeff[rc]); d2 = dx * dx; tokens[i][0].rate = base_bits + (best ? rate1 : rate0); tokens[i][0].error = d2 + (best ? error1 : error0); @@ -206,8 +201,8 @@ static void optimize_b(MACROBLOCK *mb, rate0 = tokens[next][0].rate; rate1 = tokens[next][1].rate; - if ((abs(x)*dequant_ptr[rc != 0] > abs(coeff_ptr[rc]) * mul) && - (abs(x)*dequant_ptr[rc != 0] < abs(coeff_ptr[rc]) * mul + + if ((abs(x)*dequant_ptr[rc != 0] > abs(coeff[rc]) * mul) && + (abs(x)*dequant_ptr[rc != 0] < abs(coeff[rc]) * mul + dequant_ptr[rc != 0])) shortcut = 1; else @@ -296,16 +291,16 @@ static void optimize_b(MACROBLOCK *mb, UPDATE_RD_COST(); best = rd_cost1 < rd_cost0; final_eob = i0 - 1; - vpx_memset(qcoeff_ptr, 0, sizeof(*qcoeff_ptr) * (16 << (tx_size * 2))); - vpx_memset(dqcoeff_ptr, 0, sizeof(*dqcoeff_ptr) * (16 << (tx_size * 2))); + vpx_memset(qcoeff, 0, sizeof(*qcoeff) * (16 << (tx_size * 2))); + vpx_memset(dqcoeff, 0, sizeof(*dqcoeff) * (16 << (tx_size * 2))); for (i = next; i < eob; i = next) { x = tokens[i][best].qc; if (x) { final_eob = i; } rc = scan[i]; - qcoeff_ptr[rc] = x; - dqcoeff_ptr[rc] = (x * dequant_ptr[rc != 0]) / mul; + qcoeff[rc] = x; + dqcoeff[rc] = (x * dequant_ptr[rc != 0]) / mul; next = tokens[i][best].next; best = best_index[i][best]; @@ -316,60 +311,39 @@ static void optimize_b(MACROBLOCK *mb, *a = *l = (final_eob > 0); } -void vp9_optimize_b(int plane, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, MACROBLOCK *mb, struct optimize_ctx *ctx) { - int x, y; - txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y); - optimize_b(mb, plane, block, plane_bsize, - &ctx->ta[plane][x], &ctx->tl[plane][y], tx_size); +static INLINE void fdct32x32(int rd_transform, + const int16_t *src, int16_t *dst, int src_stride) { + if (rd_transform) + vp9_fdct32x32_rd(src, dst, src_stride); + else + vp9_fdct32x32(src, dst, src_stride); } -static void optimize_init_b(int plane, BLOCK_SIZE bsize, - struct encode_b_args *args) { - const MACROBLOCKD *xd = &args->x->e_mbd; - const struct macroblockd_plane* const pd = &xd->plane[plane]; - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); - const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; - const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; - const MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; - const TX_SIZE tx_size = plane ? 
get_uv_tx_size(mbmi) : mbmi->tx_size; - - vp9_get_entropy_contexts(tx_size, args->ctx->ta[plane], args->ctx->tl[plane], - pd->above_context, pd->left_context, - num_4x4_w, num_4x4_h); -} -void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *arg) { - struct encode_b_args* const args = arg; - MACROBLOCK* const x = args->x; - MACROBLOCKD* const xd = &x->e_mbd; - struct macroblock_plane *const p = &x->plane[plane]; - struct macroblockd_plane *const pd = &xd->plane[plane]; - int16_t *coeff = BLOCK_OFFSET(p->coeff, block); - int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); - int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - const scan_order *scan_order; - uint16_t *eob = &p->eobs[block]; +void vp9_xform_quant(MACROBLOCK *x, int plane, int block, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; + int16_t *const coeff = BLOCK_OFFSET(p->coeff, block); + int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint16_t *const eob = &p->eobs[block]; const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; int i, j; - int16_t *src_diff; + const int16_t *src_diff; txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j); src_diff = &p->src_diff[4 * (j * diff_stride + i)]; switch (tx_size) { case TX_32X32: - scan_order = &vp9_default_scan_orders[TX_32X32]; - if (x->use_lp32x32fdct) - vp9_fdct32x32_rd(src_diff, coeff, diff_stride); - else - vp9_fdct32x32(src_diff, coeff, diff_stride); + fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan_order->scan, scan_order->iscan); break; case TX_16X16: - scan_order = &vp9_default_scan_orders[TX_16X16]; vp9_fdct16x16(src_diff, coeff, diff_stride); vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, @@ -377,7 +351,6 @@ void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize, scan_order->scan, scan_order->iscan); break; case TX_8X8: - scan_order = &vp9_default_scan_orders[TX_8X8]; vp9_fdct8x8(src_diff, coeff, diff_stride); vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, @@ -385,7 +358,6 @@ void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize, scan_order->scan, scan_order->iscan); break; case TX_4X4: - scan_order = &vp9_default_scan_orders[TX_4X4]; x->fwd_txm4x4(src_diff, coeff, diff_stride); vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, @@ -421,17 +393,17 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, } if (!x->skip_recode) - vp9_xform_quant(plane, block, plane_bsize, tx_size, arg); + vp9_xform_quant(x, plane, block, plane_bsize, tx_size); if (x->optimize && (!x->skip_recode || !x->skip_optimize)) { - vp9_optimize_b(plane, block, plane_bsize, tx_size, x, ctx); + optimize_b(plane, block, plane_bsize, tx_size, x, ctx); } else { ctx->ta[plane][i] = p->eobs[block] > 0; ctx->tl[plane][j] = p->eobs[block] > 0; } if (p->eobs[block]) - *(args->skip_coeff) = 0; + *(args->skip) = 0; if (x->skip_encode || p->eobs[block] == 0) return; @@ -458,8 +430,7 @@ static void 
encode_block(int plane, int block, BLOCK_SIZE plane_bsize, } static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { - struct encode_b_args *const args = arg; - MACROBLOCK *const x = args->x; + MACROBLOCK *const x = (MACROBLOCK *)arg; MACROBLOCKD *const xd = &x->e_mbd; struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; @@ -469,48 +440,43 @@ static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize, txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j); dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i]; - vp9_xform_quant(plane, block, plane_bsize, tx_size, arg); + vp9_xform_quant(x, plane, block, plane_bsize, tx_size); - if (p->eobs[block] == 0) - return; - - xd->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); + if (p->eobs[block] > 0) + xd->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); } -void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize) { - MACROBLOCKD *const xd = &x->e_mbd; - struct optimize_ctx ctx; - MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; - struct encode_b_args arg = {x, &ctx, &mbmi->skip_coeff}; - - vp9_subtract_sby(x, bsize); - if (x->optimize) - optimize_init_b(0, bsize, &arg); - - vp9_foreach_transformed_block_in_plane(xd, bsize, 0, encode_block_pass1, - &arg); +void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) { + vp9_subtract_plane(x, bsize, 0); + vp9_foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0, + encode_block_pass1, x); } void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) { MACROBLOCKD *const xd = &x->e_mbd; struct optimize_ctx ctx; MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; - struct encode_b_args arg = {x, &ctx, &mbmi->skip_coeff}; - - if (!x->skip_recode) - vp9_subtract_sb(x, bsize); + struct encode_b_args arg = {x, &ctx, &mbmi->skip}; + int plane; + + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + if (!x->skip_recode) + vp9_subtract_plane(x, bsize, plane); + + if (x->optimize && (!x->skip_recode || !x->skip_optimize)) { + const struct macroblockd_plane* const pd = &xd->plane[plane]; + const TX_SIZE tx_size = plane ? 
get_uv_tx_size(mbmi) : mbmi->tx_size; + vp9_get_entropy_contexts(bsize, tx_size, pd, + ctx.ta[plane], ctx.tl[plane]); + } - if (x->optimize && (!x->skip_recode || !x->skip_optimize)) { - int i; - for (i = 0; i < MAX_MB_PLANE; ++i) - optimize_init_b(i, bsize, &arg); + vp9_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block, + &arg); } - - vp9_foreach_transformed_block(xd, bsize, encode_block, &arg); } -void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *arg) { +static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { struct encode_b_args* const args = arg; MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; @@ -528,14 +494,16 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, uint8_t *src, *dst; int16_t *src_diff; uint16_t *eob = &p->eobs[block]; + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; int i, j; txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j); - dst = &pd->dst.buf[4 * (j * pd->dst.stride + i)]; - src = &p->src.buf[4 * (j * p->src.stride + i)]; + dst = &pd->dst.buf[4 * (j * dst_stride + i)]; + src = &p->src.buf[4 * (j * src_stride + i)]; src_diff = &p->src_diff[4 * (j * diff_stride + i)]; // if (x->optimize) - // vp9_optimize_b(plane, block, plane_bsize, tx_size, x, args->ctx); + // optimize_b(plane, block, plane_bsize, tx_size, x, args->ctx); switch (tx_size) { case TX_32X32: @@ -543,22 +511,19 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, mode = plane == 0 ? mbmi->mode : mbmi->uv_mode; vp9_predict_intra_block(xd, block >> 6, bwl, TX_32X32, mode, x->skip_encode ? src : dst, - x->skip_encode ? p->src.stride : pd->dst.stride, - dst, pd->dst.stride, i, j, plane); + x->skip_encode ? src_stride : dst_stride, + dst, dst_stride, i, j, plane); if (!x->skip_recode) { vp9_subtract_block(32, 32, src_diff, diff_stride, - src, p->src.stride, dst, pd->dst.stride); - if (x->use_lp32x32fdct) - vp9_fdct32x32_rd(src_diff, coeff, diff_stride); - else - vp9_fdct32x32(src_diff, coeff, diff_stride); + src, src_stride, dst, dst_stride); + fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan_order->scan, scan_order->iscan); } if (!x->skip_encode && *eob) - vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, *eob); + vp9_idct32x32_add(dqcoeff, dst, dst_stride, *eob); break; case TX_16X16: tx_type = get_tx_type_16x16(pd->plane_type, xd); @@ -566,19 +531,19 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, mode = plane == 0 ? mbmi->mode : mbmi->uv_mode; vp9_predict_intra_block(xd, block >> 4, bwl, TX_16X16, mode, x->skip_encode ? src : dst, - x->skip_encode ? p->src.stride : pd->dst.stride, - dst, pd->dst.stride, i, j, plane); + x->skip_encode ? 
src_stride : dst_stride, + dst, dst_stride, i, j, plane); if (!x->skip_recode) { vp9_subtract_block(16, 16, src_diff, diff_stride, - src, p->src.stride, dst, pd->dst.stride); - vp9_fht16x16(tx_type, src_diff, coeff, diff_stride); + src, src_stride, dst, dst_stride); + vp9_fht16x16(src_diff, coeff, diff_stride, tx_type); vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan_order->scan, scan_order->iscan); } if (!x->skip_encode && *eob) - vp9_iht16x16_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob); + vp9_iht16x16_add(tx_type, dqcoeff, dst, dst_stride, *eob); break; case TX_8X8: tx_type = get_tx_type_8x8(pd->plane_type, xd); @@ -586,19 +551,19 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, mode = plane == 0 ? mbmi->mode : mbmi->uv_mode; vp9_predict_intra_block(xd, block >> 2, bwl, TX_8X8, mode, x->skip_encode ? src : dst, - x->skip_encode ? p->src.stride : pd->dst.stride, - dst, pd->dst.stride, i, j, plane); + x->skip_encode ? src_stride : dst_stride, + dst, dst_stride, i, j, plane); if (!x->skip_recode) { vp9_subtract_block(8, 8, src_diff, diff_stride, - src, p->src.stride, dst, pd->dst.stride); - vp9_fht8x8(tx_type, src_diff, coeff, diff_stride); + src, src_stride, dst, dst_stride); + vp9_fht8x8(src_diff, coeff, diff_stride, tx_type); vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan_order->scan, scan_order->iscan); } if (!x->skip_encode && *eob) - vp9_iht8x8_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob); + vp9_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob); break; case TX_4X4: tx_type = get_tx_type_4x4(pd->plane_type, xd, block); @@ -610,14 +575,14 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode, x->skip_encode ? src : dst, - x->skip_encode ? p->src.stride : pd->dst.stride, - dst, pd->dst.stride, i, j, plane); + x->skip_encode ? src_stride : dst_stride, + dst, dst_stride, i, j, plane); if (!x->skip_recode) { vp9_subtract_block(4, 4, src_diff, diff_stride, - src, p->src.stride, dst, pd->dst.stride); + src, src_stride, dst, dst_stride); if (tx_type != DCT_DCT) - vp9_short_fht4x4(src_diff, coeff, diff_stride, tx_type); + vp9_fht4x4(src_diff, coeff, diff_stride, tx_type); else x->fwd_txm4x4(src_diff, coeff, diff_stride); vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, @@ -631,33 +596,32 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. 
- xd->itxm_add(dqcoeff, dst, pd->dst.stride, *eob); + xd->itxm_add(dqcoeff, dst, dst_stride, *eob); else - vp9_iht4x4_16_add(dqcoeff, dst, pd->dst.stride, tx_type); + vp9_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type); } break; default: assert(0); } if (*eob) - *(args->skip_coeff) = 0; + *(args->skip) = 0; } -void vp9_encode_intra_block_y(MACROBLOCK *x, BLOCK_SIZE bsize) { - MACROBLOCKD* const xd = &x->e_mbd; - struct optimize_ctx ctx; - MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; - struct encode_b_args arg = {x, &ctx, &mbmi->skip_coeff}; - - vp9_foreach_transformed_block_in_plane(xd, bsize, 0, vp9_encode_block_intra, - &arg); +void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + unsigned char *skip) { + struct encode_b_args arg = {x, NULL, skip}; + encode_block_intra(plane, block, plane_bsize, tx_size, &arg); } -void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE bsize) { - MACROBLOCKD* const xd = &x->e_mbd; - struct optimize_ctx ctx; - MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; - struct encode_b_args arg = {x, &ctx, &mbmi->skip_coeff}; - vp9_foreach_transformed_block_uv(xd, bsize, vp9_encode_block_intra, &arg); + + +void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { + const MACROBLOCKD *const xd = &x->e_mbd; + struct encode_b_args arg = {x, NULL, &xd->mi_8x8[0]->mbmi.skip}; + + vp9_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block_intra, + &arg); } int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred) { @@ -668,6 +632,6 @@ int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred) { mbmi->tx_size = use_16x16_pred ? (mbmi->sb_type >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4; - vp9_encode_intra_block_y(x, mbmi->sb_type); + vp9_encode_intra_block_plane(x, mbmi->sb_type, 0); return vp9_get_mb_ss(x->plane[0].src_diff); } diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h index 9f6c9f069..dcf6e8759 100644 --- a/vp9/encoder/vp9_encodemb.h +++ b/vp9/encoder/vp9_encodemb.h @@ -20,32 +20,19 @@ extern "C" { #endif -struct optimize_ctx { - ENTROPY_CONTEXT ta[MAX_MB_PLANE][16]; - ENTROPY_CONTEXT tl[MAX_MB_PLANE][16]; -}; - -struct encode_b_args { - MACROBLOCK *x; - struct optimize_ctx *ctx; - unsigned char *skip_coeff; -}; - void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize); -void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize); +void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize); -void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *arg); +void vp9_xform_quant(MACROBLOCK *x, int plane, int block, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size); -void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE bsize); -void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE bsize); -void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE bsize); +void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane); -void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *arg); +void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + unsigned char *skip); -void vp9_encode_intra_block_y(MACROBLOCK *x, BLOCK_SIZE bsize); -void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE bsize); +void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane); int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred); diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c index af710a8f4..be6abc2a1 100644 --- a/vp9/encoder/vp9_encodemv.c +++ 
b/vp9/encoder/vp9_encodemv.c @@ -224,18 +224,11 @@ void vp9_encode_mv(VP9_COMP* cpi, vp9_writer* w, } } -void vp9_build_nmv_cost_table(int *mvjoint, - int *mvcost[2], - const nmv_context* const mvctx, - int usehp, - int mvc_flag_v, - int mvc_flag_h) { - vp9_clear_system_state(); - vp9_cost_tokens(mvjoint, mvctx->joints, vp9_mv_joint_tree); - if (mvc_flag_v) - build_nmv_component_cost_table(mvcost[0], &mvctx->comps[0], usehp); - if (mvc_flag_h) - build_nmv_component_cost_table(mvcost[1], &mvctx->comps[1], usehp); +void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2], + const nmv_context* ctx, int usehp) { + vp9_cost_tokens(mvjoint, ctx->joints, vp9_mv_joint_tree); + build_nmv_component_cost_table(mvcost[0], &ctx->comps[0], usehp); + build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], usehp); } static void inc_mvs(int_mv mv[2], int_mv ref[2], int is_compound, diff --git a/vp9/encoder/vp9_encodemv.h b/vp9/encoder/vp9_encodemv.h index f0463bbd3..7f997ff37 100644 --- a/vp9/encoder/vp9_encodemv.h +++ b/vp9/encoder/vp9_encodemv.h @@ -25,12 +25,8 @@ void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vp9_writer *w); void vp9_encode_mv(VP9_COMP *cpi, vp9_writer* w, const MV* mv, const MV* ref, const nmv_context* mvctx, int usehp); -void vp9_build_nmv_cost_table(int *mvjoint, - int *mvcost[2], - const nmv_context* const mvctx, - int usehp, - int mvc_flag_v, - int mvc_flag_h); +void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2], + const nmv_context* mvctx, int usehp); void vp9_update_mv_count(VP9_COMP *cpi, MACROBLOCK *x, int_mv best_ref_mv[2]); diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 153046440..32ed96999 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -49,8 +49,9 @@ #define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x) - 0.000001 : (x) + 0.000001) -#define MIN_BOOST 300 -#define KEY_FRAME_BOOST 2000 +#define MIN_KF_BOOST 300 + +#define DISABLE_RC_LONG_TERM_MEM 0 static void swap_yv12(YV12_BUFFER_CONFIG *a, YV12_BUFFER_CONFIG *b) { YV12_BUFFER_CONFIG temp = *a; @@ -64,7 +65,7 @@ static int select_cq_level(int qindex) { double target_q = (vp9_convert_qindex_to_q(qindex) * 0.5847) + 1.0; - for (i = 0; i < QINDEX_RANGE; i++) { + for (i = 0; i < QINDEX_RANGE; ++i) { if (target_q <= vp9_convert_qindex_to_q(i)) { ret_val = i; break; @@ -105,12 +106,12 @@ static int lookup_next_frame_stats(const struct twopass_rc *p, } -// Read frame stats at an offset from the current position +// Read frame stats at an offset from the current position. static int read_frame_stats(const struct twopass_rc *p, FIRSTPASS_STATS *frame_stats, int offset) { const FIRSTPASS_STATS *fps_ptr = p->stats_in; - // Check legality of offset + // Check legality of offset. if (offset >= 0) { if (&fps_ptr[offset] >= p->stats_in_end) return EOF; @@ -132,9 +133,9 @@ static int input_stats(struct twopass_rc *p, FIRSTPASS_STATS *fps) { return 1; } -static void output_stats(const VP9_COMP *cpi, +static void output_stats(const VP9_COMP *cpi, struct vpx_codec_pkt_list *pktlist, - FIRSTPASS_STATS *stats) { + FIRSTPASS_STATS *stats) { struct vpx_codec_cx_pkt pkt; pkt.kind = VPX_CODEC_STATS_PKT; pkt.data.twopass_stats.buf = stats; @@ -143,7 +144,6 @@ static void output_stats(const VP9_COMP *cpi, // TEMP debug code #if OUTPUT_FPF - { FILE *fpfile; fpfile = fopen("firstpass.stt", "a"); @@ -265,9 +265,9 @@ static void avg_stats(FIRSTPASS_STATS *section) { // Calculate a modified Error used in distributing bits between easier and // harder frames. 
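/*
 * Illustrative sketch (not part of the patch) of the rescaling performed by
 * calculate_modified_err() below: each frame's ssim-weighted prediction error
 * is re-expressed as a power of its ratio to the section average, which pulls
 * very easy and very hard frames back towards the mean before bits are
 * distributed.  The exponent (assumed here to derive from the two-pass VBR
 * bias setting) and the final clamp to the modified_error_min/max limits set
 * up in vp9_init_second_pass() are assumptions; only the shape is shown.
 */
#include <math.h>

static double sketch_modified_err(double frame_err, double section_avg_err,
                                  double vbr_bias_pow,
                                  double err_min, double err_max) {
  /* Same guard against division by zero as DOUBLE_DIVIDE_CHECK(). */
  const double safe_avg = (section_avg_err < 0.0) ? section_avg_err - 0.000001
                                                  : section_avg_err + 0.000001;
  double modified = section_avg_err * pow(frame_err / safe_avg, vbr_bias_pow);
  if (modified < err_min) modified = err_min;
  if (modified > err_max) modified = err_max;
  return modified;
}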
-static double calculate_modified_err(VP9_COMP *cpi, - FIRSTPASS_STATS *this_frame) { - struct twopass_rc *const twopass = &cpi->twopass; +static double calculate_modified_err(const VP9_COMP *cpi, + const FIRSTPASS_STATS *this_frame) { + const struct twopass_rc *const twopass = &cpi->twopass; const FIRSTPASS_STATS *const stats = &twopass->total_stats; const double av_err = stats->ssim_weighted_pred_err / stats->count; double modified_error = av_err * pow(this_frame->ssim_weighted_pred_err / @@ -336,7 +336,7 @@ static double simple_weight(const YV12_BUFFER_CONFIG *buf) { } // This function returns the maximum target rate per frame. -static int frame_max_bits(VP9_COMP *cpi) { +static int frame_max_bits(const VP9_COMP *cpi) { int64_t max_bits = ((int64_t)cpi->rc.av_per_frame_bandwidth * (int64_t)cpi->oxcf.two_pass_vbrmax_section) / 100; @@ -376,7 +376,6 @@ static unsigned int zz_motion_search(const VP9_COMP *cpi, const MACROBLOCK *x) { const int src_stride = x->plane[0].src.stride; const uint8_t *const ref = xd->plane[0].pre[0].buf; const int ref_stride = xd->plane[0].pre[0].stride; - unsigned int sse; vp9_variance_fn_t fn = get_block_variance_fn(xd->mi_8x8[0]->mbmi.sb_type); fn(src, src_stride, ref, ref_stride, &sse); @@ -397,18 +396,18 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int new_mv_mode_penalty = 256; const int quart_frm = MIN(cpi->common.width, cpi->common.height); - // refine the motion search range accroding to the frame dimension - // for first pass test + // Refine the motion search range according to the frame dimension + // for first pass test. while ((quart_frm << sr) < MAX_FULL_PEL_VAL) - sr++; + ++sr; step_param += sr; further_steps -= sr; - // override the default variance function to use MSE + // Override the default variance function to use MSE. v_fn_ptr.vf = get_block_variance_fn(bsize); - // Initial step/diamond search centred on best mv + // Center the initial step/diamond search on best mv. tmp_err = cpi->diamond_search_sad(x, &ref_mv_full, &tmp_mv, step_param, x->sadperbit16, &num00, &v_fn_ptr, @@ -423,15 +422,15 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, best_mv->col = tmp_mv.col; } - // Further step/diamond searches as necessary + // Carry out further step/diamond searches as necessary. n = num00; num00 = 0; while (n < further_steps) { - n++; + ++n; if (num00) { - num00--; + --num00; } else { tmp_err = cpi->diamond_search_sad(x, &ref_mv_full, &tmp_mv, step_param + n, x->sadperbit16, @@ -468,7 +467,7 @@ void vp9_first_pass(VP9_COMP *cpi) { TileInfo tile; struct macroblock_plane *const p = x->plane; struct macroblockd_plane *const pd = xd->plane; - PICK_MODE_CONTEXT *ctx = &x->sb64_context; + const PICK_MODE_CONTEXT *ctx = &x->sb64_context; int i; int recon_yoffset, recon_uvoffset; @@ -496,14 +495,14 @@ void vp9_first_pass(VP9_COMP *cpi) { struct twopass_rc *const twopass = &cpi->twopass; const MV zero_mv = {0, 0}; - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); vp9_setup_src_planes(x, cpi->Source, 0, 0); setup_pre_planes(xd, 0, lst_yv12, 0, 0, NULL); setup_dst_planes(xd, new_yv12, 0, 0); xd->mi_8x8 = cm->mi_grid_visible; - xd->mi_8x8[0] = cm->mi; // required for vp9_frame_init_quantizer + xd->mi_8x8[0] = cm->mi; vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); @@ -520,34 +519,32 @@ void vp9_first_pass(VP9_COMP *cpi) { vp9_init_mv_probs(cm); vp9_initialize_rd_consts(cpi); - // tiling is ignored in the first pass + // Tiling is ignored in the first pass. 
vp9_tile_init(&tile, cm, 0, 0); - // for each macroblock row in image - for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { + for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { int_mv best_ref_mv; best_ref_mv.as_int = 0; - // reset above block coeffs + // Reset above block coeffs. xd->up_available = (mb_row != 0); recon_yoffset = (mb_row * recon_y_stride * 16); recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height); // Set up limit values for motion vectors to prevent them extending - // outside the UMV borders + // outside the UMV borders. x->mv_row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16); x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + BORDER_MV_PIXELS_B16; - // for each macroblock col in image - for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { + for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { int this_error; const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); double error_weight = 1.0; const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col); - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset; xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset; @@ -565,15 +562,15 @@ void vp9_first_pass(VP9_COMP *cpi) { error_weight = vp9_vaq_inv_q_ratio(energy); } - // do intra 16x16 prediction + // Do intra 16x16 prediction. this_error = vp9_encode_intra(x, use_dc_pred); if (cpi->oxcf.aq_mode == VARIANCE_AQ) { - vp9_clear_system_state(); // __asm emms; - this_error *= error_weight; + vp9_clear_system_state(); + this_error = (int)(this_error * error_weight); } - // intrapenalty below deals with situations where the intra and inter - // error scores are very low (eg a plain black frame). + // Intrapenalty below deals with situations where the intra and inter + // error scores are very low (e.g. a plain black frame). // We do not have special cases in first pass for 0,0 and nearest etc so // all inter modes carry an overhead cost estimate for the mv. // When the error score is very low this causes us to pick all or lots of @@ -581,7 +578,7 @@ void vp9_first_pass(VP9_COMP *cpi) { // This penalty adds a cost matching that of a 0,0 mv to the intra case. this_error += intrapenalty; - // Cumulative intra error total + // Accumulate the intra error. intra_error += (int64_t)this_error; // Set up limit values for motion vectors to prevent them extending @@ -589,23 +586,23 @@ void vp9_first_pass(VP9_COMP *cpi) { x->mv_col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16); x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16; - // Other than for the first frame do a motion search + // Other than for the first frame do a motion search. if (cm->current_video_frame > 0) { int tmp_err, motion_error; int_mv mv, tmp_mv; xd->plane[0].pre[0].buf = lst_yv12->y_buffer + recon_yoffset; motion_error = zz_motion_search(cpi, x); - // Simple 0,0 motion with no mv overhead + // Assume 0,0 motion with no mv overhead. mv.as_int = tmp_mv.as_int = 0; // Test last reference frame using the previous best mv as the - // starting point (best reference) for the search + // starting point (best reference) for the search. 
first_pass_motion_search(cpi, x, &best_ref_mv.as_mv, &mv.as_mv, &motion_error); if (cpi->oxcf.aq_mode == VARIANCE_AQ) { - vp9_clear_system_state(); // __asm emms; - motion_error *= error_weight; + vp9_clear_system_state(); + motion_error = (int)(motion_error * error_weight); } // If the current best reference mv is not centered on 0,0 then do a 0,0 @@ -615,8 +612,8 @@ void vp9_first_pass(VP9_COMP *cpi) { first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv, &tmp_err); if (cpi->oxcf.aq_mode == VARIANCE_AQ) { - vp9_clear_system_state(); // __asm emms; - tmp_err *= error_weight; + vp9_clear_system_state(); + tmp_err = (int)(tmp_err * error_weight); } if (tmp_err < motion_error) { @@ -625,9 +622,9 @@ void vp9_first_pass(VP9_COMP *cpi) { } } - // Experimental search in an older reference frame + // Search in an older reference frame. if (cm->current_video_frame > 1) { - // Simple 0,0 motion with no mv overhead + // Assume 0,0 motion with no mv overhead. int gf_motion_error; xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset; @@ -636,22 +633,22 @@ void vp9_first_pass(VP9_COMP *cpi) { first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv, &gf_motion_error); if (cpi->oxcf.aq_mode == VARIANCE_AQ) { - vp9_clear_system_state(); // __asm emms; - gf_motion_error *= error_weight; + vp9_clear_system_state(); + gf_motion_error = (int)(gf_motion_error * error_weight); } if (gf_motion_error < motion_error && gf_motion_error < this_error) - second_ref_count++; + ++second_ref_count; - // Reset to last frame as reference buffer + // Reset to last frame as reference buffer. xd->plane[0].pre[0].buf = lst_yv12->y_buffer + recon_yoffset; xd->plane[1].pre[0].buf = lst_yv12->u_buffer + recon_uvoffset; xd->plane[2].pre[0].buf = lst_yv12->v_buffer + recon_uvoffset; - // In accumulating a score for the older reference frame - // take the best of the motion predicted score and - // the intra coded error (just as will be done for) - // accumulation of "coded_error" for the last frame. + // In accumulating a score for the older reference frame take the + // best of the motion predicted score and the intra coded error + // (just as will be done for) accumulation of "coded_error" for + // the last frame. if (gf_motion_error < this_error) sr_coded_error += gf_motion_error; else @@ -659,17 +656,16 @@ void vp9_first_pass(VP9_COMP *cpi) { } else { sr_coded_error += motion_error; } - /* Intra assumed best */ + // Start by assuming that intra mode is best. best_ref_mv.as_int = 0; if (motion_error <= this_error) { - // Keep a count of cases where the inter and intra were - // very close and very low. This helps with scene cut - // detection for example in cropped clips with black bars - // at the sides or top and bottom. + // Keep a count of cases where the inter and intra were very close + // and very low. This helps with scene cut detection for example in + // cropped clips with black bars at the sides or top and bottom. 
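/*
 * Illustrative sketch (not part of the patch): the scene-cut helper described
 * in the comment above, and applied inline just below, counts a block as
 * "neutral" when its inter error is within roughly 10% of the intra error
 * (after removing the mv overhead penalty) and the intra error itself is
 * very small.
 */
static int sketch_is_neutral_block(int this_error, int motion_error,
                                   int intrapenalty) {
  return ((this_error - intrapenalty) * 9 <= motion_error * 10) &&
         (this_error < 2 * intrapenalty);
}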
if (((this_error - intrapenalty) * 9 <= motion_error * 10) && this_error < 2 * intrapenalty) - neutral_count++; + ++neutral_count; mv.as_mv.row *= 8; mv.as_mv.col *= 8; @@ -679,50 +675,49 @@ void vp9_first_pass(VP9_COMP *cpi) { xd->mi_8x8[0]->mbmi.ref_frame[0] = LAST_FRAME; xd->mi_8x8[0]->mbmi.ref_frame[1] = NONE; vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize); - vp9_encode_sby(x, bsize); + vp9_encode_sby_pass1(x, bsize); sum_mvr += mv.as_mv.row; sum_mvr_abs += abs(mv.as_mv.row); sum_mvc += mv.as_mv.col; sum_mvc_abs += abs(mv.as_mv.col); sum_mvrs += mv.as_mv.row * mv.as_mv.row; sum_mvcs += mv.as_mv.col * mv.as_mv.col; - intercount++; + ++intercount; best_ref_mv.as_int = mv.as_int; - // Was the vector non-zero if (mv.as_int) { - mvcount++; + ++mvcount; - // Was it different from the last non zero vector + // Non-zero vector, was it different from the last non zero vector? if (mv.as_int != lastmv_as_int) - new_mv_count++; + ++new_mv_count; lastmv_as_int = mv.as_int; - // Does the Row vector point inwards or outwards + // Does the row vector point inwards or outwards? if (mb_row < cm->mb_rows / 2) { if (mv.as_mv.row > 0) - sum_in_vectors--; + --sum_in_vectors; else if (mv.as_mv.row < 0) - sum_in_vectors++; + ++sum_in_vectors; } else if (mb_row > cm->mb_rows / 2) { if (mv.as_mv.row > 0) - sum_in_vectors++; + ++sum_in_vectors; else if (mv.as_mv.row < 0) - sum_in_vectors--; + --sum_in_vectors; } - // Does the Row vector point inwards or outwards + // Does the col vector point inwards or outwards? if (mb_col < cm->mb_cols / 2) { if (mv.as_mv.col > 0) - sum_in_vectors--; + --sum_in_vectors; else if (mv.as_mv.col < 0) - sum_in_vectors++; + ++sum_in_vectors; } else if (mb_col > cm->mb_cols / 2) { if (mv.as_mv.col > 0) - sum_in_vectors++; + ++sum_in_vectors; else if (mv.as_mv.col < 0) - sum_in_vectors--; + --sum_in_vectors; } } } @@ -731,7 +726,7 @@ void vp9_first_pass(VP9_COMP *cpi) { } coded_error += (int64_t)this_error; - // adjust to the next column of macroblocks + // Adjust to the next column of MBs. x->plane[0].src.buf += 16; x->plane[1].src.buf += uv_mb_height; x->plane[2].src.buf += uv_mb_height; @@ -740,24 +735,24 @@ void vp9_first_pass(VP9_COMP *cpi) { recon_uvoffset += uv_mb_height; } - // adjust to the next row of mbs + // Adjust to the next row of MBs. x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols; x->plane[1].src.buf += uv_mb_height * x->plane[1].src.stride - uv_mb_height * cm->mb_cols; x->plane[2].src.buf += uv_mb_height * x->plane[1].src.stride - uv_mb_height * cm->mb_cols; - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); } - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); { FIRSTPASS_STATS fps; fps.frame = cm->current_video_frame; - fps.intra_error = intra_error >> 8; - fps.coded_error = coded_error >> 8; - fps.sr_coded_error = sr_coded_error >> 8; + fps.intra_error = (double)(intra_error >> 8); + fps.coded_error = (double)(coded_error >> 8); + fps.sr_coded_error = (double)(sr_coded_error >> 8); fps.ssim_weighted_pred_err = fps.coded_error * simple_weight(cpi->Source); fps.count = 1.0; fps.pcnt_inter = (double)intercount / cm->MBs; @@ -791,14 +786,14 @@ void vp9_first_pass(VP9_COMP *cpi) { // cpi->source_time_stamp. fps.duration = (double)(cpi->source->ts_end - cpi->source->ts_start); - // don't want to do output stats with a stack variable! + // Don't want to do output stats with a stack variable! 
twopass->this_frame_stats = fps; output_stats(cpi, cpi->output_pkt_list, &twopass->this_frame_stats); accumulate_stats(&twopass->total_stats, &fps); } // Copy the previous Last Frame back into gf and and arf buffers if - // the prediction is good enough... but also dont allow it to lag too far + // the prediction is good enough... but also don't allow it to lag too far. if ((twopass->sr_update_lag > 3) || ((cm->current_video_frame > 0) && (twopass->this_frame_stats.pcnt_inter > 0.20) && @@ -807,9 +802,9 @@ void vp9_first_pass(VP9_COMP *cpi) { vp8_yv12_copy_frame(lst_yv12, gld_yv12); twopass->sr_update_lag = 1; } else { - twopass->sr_update_lag++; + ++twopass->sr_update_lag; } - // swap frame pointers so last frame refers to the frame we just compressed + // Swap frame pointers so last frame refers to the frame we just compressed. swap_yv12(lst_yv12, new_yv12); vp9_extend_frame_borders(lst_yv12, cm->subsampling_x, cm->subsampling_y); @@ -819,7 +814,7 @@ void vp9_first_pass(VP9_COMP *cpi) { if (cm->current_video_frame == 0) vp8_yv12_copy_frame(lst_yv12, gld_yv12); - // use this to see what the first pass reconstruction looks like + // Use this to see what the first pass reconstruction looks like. if (0) { char filename[512]; FILE *recon_file; @@ -835,15 +830,11 @@ void vp9_first_pass(VP9_COMP *cpi) { fclose(recon_file); } - cm->current_video_frame++; + ++cm->current_video_frame; } -// Estimate a cost per mb attributable to overheads such as the coding of -// modes and motion vectors. -// Currently simplistic in its assumptions for testing. -// - - +// Estimate a cost per mb attributable to overheads such as the coding of modes +// and motion vectors. This currently makes simplistic assumptions for testing. static double bitcost(double prob) { return -(log(prob) / log(2.0)); } @@ -866,18 +857,17 @@ static int64_t estimate_modemvcost(VP9_COMP *cpi, motion_cost = bitcost(av_pct_motion); intra_cost = bitcost(av_intra); - // Estimate of extra bits per mv overhead for mbs - // << 9 is the normalization to the (bits * 512) used in vp9_rc_bits_per_mb + // Estimate the number of extra bits per mv overhead for mbs. We shift (<< 9) + // to match the scaling of number of bits by 512. mv_cost = ((int)(fpstats->new_mv_count / fpstats->count) * 8) << 9; - // Crude estimate of overhead cost from modes - // << 9 is the normalization to (bits * 512) used in vp9_rc_bits_per_mb + // Produce a crude estimate of the overhead cost from modes. We shift (<< 9) + // to match the scaling of number of bits by 512. mode_cost = (int)((((av_pct_inter - av_pct_motion) * zz_cost) + (av_pct_motion * motion_cost) + (av_intra * intra_cost)) * cpi->common.MBs) << 9; - // return mv_cost + mode_cost; // TODO(paulwilkins): Fix overhead costs for extended Q range. #endif return 0; @@ -894,19 +884,19 @@ static double calc_correction_factor(double err_per_mb, const double power_term = MIN(vp9_convert_qindex_to_q(q) * 0.0125 + pt_low, pt_high); - // Calculate correction factor + // Calculate correction factor. 
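/*
 * Illustrative sketch (not part of the patch) of how the correction factor
 * computed here drives the max-Q search in vp9_twopass_worst_quality() below:
 * q is walked up from the best allowed quality until the predicted bits per
 * macroblock at that q fit the target.  bits_per_mb stands in for
 * vp9_rc_bits_per_mb(), real_q stands in for vp9_convert_qindex_to_q(), and
 * the err_per_mb / err_divisor relationship is assumed from the call that
 * passes ERR_DIVISOR alongside err_per_mb.
 */
#include <math.h>

typedef int (*sketch_bits_per_mb_fn)(int q_index, double correction_factor);
typedef double (*sketch_qindex_to_q_fn)(int q_index);

static int sketch_pick_worst_quality(int best_q, int worst_q,
                                     int target_norm_bits_per_mb,
                                     double err_per_mb, double err_divisor,
                                     sketch_qindex_to_q_fn real_q,
                                     sketch_bits_per_mb_fn bits_per_mb) {
  int q;
  for (q = best_q; q < worst_q; ++q) {
    /* Error-based correction factor, clamped as in calc_correction_factor. */
    const double error_term = err_per_mb / err_divisor;
    double power_term = real_q(q) * 0.0125 + 0.5;
    double factor;
    if (power_term > 0.90) power_term = 0.90;
    factor = pow(error_term, power_term);
    if (factor < 0.05) factor = 0.05;
    if (factor > 5.0) factor = 5.0;

    if (bits_per_mb(q, factor) <= target_norm_bits_per_mb)
      break;
  }
  return q;
}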
if (power_term < 1.0) assert(error_term >= 0.0); return fclamp(pow(error_term, power_term), 0.05, 5.0); } -static int estimate_max_q(VP9_COMP *cpi, FIRSTPASS_STATS *fpstats, - int section_target_bandwitdh) { +int vp9_twopass_worst_quality(VP9_COMP *cpi, FIRSTPASS_STATS *fpstats, + int section_target_bandwitdh) { int q; const int num_mbs = cpi->common.MBs; int target_norm_bits_per_mb; - RATE_CONTROL *const rc = &cpi->rc; + const RATE_CONTROL *const rc = &cpi->rc; const double section_err = fpstats->coded_error / fpstats->count; const double err_per_mb = section_err / num_mbs; @@ -920,7 +910,7 @@ static int estimate_max_q(VP9_COMP *cpi, FIRSTPASS_STATS *fpstats, // Try and pick a max Q that will be high enough to encode the // content at the given rate. - for (q = rc->best_quality; q < rc->worst_quality; q++) { + for (q = rc->best_quality; q < rc->worst_quality; ++q) { const double err_correction_factor = calc_correction_factor(err_per_mb, ERR_DIVISOR, 0.5, 0.90, q); const int bits_per_mb_at_this_q = vp9_rc_bits_per_mb(INTER_FRAME, q, @@ -936,58 +926,6 @@ static int estimate_max_q(VP9_COMP *cpi, FIRSTPASS_STATS *fpstats, return q; } -// For cq mode estimate a cq level that matches the observed -// complexity and data rate. -static int estimate_cq(VP9_COMP *cpi, - FIRSTPASS_STATS *fpstats, - int section_target_bandwitdh) { - int q; - int num_mbs = cpi->common.MBs; - int target_norm_bits_per_mb; - - double section_err = (fpstats->coded_error / fpstats->count); - double err_per_mb = section_err / num_mbs; - double err_correction_factor; - double clip_iiratio; - double clip_iifactor; - - target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20)) - ? (512 * section_target_bandwitdh) / num_mbs - : 512 * (section_target_bandwitdh / num_mbs); - - - // II ratio correction factor for clip as a whole - clip_iiratio = cpi->twopass.total_stats.intra_error / - DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats.coded_error); - clip_iifactor = 1.0 - ((clip_iiratio - 10.0) * 0.025); - if (clip_iifactor < 0.80) - clip_iifactor = 0.80; - - // Try and pick a Q that can encode the content at the given rate. - for (q = 0; q < MAXQ; q++) { - int bits_per_mb_at_this_q; - - // Error per MB based correction factor - err_correction_factor = - calc_correction_factor(err_per_mb, 100.0, 0.5, 0.90, q) * clip_iifactor; - - bits_per_mb_at_this_q = - vp9_rc_bits_per_mb(INTER_FRAME, q, err_correction_factor); - - if (bits_per_mb_at_this_q <= target_norm_bits_per_mb) - break; - } - - // Clip value to range "best allowed to (worst allowed - 1)" - q = select_cq_level(q); - if (q >= cpi->rc.worst_quality) - q = cpi->rc.worst_quality - 1; - if (q < cpi->rc.best_quality) - q = cpi->rc.best_quality; - - return q; -} - extern void vp9_new_framerate(VP9_COMP *cpi, double framerate); void vp9_init_second_pass(VP9_COMP *cpi) { @@ -1005,11 +943,11 @@ void vp9_init_second_pass(VP9_COMP *cpi) { twopass->total_stats = *twopass->stats_in_end; twopass->total_left_stats = twopass->total_stats; - // each frame can have a different duration, as the frame rate in the source - // isn't guaranteed to be constant. The frame rate prior to the first frame - // encoded in the second pass is a guess. However the sum duration is not. - // Its calculated based on the actual durations of all frames from the first - // pass. + // Each frame can have a different duration, as the frame rate in the source + // isn't guaranteed to be constant. The frame rate prior to the first frame + // encoded in the second pass is a guess. However, the sum duration is not. 
+ // It is calculated based on the actual durations of all frames from the + // first pass. vp9_new_framerate(cpi, 10000000.0 * twopass->total_stats.count / twopass->total_stats.duration); @@ -1020,18 +958,18 @@ void vp9_init_second_pass(VP9_COMP *cpi) { // Calculate a minimum intra value to be used in determining the IIratio // scores used in the second pass. We have this minimum to make sure // that clips that are static but "low complexity" in the intra domain - // are still boosted appropriately for KF/GF/ARF + // are still boosted appropriately for KF/GF/ARF. twopass->kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs; twopass->gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs; - // This variable monitors how far behind the second ref update is lagging + // This variable monitors how far behind the second ref update is lagging. twopass->sr_update_lag = 1; // Scan the first pass file and calculate an average Intra / Inter error score // ratio for the sequence. { double sum_iiratio = 0.0; - start_pos = twopass->stats_in; // Note the starting "file" position. + start_pos = twopass->stats_in; while (input_stats(twopass, &this_frame) != EOF) { const double iiratio = this_frame.intra_error / @@ -1042,7 +980,6 @@ void vp9_init_second_pass(VP9_COMP *cpi) { twopass->avg_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK((double)twopass->total_stats.count); - // Reset file position reset_fpf_position(twopass, start_pos); } @@ -1052,7 +989,7 @@ void vp9_init_second_pass(VP9_COMP *cpi) { double av_error = twopass->total_stats.ssim_weighted_pred_err / DOUBLE_DIVIDE_CHECK(twopass->total_stats.count); - start_pos = twopass->stats_in; // Note starting "file" position + start_pos = twopass->stats_in; twopass->modified_error_total = 0.0; twopass->modified_error_min = @@ -1073,8 +1010,8 @@ void vp9_init_second_pass(VP9_COMP *cpi) { void vp9_end_second_pass(VP9_COMP *cpi) { } -// This function gives and estimate of how badly we believe -// the prediction quality is decaying from frame to frame. +// This function gives an estimate of how badly we believe the prediction +// quality is decaying from frame to frame. static double get_prediction_decay_rate(const VP9_COMMON *cm, const FIRSTPASS_STATS *next_frame) { // Look at the observed drop in prediction quality between the last frame @@ -1091,12 +1028,10 @@ static double get_prediction_decay_rate(const VP9_COMMON *cm, // Function to test for a condition where a complex transition is followed // by a static section. For example in slide shows where there is a fade // between slides. This is to help with more optimal kf and gf positioning. -static int detect_transition_to_still( - VP9_COMP *cpi, - int frame_interval, - int still_interval, - double loop_decay_rate, - double last_decay_rate) { +static int detect_transition_to_still(VP9_COMP *cpi, int frame_interval, + int still_interval, + double loop_decay_rate, + double last_decay_rate) { int trans_to_still = 0; // Break clause to detect very still sections after motion @@ -1109,9 +1044,8 @@ static int detect_transition_to_still( FIRSTPASS_STATS *position = cpi->twopass.stats_in; FIRSTPASS_STATS tmp_next_frame; - // Look ahead a few frames to see if static condition - // persists... - for (j = 0; j < still_interval; j++) { + // Look ahead a few frames to see if static condition persists... 
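/*
 * Illustrative sketch (not part of the patch) of the look-ahead below: the
 * next still_interval frames are scanned and a transition to a still section
 * is only reported if every one of them keeps a very high prediction decay
 * rate (i.e. almost no motion).  The threshold parameter is a hypothetical
 * stand-in; its actual value is not shown in this hunk.
 */
static int sketch_is_still_section(const double *next_decay_rates,
                                   int still_interval,
                                   double still_threshold) {
  int j;
  for (j = 0; j < still_interval; ++j) {
    if (next_decay_rates[j] < still_threshold)
      return 0;  /* Motion resumes, so this is not a static section. */
  }
  return 1;  /* The static condition persisted for the whole window. */
}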
+ for (j = 0; j < still_interval; ++j) { if (EOF == input_stats(&cpi->twopass, &tmp_next_frame)) break; @@ -1121,7 +1055,7 @@ static int detect_transition_to_still( reset_fpf_position(&cpi->twopass, position); - // Only if it does do we signal a transition to still + // Only if it does do we signal a transition to still. if (j == still_interval) trans_to_still = 1; } @@ -1131,7 +1065,7 @@ static int detect_transition_to_still( // This function detects a flash through the high relative pcnt_second_ref // score in the frame following a flash frame. The offset passed in should -// reflect this +// reflect this. static int detect_flash(const struct twopass_rc *twopass, int offset) { FIRSTPASS_STATS next_frame; @@ -1144,7 +1078,7 @@ static int detect_flash(const struct twopass_rc *twopass, int offset) { // brief break in prediction (such as a flash) but subsequent frames // are reasonably well predicted by an earlier (pre flash) frame. // The recovery after a flash is indicated by a high pcnt_second_ref - // comapred to pcnt_inter. + // compared to pcnt_inter. if (next_frame.pcnt_second_ref > next_frame.pcnt_inter && next_frame.pcnt_second_ref >= 0.5) flash_detected = 1; @@ -1153,7 +1087,7 @@ static int detect_flash(const struct twopass_rc *twopass, int offset) { return flash_detected; } -// Update the motion related elements to the GF arf boost calculation +// Update the motion related elements to the GF arf boost calculation. static void accumulate_frame_motion_stats( FIRSTPASS_STATS *this_frame, double *this_frame_mv_in_out, @@ -1165,13 +1099,13 @@ static void accumulate_frame_motion_stats( // Accumulate motion stats. motion_pct = this_frame->pcnt_motion; - // Accumulate Motion In/Out of frame stats + // Accumulate Motion In/Out of frame stats. *this_frame_mv_in_out = this_frame->mv_in_out_count * motion_pct; *mv_in_out_accumulator += this_frame->mv_in_out_count * motion_pct; *abs_mv_in_out_accumulator += fabs(this_frame->mv_in_out_count * motion_pct); // Accumulate a measure of how uniform (or conversely how random) - // the motion field is. (A ratio of absmv / mv) + // the motion field is (a ratio of absmv / mv). if (motion_pct > 0.05) { const double this_frame_mvr_ratio = fabs(this_frame->mvr_abs) / DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVr)); @@ -1194,7 +1128,7 @@ static double calc_frame_boost(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame, double this_frame_mv_in_out) { double frame_boost; - // Underlying boost factor is based on inter intra error ratio + // Underlying boost factor is based on inter intra error ratio. if (this_frame->intra_error > cpi->twopass.gf_intra_err_min) frame_boost = (IIFACTOR * this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)); @@ -1202,13 +1136,12 @@ static double calc_frame_boost(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame, frame_boost = (IIFACTOR * cpi->twopass.gf_intra_err_min / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)); - // Increase boost for frames where new data coming into frame - // (eg zoom out). Slightly reduce boost if there is a net balance - // of motion out of the frame (zoom in). - // The range for this_frame_mv_in_out is -1.0 to +1.0 + // Increase boost for frames where new data coming into frame (e.g. zoom out). + // Slightly reduce boost if there is a net balance of motion out of the frame + // (zoom in). The range for this_frame_mv_in_out is -1.0 to +1.0. 
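/*
 * Illustrative sketch (not part of the patch) pulling together the boost
 * computed above and the motion-in/out adjustment applied just below: the
 * base term is the (floored) intra/inter error ratio, boost is then raised
 * when motion flows into the frame (zoom out) and trimmed, at half strength,
 * when it flows out (zoom in).  ii_factor stands in for IIFACTOR, whose value
 * is not shown in this hunk, and any final cap applied by the caller is
 * omitted.
 */
static double sketch_frame_boost(double intra_error, double coded_error,
                                 double gf_intra_err_min,
                                 double mv_in_out, double ii_factor) {
  const double intra = (intra_error > gf_intra_err_min) ? intra_error
                                                        : gf_intra_err_min;
  double boost = ii_factor * intra / (coded_error + 0.000001);
  if (mv_in_out > 0.0)
    boost += boost * (mv_in_out * 2.0);  /* New content entering the frame. */
  else
    boost += boost * (mv_in_out / 2.0);  /* Halved effect when zooming in. */
  return boost;
}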
if (this_frame_mv_in_out > 0.0) frame_boost += frame_boost * (this_frame_mv_in_out * 2.0); - // In extreme case boost is halved + // In the extreme case the boost is halved. else frame_boost += frame_boost * (this_frame_mv_in_out / 2.0); @@ -1230,12 +1163,12 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int arf_boost; int flash_detected = 0; - // Search forward from the proposed arf/next gf position - for (i = 0; i < f_frames; i++) { + // Search forward from the proposed arf/next gf position. + for (i = 0; i < f_frames; ++i) { if (read_frame_stats(twopass, &this_frame, (i + offset)) == EOF) break; - // Update the motion related elements to the boost calculation + // Update the motion related elements to the boost calculation. accumulate_frame_motion_stats(&this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, &abs_mv_in_out_accumulator, @@ -1246,7 +1179,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, flash_detected = detect_flash(twopass, i + offset) || detect_flash(twopass, i + offset + 1); - // Cumulative effect of prediction quality decay + // Accumulate the effect of prediction quality decay. if (!flash_detected) { decay_accumulator *= get_prediction_decay_rate(&cpi->common, &this_frame); decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR @@ -1259,7 +1192,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, *f_boost = (int)boost_score; - // Reset for backward looking loop + // Reset for backward looking loop. boost_score = 0.0; mv_ratio_accumulator = 0.0; decay_accumulator = 1.0; @@ -1267,12 +1200,12 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, mv_in_out_accumulator = 0.0; abs_mv_in_out_accumulator = 0.0; - // Search backward towards last gf position - for (i = -1; i >= -b_frames; i--) { + // Search backward towards last gf position. + for (i = -1; i >= -b_frames; --i) { if (read_frame_stats(twopass, &this_frame, (i + offset)) == EOF) break; - // Update the motion related elements to the boost calculation + // Update the motion related elements to the boost calculation. accumulate_frame_motion_stats(&this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, &abs_mv_in_out_accumulator, @@ -1283,7 +1216,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, flash_detected = detect_flash(twopass, i + offset) || detect_flash(twopass, i + offset + 1); - // Cumulative effect of prediction quality decay + // Cumulative effect of prediction quality decay. if (!flash_detected) { decay_accumulator *= get_prediction_decay_rate(&cpi->common, &this_frame); decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR @@ -1333,8 +1266,7 @@ static void schedule_frames(VP9_COMP *cpi, const int start, const int end, return; } - // ARF Group: work out the ARF schedule. - // Mark ARF frames as negative. + // ARF Group: Work out the ARF schedule and mark ARF frames as negative. if (end < 0) { // printf("start:%d end:%d\n", -end, -end); // ARF frame is at the end of the range. @@ -1457,14 +1389,14 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { double decay_accumulator = 1.0; double zero_motion_accumulator = 1.0; - double loop_decay_rate = 1.00; // Starting decay rate + double loop_decay_rate = 1.00; double last_loop_decay_rate = 1.00; double this_frame_mv_in_out = 0.0; double mv_in_out_accumulator = 0.0; double abs_mv_in_out_accumulator = 0.0; double mv_ratio_accumulator_thresh; - int max_bits = frame_max_bits(cpi); // Max for a single frame + const int max_bits = frame_max_bits(cpi); // Max bits for a single frame. 
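/*
 * Illustrative sketch (not part of the patch): frame_max_bits(), used on the
 * line above, limits any single frame to a percentage of the average
 * per-frame bandwidth given by the two_pass_vbrmax_section setting.  The
 * clamp of the result to [0, INT_MAX] is an assumption; only the scaled
 * division is visible in this hunk.
 */
#include <limits.h>
#include <stdint.h>

static int sketch_frame_max_bits(int av_per_frame_bandwidth,
                                 int two_pass_vbrmax_section) {
  int64_t max_bits = ((int64_t)av_per_frame_bandwidth *
                      (int64_t)two_pass_vbrmax_section) / 100;
  if (max_bits < 0)            /* Assumed guard, see note above. */
    max_bits = 0;
  else if (max_bits > INT_MAX)
    max_bits = INT_MAX;
  return (int)max_bits;
}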
unsigned int allow_alt_ref = cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames; @@ -1477,19 +1409,19 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->gf_group_bits = 0; - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); start_pos = twopass->stats_in; // Load stats for the current frame. mod_frame_err = calculate_modified_err(cpi, this_frame); - // Note the error of the frame at the start of the group (this will be - // the GF frame error if we code a normal gf + // Note the error of the frame at the start of the group. This will be + // the GF frame error if we code a normal gf. gf_first_frame_err = mod_frame_err; // If this is a key frame or the overlay from a previous arf then - // The error score / cost of this frame has already been accounted for. + // the error score / cost of this frame has already been accounted for. if (cpi->common.frame_type == KEY_FRAME || rc->source_alt_ref_active) gf_group_err -= gf_first_frame_err; @@ -1511,9 +1443,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { i = 0; while (i < twopass->static_scene_max_gf_interval && i < rc->frames_to_key) { - i++; // Increment the loop counter + ++i; - // Accumulate error score of frames in this gf group + // Accumulate error score of frames in this gf group. mod_frame_err = calculate_modified_err(cpi, this_frame); gf_group_err += mod_frame_err; @@ -1524,13 +1456,13 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // quality back to an earlier frame is then restored. flash_detected = detect_flash(twopass, 0); - // Update the motion related elements to the boost calculation + // Update the motion related elements to the boost calculation. accumulate_frame_motion_stats(&next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, &abs_mv_in_out_accumulator, &mv_ratio_accumulator); - // Cumulative effect of prediction quality decay + // Accumulate the effect of prediction quality decay. if (!flash_detected) { last_loop_decay_rate = loop_decay_rate; loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame); @@ -1543,8 +1475,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { next_frame.pcnt_motion; } - // Break clause to detect very still sections after motion - // (for example a static image after a fade or other transition). + // Break clause to detect very still sections after motion. For example, + // a static image after a fade or other transition. if (detect_transition_to_still(cpi, i, 5, loop_decay_rate, last_loop_decay_rate)) { allow_alt_ref = 0; @@ -1552,16 +1484,16 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { } } - // Calculate a boost number for this frame + // Calculate a boost number for this frame. boost_score += (decay_accumulator * calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out)); // Break out conditions. if ( - // Break at cpi->max_gf_interval unless almost totally static + // Break at cpi->max_gf_interval unless almost totally static. (i >= active_max_gf_interval && (zero_motion_accumulator < 0.995)) || ( - // Don't break out with a very short interval + // Don't break out with a very short interval. 
(i > MIN_GF_INTERVAL) && ((boost_score > 125.0) || (next_frame.pcnt_inter < 0.75)) && (!flash_detected) && @@ -1580,10 +1512,10 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0); - // Don't allow a gf too near the next kf + // Don't allow a gf too near the next kf. if ((rc->frames_to_key - i) < MIN_GF_INTERVAL) { while (i < (rc->frames_to_key + !rc->next_key_frame_forced)) { - i++; + ++i; if (EOF == input_stats(twopass, this_frame)) break; @@ -1613,20 +1545,14 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { else rc->baseline_gf_interval = i; - // Should we use the alternate reference frame + // Should we use the alternate reference frame. if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) && (i >= MIN_GF_INTERVAL) && - // for real scene cuts (not forced kfs) dont allow arf very near kf. + // For real scene cuts (not forced kfs) don't allow arf very near kf. (rc->next_key_frame_forced || - (i <= (rc->frames_to_key - MIN_GF_INTERVAL))) && - ((next_frame.pcnt_inter > 0.75) || - (next_frame.pcnt_second_ref > 0.5)) && - ((mv_in_out_accumulator / (double)i > -0.2) || - (mv_in_out_accumulator > -2.0)) && - (boost_score > 100)) { - - // Alternative boost calculation for alt ref + (i <= (rc->frames_to_key - MIN_GF_INTERVAL)))) { + // Calculate the boost for alt ref. rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost); rc->source_alt_ref_pending = 1; @@ -1688,28 +1614,24 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { #endif #endif - // Calculate the bits to be allocated to the group as a whole - if ((cpi->twopass.kf_group_bits > 0) && - (cpi->twopass.kf_group_error_left > 0)) { - cpi->twopass.gf_group_bits = - (int64_t)(cpi->twopass.kf_group_bits * + // Calculate the bits to be allocated to the group as a whole. + if (twopass->kf_group_bits > 0 && twopass->kf_group_error_left > 0) { + twopass->gf_group_bits = (int64_t)(cpi->twopass.kf_group_bits * (gf_group_err / cpi->twopass.kf_group_error_left)); } else { - cpi->twopass.gf_group_bits = 0; + twopass->gf_group_bits = 0; } - cpi->twopass.gf_group_bits = - (cpi->twopass.gf_group_bits < 0) - ? 0 - : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits) - ? cpi->twopass.kf_group_bits : cpi->twopass.gf_group_bits; + twopass->gf_group_bits = (twopass->gf_group_bits < 0) ? + 0 : (twopass->gf_group_bits > twopass->kf_group_bits) ? + twopass->kf_group_bits : twopass->gf_group_bits; // Clip cpi->twopass.gf_group_bits based on user supplied data rate - // variability limit (cpi->oxcf.two_pass_vbrmax_section) - if (cpi->twopass.gf_group_bits > (int64_t)max_bits * rc->baseline_gf_interval) - cpi->twopass.gf_group_bits = (int64_t)max_bits * rc->baseline_gf_interval; + // variability limit, cpi->oxcf.two_pass_vbrmax_section. + if (twopass->gf_group_bits > (int64_t)max_bits * rc->baseline_gf_interval) + twopass->gf_group_bits = (int64_t)max_bits * rc->baseline_gf_interval; - // Reset the file position - reset_fpf_position(&cpi->twopass, start_pos); + // Reset the file position. + reset_fpf_position(twopass, start_pos); // Assign bits to the arf or gf. for (i = 0; i <= (rc->source_alt_ref_pending && @@ -1720,7 +1642,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { int boost = (rc->gfu_boost * gfboost_qadjust(q)) / 100; - // Set max and minimum boost and hence minimum allocation + // Set max and minimum boost and hence minimum allocation. 
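/*
 * Illustrative sketch (not part of the patch) of the allocation performed
 * just below: the gf/arf boost is clamped to a minimum and to a multiple of
 * the group length, the group is expressed as "allocation chunks" of 100 per
 * frame plus the extra boost, and the gf/arf then receives its boost-sized
 * share of the group bits.  The extra chunk counted for an ARF group is an
 * assumption; only the non-ARF branch is visible in this hunk.
 */
#include <stdint.h>

static int sketch_gf_bits(int boost, int baseline_gf_interval, int is_arf,
                          int64_t gf_group_bits) {
  int allocation_chunks;

  /* Clamp boost so the gf/arf always receives at least a minimum share. */
  if (boost < 125) boost = 125;
  if (boost > (baseline_gf_interval + 1) * 200)
    boost = (baseline_gf_interval + 1) * 200;

  allocation_chunks = is_arf
      ? ((baseline_gf_interval + 1) * 100) + (boost - 100)  /* Assumed. */
      : (baseline_gf_interval * 100) + (boost - 100);

  /* Keep boost below 1024 to avoid overflow in the division below. */
  if (boost > 1023) {
    const int divisor = boost >> 10;
    boost /= divisor;
    allocation_chunks /= divisor;
  }

  return (int)((double)boost * (gf_group_bits / (double)allocation_chunks));
}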
boost = clamp(boost, 125, (rc->baseline_gf_interval + 1) * 200); if (rc->source_alt_ref_pending && i == 0) @@ -1728,7 +1650,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { else allocation_chunks = (rc->baseline_gf_interval * 100) + (boost - 100); - // Prevent overflow + // Prevent overflow. if (boost > 1023) { int divisor = boost >> 10; boost /= divisor; @@ -1736,18 +1658,18 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { } // Calculate the number of bits to be spent on the gf or arf based on - // the boost number - gf_bits = (int)((double)boost * (cpi->twopass.gf_group_bits / - (double)allocation_chunks)); + // the boost number. + gf_bits = (int)((double)boost * (twopass->gf_group_bits / + (double)allocation_chunks)); // If the frame that is to be boosted is simpler than the average for // the gf/arf group then use an alternative calculation - // based on the error score of the frame itself + // based on the error score of the frame itself. if (rc->baseline_gf_interval < 1 || mod_frame_err < gf_group_err / (double)rc->baseline_gf_interval) { - double alt_gf_grp_bits = (double)cpi->twopass.kf_group_bits * + double alt_gf_grp_bits = (double)twopass->kf_group_bits * (mod_frame_err * (double)rc->baseline_gf_interval) / - DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left); + DOUBLE_DIVIDE_CHECK(twopass->kf_group_error_left); int alt_gf_bits = (int)((double)boost * (alt_gf_grp_bits / (double)allocation_chunks)); @@ -1758,70 +1680,68 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // If it is harder than other frames in the group make sure it at // least receives an allocation in keeping with its relative error // score, otherwise it may be worse off than an "un-boosted" frame. - int alt_gf_bits = (int)((double)cpi->twopass.kf_group_bits * + int alt_gf_bits = (int)((double)twopass->kf_group_bits * mod_frame_err / - DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left)); + DOUBLE_DIVIDE_CHECK(twopass->kf_group_error_left)); if (alt_gf_bits > gf_bits) gf_bits = alt_gf_bits; } - // Dont allow a negative value for gf_bits + // Don't allow a negative value for gf_bits. if (gf_bits < 0) gf_bits = 0; if (i == 0) { - cpi->twopass.gf_bits = gf_bits; + twopass->gf_bits = gf_bits; } if (i == 1 || (!rc->source_alt_ref_pending && - (cpi->common.frame_type != KEY_FRAME))) { - // Per frame bit target for this frame - rc->per_frame_bandwidth = gf_bits; + cpi->common.frame_type != KEY_FRAME)) { + // Calculate the per frame bit target for this frame. + vp9_rc_set_frame_target(cpi, gf_bits); } } { - // Adjust KF group bits and error remaining - cpi->twopass.kf_group_error_left -= (int64_t)gf_group_err; - cpi->twopass.kf_group_bits -= cpi->twopass.gf_group_bits; + // Adjust KF group bits and error remaining. + twopass->kf_group_error_left -= (int64_t)gf_group_err; + twopass->kf_group_bits -= twopass->gf_group_bits; - if (cpi->twopass.kf_group_bits < 0) - cpi->twopass.kf_group_bits = 0; + if (twopass->kf_group_bits < 0) + twopass->kf_group_bits = 0; - // If this is an arf update we want to remove the score for the - // overlay frame at the end which will usually be very cheap to code. - // The overlay frame has already in effect been coded so we want to spread - // the remaining bits amoung the other frames/ + // If this is an arf update we want to remove the score for the overlay + // frame at the end which will usually be very cheap to code. 
+ // The overlay frame has already, in effect, been coded so we want to spread + // the remaining bits among the other frames. // For normal GFs remove the score for the GF itself unless this is // also a key frame in which case it has already been accounted for. if (rc->source_alt_ref_pending) { - cpi->twopass.gf_group_error_left = (int64_t)gf_group_err - mod_frame_err; + twopass->gf_group_error_left = (int64_t)(gf_group_err - mod_frame_err); } else if (cpi->common.frame_type != KEY_FRAME) { - cpi->twopass.gf_group_error_left = (int64_t)(gf_group_err + twopass->gf_group_error_left = (int64_t)(gf_group_err - gf_first_frame_err); } else { - cpi->twopass.gf_group_error_left = (int64_t)gf_group_err; + twopass->gf_group_error_left = (int64_t)gf_group_err; } - cpi->twopass.gf_group_bits -= cpi->twopass.gf_bits; + twopass->gf_group_bits -= twopass->gf_bits; - if (cpi->twopass.gf_group_bits < 0) - cpi->twopass.gf_group_bits = 0; + if (twopass->gf_group_bits < 0) + twopass->gf_group_bits = 0; // This condition could fail if there are two kfs very close together - // despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the + // despite MIN_GF_INTERVAL and would cause a divide by 0 in the // calculation of alt_extra_bits. if (rc->baseline_gf_interval >= 3) { const int boost = rc->source_alt_ref_pending ? b_boost : rc->gfu_boost; if (boost >= 150) { - int alt_extra_bits; - int pct_extra = (boost - 100) / 50; - pct_extra = (pct_extra > 20) ? 20 : pct_extra; - - alt_extra_bits = (int)((cpi->twopass.gf_group_bits * pct_extra) / 100); - cpi->twopass.gf_group_bits -= alt_extra_bits; + const int pct_extra = MIN(20, (boost - 100) / 50); + const int alt_extra_bits = (int)((twopass->gf_group_bits * pct_extra) / + 100); + twopass->gf_group_bits -= alt_extra_bits; } } } @@ -1830,20 +1750,20 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { FIRSTPASS_STATS sectionstats; zero_stats(§ionstats); - reset_fpf_position(&cpi->twopass, start_pos); + reset_fpf_position(twopass, start_pos); - for (i = 0; i < rc->baseline_gf_interval; i++) { - input_stats(&cpi->twopass, &next_frame); + for (i = 0; i < rc->baseline_gf_interval; ++i) { + input_stats(twopass, &next_frame); accumulate_stats(§ionstats, &next_frame); } avg_stats(§ionstats); - cpi->twopass.section_intra_rating = (int) + twopass->section_intra_rating = (int) (sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error)); - reset_fpf_position(&cpi->twopass, start_pos); + reset_fpf_position(twopass, start_pos); } } @@ -1879,34 +1799,27 @@ static void assign_std_frame_bits(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { cpi->twopass.gf_group_bits = 0; // Per frame bit target for this frame. - cpi->rc.per_frame_bandwidth = target_frame_size; -} - -static int test_for_kf_one_pass(VP9_COMP *cpi) { - // Placeholder function for auto key frame - return 0; + vp9_rc_set_frame_target(cpi, target_frame_size); } static int test_candidate_kf(VP9_COMP *cpi, - FIRSTPASS_STATS *last_frame, - FIRSTPASS_STATS *this_frame, - FIRSTPASS_STATS *next_frame) { + const FIRSTPASS_STATS *last_frame, + const FIRSTPASS_STATS *this_frame, + const FIRSTPASS_STATS *next_frame) { int is_viable_kf = 0; - // Does the frame satisfy the primary criteria of a key frame - // If so, then examine how well it predicts subsequent frames + // Does the frame satisfy the primary criteria of a key frame? + // If so, then examine how well it predicts subsequent frames. 
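/*
 * Illustrative sketch (not part of the patch) of the second stage of the key
 * frame test below: walk up to 16 following frames, accumulate a boost score
 * from the decayed intra/inter error ratio, and bail out as soon as
 * prediction holds up too well or the score stops growing, in which case the
 * candidate is not a convincing key frame.  ii_kfactor and r_max stand in
 * for IIKFACTOR1 and RMAX, whose values are not shown in this hunk.
 */
typedef struct {
  double intra_error;
  double coded_error;
  double pcnt_inter;
  double pcnt_neutral;
} sketch_frame_stats;

static double sketch_kf_boost_score(const sketch_frame_stats *next, int n,
                                    double ii_kfactor, double r_max) {
  double boost_score = 0.0, old_boost_score = 0.0, decay = 1.0;
  int i;
  for (i = 0; i < n && i < 16; ++i) {
    double iiratio = ii_kfactor * next[i].intra_error /
                     (next[i].coded_error + 0.000001);
    if (iiratio > r_max) iiratio = r_max;

    /* Cumulative effect of decay in prediction quality. */
    decay *= (next[i].pcnt_inter > 0.85)
                 ? next[i].pcnt_inter
                 : (0.85 + next[i].pcnt_inter) / 2.0;

    boost_score += decay * iiratio;

    /* Breakout clauses: prediction recovers or the boost stops growing. */
    if (next[i].pcnt_inter < 0.05 || iiratio < 1.5 ||
        ((next[i].pcnt_inter - next[i].pcnt_neutral) < 0.20 &&
         iiratio < 3.0) ||
        (boost_score - old_boost_score) < 3.0 ||
        next[i].intra_error < 200)
      break;
    old_boost_score = boost_score;
  }
  return boost_score;
}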
if ((this_frame->pcnt_second_ref < 0.10) && (next_frame->pcnt_second_ref < 0.10) && ((this_frame->pcnt_inter < 0.05) || - (((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .35) && + (((this_frame->pcnt_inter - this_frame->pcnt_neutral) < 0.35) && ((this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) && ((fabs(last_frame->coded_error - this_frame->coded_error) / - DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > - .40) || + DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > 0.40) || (fabs(last_frame->intra_error - this_frame->intra_error) / - DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > - .40) || + DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > 0.40) || ((next_frame->intra_error / DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5))))) { int i; @@ -1920,37 +1833,34 @@ static int test_candidate_kf(VP9_COMP *cpi, local_next_frame = *next_frame; - // Note the starting file position so we can reset to it + // Note the starting file position so we can reset to it. start_pos = cpi->twopass.stats_in; - // Examine how well the key frame predicts subsequent frames - for (i = 0; i < 16; i++) { + // Examine how well the key frame predicts subsequent frames. + for (i = 0; i < 16; ++i) { double next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error / DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error)); if (next_iiratio > RMAX) next_iiratio = RMAX; - // Cumulative effect of decay in prediction quality + // Cumulative effect of decay in prediction quality. if (local_next_frame.pcnt_inter > 0.85) decay_accumulator *= local_next_frame.pcnt_inter; else decay_accumulator *= (0.85 + local_next_frame.pcnt_inter) / 2.0; - // decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter; - - // Keep a running total + // Keep a running total. boost_score += (decay_accumulator * next_iiratio); - // Test various breakout clauses + // Test various breakout clauses. if ((local_next_frame.pcnt_inter < 0.05) || (next_iiratio < 1.5) || (((local_next_frame.pcnt_inter - local_next_frame.pcnt_neutral) < 0.20) && (next_iiratio < 3.0)) || ((boost_score - old_boost_score) < 3.0) || - (local_next_frame.intra_error < 200) - ) { + (local_next_frame.intra_error < 200)) { break; } @@ -1990,8 +1900,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { double kf_mod_err = 0.0; double kf_group_err = 0.0; - double kf_group_intra_err = 0.0; - double kf_group_coded_err = 0.0; double recent_loop_decay[8] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; RATE_CONTROL *const rc = &cpi->rc; @@ -1999,23 +1907,23 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { vp9_zero(next_frame); - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); start_position = twopass->stats_in; cpi->common.frame_type = KEY_FRAME; - // is this a forced key frame by interval + // Is this a forced key frame by interval. rc->this_key_frame_forced = rc->next_key_frame_forced; - // Clear the alt ref active flag as this can never be active on a key frame + // Clear the alt ref active flag as this can never be active on a key frame. rc->source_alt_ref_active = 0; - // Kf is always a gf so clear frames till next gf counter + // KF is always a GF so clear frames till next gf counter. rc->frames_till_gf_update_due = 0; rc->frames_to_key = 1; - // Take a copy of the initial frame details + // Take a copy of the initial frame details. 
first_frame = *this_frame; twopass->kf_group_bits = 0; // Total bits available to kf group @@ -2023,86 +1931,75 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { kf_mod_err = calculate_modified_err(cpi, this_frame); - // find the next keyframe + // Find the next keyframe. i = 0; while (twopass->stats_in < twopass->stats_in_end) { - // Accumulate kf group error + // Accumulate kf group error. kf_group_err += calculate_modified_err(cpi, this_frame); - // These figures keep intra and coded error counts for all frames including - // key frames in the group. The effect of the key frame itself can be - // subtracted out using the first_frame data collected above. - kf_group_intra_err += this_frame->intra_error; - kf_group_coded_err += this_frame->coded_error; - - // load a the next frame's stats + // Load the next frame's stats. last_frame = *this_frame; input_stats(twopass, this_frame); // Provided that we are not at the end of the file... if (cpi->oxcf.auto_key && lookup_next_frame_stats(twopass, &next_frame) != EOF) { - // Normal scene cut check + // Check for a scene cut. if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame)) break; - - // How fast is prediction quality decaying + // How fast is the prediction quality decaying? loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame); // We want to know something about the recent past... rather than - // as used elsewhere where we are concened with decay in prediction + // as used elsewhere where we are concerned with decay in prediction // quality since the last GF or KF. recent_loop_decay[i % 8] = loop_decay_rate; decay_accumulator = 1.0; - for (j = 0; j < 8; j++) + for (j = 0; j < 8; ++j) decay_accumulator *= recent_loop_decay[j]; // Special check for transition or high motion followed by a - // to a static scene. + // static scene. if (detect_transition_to_still(cpi, i, cpi->key_frame_frequency - i, loop_decay_rate, decay_accumulator)) break; - // Step on to the next frame - rc->frames_to_key++; + // Step on to the next frame. + ++rc->frames_to_key; // If we don't have a real key frame within the next two - // forcekeyframeevery intervals then break out of the loop. + // key_frame_frequency intervals then break out of the loop. if (rc->frames_to_key >= 2 * (int)cpi->key_frame_frequency) break; } else { - rc->frames_to_key++; + ++rc->frames_to_key; } - i++; + ++i; } // If there is a max kf interval set by the user we must obey it. // We already breakout of the loop above at 2x max. - // This code centers the extra kf if the actual natural - // interval is between 1x and 2x + // This code centers the extra kf if the actual natural interval + // is between 1x and 2x. if (cpi->oxcf.auto_key && rc->frames_to_key > (int)cpi->key_frame_frequency) { FIRSTPASS_STATS tmp_frame; rc->frames_to_key /= 2; - // Copy first frame details + // Copy first frame details. tmp_frame = first_frame; - // Reset to the start of the group + // Reset to the start of the group. reset_fpf_position(twopass, start_position); kf_group_err = 0; - kf_group_intra_err = 0; - kf_group_coded_err = 0; - // Rescan to get the correct error data for the forced kf group - for (i = 0; i < rc->frames_to_key; i++) { - // Accumulate kf group errors + // Rescan to get the correct error data for the forced kf group. + for (i = 0; i < rc->frames_to_key; ++i) { + // Accumulate kf group errors. 
kf_group_err += calculate_modified_err(cpi, &tmp_frame); - kf_group_intra_err += tmp_frame.intra_error; - kf_group_coded_err += tmp_frame.coded_error; // Load the next frame's stats. input_stats(twopass, &tmp_frame); @@ -2114,28 +2011,22 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { rc->next_key_frame_forced = 0; } - // Special case for the last key frame of the file + // Special case for the last key frame of the file. if (twopass->stats_in >= twopass->stats_in_end) { - // Accumulate kf group error + // Accumulate kf group error. kf_group_err += calculate_modified_err(cpi, this_frame); - - // These figures keep intra and coded error counts for all frames including - // key frames in the group. The effect of the key frame itself can be - // subtracted out using the first_frame data collected above. - kf_group_intra_err += this_frame->intra_error; - kf_group_coded_err += this_frame->coded_error; } // Calculate the number of bits that should be assigned to the kf group. if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) { - // Max for a single normal frame (not key frame) + // Maximum number of bits for a single normal frame (not key frame). int max_bits = frame_max_bits(cpi); - // Maximum bits for the kf group + // Maximum number of bits allocated to the key frame group. int64_t max_grp_bits; // Default allocation based on bits left and relative - // complexity of the section + // complexity of the section. twopass->kf_group_bits = (int64_t)(twopass->bits_left * (kf_group_err / twopass->modified_error_left)); @@ -2146,17 +2037,16 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { } else { twopass->kf_group_bits = 0; } - // Reset the first pass file position + // Reset the first pass file position. reset_fpf_position(twopass, start_position); // Determine how big to make this keyframe based on how well the subsequent // frames use inter blocks. decay_accumulator = 1.0; boost_score = 0.0; - loop_decay_rate = 1.00; // Starting decay rate // Scan through the kf group collating various stats. - for (i = 0; i < rc->frames_to_key; i++) { + for (i = 0; i < rc->frames_to_key; ++i) { double r; if (EOF == input_stats(twopass, &next_frame)) @@ -2181,7 +2071,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { if (r > RMAX) r = RMAX; - // How fast is prediction quality decaying + // How fast is prediction quality decaying. if (!detect_flash(twopass, 0)) { loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame); decay_accumulator *= loop_decay_rate; @@ -2199,7 +2089,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { zero_stats(§ionstats); reset_fpf_position(twopass, start_position); - for (i = 0; i < rc->frames_to_key; i++) { + for (i = 0; i < rc->frames_to_key; ++i) { input_stats(twopass, &next_frame); accumulate_stats(§ionstats, &next_frame); } @@ -2210,10 +2100,10 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { DOUBLE_DIVIDE_CHECK(sectionstats.coded_error)); } - // Reset the first pass file position + // Reset the first pass file position. reset_fpf_position(twopass, start_position); - // Work out how many bits to allocate for the key frame itself + // Work out how many bits to allocate for the key frame itself. 
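The group-level allocation in the hunk above is a proportional split of the remaining clip budget by error share. A minimal standalone sketch with hypothetical numbers; the cap of max_bits times the group length is assumed from the max_grp_bits declaration, since the clamp itself falls outside the quoted lines:

    #include <stdio.h>

    int main(void) {
      const long long bits_left = 8000000;        /* hypothetical clip budget left */
      const double kf_group_err = 120.0;          /* modified error of this kf group */
      const double modified_error_left = 1000.0;  /* modified error left in the clip */
      const long long max_bits = 60000;           /* stand-in for frame_max_bits() */
      const int frames_to_key = 48;
      const long long max_grp_bits = max_bits * (long long)frames_to_key;
      long long kf_group_bits = 0;

      if (bits_left > 0 && modified_error_left > 0.0) {
        /* Proportional share of the remaining bits, by error share. */
        kf_group_bits =
            (long long)(bits_left * (kf_group_err / modified_error_left));

        /* Assumed cap: per-frame maximum times the group length. */
        if (kf_group_bits > max_grp_bits)
          kf_group_bits = max_grp_bits;
      }

      printf("kf_group_bits=%lld\n", kf_group_bits);
      return 0;
    }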
if (1) { int kf_boost = (int)boost_score; int allocation_chunks; @@ -2222,33 +2112,34 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { if (kf_boost < (rc->frames_to_key * 3)) kf_boost = (rc->frames_to_key * 3); - if (kf_boost < MIN_BOOST) - kf_boost = MIN_BOOST; + if (kf_boost < MIN_KF_BOOST) + kf_boost = MIN_KF_BOOST; // Make a note of baseline boost and the zero motion // accumulator value for use elsewhere. rc->kf_boost = kf_boost; twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0); - // We do three calculations for kf size. - // The first is based on the error score for the whole kf group. - // The second (optionally) on the key frames own error if this is - // smaller than the average for the group. - // The final one insures that the frame receives at least the - // allocation it would have received based on its own error score vs - // the error score remaining - // Special case if the sequence appears almost totaly static - // In this case we want to spend almost all of the bits on the - // key frame. - // cpi->rc.frames_to_key-1 because key frame itself is taken - // care of by kf_boost. + // Key frame size depends on: + // (1) the error score for the whole key frame group, + // (2) the key frames' own error if this is smaller than the + // average for the group (optional), + // (3) insuring that the frame receives at least the allocation it would + // have received based on its own error score vs the error score + // remaining. + // Special case: + // If the sequence appears almost totally static we want to spend almost + // all of the bits on the key frame. + // + // We use (cpi->rc.frames_to_key - 1) below because the key frame itself is + // taken care of by kf_boost. if (zero_motion_accumulator >= 0.99) { allocation_chunks = ((rc->frames_to_key - 1) * 10) + kf_boost; } else { allocation_chunks = ((rc->frames_to_key - 1) * 100) + kf_boost; } - // Prevent overflow + // Prevent overflow. if (kf_boost > 1028) { int divisor = kf_boost >> 10; kf_boost /= divisor; @@ -2258,7 +2149,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->kf_group_bits = (twopass->kf_group_bits < 0) ? 0 : twopass->kf_group_bits; - // Calculate the number of bits to be spent on the key frame + // Calculate the number of bits to be spent on the key frame. twopass->kf_bits = (int)((double)kf_boost * ((double)twopass->kf_group_bits / allocation_chunks)); @@ -2277,9 +2168,9 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { if (twopass->kf_bits > alt_kf_bits) twopass->kf_bits = alt_kf_bits; } else { - // Else if it is much harder than other frames in the group make sure - // it at least receives an allocation in keeping with its relative - // error score + // Else if it is much harder than other frames in the group make sure + // it at least receives an allocation in keeping with its relative + // error score. alt_kf_bits = (int)((double)twopass->bits_left * (kf_mod_err / DOUBLE_DIVIDE_CHECK(twopass->modified_error_left))); @@ -2287,16 +2178,12 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->kf_bits = alt_kf_bits; } } - twopass->kf_group_bits -= twopass->kf_bits; - - // Peer frame bit target for this frame - rc->per_frame_bandwidth = twopass->kf_bits; - // Convert to a per second bitrate - cpi->target_bandwidth = (int)(twopass->kf_bits * cpi->output_framerate); + // Per frame bit target for this frame. 
+ vp9_rc_set_frame_target(cpi, twopass->kf_bits); } - // Note the total error score of the kf group minus the key frame itself + // Note the total error score of the kf group minus the key frame itself. twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err); // Adjust the count of total modified error left. @@ -2305,73 +2192,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->modified_error_left -= kf_group_err; } -void vp9_get_svc_params(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; - if ((cm->current_video_frame == 0) || - (cm->frame_flags & FRAMEFLAGS_KEY) || - (cpi->oxcf.auto_key && (cpi->rc.frames_since_key % - cpi->key_frame_frequency == 0))) { - cm->frame_type = KEY_FRAME; - cpi->rc.source_alt_ref_active = 0; - } else { - cm->frame_type = INTER_FRAME; - } - cpi->rc.frames_till_gf_update_due = INT_MAX; - cpi->rc.baseline_gf_interval = INT_MAX; -} - -// Use this macro to turn on/off use of alt-refs in one-pass mode. -#define USE_ALTREF_FOR_ONE_PASS 1 - -void vp9_get_one_pass_params(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; - if (!cpi->refresh_alt_ref_frame && - (cm->current_video_frame == 0 || - cm->frame_flags & FRAMEFLAGS_KEY || - cpi->rc.frames_to_key == 0 || - (cpi->oxcf.auto_key && test_for_kf_one_pass(cpi)))) { - cm->frame_type = KEY_FRAME; - cpi->rc.this_key_frame_forced = cm->current_video_frame != 0 && - cpi->rc.frames_to_key == 0; - cpi->rc.frames_to_key = cpi->key_frame_frequency; - cpi->rc.kf_boost = KEY_FRAME_BOOST; - cpi->rc.source_alt_ref_active = 0; - } else { - cm->frame_type = INTER_FRAME; - } - if (cpi->rc.frames_till_gf_update_due == 0) { - cpi->rc.baseline_gf_interval = DEFAULT_GF_INTERVAL; - cpi->rc.frames_till_gf_update_due = cpi->rc.baseline_gf_interval; - // NOTE: frames_till_gf_update_due must be <= frames_to_key. - if (cpi->rc.frames_till_gf_update_due > cpi->rc.frames_to_key) - cpi->rc.frames_till_gf_update_due = cpi->rc.frames_to_key; - cpi->refresh_golden_frame = 1; - cpi->rc.source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS; - cpi->rc.gfu_boost = 1000; - } -} - -void vp9_get_one_pass_cbr_params(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; - if ((cm->current_video_frame == 0 || - cm->frame_flags & FRAMEFLAGS_KEY || - cpi->rc.frames_to_key == 0 || - (cpi->oxcf.auto_key && test_for_kf_one_pass(cpi)))) { - cm->frame_type = KEY_FRAME; - cpi->rc.this_key_frame_forced = cm->current_video_frame != 0 && - cpi->rc.frames_to_key == 0; - cpi->rc.frames_to_key = cpi->key_frame_frequency; - cpi->rc.kf_boost = KEY_FRAME_BOOST; - cpi->rc.source_alt_ref_active = 0; - } else { - cm->frame_type = INTER_FRAME; - } - // Don't use gf_update by default in CBR mode. - cpi->rc.frames_till_gf_update_due = INT_MAX; - cpi->rc.baseline_gf_interval = INT_MAX; -} - -void vp9_get_first_pass_params(VP9_COMP *cpi) { +void vp9_rc_get_first_pass_params(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; if (!cpi->refresh_alt_ref_frame && (cm->current_video_frame == 0 || @@ -2380,11 +2201,11 @@ void vp9_get_first_pass_params(VP9_COMP *cpi) { } else { cm->frame_type = INTER_FRAME; } - // Do not use periodic key frames + // Do not use periodic key frames. 
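The key frame's own share of that budget, from the hunk above, is a weighted split: the key frame holds kf_boost "chunks" against 100 per remaining frame, or 10 per frame when the group is nearly static. A standalone sketch of the arithmetic, including the overflow guard, with hypothetical values:

    #include <stdio.h>

    int main(void) {
      const long long kf_group_bits = 960000;   /* hypothetical group budget */
      const int frames_to_key = 48;
      const double zero_motion_accumulator = 0.5;
      int kf_boost = 2000;                      /* hypothetical boost score */
      int allocation_chunks;
      int kf_bits;

      /* Near-static groups weight the other frames at 10 chunks each so the
       * key frame takes most of the budget; otherwise 100 chunks each. */
      if (zero_motion_accumulator >= 0.99)
        allocation_chunks = (frames_to_key - 1) * 10 + kf_boost;
      else
        allocation_chunks = (frames_to_key - 1) * 100 + kf_boost;

      /* Overflow guard from the patch: scale both terms down together. */
      if (kf_boost > 1028) {
        const int divisor = kf_boost >> 10;
        kf_boost /= divisor;
        allocation_chunks /= divisor;
      }

      kf_bits = (int)((double)kf_boost *
                      ((double)kf_group_bits / allocation_chunks));
      printf("allocation_chunks=%d kf_bits=%d\n", allocation_chunks, kf_bits);
      return 0;
    }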
cpi->rc.frames_to_key = INT_MAX; } -void vp9_get_second_pass_params(VP9_COMP *cpi) { +void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; struct twopass_rc *const twopass = &cpi->twopass; @@ -2395,37 +2216,30 @@ void vp9_get_second_pass_params(VP9_COMP *cpi) { double this_frame_intra_error; double this_frame_coded_error; + int target; if (!twopass->stats_in) return; if (cpi->refresh_alt_ref_frame) { cm->frame_type = INTER_FRAME; - rc->per_frame_bandwidth = twopass->gf_bits; + vp9_rc_set_frame_target(cpi, twopass->gf_bits); return; } vp9_clear_system_state(); if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) { - rc->active_worst_quality = cpi->oxcf.cq_level; + twopass->active_worst_quality = cpi->oxcf.cq_level; } else if (cm->current_video_frame == 0) { // Special case code for first frame. const int section_target_bandwidth = (int)(twopass->bits_left / - frames_left); - const int tmp_q = estimate_max_q(cpi, &twopass->total_left_stats, - section_target_bandwidth); - - rc->active_worst_quality = tmp_q; + frames_left); + const int tmp_q = vp9_twopass_worst_quality(cpi, &twopass->total_left_stats, + section_target_bandwidth); + twopass->active_worst_quality = tmp_q; rc->ni_av_qi = tmp_q; rc->avg_q = vp9_convert_qindex_to_q(tmp_q); - - // Limit the maxq value returned subsequently. - // This increases the risk of overspend or underspend if the initial - // estimate for the clip is bad, but helps prevent excessive - // variation in Q, especially near the end of a clip - // where for example a small overspend may cause Q to crash - // adjust_maxq_qrange(cpi); } vp9_zero(this_frame); if (EOF == input_stats(twopass, &this_frame)) @@ -2434,19 +2248,19 @@ void vp9_get_second_pass_params(VP9_COMP *cpi) { this_frame_intra_error = this_frame.intra_error; this_frame_coded_error = this_frame.coded_error; - // keyframe and section processing ! + // Keyframe and section processing. if (rc->frames_to_key == 0 || (cm->frame_flags & FRAMEFLAGS_KEY)) { - // Define next KF group and assign bits to it + // Define next KF group and assign bits to it. this_frame_copy = this_frame; find_next_key_frame(cpi, &this_frame_copy); } else { cm->frame_type = INTER_FRAME; } - // Is this a GF / ARF (Note that a KF is always also a GF) + // Is this frame a GF / ARF? (Note: a key frame is always also a GF). if (rc->frames_till_gf_update_due == 0) { - // Define next gf group and assign bits to it + // Define next gf group and assign bits to it. this_frame_copy = this_frame; #if CONFIG_MULTIPLE_ARF @@ -2461,18 +2275,19 @@ void vp9_get_second_pass_params(VP9_COMP *cpi) { if (twopass->gf_zeromotion_pct > 995) { // As long as max_thresh for encode breakout is small enough, it is ok - // to enable it for no-show frame, i.e. set enable_encode_breakout to 2. + // to enable it for show frame, i.e. set allow_encode_breakout to + // ENCODE_BREAKOUT_LIMITED. if (!cm->show_frame) - cpi->enable_encode_breakout = 0; + cpi->allow_encode_breakout = ENCODE_BREAKOUT_DISABLED; else - cpi->enable_encode_breakout = 2; + cpi->allow_encode_breakout = ENCODE_BREAKOUT_LIMITED; } rc->frames_till_gf_update_due = rc->baseline_gf_interval; cpi->refresh_golden_frame = 1; } else { - // Otherwise this is an ordinary frame - // Assign bits from those allocated to the GF group + // Otherwise this is an ordinary frame. + // Assign bits from those allocated to the GF group. 
this_frame_copy = this_frame; assign_std_frame_bits(cpi, &this_frame_copy); } @@ -2488,13 +2303,13 @@ void vp9_get_second_pass_params(VP9_COMP *cpi) { } } - // Set nominal per second bandwidth for this frame - cpi->target_bandwidth = (int)(rc->per_frame_bandwidth * - cpi->output_framerate); - if (cpi->target_bandwidth < 0) - cpi->target_bandwidth = 0; + if (cpi->common.frame_type == KEY_FRAME) + target = vp9_rc_clamp_iframe_target_size(cpi, rc->this_frame_target); + else + target = vp9_rc_clamp_pframe_target_size(cpi, rc->this_frame_target); + vp9_rc_set_frame_target(cpi, target); - // Update the total stats remaining structure + // Update the total stats remaining structure. subtract_stats(&twopass->total_left_stats, &this_frame); } @@ -2503,5 +2318,18 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { cpi->twopass.bits_left -= cpi->rc.this_frame_target; #else cpi->twopass.bits_left -= 8 * bytes_used; + // Update bits left to the kf and gf groups to account for overshoot or + // undershoot on these frames. + if (cm->frame_type == KEY_FRAME) { + cpi->twopass.kf_group_bits += cpi->rc.this_frame_target - + cpi->rc.projected_frame_size; + + cpi->twopass.kf_group_bits = MAX(cpi->twopass.kf_group_bits, 0); + } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) { + cpi->twopass.gf_group_bits += cpi->rc.this_frame_target - + cpi->rc.projected_frame_size; + + cpi->twopass.gf_group_bits = MAX(cpi->twopass.gf_group_bits, 0); + } #endif } diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index ca5b10080..83e337b6d 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -10,25 +10,92 @@ #ifndef VP9_ENCODER_VP9_FIRSTPASS_H_ #define VP9_ENCODER_VP9_FIRSTPASS_H_ -#include "vp9/encoder/vp9_onyx_int.h" #ifdef __cplusplus extern "C" { #endif -void vp9_init_first_pass(VP9_COMP *cpi); -void vp9_first_pass(VP9_COMP *cpi); -void vp9_end_first_pass(VP9_COMP *cpi); +typedef struct { + double frame; + double intra_error; + double coded_error; + double sr_coded_error; + double ssim_weighted_pred_err; + double pcnt_inter; + double pcnt_motion; + double pcnt_second_ref; + double pcnt_neutral; + double MVr; + double mvr_abs; + double MVc; + double mvc_abs; + double MVrv; + double MVcv; + double mv_in_out_count; + double new_mv_count; + double duration; + double count; +} FIRSTPASS_STATS; -void vp9_init_second_pass(VP9_COMP *cpi); -void vp9_get_second_pass_params(VP9_COMP *cpi); -void vp9_end_second_pass(VP9_COMP *cpi); +struct twopass_rc { + unsigned int section_intra_rating; + unsigned int next_iiratio; + unsigned int this_iiratio; + FIRSTPASS_STATS total_stats; + FIRSTPASS_STATS this_frame_stats; + FIRSTPASS_STATS *stats_in, *stats_in_end, *stats_in_start; + FIRSTPASS_STATS total_left_stats; + int first_pass_done; + int64_t bits_left; + int64_t clip_bits_total; + double avg_iiratio; + double modified_error_min; + double modified_error_max; + double modified_error_total; + double modified_error_left; + double kf_intra_err_min; + double gf_intra_err_min; + int static_scene_max_gf_interval; + int kf_bits; + // Remaining error from uncoded frames in a gf group. 
Two pass use only + int64_t gf_group_error_left; -void vp9_get_first_pass_params(VP9_COMP *cpi); -void vp9_get_one_pass_params(VP9_COMP *cpi); -void vp9_get_one_pass_cbr_params(VP9_COMP *cpi); -void vp9_get_svc_params(VP9_COMP *cpi); + // Projected total bits available for a key frame group of frames + int64_t kf_group_bits; + // Error score of frames still to be coded in kf group + int64_t kf_group_error_left; + + // Projected Bits available for a group of frames including 1 GF or ARF + int64_t gf_group_bits; + // Bits for the golden frame or ARF - 2 pass only + int gf_bits; + int alt_extra_bits; + + int sr_update_lag; + + int kf_zeromotion_pct; + int gf_zeromotion_pct; + + int active_worst_quality; +}; + +struct VP9_COMP; + +void vp9_init_first_pass(struct VP9_COMP *cpi); +void vp9_rc_get_first_pass_params(struct VP9_COMP *cpi); +void vp9_first_pass(struct VP9_COMP *cpi); +void vp9_end_first_pass(struct VP9_COMP *cpi); + +void vp9_init_second_pass(struct VP9_COMP *cpi); +void vp9_rc_get_second_pass_params(struct VP9_COMP *cpi); +void vp9_end_second_pass(struct VP9_COMP *cpi); +int vp9_twopass_worst_quality(struct VP9_COMP *cpi, FIRSTPASS_STATS *fpstats, + int section_target_bandwitdh); + +// Post encode update of the rate control parameters for 2-pass +void vp9_twopass_postencode_update(struct VP9_COMP *cpi, + uint64_t bytes_used); #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/encoder/vp9_lookahead.c b/vp9/encoder/vp9_lookahead.c index e6e59c05a..4b642e2b6 100644 --- a/vp9/encoder/vp9_lookahead.c +++ b/vp9/encoder/vp9_lookahead.c @@ -11,9 +11,12 @@ #include <stdlib.h> #include "./vpx_config.h" + #include "vp9/common/vp9_common.h" + #include "vp9/encoder/vp9_extend.h" #include "vp9/encoder/vp9_lookahead.h" +#include "vp9/encoder/vp9_onyx_int.h" struct lookahead_ctx { unsigned int max_sz; /* Absolute size of the queue */ diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c index c50098678..44c1f9078 100644 --- a/vp9/encoder/vp9_mbgraph.c +++ b/vp9/encoder/vp9_mbgraph.c @@ -29,7 +29,6 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16]; - unsigned int best_err; const int tmp_col_min = x->mv_col_min; const int tmp_col_max = x->mv_col_max; @@ -48,27 +47,22 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, ref_full.row = ref_mv->row >> 3; /*cpi->sf.search_method == HEX*/ - best_err = vp9_hex_search(x, &ref_full, step_param, x->errorperbit, - 0, &v_fn_ptr, 0, ref_mv, dst_mv); + vp9_hex_search(x, &ref_full, step_param, x->errorperbit, 0, &v_fn_ptr, 0, + ref_mv, dst_mv); // Try sub-pixel MC // if (bestsme > error_thresh && bestsme < INT_MAX) { int distortion; unsigned int sse; - best_err = cpi->find_fractional_mv_step( - x, dst_mv, ref_mv, - cpi->common.allow_high_precision_mv, - x->errorperbit, &v_fn_ptr, - 0, cpi->sf.subpel_iters_per_step, NULL, NULL, - & distortion, &sse); + cpi->find_fractional_mv_step( + x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit, + &v_fn_ptr, 0, cpi->sf.subpel_iters_per_step, NULL, NULL, &distortion, + &sse); } vp9_set_mbmode_and_mvs(xd, NEWMV, dst_mv); vp9_build_inter_predictors_sby(xd, mb_row, mb_col, BLOCK_16X16); - best_err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, - xd->plane[0].dst.buf, xd->plane[0].dst.stride, - INT_MAX); /* restore UMV window */ x->mv_col_min = tmp_col_min; @@ -76,7 +70,9 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP 
*cpi, x->mv_row_min = tmp_row_min; x->mv_row_max = tmp_row_max; - return best_err; + return vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, + xd->plane[0].dst.buf, xd->plane[0].dst.stride, + INT_MAX); } static int do_16x16_motion_search(VP9_COMP *cpi, const int_mv *ref_mv, @@ -355,7 +351,7 @@ static void separate_arf_mbs(VP9_COMP *cpi) { for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) { // If any of the blocks in the sequence failed then the MB // goes in segment 0 - if (arf_not_zz[mi_row/2*cm->mb_cols + mi_col/2]) { + if (arf_not_zz[mi_row / 2 * cm->mb_cols + mi_col / 2]) { ncnt[0]++; cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 0; } else { @@ -423,7 +419,7 @@ void vp9_update_mbgraph_stats(VP9_COMP *cpi) { golden_ref, cpi->Source); } - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); separate_arf_mbs(cpi); } diff --git a/vp9/encoder/vp9_mbgraph.h b/vp9/encoder/vp9_mbgraph.h index 79dd2bc95..bc2a7048f 100644 --- a/vp9/encoder/vp9_mbgraph.h +++ b/vp9/encoder/vp9_mbgraph.h @@ -15,7 +15,23 @@ extern "C" { #endif -void vp9_update_mbgraph_stats(VP9_COMP *cpi); +typedef struct { + struct { + int err; + union { + int_mv mv; + MB_PREDICTION_MODE mode; + } m; + } ref[MAX_REF_FRAMES]; +} MBGRAPH_MB_STATS; + +typedef struct { + MBGRAPH_MB_STATS *mb_stats; +} MBGRAPH_FRAME_STATS; + +struct VP9_COMP; + +void vp9_update_mbgraph_stats(struct VP9_COMP *cpi); #ifdef __cplusplus } // extern "C" diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index ec9934a30..7d6fd3b99 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -349,6 +349,10 @@ int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x, tr = br; tc = bc; } + // These lines insure static analysis doesn't warn that + // tr and tc aren't used after the above point. + (void) tr; + (void) tc; bestmv->row = br; bestmv->col = bc; @@ -452,6 +456,11 @@ int vp9_find_best_sub_pixel_comp_tree(const MACROBLOCK *x, tr = br; tc = bc; } + // These lines insure static analysis doesn't warn that + // tr and tc aren't used after the above point. + (void) tr; + (void) tc; + bestmv->row = br; bestmv->col = bc; @@ -466,7 +475,6 @@ int vp9_find_best_sub_pixel_comp_tree(const MACROBLOCK *x, #undef PRE #undef DIST #undef CHECK_BETTER -#undef SP static INLINE int check_bounds(const MACROBLOCK *x, int row, int col, int range) { @@ -476,11 +484,9 @@ static INLINE int check_bounds(const MACROBLOCK *x, int row, int col, ((col + range) <= x->mv_col_max); } -static INLINE int check_point(const MACROBLOCK *x, const MV *mv) { - return (mv->col < x->mv_col_min) | - (mv->col > x->mv_col_max) | - (mv->row < x->mv_row_min) | - (mv->row > x->mv_row_max); +static INLINE int is_mv_in(const MACROBLOCK *x, const MV *mv) { + return (mv->col >= x->mv_col_min) && (mv->col <= x->mv_col_max) && + (mv->row >= x->mv_row_min) && (mv->row <= x->mv_row_max); } #define CHECK_BETTER \ @@ -496,11 +502,6 @@ static INLINE int check_point(const MACROBLOCK *x, const MV *mv) { }\ } -#define get_next_chkpts(list, i, n) \ - list[0] = ((i) == 0 ? (n) - 1 : (i) - 1); \ - list[1] = (i); \ - list[2] = ((i) == (n) - 1 ? 
0 : (i) + 1); - #define MAX_PATTERN_SCALES 11 #define MAX_PATTERN_CANDIDATES 8 // max number of canddiates per scale #define PATTERN_CANDIDATES_REF 3 // number of refinement candidates @@ -578,7 +579,7 @@ static int vp9_pattern_search(const MACROBLOCK *x, for (i = 0; i < num_candidates[t]; i++) { this_mv.row = br + candidates[t][i].row; this_mv.col = bc + candidates[t][i].col; - if (check_point(x, &this_mv)) + if (!is_mv_in(x, &this_mv)) continue; this_offset = base_offset + (this_mv.row * in_what_stride) + this_mv.col; @@ -622,7 +623,7 @@ static int vp9_pattern_search(const MACROBLOCK *x, for (i = 0; i < num_candidates[s]; i++) { this_mv.row = br + candidates[s][i].row; this_mv.col = bc + candidates[s][i].col; - if (check_point(x, &this_mv)) + if (!is_mv_in(x, &this_mv)) continue; this_offset = base_offset + (this_mv.row * in_what_stride) + this_mv.col; @@ -644,7 +645,10 @@ static int vp9_pattern_search(const MACROBLOCK *x, do { int next_chkpts_indices[PATTERN_CANDIDATES_REF]; best_site = -1; - get_next_chkpts(next_chkpts_indices, k, num_candidates[s]); + next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1; + next_chkpts_indices[1] = k; + next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1; + if (check_bounds(x, br, bc, 1 << s)) { for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { this_mv.row = br + candidates[s][next_chkpts_indices[i]].row; @@ -659,7 +663,7 @@ static int vp9_pattern_search(const MACROBLOCK *x, for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { this_mv.row = br + candidates[s][next_chkpts_indices[i]].row; this_mv.col = bc + candidates[s][next_chkpts_indices[i]].col; - if (check_point(x, &this_mv)) + if (!is_mv_in(x, &this_mv)) continue; this_offset = base_offset + (this_mv.row * (in_what_stride)) + this_mv.col; @@ -698,7 +702,7 @@ static int vp9_pattern_search(const MACROBLOCK *x, for (i = 0; i < 4; i++) { this_mv.row = br + neighbors[i].row; this_mv.col = bc + neighbors[i].col; - if (check_point(x, &this_mv)) + if (!is_mv_in(x, &this_mv)) continue; this_offset = base_offset + this_mv.row * in_what_stride + this_mv.col; @@ -851,12 +855,191 @@ int vp9_square_search(const MACROBLOCK *x, square_num_candidates, square_candidates); }; +// Number of candidates in first hex search +#define FIRST_HEX_CANDIDATES 6 +// Index of previous hex search's best match +#define PRE_BEST_CANDIDATE 6 +// Number of candidates in following hex search +#define NEXT_HEX_CANDIDATES 3 +// Number of candidates in refining search +#define REFINE_CANDIDATES 4 + +int vp9_fast_hex_search(const MACROBLOCK *x, + MV *ref_mv, + int search_param, + int sad_per_bit, + const vp9_variance_fn_ptr_t *vfp, + int use_mvcost, + const MV *center_mv, + MV *best_mv) { + const MACROBLOCKD* const xd = &x->e_mbd; + static const MV hex[FIRST_HEX_CANDIDATES] = { + { -1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0} + }; + static const MV next_chkpts[PRE_BEST_CANDIDATE][NEXT_HEX_CANDIDATES] = { + {{ -2, 0}, { -1, -2}, {1, -2}}, + {{ -1, -2}, {1, -2}, {2, 0}}, + {{1, -2}, {2, 0}, {1, 2}}, + {{2, 0}, {1, 2}, { -1, 2}}, + {{1, 2}, { -1, 2}, { -2, 0}}, + {{ -1, 2}, { -2, 0}, { -1, -2}} + }; + static const MV neighbors[REFINE_CANDIDATES] = { + {0, -1}, { -1, 0}, {1, 0}, {0, 1} + }; + int i, j; + + const uint8_t *what = x->plane[0].src.buf; + const int what_stride = x->plane[0].src.stride; + const int in_what_stride = xd->plane[0].pre[0].stride; + int br, bc; + MV this_mv; + unsigned int bestsad = 0x7fffffff; + unsigned int thissad; + const uint8_t *base_offset; + const uint8_t *this_offset; + int k = -1; + 
int best_site = -1; + const int max_hex_search = 512; + const int max_dia_search = 32; + + const int *mvjsadcost = x->nmvjointsadcost; + int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; + + const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; + + // Adjust ref_mv to make sure it is within MV range + clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); + br = ref_mv->row; + bc = ref_mv->col; + + // Check the start point + base_offset = xd->plane[0].pre[0].buf; + this_offset = base_offset + (br * in_what_stride) + bc; + this_mv.row = br; + this_mv.col = bc; + bestsad = vfp->sdf(what, what_stride, this_offset, in_what_stride, 0x7fffffff) + + mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, mvsadcost, + sad_per_bit); + + // Initial 6-point hex search + if (check_bounds(x, br, bc, 2)) { + for (i = 0; i < FIRST_HEX_CANDIDATES; i++) { + this_mv.row = br + hex[i].row; + this_mv.col = bc + hex[i].col; + this_offset = base_offset + (this_mv.row * in_what_stride) + this_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, + bestsad); + CHECK_BETTER + } + } else { + for (i = 0; i < FIRST_HEX_CANDIDATES; i++) { + this_mv.row = br + hex[i].row; + this_mv.col = bc + hex[i].col; + if (!is_mv_in(x, &this_mv)) + continue; + this_offset = base_offset + (this_mv.row * in_what_stride) + this_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, + bestsad); + CHECK_BETTER + } + } + + // Continue hex search if we find a better match in first round + if (best_site != -1) { + br += hex[best_site].row; + bc += hex[best_site].col; + k = best_site; + + // Allow search covering maximum MV range + for (j = 1; j < max_hex_search; j++) { + best_site = -1; + + if (check_bounds(x, br, bc, 2)) { + for (i = 0; i < 3; i++) { + this_mv.row = br + next_chkpts[k][i].row; + this_mv.col = bc + next_chkpts[k][i].col; + this_offset = base_offset + (this_mv.row * in_what_stride) + + this_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, + bestsad); + CHECK_BETTER + } + } else { + for (i = 0; i < 3; i++) { + this_mv.row = br + next_chkpts[k][i].row; + this_mv.col = bc + next_chkpts[k][i].col; + if (!is_mv_in(x, &this_mv)) + continue; + this_offset = base_offset + (this_mv.row * in_what_stride) + + this_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, + bestsad); + CHECK_BETTER + } + } + + if (best_site == -1) { + break; + } else { + br += next_chkpts[k][best_site].row; + bc += next_chkpts[k][best_site].col; + k += 5 + best_site; + if (k >= 12) k -= 12; + else if (k >= 6) k -= 6; + } + } + } + + // Check 4 1-away neighbors + for (j = 0; j < max_dia_search; j++) { + best_site = -1; + + if (check_bounds(x, br, bc, 1)) { + for (i = 0; i < REFINE_CANDIDATES; i++) { + this_mv.row = br + neighbors[i].row; + this_mv.col = bc + neighbors[i].col; + this_offset = base_offset + (this_mv.row * in_what_stride) + + this_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, + bestsad); + CHECK_BETTER + } + } else { + for (i = 0; i < REFINE_CANDIDATES; i++) { + this_mv.row = br + neighbors[i].row; + this_mv.col = bc + neighbors[i].col; + if (!is_mv_in(x, &this_mv)) + continue; + this_offset = base_offset + (this_mv.row * in_what_stride) + + this_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, + bestsad); + CHECK_BETTER + } + } + + if (best_site == -1) { + break; + } else { + br += neighbors[best_site].row; + bc += neighbors[best_site].col; + } + } + + 
best_mv->row = br; + best_mv->col = bc; + + return bestsad; +} + #undef CHECK_BETTER int vp9_full_range_search_c(const MACROBLOCK *x, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, - vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, - int *mvcost[2], const MV *center_mv) { + const vp9_variance_fn_ptr_t *fn_ptr, + int *mvjcost, int *mvcost[2], + const MV *center_mv) { const MACROBLOCKD *const xd = &x->e_mbd; const uint8_t *what = x->plane[0].src.buf; const int what_stride = x->plane[0].src.stride; @@ -866,10 +1049,10 @@ int vp9_full_range_search_c(const MACROBLOCK *x, MV *ref_mv, MV *best_mv, MV this_mv; - int bestsad = INT_MAX; + unsigned int bestsad = INT_MAX; int ref_row, ref_col; - int thissad; + unsigned int thissad; const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; const int *mvjsadcost = x->nmvjointsadcost; @@ -970,8 +1153,9 @@ int vp9_full_range_search_c(const MACROBLOCK *x, MV *ref_mv, MV *best_mv, int vp9_diamond_search_sad_c(const MACROBLOCK *x, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, - vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, - int *mvcost[2], const MV *center_mv) { + const vp9_variance_fn_ptr_t *fn_ptr, + int *mvjcost, int *mvcost[2], + const MV *center_mv) { int i, j, step; const MACROBLOCKD *const xd = &x->e_mbd; @@ -1104,7 +1288,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, int vp9_diamond_search_sadx4(const MACROBLOCK *x, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, - vp9_variance_fn_ptr_t *fn_ptr, + const vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, int *mvcost[2], const MV *center_mv) { int i, j, step; @@ -1283,148 +1467,122 @@ int vp9_diamond_search_sadx4(const MACROBLOCK *x, int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x, MV *mvp_full, int step_param, - int sadpb, int further_steps, - int do_refine, vp9_variance_fn_ptr_t *fn_ptr, - const MV *ref_mv, int_mv *dst_mv) { - int_mv temp_mv; - int thissme, n, num00; - int bestsme = cpi->diamond_search_sad(x, mvp_full, &temp_mv.as_mv, - step_param, sadpb, &num00, + int sadpb, int further_steps, int do_refine, + const vp9_variance_fn_ptr_t *fn_ptr, + const MV *ref_mv, MV *dst_mv) { + MV temp_mv; + int thissme, n, num00 = 0; + int bestsme = cpi->diamond_search_sad(x, mvp_full, &temp_mv, + step_param, sadpb, &n, fn_ptr, x->nmvjointcost, x->mvcost, ref_mv); - dst_mv->as_int = temp_mv.as_int; - - n = num00; - num00 = 0; + *dst_mv = temp_mv; - /* If there won't be more n-step search, check to see if refining search is - * needed. */ + // If there won't be more n-step search, check to see if refining search is + // needed. if (n > further_steps) do_refine = 0; while (n < further_steps) { - n++; + ++n; if (num00) { num00--; } else { - thissme = cpi->diamond_search_sad(x, mvp_full, &temp_mv.as_mv, + thissme = cpi->diamond_search_sad(x, mvp_full, &temp_mv, step_param + n, sadpb, &num00, fn_ptr, x->nmvjointcost, x->mvcost, ref_mv); - /* check to see if refining search is needed. */ - if (num00 > (further_steps - n)) + // check to see if refining search is needed. 
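vp9_fast_hex_search above re-examines only the three hexagon points adjacent to the previous best direction, and the "k += 5 + best_site" update with the 12/6 wrap is modular bookkeeping for that direction index. A small standalone check that the update is equivalent to (k + best_site - 1) mod 6:

    #include <assert.h>
    #include <stdio.h>

    /* next_chkpts[k] holds hex points k-1, k, k+1 (mod 6), so after choosing
     * candidate best_site (0..2) the new direction is (k + best_site - 1) mod 6.
     * The patch computes this without a negative intermediate: add 5 + best_site
     * and wrap at 12 and 6. */
    static int next_direction(int k, int best_site) {
      k += 5 + best_site;
      if (k >= 12)
        k -= 12;
      else if (k >= 6)
        k -= 6;
      return k;
    }

    int main(void) {
      int k, s;
      for (k = 0; k < 6; ++k)
        for (s = 0; s < 3; ++s)
          assert(next_direction(k, s) == (k + s + 5) % 6);
      printf("direction update matches (k + best_site - 1) mod 6\n");
      return 0;
    }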
+ if (num00 > further_steps - n) do_refine = 0; if (thissme < bestsme) { bestsme = thissme; - dst_mv->as_int = temp_mv.as_int; + *dst_mv = temp_mv; } } } - /* final 1-away diamond refining search */ - if (do_refine == 1) { - int search_range = 8; - int_mv best_mv; - best_mv.as_int = dst_mv->as_int; - thissme = cpi->refining_search_sad(x, &best_mv.as_mv, sadpb, search_range, + // final 1-away diamond refining search + if (do_refine) { + const int search_range = 8; + MV best_mv = *dst_mv; + thissme = cpi->refining_search_sad(x, &best_mv, sadpb, search_range, fn_ptr, x->nmvjointcost, x->mvcost, ref_mv); - if (thissme < bestsme) { bestsme = thissme; - dst_mv->as_int = best_mv.as_int; + *dst_mv = best_mv; } } + return bestsme; } -int vp9_full_search_sad_c(const MACROBLOCK *x, MV *ref_mv, +int vp9_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv, int sad_per_bit, int distance, - vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, - int *mvcost[2], - const MV *center_mv, int n) { + const vp9_variance_fn_ptr_t *fn_ptr, + int *mvjcost, int *mvcost[2], + const MV *center_mv, MV *best_mv) { + int r, c; const MACROBLOCKD *const xd = &x->e_mbd; const uint8_t *const what = x->plane[0].src.buf; const int what_stride = x->plane[0].src.stride; const uint8_t *const in_what = xd->plane[0].pre[0].buf; const int in_what_stride = xd->plane[0].pre[0].stride; - MV *best_mv = &xd->mi_8x8[0]->bmi[n].as_mv[0].as_mv; - MV this_mv; - int bestsad = INT_MAX; - int r, c; - int thissad; - int ref_row = ref_mv->row; - int ref_col = ref_mv->col; - // Apply further limits to prevent us looking using vectors that stretch - // beyond the UMV border - const int row_min = MAX(ref_row - distance, x->mv_row_min); - const int row_max = MIN(ref_row + distance, x->mv_row_max); - const int col_min = MAX(ref_col - distance, x->mv_col_min); - const int col_max = MIN(ref_col + distance, x->mv_col_max); - const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; + const int row_min = MAX(ref_mv->row - distance, x->mv_row_min); + const int row_max = MIN(ref_mv->row + distance, x->mv_row_max); + const int col_min = MAX(ref_mv->col - distance, x->mv_col_min); + const int col_max = MIN(ref_mv->col + distance, x->mv_col_max); const int *mvjsadcost = x->nmvjointsadcost; int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; - - // Work out the mid point for the search - const uint8_t *bestaddress = &in_what[ref_row * in_what_stride + ref_col]; - - best_mv->row = ref_row; - best_mv->col = ref_col; - - // Baseline value at the centre - bestsad = fn_ptr->sdf(what, what_stride, bestaddress, - in_what_stride, 0x7fffffff) - + mvsad_err_cost(best_mv, &fcenter_mv, - mvjsadcost, mvsadcost, sad_per_bit); - - for (r = row_min; r < row_max; r++) { - const uint8_t *check_here = &in_what[r * in_what_stride + col_min]; - this_mv.row = r; - - for (c = col_min; c < col_max; c++) { - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, - bestsad); - - this_mv.col = c; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, - mvjsadcost, mvsadcost, sad_per_bit); - - if (thissad < bestsad) { - bestsad = thissad; - best_mv->row = r; - best_mv->col = c; - bestaddress = check_here; + const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; + const uint8_t *best_address = &in_what[ref_mv->row * in_what_stride + + ref_mv->col]; + int best_sad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride, + 0x7fffffff) + + mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, sad_per_bit); + *best_mv = *ref_mv; + + for (r = row_min; r 
< row_max; ++r) { + for (c = col_min; c < col_max; ++c) { + const MV this_mv = {r, c}; + const uint8_t *check_here = &in_what[r * in_what_stride + c]; + const int sad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, + best_sad) + + mvsad_err_cost(&this_mv, &fcenter_mv, + mvjsadcost, mvsadcost, sad_per_bit); + + if (sad < best_sad) { + best_sad = sad; + *best_mv = this_mv; + best_address = check_here; } - - check_here++; } } - this_mv.row = best_mv->row * 8; - this_mv.col = best_mv->col * 8; - - if (bestsad < INT_MAX) - return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, - (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv, center_mv, - mvjcost, mvcost, x->errorperbit); - else + if (best_sad < INT_MAX) { + unsigned int unused; + const MV mv = {best_mv->row * 8, best_mv->col * 8}; + return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &unused) + + mv_err_cost(&mv, center_mv, mvjcost, mvcost, x->errorperbit); + } else { return INT_MAX; + } } -int vp9_full_search_sadx3(const MACROBLOCK *x, MV *ref_mv, +int vp9_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv, int sad_per_bit, int distance, - vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, - int *mvcost[2], const MV *center_mv, int n) { + const vp9_variance_fn_ptr_t *fn_ptr, + int *mvjcost, int *mvcost[2], + const MV *center_mv, MV *best_mv) { const MACROBLOCKD *const xd = &x->e_mbd; const uint8_t *const what = x->plane[0].src.buf; const int what_stride = x->plane[0].src.stride; const uint8_t *const in_what = xd->plane[0].pre[0].buf; const int in_what_stride = xd->plane[0].pre[0].stride; - MV *best_mv = &xd->mi_8x8[0]->bmi[n].as_mv[0].as_mv; MV this_mv; unsigned int bestsad = INT_MAX; int r, c; @@ -1520,17 +1678,16 @@ int vp9_full_search_sadx3(const MACROBLOCK *x, MV *ref_mv, return INT_MAX; } -int vp9_full_search_sadx8(const MACROBLOCK *x, MV *ref_mv, +int vp9_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv, int sad_per_bit, int distance, - vp9_variance_fn_ptr_t *fn_ptr, + const vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, int *mvcost[2], - const MV *center_mv, int n) { + const MV *center_mv, MV *best_mv) { const MACROBLOCKD *const xd = &x->e_mbd; const uint8_t *const what = x->plane[0].src.buf; const int what_stride = x->plane[0].src.stride; const uint8_t *const in_what = xd->plane[0].pre[0].buf; const int in_what_stride = xd->plane[0].pre[0].stride; - MV *best_mv = &xd->mi_8x8[0]->bmi[n].as_mv[0].as_mv; MV this_mv; unsigned int bestsad = INT_MAX; int r, c; @@ -1656,7 +1813,8 @@ int vp9_full_search_sadx8(const MACROBLOCK *x, MV *ref_mv, int vp9_refining_search_sad_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, - int search_range, vp9_variance_fn_ptr_t *fn_ptr, + int search_range, + const vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, int *mvcost[2], const MV *center_mv) { const MACROBLOCKD *const xd = &x->e_mbd; @@ -1669,11 +1827,7 @@ int vp9_refining_search_sad_c(const MACROBLOCK *x, const uint8_t *const in_what = xd->plane[0].pre[0].buf; const uint8_t *best_address = &in_what[ref_mv->row * in_what_stride + ref_mv->col]; - unsigned int thissad; - const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; - MV this_mv; - const int *mvjsadcost = x->nmvjointsadcost; int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; @@ -1685,18 +1839,13 @@ int vp9_refining_search_sad_c(const MACROBLOCK *x, int best_site = -1; for (j = 0; j < 4; j++) { - this_mv.row = ref_mv->row + neighbors[j].row; - this_mv.col = ref_mv->col + neighbors[j].col; - - if ((this_mv.col > x->mv_col_min) && - 
(this_mv.col < x->mv_col_max) && - (this_mv.row > x->mv_row_min) && - (this_mv.row < x->mv_row_max)) { + const MV this_mv = {ref_mv->row + neighbors[j].row, + ref_mv->col + neighbors[j].col}; + if (is_mv_in(x, &this_mv)) { const uint8_t *check_here = &in_what[this_mv.row * in_what_stride + this_mv.col]; - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, - bestsad); - + unsigned int thissad = fn_ptr->sdf(what, what_stride, check_here, + in_what_stride, bestsad); if (thissad < bestsad) { thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit); @@ -1718,20 +1867,21 @@ int vp9_refining_search_sad_c(const MACROBLOCK *x, } } - this_mv.row = ref_mv->row * 8; - this_mv.col = ref_mv->col * 8; - - if (bestsad < INT_MAX) + if (bestsad < INT_MAX) { + unsigned int unused; + const MV mv = {ref_mv->row * 8, ref_mv->col * 8}; return fn_ptr->vf(what, what_stride, best_address, in_what_stride, - (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit); - else + &unused) + + mv_err_cost(&mv, center_mv, mvjcost, mvcost, x->errorperbit); + } else { return INT_MAX; + } } int vp9_refining_search_sadx4(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, - int search_range, vp9_variance_fn_ptr_t *fn_ptr, + int search_range, + const vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, int *mvcost[2], const MV *center_mv) { const MACROBLOCKD *const xd = &x->e_mbd; @@ -1844,8 +1994,10 @@ int vp9_refining_search_sadx4(const MACROBLOCK *x, // mode. int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, - int search_range, vp9_variance_fn_ptr_t *fn_ptr, - int *mvjcost, int *mvcost[2], const MV *center_mv, + int search_range, + const vp9_variance_fn_ptr_t *fn_ptr, + int *mvjcost, int *mvcost[2], + const MV *center_mv, const uint8_t *second_pred, int w, int h) { const MACROBLOCKD *const xd = &x->e_mbd; const MV neighbors[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0}, @@ -1878,10 +2030,7 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, this_mv.row = ref_mv->row + neighbors[j].row; this_mv.col = ref_mv->col + neighbors[j].col; - if ((this_mv.col > x->mv_col_min) && - (this_mv.col < x->mv_col_max) && - (this_mv.row > x->mv_row_min) && - (this_mv.row < x->mv_row_max)) { + if (is_mv_in(x, &this_mv)) { const uint8_t *check_here = &in_what[this_mv.row * in_what_stride + this_mv.col]; diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index 28b46b503..586a74c9c 100644 --- a/vp9/encoder/vp9_mcomp.h +++ b/vp9/encoder/vp9_mcomp.h @@ -45,8 +45,8 @@ int vp9_init_search_range(struct VP9_COMP *cpi, int size); int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x, MV *mvp_full, int step_param, int sadpb, int further_steps, int do_refine, - vp9_variance_fn_ptr_t *fn_ptr, - const MV *ref_mv, int_mv *dst_mv); + const vp9_variance_fn_ptr_t *fn_ptr, + const MV *ref_mv, MV *dst_mv); int vp9_hex_search(const MACROBLOCK *x, MV *ref_mv, @@ -75,6 +75,14 @@ int vp9_square_search(const MACROBLOCK *x, int use_mvcost, const MV *center_mv, MV *best_mv); +int vp9_fast_hex_search(const MACROBLOCK *x, + MV *ref_mv, + int search_param, + int sad_per_bit, + const vp9_variance_fn_ptr_t *vfp, + int use_mvcost, + const MV *center_mv, + MV *best_mv); typedef int (fractional_mv_step_fp) ( const MACROBLOCK *x, @@ -107,15 +115,16 @@ typedef int (fractional_mv_step_comp_fp) ( extern fractional_mv_step_comp_fp vp9_find_best_sub_pixel_comp_tree; typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x, - MV *ref_mv, int sad_per_bit, - int 
distance, vp9_variance_fn_ptr_t *fn_ptr, + const MV *ref_mv, int sad_per_bit, + int distance, + const vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, int *mvcost[2], - const MV *center_mv, int n); + const MV *center_mv, MV *best_mv); typedef int (*vp9_refining_search_fn_t)(const MACROBLOCK *x, MV *ref_mv, int sad_per_bit, int distance, - vp9_variance_fn_ptr_t *fn_ptr, + const vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, int *mvcost[2], const MV *center_mv); @@ -123,13 +132,14 @@ typedef int (*vp9_diamond_search_fn_t)(const MACROBLOCK *x, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, - vp9_variance_fn_ptr_t *fn_ptr, + const vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, int *mvcost[2], const MV *center_mv); int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, - int search_range, vp9_variance_fn_ptr_t *fn_ptr, + int search_range, + const vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, int *mvcost[2], const MV *center_mv, const uint8_t *second_pred, int w, int h); diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c index a9b0718c8..95ebb0c6d 100644 --- a/vp9/encoder/vp9_onyx_if.c +++ b/vp9/encoder/vp9_onyx_if.c @@ -14,6 +14,8 @@ #include "./vpx_config.h" #include "./vpx_scale_rtcd.h" +#include "vpx/internal/vpx_psnr.h" +#include "vpx_ports/vpx_timer.h" #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_filter.h" @@ -30,7 +32,6 @@ #include "vp9/encoder/vp9_mbgraph.h" #include "vp9/encoder/vp9_onyx_int.h" #include "vp9/encoder/vp9_picklpf.h" -#include "vp9/encoder/vp9_psnr.h" #include "vp9/encoder/vp9_ratectrl.h" #include "vp9/encoder/vp9_rdopt.h" #include "vp9/encoder/vp9_segmentation.h" @@ -38,8 +39,6 @@ #include "vp9/encoder/vp9_vaq.h" #include "vp9/encoder/vp9_resize.h" -#include "vpx_ports/vpx_timer.h" - void vp9_entropy_mode_init(); void vp9_coef_tree_initialize(); @@ -93,19 +92,10 @@ FILE *kf_list; FILE *keyfile; #endif -#ifdef SPEEDSTATS -unsigned int frames_at_speed[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0}; -#endif - -#if defined(SECTIONBITS_OUTPUT) -extern unsigned __int64 Sectionbits[500]; -#endif - -extern void vp9_init_quantizer(VP9_COMP *cpi); +void vp9_init_quantizer(VP9_COMP *cpi); static const double in_frame_q_adj_ratio[MAX_SEGMENTS] = - {1.0, 1.5, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + {1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; static INLINE void Scale2Ratio(int mode, int *hr, int *hs) { switch (mode) { @@ -163,20 +153,22 @@ void vp9_initialize_enc() { } static void dealloc_compressor_data(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + // Delete sementation map vpx_free(cpi->segmentation_map); - cpi->segmentation_map = 0; - vpx_free(cpi->common.last_frame_seg_map); - cpi->common.last_frame_seg_map = 0; + cpi->segmentation_map = NULL; + vpx_free(cm->last_frame_seg_map); + cm->last_frame_seg_map = NULL; vpx_free(cpi->coding_context.last_frame_seg_map_copy); - cpi->coding_context.last_frame_seg_map_copy = 0; + cpi->coding_context.last_frame_seg_map_copy = NULL; vpx_free(cpi->complexity_map); cpi->complexity_map = 0; vpx_free(cpi->active_map); cpi->active_map = 0; - vp9_free_frame_buffers(&cpi->common); + vp9_free_frame_buffers(cm); vp9_free_frame_buffer(&cpi->last_frame_uf); vp9_free_frame_buffer(&cpi->scaled_source); @@ -203,19 +195,20 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { // to a target value // target q value int vp9_compute_qdelta(const VP9_COMP *cpi, double qstart, double qtarget) { + const RATE_CONTROL *const rc = &cpi->rc; + int start_index = rc->worst_quality; 
+ int target_index = rc->worst_quality; int i; - int start_index = cpi->rc.worst_quality; - int target_index = cpi->rc.worst_quality; // Convert the average q value to an index. - for (i = cpi->rc.best_quality; i < cpi->rc.worst_quality; i++) { + for (i = rc->best_quality; i < rc->worst_quality; ++i) { start_index = i; if (vp9_convert_qindex_to_q(i) >= qstart) break; } // Convert the q target to an index - for (i = cpi->rc.best_quality; i < cpi->rc.worst_quality; i++) { + for (i = rc->best_quality; i < rc->worst_quality; ++i) { target_index = i; if (vp9_convert_qindex_to_q(i) >= qtarget) break; @@ -227,28 +220,23 @@ int vp9_compute_qdelta(const VP9_COMP *cpi, double qstart, double qtarget) { // Computes a q delta (in "q index" terms) to get from a starting q value // to a value that should equate to thegiven rate ratio. -int vp9_compute_qdelta_by_rate(VP9_COMP *cpi, - double base_q_index, double rate_target_ratio) { +static int compute_qdelta_by_rate(VP9_COMP *cpi, int base_q_index, + double rate_target_ratio) { int i; - int base_bits_per_mb; - int target_bits_per_mb; int target_index = cpi->rc.worst_quality; - // Make SURE use of floating point in this function is safe. - vp9_clear_system_state(); - // Look up the current projected bits per block for the base index - base_bits_per_mb = vp9_rc_bits_per_mb(cpi->common.frame_type, - base_q_index, 1.0); + const int base_bits_per_mb = vp9_rc_bits_per_mb(cpi->common.frame_type, + base_q_index, 1.0); // Find the target bits per mb based on the base value and given ratio. - target_bits_per_mb = rate_target_ratio * base_bits_per_mb; + const int target_bits_per_mb = (int)(rate_target_ratio * base_bits_per_mb); // Convert the q target to an index - for (i = cpi->rc.best_quality; i < cpi->rc.worst_quality; i++) { + for (i = cpi->rc.best_quality; i < cpi->rc.worst_quality; ++i) { target_index = i; - if (vp9_rc_bits_per_mb(cpi->common.frame_type, - i, 1.0) <= target_bits_per_mb ) + if (vp9_rc_bits_per_mb(cpi->common.frame_type, i, 1.0) <= + target_bits_per_mb ) break; } @@ -258,11 +246,8 @@ int vp9_compute_qdelta_by_rate(VP9_COMP *cpi, // This function sets up a set of segments with delta Q values around // the baseline frame quantizer. static void setup_in_frame_q_adj(VP9_COMP *cpi) { - VP9_COMMON *cm = &cpi->common; - struct segmentation *seg = &cm->seg; - // double q_ratio; - int segment; - int qindex_delta; + VP9_COMMON *const cm = &cpi->common; + struct segmentation *const seg = &cm->seg; // Make SURE use of floating point in this function is safe. 
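compute_qdelta_by_rate() in the hunk above converts a rate ratio into a q-index delta by scanning the index range until the projected bits per MB falls to the target. The sketch below uses a crude exponential stand-in for vp9_rc_bits_per_mb(), so only the shape of the search matches the real code:

    #include <math.h>
    #include <stdio.h>

    /* Crude stand-in for vp9_rc_bits_per_mb(): projected bits/MB falling off
     * with q index. Only the monotonic shape matters to the search. */
    static int bits_per_mb(int qindex) {
      return (int)(4000.0 * exp(-qindex / 40.0));
    }

    /* Same shape as compute_qdelta_by_rate(): first index whose projected rate
     * is at or below rate_target_ratio times the rate at base_q_index. */
    static int qdelta_by_rate(int base_q_index, double rate_target_ratio,
                              int best_quality, int worst_quality) {
      const int target = (int)(rate_target_ratio * bits_per_mb(base_q_index));
      int i, target_index = worst_quality;

      for (i = best_quality; i < worst_quality; ++i) {
        target_index = i;
        if (bits_per_mb(i) <= target)
          break;
      }
      return target_index - base_q_index;
    }

    int main(void) {
      /* Asking for twice the rate moves to a lower q index (negative delta);
       * half the rate moves to a higher one (positive delta). */
      printf("delta for 2.0x: %d\n", qdelta_by_rate(100, 2.0, 0, 255));
      printf("delta for 0.5x: %d\n", qdelta_by_rate(100, 0.5, 0, 255));
      return 0;
    }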
vp9_clear_system_state(); @@ -270,13 +255,14 @@ static void setup_in_frame_q_adj(VP9_COMP *cpi) { if (cm->frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame || (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { + int segment; + // Clear down the segment map vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); // Clear down the complexity map used for rd vpx_memset(cpi->complexity_map, 0, cm->mi_rows * cm->mi_cols); - // Enable segmentation vp9_enable_segmentation((VP9_PTR)cpi); vp9_clearall_segfeatures(seg); @@ -287,9 +273,8 @@ static void setup_in_frame_q_adj(VP9_COMP *cpi) { vp9_disable_segfeature(seg, 0, SEG_LVL_ALT_Q); // Use some of the segments for in frame Q adjustment - for (segment = 1; segment < 3; segment++) { - qindex_delta = - vp9_compute_qdelta_by_rate(cpi, cm->base_qindex, + for (segment = 1; segment < 2; segment++) { + const int qindex_delta = compute_qdelta_by_rate(cpi, cm->base_qindex, in_frame_q_adj_ratio[segment]); vp9_enable_segfeature(seg, segment, SEG_LVL_ALT_Q); vp9_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta); @@ -297,8 +282,8 @@ static void setup_in_frame_q_adj(VP9_COMP *cpi) { } } static void configure_static_seg_features(VP9_COMP *cpi) { - VP9_COMMON *cm = &cpi->common; - struct segmentation *seg = &cm->seg; + VP9_COMMON *const cm = &cpi->common; + struct segmentation *const seg = &cm->seg; int high_q = (int)(cpi->rc.avg_q > 48.0); int qi_delta; @@ -442,13 +427,13 @@ static void print_seg_map(VP9_COMP *cpi) { static void update_reference_segmentation_map(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; + MODE_INFO **mi_8x8_ptr = cm->mi_grid_visible; + uint8_t *cache_ptr = cm->last_frame_seg_map; int row, col; - MODE_INFO **mi_8x8, **mi_8x8_ptr = cm->mi_grid_visible; - uint8_t *cache_ptr = cm->last_frame_seg_map, *cache; for (row = 0; row < cm->mi_rows; row++) { - mi_8x8 = mi_8x8_ptr; - cache = cache_ptr; + MODE_INFO **mi_8x8 = mi_8x8_ptr; + uint8_t *cache = cache_ptr; for (col = 0; col < cm->mi_cols; col++, mi_8x8++, cache++) cache[0] = mi_8x8[0]->mbmi.segment_id; mi_8x8_ptr += cm->mode_info_stride; @@ -581,7 +566,7 @@ static void set_good_speed_feature(VP9_COMMON *cm, int speed) { int i; sf->adaptive_rd_thresh = 1; - sf->recode_loop = (speed < 1); + sf->recode_loop = ((speed < 1) ? 
ALLOW_RECODE : ALLOW_RECODE_KFMAXBW); if (speed == 1) { sf->use_square_partition_only = !frame_is_intra_only(cm); sf->less_rectangular_check = 1; @@ -599,7 +584,7 @@ static void set_good_speed_feature(VP9_COMMON *cm, sf->adaptive_pred_interp_filter = 1; sf->auto_mv_step_size = 1; sf->adaptive_rd_thresh = 2; - sf->recode_loop = 2; + sf->recode_loop = ALLOW_RECODE_KFARFGF; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V; @@ -635,7 +620,7 @@ static void set_good_speed_feature(VP9_COMMON *cm, sf->last_partitioning_redo_frequency = 3; sf->adaptive_rd_thresh = 2; - sf->recode_loop = 2; + sf->recode_loop = ALLOW_RECODE_KFARFGF; sf->use_lp32x32fdct = 1; sf->mode_skip_start = 11; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; @@ -663,6 +648,7 @@ static void set_good_speed_feature(VP9_COMMON *cm, sf->reference_masking = 1; sf->auto_mv_step_size = 1; + sf->disable_split_var_thresh = 32; sf->disable_filter_search_var_thresh = 100; sf->comp_inter_joint_search_thresh = BLOCK_SIZES; @@ -698,6 +684,7 @@ static void set_good_speed_feature(VP9_COMMON *cm, sf->reference_masking = 1; sf->auto_mv_step_size = 1; + sf->disable_split_var_thresh = 64; sf->disable_filter_search_var_thresh = 200; sf->comp_inter_joint_search_thresh = BLOCK_SIZES; @@ -715,9 +702,9 @@ static void set_good_speed_feature(VP9_COMMON *cm, sf->adaptive_rd_thresh = 4; sf->mode_skip_start = 6; } - if (speed == 5) { + if (speed >= 5) { sf->comp_inter_joint_search_thresh = BLOCK_SIZES; - sf->use_one_partition_size_always = 1; + sf->partition_search_type = FIXED_PARTITION; sf->always_this_block_size = BLOCK_16X16; sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL; @@ -752,7 +739,9 @@ static void set_rt_speed_feature(VP9_COMMON *cm, int speed) { sf->static_segmentation = 0; sf->adaptive_rd_thresh = 1; - sf->recode_loop = (speed < 1); + sf->recode_loop = ((speed < 1) ? 
ALLOW_RECODE : ALLOW_RECODE_KFMAXBW); + sf->encode_breakout_thresh = 1; + if (speed == 1) { sf->use_square_partition_only = !frame_is_intra_only(cm); sf->less_rectangular_check = 1; @@ -770,10 +759,11 @@ static void set_rt_speed_feature(VP9_COMMON *cm, sf->adaptive_pred_interp_filter = 1; sf->auto_mv_step_size = 1; sf->adaptive_rd_thresh = 2; - sf->recode_loop = 2; + sf->recode_loop = ALLOW_RECODE_KFARFGF; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V; + sf->encode_breakout_thresh = 8; } if (speed >= 2) { sf->use_square_partition_only = !frame_is_intra_only(cm); @@ -806,13 +796,14 @@ static void set_rt_speed_feature(VP9_COMMON *cm, sf->last_partitioning_redo_frequency = 3; sf->adaptive_rd_thresh = 2; - sf->recode_loop = 2; + sf->recode_loop = ALLOW_RECODE_KFARFGF; sf->use_lp32x32fdct = 1; sf->mode_skip_start = 11; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V; + sf->encode_breakout_thresh = 200; } if (speed >= 3) { sf->use_square_partition_only = 1; @@ -835,25 +826,37 @@ static void set_rt_speed_feature(VP9_COMMON *cm, sf->use_fast_coef_updates = 2; sf->adaptive_rd_thresh = 4; sf->mode_skip_start = 6; + sf->encode_breakout_thresh = 400; } if (speed >= 4) { sf->optimize_coefficients = 0; + sf->disable_split_mask = DISABLE_ALL_SPLIT; + sf->use_fast_lpf_pick = 2; + sf->encode_breakout_thresh = 700; } if (speed >= 5) { int i; - sf->disable_split_mask = DISABLE_ALL_SPLIT; + sf->adaptive_rd_thresh = 5; sf->auto_min_max_partition_size = frame_is_intra_only(cm) ? RELAXED_NEIGHBORING_MIN_MAX : STRICT_NEIGHBORING_MIN_MAX; + sf->adjust_partitioning_from_last_frame = + cm->last_frame_type == KEY_FRAME || (0 == + (cm->current_video_frame + 1) % sf->last_partitioning_redo_frequency); sf->subpel_force_stop = 1; for (i = 0; i < TX_SIZES; i++) { sf->intra_y_mode_mask[i] = INTRA_DC_H_V; sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY; } - sf->use_fast_lpf_pick = 2; - sf->RD = 0; + sf->frame_parameter_update = 0; + sf->encode_breakout_thresh = 1000; + sf->search_method = FAST_HEX; } if (speed >= 6) { - sf->super_fast_rtc = 1; + sf->partition_search_type = VAR_BASED_FIXED_PARTITION; + } + if (speed >= 7) { + sf->partition_search_type = VAR_BASED_FIXED_PARTITION; + sf->use_nonrd_pick_mode = 1; } } @@ -867,13 +870,15 @@ void vp9_set_speed_features(VP9_COMP *cpi) { if (speed < 0) speed = -speed; +#if CONFIG_INTERNAL_STATS for (i = 0; i < MAX_MODES; ++i) cpi->mode_chosen_counts[i] = 0; +#endif // best quality defaults - sf->RD = 1; + sf->frame_parameter_update = 1; sf->search_method = NSTEP; - sf->recode_loop = 1; + sf->recode_loop = ALLOW_RECODE; sf->subpel_search_method = SUBPEL_TREE; sf->subpel_iters_per_step = 2; sf->subpel_force_stop = 0; @@ -889,7 +894,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->adaptive_motion_search = 0; sf->adaptive_pred_interp_filter = 0; sf->reference_masking = 0; - sf->use_one_partition_size_always = 0; + sf->partition_search_type = SEARCH_PARTITION; sf->less_rectangular_check = 0; sf->use_square_partition_only = 0; sf->auto_min_max_partition_size = NOT_IN_USE; @@ -910,9 +915,9 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->use_uv_intra_rd_estimate = 0; sf->use_fast_lpf_pick = 0; sf->use_fast_coef_updates = 0; - sf->using_small_partition_info = 0; sf->mode_skip_start = MAX_MODES; // Mode index at which mode skip mask set - 
sf->super_fast_rtc = 0; + sf->use_nonrd_pick_mode = 0; + sf->encode_breakout_thresh = 0; switch (cpi->oxcf.mode) { case MODE_BESTQUALITY: @@ -941,7 +946,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { // No recode for 1 pass. if (cpi->pass == 0) { - sf->recode_loop = 0; + sf->recode_loop = DISALLOW_RECODE; sf->optimize_coefficients = 0; } @@ -957,25 +962,29 @@ void vp9_set_speed_features(VP9_COMP *cpi) { cpi->mb.optimize = cpi->sf.optimize_coefficients == 1 && cpi->pass != 1; -#ifdef SPEEDSTATS - frames_at_speed[cpi->speed]++; -#endif + if (cpi->encode_breakout && cpi->oxcf.mode == MODE_REALTIME && + sf->encode_breakout_thresh > cpi->encode_breakout) + cpi->encode_breakout = sf->encode_breakout_thresh; + + if (sf->disable_split_mask == DISABLE_ALL_SPLIT) + sf->adaptive_pred_interp_filter = 0; } static void alloc_raw_frame_buffers(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; + const VP9_CONFIG *oxcf = &cpi->oxcf; - cpi->lookahead = vp9_lookahead_init(cpi->oxcf.width, cpi->oxcf.height, + cpi->lookahead = vp9_lookahead_init(oxcf->width, oxcf->height, cm->subsampling_x, cm->subsampling_y, - cpi->oxcf.lag_in_frames); + oxcf->lag_in_frames); if (!cpi->lookahead) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate lag buffers"); if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer, - cpi->oxcf.width, cpi->oxcf.height, + oxcf->width, oxcf->height, cm->subsampling_x, cm->subsampling_y, - VP9_ENC_BORDER_IN_PIXELS)) + VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate altref buffer"); } @@ -1043,14 +1052,14 @@ static void update_frame_size(VP9_COMP *cpi) { if (vp9_realloc_frame_buffer(&cpi->last_frame_uf, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, - VP9_ENC_BORDER_IN_PIXELS)) + VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to reallocate last frame buffer"); if (vp9_realloc_frame_buffer(&cpi->scaled_source, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, - VP9_ENC_BORDER_IN_PIXELS)) + VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to reallocate scaled source buffer"); @@ -1100,15 +1109,13 @@ int vp9_reverse_trans(int x) { void vp9_new_framerate(VP9_COMP *cpi, double framerate) { VP9_COMMON *const cm = &cpi->common; - int64_t vbr_max_bits; + int vbr_max_bits; if (framerate < 0.1) framerate = 30; cpi->oxcf.framerate = framerate; cpi->output_framerate = cpi->oxcf.framerate; - cpi->rc.per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth - / cpi->output_framerate); cpi->rc.av_per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_framerate); cpi->rc.min_frame_bandwidth = (int)(cpi->rc.av_per_frame_bandwidth * @@ -1126,10 +1133,10 @@ void vp9_new_framerate(VP9_COMP *cpi, double framerate) { // be acheived because of a user specificed max q (e.g. when the user // specifies lossless encode. 
// - vbr_max_bits = ((int64_t)cpi->rc.av_per_frame_bandwidth * - (int64_t)cpi->oxcf.two_pass_vbrmax_section) / 100; + vbr_max_bits = (int)(((int64_t)cpi->rc.av_per_frame_bandwidth * + cpi->oxcf.two_pass_vbrmax_section) / 100); cpi->rc.max_frame_bandwidth = - MAX(MAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits); + MAX(MAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits); // Set Maximum gf/arf interval cpi->rc.max_gf_interval = 16; @@ -1150,7 +1157,7 @@ void vp9_new_framerate(VP9_COMP *cpi, double framerate) { cpi->rc.max_gf_interval = cpi->twopass.static_scene_max_gf_interval; } -static int64_t rescale(int val, int64_t num, int denom) { +static int64_t rescale(int64_t val, int64_t num, int denom) { int64_t llnum = num; int64_t llden = denom; int64_t llval = val; @@ -1158,6 +1165,124 @@ static int64_t rescale(int val, int64_t num, int denom) { return (llval * llnum / llden); } +// Initialize layer context data from init_config(). +static void init_layer_context(VP9_COMP *const cpi) { + const VP9_CONFIG *const oxcf = &cpi->oxcf; + int temporal_layer = 0; + cpi->svc.spatial_layer_id = 0; + cpi->svc.temporal_layer_id = 0; + for (temporal_layer = 0; temporal_layer < cpi->svc.number_temporal_layers; + ++temporal_layer) { + LAYER_CONTEXT *const lc = &cpi->svc.layer_context[temporal_layer]; + RATE_CONTROL *const lrc = &lc->rc; + lrc->avg_frame_qindex[INTER_FRAME] = q_trans[oxcf->worst_allowed_q]; + lrc->last_q[INTER_FRAME] = q_trans[oxcf->worst_allowed_q]; + lrc->ni_av_qi = q_trans[oxcf->worst_allowed_q]; + lrc->total_actual_bits = 0; + lrc->total_target_vs_actual = 0; + lrc->ni_tot_qi = 0; + lrc->tot_q = 0.0; + lrc->avg_q = 0.0; + lrc->ni_frames = 0; + lrc->decimation_count = 0; + lrc->decimation_factor = 0; + lrc->rate_correction_factor = 1.0; + lrc->key_frame_rate_correction_factor = 1.0; + lc->target_bandwidth = oxcf->ts_target_bitrate[temporal_layer] * + 1000; + lrc->buffer_level = rescale((int)(oxcf->starting_buffer_level), + lc->target_bandwidth, 1000); + lrc->bits_off_target = lrc->buffer_level; + } +} + +// Update the layer context from a change_config() call. +static void update_layer_context_change_config(VP9_COMP *const cpi, + const int target_bandwidth) { + const VP9_CONFIG *const oxcf = &cpi->oxcf; + const RATE_CONTROL *const rc = &cpi->rc; + int temporal_layer = 0; + float bitrate_alloc = 1.0; + for (temporal_layer = 0; temporal_layer < cpi->svc.number_temporal_layers; + ++temporal_layer) { + LAYER_CONTEXT *const lc = &cpi->svc.layer_context[temporal_layer]; + RATE_CONTROL *const lrc = &lc->rc; + lc->target_bandwidth = oxcf->ts_target_bitrate[temporal_layer] * 1000; + bitrate_alloc = (float)lc->target_bandwidth / (float)target_bandwidth; + // Update buffer-related quantities. + lc->starting_buffer_level = + (int64_t)(oxcf->starting_buffer_level * bitrate_alloc); + lc->optimal_buffer_level = + (int64_t)(oxcf->optimal_buffer_level * bitrate_alloc); + lc->maximum_buffer_size = + (int64_t)(oxcf->maximum_buffer_size * bitrate_alloc); + lrc->bits_off_target = MIN(lrc->bits_off_target, lc->maximum_buffer_size); + lrc->buffer_level = MIN(lrc->buffer_level, lc->maximum_buffer_size); + // Update framerate-related quantities. + lc->framerate = oxcf->framerate / oxcf->ts_rate_decimator[temporal_layer]; + lrc->av_per_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->max_frame_bandwidth = rc->max_frame_bandwidth; + // Update qp-related quantities. 
+ lrc->worst_quality = rc->worst_quality; + lrc->best_quality = rc->best_quality; + } +} + +// Prior to encoding the frame, update framerate-related quantities +// for the current layer. +static void update_layer_framerate(VP9_COMP *const cpi) { + int temporal_layer = cpi->svc.temporal_layer_id; + const VP9_CONFIG *const oxcf = &cpi->oxcf; + LAYER_CONTEXT *const lc = &cpi->svc.layer_context[temporal_layer]; + RATE_CONTROL *const lrc = &lc->rc; + lc->framerate = oxcf->framerate / oxcf->ts_rate_decimator[temporal_layer]; + lrc->av_per_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth; + // Update the average layer frame size (non-cumulative per-frame-bw). + if (temporal_layer == 0) { + lc->avg_frame_size = lrc->av_per_frame_bandwidth; + } else { + double prev_layer_framerate = oxcf->framerate / + oxcf->ts_rate_decimator[temporal_layer - 1]; + int prev_layer_target_bandwidth = + oxcf->ts_target_bitrate[temporal_layer - 1] * 1000; + lc->avg_frame_size = + (int)((lc->target_bandwidth - prev_layer_target_bandwidth) / + (lc->framerate - prev_layer_framerate)); + } +} + +// Prior to encoding the frame, set the layer context, for the current layer +// to be encoded, to the cpi struct. +static void restore_layer_context(VP9_COMP *const cpi) { + int temporal_layer = cpi->svc.temporal_layer_id; + LAYER_CONTEXT *lc = &cpi->svc.layer_context[temporal_layer]; + int frame_since_key = cpi->rc.frames_since_key; + int frame_to_key = cpi->rc.frames_to_key; + cpi->rc = lc->rc; + cpi->oxcf.target_bandwidth = lc->target_bandwidth; + cpi->oxcf.starting_buffer_level = lc->starting_buffer_level; + cpi->oxcf.optimal_buffer_level = lc->optimal_buffer_level; + cpi->oxcf.maximum_buffer_size = lc->maximum_buffer_size; + cpi->output_framerate = lc->framerate; + // Reset the frames_since_key and frames_to_key counters to their values + // before the layer restore. Keep these defined for the stream (not layer). + cpi->rc.frames_since_key = frame_since_key; + cpi->rc.frames_to_key = frame_to_key; +} + +// Save the layer context after encoding the frame. +static void save_layer_context(VP9_COMP *const cpi) { + int temporal_layer = cpi->svc.temporal_layer_id; + LAYER_CONTEXT *lc = &cpi->svc.layer_context[temporal_layer]; + lc->rc = cpi->rc; + lc->target_bandwidth = (int)cpi->oxcf.target_bandwidth; + lc->starting_buffer_level = cpi->oxcf.starting_buffer_level; + lc->optimal_buffer_level = cpi->oxcf.optimal_buffer_level; + lc->maximum_buffer_size = cpi->oxcf.maximum_buffer_size; + lc->framerate = cpi->output_framerate; +} + static void set_tile_limits(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; @@ -1184,12 +1309,20 @@ static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { cm->subsampling_y = 0; vp9_alloc_compressor_data(cpi); + // Spatial scalability. + cpi->svc.number_spatial_layers = oxcf->ss_number_layers; + // Temporal scalability. + cpi->svc.number_temporal_layers = oxcf->ts_number_layers; + + if (cpi->svc.number_temporal_layers > 1 && + cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + init_layer_context(cpi); + } + // change includes all joint functionality vp9_change_config(ptr, oxcf); // Initialize active best and worst q and average q values. 
- cpi->rc.active_worst_quality = cpi->oxcf.worst_allowed_q; - if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { cpi->rc.avg_frame_qindex[0] = cpi->oxcf.worst_allowed_q; cpi->rc.avg_frame_qindex[1] = cpi->oxcf.worst_allowed_q; @@ -1224,9 +1357,6 @@ static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { cpi->gld_fb_idx = 1; cpi->alt_fb_idx = 2; - cpi->current_layer = 0; - cpi->use_svc = 0; - set_tile_limits(cpi); cpi->fixed_divide[0] = 0; @@ -1234,7 +1364,6 @@ static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { cpi->fixed_divide[i] = 0x80000 / i; } - void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { VP9_COMP *cpi = (VP9_COMP *)(ptr); VP9_COMMON *const cm = &cpi->common; @@ -1248,6 +1377,9 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { cpi->oxcf = *oxcf; + if (cpi->oxcf.cpu_used == -6) + cpi->oxcf.play_alternate = 0; + switch (cpi->oxcf.mode) { // Real time and one pass deprecated in test code base case MODE_GOODQUALITY: @@ -1298,6 +1430,7 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { for (i = 0; i < MAX_SEGMENTS; i++) cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout; } + cpi->encode_breakout = cpi->oxcf.encode_breakout; // local file playback mode == really big buffer if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK) { @@ -1326,10 +1459,10 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { cpi->oxcf.target_bandwidth, 1000); // Under a configuration change, where maximum_buffer_size may change, // keep buffer level clipped to the maximum allowed buffer size. - if (cpi->rc.bits_off_target > cpi->oxcf.maximum_buffer_size) { - cpi->rc.bits_off_target = cpi->oxcf.maximum_buffer_size; - cpi->rc.buffer_level = cpi->rc.bits_off_target; - } + cpi->rc.bits_off_target = MIN(cpi->rc.bits_off_target, + cpi->oxcf.maximum_buffer_size); + cpi->rc.buffer_level = MIN(cpi->rc.buffer_level, + cpi->oxcf.maximum_buffer_size); // Set up frame rate and related parameters rate control values. vp9_new_framerate(cpi, cpi->oxcf.framerate); @@ -1339,16 +1472,11 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { cpi->rc.best_quality = cpi->oxcf.best_allowed_q; // active values should only be modified if out of new range - cpi->rc.active_worst_quality = clamp(cpi->rc.active_worst_quality, - cpi->rc.best_quality, - cpi->rc.worst_quality); cpi->cq_target_quality = cpi->oxcf.cq_level; cm->interp_filter = DEFAULT_INTERP_FILTER; - cpi->target_bandwidth = cpi->oxcf.target_bandwidth; - cm->display_width = cpi->oxcf.width; cm->display_height = cpi->oxcf.height; @@ -1359,24 +1487,24 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { if (cpi->initial_width) { // Increasing the size of the frame beyond the first seen frame, or some - // otherwise signalled maximum size, is not supported. + // otherwise signaled maximum size, is not supported. // TODO(jkoleszar): exit gracefully. 
assert(cm->width <= cpi->initial_width); assert(cm->height <= cpi->initial_height); } update_frame_size(cpi); - cpi->speed = cpi->oxcf.cpu_used; + if (cpi->svc.number_temporal_layers > 1 && + cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + update_layer_context_change_config(cpi, (int)cpi->oxcf.target_bandwidth); + } + + cpi->speed = abs(cpi->oxcf.cpu_used); - if (cpi->oxcf.lag_in_frames == 0) { - // force to allowlag to 0 if lag_in_frames is 0; - cpi->oxcf.allow_lag = 0; - } else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS) { - // Limit on lag buffers as these are not currently dynamically allocated + // Limit on lag buffers as these are not currently dynamically allocated. + if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS) cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS; - } - // YX Temp #if CONFIG_MULTIPLE_ARF vp9_zero(cpi->alt_ref_source); #else @@ -1441,6 +1569,7 @@ static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk, int num_pix = num_4x4_blk << 4; int i, k; ctx->num_4x4_blk = num_4x4_blk; + CHECK_MEM_ERROR(cm, ctx->zcoeff_blk, vpx_calloc(num_4x4_blk, sizeof(uint8_t))); for (i = 0; i < MAX_MB_PLANE; ++i) { @@ -1484,7 +1613,6 @@ static void init_pick_mode_context(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; - for (i = 0; i < BLOCK_SIZES; ++i) { const int num_4x4_w = num_4x4_blocks_wide_lookup[i]; const int num_4x4_h = num_4x4_blocks_high_lookup[i]; @@ -1589,6 +1717,8 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { vp9_create_common(cm); + cpi->use_svc = 0; + init_config((VP9_PTR)cpi, oxcf); init_pick_mode_context(cpi); @@ -1604,9 +1734,6 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { cpi->alt_is_last = 0; cpi->gold_is_alt = 0; - // Spatial scalability - cpi->number_spatial_layers = oxcf->ss_number_layers; - // Create the encoder segmentation map and set all entries to 0 CHECK_MEM_ERROR(cm, cpi->segmentation_map, vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); @@ -1632,11 +1759,6 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { sizeof(*cpi->mbgraph_stats[i].mb_stats), 1)); } -#ifdef ENTROPY_STATS - if (cpi->pass != 1) - init_context_counters(); -#endif - /*Initialize the feed-forward activity masking.*/ cpi->activity_avg = 90 << 12; cpi->key_frame_frequency = cpi->oxcf.key_freq; @@ -1741,7 +1863,7 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { cpi->output_pkt_list = oxcf->output_pkt_list; - cpi->enable_encode_breakout = 1; + cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; if (cpi->pass == 1) { vp9_init_first_pass(cpi); @@ -1908,10 +2030,12 @@ void vp9_remove_compressor(VP9_PTR *ptr) { / time_encoded; if (cpi->b_calculate_psnr) { - const double total_psnr = vp9_mse2psnr(cpi->total_samples, 255.0, - cpi->total_sq_error); - const double totalp_psnr = vp9_mse2psnr(cpi->totalp_samples, 255.0, - cpi->totalp_sq_error); + const double total_psnr = + vpx_sse_to_psnr((double)cpi->total_samples, 255.0, + (double)cpi->total_sq_error); + const double totalp_psnr = + vpx_sse_to_psnr((double)cpi->totalp_samples, 255.0, + (double)cpi->totalp_sq_error); const double total_ssim = 100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0); const double totalp_ssim = 100 * pow(cpi->summedp_quality / @@ -1967,21 +2091,6 @@ void vp9_remove_compressor(VP9_PTR *ptr) { } #endif -#if defined(SECTIONBITS_OUTPUT) - - if (0) { - int i; - FILE *f = fopen("tokenbits.stt", "a"); - - for (i = 0; i < 28; i++) - fprintf(f, "%8d", (int)(Sectionbits[i] / 256)); - - fprintf(f, "\n"); - fclose(f); - } - -#endif - #if 0 { 
printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000); @@ -2102,12 +2211,12 @@ static void calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, const int w = widths[i]; const int h = heights[i]; const uint32_t samples = w * h; - const double sse = calc_plane_error(a_planes[i], a_strides[i], - b_planes[i], b_strides[i], - w, h); + const uint64_t sse = calc_plane_error(a_planes[i], a_strides[i], + b_planes[i], b_strides[i], + w, h); psnr->sse[1 + i] = sse; psnr->samples[1 + i] = samples; - psnr->psnr[1 + i] = vp9_mse2psnr(samples, 255.0, sse); + psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, 255.0, (double)sse); total_sse += sse; total_samples += samples; @@ -2115,7 +2224,8 @@ static void calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, psnr->sse[0] = total_sse; psnr->samples[0] = total_samples; - psnr->psnr[0] = vp9_mse2psnr(total_samples, 255.0, total_sse); + psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, 255.0, + (double)total_sse); } static void generate_psnr_packet(VP9_COMP *cpi) { @@ -2448,34 +2558,33 @@ static double compute_edge_pixel_proportion(YV12_BUFFER_CONFIG *frame) { // Function to test for conditions that indicate we should loop // back and recode a frame. -static int recode_loop_test(VP9_COMP *cpi, +static int recode_loop_test(const VP9_COMP *cpi, int high_limit, int low_limit, int q, int maxq, int minq) { + const VP9_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; int force_recode = 0; - VP9_COMMON *cm = &cpi->common; // Special case trap if maximum allowed frame size exceeded. - if (cpi->rc.projected_frame_size > cpi->rc.max_frame_bandwidth) { + if (rc->projected_frame_size > rc->max_frame_bandwidth) { force_recode = 1; // Is frame recode allowed. // Yes if either recode mode 1 is selected or mode 2 is selected // and the frame is a key frame, golden frame or alt_ref_frame - } else if ((cpi->sf.recode_loop == 1) || - ((cpi->sf.recode_loop == 2) && - ((cm->frame_type == KEY_FRAME) || - cpi->refresh_golden_frame || - cpi->refresh_alt_ref_frame))) { + } else if ((cpi->sf.recode_loop == ALLOW_RECODE) || + ((cpi->sf.recode_loop == ALLOW_RECODE_KFARFGF) && + (cm->frame_type == KEY_FRAME || + cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) { // General over and under shoot tests - if (((cpi->rc.projected_frame_size > high_limit) && (q < maxq)) || - ((cpi->rc.projected_frame_size < low_limit) && (q > minq))) { + if ((rc->projected_frame_size > high_limit && q < maxq) || + (rc->projected_frame_size < low_limit && q > minq)) { force_recode = 1; } else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) { // Deal with frame undershoot and whether or not we are // below the automatically set cq level. if (q > cpi->cq_target_quality && - cpi->rc.projected_frame_size < - ((cpi->rc.this_frame_target * 7) >> 3)) { + rc->projected_frame_size < ((rc->this_frame_target * 7) >> 3)) { force_recode = 1; } } @@ -2583,7 +2692,7 @@ static void scale_references(VP9_COMP *cpi) { vp9_realloc_frame_buffer(&cm->frame_bufs[new_fb].buf, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, - VP9_ENC_BORDER_IN_PIXELS); + VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL); scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf); cpi->scaled_ref_idx[ref_frame - 1] = new_fb; } else { @@ -2629,14 +2738,14 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { FILE *const f = fopen("tmp.stt", cm->current_video_frame ? 
"a" : "w"); int recon_err; - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); recon_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm)); if (cpi->twopass.total_left_stats.coded_error != 0.0) fprintf(f, "%10u %10d %10d %10d %10d %10d " "%10"PRId64" %10"PRId64" %10d " - "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf %7.2lf" + "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf" "%6d %6d %5d %5d %5d " "%10"PRId64" %10.3lf" "%10lf %8u %10d %10d %10d\n", @@ -2649,7 +2758,7 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { cpi->rc.total_actual_bits, cm->base_qindex, vp9_convert_qindex_to_q(cm->base_qindex), (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0, - vp9_convert_qindex_to_q(cpi->rc.active_worst_quality), cpi->rc.avg_q, + cpi->rc.avg_q, vp9_convert_qindex_to_q(cpi->rc.ni_av_qi), vp9_convert_qindex_to_q(cpi->cq_target_quality), cpi->refresh_last_frame, cpi->refresh_golden_frame, @@ -2673,8 +2782,6 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { for (i = 0; i < MAX_MODES; ++i) fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]); - for (i = 0; i < MAX_REFS; ++i) - fprintf(fmodes, "%5d ", cpi->sub8x8_mode_chosen_counts[i]); fprintf(fmodes, "\n"); @@ -2683,25 +2790,68 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { } #endif +static void encode_without_recode_loop(VP9_COMP *cpi, + size_t *size, + uint8_t *dest, + int q) { + VP9_COMMON *const cm = &cpi->common; + vp9_clear_system_state(); + vp9_set_quantizer(cpi, q); + + // Set up entropy context depending on frame type. The decoder mandates + // the use of the default context, index 0, for keyframes and inter + // frames where the error_resilient_mode or intra_only flag is set. For + // other inter-frames the encoder currently uses only two contexts; + // context 1 for ALTREF frames and context 0 for the others. + if (cm->frame_type == KEY_FRAME) { + vp9_setup_key_frame(cpi); + } else { + if (!cm->intra_only && !cm->error_resilient_mode && !cpi->use_svc) { + cpi->common.frame_context_idx = cpi->refresh_alt_ref_frame; + } + vp9_setup_inter_frame(cpi); + } + // Variance adaptive and in frame q adjustment experiments are mutually + // exclusive. + if (cpi->oxcf.aq_mode == VARIANCE_AQ) { + vp9_vaq_frame_setup(cpi); + } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { + setup_in_frame_q_adj(cpi); + } + // transform / motion compensation build reconstruction frame + vp9_encode_frame(cpi); + + // Update the skip mb flag probabilities based on the distribution + // seen in the last encoder iteration. + // update_base_skip_probs(cpi); + vp9_clear_system_state(); +} + static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest, - int *q, + int q, int bottom_index, - int top_index, - int frame_over_shoot_limit, - int frame_under_shoot_limit) { + int top_index) { VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; int loop_count = 0; int loop = 0; int overshoot_seen = 0; int undershoot_seen = 0; int q_low = bottom_index, q_high = top_index; + int frame_over_shoot_limit; + int frame_under_shoot_limit; + + // Decide frame size bounds + vp9_rc_compute_frame_size_bounds(cpi, rc->this_frame_target, + &frame_under_shoot_limit, + &frame_over_shoot_limit); do { - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); - vp9_set_quantizer(cpi, *q); + vp9_set_quantizer(cpi, q); if (loop_count == 0) { // Set up entropy context depending on frame type. 
The decoder mandates @@ -2712,7 +2862,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, if (cm->frame_type == KEY_FRAME) { vp9_setup_key_frame(cpi); } else { - if (!cm->intra_only && !cm->error_resilient_mode) { + if (!cm->intra_only && !cm->error_resilient_mode && !cpi->use_svc) { cpi->common.frame_context_idx = cpi->refresh_alt_ref_frame; } vp9_setup_inter_frame(cpi); @@ -2728,25 +2878,24 @@ static void encode_with_recode_loop(VP9_COMP *cpi, } // transform / motion compensation build reconstruction frame - vp9_encode_frame(cpi); // Update the skip mb flag probabilities based on the distribution // seen in the last encoder iteration. // update_base_skip_probs(cpi); - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); // Dummy pack of the bitstream using up to date stats to get an // accurate estimate of output frame size to determine if we need // to recode. - if (cpi->sf.recode_loop != 0) { + if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) { vp9_save_coding_context(cpi); cpi->dummy_packing = 1; - if (!cpi->sf.super_fast_rtc) + if (!cpi->sf.use_nonrd_pick_mode) vp9_pack_bitstream(cpi, dest, size); - cpi->rc.projected_frame_size = (*size) << 3; + rc->projected_frame_size = (int)(*size) << 3; vp9_restore_coding_context(cpi); if (frame_over_shoot_limit == 0) @@ -2757,9 +2906,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, loop = 0; } else { if ((cm->frame_type == KEY_FRAME) && - cpi->rc.this_key_frame_forced && - (cpi->rc.projected_frame_size < cpi->rc.max_frame_bandwidth)) { - int last_q = *q; + rc->this_key_frame_forced && + (rc->projected_frame_size < rc->max_frame_bandwidth)) { + int last_q = q; int kf_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm)); int high_err_target = cpi->ambient_err; @@ -2771,65 +2920,65 @@ static void encode_with_recode_loop(VP9_COMP *cpi, // The key frame is not good enough or we can afford // to make it better without undue risk of popping. if ((kf_err > high_err_target && - cpi->rc.projected_frame_size <= frame_over_shoot_limit) || + rc->projected_frame_size <= frame_over_shoot_limit) || (kf_err > low_err_target && - cpi->rc.projected_frame_size <= frame_under_shoot_limit)) { + rc->projected_frame_size <= frame_under_shoot_limit)) { // Lower q_high - q_high = *q > q_low ? *q - 1 : q_low; + q_high = q > q_low ? q - 1 : q_low; // Adjust Q - *q = ((*q) * high_err_target) / kf_err; - *q = MIN((*q), (q_high + q_low) >> 1); + q = (q * high_err_target) / kf_err; + q = MIN(q, (q_high + q_low) >> 1); } else if (kf_err < low_err_target && - cpi->rc.projected_frame_size >= frame_under_shoot_limit) { + rc->projected_frame_size >= frame_under_shoot_limit) { // The key frame is much better than the previous frame // Raise q_low - q_low = *q < q_high ? *q + 1 : q_high; + q_low = q < q_high ? q + 1 : q_high; // Adjust Q - *q = ((*q) * low_err_target) / kf_err; - *q = MIN((*q), (q_high + q_low + 1) >> 1); + q = (q * low_err_target) / kf_err; + q = MIN(q, (q_high + q_low + 1) >> 1); } // Clamp Q to upper and lower limits: - *q = clamp(*q, q_low, q_high); + q = clamp(q, q_low, q_high); - loop = *q != last_q; + loop = q != last_q; } else if (recode_loop_test( cpi, frame_over_shoot_limit, frame_under_shoot_limit, - *q, MAX(q_high, top_index), bottom_index)) { + q, MAX(q_high, top_index), bottom_index)) { // Is the projected frame size out of range and are we allowed // to attempt to recode. - int last_q = *q; + int last_q = q; int retries = 0; // Frame size out of permitted range: // Update correction factor & compute new Q to try... 
// Frame is too large - if (cpi->rc.projected_frame_size > cpi->rc.this_frame_target) { + if (rc->projected_frame_size > rc->this_frame_target) { // Special case if the projected size is > the max allowed. - if (cpi->rc.projected_frame_size >= cpi->rc.max_frame_bandwidth) - q_high = cpi->rc.worst_quality; + if (rc->projected_frame_size >= rc->max_frame_bandwidth) + q_high = rc->worst_quality; // Raise Qlow as to at least the current value - q_low = *q < q_high ? *q + 1 : q_high; + q_low = q < q_high ? q + 1 : q_high; if (undershoot_seen || loop_count > 1) { // Update rate_correction_factor unless vp9_rc_update_rate_correction_factors(cpi, 1); - *q = (q_high + q_low + 1) / 2; + q = (q_high + q_low + 1) / 2; } else { // Update rate_correction_factor unless vp9_rc_update_rate_correction_factors(cpi, 0); - *q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target, + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, MAX(q_high, top_index)); - while (*q < q_low && retries < 10) { + while (q < q_low && retries < 10) { vp9_rc_update_rate_correction_factors(cpi, 0); - *q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target, + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, MAX(q_high, top_index)); retries++; } @@ -2838,27 +2987,27 @@ static void encode_with_recode_loop(VP9_COMP *cpi, overshoot_seen = 1; } else { // Frame is too small - q_high = *q > q_low ? *q - 1 : q_low; + q_high = q > q_low ? q - 1 : q_low; if (overshoot_seen || loop_count > 1) { vp9_rc_update_rate_correction_factors(cpi, 1); - *q = (q_high + q_low) / 2; + q = (q_high + q_low) / 2; } else { vp9_rc_update_rate_correction_factors(cpi, 0); - *q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target, + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, top_index); // Special case reset for qlow for constrained quality. // This should only trigger where there is very substantial // undershoot on a frame and the auto cq level is above // the user passsed in value. if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY && - *q < q_low) { - q_low = *q; + q < q_low) { + q_low = q; } - while (*q > q_high && retries < 10) { + while (q > q_high && retries < 10) { vp9_rc_update_rate_correction_factors(cpi, 0); - *q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target, + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, top_index); retries++; } @@ -2868,17 +3017,17 @@ static void encode_with_recode_loop(VP9_COMP *cpi, } // Clamp Q to upper and lower limits: - *q = clamp(*q, q_low, q_high); + q = clamp(q, q_low, q_high); - loop = *q != last_q; + loop = q != last_q; } else { loop = 0; } } // Special case for overlay frame. 
- if (cpi->rc.is_src_frame_alt_ref && - (cpi->rc.projected_frame_size < cpi->rc.max_frame_bandwidth)) + if (rc->is_src_frame_alt_ref && + rc->projected_frame_size < rc->max_frame_bandwidth) loop = 0; if (loop) { @@ -2912,6 +3061,9 @@ static void get_ref_frame_flags(VP9_COMP *cpi) { if (cpi->gold_is_last) cpi->ref_frame_flags &= ~VP9_GOLD_FLAG; + if (cpi->rc.frames_till_gf_update_due == INT_MAX) + cpi->ref_frame_flags &= ~VP9_GOLD_FLAG; + if (cpi->alt_is_last) cpi->ref_frame_flags &= ~VP9_ALT_FLAG; @@ -2943,20 +3095,18 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, VP9_COMMON *const cm = &cpi->common; TX_SIZE t; int q; - int frame_over_shoot_limit; - int frame_under_shoot_limit; int top_index; int bottom_index; - SPEED_FEATURES *const sf = &cpi->sf; - unsigned int max_mv_def = MIN(cpi->common.width, cpi->common.height); + const SPEED_FEATURES *const sf = &cpi->sf; + const unsigned int max_mv_def = MIN(cm->width, cm->height); struct segmentation *const seg = &cm->seg; set_ext_overrides(cpi); /* Scale the source buffer, if required. */ - if (cm->mi_cols * 8 != cpi->un_scaled_source->y_width || - cm->mi_rows * 8 != cpi->un_scaled_source->y_height) { + if (cm->mi_cols * MI_SIZE != cpi->un_scaled_source->y_width || + cm->mi_rows * MI_SIZE != cpi->un_scaled_source->y_height) { scale_and_extend_frame_nonnormative(cpi->un_scaled_source, &cpi->scaled_source); cpi->Source = &cpi->scaled_source; @@ -2965,12 +3115,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, } scale_references(cpi); - // Clear down mmx registers to allow floating point in what follows. vp9_clear_system_state(); - // Clear zbin over-quant value and mode boost values. - cpi->zbin_mode_boost = 0; - // Enable or disable mode based tweaking of the zbin. // For 2 pass only used where GF/ARF prediction quality // is above a threshold. @@ -2978,7 +3124,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cpi->zbin_mode_boost_enabled = 0; // Current default encoder behavior for the altref sign bias. - cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = cpi->rc.source_alt_ref_active; + cm->ref_frame_sign_bias[ALTREF_FRAME] = cpi->rc.source_alt_ref_active; // Set default state for segment based loop filter update flags. cm->lf.mode_ref_delta_update = 0; @@ -2987,7 +3133,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cpi->mv_step_param = vp9_init_search_range(cpi, max_mv_def); // Initialize cpi->max_mv_magnitude and cpi->mv_step_param if appropriate. if (sf->auto_mv_step_size) { - if (frame_is_intra_only(&cpi->common)) { + if (frame_is_intra_only(cm)) { // Initialize max_mv_magnitude for use in the first INTER frame // after a key/intra-only frame. cpi->max_mv_magnitude = max_mv_def; @@ -2996,8 +3142,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // Allow mv_steps to correspond to twice the max mv magnitude found // in the previous frame, capped by the default max_mv_magnitude based // on resolution. - cpi->mv_step_param = vp9_init_search_range( - cpi, MIN(max_mv_def, 2 * cpi->max_mv_magnitude)); + cpi->mv_step_param = vp9_init_search_range(cpi, MIN(max_mv_def, 2 * + cpi->max_mv_magnitude)); cpi->max_mv_magnitude = 0; } } @@ -3020,7 +3166,11 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0); cm->frame_parallel_decoding_mode = (cpi->oxcf.frame_parallel_decoding_mode != 0); + + // By default, encoder assumes decoder can use prev_mi. 
+ cm->coding_use_prev_mi = 1; if (cm->error_resilient_mode) { + cm->coding_use_prev_mi = 0; cm->frame_parallel_decoding_mode = 1; cm->reset_frame_context = 0; cm->refresh_frame_context = 0; @@ -3034,21 +3184,17 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // static regions if indicated. // Only allowed in second pass of two pass (as requires lagged coding) // and if the relevant speed feature flag is set. - if ((cpi->pass == 2) && (cpi->sf.static_segmentation)) { + if (cpi->pass == 2 && cpi->sf.static_segmentation) configure_static_seg_features(cpi); - } // For 1 pass CBR, check if we are dropping this frame. // Never drop on key frame. if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER && cm->frame_type != KEY_FRAME) { - if (vp9_drop_frame(cpi)) { - // Update buffer level with zero size, update frame counters, and return. - vp9_update_buffer_level(cpi, 0); - cm->last_frame_type = cm->frame_type; + if (vp9_rc_drop_frame(cpi)) { vp9_rc_postencode_update_drop_frame(cpi); - cm->current_video_frame++; + ++cm->current_video_frame; return; } } @@ -3086,44 +3232,20 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, vp9_write_yuv_frame(cpi->Source); #endif - // Decide how big to make the frame. - vp9_rc_pick_frame_size_target(cpi); - - // Decide frame size bounds - vp9_rc_compute_frame_size_bounds(cpi, cpi->rc.this_frame_target, - &frame_under_shoot_limit, - &frame_over_shoot_limit); - // Decide q and q bounds. - q = vp9_rc_pick_q_and_adjust_q_bounds(cpi, - &bottom_index, - &top_index); - - // JBB : This is realtime mode. In real time mode the first frame - // should be larger. Q of 0 is disabled because we force tx size to be - // 16x16... - if (cpi->sf.super_fast_rtc) { - if (cpi->common.current_video_frame == 0) - q /= 3; - - if (q == 0) - q++; - } + q = vp9_rc_pick_q_and_bounds(cpi, &bottom_index, &top_index); if (!frame_is_intra_only(cm)) { cm->interp_filter = DEFAULT_INTERP_FILTER; /* TODO: Decide this more intelligently */ - set_high_precision_mv(cpi, (q < HIGH_PRECISION_MV_QTHRESH)); + set_high_precision_mv(cpi, q < HIGH_PRECISION_MV_QTHRESH); } - encode_with_recode_loop(cpi, - size, - dest, - &q, - bottom_index, - top_index, - frame_over_shoot_limit, - frame_under_shoot_limit); + if (cpi->sf.recode_loop == DISALLOW_RECODE) { + encode_without_recode_loop(cpi, size, dest, q); + } else { + encode_with_recode_loop(cpi, size, dest, q, bottom_index, top_index); + } // Special case code to reduce pulsing when key frames are forced at a // fixed interval. 
Note the reconstruction error if it is the frame before @@ -3170,41 +3292,30 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, update_reference_frames(cpi); for (t = TX_4X4; t <= TX_32X32; t++) - full_to_model_counts(cpi->common.counts.coef[t], - cpi->coef_counts[t]); - if (!cpi->common.error_resilient_mode && - !cpi->common.frame_parallel_decoding_mode) { - vp9_adapt_coef_probs(&cpi->common); - } - - if (!frame_is_intra_only(&cpi->common)) { - if (!cpi->common.error_resilient_mode && - !cpi->common.frame_parallel_decoding_mode) { - vp9_adapt_mode_probs(&cpi->common); - vp9_adapt_mv_probs(&cpi->common, cpi->common.allow_high_precision_mv); - } - } + full_to_model_counts(cm->counts.coef[t], cpi->coef_counts[t]); -#ifdef ENTROPY_STATS - vp9_update_mode_context_stats(cpi); -#endif + if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) + vp9_adapt_coef_probs(cm); - /* Move storing frame_type out of the above loop since it is also - * needed in motion search besides loopfilter */ - cm->last_frame_type = cm->frame_type; + if (!frame_is_intra_only(cm)) { + if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) { + vp9_adapt_mode_probs(cm); + vp9_adapt_mv_probs(cm, cm->allow_high_precision_mv); + } + } #if 0 output_frame_level_debug_stats(cpi); #endif if (cpi->refresh_golden_frame == 1) - cm->frame_flags = cm->frame_flags | FRAMEFLAGS_GOLDEN; + cm->frame_flags |= FRAMEFLAGS_GOLDEN; else - cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_GOLDEN; + cm->frame_flags &= ~FRAMEFLAGS_GOLDEN; if (cpi->refresh_alt_ref_frame == 1) - cm->frame_flags = cm->frame_flags | FRAMEFLAGS_ALTREF; + cm->frame_flags |= FRAMEFLAGS_ALTREF; else - cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_ALTREF; + cm->frame_flags &= ~FRAMEFLAGS_ALTREF; get_ref_frame_flags(cpi); @@ -3253,6 +3364,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // reset to normal state now that we are done. 
if (!cm->show_existing_frame) cm->last_show_frame = cm->show_frame; + if (cm->show_frame) { // current mip will be the prev_mip for the next frame MODE_INFO *temp = cm->prev_mip; @@ -3273,6 +3385,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // update not a real frame ++cm->current_video_frame; } + // restore prev_mi cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1; cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mode_info_stride + 1; @@ -3280,16 +3393,16 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, static void SvcEncode(VP9_COMP *cpi, size_t *size, uint8_t *dest, unsigned int *frame_flags) { - vp9_get_svc_params(cpi); + vp9_rc_get_svc_params(cpi); encode_frame_to_data_rate(cpi, size, dest, frame_flags); } static void Pass0Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, unsigned int *frame_flags) { if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { - vp9_get_one_pass_cbr_params(cpi); + vp9_rc_get_one_pass_cbr_params(cpi); } else { - vp9_get_one_pass_params(cpi); + vp9_rc_get_one_pass_vbr_params(cpi); } encode_frame_to_data_rate(cpi, size, dest, frame_flags); } @@ -3300,16 +3413,16 @@ static void Pass1Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, (void) dest; (void) frame_flags; - vp9_get_first_pass_params(cpi); + vp9_rc_get_first_pass_params(cpi); vp9_set_quantizer(cpi, find_fp_qindex()); vp9_first_pass(cpi); } static void Pass2Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, unsigned int *frame_flags) { - cpi->enable_encode_breakout = 1; + cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; - vp9_get_second_pass_params(cpi); + vp9_rc_get_second_pass_params(cpi); encode_frame_to_data_rate(cpi, size, dest, frame_flags); vp9_twopass_postencode_update(cpi, *size); @@ -3318,6 +3431,7 @@ static void Pass2Encode(VP9_COMP *cpi, size_t *size, static void check_initial_width(VP9_COMP *cpi, int subsampling_x, int subsampling_y) { VP9_COMMON *const cm = &cpi->common; + if (!cpi->initial_width) { cm->subsampling_x = subsampling_x; cm->subsampling_y = subsampling_y; @@ -3331,12 +3445,12 @@ static void check_initial_width(VP9_COMP *cpi, int subsampling_x, int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time) { - VP9_COMP *cpi = (VP9_COMP *) ptr; - VP9_COMMON *cm = &cpi->common; - struct vpx_usec_timer timer; - int res = 0; - const int subsampling_x = sd->uv_width < sd->y_width; - const int subsampling_y = sd->uv_height < sd->y_height; + VP9_COMP *cpi = (VP9_COMP *)ptr; + VP9_COMMON *cm = &cpi->common; + struct vpx_usec_timer timer; + int res = 0; + const int subsampling_x = sd->uv_width < sd->y_width; + const int subsampling_y = sd->uv_height < sd->y_height; check_initial_width(cpi, subsampling_x, subsampling_y); vpx_usec_timer_start(&timer); @@ -3558,11 +3672,17 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, adjust_frame_rate(cpi); } + if (cpi->svc.number_temporal_layers > 1 && + cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + update_layer_framerate(cpi); + restore_layer_context(cpi); + } + // start with a 0 size frame *size = 0; // Clear down mmx registers - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); /* find a free buffer for the new frame, releasing the reference previously * held. 
@@ -3587,7 +3707,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, vp9_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, - VP9_ENC_BORDER_IN_PIXELS); + VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL); for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)]; @@ -3607,8 +3727,9 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, xd->interp_kernel = vp9_get_interp_kernel( DEFAULT_INTERP_FILTER == SWITCHABLE ? EIGHTTAP : DEFAULT_INTERP_FILTER); - if (cpi->oxcf.aq_mode == VARIANCE_AQ) + if (cpi->oxcf.aq_mode == VARIANCE_AQ) { vp9_vaq_init(); + } if (cpi->use_svc) { SvcEncode(cpi, size, dest, frame_flags); @@ -3633,6 +3754,12 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, cpi->droppable = !frame_is_reference(cpi); } + // Save layer specific state. + if (cpi->svc.number_temporal_layers > 1 && + cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + save_layer_context(cpi); + } + vpx_usec_timer_mark(&cmptimer); cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer); @@ -3642,7 +3769,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, #if CONFIG_INTERNAL_STATS if (cpi->pass != 1) { - cpi->bytes += *size; + cpi->bytes += (int)(*size); if (cm->show_frame) { cpi->count++; @@ -3717,22 +3844,23 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags) { - VP9_COMP *cpi = (VP9_COMP *) comp; + VP9_COMP *cpi = (VP9_COMP *)comp; + VP9_COMMON *cm = &cpi->common; - if (!cpi->common.show_frame) { + if (!cm->show_frame) { return -1; } else { int ret; #if CONFIG_VP9_POSTPROC - ret = vp9_post_proc_frame(&cpi->common, dest, flags); + ret = vp9_post_proc_frame(cm, dest, flags); #else - if (cpi->common.frame_to_show) { - *dest = *cpi->common.frame_to_show; - dest->y_width = cpi->common.width; - dest->y_height = cpi->common.height; - dest->uv_width = cpi->common.width >> cpi->common.subsampling_x; - dest->uv_height = cpi->common.height >> cpi->common.subsampling_y; + if (cm->frame_to_show) { + *dest = *cm->frame_to_show; + dest->y_width = cm->width; + dest->y_height = cm->height; + dest->uv_width = cm->width >> cm->subsampling_x; + dest->uv_height = cm->height >> cm->subsampling_y; ret = 0; } else { ret = -1; @@ -3846,11 +3974,11 @@ int vp9_set_size_literal(VP9_PTR comp, unsigned int width, cm->width = width; if (cm->width * 5 < cpi->initial_width) { cm->width = cpi->initial_width / 5 + 1; - printf("Warning: Desired width too small, changed to %d \n", cm->width); + printf("Warning: Desired width too small, changed to %d\n", cm->width); } if (cm->width > cpi->initial_width) { cm->width = cpi->initial_width; - printf("Warning: Desired width too large, changed to %d \n", cm->width); + printf("Warning: Desired width too large, changed to %d\n", cm->width); } } @@ -3858,11 +3986,11 @@ int vp9_set_size_literal(VP9_PTR comp, unsigned int width, cm->height = height; if (cm->height * 5 < cpi->initial_height) { cm->height = cpi->initial_height / 5 + 1; - printf("Warning: Desired height too small, changed to %d \n", cm->height); + printf("Warning: Desired height too small, changed to %d\n", cm->height); } if (cm->height > cpi->initial_height) { cm->height = cpi->initial_height; - printf("Warning: Desired height too large, changed to %d \n", cm->height); + printf("Warning: Desired height too large, 
changed to %d\n", cm->height); } } diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h index d928312b6..fd2356591 100644 --- a/vp9/encoder/vp9_onyx_int.h +++ b/vp9/encoder/vp9_onyx_int.h @@ -23,7 +23,9 @@ #include "vp9/common/vp9_onyxc_int.h" #include "vp9/encoder/vp9_encodemb.h" +#include "vp9/encoder/vp9_firstpass.h" #include "vp9/encoder/vp9_lookahead.h" +#include "vp9/encoder/vp9_mbgraph.h" #include "vp9/encoder/vp9_mcomp.h" #include "vp9/encoder/vp9_quantize.h" #include "vp9/encoder/vp9_ratectrl.h" @@ -35,17 +37,17 @@ extern "C" { #endif -#define DISABLE_RC_LONG_TERM_MEM 0 // #define MODE_TEST_HIT_STATS -// #define SPEEDSTATS 1 #if CONFIG_MULTIPLE_ARF // Set MIN_GF_INTERVAL to 1 for the full decomposition. #define MIN_GF_INTERVAL 2 #else #define MIN_GF_INTERVAL 4 #endif -#define DEFAULT_GF_INTERVAL 7 +#define DEFAULT_GF_INTERVAL 10 +#define DEFAULT_KF_BOOST 2000 +#define DEFAULT_GF_BOOST 2000 #define KEY_FRAME_CONTEXT 5 @@ -78,42 +80,6 @@ typedef struct { FRAME_CONTEXT fc; } CODING_CONTEXT; -typedef struct { - double frame; - double intra_error; - double coded_error; - double sr_coded_error; - double ssim_weighted_pred_err; - double pcnt_inter; - double pcnt_motion; - double pcnt_second_ref; - double pcnt_neutral; - double MVr; - double mvr_abs; - double MVc; - double mvc_abs; - double MVrv; - double MVcv; - double mv_in_out_count; - double new_mv_count; - double duration; - double count; -} FIRSTPASS_STATS; - -typedef struct { - struct { - int err; - union { - int_mv mv; - MB_PREDICTION_MODE mode; - } m; - } ref[MAX_REF_FRAMES]; -} MBGRAPH_MB_STATS; - -typedef struct { - MBGRAPH_MB_STATS *mb_stats; -} MBGRAPH_FRAME_STATS; - // This enumerator type needs to be kept aligned with the mode order in // const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code. typedef enum { @@ -170,7 +136,8 @@ typedef enum { NSTEP = 1, HEX = 2, BIGDIA = 3, - SQUARE = 4 + SQUARE = 4, + FAST_HEX = 5 } SEARCH_METHODS; typedef enum { @@ -231,18 +198,50 @@ typedef enum { LAST_FRAME_PARTITION_ALL = 2 } LAST_FRAME_PARTITION_METHOD; +typedef enum { + // No recode. + DISALLOW_RECODE = 0, + // Allow recode for KF and exceeding maximum frame bandwidth. + ALLOW_RECODE_KFMAXBW = 1, + // Allow recode only for KF/ARF/GF frames. + ALLOW_RECODE_KFARFGF = 2, + // Allow recode for all frames based on bitrate constraints. + ALLOW_RECODE = 3, +} RECODE_LOOP_TYPE; + +typedef enum { + // encode_breakout is disabled. + ENCODE_BREAKOUT_DISABLED = 0, + // encode_breakout is enabled. + ENCODE_BREAKOUT_ENABLED = 1, + // encode_breakout is enabled with small max_thresh limit. + ENCODE_BREAKOUT_LIMITED = 2 +} ENCODE_BREAKOUT_TYPE; + +typedef enum { + // Search partitions using RD/NONRD criterion + SEARCH_PARTITION = 0, + + // Always use a fixed size partition + FIXED_PARTITION = 1, + + // Use a fixed size partition in every 64X64 SB, where the size is + // determined based on source variance + VAR_BASED_FIXED_PARTITION = 2, + + // Use an arbitrary partitioning scheme based on source variance within + // a 64X64 SB + VAR_BASED_PARTITION +} PARTITION_SEARCH_TYPE; + typedef struct { - // This flag refers to whether or not to perform rd optimization. - int RD; + // Frame level coding parameter update + int frame_parameter_update; // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc). 
SEARCH_METHODS search_method; - // Recode_loop can be: - // 0 means we only encode a frame once - // 1 means we can re-encode based on bitrate constraints on any frame - // 2 means we can only recode gold, alt, and key frames. - int recode_loop; + RECODE_LOOP_TYPE recode_loop; // Subpel_search_method can only be subpel_tree which does a subpixel // logarithmic search that keeps stepping at 1/2 pixel units until @@ -321,16 +320,6 @@ typedef struct { // TODO(JBB): remove this as its no longer used. - // If set partition size will always be always_this_block_size. - int use_one_partition_size_always; - - // Skip rectangular partition test when partition type none gives better - // rd than partition type split. - int less_rectangular_check; - - // Disable testing non square partitions. (eg 16x32) - int use_square_partition_only; - // After looking at the first set of modes (set by index here), skip // checking modes for reference frames that don't match the reference frame // of the best so far. @@ -339,9 +328,18 @@ typedef struct { // TODO(JBB): Remove this. int reference_masking; - // Used in conjunction with use_one_partition_size_always. + PARTITION_SEARCH_TYPE partition_search_type; + + // Used if partition_search_type = FIXED_SIZE_PARTITION BLOCK_SIZE always_this_block_size; + // Skip rectangular partition test when partition type none gives better + // rd than partition type split. + int less_rectangular_check; + + // Disable testing non square partitions. (eg 16x32) + int use_square_partition_only; + // Sets min and max partition sizes for this 64x64 region based on the // same 64x64 in last encoded frame, and the left and above neighbor. AUTO_MIN_MAX_MODE auto_min_max_partition_size; @@ -364,11 +362,6 @@ typedef struct { // inter modes or to enable it always. int disable_split_mask; - // TODO(jbb): Remove this and everything that uses it. It's only valid if - // we were doing small to large partition checks. We currently do the - // reverse. - int using_small_partition_info; - // TODO(jingning): combine the related motion search speed features // This allows us to use motion search at other sizes as a starting // point for this motion search and limits the search range around it. @@ -417,10 +410,24 @@ typedef struct { // by only looking at counts from 1/2 the bands. int use_fast_coef_updates; // 0: 2-loop, 1: 1-loop, 2: 1-loop reduced - // This flag control the use of the new super fast rtc mode - int super_fast_rtc; + // This flag controls the use of non-RD mode decision. + int use_nonrd_pick_mode; + + // This variable sets the encode_breakout threshold. Currently, it is only + // enabled in real time mode. 
+ int encode_breakout_thresh; } SPEED_FEATURES; +typedef struct { + RATE_CONTROL rc; + int target_bandwidth; + int64_t starting_buffer_level; + int64_t optimal_buffer_level; + int64_t maximum_buffer_size; + double framerate; + int avg_frame_size; +} LAYER_CONTEXT; + typedef struct VP9_COMP { DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]); @@ -454,7 +461,7 @@ typedef struct VP9_COMP { YV12_BUFFER_CONFIG *un_scaled_source; YV12_BUFFER_CONFIG scaled_source; - unsigned int key_frame_frequency; + int key_frame_frequency; int gold_is_last; // gold same as last frame ( short circuit gold searches) int alt_is_last; // Alt same as last ( short circuit altref search) @@ -465,9 +472,6 @@ typedef struct VP9_COMP { int gld_fb_idx; int alt_fb_idx; - int current_layer; - int use_svc; - #if CONFIG_MULTIPLE_ARF int alt_ref_fb_idx[REF_FRAMES - 3]; #endif @@ -498,12 +502,6 @@ typedef struct VP9_COMP { // Ambient reconstruction err target for force key frames int ambient_err; - unsigned int mode_chosen_counts[MAX_MODES]; - unsigned int sub8x8_mode_chosen_counts[MAX_REFS]; - int64_t mode_skip_mask; - int ref_frame_mask; - int set_ref_frame_mask; - int rd_threshes[MAX_SEGMENTS][BLOCK_SIZES][MAX_MODES]; int rd_thresh_freq_fact[BLOCK_SIZES][MAX_MODES]; int rd_thresh_sub8x8[MAX_SEGMENTS][BLOCK_SIZES][MAX_REFS]; @@ -543,7 +541,6 @@ typedef struct VP9_COMP { vp9_coeff_probs_model frame_coef_probs[TX_SIZES][PLANE_TYPES]; vp9_coeff_stats frame_branch_ct[TX_SIZES][PLANE_TYPES]; - int64_t target_bandwidth; struct vpx_codec_pkt_list *output_pkt_list; MBGRAPH_FRAME_STATS mbgraph_stats[MAX_LAG_BUFFERS]; @@ -567,6 +564,13 @@ typedef struct VP9_COMP { unsigned int max_mv_magnitude; int mv_step_param; + // Default value is 1. From first pass stats, encode_breakout may be disabled. + ENCODE_BREAKOUT_TYPE allow_encode_breakout; + + // Get threshold from external input. In real time mode, it can be + // overwritten according to encoding speed. + int encode_breakout; + unsigned char *segmentation_map; // segment threashold for encode breakout @@ -588,52 +592,15 @@ typedef struct VP9_COMP { uint64_t time_pick_lpf; uint64_t time_encode_sb_row; - struct twopass_rc { - unsigned int section_intra_rating; - unsigned int next_iiratio; - unsigned int this_iiratio; - FIRSTPASS_STATS total_stats; - FIRSTPASS_STATS this_frame_stats; - FIRSTPASS_STATS *stats_in, *stats_in_end, *stats_in_start; - FIRSTPASS_STATS total_left_stats; - int first_pass_done; - int64_t bits_left; - int64_t clip_bits_total; - double avg_iiratio; - double modified_error_min; - double modified_error_max; - double modified_error_total; - double modified_error_left; - double kf_intra_err_min; - double gf_intra_err_min; - int static_scene_max_gf_interval; - int kf_bits; - // Remaining error from uncoded frames in a gf group. 
Two pass use only - int64_t gf_group_error_left; - - // Projected total bits available for a key frame group of frames - int64_t kf_group_bits; - - // Error score of frames still to be coded in kf group - int64_t kf_group_error_left; - - // Projected Bits available for a group of frames including 1 GF or ARF - int64_t gf_group_bits; - // Bits for the golden frame or ARF - 2 pass only - int gf_bits; - int alt_extra_bits; - - int sr_update_lag; - - int kf_zeromotion_pct; - int gf_zeromotion_pct; - } twopass; + struct twopass_rc twopass; YV12_BUFFER_CONFIG alt_ref_buffer; YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS]; int fixed_divide[512]; #if CONFIG_INTERNAL_STATS + unsigned int mode_chosen_counts[MAX_MODES]; + int count; double total_y; double total_u; @@ -684,9 +651,17 @@ typedef struct VP9_COMP { int initial_width; int initial_height; - int number_spatial_layers; - int enable_encode_breakout; // Default value is 1. From first pass stats, - // encode_breakout may be disabled. + int use_svc; + + struct svc { + int spatial_layer_id; + int temporal_layer_id; + int number_spatial_layers; + int number_temporal_layers; + // Layer context used for rate control in CBR mode, only defined for + // temporal layers for now. + LAYER_CONTEXT layer_context[VPX_TS_MAX_LAYERS]; + } svc; #if CONFIG_MULTIPLE_ARF // ARF tracking variables. @@ -741,8 +716,6 @@ void vp9_encode_frame(VP9_COMP *cpi); void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size); -void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x); - void vp9_set_speed_features(VP9_COMP *cpi); int vp9_calc_ss_err(const YV12_BUFFER_CONFIG *source, diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 1aaa4162b..87f20fa1c 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -98,10 +98,14 @@ static int full_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x, mvp_full.col >>= 3; mvp_full.row >>= 3; - bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, - sadpb, further_steps, 1, - &cpi->fn_ptr[bsize], - &ref_mv.as_mv, tmp_mv); + if (cpi->sf.search_method == FAST_HEX) { + vp9_fast_hex_search(x, &mvp_full, step_param, sadpb, &cpi->fn_ptr[bsize], + 1, &ref_mv.as_mv, &tmp_mv->as_mv); + } else { + vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, sadpb, further_steps, + 1, &cpi->fn_ptr[bsize], &ref_mv.as_mv, + &tmp_mv->as_mv); + } x->mv_col_min = tmp_col_min; x->mv_col_max = tmp_col_max; @@ -130,9 +134,50 @@ static int full_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x, // calculate the bit cost on motion vector *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv.as_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + return bestsme; +} +static void sub_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x, + const TileInfo *const tile, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int_mv *tmp_mv) { + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; + struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}}; + int ref = mbmi->ref_frame[0]; + int_mv ref_mv = mbmi->ref_mvs[ref][0]; + int dis; - return bestsme; + const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, + ref); + if (scaled_ref_frame) { + int i; + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. 
+ for (i = 0; i < MAX_MB_PLANE; i++) + backup_yv12[i] = xd->plane[i].pre[0]; + + setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL); + } + + tmp_mv->as_mv.col >>= 3; + tmp_mv->as_mv.row >>= 3; + + cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv.as_mv, + cpi->common.allow_high_precision_mv, + x->errorperbit, + &cpi->fn_ptr[bsize], + cpi->sf.subpel_force_stop, + cpi->sf.subpel_iters_per_step, + x->nmvjointcost, x->mvcost, + &dis, &x->pred_sse[ref]); + + if (scaled_ref_frame) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) + xd->plane[i].pre[0] = backup_yv12[i]; + } } // TODO(jingning) placeholder for inter-frame non-RD mode decision. @@ -145,16 +190,21 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize) { MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; + struct macroblock_plane *const p = &x->plane[0]; + struct macroblockd_plane *const pd = &xd->plane[0]; const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]); - MB_PREDICTION_MODE this_mode; - MV_REFERENCE_FRAME ref_frame; + MB_PREDICTION_MODE this_mode, best_mode = ZEROMV; + MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME; int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG }; int64_t best_rd = INT64_MAX; int64_t this_rd; - int64_t cost[4]= { 0, 100, 150, 205 }; + static const int cost[4]= { 0, 50, 75, 100 }; + + const int64_t inter_mode_thresh = 300; + const int64_t intra_mode_cost = 50; x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; @@ -164,12 +214,17 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, // initialize mode decisions *returnrate = INT_MAX; + *returndistortion = INT64_MAX; vpx_memset(mbmi, 0, sizeof(MB_MODE_INFO)); mbmi->sb_type = bsize; mbmi->ref_frame[0] = NONE; mbmi->ref_frame[1] = NONE; mbmi->tx_size = MIN(max_txsize_lookup[bsize], tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); + mbmi->interp_filter = cpi->common.interp_filter == SWITCHABLE ? 
+ EIGHTTAP : cpi->common.interp_filter; + mbmi->skip = 0; + mbmi->segment_id = 0; for (ref_frame = LAST_FRAME; ref_frame <= LAST_FRAME ; ++ref_frame) { x->pred_mv_sad[ref_frame] = INT_MAX; @@ -194,12 +249,14 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, clamp_mv2(&frame_mv[NEARESTMV][ref_frame].as_mv, xd); clamp_mv2(&frame_mv[NEARMV][ref_frame].as_mv, xd); + mbmi->ref_frame[0] = ref_frame; + for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { - int rate = cost[this_mode - NEARESTMV]; + int rate = cost[INTER_OFFSET(this_mode)]; int64_t dist; if (this_mode == NEWMV) { - if (this_rd < 300) + if (this_rd < 500) continue; x->mode_sad[ref_frame][INTER_OFFSET(NEWMV)] = @@ -208,34 +265,55 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (frame_mv[NEWMV][ref_frame].as_int == INVALID_MV) continue; + + sub_pixel_motion_search(cpi, x, tile, bsize, mi_row, mi_col, + &frame_mv[NEWMV][ref_frame]); } - dist = x->mode_sad[ref_frame][INTER_OFFSET(this_mode)]; + mbmi->mode = this_mode; + mbmi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); + + dist = cpi->fn_ptr[bsize].sdf(p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride, INT_MAX); this_rd = rate + dist; if (this_rd < best_rd) { best_rd = this_rd; - mbmi->mode = this_mode; - mbmi->ref_frame[0] = ref_frame; - mbmi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int; - xd->mi_8x8[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int; - mbmi->interp_filter = cpi->common.interp_filter == SWITCHABLE ? - EIGHTTAP : cpi->common.interp_filter; - - mbmi->ref_frame[1] = INTRA_FRAME; - mbmi->tx_size = max_txsize_lookup[bsize]; - mbmi->uv_mode = this_mode; - mbmi->skip_coeff = 0; - mbmi->sb_type = bsize; - mbmi->segment_id = 0; + best_mode = this_mode; + best_ref_frame = ref_frame; } } } - // TODO(jingning) sub-pixel motion search, if NEWMV is chosen + mbmi->mode = best_mode; + mbmi->ref_frame[0] = best_ref_frame; + mbmi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int; + xd->mi_8x8[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int; - // TODO(jingning) intra prediction search, if the best SAD is above a certain + // Perform intra prediction search, if the best SAD is above a certain // threshold. + if (best_rd > inter_mode_thresh) { + for (this_mode = DC_PRED; this_mode <= DC_PRED; ++this_mode) { + vp9_predict_intra_block(xd, 0, b_width_log2(bsize), + mbmi->tx_size, this_mode, + &p->src.buf[0], p->src.stride, + &pd->dst.buf[0], pd->dst.stride, 0, 0, 0); + + this_rd = cpi->fn_ptr[bsize].sdf(p->src.buf, + p->src.stride, + pd->dst.buf, + pd->dst.stride, INT_MAX); + + if (this_rd + intra_mode_cost < best_rd) { + best_rd = this_rd; + mbmi->mode = this_mode; + mbmi->ref_frame[0] = INTRA_FRAME; + mbmi->uv_mode = this_mode; + mbmi->mv[0].as_int = INVALID_MV; + } + } + } return INT64_MAX; } diff --git a/vp9/encoder/vp9_psnr.c b/vp9/encoder/vp9_psnr.c deleted file mode 100644 index 58294e15a..000000000 --- a/vp9/encoder/vp9_psnr.c +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include <math.h> - -#include "vpx_scale/yv12config.h" - -#define MAX_PSNR 100 - -double vp9_mse2psnr(double samples, double peak, double mse) { - double psnr; - - if (mse > 0.0) - psnr = 10.0 * log10(peak * peak * samples / mse); - else - psnr = MAX_PSNR; // Limit to prevent / 0 - - if (psnr > MAX_PSNR) - psnr = MAX_PSNR; - - return psnr; -} diff --git a/vp9/encoder/vp9_psnr.h b/vp9/encoder/vp9_psnr.h deleted file mode 100644 index ffe00ed2c..000000000 --- a/vp9/encoder/vp9_psnr.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef VP9_ENCODER_VP9_PSNR_H_ -#define VP9_ENCODER_VP9_PSNR_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -double vp9_mse2psnr(double samples, double peak, double mse); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // VP9_ENCODER_VP9_PSNR_H_ diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index a2eea1cd7..372c36221 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -26,7 +26,7 @@ void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - int i, non_zero_count = count, eob = -1; + int i, non_zero_count = (int)count, eob = -1; const int zbins[2] = { zbin_ptr[0] + zbin_oq_value, zbin_ptr[1] + zbin_oq_value }; const int nzbins[2] = { zbins[0] * -1, @@ -37,7 +37,7 @@ void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count, if (!skip_block) { // Pre-scan pass - for (i = count - 1; i >= 0; i--) { + for (i = (int)count - 1; i >= 0; i--) { const int rc = scan[i]; const int coeff = coeff_ptr[rc]; @@ -79,55 +79,47 @@ void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - int i, rc, eob; - int zbins[2], nzbins[2]; - int x, y, z, sz; + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1), + ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1) }; + const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1}; + int idx = 0; int idx_arr[1024]; + int i, eob = -1; - vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t)); - vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t)); - - eob = -1; - - // Base ZBIN - zbins[0] = ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1); - zbins[1] = ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1); - nzbins[0] = zbins[0] * -1; - nzbins[1] = zbins[1] * -1; + vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(int16_t)); + vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(int16_t)); if (!skip_block) { // Pre-scan pass for (i = 0; i < n_coeffs; i++) { - rc = scan[i]; - z = coeff_ptr[rc]; + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; // If the coefficient is out of the base ZBIN range, keep it for // quantization. - if (z >= zbins[rc != 0] || z <= nzbins[rc != 0]) + if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) idx_arr[idx++] = i; } // Quantization pass: only process the coefficients selected in // pre-scan pass. Note: idx can be zero. 
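For reference, a minimal standalone sketch of the per-coefficient arithmetic in the rewritten vp9_quantize_b_32x32_c() loop that follows: take the absolute value, add half of the rounding term, apply the two-stage multiply/shift, restore the sign, and use half the dequant step. The quant/round/shift/dequant values below are made-up placeholders, not entries from the real quantizer tables.

#include <stdio.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

int main(void) {
  /* One hypothetical transform coefficient and placeholder quantizer
     parameters (not taken from the libvpx tables). */
  const int coeff = -1234;
  const int quant = 20000, quant_shift = 16384, round = 96, dequant = 52;

  const int coeff_sign = coeff >> 31;                 /* 0 or -1 */
  int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  /* |coeff| */
  int tmp, qcoeff, dqcoeff;

  /* 32x32 blocks use half the usual rounding term; the real loop also
     clamps abs_coeff to the int16_t range before quantizing. */
  abs_coeff += ROUND_POWER_OF_TWO(round, 1);
  tmp = ((((abs_coeff * quant) >> 16) + abs_coeff) * quant_shift) >> 15;

  qcoeff = (tmp ^ coeff_sign) - coeff_sign;           /* restore the sign */
  dqcoeff = qcoeff * dequant / 2;                     /* 32x32 uses dequant / 2 */

  printf("qcoeff=%d dqcoeff=%d\n", qcoeff, dqcoeff);
  return 0;
}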
for (i = 0; i < idx; i++) { - rc = scan[idx_arr[i]]; - - z = coeff_ptr[rc]; - sz = (z >> 31); // sign of z - x = (z ^ sz) - sz; // x = abs(z) - - x += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - x = clamp(x, INT16_MIN, INT16_MAX); - y = ((((x * quant_ptr[rc != 0]) >> 16) + x) * - quant_shift_ptr[rc != 0]) >> 15; // quantize (x) - - x = (y ^ sz) - sz; // get the sign back - qcoeff_ptr[rc] = x; // write to destination - dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / 2; // dequantized value - - if (y) - eob = idx_arr[i]; // last nonzero coeffs + const int rc = scan[idx_arr[i]]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + int tmp; + int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); + tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) * + quant_shift_ptr[rc != 0]) >> 15; + + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + + if (tmp) + eob = idx_arr[i]; } } *eob_ptr = eob + 1; @@ -136,8 +128,8 @@ void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block, const int16_t *scan, const int16_t *iscan) { MACROBLOCKD *const xd = &x->e_mbd; - struct macroblock_plane* p = &x->plane[plane]; - struct macroblockd_plane* pd = &xd->plane[plane]; + struct macroblock_plane *p = &x->plane[plane]; + struct macroblockd_plane *pd = &xd->plane[plane]; vp9_quantize_b(BLOCK_OFFSET(p->coeff, block), 16, x->skip_block, @@ -223,38 +215,30 @@ void vp9_init_quantizer(VP9_COMP *cpi) { } void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) { - int i; - VP9_COMMON *const cm = &cpi->common; + const VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; - int zbin_extra; - int segment_id = xd->mi_8x8[0]->mbmi.segment_id; - const int qindex = vp9_get_qindex(&cpi->common.seg, segment_id, - cpi->common.base_qindex); - - int rdmult = vp9_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q); + const int segment_id = xd->mi_8x8[0]->mbmi.segment_id; + const int qindex = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex); + const int rdmult = vp9_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q); + const int zbin = cpi->zbin_mode_boost + x->act_zbin_adj; + int i; // Y - zbin_extra = (cpi->common.y_dequant[qindex][1] * - (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7; - x->plane[0].quant = cpi->y_quant[qindex]; x->plane[0].quant_shift = cpi->y_quant_shift[qindex]; x->plane[0].zbin = cpi->y_zbin[qindex]; x->plane[0].round = cpi->y_round[qindex]; - x->plane[0].zbin_extra = (int16_t)zbin_extra; - x->e_mbd.plane[0].dequant = cpi->common.y_dequant[qindex]; + x->plane[0].zbin_extra = (int16_t)((cm->y_dequant[qindex][1] * zbin) >> 7); + xd->plane[0].dequant = cm->y_dequant[qindex]; // UV - zbin_extra = (cpi->common.uv_dequant[qindex][1] * - (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7; - for (i = 1; i < 3; i++) { x->plane[i].quant = cpi->uv_quant[qindex]; x->plane[i].quant_shift = cpi->uv_quant_shift[qindex]; x->plane[i].zbin = cpi->uv_zbin[qindex]; x->plane[i].round = cpi->uv_round[qindex]; - x->plane[i].zbin_extra = (int16_t)zbin_extra; - x->e_mbd.plane[i].dequant = cpi->common.uv_dequant[qindex]; + x->plane[i].zbin_extra = (int16_t)((cm->uv_dequant[qindex][1] * zbin) >> 7); + xd->plane[i].dequant = cm->uv_dequant[qindex]; } #if CONFIG_ALPHA @@ -263,18 +247,14 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) { 
x->plane[3].zbin = cpi->a_zbin[qindex]; x->plane[3].round = cpi->a_round[qindex]; x->plane[3].zbin_extra = (int16_t)zbin_extra; - x->e_mbd.plane[3].dequant = cpi->common.a_dequant[qindex]; + xd->plane[3].dequant = cm->a_dequant[qindex]; #endif - x->skip_block = vp9_segfeature_active(&cpi->common.seg, segment_id, - SEG_LVL_SKIP); - - /* save this macroblock QIndex for vp9_update_zbin_extra() */ + x->skip_block = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP); x->q_index = qindex; - /* R/D setup */ - cpi->mb.errorperbit = rdmult >> 6; - cpi->mb.errorperbit += (cpi->mb.errorperbit == 0); + x->errorperbit = rdmult >> 6; + x->errorperbit += (x->errorperbit == 0); vp9_initialize_me_consts(cpi, x->q_index); } diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 701557238..89aa82140 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -8,23 +8,24 @@ * be found in the AUTHORS file in the root of the source tree. */ - -#include <stdlib.h> -#include <stdio.h> -#include <string.h> -#include <limits.h> #include <assert.h> +#include <limits.h> #include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "vpx_mem/vpx_mem.h" #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_common.h" -#include "vp9/encoder/vp9_ratectrl.h" #include "vp9/common/vp9_entropymode.h" -#include "vpx_mem/vpx_mem.h" -#include "vp9/common/vp9_systemdependent.h" -#include "vp9/encoder/vp9_encodemv.h" #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_seg_common.h" +#include "vp9/common/vp9_systemdependent.h" + +#include "vp9/encoder/vp9_encodemv.h" +#include "vp9/encoder/vp9_ratectrl.h" #define LIMIT_QRANGE_FOR_ALTREF_AND_KEY 1 @@ -209,51 +210,60 @@ static int estimate_bits_at_q(int frame_kind, int q, int mbs, : (bpm * mbs) >> BPER_MB_NORMBITS; } - -static void calc_iframe_target_size(VP9_COMP *cpi) { - const VP9_CONFIG *oxcf = &cpi->oxcf; - RATE_CONTROL *const rc = &cpi->rc; - int target; - - vp9_clear_system_state(); // __asm emms; - - // For 1-pass. - if (cpi->pass == 0 && oxcf->end_usage == USAGE_STREAM_FROM_SERVER) { - if (cpi->common.current_video_frame == 0) { - target = oxcf->starting_buffer_level / 2; - } else { - // TODO(marpan): Add in adjustment based on Q. - // If this keyframe was forced, use a more recent Q estimate. - // int Q = (cpi->common.frame_flags & FRAMEFLAGS_KEY) ? - // cpi->rc.avg_frame_qindex : cpi->rc.ni_av_qi; - int initial_boost = 32; - // Boost depends somewhat on frame rate. - int kf_boost = MAX(initial_boost, (int)(2 * cpi->output_framerate - 16)); - // Adjustment up based on q: need to fix. - // kf_boost = kf_boost * kfboost_qadjust(Q) / 100; - // Frame separation adjustment (down). - if (rc->frames_since_key < cpi->output_framerate / 2) { - kf_boost = (int)(kf_boost * rc->frames_since_key / - (cpi->output_framerate / 2)); - } - kf_boost = (kf_boost < 16) ? 16 : kf_boost; - target = ((16 + kf_boost) * rc->per_frame_bandwidth) >> 4; - } - rc->active_worst_quality = rc->worst_quality; - } else { - target = rc->per_frame_bandwidth; +int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) { + const RATE_CONTROL *rc = &cpi->rc; + const int min_frame_target = MAX(rc->min_frame_bandwidth, + rc->av_per_frame_bandwidth >> 5); + if (target < min_frame_target) + target = min_frame_target; + if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) { + // If there is an active ARF at this location use the minimum + // bits on this frame even if it is a constructed arf. 
+ // The active maximum quantizer insures that an appropriate + // number of bits will be spent if needed for constructed ARFs. + target = min_frame_target; } + // Clip the frame target to the maximum allowed value. + if (target > rc->max_frame_bandwidth) + target = rc->max_frame_bandwidth; + return target; +} +int vp9_rc_clamp_iframe_target_size(const VP9_COMP *const cpi, int target) { + const RATE_CONTROL *rc = &cpi->rc; + const VP9_CONFIG *oxcf = &cpi->oxcf; if (oxcf->rc_max_intra_bitrate_pct) { - const int max_rate = rc->per_frame_bandwidth * + const int max_rate = rc->av_per_frame_bandwidth * oxcf->rc_max_intra_bitrate_pct / 100; target = MIN(target, max_rate); } - rc->this_frame_target = target; + if (target > rc->max_frame_bandwidth) + target = rc->max_frame_bandwidth; + return target; +} + + +// Update the buffer level for higher layers, given the encoded current layer. +static void update_layer_buffer_level(VP9_COMP *const cpi, + int encoded_frame_size) { + int temporal_layer = 0; + int current_temporal_layer = cpi->svc.temporal_layer_id; + for (temporal_layer = current_temporal_layer + 1; + temporal_layer < cpi->svc.number_temporal_layers; ++temporal_layer) { + LAYER_CONTEXT *lc = &cpi->svc.layer_context[temporal_layer]; + RATE_CONTROL *lrc = &lc->rc; + int bits_off_for_this_layer = (int)(lc->target_bandwidth / lc->framerate - + encoded_frame_size); + lrc->bits_off_target += bits_off_for_this_layer; + + // Clip buffer level to maximum buffer size for the layer. + lrc->bits_off_target = MIN(lrc->bits_off_target, lc->maximum_buffer_size); + lrc->buffer_level = lrc->bits_off_target; + } } // Update the buffer level: leaky bucket model. -void vp9_update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) { +static void update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) { const VP9_COMMON *const cm = &cpi->common; const VP9_CONFIG *oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; @@ -266,14 +276,18 @@ void vp9_update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) { } // Clip the buffer level to the maximum specified buffer size. - rc->buffer_level = MIN(rc->bits_off_target, oxcf->maximum_buffer_size); + rc->bits_off_target = MIN(rc->bits_off_target, oxcf->maximum_buffer_size); + rc->buffer_level = rc->bits_off_target; + + if (cpi->use_svc && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + update_layer_buffer_level(cpi, encoded_frame_size); + } } -int vp9_drop_frame(VP9_COMP *cpi) { +int vp9_rc_drop_frame(VP9_COMP *cpi) { const VP9_CONFIG *oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; - if (!oxcf->drop_frames_water_mark) { return 0; } else { @@ -284,7 +298,7 @@ int vp9_drop_frame(VP9_COMP *cpi) { // If buffer is below drop_mark, for now just drop every other frame // (starting with the next frame) until it increases back over drop_mark. int drop_mark = (int)(oxcf->drop_frames_water_mark * - oxcf->optimal_buffer_level / 100); + oxcf->optimal_buffer_level / 100); if ((rc->buffer_level > drop_mark) && (rc->decimation_factor > 0)) { --rc->decimation_factor; @@ -308,127 +322,12 @@ int vp9_drop_frame(VP9_COMP *cpi) { } } -// Adjust active_worst_quality level based on buffer level. -static int adjust_active_worst_quality_from_buffer_level(const VP9_CONFIG *oxcf, - const RATE_CONTROL *rc) { - // Adjust active_worst_quality: If buffer is above the optimal/target level, - // bring active_worst_quality down depending on fullness over buffer. 
- // If buffer is below the optimal level, let the active_worst_quality go from - // ambient Q (at buffer = optimal level) to worst_quality level - // (at buffer = critical level). - - int active_worst_quality = rc->active_worst_quality; - // Maximum limit for down adjustment, ~20%. - int max_adjustment_down = active_worst_quality / 5; - // Buffer level below which we push active_worst to worst_quality. - int critical_level = oxcf->optimal_buffer_level >> 2; - int adjustment = 0; - int buff_lvl_step = 0; - if (rc->buffer_level > oxcf->optimal_buffer_level) { - // Adjust down. - if (max_adjustment_down) { - buff_lvl_step = (int)((oxcf->maximum_buffer_size - - oxcf->optimal_buffer_level) / max_adjustment_down); - if (buff_lvl_step) - adjustment = (int)((rc->buffer_level - oxcf->optimal_buffer_level) / - buff_lvl_step); - active_worst_quality -= adjustment; - } - } else if (rc->buffer_level > critical_level) { - // Adjust up from ambient Q. - if (critical_level) { - buff_lvl_step = (oxcf->optimal_buffer_level - critical_level); - if (buff_lvl_step) { - adjustment = (rc->worst_quality - rc->avg_frame_qindex[INTER_FRAME]) * - (oxcf->optimal_buffer_level - rc->buffer_level) / - buff_lvl_step; - } - active_worst_quality = rc->avg_frame_qindex[INTER_FRAME] + adjustment; - } - } else { - // Set to worst_quality if buffer is below critical level. - active_worst_quality = rc->worst_quality; - } - return active_worst_quality; -} - -// Adjust target frame size with respect to the buffering constraints: -static int target_size_from_buffer_level(const VP9_CONFIG *oxcf, - const RATE_CONTROL *rc) { - int target = rc->this_frame_target; - const int64_t diff = oxcf->optimal_buffer_level - rc->buffer_level; - const int one_pct_bits = 1 + oxcf->optimal_buffer_level / 100; - - if (diff > 0) { - // Lower the target bandwidth for this frame. - const int pct_low = MIN(diff / one_pct_bits, oxcf->under_shoot_pct); - target -= (target * pct_low) / 200; - } else if (diff < 0) { - // Increase the target bandwidth for this frame. - const int pct_high = MIN(-diff / one_pct_bits, oxcf->over_shoot_pct); - target += (target * pct_high) / 200; - } - - return target; -} - -static void calc_pframe_target_size(VP9_COMP *const cpi) { - RATE_CONTROL *const rc = &cpi->rc; - const VP9_CONFIG *const oxcf = &cpi->oxcf; - int min_frame_target; - rc->this_frame_target = rc->per_frame_bandwidth; - - if (cpi->pass == 0 && oxcf->end_usage == USAGE_STREAM_FROM_SERVER) { - // Need to decide how low min_frame_target should be for 1-pass CBR. - // For now, use: cpi->rc.av_per_frame_bandwidth / 16: - min_frame_target = MAX(rc->av_per_frame_bandwidth >> 4, - FRAME_OVERHEAD_BITS); - rc->this_frame_target = target_size_from_buffer_level(oxcf, rc); - // Adjust qp-max based on buffer level. - rc->active_worst_quality = - adjust_active_worst_quality_from_buffer_level(oxcf, rc); - - if (rc->this_frame_target < min_frame_target) - rc->this_frame_target = min_frame_target; - return; - } - - // Check that the total sum of adjustments is not above the maximum allowed. - // That is, having allowed for the KF and GF penalties, we have not pushed - // the current inter-frame target too low. If the adjustment we apply here is - // not capable of recovering all the extra bits we have spent in the KF or GF, - // then the remainder will have to be recovered over a longer time span via - // other buffer / rate control mechanisms. 
- min_frame_target = MAX(rc->min_frame_bandwidth, - rc->av_per_frame_bandwidth >> 5); - - if (rc->this_frame_target < min_frame_target) - rc->this_frame_target = min_frame_target; - - // Adjust target frame size for Golden Frames: - if (cpi->refresh_golden_frame) { - // If we are using alternate ref instead of gf then do not apply the boost - // It will instead be applied to the altref update - // Jims modified boost - if (!rc->source_alt_ref_active) { - // The spend on the GF is defined in the two pass code - // for two pass encodes - rc->this_frame_target = rc->per_frame_bandwidth; - } else { - // If there is an active ARF at this location use the minimum - // bits on this frame even if it is a constructed arf. - // The active maximum quantizer insures that an appropriate - // number of bits will be spent if needed for constructed ARFs. - rc->this_frame_target = 0; - } - } -} - static double get_rate_correction_factor(const VP9_COMP *cpi) { if (cpi->common.frame_type == KEY_FRAME) { return cpi->rc.key_frame_rate_correction_factor; } else { - if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) + if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) && + !(cpi->use_svc && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)) return cpi->rc.gf_rate_correction_factor; else return cpi->rc.rate_correction_factor; @@ -439,7 +338,8 @@ static void set_rate_correction_factor(VP9_COMP *cpi, double factor) { if (cpi->common.frame_type == KEY_FRAME) { cpi->rc.key_frame_rate_correction_factor = factor; } else { - if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) + if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) && + !(cpi->use_svc && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)) cpi->rc.gf_rate_correction_factor = factor; else cpi->rc.rate_correction_factor = factor; @@ -455,7 +355,7 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) { int projected_size_based_on_q = 0; // Clear down mmx registers to allow floating point in what follows - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); // Work out how big we would have expected the frame to be at this Q given // the current correction factor. @@ -463,7 +363,6 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) { projected_size_based_on_q = estimate_bits_at_q(cpi->common.frame_type, q, cpi->common.MBs, rate_correction_factor); - // Work out a size correction factor. if (projected_size_based_on_q > 0) correction_factor = (100 * cpi->rc.projected_frame_size) / @@ -562,13 +461,206 @@ static int get_active_quality(int q, int gfu_boost, int low, int high, } } -int vp9_rc_pick_q_and_adjust_q_bounds(const VP9_COMP *cpi, - int *bottom_index, int *top_index) { +static int calc_active_worst_quality_one_pass_vbr(const VP9_COMP *cpi) { + int active_worst_quality; + if (cpi->common.frame_type == KEY_FRAME) { + if (cpi->common.current_video_frame == 0) { + active_worst_quality = cpi->rc.worst_quality; + } else { + // Choose active worst quality twice as large as the last q. + active_worst_quality = cpi->rc.last_q[KEY_FRAME] * 2; + } + } else if (!cpi->rc.is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + if (cpi->common.current_video_frame == 1) { + active_worst_quality = cpi->rc.last_q[KEY_FRAME] * 5 / 4; + } else { + // Choose active worst quality twice as large as the last q. 
+ active_worst_quality = cpi->rc.last_q[INTER_FRAME]; + } + } else { + if (cpi->common.current_video_frame == 1) { + active_worst_quality = cpi->rc.last_q[KEY_FRAME] * 2; + } else { + // Choose active worst quality twice as large as the last q. + active_worst_quality = cpi->rc.last_q[INTER_FRAME] * 2; + } + } + if (active_worst_quality > cpi->rc.worst_quality) + active_worst_quality = cpi->rc.worst_quality; + return active_worst_quality; +} + +// Adjust active_worst_quality level based on buffer level. +static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) { + // Adjust active_worst_quality: If buffer is above the optimal/target level, + // bring active_worst_quality down depending on fullness of buffer. + // If buffer is below the optimal level, let the active_worst_quality go from + // ambient Q (at buffer = optimal level) to worst_quality level + // (at buffer = critical level). + const VP9_CONFIG *oxcf = &cpi->oxcf; + const RATE_CONTROL *rc = &cpi->rc; + // Buffer level below which we push active_worst to worst_quality. + int64_t critical_level = oxcf->optimal_buffer_level >> 2; + int64_t buff_lvl_step = 0; + int adjustment = 0; + int active_worst_quality; + if (cpi->common.frame_type == KEY_FRAME) + return rc->worst_quality; + if (cpi->common.current_video_frame > 1) + active_worst_quality = MIN(rc->worst_quality, + rc->avg_frame_qindex[INTER_FRAME] * 5 / 4); + else + active_worst_quality = MIN(rc->worst_quality, + rc->avg_frame_qindex[KEY_FRAME] * 3 / 2); + if (rc->buffer_level > oxcf->optimal_buffer_level) { + // Adjust down. + // Maximum limit for down adjustment, ~30%. + int max_adjustment_down = active_worst_quality / 3; + if (max_adjustment_down) { + buff_lvl_step = ((oxcf->maximum_buffer_size - + oxcf->optimal_buffer_level) / max_adjustment_down); + if (buff_lvl_step) + adjustment = (int)((rc->buffer_level - oxcf->optimal_buffer_level) / + buff_lvl_step); + active_worst_quality -= adjustment; + } + } else if (rc->buffer_level > critical_level) { + // Adjust up from ambient Q. + if (critical_level) { + buff_lvl_step = (oxcf->optimal_buffer_level - critical_level); + if (buff_lvl_step) { + adjustment = + (int)((rc->worst_quality - rc->avg_frame_qindex[INTER_FRAME]) * + (oxcf->optimal_buffer_level - rc->buffer_level) / + buff_lvl_step); + } + active_worst_quality = rc->avg_frame_qindex[INTER_FRAME] + adjustment; + } + } else { + // Set to worst_quality if buffer is below critical level. + active_worst_quality = rc->worst_quality; + } + return active_worst_quality; +} + +static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi, + int *bottom_index, + int *top_index) { + const VP9_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + int active_best_quality; + int active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi); + int q; + + if (frame_is_intra_only(cm)) { + active_best_quality = rc->best_quality; + // Handle the special case for key frames forced when we have reached + // the maximum key frame interval. Here force the Q to a range + // based on the ambient Q to reduce the risk of popping.
+ if (rc->this_key_frame_forced) { + int qindex = rc->last_boosted_qindex; + double last_boosted_q = vp9_convert_qindex_to_q(qindex); + int delta_qindex = vp9_compute_qdelta(cpi, last_boosted_q, + (last_boosted_q * 0.75)); + active_best_quality = MAX(qindex + delta_qindex, rc->best_quality); + } else if (cm->current_video_frame > 0) { + // not first frame of one pass and kf_boost is set + double q_adj_factor = 1.0; + double q_val; + + active_best_quality = get_active_quality(rc->avg_frame_qindex[KEY_FRAME], + rc->kf_boost, + kf_low, kf_high, + kf_low_motion_minq, + kf_high_motion_minq); + + // Allow somewhat lower kf minq with small image formats. + if ((cm->width * cm->height) <= (352 * 288)) { + q_adj_factor -= 0.25; + } + + // Convert the adjustment factor to a qindex delta + // on active_best_quality. + q_val = vp9_convert_qindex_to_q(active_best_quality); + active_best_quality += vp9_compute_qdelta(cpi, q_val, q_val * + q_adj_factor); + } + } else if (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + // Use the lower of active_worst_quality and recent + // average Q as basis for GF/ARF best Q limit unless last frame was + // a key frame. + if (rc->frames_since_key > 1 && + rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) { + q = rc->avg_frame_qindex[INTER_FRAME]; + } else { + q = active_worst_quality; + } + active_best_quality = get_active_quality( + q, rc->gfu_boost, gf_low, gf_high, + gf_low_motion_minq, gf_high_motion_minq); + } else { + // Use the lower of active_worst_quality and recent/average Q. + if (cm->current_video_frame > 1) { + if (rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) + active_best_quality = inter_minq[rc->avg_frame_qindex[INTER_FRAME]]; + else + active_best_quality = inter_minq[active_worst_quality]; + } else { + if (rc->avg_frame_qindex[KEY_FRAME] < active_worst_quality) + active_best_quality = inter_minq[rc->avg_frame_qindex[KEY_FRAME]]; + else + active_best_quality = inter_minq[active_worst_quality]; + } + } + + // Clip the active best and worst quality values to limits + active_best_quality = clamp(active_best_quality, + rc->best_quality, rc->worst_quality); + active_worst_quality = clamp(active_worst_quality, + active_best_quality, rc->worst_quality); + + *top_index = active_worst_quality; + *bottom_index = active_best_quality; + +#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY + // Limit Q range for the adaptive loop. 
+ if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced) { + if (!(cm->current_video_frame == 0)) + *top_index = (active_worst_quality + active_best_quality * 3) / 4; + } +#endif + // Special case code to try and match quality with forced key frames + if (cm->frame_type == KEY_FRAME && rc->this_key_frame_forced) { + q = rc->last_boosted_qindex; + } else { + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, + active_best_quality, active_worst_quality); + if (q > *top_index) { + // Special case when we are targeting the max allowed rate + if (cpi->rc.this_frame_target >= cpi->rc.max_frame_bandwidth) + *top_index = q; + else + q = *top_index; + } + } + assert(*top_index <= rc->worst_quality && + *top_index >= rc->best_quality); + assert(*bottom_index <= rc->worst_quality && + *bottom_index >= rc->best_quality); + assert(q <= rc->worst_quality && q >= rc->best_quality); + return q; +} + +static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, + int *bottom_index, + int *top_index) { const VP9_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; const VP9_CONFIG *const oxcf = &cpi->oxcf; int active_best_quality; - int active_worst_quality = rc->active_worst_quality; + int active_worst_quality = calc_active_worst_quality_one_pass_vbr(cpi); int q; if (frame_is_intra_only(cm)) { @@ -583,13 +675,12 @@ int vp9_rc_pick_q_and_adjust_q_bounds(const VP9_COMP *cpi, int delta_qindex = vp9_compute_qdelta(cpi, last_boosted_q, (last_boosted_q * 0.75)); active_best_quality = MAX(qindex + delta_qindex, rc->best_quality); - } else if (!(cpi->pass == 0 && cm->current_video_frame == 0)) { + } else if (cm->current_video_frame > 0) { // not first frame of one pass and kf_boost is set double q_adj_factor = 1.0; double q_val; - // Baseline value derived from cpi->active_worst_quality and kf boost - active_best_quality = get_active_quality(active_worst_quality, + active_best_quality = get_active_quality(rc->avg_frame_qindex[KEY_FRAME], rc->kf_boost, kf_low, kf_high, kf_low_motion_minq, @@ -600,9 +691,6 @@ int vp9_rc_pick_q_and_adjust_q_bounds(const VP9_COMP *cpi, q_adj_factor -= 0.25; } - // Make a further adjustment based on the kf zero motion measure. - q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct); - // Convert the adjustment factor to a qindex delta // on active_best_quality. q_val = vp9_convert_qindex_to_q(active_best_quality); @@ -618,7 +706,6 @@ int vp9_rc_pick_q_and_adjust_q_bounds(const VP9_COMP *cpi, #endif } else if (!rc->is_src_frame_alt_ref && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { - // Use the lower of active_worst_quality and recent // average Q as basis for GF/ARF best Q limit unless last frame was // a key frame. @@ -626,7 +713,7 @@ int vp9_rc_pick_q_and_adjust_q_bounds(const VP9_COMP *cpi, rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) { q = rc->avg_frame_qindex[INTER_FRAME]; } else { - q = active_worst_quality; + q = rc->avg_frame_qindex[KEY_FRAME]; } // For constrained quality dont allow Q less than the cq level if (oxcf->end_usage == USAGE_CONSTRAINED_QUALITY) { @@ -669,14 +756,11 @@ int vp9_rc_pick_q_and_adjust_q_bounds(const VP9_COMP *cpi, if (oxcf->end_usage == USAGE_CONSTANT_QUALITY) { active_best_quality = cpi->cq_target_quality; } else { - if (cpi->pass == 0 && - rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) - // 1-pass: for now, use the average Q for the active_best, if its lower - // than active_worst. + // Use the lower of active_worst_quality and recent/average Q. 
+ if (cm->current_video_frame > 1) active_best_quality = inter_minq[rc->avg_frame_qindex[INTER_FRAME]]; else - active_best_quality = inter_minq[active_worst_quality]; - + active_best_quality = inter_minq[rc->avg_frame_qindex[KEY_FRAME]]; // For the constrained quality mode we don't want // q to fall below the cq level. if ((oxcf->end_usage == USAGE_CONSTRAINED_QUALITY) && @@ -693,17 +777,192 @@ int vp9_rc_pick_q_and_adjust_q_bounds(const VP9_COMP *cpi, } // Clip the active best and worst quality values to limits - if (active_worst_quality > rc->worst_quality) - active_worst_quality = rc->worst_quality; + active_best_quality = clamp(active_best_quality, + rc->best_quality, rc->worst_quality); + active_worst_quality = clamp(active_worst_quality, + active_best_quality, rc->worst_quality); - if (active_best_quality < rc->best_quality) - active_best_quality = rc->best_quality; + *top_index = active_worst_quality; + *bottom_index = active_best_quality; - if (active_best_quality > rc->worst_quality) - active_best_quality = rc->worst_quality; +#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY + // Limit Q range for the adaptive loop. + if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced) { + if (!(cm->current_video_frame == 0)) + *top_index = (active_worst_quality + active_best_quality * 3) / 4; + } else if (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + *top_index = (active_worst_quality + active_best_quality) / 2; + } +#endif + if (oxcf->end_usage == USAGE_CONSTANT_QUALITY) { + q = active_best_quality; + // Special case code to try and match quality with forced key frames + } else if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced) { + q = rc->last_boosted_qindex; + } else { + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, + active_best_quality, active_worst_quality); + if (q > *top_index) { + // Special case when we are targeting the max allowed rate + if (cpi->rc.this_frame_target >= cpi->rc.max_frame_bandwidth) + *top_index = q; + else + q = *top_index; + } + } +#if CONFIG_MULTIPLE_ARF + // Force the quantizer determined by the coding order pattern. + if (cpi->multi_arf_enabled && (cm->frame_type != KEY_FRAME) && + cpi->oxcf.end_usage != USAGE_CONSTANT_QUALITY) { + double new_q; + double current_q = vp9_convert_qindex_to_q(active_worst_quality); + int level = cpi->this_frame_weight; + assert(level >= 0); + new_q = current_q * (1.0 - (0.2 * (cpi->max_arf_level - level))); + q = active_worst_quality + + vp9_compute_qdelta(cpi, current_q, new_q); + + *bottom_index = q; + *top_index = q; + printf("frame:%d q:%d\n", cm->current_video_frame, q); + } +#endif + assert(*top_index <= rc->worst_quality && + *top_index >= rc->best_quality); + assert(*bottom_index <= rc->worst_quality && + *bottom_index >= rc->best_quality); + assert(q <= rc->worst_quality && q >= rc->best_quality); + return q; +} + +static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, + int *bottom_index, + int *top_index) { + const VP9_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const VP9_CONFIG *const oxcf = &cpi->oxcf; + int active_best_quality; + int active_worst_quality = cpi->twopass.active_worst_quality; + int q; + + if (frame_is_intra_only(cm)) { +#if !CONFIG_MULTIPLE_ARF + // Handle the special case for key frames forced when we have75 reached + // the maximum key frame interval. Here force the Q to a range + // based on the ambient Q to reduce the risk of popping. 
+ if (rc->this_key_frame_forced) { + int qindex = rc->last_boosted_qindex; + double last_boosted_q = vp9_convert_qindex_to_q(qindex); + int delta_qindex = vp9_compute_qdelta(cpi, last_boosted_q, + (last_boosted_q * 0.75)); + active_best_quality = MAX(qindex + delta_qindex, rc->best_quality); + } else { + // Not forced keyframe. + double q_adj_factor = 1.0; + double q_val; + // Baseline value derived from cpi->active_worst_quality and kf boost. + active_best_quality = get_active_quality(active_worst_quality, + rc->kf_boost, + kf_low, kf_high, + kf_low_motion_minq, + kf_high_motion_minq); + + // Allow somewhat lower kf minq with small image formats. + if ((cm->width * cm->height) <= (352 * 288)) { + q_adj_factor -= 0.25; + } + + // Make a further adjustment based on the kf zero motion measure. + q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct); + + // Convert the adjustment factor to a qindex delta + // on active_best_quality. + q_val = vp9_convert_qindex_to_q(active_best_quality); + active_best_quality += vp9_compute_qdelta(cpi, q_val, q_val * + q_adj_factor); + } +#else + double current_q; + // Force the KF quantizer to be 30% of the active_worst_quality. + current_q = vp9_convert_qindex_to_q(active_worst_quality); + active_best_quality = active_worst_quality + + vp9_compute_qdelta(cpi, current_q, current_q * 0.3); +#endif + } else if (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + // Use the lower of active_worst_quality and recent + // average Q as basis for GF/ARF best Q limit unless last frame was + // a key frame. + if (rc->frames_since_key > 1 && + rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) { + q = rc->avg_frame_qindex[INTER_FRAME]; + } else { + q = active_worst_quality; + } + // For constrained quality dont allow Q less than the cq level + if (oxcf->end_usage == USAGE_CONSTRAINED_QUALITY) { + if (q < cpi->cq_target_quality) + q = cpi->cq_target_quality; + if (rc->frames_since_key > 1) { + active_best_quality = get_active_quality(q, rc->gfu_boost, + gf_low, gf_high, + afq_low_motion_minq, + afq_high_motion_minq); + } else { + active_best_quality = get_active_quality(q, rc->gfu_boost, + gf_low, gf_high, + gf_low_motion_minq, + gf_high_motion_minq); + } + // Constrained quality use slightly lower active best. + active_best_quality = active_best_quality * 15 / 16; - if (active_worst_quality < active_best_quality) - active_worst_quality = active_best_quality; + } else if (oxcf->end_usage == USAGE_CONSTANT_QUALITY) { + if (!cpi->refresh_alt_ref_frame) { + active_best_quality = cpi->cq_target_quality; + } else { + if (rc->frames_since_key > 1) { + active_best_quality = get_active_quality( + q, rc->gfu_boost, gf_low, gf_high, + afq_low_motion_minq, afq_high_motion_minq); + } else { + active_best_quality = get_active_quality( + q, rc->gfu_boost, gf_low, gf_high, + gf_low_motion_minq, gf_high_motion_minq); + } + } + } else { + active_best_quality = get_active_quality( + q, rc->gfu_boost, gf_low, gf_high, + gf_low_motion_minq, gf_high_motion_minq); + } + } else { + if (oxcf->end_usage == USAGE_CONSTANT_QUALITY) { + active_best_quality = cpi->cq_target_quality; + } else { + active_best_quality = inter_minq[active_worst_quality]; + + // For the constrained quality mode we don't want + // q to fall below the cq level. 
+ if ((oxcf->end_usage == USAGE_CONSTRAINED_QUALITY) && + (active_best_quality < cpi->cq_target_quality)) { + // If we are strongly undershooting the target rate in the last + // frames then use the user passed in cq value not the auto + // cq value. + if (rc->rolling_actual_bits < rc->min_frame_bandwidth) + active_best_quality = oxcf->cq_level; + else + active_best_quality = cpi->cq_target_quality; + } + } + } + + // Clip the active best and worst quality values to limits. + active_best_quality = clamp(active_best_quality, + rc->best_quality, rc->worst_quality); + active_worst_quality = clamp(active_worst_quality, + active_best_quality, rc->worst_quality); *top_index = active_worst_quality; *bottom_index = active_best_quality; @@ -711,8 +970,7 @@ int vp9_rc_pick_q_and_adjust_q_bounds(const VP9_COMP *cpi, #if LIMIT_QRANGE_FOR_ALTREF_AND_KEY // Limit Q range for the adaptive loop. if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced) { - if (!(cpi->pass == 0 && cm->current_video_frame == 0)) - *top_index = (active_worst_quality + active_best_quality * 3) / 4; + *top_index = (active_worst_quality + active_best_quality * 3) / 4; } else if (!rc->is_src_frame_alt_ref && (oxcf->end_usage != USAGE_STREAM_FROM_SERVER) && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { @@ -722,14 +980,14 @@ int vp9_rc_pick_q_and_adjust_q_bounds(const VP9_COMP *cpi, if (oxcf->end_usage == USAGE_CONSTANT_QUALITY) { q = active_best_quality; - // Special case code to try and match quality with forced key frames + // Special case code to try and match quality with forced key frames. } else if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced) { q = rc->last_boosted_qindex; } else { q = vp9_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality, active_worst_quality); if (q > *top_index) { - // Special case when we are targeting the max allowed rate + // Special case when we are targeting the max allowed rate. if (cpi->rc.this_frame_target >= cpi->rc.max_frame_bandwidth) *top_index = q; else @@ -761,6 +1019,35 @@ int vp9_rc_pick_q_and_adjust_q_bounds(const VP9_COMP *cpi, return q; } +int vp9_rc_pick_q_and_bounds(const VP9_COMP *cpi, + int *bottom_index, + int *top_index) { + int q; + if (cpi->pass == 0) { + if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) + q = rc_pick_q_and_bounds_one_pass_cbr(cpi, bottom_index, top_index); + else + q = rc_pick_q_and_bounds_one_pass_vbr(cpi, bottom_index, top_index); + } else { + q = rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index); + } + + // JBB : This is realtime mode. In real time mode the first frame + // should be larger. Q of 0 is disabled because we force tx size to be + // 16x16... + if (cpi->sf.use_nonrd_pick_mode) { + if (cpi->common.current_video_frame == 0) + q /= 3; + if (q == 0) + q++; + if (q < *bottom_index) + *bottom_index = q; + else if (q > *top_index) + *top_index = q; + } + return q; +} + void vp9_rc_compute_frame_size_bounds(const VP9_COMP *cpi, int this_frame_target, int *frame_under_shoot_limit, @@ -804,24 +1091,14 @@ void vp9_rc_compute_frame_size_bounds(const VP9_COMP *cpi, } } -// return of 0 means drop frame -int vp9_rc_pick_frame_size_target(VP9_COMP *cpi) { +void vp9_rc_set_frame_target(VP9_COMP *cpi, int target) { const VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; - if (cm->frame_type == KEY_FRAME) - calc_iframe_target_size(cpi); - else - calc_pframe_target_size(cpi); - - // Clip the frame target to the maximum allowed value. 
- if (rc->this_frame_target > rc->max_frame_bandwidth) - rc->this_frame_target = rc->max_frame_bandwidth; - + rc->this_frame_target = target; // Target rate per SB64 (including partial SB64s. rc->sb64_target_rate = ((int64_t)rc->this_frame_target * 64 * 64) / (cm->width * cm->height); - return 1; } static void update_alt_ref_frame_stats(VP9_COMP *cpi) { @@ -865,11 +1142,14 @@ static void update_golden_frame_stats(VP9_COMP *cpi) { void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; + + cm->last_frame_type = cm->frame_type; // Update rate control heuristics - rc->projected_frame_size = (bytes_used << 3); + rc->projected_frame_size = (int)(bytes_used << 3); // Post encode loop adjustment of Q prediction. - vp9_rc_update_rate_correction_factors(cpi, (cpi->sf.recode_loop || + vp9_rc_update_rate_correction_factors( + cpi, (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF || cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) ? 2 : 0); // Keep a record of last Q and ambient average Q. @@ -878,7 +1158,8 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { rc->avg_frame_qindex[KEY_FRAME] = ROUND_POWER_OF_TWO( 3 * rc->avg_frame_qindex[KEY_FRAME] + cm->base_qindex, 2); } else if (!rc->is_src_frame_alt_ref && - (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) && + !(cpi->use_svc && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)) { rc->last_q[2] = cm->base_qindex; rc->avg_frame_qindex[2] = ROUND_POWER_OF_TWO( 3 * rc->avg_frame_qindex[2] + cm->base_qindex, 2); @@ -907,7 +1188,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { rc->last_boosted_qindex = cm->base_qindex; } - vp9_update_buffer_level(cpi, rc->projected_frame_size); + update_buffer_level(cpi, rc->projected_frame_size); // Rolling monitors of whether we are over or underspending used to help // regulate min and Max Q in two pass. @@ -929,22 +1210,6 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { rc->total_target_vs_actual += (rc->this_frame_target - rc->projected_frame_size); -#ifndef DISABLE_RC_LONG_TERM_MEM - // Update bits left to the kf and gf groups to account for overshoot or - // undershoot on these frames - if (cm->frame_type == KEY_FRAME) { - cpi->twopass.kf_group_bits += cpi->rc.this_frame_target - - cpi->rc.projected_frame_size; - - cpi->twopass.kf_group_bits = MAX(cpi->twopass.kf_group_bits, 0); - } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) { - cpi->twopass.gf_group_bits += cpi->rc.this_frame_target - - cpi->rc.projected_frame_size; - - cpi->twopass.gf_group_bits = MAX(cpi->twopass.gf_group_bits, 0); - } -#endif - if (cpi->oxcf.play_alternate && cpi->refresh_alt_ref_frame && (cm->frame_type != KEY_FRAME)) // Update the alternate reference frame stats as appropriate. @@ -962,6 +1227,172 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { } void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) { + // Update buffer level with zero size, update frame counters, and return. + update_buffer_level(cpi, 0); + cpi->common.last_frame_type = cpi->common.frame_type; cpi->rc.frames_since_key++; cpi->rc.frames_to_key--; } + +static int test_for_kf_one_pass(VP9_COMP *cpi) { + // Placeholder function for auto key frame + return 0; +} +// Use this macro to turn on/off use of alt-refs in one-pass mode. 
+#define USE_ALTREF_FOR_ONE_PASS 1 + +static int calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { + static const int af_ratio = 10; + const RATE_CONTROL *rc = &cpi->rc; + int target; +#if USE_ALTREF_FOR_ONE_PASS + target = (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) ? + (rc->av_per_frame_bandwidth * cpi->rc.baseline_gf_interval * af_ratio) / + (cpi->rc.baseline_gf_interval + af_ratio - 1) : + (rc->av_per_frame_bandwidth * cpi->rc.baseline_gf_interval) / + (cpi->rc.baseline_gf_interval + af_ratio - 1); +#else + target = rc->av_per_frame_bandwidth; +#endif + return vp9_rc_clamp_pframe_target_size(cpi, target); +} + +static int calc_iframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { + static const int kf_ratio = 25; + const RATE_CONTROL *rc = &cpi->rc; + int target = rc->av_per_frame_bandwidth * kf_ratio; + return vp9_rc_clamp_iframe_target_size(cpi, target); +} + +void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + int target; + if (!cpi->refresh_alt_ref_frame && + (cm->current_video_frame == 0 || + cm->frame_flags & FRAMEFLAGS_KEY || + rc->frames_to_key == 0 || + (cpi->oxcf.auto_key && test_for_kf_one_pass(cpi)))) { + cm->frame_type = KEY_FRAME; + rc->this_key_frame_forced = cm->current_video_frame != 0 && + rc->frames_to_key == 0; + rc->frames_to_key = cpi->key_frame_frequency; + rc->kf_boost = DEFAULT_KF_BOOST; + rc->source_alt_ref_active = 0; + } else { + cm->frame_type = INTER_FRAME; + } + if (rc->frames_till_gf_update_due == 0) { + rc->baseline_gf_interval = DEFAULT_GF_INTERVAL; + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + // NOTE: frames_till_gf_update_due must be <= frames_to_key. + if (rc->frames_till_gf_update_due > rc->frames_to_key) + rc->frames_till_gf_update_due = rc->frames_to_key; + cpi->refresh_golden_frame = 1; + rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS; + rc->gfu_boost = DEFAULT_GF_BOOST; + } + if (cm->frame_type == KEY_FRAME) + target = calc_iframe_target_size_one_pass_vbr(cpi); + else + target = calc_pframe_target_size_one_pass_vbr(cpi); + vp9_rc_set_frame_target(cpi, target); +} + +static int calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { + const VP9_CONFIG *oxcf = &cpi->oxcf; + const RATE_CONTROL *rc = &cpi->rc; + const int64_t diff = oxcf->optimal_buffer_level - rc->buffer_level; + const int64_t one_pct_bits = 1 + oxcf->optimal_buffer_level / 100; + int min_frame_target = MAX(rc->av_per_frame_bandwidth >> 4, + FRAME_OVERHEAD_BITS); + int target = rc->av_per_frame_bandwidth; + if (cpi->svc.number_temporal_layers > 1 && + cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + // Note that for layers, av_per_frame_bandwidth is the cumulative + // per-frame-bandwidth. For the target size of this frame, use the + // layer average frame size (i.e., non-cumulative per-frame-bw). + int current_temporal_layer = cpi->svc.temporal_layer_id; + const LAYER_CONTEXT *lc = &cpi->svc.layer_context[current_temporal_layer]; + target = lc->avg_frame_size; + min_frame_target = MAX(lc->avg_frame_size >> 4, FRAME_OVERHEAD_BITS); + } + if (diff > 0) { + // Lower the target bandwidth for this frame. + const int pct_low = (int)MIN(diff / one_pct_bits, oxcf->under_shoot_pct); + target -= (target * pct_low) / 200; + } else if (diff < 0) { + // Increase the target bandwidth for this frame. 
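A rough self-contained illustration of the buffer-level adjustment performed in calc_pframe_target_size_one_pass_cbr() at this point, with made-up numbers: the target shrinks by half a percent for each percent the buffer sits below its optimal level (capped by under_shoot_pct), and grows symmetrically when the buffer is above it (capped by over_shoot_pct).

#include <stdio.h>
#include <stdint.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Sketch of the CBR per-frame target adjustment: each percent of buffer
   deviation from the optimal level moves the target by 0.5%, bounded by
   the configured under/over shoot percentages. */
static int adjust_target_sketch(int target, int64_t buffer_level,
                                int64_t optimal_level,
                                int under_shoot_pct, int over_shoot_pct) {
  const int64_t diff = optimal_level - buffer_level;
  const int64_t one_pct_bits = 1 + optimal_level / 100;
  if (diff > 0) {
    const int pct_low = (int)MIN(diff / one_pct_bits, under_shoot_pct);
    target -= (target * pct_low) / 200;   /* buffer low: spend fewer bits */
  } else if (diff < 0) {
    const int pct_high = (int)MIN(-diff / one_pct_bits, over_shoot_pct);
    target += (target * pct_high) / 200;  /* buffer high: spend more bits */
  }
  return target;
}

int main(void) {
  /* Example: 20000 bits/frame, buffer 25% below a 600000-bit optimal level. */
  printf("%d\n", adjust_target_sketch(20000, 450000, 600000, 50, 50));
  return 0;
}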
+ const int pct_high = (int)MIN(-diff / one_pct_bits, oxcf->over_shoot_pct); + target += (target * pct_high) / 200; + } + return MAX(min_frame_target, target); +} + +static int calc_iframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { + const RATE_CONTROL *rc = &cpi->rc; + int target; + + if (cpi->common.current_video_frame == 0) { + target = ((cpi->oxcf.starting_buffer_level / 2) > INT_MAX) + ? INT_MAX : (int)(cpi->oxcf.starting_buffer_level / 2); + } else { + const int initial_boost = 32; + int kf_boost = MAX(initial_boost, (int)(2 * cpi->output_framerate - 16)); + if (rc->frames_since_key < cpi->output_framerate / 2) { + kf_boost = (int)(kf_boost * rc->frames_since_key / + (cpi->output_framerate / 2)); + } + target = ((16 + kf_boost) * rc->av_per_frame_bandwidth) >> 4; + } + return vp9_rc_clamp_iframe_target_size(cpi, target); +} + +void vp9_rc_get_svc_params(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + int target = cpi->rc.av_per_frame_bandwidth; + if ((cm->current_video_frame == 0) || + (cm->frame_flags & FRAMEFLAGS_KEY) || + (cpi->oxcf.auto_key && (cpi->rc.frames_since_key % + cpi->key_frame_frequency == 0))) { + cm->frame_type = KEY_FRAME; + cpi->rc.source_alt_ref_active = 0; + if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + target = calc_iframe_target_size_one_pass_cbr(cpi); + } + } else { + cm->frame_type = INTER_FRAME; + if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + target = calc_pframe_target_size_one_pass_cbr(cpi); + } + } + vp9_rc_set_frame_target(cpi, target); + cpi->rc.frames_till_gf_update_due = INT_MAX; + cpi->rc.baseline_gf_interval = INT_MAX; +} + +void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + int target; + if ((cm->current_video_frame == 0 || + cm->frame_flags & FRAMEFLAGS_KEY || + rc->frames_to_key == 0 || + (cpi->oxcf.auto_key && test_for_kf_one_pass(cpi)))) { + cm->frame_type = KEY_FRAME; + rc->this_key_frame_forced = cm->current_video_frame != 0 && + rc->frames_to_key == 0; + rc->frames_to_key = cpi->key_frame_frequency; + rc->kf_boost = DEFAULT_KF_BOOST; + rc->source_alt_ref_active = 0; + target = calc_iframe_target_size_one_pass_cbr(cpi); + } else { + cm->frame_type = INTER_FRAME; + target = calc_pframe_target_size_one_pass_cbr(cpi); + } + vp9_rc_set_frame_target(cpi, target); + // Don't use gf_update by default in CBR mode. 
+ rc->frames_till_gf_update_due = INT_MAX; + rc->baseline_gf_interval = INT_MAX; +} diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index eba4b7a92..5dbc7d138 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -34,19 +34,18 @@ typedef struct { double key_frame_rate_correction_factor; double gf_rate_correction_factor; - unsigned int frames_since_golden; - unsigned int frames_till_gf_update_due; // Count down till next GF - unsigned int max_gf_interval; - unsigned int baseline_gf_interval; - unsigned int frames_to_key; - unsigned int frames_since_key; - unsigned int this_key_frame_forced; - unsigned int next_key_frame_forced; - unsigned int source_alt_ref_pending; - unsigned int source_alt_ref_active; - unsigned int is_src_frame_alt_ref; - - int per_frame_bandwidth; // Current section per frame bandwidth target + int frames_since_golden; + int frames_till_gf_update_due; + int max_gf_interval; + int baseline_gf_interval; + int frames_to_key; + int frames_since_key; + int this_key_frame_forced; + int next_key_frame_forced; + int source_alt_ref_pending; + int source_alt_ref_active; + int is_src_frame_alt_ref; + int av_per_frame_bandwidth; // Average frame size target for clip int min_frame_bandwidth; // Minimum allocation used for any frame int max_frame_bandwidth; // Maximum burst rate allowed for a frame. @@ -58,8 +57,8 @@ typedef struct { double tot_q; double avg_q; - int buffer_level; - int bits_off_target; + int64_t buffer_level; + int64_t bits_off_target; int decimation_factor; int decimation_count; @@ -74,7 +73,6 @@ typedef struct { int total_target_vs_actual; // debug stats int worst_quality; - int active_worst_quality; int best_quality; // int active_best_quality; } RATE_CONTROL; @@ -89,50 +87,79 @@ void vp9_setup_inter_frame(struct VP9_COMP *cpi); double vp9_convert_qindex_to_q(int qindex); -// Updates rate correction factors -void vp9_rc_update_rate_correction_factors(struct VP9_COMP *cpi, int damp_var); - // initialize luts for minq void vp9_rc_init_minq_luts(void); -// return of 0 means drop frame -// Changes only rc.this_frame_target and rc.sb64_rate_target -int vp9_rc_pick_frame_size_target(struct VP9_COMP *cpi); +// Generally at the high level, the following flow is expected +// to be enforced for rate control: +// First call per frame, one of: +// vp9_rc_get_one_pass_vbr_params() +// vp9_rc_get_one_pass_cbr_params() +// vp9_rc_get_svc_params() +// vp9_rc_get_first_pass_params() +// vp9_rc_get_second_pass_params() +// depending on the usage to set the rate control encode parameters desired. +// +// Then, call encode_frame_to_data_rate() to perform the +// actual encode. This function will in turn call encode_frame() +// one or more times, followed by one of: +// vp9_rc_postencode_update() +// vp9_rc_postencode_update_drop_frame() +// +// The majority of rate control parameters are only expected +// to be set in the vp9_rc_get_..._params() functions and +// updated during the vp9_rc_postencode_update...() functions. +// The only exceptions are vp9_rc_drop_frame() and +// vp9_rc_update_rate_correction_factors() functions. + +// Functions to set parameters for encoding before the actual +// encode_frame_to_data_rate() function. 
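To make the call order described in the comment above concrete, a hypothetical per-frame driver for the 1-pass CBR case might look roughly like the sketch below. encode_one_frame() is a made-up stand-in for the encoder's internal encode_frame_to_data_rate() path, and everything else is simplified.

#include "vpx/vpx_integer.h"
#include "vp9/encoder/vp9_onyx_int.h"
#include "vp9/encoder/vp9_ratectrl.h"

/* Stand-in for the internal encode path; returns the compressed frame size
   in bytes. Not a real libvpx function. */
extern uint64_t encode_one_frame(VP9_COMP *cpi);

static void rc_frame_loop_sketch(VP9_COMP *cpi, int num_frames) {
  int i;
  for (i = 0; i < num_frames; ++i) {
    /* 1. Choose frame type and set the rate-control target for this frame. */
    vp9_rc_get_one_pass_cbr_params(cpi);

    /* 2. In CBR mode the frame may be dropped to protect the buffer. */
    if (vp9_rc_drop_frame(cpi)) {
      vp9_rc_postencode_update_drop_frame(cpi);
      continue;
    }

    /* 3. Encode, then feed the actual size back into rate control. */
    vp9_rc_postencode_update(cpi, encode_one_frame(cpi));
  }
}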
+void vp9_rc_get_one_pass_vbr_params(struct VP9_COMP *cpi); +void vp9_rc_get_one_pass_cbr_params(struct VP9_COMP *cpi); +void vp9_rc_get_svc_params(struct VP9_COMP *cpi); + +// Post encode update of the rate control parameters based +// on bytes used +void vp9_rc_postencode_update(struct VP9_COMP *cpi, + uint64_t bytes_used); +// Post encode update of the rate control parameters for dropped frames +void vp9_rc_postencode_update_drop_frame(struct VP9_COMP *cpi); + +// Updates rate correction factors +// Changes only the rate correction factors in the rate control structure. +void vp9_rc_update_rate_correction_factors(struct VP9_COMP *cpi, int damp_var); + +// Decide if we should drop this frame: For 1-pass CBR. +// Changes only the decimation count in the rate control structure +int vp9_rc_drop_frame(struct VP9_COMP *cpi); +// Computes frame size bounds. void vp9_rc_compute_frame_size_bounds(const struct VP9_COMP *cpi, int this_frame_target, int *frame_under_shoot_limit, int *frame_over_shoot_limit); // Picks q and q bounds given the target for bits -int vp9_rc_pick_q_and_adjust_q_bounds(const struct VP9_COMP *cpi, - int *bottom_index, - int *top_index); +int vp9_rc_pick_q_and_bounds(const struct VP9_COMP *cpi, + int *bottom_index, + int *top_index); // Estimates q to achieve a target bits per frame int vp9_rc_regulate_q(const struct VP9_COMP *cpi, int target_bits_per_frame, int active_best_quality, int active_worst_quality); -// Post encode update of the rate control parameters based -// on bytes used -void vp9_rc_postencode_update(struct VP9_COMP *cpi, - uint64_t bytes_used); -// for dropped frames -void vp9_rc_postencode_update_drop_frame(struct VP9_COMP *cpi); - -// estimates bits per mb for a given qindex and correction factor +// Estimates bits per mb for a given qindex and correction factor. int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex, double correction_factor); -// Post encode update of the rate control parameters for 2-pass -void vp9_twopass_postencode_update(struct VP9_COMP *cpi, - uint64_t bytes_used); - -// Decide if we should drop this frame: For 1-pass CBR. -int vp9_drop_frame(struct VP9_COMP *cpi); - -// Update the buffer level. -void vp9_update_buffer_level(struct VP9_COMP *cpi, int encoded_frame_size); +// Clamping utilities for bitrate targets for iframes and pframes. +int vp9_rc_clamp_iframe_target_size(const struct VP9_COMP *const cpi, + int target); +int vp9_rc_clamp_pframe_target_size(const struct VP9_COMP *const cpi, + int target); +// Utility to set frame_target into the RATE_CONTROL structure +// This function is called only from the vp9_rc_get_..._params() functions. +void vp9_rc_set_frame_target(struct VP9_COMP *cpi, int target); #ifdef __cplusplus } // extern "C" diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index e5230feb4..f7577e174 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -68,7 +68,7 @@ struct rdcost_block_args { int64_t this_rd; int64_t best_rd; int skip; - const int16_t *scan, *nb; + const scan_order *so; }; const MODE_DEFINITION vp9_mode_order[MAX_MODES] = { @@ -274,7 +274,7 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) { MACROBLOCK *x = &cpi->mb; int qindex, i; - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); // Further tests required to see if optimum is different // for key frames, golden frames and arf frames. 
@@ -285,7 +285,8 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) { cpi->RDDIV = RDDIV_BITS; // in bits (to multiply D by 128) cpi->RDMULT = vp9_compute_rd_mult(cpi, qindex); - x->errorperbit = cpi->RDMULT / RD_MULT_EPB_RATIO + (x->errorperbit == 0); + x->errorperbit = cpi->RDMULT / RD_MULT_EPB_RATIO; + x->errorperbit += (x->errorperbit == 0); vp9_set_speed_features(cpi); @@ -294,21 +295,22 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) { set_block_thresholds(cpi); - fill_token_costs(x->token_costs, cm->fc.coef_probs); + if (!cpi->sf.use_nonrd_pick_mode) { + fill_token_costs(x->token_costs, cm->fc.coef_probs); - if (!cpi->sf.super_fast_rtc) { for (i = 0; i < PARTITION_CONTEXTS; i++) vp9_cost_tokens(x->partition_cost[i], get_partition_probs(cm, i), vp9_partition_tree); + } + if (!cpi->sf.use_nonrd_pick_mode || (cm->current_video_frame & 0x07) == 1) { fill_mode_costs(cpi); if (!frame_is_intra_only(cm)) { vp9_build_nmv_cost_table(x->nmvjointcost, cm->allow_high_precision_mv ? x->nmvcost_hp : x->nmvcost, - &cm->fc.nmvc, - cm->allow_high_precision_mv, 1, 1); + &cm->fc.nmvc, cm->allow_high_precision_mv); for (i = 0; i < INTER_MODE_CONTEXTS; ++i) vp9_cost_tokens((int *)x->inter_mode_cost[i], @@ -414,9 +416,10 @@ static void model_rd_from_var_lapndz(unsigned int var, unsigned int n, *dist = 0; } else { int d_q10, r_q10; - uint64_t xsq_q10_64 = + const uint64_t xsq_q10_64 = ((((uint64_t)qstep * qstep * n) << 10) + (var >> 1)) / var; - int xsq_q10 = xsq_q10_64 > MAX_XSQ_Q10 ? MAX_XSQ_Q10 : xsq_q10_64; + const int xsq_q10 = xsq_q10_64 > MAX_XSQ_Q10 ? + MAX_XSQ_Q10 : (int)xsq_q10_64; model_rd_norm(xsq_q10, &r_q10, &d_q10); *rate = (n * r_q10 + 2) >> 2; *dist = (var * (int64_t)d_q10 + 512) >> 10; @@ -429,7 +432,9 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. - int i, rate_sum = 0, dist_sum = 0; + int i; + int64_t rate_sum = 0; + int64_t dist_sum = 0; int ref = xd->mi_8x8[0]->mbmi.ref_frame[0]; unsigned int sse; @@ -443,20 +448,33 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, if (i == 0) x->pred_sse[ref] = sse; - if (cpi->sf.super_fast_rtc) { - dist_sum += (int)sse; + + // Fast approximate the modelling function. 
+ if (cpi->speed > 4) { + int64_t rate; + int64_t dist; + int64_t square_error = sse; + int quantizer = (pd->dequant[1] >> 3); + + if (quantizer < 120) + rate = (square_error * (280 - quantizer)) >> 8; + else + rate = 0; + dist = (square_error * quantizer) >> 8; + rate_sum += rate; + dist_sum += dist; } else { int rate; int64_t dist; model_rd_from_var_lapndz(sse, 1 << num_pels_log2_lookup[bs], pd->dequant[1] >> 3, &rate, &dist); rate_sum += rate; - dist_sum += (int)dist; + dist_sum += dist; } } - *out_rate_sum = rate_sum; - *out_dist_sum = (int64_t)dist_sum << 4; + *out_rate_sum = (int)rate_sum; + *out_dist_sum = dist_sum << 4; } static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE bsize, @@ -546,18 +564,16 @@ static INLINE int cost_coeffs(MACROBLOCK *x, const PLANE_TYPE type = pd->plane_type; const int16_t *band_count = &band_counts[tx_size][1]; const int eob = p->eobs[block]; - const int16_t *const qcoeff_ptr = BLOCK_OFFSET(p->qcoeff, block); - const int ref = mbmi->ref_frame[0] != INTRA_FRAME; + const int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = - x->token_costs[tx_size][type][ref]; - const ENTROPY_CONTEXT above_ec = !!*A, left_ec = !!*L; + x->token_costs[tx_size][type][is_inter_block(mbmi)]; uint8_t *p_tok = x->token_cache; - int pt = combine_entropy_contexts(above_ec, left_ec); + int pt = combine_entropy_contexts(*A, *L); int c, cost; // Check for consistency of tx_size with mode info assert(type == PLANE_TYPE_Y ? mbmi->tx_size == tx_size - : get_uv_tx_size(mbmi) == tx_size); + : get_uv_tx_size(mbmi) == tx_size); if (eob == 0) { // single eob token @@ -567,7 +583,7 @@ static INLINE int cost_coeffs(MACROBLOCK *x, int band_left = *band_count++; // dc token - int v = qcoeff_ptr[0]; + int v = qcoeff[0]; int prev_t = vp9_dct_value_tokens_ptr[v].token; cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v]; p_tok[0] = vp9_pt_energy_class[prev_t]; @@ -578,7 +594,7 @@ static INLINE int cost_coeffs(MACROBLOCK *x, const int rc = scan[c]; int t; - v = qcoeff_ptr[rc]; + v = qcoeff[rc]; t = vp9_dct_value_tokens_ptr[v].token; pt = get_coef_context(nb, p_tok, c); cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v]; @@ -634,7 +650,7 @@ static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize, args->rate = cost_coeffs(args->x, plane, block, args->t_above + x_idx, args->t_left + y_idx, tx_size, - args->scan, args->nb); + args->so->scan, args->so->neighbors); } static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, @@ -643,17 +659,15 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; - struct encode_b_args encode_args = {x, NULL, &mbmi->skip_coeff}; - int64_t rd1, rd2, rd; if (args->skip) return; - if (!is_inter_block(&xd->mi_8x8[0]->mbmi)) - vp9_encode_block_intra(plane, block, plane_bsize, tx_size, &encode_args); + if (!is_inter_block(mbmi)) + vp9_encode_block_intra(x, plane, block, plane_bsize, tx_size, &mbmi->skip); else - vp9_xform_quant(plane, block, plane_bsize, tx_size, &encode_args); + vp9_xform_quant(x, plane, block, plane_bsize, tx_size); dist_block(plane, block, tx_size, args); rate_block(plane, block, plane_bsize, tx_size, args); @@ -677,10 +691,16 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, } } -void vp9_get_entropy_contexts(TX_SIZE tx_size, - ENTROPY_CONTEXT t_above[16], ENTROPY_CONTEXT 
t_left[16], - const ENTROPY_CONTEXT *above, const ENTROPY_CONTEXT *left, - int num_4x4_w, int num_4x4_h) { +void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, + const struct macroblockd_plane *pd, + ENTROPY_CONTEXT t_above[16], + ENTROPY_CONTEXT t_left[16]) { + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); + const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const ENTROPY_CONTEXT *const above = pd->above_context; + const ENTROPY_CONTEXT *const left = pd->left_context; + int i; switch (tx_size) { case TX_4X4: @@ -710,49 +730,35 @@ void vp9_get_entropy_contexts(TX_SIZE tx_size, } } -static void init_rdcost_stack(MACROBLOCK *x, const int64_t ref_rdcost, - struct rdcost_block_args *arg) { - vpx_memset(arg, 0, sizeof(struct rdcost_block_args)); - arg->x = x; - arg->best_rd = ref_rdcost; -} - static void txfm_rd_in_plane(MACROBLOCK *x, int *rate, int64_t *distortion, int *skippable, int64_t *sse, int64_t ref_best_rd, int plane, BLOCK_SIZE bsize, TX_SIZE tx_size) { - struct rdcost_block_args rd_stack; MACROBLOCKD *const xd = &x->e_mbd; struct macroblockd_plane *const pd = &xd->plane[plane]; - const BLOCK_SIZE bs = get_plane_block_size(bsize, pd); - const int num_4x4_w = num_4x4_blocks_wide_lookup[bs]; - const int num_4x4_h = num_4x4_blocks_high_lookup[bs]; - const scan_order *so; + struct rdcost_block_args args = { 0 }; + args.x = x; + args.best_rd = ref_best_rd; - init_rdcost_stack(x, ref_best_rd, &rd_stack); if (plane == 0) xd->mi_8x8[0]->mbmi.tx_size = tx_size; - vp9_get_entropy_contexts(tx_size, rd_stack.t_above, rd_stack.t_left, - pd->above_context, pd->left_context, - num_4x4_w, num_4x4_h); + vp9_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left); - so = get_scan(xd, tx_size, pd->plane_type, 0); - rd_stack.scan = so->scan; - rd_stack.nb = so->neighbors; + args.so = get_scan(xd, tx_size, pd->plane_type, 0); vp9_foreach_transformed_block_in_plane(xd, bsize, plane, - block_rd_txfm, &rd_stack); - if (rd_stack.skip) { + block_rd_txfm, &args); + if (args.skip) { *rate = INT_MAX; *distortion = INT64_MAX; *sse = INT64_MAX; *skippable = 0; } else { - *distortion = rd_stack.this_dist; - *rate = rd_stack.this_rate; - *sse = rd_stack.this_sse; + *distortion = args.this_dist; + *rate = args.this_rate; + *sse = args.this_sse; *skippable = vp9_is_skippable_in_plane(x, bsize, plane); } } @@ -787,7 +793,10 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; vp9_prob skip_prob = vp9_get_skip_prob(cm, xd); - int64_t rd[TX_SIZES][2]; + int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX}, + {INT64_MAX, INT64_MAX}, + {INT64_MAX, INT64_MAX}, + {INT64_MAX, INT64_MAX}}; int n, m; int s0, s1; const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode]; @@ -851,6 +860,11 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, } } +static int64_t scaled_rd_cost(int rdmult, int rddiv, + int rate, int64_t dist, double scale) { + return (int64_t) (RDCOST(rdmult, rddiv, rate, dist) * scale); +} + static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, int (*r)[2], int *rate, int64_t *d, int64_t *distortion, @@ -862,7 +876,10 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; vp9_prob skip_prob = vp9_get_skip_prob(cm, xd); - int64_t rd[TX_SIZES][2]; + 
int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX}, + {INT64_MAX, INT64_MAX}, + {INT64_MAX, INT64_MAX}, + {INT64_MAX, INT64_MAX}}; int n, m; int s0, s1; double scale_rd[TX_SIZES] = {1.73, 1.44, 1.20, 1.00}; @@ -885,10 +902,13 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, r[n][1] += vp9_cost_one(tx_probs[m]); } if (s[n]) { - rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]) * scale; + rd[n][0] = rd[n][1] = scaled_rd_cost(x->rdmult, x->rddiv, s1, d[n], + scale); } else { - rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]) * scale; - rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]) * scale; + rd[n][0] = scaled_rd_cost(x->rdmult, x->rddiv, r[n][0] + s0, d[n], + scale); + rd[n][1] = scaled_rd_cost(x->rdmult, x->rddiv, r[n][1] + s0, d[n], + scale); } if (rd[n][1] < best_rd) { best_rd = rd[n][1]; @@ -915,27 +935,23 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, } } -static void super_block_yrd(VP9_COMP *cpi, - MACROBLOCK *x, int *rate, int64_t *distortion, - int *skip, int64_t *psse, BLOCK_SIZE bs, - int64_t txfm_cache[TX_MODES], - int64_t ref_best_rd) { +static void inter_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, + int64_t *distortion, int *skip, + int64_t *psse, BLOCK_SIZE bs, + int64_t txfm_cache[TX_MODES], + int64_t ref_best_rd) { int r[TX_SIZES][2], s[TX_SIZES]; int64_t d[TX_SIZES], sse[TX_SIZES]; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; - const int b_inter_mode = is_inter_block(mbmi); const TX_SIZE max_tx_size = max_txsize_lookup[bs]; TX_SIZE tx_size; - assert(bs == mbmi->sb_type); - if (b_inter_mode) - vp9_subtract_sby(x, bs); - if (cpi->sf.tx_size_search_method == USE_LARGESTALL || - (cpi->sf.tx_size_search_method != USE_FULL_RD && - !b_inter_mode)) { + vp9_subtract_plane(x, bs, 0); + + if (cpi->sf.tx_size_search_method == USE_LARGESTALL) { vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t)); choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse, ref_best_rd, bs); @@ -944,8 +960,7 @@ static void super_block_yrd(VP9_COMP *cpi, return; } - if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER && - b_inter_mode) { + if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER) { for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) model_rd_for_sb_y_tx(cpi, bs, tx_size, x, xd, &r[tx_size][0], &d[tx_size], &s[tx_size]); @@ -963,6 +978,36 @@ static void super_block_yrd(VP9_COMP *cpi, *psse = sse[mbmi->tx_size]; } +static void intra_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, + int64_t *distortion, int *skip, + int64_t *psse, BLOCK_SIZE bs, + int64_t txfm_cache[TX_MODES], + int64_t ref_best_rd) { + int64_t sse[TX_SIZES]; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; + + assert(bs == mbmi->sb_type); + if (cpi->sf.tx_size_search_method != USE_FULL_RD) { + vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t)); + choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse, + ref_best_rd, bs); + } else { + int r[TX_SIZES][2], s[TX_SIZES]; + int64_t d[TX_SIZES]; + TX_SIZE tx_size; + for (tx_size = TX_4X4; tx_size <= max_txsize_lookup[bs]; ++tx_size) + txfm_rd_in_plane(x, &r[tx_size][0], &d[tx_size], + &s[tx_size], &sse[tx_size], + ref_best_rd, 0, bs, tx_size); + choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, + skip, txfm_cache, bs); + } + if (psse) + *psse = sse[mbmi->tx_size]; +} + + static int conditional_skipintra(MB_PREDICTION_MODE mode, MB_PREDICTION_MODE best_intra_mode) 
{ if (mode == D117_PRED && @@ -1064,7 +1109,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, so = &vp9_scan_orders[TX_4X4][tx_type]; if (tx_type != DCT_DCT) - vp9_short_fht4x4(src_diff, coeff, 8, tx_type); + vp9_fht4x4(src_diff, coeff, 8, tx_type); else x->fwd_txm4x4(src_diff, coeff, 8); @@ -1223,8 +1268,8 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, } mic->mbmi.mode = mode; - super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL, - bsize, local_tx_cache, best_rd); + intra_super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, + &s, NULL, bsize, local_tx_cache, best_rd); if (this_rate_tokenonly == INT_MAX) continue; @@ -1259,7 +1304,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, return best_rd; } -static void super_block_uvrd(VP9_COMP *const cpi, MACROBLOCK *x, +static void super_block_uvrd(MACROBLOCK *x, int *rate, int64_t *distortion, int *skippable, int64_t *sse, BLOCK_SIZE bsize, int64_t ref_best_rd) { @@ -1273,8 +1318,11 @@ static void super_block_uvrd(VP9_COMP *const cpi, MACROBLOCK *x, if (ref_best_rd < 0) goto term; - if (is_inter_block(mbmi)) - vp9_subtract_sbuv(x, bsize); + if (is_inter_block(mbmi)) { + int plane; + for (plane = 1; plane < MAX_MB_PLANE; ++plane) + vp9_subtract_plane(x, bsize, plane); + } *rate = 0; *distortion = 0; @@ -1306,6 +1354,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, BLOCK_SIZE bsize, TX_SIZE max_tx_size) { + MACROBLOCKD *xd = &x->e_mbd; MB_PREDICTION_MODE mode; MB_PREDICTION_MODE mode_selected = DC_PRED; int64_t best_rd = INT64_MAX, this_rd; @@ -1316,9 +1365,9 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, if (!(cpi->sf.intra_uv_mode_mask[max_tx_size] & (1 << mode))) continue; - x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode; + xd->mi_8x8[0]->mbmi.uv_mode = mode; - super_block_uvrd(cpi, x, &this_rate_tokenonly, + super_block_uvrd(x, &this_rate_tokenonly, &this_distortion, &s, &this_sse, bsize, best_rd); if (this_rate_tokenonly == INT_MAX) continue; @@ -1336,7 +1385,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, if (!x->select_txfm_size) { int i; struct macroblock_plane *const p = x->plane; - struct macroblockd_plane *const pd = x->e_mbd.plane; + struct macroblockd_plane *const pd = xd->plane; for (i = 1; i < MAX_MB_PLANE; ++i) { p[i].coeff = ctx->coeff_pbuf[i][2]; p[i].qcoeff = ctx->qcoeff_pbuf[i][2]; @@ -1357,25 +1406,21 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, } } - x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode_selected; + xd->mi_8x8[0]->mbmi.uv_mode = mode_selected; return best_rd; } -static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x, +static int64_t rd_sbuv_dcpred(const VP9_COMMON *cm, MACROBLOCK *x, int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, BLOCK_SIZE bsize) { - int64_t this_rd; - int64_t this_sse; + int64_t unused; x->e_mbd.mi_8x8[0]->mbmi.uv_mode = DC_PRED; - super_block_uvrd(cpi, x, rate_tokenonly, distortion, - skippable, &this_sse, bsize, INT64_MAX); - *rate = *rate_tokenonly + - x->intra_uv_mode_cost[cpi->common.frame_type][DC_PRED]; - this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *distortion); - - return this_rd; + super_block_uvrd(x, rate_tokenonly, distortion, + skippable, &unused, bsize, INT64_MAX); + *rate = *rate_tokenonly + x->intra_uv_mode_cost[cm->frame_type][DC_PRED]; + return RDCOST(x->rdmult, x->rddiv, *rate, *distortion); 
} static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, @@ -1388,8 +1433,8 @@ static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, // Use an estimated rd for uv_intra based on DC_PRED if the // appropriate speed flag is set. if (cpi->sf.use_uv_intra_rd_estimate) { - rd_sbuv_dcpred(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv, - bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize); + rd_sbuv_dcpred(&cpi->common, x, rate_uv, rate_uv_tokenonly, dist_uv, + skip_uv, bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize); // Else do a proper rd search for each possible transform size that may // be considered in the main rd loop. } else { @@ -1403,8 +1448,7 @@ static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, static int cost_mv_ref(VP9_COMP *cpi, MB_PREDICTION_MODE mode, int mode_context) { MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; - const int segment_id = xd->mi_8x8[0]->mbmi.segment_id; + const int segment_id = x->e_mbd.mi_8x8[0]->mbmi.segment_id; // Don't account for mode here if segment skip is enabled. if (!vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) { @@ -1429,7 +1473,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int *rate_mv); static int labels2mode(MACROBLOCK *x, int i, - MB_PREDICTION_MODE this_mode, + MB_PREDICTION_MODE mode, int_mv *this_mv, int_mv *this_second_mv, int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], int_mv seg_mvs[MAX_REF_FRAMES], @@ -1439,23 +1483,18 @@ static int labels2mode(MACROBLOCK *x, int i, MACROBLOCKD *const xd = &x->e_mbd; MODE_INFO *const mic = xd->mi_8x8[0]; MB_MODE_INFO *mbmi = &mic->mbmi; - int cost = 0, thismvcost = 0; + int thismvcost = 0; int idx, idy; const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type]; const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type]; const int has_second_rf = has_second_ref(mbmi); - /* We have to be careful retrieving previously-encoded motion vectors. - Ones from this macroblock have to be pulled from the BLOCKD array - as they have not yet made it to the bmi array in our MB_MODE_INFO. 
*/ - MB_PREDICTION_MODE m; - // the only time we should do costing for new motion vector or mode // is when we are on a new label (jbb May 08, 2007) - switch (m = this_mode) { + switch (mode) { case NEWMV: this_mv->as_int = seg_mvs[mbmi->ref_frame[0]].as_int; - thismvcost = vp9_mv_bit_cost(&this_mv->as_mv, &best_ref_mv->as_mv, + thismvcost += vp9_mv_bit_cost(&this_mv->as_mv, &best_ref_mv->as_mv, mvjcost, mvcost, MV_COST_WEIGHT_SUB); if (has_second_rf) { this_second_mv->as_int = seg_mvs[mbmi->ref_frame[1]].as_int; @@ -1467,14 +1506,12 @@ static int labels2mode(MACROBLOCK *x, int i, case NEARESTMV: this_mv->as_int = frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int; if (has_second_rf) - this_second_mv->as_int = - frame_mv[NEARESTMV][mbmi->ref_frame[1]].as_int; + this_second_mv->as_int = frame_mv[NEARESTMV][mbmi->ref_frame[1]].as_int; break; case NEARMV: this_mv->as_int = frame_mv[NEARMV][mbmi->ref_frame[0]].as_int; if (has_second_rf) - this_second_mv->as_int = - frame_mv[NEARMV][mbmi->ref_frame[1]].as_int; + this_second_mv->as_int = frame_mv[NEARMV][mbmi->ref_frame[1]].as_int; break; case ZEROMV: this_mv->as_int = 0; @@ -1485,22 +1522,19 @@ static int labels2mode(MACROBLOCK *x, int i, break; } - cost = cost_mv_ref(cpi, this_mode, - mbmi->mode_context[mbmi->ref_frame[0]]); - mic->bmi[i].as_mv[0].as_int = this_mv->as_int; if (has_second_rf) mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int; - mic->bmi[i].as_mode = m; + mic->bmi[i].as_mode = mode; for (idy = 0; idy < num_4x4_blocks_high; ++idy) for (idx = 0; idx < num_4x4_blocks_wide; ++idx) vpx_memcpy(&mic->bmi[i + idy * 2 + idx], &mic->bmi[i], sizeof(mic->bmi[i])); - cost += thismvcost; - return cost; + return cost_mv_ref(cpi, mode, mbmi->mode_context[mbmi->ref_frame[0]]) + + thismvcost; } static int64_t encode_inter_mb_segment(VP9_COMP *cpi, @@ -1604,13 +1638,11 @@ typedef struct { int mvthresh; } BEST_SEG_INFO; -static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) { - int r = 0; - r |= (mv->as_mv.row >> 3) < x->mv_row_min; - r |= (mv->as_mv.row >> 3) > x->mv_row_max; - r |= (mv->as_mv.col >> 3) < x->mv_col_min; - r |= (mv->as_mv.col >> 3) > x->mv_col_max; - return r; +static INLINE int mv_check_bounds(const MACROBLOCK *x, const MV *mv) { + return (mv->row >> 3) < x->mv_row_min || + (mv->row >> 3) > x->mv_row_max || + (mv->col >> 3) < x->mv_col_min || + (mv->col >> 3) > x->mv_col_max; } static INLINE void mi_buf_shift(MACROBLOCK *x, int i) { @@ -1645,14 +1677,15 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi_buf, int filter_idx, int_mv seg_mvs[4][MAX_REF_FRAMES], int mi_row, int mi_col) { - int i, br = 0, idx, idy; + int k, br = 0, idx, idy; int64_t bd = 0, block_sse = 0; MB_PREDICTION_MODE this_mode; + MACROBLOCKD *xd = &x->e_mbd; VP9_COMMON *cm = &cpi->common; - MODE_INFO *mi = x->e_mbd.mi_8x8[0]; + MODE_INFO *mi = xd->mi_8x8[0]; MB_MODE_INFO *const mbmi = &mi->mbmi; struct macroblock_plane *const p = &x->plane[0]; - struct macroblockd_plane *const pd = &x->e_mbd.plane[0]; + struct macroblockd_plane *const pd = &xd->plane[0]; const int label_count = 4; int64_t this_segment_rd = 0; int label_mv_thresh; @@ -1660,7 +1693,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize = mbmi->sb_type; const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; - vp9_variance_fn_ptr_t *v_fn_ptr; + vp9_variance_fn_ptr_t *v_fn_ptr = &cpi->fn_ptr[bsize]; ENTROPY_CONTEXT t_above[2], t_left[2]; 
BEST_SEG_INFO *bsi = bsi_buf + filter_idx; int mode_idx; @@ -1670,8 +1703,6 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, vpx_memcpy(t_above, pd->above_context, sizeof(t_above)); vpx_memcpy(t_left, pd->left_context, sizeof(t_left)); - v_fn_ptr = &cpi->fn_ptr[bsize]; - // 64 makes this threshold really big effectively // making it so that we very rarely check mvs on // segments. setting this to 1 would make mv thresh @@ -1687,20 +1718,17 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; MB_PREDICTION_MODE mode_selected = ZEROMV; int64_t best_rd = INT64_MAX; - i = idy * 2 + idx; - - frame_mv[ZEROMV][mbmi->ref_frame[0]].as_int = 0; - vp9_append_sub8x8_mvs_for_idx(cm, &x->e_mbd, tile, - i, 0, mi_row, mi_col, - &frame_mv[NEARESTMV][mbmi->ref_frame[0]], - &frame_mv[NEARMV][mbmi->ref_frame[0]]); - if (has_second_rf) { - frame_mv[ZEROMV][mbmi->ref_frame[1]].as_int = 0; - vp9_append_sub8x8_mvs_for_idx(cm, &x->e_mbd, tile, - i, 1, mi_row, mi_col, - &frame_mv[NEARESTMV][mbmi->ref_frame[1]], - &frame_mv[NEARMV][mbmi->ref_frame[1]]); + const int i = idy * 2 + idx; + int ref; + + for (ref = 0; ref < 1 + has_second_rf; ++ref) { + const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref]; + frame_mv[ZEROMV][frame].as_int = 0; + vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, i, ref, mi_row, mi_col, + &frame_mv[NEARESTMV][frame], + &frame_mv[NEARMV][frame]); } + // search for the best motion vector on this segment for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { const struct buf_2d orig_src = x->plane[0].src; @@ -1829,28 +1857,28 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, sadpb, further_steps, 0, v_fn_ptr, &bsi->ref_mv->as_mv, - new_mv); + &new_mv->as_mv); } // Should we do a full search (best quality only) if (cpi->oxcf.mode == MODE_BESTQUALITY || cpi->oxcf.mode == MODE_SECONDPASS_BEST) { + int_mv *const best_mv = &mi->bmi[i].as_mv[0]; /* Check if mvp_full is within the range. 
*/ clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); - thissme = cpi->full_search_sad(x, &mvp_full, sadpb, 16, v_fn_ptr, x->nmvjointcost, x->mvcost, - &bsi->ref_mv->as_mv, i); - + &bsi->ref_mv->as_mv, + &best_mv->as_mv); if (thissme < bestsme) { bestsme = thissme; - new_mv->as_int = mi->bmi[i].as_mv[0].as_int; + new_mv->as_int = best_mv->as_int; } else { - /* The full search result is actually worse so re-instate the - * previous best vector */ - mi->bmi[i].as_mv[0].as_int = new_mv->as_int; + // The full search result is actually worse so re-instate the + // previous best vector + best_mv->as_int = new_mv->as_int; } } @@ -1928,10 +1956,9 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, } // Trap vectors that reach beyond the UMV borders - if (mv_check_bounds(x, &mode_mv[this_mode])) - continue; - if (has_second_rf && - mv_check_bounds(x, &second_mode_mv[this_mode])) + if (mv_check_bounds(x, &mode_mv[this_mode].as_mv) || + (has_second_rf && + mv_check_bounds(x, &second_mode_mv[this_mode].as_mv))) continue; if (filter_idx > 0) { @@ -2042,8 +2069,8 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, bsi->sse = block_sse; // update the coding decisions - for (i = 0; i < 4; ++i) - bsi->modes[i] = mi->bmi[i].as_mode; + for (k = 0; k < 4; ++k) + bsi->modes[k] = mi->bmi[k].as_mode; } static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, @@ -2356,7 +2383,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int sadpb = x->sadperbit16; MV mvp_full; int ref = mbmi->ref_frame[0]; - int_mv ref_mv = mbmi->ref_mvs[ref][0]; + MV ref_mv = mbmi->ref_mvs[ref][0].as_mv; int tmp_col_min = x->mv_col_min; int tmp_col_max = x->mv_col_max; @@ -2366,10 +2393,10 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, ref); - int_mv pred_mv[3]; - pred_mv[0] = mbmi->ref_mvs[ref][0]; - pred_mv[1] = mbmi->ref_mvs[ref][1]; - pred_mv[2] = x->pred_mv[ref]; + MV pred_mv[3]; + pred_mv[0] = mbmi->ref_mvs[ref][0].as_mv; + pred_mv[1] = mbmi->ref_mvs[ref][1].as_mv; + pred_mv[2] = x->pred_mv[ref].as_mv; if (scaled_ref_frame) { int i; @@ -2382,26 +2409,18 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL); } - vp9_set_mv_search_range(x, &ref_mv.as_mv); + vp9_set_mv_search_range(x, &ref_mv); - // Adjust search parameters based on small partitions' result. - if (x->fast_ms) { - // adjust search range - step_param = 6; - if (x->fast_ms > 1) - step_param = 8; + // Work out the size of the first step in the mv step search. + // 0 here is maximum length first step. 1 is MAX >> 1 etc. + if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) { + // Take wtd average of the step_params based on the last frame's + // max mv magnitude and that based on the best ref mvs of the current + // block for the given reference. + step_param = (vp9_init_search_range(cpi, x->max_mv_context[ref]) + + cpi->mv_step_param) >> 1; } else { - // Work out the size of the first step in the mv step search. - // 0 here is maximum length first step. 1 is MAX >> 1 etc. - if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) { - // Take wtd average of the step_params based on the last frame's - // max mv magnitude and that based on the best ref mvs of the current - // block for the given reference. 
- step_param = (vp9_init_search_range(cpi, x->max_mv_context[ref]) + - cpi->mv_step_param) >> 1; - } else { - step_param = cpi->mv_step_param; - } + step_param = cpi->mv_step_param; } if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64 && @@ -2435,7 +2454,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } } - mvp_full = pred_mv[x->mv_best_ref_index[ref]].as_mv; + mvp_full = pred_mv[x->mv_best_ref_index[ref]]; mvp_full.col >>= 3; mvp_full.row >>= 3; @@ -2443,23 +2462,27 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, // Further step/diamond searches as necessary further_steps = (cpi->sf.max_step_search_steps - 1) - step_param; - if (cpi->sf.search_method == HEX) { + if (cpi->sf.search_method == FAST_HEX) { + bestsme = vp9_fast_hex_search(x, &mvp_full, step_param, sadpb, + &cpi->fn_ptr[bsize], 1, + &ref_mv, &tmp_mv->as_mv); + } else if (cpi->sf.search_method == HEX) { bestsme = vp9_hex_search(x, &mvp_full, step_param, sadpb, 1, &cpi->fn_ptr[bsize], 1, - &ref_mv.as_mv, &tmp_mv->as_mv); + &ref_mv, &tmp_mv->as_mv); } else if (cpi->sf.search_method == SQUARE) { bestsme = vp9_square_search(x, &mvp_full, step_param, sadpb, 1, &cpi->fn_ptr[bsize], 1, - &ref_mv.as_mv, &tmp_mv->as_mv); + &ref_mv, &tmp_mv->as_mv); } else if (cpi->sf.search_method == BIGDIA) { bestsme = vp9_bigdia_search(x, &mvp_full, step_param, sadpb, 1, &cpi->fn_ptr[bsize], 1, - &ref_mv.as_mv, &tmp_mv->as_mv); + &ref_mv, &tmp_mv->as_mv); } else { bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, sadpb, further_steps, 1, &cpi->fn_ptr[bsize], - &ref_mv.as_mv, tmp_mv); + &ref_mv, &tmp_mv->as_mv); } x->mv_col_min = tmp_col_min; @@ -2469,7 +2492,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, if (bestsme < INT_MAX) { int dis; /* TODO: use dis in distortion calculation later. 
*/ - cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv.as_mv, + cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv, cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], @@ -2478,7 +2501,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref]); } - *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv.as_mv, + *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); if (cpi->sf.adaptive_motion_search && cpi->common.show_frame) @@ -2705,6 +2728,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int_mv tmp_mv; single_motion_search(cpi, x, tile, bsize, mi_row, mi_col, &tmp_mv, &rate_mv); + if (tmp_mv.as_int == INVALID_MV) + return INT64_MAX; *rate2 += rate_mv; frame_mv[refs[0]].as_int = xd->mi_8x8[0]->bmi[0].as_mv[0].as_int = tmp_mv.as_int; @@ -2717,7 +2742,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, frame_mv[refs[0]].as_int == 0 && !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) && (num_refs == 1 || frame_mv[refs[1]].as_int == 0)) { - int rfc = mbmi->mode_context[mbmi->ref_frame[0]]; + int rfc = mbmi->mode_context[refs[0]]; int c1 = cost_mv_ref(cpi, NEARMV, rfc); int c2 = cost_mv_ref(cpi, NEARESTMV, rfc); int c3 = cost_mv_ref(cpi, ZEROMV, rfc); @@ -2732,17 +2757,17 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, assert(this_mode == ZEROMV); if (num_refs == 1) { if ((c3 >= c2 && - mode_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0) || + mode_mv[NEARESTMV][refs[0]].as_int == 0) || (c3 >= c1 && - mode_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0)) + mode_mv[NEARMV][refs[0]].as_int == 0)) return INT64_MAX; } else { if ((c3 >= c2 && - mode_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0 && - mode_mv[NEARESTMV][mbmi->ref_frame[1]].as_int == 0) || + mode_mv[NEARESTMV][refs[0]].as_int == 0 && + mode_mv[NEARESTMV][refs[1]].as_int == 0) || (c3 >= c1 && - mode_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0 && - mode_mv[NEARMV][mbmi->ref_frame[1]].as_int == 0)) + mode_mv[NEARMV][refs[0]].as_int == 0 && + mode_mv[NEARMV][refs[1]].as_int == 0)) return INT64_MAX; } } @@ -2754,7 +2779,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (this_mode != NEWMV) clamp_mv2(&cur_mv[i].as_mv, xd); - if (mv_check_bounds(x, &cur_mv[i])) + if (mv_check_bounds(x, &cur_mv[i].as_mv)) return INT64_MAX; mbmi->mv[i].as_int = cur_mv[i].as_int; } @@ -2773,8 +2798,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other * words if you present them in that order, the second one is always known * if the first is known */ - *rate2 += cost_mv_ref(cpi, this_mode, - mbmi->mode_context[mbmi->ref_frame[0]]); + *rate2 += cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]); if (!(*mode_excluded)) *mode_excluded = is_comp_pred ? 
cm->reference_mode == SINGLE_REFERENCE @@ -2910,33 +2934,26 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (cm->interp_filter == SWITCHABLE) *rate2 += get_switchable_rate(x); - if (!is_comp_pred && cpi->enable_encode_breakout) { + if (!is_comp_pred) { if (cpi->active_map_enabled && x->active_ptr[0] == 0) x->skip = 1; - else if (x->encode_breakout) { + else if (cpi->allow_encode_breakout && x->encode_breakout) { const BLOCK_SIZE y_size = get_plane_block_size(bsize, &xd->plane[0]); const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]); unsigned int var, sse; // Skipping threshold for ac. unsigned int thresh_ac; - // The encode_breakout input - unsigned int encode_breakout = x->encode_breakout << 4; - unsigned int max_thresh = 36000; - + // Set a maximum for threshold to avoid big PSNR loss in low bitrate case. // Use extreme low threshold for static frames to limit skipping. - if (cpi->enable_encode_breakout == 2) - max_thresh = 128; + const unsigned int max_thresh = (cpi->allow_encode_breakout == + ENCODE_BREAKOUT_LIMITED) ? 128 : 36000; + // The encode_breakout input + const unsigned int min_thresh = + MIN(((unsigned int)x->encode_breakout << 4), max_thresh); // Calculate threshold according to dequant value. thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9; - - // Use encode_breakout input if it is bigger than internal threshold. - if (thresh_ac < encode_breakout) - thresh_ac = encode_breakout; - - // Set a maximum for threshold to avoid big PSNR loss in low bitrate case. - if (thresh_ac > max_thresh) - thresh_ac = max_thresh; + thresh_ac = clamp(thresh_ac, min_thresh, max_thresh); var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf, @@ -2999,8 +3016,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int64_t rdcosty = INT64_MAX; // Y cost and distortion - super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse, - bsize, txfm_cache, ref_best_rd); + inter_super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse, + bsize, txfm_cache, ref_best_rd); if (*rate_y == INT_MAX) { *rate2 = INT_MAX; @@ -3015,7 +3032,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion); rdcosty = MIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse)); - super_block_uvrd(cpi, x, rate_uv, distortion_uv, &skippable_uv, &sseuv, + super_block_uvrd(x, rate_uv, distortion_uv, &skippable_uv, &sseuv, bsize, ref_best_rd - rdcosty); if (*rate_uv == INT_MAX) { *rate2 = INT_MAX; @@ -3123,10 +3140,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far) { - VP9_COMMON *cm = &cpi->common; - MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; - const struct segmentation *seg = &cm->seg; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; + const struct segmentation *const seg = &cm->seg; const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]); MB_PREDICTION_MODE this_mode; MV_REFERENCE_FRAME ref_frame, second_ref_frame; @@ -3162,12 +3179,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, const int bws = num_8x8_blocks_wide_lookup[bsize] / 2; const int bhs = num_8x8_blocks_high_lookup[bsize] / 2; int best_skip2 = 0; + int mode_skip_mask = 0; + const int mode_skip_start = cpi->sf.mode_skip_start + 1; + const int *const 
rd_threshes = cpi->rd_threshes[segment_id][bsize]; + const int *const rd_thresh_freq_fact = cpi->rd_thresh_freq_fact[bsize]; + const int mode_search_skip_flags = cpi->sf.mode_search_skip_flags; x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; - // Everywhere the flag is set the error is much higher than its neighbors. - ctx->modes_with_high_error = 0; - estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp, &comp_mode_p); @@ -3195,16 +3214,72 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, frame_mv[ZEROMV][ref_frame].as_int = 0; } - cpi->ref_frame_mask = 0; - for (ref_frame = LAST_FRAME; - ref_frame <= ALTREF_FRAME && cpi->sf.reference_masking; ++ref_frame) { - int i; - for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { - if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) { - cpi->ref_frame_mask |= (1 << ref_frame); - break; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + // All modes from vp9_mode_order that use this frame as any ref + static const int ref_frame_mask_all[] = { + 0x0, 0x123291, 0x25c444, 0x39b722 + }; + // Fixed mv modes (NEARESTMV, NEARMV, ZEROMV) from vp9_mode_order that use + // this frame as their primary ref + static const int ref_frame_mask_fixedmv[] = { + 0x0, 0x121281, 0x24c404, 0x080102 + }; + if (!(cpi->ref_frame_flags & flag_list[ref_frame])) { + // Skip modes for missing references + mode_skip_mask |= ref_frame_mask_all[ref_frame]; + } else if (cpi->sf.reference_masking) { + for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + // Skip fixed mv modes for poor references + if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) { + mode_skip_mask |= ref_frame_mask_fixedmv[ref_frame]; + break; + } } } + // If the segment reference frame feature is enabled.... + // then do nothing if the current ref frame is not allowed.. + if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && + vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) { + mode_skip_mask |= ref_frame_mask_all[ref_frame]; + } + } + + // If the segment skip feature is enabled.... + // then do nothing if the current mode is not allowed.. + if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) { + const int inter_non_zero_mode_mask = 0x1F7F7; + mode_skip_mask |= inter_non_zero_mode_mask; + } + + // Disable this drop out case if the ref frame + // segment level feature is enabled for this segment. This is to + // prevent the possibility that we end up unable to pick any mode. + if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { + // Only consider ZEROMV/ALTREF_FRAME for alt ref frame, + // unless ARNR filtering is enabled in which case we want + // an unfiltered alternative. We allow near/nearest as well + // because they may result in zero-zero MVs but be cheaper. + if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) { + const int altref_zero_mask = + ~((1 << THR_NEARESTA) | (1 << THR_NEARA) | (1 << THR_ZEROA)); + mode_skip_mask |= altref_zero_mask; + if (frame_mv[NEARMV][ALTREF_FRAME].as_int != 0) + mode_skip_mask |= (1 << THR_NEARA); + if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != 0) + mode_skip_mask |= (1 << THR_NEARESTA); + } + } + + // TODO(JBB): This is to make up for the fact that we don't have sad + // functions that work when the block size reads outside the umv. 
We + // should fix this either by making the motion search just work on + // a representative block in the boundary ( first ) and then implement a + // function that does sads when inside the border.. + if ((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) { + const int new_modes_mask = + (1 << THR_NEWMV) | (1 << THR_NEWG) | (1 << THR_NEWA) | + (1 << THR_COMP_NEWLA) | (1 << THR_COMP_NEWGA); + mode_skip_mask |= new_modes_mask; } for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) { @@ -3218,109 +3293,95 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int64_t tx_cache[TX_MODES]; int i; int this_skip2 = 0; - int64_t total_sse = INT_MAX; + int64_t total_sse = INT64_MAX; int early_term = 0; - for (i = 0; i < TX_MODES; ++i) - tx_cache[i] = INT64_MAX; - - x->skip = 0; - this_mode = vp9_mode_order[mode_index].mode; - ref_frame = vp9_mode_order[mode_index].ref_frame[0]; - second_ref_frame = vp9_mode_order[mode_index].ref_frame[1]; - // Look at the reference frame of the best mode so far and set the // skip mask to look at a subset of the remaining modes. - if (mode_index > cpi->sf.mode_skip_start) { - if (mode_index == (cpi->sf.mode_skip_start + 1)) { - switch (vp9_mode_order[best_mode_index].ref_frame[0]) { - case INTRA_FRAME: - cpi->mode_skip_mask = 0; - break; - case LAST_FRAME: - cpi->mode_skip_mask = LAST_FRAME_MODE_MASK; - break; - case GOLDEN_FRAME: - cpi->mode_skip_mask = GOLDEN_FRAME_MODE_MASK; - break; - case ALTREF_FRAME: - cpi->mode_skip_mask = ALT_REF_MODE_MASK; - break; - case NONE: - case MAX_REF_FRAMES: - assert(0 && "Invalid Reference frame"); - } + if (mode_index == mode_skip_start) { + switch (vp9_mode_order[best_mode_index].ref_frame[0]) { + case INTRA_FRAME: + break; + case LAST_FRAME: + mode_skip_mask |= LAST_FRAME_MODE_MASK; + break; + case GOLDEN_FRAME: + mode_skip_mask |= GOLDEN_FRAME_MODE_MASK; + break; + case ALTREF_FRAME: + mode_skip_mask |= ALT_REF_MODE_MASK; + break; + case NONE: + case MAX_REF_FRAMES: + assert(0 && "Invalid Reference frame"); } - if (cpi->mode_skip_mask & ((int64_t)1 << mode_index)) - continue; } - - // Skip if the current reference frame has been masked off - if (cpi->ref_frame_mask & (1 << ref_frame) && this_mode != NEWMV) + if (mode_skip_mask & (1 << mode_index)) continue; // Test best rd so far against threshold for trying this mode. - if ((best_rd < ((int64_t)cpi->rd_threshes[segment_id][bsize][mode_index] * - cpi->rd_thresh_freq_fact[bsize][mode_index] >> 5)) || - cpi->rd_threshes[segment_id][bsize][mode_index] == INT_MAX) - continue; - - // Do not allow compound prediction if the segment level reference - // frame feature is in use as in this case there can only be one reference. - if ((second_ref_frame > INTRA_FRAME) && - vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) - continue; - - // Skip some checking based on small partitions' result. 
- if (x->fast_ms > 1 && !ref_frame) - continue; - if (x->fast_ms > 2 && ref_frame != x->subblock_ref) - continue; - - mbmi->ref_frame[0] = ref_frame; - mbmi->ref_frame[1] = second_ref_frame; + if (best_rd < ((int64_t)rd_threshes[mode_index] * + rd_thresh_freq_fact[mode_index] >> 5) || + rd_threshes[mode_index] == INT_MAX) + continue; - if (!(ref_frame == INTRA_FRAME - || (cpi->ref_frame_flags & flag_list[ref_frame]))) { - continue; - } - if (!(second_ref_frame == NONE - || (cpi->ref_frame_flags & flag_list[second_ref_frame]))) { - continue; - } + this_mode = vp9_mode_order[mode_index].mode; + ref_frame = vp9_mode_order[mode_index].ref_frame[0]; + second_ref_frame = vp9_mode_order[mode_index].ref_frame[1]; comp_pred = second_ref_frame > INTRA_FRAME; if (comp_pred) { - if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) - if (vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME) - continue; - if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH) - if (ref_frame != best_inter_ref_frame && - second_ref_frame != best_inter_ref_frame) + if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) && + vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME) + continue; + if ((mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH) && + ref_frame != best_inter_ref_frame && + second_ref_frame != best_inter_ref_frame) + continue; + mode_excluded = mode_excluded ? + mode_excluded : cm->reference_mode == SINGLE_REFERENCE; + } else { + if (ref_frame != INTRA_FRAME && second_ref_frame != INTRA_FRAME) + mode_excluded = mode_excluded ? + mode_excluded : cm->reference_mode == COMPOUND_REFERENCE; + } + + if (ref_frame == INTRA_FRAME) { + // Disable intra modes other than DC_PRED for blocks with low variance + // Threshold for intra skipping based on source variance + // TODO(debargha): Specialize the threshold for super block sizes + static const unsigned int skip_intra_var_thresh[BLOCK_SIZES] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + }; + if ((mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) && + this_mode != DC_PRED && + x->source_variance < skip_intra_var_thresh[bsize]) + continue; + // Only search the oblique modes if the best so far is + // one of the neighboring directional modes + if ((mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) && + (this_mode >= D45_PRED && this_mode <= TM_PRED)) { + if (vp9_mode_order[best_mode_index].ref_frame[0] > INTRA_FRAME) continue; + } + if (mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { + if (conditional_skipintra(this_mode, best_intra_mode)) + continue; + } } - set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); + mbmi->mode = this_mode; mbmi->uv_mode = DC_PRED; - + mbmi->ref_frame[0] = ref_frame; + mbmi->ref_frame[1] = second_ref_frame; // Evaluate all sub-pel filters irrespective of whether we can use // them for this frame. mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP : cm->interp_filter; + x->skip = 0; + set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter); - if (comp_pred) { - if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) - continue; - - mode_excluded = mode_excluded ? mode_excluded - : cm->reference_mode == SINGLE_REFERENCE; - } else { - if (ref_frame != INTRA_FRAME && second_ref_frame != INTRA_FRAME) - mode_excluded = mode_excluded ? - mode_excluded : cm->reference_mode == COMPOUND_REFERENCE; - } - // Select prediction reference frames. 
for (i = 0; i < MAX_MB_PLANE; i++) { xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; @@ -3328,46 +3389,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i]; } - // If the segment reference frame feature is enabled.... - // then do nothing if the current ref frame is not allowed.. - if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && - vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != - (int)ref_frame) { - continue; - // If the segment skip feature is enabled.... - // then do nothing if the current mode is not allowed.. - } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) && - (this_mode != ZEROMV && ref_frame != INTRA_FRAME)) { - continue; - // Disable this drop out case if the ref frame - // segment level feature is enabled for this segment. This is to - // prevent the possibility that we end up unable to pick any mode. - } else if (!vp9_segfeature_active(seg, segment_id, - SEG_LVL_REF_FRAME)) { - // Only consider ZEROMV/ALTREF_FRAME for alt ref frame, - // unless ARNR filtering is enabled in which case we want - // an unfiltered alternative. We allow near/nearest as well - // because they may result in zero-zero MVs but be cheaper. - if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) { - if ((this_mode != ZEROMV && - !(this_mode == NEARMV && - frame_mv[NEARMV][ALTREF_FRAME].as_int == 0) && - !(this_mode == NEARESTMV && - frame_mv[NEARESTMV][ALTREF_FRAME].as_int == 0)) || - ref_frame != ALTREF_FRAME) { - continue; - } - } - } - // TODO(JBB): This is to make up for the fact that we don't have sad - // functions that work when the block size reads outside the umv. We - // should fix this either by making the motion search just work on - // a representative block in the boundary ( first ) and then implement a - // function that does sads when inside the border.. 
- if (((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) && - this_mode == NEWMV) { - continue; - } + for (i = 0; i < TX_MODES; ++i) + tx_cache[i] = INT64_MAX; #ifdef MODE_TEST_HIT_STATS // TEST/DEBUG CODE @@ -3375,34 +3398,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, cpi->mode_test_hits[bsize]++; #endif - if (ref_frame == INTRA_FRAME) { TX_SIZE uv_tx; - // Disable intra modes other than DC_PRED for blocks with low variance - // Threshold for intra skipping based on source variance - // TODO(debargha): Specialize the threshold for super block sizes - static const unsigned int skip_intra_var_thresh[BLOCK_SIZES] = { - 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, - }; - if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) && - this_mode != DC_PRED && - x->source_variance < skip_intra_var_thresh[mbmi->sb_type]) - continue; - // Only search the oblique modes if the best so far is - // one of the neighboring directional modes - if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) && - (this_mode >= D45_PRED && this_mode <= TM_PRED)) { - if (vp9_mode_order[best_mode_index].ref_frame[0] > INTRA_FRAME) - continue; - } - mbmi->mode = this_mode; - if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { - if (conditional_skipintra(mbmi->mode, best_intra_mode)) - continue; - } - - super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, - bsize, tx_cache, best_rd); + intra_super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, + bsize, tx_cache, best_rd); if (rate_y == INT_MAX) continue; @@ -3424,8 +3423,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, rate2 += intra_cost_penalty; distortion2 = distortion_y + distortion_uv; } else { - mbmi->mode = this_mode; - compmode_cost = vp9_cost_bit(comp_mode_p, second_ref_frame > INTRA_FRAME); this_rd = handle_inter_mode(cpi, x, tile, bsize, tx_cache, &rate2, &distortion2, &skippable, @@ -3437,14 +3434,16 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, single_newmv, &total_sse, best_rd); if (this_rd == INT64_MAX) continue; - } - if (cm->reference_mode == REFERENCE_MODE_SELECT) - rate2 += compmode_cost; + compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred); + + if (cm->reference_mode == REFERENCE_MODE_SELECT) + rate2 += compmode_cost; + } // Estimate the reference frame signaling cost and add it // to the rolling cost variable. - if (second_ref_frame > INTRA_FRAME) { + if (comp_pred) { rate2 += ref_costs_comp[ref_frame]; } else { rate2 += ref_costs_single[ref_frame]; @@ -3552,7 +3551,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // TODO(debargha): enhance this test with a better distortion prediction // based on qp, activity mask and history - if ((cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) && + if ((mode_search_skip_flags & FLAG_EARLY_TERMINATE) && (mode_index > MIN_EARLY_TERM_INDEX)) { const int qstep = xd->plane[0].dequant[1]; // TODO(debargha): Enhance this by specializing for each mode_index @@ -3662,17 +3661,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } } - // Flag all modes that have a distortion thats > 2x the best we found at - // this level. 
- for (mode_index = 0; mode_index < MB_MODE_COUNT; ++mode_index) { - if (mode_index == NEARESTMV || mode_index == NEARMV || mode_index == NEWMV) - continue; - - if (mode_distortions[mode_index] > 2 * *returndistortion) { - ctx->modes_with_high_error |= (1 << mode_index); - } - } - assert((cm->interp_filter == SWITCHABLE) || (cm->interp_filter == best_mbmode.interp_filter) || !is_inter_block(&best_mbmode)); @@ -3787,6 +3775,8 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int_mv seg_mvs[4][MAX_REF_FRAMES]; b_mode_info best_bmodes[4]; int best_skip2 = 0; + int ref_frame_mask = 0; + int mode_skip_mask = 0; x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4); @@ -3822,13 +3812,12 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, frame_mv[ZEROMV][ref_frame].as_int = 0; } - cpi->ref_frame_mask = 0; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME && cpi->sf.reference_masking; ++ref_frame) { int i; for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { if ((x->pred_mv_sad[ref_frame] >> 1) > x->pred_mv_sad[i]) { - cpi->ref_frame_mask |= (1 << ref_frame); + ref_frame_mask |= (1 << ref_frame); break; } } @@ -3861,23 +3850,23 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, if (mode_index == 3) { switch (vp9_ref_order[best_mode_index].ref_frame[0]) { case INTRA_FRAME: - cpi->mode_skip_mask = 0; + mode_skip_mask = 0; break; case LAST_FRAME: - cpi->mode_skip_mask = 0x0010; + mode_skip_mask = 0x0010; break; case GOLDEN_FRAME: - cpi->mode_skip_mask = 0x0008; + mode_skip_mask = 0x0008; break; case ALTREF_FRAME: - cpi->mode_skip_mask = 0x0000; + mode_skip_mask = 0x0000; break; case NONE: case MAX_REF_FRAMES: assert(0 && "Invalid Reference frame"); } } - if (cpi->mode_skip_mask & ((int64_t)1 << mode_index)) + if (mode_skip_mask & (1 << mode_index)) continue; } @@ -4137,11 +4126,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, if (tmp_rd == INT64_MAX) continue; } else { - if (cm->interp_filter == SWITCHABLE) { - int rs = get_switchable_rate(x); - tmp_best_rdu -= RDCOST(x->rdmult, x->rddiv, rs, 0); - } - tmp_rd = tmp_best_rdu; total_sse = tmp_best_sse; rate = tmp_best_rate; rate_y = tmp_best_ratey; @@ -4173,7 +4157,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, // then dont bother looking at UV vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col, BLOCK_8X8); - super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable, + super_block_uvrd(x, &rate_uv, &distortion_uv, &uv_skippable, &uv_sse, BLOCK_8X8, tmp_best_rdu); if (rate_uv == INT_MAX) continue; @@ -4392,7 +4376,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, if (best_rd == INT64_MAX && bsize < BLOCK_8X8) { *returnrate = INT_MAX; - *returndistortion = INT_MAX; + *returndistortion = INT64_MAX; return best_rd; } diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h index 96cea4216..6b85d67f8 100644 --- a/vp9/encoder/vp9_rdopt.h +++ b/vp9/encoder/vp9_rdopt.h @@ -80,10 +80,10 @@ void vp9_init_me_luts(); void vp9_set_mbmode_and_mvs(MACROBLOCKD *xd, MB_PREDICTION_MODE mode, const MV *mv); -void vp9_get_entropy_contexts(TX_SIZE tx_size, - ENTROPY_CONTEXT t_above[16], ENTROPY_CONTEXT t_left[16], - const ENTROPY_CONTEXT *above, const ENTROPY_CONTEXT *left, - int num_4x4_w, int num_4x4_h); +void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, + const struct macroblockd_plane *pd, + ENTROPY_CONTEXT t_above[16], + ENTROPY_CONTEXT 
t_left[16]); #ifdef __cplusplus } // extern "C" diff --git a/vp9/encoder/vp9_resize.c b/vp9/encoder/vp9_resize.c index 0766b5107..4e6efaeb9 100644 --- a/vp9/encoder/vp9_resize.c +++ b/vp9/encoder/vp9_resize.c @@ -14,6 +14,7 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> + #include "vp9/common/vp9_common.h" #include "vp9/encoder/vp9_resize.h" @@ -24,9 +25,6 @@ #define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1) #define INTERP_PRECISION_BITS 32 -#define ROUND_POWER_OF_TWO(value, n) \ - (((value) + (1 << ((n) - 1))) >> (n)) - typedef int16_t interp_kernel[INTERP_TAPS]; // Filters for interpolation (0.5-band) - note this also filters integer pels. diff --git a/vp9/encoder/vp9_sad_c.c b/vp9/encoder/vp9_sad.c index 58c5df47e..58c5df47e 100644 --- a/vp9/encoder/vp9_sad_c.c +++ b/vp9/encoder/vp9_sad.c diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index e822e4c64..502e4b678 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -20,7 +20,6 @@ #include "vp9/encoder/vp9_firstpass.h" #include "vp9/encoder/vp9_mcomp.h" #include "vp9/encoder/vp9_onyx_int.h" -#include "vp9/encoder/vp9_psnr.h" #include "vp9/encoder/vp9_quantize.h" #include "vp9/encoder/vp9_ratectrl.h" #include "vp9/encoder/vp9_segmentation.h" @@ -29,7 +28,6 @@ #include "vpx_scale/vpx_scale.h" #define ALT_REF_MC_ENABLED 1 // dis/enable MC in AltRef filtering -#define ALT_REF_SUBPEL_ENABLED 1 // dis/enable subpel in MC AltRef filtering static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd, uint8_t *y_mb_ptr, @@ -134,17 +132,16 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, int sadpb = x->sadperbit16; int bestsme = INT_MAX; - int_mv best_ref_mv1; - int_mv best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ - int_mv *ref_mv; + MV best_ref_mv1 = {0, 0}; + MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ + MV *ref_mv = &x->e_mbd.mi_8x8[0]->bmi[0].as_mv[0].as_mv; // Save input state struct buf_2d src = x->plane[0].src; struct buf_2d pre = xd->plane[0].pre[0]; - best_ref_mv1.as_int = 0; - best_ref_mv1_full.as_mv.col = best_ref_mv1.as_mv.col >> 3; - best_ref_mv1_full.as_mv.row = best_ref_mv1.as_mv.row >> 3; + best_ref_mv1_full.col = best_ref_mv1.col >> 3; + best_ref_mv1_full.row = best_ref_mv1.row >> 3; // Setup frame pointers x->plane[0].src.buf = arf_frame_buf; @@ -161,21 +158,17 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, /*cpi->sf.search_method == HEX*/ // Ignore mv costing by sending NULL pointer instead of cost arrays - ref_mv = &x->e_mbd.mi_8x8[0]->bmi[0].as_mv[0]; - bestsme = vp9_hex_search(x, &best_ref_mv1_full.as_mv, - step_param, sadpb, 1, - &cpi->fn_ptr[BLOCK_16X16], - 0, &best_ref_mv1.as_mv, &ref_mv->as_mv); + vp9_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1, + &cpi->fn_ptr[BLOCK_16X16], 0, &best_ref_mv1, ref_mv); -#if ALT_REF_SUBPEL_ENABLED // Try sub-pixel MC? 
// if (bestsme > error_thresh && bestsme < INT_MAX) { int distortion; unsigned int sse; // Ignore mv costing by sending NULL pointer instead of cost array - bestsme = cpi->find_fractional_mv_step(x, &ref_mv->as_mv, - &best_ref_mv1.as_mv, + bestsme = cpi->find_fractional_mv_step(x, ref_mv, + &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], @@ -183,7 +176,6 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, NULL, NULL, &distortion, &sse); } -#endif // Restore input state x->plane[0].src = src; @@ -523,11 +515,16 @@ void vp9_configure_arnr_filter(VP9_COMP *cpi, cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd; // Adjust the strength based on active max q - q = ((int)vp9_convert_qindex_to_q(cpi->rc.active_worst_quality) >> 1); - if (q > 8) { + if (cpi->common.current_video_frame > 1) + q = ((int)vp9_convert_qindex_to_q( + cpi->rc.avg_frame_qindex[INTER_FRAME])); + else + q = ((int)vp9_convert_qindex_to_q( + cpi->rc.avg_frame_qindex[KEY_FRAME])); + if (q > 16) { cpi->active_arnr_strength = cpi->oxcf.arnr_strength; } else { - cpi->active_arnr_strength = cpi->oxcf.arnr_strength - (8 - q); + cpi->active_arnr_strength = cpi->oxcf.arnr_strength - ((16 - q) / 2); if (cpi->active_arnr_strength < 0) cpi->active_arnr_strength = 0; } diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c index ed1301a8a..7ae110707 100644 --- a/vp9/encoder/vp9_tokenize.c +++ b/vp9/encoder/vp9_tokenize.c @@ -23,8 +23,8 @@ static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2]; const TOKENVALUE *vp9_dct_value_tokens_ptr; -static int dct_value_cost[DCT_MAX_VALUE * 2]; -const int *vp9_dct_value_cost_ptr; +static int16_t dct_value_cost[DCT_MAX_VALUE * 2]; +const int16_t *vp9_dct_value_cost_ptr; // Array indices are identical to previously-existing CONTEXT_NODE indices const vp9_tree_index vp9_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = { @@ -160,7 +160,6 @@ struct tokenize_b_args { VP9_COMP *cpi; MACROBLOCKD *xd; TOKENEXTRA **tp; - TX_SIZE tx_size; uint8_t *token_cache; }; @@ -188,6 +187,18 @@ static INLINE void add_token(TOKENEXTRA **t, const vp9_prob *context_tree, ++counts[token]; } +static INLINE void add_token_no_extra(TOKENEXTRA **t, + const vp9_prob *context_tree, + uint8_t token, + uint8_t skip_eob_node, + unsigned int *counts) { + (*t)->token = token; + (*t)->context_tree = context_tree; + (*t)->skip_eob_node = skip_eob_node; + (*t)++; + ++counts[token]; +} + static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { struct tokenize_b_args* const args = arg; @@ -199,17 +210,22 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize, struct macroblockd_plane *pd = &xd->plane[plane]; MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; int pt; /* near block/prev token context index */ - int c = 0; + int c; TOKENEXTRA *t = *tp; /* store tokens starting here */ int eob = p->eobs[block]; const PLANE_TYPE type = pd->plane_type; - const int16_t *qcoeff_ptr = BLOCK_OFFSET(p->qcoeff, block); + const int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); const int segment_id = mbmi->segment_id; const int16_t *scan, *nb; const scan_order *so; - vp9_coeff_count *const counts = cpi->coef_counts[tx_size]; - vp9_coeff_probs_model *const coef_probs = cpi->common.fc.coef_probs[tx_size]; const int ref = is_inter_block(mbmi); + unsigned int (*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] = + cpi->coef_counts[tx_size][type][ref]; + vp9_prob (*const coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] = + 
cpi->common.fc.coef_probs[tx_size][type][ref]; + unsigned int (*const eob_branch)[COEFF_CONTEXTS] = + cpi->common.counts.eob_branch[tx_size][type][ref]; + const uint8_t *const band = get_band_translate(tx_size); const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size); @@ -225,27 +241,26 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize, while (c < eob) { int v = 0; int skip_eob = 0; - v = qcoeff_ptr[scan[c]]; + v = qcoeff[scan[c]]; while (!v) { - add_token(&t, coef_probs[type][ref][band[c]][pt], 0, ZERO_TOKEN, skip_eob, - counts[type][ref][band[c]][pt]); - - cpi->common.counts.eob_branch[tx_size][type][ref][band[c]][pt] += - !skip_eob; + add_token_no_extra(&t, coef_probs[band[c]][pt], ZERO_TOKEN, skip_eob, + counts[band[c]][pt]); + eob_branch[band[c]][pt] += !skip_eob; skip_eob = 1; token_cache[scan[c]] = 0; ++c; pt = get_coef_context(nb, token_cache, c); - v = qcoeff_ptr[scan[c]]; + v = qcoeff[scan[c]]; } - add_token(&t, coef_probs[type][ref][band[c]][pt], - vp9_dct_value_tokens_ptr[v].extra, - vp9_dct_value_tokens_ptr[v].token, skip_eob, - counts[type][ref][band[c]][pt]); - cpi->common.counts.eob_branch[tx_size][type][ref][band[c]][pt] += !skip_eob; + add_token(&t, coef_probs[band[c]][pt], + vp9_dct_value_tokens_ptr[v].extra, + (uint8_t)vp9_dct_value_tokens_ptr[v].token, + (uint8_t)skip_eob, + counts[band[c]][pt]); + eob_branch[band[c]][pt] += !skip_eob; token_cache[scan[c]] = vp9_pt_energy_class[vp9_dct_value_tokens_ptr[v].token]; @@ -253,9 +268,9 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize, pt = get_coef_context(nb, token_cache, c); } if (c < seg_eob) { - add_token(&t, coef_probs[type][ref][band[c]][pt], 0, EOB_TOKEN, 0, - counts[type][ref][band[c]][pt]); - ++cpi->common.counts.eob_branch[tx_size][type][ref][band[c]][pt]; + add_token_no_extra(&t, coef_probs[band[c]][pt], EOB_TOKEN, 0, + counts[band[c]][pt]); + ++eob_branch[band[c]][pt]; } *tp = t; @@ -299,8 +314,8 @@ void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run, const int ctx = vp9_get_skip_context(xd); const int skip_inc = !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP); - struct tokenize_b_args arg = {cpi, xd, t, mbmi->tx_size, cpi->mb.token_cache}; - if (mbmi->skip_coeff) { + struct tokenize_b_args arg = {cpi, xd, t, cpi->mb.token_cache}; + if (mbmi->skip) { if (!dry_run) cm->counts.skip[ctx][1] += skip_inc; reset_skip_context(xd, bsize); diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h index ea86240be..063c0bafe 100644 --- a/vp9/encoder/vp9_tokenize.h +++ b/vp9/encoder/vp9_tokenize.h @@ -47,7 +47,7 @@ struct VP9_COMP; void vp9_tokenize_sb(struct VP9_COMP *cpi, TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize); -extern const int *vp9_dct_value_cost_ptr; +extern const int16_t *vp9_dct_value_cost_ptr; /* TODO: The Token field should be broken out into a separate char array to * improve cache locality, since it's needed for costing when the rest of the * fields are not. 
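The tokenize_b() hunk above binds the loop-invariant part of the coefficient tables once (counts, coef_probs and eob_branch are all pre-indexed by [tx_size][type][ref] before the per-coefficient loop), so the hot path only indexes by band and context. A minimal standalone sketch of that hoisting pattern follows; the dimensions and names are made up for illustration and are not the real vp9 definitions.

#include <stdio.h>

#define TX_SIZES 4
#define PLANE_TYPES 2
#define REF_TYPES 2
#define BANDS 6
#define CONTEXTS 6
#define TOKENS 12

/* Global counter table shaped like cpi->coef_counts (hypothetical layout). */
static unsigned int counts[TX_SIZES][PLANE_TYPES][REF_TYPES][BANDS][CONTEXTS][TOKENS];

/* Before: every token re-derives the full six-level index. */
static void count_token_slow(int tx, int type, int ref,
                             int band, int ctx, int tok) {
  ++counts[tx][type][ref][band][ctx][tok];
}

/* After: the caller binds the invariant [tx][type][ref] part once and the
 * per-token work only indexes by [band][ctx][tok]. */
static void count_token_fast(unsigned int (*c)[CONTEXTS][TOKENS],
                             int band, int ctx, int tok) {
  ++c[band][ctx][tok];
}

int main(void) {
  int tx = 1, type = 0, ref = 1;
  /* Bind the loop-invariant part of the index once, as tokenize_b() now
   * does for counts, coef_probs and eob_branch. */
  unsigned int (*bound)[CONTEXTS][TOKENS] = counts[tx][type][ref];
  int i;

  for (i = 0; i < 10; ++i)
    count_token_fast(bound, i % BANDS, i % CONTEXTS, i % TOKENS);
  count_token_slow(tx, type, ref, 0, 0, 0);

  printf("counts[1][0][1][0][0][0] = %u\n", counts[1][0][1][0][0][0]);
  return 0;
}

Binding the pointer once is also what lets add_token() and add_token_no_extra() take a flat probs/counts argument instead of re-deriving the six-level index for every token, which is the likely motivation for the restructuring.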
diff --git a/vp9/encoder/vp9_vaq.c b/vp9/encoder/vp9_vaq.c index 1f9cb8709..600029b19 100644 --- a/vp9/encoder/vp9_vaq.c +++ b/vp9/encoder/vp9_vaq.c @@ -19,8 +19,8 @@ #include "vp9/encoder/vp9_segmentation.h" #include "vp9/common/vp9_systemdependent.h" -#define ENERGY_MIN (-3) -#define ENERGY_MAX (3) +#define ENERGY_MIN (-1) +#define ENERGY_MAX (1) #define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN + 1) #define ENERGY_IN_BOUNDS(energy)\ assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX) @@ -44,7 +44,7 @@ unsigned int vp9_vaq_segment_id(int energy) { double vp9_vaq_rdmult_ratio(int energy) { ENERGY_IN_BOUNDS(energy); - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); return RDMULT_RATIO(energy); } @@ -52,7 +52,7 @@ double vp9_vaq_rdmult_ratio(int energy) { double vp9_vaq_inv_q_ratio(int energy) { ENERGY_IN_BOUNDS(energy); - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); return Q_RATIO(-energy); } @@ -63,9 +63,9 @@ void vp9_vaq_init() { assert(ENERGY_SPAN <= MAX_SEGMENTS); - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); - base_ratio = 1.8; + base_ratio = 1.5; for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) { Q_RATIO(i) = pow(base_ratio, i/3.0); @@ -75,35 +75,39 @@ void vp9_vaq_init() { void vp9_vaq_frame_setup(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; struct segmentation *seg = &cm->seg; - int base_q = vp9_convert_qindex_to_q(cm->base_qindex); - int base_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + - cm->y_dc_delta_q); + const double base_q = vp9_convert_qindex_to_q(cm->base_qindex); + const int base_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + + cm->y_dc_delta_q); int i; - vp9_enable_segmentation((VP9_PTR)cpi); - vp9_clearall_segfeatures(seg); + if (cm->frame_type == KEY_FRAME || + cpi->refresh_alt_ref_frame || + (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { + vp9_enable_segmentation((VP9_PTR)cpi); + vp9_clearall_segfeatures(seg); - seg->abs_delta = SEGMENT_DELTADATA; + seg->abs_delta = SEGMENT_DELTADATA; - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); - for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) { - int qindex_delta, segment_rdmult; + for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) { + int qindex_delta, segment_rdmult; - if (Q_RATIO(i) == 1) { - // No need to enable SEG_LVL_ALT_Q for this segment - RDMULT_RATIO(i) = 1; - continue; - } + if (Q_RATIO(i) == 1) { + // No need to enable SEG_LVL_ALT_Q for this segment + RDMULT_RATIO(i) = 1; + continue; + } - qindex_delta = vp9_compute_qdelta(cpi, base_q, base_q * Q_RATIO(i)); - vp9_set_segdata(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q, qindex_delta); - vp9_enable_segfeature(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q); + qindex_delta = vp9_compute_qdelta(cpi, base_q, base_q * Q_RATIO(i)); + vp9_set_segdata(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q, qindex_delta); + vp9_enable_segfeature(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q); - segment_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + qindex_delta + - cm->y_dc_delta_q); + segment_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + qindex_delta + + cm->y_dc_delta_q); - RDMULT_RATIO(i) = (double) segment_rdmult / base_rdmult; + RDMULT_RATIO(i) = (double) segment_rdmult / base_rdmult; + } } } @@ -137,11 +141,8 @@ int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) { double energy; unsigned int var = block_variance(cpi, x, bs); - vp9_clear_system_state(); // __asm emms; - - // if (var <= 1000) - // return 0; + vp9_clear_system_state(); - energy = 0.9*(logf(var + 1) - 10.0); - return 
clamp(round(energy), ENERGY_MIN, ENERGY_MAX); + energy = 0.9 * (log(var + 1.0) - 10.0); + return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX); } diff --git a/vp9/encoder/vp9_variance_c.c b/vp9/encoder/vp9_variance.c index 8bc385089..8bc385089 100644 --- a/vp9/encoder/vp9_variance_c.c +++ b/vp9/encoder/vp9_variance.c diff --git a/vp9/encoder/vp9_write_bit_buffer.h b/vp9/encoder/vp9_write_bit_buffer.h index 5958b4806..1795e05e4 100644 --- a/vp9/encoder/vp9_write_bit_buffer.h +++ b/vp9/encoder/vp9_write_bit_buffer.h @@ -29,7 +29,7 @@ static size_t vp9_rb_bytes_written(struct vp9_write_bit_buffer *wb) { } static void vp9_wb_write_bit(struct vp9_write_bit_buffer *wb, int bit) { - const int off = wb->bit_offset; + const int off = (int)wb->bit_offset; const int p = off / CHAR_BIT; const int q = CHAR_BIT - 1 - off % CHAR_BIT; if (q == CHAR_BIT -1) { diff --git a/vp9/encoder/vp9_writer.c b/vp9/encoder/vp9_writer.c index 3d13d07b6..fda1b390e 100644 --- a/vp9/encoder/vp9_writer.c +++ b/vp9/encoder/vp9_writer.c @@ -12,11 +12,6 @@ #include "vp9/encoder/vp9_writer.h" #include "vp9/common/vp9_entropy.h" -#if defined(SECTIONBITS_OUTPUT) -unsigned __int64 Sectionbits[500]; - -#endif - #ifdef ENTROPY_STATS unsigned int active_section = 0; #endif diff --git a/vp9/encoder/vp9_writer.h b/vp9/encoder/vp9_writer.h index 62f555c99..defeec377 100644 --- a/vp9/encoder/vp9_writer.h +++ b/vp9/encoder/vp9_writer.h @@ -44,17 +44,6 @@ static void vp9_write(vp9_writer *br, int bit, int probability) { unsigned int lowvalue = br->lowvalue; register unsigned int shift; -#ifdef ENTROPY_STATS -#if defined(SECTIONBITS_OUTPUT) - - if (bit) - Sectionbits[active_section] += vp9_prob_cost[255 - probability]; - else - Sectionbits[active_section] += vp9_prob_cost[probability]; - -#endif -#endif - split = 1 + (((range - 1) * probability) >> 8); range = split; diff --git a/vp9/encoder/x86/vp9_dct_avx2.c b/vp9/encoder/x86/vp9_dct_avx2.c index ea031fb07..b5269ed03 100644 --- a/vp9/encoder/x86/vp9_dct_avx2.c +++ b/vp9/encoder/x86/vp9_dct_avx2.c @@ -16,7 +16,7 @@ void vp9_fdct4x4_avx2(const int16_t *input, int16_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, - // as the first pass results are transposed, we tranpose the columns (that + // as the first pass results are transposed, we transpose the columns (that // is the transposed rows) and transpose the results (so that it goes back // in normal/row positions). int pass; @@ -46,7 +46,7 @@ void vp9_fdct4x4_avx2(const int16_t *input, int16_t *output, int stride) { in3 = _mm_slli_epi16(in3, 4); // if (i == 0 && input[0]) input[0] += 1; { - // The mask will only contain wether the first value is zero, all + // The mask will only contain whether the first value is zero, all // other comparison will fail as something shifted by 4 (above << 4) // can never be equal to one. 
To increment in the non-zero case, we // add the mask and one for the first element: @@ -59,7 +59,7 @@ void vp9_fdct4x4_avx2(const int16_t *input, int16_t *output, int stride) { } // Do the two transform/transpose passes for (pass = 0; pass < 2; ++pass) { - // Transform 1/2: Add/substract + // Transform 1/2: Add/subtract const __m128i r0 = _mm_add_epi16(in0, in3); const __m128i r1 = _mm_add_epi16(in1, in2); const __m128i r2 = _mm_sub_epi16(in1, in2); @@ -244,32 +244,36 @@ void fadst4_avx2(__m128i *in) { transpose_4x4_avx2(in); } -void vp9_short_fht4x4_avx2(const int16_t *input, int16_t *output, - int stride, int tx_type) { +void vp9_fht4x4_avx2(const int16_t *input, int16_t *output, + int stride, int tx_type) { __m128i in[4]; - load_buffer_4x4_avx2(input, in, stride); + switch (tx_type) { - case 0: // DCT_DCT - fdct4_avx2(in); - fdct4_avx2(in); + case DCT_DCT: + vp9_fdct4x4_avx2(input, output, stride); break; - case 1: // ADST_DCT + case ADST_DCT: + load_buffer_4x4_avx2(input, in, stride); fadst4_avx2(in); fdct4_avx2(in); + write_buffer_4x4_avx2(output, in); break; - case 2: // DCT_ADST + case DCT_ADST: + load_buffer_4x4_avx2(input, in, stride); fdct4_avx2(in); fadst4_avx2(in); + write_buffer_4x4_avx2(output, in); break; - case 3: // ADST_ADST + case ADST_ADST: + load_buffer_4x4_avx2(input, in, stride); fadst4_avx2(in); fadst4_avx2(in); + write_buffer_4x4_avx2(output, in); break; default: assert(0); break; } - write_buffer_4x4_avx2(output, in); } void vp9_fdct8x8_avx2(const int16_t *input, int16_t *output, int stride) { @@ -313,7 +317,7 @@ void vp9_fdct8x8_avx2(const int16_t *input, int16_t *output, int stride) { for (pass = 0; pass < 2; pass++) { // To store results of each pass before the transpose. __m128i res0, res1, res2, res3, res4, res5, res6, res7; - // Add/substract + // Add/subtract const __m128i q0 = _mm_add_epi16(in0, in7); const __m128i q1 = _mm_add_epi16(in1, in6); const __m128i q2 = _mm_add_epi16(in2, in5); @@ -324,7 +328,7 @@ void vp9_fdct8x8_avx2(const int16_t *input, int16_t *output, int stride) { const __m128i q7 = _mm_sub_epi16(in0, in7); // Work on first four results { - // Add/substract + // Add/subtract const __m128i r0 = _mm_add_epi16(q0, q3); const __m128i r1 = _mm_add_epi16(q1, q2); const __m128i r2 = _mm_sub_epi16(q1, q2); @@ -386,7 +390,7 @@ void vp9_fdct8x8_avx2(const int16_t *input, int16_t *output, int stride) { // Combine const __m128i r0 = _mm_packs_epi32(s0, s1); const __m128i r1 = _mm_packs_epi32(s2, s3); - // Add/substract + // Add/subtract const __m128i x0 = _mm_add_epi16(q4, r0); const __m128i x1 = _mm_sub_epi16(q4, r0); const __m128i x2 = _mm_sub_epi16(q7, r1); @@ -1028,40 +1032,46 @@ void fadst8_avx2(__m128i *in) { array_transpose_8x8_avx2(in, in); } -void vp9_short_fht8x8_avx2(const int16_t *input, int16_t *output, - int stride, int tx_type) { +void vp9_fht8x8_avx2(const int16_t *input, int16_t *output, + int stride, int tx_type) { __m128i in[8]; - load_buffer_8x8_avx2(input, in, stride); + switch (tx_type) { - case 0: // DCT_DCT - fdct8_avx2(in); - fdct8_avx2(in); + case DCT_DCT: + vp9_fdct8x8_avx2(input, output, stride); break; - case 1: // ADST_DCT + case ADST_DCT: + load_buffer_8x8_avx2(input, in, stride); fadst8_avx2(in); fdct8_avx2(in); + right_shift_8x8_avx2(in, 1); + write_buffer_8x8_avx2(output, in, 8); break; - case 2: // DCT_ADST + case DCT_ADST: + load_buffer_8x8_avx2(input, in, stride); fdct8_avx2(in); fadst8_avx2(in); + right_shift_8x8_avx2(in, 1); + write_buffer_8x8_avx2(output, in, 8); break; - case 3: // ADST_ADST + case ADST_ADST: 
+ load_buffer_8x8_avx2(input, in, stride); fadst8_avx2(in); fadst8_avx2(in); + right_shift_8x8_avx2(in, 1); + write_buffer_8x8_avx2(output, in, 8); break; default: assert(0); break; } - right_shift_8x8_avx2(in, 1); - write_buffer_8x8_avx2(output, in, 8); } void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, - // as the first pass results are transposed, we tranpose the columns (that + // as the first pass results are transposed, we transpose the columns (that // is the transposed rows) and transpose the results (so that it goes back // in normal/row positions). int pass; @@ -1218,7 +1228,7 @@ void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) { } // Work on the first eight values; fdct8(input, even_results); { - // Add/substract + // Add/subtract const __m128i q0 = _mm_add_epi16(input0, input7); const __m128i q1 = _mm_add_epi16(input1, input6); const __m128i q2 = _mm_add_epi16(input2, input5); @@ -1229,7 +1239,7 @@ void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) { const __m128i q7 = _mm_sub_epi16(input0, input7); // Work on first four results { - // Add/substract + // Add/subtract const __m128i r0 = _mm_add_epi16(q0, q3); const __m128i r1 = _mm_add_epi16(q1, q2); const __m128i r2 = _mm_sub_epi16(q1, q2); @@ -1293,7 +1303,7 @@ void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) { // Combine const __m128i r0 = _mm_packs_epi32(s0, s1); const __m128i r1 = _mm_packs_epi32(s2, s3); - // Add/substract + // Add/subtract const __m128i x0 = _mm_add_epi16(q4, r0); const __m128i x1 = _mm_sub_epi16(q4, r0); const __m128i x2 = _mm_sub_epi16(q7, r1); @@ -2534,36 +2544,39 @@ void fadst16_avx2(__m128i *in0, __m128i *in1) { array_transpose_16x16_avx2(in0, in1); } -void vp9_short_fht16x16_avx2(const int16_t *input, int16_t *output, - int stride, int tx_type) { +void vp9_fht16x16_avx2(const int16_t *input, int16_t *output, + int stride, int tx_type) { __m128i in0[16], in1[16]; - load_buffer_16x16_avx2(input, in0, in1, stride); + switch (tx_type) { - case 0: // DCT_DCT - fdct16_avx2(in0, in1); - right_shift_16x16_avx2(in0, in1); - fdct16_avx2(in0, in1); + case DCT_DCT: + vp9_fdct16x16_avx2(input, output, stride); break; - case 1: // ADST_DCT + case ADST_DCT: + load_buffer_16x16_avx2(input, in0, in1, stride); fadst16_avx2(in0, in1); right_shift_16x16_avx2(in0, in1); fdct16_avx2(in0, in1); + write_buffer_16x16_avx2(output, in0, in1, 16); break; - case 2: // DCT_ADST + case DCT_ADST: + load_buffer_16x16_avx2(input, in0, in1, stride); fdct16_avx2(in0, in1); right_shift_16x16_avx2(in0, in1); fadst16_avx2(in0, in1); + write_buffer_16x16_avx2(output, in0, in1, 16); break; - case 3: // ADST_ADST + case ADST_ADST: + load_buffer_16x16_avx2(input, in0, in1, stride); fadst16_avx2(in0, in1); right_shift_16x16_avx2(in0, in1); fadst16_avx2(in0, in1); + write_buffer_16x16_avx2(output, in0, in1, 16); break; default: assert(0); break; } - write_buffer_16x16_avx2(output, in0, in1, 16); } #define FDCT32x32_2D_AVX2 vp9_fdct32x32_rd_avx2 diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c index c876cc273..f3735ebd3 100644 --- a/vp9/encoder/x86/vp9_dct_sse2.c +++ b/vp9/encoder/x86/vp9_dct_sse2.c @@ -16,7 +16,7 @@ void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) { // The 2D 
transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, - // as the first pass results are transposed, we tranpose the columns (that + // as the first pass results are transposed, we transpose the columns (that // is the transposed rows) and transpose the results (so that it goes back // in normal/row positions). int pass; @@ -47,7 +47,7 @@ void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) { in1 = _mm_slli_epi16(in1, 4); // if (i == 0 && input[0]) input[0] += 1; { - // The mask will only contain wether the first value is zero, all + // The mask will only contain whether the first value is zero, all // other comparison will fail as something shifted by 4 (above << 4) // can never be equal to one. To increment in the non-zero case, we // add the mask and one for the first element: @@ -60,7 +60,7 @@ void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) { } // Do the two transform/transpose passes for (pass = 0; pass < 2; ++pass) { - // Transform 1/2: Add/substract + // Transform 1/2: Add/subtract const __m128i r0 = _mm_add_epi16(in0, in1); const __m128i r1 = _mm_sub_epi16(in0, in1); const __m128i r2 = _mm_unpacklo_epi64(r0, r1); @@ -242,32 +242,36 @@ void fadst4_sse2(__m128i *in) { transpose_4x4(in); } -void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output, - int stride, int tx_type) { +void vp9_fht4x4_sse2(const int16_t *input, int16_t *output, + int stride, int tx_type) { __m128i in[4]; - load_buffer_4x4(input, in, stride); + switch (tx_type) { - case 0: // DCT_DCT - fdct4_sse2(in); - fdct4_sse2(in); + case DCT_DCT: + vp9_fdct4x4_sse2(input, output, stride); break; - case 1: // ADST_DCT + case ADST_DCT: + load_buffer_4x4(input, in, stride); fadst4_sse2(in); fdct4_sse2(in); + write_buffer_4x4(output, in); break; - case 2: // DCT_ADST + case DCT_ADST: + load_buffer_4x4(input, in, stride); fdct4_sse2(in); fadst4_sse2(in); + write_buffer_4x4(output, in); break; - case 3: // ADST_ADST + case ADST_ADST: + load_buffer_4x4(input, in, stride); fadst4_sse2(in); fadst4_sse2(in); + write_buffer_4x4(output, in); break; - default: - assert(0); - break; + default: + assert(0); + break; } - write_buffer_4x4(output, in); } void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) { @@ -311,7 +315,7 @@ void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) { for (pass = 0; pass < 2; pass++) { // To store results of each pass before the transpose. 
__m128i res0, res1, res2, res3, res4, res5, res6, res7; - // Add/substract + // Add/subtract const __m128i q0 = _mm_add_epi16(in0, in7); const __m128i q1 = _mm_add_epi16(in1, in6); const __m128i q2 = _mm_add_epi16(in2, in5); @@ -322,7 +326,7 @@ void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) { const __m128i q7 = _mm_sub_epi16(in0, in7); // Work on first four results { - // Add/substract + // Add/subtract const __m128i r0 = _mm_add_epi16(q0, q3); const __m128i r1 = _mm_add_epi16(q1, q2); const __m128i r2 = _mm_sub_epi16(q1, q2); @@ -384,7 +388,7 @@ void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) { // Combine const __m128i r0 = _mm_packs_epi32(s0, s1); const __m128i r1 = _mm_packs_epi32(s2, s3); - // Add/substract + // Add/subtract const __m128i x0 = _mm_add_epi16(q4, r0); const __m128i x1 = _mm_sub_epi16(q4, r0); const __m128i x2 = _mm_sub_epi16(q7, r1); @@ -1026,40 +1030,46 @@ void fadst8_sse2(__m128i *in) { array_transpose_8x8(in, in); } -void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output, - int stride, int tx_type) { +void vp9_fht8x8_sse2(const int16_t *input, int16_t *output, + int stride, int tx_type) { __m128i in[8]; - load_buffer_8x8(input, in, stride); + switch (tx_type) { - case 0: // DCT_DCT - fdct8_sse2(in); - fdct8_sse2(in); + case DCT_DCT: + vp9_fdct8x8_sse2(input, output, stride); break; - case 1: // ADST_DCT + case ADST_DCT: + load_buffer_8x8(input, in, stride); fadst8_sse2(in); fdct8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); break; - case 2: // DCT_ADST + case DCT_ADST: + load_buffer_8x8(input, in, stride); fdct8_sse2(in); fadst8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); break; - case 3: // ADST_ADST + case ADST_ADST: + load_buffer_8x8(input, in, stride); fadst8_sse2(in); fadst8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); break; default: assert(0); break; } - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); } void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, - // as the first pass results are transposed, we tranpose the columns (that + // as the first pass results are transposed, we transpose the columns (that // is the transposed rows) and transpose the results (so that it goes back // in normal/row positions). 
int pass; @@ -1216,7 +1226,7 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { } // Work on the first eight values; fdct8(input, even_results); { - // Add/substract + // Add/subtract const __m128i q0 = _mm_add_epi16(input0, input7); const __m128i q1 = _mm_add_epi16(input1, input6); const __m128i q2 = _mm_add_epi16(input2, input5); @@ -1227,7 +1237,7 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { const __m128i q7 = _mm_sub_epi16(input0, input7); // Work on first four results { - // Add/substract + // Add/subtract const __m128i r0 = _mm_add_epi16(q0, q3); const __m128i r1 = _mm_add_epi16(q1, q2); const __m128i r2 = _mm_sub_epi16(q1, q2); @@ -1291,7 +1301,7 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { // Combine const __m128i r0 = _mm_packs_epi32(s0, s1); const __m128i r1 = _mm_packs_epi32(s2, s3); - // Add/substract + // Add/subtract const __m128i x0 = _mm_add_epi16(q4, r0); const __m128i x1 = _mm_sub_epi16(q4, r0); const __m128i x2 = _mm_sub_epi16(q7, r1); @@ -2532,36 +2542,39 @@ void fadst16_sse2(__m128i *in0, __m128i *in1) { array_transpose_16x16(in0, in1); } -void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output, - int stride, int tx_type) { +void vp9_fht16x16_sse2(const int16_t *input, int16_t *output, + int stride, int tx_type) { __m128i in0[16], in1[16]; - load_buffer_16x16(input, in0, in1, stride); + switch (tx_type) { - case 0: // DCT_DCT - fdct16_sse2(in0, in1); - right_shift_16x16(in0, in1); - fdct16_sse2(in0, in1); + case DCT_DCT: + vp9_fdct16x16_sse2(input, output, stride); break; - case 1: // ADST_DCT + case ADST_DCT: + load_buffer_16x16(input, in0, in1, stride); fadst16_sse2(in0, in1); right_shift_16x16(in0, in1); fdct16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); break; - case 2: // DCT_ADST + case DCT_ADST: + load_buffer_16x16(input, in0, in1, stride); fdct16_sse2(in0, in1); right_shift_16x16(in0, in1); fadst16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); break; - case 3: // ADST_ADST + case ADST_ADST: + load_buffer_16x16(input, in0, in1, stride); fadst16_sse2(in0, in1); right_shift_16x16(in0, in1); fadst16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); break; default: assert(0); break; } - write_buffer_16x16(output, in0, in1, 16); } #define FDCT32x32_2D vp9_fdct32x32_rd_sse2 diff --git a/vp9/encoder/x86/vp9_quantize_ssse3.asm b/vp9/encoder/x86/vp9_quantize_ssse3.asm index db306603b..48ccef8cc 100644 --- a/vp9/encoder/x86/vp9_quantize_ssse3.asm +++ b/vp9/encoder/x86/vp9_quantize_ssse3.asm @@ -188,7 +188,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ pmaxsw m8, m7 pshuflw m7, m8, 0x1 pmaxsw m8, m7 - pextrw [r2], m8, 0 + pextrw r6, m8, 0 + mov [r2], r6 RET ; skip-block, i.e. just write all zeroes @@ -214,5 +215,5 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ %endmacro INIT_XMM ssse3 -QUANTIZE_FN b, 6 +QUANTIZE_FN b, 7 QUANTIZE_FN b_32x32, 7 diff --git a/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c b/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c new file mode 100644 index 000000000..b8bfa8900 --- /dev/null +++ b/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c @@ -0,0 +1,641 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <immintrin.h> // AVX2 +#include "vpx_ports/mem.h" +#include "vp9/encoder/vp9_variance.h" + +DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, + 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, + 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, + 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, + 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, + 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, + 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, + 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, + 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, + 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, + 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, + 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, + 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, + 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, + 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, + 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, + 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, + 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, + 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, + 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, + 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, + 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, + 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, + 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, + 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, + 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, + 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15 +}; + +unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, + int src_stride, + int x_offset, + int y_offset, + const uint8_t *dst, + int dst_stride, + int height, + unsigned int *sse) { + __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; + __m256i zero_reg; + int i, sum; + sum_reg = _mm256_set1_epi16(0); + sse_reg = _mm256_set1_epi16(0); + zero_reg = _mm256_set1_epi16(0); + + if (x_offset == 0) { + // x_offset = 0 and y_offset = 0 + if (y_offset == 0) { + for (i = 0; i < height ; i++) { + // load source and destination + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + dst_reg = _mm256_load_si256((__m256i const *) (dst)); + + // expend each byte to 2 bytes + exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg); + + exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); + exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); + + // source - dest + exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); + exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); + + // calculate sum + sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); + exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); + sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); + exp_src_hi = _mm256_madd_epi16(exp_src_hi, 
exp_src_hi); + + // calculate sse + sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); + sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi); + + src+= src_stride; + dst+= dst_stride; + } + // x_offset = 0 and y_offset = 8 + } else if (y_offset == 8) { + __m256i src_next_reg; + for (i = 0; i < height ; i++) { + // load source + next source + destination + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_next_reg = _mm256_loadu_si256((__m256i const *) + (src + src_stride)); + dst_reg = _mm256_load_si256((__m256i const *) (dst)); + // average between current and next stride source + src_reg = _mm256_avg_epu8(src_reg, src_next_reg); + + // expend each byte to 2 bytes + exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg); + + exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); + exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); + + // source - dest + exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); + exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); + + // calculate sum + sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); + exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); + sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); + exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); + sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); + + // calculate sse + sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi); + + src+= src_stride; + dst+= dst_stride; + } + // x_offset = 0 and y_offset = bilin interpolation + } else { + __m256i filter, pw8, src_next_reg; +#if (ARCH_X86_64) + int64_t y_offset64; + y_offset64 = y_offset; + y_offset64 <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset64)); +#else + y_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); +#endif + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height ; i++) { + // load current and next source + destination + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_next_reg = _mm256_loadu_si256((__m256i const *) + (src + src_stride)); + dst_reg = _mm256_load_si256((__m256i const *) (dst)); + + // merge current and next source + exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg); + + // filter the source + exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); + exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); + + // add 8 to the source + exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); + exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); + + // divide by 16 + exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); + exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); + + // expand each byte to 2 byte in the destination + exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); + exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); + + // source - dest + exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); + exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); + + // calculate sum + sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); + exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); + sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); + exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); + + // calculate sse + sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); + sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi); + + src+= src_stride; + dst+= dst_stride; + } + } + // x_offset = 8 and y_offset = 0 + } else if (x_offset == 8) { + if (y_offset == 0) { + __m256i src_next_reg; + for (i = 0; i < height ; i++) { + // load 
source and another source starting from the next + // following byte + destination + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1)); + dst_reg = _mm256_load_si256((__m256i const *) (dst)); + + // average between source and the next byte following source + src_reg = _mm256_avg_epu8(src_reg, src_next_reg); + + // expand each byte to 2 bytes + exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg); + + exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); + exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); + + // source - dest + exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); + exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); + + // calculate sum + sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); + exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); + sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); + exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); + + // calculate sse + sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); + sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi); + + src+= src_stride; + dst+= dst_stride; + } + // x_offset = 8 and y_offset = 8 + } else if (y_offset == 8) { + __m256i src_next_reg, src_avg; + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1)); + + // average between source and the next byte following source + src_avg = _mm256_avg_epu8(src_reg, src_next_reg); + for (i = 0; i < height ; i++) { + src+= src_stride; + // load source and another source starting from the next + // following byte + destination + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1)); + dst_reg = _mm256_load_si256((__m256i const *) (dst)); + // average between source and the next byte following source + src_reg = _mm256_avg_epu8(src_reg, src_next_reg); + + // expand each byte to 2 bytes + exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); + exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); + + // average between previous average to current average + src_avg = _mm256_avg_epu8(src_avg, src_reg); + // expand each byte to 2 bytes + exp_src_lo = _mm256_unpacklo_epi8(src_avg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_avg, zero_reg); + + // save current source average + src_avg = src_reg; + // source - dest + exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); + exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); + + // calculate sum + sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); + exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); + sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); + exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); + + // calculate sse + sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); + sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi); + + dst+= dst_stride; + } + // x_offset = 8 and y_offset = bilin interpolation + } else { + __m256i filter, pw8, src_next_reg, src_avg; +#if (ARCH_X86_64) + int64_t y_offset64; + y_offset64 = y_offset; + y_offset64 <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset64)); +#else + y_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); +#endif + pw8 = _mm256_set1_epi16(8); + // load source and another source starting from the next + // following byte + src_reg = 
_mm256_loadu_si256((__m256i const *) (src)); + src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1)); + // average between source and the next byte following source + src_avg = _mm256_avg_epu8(src_reg, src_next_reg); + for (i = 0; i < height ; i++) { + src+= src_stride; + // load source and another source starting from the next + // following byte + destination + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1)); + dst_reg = _mm256_load_si256((__m256i const *) (dst)); + // average between source and the next byte following source + src_reg = _mm256_avg_epu8(src_reg, src_next_reg); + + // merge previous average and current average + exp_src_lo = _mm256_unpacklo_epi8(src_avg, src_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_avg, src_reg); + + // filter the source + exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); + exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); + + // add 8 to the source + exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); + exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); + + // divide the source by 16 + exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); + exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); + + // expand each byte to 2 bytes + exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); + exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); + + // save current source average + src_avg = src_reg; + // source - dest + exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); + exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); + + // calculate sum + sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); + exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); + sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); + exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); + + // calculate sse + sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); + sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi); + + dst+= dst_stride; + } + } + // x_offset = bilin interpolation and y_offset = 0 + } else { + if (y_offset == 0) { + __m256i filter, pw8, src_next_reg; +#if (ARCH_X86_64) + int64_t x_offset64; + x_offset64 = x_offset; + x_offset64 <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset64)); +#else + x_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); +#endif + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height ; i++) { + // load source and another source starting from the next + // following byte + destination + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1)); + dst_reg = _mm256_load_si256((__m256i const *) (dst)); + + // merge current and next source + exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg); + + // filter the source + exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); + exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); + + // add 8 to source + exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); + exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); + + // divide the source by 16 + exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); + exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); + + // expand each byte to 2 bytes + exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); + exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); + + // source - dest + exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); + exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); + + // calculate sum 
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); + exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); + sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); + exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); + + // calculate sse + sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); + sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi); + + src+= src_stride; + dst+= dst_stride; + } + // x_offset = bilin interpolation and y_offset = 8 + } else if (y_offset == 8) { + __m256i filter, pw8, src_next_reg, src_pack; +#if (ARCH_X86_64) + int64_t x_offset64; + x_offset64 = x_offset; + x_offset64 <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset64)); +#else + x_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); +#endif + pw8 = _mm256_set1_epi16(8); + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1)); + + // merge current and next stride source + exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg); + + // filter the source + exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); + exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); + + // add 8 to source + exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); + exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); + + // divide source by 16 + exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); + exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); + + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + for (i = 0; i < height ; i++) { + src+= src_stride; + + // load source and another source starting from the next + // following byte + destination + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1)); + dst_reg = _mm256_load_si256((__m256i const *) (dst)); + + // merge current and next stride source + exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg); + + // filter the source + exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); + exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); + + // add 8 to source + exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); + exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); + + // divide source by 16 + exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); + exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); + + // convert each 16 bit to 8 bit to each low and high lane source + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + // average between previous pack to the current + src_pack = _mm256_avg_epu8(src_pack, src_reg); + + // expand each byte to 2 bytes + exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); + exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); + + exp_src_lo = _mm256_unpacklo_epi8(src_pack, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_pack, zero_reg); + + // source - dest + exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); + exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); + + // calculate sum + sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); + exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); + sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); + exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); + + // calculate sse + sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); + sse_reg = 
_mm256_add_epi32(sse_reg, exp_src_hi); + + // save previous pack + src_pack = src_reg; + dst+= dst_stride; + } + // x_offset = bilin interpolation and y_offset = bilin interpolation + } else { + __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; +#if (ARCH_X86_64) + int64_t x_offset64, y_offset64; + x_offset64 = x_offset; + x_offset64 <<= 5; + y_offset64 = y_offset; + y_offset64 <<= 5; + xfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset64)); + yfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset64)); +#else + x_offset <<= 5; + xfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + y_offset <<= 5; + yfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); +#endif + pw8 = _mm256_set1_epi16(8); + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1)); + // merge current and next stride source + exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg); + + // filter the source + exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, xfilter); + exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, xfilter); + + // add 8 to the source + exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); + exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); + + // divide the source by 16 + exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); + exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); + + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + for (i = 0; i < height ; i++) { + src+= src_stride; + // load source and another source starting from the next + // following byte + destination + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1)); + dst_reg = _mm256_load_si256((__m256i const *) (dst)); + + // merge current and next stride source + exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg); + + // filter the source + exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, xfilter); + exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, xfilter); + + // add 8 to source + exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); + exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); + + // divide source by 16 + exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); + exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); + + // convert each 16 bit to 8 bit to each low and high lane source + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + + // merge previous pack to current pack source + exp_src_lo = _mm256_unpacklo_epi8(src_pack, src_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_pack, src_reg); + + // filter the source + exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, yfilter); + exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, yfilter); + + // expand each byte to 2 bytes + exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); + exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); + + // add 8 to source + exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); + exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); + + // divide source by 16 + exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); + exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); + + // source - dest + exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); + exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); + 
+ // caculate sum + sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); + exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); + sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); + exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); + + // calculate sse + sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); + sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi); + + src_pack = src_reg; + dst+= dst_stride; + } + } + } + // sum < 0 + res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); + // save the next 8 bytes of each lane of sse + sse_reg_hi = _mm256_srli_si256(sse_reg, 8); + // merge the result of sum < 0 with sum to add sign to the next 16 bits + sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); + sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); + // add each 8 bytes from every lane of sse and sum + sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); + sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); + + // save the next 4 bytes of each lane sse + sse_reg_hi = _mm256_srli_si256(sse_reg, 4); + // save the next 8 bytes of each lane of sum + sum_reg_hi = _mm256_srli_si256(sum_reg, 8); + + // add the first 4 bytes to the next 4 bytes sse + sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); + // add the first 8 bytes to the next 8 bytes + sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); + // extract the low lane and the high lane and add the results + *((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + + _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); + sum_reg_hi = _mm256_srli_si256(sum_reg, 4); + sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); + sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + + _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1)); + return sum; +} diff --git a/vp9/encoder/x86/vp9_variance_avx2.c b/vp9/encoder/x86/vp9_variance_avx2.c index c9b90d52d..02007a3bd 100644 --- a/vp9/encoder/x86/vp9_variance_avx2.c +++ b/vp9/encoder/x86/vp9_variance_avx2.c @@ -42,6 +42,18 @@ void vp9_get32x32var_avx2 int *Sum ); +unsigned int vp9_sub_pixel_variance32xh_avx2 +( + const uint8_t *src, + int src_stride, + int x_offset, + int y_offset, + const uint8_t *dst, + int dst_stride, + int height, + unsigned int *sse +); + static void variance_avx2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, int w, int h, unsigned int *sse, int *sum, @@ -155,3 +167,43 @@ unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr, *sse = var; return (var - (((int64_t)avg * avg) >> 11)); } + +unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src, + int src_stride, + int x_offset, + int y_offset, + const uint8_t *dst, + int dst_stride, + unsigned int *sse_ptr) { + // processing 32 elements in parallel + unsigned int sse; + int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset, + y_offset, dst, dst_stride, + 64, &sse); + // processing the next 32 elements in parallel + unsigned int sse2; + int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride, + x_offset, y_offset, + dst + 32, dst_stride, + 64, &sse2); + se += se2; + sse += sse2; + *sse_ptr = sse; + return sse - (((int64_t)se * se) >> 12); +} + +unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src, + int src_stride, + int x_offset, + int y_offset, + const uint8_t *dst, + int dst_stride, + unsigned int *sse_ptr) { + // processing 32 element in parallel + unsigned int sse; + int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset, + y_offset, dst, dst_stride, + 32, &sse); + *sse_ptr = sse; + return sse - (((int64_t)se * se) 
>> 10); +} diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index c691411bf..9fb611504 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -23,7 +23,8 @@ VP9_COMMON_SRCS-yes += common/vp9_entropymode.c VP9_COMMON_SRCS-yes += common/vp9_entropymv.c VP9_COMMON_SRCS-yes += common/vp9_filter.c VP9_COMMON_SRCS-yes += common/vp9_filter.h -VP9_COMMON_SRCS-yes += common/generic/vp9_systemdependent.c +VP9_COMMON_SRCS-yes += common/vp9_frame_buffers.c +VP9_COMMON_SRCS-yes += common/vp9_frame_buffers.h VP9_COMMON_SRCS-yes += common/vp9_idct.c VP9_COMMON_SRCS-yes += common/vp9_alloccommon.h VP9_COMMON_SRCS-yes += common/vp9_blockd.h @@ -76,12 +77,15 @@ VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_bilinear_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm +VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_bilinear_ssse3.asm +VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_subpixel_8t_intrin_avx2.c +VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_intrin_ssse3.c ifeq ($(CONFIG_VP9_POSTPROC),yes) VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm endif -ifeq ($(USE_X86INC),yes) +ifeq ($(CONFIG_USE_X86INC),yes) VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_copy_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_intrapred_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_intrapred_ssse3.asm diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 6b181710e..d7713fd3f 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -175,6 +175,23 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK(cfg, ss_number_layers, 1, VPX_SS_MAX_LAYERS); /*Spatial layers max */ + + RANGE_CHECK(cfg, ts_number_layers, 1, VPX_TS_MAX_LAYERS); + if (cfg->ts_number_layers > 1) { + unsigned int i; + for (i = 1; i < cfg->ts_number_layers; ++i) { + if (cfg->ts_target_bitrate[i] < cfg->ts_target_bitrate[i-1]) { + ERROR("ts_target_bitrate entries are not increasing"); + } + } + RANGE_CHECK(cfg, ts_rate_decimator[cfg->ts_number_layers-1], 1, 1); + for (i = cfg->ts_number_layers-2; i > 0; --i) { + if (cfg->ts_rate_decimator[i-1] != 2*cfg->ts_rate_decimator[i]) { + ERROR("ts_rate_decimator factors are not powers of 2"); + } + } + } + /* VP8 does not support a lower bound on the keyframe interval in * automatic keyframe placement mode. */ @@ -205,7 +222,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz); FIRSTPASS_STATS *stats; - if (!cfg->rc_twopass_stats_in.buf) + if (cfg->rc_twopass_stats_in.buf == NULL) ERROR("rc_twopass_stats_in.buf not set."); if (cfg->rc_twopass_stats_in.sz % packet_sz) @@ -247,7 +264,7 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf, vpx_codec_enc_cfg_t cfg, - struct vp9_extracfg vp8_cfg) { + struct vp9_extracfg vp9_cfg) { oxcf->version = cfg.g_profile; oxcf->width = cfg.g_w; oxcf->height = cfg.g_h; @@ -272,30 +289,25 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf, } if (cfg.g_pass == VPX_RC_FIRST_PASS) { - oxcf->allow_lag = 0; oxcf->lag_in_frames = 0; } else { - oxcf->allow_lag = (cfg.g_lag_in_frames) > 0; oxcf->lag_in_frames = cfg.g_lag_in_frames; } - // VBR only supported for now. 
- // CBR code has been deprectated for experimental phase. - // CQ mode not yet tested - oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK; + oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK; if (cfg.rc_end_usage == VPX_CQ) - oxcf->end_usage = USAGE_CONSTRAINED_QUALITY; + oxcf->end_usage = USAGE_CONSTRAINED_QUALITY; else if (cfg.rc_end_usage == VPX_Q) - oxcf->end_usage = USAGE_CONSTANT_QUALITY; + oxcf->end_usage = USAGE_CONSTANT_QUALITY; else if (cfg.rc_end_usage == VPX_CBR) oxcf->end_usage = USAGE_STREAM_FROM_SERVER; oxcf->target_bandwidth = cfg.rc_target_bitrate; - oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct; + oxcf->rc_max_intra_bitrate_pct = vp9_cfg.rc_max_intra_bitrate_pct; oxcf->best_allowed_q = cfg.rc_min_quantizer; oxcf->worst_allowed_q = cfg.rc_max_quantizer; - oxcf->cq_level = vp8_cfg.cq_level; + oxcf->cq_level = vp9_cfg.cq_level; oxcf->fixed_q = -1; oxcf->under_shoot_pct = cfg.rc_undershoot_pct; @@ -316,35 +328,52 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf, // oxcf->kf_min_dist = cfg.kf_min_dis; oxcf->key_freq = cfg.kf_max_dist; - // oxcf->delete_first_pass_file = cfg.g_delete_firstpassfile; - // strcpy(oxcf->first_pass_file, cfg.g_firstpass_file); - - oxcf->cpu_used = vp8_cfg.cpu_used; - oxcf->encode_breakout = vp8_cfg.static_thresh; - oxcf->play_alternate = vp8_cfg.enable_auto_alt_ref; - oxcf->noise_sensitivity = vp8_cfg.noise_sensitivity; - oxcf->sharpness = vp8_cfg.sharpness; + oxcf->cpu_used = vp9_cfg.cpu_used; + oxcf->encode_breakout = vp9_cfg.static_thresh; + oxcf->play_alternate = vp9_cfg.enable_auto_alt_ref; + oxcf->noise_sensitivity = vp9_cfg.noise_sensitivity; + oxcf->sharpness = vp9_cfg.sharpness; oxcf->two_pass_stats_in = cfg.rc_twopass_stats_in; - oxcf->output_pkt_list = vp8_cfg.pkt_list; + oxcf->output_pkt_list = vp9_cfg.pkt_list; - oxcf->arnr_max_frames = vp8_cfg.arnr_max_frames; - oxcf->arnr_strength = vp8_cfg.arnr_strength; - oxcf->arnr_type = vp8_cfg.arnr_type; + oxcf->arnr_max_frames = vp9_cfg.arnr_max_frames; + oxcf->arnr_strength = vp9_cfg.arnr_strength; + oxcf->arnr_type = vp9_cfg.arnr_type; - oxcf->tuning = vp8_cfg.tuning; + oxcf->tuning = vp9_cfg.tuning; - oxcf->tile_columns = vp8_cfg.tile_columns; - oxcf->tile_rows = vp8_cfg.tile_rows; + oxcf->tile_columns = vp9_cfg.tile_columns; + oxcf->tile_rows = vp9_cfg.tile_rows; - oxcf->lossless = vp8_cfg.lossless; + oxcf->lossless = vp9_cfg.lossless; oxcf->error_resilient_mode = cfg.g_error_resilient; - oxcf->frame_parallel_decoding_mode = vp8_cfg.frame_parallel_decoding_mode; + oxcf->frame_parallel_decoding_mode = vp9_cfg.frame_parallel_decoding_mode; - oxcf->aq_mode = vp8_cfg.aq_mode; + oxcf->aq_mode = vp9_cfg.aq_mode; oxcf->ss_number_layers = cfg.ss_number_layers; + + if (oxcf->ss_number_layers > 1) { + memcpy(oxcf->ss_target_bitrate, cfg.ss_target_bitrate, + sizeof(cfg.ss_target_bitrate)); + } else if (oxcf->ss_number_layers == 1) { + oxcf->ss_target_bitrate[0] = oxcf->target_bandwidth; + } + + oxcf->ts_number_layers = cfg.ts_number_layers; + + if (oxcf->ts_number_layers > 1) { + memcpy(oxcf->ts_target_bitrate, cfg.ts_target_bitrate, + sizeof(cfg.ts_target_bitrate)); + memcpy(oxcf->ts_rate_decimator, cfg.ts_rate_decimator, + sizeof(cfg.ts_rate_decimator)); + } else if (oxcf->ts_number_layers == 1) { + oxcf->ts_target_bitrate[0] = (int)oxcf->target_bandwidth; + oxcf->ts_rate_decimator[0] = 1; + } + /* printf("Current VP9 Settings: \n"); printf("target_bandwidth: %d\n", oxcf->target_bandwidth); @@ -352,7 +381,6 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf, 
printf("sharpness: %d\n", oxcf->sharpness); printf("cpu_used: %d\n", oxcf->cpu_used); printf("Mode: %d\n", oxcf->mode); - // printf("delete_first_pass_file: %d\n", oxcf->delete_first_pass_file); printf("auto_key: %d\n", oxcf->auto_key); printf("key_freq: %d\n", oxcf->key_freq); printf("end_usage: %d\n", oxcf->end_usage); @@ -367,7 +395,6 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf, printf("two_pass_vbrbias: %d\n", oxcf->two_pass_vbrbias); printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section); printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section); - printf("allow_lag: %d\n", oxcf->allow_lag); printf("lag_in_frames: %d\n", oxcf->lag_in_frames); printf("play_alternate: %d\n", oxcf->play_alternate); printf("Version: %d\n", oxcf->Version); @@ -396,7 +423,7 @@ static vpx_codec_err_t vp9e_set_config(vpx_codec_alg_priv_t *ctx, res = validate_config(ctx, cfg, &ctx->vp8_cfg); - if (!res) { + if (res == VPX_CODEC_OK) { ctx->cfg = *cfg; set_vp9e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg); vp9_change_config(ctx->cpi, &ctx->oxcf); @@ -416,8 +443,7 @@ static vpx_codec_err_t get_param(vpx_codec_alg_priv_t *ctx, #define MAP(id, var) case id: *(RECAST(id, arg)) = var; break - if (!arg) - return VPX_CODEC_INVALID_PARAM; + if (arg == NULL) return VPX_CODEC_INVALID_PARAM; switch (ctrl_id) { MAP(VP8E_GET_LAST_QUANTIZER, vp9_get_quantizer(ctx->cpi)); @@ -459,7 +485,7 @@ static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx, res = validate_config(ctx, &ctx->cfg, &xcfg); - if (!res) { + if (res == VPX_CODEC_OK) { ctx->vp8_cfg = xcfg; set_vp9e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg); vp9_change_config(ctx->cpi, &ctx->oxcf); @@ -478,12 +504,10 @@ static vpx_codec_err_t vp9e_common_init(vpx_codec_ctx_t *ctx) { VP9_PTR optr; - if (!ctx->priv) { + if (ctx->priv == NULL) { priv = calloc(1, sizeof(struct vpx_codec_alg_priv)); - if (!priv) { - return VPX_CODEC_MEM_ERROR; - } + if (priv == NULL) return VPX_CODEC_MEM_ERROR; ctx->priv = &priv->base; ctx->priv->sz = sizeof(*ctx->priv); @@ -520,21 +544,19 @@ static vpx_codec_err_t vp9e_common_init(vpx_codec_ctx_t *ctx) { priv->cx_data = malloc(priv->cx_data_sz); - if (!priv->cx_data) { - return VPX_CODEC_MEM_ERROR; - } + if (priv->cx_data == NULL) return VPX_CODEC_MEM_ERROR; vp9_initialize_enc(); res = validate_config(priv, &priv->cfg, &priv->vp8_cfg); - if (!res) { + if (res == VPX_CODEC_OK) { set_vp9e_config(&ctx->priv->alg_priv->oxcf, ctx->priv->alg_priv->cfg, ctx->priv->alg_priv->vp8_cfg); optr = vp9_create_compressor(&ctx->priv->alg_priv->oxcf); - if (!optr) + if (optr == NULL) res = VPX_CODEC_MEM_ERROR; else ctx->priv->alg_priv->cpi = optr; @@ -621,7 +643,7 @@ static int write_superframe_index(vpx_codec_alg_priv_t *ctx) { *x++ = marker; for (i = 0; i < ctx->pending_frame_count; i++) { - int this_sz = ctx->pending_frame_sizes[i]; + unsigned int this_sz = (unsigned int)ctx->pending_frame_sizes[i]; for (j = 0; j <= mag; j++) { *x++ = this_sz & 0xff; @@ -702,7 +724,7 @@ static vpx_codec_err_t vp9e_encode(vpx_codec_alg_priv_t *ctx, } /* Initialize the encoder instance on the first frame. 
*/ - if (!res && ctx->cpi) { + if (res == VPX_CODEC_OK && ctx->cpi != NULL) { unsigned int lib_flags; YV12_BUFFER_CONFIG sd; int64_t dst_time_stamp, dst_end_time_stamp; @@ -762,8 +784,8 @@ static vpx_codec_err_t vp9e_encode(vpx_codec_alg_priv_t *ctx, VP9_COMP *cpi = (VP9_COMP *)ctx->cpi; /* Pack invisible frames with the next visible frame */ - if (!cpi->common.show_frame) { - if (!ctx->pending_cx_data) + if (cpi->common.show_frame == 0) { + if (ctx->pending_cx_data == 0) ctx->pending_cx_data = cx_data; ctx->pending_cx_data_sz += size; ctx->pending_frame_sizes[ctx->pending_frame_count++] = size; @@ -788,7 +810,7 @@ static vpx_codec_err_t vp9e_encode(vpx_codec_alg_priv_t *ctx, if (lib_flags & FRAMEFLAGS_KEY) pkt.data.frame.flags |= VPX_FRAME_IS_KEY; - if (!cpi->common.show_frame) { + if (cpi->common.show_frame == 0) { pkt.data.frame.flags |= VPX_FRAME_IS_INVISIBLE; // This timestamp should be as close as possible to the @@ -862,10 +884,9 @@ static const vpx_codec_cx_pkt_t *vp9e_get_cxdata(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t vp9e_set_reference(vpx_codec_alg_priv_t *ctx, int ctr_id, va_list args) { - vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); + vpx_ref_frame_t *frame = va_arg(args, vpx_ref_frame_t *); - if (data) { - vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data; + if (frame != NULL) { YV12_BUFFER_CONFIG sd; image2yuvconfig(&frame->img, &sd); @@ -880,10 +901,9 @@ static vpx_codec_err_t vp9e_set_reference(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t vp9e_copy_reference(vpx_codec_alg_priv_t *ctx, int ctr_id, va_list args) { - vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); + vpx_ref_frame_t *frame = va_arg(args, vpx_ref_frame_t *); - if (data) { - vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data; + if (frame != NULL) { YV12_BUFFER_CONFIG sd; image2yuvconfig(&frame->img, &sd); @@ -898,13 +918,13 @@ static vpx_codec_err_t vp9e_copy_reference(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx, int ctr_id, va_list args) { - vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *); + vp9_ref_frame_t *frame = va_arg(args, vp9_ref_frame_t *); - if (data) { + if (frame != NULL) { YV12_BUFFER_CONFIG* fb; - vp9_get_reference_enc(ctx->cpi, data->idx, &fb); - yuvconfig2image(&data->img, fb, NULL); + vp9_get_reference_enc(ctx->cpi, frame->idx, &fb); + yuvconfig2image(&frame->img, fb, NULL); return VPX_CODEC_OK; } else { return VPX_CODEC_INVALID_PARAM; @@ -915,11 +935,11 @@ static vpx_codec_err_t vp9e_set_previewpp(vpx_codec_alg_priv_t *ctx, int ctr_id, va_list args) { #if CONFIG_VP9_POSTPROC - vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *); + vp8_postproc_cfg_t *config = va_arg(args, vp8_postproc_cfg_t *); (void)ctr_id; - if (data) { - ctx->preview_ppcfg = *((vp8_postproc_cfg_t *)data); + if (config != NULL) { + ctx->preview_ppcfg = *config; return VPX_CODEC_OK; } else { return VPX_CODEC_INVALID_PARAM; @@ -993,20 +1013,14 @@ static vpx_codec_err_t vp9e_set_activemap(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t vp9e_set_scalemode(vpx_codec_alg_priv_t *ctx, int ctr_id, va_list args) { - vpx_scaling_mode_t *data = va_arg(args, vpx_scaling_mode_t *); + vpx_scaling_mode_t *scalemode = va_arg(args, vpx_scaling_mode_t *); - if (data) { + if (scalemode != NULL) { int res; - vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data; res = vp9_set_internal_size(ctx->cpi, - (VPX_SCALING)scalemode.h_scaling_mode, - (VPX_SCALING)scalemode.v_scaling_mode); - - if (!res) { - return VPX_CODEC_OK; - } else { - return 
VPX_CODEC_INVALID_PARAM; - } + (VPX_SCALING)scalemode->h_scaling_mode, + (VPX_SCALING)scalemode->v_scaling_mode); + return (res == 0) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM; } else { return VPX_CODEC_INVALID_PARAM; } @@ -1016,32 +1030,54 @@ static vpx_codec_err_t vp9e_set_svc(vpx_codec_alg_priv_t *ctx, int ctr_id, va_list args) { int data = va_arg(args, int); vp9_set_svc(ctx->cpi, data); + // CBR mode for SVC with both temporal and spatial layers not yet supported. + if (data == 1 && + ctx->cfg.rc_end_usage == VPX_CBR && + ctx->cfg.ss_number_layers > 1 && + ctx->cfg.ts_number_layers > 1) { + return VPX_CODEC_INVALID_PARAM; + } + return VPX_CODEC_OK; +} + +static vpx_codec_err_t vp9e_set_svc_layer_id(vpx_codec_alg_priv_t *ctx, + int ctr_id, + va_list args) { + vpx_svc_layer_id_t *data = va_arg(args, vpx_svc_layer_id_t *); + VP9_COMP *cpi = (VP9_COMP *)ctx->cpi; + cpi->svc.spatial_layer_id = data->spatial_layer_id; + cpi->svc.temporal_layer_id = data->temporal_layer_id; + // Checks on valid layer_id input. + if (cpi->svc.temporal_layer_id < 0 || + cpi->svc.temporal_layer_id >= (int)ctx->cfg.ts_number_layers) { + return VPX_CODEC_INVALID_PARAM; + } + if (cpi->svc.spatial_layer_id < 0 || + cpi->svc.spatial_layer_id >= (int)ctx->cfg.ss_number_layers) { + return VPX_CODEC_INVALID_PARAM; + } return VPX_CODEC_OK; } static vpx_codec_err_t vp9e_set_svc_parameters(vpx_codec_alg_priv_t *ctx, int ctr_id, va_list args) { - vpx_svc_parameters_t *data = va_arg(args, vpx_svc_parameters_t *); VP9_COMP *cpi = (VP9_COMP *)ctx->cpi; - vpx_svc_parameters_t params; + vpx_svc_parameters_t *params = va_arg(args, vpx_svc_parameters_t *); - if (data == NULL) { - return VPX_CODEC_INVALID_PARAM; - } + if (params == NULL) return VPX_CODEC_INVALID_PARAM; - params = *(vpx_svc_parameters_t *)data; + cpi->svc.spatial_layer_id = params->spatial_layer; + cpi->svc.temporal_layer_id = params->temporal_layer; - cpi->current_layer = params.layer; - cpi->lst_fb_idx = params.lst_fb_idx; - cpi->gld_fb_idx = params.gld_fb_idx; - cpi->alt_fb_idx = params.alt_fb_idx; + cpi->lst_fb_idx = params->lst_fb_idx; + cpi->gld_fb_idx = params->gld_fb_idx; + cpi->alt_fb_idx = params->alt_fb_idx; - if (vp9_set_size_literal(ctx->cpi, params.width, params.height) != 0) { + if (vp9_set_size_literal(ctx->cpi, params->width, params->height) != 0) return VPX_CODEC_INVALID_PARAM; - } - ctx->cfg.rc_max_quantizer = params.max_quantizer; - ctx->cfg.rc_min_quantizer = params.min_quantizer; + ctx->cfg.rc_max_quantizer = params->max_quantizer; + ctx->cfg.rc_min_quantizer = params->min_quantizer; set_vp9e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg); vp9_change_config(ctx->cpi, &ctx->oxcf); @@ -1080,6 +1116,7 @@ static vpx_codec_ctrl_fn_map_t vp9e_ctf_maps[] = { {VP9_GET_REFERENCE, get_reference}, {VP9E_SET_SVC, vp9e_set_svc}, {VP9E_SET_SVC_PARAMETERS, vp9e_set_svc_parameters}, + {VP9E_SET_SVC_LAYER_ID, vp9e_set_svc_layer_id}, { -1, NULL}, }; @@ -1130,9 +1167,13 @@ static vpx_codec_enc_cfg_map_t vp9e_usage_cfg_map[] = { 9999, /* kf_max_dist */ VPX_SS_DEFAULT_LAYERS, /* ss_number_layers */ - + {0}, /* ss_target_bitrate */ + 1, /* ts_number_layers */ + {0}, /* ts_target_bitrate */ + {0}, /* ts_rate_decimator */ + 0, /* ts_periodicity */ + {0}, /* ts_layer_id */ #if VPX_ENCODER_ABI_VERSION == (1 + VPX_CODEC_ABI_VERSION) - 1, /* g_delete_first_pass_file */ "vp8.fpf" /* first pass filename */ #endif } diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index 92c6cd20c..b85e17237 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -15,6 +15,7 @@ #include 
"vpx/vp8dx.h" #include "vpx/internal/vpx_codec_internal.h" #include "./vpx_version.h" +#include "vp9/common/vp9_frame_buffers.h" #include "vp9/decoder/vp9_onyxd.h" #include "vp9/decoder/vp9_onyxd_int.h" #include "vp9/decoder/vp9_read_bit_buffer.h" @@ -59,6 +60,11 @@ struct vpx_codec_alg_priv { int img_setup; int img_avail; int invert_tile_order; + + // External frame buffer info to save for VP9 common. + void *ext_priv; // Private data associated with the external frame buffers. + vpx_get_frame_buffer_cb_fn_t get_ext_fb_cb; + vpx_release_frame_buffer_cb_fn_t release_ext_fb_cb; }; static unsigned long priv_sz(const vpx_codec_dec_cfg_t *si, @@ -148,14 +154,12 @@ static vpx_codec_err_t vp9_peek_si(const uint8_t *data, unsigned int data_sz, { struct vp9_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL }; const int frame_marker = vp9_rb_read_literal(&rb, 2); - const int version = vp9_rb_read_bit(&rb) | (vp9_rb_read_bit(&rb) << 1); + const int version = vp9_rb_read_bit(&rb); + (void) vp9_rb_read_bit(&rb); // unused version bit + if (frame_marker != VP9_FRAME_MARKER) return VPX_CODEC_UNSUP_BITSTREAM; -#if CONFIG_NON420 if (version > 1) return VPX_CODEC_UNSUP_BITSTREAM; -#else - if (version != 0) return VPX_CODEC_UNSUP_BITSTREAM; -#endif if (vp9_rb_read_bit(&rb)) { // show an existing frame return VPX_CODEC_OK; @@ -206,7 +210,7 @@ static vpx_codec_err_t vp9_get_si(vpx_codec_alg_priv_t *ctx, ? sizeof(vp9_stream_info_t) : sizeof(vpx_codec_stream_info_t); memcpy(si, &ctx->si, sz); - si->sz = sz; + si->sz = (unsigned int)sz; return VPX_CODEC_OK; } @@ -291,10 +295,31 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, ctx->postproc_cfg.noise_level = 0; } - if (!optr) + if (!optr) { res = VPX_CODEC_ERROR; - else + } else { + VP9D_COMP *const pbi = (VP9D_COMP*)optr; + VP9_COMMON *const cm = &pbi->common; + + // Set index to not initialized. 
+ cm->new_fb_idx = -1; + + if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) { + cm->get_fb_cb = ctx->get_ext_fb_cb; + cm->release_fb_cb = ctx->release_ext_fb_cb; + cm->cb_priv = ctx->ext_priv; + } else { + cm->get_fb_cb = vp9_get_frame_buffer; + cm->release_fb_cb = vp9_release_frame_buffer; + + if (vp9_alloc_internal_frame_buffers(&cm->int_frame_buffers)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to initialize internal frame buffers"); + cm->cb_priv = &cm->int_frame_buffers; + } + ctx->pbi = optr; + } } ctx->decoder_init = 1; @@ -332,7 +357,11 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, if (!res && 0 == vp9_get_raw_frame(ctx->pbi, &sd, &time_stamp, &time_end_stamp, &flags)) { + VP9D_COMP *const pbi = (VP9D_COMP*)ctx->pbi; + VP9_COMMON *const cm = &pbi->common; yuvconfig2image(&ctx->img, &sd, user_priv); + + ctx->img.fb_priv = cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv; ctx->img_avail = 1; } } @@ -429,7 +458,7 @@ static vpx_codec_err_t vp9_decode(vpx_codec_alg_priv_t *ctx, while (data_start < data_end && *data_start == 0) data_start++; - data_sz = data_end - data_start; + data_sz = (unsigned int)(data_end - data_start); } while (data_start < data_end); return res; } @@ -452,6 +481,24 @@ static vpx_image_t *vp9_get_frame(vpx_codec_alg_priv_t *ctx, return img; } +static vpx_codec_err_t vp9_set_fb_fn( + vpx_codec_alg_priv_t *ctx, + vpx_get_frame_buffer_cb_fn_t cb_get, + vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) { + if (cb_get == NULL || cb_release == NULL) { + return VPX_CODEC_INVALID_PARAM; + } else if (ctx->pbi == NULL) { + // If the decoder has already been initialized, do not accept changes to + // the frame buffer functions. + ctx->get_ext_fb_cb = cb_get; + ctx->release_ext_fb_cb = cb_release; + ctx->ext_priv = cb_priv; + return VPX_CODEC_OK; + } + + return VPX_CODEC_ERROR; +} + static vpx_codec_err_t vp9_xma_get_mmap(const vpx_codec_ctx_t *ctx, vpx_codec_mmap_t *mmap, vpx_codec_iter_t *iter) { @@ -685,7 +732,8 @@ static vpx_codec_ctrl_fn_map_t ctf_maps[] = { CODEC_INTERFACE(vpx_codec_vp9_dx) = { "WebM Project VP9 Decoder" VERSION_STRING, VPX_CODEC_INTERNAL_ABI_VERSION, - VPX_CODEC_CAP_DECODER | VP9_CAP_POSTPROC, + VPX_CODEC_CAP_DECODER | VP9_CAP_POSTPROC | + VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER, /* vpx_codec_caps_t caps; */ vp9_init, /* vpx_codec_init_fn_t init; */ vp9_destroy, /* vpx_codec_destroy_fn_t destroy; */ @@ -697,6 +745,7 @@ CODEC_INTERFACE(vpx_codec_vp9_dx) = { vp9_get_si, /* vpx_codec_get_si_fn_t get_si; */ vp9_decode, /* vpx_codec_decode_fn_t decode; */ vp9_get_frame, /* vpx_codec_frame_get_fn_t frame_get; */ + vp9_set_fb_fn, /* vpx_codec_set_fb_fn_t set_fb_fn; */ }, { // NOLINT /* encoder functions */ diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 63003b9c2..6679f89be 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -19,7 +19,6 @@ VP9_CX_SRCS-yes += vp9_cx_iface.c VP9_CX_SRCS-yes += encoder/vp9_bitstream.c VP9_CX_SRCS-yes += encoder/vp9_dct.c -VP9_CX_SRCS-yes += encoder/vp9_dct.h VP9_CX_SRCS-yes += encoder/vp9_encodeframe.c VP9_CX_SRCS-yes += encoder/vp9_encodeframe.h VP9_CX_SRCS-yes += encoder/vp9_encodemb.c @@ -39,7 +38,6 @@ VP9_CX_SRCS-yes += encoder/vp9_lookahead.c VP9_CX_SRCS-yes += encoder/vp9_lookahead.h VP9_CX_SRCS-yes += encoder/vp9_mcomp.h VP9_CX_SRCS-yes += encoder/vp9_onyx_int.h -VP9_CX_SRCS-yes += encoder/vp9_psnr.h VP9_CX_SRCS-yes += encoder/vp9_quantize.h VP9_CX_SRCS-yes += encoder/vp9_ratectrl.h VP9_CX_SRCS-yes += encoder/vp9_rdopt.h @@ -51,12 +49,11 @@ VP9_CX_SRCS-yes += 
encoder/vp9_mcomp.c VP9_CX_SRCS-yes += encoder/vp9_onyx_if.c VP9_CX_SRCS-yes += encoder/vp9_picklpf.c VP9_CX_SRCS-yes += encoder/vp9_picklpf.h -VP9_CX_SRCS-yes += encoder/vp9_psnr.c VP9_CX_SRCS-yes += encoder/vp9_quantize.c VP9_CX_SRCS-yes += encoder/vp9_ratectrl.c VP9_CX_SRCS-yes += encoder/vp9_rdopt.c VP9_CX_SRCS-yes += encoder/vp9_pickmode.c -VP9_CX_SRCS-yes += encoder/vp9_sad_c.c +VP9_CX_SRCS-yes += encoder/vp9_sad.c VP9_CX_SRCS-yes += encoder/vp9_segmentation.c VP9_CX_SRCS-yes += encoder/vp9_segmentation.h VP9_CX_SRCS-yes += encoder/vp9_subexp.c @@ -66,7 +63,7 @@ VP9_CX_SRCS-yes += encoder/vp9_resize.h VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_ssim.c VP9_CX_SRCS-yes += encoder/vp9_tokenize.c VP9_CX_SRCS-yes += encoder/vp9_treewriter.c -VP9_CX_SRCS-yes += encoder/vp9_variance_c.c +VP9_CX_SRCS-yes += encoder/vp9_variance.c VP9_CX_SRCS-yes += encoder/vp9_vaq.c VP9_CX_SRCS-yes += encoder/vp9_vaq.h ifeq ($(CONFIG_VP9_POSTPROC),yes) @@ -87,10 +84,11 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance_impl_sse2.asm +VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm -ifeq ($(USE_X86INC),yes) +ifeq ($(CONFIG_USE_X86INC),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm |
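[Editor's note on the external frame buffer support added in vp9_dx_iface.c] vp9_set_fb_fn accepts get/release callbacks only while ctx->pbi is still NULL, i.e. before the first decode creates the decoder instance; decode_one then wires them into VP9_COMMON (or falls back to the internal allocator from vp9_frame_buffers.c), and the returned image's fb_priv points at the buffer backing the shown frame. A minimal caller-side sketch follows; it assumes the public vpx_codec_frame_buffer_t layout (data/size/priv) and the vpx_codec_set_frame_buffer_functions() setter from vpx/vpx_frame_buffer.h and vpx/vpx_decoder.h, none of which is defined in this patch, so treat those names as assumptions. The allocator itself is purely illustrative.

#include <stdint.h>
#include <stdlib.h>

#include "vpx/vpx_decoder.h"
#include "vpx/vpx_frame_buffer.h"

// Hand the decoder a zero-initialized buffer of at least min_size bytes.
// Returns 0 on success, a negative value on allocation failure.
static int example_get_frame_buffer(void *priv, size_t min_size,
                                    vpx_codec_frame_buffer_t *fb) {
  (void)priv;
  fb->data = (uint8_t *)calloc(min_size, 1);
  if (fb->data == NULL) return -1;
  fb->size = min_size;
  fb->priv = fb->data;  // remember the allocation so release can free it
  return 0;
}

// Called once the decoder no longer references the buffer.
static int example_release_frame_buffer(void *priv,
                                        vpx_codec_frame_buffer_t *fb) {
  (void)priv;
  free(fb->priv);
  fb->data = NULL;
  fb->priv = NULL;
  return 0;
}

// Registration must happen after vpx_codec_dec_init() but before the first
// vpx_codec_decode() call, matching the ctx->pbi == NULL check in
// vp9_set_fb_fn() above:
//
//   vpx_codec_set_frame_buffer_functions(&codec, example_get_frame_buffer,
//                                        example_release_frame_buffer, NULL);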