diff options
Diffstat (limited to 'vp9')
29 files changed, 810 insertions, 443 deletions
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index 8ca356dd6..9088b0bde 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -228,8 +228,6 @@ typedef struct macroblockd { DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]); int lossless; - /* Inverse transform function pointers. */ - void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob); int corrupted; diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h index d86877622..3253bcbf4 100644 --- a/vp9/common/vp9_idct.h +++ b/vp9/common/vp9_idct.h @@ -33,6 +33,9 @@ extern "C" { #define pair_set_epi16(a, b) \ _mm_set_epi16(b, a, b, a, b, a, b, a) +#define dual_set_epi16(a, b) \ + _mm_set_epi16(b, b, b, b, a, a, a, a) + // Constants: // for (int i = 1; i< 32; ++i) // printf("static const int cospi_%d_64 = %.0f;\n", i, diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index c300cde62..09ce72ef2 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -360,7 +360,7 @@ specialize qw/vp9_idct16x16_256_add sse2 ssse3 neon_asm dspr2/; $vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon; add_proto qw/void vp9_idct16x16_10_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; -specialize qw/vp9_idct16x16_10_add sse2 neon_asm dspr2/; +specialize qw/vp9_idct16x16_10_add sse2 ssse3 neon_asm dspr2/; $vp9_idct16x16_10_add_neon_asm=vp9_idct16x16_10_add_neon; add_proto qw/void vp9_idct32x32_1024_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; @@ -422,10 +422,6 @@ specialize qw/vp9_variance64x64/, "$sse2_x86inc", "$avx2_x86inc"; add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_variance16x16 mmx/, "$sse2_x86inc", "$avx2_x86inc"; -add_proto qw/void vp9_get_sse_sum_16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; -specialize qw/vp9_get_sse_sum_16x16 sse2/; -$vp9_get_sse_sum_16x16_sse2=vp9_get16x16var_sse2; - add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_variance16x8 mmx/, "$sse2_x86inc"; @@ -435,9 +431,11 @@ specialize qw/vp9_variance8x16 mmx/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_variance8x8 mmx/, "$sse2_x86inc"; -add_proto qw/void vp9_get_sse_sum_8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; -specialize qw/vp9_get_sse_sum_8x8 sse2/; -$vp9_get_sse_sum_8x8_sse2=vp9_get8x8var_sse2; +add_proto qw/void vp9_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; +specialize qw/vp9_get8x8var mmx/, "$sse2_x86inc"; + +add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; +specialize qw/vp9_get16x16var avx2/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_variance8x4/, "$sse2_x86inc"; diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c index ff9c43221..b60f8a06d 100644 --- a/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -375,15 +375,6 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ } -#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - \ - in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ - in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ - } - #define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \ { \ const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ @@ -612,23 +603,6 @@ void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { RECON_AND_STORE(dest, dc_value); } -static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - - out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4); - out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4); - out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6); - out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6); -} - static void idct8_sse2(__m128i *in) { const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.h b/vp9/common/x86/vp9_idct_intrin_sse2.h index 1c62e3272..0f179b49a 100644 --- a/vp9/common/x86/vp9_idct_intrin_sse2.h +++ b/vp9/common/x86/vp9_idct_intrin_sse2.h @@ -45,6 +45,32 @@ static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); } +#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + \ + in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ + in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ + } + +static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) { + const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); + + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); + + out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4); + out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4); + out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6); + out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6); +} + static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { __m128i tbuf[8]; array_transpose_8x8(res0, res0); diff --git a/vp9/common/x86/vp9_idct_intrin_ssse3.c b/vp9/common/x86/vp9_idct_intrin_ssse3.c index e5d3cb5f4..73bf5d1d7 100644 --- a/vp9/common/x86/vp9_idct_intrin_ssse3.c +++ b/vp9/common/x86/vp9_idct_intrin_ssse3.c @@ -16,7 +16,7 @@ #include <tmmintrin.h> // SSSE3 #include "vp9/common/x86/vp9_idct_intrin_sse2.h" -static void idct16_8col(__m128i *in) { +static void idct16_8col(__m128i *in, int round) { const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); @@ -36,6 +36,8 @@ static void idct16_8col(__m128i *in) { const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i k__cospi_p16_p16_x2 = pair_set_epi16(23170, 23170); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); __m128i v[16], u[16], s[16], t[16]; @@ -266,28 +268,80 @@ static void idct16_8col(__m128i *in) { t[15] = _mm_add_epi16(s[12], s[15]); // stage 6 - s[0] = _mm_add_epi16(t[0], t[7]); - s[1] = _mm_add_epi16(t[1], t[6]); - s[2] = _mm_add_epi16(t[2], t[5]); - s[3] = _mm_add_epi16(t[3], t[4]); - s[4] = _mm_sub_epi16(t[3], t[4]); - s[5] = _mm_sub_epi16(t[2], t[5]); - s[6] = _mm_sub_epi16(t[1], t[6]); - s[7] = _mm_sub_epi16(t[0], t[7]); - s[8] = t[8]; - s[9] = t[9]; - - u[0] = _mm_sub_epi16(t[13], t[10]); - u[1] = _mm_add_epi16(t[13], t[10]); - u[2] = _mm_sub_epi16(t[12], t[11]); - u[3] = _mm_add_epi16(t[12], t[11]); - - s[10] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2); - s[13] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2); - s[11] = _mm_mulhrs_epi16(u[2], k__cospi_p16_p16_x2); - s[12] = _mm_mulhrs_epi16(u[3], k__cospi_p16_p16_x2); - s[14] = t[14]; - s[15] = t[15]; + if (round == 1) { + s[0] = _mm_add_epi16(t[0], t[7]); + s[1] = _mm_add_epi16(t[1], t[6]); + s[2] = _mm_add_epi16(t[2], t[5]); + s[3] = _mm_add_epi16(t[3], t[4]); + s[4] = _mm_sub_epi16(t[3], t[4]); + s[5] = _mm_sub_epi16(t[2], t[5]); + s[6] = _mm_sub_epi16(t[1], t[6]); + s[7] = _mm_sub_epi16(t[0], t[7]); + s[8] = t[8]; + s[9] = t[9]; + + u[0] = _mm_unpacklo_epi16(t[10], t[13]); + u[1] = _mm_unpackhi_epi16(t[10], t[13]); + u[2] = _mm_unpacklo_epi16(t[11], t[12]); + u[3] = _mm_unpackhi_epi16(t[11], t[12]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16); + v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16); + v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16); + v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + s[10] = _mm_packs_epi32(u[0], u[1]); + s[13] = _mm_packs_epi32(u[2], u[3]); + s[11] = _mm_packs_epi32(u[4], u[5]); + s[12] = _mm_packs_epi32(u[6], u[7]); + s[14] = t[14]; + s[15] = t[15]; + } else { + s[0] = _mm_add_epi16(t[0], t[7]); + s[1] = _mm_add_epi16(t[1], t[6]); + s[2] = _mm_add_epi16(t[2], t[5]); + s[3] = _mm_add_epi16(t[3], t[4]); + s[4] = _mm_sub_epi16(t[3], t[4]); + s[5] = _mm_sub_epi16(t[2], t[5]); + s[6] = _mm_sub_epi16(t[1], t[6]); + s[7] = _mm_sub_epi16(t[0], t[7]); + s[8] = t[8]; + s[9] = t[9]; + + u[0] = _mm_sub_epi16(t[13], t[10]); + u[1] = _mm_add_epi16(t[13], t[10]); + u[2] = _mm_sub_epi16(t[12], t[11]); + u[3] = _mm_add_epi16(t[12], t[11]); + + s[10] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2); + s[13] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2); + s[11] = _mm_mulhrs_epi16(u[2], k__cospi_p16_p16_x2); + s[12] = _mm_mulhrs_epi16(u[3], k__cospi_p16_p16_x2); + s[14] = t[14]; + s[15] = t[15]; + } // stage 7 in[0] = _mm_add_epi16(s[0], s[15]); @@ -308,10 +362,10 @@ static void idct16_8col(__m128i *in) { in[15] = _mm_sub_epi16(s[0], s[15]); } -static void idct16_sse2(__m128i *in0, __m128i *in1) { +static void idct16_sse2(__m128i *in0, __m128i *in1, int round) { array_transpose_16x16(in0, in1); - idct16_8col(in0); - idct16_8col(in1); + idct16_8col(in0, round); + idct16_8col(in1, round); } void vp9_idct16x16_256_add_ssse3(const int16_t *input, uint8_t *dest, @@ -322,10 +376,387 @@ void vp9_idct16x16_256_add_ssse3(const int16_t *input, uint8_t *dest, input += 8; load_buffer_8x16(input, in1); - idct16_sse2(in0, in1); - idct16_sse2(in0, in1); + idct16_sse2(in0, in1, 0); + idct16_sse2(in0, in1, 1); write_buffer_8x16(dest, in0, stride); dest += 8; write_buffer_8x16(dest, in1, stride); } + +static void idct16_10_r1(__m128i *in, __m128i *l) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i zero = _mm_setzero_si128(); + + const __m128i stg2_01 = dual_set_epi16(3212, 32610); + const __m128i stg2_67 = dual_set_epi16(-9512, 31358); + const __m128i stg3_01 = dual_set_epi16(6392, 32138); + const __m128i stg4_01 = dual_set_epi16(23170, 23170); + + + + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); + + __m128i stp1_0, stp1_1, stp1_4, stp1_6, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4; + + // Stage2 + { + const __m128i lo_1_15 = _mm_unpackhi_epi64(in[0], in[0]); + const __m128i lo_13_3 = _mm_unpackhi_epi64(in[1], in[1]); + + stp2_8 = _mm_mulhrs_epi16(lo_1_15, stg2_01); + stp2_11 = _mm_mulhrs_epi16(lo_13_3, stg2_67); + } + + // Stage3 + { + const __m128i lo_2_14 = _mm_unpacklo_epi64(in[1], in[1]); + stp1_4 = _mm_mulhrs_epi16(lo_2_14, stg3_01); + + stp1_13 = _mm_unpackhi_epi64(stp2_11, zero); + stp1_14 = _mm_unpackhi_epi64(stp2_8, zero); + } + + // Stage4 + { + const __m128i lo_0_8 = _mm_unpacklo_epi64(in[0], in[0]); + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13); + + tmp0 = _mm_mulhrs_epi16(lo_0_8, stg4_01); + tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); + tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); + tmp2 = _mm_madd_epi16(lo_10_13, stg4_6); + tmp4 = _mm_madd_epi16(lo_10_13, stg4_7); + + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + + stp1_0 = _mm_unpacklo_epi64(tmp0, tmp0); + stp1_1 = _mm_unpackhi_epi64(tmp0, tmp0); + stp2_9 = _mm_packs_epi32(tmp1, tmp3); + stp2_10 = _mm_packs_epi32(tmp2, tmp4); + + stp2_6 = _mm_unpackhi_epi64(stp1_4, zero); + } + + // Stage5 and Stage6 + { + tmp0 = _mm_add_epi16(stp2_8, stp2_11); + tmp1 = _mm_sub_epi16(stp2_8, stp2_11); + tmp2 = _mm_add_epi16(stp2_9, stp2_10); + tmp3 = _mm_sub_epi16(stp2_9, stp2_10); + + stp1_9 = _mm_unpacklo_epi64(tmp2, zero); + stp1_10 = _mm_unpacklo_epi64(tmp3, zero); + stp1_8 = _mm_unpacklo_epi64(tmp0, zero); + stp1_11 = _mm_unpacklo_epi64(tmp1, zero); + + stp1_13 = _mm_unpackhi_epi64(tmp3, zero); + stp1_14 = _mm_unpackhi_epi64(tmp2, zero); + stp1_12 = _mm_unpackhi_epi64(tmp1, zero); + stp1_15 = _mm_unpackhi_epi64(tmp0, zero); + } + + // Stage6 + { + const __m128i lo_6_5 = _mm_add_epi16(stp2_6, stp1_4); + const __m128i lo_6_6 = _mm_sub_epi16(stp2_6, stp1_4); + const __m128i lo_10_13 = _mm_sub_epi16(stp1_13, stp1_10); + const __m128i lo_10_14 = _mm_add_epi16(stp1_13, stp1_10); + const __m128i lo_11_12 = _mm_sub_epi16(stp1_12, stp1_11); + const __m128i lo_11_13 = _mm_add_epi16(stp1_12, stp1_11); + + tmp1 = _mm_unpacklo_epi64(lo_6_5, lo_6_6); + tmp0 = _mm_unpacklo_epi64(lo_10_13, lo_10_14); + tmp4 = _mm_unpacklo_epi64(lo_11_12, lo_11_13); + + stp1_6 = _mm_mulhrs_epi16(tmp1, stg4_01); + tmp0 = _mm_mulhrs_epi16(tmp0, stg4_01); + tmp4 = _mm_mulhrs_epi16(tmp4, stg4_01); + + stp2_10 = _mm_unpacklo_epi64(tmp0, zero); + stp2_13 = _mm_unpackhi_epi64(tmp0, zero); + stp2_11 = _mm_unpacklo_epi64(tmp4, zero); + stp2_12 = _mm_unpackhi_epi64(tmp4, zero); + + tmp0 = _mm_add_epi16(stp1_0, stp1_4); + tmp1 = _mm_sub_epi16(stp1_0, stp1_4); + tmp2 = _mm_add_epi16(stp1_1, stp1_6); + tmp3 = _mm_sub_epi16(stp1_1, stp1_6); + + stp2_0 = _mm_unpackhi_epi64(tmp0, zero); + stp2_1 = _mm_unpacklo_epi64(tmp2, zero); + stp2_2 = _mm_unpackhi_epi64(tmp2, zero); + stp2_3 = _mm_unpacklo_epi64(tmp0, zero); + stp2_4 = _mm_unpacklo_epi64(tmp1, zero); + stp2_5 = _mm_unpackhi_epi64(tmp3, zero); + stp2_6 = _mm_unpacklo_epi64(tmp3, zero); + stp2_7 = _mm_unpackhi_epi64(tmp1, zero); + } + + // Stage7. Left 8x16 only. + l[0] = _mm_add_epi16(stp2_0, stp1_15); + l[1] = _mm_add_epi16(stp2_1, stp1_14); + l[2] = _mm_add_epi16(stp2_2, stp2_13); + l[3] = _mm_add_epi16(stp2_3, stp2_12); + l[4] = _mm_add_epi16(stp2_4, stp2_11); + l[5] = _mm_add_epi16(stp2_5, stp2_10); + l[6] = _mm_add_epi16(stp2_6, stp1_9); + l[7] = _mm_add_epi16(stp2_7, stp1_8); + l[8] = _mm_sub_epi16(stp2_7, stp1_8); + l[9] = _mm_sub_epi16(stp2_6, stp1_9); + l[10] = _mm_sub_epi16(stp2_5, stp2_10); + l[11] = _mm_sub_epi16(stp2_4, stp2_11); + l[12] = _mm_sub_epi16(stp2_3, stp2_12); + l[13] = _mm_sub_epi16(stp2_2, stp2_13); + l[14] = _mm_sub_epi16(stp2_1, stp1_14); + l[15] = _mm_sub_epi16(stp2_0, stp1_15); +} + +static void idct16_10_r2(__m128i *in) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + + const __m128i stg2_0 = dual_set_epi16(3212, 3212); + const __m128i stg2_1 = dual_set_epi16(32610, 32610); + const __m128i stg2_6 = dual_set_epi16(-9512, -9512); + const __m128i stg2_7 = dual_set_epi16(31358, 31358); + const __m128i stg3_0 = dual_set_epi16(6392, 6392); + const __m128i stg3_1 = dual_set_epi16(32138, 32138); + const __m128i stg4_01 = dual_set_epi16(23170, 23170); + + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); + + __m128i stp1_0, stp1_2, stp1_3, stp1_5, stp1_6, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_8_0, stp1_12_0; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + /* Stage2 */ + { + stp1_8_0 = _mm_mulhrs_epi16(in[1], stg2_0); + stp1_15 = _mm_mulhrs_epi16(in[1], stg2_1); + stp1_11 = _mm_mulhrs_epi16(in[3], stg2_6); + stp1_12_0 = _mm_mulhrs_epi16(in[3], stg2_7); + } + + /* Stage3 */ + { + stp2_4 = _mm_mulhrs_epi16(in[2], stg3_0); + stp2_7 = _mm_mulhrs_epi16(in[2], stg3_1); + + stp1_9 = stp1_8_0; + stp1_10 = stp1_11; + + stp1_13 = stp1_12_0; + stp1_14 = stp1_15; + } + + /* Stage4 */ + { + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); + + stp1_0 = _mm_mulhrs_epi16(in[0], stg4_01); + + stp2_5 = stp2_4; + stp2_6 = stp2_7; + + + tmp0 = _mm_madd_epi16(lo_9_14, stg4_4); + tmp1 = _mm_madd_epi16(hi_9_14, stg4_4); + tmp2 = _mm_madd_epi16(lo_9_14, stg4_5); + tmp3 = _mm_madd_epi16(hi_9_14, stg4_5); + tmp4 = _mm_madd_epi16(lo_10_13, stg4_6); + tmp5 = _mm_madd_epi16(hi_10_13, stg4_6); + tmp6 = _mm_madd_epi16(lo_10_13, stg4_7); + tmp7 = _mm_madd_epi16(hi_10_13, stg4_7); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp5 = _mm_add_epi32(tmp5, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + tmp7 = _mm_add_epi32(tmp7, rounding); + + tmp0 = _mm_srai_epi32(tmp0, 14); + tmp1 = _mm_srai_epi32(tmp1, 14); + tmp2 = _mm_srai_epi32(tmp2, 14); + tmp3 = _mm_srai_epi32(tmp3, 14); + tmp4 = _mm_srai_epi32(tmp4, 14); + tmp5 = _mm_srai_epi32(tmp5, 14); + tmp6 = _mm_srai_epi32(tmp6, 14); + tmp7 = _mm_srai_epi32(tmp7, 14); + + stp2_9 = _mm_packs_epi32(tmp0, tmp1); + stp2_14 = _mm_packs_epi32(tmp2, tmp3); + stp2_10 = _mm_packs_epi32(tmp4, tmp5); + stp2_13 = _mm_packs_epi32(tmp6, tmp7); + } + + /* Stage5 */ + { + stp1_2 = stp1_0; + stp1_3 = stp1_0; + + tmp0 = _mm_sub_epi16(stp2_6, stp2_5); + tmp1 = _mm_add_epi16(stp2_6, stp2_5); + + stp1_5 = _mm_mulhrs_epi16(tmp0, stg4_01); + stp1_6 = _mm_mulhrs_epi16(tmp1, stg4_01); + + stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); + stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); + + stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); + stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); + } + + /* Stage6 */ + { + stp2_0 = _mm_add_epi16(stp1_0, stp2_7); + stp2_1 = _mm_add_epi16(stp1_0, stp1_6); + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); + stp2_3 = _mm_add_epi16(stp1_3, stp2_4); + + tmp0 = _mm_sub_epi16(stp1_13, stp1_10); + tmp1 = _mm_add_epi16(stp1_13, stp1_10); + tmp2 = _mm_sub_epi16(stp1_12, stp1_11); + tmp3 = _mm_add_epi16(stp1_12, stp1_11); + + stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); + stp2_6 = _mm_sub_epi16(stp1_0, stp1_6); + stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); + + stp2_10 = _mm_mulhrs_epi16(tmp0, stg4_01); + stp2_13 = _mm_mulhrs_epi16(tmp1, stg4_01); + stp2_11 = _mm_mulhrs_epi16(tmp2, stg4_01); + stp2_12 = _mm_mulhrs_epi16(tmp3, stg4_01); + } + + // Stage7 + in[0] = _mm_add_epi16(stp2_0, stp1_15); + in[1] = _mm_add_epi16(stp2_1, stp1_14); + in[2] = _mm_add_epi16(stp2_2, stp2_13); + in[3] = _mm_add_epi16(stp2_3, stp2_12); + in[4] = _mm_add_epi16(stp2_4, stp2_11); + in[5] = _mm_add_epi16(stp2_5, stp2_10); + in[6] = _mm_add_epi16(stp2_6, stp1_9); + in[7] = _mm_add_epi16(stp2_7, stp1_8); + in[8] = _mm_sub_epi16(stp2_7, stp1_8); + in[9] = _mm_sub_epi16(stp2_6, stp1_9); + in[10] = _mm_sub_epi16(stp2_5, stp2_10); + in[11] = _mm_sub_epi16(stp2_4, stp2_11); + in[12] = _mm_sub_epi16(stp2_3, stp2_12); + in[13] = _mm_sub_epi16(stp2_2, stp2_13); + in[14] = _mm_sub_epi16(stp2_1, stp1_14); + in[15] = _mm_sub_epi16(stp2_0, stp1_15); +} + +void vp9_idct16x16_10_add_ssse3(const int16_t *input, uint8_t *dest, + int stride) { + const __m128i final_rounding = _mm_set1_epi16(1<<5); + const __m128i zero = _mm_setzero_si128(); + __m128i in[16], l[16]; + + int i; + // First 1-D inverse DCT + // Load input data. + in[0] = _mm_load_si128((const __m128i *)input); + in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2)); + in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4)); + in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6)); + + TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]); + + idct16_10_r1(in, l); + + // Second 1-D inverse transform, performed per 8x16 block + for (i = 0; i < 2; i++) { + array_transpose_4X8(l + 8*i, in); + + idct16_10_r2(in); + + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = _mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = _mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = _mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); + in[15] = _mm_srai_epi16(in[15], 6); + + RECON_AND_STORE(dest, in[0]); + RECON_AND_STORE(dest, in[1]); + RECON_AND_STORE(dest, in[2]); + RECON_AND_STORE(dest, in[3]); + RECON_AND_STORE(dest, in[4]); + RECON_AND_STORE(dest, in[5]); + RECON_AND_STORE(dest, in[6]); + RECON_AND_STORE(dest, in[7]); + RECON_AND_STORE(dest, in[8]); + RECON_AND_STORE(dest, in[9]); + RECON_AND_STORE(dest, in[10]); + RECON_AND_STORE(dest, in[11]); + RECON_AND_STORE(dest, in[12]); + RECON_AND_STORE(dest, in[13]); + RECON_AND_STORE(dest, in[14]); + RECON_AND_STORE(dest, in[15]); + + dest += 8 - (stride * 16); + } +} diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 121b1f2cd..fc70035f2 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -195,30 +195,32 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block, struct macroblockd_plane *const pd = &xd->plane[plane]; if (eob > 0) { TX_TYPE tx_type; - const PLANE_TYPE plane_type = pd->plane_type; int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - switch (tx_size) { - case TX_4X4: - tx_type = get_tx_type_4x4(plane_type, xd, block); - if (tx_type == DCT_DCT) - xd->itxm_add(dqcoeff, dst, stride, eob); - else - vp9_iht4x4_16_add(dqcoeff, dst, stride, tx_type); - break; - case TX_8X8: - tx_type = get_tx_type(plane_type, xd); - vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob); - break; - case TX_16X16: - tx_type = get_tx_type(plane_type, xd); - vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob); - break; - case TX_32X32: - tx_type = DCT_DCT; - vp9_idct32x32_add(dqcoeff, dst, stride, eob); - break; - default: - assert(0 && "Invalid transform size"); + if (xd->lossless) { + tx_type = DCT_DCT; + vp9_iwht4x4_add(dqcoeff, dst, stride, eob); + } else { + const PLANE_TYPE plane_type = pd->plane_type; + switch (tx_size) { + case TX_4X4: + tx_type = get_tx_type_4x4(plane_type, xd, block); + vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob); + break; + case TX_8X8: + tx_type = get_tx_type(plane_type, xd); + vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob); + break; + case TX_16X16: + tx_type = get_tx_type(plane_type, xd); + vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob); + break; + case TX_32X32: + tx_type = DCT_DCT; + vp9_idct32x32_add(dqcoeff, dst, stride, eob); + break; + default: + assert(0 && "Invalid transform size"); + } } if (eob == 1) { @@ -588,8 +590,6 @@ static void setup_quantization(VP9_COMMON *const cm, MACROBLOCKD *const xd, cm->y_dc_delta_q == 0 && cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0; - - xd->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; } static INTERP_FILTER read_interp_filter(struct vp9_read_bit_buffer *rb) { diff --git a/vp9/encoder/vp9_aq_complexity.c b/vp9/encoder/vp9_aq_complexity.c index 47ad8d8cc..0d6b41d15 100644 --- a/vp9/encoder/vp9_aq_complexity.c +++ b/vp9/encoder/vp9_aq_complexity.c @@ -47,11 +47,21 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) { // Use some of the segments for in frame Q adjustment. for (segment = 1; segment < 2; segment++) { - const int qindex_delta = + int qindex_delta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex, in_frame_q_adj_ratio[segment]); - vp9_enable_segfeature(seg, segment, SEG_LVL_ALT_Q); - vp9_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta); + + // For AQ mode 2, we dont allow Q0 in a segment if the base Q is not 0. + // Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment Q delta + // is sometimes applied without going back around the rd loop. + // This could lead to an illegal combination of partition size and q. + if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) { + qindex_delta = -cm->base_qindex + 1; + } + if ((cm->base_qindex + qindex_delta) > 0) { + vp9_enable_segfeature(seg, segment, SEG_LVL_ALT_Q); + vp9_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta); + } } } } diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index c406860a0..c3cd93b78 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -109,6 +109,7 @@ struct macroblock { MV pred_mv[MAX_REF_FRAMES]; void (*fwd_txm4x4)(const int16_t *input, int16_t *output, int stride); + void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob); }; #ifdef __cplusplus diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 6cbc38d79..c1db8263e 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -478,8 +478,8 @@ static void choose_partitioning(VP9_COMP *cpi, unsigned int sse = 0; int sum = 0; if (x_idx < pixels_wide && y_idx < pixels_high) - vp9_get_sse_sum_8x8(s + y_idx * sp + x_idx, sp, - d + y_idx * dp + x_idx, dp, &sse, &sum); + vp9_get8x8var(s + y_idx * sp + x_idx, sp, + d + y_idx * dp + x_idx, dp, &sse, &sum); fill_variance(sse, sum, 64, &vst->split[k].part_variances.none); } } @@ -1214,9 +1214,9 @@ static void set_source_var_based_partition(VP9_COMP *cpi, int b_offset = b_mi_row * MI_SIZE * src_stride + b_mi_col * MI_SIZE; - vp9_get_sse_sum_16x16(src + b_offset, src_stride, - pre_src + b_offset, pre_stride, - &d16[j].sse, &d16[j].sum); + vp9_get16x16var(src + b_offset, src_stride, + pre_src + b_offset, pre_stride, + &d16[j].sse, &d16[j].sum); d16[j].var = d16[j].sse - (((uint32_t)d16[j].sum * d16[j].sum) >> 8); @@ -2369,22 +2369,6 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) { sizeof(*xd->above_seg_context) * aligned_mi_cols); } -static void switch_lossless_mode(VP9_COMP *cpi, int lossless) { - if (lossless) { - // printf("Switching to lossless\n"); - cpi->mb.fwd_txm4x4 = vp9_fwht4x4; - cpi->mb.e_mbd.itxm_add = vp9_iwht4x4_add; - cpi->mb.optimize = 0; - cpi->common.lf.filter_level = 0; - cpi->zbin_mode_boost_enabled = 0; - cpi->common.tx_mode = ONLY_4X4; - } else { - // printf("Not lossless\n"); - cpi->mb.fwd_txm4x4 = vp9_fdct4x4; - cpi->mb.e_mbd.itxm_add = vp9_idct4x4_add; - } -} - static int check_dual_ref_flags(VP9_COMP *cpi) { const int ref_flags = cpi->ref_frame_flags; @@ -2421,7 +2405,7 @@ static MV_REFERENCE_FRAME get_frame_type(const VP9_COMP *cpi) { } static TX_MODE select_tx_mode(const VP9_COMP *cpi) { - if (cpi->oxcf.lossless) { + if (cpi->mb.e_mbd.lossless) { return ONLY_4X4; } else if (cpi->common.current_video_frame == 0) { return TX_MODE_SELECT; @@ -3011,13 +2995,21 @@ static void encode_frame_internal(VP9_COMP *cpi) { vp9_zero(rd_opt->tx_select_diff); vp9_zero(rd_opt->tx_select_threshes); - cm->tx_mode = select_tx_mode(cpi); - cpi->mb.e_mbd.lossless = cm->base_qindex == 0 && cm->y_dc_delta_q == 0 && cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0; - switch_lossless_mode(cpi, cpi->mb.e_mbd.lossless); + + cm->tx_mode = select_tx_mode(cpi); + + cpi->mb.fwd_txm4x4 = cpi->mb.e_mbd.lossless ? vp9_fwht4x4 : vp9_fdct4x4; + cpi->mb.itxm_add = cpi->mb.e_mbd.lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; + + if (cpi->mb.e_mbd.lossless) { + cpi->mb.optimize = 0; + cpi->common.lf.filter_level = 0; + cpi->zbin_mode_boost_enabled = 0; + } vp9_frame_init_quantizer(cpi); @@ -3357,7 +3349,8 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8)); } else { mbmi->skip = 1; - if (output_enabled) + if (output_enabled && + !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) cm->counts.skip[vp9_get_skip_context(xd)][1]++; reset_skip_context(xd, MAX(bsize, BLOCK_8X8)); } diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 3b231b7f2..8581e6117 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -406,7 +406,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. - xd->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); + x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); break; default: assert(0 && "Invalid transform size"); @@ -428,7 +428,7 @@ static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize, vp9_xform_quant(x, plane, block, plane_bsize, tx_size); if (p->eobs[block] > 0) - xd->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); + x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); } void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) { @@ -574,7 +574,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. - xd->itxm_add(dqcoeff, dst, dst_stride, *eob); + x->itxm_add(dqcoeff, dst, dst_stride, *eob); else vp9_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type); } diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 2ce5483d6..0ebc93638 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -393,11 +393,6 @@ static void set_speed_features(VP9_COMP *cpi) { // Set rd thresholds based on mode and speed setting vp9_set_rd_speed_thresholds(cpi); vp9_set_rd_speed_thresholds_sub8x8(cpi); - - cpi->mb.fwd_txm4x4 = vp9_fdct4x4; - if (cpi->oxcf.lossless || cpi->mb.e_mbd.lossless) { - cpi->mb.fwd_txm4x4 = vp9_fwht4x4; - } } static void alloc_raw_frame_buffers(VP9_COMP *cpi) { @@ -596,16 +591,6 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { if (cpi->oxcf.mode == REALTIME) cpi->oxcf.play_alternate = 0; - cpi->oxcf.lossless = oxcf->lossless; - if (cpi->oxcf.lossless) { - // In lossless mode, make sure right quantizer range and correct transform - // is set. - cpi->oxcf.worst_allowed_q = 0; - cpi->oxcf.best_allowed_q = 0; - cpi->mb.e_mbd.itxm_add = vp9_iwht4x4_add; - } else { - cpi->mb.e_mbd.itxm_add = vp9_idct4x4_add; - } rc->baseline_gf_interval = DEFAULT_GF_INTERVAL; cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG; @@ -627,33 +612,30 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { // local file playback mode == really big buffer if (cpi->oxcf.rc_mode == RC_MODE_VBR) { - cpi->oxcf.starting_buffer_level = 60000; - cpi->oxcf.optimal_buffer_level = 60000; - cpi->oxcf.maximum_buffer_size = 240000; + cpi->oxcf.starting_buffer_level_ms = 60000; + cpi->oxcf.optimal_buffer_level_ms = 60000; + cpi->oxcf.maximum_buffer_size_ms = 240000; } - cpi->oxcf.starting_buffer_level = - vp9_rescale(cpi->oxcf.starting_buffer_level, - cpi->oxcf.target_bandwidth, 1000); + rc->starting_buffer_level = vp9_rescale(cpi->oxcf.starting_buffer_level_ms, + cpi->oxcf.target_bandwidth, 1000); // Set or reset optimal and maximum buffer levels. - if (cpi->oxcf.optimal_buffer_level == 0) - cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8; + if (cpi->oxcf.optimal_buffer_level_ms == 0) + rc->optimal_buffer_level = cpi->oxcf.target_bandwidth / 8; else - cpi->oxcf.optimal_buffer_level = - vp9_rescale(cpi->oxcf.optimal_buffer_level, - cpi->oxcf.target_bandwidth, 1000); + rc->optimal_buffer_level = vp9_rescale(cpi->oxcf.optimal_buffer_level_ms, + cpi->oxcf.target_bandwidth, 1000); - if (cpi->oxcf.maximum_buffer_size == 0) - cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8; + if (cpi->oxcf.maximum_buffer_size_ms == 0) + rc->maximum_buffer_size = cpi->oxcf.target_bandwidth / 8; else - cpi->oxcf.maximum_buffer_size = - vp9_rescale(cpi->oxcf.maximum_buffer_size, - cpi->oxcf.target_bandwidth, 1000); + rc->maximum_buffer_size = vp9_rescale(cpi->oxcf.maximum_buffer_size_ms, + cpi->oxcf.target_bandwidth, 1000); // Under a configuration change, where maximum_buffer_size may change, // keep buffer level clipped to the maximum allowed buffer size. - rc->bits_off_target = MIN(rc->bits_off_target, cpi->oxcf.maximum_buffer_size); - rc->buffer_level = MIN(rc->buffer_level, cpi->oxcf.maximum_buffer_size); + rc->bits_off_target = MIN(rc->bits_off_target, rc->maximum_buffer_size); + rc->buffer_level = MIN(rc->buffer_level, rc->maximum_buffer_size); // Set up frame rate and related parameters rate control values. vp9_new_framerate(cpi, cpi->oxcf.framerate); @@ -1439,21 +1421,6 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, vp8_yv12_extend_frame_borders_c(dst); } -static int find_fp_qindex() { - int i; - - for (i = 0; i < QINDEX_RANGE; i++) { - if (vp9_convert_qindex_to_q(i) >= 30.0) { - break; - } - } - - if (i == QINDEX_RANGE) - i--; - - return i; -} - #define WRITE_RECON_BUFFER 0 #if WRITE_RECON_BUFFER void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) { @@ -2308,17 +2275,6 @@ static void Pass0Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, encode_frame_to_data_rate(cpi, size, dest, frame_flags); } -static void Pass1Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, - unsigned int *frame_flags) { - (void) size; - (void) dest; - (void) frame_flags; - - vp9_rc_get_first_pass_params(cpi); - vp9_set_quantizer(&cpi->common, find_fp_qindex()); - vp9_first_pass(cpi); -} - static void Pass2Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, unsigned int *frame_flags) { cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; @@ -2658,7 +2614,10 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, if (cpi->pass == 1 && (!cpi->use_svc || cpi->svc.number_temporal_layers == 1)) { - Pass1Encode(cpi, size, dest, frame_flags); + const int lossless = is_lossless_requested(&cpi->oxcf); + cpi->mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4; + cpi->mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; + vp9_first_pass(cpi); } else if (cpi->pass == 2 && (!cpi->use_svc || cpi->svc.number_temporal_layers == 1)) { Pass2Encode(cpi, size, dest, frame_flags); diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 47c901975..c69a345d0 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -216,9 +216,9 @@ typedef struct VP9EncoderConfig { int over_shoot_pct; // buffering parameters - int64_t starting_buffer_level; // in seconds - int64_t optimal_buffer_level; - int64_t maximum_buffer_size; + int64_t starting_buffer_level_ms; + int64_t optimal_buffer_level_ms; + int64_t maximum_buffer_size_ms; // Frame drop threshold. int drop_frames_water_mark; @@ -228,7 +228,6 @@ typedef struct VP9EncoderConfig { int worst_allowed_q; int best_allowed_q; int cq_level; - int lossless; AQ_MODE aq_mode; // Adaptive Quantization mode // Internal frame size scaling. @@ -257,7 +256,6 @@ typedef struct VP9EncoderConfig { // these parameters aren't to be used in final build don't use!!! int play_alternate; - int alt_freq; int encode_breakout; // early breakout : for video conf recommend 800 @@ -286,6 +284,10 @@ typedef struct VP9EncoderConfig { vp8e_tuning tuning; } VP9EncoderConfig; +static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) { + return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0; +} + static INLINE int is_best_mode(MODE mode) { return mode == ONE_PASS_BEST || mode == TWO_PASS_SECOND_BEST; } diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 0d4f2c72c..dc3832b16 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -398,6 +398,32 @@ static BLOCK_SIZE get_bsize(const VP9_COMMON *cm, int mb_row, int mb_col) { } } +static int find_fp_qindex() { + int i; + + for (i = 0; i < QINDEX_RANGE; ++i) + if (vp9_convert_qindex_to_q(i) >= 30.0) + break; + + if (i == QINDEX_RANGE) + i--; + + return i; +} + +static void set_first_pass_params(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + if (!cpi->refresh_alt_ref_frame && + (cm->current_video_frame == 0 || + (cpi->frame_flags & FRAMEFLAGS_KEY))) { + cm->frame_type = KEY_FRAME; + } else { + cm->frame_type = INTER_FRAME; + } + // Do not use periodic key frames. + cpi->rc.frames_to_key = INT_MAX; +} + void vp9_first_pass(VP9_COMP *cpi) { int mb_row, mb_col; MACROBLOCK *const x = &cpi->mb; @@ -438,6 +464,9 @@ void vp9_first_pass(VP9_COMP *cpi) { vp9_clear_system_state(); + set_first_pass_params(cpi); + vp9_set_quantizer(cm, find_fp_qindex()); + if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) { MV_REFERENCE_FRAME ref_frame = LAST_FRAME; const YV12_BUFFER_CONFIG *scaled_ref_buf = NULL; @@ -1576,7 +1605,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Break out conditions. if ( - // Break at cpi->max_gf_interval unless almost totally static. + // Break at active_max_gf_interval unless almost totally static. (i >= active_max_gf_interval && (zero_motion_accumulator < 0.995)) || ( // Don't break out with a very short interval. @@ -2051,19 +2080,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->modified_error_left -= kf_group_err; } -void vp9_rc_get_first_pass_params(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; - if (!cpi->refresh_alt_ref_frame && - (cm->current_video_frame == 0 || - (cpi->frame_flags & FRAMEFLAGS_KEY))) { - cm->frame_type = KEY_FRAME; - } else { - cm->frame_type = INTER_FRAME; - } - // Do not use periodic key frames. - cpi->rc.frames_to_key = INT_MAX; -} - // For VBR...adjustment to the frame target based on error from previous frames void vbr_rate_correction(int * this_frame_target, const int64_t vbr_bits_off_target) { diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 9d2b2a497..dbd19a2d6 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -524,7 +524,8 @@ static int vp9_pattern_search(const MACROBLOCK *x, // Work out the start point for the search bestsad = vfp->sdf(what->buf, what->stride, - get_buf_from_mv(in_what, ref_mv), in_what->stride); + get_buf_from_mv(in_what, ref_mv), in_what->stride) + + mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); // Search all possible scales upto the search param around the center point // pick the scale of the point that is best as the starting scale of diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 11633a73d..913b8ead4 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -280,8 +280,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { int rate_mv = 0; - if (cpi->sf.disable_inter_mode_mask[bsize] & - (1 << INTER_OFFSET(this_mode))) + if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode))) continue; if (rd_less_than_thresh(best_rd, rd_threshes[mode_idx[this_mode]], diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 6f646ea0e..0163fd1e8 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -159,7 +159,7 @@ static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) { lrc->bits_off_target += bits_off_for_this_layer; // Clip buffer level to maximum buffer size for the layer. - lrc->bits_off_target = MIN(lrc->bits_off_target, lc->maximum_buffer_size); + lrc->bits_off_target = MIN(lrc->bits_off_target, lrc->maximum_buffer_size); lrc->buffer_level = lrc->bits_off_target; } } @@ -167,7 +167,6 @@ static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) { // Update the buffer level: leaky bucket model. static void update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) { const VP9_COMMON *const cm = &cpi->common; - const VP9EncoderConfig *oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; // Non-viewable frames are a special case and are treated as pure overhead. @@ -178,7 +177,7 @@ static void update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) { } // Clip the buffer level to the maximum specified buffer size. - rc->bits_off_target = MIN(rc->bits_off_target, oxcf->maximum_buffer_size); + rc->bits_off_target = MIN(rc->bits_off_target, rc->maximum_buffer_size); rc->buffer_level = rc->bits_off_target; if (cpi->use_svc && cpi->oxcf.rc_mode == RC_MODE_CBR) { @@ -188,23 +187,20 @@ static void update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) { void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { if (pass == 0 && oxcf->rc_mode == RC_MODE_CBR) { - rc->avg_frame_qindex[0] = oxcf->worst_allowed_q; - rc->avg_frame_qindex[1] = oxcf->worst_allowed_q; - rc->avg_frame_qindex[2] = oxcf->worst_allowed_q; + rc->avg_frame_qindex[KEY_FRAME] = oxcf->worst_allowed_q; + rc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q; } else { - rc->avg_frame_qindex[0] = (oxcf->worst_allowed_q + - oxcf->best_allowed_q) / 2; - rc->avg_frame_qindex[1] = (oxcf->worst_allowed_q + - oxcf->best_allowed_q) / 2; - rc->avg_frame_qindex[2] = (oxcf->worst_allowed_q + - oxcf->best_allowed_q) / 2; + rc->avg_frame_qindex[KEY_FRAME] = (oxcf->worst_allowed_q + + oxcf->best_allowed_q) / 2; + rc->avg_frame_qindex[INTER_FRAME] = (oxcf->worst_allowed_q + + oxcf->best_allowed_q) / 2; } rc->last_q[KEY_FRAME] = oxcf->best_allowed_q; rc->last_q[INTER_FRAME] = oxcf->best_allowed_q; - rc->buffer_level = oxcf->starting_buffer_level; - rc->bits_off_target = oxcf->starting_buffer_level; + rc->buffer_level = rc->starting_buffer_level; + rc->bits_off_target = rc->starting_buffer_level; rc->rolling_target_bits = rc->avg_frame_bandwidth; rc->rolling_actual_bits = rc->avg_frame_bandwidth; @@ -250,7 +246,7 @@ int vp9_rc_drop_frame(VP9_COMP *cpi) { // If buffer is below drop_mark, for now just drop every other frame // (starting with the next frame) until it increases back over drop_mark. int drop_mark = (int)(oxcf->drop_frames_water_mark * - oxcf->optimal_buffer_level / 100); + rc->optimal_buffer_level / 100); if ((rc->buffer_level > drop_mark) && (rc->decimation_factor > 0)) { --rc->decimation_factor; @@ -444,10 +440,9 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) { // ambient Q (at buffer = optimal level) to worst_quality level // (at buffer = critical level). const VP9_COMMON *const cm = &cpi->common; - const VP9EncoderConfig *oxcf = &cpi->oxcf; const RATE_CONTROL *rc = &cpi->rc; // Buffer level below which we push active_worst to worst_quality. - int64_t critical_level = oxcf->optimal_buffer_level >> 2; + int64_t critical_level = rc->optimal_buffer_level >> 2; int64_t buff_lvl_step = 0; int adjustment = 0; int active_worst_quality; @@ -459,26 +454,26 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) { else active_worst_quality = MIN(rc->worst_quality, rc->avg_frame_qindex[KEY_FRAME] * 3 / 2); - if (rc->buffer_level > oxcf->optimal_buffer_level) { + if (rc->buffer_level > rc->optimal_buffer_level) { // Adjust down. // Maximum limit for down adjustment, ~30%. int max_adjustment_down = active_worst_quality / 3; if (max_adjustment_down) { - buff_lvl_step = ((oxcf->maximum_buffer_size - - oxcf->optimal_buffer_level) / max_adjustment_down); + buff_lvl_step = ((rc->maximum_buffer_size - + rc->optimal_buffer_level) / max_adjustment_down); if (buff_lvl_step) - adjustment = (int)((rc->buffer_level - oxcf->optimal_buffer_level) / + adjustment = (int)((rc->buffer_level - rc->optimal_buffer_level) / buff_lvl_step); active_worst_quality -= adjustment; } } else if (rc->buffer_level > critical_level) { // Adjust up from ambient Q. if (critical_level) { - buff_lvl_step = (oxcf->optimal_buffer_level - critical_level); + buff_lvl_step = (rc->optimal_buffer_level - critical_level); if (buff_lvl_step) { adjustment = (int)((rc->worst_quality - rc->avg_frame_qindex[INTER_FRAME]) * - (oxcf->optimal_buffer_level - rc->buffer_level) / + (rc->optimal_buffer_level - rc->buffer_level) / buff_lvl_step); } active_worst_quality = rc->avg_frame_qindex[INTER_FRAME] + adjustment; @@ -1086,21 +1081,21 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { rc->last_q[KEY_FRAME] = qindex; rc->avg_frame_qindex[KEY_FRAME] = ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2); - } else if (!rc->is_src_frame_alt_ref && - (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) && - !(cpi->use_svc && oxcf->rc_mode == RC_MODE_CBR)) { - rc->avg_frame_qindex[2] = - ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[2] + qindex, 2); } else { - rc->last_q[INTER_FRAME] = qindex; - rc->avg_frame_qindex[INTER_FRAME] = + if (rc->is_src_frame_alt_ref || + !(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) || + (cpi->use_svc && oxcf->rc_mode == RC_MODE_CBR)) { + rc->last_q[INTER_FRAME] = qindex; + rc->avg_frame_qindex[INTER_FRAME] = ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2); - rc->ni_frames++; - rc->tot_q += vp9_convert_qindex_to_q(qindex); - rc->avg_q = rc->tot_q / rc->ni_frames; - // Calculate the average Q for normal inter frames (not key or GFU frames). - rc->ni_tot_qi += qindex; - rc->ni_av_qi = rc->ni_tot_qi / rc->ni_frames; + rc->ni_frames++; + rc->tot_q += vp9_convert_qindex_to_q(qindex); + rc->avg_q = rc->tot_q / rc->ni_frames; + // Calculate the average Q for normal inter frames (not key or GFU + // frames). + rc->ni_tot_qi += qindex; + rc->ni_av_qi = rc->ni_tot_qi / rc->ni_frames; + } } // Keep record of last boosted (KF/KF/ARF) Q value. @@ -1227,8 +1222,8 @@ static int calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { const VP9EncoderConfig *oxcf = &cpi->oxcf; const RATE_CONTROL *rc = &cpi->rc; const SVC *const svc = &cpi->svc; - const int64_t diff = oxcf->optimal_buffer_level - rc->buffer_level; - const int64_t one_pct_bits = 1 + oxcf->optimal_buffer_level / 100; + const int64_t diff = rc->optimal_buffer_level - rc->buffer_level; + const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100; int min_frame_target = MAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS); int target = rc->avg_frame_bandwidth; if (svc->number_temporal_layers > 1 && @@ -1259,8 +1254,8 @@ static int calc_iframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { const SVC *const svc = &cpi->svc; int target; if (cpi->common.current_video_frame == 0) { - target = ((cpi->oxcf.starting_buffer_level / 2) > INT_MAX) - ? INT_MAX : (int)(cpi->oxcf.starting_buffer_level / 2); + target = ((rc->starting_buffer_level / 2) > INT_MAX) + ? INT_MAX : (int)(rc->starting_buffer_level / 2); } else { int kf_boost = 32; double framerate = oxcf->framerate; @@ -1388,6 +1383,24 @@ int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type, return target_index - qindex; } +void vp9_rc_set_gf_max_interval(const VP9EncoderConfig *const oxcf, + RATE_CONTROL *const rc) { + // Set Maximum gf/arf interval + rc->max_gf_interval = 16; + + // Extended interval for genuinely static scenes + rc->static_scene_max_gf_interval = oxcf->key_freq >> 1; + + // Special conditions when alt ref frame enabled + if (oxcf->play_alternate && oxcf->lag_in_frames) { + if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1) + rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1; + } + + if (rc->max_gf_interval > rc->static_scene_max_gf_interval) + rc->max_gf_interval = rc->static_scene_max_gf_interval; +} + void vp9_rc_update_framerate(VP9_COMP *cpi) { const VP9_COMMON *const cm = &cpi->common; const VP9EncoderConfig *const oxcf = &cpi->oxcf; @@ -1412,21 +1425,5 @@ void vp9_rc_update_framerate(VP9_COMP *cpi) { rc->max_frame_bandwidth = MAX(MAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits); - // Set Maximum gf/arf interval - rc->max_gf_interval = 16; - - // Extended interval for genuinely static scenes - rc->static_scene_max_gf_interval = cpi->oxcf.key_freq >> 1; - - // Special conditions when alt ref frame enabled in lagged compress mode - if (oxcf->play_alternate && oxcf->lag_in_frames) { - if (rc->max_gf_interval > oxcf->lag_in_frames - 1) - rc->max_gf_interval = oxcf->lag_in_frames - 1; - - if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1) - rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1; - } - - if (rc->max_gf_interval > rc->static_scene_max_gf_interval) - rc->max_gf_interval = rc->static_scene_max_gf_interval; + vp9_rc_set_gf_max_interval(oxcf, rc); } diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index 614078eef..f1a4a3f6d 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -61,7 +61,7 @@ typedef struct { int ni_av_qi; int ni_tot_qi; int ni_frames; - int avg_frame_qindex[3]; // 0 - KEY, 1 - INTER, 2 - ARF/GF + int avg_frame_qindex[FRAME_TYPES]; double tot_q; double avg_q; @@ -84,6 +84,10 @@ typedef struct { int worst_quality; int best_quality; + + int64_t starting_buffer_level; + int64_t optimal_buffer_level; + int64_t maximum_buffer_size; // int active_best_quality; } RATE_CONTROL; @@ -178,6 +182,9 @@ int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type, void vp9_rc_update_framerate(struct VP9_COMP *cpi); +void vp9_rc_set_gf_max_interval(const struct VP9EncoderConfig *const oxcf, + RATE_CONTROL *const rc); + #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index d402d7b40..f68aa2738 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -1675,9 +1675,9 @@ static INLINE int mv_has_subpel(const MV *mv) { static int check_best_zero_mv( const VP9_COMP *cpi, const uint8_t mode_context[MAX_REF_FRAMES], int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], - int disable_inter_mode_mask, int this_mode, + int inter_mode_mask, int this_mode, const MV_REFERENCE_FRAME ref_frames[2]) { - if (!(disable_inter_mode_mask & (1 << INTER_OFFSET(ZEROMV))) && + if ((inter_mode_mask & (1 << ZEROMV)) && (this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) && frame_mv[this_mode][ref_frames[0]].as_int == 0 && (ref_frames[1] == NONE || @@ -1743,7 +1743,7 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x, ENTROPY_CONTEXT t_above[2], t_left[2]; int subpelmv = 1, have_ref = 0; const int has_second_rf = has_second_ref(mbmi); - const int disable_inter_mode_mask = cpi->sf.disable_inter_mode_mask[bsize]; + const int inter_mode_mask = cpi->sf.inter_mode_mask[bsize]; vp9_zero(*bsi); @@ -1792,11 +1792,11 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x, mode_idx = INTER_OFFSET(this_mode); bsi->rdstat[i][mode_idx].brdcost = INT64_MAX; - if (disable_inter_mode_mask & (1 << mode_idx)) + if (!(inter_mode_mask & (1 << this_mode))) continue; if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv, - disable_inter_mode_mask, + inter_mode_mask, this_mode, mbmi->ref_frame)) continue; @@ -3063,7 +3063,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, const int mode_search_skip_flags = cpi->sf.mode_search_skip_flags; const int intra_y_mode_mask = cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]]; - int disable_inter_mode_mask = cpi->sf.disable_inter_mode_mask[bsize]; + int inter_mode_mask = cpi->sf.inter_mode_mask[bsize]; vp9_zero(best_mbmode); x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; @@ -3130,7 +3130,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, const int inter_non_zero_mode_mask = 0x1F7F7; mode_skip_mask |= inter_non_zero_mode_mask; mode_skip_mask &= ~(1 << THR_ZEROMV); - disable_inter_mode_mask = ~(1 << INTER_OFFSET(ZEROMV)); + inter_mode_mask = (1 << ZEROMV); } // Disable this drop out case if the ref frame @@ -3182,7 +3182,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, mode_index = THR_ZEROMV; mode_skip_mask = ~(1 << mode_index); mode_skip_start = MAX_MODES; - disable_inter_mode_mask = 0; + inter_mode_mask = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | + (1 << NEWMV); } for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) { @@ -3229,8 +3230,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, this_mode = vp9_mode_order[mode_index].mode; ref_frame = vp9_mode_order[mode_index].ref_frame[0]; - if (ref_frame != INTRA_FRAME && - disable_inter_mode_mask & (1 << INTER_OFFSET(this_mode))) + if (ref_frame != INTRA_FRAME && !(inter_mode_mask & (1 << this_mode))) continue; second_ref_frame = vp9_mode_order[mode_index].ref_frame[1]; @@ -3279,7 +3279,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { const MV_REFERENCE_FRAME ref_frames[2] = {ref_frame, second_ref_frame}; if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv, - disable_inter_mode_mask, this_mode, ref_frames)) + inter_mode_mask, this_mode, ref_frames)) continue; } } @@ -3665,7 +3665,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int_mv seg_mvs[4][MAX_REF_FRAMES]; b_mode_info best_bmodes[4]; int best_skip2 = 0; - int ref_frame_mask = 0; int mode_skip_mask = 0; x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; @@ -3700,17 +3699,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, frame_mv[ZEROMV][ref_frame].as_int = 0; } - for (ref_frame = LAST_FRAME; - ref_frame <= ALTREF_FRAME && cpi->sf.reference_masking; ++ref_frame) { - int i; - for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { - if ((x->pred_mv_sad[ref_frame] >> 1) > x->pred_mv_sad[i]) { - ref_frame_mask |= (1 << ref_frame); - break; - } - } - } - for (ref_index = 0; ref_index < MAX_REFS; ++ref_index) { int mode_excluded = 0; int64_t this_rd = INT64_MAX; @@ -3805,11 +3793,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) { continue; - // If the segment skip feature is enabled.... - // then do nothing if the current mode is not allowed.. - } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) && - ref_frame != INTRA_FRAME) { - continue; // Disable this drop out case if the ref frame // segment level feature is enabled for this segment. This is to // prevent the possibility that we end up unable to pick any mode. @@ -4034,15 +4017,10 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, } if (!disable_skip) { - // Test for the condition where skip block will be activated - // because there are no non zero coefficients and make any - // necessary adjustment for rate. Ignore if skip is coded at - // segment level as the cost wont have been added in. - // Is Mb level skip allowed (i.e. not coded at segment level). - const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id, - SEG_LVL_SKIP); + // Skip is never coded at the segment level for sub8x8 blocks and instead + // always coded in the bitstream at the mode info level. - if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) { + if (ref_frame != INTRA_FRAME && !xd->lossless) { if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) < RDCOST(x->rdmult, x->rddiv, 0, total_sse)) { // Add in the cost of the no skip flag. @@ -4057,7 +4035,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, rate_uv = 0; this_skip2 = 1; } - } else if (mb_skip_allowed) { + } else { // Add in the cost of the no skip flag. rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0); } diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h index 5ea09a8a7..e85d08a6d 100644 --- a/vp9/encoder/vp9_rdopt.h +++ b/vp9/encoder/vp9_rdopt.h @@ -99,41 +99,44 @@ static INLINE int full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, int step_param, int error_per_bit, const MV *ref_mv, MV *tmp_mv, int var_max, int rd) { + const SPEED_FEATURES *const sf = &cpi->sf; + const SEARCH_METHODS method = sf->search_method; + vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize]; int var = 0; - if (cpi->sf.search_method == FAST_DIAMOND) { - var = vp9_fast_dia_search(x, mvp_full, step_param, error_per_bit, 0, - &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv); - if (rd && var < var_max) - var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, &cpi->fn_ptr[bsize], 1); - } else if (cpi->sf.search_method == FAST_HEX) { - var = vp9_fast_hex_search(x, mvp_full, step_param, error_per_bit, 0, - &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv); - if (rd && var < var_max) - var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, &cpi->fn_ptr[bsize], 1); - } else if (cpi->sf.search_method == HEX) { - var = vp9_hex_search(x, mvp_full, step_param, error_per_bit, 1, - &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv); - if (rd && var < var_max) - var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, &cpi->fn_ptr[bsize], 1); - } else if (cpi->sf.search_method == SQUARE) { - var = vp9_square_search(x, mvp_full, step_param, error_per_bit, 1, - &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv); - if (rd && var < var_max) - var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, &cpi->fn_ptr[bsize], 1); - } else if (cpi->sf.search_method == BIGDIA) { - var = vp9_bigdia_search(x, mvp_full, step_param, error_per_bit, 1, - &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv); - if (rd && var < var_max) - var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, &cpi->fn_ptr[bsize], 1); - } else { - int further_steps = (cpi->sf.max_step_search_steps - 1) - step_param; - - var = vp9_full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit, - further_steps, 1, &cpi->fn_ptr[bsize], - ref_mv, tmp_mv); + switch (method) { + case FAST_DIAMOND: + var = vp9_fast_dia_search(x, mvp_full, step_param, error_per_bit, 0, + fn_ptr, 1, ref_mv, tmp_mv); + break; + case FAST_HEX: + var = vp9_fast_hex_search(x, mvp_full, step_param, error_per_bit, 0, + fn_ptr, 1, ref_mv, tmp_mv); + break; + case HEX: + var = vp9_hex_search(x, mvp_full, step_param, error_per_bit, 1, + fn_ptr, 1, ref_mv, tmp_mv); + break; + case SQUARE: + var = vp9_square_search(x, mvp_full, step_param, error_per_bit, 1, + fn_ptr, 1, ref_mv, tmp_mv); + break; + case BIGDIA: + var = vp9_bigdia_search(x, mvp_full, step_param, error_per_bit, 1, + fn_ptr, 1, ref_mv, tmp_mv); + break; + case NSTEP: + var = vp9_full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit, + (sf->max_step_search_steps - 1) - step_param, + 1, fn_ptr, ref_mv, tmp_mv); + break; + default: + assert(!"Invalid search method."); } + if (method != NSTEP && rd && var < var_max) + var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, fn_ptr, 1); + return var; } diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 7c3abd5d7..b7f839747 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -14,20 +14,23 @@ #include "vp9/encoder/vp9_speed_features.h" enum { - ALL_INTRA_MODES = (1 << DC_PRED) | + INTRA_ALL = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED) | (1 << D45_PRED) | (1 << D135_PRED) | (1 << D117_PRED) | (1 << D153_PRED) | (1 << D207_PRED) | (1 << D63_PRED) | (1 << TM_PRED), - - INTRA_DC_ONLY = (1 << DC_PRED), - - INTRA_DC_TM = (1 << TM_PRED) | (1 << DC_PRED), - + INTRA_DC = (1 << DC_PRED), + INTRA_DC_TM = (1 << DC_PRED) | (1 << TM_PRED), INTRA_DC_H_V = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED), + INTRA_DC_TM_H_V = (1 << DC_PRED) | (1 << TM_PRED) | (1 << V_PRED) | + (1 << H_PRED) +}; - INTRA_DC_TM_H_V = INTRA_DC_TM | (1 << V_PRED) | (1 << H_PRED) +enum { + INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV), + INTER_NEAREST = (1 << NEARESTMV), + INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV) }; enum { @@ -140,8 +143,8 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->search_method = HEX; sf->disable_filter_search_var_thresh = 500; for (i = 0; i < TX_SIZES; ++i) { - sf->intra_y_mode_mask[i] = INTRA_DC_ONLY; - sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY; + sf->intra_y_mode_mask[i] = INTRA_DC; + sf->intra_uv_mode_mask[i] = INTRA_DC; } cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; } @@ -156,7 +159,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, sf->adaptive_rd_thresh = 1; sf->use_fast_coef_costing = 1; - if (speed == 1) { + if (speed >= 1) { sf->use_square_partition_only = !frame_is_intra_only(cm); sf->less_rectangular_check = 1; sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD @@ -179,13 +182,9 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, } if (speed >= 2) { - sf->use_square_partition_only = !frame_is_intra_only(cm); - sf->less_rectangular_check = 1; - sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD - : USE_LARGESTALL; if (MIN(cm->width, cm->height) >= 720) - sf->disable_split_mask = cm->show_frame ? - DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; + sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT + : DISABLE_ALL_INTER_SPLIT; else sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY; @@ -193,28 +192,18 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, FLAG_SKIP_INTRA_BESTINTER | FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR; - sf->use_rd_breakout = 1; - sf->adaptive_motion_search = 1; sf->adaptive_pred_interp_filter = 2; - sf->auto_mv_step_size = 1; sf->reference_masking = 1; - sf->disable_filter_search_var_thresh = 50; sf->comp_inter_joint_search_thresh = BLOCK_SIZES; - sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION; sf->lf_motion_threshold = LOW_MOITION_THRESHOLD; sf->adjust_partitioning_from_last_frame = 1; sf->last_partitioning_redo_frequency = 3; - - sf->adaptive_rd_thresh = 2; sf->use_lp32x32fdct = 1; sf->mode_skip_start = 11; - sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; - sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; - sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V; } if (speed >= 3) { @@ -246,15 +235,15 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, sf->subpel_force_stop = 1; for (i = 0; i < TX_SIZES; i++) { sf->intra_y_mode_mask[i] = INTRA_DC_H_V; - sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY; + sf->intra_uv_mode_mask[i] = INTRA_DC; } - sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_ONLY; + sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; sf->frame_parameter_update = 0; sf->search_method = FAST_HEX; - sf->disable_inter_mode_mask[BLOCK_32X32] = 1 << INTER_OFFSET(ZEROMV); - sf->disable_inter_mode_mask[BLOCK_32X64] = ~(1 << INTER_OFFSET(NEARESTMV)); - sf->disable_inter_mode_mask[BLOCK_64X32] = ~(1 << INTER_OFFSET(NEARESTMV)); - sf->disable_inter_mode_mask[BLOCK_64X64] = ~(1 << INTER_OFFSET(NEARESTMV)); + sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEAR_NEW; + sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST; + sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST; + sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST; sf->max_intra_bsize = BLOCK_32X32; sf->allow_skip_recode = 1; } @@ -285,7 +274,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, if (speed >= 7) { int i; for (i = 0; i < BLOCK_SIZES; ++i) - sf->disable_inter_mode_mask[i] = ~(1 << INTER_OFFSET(NEARESTMV)); + sf->inter_mode_mask[i] = INTER_NEAREST; } } @@ -302,7 +291,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->subpel_search_method = SUBPEL_TREE; sf->subpel_iters_per_step = 2; sf->subpel_force_stop = 0; - sf->optimize_coefficients = !oxcf->lossless; + sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf); sf->reduce_first_step_size = 0; sf->auto_mv_step_size = 0; sf->max_step_search_steps = MAX_MVSEARCH_STEPS; @@ -330,8 +319,8 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->disable_split_var_thresh = 0; sf->disable_filter_search_var_thresh = 0; for (i = 0; i < TX_SIZES; i++) { - sf->intra_y_mode_mask[i] = ALL_INTRA_MODES; - sf->intra_uv_mode_mask[i] = ALL_INTRA_MODES; + sf->intra_y_mode_mask[i] = INTRA_ALL; + sf->intra_uv_mode_mask[i] = INTRA_ALL; } sf->use_rd_breakout = 0; sf->skip_encode_sb = 0; @@ -343,7 +332,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->mode_skip_start = MAX_MODES; // Mode index at which mode skip mask set sf->use_nonrd_pick_mode = 0; for (i = 0; i < BLOCK_SIZES; ++i) - sf->disable_inter_mode_mask[i] = 0; + sf->inter_mode_mask[i] = INTER_ALL; sf->max_intra_bsize = BLOCK_64X64; // This setting only takes effect when partition_search_type is set // to FIXED_PARTITION. diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index a54599e6a..3e7cd27d8 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -331,8 +331,8 @@ typedef struct SPEED_FEATURES { int use_nonrd_pick_mode; // A binary mask indicating if NEARESTMV, NEARMV, ZEROMV, NEWMV - // modes are disabled in order from LSB to MSB for each BLOCK_SIZE. - int disable_inter_mode_mask[BLOCK_SIZES]; + // modes are used in order from LSB to MSB for each BLOCK_SIZE. + int inter_mode_mask[BLOCK_SIZES]; // This feature controls whether we do the expensive context update and // calculation in the rd coefficient costing loop. diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 95ea1072d..1b995757a 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -54,7 +54,7 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { lrc->last_q[INTER_FRAME] = oxcf->best_allowed_q; } - lrc->buffer_level = vp9_rescale((int)(oxcf->starting_buffer_level), + lrc->buffer_level = vp9_rescale((int)(oxcf->starting_buffer_level_ms), lc->target_bandwidth, 1000); lrc->bits_off_target = lrc->buffer_level; } @@ -87,14 +87,14 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, } bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; // Update buffer-related quantities. - lc->starting_buffer_level = - (int64_t)(oxcf->starting_buffer_level * bitrate_alloc); - lc->optimal_buffer_level = - (int64_t)(oxcf->optimal_buffer_level * bitrate_alloc); - lc->maximum_buffer_size = - (int64_t)(oxcf->maximum_buffer_size * bitrate_alloc); - lrc->bits_off_target = MIN(lrc->bits_off_target, lc->maximum_buffer_size); - lrc->buffer_level = MIN(lrc->buffer_level, lc->maximum_buffer_size); + lrc->starting_buffer_level = + (int64_t)(rc->starting_buffer_level * bitrate_alloc); + lrc->optimal_buffer_level = + (int64_t)(rc->optimal_buffer_level * bitrate_alloc); + lrc->maximum_buffer_size = + (int64_t)(rc->maximum_buffer_size * bitrate_alloc); + lrc->bits_off_target = MIN(lrc->bits_off_target, lrc->maximum_buffer_size); + lrc->buffer_level = MIN(lrc->buffer_level, lrc->maximum_buffer_size); // Update framerate-related quantities. if (svc->number_temporal_layers > 1) { lc->framerate = oxcf->framerate / oxcf->ts_rate_decimator[layer]; @@ -149,20 +149,7 @@ void vp9_update_spatial_layer_framerate(VP9_COMP *const cpi, double framerate) { oxcf->two_pass_vbrmin_section / 100); lrc->max_frame_bandwidth = (int)(((int64_t)lrc->avg_frame_bandwidth * oxcf->two_pass_vbrmax_section) / 100); - lrc->max_gf_interval = 16; - - lrc->static_scene_max_gf_interval = cpi->oxcf.key_freq >> 1; - - if (oxcf->play_alternate && oxcf->lag_in_frames) { - if (lrc->max_gf_interval > oxcf->lag_in_frames - 1) - lrc->max_gf_interval = oxcf->lag_in_frames - 1; - - if (lrc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1) - lrc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1; - } - - if (lrc->max_gf_interval > lrc->static_scene_max_gf_interval) - lrc->max_gf_interval = lrc->static_scene_max_gf_interval; + vp9_rc_set_gf_max_interval(oxcf, lrc); } void vp9_restore_layer_context(VP9_COMP *const cpi) { @@ -173,9 +160,6 @@ void vp9_restore_layer_context(VP9_COMP *const cpi) { cpi->rc = lc->rc; cpi->twopass = lc->twopass; cpi->oxcf.target_bandwidth = lc->target_bandwidth; - cpi->oxcf.starting_buffer_level = lc->starting_buffer_level; - cpi->oxcf.optimal_buffer_level = lc->optimal_buffer_level; - cpi->oxcf.maximum_buffer_size = lc->maximum_buffer_size; // Reset the frames_since_key and frames_to_key counters to their values // before the layer restore. Keep these defined for the stream (not layer). if (cpi->svc.number_temporal_layers > 1) { @@ -191,9 +175,6 @@ void vp9_save_layer_context(VP9_COMP *const cpi) { lc->rc = cpi->rc; lc->twopass = cpi->twopass; lc->target_bandwidth = (int)oxcf->target_bandwidth; - lc->starting_buffer_level = oxcf->starting_buffer_level; - lc->optimal_buffer_level = oxcf->optimal_buffer_level; - lc->maximum_buffer_size = oxcf->maximum_buffer_size; } void vp9_init_second_pass_spatial_svc(VP9_COMP *cpi) { diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h index 6881ce1e7..36e2027fd 100644 --- a/vp9/encoder/vp9_svc_layercontext.h +++ b/vp9/encoder/vp9_svc_layercontext.h @@ -22,9 +22,6 @@ extern "C" { typedef struct { RATE_CONTROL rc; int target_bandwidth; - int64_t starting_buffer_level; - int64_t optimal_buffer_level; - int64_t maximum_buffer_size; double framerate; int avg_frame_size; TWO_PASS twopass; diff --git a/vp9/encoder/vp9_variance.c b/vp9/encoder/vp9_variance.c index 02bed8988..eb5ae2e41 100644 --- a/vp9/encoder/vp9_variance.c +++ b/vp9/encoder/vp9_variance.c @@ -156,16 +156,15 @@ unsigned int vp9_sub_pixel_avg_variance##W##x##H##_c( \ return vp9_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \ } - -void vp9_get_sse_sum_16x16_c(const uint8_t *src_ptr, int source_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse, int *sum) { +void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { variance(src_ptr, source_stride, ref_ptr, ref_stride, 16, 16, sse, sum); } -void vp9_get_sse_sum_8x8_c(const uint8_t *src_ptr, int source_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse, int *sum) { +void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { variance(src_ptr, source_stride, ref_ptr, ref_stride, 8, 8, sse, sum); } diff --git a/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm index 8723a7114..28458dcdd 100644 --- a/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm +++ b/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm @@ -23,6 +23,7 @@ pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2 pw_%2_m%1: dw %2, -%1, %2, -%1, %2, -%1, %2, -%1 %endmacro +TRANSFORM_COEFFS 11585, 11585 TRANSFORM_COEFFS 15137, 6270 TRANSFORM_COEFFS 16069, 3196 TRANSFORM_COEFFS 9102, 13623 @@ -83,7 +84,7 @@ SECTION .text %endmacro ; 1D forward 8x8 DCT transform -%macro FDCT8_1D 0 +%macro FDCT8_1D 1 SUM_SUB 0, 7, 9 SUM_SUB 1, 6, 9 SUM_SUB 2, 5, 9 @@ -92,14 +93,21 @@ SECTION .text SUM_SUB 0, 3, 9 SUM_SUB 1, 2, 9 SUM_SUB 6, 5, 9 +%if %1 == 0 SUM_SUB 0, 1, 9 +%endif BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 pmulhrsw m6, m12 pmulhrsw m5, m12 +%if %1 == 0 pmulhrsw m0, m12 pmulhrsw m1, m12 +%else + BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 + SWAP 0, 1 +%endif SUM_SUB 4, 5, 9 SUM_SUB 7, 6, 9 @@ -150,10 +158,10 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride psllw m7, 2 ; column transform - FDCT8_1D + FDCT8_1D 0 TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - FDCT8_1D + FDCT8_1D 1 TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 DIVIDE_ROUND_2X 0, 1, 9, 10 diff --git a/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm b/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm index 673e0b3a6..21aaa9383 100644 --- a/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm +++ b/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm @@ -43,9 +43,9 @@ sym(vp9_temporal_filter_apply_sse2): mov [rsp + rbp_backup], rbp ; end prolog - mov rdx, arg(3) + mov edx, arg(3) mov [rsp + block_width], rdx - mov rdx, arg(4) + mov edx, arg(4) mov [rsp + block_height], rdx movd xmm6, arg(5) movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index fb0fe58d3..72768e11e 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -331,8 +331,10 @@ static vpx_codec_err_t set_encoder_config( oxcf->target_bandwidth = 1000 * cfg->rc_target_bitrate; oxcf->rc_max_intra_bitrate_pct = extra_cfg->rc_max_intra_bitrate_pct; - oxcf->best_allowed_q = vp9_quantizer_to_qindex(cfg->rc_min_quantizer); - oxcf->worst_allowed_q = vp9_quantizer_to_qindex(cfg->rc_max_quantizer); + oxcf->best_allowed_q = + extra_cfg->lossless ? 0 : vp9_quantizer_to_qindex(cfg->rc_min_quantizer); + oxcf->worst_allowed_q = + extra_cfg->lossless ? 0 : vp9_quantizer_to_qindex(cfg->rc_max_quantizer); oxcf->cq_level = vp9_quantizer_to_qindex(extra_cfg->cq_level); oxcf->fixed_q = -1; @@ -343,9 +345,9 @@ static vpx_codec_err_t set_encoder_config( oxcf->scaled_frame_width = cfg->rc_scaled_width; oxcf->scaled_frame_height = cfg->rc_scaled_height; - oxcf->maximum_buffer_size = cfg->rc_buf_sz; - oxcf->starting_buffer_level = cfg->rc_buf_initial_sz; - oxcf->optimal_buffer_level = cfg->rc_buf_optimal_sz; + oxcf->maximum_buffer_size_ms = cfg->rc_buf_sz; + oxcf->starting_buffer_level_ms = cfg->rc_buf_initial_sz; + oxcf->optimal_buffer_level_ms = cfg->rc_buf_optimal_sz; oxcf->drop_frames_water_mark = cfg->rc_dropframe_thresh; @@ -376,8 +378,6 @@ static vpx_codec_err_t set_encoder_config( oxcf->tile_columns = extra_cfg->tile_columns; oxcf->tile_rows = extra_cfg->tile_rows; - oxcf->lossless = extra_cfg->lossless; - oxcf->error_resilient_mode = cfg->g_error_resilient; oxcf->frame_parallel_decoding_mode = extra_cfg->frame_parallel_decoding_mode; @@ -1262,7 +1262,7 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { VPX_VBR, // rc_end_usage #if VPX_ENCODER_ABI_VERSION > (1 + VPX_CODEC_ABI_VERSION) - {0}, // rc_twopass_stats_in + {NULL, 0}, // rc_twopass_stats_in #endif 256, // rc_target_bandwidth 0, // rc_min_quantizer diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index 734ec4658..48110b414 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -38,7 +38,6 @@ struct vpx_codec_alg_priv { vpx_decrypt_cb decrypt_cb; void *decrypt_state; vpx_image_t img; - int img_avail; int invert_tile_order; // External frame buffer info to save for VP9 common. @@ -48,10 +47,12 @@ struct vpx_codec_alg_priv { }; static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx, - vpx_codec_priv_enc_mr_cfg_t *data) { + vpx_codec_priv_enc_mr_cfg_t *data) { // This function only allocates space for the vpx_codec_alg_priv_t // structure. More memory may be required at the time the stream // information becomes known. + (void)data; + if (!ctx->priv) { vpx_codec_alg_priv_t *alg_priv = vpx_memalign(32, sizeof(*alg_priv)); if (alg_priv == NULL) @@ -243,14 +244,11 @@ static void init_decoder(vpx_codec_alg_priv_t *ctx) { static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, const uint8_t **data, unsigned int data_sz, void *user_priv, int64_t deadline) { - YV12_BUFFER_CONFIG sd = { 0 }; vp9_ppflags_t flags = {0}; VP9_COMMON *cm = NULL; (void)deadline; - ctx->img_avail = 0; - // Determine the stream parameters. Note that we rely on peek_si to // validate that we have a buffer that does not wrap around the top // of the heap. @@ -285,13 +283,6 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) set_ppflags(ctx, &flags); - if (vp9_get_raw_frame(ctx->pbi, &sd, &flags)) - return update_error_state(ctx, &cm->error); - - yuvconfig2image(&ctx->img, &sd, user_priv); - ctx->img.fb_priv = cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv; - ctx->img_avail = 1; - return VPX_CODEC_OK; } @@ -420,15 +411,20 @@ static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx, vpx_codec_iter_t *iter) { vpx_image_t *img = NULL; - if (ctx->img_avail) { - // iter acts as a flip flop, so an image is only returned on the first - // call to get_frame. - if (!(*iter)) { + // iter acts as a flip flop, so an image is only returned on the first + // call to get_frame. + if (*iter == NULL && ctx->pbi != NULL) { + YV12_BUFFER_CONFIG sd; + vp9_ppflags_t flags = {0, 0, 0}; + + if (vp9_get_raw_frame(ctx->pbi, &sd, &flags) == 0) { + VP9_COMMON *cm = &ctx->pbi->common; + yuvconfig2image(&ctx->img, &sd, NULL); + ctx->img.fb_priv = cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv; img = &ctx->img; *iter = img; } } - ctx->img_avail = 0; return img; } @@ -631,11 +627,12 @@ CODEC_INTERFACE(vpx_codec_vp9_dx) = { decoder_set_fb_fn, // vpx_codec_set_fb_fn_t }, { // NOLINT - NOT_IMPLEMENTED, - NOT_IMPLEMENTED, - NOT_IMPLEMENTED, - NOT_IMPLEMENTED, - NOT_IMPLEMENTED, - NOT_IMPLEMENTED + NOT_IMPLEMENTED, // vpx_codec_enc_cfg_map_t + NOT_IMPLEMENTED, // vpx_codec_encode_fn_t + NOT_IMPLEMENTED, // vpx_codec_get_cx_data_fn_t + NOT_IMPLEMENTED, // vpx_codec_enc_config_set_fn_t + NOT_IMPLEMENTED, // vpx_codec_get_global_headers_fn_t + NOT_IMPLEMENTED, // vpx_codec_get_preview_frame_fn_t + NOT_IMPLEMENTED // vpx_codec_enc_mr_get_mem_loc_fn_t } }; |