diff options
Diffstat (limited to 'vp9')
-rw-r--r-- | vp9/common/arm/neon/vp9_reconintra_neon.c | 207 | ||||
-rw-r--r-- | vp9/common/vp9_filter.c | 28 | ||||
-rw-r--r-- | vp9/common/vp9_reconintra.c | 4 | ||||
-rw-r--r-- | vp9/common/vp9_rtcd_defs.pl | 20 | ||||
-rw-r--r-- | vp9/encoder/vp9_aq_cyclicrefresh.c | 55 | ||||
-rw-r--r-- | vp9/encoder/vp9_aq_cyclicrefresh.h | 2 | ||||
-rw-r--r-- | vp9/encoder/vp9_avg.c | 12 | ||||
-rw-r--r-- | vp9/encoder/vp9_encodeframe.c | 60 | ||||
-rw-r--r-- | vp9/encoder/vp9_encoder.c | 30 | ||||
-rw-r--r-- | vp9/encoder/vp9_encoder.h | 6 | ||||
-rw-r--r-- | vp9/encoder/vp9_mcomp.c | 30 | ||||
-rw-r--r-- | vp9/encoder/vp9_ratectrl.c | 90 | ||||
-rw-r--r-- | vp9/encoder/vp9_ratectrl.h | 2 |
13 files changed, 431 insertions, 115 deletions
diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.c b/vp9/common/arm/neon/vp9_reconintra_neon.c index 499c42ac3..13c46a57e 100644 --- a/vp9/common/arm/neon/vp9_reconintra_neon.c +++ b/vp9/common/arm/neon/vp9_reconintra_neon.c @@ -15,6 +15,75 @@ #include "vpx/vpx_integer.h" //------------------------------------------------------------------------------ +// DC 4x4 + +// 'do_above' and 'do_left' facilitate branch removal when inlined. +static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int do_above, int do_left) { + uint16x8_t sum_top; + uint16x8_t sum_left; + uint8x8_t dc0; + + if (do_above) { + const uint8x8_t A = vld1_u8(above); // top row + const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top + const uint16x4_t p1 = vpadd_u16(p0, p0); + sum_top = vcombine_u16(p1, p1); + } + + if (do_left) { + const uint8x8_t L = vld1_u8(left); // left border + const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left + const uint16x4_t p1 = vpadd_u16(p0, p0); + sum_left = vcombine_u16(p1, p1); + } + + if (do_above && do_left) { + const uint16x8_t sum = vaddq_u16(sum_left, sum_top); + dc0 = vrshrn_n_u16(sum, 3); + } else if (do_above) { + dc0 = vrshrn_n_u16(sum_top, 2); + } else if (do_left) { + dc0 = vrshrn_n_u16(sum_left, 2); + } else { + dc0 = vdup_n_u8(0x80); + } + + { + const uint8x8_t dc = vdup_lane_u8(dc0, 0); + int i; + for (i = 0; i < 4; ++i) { + vst1_lane_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc), 0); + } + } +} + +void vp9_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_4x4(dst, stride, above, left, 1, 1); +} + +void vp9_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + dc_4x4(dst, stride, NULL, left, 0, 1); +} + +void vp9_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + dc_4x4(dst, stride, above, NULL, 1, 0); +} + +void vp9_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + dc_4x4(dst, stride, NULL, NULL, 0, 0); +} + +//------------------------------------------------------------------------------ // DC 8x8 // 'do_above' and 'do_left' facilitate branch removal when inlined. @@ -161,6 +230,144 @@ void vp9_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, dc_16x16(dst, stride, NULL, NULL, 0, 0); } +//------------------------------------------------------------------------------ +// DC 32x32 + +// 'do_above' and 'do_left' facilitate branch removal when inlined. +static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int do_above, int do_left) { + uint16x8_t sum_top; + uint16x8_t sum_left; + uint8x8_t dc0; + + if (do_above) { + const uint8x16_t A0 = vld1q_u8(above); // top row + const uint8x16_t A1 = vld1q_u8(above + 16); + const uint16x8_t p0 = vpaddlq_u8(A0); // cascading summation of the top + const uint16x8_t p1 = vpaddlq_u8(A1); + const uint16x8_t p2 = vaddq_u16(p0, p1); + const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); + const uint16x4_t p4 = vpadd_u16(p3, p3); + const uint16x4_t p5 = vpadd_u16(p4, p4); + sum_top = vcombine_u16(p5, p5); + } + + if (do_left) { + const uint8x16_t L0 = vld1q_u8(left); // left row + const uint8x16_t L1 = vld1q_u8(left + 16); + const uint16x8_t p0 = vpaddlq_u8(L0); // cascading summation of the left + const uint16x8_t p1 = vpaddlq_u8(L1); + const uint16x8_t p2 = vaddq_u16(p0, p1); + const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); + const uint16x4_t p4 = vpadd_u16(p3, p3); + const uint16x4_t p5 = vpadd_u16(p4, p4); + sum_left = vcombine_u16(p5, p5); + } + + if (do_above && do_left) { + const uint16x8_t sum = vaddq_u16(sum_left, sum_top); + dc0 = vrshrn_n_u16(sum, 6); + } else if (do_above) { + dc0 = vrshrn_n_u16(sum_top, 5); + } else if (do_left) { + dc0 = vrshrn_n_u16(sum_left, 5); + } else { + dc0 = vdup_n_u8(0x80); + } + + { + const uint8x16_t dc = vdupq_lane_u8(dc0, 0); + int i; + for (i = 0; i < 32; ++i) { + vst1q_u8(dst + i * stride, dc); + vst1q_u8(dst + i * stride + 16, dc); + } + } +} + +void vp9_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_32x32(dst, stride, above, left, 1, 1); +} + +void vp9_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + dc_32x32(dst, stride, NULL, left, 0, 1); +} + +void vp9_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + dc_32x32(dst, stride, above, NULL, 1, 0); +} + +void vp9_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + dc_32x32(dst, stride, NULL, NULL, 0, 0); +} + +// ----------------------------------------------------------------------------- + +void vp9_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above)); // top row + const uint64x1_t A1 = vshr_n_u64(A0, 8); + const uint64x1_t A2 = vshr_n_u64(A0, 16); + const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0); + const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1); + const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2); + const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00); + const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0); + const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); + const uint32x2_t r0 = vreinterpret_u32_u8(avg2); + const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); + const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); + const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); + (void)left; + vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0); + vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); + vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); + vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); + dst[3 * stride + 3] = above[7]; +} + +// ----------------------------------------------------------------------------- + +void vp9_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x8_t XABCD_u8 = vld1_u8(above - 1); + const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8); + const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32); + const uint32x2_t zero = vdup_n_u32(0); + const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0); + const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL); + const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8)); + const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC); + const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8)); + const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16)); + const uint8_t D = vget_lane_u8(XABCD_u8, 4); + const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6); + const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC); + const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8); + const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_); + const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); + const uint32x2_t r3 = vreinterpret_u32_u8(avg2); + const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); + const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); + const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); + vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0); + vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); + vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); + vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); +} + #if !HAVE_NEON_ASM void vp9_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, diff --git a/vp9/common/vp9_filter.c b/vp9/common/vp9_filter.c index b256d4af5..14654a5ab 100644 --- a/vp9/common/vp9_filter.c +++ b/vp9/common/vp9_filter.c @@ -12,8 +12,8 @@ #include "vp9/common/vp9_filter.h" -DECLARE_ALIGNED(256, const InterpKernel, - vp9_bilinear_filters[SUBPEL_SHIFTS]) = { +DECLARE_ALIGNED(256, static const InterpKernel, + bilinear_filters[SUBPEL_SHIFTS]) = { { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 0, 120, 8, 0, 0, 0 }, { 0, 0, 0, 112, 16, 0, 0, 0 }, @@ -33,8 +33,8 @@ DECLARE_ALIGNED(256, const InterpKernel, }; // Lagrangian interpolation filter -DECLARE_ALIGNED(256, const InterpKernel, - vp9_sub_pel_filters_8[SUBPEL_SHIFTS]) = { +DECLARE_ALIGNED(256, static const InterpKernel, + sub_pel_filters_8[SUBPEL_SHIFTS]) = { { 0, 0, 0, 128, 0, 0, 0, 0}, { 0, 1, -5, 126, 8, -3, 1, 0}, { -1, 3, -10, 122, 18, -6, 2, 0}, @@ -54,8 +54,8 @@ DECLARE_ALIGNED(256, const InterpKernel, }; // DCT based filter -DECLARE_ALIGNED(256, const InterpKernel, - vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]) = { +DECLARE_ALIGNED(256, static const InterpKernel, + sub_pel_filters_8s[SUBPEL_SHIFTS]) = { {0, 0, 0, 128, 0, 0, 0, 0}, {-1, 3, -7, 127, 8, -3, 1, 0}, {-2, 5, -13, 125, 17, -6, 3, -1}, @@ -75,8 +75,8 @@ DECLARE_ALIGNED(256, const InterpKernel, }; // freqmultiplier = 0.5 -DECLARE_ALIGNED(256, const InterpKernel, - vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]) = { +DECLARE_ALIGNED(256, static const InterpKernel, + sub_pel_filters_8lp[SUBPEL_SHIFTS]) = { { 0, 0, 0, 128, 0, 0, 0, 0}, {-3, -1, 32, 64, 38, 1, -3, 0}, {-2, -2, 29, 63, 41, 2, -3, 0}, @@ -96,15 +96,15 @@ DECLARE_ALIGNED(256, const InterpKernel, }; -static const InterpKernel* vp9_filter_kernels[4] = { - vp9_sub_pel_filters_8, - vp9_sub_pel_filters_8lp, - vp9_sub_pel_filters_8s, - vp9_bilinear_filters +static const InterpKernel* filter_kernels[4] = { + sub_pel_filters_8, + sub_pel_filters_8lp, + sub_pel_filters_8s, + bilinear_filters }; const InterpKernel *vp9_get_interp_kernel(INTERP_FILTER filter) { assert(filter != SWITCHABLE); - return vp9_filter_kernels[filter]; + return filter_kernels[filter]; } diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c index 650f4adde..1e9acb8d5 100644 --- a/vp9/common/vp9_reconintra.c +++ b/vp9/common/vp9_reconintra.c @@ -533,8 +533,8 @@ static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs, } intra_pred_no_4x4(d117) -void vp9_d135_predictor_4x4(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { +void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { const int I = left[0]; const int J = left[1]; const int K = left[2]; diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 2a3b054ad..dac142397 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -60,7 +60,7 @@ add_proto qw/void vp9_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, co specialize qw/vp9_d207_predictor_4x4/, "$ssse3_x86inc"; add_proto qw/void vp9_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_d45_predictor_4x4/, "$ssse3_x86inc"; +specialize qw/vp9_d45_predictor_4x4 neon/, "$ssse3_x86inc"; add_proto qw/void vp9_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d63_predictor_4x4/, "$ssse3_x86inc"; @@ -72,7 +72,7 @@ add_proto qw/void vp9_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, co specialize qw/vp9_d117_predictor_4x4/; add_proto qw/void vp9_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_d135_predictor_4x4/; +specialize qw/vp9_d135_predictor_4x4 neon/; add_proto qw/void vp9_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d153_predictor_4x4/, "$ssse3_x86inc"; @@ -84,16 +84,16 @@ add_proto qw/void vp9_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, cons specialize qw/vp9_tm_predictor_4x4 neon dspr2 msa/, "$sse_x86inc"; add_proto qw/void vp9_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_predictor_4x4 dspr2 msa/, "$sse_x86inc"; +specialize qw/vp9_dc_predictor_4x4 dspr2 msa neon/, "$sse_x86inc"; add_proto qw/void vp9_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_top_predictor_4x4 msa/, "$sse_x86inc"; +specialize qw/vp9_dc_top_predictor_4x4 msa neon/, "$sse_x86inc"; add_proto qw/void vp9_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_left_predictor_4x4 msa/, "$sse_x86inc"; +specialize qw/vp9_dc_left_predictor_4x4 msa neon/, "$sse_x86inc"; add_proto qw/void vp9_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_128_predictor_4x4 msa/, "$sse_x86inc"; +specialize qw/vp9_dc_128_predictor_4x4 msa neon/, "$sse_x86inc"; add_proto qw/void vp9_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d207_predictor_8x8/, "$ssse3_x86inc"; @@ -201,16 +201,16 @@ add_proto qw/void vp9_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, co specialize qw/vp9_tm_predictor_32x32 neon msa/, "$sse2_x86_64"; add_proto qw/void vp9_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_predictor_32x32 msa/, "$sse2_x86inc"; +specialize qw/vp9_dc_predictor_32x32 msa neon/, "$sse2_x86inc"; add_proto qw/void vp9_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_top_predictor_32x32 msa/, "$sse2_x86inc"; +specialize qw/vp9_dc_top_predictor_32x32 msa neon/, "$sse2_x86inc"; add_proto qw/void vp9_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_left_predictor_32x32 msa/, "$sse2_x86inc"; +specialize qw/vp9_dc_left_predictor_32x32 msa neon/, "$sse2_x86inc"; add_proto qw/void vp9_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_128_predictor_32x32 msa/, "$sse2_x86inc"; +specialize qw/vp9_dc_128_predictor_32x32 msa neon/, "$sse2_x86inc"; # # Loopfilter diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c index df70d48f8..0e4d8638b 100644 --- a/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -39,6 +39,8 @@ struct CYCLIC_REFRESH { int rdmult; // Cyclic refresh map. signed char *map; + // Map of the last q a block was coded at. + uint8_t *last_coded_q_map; // Thresholds applied to the projected rate/distortion of the coding block, // when deciding whether block should be refreshed. int64_t thresh_rate_sb; @@ -51,11 +53,11 @@ struct CYCLIC_REFRESH { // Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2. double rate_boost_fac; double low_content_avg; - int qindex_delta_seg1; - int qindex_delta_seg2; + int qindex_delta[3]; }; CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) { + size_t last_coded_q_map_size; CYCLIC_REFRESH *const cr = vpx_calloc(1, sizeof(*cr)); if (cr == NULL) return NULL; @@ -65,12 +67,21 @@ CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) { vpx_free(cr); return NULL; } + last_coded_q_map_size = mi_rows * mi_cols * sizeof(*cr->last_coded_q_map); + cr->last_coded_q_map = vpx_malloc(last_coded_q_map_size); + if (cr->last_coded_q_map == NULL) { + vpx_free(cr); + return NULL; + } + assert(MAXQ <= 255); + memset(cr->last_coded_q_map, MAXQ, last_coded_q_map_size); return cr; } void vp9_cyclic_refresh_free(CYCLIC_REFRESH *cr) { vpx_free(cr->map); + vpx_free(cr->last_coded_q_map); vpx_free(cr); } @@ -159,11 +170,11 @@ int vp9_cyclic_refresh_estimate_bits_at_q(const VP9_COMP *cpi, correction_factor, cm->bit_depth) + weight_segment1 * vp9_estimate_bits_at_q(cm->frame_type, - cm->base_qindex + cr->qindex_delta_seg1, mbs, + cm->base_qindex + cr->qindex_delta[1], mbs, correction_factor, cm->bit_depth) + weight_segment2 * vp9_estimate_bits_at_q(cm->frame_type, - cm->base_qindex + cr->qindex_delta_seg2, mbs, + cm->base_qindex + cr->qindex_delta[2], mbs, correction_factor, cm->bit_depth)); return estimated_bits; } @@ -248,9 +259,16 @@ void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi, // copy mbmi->segment_id into global segmentation map. for (y = 0; y < ymis; y++) for (x = 0; x < xmis; x++) { - cr->map[block_index + y * cm->mi_cols + x] = new_map_value; - cpi->segmentation_map[block_index + y * cm->mi_cols + x] = - mbmi->segment_id; + int map_offset = block_index + y * cm->mi_cols + x; + cr->map[map_offset] = new_map_value; + cpi->segmentation_map[map_offset] = mbmi->segment_id; + // Inter skip blocks were clearly not coded at the current qindex, so + // don't update the map for them. For cases where motion is non-zero or + // the reference frame isn't the previous frame, the previous value in + // the map for this spatial location is not entirely correct. + if (!is_inter_block(mbmi) || !skip) + cr->last_coded_q_map[map_offset] = clamp( + cm->base_qindex + cr->qindex_delta[mbmi->segment_id], 0, MAXQ); } } @@ -384,6 +402,10 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) { int sb_col_index = i - sb_row_index * sb_cols; int mi_row = sb_row_index * MI_BLOCK_SIZE; int mi_col = sb_col_index * MI_BLOCK_SIZE; + int qindex_thresh = + cpi->oxcf.content == VP9E_CONTENT_SCREEN + ? vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex) + : 0; assert(mi_row >= 0 && mi_row < cm->mi_rows); assert(mi_col >= 0 && mi_col < cm->mi_cols); bl_index = mi_row * cm->mi_cols + mi_col; @@ -399,7 +421,8 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) { // for possible boost/refresh (segment 1). The segment id may get // reset to 0 later if block gets coded anything other than ZEROMV. if (cr->map[bl_index2] == 0) { - sum_map++; + if (cr->last_coded_q_map[bl_index2] > qindex_thresh) + sum_map++; } else if (cr->map[bl_index2] < 0) { cr->map[bl_index2]++; } @@ -466,8 +489,11 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) { unsigned char *const seg_map = cpi->segmentation_map; memset(seg_map, 0, cm->mi_rows * cm->mi_cols); vp9_disable_segmentation(&cm->seg); - if (cm->frame_type == KEY_FRAME) + if (cm->frame_type == KEY_FRAME) { + memset(cr->last_coded_q_map, MAXQ, + cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map)); cr->sb_index = 0; + } return; } else { int qindex_delta = 0; @@ -505,7 +531,7 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) { // Set the q delta for segment BOOST1. qindex_delta = compute_deltaq(cpi, cm->base_qindex, cr->rate_ratio_qdelta); - cr->qindex_delta_seg1 = qindex_delta; + cr->qindex_delta[1] = qindex_delta; // Compute rd-mult for segment BOOST1. qindex2 = clamp(cm->base_qindex + cm->y_dc_delta_q + qindex_delta, 0, MAXQ); @@ -518,7 +544,7 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) { qindex_delta = compute_deltaq(cpi, cm->base_qindex, MIN(CR_MAX_RATE_TARGET_RATIO, cr->rate_boost_fac * cr->rate_ratio_qdelta)); - cr->qindex_delta_seg2 = qindex_delta; + cr->qindex_delta[2] = qindex_delta; vp9_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta); // Update the segmentation and refresh map. @@ -529,3 +555,10 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) { int vp9_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr) { return cr->rdmult; } + +void vp9_cyclic_refresh_reset_resize(VP9_COMP *const cpi) { + const VP9_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + memset(cr->map, 0, cm->mi_rows * cm->mi_cols); + cr->sb_index = 0; +} diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.h b/vp9/encoder/vp9_aq_cyclicrefresh.h index 99bb98ec8..29d2a91bc 100644 --- a/vp9/encoder/vp9_aq_cyclicrefresh.h +++ b/vp9/encoder/vp9_aq_cyclicrefresh.h @@ -75,6 +75,8 @@ void vp9_cyclic_refresh_setup(struct VP9_COMP *const cpi); int vp9_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr); +void vp9_cyclic_refresh_reset_resize(struct VP9_COMP *const cpi); + static INLINE int cyclic_refresh_segment_id_boosted(int segment_id) { return segment_id == CR_SEGMENT_ID_BOOST1 || segment_id == CR_SEGMENT_ID_BOOST2; diff --git a/vp9/encoder/vp9_avg.c b/vp9/encoder/vp9_avg.c index 437356681..3ef3882d2 100644 --- a/vp9/encoder/vp9_avg.c +++ b/vp9/encoder/vp9_avg.c @@ -115,33 +115,41 @@ void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride, } } +// coeff: 16 bits, dynamic range [-32640, 32640]. +// length: value range {16, 64, 256, 1024}. int16_t vp9_satd_c(const int16_t *coeff, int length) { int i; int satd = 0; for (i = 0; i < length; ++i) satd += abs(coeff[i]); + // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] return (int16_t)satd; } // Integer projection onto row vectors. -void vp9_int_pro_row_c(int16_t *hbuf, uint8_t const *ref, +// height: value range {16, 32, 64}. +void vp9_int_pro_row_c(int16_t hbuf[16], uint8_t const *ref, const int ref_stride, const int height) { int idx; - const int norm_factor = MAX(8, height >> 1); + const int norm_factor = height >> 1; for (idx = 0; idx < 16; ++idx) { int i; hbuf[idx] = 0; + // hbuf[idx]: 14 bit, dynamic range [0, 16320]. for (i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride]; + // hbuf[idx]: 9 bit, dynamic range [0, 510]. hbuf[idx] /= norm_factor; ++ref; } } +// width: value range {16, 32, 64}. int16_t vp9_int_pro_col_c(uint8_t const *ref, const int width) { int idx; int16_t sum = 0; + // sum: 14 bit, dynamic range [0, 16320] for (idx = 0; idx < width; ++idx) sum += ref[idx]; return sum; diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 5299244d9..dcddefc28 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -2220,66 +2220,6 @@ static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile, *max_block_size = max_size; } -static void auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile, - MACROBLOCKD *const xd, - int mi_row, int mi_col, - BLOCK_SIZE *min_block_size, - BLOCK_SIZE *max_block_size) { - VP9_COMMON *const cm = &cpi->common; - MODE_INFO **mi_8x8 = xd->mi; - const int left_in_image = xd->left_available && mi_8x8[-1]; - const int above_in_image = xd->up_available && mi_8x8[-xd->mi_stride]; - int row8x8_remaining = tile->mi_row_end - mi_row; - int col8x8_remaining = tile->mi_col_end - mi_col; - int bh, bw; - BLOCK_SIZE min_size = BLOCK_32X32; - BLOCK_SIZE max_size = BLOCK_8X8; - int bsl = mi_width_log2_lookup[BLOCK_64X64]; - const int search_range_ctrl = (((mi_row + mi_col) >> bsl) + - get_chessboard_index(cm->current_video_frame)) & 0x1; - // Trap case where we do not have a prediction. - if (search_range_ctrl && - (left_in_image || above_in_image || cm->frame_type != KEY_FRAME)) { - int block; - MODE_INFO **mi; - BLOCK_SIZE sb_type; - - // Find the min and max partition sizes used in the left SB64. - if (left_in_image) { - MODE_INFO *cur_mi; - mi = &mi_8x8[-1]; - for (block = 0; block < MI_BLOCK_SIZE; ++block) { - cur_mi = mi[block * xd->mi_stride]; - sb_type = cur_mi ? cur_mi->mbmi.sb_type : 0; - min_size = MIN(min_size, sb_type); - max_size = MAX(max_size, sb_type); - } - } - // Find the min and max partition sizes used in the above SB64. - if (above_in_image) { - mi = &mi_8x8[-xd->mi_stride * MI_BLOCK_SIZE]; - for (block = 0; block < MI_BLOCK_SIZE; ++block) { - sb_type = mi[block] ? mi[block]->mbmi.sb_type : 0; - min_size = MIN(min_size, sb_type); - max_size = MAX(max_size, sb_type); - } - } - - min_size = min_partition_size[min_size]; - max_size = find_partition_size(max_size, row8x8_remaining, col8x8_remaining, - &bh, &bw); - min_size = MIN(min_size, max_size); - min_size = MAX(min_size, BLOCK_8X8); - max_size = MIN(max_size, BLOCK_32X32); - } else { - min_size = BLOCK_8X8; - max_size = BLOCK_32X32; - } - - *min_block_size = min_size; - *max_block_size = max_size; -} - // TODO(jingning) refactor functions setting partition search range static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, BLOCK_SIZE bsize, diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index ecd60d772..d708b8319 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1596,6 +1596,9 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, sizeof(*cm->frame_contexts))); cpi->use_svc = 0; + cpi->resize_state = 0; + cpi->resize_avg_qp = 0; + cpi->resize_buffer_underflow = 0; cpi->common.buffer_pool = pool; init_config(cpi, oxcf); @@ -3032,6 +3035,31 @@ static void set_frame_size(VP9_COMP *cpi) { oxcf->scaled_frame_height); } + if (oxcf->pass == 0 && + oxcf->rc_mode == VPX_CBR && + !cpi->use_svc && + oxcf->resize_mode == RESIZE_DYNAMIC) { + if (cpi->resize_state == 1) { + oxcf->scaled_frame_width = + (cm->width * cpi->resize_scale_num) / cpi->resize_scale_den; + oxcf->scaled_frame_height = + (cm->height * cpi->resize_scale_num) /cpi->resize_scale_den; + } else if (cpi->resize_state == -1) { + // Go back up to original size. + oxcf->scaled_frame_width = oxcf->width; + oxcf->scaled_frame_height = oxcf->height; + } + if (cpi->resize_state != 0) { + // There has been a change in frame size. + vp9_set_size_literal(cpi, + oxcf->scaled_frame_width, + oxcf->scaled_frame_height); + + // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed. + set_mv_search_params(cpi); + } + } + if ((oxcf->pass == 2) && (!cpi->use_svc || (is_two_pass_svc(cpi) && @@ -3962,7 +3990,6 @@ static void check_src_altref(VP9_COMP *cpi, extern double vp9_get_blockiness(const unsigned char *img1, int img1_pitch, const unsigned char *img2, int img2_pitch, int width, int height); -#endif static void adjust_image_stat(double y, double u, double v, double all, ImageStat *s) { @@ -3972,6 +3999,7 @@ static void adjust_image_stat(double y, double u, double v, double all, s->stat[ALL] += all; s->worst = MIN(s->worst, all); } +#endif // CONFIG_INTERNAL_STATS int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, size_t *size, uint8_t *dest, diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 4d2a186e8..2b0da103f 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -479,6 +479,12 @@ typedef struct VP9_COMP { #endif int resize_pending; + int resize_state; + int resize_scale_num; + int resize_scale_den; + int resize_avg_qp; + int resize_buffer_underflow; + int resize_count; // VAR_BASED_PARTITION thresholds // 0 - threshold_64x64; 1 - threshold_32x32; diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 234272697..081b99f9f 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -286,20 +286,20 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { bestmv->row *= 8; \ bestmv->col *= 8; -static INLINE unsigned int setup_center_error(const MACROBLOCKD *xd, - const MV *bestmv, - const MV *ref_mv, - int error_per_bit, - const vp9_variance_fn_ptr_t *vfp, - const uint8_t *const src, - const int src_stride, - const uint8_t *const y, - int y_stride, - const uint8_t *second_pred, - int w, int h, int offset, - int *mvjcost, int *mvcost[2], - unsigned int *sse1, - int *distortion) { +static unsigned int setup_center_error(const MACROBLOCKD *xd, + const MV *bestmv, + const MV *ref_mv, + int error_per_bit, + const vp9_variance_fn_ptr_t *vfp, + const uint8_t *const src, + const int src_stride, + const uint8_t *const y, + int y_stride, + const uint8_t *second_pred, + int w, int h, int offset, + int *mvjcost, int *mvcost[2], + unsigned int *sse1, + int *distortion) { unsigned int besterr; #if CONFIG_VP9_HIGHBITDEPTH if (second_pred != NULL) { @@ -610,7 +610,7 @@ int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x, return besterr; } -const MV search_step_table[12] = { +static const MV search_step_table[12] = { // left, right, up, down {0, -4}, {0, 4}, {-4, 0}, {4, 0}, {0, -2}, {0, 2}, {-2, 0}, {2, 0}, diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 32682fe74..425073fcd 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -1596,6 +1596,7 @@ void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) { target = calc_pframe_target_size_one_pass_cbr(cpi); vp9_rc_set_frame_target(cpi, target); + cpi->resize_state = vp9_resize_one_pass_cbr(cpi); } int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget, @@ -1756,3 +1757,92 @@ void vp9_set_target_rate(VP9_COMP *cpi) { vbr_rate_correction(cpi, &target_rate); vp9_rc_set_frame_target(cpi, target_rate); } + +// Check if we should resize, based on average QP from past x frames. +// Only allow for resize at most one scale down for now, scaling factor is 2. +int vp9_resize_one_pass_cbr(VP9_COMP *cpi) { + const VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + int resize_now = 0; + cpi->resize_scale_num = 1; + cpi->resize_scale_den = 1; + // Don't resize on key frame; reset the counters on key frame. + if (cm->frame_type == KEY_FRAME) { + cpi->resize_avg_qp = 0; + cpi->resize_count = 0; + return 0; + } + // Resize based on average QP over some window. + // Ignore samples close to key frame, since QP is usually high after key. + if (cpi->rc.frames_since_key > 2 * cpi->framerate) { + const int window = (int)(5 * cpi->framerate); + cpi->resize_avg_qp += cm->base_qindex; + if (cpi->rc.buffer_level < 0) + ++cpi->resize_buffer_underflow; + ++cpi->resize_count; + // Check for resize action every "window" frames. + if (cpi->resize_count == window) { + int avg_qp = cpi->resize_avg_qp / cpi->resize_count; + // Resize down if buffer level has underflowed sufficent amount in past + // window, and we are at original resolution. + // Resize back up if average QP is low, and we are currently in a resized + // down state. + if (cpi->resize_state == 0 && + cpi->resize_buffer_underflow > (cpi->resize_count >> 3)) { + resize_now = 1; + } else if (cpi->resize_state == 1 && + avg_qp < 40 * cpi->rc.worst_quality / 100) { + resize_now = -1; + } + // Reset for next window measurement. + cpi->resize_avg_qp = 0; + cpi->resize_count = 0; + cpi->resize_buffer_underflow = 0; + } + } + // If decision is to resize, reset some quantities, and check is we should + // reduce rate correction factor, + if (resize_now != 0) { + int target_bits_per_frame; + int active_worst_quality; + int qindex; + int tot_scale_change; + // For now, resize is by 1/2 x 1/2. + cpi->resize_scale_num = 1; + cpi->resize_scale_den = 2; + tot_scale_change = (cpi->resize_scale_den * cpi->resize_scale_den) / + (cpi->resize_scale_num * cpi->resize_scale_num); + // Reset buffer level to optimal, update target size. + rc->buffer_level = rc->optimal_buffer_level; + rc->bits_off_target = rc->optimal_buffer_level; + rc->this_frame_target = calc_pframe_target_size_one_pass_cbr(cpi); + // Reset cyclic refresh parameters. + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) + vp9_cyclic_refresh_reset_resize(cpi); + // Get the projected qindex, based on the scaled target frame size (scaled + // so target_bits_per_mb in vp9_rc_regulate_q will be correct target). + target_bits_per_frame = (resize_now == 1) ? + rc->this_frame_target * tot_scale_change : + rc->this_frame_target / tot_scale_change; + active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi); + qindex = vp9_rc_regulate_q(cpi, + target_bits_per_frame, + rc->best_quality, + active_worst_quality); + // If resize is down, check if projected q index is close to worst_quality, + // and if so, reduce the rate correction factor (since likely can afford + // lower q for resized frame). + if (resize_now == 1 && + qindex > 90 * cpi->rc.worst_quality / 100) { + rc->rate_correction_factors[INTER_NORMAL] *= 0.85; + } + // If resize is back up, check if projected q index is too much above the + // current base_qindex, and if so, reduce the rate correction factor + // (since prefer to keep q for resized frame at least close to previous q). + if (resize_now == -1 && + qindex > 130 * cm->base_qindex / 100) { + rc->rate_correction_factors[INTER_NORMAL] *= 0.9; + } + } + return resize_now; +} diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index e12d200be..a10836c74 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -245,6 +245,8 @@ void vp9_rc_set_gf_interval_range(const struct VP9_COMP *const cpi, void vp9_set_target_rate(struct VP9_COMP *cpi); +int vp9_resize_one_pass_cbr(struct VP9_COMP *cpi); + #ifdef __cplusplus } // extern "C" #endif |