diff options
Diffstat (limited to 'vp9')
27 files changed, 490 insertions, 1240 deletions
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c index d12cd76db..1b420143b 100644 --- a/vp9/common/vp9_idct.c +++ b/vp9/common/vp9_idct.c @@ -174,6 +174,9 @@ void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride, else if (eob <= 34) // non-zero coeff only in upper-left 8x8 vpx_idct32x32_34_add(input, dest, stride); + else if (eob <= 135) + // non-zero coeff only in upper-left 16x16 + vpx_idct32x32_135_add(input, dest, stride); else vpx_idct32x32_1024_add(input, dest, stride); } diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 890b63821..d6c86fe5f 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -194,42 +194,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { -add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p"; -specialize qw/vp9_avg_8x8 sse2 neon msa/; - -add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p"; -specialize qw/vp9_avg_4x4 sse2 msa/; - -add_proto qw/void vp9_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max"; -specialize qw/vp9_minmax_8x8 sse2/; - -add_proto qw/void vp9_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff"; -specialize qw/vp9_hadamard_8x8 sse2/, "$ssse3_x86_64_x86inc"; - -add_proto qw/void vp9_hadamard_16x16/, "int16_t const *src_diff, int src_stride, int16_t *coeff"; -specialize qw/vp9_hadamard_16x16 sse2/; - -add_proto qw/int16_t vp9_satd/, "const int16_t *coeff, int length"; -specialize qw/vp9_satd sse2/; - -add_proto qw/void vp9_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height"; -specialize qw/vp9_int_pro_row sse2 neon/; - -add_proto qw/int16_t vp9_int_pro_col/, "uint8_t const *ref, const int width"; -specialize qw/vp9_int_pro_col sse2 neon/; - -add_proto qw/int vp9_vector_var/, "int16_t const *ref, int16_t const *src, const int bwl"; -specialize qw/vp9_vector_var neon sse2/; - -if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - add_proto qw/unsigned int vp9_highbd_avg_8x8/, "const uint8_t *, int p"; - specialize qw/vp9_highbd_avg_8x8/; - add_proto qw/unsigned int vp9_highbd_avg_4x4/, "const uint8_t *, int p"; - specialize qw/vp9_highbd_avg_4x4/; - add_proto qw/void vp9_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max"; - specialize qw/vp9_highbd_minmax_8x8/; -} - # ENCODEMB INVOKE # diff --git a/vp9/encoder/arm/neon/vp9_avg_neon.c b/vp9/encoder/arm/neon/vp9_avg_neon.c deleted file mode 100644 index d569ec95d..000000000 --- a/vp9/encoder/arm/neon/vp9_avg_neon.c +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <arm_neon.h> -#include <assert.h> - -#include "./vp9_rtcd.h" -#include "./vpx_config.h" - -#include "vpx/vpx_integer.h" - -static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) { - const uint32x4_t a = vpaddlq_u16(v_16x8); - const uint64x2_t b = vpaddlq_u32(a); - const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); - return vget_lane_u32(c, 0); -} - -unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) { - uint8x8_t v_s0 = vld1_u8(s); - const uint8x8_t v_s1 = vld1_u8(s + p); - uint16x8_t v_sum = vaddl_u8(v_s0, v_s1); - - v_s0 = vld1_u8(s + 2 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 3 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 4 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 5 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 6 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 7 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - return (horizontal_add_u16x8(v_sum) + 32) >> 6; -} - -void vp9_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, - const int ref_stride, const int height) { - int i; - uint16x8_t vec_sum_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_hi = vdupq_n_u16(0); - const int shift_factor = ((height >> 5) + 3) * -1; - const int16x8_t vec_shift = vdupq_n_s16(shift_factor); - - for (i = 0; i < height; i += 8) { - const uint8x16_t vec_row1 = vld1q_u8(ref); - const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride); - const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2); - const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3); - const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4); - const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5); - const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6); - const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8)); - - ref += ref_stride * 8; - } - - vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift); - vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift); - - vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo)); - hbuf += 8; - vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi)); -} - -int16_t vp9_int_pro_col_neon(uint8_t const *ref, const int width) { - int i; - uint16x8_t vec_sum = vdupq_n_u16(0); - - for (i = 0; i < width; i += 16) { - const uint8x16_t vec_row = vld1q_u8(ref); - vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row)); - vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row)); - ref += 16; - } - - return horizontal_add_u16x8(vec_sum); -} - -// ref, src = [0, 510] - max diff = 16-bits -// bwl = {2, 3, 4}, width = {16, 32, 64} -int vp9_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) { - int width = 4 << bwl; - int32x4_t sse = vdupq_n_s32(0); - int16x8_t total = vdupq_n_s16(0); - - assert(width >= 8); - assert((width % 8) == 0); - - do { - const int16x8_t r = vld1q_s16(ref); - const int16x8_t s = vld1q_s16(src); - const int16x8_t diff = vsubq_s16(r, s); // [-510, 510], 10 bits. - const int16x4_t diff_lo = vget_low_s16(diff); - const int16x4_t diff_hi = vget_high_s16(diff); - sse = vmlal_s16(sse, diff_lo, diff_lo); // dynamic range 26 bits. - sse = vmlal_s16(sse, diff_hi, diff_hi); - total = vaddq_s16(total, diff); // dynamic range 16 bits. - - ref += 8; - src += 8; - width -= 8; - } while (width != 0); - - { - // Note: 'total''s pairwise addition could be implemented similarly to - // horizontal_add_u16x8(), but one less vpaddl with 'total' when paired - // with the summation of 'sse' performed better on a Cortex-A15. - const int32x4_t t0 = vpaddlq_s16(total); // cascading summation of 'total' - const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0)); - const int32x2_t t2 = vpadd_s32(t1, t1); - const int t = vget_lane_s32(t2, 0); - const int64x2_t s0 = vpaddlq_s32(sse); // cascading summation of 'sse'. - const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)), - vreinterpret_s32_s64(vget_high_s64(s0))); - const int s = vget_lane_s32(s1, 0); - const int shift_factor = bwl + 2; - return s - ((t * t) >> shift_factor); - } -} diff --git a/vp9/encoder/mips/msa/vp9_avg_msa.c b/vp9/encoder/mips/msa/vp9_avg_msa.c deleted file mode 100644 index 611adb1a2..000000000 --- a/vp9/encoder/mips/msa/vp9_avg_msa.c +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vp9_rtcd.h" -#include "vpx_dsp/mips/macros_msa.h" - -uint32_t vp9_avg_8x8_msa(const uint8_t *src, int32_t src_stride) { - uint32_t sum_out; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7; - v4u32 sum = { 0 }; - - LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); - HADD_UB4_UH(src0, src1, src2, src3, sum0, sum1, sum2, sum3); - HADD_UB4_UH(src4, src5, src6, src7, sum4, sum5, sum6, sum7); - ADD4(sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum0, sum2, sum4, sum6); - ADD2(sum0, sum2, sum4, sum6, sum0, sum4); - sum0 += sum4; - - sum = __msa_hadd_u_w(sum0, sum0); - sum0 = (v8u16)__msa_pckev_h((v8i16)sum, (v8i16)sum); - sum = __msa_hadd_u_w(sum0, sum0); - sum = (v4u32)__msa_srari_w((v4i32)sum, 6); - sum_out = __msa_copy_u_w((v4i32)sum, 0); - - return sum_out; -} - -uint32_t vp9_avg_4x4_msa(const uint8_t *src, int32_t src_stride) { - uint32_t sum_out; - uint32_t src0, src1, src2, src3; - v16u8 vec = { 0 }; - v8u16 sum0; - v4u32 sum1; - v2u64 sum2; - - LW4(src, src_stride, src0, src1, src2, src3); - INSERT_W4_UB(src0, src1, src2, src3, vec); - - sum0 = __msa_hadd_u_h(vec, vec); - sum1 = __msa_hadd_u_w(sum0, sum0); - sum0 = (v8u16)__msa_pckev_h((v8i16)sum1, (v8i16)sum1); - sum1 = __msa_hadd_u_w(sum0, sum0); - sum2 = __msa_hadd_u_d(sum1, sum1); - sum1 = (v4u32)__msa_srari_w((v4i32)sum2, 4); - sum_out = __msa_copy_u_w((v4i32)sum1, 0); - - return sum_out; -} diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c index 0def2cf1f..63db214d1 100644 --- a/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -191,7 +191,8 @@ void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi, BLOCK_SIZE bsize, int64_t rate, int64_t dist, - int skip) { + int skip, + struct macroblock_plane *const p) { const VP9_COMMON *const cm = &cpi->common; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; const int bw = num_8x8_blocks_wide_lookup[bsize]; @@ -199,12 +200,25 @@ void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi, const int xmis = VPXMIN(cm->mi_cols - mi_col, bw); const int ymis = VPXMIN(cm->mi_rows - mi_row, bh); const int block_index = mi_row * cm->mi_cols + mi_col; - const int refresh_this_block = candidate_refresh_aq(cr, mbmi, rate, dist, - bsize); + int refresh_this_block = candidate_refresh_aq(cr, mbmi, rate, dist, bsize); // Default is to not update the refresh map. int new_map_value = cr->map[block_index]; int x = 0; int y = 0; + int is_skin = 0; + if (refresh_this_block == 0 && + bsize <= BLOCK_16X16 && + cpi->oxcf.content != VP9E_CONTENT_SCREEN) { + is_skin = vp9_compute_skin_block(p[0].src.buf, + p[1].src.buf, + p[2].src.buf, + p[0].src.stride, + p[1].src.stride, + bsize); + if (is_skin) + refresh_this_block = 1; + } + // If this block is labeled for refresh, check if we should reset the // segment_id. if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) { diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.h b/vp9/encoder/vp9_aq_cyclicrefresh.h index a5b38138b..edf0a973e 100644 --- a/vp9/encoder/vp9_aq_cyclicrefresh.h +++ b/vp9/encoder/vp9_aq_cyclicrefresh.h @@ -14,6 +14,8 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_blockd.h" +#include "vp9/encoder/vp9_block.h" +#include "vp9/encoder/vp9_skin_detection.h" #ifdef __cplusplus extern "C" { @@ -93,7 +95,8 @@ int vp9_cyclic_refresh_rc_bits_per_mb(const struct VP9_COMP *cpi, int i, void vp9_cyclic_refresh_update_segment(struct VP9_COMP *const cpi, MB_MODE_INFO *const mbmi, int mi_row, int mi_col, BLOCK_SIZE bsize, - int64_t rate, int64_t dist, int skip); + int64_t rate, int64_t dist, int skip, + struct macroblock_plane *const p); void vp9_cyclic_refresh_update_sb_postencode(struct VP9_COMP *const cpi, const MB_MODE_INFO *const mbmi, diff --git a/vp9/encoder/vp9_avg.c b/vp9/encoder/vp9_avg.c deleted file mode 100644 index a9a4c3050..000000000 --- a/vp9/encoder/vp9_avg.c +++ /dev/null @@ -1,230 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ -#include "./vp9_rtcd.h" -#include "vp9/common/vp9_common.h" -#include "vpx_ports/mem.h" - -unsigned int vp9_avg_8x8_c(const uint8_t *s, int p) { - int i, j; - int sum = 0; - for (i = 0; i < 8; ++i, s+=p) - for (j = 0; j < 8; sum += s[j], ++j) {} - - return (sum + 32) >> 6; -} - -unsigned int vp9_avg_4x4_c(const uint8_t *s, int p) { - int i, j; - int sum = 0; - for (i = 0; i < 4; ++i, s+=p) - for (j = 0; j < 4; sum += s[j], ++j) {} - - return (sum + 8) >> 4; -} - -// src_diff: first pass, 9 bit, dynamic range [-255, 255] -// second pass, 12 bit, dynamic range [-2040, 2040] -static void hadamard_col8(const int16_t *src_diff, int src_stride, - int16_t *coeff) { - int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; - int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride]; - int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride]; - int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride]; - int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride]; - int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride]; - int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride]; - int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride]; - - int16_t c0 = b0 + b2; - int16_t c1 = b1 + b3; - int16_t c2 = b0 - b2; - int16_t c3 = b1 - b3; - int16_t c4 = b4 + b6; - int16_t c5 = b5 + b7; - int16_t c6 = b4 - b6; - int16_t c7 = b5 - b7; - - coeff[0] = c0 + c4; - coeff[7] = c1 + c5; - coeff[3] = c2 + c6; - coeff[4] = c3 + c7; - coeff[2] = c0 - c4; - coeff[6] = c1 - c5; - coeff[1] = c2 - c6; - coeff[5] = c3 - c7; -} - -void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride, - int16_t *coeff) { - int idx; - int16_t buffer[64]; - int16_t *tmp_buf = &buffer[0]; - for (idx = 0; idx < 8; ++idx) { - hadamard_col8(src_diff, src_stride, tmp_buf); // src_diff: 9 bit - // dynamic range [-255, 255] - tmp_buf += 8; - ++src_diff; - } - - tmp_buf = &buffer[0]; - for (idx = 0; idx < 8; ++idx) { - hadamard_col8(tmp_buf, 8, coeff); // tmp_buf: 12 bit - // dynamic range [-2040, 2040] - coeff += 8; // coeff: 15 bit - // dynamic range [-16320, 16320] - ++tmp_buf; - } -} - -// In place 16x16 2D Hadamard transform -void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride, - int16_t *coeff) { - int idx; - for (idx = 0; idx < 4; ++idx) { - // src_diff: 9 bit, dynamic range [-255, 255] - int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride - + (idx & 0x01) * 8; - vp9_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64); - } - - // coeff: 15 bit, dynamic range [-16320, 16320] - for (idx = 0; idx < 64; ++idx) { - int16_t a0 = coeff[0]; - int16_t a1 = coeff[64]; - int16_t a2 = coeff[128]; - int16_t a3 = coeff[192]; - - int16_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640] - int16_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range - int16_t b2 = (a2 + a3) >> 1; // [-16320, 16320] - int16_t b3 = (a2 - a3) >> 1; - - coeff[0] = b0 + b2; // 16 bit, [-32640, 32640] - coeff[64] = b1 + b3; - coeff[128] = b0 - b2; - coeff[192] = b1 - b3; - - ++coeff; - } -} - -// coeff: 16 bits, dynamic range [-32640, 32640]. -// length: value range {16, 64, 256, 1024}. -int16_t vp9_satd_c(const int16_t *coeff, int length) { - int i; - int satd = 0; - for (i = 0; i < length; ++i) - satd += abs(coeff[i]); - - // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] - return (int16_t)satd; -} - -// Integer projection onto row vectors. -// height: value range {16, 32, 64}. -void vp9_int_pro_row_c(int16_t hbuf[16], uint8_t const *ref, - const int ref_stride, const int height) { - int idx; - const int norm_factor = height >> 1; - for (idx = 0; idx < 16; ++idx) { - int i; - hbuf[idx] = 0; - // hbuf[idx]: 14 bit, dynamic range [0, 16320]. - for (i = 0; i < height; ++i) - hbuf[idx] += ref[i * ref_stride]; - // hbuf[idx]: 9 bit, dynamic range [0, 510]. - hbuf[idx] /= norm_factor; - ++ref; - } -} - -// width: value range {16, 32, 64}. -int16_t vp9_int_pro_col_c(uint8_t const *ref, const int width) { - int idx; - int16_t sum = 0; - // sum: 14 bit, dynamic range [0, 16320] - for (idx = 0; idx < width; ++idx) - sum += ref[idx]; - return sum; -} - -// ref: [0 - 510] -// src: [0 - 510] -// bwl: {2, 3, 4} -int vp9_vector_var_c(int16_t const *ref, int16_t const *src, - const int bwl) { - int i; - int width = 4 << bwl; - int sse = 0, mean = 0, var; - - for (i = 0; i < width; ++i) { - int diff = ref[i] - src[i]; // diff: dynamic range [-510, 510], 10 bits. - mean += diff; // mean: dynamic range 16 bits. - sse += diff * diff; // sse: dynamic range 26 bits. - } - - // (mean * mean): dynamic range 31 bits. - var = sse - ((mean * mean) >> (bwl + 2)); - return var; -} - -void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, - int *min, int *max) { - int i, j; - *min = 255; - *max = 0; - for (i = 0; i < 8; ++i, s += p, d += dp) { - for (j = 0; j < 8; ++j) { - int diff = abs(s[j]-d[j]); - *min = diff < *min ? diff : *min; - *max = diff > *max ? diff : *max; - } - } -} - -#if CONFIG_VP9_HIGHBITDEPTH -unsigned int vp9_highbd_avg_8x8_c(const uint8_t *s8, int p) { - int i, j; - int sum = 0; - const uint16_t* s = CONVERT_TO_SHORTPTR(s8); - for (i = 0; i < 8; ++i, s+=p) - for (j = 0; j < 8; sum += s[j], ++j) {} - - return (sum + 32) >> 6; -} - -unsigned int vp9_highbd_avg_4x4_c(const uint8_t *s8, int p) { - int i, j; - int sum = 0; - const uint16_t* s = CONVERT_TO_SHORTPTR(s8); - for (i = 0; i < 4; ++i, s+=p) - for (j = 0; j < 4; sum += s[j], ++j) {} - - return (sum + 8) >> 4; -} - -void vp9_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, - int dp, int *min, int *max) { - int i, j; - const uint16_t* s = CONVERT_TO_SHORTPTR(s8); - const uint16_t* d = CONVERT_TO_SHORTPTR(d8); - *min = 255; - *max = 0; - for (i = 0; i < 8; ++i, s += p, d += dp) { - for (j = 0; j < 8; ++j) { - int diff = abs(s[j]-d[j]); - *min = diff < *min ? diff : *min; - *max = diff > *max ? diff : *max; - } - } -} -#endif // CONFIG_VP9_HIGHBITDEPTH - - diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c index e87a12e44..6533902b3 100644 --- a/vp9/encoder/vp9_denoiser.c +++ b/vp9/encoder/vp9_denoiser.c @@ -194,7 +194,8 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser, int mi_col, PICK_MODE_CONTEXT *ctx, int motion_magnitude, - int is_skin) { + int is_skin, + int *zeromv_filter) { int mv_col, mv_row; int sse_diff = ctx->zeromv_sse - ctx->newmv_sse; MV_REFERENCE_FRAME frame; @@ -237,6 +238,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser, mbmi->mv[0].as_int = 0; ctx->best_sse_inter_mode = ZEROMV; ctx->best_sse_mv.as_int = 0; + *zeromv_filter = 1; } if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) { @@ -316,9 +318,11 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser, void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb, int mi_row, int mi_col, BLOCK_SIZE bs, - PICK_MODE_CONTEXT *ctx) { + PICK_MODE_CONTEXT *ctx, + VP9_DENOISER_DECISION *denoiser_decision) { int mv_col, mv_row; int motion_magnitude = 0; + int zeromv_filter = 0; VP9_DENOISER_DECISION decision = COPY_BLOCK; YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME]; YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y; @@ -329,20 +333,12 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb, int is_skin = 0; if (bs <= BLOCK_16X16 && denoiser->denoising_level >= kDenLow) { - // Take center pixel in block to determine is_skin. - const int y_width_shift = (4 << b_width_log2_lookup[bs]) >> 1; - const int y_height_shift = (4 << b_height_log2_lookup[bs]) >> 1; - const int uv_width_shift = y_width_shift >> 1; - const int uv_height_shift = y_height_shift >> 1; - const int stride = mb->plane[0].src.stride; - const int strideuv = mb->plane[1].src.stride; - const uint8_t ysource = - mb->plane[0].src.buf[y_height_shift * stride + y_width_shift]; - const uint8_t usource = - mb->plane[1].src.buf[uv_height_shift * strideuv + uv_width_shift]; - const uint8_t vsource = - mb->plane[2].src.buf[uv_height_shift * strideuv + uv_width_shift]; - is_skin = vp9_skin_pixel(ysource, usource, vsource); + is_skin = vp9_compute_skin_block(mb->plane[0].src.buf, + mb->plane[1].src.buf, + mb->plane[2].src.buf, + mb->plane[0].src.stride, + mb->plane[1].src.stride, + bs); } mv_col = ctx->best_sse_mv.as_mv.col; @@ -359,7 +355,8 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb, denoiser->increase_denoising, mi_row, mi_col, ctx, motion_magnitude, - is_skin); + is_skin, + &zeromv_filter); if (decision == FILTER_BLOCK) { decision = vp9_denoiser_filter(src.buf, src.stride, @@ -380,6 +377,9 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb, num_4x4_blocks_wide_lookup[bs] << 2, num_4x4_blocks_high_lookup[bs] << 2); } + *denoiser_decision = decision; + if (decision == FILTER_BLOCK && zeromv_filter == 1) + *denoiser_decision = FILTER_ZEROMV_BLOCK; } static void copy_frame(YV12_BUFFER_CONFIG * const dest, @@ -458,6 +458,7 @@ void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser, void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx) { ctx->zeromv_sse = UINT_MAX; ctx->newmv_sse = UINT_MAX; + ctx->zeromv_lastref_sse = UINT_MAX; } void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi, unsigned int sse, diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h index bc676e925..d07056b45 100644 --- a/vp9/encoder/vp9_denoiser.h +++ b/vp9/encoder/vp9_denoiser.h @@ -23,7 +23,8 @@ extern "C" { typedef enum vp9_denoiser_decision { COPY_BLOCK, - FILTER_BLOCK + FILTER_BLOCK, + FILTER_ZEROMV_BLOCK } VP9_DENOISER_DECISION; typedef enum vp9_denoiser_level { @@ -54,7 +55,8 @@ void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser, void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb, int mi_row, int mi_col, BLOCK_SIZE bs, - PICK_MODE_CONTEXT *ctx); + PICK_MODE_CONTEXT *ctx , + VP9_DENOISER_DECISION *denoiser_decision); void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx); diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index f9c28f6a9..c07eee969 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -401,7 +401,6 @@ static int set_vt_partitioning(VP9_COMP *cpi, variance_node vt; const int block_width = num_8x8_blocks_wide_lookup[bsize]; const int block_height = num_8x8_blocks_high_lookup[bsize]; - const int low_res = (cm->width <= 352 && cm->height <= 288); assert(block_height == block_width); tree_to_node(data, bsize, &vt); @@ -414,7 +413,7 @@ static int set_vt_partitioning(VP9_COMP *cpi, // No check for vert/horiz split as too few samples for variance. if (bsize == bsize_min) { // Variance already computed to set the force_split. - if (low_res || cm->frame_type == KEY_FRAME) + if (cm->frame_type == KEY_FRAME) get_variance(&vt.part_variances->none); if (mi_col + block_width / 2 < cm->mi_cols && mi_row + block_height / 2 < cm->mi_rows && @@ -425,7 +424,7 @@ static int set_vt_partitioning(VP9_COMP *cpi, return 0; } else if (bsize > bsize_min) { // Variance already computed to set the force_split. - if (low_res || cm->frame_type == KEY_FRAME) + if (cm->frame_type == KEY_FRAME) get_variance(&vt.part_variances->none); // For key frame: take split for bsize above 32X32 or very high variance. if (cm->frame_type == KEY_FRAME && @@ -489,13 +488,16 @@ static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) { thresholds[2] = threshold_base >> 2; thresholds[3] = threshold_base << 2; } else { - // Increase base variance threshold if estimated noise level is high. + // Increase base variance threshold based on estimated noise level. if (cpi->noise_estimate.enabled) { - if (cpi->noise_estimate.level == kHigh) + NOISE_LEVEL noise_level = vp9_noise_estimate_extract_level( + &cpi->noise_estimate); + if (noise_level == kHigh) threshold_base = 3 * threshold_base; - else - if (cpi->noise_estimate.level == kMedium) - threshold_base = threshold_base << 1; + else if (noise_level == kMedium) + threshold_base = threshold_base << 1; + else if (noise_level < kLow) + threshold_base = (7 * threshold_base) >> 3; } if (cm->width <= 352 && cm->height <= 288) { thresholds[0] = threshold_base >> 3; @@ -556,16 +558,16 @@ static int compute_minmax_8x8(const uint8_t *s, int sp, const uint8_t *d, if (x8_idx < pixels_wide && y8_idx < pixels_high) { #if CONFIG_VP9_HIGHBITDEPTH if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) { - vp9_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp, + vpx_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp, d + y8_idx * dp + x8_idx, dp, &min, &max); } else { - vp9_minmax_8x8(s + y8_idx * sp + x8_idx, sp, + vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp, d + y8_idx * dp + x8_idx, dp, &min, &max); } #else - vp9_minmax_8x8(s + y8_idx * sp + x8_idx, sp, + vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp, d + y8_idx * dp + x8_idx, dp, &min, &max); #endif @@ -597,18 +599,18 @@ static void fill_variance_4x4avg(const uint8_t *s, int sp, const uint8_t *d, int d_avg = 128; #if CONFIG_VP9_HIGHBITDEPTH if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) { - s_avg = vp9_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp); + s_avg = vpx_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp); if (!is_key_frame) - d_avg = vp9_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp); + d_avg = vpx_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp); } else { - s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp); + s_avg = vpx_avg_4x4(s + y4_idx * sp + x4_idx, sp); if (!is_key_frame) - d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp); + d_avg = vpx_avg_4x4(d + y4_idx * dp + x4_idx, dp); } #else - s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp); + s_avg = vpx_avg_4x4(s + y4_idx * sp + x4_idx, sp); if (!is_key_frame) - d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp); + d_avg = vpx_avg_4x4(d + y4_idx * dp + x4_idx, dp); #endif sum = s_avg - d_avg; sse = sum * sum; @@ -636,18 +638,18 @@ static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d, int d_avg = 128; #if CONFIG_VP9_HIGHBITDEPTH if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) { - s_avg = vp9_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp); + s_avg = vpx_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp); if (!is_key_frame) - d_avg = vp9_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp); + d_avg = vpx_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp); } else { - s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp); + s_avg = vpx_avg_8x8(s + y8_idx * sp + x8_idx, sp); if (!is_key_frame) - d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp); + d_avg = vpx_avg_8x8(d + y8_idx * dp + x8_idx, dp); } #else - s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp); + s_avg = vpx_avg_8x8(s + y8_idx * sp + x8_idx, sp); if (!is_key_frame) - d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp); + d_avg = vpx_avg_8x8(d + y8_idx * dp + x8_idx, dp); #endif sum = s_avg - d_avg; sse = sum * sum; @@ -668,6 +670,8 @@ static int choose_partitioning(VP9_COMP *cpi, v64x64 vt; v16x16 vt2[16]; int force_split[21]; + int avg_32x32; + int avg_16x16[4]; uint8_t *s; const uint8_t *d; int sp; @@ -676,9 +680,13 @@ static int choose_partitioning(VP9_COMP *cpi, int64_t thresholds[4] = {cpi->vbp_thresholds[0], cpi->vbp_thresholds[1], cpi->vbp_thresholds[2], cpi->vbp_thresholds[3]}; + // For the variance computation under SVC mode, we treat the frame as key if + // the reference (base layer frame) is key frame (i.e., is_key_frame == 1). + const int is_key_frame = (cm->frame_type == KEY_FRAME || + (is_one_pass_cbr_svc(cpi) && + cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)); // Always use 4x4 partition for key frame. - const int is_key_frame = (cm->frame_type == KEY_FRAME); - const int use_4x4_partition = is_key_frame; + const int use_4x4_partition = cm->frame_type == KEY_FRAME; const int low_res = (cm->width <= 352 && cm->height <= 288); int variance4x4downsample[16]; @@ -704,8 +712,7 @@ static int choose_partitioning(VP9_COMP *cpi, s = x->plane[0].src.buf; sp = x->plane[0].src.stride; - if (!is_key_frame && !(is_one_pass_cbr_svc(cpi) && - cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)) { + if (!is_key_frame) { // In the case of spatial/temporal scalable coding, the assumption here is // that the temporal reference frame will always be of type LAST_FRAME. // TODO(marpan): If that assumption is broken, we need to revisit this code. @@ -819,6 +826,7 @@ static int choose_partitioning(VP9_COMP *cpi, const int y32_idx = ((i >> 1) << 5); const int i2 = i << 2; force_split[i + 1] = 0; + avg_16x16[i] = 0; for (j = 0; j < 4; j++) { const int x16_idx = x32_idx + ((j & 1) << 4); const int y16_idx = y32_idx + ((j >> 1) << 4); @@ -836,6 +844,7 @@ static int choose_partitioning(VP9_COMP *cpi, is_key_frame); fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16); get_variance(&vt.split[i].split[j].part_variances.none); + avg_16x16[i] += vt.split[i].split[j].part_variances.none.variance; if (vt.split[i].split[j].part_variances.none.variance > thresholds[2]) { // 16X16 variance is above threshold for split, so force split to 8x8 @@ -843,7 +852,8 @@ static int choose_partitioning(VP9_COMP *cpi, force_split[split_index] = 1; force_split[i + 1] = 1; force_split[0] = 1; - } else if (vt.split[i].split[j].part_variances.none.variance > + } else if (cpi->oxcf.speed < 8 && + vt.split[i].split[j].part_variances.none.variance > thresholds[1] && !cyclic_refresh_segment_id_boosted(segment_id)) { // We have some nominal amount of 16x16 variance (based on average), @@ -861,9 +871,7 @@ static int choose_partitioning(VP9_COMP *cpi, } } } - // TODO(marpan): There is an issue with variance based on 4x4 average in - // svc mode, don't allow it for now. - if (is_key_frame || (low_res && !cpi->use_svc && + if (is_key_frame || (low_res && vt.split[i].split[j].part_variances.none.variance > (thresholds[1] << 1))) { force_split[split_index] = 0; @@ -885,8 +893,8 @@ static int choose_partitioning(VP9_COMP *cpi, } } } - // Fill the rest of the variance tree by summing split partition values. + avg_32x32 = 0; for (i = 0; i < 4; i++) { const int i2 = i << 2; for (j = 0; j < 4; j++) { @@ -896,22 +904,41 @@ static int choose_partitioning(VP9_COMP *cpi, for (m = 0; m < 4; m++) fill_variance_tree(&vtemp->split[m], BLOCK_8X8); fill_variance_tree(vtemp, BLOCK_16X16); + // If variance of this 16x16 block is above the threshold, force block + // to split. This also forces a split on the upper levels. + get_variance(&vtemp->part_variances.none); + if (vtemp->part_variances.none.variance > thresholds[2]) { + force_split[5 + i2 + j] = 1; + force_split[i + 1] = 1; + force_split[0] = 1; + } } } fill_variance_tree(&vt.split[i], BLOCK_32X32); - // If variance of this 32x32 block is above the threshold, force the block - // to split. This also forces a split on the upper (64x64) level. + // If variance of this 32x32 block is above the threshold, or if its above + // (some threshold of) the average variance over the sub-16x16 blocks, then + // force this block to split. This also forces a split on the upper + // (64x64) level. if (!force_split[i + 1]) { get_variance(&vt.split[i].part_variances.none); - if (vt.split[i].part_variances.none.variance > thresholds[1]) { + if (vt.split[i].part_variances.none.variance > thresholds[1] || + (!is_key_frame && + vt.split[i].part_variances.none.variance > (thresholds[1] >> 1) && + vt.split[i].part_variances.none.variance > (avg_16x16[i] >> 1))) { force_split[i + 1] = 1; force_split[0] = 1; } + avg_32x32 += vt.split[i].part_variances.none.variance; } } if (!force_split[0]) { fill_variance_tree(&vt, BLOCK_64X64); get_variance(&vt.part_variances.none); + // If variance of this 64x64 block is above (some threshold of) the average + // variance over the sub-32x32 blocks, then force this block to split. + if (!is_key_frame && + vt.part_variances.none.variance > (5 * avg_32x32) >> 4) + force_split[0] = 1; } // Now go through the entire structure, splitting every block size until @@ -1018,7 +1045,7 @@ static void update_state(VP9_COMP *cpi, ThreadData *td, if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { vp9_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi, mi_row, mi_col, bsize, ctx->rate, ctx->dist, - x->skip); + x->skip, p); } } @@ -1678,6 +1705,7 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td, MACROBLOCKD *const xd = &x->e_mbd; MODE_INFO *const mi = xd->mi[0]; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + struct macroblock_plane *const p = x->plane; const struct segmentation *const seg = &cm->seg; const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type]; const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type]; @@ -1698,7 +1726,7 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td, } else { // Setting segmentation map for cyclic_refresh. vp9_cyclic_refresh_update_segment(cpi, mbmi, mi_row, mi_col, bsize, - ctx->rate, ctx->dist, x->skip); + ctx->rate, ctx->dist, x->skip, p); } vp9_init_plane_quantizers(cpi, x); } @@ -1746,16 +1774,6 @@ static void encode_b_rt(VP9_COMP *cpi, ThreadData *td, set_offsets(cpi, tile, x, mi_row, mi_col, bsize); update_state_rt(cpi, td, ctx, mi_row, mi_col, bsize); -#if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && - output_enabled && - cpi->common.frame_type != KEY_FRAME && - cpi->resize_pending == 0) { - vp9_denoiser_denoise(&cpi->denoiser, x, mi_row, mi_col, - VPXMAX(BLOCK_8X8, bsize), ctx); - } -#endif - encode_superblock(cpi, td, tp, output_enabled, mi_row, mi_col, bsize, ctx); update_stats(&cpi->common, td); @@ -2432,8 +2450,15 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, if (cpi->sf.use_square_partition_only && bsize > cpi->sf.use_square_only_threshold) { + if (cpi->use_svc) { + if (!vp9_active_h_edge(cpi, mi_row, mi_step) || x->e_mbd.lossless) + partition_horz_allowed &= force_horz_split; + if (!vp9_active_v_edge(cpi, mi_row, mi_step) || x->e_mbd.lossless) + partition_vert_allowed &= force_vert_split; + } else { partition_horz_allowed &= force_horz_split; partition_vert_allowed &= force_vert_split; + } } save_context(x, mi_row, mi_col, a, l, sa, sl, bsize); diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index a57cf8725..e4681f601 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1478,7 +1478,11 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { cpi->td.mb.e_mbd.bd = (int)cm->bit_depth; #endif // CONFIG_VP9_HIGHBITDEPTH - rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2; + if ((oxcf->pass == 0) && (oxcf->rc_mode == VPX_Q)) { + rc->baseline_gf_interval = FIXED_GF_INTERVAL; + } else { + rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2; + } cpi->refresh_golden_frame = 0; cpi->refresh_last_frame = 1; @@ -2793,6 +2797,22 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { cpi->resize_pending); } #endif + if (is_one_pass_cbr_svc(cpi)) { + // Keep track of frame index for each reference frame. + SVC *const svc = &cpi->svc; + if (cm->frame_type == KEY_FRAME) { + svc->ref_frame_index[cpi->lst_fb_idx] = svc->current_superframe; + svc->ref_frame_index[cpi->gld_fb_idx] = svc->current_superframe; + svc->ref_frame_index[cpi->alt_fb_idx] = svc->current_superframe; + } else { + if (cpi->refresh_last_frame) + svc->ref_frame_index[cpi->lst_fb_idx] = svc->current_superframe; + if (cpi->refresh_golden_frame) + svc->ref_frame_index[cpi->gld_fb_idx] = svc->current_superframe; + if (cpi->refresh_alt_ref_frame) + svc->ref_frame_index[cpi->alt_fb_idx] = svc->current_superframe; + } + } } static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { @@ -3682,12 +3702,16 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm, if (cm->mi_cols * MI_SIZE != unscaled->y_width || cm->mi_rows * MI_SIZE != unscaled->y_height) { #if CONFIG_VP9_HIGHBITDEPTH - if (use_normative_scaler) + if (use_normative_scaler && + unscaled->y_width <= (scaled->y_width << 1) && + unscaled->y_height <= (scaled->y_height << 1)) scale_and_extend_frame(unscaled, scaled, (int)cm->bit_depth); else scale_and_extend_frame_nonnormative(unscaled, scaled, (int)cm->bit_depth); #else - if (use_normative_scaler) + if (use_normative_scaler && + unscaled->y_width <= (scaled->y_width << 1) && + unscaled->y_height <= (scaled->y_height << 1)) scale_and_extend_frame(unscaled, scaled); else scale_and_extend_frame_nonnormative(unscaled, scaled); diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 327ac1985..a84202bb4 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -1755,7 +1755,7 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) { int center, offset = 0; int bw = 4 << bwl; // redundant variable, to be changed in the experiments. for (d = 0; d <= bw; d += 16) { - this_sad = vp9_vector_var(&ref[d], src, bwl); + this_sad = vpx_vector_var(&ref[d], src, bwl); if (this_sad < best_sad) { best_sad = this_sad; offset = d; @@ -1768,7 +1768,7 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) { // check limit if (this_pos < 0 || this_pos > bw) continue; - this_sad = vp9_vector_var(&ref[this_pos], src, bwl); + this_sad = vpx_vector_var(&ref[this_pos], src, bwl); if (this_sad < best_sad) { best_sad = this_sad; center = this_pos; @@ -1781,7 +1781,7 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) { // check limit if (this_pos < 0 || this_pos > bw) continue; - this_sad = vp9_vector_var(&ref[this_pos], src, bwl); + this_sad = vpx_vector_var(&ref[this_pos], src, bwl); if (this_sad < best_sad) { best_sad = this_sad; center = this_pos; @@ -1794,7 +1794,7 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) { // check limit if (this_pos < 0 || this_pos > bw) continue; - this_sad = vp9_vector_var(&ref[this_pos], src, bwl); + this_sad = vpx_vector_var(&ref[this_pos], src, bwl); if (this_sad < best_sad) { best_sad = this_sad; center = this_pos; @@ -1807,7 +1807,7 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) { // check limit if (this_pos < 0 || this_pos > bw) continue; - this_sad = vp9_vector_var(&ref[this_pos], src, bwl); + this_sad = vpx_vector_var(&ref[this_pos], src, bwl); if (this_sad < best_sad) { best_sad = this_sad; center = this_pos; @@ -1876,25 +1876,25 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, // Set up prediction 1-D reference set ref_buf = xd->plane[0].pre[0].buf - (bw >> 1); for (idx = 0; idx < search_width; idx += 16) { - vp9_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh); + vpx_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh); ref_buf += 16; } ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride; for (idx = 0; idx < search_height; ++idx) { - vbuf[idx] = vp9_int_pro_col(ref_buf, bw) >> norm_factor; + vbuf[idx] = vpx_int_pro_col(ref_buf, bw) >> norm_factor; ref_buf += ref_stride; } // Set up src 1-D reference set for (idx = 0; idx < bw; idx += 16) { src_buf = x->plane[0].src.buf + idx; - vp9_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh); + vpx_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh); } src_buf = x->plane[0].src.buf; for (idx = 0; idx < bh; ++idx) { - src_vbuf[idx] = vp9_int_pro_col(src_buf, bw) >> norm_factor; + src_vbuf[idx] = vpx_int_pro_col(src_buf, bw) >> norm_factor; src_buf += src_stride; } diff --git a/vp9/encoder/vp9_noise_estimate.c b/vp9/encoder/vp9_noise_estimate.c index b41ffd0a3..008a40afc 100644 --- a/vp9/encoder/vp9_noise_estimate.c +++ b/vp9/encoder/vp9_noise_estimate.c @@ -25,7 +25,7 @@ void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) { ne->enabled = 0; - ne->level = kLow; + ne->level = kLowLow; ne->value = 0; ne->count = 0; ne->thresh = 90; @@ -82,6 +82,21 @@ static void copy_frame(YV12_BUFFER_CONFIG * const dest, } } +NOISE_LEVEL vp9_noise_estimate_extract_level(NOISE_ESTIMATE *const ne) { + int noise_level = kLowLow; + if (ne->value > (ne->thresh << 1)) { + noise_level = kHigh; + } else { + if (ne->value > ne->thresh) + noise_level = kMedium; + else if (ne->value > (ne->thresh >> 1)) + noise_level = kLow; + else + noise_level = kLowLow; + } + return noise_level; +} + void vp9_update_noise_estimate(VP9_COMP *const cpi) { const VP9_COMMON *const cm = &cpi->common; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; @@ -130,10 +145,6 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { const uint8_t *src_u = cpi->Source->u_buffer; const uint8_t *src_v = cpi->Source->v_buffer; const int src_uvstride = cpi->Source->uv_stride; - const int y_width_shift = (4 << b_width_log2_lookup[bsize]) >> 1; - const int y_height_shift = (4 << b_height_log2_lookup[bsize]) >> 1; - const int uv_width_shift = y_width_shift >> 1; - const int uv_height_shift = y_height_shift >> 1; int mi_row, mi_col; int num_low_motion = 0; int frame_low_motion = 1; @@ -158,13 +169,12 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { // been encoded as zero/low motion x (= thresh_consec_zeromv) frames // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all // 4 sub-blocks for 16x16 block. Also, avoid skin blocks. - const uint8_t ysource = - src_y[y_height_shift * src_ystride + y_width_shift]; - const uint8_t usource = - src_u[uv_height_shift * src_uvstride + uv_width_shift]; - const uint8_t vsource = - src_v[uv_height_shift * src_uvstride + uv_width_shift]; - int is_skin = vp9_skin_pixel(ysource, usource, vsource); + int is_skin = vp9_compute_skin_block(src_y, + src_u, + src_v, + src_ystride, + src_uvstride, + bsize); if (frame_low_motion && cr->consec_zero_mv[bl_index] > thresh_consec_zeromv && cr->consec_zero_mv[bl_index1] > thresh_consec_zeromv && @@ -220,22 +230,16 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { // Reset counter and check noise level condition. ne->num_frames_estimate = 30; ne->count = 0; - if (ne->value > (ne->thresh << 1)) - ne->level = kHigh; - else - if (ne->value > ne->thresh) - ne->level = kMedium; - else if (ne->value > (ne->thresh >> 1)) - ne->level = kLow; - else - ne->level = kLowLow; + ne->level = vp9_noise_estimate_extract_level(ne); +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0) + vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level); +#endif } } } #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0) { + if (cpi->oxcf.noise_sensitivity > 0) copy_frame(&cpi->denoiser.last_source, cpi->Source); - vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level); - } #endif } diff --git a/vp9/encoder/vp9_noise_estimate.h b/vp9/encoder/vp9_noise_estimate.h index 0d22ef042..826d125b5 100644 --- a/vp9/encoder/vp9_noise_estimate.h +++ b/vp9/encoder/vp9_noise_estimate.h @@ -47,6 +47,8 @@ void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height); +NOISE_LEVEL vp9_noise_estimate_extract_level(NOISE_ESTIMATE *const ne); + void vp9_update_noise_estimate(struct VP9_COMP *const cpi); #ifdef __cplusplus diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 8aafae1d4..b929758ca 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -619,14 +619,14 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist, scan_order->scan, scan_order->iscan); break; case TX_16X16: - vp9_hadamard_16x16(src_diff, diff_stride, (int16_t *)coeff); + vpx_hadamard_16x16(src_diff, diff_stride, (int16_t *)coeff); vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp, p->quant_fp, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_8X8: - vp9_hadamard_8x8(src_diff, diff_stride, (int16_t *)coeff); + vpx_hadamard_8x8(src_diff, diff_stride, (int16_t *)coeff); vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp, p->quant_fp, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, @@ -673,7 +673,7 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist, if (*eob == 1) *rate += (int)abs(qcoeff[0]); else if (*eob > 1) - *rate += (int)vp9_satd((const int16_t *)qcoeff, step << 4); + *rate += vpx_satd((const int16_t *)qcoeff, step << 4); *dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> shift; } @@ -1094,6 +1094,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { VP9_COMMON *const cm = &cpi->common; SPEED_FEATURES *const sf = &cpi->sf; + const SVC *const svc = &cpi->svc; TileInfo *const tile_info = &tile_data->tile_info; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; @@ -1143,6 +1144,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int best_pred_sad = INT_MAX; int best_early_term = 0; int ref_frame_cost[MAX_REF_FRAMES]; + int svc_force_zero_mode[3] = {0}; +#if CONFIG_VP9_TEMPORAL_DENOISING + int64_t zero_last_cost_orig = INT64_MAX; +#endif init_ref_frame_cost(cm, xd, ref_frame_cost); @@ -1193,6 +1198,17 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } else { usable_ref_frame = GOLDEN_FRAME; } + + // If the reference is temporally aligned with current superframe + // (e.g., spatial reference within superframe), constrain the inter mode: + // for now only test zero motion. + if (cpi->use_svc && svc ->force_zero_mode_spatial_ref) { + if (svc->ref_frame_index[cpi->lst_fb_idx] == svc->current_superframe) + svc_force_zero_mode[LAST_FRAME - 1] = 1; + if (svc->ref_frame_index[cpi->gld_fb_idx] == svc->current_superframe) + svc_force_zero_mode[GOLDEN_FRAME - 1] = 1; + } + for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) { const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); @@ -1245,8 +1261,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, continue; ref_frame = ref_mode_set[idx].ref_frame; - if (cpi->use_svc) + if (cpi->use_svc) { ref_frame = ref_mode_set_svc[idx].ref_frame; + if (svc_force_zero_mode[ref_frame - 1] && + frame_mv[this_mode][ref_frame].as_int != 0) + continue; + } + if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue; if (const_motion[ref_frame] && this_mode == NEARMV) @@ -1524,8 +1545,12 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0) + if (cpi->oxcf.noise_sensitivity > 0) { vp9_denoiser_update_frame_stats(mbmi, sse_y, this_mode, ctx); + // Keep track of zero_last cost. + if (ref_frame == LAST_FRAME && frame_mv[this_mode][ref_frame].as_int == 0) + zero_last_cost_orig = this_rdc.rdcost; + } #else (void)ctx; #endif @@ -1683,6 +1708,62 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } } +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && + cpi->resize_pending == 0) { + VP9_DENOISER_DECISION decision = COPY_BLOCK; + vp9_denoiser_denoise(&cpi->denoiser, x, mi_row, mi_col, + VPXMAX(BLOCK_8X8, bsize), ctx, &decision); + // If INTRA or GOLDEN reference was selected, re-evaluate ZEROMV on denoised + // result. Only do this under noise conditions, and if rdcost of ZEROMV on + // original source is not significantly higher than rdcost of best mode. + if (((best_ref_frame == INTRA_FRAME && decision >= FILTER_BLOCK) || + (best_ref_frame == GOLDEN_FRAME && decision == FILTER_ZEROMV_BLOCK)) && + cpi->noise_estimate.enabled && + cpi->noise_estimate.level > kLow && + zero_last_cost_orig < (best_rdc.rdcost << 3)) { + // Check if we should pick ZEROMV on denoised signal. + int rate = 0; + int64_t dist = 0; + mbmi->mode = ZEROMV; + mbmi->ref_frame[0] = LAST_FRAME; + mbmi->ref_frame[1] = NONE; + mbmi->mv[0].as_int = 0; + mbmi->interp_filter = EIGHTTAP; + xd->plane[0].pre[0] = yv12_mb[LAST_FRAME][0]; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); + model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y); + this_rdc.rate = rate + ref_frame_cost[LAST_FRAME] + + cpi->inter_mode_cost[x->mbmi_ext->mode_context[LAST_FRAME]] + [INTER_OFFSET(ZEROMV)]; + this_rdc.dist = dist; + this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, rate, dist); + // Switch to ZEROMV if the rdcost for ZEROMV on denoised source + // is lower than best_ref mode (on original source). + if (this_rdc.rdcost > best_rdc.rdcost) { + this_rdc = best_rdc; + mbmi->mode = best_mode; + mbmi->ref_frame[0] = best_ref_frame; + mbmi->interp_filter = best_pred_filter; + if (best_ref_frame == INTRA_FRAME) + mbmi->mv[0].as_int = INVALID_MV; + else if (best_ref_frame == GOLDEN_FRAME) { + mbmi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int; + if (reuse_inter_pred) { + xd->plane[0].pre[0] = yv12_mb[GOLDEN_FRAME][0]; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); + } + } + mbmi->tx_size = best_tx_size; + x->skip_txfm[0] = best_mode_skip_txfm; + } else { + best_ref_frame = LAST_FRAME; + best_rdc = this_rdc; + } + } + } +#endif + if (cpi->sf.adaptive_rd_thresh) { THR_MODES best_mode_idx = mode_idx[best_ref_frame][mode_offset(mbmi->mode)]; diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index fdff36315..2579c6005 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -833,10 +833,16 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq); if (frame_is_intra_only(cm)) { - // Handle the special case for key frames forced when we have reached - // the maximum key frame interval. Here force the Q to a range - // based on the ambient Q to reduce the risk of popping. - if (rc->this_key_frame_forced) { + if (oxcf->rc_mode == VPX_Q) { + int qindex = cq_level; + double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); + int delta_qindex = vp9_compute_qdelta(rc, q, q * 0.25, + cm->bit_depth); + active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality); + } else if (rc->this_key_frame_forced) { + // Handle the special case for key frames forced when we have reached + // the maximum key frame interval. Here force the Q to a range + // based on the ambient Q to reduce the risk of popping. int qindex = rc->last_boosted_qindex; double last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); int delta_qindex = vp9_compute_qdelta(rc, last_boosted_q, @@ -886,17 +892,28 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, active_best_quality = active_best_quality * 15 / 16; } else if (oxcf->rc_mode == VPX_Q) { - if (!cpi->refresh_alt_ref_frame) { - active_best_quality = cq_level; - } else { - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); - } + int qindex = cq_level; + double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); + int delta_qindex; + if (cpi->refresh_alt_ref_frame) + delta_qindex = vp9_compute_qdelta(rc, q, q * 0.40, cm->bit_depth); + else + delta_qindex = vp9_compute_qdelta(rc, q, q * 0.50, cm->bit_depth); + active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality); } else { active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); } } else { if (oxcf->rc_mode == VPX_Q) { - active_best_quality = cq_level; + int qindex = cq_level; + double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); + double delta_rate[FIXED_GF_INTERVAL] = + {0.50, 1.0, 0.85, 1.0, 0.70, 1.0, 0.85, 1.0}; + int delta_qindex = + vp9_compute_qdelta(rc, q, + q * delta_rate[cm->current_video_frame % + FIXED_GF_INTERVAL], cm->bit_depth); + active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality); } else { // Use the lower of active_worst_quality and recent/average Q. if (cm->current_video_frame > 1) @@ -1075,7 +1092,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, if (!cpi->refresh_alt_ref_frame) { active_best_quality = cq_level; } else { - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); // Modify best quality for second level arfs. For mode VPX_Q this // becomes the baseline frame q. @@ -1257,8 +1274,12 @@ static void update_golden_frame_stats(VP9_COMP *cpi) { rc->frames_since_golden = 0; // If we are not using alt ref in the up and coming group clear the arf - // active flag. - if (!rc->source_alt_ref_pending) { + // active flag. In multi arf group case, if the index is not 0 then + // we are overlaying a mid group arf so should not reset the flag. + if (cpi->oxcf.pass == 2) { + if (!rc->source_alt_ref_pending && (cpi->twopass.gf_group.index == 0)) + rc->source_alt_ref_active = 0; + } else if (!rc->source_alt_ref_pending) { rc->source_alt_ref_active = 0; } @@ -1309,9 +1330,9 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { } } } else { - if (rc->is_src_frame_alt_ref || - !(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) || - (cpi->use_svc && oxcf->rc_mode == VPX_CBR)) { + if ((cpi->use_svc && oxcf->rc_mode == VPX_CBR) || + (!rc->is_src_frame_alt_ref && + !(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) { rc->last_q[INTER_FRAME] = qindex; rc->avg_frame_qindex[INTER_FRAME] = ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2); @@ -1718,29 +1739,36 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi, RATE_CONTROL *const rc) { const VP9EncoderConfig *const oxcf = &cpi->oxcf; - // Set Maximum gf/arf interval - rc->max_gf_interval = oxcf->max_gf_interval; - rc->min_gf_interval = oxcf->min_gf_interval; - if (rc->min_gf_interval == 0) - rc->min_gf_interval = vp9_rc_get_default_min_gf_interval( - oxcf->width, oxcf->height, cpi->framerate); - if (rc->max_gf_interval == 0) - rc->max_gf_interval = vp9_rc_get_default_max_gf_interval( - cpi->framerate, rc->min_gf_interval); + // Special case code for 1 pass fixed Q mode tests + if ((oxcf->pass == 0) && (oxcf->rc_mode == VPX_Q)) { + rc->max_gf_interval = FIXED_GF_INTERVAL; + rc->min_gf_interval = FIXED_GF_INTERVAL; + rc->static_scene_max_gf_interval = FIXED_GF_INTERVAL; + } else { + // Set Maximum gf/arf interval + rc->max_gf_interval = oxcf->max_gf_interval; + rc->min_gf_interval = oxcf->min_gf_interval; + if (rc->min_gf_interval == 0) + rc->min_gf_interval = vp9_rc_get_default_min_gf_interval( + oxcf->width, oxcf->height, cpi->framerate); + if (rc->max_gf_interval == 0) + rc->max_gf_interval = vp9_rc_get_default_max_gf_interval( + cpi->framerate, rc->min_gf_interval); + + // Extended interval for genuinely static scenes + rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2; + + if (is_altref_enabled(cpi)) { + if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1) + rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1; + } - // Extended interval for genuinely static scenes - rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2; + if (rc->max_gf_interval > rc->static_scene_max_gf_interval) + rc->max_gf_interval = rc->static_scene_max_gf_interval; - if (is_altref_enabled(cpi)) { - if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1) - rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1; + // Clamp min to max + rc->min_gf_interval = VPXMIN(rc->min_gf_interval, rc->max_gf_interval); } - - if (rc->max_gf_interval > rc->static_scene_max_gf_interval) - rc->max_gf_interval = rc->static_scene_max_gf_interval; - - // Clamp min to max - rc->min_gf_interval = VPXMIN(rc->min_gf_interval, rc->max_gf_interval); } void vp9_rc_update_framerate(VP9_COMP *cpi) { diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index 136fd3e78..3df909cb1 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -26,6 +26,7 @@ extern "C" { #define MIN_GF_INTERVAL 4 #define MAX_GF_INTERVAL 16 +#define FIXED_GF_INTERVAL 8 // Used in some testing modes only #define ONEHALFONLY_RESIZE 0 typedef enum { diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 2a6b70703..bcd8f013f 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -1349,11 +1349,25 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, const InterpKernel *kernel = vp9_filter_kernels[mi->mbmi.interp_filter]; for (ref = 0; ref < 1 + is_compound; ++ref) { - const uint8_t *pre = &pd->pre[ref].buf[vp9_raster_block_offset(BLOCK_8X8, i, - pd->pre[ref].stride)]; + const int bw = b_width_log2_lookup[BLOCK_8X8]; + const int h = 4 * (i >> bw); + const int w = 4 * (i & ((1 << bw) - 1)); + const struct scale_factors *sf = &xd->block_refs[ref]->sf; + int y_stride = pd->pre[ref].stride; + uint8_t *pre = pd->pre[ref].buf + (h * pd->pre[ref].stride + w); + + if (vp9_is_scaled(sf)) { + const int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)); + const int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)); + + y_stride = xd->block_refs[ref]->buf->y_stride; + pre = xd->block_refs[ref]->buf->y_buffer; + pre += scaled_buffer_offset(x_start + w, y_start + h, + y_stride, sf); + } #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vp9_highbd_build_inter_predictor(pre, pd->pre[ref].stride, + vp9_highbd_build_inter_predictor(pre, y_stride, dst, pd->dst.stride, &mi->bmi[i].as_mv[ref].as_mv, &xd->block_refs[ref]->sf, width, height, @@ -1361,7 +1375,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, mi_col * MI_SIZE + 4 * (i % 2), mi_row * MI_SIZE + 4 * (i / 2), xd->bd); } else { - vp9_build_inter_predictor(pre, pd->pre[ref].stride, + vp9_build_inter_predictor(pre, y_stride, dst, pd->dst.stride, &mi->bmi[i].as_mv[ref].as_mv, &xd->block_refs[ref]->sf, width, height, ref, @@ -1370,7 +1384,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, mi_row * MI_SIZE + 4 * (i / 2)); } #else - vp9_build_inter_predictor(pre, pd->pre[ref].stride, + vp9_build_inter_predictor(pre, y_stride, dst, pd->dst.stride, &mi->bmi[i].as_mv[ref].as_mv, &xd->block_refs[ref]->sf, width, height, ref, diff --git a/vp9/encoder/vp9_skin_detection.c b/vp9/encoder/vp9_skin_detection.c index c2763b7da..0ca166536 100644 --- a/vp9/encoder/vp9_skin_detection.c +++ b/vp9/encoder/vp9_skin_detection.c @@ -48,6 +48,20 @@ int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr) { return (evaluate_skin_color_difference(cb, cr) < skin_threshold); } +int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, + int stride, int strideuv, int bsize) { + // Take center pixel in block to determine is_skin. + const int y_width_shift = (4 << b_width_log2_lookup[bsize]) >> 1; + const int y_height_shift = (4 << b_height_log2_lookup[bsize]) >> 1; + const int uv_width_shift = y_width_shift >> 1; + const int uv_height_shift = y_height_shift >> 1; + const uint8_t ysource = y[y_height_shift * stride + y_width_shift]; + const uint8_t usource = u[uv_height_shift * strideuv + uv_width_shift]; + const uint8_t vsource = v[uv_height_shift * strideuv + uv_width_shift]; + return vp9_skin_pixel(ysource, usource, vsource); +} + + #ifdef OUTPUT_YUV_SKINMAP // For viewing skin map on input source. void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) { diff --git a/vp9/encoder/vp9_skin_detection.h b/vp9/encoder/vp9_skin_detection.h index 0a87ef9f4..73f7c39d9 100644 --- a/vp9/encoder/vp9_skin_detection.h +++ b/vp9/encoder/vp9_skin_detection.h @@ -23,6 +23,9 @@ struct VP9_COMP; int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr); +int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, + int stride, int strideuv, int bsize); + #ifdef OUTPUT_YUV_SKINMAP // For viewing skin map on input source. void vp9_compute_skin_map(struct VP9_COMP *const cpi, FILE *yuv_skinmap_file); diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 318d8100c..c5f0bad8f 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -394,7 +394,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, sf->intra_y_mode_bsize_mask[i] = INTRA_DC_TM_H_V; } else { for (i = 0; i < BLOCK_SIZES; ++i) - if (i >= BLOCK_16X16) + if (i > BLOCK_16X16) sf->intra_y_mode_bsize_mask[i] = INTRA_DC; else // Use H and V intra mode for block sizes <= 16X16. diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 13da155c7..30a7d1013 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -25,13 +25,23 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { const VP9EncoderConfig *const oxcf = &cpi->oxcf; int mi_rows = cpi->common.mi_rows; int mi_cols = cpi->common.mi_cols; - int sl, tl; + int sl, tl, i; int alt_ref_idx = svc->number_spatial_layers; svc->spatial_layer_id = 0; svc->temporal_layer_id = 0; svc->first_spatial_layer_to_encode = 0; svc->rc_drop_superframe = 0; + svc->force_zero_mode_spatial_ref = 0; + svc->current_superframe = 0; + for (i = 0; i < REF_FRAMES; ++i) + svc->ref_frame_index[i] = -1; + for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { + cpi->svc.ext_frame_flags[sl] = 0; + cpi->svc.ext_lst_fb_idx[sl] = 0; + cpi->svc.ext_gld_fb_idx[sl] = 1; + cpi->svc.ext_alt_fb_idx[sl] = 2; + } if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) { if (vpx_realloc_frame_buffer(&cpi->svc.empty_frame.img, @@ -279,7 +289,7 @@ void vp9_restore_layer_context(VP9_COMP *const cpi) { // Reset the frames_since_key and frames_to_key counters to their values // before the layer restore. Keep these defined for the stream (not layer). if (cpi->svc.number_temporal_layers > 1 || - cpi->svc.number_spatial_layers > 1) { + (cpi->svc.number_spatial_layers > 1 && !is_two_pass_svc(cpi))) { cpi->rc.frames_since_key = old_frame_since_key; cpi->rc.frames_to_key = old_frame_to_key; } @@ -353,6 +363,8 @@ void vp9_inc_frame_in_layer(VP9_COMP *const cpi) { cpi->svc.number_temporal_layers]; ++lc->current_video_frame_in_layer; ++lc->frames_from_key_frame; + if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) + ++cpi->svc.current_superframe; } int vp9_is_upper_layer_key_frame(const VP9_COMP *const cpi) { @@ -542,6 +554,7 @@ static void set_flags_and_fb_idx_for_temporal_mode_noLayering( int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { int width = 0, height = 0; LAYER_CONTEXT *lc = NULL; + cpi->svc.force_zero_mode_spatial_ref = 1; if (cpi->svc.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) { set_flags_and_fb_idx_for_temporal_mode3(cpi); @@ -559,6 +572,8 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { // Note that the check (cpi->ext_refresh_frame_flags_pending == 0) is // needed to support the case where the frame flags may be passed in via // vpx_codec_encode(), which can be used for the temporal-only svc case. + // TODO(marpan): Consider adding an enc_config parameter to better handle + // this case. if (cpi->ext_refresh_frame_flags_pending == 0) { int sl; cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode; diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h index 5dbf9b418..1f446d743 100644 --- a/vp9/encoder/vp9_svc_layercontext.h +++ b/vp9/encoder/vp9_svc_layercontext.h @@ -83,6 +83,9 @@ typedef struct { int ext_lst_fb_idx[VPX_MAX_LAYERS]; int ext_gld_fb_idx[VPX_MAX_LAYERS]; int ext_alt_fb_idx[VPX_MAX_LAYERS]; + int ref_frame_index[REF_FRAMES]; + int force_zero_mode_spatial_ref; + int current_superframe; } SVC; struct VP9_COMP; diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index 16f9c8573..015dbc0ca 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -135,15 +135,38 @@ void vp9_temporal_filter_apply_c(uint8_t *frame1, for (i = 0, k = 0; i < block_height; i++) { for (j = 0; j < block_width; j++, k++) { - int src_byte = frame1[byte]; - int pixel_value = *frame2++; - - modifier = src_byte - pixel_value; - // This is an integer approximation of: - // float coeff = (3.0 * modifer * modifier) / pow(2, strength); - // modifier = (int)roundf(coeff > 16 ? 0 : 16-coeff); - modifier *= modifier; - modifier *= 3; + int pixel_value = *frame2; + + // non-local mean approach + int diff_sse[9] = { 0 }; + int idx, idy, index = 0; + + for (idy = -1; idy <= 1; ++idy) { + for (idx = -1; idx <= 1; ++idx) { + int row = i + idy; + int col = j + idx; + + if (row >= 0 && row < (int)block_height && + col >= 0 && col < (int)block_width) { + int diff = frame1[byte + idy * (int)stride + idx] - + frame2[idy * (int)block_width + idx]; + diff_sse[index] = diff * diff; + ++index; + } + } + } + + assert(index > 0); + + modifier = 0; + for (idx = 0; idx < 9; ++idx) + modifier += diff_sse[idx]; + + modifier *= 3; + modifier /= index; + + ++frame2; + modifier += rounding; modifier >>= strength; @@ -182,15 +205,34 @@ void vp9_highbd_temporal_filter_apply_c(uint8_t *frame1_8, for (i = 0, k = 0; i < block_height; i++) { for (j = 0; j < block_width; j++, k++) { - int src_byte = frame1[byte]; - int pixel_value = *frame2++; - - modifier = src_byte - pixel_value; - // This is an integer approximation of: - // float coeff = (3.0 * modifer * modifier) / pow(2, strength); - // modifier = (int)roundf(coeff > 16 ? 0 : 16-coeff); - modifier *= modifier; + int pixel_value = *frame2; + int diff_sse[9] = { 0 }; + int idx, idy, index = 0; + + for (idy = -1; idy <= 1; ++idy) { + for (idx = -1; idx <= 1; ++idx) { + int row = i + idy; + int col = j + idx; + + if (row >= 0 && row < (int)block_height && + col >= 0 && col < (int)block_width) { + int diff = frame1[byte + idy * (int)stride + idx] - + frame2[idy * (int)block_width + idx]; + diff_sse[index] = diff * diff; + ++index; + } + } + } + assert(index > 0); + + modifier = 0; + for (idx = 0; idx < 9; ++idx) + modifier += diff_sse[idx]; + modifier *= 3; + modifier /= index; + + ++frame2; modifier += rounding; modifier >>= strength; @@ -383,55 +425,58 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { int adj_strength = strength + 2 * (mbd->bd - 8); // Apply the filter (YUV) - vp9_highbd_temporal_filter_apply(f->y_buffer + mb_y_offset, - f->y_stride, - predictor, 16, 16, adj_strength, - filter_weight, - accumulator, count); - vp9_highbd_temporal_filter_apply(f->u_buffer + mb_uv_offset, - f->uv_stride, predictor + 256, - mb_uv_width, mb_uv_height, - adj_strength, - filter_weight, accumulator + 256, - count + 256); - vp9_highbd_temporal_filter_apply(f->v_buffer + mb_uv_offset, - f->uv_stride, predictor + 512, - mb_uv_width, mb_uv_height, - adj_strength, filter_weight, - accumulator + 512, count + 512); + vp9_highbd_temporal_filter_apply_c(f->y_buffer + mb_y_offset, + f->y_stride, + predictor, 16, 16, adj_strength, + filter_weight, + accumulator, count); + vp9_highbd_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, + f->uv_stride, predictor + 256, + mb_uv_width, mb_uv_height, + adj_strength, + filter_weight, accumulator + 256, + count + 256); + vp9_highbd_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, + f->uv_stride, predictor + 512, + mb_uv_width, mb_uv_height, + adj_strength, filter_weight, + accumulator + 512, count + 512); } else { // Apply the filter (YUV) - vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride, + vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride, + predictor, 16, 16, + strength, filter_weight, + accumulator, count); + vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, + f->uv_stride, + predictor + 256, + mb_uv_width, mb_uv_height, strength, + filter_weight, accumulator + 256, + count + 256); + vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, + f->uv_stride, + predictor + 512, + mb_uv_width, mb_uv_height, strength, + filter_weight, accumulator + 512, + count + 512); + } +#else + // Apply the filter (YUV) + // TODO(jingning): Need SIMD optimization for this. + vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16, strength, filter_weight, accumulator, count); - vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride, + vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256, mb_uv_width, mb_uv_height, strength, filter_weight, accumulator + 256, count + 256); - vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride, + vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512, mb_uv_width, mb_uv_height, strength, filter_weight, accumulator + 512, count + 512); - } -#else - // Apply the filter (YUV) - vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride, - predictor, 16, 16, - strength, filter_weight, - accumulator, count); - vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride, - predictor + 256, - mb_uv_width, mb_uv_height, strength, - filter_weight, accumulator + 256, - count + 256); - vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride, - predictor + 512, - mb_uv_width, mb_uv_height, strength, - filter_weight, accumulator + 512, - count + 512); #endif // CONFIG_VP9_HIGHBITDEPTH } } diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vp9/encoder/x86/vp9_avg_intrin_sse2.c deleted file mode 100644 index 4531d794a..000000000 --- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c +++ /dev/null @@ -1,424 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <emmintrin.h> - -#include "./vp9_rtcd.h" -#include "vpx_ports/mem.h" - -void vp9_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, - int *min, int *max) { - __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff; - u0 = _mm_setzero_si128(); - // Row 0 - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); - d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0); - diff = _mm_subs_epi16(s0, d0); - negdiff = _mm_subs_epi16(u0, diff); - absdiff0 = _mm_max_epi16(diff, negdiff); - // Row 1 - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); - d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0); - diff = _mm_subs_epi16(s0, d0); - negdiff = _mm_subs_epi16(u0, diff); - absdiff = _mm_max_epi16(diff, negdiff); - maxabsdiff = _mm_max_epi16(absdiff0, absdiff); - minabsdiff = _mm_min_epi16(absdiff0, absdiff); - // Row 2 - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); - d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0); - diff = _mm_subs_epi16(s0, d0); - negdiff = _mm_subs_epi16(u0, diff); - absdiff = _mm_max_epi16(diff, negdiff); - maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); - minabsdiff = _mm_min_epi16(minabsdiff, absdiff); - // Row 3 - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); - d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0); - diff = _mm_subs_epi16(s0, d0); - negdiff = _mm_subs_epi16(u0, diff); - absdiff = _mm_max_epi16(diff, negdiff); - maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); - minabsdiff = _mm_min_epi16(minabsdiff, absdiff); - // Row 4 - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0); - d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0); - diff = _mm_subs_epi16(s0, d0); - negdiff = _mm_subs_epi16(u0, diff); - absdiff = _mm_max_epi16(diff, negdiff); - maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); - minabsdiff = _mm_min_epi16(minabsdiff, absdiff); - // Row 5 - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0); - d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0); - diff = _mm_subs_epi16(s0, d0); - negdiff = _mm_subs_epi16(u0, diff); - absdiff = _mm_max_epi16(diff, negdiff); - maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); - minabsdiff = _mm_min_epi16(minabsdiff, absdiff); - // Row 6 - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0); - d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0); - diff = _mm_subs_epi16(s0, d0); - negdiff = _mm_subs_epi16(u0, diff); - absdiff = _mm_max_epi16(diff, negdiff); - maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); - minabsdiff = _mm_min_epi16(minabsdiff, absdiff); - // Row 7 - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0); - d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0); - diff = _mm_subs_epi16(s0, d0); - negdiff = _mm_subs_epi16(u0, diff); - absdiff = _mm_max_epi16(diff, negdiff); - maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); - minabsdiff = _mm_min_epi16(minabsdiff, absdiff); - - maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8)); - maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32)); - maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16)); - *max = _mm_extract_epi16(maxabsdiff, 0); - - minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8)); - minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32)); - minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16)); - *min = _mm_extract_epi16(minabsdiff, 0); -} - -unsigned int vp9_avg_8x8_sse2(const uint8_t *s, int p) { - __m128i s0, s1, u0; - unsigned int avg = 0; - u0 = _mm_setzero_si128(); - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); - s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); - s0 = _mm_adds_epu16(s0, s1); - s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); - s0 = _mm_adds_epu16(s0, s1); - s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); - s0 = _mm_adds_epu16(s0, s1); - s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0); - s0 = _mm_adds_epu16(s0, s1); - s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0); - s0 = _mm_adds_epu16(s0, s1); - s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0); - s0 = _mm_adds_epu16(s0, s1); - s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0); - s0 = _mm_adds_epu16(s0, s1); - - s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8)); - s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32)); - s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16)); - avg = _mm_extract_epi16(s0, 0); - return (avg + 32) >> 6; -} - -unsigned int vp9_avg_4x4_sse2(const uint8_t *s, int p) { - __m128i s0, s1, u0; - unsigned int avg = 0; - u0 = _mm_setzero_si128(); - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); - s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); - s0 = _mm_adds_epu16(s0, s1); - s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); - s0 = _mm_adds_epu16(s0, s1); - s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); - s0 = _mm_adds_epu16(s0, s1); - - s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4)); - s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16)); - avg = _mm_extract_epi16(s0, 0); - return (avg + 8) >> 4; -} - -static void hadamard_col8_sse2(__m128i *in, int iter) { - __m128i a0 = in[0]; - __m128i a1 = in[1]; - __m128i a2 = in[2]; - __m128i a3 = in[3]; - __m128i a4 = in[4]; - __m128i a5 = in[5]; - __m128i a6 = in[6]; - __m128i a7 = in[7]; - - __m128i b0 = _mm_add_epi16(a0, a1); - __m128i b1 = _mm_sub_epi16(a0, a1); - __m128i b2 = _mm_add_epi16(a2, a3); - __m128i b3 = _mm_sub_epi16(a2, a3); - __m128i b4 = _mm_add_epi16(a4, a5); - __m128i b5 = _mm_sub_epi16(a4, a5); - __m128i b6 = _mm_add_epi16(a6, a7); - __m128i b7 = _mm_sub_epi16(a6, a7); - - a0 = _mm_add_epi16(b0, b2); - a1 = _mm_add_epi16(b1, b3); - a2 = _mm_sub_epi16(b0, b2); - a3 = _mm_sub_epi16(b1, b3); - a4 = _mm_add_epi16(b4, b6); - a5 = _mm_add_epi16(b5, b7); - a6 = _mm_sub_epi16(b4, b6); - a7 = _mm_sub_epi16(b5, b7); - - if (iter == 0) { - b0 = _mm_add_epi16(a0, a4); - b7 = _mm_add_epi16(a1, a5); - b3 = _mm_add_epi16(a2, a6); - b4 = _mm_add_epi16(a3, a7); - b2 = _mm_sub_epi16(a0, a4); - b6 = _mm_sub_epi16(a1, a5); - b1 = _mm_sub_epi16(a2, a6); - b5 = _mm_sub_epi16(a3, a7); - - a0 = _mm_unpacklo_epi16(b0, b1); - a1 = _mm_unpacklo_epi16(b2, b3); - a2 = _mm_unpackhi_epi16(b0, b1); - a3 = _mm_unpackhi_epi16(b2, b3); - a4 = _mm_unpacklo_epi16(b4, b5); - a5 = _mm_unpacklo_epi16(b6, b7); - a6 = _mm_unpackhi_epi16(b4, b5); - a7 = _mm_unpackhi_epi16(b6, b7); - - b0 = _mm_unpacklo_epi32(a0, a1); - b1 = _mm_unpacklo_epi32(a4, a5); - b2 = _mm_unpackhi_epi32(a0, a1); - b3 = _mm_unpackhi_epi32(a4, a5); - b4 = _mm_unpacklo_epi32(a2, a3); - b5 = _mm_unpacklo_epi32(a6, a7); - b6 = _mm_unpackhi_epi32(a2, a3); - b7 = _mm_unpackhi_epi32(a6, a7); - - in[0] = _mm_unpacklo_epi64(b0, b1); - in[1] = _mm_unpackhi_epi64(b0, b1); - in[2] = _mm_unpacklo_epi64(b2, b3); - in[3] = _mm_unpackhi_epi64(b2, b3); - in[4] = _mm_unpacklo_epi64(b4, b5); - in[5] = _mm_unpackhi_epi64(b4, b5); - in[6] = _mm_unpacklo_epi64(b6, b7); - in[7] = _mm_unpackhi_epi64(b6, b7); - } else { - in[0] = _mm_add_epi16(a0, a4); - in[7] = _mm_add_epi16(a1, a5); - in[3] = _mm_add_epi16(a2, a6); - in[4] = _mm_add_epi16(a3, a7); - in[2] = _mm_sub_epi16(a0, a4); - in[6] = _mm_sub_epi16(a1, a5); - in[1] = _mm_sub_epi16(a2, a6); - in[5] = _mm_sub_epi16(a3, a7); - } -} - -void vp9_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride, - int16_t *coeff) { - __m128i src[8]; - src[0] = _mm_load_si128((const __m128i *)src_diff); - src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - - hadamard_col8_sse2(src, 0); - hadamard_col8_sse2(src, 1); - - _mm_store_si128((__m128i *)coeff, src[0]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[1]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[2]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[3]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[4]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[5]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[6]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[7]); -} - -void vp9_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride, - int16_t *coeff) { - int idx; - for (idx = 0; idx < 4; ++idx) { - int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride - + (idx & 0x01) * 8; - vp9_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64); - } - - for (idx = 0; idx < 64; idx += 8) { - __m128i coeff0 = _mm_load_si128((const __m128i *)coeff); - __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64)); - __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128)); - __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192)); - - __m128i b0 = _mm_add_epi16(coeff0, coeff1); - __m128i b1 = _mm_sub_epi16(coeff0, coeff1); - __m128i b2 = _mm_add_epi16(coeff2, coeff3); - __m128i b3 = _mm_sub_epi16(coeff2, coeff3); - - b0 = _mm_srai_epi16(b0, 1); - b1 = _mm_srai_epi16(b1, 1); - b2 = _mm_srai_epi16(b2, 1); - b3 = _mm_srai_epi16(b3, 1); - - coeff0 = _mm_add_epi16(b0, b2); - coeff1 = _mm_add_epi16(b1, b3); - _mm_store_si128((__m128i *)coeff, coeff0); - _mm_store_si128((__m128i *)(coeff + 64), coeff1); - - coeff2 = _mm_sub_epi16(b0, b2); - coeff3 = _mm_sub_epi16(b1, b3); - _mm_store_si128((__m128i *)(coeff + 128), coeff2); - _mm_store_si128((__m128i *)(coeff + 192), coeff3); - - coeff += 8; - } -} - -int16_t vp9_satd_sse2(const int16_t *coeff, int length) { - int i; - __m128i sum = _mm_load_si128((const __m128i *)coeff); - __m128i sign = _mm_srai_epi16(sum, 15); - __m128i val = _mm_xor_si128(sum, sign); - sum = _mm_sub_epi16(val, sign); - coeff += 8; - - for (i = 8; i < length; i += 8) { - __m128i src_line = _mm_load_si128((const __m128i *)coeff); - sign = _mm_srai_epi16(src_line, 15); - val = _mm_xor_si128(src_line, sign); - val = _mm_sub_epi16(val, sign); - sum = _mm_add_epi16(sum, val); - coeff += 8; - } - - val = _mm_srli_si128(sum, 8); - sum = _mm_add_epi16(sum, val); - val = _mm_srli_epi64(sum, 32); - sum = _mm_add_epi16(sum, val); - val = _mm_srli_epi32(sum, 16); - sum = _mm_add_epi16(sum, val); - - return _mm_extract_epi16(sum, 0); -} - -void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref, - const int ref_stride, const int height) { - int idx; - __m128i zero = _mm_setzero_si128(); - __m128i src_line = _mm_loadu_si128((const __m128i *)ref); - __m128i s0 = _mm_unpacklo_epi8(src_line, zero); - __m128i s1 = _mm_unpackhi_epi8(src_line, zero); - __m128i t0, t1; - int height_1 = height - 1; - ref += ref_stride; - - for (idx = 1; idx < height_1; idx += 2) { - src_line = _mm_loadu_si128((const __m128i *)ref); - t0 = _mm_unpacklo_epi8(src_line, zero); - t1 = _mm_unpackhi_epi8(src_line, zero); - s0 = _mm_adds_epu16(s0, t0); - s1 = _mm_adds_epu16(s1, t1); - ref += ref_stride; - - src_line = _mm_loadu_si128((const __m128i *)ref); - t0 = _mm_unpacklo_epi8(src_line, zero); - t1 = _mm_unpackhi_epi8(src_line, zero); - s0 = _mm_adds_epu16(s0, t0); - s1 = _mm_adds_epu16(s1, t1); - ref += ref_stride; - } - - src_line = _mm_loadu_si128((const __m128i *)ref); - t0 = _mm_unpacklo_epi8(src_line, zero); - t1 = _mm_unpackhi_epi8(src_line, zero); - s0 = _mm_adds_epu16(s0, t0); - s1 = _mm_adds_epu16(s1, t1); - - if (height == 64) { - s0 = _mm_srai_epi16(s0, 5); - s1 = _mm_srai_epi16(s1, 5); - } else if (height == 32) { - s0 = _mm_srai_epi16(s0, 4); - s1 = _mm_srai_epi16(s1, 4); - } else { - s0 = _mm_srai_epi16(s0, 3); - s1 = _mm_srai_epi16(s1, 3); - } - - _mm_storeu_si128((__m128i *)hbuf, s0); - hbuf += 8; - _mm_storeu_si128((__m128i *)hbuf, s1); -} - -int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) { - __m128i zero = _mm_setzero_si128(); - __m128i src_line = _mm_load_si128((const __m128i *)ref); - __m128i s0 = _mm_sad_epu8(src_line, zero); - __m128i s1; - int i; - - for (i = 16; i < width; i += 16) { - ref += 16; - src_line = _mm_load_si128((const __m128i *)ref); - s1 = _mm_sad_epu8(src_line, zero); - s0 = _mm_adds_epu16(s0, s1); - } - - s1 = _mm_srli_si128(s0, 8); - s0 = _mm_adds_epu16(s0, s1); - - return _mm_extract_epi16(s0, 0); -} - -int vp9_vector_var_sse2(int16_t const *ref, int16_t const *src, - const int bwl) { - int idx; - int width = 4 << bwl; - int16_t mean; - __m128i v0 = _mm_loadu_si128((const __m128i *)ref); - __m128i v1 = _mm_load_si128((const __m128i *)src); - __m128i diff = _mm_subs_epi16(v0, v1); - __m128i sum = diff; - __m128i sse = _mm_madd_epi16(diff, diff); - - ref += 8; - src += 8; - - for (idx = 8; idx < width; idx += 8) { - v0 = _mm_loadu_si128((const __m128i *)ref); - v1 = _mm_load_si128((const __m128i *)src); - diff = _mm_subs_epi16(v0, v1); - - sum = _mm_add_epi16(sum, diff); - v0 = _mm_madd_epi16(diff, diff); - sse = _mm_add_epi32(sse, v0); - - ref += 8; - src += 8; - } - - v0 = _mm_srli_si128(sum, 8); - sum = _mm_add_epi16(sum, v0); - v0 = _mm_srli_epi64(sum, 32); - sum = _mm_add_epi16(sum, v0); - v0 = _mm_srli_epi32(sum, 16); - sum = _mm_add_epi16(sum, v0); - - v1 = _mm_srli_si128(sse, 8); - sse = _mm_add_epi32(sse, v1); - v1 = _mm_srli_epi64(sse, 32); - sse = _mm_add_epi32(sse, v1); - - mean = _mm_extract_epi16(sum, 0); - - return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2)); -} diff --git a/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm deleted file mode 100644 index 74c52df19..000000000 --- a/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm +++ /dev/null @@ -1,121 +0,0 @@ -; -; Copyright (c) 2014 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%define private_prefix vp9 - -%include "third_party/x86inc/x86inc.asm" - -; This file provides SSSE3 version of the forward transformation. Part -; of the macro definitions are originally derived from the ffmpeg project. -; The current version applies to x86 64-bit only. - -SECTION .text - -%if ARCH_X86_64 -; matrix transpose -%macro INTERLEAVE_2X 4 - punpckh%1 m%4, m%2, m%3 - punpckl%1 m%2, m%3 - SWAP %3, %4 -%endmacro - -%macro TRANSPOSE8X8 9 - INTERLEAVE_2X wd, %1, %2, %9 - INTERLEAVE_2X wd, %3, %4, %9 - INTERLEAVE_2X wd, %5, %6, %9 - INTERLEAVE_2X wd, %7, %8, %9 - - INTERLEAVE_2X dq, %1, %3, %9 - INTERLEAVE_2X dq, %2, %4, %9 - INTERLEAVE_2X dq, %5, %7, %9 - INTERLEAVE_2X dq, %6, %8, %9 - - INTERLEAVE_2X qdq, %1, %5, %9 - INTERLEAVE_2X qdq, %3, %7, %9 - INTERLEAVE_2X qdq, %2, %6, %9 - INTERLEAVE_2X qdq, %4, %8, %9 - - SWAP %2, %5 - SWAP %4, %7 -%endmacro - -%macro HMD8_1D 0 - psubw m8, m0, m1 - psubw m9, m2, m3 - paddw m0, m1 - paddw m2, m3 - SWAP 1, 8 - SWAP 3, 9 - psubw m8, m4, m5 - psubw m9, m6, m7 - paddw m4, m5 - paddw m6, m7 - SWAP 5, 8 - SWAP 7, 9 - - psubw m8, m0, m2 - psubw m9, m1, m3 - paddw m0, m2 - paddw m1, m3 - SWAP 2, 8 - SWAP 3, 9 - psubw m8, m4, m6 - psubw m9, m5, m7 - paddw m4, m6 - paddw m5, m7 - SWAP 6, 8 - SWAP 7, 9 - - psubw m8, m0, m4 - psubw m9, m1, m5 - paddw m0, m4 - paddw m1, m5 - SWAP 4, 8 - SWAP 5, 9 - psubw m8, m2, m6 - psubw m9, m3, m7 - paddw m2, m6 - paddw m3, m7 - SWAP 6, 8 - SWAP 7, 9 -%endmacro - -INIT_XMM ssse3 -cglobal hadamard_8x8, 3, 5, 10, input, stride, output - lea r3, [2 * strideq] - lea r4, [4 * strideq] - - mova m0, [inputq] - mova m1, [inputq + r3] - lea inputq, [inputq + r4] - mova m2, [inputq] - mova m3, [inputq + r3] - lea inputq, [inputq + r4] - mova m4, [inputq] - mova m5, [inputq + r3] - lea inputq, [inputq + r4] - mova m6, [inputq] - mova m7, [inputq + r3] - - HMD8_1D - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - HMD8_1D - - mova [outputq + 0], m0 - mova [outputq + 16], m1 - mova [outputq + 32], m2 - mova [outputq + 48], m3 - mova [outputq + 64], m4 - mova [outputq + 80], m5 - mova [outputq + 96], m6 - mova [outputq + 112], m7 - - RET -%endif diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 5918240e2..de688bf48 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -17,7 +17,6 @@ VP9_CX_SRCS_REMOVE-no += $(VP9_COMMON_SRCS_REMOVE-no) VP9_CX_SRCS-yes += vp9_cx_iface.c -VP9_CX_SRCS-yes += encoder/vp9_avg.c VP9_CX_SRCS-yes += encoder/vp9_bitstream.c VP9_CX_SRCS-yes += encoder/vp9_context_tree.c VP9_CX_SRCS-yes += encoder/vp9_context_tree.h @@ -93,7 +92,6 @@ VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_avg_intrin_sse2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c @@ -114,7 +112,6 @@ endif ifeq ($(ARCH_X86_64),yes) ifeq ($(CONFIG_USE_X86INC),yes) VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm -VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3_x86_64.asm endif endif @@ -131,10 +128,8 @@ ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_error_neon.c endif -VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c -VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_avg_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct4x4_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct8x8_msa.c |