From 54eda13f8df587fe0a5a202f232f66863aff445a Mon Sep 17 00:00:00 2001 From: Jingning Han Date: Mon, 9 Mar 2015 18:55:38 -0700 Subject: Apply fast motion search to golden reference frame This commit enables the rtc coding mode to run integral projection based motion search for golden reference frame. It improves the speed -6 compression performance by 1.1% on average, 3.46% for jimred_vga, 6.46% for tacomascmvvga, and 0.5% for vidyo clips. The speed -6 is about 6% slower. Change-Id: I0fe402ad2edf0149d0349ad304ab9b2abdf0c804 --- vp9/encoder/vp9_pickmode.c | 40 +++++++++++++++++++++++++++++------ vp9/encoder/x86/vp9_avg_intrin_sse2.c | 12 +++++------ 2 files changed, 40 insertions(+), 12 deletions(-) (limited to 'vp9/encoder') diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 6c2576add..23a2569c8 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -784,15 +784,43 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, continue; if (this_mode == NEWMV) { - if (ref_frame > LAST_FRAME) - continue; if (cpi->sf.partition_search_type != VAR_BASED_PARTITION && best_rdc.rdcost < (int64_t)(1 << num_pels_log2_lookup[bsize])) continue; - if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, - &frame_mv[NEWMV][ref_frame], - &rate_mv, best_rdc.rdcost)) + + if (ref_frame > LAST_FRAME) { + int tmp_sad; + int dis, cost_list[5]; + + if (bsize < BLOCK_16X16) + continue; + + tmp_sad = vp9_int_pro_motion_estimation(cpi, x, bsize); + if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) + continue; + + frame_mv[NEWMV][ref_frame].as_int = mbmi->mv[0].as_int; + rate_mv = vp9_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv, + &mbmi->ref_mvs[ref_frame][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + frame_mv[NEWMV][ref_frame].as_mv.row >>= 3; + frame_mv[NEWMV][ref_frame].as_mv.col >>= 3; + + cpi->find_fractional_mv_step(x, &frame_mv[NEWMV][ref_frame].as_mv, + &mbmi->ref_mvs[ref_frame][0].as_mv, + cpi->common.allow_high_precision_mv, + x->errorperbit, + &cpi->fn_ptr[bsize], + cpi->sf.mv.subpel_force_stop, + cpi->sf.mv.subpel_iters_per_step, + cond_cost_list(cpi, cost_list), + x->nmvjointcost, x->mvcost, &dis, + &x->pred_sse[ref_frame], NULL, 0, 0); + } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, + &frame_mv[NEWMV][ref_frame], + &rate_mv, best_rdc.rdcost)) { continue; + } } if (this_mode != NEARESTMV && @@ -817,7 +845,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } if ((this_mode == NEWMV || filter_ref == SWITCHABLE) && - pred_filter_search && + pred_filter_search && (ref_frame == LAST_FRAME) && ((mbmi->mv[0].as_mv.row & 0x07) != 0 || (mbmi->mv[0].as_mv.col & 0x07) != 0)) { int pf_rate[3]; diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vp9/encoder/x86/vp9_avg_intrin_sse2.c index f49949940..618b5f73d 100644 --- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c +++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c @@ -61,7 +61,7 @@ void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref, const int ref_stride, const int height) { int idx; __m128i zero = _mm_setzero_si128(); - __m128i src_line = _mm_load_si128((const __m128i *)ref); + __m128i src_line = _mm_loadu_si128((const __m128i *)ref); __m128i s0 = _mm_unpacklo_epi8(src_line, zero); __m128i s1 = _mm_unpackhi_epi8(src_line, zero); __m128i t0, t1; @@ -69,14 +69,14 @@ void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref, ref += ref_stride; for (idx = 1; idx < height_1; idx += 2) { - src_line = _mm_load_si128((const __m128i *)ref); + src_line = _mm_loadu_si128((const __m128i *)ref); t0 = _mm_unpacklo_epi8(src_line, zero); t1 = _mm_unpackhi_epi8(src_line, zero); s0 = _mm_adds_epu16(s0, t0); s1 = _mm_adds_epu16(s1, t1); ref += ref_stride; - src_line = _mm_load_si128((const __m128i *)ref); + src_line = _mm_loadu_si128((const __m128i *)ref); t0 = _mm_unpacklo_epi8(src_line, zero); t1 = _mm_unpackhi_epi8(src_line, zero); s0 = _mm_adds_epu16(s0, t0); @@ -84,7 +84,7 @@ void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref, ref += ref_stride; } - src_line = _mm_load_si128((const __m128i *)ref); + src_line = _mm_loadu_si128((const __m128i *)ref); t0 = _mm_unpacklo_epi8(src_line, zero); t1 = _mm_unpackhi_epi8(src_line, zero); s0 = _mm_adds_epu16(s0, t0); @@ -101,9 +101,9 @@ void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref, s1 = _mm_srai_epi16(s1, 3); } - _mm_store_si128((__m128i *)hbuf, s0); + _mm_storeu_si128((__m128i *)hbuf, s0); hbuf += 8; - _mm_store_si128((__m128i *)hbuf, s1); + _mm_storeu_si128((__m128i *)hbuf, s1); } int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) { -- cgit v1.2.3