diff options
Diffstat (limited to 'vp9/encoder')
-rw-r--r-- | vp9/encoder/vp9_avg.c | 30 | ||||
-rw-r--r-- | vp9/encoder/vp9_encodeframe.c | 120 | ||||
-rw-r--r-- | vp9/encoder/vp9_firstpass.c | 6 | ||||
-rw-r--r-- | vp9/encoder/vp9_pickmode.c | 14 | ||||
-rw-r--r-- | vp9/encoder/x86/vp9_avg_intrin_sse2.c | 114 |
5 files changed, 275 insertions, 9 deletions
diff --git a/vp9/encoder/vp9_avg.c b/vp9/encoder/vp9_avg.c
index f8fa7d2e8..8d6cf0667 100644
--- a/vp9/encoder/vp9_avg.c
+++ b/vp9/encoder/vp9_avg.c
@@ -28,6 +28,36 @@ unsigned int vp9_avg_4x4_c(const uint8_t *s, int p) {
   return (sum + 8) >> 4;
 }
 
+// Integer projection onto row vectors.
+void vp9_int_pro_row_c(int16_t *hbuf, uint8_t const *ref,
+                       const int ref_stride, const int height) {
+  int idx;
+  for (idx = 0; idx < 16; ++idx) {
+    int i;
+    hbuf[idx] = 0;
+    for (i = 0; i < height; ++i)
+      hbuf[idx] += ref[i * ref_stride];
+    ++ref;
+  }
+}
+
+int16_t vp9_int_pro_col_c(uint8_t const *ref, const int width) {
+  int idx;
+  int16_t sum = 0;
+  for (idx = 0; idx < width; ++idx)
+    sum += ref[idx];
+  return sum;
+}
+
+int vp9_vector_sad_c(int16_t const *ref, int16_t const *src,
+                     const int width) {
+  int i;
+  int this_sad = 0;
+  for (i = 0; i < width; ++i)
+    this_sad += abs(ref[i] - src[i]);
+  return this_sad;
+}
+
 #if CONFIG_VP9_HIGHBITDEPTH
 unsigned int vp9_highbd_avg_8x8_c(const uint8_t *s8, int p) {
   int i, j;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 99bb9300e..39dedf6cb 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -507,6 +507,119 @@ void vp9_set_vbp_thresholds(VP9_COMP *cpi, int q) {
   }
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+#define GLOBAL_MOTION 0
+#else
+#define GLOBAL_MOTION 1
+#endif
+
+#if GLOBAL_MOTION
+static int vector_match(int16_t *ref, int16_t *src) {
+  int best_sad = INT_MAX;
+  int this_sad;
+  int d;
+  int center, offset = 0;
+  for (d = 0; d <= 64; d += 16) {
+    this_sad = vp9_vector_sad(&ref[d], src, 64);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      offset = d;
+    }
+  }
+  center = offset;
+
+  for (d = -8; d <= 8; d += 4) {
+    int this_pos = offset + d;
+    // check limit
+    if (this_pos < 0 || this_pos > 64 || this_pos == 32)
+      continue;
+    this_sad = vp9_vector_sad(&ref[this_pos], src, 64);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      center = this_pos;
+    }
+  }
+  offset = center;
+
+  for (d = -4; d <= 4; d += 2) {
+    int this_pos = offset + d;
+    // check limit
+    if (this_pos < 0 || this_pos > 64 || this_pos == 32)
+      continue;
+    this_sad = vp9_vector_sad(&ref[this_pos], src, 64);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      center = this_pos;
+    }
+  }
+  offset = center;
+
+  for (d = -2; d <= 2; d += 1) {
+    int this_pos = offset + d;
+    // check limit
+    if (this_pos < 0 || this_pos > 64 || this_pos == 32)
+      continue;
+    this_sad = vp9_vector_sad(&ref[this_pos], src, 64);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      center = this_pos;
+    }
+  }
+
+  return (center - 32);
+}
+
+static void motion_estimation(MACROBLOCK *x) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  DECLARE_ALIGNED(16, int16_t, hbuf[128]);
+  DECLARE_ALIGNED(16, int16_t, vbuf[128]);
+  DECLARE_ALIGNED(16, int16_t, src_hbuf[64]);
+  DECLARE_ALIGNED(16, int16_t, src_vbuf[64]);
+  int idx;
+  const int stride = 64;
+  const int search_width = 128;
+  const int search_height = 128;
+  const int src_stride = x->plane[0].src.stride;
+  const int ref_stride = xd->plane[0].pre[0].stride;
+  uint8_t const *ref_buf, *src_buf;
+  MV *tmp_mv = &xd->mi[0].src_mi->mbmi.mv[0].as_mv;
+
+  // Set up prediction 1-D reference set
+  ref_buf = xd->plane[0].pre[0].buf + (-32);
+  for (idx = 0; idx < search_width; idx += 16) {
+    vp9_int_pro_row(&hbuf[idx], ref_buf, ref_stride, 64);
+    ref_buf += 16;
+  }
+
+  ref_buf = xd->plane[0].pre[0].buf + (-32) * ref_stride;
+  for (idx = 0; idx < search_height; ++idx) {
+    vbuf[idx] = vp9_int_pro_col(ref_buf, 64);
+    ref_buf += ref_stride;
+  }
+
+  // Set up src 1-D reference set
+  for (idx = 0; idx < stride; idx += 16) {
+    src_buf = x->plane[0].src.buf + idx;
+    vp9_int_pro_row(&src_hbuf[idx], src_buf, src_stride, 64);
+  }
+
+  src_buf = x->plane[0].src.buf;
+  for (idx = 0; idx < stride; ++idx) {
+    src_vbuf[idx] = vp9_int_pro_col(src_buf, 64);
+    src_buf += src_stride;
+  }
+
+  // Find the best match per 1-D search
+
+  tmp_mv->col = vector_match(hbuf, src_hbuf);
+  tmp_mv->row = vector_match(vbuf, src_vbuf);
+
+  tmp_mv->row *= 8;
+  tmp_mv->col *= 8;
+
+  x->pred_mv[LAST_FRAME] = *tmp_mv;
+}
+#endif
 
 // This function chooses partitioning based on the variance between source and
 // reconstructed last, where variance is computed for downs-sampled inputs.
@@ -551,6 +664,11 @@ static void choose_partitioning(VP9_COMP *cpi,
     mbmi->ref_frame[1] = NONE;
     mbmi->sb_type = BLOCK_64X64;
     mbmi->mv[0].as_int = 0;
+
+#if GLOBAL_MOTION
+    motion_estimation(x);
+#endif
+
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
 
     for (i = 1; i <= 2; ++i) {
@@ -3129,7 +3247,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td,
                  bsize, pc_tree);
   }
 
-  if (bsize == BLOCK_64X64) {
+  if (bsize == BLOCK_64X64 && do_recon) {
     assert(tp_orig < *tp);
     assert(best_rdc.rate < INT_MAX);
     assert(best_rdc.dist < INT64_MAX);
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 366aeb193..4c45a7926 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -110,9 +110,9 @@ static void output_stats(FIRSTPASS_STATS *stats,
     FILE *fpfile;
     fpfile = fopen("firstpass.stt", "a");
 
-    fprintf(fpfile, "%12.0f %12.4f %12.0f %12.0f %12.0f %12.4f %12.4f"
-            "%12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f"
-            "%12.0f %12.0f %12.4f %12.0f %12.0f %12.4f\n",
+    fprintf(fpfile, "%12.0lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf"
+            "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf"
+            "%12.4lf %12.0lf %12.0lf %12.0lf %12.4lf\n",
             stats->frame,
             stats->weight,
             stats->intra_error,
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 46a354700..71cea0e45 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -216,6 +216,8 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
   int64_t dist;
   struct macroblock_plane *const p = &x->plane[0];
   struct macroblockd_plane *const pd = &xd->plane[0];
+  const int64_t dc_thr = p->quant_thred[0] >> 6;
+  const int64_t ac_thr = p->quant_thred[1] >> 6;
   const uint32_t dc_quant = pd->dequant[0];
   const uint32_t ac_quant = pd->dequant[1];
   unsigned int var = cpi->fn_ptr[bsize].vf(p->src.buf, p->src.stride,
@@ -223,12 +225,14 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
   *var_y = var;
   *sse_y = sse;
 
-  if (sse < dc_quant * dc_quant >> 6)
-    x->skip_txfm[0] = 1;
-  else if (var < ac_quant * ac_quant >> 6)
+  x->skip_txfm[0] = 0;
+  // Check if all ac coefficients can be quantized to zero.
+  if (var < ac_thr || var == 0) {
     x->skip_txfm[0] = 2;
-  else
-    x->skip_txfm[0] = 0;
+    // Check if dc coefficient can be quantized to zero.
+    if (sse - var < dc_thr || sse == var)
+      x->skip_txfm[0] = 1;
+  }
 
   if (cpi->common.tx_mode == TX_MODE_SELECT) {
     if (sse > (var << 2))
diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
index 4c3495b05..0a105629f 100644
--- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
@@ -56,3 +56,117 @@ unsigned int vp9_avg_4x4_sse2(const uint8_t *s, int p) {
   avg = _mm_extract_epi16(s0, 0);
   return (avg + 8) >> 4;
 }
+
+void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref,
+                          const int ref_stride, const int height) {
+  int idx;
+  __m128i zero = _mm_setzero_si128();
+  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);  // unaligned ok
+  __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
+  __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
+  __m128i t0, t1;
+  int height_1 = height - 1;
+  ref += ref_stride;
+
+  for (idx = 1; idx < height_1; idx += 2) {
+    src_line = _mm_loadu_si128((const __m128i *)ref);
+    t0 = _mm_unpacklo_epi8(src_line, zero);
+    t1 = _mm_unpackhi_epi8(src_line, zero);
+    s0 = _mm_adds_epu16(s0, t0);
+    s1 = _mm_adds_epu16(s1, t1);
+    ref += ref_stride;
+
+    src_line = _mm_loadu_si128((const __m128i *)ref);
+    t0 = _mm_unpacklo_epi8(src_line, zero);
+    t1 = _mm_unpackhi_epi8(src_line, zero);
+    s0 = _mm_adds_epu16(s0, t0);
+    s1 = _mm_adds_epu16(s1, t1);
+    ref += ref_stride;
+  }
+
+  src_line = _mm_loadu_si128((const __m128i *)ref);
+  t0 = _mm_unpacklo_epi8(src_line, zero);
+  t1 = _mm_unpackhi_epi8(src_line, zero);
+  s0 = _mm_adds_epu16(s0, t0);
+  s1 = _mm_adds_epu16(s1, t1);
+
+  _mm_store_si128((__m128i *)hbuf, s0);
+  hbuf += 8;
+  _mm_store_si128((__m128i *)hbuf, s1);
+}
+
+int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) {
+  __m128i zero = _mm_setzero_si128();
+  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);  // unaligned ok
+  __m128i s0 = _mm_sad_epu8(src_line, zero);
+  __m128i s1;
+  (void) width;  // width = 64
+
+  ref += 16;
+  src_line = _mm_loadu_si128((const __m128i *)ref);
+  s1 = _mm_sad_epu8(src_line, zero);
+  s0 = _mm_adds_epu16(s0, s1);
+
+  ref += 16;
+  src_line = _mm_loadu_si128((const __m128i *)ref);
+  s1 = _mm_sad_epu8(src_line, zero);
+  s0 = _mm_adds_epu16(s0, s1);
+
+  ref += 16;
+  src_line = _mm_loadu_si128((const __m128i *)ref);
+  s1 = _mm_sad_epu8(src_line, zero);
+  s0 = _mm_adds_epu16(s0, s1);
+
+  s1 = _mm_srli_si128(s0, 8);
+  s0 = _mm_adds_epu16(s0, s1);
+
+  return _mm_extract_epi16(s0, 0);
+}
+
+int vp9_vector_sad_sse2(int16_t const *ref, int16_t const *src,
+                        const int width) {
+  int idx;
+  __m128i zero = _mm_setzero_si128();
+  __m128i sum;
+  __m128i v0 = _mm_loadu_si128((const __m128i *)ref);
+  __m128i v1 = _mm_load_si128((const __m128i *)src);
+  __m128i diff = _mm_subs_epi16(v0, v1);
+  __m128i sign = _mm_srai_epi16(diff, 15);
+
+  diff = _mm_xor_si128(diff, sign);
+  sum = _mm_sub_epi16(diff, sign);
+
+  (void) width;  // width = 64;
+
+  ref += 8;
+  src += 8;
+
+  v0 = _mm_unpacklo_epi16(sum, zero);
+  v1 = _mm_unpackhi_epi16(sum, zero);
+  sum = _mm_add_epi32(v0, v1);
+
+  for (idx = 1; idx < 8; ++idx) {
+    v0 = _mm_loadu_si128((const __m128i *)ref);
+    v1 = _mm_load_si128((const __m128i *)src);
+    diff = _mm_subs_epi16(v0, v1);
+    sign = _mm_srai_epi16(diff, 15);
+    diff = _mm_xor_si128(diff, sign);
+    diff = _mm_sub_epi16(diff, sign);
+
+    v0 = _mm_unpacklo_epi16(diff, zero);
+    v1 = _mm_unpackhi_epi16(diff, zero);
+
+    sum = _mm_add_epi32(sum, v0);
+    sum = _mm_add_epi32(sum, v1);
+
+    ref += 8;
+    src += 8;
+  }
+
+  v0 = _mm_srli_si128(sum, 8);
+  sum = _mm_add_epi32(sum, v0);
+  v0 = _mm_srli_epi64(sum, 32);
+  sum = _mm_add_epi32(sum, v0);
+
+  return _mm_cvtsi128_si32(sum);
+}