Diffstat (limited to 'vp9/encoder')
46 files changed, 5105 insertions, 7239 deletions
diff --git a/vp9/encoder/vp9_asm_enc_offsets.c b/vp9/encoder/vp9_asm_enc_offsets.c
index 71fad2e07..e174a894a 100644
--- a/vp9/encoder/vp9_asm_enc_offsets.c
+++ b/vp9/encoder/vp9_asm_enc_offsets.c
@@ -32,7 +32,6 @@
 DEFINE(vp9_block_quant_shift, offsetof(BLOCK, quant_shift));
 DEFINE(vp9_blockd_qcoeff, offsetof(BLOCKD, qcoeff));
 DEFINE(vp9_blockd_dequant, offsetof(BLOCKD, dequant));
 DEFINE(vp9_blockd_dqcoeff, offsetof(BLOCKD, dqcoeff));
-DEFINE(vp9_blockd_eob, offsetof(BLOCKD, eob));
 
 END
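Note on the hunk above: vp9_asm_enc_offsets.c exists to export struct member offsets to assembly sources, and the removed line simply drops the offset of the eob field, which no longer lives in BLOCKD after this change. As a minimal standalone sketch of the DEFINE()/offsetof() idea (generic names and a printf-based DEFINE for illustration, not the actual libvpx build glue):

#include <stddef.h>
#include <stdio.h>

typedef struct blockd {
  short *qcoeff;
  short *dqcoeff;
} BLOCKD;

/* Emit each offset as an assembler-visible constant; a build step
 * normally captures this output into a header for the .asm files. */
#define DEFINE(sym, val) printf("%s EQU %ld\n", #sym, (long)(val))

int main(void) {
  DEFINE(vp9_blockd_qcoeff, offsetof(BLOCKD, qcoeff));
  DEFINE(vp9_blockd_dqcoeff, offsetof(BLOCKD, dqcoeff));
  return 0;
}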
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 61aac5cd1..7101947a6 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -14,6 +14,7 @@
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_findnearmv.h"
+#include "vp9/common/vp9_tile_common.h"
 #include "vp9/encoder/vp9_mcomp.h"
 #include "vp9/common/vp9_systemdependent.h"
 #include <assert.h>
@@ -41,12 +42,9 @@ unsigned __int64 Sectionbits[500];
 int intra_mode_stats[VP9_KF_BINTRAMODES]
                     [VP9_KF_BINTRAMODES]
                     [VP9_KF_BINTRAMODES];
-vp9_coeff_stats tree_update_hist_4x4[BLOCK_TYPES_4X4];
-vp9_coeff_stats hybrid_tree_update_hist_4x4[BLOCK_TYPES_4X4];
-vp9_coeff_stats tree_update_hist_8x8[BLOCK_TYPES_8X8];
-vp9_coeff_stats hybrid_tree_update_hist_8x8[BLOCK_TYPES_8X8];
-vp9_coeff_stats tree_update_hist_16x16[BLOCK_TYPES_16X16];
-vp9_coeff_stats hybrid_tree_update_hist_16x16[BLOCK_TYPES_16X16];
+vp9_coeff_stats tree_update_hist_4x4[BLOCK_TYPES];
+vp9_coeff_stats tree_update_hist_8x8[BLOCK_TYPES];
+vp9_coeff_stats tree_update_hist_16x16[BLOCK_TYPES];
 vp9_coeff_stats tree_update_hist_32x32[BLOCK_TYPES_32X32];
 
 extern unsigned int active_section;
@@ -189,15 +187,7 @@ static void update_refpred_stats(VP9_COMP *cpi) {
   int old_cost, new_cost;
 
   // Set the prediction probability structures to defaults
-  if (cm->frame_type == KEY_FRAME) {
-    // Set the prediction probabilities to defaults
-    cm->ref_pred_probs[0] = 120;
-    cm->ref_pred_probs[1] = 80;
-    cm->ref_pred_probs[2] = 40;
-
-    vpx_memset(cpi->ref_pred_probs_update, 0,
-               sizeof(cpi->ref_pred_probs_update));
-  } else {
+  if (cm->frame_type != KEY_FRAME) {
     // From the prediction counts set the probabilities for each context
     for (i = 0; i < PREDICTION_PROBS; i++) {
       new_pred_probs[i] = get_binary_prob(cpi->ref_pred_count[i][0],
@@ -219,7 +209,6 @@ static void update_refpred_stats(VP9_COMP *cpi) {
         cm->ref_pred_probs[i] = new_pred_probs[i];
       } else
         cpi->ref_pred_probs_update[i] = 0;
-    }
   }
 }
@@ -230,8 +219,8 @@
 // The branch counts table is re-populated during the actual pack stage and in
 // the decoder to facilitate backwards update of the context.
-static void update_mode_probs(VP9_COMMON *cm,
-                              int mode_context[INTER_MODE_CONTEXTS][4]) {
+static void update_inter_mode_probs(VP9_COMMON *cm,
+                                    int mode_context[INTER_MODE_CONTEXTS][4]) {
   int i, j;
   unsigned int (*mv_ref_ct)[4][2];
@@ -508,7 +497,8 @@ static void write_sub_mv_ref
   vp9_sub_mv_ref_encoding_array - LEFT4X4 + m);
 }
 
-static void write_nmv(vp9_writer *bc, const MV *mv, const int_mv *ref,
+static void write_nmv(VP9_COMP *cpi, vp9_writer *bc,
+                      const MV *mv, const int_mv *ref,
                       const nmv_context *nmvc, int usehp) {
   MV e;
   e.row = mv->row - ref->as_mv.row;
@@ -585,6 +575,28 @@ static void write_mb_segid(vp9_writer *bc,
   }
 }
 
+static void write_mb_segid_except(VP9_COMMON *cm,
+                                  vp9_writer *bc,
+                                  const MB_MODE_INFO *mi,
+                                  const MACROBLOCKD *xd,
+                                  int mb_row, int mb_col) {
+  // Encode the MB segment id.
+  int seg_id = mi->segment_id;
+  int pred_seg_id = vp9_get_pred_mb_segid(cm, xd,
+                                          mb_row * cm->mb_cols + mb_col);
+  const vp9_prob *p = xd->mb_segment_tree_probs;
+  const vp9_prob p1 = xd->mb_segment_mispred_tree_probs[pred_seg_id];
+
+  if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
+    vp9_write(bc, seg_id >= 2, p1);
+    if (pred_seg_id >= 2 && seg_id < 2) {
+      vp9_write(bc, seg_id == 1, p[1]);
+    } else if (pred_seg_id < 2 && seg_id >= 2) {
+      vp9_write(bc, seg_id == 3, p[2]);
+    }
+  }
+}
+
 // This function encodes the reference frame
 static void encode_ref_frame(vp9_writer *const bc,
                              VP9_COMMON *const cm,
@@ -728,7 +740,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
 
       // If the mb segment id wasn't predicted code explicitly
       if (!prediction_flag)
-        write_mb_segid(bc, mi, &cpi->mb.e_mbd);
+        write_mb_segid_except(pc, bc, mi, &cpi->mb.e_mbd, mb_row, mb_col);
     } else {
       // Normal unpredicted coding
       write_mb_segid(bc, mi, &cpi->mb.e_mbd);
@@ -737,8 +749,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
 
   if (!pc->mb_no_coeff_skip) {
     skip_coeff = 0;
-  } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
-             vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) {
+  } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
     skip_coeff = 1;
   } else {
     const int nmbs = mb_size;
@@ -758,24 +769,18 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
   }
 
   // Encode the reference frame.
-  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)
-      || vp9_get_segdata(xd, segment_id, SEG_LVL_MODE) >= NEARESTMV) {
-    encode_ref_frame(bc, pc, xd, segment_id, rf);
-  } else {
-    assert(rf == INTRA_FRAME);
-  }
+  encode_ref_frame(bc, pc, xd, segment_id, rf);
 
   if (rf == INTRA_FRAME) {
 #ifdef ENTROPY_STATS
     active_section = 6;
 #endif
 
-    if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
-      if (m->mbmi.sb_type)
-        write_sb_ymode(bc, mode, pc->fc.sb_ymode_prob);
-      else
-        write_ymode(bc, mode, pc->fc.ymode_prob);
-    }
+    if (m->mbmi.sb_type)
+      write_sb_ymode(bc, mode, pc->fc.sb_ymode_prob);
+    else
+      write_ymode(bc, mode, pc->fc.ymode_prob);
+
     if (mode == B_PRED) {
       int j = 0;
       do {
@@ -801,14 +806,12 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
     vp9_mv_ref_probs(&cpi->common, mv_ref_p,
                      mi->mb_mode_context[rf]);
 
-    // #ifdef ENTROPY_STATS
 #ifdef ENTROPY_STATS
-    accum_mv_refs(mode, ct);
     active_section = 3;
 #endif
 
-    // Is the segment coding of mode enabled
-    if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
+    // If segment skip is not enabled code the mode.
+    if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
      if (mi->sb_type) {
        write_sb_mv_ref(bc, mode, mv_ref_p);
      } else {
@@ -878,12 +881,12 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
 #ifdef ENTROPY_STATS
         active_section = 5;
 #endif
-        write_nmv(bc, &mi->mv[0].as_mv, &mi->best_mv,
+        write_nmv(cpi, bc, &mi->mv[0].as_mv, &mi->best_mv,
                   (const nmv_context*) nmvc,
                   xd->allow_high_precision_mv);
 
         if (mi->second_ref_frame > 0) {
-          write_nmv(bc, &mi->mv[1].as_mv, &mi->best_second_mv,
+          write_nmv(cpi, bc, &mi->mv[1].as_mv, &mi->best_second_mv,
                     (const nmv_context*) nmvc,
                     xd->allow_high_precision_mv);
         }
@@ -915,7 +918,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
 #else
             while (j != L[++k]);
 #endif
-            leftmv.as_int = left_block_mv(m, k);
+            leftmv.as_int = left_block_mv(xd, m, k);
             abovemv.as_int = above_block_mv(m, k, mis);
             mv_contz = vp9_mv_cont(&leftmv, &abovemv);
@@ -926,12 +929,12 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
 #ifdef ENTROPY_STATS
             active_section = 11;
 #endif
-            write_nmv(bc, &blockmv.as_mv, &mi->best_mv,
+            write_nmv(cpi, bc, &blockmv.as_mv, &mi->best_mv,
                       (const nmv_context*) nmvc,
                       xd->allow_high_precision_mv);
 
             if (mi->second_ref_frame > 0) {
-              write_nmv(bc,
+              write_nmv(cpi, bc,
                         &cpi->mb.partition_info->bmi[j].second_mv.as_mv,
                         &mi->best_second_mv,
                         (const nmv_context*) nmvc,
@@ -951,8 +954,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
          mi->partitioning == PARTITIONING_4X4))) &&
       pc->txfm_mode == TX_MODE_SELECT &&
       !((pc->mb_no_coeff_skip && skip_coeff) ||
-        (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
-         vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
+        (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {
     TX_SIZE sz = mi->txfm_size;
     // FIXME(rbultje) code ternary symbol once all experiments are merged
     vp9_write(bc, sz != TX_4X4, pc->prob_tx[0]);
@@ -981,8 +983,7 @@ static void write_mb_modes_kf(const VP9_COMP *cpi,
 
   if (!c->mb_no_coeff_skip) {
     skip_coeff = 0;
-  } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
-             vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) {
+  } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
     skip_coeff = 1;
   } else {
     const int nmbs = 1 << m->mbmi.sb_type;
@@ -1013,7 +1014,8 @@ static void write_mb_modes_kf(const VP9_COMP *cpi,
     int i = 0;
     do {
       const B_PREDICTION_MODE A = above_block_mode(m, i, mis);
-      const B_PREDICTION_MODE L = left_block_mode(m, i);
+      const B_PREDICTION_MODE L = (xd->left_available || (i & 3)) ?
+                                  left_block_mode(m, i) : B_DC_PRED;
       const int bm = m->bmi[i].as_mode.first;
 
 #ifdef ENTROPY_STATS
@@ -1041,8 +1043,7 @@ static void write_mb_modes_kf(const VP9_COMP *cpi,
   if (ym <= I8X8_PRED && c->txfm_mode == TX_MODE_SELECT &&
       !((c->mb_no_coeff_skip && skip_coeff) ||
-        (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
-         vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
+        (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {
     TX_SIZE sz = m->mbmi.txfm_size;
     // FIXME(rbultje) code ternary symbol once all experiments are merged
     vp9_write(bc, sz != TX_4X4, c->prob_tx[0]);
@@ -1061,6 +1062,10 @@ static void write_modes_b(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
 
   xd->mode_info_context = m;
+  xd->left_available = mb_col > c->cur_tile_mb_col_start;
+  xd->right_available =
+      (mb_col + (1 << m->mbmi.sb_type)) < c->cur_tile_mb_col_end;
+  xd->up_available = mb_row > 0;
   if (c->frame_type == KEY_FRAME) {
     write_mb_modes_kf(cpi, m, bc,
                       c->mb_rows - mb_row, c->mb_cols - mb_col);
@@ -1079,20 +1084,22 @@ static void write_modes_b(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
   pack_mb_tokens(bc, tok, tok_end);
 }
 
-static void write_modes(VP9_COMP *cpi, vp9_writer* const bc) {
+static void write_modes(VP9_COMP *cpi, vp9_writer* const bc,
+                        TOKENEXTRA **tok, TOKENEXTRA *tok_end) {
   VP9_COMMON *const c = &cpi->common;
   const int mis = c->mode_info_stride;
   MODE_INFO *m, *m_ptr = c->mi;
   int i, mb_row, mb_col;
-  TOKENEXTRA *tok = cpi->tok;
-  TOKENEXTRA *tok_end = tok + cpi->tok_count;
 
-  for (mb_row = 0; mb_row < c->mb_rows; mb_row += 4, m_ptr += 4 * mis) {
+  m_ptr += c->cur_tile_mb_col_start + c->cur_tile_mb_row_start * mis;
+  for (mb_row = c->cur_tile_mb_row_start;
+       mb_row < c->cur_tile_mb_row_end; mb_row += 4, m_ptr += 4 * mis) {
     m = m_ptr;
-    for (mb_col = 0; mb_col < c->mb_cols; mb_col += 4, m += 4) {
+    for (mb_col = c->cur_tile_mb_col_start;
+         mb_col < c->cur_tile_mb_col_end; mb_col += 4, m += 4) {
       vp9_write(bc, m->mbmi.sb_type == BLOCK_SIZE_SB64X64, c->sb64_coded);
       if (m->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
-        write_modes_b(cpi, m, bc, &tok, tok_end, mb_row, mb_col);
+        write_modes_b(cpi, m, bc, tok, tok_end, mb_row, mb_col);
       } else {
         int j;
@@ -1107,7 +1114,7 @@ static void write_modes(VP9_COMP *cpi, vp9_writer* const bc) {
           vp9_write(bc, sb_m->mbmi.sb_type, c->sb32_coded);
           if (sb_m->mbmi.sb_type) {
             assert(sb_m->mbmi.sb_type == BLOCK_SIZE_SB32X32);
-            write_modes_b(cpi, sb_m, bc, &tok, tok_end,
+            write_modes_b(cpi, sb_m, bc, tok, tok_end,
                           mb_row + y_idx_sb, mb_col + x_idx_sb);
           } else {
             // Process the 4 MBs in the order:
@@ -1123,7 +1130,7 @@ static void write_modes(VP9_COMP *cpi, vp9_writer* const bc) {
             }
 
             assert(mb_m->mbmi.sb_type == BLOCK_SIZE_MB16X16);
-            write_modes_b(cpi, mb_m, bc, &tok, tok_end,
+            write_modes_b(cpi, mb_m, bc, tok, tok_end,
                           mb_row + y_idx, mb_col + x_idx);
           }
         }
@@ -1135,20 +1142,23 @@
 
 /* This function is used for debugging probability trees. */
-static void print_prob_tree(vp9_coeff_probs *coef_probs) {
+static void print_prob_tree(vp9_coeff_probs *coef_probs, int block_types) {
   /* print coef probability tree */
-  int i, j, k, l;
+  int i, j, k, l, m;
   FILE *f = fopen("enc_tree_probs.txt", "a");
   fprintf(f, "{\n");
-  for (i = 0; i < BLOCK_TYPES_4X4; i++) {
+  for (i = 0; i < block_types; i++) {
     fprintf(f, "  {\n");
-    for (j = 0; j < COEF_BANDS; j++) {
-      fprintf(f, "    {\n");
-      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
-        fprintf(f, "      {");
-        for (l = 0; l < ENTROPY_NODES; l++) {
-          fprintf(f, "%3u, ",
-                  (unsigned int)(coef_probs [i][j][k][l]));
+    for (j = 0; j < REF_TYPES; ++j) {
+      fprintf(f, "  {\n");
+      for (k = 0; k < COEF_BANDS; k++) {
+        fprintf(f, "    {\n");
+        for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
+          fprintf(f, "      {");
+          for (m = 0; m < ENTROPY_NODES; m++) {
+            fprintf(f, "%3u, ",
                    (unsigned int)(coef_probs[i][j][k][l][m]));
+          }
         }
         fprintf(f, " }\n");
       }
@@ -1168,26 +1178,28 @@ static void build_tree_distribution(vp9_coeff_probs *coef_probs,
 #endif
                                     vp9_coeff_stats *coef_branch_ct,
                                     int block_types) {
-  int i = 0, j, k;
+  int i, j, k, l;
 #ifdef ENTROPY_STATS
   int t = 0;
 #endif
 
   for (i = 0; i < block_types; ++i) {
-    for (j = 0; j < COEF_BANDS; ++j) {
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-          continue;
-        vp9_tree_probs_from_distribution(MAX_ENTROPY_TOKENS,
-                                         vp9_coef_encodings, vp9_coef_tree,
-                                         coef_probs[i][j][k],
-                                         coef_branch_ct[i][j][k],
-                                         coef_counts[i][j][k]);
+    for (j = 0; j < REF_TYPES; ++j) {
+      for (k = 0; k < COEF_BANDS; ++k) {
+        for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
+          if (l >= 3 && k == 0)
+            continue;
+          vp9_tree_probs_from_distribution(MAX_ENTROPY_TOKENS,
+                                           vp9_coef_encodings, vp9_coef_tree,
+                                           coef_probs[i][j][k][l],
+                                           coef_branch_ct[i][j][k][l],
+                                           coef_counts[i][j][k][l]);
 #ifdef ENTROPY_STATS
-        if (!cpi->dummy_packing)
-          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-            context_counters[i][j][k][t] += coef_counts[i][j][k][t];
+          if (!cpi->dummy_packing)
+            for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+              context_counters[i][j][k][l][t] += coef_counts[i][j][k][l][t];
 #endif
+        }
      }
    }
  }
@@ -1199,37 +1211,19 @@ static void build_coeff_contexts(VP9_COMP *cpi) {
 #ifdef ENTROPY_STATS
                           cpi, context_counters_4x4,
 #endif
-                          cpi->frame_branch_ct_4x4, BLOCK_TYPES_4X4);
-  build_tree_distribution(cpi->frame_hybrid_coef_probs_4x4,
-                          cpi->hybrid_coef_counts_4x4,
-#ifdef ENTROPY_STATS
-                          cpi, hybrid_context_counters_4x4,
-#endif
-                          cpi->frame_hybrid_branch_ct_4x4, BLOCK_TYPES_4X4);
+                          cpi->frame_branch_ct_4x4, BLOCK_TYPES);
   build_tree_distribution(cpi->frame_coef_probs_8x8, cpi->coef_counts_8x8,
 #ifdef ENTROPY_STATS
                           cpi, context_counters_8x8,
 #endif
-                          cpi->frame_branch_ct_8x8, BLOCK_TYPES_8X8);
-  build_tree_distribution(cpi->frame_hybrid_coef_probs_8x8,
-                          cpi->hybrid_coef_counts_8x8,
-#ifdef ENTROPY_STATS
-                          cpi, hybrid_context_counters_8x8,
-#endif
-                          cpi->frame_hybrid_branch_ct_8x8, BLOCK_TYPES_8X8);
+                          cpi->frame_branch_ct_8x8, BLOCK_TYPES);
   build_tree_distribution(cpi->frame_coef_probs_16x16, cpi->coef_counts_16x16,
 #ifdef ENTROPY_STATS
                           cpi, context_counters_16x16,
 #endif
-                          cpi->frame_branch_ct_16x16, BLOCK_TYPES_16X16);
-  build_tree_distribution(cpi->frame_hybrid_coef_probs_16x16,
-                          cpi->hybrid_coef_counts_16x16,
-#ifdef ENTROPY_STATS
-                          cpi, hybrid_context_counters_16x16,
-#endif
-                          cpi->frame_hybrid_branch_ct_16x16, BLOCK_TYPES_16X16);
+                          cpi->frame_branch_ct_16x16, BLOCK_TYPES);
   build_tree_distribution(cpi->frame_coef_probs_32x32, cpi->coef_counts_32x32,
 #ifdef ENTROPY_STATS
@@ -1247,7 +1241,7 @@ static void update_coef_probs_common(vp9_writer* const bc,
                                      vp9_coeff_probs *old_frame_coef_probs,
                                      vp9_coeff_stats *frame_branch_ct,
                                      int block_types) {
-  int i, j, k, t;
+  int i, j, k, l, t;
   int update[2] = {0, 0};
   int savings;
   // vp9_prob bestupd = find_coef_update_prob(cpi);
@@ -1255,38 +1249,39 @@ static void update_coef_probs_common(vp9_writer* const bc,
   /* dry run to see if there is any udpate at all needed */
   savings = 0;
   for (i = 0; i < block_types; ++i) {
-    for (j = !i; j < COEF_BANDS; ++j) {
-      int prev_coef_savings[ENTROPY_NODES] = {0};
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        for (t = 0; t < ENTROPY_NODES; ++t) {
-          vp9_prob newp = new_frame_coef_probs[i][j][k][t];
-          const vp9_prob oldp = old_frame_coef_probs[i][j][k][t];
-          const vp9_prob upd = COEF_UPDATE_PROB;
-          int s = prev_coef_savings[t];
-          int u = 0;
-          if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-            continue;
+    for (j = 0; j < REF_TYPES; ++j) {
+      for (k = 0; k < COEF_BANDS; ++k) {
+        int prev_coef_savings[ENTROPY_NODES] = {0};
+        for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
+          for (t = 0; t < ENTROPY_NODES; ++t) {
+            vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
+            const vp9_prob oldp = old_frame_coef_probs[i][j][k][l][t];
+            const vp9_prob upd = COEF_UPDATE_PROB;
+            int s = prev_coef_savings[t];
+            int u = 0;
+
+            if (l >= 3 && k == 0)
+              continue;
 #if defined(SEARCH_NEWP)
-          s = prob_diff_update_savings_search(
-                frame_branch_ct[i][j][k][t],
-                oldp, &newp, upd);
-          if (s > 0 && newp != oldp)
-            u = 1;
-          if (u)
-            savings += s - (int)(vp9_cost_zero(upd));
-          else
-            savings -= (int)(vp9_cost_zero(upd));
+            s = prob_diff_update_savings_search(frame_branch_ct[i][j][k][l][t],
+                                                oldp, &newp, upd);
+            if (s > 0 && newp != oldp)
+              u = 1;
+            if (u)
+              savings += s - (int)(vp9_cost_zero(upd));
+            else
+              savings -= (int)(vp9_cost_zero(upd));
#else
-          s = prob_update_savings(
-                frame_branch_ct[i][j][k][t],
-                oldp, newp, upd);
-          if (s > 0)
-            u = 1;
-          if (u)
-            savings += s;
+            s = prob_update_savings(frame_branch_ct[i][j][k][l][t],
+                                    oldp, newp, upd);
+            if (s > 0)
+              u = 1;
+            if (u)
+              savings += s;
 #endif
-          update[u]++;
+            update[u]++;
+          }
        }
      }
    }
@@ -1299,41 +1294,42 @@ static void update_coef_probs_common(vp9_writer* const bc,
   } else {
     vp9_write_bit(bc, 1);
     for (i = 0; i < block_types; ++i) {
-      for (j = !i; j < COEF_BANDS; ++j) {
-        int prev_coef_savings[ENTROPY_NODES] = {0};
-        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-          // calc probs and branch cts for this frame only
-          for (t = 0; t < ENTROPY_NODES; ++t) {
-            vp9_prob newp = new_frame_coef_probs[i][j][k][t];
-            vp9_prob *oldp = old_frame_coef_probs[i][j][k] + t;
-            const vp9_prob upd = COEF_UPDATE_PROB;
-            int s = prev_coef_savings[t];
-            int u = 0;
-            if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-              continue;
+      for (j = 0; j < REF_TYPES; ++j) {
+        for (k = 0; k < COEF_BANDS; ++k) {
+          int prev_coef_savings[ENTROPY_NODES] = {0};
+          for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
+            // calc probs and branch cts for this frame only
+            for (t = 0; t < ENTROPY_NODES; ++t) {
+              vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
+              vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t;
+              const vp9_prob upd = COEF_UPDATE_PROB;
+              int s = prev_coef_savings[t];
+              int u = 0;
+              if (l >= 3 && k == 0)
+                continue;
 #if defined(SEARCH_NEWP)
-            s = prob_diff_update_savings_search(
-                  frame_branch_ct[i][j][k][t],
-                  *oldp, &newp, upd);
-            if (s > 0 && newp != *oldp)
-              u = 1;
+              s = prob_diff_update_savings_search(
+                    frame_branch_ct[i][j][k][l][t],
+                    *oldp, &newp, upd);
+              if (s > 0 && newp != *oldp)
+                u = 1;
 #else
-            s = prob_update_savings(
-                  frame_branch_ct[i][j][k][t],
-                  *oldp, newp, upd);
-            if (s > 0)
-              u = 1;
+              s = prob_update_savings(frame_branch_ct[i][j][k][l][t],
+                                      *oldp, newp, upd);
+              if (s > 0)
+                u = 1;
 #endif
-            vp9_write(bc, u, upd);
+              vp9_write(bc, u, upd);
 #ifdef ENTROPY_STATS
-            if (!cpi->dummy_packing)
-              ++tree_update_hist[i][j][k][t][u];
+              if (!cpi->dummy_packing)
+                ++tree_update_hist[i][j][k][l][t][u];
 #endif
-            if (u) {
-              /* send/use new probability */
-              write_prob_diff_update(bc, newp, *oldp);
-              *oldp = newp;
+              if (u) {
+                /* send/use new probability */
+                write_prob_diff_update(bc, newp, *oldp);
+                *oldp = newp;
+              }
            }
          }
        }
@@ -1356,17 +1352,7 @@ static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) {
                            cpi->frame_coef_probs_4x4,
                            cpi->common.fc.coef_probs_4x4,
                            cpi->frame_branch_ct_4x4,
-                           BLOCK_TYPES_4X4);
-
-  update_coef_probs_common(bc,
-#ifdef ENTROPY_STATS
-                           cpi,
-                           hybrid_tree_update_hist_4x4,
-#endif
-                           cpi->frame_hybrid_coef_probs_4x4,
-                           cpi->common.fc.hybrid_coef_probs_4x4,
-                           cpi->frame_hybrid_branch_ct_4x4,
-                           BLOCK_TYPES_4X4);
+                           BLOCK_TYPES);
 
   /* do not do this if not even allowed */
   if (cpi->common.txfm_mode != ONLY_4X4) {
@@ -1378,17 +1364,7 @@ static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) {
                              cpi->frame_coef_probs_8x8,
                              cpi->common.fc.coef_probs_8x8,
                              cpi->frame_branch_ct_8x8,
-                             BLOCK_TYPES_8X8);
-
-    update_coef_probs_common(bc,
-#ifdef ENTROPY_STATS
-                             cpi,
-                             hybrid_tree_update_hist_8x8,
-#endif
-                             cpi->frame_hybrid_coef_probs_8x8,
-                             cpi->common.fc.hybrid_coef_probs_8x8,
-                             cpi->frame_hybrid_branch_ct_8x8,
-                             BLOCK_TYPES_8X8);
+                             BLOCK_TYPES);
   }
 
   if (cpi->common.txfm_mode > ALLOW_8X8) {
@@ -1400,16 +1376,7 @@ static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) {
                              cpi->frame_coef_probs_16x16,
                              cpi->common.fc.coef_probs_16x16,
                              cpi->frame_branch_ct_16x16,
-                             BLOCK_TYPES_16X16);
-    update_coef_probs_common(bc,
-#ifdef ENTROPY_STATS
-                             cpi,
-                             hybrid_tree_update_hist_16x16,
-#endif
-                             cpi->frame_hybrid_coef_probs_16x16,
-                             cpi->common.fc.hybrid_coef_probs_16x16,
-                             cpi->frame_hybrid_branch_ct_16x16,
-                             BLOCK_TYPES_16X16);
+                             BLOCK_TYPES);
   }
 
   if (cpi->common.txfm_mode > ALLOW_16X16) {
@@ -1523,33 +1490,37 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
    * and color type.
    */
   if (oh.type == KEY_FRAME) {
-    int v;
-
     // Start / synch code
     cx_data[0] = 0x9D;
     cx_data[1] = 0x01;
     cx_data[2] = 0x2a;
+    extra_bytes_packed = 3;
+    cx_data += extra_bytes_packed;
+  }
+
+  {
+    int v;
+
+    /* TODO(jkoleszar): support arbitrary resolutions */
     v = (pc->horiz_scale << 14) | pc->Width;
-    cx_data[3] = v;
-    cx_data[4] = v >> 8;
+    cx_data[0] = v;
+    cx_data[1] = v >> 8;
 
     v = (pc->vert_scale << 14) | pc->Height;
-    cx_data[5] = v;
-    cx_data[6] = v >> 8;
+    cx_data[2] = v;
+    cx_data[3] = v >> 8;
 
-    extra_bytes_packed = 7;
-    cx_data += extra_bytes_packed;
+    extra_bytes_packed += 4;
+    cx_data += 4;
+  }
 
-    vp9_start_encode(&header_bc, cx_data);
+  vp9_start_encode(&header_bc, cx_data);
 
-    // signal clr type
-    vp9_write_bit(&header_bc, pc->clr_type);
-    vp9_write_bit(&header_bc, pc->clamp_type);
+  // TODO(jkoleszar): remove these two unused bits?
+  vp9_write_bit(&header_bc, pc->clr_type);
+  vp9_write_bit(&header_bc, pc->clamp_type);
 
-  } else {
-    vp9_start_encode(&header_bc, cx_data);
-  }
+  // error resilient mode
+  vp9_write_bit(&header_bc, pc->error_resilient_mode);
 
   // Signal whether or not Segmentation is enabled
   vp9_write_bit(&header_bc, (xd->segmentation_enabled) ? 1 : 0);
@@ -1655,7 +1626,10 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
   pc->sb32_coded = get_binary_prob(cpi->sb32_count[0], cpi->sb32_count[1]);
   vp9_write_literal(&header_bc, pc->sb32_coded, 8);
 
-  {
+  vp9_write_bit(&header_bc, cpi->mb.e_mbd.lossless);
+  if (cpi->mb.e_mbd.lossless) {
+    pc->txfm_mode = ONLY_4X4;
+  } else {
     if (pc->txfm_mode == TX_MODE_SELECT) {
       pc->prob_tx[0] = get_prob(cpi->txfm_count_32x32p[TX_4X4] +
                                 cpi->txfm_count_16x16p[TX_4X4] +
@@ -1765,29 +1739,35 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
   // Transmit Dc, Second order and Uv quantizer delta information
   put_delta_q(&header_bc, pc->y1dc_delta_q);
-  put_delta_q(&header_bc, pc->y2dc_delta_q);
-  put_delta_q(&header_bc, pc->y2ac_delta_q);
   put_delta_q(&header_bc, pc->uvdc_delta_q);
   put_delta_q(&header_bc, pc->uvac_delta_q);
 
   // When there is a key frame all reference buffers are updated using the new key frame
   if (pc->frame_type != KEY_FRAME) {
-    // Should the GF or ARF be updated using the transmitted frame or buffer
-    vp9_write_bit(&header_bc, pc->refresh_golden_frame);
-    vp9_write_bit(&header_bc, pc->refresh_alt_ref_frame);
-
-    // For inter frames the current default behavior is that when
-    // cm->refresh_golden_frame is set we copy the old GF over to
-    // the ARF buffer. This is purely an encoder decision at present.
-    if (pc->refresh_golden_frame)
-      pc->copy_buffer_to_arf = 2;
-
-    // If not being updated from current frame should either GF or ARF be updated from another buffer
-    if (!pc->refresh_golden_frame)
-      vp9_write_literal(&header_bc, pc->copy_buffer_to_gf, 2);
+    int refresh_mask;
 
-    if (!pc->refresh_alt_ref_frame)
-      vp9_write_literal(&header_bc, pc->copy_buffer_to_arf, 2);
+    // Should the GF or ARF be updated using the transmitted frame or buffer
+    if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) {
+      /* Preserve the previously existing golden frame and update the frame in
+       * the alt ref slot instead. This is highly specific to the use of
+       * alt-ref as a forward reference, and this needs to be generalized as
+       * other uses are implemented (like RTC/temporal scaling)
+       *
+       * gld_fb_idx and alt_fb_idx need to be swapped for future frames, but
+       * that happens in vp9_onyx_if.c:update_reference_frames() so that it can
+       * be done outside of the recode loop.
+       */
+      refresh_mask = (cpi->refresh_last_frame << cpi->lst_fb_idx) |
+                     (cpi->refresh_golden_frame << cpi->alt_fb_idx);
+    } else {
+      refresh_mask = (cpi->refresh_last_frame << cpi->lst_fb_idx) |
+                     (cpi->refresh_golden_frame << cpi->gld_fb_idx) |
+                     (cpi->refresh_alt_ref_frame << cpi->alt_fb_idx);
+    }
+    vp9_write_literal(&header_bc, refresh_mask, NUM_REF_FRAMES);
+    vp9_write_literal(&header_bc, cpi->lst_fb_idx, NUM_REF_FRAMES_LG2);
+    vp9_write_literal(&header_bc, cpi->gld_fb_idx, NUM_REF_FRAMES_LG2);
+    vp9_write_literal(&header_bc, cpi->alt_fb_idx, NUM_REF_FRAMES_LG2);
 
     // Indicate reference frame sign bias for Golden and ARF frames (always 0 for last frame buffer)
     vp9_write_bit(&header_bc, pc->ref_frame_sign_bias[GOLDEN_FRAME]);
@@ -1831,10 +1811,13 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
 #endif
   }
 
-  vp9_write_bit(&header_bc, pc->refresh_entropy_probs);
+  if (!pc->error_resilient_mode) {
+    vp9_write_bit(&header_bc, pc->refresh_entropy_probs);
+    vp9_write_bit(&header_bc, pc->frame_parallel_decoding_mode);
+  }
 
-  if (pc->frame_type != KEY_FRAME)
-    vp9_write_bit(&header_bc, pc->refresh_last_frame);
+  vp9_write_literal(&header_bc, pc->frame_context_idx,
+                    NUM_FRAME_CONTEXTS_LG2);
 
 #ifdef ENTROPY_STATS
   if (pc->frame_type == INTER_FRAME)
@@ -1848,7 +1831,13 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
   if (pc->frame_type != KEY_FRAME) {
     int i, j;
     int new_context[INTER_MODE_CONTEXTS][4];
-    update_mode_probs(pc, new_context);
+    if (!cpi->dummy_packing) {
+      update_inter_mode_probs(pc, new_context);
+    } else {
+      // In dummy pack assume context unchanged.
+      vpx_memcpy(new_context, pc->fc.vp9_mode_contexts,
+                 sizeof(pc->fc.vp9_mode_contexts));
+    }
 
     for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
       for (j = 0; j < 4; j++) {
@@ -1902,16 +1891,10 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
 
   vp9_copy(cpi->common.fc.pre_coef_probs_4x4,
            cpi->common.fc.coef_probs_4x4);
-  vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_4x4,
-           cpi->common.fc.hybrid_coef_probs_4x4);
   vp9_copy(cpi->common.fc.pre_coef_probs_8x8,
            cpi->common.fc.coef_probs_8x8);
-  vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_8x8,
-           cpi->common.fc.hybrid_coef_probs_8x8);
   vp9_copy(cpi->common.fc.pre_coef_probs_16x16,
            cpi->common.fc.coef_probs_16x16);
-  vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_16x16,
-           cpi->common.fc.hybrid_coef_probs_16x16);
   vp9_copy(cpi->common.fc.pre_coef_probs_32x32,
            cpi->common.fc.coef_probs_32x32);
   vp9_copy(cpi->common.fc.pre_sb_ymode_prob, cpi->common.fc.sb_ymode_prob);
@@ -1960,7 +1943,7 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
   if (pc->mcomp_filter_type == SWITCHABLE)
     update_switchable_interp_probs(cpi, &header_bc);
 
-  #if CONFIG_COMP_INTERINTRA_PRED
+#if CONFIG_COMP_INTERINTRA_PRED
   if (pc->use_interintra) {
     vp9_cond_prob_update(&header_bc,
                          &pc->fc.interintra_prob,
@@ -1995,6 +1978,25 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
     vp9_write_nmv_probs(cpi, xd->allow_high_precision_mv, &header_bc);
   }
 
+  /* tiling */
+  {
+    int min_log2_tiles, delta_log2_tiles, n_tile_bits, n;
+
+    vp9_get_tile_n_bits(pc, &min_log2_tiles, &delta_log2_tiles);
+    n_tile_bits = pc->log2_tile_columns - min_log2_tiles;
+    for (n = 0; n < delta_log2_tiles; n++) {
+      if (n_tile_bits--) {
+        vp9_write_bit(&header_bc, 1);
+      } else {
+        vp9_write_bit(&header_bc, 0);
+        break;
+      }
+    }
+    vp9_write_bit(&header_bc, pc->log2_tile_rows != 0);
+    if (pc->log2_tile_rows != 0)
+      vp9_write_bit(&header_bc, pc->log2_tile_rows != 1);
+  }
+
   vp9_stop_encode(&header_bc);
 
   oh.first_partition_length_in_bytes = header_bc.pos;
@@ -2012,42 +2014,80 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
   }
 
   *size = VP9_HEADER_SIZE + extra_bytes_packed + header_bc.pos;
 
-  vp9_start_encode(&residual_bc, cx_data + header_bc.pos);
 
   if (pc->frame_type == KEY_FRAME) {
     decide_kf_ymode_entropy(cpi);
-    write_modes(cpi, &residual_bc);
   } else {
     /* This is not required if the counts in cpi are consistent with the
      * final packing pass */
     // if (!cpi->dummy_packing)
     vp9_zero(cpi->NMVcount);
-    write_modes(cpi, &residual_bc);
-
-    vp9_update_mode_context(&cpi->common);
   }
 
-  vp9_stop_encode(&residual_bc);
+  {
+    int tile_row, tile_col, total_size = 0;
+    unsigned char *data_ptr = cx_data + header_bc.pos;
+    TOKENEXTRA *tok[1 << 6], *tok_end;
+
+    tok[0] = cpi->tok;
+    for (tile_col = 1; tile_col < pc->tile_columns; tile_col++)
+      tok[tile_col] = tok[tile_col - 1] + cpi->tok_count[tile_col - 1];
+
+    for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) {
+      vp9_get_tile_row_offsets(pc, tile_row);
+      tok_end = cpi->tok + cpi->tok_count[0];
+      for (tile_col = 0; tile_col < pc->tile_columns;
+           tile_col++, tok_end += cpi->tok_count[tile_col]) {
+        vp9_get_tile_col_offsets(pc, tile_col);
+
+        if (tile_col < pc->tile_columns - 1 || tile_row < pc->tile_rows - 1)
+          vp9_start_encode(&residual_bc, data_ptr + total_size + 4);
+        else
+          vp9_start_encode(&residual_bc, data_ptr + total_size);
+        write_modes(cpi, &residual_bc, &tok[tile_col], tok_end);
+        vp9_stop_encode(&residual_bc);
+        if (tile_col < pc->tile_columns - 1 || tile_row < pc->tile_rows - 1) {
+          /* size of this tile */
+          data_ptr[total_size + 0] = residual_bc.pos;
+          data_ptr[total_size + 1] = residual_bc.pos >> 8;
+          data_ptr[total_size + 2] = residual_bc.pos >> 16;
+          data_ptr[total_size + 3] = residual_bc.pos >> 24;
+          total_size += 4;
+        }
+
+        total_size += residual_bc.pos;
+      }
+    }
+
+    assert((unsigned int)(tok[0] - cpi->tok) == cpi->tok_count[0]);
+    for (tile_col = 1; tile_col < pc->tile_columns; tile_col++)
+      assert((unsigned int)(tok[tile_col] - tok[tile_col - 1]) ==
             cpi->tok_count[tile_col]);
 
-  *size += residual_bc.pos;
+    *size += total_size;
+  }
 }
 
 #ifdef ENTROPY_STATS
 static void print_tree_update_for_type(FILE *f,
                                        vp9_coeff_stats *tree_update_hist,
                                        int block_types, const char *header) {
-  int i, j, k, l;
+  int i, j, k, l, m;
 
   fprintf(f, "const vp9_coeff_prob %s = {\n", header);
   for (i = 0; i < block_types; i++) {
     fprintf(f, "  { \n");
-    for (j = 0; j < COEF_BANDS; j++) {
-      fprintf(f, "    {\n");
-      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
-        fprintf(f, "      {");
-        for (l = 0; l < ENTROPY_NODES; l++) {
-          fprintf(f, "%3d, ",
-                  get_binary_prob(tree_update_hist[i][j][k][l][0],
-                                  tree_update_hist[i][j][k][l][1]));
+    for (j = 0; j < REF_TYPES; j++) {
+      fprintf(f, "  { \n");
+      for (k = 0; k < COEF_BANDS; k++) {
+        fprintf(f, "    {\n");
+        for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
+          fprintf(f, "      {");
+          for (m = 0; m < ENTROPY_NODES; m++) {
+            fprintf(f, "%3d, ",
                    get_binary_prob(tree_update_hist[i][j][k][l][m][0],
                                    tree_update_hist[i][j][k][l][m][1]));
+          }
+          fprintf(f, "},\n");
        }
        fprintf(f, "},\n");
      }
@@ -2062,18 +2102,11 @@ void print_tree_update_probs() {
   FILE *f = fopen("coefupdprob.h", "w");
   fprintf(f, "\n/* Update probabilities for token entropy tree. */\n\n");
 
-  print_tree_update_for_type(f, tree_update_hist_4x4, BLOCK_TYPES_4X4,
+  print_tree_update_for_type(f, tree_update_hist_4x4, BLOCK_TYPES,
                              "vp9_coef_update_probs_4x4[BLOCK_TYPES_4X4]");
-  print_tree_update_for_type(f, hybrid_tree_update_hist_4x4, BLOCK_TYPES_4X4,
-                             "vp9_coef_update_probs_4x4[BLOCK_TYPES_4X4]");
-  print_tree_update_for_type(f, tree_update_hist_8x8, BLOCK_TYPES_8X8,
-                             "vp9_coef_update_probs_8x8[BLOCK_TYPES_8X8]");
-  print_tree_update_for_type(f, hybrid_tree_update_hist_8x8, BLOCK_TYPES_8X8,
+  print_tree_update_for_type(f, tree_update_hist_8x8, BLOCK_TYPES,
                              "vp9_coef_update_probs_8x8[BLOCK_TYPES_8X8]");
-  print_tree_update_for_type(f, tree_update_hist_16x16, BLOCK_TYPES_16X16,
-                             "vp9_coef_update_probs_16x16[BLOCK_TYPES_16X16]");
-  print_tree_update_for_type(f, hybrid_tree_update_hist_16x16,
-                             BLOCK_TYPES_16X16,
+  print_tree_update_for_type(f, tree_update_hist_16x16, BLOCK_TYPES,
                              "vp9_coef_update_probs_16x16[BLOCK_TYPES_16X16]");
   print_tree_update_for_type(f, tree_update_hist_32x32, BLOCK_TYPES_32X32,
                              "vp9_coef_update_probs_32x32[BLOCK_TYPES_32X32]");
@@ -2083,6 +2116,7 @@ void print_tree_update_probs() {
   fwrite(tree_update_hist_4x4, sizeof(tree_update_hist_4x4), 1, f);
   fwrite(tree_update_hist_8x8, sizeof(tree_update_hist_8x8), 1, f);
   fwrite(tree_update_hist_16x16, sizeof(tree_update_hist_16x16), 1, f);
+  fwrite(tree_update_hist_32x32, sizeof(tree_update_hist_32x32), 1, f);
   fclose(f);
 }
 #endif
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 1960b9162..79a021cfb 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -50,10 +50,7 @@ typedef struct block {
   int src;
   int src_stride;
 
-  int eob_max_offset;
-  int eob_max_offset_8x8;
-  int eob_max_offset_16x16;
-  int eob_max_offset_32x32;
+  int skip_block;
 } BLOCK;
 
 typedef struct {
@@ -91,12 +88,12 @@ typedef struct superblock {
   DECLARE_ALIGNED(16, int16_t, coeff[32*32+16*16*2]);
 } SUPERBLOCK;
 
-typedef struct macroblock {
-  DECLARE_ALIGNED(16, int16_t, src_diff[400]);  // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
-  DECLARE_ALIGNED(16, int16_t, coeff[400]);     // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
+typedef struct macroblock MACROBLOCK;
+struct macroblock {
+  DECLARE_ALIGNED(16, int16_t, src_diff[384]);  // 16x16 Y 8x8 U 8x8 V
+  DECLARE_ALIGNED(16, int16_t, coeff[384]);     // 16x16 Y 8x8 U 8x8 V
   // 16 Y blocks, 4 U blocks, 4 V blocks,
-  // 1 DC 2nd order block each with 16 entries
-  BLOCK block[25];
+  BLOCK block[24];
 
   SUPERBLOCK sb_coeff_data;
 
@@ -160,8 +157,7 @@ typedef struct macroblock {
 
   unsigned char *active_ptr;
 
-  vp9_coeff_count token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES_4X4];
-  vp9_coeff_count hybrid_token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES_4X4];
+  vp9_coeff_count token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES];
 
   int optimize;
 
@@ -172,17 +168,14 @@ typedef struct macroblock {
   PICK_MODE_CONTEXT sb32_context[4];
   PICK_MODE_CONTEXT sb64_context;
 
-  void (*vp9_short_fdct4x4)(int16_t *input, int16_t *output, int pitch);
-  void (*vp9_short_fdct8x4)(int16_t *input, int16_t *output, int pitch);
-  void (*short_walsh4x4)(int16_t *input, int16_t *output, int pitch);
-  void (*quantize_b_4x4)(BLOCK *b, BLOCKD *d);
-  void (*quantize_b_4x4_pair)(BLOCK *b1, BLOCK *b2, BLOCKD *d0, BLOCKD *d1);
-  void (*vp9_short_fdct8x8)(int16_t *input, int16_t *output, int pitch);
-  void (*vp9_short_fdct16x16)(int16_t *input, int16_t *output, int pitch);
-  void (*short_fhaar2x2)(int16_t *input, int16_t *output, int pitch);
-  void (*quantize_b_16x16)(BLOCK *b, BLOCKD *d);
-  void (*quantize_b_8x8)(BLOCK *b, BLOCKD *d);
-  void (*quantize_b_2x2)(BLOCK *b, BLOCKD *d);
-} MACROBLOCK;
+  void (*fwd_txm4x4)(int16_t *input, int16_t *output, int pitch);
+  void (*fwd_txm8x4)(int16_t *input, int16_t *output, int pitch);
+  void (*fwd_txm8x8)(int16_t *input, int16_t *output, int pitch);
+  void (*fwd_txm16x16)(int16_t *input, int16_t *output, int pitch);
+  void (*quantize_b_4x4)(MACROBLOCK *x, int b_idx);
+  void (*quantize_b_4x4_pair)(MACROBLOCK *x, int b_idx1, int b_idx2);
+  void (*quantize_b_16x16)(MACROBLOCK *x, int b_idx);
+  void (*quantize_b_8x8)(MACROBLOCK *x, int b_idx);
+};
 
 #endif  // VP9_ENCODER_VP9_BLOCK_H_
diff --git a/vp9/encoder/vp9_boolhuff.c b/vp9/encoder/vp9_boolhuff.c
index d1b1e0e89..a590902c2 100644
--- a/vp9/encoder/vp9_boolhuff.c
+++ b/vp9/encoder/vp9_boolhuff.c
@@ -40,7 +40,6 @@ const unsigned int vp9_prob_cost[256] = {
 };
 
 void vp9_start_encode(BOOL_CODER *br, unsigned char *source) {
-
   br->lowvalue = 0;
   br->range = 255;
   br->value = 0;
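The vp9_block.h hunk above also changes the transform/quantize hooks from per-block (BLOCK *, BLOCKD *) pairs to a (MACROBLOCK *, block index) convention, so each kernel sees the full macroblock state and SIMD variants can be swapped in behind one pointer. A toy illustration of that indirection (simplified types and placeholder math, not the real MACROBLOCK):

typedef struct macroblock MACROBLOCK;
struct macroblock {
  short coeff[384];  /* 16x16 Y + 8x8 U + 8x8 V coefficients */
  /* Call sites dispatch through this pointer, e.g.
   * x->quantize_b_4x4(x, b_idx), so an optimized build can install a
   * different implementation without touching the callers. */
  void (*quantize_b_4x4)(MACROBLOCK *x, int b_idx);
};

static void quantize_b_4x4_c(MACROBLOCK *x, int b_idx) {
  int i;
  short *block = x->coeff + 16 * b_idx;  /* one 4x4 block = 16 coeffs */
  for (i = 0; i < 16; i++)
    block[i] /= 8;  /* placeholder for the real quantizer math */
}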
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index bfde02ccb..e4ac2ce36 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -15,842 +15,362 @@
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9/common/vp9_blockd.h"
-
-// TODO: these transforms can be converted into integer forms to reduce
-// the complexity
-static const float dct_4[16] = {
-  0.500000000000000, 0.500000000000000, 0.500000000000000, 0.500000000000000,
-  0.653281482438188, 0.270598050073099, -0.270598050073099, -0.653281482438188,
-  0.500000000000000, -0.500000000000000, -0.500000000000000, 0.500000000000000,
-  0.270598050073099, -0.653281482438188, 0.653281482438188, -0.270598050073099
-};
-
-static const float adst_4[16] = {
-  0.228013428883779, 0.428525073124360, 0.577350269189626, 0.656538502008139,
-  0.577350269189626, 0.577350269189626, 0.000000000000000, -0.577350269189626,
-  0.656538502008139, -0.228013428883779, -0.577350269189626, 0.428525073124359,
-  0.428525073124360, -0.656538502008139, 0.577350269189626, -0.228013428883779
-};
-
-static const float dct_8[64] = {
-  0.353553390593274, 0.353553390593274, 0.353553390593274, 0.353553390593274,
-  0.353553390593274, 0.353553390593274, 0.353553390593274, 0.353553390593274,
-  0.490392640201615, 0.415734806151273, 0.277785116509801, 0.097545161008064,
-  -0.097545161008064, -0.277785116509801, -0.415734806151273, -0.490392640201615,
-  0.461939766255643, 0.191341716182545, -0.191341716182545, -0.461939766255643,
-  -0.461939766255643, -0.191341716182545, 0.191341716182545, 0.461939766255643,
-  0.415734806151273, -0.097545161008064, -0.490392640201615, -0.277785116509801,
-  0.277785116509801, 0.490392640201615, 0.097545161008064, -0.415734806151273,
-  0.353553390593274, -0.353553390593274, -0.353553390593274, 0.353553390593274,
-  0.353553390593274, -0.353553390593274, -0.353553390593274, 0.353553390593274,
-  0.277785116509801, -0.490392640201615, 0.097545161008064, 0.415734806151273,
-  -0.415734806151273, -0.097545161008064, 0.490392640201615, -0.277785116509801,
-  0.191341716182545, -0.461939766255643, 0.461939766255643, -0.191341716182545,
-  -0.191341716182545, 0.461939766255643, -0.461939766255643, 0.191341716182545,
-  0.097545161008064, -0.277785116509801, 0.415734806151273, -0.490392640201615,
-  0.490392640201615, -0.415734806151273, 0.277785116509801, -0.097545161008064
-};
-
-static const float adst_8[64] = {
-  0.089131608307533, 0.175227946595735, 0.255357107325376, 0.326790388032145,
-  0.387095214016349, 0.434217976756762, 0.466553967085785, 0.483002021635509,
-  0.255357107325376, 0.434217976756762, 0.483002021635509, 0.387095214016349,
-  0.175227946595735, -0.089131608307533, -0.326790388032145, -0.466553967085785,
-  0.387095214016349, 0.466553967085785, 0.175227946595735, -0.255357107325376,
-  -0.483002021635509, -0.326790388032145, 0.089131608307533, 0.434217976756762,
-  0.466553967085785, 0.255357107325376, -0.326790388032145, -0.434217976756762,
-  0.089131608307533, 0.483002021635509, 0.175227946595735, -0.387095214016348,
-  0.483002021635509, -0.089131608307533, -0.466553967085785, 0.175227946595735,
-  0.434217976756762, -0.255357107325376, -0.387095214016348, 0.326790388032145,
-  0.434217976756762, -0.387095214016348, -0.089131608307533, 0.466553967085786,
-  -0.326790388032145, -0.175227946595735, 0.483002021635509, -0.255357107325375,
-  0.326790388032145, -0.483002021635509, 0.387095214016349, -0.089131608307534,
-  -0.255357107325377, 0.466553967085785, -0.434217976756762, 0.175227946595736,
-  0.175227946595735, -0.326790388032145, 0.434217976756762, -0.483002021635509,
-  0.466553967085785, -0.387095214016348, 0.255357107325376, -0.089131608307532
-};
-
-/* Converted the transforms to integers. */
-static const int16_t dct_i4[16] = {
-  16384, 16384, 16384, 16384,
-  21407, 8867, -8867, -21407,
-  16384, -16384, -16384, 16384,
-  8867, -21407, 21407, -8867
-};
-
-static const int16_t adst_i4[16] = {
-  7472, 14042, 18919, 21513,
-  18919, 18919, 0, -18919,
-  21513, -7472, -18919, 14042,
-  14042, -21513, 18919, -7472
-};
-
-static const int16_t dct_i8[64] = {
-  11585, 11585, 11585, 11585,
-  11585, 11585, 11585, 11585,
-  16069, 13623, 9102, 3196,
-  -3196, -9102, -13623, -16069,
-  15137, 6270, -6270, -15137,
-  -15137, -6270, 6270, 15137,
-  13623, -3196, -16069, -9102,
-  9102, 16069, 3196, -13623,
-  11585, -11585, -11585, 11585,
-  11585, -11585, -11585, 11585,
-  9102, -16069, 3196, 13623,
-  -13623, -3196, 16069, -9102,
-  6270, -15137, 15137, -6270,
-  -6270, 15137, -15137, 6270,
-  3196, -9102, 13623, -16069,
-  16069, -13623, 9102, -3196
-};
-
-static const int16_t adst_i8[64] = {
-  2921, 5742, 8368, 10708,
-  12684, 14228, 15288, 15827,
-  8368, 14228, 15827, 12684,
-  5742, -2921, -10708, -15288,
-  12684, 15288, 5742, -8368,
-  -15827, -10708, 2921, 14228,
-  15288, 8368, -10708, -14228,
-  2921, 15827, 5742, -12684,
-  15827, -2921, -15288, 5742,
-  14228, -8368, -12684, 10708,
-  14228, -12684, -2921, 15288,
-  -10708, -5742, 15827, -8368,
-  10708, -15827, 12684, -2921,
-  -8368, 15288, -14228, 5742,
-  5742, -10708, 14228, -15827,
-  15288, -12684, 8368, -2921
-};
-
-static const float dct_16[256] = {
-  0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000,
-  0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000,
-  0.351851, 0.338330, 0.311806, 0.273300, 0.224292, 0.166664, 0.102631, 0.034654,
-  -0.034654, -0.102631, -0.166664, -0.224292, -0.273300, -0.311806, -0.338330, -0.351851,
-  0.346760, 0.293969, 0.196424, 0.068975, -0.068975, -0.196424, -0.293969, -0.346760,
-  -0.346760, -0.293969, -0.196424, -0.068975, 0.068975, 0.196424, 0.293969, 0.346760,
-  0.338330, 0.224292, 0.034654, -0.166664, -0.311806, -0.351851, -0.273300, -0.102631,
-  0.102631, 0.273300, 0.351851, 0.311806, 0.166664, -0.034654, -0.224292, -0.338330,
-  0.326641, 0.135299, -0.135299, -0.326641, -0.326641, -0.135299, 0.135299, 0.326641,
-  0.326641, 0.135299, -0.135299, -0.326641, -0.326641, -0.135299, 0.135299, 0.326641,
-  0.311806, 0.034654, -0.273300, -0.338330, -0.102631, 0.224292, 0.351851, 0.166664,
-  -0.166664, -0.351851, -0.224292, 0.102631, 0.338330, 0.273300, -0.034654, -0.311806,
-  0.293969, -0.068975, -0.346760, -0.196424, 0.196424, 0.346760, 0.068975, -0.293969,
-  -0.293969, 0.068975, 0.346760, 0.196424, -0.196424, -0.346760, -0.068975, 0.293969,
-  0.273300, -0.166664, -0.338330, 0.034654, 0.351851, 0.102631, -0.311806, -0.224292,
-  0.224292, 0.311806, -0.102631, -0.351851, -0.034654, 0.338330, 0.166664, -0.273300,
-  0.250000, -0.250000, -0.250000, 0.250000, 0.250000, -0.250000, -0.250000, 0.250000,
-  0.250000, -0.250000, -0.250000, 0.250000, 0.250000, -0.250000, -0.250000, 0.250000,
-  0.224292, -0.311806, -0.102631, 0.351851, -0.034654, -0.338330, 0.166664, 0.273300,
-  -0.273300, -0.166664, 0.338330, 0.034654, -0.351851, 0.102631, 0.311806, -0.224292,
-  0.196424, -0.346760, 0.068975, 0.293969, -0.293969, -0.068975, 0.346760, -0.196424,
-  -0.196424, 0.346760, -0.068975, -0.293969, 0.293969, 0.068975, -0.346760, 0.196424,
-  0.166664, -0.351851, 0.224292, 0.102631, -0.338330, 0.273300, 0.034654, -0.311806,
-  0.311806, -0.034654, -0.273300, 0.338330, -0.102631, -0.224292, 0.351851, -0.166664,
-  0.135299, -0.326641, 0.326641, -0.135299, -0.135299, 0.326641, -0.326641, 0.135299,
-  0.135299, -0.326641, 0.326641, -0.135299, -0.135299, 0.326641, -0.326641, 0.135299,
-  0.102631, -0.273300, 0.351851, -0.311806, 0.166664, 0.034654, -0.224292, 0.338330,
-  -0.338330, 0.224292, -0.034654, -0.166664, 0.311806, -0.351851, 0.273300, -0.102631,
-  0.068975, -0.196424, 0.293969, -0.346760, 0.346760, -0.293969, 0.196424, -0.068975,
-  -0.068975, 0.196424, -0.293969, 0.346760, -0.346760, 0.293969, -0.196424, 0.068975,
-  0.034654, -0.102631, 0.166664, -0.224292, 0.273300, -0.311806, 0.338330, -0.351851,
-  0.351851, -0.338330, 0.311806, -0.273300, 0.224292, -0.166664, 0.102631, -0.034654
-};
-
-static const float adst_16[256] = {
-  0.033094, 0.065889, 0.098087, 0.129396, 0.159534, 0.188227, 0.215215, 0.240255,
-  0.263118, 0.283599, 0.301511, 0.316693, 0.329007, 0.338341, 0.344612, 0.347761,
-  0.098087, 0.188227, 0.263118, 0.316693, 0.344612, 0.344612, 0.316693, 0.263118,
-  0.188227, 0.098087, 0.000000, -0.098087, -0.188227, -0.263118, -0.316693, -0.344612,
-  0.159534, 0.283599, 0.344612, 0.329007, 0.240255, 0.098087, -0.065889, -0.215215,
-  -0.316693, -0.347761, -0.301511, -0.188227, -0.033094, 0.129396, 0.263118, 0.338341,
-  0.215215, 0.338341, 0.316693, 0.159534, -0.065889, -0.263118, -0.347761, -0.283599,
-  -0.098087, 0.129396, 0.301511, 0.344612, 0.240255, 0.033094, -0.188227, -0.329007,
-  0.263118, 0.344612, 0.188227, -0.098087, -0.316693, -0.316693, -0.098087, 0.188227,
-  0.344612, 0.263118, 0.000000, -0.263118, -0.344612, -0.188227, 0.098087, 0.316693,
-  0.301511, 0.301511, 0.000000, -0.301511, -0.301511, -0.000000, 0.301511, 0.301511,
-  0.000000, -0.301511, -0.301511, -0.000000, 0.301511, 0.301511, 0.000000, -0.301511,
-  0.329007, 0.215215, -0.188227, -0.338341, -0.033094, 0.316693, 0.240255, -0.159534,
-  -0.344612, -0.065889, 0.301511, 0.263118, -0.129396, -0.347761, -0.098087, 0.283599,
-  0.344612, 0.098087, -0.316693, -0.188227, 0.263118, 0.263118, -0.188227, -0.316693,
-  0.098087, 0.344612, 0.000000, -0.344612, -0.098087, 0.316693, 0.188227, -0.263118,
-  0.347761, -0.033094, -0.344612, 0.065889, 0.338341, -0.098087, -0.329007, 0.129396,
-  0.316693, -0.159534, -0.301511, 0.188227, 0.283599, -0.215215, -0.263118, 0.240255,
-  0.338341, -0.159534, -0.263118, 0.283599, 0.129396, -0.344612, 0.033094, 0.329007,
-  -0.188227, -0.240255, 0.301511, 0.098087, -0.347761, 0.065889, 0.316693, -0.215215,
-  0.316693, -0.263118, -0.098087, 0.344612, -0.188227, -0.188227, 0.344612, -0.098087,
-  -0.263118, 0.316693, 0.000000, -0.316693, 0.263118, 0.098087, -0.344612, 0.188227,
-  0.283599, -0.329007, 0.098087, 0.215215, -0.347761, 0.188227, 0.129396, -0.338341,
-  0.263118, 0.033094, -0.301511, 0.316693, -0.065889, -0.240255, 0.344612, -0.159534,
-  0.240255, -0.347761, 0.263118, -0.033094, -0.215215, 0.344612, -0.283599, 0.065889,
-  0.188227, -0.338341, 0.301511, -0.098087, -0.159534, 0.329007, -0.316693, 0.129396,
-  0.188227, -0.316693, 0.344612, -0.263118, 0.098087, 0.098087, -0.263118, 0.344612,
-  -0.316693, 0.188227, 0.000000, -0.188227, 0.316693, -0.344612, 0.263118, -0.098087,
-  0.129396, -0.240255, 0.316693, -0.347761, 0.329007, -0.263118, 0.159534, -0.033094,
-  -0.098087, 0.215215, -0.301511, 0.344612, -0.338341, 0.283599, -0.188227, 0.065889,
-  0.065889, -0.129396, 0.188227, -0.240255, 0.283599, -0.316693, 0.338341, -0.347761,
-  0.344612, -0.329007, 0.301511, -0.263118, 0.215215, -0.159534, 0.098087, -0.033094
-};
-
-/* Converted the transforms to integers. */
-static const int16_t dct_i16[256] = {
-  8192, 8192, 8192, 8192, 8192, 8192, 8192, 8192,
-  8192, 8192, 8192, 8192, 8192, 8192, 8192, 8192,
-  11529, 11086, 10217, 8955, 7350, 5461, 3363, 1136,
-  -1136, -3363, -5461, -7350, -8955, -10217, -11086, -11529,
-  11363, 9633, 6436, 2260, -2260, -6436, -9633, -11363,
-  -11363, -9633, -6436, -2260, 2260, 6436, 9633, 11363,
-  11086, 7350, 1136, -5461, -10217, -11529, -8955, -3363,
-  3363, 8955, 11529, 10217, 5461, -1136, -7350, -11086,
-  10703, 4433, -4433, -10703, -10703, -4433, 4433, 10703,
-  10703, 4433, -4433, -10703, -10703, -4433, 4433, 10703,
-  10217, 1136, -8955, -11086, -3363, 7350, 11529, 5461,
-  -5461, -11529, -7350, 3363, 11086, 8955, -1136, -10217,
-  9633, -2260, -11363, -6436, 6436, 11363, 2260, -9633,
-  -9633, 2260, 11363, 6436, -6436, -11363, -2260, 9633,
-  8955, -5461, -11086, 1136, 11529, 3363, -10217, -7350,
-  7350, 10217, -3363, -11529, -1136, 11086, 5461, -8955,
-  8192, -8192, -8192, 8192, 8192, -8192, -8192, 8192,
-  8192, -8192, -8192, 8192, 8192, -8192, -8192, 8192,
-  7350, -10217, -3363, 11529, -1136, -11086, 5461, 8955,
-  -8955, -5461, 11086, 1136, -11529, 3363, 10217, -7350,
-  6436, -11363, 2260, 9633, -9633, -2260, 11363, -6436,
-  -6436, 11363, -2260, -9633, 9633, 2260, -11363, 6436,
-  5461, -11529, 7350, 3363, -11086, 8955, 1136, -10217,
-  10217, -1136, -8955, 11086, -3363, -7350, 11529, -5461,
-  4433, -10703, 10703, -4433, -4433, 10703, -10703, 4433,
-  4433, -10703, 10703, -4433, -4433, 10703, -10703, 4433,
-  3363, -8955, 11529, -10217, 5461, 1136, -7350, 11086,
-  -11086, 7350, -1136, -5461, 10217, -11529, 8955, -3363,
-  2260, -6436, 9633, -11363, 11363, -9633, 6436, -2260,
-  -2260, 6436, -9633, 11363, -11363, 9633, -6436, 2260,
-  1136, -3363, 5461, -7350, 8955, -10217, 11086, -11529,
-  11529, -11086, 10217, -8955, 7350, -5461, 3363, -1136
-};
-
-static const int16_t adst_i16[256] = {
-  1084, 2159, 3214, 4240, 5228, 6168, 7052, 7873,
-  8622, 9293, 9880, 10377, 10781, 11087, 11292, 11395,
-  3214, 6168, 8622, 10377, 11292, 11292, 10377, 8622,
-  6168, 3214, 0, -3214, -6168, -8622, -10377, -11292,
-  5228, 9293, 11292, 10781, 7873, 3214, -2159, -7052,
-  -10377, -11395, -9880, -6168, -1084, 4240, 8622, 11087,
-  7052, 11087, 10377, 5228, -2159, -8622, -11395, -9293,
-  -3214, 4240, 9880, 11292, 7873, 1084, -6168, -10781,
-  8622, 11292, 6168, -3214, -10377, -10377, -3214, 6168,
-  11292, 8622, 0, -8622, -11292, -6168, 3214, 10377,
-  9880, 9880, 0, -9880, -9880, 0, 9880, 9880,
-  0, -9880, -9880, 0, 9880, 9880, 0, -9880,
-  10781, 7052, -6168, -11087, -1084, 10377, 7873, -5228,
-  -11292, -2159, 9880, 8622, -4240, -11395, -3214, 9293,
-  11292, 3214, -10377, -6168, 8622, 8622, -6168, -10377,
-  3214, 11292, 0, -11292, -3214, 10377, 6168, -8622,
-  11395, -1084, -11292, 2159, 11087, -3214, -10781, 4240,
-  10377, -5228, -9880, 6168, 9293, -7052, -8622, 7873,
-  11087, -5228, -8622, 9293, 4240, -11292, 1084, 10781,
-  -6168, -7873, 9880, 3214, -11395, 2159, 10377, -7052,
-  10377, -8622, -3214, 11292, -6168, -6168, 11292, -3214,
-  -8622, 10377, 0, -10377, 8622, 3214, -11292, 6168,
-  9293, -10781, 3214, 7052, -11395, 6168, 4240, -11087,
-  8622, 1084, -9880, 10377, -2159, -7873, 11292, -5228,
-  7873, -11395, 8622, -1084, -7052, 11292, -9293, 2159,
-  6168, -11087, 9880, -3214, -5228, 10781, -10377, 4240,
-  6168, -10377, 11292, -8622, 3214, 3214, -8622, 11292,
-  -10377, 6168, 0, -6168, 10377, -11292, 8622, -3214,
-  4240, -7873, 10377, -11395, 10781, -8622, 5228, -1084,
-  -3214, 7052, -9880, 11292, -11087, 9293, -6168, 2159,
-  2159, -4240, 6168, -7873, 9293, -10377, 11087, -11395,
-  11292, -10781, 9880, -8622, 7052, -5228, 3214, -1084
-};
-
-static const int xC1S7 = 16069;
-static const int xC2S6 = 15137;
-static const int xC3S5 = 13623;
-static const int xC4S4 = 11585;
-static const int xC5S3 = 9102;
-static const int xC6S2 = 6270;
-static const int xC7S1 = 3196;
-
-#define SHIFT_BITS 14
-#define DOROUND(X) X += (1<<(SHIFT_BITS-1));
-
-#define FINAL_SHIFT 3
-#define FINAL_ROUNDING (1<<(FINAL_SHIFT -1))
-#define IN_SHIFT (FINAL_SHIFT+1)
-
-
-void vp9_short_fdct8x8_c(short *InputData, short *OutputData, int pitch) {
-  int loop;
-  int short_pitch = pitch >> 1;
-  int is07, is12, is34, is56;
-  int is0734, is1256;
-  int id07, id12, id34, id56;
-  int irot_input_x, irot_input_y;
-  int icommon_product1;      // Re-used product  (c4s4 * (s12 - s56))
-  int icommon_product2;      // Re-used product  (c4s4 * (d12 + d56))
-  int temp1, temp2;          // intermediate variable for computation
-
-  int InterData[64];
-  int *ip = InterData;
-  short *op = OutputData;
-
-  for (loop = 0; loop < 8; loop++) {
-    // Pre calculate some common sums and differences.
-    is07 = (InputData[0] + InputData[7]) << IN_SHIFT;
-    is12 = (InputData[1] + InputData[2]) << IN_SHIFT;
-    is34 = (InputData[3] + InputData[4]) << IN_SHIFT;
-    is56 = (InputData[5] + InputData[6]) << IN_SHIFT;
-    id07 = (InputData[0] - InputData[7]) << IN_SHIFT;
-    id12 = (InputData[1] - InputData[2]) << IN_SHIFT;
-    id34 = (InputData[3] - InputData[4]) << IN_SHIFT;
-    id56 = (InputData[5] - InputData[6]) << IN_SHIFT;
-
-    is0734 = is07 + is34;
-    is1256 = is12 + is56;
-
-    // Pre-Calculate some common product terms.
-    icommon_product1 = xC4S4 * (is12 - is56);
-    DOROUND(icommon_product1)
-    icommon_product1 >>= SHIFT_BITS;
-
-    icommon_product2 = xC4S4 * (id12 + id56);
-    DOROUND(icommon_product2)
-    icommon_product2 >>= SHIFT_BITS;
-
-
-    ip[0] = (xC4S4 * (is0734 + is1256));
-    DOROUND(ip[0]);
-    ip[0] >>= SHIFT_BITS;
-
-    ip[4] = (xC4S4 * (is0734 - is1256));
-    DOROUND(ip[4]);
-    ip[4] >>= SHIFT_BITS;
-
-    // Define inputs to rotation for outputs 2 and 6
-    irot_input_x = id12 - id56;
-    irot_input_y = is07 - is34;
-
-    // Apply rotation for outputs 2 and 6.
-    temp1 = xC6S2 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC2S6 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    ip[2] = temp1 + temp2;
-
-    temp1 = xC6S2 * irot_input_y;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC2S6 * irot_input_x;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    ip[6] = temp1 - temp2;
-
-    // Define inputs to rotation for outputs 1 and 7
-    irot_input_x = icommon_product1 + id07;
-    irot_input_y = -(id34 + icommon_product2);
-
-    // Apply rotation for outputs 1 and 7.
-    temp1 = xC1S7 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC7S1 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    ip[1] = temp1 - temp2;
-
-    temp1 = xC7S1 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC1S7 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    ip[7] = temp1 + temp2;
-
-    // Define inputs to rotation for outputs 3 and 5
-    irot_input_x = id07 - icommon_product1;
-    irot_input_y = id34 - icommon_product2;
-
-    // Apply rotation for outputs 3 and 5.
-    temp1 = xC3S5 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC5S3 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    ip[3] = temp1 - temp2;
-
-
-    temp1 = xC5S3 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC3S5 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    ip[5] = temp1 + temp2;
-
-    // Increment data pointer for next row
-    InputData += short_pitch;
-    ip += 8;
-  }
-
-  // Performed DCT on rows, now transform the columns
-  ip = InterData;
-  for (loop = 0; loop < 8; loop++) {
-    // Pre calculate some common sums and differences.
-    is07 = ip[0 * 8] + ip[7 * 8];
-    is12 = ip[1 * 8] + ip[2 * 8];
-    is34 = ip[3 * 8] + ip[4 * 8];
-    is56 = ip[5 * 8] + ip[6 * 8];
-
-    id07 = ip[0 * 8] - ip[7 * 8];
-    id12 = ip[1 * 8] - ip[2 * 8];
-    id34 = ip[3 * 8] - ip[4 * 8];
-    id56 = ip[5 * 8] - ip[6 * 8];
-
-    is0734 = is07 + is34;
-    is1256 = is12 + is56;
-
-    // Pre-Calculate some common product terms
-    icommon_product1 = xC4S4 * (is12 - is56);
-    icommon_product2 = xC4S4 * (id12 + id56);
-    DOROUND(icommon_product1)
-    DOROUND(icommon_product2)
-    icommon_product1 >>= SHIFT_BITS;
-    icommon_product2 >>= SHIFT_BITS;
-
-
-    temp1 = xC4S4 * (is0734 + is1256);
-    temp2 = xC4S4 * (is0734 - is1256);
-    DOROUND(temp1);
-    DOROUND(temp2);
-    temp1 >>= SHIFT_BITS;
-
-    temp2 >>= SHIFT_BITS;
-    op[0 * 8] = (temp1 + FINAL_ROUNDING) >> FINAL_SHIFT;
-    op[4 * 8] = (temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
-    // Define inputs to rotation for outputs 2 and 6
-    irot_input_x = id12 - id56;
-    irot_input_y = is07 - is34;
-
-    // Apply rotation for outputs 2 and 6.
-    temp1 = xC6S2 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC2S6 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    op[2 * 8] = (temp1 + temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
-    temp1 = xC6S2 * irot_input_y;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC2S6 * irot_input_x;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    op[6 * 8] = (temp1 - temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
-    // Define inputs to rotation for outputs 1 and 7
-    irot_input_x = icommon_product1 + id07;
-    irot_input_y = -(id34 + icommon_product2);
-
-    // Apply rotation for outputs 1 and 7.
-    temp1 = xC1S7 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC7S1 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    op[1 * 8] = (temp1 - temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
-    temp1 = xC7S1 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC1S7 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    op[7 * 8] = (temp1 + temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
-    // Define inputs to rotation for outputs 3 and 5
-    irot_input_x = id07 - icommon_product1;
-    irot_input_y = id34 - icommon_product2;
-
-    // Apply rotation for outputs 3 and 5.
-    temp1 = xC3S5 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC5S3 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    op[3 * 8] = (temp1 - temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
-
-    temp1 = xC5S3 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC3S5 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    op[5 * 8] = (temp1 + temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
-    // Increment data pointer for next column.
-    ip++;
-    op++;
-  }
+#include "vp9/common/vp9_idct.h"
+
+static void fdct4_1d(int16_t *input, int16_t *output) {
+  int16_t step[4];
+  int temp1, temp2;
+
+  step[0] = input[0] + input[3];
+  step[1] = input[1] + input[2];
+  step[2] = input[1] - input[2];
+  step[3] = input[0] - input[3];
+
+  temp1 = (step[0] + step[1]) * cospi_16_64;
+  temp2 = (step[0] - step[1]) * cospi_16_64;
+  output[0] = dct_const_round_shift(temp1);
+  output[2] = dct_const_round_shift(temp2);
+  temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
+  temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
+  output[1] = dct_const_round_shift(temp1);
+  output[3] = dct_const_round_shift(temp2);
 }
 
-void vp9_short_fhaar2x2_c(short *input, short *output, int pitch) {
-  /* [1 1; 1 -1] orthogonal transform */
-  /* use position: 0,1, 4, 8 */
-  int i;
-  short *ip1 = input;
-  short *op1 = output;
-  for (i = 0; i < 16; i++) {
-    op1[i] = 0;
+void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int pitch) {
+  int16_t out[4 * 4];
+  int16_t *outptr = &out[0];
+  const int short_pitch = pitch >> 1;
+  int i, j;
+  int16_t temp_in[4], temp_out[4];
+
+  // Columns
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j)
+      temp_in[j] = input[j * short_pitch + i] << 4;
+    if (i == 0 && temp_in[0])
+      temp_in[0] += 1;
+    fdct4_1d(temp_in, temp_out);
+    for (j = 0; j < 4; ++j)
+      outptr[j * 4 + i] = temp_out[j];
   }
-
-  op1[0] = (ip1[0] + ip1[1] + ip1[4] + ip1[8] + 1) >> 1;
-  op1[1] = (ip1[0] - ip1[1] + ip1[4] - ip1[8]) >> 1;
-  op1[4] = (ip1[0] + ip1[1] - ip1[4] - ip1[8]) >> 1;
-  op1[8] = (ip1[0] - ip1[1] - ip1[4] + ip1[8]) >> 1;
-}
+
+  // Rows
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j)
+      temp_in[j] = out[j + i * 4];
+    fdct4_1d(temp_in, temp_out);
+    for (j = 0; j < 4; ++j)
+      output[j + i * 4] = (temp_out[j] + 1) >> 2;
+  }
+}
 
-/* For test */
-#define TEST_INT 1
-#if TEST_INT
-#define vp9_fht_int_c vp9_fht_c
-#else
-#define vp9_fht_float_c vp9_fht_c
-#endif
-
-void vp9_fht_float_c(const int16_t *input, int pitch, int16_t *output,
-                     TX_TYPE tx_type, int tx_dim) {
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-  {
-    int i, j, k;
-    float bufa[256], bufb[256];  // buffers are for floating-point test purpose
-                                 // the implementation could be simplified in
-                                 // conjunction with integer transform
-    const int16_t *ip = input;
-    int16_t *op = output;
-
-    float *pfa = &bufa[0];
-    float *pfb = &bufb[0];
-
-    // pointers to vertical and horizontal transforms
-    const float *ptv, *pth;
-
-    assert(tx_type != DCT_DCT);
-    // load and convert residual array into floating-point
-    for (j = 0; j < tx_dim; j++) {
-      for (i = 0; i < tx_dim; i++) {
-        pfa[i] = (float)ip[i];
-      }
-      pfa += tx_dim;
-      ip += pitch / 2;
-    }
-
-    // vertical transformation
-    pfa = &bufa[0];
-    pfb = &bufb[0];
-
-    switch (tx_type) {
-      case ADST_ADST :
-      case ADST_DCT :
-        ptv = (tx_dim == 4) ? &adst_4[0] :
-              ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
-        break;
-
-      default :
-        ptv = (tx_dim == 4) ? &dct_4[0] :
-              ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
-        break;
-    }
-
-    for (j = 0; j < tx_dim; j++) {
-      for (i = 0; i < tx_dim; i++) {
-        pfb[i] = 0;
-        for (k = 0; k < tx_dim; k++) {
-          pfb[i] += ptv[k] * pfa[(k * tx_dim)];
-        }
-        pfa += 1;
-      }
-      pfb += tx_dim;
-      ptv += tx_dim;
-      pfa = &bufa[0];
-    }
-
-    // horizontal transformation
-    pfa = &bufa[0];
-    pfb = &bufb[0];
-
-    switch (tx_type) {
-      case ADST_ADST :
-      case DCT_ADST :
-        pth = (tx_dim == 4) ? &adst_4[0] :
-              ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
-        break;
-
-      default :
-        pth = (tx_dim == 4) ? &dct_4[0] :
-              ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
-        break;
-    }
-
-    for (j = 0; j < tx_dim; j++) {
-      for (i = 0; i < tx_dim; i++) {
-        pfa[i] = 0;
-        for (k = 0; k < tx_dim; k++) {
-          pfa[i] += pfb[k] * pth[k];
-        }
-        pth += tx_dim;
-      }
-
-      pfa += tx_dim;
-      pfb += tx_dim;
-      // pth -= tx_dim * tx_dim;
-
-      switch (tx_type) {
-        case ADST_ADST :
-        case DCT_ADST :
-          pth = (tx_dim == 4) ? &adst_4[0] :
-                ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
-          break;
-
-        default :
-          pth = (tx_dim == 4) ? &dct_4[0] :
-                ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
-          break;
-      }
-    }
-
-    // convert to short integer format and load BLOCKD buffer
-    op = output;
-    pfa = &bufa[0];
-
-    for (j = 0; j < tx_dim; j++) {
-      for (i = 0; i < tx_dim; i++) {
-        op[i] = (pfa[i] > 0 ) ? (int16_t)( 8 * pfa[i] + 0.49) :
-                               -(int16_t)(- 8 * pfa[i] + 0.49);
-      }
-      op += tx_dim;
-      pfa += tx_dim;
-    }
-  }
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-}
-
-/* Converted the transforms to integer form. */
-#define VERTICAL_SHIFT 11
-#define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1)
-#define HORIZONTAL_SHIFT 16
-#define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1)
-void vp9_fht_int_c(const int16_t *input, int pitch, int16_t *output,
-                   TX_TYPE tx_type, int tx_dim) {
-  int i, j, k;
-  int16_t imbuf[256];
-
-  const int16_t *ip = input;
-  int16_t *op = output;
-  int16_t *im = &imbuf[0];
-
-  /* pointers to vertical and horizontal transforms. */
-  const int16_t *ptv = NULL, *pth = NULL;
-
-  switch (tx_type) {
-    case ADST_ADST :
-      ptv = pth = (tx_dim == 4) ? &adst_i4[0]
-                                : ((tx_dim == 8) ? &adst_i8[0]
-                                                 : &adst_i16[0]);
-      break;
-    case ADST_DCT :
-      ptv = (tx_dim == 4) ? &adst_i4[0]
-                          : ((tx_dim == 8) ? &adst_i8[0] : &adst_i16[0]);
-      pth = (tx_dim == 4) ? &dct_i4[0]
-                          : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);
-      break;
-    case DCT_ADST :
-      ptv = (tx_dim == 4) ? &dct_i4[0]
-                          : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);
-      pth = (tx_dim == 4) ? &adst_i4[0]
-                          : ((tx_dim == 8) ? &adst_i8[0] : &adst_i16[0]);
-      break;
-    case DCT_DCT :
-      ptv = pth = (tx_dim == 4) ? &dct_i4[0]
-                                : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);
-      break;
-    default:
-      assert(0);
-      break;
-  }
-
-  /* vertical transformation */
-  for (j = 0; j < tx_dim; j++) {
-    for (i = 0; i < tx_dim; i++) {
-      int temp = 0;
-
-      for (k = 0; k < tx_dim; k++) {
-        temp += ptv[k] * ip[(k * (pitch >> 1))];
-      }
-
-      im[i] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT);
-      ip++;
-    }
-    im += tx_dim;  // 16
-    ptv += tx_dim;
-    ip = input;
-  }
+static void fadst4_1d(int16_t *input, int16_t *output) {
+  int x0, x1, x2, x3;
+  int s0, s1, s2, s3, s4, s5, s6, s7;
+
+  x0 = input[0];
+  x1 = input[1];
+  x2 = input[2];
+  x3 = input[3];
+
+  if (!(x0 | x1 | x2 | x3)) {
+    output[0] = output[1] = output[2] = output[3] = 0;
+    return;
+  }
+
+  s0 = sinpi_1_9 * x0;
+  s1 = sinpi_4_9 * x0;
+  s2 = sinpi_2_9 * x1;
+  s3 = sinpi_1_9 * x1;
+  s4 = sinpi_3_9 * x2;
+  s5 = sinpi_4_9 * x3;
+  s6 = sinpi_2_9 * x3;
+  s7 = x0 + x1 - x3;
+
+  x0 = s0 + s2 + s5;
+  x1 = sinpi_3_9 * s7;
+  x2 = s1 - s3 + s6;
+  x3 = s4;
+
+  s0 = x0 + x3;
+  s1 = x1;
+  s2 = x2 - x3;
+  s3 = x2 - x0 + x3;
+
+  // 1-D transform scaling factor is sqrt(2).
+  output[0] = dct_const_round_shift(s0);
+  output[1] = dct_const_round_shift(s1);
+  output[2] = dct_const_round_shift(s2);
+  output[3] = dct_const_round_shift(s3);
+}
 
-  /* horizontal transformation */
-  im = &imbuf[0];
-
-  for (j = 0; j < tx_dim; j++) {
-    const int16_t *pthc = pth;
-
-    for (i = 0; i < tx_dim; i++) {
-      int temp = 0;
-
-      for (k = 0; k < tx_dim; k++) {
-        temp += im[k] * pthc[k];
-      }
-
-      op[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT);
-      pthc += tx_dim;
-    }
-
-    im += tx_dim;  // 16
-    op += tx_dim;
-  }
-}
+static const transform_2d FHT_4[] = {
+  { fdct4_1d,  fdct4_1d  },   // DCT_DCT  = 0
+  { fadst4_1d, fdct4_1d  },   // ADST_DCT = 1
+  { fdct4_1d,  fadst4_1d },   // DCT_ADST = 2
+  { fadst4_1d, fadst4_1d }    // ADST_ADST = 3
+};
+
+void vp9_short_fht4x4_c(int16_t *input, int16_t *output,
+                        int pitch, TX_TYPE tx_type) {
+  int16_t out[4 * 4];
+  int16_t *outptr = &out[0];
+  int i, j;
+  int16_t temp_in[4], temp_out[4];
+  const transform_2d ht = FHT_4[tx_type];
+
+  // Columns
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j)
+      temp_in[j] = input[j * pitch + i] << 4;
+    if (i == 0 && temp_in[0])
+      temp_in[0] += 1;
+    ht.cols(temp_in, temp_out);
+    for (j = 0; j < 4; ++j)
+      outptr[j * 4 + i] = temp_out[j];
+  }
 
-void vp9_short_fdct4x4_c(short *input, short *output, int pitch) {
-  int i;
-  int a1, b1, c1, d1;
-  short *ip = input;
-  short *op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ((ip[0] + ip[3]) << 5);
-    b1 = ((ip[1] + ip[2]) << 5);
-    c1 = ((ip[1] - ip[2]) << 5);
-    d1 = ((ip[0] - ip[3]) << 5);
-
-    op[0] = a1 + b1;
-    op[2] = a1 - b1;
-
-    op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;
-    op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12;
-
-    ip += pitch / 2;
-    op += 4;
-  }
+  // Rows
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j)
+      temp_in[j] = out[j + i * 4];
+    ht.rows(temp_in, temp_out);
+    for (j = 0; j < 4; ++j)
+      output[j + i * 4] = (temp_out[j] + 1) >> 2;
+  }
+}
-  ip = output;
-  op = output;
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] + ip[12];
-    b1 = ip[4] + ip[8];
-    c1 = ip[4] - ip[8];
-    d1 = ip[0] - ip[12];
-
-    op[0] = (a1 + b1 + 7) >> 4;
-    op[8] = (a1 - b1 + 7) >> 4;
-    op[4] = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0);
-    op[12] = (d1 * 2217 - c1 * 5352 + 51000) >> 16;
-
-    ip++;
-    op++;
-  }
-}
 
-void vp9_short_fdct8x4_c(short *input, short *output, int pitch)
-{
-    vp9_short_fdct4x4_c(input, output, pitch);
+void vp9_short_fdct8x4_c(int16_t *input, int16_t *output, int pitch) {
+  vp9_short_fdct4x4_c(input, output, pitch);
   vp9_short_fdct4x4_c(input
+ 4, output + 16, pitch); } -void vp9_short_walsh4x4_c(short *input, short *output, int pitch) { - int i; - int a1, b1, c1, d1; - short *ip = input; - short *op = output; - int pitch_short = pitch >> 1; +static void fdct8_1d(int16_t *input, int16_t *output) { + /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; + /*needs32*/ int t0, t1, t2, t3; + /*canbe16*/ int x0, x1, x2, x3; + + // stage 1 + s0 = input[0] + input[7]; + s1 = input[1] + input[6]; + s2 = input[2] + input[5]; + s3 = input[3] + input[4]; + s4 = input[3] - input[4]; + s5 = input[2] - input[5]; + s6 = input[1] - input[6]; + s7 = input[0] - input[7]; + + // fdct4_1d(step, step); + x0 = s0 + s3; + x1 = s1 + s2; + x2 = s1 - s2; + x3 = s0 - s3; + t0 = (x0 + x1) * cospi_16_64; + t1 = (x0 - x1) * cospi_16_64; + t2 = x2 * cospi_24_64 + x3 * cospi_8_64; + t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; + output[0] = dct_const_round_shift(t0); + output[2] = dct_const_round_shift(t2); + output[4] = dct_const_round_shift(t1); + output[6] = dct_const_round_shift(t3); - for (i = 0; i < 4; i++) { - a1 = ip[0 * pitch_short] + ip[3 * pitch_short]; - b1 = ip[1 * pitch_short] + ip[2 * pitch_short]; - c1 = ip[1 * pitch_short] - ip[2 * pitch_short]; - d1 = ip[0 * pitch_short] - ip[3 * pitch_short]; + // Stage 2 + t0 = (s6 - s5) * cospi_16_64; + t1 = (s6 + s5) * cospi_16_64; + t2 = dct_const_round_shift(t0); + t3 = dct_const_round_shift(t1); - op[0] = (a1 + b1 + 1) >> 1; - op[4] = (c1 + d1) >> 1; - op[8] = (a1 - b1) >> 1; - op[12] = (d1 - c1) >> 1; + // Stage 3 + x0 = s4 + t2; + x1 = s4 - t2; + x2 = s7 - t3; + x3 = s7 + t3; - ip++; - op++; - } - ip = output; - op = output; + // Stage 4 + t0 = x0 * cospi_28_64 + x3 * cospi_4_64; + t1 = x1 * cospi_12_64 + x2 * cospi_20_64; + t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; + t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; + output[1] = dct_const_round_shift(t0); + output[3] = dct_const_round_shift(t2); + output[5] = dct_const_round_shift(t1); + output[7] = dct_const_round_shift(t3); +} - for (i = 0; i < 4; i++) { - a1 = ip[0] + ip[3]; - b1 = ip[1] + ip[2]; - c1 = ip[1] - ip[2]; - d1 = ip[0] - ip[3]; +void vp9_short_fdct8x8_c(int16_t *input, int16_t *final_output, int pitch) { + const int stride = pitch >> 1; + int i, j; + int16_t intermediate[64]; - op[0] = (a1 + b1 + 1) >> 1; - op[1] = (c1 + d1) >> 1; - op[2] = (a1 - b1) >> 1; - op[3] = (d1 - c1) >> 1; + // Transform columns + { + int16_t *output = intermediate; + /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; + /*needs32*/ int t0, t1, t2, t3; + /*canbe16*/ int x0, x1, x2, x3; + + int i; + for (i = 0; i < 8; i++) { + // stage 1 + s0 = (input[0 * stride] + input[7 * stride]) << 2; + s1 = (input[1 * stride] + input[6 * stride]) << 2; + s2 = (input[2 * stride] + input[5 * stride]) << 2; + s3 = (input[3 * stride] + input[4 * stride]) << 2; + s4 = (input[3 * stride] - input[4 * stride]) << 2; + s5 = (input[2 * stride] - input[5 * stride]) << 2; + s6 = (input[1 * stride] - input[6 * stride]) << 2; + s7 = (input[0 * stride] - input[7 * stride]) << 2; + + // fdct4_1d(step, step); + x0 = s0 + s3; + x1 = s1 + s2; + x2 = s1 - s2; + x3 = s0 - s3; + t0 = (x0 + x1) * cospi_16_64; + t1 = (x0 - x1) * cospi_16_64; + t2 = x2 * cospi_24_64 + x3 * cospi_8_64; + t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; + output[0 * 8] = dct_const_round_shift(t0); + output[2 * 8] = dct_const_round_shift(t2); + output[4 * 8] = dct_const_round_shift(t1); + output[6 * 8] = dct_const_round_shift(t3); + + // Stage 2 + t0 = (s6 - s5) * cospi_16_64; + t1 = (s6 + s5) * cospi_16_64; + t2 = 
dct_const_round_shift(t0); + t3 = dct_const_round_shift(t1); + + // Stage 3 + x0 = s4 + t2; + x1 = s4 - t2; + x2 = s7 - t3; + x3 = s7 + t3; + + // Stage 4 + t0 = x0 * cospi_28_64 + x3 * cospi_4_64; + t1 = x1 * cospi_12_64 + x2 * cospi_20_64; + t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; + t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; + output[1 * 8] = dct_const_round_shift(t0); + output[3 * 8] = dct_const_round_shift(t2); + output[5 * 8] = dct_const_round_shift(t1); + output[7 * 8] = dct_const_round_shift(t3); + input++; + output++; + } + } - ip += 4; - op += 4; + // Rows + for (i = 0; i < 8; ++i) { + fdct8_1d(&intermediate[i * 8], &final_output[i * 8]); + for (j = 0; j < 8; ++j) + final_output[j + i * 8] /= 2; } } -#if CONFIG_LOSSLESS -void vp9_short_walsh4x4_lossless_c(short *input, short *output, int pitch) { - int i; - int a1, b1, c1, d1; - short *ip = input; - short *op = output; - int pitch_short = pitch >> 1; - - for (i = 0; i < 4; i++) { - a1 = (ip[0 * pitch_short] + ip[3 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR; - b1 = (ip[1 * pitch_short] + ip[2 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR; - c1 = (ip[1 * pitch_short] - ip[2 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR; - d1 = (ip[0 * pitch_short] - ip[3 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR; +static void fadst8_1d(int16_t *input, int16_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7; + + int x0 = input[7]; + int x1 = input[0]; + int x2 = input[5]; + int x3 = input[2]; + int x4 = input[3]; + int x5 = input[4]; + int x6 = input[1]; + int x7 = input[6]; + + // stage 1 + s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + s2 = cospi_10_64 * x2 + cospi_22_64 * x3; + s3 = cospi_22_64 * x2 - cospi_10_64 * x3; + s4 = cospi_18_64 * x4 + cospi_14_64 * x5; + s5 = cospi_14_64 * x4 - cospi_18_64 * x5; + s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + + x0 = dct_const_round_shift(s0 + s4); + x1 = dct_const_round_shift(s1 + s5); + x2 = dct_const_round_shift(s2 + s6); + x3 = dct_const_round_shift(s3 + s7); + x4 = dct_const_round_shift(s0 - s4); + x5 = dct_const_round_shift(s1 - s5); + x6 = dct_const_round_shift(s2 - s6); + x7 = dct_const_round_shift(s3 - s7); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + s6 = - cospi_24_64 * x6 + cospi_8_64 * x7; + s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + + x0 = s0 + s2; + x1 = s1 + s3; + x2 = s0 - s2; + x3 = s1 - s3; + x4 = dct_const_round_shift(s4 + s6); + x5 = dct_const_round_shift(s5 + s7); + x6 = dct_const_round_shift(s4 - s6); + x7 = dct_const_round_shift(s5 - s7); + + // stage 3 + s2 = cospi_16_64 * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (x6 - x7); + + x2 = dct_const_round_shift(s2); + x3 = dct_const_round_shift(s3); + x6 = dct_const_round_shift(s6); + x7 = dct_const_round_shift(s7); + + output[0] = x0; + output[1] = - x4; + output[2] = x6; + output[3] = - x2; + output[4] = x3; + output[5] = - x7; + output[6] = x5; + output[7] = - x1; +} - op[0] = (a1 + b1 + 1) >> 1; - op[4] = (c1 + d1) >> 1; - op[8] = (a1 - b1) >> 1; - op[12] = (d1 - c1) >> 1; +static const transform_2d FHT_8[] = { + { fdct8_1d, fdct8_1d }, // DCT_DCT = 0 + { fadst8_1d, fdct8_1d }, // ADST_DCT = 1 + { fdct8_1d, fadst8_1d }, // DCT_ADST = 2 + { fadst8_1d, fadst8_1d } // ADST_ADST = 3 +}; - ip++; - op++; +void vp9_short_fht8x8_c(int16_t *input, int16_t *output, + int pitch, TX_TYPE tx_type) { + int16_t out[64]; + 
int16_t *outptr = &out[0]; + int i, j; + int16_t temp_in[8], temp_out[8]; + const transform_2d ht = FHT_8[tx_type]; + + // Columns + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = input[j * pitch + i] << 2; + ht.cols(temp_in, temp_out); + for (j = 0; j < 8; ++j) + outptr[j * 8 + i] = temp_out[j]; } - ip = output; - op = output; - for (i = 0; i < 4; i++) { - a1 = ip[0] + ip[3]; - b1 = ip[1] + ip[2]; - c1 = ip[1] - ip[2]; - d1 = ip[0] - ip[3]; - - op[0] = ((a1 + b1 + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR; - op[1] = ((c1 + d1) >> 1) << Y2_WHT_UPSCALE_FACTOR; - op[2] = ((a1 - b1) >> 1) << Y2_WHT_UPSCALE_FACTOR; - op[3] = ((d1 - c1) >> 1) << Y2_WHT_UPSCALE_FACTOR; - - ip += 4; - op += 4; + // Rows + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j + i * 8]; + ht.rows(temp_in, temp_out); + for (j = 0; j < 8; ++j) + output[j + i * 8] = temp_out[j] >> 1; } } @@ -898,1491 +418,642 @@ void vp9_short_walsh8x4_x8_c(short *input, short *output, int pitch) { vp9_short_walsh4x4_x8_c(input, output, pitch); vp9_short_walsh4x4_x8_c(input + 4, output + 16, pitch); } -#endif - -#define TEST_INT_16x16_DCT 1 -#if !TEST_INT_16x16_DCT - -static void dct16x16_1d(double input[16], double output[16]) { - static const double C1 = 0.995184726672197; - static const double C2 = 0.98078528040323; - static const double C3 = 0.956940335732209; - static const double C4 = 0.923879532511287; - static const double C5 = 0.881921264348355; - static const double C6 = 0.831469612302545; - static const double C7 = 0.773010453362737; - static const double C8 = 0.707106781186548; - static const double C9 = 0.634393284163646; - static const double C10 = 0.555570233019602; - static const double C11 = 0.471396736825998; - static const double C12 = 0.38268343236509; - static const double C13 = 0.290284677254462; - static const double C14 = 0.195090322016128; - static const double C15 = 0.098017140329561; - - vp9_clear_system_state(); // Make it simd safe : __asm emms; - { - double step[16]; - double intermediate[16]; - double temp1, temp2; - - // step 1 - step[ 0] = input[0] + input[15]; - step[ 1] = input[1] + input[14]; - step[ 2] = input[2] + input[13]; - step[ 3] = input[3] + input[12]; - step[ 4] = input[4] + input[11]; - step[ 5] = input[5] + input[10]; - step[ 6] = input[6] + input[ 9]; - step[ 7] = input[7] + input[ 8]; - step[ 8] = input[7] - input[ 8]; - step[ 9] = input[6] - input[ 9]; - step[10] = input[5] - input[10]; - step[11] = input[4] - input[11]; - step[12] = input[3] - input[12]; - step[13] = input[2] - input[13]; - step[14] = input[1] - input[14]; - step[15] = input[0] - input[15]; - - // step 2 - output[0] = step[0] + step[7]; - output[1] = step[1] + step[6]; - output[2] = step[2] + step[5]; - output[3] = step[3] + step[4]; - output[4] = step[3] - step[4]; - output[5] = step[2] - step[5]; - output[6] = step[1] - step[6]; - output[7] = step[0] - step[7]; - - temp1 = step[ 8]*C7; - temp2 = step[15]*C9; - output[ 8] = temp1 + temp2; - - temp1 = step[ 9]*C11; - temp2 = step[14]*C5; - output[ 9] = temp1 - temp2; - - temp1 = step[10]*C3; - temp2 = step[13]*C13; - output[10] = temp1 + temp2; - - temp1 = step[11]*C15; - temp2 = step[12]*C1; - output[11] = temp1 - temp2; - - temp1 = step[11]*C1; - temp2 = step[12]*C15; - output[12] = temp2 + temp1; - - temp1 = step[10]*C13; - temp2 = step[13]*C3; - output[13] = temp2 - temp1; - - temp1 = step[ 9]*C5; - temp2 = step[14]*C11; - output[14] = temp2 + temp1; - - temp1 = step[ 8]*C9; - temp2 = step[15]*C7; - output[15] = temp2 - temp1; - - 
// step 3 - step[ 0] = output[0] + output[3]; - step[ 1] = output[1] + output[2]; - step[ 2] = output[1] - output[2]; - step[ 3] = output[0] - output[3]; - - temp1 = output[4]*C14; - temp2 = output[7]*C2; - step[ 4] = temp1 + temp2; - - temp1 = output[5]*C10; - temp2 = output[6]*C6; - step[ 5] = temp1 + temp2; - - temp1 = output[5]*C6; - temp2 = output[6]*C10; - step[ 6] = temp2 - temp1; - - temp1 = output[4]*C2; - temp2 = output[7]*C14; - step[ 7] = temp2 - temp1; - - step[ 8] = output[ 8] + output[11]; - step[ 9] = output[ 9] + output[10]; - step[10] = output[ 9] - output[10]; - step[11] = output[ 8] - output[11]; - - step[12] = output[12] + output[15]; - step[13] = output[13] + output[14]; - step[14] = output[13] - output[14]; - step[15] = output[12] - output[15]; - - // step 4 - output[ 0] = (step[ 0] + step[ 1]); - output[ 8] = (step[ 0] - step[ 1]); - - temp1 = step[2]*C12; - temp2 = step[3]*C4; - temp1 = temp1 + temp2; - output[ 4] = 2*(temp1*C8); - - temp1 = step[2]*C4; - temp2 = step[3]*C12; - temp1 = temp2 - temp1; - output[12] = 2*(temp1*C8); - - output[ 2] = 2*((step[4] + step[ 5])*C8); - output[14] = 2*((step[7] - step[ 6])*C8); - - temp1 = step[4] - step[5]; - temp2 = step[6] + step[7]; - output[ 6] = (temp1 + temp2); - output[10] = (temp1 - temp2); - - intermediate[8] = step[8] + step[14]; - intermediate[9] = step[9] + step[15]; - - temp1 = intermediate[8]*C12; - temp2 = intermediate[9]*C4; - temp1 = temp1 - temp2; - output[3] = 2*(temp1*C8); - - temp1 = intermediate[8]*C4; - temp2 = intermediate[9]*C12; - temp1 = temp2 + temp1; - output[13] = 2*(temp1*C8); - - output[ 9] = 2*((step[10] + step[11])*C8); - - intermediate[11] = step[10] - step[11]; - intermediate[12] = step[12] + step[13]; - intermediate[13] = step[12] - step[13]; - intermediate[14] = step[ 8] - step[14]; - intermediate[15] = step[ 9] - step[15]; - - output[15] = (intermediate[11] + intermediate[12]); - output[ 1] = -(intermediate[11] - intermediate[12]); - - output[ 7] = 2*(intermediate[13]*C8); - - temp1 = intermediate[14]*C12; - temp2 = intermediate[15]*C4; - temp1 = temp1 - temp2; - output[11] = -2*(temp1*C8); - - temp1 = intermediate[14]*C4; - temp2 = intermediate[15]*C12; - temp1 = temp2 + temp1; - output[ 5] = 2*(temp1*C8); - } - vp9_clear_system_state(); // Make it simd safe : __asm emms; -} -void vp9_short_fdct16x16_c(short *input, short *out, int pitch) { - vp9_clear_system_state(); // Make it simd safe : __asm emms; - { - int shortpitch = pitch >> 1; - int i, j; - double output[256]; - // First transform columns - for (i = 0; i < 16; i++) { - double temp_in[16], temp_out[16]; - for (j = 0; j < 16; j++) - temp_in[j] = input[j*shortpitch + i]; - dct16x16_1d(temp_in, temp_out); - for (j = 0; j < 16; j++) - output[j*16 + i] = temp_out[j]; - } - // Then transform rows - for (i = 0; i < 16; ++i) { - double temp_in[16], temp_out[16]; - for (j = 0; j < 16; ++j) - temp_in[j] = output[j + i*16]; - dct16x16_1d(temp_in, temp_out); - for (j = 0; j < 16; ++j) - output[j + i*16] = temp_out[j]; - } - // Scale by some magic number - for (i = 0; i < 256; i++) - out[i] = (short)round(output[i]/2); - } - vp9_clear_system_state(); // Make it simd safe : __asm emms; -} -#else -static const int16_t C1 = 16305; -static const int16_t C2 = 16069; -static const int16_t C3 = 15679; -static const int16_t C4 = 15137; -static const int16_t C5 = 14449; -static const int16_t C6 = 13623; -static const int16_t C7 = 12665; -static const int16_t C8 = 11585; -static const int16_t C9 = 10394; -static const int16_t C10 = 9102; -static 
const int16_t C11 = 7723; -static const int16_t C12 = 6270; -static const int16_t C13 = 4756; -static const int16_t C14 = 3196; -static const int16_t C15 = 1606; - -#define RIGHT_SHIFT 14 -#define ROUNDING (1 << (RIGHT_SHIFT - 1)) - -static void dct16x16_1d(int16_t input[16], int16_t output[16], - int last_shift_bits) { - int16_t step[16]; - int intermediate[16]; - int temp1, temp2; - int final_shift = RIGHT_SHIFT; - int final_rounding = ROUNDING; - int output_shift = 0; - int output_rounding = 0; - - final_shift += last_shift_bits; - if (final_shift > 0) - final_rounding = 1 << (final_shift - 1); - - output_shift += last_shift_bits; - if (output_shift > 0) - output_rounding = 1 << (output_shift - 1); - - // step 1 - step[ 0] = input[0] + input[15]; - step[ 1] = input[1] + input[14]; - step[ 2] = input[2] + input[13]; - step[ 3] = input[3] + input[12]; - step[ 4] = input[4] + input[11]; - step[ 5] = input[5] + input[10]; - step[ 6] = input[6] + input[ 9]; - step[ 7] = input[7] + input[ 8]; - step[ 8] = input[7] - input[ 8]; - step[ 9] = input[6] - input[ 9]; - step[10] = input[5] - input[10]; - step[11] = input[4] - input[11]; - step[12] = input[3] - input[12]; - step[13] = input[2] - input[13]; - step[14] = input[1] - input[14]; - step[15] = input[0] - input[15]; - - // step 2 - output[0] = step[0] + step[7]; - output[1] = step[1] + step[6]; - output[2] = step[2] + step[5]; - output[3] = step[3] + step[4]; - output[4] = step[3] - step[4]; - output[5] = step[2] - step[5]; - output[6] = step[1] - step[6]; - output[7] = step[0] - step[7]; - - temp1 = step[ 8] * C7; - temp2 = step[15] * C9; - output[ 8] = (temp1 + temp2 + ROUNDING) >> RIGHT_SHIFT; - - temp1 = step[ 9] * C11; - temp2 = step[14] * C5; - output[ 9] = (temp1 - temp2 + ROUNDING) >> RIGHT_SHIFT; - - temp1 = step[10] * C3; - temp2 = step[13] * C13; - output[10] = (temp1 + temp2 + ROUNDING) >> RIGHT_SHIFT; - - temp1 = step[11] * C15; - temp2 = step[12] * C1; - output[11] = (temp1 - temp2 + ROUNDING) >> RIGHT_SHIFT; - - temp1 = step[11] * C1; - temp2 = step[12] * C15; - output[12] = (temp2 + temp1 + ROUNDING) >> RIGHT_SHIFT; - - temp1 = step[10] * C13; - temp2 = step[13] * C3; - output[13] = (temp2 - temp1 + ROUNDING) >> RIGHT_SHIFT; - - temp1 = step[ 9] * C5; - temp2 = step[14] * C11; - output[14] = (temp2 + temp1 + ROUNDING) >> RIGHT_SHIFT; - - temp1 = step[ 8] * C9; - temp2 = step[15] * C7; - output[15] = (temp2 - temp1 + ROUNDING) >> RIGHT_SHIFT; - - // step 3 - step[ 0] = output[0] + output[3]; - step[ 1] = output[1] + output[2]; - step[ 2] = output[1] - output[2]; - step[ 3] = output[0] - output[3]; - - temp1 = output[4] * C14; - temp2 = output[7] * C2; - step[ 4] = (temp1 + temp2 + ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[5] * C10; - temp2 = output[6] * C6; - step[ 5] = (temp1 + temp2 + ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[5] * C6; - temp2 = output[6] * C10; - step[ 6] = (temp2 - temp1 + ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[4] * C2; - temp2 = output[7] * C14; - step[ 7] = (temp2 - temp1 + ROUNDING) >> RIGHT_SHIFT; - - step[ 8] = output[ 8] + output[11]; - step[ 9] = output[ 9] + output[10]; - step[10] = output[ 9] - output[10]; - step[11] = output[ 8] - output[11]; - - step[12] = output[12] + output[15]; - step[13] = output[13] + output[14]; - step[14] = output[13] - output[14]; - step[15] = output[12] - output[15]; - - // step 4 - output[ 0] = (step[ 0] + step[ 1] + output_rounding) >> output_shift; - output[ 8] = (step[ 0] - step[ 1] + output_rounding) >> output_shift; - - temp1 = step[2] * C12; - temp2 = 
step[3] * C4; - temp1 = (temp1 + temp2 + final_rounding) >> final_shift; - output[ 4] = (2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT; - - temp1 = step[2] * C4; - temp2 = step[3] * C12; - temp1 = (temp2 - temp1 + final_rounding) >> final_shift; - output[12] = (2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT; - - output[ 2] = (2 * ((step[4] + step[ 5]) * C8) + final_rounding) - >> final_shift; - output[14] = (2 * ((step[7] - step[ 6]) * C8) + final_rounding) - >> final_shift; - - temp1 = step[4] - step[5]; - temp2 = step[6] + step[7]; - output[ 6] = (temp1 + temp2 + output_rounding) >> output_shift; - output[10] = (temp1 - temp2 + output_rounding) >> output_shift; - - intermediate[8] = step[8] + step[14]; - intermediate[9] = step[9] + step[15]; - - temp1 = intermediate[8] * C12; - temp2 = intermediate[9] * C4; - temp1 = (temp1 - temp2 + final_rounding) >> final_shift; - output[3] = (2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT; - - temp1 = intermediate[8] * C4; - temp2 = intermediate[9] * C12; - temp1 = (temp2 + temp1 + final_rounding) >> final_shift; - output[13] = (2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT; - - output[ 9] = (2 * ((step[10] + step[11]) * C8) + final_rounding) - >> final_shift; - - intermediate[11] = step[10] - step[11]; - intermediate[12] = step[12] + step[13]; - intermediate[13] = step[12] - step[13]; - intermediate[14] = step[ 8] - step[14]; - intermediate[15] = step[ 9] - step[15]; - - output[15] = (intermediate[11] + intermediate[12] + output_rounding) - >> output_shift; - output[ 1] = -(intermediate[11] - intermediate[12] + output_rounding) - >> output_shift; - - output[ 7] = (2 * (intermediate[13] * C8) + final_rounding) >> final_shift; - - temp1 = intermediate[14] * C12; - temp2 = intermediate[15] * C4; - temp1 = (temp1 - temp2 + final_rounding) >> final_shift; - output[11] = (-2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT; - - temp1 = intermediate[14] * C4; - temp2 = intermediate[15] * C12; - temp1 = (temp2 + temp1 + final_rounding) >> final_shift; - output[ 5] = (2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT; +// Rewrote to use same algorithm as others. 
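+// ("Same algorithm as others" refers to the fdct4_1d/fdct8_1d butterflies
+// above.) As a hedged note for reviewers: the helpers this file now pulls
+// in from vp9/common/vp9_idct.h are assumed to behave as sketched below;
+// this restates those definitions for reference and is not part of the
+// change itself:
+//
+//   cospi_N_64 ~= (int16_t)round(cos(N * PI / 64) * (1 << 14))
+//   dct_const_round_shift(x) == ROUND_POWER_OF_TWO(x, DCT_CONST_BITS)
+//                            == (x + (1 << 13)) >> 14  (DCT_CONST_BITS == 14)
+//
+// Each rotation below therefore multiplies by a 14-bit fixed-point cosine
+// and immediately rounds the fractional bits away again; for example,
+// dct_const_round_shift((a + b) * cospi_16_64) computes
+// round((a + b) * cos(pi/4)), i.e. (a + b) / sqrt(2) to the nearest integer.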
+static void fdct16_1d(int16_t input[16], int16_t output[16]) { + int16_t step[16]; + int temp1, temp2; + + // step 1 + step[ 0] = input[0] + input[15]; + step[ 1] = input[1] + input[14]; + step[ 2] = input[2] + input[13]; + step[ 3] = input[3] + input[12]; + step[ 4] = input[4] + input[11]; + step[ 5] = input[5] + input[10]; + step[ 6] = input[6] + input[ 9]; + step[ 7] = input[7] + input[ 8]; + step[ 8] = input[7] - input[ 8]; + step[ 9] = input[6] - input[ 9]; + step[10] = input[5] - input[10]; + step[11] = input[4] - input[11]; + step[12] = input[3] - input[12]; + step[13] = input[2] - input[13]; + step[14] = input[1] - input[14]; + step[15] = input[0] - input[15]; + + fdct8_1d(step, step); + + // step 2 + output[8] = step[8]; + output[9] = step[9]; + temp1 = (-step[10] + step[13]) * cospi_16_64; + temp2 = (-step[11] + step[12]) * cospi_16_64; + output[10] = dct_const_round_shift(temp1); + output[11] = dct_const_round_shift(temp2); + temp1 = (step[11] + step[12]) * cospi_16_64; + temp2 = (step[10] + step[13]) * cospi_16_64; + output[12] = dct_const_round_shift(temp1); + output[13] = dct_const_round_shift(temp2); + output[14] = step[14]; + output[15] = step[15]; + + // step 3 + step[ 8] = output[8] + output[11]; + step[ 9] = output[9] + output[10]; + step[ 10] = output[9] - output[10]; + step[ 11] = output[8] - output[11]; + step[ 12] = -output[12] + output[15]; + step[ 13] = -output[13] + output[14]; + step[ 14] = output[13] + output[14]; + step[ 15] = output[12] + output[15]; + + // step 4 + output[8] = step[8]; + temp1 = -step[9] * cospi_8_64 + step[14] * cospi_24_64; + temp2 = -step[10] * cospi_24_64 - step[13] * cospi_8_64; + output[9] = dct_const_round_shift(temp1); + output[10] = dct_const_round_shift(temp2); + output[11] = step[11]; + output[12] = step[12]; + temp1 = -step[10] * cospi_8_64 + step[13] * cospi_24_64; + temp2 = step[9] * cospi_24_64 + step[14] * cospi_8_64; + output[13] = dct_const_round_shift(temp1); + output[14] = dct_const_round_shift(temp2); + output[15] = step[15]; + + // step 5 + step[8] = output[8] + output[9]; + step[9] = output[8] - output[9]; + step[10] = -output[10] + output[11]; + step[11] = output[10] + output[11]; + step[12] = output[12] + output[13]; + step[13] = output[12] - output[13]; + step[14] = -output[14] + output[15]; + step[15] = output[14] + output[15]; + + // step 6 + output[0] = step[0]; + output[8] = step[4]; + output[4] = step[2]; + output[12] = step[6]; + output[2] = step[1]; + output[10] = step[5]; + output[6] = step[3]; + output[14] = step[7]; + + temp1 = step[8] * cospi_30_64 + step[15] * cospi_2_64; + temp2 = step[9] * cospi_14_64 + step[14] * cospi_18_64; + output[1] = dct_const_round_shift(temp1); + output[9] = dct_const_round_shift(temp2); + + temp1 = step[10] * cospi_22_64 + step[13] * cospi_10_64; + temp2 = step[11] * cospi_6_64 + step[12] * cospi_26_64; + output[5] = dct_const_round_shift(temp1); + output[13] = dct_const_round_shift(temp2); + + temp1 = -step[11] * cospi_26_64 + step[12] * cospi_6_64; + temp2 = -step[10] * cospi_10_64 + step[13] * cospi_22_64; + output[3] = dct_const_round_shift(temp1); + output[11] = dct_const_round_shift(temp2); + + temp1 = -step[9] * cospi_18_64 + step[14] * cospi_14_64; + temp2 = -step[8] * cospi_2_64 + step[15] * cospi_30_64; + output[7] = dct_const_round_shift(temp1); + output[15] = dct_const_round_shift(temp2); } void vp9_short_fdct16x16_c(int16_t *input, int16_t *out, int pitch) { - int shortpitch = pitch >> 1; - int i, j; - int16_t output[256]; - int16_t *outptr = &output[0]; - - // 
First transform columns - for (i = 0; i < 16; i++) { - int16_t temp_in[16]; - int16_t temp_out[16]; - for (j = 0; j < 16; j++) - temp_in[j] = input[j * shortpitch + i]; - dct16x16_1d(temp_in, temp_out, 0); - for (j = 0; j < 16; j++) - output[j * 16 + i] = temp_out[j]; - } - - // Then transform rows - for (i = 0; i < 16; ++i) { - dct16x16_1d(outptr, out, 1); - outptr += 16; - out += 16; - } -} -#undef RIGHT_SHIFT -#undef ROUNDING -#endif - -#if !CONFIG_DWTDCTHYBRID -static void dct32_1d(double *input, double *output, int stride) { - static const double C1 = 0.998795456205; // cos(pi * 1 / 64) - static const double C2 = 0.995184726672; // cos(pi * 2 / 64) - static const double C3 = 0.989176509965; // cos(pi * 3 / 64) - static const double C4 = 0.980785280403; // cos(pi * 4 / 64) - static const double C5 = 0.970031253195; // cos(pi * 5 / 64) - static const double C6 = 0.956940335732; // cos(pi * 6 / 64) - static const double C7 = 0.941544065183; // cos(pi * 7 / 64) - static const double C8 = 0.923879532511; // cos(pi * 8 / 64) - static const double C9 = 0.903989293123; // cos(pi * 9 / 64) - static const double C10 = 0.881921264348; // cos(pi * 10 / 64) - static const double C11 = 0.857728610000; // cos(pi * 11 / 64) - static const double C12 = 0.831469612303; // cos(pi * 12 / 64) - static const double C13 = 0.803207531481; // cos(pi * 13 / 64) - static const double C14 = 0.773010453363; // cos(pi * 14 / 64) - static const double C15 = 0.740951125355; // cos(pi * 15 / 64) - static const double C16 = 0.707106781187; // cos(pi * 16 / 64) - static const double C17 = 0.671558954847; // cos(pi * 17 / 64) - static const double C18 = 0.634393284164; // cos(pi * 18 / 64) - static const double C19 = 0.595699304492; // cos(pi * 19 / 64) - static const double C20 = 0.555570233020; // cos(pi * 20 / 64) - static const double C21 = 0.514102744193; // cos(pi * 21 / 64) - static const double C22 = 0.471396736826; // cos(pi * 22 / 64) - static const double C23 = 0.427555093430; // cos(pi * 23 / 64) - static const double C24 = 0.382683432365; // cos(pi * 24 / 64) - static const double C25 = 0.336889853392; // cos(pi * 25 / 64) - static const double C26 = 0.290284677254; // cos(pi * 26 / 64) - static const double C27 = 0.242980179903; // cos(pi * 27 / 64) - static const double C28 = 0.195090322016; // cos(pi * 28 / 64) - static const double C29 = 0.146730474455; // cos(pi * 29 / 64) - static const double C30 = 0.098017140330; // cos(pi * 30 / 64) - static const double C31 = 0.049067674327; // cos(pi * 31 / 64) - - double step[32]; - - // Stage 1 - step[0] = input[stride*0] + input[stride*(32 - 1)]; - step[1] = input[stride*1] + input[stride*(32 - 2)]; - step[2] = input[stride*2] + input[stride*(32 - 3)]; - step[3] = input[stride*3] + input[stride*(32 - 4)]; - step[4] = input[stride*4] + input[stride*(32 - 5)]; - step[5] = input[stride*5] + input[stride*(32 - 6)]; - step[6] = input[stride*6] + input[stride*(32 - 7)]; - step[7] = input[stride*7] + input[stride*(32 - 8)]; - step[8] = input[stride*8] + input[stride*(32 - 9)]; - step[9] = input[stride*9] + input[stride*(32 - 10)]; - step[10] = input[stride*10] + input[stride*(32 - 11)]; - step[11] = input[stride*11] + input[stride*(32 - 12)]; - step[12] = input[stride*12] + input[stride*(32 - 13)]; - step[13] = input[stride*13] + input[stride*(32 - 14)]; - step[14] = input[stride*14] + input[stride*(32 - 15)]; - step[15] = input[stride*15] + input[stride*(32 - 16)]; - step[16] = -input[stride*16] + input[stride*(32 - 17)]; - step[17] = -input[stride*17] + 
input[stride*(32 - 18)]; - step[18] = -input[stride*18] + input[stride*(32 - 19)]; - step[19] = -input[stride*19] + input[stride*(32 - 20)]; - step[20] = -input[stride*20] + input[stride*(32 - 21)]; - step[21] = -input[stride*21] + input[stride*(32 - 22)]; - step[22] = -input[stride*22] + input[stride*(32 - 23)]; - step[23] = -input[stride*23] + input[stride*(32 - 24)]; - step[24] = -input[stride*24] + input[stride*(32 - 25)]; - step[25] = -input[stride*25] + input[stride*(32 - 26)]; - step[26] = -input[stride*26] + input[stride*(32 - 27)]; - step[27] = -input[stride*27] + input[stride*(32 - 28)]; - step[28] = -input[stride*28] + input[stride*(32 - 29)]; - step[29] = -input[stride*29] + input[stride*(32 - 30)]; - step[30] = -input[stride*30] + input[stride*(32 - 31)]; - step[31] = -input[stride*31] + input[stride*(32 - 32)]; - - // Stage 2 - output[stride*0] = step[0] + step[16 - 1]; - output[stride*1] = step[1] + step[16 - 2]; - output[stride*2] = step[2] + step[16 - 3]; - output[stride*3] = step[3] + step[16 - 4]; - output[stride*4] = step[4] + step[16 - 5]; - output[stride*5] = step[5] + step[16 - 6]; - output[stride*6] = step[6] + step[16 - 7]; - output[stride*7] = step[7] + step[16 - 8]; - output[stride*8] = -step[8] + step[16 - 9]; - output[stride*9] = -step[9] + step[16 - 10]; - output[stride*10] = -step[10] + step[16 - 11]; - output[stride*11] = -step[11] + step[16 - 12]; - output[stride*12] = -step[12] + step[16 - 13]; - output[stride*13] = -step[13] + step[16 - 14]; - output[stride*14] = -step[14] + step[16 - 15]; - output[stride*15] = -step[15] + step[16 - 16]; - - output[stride*16] = step[16]; - output[stride*17] = step[17]; - output[stride*18] = step[18]; - output[stride*19] = step[19]; - - output[stride*20] = (-step[20] + step[27])*C16; - output[stride*21] = (-step[21] + step[26])*C16; - output[stride*22] = (-step[22] + step[25])*C16; - output[stride*23] = (-step[23] + step[24])*C16; - - output[stride*24] = (step[24] + step[23])*C16; - output[stride*25] = (step[25] + step[22])*C16; - output[stride*26] = (step[26] + step[21])*C16; - output[stride*27] = (step[27] + step[20])*C16; - - output[stride*28] = step[28]; - output[stride*29] = step[29]; - output[stride*30] = step[30]; - output[stride*31] = step[31]; - - // Stage 3 - step[0] = output[stride*0] + output[stride*(8 - 1)]; - step[1] = output[stride*1] + output[stride*(8 - 2)]; - step[2] = output[stride*2] + output[stride*(8 - 3)]; - step[3] = output[stride*3] + output[stride*(8 - 4)]; - step[4] = -output[stride*4] + output[stride*(8 - 5)]; - step[5] = -output[stride*5] + output[stride*(8 - 6)]; - step[6] = -output[stride*6] + output[stride*(8 - 7)]; - step[7] = -output[stride*7] + output[stride*(8 - 8)]; - step[8] = output[stride*8]; - step[9] = output[stride*9]; - step[10] = (-output[stride*10] + output[stride*13])*C16; - step[11] = (-output[stride*11] + output[stride*12])*C16; - step[12] = (output[stride*12] + output[stride*11])*C16; - step[13] = (output[stride*13] + output[stride*10])*C16; - step[14] = output[stride*14]; - step[15] = output[stride*15]; - - step[16] = output[stride*16] + output[stride*23]; - step[17] = output[stride*17] + output[stride*22]; - step[18] = output[stride*18] + output[stride*21]; - step[19] = output[stride*19] + output[stride*20]; - step[20] = -output[stride*20] + output[stride*19]; - step[21] = -output[stride*21] + output[stride*18]; - step[22] = -output[stride*22] + output[stride*17]; - step[23] = -output[stride*23] + output[stride*16]; - step[24] = -output[stride*24] + output[stride*31]; - 
step[25] = -output[stride*25] + output[stride*30]; - step[26] = -output[stride*26] + output[stride*29]; - step[27] = -output[stride*27] + output[stride*28]; - step[28] = output[stride*28] + output[stride*27]; - step[29] = output[stride*29] + output[stride*26]; - step[30] = output[stride*30] + output[stride*25]; - step[31] = output[stride*31] + output[stride*24]; - - // Stage 4 - output[stride*0] = step[0] + step[3]; - output[stride*1] = step[1] + step[2]; - output[stride*2] = -step[2] + step[1]; - output[stride*3] = -step[3] + step[0]; - output[stride*4] = step[4]; - output[stride*5] = (-step[5] + step[6])*C16; - output[stride*6] = (step[6] + step[5])*C16; - output[stride*7] = step[7]; - output[stride*8] = step[8] + step[11]; - output[stride*9] = step[9] + step[10]; - output[stride*10] = -step[10] + step[9]; - output[stride*11] = -step[11] + step[8]; - output[stride*12] = -step[12] + step[15]; - output[stride*13] = -step[13] + step[14]; - output[stride*14] = step[14] + step[13]; - output[stride*15] = step[15] + step[12]; - - output[stride*16] = step[16]; - output[stride*17] = step[17]; - output[stride*18] = step[18]*-C8 + step[29]*C24; - output[stride*19] = step[19]*-C8 + step[28]*C24; - output[stride*20] = step[20]*-C24 + step[27]*-C8; - output[stride*21] = step[21]*-C24 + step[26]*-C8; - output[stride*22] = step[22]; - output[stride*23] = step[23]; - output[stride*24] = step[24]; - output[stride*25] = step[25]; - output[stride*26] = step[26]*C24 + step[21]*-C8; - output[stride*27] = step[27]*C24 + step[20]*-C8; - output[stride*28] = step[28]*C8 + step[19]*C24; - output[stride*29] = step[29]*C8 + step[18]*C24; - output[stride*30] = step[30]; - output[stride*31] = step[31]; - - // Stage 5 - step[0] = (output[stride*0] + output[stride*1]) * C16; - step[1] = (-output[stride*1] + output[stride*0]) * C16; - step[2] = output[stride*2]*C24 + output[stride*3] * C8; - step[3] = output[stride*3]*C24 - output[stride*2] * C8; - step[4] = output[stride*4] + output[stride*5]; - step[5] = -output[stride*5] + output[stride*4]; - step[6] = -output[stride*6] + output[stride*7]; - step[7] = output[stride*7] + output[stride*6]; - step[8] = output[stride*8]; - step[9] = output[stride*9]*-C8 + output[stride*14]*C24; - step[10] = output[stride*10]*-C24 + output[stride*13]*-C8; - step[11] = output[stride*11]; - step[12] = output[stride*12]; - step[13] = output[stride*13]*C24 + output[stride*10]*-C8; - step[14] = output[stride*14]*C8 + output[stride*9]*C24; - step[15] = output[stride*15]; - - step[16] = output[stride*16] + output[stride*19]; - step[17] = output[stride*17] + output[stride*18]; - step[18] = -output[stride*18] + output[stride*17]; - step[19] = -output[stride*19] + output[stride*16]; - step[20] = -output[stride*20] + output[stride*23]; - step[21] = -output[stride*21] + output[stride*22]; - step[22] = output[stride*22] + output[stride*21]; - step[23] = output[stride*23] + output[stride*20]; - step[24] = output[stride*24] + output[stride*27]; - step[25] = output[stride*25] + output[stride*26]; - step[26] = -output[stride*26] + output[stride*25]; - step[27] = -output[stride*27] + output[stride*24]; - step[28] = -output[stride*28] + output[stride*31]; - step[29] = -output[stride*29] + output[stride*30]; - step[30] = output[stride*30] + output[stride*29]; - step[31] = output[stride*31] + output[stride*28]; - - // Stage 6 - output[stride*0] = step[0]; - output[stride*1] = step[1]; - output[stride*2] = step[2]; - output[stride*3] = step[3]; - output[stride*4] = step[4]*C28 + step[7]*C4; - output[stride*5] = 
step[5]*C12 + step[6]*C20; - output[stride*6] = step[6]*C12 + step[5]*-C20; - output[stride*7] = step[7]*C28 + step[4]*-C4; - output[stride*8] = step[8] + step[9]; - output[stride*9] = -step[9] + step[8]; - output[stride*10] = -step[10] + step[11]; - output[stride*11] = step[11] + step[10]; - output[stride*12] = step[12] + step[13]; - output[stride*13] = -step[13] + step[12]; - output[stride*14] = -step[14] + step[15]; - output[stride*15] = step[15] + step[14]; - - output[stride*16] = step[16]; - output[stride*17] = step[17]*-C4 + step[30]*C28; - output[stride*18] = step[18]*-C28 + step[29]*-C4; - output[stride*19] = step[19]; - output[stride*20] = step[20]; - output[stride*21] = step[21]*-C20 + step[26]*C12; - output[stride*22] = step[22]*-C12 + step[25]*-C20; - output[stride*23] = step[23]; - output[stride*24] = step[24]; - output[stride*25] = step[25]*C12 + step[22]*-C20; - output[stride*26] = step[26]*C20 + step[21]*C12; - output[stride*27] = step[27]; - output[stride*28] = step[28]; - output[stride*29] = step[29]*C28 + step[18]*-C4; - output[stride*30] = step[30]*C4 + step[17]*C28; - output[stride*31] = step[31]; - - // Stage 7 - step[0] = output[stride*0]; - step[1] = output[stride*1]; - step[2] = output[stride*2]; - step[3] = output[stride*3]; - step[4] = output[stride*4]; - step[5] = output[stride*5]; - step[6] = output[stride*6]; - step[7] = output[stride*7]; - step[8] = output[stride*8]*C30 + output[stride*15]*C2; - step[9] = output[stride*9]*C14 + output[stride*14]*C18; - step[10] = output[stride*10]*C22 + output[stride*13]*C10; - step[11] = output[stride*11]*C6 + output[stride*12]*C26; - step[12] = output[stride*12]*C6 + output[stride*11]*-C26; - step[13] = output[stride*13]*C22 + output[stride*10]*-C10; - step[14] = output[stride*14]*C14 + output[stride*9]*-C18; - step[15] = output[stride*15]*C30 + output[stride*8]*-C2; - - step[16] = output[stride*16] + output[stride*17]; - step[17] = -output[stride*17] + output[stride*16]; - step[18] = -output[stride*18] + output[stride*19]; - step[19] = output[stride*19] + output[stride*18]; - step[20] = output[stride*20] + output[stride*21]; - step[21] = -output[stride*21] + output[stride*20]; - step[22] = -output[stride*22] + output[stride*23]; - step[23] = output[stride*23] + output[stride*22]; - step[24] = output[stride*24] + output[stride*25]; - step[25] = -output[stride*25] + output[stride*24]; - step[26] = -output[stride*26] + output[stride*27]; - step[27] = output[stride*27] + output[stride*26]; - step[28] = output[stride*28] + output[stride*29]; - step[29] = -output[stride*29] + output[stride*28]; - step[30] = -output[stride*30] + output[stride*31]; - step[31] = output[stride*31] + output[stride*30]; - - // Final stage --- outputs indices are bit-reversed. 
- output[stride*0] = step[0]; - output[stride*16] = step[1]; - output[stride*8] = step[2]; - output[stride*24] = step[3]; - output[stride*4] = step[4]; - output[stride*20] = step[5]; - output[stride*12] = step[6]; - output[stride*28] = step[7]; - output[stride*2] = step[8]; - output[stride*18] = step[9]; - output[stride*10] = step[10]; - output[stride*26] = step[11]; - output[stride*6] = step[12]; - output[stride*22] = step[13]; - output[stride*14] = step[14]; - output[stride*30] = step[15]; - - output[stride*1] = step[16]*C31 + step[31]*C1; - output[stride*17] = step[17]*C15 + step[30]*C17; - output[stride*9] = step[18]*C23 + step[29]*C9; - output[stride*25] = step[19]*C7 + step[28]*C25; - output[stride*5] = step[20]*C27 + step[27]*C5; - output[stride*21] = step[21]*C11 + step[26]*C21; - output[stride*13] = step[22]*C19 + step[25]*C13; - output[stride*29] = step[23]*C3 + step[24]*C29; - output[stride*3] = step[24]*C3 + step[23]*-C29; - output[stride*19] = step[25]*C19 + step[22]*-C13; - output[stride*11] = step[26]*C11 + step[21]*-C21; - output[stride*27] = step[27]*C27 + step[20]*-C5; - output[stride*7] = step[28]*C7 + step[19]*-C25; - output[stride*23] = step[29]*C23 + step[18]*-C9; - output[stride*15] = step[30]*C15 + step[17]*-C17; - output[stride*31] = step[31]*C31 + step[16]*-C1; -} - -void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) { - vp9_clear_system_state(); // Make it simd safe : __asm emms; - { - int shortpitch = pitch >> 1; - int i, j; - double output[1024]; - // First transform columns - for (i = 0; i < 32; i++) { - double temp_in[32], temp_out[32]; - for (j = 0; j < 32; j++) - temp_in[j] = input[j*shortpitch + i]; - dct32_1d(temp_in, temp_out, 1); - for (j = 0; j < 32; j++) - output[j*32 + i] = temp_out[j]; - } - // Then transform rows - for (i = 0; i < 32; ++i) { - double temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) - temp_in[j] = output[j + i*32]; - dct32_1d(temp_in, temp_out, 1); - for (j = 0; j < 32; ++j) - output[j + i*32] = temp_out[j]; - } - // Scale by some magic number - for (i = 0; i < 1024; i++) { - out[i] = (short)round(output[i]/4); - } - } - - vp9_clear_system_state(); // Make it simd safe : __asm emms; -} - -#else // CONFIG_DWTDCTHYBRID - -#if DWT_TYPE == 53 - -// Note: block length must be even for this implementation -static void analysis_53_row(int length, short *x, - short *lowpass, short *highpass) { - int n; - short r, *a, *b; + int shortpitch = pitch >> 1; + int i, j; + int16_t output[256]; + int16_t temp_in[16], temp_out[16]; - n = length >> 1; - b = highpass; - a = lowpass; - while (--n) { - *a++ = (r = *x++) << 1; - *b++ = *x - ((r + x[1] + 1) >> 1); - x++; - } - *a = (r = *x++) << 1; - *b = *x - r; - - n = length >> 1; - b = highpass; - a = lowpass; - r = *highpass; - while (n--) { - *a++ += (r + (*b) + 1) >> 1; - r = *b++; + // First transform columns + for (i = 0; i < 16; i++) { + for (j = 0; j < 16; j++) + temp_in[j] = input[j * shortpitch + i] << 2; + fdct16_1d(temp_in, temp_out); + for (j = 0; j < 16; j++) + output[j * 16 + i] = (temp_out[j] + 1) >> 2; } -} -static void analysis_53_col(int length, short *x, - short *lowpass, short *highpass) { - int n; - short r, *a, *b; - - n = length >> 1; - b = highpass; - a = lowpass; - while (--n) { - *a++ = (r = *x++); - *b++ = (((*x) << 1) - (r + x[1]) + 2) >> 2; - x++; - } - *a = (r = *x++); - *b = (*x - r + 1) >> 1; - - n = length >> 1; - b = highpass; - a = lowpass; - r = *highpass; - while (n--) { - *a++ += (r + (*b) + 1) >> 1; - r = *b++; + // Then transform rows + for 
(i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = output[j + i * 16]; + fdct16_1d(temp_in, temp_out); + for (j = 0; j < 16; ++j) + out[j + i * 16] = temp_out[j]; } } -static void dyadic_analyze_53(int levels, int width, int height, - short *x, int pitch_x, short *c, int pitch_c) { - int lv, i, j, nh, nw, hh = height, hw = width; - short buffer[2 * DWT_MAX_LENGTH]; - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - c[i * pitch_c + j] = x[i * pitch_x + j] << DWT_PRECISION_BITS; - } - } - for (lv = 0; lv < levels; lv++) { - nh = hh; - hh = (hh + 1) >> 1; - nw = hw; - hw = (hw + 1) >> 1; - if ((nh < 2) || (nw < 2)) return; - for (i = 0; i < nh; i++) { - memcpy(buffer, &c[i * pitch_c], nw * sizeof(short)); - analysis_53_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw); - } - for (j = 0; j < nw; j++) { - for (i = 0; i < nh; i++) - buffer[i + nh] = c[i * pitch_c + j]; - analysis_53_col(nh, buffer + nh, buffer, buffer + hh); - for (i = 0; i < nh; i++) - c[i * pitch_c + j] = buffer[i]; - } - } +void fadst16_1d(int16_t *input, int16_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; + + int x0 = input[15]; + int x1 = input[0]; + int x2 = input[13]; + int x3 = input[2]; + int x4 = input[11]; + int x5 = input[4]; + int x6 = input[9]; + int x7 = input[6]; + int x8 = input[7]; + int x9 = input[8]; + int x10 = input[5]; + int x11 = input[10]; + int x12 = input[3]; + int x13 = input[12]; + int x14 = input[1]; + int x15 = input[14]; + + // stage 1 + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s1 = x0 * cospi_31_64 - x1 * cospi_1_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s3 = x2 * cospi_27_64 - x3 * cospi_5_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s5 = x4 * cospi_23_64 - x5 * cospi_9_64; + s6 = x6 * cospi_13_64 + x7 * cospi_19_64; + s7 = x6 * cospi_19_64 - x7 * cospi_13_64; + s8 = x8 * cospi_17_64 + x9 * cospi_15_64; + s9 = x8 * cospi_15_64 - x9 * cospi_17_64; + s10 = x10 * cospi_21_64 + x11 * cospi_11_64; + s11 = x10 * cospi_11_64 - x11 * cospi_21_64; + s12 = x12 * cospi_25_64 + x13 * cospi_7_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s14 = x14 * cospi_29_64 + x15 * cospi_3_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + + x0 = dct_const_round_shift(s0 + s8); + x1 = dct_const_round_shift(s1 + s9); + x2 = dct_const_round_shift(s2 + s10); + x3 = dct_const_round_shift(s3 + s11); + x4 = dct_const_round_shift(s4 + s12); + x5 = dct_const_round_shift(s5 + s13); + x6 = dct_const_round_shift(s6 + s14); + x7 = dct_const_round_shift(s7 + s15); + x8 = dct_const_round_shift(s0 - s8); + x9 = dct_const_round_shift(s1 - s9); + x10 = dct_const_round_shift(s2 - s10); + x11 = dct_const_round_shift(s3 - s11); + x12 = dct_const_round_shift(s4 - s12); + x13 = dct_const_round_shift(s5 - s13); + x14 = dct_const_round_shift(s6 - s14); + x15 = dct_const_round_shift(s7 - s15); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; + s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; + s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + + x0 = s0 + s4; + x1 = s1 + s5; + x2 = s2 + s6; + x3 = s3 + s7; + x4 = s0 - s4; + x5 = s1 - s5; + x6 = s2 - s6; + x7 = s3 - s7; + x8 = dct_const_round_shift(s8 + s12); + x9 = 
dct_const_round_shift(s9 + s13); + x10 = dct_const_round_shift(s10 + s14); + x11 = dct_const_round_shift(s11 + s15); + x12 = dct_const_round_shift(s8 - s12); + x13 = dct_const_round_shift(s9 - s13); + x14 = dct_const_round_shift(s10 - s14); + x15 = dct_const_round_shift(s11 - s15); + + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + + x0 = s0 + s2; + x1 = s1 + s3; + x2 = s0 - s2; + x3 = s1 - s3; + x4 = dct_const_round_shift(s4 + s6); + x5 = dct_const_round_shift(s5 + s7); + x6 = dct_const_round_shift(s4 - s6); + x7 = dct_const_round_shift(s5 - s7); + x8 = s8 + s10; + x9 = s9 + s11; + x10 = s8 - s10; + x11 = s9 - s11; + x12 = dct_const_round_shift(s12 + s14); + x13 = dct_const_round_shift(s13 + s15); + x14 = dct_const_round_shift(s12 - s14); + x15 = dct_const_round_shift(s13 - s15); + + // stage 4 + s2 = (- cospi_16_64) * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (- x6 + x7); + s10 = cospi_16_64 * (x10 + x11); + s11 = cospi_16_64 * (- x10 + x11); + s14 = (- cospi_16_64) * (x14 + x15); + s15 = cospi_16_64 * (x14 - x15); + + x2 = dct_const_round_shift(s2); + x3 = dct_const_round_shift(s3); + x6 = dct_const_round_shift(s6); + x7 = dct_const_round_shift(s7); + x10 = dct_const_round_shift(s10); + x11 = dct_const_round_shift(s11); + x14 = dct_const_round_shift(s14); + x15 = dct_const_round_shift(s15); + + output[0] = x0; + output[1] = - x8; + output[2] = x12; + output[3] = - x4; + output[4] = x6; + output[5] = x14; + output[6] = x10; + output[7] = x2; + output[8] = x3; + output[9] = x11; + output[10] = x15; + output[11] = x7; + output[12] = x5; + output[13] = - x13; + output[14] = x9; + output[15] = - x1; } -#elif DWT_TYPE == 26 - -static void analysis_26_row(int length, short *x, - short *lowpass, short *highpass) { - int i, n; - short r, s, *a, *b; - a = lowpass; - b = highpass; - for (i = length >> 1; i; i--) { - r = *x++; - s = *x++; - *a++ = r + s; - *b++ = r - s; - } - n = length >> 1; - if (n >= 4) { - a = lowpass; - b = highpass; - r = *lowpass; - while (--n) { - *b++ -= (r - a[1] + 4) >> 3; - r = *a++; - } - *b -= (r - *a + 4) >> 3; - } -} +static const transform_2d FHT_16[] = { + { fdct16_1d, fdct16_1d }, // DCT_DCT = 0 + { fadst16_1d, fdct16_1d }, // ADST_DCT = 1 + { fdct16_1d, fadst16_1d }, // DCT_ADST = 2 + { fadst16_1d, fadst16_1d } // ADST_ADST = 3 +}; -static void analysis_26_col(int length, short *x, - short *lowpass, short *highpass) { - int i, n; - short r, s, *a, *b; - a = lowpass; - b = highpass; - for (i = length >> 1; i; i--) { - r = *x++; - s = *x++; - *a++ = (r + s + 1) >> 1; - *b++ = (r - s + 1) >> 1; - } - n = length >> 1; - if (n >= 4) { - a = lowpass; - b = highpass; - r = *lowpass; - while (--n) { - *b++ -= (r - a[1] + 4) >> 3; - r = *a++; - } - *b -= (r - *a + 4) >> 3; - } -} +void vp9_short_fht16x16_c(int16_t *input, int16_t *output, + int pitch, TX_TYPE tx_type) { + int16_t out[256]; + int16_t *outptr = &out[0]; + int i, j; + int16_t temp_in[16], temp_out[16]; + const transform_2d ht = FHT_16[tx_type]; -static void dyadic_analyze_26(int levels, int width, int height, - short *x, int pitch_x, short *c, 
int pitch_c) { - int lv, i, j, nh, nw, hh = height, hw = width; - short buffer[2 * DWT_MAX_LENGTH]; - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - c[i * pitch_c + j] = x[i * pitch_x + j] << DWT_PRECISION_BITS; - } - } - for (lv = 0; lv < levels; lv++) { - nh = hh; - hh = (hh + 1) >> 1; - nw = hw; - hw = (hw + 1) >> 1; - if ((nh < 2) || (nw < 2)) return; - for (i = 0; i < nh; i++) { - memcpy(buffer, &c[i * pitch_c], nw * sizeof(short)); - analysis_26_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw); - } - for (j = 0; j < nw; j++) { - for (i = 0; i < nh; i++) - buffer[i + nh] = c[i * pitch_c + j]; - analysis_26_col(nh, buffer + nh, buffer, buffer + hh); - for (i = 0; i < nh; i++) - c[i * pitch_c + j] = buffer[i]; - } + // Columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = input[j * pitch + i] << 2; + ht.cols(temp_in, temp_out); + for (j = 0; j < 16; ++j) + outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; } -} -#elif DWT_TYPE == 97 - -static void analysis_97(int length, double *x, - double *lowpass, double *highpass) { - static const double a_predict1 = -1.586134342; - static const double a_update1 = -0.05298011854; - static const double a_predict2 = 0.8829110762; - static const double a_update2 = 0.4435068522; - static const double s_low = 1.149604398; - static const double s_high = 1/1.149604398; - int i; - double y[DWT_MAX_LENGTH]; - // Predict 1 - for (i = 1; i < length - 2; i += 2) { - x[i] += a_predict1 * (x[i - 1] + x[i + 1]); - } - x[length - 1] += 2 * a_predict1 * x[length - 2]; - // Update 1 - for (i = 2; i < length; i += 2) { - x[i] += a_update1 * (x[i - 1] + x[i + 1]); - } - x[0] += 2 * a_update1 * x[1]; - // Predict 2 - for (i = 1; i < length - 2; i += 2) { - x[i] += a_predict2 * (x[i - 1] + x[i + 1]); - } - x[length - 1] += 2 * a_predict2 * x[length - 2]; - // Update 2 - for (i = 2; i < length; i += 2) { - x[i] += a_update2 * (x[i - 1] + x[i + 1]); - } - x[0] += 2 * a_update2 * x[1]; - memcpy(y, x, sizeof(*y) * length); - // Scale and pack - for (i = 0; i < length / 2; i++) { - lowpass[i] = y[2 * i] * s_low; - highpass[i] = y[2 * i + 1] * s_high; + // Rows + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j + i * 16]; + ht.rows(temp_in, temp_out); + for (j = 0; j < 16; ++j) + output[j + i * 16] = temp_out[j]; } } -static void dyadic_analyze_97(int levels, int width, int height, - short *x, int pitch_x, short *c, int pitch_c) { - int lv, i, j, nh, nw, hh = height, hw = width; - double buffer[2 * DWT_MAX_LENGTH]; - double y[DWT_MAX_LENGTH * DWT_MAX_LENGTH]; - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - y[i * DWT_MAX_LENGTH + j] = x[i * pitch_x + j] << DWT_PRECISION_BITS; - } - } - for (lv = 0; lv < levels; lv++) { - nh = hh; - hh = (hh + 1) >> 1; - nw = hw; - hw = (hw + 1) >> 1; - if ((nh < 2) || (nw < 2)) return; - for (i = 0; i < nh; i++) { - memcpy(buffer, &y[i * DWT_MAX_LENGTH], nw * sizeof(*buffer)); - analysis_97(nw, buffer, &y[i * DWT_MAX_LENGTH], - &y[i * DWT_MAX_LENGTH] + hw); - } - for (j = 0; j < nw; j++) { - for (i = 0; i < nh; i++) - buffer[i + nh] = y[i * DWT_MAX_LENGTH + j]; - analysis_97(nh, buffer + nh, buffer, buffer + hh); - for (i = 0; i < nh; i++) - c[i * pitch_c + j] = round(buffer[i]); - } - } -} -#endif // DWT_TYPE - -// TODO(debargha): Implement the scaling differently so as not to have to -// use the floating point dct -static void dct16x16_1d_f(double input[16], double output[16]) { - static const double C1 = 0.995184726672197; - static 
const double C2 = 0.98078528040323; - static const double C3 = 0.956940335732209; - static const double C4 = 0.923879532511287; - static const double C5 = 0.881921264348355; - static const double C6 = 0.831469612302545; - static const double C7 = 0.773010453362737; - static const double C8 = 0.707106781186548; - static const double C9 = 0.634393284163646; - static const double C10 = 0.555570233019602; - static const double C11 = 0.471396736825998; - static const double C12 = 0.38268343236509; - static const double C13 = 0.290284677254462; - static const double C14 = 0.195090322016128; - static const double C15 = 0.098017140329561; - - vp9_clear_system_state(); // Make it simd safe : __asm emms; - { - double step[16]; - double intermediate[16]; - double temp1, temp2; - - // step 1 - step[ 0] = input[0] + input[15]; - step[ 1] = input[1] + input[14]; - step[ 2] = input[2] + input[13]; - step[ 3] = input[3] + input[12]; - step[ 4] = input[4] + input[11]; - step[ 5] = input[5] + input[10]; - step[ 6] = input[6] + input[ 9]; - step[ 7] = input[7] + input[ 8]; - step[ 8] = input[7] - input[ 8]; - step[ 9] = input[6] - input[ 9]; - step[10] = input[5] - input[10]; - step[11] = input[4] - input[11]; - step[12] = input[3] - input[12]; - step[13] = input[2] - input[13]; - step[14] = input[1] - input[14]; - step[15] = input[0] - input[15]; - - // step 2 - output[0] = step[0] + step[7]; - output[1] = step[1] + step[6]; - output[2] = step[2] + step[5]; - output[3] = step[3] + step[4]; - output[4] = step[3] - step[4]; - output[5] = step[2] - step[5]; - output[6] = step[1] - step[6]; - output[7] = step[0] - step[7]; - - temp1 = step[ 8]*C7; - temp2 = step[15]*C9; - output[ 8] = temp1 + temp2; - - temp1 = step[ 9]*C11; - temp2 = step[14]*C5; - output[ 9] = temp1 - temp2; - - temp1 = step[10]*C3; - temp2 = step[13]*C13; - output[10] = temp1 + temp2; - - temp1 = step[11]*C15; - temp2 = step[12]*C1; - output[11] = temp1 - temp2; - - temp1 = step[11]*C1; - temp2 = step[12]*C15; - output[12] = temp2 + temp1; - - temp1 = step[10]*C13; - temp2 = step[13]*C3; - output[13] = temp2 - temp1; - - temp1 = step[ 9]*C5; - temp2 = step[14]*C11; - output[14] = temp2 + temp1; - - temp1 = step[ 8]*C9; - temp2 = step[15]*C7; - output[15] = temp2 - temp1; - - // step 3 - step[ 0] = output[0] + output[3]; - step[ 1] = output[1] + output[2]; - step[ 2] = output[1] - output[2]; - step[ 3] = output[0] - output[3]; - - temp1 = output[4]*C14; - temp2 = output[7]*C2; - step[ 4] = temp1 + temp2; - - temp1 = output[5]*C10; - temp2 = output[6]*C6; - step[ 5] = temp1 + temp2; - - temp1 = output[5]*C6; - temp2 = output[6]*C10; - step[ 6] = temp2 - temp1; - - temp1 = output[4]*C2; - temp2 = output[7]*C14; - step[ 7] = temp2 - temp1; - - step[ 8] = output[ 8] + output[11]; - step[ 9] = output[ 9] + output[10]; - step[10] = output[ 9] - output[10]; - step[11] = output[ 8] - output[11]; - - step[12] = output[12] + output[15]; - step[13] = output[13] + output[14]; - step[14] = output[13] - output[14]; - step[15] = output[12] - output[15]; - - // step 4 - output[ 0] = (step[ 0] + step[ 1]); - output[ 8] = (step[ 0] - step[ 1]); - - temp1 = step[2]*C12; - temp2 = step[3]*C4; - temp1 = temp1 + temp2; - output[ 4] = 2*(temp1*C8); - - temp1 = step[2]*C4; - temp2 = step[3]*C12; - temp1 = temp2 - temp1; - output[12] = 2*(temp1*C8); - - output[ 2] = 2*((step[4] + step[ 5])*C8); - output[14] = 2*((step[7] - step[ 6])*C8); - - temp1 = step[4] - step[5]; - temp2 = step[6] + step[7]; - output[ 6] = (temp1 + temp2); - output[10] = (temp1 - temp2); - - 
intermediate[8] = step[8] + step[14]; - intermediate[9] = step[9] + step[15]; - - temp1 = intermediate[8]*C12; - temp2 = intermediate[9]*C4; - temp1 = temp1 - temp2; - output[3] = 2*(temp1*C8); - - temp1 = intermediate[8]*C4; - temp2 = intermediate[9]*C12; - temp1 = temp2 + temp1; - output[13] = 2*(temp1*C8); - - output[ 9] = 2*((step[10] + step[11])*C8); - - intermediate[11] = step[10] - step[11]; - intermediate[12] = step[12] + step[13]; - intermediate[13] = step[12] - step[13]; - intermediate[14] = step[ 8] - step[14]; - intermediate[15] = step[ 9] - step[15]; - - output[15] = (intermediate[11] + intermediate[12]); - output[ 1] = -(intermediate[11] - intermediate[12]); - - output[ 7] = 2*(intermediate[13]*C8); - - temp1 = intermediate[14]*C12; - temp2 = intermediate[15]*C4; - temp1 = temp1 - temp2; - output[11] = -2*(temp1*C8); - - temp1 = intermediate[14]*C4; - temp2 = intermediate[15]*C12; - temp1 = temp2 + temp1; - output[ 5] = 2*(temp1*C8); - } - vp9_clear_system_state(); // Make it simd safe : __asm emms; -} +static void dct32_1d(int *input, int *output) { + int step[32]; + // Stage 1 + step[0] = input[0] + input[(32 - 1)]; + step[1] = input[1] + input[(32 - 2)]; + step[2] = input[2] + input[(32 - 3)]; + step[3] = input[3] + input[(32 - 4)]; + step[4] = input[4] + input[(32 - 5)]; + step[5] = input[5] + input[(32 - 6)]; + step[6] = input[6] + input[(32 - 7)]; + step[7] = input[7] + input[(32 - 8)]; + step[8] = input[8] + input[(32 - 9)]; + step[9] = input[9] + input[(32 - 10)]; + step[10] = input[10] + input[(32 - 11)]; + step[11] = input[11] + input[(32 - 12)]; + step[12] = input[12] + input[(32 - 13)]; + step[13] = input[13] + input[(32 - 14)]; + step[14] = input[14] + input[(32 - 15)]; + step[15] = input[15] + input[(32 - 16)]; + step[16] = -input[16] + input[(32 - 17)]; + step[17] = -input[17] + input[(32 - 18)]; + step[18] = -input[18] + input[(32 - 19)]; + step[19] = -input[19] + input[(32 - 20)]; + step[20] = -input[20] + input[(32 - 21)]; + step[21] = -input[21] + input[(32 - 22)]; + step[22] = -input[22] + input[(32 - 23)]; + step[23] = -input[23] + input[(32 - 24)]; + step[24] = -input[24] + input[(32 - 25)]; + step[25] = -input[25] + input[(32 - 26)]; + step[26] = -input[26] + input[(32 - 27)]; + step[27] = -input[27] + input[(32 - 28)]; + step[28] = -input[28] + input[(32 - 29)]; + step[29] = -input[29] + input[(32 - 30)]; + step[30] = -input[30] + input[(32 - 31)]; + step[31] = -input[31] + input[(32 - 32)]; -static void vp9_short_fdct16x16_c_f(short *input, short *out, int pitch, - int scale) { - vp9_clear_system_state(); // Make it simd safe : __asm emms; - { - int shortpitch = pitch >> 1; - int i, j; - double output[256]; - // First transform columns - for (i = 0; i < 16; i++) { - double temp_in[16], temp_out[16]; - for (j = 0; j < 16; j++) - temp_in[j] = input[j*shortpitch + i]; - dct16x16_1d_f(temp_in, temp_out); - for (j = 0; j < 16; j++) - output[j*16 + i] = temp_out[j]; - } - // Then transform rows - for (i = 0; i < 16; ++i) { - double temp_in[16], temp_out[16]; - for (j = 0; j < 16; ++j) - temp_in[j] = output[j + i*16]; - dct16x16_1d_f(temp_in, temp_out); - for (j = 0; j < 16; ++j) - output[j + i*16] = temp_out[j]; - } - // Scale by some magic number - for (i = 0; i < 256; i++) - out[i] = (short)round(output[i] / (2 << scale)); - } - vp9_clear_system_state(); // Make it simd safe : __asm emms; -} + // Stage 2 + output[0] = step[0] + step[16 - 1]; + output[1] = step[1] + step[16 - 2]; + output[2] = step[2] + step[16 - 3]; + output[3] = step[3] + step[16 - 4]; + 
output[4] = step[4] + step[16 - 5]; + output[5] = step[5] + step[16 - 6]; + output[6] = step[6] + step[16 - 7]; + output[7] = step[7] + step[16 - 8]; + output[8] = -step[8] + step[16 - 9]; + output[9] = -step[9] + step[16 - 10]; + output[10] = -step[10] + step[16 - 11]; + output[11] = -step[11] + step[16 - 12]; + output[12] = -step[12] + step[16 - 13]; + output[13] = -step[13] + step[16 - 14]; + output[14] = -step[14] + step[16 - 15]; + output[15] = -step[15] + step[16 - 16]; + + output[16] = step[16]; + output[17] = step[17]; + output[18] = step[18]; + output[19] = step[19]; + + output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64); + output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64); + output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64); + output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64); + + output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64); + output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64); + output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64); + output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64); + + output[28] = step[28]; + output[29] = step[29]; + output[30] = step[30]; + output[31] = step[31]; -void vp9_short_fdct8x8_c_f(short *block, short *coefs, int pitch, int scale) { - int j1, i, j, k; - float b[8]; - float b1[8]; - float d[8][8]; - float f0 = (float) .7071068; - float f1 = (float) .4903926; - float f2 = (float) .4619398; - float f3 = (float) .4157348; - float f4 = (float) .3535534; - float f5 = (float) .2777851; - float f6 = (float) .1913417; - float f7 = (float) .0975452; - pitch = pitch / 2; - for (i = 0, k = 0; i < 8; i++, k += pitch) { - for (j = 0; j < 8; j++) { - b[j] = (float)(block[k + j] << (3 - scale)); - } - /* Horizontal transform */ - for (j = 0; j < 4; j++) { - j1 = 7 - j; - b1[j] = b[j] + b[j1]; - b1[j1] = b[j] - b[j1]; - } - b[0] = b1[0] + b1[3]; - b[1] = b1[1] + b1[2]; - b[2] = b1[1] - b1[2]; - b[3] = b1[0] - b1[3]; - b[4] = b1[4]; - b[5] = (b1[6] - b1[5]) * f0; - b[6] = (b1[6] + b1[5]) * f0; - b[7] = b1[7]; - d[i][0] = (b[0] + b[1]) * f4; - d[i][4] = (b[0] - b[1]) * f4; - d[i][2] = b[2] * f6 + b[3] * f2; - d[i][6] = b[3] * f6 - b[2] * f2; - b1[4] = b[4] + b[5]; - b1[7] = b[7] + b[6]; - b1[5] = b[4] - b[5]; - b1[6] = b[7] - b[6]; - d[i][1] = b1[4] * f7 + b1[7] * f1; - d[i][5] = b1[5] * f3 + b1[6] * f5; - d[i][7] = b1[7] * f7 - b1[4] * f1; - d[i][3] = b1[6] * f3 - b1[5] * f5; - } - /* Vertical transform */ - for (i = 0; i < 8; i++) { - for (j = 0; j < 4; j++) { - j1 = 7 - j; - b1[j] = d[j][i] + d[j1][i]; - b1[j1] = d[j][i] - d[j1][i]; - } - b[0] = b1[0] + b1[3]; - b[1] = b1[1] + b1[2]; - b[2] = b1[1] - b1[2]; - b[3] = b1[0] - b1[3]; - b[4] = b1[4]; - b[5] = (b1[6] - b1[5]) * f0; - b[6] = (b1[6] + b1[5]) * f0; - b[7] = b1[7]; - d[0][i] = (b[0] + b[1]) * f4; - d[4][i] = (b[0] - b[1]) * f4; - d[2][i] = b[2] * f6 + b[3] * f2; - d[6][i] = b[3] * f6 - b[2] * f2; - b1[4] = b[4] + b[5]; - b1[7] = b[7] + b[6]; - b1[5] = b[4] - b[5]; - b1[6] = b[7] - b[6]; - d[1][i] = b1[4] * f7 + b1[7] * f1; - d[5][i] = b1[5] * f3 + b1[6] * f5; - d[7][i] = b1[7] * f7 - b1[4] * f1; - d[3][i] = b1[6] * f3 - b1[5] * f5; - } - for (i = 0; i < 8; i++) { - for (j = 0; j < 8; j++) { - *(coefs + j + i * 8) = (short) floor(d[i][j] + 0.5); - } - } - return; -} + // Stage 3 + step[0] = output[0] + output[(8 - 1)]; + step[1] = output[1] + output[(8 - 2)]; + step[2] = output[2] + output[(8 - 3)]; + step[3] = output[3] + output[(8 - 4)]; + step[4] = -output[4] + output[(8 - 5)]; + step[5] = 
-output[5] + output[(8 - 6)]; + step[6] = -output[6] + output[(8 - 7)]; + step[7] = -output[7] + output[(8 - 8)]; + step[8] = output[8]; + step[9] = output[9]; + step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64); + step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64); + step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64); + step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64); + step[14] = output[14]; + step[15] = output[15]; + + step[16] = output[16] + output[23]; + step[17] = output[17] + output[22]; + step[18] = output[18] + output[21]; + step[19] = output[19] + output[20]; + step[20] = -output[20] + output[19]; + step[21] = -output[21] + output[18]; + step[22] = -output[22] + output[17]; + step[23] = -output[23] + output[16]; + step[24] = -output[24] + output[31]; + step[25] = -output[25] + output[30]; + step[26] = -output[26] + output[29]; + step[27] = -output[27] + output[28]; + step[28] = output[28] + output[27]; + step[29] = output[29] + output[26]; + step[30] = output[30] + output[25]; + step[31] = output[31] + output[24]; -#define divide_bits(d, n) ((n) < 0 ? (d) << (n) : (d) >> (n)) + // Stage 4 + output[0] = step[0] + step[3]; + output[1] = step[1] + step[2]; + output[2] = -step[2] + step[1]; + output[3] = -step[3] + step[0]; + output[4] = step[4]; + output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64); + output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64); + output[7] = step[7]; + output[8] = step[8] + step[11]; + output[9] = step[9] + step[10]; + output[10] = -step[10] + step[9]; + output[11] = -step[11] + step[8]; + output[12] = -step[12] + step[15]; + output[13] = -step[13] + step[14]; + output[14] = step[14] + step[13]; + output[15] = step[15] + step[12]; + + output[16] = step[16]; + output[17] = step[17]; + output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64); + output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64); + output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64); + output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64); + output[22] = step[22]; + output[23] = step[23]; + output[24] = step[24]; + output[25] = step[25]; + output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64); + output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64); + output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64); + output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64); + output[30] = step[30]; + output[31] = step[31]; -#if DWTDCT_TYPE == DWTDCT16X16_LEAN + // Stage 5 + step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64); + step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64); + step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64); + step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64); + step[4] = output[4] + output[5]; + step[5] = -output[5] + output[4]; + step[6] = -output[6] + output[7]; + step[7] = output[7] + output[6]; + step[8] = output[8]; + step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64); + step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64); + step[11] = output[11]; + step[12] = output[12]; + step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64); + step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64); + step[15] = output[15]; + + step[16] = output[16] + output[19]; + step[17] = output[17] + 
output[18]; + step[18] = -output[18] + output[17]; + step[19] = -output[19] + output[16]; + step[20] = -output[20] + output[23]; + step[21] = -output[21] + output[22]; + step[22] = output[22] + output[21]; + step[23] = output[23] + output[20]; + step[24] = output[24] + output[27]; + step[25] = output[25] + output[26]; + step[26] = -output[26] + output[25]; + step[27] = -output[27] + output[24]; + step[28] = -output[28] + output[31]; + step[29] = -output[29] + output[30]; + step[30] = output[30] + output[29]; + step[31] = output[31] + output[28]; -void vp9_short_fdct32x32_c(short *input, short *out, int pitch) { - // assume out is a 32x32 buffer - short buffer[16 * 16]; - int i, j; - const int short_pitch = pitch >> 1; -#if DWT_TYPE == 26 - dyadic_analyze_26(1, 32, 32, input, short_pitch, out, 32); -#elif DWT_TYPE == 97 - dyadic_analyze_97(1, 32, 32, input, short_pitch, out, 32); -#elif DWT_TYPE == 53 - dyadic_analyze_53(1, 32, 32, input, short_pitch, out, 32); -#endif - // TODO(debargha): Implement more efficiently by adding output pitch - // argument to the dct16x16 function - vp9_short_fdct16x16_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) - vpx_memcpy(out + i * 32, buffer + i * 16, sizeof(short) * 16); - for (i = 0; i < 16; ++i) { - for (j = 16; j < 32; ++j) { - out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2); - } - } - for (i = 16; i < 32; ++i) { - for (j = 0; j < 32; ++j) { - out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2); - } - } -} + // Stage 6 + output[0] = step[0]; + output[1] = step[1]; + output[2] = step[2]; + output[3] = step[3]; + output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64); + output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64); + output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64); + output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64); + output[8] = step[8] + step[9]; + output[9] = -step[9] + step[8]; + output[10] = -step[10] + step[11]; + output[11] = step[11] + step[10]; + output[12] = step[12] + step[13]; + output[13] = -step[13] + step[12]; + output[14] = -step[14] + step[15]; + output[15] = step[15] + step[14]; + + output[16] = step[16]; + output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64); + output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64); + output[19] = step[19]; + output[20] = step[20]; + output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64); + output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64); + output[23] = step[23]; + output[24] = step[24]; + output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64); + output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64); + output[27] = step[27]; + output[28] = step[28]; + output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64); + output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64); + output[31] = step[31]; -#elif DWTDCT_TYPE == DWTDCT16X16 + // Stage 7 + step[0] = output[0]; + step[1] = output[1]; + step[2] = output[2]; + step[3] = output[3]; + step[4] = output[4]; + step[5] = output[5]; + step[6] = output[6]; + step[7] = output[7]; + step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64); + step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64); + step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64); + step[11] = 
dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
+ step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
+ step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
+ step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
+ step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
+
+ step[16] = output[16] + output[17];
+ step[17] = -output[17] + output[16];
+ step[18] = -output[18] + output[19];
+ step[19] = output[19] + output[18];
+ step[20] = output[20] + output[21];
+ step[21] = -output[21] + output[20];
+ step[22] = -output[22] + output[23];
+ step[23] = output[23] + output[22];
+ step[24] = output[24] + output[25];
+ step[25] = -output[25] + output[24];
+ step[26] = -output[26] + output[27];
+ step[27] = output[27] + output[26];
+ step[28] = output[28] + output[29];
+ step[29] = -output[29] + output[28];
+ step[30] = -output[30] + output[31];
+ step[31] = output[31] + output[30];
-void vp9_short_fdct32x32_c(short *input, short *out, int pitch) {
- // assume out is a 32x32 buffer
- short buffer[16 * 16];
- int i, j;
- const int short_pitch = pitch >> 1;
-#if DWT_TYPE == 26
- dyadic_analyze_26(1, 32, 32, input, short_pitch, out, 32);
-#elif DWT_TYPE == 97
- dyadic_analyze_97(1, 32, 32, input, short_pitch, out, 32);
-#elif DWT_TYPE == 53
- dyadic_analyze_53(1, 32, 32, input, short_pitch, out, 32);
-#endif
- // TODO(debargha): Implement more efficiently by adding output pitch
- // argument to the dct16x16 function
- vp9_short_fdct16x16_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS);
- for (i = 0; i < 16; ++i)
- vpx_memcpy(out + i * 32, buffer + i * 16, sizeof(short) * 16);
- vp9_short_fdct16x16_c_f(out + 16, buffer, 64, 1 + DWT_PRECISION_BITS);
- for (i = 0; i < 16; ++i)
- vpx_memcpy(out + i * 32 + 16, buffer + i * 16, sizeof(short) * 16);
-
- vp9_short_fdct16x16_c_f(out + 32 * 16, buffer, 64, 1 + DWT_PRECISION_BITS);
- for (i = 0; i < 16; ++i)
- vpx_memcpy(out + i * 32 + 32 * 16, buffer + i * 16, sizeof(short) * 16);
-
- vp9_short_fdct16x16_c_f(out + 33 * 16, buffer, 64, 1 + DWT_PRECISION_BITS);
- for (i = 0; i < 16; ++i)
- vpx_memcpy(out + i * 32 + 33 * 16, buffer + i * 16, sizeof(short) * 16);
+ // Final stage: output indices are bit-reversed.
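The last butterfly stage produces the 32 coefficients in bit-reversed index order, so each step[k] below is stored at the 5-bit reversal of k to restore natural frequency order. A minimal illustration of that mapping; bit_reverse5 is a hypothetical helper written for this note, not part of the change:

static int bit_reverse5(int k) {
  // Reverse the five low bits of k (0 <= k < 32), e.g.
  // 1 (00001b) -> 16 (10000b) and 18 (10010b) -> 9 (01001b).
  int r = 0, b;
  for (b = 0; b < 5; ++b)
    r |= ((k >> b) & 1) << (4 - b);
  return r;
}

// The assignments that follow amount to output[bit_reverse5(k)] = result[k]:
// step[1] is written to output[16], and the step[18]/step[29] rotation
// lands in output[9].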
+ output[0] = step[0]; + output[16] = step[1]; + output[8] = step[2]; + output[24] = step[3]; + output[4] = step[4]; + output[20] = step[5]; + output[12] = step[6]; + output[28] = step[7]; + output[2] = step[8]; + output[18] = step[9]; + output[10] = step[10]; + output[26] = step[11]; + output[6] = step[12]; + output[22] = step[13]; + output[14] = step[14]; + output[30] = step[15]; + + output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64); + output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64); + output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64); + output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64); + output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64); + output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64); + output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64); + output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64); + output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64); + output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64); + output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64); + output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64); + output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64); + output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64); + output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64); + output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); } -#elif DWTDCT_TYPE == DWTDCT8X8 - -void vp9_short_fdct32x32_c(short *input, short *out, int pitch) { - // assume out is a 32x32 buffer - short buffer[8 * 8]; +void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) { + int shortpitch = pitch >> 1; int i, j; - const int short_pitch = pitch >> 1; -#if DWT_TYPE == 26 - dyadic_analyze_26(2, 32, 32, input, short_pitch, out, 32); -#elif DWT_TYPE == 97 - dyadic_analyze_97(2, 32, 32, input, short_pitch, out, 32); -#elif DWT_TYPE == 53 - dyadic_analyze_53(2, 32, 32, input, short_pitch, out, 32); -#endif - // TODO(debargha): Implement more efficiently by adding output pitch - // argument to the dct16x16 function - vp9_short_fdct8x8_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS); - for (i = 0; i < 8; ++i) - vpx_memcpy(out + i * 32, buffer + i * 8, sizeof(short) * 8); - - vp9_short_fdct8x8_c_f(out + 8, buffer, 64, 1 + DWT_PRECISION_BITS); - for (i = 0; i < 8; ++i) - vpx_memcpy(out + i * 32 + 8, buffer + i * 8, sizeof(short) * 8); - - vp9_short_fdct8x8_c_f(out + 32 * 8, buffer, 64, 1 + DWT_PRECISION_BITS); - for (i = 0; i < 8; ++i) - vpx_memcpy(out + i * 32 + 32 * 8, buffer + i * 8, sizeof(short) * 8); - - vp9_short_fdct8x8_c_f(out + 33 * 8, buffer, 64, 1 + DWT_PRECISION_BITS); - for (i = 0; i < 8; ++i) - vpx_memcpy(out + i * 32 + 33 * 8, buffer + i * 8, sizeof(short) * 8); - - for (i = 0; i < 16; ++i) { - for (j = 16; j < 32; ++j) { - out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2); - } + int output[32 * 32]; + + // Columns + for (i = 0; i < 32; i++) { + int temp_in[32], temp_out[32]; + for (j = 0; j < 32; j++) + temp_in[j] = input[j * shortpitch + i] << 2; + dct32_1d(temp_in, temp_out); + for (j = 0; j < 32; j++) + output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; } - for (i = 16; i < 32; ++i) { - for (j = 0; j < 32; ++j) { - out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2); - } - } 
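Both passes of the new 32x32 forward transform scale their intermediate results down by 4 with a rounding shift rather than a plain truncating shift, and the bias term differs between them: the column pass above adds (x > 0), rounding exact halves away from zero, while the row pass that follows adds (x < 0), rounding exact halves toward zero. A standalone sketch of the column-pass idiom, under a hypothetical helper name:

/* Divide by 4, rounding to nearest, with exact halves pushed away from zero. */
static int round_shift2_away(int x) {
  /* 6 -> (6 + 1 + 1) >> 2 = 2, and -6 -> (-6 + 1 + 0) >> 2 = -5 >> 2 = -2,
     because an arithmetic right shift of a negative value rounds toward
     minus infinity. */
  return (x + 1 + (x > 0)) >> 2;
}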
-} -#endif - -#if CONFIG_TX64X64 -void vp9_short_fdct64x64_c(short *input, short *out, int pitch) { - // assume out is a 64x64 buffer - short buffer[16 * 16]; - int i, j; - const int short_pitch = pitch >> 1; -#if DWT_TYPE == 26 - dyadic_analyze_26(2, 64, 64, input, short_pitch, out, 64); -#elif DWT_TYPE == 97 - dyadic_analyze_97(2, 64, 64, input, short_pitch, out, 64); -#elif DWT_TYPE == 53 - dyadic_analyze_53(2, 64, 64, input, short_pitch, out, 64); -#endif - // TODO(debargha): Implement more efficiently by adding output pitch - // argument to the dct16x16 function - vp9_short_fdct16x16_c_f(out, buffer, 128, 2 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) - vpx_memcpy(out + i * 64, buffer + i * 16, sizeof(short) * 16); - -#if DWTDCT_TYPE == DWTDCT16X16_LEAN - for (i = 0; i < 16; ++i) { - for (j = 16; j < 48; ++j) { - out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1); - } - } - for (i = 16; i < 64; ++i) { - for (j = 0; j < 64; ++j) { - out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1); - } - } -#elif DWTDCT_TYPE == DWTDCT16X16 - vp9_short_fdct16x16_c_f(out + 16, buffer, 128, 2 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) - vpx_memcpy(out + i * 64 + 16, buffer + i * 16, sizeof(short) * 16); - - vp9_short_fdct16x16_c_f(out + 64 * 16, buffer, 128, 2 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) - vpx_memcpy(out + i * 64 + 64 * 16, buffer + i * 16, sizeof(short) * 16); - - vp9_short_fdct16x16_c_f(out + 65 * 16, buffer, 128, 2 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) - vpx_memcpy(out + i * 64 + 65 * 16, buffer + i * 16, sizeof(short) * 16); - - // There is no dct used on the highest bands for now. - // Need to scale these coeffs by a factor of 2/2^DWT_PRECISION_BITS - // TODO(debargha): experiment with turning these coeffs to 0 + // Rows for (i = 0; i < 32; ++i) { - for (j = 32; j < 64; ++j) { - out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1); - } - } - for (i = 32; i < 64; ++i) { - for (j = 0; j < 64; ++j) { - out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1); - } + int temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) + temp_in[j] = output[j + i * 32]; + dct32_1d(temp_in, temp_out); + for (j = 0; j < 32; ++j) + out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; } -#endif // DWTDCT_TYPE } -#endif // CONFIG_TX64X64 -#endif // CONFIG_DWTDCTHYBRID + diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 3f5133062..5271a597c 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -21,7 +21,6 @@ #include "vp9/common/vp9_quant_common.h" #include "vp9/encoder/vp9_segmentation.h" #include "vp9/common/vp9_setupintrarecon.h" -#include "vp9/common/vp9_reconintra4x4.h" #include "vp9/encoder/vp9_encodeintra.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_invtrans.h" @@ -29,8 +28,9 @@ #include "vp9/common/vp9_findnearmv.h" #include "vp9/common/vp9_reconintra.h" #include "vp9/common/vp9_seg_common.h" +#include "vp9/common/vp9_tile_common.h" #include "vp9/encoder/vp9_tokenize.h" -#include "vp9_rtcd.h" +#include "./vp9_rtcd.h" #include <stdio.h> #include <math.h> #include <limits.h> @@ -45,18 +45,15 @@ int enc_debug = 0; #endif -extern void select_interp_filter_type(VP9_COMP *cpi); +void vp9_select_interp_filter_type(VP9_COMP *cpi); static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t, - int recon_yoffset, int recon_uvoffset, int output_enabled, int mb_row, int mb_col); static void encode_superblock32(VP9_COMP 
*cpi, TOKENEXTRA **t, - int recon_yoffset, int recon_uvoffset, int output_enabled, int mb_row, int mb_col); static void encode_superblock64(VP9_COMP *cpi, TOKENEXTRA **t, - int recon_yoffset, int recon_uvoffset, int output_enabled, int mb_row, int mb_col); static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x); @@ -103,7 +100,7 @@ static unsigned int tt_activity_measure(VP9_COMP *cpi, MACROBLOCK *x) { */ act = vp9_variance16x16(x->src.y_buffer, x->src.y_stride, VP9_VAR_OFFS, 0, &sse); - act = act << 4; + act <<= 4; /* If the region is flat, lower the activity some more. */ if (act < 8 << 12) @@ -488,8 +485,7 @@ static void update_state(VP9_COMP *cpi, { int segment_id = mbmi->segment_id; - if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) || - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB)) { + if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) { for (i = 0; i < NB_TXFM_MODES; i++) { cpi->rd_tx_select_diff[i] += ctx->txfm_rd_diff[i]; } @@ -625,27 +621,19 @@ static unsigned find_seg_id(uint8_t *buf, int block_size, } static void set_offsets(VP9_COMP *cpi, - int mb_row, int mb_col, int block_size, - int *ref_yoffset, int *ref_uvoffset) { + int mb_row, int mb_col, int block_size) { MACROBLOCK *const x = &cpi->mb; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *mbmi; const int dst_fb_idx = cm->new_fb_idx; - const int recon_y_stride = cm->yv12_fb[dst_fb_idx].y_stride; - const int recon_uv_stride = cm->yv12_fb[dst_fb_idx].uv_stride; - const int recon_yoffset = 16 * mb_row * recon_y_stride + 16 * mb_col; - const int recon_uvoffset = 8 * mb_row * recon_uv_stride + 8 * mb_col; - const int src_y_stride = x->src.y_stride; - const int src_uv_stride = x->src.uv_stride; - const int src_yoffset = 16 * mb_row * src_y_stride + 16 * mb_col; - const int src_uvoffset = 8 * mb_row * src_uv_stride + 8 * mb_col; - const int ref_fb_idx = cm->lst_fb_idx; - const int ref_y_stride = cm->yv12_fb[ref_fb_idx].y_stride; - const int ref_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride; const int idx_map = mb_row * cm->mb_cols + mb_col; const int idx_str = xd->mode_info_stride * mb_row + mb_col; +#ifdef ENC_DEBUG + enc_debug = (cpi->common.current_video_frame == 2 && + mb_row == 4 && mb_col == 5); +#endif // entropy context structures xd->above_context = cm->above_context + mb_col; xd->left_context = cm->left_context + (mb_row & 3); @@ -664,9 +652,9 @@ static void set_offsets(VP9_COMP *cpi, xd->prev_mode_info_context = cm->prev_mi + idx_str; // Set up destination pointers - xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset; - xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset; - xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset; + setup_pred_block(&xd->dst, + &cm->yv12_fb[dst_fb_idx], + mb_row, mb_col, NULL, NULL); /* Set up limit values for MV components to prevent them from * extending beyond the UMV borders assuming 16x16 block size */ @@ -686,17 +674,12 @@ static void set_offsets(VP9_COMP *cpi, xd->mb_to_right_edge = ((cm->mb_cols - block_size - mb_col) * 16) << 3; // Are edges available for intra prediction? 
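// (In the rewritten availability checks below, edges are clamped to the
// current tile rather than to the frame: a left neighbor exists only when
// mb_col > cm->cur_tile_mb_col_start, and a right neighbor only while
// mb_col + block_size < cm->cur_tile_mb_col_end, so intra prediction never
// reads across a tile column boundary.)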
- xd->up_available = (mb_row != 0); - xd->left_available = (mb_col != 0); - - /* Reference buffer offsets */ - *ref_yoffset = (mb_row * ref_y_stride * 16) + (mb_col * 16); - *ref_uvoffset = (mb_row * ref_uv_stride * 8) + (mb_col * 8); + xd->up_available = (mb_row != 0); + xd->left_available = (mb_col > cm->cur_tile_mb_col_start); + xd->right_available = (mb_col + block_size < cm->cur_tile_mb_col_end); /* set up source buffers */ - x->src.y_buffer = cpi->Source->y_buffer + src_yoffset; - x->src.u_buffer = cpi->Source->u_buffer + src_uvoffset; - x->src.v_buffer = cpi->Source->v_buffer + src_uvoffset; + setup_pred_block(&x->src, cpi->Source, mb_row, mb_col, NULL, NULL); /* R/D setup */ x->rddiv = cpi->RDDIV; @@ -727,34 +710,36 @@ static void set_offsets(VP9_COMP *cpi, const int x = mb_col & ~3; const int p16 = ((mb_row & 1) << 1) + (mb_col & 1); const int p32 = ((mb_row & 2) << 2) + ((mb_col & 2) << 1); + const int tile_progress = cm->cur_tile_mb_col_start * cm->mb_rows; + const int mb_cols = cm->cur_tile_mb_col_end - cm->cur_tile_mb_col_start; cpi->seg0_progress = - ((y * cm->mb_cols + x * 4 + p32 + p16) << 16) / cm->MBs; + ((y * mb_cols + x * 4 + p32 + p16 + tile_progress) << 16) / cm->MBs; } } else { mbmi->segment_id = 0; } } -static void pick_mb_modes(VP9_COMP *cpi, - int mb_row, - int mb_col, - TOKENEXTRA **tp, - int *totalrate, - int *totaldist) { +static int pick_mb_modes(VP9_COMP *cpi, + int mb_row0, + int mb_col0, + TOKENEXTRA **tp, + int *totalrate, + int *totaldist) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; int i; - int recon_yoffset, recon_uvoffset; + int splitmodes_used = 0; ENTROPY_CONTEXT_PLANES left_context[2]; ENTROPY_CONTEXT_PLANES above_context[2]; ENTROPY_CONTEXT_PLANES *initial_above_context_ptr = cm->above_context - + mb_col; + + mb_col0; /* Function should not modify L & A contexts; save and restore on exit */ vpx_memcpy(left_context, - cm->left_context + (mb_row & 2), + cm->left_context + (mb_row0 & 2), sizeof(left_context)); vpx_memcpy(above_context, initial_above_context_ptr, @@ -763,17 +748,18 @@ static void pick_mb_modes(VP9_COMP *cpi, /* Encode MBs in raster order within the SB */ for (i = 0; i < 4; i++) { const int x_idx = i & 1, y_idx = i >> 1; + const int mb_row = mb_row0 + y_idx; + const int mb_col = mb_col0 + x_idx; MB_MODE_INFO *mbmi; - if ((mb_row + y_idx >= cm->mb_rows) || (mb_col + x_idx >= cm->mb_cols)) { + if ((mb_row >= cm->mb_rows) || (mb_col >= cm->mb_cols)) { // MB lies outside frame, move on continue; } // Index of the MB in the SB 0..3 xd->mb_index = i; - set_offsets(cpi, mb_row + y_idx, mb_col + x_idx, 16, - &recon_yoffset, &recon_uvoffset); + set_offsets(cpi, mb_row, mb_col, 16); if (cpi->oxcf.tuning == VP8_TUNE_SSIM) vp9_activity_masking(cpi, x); @@ -781,10 +767,6 @@ static void pick_mb_modes(VP9_COMP *cpi, mbmi = &xd->mode_info_context->mbmi; mbmi->sb_type = BLOCK_SIZE_MB16X16; - cpi->update_context = 0; // TODO Do we need this now?? 
- - vp9_intra_prediction_down_copy(xd); - // Find best coding mode & reconstruct the MB so it is available // as a predictor for MBs that follow in the SB if (cm->frame_type == KEY_FRAME) { @@ -798,8 +780,8 @@ static void pick_mb_modes(VP9_COMP *cpi, *totaldist += d; // Dummy encode, do not do the tokenization - encode_macroblock(cpi, tp, recon_yoffset, recon_uvoffset, 0, - mb_row + y_idx, mb_col + x_idx); + encode_macroblock(cpi, tp, 0, mb_row, mb_col); + // Note the encoder may have changed the segment_id // Save the coding context @@ -812,14 +794,14 @@ static void pick_mb_modes(VP9_COMP *cpi, if (enc_debug) printf("inter pick_mb_modes %d %d\n", mb_row, mb_col); #endif - vp9_pick_mode_inter_macroblock(cpi, x, recon_yoffset, - recon_uvoffset, &r, &d); + vp9_pick_mode_inter_macroblock(cpi, x, mb_row, mb_col, &r, &d); *totalrate += r; *totaldist += d; + splitmodes_used += (mbmi->mode == SPLITMV); + // Dummy encode, do not do the tokenization - encode_macroblock(cpi, tp, recon_yoffset, recon_uvoffset, 0, - mb_row + y_idx, mb_col + x_idx); + encode_macroblock(cpi, tp, 0, mb_row, mb_col); seg_id = mbmi->segment_id; if (cpi->mb.e_mbd.segmentation_enabled && seg_id == 0) { @@ -842,12 +824,14 @@ static void pick_mb_modes(VP9_COMP *cpi, } /* Restore L & A coding context to those in place on entry */ - vpx_memcpy(cm->left_context + (mb_row & 2), + vpx_memcpy(cm->left_context + (mb_row0 & 2), left_context, sizeof(left_context)); vpx_memcpy(initial_above_context_ptr, above_context, sizeof(above_context)); + + return splitmodes_used; } static void pick_sb_modes(VP9_COMP *cpi, @@ -859,13 +843,11 @@ static void pick_sb_modes(VP9_COMP *cpi, VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - int recon_yoffset, recon_uvoffset; - set_offsets(cpi, mb_row, mb_col, 32, &recon_yoffset, &recon_uvoffset); + set_offsets(cpi, mb_row, mb_col, 32); xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB32X32; if (cpi->oxcf.tuning == VP8_TUNE_SSIM) vp9_activity_masking(cpi, x); - cpi->update_context = 0; // TODO Do we need this now?? /* Find best coding mode & reconstruct the MB so it is available * as a predictor for MBs that follow in the SB */ @@ -878,11 +860,7 @@ static void pick_sb_modes(VP9_COMP *cpi, vpx_memcpy(&x->sb32_context[xd->sb_index].mic, xd->mode_info_context, sizeof(MODE_INFO)); } else { - vp9_rd_pick_inter_mode_sb32(cpi, x, - recon_yoffset, - recon_uvoffset, - totalrate, - totaldist); + vp9_rd_pick_inter_mode_sb32(cpi, x, mb_row, mb_col, totalrate, totaldist); } } @@ -895,30 +873,21 @@ static void pick_sb64_modes(VP9_COMP *cpi, VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - int recon_yoffset, recon_uvoffset; - set_offsets(cpi, mb_row, mb_col, 64, &recon_yoffset, &recon_uvoffset); + set_offsets(cpi, mb_row, mb_col, 64); xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB64X64; if (cpi->oxcf.tuning == VP8_TUNE_SSIM) vp9_activity_masking(cpi, x); - cpi->update_context = 0; // TODO(rbultje) Do we need this now?? 
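Throughout this refactor, the precomputed recon_yoffset/recon_uvoffset pair is replaced by handing (mb_row, mb_col) straight to setup_pred_block(), whose body is not part of this diff. A minimal sketch of what such a helper presumably does, reconstructed from the offset arithmetic the deleted code used (the scale-factor arguments, passed as NULL in the set_offsets() calls, are omitted):

#include "vpx_scale/yv12config.h"  /* YV12_BUFFER_CONFIG */

/* Hypothetical stand-in for setup_pred_block(): point a frame-buffer struct
 * at the 16x16 luma / 8x8 chroma block for (mb_row, mb_col). */
static void setup_pred_block_sketch(YV12_BUFFER_CONFIG *dst,
                                    const YV12_BUFFER_CONFIG *src,
                                    int mb_row, int mb_col) {
  const int y_offset = 16 * mb_row * src->y_stride + 16 * mb_col;
  const int uv_offset = 8 * mb_row * src->uv_stride + 8 * mb_col;

  *dst = *src;  /* copy strides and dimensions */
  dst->y_buffer += y_offset;
  dst->u_buffer += uv_offset;
  dst->v_buffer += uv_offset;
}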
/* Find best coding mode & reconstruct the MB so it is available * as a predictor for MBs that follow in the SB */ if (cm->frame_type == KEY_FRAME) { - vp9_rd_pick_intra_mode_sb64(cpi, x, - totalrate, - totaldist); + vp9_rd_pick_intra_mode_sb64(cpi, x, totalrate, totaldist); /* Save the coding context */ - vpx_memcpy(&x->sb64_context.mic, xd->mode_info_context, - sizeof(MODE_INFO)); + vpx_memcpy(&x->sb64_context.mic, xd->mode_info_context, sizeof(MODE_INFO)); } else { - vp9_rd_pick_inter_mode_sb64(cpi, x, - recon_yoffset, - recon_uvoffset, - totalrate, - totaldist); + vp9_rd_pick_inter_mode_sb64(cpi, x, mb_row, mb_col, totalrate, totaldist); } } @@ -986,14 +955,13 @@ static void encode_sb(VP9_COMP *cpi, VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - int recon_yoffset, recon_uvoffset; cpi->sb32_count[is_sb]++; if (is_sb) { - set_offsets(cpi, mb_row, mb_col, 32, &recon_yoffset, &recon_uvoffset); + set_offsets(cpi, mb_row, mb_col, 32); update_state(cpi, &x->sb32_context[xd->sb_index], 32, output_enabled); - encode_superblock32(cpi, tp, recon_yoffset, recon_uvoffset, + encode_superblock32(cpi, tp, output_enabled, mb_row, mb_col); if (output_enabled) update_stats(cpi); @@ -1015,17 +983,14 @@ static void encode_sb(VP9_COMP *cpi, continue; } - set_offsets(cpi, mb_row + y_idx, mb_col + x_idx, 16, - &recon_yoffset, &recon_uvoffset); + set_offsets(cpi, mb_row + y_idx, mb_col + x_idx, 16); xd->mb_index = i; update_state(cpi, &x->mb_context[xd->sb_index][i], 16, output_enabled); if (cpi->oxcf.tuning == VP8_TUNE_SSIM) vp9_activity_masking(cpi, x); - vp9_intra_prediction_down_copy(xd); - - encode_macroblock(cpi, tp, recon_yoffset, recon_uvoffset, + encode_macroblock(cpi, tp, output_enabled, mb_row + y_idx, mb_col + x_idx); if (output_enabled) update_stats(cpi); @@ -1060,11 +1025,9 @@ static void encode_sb64(VP9_COMP *cpi, cpi->sb64_count[is_sb[0] == 2]++; if (is_sb[0] == 2) { - int recon_yoffset, recon_uvoffset; - - set_offsets(cpi, mb_row, mb_col, 64, &recon_yoffset, &recon_uvoffset); + set_offsets(cpi, mb_row, mb_col, 64); update_state(cpi, &x->sb64_context, 64, 1); - encode_superblock64(cpi, tp, recon_yoffset, recon_uvoffset, + encode_superblock64(cpi, tp, 1, mb_row, mb_col); update_stats(cpi); @@ -1098,17 +1061,18 @@ static void encode_sb_row(VP9_COMP *cpi, MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; int mb_col; - int mb_cols = cm->mb_cols; // Initialize the left context for the new SB row vpx_memset(cm->left_context, 0, sizeof(cm->left_context)); // Code each SB in the row - for (mb_col = 0; mb_col < mb_cols; mb_col += 4) { + for (mb_col = cm->cur_tile_mb_col_start; + mb_col < cm->cur_tile_mb_col_end; mb_col += 4) { int i; int sb32_rate = 0, sb32_dist = 0; int is_sb[4]; int sb64_rate = INT_MAX, sb64_dist; + int sb64_skip = 0; ENTROPY_CONTEXT_PLANES l[4], a[4]; TOKENEXTRA *tp_orig = *tp; @@ -1118,18 +1082,27 @@ static void encode_sb_row(VP9_COMP *cpi, const int x_idx = (i & 1) << 1, y_idx = i & 2; int mb_rate = 0, mb_dist = 0; int sb_rate = INT_MAX, sb_dist; + int splitmodes_used = 0; + int sb32_skip = 0; if (mb_row + y_idx >= cm->mb_rows || mb_col + x_idx >= cm->mb_cols) continue; xd->sb_index = i; - pick_mb_modes(cpi, mb_row + y_idx, mb_col + x_idx, - tp, &mb_rate, &mb_dist); + splitmodes_used = pick_mb_modes(cpi, mb_row + y_idx, mb_col + x_idx, + tp, &mb_rate, &mb_dist); + mb_rate += vp9_cost_bit(cm->sb32_coded, 0); - if (!((( mb_cols & 1) && mb_col + x_idx == mb_cols - 1) || - ((cm->mb_rows & 1) && mb_row + y_idx 
== cm->mb_rows - 1))) { + if (cpi->sf.splitmode_breakout) { + sb32_skip = splitmodes_used; + sb64_skip += splitmodes_used; + } + + if ( !sb32_skip && + !(((cm->mb_cols & 1) && mb_col + x_idx == cm->mb_cols - 1) || + ((cm->mb_rows & 1) && mb_row + y_idx == cm->mb_rows - 1))) { /* Pick a mode assuming that it applies to all 4 of the MBs in the SB */ pick_sb_modes(cpi, mb_row + y_idx, mb_col + x_idx, tp, &sb_rate, &sb_dist); @@ -1147,6 +1120,11 @@ static void encode_sb_row(VP9_COMP *cpi, is_sb[i] = 0; sb32_rate += mb_rate; sb32_dist += mb_dist; + + // If we used 16x16 instead of 32x32 then skip 64x64 (if enabled). + if (cpi->sf.mb16_breakout) { + ++sb64_skip; + } } /* Encode SB using best computed mode(s) */ @@ -1162,7 +1140,8 @@ static void encode_sb_row(VP9_COMP *cpi, memcpy(cm->left_context, &l, sizeof(l)); sb32_rate += vp9_cost_bit(cm->sb64_coded, 0); - if (!((( mb_cols & 3) && mb_col + 3 >= mb_cols) || + if (!sb64_skip && + !(((cm->mb_cols & 3) && mb_col + 3 >= cm->mb_cols) || ((cm->mb_rows & 3) && mb_row + 3 >= cm->mb_rows))) { pick_sb64_modes(cpi, mb_row, mb_col, tp, &sb64_rate, &sb64_dist); sb64_rate += vp9_cost_bit(cm->sb64_coded, 1); @@ -1205,7 +1184,7 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) { // Copy data over into macro block data structures. x->src = *cpi->Source; - xd->pre = cm->yv12_fb[cm->lst_fb_idx]; + xd->pre = cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]]; xd->dst = cm->yv12_fb[cm->new_fb_idx]; // set up frame for intra coded blocks @@ -1239,18 +1218,33 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) { vpx_memset(cm->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols); - xd->fullpixel_mask = 0xffffffff; - if (cm->full_pixel) - xd->fullpixel_mask = 0xfffffff8; + xd->fullpixel_mask = cm->full_pixel ? 0xfffffff8 : 0xffffffff; } +static void switch_lossless_mode(VP9_COMP *cpi, int lossless) { + if (lossless) { + cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4_x8; + cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4_x8; + cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_inv_walsh4x4_1_x8; + cpi->mb.e_mbd.inv_txm4x4 = vp9_short_inv_walsh4x4_x8; + cpi->mb.optimize = 0; + cpi->common.filter_level = 0; + cpi->zbin_mode_boost_enabled = FALSE; + cpi->common.txfm_mode = ONLY_4X4; + } else { + cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4; + cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4; + cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_idct4x4llm_1; + cpi->mb.e_mbd.inv_txm4x4 = vp9_short_idct4x4llm; + } +} + + static void encode_frame_internal(VP9_COMP *cpi) { int mb_row; MACROBLOCK *const x = &cpi->mb; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; - - TOKENEXTRA *tp = cpi->tok; int totalrate; // printf("encode_frame_internal frame %d (%d)\n", @@ -1273,9 +1267,6 @@ static void encode_frame_internal(VP9_COMP *cpi) { totalrate = 0; - // Functions setup for all frame types so we can use MC in AltRef - vp9_setup_interp_filters(xd, cm->mcomp_filter_type, cm); - // Reset frame count of inter 0,0 motion vector usage. 
cpi->inter_zz_count = 0; @@ -1292,16 +1283,21 @@ static void encode_frame_internal(VP9_COMP *cpi) { vp9_zero(cpi->NMVcount); vp9_zero(cpi->coef_counts_4x4); - vp9_zero(cpi->hybrid_coef_counts_4x4); vp9_zero(cpi->coef_counts_8x8); - vp9_zero(cpi->hybrid_coef_counts_8x8); vp9_zero(cpi->coef_counts_16x16); - vp9_zero(cpi->hybrid_coef_counts_16x16); vp9_zero(cpi->coef_counts_32x32); #if CONFIG_NEW_MVREF vp9_zero(cpi->mb_mv_ref_count); #endif + + // force lossless mode when Q0 is selected + cpi->mb.e_mbd.lossless = (cm->base_qindex == 0 && + cm->y1dc_delta_q == 0 && + cm->uvdc_delta_q == 0 && + cm->uvac_delta_q == 0); + switch_lossless_mode(cpi, cpi->mb.e_mbd.lossless); + vp9_frame_init_quantizer(cpi); vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y1dc_delta_q); @@ -1330,12 +1326,20 @@ static void encode_frame_internal(VP9_COMP *cpi) { vpx_usec_timer_start(&emr_timer); { - // For each row of SBs in the frame - for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 4) { - encode_sb_row(cpi, mb_row, &tp, &totalrate); - } + // Take tiles into account and give start/end MB + int tile_col; + TOKENEXTRA *tp = cpi->tok; + + for (tile_col = 0; tile_col < cm->tile_columns; tile_col++) { + TOKENEXTRA *tp_old = tp; - cpi->tok_count = (unsigned int)(tp - cpi->tok); + // For each row of SBs in the frame + vp9_get_tile_col_offsets(cm, tile_col); + for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 4) { + encode_sb_row(cpi, mb_row, &tp, &totalrate); + } + cpi->tok_count[tile_col] = (unsigned int)(tp - tp_old); + } } vpx_usec_timer_mark(&emr_timer); @@ -1388,8 +1392,7 @@ static void reset_skip_txfm_size_mb(VP9_COMP *cpi, const int segment_id = mbmi->segment_id; xd->mode_info_context = mi; - assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) && - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) || + assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) || (cm->mb_no_coeff_skip && mbmi->mb_skip_coeff)); mbmi->txfm_size = txfm_max; } @@ -1413,9 +1416,8 @@ static void set_txfm_flag(MODE_INFO *mi, int mis, int ymbs, int xmbs, int x, y; for (y = 0; y < ymbs; y++) { - for (x = 0; x < xmbs; x++) { + for (x = 0; x < xmbs; x++) mi[y * mis + x].mbmi.txfm_size = txfm_size; - } } } @@ -1433,8 +1435,7 @@ static void reset_skip_txfm_size_sb32(VP9_COMP *cpi, MODE_INFO *mi, const int xmbs = MIN(2, mb_cols_left); xd->mode_info_context = mi; - assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) && - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) || + assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) || (cm->mb_no_coeff_skip && get_skip_flag(mi, mis, ymbs, xmbs))); set_txfm_flag(mi, mis, ymbs, xmbs, txfm_max); } @@ -1454,8 +1455,7 @@ static void reset_skip_txfm_size_sb64(VP9_COMP *cpi, MODE_INFO *mi, const int xmbs = MIN(4, mb_cols_left); xd->mode_info_context = mi; - assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) && - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) || + assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) || (cm->mb_no_coeff_skip && get_skip_flag(mi, mis, ymbs, xmbs))); set_txfm_flag(mi, mis, ymbs, xmbs, txfm_max); } @@ -1526,9 +1526,9 @@ void vp9_encode_frame(VP9_COMP *cpi) { */ if (cpi->common.frame_type == KEY_FRAME) frame_type = 0; - else if (cpi->is_src_frame_alt_ref && cpi->common.refresh_golden_frame) + else if (cpi->is_src_frame_alt_ref && cpi->refresh_golden_frame) frame_type = 3; - else if (cpi->common.refresh_golden_frame || cpi->common.refresh_alt_ref_frame) + else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) frame_type = 1; 
else frame_type = 2; @@ -1549,11 +1549,12 @@ void vp9_encode_frame(VP9_COMP *cpi) { pred_type = HYBRID_PREDICTION; /* transform size (4x4, 8x8, 16x16 or select-per-mb) selection */ -#if CONFIG_LOSSLESS + + cpi->mb.e_mbd.lossless = 0; if (cpi->oxcf.lossless) { txfm_type = ONLY_4X4; + cpi->mb.e_mbd.lossless = 1; } else -#endif /* FIXME (rbultje) * this is a hack (no really), basically to work around the complete * nonsense coefficient cost prediction for keyframes. The probabilities @@ -1671,7 +1672,7 @@ void vp9_encode_frame(VP9_COMP *cpi) { // Update interpolation filter strategy for next frame. if ((cpi->common.frame_type != KEY_FRAME) && (cpi->sf.search_best_filter)) - select_interp_filter_type(cpi); + vp9_select_interp_filter_type(cpi); } else { encode_frame_internal(cpi); } @@ -1683,30 +1684,23 @@ void vp9_setup_block_ptrs(MACROBLOCK *x) { int i; for (r = 0; r < 4; r++) { - for (c = 0; c < 4; c++) { + for (c = 0; c < 4; c++) x->block[r * 4 + c].src_diff = x->src_diff + r * 4 * 16 + c * 4; - } } for (r = 0; r < 2; r++) { - for (c = 0; c < 2; c++) { + for (c = 0; c < 2; c++) x->block[16 + r * 2 + c].src_diff = x->src_diff + 256 + r * 4 * 8 + c * 4; - } } for (r = 0; r < 2; r++) { - for (c = 0; c < 2; c++) { + for (c = 0; c < 2; c++) x->block[20 + r * 2 + c].src_diff = x->src_diff + 320 + r * 4 * 8 + c * 4; - } } - x->block[24].src_diff = x->src_diff + 384; - - - for (i = 0; i < 25; i++) { + for (i = 0; i < 24; i++) x->block[i].coeff = x->coeff + i * 16; - } } void vp9_build_block_offsets(MACROBLOCK *x) { @@ -1995,7 +1989,6 @@ static void update_sb64_skip_coeff_state(VP9_COMP *cpi, } static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t, - int recon_yoffset, int recon_uvoffset, int output_enabled, int mb_row, int mb_col) { VP9_COMMON *const cm = &cpi->common; @@ -2007,8 +2000,8 @@ static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t, assert(!xd->mode_info_context->mbmi.sb_type); #ifdef ENC_DEBUG - enc_debug = (cpi->common.current_video_frame == 46 && - mb_row == 5 && mb_col == 2); + enc_debug = (cpi->common.current_video_frame == 2 && + mb_row == 5 && mb_col == 18); if (enc_debug) printf("Encode MB %d %d output %d\n", mb_row, mb_col, output_enabled); #endif @@ -2086,58 +2079,50 @@ static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t, assert(cm->frame_type != KEY_FRAME); if (mbmi->ref_frame == LAST_FRAME) - ref_fb_idx = cpi->common.lst_fb_idx; + ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx]; else if (mbmi->ref_frame == GOLDEN_FRAME) - ref_fb_idx = cpi->common.gld_fb_idx; + ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx]; else - ref_fb_idx = cpi->common.alt_fb_idx; + ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx]; - xd->pre.y_buffer = cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset; - xd->pre.u_buffer = cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset; - xd->pre.v_buffer = cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset; + setup_pred_block(&xd->pre, + &cpi->common.yv12_fb[ref_fb_idx], + mb_row, mb_col, + &xd->scale_factor[0], &xd->scale_factor_uv[0]); if (mbmi->second_ref_frame > 0) { int second_ref_fb_idx; if (mbmi->second_ref_frame == LAST_FRAME) - second_ref_fb_idx = cpi->common.lst_fb_idx; + second_ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx]; else if (mbmi->second_ref_frame == GOLDEN_FRAME) - second_ref_fb_idx = cpi->common.gld_fb_idx; + second_ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx]; else - second_ref_fb_idx = cpi->common.alt_fb_idx; + second_ref_fb_idx = 
cpi->common.ref_frame_map[cpi->alt_fb_idx]; - xd->second_pre.y_buffer = cpi->common.yv12_fb[second_ref_fb_idx].y_buffer + - recon_yoffset; - xd->second_pre.u_buffer = cpi->common.yv12_fb[second_ref_fb_idx].u_buffer + - recon_uvoffset; - xd->second_pre.v_buffer = cpi->common.yv12_fb[second_ref_fb_idx].v_buffer + - recon_uvoffset; + setup_pred_block(&xd->second_pre, + &cpi->common.yv12_fb[second_ref_fb_idx], + mb_row, mb_col, + &xd->scale_factor[1], &xd->scale_factor_uv[1]); } if (!x->skip) { - vp9_encode_inter16x16(x); + vp9_encode_inter16x16(x, mb_row, mb_col); // Clear mb_skip_coeff if mb_no_coeff_skip is not set if (!cpi->common.mb_no_coeff_skip) mbmi->mb_skip_coeff = 0; } else { - vp9_build_1st_inter16x16_predictors_mb(xd, - xd->dst.y_buffer, - xd->dst.u_buffer, - xd->dst.v_buffer, - xd->dst.y_stride, - xd->dst.uv_stride); - if (xd->mode_info_context->mbmi.second_ref_frame > 0) { - vp9_build_2nd_inter16x16_predictors_mb(xd, - xd->dst.y_buffer, - xd->dst.u_buffer, - xd->dst.v_buffer, - xd->dst.y_stride, - xd->dst.uv_stride); - } + vp9_build_inter16x16_predictors_mb(xd, + xd->dst.y_buffer, + xd->dst.u_buffer, + xd->dst.v_buffer, + xd->dst.y_stride, + xd->dst.uv_stride, + mb_row, mb_col); #if CONFIG_COMP_INTERINTRA_PRED - else if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) { + if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) { vp9_build_interintra_16x16_predictors_mb(xd, xd->dst.y_buffer, xd->dst.u_buffer, @@ -2150,7 +2135,7 @@ static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t, } if (!x->skip) { -#ifdef ENC_DEBUG +#if 0 // def ENC_DEBUG if (enc_debug) { int i, j; printf("\n"); @@ -2227,8 +2212,7 @@ static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t, int segment_id = mbmi->segment_id; if (cpi->common.txfm_mode == TX_MODE_SELECT && !((cpi->common.mb_no_coeff_skip && mbmi->mb_skip_coeff) || - (vp9_segfeature_active(&x->e_mbd, segment_id, SEG_LVL_EOB) && - vp9_get_segdata(&x->e_mbd, segment_id, SEG_LVL_EOB) == 0))) { + (vp9_segfeature_active(&x->e_mbd, segment_id, SEG_LVL_SKIP)))) { assert(mbmi->txfm_size <= TX_16X16); if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED && mbmi->mode != SPLITMV) { @@ -2253,7 +2237,6 @@ static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t, } static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t, - int recon_yoffset, int recon_uvoffset, int output_enabled, int mb_row, int mb_col) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; @@ -2326,37 +2309,37 @@ static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t, assert(cm->frame_type != KEY_FRAME); if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME) - ref_fb_idx = cpi->common.lst_fb_idx; + ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx]; else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME) - ref_fb_idx = cpi->common.gld_fb_idx; + ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx]; else - ref_fb_idx = cpi->common.alt_fb_idx; + ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx]; - xd->pre.y_buffer = cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset; - xd->pre.u_buffer = cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset; - xd->pre.v_buffer = cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset; + setup_pred_block(&xd->pre, + &cpi->common.yv12_fb[ref_fb_idx], + mb_row, mb_col, + &xd->scale_factor[0], &xd->scale_factor_uv[0]); if (xd->mode_info_context->mbmi.second_ref_frame > 0) { int second_ref_fb_idx; if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME) - 
second_ref_fb_idx = cpi->common.lst_fb_idx; + second_ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx]; else if (xd->mode_info_context->mbmi.second_ref_frame == GOLDEN_FRAME) - second_ref_fb_idx = cpi->common.gld_fb_idx; + second_ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx]; else - second_ref_fb_idx = cpi->common.alt_fb_idx; + second_ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx]; - xd->second_pre.y_buffer = cpi->common.yv12_fb[second_ref_fb_idx].y_buffer + - recon_yoffset; - xd->second_pre.u_buffer = cpi->common.yv12_fb[second_ref_fb_idx].u_buffer + - recon_uvoffset; - xd->second_pre.v_buffer = cpi->common.yv12_fb[second_ref_fb_idx].v_buffer + - recon_uvoffset; + setup_pred_block(&xd->second_pre, + &cpi->common.yv12_fb[second_ref_fb_idx], + mb_row, mb_col, + &xd->scale_factor[1], &xd->scale_factor_uv[1]); } vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, - xd->dst.y_stride, xd->dst.uv_stride); + xd->dst.y_stride, xd->dst.uv_stride, + mb_row, mb_col); } if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) { @@ -2465,8 +2448,7 @@ static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t, if (output_enabled) { if (cm->txfm_mode == TX_MODE_SELECT && !((cm->mb_no_coeff_skip && skip[0] && skip[1] && skip[2] && skip[3]) || - (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) && - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) { + (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) { cpi->txfm_count_32x32p[mi->mbmi.txfm_size]++; } else { TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ? @@ -2485,7 +2467,6 @@ static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t, } static void encode_superblock64(VP9_COMP *cpi, TOKENEXTRA **t, - int recon_yoffset, int recon_uvoffset, int output_enabled, int mb_row, int mb_col) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; @@ -2557,40 +2538,37 @@ static void encode_superblock64(VP9_COMP *cpi, TOKENEXTRA **t, assert(cm->frame_type != KEY_FRAME); if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME) - ref_fb_idx = cpi->common.lst_fb_idx; + ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx]; else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME) - ref_fb_idx = cpi->common.gld_fb_idx; + ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx]; else - ref_fb_idx = cpi->common.alt_fb_idx; + ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx]; - xd->pre.y_buffer = - cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset; - xd->pre.u_buffer = - cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset; - xd->pre.v_buffer = - cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset; + setup_pred_block(&xd->pre, + &cpi->common.yv12_fb[ref_fb_idx], + mb_row, mb_col, + &xd->scale_factor[0], &xd->scale_factor_uv[0]); if (xd->mode_info_context->mbmi.second_ref_frame > 0) { int second_ref_fb_idx; if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME) - second_ref_fb_idx = cpi->common.lst_fb_idx; + second_ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx]; else if (xd->mode_info_context->mbmi.second_ref_frame == GOLDEN_FRAME) - second_ref_fb_idx = cpi->common.gld_fb_idx; + second_ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx]; else - second_ref_fb_idx = cpi->common.alt_fb_idx; + second_ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx]; - xd->second_pre.y_buffer = - cpi->common.yv12_fb[second_ref_fb_idx].y_buffer + recon_yoffset; - xd->second_pre.u_buffer = - cpi->common.yv12_fb[second_ref_fb_idx].u_buffer + 
recon_uvoffset; - xd->second_pre.v_buffer = - cpi->common.yv12_fb[second_ref_fb_idx].v_buffer + recon_uvoffset; + setup_pred_block(&xd->second_pre, + &cpi->common.yv12_fb[second_ref_fb_idx], + mb_row, mb_col, + &xd->scale_factor[1], &xd->scale_factor_uv[1]); } vp9_build_inter64x64_predictors_sb(xd, xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, - xd->dst.y_stride, xd->dst.uv_stride); + xd->dst.y_stride, xd->dst.uv_stride, + mb_row, mb_col); } if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) { @@ -2729,8 +2707,7 @@ static void encode_superblock64(VP9_COMP *cpi, TOKENEXTRA **t, skip[4] && skip[5] && skip[6] && skip[7] && skip[8] && skip[9] && skip[10] && skip[11] && skip[12] && skip[13] && skip[14] && skip[15]))) || - (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) && - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) { + (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) { cpi->txfm_count_32x32p[mi->mbmi.txfm_size]++; } else { int x, y; diff --git a/vp9/encoder/vp9_encodeframe.h b/vp9/encoder/vp9_encodeframe.h index 1b056e163..9f13edcec 100644 --- a/vp9/encoder/vp9_encodeframe.h +++ b/vp9/encoder/vp9_encodeframe.h @@ -14,8 +14,8 @@ struct macroblock; -extern void vp9_build_block_offsets(struct macroblock *x); +void vp9_build_block_offsets(struct macroblock *x); -extern void vp9_setup_block_ptrs(struct macroblock *x); +void vp9_setup_block_ptrs(struct macroblock *x); #endif // VP9_ENCODER_VP9_ENCODEFRAME_H_ diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c index ce9a38003..be9c224b3 100644 --- a/vp9/encoder/vp9_encodeintra.c +++ b/vp9/encoder/vp9_encodeintra.c @@ -12,14 +12,11 @@ #include "vp9_rtcd.h" #include "vp9/encoder/vp9_quantize.h" #include "vp9/common/vp9_reconintra.h" -#include "vp9/common/vp9_reconintra4x4.h" #include "vp9/encoder/vp9_encodemb.h" #include "vp9/common/vp9_invtrans.h" #include "vp9/encoder/vp9_encodeintra.h" int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) { - int i; - int intra_pred_var = 0; MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi; (void) cpi; @@ -30,15 +27,15 @@ int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) { vp9_encode_intra16x16mby(x); } else { + int i; + for (i = 0; i < 16; i++) { x->e_mbd.block[i].bmi.as_mode.first = B_DC_PRED; vp9_encode_intra4x4block(x, i); } } - intra_pred_var = vp9_get_mb_ss(x->src_diff); - - return intra_pred_var; + return vp9_get_mb_ss(x->src_diff); } void vp9_encode_intra4x4block(MACROBLOCK *x, int ib) { @@ -50,17 +47,17 @@ void vp9_encode_intra4x4block(MACROBLOCK *x, int ib) { b->bmi.as_mode.context = vp9_find_bpred_context(b); #endif - vp9_intra4x4_predict(b, b->bmi.as_mode.first, b->predictor); + vp9_intra4x4_predict(&x->e_mbd, b, b->bmi.as_mode.first, b->predictor); vp9_subtract_b(be, b, 16); tx_type = get_tx_type_4x4(&x->e_mbd, b); if (tx_type != DCT_DCT) { - vp9_fht(be->src_diff, 32, be->coeff, tx_type, 4); - vp9_ht_quantize_b_4x4(be, b, tx_type); - vp9_ihtllm(b->dqcoeff, b->diff, 32, tx_type, 4, b->eob); + vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type); + vp9_ht_quantize_b_4x4(x, ib, tx_type); + vp9_short_iht4x4(b->dqcoeff, b->diff, 16, tx_type); } else { - x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32); - x->quantize_b_4x4(be, b) ; + x->fwd_txm4x4(be->src_diff, be->coeff, 32); + x->quantize_b_4x4(x, ib); vp9_inverse_transform_b_4x4(&x->e_mbd, ib, 32); } @@ -72,7 +69,6 @@ void vp9_encode_intra4x4mby(MACROBLOCK *mb) { for (i = 0; i < 16; i++) vp9_encode_intra4x4block(mb, i); - return; } void 
vp9_encode_intra16x16mby(MACROBLOCK *x) { @@ -84,24 +80,28 @@ void vp9_encode_intra16x16mby(MACROBLOCK *x) { vp9_subtract_mby(x->src_diff, *(b->base_src), xd->predictor, b->src_stride); - if (tx_size == TX_16X16) { - vp9_transform_mby_16x16(x); - vp9_quantize_mby_16x16(x); - if (x->optimize) - vp9_optimize_mby_16x16(x); - vp9_inverse_transform_mby_16x16(xd); - } else if (tx_size == TX_8X8) { - vp9_transform_mby_8x8(x); - vp9_quantize_mby_8x8(x); - if (x->optimize) - vp9_optimize_mby_8x8(x); - vp9_inverse_transform_mby_8x8(xd); - } else { - vp9_transform_mby_4x4(x); - vp9_quantize_mby_4x4(x); - if (x->optimize) - vp9_optimize_mby_4x4(x); - vp9_inverse_transform_mby_4x4(xd); + switch (tx_size) { + case TX_16X16: + vp9_transform_mby_16x16(x); + vp9_quantize_mby_16x16(x); + if (x->optimize) + vp9_optimize_mby_16x16(x); + vp9_inverse_transform_mby_16x16(xd); + break; + case TX_8X8: + vp9_transform_mby_8x8(x); + vp9_quantize_mby_8x8(x); + if (x->optimize) + vp9_optimize_mby_8x8(x); + vp9_inverse_transform_mby_8x8(xd); + break; + default: + vp9_transform_mby_4x4(x); + vp9_quantize_mby_4x4(x); + if (x->optimize) + vp9_optimize_mby_4x4(x); + vp9_inverse_transform_mby_4x4(xd); + break; } vp9_recon_mby(xd); @@ -116,19 +116,22 @@ void vp9_encode_intra16x16mbuv(MACROBLOCK *x) { vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer, xd->predictor, x->src.uv_stride); - if (tx_size == TX_4X4) { - vp9_transform_mbuv_4x4(x); - vp9_quantize_mbuv_4x4(x); - if (x->optimize) - vp9_optimize_mbuv_4x4(x); - vp9_inverse_transform_mbuv_4x4(xd); - } else /* 16x16 or 8x8 */ { - vp9_transform_mbuv_8x8(x); - vp9_quantize_mbuv_8x8(x); - if (x->optimize) - vp9_optimize_mbuv_8x8(x); - vp9_inverse_transform_mbuv_8x8(xd); - } + switch (tx_size) { + case TX_4X4: + vp9_transform_mbuv_4x4(x); + vp9_quantize_mbuv_4x4(x); + if (x->optimize) + vp9_optimize_mbuv_4x4(x); + vp9_inverse_transform_mbuv_4x4(xd); + break; + default: // 16x16 or 8x8 + vp9_transform_mbuv_8x8(x); + vp9_quantize_mbuv_8x8(x); + if (x->optimize) + vp9_optimize_mbuv_8x8(x); + vp9_inverse_transform_mbuv_8x8(xd); + break; + } vp9_recon_intra_mbuv(xd); } @@ -141,7 +144,7 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) { int i; TX_TYPE tx_type; - vp9_intra8x8_predict(b, b->bmi.as_mode.first, b->predictor); + vp9_intra8x8_predict(xd, b, b->bmi.as_mode.first, b->predictor); // generate residual blocks vp9_subtract_4b_c(be, b, 16); @@ -150,14 +153,13 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) { tx_type = get_tx_type_8x8(xd, &xd->block[ib]); if (tx_type != DCT_DCT) { - vp9_fht(be->src_diff, 32, (x->block + idx)->coeff, - tx_type, 8); - x->quantize_b_8x8(x->block + idx, xd->block + idx); - vp9_ihtllm(xd->block[idx].dqcoeff, xd->block[ib].diff, 32, - tx_type, 8, xd->block[idx].eob); + vp9_short_fht8x8(be->src_diff, (x->block + idx)->coeff, 16, tx_type); + x->quantize_b_8x8(x, idx); + vp9_short_iht8x8(xd->block[idx].dqcoeff, xd->block[ib].diff, + 16, tx_type); } else { - x->vp9_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32); - x->quantize_b_8x8(x->block + idx, xd->block + idx); + x->fwd_txm8x8(be->src_diff, (x->block + idx)->coeff, 32); + x->quantize_b_8x8(x, idx); vp9_short_idct8x8(xd->block[idx].dqcoeff, xd->block[ib].diff, 32); } } else { @@ -166,12 +168,18 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) { be = &x->block[ib + iblock[i]]; tx_type = get_tx_type_4x4(xd, b); if (tx_type != DCT_DCT) { - vp9_fht_c(be->src_diff, 32, be->coeff, tx_type, 4); - vp9_ht_quantize_b_4x4(be, b, tx_type); - vp9_ihtllm(b->dqcoeff, b->diff, 32, tx_type, 
4, b->eob); + vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type); + vp9_ht_quantize_b_4x4(x, ib + iblock[i], tx_type); + vp9_short_iht4x4(b->dqcoeff, b->diff, 16, tx_type); + } else if (!(i & 1) && get_tx_type_4x4(xd, b + 1) == DCT_DCT) { + x->fwd_txm8x4(be->src_diff, be->coeff, 32); + x->quantize_b_4x4_pair(x, ib + iblock[i], ib + iblock[i] + 1); + vp9_inverse_transform_b_4x4(xd, ib + iblock[i], 32); + vp9_inverse_transform_b_4x4(xd, ib + iblock[i] + 1, 32); + i++; } else { - x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32); - x->quantize_b_4x4(be, b); + x->fwd_txm4x4(be->src_diff, be->coeff, 32); + x->quantize_b_4x4(x, ib + iblock[i]); vp9_inverse_transform_b_4x4(xd, ib + iblock[i], 32); } } @@ -186,25 +194,22 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) { } void vp9_encode_intra8x8mby(MACROBLOCK *x) { - int i, ib; + int i; - for (i = 0; i < 4; i++) { - ib = vp9_i8x8_block[i]; - vp9_encode_intra8x8(x, ib); - } + for (i = 0; i < 4; i++) + vp9_encode_intra8x8(x, vp9_i8x8_block[i]); } -static void encode_intra_uv4x4(MACROBLOCK *x, int ib, - int mode) { +static void encode_intra_uv4x4(MACROBLOCK *x, int ib, int mode) { BLOCKD *b = &x->e_mbd.block[ib]; BLOCK *be = &x->block[ib]; - vp9_intra_uv4x4_predict(b, mode, b->predictor); + vp9_intra_uv4x4_predict(&x->e_mbd, b, mode, b->predictor); vp9_subtract_b(be, b, 8); - x->vp9_short_fdct4x4(be->src_diff, be->coeff, 16); - x->quantize_b_4x4(be, b); + x->fwd_txm4x4(be->src_diff, be->coeff, 16); + x->quantize_b_4x4(x, ib); vp9_inverse_transform_b_4x4(&x->e_mbd, ib, 16); vp9_recon_uv_b_c(b->predictor, b->diff, *(b->base_dst) + b->dst, @@ -212,17 +217,13 @@ static void encode_intra_uv4x4(MACROBLOCK *x, int ib, } void vp9_encode_intra8x8mbuv(MACROBLOCK *x) { - int i, ib, mode; - BLOCKD *b; + int i; for (i = 0; i < 4; i++) { - ib = vp9_i8x8_block[i]; - b = &x->e_mbd.block[ib]; - mode = b->bmi.as_mode.first; - - /*u */ - encode_intra_uv4x4(x, i + 16, mode); - /*v */ - encode_intra_uv4x4(x, i + 20, mode); + BLOCKD *b = &x->e_mbd.block[vp9_i8x8_block[i]]; + int mode = b->bmi.as_mode.first; + + encode_intra_uv4x4(x, i + 16, mode); // u + encode_intra_uv4x4(x, i + 20, mode); // v } } diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 45278a71b..62f1a2a30 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -29,9 +29,8 @@ void vp9_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch) { int r, c; for (r = 0; r < 4; r++) { - for (c = 0; c < 4; c++) { + for (c = 0; c < 4; c++) diff_ptr[c] = src_ptr[c] - pred_ptr[c]; - } diff_ptr += pitch; pred_ptr += pitch; @@ -47,9 +46,9 @@ void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch) { int r, c; for (r = 0; r < 8; r++) { - for (c = 0; c < 8; c++) { + for (c = 0; c < 8; c++) diff_ptr[c] = src_ptr[c] - pred_ptr[c]; - } + diff_ptr += pitch; pred_ptr += pitch; src_ptr += src_stride; @@ -65,9 +64,8 @@ void vp9_subtract_mbuv_s_c(int16_t *diff, const uint8_t *usrc, int r, c; for (r = 0; r < 8; r++) { - for (c = 0; c < 8; c++) { + for (c = 0; c < 8; c++) udiff[c] = usrc[c] - upred[c]; - } udiff += 8; upred += dst_stride; @@ -98,9 +96,8 @@ void vp9_subtract_mby_s_c(int16_t *diff, const uint8_t *src, int src_stride, int r, c; for (r = 0; r < 16; r++) { - for (c = 0; c < 16; c++) { + for (c = 0; c < 16; c++) diff[c] = src[c] - pred[c]; - } diff += 16; pred += dst_stride; @@ -113,9 +110,8 @@ void vp9_subtract_sby_s_c(int16_t *diff, const uint8_t *src, int src_stride, int r, c; for (r = 0; r < 32; r++) { - for (c = 0; c < 32; c++) { + for (c = 0; c < 32; c++) diff[c] = src[c] - 
pred[c]; - } diff += 32; pred += dst_stride; @@ -132,9 +128,8 @@ void vp9_subtract_sbuv_s_c(int16_t *diff, const uint8_t *usrc, int r, c; for (r = 0; r < 16; r++) { - for (c = 0; c < 16; c++) { + for (c = 0; c < 16; c++) udiff[c] = usrc[c] - upred[c]; - } udiff += 16; upred += dst_stride; @@ -142,9 +137,8 @@ void vp9_subtract_sbuv_s_c(int16_t *diff, const uint8_t *usrc, } for (r = 0; r < 16; r++) { - for (c = 0; c < 16; c++) { + for (c = 0; c < 16; c++) vdiff[c] = vsrc[c] - vpred[c]; - } vdiff += 16; vpred += dst_stride; @@ -166,52 +160,29 @@ static void subtract_mb(MACROBLOCK *x) { x->e_mbd.predictor, x->src.uv_stride); } -static void build_dcblock_4x4(MACROBLOCK *x) { - int16_t *src_diff_ptr = &x->src_diff[384]; - int i; - - for (i = 0; i < 16; i++) { - src_diff_ptr[i] = x->coeff[i * 16]; - x->coeff[i * 16] = 0; - } -} - void vp9_transform_mby_4x4(MACROBLOCK *x) { int i; MACROBLOCKD *xd = &x->e_mbd; - int has_2nd_order = get_2nd_order_usage(xd); for (i = 0; i < 16; i++) { BLOCK *b = &x->block[i]; TX_TYPE tx_type = get_tx_type_4x4(xd, &xd->block[i]); if (tx_type != DCT_DCT) { - assert(has_2nd_order == 0); - vp9_fht_c(b->src_diff, 32, b->coeff, tx_type, 4); + vp9_short_fht4x4(b->src_diff, b->coeff, 16, tx_type); + } else if (!(i & 1) && get_tx_type_4x4(xd, &xd->block[i + 1]) == DCT_DCT) { + x->fwd_txm8x4(x->block[i].src_diff, x->block[i].coeff, 32); + i++; } else { - x->vp9_short_fdct4x4(&x->block[i].src_diff[0], - &x->block[i].coeff[0], 32); + x->fwd_txm4x4(x->block[i].src_diff, x->block[i].coeff, 32); } } - - if (has_2nd_order) { - // build dc block from 16 y dc values - build_dcblock_4x4(x); - - // do 2nd order transform on the dc block - x->short_walsh4x4(&x->block[24].src_diff[0], - &x->block[24].coeff[0], 8); - } else { - vpx_memset(x->block[24].coeff, 0, 16 * sizeof(x->block[24].coeff[0])); - } } void vp9_transform_mbuv_4x4(MACROBLOCK *x) { int i; - for (i = 16; i < 24; i += 2) { - x->vp9_short_fdct8x4(&x->block[i].src_diff[0], - &x->block[i].coeff[0], 16); - } + for (i = 16; i < 24; i += 2) + x->fwd_txm8x4(x->block[i].src_diff, x->block[i].coeff, 16); } static void transform_mb_4x4(MACROBLOCK *x) { @@ -219,71 +190,36 @@ static void transform_mb_4x4(MACROBLOCK *x) { vp9_transform_mbuv_4x4(x); } -static void build_dcblock_8x8(MACROBLOCK *x) { - int16_t *src_diff_ptr = x->block[24].src_diff; - int i; - - for (i = 0; i < 16; i++) { - src_diff_ptr[i] = 0; - } - src_diff_ptr[0] = x->coeff[0 * 16]; - src_diff_ptr[1] = x->coeff[4 * 16]; - src_diff_ptr[4] = x->coeff[8 * 16]; - src_diff_ptr[8] = x->coeff[12 * 16]; - x->coeff[0 * 16] = 0; - x->coeff[4 * 16] = 0; - x->coeff[8 * 16] = 0; - x->coeff[12 * 16] = 0; -} - void vp9_transform_mby_8x8(MACROBLOCK *x) { int i; MACROBLOCKD *xd = &x->e_mbd; TX_TYPE tx_type; - int has_2nd_order = get_2nd_order_usage(xd); for (i = 0; i < 9; i += 8) { BLOCK *b = &x->block[i]; tx_type = get_tx_type_8x8(xd, &xd->block[i]); if (tx_type != DCT_DCT) { - assert(has_2nd_order == 0); - vp9_fht_c(b->src_diff, 32, b->coeff, tx_type, 8); + vp9_short_fht8x8(b->src_diff, b->coeff, 16, tx_type); } else { - x->vp9_short_fdct8x8(&x->block[i].src_diff[0], - &x->block[i].coeff[0], 32); + x->fwd_txm8x8(x->block[i].src_diff, x->block[i].coeff, 32); } } for (i = 2; i < 11; i += 8) { BLOCK *b = &x->block[i]; tx_type = get_tx_type_8x8(xd, &xd->block[i]); if (tx_type != DCT_DCT) { - assert(has_2nd_order == 0); - vp9_fht_c(b->src_diff, 32, (b + 2)->coeff, tx_type, 8); + vp9_short_fht8x8(b->src_diff, (b + 2)->coeff, 16, tx_type); } else { - 
x->vp9_short_fdct8x8(&x->block[i].src_diff[0], - &x->block[i + 2].coeff[0], 32); + x->fwd_txm8x8(x->block[i].src_diff, x->block[i + 2].coeff, 32); } } - - if (has_2nd_order) { - // build dc block from 2x2 y dc values - build_dcblock_8x8(x); - - // do 2nd order transform on the dc block - x->short_fhaar2x2(&x->block[24].src_diff[0], - &x->block[24].coeff[0], 8); - } else { - vpx_memset(x->block[24].coeff, 0, 16 * sizeof(x->block[24].coeff[0])); - } } void vp9_transform_mbuv_8x8(MACROBLOCK *x) { int i; - for (i = 16; i < 24; i += 4) { - x->vp9_short_fdct8x8(&x->block[i].src_diff[0], - &x->block[i].coeff[0], 16); - } + for (i = 16; i < 24; i += 4) + x->fwd_txm8x8(x->block[i].src_diff, x->block[i].coeff, 16); } void vp9_transform_mb_8x8(MACROBLOCK *x) { @@ -297,10 +233,9 @@ void vp9_transform_mby_16x16(MACROBLOCK *x) { TX_TYPE tx_type = get_tx_type_16x16(xd, &xd->block[0]); vp9_clear_system_state(); if (tx_type != DCT_DCT) { - vp9_fht_c(b->src_diff, 32, b->coeff, tx_type, 16); + vp9_short_fht16x16(b->src_diff, b->coeff, 16, tx_type); } else { - x->vp9_short_fdct16x16(&x->block[0].src_diff[0], - &x->block[0].coeff[0], 32); + x->fwd_txm16x16(x->block[0].src_diff, x->block[0].coeff, 32); } } @@ -317,10 +252,8 @@ void vp9_transform_sby_32x32(MACROBLOCK *x) { void vp9_transform_sbuv_16x16(MACROBLOCK *x) { SUPERBLOCK * const x_sb = &x->sb_coeff_data; vp9_clear_system_state(); - x->vp9_short_fdct16x16(x_sb->src_diff + 1024, - x_sb->coeff + 1024, 32); - x->vp9_short_fdct16x16(x_sb->src_diff + 1280, - x_sb->coeff + 1280, 32); + x->fwd_txm16x16(x_sb->src_diff + 1024, x_sb->coeff + 1024, 32); + x->fwd_txm16x16(x_sb->src_diff + 1280, x_sb->coeff + 1280, 32); } #define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF ) @@ -338,13 +271,10 @@ struct vp9_token_state { // TODO: experiments to find optimal multiple numbers #define Y1_RD_MULT 4 #define UV_RD_MULT 2 -#define Y2_RD_MULT 4 static const int plane_rd_mult[4] = { Y1_RD_MULT, - Y2_RD_MULT, UV_RD_MULT, - Y1_RD_MULT }; #define UPDATE_RD_COST()\ @@ -357,34 +287,39 @@ static const int plane_rd_mult[4] = { }\ } +// This function is a place holder for now but may ultimately need +// to scan previous tokens to work out the correct context. 
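The trellis in optimize_b() below walks the quantized coefficients and, at each position, decides between keeping the hard-rounded level and stepping it down by one (or ending the block early), carrying rate and distortion forward Viterbi-style and keeping whichever path is cheaper. As a rough standalone sketch of just the per-position cost comparison — the rate table, quantizer step and lambda below are made up, and the real trellis also threads token contexts and end-of-block placement through the lattice — consider:

#include <stdio.h>
#include <stdlib.h>

/* Toy per-coefficient RD quantizer: compares the hard-rounded level
 * against level - 1 under cost = distortion + lambda * rate, the same
 * comparison each trellis node makes. Contexts and EOB handling from
 * the real optimize_b() are omitted; all constants are illustrative. */
static int toy_rate(int level) {
  return level == 0 ? 1 : 4 + 2 * level;  /* pretend token cost in bits */
}

int main(void) {
  const int coeff[4] = {23, -9, 4, 1};  /* residual transform coefficients */
  const int q = 8;                      /* flat quantizer step size */
  const long lambda = 10;               /* rate-distortion trade-off */
  for (int i = 0; i < 4; i++) {
    const int sign = coeff[i] < 0 ? -1 : 1;
    const int mag = abs(coeff[i]);
    const int lvl = (mag + q / 2) / q;  /* hard rounding */
    int best = lvl;
    long best_cost = 0;
    for (int cand = lvl; cand >= lvl - 1 && cand >= 0; cand--) {
      const long err = mag - cand * q;
      const long cost = err * err + lambda * toy_rate(cand);
      if (cand == lvl || cost < best_cost) {
        best_cost = cost;
        best = cand;
      }
    }
    printf("coeff %3d -> level %2d (dequantized %3d)\n",
           coeff[i], sign * best, sign * best * q);
  }
  return 0;
}

trellis_get_coeff_context(), defined next, supplies the token context that indexes the real per-context token_costs tables during that search.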
+static int trellis_get_coeff_context(int token) { + int recent_energy = 0; + return vp9_get_coef_context(&recent_energy, token); +} + static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, int tx_size) { + const int ref = mb->e_mbd.mode_info_context->mbmi.ref_frame != INTRA_FRAME; + MACROBLOCKD *const xd = &mb->e_mbd; BLOCK *b = &mb->block[i]; - BLOCKD *d = &mb->e_mbd.block[i]; + BLOCKD *d = &xd->block[i]; vp9_token_state tokens[257][2]; unsigned best_index[257][2]; const int16_t *dequant_ptr = d->dequant, *coeff_ptr = b->coeff; int16_t *qcoeff_ptr = d->qcoeff; int16_t *dqcoeff_ptr = d->dqcoeff; - int eob = d->eob, final_eob, sz = 0; - int i0 = (type == PLANE_TYPE_Y_NO_DC); + int eob = xd->eobs[i], final_eob, sz = 0; + const int i0 = 0; int rc, x, next; int64_t rdmult, rddiv, rd_cost0, rd_cost1; int rate0, rate1, error0, error1, t0, t1; int best, band, pt; int err_mult = plane_rd_mult[type]; int default_eob; - int const *scan, *bands; -#if CONFIG_NEWCOEFCONTEXT - const int *neighbors; -#endif + int const *scan; switch (tx_size) { default: case TX_4X4: scan = vp9_default_zig_zag1d_4x4; - bands = vp9_coef_bands_4x4; default_eob = 16; // TODO: this isn't called (for intra4x4 modes), but will be left in // since it could be used later @@ -411,18 +346,13 @@ static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type, break; case TX_8X8: scan = vp9_default_zig_zag1d_8x8; - bands = vp9_coef_bands_8x8; default_eob = 64; break; case TX_16X16: scan = vp9_default_zig_zag1d_16x16; - bands = vp9_coef_bands_16x16; default_eob = 256; break; } -#if CONFIG_NEWCOEFCONTEXT - neighbors = vp9_get_coef_neighbors_handle(scan); -#endif /* Now set up a Viterbi trellis to evaluate alternative roundings. */ rdmult = mb->rdmult * err_mult; @@ -454,17 +384,12 @@ static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type, t0 = (vp9_dct_value_tokens_ptr + x)->Token; /* Consider both possible successor states. */ if (next < default_eob) { - band = bands[i + 1]; - pt = vp9_prev_token_class[t0]; -#if CONFIG_NEWCOEFCONTEXT - if (NEWCOEFCONTEXT_BAND_COND(band)) - pt = vp9_get_coef_neighbor_context( - qcoeff_ptr, i0, neighbors, scan[i + 1]); -#endif + band = get_coef_band(tx_size, i + 1); + pt = trellis_get_coeff_context(t0); rate0 += - mb->token_costs[tx_size][type][band][pt][tokens[next][0].token]; + mb->token_costs[tx_size][type][ref][band][pt][tokens[next][0].token]; rate1 += - mb->token_costs[tx_size][type][band][pt][tokens[next][1].token]; + mb->token_costs[tx_size][type][ref][band][pt][tokens[next][1].token]; } UPDATE_RD_COST(); /* And pick the best. 
*/ @@ -506,37 +431,15 @@ static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type, t0 = t1 = (vp9_dct_value_tokens_ptr + x)->Token; } if (next < default_eob) { - band = bands[i + 1]; + band = get_coef_band(tx_size, i + 1); if (t0 != DCT_EOB_TOKEN) { -#if CONFIG_NEWCOEFCONTEXT - int tmp = qcoeff_ptr[scan[i]]; - qcoeff_ptr[scan[i]] = x; - if (NEWCOEFCONTEXT_BAND_COND(band)) - pt = vp9_get_coef_neighbor_context( - qcoeff_ptr, i0, neighbors, scan[i + 1]); - else - pt = vp9_prev_token_class[t0]; - qcoeff_ptr[scan[i]] = tmp; -#else - pt = vp9_prev_token_class[t0]; -#endif - rate0 += mb->token_costs[tx_size][type][band][pt][ + pt = trellis_get_coeff_context(t0); + rate0 += mb->token_costs[tx_size][type][ref][band][pt][ tokens[next][0].token]; } if (t1 != DCT_EOB_TOKEN) { -#if CONFIG_NEWCOEFCONTEXT - int tmp = qcoeff_ptr[scan[i]]; - qcoeff_ptr[scan[i]] = x; - if (NEWCOEFCONTEXT_BAND_COND(band)) - pt = vp9_get_coef_neighbor_context( - qcoeff_ptr, i0, neighbors, scan[i + 1]); - else - pt = vp9_prev_token_class[t1]; - qcoeff_ptr[scan[i]] = tmp; -#else - pt = vp9_prev_token_class[t1]; -#endif - rate1 += mb->token_costs[tx_size][type][band][pt][ + pt = trellis_get_coeff_context(t1); + rate1 += mb->token_costs[tx_size][type][ref][band][pt][ tokens[next][1].token]; } } @@ -563,16 +466,18 @@ static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type, * add a new trellis node, but we do need to update the costs. */ else { - band = bands[i + 1]; + band = get_coef_band(tx_size, i + 1); t0 = tokens[next][0].token; t1 = tokens[next][1].token; /* Update the cost of each path if we're past the EOB token. */ if (t0 != DCT_EOB_TOKEN) { - tokens[next][0].rate += mb->token_costs[tx_size][type][band][0][t0]; + tokens[next][0].rate += + mb->token_costs[tx_size][type][ref][band][0][t0]; tokens[next][0].token = ZERO_TOKEN; } if (t1 != DCT_EOB_TOKEN) { - tokens[next][1].rate += mb->token_costs[tx_size][type][band][0][t1]; + tokens[next][1].rate += + mb->token_costs[tx_size][type][ref][band][0][t1]; tokens[next][1].token = ZERO_TOKEN; } /* Don't update next, because we didn't add a new node. */ @@ -580,7 +485,7 @@ static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type, } /* Now pick the best path through the whole trellis. */ - band = bands[i + 1]; + band = get_coef_band(tx_size, i + 1); VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l); rate0 = tokens[next][0].rate; rate1 = tokens[next][1].rate; @@ -588,8 +493,8 @@ static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type, error1 = tokens[next][1].error; t0 = tokens[next][0].token; t1 = tokens[next][1].token; - rate0 += mb->token_costs[tx_size][type][band][pt][t0]; - rate1 += mb->token_costs[tx_size][type][band][pt][t1]; + rate0 += mb->token_costs[tx_size][type][ref][band][pt][t0]; + rate1 += mb->token_costs[tx_size][type][ref][band][pt][t1]; UPDATE_RD_COST(); best = rd_cost1 < rd_cost0; final_eob = i0 - 1; @@ -606,81 +511,12 @@ static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type, } final_eob++; - d->eob = final_eob; - *a = *l = (d->eob > !type); -} - -/************************************************************************** -our inverse hadamard transform effectively is weighted sum of all 16 inputs -with weight either 1 or -1. It has a last stage scaling of (sum+1)>>2. And -dc only idct is (dc+16)>>5. So if all the sums are between -65 and 63 the -output after inverse wht and idct will be all zero. A sum of absolute value -smaller than 65 guarantees all 16 different (+1/-1) weighted sums in wht -fall between -65 and +65. 
-**************************************************************************/ -#define SUM_2ND_COEFF_THRESH 65 - -static void check_reset_2nd_coeffs(MACROBLOCKD *xd, - ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) { - int sum = 0; - int i; - BLOCKD *bd = &xd->block[24]; - if (bd->dequant[0] >= SUM_2ND_COEFF_THRESH - && bd->dequant[1] >= SUM_2ND_COEFF_THRESH) - return; - - for (i = 0; i < bd->eob; i++) { - int coef = bd->dqcoeff[vp9_default_zig_zag1d_4x4[i]]; - sum += (coef >= 0) ? coef : -coef; - if (sum >= SUM_2ND_COEFF_THRESH) - return; - } - - if (sum < SUM_2ND_COEFF_THRESH) { - for (i = 0; i < bd->eob; i++) { - int rc = vp9_default_zig_zag1d_4x4[i]; - bd->qcoeff[rc] = 0; - bd->dqcoeff[rc] = 0; - } - bd->eob = 0; - *a = *l = (bd->eob != 0); - } -} - -#define SUM_2ND_COEFF_THRESH_8X8 32 -static void check_reset_8x8_2nd_coeffs(MACROBLOCKD *xd, - ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) { - int sum = 0; - BLOCKD *bd = &xd->block[24]; - int coef; - - coef = bd->dqcoeff[0]; - sum += (coef >= 0) ? coef : -coef; - coef = bd->dqcoeff[1]; - sum += (coef >= 0) ? coef : -coef; - coef = bd->dqcoeff[4]; - sum += (coef >= 0) ? coef : -coef; - coef = bd->dqcoeff[8]; - sum += (coef >= 0) ? coef : -coef; - - if (sum < SUM_2ND_COEFF_THRESH_8X8) { - bd->qcoeff[0] = 0; - bd->dqcoeff[0] = 0; - bd->qcoeff[1] = 0; - bd->dqcoeff[1] = 0; - bd->qcoeff[4] = 0; - bd->dqcoeff[4] = 0; - bd->qcoeff[8] = 0; - bd->dqcoeff[8] = 0; - bd->eob = 0; - *a = *l = (bd->eob != 0); - } + xd->eobs[d - xd->block] = final_eob; + *a = *l = (final_eob > 0); } void vp9_optimize_mby_4x4(MACROBLOCK *x) { int b; - PLANE_TYPE type; - int has_2nd_order; ENTROPY_CONTEXT_PLANES t_above, t_left; ENTROPY_CONTEXT *ta; ENTROPY_CONTEXT *tl; @@ -694,25 +530,11 @@ void vp9_optimize_mby_4x4(MACROBLOCK *x) { ta = (ENTROPY_CONTEXT *)&t_above; tl = (ENTROPY_CONTEXT *)&t_left; - has_2nd_order = get_2nd_order_usage(&x->e_mbd); - - type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC; - for (b = 0; b < 16; b++) { - optimize_b(x, b, type, + optimize_b(x, b, PLANE_TYPE_Y_WITH_DC, ta + vp9_block2above[TX_4X4][b], tl + vp9_block2left[TX_4X4][b], TX_4X4); } - - if (has_2nd_order) { - b = 24; - optimize_b(x, b, PLANE_TYPE_Y2, - ta + vp9_block2above[TX_4X4][b], - tl + vp9_block2left[TX_4X4][b], TX_4X4); - check_reset_2nd_coeffs(&x->e_mbd, - ta + vp9_block2above[TX_4X4][b], - tl + vp9_block2left[TX_4X4][b]); - } } void vp9_optimize_mbuv_4x4(MACROBLOCK *x) { @@ -744,11 +566,9 @@ static void optimize_mb_4x4(MACROBLOCK *x) { void vp9_optimize_mby_8x8(MACROBLOCK *x) { int b; - PLANE_TYPE type; ENTROPY_CONTEXT_PLANES t_above, t_left; ENTROPY_CONTEXT *ta; ENTROPY_CONTEXT *tl; - int has_2nd_order = get_2nd_order_usage(&x->e_mbd); if (!x->e_mbd.above_context || !x->e_mbd.left_context) return; @@ -758,28 +578,15 @@ void vp9_optimize_mby_8x8(MACROBLOCK *x) { ta = (ENTROPY_CONTEXT *)&t_above; tl = (ENTROPY_CONTEXT *)&t_left; - type = has_2nd_order ? 
PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC; for (b = 0; b < 16; b += 4) { ENTROPY_CONTEXT *const a = ta + vp9_block2above[TX_8X8][b]; ENTROPY_CONTEXT *const l = tl + vp9_block2left[TX_8X8][b]; -#if CONFIG_CNVCONTEXT ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0; ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0; -#else - ENTROPY_CONTEXT above_ec = a[0]; - ENTROPY_CONTEXT left_ec = l[0]; -#endif - optimize_b(x, b, type, &above_ec, &left_ec, TX_8X8); + optimize_b(x, b, PLANE_TYPE_Y_WITH_DC, &above_ec, &left_ec, TX_8X8); a[1] = a[0] = above_ec; l[1] = l[0] = left_ec; } - - // 8x8 always have 2nd order block - if (has_2nd_order) { - check_reset_8x8_2nd_coeffs(&x->e_mbd, - ta + vp9_block2above[TX_8X8][24], - tl + vp9_block2left[TX_8X8][24]); - } } void vp9_optimize_mbuv_8x8(MACROBLOCK *x) { @@ -793,13 +600,8 @@ void vp9_optimize_mbuv_8x8(MACROBLOCK *x) { for (b = 16; b < 24; b += 4) { ENTROPY_CONTEXT *const a = ta + vp9_block2above[TX_8X8][b]; ENTROPY_CONTEXT *const l = tl + vp9_block2left[TX_8X8][b]; -#if CONFIG_CNVCONTEXT ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0; ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0; -#else - ENTROPY_CONTEXT above_ec = a[0]; - ENTROPY_CONTEXT left_ec = l[0]; -#endif optimize_b(x, b, PLANE_TYPE_UV, &above_ec, &left_ec, TX_8X8); } } @@ -817,13 +619,8 @@ void vp9_optimize_mby_16x16(MACROBLOCK *x) { if (!t_above || !t_left) return; -#if CONFIG_CNVCONTEXT ta = (t_above->y1[0] + t_above->y1[1] + t_above->y1[2] + t_above->y1[3]) != 0; tl = (t_left->y1[0] + t_left->y1[1] + t_left->y1[2] + t_left->y1[3]) != 0; -#else - ta = t_above->y1[0]; - tl = t_left->y1[0]; -#endif optimize_b(x, 0, PLANE_TYPE_Y_WITH_DC, &ta, &tl, TX_16X16); } @@ -871,21 +668,21 @@ void vp9_fidct_mb(MACROBLOCK *x) { } } -void vp9_encode_inter16x16(MACROBLOCK *x) { +void vp9_encode_inter16x16(MACROBLOCK *x, int mb_row, int mb_col) { MACROBLOCKD *const xd = &x->e_mbd; - vp9_build_inter_predictors_mb(xd); + vp9_build_inter_predictors_mb(xd, mb_row, mb_col); subtract_mb(x); vp9_fidct_mb(x); vp9_recon_mb(xd); } /* this function is used by first pass only */ -void vp9_encode_inter16x16y(MACROBLOCK *x) { +void vp9_encode_inter16x16y(MACROBLOCK *x, int mb_row, int mb_col) { MACROBLOCKD *xd = &x->e_mbd; BLOCK *b = &x->block[0]; - vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0); + vp9_build_inter16x16_predictors_mby(xd, xd->predictor, 16, mb_row, mb_col); vp9_subtract_mby(x->src_diff, *(b->base_src), xd->predictor, b->src_stride); diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h index f3c679227..6356df215 100644 --- a/vp9/encoder/vp9_encodemb.h +++ b/vp9/encoder/vp9_encodemb.h @@ -23,14 +23,14 @@ typedef struct { #include "vp9/encoder/vp9_onyx_int.h" struct VP9_ENCODER_RTCD; -void vp9_encode_inter16x16(MACROBLOCK *x); +void vp9_encode_inter16x16(MACROBLOCK *x, int mb_row, int mb_col); void vp9_transform_mbuv_4x4(MACROBLOCK *x); void vp9_transform_mby_4x4(MACROBLOCK *x); void vp9_optimize_mby_4x4(MACROBLOCK *x); void vp9_optimize_mbuv_4x4(MACROBLOCK *x); -void vp9_encode_inter16x16y(MACROBLOCK *x); +void vp9_encode_inter16x16y(MACROBLOCK *x, int mb_row, int mb_col); void vp9_transform_mb_8x8(MACROBLOCK *mb); void vp9_transform_mby_8x8(MACROBLOCK *x); diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 8df6c20a7..337276d59 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -435,9 +435,11 @@ void vp9_first_pass(VP9_COMP *cpi) { MACROBLOCKD *const xd = &x->e_mbd; int recon_yoffset, recon_uvoffset; - YV12_BUFFER_CONFIG *lst_yv12 = 
&cm->yv12_fb[cm->lst_fb_idx]; + YV12_BUFFER_CONFIG *lst_yv12 = + &cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]]; YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx]; - YV12_BUFFER_CONFIG *gld_yv12 = &cm->yv12_fb[cm->gld_fb_idx]; + YV12_BUFFER_CONFIG *gld_yv12 = + &cm->yv12_fb[cm->ref_frame_map[cpi->gld_fb_idx]]; int recon_y_stride = lst_yv12->y_stride; int recon_uv_stride = lst_yv12->uv_stride; int64_t intra_error = 0; @@ -611,7 +613,7 @@ void vp9_first_pass(VP9_COMP *cpi) { this_error = motion_error; vp9_set_mbmode_and_mvs(x, NEWMV, &mv); xd->mode_info_context->mbmi.txfm_size = TX_4X4; - vp9_encode_inter16x16y(x); + vp9_encode_inter16x16y(x, mb_row, mb_col); sum_mvr += mv.as_mv.row; sum_mvr_abs += abs(mv.as_mv.row); sum_mvc += mv.as_mv.col; @@ -843,16 +845,13 @@ static double calc_correction_factor(double err_per_mb, power_term = (vp9_convert_qindex_to_q(Q) * 0.01) + pt_low; power_term = (power_term > pt_high) ? pt_high : power_term; - // Adjustments to error term - // TBD - // Calculate correction factor correction_factor = pow(error_term, power_term); // Clip range correction_factor = (correction_factor < 0.05) - ? 0.05 : (correction_factor > 2.0) ? 2.0 : correction_factor; + ? 0.05 : (correction_factor > 5.0) ? 5.0 : correction_factor; return correction_factor; } @@ -886,8 +885,7 @@ static void adjust_maxq_qrange(VP9_COMP *cpi) { static int estimate_max_q(VP9_COMP *cpi, FIRSTPASS_STATS *fpstats, - int section_target_bandwitdh, - int overhead_bits) { + int section_target_bandwitdh) { int Q; int num_mbs = cpi->common.MBs; int target_norm_bits_per_mb; @@ -898,7 +896,6 @@ static int estimate_max_q(VP9_COMP *cpi, double err_per_mb = section_err / num_mbs; double err_correction_factor; double speed_correction = 1.0; - double overhead_bits_per_mb; if (section_target_bandwitdh <= 0) return cpi->twopass.maxq_max_limit; // Highest value allowed @@ -950,13 +947,6 @@ static int estimate_max_q(VP9_COMP *cpi, speed_correction = 1.25; } - // Estimate of overhead bits per mb - // Correction to overhead bits for min allowed Q. - // PGW TODO.. This code is broken for the extended Q range - // for now overhead set to 0. - overhead_bits_per_mb = overhead_bits / num_mbs; - overhead_bits_per_mb *= pow(0.98, (double)cpi->twopass.maxq_min_limit); - // Try and pick a max Q that will be high enough to encode the // content at the given rate. for (Q = cpi->twopass.maxq_min_limit; Q < cpi->twopass.maxq_max_limit; Q++) { @@ -967,23 +957,9 @@ static int estimate_max_q(VP9_COMP *cpi, sr_correction * speed_correction * cpi->twopass.est_max_qcorrection_factor; - if (err_correction_factor < 0.05) - err_correction_factor = 0.05; - else if (err_correction_factor > 5.0) - err_correction_factor = 5.0; bits_per_mb_at_this_q = - vp9_bits_per_mb(INTER_FRAME, Q) + (int)overhead_bits_per_mb; - - bits_per_mb_at_this_q = (int)(.5 + err_correction_factor * - (double)bits_per_mb_at_this_q); - - // Mode and motion overhead - // As Q rises in real encode loop rd code will force overhead down - // We make a crude adjustment for this here as *.98 per Q step. - // PGW TODO.. This code is broken for the extended Q range - // for now overhead set to 0. - // overhead_bits_per_mb = (int)((double)overhead_bits_per_mb * 0.98); + vp9_bits_per_mb(INTER_FRAME, Q, err_correction_factor); if (bits_per_mb_at_this_q <= target_norm_bits_per_mb) break; @@ -1001,7 +977,7 @@ static int estimate_max_q(VP9_COMP *cpi, // PGW TODO.. 
This code is broken for the extended Q range if ((cpi->ni_frames > ((int)cpi->twopass.total_stats->count >> 8)) && - (cpi->ni_frames > 150)) { + (cpi->ni_frames > 25)) { adjust_maxq_qrange(cpi); } @@ -1012,8 +988,7 @@ static int estimate_max_q(VP9_COMP *cpi, // complexity and data rate. static int estimate_cq(VP9_COMP *cpi, FIRSTPASS_STATS *fpstats, - int section_target_bandwitdh, - int overhead_bits) { + int section_target_bandwitdh) { int Q; int num_mbs = cpi->common.MBs; int target_norm_bits_per_mb; @@ -1026,15 +1001,11 @@ static int estimate_cq(VP9_COMP *cpi, double speed_correction = 1.0; double clip_iiratio; double clip_iifactor; - double overhead_bits_per_mb; - target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20)) ? (512 * section_target_bandwitdh) / num_mbs : 512 * (section_target_bandwitdh / num_mbs); - // Estimate of overhead bits per mb - overhead_bits_per_mb = overhead_bits / num_mbs; // Corrections for higher compression speed settings // (reduced compression expected) @@ -1073,23 +1044,8 @@ static int estimate_cq(VP9_COMP *cpi, calc_correction_factor(err_per_mb, 100.0, 0.4, 0.90, Q) * sr_correction * speed_correction * clip_iifactor; - if (err_correction_factor < 0.05) - err_correction_factor = 0.05; - else if (err_correction_factor > 5.0) - err_correction_factor = 5.0; - bits_per_mb_at_this_q = - vp9_bits_per_mb(INTER_FRAME, Q) + (int)overhead_bits_per_mb; - - bits_per_mb_at_this_q = (int)(.5 + err_correction_factor * - (double)bits_per_mb_at_this_q); - - // Mode and motion overhead - // As Q rises in real encode loop rd code will force overhead down - // We make a crude adjustment for this here as *.98 per Q step. - // PGW TODO.. This code is broken for the extended Q range - // for now overhead set to 0. - overhead_bits_per_mb = (int)((double)overhead_bits_per_mb * 0.98); + vp9_bits_per_mb(INTER_FRAME, Q, err_correction_factor); if (bits_per_mb_at_this_q <= target_norm_bits_per_mb) break; @@ -1953,8 +1909,6 @@ void vp9_second_pass(VP9_COMP *cpi) { double this_frame_intra_error; double this_frame_coded_error; - int overhead_bits; - if (!cpi->twopass.stats_in) { return; } @@ -2018,11 +1972,6 @@ void vp9_second_pass(VP9_COMP *cpi) { if (cpi->target_bandwidth < 0) cpi->target_bandwidth = 0; - - // Account for mv, mode and other overheads. - overhead_bits = (int)estimate_modemvcost( - cpi, cpi->twopass.total_left_stats); - // Special case code for first frame. if (cpi->common.current_video_frame == 0) { cpi->twopass.est_max_qcorrection_factor = 1.0; @@ -2034,8 +1983,7 @@ void vp9_second_pass(VP9_COMP *cpi) { est_cq = estimate_cq(cpi, cpi->twopass.total_left_stats, - (int)(cpi->twopass.bits_left / frames_left), - overhead_bits); + (int)(cpi->twopass.bits_left / frames_left)); cpi->cq_target_quality = cpi->oxcf.cq_level; if (est_cq > cpi->cq_target_quality) @@ -2049,21 +1997,23 @@ void vp9_second_pass(VP9_COMP *cpi) { tmp_q = estimate_max_q( cpi, cpi->twopass.total_left_stats, - (int)(cpi->twopass.bits_left / frames_left), - overhead_bits); + (int)(cpi->twopass.bits_left / frames_left)); cpi->active_worst_quality = tmp_q; cpi->ni_av_qi = tmp_q; cpi->avg_q = vp9_convert_qindex_to_q(tmp_q); +#ifndef ONE_SHOT_Q_ESTIMATE // Limit the maxq value returned subsequently. 
// This increases the risk of overspend or underspend if the initial // estimate for the clip is bad, but helps prevent excessive // variation in Q, especially near the end of a clip // where for example a small overspend may cause Q to crash adjust_maxq_qrange(cpi); +#endif } +#ifndef ONE_SHOT_Q_ESTIMATE // The last few frames of a clip almost always have too few or too many // bits and for the sake of overly exact rate control we don't want to make // radical adjustments to the allowed quantizer range just to use up a // few surplus bits or get beneath the target rate. @@ -2078,13 +2028,13 @@ void vp9_second_pass(VP9_COMP *cpi) { tmp_q = estimate_max_q( cpi, cpi->twopass.total_left_stats, - (int)(cpi->twopass.bits_left / frames_left), - overhead_bits); + (int)(cpi->twopass.bits_left / frames_left)); // Make a damped adjustment to active max Q cpi->active_worst_quality = adjust_active_maxq(cpi->active_worst_quality, tmp_q); } +#endif cpi->twopass.frames_to_key--; @@ -2092,7 +2042,6 @@ void vp9_second_pass(VP9_COMP *cpi) { subtract_stats(cpi->twopass.total_left_stats, &this_frame); } - static int test_candidate_kf(VP9_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRSTPASS_STATS *this_frame, diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index 19bc4d67d..2296a6669 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -11,12 +11,12 @@ #ifndef VP9_ENCODER_VP9_FIRSTPASS_H_ #define VP9_ENCODER_VP9_FIRSTPASS_H_ -extern void vp9_init_first_pass(VP9_COMP *cpi); -extern void vp9_first_pass(VP9_COMP *cpi); -extern void vp9_end_first_pass(VP9_COMP *cpi); +void vp9_init_first_pass(VP9_COMP *cpi); +void vp9_first_pass(VP9_COMP *cpi); +void vp9_end_first_pass(VP9_COMP *cpi); -extern void vp9_init_second_pass(VP9_COMP *cpi); -extern void vp9_second_pass(VP9_COMP *cpi); -extern void vp9_end_second_pass(VP9_COMP *cpi); +void vp9_init_second_pass(VP9_COMP *cpi); +void vp9_second_pass(VP9_COMP *cpi); +void vp9_end_second_pass(VP9_COMP *cpi); #endif // VP9_ENCODER_VP9_FIRSTPASS_H_ diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c index 0ff60c8b0..121de653f 100644 --- a/vp9/encoder/vp9_mbgraph.c +++ b/vp9/encoder/vp9_mbgraph.c @@ -20,14 +20,16 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, int_mv *ref_mv, - int_mv *dst_mv) { + int_mv *dst_mv, + int mb_row, + int mb_col) { MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; BLOCK *b = &x->block[0]; BLOCKD *d = &xd->block[0]; vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16]; unsigned int best_err; - int step_param; + int tmp_col_min = x->mv_col_min; int tmp_col_max = x->mv_col_max; @@ -36,11 +38,8 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, int_mv ref_full; // Further step/diamond searches as necessary - if (cpi->Speed < 8) { - step_param = cpi->sf.first_step + ((cpi->Speed > 5) ? 1 : 0); - } else { - step_param = cpi->sf.first_step + 2; - } + int step_param = cpi->sf.first_step + + (cpi->Speed < 8 ? (cpi->Speed > 5 ?
1 : 0) : 2); vp9_clamp_mv_min_max(x, ref_mv); @@ -72,7 +71,7 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, } vp9_set_mbmode_and_mvs(x, NEWMV, dst_mv); - vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0); + vp9_build_inter16x16_predictors_mby(xd, xd->predictor, 16, mb_row, mb_col); best_err = vp9_sad16x16(xd->dst.y_buffer, xd->dst.y_stride, xd->predictor, 16, INT_MAX); @@ -93,8 +92,9 @@ static int do_16x16_motion_search YV12_BUFFER_CONFIG *buf, int buf_mb_y_offset, YV12_BUFFER_CONFIG *ref, - int mb_y_offset -) { + int mb_y_offset, + int mb_row, + int mb_col) { MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; unsigned int err, tmp_err; @@ -124,7 +124,7 @@ static int do_16x16_motion_search // Test last reference frame using the previous best mv as the // starting point (best reference) for the search - tmp_err = do_16x16_motion_iteration(cpi, ref_mv, &tmp_mv); + tmp_err = do_16x16_motion_iteration(cpi, ref_mv, &tmp_mv, mb_row, mb_col); if (tmp_err < err) { err = tmp_err; dst_mv->as_int = tmp_mv.as_int; @@ -136,7 +136,8 @@ static int do_16x16_motion_search int_mv zero_ref_mv, tmp_mv; zero_ref_mv.as_int = 0; - tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv); + tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv, + mb_row, mb_col); if (tmp_err < err) { dst_mv->as_int = tmp_mv.as_int; err = tmp_err; @@ -229,7 +230,9 @@ static void update_mbgraph_mb_stats int gld_y_offset, YV12_BUFFER_CONFIG *alt_ref, int_mv *prev_alt_ref_mv, - int arf_y_offset + int arf_y_offset, + int mb_row, + int mb_col ) { MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -249,7 +252,8 @@ static void update_mbgraph_mb_stats int g_motion_error = do_16x16_motion_search(cpi, prev_golden_ref_mv, &stats->ref[GOLDEN_FRAME].m.mv, buf, mb_y_offset, - golden_ref, gld_y_offset); + golden_ref, gld_y_offset, + mb_row, mb_col); stats->ref[GOLDEN_FRAME].err = g_motion_error; } else { stats->ref[GOLDEN_FRAME].err = INT_MAX; @@ -292,6 +296,9 @@ static void update_mbgraph_frame_stats int_mv arf_top_mv, gld_top_mv; MODE_INFO mi_local; + // Make sure the mi context starts in a consistent state. 
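The step_param initialization in do_16x16_motion_iteration() above folds the old if/else ladder into nested ternaries: speeds 0-5 keep cpi->sf.first_step, speeds 6 and 7 add one, and speeds 8 and up add two. A quick standalone equivalence check (the first_step value is illustrative):

#include <assert.h>
#include <stdio.h>

/* Mirrors the nested ternary: speeds 0-5 -> first_step,
 * 6-7 -> first_step + 1, 8 and above -> first_step + 2. */
static int step_param(int first_step, int speed) {
  return first_step + (speed < 8 ? (speed > 5 ? 1 : 0) : 2);
}

int main(void) {
  const int first_step = 3;  /* illustrative value only */
  assert(step_param(first_step, 0) == first_step);
  assert(step_param(first_step, 5) == first_step);
  assert(step_param(first_step, 6) == first_step + 1);
  assert(step_param(first_step, 7) == first_step + 1);
  assert(step_param(first_step, 8) == first_step + 2);
  assert(step_param(first_step, 11) == first_step + 2);
  printf("ternary matches the original if/else ladder\n");
  return 0;
}

The memset on the next line simply gives mi_local a deterministic starting state before its fields are filled in per macroblock.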
+ memset(&mi_local, 0, sizeof(mi_local)); + // Set up limit values for motion vectors to prevent them extending outside the UMV borders arf_top_mv.as_int = 0; gld_top_mv.as_int = 0; @@ -323,7 +330,8 @@ static void update_mbgraph_frame_stats update_mbgraph_mb_stats(cpi, mb_stats, buf, mb_y_in_offset, golden_ref, &gld_left_mv, gld_y_in_offset, - alt_ref, &arf_left_mv, arf_y_in_offset); + alt_ref, &arf_left_mv, arf_y_in_offset, + mb_row, mb_col); arf_left_mv.as_int = mb_stats->ref[ALTREF_FRAME].m.mv.as_int; gld_left_mv.as_int = mb_stats->ref[GOLDEN_FRAME].m.mv.as_int; if (mb_col == 0) { @@ -427,13 +435,11 @@ static void separate_arf_mbs(VP9_COMP *cpi) { vpx_free(arf_not_zz); } -void vp9_update_mbgraph_stats -( - VP9_COMP *cpi -) { +void vp9_update_mbgraph_stats(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; int i, n_frames = vp9_lookahead_depth(cpi->lookahead); - YV12_BUFFER_CONFIG *golden_ref = &cm->yv12_fb[cm->gld_fb_idx]; + YV12_BUFFER_CONFIG *golden_ref = + &cm->yv12_fb[cm->ref_frame_map[cpi->gld_fb_idx]]; // we need to look ahead beyond where the ARF transitions into // being a GF - so exit if we don't look ahead beyond that diff --git a/vp9/encoder/vp9_mbgraph.h b/vp9/encoder/vp9_mbgraph.h index db23eca33..c5bca4d01 100644 --- a/vp9/encoder/vp9_mbgraph.h +++ b/vp9/encoder/vp9_mbgraph.h @@ -11,6 +11,6 @@ #ifndef VP9_ENCODER_VP9_MBGRAPH_H_ #define VP9_ENCODER_VP9_MBGRAPH_H_ -extern void vp9_update_mbgraph_stats(VP9_COMP *cpi); +void vp9_update_mbgraph_stats(VP9_COMP *cpi); #endif // VP9_ENCODER_VP9_MBGRAPH_H_ diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 4694a92c6..300d9f85c 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -8,22 +8,17 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <stdio.h> +#include <limits.h> +#include <math.h> #include "vp9/encoder/vp9_onyx_int.h" #include "vp9/encoder/vp9_mcomp.h" #include "vpx_mem/vpx_mem.h" #include "./vpx_config.h" -#include <stdio.h> -#include <limits.h> -#include <math.h> #include "vp9/common/vp9_findnearmv.h" #include "vp9/common/vp9_common.h" -#ifdef ENTROPY_STATS -static int mv_ref_ct [31] [4] [2]; -static int mv_mode_cts [4] [2]; -#endif - void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv) { int col_min = (ref_mv->as_mv.col >> 3) - MAX_FULL_PEL_VAL + ((ref_mv->as_mv.col & 7) ? 
1 : 0); @@ -44,21 +39,20 @@ void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv) { } int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost, int *mvcost[2], - int Weight, int ishp) { + int weight, int ishp) { MV v; - v.row = (mv->as_mv.row - ref->as_mv.row); - v.col = (mv->as_mv.col - ref->as_mv.col); + v.row = mv->as_mv.row - ref->as_mv.row; + v.col = mv->as_mv.col - ref->as_mv.col; return ((mvjcost[vp9_get_mv_joint(v)] + - mvcost[0][v.row] + mvcost[1][v.col]) * - Weight) >> 7; + mvcost[0][v.row] + mvcost[1][v.col]) * weight) >> 7; } static int mv_err_cost(int_mv *mv, int_mv *ref, int *mvjcost, int *mvcost[2], int error_per_bit, int ishp) { if (mvcost) { MV v; - v.row = (mv->as_mv.row - ref->as_mv.row); - v.col = (mv->as_mv.col - ref->as_mv.col); + v.row = mv->as_mv.row - ref->as_mv.row; + v.col = mv->as_mv.col - ref->as_mv.col; return ((mvjcost[vp9_get_mv_joint(v)] + mvcost[0][v.row] + mvcost[1][v.col]) * error_per_bit + 128) >> 8; @@ -68,11 +62,10 @@ static int mv_err_cost(int_mv *mv, int_mv *ref, int *mvjcost, int *mvcost[2], static int mvsad_err_cost(int_mv *mv, int_mv *ref, int *mvjsadcost, int *mvsadcost[2], int error_per_bit) { - if (mvsadcost) { MV v; - v.row = (mv->as_mv.row - ref->as_mv.row); - v.col = (mv->as_mv.col - ref->as_mv.col); + v.row = mv->as_mv.row - ref->as_mv.row; + v.col = mv->as_mv.col - ref->as_mv.col; return ((mvjsadcost[vp9_get_mv_joint(v)] + mvsadcost[0][v.row] + mvsadcost[1][v.col]) * error_per_bit + 128) >> 8; @@ -81,45 +74,39 @@ static int mvsad_err_cost(int_mv *mv, int_mv *ref, int *mvjsadcost, } void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride) { - int Len; + int len; int search_site_count = 0; - // Generate offsets for 4 search sites per step. - Len = MAX_FIRST_STEP; x->ss[search_site_count].mv.col = 0; x->ss[search_site_count].mv.row = 0; x->ss[search_site_count].offset = 0; search_site_count++; - while (Len > 0) { - + for (len = MAX_FIRST_STEP; len > 0; len /= 2) { // Compute offsets for search sites. x->ss[search_site_count].mv.col = 0; - x->ss[search_site_count].mv.row = -Len; - x->ss[search_site_count].offset = -Len * stride; + x->ss[search_site_count].mv.row = -len; + x->ss[search_site_count].offset = -len * stride; search_site_count++; // Compute offsets for search sites. x->ss[search_site_count].mv.col = 0; - x->ss[search_site_count].mv.row = Len; - x->ss[search_site_count].offset = Len * stride; + x->ss[search_site_count].mv.row = len; + x->ss[search_site_count].offset = len * stride; search_site_count++; // Compute offsets for search sites. - x->ss[search_site_count].mv.col = -Len; + x->ss[search_site_count].mv.col = -len; x->ss[search_site_count].mv.row = 0; - x->ss[search_site_count].offset = -Len; + x->ss[search_site_count].offset = -len; search_site_count++; // Compute offsets for search sites. - x->ss[search_site_count].mv.col = Len; + x->ss[search_site_count].mv.col = len; x->ss[search_site_count].mv.row = 0; - x->ss[search_site_count].offset = Len; + x->ss[search_site_count].offset = len; search_site_count++; - - // Contract. - Len /= 2; } x->ss_count = search_site_count; @@ -127,68 +114,63 @@ void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride) { } void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) { - int Len; + int len; int search_site_count = 0; // Generate offsets for 8 search sites per step. 
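Both search-site initializers now express the step contraction as a for loop over len, halving it each pass, instead of mutating a while-loop counter. For the 8-site variant that follows, each step size contributes the four cardinal and four diagonal neighbours; a small sketch that prints the same (row, col) motion vectors and linear buffer offsets (the stride and starting step are illustrative stand-ins):

#include <stdio.h>

/* Prints the site pattern vp9_init3smotion_compensation() builds,
 * minus the initial zero site: for each step size len, the eight
 * neighbours N, S, W, E, NW, NE, SW, SE as (row, col) vectors with
 * the matching linear offset row * stride + col. */
int main(void) {
  const int stride = 32;      /* hypothetical pre-buffer stride */
  const int first_step = 8;   /* stands in for MAX_FIRST_STEP */
  static const int dr[8] = { -1, 1,  0, 0, -1, -1,  1, 1 };
  static const int dc[8] = {  0, 0, -1, 1, -1,  1, -1, 1 };
  for (int len = first_step; len > 0; len /= 2) {
    printf("len %2d:", len);
    for (int i = 0; i < 8; i++)
      printf(" (%d,%d)->%d", dr[i] * len, dc[i] * len,
             dr[i] * len * stride + dc[i] * len);
    printf("\n");
  }
  return 0;
}

The rewritten loop below appends exactly these sites to x->ss[], halving len until it reaches zero.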
- Len = MAX_FIRST_STEP; x->ss[search_site_count].mv.col = 0; x->ss[search_site_count].mv.row = 0; x->ss[search_site_count].offset = 0; search_site_count++; - while (Len > 0) { - + for (len = MAX_FIRST_STEP; len > 0; len /= 2) { // Compute offsets for search sites. x->ss[search_site_count].mv.col = 0; - x->ss[search_site_count].mv.row = -Len; - x->ss[search_site_count].offset = -Len * stride; + x->ss[search_site_count].mv.row = -len; + x->ss[search_site_count].offset = -len * stride; search_site_count++; // Compute offsets for search sites. x->ss[search_site_count].mv.col = 0; - x->ss[search_site_count].mv.row = Len; - x->ss[search_site_count].offset = Len * stride; + x->ss[search_site_count].mv.row = len; + x->ss[search_site_count].offset = len * stride; search_site_count++; // Compute offsets for search sites. - x->ss[search_site_count].mv.col = -Len; + x->ss[search_site_count].mv.col = -len; x->ss[search_site_count].mv.row = 0; - x->ss[search_site_count].offset = -Len; + x->ss[search_site_count].offset = -len; search_site_count++; // Compute offsets for search sites. - x->ss[search_site_count].mv.col = Len; + x->ss[search_site_count].mv.col = len; x->ss[search_site_count].mv.row = 0; - x->ss[search_site_count].offset = Len; + x->ss[search_site_count].offset = len; search_site_count++; // Compute offsets for search sites. - x->ss[search_site_count].mv.col = -Len; - x->ss[search_site_count].mv.row = -Len; - x->ss[search_site_count].offset = -Len * stride - Len; + x->ss[search_site_count].mv.col = -len; + x->ss[search_site_count].mv.row = -len; + x->ss[search_site_count].offset = -len * stride - len; search_site_count++; // Compute offsets for search sites. - x->ss[search_site_count].mv.col = Len; - x->ss[search_site_count].mv.row = -Len; - x->ss[search_site_count].offset = -Len * stride + Len; + x->ss[search_site_count].mv.col = len; + x->ss[search_site_count].mv.row = -len; + x->ss[search_site_count].offset = -len * stride + len; search_site_count++; // Compute offsets for search sites. - x->ss[search_site_count].mv.col = -Len; - x->ss[search_site_count].mv.row = Len; - x->ss[search_site_count].offset = Len * stride - Len; + x->ss[search_site_count].mv.col = -len; + x->ss[search_site_count].mv.row = len; + x->ss[search_site_count].offset = len * stride - len; search_site_count++; // Compute offsets for search sites. - x->ss[search_site_count].mv.col = Len; - x->ss[search_site_count].mv.row = Len; - x->ss[search_site_count].offset = Len * stride + Len; + x->ss[search_site_count].mv.col = len; + x->ss[search_site_count].mv.row = len; + x->ss[search_site_count].offset = len * stride + len; search_site_count++; - - // Contract. 
- Len /= 2; } x->ss_count = search_site_count; @@ -1546,7 +1528,7 @@ int vp9_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int in_what_stride = d->pre_stride; int mv_stride = d->pre_stride; uint8_t *bestaddress; - int_mv *best_mv = &d->bmi.as_mv.first; + int_mv *best_mv = &d->bmi.as_mv[0]; int_mv this_mv; int bestsad = INT_MAX; int r, c; @@ -1641,7 +1623,7 @@ int vp9_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int in_what_stride = d->pre_stride; int mv_stride = d->pre_stride; uint8_t *bestaddress; - int_mv *best_mv = &d->bmi.as_mv.first; + int_mv *best_mv = &d->bmi.as_mv[0]; int_mv this_mv; unsigned int bestsad = INT_MAX; int r, c; @@ -1770,7 +1752,7 @@ int vp9_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int in_what_stride = d->pre_stride; int mv_stride = d->pre_stride; uint8_t *bestaddress; - int_mv *best_mv = &d->bmi.as_mv.first; + int_mv *best_mv = &d->bmi.as_mv[0]; int_mv this_mv; unsigned int bestsad = INT_MAX; int r, c; @@ -1787,7 +1769,7 @@ int vp9_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int col_min = ref_col - distance; int col_max = ref_col + distance; - DECLARE_ALIGNED_ARRAY(16, uint16_t, sad_array8, 8); + DECLARE_ALIGNED_ARRAY(16, uint32_t, sad_array8, 8); unsigned int sad_array[3]; int_mv fcenter_mv; @@ -2023,12 +2005,10 @@ int vp9_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, for (i = 0; i < search_range; i++) { int best_site = -1; - int all_in = 1; - - all_in &= ((ref_mv->as_mv.row - 1) > x->mv_row_min); - all_in &= ((ref_mv->as_mv.row + 1) < x->mv_row_max); - all_in &= ((ref_mv->as_mv.col - 1) > x->mv_col_min); - all_in &= ((ref_mv->as_mv.col + 1) < x->mv_col_max); + int all_in = ((ref_mv->as_mv.row - 1) > x->mv_row_min) & + ((ref_mv->as_mv.row + 1) < x->mv_row_max) & + ((ref_mv->as_mv.col - 1) > x->mv_col_min) & + ((ref_mv->as_mv.col + 1) < x->mv_col_max); if (all_in) { unsigned int sad_array[4]; @@ -2103,21 +2083,22 @@ int vp9_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, #ifdef ENTROPY_STATS -void print_mode_context(void) { +void print_mode_context(VP9_COMMON *pc) { FILE *f = fopen("vp9_modecont.c", "a"); int i, j; fprintf(f, "#include \"vp9_entropy.h\"\n"); - fprintf(f, "const int vp9_mode_contexts[6][4] ="); + fprintf(f, "const int vp9_mode_contexts[INTER_MODE_CONTEXTS][4] ="); fprintf(f, "{\n"); - for (j = 0; j < 6; j++) { + for (j = 0; j < INTER_MODE_CONTEXTS; j++) { fprintf(f, " {/* %d */ ", j); fprintf(f, " "); for (i = 0; i < 4; i++) { int this_prob; // context probs - this_prob = get_binary_prob(mv_ref_ct[j][i][0], mv_ref_ct[j][i][1]); + this_prob = get_binary_prob(pc->fc.mv_ref_ct[j][i][0], + pc->fc.mv_ref_ct[j][i][1]); fprintf(f, "%5d, ", this_prob); } @@ -2128,44 +2109,4 @@ void print_mode_context(void) { fclose(f); } -/* MV ref count ENTROPY_STATS stats code */ -void init_mv_ref_counts() { - vpx_memset(mv_ref_ct, 0, sizeof(mv_ref_ct)); - vpx_memset(mv_mode_cts, 0, sizeof(mv_mode_cts)); -} - -void accum_mv_refs(MB_PREDICTION_MODE m, const int ct[4]) { - if (m == ZEROMV) { - ++mv_ref_ct [ct[0]] [0] [0]; - ++mv_mode_cts[0][0]; - } else { - ++mv_ref_ct [ct[0]] [0] [1]; - ++mv_mode_cts[0][1]; - - if (m == NEARESTMV) { - ++mv_ref_ct [ct[1]] [1] [0]; - ++mv_mode_cts[1][0]; - } else { - ++mv_ref_ct [ct[1]] [1] [1]; - ++mv_mode_cts[1][1]; - - if (m == NEARMV) { - ++mv_ref_ct [ct[2]] [2] [0]; - ++mv_mode_cts[2][0]; - } else { - ++mv_ref_ct [ct[2]] [2] [1]; - ++mv_mode_cts[2][1]; - - if (m == NEWMV) { - ++mv_ref_ct [ct[3]] [3] [0]; - 
++mv_mode_cts[3][0]; - } else { - ++mv_ref_ct [ct[3]] [3] [1]; - ++mv_mode_cts[3][1]; - } - } - } - } -} - #endif/* END MV ref count ENTROPY_STATS stats code */ diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index 358d10bc6..2479d7235 100644 --- a/vp9/encoder/vp9_mcomp.h +++ b/vp9/encoder/vp9_mcomp.h @@ -16,9 +16,7 @@ #include "vp9/encoder/vp9_variance.h" #ifdef ENTROPY_STATS -extern void init_mv_ref_counts(); -extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]); -void print_mode_context(void); +void print_mode_context(VP9_COMMON *pc); #endif @@ -26,11 +24,12 @@ void print_mode_context(void); #define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1) // Max full pel mv specified in 1 pel units #define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1)) // Maximum size of the first step in full pel units -extern void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv); -extern int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost, - int *mvcost[2], int Weight, int ishp); -extern void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride); -extern void vp9_init3smotion_compensation(MACROBLOCK *x, int stride); +void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv); +int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost, + int *mvcost[2], int weight, int ishp); +void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride); +void vp9_init3smotion_compensation(MACROBLOCK *x, int stride); + // Runs sequence of diamond searches in smaller steps for RD struct VP9_COMP; int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK *b, @@ -39,20 +38,13 @@ int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK *b, vp9_variance_fn_ptr_t *fn_ptr, int_mv *ref_mv, int_mv *dst_mv); -extern int vp9_hex_search -( - MACROBLOCK *x, - BLOCK *b, - BLOCKD *d, - int_mv *ref_mv, - int_mv *best_mv, - int search_param, - int error_per_bit, - const vp9_variance_fn_ptr_t *vf, - int *mvjsadcost, int *mvsadcost[2], - int *mvjcost, int *mvcost[2], - int_mv *center_mv -); +int vp9_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, + int_mv *ref_mv, int_mv *best_mv, + int search_param, int error_per_bit, + const vp9_variance_fn_ptr_t *vf, + int *mvjsadcost, int *mvsadcost[2], + int *mvjcost, int *mvcost[2], + int_mv *center_mv); typedef int (fractional_mv_step_fp) (MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *bestmv, int_mv *ref_mv, int error_per_bit, const vp9_variance_fn_ptr_t *vfp, diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c index 27e0e48a3..5278ac2a3 100644 --- a/vp9/encoder/vp9_onyx_if.c +++ b/vp9/encoder/vp9_onyx_if.c @@ -10,7 +10,9 @@ #include "vpx_config.h" +#include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_reconinter.h" #include "vp9/encoder/vp9_onyx_int.h" #include "vp9/common/vp9_systemdependent.h" #include "vp9/encoder/vp9_quantize.h" @@ -22,6 +24,7 @@ #include "vp9/common/vp9_extend.h" #include "vp9/encoder/vp9_ratectrl.h" #include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_tile_common.h" #include "vp9/encoder/vp9_segmentation.h" #include "./vp9_rtcd.h" #include "./vpx_scale_rtcd.h" @@ -236,12 +239,12 @@ static void update_base_skip_probs(VP9_COMP *cpi) { if (cm->frame_type != KEY_FRAME) { vp9_update_skip_probs(cpi); - if (cm->refresh_alt_ref_frame) { + if (cpi->refresh_alt_ref_frame) { int k; for (k = 0; k < MBSKIP_CONTEXTS; ++k) cpi->last_skip_false_probs[2][k] = cm->mbskip_pred_probs[k]; cpi->last_skip_probs_q[2] = cm->base_qindex; - } else if 
(cpi->common.refresh_golden_frame) { + } else if (cpi->refresh_golden_frame) { int k; for (k = 0; k < MBSKIP_CONTEXTS; ++k) cpi->last_skip_false_probs[1][k] = cm->mbskip_pred_probs[k]; @@ -388,7 +391,7 @@ static int compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget) { return target_index - start_index; } -static void init_seg_features(VP9_COMP *cpi) { +static void configure_static_seg_features(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &cpi->mb.e_mbd; @@ -408,10 +411,8 @@ static void init_seg_features(VP9_COMP *cpi) { // Clear down the segment features. vp9_clearall_segfeatures(xd); - } - - // If this is an alt ref frame - else if (cm->refresh_alt_ref_frame) { + } else if (cpi->refresh_alt_ref_frame) { + // If this is an alt ref frame // Clear down the global segmentation map vpx_memset(cpi->segmentation_map, 0, (cm->mb_rows * cm->mb_cols)); xd->update_mb_segmentation_map = 0; @@ -448,7 +449,7 @@ static void init_seg_features(VP9_COMP *cpi) { else if (xd->segmentation_enabled) { // First normal frame in a valid gf or alt ref group if (cpi->common.frames_since_golden == 0) { - // Set up segment features for normal frames in an af group + // Set up segment features for normal frames in an arf group if (cpi->source_alt_ref_active) { xd->update_mb_segmentation_map = 0; xd->update_mb_segmentation_data = 1; @@ -465,16 +466,9 @@ static void init_seg_features(VP9_COMP *cpi) { // Segment coding disabled for compred testing if (high_q || (cpi->static_mb_pct == 100)) { - // set_segref(xd, 1, LAST_FRAME); vp9_set_segref(xd, 1, ALTREF_FRAME); vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME); - - vp9_set_segdata(xd, 1, SEG_LVL_MODE, ZEROMV); - vp9_enable_segfeature(xd, 1, SEG_LVL_MODE); - - // EOB segment coding not fixed for 8x8 yet - vp9_set_segdata(xd, 1, SEG_LVL_EOB, 0); - vp9_enable_segfeature(xd, 1, SEG_LVL_EOB); + vp9_enable_segfeature(xd, 1, SEG_LVL_SKIP); } } // Disable segmentation and clear down features if alt ref @@ -493,29 +487,23 @@ static void init_seg_features(VP9_COMP *cpi) { } // Special case where we are coding over the top of a previous - // alt ref frame + // alt ref frame. 
// Segment coding disabled for compred testing else if (cpi->is_src_frame_alt_ref) { - // Enable mode and ref frame features for segment 0 as well + // Enable ref frame features for segment 0 as well vp9_enable_segfeature(xd, 0, SEG_LVL_REF_FRAME); - vp9_enable_segfeature(xd, 0, SEG_LVL_MODE); vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME); - vp9_enable_segfeature(xd, 1, SEG_LVL_MODE); - // All mbs should use ALTREF_FRAME, ZEROMV exclusively + // All mbs should use ALTREF_FRAME vp9_clear_segref(xd, 0); vp9_set_segref(xd, 0, ALTREF_FRAME); vp9_clear_segref(xd, 1); vp9_set_segref(xd, 1, ALTREF_FRAME); - vp9_set_segdata(xd, 0, SEG_LVL_MODE, ZEROMV); - vp9_set_segdata(xd, 1, SEG_LVL_MODE, ZEROMV); - // Skip all MBs if high Q + // Skip all MBs if high Q (0,0 mv and skip coeffs) if (high_q) { - vp9_enable_segfeature(xd, 0, SEG_LVL_EOB); - vp9_set_segdata(xd, 0, SEG_LVL_EOB, 0); - vp9_enable_segfeature(xd, 1, SEG_LVL_EOB); - vp9_set_segdata(xd, 1, SEG_LVL_EOB, 0); + vp9_enable_segfeature(xd, 0, SEG_LVL_SKIP); + vp9_enable_segfeature(xd, 1, SEG_LVL_SKIP); } // Enable data update xd->update_mb_segmentation_data = 1; @@ -590,16 +578,165 @@ static void set_default_lf_deltas(VP9_COMP *cpi) { cpi->mb.e_mbd.mode_lf_deltas[3] = 4; // Split mv } +static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode, int speed) { + SPEED_FEATURES *sf = &cpi->sf; + int speed_multiplier = speed + 1; + int i; + + // Set baseline threshold values + for (i = 0; i < MAX_MODES; ++i) { + sf->thresh_mult[i] = (mode == 0) ? -500 : 0; + } + + sf->thresh_mult[THR_ZEROMV ] = 0; + sf->thresh_mult[THR_ZEROG ] = 0; + sf->thresh_mult[THR_ZEROA ] = 0; + + sf->thresh_mult[THR_NEARESTMV] = 0; + sf->thresh_mult[THR_NEARESTG ] = 0; + sf->thresh_mult[THR_NEARESTA ] = 0; + + sf->thresh_mult[THR_NEARMV ] += speed_multiplier * 1000; + sf->thresh_mult[THR_NEARG ] += speed_multiplier * 1000; + sf->thresh_mult[THR_NEARA ] += speed_multiplier * 1000; + + sf->thresh_mult[THR_DC ] = 0; + sf->thresh_mult[THR_TM ] += speed_multiplier * 1000; + sf->thresh_mult[THR_V_PRED ] += speed_multiplier * 1000; + sf->thresh_mult[THR_H_PRED ] += speed_multiplier * 1000; + sf->thresh_mult[THR_D45_PRED ] += speed_multiplier * 1500; + sf->thresh_mult[THR_D135_PRED] += speed_multiplier * 1500; + sf->thresh_mult[THR_D117_PRED] += speed_multiplier * 1500; + sf->thresh_mult[THR_D153_PRED] += speed_multiplier * 1500; + sf->thresh_mult[THR_D27_PRED ] += speed_multiplier * 1500; + sf->thresh_mult[THR_D63_PRED ] += speed_multiplier * 1500; + + sf->thresh_mult[THR_B_PRED ] += speed_multiplier * 2500; + sf->thresh_mult[THR_I8X8_PRED] += speed_multiplier * 2500; + + sf->thresh_mult[THR_NEWMV ] += speed_multiplier * 1000; + sf->thresh_mult[THR_NEWG ] += speed_multiplier * 1000; + sf->thresh_mult[THR_NEWA ] += speed_multiplier * 1000; + + sf->thresh_mult[THR_SPLITMV ] += speed_multiplier * 2500; + sf->thresh_mult[THR_SPLITG ] += speed_multiplier * 2500; + sf->thresh_mult[THR_SPLITA ] += speed_multiplier * 2500; + + sf->thresh_mult[THR_COMP_ZEROLG ] += speed_multiplier * 1500; + sf->thresh_mult[THR_COMP_ZEROLA ] += speed_multiplier * 1500; + sf->thresh_mult[THR_COMP_ZEROGA ] += speed_multiplier * 1500; + + sf->thresh_mult[THR_COMP_NEARESTLG] += speed_multiplier * 1500; + sf->thresh_mult[THR_COMP_NEARESTLA] += speed_multiplier * 1500; + sf->thresh_mult[THR_COMP_NEARESTGA] += speed_multiplier * 1500; + + sf->thresh_mult[THR_COMP_NEARLG ] += speed_multiplier * 1500; + sf->thresh_mult[THR_COMP_NEARLA ] += speed_multiplier * 1500; + sf->thresh_mult[THR_COMP_NEARGA ] +=
speed_multiplier * 1500; + + sf->thresh_mult[THR_COMP_NEWLG ] += speed_multiplier * 2000; + sf->thresh_mult[THR_COMP_NEWLA ] += speed_multiplier * 2000; + sf->thresh_mult[THR_COMP_NEWGA ] += speed_multiplier * 2000; + + sf->thresh_mult[THR_COMP_SPLITLA ] += speed_multiplier * 4500; + sf->thresh_mult[THR_COMP_SPLITGA ] += speed_multiplier * 4500; + sf->thresh_mult[THR_COMP_SPLITLG ] += speed_multiplier * 4500; + +#if CONFIG_COMP_INTERINTRA_PRED + sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL ] += speed_multiplier * 1500; + sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG ] += speed_multiplier * 1500; + sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA ] += speed_multiplier * 1500; + + sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] += speed_multiplier * 1500; + sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] += speed_multiplier * 1500; + sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] += speed_multiplier * 1500; + + sf->thresh_mult[THR_COMP_INTERINTRA_NEARL ] += speed_multiplier * 1500; + sf->thresh_mult[THR_COMP_INTERINTRA_NEARG ] += speed_multiplier * 1500; + sf->thresh_mult[THR_COMP_INTERINTRA_NEARA ] += speed_multiplier * 1500; + + sf->thresh_mult[THR_COMP_INTERINTRA_NEWL ] += speed_multiplier * 2000; + sf->thresh_mult[THR_COMP_INTERINTRA_NEWG ] += speed_multiplier * 2000; + sf->thresh_mult[THR_COMP_INTERINTRA_NEWA ] += speed_multiplier * 2000; +#endif + + /* disable frame modes if flags not set */ + if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) { + sf->thresh_mult[THR_NEWMV ] = INT_MAX; + sf->thresh_mult[THR_NEARESTMV] = INT_MAX; + sf->thresh_mult[THR_ZEROMV ] = INT_MAX; + sf->thresh_mult[THR_NEARMV ] = INT_MAX; + sf->thresh_mult[THR_SPLITMV ] = INT_MAX; +#if CONFIG_COMP_INTERINTRA_PRED + sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL ] = INT_MAX; + sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] = INT_MAX; + sf->thresh_mult[THR_COMP_INTERINTRA_NEARL ] = INT_MAX; + sf->thresh_mult[THR_COMP_INTERINTRA_NEWL ] = INT_MAX; +#endif + } + if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) { + sf->thresh_mult[THR_NEARESTG ] = INT_MAX; + sf->thresh_mult[THR_ZEROG ] = INT_MAX; + sf->thresh_mult[THR_NEARG ] = INT_MAX; + sf->thresh_mult[THR_NEWG ] = INT_MAX; + sf->thresh_mult[THR_SPLITG ] = INT_MAX; +#if CONFIG_COMP_INTERINTRA_PRED + sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG ] = INT_MAX; + sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] = INT_MAX; + sf->thresh_mult[THR_COMP_INTERINTRA_NEARG ] = INT_MAX; + sf->thresh_mult[THR_COMP_INTERINTRA_NEWG ] = INT_MAX; +#endif + } + if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) { + sf->thresh_mult[THR_NEARESTA ] = INT_MAX; + sf->thresh_mult[THR_ZEROA ] = INT_MAX; + sf->thresh_mult[THR_NEARA ] = INT_MAX; + sf->thresh_mult[THR_NEWA ] = INT_MAX; + sf->thresh_mult[THR_SPLITA ] = INT_MAX; +#if CONFIG_COMP_INTERINTRA_PRED + sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA ] = INT_MAX; + sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] = INT_MAX; + sf->thresh_mult[THR_COMP_INTERINTRA_NEARA ] = INT_MAX; + sf->thresh_mult[THR_COMP_INTERINTRA_NEWA ] = INT_MAX; +#endif + } + + if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_GOLD_FLAG)) != + (VP9_LAST_FLAG | VP9_GOLD_FLAG)) { + sf->thresh_mult[THR_COMP_ZEROLG ] = INT_MAX; + sf->thresh_mult[THR_COMP_NEARESTLG] = INT_MAX; + sf->thresh_mult[THR_COMP_NEARLG ] = INT_MAX; + sf->thresh_mult[THR_COMP_NEWLG ] = INT_MAX; + sf->thresh_mult[THR_COMP_SPLITLG ] = INT_MAX; + } + if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) != + (VP9_LAST_FLAG | VP9_ALT_FLAG)) { + sf->thresh_mult[THR_COMP_ZEROLA ] = INT_MAX; + sf->thresh_mult[THR_COMP_NEARESTLA] = INT_MAX; + 
sf->thresh_mult[THR_COMP_NEARLA ] = INT_MAX; + sf->thresh_mult[THR_COMP_NEWLA ] = INT_MAX; + sf->thresh_mult[THR_COMP_SPLITLA ] = INT_MAX; + } + if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) != + (VP9_GOLD_FLAG | VP9_ALT_FLAG)) { + sf->thresh_mult[THR_COMP_ZEROGA ] = INT_MAX; + sf->thresh_mult[THR_COMP_NEARESTGA] = INT_MAX; + sf->thresh_mult[THR_COMP_NEARGA ] = INT_MAX; + sf->thresh_mult[THR_COMP_NEWGA ] = INT_MAX; + sf->thresh_mult[THR_COMP_SPLITGA ] = INT_MAX; + } +} + void vp9_set_speed_features(VP9_COMP *cpi) { SPEED_FEATURES *sf = &cpi->sf; - int Mode = cpi->compressor_speed; - int Speed = cpi->Speed; + int mode = cpi->compressor_speed; + int speed = cpi->Speed; int i; VP9_COMMON *cm = &cpi->common; // Only modes 0 and 1 supported for now in experimental code base - if (Mode > 1) - Mode = 1; + if (mode > 1) + mode = 1; // Initialise default mode frequency sampling variables for (i = 0; i < MAX_MODES; i ++) { @@ -617,167 +754,29 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->quarter_pixel_search = 1; sf->half_pixel_search = 1; sf->iterative_sub_pixel = 1; -#if CONFIG_LOSSLESS - sf->optimize_coefficients = 0; -#else - sf->optimize_coefficients = 1; -#endif sf->no_skip_block4x4_search = 1; + if (cpi->oxcf.lossless) + sf->optimize_coefficients = 0; + else + sf->optimize_coefficients = 1; sf->first_step = 0; sf->max_step_search_steps = MAX_MVSEARCH_STEPS; + sf->static_segmentation = 1; + sf->splitmode_breakout = 0; + sf->mb16_breakout = 0; - // default thresholds to 0 - for (i = 0; i < MAX_MODES; i++) - sf->thresh_mult[i] = 0; - - switch (mode) { case 0: // best quality mode - sf->thresh_mult[THR_ZEROMV ] = 0; - sf->thresh_mult[THR_ZEROG ] = 0; - sf->thresh_mult[THR_ZEROA ] = 0; - sf->thresh_mult[THR_NEARESTMV] = 0; - sf->thresh_mult[THR_NEARESTG ] = 0; - sf->thresh_mult[THR_NEARESTA ] = 0; - sf->thresh_mult[THR_NEARMV ] = 0; - sf->thresh_mult[THR_NEARG ] = 0; - sf->thresh_mult[THR_NEARA ] = 0; - - sf->thresh_mult[THR_DC ] = 0; - - sf->thresh_mult[THR_V_PRED ] = 1000; - sf->thresh_mult[THR_H_PRED ] = 1000; - sf->thresh_mult[THR_D45_PRED ] = 1000; - sf->thresh_mult[THR_D135_PRED] = 1000; - sf->thresh_mult[THR_D117_PRED] = 1000; - sf->thresh_mult[THR_D153_PRED] = 1000; - sf->thresh_mult[THR_D27_PRED ] = 1000; - sf->thresh_mult[THR_D63_PRED ] = 1000; - sf->thresh_mult[THR_B_PRED ] = 2000; - sf->thresh_mult[THR_I8X8_PRED] = 2000; - sf->thresh_mult[THR_TM ] = 1000; - - sf->thresh_mult[THR_NEWMV ] = 1000; - sf->thresh_mult[THR_NEWG ] = 1000; - sf->thresh_mult[THR_NEWA ] = 1000; - - sf->thresh_mult[THR_SPLITMV ] = 2500; - sf->thresh_mult[THR_SPLITG ] = 5000; - sf->thresh_mult[THR_SPLITA ] = 5000; - - sf->thresh_mult[THR_COMP_ZEROLG ] = 0; - sf->thresh_mult[THR_COMP_NEARESTLG] = 0; - sf->thresh_mult[THR_COMP_NEARLG ] = 0; - sf->thresh_mult[THR_COMP_ZEROLA ] = 0; - sf->thresh_mult[THR_COMP_NEARESTLA] = 0; - sf->thresh_mult[THR_COMP_NEARLA ] = 0; - sf->thresh_mult[THR_COMP_ZEROGA ] = 0; - sf->thresh_mult[THR_COMP_NEARESTGA] = 0; - sf->thresh_mult[THR_COMP_NEARGA ] = 0; - - sf->thresh_mult[THR_COMP_NEWLG ] = 1000; - sf->thresh_mult[THR_COMP_NEWLA ] = 1000; - sf->thresh_mult[THR_COMP_NEWGA ] = 1000; - - sf->thresh_mult[THR_COMP_SPLITLA ] = 2500; - sf->thresh_mult[THR_COMP_SPLITGA ] = 5000; - sf->thresh_mult[THR_COMP_SPLITLG ] = 5000; - -#if CONFIG_COMP_INTERINTRA_PRED - sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARL ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEWL ]
= 0; - sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARG ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEWG ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARA ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEWA ] = 0; -#endif - - sf->first_step = 0; - sf->max_step_search_steps = MAX_MVSEARCH_STEPS; sf->search_best_filter = SEARCH_BEST_FILTER; break; + case 1: - sf->thresh_mult[THR_NEARESTMV] = 0; - sf->thresh_mult[THR_ZEROMV ] = 0; - sf->thresh_mult[THR_DC ] = 0; - sf->thresh_mult[THR_NEARMV ] = 0; - sf->thresh_mult[THR_V_PRED ] = 1000; - sf->thresh_mult[THR_H_PRED ] = 1000; - sf->thresh_mult[THR_D45_PRED ] = 1000; - sf->thresh_mult[THR_D135_PRED] = 1000; - sf->thresh_mult[THR_D117_PRED] = 1000; - sf->thresh_mult[THR_D153_PRED] = 1000; - sf->thresh_mult[THR_D27_PRED ] = 1000; - sf->thresh_mult[THR_D63_PRED ] = 1000; - sf->thresh_mult[THR_B_PRED ] = 2500; - sf->thresh_mult[THR_I8X8_PRED] = 2500; - sf->thresh_mult[THR_TM ] = 1000; - - sf->thresh_mult[THR_NEARESTG ] = 1000; - sf->thresh_mult[THR_NEARESTA ] = 1000; - - sf->thresh_mult[THR_ZEROG ] = 1000; - sf->thresh_mult[THR_ZEROA ] = 1000; - sf->thresh_mult[THR_NEARG ] = 1000; - sf->thresh_mult[THR_NEARA ] = 1000; - - sf->thresh_mult[THR_ZEROMV ] = 0; - sf->thresh_mult[THR_ZEROG ] = 0; - sf->thresh_mult[THR_ZEROA ] = 0; - sf->thresh_mult[THR_NEARESTMV] = 0; - sf->thresh_mult[THR_NEARESTG ] = 0; - sf->thresh_mult[THR_NEARESTA ] = 0; - sf->thresh_mult[THR_NEARMV ] = 0; - sf->thresh_mult[THR_NEARG ] = 0; - sf->thresh_mult[THR_NEARA ] = 0; - - sf->thresh_mult[THR_NEWMV ] = 1000; - sf->thresh_mult[THR_NEWG ] = 1000; - sf->thresh_mult[THR_NEWA ] = 1000; - - sf->thresh_mult[THR_SPLITMV ] = 1700; - sf->thresh_mult[THR_SPLITG ] = 4500; - sf->thresh_mult[THR_SPLITA ] = 4500; - - sf->thresh_mult[THR_COMP_ZEROLG ] = 0; - sf->thresh_mult[THR_COMP_NEARESTLG] = 0; - sf->thresh_mult[THR_COMP_NEARLG ] = 0; - sf->thresh_mult[THR_COMP_ZEROLA ] = 0; - sf->thresh_mult[THR_COMP_NEARESTLA] = 0; - sf->thresh_mult[THR_COMP_NEARLA ] = 0; - sf->thresh_mult[THR_COMP_ZEROGA ] = 0; - sf->thresh_mult[THR_COMP_NEARESTGA] = 0; - sf->thresh_mult[THR_COMP_NEARGA ] = 0; - - sf->thresh_mult[THR_COMP_NEWLG ] = 1000; - sf->thresh_mult[THR_COMP_NEWLA ] = 1000; - sf->thresh_mult[THR_COMP_NEWGA ] = 1000; - - sf->thresh_mult[THR_COMP_SPLITLA ] = 1700; - sf->thresh_mult[THR_COMP_SPLITGA ] = 4500; - sf->thresh_mult[THR_COMP_SPLITLG ] = 4500; -#if CONFIG_COMP_INTERINTRA_PRED - sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARL ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEWL ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARG ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEWG ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARA ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEWA ] = 0; -#endif + sf->static_segmentation = 1; + sf->splitmode_breakout = 1; + sf->mb16_breakout = 0; - if (Speed > 0) { + if (speed > 0) { /* Disable coefficient optimization above speed 0 */ sf->optimize_coefficients = 0; sf->no_skip_block4x4_search = 0; @@ -793,7 +792,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { 
cpi->mode_check_freq[THR_COMP_SPLITLA] = 0; } - if (Speed > 1) { + if (speed > 1) { cpi->mode_check_freq[THR_SPLITG] = 4; cpi->mode_check_freq[THR_SPLITA] = 4; cpi->mode_check_freq[THR_SPLITMV] = 2; @@ -801,73 +800,9 @@ void vp9_set_speed_features(VP9_COMP *cpi) { cpi->mode_check_freq[THR_COMP_SPLITGA] = 4; cpi->mode_check_freq[THR_COMP_SPLITLG] = 4; cpi->mode_check_freq[THR_COMP_SPLITLA] = 2; - - sf->thresh_mult[THR_TM ] = 1500; - sf->thresh_mult[THR_V_PRED ] = 1500; - sf->thresh_mult[THR_H_PRED ] = 1500; - sf->thresh_mult[THR_D45_PRED ] = 1500; - sf->thresh_mult[THR_D135_PRED] = 1500; - sf->thresh_mult[THR_D117_PRED] = 1500; - sf->thresh_mult[THR_D153_PRED] = 1500; - sf->thresh_mult[THR_D27_PRED ] = 1500; - sf->thresh_mult[THR_D63_PRED ] = 1500; - sf->thresh_mult[THR_B_PRED ] = 5000; - sf->thresh_mult[THR_I8X8_PRED] = 5000; - - if (cpi->ref_frame_flags & VP9_LAST_FLAG) { - sf->thresh_mult[THR_NEWMV ] = 2000; - sf->thresh_mult[THR_SPLITMV ] = 10000; - sf->thresh_mult[THR_COMP_SPLITLG ] = 20000; - } - - if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { - sf->thresh_mult[THR_NEARESTG ] = 1500; - sf->thresh_mult[THR_ZEROG ] = 1500; - sf->thresh_mult[THR_NEARG ] = 1500; - sf->thresh_mult[THR_NEWG ] = 2000; - sf->thresh_mult[THR_SPLITG ] = 20000; - sf->thresh_mult[THR_COMP_SPLITGA ] = 20000; - } - - if (cpi->ref_frame_flags & VP9_ALT_FLAG) { - sf->thresh_mult[THR_NEARESTA ] = 1500; - sf->thresh_mult[THR_ZEROA ] = 1500; - sf->thresh_mult[THR_NEARA ] = 1500; - sf->thresh_mult[THR_NEWA ] = 2000; - sf->thresh_mult[THR_SPLITA ] = 20000; - sf->thresh_mult[THR_COMP_SPLITLA ] = 10000; - } - - sf->thresh_mult[THR_COMP_ZEROLG ] = 1500; - sf->thresh_mult[THR_COMP_NEARESTLG] = 1500; - sf->thresh_mult[THR_COMP_NEARLG ] = 1500; - sf->thresh_mult[THR_COMP_ZEROLA ] = 1500; - sf->thresh_mult[THR_COMP_NEARESTLA] = 1500; - sf->thresh_mult[THR_COMP_NEARLA ] = 1500; - sf->thresh_mult[THR_COMP_ZEROGA ] = 1500; - sf->thresh_mult[THR_COMP_NEARESTGA] = 1500; - sf->thresh_mult[THR_COMP_NEARGA ] = 1500; - - sf->thresh_mult[THR_COMP_NEWLG ] = 2000; - sf->thresh_mult[THR_COMP_NEWLA ] = 2000; - sf->thresh_mult[THR_COMP_NEWGA ] = 2000; -#if CONFIG_COMP_INTERINTRA_PRED - sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARL ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEWL ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARG ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEWG ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARA ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEWA ] = 0; -#endif } - if (Speed > 2) { + if (speed > 2) { cpi->mode_check_freq[THR_SPLITG] = 15; cpi->mode_check_freq[THR_SPLITA] = 15; cpi->mode_check_freq[THR_SPLITMV] = 7; @@ -876,150 +811,19 @@ void vp9_set_speed_features(VP9_COMP *cpi) { cpi->mode_check_freq[THR_COMP_SPLITLG] = 15; cpi->mode_check_freq[THR_COMP_SPLITLA] = 7; - sf->thresh_mult[THR_TM ] = 2000; - sf->thresh_mult[THR_V_PRED ] = 2000; - sf->thresh_mult[THR_H_PRED ] = 2000; - sf->thresh_mult[THR_D45_PRED ] = 2000; - sf->thresh_mult[THR_D135_PRED] = 2000; - sf->thresh_mult[THR_D117_PRED] = 2000; - sf->thresh_mult[THR_D153_PRED] = 2000; - sf->thresh_mult[THR_D27_PRED ] = 2000; - sf->thresh_mult[THR_D63_PRED ] = 2000; - sf->thresh_mult[THR_B_PRED ] = 7500; - sf->thresh_mult[THR_I8X8_PRED] = 7500; - - if 
(cpi->ref_frame_flags & VP9_LAST_FLAG) { - sf->thresh_mult[THR_NEWMV ] = 2000; - sf->thresh_mult[THR_SPLITMV ] = 25000; - sf->thresh_mult[THR_COMP_SPLITLG ] = 50000; - } - - if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { - sf->thresh_mult[THR_NEARESTG ] = 2000; - sf->thresh_mult[THR_ZEROG ] = 2000; - sf->thresh_mult[THR_NEARG ] = 2000; - sf->thresh_mult[THR_NEWG ] = 2500; - sf->thresh_mult[THR_SPLITG ] = 50000; - sf->thresh_mult[THR_COMP_SPLITGA ] = 50000; - } - - if (cpi->ref_frame_flags & VP9_ALT_FLAG) { - sf->thresh_mult[THR_NEARESTA ] = 2000; - sf->thresh_mult[THR_ZEROA ] = 2000; - sf->thresh_mult[THR_NEARA ] = 2000; - sf->thresh_mult[THR_NEWA ] = 2500; - sf->thresh_mult[THR_SPLITA ] = 50000; - sf->thresh_mult[THR_COMP_SPLITLA ] = 25000; - } - - sf->thresh_mult[THR_COMP_ZEROLG ] = 2000; - sf->thresh_mult[THR_COMP_NEARESTLG] = 2000; - sf->thresh_mult[THR_COMP_NEARLG ] = 2000; - sf->thresh_mult[THR_COMP_ZEROLA ] = 2000; - sf->thresh_mult[THR_COMP_NEARESTLA] = 2000; - sf->thresh_mult[THR_COMP_NEARLA ] = 2000; - sf->thresh_mult[THR_COMP_ZEROGA ] = 2000; - sf->thresh_mult[THR_COMP_NEARESTGA] = 2000; - sf->thresh_mult[THR_COMP_NEARGA ] = 2000; - - sf->thresh_mult[THR_COMP_NEWLG ] = 2500; - sf->thresh_mult[THR_COMP_NEWLA ] = 2500; - sf->thresh_mult[THR_COMP_NEWGA ] = 2500; -#if CONFIG_COMP_INTERINTRA_PRED - sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARL ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEWL ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARG ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEWG ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARA ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEWA ] = 0; -#endif - sf->improved_dct = 0; // Only do recode loop on key frames, golden frames and // alt ref frames sf->recode_loop = 2; - } break; }; /* switch */ - /* disable frame modes if flags not set */ - if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) { - sf->thresh_mult[THR_NEWMV ] = INT_MAX; - sf->thresh_mult[THR_NEARESTMV] = INT_MAX; - sf->thresh_mult[THR_ZEROMV ] = INT_MAX; - sf->thresh_mult[THR_NEARMV ] = INT_MAX; - sf->thresh_mult[THR_SPLITMV ] = INT_MAX; - } - - if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) { - sf->thresh_mult[THR_NEARESTG ] = INT_MAX; - sf->thresh_mult[THR_ZEROG ] = INT_MAX; - sf->thresh_mult[THR_NEARG ] = INT_MAX; - sf->thresh_mult[THR_NEWG ] = INT_MAX; -#if CONFIG_COMP_INTERINTRA_PRED - sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG ] = INT_MAX; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] = INT_MAX; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARG ] = INT_MAX; - sf->thresh_mult[THR_COMP_INTERINTRA_NEWG ] = INT_MAX; -#endif - sf->thresh_mult[THR_SPLITG ] = INT_MAX; - } - - if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) { - sf->thresh_mult[THR_NEARESTA ] = INT_MAX; - sf->thresh_mult[THR_ZEROA ] = INT_MAX; - sf->thresh_mult[THR_NEARA ] = INT_MAX; - sf->thresh_mult[THR_NEWA ] = INT_MAX; -#if CONFIG_COMP_INTERINTRA_PRED - sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA ] = INT_MAX; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] = INT_MAX; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARA ] = INT_MAX; - sf->thresh_mult[THR_COMP_INTERINTRA_NEWA ] = INT_MAX; -#endif - sf->thresh_mult[THR_SPLITA ] = INT_MAX; - } - - if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_GOLD_FLAG)) != (VP9_LAST_FLAG | 
VP9_GOLD_FLAG)) { - sf->thresh_mult[THR_COMP_ZEROLG ] = INT_MAX; - sf->thresh_mult[THR_COMP_NEARESTLG] = INT_MAX; - sf->thresh_mult[THR_COMP_NEARLG ] = INT_MAX; - sf->thresh_mult[THR_COMP_NEWLG ] = INT_MAX; - sf->thresh_mult[THR_COMP_SPLITLG ] = INT_MAX; - } - - if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) != (VP9_LAST_FLAG | VP9_ALT_FLAG)) { - sf->thresh_mult[THR_COMP_ZEROLA ] = INT_MAX; - sf->thresh_mult[THR_COMP_NEARESTLA] = INT_MAX; - sf->thresh_mult[THR_COMP_NEARLA ] = INT_MAX; - sf->thresh_mult[THR_COMP_NEWLA ] = INT_MAX; - sf->thresh_mult[THR_COMP_SPLITLA ] = INT_MAX; - } - - if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) != (VP9_GOLD_FLAG | VP9_ALT_FLAG)) { - sf->thresh_mult[THR_COMP_ZEROGA ] = INT_MAX; - sf->thresh_mult[THR_COMP_NEARESTGA] = INT_MAX; - sf->thresh_mult[THR_COMP_NEARGA ] = INT_MAX; - sf->thresh_mult[THR_COMP_NEWGA ] = INT_MAX; - sf->thresh_mult[THR_COMP_SPLITGA ] = INT_MAX; - } -#if CONFIG_COMP_INTERINTRA_PRED - if ((cpi->ref_frame_flags & VP9_LAST_FLAG) != VP9_LAST_FLAG) { - sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL ] = INT_MAX; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] = INT_MAX; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARL ] = INT_MAX; - sf->thresh_mult[THR_COMP_INTERINTRA_NEWL ] = INT_MAX; - } -#endif + // Set rd thresholds based on mode and speed setting + set_rd_speed_thresholds(cpi, mode, speed); // Slow quant, dct and trellis not worthwhile for first pass // so make sure they are always turned off. @@ -1028,36 +832,29 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->improved_dct = 0; } - if (cpi->sf.search_method == NSTEP) { - vp9_init3smotion_compensation(&cpi->mb, - cm->yv12_fb[cm->lst_fb_idx].y_stride); - } else if (cpi->sf.search_method == DIAMOND) { - vp9_init_dsmotion_compensation(&cpi->mb, - cm->yv12_fb[cm->lst_fb_idx].y_stride); - } + { + int y_stride = cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]].y_stride; - cpi->mb.vp9_short_fdct16x16 = vp9_short_fdct16x16; - cpi->mb.vp9_short_fdct8x8 = vp9_short_fdct8x8; - cpi->mb.vp9_short_fdct8x4 = vp9_short_fdct8x4; - cpi->mb.vp9_short_fdct4x4 = vp9_short_fdct4x4; - cpi->mb.short_walsh4x4 = vp9_short_walsh4x4; - cpi->mb.short_fhaar2x2 = vp9_short_fhaar2x2; + if (cpi->sf.search_method == NSTEP) { + vp9_init3smotion_compensation(&cpi->mb, y_stride); + } else if (cpi->sf.search_method == DIAMOND) { + vp9_init_dsmotion_compensation(&cpi->mb, y_stride); + } + } -#if CONFIG_LOSSLESS - if (cpi->oxcf.lossless) { - cpi->mb.vp9_short_fdct8x4 = vp9_short_walsh8x4_x8; - cpi->mb.vp9_short_fdct4x4 = vp9_short_walsh4x4_x8; - cpi->mb.short_walsh4x4 = vp9_short_walsh4x4; - cpi->mb.short_fhaar2x2 = vp9_short_fhaar2x2; - cpi->mb.short_walsh4x4 = vp9_short_walsh4x4_lossless; + cpi->mb.fwd_txm16x16 = vp9_short_fdct16x16; + cpi->mb.fwd_txm8x8 = vp9_short_fdct8x8; + cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4; + cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4; + if (cpi->oxcf.lossless || cpi->mb.e_mbd.lossless) { + cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4_x8; + cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4_x8; } -#endif cpi->mb.quantize_b_4x4 = vp9_regular_quantize_b_4x4; cpi->mb.quantize_b_4x4_pair = vp9_regular_quantize_b_4x4_pair; cpi->mb.quantize_b_8x8 = vp9_regular_quantize_b_8x8; cpi->mb.quantize_b_16x16 = vp9_regular_quantize_b_16x16; - cpi->mb.quantize_b_2x2 = vp9_regular_quantize_b_2x2; vp9_init_quantizer(cpi); @@ -1078,6 +875,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { frames_at_speed[cpi->Speed]++; #endif } + static void alloc_raw_frame_buffers(VP9_COMP *cpi) { int width = (cpi->oxcf.Width + 15) & 
~15; int height = (cpi->oxcf.Height + 15) & ~15; @@ -1144,7 +942,6 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) { vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, "Failed to allocate scaled source buffer"); - vpx_free(cpi->tok); { @@ -1199,6 +996,38 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) { } +static void update_frame_size(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + + /* our internal buffers are always multiples of 16 */ + int width = (cm->Width + 15) & ~15; + int height = (cm->Height + 15) & ~15; + + cm->mb_rows = height >> 4; + cm->mb_cols = width >> 4; + cm->MBs = cm->mb_rows * cm->mb_cols; + cm->mode_info_stride = cm->mb_cols + 1; + memset(cm->mip, 0, + (cm->mb_cols + 1) * (cm->mb_rows + 1) * sizeof(MODE_INFO)); + vp9_update_mode_info_border(cm, cm->mip); + + cm->mi = cm->mip + cm->mode_info_stride + 1; + cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1; + vp9_update_mode_info_in_image(cm, cm->mi); + + /* Update size of buffers local to this frame */ + if (vp8_yv12_realloc_frame_buffer(&cpi->last_frame_uf, + width, height, VP9BORDERINPIXELS)) + vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, + "Failed to reallocate last frame buffer"); + + if (vp8_yv12_realloc_frame_buffer(&cpi->scaled_source, + width, height, VP9BORDERINPIXELS)) + vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, + "Failed to reallocate scaled source buffer"); +} + + // TODO perhaps change number of steps expose to outside world when setting // max and min limits. Also this will likely want refining for the extended Q // range. @@ -1239,10 +1068,7 @@ void vp9_new_frame_rate(VP9_COMP *cpi, double framerate) { cpi->min_frame_bandwidth = FRAME_OVERHEAD_BITS; // Set Maximum gf/arf interval - cpi->max_gf_interval = ((int)(cpi->output_frame_rate / 2.0) + 2); - - if (cpi->max_gf_interval < 12) - cpi->max_gf_interval = 12; + cpi->max_gf_interval = 15; // Extended interval for genuinely static scenes cpi->twopass.static_scene_max_gf_interval = cpi->key_frame_frequency >> 1; @@ -1270,10 +1096,26 @@ rescale(int val, int num, int denom) { return (int)(llval * llnum / llden); } +static void set_tile_limits(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + int min_log2_tiles, max_log2_tiles; + + cm->log2_tile_columns = cpi->oxcf.tile_columns; + cm->log2_tile_rows = cpi->oxcf.tile_rows; + + vp9_get_tile_n_bits(cm, &min_log2_tiles, &max_log2_tiles); + max_log2_tiles += min_log2_tiles; + if (cm->log2_tile_columns < min_log2_tiles) + cm->log2_tile_columns = min_log2_tiles; + else if (cm->log2_tile_columns > max_log2_tiles) + cm->log2_tile_columns = max_log2_tiles; + cm->tile_columns = 1 << cm->log2_tile_columns; + cm->tile_rows = 1 << cm->log2_tile_rows; +} static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { VP9_COMP *cpi = (VP9_COMP *)(ptr); - VP9_COMMON *cm = &cpi->common; + VP9_COMMON *const cm = &cpi->common; cpi->oxcf = *oxcf; @@ -1304,6 +1146,12 @@ static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { cpi->static_mb_pct = 0; + cpi->lst_fb_idx = 0; + cpi->gld_fb_idx = 1; + cpi->alt_fb_idx = 2; + + set_tile_limits(cpi); + #if VP9_TEMPORAL_ALT_REF { int i; @@ -1319,7 +1167,7 @@ static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { VP9_COMP *cpi = (VP9_COMP *)(ptr); - VP9_COMMON *cm = &cpi->common; + VP9_COMMON *const cm = &cpi->common; if (!cpi) return; @@ -1351,7 +1199,6 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { if (cpi->oxcf.cpu_used > 5) cpi->oxcf.cpu_used = 5; - break; case 
MODE_SECONDPASS_BEST: @@ -1364,20 +1211,14 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { cpi->oxcf.best_allowed_q = q_trans[oxcf->best_allowed_q]; cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level]; - cpi->mb.e_mbd.inv_xform4x4_1_x8 = vp9_short_idct4x4llm_1; - cpi->mb.e_mbd.inv_xform4x4_x8 = vp9_short_idct4x4llm; - cpi->mb.e_mbd.inv_walsh4x4_1 = vp9_short_inv_walsh4x4_1; - cpi->mb.e_mbd.inv_walsh4x4_lossless = vp9_short_inv_walsh4x4; - -#if CONFIG_LOSSLESS cpi->oxcf.lossless = oxcf->lossless; if (cpi->oxcf.lossless) { - cpi->mb.e_mbd.inv_xform4x4_1_x8 = vp9_short_inv_walsh4x4_1_x8; - cpi->mb.e_mbd.inv_xform4x4_x8 = vp9_short_inv_walsh4x4_x8; - cpi->mb.e_mbd.inv_walsh4x4_1 = vp9_short_inv_walsh4x4_1_lossless; - cpi->mb.e_mbd.inv_walsh4x4_lossless = vp9_short_inv_walsh4x4_lossless; + cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_inv_walsh4x4_1_x8; + cpi->mb.e_mbd.inv_txm4x4 = vp9_short_inv_walsh4x4_x8; + } else { + cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_idct4x4llm_1; + cpi->mb.e_mbd.inv_txm4x4 = vp9_short_idct4x4llm; } -#endif cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL; @@ -1385,8 +1226,8 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { // cpi->use_golden_frame_only = 0; // cpi->use_last_frame_only = 0; - cm->refresh_golden_frame = 0; - cm->refresh_last_frame = 1; + cpi->refresh_golden_frame = 0; + cpi->refresh_last_frame = 1; cm->refresh_entropy_probs = 1; setup_features(cpi); @@ -1491,14 +1332,18 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs; } - if (((cm->Width + 15) & 0xfffffff0) != - cm->yv12_fb[cm->lst_fb_idx].y_width || - ((cm->Height + 15) & 0xfffffff0) != - cm->yv12_fb[cm->lst_fb_idx].y_height || - cm->yv12_fb[cm->lst_fb_idx].y_width == 0) { + // Increasing the size of the frame beyond the first seen frame, or some + // otherwise signalled maximum size, is not supported. + // TODO(jkoleszar): exit gracefully. 
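An aside on the new update_frame_size() above: the dimension handling rounds the coded size up to a whole number of 16x16 macroblocks before deriving the MB grid and the bordered mode-info stride. A minimal standalone sketch of just that arithmetic (the names here are illustrative, not encoder fields):

#include <assert.h>
#include <stdio.h>

/* Round v up to the next multiple of 16, the (v + 15) & ~15 idiom
 * used by update_frame_size(). Valid because 16 is a power of two. */
static int align16(int v) {
  return (v + 15) & ~15;
}

int main(void) {
  const int coded_w = 1280, coded_h = 724;   /* 724 is not 16-aligned */
  const int w = align16(coded_w);            /* stays 1280 */
  const int h = align16(coded_h);            /* rounds up to 736 */
  const int mb_cols = w >> 4;                /* 80 */
  const int mb_rows = h >> 4;                /* 46 */

  assert(h == 736 && mb_rows == 46);
  /* the +1 mirrors mode_info_stride = mb_cols + 1, leaving room for
   * the border column of MODE_INFO entries */
  printf("MBs=%d mode_info_stride=%d\n", mb_rows * mb_cols, mb_cols + 1);
  return 0;
}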
+ if (!cpi->initial_width) { alloc_raw_frame_buffers(cpi); vp9_alloc_compressor_data(cpi); + cpi->initial_width = cm->Width; + cpi->initial_height = cm->Height; } + assert(cm->Width <= cpi->initial_width); + assert(cm->Height <= cpi->initial_height); + update_frame_size(cpi); if (cpi->oxcf.fixed_q >= 0) { cpi->last_q[0] = cpi->oxcf.fixed_q; @@ -1526,6 +1371,7 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { cpi->last_frame_distortion = 0; #endif + set_tile_limits(cpi); } #define M_LOG2_E 0.693147180559945309417 @@ -1693,7 +1539,7 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { cpi->source_alt_ref_pending = FALSE; cpi->source_alt_ref_active = FALSE; - cpi->common.refresh_alt_ref_frame = 0; + cpi->refresh_alt_ref_frame = 0; cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; #if CONFIG_INTERNAL_STATS @@ -1795,10 +1641,6 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { cpi->rd_thresh_mult[i] = 128; } -#ifdef ENTROPY_STATS - init_mv_ref_counts(); -#endif - #define BFP(BT, SDF, VF, SVF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF) \ cpi->fn_ptr[BT].sdf = SDF; \ cpi->fn_ptr[BT].vf = VF; \ @@ -1838,14 +1680,6 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4, NULL, NULL, NULL, vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d) -#if ARCH_X86 || ARCH_X86_64 - cpi->fn_ptr[BLOCK_16X16].copymem = vp9_copy32xn; - cpi->fn_ptr[BLOCK_16X8].copymem = vp9_copy32xn; - cpi->fn_ptr[BLOCK_8X16].copymem = vp9_copy32xn; - cpi->fn_ptr[BLOCK_8X8].copymem = vp9_copy32xn; - cpi->fn_ptr[BLOCK_4X4].copymem = vp9_copy32xn; -#endif - cpi->full_search_sad = vp9_full_search_sad; cpi->diamond_search_sad = vp9_diamond_search_sad; cpi->refining_search_sad = vp9_refining_search_sad; @@ -1885,7 +1719,7 @@ void vp9_remove_compressor(VP9_PTR *ptr) { if (cpi->pass != 1) { print_context_counters(); print_tree_update_probs(); - print_mode_context(); + print_mode_context(&cpi->common); } #endif #ifdef NMV_STATS @@ -1908,7 +1742,8 @@ void vp9_remove_compressor(VP9_PTR *ptr) { print_mode_contexts(&cpi->common); #endif if (cpi->b_calculate_psnr) { - YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx]; + YV12_BUFFER_CONFIG *lst_yv12 = + &cpi->common.yv12_fb[cpi->common.ref_frame_map[cpi->lst_fb_idx]]; double samples = 3.0 / 2 * cpi->count * lst_yv12->y_width * lst_yv12->y_height; double total_psnr = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error); double total_psnr2 = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error2); @@ -2230,18 +2065,18 @@ int vp9_update_reference(VP9_PTR ptr, int ref_frame_flags) { if (ref_frame_flags > 7) return -1; - cpi->common.refresh_golden_frame = 0; - cpi->common.refresh_alt_ref_frame = 0; - cpi->common.refresh_last_frame = 0; + cpi->refresh_golden_frame = 0; + cpi->refresh_alt_ref_frame = 0; + cpi->refresh_last_frame = 0; if (ref_frame_flags & VP9_LAST_FLAG) - cpi->common.refresh_last_frame = 1; + cpi->refresh_last_frame = 1; if (ref_frame_flags & VP9_GOLD_FLAG) - cpi->common.refresh_golden_frame = 1; + cpi->refresh_golden_frame = 1; if (ref_frame_flags & VP9_ALT_FLAG) - cpi->common.refresh_alt_ref_frame = 1; + cpi->refresh_alt_ref_frame = 1; return 0; } @@ -2253,11 +2088,11 @@ int vp9_get_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag, int ref_fb_idx; if (ref_frame_flag == VP9_LAST_FLAG) - ref_fb_idx = cm->lst_fb_idx; + ref_fb_idx = cm->ref_frame_map[cpi->lst_fb_idx]; else if (ref_frame_flag == VP9_GOLD_FLAG) - ref_fb_idx = cm->gld_fb_idx; + ref_fb_idx = cm->ref_frame_map[cpi->gld_fb_idx]; else if 
(ref_frame_flag == VP9_ALT_FLAG) - ref_fb_idx = cm->alt_fb_idx; + ref_fb_idx = cm->ref_frame_map[cpi->alt_fb_idx]; else return -1; @@ -2274,11 +2109,11 @@ int vp9_set_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag, int ref_fb_idx; if (ref_frame_flag == VP9_LAST_FLAG) - ref_fb_idx = cm->lst_fb_idx; + ref_fb_idx = cm->ref_frame_map[cpi->lst_fb_idx]; else if (ref_frame_flag == VP9_GOLD_FLAG) - ref_fb_idx = cm->gld_fb_idx; + ref_fb_idx = cm->ref_frame_map[cpi->gld_fb_idx]; else if (ref_frame_flag == VP9_ALT_FLAG) - ref_fb_idx = cm->alt_fb_idx; + ref_fb_idx = cm->ref_frame_map[cpi->alt_fb_idx]; else return -1; @@ -2349,9 +2184,73 @@ void vp9_write_yuv_rec_frame(VP9_COMMON *cm) { fwrite(src, s->uv_width, 1, yuv_rec_file); src += s->uv_stride; } while (--h); + fflush(yuv_rec_file); } #endif +static void scale_and_extend_frame(YV12_BUFFER_CONFIG *src_fb, + YV12_BUFFER_CONFIG *dst_fb) { + const int in_w = src_fb->y_width; + const int in_h = src_fb->y_height; + const int out_w = dst_fb->y_width; + const int out_h = dst_fb->y_height; + int x, y; + + for (y = 0; y < out_h; y += 16) { + for (x = 0; x < out_w; x += 16) { + int x_q4 = x * 16 * in_w / out_w; + int y_q4 = y * 16 * in_h / out_h; + uint8_t *src, *dst; + int src_stride, dst_stride; + + + src = src_fb->y_buffer + + y * in_h / out_h * src_fb->y_stride + + x * in_w / out_w; + dst = dst_fb->y_buffer + + y * dst_fb->y_stride + + x; + src_stride = src_fb->y_stride; + dst_stride = dst_fb->y_stride; + + vp9_convolve8(src, src_stride, dst, dst_stride, + vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w, + vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h, + 16, 16); + + x_q4 >>= 1; + y_q4 >>= 1; + src_stride = src_fb->uv_stride; + dst_stride = dst_fb->uv_stride; + + src = src_fb->u_buffer + + y / 2 * in_h / out_h * src_fb->uv_stride + + x / 2 * in_w / out_w; + dst = dst_fb->u_buffer + + y / 2 * dst_fb->uv_stride + + x / 2; + vp9_convolve8(src, src_stride, dst, dst_stride, + vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w, + vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h, + 8, 8); + + src = src_fb->v_buffer + + y / 2 * in_h / out_h * src_fb->uv_stride + + x / 2 * in_w / out_w; + dst = dst_fb->v_buffer + + y / 2 * dst_fb->uv_stride + + x / 2; + vp9_convolve8(src, src_stride, dst, dst_stride, + vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w, + vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h, + 8, 8); + } + } + + vp8_yv12_extend_frame_borders(dst_fb); +} + + static void update_alt_ref_frame_stats(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; @@ -2374,13 +2273,13 @@ static void update_golden_frame_stats(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; // Update the Golden frame usage counts. - if (cm->refresh_golden_frame) { + if (cpi->refresh_golden_frame) { // Update data structure that monitors level of reference to last GF vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols)); cpi->gf_active_count = cm->mb_rows * cm->mb_cols; // this frame refreshes means next frames don't unless specified by user - cm->refresh_golden_frame = 0; + cpi->refresh_golden_frame = 0; cpi->common.frames_since_golden = 0; // if ( cm->frame_type == KEY_FRAME ) @@ -2402,7 +2301,7 @@ static void update_golden_frame_stats(VP9_COMP *cpi) { // ******** Fixed Q test code only ************ // If we are going to use the ALT reference for the next group of frames set a flag to say so. 
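A note on the fixed-point indexing inside scale_and_extend_frame() above: source positions are tracked in 1/16-pel units (x_q4, y_q4), the subpel filter phase is the low four bits, and the per-pixel step handed to the convolver is 16 * in_w / out_w. The following hedged sketch exercises only that index math outside the codec; vp9_convolve8 itself is not called and the sizes are arbitrary:

#include <stdio.h>

/* For the first few 16-wide output tiles, print the 1/16-pel source
 * position, its integer pixel, and the subpel filter phase, mirroring
 * the indexing in scale_and_extend_frame(). Illustrative only. */
int main(void) {
  const int in_w = 300, out_w = 640;          /* non-integer scale ratio */
  const int x_step_q4 = 16 * in_w / out_w;    /* 7, truncated from 7.5 */
  int x;

  for (x = 0; x < 64; x += 16) {
    const int x_q4 = x * 16 * in_w / out_w;   /* position in 1/16 pel */
    printf("out x=%2d -> src pel=%2d phase=%2d step_q4=%d\n",
           x, x * in_w / out_w, x_q4 & 0xf, x_step_q4);
  }
  return 0;
}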
if (cpi->oxcf.fixed_q >= 0 && - cpi->oxcf.play_alternate && !cpi->common.refresh_alt_ref_frame) { + cpi->oxcf.play_alternate && !cpi->refresh_alt_ref_frame) { cpi->source_alt_ref_pending = TRUE; cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; } @@ -2414,7 +2313,7 @@ static void update_golden_frame_stats(VP9_COMP *cpi) { if (cpi->frames_till_gf_update_due > 0) cpi->frames_till_gf_update_due--; - } else if (!cpi->common.refresh_alt_ref_frame) { + } else if (!cpi->refresh_alt_ref_frame) { // Decrement count down till next gf if (cpi->frames_till_gf_update_due > 0) cpi->frames_till_gf_update_due--; @@ -2535,8 +2434,8 @@ static int recode_loop_test(VP9_COMP *cpi, if ((cpi->sf.recode_loop == 1) || ((cpi->sf.recode_loop == 2) && ((cm->frame_type == KEY_FRAME) || - cm->refresh_golden_frame || - cm->refresh_alt_ref_frame))) { + cpi->refresh_golden_frame || + cpi->refresh_alt_ref_frame))) { // General over and under shoot tests if (((cpi->projected_frame_size > high_limit) && (q < maxq)) || ((cpi->projected_frame_size < low_limit) && (q > minq))) { @@ -2563,86 +2462,56 @@ static int recode_loop_test(VP9_COMP *cpi, return force_recode; } -static void update_reference_frames(VP9_COMMON *cm) { - YV12_BUFFER_CONFIG *yv12_fb = cm->yv12_fb; +static void update_reference_frames(VP9_COMP * const cpi) { + VP9_COMMON * const cm = &cpi->common; // At this point the new frame has been encoded. // If any buffer copy / swapping is signaled it should be done here. - if (cm->frame_type == KEY_FRAME) { - yv12_fb[cm->new_fb_idx].flags |= VP9_GOLD_FLAG | VP9_ALT_FLAG; - - yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG; - yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG; - - cm->alt_fb_idx = cm->gld_fb_idx = cm->new_fb_idx; - } else { /* For non key frames */ - if (cm->refresh_alt_ref_frame) { - assert(!cm->copy_buffer_to_arf); - - cm->yv12_fb[cm->new_fb_idx].flags |= VP9_ALT_FLAG; - cm->yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG; - cm->alt_fb_idx = cm->new_fb_idx; - } else if (cm->copy_buffer_to_arf) { - assert(!(cm->copy_buffer_to_arf & ~0x3)); - - if (cm->copy_buffer_to_arf == 1) { - if (cm->alt_fb_idx != cm->lst_fb_idx) { - yv12_fb[cm->lst_fb_idx].flags |= VP9_ALT_FLAG; - yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG; - cm->alt_fb_idx = cm->lst_fb_idx; - } - } else { /* if (cm->copy_buffer_to_arf == 2) */ - if (cm->alt_fb_idx != cm->gld_fb_idx) { - yv12_fb[cm->gld_fb_idx].flags |= VP9_ALT_FLAG; - yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG; - cm->alt_fb_idx = cm->gld_fb_idx; - } - } + ref_cnt_fb(cm->fb_idx_ref_cnt, + &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx); + ref_cnt_fb(cm->fb_idx_ref_cnt, + &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx); + } else if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) { + /* Preserve the previously existing golden frame and update the frame in + * the alt ref slot instead. This is highly specific to the current use of + * alt-ref as a forward reference, and this needs to be generalized as + * other uses are implemented (like RTC/temporal scaling) + * + * The update to the buffer in the alt ref slot was signalled in + * vp9_pack_bitstream(), now swap the buffer pointers so that it's treated + * as the golden frame next time. 
+ */ + int tmp; + + ref_cnt_fb(cm->fb_idx_ref_cnt, + &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx); + + tmp = cpi->alt_fb_idx; + cpi->alt_fb_idx = cpi->gld_fb_idx; + cpi->gld_fb_idx = tmp; + } else { /* For non key/golden frames */ + if (cpi->refresh_alt_ref_frame) { + ref_cnt_fb(cm->fb_idx_ref_cnt, + &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx); } - if (cm->refresh_golden_frame) { - assert(!cm->copy_buffer_to_gf); - - cm->yv12_fb[cm->new_fb_idx].flags |= VP9_GOLD_FLAG; - cm->yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG; - cm->gld_fb_idx = cm->new_fb_idx; - } else if (cm->copy_buffer_to_gf) { - assert(!(cm->copy_buffer_to_arf & ~0x3)); - - if (cm->copy_buffer_to_gf == 1) { - if (cm->gld_fb_idx != cm->lst_fb_idx) { - yv12_fb[cm->lst_fb_idx].flags |= VP9_GOLD_FLAG; - yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG; - cm->gld_fb_idx = cm->lst_fb_idx; - } - } else { /* if (cm->copy_buffer_to_gf == 2) */ - if (cm->alt_fb_idx != cm->gld_fb_idx) { - yv12_fb[cm->alt_fb_idx].flags |= VP9_GOLD_FLAG; - yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG; - cm->gld_fb_idx = cm->alt_fb_idx; - } - } + if (cpi->refresh_golden_frame) { + ref_cnt_fb(cm->fb_idx_ref_cnt, + &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx); } } - if (cm->refresh_last_frame) { - cm->yv12_fb[cm->new_fb_idx].flags |= VP9_LAST_FLAG; - cm->yv12_fb[cm->lst_fb_idx].flags &= ~VP9_LAST_FLAG; - cm->lst_fb_idx = cm->new_fb_idx; + if (cpi->refresh_last_frame) { + ref_cnt_fb(cm->fb_idx_ref_cnt, + &cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx); } } static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { - if (cm->no_lpf) { + if (cm->no_lpf || cpi->mb.e_mbd.lossless) { cm->filter_level = 0; - } -#if CONFIG_LOSSLESS - else if (cpi->oxcf.lossless) { - cm->filter_level = 0; - } -#endif - else { + } else { struct vpx_usec_timer timer; vp9_clear_system_state(); @@ -2666,7 +2535,7 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { } -void select_interp_filter_type(VP9_COMP *cpi) { +void vp9_select_interp_filter_type(VP9_COMP *cpi) { int i; int high_filter_index = 0; unsigned int thresh; @@ -2719,6 +2588,38 @@ static void select_interintra_mode(VP9_COMP *cpi) { } #endif +static void scale_references(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + int i; + + for (i = 0; i < 3; i++) { + YV12_BUFFER_CONFIG *ref = &cm->yv12_fb[cm->ref_frame_map[i]]; + + if (ref->y_width != cm->mb_cols * 16 || ref->y_height != cm->mb_rows * 16) { + int new_fb = get_free_fb(cm); + + vp8_yv12_realloc_frame_buffer(&cm->yv12_fb[new_fb], + cm->mb_cols * 16, + cm->mb_rows * 16, + VP9BORDERINPIXELS); + scale_and_extend_frame(ref, &cm->yv12_fb[new_fb]); + cpi->scaled_ref_idx[i] = new_fb; + } else { + cpi->scaled_ref_idx[i] = cm->ref_frame_map[i]; + cm->fb_idx_ref_cnt[cm->ref_frame_map[i]]++; + } + } +} + +static void release_scaled_references(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + int i; + + for (i = 0; i < 3; i++) { + cm->fb_idx_ref_cnt[cpi->scaled_ref_idx[i]]--; + } +} + static void encode_frame_to_data_rate(VP9_COMP *cpi, unsigned long *size, unsigned char *dest, @@ -2735,8 +2636,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, int q_low; int q_high; - int zbin_oq_high; - int zbin_oq_low = 0; int top_index; int bottom_index; @@ -2749,11 +2648,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, #if RESET_FOREACH_FILTER int q_low0; int q_high0; - int zbin_oq_high0; - int zbin_oq_low0 = 0; int Q0; - int last_zbin_oq; - int last_zbin_oq0; int active_best_quality0; int active_worst_quality0; double 
rate_correction_factor0; @@ -2773,36 +2668,43 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, int mcomp_filter_index = 0; int64_t mcomp_filter_cost[4]; + /* Scale the source buffer, if required */ + if (cm->mb_cols * 16 != cpi->un_scaled_source->y_width || + cm->mb_rows * 16 != cpi->un_scaled_source->y_height) { + scale_and_extend_frame(cpi->un_scaled_source, &cpi->scaled_source); + cpi->Source = &cpi->scaled_source; + } else { + cpi->Source = cpi->un_scaled_source; + } + + scale_references(cpi); + // Clear down mmx registers to allow floating point in what follows vp9_clear_system_state(); // For an alt ref frame in 2 pass we skip the call to the second // pass function that sets the target bandwidth so must set it here - if (cpi->common.refresh_alt_ref_frame) { + if (cpi->refresh_alt_ref_frame) { cpi->per_frame_bandwidth = cpi->twopass.gf_bits; // Per frame bit target for the alt ref frame // per second target bitrate cpi->target_bandwidth = (int)(cpi->twopass.gf_bits * cpi->output_frame_rate); } - // Default turn off buffer to buffer copying - cm->copy_buffer_to_gf = 0; - cm->copy_buffer_to_arf = 0; - // Clear zbin over-quant value and mode boost values. - cpi->zbin_over_quant = 0; cpi->zbin_mode_boost = 0; // Enable or disable mode based tweaking of the zbin // For 2 Pass Only used where GF/ARF prediction quality // is above a threshold cpi->zbin_mode_boost = 0; -#if CONFIG_LOSSLESS - cpi->zbin_mode_boost_enabled = FALSE; -#else - cpi->zbin_mode_boost_enabled = TRUE; -#endif + + if (cpi->oxcf.lossless) + cpi->zbin_mode_boost_enabled = FALSE; + else + cpi->zbin_mode_boost_enabled = TRUE; + if (cpi->gfu_boost <= 400) { cpi->zbin_mode_boost_enabled = FALSE; } @@ -2846,10 +2748,22 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, for (i = 0; i < MAX_MODES; i++) { cpi->rd_thresh_mult[i] = 128; } + + cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0); + cm->frame_parallel_decoding_mode = + (cpi->oxcf.frame_parallel_decoding_mode != 0); + if (cm->error_resilient_mode) { + cm->frame_parallel_decoding_mode = 1; + cm->refresh_entropy_probs = 0; + } } - // Test code for new segment features - init_seg_features(cpi); + // Configure use of segmentation for enhanced coding of static regions. + // Only allowed for now in second pass of two pass (as it requires lagged coding) + // and if the relevant speed feature flag is set.
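Before the segmentation hunk below, one invariant from the error-resilience wiring just added is worth restating: an error-resilient stream implies frame-parallel decodability, which in turn rules out backward entropy updates (later hunks gate vp9_adapt_coef_probs() and friends on both flags). A hedged standalone restatement of that rule, with an illustrative struct rather than the real VP9_COMMON:

#include <assert.h>

/* Stand-in for the three coupled flags on VP9_COMMON. */
struct stream_flags {
  int error_resilient_mode;
  int frame_parallel_decoding_mode;
  int refresh_entropy_probs;
};

/* Mirror of the rule in encode_frame_to_data_rate(): error resilience
 * forces frame-parallel decoding and disables backward probability
 * updates, so a lost frame cannot poison later entropy contexts. */
static void apply_error_resilient_rules(struct stream_flags *f) {
  if (f->error_resilient_mode) {
    f->frame_parallel_decoding_mode = 1;
    f->refresh_entropy_probs = 0;
  }
}

int main(void) {
  struct stream_flags f = { 1, 0, 1 };
  apply_error_resilient_rules(&f);
  assert(f.frame_parallel_decoding_mode == 1);
  assert(f.refresh_entropy_probs == 0);
  return 0;
}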
+ if ((cpi->pass == 2) && (cpi->sf.static_segmentation)) { + configure_static_seg_features(cpi); + } // Decide how big to make the frame vp9_pick_frame_size(cpi); @@ -2896,9 +2810,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, if (cpi->active_best_quality < cpi->best_quality) cpi->active_best_quality = cpi->best_quality; } - } - - else if (cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame) { + } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) { int high = 2000; int low = 400; @@ -2971,17 +2883,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // Determine initial Q to try Q = vp9_regulate_q(cpi, cpi->this_frame_target); } -#if RESET_FOREACH_FILTER - last_zbin_oq = cpi->zbin_over_quant; -#endif - - // Set highest allowed value for Zbin over quant - if (cm->frame_type == KEY_FRAME) - zbin_oq_high = 0; // ZBIN_OQ_MAX/16 - else if (cm->refresh_alt_ref_frame || (cm->refresh_golden_frame && !cpi->source_alt_ref_active)) - zbin_oq_high = 16; - else - zbin_oq_high = ZBIN_OQ_MAX; vp9_compute_frame_size_bounds(cpi, &frame_under_shoot_limit, &frame_over_shoot_limit); @@ -3064,9 +2965,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, q_low0 = q_low; q_high0 = q_high; Q0 = Q; - zbin_oq_low0 = zbin_oq_low; - zbin_oq_high0 = zbin_oq_high; - last_zbin_oq0 = last_zbin_oq; rate_correction_factor0 = cpi->rate_correction_factor; gf_rate_correction_factor0 = cpi->gf_rate_correction_factor; active_best_quality0 = cpi->active_best_quality; @@ -3087,12 +2985,12 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cm->mbskip_pred_probs[k] = cpi->base_skip_false_prob[Q][k]; if (cm->frame_type != KEY_FRAME) { - if (cpi->common.refresh_alt_ref_frame) { + if (cpi->refresh_alt_ref_frame) { for (k = 0; k < MBSKIP_CONTEXTS; k++) { if (cpi->last_skip_false_probs[2][k] != 0) cm->mbskip_pred_probs[k] = cpi->last_skip_false_probs[2][k]; } - } else if (cpi->common.refresh_golden_frame) { + } else if (cpi->refresh_golden_frame) { for (k = 0; k < MBSKIP_CONTEXTS; k++) { if (cpi->last_skip_false_probs[1][k] != 0) cm->mbskip_pred_probs[k] = cpi->last_skip_false_probs[1][k]; @@ -3124,10 +3022,21 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, } // Set up entropy depending on frame type. - if (cm->frame_type == KEY_FRAME) + if (cm->frame_type == KEY_FRAME) { + /* Choose which entropy context to use. When using a forward reference + * frame, it immediately follows the keyframe, and thus benefits from + * using the same entropy context established by the keyframe. Otherwise, + * use the default context 0. + */ + cm->frame_context_idx = cpi->oxcf.play_alternate; vp9_setup_key_frame(cpi); - else + } else { + /* Choose which entropy context to use. Currently there are only two + * contexts used, one for normal frames and one for alt ref frames. + */ + cpi->common.frame_context_idx = cpi->refresh_alt_ref_frame; vp9_setup_inter_frame(cpi); + } } // transform / motion compensation build reconstruction frame @@ -3214,23 +3123,12 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, if (cpi->projected_frame_size > cpi->this_frame_target) { q_low = (Q < q_high) ? (Q + 1) : q_high; // Raise Qlow as to at least the current value - if (cpi->zbin_over_quant > 0) // If we are using over quant do the same for zbin_oq_low - zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high; - if (undershoot_seen || (loop_count > 1)) { // Update rate_correction_factor unless cpi->active_worst_quality has changed. 
if (!active_worst_qchanged) vp9_update_rate_correction_factors(cpi, 1); Q = (q_high + q_low + 1) / 2; - - // Adjust cpi->zbin_over_quant (only allowed when Q is max) - if (Q < MAXQ) - cpi->zbin_over_quant = 0; - else { - zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high; - cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2; - } } else { // Update rate_correction_factor unless cpi->active_worst_quality has changed. if (!active_worst_qchanged) @@ -3238,7 +3136,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, Q = vp9_regulate_q(cpi, cpi->this_frame_target); - while (((Q < q_low) || (cpi->zbin_over_quant < zbin_oq_low)) && (Retries < 10)) { + while ((Q < q_low) && (Retries < 10)) { vp9_update_rate_correction_factors(cpi, 0); Q = vp9_regulate_q(cpi, cpi->this_frame_target); Retries++; @@ -3249,10 +3147,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, } // Frame is too small else { - if (cpi->zbin_over_quant == 0) - q_high = (Q > q_low) ? (Q - 1) : q_low; // Lower q_high if not using over quant - else // else lower zbin_oq_high - zbin_oq_high = (cpi->zbin_over_quant > zbin_oq_low) ? (cpi->zbin_over_quant - 1) : zbin_oq_low; + q_high = (Q > q_low) ? (Q - 1) : q_low; if (overshoot_seen || (loop_count > 1)) { // Update rate_correction_factor unless cpi->active_worst_quality has changed. @@ -3260,12 +3155,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, vp9_update_rate_correction_factors(cpi, 1); Q = (q_high + q_low) / 2; - - // Adjust cpi->zbin_over_quant (only allowed when Q is max) - if (Q < MAXQ) - cpi->zbin_over_quant = 0; - else - cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2; } else { // Update rate_correction_factor unless cpi->active_worst_quality has changed. if (!active_worst_qchanged) @@ -3282,7 +3171,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, q_low = Q; } - while (((Q > q_high) || (cpi->zbin_over_quant > zbin_oq_high)) && (Retries < 10)) { + while ((Q > q_high) && (Retries < 10)) { vp9_update_rate_correction_factors(cpi, 0); Q = vp9_regulate_q(cpi, cpi->this_frame_target); Retries++; @@ -3298,16 +3187,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, else if (Q < q_low) Q = q_low; - // Clamp cpi->zbin_over_quant - cpi->zbin_over_quant = (cpi->zbin_over_quant < zbin_oq_low) ? - zbin_oq_low : (cpi->zbin_over_quant > zbin_oq_high) ? - zbin_oq_high : cpi->zbin_over_quant; - - // Loop = ((Q != last_q) || (last_zbin_oq != cpi->zbin_over_quant)) ? TRUE : FALSE; Loop = ((Q != last_q)) ? 
TRUE : FALSE; -#if RESET_FOREACH_FILTER - last_zbin_oq = cpi->zbin_over_quant; -#endif } else Loop = FALSE; @@ -3351,12 +3231,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, if (Loop == TRUE) { overshoot_seen = FALSE; undershoot_seen = FALSE; - zbin_oq_low = zbin_oq_low0; - zbin_oq_high = zbin_oq_high0; q_low = q_low0; q_high = q_high0; Q = Q0; - cpi->zbin_over_quant = last_zbin_oq = last_zbin_oq0; cpi->rate_correction_factor = rate_correction_factor0; cpi->gf_rate_correction_factor = gf_rate_correction_factor0; cpi->active_best_quality = active_best_quality0; @@ -3412,12 +3289,18 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, vp9_update_gf_useage_maps(cpi, cm, &cpi->mb); if (cm->frame_type == KEY_FRAME) - cm->refresh_last_frame = 1; + cpi->refresh_last_frame = 1; #if 0 { FILE *f = fopen("gfactive.stt", "a"); - fprintf(f, "%8d %8d %8d %8d %8d\n", cm->current_video_frame, (100 * cpi->gf_active_count) / (cpi->common.mb_rows * cpi->common.mb_cols), cpi->this_iiratio, cpi->next_iiratio, cm->refresh_golden_frame); + fprintf(f, "%8d %8d %8d %8d %8d\n", + cm->current_video_frame, + (100 * cpi->gf_active_count) + / (cpi->common.mb_rows * cpi->common.mb_cols), + cpi->this_iiratio, + cpi->next_iiratio, + cpi->refresh_golden_frame); fclose(f); } #endif @@ -3444,18 +3327,15 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, update_reference_segmentation_map(cpi); } - update_reference_frames(cm); + release_scaled_references(cpi); + update_reference_frames(cpi); vp9_copy(cpi->common.fc.coef_counts_4x4, cpi->coef_counts_4x4); - vp9_copy(cpi->common.fc.hybrid_coef_counts_4x4, - cpi->hybrid_coef_counts_4x4); vp9_copy(cpi->common.fc.coef_counts_8x8, cpi->coef_counts_8x8); - vp9_copy(cpi->common.fc.hybrid_coef_counts_8x8, - cpi->hybrid_coef_counts_8x8); vp9_copy(cpi->common.fc.coef_counts_16x16, cpi->coef_counts_16x16); - vp9_copy(cpi->common.fc.hybrid_coef_counts_16x16, - cpi->hybrid_coef_counts_16x16); vp9_copy(cpi->common.fc.coef_counts_32x32, cpi->coef_counts_32x32); - vp9_adapt_coef_probs(&cpi->common); + if (!cpi->common.error_resilient_mode && + !cpi->common.frame_parallel_decoding_mode) + vp9_adapt_coef_probs(&cpi->common); if (cpi->common.frame_type != KEY_FRAME) { vp9_copy(cpi->common.fc.sb_ymode_counts, cpi->sb_ymode_count); vp9_copy(cpi->common.fc.ymode_counts, cpi->ymode_count); @@ -3467,14 +3347,13 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, #if CONFIG_COMP_INTERINTRA_PRED vp9_copy(cpi->common.fc.interintra_counts, cpi->interintra_count); #endif - vp9_adapt_mode_probs(&cpi->common); - cpi->common.fc.NMVcount = cpi->NMVcount; - /* - printf("2: %d %d %d %d\n", cpi->NMVcount.joints[0], cpi->NMVcount.joints[1], - cpi->NMVcount.joints[2], cpi->NMVcount.joints[3]); - */ - vp9_adapt_nmv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv); + if (!cpi->common.error_resilient_mode && + !cpi->common.frame_parallel_decoding_mode) { + vp9_adapt_mode_probs(&cpi->common); + vp9_adapt_mode_context(&cpi->common); + vp9_adapt_nmv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv); + } } #if CONFIG_COMP_INTERINTRA_PRED if (cm->frame_type != KEY_FRAME) @@ -3502,8 +3381,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, if ((cm->base_qindex < cpi->last_boosted_qindex) || ((cpi->static_mb_pct < 100) && ((cm->frame_type == KEY_FRAME) || - cm->refresh_alt_ref_frame || - (cm->refresh_golden_frame && !cpi->is_src_frame_alt_ref)))) { + cpi->refresh_alt_ref_frame || + (cpi->refresh_golden_frame && !cpi->is_src_frame_alt_ref)))) { cpi->last_boosted_qindex = 
cm->base_qindex; } @@ -3516,7 +3395,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex + cm->base_qindex) >> 2; // Keep a record from which we can calculate the average Q excluding GF updates and key frames - if ((cm->frame_type != KEY_FRAME) && !cm->refresh_golden_frame && !cm->refresh_alt_ref_frame) { + if ((cm->frame_type != KEY_FRAME) + && !cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) { cpi->ni_frames++; cpi->tot_q += vp9_convert_qindex_to_q(Q); cpi->avg_q = cpi->tot_q / (double)cpi->ni_frames; @@ -3538,11 +3418,19 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size) cpi->bits_off_target = cpi->oxcf.maximum_buffer_size; - // Rolling monitors of whether we are over or underspending used to help regulate min and Max Q in two pass. - cpi->rolling_target_bits = ((cpi->rolling_target_bits * 3) + cpi->this_frame_target + 2) / 4; - cpi->rolling_actual_bits = ((cpi->rolling_actual_bits * 3) + cpi->projected_frame_size + 2) / 4; - cpi->long_rolling_target_bits = ((cpi->long_rolling_target_bits * 31) + cpi->this_frame_target + 16) / 32; - cpi->long_rolling_actual_bits = ((cpi->long_rolling_actual_bits * 31) + cpi->projected_frame_size + 16) / 32; + // Rolling monitors of whether we are over or underspending used to help + // regulate min and Max Q in two pass. + if (cm->frame_type != KEY_FRAME) { + cpi->rolling_target_bits = + ((cpi->rolling_target_bits * 3) + cpi->this_frame_target + 2) / 4; + cpi->rolling_actual_bits = + ((cpi->rolling_actual_bits * 3) + cpi->projected_frame_size + 2) / 4; + cpi->long_rolling_target_bits = + ((cpi->long_rolling_target_bits * 31) + cpi->this_frame_target + 16) / 32; + cpi->long_rolling_actual_bits = + ((cpi->long_rolling_actual_bits * 31) + + cpi->projected_frame_size + 16) / 32; + } // Actual bits spent cpi->total_actual_bits += cpi->projected_frame_size; @@ -3558,7 +3446,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, if (cpi->twopass.kf_group_bits < 0) cpi->twopass.kf_group_bits = 0; - } else if (cm->refresh_golden_frame || cm->refresh_alt_ref_frame) { + } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) { cpi->twopass.gf_group_bits += cpi->this_frame_target - cpi->projected_frame_size; if (cpi->twopass.gf_group_bits < 0) @@ -3582,7 +3470,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, if (cpi->twopass.total_left_stats->coded_error != 0.0) fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d" "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f" - "%6d %5d %5d %5d %8d %8.2f %10d %10.3f" + "%6d %5d %5d %5d %8.2f %10d %10.3f" "%10.3f %8d %10d %10d %10d\n", cpi->common.current_video_frame, cpi->this_frame_target, cpi->projected_frame_size, 0, //loop_size_estimate, @@ -3597,9 +3485,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cpi->avg_q, vp9_convert_qindex_to_q(cpi->ni_av_qi), vp9_convert_qindex_to_q(cpi->cq_target_quality), - cpi->zbin_over_quant, - // cpi->avg_frame_qindex, cpi->zbin_over_quant, - cm->refresh_golden_frame, cm->refresh_alt_ref_frame, + cpi->refresh_golden_frame, cpi->refresh_alt_ref_frame, cm->frame_type, cpi->gfu_boost, cpi->twopass.est_max_qcorrection_factor, (int)cpi->twopass.bits_left, @@ -3611,7 +3497,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, else fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d" "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f" - "%6d %5d %5d %5d %8d %8.2f %10d %10.3f" + "%5d %5d %8d %8d %8.2f %10d %10.3f" "%8d %10d %10d %10d\n", 
cpi->common.current_video_frame, cpi->this_frame_target, cpi->projected_frame_size, @@ -3627,9 +3513,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cpi->avg_q, vp9_convert_qindex_to_q(cpi->ni_av_qi), vp9_convert_qindex_to_q(cpi->cq_target_quality), - cpi->zbin_over_quant, - // cpi->avg_frame_qindex, cpi->zbin_over_quant, - cm->refresh_golden_frame, cm->refresh_alt_ref_frame, + cpi->refresh_golden_frame, cpi->refresh_alt_ref_frame, cm->frame_type, cpi->gfu_boost, cpi->twopass.est_max_qcorrection_factor, (int)cpi->twopass.bits_left, @@ -3645,8 +3529,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, fprintf(fmodes, "%6d:%1d:%1d:%1d ", cpi->common.current_video_frame, - cm->frame_type, cm->refresh_golden_frame, - cm->refresh_alt_ref_frame); + cm->frame_type, cpi->refresh_golden_frame, + cpi->refresh_alt_ref_frame); for (i = 0; i < MAX_MODES; i++) fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]); @@ -3665,33 +3549,34 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, #endif // If this was a kf or Gf note the Q - if ((cm->frame_type == KEY_FRAME) || cm->refresh_golden_frame || cm->refresh_alt_ref_frame) + if ((cm->frame_type == KEY_FRAME) + || cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) cm->last_kf_gf_q = cm->base_qindex; - if (cm->refresh_golden_frame == 1) + if (cpi->refresh_golden_frame == 1) cm->frame_flags = cm->frame_flags | FRAMEFLAGS_GOLDEN; else cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_GOLDEN; - if (cm->refresh_alt_ref_frame == 1) + if (cpi->refresh_alt_ref_frame == 1) cm->frame_flags = cm->frame_flags | FRAMEFLAGS_ALTREF; else cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_ALTREF; - if (cm->refresh_last_frame & cm->refresh_golden_frame) // both refreshed + if (cpi->refresh_last_frame & cpi->refresh_golden_frame) cpi->gold_is_last = 1; - else if (cm->refresh_last_frame ^ cm->refresh_golden_frame) // 1 refreshed but not the other + else if (cpi->refresh_last_frame ^ cpi->refresh_golden_frame) cpi->gold_is_last = 0; - if (cm->refresh_last_frame & cm->refresh_alt_ref_frame) // both refreshed + if (cpi->refresh_last_frame & cpi->refresh_alt_ref_frame) cpi->alt_is_last = 1; - else if (cm->refresh_last_frame ^ cm->refresh_alt_ref_frame) // 1 refreshed but not the other + else if (cpi->refresh_last_frame ^ cpi->refresh_alt_ref_frame) cpi->alt_is_last = 0; - if (cm->refresh_alt_ref_frame & cm->refresh_golden_frame) // both refreshed + if (cpi->refresh_alt_ref_frame & cpi->refresh_golden_frame) cpi->gold_is_alt = 1; - else if (cm->refresh_alt_ref_frame ^ cm->refresh_golden_frame) // 1 refreshed but not the other + else if (cpi->refresh_alt_ref_frame ^ cpi->refresh_golden_frame) cpi->gold_is_alt = 0; cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG; @@ -3705,7 +3590,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, if (cpi->gold_is_alt) cpi->ref_frame_flags &= ~VP9_ALT_FLAG; - if (cpi->oxcf.play_alternate && cm->refresh_alt_ref_frame && (cm->frame_type != KEY_FRAME)) + if (cpi->oxcf.play_alternate && cpi->refresh_alt_ref_frame + && (cm->frame_type != KEY_FRAME)) // Update the alternate reference frame stats as appropriate. 
update_alt_ref_frame_stats(cpi); else @@ -3727,6 +3613,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, xd->update_mb_segmentation_data = 0; xd->mode_ref_lf_delta_update = 0; + // keep track of the last coded dimensions + cm->last_width = cm->Width; + cm->last_height = cm->Height; // Don't increment frame counters if this was an altref buffer update, not a real frame if (cm->show_frame) { @@ -3744,8 +3633,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, FILE *recon_file; sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame); recon_file = fopen(filename, "wb"); - fwrite(cm->yv12_fb[cm->lst_fb_idx].buffer_alloc, - cm->yv12_fb[cm->lst_fb_idx].frame_size, 1, recon_file); + fwrite(cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]].buffer_alloc, + cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]].frame_size, + 1, recon_file); fclose(recon_file); } #endif @@ -3765,13 +3655,18 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, static void Pass2Encode(VP9_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned int *frame_flags) { - if (!cpi->common.refresh_alt_ref_frame) + if (!cpi->refresh_alt_ref_frame) vp9_second_pass(cpi); encode_frame_to_data_rate(cpi, size, dest, frame_flags); + +#ifdef DISABLE_RC_LONG_TERM_MEM + cpi->twopass.bits_left -= cpi->this_frame_target; +#else cpi->twopass.bits_left -= 8 * *size; +#endif - if (!cpi->common.refresh_alt_ref_frame) { + if (!cpi->refresh_alt_ref_frame) { double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.frame_rate; double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100); @@ -3808,9 +3703,8 @@ static int frame_is_reference(const VP9_COMP *cpi) { const VP9_COMMON *cm = &cpi->common; const MACROBLOCKD *xd = &cpi->mb.e_mbd; - return cm->frame_type == KEY_FRAME || cm->refresh_last_frame - || cm->refresh_golden_frame || cm->refresh_alt_ref_frame - || cm->copy_buffer_to_gf || cm->copy_buffer_to_arf + return cm->frame_type == KEY_FRAME || cpi->refresh_last_frame + || cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame || cm->refresh_entropy_probs || xd->mode_ref_lf_delta_update || xd->update_mb_segmentation_map || xd->update_mb_segmentation_data; @@ -3846,9 +3740,9 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, force_src_buffer = &cpi->alt_ref_buffer; } cm->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due; - cm->refresh_alt_ref_frame = 1; - cm->refresh_golden_frame = 0; - cm->refresh_last_frame = 0; + cpi->refresh_alt_ref_frame = 1; + cpi->refresh_golden_frame = 0; + cpi->refresh_last_frame = 0; cm->show_frame = 0; cpi->source_alt_ref_pending = FALSE; // Clear pending alt ref flag.
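Note on the Pass2Encode hunk above: when the experimental DISABLE_RC_LONG_TERM_MEM switch is defined, the two-pass budget in cpi->twopass.bits_left is debited by the frame's planned target instead of its actual coded size, so rate-control error no longer accumulates across the clip. A minimal sketch of the two debit policies, with an illustrative helper name (debit_bits_left is not a libvpx function):

    #include <stddef.h>
    #include <stdint.h>

    /* Charge one coded frame against the remaining two-pass bit budget.
     * Default policy: charge the actual coded size, so this frame's
     * over/undershoot feeds back into later targets (long-term memory).
     * Experimental policy: charge only the planned target, keeping the
     * remaining budget independent of past coding error. */
    static void debit_bits_left(int64_t *bits_left, int this_frame_target,
                                size_t coded_bytes,
                                int disable_long_term_mem) {
      if (disable_long_term_mem)
        *bits_left -= this_frame_target;         /* charge the plan */
      else
        *bits_left -= 8 * (int64_t)coded_bytes;  /* charge reality */
    }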
cpi->is_src_frame_alt_ref = 0; @@ -3889,7 +3783,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, } // adjust frame rates based on timestamps given - if (!cm->refresh_alt_ref_frame) { + if (!cpi->refresh_alt_ref_frame) { int64_t this_duration; int step = 0; @@ -3945,28 +3839,34 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, #if 0 - if (cm->refresh_alt_ref_frame) { - // cm->refresh_golden_frame = 1; - cm->refresh_golden_frame = 0; - cm->refresh_last_frame = 0; + if (cpi->refresh_alt_ref_frame) { + // cpi->refresh_golden_frame = 1; + cpi->refresh_golden_frame = 0; + cpi->refresh_last_frame = 0; } else { - cm->refresh_golden_frame = 0; - cm->refresh_last_frame = 1; + cpi->refresh_golden_frame = 0; + cpi->refresh_last_frame = 1; } #endif - /* find a free buffer for the new frame */ - { - int i = 0; - for (; i < NUM_YV12_BUFFERS; i++) { - if (!cm->yv12_fb[i].flags) { - cm->new_fb_idx = i; - break; - } - } - assert(i < NUM_YV12_BUFFERS); - } + /* find a free buffer for the new frame, releasing the reference previously * held. */ + cm->fb_idx_ref_cnt[cm->new_fb_idx]--; + cm->new_fb_idx = get_free_fb(cm); + + /* Get the mapping of L/G/A to the reference buffer pool */ + cm->active_ref_idx[0] = cm->ref_frame_map[cpi->lst_fb_idx]; + cm->active_ref_idx[1] = cm->ref_frame_map[cpi->gld_fb_idx]; + cm->active_ref_idx[2] = cm->ref_frame_map[cpi->alt_fb_idx]; + + /* Reset the frame pointers to the current frame size */ + vp8_yv12_realloc_frame_buffer(&cm->yv12_fb[cm->new_fb_idx], + cm->mb_cols * 16, cm->mb_rows * 16, + VP9BORDERINPIXELS); + + vp9_setup_interp_filters(&cpi->mb.e_mbd, DEFAULT_INTERP_FILTER, cm); if (cpi->pass == 1) { Pass1Encode(cpi, size, dest, frame_flags); } else if (cpi->pass == 2) { @@ -3976,10 +3876,8 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, } if (cm->refresh_entropy_probs) { - if (cm->refresh_alt_ref_frame) - vpx_memcpy(&cm->lfc_a, &cm->fc, sizeof(cm->fc)); - else - vpx_memcpy(&cm->lfc, &cm->fc, sizeof(cm->fc)); + vpx_memcpy(&cm->frame_contexts[cm->frame_context_idx], &cm->fc, + sizeof(cm->fc)); } // if it's a dropped frame, honor the requests on subsequent frames @@ -3988,9 +3886,9 @@ // return to normal state cm->refresh_entropy_probs = 1; - cm->refresh_alt_ref_frame = 0; - cm->refresh_golden_frame = 0; - cm->refresh_last_frame = 1; + cpi->refresh_alt_ref_frame = 0; + cpi->refresh_golden_frame = 0; + cpi->refresh_last_frame = 1; cm->frame_type = INTER_FRAME; } @@ -4113,7 +4011,7 @@ int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags) { VP9_COMP *cpi = (VP9_COMP *) comp; - if (cpi->common.refresh_alt_ref_frame) + if (cpi->refresh_alt_ref_frame) return -1; else { int ret; @@ -4217,17 +4115,31 @@ int vp9_set_active_map(VP9_PTR comp, unsigned char *map, int vp9_set_internal_size(VP9_PTR comp, VPX_SCALING horiz_mode, VPX_SCALING vert_mode) { VP9_COMP *cpi = (VP9_COMP *) comp; + VP9_COMMON *cm = &cpi->common; - if (horiz_mode <= ONETWO) - cpi->common.horiz_scale = horiz_mode; - else + if (horiz_mode > ONETWO) return -1; - if (vert_mode <= ONETWO) - cpi->common.vert_scale = vert_mode; - else + if (vert_mode > ONETWO) return -1; + if (cm->horiz_scale != horiz_mode || cm->vert_scale != vert_mode) { + int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs); + int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs); + + cm->horiz_scale = horiz_mode; + cm->vert_scale = vert_mode; + +
Scale2Ratio(cm->horiz_scale, &hr, &hs); + Scale2Ratio(cm->vert_scale, &vr, &vs); + + // always go to the next whole number + cm->Width = (hs - 1 + cpi->oxcf.Width * hr) / hs; + cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs; + } + assert(cm->Width <= cpi->initial_width); + assert(cm->Height <= cpi->initial_height); + update_frame_size(cpi); return 0; } diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h index 74a58b430..02a371964 100644 --- a/vp9/encoder/vp9_onyx_int.h +++ b/vp9/encoder/vp9_onyx_int.h @@ -29,6 +29,10 @@ #include "vp9/common/vp9_findnearmv.h" #include "vp9/encoder/vp9_lookahead.h" +// Experimental rate control switches +// #define ONE_SHOT_Q_ESTIMATE 1 +// #define DISABLE_RC_LONG_TERM_MEM 1 + // #define SPEEDSTATS 1 #define MIN_GF_INTERVAL 4 #define DEFAULT_GF_INTERVAL 7 @@ -53,7 +57,6 @@ #define GF_ZEROMV_ZBIN_BOOST 12 #define LF_ZEROMV_ZBIN_BOOST 6 #define MV_ZBIN_BOOST 4 -#define ZBIN_OQ_MAX 192 #define VP9_TEMPORAL_ALT_REF 1 @@ -86,12 +89,9 @@ typedef struct { // 0 = BPRED, ZERO_MV, MV, SPLIT signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS]; - vp9_coeff_probs coef_probs_4x4[BLOCK_TYPES_4X4]; - vp9_coeff_probs hybrid_coef_probs_4x4[BLOCK_TYPES_4X4]; - vp9_coeff_probs coef_probs_8x8[BLOCK_TYPES_8X8]; - vp9_coeff_probs hybrid_coef_probs_8x8[BLOCK_TYPES_8X8]; - vp9_coeff_probs coef_probs_16x16[BLOCK_TYPES_16X16]; - vp9_coeff_probs hybrid_coef_probs_16x16[BLOCK_TYPES_16X16]; + vp9_coeff_probs coef_probs_4x4[BLOCK_TYPES]; + vp9_coeff_probs coef_probs_8x8[BLOCK_TYPES]; + vp9_coeff_probs coef_probs_16x16[BLOCK_TYPES]; vp9_coeff_probs coef_probs_32x32[BLOCK_TYPES_32X32]; vp9_prob sb_ymode_prob[VP9_I32X32_MODES - 1]; @@ -259,7 +259,9 @@ typedef struct { int optimize_coefficients; int no_skip_block4x4_search; int search_best_filter; - + int splitmode_breakout; + int mb16_breakout; + int static_segmentation; } SPEED_FEATURES; typedef struct { @@ -301,41 +303,14 @@ typedef struct VP9_COMP { DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, short, Y2quant[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, unsigned char, Y2quant_shift[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, short, Y2zbin[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, short, Y2round[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, unsigned char, UVquant_shift[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]); - DECLARE_ALIGNED(64, short, Y1zbin_8x8[QINDEX_RANGE][64]); - DECLARE_ALIGNED(64, short, Y2zbin_8x8[QINDEX_RANGE][64]); - DECLARE_ALIGNED(64, short, UVzbin_8x8[QINDEX_RANGE][64]); - DECLARE_ALIGNED(64, short, zrun_zbin_boost_y1_8x8[QINDEX_RANGE][64]); - DECLARE_ALIGNED(64, short, zrun_zbin_boost_y2_8x8[QINDEX_RANGE][64]); - DECLARE_ALIGNED(64, short, zrun_zbin_boost_uv_8x8[QINDEX_RANGE][64]); - - DECLARE_ALIGNED(16, short, Y1zbin_16x16[QINDEX_RANGE][256]); - DECLARE_ALIGNED(16, short, Y2zbin_16x16[QINDEX_RANGE][256]); - DECLARE_ALIGNED(16, short, UVzbin_16x16[QINDEX_RANGE][256]); - DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1_16x16[QINDEX_RANGE][256]); - DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2_16x16[QINDEX_RANGE][256]); - DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv_16x16[QINDEX_RANGE][256]); - 
- DECLARE_ALIGNED(16, short, Y1zbin_32x32[QINDEX_RANGE][1024]); - DECLARE_ALIGNED(16, short, Y2zbin_32x32[QINDEX_RANGE][1024]); - DECLARE_ALIGNED(16, short, UVzbin_32x32[QINDEX_RANGE][1024]); - DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1_32x32[QINDEX_RANGE][1024]); - DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2_32x32[QINDEX_RANGE][1024]); - DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv_32x32[QINDEX_RANGE][1024]); - MACROBLOCK mb; VP9_COMMON common; VP9_CONFIG oxcf; @@ -357,11 +332,17 @@ typedef struct VP9_COMP { int alt_is_last; // Alt reference frame same as last ( short circuit altref search) int gold_is_alt; // don't do both alt and gold search ( just do gold). - // int refresh_alt_ref_frame; + int scaled_ref_idx[3]; + int lst_fb_idx; + int gld_fb_idx; + int alt_fb_idx; + int refresh_last_frame; + int refresh_golden_frame; + int refresh_alt_ref_frame; YV12_BUFFER_CONFIG last_frame_uf; TOKENEXTRA *tok; - unsigned int tok_count; + unsigned int tok_count[1 << 6]; unsigned int frames_since_key; @@ -441,7 +422,6 @@ typedef struct VP9_COMP { double tot_q; double avg_q; - int zbin_over_quant; int zbin_mode_boost; int zbin_mode_boost_enabled; @@ -484,26 +464,17 @@ typedef struct VP9_COMP { nmv_context_counts NMVcount; - vp9_coeff_count coef_counts_4x4[BLOCK_TYPES_4X4]; - vp9_coeff_probs frame_coef_probs_4x4[BLOCK_TYPES_4X4]; - vp9_coeff_stats frame_branch_ct_4x4[BLOCK_TYPES_4X4]; - vp9_coeff_count hybrid_coef_counts_4x4[BLOCK_TYPES_4X4]; - vp9_coeff_probs frame_hybrid_coef_probs_4x4[BLOCK_TYPES_4X4]; - vp9_coeff_stats frame_hybrid_branch_ct_4x4[BLOCK_TYPES_4X4]; - - vp9_coeff_count coef_counts_8x8[BLOCK_TYPES_8X8]; - vp9_coeff_probs frame_coef_probs_8x8[BLOCK_TYPES_8X8]; - vp9_coeff_stats frame_branch_ct_8x8[BLOCK_TYPES_8X8]; - vp9_coeff_count hybrid_coef_counts_8x8[BLOCK_TYPES_8X8]; - vp9_coeff_probs frame_hybrid_coef_probs_8x8[BLOCK_TYPES_8X8]; - vp9_coeff_stats frame_hybrid_branch_ct_8x8[BLOCK_TYPES_8X8]; - - vp9_coeff_count coef_counts_16x16[BLOCK_TYPES_16X16]; - vp9_coeff_probs frame_coef_probs_16x16[BLOCK_TYPES_16X16]; - vp9_coeff_stats frame_branch_ct_16x16[BLOCK_TYPES_16X16]; - vp9_coeff_count hybrid_coef_counts_16x16[BLOCK_TYPES_16X16]; - vp9_coeff_probs frame_hybrid_coef_probs_16x16[BLOCK_TYPES_16X16]; - vp9_coeff_stats frame_hybrid_branch_ct_16x16[BLOCK_TYPES_16X16]; + vp9_coeff_count coef_counts_4x4[BLOCK_TYPES]; + vp9_coeff_probs frame_coef_probs_4x4[BLOCK_TYPES]; + vp9_coeff_stats frame_branch_ct_4x4[BLOCK_TYPES]; + + vp9_coeff_count coef_counts_8x8[BLOCK_TYPES]; + vp9_coeff_probs frame_coef_probs_8x8[BLOCK_TYPES]; + vp9_coeff_stats frame_branch_ct_8x8[BLOCK_TYPES]; + + vp9_coeff_count coef_counts_16x16[BLOCK_TYPES]; + vp9_coeff_probs frame_coef_probs_16x16[BLOCK_TYPES]; + vp9_coeff_stats frame_branch_ct_16x16[BLOCK_TYPES]; vp9_coeff_count coef_counts_32x32[BLOCK_TYPES_32X32]; vp9_coeff_probs frame_coef_probs_32x32[BLOCK_TYPES_32X32]; @@ -683,9 +654,6 @@ typedef struct VP9_COMP { int droppable; - // TODO Do we still need this?? 
- int update_context; - int dummy_packing; /* flag to indicate if packing is dummy */ unsigned int switchable_interp_count[VP9_SWITCHABLE_FILTERS + 1] @@ -696,6 +664,8 @@ typedef struct VP9_COMP { unsigned int mb_mv_ref_count[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES]; #endif + int initial_width; + int initial_height; } VP9_COMP; void vp9_encode_frame(VP9_COMP *cpi); diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c index b443ede6f..6f9333521 100644 --- a/vp9/encoder/vp9_picklpf.c +++ b/vp9/encoder/vp9_picklpf.c @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ - +#include <assert.h> #include "vp9/common/vp9_onyxc_int.h" #include "vp9/encoder/vp9_onyx_int.h" #include "vp9/encoder/vp9_picklpf.h" @@ -27,6 +27,7 @@ void vp9_yv12_copy_partial_frame_c(YV12_BUFFER_CONFIG *src_ybc, int yoffset; int linestocopy; + assert(src_ybc->y_stride == dst_ybc->y_stride); yheight = src_ybc->y_height; ystride = src_ybc->y_stride; diff --git a/vp9/encoder/vp9_psnr.c b/vp9/encoder/vp9_psnr.c index eb00f4159..94394341d 100644 --- a/vp9/encoder/vp9_psnr.c +++ b/vp9/encoder/vp9_psnr.c @@ -11,17 +11,16 @@ #include "vpx_scale/yv12config.h" #include "math.h" -#include "vp9/common/vp9_systemdependent.h" /* for vp9_clear_system_state() */ #define MAX_PSNR 100 -double vp9_mse2psnr(double Samples, double Peak, double Mse) { +double vp9_mse2psnr(double samples, double peak, double mse) { double psnr; - if ((double)Mse > 0.0) - psnr = 10.0 * log10(Peak * Peak * Samples / Mse); + if (mse > 0.0) + psnr = 10.0 * log10(peak * peak * samples / mse); else - psnr = MAX_PSNR; // Limit to prevent / 0 + psnr = MAX_PSNR; // Limit to prevent / 0 if (psnr > MAX_PSNR) psnr = MAX_PSNR; diff --git a/vp9/encoder/vp9_psnr.h b/vp9/encoder/vp9_psnr.h index 121f0dc98..15dd8366b 100644 --- a/vp9/encoder/vp9_psnr.h +++ b/vp9/encoder/vp9_psnr.h @@ -12,6 +12,6 @@ #ifndef VP9_ENCODER_VP9_PSNR_H_ #define VP9_ENCODER_VP9_PSNR_H_ -extern double vp9_mse2psnr(double Samples, double Peak, double Mse); +double vp9_mse2psnr(double samples, double peak, double mse); #endif // VP9_ENCODER_VP9_PSNR_H_ diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index 36b656713..399e8ecda 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -21,7 +21,10 @@ extern int enc_debug; #endif -void vp9_ht_quantize_b_4x4(BLOCK *b, BLOCKD *d, TX_TYPE tx_type) { +void vp9_ht_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type) { + MACROBLOCKD *const xd = &mb->e_mbd; + BLOCK *const b = &mb->block[b_idx]; + BLOCKD *const d = &xd->block[b_idx]; int i, rc, eob; int zbin; int x, y, z, sz; @@ -57,35 +60,40 @@ void vp9_ht_quantize_b_4x4(BLOCK *b, BLOCKD *d, TX_TYPE tx_type) { eob = -1; - for (i = 0; i < b->eob_max_offset; i++) { - rc = pt_scan[i]; - z = coeff_ptr[rc]; - - zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value; - zbin_boost_ptr ++; - - sz = (z >> 31); // sign of z - x = (z ^ sz) - sz; // x = abs(z) - - if (x >= zbin) { - x += round_ptr[rc]; - y = (((x * quant_ptr[rc]) >> 16) + x) - >> quant_shift_ptr[rc]; // quantize (x) - x = (y ^ sz) - sz; // get the sign back - qcoeff_ptr[rc] = x; // write to destination - dqcoeff_ptr[rc] = x * dequant_ptr[rc]; // dequantized value - - if (y) { - eob = i; // last nonzero coeffs - zbin_boost_ptr = b->zrun_zbin_boost; // reset zero runlength + if (!b->skip_block) { + for (i = 0; i < 16; i++) { + rc = pt_scan[i]; + z = coeff_ptr[rc]; + + zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value; + zbin_boost_ptr++; + + sz = (z >> 31); // sign of z + x = 
(z ^ sz) - sz; // x = abs(z) + + if (x >= zbin) { + x += round_ptr[rc]; + y = (((x * quant_ptr[rc]) >> 16) + x) + >> quant_shift_ptr[rc]; // quantize (x) + x = (y ^ sz) - sz; // get the sign back + qcoeff_ptr[rc] = x; // write to destination + dqcoeff_ptr[rc] = x * dequant_ptr[rc]; // dequantized value + + if (y) { + eob = i; // last nonzero coeffs + zbin_boost_ptr = b->zrun_zbin_boost; // reset zero runlength + } } } } - d->eob = eob + 1; + xd->eobs[b_idx] = eob + 1; } -void vp9_regular_quantize_b_4x4(BLOCK *b, BLOCKD *d) { +void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx) { + MACROBLOCKD *const xd = &mb->e_mbd; + BLOCK *const b = &mb->block[b_idx]; + BLOCKD *const d = &xd->block[b_idx]; int i, rc, eob; int zbin; int x, y, z, sz; @@ -105,64 +113,55 @@ void vp9_regular_quantize_b_4x4(BLOCK *b, BLOCKD *d) { eob = -1; - for (i = 0; i < b->eob_max_offset; i++) { - rc = vp9_default_zig_zag1d_4x4[i]; - z = coeff_ptr[rc]; + if (!b->skip_block) { + for (i = 0; i < 16; i++) { + rc = vp9_default_zig_zag1d_4x4[i]; + z = coeff_ptr[rc]; - zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value; - zbin_boost_ptr ++; + zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value; + zbin_boost_ptr++; - sz = (z >> 31); // sign of z - x = (z ^ sz) - sz; // x = abs(z) + sz = (z >> 31); // sign of z + x = (z ^ sz) - sz; // x = abs(z) - if (x >= zbin) { - x += round_ptr[rc]; + if (x >= zbin) { + x += round_ptr[rc]; - y = (((x * quant_ptr[rc]) >> 16) + x) - >> quant_shift_ptr[rc]; // quantize (x) - x = (y ^ sz) - sz; // get the sign back - qcoeff_ptr[rc] = x; // write to destination - dqcoeff_ptr[rc] = x * dequant_ptr[rc]; // dequantized value + y = (((x * quant_ptr[rc]) >> 16) + x) + >> quant_shift_ptr[rc]; // quantize (x) + x = (y ^ sz) - sz; // get the sign back + qcoeff_ptr[rc] = x; // write to destination + dqcoeff_ptr[rc] = x * dequant_ptr[rc]; // dequantized value - if (y) { - eob = i; // last nonzero coeffs - zbin_boost_ptr = b->zrun_zbin_boost; // reset zero runlength + if (y) { + eob = i; // last nonzero coeffs + zbin_boost_ptr = b->zrun_zbin_boost; // reset zero runlength + } } } } - d->eob = eob + 1; + xd->eobs[b_idx] = eob + 1; } void vp9_quantize_mby_4x4_c(MACROBLOCK *x) { int i; - int has_2nd_order = get_2nd_order_usage(&x->e_mbd); for (i = 0; i < 16; i++) { TX_TYPE tx_type = get_tx_type_4x4(&x->e_mbd, &x->e_mbd.block[i]); if (tx_type != DCT_DCT) { - assert(has_2nd_order == 0); - vp9_ht_quantize_b_4x4(&x->block[i], &x->e_mbd.block[i], tx_type); + vp9_ht_quantize_b_4x4(x, i, tx_type); } else { - x->quantize_b_4x4(&x->block[i], &x->e_mbd.block[i]); + x->quantize_b_4x4(x, i); } } - if (has_2nd_order) { - x->quantize_b_4x4(&x->block[24], &x->e_mbd.block[24]); - } else { - vpx_memset(x->e_mbd.block[24].qcoeff, 0, - 16 * sizeof(x->e_mbd.block[24].qcoeff[0])); - vpx_memset(x->e_mbd.block[24].dqcoeff, 0, - 16 * sizeof(x->e_mbd.block[24].dqcoeff[0])); - x->e_mbd.block[24].eob = 0; - } } void vp9_quantize_mbuv_4x4_c(MACROBLOCK *x) { int i; for (i = 16; i < 24; i++) - x->quantize_b_4x4(&x->block[i], &x->e_mbd.block[i]); + x->quantize_b_4x4(x, i); } void vp9_quantize_mb_4x4_c(MACROBLOCK *x) { @@ -170,138 +169,101 @@ void vp9_quantize_mb_4x4_c(MACROBLOCK *x) { vp9_quantize_mbuv_4x4_c(x); } -void vp9_regular_quantize_b_2x2(BLOCK *b, BLOCKD *d) { - int i, rc, eob; - int zbin; - int x, y, z, sz; - int16_t *zbin_boost_ptr = b->zrun_zbin_boost; - int zbin_zrun_index = 0; - int16_t *coeff_ptr = b->coeff; - int16_t *zbin_ptr = b->zbin; - int16_t *round_ptr = b->round; - int16_t *quant_ptr = b->quant; - uint8_t 
*quant_shift_ptr = b->quant_shift; - int16_t *qcoeff_ptr = d->qcoeff; - int16_t *dqcoeff_ptr = d->dqcoeff; - int16_t *dequant_ptr = d->dequant; - int zbin_oq_value = b->zbin_extra; - // double q2nd = 4; - vpx_memset(qcoeff_ptr, 0, 32); - vpx_memset(dqcoeff_ptr, 0, 32); - - eob = -1; - - for (i = 0; i < b->eob_max_offset_8x8; i++) { - rc = vp9_default_zig_zag1d_4x4[i]; - z = coeff_ptr[rc]; - - zbin_boost_ptr = &b->zrun_zbin_boost[zbin_zrun_index]; - zbin_zrun_index += 4; - zbin = (zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value); - - sz = (z >> 31); // sign of z - x = (z ^ sz) - sz; // x = abs(z) - - if (x >= zbin) { - x += (round_ptr[rc]); - y = ((int)((int)(x * quant_ptr[rc]) >> 16) + x) - >> quant_shift_ptr[rc]; // quantize (x) - x = (y ^ sz) - sz; // get the sign back - qcoeff_ptr[rc] = x; // write to destination - dqcoeff_ptr[rc] = x * dequant_ptr[rc]; // dequantized value - - if (y) { - eob = i; // last nonzero coeffs - zbin_zrun_index = 0; - } - } - } - - d->eob = eob + 1; -} - -void vp9_regular_quantize_b_8x8(BLOCK *b, BLOCKD *d) { - int i, rc, eob; - int zbin; - int x, y, z, sz; - int16_t *zbin_boost_ptr = b->zrun_zbin_boost_8x8; - int16_t *coeff_ptr = b->coeff; - int16_t *zbin_ptr = b->zbin_8x8; - int16_t *round_ptr = b->round; - int16_t *quant_ptr = b->quant; - uint8_t *quant_shift_ptr = b->quant_shift; +void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx) { + MACROBLOCKD *const xd = &mb->e_mbd; + BLOCK *const b = &mb->block[b_idx]; + BLOCKD *const d = &xd->block[b_idx]; int16_t *qcoeff_ptr = d->qcoeff; int16_t *dqcoeff_ptr = d->dqcoeff; - int16_t *dequant_ptr = d->dequant; - int zbin_oq_value = b->zbin_extra; vpx_memset(qcoeff_ptr, 0, 64 * sizeof(int16_t)); vpx_memset(dqcoeff_ptr, 0, 64 * sizeof(int16_t)); - eob = -1; - - for (i = 0; i < b->eob_max_offset_8x8; i++) { - rc = vp9_default_zig_zag1d_8x8[i]; - z = coeff_ptr[rc]; - - zbin = (zbin_ptr[rc != 0] + *zbin_boost_ptr + zbin_oq_value); - zbin_boost_ptr++; - - sz = (z >> 31); // sign of z - x = (z ^ sz) - sz; // x = abs(z) - - if (x >= zbin) { - x += (round_ptr[rc != 0]); - y = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) - >> quant_shift_ptr[rc != 0]; // quantize (x) - x = (y ^ sz) - sz; // get the sign back - qcoeff_ptr[rc] = x; // write to destination - dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0]; // dequantized value - - if (y) { - eob = i; // last nonzero coeffs - zbin_boost_ptr = b->zrun_zbin_boost_8x8; + if (!b->skip_block) { + int i, rc, eob; + int zbin; + int x, y, z, sz; + int zero_run; + int16_t *zbin_boost_ptr = b->zrun_zbin_boost; + int16_t *coeff_ptr = b->coeff; + int16_t *zbin_ptr = b->zbin; + int16_t *round_ptr = b->round; + int16_t *quant_ptr = b->quant; + uint8_t *quant_shift_ptr = b->quant_shift; + int16_t *dequant_ptr = d->dequant; + int zbin_oq_value = b->zbin_extra; + + eob = -1; + + // Special case for DC as it is the one triggering access in various + // tables: {zbin, quant, quant_shift, dequant}_ptr[rc != 0] + { + z = coeff_ptr[0]; + zbin = (zbin_ptr[0] + zbin_boost_ptr[0] + zbin_oq_value); + zero_run = 1; + + sz = (z >> 31); // sign of z + x = (z ^ sz) - sz; // x = abs(z) + + if (x >= zbin) { + x += (round_ptr[0]); + y = ((int)(((int)(x * quant_ptr[0]) >> 16) + x)) + >> quant_shift_ptr[0]; // quantize (x) + x = (y ^ sz) - sz; // get the sign back + qcoeff_ptr[0] = x; // write to destination + dqcoeff_ptr[0] = x * dequant_ptr[0]; // dequantized value + + if (y) { + eob = 0; // last nonzero coeffs + zero_run = 0; + } + } + } + for (i = 1; i < 64; i++) { + rc = vp9_default_zig_zag1d_8x8[i]; + 
z = coeff_ptr[rc]; + zbin = (zbin_ptr[1] + zbin_boost_ptr[zero_run] + zbin_oq_value); + // The original code was incrementing zero_run while keeping it at + // maximum 15 by adding "(zero_run < 15)". The same is achieved by + // removing the opposite of the sign mask of "(zero_run - 15)". + zero_run -= (zero_run - 15) >> 31; + + sz = (z >> 31); // sign of z + x = (z ^ sz) - sz; // x = abs(z) + + if (x >= zbin) { + x += (round_ptr[rc != 0]); + y = ((int)(((int)(x * quant_ptr[1]) >> 16) + x)) + >> quant_shift_ptr[1]; // quantize (x) + x = (y ^ sz) - sz; // get the sign back + qcoeff_ptr[rc] = x; // write to destination + dqcoeff_ptr[rc] = x * dequant_ptr[1]; // dequantized value + + if (y) { + eob = i; // last nonzero coeffs + zero_run = 0; + } } } + xd->eobs[b_idx] = eob + 1; + } else { + xd->eobs[b_idx] = 0; } - - d->eob = eob + 1; } void vp9_quantize_mby_8x8(MACROBLOCK *x) { int i; - int has_2nd_order = get_2nd_order_usage(&x->e_mbd); - for (i = 0; i < 16; i ++) { - x->e_mbd.block[i].eob = 0; - } - x->e_mbd.block[24].eob = 0; for (i = 0; i < 16; i += 4) { - int ib = (i & 8) + ((i & 4) >> 1); - TX_TYPE tx_type = get_tx_type_8x8(&x->e_mbd, &x->e_mbd.block[ib]); - if (tx_type != DCT_DCT) - assert(has_2nd_order == 0); - x->quantize_b_8x8(&x->block[i], &x->e_mbd.block[i]); - } - - if (has_2nd_order) { - x->quantize_b_2x2(&x->block[24], &x->e_mbd.block[24]); - } else { - vpx_memset(x->e_mbd.block[24].qcoeff, 0, - 16 * sizeof(x->e_mbd.block[24].qcoeff[0])); - vpx_memset(x->e_mbd.block[24].dqcoeff, 0, - 16 * sizeof(x->e_mbd.block[24].dqcoeff[0])); - x->e_mbd.block[24].eob = 0; + x->quantize_b_8x8(x, i); } } void vp9_quantize_mbuv_8x8(MACROBLOCK *x) { int i; - for (i = 16; i < 24; i ++) - x->e_mbd.block[i].eob = 0; for (i = 16; i < 24; i += 4) - x->quantize_b_8x8(&x->block[i], &x->e_mbd.block[i]); + x->quantize_b_8x8(x, i); } void vp9_quantize_mb_8x8(MACROBLOCK *x) { @@ -310,12 +272,7 @@ void vp9_quantize_mb_8x8(MACROBLOCK *x) { } void vp9_quantize_mby_16x16(MACROBLOCK *x) { - int i; - - for (i = 0; i < 16; i++) - x->e_mbd.block[i].eob = 0; - x->e_mbd.block[24].eob = 0; - x->quantize_b_16x16(&x->block[0], &x->e_mbd.block[0]); + x->quantize_b_16x16(x, 0); } void vp9_quantize_mb_16x16(MACROBLOCK *x) { @@ -324,42 +281,46 @@ void vp9_quantize_mb_16x16(MACROBLOCK *x) { } static void quantize(int16_t *zbin_boost_orig_ptr, - int16_t *coeff_ptr, int n_coeffs, int max_coeffs, + int16_t *coeff_ptr, int n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, uint8_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, - int *eob_ptr, const int *scan, int mul) { + uint16_t *eob_ptr, const int *scan, int mul) { int i, rc, eob; int zbin; int x, y, z, sz; + int zero_run = 0; int16_t *zbin_boost_ptr = zbin_boost_orig_ptr; vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t)); vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t)); eob = -1; - for (i = 0; i < max_coeffs; i++) { - rc = scan[i]; - z = coeff_ptr[rc] * mul; - - zbin = (zbin_ptr[rc!=0] + *zbin_boost_ptr + zbin_oq_value); - zbin_boost_ptr ++; - - sz = (z >> 31); // sign of z - x = (z ^ sz) - sz; // x = abs(z) - - if (x >= zbin) { - x += (round_ptr[rc!=0]); - y = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) - >> quant_shift_ptr[rc!=0]; // quantize (x) - x = (y ^ sz) - sz; // get the sign back - qcoeff_ptr[rc] = x; // write to destination - dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / mul; // dequantized value - - if (y) { - eob = i; // last nonzero coeffs - zbin_boost_ptr = 
zbin_boost_orig_ptr; + + if (!skip_block) { + for (i = 0; i < n_coeffs; i++) { + rc = scan[i]; + z = coeff_ptr[rc] * mul; + + zbin = (zbin_ptr[rc != 0] + zbin_boost_ptr[zero_run] + zbin_oq_value); + zero_run += (zero_run < 15); + + sz = (z >> 31); // sign of z + x = (z ^ sz) - sz; // x = abs(z) + + if (x >= zbin) { + x += (round_ptr[rc != 0]); + y = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) + >> quant_shift_ptr[rc != 0]; // quantize (x) + x = (y ^ sz) - sz; // get the sign back + qcoeff_ptr[rc] = x; // write to destination + dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / mul; // dequantized value + + if (y) { + eob = i; // last nonzero coeffs + zero_run = 0; + } } } } @@ -367,49 +328,54 @@ static void quantize(int16_t *zbin_boost_orig_ptr, *eob_ptr = eob + 1; } -void vp9_regular_quantize_b_16x16(BLOCK *b, BLOCKD *d) { - quantize(b->zrun_zbin_boost_16x16, +void vp9_regular_quantize_b_16x16(MACROBLOCK *mb, int b_idx) { + MACROBLOCKD *const xd = &mb->e_mbd; + BLOCK *const b = &mb->block[b_idx]; + BLOCKD *const d = &xd->block[b_idx]; + quantize(b->zrun_zbin_boost, b->coeff, - 256, b->eob_max_offset_16x16, - b->zbin_16x16, b->round, b->quant, b->quant_shift, + 256, b->skip_block, + b->zbin, b->round, b->quant, b->quant_shift, d->qcoeff, d->dqcoeff, d->dequant, b->zbin_extra, - &d->eob, vp9_default_zig_zag1d_16x16, 1); + &xd->eobs[b_idx], vp9_default_zig_zag1d_16x16, 1); } void vp9_quantize_sby_32x32(MACROBLOCK *x) { - x->e_mbd.block[0].eob = 0; - quantize(x->block[0].zrun_zbin_boost_32x32, + MACROBLOCKD *xd = &x->e_mbd; + BLOCK *b = &x->block[0]; + BLOCKD *d = &xd->block[0]; + + quantize(b->zrun_zbin_boost, x->sb_coeff_data.coeff, - 1024, x->block[0].eob_max_offset_32x32, - x->block[0].zbin_32x32, - x->block[0].round, x->block[0].quant, x->block[0].quant_shift, - x->e_mbd.sb_coeff_data.qcoeff, - x->e_mbd.sb_coeff_data.dqcoeff, - x->e_mbd.block[0].dequant, - x->block[0].zbin_extra, - &x->e_mbd.block[0].eob, + 1024, b->skip_block, + b->zbin, + b->round, b->quant, b->quant_shift, + xd->sb_coeff_data.qcoeff, + xd->sb_coeff_data.dqcoeff, + d->dequant, + b->zbin_extra, + &xd->eobs[0], vp9_default_zig_zag1d_32x32, 2); } void vp9_quantize_sbuv_16x16(MACROBLOCK *x) { int i; + MACROBLOCKD *xd = &x->e_mbd; - x->e_mbd.block[16].eob = 0; - x->e_mbd.block[20].eob = 0; for (i = 16; i < 24; i += 4) - quantize(x->block[i].zrun_zbin_boost_16x16, + quantize(x->block[i].zrun_zbin_boost, x->sb_coeff_data.coeff + 1024 + (i - 16) * 64, - 256, x->block[i].eob_max_offset_16x16, - x->block[i].zbin_16x16, + 256, x->block[i].skip_block, + x->block[i].zbin, x->block[i].round, x->block[0].quant, x->block[i].quant_shift, - x->e_mbd.sb_coeff_data.qcoeff + 1024 + (i - 16) * 64, - x->e_mbd.sb_coeff_data.dqcoeff + 1024 + (i - 16) * 64, - x->e_mbd.block[i].dequant, + xd->sb_coeff_data.qcoeff + 1024 + (i - 16) * 64, + xd->sb_coeff_data.dqcoeff + 1024 + (i - 16) * 64, + xd->block[i].dequant, x->block[i].zbin_extra, - &x->e_mbd.block[i].eob, + &xd->eobs[i], vp9_default_zig_zag1d_16x16, 1); } @@ -417,10 +383,9 @@ void vp9_quantize_sbuv_16x16(MACROBLOCK *x) { * these two C functions if corresponding optimized routine is not available. * NEON optimized version implements currently the fast quantization for pair * of blocks. 
*/ -void vp9_regular_quantize_b_4x4_pair(BLOCK *b1, BLOCK *b2, - BLOCKD *d1, BLOCKD *d2) { - vp9_regular_quantize_b_4x4(b1, d1); - vp9_regular_quantize_b_4x4(b2, d2); +void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *x, int b_idx1, int b_idx2) { + vp9_regular_quantize_b_4x4(x, b_idx1); + vp9_regular_quantize_b_4x4(x, b_idx2); } static void invert_quant(int16_t *quant, @@ -439,164 +404,33 @@ void vp9_init_quantizer(VP9_COMP *cpi) { int i; int quant_val; int Q; - static const int zbin_boost[16] = { 0, 0, 8, 10, 12, 14, 16, 20, - 24, 28, 32, 36, 40, 44, 44, 44 - }; - - static const int zbin_boost_8x8[64] = { 0, 0, 0, 8, 8, 8, 10, 12, - 14, 16, 18, 20, 22, 24, 26, 28, - 30, 32, 34, 36, 38, 40, 42, 44, - 46, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48 - }; - static const int zbin_boost_16x16[256] = { - 0, 0, 0, 8, 8, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, - 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - }; - static const int zbin_boost_32x32[1024] = { - 0, 0, 0, 8, 8, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, - 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - }; - int qrounding_factor = 48; + static const int zbin_boost[16] = { 0, 0, 0, 8, 8, 8, 10, 12, + 14, 16, 20, 24, 28, 32, 36, 40 }; for (Q = 0; Q < QINDEX_RANGE; Q++) { int qzbin_factor = (vp9_dc_quant(Q, 0) < 148) ? 
84 : 80; - -#if CONFIG_LOSSLESS - if (cpi->oxcf.lossless) { - if (Q == 0) { - qzbin_factor = 64; - qrounding_factor = 64; - } + int qrounding_factor = 48; + if (Q == 0) { + qzbin_factor = 64; + qrounding_factor = 64; } -#endif - // dc values quant_val = vp9_dc_quant(Q, cpi->common.y1dc_delta_q); invert_quant(cpi->Y1quant[Q] + 0, cpi->Y1quant_shift[Q] + 0, quant_val); cpi->Y1zbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->Y1zbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->Y1zbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7; cpi->Y1round[Q][0] = (qrounding_factor * quant_val) >> 7; cpi->common.Y1dequant[Q][0] = quant_val; cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7; - cpi->zrun_zbin_boost_y1_8x8[Q][0] = - ((quant_val * zbin_boost_8x8[0]) + 64) >> 7; - cpi->zrun_zbin_boost_y1_16x16[Q][0] = - ((quant_val * zbin_boost_16x16[0]) + 64) >> 7; - cpi->Y1zbin_32x32[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->zrun_zbin_boost_y1_32x32[Q][0] = - ((quant_val * zbin_boost_32x32[0]) + 64) >> 7; - - - quant_val = vp9_dc2quant(Q, cpi->common.y2dc_delta_q); - invert_quant(cpi->Y2quant[Q] + 0, - cpi->Y2quant_shift[Q] + 0, quant_val); - cpi->Y2zbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->Y2zbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->Y2zbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->Y2round[Q][0] = (qrounding_factor * quant_val) >> 7; - cpi->common.Y2dequant[Q][0] = quant_val; - cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7; - cpi->zrun_zbin_boost_y2_8x8[Q][0] = - ((quant_val * zbin_boost_8x8[0]) + 64) >> 7; - cpi->zrun_zbin_boost_y2_16x16[Q][0] = - ((quant_val * zbin_boost_16x16[0]) + 64) >> 7; quant_val = vp9_dc_uv_quant(Q, cpi->common.uvdc_delta_q); invert_quant(cpi->UVquant[Q] + 0, cpi->UVquant_shift[Q] + 0, quant_val); cpi->UVzbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->UVzbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->UVzbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7; cpi->UVround[Q][0] = (qrounding_factor * quant_val) >> 7; cpi->common.UVdequant[Q][0] = quant_val; cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7; - cpi->zrun_zbin_boost_uv_8x8[Q][0] = - ((quant_val * zbin_boost_8x8[0]) + 64) >> 7; - cpi->zrun_zbin_boost_uv_16x16[Q][0] = - ((quant_val * zbin_boost_16x16[0]) + 64) >> 7; // all the 4x4 ac values =; for (i = 1; i < 16; i++) { @@ -611,15 +445,6 @@ void vp9_init_quantizer(VP9_COMP *cpi) { cpi->zrun_zbin_boost_y1[Q][i] = ((quant_val * zbin_boost[i]) + 64) >> 7; - quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q); - invert_quant(cpi->Y2quant[Q] + rc, - cpi->Y2quant_shift[Q] + rc, quant_val); - cpi->Y2zbin[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->Y2round[Q][rc] = (qrounding_factor * quant_val) >> 7; - cpi->common.Y2dequant[Q][rc] = quant_val; - cpi->zrun_zbin_boost_y2[Q][i] = - ((quant_val * zbin_boost[i]) + 64) >> 7; - quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q); invert_quant(cpi->UVquant[Q] + rc, cpi->UVquant_shift[Q] + rc, quant_val); @@ -629,57 +454,6 @@ void vp9_init_quantizer(VP9_COMP *cpi) { cpi->zrun_zbin_boost_uv[Q][i] = ((quant_val * zbin_boost[i]) + 64) >> 7; } - - // 8x8 structures... 
only zbin seperated out for now - // This needs cleaning up for 8x8 especially if we are to add - // support for non flat Q matices - for (i = 1; i < 64; i++) { - int rc = vp9_default_zig_zag1d_8x8[i]; - - quant_val = vp9_ac_yquant(Q); - cpi->Y1zbin_8x8[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->zrun_zbin_boost_y1_8x8[Q][i] = - ((quant_val * zbin_boost_8x8[i]) + 64) >> 7; - - quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q); - cpi->Y2zbin_8x8[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->zrun_zbin_boost_y2_8x8[Q][i] = - ((quant_val * zbin_boost_8x8[i]) + 64) >> 7; - - quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q); - cpi->UVzbin_8x8[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->zrun_zbin_boost_uv_8x8[Q][i] = - ((quant_val * zbin_boost_8x8[i]) + 64) >> 7; - } - - // 16x16 structures. Same comment above applies. - for (i = 1; i < 256; i++) { - int rc = vp9_default_zig_zag1d_16x16[i]; - - quant_val = vp9_ac_yquant(Q); - cpi->Y1zbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->zrun_zbin_boost_y1_16x16[Q][i] = - ((quant_val * zbin_boost_16x16[i]) + 64) >> 7; - - quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q); - cpi->Y2zbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->zrun_zbin_boost_y2_16x16[Q][i] = - ((quant_val * zbin_boost_16x16[i]) + 64) >> 7; - - quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q); - cpi->UVzbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->zrun_zbin_boost_uv_16x16[Q][i] = - ((quant_val * zbin_boost_16x16[i]) + 64) >> 7; - } - // 32x32 structures. Same comment above applies. - for (i = 1; i < 1024; i++) { - int rc = vp9_default_zig_zag1d_32x32[i]; - - quant_val = vp9_ac_yquant(Q); - cpi->Y1zbin_32x32[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->zrun_zbin_boost_y1_32x32[Q][i] = - ((quant_val * zbin_boost_32x32[i]) + 64) >> 7; - } } } @@ -709,106 +483,40 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) { // Y zbin_extra = (cpi->common.Y1dequant[QIndex][1] * - (cpi->zbin_over_quant + - cpi->zbin_mode_boost + + (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7; for (i = 0; i < 16; i++) { x->block[i].quant = cpi->Y1quant[QIndex]; x->block[i].quant_shift = cpi->Y1quant_shift[QIndex]; x->block[i].zbin = cpi->Y1zbin[QIndex]; - x->block[i].zbin_8x8 = cpi->Y1zbin_8x8[QIndex]; - x->block[i].zbin_16x16 = cpi->Y1zbin_16x16[QIndex]; - x->block[i].zbin_32x32 = cpi->Y1zbin_32x32[QIndex]; x->block[i].round = cpi->Y1round[QIndex]; x->e_mbd.block[i].dequant = cpi->common.Y1dequant[QIndex]; x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_y1[QIndex]; - x->block[i].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_y1_8x8[QIndex]; - x->block[i].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_y1_16x16[QIndex]; - x->block[i].zrun_zbin_boost_32x32 = cpi->zrun_zbin_boost_y1_32x32[QIndex]; x->block[i].zbin_extra = (int16_t)zbin_extra; - // Segment max eob offset feature. - if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) { - x->block[i].eob_max_offset = - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB); - x->block[i].eob_max_offset_8x8 = - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB); - x->block[i].eob_max_offset_16x16 = - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB); - x->block[i].eob_max_offset_32x32 = - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB); - } else { - x->block[i].eob_max_offset = 16; - x->block[i].eob_max_offset_8x8 = 64; - x->block[i].eob_max_offset_16x16 = 256; - x->block[i].eob_max_offset_32x32 = 1024; - } + // Segment skip feature. 
+ x->block[i].skip_block = + vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP); } // UV zbin_extra = (cpi->common.UVdequant[QIndex][1] * - (cpi->zbin_over_quant + - cpi->zbin_mode_boost + + (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7; for (i = 16; i < 24; i++) { x->block[i].quant = cpi->UVquant[QIndex]; x->block[i].quant_shift = cpi->UVquant_shift[QIndex]; x->block[i].zbin = cpi->UVzbin[QIndex]; - x->block[i].zbin_8x8 = cpi->UVzbin_8x8[QIndex]; - x->block[i].zbin_16x16 = cpi->UVzbin_16x16[QIndex]; x->block[i].round = cpi->UVround[QIndex]; x->e_mbd.block[i].dequant = cpi->common.UVdequant[QIndex]; x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[QIndex]; - x->block[i].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_uv_8x8[QIndex]; - x->block[i].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_uv_16x16[QIndex]; - x->block[i].zbin_extra = (int16_t)zbin_extra; - // Segment max eob offset feature. - if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) { - x->block[i].eob_max_offset = - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB); - x->block[i].eob_max_offset_8x8 = - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB); - x->block[i].eob_max_offset_16x16 = - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB); - } else { - x->block[i].eob_max_offset = 16; - x->block[i].eob_max_offset_8x8 = 64; - x->block[i].eob_max_offset_16x16 = 256; - } - } - - // Y2 - zbin_extra = (cpi->common.Y2dequant[QIndex][1] * - ((cpi->zbin_over_quant / 2) + - cpi->zbin_mode_boost + - x->act_zbin_adj)) >> 7; - - x->block[24].quant = cpi->Y2quant[QIndex]; - x->block[24].quant_shift = cpi->Y2quant_shift[QIndex]; - x->block[24].zbin = cpi->Y2zbin[QIndex]; - x->block[24].zbin_8x8 = cpi->Y2zbin_8x8[QIndex]; - x->block[24].zbin_16x16 = cpi->Y2zbin_16x16[QIndex]; - x->block[24].round = cpi->Y2round[QIndex]; - x->e_mbd.block[24].dequant = cpi->common.Y2dequant[QIndex]; - x->block[24].zrun_zbin_boost = cpi->zrun_zbin_boost_y2[QIndex]; - x->block[24].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_y2_8x8[QIndex]; - x->block[24].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_y2_16x16[QIndex]; - x->block[24].zbin_extra = (int16_t)zbin_extra; - - // TBD perhaps not use for Y2 - // Segment max eob offset feature. - if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) { - x->block[24].eob_max_offset = - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB); - x->block[24].eob_max_offset_8x8 = - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB); - } else { - x->block[24].eob_max_offset = 16; - x->block[24].eob_max_offset_8x8 = 4; + // Segment skip feature. 
+ x->block[i].skip_block = + vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP); } /* save this macroblock QIndex for vp9_update_zbin_extra() */ @@ -822,8 +530,7 @@ void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) { // Y zbin_extra = (cpi->common.Y1dequant[QIndex][1] * - (cpi->zbin_over_quant + - cpi->zbin_mode_boost + + (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7; for (i = 0; i < 16; i++) { x->block[i].zbin_extra = (int16_t)zbin_extra; @@ -831,21 +538,12 @@ void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) { // UV zbin_extra = (cpi->common.UVdequant[QIndex][1] * - (cpi->zbin_over_quant + - cpi->zbin_mode_boost + + (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7; for (i = 16; i < 24; i++) { x->block[i].zbin_extra = (int16_t)zbin_extra; } - - // Y2 - zbin_extra = (cpi->common.Y2dequant[QIndex][1] * - ((cpi->zbin_over_quant / 2) + - cpi->zbin_mode_boost + - x->act_zbin_adj)) >> 7; - - x->block[24].zbin_extra = (int16_t)zbin_extra; } void vp9_frame_init_quantizer(VP9_COMP *cpi) { @@ -864,10 +562,8 @@ void vp9_set_quantizer(struct VP9_COMP *cpi, int Q) { // if any of the delta_q values are changing update flag will // have to be set. cm->y1dc_delta_q = 0; - cm->y2ac_delta_q = 0; cm->uvdc_delta_q = 0; cm->uvac_delta_q = 0; - cm->y2dc_delta_q = 0; // quantizer has to be reinitialized if any delta_q changes. // As there are not any here for now this is inactive code. diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h index ac44a751c..d338e620a 100644 --- a/vp9/encoder/vp9_quantize.h +++ b/vp9/encoder/vp9_quantize.h @@ -14,10 +14,10 @@ #include "vp9/encoder/vp9_block.h" #define prototype_quantize_block(sym) \ - void (sym)(BLOCK *b,BLOCKD *d) + void (sym)(MACROBLOCK *mb, int b_idx) #define prototype_quantize_block_pair(sym) \ - void (sym)(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2) + void (sym)(MACROBLOCK *mb, int b_idx1, int b_idx2) #define prototype_quantize_mb(sym) \ void (sym)(MACROBLOCK *x) @@ -27,7 +27,7 @@ #endif #define prototype_quantize_block_type(sym) \ - void (sym)(BLOCK *b, BLOCKD *d, TX_TYPE type) + void (sym)(MACROBLOCK *mb, int b_ix, TX_TYPE type) extern prototype_quantize_block_type(vp9_ht_quantize_b_4x4); #ifndef vp9_quantize_quantb_4x4 @@ -50,11 +50,6 @@ extern prototype_quantize_block(vp9_quantize_quantb_8x8); #endif extern prototype_quantize_block(vp9_quantize_quantb_16x16); -#ifndef vp9_quantize_quantb_2x2 -#define vp9_quantize_quantb_2x2 vp9_regular_quantize_b_2x2 -#endif -extern prototype_quantize_block(vp9_quantize_quantb_2x2); - #ifndef vp9_quantize_mb_4x4 #define vp9_quantize_mb_4x4 vp9_quantize_mb_4x4_c #endif diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index f663b56c9..a2a79574d 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -14,8 +14,8 @@ #include <string.h> #include <limits.h> #include <assert.h> +#include <math.h> -#include "math.h" #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_modecont.h" #include "vp9/common/vp9_common.h" @@ -25,9 +25,10 @@ #include "vp9/common/vp9_systemdependent.h" #include "vp9/encoder/vp9_encodemv.h" #include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_seg_common.h" -#define MIN_BPB_FACTOR 0.005 -#define MAX_BPB_FACTOR 50 +#define MIN_BPB_FACTOR 0.005 +#define MAX_BPB_FACTOR 50 #ifdef MODE_STATS extern unsigned int y_modes[VP9_YMODES]; @@ -113,13 +114,19 @@ static int kfboost_qadjust(int qindex) { return retval; } -int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex) { - if (frame_type == KEY_FRAME) - return 
(int)(4500000 / vp9_convert_qindex_to_q(qindex)); - else - return (int)(2850000 / vp9_convert_qindex_to_q(qindex)); -} +int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex, + double correction_factor) { + int enumerator; + double q = vp9_convert_qindex_to_q(qindex); + + if (frame_type == KEY_FRAME) { + enumerator = 4500000; + } else { + enumerator = 2850000; + } + return (int)(0.5 + (enumerator * correction_factor / q)); +} void vp9_save_coding_context(VP9_COMP *cpi) { CODING_CONTEXT *const cc = &cpi->coding_context; @@ -168,11 +175,8 @@ void vp9_save_coding_context(VP9_COMP *cpi) { vp9_copy(cc->last_mode_lf_deltas, xd->last_mode_lf_deltas); vp9_copy(cc->coef_probs_4x4, cm->fc.coef_probs_4x4); - vp9_copy(cc->hybrid_coef_probs_4x4, cm->fc.hybrid_coef_probs_4x4); vp9_copy(cc->coef_probs_8x8, cm->fc.coef_probs_8x8); - vp9_copy(cc->hybrid_coef_probs_8x8, cm->fc.hybrid_coef_probs_8x8); vp9_copy(cc->coef_probs_16x16, cm->fc.coef_probs_16x16); - vp9_copy(cc->hybrid_coef_probs_16x16, cm->fc.hybrid_coef_probs_16x16); vp9_copy(cc->coef_probs_32x32, cm->fc.coef_probs_32x32); vp9_copy(cc->switchable_interp_prob, cm->fc.switchable_interp_prob); #if CONFIG_COMP_INTERINTRA_PRED @@ -226,11 +230,8 @@ void vp9_restore_coding_context(VP9_COMP *cpi) { vp9_copy(xd->last_mode_lf_deltas, cc->last_mode_lf_deltas); vp9_copy(cm->fc.coef_probs_4x4, cc->coef_probs_4x4); - vp9_copy(cm->fc.hybrid_coef_probs_4x4, cc->hybrid_coef_probs_4x4); vp9_copy(cm->fc.coef_probs_8x8, cc->coef_probs_8x8); - vp9_copy(cm->fc.hybrid_coef_probs_8x8, cc->hybrid_coef_probs_8x8); vp9_copy(cm->fc.coef_probs_16x16, cc->coef_probs_16x16); - vp9_copy(cm->fc.hybrid_coef_probs_16x16, cc->hybrid_coef_probs_16x16); vp9_copy(cm->fc.coef_probs_32x32, cc->coef_probs_32x32); vp9_copy(cm->fc.switchable_interp_prob, cc->switchable_interp_prob); #if CONFIG_COMP_INTERINTRA_PRED @@ -238,68 +239,33 @@ void vp9_restore_coding_context(VP9_COMP *cpi) { #endif } - void vp9_setup_key_frame(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; - // Setup for Key frame: - vp9_default_coef_probs(& cpi->common); - vp9_kf_default_bmode_probs(cpi->common.kf_bmode_prob); - vp9_init_mbmode_probs(& cpi->common); - vp9_default_bmode_probs(cm->fc.bmode_prob); - - if(cm->last_frame_seg_map) - vpx_memset(cm->last_frame_seg_map, 0, (cm->mb_rows * cm->mb_cols)); - - vp9_init_mv_probs(& cpi->common); + MACROBLOCKD *xd = &cpi->mb.e_mbd; - // cpi->common.filter_level = 0; // Reset every key frame. - cpi->common.filter_level = cpi->common.base_qindex * 3 / 8; + vp9_setup_past_independence(cm, xd); // interval before next GF cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; - - cpi->common.refresh_golden_frame = TRUE; - cpi->common.refresh_alt_ref_frame = TRUE; - - vp9_init_mode_contexts(&cpi->common); - vpx_memcpy(&cpi->common.lfc, &cpi->common.fc, sizeof(cpi->common.fc)); - vpx_memcpy(&cpi->common.lfc_a, &cpi->common.fc, sizeof(cpi->common.fc)); - - vpx_memset(cm->prev_mip, 0, - (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO)); - vpx_memset(cm->mip, 0, - (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO)); - - vp9_update_mode_info_border(cm, cm->mip); - vp9_update_mode_info_in_image(cm, cm->mi); - -#if CONFIG_NEW_MVREF - if (1) { - MACROBLOCKD *xd = &cpi->mb.e_mbd; - - // Defaults probabilities for encoding the MV ref id signal - vpx_memset(xd->mb_mv_ref_probs, VP9_DEFAULT_MV_REF_PROB, - sizeof(xd->mb_mv_ref_probs)); - } -#endif + /* All buffers are implicitly updated on key frames. 
*/ + cpi->refresh_golden_frame = TRUE; + cpi->refresh_alt_ref_frame = TRUE; } void vp9_setup_inter_frame(VP9_COMP *cpi) { - if (cpi->common.refresh_alt_ref_frame) { - vpx_memcpy(&cpi->common.fc, - &cpi->common.lfc_a, - sizeof(cpi->common.fc)); - } else { - vpx_memcpy(&cpi->common.fc, - &cpi->common.lfc, - sizeof(cpi->common.fc)); + VP9_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &cpi->mb.e_mbd; + if (cm->error_resilient_mode) { + vp9_setup_past_independence(cm, xd); } + assert(cm->frame_context_idx < NUM_FRAME_CONTEXTS); + vpx_memcpy(&cm->fc, &cm->frame_contexts[cm->frame_context_idx], + sizeof(cm->fc)); } - static int estimate_bits_at_q(int frame_kind, int Q, int MBs, double correction_factor) { - int Bpm = (int)(.5 + correction_factor * vp9_bits_per_mb(frame_kind, Q)); + int Bpm = (int)(vp9_bits_per_mb(frame_kind, Q, correction_factor)); /* Attempt to retain reasonable accuracy without overflow. The cutoff is * chosen such that the maximum product of Bpm and MBs fits 31 bits. The @@ -358,7 +324,7 @@ static void calc_pframe_target_size(VP9_COMP *cpi) { // Special alt reference frame case - if (cpi->common.refresh_alt_ref_frame) { + if (cpi->refresh_alt_ref_frame) { // Per frame bit target for the alt ref frame cpi->per_frame_bandwidth = cpi->twopass.gf_bits; cpi->this_frame_target = cpi->per_frame_bandwidth; @@ -377,7 +343,7 @@ static void calc_pframe_target_size(VP9_COMP *cpi) { if (cpi->this_frame_target < min_frame_target) cpi->this_frame_target = min_frame_target; - if (!cpi->common.refresh_alt_ref_frame) + if (!cpi->refresh_alt_ref_frame) // Note the baseline target data rate for this inter frame. cpi->inter_frame_target = cpi->this_frame_target; @@ -386,7 +352,7 @@ static void calc_pframe_target_size(VP9_COMP *cpi) { // int Boost = 0; int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q; - cpi->common.refresh_golden_frame = TRUE; + cpi->refresh_golden_frame = TRUE; calc_gf_params(cpi); @@ -431,35 +397,18 @@ void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) { if (cpi->common.frame_type == KEY_FRAME) { rate_correction_factor = cpi->key_frame_rate_correction_factor; } else { - if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame) + if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) rate_correction_factor = cpi->gf_rate_correction_factor; else rate_correction_factor = cpi->rate_correction_factor; } - // Work out how big we would have expected the frame to be at this Q given the current correction factor. + // Work out how big we would have expected the frame to be at this Q given + // the current correction factor. // Stay in double to avoid int overflow when values are large projected_size_based_on_q = - (int)(((.5 + rate_correction_factor * - vp9_bits_per_mb(cpi->common.frame_type, Q)) * - cpi->common.MBs) / (1 << BPER_MB_NORMBITS)); - - // Make some allowance for cpi->zbin_over_quant - if (cpi->zbin_over_quant > 0) { - int Z = cpi->zbin_over_quant; - double Factor = 0.99; - double factor_adjustment = 0.01 / 256.0; // (double)ZBIN_OQ_MAX; - - while (Z > 0) { - Z--; - projected_size_based_on_q = - (int)(Factor * projected_size_based_on_q); - Factor += factor_adjustment; - - if (Factor >= 0.999) - Factor = 0.999; - } - } + estimate_bits_at_q(cpi->common.frame_type, Q, + cpi->common.MBs, rate_correction_factor); // Work out a size correction factor. 
// if ( cpi->this_frame_target > 0 ) @@ -505,7 +454,7 @@ void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) { if (cpi->common.frame_type == KEY_FRAME) cpi->key_frame_rate_correction_factor = rate_correction_factor; else { - if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame) + if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) cpi->gf_rate_correction_factor = rate_correction_factor; else cpi->rate_correction_factor = rate_correction_factor; @@ -522,14 +471,11 @@ int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) { int bits_per_mb_at_this_q; double correction_factor; - // Reset Zbin OQ value - cpi->zbin_over_quant = 0; - // Select the appropriate correction factor based upon type of frame. if (cpi->common.frame_type == KEY_FRAME) correction_factor = cpi->key_frame_rate_correction_factor; else { - if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame) + if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) correction_factor = cpi->gf_rate_correction_factor; else correction_factor = cpi->rate_correction_factor; @@ -545,8 +491,7 @@ int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) { do { bits_per_mb_at_this_q = - (int)(.5 + correction_factor * - vp9_bits_per_mb(cpi->common.frame_type, i)); + (int)(vp9_bits_per_mb(cpi->common.frame_type, i, correction_factor)); if (bits_per_mb_at_this_q <= target_bits_per_mb) { if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error) @@ -559,45 +504,6 @@ int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) { last_error = bits_per_mb_at_this_q - target_bits_per_mb; } while (++i <= cpi->active_worst_quality); - - // If we are at MAXQ then enable Q over-run which seeks to claw back additional bits through things like - // the RD multiplier and zero bin size. - if (Q >= MAXQ) { - int zbin_oqmax; - - double Factor = 0.99; - double factor_adjustment = 0.01 / 256.0; // (double)ZBIN_OQ_MAX; - - if (cpi->common.frame_type == KEY_FRAME) - zbin_oqmax = 0; // ZBIN_OQ_MAX/16 - else if (cpi->common.refresh_alt_ref_frame || (cpi->common.refresh_golden_frame && !cpi->source_alt_ref_active)) - zbin_oqmax = 16; - else - zbin_oqmax = ZBIN_OQ_MAX; - - // Each incrment in the zbin is assumed to have a fixed effect on bitrate. This is not of course true. - // The effect will be highly clip dependent and may well have sudden steps. - // The idea here is to acheive higher effective quantizers than the normal maximum by expanding the zero - // bin and hence decreasing the number of low magnitude non zero coefficients. 
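The zbin_over_quant machinery being stripped out of both vp9_update_rate_correction_factors() and vp9_regulate_q() shared one per-step scaling rule; a self-contained sketch of the deleted behaviour follows, with the constants copied from the removed lines (the removed comment equates the 256.0 divisor with ZBIN_OQ_MAX):

/* Legacy allowance removed by this patch: each zbin_over_quant step
 * shrinks the projected size by a factor that starts at 0.99 and
 * creeps up toward a 0.999 ceiling. */
static int legacy_zbin_oq_adjust(int projected_size, int zbin_over_quant) {
  double factor = 0.99;
  const double factor_adjustment = 0.01 / 256.0;  /* (double)ZBIN_OQ_MAX */
  int z;
  for (z = zbin_over_quant; z > 0; z--) {
    projected_size = (int)(factor * projected_size);
    factor += factor_adjustment;
    if (factor >= 0.999)
      factor = 0.999;
  }
  return projected_size;
}

The while loop deleted just below applied the same per-step factor in the other direction, clawing back bits once Q reached MAXQ by expanding the zero bin and dropping low-magnitude coefficients.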
- while (cpi->zbin_over_quant < zbin_oqmax) { - cpi->zbin_over_quant++; - - if (cpi->zbin_over_quant > zbin_oqmax) - cpi->zbin_over_quant = zbin_oqmax; - - // Adjust bits_per_mb_at_this_q estimate - bits_per_mb_at_this_q = (int)(Factor * bits_per_mb_at_this_q); - Factor += factor_adjustment; - - if (Factor >= 0.999) - Factor = 0.999; - - if (bits_per_mb_at_this_q <= target_bits_per_mb) // Break out if we get down to the target rate - break; - } - - } - return Q; } @@ -671,7 +577,7 @@ void vp9_compute_frame_size_bounds(VP9_COMP *cpi, int *frame_under_shoot_limit, *frame_over_shoot_limit = cpi->this_frame_target * 9 / 8; *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8; } else { - if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame) { + if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) { *frame_over_shoot_limit = cpi->this_frame_target * 9 / 8; *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8; } else { diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index c6484817f..473317605 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -16,23 +16,24 @@ #define FRAME_OVERHEAD_BITS 200 -extern void vp9_save_coding_context(VP9_COMP *cpi); -extern void vp9_restore_coding_context(VP9_COMP *cpi); +void vp9_save_coding_context(VP9_COMP *cpi); +void vp9_restore_coding_context(VP9_COMP *cpi); -extern void vp9_setup_key_frame(VP9_COMP *cpi); -extern void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var); -extern int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame); -extern void vp9_adjust_key_frame_context(VP9_COMP *cpi); -extern void vp9_compute_frame_size_bounds(VP9_COMP *cpi, - int *frame_under_shoot_limit, - int *frame_over_shoot_limit); +void vp9_setup_key_frame(VP9_COMP *cpi); +void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var); +int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame); +void vp9_adjust_key_frame_context(VP9_COMP *cpi); +void vp9_compute_frame_size_bounds(VP9_COMP *cpi, + int *frame_under_shoot_limit, + int *frame_over_shoot_limit); // return of 0 means drop frame -extern int vp9_pick_frame_size(VP9_COMP *cpi); +int vp9_pick_frame_size(VP9_COMP *cpi); -extern double vp9_convert_qindex_to_q(int qindex); -extern int vp9_gfboost_qadjust(int qindex); -extern int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex); +double vp9_convert_qindex_to_q(int qindex); +int vp9_gfboost_qadjust(int qindex); +extern int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex, + double correction_factor); void vp9_setup_inter_frame(VP9_COMP *cpi); #endif // VP9_ENCODER_VP9_RATECTRL_H_ diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index e8d0cc68e..59e33a464 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -23,7 +23,6 @@ #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" -#include "vp9/common/vp9_reconintra4x4.h" #include "vp9/common/vp9_findnearmv.h" #include "vp9/common/vp9_quant_common.h" #include "vp9/encoder/vp9_encodemb.h" @@ -151,20 +150,21 @@ const MODE_DEFINITION vp9_mode_order[MAX_MODES] = { static void fill_token_costs(vp9_coeff_count *c, vp9_coeff_probs *p, int block_type_counts) { - int i, j, k; + int i, j, k, l; for (i = 0; i < block_type_counts; i++) - for (j = 0; j < COEF_BANDS; j++) - for (k = 0; k < PREV_COEF_CONTEXTS; k++) { - if (k == 0 && ((j > 0 && i > 0) || (j > 1 && i == 0))) - vp9_cost_tokens_skip((int *)(c[i][j][k]), - p[i][j][k], - vp9_coef_tree); 
- else - vp9_cost_tokens((int *)(c[i][j][k]), - p[i][j][k], - vp9_coef_tree); - } + for (j = 0; j < REF_TYPES; j++) + for (k = 0; k < COEF_BANDS; k++) + for (l = 0; l < PREV_COEF_CONTEXTS; l++) { + if (l == 0 && k > 0) + vp9_cost_tokens_skip((int *)(c[i][j][k][l]), + p[i][j][k][l], + vp9_coef_tree); + else + vp9_cost_tokens((int *)(c[i][j][k][l]), + p[i][j][k][l], + vp9_coef_tree); + } } @@ -218,16 +218,6 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int QIndex) { cpi->RDMULT = compute_rd_mult(QIndex); - // Extend rate multiplier along side quantizer zbin increases - if (cpi->zbin_over_quant > 0) { - double oq_factor; - - // Experimental code using the same basic equation as used for Q above - // The units of cpi->zbin_over_quant are 1/128 of Q bin size - oq_factor = 1.0 + ((double)0.0015625 * cpi->zbin_over_quant); - cpi->RDMULT = (int)((double)cpi->RDMULT * oq_factor * oq_factor); - } - if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) { if (cpi->twopass.next_iiratio > 31) cpi->RDMULT += (cpi->RDMULT * rd_iifactor[31]) >> 4; @@ -279,20 +269,11 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int QIndex) { } fill_token_costs(cpi->mb.token_costs[TX_4X4], - cpi->common.fc.coef_probs_4x4, BLOCK_TYPES_4X4); - fill_token_costs(cpi->mb.hybrid_token_costs[TX_4X4], - cpi->common.fc.hybrid_coef_probs_4x4, BLOCK_TYPES_4X4); - + cpi->common.fc.coef_probs_4x4, BLOCK_TYPES); fill_token_costs(cpi->mb.token_costs[TX_8X8], - cpi->common.fc.coef_probs_8x8, BLOCK_TYPES_8X8); - fill_token_costs(cpi->mb.hybrid_token_costs[TX_8X8], - cpi->common.fc.hybrid_coef_probs_8x8, BLOCK_TYPES_8X8); - + cpi->common.fc.coef_probs_8x8, BLOCK_TYPES); fill_token_costs(cpi->mb.token_costs[TX_16X16], - cpi->common.fc.coef_probs_16x16, BLOCK_TYPES_16X16); - fill_token_costs(cpi->mb.hybrid_token_costs[TX_16X16], - cpi->common.fc.hybrid_coef_probs_16x16, BLOCK_TYPES_16X16); - + cpi->common.fc.coef_probs_16x16, BLOCK_TYPES); fill_token_costs(cpi->mb.token_costs[TX_32X32], cpi->common.fc.coef_probs_32x32, BLOCK_TYPES_32X32); @@ -321,26 +302,7 @@ int vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, int block_size) { return error; } -int vp9_mbblock_error_8x8_c(MACROBLOCK *mb, int dc) { - BLOCK *be; - BLOCKD *bd; - int i, j; - int berror, error = 0; - - for (i = 0; i < 16; i+=4) { - be = &mb->block[i]; - bd = &mb->e_mbd.block[i]; - berror = 0; - for (j = dc; j < 64; j++) { - int this_diff = be->coeff[j] - bd->dqcoeff[j]; - berror += this_diff * this_diff; - } - error += berror; - } - return error; -} - -int vp9_mbblock_error_c(MACROBLOCK *mb, int dc) { +int vp9_mbblock_error_c(MACROBLOCK *mb) { BLOCK *be; BLOCKD *bd; int i, j; @@ -350,7 +312,7 @@ int vp9_mbblock_error_c(MACROBLOCK *mb, int dc) { be = &mb->block[i]; bd = &mb->e_mbd.block[i]; berror = 0; - for (j = dc; j < 16; j++) { + for (j = 0; j < 16; j++) { int this_diff = be->coeff[j] - bd->dqcoeff[j]; berror += this_diff * this_diff; } @@ -417,41 +379,36 @@ int vp9_uvsse(MACROBLOCK *x) { sse2 += sse1; } return sse2; - } -#if CONFIG_NEWCOEFCONTEXT -#define PT pn -#else -#define PT pt -#endif -static int cost_coeffs(MACROBLOCK *mb, - BLOCKD *b, PLANE_TYPE type, - ENTROPY_CONTEXT *a, - ENTROPY_CONTEXT *l, - TX_SIZE tx_size) { +static INLINE int cost_coeffs(MACROBLOCK *mb, + BLOCKD *b, PLANE_TYPE type, + ENTROPY_CONTEXT *a, + ENTROPY_CONTEXT *l, + TX_SIZE tx_size) { int pt; - const int eob = b->eob; - MACROBLOCKD *xd = &mb->e_mbd; + MACROBLOCKD *const xd = &mb->e_mbd; const int ib = (int)(b - xd->block); - int c = (type == PLANE_TYPE_Y_NO_DC) ? 
1 : 0; + const int eob = xd->eobs[ib]; + int c = 0; int cost = 0, seg_eob; const int segment_id = xd->mode_info_context->mbmi.segment_id; - const int *scan, *band; + const int *scan; int16_t *qcoeff_ptr = b->qcoeff; + const int ref = xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME; const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? get_tx_type(xd, b) : DCT_DCT; -#if CONFIG_NEWCOEFCONTEXT - const int *neighbors; - int pn; -#endif - + unsigned int (*token_costs)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] = + mb->token_costs[tx_size][type][ref]; ENTROPY_CONTEXT a_ec = *a, l_ec = *l; + ENTROPY_CONTEXT *const a1 = a + + sizeof(ENTROPY_CONTEXT_PLANES)/sizeof(ENTROPY_CONTEXT); + ENTROPY_CONTEXT *const l1 = l + + sizeof(ENTROPY_CONTEXT_PLANES)/sizeof(ENTROPY_CONTEXT); switch (tx_size) { case TX_4X4: scan = vp9_default_zig_zag1d_4x4; - band = vp9_coef_bands_4x4; seg_eob = 16; if (type == PLANE_TYPE_Y_WITH_DC) { if (tx_type == ADST_DCT) { @@ -462,30 +419,32 @@ static int cost_coeffs(MACROBLOCK *mb, } break; case TX_8X8: - if (type == PLANE_TYPE_Y2) { - scan = vp9_default_zig_zag1d_4x4; - band = vp9_coef_bands_4x4; - seg_eob = 4; - } else { - scan = vp9_default_zig_zag1d_8x8; - band = vp9_coef_bands_8x8; - seg_eob = 64; - } + a_ec = (a[0] + a[1]) != 0; + l_ec = (l[0] + l[1]) != 0; + scan = vp9_default_zig_zag1d_8x8; + seg_eob = 64; break; case TX_16X16: scan = vp9_default_zig_zag1d_16x16; - band = vp9_coef_bands_16x16; seg_eob = 256; if (type == PLANE_TYPE_UV) { const int uv_idx = ib - 16; qcoeff_ptr = xd->sb_coeff_data.qcoeff + 1024 + 64 * uv_idx; + a_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0; + l_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0; + } else { + a_ec = (a[0] + a[1] + a[2] + a[3]) != 0; + l_ec = (l[0] + l[1] + l[2] + l[3]) != 0; } break; case TX_32X32: scan = vp9_default_zig_zag1d_32x32; - band = vp9_coef_bands_32x32; seg_eob = 1024; qcoeff_ptr = xd->sb_coeff_data.qcoeff; + a_ec = (a[0] + a[1] + a[2] + a[3] + + a1[0] + a1[1] + a1[2] + a1[3]) != 0; + l_ec = (l[0] + l[1] + l[2] + l[3] + + l1[0] + l1[1] + l1[2] + l1[3]) != 0; break; default: abort(); @@ -493,59 +452,45 @@ static int cost_coeffs(MACROBLOCK *mb, } VP9_COMBINEENTROPYCONTEXTS(pt, a_ec, l_ec); -#if CONFIG_NEWCOEFCONTEXT - neighbors = vp9_get_coef_neighbors_handle(scan); - pn = pt; -#endif - if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) - seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB); + if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) + seg_eob = 0; - if (tx_type != DCT_DCT) { - for (; c < eob; c++) { - int v = qcoeff_ptr[scan[c]]; - int t = vp9_dct_value_tokens_ptr[v].Token; - cost += mb->hybrid_token_costs[tx_size][type][band[c]][PT][t]; - cost += vp9_dct_value_cost_ptr[v]; - pt = vp9_prev_token_class[t]; -#if CONFIG_NEWCOEFCONTEXT - if (c < seg_eob - 1 && NEWCOEFCONTEXT_BAND_COND(band[c + 1])) - pn = vp9_get_coef_neighbor_context( - qcoeff_ptr, (type == PLANE_TYPE_Y_NO_DC), neighbors, scan[c + 1]); - else - pn = pt; -#endif - } - if (c < seg_eob) - cost += mb->hybrid_token_costs[tx_size][type][band[c]] - [PT][DCT_EOB_TOKEN]; - } else { + { + int recent_energy = 0; for (; c < eob; c++) { int v = qcoeff_ptr[scan[c]]; int t = vp9_dct_value_tokens_ptr[v].Token; - cost += mb->token_costs[tx_size][type][band[c]][pt][t]; + cost += token_costs[get_coef_band(tx_size, c)][pt][t]; cost += vp9_dct_value_cost_ptr[v]; - pt = vp9_prev_token_class[t]; -#if CONFIG_NEWCOEFCONTEXT - if (c < seg_eob - 1 && NEWCOEFCONTEXT_BAND_COND(band[c + 1])) - pn = vp9_get_coef_neighbor_context( - qcoeff_ptr, (type == 
PLANE_TYPE_Y_NO_DC), neighbors, scan[c + 1]); - else - pn = pt; -#endif + pt = vp9_get_coef_context(&recent_energy, t); } if (c < seg_eob) - cost += mb->token_costs[tx_size][type][band[c]] - [PT][DCT_EOB_TOKEN]; + cost += mb->token_costs[tx_size][type][ref][get_coef_band(tx_size, c)] + [pt][DCT_EOB_TOKEN]; } // is eob first coefficient; - pt = (c > !type); + pt = (c > 0); *a = *l = pt; + if (tx_size >= TX_8X8) { + a[1] = l[1] = pt; + if (tx_size >= TX_16X16) { + if (type == PLANE_TYPE_UV) { + a1[0] = a1[1] = l1[0] = l1[1] = pt; + } else { + a[2] = a[3] = l[2] = l[3] = pt; + if (tx_size >= TX_32X32) { + a1[0] = a1[1] = a1[2] = a1[3] = pt; + l1[0] = l1[1] = l1[2] = l1[3] = pt; + } + } + } + } return cost; } -static int rdcost_mby_4x4(MACROBLOCK *mb, int has_2nd_order, int backup) { +static int rdcost_mby_4x4(MACROBLOCK *mb, int backup) { int cost = 0; int b; MACROBLOCKD *xd = &mb->e_mbd; @@ -565,19 +510,11 @@ static int rdcost_mby_4x4(MACROBLOCK *mb, int has_2nd_order, int backup) { } for (b = 0; b < 16; b++) - cost += cost_coeffs(mb, xd->block + b, - (has_2nd_order ? - PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC), + cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_Y_WITH_DC, ta + vp9_block2above[TX_4X4][b], tl + vp9_block2left[TX_4X4][b], TX_4X4); - if (has_2nd_order) - cost += cost_coeffs(mb, xd->block + 24, PLANE_TYPE_Y2, - ta + vp9_block2above[TX_4X4][24], - tl + vp9_block2left[TX_4X4][24], - TX_4X4); - return cost; } @@ -586,26 +523,17 @@ static void macro_block_yrd_4x4(MACROBLOCK *mb, int *Distortion, int *skippable, int backup) { MACROBLOCKD *const xd = &mb->e_mbd; - BLOCK *const mb_y2 = mb->block + 24; - BLOCKD *const x_y2 = xd->block + 24; - int d, has_2nd_order; xd->mode_info_context->mbmi.txfm_size = TX_4X4; - has_2nd_order = get_2nd_order_usage(xd); - // Fdct and building the 2nd order block vp9_transform_mby_4x4(mb); vp9_quantize_mby_4x4(mb); - d = vp9_mbblock_error(mb, has_2nd_order); - if (has_2nd_order) - d += vp9_block_error(mb_y2->coeff, x_y2->dqcoeff, 16); - - *Distortion = (d >> 2); - // rate - *Rate = rdcost_mby_4x4(mb, has_2nd_order, backup); - *skippable = vp9_mby_is_skippable_4x4(&mb->e_mbd, has_2nd_order); + + *Distortion = vp9_mbblock_error(mb) >> 2; + *Rate = rdcost_mby_4x4(mb, backup); + *skippable = vp9_mby_is_skippable_4x4(xd); } -static int rdcost_mby_8x8(MACROBLOCK *mb, int has_2nd_order, int backup) { +static int rdcost_mby_8x8(MACROBLOCK *mb, int backup) { int cost = 0; int b; MACROBLOCKD *xd = &mb->e_mbd; @@ -625,18 +553,11 @@ static int rdcost_mby_8x8(MACROBLOCK *mb, int has_2nd_order, int backup) { } for (b = 0; b < 16; b += 4) - cost += cost_coeffs(mb, xd->block + b, - (has_2nd_order ? 
- PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC), + cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_Y_WITH_DC, ta + vp9_block2above[TX_8X8][b], tl + vp9_block2left[TX_8X8][b], TX_8X8); - if (has_2nd_order) - cost += cost_coeffs(mb, xd->block + 24, PLANE_TYPE_Y2, - ta + vp9_block2above[TX_8X8][24], - tl + vp9_block2left[TX_8X8][24], - TX_8X8); return cost; } @@ -645,23 +566,14 @@ static void macro_block_yrd_8x8(MACROBLOCK *mb, int *Distortion, int *skippable, int backup) { MACROBLOCKD *const xd = &mb->e_mbd; - BLOCK *const mb_y2 = mb->block + 24; - BLOCKD *const x_y2 = xd->block + 24; - int d, has_2nd_order; xd->mode_info_context->mbmi.txfm_size = TX_8X8; - vp9_transform_mby_8x8(mb); vp9_quantize_mby_8x8(mb); - has_2nd_order = get_2nd_order_usage(xd); - d = vp9_mbblock_error_8x8_c(mb, has_2nd_order); - if (has_2nd_order) - d += vp9_block_error(mb_y2->coeff, x_y2->dqcoeff, 16); - - *Distortion = (d >> 2); - // rate - *Rate = rdcost_mby_8x8(mb, has_2nd_order, backup); - *skippable = vp9_mby_is_skippable_8x8(&mb->e_mbd, has_2nd_order); + + *Distortion = vp9_mbblock_error(mb) >> 2; + *Rate = rdcost_mby_8x8(mb, backup); + *skippable = vp9_mby_is_skippable_8x8(xd); } static int rdcost_mby_16x16(MACROBLOCK *mb, int backup) { @@ -687,7 +599,6 @@ static int rdcost_mby_16x16(MACROBLOCK *mb, int backup) { static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion, int *skippable, int backup) { - int d; MACROBLOCKD *xd = &mb->e_mbd; xd->mode_info_context->mbmi.txfm_size = TX_16X16; @@ -696,15 +607,13 @@ static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion, // TODO(jingning) is it possible to quickly determine whether to force // trailing coefficients to be zero, instead of running trellis // optimization in the rate-distortion optimization loop? 
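With the second-order (Y2) block gone, the 4x4/8x8/16x16 luma RD helpers above all collapse to the same recipe: transform, quantize, take the coefficient-domain SSE for distortion (scaled by >> 2), and price the tokens with cost_coeffs(). A stand-alone sketch of the unified error kernel, i.e. what vp9_mbblock_error_c() computes now that its dc-skip parameter is dropped (the fixed-size arrays are illustrative stand-ins for the BLOCK/BLOCKD coefficient pointers):

#include <stdint.h>

/* Coefficient-domain SSE over the 16 4x4 luma blocks of a macroblock;
 * j now always starts at 0 because the DC coefficient is coded in the
 * block itself rather than in a separate Y2 block. */
static int mbblock_error_sketch(const int16_t coeff[16][16],
                                const int16_t dqcoeff[16][16]) {
  int i, j, error = 0;
  for (i = 0; i < 16; i++) {
    int berror = 0;
    for (j = 0; j < 16; j++) {
      const int diff = coeff[i][j] - dqcoeff[i][j];
      berror += diff * diff;
    }
    error += berror;
  }
  return error;  /* callers then take (error >> 2) as the distortion */
}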
- if (mb->e_mbd.mode_info_context->mbmi.mode < I8X8_PRED) + if (mb->optimize && + xd->mode_info_context->mbmi.mode < I8X8_PRED) vp9_optimize_mby_16x16(mb); - d = vp9_mbblock_error(mb, 0); - - *Distortion = (d >> 2); - // rate + *Distortion = vp9_mbblock_error(mb) >> 2; *Rate = rdcost_mby_16x16(mb, backup); - *skippable = vp9_mby_is_skippable_16x16(&mb->e_mbd); + *skippable = vp9_mby_is_skippable_16x16(xd); } static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, @@ -820,15 +729,15 @@ static void copy_predictor(uint8_t *dst, const uint8_t *predictor) { static int rdcost_sby_32x32(MACROBLOCK *x, int backup) { MACROBLOCKD * const xd = &x->e_mbd; - ENTROPY_CONTEXT_PLANES t_above, t_left; + ENTROPY_CONTEXT_PLANES t_above[2], t_left[2]; ENTROPY_CONTEXT *ta, *tl; if (backup) { ta = (ENTROPY_CONTEXT *) &t_above, tl = (ENTROPY_CONTEXT *) &t_left; - vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES)); - vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES)); + vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES) * 2); + vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES) * 2); } else { ta = (ENTROPY_CONTEXT *) xd->above_context; tl = (ENTROPY_CONTEXT *) xd->left_context; @@ -857,21 +766,18 @@ static void super_block_yrd_32x32(MACROBLOCK *x, SUPERBLOCK * const x_sb = &x->sb_coeff_data; MACROBLOCKD * const xd = &x->e_mbd; SUPERBLOCKD * const xd_sb = &xd->sb_coeff_data; -#if DEBUG_ERROR || CONFIG_DWTDCTHYBRID +#if DEBUG_ERROR int16_t out[1024]; #endif vp9_transform_sby_32x32(x); vp9_quantize_sby_32x32(x); -#if DEBUG_ERROR || CONFIG_DWTDCTHYBRID +#if DEBUG_ERROR vp9_short_idct32x32(xd_sb->dqcoeff, out, 64); #endif -#if !CONFIG_DWTDCTHYBRID *distortion = vp9_sb_block_error_c(x_sb->coeff, xd_sb->dqcoeff, 1024); -#else - *distortion = vp9_block_error_c(x_sb->src_diff, out, 1024) << 4; -#endif + #if DEBUG_ERROR printf("IDCT/FDCT error 32x32: %d (d: %d)\n", vp9_block_error_c(x_sb->src_diff, out, 1024), *distortion); @@ -1129,17 +1035,17 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be, rate = bmode_costs[mode]; #endif - vp9_intra4x4_predict(b, mode, b->predictor); + vp9_intra4x4_predict(xd, b, mode, b->predictor); vp9_subtract_b(be, b, 16); b->bmi.as_mode.first = mode; tx_type = get_tx_type_4x4(xd, b); if (tx_type != DCT_DCT) { - vp9_fht(be->src_diff, 32, be->coeff, tx_type, 4); - vp9_ht_quantize_b_4x4(be, b, tx_type); + vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type); + vp9_ht_quantize_b_4x4(x, be - x->block, tx_type); } else { - x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32); - x->quantize_b_4x4(be, b); + x->fwd_txm4x4(be->src_diff, be->coeff, 32); + x->quantize_b_4x4(x, be - x->block); } tempa = ta; @@ -1168,9 +1074,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be, // inverse transform if (best_tx_type != DCT_DCT) - vp9_ihtllm(best_dqcoeff, b->diff, 32, best_tx_type, 4, b->eob); + vp9_short_iht4x4(best_dqcoeff, b->diff, 16, best_tx_type); else - xd->inv_xform4x4_x8(best_dqcoeff, b->diff, 32); + xd->inv_txm4x4(best_dqcoeff, b->diff, 32); vp9_recon_b(best_predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); @@ -1179,8 +1085,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be, static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb, int *Rate, int *rate_y, - int *Distortion, int64_t best_rd, - int update_contexts) { + int *Distortion, int64_t best_rd) { int i; MACROBLOCKD *const xd = &mb->e_mbd; int 
cost = mb->mbmode_cost [xd->frame_type] [B_PRED]; @@ -1191,18 +1096,13 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb, ENTROPY_CONTEXT *ta, *tl; int *bmode_costs; - if (update_contexts) { - ta = (ENTROPY_CONTEXT *)xd->above_context; - tl = (ENTROPY_CONTEXT *)xd->left_context; - } else { - vpx_memcpy(&t_above, xd->above_context, - sizeof(ENTROPY_CONTEXT_PLANES)); - vpx_memcpy(&t_left, xd->left_context, - sizeof(ENTROPY_CONTEXT_PLANES)); + vpx_memcpy(&t_above, xd->above_context, + sizeof(ENTROPY_CONTEXT_PLANES)); + vpx_memcpy(&t_left, xd->left_context, + sizeof(ENTROPY_CONTEXT_PLANES)); - ta = (ENTROPY_CONTEXT *)&t_above; - tl = (ENTROPY_CONTEXT *)&t_left; - } + ta = (ENTROPY_CONTEXT *)&t_above; + tl = (ENTROPY_CONTEXT *)&t_left; xd->mode_info_context->mbmi.mode = B_PRED; bmode_costs = mb->inter_bmode_costs; @@ -1407,8 +1307,9 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib, int distortion = 0, rate = 0; BLOCK *be = x->block + ib; BLOCKD *b = xd->block + ib; - ENTROPY_CONTEXT ta0, ta1, besta0 = 0, besta1 = 0; - ENTROPY_CONTEXT tl0, tl1, bestl0 = 0, bestl1 = 0; + ENTROPY_CONTEXT_PLANES ta, tl; + ENTROPY_CONTEXT *ta0, *ta1, besta0 = 0, besta1 = 0; + ENTROPY_CONTEXT *tl0, *tl1, bestl0 = 0, bestl1 = 0; /* * The predictor buffer is a 2d buffer with a stride of 16. Create @@ -1430,58 +1331,75 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib, rate = mode_costs[mode]; b->bmi.as_mode.first = mode; - vp9_intra8x8_predict(b, mode, b->predictor); + vp9_intra8x8_predict(xd, b, mode, b->predictor); vp9_subtract_4b_c(be, b, 16); - assert(get_2nd_order_usage(xd) == 0); if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) { TX_TYPE tx_type = get_tx_type_8x8(xd, b); if (tx_type != DCT_DCT) - vp9_fht(be->src_diff, 32, (x->block + idx)->coeff, tx_type, 8); + vp9_short_fht8x8(be->src_diff, (x->block + idx)->coeff, 16, tx_type); else - x->vp9_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32); - x->quantize_b_8x8(x->block + idx, xd->block + idx); + x->fwd_txm8x8(be->src_diff, (x->block + idx)->coeff, 32); + x->quantize_b_8x8(x, idx); // compute quantization mse of 8x8 block distortion = vp9_block_error_c((x->block + idx)->coeff, (xd->block + idx)->dqcoeff, 64); - ta0 = a[vp9_block2above[TX_8X8][idx]]; - tl0 = l[vp9_block2left[TX_8X8][idx]]; + + vpx_memcpy(&ta, a, sizeof(ENTROPY_CONTEXT_PLANES)); + vpx_memcpy(&tl, l, sizeof(ENTROPY_CONTEXT_PLANES)); + + ta0 = ((ENTROPY_CONTEXT*)&ta) + vp9_block2above[TX_8X8][idx]; + tl0 = ((ENTROPY_CONTEXT*)&tl) + vp9_block2left[TX_8X8][idx]; + ta1 = ta0 + 1; + tl1 = tl0 + 1; rate_t = cost_coeffs(x, xd->block + idx, PLANE_TYPE_Y_WITH_DC, - &ta0, &tl0, TX_8X8); + ta0, tl0, TX_8X8); rate += rate_t; - ta1 = ta0; - tl1 = tl0; } else { static const int iblock[4] = {0, 1, 4, 5}; TX_TYPE tx_type; int i; - ta0 = a[vp9_block2above[TX_4X4][ib]]; - ta1 = a[vp9_block2above[TX_4X4][ib + 1]]; - tl0 = l[vp9_block2left[TX_4X4][ib]]; - tl1 = l[vp9_block2left[TX_4X4][ib + 4]]; + vpx_memcpy(&ta, a, sizeof(ENTROPY_CONTEXT_PLANES)); + vpx_memcpy(&tl, l, sizeof(ENTROPY_CONTEXT_PLANES)); + ta0 = ((ENTROPY_CONTEXT*)&ta) + vp9_block2above[TX_4X4][ib]; + tl0 = ((ENTROPY_CONTEXT*)&tl) + vp9_block2left[TX_4X4][ib]; + ta1 = ta0 + 1; + tl1 = tl0 + 1; distortion = 0; rate_t = 0; for (i = 0; i < 4; ++i) { + int do_two = 0; b = &xd->block[ib + iblock[i]]; be = &x->block[ib + iblock[i]]; tx_type = get_tx_type_4x4(xd, b); if (tx_type != DCT_DCT) { - vp9_fht_c(be->src_diff, 32, be->coeff, tx_type, 4); - 
vp9_ht_quantize_b_4x4(be, b, tx_type); + vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type); + vp9_ht_quantize_b_4x4(x, ib + iblock[i], tx_type); + } else if (!(i & 1) && get_tx_type_4x4(xd, b + 1) == DCT_DCT) { + x->fwd_txm8x4(be->src_diff, be->coeff, 32); + x->quantize_b_4x4_pair(x, ib + iblock[i], ib + iblock[i] + 1); + do_two = 1; } else { - x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32); - x->quantize_b_4x4(be, b); + x->fwd_txm4x4(be->src_diff, be->coeff, 32); + x->quantize_b_4x4(x, ib + iblock[i]); } - distortion += vp9_block_error_c(be->coeff, b->dqcoeff, 16); + distortion += vp9_block_error_c(be->coeff, b->dqcoeff, 16 << do_two); rate_t += cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC, - // i&1 ? &ta1 : &ta0, i&2 ? &tl1 : &tl0, - &ta0, &tl0, + i&1 ? ta1 : ta0, i&2 ? tl1 : tl0, TX_4X4); + if (do_two) { + i++; + rate_t += cost_coeffs(x, b + 1, PLANE_TYPE_Y_WITH_DC, + i&1 ? ta1 : ta0, i&2 ? tl1 : tl0, + TX_4X4); + } } + b = &xd->block[ib]; + be = &x->block[ib]; rate += rate_t; } @@ -1491,10 +1409,10 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib, *bestrate = rate; *bestratey = rate_t; *bestdistortion = distortion; - besta0 = ta0; - besta1 = ta1; - bestl0 = tl0; - bestl1 = tl1; + besta0 = *ta0; + besta1 = *ta1; + bestl0 = *tl0; + bestl1 = *tl1; best_rd = this_rd; *best_mode = mode; copy_predictor_8x8(best_predictor, b->predictor); @@ -1647,12 +1565,12 @@ static int rd_cost_sbuv_16x16(MACROBLOCK *x, int backup) { int b; int cost = 0; MACROBLOCKD *const xd = &x->e_mbd; - ENTROPY_CONTEXT_PLANES t_above, t_left; + ENTROPY_CONTEXT_PLANES t_above[2], t_left[2]; ENTROPY_CONTEXT *ta, *tl; if (backup) { - vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES)); - vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES)); + vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES) * 2); + vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES) * 2); ta = (ENTROPY_CONTEXT *) &t_above; tl = (ENTROPY_CONTEXT *) &t_left; @@ -1752,8 +1670,9 @@ static int64_t rd_inter64x64_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate, } static int64_t rd_inter4x4_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate, - int *distortion, int *skip, int fullpixel) { - vp9_build_inter4x4_predictors_mbuv(&x->e_mbd); + int *distortion, int *skip, int fullpixel, + int mb_row, int mb_col) { + vp9_build_inter4x4_predictors_mbuv(&x->e_mbd, mb_row, mb_col); vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride); return rd_inter16x16_uv_4x4(cpi, x, rate, distortion, fullpixel, skip, 1); @@ -2082,12 +2001,8 @@ int vp9_cost_mv_ref(VP9_COMP *cpi, MACROBLOCKD *xd = &cpi->mb.e_mbd; int segment_id = xd->mode_info_context->mbmi.segment_id; - // If the mode coding is done entirely at the segment level - // we should not account for it at the per mb level in rd code. - // Note that if the segment level coding is expanded from single mode - // to multiple mode masks as per reference frame coding we will need - // to do something different here. - if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) { + // Don't account for mode here if segment skip is enabled. + if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) { VP9_COMMON *pc = &cpi->common; vp9_prob p [VP9_MVREFS - 1]; @@ -2156,14 +2071,18 @@ static int labels2mode( } break; case LEFT4X4: - this_mv->as_int = col ? d[-1].bmi.as_mv.first.as_int : left_block_mv(mic, i); + this_mv->as_int = col ?
d[-1].bmi.as_mv[0].as_int : + left_block_mv(xd, mic, i); if (mbmi->second_ref_frame > 0) - this_second_mv->as_int = col ? d[-1].bmi.as_mv.second.as_int : left_block_second_mv(mic, i); + this_second_mv->as_int = col ? d[-1].bmi.as_mv[1].as_int : + left_block_second_mv(xd, mic, i); break; case ABOVE4X4: - this_mv->as_int = row ? d[-4].bmi.as_mv.first.as_int : above_block_mv(mic, i, mis); + this_mv->as_int = row ? d[-4].bmi.as_mv[0].as_int : + above_block_mv(mic, i, mis); if (mbmi->second_ref_frame > 0) - this_second_mv->as_int = row ? d[-4].bmi.as_mv.second.as_int : above_block_second_mv(mic, i, mis); + this_second_mv->as_int = row ? d[-4].bmi.as_mv[1].as_int : + above_block_second_mv(mic, i, mis); break; case ZERO4X4: this_mv->as_int = 0; @@ -2178,11 +2097,11 @@ static int labels2mode( int_mv left_mv, left_second_mv; left_second_mv.as_int = 0; - left_mv.as_int = col ? d[-1].bmi.as_mv.first.as_int : - left_block_mv(mic, i); + left_mv.as_int = col ? d[-1].bmi.as_mv[0].as_int : + left_block_mv(xd, mic, i); if (mbmi->second_ref_frame > 0) - left_second_mv.as_int = col ? d[-1].bmi.as_mv.second.as_int : - left_block_second_mv(mic, i); + left_second_mv.as_int = col ? d[-1].bmi.as_mv[1].as_int : + left_block_second_mv(xd, mic, i); if (left_mv.as_int == this_mv->as_int && (mbmi->second_ref_frame <= 0 || @@ -2198,9 +2117,9 @@ static int labels2mode( #endif } - d->bmi.as_mv.first.as_int = this_mv->as_int; + d->bmi.as_mv[0].as_int = this_mv->as_int; if (mbmi->second_ref_frame > 0) - d->bmi.as_mv.second.as_int = this_second_mv->as_int; + d->bmi.as_mv[1].as_int = this_second_mv->as_int; x->partition_info->bmi[i].mode = m; x->partition_info->bmi[i].mv.as_int = this_mv->as_int; @@ -2230,12 +2149,25 @@ static int64_t encode_inter_mb_segment(MACROBLOCK *x, BLOCK *be = &x->block[i]; int thisdistortion; - vp9_build_inter_predictors_b(bd, 16, xd->subpixel_predict4x4); - if (xd->mode_info_context->mbmi.second_ref_frame > 0) - vp9_build_2nd_inter_predictors_b(bd, 16, xd->subpixel_predict_avg4x4); + vp9_build_inter_predictor(*(bd->base_pre) + bd->pre, + bd->pre_stride, + bd->predictor, 16, + &bd->bmi.as_mv[0], + &xd->scale_factor[0], + 4, 4, 0 /* no avg */, &xd->subpix); + + if (xd->mode_info_context->mbmi.second_ref_frame > 0) { + vp9_build_inter_predictor(*(bd->base_second_pre) + bd->pre, + bd->pre_stride, + bd->predictor, 16, + &bd->bmi.as_mv[1], + &xd->scale_factor[1], + 4, 4, 1 /* avg */, &xd->subpix); + } + vp9_subtract_b(be, bd, 16); - x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32); - x->quantize_b_4x4(be, bd); + x->fwd_txm4x4(be->src_diff, be->coeff, 32); + x->quantize_b_4x4(x, i); thisdistortion = vp9_block_error(be->coeff, bd->dqcoeff, 16); *distortion += thisdistortion; *labelyrate += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC, @@ -2274,20 +2206,31 @@ static int64_t encode_inter_mb_segment_8x8(MACROBLOCK *x, int ib = vp9_i8x8_block[i]; if (labels[ib] == which_label) { + const int use_second_ref = + xd->mode_info_context->mbmi.second_ref_frame > 0; + int which_mv; int idx = (ib & 8) + ((ib & 2) << 1); BLOCKD *bd = &xd->block[ib], *bd2 = &xd->block[idx]; BLOCK *be = &x->block[ib], *be2 = &x->block[idx]; int thisdistortion; - vp9_build_inter_predictors4b(xd, bd, 16); - if (xd->mode_info_context->mbmi.second_ref_frame > 0) - vp9_build_2nd_inter_predictors4b(xd, bd, 16); + for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) { + uint8_t **base_pre = which_mv ? 
bd->base_second_pre : bd->base_pre; + + vp9_build_inter_predictor(*base_pre + bd->pre, + bd->pre_stride, + bd->predictor, 16, + &bd->bmi.as_mv[which_mv], + &xd->scale_factor[which_mv], + 8, 8, which_mv, &xd->subpix); + } + vp9_subtract_4b_c(be, bd, 16); if (xd->mode_info_context->mbmi.txfm_size == TX_4X4) { if (otherrd) { - x->vp9_short_fdct8x8(be->src_diff, be2->coeff, 32); - x->quantize_b_8x8(be2, bd2); + x->fwd_txm8x8(be->src_diff, be2->coeff, 32); + x->quantize_b_8x8(x, idx); thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64); otherdist += thisdistortion; othercost += cost_coeffs(x, bd2, PLANE_TYPE_Y_WITH_DC, @@ -2298,8 +2241,8 @@ static int64_t encode_inter_mb_segment_8x8(MACROBLOCK *x, for (j = 0; j < 4; j += 2) { bd = &xd->block[ib + iblock[j]]; be = &x->block[ib + iblock[j]]; - x->vp9_short_fdct8x4(be->src_diff, be->coeff, 32); - x->quantize_b_4x4_pair(be, be + 1, bd, bd + 1); + x->fwd_txm8x4(be->src_diff, be->coeff, 32); + x->quantize_b_4x4_pair(x, ib + iblock[j], ib + iblock[j] + 1); thisdistortion = vp9_block_error_c(be->coeff, bd->dqcoeff, 32); *distortion += thisdistortion; *labelyrate += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC, @@ -2316,8 +2259,8 @@ static int64_t encode_inter_mb_segment_8x8(MACROBLOCK *x, for (j = 0; j < 4; j += 2) { BLOCKD *bd = &xd->block[ib + iblock[j]]; BLOCK *be = &x->block[ib + iblock[j]]; - x->vp9_short_fdct8x4(be->src_diff, be->coeff, 32); - x->quantize_b_4x4_pair(be, be + 1, bd, bd + 1); + x->fwd_txm8x4(be->src_diff, be->coeff, 32); + x->quantize_b_4x4_pair(x, ib + iblock[j], ib + iblock[j]); thisdistortion = vp9_block_error_c(be->coeff, bd->dqcoeff, 32); otherdist += thisdistortion; othercost += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC, @@ -2330,8 +2273,8 @@ static int64_t encode_inter_mb_segment_8x8(MACROBLOCK *x, TX_4X4); } } - x->vp9_short_fdct8x8(be->src_diff, be2->coeff, 32); - x->quantize_b_8x8(be2, bd2); + x->fwd_txm8x8(be->src_diff, be2->coeff, 32); + x->quantize_b_8x8(x, idx); thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64); *distortion += thisdistortion; *labelyrate += cost_coeffs(x, bd2, PLANE_TYPE_Y_WITH_DC, @@ -2373,8 +2316,7 @@ typedef struct { } BEST_SEG_INFO; -static __inline -int mv_check_bounds(MACROBLOCK *x, int_mv *mv) { +static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) { int r = 0; r |= (mv->as_mv.row >> 3) < x->mv_row_min; r |= (mv->as_mv.row >> 3) > x->mv_row_max; @@ -2487,9 +2429,9 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, // use previous block's result as next block's MV predictor. 
if (segmentation == PARTITIONING_4X4 && i > 0) { - bsi->mvp.as_int = x->e_mbd.block[i - 1].bmi.as_mv.first.as_int; + bsi->mvp.as_int = x->e_mbd.block[i - 1].bmi.as_mv[0].as_int; if (i == 4 || i == 8 || i == 12) - bsi->mvp.as_int = x->e_mbd.block[i - 4].bmi.as_mv.first.as_int; + bsi->mvp.as_int = x->e_mbd.block[i - 4].bmi.as_mv[0].as_int; step_param = 2; } } @@ -2528,11 +2470,11 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, if (thissme < bestsme) { bestsme = thissme; - mode_mv[NEW4X4].as_int = e->bmi.as_mv.first.as_int; + mode_mv[NEW4X4].as_int = e->bmi.as_mv[0].as_int; } else { /* The full search result is actually worse so re-instate the * previous best vector */ - e->bmi.as_mv.first.as_int = mode_mv[NEW4X4].as_int; + e->bmi.as_mv[0].as_int = mode_mv[NEW4X4].as_int; } } } @@ -2595,13 +2537,13 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, if (x->e_mbd.mode_info_context->mbmi.txfm_size == TX_4X4) { for (j = 0; j < 16; j++) if (labels[j] == i) - best_eobs[j] = x->e_mbd.block[j].eob; + best_eobs[j] = x->e_mbd.eobs[j]; } else { for (j = 0; j < 4; j++) { int ib = vp9_i8x8_block[j], idx = j * 4; if (labels[ib] == i) - best_eobs[idx] = x->e_mbd.block[idx].eob; + best_eobs[idx] = x->e_mbd.eobs[idx]; } } if (other_rd < best_other_rd) @@ -2734,8 +2676,9 @@ static void rd_check_segment(VP9_COMP *cpi, MACROBLOCK *x, if (base_rd < txfm_cache[ONLY_4X4]) { txfm_cache[ONLY_4X4] = base_rd; } - if (base_rd + diff < txfm_cache[1]) { - txfm_cache[ALLOW_8X8] = txfm_cache[ALLOW_16X16] = base_rd + diff; + if (base_rd + diff < txfm_cache[ALLOW_8X8]) { + txfm_cache[ALLOW_8X8] = txfm_cache[ALLOW_16X16] = + txfm_cache[ALLOW_32X32] = base_rd + diff; } if (diff < 0) { base_rd += diff + RDCOST(x->rdmult, x->rddiv, cost8x8, 0); @@ -2749,7 +2692,7 @@ static void rd_check_segment(VP9_COMP *cpi, MACROBLOCK *x, } } -static __inline void cal_step_param(int sr, int *sp) { +static INLINE void cal_step_param(int sr, int *sp) { int step = 0; if (sr > MAX_FIRST_STEP) sr = MAX_FIRST_STEP; @@ -2872,18 +2815,18 @@ static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, for (i = 0; i < 16; i++) { BLOCKD *bd = &x->e_mbd.block[i]; - bd->bmi.as_mv.first.as_int = bsi.mvs[i].as_int; + bd->bmi.as_mv[0].as_int = bsi.mvs[i].as_int; if (mbmi->second_ref_frame > 0) - bd->bmi.as_mv.second.as_int = bsi.second_mvs[i].as_int; - bd->eob = bsi.eobs[i]; + bd->bmi.as_mv[1].as_int = bsi.second_mvs[i].as_int; + x->e_mbd.eobs[i] = bsi.eobs[i]; } *returntotrate = bsi.r; *returndistortion = bsi.d; *returnyrate = bsi.segment_yrate; *skippable = bsi.txfm_size == TX_4X4 ? - vp9_mby_is_skippable_4x4(&x->e_mbd, 0) : - vp9_mby_is_skippable_8x8(&x->e_mbd, 0); + vp9_mby_is_skippable_4x4(&x->e_mbd) : + vp9_mby_is_skippable_8x8(&x->e_mbd); /* save partitions */ mbmi->txfm_size = bsi.txfm_size; @@ -3016,7 +2959,8 @@ static void estimate_curframe_refprobs(VP9_COMP *cpi, vp9_prob mod_refprobs[3], } } -static __inline unsigned weighted_cost(vp9_prob *tab0, vp9_prob *tab1, int idx, int val, int weight) { +static INLINE unsigned weighted_cost(vp9_prob *tab0, vp9_prob *tab1, + int idx, int val, int weight) { unsigned cost0 = tab0[idx] ? vp9_cost_bit(tab0[idx], val) : 0; unsigned cost1 = tab1[idx] ? 
vp9_cost_bit(tab1[idx], val) : 0; // weight is 16-bit fixed point, so this basically calculates: @@ -3160,43 +3104,104 @@ static void inter_mode_cost(VP9_COMP *cpi, MACROBLOCK *x, static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, int idx, MV_REFERENCE_FRAME frame_type, int block_size, - int recon_yoffset, int recon_uvoffset, + int mb_row, int mb_col, int_mv frame_nearest_mv[MAX_REF_FRAMES], int_mv frame_near_mv[MAX_REF_FRAMES], int frame_mdcounts[4][4], - uint8_t *y_buffer[4], - uint8_t *u_buffer[4], - uint8_t *v_buffer[4]) { - YV12_BUFFER_CONFIG *yv12 = &cpi->common.yv12_fb[idx]; + YV12_BUFFER_CONFIG yv12_mb[4], + struct scale_factors scale[MAX_REF_FRAMES]) { + VP9_COMMON *cm = &cpi->common; + YV12_BUFFER_CONFIG *yv12 = &cm->yv12_fb[cpi->common.ref_frame_map[idx]]; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; + int use_prev_in_find_mv_refs, use_prev_in_find_best_ref; + + // set up scaling factors + scale[frame_type] = cpi->common.active_ref_scale[frame_type - 1]; + scale[frame_type].x_offset_q4 = + (mb_col * 16 * scale[frame_type].x_num / scale[frame_type].x_den) & 0xf; + scale[frame_type].y_offset_q4 = + (mb_row * 16 * scale[frame_type].y_num / scale[frame_type].y_den) & 0xf; - y_buffer[frame_type] = yv12->y_buffer + recon_yoffset; - u_buffer[frame_type] = yv12->u_buffer + recon_uvoffset; - v_buffer[frame_type] = yv12->v_buffer + recon_uvoffset; + // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this + // use the UV scaling factors. + setup_pred_block(&yv12_mb[frame_type], yv12, mb_row, mb_col, + &scale[frame_type], &scale[frame_type]); // Gets an initial list of candidate vectors from neighbours and orders them - vp9_find_mv_refs(xd, xd->mode_info_context, - xd->prev_mode_info_context, + use_prev_in_find_mv_refs = cm->Width == cm->last_width && + cm->Height == cm->last_height && + !cpi->common.error_resilient_mode; + vp9_find_mv_refs(&cpi->common, xd, xd->mode_info_context, + use_prev_in_find_mv_refs ? xd->prev_mode_info_context : NULL, frame_type, mbmi->ref_mvs[frame_type], cpi->common.ref_frame_sign_bias); // Candidate refinement carried out at encoder and decoder - vp9_find_best_ref_mvs(xd, y_buffer[frame_type], + use_prev_in_find_best_ref = + scale[frame_type].x_num == scale[frame_type].x_den && + scale[frame_type].y_num == scale[frame_type].y_den && + !cm->error_resilient_mode && + !cm->frame_parallel_decoding_mode; + vp9_find_best_ref_mvs(xd, + use_prev_in_find_best_ref ? + yv12_mb[frame_type].y_buffer : NULL, yv12->y_stride, mbmi->ref_mvs[frame_type], &frame_nearest_mv[frame_type], &frame_near_mv[frame_type]); - // Further refinement that is encode side only to test the top few candidates // in full and choose the best as the centre point for subsequent searches. - mv_pred(cpi, x, y_buffer[frame_type], yv12->y_stride, + mv_pred(cpi, x, yv12_mb[frame_type].y_buffer, yv12->y_stride, frame_type, block_size); } +static void model_rd_from_var_lapndz(int var, int n, int qstep, + int *rate, int *dist) { + // This function models the rate and distortion for a Laplacian + // source with given variance when quantized with a uniform quantizer + // with given stepsize. The closed form expressions are in: + // Hang and Chen, "Source Model for transform video coder and its + // application - Part I: Fundamental Theory", IEEE Trans. Circ. + // Sys. for Video Tech., April 1997. + // The function is implemented as piecewise approximation to the + // exact computation. 
+ // TODO(debargha): Implement the functions by interpolating from a + // look-up table + vp9_clear_system_state(); + { + double D, R; + double s2 = (double) var / n; + double s = sqrt(s2); + double x = qstep / s; + if (x > 1.0) { + double y = exp(-x / 2); + double y2 = y * y; + D = 2.069981728764738 * y2 - 2.764286806516079 * y + 1.003956960819275; + R = 0.924056758535089 * y2 + 2.738636469814024 * y - 0.005169662030017; + } else { + double x2 = x * x; + D = 0.075303187668830 * x2 + 0.004296954321112 * x - 0.000413209252807; + if (x > 0.125) + R = 1 / (-0.03459733614226 * x2 + 0.36561675733603 * x + + 0.1626989668625); + else + R = -1.442252874826093 * log(x) + 1.944647760719664; + } + if (R < 0) { + *rate = 0; + *dist = var; + } else { + *rate = (n * R * 256 + 0.5); + *dist = (n * D * s2 + 0.5); + } + } + vp9_clear_system_state(); +} + static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, enum BlockSize block_size, int *saddone, int near_sadidx[], @@ -3209,9 +3214,12 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate_y, int *distortion_y, int *rate_uv, int *distortion_uv, int *mode_excluded, int *disable_skip, - int recon_yoffset, int mode_index, + int mode_index, + INTERPOLATIONFILTERTYPE *best_filter, int_mv frame_mv[MB_MODE_COUNT] - [MAX_REF_FRAMES]) { + [MAX_REF_FRAMES], + YV12_BUFFER_CONFIG *scaled_ref_frame, + int mb_row, int mb_col) { VP9_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; @@ -3229,6 +3237,13 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int_mv cur_mv[2]; int_mv ref_mv[2]; int64_t this_rd = 0; + unsigned char tmp_ybuf[64 * 64]; + unsigned char tmp_ubuf[32 * 32]; + unsigned char tmp_vbuf[32 * 32]; + int pred_exists = 0; + int interpolating_intpel_seen = 0; + int intpel_mv; + int64_t rd, best_rd = INT64_MAX; switch (this_mode) { case NEWMV: @@ -3248,6 +3263,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, x->nmvjointcost, x->mvcost, 96, x->e_mbd.allow_high_precision_mv); } else { + YV12_BUFFER_CONFIG backup_yv12 = xd->pre; int bestsme = INT_MAX; int further_steps, step_param = cpi->sf.first_step; int sadpb = x->sadperbit16; @@ -3259,6 +3275,16 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int tmp_row_min = x->mv_row_min; int tmp_row_max = x->mv_row_max; + if (scaled_ref_frame) { + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. 
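Since model_rd_from_var_lapndz() just added above is the engine behind the switchable-filter search further down, here is a stand-alone version that can be exercised directly; the polynomial and exponential fit constants are copied verbatim from the patch, and the var == 0 corner case falls through to the rate = 0 branch via IEEE semantics exactly as the original does (the original additionally brackets the floating-point work with vp9_clear_system_state()):

#include <math.h>

/* Piecewise fit of rate (in 1/256 bit units) and distortion for a
 * Laplacian source of variance var over n pixels, quantized with a
 * uniform quantizer of stepsize qstep (Hang and Chen, IEEE Trans.
 * Circ. Sys. for Video Tech., April 1997). */
static void rd_from_var_lapndz(int var, int n, int qstep,
                               int *rate, int *dist) {
  double D, R;
  const double s2 = (double)var / n;  /* per-sample variance */
  const double s = sqrt(s2);
  const double x = qstep / s;         /* stepsize normalized by sigma */
  if (x > 1.0) {
    const double y = exp(-x / 2), y2 = y * y;
    D = 2.069981728764738 * y2 - 2.764286806516079 * y + 1.003956960819275;
    R = 0.924056758535089 * y2 + 2.738636469814024 * y - 0.005169662030017;
  } else {
    const double x2 = x * x;
    D = 0.075303187668830 * x2 + 0.004296954321112 * x - 0.000413209252807;
    if (x > 0.125)
      R = 1 / (-0.03459733614226 * x2 + 0.36561675733603 * x +
               0.1626989668625);
    else
      R = -1.442252874826093 * log(x) + 1.944647760719664;
  }
  if (R < 0) {   /* quantizer coarser than the source: no rate spent, */
    *rate = 0;   /* distortion saturates at the source variance       */
    *dist = var;
  } else {
    *rate = (int)(n * R * 256 + 0.5);
    *dist = (int)(n * D * s2 + 0.5);
  }
}

In the filter search below it is fed per-plane variances from vp9_variance16x16/32x32/64x64 and friends, with the dequant step pre-shifted right by 3 because, per the in-code comment, the transform coefficients are scaled eight times relative to an orthogonal transform.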
+ xd->pre = *scaled_ref_frame; + xd->pre.y_buffer += mb_row * 16 * xd->pre.y_stride + mb_col * 16; + xd->pre.u_buffer += mb_row * 8 * xd->pre.uv_stride + mb_col * 8; + xd->pre.v_buffer += mb_row * 8 * xd->pre.uv_stride + mb_col * 8; + } + vp9_clamp_mv_min_max(x, &ref_mv[0]); // mvp_full.as_int = ref_mv[0].as_int; @@ -3267,9 +3293,6 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, mvp_full.as_mv.col >>= 3; mvp_full.as_mv.row >>= 3; - if (mvp_full.as_int != mvp_full.as_int) { - mvp_full.as_int = mvp_full.as_int; - } // adjust search range according to sr from mv prediction step_param = MAX(step_param, sr); @@ -3297,22 +3320,22 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, x->nmvjointcost, x->mvcost, &dis, &sse); } - d->bmi.as_mv.first.as_int = tmp_mv.as_int; - frame_mv[NEWMV][refs[0]].as_int = d->bmi.as_mv.first.as_int; + d->bmi.as_mv[0].as_int = tmp_mv.as_int; + frame_mv[NEWMV][refs[0]].as_int = d->bmi.as_mv[0].as_int; // Add the new motion vector cost to our rolling cost variable *rate2 += vp9_mv_bit_cost(&tmp_mv, &ref_mv[0], x->nmvjointcost, x->mvcost, 96, xd->allow_high_precision_mv); + + // restore the predictor, if required + if (scaled_ref_frame) { + xd->pre = backup_yv12; + } } break; - case NEARESTMV: case NEARMV: - // Do not bother proceeding if the vector (from newmv, nearest or - // near) is 0,0 as this should then be coded using the zeromv mode. - for (i = 0; i < num_refs; ++i) - if (frame_mv[this_mode][refs[i]].as_int == 0) - return INT64_MAX; + case NEARESTMV: case ZEROMV: default: break; @@ -3326,11 +3349,6 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, mbmi->mv[i].as_int = cur_mv[i].as_int; } - if (cpi->common.mcomp_filter_type == SWITCHABLE) { - const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP); - const int m = vp9_switchable_interp_map[mbmi->interp_filter]; - *rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m]; - } /* We don't include the cost of the second reference here, because there * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other @@ -3355,36 +3373,363 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } #endif + pred_exists = 0; + interpolating_intpel_seen = 0; + // Are all MVs integer pel for Y and UV + intpel_mv = (mbmi->mv[0].as_mv.row & 15) == 0 && + (mbmi->mv[0].as_mv.col & 15) == 0; + if (is_comp_pred) + intpel_mv &= (mbmi->mv[1].as_mv.row & 15) == 0 && + (mbmi->mv[1].as_mv.col & 15) == 0; + // Search for best switchable filter by checking the variance of + // pred error irrespective of whether the filter will be used if (block_size == BLOCK_64X64) { - vp9_build_inter64x64_predictors_sb(xd, - xd->dst.y_buffer, - xd->dst.u_buffer, - xd->dst.v_buffer, - xd->dst.y_stride, - xd->dst.uv_stride); + int switchable_filter_index, newbest; + int tmp_rate_y_i = 0, tmp_rate_u_i = 0, tmp_rate_v_i = 0; + int tmp_dist_y_i = 0, tmp_dist_u_i = 0, tmp_dist_v_i = 0; + for (switchable_filter_index = 0; + switchable_filter_index < VP9_SWITCHABLE_FILTERS; + ++switchable_filter_index) { + int rs = 0; + mbmi->interp_filter = vp9_switchable_interp[switchable_filter_index]; + vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); + + if (cpi->common.mcomp_filter_type == SWITCHABLE) { + const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP); + const int m = vp9_switchable_interp_map[mbmi->interp_filter]; + rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m]; + } + if (interpolating_intpel_seen && intpel_mv && + 
vp9_is_interpolating_filter[mbmi->interp_filter]) { + rd = RDCOST(x->rdmult, x->rddiv, + rs + tmp_rate_y_i + tmp_rate_u_i + tmp_rate_v_i, + tmp_dist_y_i + tmp_dist_u_i + tmp_dist_v_i); + } else { + unsigned int sse, var; + int tmp_rate_y, tmp_rate_u, tmp_rate_v; + int tmp_dist_y, tmp_dist_u, tmp_dist_v; + vp9_build_inter64x64_predictors_sb(xd, + xd->dst.y_buffer, + xd->dst.u_buffer, + xd->dst.v_buffer, + xd->dst.y_stride, + xd->dst.uv_stride, + mb_row, mb_col); + var = vp9_variance64x64(*(b->base_src), b->src_stride, + xd->dst.y_buffer, xd->dst.y_stride, &sse); + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. + model_rd_from_var_lapndz(var, 64 * 64, xd->block[0].dequant[1] >> 3, + &tmp_rate_y, &tmp_dist_y); + var = vp9_variance32x32(x->src.u_buffer, x->src.uv_stride, + xd->dst.u_buffer, xd->dst.uv_stride, &sse); + model_rd_from_var_lapndz(var, 32 * 32, xd->block[16].dequant[1] >> 3, + &tmp_rate_u, &tmp_dist_u); + var = vp9_variance32x32(x->src.v_buffer, x->src.uv_stride, + xd->dst.v_buffer, xd->dst.uv_stride, &sse); + model_rd_from_var_lapndz(var, 32 * 32, xd->block[20].dequant[1] >> 3, + &tmp_rate_v, &tmp_dist_v); + rd = RDCOST(x->rdmult, x->rddiv, + rs + tmp_rate_y + tmp_rate_u + tmp_rate_v, + tmp_dist_y + tmp_dist_u + tmp_dist_v); + if (!interpolating_intpel_seen && intpel_mv && + vp9_is_interpolating_filter[mbmi->interp_filter]) { + tmp_rate_y_i = tmp_rate_y; + tmp_rate_u_i = tmp_rate_u; + tmp_rate_v_i = tmp_rate_v; + tmp_dist_y_i = tmp_dist_y; + tmp_dist_u_i = tmp_dist_u; + tmp_dist_v_i = tmp_dist_v; + } + } + newbest = (switchable_filter_index == 0 || rd < best_rd); + if (newbest) { + best_rd = rd; + *best_filter = mbmi->interp_filter; + } + if ((cm->mcomp_filter_type == SWITCHABLE && newbest) || + (cm->mcomp_filter_type != SWITCHABLE && + cm->mcomp_filter_type == mbmi->interp_filter)) { + int i; + for (i = 0; i < 64; ++i) + vpx_memcpy(tmp_ybuf + i * 64, + xd->dst.y_buffer + i * xd->dst.y_stride, + sizeof(unsigned char) * 64); + for (i = 0; i < 32; ++i) + vpx_memcpy(tmp_ubuf + i * 32, + xd->dst.u_buffer + i * xd->dst.uv_stride, + sizeof(unsigned char) * 32); + for (i = 0; i < 32; ++i) + vpx_memcpy(tmp_vbuf + i * 32, + xd->dst.v_buffer + i * xd->dst.uv_stride, + sizeof(unsigned char) * 32); + pred_exists = 1; + } + interpolating_intpel_seen |= + intpel_mv && vp9_is_interpolating_filter[mbmi->interp_filter]; + } } else if (block_size == BLOCK_32X32) { - vp9_build_inter32x32_predictors_sb(xd, - xd->dst.y_buffer, - xd->dst.u_buffer, - xd->dst.v_buffer, - xd->dst.y_stride, - xd->dst.uv_stride); + int switchable_filter_index, newbest; + int tmp_rate_y_i = 0, tmp_rate_u_i = 0, tmp_rate_v_i = 0; + int tmp_dist_y_i = 0, tmp_dist_u_i = 0, tmp_dist_v_i = 0; + for (switchable_filter_index = 0; + switchable_filter_index < VP9_SWITCHABLE_FILTERS; + ++switchable_filter_index) { + int rs = 0; + mbmi->interp_filter = vp9_switchable_interp[switchable_filter_index]; + vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); + if (cpi->common.mcomp_filter_type == SWITCHABLE) { + const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP); + const int m = vp9_switchable_interp_map[mbmi->interp_filter]; + rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m]; + } + if (interpolating_intpel_seen && intpel_mv && + vp9_is_interpolating_filter[mbmi->interp_filter]) { + rd = RDCOST(x->rdmult, x->rddiv, + rs + tmp_rate_y_i + 
tmp_rate_u_i + tmp_rate_v_i, + tmp_dist_y_i + tmp_dist_u_i + tmp_dist_v_i); + } else { + unsigned int sse, var; + int tmp_rate_y, tmp_rate_u, tmp_rate_v; + int tmp_dist_y, tmp_dist_u, tmp_dist_v; + vp9_build_inter32x32_predictors_sb(xd, + xd->dst.y_buffer, + xd->dst.u_buffer, + xd->dst.v_buffer, + xd->dst.y_stride, + xd->dst.uv_stride, + mb_row, mb_col); + var = vp9_variance32x32(*(b->base_src), b->src_stride, + xd->dst.y_buffer, xd->dst.y_stride, &sse); + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. + model_rd_from_var_lapndz(var, 32 * 32, xd->block[0].dequant[1] >> 3, + &tmp_rate_y, &tmp_dist_y); + var = vp9_variance16x16(x->src.u_buffer, x->src.uv_stride, + xd->dst.u_buffer, xd->dst.uv_stride, &sse); + model_rd_from_var_lapndz(var, 16 * 16, xd->block[16].dequant[1] >> 3, + &tmp_rate_u, &tmp_dist_u); + var = vp9_variance16x16(x->src.v_buffer, x->src.uv_stride, + xd->dst.v_buffer, xd->dst.uv_stride, &sse); + model_rd_from_var_lapndz(var, 16 * 16, xd->block[20].dequant[1] >> 3, + &tmp_rate_v, &tmp_dist_v); + rd = RDCOST(x->rdmult, x->rddiv, + rs + tmp_rate_y + tmp_rate_u + tmp_rate_v, + tmp_dist_y + tmp_dist_u + tmp_dist_v); + if (!interpolating_intpel_seen && intpel_mv && + vp9_is_interpolating_filter[mbmi->interp_filter]) { + tmp_rate_y_i = tmp_rate_y; + tmp_rate_u_i = tmp_rate_u; + tmp_rate_v_i = tmp_rate_v; + tmp_dist_y_i = tmp_dist_y; + tmp_dist_u_i = tmp_dist_u; + tmp_dist_v_i = tmp_dist_v; + } + } + newbest = (switchable_filter_index == 0 || rd < best_rd); + if (newbest) { + best_rd = rd; + *best_filter = mbmi->interp_filter; + } + if ((cm->mcomp_filter_type == SWITCHABLE && newbest) || + (cm->mcomp_filter_type != SWITCHABLE && + cm->mcomp_filter_type == mbmi->interp_filter)) { + int i; + for (i = 0; i < 32; ++i) + vpx_memcpy(tmp_ybuf + i * 64, + xd->dst.y_buffer + i * xd->dst.y_stride, + sizeof(unsigned char) * 32); + for (i = 0; i < 16; ++i) + vpx_memcpy(tmp_ubuf + i * 32, + xd->dst.u_buffer + i * xd->dst.uv_stride, + sizeof(unsigned char) * 16); + for (i = 0; i < 16; ++i) + vpx_memcpy(tmp_vbuf + i * 32, + xd->dst.v_buffer + i * xd->dst.uv_stride, + sizeof(unsigned char) * 16); + pred_exists = 1; + } + interpolating_intpel_seen |= + intpel_mv && vp9_is_interpolating_filter[mbmi->interp_filter]; + } } else { + int switchable_filter_index, newbest; + int tmp_rate_y_i = 0, tmp_rate_u_i = 0, tmp_rate_v_i = 0; + int tmp_dist_y_i = 0, tmp_dist_u_i = 0, tmp_dist_v_i = 0; assert(block_size == BLOCK_16X16); - vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0); - if (is_comp_pred) - vp9_build_2nd_inter16x16_predictors_mby(xd, xd->predictor, 16); + for (switchable_filter_index = 0; + switchable_filter_index < VP9_SWITCHABLE_FILTERS; + ++switchable_filter_index) { + int rs = 0; + mbmi->interp_filter = vp9_switchable_interp[switchable_filter_index]; + vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); + if (cpi->common.mcomp_filter_type == SWITCHABLE) { + const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP); + const int m = vp9_switchable_interp_map[mbmi->interp_filter]; + rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m]; + } + if (interpolating_intpel_seen && intpel_mv && + vp9_is_interpolating_filter[mbmi->interp_filter]) { + rd = RDCOST(x->rdmult, x->rddiv, + rs + tmp_rate_y_i + tmp_rate_u_i + tmp_rate_v_i, + tmp_dist_y_i + tmp_dist_u_i + tmp_dist_v_i); + } else { + unsigned 
int sse, var; + int tmp_rate_y, tmp_rate_u, tmp_rate_v; + int tmp_dist_y, tmp_dist_u, tmp_dist_v; + // TODO(jkoleszar): these 2 y/uv should be replaced with one call to + // vp9_build_interintra_16x16_predictors_mb(). + vp9_build_inter16x16_predictors_mby(xd, xd->predictor, 16, + mb_row, mb_col); + #if CONFIG_COMP_INTERINTRA_PRED - if (is_comp_interintra_pred) { - vp9_build_interintra_16x16_predictors_mby(xd, xd->predictor, 16); + if (is_comp_interintra_pred) { + vp9_build_interintra_16x16_predictors_mby(xd, xd->predictor, 16); + } +#endif + + vp9_build_inter16x16_predictors_mbuv(xd, xd->predictor + 256, + xd->predictor + 320, 8, + mb_row, mb_col); + +#if CONFIG_COMP_INTERINTRA_PRED + if (is_comp_interintra_pred) { + vp9_build_interintra_16x16_predictors_mbuv(xd, xd->predictor + 256, + xd->predictor + 320, 8); + } +#endif + var = vp9_variance16x16(*(b->base_src), b->src_stride, + xd->predictor, 16, &sse); + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. + model_rd_from_var_lapndz(var, 16 * 16, xd->block[0].dequant[1] >> 3, + &tmp_rate_y, &tmp_dist_y); + var = vp9_variance8x8(x->src.u_buffer, x->src.uv_stride, + &xd->predictor[256], 8, &sse); + model_rd_from_var_lapndz(var, 8 * 8, xd->block[16].dequant[1] >> 3, + &tmp_rate_u, &tmp_dist_u); + var = vp9_variance8x8(x->src.v_buffer, x->src.uv_stride, + &xd->predictor[320], 8, &sse); + model_rd_from_var_lapndz(var, 8 * 8, xd->block[20].dequant[1] >> 3, + &tmp_rate_v, &tmp_dist_v); + rd = RDCOST(x->rdmult, x->rddiv, + rs + tmp_rate_y + tmp_rate_u + tmp_rate_v, + tmp_dist_y + tmp_dist_u + tmp_dist_v); + if (!interpolating_intpel_seen && intpel_mv && + vp9_is_interpolating_filter[mbmi->interp_filter]) { + tmp_rate_y_i = tmp_rate_y; + tmp_rate_u_i = tmp_rate_u; + tmp_rate_v_i = tmp_rate_v; + tmp_dist_y_i = tmp_dist_y; + tmp_dist_u_i = tmp_dist_u; + tmp_dist_v_i = tmp_dist_v; + } + } + newbest = (switchable_filter_index == 0 || rd < best_rd); + if (newbest) { + best_rd = rd; + *best_filter = mbmi->interp_filter; + } + if ((cm->mcomp_filter_type == SWITCHABLE && newbest) || + (cm->mcomp_filter_type != SWITCHABLE && + cm->mcomp_filter_type == mbmi->interp_filter)) { + vpx_memcpy(tmp_ybuf, xd->predictor, sizeof(unsigned char) * 256); + vpx_memcpy(tmp_ubuf, xd->predictor + 256, sizeof(unsigned char) * 64); + vpx_memcpy(tmp_vbuf, xd->predictor + 320, sizeof(unsigned char) * 64); + pred_exists = 1; + } + interpolating_intpel_seen |= + intpel_mv && vp9_is_interpolating_filter[mbmi->interp_filter]; } + } + + // Set the appropriate filter + if (cm->mcomp_filter_type != SWITCHABLE) + mbmi->interp_filter = cm->mcomp_filter_type; + else + mbmi->interp_filter = *best_filter; + vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); + + if (pred_exists) { + if (block_size == BLOCK_64X64) { + for (i = 0; i < 64; ++i) + vpx_memcpy(xd->dst.y_buffer + i * xd->dst.y_stride, tmp_ybuf + i * 64, + sizeof(unsigned char) * 64); + for (i = 0; i < 32; ++i) + vpx_memcpy(xd->dst.u_buffer + i * xd->dst.uv_stride, tmp_ubuf + i * 32, + sizeof(unsigned char) * 32); + for (i = 0; i < 32; ++i) + vpx_memcpy(xd->dst.v_buffer + i * xd->dst.uv_stride, tmp_vbuf + i * 32, + sizeof(unsigned char) * 32); + } else if (block_size == BLOCK_32X32) { + for (i = 0; i < 32; ++i) + vpx_memcpy(xd->dst.y_buffer + i * xd->dst.y_stride, tmp_ybuf + i * 64, + sizeof(unsigned char) * 32); + for (i = 0; i < 16; ++i) + vpx_memcpy(xd->dst.u_buffer
+ i * xd->dst.uv_stride, tmp_ubuf + i * 32, + sizeof(unsigned char) * 16); + for (i = 0; i < 16; ++i) + vpx_memcpy(xd->dst.v_buffer + i * xd->dst.uv_stride, tmp_vbuf + i * 32, + sizeof(unsigned char) * 16); + } else { + vpx_memcpy(xd->predictor, tmp_ybuf, sizeof(unsigned char) * 256); + vpx_memcpy(xd->predictor + 256, tmp_ubuf, sizeof(unsigned char) * 64); + vpx_memcpy(xd->predictor + 320, tmp_vbuf, sizeof(unsigned char) * 64); + } + } else { + // Handles the special case when a filter that is not in the + // switchable list (ex. bilinear, 6-tap) is indicated at the frame level + if (block_size == BLOCK_64X64) { + vp9_build_inter64x64_predictors_sb(xd, + xd->dst.y_buffer, + xd->dst.u_buffer, + xd->dst.v_buffer, + xd->dst.y_stride, + xd->dst.uv_stride, + mb_row, mb_col); + } else if (block_size == BLOCK_32X32) { + vp9_build_inter32x32_predictors_sb(xd, + xd->dst.y_buffer, + xd->dst.u_buffer, + xd->dst.v_buffer, + xd->dst.y_stride, + xd->dst.uv_stride, + mb_row, mb_col); + } else { + // TODO(jkoleszar): These y/uv fns can be replaced with their mb + // equivalent + vp9_build_inter16x16_predictors_mby(xd, xd->predictor, 16, + mb_row, mb_col); +#if CONFIG_COMP_INTERINTRA_PRED + if (is_comp_interintra_pred) { + vp9_build_interintra_16x16_predictors_mby(xd, xd->predictor, 16); + } +#endif + vp9_build_inter16x16_predictors_mbuv(xd, &xd->predictor[256], + &xd->predictor[320], 8, + mb_row, mb_col); +#if CONFIG_COMP_INTERINTRA_PRED + if (is_comp_interintra_pred) { + vp9_build_interintra_16x16_predictors_mbuv(xd, &xd->predictor[256], + &xd->predictor[320], 8); + } #endif + } + } + + if (cpi->common.mcomp_filter_type == SWITCHABLE) { + const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP); + const int m = vp9_switchable_interp_map[mbmi->interp_filter]; + *rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m]; } if (cpi->active_map_enabled && x->active_ptr[0] == 0) x->skip = 1; else if (x->encode_breakout) { - unsigned int sse, var; + unsigned int var, sse; int threshold = (xd->block[0].dequant[1] * xd->block[0].dequant[1] >> 4); @@ -3404,9 +3749,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } if ((int)sse < threshold) { - unsigned int q2dc = xd->block[24].dequant[0]; + unsigned int q2dc = xd->block[0].dequant[0]; /* If there is no codeable 2nd order dc - or a very small uniform pixel change change */ + or a very small uniform pixel change */ if ((sse - var < q2dc * q2dc >> 4) || (sse / 2 > var && sse - var < 64)) { // Check u and v to make sure skip is ok @@ -3447,17 +3792,6 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } } - if (!(*mode_excluded)) { - if (is_comp_pred) { - *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY); - } else { - *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY); - } -#if CONFIG_COMP_INTERINTRA_PRED - if (is_comp_interintra_pred && !cm->use_interintra) *mode_excluded = 1; -#endif - } - if (!x->skip) { if (block_size == BLOCK_64X64) { int skippable_y, skippable_uv; @@ -3491,30 +3825,32 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, *skippable = skippable_y && skippable_uv; } else { assert(block_size == BLOCK_16X16); - - vp9_build_1st_inter16x16_predictors_mbuv(xd, &xd->predictor[256], - &xd->predictor[320], 8); - if (is_comp_pred) - vp9_build_2nd_inter16x16_predictors_mbuv(xd, &xd->predictor[256], - &xd->predictor[320], 8); -#if CONFIG_COMP_INTERINTRA_PRED - if (is_comp_interintra_pred) { - vp9_build_interintra_16x16_predictors_mbuv(xd,
&xd->predictor[256], - &xd->predictor[320], 8); - } -#endif inter_mode_cost(cpi, x, rate2, distortion, rate_y, distortion_y, rate_uv, distortion_uv, skippable, txfm_cache); } } + + if (!(*mode_excluded)) { + if (is_comp_pred) { + *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY); + } else { + *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY); + } +#if CONFIG_COMP_INTERINTRA_PRED + if (is_comp_interintra_pred && !cm->use_interintra) *mode_excluded = 1; +#endif + } + return this_rd; // if 0, this will be re-calculated by caller } static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, - int recon_yoffset, int recon_uvoffset, + int mb_row, int mb_col, int *returnrate, int *returndistortion, int64_t *returnintra) { + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; VP9_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; union b_mode_info best_bmodes[16]; @@ -3544,6 +3880,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, #endif int64_t best_overall_rd = INT64_MAX; INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE; + INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE; int uv_intra_rate, uv_intra_distortion, uv_intra_rate_tokenonly; int uv_intra_skippable = 0; int uv_intra_rate_8x8 = 0, uv_intra_distortion_8x8 = 0, uv_intra_rate_tokenonly_8x8 = 0; @@ -3551,7 +3888,6 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int rate_y, UNINITIALIZED_IS_SAFE(rate_uv); int distortion_uv = INT_MAX; int64_t best_yrd = INT64_MAX; - int switchable_filter_index = 0; MB_PREDICTION_MODE uv_intra_mode; MB_PREDICTION_MODE uv_intra_mode_8x8 = 0; @@ -3561,7 +3897,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; int frame_mdcounts[4][4]; - uint8_t *y_buffer[4], *u_buffer[4], *v_buffer[4]; + YV12_BUFFER_CONFIG yv12_mb[4]; unsigned int ref_costs[MAX_REF_FRAMES]; int_mv seg_mvs[NB_PARTITIONINGS][16 /* n_blocks */][MAX_REF_FRAMES - 1]; @@ -3569,6 +3905,8 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex, cpi->common.y1dc_delta_q); + struct scale_factors scale_factor[4]; + vpx_memset(mode8x8, 0, sizeof(mode8x8)); vpx_memset(&frame_mv, 0, sizeof(frame_mv)); vpx_memset(&best_mbmode, 0, sizeof(best_mbmode)); @@ -3592,24 +3930,24 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } if (cpi->ref_frame_flags & VP9_LAST_FLAG) { - setup_buffer_inter(cpi, x, cpi->common.lst_fb_idx, LAST_FRAME, - BLOCK_16X16, recon_yoffset, recon_uvoffset, + setup_buffer_inter(cpi, x, cpi->lst_fb_idx, + LAST_FRAME, BLOCK_16X16, mb_row, mb_col, frame_mv[NEARESTMV], frame_mv[NEARMV], - frame_mdcounts, y_buffer, u_buffer, v_buffer); + frame_mdcounts, yv12_mb, scale_factor); } if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { - setup_buffer_inter(cpi, x, cpi->common.gld_fb_idx, GOLDEN_FRAME, - BLOCK_16X16, recon_yoffset, recon_uvoffset, + setup_buffer_inter(cpi, x, cpi->gld_fb_idx, + GOLDEN_FRAME, BLOCK_16X16, mb_row, mb_col, frame_mv[NEARESTMV], frame_mv[NEARMV], - frame_mdcounts, y_buffer, u_buffer, v_buffer); + frame_mdcounts, yv12_mb, scale_factor); } if (cpi->ref_frame_flags & VP9_ALT_FLAG) { - setup_buffer_inter(cpi, x, cpi->common.alt_fb_idx, ALTREF_FRAME, - BLOCK_16X16, recon_yoffset, recon_uvoffset, + setup_buffer_inter(cpi, x, cpi->alt_fb_idx, + ALTREF_FRAME, BLOCK_16X16, mb_row, mb_col, frame_mv[NEARESTMV], frame_mv[NEARMV], - frame_mdcounts, y_buffer, u_buffer, v_buffer); + 
frame_mdcounts, yv12_mb, scale_factor); } *returnintra = INT64_MAX; @@ -3638,8 +3976,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, // that depend on the current prediction etc. estimate_ref_frame_costs(cpi, segment_id, ref_costs); - for (mode_index = 0; mode_index < MAX_MODES; - mode_index += (!switchable_filter_index)) { + for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) { int64_t this_rd = INT64_MAX; int disable_skip = 0, skippable = 0; int other_cost = 0; @@ -3649,6 +3986,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, #endif int mode_excluded = 0; int64_t txfm_cache[NB_TXFM_MODES] = { 0 }; + YV12_BUFFER_CONFIG *scaled_ref_frame; // These variables hold our rolling total cost and distortion for this mode rate2 = 0; @@ -3664,24 +4002,38 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, mbmi->ref_frame = vp9_mode_order[mode_index].ref_frame; mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame; - // Evaluate all sub-pel filters irrespective of whether we can use - // them for this frame. - if (this_mode >= NEARESTMV && this_mode <= SPLITMV) { - mbmi->interp_filter = - vp9_switchable_interp[switchable_filter_index++]; - if (switchable_filter_index == VP9_SWITCHABLE_FILTERS) - switchable_filter_index = 0; - if ((cm->mcomp_filter_type != SWITCHABLE) && - (cm->mcomp_filter_type != mbmi->interp_filter)) { - mode_excluded = 1; - } - vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); - } + mbmi->interp_filter = cm->mcomp_filter_type; + + set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame, + scale_factor); + + vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); // Test best rd so far against threshold for trying this mode. if (best_rd <= cpi->rd_threshes[mode_index]) continue; + // Ensure that the references used by this mode are available. + if (mbmi->ref_frame && + !(cpi->ref_frame_flags & flag_list[mbmi->ref_frame])) + continue; + + if (mbmi->second_ref_frame > 0 && + !(cpi->ref_frame_flags & flag_list[mbmi->second_ref_frame])) + continue; + + // only scale on zeromv. + if (mbmi->ref_frame > 0 && + (yv12_mb[mbmi->ref_frame].y_width != cm->mb_cols * 16 || + yv12_mb[mbmi->ref_frame].y_height != cm->mb_rows * 16) && + this_mode != ZEROMV) + continue; + if (mbmi->second_ref_frame > 0 && + (yv12_mb[mbmi->second_ref_frame].y_width != cm->mb_cols * 16 || + yv12_mb[mbmi->second_ref_frame].y_height != cm->mb_rows * 16) && + this_mode != ZEROMV) + continue; + // current coding mode under rate-distortion optimization test loop #if CONFIG_COMP_INTERINTRA_PRED mbmi->interintra_mode = (MB_PREDICTION_MODE)(DC_PRED - 1); @@ -3693,18 +4045,16 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) && !vp9_check_segref(xd, segment_id, mbmi->ref_frame)) { continue; - // If the segment mode feature is enabled.... + // If the segment skip feature is enabled.... // then do nothing if the current mode is not allowed.. - } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) && - (this_mode != - vp9_get_segdata(xd, segment_id, SEG_LVL_MODE))) { + } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) && + (this_mode != ZEROMV)) { continue; - // Disable this drop out case if either the mode or ref frame - // segment level feature is enabled for this segment. This is to + // Disable this drop out case if the ref frame segment + // level feature is enabled for this segment.
This is to // prevent the possibility that we end up unable to pick any mode. - } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) && - !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) { - // Only consider ZEROMV/ALTREF_FRAME for alt ref frame, + } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME)) { + // Only consider ZEROMV/ALTREF_FRAME for alt ref frame overlay, // unless ARNR filtering is enabled in which case we want // an unfiltered alternative if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) { @@ -3716,22 +4066,31 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } /* everything but intra */ + scaled_ref_frame = NULL; if (mbmi->ref_frame) { int ref = mbmi->ref_frame; + int fb; - xd->pre.y_buffer = y_buffer[ref]; - xd->pre.u_buffer = u_buffer[ref]; - xd->pre.v_buffer = v_buffer[ref]; + xd->pre = yv12_mb[ref]; best_ref_mv = mbmi->ref_mvs[ref][0]; vpx_memcpy(mdcounts, frame_mdcounts[ref], sizeof(mdcounts)); + + if (mbmi->ref_frame == LAST_FRAME) { + fb = cpi->lst_fb_idx; + } else if (mbmi->ref_frame == GOLDEN_FRAME) { + fb = cpi->gld_fb_idx; + } else { + fb = cpi->alt_fb_idx; + } + + if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb]) + scaled_ref_frame = &cm->yv12_fb[cpi->scaled_ref_idx[fb]]; } if (mbmi->second_ref_frame > 0) { int ref = mbmi->second_ref_frame; - xd->second_pre.y_buffer = y_buffer[ref]; - xd->second_pre.u_buffer = u_buffer[ref]; - xd->second_pre.v_buffer = v_buffer[ref]; + xd->second_pre = yv12_mb[ref]; second_best_ref_mv = mbmi->ref_mvs[ref][0]; } @@ -3798,8 +4157,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, // the BPRED mode : x->mbmode_cost[xd->frame_type][BPRED]; mbmi->txfm_size = TX_4X4; tmp_rd = rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y, - &distortion, best_yrd, - cpi->update_context); + &distortion, best_yrd); rate2 += rate; rate2 += intra_cost_penalty; distortion2 += distortion; @@ -3898,29 +4256,108 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, // special case it. else if (this_mode == SPLITMV) { const int is_comp_pred = mbmi->second_ref_frame > 0; - int64_t tmp_rd, this_rd_thresh; + int64_t this_rd_thresh; + int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX; + int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX; + int tmp_best_distortion = INT_MAX, tmp_best_skippable = 0; + int switchable_filter_index; int_mv *second_ref = is_comp_pred ? &second_best_ref_mv : NULL; + union b_mode_info tmp_best_bmodes[16]; + MB_MODE_INFO tmp_best_mbmode; + PARTITION_INFO tmp_best_partition; + int pred_exists = 0; this_rd_thresh = - (mbmi->ref_frame == LAST_FRAME) ? + (mbmi->ref_frame == LAST_FRAME) ? cpi->rd_threshes[THR_NEWMV] : cpi->rd_threshes[THR_NEWA]; this_rd_thresh = - (mbmi->ref_frame == GOLDEN_FRAME) ? + (mbmi->ref_frame == GOLDEN_FRAME) ?
cpi->rd_threshes[THR_NEWG] : this_rd_thresh; - tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv, - second_ref, best_yrd, mdcounts, - &rate, &rate_y, &distortion, - &skippable, - (int)this_rd_thresh, seg_mvs, - txfm_cache); + for (switchable_filter_index = 0; + switchable_filter_index < VP9_SWITCHABLE_FILTERS; + ++switchable_filter_index) { + int newbest; + mbmi->interp_filter = + vp9_switchable_interp[switchable_filter_index]; + vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); + + tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv, + second_ref, best_yrd, mdcounts, + &rate, &rate_y, &distortion, + &skippable, + (int)this_rd_thresh, seg_mvs, + txfm_cache); + if (cpi->common.mcomp_filter_type == SWITCHABLE) { + int rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs + [vp9_get_pred_context(&cpi->common, xd, + PRED_SWITCHABLE_INTERP)] + [vp9_switchable_interp_map[mbmi->interp_filter]]; + tmp_rd += RDCOST(x->rdmult, x->rddiv, rs, 0); + } + newbest = (tmp_rd < tmp_best_rd); + if (newbest) { + tmp_best_filter = mbmi->interp_filter; + tmp_best_rd = tmp_rd; + } + if ((newbest && cm->mcomp_filter_type == SWITCHABLE) || + (mbmi->interp_filter == cm->mcomp_filter_type && + cm->mcomp_filter_type != SWITCHABLE)) { + tmp_best_rdu = tmp_rd; + tmp_best_rate = rate; + tmp_best_ratey = rate_y; + tmp_best_distortion = distortion; + tmp_best_skippable = skippable; + vpx_memcpy(&tmp_best_mbmode, mbmi, sizeof(MB_MODE_INFO)); + vpx_memcpy(&tmp_best_partition, x->partition_info, + sizeof(PARTITION_INFO)); + for (i = 0; i < 16; i++) { + tmp_best_bmodes[i] = xd->block[i].bmi; + } + pred_exists = 1; + } + } // switchable_filter_index loop + + mbmi->interp_filter = (cm->mcomp_filter_type == SWITCHABLE ? + tmp_best_filter : cm->mcomp_filter_type); + vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); + if (!pred_exists) { + // Handles the special case when a filter that is not in the + // switchable list (bilinear, 6-tap) is indicated at the frame level + tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv, + second_ref, best_yrd, mdcounts, + &rate, &rate_y, &distortion, + &skippable, + (int)this_rd_thresh, seg_mvs, + txfm_cache); + } else { + if (cpi->common.mcomp_filter_type == SWITCHABLE) { + int rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs + [vp9_get_pred_context(&cpi->common, xd, + PRED_SWITCHABLE_INTERP)] + [vp9_switchable_interp_map[mbmi->interp_filter]]; + tmp_best_rdu -= RDCOST(x->rdmult, x->rddiv, rs, 0); + } + tmp_rd = tmp_best_rdu; + rate = tmp_best_rate; + rate_y = tmp_best_ratey; + distortion = tmp_best_distortion; + skippable = tmp_best_skippable; + vpx_memcpy(mbmi, &tmp_best_mbmode, sizeof(MB_MODE_INFO)); + vpx_memcpy(x->partition_info, &tmp_best_partition, + sizeof(PARTITION_INFO)); + for (i = 0; i < 16; i++) { + xd->block[i].bmi = tmp_best_bmodes[i]; + } + } + rate2 += rate; distortion2 += distortion; if (cpi->common.mcomp_filter_type == SWITCHABLE) rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)] - [vp9_switchable_interp_map[mbmi->interp_filter]]; + [vp9_switchable_interp_map[mbmi->interp_filter]]; // If even the 'Y' rd value of split is higher than best so far // then don't bother looking at UV @@ -3928,7 +4365,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int uv_skippable; rd_inter4x4_uv(cpi, x, &rate_uv, &distortion_uv, &uv_skippable, - cpi->common.full_pixel); + cpi->common.full_pixel, mb_row, mb_col); rate2 +=
rate_uv; distortion2 += distortion_uv; skippable = skippable && uv_skippable; } @@ -3969,8 +4406,9 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, #endif &rate_y, &distortion, &rate_uv, &distortion_uv, - &mode_excluded, &disable_skip, recon_yoffset, - mode_index, frame_mv); + &mode_excluded, &disable_skip, + mode_index, &tmp_best_filter, frame_mv, + scaled_ref_frame, mb_row, mb_col); if (this_rd == INT64_MAX) continue; } @@ -3995,10 +4433,8 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (cpi->common.mb_no_coeff_skip) { int mb_skip_allowed; - // Is Mb level skip allowed for this mb. - mb_skip_allowed = - !vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) || - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB); + // Is Mb level skip allowed (i.e. not coded at segment level). + mb_skip_allowed = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP); if (skippable) { mbmi->mb_skip_coeff = 1; @@ -4061,7 +4497,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (this_rd < best_overall_rd) { best_overall_rd = this_rd; - best_filter = mbmi->interp_filter; + best_filter = tmp_best_filter; best_mode = this_mode; #if CONFIG_COMP_INTERINTRA_PRED is_best_interintra = (mbmi->second_ref_frame == INTRA_FRAME); @@ -4175,7 +4611,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (x->skip && !mode_excluded) break; - } + } assert((cm->mcomp_filter_type == SWITCHABLE) || (cm->mcomp_filter_type == best_mbmode.interp_filter) || @@ -4204,12 +4640,11 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, cpi->rd_thresh_mult[best_mode_index]; } - // This code force Altref,0,0 and skip for the frame that overlays a + // This code forces Altref,0,0 and skip for the frame that overlays a // an altref unless Altref is filtered. However, this is unsafe if - // segment level coding of ref frame or mode is enabled for this + // segment level coding of ref frame is enabled for this // segment. if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) && - !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) && cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0) && (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) { @@ -4224,6 +4659,8 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, mbmi->mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ?
1 : 0; mbmi->partitioning = 0; + set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame, + scale_factor); vpx_memset(best_pred_diff, 0, sizeof(best_pred_diff)); vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff)); @@ -4244,10 +4681,12 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (best_mbmode.mode == SPLITMV) { for (i = 0; i < 16; i++) - xd->mode_info_context->bmi[i].as_mv.first.as_int = best_bmodes[i].as_mv.first.as_int; + xd->mode_info_context->bmi[i].as_mv[0].as_int = + best_bmodes[i].as_mv[0].as_int; if (mbmi->second_ref_frame > 0) for (i = 0; i < 16; i++) - xd->mode_info_context->bmi[i].as_mv.second.as_int = best_bmodes[i].as_mv.second.as_int; + xd->mode_info_context->bmi[i].as_mv[1].as_int = + best_bmodes[i].as_mv[1].as_int; vpx_memcpy(x->partition_info, &best_partition, sizeof(PARTITION_INFO)); @@ -4265,7 +4704,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (!x->skip) { for (i = 0; i < NB_TXFM_MODES; i++) { if (best_txfm_rd[i] == INT64_MAX) - best_txfm_diff[i] = INT_MIN; + best_txfm_diff[i] = 0; else best_txfm_diff[i] = best_rd - best_txfm_rd[i]; } @@ -4274,6 +4713,8 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } end: + set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame, + scale_factor); store_coding_context(x, &x->mb_context[xd->sb_index][xd->mb_index], best_mode_index, &best_partition, &mbmi->ref_mvs[mbmi->ref_frame][0], @@ -4291,22 +4732,28 @@ void vp9_rd_pick_intra_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x, int rate_y_tokenonly = 0, rate_uv_tokenonly; int dist_y = 0, dist_uv; int y_skip = 0, uv_skip; - int64_t txfm_cache[NB_TXFM_MODES]; + int64_t txfm_cache[NB_TXFM_MODES], err; + int i; - rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, - &dist_y, &y_skip, txfm_cache); + err = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, + &dist_y, &y_skip, txfm_cache); rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, - &dist_uv, &uv_skip); + &dist_uv, &uv_skip); if (cpi->common.mb_no_coeff_skip && y_skip && uv_skip) { *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly + vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1); *returndist = dist_y + (dist_uv >> 2); + memset(x->sb32_context[xd->sb_index].txfm_rd_diff, 0, + sizeof(x->sb32_context[xd->sb_index].txfm_rd_diff)); } else { *returnrate = rate_y + rate_uv; if (cpi->common.mb_no_coeff_skip) *returnrate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0); *returndist = dist_y + (dist_uv >> 2); + for (i = 0; i < NB_TXFM_MODES; i++) { + x->sb32_context[xd->sb_index].txfm_rd_diff[i] = err - txfm_cache[i]; + } } } @@ -4319,22 +4766,28 @@ void vp9_rd_pick_intra_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x, int rate_y_tokenonly = 0, rate_uv_tokenonly; int dist_y = 0, dist_uv; int y_skip = 0, uv_skip; - int64_t txfm_cache[NB_TXFM_MODES]; + int64_t txfm_cache[NB_TXFM_MODES], err; + int i; - rd_pick_intra_sb64y_mode(cpi, x, &rate_y, &rate_y_tokenonly, - &dist_y, &y_skip, txfm_cache); + err = rd_pick_intra_sb64y_mode(cpi, x, &rate_y, &rate_y_tokenonly, + &dist_y, &y_skip, txfm_cache); rd_pick_intra_sb64uv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, - &dist_uv, &uv_skip); + &dist_uv, &uv_skip); if (cpi->common.mb_no_coeff_skip && y_skip && uv_skip) { *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly + vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1); *returndist = dist_y + (dist_uv >> 2); + memset(x->sb64_context.txfm_rd_diff, 0, + sizeof(x->sb64_context.txfm_rd_diff)); } else { *returnrate 
= rate_y + rate_uv; if (cm->mb_no_coeff_skip) *returnrate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0); *returndist = dist_y + (dist_uv >> 2); + for (i = 0; i < NB_TXFM_MODES; i++) { + x->sb64_context.txfm_rd_diff[i] = err - txfm_cache[i]; + } } } @@ -4392,10 +4845,10 @@ void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x, mode8x8[2]= xd->mode_info_context->bmi[8].as_mode.first; mode8x8[3]= xd->mode_info_context->bmi[10].as_mode.first; + mbmi->txfm_size = TX_4X4; error4x4 = rd_pick_intra4x4mby_modes(cpi, x, &rate4x4, &rate4x4_tokenonly, - &dist4x4, error16x16, - cpi->update_context); + &dist4x4, error16x16); mbmi->mb_skip_coeff = 0; if (cpi->common.mb_no_coeff_skip && @@ -4457,7 +4910,7 @@ void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x, } static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, - int recon_yoffset, int recon_uvoffset, + int mb_row, int mb_col, int *returnrate, int *returndistortion, int block_size) { @@ -4471,13 +4924,13 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int comp_pred, i; int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; int frame_mdcounts[4][4]; - uint8_t *y_buffer[4]; - uint8_t *u_buffer[4]; - uint8_t *v_buffer[4]; + YV12_BUFFER_CONFIG yv12_mb[4]; static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG }; - int idx_list[4] = { 0, cpi->common.lst_fb_idx, cpi->common.gld_fb_idx, - cpi->common.alt_fb_idx }; + int idx_list[4] = {0, + cpi->lst_fb_idx, + cpi->gld_fb_idx, + cpi->alt_fb_idx}; int mdcounts[4]; int near_sadidx[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; int saddone = 0; @@ -4496,16 +4949,16 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, #endif int64_t best_overall_rd = INT64_MAX; INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE; + INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE; int rate_uv_4x4 = 0, rate_uv_8x8 = 0, rate_uv_tokenonly_4x4 = 0, rate_uv_tokenonly_8x8 = 0; int dist_uv_4x4 = 0, dist_uv_8x8 = 0, uv_skip_4x4 = 0, uv_skip_8x8 = 0; MB_PREDICTION_MODE mode_uv_4x4 = NEARESTMV, mode_uv_8x8 = NEARESTMV; - int switchable_filter_index = 0; int rate_uv_16x16 = 0, rate_uv_tokenonly_16x16 = 0; int dist_uv_16x16 = 0, uv_skip_16x16 = 0; MB_PREDICTION_MODE mode_uv_16x16 = NEARESTMV; + struct scale_factors scale_factor[4]; - x->skip = 0; xd->mode_info_context->mbmi.segment_id = segment_id; estimate_ref_frame_costs(cpi, segment_id, ref_costs); vpx_memset(&best_mbmode, 0, sizeof(best_mbmode)); @@ -4518,9 +4971,9 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { if (cpi->ref_frame_flags & flag_list[ref_frame]) { setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame, block_size, - recon_yoffset, recon_uvoffset, frame_mv[NEARESTMV], + mb_row, mb_col, frame_mv[NEARESTMV], frame_mv[NEARMV], frame_mdcounts, - y_buffer, u_buffer, v_buffer); + yv12_mb, scale_factor); } frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; frame_mv[ZEROMV][ref_frame].as_int = 0; @@ -4570,8 +5023,7 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } } - for (mode_index = 0; mode_index < MAX_MODES; - mode_index += (!switchable_filter_index)) { + for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) { int mode_excluded = 0; int64_t this_rd = INT64_MAX; int disable_skip = 0; @@ -4588,10 +5040,10 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // Test best rd so far against threshold for trying this mode. 
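Aside on the two intra superblock pickers just above: they now publish per-transform-mode RD deltas through the pick-mode context instead of leaving them unset. The bookkeeping reduces to the sketch below (the NB_TXFM_MODES value and the standalone function shape are assumptions for illustration, not code from this tree):

#include <string.h>
#include <stdint.h>

#define NB_TXFM_MODES 5  /* assumed size of the transform-mode enum */

/* Sketch: `err` is the RD cost of the transform size actually chosen,
 * txfm_cache[i] the cost had transform mode i been forced. For a fully
 * skippable block the deltas carry no information, so they are zeroed,
 * matching the memset() calls in the hunks above. */
static void record_txfm_rd_diff(int64_t txfm_rd_diff[NB_TXFM_MODES],
                                const int64_t txfm_cache[NB_TXFM_MODES],
                                int64_t err, int block_is_skippable) {
  int i;
  if (block_is_skippable) {
    memset(txfm_rd_diff, 0, NB_TXFM_MODES * sizeof(txfm_rd_diff[0]));
  } else {
    for (i = 0; i < NB_TXFM_MODES; i++)
      txfm_rd_diff[i] = err - txfm_cache[i];
  }
}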
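Also worth noting for the handle_inter_mode() hunks earlier in this diff: the new switchable-filter search does not re-encode the macroblock per candidate filter. It builds the prediction, measures the residual variance, and maps that to an estimated (rate, distortion) pair via model_rd_from_var_lapndz(), passing dequant[1] >> 3 because the transform coefficients carry an extra factor of 8. A condensed sketch of one candidate's score follows; the prototype and the RDCOST stand-in are approximations, the real definitions live in vp9_rdopt.c:

#include <stdint.h>

/* Approximate prototype of the helper defined in vp9/encoder/vp9_rdopt.c:
 * fits (rate, dist) to prediction-error variance under a Laplacian model. */
void model_rd_from_var_lapndz(unsigned int var, int n, int qstep,
                              int *rate, int *dist);

/* Hypothetical stand-in for the RDCOST macro: lambda-weighted rate plus
 * distortion in libvpx's usual fixed-point form. */
#define RDCOST(rm, dm, r, d) \
  (((128 + (int64_t)(r) * (rm)) >> 8) + ((int64_t)(d) << (dm)))

/* One filter candidate's score, mirroring the loop body in the diff:
 * rs is the rate of signalling the filter choice itself. */
static int64_t filter_candidate_rd(int rdmult, int rddiv, int rs,
                                   unsigned int var_y, int qstep_y,
                                   unsigned int var_u, unsigned int var_v,
                                   int qstep_uv) {
  int rate_y, dist_y, rate_u, dist_u, rate_v, dist_v;
  model_rd_from_var_lapndz(var_y, 16 * 16, qstep_y, &rate_y, &dist_y);
  model_rd_from_var_lapndz(var_u, 8 * 8, qstep_uv, &rate_u, &dist_u);
  model_rd_from_var_lapndz(var_v, 8 * 8, qstep_uv, &rate_v, &dist_v);
  return RDCOST(rdmult, rddiv, rs + rate_y + rate_u + rate_v,
                dist_y + dist_u + dist_v);
}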
if (best_rd <= cpi->rd_threshes[mode_index] || cpi->rd_threshes[mode_index] == INT_MAX) { - switchable_filter_index = 0; continue; } + x->skip = 0; this_mode = vp9_mode_order[mode_index].mode; ref_frame = vp9_mode_order[mode_index].ref_frame; if (!(ref_frame == INTRA_FRAME || @@ -4600,6 +5052,8 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } mbmi->ref_frame = ref_frame; mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame; + set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame, + scale_factor); comp_pred = mbmi->second_ref_frame > INTRA_FRAME; mbmi->mode = this_mode; mbmi->uv_mode = DC_PRED; @@ -4607,19 +5061,11 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, mbmi->interintra_mode = (MB_PREDICTION_MODE)(DC_PRED - 1); mbmi->interintra_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1); #endif + // Evaluate all sub-pel filters irrespective of whether we can use // them for this frame. - if (this_mode >= NEARESTMV && this_mode <= SPLITMV) { - mbmi->interp_filter = - vp9_switchable_interp[switchable_filter_index++]; - if (switchable_filter_index == VP9_SWITCHABLE_FILTERS) - switchable_filter_index = 0; - if ((cm->mcomp_filter_type != SWITCHABLE) && - (cm->mcomp_filter_type != mbmi->interp_filter)) { - mode_excluded = 1; - } - vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); - } + mbmi->interp_filter = cm->mcomp_filter_type; + vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); // if (!(cpi->ref_frame_flags & flag_list[ref_frame])) // continue; @@ -4640,10 +5086,10 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (!(cpi->ref_frame_flags & flag_list[second_ref])) continue; mbmi->second_ref_frame = second_ref; + set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame, + scale_factor); - xd->second_pre.y_buffer = y_buffer[second_ref]; - xd->second_pre.u_buffer = u_buffer[second_ref]; - xd->second_pre.v_buffer = v_buffer[second_ref]; + xd->second_pre = yv12_mb[second_ref]; mode_excluded = mode_excluded ? mode_excluded : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY; @@ -4661,9 +5107,7 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } } - xd->pre.y_buffer = y_buffer[ref_frame]; - xd->pre.u_buffer = u_buffer[ref_frame]; - xd->pre.v_buffer = v_buffer[ref_frame]; + xd->pre = yv12_mb[ref_frame]; vpx_memcpy(mdcounts, frame_mdcounts[ref_frame], sizeof(mdcounts)); // If the segment reference frame feature is enabled.... @@ -4671,16 +5115,15 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) && !vp9_check_segref(xd, segment_id, ref_frame)) { continue; - // If the segment mode feature is enabled.... + // If the segment skip feature is enabled.... // then do nothing if the current mode is not allowed.. - } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) && - (this_mode != vp9_get_segdata(xd, segment_id, SEG_LVL_MODE))) { + } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) && + (this_mode != ZEROMV)) { continue; - // Disable this drop out case if either the mode or ref frame + // Disable this drop out case if the ref frame // segment level feature is enabled for this segment. This is to // prevent the possibility that we end up unable to pick any mode. 
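Both mode loops now gate candidates on the reworked segment features: SEG_LVL_SKIP takes over from the old SEG_LVL_MODE and SEG_LVL_EOB data, and a skip-coded segment may only code ZEROMV. Hoisted out of the loop, the drop-out test reads roughly as follows; the wrapper function itself is illustrative, while the two predicates are the tree's own helpers:

#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_seg_common.h"

/* Illustrative consolidation of the segment drop-out tests above. */
static int mode_allowed_for_segment(const MACROBLOCKD *xd, int segment_id,
                                    MB_PREDICTION_MODE this_mode,
                                    int ref_frame) {
  /* When the segment pins the reference frame, other refs are skipped. */
  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
      !vp9_check_segref(xd, segment_id, ref_frame))
    return 0;
  /* A skip-coded segment admits only (0,0) motion. */
  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) &&
      this_mode != ZEROMV)
    return 0;
  return 1;
}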
- } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) && - !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) { + } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME)) { // Only consider ZEROMV/ALTREF_FRAME for alt ref frame, // unless ARNR filtering is enabled in which case we want // an unfiltered alternative @@ -4722,6 +5165,20 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, rate2 = rate_y + x->mbmode_cost[cm->frame_type][mbmi->mode] + rate_uv; distortion2 = distortion_y + distortion_uv; } else { + YV12_BUFFER_CONFIG *scaled_ref_frame = NULL; + int fb; + + if (mbmi->ref_frame == LAST_FRAME) { + fb = cpi->lst_fb_idx; + } else if (mbmi->ref_frame == GOLDEN_FRAME) { + fb = cpi->gld_fb_idx; + } else { + fb = cpi->alt_fb_idx; + } + + if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb]) + scaled_ref_frame = &cm->yv12_fb[cpi->scaled_ref_idx[fb]]; + #if CONFIG_COMP_INTERINTRA_PRED if (mbmi->second_ref_frame == INTRA_FRAME) { if (best_intra16_mode == DC_PRED - 1) continue; @@ -4742,8 +5199,9 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, #endif &rate_y, &distortion_y, &rate_uv, &distortion_uv, - &mode_excluded, &disable_skip, recon_yoffset, - mode_index, frame_mv); + &mode_excluded, &disable_skip, + mode_index, &tmp_best_filter, frame_mv, + scaled_ref_frame, mb_row, mb_col); if (this_rd == INT64_MAX) continue; } @@ -4769,10 +5227,8 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (cpi->common.mb_no_coeff_skip) { int mb_skip_allowed; - // Is Mb level skip allowed for this mb. - mb_skip_allowed = - !vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) || - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB); + // Is Mb level skip allowed (i.e. not coded at segment level). + mb_skip_allowed = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP); if (skippable) { // Back out the coefficient coding costs @@ -4832,7 +5288,7 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (this_rd < best_overall_rd) { best_overall_rd = this_rd; - best_filter = mbmi->interp_filter; + best_filter = tmp_best_filter; best_mode = this_mode; #if CONFIG_COMP_INTERINTRA_PRED is_best_interintra = (mbmi->second_ref_frame == INTRA_FRAME); @@ -4956,10 +5412,8 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // This code forces Altref,0,0 and skip for the frame that overlays a // an altref unless Altref is filtered. However, this is unsafe if - // segment level coding of ref frame or mode is enabled for this - // segment. + // segment level coding of ref frame is enabled for this segment. if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) && - !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) && cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0) && (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) { @@ -4971,7 +5425,7 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, mbmi->mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0; mbmi->partitioning = 0; mbmi->txfm_size = cm->txfm_mode == TX_MODE_SELECT ?
- TX_16X16 : cm->txfm_mode; + TX_32X32 : cm->txfm_mode; vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff)); vpx_memset(best_pred_diff, 0, sizeof(best_pred_diff)); @@ -4991,7 +5445,7 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (!x->skip) { for (i = 0; i < NB_TXFM_MODES; i++) { if (best_txfm_rd[i] == INT64_MAX) - best_txfm_diff[i] = INT_MIN; + best_txfm_diff[i] = 0; else best_txfm_diff[i] = best_rd - best_txfm_rd[i]; } @@ -5000,6 +5454,8 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } end: + set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame, + scale_factor); { PICK_MODE_CONTEXT *p = (block_size == BLOCK_32X32) ? &x->sb32_context[xd->sb_index] : @@ -5015,24 +5471,23 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } int64_t vp9_rd_pick_inter_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x, - int recon_yoffset, int recon_uvoffset, + int mb_row, int mb_col, int *returnrate, int *returndistortion) { - return vp9_rd_pick_inter_mode_sb(cpi, x, recon_yoffset, recon_uvoffset, + return vp9_rd_pick_inter_mode_sb(cpi, x, mb_row, mb_col, returnrate, returndistortion, BLOCK_32X32); } int64_t vp9_rd_pick_inter_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x, - int recon_yoffset, int recon_uvoffset, + int mb_row, int mb_col, int *returnrate, int *returndistortion) { - return vp9_rd_pick_inter_mode_sb(cpi, x, recon_yoffset, recon_uvoffset, + return vp9_rd_pick_inter_mode_sb(cpi, x, mb_row, mb_col, returnrate, returndistortion, BLOCK_64X64); } void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x, - int recon_yoffset, - int recon_uvoffset, + int mb_row, int mb_col, int *totalrate, int *totaldist) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi; @@ -5050,7 +5505,7 @@ void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x, { int zbin_mode_boost_enabled = cpi->zbin_mode_boost_enabled; - rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, + rd_pick_inter_mode(cpi, x, mb_row, mb_col, &rate, &distortion, &intra_error); /* restore cpi->zbin_mode_boost_enabled */ diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h index 8ee2c0bf9..01b156044 100644 --- a/vp9/encoder/vp9_rdopt.h +++ b/vp9/encoder/vp9_rdopt.h @@ -29,15 +29,15 @@ extern void vp9_rd_pick_intra_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x, int *r, int *d); extern void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x, - int ref_yoffset, int ref_uvoffset, + int mb_row, int mb_col, int *r, int *d); extern int64_t vp9_rd_pick_inter_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x, - int ref_yoffset, int ref_uvoffset, + int mb_row, int mb_col, int *r, int *d); extern int64_t vp9_rd_pick_inter_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x, - int ref_yoffset, int ref_uvoffset, + int mb_row, int mb_col, int *r, int *d); extern void vp9_init_me_luts(); diff --git a/vp9/encoder/vp9_sad_c.c b/vp9/encoder/vp9_sad_c.c index 84121f79c..dc21f02f6 100644 --- a/vp9/encoder/vp9_sad_c.c +++ b/vp9/encoder/vp9_sad_c.c @@ -13,12 +13,13 @@ #include "vp9/common/vp9_sadmxn.h" #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "./vp9_rtcd.h" unsigned int vp9_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - int max_sad) { + unsigned int max_sad) { return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 64, 64); } @@ -26,7 +27,7 @@ unsigned int vp9_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - int max_sad) { + 
unsigned int max_sad) { return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32); } @@ -34,7 +35,7 @@ unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - int max_sad) { + unsigned int max_sad) { return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16); } @@ -42,7 +43,7 @@ unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - int max_sad) { + unsigned int max_sad) { return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 8); } @@ -51,7 +52,7 @@ unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - int max_sad) { + unsigned int max_sad) { return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 8); } @@ -59,7 +60,7 @@ unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - int max_sad) { + unsigned int max_sad) { return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 16); } @@ -68,7 +69,7 @@ unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - int max_sad) { + unsigned int max_sad) { return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4); } @@ -77,12 +78,12 @@ void vp9_sad64x64x3_c(const uint8_t *src_ptr, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad64x64_c(src_ptr, src_stride, - ref_ptr, ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad64x64_c(src_ptr, src_stride, - ref_ptr + 1, ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad64x64_c(src_ptr, src_stride, - ref_ptr + 2, ref_stride, 0x7fffffff); + sad_array[0] = vp9_sad64x64(src_ptr, src_stride, ref_ptr, ref_stride, + 0x7fffffff); + sad_array[1] = vp9_sad64x64(src_ptr, src_stride, ref_ptr + 1, ref_stride, + 0x7fffffff); + sad_array[2] = vp9_sad64x64(src_ptr, src_stride, ref_ptr + 2, ref_stride, + 0x7fffffff); } void vp9_sad32x32x3_c(const uint8_t *src_ptr, @@ -90,74 +91,74 @@ void vp9_sad32x32x3_c(const uint8_t *src_ptr, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad32x32_c(src_ptr, src_stride, - ref_ptr, ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad32x32_c(src_ptr, src_stride, - ref_ptr + 1, ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad32x32_c(src_ptr, src_stride, - ref_ptr + 2, ref_stride, 0x7fffffff); + sad_array[0] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr, ref_stride, 0x7fffffff); + sad_array[1] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr + 1, ref_stride, 0x7fffffff); + sad_array[2] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr + 2, ref_stride, 0x7fffffff); } void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - uint16_t *sad_array) { - sad_array[0] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - 
sad_array[7] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); + unsigned int *sad_array) { + sad_array[0] = vp9_sad64x64(src_ptr, src_stride, + ref_ptr, ref_stride, + 0x7fffffff); + sad_array[1] = vp9_sad64x64(src_ptr, src_stride, + ref_ptr + 1, ref_stride, + 0x7fffffff); + sad_array[2] = vp9_sad64x64(src_ptr, src_stride, + ref_ptr + 2, ref_stride, + 0x7fffffff); + sad_array[3] = vp9_sad64x64(src_ptr, src_stride, + ref_ptr + 3, ref_stride, + 0x7fffffff); + sad_array[4] = vp9_sad64x64(src_ptr, src_stride, + ref_ptr + 4, ref_stride, + 0x7fffffff); + sad_array[5] = vp9_sad64x64(src_ptr, src_stride, + ref_ptr + 5, ref_stride, + 0x7fffffff); + sad_array[6] = vp9_sad64x64(src_ptr, src_stride, + ref_ptr + 6, ref_stride, + 0x7fffffff); + sad_array[7] = vp9_sad64x64(src_ptr, src_stride, + ref_ptr + 7, ref_stride, + 0x7fffffff); } void vp9_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - uint16_t *sad_array) { - sad_array[0] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); + unsigned int *sad_array) { + sad_array[0] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr, ref_stride, + 0x7fffffff); + sad_array[1] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr + 1, ref_stride, + 0x7fffffff); + sad_array[2] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr + 2, ref_stride, + 0x7fffffff); + sad_array[3] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr + 3, ref_stride, + 0x7fffffff); + sad_array[4] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr + 4, ref_stride, + 0x7fffffff); + sad_array[5] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr + 5, ref_stride, + 0x7fffffff); + sad_array[6] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr + 6, ref_stride, + 0x7fffffff); + sad_array[7] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr + 7, ref_stride, + 0x7fffffff); } void vp9_sad16x16x3_c(const uint8_t *src_ptr, @@ -165,43 +166,43 @@ void vp9_sad16x16x3_c(const uint8_t *src_ptr, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad16x16_c(src_ptr, src_stride, - ref_ptr, ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad16x16_c(src_ptr, src_stride, - ref_ptr + 1, ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad16x16_c(src_ptr, src_stride, - ref_ptr + 2, ref_stride, 0x7fffffff); + sad_array[0] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr, ref_stride, 0x7fffffff); + sad_array[1] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr + 1, ref_stride, 0x7fffffff); + sad_array[2] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr + 2, ref_stride, 0x7fffffff); } void vp9_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - uint16_t *sad_array) { - sad_array[0] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride, - ref_ptr, ref_stride, - 
0x7fffffff); - sad_array[1] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); + uint32_t *sad_array) { + sad_array[0] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr, ref_stride, + 0x7fffffff); + sad_array[1] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr + 1, ref_stride, + 0x7fffffff); + sad_array[2] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr + 2, ref_stride, + 0x7fffffff); + sad_array[3] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr + 3, ref_stride, + 0x7fffffff); + sad_array[4] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr + 4, ref_stride, + 0x7fffffff); + sad_array[5] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr + 5, ref_stride, + 0x7fffffff); + sad_array[6] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr + 6, ref_stride, + 0x7fffffff); + sad_array[7] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr + 7, ref_stride, + 0x7fffffff); } void vp9_sad16x8x3_c(const uint8_t *src_ptr, @@ -209,43 +210,43 @@ void vp9_sad16x8x3_c(const uint8_t *src_ptr, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad16x8_c(src_ptr, src_stride, - ref_ptr, ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad16x8_c(src_ptr, src_stride, - ref_ptr + 1, ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad16x8_c(src_ptr, src_stride, - ref_ptr + 2, ref_stride, 0x7fffffff); + sad_array[0] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr, ref_stride, 0x7fffffff); + sad_array[1] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr + 1, ref_stride, 0x7fffffff); + sad_array[2] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr + 2, ref_stride, 0x7fffffff); } void vp9_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - uint16_t *sad_array) { - sad_array[0] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); + uint32_t *sad_array) { + sad_array[0] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr, ref_stride, + 0x7fffffff); + sad_array[1] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr + 1, ref_stride, + 0x7fffffff); + sad_array[2] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr + 2, ref_stride, + 0x7fffffff); + sad_array[3] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr + 3, 
ref_stride, + 0x7fffffff); + sad_array[4] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr + 4, ref_stride, + 0x7fffffff); + sad_array[5] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr + 5, ref_stride, + 0x7fffffff); + sad_array[6] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr + 6, ref_stride, + 0x7fffffff); + sad_array[7] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr + 7, ref_stride, + 0x7fffffff); } void vp9_sad8x8x3_c(const uint8_t *src_ptr, @@ -253,43 +254,43 @@ void vp9_sad8x8x3_c(const uint8_t *src_ptr, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad8x8_c(src_ptr, src_stride, - ref_ptr, ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad8x8_c(src_ptr, src_stride, - ref_ptr + 1, ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad8x8_c(src_ptr, src_stride, - ref_ptr + 2, ref_stride, 0x7fffffff); + sad_array[0] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr, ref_stride, 0x7fffffff); + sad_array[1] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr + 1, ref_stride, 0x7fffffff); + sad_array[2] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr + 2, ref_stride, 0x7fffffff); } void vp9_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - uint16_t *sad_array) { - sad_array[0] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); + uint32_t *sad_array) { + sad_array[0] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr, ref_stride, + 0x7fffffff); + sad_array[1] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr + 1, ref_stride, + 0x7fffffff); + sad_array[2] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr + 2, ref_stride, + 0x7fffffff); + sad_array[3] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr + 3, ref_stride, + 0x7fffffff); + sad_array[4] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr + 4, ref_stride, + 0x7fffffff); + sad_array[5] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr + 5, ref_stride, + 0x7fffffff); + sad_array[6] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr + 6, ref_stride, + 0x7fffffff); + sad_array[7] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr + 7, ref_stride, + 0x7fffffff); } void vp9_sad8x16x3_c(const uint8_t *src_ptr, @@ -297,43 +298,43 @@ void vp9_sad8x16x3_c(const uint8_t *src_ptr, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad8x16_c(src_ptr, src_stride, - ref_ptr, ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad8x16_c(src_ptr, src_stride, - ref_ptr + 1, ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad8x16_c(src_ptr, src_stride, - ref_ptr + 2, ref_stride, 0x7fffffff); + sad_array[0] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr, ref_stride, 0x7fffffff); + sad_array[1] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr + 1, ref_stride, 0x7fffffff); + sad_array[2] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr + 2, ref_stride, 0x7fffffff); } void vp9_sad8x16x8_c(const 
uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - uint16_t *sad_array) { - sad_array[0] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); + uint32_t *sad_array) { + sad_array[0] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr, ref_stride, + 0x7fffffff); + sad_array[1] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr + 1, ref_stride, + 0x7fffffff); + sad_array[2] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr + 2, ref_stride, + 0x7fffffff); + sad_array[3] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr + 3, ref_stride, + 0x7fffffff); + sad_array[4] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr + 4, ref_stride, + 0x7fffffff); + sad_array[5] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr + 5, ref_stride, + 0x7fffffff); + sad_array[6] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr + 6, ref_stride, + 0x7fffffff); + sad_array[7] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr + 7, ref_stride, + 0x7fffffff); } void vp9_sad4x4x3_c(const uint8_t *src_ptr, @@ -341,204 +342,147 @@ void vp9_sad4x4x3_c(const uint8_t *src_ptr, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr, ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr + 1, ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr + 2, ref_stride, 0x7fffffff); + sad_array[0] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr, ref_stride, 0x7fffffff); + sad_array[1] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr + 1, ref_stride, 0x7fffffff); + sad_array[2] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr + 2, ref_stride, 0x7fffffff); } void vp9_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - uint16_t *sad_array) { - sad_array[0] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); + uint32_t *sad_array) { + sad_array[0] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr, ref_stride, + 0x7fffffff); + sad_array[1] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr + 1, ref_stride, + 0x7fffffff); + 
sad_array[2] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr + 2, ref_stride, + 0x7fffffff); + sad_array[3] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr + 3, ref_stride, + 0x7fffffff); + sad_array[4] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr + 4, ref_stride, + 0x7fffffff); + sad_array[5] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr + 5, ref_stride, + 0x7fffffff); + sad_array[6] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr + 6, ref_stride, + 0x7fffffff); + sad_array[7] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr + 7, ref_stride, + 0x7fffffff); } void vp9_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, - uint8_t *ref_ptr[], + const uint8_t *ref_ptr[], int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad64x64_c(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad64x64_c(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad64x64_c(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad64x64_c(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); + sad_array[0] = vp9_sad64x64(src_ptr, src_stride, + ref_ptr[0], ref_stride, 0x7fffffff); + sad_array[1] = vp9_sad64x64(src_ptr, src_stride, + ref_ptr[1], ref_stride, 0x7fffffff); + sad_array[2] = vp9_sad64x64(src_ptr, src_stride, + ref_ptr[2], ref_stride, 0x7fffffff); + sad_array[3] = vp9_sad64x64(src_ptr, src_stride, + ref_ptr[3], ref_stride, 0x7fffffff); } void vp9_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, - uint8_t *ref_ptr[], + const uint8_t *ref_ptr[], int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad32x32_c(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad32x32_c(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad32x32_c(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad32x32_c(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); + sad_array[0] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr[0], ref_stride, 0x7fffffff); + sad_array[1] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr[1], ref_stride, 0x7fffffff); + sad_array[2] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr[2], ref_stride, 0x7fffffff); + sad_array[3] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr[3], ref_stride, 0x7fffffff); } void vp9_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, - uint8_t *ref_ptr[], + const uint8_t *ref_ptr[], int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad16x16_c(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad16x16_c(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad16x16_c(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad16x16_c(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); + sad_array[0] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr[0], ref_stride, 0x7fffffff); + sad_array[1] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr[1], ref_stride, 0x7fffffff); + sad_array[2] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr[2], ref_stride, 0x7fffffff); + sad_array[3] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr[3], ref_stride, 0x7fffffff); } void vp9_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, - uint8_t *ref_ptr[], + const uint8_t *ref_ptr[], int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad16x8_c(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad16x8_c(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - 
sad_array[2] = vp9_sad16x8_c(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad16x8_c(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); + sad_array[0] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr[0], ref_stride, 0x7fffffff); + sad_array[1] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr[1], ref_stride, 0x7fffffff); + sad_array[2] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr[2], ref_stride, 0x7fffffff); + sad_array[3] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr[3], ref_stride, 0x7fffffff); } void vp9_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, - uint8_t *ref_ptr[], + const uint8_t *ref_ptr[], int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad8x8_c(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad8x8_c(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad8x8_c(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad8x8_c(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); + sad_array[0] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr[0], ref_stride, 0x7fffffff); + sad_array[1] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr[1], ref_stride, 0x7fffffff); + sad_array[2] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr[2], ref_stride, 0x7fffffff); + sad_array[3] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr[3], ref_stride, 0x7fffffff); } void vp9_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, - uint8_t *ref_ptr[], + const uint8_t *ref_ptr[], int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad8x16_c(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad8x16_c(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad8x16_c(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad8x16_c(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); + sad_array[0] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr[0], ref_stride, 0x7fffffff); + sad_array[1] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr[1], ref_stride, 0x7fffffff); + sad_array[2] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr[2], ref_stride, 0x7fffffff); + sad_array[3] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr[3], ref_stride, 0x7fffffff); } void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, - uint8_t *ref_ptr[], + const uint8_t *ref_ptr[], int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); + sad_array[0] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr[0], ref_stride, 0x7fffffff); + sad_array[1] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr[1], ref_stride, 0x7fffffff); + sad_array[2] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr[2], ref_stride, 0x7fffffff); + sad_array[3] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr[3], ref_stride, 0x7fffffff); } -/* Copy 2 macroblocks to a buffer */ -void vp9_copy32xn_c(uint8_t *src_ptr, - int src_stride, - uint8_t *dst_ptr, - int dst_stride, - int height) { - int r; - - for (r = 0; r < height; r++) { -#if !(CONFIG_FAST_UNALIGNED) - dst_ptr[0] = src_ptr[0]; - dst_ptr[1] = src_ptr[1]; - dst_ptr[2] = src_ptr[2]; - dst_ptr[3] = src_ptr[3]; - dst_ptr[4] = src_ptr[4]; - dst_ptr[5] = src_ptr[5]; - dst_ptr[6] = 
src_ptr[6]; - dst_ptr[7] = src_ptr[7]; - dst_ptr[8] = src_ptr[8]; - dst_ptr[9] = src_ptr[9]; - dst_ptr[10] = src_ptr[10]; - dst_ptr[11] = src_ptr[11]; - dst_ptr[12] = src_ptr[12]; - dst_ptr[13] = src_ptr[13]; - dst_ptr[14] = src_ptr[14]; - dst_ptr[15] = src_ptr[15]; - dst_ptr[16] = src_ptr[16]; - dst_ptr[17] = src_ptr[17]; - dst_ptr[18] = src_ptr[18]; - dst_ptr[19] = src_ptr[19]; - dst_ptr[20] = src_ptr[20]; - dst_ptr[21] = src_ptr[21]; - dst_ptr[22] = src_ptr[22]; - dst_ptr[23] = src_ptr[23]; - dst_ptr[24] = src_ptr[24]; - dst_ptr[25] = src_ptr[25]; - dst_ptr[26] = src_ptr[26]; - dst_ptr[27] = src_ptr[27]; - dst_ptr[28] = src_ptr[28]; - dst_ptr[29] = src_ptr[29]; - dst_ptr[30] = src_ptr[30]; - dst_ptr[31] = src_ptr[31]; -#else - ((uint32_t *)dst_ptr)[0] = ((uint32_t *)src_ptr)[0]; - ((uint32_t *)dst_ptr)[1] = ((uint32_t *)src_ptr)[1]; - ((uint32_t *)dst_ptr)[2] = ((uint32_t *)src_ptr)[2]; - ((uint32_t *)dst_ptr)[3] = ((uint32_t *)src_ptr)[3]; - ((uint32_t *)dst_ptr)[4] = ((uint32_t *)src_ptr)[4]; - ((uint32_t *)dst_ptr)[5] = ((uint32_t *)src_ptr)[5]; - ((uint32_t *)dst_ptr)[6] = ((uint32_t *)src_ptr)[6]; - ((uint32_t *)dst_ptr)[7] = ((uint32_t *)src_ptr)[7]; -#endif - src_ptr += src_stride; - dst_ptr += dst_stride; - - } -} diff --git a/vp9/encoder/vp9_satd_c.c b/vp9/encoder/vp9_satd_c.c deleted file mode 100644 index 212c2243d..000000000 --- a/vp9/encoder/vp9_satd_c.c +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
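
For reference, the vp9_copy32xn_c helper deleted above (its vp9_copy32xn_fn_t vtable slot is removed from vp9_variance.h further down) reduces to a row-wise 32-byte copy. A memcpy-based equivalent sketch, which also sidesteps the aligned/unaligned split the CONFIG_FAST_UNALIGNED branch handled with uint32_t stores:

#include <string.h>
#include <stdint.h>

/* Behavior-matching sketch of the removed vp9_copy32xn_c: copy a
   32-pixel-wide, height-row block between strided buffers. */
static void copy32xn_sketch(const uint8_t *src, int src_stride,
                            uint8_t *dst, int dst_stride, int height) {
  int r;
  for (r = 0; r < height; ++r) {
    memcpy(dst, src, 32);
    src += src_stride;
    dst += dst_stride;
  }
}
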
- */ - -#include <stdlib.h> -#include "vpx_ports/mem.h" -#include "./vp9_rtcd.h" - -unsigned int vp9_satd16x16_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int *psatd) { - int r, c, i; - unsigned int satd = 0; - DECLARE_ALIGNED(16, int16_t, diff_in[256]); - DECLARE_ALIGNED(16, int16_t, diff_out[16]); - int16_t *in; - - for (r = 0; r < 16; r++) { - for (c = 0; c < 16; c++) { - diff_in[r * 16 + c] = src_ptr[c] - ref_ptr[c]; - } - src_ptr += src_stride; - ref_ptr += ref_stride; - } - - in = diff_in; - for (r = 0; r < 16; r += 4) { - for (c = 0; c < 16; c += 4) { - vp9_short_walsh4x4_c(in + c, diff_out, 32); - for (i = 0; i < 16; i++) - satd += abs(diff_out[i]); - } - in += 64; - } - - if (psatd) - *psatd = satd; - - return satd; -} diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c index 49195e80c..cfaf5f592 100644 --- a/vp9/encoder/vp9_segmentation.c +++ b/vp9/encoder/vp9_segmentation.c @@ -9,10 +9,11 @@ */ -#include "limits.h" +#include <limits.h> #include "vpx_mem/vpx_mem.h" #include "vp9/encoder/vp9_segmentation.h" #include "vp9/common/vp9_pred_common.h" +#include "vp9/common/vp9_tile_common.h" void vp9_update_gf_useage_maps(VP9_COMP *cpi, VP9_COMMON *cm, MACROBLOCK *x) { int mb_row, mb_col; @@ -21,7 +22,7 @@ void vp9_update_gf_useage_maps(VP9_COMP *cpi, VP9_COMMON *cm, MACROBLOCK *x) { x->gf_active_ptr = (signed char *)cpi->gf_active_flags; - if ((cm->frame_type == KEY_FRAME) || (cm->refresh_golden_frame)) { + if ((cm->frame_type == KEY_FRAME) || (cpi->refresh_golden_frame)) { // Reset Gf useage monitors vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols)); cpi->gf_active_count = cm->mb_rows * cm->mb_cols; @@ -143,11 +144,74 @@ static int cost_segmap(MACROBLOCKD *xd, return cost; } +// Based on set of segment counts calculate a probability tree +static void calc_segtree_probs_pred(MACROBLOCKD *xd, + int (*segcounts)[MAX_MB_SEGMENTS], + vp9_prob *segment_tree_probs, + vp9_prob *mod_probs) { + int count[4]; + + assert(!segcounts[0][0] && !segcounts[1][1] && + !segcounts[2][2] && !segcounts[3][3]); + + // Total count for all segments + count[0] = segcounts[3][0] + segcounts[1][0] + segcounts[2][0]; + count[1] = segcounts[2][1] + segcounts[0][1] + segcounts[3][1]; + count[2] = segcounts[0][2] + segcounts[3][2] + segcounts[1][2]; + count[3] = segcounts[1][3] + segcounts[2][3] + segcounts[0][3]; + + // Work out probabilities of each segment + segment_tree_probs[0] = get_binary_prob(count[0] + count[1], + count[2] + count[3]); + segment_tree_probs[1] = get_binary_prob(count[0], count[1]); + segment_tree_probs[2] = get_binary_prob(count[2], count[3]); + + // now work out modified counts that the decoder would have + count[0] = segment_tree_probs[0] * segment_tree_probs[1]; + count[1] = segment_tree_probs[0] * (256 - segment_tree_probs[1]); + count[2] = (256 - segment_tree_probs[0]) * segment_tree_probs[2]; + count[3] = (256 - segment_tree_probs[0]) * (256 - segment_tree_probs[2]); + + // Work out modified probabilties depending on what segment was predicted + mod_probs[0] = get_binary_prob(count[1], count[2] + count[3]); + mod_probs[1] = get_binary_prob(count[0], count[2] + count[3]); + mod_probs[2] = get_binary_prob(count[0] + count[1], count[3]); + mod_probs[3] = get_binary_prob(count[0] + count[1], count[2]); +} + +// Based on set of segment counts and probabilities calculate a cost estimate +static int cost_segmap_pred(MACROBLOCKD *xd, + int (*segcounts)[MAX_MB_SEGMENTS], + vp9_prob *probs, vp9_prob 
*mod_probs) { + int pred_seg, cost = 0; + + for (pred_seg = 0; pred_seg < MAX_MB_SEGMENTS; pred_seg++) { + int count1, count2; + + // Cost the top node of the tree + count1 = segcounts[pred_seg][0] + segcounts[pred_seg][1]; + count2 = segcounts[pred_seg][2] + segcounts[pred_seg][3]; + cost += count1 * vp9_cost_zero(mod_probs[pred_seg]) + + count2 * vp9_cost_one(mod_probs[pred_seg]); + + // Now add the cost of each individual segment branch + if (pred_seg >= 2 && count1) { + cost += segcounts[pred_seg][0] * vp9_cost_zero(probs[1]) + + segcounts[pred_seg][1] * vp9_cost_one(probs[1]); + } else if (pred_seg < 2 && count2 > 0) { + cost += segcounts[pred_seg][2] * vp9_cost_zero(probs[2]) + + segcounts[pred_seg][3] * vp9_cost_one(probs[2]); + } + } + + return cost; +} + static void count_segs(VP9_COMP *cpi, MODE_INFO *mi, int *no_pred_segcounts, int (*temporal_predictor_count)[2], - int *t_unpred_seg_counts, + int (*t_unpred_seg_counts)[MAX_MB_SEGMENTS], int mb_size, int mb_row, int mb_col) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->mb.e_mbd; @@ -166,8 +230,8 @@ static void count_segs(VP9_COMP *cpi, // Temporal prediction not allowed on key frames if (cm->frame_type != KEY_FRAME) { // Test to see if the segment id matches the predicted value. - const int seg_predicted = - (segment_id == vp9_get_pred_mb_segid(cm, xd, segmap_index)); + const int pred_seg_id = vp9_get_pred_mb_segid(cm, xd, segmap_index); + const int seg_predicted = (segment_id == pred_seg_id); // Get the segment id prediction context const int pred_context = vp9_get_pred_context(cm, xd, PRED_SEG_ID); @@ -179,7 +243,7 @@ static void count_segs(VP9_COMP *cpi, if (!seg_predicted) // Update the "unpredicted" segment count - t_unpred_seg_counts[segment_id]++; + t_unpred_seg_counts[pred_seg_id][segment_id]++; } } @@ -191,18 +255,19 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { int t_pred_cost = INT_MAX; int i; - int mb_row, mb_col; + int tile_col, mb_row, mb_col; int temporal_predictor_count[PREDICTION_PROBS][2]; int no_pred_segcounts[MAX_MB_SEGMENTS]; - int t_unpred_seg_counts[MAX_MB_SEGMENTS]; + int t_unpred_seg_counts[MAX_MB_SEGMENTS][MAX_MB_SEGMENTS]; vp9_prob no_pred_tree[MB_FEATURE_TREE_PROBS]; vp9_prob t_pred_tree[MB_FEATURE_TREE_PROBS]; + vp9_prob t_pred_tree_mod[MAX_MB_SEGMENTS]; vp9_prob t_nopred_prob[PREDICTION_PROBS]; const int mis = cm->mode_info_stride; - MODE_INFO *mi_ptr = cm->mi, *mi; + MODE_INFO *mi_ptr, *mi; // Set default state for the segment tree probabilities and the // temporal coding probabilities @@ -218,42 +283,49 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { // First of all generate stats regarding how well the last segment map // predicts this one - for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 4, mi_ptr += 4 * mis) { - mi = mi_ptr; - for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 4, mi += 4) { - if (mi->mbmi.sb_type == BLOCK_SIZE_SB64X64) { - count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count, - t_unpred_seg_counts, 4, mb_row, mb_col); - } else { - for (i = 0; i < 4; i++) { - int x_idx = (i & 1) << 1, y_idx = i & 2; - MODE_INFO *sb_mi = mi + y_idx * mis + x_idx; - - if (mb_col + x_idx >= cm->mb_cols || - mb_row + y_idx >= cm->mb_rows) { - continue; - } - - if (sb_mi->mbmi.sb_type) { - assert(sb_mi->mbmi.sb_type == BLOCK_SIZE_SB32X32); - count_segs(cpi, sb_mi, no_pred_segcounts, temporal_predictor_count, - t_unpred_seg_counts, 2, mb_row + y_idx, mb_col + x_idx); - } else { - int j; - - for (j = 0; j < 4; j++) { - const int x_idx_mb = x_idx + (j 
& 1), y_idx_mb = y_idx + (j >> 1); - MODE_INFO *mb_mi = mi + x_idx_mb + y_idx_mb * mis; + for (tile_col = 0; tile_col < cm->tile_columns; tile_col++) { + vp9_get_tile_col_offsets(cm, tile_col); + mi_ptr = cm->mi + cm->cur_tile_mb_col_start; + for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 4, mi_ptr += 4 * mis) { + mi = mi_ptr; + for (mb_col = cm->cur_tile_mb_col_start; + mb_col < cm->cur_tile_mb_col_end; mb_col += 4, mi += 4) { + if (mi->mbmi.sb_type == BLOCK_SIZE_SB64X64) { + count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count, + t_unpred_seg_counts, 4, mb_row, mb_col); + } else { + for (i = 0; i < 4; i++) { + int x_idx = (i & 1) << 1, y_idx = i & 2; + MODE_INFO *sb_mi = mi + y_idx * mis + x_idx; + + if (mb_col + x_idx >= cm->mb_cols || + mb_row + y_idx >= cm->mb_rows) { + continue; + } - if (mb_col + x_idx_mb >= cm->mb_cols || - mb_row + y_idx_mb >= cm->mb_rows) { - continue; + if (sb_mi->mbmi.sb_type) { + assert(sb_mi->mbmi.sb_type == BLOCK_SIZE_SB32X32); + count_segs(cpi, sb_mi, no_pred_segcounts, + temporal_predictor_count, t_unpred_seg_counts, 2, + mb_row + y_idx, mb_col + x_idx); + } else { + int j; + + for (j = 0; j < 4; j++) { + const int x_idx_mb = x_idx + (j & 1); + const int y_idx_mb = y_idx + (j >> 1); + MODE_INFO *mb_mi = mi + x_idx_mb + y_idx_mb * mis; + + if (mb_col + x_idx_mb >= cm->mb_cols || + mb_row + y_idx_mb >= cm->mb_rows) { + continue; + } + + assert(mb_mi->mbmi.sb_type == BLOCK_SIZE_MB16X16); + count_segs(cpi, mb_mi, no_pred_segcounts, + temporal_predictor_count, t_unpred_seg_counts, + 1, mb_row + y_idx_mb, mb_col + x_idx_mb); } - - assert(mb_mi->mbmi.sb_type == BLOCK_SIZE_MB16X16); - count_segs(cpi, mb_mi, no_pred_segcounts, - temporal_predictor_count, t_unpred_seg_counts, - 1, mb_row + y_idx_mb, mb_col + x_idx_mb); } } } @@ -270,8 +342,10 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { if (cm->frame_type != KEY_FRAME) { // Work out probability tree for coding those segments not // predicted using the temporal method and the cost. 
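
calc_segtree_probs_pred above collapses the per-predicted-segment counts into one total per coded segment, fits the three binary tree probabilities, then rebuilds the counts a decoder would infer from those quantized probabilities so that mod_probs reflect what the decoder can actually model. A minimal sketch of the probability fit; the rounding and clamping inside get_binary_prob are assumptions about its usual libvpx form:

#include <stdint.h>

typedef uint8_t vp9_prob_sketch;

/* P(taking the zero branch) scaled to 1..255; 128 when there is no data.
   The exact rounding/clamp is an assumption, not the verified API. */
static vp9_prob_sketch binary_prob_sketch(unsigned int n0, unsigned int n1) {
  const unsigned int den = n0 + n1;
  unsigned int p;
  if (den == 0)
    return 128;
  p = (256 * n0 + den / 2) / den;
  if (p < 1) p = 1;
  if (p > 255) p = 255;
  return (vp9_prob_sketch)p;
}

/* Tree over 4 segments: the root splits {0,1} from {2,3}; each child
   node then splits its pair, matching the three probs computed above. */
static void segtree_probs_sketch(const unsigned int count[4],
                                 vp9_prob_sketch probs[3]) {
  probs[0] = binary_prob_sketch(count[0] + count[1], count[2] + count[3]);
  probs[1] = binary_prob_sketch(count[0], count[1]);
  probs[2] = binary_prob_sketch(count[2], count[3]);
}
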
- calc_segtree_probs(xd, t_unpred_seg_counts, t_pred_tree); - t_pred_cost = cost_segmap(xd, t_unpred_seg_counts, t_pred_tree); + calc_segtree_probs_pred(xd, t_unpred_seg_counts, t_pred_tree, + t_pred_tree_mod); + t_pred_cost = cost_segmap_pred(xd, t_unpred_seg_counts, t_pred_tree, + t_pred_tree_mod); // Add in the cost of the signalling for each prediction context for (i = 0; i < PREDICTION_PROBS; i++) { @@ -291,6 +365,8 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { cm->temporal_update = 1; vpx_memcpy(xd->mb_segment_tree_probs, t_pred_tree, sizeof(t_pred_tree)); + vpx_memcpy(xd->mb_segment_mispred_tree_probs, + t_pred_tree_mod, sizeof(t_pred_tree_mod)); vpx_memcpy(&cm->segment_pred_probs, t_nopred_prob, sizeof(t_nopred_prob)); } else { diff --git a/vp9/encoder/vp9_segmentation.h b/vp9/encoder/vp9_segmentation.h index 3c75c68d8..1c90c2f2d 100644 --- a/vp9/encoder/vp9_segmentation.h +++ b/vp9/encoder/vp9_segmentation.h @@ -9,23 +9,20 @@ */ -#include "string.h" -#include "vp9/common/vp9_blockd.h" -#include "vp9/encoder/vp9_onyx_int.h" - #ifndef VP9_ENCODER_VP9_SEGMENTATION_H_ #define VP9_ENCODER_VP9_SEGMENTATION_H_ -extern void vp9_update_gf_useage_maps(VP9_COMP *cpi, VP9_COMMON *cm, - MACROBLOCK *x); +#include "vp9/common/vp9_blockd.h" +#include "vp9/encoder/vp9_onyx_int.h" + +void vp9_update_gf_useage_maps(VP9_COMP *cpi, VP9_COMMON *cm, MACROBLOCK *x); -extern void vp9_enable_segmentation(VP9_PTR ptr); -extern void vp9_disable_segmentation(VP9_PTR ptr); +void vp9_enable_segmentation(VP9_PTR ptr); +void vp9_disable_segmentation(VP9_PTR ptr); // Valid values for a segment are 0 to 3 // Segmentation map is arrange as [Rows][Columns] -extern void vp9_set_segmentation_map(VP9_PTR ptr, - unsigned char *segmentation_map); +void vp9_set_segmentation_map(VP9_PTR ptr, unsigned char *segmentation_map); // The values given for each segment can be either deltas (from the default // value chosen for the frame) or absolute values. @@ -37,10 +34,9 @@ extern void vp9_set_segmentation_map(VP9_PTR ptr, // // abs_delta = SEGMENT_DELTADATA (deltas) abs_delta = SEGMENT_ABSDATA (use // the absolute values given). -// -extern void vp9_set_segment_data(VP9_PTR ptr, signed char *feature_data, - unsigned char abs_delta); +void vp9_set_segment_data(VP9_PTR ptr, signed char *feature_data, + unsigned char abs_delta); -extern void vp9_choose_segmap_coding_method(VP9_COMP *cpi); +void vp9_choose_segmap_coding_method(VP9_COMP *cpi); #endif // VP9_ENCODER_VP9_SEGMENTATION_H_ diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index 8bbe53486..a6cd1c0c3 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -8,8 +8,11 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <math.h> +#include <limits.h> #include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_reconinter.h" #include "vp9/encoder/vp9_onyx_int.h" #include "vp9/common/vp9_systemdependent.h" #include "vp9/encoder/vp9_quantize.h" @@ -26,9 +29,6 @@ #include "vp9/common/vp9_swapyv12buffer.h" #include "vpx_ports/vpx_timer.h" -#include <math.h> -#include <limits.h> - #define ALT_REF_MC_ENABLED 1 // dis/enable MC in AltRef filtering #define ALT_REF_SUBPEL_ENABLED 1 // dis/enable subpel in MC AltRef filtering @@ -43,39 +43,35 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd, int mv_row, int mv_col, uint8_t *pred) { - int offset; - uint8_t *yptr, *uptr, *vptr; - int omv_row, omv_col; - - // Y - yptr = y_mb_ptr + (mv_row >> 3) * stride + (mv_col >> 3); + const int which_mv = 0; + int_mv subpel_mv; + int_mv fullpel_mv; + + subpel_mv.as_mv.row = mv_row; + subpel_mv.as_mv.col = mv_col; + // TODO(jkoleszar): Make this rounding consistent with the rest of the code + fullpel_mv.as_mv.row = (mv_row >> 1) & ~7; + fullpel_mv.as_mv.col = (mv_col >> 1) & ~7; + + vp9_build_inter_predictor(y_mb_ptr, stride, + &pred[0], 16, + &subpel_mv, + &xd->scale_factor[which_mv], + 16, 16, which_mv, &xd->subpix); - if ((mv_row | mv_col) & 7) { - xd->subpixel_predict16x16(yptr, stride, - (mv_col & 7) << 1, (mv_row & 7) << 1, &pred[0], 16); - } else { - vp9_copy_mem16x16(yptr, stride, &pred[0], 16); - } - - // U & V - omv_row = mv_row; - omv_col = mv_col; - mv_row >>= 1; - mv_col >>= 1; stride = (stride + 1) >> 1; - offset = (mv_row >> 3) * stride + (mv_col >> 3); - uptr = u_mb_ptr + offset; - vptr = v_mb_ptr + offset; - - if ((omv_row | omv_col) & 15) { - xd->subpixel_predict8x8(uptr, stride, - (omv_col & 15), (omv_row & 15), &pred[256], 8); - xd->subpixel_predict8x8(vptr, stride, - (omv_col & 15), (omv_row & 15), &pred[320], 8); - } else { - vp9_copy_mem8x8(uptr, stride, &pred[256], 8); - vp9_copy_mem8x8(vptr, stride, &pred[320], 8); - } + + vp9_build_inter_predictor_q4(u_mb_ptr, stride, + &pred[256], 8, + &fullpel_mv, &subpel_mv, + &xd->scale_factor_uv[which_mv], + 8, 8, which_mv, &xd->subpix); + + vp9_build_inter_predictor_q4(v_mb_ptr, stride, + &pred[320], 8, + &fullpel_mv, &subpel_mv, + &xd->scale_factor_uv[which_mv], + 8, 8, which_mv, &xd->subpix); } void vp9_temporal_filter_apply_c(uint8_t *frame1, @@ -170,7 +166,7 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, /*cpi->sf.search_method == HEX*/ // TODO Check that the 16x16 vf & sdf are selected here // Ignore mv costing by sending NULL pointer instead of cost arrays - bestsme = vp9_hex_search(x, b, d, &best_ref_mv1_full, &d->bmi.as_mv.first, + bestsme = vp9_hex_search(x, b, d, &best_ref_mv1_full, &d->bmi.as_mv[0], step_param, sadpb, &cpi->fn_ptr[BLOCK_16X16], NULL, NULL, NULL, NULL, &best_ref_mv1); @@ -182,7 +178,7 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, int distortion; unsigned int sse; // Ignore mv costing by sending NULL pointer instead of cost array - bestsme = cpi->find_fractional_mv_step(x, b, d, &d->bmi.as_mv.first, + bestsme = cpi->find_fractional_mv_step(x, b, d, &d->bmi.as_mv[0], &best_ref_mv1, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], @@ -262,8 +258,8 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, if (cpi->frames[frame] == NULL) continue; - mbd->block[0].bmi.as_mv.first.as_mv.row = 0; - mbd->block[0].bmi.as_mv.first.as_mv.col = 0; + mbd->block[0].bmi.as_mv[0].as_mv.row = 0; + mbd->block[0].bmi.as_mv[0].as_mv.col = 0; if (frame == alt_ref_index) { filter_weight = 2; @@ -296,8 +292,8 
@@ static void temporal_filter_iterate_c(VP9_COMP *cpi, cpi->frames[frame]->u_buffer + mb_uv_offset, cpi->frames[frame]->v_buffer + mb_uv_offset, cpi->frames[frame]->y_stride, - mbd->block[0].bmi.as_mv.first.as_mv.row, - mbd->block[0].bmi.as_mv.first.as_mv.col, + mbd->block[0].bmi.as_mv[0].as_mv.row, + mbd->block[0].bmi.as_mv[0].as_mv.col, predictor); // Apply the filter (YUV) @@ -375,11 +371,7 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, mbd->pre.v_buffer = v_buffer; } -void vp9_temporal_filter_prepare -( - VP9_COMP *cpi, - int distance -) { +void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) { int frame = 0; int num_frames_backward = 0; @@ -464,6 +456,13 @@ void vp9_temporal_filter_prepare , start_frame); #endif + // Setup scaling factors. Scaling on each of the arnr frames is not supported + vp9_setup_scale_factors_for_frame(&cpi->mb.e_mbd.scale_factor[0], + &cpi->common.yv12_fb[cpi->common.new_fb_idx], + 16 * cpi->common.mb_cols, + 16 * cpi->common.mb_rows); + cpi->mb.e_mbd.scale_factor_uv[0] = cpi->mb.e_mbd.scale_factor[0]; + // Setup frame pointers, NULL indicates frame not included in filter vpx_memset(cpi->frames, 0, max_frames * sizeof(YV12_BUFFER_CONFIG *)); for (frame = 0; frame < frames_to_blur; frame++) { diff --git a/vp9/encoder/vp9_temporal_filter.h b/vp9/encoder/vp9_temporal_filter.h index 27fc35f82..f3ca8c616 100644 --- a/vp9/encoder/vp9_temporal_filter.h +++ b/vp9/encoder/vp9_temporal_filter.h @@ -11,6 +11,6 @@ #ifndef VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ #define VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ -extern void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance); +void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance); #endif // VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c index fc99311ae..95a2e1227 100644 --- a/vp9/encoder/vp9_tokenize.c +++ b/vp9/encoder/vp9_tokenize.c @@ -25,20 +25,14 @@ compressions, then generating vp9_context.c = initial stats. 
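
From this file on, the hybrid (ADST) statistics are folded into the plain coefficient tables, paid for with a leading reference-type dimension: intra versus inter, the `ref` index tokenize_b derives below from mbmi.ref_frame. An indexing sketch; only REF_TYPES = 2 is implied by the diff, the other dimension sizes are placeholders:

#include <stdint.h>

enum {
  REF_TYPES_SKETCH = 2,            /* 0 = intra, 1 = inter (implied) */
  COEF_BANDS_SKETCH = 6,           /* placeholder size */
  PREV_COEF_CONTEXTS_SKETCH = 6,   /* placeholder size */
  MAX_ENTROPY_TOKENS_SKETCH = 12   /* placeholder size */
};

/* Mirrors the reshaped vp9_coeff_accum in vp9_tokenize.h below; the
   block type remains the outer array dimension at each declaration. */
typedef int64_t coeff_accum_sketch[REF_TYPES_SKETCH][COEF_BANDS_SKETCH]
                                  [PREV_COEF_CONTEXTS_SKETCH]
                                  [MAX_ENTROPY_TOKENS_SKETCH];

/* The update pattern used during tokenization. */
static void bump_sketch(coeff_accum_sketch *counts, int type, int is_inter,
                        int band, int pt, int token) {
  ++counts[type][is_inter][band][pt][token];
}

The 32x32 tables keep their separate BLOCK_TYPES_32X32 dimension throughout this patch.
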
*/ #ifdef ENTROPY_STATS -vp9_coeff_accum context_counters_4x4[BLOCK_TYPES_4X4]; -vp9_coeff_accum hybrid_context_counters_4x4[BLOCK_TYPES_4X4]; -vp9_coeff_accum context_counters_8x8[BLOCK_TYPES_8X8]; -vp9_coeff_accum hybrid_context_counters_8x8[BLOCK_TYPES_8X8]; -vp9_coeff_accum context_counters_16x16[BLOCK_TYPES_16X16]; -vp9_coeff_accum hybrid_context_counters_16x16[BLOCK_TYPES_16X16]; +vp9_coeff_accum context_counters_4x4[BLOCK_TYPES]; +vp9_coeff_accum context_counters_8x8[BLOCK_TYPES]; +vp9_coeff_accum context_counters_16x16[BLOCK_TYPES]; vp9_coeff_accum context_counters_32x32[BLOCK_TYPES_32X32]; -extern vp9_coeff_stats tree_update_hist_4x4[BLOCK_TYPES_4X4]; -extern vp9_coeff_stats hybrid_tree_update_hist_4x4[BLOCK_TYPES_4X4]; -extern vp9_coeff_stats tree_update_hist_8x8[BLOCK_TYPES_8X8]; -extern vp9_coeff_stats hybrid_tree_update_hist_8x8[BLOCK_TYPES_8X8]; -extern vp9_coeff_stats tree_update_hist_16x16[BLOCK_TYPES_16X16]; -extern vp9_coeff_stats hybrid_tree_update_hist_16x16[BLOCK_TYPES_16X16]; +extern vp9_coeff_stats tree_update_hist_4x4[BLOCK_TYPES]; +extern vp9_coeff_stats tree_update_hist_8x8[BLOCK_TYPES]; +extern vp9_coeff_stats tree_update_hist_16x16[BLOCK_TYPES]; extern vp9_coeff_stats tree_update_hist_32x32[BLOCK_TYPES_32X32]; #endif /* ENTROPY_STATS */ @@ -100,12 +94,6 @@ static void fill_value_tokens() { vp9_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE; } -#if CONFIG_NEWCOEFCONTEXT -#define PT pn -#else -#define PT pt -#endif - static void tokenize_b(VP9_COMP *cpi, MACROBLOCKD *xd, const int ib, @@ -114,22 +102,20 @@ static void tokenize_b(VP9_COMP *cpi, TX_SIZE tx_size, int dry_run) { int pt; /* near block/prev token context index */ - int c = (type == PLANE_TYPE_Y_NO_DC) ? 1 : 0; + int c = 0; + int recent_energy = 0; const BLOCKD * const b = xd->block + ib; - const int eob = b->eob; /* one beyond last nonzero coeff */ + const int eob = xd->eobs[ib]; /* one beyond last nonzero coeff */ TOKENEXTRA *t = *tp; /* store tokens starting here */ int16_t *qcoeff_ptr = b->qcoeff; int seg_eob; const int segment_id = xd->mode_info_context->mbmi.segment_id; - const int *bands, *scan; + const int *scan; vp9_coeff_count *counts; vp9_coeff_probs *probs; const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? 
get_tx_type(xd, b) : DCT_DCT; -#if CONFIG_NEWCOEFCONTEXT - const int *neighbors; - int pn; -#endif + const int ref = xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME; ENTROPY_CONTEXT *const a = (ENTROPY_CONTEXT *)xd->above_context + vp9_block2above[tx_size][ib]; @@ -147,45 +133,26 @@ static void tokenize_b(VP9_COMP *cpi, default: case TX_4X4: seg_eob = 16; - bands = vp9_coef_bands_4x4; scan = vp9_default_zig_zag1d_4x4; if (tx_type != DCT_DCT) { - counts = cpi->hybrid_coef_counts_4x4; - probs = cpi->common.fc.hybrid_coef_probs_4x4; if (tx_type == ADST_DCT) { scan = vp9_row_scan_4x4; } else if (tx_type == DCT_ADST) { scan = vp9_col_scan_4x4; } - } else { - counts = cpi->coef_counts_4x4; - probs = cpi->common.fc.coef_probs_4x4; } + counts = cpi->coef_counts_4x4; + probs = cpi->common.fc.coef_probs_4x4; break; case TX_8X8: - if (type == PLANE_TYPE_Y2) { - seg_eob = 4; - bands = vp9_coef_bands_4x4; - scan = vp9_default_zig_zag1d_4x4; - } else { -#if CONFIG_CNVCONTEXT - a_ec = (a[0] + a[1]) != 0; - l_ec = (l[0] + l[1]) != 0; -#endif - seg_eob = 64; - bands = vp9_coef_bands_8x8; - scan = vp9_default_zig_zag1d_8x8; - } - if (tx_type != DCT_DCT) { - counts = cpi->hybrid_coef_counts_8x8; - probs = cpi->common.fc.hybrid_coef_probs_8x8; - } else { - counts = cpi->coef_counts_8x8; - probs = cpi->common.fc.coef_probs_8x8; - } + a_ec = (a[0] + a[1]) != 0; + l_ec = (l[0] + l[1]) != 0; + seg_eob = 64; + scan = vp9_default_zig_zag1d_8x8; + counts = cpi->coef_counts_8x8; + probs = cpi->common.fc.coef_probs_8x8; break; case TX_16X16: -#if CONFIG_CNVCONTEXT if (type != PLANE_TYPE_UV) { a_ec = (a[0] + a[1] + a[2] + a[3]) != 0; l_ec = (l[0] + l[1] + l[2] + l[3]) != 0; @@ -193,33 +160,23 @@ static void tokenize_b(VP9_COMP *cpi, a_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0; l_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0; } -#endif seg_eob = 256; - bands = vp9_coef_bands_16x16; scan = vp9_default_zig_zag1d_16x16; - if (tx_type != DCT_DCT) { - counts = cpi->hybrid_coef_counts_16x16; - probs = cpi->common.fc.hybrid_coef_probs_16x16; - } else { - counts = cpi->coef_counts_16x16; - probs = cpi->common.fc.coef_probs_16x16; - } + counts = cpi->coef_counts_16x16; + probs = cpi->common.fc.coef_probs_16x16; if (type == PLANE_TYPE_UV) { int uv_idx = (ib - 16) >> 2; qcoeff_ptr = xd->sb_coeff_data.qcoeff + 1024 + 256 * uv_idx; } break; case TX_32X32: -#if CONFIG_CNVCONTEXT a_ec = a[0] + a[1] + a[2] + a[3] + a1[0] + a1[1] + a1[2] + a1[3]; l_ec = l[0] + l[1] + l[2] + l[3] + l1[0] + l1[1] + l1[2] + l1[3]; a_ec = a_ec != 0; l_ec = l_ec != 0; -#endif seg_eob = 1024; - bands = vp9_coef_bands_32x32; scan = vp9_default_zig_zag1d_32x32; counts = cpi->coef_counts_32x32; probs = cpi->common.fc.coef_probs_32x32; @@ -228,16 +185,12 @@ static void tokenize_b(VP9_COMP *cpi, } VP9_COMBINEENTROPYCONTEXTS(pt, a_ec, l_ec); -#if CONFIG_NEWCOEFCONTEXT - neighbors = vp9_get_coef_neighbors_handle(scan); - pn = pt; -#endif - if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) - seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB); + if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) + seg_eob = 0; do { - const int band = bands[c]; + const int band = get_coef_band(tx_size, c); int token; if (c < eob) { @@ -252,30 +205,23 @@ static void tokenize_b(VP9_COMP *cpi, } t->Token = token; - t->context_tree = probs[type][band][PT]; - t->skip_eob_node = (pt == 0) && ((band > 0 && type != PLANE_TYPE_Y_NO_DC) || - (band > 1 && type == PLANE_TYPE_Y_NO_DC)); + t->context_tree = probs[type][ref][band][pt]; + t->skip_eob_node = (pt == 0) && (band > 0); 
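
The token stored here comes from the precomputed value-to-token table that fill_value_tokens builds (vp9_dct_value_tokens_ptr): coefficient magnitudes map to literal tokens 0 through 4 plus escape categories that carry extra bits. A classification sketch; the category boundaries follow the VP8-derived alphabet and should be read as assumptions:

#include <stdlib.h>

enum token_sketch {
  ZERO_T, ONE_T, TWO_T, THREE_T, FOUR_T,
  CAT1_T, CAT2_T, CAT3_T, CAT4_T, CAT5_T, CAT6_T, EOB_T
};

/* Magnitude to token class; the sign and the category's extra bits are
   coded separately. Boundary values are assumed, not verified here. */
static enum token_sketch value_to_token_sketch(int v) {
  const int a = abs(v);
  if (a <= 4)  return (enum token_sketch)a;  /* literal tokens 0..4 */
  if (a <= 6)  return CAT1_T;                /* 1 extra bit */
  if (a <= 10) return CAT2_T;                /* 2 extra bits */
  if (a <= 18) return CAT3_T;                /* 3 extra bits */
  if (a <= 34) return CAT4_T;                /* 4 extra bits */
  if (a <= 66) return CAT5_T;                /* 5 extra bits */
  return CAT6_T;                             /* everything larger */
}
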
assert(vp9_coef_encodings[t->Token].Len - t->skip_eob_node > 0); if (!dry_run) { - ++counts[type][band][PT][token]; + ++counts[type][ref][band][pt][token]; } - pt = vp9_prev_token_class[token]; -#if CONFIG_NEWCOEFCONTEXT - if (c < seg_eob - 1 && NEWCOEFCONTEXT_BAND_COND(bands[c + 1])) - pn = vp9_get_coef_neighbor_context( - qcoeff_ptr, (type == PLANE_TYPE_Y_NO_DC), neighbors, scan[c + 1]); - else - pn = pt; -#endif + + pt = vp9_get_coef_context(&recent_energy, token); ++t; } while (c < eob && ++c < seg_eob); *tp = t; - a_ec = l_ec = (c > !type); /* 0 <-> all coeff data is zero */ + a_ec = l_ec = (c > 0); /* 0 <-> all coeff data is zero */ a[0] = a_ec; l[0] = l_ec; - if (tx_size == TX_8X8 && type != PLANE_TYPE_Y2) { + if (tx_size == TX_8X8) { a[1] = a_ec; l[1] = l_ec; } else if (tx_size == TX_16X16) { @@ -294,18 +240,13 @@ static void tokenize_b(VP9_COMP *cpi, } } -int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd, int has_2nd_order) { +int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd) { int skip = 1; int i = 0; - if (has_2nd_order) { - for (i = 0; i < 16; i++) - skip &= (xd->block[i].eob < 2); - skip &= (!xd->block[24].eob); - } else { - for (i = 0; i < 16; i++) - skip &= (!xd->block[i].eob); - } + for (i = 0; i < 16; i++) + skip &= (!xd->eobs[i]); + return skip; } @@ -314,47 +255,42 @@ int vp9_mbuv_is_skippable_4x4(MACROBLOCKD *xd) { int i; for (i = 16; i < 24; i++) - skip &= (!xd->block[i].eob); + skip &= (!xd->eobs[i]); return skip; } -static int mb_is_skippable_4x4(MACROBLOCKD *xd, int has_2nd_order) { - return (vp9_mby_is_skippable_4x4(xd, has_2nd_order) & +static int mb_is_skippable_4x4(MACROBLOCKD *xd) { + return (vp9_mby_is_skippable_4x4(xd) & vp9_mbuv_is_skippable_4x4(xd)); } -int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd, int has_2nd_order) { +int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd) { int skip = 1; int i = 0; - if (has_2nd_order) { - for (i = 0; i < 16; i += 4) - skip &= (xd->block[i].eob < 2); - skip &= (!xd->block[24].eob); - } else { - for (i = 0; i < 16; i += 4) - skip &= (!xd->block[i].eob); - } + for (i = 0; i < 16; i += 4) + skip &= (!xd->eobs[i]); + return skip; } int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd) { - return (!xd->block[16].eob) & (!xd->block[20].eob); + return (!xd->eobs[16]) & (!xd->eobs[20]); } -static int mb_is_skippable_8x8(MACROBLOCKD *xd, int has_2nd_order) { - return (vp9_mby_is_skippable_8x8(xd, has_2nd_order) & +static int mb_is_skippable_8x8(MACROBLOCKD *xd) { + return (vp9_mby_is_skippable_8x8(xd) & vp9_mbuv_is_skippable_8x8(xd)); } -static int mb_is_skippable_8x8_4x4uv(MACROBLOCKD *xd, int has_2nd_order) { - return (vp9_mby_is_skippable_8x8(xd, has_2nd_order) & +static int mb_is_skippable_8x8_4x4uv(MACROBLOCKD *xd) { + return (vp9_mby_is_skippable_8x8(xd) & vp9_mbuv_is_skippable_4x4(xd)); } int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd) { int skip = 1; - skip &= !xd->block[0].eob; + skip &= !xd->eobs[0]; return skip; } @@ -364,12 +300,12 @@ static int mb_is_skippable_16x16(MACROBLOCKD *xd) { int vp9_sby_is_skippable_32x32(MACROBLOCKD *xd) { int skip = 1; - skip &= !xd->block[0].eob; + skip &= !xd->eobs[0]; return skip; } int vp9_sbuv_is_skippable_16x16(MACROBLOCKD *xd) { - return (!xd->block[16].eob) & (!xd->block[20].eob); + return (!xd->eobs[16]) & (!xd->eobs[20]); } static int sb_is_skippable_32x32(MACROBLOCKD *xd) { @@ -384,14 +320,9 @@ void vp9_tokenize_sb(VP9_COMP *cpi, VP9_COMMON * const cm = &cpi->common; MB_MODE_INFO * const mbmi = &xd->mode_info_context->mbmi; TOKENEXTRA *t_backup = *t; - ENTROPY_CONTEXT *A[2] = { (ENTROPY_CONTEXT *) 
(xd->above_context + 0), - (ENTROPY_CONTEXT *) (xd->above_context + 1), }; - ENTROPY_CONTEXT *L[2] = { (ENTROPY_CONTEXT *) (xd->left_context + 0), - (ENTROPY_CONTEXT *) (xd->left_context + 1), }; const int mb_skip_context = vp9_get_pred_context(cm, xd, PRED_MBSKIP); const int segment_id = mbmi->segment_id; - const int skip_inc = !vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) || - (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0); + const int skip_inc = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP); int b; mbmi->mb_skip_coeff = sb_is_skippable_32x32(xd); @@ -419,7 +350,6 @@ void vp9_tokenize_sb(VP9_COMP *cpi, tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_16X16, dry_run); } - A[0][8] = L[0][8] = A[1][8] = L[1][8] = 0; if (dry_run) *t = t_backup; } @@ -428,8 +358,6 @@ void vp9_tokenize_mb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) { - PLANE_TYPE plane_type; - int has_2nd_order; int b; int tx_size = xd->mode_info_context->mbmi.txfm_size; int mb_skip_context = vp9_get_pred_context(&cpi->common, xd, PRED_MBSKIP); @@ -441,14 +369,11 @@ void vp9_tokenize_mb(VP9_COMP *cpi, int skip_inc; int segment_id = xd->mode_info_context->mbmi.segment_id; - if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) || - (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0)) { + if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) { skip_inc = 1; } else skip_inc = 0; - has_2nd_order = get_2nd_order_usage(xd); - switch (tx_size) { case TX_16X16: @@ -458,15 +383,15 @@ void vp9_tokenize_mb(VP9_COMP *cpi, if (xd->mode_info_context->mbmi.mode == I8X8_PRED || xd->mode_info_context->mbmi.mode == SPLITMV) xd->mode_info_context->mbmi.mb_skip_coeff = - mb_is_skippable_8x8_4x4uv(xd, 0); + mb_is_skippable_8x8_4x4uv(xd); else xd->mode_info_context->mbmi.mb_skip_coeff = - mb_is_skippable_8x8(xd, has_2nd_order); + mb_is_skippable_8x8(xd); break; default: xd->mode_info_context->mbmi.mb_skip_coeff = - mb_is_skippable_4x4(xd, has_2nd_order); + mb_is_skippable_4x4(xd); break; } @@ -487,15 +412,6 @@ void vp9_tokenize_mb(VP9_COMP *cpi, if (!dry_run) cpi->skip_false_count[mb_skip_context] += skip_inc; - if (has_2nd_order) { - tokenize_b(cpi, xd, 24, t, PLANE_TYPE_Y2, tx_size, dry_run); - plane_type = PLANE_TYPE_Y_NO_DC; - } else { - xd->above_context->y2 = 0; - xd->left_context->y2 = 0; - plane_type = PLANE_TYPE_Y_WITH_DC; - } - if (tx_size == TX_16X16) { tokenize_b(cpi, xd, 0, t, PLANE_TYPE_Y_WITH_DC, TX_16X16, dry_run); for (b = 16; b < 24; b += 4) { @@ -503,7 +419,7 @@ void vp9_tokenize_mb(VP9_COMP *cpi, } } else if (tx_size == TX_8X8) { for (b = 0; b < 16; b += 4) { - tokenize_b(cpi, xd, b, t, plane_type, TX_8X8, dry_run); + tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_8X8, dry_run); } if (xd->mode_info_context->mbmi.mode == I8X8_PRED || xd->mode_info_context->mbmi.mode == SPLITMV) { @@ -516,11 +432,10 @@ void vp9_tokenize_mb(VP9_COMP *cpi, } } } else { - for (b = 0; b < 24; b++) { - if (b >= 16) - plane_type = PLANE_TYPE_UV; - tokenize_b(cpi, xd, b, t, plane_type, TX_4X4, dry_run); - } + for (b = 0; b < 16; b++) + tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_4X4, dry_run); + for (b = 16; b < 24; b++) + tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run); } if (dry_run) *t = t_backup; @@ -531,25 +446,13 @@ void init_context_counters(void) { FILE *f = fopen("context.bin", "rb"); if (!f) { vpx_memset(context_counters_4x4, 0, sizeof(context_counters_4x4)); - vpx_memset(hybrid_context_counters_4x4, 0, - sizeof(hybrid_context_counters_4x4)); vpx_memset(context_counters_8x8, 0, 
sizeof(context_counters_8x8)); - vpx_memset(hybrid_context_counters_8x8, 0, - sizeof(hybrid_context_counters_8x8)); vpx_memset(context_counters_16x16, 0, sizeof(context_counters_16x16)); - vpx_memset(hybrid_context_counters_16x16, 0, - sizeof(hybrid_context_counters_16x16)); vpx_memset(context_counters_32x32, 0, sizeof(context_counters_32x32)); } else { fread(context_counters_4x4, sizeof(context_counters_4x4), 1, f); - fread(hybrid_context_counters_4x4, - sizeof(hybrid_context_counters_4x4), 1, f); fread(context_counters_8x8, sizeof(context_counters_8x8), 1, f); - fread(hybrid_context_counters_8x8, - sizeof(hybrid_context_counters_8x8), 1, f); fread(context_counters_16x16, sizeof(context_counters_16x16), 1, f); - fread(hybrid_context_counters_16x16, - sizeof(hybrid_context_counters_16x16), 1, f); fread(context_counters_32x32, sizeof(context_counters_32x32), 1, f); fclose(f); } @@ -557,25 +460,13 @@ void init_context_counters(void) { f = fopen("treeupdate.bin", "rb"); if (!f) { vpx_memset(tree_update_hist_4x4, 0, sizeof(tree_update_hist_4x4)); - vpx_memset(hybrid_tree_update_hist_4x4, 0, - sizeof(hybrid_tree_update_hist_4x4)); vpx_memset(tree_update_hist_8x8, 0, sizeof(tree_update_hist_8x8)); - vpx_memset(hybrid_tree_update_hist_8x8, 0, - sizeof(hybrid_tree_update_hist_8x8)); vpx_memset(tree_update_hist_16x16, 0, sizeof(tree_update_hist_16x16)); - vpx_memset(hybrid_tree_update_hist_16x16, 0, - sizeof(hybrid_tree_update_hist_16x16)); vpx_memset(tree_update_hist_32x32, 0, sizeof(tree_update_hist_32x32)); } else { fread(tree_update_hist_4x4, sizeof(tree_update_hist_4x4), 1, f); - fread(hybrid_tree_update_hist_4x4, - sizeof(hybrid_tree_update_hist_4x4), 1, f); fread(tree_update_hist_8x8, sizeof(tree_update_hist_8x8), 1, f); - fread(hybrid_tree_update_hist_8x8, - sizeof(hybrid_tree_update_hist_8x8), 1, f); fread(tree_update_hist_16x16, sizeof(tree_update_hist_16x16), 1, f); - fread(hybrid_tree_update_hist_16x16, - sizeof(hybrid_tree_update_hist_16x16), 1, f); fread(tree_update_hist_32x32, sizeof(tree_update_hist_32x32), 1, f); fclose(f); } @@ -583,33 +474,38 @@ void init_context_counters(void) { static void print_counter(FILE *f, vp9_coeff_accum *context_counters, int block_types, const char *header) { - int type, band, pt, t; + int type, ref, band, pt, t; fprintf(f, "static const vp9_coeff_count %s = {\n", header); #define Comma(X) (X ? "," : "") type = 0; do { + ref = 0; fprintf(f, "%s\n { /* block Type %d */", Comma(type), type); - band = 0; do { - fprintf(f, "%s\n { /* Coeff Band %d */", Comma(band), band); - pt = 0; + fprintf(f, "%s\n { /* %s */", Comma(type), ref ? 
"Inter" : "Intra"); + band = 0; do { - fprintf(f, "%s\n {", Comma(pt)); - - t = 0; + fprintf(f, "%s\n { /* Coeff Band %d */", Comma(band), band); + pt = 0; do { - const int64_t x = context_counters[type][band][pt][t]; - const int y = (int) x; - - assert(x == (int64_t) y); /* no overflow handling yet */ - fprintf(f, "%s %d", Comma(t), y); - } while (++t < MAX_ENTROPY_TOKENS); - fprintf(f, "}"); - } while (++pt < PREV_COEF_CONTEXTS); + fprintf(f, "%s\n {", Comma(pt)); + + t = 0; + do { + const int64_t x = context_counters[type][ref][band][pt][t]; + const int y = (int) x; + + assert(x == (int64_t) y); /* no overflow handling yet */ + fprintf(f, "%s %d", Comma(t), y); + } while (++t < MAX_ENTROPY_TOKENS); + fprintf(f, "}"); + } while (++pt < PREV_COEF_CONTEXTS); + fprintf(f, "\n }"); + } while (++band < COEF_BANDS); fprintf(f, "\n }"); - } while (++band < COEF_BANDS); + } while (++ref < REF_TYPES); fprintf(f, "\n }"); } while (++type < block_types); fprintf(f, "\n};\n"); @@ -617,7 +513,7 @@ static void print_counter(FILE *f, vp9_coeff_accum *context_counters, static void print_probs(FILE *f, vp9_coeff_accum *context_counters, int block_types, const char *header) { - int type, band, pt, t; + int type, ref, band, pt, t; fprintf(f, "static const vp9_coeff_probs %s = {", header); @@ -626,32 +522,38 @@ static void print_probs(FILE *f, vp9_coeff_accum *context_counters, do { fprintf(f, "%s%s{ /* block Type %d */", Comma(type), Newline(type, " "), type); - band = 0; + ref = 0; do { - fprintf(f, "%s%s{ /* Coeff Band %d */", - Comma(band), Newline(band, " "), band); - pt = 0; + fprintf(f, "%s%s{ /* %s */", + Comma(band), Newline(band, " "), ref ? "Inter" : "Intra"); + band = 0; do { - unsigned int branch_ct[ENTROPY_NODES][2]; - unsigned int coef_counts[MAX_ENTROPY_TOKENS]; - vp9_prob coef_probs[ENTROPY_NODES]; - - for (t = 0; t < MAX_ENTROPY_TOKENS; ++t) - coef_counts[t] = context_counters[type][band][pt][t]; - vp9_tree_probs_from_distribution(MAX_ENTROPY_TOKENS, - vp9_coef_encodings, vp9_coef_tree, - coef_probs, branch_ct, coef_counts); - fprintf(f, "%s\n {", Comma(pt)); - - t = 0; + fprintf(f, "%s%s{ /* Coeff Band %d */", + Comma(band), Newline(band, " "), band); + pt = 0; do { - fprintf(f, "%s %3d", Comma(t), coef_probs[t]); - } while (++t < ENTROPY_NODES); - - fprintf(f, " }"); - } while (++pt < PREV_COEF_CONTEXTS); + unsigned int branch_ct[ENTROPY_NODES][2]; + unsigned int coef_counts[MAX_ENTROPY_TOKENS]; + vp9_prob coef_probs[ENTROPY_NODES]; + + for (t = 0; t < MAX_ENTROPY_TOKENS; ++t) + coef_counts[t] = context_counters[type][ref][band][pt][t]; + vp9_tree_probs_from_distribution(MAX_ENTROPY_TOKENS, + vp9_coef_encodings, vp9_coef_tree, + coef_probs, branch_ct, coef_counts); + fprintf(f, "%s\n {", Comma(pt)); + + t = 0; + do { + fprintf(f, "%s %3d", Comma(t), coef_probs[t]); + } while (++t < ENTROPY_NODES); + + fprintf(f, " }"); + } while (++pt < PREV_COEF_CONTEXTS); + fprintf(f, "\n }"); + } while (++band < COEF_BANDS); fprintf(f, "\n }"); - } while (++band < COEF_BANDS); + } while (++ref < REF_TYPES); fprintf(f, "\n }"); } while (++type < block_types); fprintf(f, "\n};\n"); @@ -664,34 +566,22 @@ void print_context_counters() { fprintf(f, "\n/* *** GENERATED FILE: DO NOT EDIT *** */\n\n"); /* print counts */ - print_counter(f, context_counters_4x4, BLOCK_TYPES_4X4, + print_counter(f, context_counters_4x4, BLOCK_TYPES, "vp9_default_coef_counts_4x4[BLOCK_TYPES_4X4]"); - print_counter(f, hybrid_context_counters_4x4, BLOCK_TYPES_4X4, - "vp9_default_hybrid_coef_counts_4x4[BLOCK_TYPES_4X4]"); - 
print_counter(f, context_counters_8x8, BLOCK_TYPES_8X8, + print_counter(f, context_counters_8x8, BLOCK_TYPES, "vp9_default_coef_counts_8x8[BLOCK_TYPES_8X8]"); - print_counter(f, hybrid_context_counters_8x8, BLOCK_TYPES_8X8, - "vp9_default_hybrid_coef_counts_8x8[BLOCK_TYPES_8X8]"); - print_counter(f, context_counters_16x16, BLOCK_TYPES_16X16, + print_counter(f, context_counters_16x16, BLOCK_TYPES, "vp9_default_coef_counts_16x16[BLOCK_TYPES_16X16]"); - print_counter(f, hybrid_context_counters_16x16, BLOCK_TYPES_16X16, - "vp9_default_hybrid_coef_counts_16x16[BLOCK_TYPES_16X16]"); print_counter(f, context_counters_32x32, BLOCK_TYPES_32X32, "vp9_default_coef_counts_32x32[BLOCK_TYPES_32X32]"); /* print coefficient probabilities */ - print_probs(f, context_counters_4x4, BLOCK_TYPES_4X4, + print_probs(f, context_counters_4x4, BLOCK_TYPES, "default_coef_probs_4x4[BLOCK_TYPES_4X4]"); - print_probs(f, hybrid_context_counters_4x4, BLOCK_TYPES_4X4, - "default_hybrid_coef_probs_4x4[BLOCK_TYPES_4X4]"); - print_probs(f, context_counters_8x8, BLOCK_TYPES_8X8, + print_probs(f, context_counters_8x8, BLOCK_TYPES, "default_coef_probs_8x8[BLOCK_TYPES_8X8]"); - print_probs(f, hybrid_context_counters_8x8, BLOCK_TYPES_8X8, - "default_hybrid_coef_probs_8x8[BLOCK_TYPES_8X8]"); - print_probs(f, context_counters_16x16, BLOCK_TYPES_16X16, + print_probs(f, context_counters_16x16, BLOCK_TYPES, "default_coef_probs_16x16[BLOCK_TYPES_16X16]"); - print_probs(f, hybrid_context_counters_16x16, BLOCK_TYPES_16X16, - "default_hybrid_coef_probs_16x16[BLOCK_TYPES_16X16]"); print_probs(f, context_counters_32x32, BLOCK_TYPES_32X32, "default_coef_probs_32x32[BLOCK_TYPES_32X32]"); @@ -699,14 +589,8 @@ void print_context_counters() { f = fopen("context.bin", "wb"); fwrite(context_counters_4x4, sizeof(context_counters_4x4), 1, f); - fwrite(hybrid_context_counters_4x4, - sizeof(hybrid_context_counters_4x4), 1, f); fwrite(context_counters_8x8, sizeof(context_counters_8x8), 1, f); - fwrite(hybrid_context_counters_8x8, - sizeof(hybrid_context_counters_8x8), 1, f); fwrite(context_counters_16x16, sizeof(context_counters_16x16), 1, f); - fwrite(hybrid_context_counters_16x16, - sizeof(hybrid_context_counters_16x16), 1, f); fwrite(context_counters_32x32, sizeof(context_counters_32x32), 1, f); fclose(f); } @@ -716,21 +600,18 @@ void vp9_tokenize_initialize() { fill_value_tokens(); } -static __inline void stuff_b(VP9_COMP *cpi, - MACROBLOCKD *xd, - const int ib, - TOKENEXTRA **tp, - PLANE_TYPE type, - TX_SIZE tx_size, - int dry_run) { - const BLOCKD * const b = xd->block + ib; - const int *bands; +static INLINE void stuff_b(VP9_COMP *cpi, + MACROBLOCKD *xd, + const int ib, + TOKENEXTRA **tp, + PLANE_TYPE type, + TX_SIZE tx_size, + int dry_run) { vp9_coeff_count *counts; vp9_coeff_probs *probs; int pt, band; TOKENEXTRA *t = *tp; - const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? 
- get_tx_type(xd, b) : DCT_DCT; + const int ref = xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME; ENTROPY_CONTEXT *const a = (ENTROPY_CONTEXT *)xd->above_context + vp9_block2above[tx_size][ib]; ENTROPY_CONTEXT *const l = (ENTROPY_CONTEXT *)xd->left_context + @@ -744,33 +625,16 @@ static __inline void stuff_b(VP9_COMP *cpi, switch (tx_size) { default: case TX_4X4: - bands = vp9_coef_bands_4x4; - if (tx_type != DCT_DCT) { - counts = cpi->hybrid_coef_counts_4x4; - probs = cpi->common.fc.hybrid_coef_probs_4x4; - } else { - counts = cpi->coef_counts_4x4; - probs = cpi->common.fc.coef_probs_4x4; - } + counts = cpi->coef_counts_4x4; + probs = cpi->common.fc.coef_probs_4x4; break; case TX_8X8: -#if CONFIG_CNVCONTEXT - if (type != PLANE_TYPE_Y2) { - a_ec = (a[0] + a[1]) != 0; - l_ec = (l[0] + l[1]) != 0; - } -#endif - bands = vp9_coef_bands_8x8; - if (tx_type != DCT_DCT) { - counts = cpi->hybrid_coef_counts_8x8; - probs = cpi->common.fc.hybrid_coef_probs_8x8; - } else { - counts = cpi->coef_counts_8x8; - probs = cpi->common.fc.coef_probs_8x8; - } + a_ec = (a[0] + a[1]) != 0; + l_ec = (l[0] + l[1]) != 0; + counts = cpi->coef_counts_8x8; + probs = cpi->common.fc.coef_probs_8x8; break; case TX_16X16: -#if CONFIG_CNVCONTEXT if (type != PLANE_TYPE_UV) { a_ec = (a[0] + a[1] + a[2] + a[3]) != 0; l_ec = (l[0] + l[1] + l[2] + l[3]) != 0; @@ -778,26 +642,16 @@ static __inline void stuff_b(VP9_COMP *cpi, a_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0; l_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0; } -#endif - bands = vp9_coef_bands_16x16; - if (tx_type != DCT_DCT) { - counts = cpi->hybrid_coef_counts_16x16; - probs = cpi->common.fc.hybrid_coef_probs_16x16; - } else { - counts = cpi->coef_counts_16x16; - probs = cpi->common.fc.coef_probs_16x16; - } + counts = cpi->coef_counts_16x16; + probs = cpi->common.fc.coef_probs_16x16; break; case TX_32X32: -#if CONFIG_CNVCONTEXT a_ec = a[0] + a[1] + a[2] + a[3] + a1[0] + a1[1] + a1[2] + a1[3]; l_ec = l[0] + l[1] + l[2] + l[3] + l1[0] + l1[1] + l1[2] + l1[3]; a_ec = a_ec != 0; l_ec = l_ec != 0; -#endif - bands = vp9_coef_bands_32x32; counts = cpi->coef_counts_32x32; probs = cpi->common.fc.coef_probs_32x32; break; @@ -805,14 +659,14 @@ static __inline void stuff_b(VP9_COMP *cpi, VP9_COMBINEENTROPYCONTEXTS(pt, a_ec, l_ec); - band = bands[(type == PLANE_TYPE_Y_NO_DC) ? 
1 : 0]; + band = get_coef_band(tx_size, 0); t->Token = DCT_EOB_TOKEN; - t->context_tree = probs[type][band][pt]; + t->context_tree = probs[type][ref][band][pt]; t->skip_eob_node = 0; ++t; *tp = t; *a = *l = 0; - if (tx_size == TX_8X8 && type != PLANE_TYPE_Y2) { + if (tx_size == TX_8X8) { a[1] = 0; l[1] = 0; } else if (tx_size == TX_16X16) { @@ -831,32 +685,18 @@ static __inline void stuff_b(VP9_COMP *cpi, } if (!dry_run) { - ++counts[type][band][pt][DCT_EOB_TOKEN]; + ++counts[type][ref][band][pt][DCT_EOB_TOKEN]; } } static void stuff_mb_8x8(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) { - PLANE_TYPE plane_type; int b; - int has_2nd_order = get_2nd_order_usage(xd); - - if (has_2nd_order) { - stuff_b(cpi, xd, 24, t, PLANE_TYPE_Y2, TX_8X8, dry_run); - plane_type = PLANE_TYPE_Y_NO_DC; - } else { -#if CONFIG_CNVCONTEXT - xd->above_context->y2 = 0; - xd->left_context->y2 = 0; -#endif - plane_type = PLANE_TYPE_Y_WITH_DC; - } - for (b = 0; b < 24; b += 4) { - if (b >= 16) - plane_type = PLANE_TYPE_UV; - stuff_b(cpi, xd, b, t, plane_type, TX_8X8, dry_run); - } + for (b = 0; b < 16; b += 4) + stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_8X8, dry_run); + for (b = 16; b < 24; b += 4) + stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run); } static void stuff_mb_16x16(VP9_COMP *cpi, MACROBLOCKD *xd, @@ -867,56 +707,26 @@ static void stuff_mb_16x16(VP9_COMP *cpi, MACROBLOCKD *xd, for (b = 16; b < 24; b += 4) { stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run); } -#if CONFIG_CNVCONTEXT - xd->above_context->y2 = 0; - xd->left_context->y2 = 0; -#endif } static void stuff_mb_4x4(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) { int b; - PLANE_TYPE plane_type; - int has_2nd_order = get_2nd_order_usage(xd); - - if (has_2nd_order) { - stuff_b(cpi, xd, 24, t, PLANE_TYPE_Y2, TX_4X4, dry_run); - plane_type = PLANE_TYPE_Y_NO_DC; - } else { - xd->above_context->y2 = 0; - xd->left_context->y2 = 0; - plane_type = PLANE_TYPE_Y_WITH_DC; - } - for (b = 0; b < 24; b++) { - if (b >= 16) - plane_type = PLANE_TYPE_UV; - stuff_b(cpi, xd, b, t, plane_type, TX_4X4, dry_run); - } + for (b = 0; b < 16; b++) + stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_4X4, dry_run); + for (b = 16; b < 24; b++) + stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run); } static void stuff_mb_8x8_4x4uv(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) { - PLANE_TYPE plane_type; int b; - int has_2nd_order = get_2nd_order_usage(xd); - if (has_2nd_order) { - stuff_b(cpi, xd, 24, t, PLANE_TYPE_Y2, TX_8X8, dry_run); - plane_type = PLANE_TYPE_Y_NO_DC; - } else { - xd->above_context->y2 = 0; - xd->left_context->y2 = 0; - plane_type = PLANE_TYPE_Y_WITH_DC; - } - - for (b = 0; b < 16; b += 4) { - stuff_b(cpi, xd, b, t, plane_type, TX_8X8, dry_run); - } - - for (b = 16; b < 24; b++) { + for (b = 0; b < 16; b += 4) + stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_8X8, dry_run); + for (b = 16; b < 24; b++) stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run); - } } void vp9_stuff_mb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) { diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h index 3eeb8fa5a..6ac19ba71 100644 --- a/vp9/encoder/vp9_tokenize.h +++ b/vp9/encoder/vp9_tokenize.h @@ -28,42 +28,39 @@ typedef struct { uint8_t skip_eob_node; } TOKENEXTRA; -typedef int64_t vp9_coeff_accum[COEF_BANDS][PREV_COEF_CONTEXTS] +typedef int64_t vp9_coeff_accum[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; -extern int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd, int 
has_y2_block); -extern int vp9_mbuv_is_skippable_4x4(MACROBLOCKD *xd); -extern int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd, int has_y2_block); -extern int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd); -extern int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd); -extern int vp9_sby_is_skippable_32x32(MACROBLOCKD *xd); -extern int vp9_sbuv_is_skippable_16x16(MACROBLOCKD *xd); +int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd); +int vp9_mbuv_is_skippable_4x4(MACROBLOCKD *xd); +int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd); +int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd); +int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd); +int vp9_sby_is_skippable_32x32(MACROBLOCKD *xd); +int vp9_sbuv_is_skippable_16x16(MACROBLOCKD *xd); struct VP9_COMP; -extern void vp9_tokenize_mb(struct VP9_COMP *cpi, MACROBLOCKD *xd, - TOKENEXTRA **t, int dry_run); -extern void vp9_tokenize_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd, - TOKENEXTRA **t, int dry_run); +void vp9_tokenize_mb(struct VP9_COMP *cpi, MACROBLOCKD *xd, + TOKENEXTRA **t, int dry_run); +void vp9_tokenize_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd, + TOKENEXTRA **t, int dry_run); -extern void vp9_stuff_mb(struct VP9_COMP *cpi, MACROBLOCKD *xd, - TOKENEXTRA **t, int dry_run); -extern void vp9_stuff_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd, - TOKENEXTRA **t, int dry_run); +void vp9_stuff_mb(struct VP9_COMP *cpi, MACROBLOCKD *xd, + TOKENEXTRA **t, int dry_run); +void vp9_stuff_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd, + TOKENEXTRA **t, int dry_run); + +void vp9_fix_contexts_sb(MACROBLOCKD *xd); -extern void vp9_fix_contexts_sb(MACROBLOCKD *xd); #ifdef ENTROPY_STATS void init_context_counters(); void print_context_counters(); -extern vp9_coeff_accum context_counters_4x4[BLOCK_TYPES_4X4]; -extern vp9_coeff_accum context_counters_8x8[BLOCK_TYPES_8X8]; -extern vp9_coeff_accum context_counters_16x16[BLOCK_TYPES_16X16]; +extern vp9_coeff_accum context_counters_4x4[BLOCK_TYPES]; +extern vp9_coeff_accum context_counters_8x8[BLOCK_TYPES]; +extern vp9_coeff_accum context_counters_16x16[BLOCK_TYPES]; extern vp9_coeff_accum context_counters_32x32[BLOCK_TYPES_32X32]; - -extern vp9_coeff_accum hybrid_context_counters_4x4[BLOCK_TYPES_4X4]; -extern vp9_coeff_accum hybrid_context_counters_8x8[BLOCK_TYPES_8X8]; -extern vp9_coeff_accum hybrid_context_counters_16x16[BLOCK_TYPES_16X16]; #endif extern const int *vp9_dct_value_cost_ptr; diff --git a/vp9/encoder/vp9_treewriter.h b/vp9/encoder/vp9_treewriter.h index 4e0e5e12c..832471aa8 100644 --- a/vp9/encoder/vp9_treewriter.h +++ b/vp9/encoder/vp9_treewriter.h @@ -36,30 +36,28 @@ typedef BOOL_CODER vp9_writer; /* Both of these return bits, not scaled bits. */ - -static __inline unsigned int cost_branch(const unsigned int ct[2], - vp9_prob p) { +static INLINE unsigned int cost_branch256(const unsigned int ct[2], + vp9_prob p) { /* Imitate existing calculation */ - return ((ct[0] * vp9_cost_zero(p)) - + (ct[1] * vp9_cost_one(p))) >> 8; + return ct[0] * vp9_cost_zero(p) + ct[1] * vp9_cost_one(p); } -static __inline unsigned int cost_branch256(const unsigned int ct[2], - vp9_prob p) { +static INLINE unsigned int cost_branch(const unsigned int ct[2], + vp9_prob p) { /* Imitate existing calculation */ - return ((ct[0] * vp9_cost_zero(p)) - + (ct[1] * vp9_cost_one(p))); + return cost_branch256(ct, p) >> 8; } + /* Small functions to write explicit values and tokens, as well as estimate their lengths. 
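
The reorder above makes cost_branch the derived form: cost_branch256 returns the probability-weighted code length in 1/256-bit units, and cost_branch merely rescales to whole bits. Assuming vp9_cost_zero(p) is the usual table of round(-256 * log2(p / 256)), a self-contained equivalent:

#include <math.h>

/* Code length of a zero branch at probability p (1..255), in 1/256 bits.
   The real coder uses a precomputed table; this is the defining formula. */
static unsigned int cost_zero_sketch(int p) {
  return (unsigned int)(0.5 - 256.0 * log2(p / 256.0));
}

static unsigned int cost_one_sketch(int p) {
  return cost_zero_sketch(256 - p);
}

/* ct[0] zeros and ct[1] ones coded at probability p; >> 8 gives bits. */
static unsigned int cost_branch256_sketch(const unsigned int ct[2], int p) {
  return ct[0] * cost_zero_sketch(p) + ct[1] * cost_one_sketch(p);
}
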
*/ -static __inline void treed_write(vp9_writer *const w, - vp9_tree t, - const vp9_prob *const p, - int v, - /* number of bits in v, assumed nonzero */ - int n) { +static INLINE void treed_write(vp9_writer *const w, + vp9_tree t, + const vp9_prob *const p, + int v, + /* number of bits in v, assumed nonzero */ + int n) { vp9_tree_index i = 0; do { @@ -69,18 +67,18 @@ static __inline void treed_write(vp9_writer *const w, } while (n); } -static __inline void write_token(vp9_writer *const w, - vp9_tree t, - const vp9_prob *const p, - vp9_token *const x) { +static INLINE void write_token(vp9_writer *const w, + vp9_tree t, + const vp9_prob *const p, + vp9_token *const x) { treed_write(w, t, p, x->value, x->Len); } -static __inline int treed_cost(vp9_tree t, - const vp9_prob *const p, - int v, - /* number of bits in v, assumed nonzero */ - int n) { +static INLINE int treed_cost(vp9_tree t, + const vp9_prob *const p, + int v, + /* number of bits in v, assumed nonzero */ + int n) { int c = 0; vp9_tree_index i = 0; @@ -93,9 +91,9 @@ static __inline int treed_cost(vp9_tree t, return c; } -static __inline int cost_token(vp9_tree t, - const vp9_prob *const p, - vp9_token *const x) { +static INLINE int cost_token(vp9_tree t, + const vp9_prob *const p, + vp9_token *const x) { return treed_cost(t, p, x->value, x->Len); } diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h index 675dbb63e..7120c5fe7 100644 --- a/vp9/encoder/vp9_variance.h +++ b/vp9/encoder/vp9_variance.h @@ -19,12 +19,6 @@ typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr, int ref_stride, unsigned int max_sad); -typedef void (*vp9_copy32xn_fn_t)(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int ref_stride, - int n); - typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, @@ -35,7 +29,7 @@ typedef void (*vp9_sad_multi1_fn_t)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, - unsigned short *sad_array); + unsigned int *sad_array); typedef void (*vp9_sad_multi_d_fn_t)(const uint8_t *src_ptr, int source_stride, @@ -79,7 +73,6 @@ typedef struct vp9_variance_vtable { vp9_sad_multi_fn_t sdx3f; vp9_sad_multi1_fn_t sdx8f; vp9_sad_multi_d_fn_t sdx4df; - vp9_copy32xn_fn_t copymem; } vp9_variance_fn_ptr_t; #endif // VP9_ENCODER_VP9_VARIANCE_H_ diff --git a/vp9/encoder/vp9_variance_c.c b/vp9/encoder/vp9_variance_c.c index d03e285c6..d07a65b45 100644 --- a/vp9/encoder/vp9_variance_c.c +++ b/vp9/encoder/vp9_variance_c.c @@ -142,8 +142,8 @@ unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, const int16_t *HFilter, *VFilter; uint16_t FData3[5 * 4]; // Temp data bufffer used in filtering - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; + HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); // First filter 1d Horizontal var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter); @@ -166,8 +166,8 @@ unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, uint8_t temp2[20 * 16]; const int16_t *HFilter, *VFilter; - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; + HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter); var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter); @@ -186,8 +186,8 @@ unsigned int 
vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, uint8_t temp2[20 * 16]; const int16_t *HFilter, *VFilter; - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; + HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter); var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter); @@ -206,8 +206,8 @@ unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, uint8_t temp2[68 * 64]; const int16_t *HFilter, *VFilter; - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; + HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 65, 64, HFilter); @@ -227,8 +227,8 @@ unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, uint8_t temp2[36 * 32]; const int16_t *HFilter, *VFilter; - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; + HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 33, 32, HFilter); var_filter_block2d_bil_second_pass(FData3, temp2, 32, 32, 32, 32, VFilter); @@ -367,8 +367,8 @@ unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, uint8_t temp2[20 * 16]; const int16_t *HFilter, *VFilter; - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; + HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter); var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter); @@ -387,8 +387,8 @@ unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, uint8_t temp2[20 * 16]; const int16_t *HFilter, *VFilter; - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; + HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter); diff --git a/vp9/encoder/x86/vp9_dct_sse2_intrinsics.c b/vp9/encoder/x86/vp9_dct_sse2_intrinsics.c new file mode 100644 index 000000000..ff884d999 --- /dev/null +++ b/vp9/encoder/x86/vp9_dct_sse2_intrinsics.c @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <emmintrin.h> // SSE2 +#include "vp9/common/vp9_idct.h" // for cospi constants + +#define pair_set_epi16(a, b) \ + _mm_set1_epi32(((uint16_t)(a)) + (((uint16_t)(b)) << 16)) + +void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) { + const int stride = pitch >> 1; + int pass; + // Constants + // When we use them, in one case, they are all the same. In all others + // it's a pair of them that we need to repeat four times. This is done + // by constructing the 32 bit constant corresponding to that pair. 
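+ // For example, pair_set_epi16(c1, c2) broadcasts the 32 bit value + // (c2 << 16) | c1, i.e. the 16 bit lane pattern c1, c2, c1, c2, ..., + // so that _mm_madd_epi16 applied to data interleaved with + // _mm_unpacklo_epi16 yields c1 * x + c2 * y in each 32 bit lane, the + // constant rotation used by the butterflies below.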
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + // Load input + __m128i in0 = _mm_loadu_si128((const __m128i *)(input + 0 * stride)); + __m128i in1 = _mm_loadu_si128((const __m128i *)(input + 1 * stride)); + __m128i in2 = _mm_loadu_si128((const __m128i *)(input + 2 * stride)); + __m128i in3 = _mm_loadu_si128((const __m128i *)(input + 3 * stride)); + __m128i in4 = _mm_loadu_si128((const __m128i *)(input + 4 * stride)); + __m128i in5 = _mm_loadu_si128((const __m128i *)(input + 5 * stride)); + __m128i in6 = _mm_loadu_si128((const __m128i *)(input + 6 * stride)); + __m128i in7 = _mm_loadu_si128((const __m128i *)(input + 7 * stride)); + // Pre-condition input (shift by two) + in0 = _mm_slli_epi16(in0, 2); + in1 = _mm_slli_epi16(in1, 2); + in2 = _mm_slli_epi16(in2, 2); + in3 = _mm_slli_epi16(in3, 2); + in4 = _mm_slli_epi16(in4, 2); + in5 = _mm_slli_epi16(in5, 2); + in6 = _mm_slli_epi16(in6, 2); + in7 = _mm_slli_epi16(in7, 2); + + // We do two passes, first the columns, then the rows. The results of the + // first pass are transposed so that the same column code can be reused. The + // results of the second pass are also transposed so that the rows (processed + // as columns) are put back in row positions. + for (pass = 0; pass < 2; pass++) { + // To store results of each pass before the transpose. 
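+ // Each pass processes eight columns at once, one 16 bit lane per + // column: q0..q3 below are the butterfly sums in[i] + in[7-i] and + // q4..q7 the corresponding differences in[3] - in[4] .. in[0] - in[7].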
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7; + // Add/subtract + const __m128i q0 = _mm_add_epi16(in0, in7); + const __m128i q1 = _mm_add_epi16(in1, in6); + const __m128i q2 = _mm_add_epi16(in2, in5); + const __m128i q3 = _mm_add_epi16(in3, in4); + const __m128i q4 = _mm_sub_epi16(in3, in4); + const __m128i q5 = _mm_sub_epi16(in2, in5); + const __m128i q6 = _mm_sub_epi16(in1, in6); + const __m128i q7 = _mm_sub_epi16(in0, in7); + // Work on first four results + { + // Add/subtract + const __m128i r0 = _mm_add_epi16(q0, q3); + const __m128i r1 = _mm_add_epi16(q1, q2); + const __m128i r2 = _mm_sub_epi16(q1, q2); + const __m128i r3 = _mm_sub_epi16(q0, q3); + // Interleave to do the multiply by constants which gets us into 32 bits + const __m128i t0 = _mm_unpacklo_epi16(r0, r1); + const __m128i t1 = _mm_unpackhi_epi16(r0, r1); + const __m128i t2 = _mm_unpacklo_epi16(r2, r3); + const __m128i t3 = _mm_unpackhi_epi16(r2, r3); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); + const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); + const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); + const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); + const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); + const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + // Combine + res0 = _mm_packs_epi32(w0, w1); + res4 = _mm_packs_epi32(w2, w3); + res2 = _mm_packs_epi32(w4, w5); + res6 = _mm_packs_epi32(w6, w7); + } + // Work on next four results + { + // Interleave to do the multiply by constants which gets us into 32 bits + const __m128i d0 = _mm_unpacklo_epi16(q6, q5); + const __m128i d1 = _mm_unpackhi_epi16(q6, q5); + const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); + const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); + const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); + const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); + const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); + const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); + const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); + const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); + const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); + const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); + const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); + // Combine + const __m128i r0 =
_mm_packs_epi32(s0, s1); + const __m128i r1 = _mm_packs_epi32(s2, s3); + // Add/subtract + const __m128i x0 = _mm_add_epi16(q4, r0); + const __m128i x1 = _mm_sub_epi16(q4, r0); + const __m128i x2 = _mm_sub_epi16(q7, r1); + const __m128i x3 = _mm_add_epi16(q7, r1); + // Interleave to do the multiply by constants which gets us into 32 bits + const __m128i t0 = _mm_unpacklo_epi16(x0, x3); + const __m128i t1 = _mm_unpackhi_epi16(x0, x3); + const __m128i t2 = _mm_unpacklo_epi16(x1, x2); + const __m128i t3 = _mm_unpackhi_epi16(x1, x2); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); + const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); + const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); + const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); + const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); + const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + // Combine + res1 = _mm_packs_epi32(w0, w1); + res7 = _mm_packs_epi32(w2, w3); + res5 = _mm_packs_epi32(w4, w5); + res3 = _mm_packs_epi32(w6, w7); + } + // Transpose the 8x8.
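+ // The transpose below is three rounds of interleaves: 16 bit unpacks + // (tr0_*), then 32 bit unpacks (tr1_*), then 64 bit unpacks back into + // in0..in7; the index comments track where each coefficient lands.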
+ { + // 00 01 02 03 04 05 06 07 + // 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 + // 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 + // 50 51 52 53 54 55 56 57 + // 60 61 62 63 64 65 66 67 + // 70 71 72 73 74 75 76 77 + const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); + const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); + const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); + const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); + const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); + const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); + const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); + const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + // 04 14 05 15 06 16 07 17 + // 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 + // 60 70 61 71 62 72 63 73 + // 44 54 45 55 46 56 47 57 + // 64 74 65 75 66 76 67 77 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + // 00 10 20 30 01 11 21 31 + // 40 50 60 70 41 51 61 71 + // 02 12 22 32 03 13 23 33 + // 42 52 62 72 43 53 63 73 + // 04 14 24 34 05 15 25 35 + // 44 54 64 74 45 55 65 75 + // 06 16 26 36 07 17 27 37 + // 46 56 66 76 47 57 67 77 + in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); + in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); + in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); + in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); + in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); + in5 = _mm_unpackhi_epi64(tr1_1, tr1_5); + in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); + in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + } + } + // Post-condition output and store it + { + // Post-condition (division by two) + // division by two of 16-bit signed numbers using shifts + // n / 2 = (n - (n >> 15)) >> 1 + const __m128i sign_in0 = _mm_srai_epi16(in0, 15); + const __m128i sign_in1 = _mm_srai_epi16(in1, 15); + const __m128i sign_in2 = _mm_srai_epi16(in2, 15); + const __m128i sign_in3 = _mm_srai_epi16(in3, 15); + const __m128i sign_in4 = _mm_srai_epi16(in4, 15); + const __m128i sign_in5 = _mm_srai_epi16(in5, 15); + const __m128i sign_in6 = _mm_srai_epi16(in6, 15); + const __m128i sign_in7 = _mm_srai_epi16(in7, 15); + in0 = _mm_sub_epi16(in0, sign_in0); + in1 = _mm_sub_epi16(in1, sign_in1); + in2 = _mm_sub_epi16(in2, sign_in2); + in3 = _mm_sub_epi16(in3, sign_in3); + in4 = _mm_sub_epi16(in4, sign_in4); + in5 = _mm_sub_epi16(in5, sign_in5); + in6 = _mm_sub_epi16(in6, sign_in6); + in7 = _mm_sub_epi16(in7, sign_in7); + in0 = _mm_srai_epi16(in0, 1); + in1 = _mm_srai_epi16(in1, 1); + in2 = _mm_srai_epi16(in2, 1); + in3 = _mm_srai_epi16(in3, 1); + in4 = _mm_srai_epi16(in4, 1); + in5 = _mm_srai_epi16(in5, 1); + in6 = _mm_srai_epi16(in6, 1); + in7 = _mm_srai_epi16(in7, 1); + // store results + _mm_storeu_si128 ((__m128i *)(output + 0 * 8), in0); + _mm_storeu_si128 ((__m128i *)(output + 1 * 8), in1); + _mm_storeu_si128 ((__m128i *)(output + 2 * 8), in2); + _mm_storeu_si128 ((__m128i
*)(output + 3 * 8), in3); + _mm_storeu_si128 ((__m128i *)(output + 4 * 8), in4); + _mm_storeu_si128 ((__m128i *)(output + 5 * 8), in5); + _mm_storeu_si128 ((__m128i *)(output + 6 * 8), in6); + _mm_storeu_si128 ((__m128i *)(output + 7 * 8), in7); + } +} diff --git a/vp9/encoder/x86/vp9_encodeopt.asm b/vp9/encoder/x86/vp9_encodeopt.asm index 5d9f7769d..90c793d4f 100644 --- a/vp9/encoder/x86/vp9_encodeopt.asm +++ b/vp9/encoder/x86/vp9_encodeopt.asm @@ -125,7 +125,7 @@ sym(vp9_block_error_mmx): ret -;int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc); +;int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr); global sym(vp9_mbblock_error_mmx_impl) PRIVATE sym(vp9_mbblock_error_mmx_impl): push rbp @@ -142,10 +142,6 @@ sym(vp9_mbblock_error_mmx_impl): mov rdi, arg(1) ;dcoef_ptr pxor mm2, mm2 - movd mm1, dword ptr arg(2) ;dc - por mm1, mm2 - - pcmpeqw mm1, mm7 mov rcx, 16 .mberror_loop_mmx: @@ -160,7 +156,6 @@ sym(vp9_mbblock_error_mmx_impl): pmaddwd mm5, mm5 psubw mm3, mm4 - pand mm3, mm1 pmaddwd mm3, mm3 paddd mm2, mm5 @@ -202,28 +197,24 @@ sym(vp9_mbblock_error_mmx_impl): ret -;int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc); +;int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr); global sym(vp9_mbblock_error_xmm_impl) PRIVATE sym(vp9_mbblock_error_xmm_impl): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 3 - SAVE_XMM 6 + SAVE_XMM 5 push rsi push rdi ; end prolog mov rsi, arg(0) ;coeff_ptr - pxor xmm6, xmm6 + pxor xmm5, xmm5 mov rdi, arg(1) ;dcoef_ptr pxor xmm4, xmm4 - movd xmm5, dword ptr arg(2) ;dc - por xmm5, xmm4 - - pcmpeqw xmm5, xmm6 mov rcx, 16 .mberror_loop: @@ -238,7 +229,6 @@ sym(vp9_mbblock_error_xmm_impl): pmaddwd xmm2, xmm2 psubw xmm0, xmm1 - pand xmm0, xmm5 pmaddwd xmm0, xmm0 add rsi, 32 @@ -252,9 +242,9 @@ sym(vp9_mbblock_error_xmm_impl): jnz .mberror_loop movdqa xmm0, xmm4 - punpckldq xmm0, xmm6 + punpckldq xmm0, xmm5 - punpckhdq xmm4, xmm6 + punpckhdq xmm4, xmm5 paddd xmm0, xmm4 movdqa xmm1, xmm0 diff --git a/vp9/encoder/x86/vp9_sad4d_sse2.asm b/vp9/encoder/x86/vp9_sad4d_sse2.asm new file mode 100644 index 000000000..3716d91ec --- /dev/null +++ b/vp9/encoder/x86/vp9_sad4d_sse2.asm @@ -0,0 +1,225 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_4x2x4 5-6 0 + movd m0, [srcq +%2] +%if %1 == 1 + movd m6, [ref1q+%3] + movd m4, [ref2q+%3] + movd m7, [ref3q+%3] + movd m5, [ref4q+%3] + punpckldq m0, [srcq +%4] + punpckldq m6, [ref1q+%5] + punpckldq m4, [ref2q+%5] + punpckldq m7, [ref3q+%5] + punpckldq m5, [ref4q+%5] + psadbw m6, m0 + psadbw m4, m0 + psadbw m7, m0 + psadbw m5, m0 + punpckldq m6, m4 + punpckldq m7, m5 +%else + movd m1, [ref1q+%3] + movd m2, [ref2q+%3] + movd m3, [ref3q+%3] + movd m4, [ref4q+%3] + punpckldq m0, [srcq +%4] + punpckldq m1, [ref1q+%5] + punpckldq m2, [ref2q+%5] + punpckldq m3, [ref3q+%5] + punpckldq m4, [ref4q+%5] + psadbw m1, m0 + psadbw m2, m0 + psadbw m3, m0 + psadbw m4, m0 + punpckldq m1, m2 + punpckldq m3, m4 + paddd m6, m1 + paddd m7, m3 +%endif +%if %6 == 1 + lea srcq, [srcq +src_strideq*2] + lea ref1q, [ref1q+ref_strideq*2] + lea ref2q, [ref2q+ref_strideq*2] + lea ref3q, [ref3q+ref_strideq*2] + lea ref4q, [ref4q+ref_strideq*2] +%endif +%endmacro + +; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_8x2x4 5-6 0 + movh m0, [srcq +%2] +%if %1 == 1 + movh m4, [ref1q+%3] + movh m5, [ref2q+%3] + movh m6, [ref3q+%3] + movh m7, [ref4q+%3] + movhps m0, [srcq +%4] + movhps m4, [ref1q+%5] + movhps m5, [ref2q+%5] + movhps m6, [ref3q+%5] + movhps m7, [ref4q+%5] + psadbw m4, m0 + psadbw m5, m0 + psadbw m6, m0 + psadbw m7, m0 +%else + movh m1, [ref1q+%3] + movh m2, [ref2q+%3] + movh m3, [ref3q+%3] + movhps m0, [srcq +%4] + movhps m1, [ref1q+%5] + movhps m2, [ref2q+%5] + movhps m3, [ref3q+%5] + psadbw m1, m0 + psadbw m2, m0 + psadbw m3, m0 + paddd m4, m1 + movh m1, [ref4q+%3] + movhps m1, [ref4q+%5] + paddd m5, m2 + paddd m6, m3 + psadbw m1, m0 + paddd m7, m1 +%endif +%if %6 == 1 + lea srcq, [srcq +src_strideq*2] + lea ref1q, [ref1q+ref_strideq*2] + lea ref2q, [ref2q+ref_strideq*2] + lea ref3q, [ref3q+ref_strideq*2] + lea ref4q, [ref4q+ref_strideq*2] +%endif +%endmacro + +; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_16x2x4 5-6 0 + ; 1st 16 px + mova m0, [srcq +%2] +%if %1 == 1 + movu m4, [ref1q+%3] + movu m5, [ref2q+%3] + movu m6, [ref3q+%3] + movu m7, [ref4q+%3] + psadbw m4, m0 + psadbw m5, m0 + psadbw m6, m0 + psadbw m7, m0 +%else + movu m1, [ref1q+%3] + movu m2, [ref2q+%3] + movu m3, [ref3q+%3] + psadbw m1, m0 + psadbw m2, m0 + psadbw m3, m0 + paddd m4, m1 + movu m1, [ref4q+%3] + paddd m5, m2 + paddd m6, m3 + psadbw m1, m0 + paddd m7, m1 +%endif + + ; 2nd 16 px + mova m0, [srcq +%4] + movu m1, [ref1q+%5] + movu m2, [ref2q+%5] + movu m3, [ref3q+%5] + psadbw m1, m0 + psadbw m2, m0 + psadbw m3, m0 + paddd m4, m1 + movu m1, [ref4q+%5] + paddd m5, m2 + paddd m6, m3 +%if %6 == 1 + lea srcq, [srcq +src_strideq*2] + lea ref1q, [ref1q+ref_strideq*2] + lea ref2q, [ref2q+ref_strideq*2] + lea ref3q, [ref3q+ref_strideq*2] + lea ref4q, [ref4q+ref_strideq*2] +%endif + psadbw m1, m0 + paddd m7, m1 +%endmacro + +; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_32x2x4 5-6 0 + PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16 + PROCESS_16x2x4 0, %4, %5, %4 + 16, %5 + 16, %6 +%endmacro + +; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_64x2x4 5-6 0 + PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32 + PROCESS_32x2x4 0, %4, %5, %4 + 32, %5 + 32, %6 +%endmacro + +; void vp9_sadNxNx4d_sse2(uint8_t *src, int src_stride, +; uint8_t *ref[4], int ref_stride, +; 
unsigned int res[4]); +; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8 +%macro SADNXN4D 2 +%if UNIX64 +cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + mov ref2q, [ref1q+gprsize*1] + mov ref3q, [ref1q+gprsize*2] + mov ref4q, [ref1q+gprsize*3] + mov ref1q, [ref1q+gprsize*0] + + PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 +%rep (%2-4)/2 + PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 +%endrep + PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 + +%if mmsize == 16 + pslldq m5, 4 + pslldq m7, 4 + por m4, m5 + por m6, m7 + mova m5, m4 + mova m7, m6 + punpcklqdq m4, m6 + punpckhqdq m5, m7 + movifnidn r4, r4mp + paddd m4, m5 + movu [r4], m4 + RET +%else + movifnidn r4, r4mp + movq [r4+0], m6 + movq [r4+8], m7 + RET +%endif +%endmacro + +INIT_XMM sse2 +SADNXN4D 64, 64 +SADNXN4D 32, 32 +SADNXN4D 16, 16 +SADNXN4D 16, 8 +SADNXN4D 8, 16 +SADNXN4D 8, 8 + +INIT_MMX sse +SADNXN4D 4, 4 diff --git a/vp9/encoder/x86/vp9_sad_sse2.asm b/vp9/encoder/x86/vp9_sad_sse2.asm index 33271635c..ea482e071 100644 --- a/vp9/encoder/x86/vp9_sad_sse2.asm +++ b/vp9/encoder/x86/vp9_sad_sse2.asm @@ -8,403 +8,175 @@ ; be found in the AUTHORS file in the root of the source tree. ; - -%include "vpx_ports/x86_abi_support.asm" - -;unsigned int vp9_sad16x16_wmt( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) -global sym(vp9_sad16x16_wmt) PRIVATE -sym(vp9_sad16x16_wmt): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - SAVE_XMM 6 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - lea rcx, [rsi+rax*8] - - lea rcx, [rcx+rax*8] - pxor xmm6, xmm6 - -.x16x16sad_wmt_loop: - - movq xmm0, QWORD PTR [rsi] - movq xmm2, QWORD PTR [rsi+8] - - movq xmm1, QWORD PTR [rdi] - movq xmm3, QWORD PTR [rdi+8] - - movq xmm4, QWORD PTR [rsi+rax] - movq xmm5, QWORD PTR [rdi+rdx] - - - punpcklbw xmm0, xmm2 - punpcklbw xmm1, xmm3 - - psadbw xmm0, xmm1 - movq xmm2, QWORD PTR [rsi+rax+8] - - movq xmm3, QWORD PTR [rdi+rdx+8] - lea rsi, [rsi+rax*2] - - lea rdi, [rdi+rdx*2] - punpcklbw xmm4, xmm2 - - punpcklbw xmm5, xmm3 - psadbw xmm4, xmm5 - - paddw xmm6, xmm0 - paddw xmm6, xmm4 - - cmp rsi, rcx - jne .x16x16sad_wmt_loop - - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movq rax, xmm0 - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;unsigned int vp9_sad8x16_wmt( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int max_err) -global sym(vp9_sad8x16_wmt) PRIVATE -sym(vp9_sad8x16_wmt): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rbx, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - lea rcx, [rsi+rbx*8] - - lea rcx, [rcx+rbx*8] - pxor mm7, mm7 - -.x8x16sad_wmt_loop: - - movq rax, mm7 - cmp eax, arg(4) - jg .x8x16sad_wmt_early_exit - - movq mm0, QWORD PTR [rsi] - movq mm1, QWORD PTR [rdi] - - movq mm2, QWORD PTR [rsi+rbx] - movq mm3, QWORD PTR [rdi+rdx] - - psadbw mm0, mm1 - psadbw mm2, mm3 - - lea rsi, [rsi+rbx*2] - lea rdi, [rdi+rdx*2] - - paddw mm7, mm0 - paddw mm7, mm2 - - cmp rsi, rcx - jne 
.x8x16sad_wmt_loop - - movq rax, mm7 - -.x8x16sad_wmt_early_exit: - - ; begin epilog - pop rdi - pop rsi - pop rbx - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp9_sad8x8_wmt( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) -global sym(vp9_sad8x8_wmt) PRIVATE -sym(vp9_sad8x8_wmt): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rbx, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - lea rcx, [rsi+rbx*8] - pxor mm7, mm7 - -.x8x8sad_wmt_loop: - - movq rax, mm7 - cmp eax, arg(4) - jg .x8x8sad_wmt_early_exit - - movq mm0, QWORD PTR [rsi] - movq mm1, QWORD PTR [rdi] - - psadbw mm0, mm1 - lea rsi, [rsi+rbx] - - add rdi, rdx - paddw mm7, mm0 - - cmp rsi, rcx - jne .x8x8sad_wmt_loop - - movq rax, mm7 -.x8x8sad_wmt_early_exit: - - ; begin epilog - pop rdi - pop rsi - pop rbx - UNSHADOW_ARGS - pop rbp - ret - -;unsigned int vp9_sad4x4_wmt( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) -global sym(vp9_sad4x4_wmt) PRIVATE -sym(vp9_sad4x4_wmt): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - movd mm0, DWORD PTR [rsi] - movd mm1, DWORD PTR [rdi] - - movd mm2, DWORD PTR [rsi+rax] - movd mm3, DWORD PTR [rdi+rdx] - - punpcklbw mm0, mm2 - punpcklbw mm1, mm3 - - psadbw mm0, mm1 - lea rsi, [rsi+rax*2] - - lea rdi, [rdi+rdx*2] - movd mm4, DWORD PTR [rsi] - - movd mm5, DWORD PTR [rdi] - movd mm6, DWORD PTR [rsi+rax] - - movd mm7, DWORD PTR [rdi+rdx] - punpcklbw mm4, mm6 - - punpcklbw mm5, mm7 - psadbw mm4, mm5 - - paddw mm0, mm4 - movq rax, mm0 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp9_sad16x8_wmt( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) -global sym(vp9_sad16x8_wmt) PRIVATE -sym(vp9_sad16x8_wmt): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rbx - push rsi - push rdi - ; end prolog - - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rbx, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - lea rcx, [rsi+rbx*8] - pxor mm7, mm7 - -.x16x8sad_wmt_loop: - - movq rax, mm7 - cmp eax, arg(4) - jg .x16x8sad_wmt_early_exit - - movq mm0, QWORD PTR [rsi] - movq mm2, QWORD PTR [rsi+8] - - movq mm1, QWORD PTR [rdi] - movq mm3, QWORD PTR [rdi+8] - - movq mm4, QWORD PTR [rsi+rbx] - movq mm5, QWORD PTR [rdi+rdx] - - psadbw mm0, mm1 - psadbw mm2, mm3 - - movq mm1, QWORD PTR [rsi+rbx+8] - movq mm3, QWORD PTR [rdi+rdx+8] - - psadbw mm4, mm5 - psadbw mm1, mm3 - - lea rsi, [rsi+rbx*2] - lea rdi, [rdi+rdx*2] - - paddw mm0, mm2 - paddw mm4, mm1 - - paddw mm7, mm0 - paddw mm7, mm4 - - cmp rsi, rcx - jne .x16x8sad_wmt_loop - - movq rax, mm7 - -.x16x8sad_wmt_early_exit: - - ; begin epilog - pop rdi - pop rsi - pop rbx - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_copy32xn_sse2( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *dst_ptr, -; int dst_stride, -; int height); -global sym(vp9_copy32xn_sse2) PRIVATE -sym(vp9_copy32xn_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;dst_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword 
ptr arg(3) ;dst_stride - movsxd rcx, dword ptr arg(4) ;height - -.block_copy_sse2_loopx4: - movdqu xmm0, XMMWORD PTR [rsi] - movdqu xmm1, XMMWORD PTR [rsi + 16] - movdqu xmm2, XMMWORD PTR [rsi + rax] - movdqu xmm3, XMMWORD PTR [rsi + rax + 16] - - lea rsi, [rsi+rax*2] - - movdqu xmm4, XMMWORD PTR [rsi] - movdqu xmm5, XMMWORD PTR [rsi + 16] - movdqu xmm6, XMMWORD PTR [rsi + rax] - movdqu xmm7, XMMWORD PTR [rsi + rax + 16] - - lea rsi, [rsi+rax*2] - - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi + 16], xmm1 - movdqa XMMWORD PTR [rdi + rdx], xmm2 - movdqa XMMWORD PTR [rdi + rdx + 16], xmm3 - - lea rdi, [rdi+rdx*2] - - movdqa XMMWORD PTR [rdi], xmm4 - movdqa XMMWORD PTR [rdi + 16], xmm5 - movdqa XMMWORD PTR [rdi + rdx], xmm6 - movdqa XMMWORD PTR [rdi + rdx + 16], xmm7 - - lea rdi, [rdi+rdx*2] - - sub rcx, 4 - cmp rcx, 4 - jge .block_copy_sse2_loopx4 - - cmp rcx, 0 - je .copy_is_done - -.block_copy_sse2_loop: - movdqu xmm0, XMMWORD PTR [rsi] - movdqu xmm1, XMMWORD PTR [rsi + 16] - lea rsi, [rsi+rax] - - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi + 16], xmm1 - lea rdi, [rdi+rdx] - - sub rcx, 1 - jne .block_copy_sse2_loop - -.copy_is_done: - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +INIT_XMM sse2 +cglobal sad64x64, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + mov n_rowsd, 64 + pxor m0, m0 +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] + psadbw m1, [srcq] + psadbw m2, [srcq+16] + psadbw m3, [srcq+32] + psadbw m4, [srcq+48] + paddd m1, m2 + paddd m3, m4 + add refq, ref_strideq + paddd m0, m1 + add srcq, src_strideq + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET + +; unsigned int vp9_sad32x32_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +INIT_XMM sse2 +cglobal sad32x32, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + mov n_rowsd, 16 + pxor m0, m0 + +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+ref_strideq] + movu m4, [refq+ref_strideq+16] + psadbw m1, [srcq] + psadbw m2, [srcq+16] + psadbw m3, [srcq+src_strideq] + psadbw m4, [srcq+src_strideq+16] + paddd m1, m2 + paddd m3, m4 + lea refq, [refq+ref_strideq*2] + paddd m0, m1 + lea srcq, [srcq+src_strideq*2] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET + +; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD16XN 1 +cglobal sad16x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + lea src_stride3q, [src_strideq*3] + lea ref_stride3q, [ref_strideq*3] + mov n_rowsd, %1/4 + pxor m0, m0 + +.loop: + movu m1, [refq] + movu m2, [refq+ref_strideq] + movu m3, [refq+ref_strideq*2] + movu m4, [refq+ref_stride3q] + psadbw m1, [srcq] + psadbw m2, [srcq+src_strideq] + psadbw m3, [srcq+src_strideq*2] + psadbw m4, [srcq+src_stride3q] + paddd m1, m2 + paddd m3, m4 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + 
RET +%endmacro + +INIT_XMM sse2 +SAD16XN 16 ; sad16x16_sse2 +SAD16XN 8 ; sad16x8_sse2 + +; unsigned int vp9_sad8x{8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD8XN 1 +cglobal sad8x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + lea src_stride3q, [src_strideq*3] + lea ref_stride3q, [ref_strideq*3] + mov n_rowsd, %1/4 + pxor m0, m0 + +.loop: + movh m1, [refq] + movhps m1, [refq+ref_strideq] + movh m2, [refq+ref_strideq*2] + movhps m2, [refq+ref_stride3q] + movh m3, [srcq] + movhps m3, [srcq+src_strideq] + movh m4, [srcq+src_strideq*2] + movhps m4, [srcq+src_stride3q] + psadbw m1, m3 + psadbw m2, m4 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m2 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD8XN 16 ; sad8x16_sse2 +SAD8XN 8 ; sad8x8_sse2 + +; unsigned int vp9_sad4x4_sse(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +INIT_MMX sse +cglobal sad4x4, 4, 4, 8, src, src_stride, ref, ref_stride + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + movd m0, [refq] + movd m1, [refq+ref_strideq] + movd m2, [srcq] + movd m3, [srcq+src_strideq] + lea refq, [refq+ref_strideq*2] + lea srcq, [srcq+src_strideq*2] + movd m4, [refq] + movd m5, [refq+ref_strideq] + movd m6, [srcq] + movd m7, [srcq+src_strideq] + punpckldq m0, m1 + punpckldq m2, m3 + punpckldq m4, m5 + punpckldq m6, m7 + psadbw m0, m2 + psadbw m4, m6 + paddd m0, m4 + movd eax, m0 + RET diff --git a/vp9/encoder/x86/vp9_sad_sse3.asm b/vp9/encoder/x86/vp9_sad_sse3.asm index 1c39a08f8..2b90a5d54 100644 --- a/vp9/encoder/x86/vp9_sad_sse3.asm +++ b/vp9/encoder/x86/vp9_sad_sse3.asm @@ -83,87 +83,6 @@ ret %endmacro -%macro STACK_FRAME_CREATE_X4 0 -%if ABI_IS_32BIT - %define src_ptr rsi - %define src_stride rax - %define r0_ptr rcx - %define r1_ptr rdx - %define r2_ptr rbx - %define r3_ptr rdi - %define ref_stride rbp - %define result_ptr arg(4) - push rbp - mov rbp, rsp - push rsi - push rdi - push rbx - - push rbp - mov rdi, arg(2) ; ref_ptr_base - - LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi - - mov rsi, arg(0) ; src_ptr - - movsxd rbx, dword ptr arg(1) ; src_stride - movsxd rbp, dword ptr arg(3) ; ref_stride - - xchg rbx, rax -%else - %if LIBVPX_YASM_WIN64 - SAVE_XMM 7, u - %define src_ptr rcx - %define src_stride rdx - %define r0_ptr rsi - %define r1_ptr r10 - %define r2_ptr r11 - %define r3_ptr r8 - %define ref_stride r9 - %define result_ptr [rsp+xmm_stack_space+16+4*8] - push rsi - - LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr - %else - %define src_ptr rdi - %define src_stride rsi - %define r0_ptr r9 - %define r1_ptr r10 - %define r2_ptr r11 - %define r3_ptr rdx - %define ref_stride rcx - %define result_ptr r8 - - LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr - - %endif -%endif -%endmacro - -%macro STACK_FRAME_DESTROY_X4 0 - %define src_ptr - %define src_stride - %define r0_ptr - %define r1_ptr - %define r2_ptr - %define r3_ptr - %define ref_stride - %define result_ptr - -%if ABI_IS_32BIT - pop rbx - pop rdi - pop rsi - pop rbp -%else - %if LIBVPX_YASM_WIN64 - pop rsi - RESTORE_XMM - %endif -%endif - ret -%endmacro - %macro PROCESS_16X2X3 5 %if %1==0 movdqa xmm0, XMMWORD PTR [%2] @@ -250,130 +169,6 @@ paddw mm7, mm3 %endmacro -%macro LOAD_X4_ADDRESSES 5 - mov %2, [%1+REG_SZ_BYTES*0] - mov %3, [%1+REG_SZ_BYTES*1] - - mov 
%4, [%1+REG_SZ_BYTES*2] - mov %5, [%1+REG_SZ_BYTES*3] -%endmacro - -%macro PROCESS_16X2X4 8 -%if %1==0 - movdqa xmm0, XMMWORD PTR [%2] - lddqu xmm4, XMMWORD PTR [%3] - lddqu xmm5, XMMWORD PTR [%4] - lddqu xmm6, XMMWORD PTR [%5] - lddqu xmm7, XMMWORD PTR [%6] - - psadbw xmm4, xmm0 - psadbw xmm5, xmm0 - psadbw xmm6, xmm0 - psadbw xmm7, xmm0 -%else - movdqa xmm0, XMMWORD PTR [%2] - lddqu xmm1, XMMWORD PTR [%3] - lddqu xmm2, XMMWORD PTR [%4] - lddqu xmm3, XMMWORD PTR [%5] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm4, xmm1 - lddqu xmm1, XMMWORD PTR [%6] - paddw xmm5, xmm2 - paddw xmm6, xmm3 - - psadbw xmm1, xmm0 - paddw xmm7, xmm1 -%endif - movdqa xmm0, XMMWORD PTR [%2+%7] - lddqu xmm1, XMMWORD PTR [%3+%8] - lddqu xmm2, XMMWORD PTR [%4+%8] - lddqu xmm3, XMMWORD PTR [%5+%8] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm4, xmm1 - lddqu xmm1, XMMWORD PTR [%6+%8] - paddw xmm5, xmm2 - paddw xmm6, xmm3 - -%if %1==0 || %1==1 - lea %2, [%2+%7*2] - lea %3, [%3+%8*2] - - lea %4, [%4+%8*2] - lea %5, [%5+%8*2] - - lea %6, [%6+%8*2] -%endif - psadbw xmm1, xmm0 - paddw xmm7, xmm1 - -%endmacro - -%macro PROCESS_8X2X4 8 -%if %1==0 - movq mm0, QWORD PTR [%2] - movq mm4, QWORD PTR [%3] - movq mm5, QWORD PTR [%4] - movq mm6, QWORD PTR [%5] - movq mm7, QWORD PTR [%6] - - psadbw mm4, mm0 - psadbw mm5, mm0 - psadbw mm6, mm0 - psadbw mm7, mm0 -%else - movq mm0, QWORD PTR [%2] - movq mm1, QWORD PTR [%3] - movq mm2, QWORD PTR [%4] - movq mm3, QWORD PTR [%5] - - psadbw mm1, mm0 - psadbw mm2, mm0 - psadbw mm3, mm0 - - paddw mm4, mm1 - movq mm1, QWORD PTR [%6] - paddw mm5, mm2 - paddw mm6, mm3 - - psadbw mm1, mm0 - paddw mm7, mm1 -%endif - movq mm0, QWORD PTR [%2+%7] - movq mm1, QWORD PTR [%3+%8] - movq mm2, QWORD PTR [%4+%8] - movq mm3, QWORD PTR [%5+%8] - - psadbw mm1, mm0 - psadbw mm2, mm0 - psadbw mm3, mm0 - - paddw mm4, mm1 - movq mm1, QWORD PTR [%6+%8] - paddw mm5, mm2 - paddw mm6, mm3 - -%if %1==0 || %1==1 - lea %2, [%2+%7*2] - lea %3, [%3+%8*2] - - lea %4, [%4+%8*2] - lea %5, [%5+%8*2] - - lea %6, [%6+%8*2] -%endif - psadbw mm1, mm0 - paddw mm7, mm1 - -%endmacro - ;void int vp9_sad16x16x3_sse3( ; unsigned char *src_ptr, ; int src_stride, @@ -581,380 +376,3 @@ sym(vp9_sad4x4x3_sse3): movd [rcx+8], mm7 STACK_FRAME_DESTROY_X3 - -;unsigned int vp9_sad16x16_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int max_err) -;%define lddqu movdqu -global sym(vp9_sad16x16_sse3) PRIVATE -sym(vp9_sad16x16_sse3): - - STACK_FRAME_CREATE_X3 - - mov end_ptr, 4 - pxor xmm7, xmm7 - -.vp9_sad16x16_sse3_loop: - movdqa xmm0, XMMWORD PTR [src_ptr] - movdqu xmm1, XMMWORD PTR [ref_ptr] - movdqa xmm2, XMMWORD PTR [src_ptr+src_stride] - movdqu xmm3, XMMWORD PTR [ref_ptr+ref_stride] - - lea src_ptr, [src_ptr+src_stride*2] - lea ref_ptr, [ref_ptr+ref_stride*2] - - movdqa xmm4, XMMWORD PTR [src_ptr] - movdqu xmm5, XMMWORD PTR [ref_ptr] - movdqa xmm6, XMMWORD PTR [src_ptr+src_stride] - - psadbw xmm0, xmm1 - - movdqu xmm1, XMMWORD PTR [ref_ptr+ref_stride] - - psadbw xmm2, xmm3 - psadbw xmm4, xmm5 - psadbw xmm6, xmm1 - - lea src_ptr, [src_ptr+src_stride*2] - lea ref_ptr, [ref_ptr+ref_stride*2] - - paddw xmm7, xmm0 - paddw xmm7, xmm2 - paddw xmm7, xmm4 - paddw xmm7, xmm6 - - sub end_ptr, 1 - jne .vp9_sad16x16_sse3_loop - - movq xmm0, xmm7 - psrldq xmm7, 8 - paddw xmm0, xmm7 - movq rax, xmm0 - - STACK_FRAME_DESTROY_X3 - -;void vp9_copy32xn_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *dst_ptr, -; int dst_stride, -; 
int height); -global sym(vp9_copy32xn_sse3) PRIVATE -sym(vp9_copy32xn_sse3): - - STACK_FRAME_CREATE_X3 - -.block_copy_sse3_loopx4: - lea end_ptr, [src_ptr+src_stride*2] - - movdqu xmm0, XMMWORD PTR [src_ptr] - movdqu xmm1, XMMWORD PTR [src_ptr + 16] - movdqu xmm2, XMMWORD PTR [src_ptr + src_stride] - movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16] - movdqu xmm4, XMMWORD PTR [end_ptr] - movdqu xmm5, XMMWORD PTR [end_ptr + 16] - movdqu xmm6, XMMWORD PTR [end_ptr + src_stride] - movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16] - - lea src_ptr, [src_ptr+src_stride*4] - - lea end_ptr, [ref_ptr+ref_stride*2] - - movdqa XMMWORD PTR [ref_ptr], xmm0 - movdqa XMMWORD PTR [ref_ptr + 16], xmm1 - movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2 - movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3 - movdqa XMMWORD PTR [end_ptr], xmm4 - movdqa XMMWORD PTR [end_ptr + 16], xmm5 - movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6 - movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7 - - lea ref_ptr, [ref_ptr+ref_stride*4] - - sub height, 4 - cmp height, 4 - jge .block_copy_sse3_loopx4 - - ;Check to see if there is more rows need to be copied. - cmp height, 0 - je .copy_is_done - -.block_copy_sse3_loop: - movdqu xmm0, XMMWORD PTR [src_ptr] - movdqu xmm1, XMMWORD PTR [src_ptr + 16] - lea src_ptr, [src_ptr+src_stride] - - movdqa XMMWORD PTR [ref_ptr], xmm0 - movdqa XMMWORD PTR [ref_ptr + 16], xmm1 - lea ref_ptr, [ref_ptr+ref_stride] - - sub height, 1 - jne .block_copy_sse3_loop - -.copy_is_done: - STACK_FRAME_DESTROY_X3 - -;void vp9_sad16x16x4d_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr_base, -; int ref_stride, -; int *results) -global sym(vp9_sad16x16x4d_sse3) PRIVATE -sym(vp9_sad16x16x4d_sse3): - - STACK_FRAME_CREATE_X4 - - PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - -%if ABI_IS_32BIT - pop rbp -%endif - mov rcx, result_ptr - - movq xmm0, xmm4 - psrldq xmm4, 8 - - paddw xmm0, xmm4 - movd [rcx], xmm0 -;- - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rcx+4], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rcx+8], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rcx+12], xmm0 - - STACK_FRAME_DESTROY_X4 - -;void vp9_sad16x8x4d_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr_base, -; int ref_stride, -; int *results) -global sym(vp9_sad16x8x4d_sse3) PRIVATE -sym(vp9_sad16x8x4d_sse3): - - STACK_FRAME_CREATE_X4 - - PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - -%if ABI_IS_32BIT - pop rbp -%endif - mov rcx, result_ptr - - movq xmm0, xmm4 - psrldq xmm4, 8 - - paddw xmm0, xmm4 - 
movd [rcx], xmm0 -;- - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rcx+4], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rcx+8], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rcx+12], xmm0 - - STACK_FRAME_DESTROY_X4 - -;void int vp9_sad8x16x4d_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(vp9_sad8x16x4d_sse3) PRIVATE -sym(vp9_sad8x16x4d_sse3): - - STACK_FRAME_CREATE_X4 - - PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - -%if ABI_IS_32BIT - pop rbp -%endif - mov rcx, result_ptr - - punpckldq mm4, mm5 - punpckldq mm6, mm7 - - movq [rcx], mm4 - movq [rcx+8], mm6 - - STACK_FRAME_DESTROY_X4 - -;void int vp9_sad8x8x4d_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(vp9_sad8x8x4d_sse3) PRIVATE -sym(vp9_sad8x8x4d_sse3): - - STACK_FRAME_CREATE_X4 - - PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - -%if ABI_IS_32BIT - pop rbp -%endif - mov rcx, result_ptr - - punpckldq mm4, mm5 - punpckldq mm6, mm7 - - movq [rcx], mm4 - movq [rcx+8], mm6 - - STACK_FRAME_DESTROY_X4 - -;void int vp9_sad4x4x4d_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(vp9_sad4x4x4d_sse3) PRIVATE -sym(vp9_sad4x4x4d_sse3): - - STACK_FRAME_CREATE_X4 - - movd mm0, DWORD PTR [src_ptr] - movd mm1, DWORD PTR [r0_ptr] - - movd mm2, DWORD PTR [src_ptr+src_stride] - movd mm3, DWORD PTR [r0_ptr+ref_stride] - - punpcklbw mm0, mm2 - punpcklbw mm1, mm3 - - movd mm4, DWORD PTR [r1_ptr] - movd mm5, DWORD PTR [r2_ptr] - - movd mm6, DWORD PTR [r3_ptr] - movd mm2, DWORD PTR [r1_ptr+ref_stride] - - movd mm3, DWORD PTR [r2_ptr+ref_stride] - movd mm7, DWORD PTR [r3_ptr+ref_stride] - - psadbw mm1, mm0 - - punpcklbw mm4, mm2 - punpcklbw mm5, mm3 - - punpcklbw mm6, mm7 - psadbw mm4, mm0 - - psadbw mm5, mm0 - psadbw mm6, mm0 - - - - lea src_ptr, [src_ptr+src_stride*2] - lea r0_ptr, [r0_ptr+ref_stride*2] - - lea r1_ptr, [r1_ptr+ref_stride*2] - lea r2_ptr, [r2_ptr+ref_stride*2] - - lea r3_ptr, [r3_ptr+ref_stride*2] - - movd mm0, DWORD PTR [src_ptr] - movd mm2, DWORD PTR [r0_ptr] - - movd mm3, DWORD PTR [src_ptr+src_stride] - movd mm7, DWORD PTR [r0_ptr+ref_stride] - - punpcklbw mm0, mm3 - punpcklbw mm2, mm7 - - movd mm3, DWORD PTR [r1_ptr] - movd mm7, DWORD PTR [r2_ptr] - - psadbw mm2, mm0 -%if ABI_IS_32BIT - mov rax, rbp - - pop rbp -%define ref_stride rax -%endif - mov rsi, result_ptr - - paddw mm1, mm2 - movd [rsi], mm1 - - movd mm2, DWORD PTR [r1_ptr+ref_stride] - 
movd mm1, DWORD PTR [r2_ptr+ref_stride] - - punpcklbw mm3, mm2 - punpcklbw mm7, mm1 - - psadbw mm3, mm0 - psadbw mm7, mm0 - - movd mm2, DWORD PTR [r3_ptr] - movd mm1, DWORD PTR [r3_ptr+ref_stride] - - paddw mm3, mm4 - paddw mm7, mm5 - - movd [rsi+4], mm3 - punpcklbw mm2, mm1 - - movd [rsi+8], mm7 - psadbw mm2, mm0 - - paddw mm2, mm6 - movd [rsi+12], mm2 - - - STACK_FRAME_DESTROY_X4 - diff --git a/vp9/encoder/x86/vp9_sad_sse4.asm b/vp9/encoder/x86/vp9_sad_sse4.asm index b42982a1f..faf1768a9 100644 --- a/vp9/encoder/x86/vp9_sad_sse4.asm +++ b/vp9/encoder/x86/vp9_sad_sse4.asm @@ -154,6 +154,16 @@ paddw xmm1, xmm5 %endmacro +%macro WRITE_AS_INTS 0 + mov rdi, arg(4) ;Results + pxor xmm0, xmm0 + movdqa xmm2, xmm1 + punpcklwd xmm1, xmm0 + punpckhwd xmm2, xmm0 + + movdqa [rdi], xmm1 + movdqa [rdi + 16], xmm2 +%endmacro ;void vp9_sad16x16x8_sse4( ; const unsigned char *src_ptr, @@ -170,23 +180,22 @@ sym(vp9_sad16x16x8_sse4): push rdi ; end prolog - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride - PROCESS_16X2X8 1 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 + PROCESS_16X2X8 1 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 - mov rdi, arg(4) ;Results - movdqa XMMWORD PTR [rdi], xmm1 + WRITE_AS_INTS ; begin epilog pop rdi @@ -212,19 +221,18 @@ sym(vp9_sad16x8x8_sse4): push rdi ; end prolog - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride - PROCESS_16X2X8 1 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 + PROCESS_16X2X8 1 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 - mov rdi, arg(4) ;Results - movdqa XMMWORD PTR [rdi], xmm1 + WRITE_AS_INTS ; begin epilog pop rdi @@ -250,19 +258,18 @@ sym(vp9_sad8x8x8_sse4): push rdi ; end prolog - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride - PROCESS_8X2X8 1 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 + PROCESS_8X2X8 1 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 - mov rdi, arg(4) ;Results - movdqa XMMWORD PTR [rdi], xmm1 + WRITE_AS_INTS ; begin epilog pop rdi @@ -288,22 +295,22 @@ sym(vp9_sad8x16x8_sse4): push rdi ; end prolog - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride + PROCESS_8X2X8 1 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 - PROCESS_8X2X8 1 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - mov rdi, arg(4) ;Results - movdqa XMMWORD PTR [rdi], xmm1 + WRITE_AS_INTS ; begin 
epilog pop rdi @@ -329,17 +336,16 @@ sym(vp9_sad4x4x8_sse4): push rdi ; end prolog - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride - PROCESS_4X2X8 1 - PROCESS_4X2X8 0 + PROCESS_4X2X8 1 + PROCESS_4X2X8 0 - mov rdi, arg(4) ;Results - movdqa XMMWORD PTR [rdi], xmm1 + WRITE_AS_INTS ; begin epilog pop rdi diff --git a/vp9/encoder/x86/vp9_variance_sse2.c b/vp9/encoder/x86/vp9_variance_sse2.c index 36fae6e8c..fc363b6b0 100644 --- a/vp9/encoder/x86/vp9_variance_sse2.c +++ b/vp9/encoder/x86/vp9_variance_sse2.c @@ -186,6 +186,7 @@ unsigned int vp9_variance16x16_wmt *sse = sse0; return (sse0 - (((unsigned int)sum0 * sum0) >> 8)); } + unsigned int vp9_mse16x16_wmt( const unsigned char *src_ptr, int source_stride, @@ -305,20 +306,16 @@ unsigned int vp9_sub_pixel_variance8x8_wmt return (xxsum - (((unsigned int)xsum * xsum) >> 6)); } -unsigned int vp9_sub_pixel_variance16x16_wmt -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { +static void sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const uint8_t *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse, int *avg) { int xsum0, xsum1; unsigned int xxsum0, xxsum1; - // note we could avoid these if statements if the calling function // just called the appropriate functions inside. if (xoffset == HALFNDX && yoffset == 0) { @@ -355,10 +352,136 @@ unsigned int vp9_sub_pixel_variance16x16_wmt } *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); + *avg = xsum0; +} + +unsigned int vp9_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const uint8_t *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse_ptr) { + int avg; + unsigned int sse; + + sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, + yoffset, dst_ptr, dst_pixels_per_line, + &sse, &avg); + *sse_ptr = sse; + + return (sse - (((unsigned int) avg * avg) >> 8)); +} + +unsigned int vp9_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const uint8_t *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse_ptr) { + int avg0, avg1, avg2, avg3; + unsigned int sse0, sse1, sse2, sse3; + + sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, + yoffset, dst_ptr, dst_pixels_per_line, + &sse0, &avg0); + sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, + yoffset, dst_ptr + 16, dst_pixels_per_line, + &sse1, &avg1); + src_ptr += 16 * src_pixels_per_line; + dst_ptr += 16 * dst_pixels_per_line; + sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, + yoffset, dst_ptr, dst_pixels_per_line, + &sse2, &avg2); + sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, + yoffset, dst_ptr + 16, dst_pixels_per_line, + &sse3, &avg3); + sse0 += sse1 + sse2 + sse3; + avg0 += avg1 + avg2 + avg3; + *sse_ptr = sse0; + + return (sse0 - (((unsigned int) avg0 * avg0) >> 10)); +} + +unsigned int vp9_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const uint8_t *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse_ptr) { + int 
avg0, avg1, avg2, avg3, avg4; + unsigned int sse0, sse1, sse2, sse3, sse4; + + sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, + yoffset, dst_ptr, dst_pixels_per_line, + &sse0, &avg0); + sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, + yoffset, dst_ptr + 16, dst_pixels_per_line, + &sse1, &avg1); + sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset, + yoffset, dst_ptr + 32, dst_pixels_per_line, + &sse2, &avg2); + sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset, + yoffset, dst_ptr + 48, dst_pixels_per_line, + &sse3, &avg3); + src_ptr += 16 * src_pixels_per_line; + dst_ptr += 16 * dst_pixels_per_line; + avg0 += avg1 + avg2 + avg3; + sse0 += sse1 + sse2 + sse3; + sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, + yoffset, dst_ptr, dst_pixels_per_line, + &sse1, &avg1); + sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, + yoffset, dst_ptr + 16, dst_pixels_per_line, + &sse2, &avg2); + sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset, + yoffset, dst_ptr + 32, dst_pixels_per_line, + &sse3, &avg3); + sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset, + yoffset, dst_ptr + 48, dst_pixels_per_line, + &sse4, &avg4); + src_ptr += 16 * src_pixels_per_line; + dst_ptr += 16 * dst_pixels_per_line; + avg0 += avg1 + avg2 + avg3 + avg4; + sse0 += sse1 + sse2 + sse3 + sse4; + sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, + yoffset, dst_ptr, dst_pixels_per_line, + &sse1, &avg1); + sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, + yoffset, dst_ptr + 16, dst_pixels_per_line, + &sse2, &avg2); + sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset, + yoffset, dst_ptr + 32, dst_pixels_per_line, + &sse3, &avg3); + sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset, + yoffset, dst_ptr + 48, dst_pixels_per_line, + &sse4, &avg4); + src_ptr += 16 * src_pixels_per_line; + dst_ptr += 16 * dst_pixels_per_line; + avg0 += avg1 + avg2 + avg3 + avg4; + sse0 += sse1 + sse2 + sse3 + sse4; + sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, + yoffset, dst_ptr, dst_pixels_per_line, + &sse1, &avg1); + sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, + yoffset, dst_ptr + 16, dst_pixels_per_line, + &sse2, &avg2); + sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset, + yoffset, dst_ptr + 32, dst_pixels_per_line, + &sse3, &avg3); + sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset, + yoffset, dst_ptr + 48, dst_pixels_per_line, + &sse4, &avg4); + avg0 += avg1 + avg2 + avg3 + avg4; + sse0 += sse1 + sse2 + sse3 + sse4; + *sse_ptr = sse0; + + return (sse0 - (((unsigned int) avg0 * avg0) >> 12)); } -unsigned int vp9_sub_pixel_mse16x16_wmt( +unsigned int vp9_sub_pixel_mse16x16_sse2( const unsigned char *src_ptr, int src_pixels_per_line, int xoffset, @@ -367,7 +490,8 @@ unsigned int vp9_sub_pixel_mse16x16_wmt( int dst_pixels_per_line, unsigned int *sse ) { - vp9_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); + vp9_sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, + yoffset, dst_ptr, dst_pixels_per_line, sse); return *sse; } diff --git a/vp9/encoder/x86/vp9_x86_csystemdependent.c b/vp9/encoder/x86/vp9_x86_csystemdependent.c index 3beef53a2..2bf32c569 100644 --- a/vp9/encoder/x86/vp9_x86_csystemdependent.c +++ 
b/vp9/encoder/x86/vp9_x86_csystemdependent.c @@ -23,11 +23,11 @@ void vp9_short_fdct8x4_mmx(short *input, short *output, int pitch) { vp9_short_fdct4x4_mmx(input + 4, output + 16, pitch); } -int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc); -int vp9_mbblock_error_mmx(MACROBLOCK *mb, int dc) { +int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr); +int vp9_mbblock_error_mmx(MACROBLOCK *mb) { short *coeff_ptr = mb->block[0].coeff; short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff; - return vp9_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr, dc); + return vp9_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr); } int vp9_mbuverror_mmx_impl(short *s_ptr, short *d_ptr); @@ -51,11 +51,11 @@ void vp9_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) { #endif #if HAVE_SSE2 -int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc); -int vp9_mbblock_error_xmm(MACROBLOCK *mb, int dc) { +int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr); +int vp9_mbblock_error_xmm(MACROBLOCK *mb) { short *coeff_ptr = mb->block[0].coeff; short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff; - return vp9_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr, dc); + return vp9_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr); } int vp9_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
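For reference, with the dc argument gone, both vp9_mbblock_error_mmx_impl and vp9_mbblock_error_xmm_impl reduce to an unconditional sum of squared differences between the macroblock's luma coefficients and their dequantized counterparts. A minimal C sketch of that computation (the helper name, and the assumption that the 16 blocks of 16 coefficients are laid out contiguously from block[0], are illustrative, not part of the patch):

  static int mbblock_error_ref(const short *coeff, const short *dqcoeff) {
    int i, err = 0;
    // 16 blocks x 16 coefficients each, traversed as one contiguous array
    for (i = 0; i < 16 * 16; i++) {
      const int diff = coeff[i] - dqcoeff[i];
      err += diff * diff;  // squared reconstruction error
    }
    return err;
  }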