summary | refs | log | tree | commit | diff
path: root/vp9
diff options
context:
space:
mode:
Diffstat (limited to 'vp9')
-rw-r--r-- vp9/common/x86/vp9_idct_intrin_sse2.c 17
-rw-r--r-- vp9/encoder/vp9_alt_ref_aq.c 2
-rw-r--r-- vp9/encoder/vp9_alt_ref_aq.h 2
-rw-r--r-- vp9/encoder/vp9_aq_cyclicrefresh.c 3
-rw-r--r-- vp9/encoder/vp9_encodeframe.c 12
-rw-r--r-- vp9/encoder/vp9_encodemb.c 285
-rw-r--r-- vp9/encoder/vp9_encoder.h 1
-rw-r--r-- vp9/encoder/vp9_pickmode.c 34
-rw-r--r-- vp9/encoder/vp9_ratectrl.c 5
-rw-r--r-- vp9/encoder/vp9_speed_features.c 7
-rw-r--r-- vp9/encoder/x86/temporal_filter_sse4.c 1
-rw-r--r-- vp9/encoder/x86/vp9_dct_intrin_sse2.c 78
-rw-r--r-- vp9/vp9_cx_iface.c 8
13 files changed, 341 insertions, 114 deletions
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index bb2dcf52b..7e8089b51 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -54,7 +54,6 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
__m128i in[8];
- const __m128i zero = _mm_setzero_si128();
const __m128i final_rounding = _mm_set1_epi16(1 << 4);
// load input data
@@ -106,14 +105,14 @@ void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
in[6] = _mm_srai_epi16(in[6], 5);
in[7] = _mm_srai_epi16(in[7], 5);
- RECON_AND_STORE(dest + 0 * stride, in[0]);
- RECON_AND_STORE(dest + 1 * stride, in[1]);
- RECON_AND_STORE(dest + 2 * stride, in[2]);
- RECON_AND_STORE(dest + 3 * stride, in[3]);
- RECON_AND_STORE(dest + 4 * stride, in[4]);
- RECON_AND_STORE(dest + 5 * stride, in[5]);
- RECON_AND_STORE(dest + 6 * stride, in[6]);
- RECON_AND_STORE(dest + 7 * stride, in[7]);
+ recon_and_store(dest + 0 * stride, in[0]);
+ recon_and_store(dest + 1 * stride, in[1]);
+ recon_and_store(dest + 2 * stride, in[2]);
+ recon_and_store(dest + 3 * stride, in[3]);
+ recon_and_store(dest + 4 * stride, in[4]);
+ recon_and_store(dest + 5 * stride, in[5]);
+ recon_and_store(dest + 6 * stride, in[6]);
+ recon_and_store(dest + 7 * stride, in[7]);
}
void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
diff --git a/vp9/encoder/vp9_alt_ref_aq.c b/vp9/encoder/vp9_alt_ref_aq.c
index 3aeefb584..acc3764c7 100644
--- a/vp9/encoder/vp9_alt_ref_aq.c
+++ b/vp9/encoder/vp9_alt_ref_aq.c
@@ -15,7 +15,7 @@ struct ALT_REF_AQ {
int dummy;
};
-struct ALT_REF_AQ *vp9_alt_ref_aq_create() {
+struct ALT_REF_AQ *vp9_alt_ref_aq_create(void) {
return (struct ALT_REF_AQ *)vpx_malloc(sizeof(struct ALT_REF_AQ));
}
diff --git a/vp9/encoder/vp9_alt_ref_aq.h b/vp9/encoder/vp9_alt_ref_aq.h
index 18acd8a85..e508cb44a 100644
--- a/vp9/encoder/vp9_alt_ref_aq.h
+++ b/vp9/encoder/vp9_alt_ref_aq.h
@@ -54,7 +54,7 @@ struct ALT_REF_AQ;
*
* \return Instance of the class
*/
-struct ALT_REF_AQ *vp9_alt_ref_aq_create();
+struct ALT_REF_AQ *vp9_alt_ref_aq_create(void);
/*!\brief Upload segmentation_map to self object
*
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index 048ea629f..2f2f0055a 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -425,9 +425,10 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
int target_refresh = 0;
double weight_segment_target = 0;
double weight_segment = 0;
+ int thresh_low_motion = (cm->width < 720) ? 55 : 20;
cr->apply_cyclic_refresh = 1;
if (cm->frame_type == KEY_FRAME || cpi->svc.temporal_layer_id > 0 ||
- (!cpi->use_svc && rc->avg_frame_low_motion < 55 &&
+ (!cpi->use_svc && rc->avg_frame_low_motion < thresh_low_motion &&
rc->frames_since_key > 40)) {
cr->apply_cyclic_refresh = 0;
return;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 6215e198c..2b694a389 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -489,8 +489,9 @@ static int set_vt_partitioning(VP9_COMP *cpi, MACROBLOCK *const x,
return 0;
}
-int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed, int width,
- int height, int content_state) {
+static int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed,
+ int width, int height,
+ int content_state) {
if (speed >= 8) {
if (width <= 640 && height <= 480)
return (5 * threshold_base) >> 2;
@@ -1022,6 +1023,9 @@ static void avg_source_sad(VP9_COMP *cpi, MACROBLOCK *x, int shift,
if (tmp_variance < (tmp_sse >> 3) && (tmp_sse - tmp_variance) > 10000)
x->content_state_sb = kLowVarHighSumdiff;
+ if (tmp_sad > (avg_source_sad_threshold << 1))
+ x->content_state_sb = kVeryHighSad;
+
if (cpi->content_state_sb_fd != NULL) {
if (tmp_sad < avg_source_sad_threshold2) {
// Cap the increment to 255.
@@ -1197,7 +1201,9 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
- x->sb_is_skin = skin_sb_split(cpi, x, low_res, mi_row, mi_col, force_split);
+ if (cpi->use_skin_detection)
+ x->sb_is_skin =
+ skin_sb_split(cpi, x, low_res, mi_row, mi_col, force_split);
d = xd->plane[0].dst.buf;
dp = xd->plane[0].dst.stride;
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 7e30499c5..d8ea92af0 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -49,19 +49,275 @@ void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
pd->dst.buf, pd->dst.stride);
}
-typedef struct vp9_token_state {
- int64_t error;
- int rate;
- int16_t next;
+static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
+ { 10, 6 }, { 8, 5 },
+};
+
+#define USE_GREEDY_OPTIMIZE_B 0
+
+#if USE_GREEDY_OPTIMIZE_B
+
+typedef struct {
int16_t token;
tran_low_t qc;
tran_low_t dqc;
- uint8_t best_index;
} vp9_token_state;
-static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
- { 10, 6 }, { 8, 5 },
-};
+// 'num' can be negative (evaluated twice); 'shift' must be non-negative.
+#define RIGHT_SHIFT_POSSIBLY_NEGATIVE(num, shift) \
+  (((num) >= 0) ? (num) >> (shift) : -((-(num)) >> (shift)))
+
+int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
+ int ctx) {  // Greedy RD trim of one block's coefficients; returns new eob.
+ MACROBLOCKD *const xd = &mb->e_mbd;
+ struct macroblock_plane *const p = &mb->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int ref = is_inter_block(xd->mi[0]);
+ vp9_token_state tokens[1025][2];  // [i][0]: input state; [i][1]: choice used if the block ends at i.
+ uint8_t token_cache[1024];  // vp9_pt_energy_class of the token at each coefficient position rc.
+ const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ const int eob = p->eobs[block];
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ const int default_eob = 16 << (tx_size << 1);
+ const int shift = (tx_size == TX_32X32);  // 1 for TX_32X32, 0 otherwise.
+ const int16_t *const dequant_ptr = pd->dequant;
+ const uint8_t *const band_translate = get_band_translate(tx_size);
+ const scan_order *const so = get_scan(xd, tx_size, plane_type, block);
+ const int16_t *const scan = so->scan;
+ const int16_t *const nb = so->neighbors;
+ const int64_t rdmult =
+ ((int64_t)mb->rdmult * plane_rd_mult[ref][plane_type]) >> 1;
+ const int64_t rddiv = mb->rddiv;
+ int64_t rd_cost0, rd_cost1;
+ int64_t rate0, rate1;
+ int16_t t0, t1;
+ int i, final_eob;
+#if CONFIG_VP9_HIGHBITDEPTH
+ const uint16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
+#else
+ const uint16_t *cat6_high_cost = vp9_get_high_cost_table(8);
+#endif
+ unsigned int(*const token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
+ mb->token_costs[tx_size][plane_type][ref];
+ unsigned int(*token_costs_cur)[2][COEFF_CONTEXTS][ENTROPY_TOKENS];
+ int64_t eob_cost0, eob_cost1;
+ const int ctx0 = ctx;
+ int64_t accu_rate = 0;
+ // Initialized to the worst possible error for the largest transform size.
+ // This ensures that it never goes negative.
+ int64_t accu_error = ((int64_t)1) << 50;
+ int64_t best_block_rd_cost = INT64_MAX;
+ int x_prev = 1;  // Previously selected quantized value; picks the token cost tree.
+ assert((!plane_type && !plane) || (plane_type && plane));
+ assert(eob <= default_eob);
+
+ for (i = 0; i < eob; i++) {  // Snapshot the incoming tokens and seed token_cache.
+ const int rc = scan[i];
+ int x = qcoeff[rc];
+ t0 = vp9_get_token(x);
+ tokens[i][0].qc = x;
+ tokens[i][0].token = t0;
+ tokens[i][0].dqc = dqcoeff[rc];
+ token_cache[rc] = vp9_pt_energy_class[t0];
+ }
+ tokens[eob][0].token = EOB_TOKEN;  // Sentinel entry after the last coefficient.
+ tokens[eob][0].qc = 0;
+ tokens[eob][0].dqc = 0;
+ tokens[eob][1] = tokens[eob][0];
+ final_eob = 0;
+
+ // Baseline RD cost: signal EOB immediately (corresponds to final_eob == 0).
+ token_costs_cur = token_costs + band_translate[0];
+ rate0 = (*token_costs_cur)[0][ctx0][EOB_TOKEN];
+ best_block_rd_cost = RDCOST(rdmult, rddiv, rate0, accu_error);
+
+ // For each token in scan order, pick one of two choices greedily:
+ // (i) First candidate: Keep current quantized value (x), OR
+ // (ii) Second candidate: Reduce the magnitude of the quantized value by 1 (x1).
+ for (i = 0; i < eob; i++) {
+ const int rc = scan[i];
+ const int x = qcoeff[rc];
+ const int band_cur = band_translate[i];
+ const int ctx_cur = (i == 0) ? ctx : get_coef_context(nb, token_cache, i);
+ const int token_tree_sel_cur = (x_prev == 0);
+ token_costs_cur = token_costs + band_cur;
+ if (x == 0) { // No need to search
+ rate0 =
+ (*token_costs_cur)[token_tree_sel_cur][ctx_cur][tokens[i][0].token];
+ accu_rate += rate0;
+ x_prev = 0;
+ // Note: accu_error does not change.
+ } else {
+ const int dqv = dequant_ptr[rc != 0];  // Entry 0 for position 0, entry 1 otherwise.
+ // Compute the distortion for quantizing to 0.
+ const int diff_for_zero_raw = (0 - coeff[rc]) * (1 << shift);
+ const int diff_for_zero =
+#if CONFIG_VP9_HIGHBITDEPTH
+ (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ ? RIGHT_SHIFT_POSSIBLY_NEGATIVE(diff_for_zero_raw, xd->bd - 8)
+ :
+#endif
+ diff_for_zero_raw;
+ const int64_t distortion_for_zero =
+ (int64_t)diff_for_zero * diff_for_zero;
+
+ // Compute the distortion for the first candidate
+ const int diff0_raw = (dqcoeff[rc] - coeff[rc]) * (1 << shift);
+ const int diff0 =
+#if CONFIG_VP9_HIGHBITDEPTH
+ (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ ? RIGHT_SHIFT_POSSIBLY_NEGATIVE(diff0_raw, xd->bd - 8)
+ :
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ diff0_raw;
+ const int64_t distortion0 = (int64_t)diff0 * diff0;
+
+ // Compute the distortion for the second candidate
+ const int sign = -(x < 0); // -1 if x is negative and 0 otherwise.
+ const int x1 = x - 2 * sign - 1; // abs(x1) = abs(x) - 1.
+ int64_t distortion1;
+ if (x1 != 0) {
+ const int dqv_step =
+#if CONFIG_VP9_HIGHBITDEPTH
+ (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? dqv >> (xd->bd - 8)
+ :
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ dqv;
+ const int diff_step = (dqv_step + sign) ^ sign;  // dqv_step negated when x is negative.
+ const int diff1 = diff0 - diff_step;
+ assert(dqv > 0); // We aren't right shifting a negative number above.
+ distortion1 = (int64_t)diff1 * diff1;
+ } else {
+ distortion1 = distortion_for_zero;
+ }
+ {
+ // Calculate the rate of the current coeff for the two candidates.
+ const int64_t base_bits0 = vp9_get_token_cost(x, &t0, cat6_high_cost);
+ const int64_t base_bits1 = vp9_get_token_cost(x1, &t1, cat6_high_cost);
+ rate0 =
+ base_bits0 + (*token_costs_cur)[token_tree_sel_cur][ctx_cur][t0];
+ rate1 =
+ base_bits1 + (*token_costs_cur)[token_tree_sel_cur][ctx_cur][t1];
+ }
+ {
+ int rdcost_better_for_x1, eob_rdcost_better_for_x1;
+ int dqc0, dqc1;
+ int64_t best_eob_cost_cur;
+
+ // Calculate RD Cost effect on the next coeff for the two candidates.
+ int64_t next_bits0 = 0;
+ int64_t next_bits1 = 0;
+ int64_t next_eob_bits0 = 0;
+ int64_t next_eob_bits1 = 0;
+ if (i < default_eob - 1) {  // A next scan position exists.
+ int ctx_next, token_tree_sel_next;
+ const int band_next = band_translate[i + 1];
+ unsigned int(
+ *const token_costs_next)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
+ token_costs + band_next;
+ token_cache[rc] = vp9_pt_energy_class[t0];  // Trial value: context if x is kept.
+ ctx_next = get_coef_context(nb, token_cache, i + 1);
+ token_tree_sel_next = (x == 0);
+ next_bits0 = (*token_costs_next)[token_tree_sel_next][ctx_next]
+ [tokens[i + 1][0].token];
+ next_eob_bits0 =
+ (*token_costs_next)[token_tree_sel_next][ctx_next][EOB_TOKEN];
+ token_cache[rc] = vp9_pt_energy_class[t1];  // Trial value: context if x1 is chosen.
+ ctx_next = get_coef_context(nb, token_cache, i + 1);
+ token_tree_sel_next = (x1 == 0);
+ next_bits1 = (*token_costs_next)[token_tree_sel_next][ctx_next]
+ [tokens[i + 1][0].token];
+ if (x1 != 0) {
+ next_eob_bits1 =
+ (*token_costs_next)[token_tree_sel_next][ctx_next][EOB_TOKEN];
+ }
+ }
+
+ // Compare the total RD costs for two candidates, both locally and as the last coeff before EOB.
+ rd_cost0 = RDCOST(rdmult, rddiv, (rate0 + next_bits0), distortion0);
+ rd_cost1 = RDCOST(rdmult, rddiv, (rate1 + next_bits1), distortion1);
+ rdcost_better_for_x1 = (rd_cost1 < rd_cost0);
+ eob_cost0 = RDCOST(rdmult, rddiv, (accu_rate + rate0 + next_eob_bits0),
+ (accu_error + distortion0 - distortion_for_zero));
+ eob_cost1 = eob_cost0;
+ if (x1 != 0) {
+ eob_cost1 =
+ RDCOST(rdmult, rddiv, (accu_rate + rate1 + next_eob_bits1),
+ (accu_error + distortion1 - distortion_for_zero));
+ eob_rdcost_better_for_x1 = (eob_cost1 < eob_cost0);
+ } else {
+ eob_rdcost_better_for_x1 = 0;
+ }
+
+ // Calculate the two candidate de-quantized values.
+ dqc0 = dqcoeff[rc];
+ dqc1 = 0;
+ if (rdcost_better_for_x1 + eob_rdcost_better_for_x1) {  // dqc1 is needed if either comparison picks x1.
+ if (x1 != 0) {
+ dqc1 = RIGHT_SHIFT_POSSIBLY_NEGATIVE(x1 * dqv, shift);
+ } else {
+ dqc1 = 0;
+ }
+ }
+
+ // Pick and record the better quantized and de-quantized values.
+ if (rdcost_better_for_x1) {
+ qcoeff[rc] = x1;
+ dqcoeff[rc] = dqc1;
+ accu_rate += rate1;
+ accu_error += distortion1 - distortion_for_zero;
+ assert(distortion1 <= distortion_for_zero);
+ token_cache[rc] = vp9_pt_energy_class[t1];
+ } else {
+ accu_rate += rate0;
+ accu_error += distortion0 - distortion_for_zero;
+ assert(distortion0 <= distortion_for_zero);
+ token_cache[rc] = vp9_pt_energy_class[t0];
+ }
+ assert(accu_error >= 0);
+ x_prev = qcoeff[rc]; // Update based on selected quantized value.
+
+ best_eob_cost_cur = eob_cost0;
+ tokens[i][1].token = t0;
+ tokens[i][1].qc = x;
+ tokens[i][1].dqc = dqc0;
+ if ((x1 != 0) && eob_rdcost_better_for_x1) {
+ best_eob_cost_cur = eob_cost1;
+ tokens[i][1].token = t1;
+ tokens[i][1].qc = x1;
+ tokens[i][1].dqc = dqc1;
+ }
+
+ // Determine whether to move the eob position to i+1
+ if (best_eob_cost_cur < best_block_rd_cost) {
+ best_block_rd_cost = best_eob_cost_cur;
+ final_eob = i + 1;
+ }
+ }
+ }
+ }
+ assert(final_eob <= eob);
+ if (final_eob > 0) {  // The last kept coefficient takes its eob-case candidate from tokens[i][1].
+ int rc;
+ assert(tokens[final_eob - 1][1].qc != 0);
+ i = final_eob - 1;
+ rc = scan[i];
+ qcoeff[rc] = tokens[i][1].qc;
+ dqcoeff[rc] = tokens[i][1].dqc;
+ }
+ for (i = final_eob; i < eob; i++) {  // Zero every coefficient from the new eob up to the old one.
+ int rc = scan[i];
+ qcoeff[rc] = 0;
+ dqcoeff[rc] = 0;
+ }
+ mb->plane[plane].eobs[block] = final_eob;
+ return final_eob;
+}
+#undef RIGHT_SHIFT_POSSIBLY_NEGATIVE
+
+#else
#define UPDATE_RD_COST() \
{ \
@@ -92,6 +348,17 @@ static const int16_t band_cum_count_table[TX_SIZES][8] = {
{ 0, 1, 3, 6, 10, 21, 256, 0 },
{ 0, 1, 3, 6, 10, 21, 1024, 0 },
};
+
+typedef struct vp9_token_state {
+ int64_t error;
+ int rate;
+ int16_t next;
+ int16_t token;
+ tran_low_t qc;
+ tran_low_t dqc;
+ uint8_t best_index;
+} vp9_token_state;
+
int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
int ctx) {
MACROBLOCKD *const xd = &mb->e_mbd;
@@ -327,6 +594,8 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
return final_eob;
}
+#endif // USE_GREEDY_OPTIMIZE_B
+
static INLINE void fdct32x32(int rd_transform, const int16_t *src,
tran_low_t *dst, int src_stride) {
if (rd_transform)
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 672c83bfd..7ab892000 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -138,6 +138,7 @@ typedef enum {
kHighSadLowSumdiff = 3,
kHighSadHighSumdiff = 4,
kLowVarHighSumdiff = 5,
+ kVeryHighSad = 6,
} CONTENT_STATE_SB;
typedef struct VP9EncoderConfig {
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 525523ade..17dc0637f 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -217,7 +217,8 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
}
if (rv && search_subpel) {
- const int subpel_force_stop = cpi->sf.mv.subpel_force_stop;
+ int subpel_force_stop = cpi->sf.mv.subpel_force_stop;
+ if (use_base_mv && cpi->sf.base_mv_aggressive) subpel_force_stop = 2;
cpi->find_fractional_mv_step(
x, &tmp_mv->as_mv, &ref_mv, cpi->common.allow_high_precision_mv,
x->errorperbit, &cpi->fn_ptr[bsize], subpel_force_stop,
@@ -1489,6 +1490,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
int force_skip_low_temp_var = 0;
int skip_ref_find_pred[4] = { 0 };
unsigned int sse_zeromv_normalized = UINT_MAX;
+ unsigned int best_sse_sofar = UINT_MAX;
unsigned int thresh_svc_skip_golden = 500;
#if CONFIG_VP9_TEMPORAL_DENOISING
VP9_PICKMODE_CTX_DEN ctx_den;
@@ -1615,7 +1617,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
if (cpi->oxcf.speed >= 8 && !cpi->use_svc &&
((cpi->rc.frames_since_golden + 1) < x->last_sb_high_content ||
- x->last_sb_high_content > 40))
+ x->last_sb_high_content > 40 || cpi->rc.frames_since_golden > 120))
usable_ref_frame = LAST_FRAME;
for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) {
@@ -1691,7 +1693,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
continue;
}
- if ((cpi->sf.short_circuit_low_temp_var >= 2 ||
+ if (x->content_state_sb != kVeryHighSad &&
+ (cpi->sf.short_circuit_low_temp_var >= 2 ||
(cpi->sf.short_circuit_low_temp_var == 1 && bsize == BLOCK_64X64)) &&
force_skip_low_temp_var && ref_frame == LAST_FRAME &&
this_mode == NEWMV) {
@@ -1786,17 +1789,29 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
} else if (svc->use_base_mv && svc->spatial_layer_id) {
if (frame_mv[NEWMV][ref_frame].as_int != INVALID_MV) {
const int pre_stride = xd->plane[0].pre[0].stride;
- int base_mv_sad = INT_MAX;
- const float base_mv_bias = sf->base_mv_aggressive ? 1.5f : 1.0f;
+ unsigned int base_mv_sse = UINT_MAX;
+ int scale = (cpi->rc.avg_frame_low_motion > 60) ? 2 : 4;
const uint8_t *const pre_buf =
xd->plane[0].pre[0].buf +
(frame_mv[NEWMV][ref_frame].as_mv.row >> 3) * pre_stride +
(frame_mv[NEWMV][ref_frame].as_mv.col >> 3);
- base_mv_sad = cpi->fn_ptr[bsize].sdf(
- x->plane[0].src.buf, x->plane[0].src.stride, pre_buf, pre_stride);
-
- if (base_mv_sad < (int)(base_mv_bias * x->pred_mv_sad[ref_frame])) {
+ cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride,
+ pre_buf, pre_stride, &base_mv_sse);
+ // Exit NEWMV search if base_mv_sse is large.
+ if (sf->base_mv_aggressive && base_mv_sse > (best_sse_sofar << scale))
+ continue;
+ if (base_mv_sse < (best_sse_sofar << 1)) {
// Base layer mv is good.
+ // Exit NEWMV search if the base_mv is (0, 0) and sse is low, since
+ // (0, 0) mode is already tested.
+ unsigned int base_mv_sse_normalized =
+ base_mv_sse >>
+ (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+ if (sf->base_mv_aggressive && base_mv_sse <= best_sse_sofar &&
+ base_mv_sse_normalized < 400 &&
+ frame_mv[NEWMV][ref_frame].as_mv.row == 0 &&
+ frame_mv[NEWMV][ref_frame].as_mv.col == 0)
+ continue;
if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
&frame_mv[NEWMV][ref_frame], &rate_mv,
best_rdc.rdcost, 1)) {
@@ -1942,6 +1957,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
sse_zeromv_normalized =
sse_y >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
}
+ if (sse_y < best_sse_sofar) best_sse_sofar = sse_y;
}
if (!this_early_term) {
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 27fea5d4e..1b5279412 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -209,7 +209,7 @@ int vp9_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
const int bpm =
(int)(vp9_rc_bits_per_mb(frame_type, q, correction_factor, bit_depth));
return VPXMAX(FRAME_OVERHEAD_BITS,
- (int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS);
+ (int)(((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS));
}
int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) {
@@ -2070,7 +2070,8 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) {
return resize_action;
}
-void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) {
+static void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi,
+ uint64_t avg_sad_current) {
VP9_COMMON *const cm = &cpi->common;
RATE_CONTROL *const rc = &cpi->rc;
int target;
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 9df4f80ec..4c864d46a 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -534,10 +534,7 @@ static void set_rt_speed_feature_framesize_independent(
if (cpi->svc.temporal_layer_id > 0) {
sf->adaptive_rd_thresh = 4;
sf->limit_newmv_early_exit = 0;
- sf->base_mv_aggressive =
- (cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1)
- ? 1
- : 0;
+ sf->base_mv_aggressive = 1;
}
}
@@ -589,7 +586,7 @@ static void set_rt_speed_feature_framesize_independent(
if (content == VP9E_CONTENT_SCREEN)
sf->mv.subpel_force_stop = 3;
- else if (cm->width * cm->height > 352 * 288) {
+ else if (cm->width * cm->height > 1280 * 720) {
sf->mv.subpel_force_stop = 2;
if (cpi->rc.avg_frame_low_motion > 87 && cm->current_video_frame > 30)
sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_EVENMORE;
diff --git a/vp9/encoder/x86/temporal_filter_sse4.c b/vp9/encoder/x86/temporal_filter_sse4.c
index be4cd8685..460dab659 100644
--- a/vp9/encoder/x86/temporal_filter_sse4.c
+++ b/vp9/encoder/x86/temporal_filter_sse4.c
@@ -11,6 +11,7 @@
#include <assert.h>
#include <smmintrin.h>
+#include "./vp9_rtcd.h"
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
diff --git a/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/vp9/encoder/x86/vp9_dct_intrin_sse2.c
index 09a1e48fc..969c60aba 100644
--- a/vp9/encoder/x86/vp9_dct_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_intrin_sse2.c
@@ -15,6 +15,7 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/x86/fwd_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
#include "vpx_ports/mem.h"
@@ -706,58 +707,6 @@ static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res,
store_output(&res[7], (output + 7 * stride));
}
-// perform in-place transpose
-static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
- const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
- const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
- const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
- const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
- const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
- const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
- const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
- const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- // 04 14 05 15 06 16 07 17
- // 24 34 25 35 26 36 27 37
- // 40 50 41 51 42 52 43 53
- // 60 70 61 71 62 72 63 73
- // 44 54 45 55 46 56 47 57
- // 64 74 65 75 66 76 67 77
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
- // 00 10 20 30 01 11 21 31
- // 40 50 60 70 41 51 61 71
- // 02 12 22 32 03 13 23 33
- // 42 52 62 72 43 53 63 73
- // 04 14 24 34 05 15 25 35
- // 44 54 64 74 45 55 65 75
- // 06 16 26 36 07 17 27 37
- // 46 56 66 76 47 57 67 77
- res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
- res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
- res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
- res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
- res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
- res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
- res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
- res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
-}
-
static void fdct8_sse2(__m128i *in) {
// constants
const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
@@ -895,7 +844,7 @@ static void fdct8_sse2(__m128i *in) {
in[7] = _mm_packs_epi32(v6, v7);
// transpose
- array_transpose_8x8(in, in);
+ transpose_16bit_8x8(in, in);
}
static void fadst8_sse2(__m128i *in) {
@@ -1125,7 +1074,7 @@ static void fadst8_sse2(__m128i *in) {
in[7] = _mm_sub_epi16(k__const_0, s1);
// transpose
- array_transpose_8x8(in, in);
+ transpose_16bit_8x8(in, in);
}
void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride,
@@ -1182,23 +1131,6 @@ static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0,
write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
}
-static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
- __m128i tbuf[8];
- array_transpose_8x8(res0, res0);
- array_transpose_8x8(res1, tbuf);
- array_transpose_8x8(res0 + 8, res1);
- array_transpose_8x8(res1 + 8, res1 + 8);
-
- res0[8] = tbuf[0];
- res0[9] = tbuf[1];
- res0[10] = tbuf[2];
- res0[11] = tbuf[3];
- res0[12] = tbuf[4];
- res0[13] = tbuf[5];
- res0[14] = tbuf[6];
- res0[15] = tbuf[7];
-}
-
static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
// perform rounding operations
right_shift_8x8(res0, 2);
@@ -2002,13 +1934,13 @@ static void fadst16_8col(__m128i *in) {
static void fdct16_sse2(__m128i *in0, __m128i *in1) {
fdct16_8col(in0);
fdct16_8col(in1);
- array_transpose_16x16(in0, in1);
+ transpose_16bit_16x16(in0, in1);
}
static void fadst16_sse2(__m128i *in0, __m128i *in1) {
fadst16_8col(in0);
fadst16_8col(in1);
- array_transpose_16x16(in0, in1);
+ transpose_16bit_16x16(in0, in1);
}
void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index bb6b30bd4..d18457f34 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -432,8 +432,12 @@ static void config_target_level(VP9EncoderConfig *oxcf) {
(int)vp9_level_defs[target_level_index].min_altref_distance) {
oxcf->min_gf_interval =
(int)vp9_level_defs[target_level_index].min_altref_distance + 1;
- oxcf->max_gf_interval =
- VPXMAX(oxcf->max_gf_interval, oxcf->min_gf_interval);
+ // If oxcf->max_gf_interval == 0, it will be assigned with a default value
+ // in vp9_rc_set_gf_interval_range().
+ if (oxcf->max_gf_interval != 0) {
+ oxcf->max_gf_interval =
+ VPXMAX(oxcf->max_gf_interval, oxcf->min_gf_interval);
+ }
}
// Adjust maximum column tiles.