diff options
-rw-r--r-- | vp9/encoder/vp9_avg.c | 6 | ||||
-rw-r--r-- | vp9/encoder/vp9_encodeframe.c | 2 | ||||
-rw-r--r-- | vp9/encoder/vp9_encodemb.c | 12 | ||||
-rw-r--r-- | vp9/encoder/vp9_quantize.c | 49 | ||||
-rw-r--r-- | vp9/encoder/vp9_quantize.h | 6 | ||||
-rw-r--r-- | vp9/encoder/x86/vp9_avg_intrin_sse2.c | 15 |
6 files changed, 60 insertions, 30 deletions
diff --git a/vp9/encoder/vp9_avg.c b/vp9/encoder/vp9_avg.c index 50c8bca0b..90d113c32 100644 --- a/vp9/encoder/vp9_avg.c +++ b/vp9/encoder/vp9_avg.c @@ -32,12 +32,13 @@ unsigned int vp9_avg_4x4_c(const uint8_t *s, int p) { void vp9_int_pro_row_c(int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height) { int idx; + const int norm_factor = MAX(8, height >> 1); for (idx = 0; idx < 16; ++idx) { int i; hbuf[idx] = 0; for (i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride]; - hbuf[idx] /= 32; + hbuf[idx] /= norm_factor; ++ref; } } @@ -45,9 +46,10 @@ void vp9_int_pro_row_c(int16_t *hbuf, uint8_t const *ref, int16_t vp9_int_pro_col_c(uint8_t const *ref, const int width) { int idx; int16_t sum = 0; + const int norm_factor = MAX(8, width >> 1); for (idx = 0; idx < width; ++idx) sum += ref[idx]; - return sum / 32; + return sum / norm_factor; } int vp9_vector_var_c(int16_t const *ref, int16_t const *src, diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 65d8eaebf..2bdb9915c 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -3914,7 +3914,7 @@ static void encode_frame_internal(VP9_COMP *cpi) { // Special case: set prev_mi to NULL when the previous mode info // context cannot be used. cm->prev_mi = cm->use_prev_frame_mvs ? - cm->prev_mip + cm->mi_stride + 1 : NULL; + cm->prev_mip + cm->mi_stride + 1 : NULL; x->quant_fp = cpi->sf.use_quant_fp; vp9_zero(x->skip_txfm); diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 70b804e31..65e299793 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -476,19 +476,19 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, break; case TX_16X16: vp9_highbd_fdct16x16_1(src_diff, coeff, diff_stride); - vp9_highbd_quantize_dc(coeff, x->skip_block, p->round, + vp9_highbd_quantize_dc(coeff, 256, x->skip_block, p->round, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); break; case TX_8X8: vp9_highbd_fdct8x8_1(src_diff, coeff, diff_stride); - vp9_highbd_quantize_dc(coeff, x->skip_block, p->round, + vp9_highbd_quantize_dc(coeff, 64, x->skip_block, p->round, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); break; case TX_4X4: x->fwd_txm4x4(src_diff, coeff, diff_stride); - vp9_highbd_quantize_dc(coeff, x->skip_block, p->round, + vp9_highbd_quantize_dc(coeff, 16, x->skip_block, p->round, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); break; @@ -508,19 +508,19 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, break; case TX_16X16: vp9_fdct16x16_1(src_diff, coeff, diff_stride); - vp9_quantize_dc(coeff, x->skip_block, p->round, + vp9_quantize_dc(coeff, 256, x->skip_block, p->round, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); break; case TX_8X8: vp9_fdct8x8_1(src_diff, coeff, diff_stride); - vp9_quantize_dc(coeff, x->skip_block, p->round, + vp9_quantize_dc(coeff, 64, x->skip_block, p->round, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); break; case TX_4X4: x->fwd_txm4x4(src_diff, coeff, diff_stride); - vp9_quantize_dc(coeff, x->skip_block, p->round, + vp9_quantize_dc(coeff, 16, x->skip_block, p->round, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); break; diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index 7143987d4..2523d1ea3 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -19,7 +19,8 @@ #include "vp9/encoder/vp9_quantize.h" #include "vp9/encoder/vp9_rd.h" -void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block, +void vp9_quantize_dc(const tran_low_t *coeff_ptr, + int n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr) { @@ -29,6 +30,9 @@ void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block, const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; int tmp, eob = -1; + vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + if (!skip_block) { tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); tmp = (tmp * quant) >> 16; @@ -41,12 +45,16 @@ void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block, } #if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr, int skip_block, +void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr, + int n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr) { int eob = -1; + vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + if (!skip_block) { const int rc = 0; const int coeff = coeff_ptr[rc]; @@ -69,15 +77,20 @@ void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr) { + const int n_coeffs = 1024; const int rc = 0; const int coeff = coeff_ptr[rc]; const int coeff_sign = (coeff >> 31); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; int tmp, eob = -1; + vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + if (!skip_block) { - tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1), + INT16_MIN, INT16_MAX); tmp = (tmp * quant) >> 15; qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2; @@ -96,8 +109,12 @@ void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr) { + const int n_coeffs = 1024; int eob = -1; + vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + if (!skip_block) { const int rc = 0; const int coeff = coeff_ptr[rc]; @@ -105,8 +122,8 @@ void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; const int64_t tmp = - (clamp(abs_coeff + round_ptr[rc != 0], INT32_MIN, INT32_MAX) * - quant) >> 15; + (clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1), + INT32_MIN, INT32_MAX) * quant) >> 15; qcoeff_ptr[rc] = (tran_low_t)((tmp ^ coeff_sign) - coeff_sign); dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2; if (tmp) @@ -521,21 +538,21 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { vp9_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block), - 16, x->skip_block, - p->zbin, p->round, p->quant, p->quant_shift, - BLOCK_OFFSET(p->qcoeff, block), - BLOCK_OFFSET(pd->dqcoeff, block), - pd->dequant, &p->eobs[block], - scan, iscan); + 16, x->skip_block, + p->zbin, p->round, p->quant, p->quant_shift, + BLOCK_OFFSET(p->qcoeff, block), + BLOCK_OFFSET(pd->dqcoeff, block), + pd->dequant, &p->eobs[block], + scan, iscan); return; } #endif vp9_quantize_b(BLOCK_OFFSET(p->coeff, block), - 16, x->skip_block, - p->zbin, p->round, p->quant, p->quant_shift, - BLOCK_OFFSET(p->qcoeff, block), - BLOCK_OFFSET(pd->dqcoeff, block), - pd->dequant, &p->eobs[block], scan, iscan); + 16, x->skip_block, + p->zbin, p->round, p->quant, p->quant_shift, + BLOCK_OFFSET(p->qcoeff, block), + BLOCK_OFFSET(pd->dqcoeff, block), + pd->dequant, &p->eobs[block], scan, iscan); } static void invert_quant(int16_t *quant, int16_t *shift, int d) { diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h index de2839f5b..55e546944 100644 --- a/vp9/encoder/vp9_quantize.h +++ b/vp9/encoder/vp9_quantize.h @@ -37,7 +37,8 @@ typedef struct { DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]); } QUANTS; -void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block, +void vp9_quantize_dc(const tran_low_t *coeff_ptr, + int n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr); @@ -49,7 +50,8 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block, const int16_t *scan, const int16_t *iscan); #if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr, int skip_block, +void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr, + int n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr); diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vp9/encoder/x86/vp9_avg_intrin_sse2.c index 482fa3da3..f49949940 100644 --- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c +++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c @@ -90,8 +90,16 @@ void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref, s0 = _mm_adds_epu16(s0, t0); s1 = _mm_adds_epu16(s1, t1); - s0 = _mm_srai_epi16(s0, 5); - s1 = _mm_srai_epi16(s1, 5); + if (height == 64) { + s0 = _mm_srai_epi16(s0, 5); + s1 = _mm_srai_epi16(s1, 5); + } else if (height == 32) { + s0 = _mm_srai_epi16(s0, 4); + s1 = _mm_srai_epi16(s1, 4); + } else { + s0 = _mm_srai_epi16(s0, 3); + s1 = _mm_srai_epi16(s1, 3); + } _mm_store_si128((__m128i *)hbuf, s0); hbuf += 8; @@ -104,6 +112,7 @@ int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) { __m128i s0 = _mm_sad_epu8(src_line, zero); __m128i s1; int i; + const int norm_factor = 3 + (width >> 5); for (i = 16; i < width; i += 16) { ref += 16; @@ -115,7 +124,7 @@ int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) { s1 = _mm_srli_si128(s0, 8); s0 = _mm_adds_epu16(s0, s1); - return (_mm_extract_epi16(s0, 0)) >> 5; + return _mm_extract_epi16(s0, 0) >> norm_factor; } int vp9_vector_var_sse2(int16_t const *ref, int16_t const *src, |