diff options
-rw-r--r--   vp9/common/x86/vp9_intrapred_ssse3.asm | 177
-rw-r--r--   vp9/encoder/vp9_bitstream.c            |   4
-rw-r--r--   vp9/encoder/vp9_block.h                |  19
-rw-r--r--   vp9/encoder/vp9_dct.c                  |  50
-rw-r--r--   vp9/encoder/vp9_onyx_int.h             |   1
-rw-r--r--   vp9/encoder/vp9_rdopt.c                | 130
-rw-r--r--   vp9/vp9_cx_iface.c                     |  26
7 files changed, 303 insertions(+), 104 deletions(-)
diff --git a/vp9/common/x86/vp9_intrapred_ssse3.asm b/vp9/common/x86/vp9_intrapred_ssse3.asm index 8c03de7cd..dc483a01e 100644 --- a/vp9/common/x86/vp9_intrapred_ssse3.asm +++ b/vp9/common/x86/vp9_intrapred_ssse3.asm @@ -612,3 +612,180 @@ cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset mova [dstq+stride3q ], m4 RESTORE_GOT RET + +INIT_XMM ssse3 +cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset + GET_GOT goffsetq + mova m0, [leftq] + movu m7, [aboveq-1] + movu m1, [aboveq+15] + + pshufb m4, m1, [GLOBAL(sh_b123456789abcdeff)] + pshufb m6, m1, [GLOBAL(sh_b23456789abcdefff)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2 ; 3-tap avg above [high] + + palignr m3, m1, m7, 1 + palignr m5, m1, m7, 2 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg above [low] + + pshufb m7, [GLOBAL(sh_bfedcba9876543210)] + palignr m5, m0, m7, 15 + palignr m3, m0, m7, 14 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg + pavgb m5, m0 ; A1 - Ag + punpcklbw m6, m4, m5 ; A-B8 ... A-B1 + punpckhbw m4, m5 ; A-B9 ... 
A-Bg + pshufb m6, [GLOBAL(sh_bfedcba9876543210)] + pshufb m4, [GLOBAL(sh_bfedcba9876543210)] + + DEFINE_ARGS dst, stride, stride3, left, line + lea stride3q, [strideq*3] + + palignr m5, m2, m1, 14 + palignr m7, m1, m6, 14 + mova [dstq ], m7 + mova [dstq+16 ], m5 + palignr m5, m2, m1, 12 + palignr m7, m1, m6, 12 + mova [dstq+strideq ], m7 + mova [dstq+strideq+16 ], m5 + palignr m5, m2, m1, 10 + palignr m7, m1, m6, 10 + mova [dstq+strideq*2 ], m7 + mova [dstq+strideq*2+16], m5 + palignr m5, m2, m1, 8 + palignr m7, m1, m6, 8 + mova [dstq+stride3q ], m7 + mova [dstq+stride3q+16 ], m5 + lea dstq, [dstq+strideq*4] + palignr m5, m2, m1, 6 + palignr m7, m1, m6, 6 + mova [dstq ], m7 + mova [dstq+16 ], m5 + palignr m5, m2, m1, 4 + palignr m7, m1, m6, 4 + mova [dstq+strideq ], m7 + mova [dstq+strideq+16 ], m5 + palignr m5, m2, m1, 2 + palignr m7, m1, m6, 2 + mova [dstq+strideq*2 ], m7 + mova [dstq+strideq*2+16], m5 + mova [dstq+stride3q ], m6 + mova [dstq+stride3q+16 ], m1 + lea dstq, [dstq+strideq*4] + + palignr m5, m1, m6, 14 + palignr m3, m6, m4, 14 + mova [dstq ], m3 + mova [dstq+16 ], m5 + palignr m5, m1, m6, 12 + palignr m3, m6, m4, 12 + mova [dstq+strideq ], m3 + mova [dstq+strideq+16 ], m5 + palignr m5, m1, m6, 10 + palignr m3, m6, m4, 10 + mova [dstq+strideq*2 ], m3 + mova [dstq+strideq*2+16], m5 + palignr m5, m1, m6, 8 + palignr m3, m6, m4, 8 + mova [dstq+stride3q ], m3 + mova [dstq+stride3q+16 ], m5 + lea dstq, [dstq+strideq*4] + palignr m5, m1, m6, 6 + palignr m3, m6, m4, 6 + mova [dstq ], m3 + mova [dstq+16 ], m5 + palignr m5, m1, m6, 4 + palignr m3, m6, m4, 4 + mova [dstq+strideq ], m3 + mova [dstq+strideq+16 ], m5 + palignr m5, m1, m6, 2 + palignr m3, m6, m4, 2 + mova [dstq+strideq*2 ], m3 + mova [dstq+strideq*2+16], m5 + mova [dstq+stride3q ], m4 + mova [dstq+stride3q+16 ], m6 + lea dstq, [dstq+strideq*4] + + mova m7, [leftq] + mova m3, [leftq+16] + palignr m5, m3, m7, 15 + palignr m0, m3, m7, 14 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2 ; 3-tap avg Bh 
- + pavgb m5, m3 ; Ah - + punpcklbw m3, m2, m5 ; A-B8 ... A-B1 + punpckhbw m2, m5 ; A-B9 ... A-Bg + pshufb m3, [GLOBAL(sh_bfedcba9876543210)] + pshufb m2, [GLOBAL(sh_bfedcba9876543210)] + + palignr m7, m6, m4, 14 + palignr m0, m4, m3, 14 + mova [dstq ], m0 + mova [dstq+16 ], m7 + palignr m7, m6, m4, 12 + palignr m0, m4, m3, 12 + mova [dstq+strideq ], m0 + mova [dstq+strideq+16 ], m7 + palignr m7, m6, m4, 10 + palignr m0, m4, m3, 10 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m7 + palignr m7, m6, m4, 8 + palignr m0, m4, m3, 8 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q+16 ], m7 + lea dstq, [dstq+strideq*4] + palignr m7, m6, m4, 6 + palignr m0, m4, m3, 6 + mova [dstq ], m0 + mova [dstq+16 ], m7 + palignr m7, m6, m4, 4 + palignr m0, m4, m3, 4 + mova [dstq+strideq ], m0 + mova [dstq+strideq+16 ], m7 + palignr m7, m6, m4, 2 + palignr m0, m4, m3, 2 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m7 + mova [dstq+stride3q ], m3 + mova [dstq+stride3q+16 ], m4 + lea dstq, [dstq+strideq*4] + + palignr m7, m4, m3, 14 + palignr m0, m3, m2, 14 + mova [dstq ], m0 + mova [dstq+16 ], m7 + palignr m7, m4, m3, 12 + palignr m0, m3, m2, 12 + mova [dstq+strideq ], m0 + mova [dstq+strideq+16 ], m7 + palignr m7, m4, m3, 10 + palignr m0, m3, m2, 10 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m7 + palignr m7, m4, m3, 8 + palignr m0, m3, m2, 8 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q+16 ], m7 + lea dstq, [dstq+strideq*4] + palignr m7, m4, m3, 6 + palignr m0, m3, m2, 6 + mova [dstq ], m0 + mova [dstq+16 ], m7 + palignr m7, m4, m3, 4 + palignr m0, m3, m2, 4 + mova [dstq+strideq ], m0 + mova [dstq+strideq+16 ], m7 + palignr m7, m4, m3, 2 + palignr m0, m3, m2, 2 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m7 + mova [dstq+stride3q ], m2 + mova [dstq+stride3q+16 ], m3 + + RESTORE_GOT + RET diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index 7619d76d4..2f59d333a 100644 --- a/vp9/encoder/vp9_bitstream.c +++ 
b/vp9/encoder/vp9_bitstream.c @@ -388,8 +388,8 @@ static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) { mi->ref_frame[0]); } - // if using the prediction mdoel we have nothing further to do because - // the reference frame is fully coded by the segment + // If using the prediction model we have nothing further to do because + // the reference frame is fully coded by the segment. } static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 6314b6009..a63bd1b8a 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -178,4 +178,23 @@ struct macroblock { int y_blocks); }; +struct rdcost_block_args { + MACROBLOCK *x; + ENTROPY_CONTEXT t_above[16]; + ENTROPY_CONTEXT t_left[16]; + TX_SIZE tx_size; + int bw; + int bh; + int rate; + int64_t dist; + int64_t sse; + int this_rate; + int64_t this_dist; + int64_t this_sse; + int64_t this_rd; + int64_t best_rd; + int skip; + const int16_t *scan, *nb; +}; + #endif // VP9_ENCODER_VP9_BLOCK_H_ diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c index 27e4cd07f..3008e46dd 100644 --- a/vp9/encoder/vp9_dct.c +++ b/vp9/encoder/vp9_dct.c @@ -17,7 +17,7 @@ #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_idct.h" -static void fdct4_1d(int16_t *input, int16_t *output) { +static void fdct4(int16_t *input, int16_t *output) { int16_t step[4]; int temp1, temp2; @@ -102,7 +102,7 @@ void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int pitch) { } } -static void fadst4_1d(int16_t *input, int16_t *output) { +static void fadst4(int16_t *input, int16_t *output) { int x0, x1, x2, x3; int s0, s1, s2, s3, s4, s5, s6, s7; @@ -143,10 +143,10 @@ static void fadst4_1d(int16_t *input, int16_t *output) { } static const transform_2d FHT_4[] = { - { fdct4_1d, fdct4_1d }, // DCT_DCT = 0 - { fadst4_1d, fdct4_1d }, // ADST_DCT = 1 - { fdct4_1d, fadst4_1d }, // DCT_ADST = 2 - { fadst4_1d, fadst4_1d } // ADST_ADST = 3 + { fdct4, 
fdct4 }, // DCT_DCT = 0 + { fadst4, fdct4 }, // ADST_DCT = 1 + { fdct4, fadst4 }, // DCT_ADST = 2 + { fadst4, fadst4 } // ADST_ADST = 3 }; void vp9_short_fht4x4_c(int16_t *input, int16_t *output, @@ -183,7 +183,7 @@ void vp9_short_fdct8x4_c(int16_t *input, int16_t *output, int pitch) { vp9_short_fdct4x4_c(input + 4, output + 16, pitch); } -static void fdct8_1d(int16_t *input, int16_t *output) { +static void fdct8(int16_t *input, int16_t *output) { /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; /*needs32*/ int t0, t1, t2, t3; /*canbe16*/ int x0, x1, x2, x3; @@ -198,7 +198,7 @@ static void fdct8_1d(int16_t *input, int16_t *output) { s6 = input[1] - input[6]; s7 = input[0] - input[7]; - // fdct4_1d(step, step); + // fdct4(step, step); x0 = s0 + s3; x1 = s1 + s2; x2 = s1 - s2; @@ -259,7 +259,7 @@ void vp9_short_fdct8x8_c(int16_t *input, int16_t *final_output, int pitch) { s6 = (input[1 * stride] - input[6 * stride]) * 4; s7 = (input[0 * stride] - input[7 * stride]) * 4; - // fdct4_1d(step, step); + // fdct4(step, step); x0 = s0 + s3; x1 = s1 + s2; x2 = s1 - s2; @@ -301,7 +301,7 @@ void vp9_short_fdct8x8_c(int16_t *input, int16_t *final_output, int pitch) { // Rows for (i = 0; i < 8; ++i) { - fdct8_1d(&intermediate[i * 8], &final_output[i * 8]); + fdct8(&intermediate[i * 8], &final_output[i * 8]); for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2; } @@ -368,7 +368,7 @@ void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) { step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2); step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2); } - // Work on the first eight values; fdct8_1d(input, even_results); + // Work on the first eight values; fdct8(input, even_results); { /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; /*needs32*/ int t0, t1, t2, t3; @@ -384,7 +384,7 @@ void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) { s6 = input[1] - input[6]; s7 = input[0] - input[7]; - // fdct4_1d(step, step); + // fdct4(step, 
step); x0 = s0 + s3; x1 = s1 + s2; x2 = s1 - s2; @@ -486,7 +486,7 @@ void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) { } } -static void fadst8_1d(int16_t *input, int16_t *output) { +static void fadst8(int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; int x0 = input[7]; @@ -558,10 +558,10 @@ static void fadst8_1d(int16_t *input, int16_t *output) { } static const transform_2d FHT_8[] = { - { fdct8_1d, fdct8_1d }, // DCT_DCT = 0 - { fadst8_1d, fdct8_1d }, // ADST_DCT = 1 - { fdct8_1d, fadst8_1d }, // DCT_ADST = 2 - { fadst8_1d, fadst8_1d } // ADST_ADST = 3 + { fdct8, fdct8 }, // DCT_DCT = 0 + { fadst8, fdct8 }, // ADST_DCT = 1 + { fdct8, fadst8 }, // DCT_ADST = 2 + { fadst8, fadst8 } // ADST_ADST = 3 }; void vp9_short_fht8x8_c(int16_t *input, int16_t *output, @@ -654,7 +654,7 @@ void vp9_short_walsh8x4_c(int16_t *input, int16_t *output, int pitch) { // Rewrote to use same algorithm as others. -static void fdct16_1d(int16_t in[16], int16_t out[16]) { +static void fdct16(int16_t in[16], int16_t out[16]) { /*canbe16*/ int step1[8]; /*canbe16*/ int step2[8]; /*canbe16*/ int step3[8]; @@ -680,7 +680,7 @@ static void fdct16_1d(int16_t in[16], int16_t out[16]) { step1[6] = in[1] - in[14]; step1[7] = in[0] - in[15]; - // fdct8_1d(step, step); + // fdct8(step, step); { /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; /*needs32*/ int t0, t1, t2, t3; @@ -696,7 +696,7 @@ static void fdct16_1d(int16_t in[16], int16_t out[16]) { s6 = input[1] - input[6]; s7 = input[0] - input[7]; - // fdct4_1d(step, step); + // fdct4(step, step); x0 = s0 + s3; x1 = s1 + s2; x2 = s1 - s2; @@ -795,7 +795,7 @@ static void fdct16_1d(int16_t in[16], int16_t out[16]) { out[15] = dct_const_round_shift(temp2); } -void fadst16_1d(int16_t *input, int16_t *output) { +void fadst16(int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; int x0 = input[15]; @@ -958,10 +958,10 @@ void fadst16_1d(int16_t *input, int16_t 
*output) { } static const transform_2d FHT_16[] = { - { fdct16_1d, fdct16_1d }, // DCT_DCT = 0 - { fadst16_1d, fdct16_1d }, // ADST_DCT = 1 - { fdct16_1d, fadst16_1d }, // DCT_ADST = 2 - { fadst16_1d, fadst16_1d } // ADST_ADST = 3 + { fdct16, fdct16 }, // DCT_DCT = 0 + { fadst16, fdct16 }, // ADST_DCT = 1 + { fdct16, fadst16 }, // DCT_ADST = 2 + { fadst16, fadst16 } // ADST_ADST = 3 }; void vp9_short_fht16x16_c(int16_t *input, int16_t *output, diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h index b271f55b9..d706ec71b 100644 --- a/vp9/encoder/vp9_onyx_int.h +++ b/vp9/encoder/vp9_onyx_int.h @@ -314,6 +314,7 @@ typedef struct VP9_COMP { MACROBLOCK mb; VP9_COMMON common; VP9_CONFIG oxcf; + struct rdcost_block_args rdcost_stack; struct lookahead_ctx *lookahead; struct lookahead_entry *source; diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index dce73ee71..5ba3ec8ad 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -535,25 +535,6 @@ static INLINE int cost_coeffs(MACROBLOCK *mb, return cost; } -struct rdcost_block_args { - MACROBLOCK *x; - ENTROPY_CONTEXT t_above[16]; - ENTROPY_CONTEXT t_left[16]; - TX_SIZE tx_size; - int bw; - int bh; - int rate[256]; - int64_t dist[256]; - int64_t sse[256]; - int this_rate; - int64_t this_dist; - int64_t this_sse; - int64_t this_rd; - int64_t best_rd; - int skip; - const int16_t *scan, *nb; -}; - static void dist_block(int plane, int block, TX_SIZE tx_size, void *arg) { const int ss_txfrm_size = tx_size << 1; struct rdcost_block_args* args = arg; @@ -565,17 +546,17 @@ static void dist_block(int plane, int block, TX_SIZE tx_size, void *arg) { int shift = args->tx_size == TX_32X32 ? 
0 : 2; int16_t *const coeff = BLOCK_OFFSET(p->coeff, block); int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - args->dist[block] = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size, - &this_sse) >> shift; - args->sse[block] = this_sse >> shift; + args->dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size, + &this_sse) >> shift; + args->sse = this_sse >> shift; if (x->skip_encode && xd->this_mi->mbmi.ref_frame[0] == INTRA_FRAME) { // TODO(jingning): tune the model to better capture the distortion. int64_t p = (pd->dequant[1] * pd->dequant[1] * (1 << ss_txfrm_size)) >> (shift + 2); - args->dist[block] += (p >> 4); - args->sse[block] += p; + args->dist += (p >> 4); + args->sse += p; } } @@ -586,10 +567,9 @@ static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize, int x_idx, y_idx; txfrm_block_to_raster_xy(plane_bsize, args->tx_size, block, &x_idx, &y_idx); - args->rate[block] = cost_coeffs(args->x, plane, block, - args->t_above + x_idx, - args->t_left + y_idx, args->tx_size, - args->scan, args->nb); + args->rate = cost_coeffs(args->x, plane, block, args->t_above + x_idx, + args->t_left + y_idx, args->tx_size, + args->scan, args->nb); } static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, @@ -610,17 +590,17 @@ static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, dist_block(plane, block, tx_size, args); rate_block(plane, block, plane_bsize, tx_size, args); - rd1 = RDCOST(x->rdmult, x->rddiv, args->rate[block], args->dist[block]); - rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse[block]); + rd1 = RDCOST(x->rdmult, x->rddiv, args->rate, args->dist); + rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse); // TODO(jingning): temporarily enabled only for luma component rd = MIN(rd1, rd2); if (plane == 0) x->zcoeff_blk[tx_size][block] = rd1 > rd2; - args->this_rate += args->rate[block]; - args->this_dist += args->dist[block]; - args->this_sse += args->sse[block]; + args->this_rate += args->rate; + 
args->this_dist += args->dist; + args->this_sse += args->sse; args->this_rd += rd; if (args->this_rd > args->best_rd) { @@ -662,7 +642,20 @@ void vp9_get_entropy_contexts(TX_SIZE tx_size, } } +static void init_rdcost_stack(MACROBLOCK *x, TX_SIZE tx_size, + const int num_4x4_w, const int num_4x4_h, + const int64_t ref_rdcost, + struct rdcost_block_args *arg) { + vpx_memset(arg, 0, sizeof(struct rdcost_block_args)); + arg->x = x; + arg->tx_size = tx_size; + arg->bw = num_4x4_w; + arg->bh = num_4x4_h; + arg->best_rd = ref_rdcost; +} + static void txfm_rd_in_plane(MACROBLOCK *x, + struct rdcost_block_args *rd_stack, int *rate, int64_t *distortion, int *skippable, int64_t *sse, int64_t ref_best_rd, int plane, @@ -674,30 +667,29 @@ static void txfm_rd_in_plane(MACROBLOCK *x, const int num_4x4_h = num_4x4_blocks_high_lookup[bs]; const uint8_t *band_translate; // just for the get_scan_and_band call - struct rdcost_block_args args = { x, { 0 }, { 0 }, tx_size, - num_4x4_w, num_4x4_h, - { 0 }, { 0 }, { 0 }, - 0, 0, 0, 0, ref_best_rd, 0 }; + init_rdcost_stack(x, tx_size, num_4x4_w, num_4x4_h, + ref_best_rd, rd_stack); if (plane == 0) xd->this_mi->mbmi.tx_size = tx_size; - vp9_get_entropy_contexts(tx_size, args.t_above, args.t_left, + vp9_get_entropy_contexts(tx_size, rd_stack->t_above, rd_stack->t_left, pd->above_context, pd->left_context, num_4x4_w, num_4x4_h); - get_scan_and_band(xd, tx_size, pd->plane_type, 0, &args.scan, &args.nb, - &band_translate); + get_scan_and_band(xd, tx_size, pd->plane_type, 0, &rd_stack->scan, + &rd_stack->nb, &band_translate); - foreach_transformed_block_in_plane(xd, bsize, plane, block_yrd_txfm, &args); - if (args.skip) { + foreach_transformed_block_in_plane(xd, bsize, plane, + block_yrd_txfm, rd_stack); + if (rd_stack->skip) { *rate = INT_MAX; *distortion = INT64_MAX; *sse = INT64_MAX; *skippable = 0; } else { - *distortion = args.this_dist; - *rate = args.this_rate; - *sse = args.this_sse; + *distortion = rd_stack->this_dist; + *rate = 
rd_stack->this_rate; + *sse = rd_stack->this_sse; *skippable = vp9_is_skippable_in_plane(xd, bsize, plane); } } @@ -725,7 +717,7 @@ static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x, } else { mbmi->tx_size = TX_4X4; } - txfm_rd_in_plane(x, rate, distortion, skip, + txfm_rd_in_plane(x, &cpi->rdcost_stack, rate, distortion, skip, &sse[mbmi->tx_size], ref_best_rd, 0, bs, mbmi->tx_size); cpi->tx_stepdown_count[0]++; @@ -909,8 +901,8 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, // Actually encode using the chosen mode if a model was used, but do not // update the r, d costs - txfm_rd_in_plane(x, rate, distortion, skip, &sse[mbmi->tx_size], - ref_best_rd, 0, bs, mbmi->tx_size); + txfm_rd_in_plane(x, &cpi->rdcost_stack, rate, distortion, skip, + &sse[mbmi->tx_size], ref_best_rd, 0, bs, mbmi->tx_size); if (max_tx_size == TX_32X32 && rd[TX_32X32][1] <= rd[TX_16X16][1] && @@ -937,6 +929,7 @@ static void super_block_yrd(VP9_COMP *cpi, int64_t d[TX_SIZES], sse[TX_SIZES]; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; + struct rdcost_block_args *rdcost_stack = &cpi->rdcost_stack; assert(bs == mbmi->sb_type); if (mbmi->ref_frame[0] > INTRA_FRAME) @@ -972,14 +965,16 @@ static void super_block_yrd(VP9_COMP *cpi, skip, sse, ref_best_rd, bs); } else { if (bs >= BLOCK_32X32) - txfm_rd_in_plane(x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32], - &sse[TX_32X32], ref_best_rd, 0, bs, TX_32X32); + txfm_rd_in_plane(x, rdcost_stack, &r[TX_32X32][0], &d[TX_32X32], + &s[TX_32X32], &sse[TX_32X32], + ref_best_rd, 0, bs, TX_32X32); if (bs >= BLOCK_16X16) - txfm_rd_in_plane(x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16], - &sse[TX_16X16], ref_best_rd, 0, bs, TX_16X16); - txfm_rd_in_plane(x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], + txfm_rd_in_plane(x, rdcost_stack, &r[TX_16X16][0], &d[TX_16X16], + &s[TX_16X16], &sse[TX_16X16], + ref_best_rd, 0, bs, TX_16X16); + txfm_rd_in_plane(x, rdcost_stack, &r[TX_8X8][0], &d[TX_8X8], 
&s[TX_8X8], &sse[TX_8X8], ref_best_rd, 0, bs, TX_8X8); - txfm_rd_in_plane(x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], + txfm_rd_in_plane(x, rdcost_stack, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], &sse[TX_4X4], ref_best_rd, 0, bs, TX_4X4); choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache, bs); @@ -1289,7 +1284,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, return best_rd; } -static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x, +static void super_block_uvrd(VP9_COMP *const cpi, MACROBLOCK *x, int *rate, int64_t *distortion, int *skippable, int64_t *sse, BLOCK_SIZE bsize, int64_t ref_best_rd) { @@ -1312,7 +1307,7 @@ static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x, *skippable = 1; for (plane = 1; plane < MAX_MB_PLANE; ++plane) { - txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse, + txfm_rd_in_plane(x, &cpi->rdcost_stack, &pnrate, &pndist, &pnskip, &pnsse, ref_best_rd, plane, bsize, uv_txfm_size); if (pnrate == INT_MAX) goto term; @@ -1351,7 +1346,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode; - super_block_uvrd(&cpi->common, x, &this_rate_tokenonly, + super_block_uvrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, &this_sse, bsize, best_rd); if (this_rate_tokenonly == INT_MAX) continue; @@ -1382,8 +1377,8 @@ static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x, int64_t this_sse; x->e_mbd.mi_8x8[0]->mbmi.uv_mode = DC_PRED; - super_block_uvrd(&cpi->common, x, rate_tokenonly, - distortion, skippable, &this_sse, bsize, INT64_MAX); + super_block_uvrd(cpi, x, rate_tokenonly, distortion, + skippable, &this_sse, bsize, INT64_MAX); *rate = *rate_tokenonly + x->intra_uv_mode_cost[cpi->common.frame_type][DC_PRED]; this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *distortion); @@ -3008,7 +3003,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion); 
rdcosty = MIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse)); - super_block_uvrd(cm, x, rate_uv, distortion_uv, &skippable_uv, &sseuv, + super_block_uvrd(cpi, x, rate_uv, distortion_uv, &skippable_uv, &sseuv, bsize, ref_best_rd - rdcosty); if (*rate_uv == INT_MAX) { *rate2 = INT_MAX; @@ -3638,10 +3633,17 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // values, which actually are bigger than this_rd itself. This can // cause negative best_filter_rd[] values, which is obviously silly. // Therefore, if filter_cache < ref, we do an adjusted calculation. - if (cpi->rd_filter_cache[i] >= ref) + if (cpi->rd_filter_cache[i] >= ref) { adj_rd = this_rd + cpi->rd_filter_cache[i] - ref; - else // FIXME(rbultje) do this for comppred also - adj_rd = this_rd - (ref - cpi->rd_filter_cache[i]) * this_rd / ref; + } else { + // FIXME(rbultje) do this for comppsred also + // + // To prevent out-of-range computation in + // adj_rd = cpi->rd_filter_cache[i] * this_rd / ref + // cpi->rd_filter_cache[i] / ref is converted to a 256 based ratio. 
+ int tmp = cpi->rd_filter_cache[i] * 256 / ref; + adj_rd = (this_rd * tmp) >> 8; + } best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd); } } @@ -4190,7 +4192,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, // then dont bother looking at UV vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col, BLOCK_8X8); - super_block_uvrd(cm, x, &rate_uv, &distortion_uv, &uv_skippable, + super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable, &uv_sse, BLOCK_8X8, tmp_best_rdu); if (rate_uv == INT_MAX) continue; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index e58debfd8..810fdf51f 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -267,11 +267,11 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf, } if (cfg.g_pass == VPX_RC_FIRST_PASS) { - oxcf->allow_lag = 0; - oxcf->lag_in_frames = 0; + oxcf->allow_lag = 0; + oxcf->lag_in_frames = 0; } else { - oxcf->allow_lag = (cfg.g_lag_in_frames) > 0; - oxcf->lag_in_frames = cfg.g_lag_in_frames; + oxcf->allow_lag = (cfg.g_lag_in_frames) > 0; + oxcf->lag_in_frames = cfg.g_lag_in_frames; } // VBR only supported for now. 
@@ -283,7 +283,7 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf, else if (cfg.rc_end_usage == VPX_Q) oxcf->end_usage = USAGE_CONSTANT_QUALITY; - oxcf->target_bandwidth = cfg.rc_target_bitrate; + oxcf->target_bandwidth = cfg.rc_target_bitrate; oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct; oxcf->best_allowed_q = cfg.rc_min_quantizer; @@ -298,7 +298,7 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf, oxcf->starting_buffer_level = cfg.rc_buf_initial_sz; oxcf->optimal_buffer_level = cfg.rc_buf_optimal_sz; - oxcf->two_pass_vbrbias = cfg.rc_2pass_vbr_bias_pct; + oxcf->two_pass_vbrbias = cfg.rc_2pass_vbr_bias_pct; oxcf->two_pass_vbrmin_section = cfg.rc_2pass_vbr_minsection_pct; oxcf->two_pass_vbrmax_section = cfg.rc_2pass_vbr_maxsection_pct; @@ -314,23 +314,23 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf, oxcf->encode_breakout = vp8_cfg.static_thresh; oxcf->play_alternate = vp8_cfg.enable_auto_alt_ref; oxcf->noise_sensitivity = vp8_cfg.noise_sensitivity; - oxcf->Sharpness = vp8_cfg.Sharpness; + oxcf->Sharpness = vp8_cfg.Sharpness; - oxcf->two_pass_stats_in = cfg.rc_twopass_stats_in; - oxcf->output_pkt_list = vp8_cfg.pkt_list; + oxcf->two_pass_stats_in = cfg.rc_twopass_stats_in; + oxcf->output_pkt_list = vp8_cfg.pkt_list; oxcf->arnr_max_frames = vp8_cfg.arnr_max_frames; - oxcf->arnr_strength = vp8_cfg.arnr_strength; - oxcf->arnr_type = vp8_cfg.arnr_type; + oxcf->arnr_strength = vp8_cfg.arnr_strength; + oxcf->arnr_type = vp8_cfg.arnr_type; oxcf->tuning = vp8_cfg.tuning; oxcf->tile_columns = vp8_cfg.tile_columns; - oxcf->tile_rows = vp8_cfg.tile_rows; + oxcf->tile_rows = vp8_cfg.tile_rows; oxcf->lossless = vp8_cfg.lossless; - oxcf->error_resilient_mode = cfg.g_error_resilient; + oxcf->error_resilient_mode = cfg.g_error_resilient; oxcf->frame_parallel_decoding_mode = vp8_cfg.frame_parallel_decoding_mode; oxcf->ss_number_layers = cfg.ss_number_layers; |