diff options
Diffstat (limited to 'vp9')
-rw-r--r-- | vp9/common/vp9_loopfilter.c | 106 | ||||
-rw-r--r-- | vp9/common/vp9_onyxc_int.h | 5 | ||||
-rw-r--r-- | vp9/decoder/vp9_decodeframe.c | 3 | ||||
-rw-r--r-- | vp9/encoder/vp9_aq_cyclicrefresh.c | 96 | ||||
-rw-r--r-- | vp9/encoder/vp9_aq_cyclicrefresh.h | 12 | ||||
-rw-r--r-- | vp9/encoder/vp9_denoiser.c | 12 | ||||
-rw-r--r-- | vp9/encoder/vp9_encodeframe.c | 48 | ||||
-rw-r--r-- | vp9/encoder/vp9_encoder.h | 2 | ||||
-rw-r--r-- | vp9/encoder/vp9_pickmode.c | 33 | ||||
-rw-r--r-- | vp9/encoder/vp9_quantize.c | 12 | ||||
-rw-r--r-- | vp9/encoder/x86/vp9_dct_sse2.c | 51 | ||||
-rw-r--r-- | vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm | 67 |
12 files changed, 270 insertions, 177 deletions
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c index 2101ec58c..676382dc8 100644 --- a/vp9/common/vp9_loopfilter.c +++ b/vp9/common/vp9_loopfilter.c @@ -34,10 +34,10 @@ // // A loopfilter should be applied to every other 8x8 horizontally. static const uint64_t left_64x64_txform_mask[TX_SIZES]= { - 0xffffffffffffffff, // TX_4X4 - 0xffffffffffffffff, // TX_8x8 - 0x5555555555555555, // TX_16x16 - 0x1111111111111111, // TX_32x32 + 0xffffffffffffffffULL, // TX_4X4 + 0xffffffffffffffffULL, // TX_8x8 + 0x5555555555555555ULL, // TX_16x16 + 0x1111111111111111ULL, // TX_32x32 }; // 64 bit masks for above transform size. Each 1 represents a position where @@ -58,10 +58,10 @@ static const uint64_t left_64x64_txform_mask[TX_SIZES]= { // // A loopfilter should be applied to every other 4 the row vertically. static const uint64_t above_64x64_txform_mask[TX_SIZES]= { - 0xffffffffffffffff, // TX_4X4 - 0xffffffffffffffff, // TX_8x8 - 0x00ff00ff00ff00ff, // TX_16x16 - 0x000000ff000000ff, // TX_32x32 + 0xffffffffffffffffULL, // TX_4X4 + 0xffffffffffffffffULL, // TX_8x8 + 0x00ff00ff00ff00ffULL, // TX_16x16 + 0x000000ff000000ffULL, // TX_32x32 }; // 64 bit masks for prediction sizes (left). Each 1 represents a position @@ -80,59 +80,59 @@ static const uint64_t above_64x64_txform_mask[TX_SIZES]= { // 00000000 // 00000000 static const uint64_t left_prediction_mask[BLOCK_SIZES] = { - 0x0000000000000001, // BLOCK_4X4, - 0x0000000000000001, // BLOCK_4X8, - 0x0000000000000001, // BLOCK_8X4, - 0x0000000000000001, // BLOCK_8X8, - 0x0000000000000101, // BLOCK_8X16, - 0x0000000000000001, // BLOCK_16X8, - 0x0000000000000101, // BLOCK_16X16, - 0x0000000001010101, // BLOCK_16X32, - 0x0000000000000101, // BLOCK_32X16, - 0x0000000001010101, // BLOCK_32X32, - 0x0101010101010101, // BLOCK_32X64, - 0x0000000001010101, // BLOCK_64X32, - 0x0101010101010101, // BLOCK_64X64 + 0x0000000000000001ULL, // BLOCK_4X4, + 0x0000000000000001ULL, // BLOCK_4X8, + 0x0000000000000001ULL, // BLOCK_8X4, + 0x0000000000000001ULL, // BLOCK_8X8, + 0x0000000000000101ULL, // BLOCK_8X16, + 0x0000000000000001ULL, // BLOCK_16X8, + 0x0000000000000101ULL, // BLOCK_16X16, + 0x0000000001010101ULL, // BLOCK_16X32, + 0x0000000000000101ULL, // BLOCK_32X16, + 0x0000000001010101ULL, // BLOCK_32X32, + 0x0101010101010101ULL, // BLOCK_32X64, + 0x0000000001010101ULL, // BLOCK_64X32, + 0x0101010101010101ULL, // BLOCK_64X64 }; // 64 bit mask to shift and set for each prediction size. static const uint64_t above_prediction_mask[BLOCK_SIZES] = { - 0x0000000000000001, // BLOCK_4X4 - 0x0000000000000001, // BLOCK_4X8 - 0x0000000000000001, // BLOCK_8X4 - 0x0000000000000001, // BLOCK_8X8 - 0x0000000000000001, // BLOCK_8X16, - 0x0000000000000003, // BLOCK_16X8 - 0x0000000000000003, // BLOCK_16X16 - 0x0000000000000003, // BLOCK_16X32, - 0x000000000000000f, // BLOCK_32X16, - 0x000000000000000f, // BLOCK_32X32, - 0x000000000000000f, // BLOCK_32X64, - 0x00000000000000ff, // BLOCK_64X32, - 0x00000000000000ff, // BLOCK_64X64 + 0x0000000000000001ULL, // BLOCK_4X4 + 0x0000000000000001ULL, // BLOCK_4X8 + 0x0000000000000001ULL, // BLOCK_8X4 + 0x0000000000000001ULL, // BLOCK_8X8 + 0x0000000000000001ULL, // BLOCK_8X16, + 0x0000000000000003ULL, // BLOCK_16X8 + 0x0000000000000003ULL, // BLOCK_16X16 + 0x0000000000000003ULL, // BLOCK_16X32, + 0x000000000000000fULL, // BLOCK_32X16, + 0x000000000000000fULL, // BLOCK_32X32, + 0x000000000000000fULL, // BLOCK_32X64, + 0x00000000000000ffULL, // BLOCK_64X32, + 0x00000000000000ffULL, // BLOCK_64X64 }; // 64 bit mask to shift and set for each prediction size. A bit is set for // each 8x8 block that would be in the left most block of the given block // size in the 64x64 block. static const uint64_t size_mask[BLOCK_SIZES] = { - 0x0000000000000001, // BLOCK_4X4 - 0x0000000000000001, // BLOCK_4X8 - 0x0000000000000001, // BLOCK_8X4 - 0x0000000000000001, // BLOCK_8X8 - 0x0000000000000101, // BLOCK_8X16, - 0x0000000000000003, // BLOCK_16X8 - 0x0000000000000303, // BLOCK_16X16 - 0x0000000003030303, // BLOCK_16X32, - 0x0000000000000f0f, // BLOCK_32X16, - 0x000000000f0f0f0f, // BLOCK_32X32, - 0x0f0f0f0f0f0f0f0f, // BLOCK_32X64, - 0x00000000ffffffff, // BLOCK_64X32, - 0xffffffffffffffff, // BLOCK_64X64 + 0x0000000000000001ULL, // BLOCK_4X4 + 0x0000000000000001ULL, // BLOCK_4X8 + 0x0000000000000001ULL, // BLOCK_8X4 + 0x0000000000000001ULL, // BLOCK_8X8 + 0x0000000000000101ULL, // BLOCK_8X16, + 0x0000000000000003ULL, // BLOCK_16X8 + 0x0000000000000303ULL, // BLOCK_16X16 + 0x0000000003030303ULL, // BLOCK_16X32, + 0x0000000000000f0fULL, // BLOCK_32X16, + 0x000000000f0f0f0fULL, // BLOCK_32X32, + 0x0f0f0f0f0f0f0f0fULL, // BLOCK_32X64, + 0x00000000ffffffffULL, // BLOCK_64X32, + 0xffffffffffffffffULL, // BLOCK_64X64 }; // These are used for masking the left and above borders. -static const uint64_t left_border = 0x1111111111111111; -static const uint64_t above_border = 0x000000ff000000ff; +static const uint64_t left_border = 0x1111111111111111ULL; +static const uint64_t above_border = 0x000000ff000000ffULL; // 16 bit masks for uv transform sizes. static const uint16_t left_64x64_txform_mask_uv[TX_SIZES]= { @@ -773,7 +773,7 @@ static void build_masks(const loop_filter_info_n *const lfi_n, // an 8x8 in that the internal ones can be skipped and don't depend on // the prediction block size. if (tx_size_y == TX_4X4) - *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y; + *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffffULL) << shift_y; if (tx_size_uv == TX_4X4) *int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv; @@ -819,7 +819,7 @@ static void build_y_mask(const loop_filter_info_n *const lfi_n, left_64x64_txform_mask[tx_size_y]) << shift_y; if (tx_size_y == TX_4X4) - *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y; + *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffffULL) << shift_y; } // This function sets up the bit masks for the entire 64x64 region represented @@ -1021,7 +1021,7 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, // Each pixel inside the border gets a 1, the multiply copies the border // to where we need it. - const uint64_t mask_y = (((1 << columns) - 1)) * 0x0101010101010101; + const uint64_t mask_y = (((1 << columns) - 1)) * 0x0101010101010101ULL; const uint16_t mask_uv = ((1 << ((columns + 1) >> 1)) - 1) * 0x1111; // Internal edges are not applied on the last column of the image so @@ -1053,7 +1053,7 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, // out. if (mi_col == 0) { for (i = 0; i < TX_32X32; i++) { - lfm->left_y[i] &= 0xfefefefefefefefe; + lfm->left_y[i] &= 0xfefefefefefefefeULL; lfm->left_uv[i] &= 0xeeee; } } diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h index cfb0a98e5..7ca24a56e 100644 --- a/vp9/common/vp9_onyxc_int.h +++ b/vp9/common/vp9_onyxc_int.h @@ -110,12 +110,7 @@ typedef struct { typedef struct VP9Common { struct vpx_internal_error_info error; - - DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]); - DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]); - vpx_color_space_t color_space; - int width; int height; int display_width; diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 18e0bf53f..d345a0578 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -741,6 +741,7 @@ static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x; pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y; pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth; + pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space; } static INLINE int valid_ref_frame_img_fmt(vpx_bit_depth_t ref_bit_depth, @@ -821,6 +822,7 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm, pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x; pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y; pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth; + pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space; } static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { @@ -1436,6 +1438,7 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, #if CONFIG_VP9_HIGHBITDEPTH get_frame_new_buffer(cm)->bit_depth = cm->bit_depth; #endif + get_frame_new_buffer(cm)->color_space = cm->color_space; if (pbi->need_resync) { vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c index 5a3671d84..ff08f04e6 100644 --- a/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -96,30 +96,35 @@ static int apply_cyclic_refresh_bitrate(const VP9_COMMON *cm, static int candidate_refresh_aq(const CYCLIC_REFRESH *cr, const MB_MODE_INFO *mbmi, int64_t rate, - int64_t dist) { + int64_t dist, + int bsize) { MV mv = mbmi->mv[0].as_mv; - // If projected rate is below the thresh_rate accept it for lower-qp coding. - // Otherwise, reject the block for lower-qp coding if projected distortion + // Reject the block for lower-qp coding if projected distortion // is above the threshold, and any of the following is true: // 1) mode uses large mv // 2) mode is an intra-mode - if (rate < cr->thresh_rate_sb) - return 1; - else if (dist > cr->thresh_dist_sb && - (mv.row > cr->motion_thresh || mv.row < -cr->motion_thresh || - mv.col > cr->motion_thresh || mv.col < -cr->motion_thresh || - !is_inter_block(mbmi))) - return 0; + // Otherwise accept for refresh. + if (dist > cr->thresh_dist_sb && + (mv.row > cr->motion_thresh || mv.row < -cr->motion_thresh || + mv.col > cr->motion_thresh || mv.col < -cr->motion_thresh || + !is_inter_block(mbmi))) + return CR_SEGMENT_ID_BASE; + else if (bsize >= BLOCK_32X32 && + rate < cr->thresh_rate_sb && + is_inter_block(mbmi) && + mbmi->mv[0].as_int == 0) + // More aggressive delta-q for bigger blocks with zero motion. + return CR_SEGMENT_ID_BOOST2; else - return 1; + return CR_SEGMENT_ID_BOOST1; } // Compute delta-q for the segment. -static int compute_deltaq(const VP9_COMP *cpi, int q) { +static int compute_deltaq(const VP9_COMP *cpi, int q, double rate_factor) { const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; const RATE_CONTROL *const rc = &cpi->rc; int deltaq = vp9_compute_qdelta_by_rate(rc, cpi->common.frame_type, - q, cr->rate_ratio_qdelta, + q, rate_factor, cpi->common.bit_depth); if ((-deltaq) > cr->max_qdelta_perc * q / 100) { deltaq = -cr->max_qdelta_perc * q / 100; @@ -128,8 +133,9 @@ static int compute_deltaq(const VP9_COMP *cpi, int q) { } // For the just encoded frame, estimate the bits, incorporating the delta-q -// from segment 1. This function is called in the postencode (called from -// rc_update_rate_correction_factors()). +// from non-base segment. For now ignore effect of multiple segments +// (with different delta-q). Note this function is called in the postencode +// (called from rc_update_rate_correction_factors()). int vp9_cyclic_refresh_estimate_bits_at_q(const VP9_COMP *cpi, double correction_factor) { const VP9_COMMON *const cm = &cpi->common; @@ -137,11 +143,11 @@ int vp9_cyclic_refresh_estimate_bits_at_q(const VP9_COMP *cpi, int estimated_bits; int mbs = cm->MBs; int num8x8bl = mbs << 2; - // Weight for segment 1: use actual number of blocks refreshed in + // Weight for non-base segments: use actual number of blocks refreshed in // previous/just encoded frame. Note number of blocks here is in 8x8 units. double weight_segment = (double)cr->actual_num_seg_blocks / num8x8bl; // Compute delta-q that was used in the just encoded frame. - int deltaq = compute_deltaq(cpi, cm->base_qindex); + int deltaq = compute_deltaq(cpi, cm->base_qindex, cr->rate_ratio_qdelta); // Take segment weighted average for estimated bits. estimated_bits = (int)((1.0 - weight_segment) * vp9_estimate_bits_at_q(cm->frame_type, cm->base_qindex, mbs, @@ -155,6 +161,8 @@ int vp9_cyclic_refresh_estimate_bits_at_q(const VP9_COMP *cpi, // Prior to encoding the frame, estimate the bits per mb, for a given q = i and // a corresponding delta-q (for segment 1). This function is called in the // rc_regulate_q() to set the base qp index. +// Note: the segment map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or +// to 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock, prior to encoding. int vp9_cyclic_refresh_rc_bits_per_mb(const VP9_COMP *cpi, int i, double correction_factor) { const VP9_COMMON *const cm = &cpi->common; @@ -171,7 +179,7 @@ int vp9_cyclic_refresh_rc_bits_per_mb(const VP9_COMP *cpi, int i, // does not occur/is very small. double weight_segment = (double)cr->target_num_seg_blocks / num8x8bl; // Compute delta-q corresponding to qindex i. - int deltaq = compute_deltaq(cpi, i); + int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta); // Take segment weighted average for bits per mb. bits_per_mb = (int)((1.0 - weight_segment) * vp9_rc_bits_per_mb(cm->frame_type, i, correction_factor, cm->bit_depth) + @@ -197,20 +205,22 @@ void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi, const int xmis = MIN(cm->mi_cols - mi_col, bw); const int ymis = MIN(cm->mi_rows - mi_row, bh); const int block_index = mi_row * cm->mi_cols + mi_col; - const int refresh_this_block = candidate_refresh_aq(cr, mbmi, rate, dist); + const int refresh_this_block = candidate_refresh_aq(cr, mbmi, rate, dist, + bsize); // Default is to not update the refresh map. int new_map_value = cr->map[block_index]; int x = 0; int y = 0; - // Check if we should reset the segment_id for this block. - if (mbmi->segment_id > 0 && !refresh_this_block) - mbmi->segment_id = 0; + // If this block is labeled for refresh, check if we should reset the + // segment_id. + if (mbmi->segment_id != CR_SEGMENT_ID_BASE) + mbmi->segment_id = refresh_this_block; // Update the cyclic refresh map, to be used for setting segmentation map // for the next frame. If the block will be refreshed this frame, mark it // as clean. The magnitude of the -ve influences how long before we consider // it for refresh again. - if (mbmi->segment_id == 1) { + if (mbmi->segment_id != CR_SEGMENT_ID_BASE) { new_map_value = -cr->time_for_refresh; } else if (refresh_this_block) { // Else if it is accepted as candidate for refresh, and has not already @@ -242,20 +252,24 @@ void vp9_cyclic_refresh_update_actual_count(struct VP9_COMP *const cpi) { cr->actual_num_seg_blocks = 0; for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) { - if (seg_map[mi_row * cm->mi_cols + mi_col] == 1) + if (seg_map[mi_row * cm->mi_cols + mi_col] != CR_SEGMENT_ID_BASE) cr->actual_num_seg_blocks++; } } // Update the segmentation map, and related quantities: cyclic refresh map, // refresh sb_index, and target number of blocks to be refreshed. +// The map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or to +// 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock. +// Blocks labeled as BOOST1 may later get set to BOOST2 (during the +// encoding of the superblock). void vp9_cyclic_refresh_update_map(VP9_COMP *const cpi) { VP9_COMMON *const cm = &cpi->common; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; unsigned char *const seg_map = cpi->segmentation_map; int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame; int xmis, ymis, x, y; - vpx_memset(seg_map, 0, cm->mi_rows * cm->mi_cols); + vpx_memset(seg_map, CR_SEGMENT_ID_BASE, cm->mi_rows * cm->mi_cols); sb_cols = (cm->mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE; sb_rows = (cm->mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE; sbs_in_frame = sb_cols * sb_rows; @@ -300,7 +314,7 @@ void vp9_cyclic_refresh_update_map(VP9_COMP *const cpi) { if (sum_map >= xmis * ymis / 2) { for (y = 0; y < ymis; y++) for (x = 0; x < xmis; x++) { - seg_map[bl_index + y * cm->mi_cols + x] = 1; + seg_map[bl_index + y * cm->mi_cols + x] = CR_SEGMENT_ID_BOOST1; } cr->target_num_seg_blocks += xmis * ymis; } @@ -352,11 +366,11 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) { vp9_clear_system_state(); cr->max_qdelta_perc = 50; cr->time_for_refresh = 0; - // Set rate threshold to some fraction (set to 1 for now) of the target + // Set rate threshold to some multiple (set to 2 for now) of the target // rate (target is given by sb64_target_rate and scaled by 256). - cr->thresh_rate_sb = (rc->sb64_target_rate << 8); + cr->thresh_rate_sb = (rc->sb64_target_rate << 8) << 1; // Distortion threshold, quadratic in Q, scale factor to be adjusted. - cr->thresh_dist_sb = (int)(q * q) << 5; + cr->thresh_dist_sb = (int)(q * q) << 2; cr->motion_thresh = 32; // Set up segmentation. // Clear down the segment map. @@ -372,19 +386,27 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) { // relative to 0 previous map. // seg->temporal_update = 0; - // Segment 0 "Q" feature is disabled so it defaults to the baseline Q. - vp9_disable_segfeature(seg, 0, SEG_LVL_ALT_Q); - // Use segment 1 for in-frame Q adjustment. - vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q); + // Segment BASE "Q" feature is disabled so it defaults to the baseline Q. + vp9_disable_segfeature(seg, CR_SEGMENT_ID_BASE, SEG_LVL_ALT_Q); + // Use segment BOOST1 for in-frame Q adjustment. + vp9_enable_segfeature(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q); + // Use segment BOOST2 for more aggressive in-frame Q adjustment. + vp9_enable_segfeature(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q); - // Set the q delta for segment 1. - qindex_delta = compute_deltaq(cpi, cm->base_qindex); + // Set the q delta for segment BOOST1. + qindex_delta = compute_deltaq(cpi, cm->base_qindex, cr->rate_ratio_qdelta); - // Compute rd-mult for segment 1. + // Compute rd-mult for segment BOOST1. qindex2 = clamp(cm->base_qindex + cm->y_dc_delta_q + qindex_delta, 0, MAXQ); cr->rdmult = vp9_compute_rd_mult(cpi, qindex2); - vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, qindex_delta); + vp9_set_segdata(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q, qindex_delta); + + // Set a more aggressive (higher) q delta for segment BOOST2. + qindex_delta = compute_deltaq(cpi, cm->base_qindex, + MIN(CR_MAX_RATE_TARGET_RATIO, + CR_BOOST2_FAC * cr->rate_ratio_qdelta)); + vp9_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta); // Update the segmentation and refresh map. vp9_cyclic_refresh_update_map(cpi); diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.h b/vp9/encoder/vp9_aq_cyclicrefresh.h index 1ed07c2c2..dc1b968a2 100644 --- a/vp9/encoder/vp9_aq_cyclicrefresh.h +++ b/vp9/encoder/vp9_aq_cyclicrefresh.h @@ -18,6 +18,18 @@ extern "C" { #endif +// The segment ids used in cyclic refresh: from base (no boost) to increasing +// boost (higher delta-qp). +#define CR_SEGMENT_ID_BASE 0 +#define CR_SEGMENT_ID_BOOST1 1 +#define CR_SEGMENT_ID_BOOST2 2 + +// Maximum rate target ratio for setting segment delta-qp. +#define CR_MAX_RATE_TARGET_RATIO 4.0 + +// Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2. +#define CR_BOOST2_FAC 1.7 + struct VP9_COMP; struct CYCLIC_REFRESH; diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c index 4f245e249..cf67e115e 100644 --- a/vp9/encoder/vp9_denoiser.c +++ b/vp9/encoder/vp9_denoiser.c @@ -45,7 +45,7 @@ static int delta_thresh(BLOCK_SIZE bs, int increase_denoising) { static int noise_motion_thresh(BLOCK_SIZE bs, int increase_denoising) { (void)bs; (void)increase_denoising; - return 25 * 25; + return 625; } static unsigned int sse_thresh(BLOCK_SIZE bs, int increase_denoising) { @@ -53,8 +53,8 @@ static unsigned int sse_thresh(BLOCK_SIZE bs, int increase_denoising) { } static int sse_diff_thresh(BLOCK_SIZE bs, int increase_denoising, - int mv_row, int mv_col) { - if (mv_row * mv_row + mv_col * mv_col > + int motion_magnitude) { + if (motion_magnitude > noise_motion_thresh(bs, increase_denoising)) { return 0; } else { @@ -219,7 +219,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser, // If the best reference frame uses inter-prediction and there is enough of a // difference in sum-squared-error, use it. if (frame != INTRA_FRAME && - sse_diff > sse_diff_thresh(bs, increase_denoising, mv_row, mv_col)) { + sse_diff > sse_diff_thresh(bs, increase_denoising, *motion_magnitude)) { mbmi->ref_frame[0] = ctx->best_reference_frame; mbmi->mode = ctx->best_sse_inter_mode; mbmi->mv[0] = ctx->best_sse_mv; @@ -241,8 +241,8 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser, *mbmi = saved_mbmi; return COPY_BLOCK; } - if (mv_row * mv_row + mv_col * mv_col > - 8 * noise_motion_thresh(bs, increase_denoising)) { + if (*motion_magnitude > + (noise_motion_thresh(bs, increase_denoising) << 3)) { // Restore everything to its original state *mbmi = saved_mbmi; return COPY_BLOCK; diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 3bdaa043f..ff12bf8a2 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -403,7 +403,8 @@ static int set_vt_partitioning(VP9_COMP *cpi, int mi_row, int mi_col, int64_t threshold, - BLOCK_SIZE bsize_min) { + BLOCK_SIZE bsize_min, + int segment_id) { VP9_COMMON * const cm = &cpi->common; variance_node vt; const int block_width = num_8x8_blocks_wide_lookup[bsize]; @@ -412,6 +413,10 @@ static int set_vt_partitioning(VP9_COMP *cpi, assert(block_height == block_width); tree_to_node(data, bsize, &vt); + // No 64x64 blocks on segments other than base (un-boosted) segment. + if (segment_id != CR_SEGMENT_ID_BASE && bsize == BLOCK_64X64) + return 0; + // For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select if // variance is below threshold, otherwise split will be selected. // No check for vert/horiz split as too few samples for variance. @@ -528,10 +533,10 @@ static int vector_match(int16_t *ref, int16_t *src) { } center = offset; - for (d = -8; d <= 8; d += 4) { + for (d = -8; d <= 8; d += 16) { int this_pos = offset + d; // check limit - if (this_pos < 0 || this_pos > 64 || this_pos == 32) + if (this_pos < 0 || this_pos > 64) continue; this_sad = vp9_vector_sad(&ref[this_pos], src, 64); if (this_sad < best_sad) { @@ -541,10 +546,10 @@ static int vector_match(int16_t *ref, int16_t *src) { } offset = center; - for (d = -4; d <= 4; d += 2) { + for (d = -4; d <= 4; d += 8) { int this_pos = offset + d; // check limit - if (this_pos < 0 || this_pos > 64 || this_pos == 32) + if (this_pos < 0 || this_pos > 64) continue; this_sad = vp9_vector_sad(&ref[this_pos], src, 64); if (this_sad < best_sad) { @@ -554,10 +559,23 @@ static int vector_match(int16_t *ref, int16_t *src) { } offset = center; - for (d = -2; d <= 2; d += 1) { + for (d = -2; d <= 2; d += 4) { int this_pos = offset + d; // check limit - if (this_pos < 0 || this_pos > 64 || this_pos == 32) + if (this_pos < 0 || this_pos > 64) + continue; + this_sad = vp9_vector_sad(&ref[this_pos], src, 64); + if (this_sad < best_sad) { + best_sad = this_sad; + center = this_pos; + } + } + offset = center; + + for (d = -1; d <= 1; d += 2) { + int this_pos = offset + d; + // check limit + if (this_pos < 0 || this_pos > 64) continue; this_sad = vp9_vector_sad(&ref[this_pos], src, 64); if (this_sad < best_sad) { @@ -670,6 +688,13 @@ static void choose_partitioning(VP9_COMP *cpi, const int low_res = (cm->width <= 352 && cm->height <= 288); int variance4x4downsample[16]; + int segment_id = CR_SEGMENT_ID_BASE; + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) { + const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map : + cm->last_frame_seg_map; + segment_id = vp9_get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col); + } + set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64); if (xd->mb_to_right_edge < 0) @@ -843,7 +868,8 @@ static void choose_partitioning(VP9_COMP *cpi, // we get to one that's got a variance lower than our threshold. if ( mi_col + 8 > cm->mi_cols || mi_row + 8 > cm->mi_rows || !set_vt_partitioning(cpi, xd, &vt, BLOCK_64X64, mi_row, mi_col, - cpi->vbp_threshold_bsize_max, BLOCK_16X16)) { + cpi->vbp_threshold_bsize_max, BLOCK_16X16, + segment_id)) { for (i = 0; i < 4; ++i) { const int x32_idx = ((i & 1) << 2); const int y32_idx = ((i >> 1) << 2); @@ -851,7 +877,7 @@ static void choose_partitioning(VP9_COMP *cpi, if (!set_vt_partitioning(cpi, xd, &vt.split[i], BLOCK_32X32, (mi_row + y32_idx), (mi_col + x32_idx), cpi->vbp_threshold, - BLOCK_16X16)) { + BLOCK_16X16, segment_id)) { for (j = 0; j < 4; ++j) { const int x16_idx = ((j & 1) << 1); const int y16_idx = ((j >> 1) << 1); @@ -865,7 +891,7 @@ static void choose_partitioning(VP9_COMP *cpi, mi_row + y32_idx + y16_idx, mi_col + x32_idx + x16_idx, cpi->vbp_threshold_16x16, - cpi->vbp_bsize_min)) { + cpi->vbp_bsize_min, segment_id)) { for (k = 0; k < 4; ++k) { const int x8_idx = (k & 1); const int y8_idx = (k >> 1); @@ -875,7 +901,7 @@ static void choose_partitioning(VP9_COMP *cpi, mi_row + y32_idx + y16_idx + y8_idx, mi_col + x32_idx + x16_idx + x8_idx, cpi->vbp_threshold_bsize_min, - BLOCK_8X8)) { + BLOCK_8X8, segment_id)) { set_block_size(cpi, xd, (mi_row + y32_idx + y16_idx + y8_idx), (mi_col + x32_idx + x16_idx + x8_idx), diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 35c5a487b..69edfded1 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -269,6 +269,8 @@ struct EncWorkerData; typedef struct VP9_COMP { QUANTS quants; ThreadData td; + DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]); VP9_COMMON common; VP9EncoderConfig oxcf; struct lookahead_ctx *lookahead; diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 71cea0e45..c18687b55 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -242,9 +242,13 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize, else xd->mi[0].src_mi->mbmi.tx_size = TX_8X8; - if (cpi->sf.partition_search_type == VAR_BASED_PARTITION && - xd->mi[0].src_mi->mbmi.tx_size > TX_16X16) - xd->mi[0].src_mi->mbmi.tx_size = TX_16X16; + if (cpi->sf.partition_search_type == VAR_BASED_PARTITION) { + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + xd->mi[0].src_mi->mbmi.segment_id != CR_SEGMENT_ID_BASE) + xd->mi[0].src_mi->mbmi.tx_size = TX_8X8; + else if (xd->mi[0].src_mi->mbmi.tx_size > TX_16X16) + xd->mi[0].src_mi->mbmi.tx_size = TX_16X16; + } } else { xd->mi[0].src_mi->mbmi.tx_size = MIN(max_txsize_lookup[bsize], @@ -1228,7 +1232,9 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, [INTER_OFFSET(this_mode)]; } - vp9_build_inter_predictor(pd->pre[0].buf, pd->pre[0].stride, +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_build_inter_predictor(pd->pre[0].buf, pd->pre[0].stride, pd->dst.buf, pd->dst.stride, &xd->mi[0].bmi[i].as_mv[0].as_mv, &xd->block_refs[0]->sf, @@ -1237,7 +1243,24 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, vp9_get_interp_kernel(mbmi->interp_filter), MV_PRECISION_Q3, mi_col * MI_SIZE + 4 * (i & 0x01), - mi_row * MI_SIZE + 4 * (i >> 1)); + mi_row * MI_SIZE + 4 * (i >> 1), xd->bd); + } else { +#endif + vp9_build_inter_predictor(pd->pre[0].buf, pd->pre[0].stride, + pd->dst.buf, pd->dst.stride, + &xd->mi[0].bmi[i].as_mv[0].as_mv, + &xd->block_refs[0]->sf, + 4 * num_4x4_blocks_wide, + 4 * num_4x4_blocks_high, 0, + vp9_get_interp_kernel(mbmi->interp_filter), + MV_PRECISION_Q3, + mi_col * MI_SIZE + 4 * (i & 0x01), + mi_row * MI_SIZE + 4 * (i >> 1)); + +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif + model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, &var_y, &sse_y); diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index 389dc87e0..7143987d4 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -591,7 +591,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) { quants->y_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7; quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7); quants->y_round[q][i] = (qrounding_factor * quant) >> 7; - cm->y_dequant[q][i] = quant; + cpi->y_dequant[q][i] = quant; // uv quant = i == 0 ? vp9_dc_quant(q, cm->uv_dc_delta_q, cm->bit_depth) @@ -602,7 +602,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) { quants->uv_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7; quants->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7); quants->uv_round[q][i] = (qrounding_factor * quant) >> 7; - cm->uv_dequant[q][i] = quant; + cpi->uv_dequant[q][i] = quant; } for (i = 2; i < 8; i++) { @@ -612,7 +612,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) { quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1]; quants->y_zbin[q][i] = quants->y_zbin[q][1]; quants->y_round[q][i] = quants->y_round[q][1]; - cm->y_dequant[q][i] = cm->y_dequant[q][1]; + cpi->y_dequant[q][i] = cpi->y_dequant[q][1]; quants->uv_quant[q][i] = quants->uv_quant[q][1]; quants->uv_quant_fp[q][i] = quants->uv_quant_fp[q][1]; @@ -620,7 +620,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) { quants->uv_quant_shift[q][i] = quants->uv_quant_shift[q][1]; quants->uv_zbin[q][i] = quants->uv_zbin[q][1]; quants->uv_round[q][i] = quants->uv_round[q][1]; - cm->uv_dequant[q][i] = cm->uv_dequant[q][1]; + cpi->uv_dequant[q][i] = cpi->uv_dequant[q][1]; } } } @@ -641,7 +641,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) { x->plane[0].quant_shift = quants->y_quant_shift[qindex]; x->plane[0].zbin = quants->y_zbin[qindex]; x->plane[0].round = quants->y_round[qindex]; - xd->plane[0].dequant = cm->y_dequant[qindex]; + xd->plane[0].dequant = cpi->y_dequant[qindex]; x->plane[0].quant_thred[0] = x->plane[0].zbin[0] * x->plane[0].zbin[0]; x->plane[0].quant_thred[1] = x->plane[0].zbin[1] * x->plane[0].zbin[1]; @@ -654,7 +654,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) { x->plane[i].quant_shift = quants->uv_quant_shift[qindex]; x->plane[i].zbin = quants->uv_zbin[qindex]; x->plane[i].round = quants->uv_round[qindex]; - xd->plane[i].dequant = cm->uv_dequant[qindex]; + xd->plane[i].dequant = cpi->uv_dequant[qindex]; x->plane[i].quant_thred[0] = x->plane[i].zbin[0] * x->plane[i].zbin[0]; x->plane[i].quant_thred[1] = x->plane[i].zbin[1] * x->plane[i].zbin[1]; diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c index ae22a0b32..279074ce3 100644 --- a/vp9/encoder/x86/vp9_dct_sse2.c +++ b/vp9/encoder/x86/vp9_dct_sse2.c @@ -712,9 +712,7 @@ static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in, } // right shift and rounding -static INLINE void right_shift_8x8(__m128i *res, int const bit) { - const __m128i kOne = _mm_set1_epi16(1); - const int bit_m02 = bit - 2; +static INLINE void right_shift_8x8(__m128i *res, const int bit) { __m128i sign0 = _mm_srai_epi16(res[0], 15); __m128i sign1 = _mm_srai_epi16(res[1], 15); __m128i sign2 = _mm_srai_epi16(res[2], 15); @@ -724,16 +722,16 @@ static INLINE void right_shift_8x8(__m128i *res, int const bit) { __m128i sign6 = _mm_srai_epi16(res[6], 15); __m128i sign7 = _mm_srai_epi16(res[7], 15); - if (bit_m02 >= 0) { - __m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02); - res[0] = _mm_add_epi16(res[0], k_const_rounding); - res[1] = _mm_add_epi16(res[1], k_const_rounding); - res[2] = _mm_add_epi16(res[2], k_const_rounding); - res[3] = _mm_add_epi16(res[3], k_const_rounding); - res[4] = _mm_add_epi16(res[4], k_const_rounding); - res[5] = _mm_add_epi16(res[5], k_const_rounding); - res[6] = _mm_add_epi16(res[6], k_const_rounding); - res[7] = _mm_add_epi16(res[7], k_const_rounding); + if (bit == 2) { + const __m128i const_rounding = _mm_set1_epi16(1); + res[0] = _mm_add_epi16(res[0], const_rounding); + res[1] = _mm_add_epi16(res[1], const_rounding); + res[2] = _mm_add_epi16(res[2], const_rounding); + res[3] = _mm_add_epi16(res[3], const_rounding); + res[4] = _mm_add_epi16(res[4], const_rounding); + res[5] = _mm_add_epi16(res[5], const_rounding); + res[6] = _mm_add_epi16(res[6], const_rounding); + res[7] = _mm_add_epi16(res[7], const_rounding); } res[0] = _mm_sub_epi16(res[0], sign0); @@ -745,14 +743,25 @@ static INLINE void right_shift_8x8(__m128i *res, int const bit) { res[6] = _mm_sub_epi16(res[6], sign6); res[7] = _mm_sub_epi16(res[7], sign7); - res[0] = _mm_srai_epi16(res[0], bit); - res[1] = _mm_srai_epi16(res[1], bit); - res[2] = _mm_srai_epi16(res[2], bit); - res[3] = _mm_srai_epi16(res[3], bit); - res[4] = _mm_srai_epi16(res[4], bit); - res[5] = _mm_srai_epi16(res[5], bit); - res[6] = _mm_srai_epi16(res[6], bit); - res[7] = _mm_srai_epi16(res[7], bit); + if (bit == 1) { + res[0] = _mm_srai_epi16(res[0], 1); + res[1] = _mm_srai_epi16(res[1], 1); + res[2] = _mm_srai_epi16(res[2], 1); + res[3] = _mm_srai_epi16(res[3], 1); + res[4] = _mm_srai_epi16(res[4], 1); + res[5] = _mm_srai_epi16(res[5], 1); + res[6] = _mm_srai_epi16(res[6], 1); + res[7] = _mm_srai_epi16(res[7], 1); + } else { + res[0] = _mm_srai_epi16(res[0], 2); + res[1] = _mm_srai_epi16(res[1], 2); + res[2] = _mm_srai_epi16(res[2], 2); + res[3] = _mm_srai_epi16(res[3], 2); + res[4] = _mm_srai_epi16(res[4], 2); + res[5] = _mm_srai_epi16(res[5], 2); + res[6] = _mm_srai_epi16(res[6], 2); + res[7] = _mm_srai_epi16(res[7], 2); + } } // write 8x8 array diff --git a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm index 72e01d646..c35eb3603 100644 --- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm +++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm @@ -15,6 +15,7 @@ pw_1: times 8 dw 1 SECTION .text +; TODO(yunqingwang)fix quantize_b code for skip=1 case. %macro QUANTIZE_FN 2 cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ shift, qcoeff, dqcoeff, dequant, \ @@ -244,11 +245,11 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ psllw m2, 1 %endif pxor m5, m5 ; m5 = dedicated zero - DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob + lea coeffq, [ coeffq+ncoeffq*2] - lea iscanq, [ iscanq+ncoeffq*2] - lea qcoeffq, [ qcoeffq+ncoeffq*2] - lea dqcoeffq, [dqcoeffq+ncoeffq*2] + lea r5q, [ r5q+ncoeffq*2] + lea r3q, [ r3q+ncoeffq*2] + lea r4q, [r4q+ncoeffq*2] neg ncoeffq ; get DC and first 15 AC coeffs @@ -266,15 +267,15 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ pmulhw m13, m11, m2 ; m13 = m11*q>>16 psignw m8, m9 ; m8 = reinsert sign psignw m13, m10 ; m13 = reinsert sign - mova [qcoeffq+ncoeffq*2+ 0], m8 - mova [qcoeffq+ncoeffq*2+16], m13 + mova [r3q+ncoeffq*2+ 0], m8 + mova [r3q+ncoeffq*2+16], m13 %ifidn %1, fp_32x32 pabsw m8, m8 pabsw m13, m13 %endif - pmullw m8, m3 ; dqc[i] = qc[i] * q + pmullw m8, m3 ; r4[i] = r3[i] * q punpckhqdq m3, m3 - pmullw m13, m3 ; dqc[i] = qc[i] * q + pmullw m13, m3 ; r4[i] = r3[i] * q %ifidn %1, fp_32x32 psrlw m8, 1 psrlw m13, 1 @@ -282,12 +283,12 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ psignw m13, m10 psrlw m0, m3, 2 %endif - mova [dqcoeffq+ncoeffq*2+ 0], m8 - mova [dqcoeffq+ncoeffq*2+16], m13 + mova [r4q+ncoeffq*2+ 0], m8 + mova [r4q+ncoeffq*2+16], m13 pcmpeqw m8, m5 ; m8 = c[i] == 0 pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] - mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] + mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] psubw m6, m7 ; m6 = scan[i] + 1 psubw m11, m7 ; m11 = scan[i] + 1 pandn m8, m6 ; m8 = max(eob) @@ -318,26 +319,26 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ pmulhw m13, m11, m2 ; m13 = m11*q>>16 psignw m14, m9 ; m14 = reinsert sign psignw m13, m10 ; m13 = reinsert sign - mova [qcoeffq+ncoeffq*2+ 0], m14 - mova [qcoeffq+ncoeffq*2+16], m13 + mova [r3q+ncoeffq*2+ 0], m14 + mova [r3q+ncoeffq*2+16], m13 %ifidn %1, fp_32x32 pabsw m14, m14 pabsw m13, m13 %endif - pmullw m14, m3 ; dqc[i] = qc[i] * q - pmullw m13, m3 ; dqc[i] = qc[i] * q + pmullw m14, m3 ; r4[i] = r3[i] * q + pmullw m13, m3 ; r4[i] = r3[i] * q %ifidn %1, fp_32x32 psrlw m14, 1 psrlw m13, 1 psignw m14, m9 psignw m13, m10 %endif - mova [dqcoeffq+ncoeffq*2+ 0], m14 - mova [dqcoeffq+ncoeffq*2+16], m13 + mova [r4q+ncoeffq*2+ 0], m14 + mova [r4q+ncoeffq*2+16], m13 pcmpeqw m14, m5 ; m14 = c[i] == 0 pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] - mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] + mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] psubw m6, m7 ; m6 = scan[i] + 1 psubw m11, m7 ; m11 = scan[i] + 1 pandn m14, m6 ; m14 = max(eob) @@ -350,10 +351,10 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ %ifidn %1, fp_32x32 jmp .accumulate_eob .skip_iter: - mova [qcoeffq+ncoeffq*2+ 0], m5 - mova [qcoeffq+ncoeffq*2+16], m5 - mova [dqcoeffq+ncoeffq*2+ 0], m5 - mova [dqcoeffq+ncoeffq*2+16], m5 + mova [r3q+ncoeffq*2+ 0], m5 + mova [r3q+ncoeffq*2+16], m5 + mova [r4q+ncoeffq*2+ 0], m5 + mova [r4q+ncoeffq*2+16], m5 add ncoeffq, mmsize jl .ac_only_loop %endif @@ -368,7 +369,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ pshuflw m7, m8, 0x1 pmaxsw m8, m7 pextrw r6, m8, 0 - mov [r2], r6 + mov [r2], r6 RET ; skip-block, i.e. just write all zeroes @@ -377,19 +378,19 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ movifnidn ncoeffq, ncoeffmp mov r2, qcoeffmp mov r3, eobmp - DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob - lea dqcoeffq, [dqcoeffq+ncoeffq*2] - lea qcoeffq, [ qcoeffq+ncoeffq*2] + + lea r0q, [r0q+ncoeffq*2] + lea r2q, [r2q+ncoeffq*2] neg ncoeffq pxor m7, m7 .blank_loop: - mova [dqcoeffq+ncoeffq*2+ 0], m7 - mova [dqcoeffq+ncoeffq*2+16], m7 - mova [qcoeffq+ncoeffq*2+ 0], m7 - mova [qcoeffq+ncoeffq*2+16], m7 + mova [r0q+ncoeffq*2+ 0], m7 + mova [r0q+ncoeffq*2+16], m7 + mova [r2q+ncoeffq*2+ 0], m7 + mova [r2q+ncoeffq*2+16], m7 add ncoeffq, mmsize jl .blank_loop - mov word [eobq], 0 + mov word [r3q], 0 RET %endmacro |