diff options
Diffstat (limited to 'vp9')
-rw-r--r-- | vp9/common/vp9_blockd.c | 42 | ||||
-rw-r--r-- | vp9/encoder/vp9_aq_cyclicrefresh.h | 2 | ||||
-rw-r--r-- | vp9/encoder/vp9_encodeframe.c | 30 | ||||
-rw-r--r-- | vp9/encoder/vp9_mcomp.c | 7 | ||||
-rw-r--r-- | vp9/encoder/vp9_pickmode.c | 45 | ||||
-rw-r--r-- | vp9/encoder/vp9_ratectrl.c | 8 | ||||
-rw-r--r-- | vp9/encoder/x86/vp9_avg_intrin_sse2.c | 12 | ||||
-rw-r--r-- | vp9/encoder/x86/vp9_dct_ssse3.c | 83 |
8 files changed, 96 insertions, 133 deletions
diff --git a/vp9/common/vp9_blockd.c b/vp9/common/vp9_blockd.c index 7094a0118..3cd9f44e9 100644 --- a/vp9/common/vp9_blockd.c +++ b/vp9/common/vp9_blockd.c @@ -50,39 +50,25 @@ void vp9_foreach_transformed_block_in_plane( const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; const int step = 1 << (tx_size << 1); - int i; + int i = 0, r, c; // If mb_to_right_edge is < 0 we are in a situation in which // the current block size extends into the UMV and we won't // visit the sub blocks that are wholly within the UMV. - if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { - int r, c; - - int max_blocks_wide = num_4x4_w; - int max_blocks_high = num_4x4_h; - - // xd->mb_to_right_edge is in units of pixels * 8. This converts - // it to 4x4 block sizes. - if (xd->mb_to_right_edge < 0) - max_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x)); - - if (xd->mb_to_bottom_edge < 0) - max_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); - - i = 0; - // Unlike the normal case - in here we have to keep track of the - // row and column of the blocks we use so that we know if we are in - // the unrestricted motion border. - for (r = 0; r < num_4x4_h; r += (1 << tx_size)) { - for (c = 0; c < num_4x4_w; c += (1 << tx_size)) { - if (r < max_blocks_high && c < max_blocks_wide) - visit(plane, i, plane_bsize, tx_size, arg); - i += step; - } + const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : + xd->mb_to_right_edge >> (5 + pd->subsampling_x)); + const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 : + xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + + // Keep track of the row and column of the blocks we use so that we know + // if we are in the unrestricted motion border. + for (r = 0; r < max_blocks_high; r += (1 << tx_size)) { + for (c = 0; c < num_4x4_w; c += (1 << tx_size)) { + // Skip visiting the sub blocks that are wholly within the UMV. + if (c < max_blocks_wide) + visit(plane, i, plane_bsize, tx_size, arg); + i += step; } - } else { - for (i = 0; i < num_4x4_w * num_4x4_h; i += step) - visit(plane, i, plane_bsize, tx_size, arg); } } diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.h b/vp9/encoder/vp9_aq_cyclicrefresh.h index 048a0ed35..4df1b588c 100644 --- a/vp9/encoder/vp9_aq_cyclicrefresh.h +++ b/vp9/encoder/vp9_aq_cyclicrefresh.h @@ -65,7 +65,7 @@ void vp9_cyclic_refresh_update__map(struct VP9_COMP *const cpi); void vp9_cyclic_refresh_postencode(struct VP9_COMP *const cpi); // Set golden frame update interval, for non-svc 1 pass CBR mode. -void vp9_cyclic_refresh_set_golden_update(struct VP9_COMP *cpi); +void vp9_cyclic_refresh_set_golden_update(struct VP9_COMP *const cpi); // Check if we should not update golden reference, based on past refresh stats. void vp9_cyclic_refresh_check_golden_update(struct VP9_COMP *const cpi); diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 5c841cfdc..cf8ac0a22 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -514,12 +514,6 @@ void vp9_set_vbp_thresholds(VP9_COMP *cpi, int q) { } } -#if CONFIG_VP9_HIGHBITDEPTH -#define GLOBAL_MOTION 0 -#else -#define GLOBAL_MOTION 1 -#endif - // This function chooses partitioning based on the variance between source and // reconstructed last, where variance is computed for down-sampled inputs. static void choose_partitioning(VP9_COMP *cpi, @@ -564,7 +558,7 @@ static void choose_partitioning(VP9_COMP *cpi, MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi; unsigned int uv_sad; const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); -#if GLOBAL_MOTION + const YV12_BUFFER_CONFIG *yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME); unsigned int y_sad, y_sad_g; BLOCK_SIZE bsize; @@ -576,9 +570,9 @@ static void choose_partitioning(VP9_COMP *cpi, bsize = BLOCK_64X32; else bsize = BLOCK_32X32; -#endif + assert(yv12 != NULL); -#if GLOBAL_MOTION + if (yv12_g && yv12_g != yv12) { vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, &cm->frame_refs[GOLDEN_FRAME - 1].sf); @@ -589,7 +583,7 @@ static void choose_partitioning(VP9_COMP *cpi, } else { y_sad_g = UINT_MAX; } -#endif + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, &cm->frame_refs[LAST_FRAME - 1].sf); mbmi->ref_frame[0] = LAST_FRAME; @@ -597,7 +591,7 @@ static void choose_partitioning(VP9_COMP *cpi, mbmi->sb_type = BLOCK_64X64; mbmi->mv[0].as_int = 0; mbmi->interp_filter = BILINEAR; -#if GLOBAL_MOTION + y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize); if (y_sad_g < y_sad) { vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, @@ -608,29 +602,21 @@ static void choose_partitioning(VP9_COMP *cpi, } else { x->pred_mv[LAST_FRAME] = mbmi->mv[0].as_mv; } -#endif vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64); for (i = 1; i <= 2; ++i) { struct macroblock_plane *p = &x->plane[i]; struct macroblockd_plane *pd = &xd->plane[i]; -#if GLOBAL_MOTION const BLOCK_SIZE bs = get_plane_block_size(bsize, pd); -#else - const BLOCK_SIZE bs = get_plane_block_size(BLOCK_64X64, pd); -#endif + if (bs == BLOCK_INVALID) - uv_sad = INT_MAX; + uv_sad = UINT_MAX; else uv_sad = cpi->fn_ptr[bs].sdf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride); -#if GLOBAL_MOTION - x->color_sensitivity[i - 1] = uv_sad * 4 > y_sad; -#else - x->color_sensitivity[i - 1] = (uv_sad > 512); -#endif + x->color_sensitivity[i - 1] = uv_sad > (y_sad >> 2); } d = xd->plane[0].dst.buf; diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 0730467b0..12882e432 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -1807,6 +1807,13 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, int best_sad; MV this_mv; +#if CONFIG_VP9_HIGHBITDEPTH + tmp_mv->row = 0; + tmp_mv->col = 0; + return cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride, + xd->plane[0].pre[0].buf, ref_stride); +#endif + // Set up prediction 1-D reference set ref_buf = xd->plane[0].pre[0].buf - (bw >> 1); for (idx = 0; idx < search_width; idx += 16) { diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index f457f20b1..e3acc5a82 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -137,10 +137,6 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int cost_list[5]; const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, ref); - if (cpi->common.show_frame && - (x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[LAST_FRAME]) - return rv; - if (scaled_ref_frame) { int i; // Swap out the reference frame for a version that's been scaled to @@ -796,15 +792,43 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, continue; if (this_mode == NEWMV) { - if (ref_frame > LAST_FRAME) - continue; if (cpi->sf.partition_search_type != VAR_BASED_PARTITION && best_rdc.rdcost < (int64_t)(1 << num_pels_log2_lookup[bsize])) continue; - if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, - &frame_mv[NEWMV][ref_frame], - &rate_mv, best_rdc.rdcost)) + + if (ref_frame > LAST_FRAME) { + int tmp_sad; + int dis, cost_list[5]; + + if (bsize < BLOCK_16X16) + continue; + + tmp_sad = vp9_int_pro_motion_estimation(cpi, x, bsize); + if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) + continue; + + frame_mv[NEWMV][ref_frame].as_int = mbmi->mv[0].as_int; + rate_mv = vp9_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv, + &mbmi->ref_mvs[ref_frame][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + frame_mv[NEWMV][ref_frame].as_mv.row >>= 3; + frame_mv[NEWMV][ref_frame].as_mv.col >>= 3; + + cpi->find_fractional_mv_step(x, &frame_mv[NEWMV][ref_frame].as_mv, + &mbmi->ref_mvs[ref_frame][0].as_mv, + cpi->common.allow_high_precision_mv, + x->errorperbit, + &cpi->fn_ptr[bsize], + cpi->sf.mv.subpel_force_stop, + cpi->sf.mv.subpel_iters_per_step, + cond_cost_list(cpi, cost_list), + x->nmvjointcost, x->mvcost, &dis, + &x->pred_sse[ref_frame], NULL, 0, 0); + } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, + &frame_mv[NEWMV][ref_frame], + &rate_mv, best_rdc.rdcost)) { continue; + } } if (this_mode != NEARESTMV && @@ -829,7 +853,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } if ((this_mode == NEWMV || filter_ref == SWITCHABLE) && - pred_filter_search && + pred_filter_search && (ref_frame == LAST_FRAME) && ((mbmi->mv[0].as_mv.row & 0x07) != 0 || (mbmi->mv[0].as_mv.col & 0x07) != 0)) { int pf_rate[3]; @@ -1064,6 +1088,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, mode_idx[INTRA_FRAME][mbmi->mode]; PREDICTION_MODE this_mode; for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) { + if (best_ref_frame != ref_frame) continue; for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { THR_MODES thr_mode_idx = mode_idx[ref_frame][INTER_OFFSET(this_mode)]; int *freq_fact = &tile_data->thresh_freq_fact[bsize][thr_mode_idx]; diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index c7c5e972d..7783f7bdc 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -377,7 +377,7 @@ static double get_rate_correction_factor(const VP9_COMP *cpi) { rcf = rc->rate_correction_factors[INTER_NORMAL]; } rcf *= rcf_mult[rc->frame_size_selector]; - return rcf > MAX_BPB_FACTOR ? MAX_BPB_FACTOR : rcf; + return fclamp(rcf, MIN_BPB_FACTOR, MAX_BPB_FACTOR); } static void set_rate_correction_factor(VP9_COMP *cpi, double factor) { @@ -386,6 +386,8 @@ static void set_rate_correction_factor(VP9_COMP *cpi, double factor) { // Normalize RCF to account for the size-dependent scaling factor. factor /= rcf_mult[cpi->rc.frame_size_selector]; + factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR); + if (cpi->common.frame_type == KEY_FRAME) { rc->rate_correction_factors[KF_STD] = factor; } else if (cpi->oxcf.pass == 2) { @@ -754,7 +756,7 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi, static int get_active_cq_level(const RATE_CONTROL *rc, const VP9EncoderConfig *const oxcf) { - static const double cq_adjust_threshold = 0.5; + static const double cq_adjust_threshold = 0.1; int active_cq_level = oxcf->cq_level; if (oxcf->rc_mode == VPX_CQ && rc->total_target_bits > 0) { @@ -1687,7 +1689,7 @@ void vp9_set_target_rate(VP9_COMP *cpi) { int target_rate = rc->base_frame_target; // Correction to rate target based on prior over or under shoot. - if (cpi->oxcf.rc_mode == VPX_VBR) + if (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.rc_mode == VPX_CQ) vbr_rate_correction(cpi, &target_rate, rc->vbr_bits_off_target); vp9_rc_set_frame_target(cpi, target_rate); } diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vp9/encoder/x86/vp9_avg_intrin_sse2.c index f49949940..618b5f73d 100644 --- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c +++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c @@ -61,7 +61,7 @@ void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref, const int ref_stride, const int height) { int idx; __m128i zero = _mm_setzero_si128(); - __m128i src_line = _mm_load_si128((const __m128i *)ref); + __m128i src_line = _mm_loadu_si128((const __m128i *)ref); __m128i s0 = _mm_unpacklo_epi8(src_line, zero); __m128i s1 = _mm_unpackhi_epi8(src_line, zero); __m128i t0, t1; @@ -69,14 +69,14 @@ void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref, ref += ref_stride; for (idx = 1; idx < height_1; idx += 2) { - src_line = _mm_load_si128((const __m128i *)ref); + src_line = _mm_loadu_si128((const __m128i *)ref); t0 = _mm_unpacklo_epi8(src_line, zero); t1 = _mm_unpackhi_epi8(src_line, zero); s0 = _mm_adds_epu16(s0, t0); s1 = _mm_adds_epu16(s1, t1); ref += ref_stride; - src_line = _mm_load_si128((const __m128i *)ref); + src_line = _mm_loadu_si128((const __m128i *)ref); t0 = _mm_unpacklo_epi8(src_line, zero); t1 = _mm_unpackhi_epi8(src_line, zero); s0 = _mm_adds_epu16(s0, t0); @@ -84,7 +84,7 @@ void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref, ref += ref_stride; } - src_line = _mm_load_si128((const __m128i *)ref); + src_line = _mm_loadu_si128((const __m128i *)ref); t0 = _mm_unpacklo_epi8(src_line, zero); t1 = _mm_unpackhi_epi8(src_line, zero); s0 = _mm_adds_epu16(s0, t0); @@ -101,9 +101,9 @@ void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref, s1 = _mm_srai_epi16(s1, 3); } - _mm_store_si128((__m128i *)hbuf, s0); + _mm_storeu_si128((__m128i *)hbuf, s0); hbuf += 8; - _mm_store_si128((__m128i *)hbuf, s1); + _mm_storeu_si128((__m128i *)hbuf, s1); } int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) { diff --git a/vp9/encoder/x86/vp9_dct_ssse3.c b/vp9/encoder/x86/vp9_dct_ssse3.c index 5c0ad7892..a1a2bda80 100644 --- a/vp9/encoder/x86/vp9_dct_ssse3.c +++ b/vp9/encoder/x86/vp9_dct_ssse3.c @@ -102,99 +102,56 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, const __m128i r2 = _mm_sub_epi16(q1, q2); const __m128i r3 = _mm_sub_epi16(q0, q3); // Interleave to do the multiply by constants which gets us into 32bits - const __m128i t0 = _mm_add_epi16(r0, r1); - const __m128i t1 = _mm_sub_epi16(r0, r1); + const __m128i t0 = _mm_unpacklo_epi16(r0, r1); + const __m128i t1 = _mm_unpackhi_epi16(r0, r1); const __m128i t2 = _mm_unpacklo_epi16(r2, r3); const __m128i t3 = _mm_unpackhi_epi16(r2, r3); - const __m128i u0 = _mm_mulhrs_epi16(t0, k__dual_p16_p16); - const __m128i u1 = _mm_mulhrs_epi16(t1, k__dual_p16_p16); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); + const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); // dct_const_round_shift - const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); - const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); - const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - // Combine - res0 = u0; - res4 = u1; - res2 = _mm_packs_epi32(w4, w5); - res6 = _mm_packs_epi32(w6, w7); - } - // Work on next four results - if (pass == 1) { - // Interleave to do the multiply by constants which gets us into 32bits - const __m128i d0 = _mm_unpacklo_epi16(q6, q5); - const __m128i d1 = _mm_unpackhi_epi16(q6, q5); - const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); - const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); - const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); - const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); - const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); - const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); - const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); - const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); - const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); - const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); - const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); - // Combine - const __m128i r0 = _mm_packs_epi32(s0, s1); - const __m128i r1 = _mm_packs_epi32(s2, s3); - // Add/subtract - const __m128i x0 = _mm_add_epi16(q4, r0); - const __m128i x1 = _mm_sub_epi16(q4, r0); - const __m128i x2 = _mm_sub_epi16(q7, r1); - const __m128i x3 = _mm_add_epi16(q7, r1); - // Interleave to do the multiply by constants which gets us into 32bits - const __m128i t0 = _mm_unpacklo_epi16(x0, x3); - const __m128i t1 = _mm_unpackhi_epi16(x0, x3); - const __m128i t2 = _mm_unpacklo_epi16(x1, x2); - const __m128i t3 = _mm_unpackhi_epi16(x1, x2); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); - const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); - const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); - const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); - const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); - const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); - const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); - // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); // Combine - res1 = _mm_packs_epi32(w0, w1); - res7 = _mm_packs_epi32(w2, w3); - res5 = _mm_packs_epi32(w4, w5); - res3 = _mm_packs_epi32(w6, w7); - } else { + + res0 = _mm_packs_epi32(w0, w1); + res4 = _mm_packs_epi32(w2, w3); + res2 = _mm_packs_epi32(w4, w5); + res6 = _mm_packs_epi32(w6, w7); + } + // Work on next four results + { // Interleave to do the multiply by constants which gets us into 32bits const __m128i d0 = _mm_sub_epi16(q6, q5); const __m128i d1 = _mm_add_epi16(q6, q5); const __m128i r0 = _mm_mulhrs_epi16(d0, k__dual_p16_p16); const __m128i r1 = _mm_mulhrs_epi16(d1, k__dual_p16_p16); + // Add/subtract const __m128i x0 = _mm_add_epi16(q4, r0); const __m128i x1 = _mm_sub_epi16(q4, r0); |