Diffstat (limited to 'vp9')
-rw-r--r--  vp9/common/vp9_blockd.c                 | 42
-rw-r--r--  vp9/encoder/vp9_aq_cyclicrefresh.h      |  2
-rw-r--r--  vp9/encoder/vp9_encodeframe.c           | 30
-rw-r--r--  vp9/encoder/vp9_mcomp.c                 |  7
-rw-r--r--  vp9/encoder/vp9_pickmode.c              | 45
-rw-r--r--  vp9/encoder/vp9_ratectrl.c              |  8
-rw-r--r--  vp9/encoder/x86/vp9_avg_intrin_sse2.c   | 12
-rw-r--r--  vp9/encoder/x86/vp9_dct_ssse3.c         | 83
8 files changed, 96 insertions(+), 133 deletions(-)
diff --git a/vp9/common/vp9_blockd.c b/vp9/common/vp9_blockd.c
index 7094a0118..3cd9f44e9 100644
--- a/vp9/common/vp9_blockd.c
+++ b/vp9/common/vp9_blockd.c
@@ -50,39 +50,25 @@ void vp9_foreach_transformed_block_in_plane(
const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
const int step = 1 << (tx_size << 1);
- int i;
+ int i = 0, r, c;
// If mb_to_right_edge is < 0 we are in a situation in which
// the current block size extends into the UMV and we won't
// visit the sub blocks that are wholly within the UMV.
- if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) {
- int r, c;
-
- int max_blocks_wide = num_4x4_w;
- int max_blocks_high = num_4x4_h;
-
- // xd->mb_to_right_edge is in units of pixels * 8. This converts
- // it to 4x4 block sizes.
- if (xd->mb_to_right_edge < 0)
- max_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
-
- if (xd->mb_to_bottom_edge < 0)
- max_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
-
- i = 0;
- // Unlike the normal case - in here we have to keep track of the
- // row and column of the blocks we use so that we know if we are in
- // the unrestricted motion border.
- for (r = 0; r < num_4x4_h; r += (1 << tx_size)) {
- for (c = 0; c < num_4x4_w; c += (1 << tx_size)) {
- if (r < max_blocks_high && c < max_blocks_wide)
- visit(plane, i, plane_bsize, tx_size, arg);
- i += step;
- }
+ const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 :
+ xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+ const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 :
+ xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+
+ // Keep track of the row and column of the blocks we use so that we know
+ // if we are in the unrestricted motion border.
+ for (r = 0; r < max_blocks_high; r += (1 << tx_size)) {
+ for (c = 0; c < num_4x4_w; c += (1 << tx_size)) {
+ // Skip visiting the sub blocks that are wholly within the UMV.
+ if (c < max_blocks_wide)
+ visit(plane, i, plane_bsize, tx_size, arg);
+ i += step;
}
- } else {
- for (i = 0; i < num_4x4_w * num_4x4_h; i += step)
- visit(plane, i, plane_bsize, tx_size, arg);
}
}
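The refactor above folds the UMV clamping into max_blocks_wide/max_blocks_high up front: mb_to_right_edge and mb_to_bottom_edge are in units of pixels * 8, so shifting by (5 + subsampling) converts them to 4x4-block units. A minimal standalone sketch of that conversion (the helper name and numeric values below are hypothetical, not part of the patch):

#include <stdio.h>

/* Hypothetical helper mirroring the clamp in
 * vp9_foreach_transformed_block_in_plane: mb_to_right_edge is in units of
 * pixels * 8, so >> 3 gives pixels and a further >> 2 gives 4x4 blocks,
 * i.e. >> 5 overall, plus the plane's horizontal subsampling. */
static int clamp_blocks_wide(int num_4x4_w, int mb_to_right_edge,
                             int subsampling_x) {
  if (mb_to_right_edge >= 0) return num_4x4_w;
  return num_4x4_w + (mb_to_right_edge >> (5 + subsampling_x));
}

int main(void) {
  /* A 64-pixel-wide luma block (16 4x4 columns) overhanging the frame by
   * 24 pixels: mb_to_right_edge = -24 * 8 = -192, subsampling_x = 0,
   * so 6 columns are dropped. */
  printf("%d\n", clamp_blocks_wide(16, -192, 0));  /* prints 10 */
  return 0;
}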
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.h b/vp9/encoder/vp9_aq_cyclicrefresh.h
index 048a0ed35..4df1b588c 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.h
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.h
@@ -65,7 +65,7 @@ void vp9_cyclic_refresh_update_map(struct VP9_COMP *const cpi);
void vp9_cyclic_refresh_postencode(struct VP9_COMP *const cpi);
// Set golden frame update interval, for non-svc 1 pass CBR mode.
-void vp9_cyclic_refresh_set_golden_update(struct VP9_COMP *cpi);
+void vp9_cyclic_refresh_set_golden_update(struct VP9_COMP *const cpi);
// Check if we should not update golden reference, based on past refresh stats.
void vp9_cyclic_refresh_check_golden_update(struct VP9_COMP *const cpi);
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 5c841cfdc..cf8ac0a22 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -514,12 +514,6 @@ void vp9_set_vbp_thresholds(VP9_COMP *cpi, int q) {
}
}
-#if CONFIG_VP9_HIGHBITDEPTH
-#define GLOBAL_MOTION 0
-#else
-#define GLOBAL_MOTION 1
-#endif
-
// This function chooses partitioning based on the variance between source and
// reconstructed last, where variance is computed for down-sampled inputs.
static void choose_partitioning(VP9_COMP *cpi,
@@ -564,7 +558,7 @@ static void choose_partitioning(VP9_COMP *cpi,
MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
unsigned int uv_sad;
const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
-#if GLOBAL_MOTION
+
const YV12_BUFFER_CONFIG *yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
unsigned int y_sad, y_sad_g;
BLOCK_SIZE bsize;
@@ -576,9 +570,9 @@ static void choose_partitioning(VP9_COMP *cpi,
bsize = BLOCK_64X32;
else
bsize = BLOCK_32X32;
-#endif
+
assert(yv12 != NULL);
-#if GLOBAL_MOTION
+
if (yv12_g && yv12_g != yv12) {
vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
&cm->frame_refs[GOLDEN_FRAME - 1].sf);
@@ -589,7 +583,7 @@ static void choose_partitioning(VP9_COMP *cpi,
} else {
y_sad_g = UINT_MAX;
}
-#endif
+
vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
&cm->frame_refs[LAST_FRAME - 1].sf);
mbmi->ref_frame[0] = LAST_FRAME;
@@ -597,7 +591,7 @@ static void choose_partitioning(VP9_COMP *cpi,
mbmi->sb_type = BLOCK_64X64;
mbmi->mv[0].as_int = 0;
mbmi->interp_filter = BILINEAR;
-#if GLOBAL_MOTION
+
y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize);
if (y_sad_g < y_sad) {
vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
@@ -608,29 +602,21 @@ static void choose_partitioning(VP9_COMP *cpi,
} else {
x->pred_mv[LAST_FRAME] = mbmi->mv[0].as_mv;
}
-#endif
vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
for (i = 1; i <= 2; ++i) {
struct macroblock_plane *p = &x->plane[i];
struct macroblockd_plane *pd = &xd->plane[i];
-#if GLOBAL_MOTION
const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
-#else
- const BLOCK_SIZE bs = get_plane_block_size(BLOCK_64X64, pd);
-#endif
+
if (bs == BLOCK_INVALID)
- uv_sad = INT_MAX;
+ uv_sad = UINT_MAX;
else
uv_sad = cpi->fn_ptr[bs].sdf(p->src.buf, p->src.stride,
pd->dst.buf, pd->dst.stride);
-#if GLOBAL_MOTION
- x->color_sensitivity[i - 1] = uv_sad * 4 > y_sad;
-#else
- x->color_sensitivity[i - 1] = (uv_sad > 512);
-#endif
+ x->color_sensitivity[i - 1] = uv_sad > (y_sad >> 2);
}
d = xd->plane[0].dst.buf;
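The rewritten color_sensitivity test compares the chroma SAD against a quarter of the luma SAD; for values that do not overflow, uv_sad > (y_sad >> 2) gives the same result as uv_sad * 4 > y_sad while avoiding the possible unsigned overflow of the multiply. A small sketch with hypothetical SAD values:

#include <stdio.h>

int main(void) {
  unsigned int uv_sad = 600, y_sad = 2300;
  int sensitive_mul   = uv_sad * 4 > y_sad;       /* 2400 > 2300 -> 1 */
  int sensitive_shift = uv_sad > (y_sad >> 2);    /*  600 >  575 -> 1 */
  printf("%d %d\n", sensitive_mul, sensitive_shift);
  return 0;
}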
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 0730467b0..12882e432 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -1807,6 +1807,13 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
int best_sad;
MV this_mv;
+#if CONFIG_VP9_HIGHBITDEPTH
+ tmp_mv->row = 0;
+ tmp_mv->col = 0;
+ return cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
+ xd->plane[0].pre[0].buf, ref_stride);
+#endif
+
// Set up prediction 1-D reference set
ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);
for (idx = 0; idx < search_width; idx += 16) {
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index f457f20b1..e3acc5a82 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -137,10 +137,6 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
int cost_list[5];
const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
ref);
- if (cpi->common.show_frame &&
- (x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[LAST_FRAME])
- return rv;
-
if (scaled_ref_frame) {
int i;
// Swap out the reference frame for a version that's been scaled to
@@ -796,15 +792,43 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
continue;
if (this_mode == NEWMV) {
- if (ref_frame > LAST_FRAME)
- continue;
if (cpi->sf.partition_search_type != VAR_BASED_PARTITION &&
best_rdc.rdcost < (int64_t)(1 << num_pels_log2_lookup[bsize]))
continue;
- if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
- &frame_mv[NEWMV][ref_frame],
- &rate_mv, best_rdc.rdcost))
+
+ if (ref_frame > LAST_FRAME) {
+ int tmp_sad;
+ int dis, cost_list[5];
+
+ if (bsize < BLOCK_16X16)
+ continue;
+
+ tmp_sad = vp9_int_pro_motion_estimation(cpi, x, bsize);
+ if (tmp_sad > x->pred_mv_sad[LAST_FRAME])
+ continue;
+
+ frame_mv[NEWMV][ref_frame].as_int = mbmi->mv[0].as_int;
+ rate_mv = vp9_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv,
+ &mbmi->ref_mvs[ref_frame][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+ frame_mv[NEWMV][ref_frame].as_mv.row >>= 3;
+ frame_mv[NEWMV][ref_frame].as_mv.col >>= 3;
+
+ cpi->find_fractional_mv_step(x, &frame_mv[NEWMV][ref_frame].as_mv,
+ &mbmi->ref_mvs[ref_frame][0].as_mv,
+ cpi->common.allow_high_precision_mv,
+ x->errorperbit,
+ &cpi->fn_ptr[bsize],
+ cpi->sf.mv.subpel_force_stop,
+ cpi->sf.mv.subpel_iters_per_step,
+ cond_cost_list(cpi, cost_list),
+ x->nmvjointcost, x->mvcost, &dis,
+ &x->pred_sse[ref_frame], NULL, 0, 0);
+ } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
+ &frame_mv[NEWMV][ref_frame],
+ &rate_mv, best_rdc.rdcost)) {
continue;
+ }
}
if (this_mode != NEARESTMV &&
@@ -829,7 +853,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
}
if ((this_mode == NEWMV || filter_ref == SWITCHABLE) &&
- pred_filter_search &&
+ pred_filter_search && (ref_frame == LAST_FRAME) &&
((mbmi->mv[0].as_mv.row & 0x07) != 0 ||
(mbmi->mv[0].as_mv.col & 0x07) != 0)) {
int pf_rate[3];
@@ -1064,6 +1088,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
mode_idx[INTRA_FRAME][mbmi->mode];
PREDICTION_MODE this_mode;
for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
+ if (best_ref_frame != ref_frame) continue;
for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
THR_MODES thr_mode_idx = mode_idx[ref_frame][INTER_OFFSET(this_mode)];
int *freq_fact = &tile_data->thresh_freq_fact[bsize][thr_mode_idx];
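In the new GOLDEN_FRAME NEWMV path above, the coarse motion vector is stored in 1/8-pel units (as VP9 motion vectors are), so it is shifted down by 3 to full-pel before the sub-pel search refines it again. A simplified illustration of that unit conversion, using a stand-in MV struct and hypothetical values:

#include <stdio.h>

typedef struct { short row, col; } MV;

int main(void) {
  MV mv = { 40, 28 };  /* 1/8-pel units: 5.0 and 3.5 pels */
  mv.row >>= 3;        /* full-pel row: 5 */
  mv.col >>= 3;        /* full-pel col: 3 (fraction dropped; the sub-pel
                          search restores sub-pel precision afterwards) */
  printf("%d %d\n", mv.row, mv.col);
  return 0;
}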
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index c7c5e972d..7783f7bdc 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -377,7 +377,7 @@ static double get_rate_correction_factor(const VP9_COMP *cpi) {
rcf = rc->rate_correction_factors[INTER_NORMAL];
}
rcf *= rcf_mult[rc->frame_size_selector];
- return rcf > MAX_BPB_FACTOR ? MAX_BPB_FACTOR : rcf;
+ return fclamp(rcf, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
}
static void set_rate_correction_factor(VP9_COMP *cpi, double factor) {
@@ -386,6 +386,8 @@ static void set_rate_correction_factor(VP9_COMP *cpi, double factor) {
// Normalize RCF to account for the size-dependent scaling factor.
factor /= rcf_mult[cpi->rc.frame_size_selector];
+ factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
+
if (cpi->common.frame_type == KEY_FRAME) {
rc->rate_correction_factors[KF_STD] = factor;
} else if (cpi->oxcf.pass == 2) {
@@ -754,7 +756,7 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi,
static int get_active_cq_level(const RATE_CONTROL *rc,
const VP9EncoderConfig *const oxcf) {
- static const double cq_adjust_threshold = 0.5;
+ static const double cq_adjust_threshold = 0.1;
int active_cq_level = oxcf->cq_level;
if (oxcf->rc_mode == VPX_CQ &&
rc->total_target_bits > 0) {
@@ -1687,7 +1689,7 @@ void vp9_set_target_rate(VP9_COMP *cpi) {
int target_rate = rc->base_frame_target;
// Correction to rate target based on prior over or under shoot.
- if (cpi->oxcf.rc_mode == VPX_VBR)
+ if (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.rc_mode == VPX_CQ)
vbr_rate_correction(cpi, &target_rate, rc->vbr_bits_off_target);
vp9_rc_set_frame_target(cpi, target_rate);
}
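With this change the rate correction factor is clamped to [MIN_BPB_FACTOR, MAX_BPB_FACTOR] on both the read and write paths via fclamp, rather than only being capped at the maximum when read. A minimal stand-in for that clamp (the bound values below are illustrative, not the library's definitions):

#include <stdio.h>

static double fclamp_sketch(double value, double low, double high) {
  return value < low ? low : (value > high ? high : value);
}

int main(void) {
  const double kMinBpbFactor = 0.005;  /* assumed value for illustration */
  const double kMaxBpbFactor = 50.0;   /* assumed value for illustration */
  printf("%f\n", fclamp_sketch(120.0, kMinBpbFactor, kMaxBpbFactor)); /* 50.000000 */
  printf("%f\n", fclamp_sketch(0.001, kMinBpbFactor, kMaxBpbFactor)); /* 0.005000 */
  return 0;
}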
diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
index f49949940..618b5f73d 100644
--- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
@@ -61,7 +61,7 @@ void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
const int ref_stride, const int height) {
int idx;
__m128i zero = _mm_setzero_si128();
- __m128i src_line = _mm_load_si128((const __m128i *)ref);
+ __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
__m128i s0 = _mm_unpacklo_epi8(src_line, zero);
__m128i s1 = _mm_unpackhi_epi8(src_line, zero);
__m128i t0, t1;
@@ -69,14 +69,14 @@ void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
ref += ref_stride;
for (idx = 1; idx < height_1; idx += 2) {
- src_line = _mm_load_si128((const __m128i *)ref);
+ src_line = _mm_loadu_si128((const __m128i *)ref);
t0 = _mm_unpacklo_epi8(src_line, zero);
t1 = _mm_unpackhi_epi8(src_line, zero);
s0 = _mm_adds_epu16(s0, t0);
s1 = _mm_adds_epu16(s1, t1);
ref += ref_stride;
- src_line = _mm_load_si128((const __m128i *)ref);
+ src_line = _mm_loadu_si128((const __m128i *)ref);
t0 = _mm_unpacklo_epi8(src_line, zero);
t1 = _mm_unpackhi_epi8(src_line, zero);
s0 = _mm_adds_epu16(s0, t0);
@@ -84,7 +84,7 @@ void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
ref += ref_stride;
}
- src_line = _mm_load_si128((const __m128i *)ref);
+ src_line = _mm_loadu_si128((const __m128i *)ref);
t0 = _mm_unpacklo_epi8(src_line, zero);
t1 = _mm_unpackhi_epi8(src_line, zero);
s0 = _mm_adds_epu16(s0, t0);
@@ -101,9 +101,9 @@ void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
s1 = _mm_srai_epi16(s1, 3);
}
- _mm_store_si128((__m128i *)hbuf, s0);
+ _mm_storeu_si128((__m128i *)hbuf, s0);
hbuf += 8;
- _mm_store_si128((__m128i *)hbuf, s1);
+ _mm_storeu_si128((__m128i *)hbuf, s1);
}
int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) {
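The switch from _mm_load_si128/_mm_store_si128 to their unaligned counterparts matters because the aligned forms require 16-byte-aligned pointers and can fault otherwise, while ref advances by an arbitrary ref_stride between rows. A short sketch of the difference (the buffer and offset are hypothetical):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint8_t buf[32];
  for (int i = 0; i < 32; ++i) buf[i] = (uint8_t)i;
  /* A row pointer that lands on an unaligned address, e.g. buf + 1. */
  const uint8_t *ref = buf + 1;
  __m128i v = _mm_loadu_si128((const __m128i *)ref);  /* safe for any alignment */
  int16_t out[8];
  /* Widen the low 8 bytes to 16 bits, as the row-projection code does. */
  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi8(v, _mm_setzero_si128()));
  printf("%d %d\n", out[0], out[7]);  /* prints: 1 8 */
  return 0;
}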
diff --git a/vp9/encoder/x86/vp9_dct_ssse3.c b/vp9/encoder/x86/vp9_dct_ssse3.c
index 5c0ad7892..a1a2bda80 100644
--- a/vp9/encoder/x86/vp9_dct_ssse3.c
+++ b/vp9/encoder/x86/vp9_dct_ssse3.c
@@ -102,99 +102,56 @@ void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride,
const __m128i r2 = _mm_sub_epi16(q1, q2);
const __m128i r3 = _mm_sub_epi16(q0, q3);
// Interleave to do the multiply by constants which gets us into 32bits
- const __m128i t0 = _mm_add_epi16(r0, r1);
- const __m128i t1 = _mm_sub_epi16(r0, r1);
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
- const __m128i u0 = _mm_mulhrs_epi16(t0, k__dual_p16_p16);
- const __m128i u1 = _mm_mulhrs_epi16(t1, k__dual_p16_p16);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+
const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
// dct_const_round_shift
- const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
- const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
- const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
- const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
- const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
- // Combine
- res0 = u0;
- res4 = u1;
- res2 = _mm_packs_epi32(w4, w5);
- res6 = _mm_packs_epi32(w6, w7);
- }
- // Work on next four results
- if (pass == 1) {
- // Interleave to do the multiply by constants which gets us into 32bits
- const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
- const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
- const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
- const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
- const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
- const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
- const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
- const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
- const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
- const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
- const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
- const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
- const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
- // Combine
- const __m128i r0 = _mm_packs_epi32(s0, s1);
- const __m128i r1 = _mm_packs_epi32(s2, s3);
- // Add/subtract
- const __m128i x0 = _mm_add_epi16(q4, r0);
- const __m128i x1 = _mm_sub_epi16(q4, r0);
- const __m128i x2 = _mm_sub_epi16(q7, r1);
- const __m128i x3 = _mm_add_epi16(q7, r1);
- // Interleave to do the multiply by constants which gets us into 32bits
- const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
- const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
- const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
- const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
- const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
- const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
- const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
- const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
- const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
- // dct_const_round_shift
+
const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+
const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+
const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
// Combine
- res1 = _mm_packs_epi32(w0, w1);
- res7 = _mm_packs_epi32(w2, w3);
- res5 = _mm_packs_epi32(w4, w5);
- res3 = _mm_packs_epi32(w6, w7);
- } else {
+
+ res0 = _mm_packs_epi32(w0, w1);
+ res4 = _mm_packs_epi32(w2, w3);
+ res2 = _mm_packs_epi32(w4, w5);
+ res6 = _mm_packs_epi32(w6, w7);
+ }
+ // Work on next four results
+ {
// Interleave to do the multiply by constants which gets us into 32bits
const __m128i d0 = _mm_sub_epi16(q6, q5);
const __m128i d1 = _mm_add_epi16(q6, q5);
const __m128i r0 = _mm_mulhrs_epi16(d0, k__dual_p16_p16);
const __m128i r1 = _mm_mulhrs_epi16(d1, k__dual_p16_p16);
+
// Add/subtract
const __m128i x0 = _mm_add_epi16(q4, r0);
const __m128i x1 = _mm_sub_epi16(q4, r0);