Diffstat (limited to 'vp9')
-rw-r--r--  vp9/common/arm/neon/vp9_loopfilter_16_neon.asm | 25
-rw-r--r--  vp9/common/arm/neon/vp9_loopfilter_16_neon.c | 42
-rw-r--r--  vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c | 55
-rw-r--r--  vp9/common/vp9_alloccommon.c | 6
-rw-r--r--  vp9/common/vp9_blockd.h | 42
-rw-r--r--  vp9/common/vp9_entropy.c | 85
-rw-r--r--  vp9/common/vp9_entropy.h | 8
-rw-r--r--  vp9/common/vp9_entropymode.c | 14
-rw-r--r--  vp9/common/vp9_entropymode.h | 11
-rw-r--r--  vp9/common/vp9_entropymv.c | 13
-rw-r--r--  vp9/common/vp9_entropymv.h | 16
-rw-r--r--  vp9/common/vp9_enums.h | 20
-rw-r--r--  vp9/common/vp9_findnearmv.c | 10
-rw-r--r--  vp9/common/vp9_findnearmv.h | 28
-rw-r--r--  vp9/common/vp9_loopfilter.c | 54
-rw-r--r--  vp9/common/vp9_loopfilter_filters.c | 81
-rw-r--r--  vp9/common/vp9_onyx.h | 1
-rw-r--r--  vp9/common/vp9_onyxc_int.h | 12
-rw-r--r--  vp9/common/vp9_reconinter.c | 94
-rw-r--r--  vp9/common/vp9_reconinter.h | 3
-rw-r--r--  vp9/common/vp9_rtcd_defs.sh | 29
-rw-r--r--  vp9/common/x86/vp9_idct_intrin_sse2.c | 1857
-rw-r--r--  vp9/common/x86/vp9_loopfilter_intrin_sse2.c | 182
-rw-r--r--  vp9/decoder/vp9_dboolhuff.c | 32
-rw-r--r--  vp9/decoder/vp9_dboolhuff.h | 58
-rw-r--r--  vp9/decoder/vp9_decodeframe.c | 131
-rw-r--r--  vp9/decoder/vp9_decodemv.c | 34
-rw-r--r--  vp9/decoder/vp9_detokenize.c | 55
-rw-r--r--  vp9/decoder/vp9_detokenize.h | 3
-rw-r--r--  vp9/decoder/vp9_onyxd_if.c | 1
-rw-r--r--  vp9/decoder/vp9_onyxd_int.h | 3
-rw-r--r--  vp9/decoder/vp9_treereader.h | 30
-rw-r--r--  vp9/encoder/vp9_bitstream.c | 59
-rw-r--r--  vp9/encoder/vp9_block.h | 1
-rw-r--r--  vp9/encoder/vp9_encodeframe.c | 140
-rw-r--r--  vp9/encoder/vp9_encodemb.c | 56
-rw-r--r--  vp9/encoder/vp9_encodemv.c | 140
-rw-r--r--  vp9/encoder/vp9_encodemv.h | 2
-rw-r--r--  vp9/encoder/vp9_firstpass.c | 53
-rw-r--r--  vp9/encoder/vp9_modecosts.c | 43
-rw-r--r--  vp9/encoder/vp9_modecosts.h | 17
-rw-r--r--  vp9/encoder/vp9_onyx_if.c | 302
-rw-r--r--  vp9/encoder/vp9_onyx_int.h | 9
-rw-r--r--  vp9/encoder/vp9_quantize.c | 41
-rw-r--r--  vp9/encoder/vp9_quantize.h | 2
-rw-r--r--  vp9/encoder/vp9_ratectrl.c | 244
-rw-r--r--  vp9/encoder/vp9_ratectrl.h | 49
-rw-r--r--  vp9/encoder/vp9_rdopt.c | 363
-rw-r--r--  vp9/encoder/vp9_tokenize.c | 83
-rw-r--r--  vp9/encoder/vp9_tokenize.h | 4
-rw-r--r--  vp9/encoder/vp9_treewriter.h | 5
-rw-r--r--  vp9/encoder/x86/vp9_dct32x32_avx2.c | 2710
-rw-r--r--  vp9/encoder/x86/vp9_dct_avx2.c | 2579
-rw-r--r--  vp9/vp9_common.mk | 2
-rw-r--r--  vp9/vp9cx.mk | 5
-rw-r--r--  vp9/vp9dx.mk | 1
56 files changed, 7827 insertions, 2118 deletions
diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm b/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm
index e559272cd..751bc74bc 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm
+++ b/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm
@@ -112,27 +112,27 @@
vabd.u8 q4, q10, q9 ; m6 = abs(q3 - q2)
; only compare the largest value to limit
- vmax.u8 q11, q11, q12 ; m1 = max(m1, m2)
- vmax.u8 q12, q13, q14 ; m2 = max(m3, m4)
+ vmax.u8 q11, q11, q12 ; m7 = max(m1, m2)
+ vmax.u8 q12, q13, q14 ; m8 = max(m3, m4)
vabd.u8 q9, q6, q7 ; abs(p0 - q0)
- vmax.u8 q3, q3, q4 ; m3 = max(m5, m6)
+ vmax.u8 q3, q3, q4 ; m9 = max(m5, m6)
vmov.u8 q10, #0x80
- vmax.u8 q15, q11, q12 ; m1 = max(m1, m2)
+ vmax.u8 q15, q11, q12 ; m10 = max(m7, m8)
vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
- vmax.u8 q15, q15, q3 ; m1 = max(m1, m3)
+ vmax.u8 q15, q15, q3 ; m11 = max(m10, m9)
vabd.u8 q2, q5, q8 ; a = abs(p1 - q1)
vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2
veor q7, q7, q10 ; qs0
- vcge.u8 q15, q1, q15 ; abs(m1) > limit
+ vcge.u8 q15, q1, q15 ; abs(m11) > limit
vshr.u8 q2, q2, #1 ; a = a / 2
veor q6, q6, q10 ; ps0
@@ -142,7 +142,7 @@
veor q8, q8, q10 ; qs1
- vmov.u8 q4, #3
+ vmov.u16 q4, #3
vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
vsubl.s8 q11, d15, d13
@@ -150,13 +150,15 @@
vcge.u8 q9, q0, q9 ; a > blimit
vqsub.s8 q1, q5, q8 ; filter = clamp(ps1-qs1)
- vorr q14, q13, q14 ; hevmask
+ vorr q14, q13, q14 ; hev
vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0)
vmul.i16 q11, q11, q4
vand q1, q1, q14 ; filter &= hev
- vand q15, q15, q9 ; filter_mask
+ vand q15, q15, q9 ; mask
+
+ vmov.u8 q4, #3
vaddw.s8 q2, q2, d2 ; filter + 3 * (qs0 - ps0)
vaddw.s8 q11, q11, d3
@@ -180,15 +182,14 @@
; outer tap adjustments
vrshr.s8 q1, q1, #1 ; filter = ++filter1 >> 1
- veor q6, q11, q10 ; *op0 = u^0x80
+ veor q7, q0, q10 ; *oq0 = u^0x80
vbic q1, q1, q14 ; filter &= ~hev
vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + filter)
vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - filter)
-
- veor q7, q0, q10 ; *oq0 = u^0x80
+ veor q6, q11, q10 ; *op0 = u^0x80
veor q5, q13, q10 ; *op1 = u^0x80
veor q8, q12, q10 ; *oq1 = u^0x80
diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
index 2f022dc1d..b97e7aa4a 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
+++ b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
@@ -10,17 +10,6 @@
#include "./vp9_rtcd.h"
-void vp9_loop_filter_horizontal_edge_16_neon(uint8_t *s, int p /* pitch */,
- const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1) {
- vp9_loop_filter_horizontal_edge(s, p, blimit0, limit0, thresh0, 1);
- vp9_loop_filter_horizontal_edge(s + 8, p, blimit1, limit1, thresh1, 1);
-}
-
void vp9_mbloop_filter_horizontal_edge_16_neon(uint8_t *s, int p /* pitch */,
const uint8_t *blimit0,
const uint8_t *limit0,
@@ -31,3 +20,34 @@ void vp9_mbloop_filter_horizontal_edge_16_neon(uint8_t *s, int p /* pitch */,
vp9_mbloop_filter_horizontal_edge(s, p, blimit0, limit0, thresh0, 1);
vp9_mbloop_filter_horizontal_edge(s + 8, p, blimit1, limit1, thresh1, 1);
}
+
+void vp9_loop_filter_vertical_edge_16_neon(uint8_t *s, int p,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vp9_loop_filter_vertical_edge_neon(s, p, blimit0, limit0, thresh0, 1);
+ vp9_loop_filter_vertical_edge_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
+}
+
+void vp9_mbloop_filter_vertical_edge_16_neon(uint8_t *s, int p,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vp9_mbloop_filter_vertical_edge_neon(s, p, blimit0, limit0, thresh0, 1);
+ vp9_mbloop_filter_vertical_edge_neon(s + 8 * p, p, blimit1, limit1, thresh1,
+ 1);
+}
+
+void vp9_mb_lpf_vertical_edge_w_16_neon(uint8_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ vp9_mb_lpf_vertical_edge_w_neon(s, p, blimit, limit, thresh);
+ vp9_mb_lpf_vertical_edge_w_neon(s + 8 * p, p, blimit, limit, thresh);
+}
diff --git a/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c b/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
index 36cfc83c4..0c0f155ae 100644
--- a/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
@@ -306,4 +306,59 @@ void vp9_loop_filter_vertical_edge_dspr2(unsigned char *s,
}
}
}
+
+void vp9_loop_filter_horizontal_edge_16_dspr2(uint8_t *s, int p /* pitch */,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vp9_loop_filter_horizontal_edge_dspr2(s, p, blimit0, limit0, thresh0, 1);
+ vp9_loop_filter_horizontal_edge_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
+}
+
+void vp9_mbloop_filter_horizontal_edge_16_dspr2(uint8_t *s, int p /* pitch */,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vp9_mbloop_filter_horizontal_edge_dspr2(s, p, blimit0, limit0, thresh0, 1);
+ vp9_mbloop_filter_horizontal_edge_dspr2(s + 8, p, blimit1, limit1, thresh1,
+ 1);
+}
+
+void vp9_loop_filter_vertical_edge_16_dspr2(uint8_t *s, int p,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vp9_loop_filter_vertical_edge_dspr2(s, p, blimit0, limit0, thresh0, 1);
+ vp9_loop_filter_vertical_edge_dspr2(s + 8 * p, p, blimit1, limit1, thresh1,
+ 1);
+}
+
+void vp9_mbloop_filter_vertical_edge_16_dspr2(uint8_t *s, int p,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vp9_mbloop_filter_vertical_edge_dspr2(s, p, blimit0, limit0, thresh0, 1);
+ vp9_mbloop_filter_vertical_edge_dspr2(s + 8 * p, p, blimit1, limit1, thresh1,
+ 1);
+}
+
+void vp9_mb_lpf_vertical_edge_w_16_dspr2(uint8_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ vp9_mb_lpf_vertical_edge_w_dspr2(s, p, blimit, limit, thresh);
+ vp9_mb_lpf_vertical_edge_w_dspr2(s + 8 * p, p, blimit, limit, thresh);
+}
#endif // #if HAVE_DSPR2
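
The NEON and DSPR2 additions above all follow the same pairing scheme: each new *_16 entry point runs the existing 8-pixel loop filter twice with independent blimit/limit/thresh sets, stepping 8 pixels along the row for horizontal edges and 8 rows (8 * pitch) down for vertical edges. A minimal C sketch of that scheme, with filter_h_edge() and filter_v_edge() as hypothetical stand-ins for the per-platform single-edge routines:

    #include <stdint.h>

    /* Illustrative sketch only (not part of this patch): the generic shape of
     * a dual-edge "_16" wrapper. filter_h_edge() and filter_v_edge() are
     * hypothetical stand-ins for the per-platform 8-pixel filters. */
    void filter_h_edge(uint8_t *s, int pitch, const uint8_t *blimit,
                       const uint8_t *limit, const uint8_t *thresh, int count);
    void filter_v_edge(uint8_t *s, int pitch, const uint8_t *blimit,
                       const uint8_t *limit, const uint8_t *thresh, int count);

    static void filter_h_edge_16(uint8_t *s, int pitch,
                                 const uint8_t *blimit0, const uint8_t *limit0,
                                 const uint8_t *thresh0, const uint8_t *blimit1,
                                 const uint8_t *limit1, const uint8_t *thresh1) {
      filter_h_edge(s, pitch, blimit0, limit0, thresh0, 1);      /* left 8 pixels */
      filter_h_edge(s + 8, pitch, blimit1, limit1, thresh1, 1);  /* right 8 pixels */
    }

    static void filter_v_edge_16(uint8_t *s, int pitch,
                                 const uint8_t *blimit0, const uint8_t *limit0,
                                 const uint8_t *thresh0, const uint8_t *blimit1,
                                 const uint8_t *limit1, const uint8_t *thresh1) {
      filter_v_edge(s, pitch, blimit0, limit0, thresh0, 1);              /* top 8 rows */
      filter_v_edge(s + 8 * pitch, pitch, blimit1, limit1, thresh1, 1);  /* bottom 8 rows */
    }

The scalar reference versions added to vp9_loopfilter_filters.c further down use the same offsets, so the SIMD wrappers and the C code walk the same pixels.
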
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index d2981601b..f495c29f3 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -200,9 +200,6 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
void vp9_create_common(VP9_COMMON *cm) {
vp9_machine_specific_config(cm);
-
- cm->tx_mode = ONLY_4X4;
- cm->comp_pred_mode = HYBRID_PREDICTION;
}
void vp9_remove_common(VP9_COMMON *cm) {
@@ -211,9 +208,6 @@ void vp9_remove_common(VP9_COMMON *cm) {
void vp9_initialize_common() {
vp9_init_neighbors();
- vp9_coef_tree_initialize();
- vp9_entropy_mode_init();
- vp9_entropy_mv_init();
}
void vp9_update_frame_size(VP9_COMMON *cm) {
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index df963d1cc..993ee7935 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -153,6 +153,34 @@ static INLINE int has_second_ref(const MB_MODE_INFO *mbmi) {
return mbmi->ref_frame[1] > INTRA_FRAME;
}
+static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mi,
+ const MODE_INFO *left_mi, int b) {
+ if (b == 0 || b == 2) {
+ if (!left_mi || is_inter_block(&left_mi->mbmi))
+ return DC_PRED;
+
+ return left_mi->mbmi.sb_type < BLOCK_8X8 ? left_mi->bmi[b + 1].as_mode
+ : left_mi->mbmi.mode;
+ } else {
+ assert(b == 1 || b == 3);
+ return cur_mi->bmi[b - 1].as_mode;
+ }
+}
+
+static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mi,
+ const MODE_INFO *above_mi, int b) {
+ if (b == 0 || b == 1) {
+ if (!above_mi || is_inter_block(&above_mi->mbmi))
+ return DC_PRED;
+
+ return above_mi->mbmi.sb_type < BLOCK_8X8 ? above_mi->bmi[b + 2].as_mode
+ : above_mi->mbmi.mode;
+ } else {
+ assert(b == 2 || b == 3);
+ return cur_mi->bmi[b - 2].as_mode;
+ }
+}
+
enum mv_precision {
MV_PRECISION_Q3,
MV_PRECISION_Q4
@@ -170,7 +198,6 @@ struct buf_2d {
};
struct macroblockd_plane {
- int16_t *qcoeff;
int16_t *dqcoeff;
uint16_t *eobs;
PLANE_TYPE plane_type;
@@ -359,19 +386,6 @@ static INLINE void foreach_transformed_block_uv(
foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg);
}
-static int raster_block_offset(BLOCK_SIZE plane_bsize,
- int raster_block, int stride) {
- const int bw = b_width_log2(plane_bsize);
- const int y = 4 * (raster_block >> bw);
- const int x = 4 * (raster_block & ((1 << bw) - 1));
- return y * stride + x;
-}
-static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize,
- int raster_block, int16_t *base) {
- const int stride = 4 << b_width_log2(plane_bsize);
- return base + raster_block_offset(plane_bsize, raster_block, stride);
-}
-
static void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, int block,
int *x, int *y) {
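
left_block_mode() and above_block_mode(), moved into this header from vp9_findnearmv.h (see the matching deletion further down), index the four 4x4 sub-blocks of an 8x8 block in raster order. A small sketch of the indexing they assume, using hypothetical helper names:

    #include <assert.h>

    /* Sketch only: sub-block raster order inside an 8x8 block, as assumed by
     * left_block_mode() / above_block_mode() when sb_type < BLOCK_8X8:
     *
     *   +---+---+
     *   | 0 | 1 |
     *   +---+---+
     *   | 2 | 3 |
     *   +---+---+
     *
     * Sub-blocks 0 and 2 sit on the left edge, so their left neighbour is the
     * right column of left_mi (bmi[b + 1]); 1 and 3 read bmi[b - 1] of the
     * current block. The "above" case mirrors this with bmi[b + 2] / bmi[b - 2]. */
    static int left_neighbor_within_block(int b) {
      assert(b == 1 || b == 3);  /* only the right column has one */
      return b - 1;
    }

    static int above_neighbor_within_block(int b) {
      assert(b == 2 || b == 3);  /* only the bottom row has one */
      return b - 2;
    }
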
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index 0f978cc95..b35c43fcd 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -113,49 +113,6 @@ DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]) = {
0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5
};
-// Array indices are identical to previously-existing CONTEXT_NODE indices
-const vp9_tree_index vp9_coef_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)] = {
- -DCT_EOB_TOKEN, 2, /* 0 = EOB */
- -ZERO_TOKEN, 4, /* 1 = ZERO */
- -ONE_TOKEN, 6, /* 2 = ONE */
- 8, 12, /* 3 = LOW_VAL */
- -TWO_TOKEN, 10, /* 4 = TWO */
- -THREE_TOKEN, -FOUR_TOKEN, /* 5 = THREE */
- 14, 16, /* 6 = HIGH_LOW */
- -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2, /* 7 = CAT_ONE */
- 18, 20, /* 8 = CAT_THREEFOUR */
- -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4, /* 9 = CAT_THREE */
- -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6 /* 10 = CAT_FIVE */
-};
-
-// Unconstrained Node Tree
-const vp9_tree_index vp9_coef_con_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)] = {
- 2, 6, /* 0 = LOW_VAL */
- -TWO_TOKEN, 4, /* 1 = TWO */
- -THREE_TOKEN, -FOUR_TOKEN, /* 2 = THREE */
- 8, 10, /* 3 = HIGH_LOW */
- -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2, /* 4 = CAT_ONE */
- 12, 14, /* 5 = CAT_THREEFOUR */
- -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4, /* 6 = CAT_THREE */
- -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6 /* 7 = CAT_FIVE */
-};
-
-
-
-struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS];
-
-/* Trees for extra bits. Probabilities are constant and
- do not depend on previously encoded bits */
-
-static const vp9_prob Pcat1[] = { 159};
-static const vp9_prob Pcat2[] = { 165, 145};
-static const vp9_prob Pcat3[] = { 173, 148, 140};
-static const vp9_prob Pcat4[] = { 176, 155, 140, 135};
-static const vp9_prob Pcat5[] = { 180, 157, 141, 134, 130};
-static const vp9_prob Pcat6[] = {
- 254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129
-};
-
const vp9_tree_index vp9_coefmodel_tree[6] = {
-DCT_EOB_MODEL_TOKEN, 2, /* 0 = EOB */
-ZERO_TOKEN, 4, /* 1 = ZERO */
@@ -446,43 +403,6 @@ void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full) {
extend_to_full_distribution(&full[UNCONSTRAINED_NODES], model[PIVOT_NODE]);
}
-static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28];
-
-static void init_bit_tree(vp9_tree_index *p, int n) {
- int i = 0;
-
- while (++i < n) {
- p[0] = p[1] = i << 1;
- p += 2;
- }
-
- p[0] = p[1] = 0;
-}
-
-static void init_bit_trees() {
- init_bit_tree(cat1, 1);
- init_bit_tree(cat2, 2);
- init_bit_tree(cat3, 3);
- init_bit_tree(cat4, 4);
- init_bit_tree(cat5, 5);
- init_bit_tree(cat6, 14);
-}
-
-const vp9_extra_bit vp9_extra_bits[MAX_ENTROPY_TOKENS] = {
- {0, 0, 0, 0}, // ZERO_TOKEN
- {0, 0, 0, 1}, // ONE_TOKEN
- {0, 0, 0, 2}, // TWO_TOKEN
- {0, 0, 0, 3}, // THREE_TOKEN
- {0, 0, 0, 4}, // FOUR_TOKEN
- {cat1, Pcat1, 1, 5}, // DCT_VAL_CATEGORY1
- {cat2, Pcat2, 2, 7}, // DCT_VAL_CATEGORY2
- {cat3, Pcat3, 3, 11}, // DCT_VAL_CATEGORY3
- {cat4, Pcat4, 4, 19}, // DCT_VAL_CATEGORY4
- {cat5, Pcat5, 5, 35}, // DCT_VAL_CATEGORY5
- {cat6, Pcat6, 14, 67}, // DCT_VAL_CATEGORY6
- {0, 0, 0, 0} // DCT_EOB_TOKEN
-};
-
#include "vp9/common/vp9_default_coef_probs.h"
void vp9_default_coef_probs(VP9_COMMON *cm) {
@@ -492,11 +412,6 @@ void vp9_default_coef_probs(VP9_COMMON *cm) {
vp9_copy(cm->fc.coef_probs[TX_32X32], default_coef_probs_32x32);
}
-void vp9_coef_tree_initialize() {
- init_bit_trees();
- vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);
-}
-
#define COEF_COUNT_SAT 24
#define COEF_MAX_UPDATE_FACTOR 112
#define COEF_COUNT_SAT_KEY 24
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 92a6c592a..941b251c3 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -44,15 +44,9 @@
extern DECLARE_ALIGNED(16, const uint8_t,
vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
-extern const vp9_tree_index vp9_coef_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)];
-
-extern const vp9_tree_index vp9_coef_con_tree[];
-
#define DCT_EOB_MODEL_TOKEN 3 /* EOB Extra Bits 0+0 */
extern const vp9_tree_index vp9_coefmodel_tree[];
-extern struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS];
-
typedef struct {
const vp9_tree_index *tree;
const vp9_prob *prob;
@@ -105,8 +99,6 @@ typedef unsigned int vp9_coeff_stats[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
struct VP9Common;
void vp9_default_coef_probs(struct VP9Common *cm);
-
-void vp9_coef_tree_initialize();
void vp9_adapt_coef_probs(struct VP9Common *cm);
static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index 3b2510dcd..265242129 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -232,21 +232,18 @@ const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = {
-D63_PRED, 16, /* 7 = D63_NODE */
-D153_PRED, -D207_PRED /* 8 = D153_NODE */
};
-struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)] = {
-INTER_OFFSET(ZEROMV), 2,
-INTER_OFFSET(NEARESTMV), 4,
-INTER_OFFSET(NEARMV), -INTER_OFFSET(NEWMV)
};
-struct vp9_token vp9_inter_mode_encodings[INTER_MODES];
const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)] = {
-PARTITION_NONE, 2,
-PARTITION_HORZ, 4,
-PARTITION_VERT, -PARTITION_SPLIT
};
-struct vp9_token vp9_partition_encodings[PARTITION_TYPES];
static const vp9_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = {
9, 102, 187, 225
@@ -329,6 +326,7 @@ void vp9_init_mbmode_probs(VP9_COMMON *cm) {
vp9_copy(cm->fc.single_ref_prob, default_single_ref_p);
cm->fc.tx_probs = default_tx_probs;
vp9_copy(cm->fc.mbskip_probs, default_mbskip_probs);
+ vp9_copy(cm->fc.inter_mode_probs, default_inter_mode_probs);
}
const vp9_tree_index vp9_switchable_interp_tree
@@ -336,15 +334,6 @@ const vp9_tree_index vp9_switchable_interp_tree
-EIGHTTAP, 2,
-EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP
};
-struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS];
-
-void vp9_entropy_mode_init() {
- vp9_tokens_from_tree(vp9_intra_mode_encodings, vp9_intra_mode_tree);
- vp9_tokens_from_tree(vp9_switchable_interp_encodings,
- vp9_switchable_interp_tree);
- vp9_tokens_from_tree(vp9_partition_encodings, vp9_partition_tree);
- vp9_tokens_from_tree(vp9_inter_mode_encodings, vp9_inter_mode_tree);
-}
#define COUNT_SAT 20
#define MAX_UPDATE_FACTOR 128
@@ -466,7 +455,6 @@ void vp9_setup_past_independence(VP9_COMMON *cm) {
vp9_default_coef_probs(cm);
vp9_init_mbmode_probs(cm);
vp9_init_mv_probs(cm);
- vp9_copy(cm->fc.inter_mode_probs, default_inter_mode_probs);
if (cm->frame_type == KEY_FRAME ||
cm->error_resilient_mode || cm->reset_frame_context == 3) {
diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h
index 38b419948..df58bea3c 100644
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -37,24 +37,13 @@ struct tx_counts {
extern const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
extern const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
[INTRA_MODES - 1];
-
extern const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
[PARTITION_TYPES - 1];
-
extern const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)];
-extern struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
-
extern const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)];
-extern struct vp9_token vp9_inter_mode_encodings[INTER_MODES];
-
extern const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)];
-extern struct vp9_token vp9_partition_encodings[PARTITION_TYPES];
-
extern const vp9_tree_index vp9_switchable_interp_tree
[TREE_SIZE(SWITCHABLE_FILTERS)];
-extern struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS];
-
-void vp9_entropy_mode_init();
void vp9_setup_past_independence(struct VP9Common *cm);
diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c
index 290dcdd17..60ae79fdc 100644
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c
@@ -23,7 +23,6 @@ const vp9_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)] = {
-MV_JOINT_HNZVZ, 4,
-MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ
};
-struct vp9_token vp9_mv_joint_encodings[MV_JOINTS];
const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = {
-MV_CLASS_0, 2,
@@ -37,19 +36,16 @@ const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = {
-MV_CLASS_7, -MV_CLASS_8,
-MV_CLASS_9, -MV_CLASS_10,
};
-struct vp9_token vp9_mv_class_encodings[MV_CLASSES];
const vp9_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = {
-0, -1,
};
-struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE];
-const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(4)] = {
+const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)] = {
-0, 2,
-1, 4,
-2, -3
};
-struct vp9_token vp9_mv_fp_encodings[4];
static const nmv_context default_nmv_context = {
{32, 64, 96},
@@ -235,13 +231,6 @@ void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) {
}
}
-void vp9_entropy_mv_init() {
- vp9_tokens_from_tree(vp9_mv_joint_encodings, vp9_mv_joint_tree);
- vp9_tokens_from_tree(vp9_mv_class_encodings, vp9_mv_class_tree);
- vp9_tokens_from_tree(vp9_mv_class0_encodings, vp9_mv_class0_tree);
- vp9_tokens_from_tree(vp9_mv_fp_encodings, vp9_mv_fp_tree);
-}
-
void vp9_init_mv_probs(VP9_COMMON *cm) {
cm->fc.nmvc = default_nmv_context;
}
diff --git a/vp9/common/vp9_entropymv.h b/vp9/common/vp9_entropymv.h
index b62f7c42f..3175a1e49 100644
--- a/vp9/common/vp9_entropymv.h
+++ b/vp9/common/vp9_entropymv.h
@@ -18,7 +18,6 @@
struct VP9Common;
-void vp9_entropy_mv_init();
void vp9_init_mv_probs(struct VP9Common *cm);
void vp9_adapt_mv_probs(struct VP9Common *cm, int usehp);
@@ -72,17 +71,10 @@ typedef enum {
#define MV_UPP ((1 << MV_IN_USE_BITS) - 1)
#define MV_LOW (-(1 << MV_IN_USE_BITS))
-extern const vp9_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)];
-extern struct vp9_token vp9_mv_joint_encodings[MV_JOINTS];
-
-extern const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)];
-extern struct vp9_token vp9_mv_class_encodings[MV_CLASSES];
-
-extern const vp9_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)];
-extern struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE];
-
-extern const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)];
-extern struct vp9_token vp9_mv_fp_encodings[4];
+extern const vp9_tree_index vp9_mv_joint_tree[];
+extern const vp9_tree_index vp9_mv_class_tree[];
+extern const vp9_tree_index vp9_mv_class0_tree[];
+extern const vp9_tree_index vp9_mv_fp_tree[];
typedef struct {
vp9_prob sign;
diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h
index 9e4117e17..34411a34f 100644
--- a/vp9/common/vp9_enums.h
+++ b/vp9/common/vp9_enums.h
@@ -52,20 +52,22 @@ typedef enum PARTITION_TYPE {
#define PARTITION_PLOFFSET 4 // number of probability models per block size
#define PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET)
+// block transform size
typedef enum {
- TX_4X4 = 0, // 4x4 dct transform
- TX_8X8 = 1, // 8x8 dct transform
- TX_16X16 = 2, // 16x16 dct transform
- TX_32X32 = 3, // 32x32 dct transform
+ TX_4X4 = 0, // 4x4 transform
+ TX_8X8 = 1, // 8x8 transform
+ TX_16X16 = 2, // 16x16 transform
+ TX_32X32 = 3, // 32x32 transform
TX_SIZES
} TX_SIZE;
+// frame transform mode
typedef enum {
- ONLY_4X4 = 0,
- ALLOW_8X8 = 1,
- ALLOW_16X16 = 2,
- ALLOW_32X32 = 3,
- TX_MODE_SELECT = 4,
+ ONLY_4X4 = 0, // only 4x4 transform used
+ ALLOW_8X8 = 1, // allow block transform size up to 8x8
+ ALLOW_16X16 = 2, // allow block transform size up to 16x16
+ ALLOW_32X32 = 3, // allow block transform size up to 32x32
+ TX_MODE_SELECT = 4, // transform specified for each block
TX_MODES = 5,
} TX_MODE;
diff --git a/vp9/common/vp9_findnearmv.c b/vp9/common/vp9_findnearmv.c
index 66178cd1b..ad97c0277 100644
--- a/vp9/common/vp9_findnearmv.c
+++ b/vp9/common/vp9_findnearmv.c
@@ -63,10 +63,12 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
break;
}
} else {
- int_mv candidates[2 + MAX_MV_REF_CANDIDATES] = { bmi[1].as_mv[ref_idx],
- bmi[0].as_mv[ref_idx],
- mv_list[0],
- mv_list[1] };
+ int_mv candidates[2 + MAX_MV_REF_CANDIDATES];
+ candidates[0] = bmi[1].as_mv[ref_idx];
+ candidates[1] = bmi[0].as_mv[ref_idx];
+ candidates[2] = mv_list[0];
+ candidates[3] = mv_list[1];
+
assert(block_idx == 3);
dst_nearest->as_int = bmi[2].as_mv[ref_idx].as_int;
for (n = 0; n < 2 + MAX_MV_REF_CANDIDATES; ++n) {
diff --git a/vp9/common/vp9_findnearmv.h b/vp9/common/vp9_findnearmv.h
index 2362caa41..e9d4e1171 100644
--- a/vp9/common/vp9_findnearmv.h
+++ b/vp9/common/vp9_findnearmv.h
@@ -41,32 +41,4 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
int block_idx, int ref_idx,
int mi_row, int mi_col);
-static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mi,
- const MODE_INFO *left_mi, int b) {
- if (b == 0 || b == 2) {
- if (!left_mi || is_inter_block(&left_mi->mbmi))
- return DC_PRED;
-
- return left_mi->mbmi.sb_type < BLOCK_8X8 ? left_mi->bmi[b + 1].as_mode
- : left_mi->mbmi.mode;
- } else {
- assert(b == 1 || b == 3);
- return cur_mi->bmi[b - 1].as_mode;
- }
-}
-
-static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mi,
- const MODE_INFO *above_mi, int b) {
- if (b == 0 || b == 1) {
- if (!above_mi || is_inter_block(&above_mi->mbmi))
- return DC_PRED;
-
- return above_mi->mbmi.sb_type < BLOCK_8X8 ? above_mi->bmi[b + 2].as_mode
- : above_mi->mbmi.mode;
- } else {
- assert(b == 2 || b == 3);
- return cur_mi->bmi[b - 2].as_mode;
- }
-}
-
#endif // VP9_COMMON_VP9_FINDNEARMV_H_
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 0b48de2cb..ff2bc45e4 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -353,29 +353,17 @@ static void filter_selectively_vert_row2(PLANE_TYPE plane_type,
// TODO(yunqingwang): count in loopfilter functions should be removed.
if (mask & 1) {
- if ((mask_16x16_0 | mask_16x16_1) & 1) {
- if ((mask_16x16_0 & mask_16x16_1) & 1) {
- // TODO(yunqingwang): Combine 2 calls as 1 wide filtering.
- vp9_mb_lpf_vertical_edge_w(s, pitch, lfi0->mblim, lfi0->lim,
- lfi0->hev_thr);
- vp9_mb_lpf_vertical_edge_w(s + 8 *pitch, pitch, lfi1->mblim,
- lfi1->lim, lfi1->hev_thr);
- } else if (mask_16x16_0 & 1) {
- vp9_mb_lpf_vertical_edge_w(s, pitch, lfi0->mblim, lfi0->lim,
- lfi0->hev_thr);
- } else {
- vp9_mb_lpf_vertical_edge_w(s + 8 *pitch, pitch, lfi1->mblim,
- lfi1->lim, lfi1->hev_thr);
- }
+ if (mask_16x16_0 & 1) {
+ // if (mask_16x16_0 & 1) is 1, then (mask_16x16_1 & 1) is 1.
+ vp9_mb_lpf_vertical_edge_w_16(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr);
}
if ((mask_8x8_0 | mask_8x8_1) & 1) {
if ((mask_8x8_0 & mask_8x8_1) & 1) {
- // TODO(yunqingwang): Combine 2 calls as 1 wide filtering.
- vp9_mbloop_filter_vertical_edge(s, pitch, lfi0->mblim, lfi0->lim,
- lfi0->hev_thr, 1);
- vp9_mbloop_filter_vertical_edge(s + 8 *pitch, pitch, lfi1->mblim,
- lfi1->lim, lfi1->hev_thr, 1);
+ vp9_mbloop_filter_vertical_edge_16(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr);
} else if (mask_8x8_0 & 1) {
vp9_mbloop_filter_vertical_edge(s, pitch, lfi0->mblim, lfi0->lim,
lfi0->hev_thr, 1);
@@ -387,11 +375,9 @@ static void filter_selectively_vert_row2(PLANE_TYPE plane_type,
if ((mask_4x4_0 | mask_4x4_1) & 1) {
if ((mask_4x4_0 & mask_4x4_1) & 1) {
- // TODO(yunqingwang): Combine 2 calls as 1 wide filtering.
- vp9_loop_filter_vertical_edge(s, pitch, lfi0->mblim, lfi0->lim,
- lfi0->hev_thr, 1);
- vp9_loop_filter_vertical_edge(s + 8 *pitch, pitch, lfi1->mblim,
- lfi1->lim, lfi1->hev_thr, 1);
+ vp9_loop_filter_vertical_edge_16(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr);
} else if (mask_4x4_0 & 1) {
vp9_loop_filter_vertical_edge(s, pitch, lfi0->mblim, lfi0->lim,
lfi0->hev_thr, 1);
@@ -403,11 +389,9 @@ static void filter_selectively_vert_row2(PLANE_TYPE plane_type,
if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {
if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {
- // TODO(yunqingwang): Combine 2 calls as 1 wide filtering.
- vp9_loop_filter_vertical_edge(s + 4, pitch, lfi0->mblim, lfi0->lim,
- lfi0->hev_thr, 1);
- vp9_loop_filter_vertical_edge(s + 8 *pitch + 4, pitch, lfi1->mblim,
- lfi1->lim, lfi1->hev_thr, 1);
+ vp9_loop_filter_vertical_edge_16(s + 4, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr);
} else if (mask_4x4_int_0 & 1) {
vp9_loop_filter_vertical_edge(s + 4, pitch, lfi0->mblim, lfi0->lim,
lfi0->hev_thr, 1);
@@ -448,14 +432,10 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
count = 1;
if (mask & 1) {
if (mask_16x16 & 1) {
- if ((mask_16x16 & 3) == 3) {
- vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, 2);
- count = 2;
- } else {
- vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, 1);
- }
+ // If (mask_16x16 & 1) is 1, then (mask_16x16 & 3) is 3.
+ vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 2);
+ count = 2;
} else if (mask_8x8 & 1) {
if ((mask_8x8 & 3) == 3) {
// Next block's thresholds
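
Both simplifications in this file lean on the same invariant: the 16x16 masks are built per 8-pixel unit, but a 16x16 transform block always spans two adjacent units, so when one bit is set its neighbour is set as well (the second row's mask in filter_selectively_vert_row2, the next column bit in filter_selectively_horiz) and a single combined call is safe. A compact sketch of the vertical case, with wide_filter() as a hypothetical stand-in for vp9_mb_lpf_vertical_edge_w_16:

    #include <stdint.h>

    /* Sketch only (not part of the patch): the simplified 16x16 branch of
     * filter_selectively_vert_row2(). Because a 16x16 transform covers both
     * 8-pixel rows handled per iteration, (mask_16x16_0 & 1) implies
     * (mask_16x16_1 & 1), and one 16-row call replaces the old per-row pair. */
    void wide_filter(uint8_t *s, int pitch, const uint8_t *mblim,
                     const uint8_t *lim, const uint8_t *hev_thr);

    static void dispatch_16x16(uint8_t *s, int pitch,
                               unsigned int mask_16x16_0,
                               unsigned int mask_16x16_1,
                               const uint8_t *mblim, const uint8_t *lim,
                               const uint8_t *hev_thr) {
      if (mask_16x16_0 & 1) {
        (void)mask_16x16_1;  /* guaranteed to have bit 0 set as well */
        wide_filter(s, pitch, mblim, lim, hev_thr);
      }
    }
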
diff --git a/vp9/common/vp9_loopfilter_filters.c b/vp9/common/vp9_loopfilter_filters.c
index 9edf8701f..ef8de2010 100644
--- a/vp9/common/vp9_loopfilter_filters.c
+++ b/vp9/common/vp9_loopfilter_filters.c
@@ -169,6 +169,34 @@ void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch,
}
}
+void vp9_loop_filter_vertical_edge_16_c(uint8_t *s, int pitch,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ int i, j;
+ const uint8_t *blimit = blimit0;
+ const uint8_t *limit = limit0;
+ const uint8_t *thresh = thresh0;
+
+ for (i = 0; i < 2; ++i) {
+ for (j = 0; j < 8; ++j) {
+ const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+ const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+ const int8_t mask = filter_mask(*limit, *blimit,
+ p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1);
+ filter4(mask, hev, s - 2, s - 1, s, s + 1);
+ s += pitch;
+ }
+ blimit = blimit1;
+ limit = limit1;
+ thresh = thresh1;
+ }
+}
+
static INLINE void filter8(int8_t mask, uint8_t hev, uint8_t flat,
uint8_t *op3, uint8_t *op2,
uint8_t *op1, uint8_t *op0,
@@ -264,6 +292,36 @@ void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch,
}
}
+void vp9_mbloop_filter_vertical_edge_16_c(uint8_t *s, int pitch,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ int i, j;
+ const uint8_t *blimit = blimit0;
+ const uint8_t *limit = limit0;
+ const uint8_t *thresh = thresh0;
+
+ for (i = 0; i < 2; ++i) {
+ for (j = 0; j < 8; ++j) {
+ const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+ const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+ const int8_t mask = filter_mask(*limit, *blimit,
+ p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t hev = hev_mask(thresh[0], p1, p0, q0, q1);
+ const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+ filter8(mask, hev, flat, s - 4, s - 3, s - 2, s - 1,
+ s, s + 1, s + 2, s + 3);
+ s += pitch;
+ }
+ blimit = blimit1;
+ limit = limit1;
+ thresh = thresh1;
+ }
+}
+
static INLINE void filter16(int8_t mask, uint8_t hev,
uint8_t flat, uint8_t flat2,
uint8_t *op7, uint8_t *op6,
@@ -366,3 +424,26 @@ void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int p,
s += p;
}
}
+
+void vp9_mb_lpf_vertical_edge_w_16_c(uint8_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ int i;
+
+ for (i = 0; i < 16; ++i) {
+ const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+ const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+ const int8_t mask = filter_mask(*limit, *blimit,
+ p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1);
+ const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
+ q0, s[4], s[5], s[6], s[7]);
+
+ filter16(mask, hev, flat, flat2,
+ s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
+ s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7);
+ s += p;
+ }
+}
diff --git a/vp9/common/vp9_onyx.h b/vp9/common/vp9_onyx.h
index cda68a285..c5faf88f8 100644
--- a/vp9/common/vp9_onyx.h
+++ b/vp9/common/vp9_onyx.h
@@ -67,6 +67,7 @@ extern "C"
typedef enum {
NO_AQ = 0,
VARIANCE_AQ = 1,
+ COMPLEXITY_AQ = 2,
AQ_MODES_COUNT // This should always be the last member of the enum
} AQ_MODES;
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index fb959cb36..751accf02 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -81,11 +81,11 @@ typedef struct {
typedef enum {
- SINGLE_PREDICTION_ONLY = 0,
- COMP_PREDICTION_ONLY = 1,
- HYBRID_PREDICTION = 2,
- NB_PREDICTION_TYPES = 3,
-} COMPPREDMODE_TYPE;
+ SINGLE_REFERENCE = 0,
+ COMPOUND_REFERENCE = 1,
+ REFERENCE_MODE_SELECT = 2,
+ REFERENCE_MODES = 3,
+} REFERENCE_MODE;
typedef struct VP9Common {
struct vpx_internal_error_info error;
@@ -195,7 +195,7 @@ typedef struct VP9Common {
int allow_comp_inter_inter;
MV_REFERENCE_FRAME comp_fixed_ref;
MV_REFERENCE_FRAME comp_var_ref[2];
- COMPPREDMODE_TYPE comp_pred_mode;
+ REFERENCE_MODE comp_pred_mode;
FRAME_CONTEXT fc; /* this frame entropy */
FRAME_CONTEXT frame_contexts[NUM_FRAME_CONTEXTS];
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index aa17b85c8..09a4fc826 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -98,7 +98,6 @@ MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv,
return clamped_mv;
}
-
// TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could
// calculate the subsampled BLOCK_SIZE, but that type isn't defined for
// sizes smaller than 16x16 yet.
@@ -206,6 +205,96 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
MAX_MB_PLANE - 1);
}
+// TODO(jingning): This function serves as a placeholder for decoder prediction
+// using on demand border extension. It should be moved to /decoder/ directory.
+static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
+ BLOCK_SIZE bsize, int pred_w, int pred_h,
+ int mi_x, int mi_y) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+ const int bwl = b_width_log2(plane_bsize);
+ const int bw = 4 << bwl;
+ const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize];
+ const int x = 4 * (block & ((1 << bwl) - 1));
+ const int y = 4 * (block >> bwl);
+ const MODE_INFO *mi = xd->mi_8x8[0];
+ const int is_compound = has_second_ref(&mi->mbmi);
+ int ref;
+
+ assert(x < bw);
+ assert(y < bh);
+ assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_w == bw);
+ assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_h == bh);
+
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ struct scale_factors *const scale = &xd->scale_factor[ref];
+ struct buf_2d *const pre_buf = &pd->pre[ref];
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
+
+ // TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the
+ // same MV (the average of the 4 luma MVs) but we could do something
+ // smarter for non-4:2:0. Just punt for now, pending the changes to get
+ // rid of SPLITMV mode entirely.
+ const MV mv = mi->mbmi.sb_type < BLOCK_8X8
+ ? (plane == 0 ? mi->bmi[block].as_mv[ref].as_mv
+ : mi_mv_pred_q4(mi, ref))
+ : mi->mbmi.mv[ref].as_mv;
+
+ // TODO(jkoleszar): This clamping is done in the incorrect place for the
+ // scaling case. It needs to be done on the scaled MV, not the pre-scaling
+ // MV. Note however that it performs the subsampling aware scaling so
+ // that the result is always q4.
+ // mv_precision precision is MV_PRECISION_Q4.
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
+ pd->subsampling_x,
+ pd->subsampling_y);
+
+ uint8_t *pre;
+ MV32 scaled_mv;
+ int xs, ys;
+
+ if (vp9_is_scaled(scale->sfc)) {
+ pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, scale);
+ scale->sfc->set_scaled_offsets(scale, mi_y + y, mi_x + x);
+ scaled_mv = scale->sfc->scale_mv(&mv_q4, scale);
+ xs = scale->sfc->x_step_q4;
+ ys = scale->sfc->y_step_q4;
+ } else {
+ pre = pre_buf->buf + (y * pre_buf->stride + x);
+ scaled_mv.row = mv_q4.row;
+ scaled_mv.col = mv_q4.col;
+ xs = ys = 16;
+ }
+
+ inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
+ &scaled_mv, scale,
+ 4 << pred_w, 4 << pred_h, ref,
+ &xd->subpix, xs, ys);
+ }
+}
+
+void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ int plane;
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
+ const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
+
+ if (xd->mi_8x8[0]->mbmi.sb_type < BLOCK_8X8) {
+ int i = 0, x, y;
+ assert(bsize == BLOCK_8X8);
+ for (y = 0; y < 1 << bhl; ++y)
+ for (x = 0; x < 1 << bwl; ++x)
+ dec_build_inter_predictors(xd, plane, i++, bsize, 0, 0, mi_x, mi_y);
+ } else {
+ dec_build_inter_predictors(xd, plane, 0, bsize, bwl, bhl, mi_x, mi_y);
+ }
+ }
+}
+
// TODO(dkovalev: find better place for this function)
void vp9_setup_scale_factors(VP9_COMMON *cm, int i) {
const int ref = cm->active_ref_idx[i];
@@ -219,9 +308,6 @@ void vp9_setup_scale_factors(VP9_COMMON *cm, int i) {
vp9_setup_scale_factors_for_frame(sf, sfc,
fb->y_crop_width, fb->y_crop_height,
cm->width, cm->height);
-
- if (vp9_is_scaled(sfc))
- vp9_extend_frame_borders(fb, cm->subsampling_x, cm->subsampling_y);
}
}
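
In dec_build_inter_predictors() above, the sub-8x8 path converts the raster block index into pixel offsets with x = 4 * (block & ((1 << bwl) - 1)) and y = 4 * (block >> bwl). A standalone sketch of that mapping for a BLOCK_8X8 luma plane (bwl = 1, four 4x4 sub-blocks), handy for checking the indexing:

    #include <stdio.h>

    /* Sketch only: reproduces the raster-index -> pixel-offset mapping used by
     * dec_build_inter_predictors() for a BLOCK_8X8 luma plane (bwl = 1). */
    int main(void) {
      const int bwl = 1;  /* log2(block width in 4x4 units) */
      int block;
      for (block = 0; block < 4; ++block) {
        const int x = 4 * (block & ((1 << bwl) - 1));
        const int y = 4 * (block >> bwl);
        printf("block %d -> x=%d, y=%d\n", block, x, y);
        /* Expected: 0->(0,0), 1->(4,0), 2->(0,4), 3->(4,4) */
      }
      return 0;
    }
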
diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h
index b328754e7..4a302f988 100644
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -24,6 +24,9 @@ void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize);
+void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
+ BLOCK_SIZE bsize);
+
void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const MV *mv_q3,
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index e18e757c1..627ea31ed 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -193,12 +193,21 @@ specialize vp9_dc_128_predictor_32x32
prototype void vp9_mb_lpf_vertical_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"
specialize vp9_mb_lpf_vertical_edge_w sse2 neon dspr2
+prototype void vp9_mb_lpf_vertical_edge_w_16 "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"
+specialize vp9_mb_lpf_vertical_edge_w_16 sse2 neon dspr2
+
prototype void vp9_mbloop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
specialize vp9_mbloop_filter_vertical_edge sse2 neon dspr2
+prototype void vp9_mbloop_filter_vertical_edge_16 "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"
+specialize vp9_mbloop_filter_vertical_edge_16 sse2 neon dspr2
+
prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
specialize vp9_loop_filter_vertical_edge mmx neon dspr2
+prototype void vp9_loop_filter_vertical_edge_16 "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"
+specialize vp9_loop_filter_vertical_edge_16 sse2 neon dspr2
+
prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
specialize vp9_mb_lpf_horizontal_edge_w sse2 avx2 neon dspr2
@@ -206,13 +215,13 @@ prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const u
specialize vp9_mbloop_filter_horizontal_edge sse2 neon dspr2
prototype void vp9_mbloop_filter_horizontal_edge_16 "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"
-specialize vp9_mbloop_filter_horizontal_edge_16 sse2 neon
+specialize vp9_mbloop_filter_horizontal_edge_16 sse2 neon dspr2
prototype void vp9_loop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
specialize vp9_loop_filter_horizontal_edge mmx neon dspr2
prototype void vp9_loop_filter_horizontal_edge_16 "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"
-specialize vp9_loop_filter_horizontal_edge_16 sse2 neon
+specialize vp9_loop_filter_horizontal_edge_16 sse2 neon dspr2
#
# post proc
@@ -698,31 +707,31 @@ fi
# fdct functions
prototype void vp9_short_fht4x4 "const int16_t *input, int16_t *output, int stride, int tx_type"
-specialize vp9_short_fht4x4 sse2
+specialize vp9_short_fht4x4 sse2 avx2
prototype void vp9_short_fht8x8 "const int16_t *input, int16_t *output, int stride, int tx_type"
-specialize vp9_short_fht8x8 sse2
+specialize vp9_short_fht8x8 sse2 avx2
prototype void vp9_short_fht16x16 "const int16_t *input, int16_t *output, int stride, int tx_type"
-specialize vp9_short_fht16x16 sse2
+specialize vp9_short_fht16x16 sse2 avx2
prototype void vp9_fwht4x4 "const int16_t *input, int16_t *output, int stride"
specialize vp9_fwht4x4
prototype void vp9_fdct4x4 "const int16_t *input, int16_t *output, int stride"
-specialize vp9_fdct4x4 sse2
+specialize vp9_fdct4x4 sse2 avx2
prototype void vp9_fdct8x8 "const int16_t *input, int16_t *output, int stride"
-specialize vp9_fdct8x8 sse2
+specialize vp9_fdct8x8 sse2 avx2
prototype void vp9_fdct16x16 "const int16_t *input, int16_t *output, int stride"
-specialize vp9_fdct16x16 sse2
+specialize vp9_fdct16x16 sse2 avx2
prototype void vp9_fdct32x32 "const int16_t *input, int16_t *output, int stride"
-specialize vp9_fdct32x32 sse2
+specialize vp9_fdct32x32 sse2 avx2
prototype void vp9_fdct32x32_rd "const int16_t *input, int16_t *output, int stride"
-specialize vp9_fdct32x32_rd sse2
+specialize vp9_fdct32x32_rd sse2 avx2
#
# Motion search
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index c65184f9c..947c0ba44 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -431,6 +431,27 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
res3 = _mm_packs_epi32(tmp6, tmp7); \
}
+#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
+ { \
+ tmp0 = _mm_madd_epi16(lo_0, cst0); \
+ tmp1 = _mm_madd_epi16(hi_0, cst0); \
+ tmp2 = _mm_madd_epi16(lo_0, cst1); \
+ tmp3 = _mm_madd_epi16(hi_0, cst1); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ res0 = _mm_packs_epi32(tmp0, tmp1); \
+ res1 = _mm_packs_epi32(tmp2, tmp3); \
+ }
+
#define IDCT8_1D \
/* Stage1 */ \
{ \
@@ -629,6 +650,25 @@ static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
}
+static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+
+ out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ out[4] = out[5] = out[6] = out[7] = zero;
+}
+
static void idct8_1d_sse2(__m128i *in) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
@@ -1118,14 +1158,14 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
#define IDCT16_1D \
/* Stage2 */ \
{ \
- const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \
- const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \
- const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7); \
- const __m128i hi_9_7 = _mm_unpackhi_epi16(in9, in7); \
- const __m128i lo_5_11 = _mm_unpacklo_epi16(in5, in11); \
- const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \
- const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \
- const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \
+ const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
+ const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
+ const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \
+ const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \
+ const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
+ const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
+ const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
+ const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
\
MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
stg2_0, stg2_1, stg2_2, stg2_3, \
@@ -1138,10 +1178,10 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
\
/* Stage3 */ \
{ \
- const __m128i lo_2_14 = _mm_unpacklo_epi16(in2, in14); \
- const __m128i hi_2_14 = _mm_unpackhi_epi16(in2, in14); \
- const __m128i lo_10_6 = _mm_unpacklo_epi16(in10, in6); \
- const __m128i hi_10_6 = _mm_unpackhi_epi16(in10, in6); \
+ const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
+ const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
+ const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
+ const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
\
MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
stg3_0, stg3_1, stg3_2, stg3_3, \
@@ -1160,10 +1200,10 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
\
/* Stage4 */ \
{ \
- const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); \
- const __m128i hi_0_8 = _mm_unpackhi_epi16(in0, in8); \
- const __m128i lo_4_12 = _mm_unpacklo_epi16(in4, in12); \
- const __m128i hi_4_12 = _mm_unpackhi_epi16(in4, in12); \
+ const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
+ const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
+ const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
+ const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
\
const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
@@ -1275,16 +1315,7 @@ void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,
- in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,
- in10 = zero, in11 = zero, in12 = zero, in13 = zero,
- in14 = zero, in15 = zero;
- __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,
- l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,
- l12 = zero, l13 = zero, l14 = zero, l15 = zero;
- __m128i r0 = zero, r1 = zero, r2 = zero, r3 = zero, r4 = zero, r5 = zero,
- r6 = zero, r7 = zero, r8 = zero, r9 = zero, r10 = zero, r11 = zero,
- r12 = zero, r13 = zero, r14 = zero, r15 = zero;
+ __m128i in[16], l[16], r[16], *curr1;
__m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
stp1_8_0, stp1_12_0;
@@ -1293,162 +1324,132 @@ void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int i;
- // We work on a 8x16 block each time, and loop 4 times for 2-D 16x16 idct.
- for (i = 0; i < 4; i++) {
- // 1-D idct
- if (i < 2) {
- if (i == 1) input += 128;
+ curr1 = l;
+ for (i = 0; i < 2; i++) {
+ // 1-D idct
// Load input data.
- in0 = _mm_load_si128((const __m128i *)input);
- in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
- in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
- in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
- in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
- in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
- in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
- in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));
- in4 = _mm_load_si128((const __m128i *)(input + 8 * 8));
- in12 = _mm_load_si128((const __m128i *)(input + 8 * 9));
- in5 = _mm_load_si128((const __m128i *)(input + 8 * 10));
- in13 = _mm_load_si128((const __m128i *)(input + 8 * 11));
- in6 = _mm_load_si128((const __m128i *)(input + 8 * 12));
- in14 = _mm_load_si128((const __m128i *)(input + 8 * 13));
- in7 = _mm_load_si128((const __m128i *)(input + 8 * 14));
- in15 = _mm_load_si128((const __m128i *)(input + 8 * 15));
-
- TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
- in4, in5, in6, in7);
- TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
- in10, in11, in12, in13, in14, in15);
- }
-
- if (i == 2) {
- TRANSPOSE_8X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4,
- in5, in6, in7);
- TRANSPOSE_8X8(r0, r1, r2, r3, r4, r5, r6, r7, in8, in9, in10, in11, in12,
- in13, in14, in15);
- }
+ in[0] = _mm_load_si128((const __m128i *)input);
+ in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
+ in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
+ in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
+ in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
+ in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
+ in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
+ in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
+ in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
+ in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
+ in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
+ in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
+ in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
+ in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
+ in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));
+
+ array_transpose_8x8(in, in);
+ array_transpose_8x8(in+8, in+8);
+
+ IDCT16_1D
+
+ // Stage7
+ curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
+ curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
+ curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
+ curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
+ curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
+ curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
+ curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
+ curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
+ curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
+ curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
+ curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
+ curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
+ curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
+ curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
+ curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
+ curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+ curr1 = r;
+ input += 128;
+ }
+ for (i = 0; i < 2; i++) {
+ // 1-D idct
+ array_transpose_8x8(l+i*8, in);
+ array_transpose_8x8(r+i*8, in+8);
- if (i == 3) {
- TRANSPOSE_8X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3,
- in4, in5, in6, in7);
- TRANSPOSE_8X8(r8, r9, r10, r11, r12, r13, r14, r15, in8, in9, in10, in11,
- in12, in13, in14, in15);
- }
+ IDCT16_1D
- IDCT16_1D
-
- // Stage7
- if (i == 0) {
- // Left 8x16
- l0 = _mm_add_epi16(stp2_0, stp1_15);
- l1 = _mm_add_epi16(stp2_1, stp1_14);
- l2 = _mm_add_epi16(stp2_2, stp2_13);
- l3 = _mm_add_epi16(stp2_3, stp2_12);
- l4 = _mm_add_epi16(stp2_4, stp2_11);
- l5 = _mm_add_epi16(stp2_5, stp2_10);
- l6 = _mm_add_epi16(stp2_6, stp1_9);
- l7 = _mm_add_epi16(stp2_7, stp1_8);
- l8 = _mm_sub_epi16(stp2_7, stp1_8);
- l9 = _mm_sub_epi16(stp2_6, stp1_9);
- l10 = _mm_sub_epi16(stp2_5, stp2_10);
- l11 = _mm_sub_epi16(stp2_4, stp2_11);
- l12 = _mm_sub_epi16(stp2_3, stp2_12);
- l13 = _mm_sub_epi16(stp2_2, stp2_13);
- l14 = _mm_sub_epi16(stp2_1, stp1_14);
- l15 = _mm_sub_epi16(stp2_0, stp1_15);
- } else if (i == 1) {
- // Right 8x16
- r0 = _mm_add_epi16(stp2_0, stp1_15);
- r1 = _mm_add_epi16(stp2_1, stp1_14);
- r2 = _mm_add_epi16(stp2_2, stp2_13);
- r3 = _mm_add_epi16(stp2_3, stp2_12);
- r4 = _mm_add_epi16(stp2_4, stp2_11);
- r5 = _mm_add_epi16(stp2_5, stp2_10);
- r6 = _mm_add_epi16(stp2_6, stp1_9);
- r7 = _mm_add_epi16(stp2_7, stp1_8);
- r8 = _mm_sub_epi16(stp2_7, stp1_8);
- r9 = _mm_sub_epi16(stp2_6, stp1_9);
- r10 = _mm_sub_epi16(stp2_5, stp2_10);
- r11 = _mm_sub_epi16(stp2_4, stp2_11);
- r12 = _mm_sub_epi16(stp2_3, stp2_12);
- r13 = _mm_sub_epi16(stp2_2, stp2_13);
- r14 = _mm_sub_epi16(stp2_1, stp1_14);
- r15 = _mm_sub_epi16(stp2_0, stp1_15);
- } else {
// 2-D
- in0 = _mm_add_epi16(stp2_0, stp1_15);
- in1 = _mm_add_epi16(stp2_1, stp1_14);
- in2 = _mm_add_epi16(stp2_2, stp2_13);
- in3 = _mm_add_epi16(stp2_3, stp2_12);
- in4 = _mm_add_epi16(stp2_4, stp2_11);
- in5 = _mm_add_epi16(stp2_5, stp2_10);
- in6 = _mm_add_epi16(stp2_6, stp1_9);
- in7 = _mm_add_epi16(stp2_7, stp1_8);
- in8 = _mm_sub_epi16(stp2_7, stp1_8);
- in9 = _mm_sub_epi16(stp2_6, stp1_9);
- in10 = _mm_sub_epi16(stp2_5, stp2_10);
- in11 = _mm_sub_epi16(stp2_4, stp2_11);
- in12 = _mm_sub_epi16(stp2_3, stp2_12);
- in13 = _mm_sub_epi16(stp2_2, stp2_13);
- in14 = _mm_sub_epi16(stp2_1, stp1_14);
- in15 = _mm_sub_epi16(stp2_0, stp1_15);
+ in[0] = _mm_add_epi16(stp2_0, stp1_15);
+ in[1] = _mm_add_epi16(stp2_1, stp1_14);
+ in[2] = _mm_add_epi16(stp2_2, stp2_13);
+ in[3] = _mm_add_epi16(stp2_3, stp2_12);
+ in[4] = _mm_add_epi16(stp2_4, stp2_11);
+ in[5] = _mm_add_epi16(stp2_5, stp2_10);
+ in[6] = _mm_add_epi16(stp2_6, stp1_9);
+ in[7] = _mm_add_epi16(stp2_7, stp1_8);
+ in[8] = _mm_sub_epi16(stp2_7, stp1_8);
+ in[9] = _mm_sub_epi16(stp2_6, stp1_9);
+ in[10] = _mm_sub_epi16(stp2_5, stp2_10);
+ in[11] = _mm_sub_epi16(stp2_4, stp2_11);
+ in[12] = _mm_sub_epi16(stp2_3, stp2_12);
+ in[13] = _mm_sub_epi16(stp2_2, stp2_13);
+ in[14] = _mm_sub_epi16(stp2_1, stp1_14);
+ in[15] = _mm_sub_epi16(stp2_0, stp1_15);
// Final rounding and shift
- in0 = _mm_adds_epi16(in0, final_rounding);
- in1 = _mm_adds_epi16(in1, final_rounding);
- in2 = _mm_adds_epi16(in2, final_rounding);
- in3 = _mm_adds_epi16(in3, final_rounding);
- in4 = _mm_adds_epi16(in4, final_rounding);
- in5 = _mm_adds_epi16(in5, final_rounding);
- in6 = _mm_adds_epi16(in6, final_rounding);
- in7 = _mm_adds_epi16(in7, final_rounding);
- in8 = _mm_adds_epi16(in8, final_rounding);
- in9 = _mm_adds_epi16(in9, final_rounding);
- in10 = _mm_adds_epi16(in10, final_rounding);
- in11 = _mm_adds_epi16(in11, final_rounding);
- in12 = _mm_adds_epi16(in12, final_rounding);
- in13 = _mm_adds_epi16(in13, final_rounding);
- in14 = _mm_adds_epi16(in14, final_rounding);
- in15 = _mm_adds_epi16(in15, final_rounding);
-
- in0 = _mm_srai_epi16(in0, 6);
- in1 = _mm_srai_epi16(in1, 6);
- in2 = _mm_srai_epi16(in2, 6);
- in3 = _mm_srai_epi16(in3, 6);
- in4 = _mm_srai_epi16(in4, 6);
- in5 = _mm_srai_epi16(in5, 6);
- in6 = _mm_srai_epi16(in6, 6);
- in7 = _mm_srai_epi16(in7, 6);
- in8 = _mm_srai_epi16(in8, 6);
- in9 = _mm_srai_epi16(in9, 6);
- in10 = _mm_srai_epi16(in10, 6);
- in11 = _mm_srai_epi16(in11, 6);
- in12 = _mm_srai_epi16(in12, 6);
- in13 = _mm_srai_epi16(in13, 6);
- in14 = _mm_srai_epi16(in14, 6);
- in15 = _mm_srai_epi16(in15, 6);
-
- RECON_AND_STORE(dest, in0);
- RECON_AND_STORE(dest, in1);
- RECON_AND_STORE(dest, in2);
- RECON_AND_STORE(dest, in3);
- RECON_AND_STORE(dest, in4);
- RECON_AND_STORE(dest, in5);
- RECON_AND_STORE(dest, in6);
- RECON_AND_STORE(dest, in7);
- RECON_AND_STORE(dest, in8);
- RECON_AND_STORE(dest, in9);
- RECON_AND_STORE(dest, in10);
- RECON_AND_STORE(dest, in11);
- RECON_AND_STORE(dest, in12);
- RECON_AND_STORE(dest, in13);
- RECON_AND_STORE(dest, in14);
- RECON_AND_STORE(dest, in15);
+ in[0] = _mm_adds_epi16(in[0], final_rounding);
+ in[1] = _mm_adds_epi16(in[1], final_rounding);
+ in[2] = _mm_adds_epi16(in[2], final_rounding);
+ in[3] = _mm_adds_epi16(in[3], final_rounding);
+ in[4] = _mm_adds_epi16(in[4], final_rounding);
+ in[5] = _mm_adds_epi16(in[5], final_rounding);
+ in[6] = _mm_adds_epi16(in[6], final_rounding);
+ in[7] = _mm_adds_epi16(in[7], final_rounding);
+ in[8] = _mm_adds_epi16(in[8], final_rounding);
+ in[9] = _mm_adds_epi16(in[9], final_rounding);
+ in[10] = _mm_adds_epi16(in[10], final_rounding);
+ in[11] = _mm_adds_epi16(in[11], final_rounding);
+ in[12] = _mm_adds_epi16(in[12], final_rounding);
+ in[13] = _mm_adds_epi16(in[13], final_rounding);
+ in[14] = _mm_adds_epi16(in[14], final_rounding);
+ in[15] = _mm_adds_epi16(in[15], final_rounding);
+
+ in[0] = _mm_srai_epi16(in[0], 6);
+ in[1] = _mm_srai_epi16(in[1], 6);
+ in[2] = _mm_srai_epi16(in[2], 6);
+ in[3] = _mm_srai_epi16(in[3], 6);
+ in[4] = _mm_srai_epi16(in[4], 6);
+ in[5] = _mm_srai_epi16(in[5], 6);
+ in[6] = _mm_srai_epi16(in[6], 6);
+ in[7] = _mm_srai_epi16(in[7], 6);
+ in[8] = _mm_srai_epi16(in[8], 6);
+ in[9] = _mm_srai_epi16(in[9], 6);
+ in[10] = _mm_srai_epi16(in[10], 6);
+ in[11] = _mm_srai_epi16(in[11], 6);
+ in[12] = _mm_srai_epi16(in[12], 6);
+ in[13] = _mm_srai_epi16(in[13], 6);
+ in[14] = _mm_srai_epi16(in[14], 6);
+ in[15] = _mm_srai_epi16(in[15], 6);
+
+ RECON_AND_STORE(dest, in[0]);
+ RECON_AND_STORE(dest, in[1]);
+ RECON_AND_STORE(dest, in[2]);
+ RECON_AND_STORE(dest, in[3]);
+ RECON_AND_STORE(dest, in[4]);
+ RECON_AND_STORE(dest, in[5]);
+ RECON_AND_STORE(dest, in[6]);
+ RECON_AND_STORE(dest, in[7]);
+ RECON_AND_STORE(dest, in[8]);
+ RECON_AND_STORE(dest, in[9]);
+ RECON_AND_STORE(dest, in[10]);
+ RECON_AND_STORE(dest, in[11]);
+ RECON_AND_STORE(dest, in[12]);
+ RECON_AND_STORE(dest, in[13]);
+ RECON_AND_STORE(dest, in[14]);
+ RECON_AND_STORE(dest, in[15]);
dest += 8 - (stride * 16);
- }
}
}
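/*
 * A minimal scalar sketch of the "final rounding and shift" sequence above,
 * assuming final_rounding (defined earlier in this file, not shown here)
 * holds 1 << 5 in every 16-bit lane: the 2-D inverse transform leaves a
 * scale factor of 64 on each residual, so adding 32 with saturation and
 * shifting right by 6 implements a rounded division by 64.
 */
#include <stdint.h>
#include <stdio.h>

static int16_t final_round_shift6(int32_t v) {
  int32_t sum = v + 32;                  /* final_rounding lane, assumed 1 << 5 */
  if (sum > INT16_MAX) sum = INT16_MAX;  /* _mm_adds_epi16 saturates            */
  if (sum < INT16_MIN) sum = INT16_MIN;
  return (int16_t)(sum >> 6);            /* _mm_srai_epi16(x, 6)                */
}

int main(void) {
  printf("%d %d\n", final_round_shift6(100), final_round_shift6(-100));  /* 2 -2 */
  return 0;
}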
@@ -2468,15 +2469,7 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
- __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,
- in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,
- in10 = zero, in11 = zero, in12 = zero, in13 = zero,
- in14 = zero, in15 = zero;
- __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,
- l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,
- l12 = zero, l13 = zero, l14 = zero, l15 = zero;
-
+ __m128i in[16], l[16];
__m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
stp1_8_0, stp1_12_0;
@@ -2484,25 +2477,26 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int i;
+ in[4] = in[5] = in[6] = in[7] = in[12] = in[13] = in[14] = in[15] = zero;
// 1-D idct. Load input data.
- in0 = _mm_load_si128((const __m128i *)input);
- in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
- in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
- in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
- in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
- in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
- in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
- in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));
+ in[0] = _mm_load_si128((const __m128i *)input);
+ in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
+ in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
+ in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
+ in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
+ in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
+ in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
- TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3);
- TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11);
+ TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1], in[2], in[3]);
+ TRANSPOSE_8X4(in[8], in[9], in[10], in[11], in[8], in[9], in[10], in[11]);
// Stage2
{
- const __m128i lo_1_15 = _mm_unpackhi_epi16(in0, in11);
- const __m128i lo_9_7 = _mm_unpackhi_epi16(in8, in3);
- const __m128i lo_5_11 = _mm_unpackhi_epi16(in2, in9);
- const __m128i lo_13_3 = _mm_unpackhi_epi16(in10, in1);
+ const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], in[11]);
+ const __m128i lo_9_7 = _mm_unpackhi_epi16(in[8], in[3]);
+ const __m128i lo_5_11 = _mm_unpackhi_epi16(in[2], in[9]);
+ const __m128i lo_13_3 = _mm_unpackhi_epi16(in[10], in[1]);
tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
@@ -2544,8 +2538,8 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
// Stage3
{
- const __m128i lo_2_14 = _mm_unpacklo_epi16(in1, in11);
- const __m128i lo_10_6 = _mm_unpacklo_epi16(in9, in3);
+ const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], in[11]);
+ const __m128i lo_10_6 = _mm_unpacklo_epi16(in[9], in[3]);
tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
@@ -2580,8 +2574,8 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
// Stage4
{
- const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8);
- const __m128i lo_4_12 = _mm_unpacklo_epi16(in2, in10);
+ const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]);
+ const __m128i lo_4_12 = _mm_unpacklo_epi16(in[2], in[10]);
const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
@@ -2690,106 +2684,99 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
}
// Stage7. Left 8x16 only.
- l0 = _mm_add_epi16(stp2_0, stp1_15);
- l1 = _mm_add_epi16(stp2_1, stp1_14);
- l2 = _mm_add_epi16(stp2_2, stp2_13);
- l3 = _mm_add_epi16(stp2_3, stp2_12);
- l4 = _mm_add_epi16(stp2_4, stp2_11);
- l5 = _mm_add_epi16(stp2_5, stp2_10);
- l6 = _mm_add_epi16(stp2_6, stp1_9);
- l7 = _mm_add_epi16(stp2_7, stp1_8);
- l8 = _mm_sub_epi16(stp2_7, stp1_8);
- l9 = _mm_sub_epi16(stp2_6, stp1_9);
- l10 = _mm_sub_epi16(stp2_5, stp2_10);
- l11 = _mm_sub_epi16(stp2_4, stp2_11);
- l12 = _mm_sub_epi16(stp2_3, stp2_12);
- l13 = _mm_sub_epi16(stp2_2, stp2_13);
- l14 = _mm_sub_epi16(stp2_1, stp1_14);
- l15 = _mm_sub_epi16(stp2_0, stp1_15);
+ l[0] = _mm_add_epi16(stp2_0, stp1_15);
+ l[1] = _mm_add_epi16(stp2_1, stp1_14);
+ l[2] = _mm_add_epi16(stp2_2, stp2_13);
+ l[3] = _mm_add_epi16(stp2_3, stp2_12);
+ l[4] = _mm_add_epi16(stp2_4, stp2_11);
+ l[5] = _mm_add_epi16(stp2_5, stp2_10);
+ l[6] = _mm_add_epi16(stp2_6, stp1_9);
+ l[7] = _mm_add_epi16(stp2_7, stp1_8);
+ l[8] = _mm_sub_epi16(stp2_7, stp1_8);
+ l[9] = _mm_sub_epi16(stp2_6, stp1_9);
+ l[10] = _mm_sub_epi16(stp2_5, stp2_10);
+ l[11] = _mm_sub_epi16(stp2_4, stp2_11);
+ l[12] = _mm_sub_epi16(stp2_3, stp2_12);
+ l[13] = _mm_sub_epi16(stp2_2, stp2_13);
+ l[14] = _mm_sub_epi16(stp2_1, stp1_14);
+ l[15] = _mm_sub_epi16(stp2_0, stp1_15);
// 2-D idct. We do 2 8x16 blocks.
for (i = 0; i < 2; i++) {
- if (i == 0)
- TRANSPOSE_4X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4,
- in5, in6, in7);
-
- if (i == 1)
- TRANSPOSE_4X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3,
- in4, in5, in6, in7);
-
- in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero;
+ array_transpose_4X8(l + 8*i, in);
+ in[8] = in[9] = in[10] = in[11] = in[12] = in[13] = in[14] = in[15] = zero;
IDCT16_1D
// Stage7
- in0 = _mm_add_epi16(stp2_0, stp1_15);
- in1 = _mm_add_epi16(stp2_1, stp1_14);
- in2 = _mm_add_epi16(stp2_2, stp2_13);
- in3 = _mm_add_epi16(stp2_3, stp2_12);
- in4 = _mm_add_epi16(stp2_4, stp2_11);
- in5 = _mm_add_epi16(stp2_5, stp2_10);
- in6 = _mm_add_epi16(stp2_6, stp1_9);
- in7 = _mm_add_epi16(stp2_7, stp1_8);
- in8 = _mm_sub_epi16(stp2_7, stp1_8);
- in9 = _mm_sub_epi16(stp2_6, stp1_9);
- in10 = _mm_sub_epi16(stp2_5, stp2_10);
- in11 = _mm_sub_epi16(stp2_4, stp2_11);
- in12 = _mm_sub_epi16(stp2_3, stp2_12);
- in13 = _mm_sub_epi16(stp2_2, stp2_13);
- in14 = _mm_sub_epi16(stp2_1, stp1_14);
- in15 = _mm_sub_epi16(stp2_0, stp1_15);
+ in[0] = _mm_add_epi16(stp2_0, stp1_15);
+ in[1] = _mm_add_epi16(stp2_1, stp1_14);
+ in[2] = _mm_add_epi16(stp2_2, stp2_13);
+ in[3] = _mm_add_epi16(stp2_3, stp2_12);
+ in[4] = _mm_add_epi16(stp2_4, stp2_11);
+ in[5] = _mm_add_epi16(stp2_5, stp2_10);
+ in[6] = _mm_add_epi16(stp2_6, stp1_9);
+ in[7] = _mm_add_epi16(stp2_7, stp1_8);
+ in[8] = _mm_sub_epi16(stp2_7, stp1_8);
+ in[9] = _mm_sub_epi16(stp2_6, stp1_9);
+ in[10] = _mm_sub_epi16(stp2_5, stp2_10);
+ in[11] = _mm_sub_epi16(stp2_4, stp2_11);
+ in[12] = _mm_sub_epi16(stp2_3, stp2_12);
+ in[13] = _mm_sub_epi16(stp2_2, stp2_13);
+ in[14] = _mm_sub_epi16(stp2_1, stp1_14);
+ in[15] = _mm_sub_epi16(stp2_0, stp1_15);
// Final rounding and shift
- in0 = _mm_adds_epi16(in0, final_rounding);
- in1 = _mm_adds_epi16(in1, final_rounding);
- in2 = _mm_adds_epi16(in2, final_rounding);
- in3 = _mm_adds_epi16(in3, final_rounding);
- in4 = _mm_adds_epi16(in4, final_rounding);
- in5 = _mm_adds_epi16(in5, final_rounding);
- in6 = _mm_adds_epi16(in6, final_rounding);
- in7 = _mm_adds_epi16(in7, final_rounding);
- in8 = _mm_adds_epi16(in8, final_rounding);
- in9 = _mm_adds_epi16(in9, final_rounding);
- in10 = _mm_adds_epi16(in10, final_rounding);
- in11 = _mm_adds_epi16(in11, final_rounding);
- in12 = _mm_adds_epi16(in12, final_rounding);
- in13 = _mm_adds_epi16(in13, final_rounding);
- in14 = _mm_adds_epi16(in14, final_rounding);
- in15 = _mm_adds_epi16(in15, final_rounding);
-
- in0 = _mm_srai_epi16(in0, 6);
- in1 = _mm_srai_epi16(in1, 6);
- in2 = _mm_srai_epi16(in2, 6);
- in3 = _mm_srai_epi16(in3, 6);
- in4 = _mm_srai_epi16(in4, 6);
- in5 = _mm_srai_epi16(in5, 6);
- in6 = _mm_srai_epi16(in6, 6);
- in7 = _mm_srai_epi16(in7, 6);
- in8 = _mm_srai_epi16(in8, 6);
- in9 = _mm_srai_epi16(in9, 6);
- in10 = _mm_srai_epi16(in10, 6);
- in11 = _mm_srai_epi16(in11, 6);
- in12 = _mm_srai_epi16(in12, 6);
- in13 = _mm_srai_epi16(in13, 6);
- in14 = _mm_srai_epi16(in14, 6);
- in15 = _mm_srai_epi16(in15, 6);
-
- RECON_AND_STORE(dest, in0);
- RECON_AND_STORE(dest, in1);
- RECON_AND_STORE(dest, in2);
- RECON_AND_STORE(dest, in3);
- RECON_AND_STORE(dest, in4);
- RECON_AND_STORE(dest, in5);
- RECON_AND_STORE(dest, in6);
- RECON_AND_STORE(dest, in7);
- RECON_AND_STORE(dest, in8);
- RECON_AND_STORE(dest, in9);
- RECON_AND_STORE(dest, in10);
- RECON_AND_STORE(dest, in11);
- RECON_AND_STORE(dest, in12);
- RECON_AND_STORE(dest, in13);
- RECON_AND_STORE(dest, in14);
- RECON_AND_STORE(dest, in15);
+ in[0] = _mm_adds_epi16(in[0], final_rounding);
+ in[1] = _mm_adds_epi16(in[1], final_rounding);
+ in[2] = _mm_adds_epi16(in[2], final_rounding);
+ in[3] = _mm_adds_epi16(in[3], final_rounding);
+ in[4] = _mm_adds_epi16(in[4], final_rounding);
+ in[5] = _mm_adds_epi16(in[5], final_rounding);
+ in[6] = _mm_adds_epi16(in[6], final_rounding);
+ in[7] = _mm_adds_epi16(in[7], final_rounding);
+ in[8] = _mm_adds_epi16(in[8], final_rounding);
+ in[9] = _mm_adds_epi16(in[9], final_rounding);
+ in[10] = _mm_adds_epi16(in[10], final_rounding);
+ in[11] = _mm_adds_epi16(in[11], final_rounding);
+ in[12] = _mm_adds_epi16(in[12], final_rounding);
+ in[13] = _mm_adds_epi16(in[13], final_rounding);
+ in[14] = _mm_adds_epi16(in[14], final_rounding);
+ in[15] = _mm_adds_epi16(in[15], final_rounding);
+
+ in[0] = _mm_srai_epi16(in[0], 6);
+ in[1] = _mm_srai_epi16(in[1], 6);
+ in[2] = _mm_srai_epi16(in[2], 6);
+ in[3] = _mm_srai_epi16(in[3], 6);
+ in[4] = _mm_srai_epi16(in[4], 6);
+ in[5] = _mm_srai_epi16(in[5], 6);
+ in[6] = _mm_srai_epi16(in[6], 6);
+ in[7] = _mm_srai_epi16(in[7], 6);
+ in[8] = _mm_srai_epi16(in[8], 6);
+ in[9] = _mm_srai_epi16(in[9], 6);
+ in[10] = _mm_srai_epi16(in[10], 6);
+ in[11] = _mm_srai_epi16(in[11], 6);
+ in[12] = _mm_srai_epi16(in[12], 6);
+ in[13] = _mm_srai_epi16(in[13], 6);
+ in[14] = _mm_srai_epi16(in[14], 6);
+ in[15] = _mm_srai_epi16(in[15], 6);
+
+ RECON_AND_STORE(dest, in[0]);
+ RECON_AND_STORE(dest, in[1]);
+ RECON_AND_STORE(dest, in[2]);
+ RECON_AND_STORE(dest, in[3]);
+ RECON_AND_STORE(dest, in[4]);
+ RECON_AND_STORE(dest, in[5]);
+ RECON_AND_STORE(dest, in[6]);
+ RECON_AND_STORE(dest, in[7]);
+ RECON_AND_STORE(dest, in[8]);
+ RECON_AND_STORE(dest, in[9]);
+ RECON_AND_STORE(dest, in[10]);
+ RECON_AND_STORE(dest, in[11]);
+ RECON_AND_STORE(dest, in[12]);
+ RECON_AND_STORE(dest, in[13]);
+ RECON_AND_STORE(dest, in[14]);
+ RECON_AND_STORE(dest, in[15]);
dest += 8 - (stride * 16);
}
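/*
 * Sketch of the destination pointer walk used by the loop above, assuming
 * RECON_AND_STORE (defined earlier in this file) advances dest by stride
 * after each stored 8-pixel row, as its repeated use suggests.  After 16
 * rows, "dest += 8 - (stride * 16)" returns to the top of the block and
 * steps 8 pixels right for the next 8x16 half.
 */
#include <stdio.h>

int main(void) {
  const int stride = 32;  /* hypothetical destination stride */
  int dest = 0;           /* byte offset of the next row to reconstruct */
  int half, row;
  for (half = 0; half < 2; ++half) {
    for (row = 0; row < 16; ++row)
      dest += stride;     /* one RECON_AND_STORE per row */
    dest += 8 - stride * 16;
    printf("next 8x16 half starts at offset %d\n", dest);  /* 8, then 16 */
  }
  return 0;
}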
@@ -2801,28 +2788,329 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
input += 8; \
} \
+#define IDCT32_1D_34 \
+/* Stage1 */ \
+{ \
+ const __m128i zero = _mm_setzero_si128();\
+ const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
+ const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
+ \
+  const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \
+ const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
+ \
+ const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
+ const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
+ \
+ const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
+ const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \
+ stg1_1, stp1_16, stp1_31); \
+ MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \
+ stg1_7, stp1_19, stp1_28); \
+ MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \
+ stg1_9, stp1_20, stp1_27); \
+ MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \
+ stg1_15, stp1_23, stp1_24); \
+} \
+\
+/* Stage2 */ \
+{ \
+ const __m128i zero = _mm_setzero_si128();\
+ const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
+ const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
+ \
+ const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
+ const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \
+ stg2_1, stp2_8, stp2_15); \
+ MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \
+ stg2_7, stp2_11, stp2_12); \
+ \
+ stp2_16 = stp1_16; \
+ stp2_19 = stp1_19; \
+ \
+ stp2_20 = stp1_20; \
+ stp2_23 = stp1_23; \
+ \
+ stp2_24 = stp1_24; \
+ stp2_27 = stp1_27; \
+ \
+ stp2_28 = stp1_28; \
+ stp2_31 = stp1_31; \
+} \
+\
+/* Stage3 */ \
+{ \
+ const __m128i zero = _mm_setzero_si128();\
+ const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
+ const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
+ \
+ const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
+ const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
+ \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
+  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp1_24); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \
+ stg3_1, stp1_4, stp1_7); \
+ \
+ stp1_8 = stp2_8; \
+ stp1_11 = stp2_11; \
+ stp1_12 = stp2_12; \
+ stp1_15 = stp2_15; \
+ \
+ MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
+ stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
+ stp1_18, stp1_29) \
+ MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
+ stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
+ stp1_22, stp1_25) \
+ \
+ stp1_16 = stp2_16; \
+ stp1_31 = stp2_31; \
+ stp1_19 = stp2_19; \
+ stp1_20 = stp2_20; \
+ stp1_23 = stp2_23; \
+ stp1_24 = stp2_24; \
+ stp1_27 = stp2_27; \
+ stp1_28 = stp2_28; \
+} \
+\
+/* Stage4 */ \
+{ \
+ const __m128i zero = _mm_setzero_si128();\
+ const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
+ const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
+ \
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
+ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \
+ stg4_1, stp2_0, stp2_1); \
+ \
+ stp2_4 = stp1_4; \
+ stp2_5 = stp1_4; \
+ stp2_6 = stp1_7; \
+ stp2_7 = stp1_7; \
+ \
+ MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
+ stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
+ stp2_10, stp2_13) \
+ \
+ stp2_8 = stp1_8; \
+ stp2_15 = stp1_15; \
+ stp2_11 = stp1_11; \
+ stp2_12 = stp1_12; \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
+ stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
+ stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
+ stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
+ stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
+ stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
+ stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
+ \
+ stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
+ stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
+ stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
+ stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
+ stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
+ stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
+ stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
+} \
+\
+/* Stage5 */ \
+{ \
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+ const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+ \
+ const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
+ const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+ \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ \
+ stp1_0 = stp2_0; \
+ stp1_1 = stp2_1; \
+ stp1_2 = stp2_1; \
+ stp1_3 = stp2_0; \
+ \
+ tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+ tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+ tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+ tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ \
+ stp1_4 = stp2_4; \
+ stp1_7 = stp2_7; \
+ \
+ stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
+ stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+ stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
+ stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
+ stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+ stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
+ \
+ stp1_16 = stp2_16; \
+ stp1_17 = stp2_17; \
+ \
+ MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
+ stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
+ stp1_19, stp1_28) \
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
+ stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
+ stp1_21, stp1_26) \
+ \
+ stp1_22 = stp2_22; \
+ stp1_23 = stp2_23; \
+ stp1_24 = stp2_24; \
+ stp1_25 = stp2_25; \
+ stp1_30 = stp2_30; \
+ stp1_31 = stp2_31; \
+} \
+\
+/* Stage6 */ \
+{ \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+ const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+ \
+ stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
+ stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+ stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+ stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
+ stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
+ stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+ stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
+ \
+ stp2_8 = stp1_8; \
+ stp2_9 = stp1_9; \
+ stp2_14 = stp1_14; \
+ stp2_15 = stp1_15; \
+ \
+ MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+ stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
+ stp2_13, stp2_11, stp2_12) \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
+ stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
+ stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
+ stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
+ stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
+ stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
+ stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
+ \
+ stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
+ stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
+ stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
+ stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
+ stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
+ stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
+ stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
+} \
+\
+/* Stage7 */ \
+{ \
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ \
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+ const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
+ const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
+ \
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
+ stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
+ stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
+ stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
+ stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
+ stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
+ stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
+ stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
+ stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
+ stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
+ stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
+ stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
+ stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
+ stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
+ \
+ stp1_16 = stp2_16; \
+ stp1_17 = stp2_17; \
+ stp1_18 = stp2_18; \
+ stp1_19 = stp2_19; \
+ \
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
+ stp1_21, stp1_26) \
+ MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
+ stp1_23, stp1_24) \
+ \
+ stp1_28 = stp2_28; \
+ stp1_29 = stp2_29; \
+ stp1_30 = stp2_30; \
+ stp1_31 = stp2_31; \
+}
+
+
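/*
 * IDCT32_1D_34 above serves the at-most-34-coefficient case, where every
 * nonzero coefficient is expected to sit in the top-left 8x8 of the 32x32
 * block, so the second 1-D pass sees nonzero inputs only at in[0]..in[7].
 * Each Stage1/Stage2 butterfly therefore pairs a live column with a zero
 * register, and a single MULTIPLICATION_AND_ADD_2 replaces the two-input
 * form used by IDCT32_1D below.  A scalar sketch of one reduced Stage1 pair,
 * with the cospi constants and DCT_CONST_BITS assumed to match the rest of
 * the file:
 */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_DCT_CONST_BITS 14  /* assumed value */

static int16_t sketch_round_shift(int32_t v) {
  return (int16_t)((v + (1 << (SKETCH_DCT_CONST_BITS - 1))) >>
                   SKETCH_DCT_CONST_BITS);
}

/* Reference-style full form: stp1_16 = round(in1*c31 - in31*c1) and
 * stp1_31 = round(in1*c1 + in31*c31).  With in31 == 0 both collapse to a
 * single product per output, which is what the zero-partner unpack buys. */
static void stage1_16_31_reduced(int16_t in1, int16_t c1, int16_t c31,
                                 int16_t *stp1_16, int16_t *stp1_31) {
  *stp1_16 = sketch_round_shift(in1 * c31);
  *stp1_31 = sketch_round_shift(in1 * c1);
}

int main(void) {
  int16_t s16, s31;
  stage1_16_31_reduced(64, 16364, 804, &s16, &s31);  /* illustrative constants */
  printf("%d %d\n", s16, s31);                       /* prints "3 64" */
  return 0;
}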
#define IDCT32_1D \
/* Stage1 */ \
{ \
- const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); \
- const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); \
- const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); \
- const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); \
- \
- const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); \
- const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); \
- const __m128i lo_25_7= _mm_unpacklo_epi16(in25, in7); \
- const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); \
- \
- const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); \
- const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); \
- const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); \
- const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); \
- \
- const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); \
- const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); \
- const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); \
- const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); \
+ const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
+ const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
+ const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
+ const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
+ \
+ const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
+ const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
+  const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \
+ const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
+ \
+ const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
+ const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
+ const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
+ const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
+ \
+ const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
+ const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
+ const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
+ const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
\
MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
@@ -2840,15 +3128,15 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
\
/* Stage2 */ \
{ \
- const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); \
- const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); \
- const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); \
- const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); \
+ const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
+ const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
+ const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
+ const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
\
- const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); \
- const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); \
- const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); \
- const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); \
+ const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
+ const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
+ const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
+ const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
\
MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
@@ -2880,10 +3168,10 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
\
/* Stage3 */ \
{ \
- const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); \
- const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); \
- const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); \
- const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); \
+ const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
+ const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
+ const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
+ const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
\
const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
@@ -2927,10 +3215,10 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
\
/* Stage4 */ \
{ \
- const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); \
- const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); \
- const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); \
- const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); \
+ const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
+ const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
+ const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
+ const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
\
const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
@@ -3187,10 +3475,7 @@ void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
- in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
- in24, in25, in26, in27, in28, in29, in30, in31;
- __m128i col[128];
+ __m128i in[32], col[32];
__m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
@@ -3202,296 +3487,225 @@ void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
stp2_30, stp2_31;
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int i, j, i32;
-
- // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct.
- for (i = 0; i < 8; i++) {
- i32 = (i << 5);
- if (i == 0) {
- // First 1-D idct: first 8 rows
- // Load input data.
- LOAD_DQCOEFF(in0, input);
- LOAD_DQCOEFF(in8, input);
- LOAD_DQCOEFF(in16, input);
- LOAD_DQCOEFF(in24, input);
- LOAD_DQCOEFF(in1, input);
- LOAD_DQCOEFF(in9, input);
- LOAD_DQCOEFF(in17, input);
- LOAD_DQCOEFF(in25, input);
- LOAD_DQCOEFF(in2, input);
- LOAD_DQCOEFF(in10, input);
- LOAD_DQCOEFF(in18, input);
- LOAD_DQCOEFF(in26, input);
- LOAD_DQCOEFF(in3, input);
- LOAD_DQCOEFF(in11, input);
- LOAD_DQCOEFF(in19, input);
- LOAD_DQCOEFF(in27, input);
-
- LOAD_DQCOEFF(in4, input);
- LOAD_DQCOEFF(in12, input);
- LOAD_DQCOEFF(in20, input);
- LOAD_DQCOEFF(in28, input);
- LOAD_DQCOEFF(in5, input);
- LOAD_DQCOEFF(in13, input);
- LOAD_DQCOEFF(in21, input);
- LOAD_DQCOEFF(in29, input);
- LOAD_DQCOEFF(in6, input);
- LOAD_DQCOEFF(in14, input);
- LOAD_DQCOEFF(in22, input);
- LOAD_DQCOEFF(in30, input);
- LOAD_DQCOEFF(in7, input);
- LOAD_DQCOEFF(in15, input);
- LOAD_DQCOEFF(in23, input);
- LOAD_DQCOEFF(in31, input);
-
- // Transpose 32x8 block to 8x32 block
- TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
- in4, in5, in6, in7);
- TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
- in10, in11, in12, in13, in14, in15);
- TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
- in18, in19, in20, in21, in22, in23);
- TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
- in26, in27, in28, in29, in30, in31);
- } else if (i < 4) {
- // First 1-D idct: next 24 zero-coeff rows
- col[i32 + 0] = _mm_setzero_si128();
- col[i32 + 1] = _mm_setzero_si128();
- col[i32 + 2] = _mm_setzero_si128();
- col[i32 + 3] = _mm_setzero_si128();
- col[i32 + 4] = _mm_setzero_si128();
- col[i32 + 5] = _mm_setzero_si128();
- col[i32 + 6] = _mm_setzero_si128();
- col[i32 + 7] = _mm_setzero_si128();
- col[i32 + 8] = _mm_setzero_si128();
- col[i32 + 9] = _mm_setzero_si128();
- col[i32 + 10] = _mm_setzero_si128();
- col[i32 + 11] = _mm_setzero_si128();
- col[i32 + 12] = _mm_setzero_si128();
- col[i32 + 13] = _mm_setzero_si128();
- col[i32 + 14] = _mm_setzero_si128();
- col[i32 + 15] = _mm_setzero_si128();
- col[i32 + 16] = _mm_setzero_si128();
- col[i32 + 17] = _mm_setzero_si128();
- col[i32 + 18] = _mm_setzero_si128();
- col[i32 + 19] = _mm_setzero_si128();
- col[i32 + 20] = _mm_setzero_si128();
- col[i32 + 21] = _mm_setzero_si128();
- col[i32 + 22] = _mm_setzero_si128();
- col[i32 + 23] = _mm_setzero_si128();
- col[i32 + 24] = _mm_setzero_si128();
- col[i32 + 25] = _mm_setzero_si128();
- col[i32 + 26] = _mm_setzero_si128();
- col[i32 + 27] = _mm_setzero_si128();
- col[i32 + 28] = _mm_setzero_si128();
- col[i32 + 29] = _mm_setzero_si128();
- col[i32 + 30] = _mm_setzero_si128();
- col[i32 + 31] = _mm_setzero_si128();
- continue;
- } else {
- // Second 1-D idct
- j = i - 4;
-
- // Transpose 32x8 block to 8x32 block
- TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
- col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
- col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
- in5, in6, in7);
- j += 4;
- TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
- col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
- col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
- in11, in12, in13, in14, in15);
- j += 4;
- TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
- col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
- col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
- in19, in20, in21, in22, in23);
- j += 4;
- TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
- col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
- col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
- in28, in29, in30, in31);
- }
-
- IDCT32_1D
+ int i;
+ // Load input data.
+ LOAD_DQCOEFF(in[0], input);
+ LOAD_DQCOEFF(in[8], input);
+ LOAD_DQCOEFF(in[16], input);
+ LOAD_DQCOEFF(in[24], input);
+ LOAD_DQCOEFF(in[1], input);
+ LOAD_DQCOEFF(in[9], input);
+ LOAD_DQCOEFF(in[17], input);
+ LOAD_DQCOEFF(in[25], input);
+ LOAD_DQCOEFF(in[2], input);
+ LOAD_DQCOEFF(in[10], input);
+ LOAD_DQCOEFF(in[18], input);
+ LOAD_DQCOEFF(in[26], input);
+ LOAD_DQCOEFF(in[3], input);
+ LOAD_DQCOEFF(in[11], input);
+ LOAD_DQCOEFF(in[19], input);
+ LOAD_DQCOEFF(in[27], input);
+
+ LOAD_DQCOEFF(in[4], input);
+ LOAD_DQCOEFF(in[12], input);
+ LOAD_DQCOEFF(in[20], input);
+ LOAD_DQCOEFF(in[28], input);
+ LOAD_DQCOEFF(in[5], input);
+ LOAD_DQCOEFF(in[13], input);
+ LOAD_DQCOEFF(in[21], input);
+ LOAD_DQCOEFF(in[29], input);
+ LOAD_DQCOEFF(in[6], input);
+ LOAD_DQCOEFF(in[14], input);
+ LOAD_DQCOEFF(in[22], input);
+ LOAD_DQCOEFF(in[30], input);
+ LOAD_DQCOEFF(in[7], input);
+ LOAD_DQCOEFF(in[15], input);
+ LOAD_DQCOEFF(in[23], input);
+ LOAD_DQCOEFF(in[31], input);
- // final stage
- if (i < 4) {
- // 1_D: Store 32 intermediate results for each 8x32 block.
- col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
- col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
- col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
- col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
- col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
- col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
- col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
- col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
- col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
- col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
- col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
- col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
- col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
- col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
- col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
- col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
- col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
- col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
- col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
- col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
- col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
- col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
- col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
- col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
- col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
- col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
- col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
- col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
- col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
- col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
- col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
- col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
- } else {
+ array_transpose_8x8(in, in);
+ array_transpose_8x8(in+8, in+8);
+ array_transpose_8x8(in+16, in+16);
+ array_transpose_8x8(in+24, in+24);
+
+ IDCT32_1D
+
+ // 1_D: Store 32 intermediate results for each 8x32 block.
+ col[0] = _mm_add_epi16(stp1_0, stp1_31);
+ col[1] = _mm_add_epi16(stp1_1, stp1_30);
+ col[2] = _mm_add_epi16(stp1_2, stp1_29);
+ col[3] = _mm_add_epi16(stp1_3, stp1_28);
+ col[4] = _mm_add_epi16(stp1_4, stp1_27);
+ col[5] = _mm_add_epi16(stp1_5, stp1_26);
+ col[6] = _mm_add_epi16(stp1_6, stp1_25);
+ col[7] = _mm_add_epi16(stp1_7, stp1_24);
+ col[8] = _mm_add_epi16(stp1_8, stp1_23);
+ col[9] = _mm_add_epi16(stp1_9, stp1_22);
+ col[10] = _mm_add_epi16(stp1_10, stp1_21);
+ col[11] = _mm_add_epi16(stp1_11, stp1_20);
+ col[12] = _mm_add_epi16(stp1_12, stp1_19);
+ col[13] = _mm_add_epi16(stp1_13, stp1_18);
+ col[14] = _mm_add_epi16(stp1_14, stp1_17);
+ col[15] = _mm_add_epi16(stp1_15, stp1_16);
+ col[16] = _mm_sub_epi16(stp1_15, stp1_16);
+ col[17] = _mm_sub_epi16(stp1_14, stp1_17);
+ col[18] = _mm_sub_epi16(stp1_13, stp1_18);
+ col[19] = _mm_sub_epi16(stp1_12, stp1_19);
+ col[20] = _mm_sub_epi16(stp1_11, stp1_20);
+ col[21] = _mm_sub_epi16(stp1_10, stp1_21);
+ col[22] = _mm_sub_epi16(stp1_9, stp1_22);
+ col[23] = _mm_sub_epi16(stp1_8, stp1_23);
+ col[24] = _mm_sub_epi16(stp1_7, stp1_24);
+ col[25] = _mm_sub_epi16(stp1_6, stp1_25);
+ col[26] = _mm_sub_epi16(stp1_5, stp1_26);
+ col[27] = _mm_sub_epi16(stp1_4, stp1_27);
+ col[28] = _mm_sub_epi16(stp1_3, stp1_28);
+ col[29] = _mm_sub_epi16(stp1_2, stp1_29);
+ col[30] = _mm_sub_epi16(stp1_1, stp1_30);
+ col[31] = _mm_sub_epi16(stp1_0, stp1_31);
+ for (i = 0; i < 4; i++) {
const __m128i zero = _mm_setzero_si128();
+ // Transpose 32x8 block to 8x32 block
+ array_transpose_8x8(col+i*8, in);
+ IDCT32_1D_34
// 2_D: Calculate the results and store them to destination.
- in0 = _mm_add_epi16(stp1_0, stp1_31);
- in1 = _mm_add_epi16(stp1_1, stp1_30);
- in2 = _mm_add_epi16(stp1_2, stp1_29);
- in3 = _mm_add_epi16(stp1_3, stp1_28);
- in4 = _mm_add_epi16(stp1_4, stp1_27);
- in5 = _mm_add_epi16(stp1_5, stp1_26);
- in6 = _mm_add_epi16(stp1_6, stp1_25);
- in7 = _mm_add_epi16(stp1_7, stp1_24);
- in8 = _mm_add_epi16(stp1_8, stp1_23);
- in9 = _mm_add_epi16(stp1_9, stp1_22);
- in10 = _mm_add_epi16(stp1_10, stp1_21);
- in11 = _mm_add_epi16(stp1_11, stp1_20);
- in12 = _mm_add_epi16(stp1_12, stp1_19);
- in13 = _mm_add_epi16(stp1_13, stp1_18);
- in14 = _mm_add_epi16(stp1_14, stp1_17);
- in15 = _mm_add_epi16(stp1_15, stp1_16);
- in16 = _mm_sub_epi16(stp1_15, stp1_16);
- in17 = _mm_sub_epi16(stp1_14, stp1_17);
- in18 = _mm_sub_epi16(stp1_13, stp1_18);
- in19 = _mm_sub_epi16(stp1_12, stp1_19);
- in20 = _mm_sub_epi16(stp1_11, stp1_20);
- in21 = _mm_sub_epi16(stp1_10, stp1_21);
- in22 = _mm_sub_epi16(stp1_9, stp1_22);
- in23 = _mm_sub_epi16(stp1_8, stp1_23);
- in24 = _mm_sub_epi16(stp1_7, stp1_24);
- in25 = _mm_sub_epi16(stp1_6, stp1_25);
- in26 = _mm_sub_epi16(stp1_5, stp1_26);
- in27 = _mm_sub_epi16(stp1_4, stp1_27);
- in28 = _mm_sub_epi16(stp1_3, stp1_28);
- in29 = _mm_sub_epi16(stp1_2, stp1_29);
- in30 = _mm_sub_epi16(stp1_1, stp1_30);
- in31 = _mm_sub_epi16(stp1_0, stp1_31);
+ in[0] = _mm_add_epi16(stp1_0, stp1_31);
+ in[1] = _mm_add_epi16(stp1_1, stp1_30);
+ in[2] = _mm_add_epi16(stp1_2, stp1_29);
+ in[3] = _mm_add_epi16(stp1_3, stp1_28);
+ in[4] = _mm_add_epi16(stp1_4, stp1_27);
+ in[5] = _mm_add_epi16(stp1_5, stp1_26);
+ in[6] = _mm_add_epi16(stp1_6, stp1_25);
+ in[7] = _mm_add_epi16(stp1_7, stp1_24);
+ in[8] = _mm_add_epi16(stp1_8, stp1_23);
+ in[9] = _mm_add_epi16(stp1_9, stp1_22);
+ in[10] = _mm_add_epi16(stp1_10, stp1_21);
+ in[11] = _mm_add_epi16(stp1_11, stp1_20);
+ in[12] = _mm_add_epi16(stp1_12, stp1_19);
+ in[13] = _mm_add_epi16(stp1_13, stp1_18);
+ in[14] = _mm_add_epi16(stp1_14, stp1_17);
+ in[15] = _mm_add_epi16(stp1_15, stp1_16);
+ in[16] = _mm_sub_epi16(stp1_15, stp1_16);
+ in[17] = _mm_sub_epi16(stp1_14, stp1_17);
+ in[18] = _mm_sub_epi16(stp1_13, stp1_18);
+ in[19] = _mm_sub_epi16(stp1_12, stp1_19);
+ in[20] = _mm_sub_epi16(stp1_11, stp1_20);
+ in[21] = _mm_sub_epi16(stp1_10, stp1_21);
+ in[22] = _mm_sub_epi16(stp1_9, stp1_22);
+ in[23] = _mm_sub_epi16(stp1_8, stp1_23);
+ in[24] = _mm_sub_epi16(stp1_7, stp1_24);
+ in[25] = _mm_sub_epi16(stp1_6, stp1_25);
+ in[26] = _mm_sub_epi16(stp1_5, stp1_26);
+ in[27] = _mm_sub_epi16(stp1_4, stp1_27);
+ in[28] = _mm_sub_epi16(stp1_3, stp1_28);
+ in[29] = _mm_sub_epi16(stp1_2, stp1_29);
+ in[30] = _mm_sub_epi16(stp1_1, stp1_30);
+ in[31] = _mm_sub_epi16(stp1_0, stp1_31);
// Final rounding and shift
- in0 = _mm_adds_epi16(in0, final_rounding);
- in1 = _mm_adds_epi16(in1, final_rounding);
- in2 = _mm_adds_epi16(in2, final_rounding);
- in3 = _mm_adds_epi16(in3, final_rounding);
- in4 = _mm_adds_epi16(in4, final_rounding);
- in5 = _mm_adds_epi16(in5, final_rounding);
- in6 = _mm_adds_epi16(in6, final_rounding);
- in7 = _mm_adds_epi16(in7, final_rounding);
- in8 = _mm_adds_epi16(in8, final_rounding);
- in9 = _mm_adds_epi16(in9, final_rounding);
- in10 = _mm_adds_epi16(in10, final_rounding);
- in11 = _mm_adds_epi16(in11, final_rounding);
- in12 = _mm_adds_epi16(in12, final_rounding);
- in13 = _mm_adds_epi16(in13, final_rounding);
- in14 = _mm_adds_epi16(in14, final_rounding);
- in15 = _mm_adds_epi16(in15, final_rounding);
- in16 = _mm_adds_epi16(in16, final_rounding);
- in17 = _mm_adds_epi16(in17, final_rounding);
- in18 = _mm_adds_epi16(in18, final_rounding);
- in19 = _mm_adds_epi16(in19, final_rounding);
- in20 = _mm_adds_epi16(in20, final_rounding);
- in21 = _mm_adds_epi16(in21, final_rounding);
- in22 = _mm_adds_epi16(in22, final_rounding);
- in23 = _mm_adds_epi16(in23, final_rounding);
- in24 = _mm_adds_epi16(in24, final_rounding);
- in25 = _mm_adds_epi16(in25, final_rounding);
- in26 = _mm_adds_epi16(in26, final_rounding);
- in27 = _mm_adds_epi16(in27, final_rounding);
- in28 = _mm_adds_epi16(in28, final_rounding);
- in29 = _mm_adds_epi16(in29, final_rounding);
- in30 = _mm_adds_epi16(in30, final_rounding);
- in31 = _mm_adds_epi16(in31, final_rounding);
-
- in0 = _mm_srai_epi16(in0, 6);
- in1 = _mm_srai_epi16(in1, 6);
- in2 = _mm_srai_epi16(in2, 6);
- in3 = _mm_srai_epi16(in3, 6);
- in4 = _mm_srai_epi16(in4, 6);
- in5 = _mm_srai_epi16(in5, 6);
- in6 = _mm_srai_epi16(in6, 6);
- in7 = _mm_srai_epi16(in7, 6);
- in8 = _mm_srai_epi16(in8, 6);
- in9 = _mm_srai_epi16(in9, 6);
- in10 = _mm_srai_epi16(in10, 6);
- in11 = _mm_srai_epi16(in11, 6);
- in12 = _mm_srai_epi16(in12, 6);
- in13 = _mm_srai_epi16(in13, 6);
- in14 = _mm_srai_epi16(in14, 6);
- in15 = _mm_srai_epi16(in15, 6);
- in16 = _mm_srai_epi16(in16, 6);
- in17 = _mm_srai_epi16(in17, 6);
- in18 = _mm_srai_epi16(in18, 6);
- in19 = _mm_srai_epi16(in19, 6);
- in20 = _mm_srai_epi16(in20, 6);
- in21 = _mm_srai_epi16(in21, 6);
- in22 = _mm_srai_epi16(in22, 6);
- in23 = _mm_srai_epi16(in23, 6);
- in24 = _mm_srai_epi16(in24, 6);
- in25 = _mm_srai_epi16(in25, 6);
- in26 = _mm_srai_epi16(in26, 6);
- in27 = _mm_srai_epi16(in27, 6);
- in28 = _mm_srai_epi16(in28, 6);
- in29 = _mm_srai_epi16(in29, 6);
- in30 = _mm_srai_epi16(in30, 6);
- in31 = _mm_srai_epi16(in31, 6);
-
- RECON_AND_STORE(dest, in0);
- RECON_AND_STORE(dest, in1);
- RECON_AND_STORE(dest, in2);
- RECON_AND_STORE(dest, in3);
- RECON_AND_STORE(dest, in4);
- RECON_AND_STORE(dest, in5);
- RECON_AND_STORE(dest, in6);
- RECON_AND_STORE(dest, in7);
- RECON_AND_STORE(dest, in8);
- RECON_AND_STORE(dest, in9);
- RECON_AND_STORE(dest, in10);
- RECON_AND_STORE(dest, in11);
- RECON_AND_STORE(dest, in12);
- RECON_AND_STORE(dest, in13);
- RECON_AND_STORE(dest, in14);
- RECON_AND_STORE(dest, in15);
- RECON_AND_STORE(dest, in16);
- RECON_AND_STORE(dest, in17);
- RECON_AND_STORE(dest, in18);
- RECON_AND_STORE(dest, in19);
- RECON_AND_STORE(dest, in20);
- RECON_AND_STORE(dest, in21);
- RECON_AND_STORE(dest, in22);
- RECON_AND_STORE(dest, in23);
- RECON_AND_STORE(dest, in24);
- RECON_AND_STORE(dest, in25);
- RECON_AND_STORE(dest, in26);
- RECON_AND_STORE(dest, in27);
- RECON_AND_STORE(dest, in28);
- RECON_AND_STORE(dest, in29);
- RECON_AND_STORE(dest, in30);
- RECON_AND_STORE(dest, in31);
+ in[0] = _mm_adds_epi16(in[0], final_rounding);
+ in[1] = _mm_adds_epi16(in[1], final_rounding);
+ in[2] = _mm_adds_epi16(in[2], final_rounding);
+ in[3] = _mm_adds_epi16(in[3], final_rounding);
+ in[4] = _mm_adds_epi16(in[4], final_rounding);
+ in[5] = _mm_adds_epi16(in[5], final_rounding);
+ in[6] = _mm_adds_epi16(in[6], final_rounding);
+ in[7] = _mm_adds_epi16(in[7], final_rounding);
+ in[8] = _mm_adds_epi16(in[8], final_rounding);
+ in[9] = _mm_adds_epi16(in[9], final_rounding);
+ in[10] = _mm_adds_epi16(in[10], final_rounding);
+ in[11] = _mm_adds_epi16(in[11], final_rounding);
+ in[12] = _mm_adds_epi16(in[12], final_rounding);
+ in[13] = _mm_adds_epi16(in[13], final_rounding);
+ in[14] = _mm_adds_epi16(in[14], final_rounding);
+ in[15] = _mm_adds_epi16(in[15], final_rounding);
+ in[16] = _mm_adds_epi16(in[16], final_rounding);
+ in[17] = _mm_adds_epi16(in[17], final_rounding);
+ in[18] = _mm_adds_epi16(in[18], final_rounding);
+ in[19] = _mm_adds_epi16(in[19], final_rounding);
+ in[20] = _mm_adds_epi16(in[20], final_rounding);
+ in[21] = _mm_adds_epi16(in[21], final_rounding);
+ in[22] = _mm_adds_epi16(in[22], final_rounding);
+ in[23] = _mm_adds_epi16(in[23], final_rounding);
+ in[24] = _mm_adds_epi16(in[24], final_rounding);
+ in[25] = _mm_adds_epi16(in[25], final_rounding);
+ in[26] = _mm_adds_epi16(in[26], final_rounding);
+ in[27] = _mm_adds_epi16(in[27], final_rounding);
+ in[28] = _mm_adds_epi16(in[28], final_rounding);
+ in[29] = _mm_adds_epi16(in[29], final_rounding);
+ in[30] = _mm_adds_epi16(in[30], final_rounding);
+ in[31] = _mm_adds_epi16(in[31], final_rounding);
+
+ in[0] = _mm_srai_epi16(in[0], 6);
+ in[1] = _mm_srai_epi16(in[1], 6);
+ in[2] = _mm_srai_epi16(in[2], 6);
+ in[3] = _mm_srai_epi16(in[3], 6);
+ in[4] = _mm_srai_epi16(in[4], 6);
+ in[5] = _mm_srai_epi16(in[5], 6);
+ in[6] = _mm_srai_epi16(in[6], 6);
+ in[7] = _mm_srai_epi16(in[7], 6);
+ in[8] = _mm_srai_epi16(in[8], 6);
+ in[9] = _mm_srai_epi16(in[9], 6);
+ in[10] = _mm_srai_epi16(in[10], 6);
+ in[11] = _mm_srai_epi16(in[11], 6);
+ in[12] = _mm_srai_epi16(in[12], 6);
+ in[13] = _mm_srai_epi16(in[13], 6);
+ in[14] = _mm_srai_epi16(in[14], 6);
+ in[15] = _mm_srai_epi16(in[15], 6);
+ in[16] = _mm_srai_epi16(in[16], 6);
+ in[17] = _mm_srai_epi16(in[17], 6);
+ in[18] = _mm_srai_epi16(in[18], 6);
+ in[19] = _mm_srai_epi16(in[19], 6);
+ in[20] = _mm_srai_epi16(in[20], 6);
+ in[21] = _mm_srai_epi16(in[21], 6);
+ in[22] = _mm_srai_epi16(in[22], 6);
+ in[23] = _mm_srai_epi16(in[23], 6);
+ in[24] = _mm_srai_epi16(in[24], 6);
+ in[25] = _mm_srai_epi16(in[25], 6);
+ in[26] = _mm_srai_epi16(in[26], 6);
+ in[27] = _mm_srai_epi16(in[27], 6);
+ in[28] = _mm_srai_epi16(in[28], 6);
+ in[29] = _mm_srai_epi16(in[29], 6);
+ in[30] = _mm_srai_epi16(in[30], 6);
+ in[31] = _mm_srai_epi16(in[31], 6);
+
+ RECON_AND_STORE(dest, in[0]);
+ RECON_AND_STORE(dest, in[1]);
+ RECON_AND_STORE(dest, in[2]);
+ RECON_AND_STORE(dest, in[3]);
+ RECON_AND_STORE(dest, in[4]);
+ RECON_AND_STORE(dest, in[5]);
+ RECON_AND_STORE(dest, in[6]);
+ RECON_AND_STORE(dest, in[7]);
+ RECON_AND_STORE(dest, in[8]);
+ RECON_AND_STORE(dest, in[9]);
+ RECON_AND_STORE(dest, in[10]);
+ RECON_AND_STORE(dest, in[11]);
+ RECON_AND_STORE(dest, in[12]);
+ RECON_AND_STORE(dest, in[13]);
+ RECON_AND_STORE(dest, in[14]);
+ RECON_AND_STORE(dest, in[15]);
+ RECON_AND_STORE(dest, in[16]);
+ RECON_AND_STORE(dest, in[17]);
+ RECON_AND_STORE(dest, in[18]);
+ RECON_AND_STORE(dest, in[19]);
+ RECON_AND_STORE(dest, in[20]);
+ RECON_AND_STORE(dest, in[21]);
+ RECON_AND_STORE(dest, in[22]);
+ RECON_AND_STORE(dest, in[23]);
+ RECON_AND_STORE(dest, in[24]);
+ RECON_AND_STORE(dest, in[25]);
+ RECON_AND_STORE(dest, in[26]);
+ RECON_AND_STORE(dest, in[27]);
+ RECON_AND_STORE(dest, in[28]);
+ RECON_AND_STORE(dest, in[29]);
+ RECON_AND_STORE(dest, in[30]);
+ RECON_AND_STORE(dest, in[31]);
dest += 8 - (stride * 32);
}
}
-}
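/*
 * Scalar sketch of what each RECON_AND_STORE above is taken to do for one
 * row of 8 pixels: add the rounded residual to the prediction already in
 * dest, clamp to [0, 255], and advance dest by stride.  The real macro is
 * defined earlier in this file and is only mirrored conceptually here.
 */
#include <stdint.h>

static uint8_t sketch_clip_pixel(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void sketch_recon_and_store(uint8_t **dest, int stride,
                                   const int16_t residual[8]) {
  int j;
  for (j = 0; j < 8; ++j)
    (*dest)[j] = sketch_clip_pixel((*dest)[j] + residual[j]);
  *dest += stride;  /* the next invocation writes the row below */
}

int main(void) {
  uint8_t frame[2][8] = { { 250, 250, 0, 0, 10, 10, 128, 128 }, { 0 } };
  const int16_t res[8] = { 10, -10, -5, 5, 300, -300, 0, 1 };
  uint8_t *dest = frame[0];
  sketch_recon_and_store(&dest, 8, res);
  /* frame[0] is now { 255, 240, 0, 5, 255, 0, 128, 129 }; dest -> frame[1] */
  return 0;
}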
void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
int stride) {
@@ -3546,10 +3760,7 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
- in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
- in24, in25, in26, in27, in28, in29, in30, in31;
- __m128i col[128];
+ __m128i in[32], col[128], zero_idx[16];
__m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
@@ -3562,66 +3773,63 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
stp2_30, stp2_31;
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int i, j, i32;
- __m128i zero_idx[16];
int zero_flag[2];
- // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct.
- for (i = 0; i < 8; i++) {
+ for (i = 0; i < 4; i++) {
i32 = (i << 5);
- if (i < 4) {
// First 1-D idct
// Load input data.
- LOAD_DQCOEFF(in0, input);
- LOAD_DQCOEFF(in8, input);
- LOAD_DQCOEFF(in16, input);
- LOAD_DQCOEFF(in24, input);
- LOAD_DQCOEFF(in1, input);
- LOAD_DQCOEFF(in9, input);
- LOAD_DQCOEFF(in17, input);
- LOAD_DQCOEFF(in25, input);
- LOAD_DQCOEFF(in2, input);
- LOAD_DQCOEFF(in10, input);
- LOAD_DQCOEFF(in18, input);
- LOAD_DQCOEFF(in26, input);
- LOAD_DQCOEFF(in3, input);
- LOAD_DQCOEFF(in11, input);
- LOAD_DQCOEFF(in19, input);
- LOAD_DQCOEFF(in27, input);
-
- LOAD_DQCOEFF(in4, input);
- LOAD_DQCOEFF(in12, input);
- LOAD_DQCOEFF(in20, input);
- LOAD_DQCOEFF(in28, input);
- LOAD_DQCOEFF(in5, input);
- LOAD_DQCOEFF(in13, input);
- LOAD_DQCOEFF(in21, input);
- LOAD_DQCOEFF(in29, input);
- LOAD_DQCOEFF(in6, input);
- LOAD_DQCOEFF(in14, input);
- LOAD_DQCOEFF(in22, input);
- LOAD_DQCOEFF(in30, input);
- LOAD_DQCOEFF(in7, input);
- LOAD_DQCOEFF(in15, input);
- LOAD_DQCOEFF(in23, input);
- LOAD_DQCOEFF(in31, input);
+ LOAD_DQCOEFF(in[0], input);
+ LOAD_DQCOEFF(in[8], input);
+ LOAD_DQCOEFF(in[16], input);
+ LOAD_DQCOEFF(in[24], input);
+ LOAD_DQCOEFF(in[1], input);
+ LOAD_DQCOEFF(in[9], input);
+ LOAD_DQCOEFF(in[17], input);
+ LOAD_DQCOEFF(in[25], input);
+ LOAD_DQCOEFF(in[2], input);
+ LOAD_DQCOEFF(in[10], input);
+ LOAD_DQCOEFF(in[18], input);
+ LOAD_DQCOEFF(in[26], input);
+ LOAD_DQCOEFF(in[3], input);
+ LOAD_DQCOEFF(in[11], input);
+ LOAD_DQCOEFF(in[19], input);
+ LOAD_DQCOEFF(in[27], input);
+
+ LOAD_DQCOEFF(in[4], input);
+ LOAD_DQCOEFF(in[12], input);
+ LOAD_DQCOEFF(in[20], input);
+ LOAD_DQCOEFF(in[28], input);
+ LOAD_DQCOEFF(in[5], input);
+ LOAD_DQCOEFF(in[13], input);
+ LOAD_DQCOEFF(in[21], input);
+ LOAD_DQCOEFF(in[29], input);
+ LOAD_DQCOEFF(in[6], input);
+ LOAD_DQCOEFF(in[14], input);
+ LOAD_DQCOEFF(in[22], input);
+ LOAD_DQCOEFF(in[30], input);
+ LOAD_DQCOEFF(in[7], input);
+ LOAD_DQCOEFF(in[15], input);
+ LOAD_DQCOEFF(in[23], input);
+ LOAD_DQCOEFF(in[31], input);
// checking if all entries are zero
- zero_idx[0] = _mm_or_si128(in0, in1);
- zero_idx[1] = _mm_or_si128(in2, in3);
- zero_idx[2] = _mm_or_si128(in4, in5);
- zero_idx[3] = _mm_or_si128(in6, in7);
- zero_idx[4] = _mm_or_si128(in8, in9);
- zero_idx[5] = _mm_or_si128(in10, in11);
- zero_idx[6] = _mm_or_si128(in12, in13);
- zero_idx[7] = _mm_or_si128(in14, in15);
- zero_idx[8] = _mm_or_si128(in16, in17);
- zero_idx[9] = _mm_or_si128(in18, in19);
- zero_idx[10] = _mm_or_si128(in20, in21);
- zero_idx[11] = _mm_or_si128(in22, in23);
- zero_idx[12] = _mm_or_si128(in24, in25);
- zero_idx[13] = _mm_or_si128(in26, in27);
- zero_idx[14] = _mm_or_si128(in28, in29);
- zero_idx[15] = _mm_or_si128(in30, in31);
+ zero_idx[0] = _mm_or_si128(in[0], in[1]);
+ zero_idx[1] = _mm_or_si128(in[2], in[3]);
+ zero_idx[2] = _mm_or_si128(in[4], in[5]);
+ zero_idx[3] = _mm_or_si128(in[6], in[7]);
+ zero_idx[4] = _mm_or_si128(in[8], in[9]);
+ zero_idx[5] = _mm_or_si128(in[10], in[11]);
+ zero_idx[6] = _mm_or_si128(in[12], in[13]);
+ zero_idx[7] = _mm_or_si128(in[14], in[15]);
+ zero_idx[8] = _mm_or_si128(in[16], in[17]);
+ zero_idx[9] = _mm_or_si128(in[18], in[19]);
+ zero_idx[10] = _mm_or_si128(in[20], in[21]);
+ zero_idx[11] = _mm_or_si128(in[22], in[23]);
+ zero_idx[12] = _mm_or_si128(in[24], in[25]);
+ zero_idx[13] = _mm_or_si128(in[26], in[27]);
+ zero_idx[14] = _mm_or_si128(in[28], in[29]);
+ zero_idx[15] = _mm_or_si128(in[30], in[31]);
zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
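/*
 * The zero_idx OR-tree above folds all 32 coefficient vectors of the 8x32
 * slice into progressively fewer values so that a single test (zero_flag,
 * computed past the end of this hunk) can tell whether the slice is all
 * zero and its work can be short-circuited.  A scalar sketch of the idea:
 */
#include <stdint.h>

static int sketch_slice_is_all_zero(const int16_t *coeffs, int n) {
  int16_t acc = 0;
  int i;
  for (i = 0; i < n; ++i)
    acc |= coeffs[i];  /* OR everything together, as the vector tree does */
  return acc == 0;     /* nonzero accumulator => at least one live coeff  */
}

int main(void) {
  const int16_t slice[8 * 32] = { 0 };  /* hypothetical all-zero 8x32 slice */
  return sketch_slice_is_all_zero(slice, 8 * 32) ? 0 : 1;
}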
@@ -3683,44 +3891,13 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
}
// Transpose 32x8 block to 8x32 block
- TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
- in4, in5, in6, in7);
- TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
- in10, in11, in12, in13, in14, in15);
- TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
- in18, in19, in20, in21, in22, in23);
- TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
- in26, in27, in28, in29, in30, in31);
- } else {
- // Second 1-D idct
- j = i - 4;
+ array_transpose_8x8(in, in);
+ array_transpose_8x8(in+8, in+8);
+ array_transpose_8x8(in+16, in+16);
+ array_transpose_8x8(in+24, in+24);
- // Transpose 32x8 block to 8x32 block
- TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
- col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
- col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
- in5, in6, in7);
- j += 4;
- TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
- col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
- col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
- in11, in12, in13, in14, in15);
- j += 4;
- TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
- col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
- col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
- in19, in20, in21, in22, in23);
- j += 4;
- TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
- col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
- col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
- in28, in29, in30, in31);
- }
-
- IDCT32_1D
+ IDCT32_1D
- // final stage
- if (i < 4) {
// 1_D: Store 32 intermediate results for each 8x32 block.
col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
@@ -3754,146 +3931,156 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
- } else {
+ }
+ for (i = 0; i < 4; i++) {
const __m128i zero = _mm_setzero_si128();
+ // Second 1-D idct
+ j = i << 3;
+
+ // Transpose 32x8 block to 8x32 block
+ array_transpose_8x8(col+j, in);
+ array_transpose_8x8(col+j+32, in+8);
+ array_transpose_8x8(col+j+64, in+16);
+ array_transpose_8x8(col+j+96, in+24);
+
+ IDCT32_1D
// 2_D: Calculate the results and store them to destination.
- in0 = _mm_add_epi16(stp1_0, stp1_31);
- in1 = _mm_add_epi16(stp1_1, stp1_30);
- in2 = _mm_add_epi16(stp1_2, stp1_29);
- in3 = _mm_add_epi16(stp1_3, stp1_28);
- in4 = _mm_add_epi16(stp1_4, stp1_27);
- in5 = _mm_add_epi16(stp1_5, stp1_26);
- in6 = _mm_add_epi16(stp1_6, stp1_25);
- in7 = _mm_add_epi16(stp1_7, stp1_24);
- in8 = _mm_add_epi16(stp1_8, stp1_23);
- in9 = _mm_add_epi16(stp1_9, stp1_22);
- in10 = _mm_add_epi16(stp1_10, stp1_21);
- in11 = _mm_add_epi16(stp1_11, stp1_20);
- in12 = _mm_add_epi16(stp1_12, stp1_19);
- in13 = _mm_add_epi16(stp1_13, stp1_18);
- in14 = _mm_add_epi16(stp1_14, stp1_17);
- in15 = _mm_add_epi16(stp1_15, stp1_16);
- in16 = _mm_sub_epi16(stp1_15, stp1_16);
- in17 = _mm_sub_epi16(stp1_14, stp1_17);
- in18 = _mm_sub_epi16(stp1_13, stp1_18);
- in19 = _mm_sub_epi16(stp1_12, stp1_19);
- in20 = _mm_sub_epi16(stp1_11, stp1_20);
- in21 = _mm_sub_epi16(stp1_10, stp1_21);
- in22 = _mm_sub_epi16(stp1_9, stp1_22);
- in23 = _mm_sub_epi16(stp1_8, stp1_23);
- in24 = _mm_sub_epi16(stp1_7, stp1_24);
- in25 = _mm_sub_epi16(stp1_6, stp1_25);
- in26 = _mm_sub_epi16(stp1_5, stp1_26);
- in27 = _mm_sub_epi16(stp1_4, stp1_27);
- in28 = _mm_sub_epi16(stp1_3, stp1_28);
- in29 = _mm_sub_epi16(stp1_2, stp1_29);
- in30 = _mm_sub_epi16(stp1_1, stp1_30);
- in31 = _mm_sub_epi16(stp1_0, stp1_31);
+ in[0] = _mm_add_epi16(stp1_0, stp1_31);
+ in[1] = _mm_add_epi16(stp1_1, stp1_30);
+ in[2] = _mm_add_epi16(stp1_2, stp1_29);
+ in[3] = _mm_add_epi16(stp1_3, stp1_28);
+ in[4] = _mm_add_epi16(stp1_4, stp1_27);
+ in[5] = _mm_add_epi16(stp1_5, stp1_26);
+ in[6] = _mm_add_epi16(stp1_6, stp1_25);
+ in[7] = _mm_add_epi16(stp1_7, stp1_24);
+ in[8] = _mm_add_epi16(stp1_8, stp1_23);
+ in[9] = _mm_add_epi16(stp1_9, stp1_22);
+ in[10] = _mm_add_epi16(stp1_10, stp1_21);
+ in[11] = _mm_add_epi16(stp1_11, stp1_20);
+ in[12] = _mm_add_epi16(stp1_12, stp1_19);
+ in[13] = _mm_add_epi16(stp1_13, stp1_18);
+ in[14] = _mm_add_epi16(stp1_14, stp1_17);
+ in[15] = _mm_add_epi16(stp1_15, stp1_16);
+ in[16] = _mm_sub_epi16(stp1_15, stp1_16);
+ in[17] = _mm_sub_epi16(stp1_14, stp1_17);
+ in[18] = _mm_sub_epi16(stp1_13, stp1_18);
+ in[19] = _mm_sub_epi16(stp1_12, stp1_19);
+ in[20] = _mm_sub_epi16(stp1_11, stp1_20);
+ in[21] = _mm_sub_epi16(stp1_10, stp1_21);
+ in[22] = _mm_sub_epi16(stp1_9, stp1_22);
+ in[23] = _mm_sub_epi16(stp1_8, stp1_23);
+ in[24] = _mm_sub_epi16(stp1_7, stp1_24);
+ in[25] = _mm_sub_epi16(stp1_6, stp1_25);
+ in[26] = _mm_sub_epi16(stp1_5, stp1_26);
+ in[27] = _mm_sub_epi16(stp1_4, stp1_27);
+ in[28] = _mm_sub_epi16(stp1_3, stp1_28);
+ in[29] = _mm_sub_epi16(stp1_2, stp1_29);
+ in[30] = _mm_sub_epi16(stp1_1, stp1_30);
+ in[31] = _mm_sub_epi16(stp1_0, stp1_31);
// Final rounding and shift
- in0 = _mm_adds_epi16(in0, final_rounding);
- in1 = _mm_adds_epi16(in1, final_rounding);
- in2 = _mm_adds_epi16(in2, final_rounding);
- in3 = _mm_adds_epi16(in3, final_rounding);
- in4 = _mm_adds_epi16(in4, final_rounding);
- in5 = _mm_adds_epi16(in5, final_rounding);
- in6 = _mm_adds_epi16(in6, final_rounding);
- in7 = _mm_adds_epi16(in7, final_rounding);
- in8 = _mm_adds_epi16(in8, final_rounding);
- in9 = _mm_adds_epi16(in9, final_rounding);
- in10 = _mm_adds_epi16(in10, final_rounding);
- in11 = _mm_adds_epi16(in11, final_rounding);
- in12 = _mm_adds_epi16(in12, final_rounding);
- in13 = _mm_adds_epi16(in13, final_rounding);
- in14 = _mm_adds_epi16(in14, final_rounding);
- in15 = _mm_adds_epi16(in15, final_rounding);
- in16 = _mm_adds_epi16(in16, final_rounding);
- in17 = _mm_adds_epi16(in17, final_rounding);
- in18 = _mm_adds_epi16(in18, final_rounding);
- in19 = _mm_adds_epi16(in19, final_rounding);
- in20 = _mm_adds_epi16(in20, final_rounding);
- in21 = _mm_adds_epi16(in21, final_rounding);
- in22 = _mm_adds_epi16(in22, final_rounding);
- in23 = _mm_adds_epi16(in23, final_rounding);
- in24 = _mm_adds_epi16(in24, final_rounding);
- in25 = _mm_adds_epi16(in25, final_rounding);
- in26 = _mm_adds_epi16(in26, final_rounding);
- in27 = _mm_adds_epi16(in27, final_rounding);
- in28 = _mm_adds_epi16(in28, final_rounding);
- in29 = _mm_adds_epi16(in29, final_rounding);
- in30 = _mm_adds_epi16(in30, final_rounding);
- in31 = _mm_adds_epi16(in31, final_rounding);
-
- in0 = _mm_srai_epi16(in0, 6);
- in1 = _mm_srai_epi16(in1, 6);
- in2 = _mm_srai_epi16(in2, 6);
- in3 = _mm_srai_epi16(in3, 6);
- in4 = _mm_srai_epi16(in4, 6);
- in5 = _mm_srai_epi16(in5, 6);
- in6 = _mm_srai_epi16(in6, 6);
- in7 = _mm_srai_epi16(in7, 6);
- in8 = _mm_srai_epi16(in8, 6);
- in9 = _mm_srai_epi16(in9, 6);
- in10 = _mm_srai_epi16(in10, 6);
- in11 = _mm_srai_epi16(in11, 6);
- in12 = _mm_srai_epi16(in12, 6);
- in13 = _mm_srai_epi16(in13, 6);
- in14 = _mm_srai_epi16(in14, 6);
- in15 = _mm_srai_epi16(in15, 6);
- in16 = _mm_srai_epi16(in16, 6);
- in17 = _mm_srai_epi16(in17, 6);
- in18 = _mm_srai_epi16(in18, 6);
- in19 = _mm_srai_epi16(in19, 6);
- in20 = _mm_srai_epi16(in20, 6);
- in21 = _mm_srai_epi16(in21, 6);
- in22 = _mm_srai_epi16(in22, 6);
- in23 = _mm_srai_epi16(in23, 6);
- in24 = _mm_srai_epi16(in24, 6);
- in25 = _mm_srai_epi16(in25, 6);
- in26 = _mm_srai_epi16(in26, 6);
- in27 = _mm_srai_epi16(in27, 6);
- in28 = _mm_srai_epi16(in28, 6);
- in29 = _mm_srai_epi16(in29, 6);
- in30 = _mm_srai_epi16(in30, 6);
- in31 = _mm_srai_epi16(in31, 6);
-
- RECON_AND_STORE(dest, in0);
- RECON_AND_STORE(dest, in1);
- RECON_AND_STORE(dest, in2);
- RECON_AND_STORE(dest, in3);
- RECON_AND_STORE(dest, in4);
- RECON_AND_STORE(dest, in5);
- RECON_AND_STORE(dest, in6);
- RECON_AND_STORE(dest, in7);
- RECON_AND_STORE(dest, in8);
- RECON_AND_STORE(dest, in9);
- RECON_AND_STORE(dest, in10);
- RECON_AND_STORE(dest, in11);
- RECON_AND_STORE(dest, in12);
- RECON_AND_STORE(dest, in13);
- RECON_AND_STORE(dest, in14);
- RECON_AND_STORE(dest, in15);
- RECON_AND_STORE(dest, in16);
- RECON_AND_STORE(dest, in17);
- RECON_AND_STORE(dest, in18);
- RECON_AND_STORE(dest, in19);
- RECON_AND_STORE(dest, in20);
- RECON_AND_STORE(dest, in21);
- RECON_AND_STORE(dest, in22);
- RECON_AND_STORE(dest, in23);
- RECON_AND_STORE(dest, in24);
- RECON_AND_STORE(dest, in25);
- RECON_AND_STORE(dest, in26);
- RECON_AND_STORE(dest, in27);
- RECON_AND_STORE(dest, in28);
- RECON_AND_STORE(dest, in29);
- RECON_AND_STORE(dest, in30);
- RECON_AND_STORE(dest, in31);
+ in[0] = _mm_adds_epi16(in[0], final_rounding);
+ in[1] = _mm_adds_epi16(in[1], final_rounding);
+ in[2] = _mm_adds_epi16(in[2], final_rounding);
+ in[3] = _mm_adds_epi16(in[3], final_rounding);
+ in[4] = _mm_adds_epi16(in[4], final_rounding);
+ in[5] = _mm_adds_epi16(in[5], final_rounding);
+ in[6] = _mm_adds_epi16(in[6], final_rounding);
+ in[7] = _mm_adds_epi16(in[7], final_rounding);
+ in[8] = _mm_adds_epi16(in[8], final_rounding);
+ in[9] = _mm_adds_epi16(in[9], final_rounding);
+ in[10] = _mm_adds_epi16(in[10], final_rounding);
+ in[11] = _mm_adds_epi16(in[11], final_rounding);
+ in[12] = _mm_adds_epi16(in[12], final_rounding);
+ in[13] = _mm_adds_epi16(in[13], final_rounding);
+ in[14] = _mm_adds_epi16(in[14], final_rounding);
+ in[15] = _mm_adds_epi16(in[15], final_rounding);
+ in[16] = _mm_adds_epi16(in[16], final_rounding);
+ in[17] = _mm_adds_epi16(in[17], final_rounding);
+ in[18] = _mm_adds_epi16(in[18], final_rounding);
+ in[19] = _mm_adds_epi16(in[19], final_rounding);
+ in[20] = _mm_adds_epi16(in[20], final_rounding);
+ in[21] = _mm_adds_epi16(in[21], final_rounding);
+ in[22] = _mm_adds_epi16(in[22], final_rounding);
+ in[23] = _mm_adds_epi16(in[23], final_rounding);
+ in[24] = _mm_adds_epi16(in[24], final_rounding);
+ in[25] = _mm_adds_epi16(in[25], final_rounding);
+ in[26] = _mm_adds_epi16(in[26], final_rounding);
+ in[27] = _mm_adds_epi16(in[27], final_rounding);
+ in[28] = _mm_adds_epi16(in[28], final_rounding);
+ in[29] = _mm_adds_epi16(in[29], final_rounding);
+ in[30] = _mm_adds_epi16(in[30], final_rounding);
+ in[31] = _mm_adds_epi16(in[31], final_rounding);
+
+ in[0] = _mm_srai_epi16(in[0], 6);
+ in[1] = _mm_srai_epi16(in[1], 6);
+ in[2] = _mm_srai_epi16(in[2], 6);
+ in[3] = _mm_srai_epi16(in[3], 6);
+ in[4] = _mm_srai_epi16(in[4], 6);
+ in[5] = _mm_srai_epi16(in[5], 6);
+ in[6] = _mm_srai_epi16(in[6], 6);
+ in[7] = _mm_srai_epi16(in[7], 6);
+ in[8] = _mm_srai_epi16(in[8], 6);
+ in[9] = _mm_srai_epi16(in[9], 6);
+ in[10] = _mm_srai_epi16(in[10], 6);
+ in[11] = _mm_srai_epi16(in[11], 6);
+ in[12] = _mm_srai_epi16(in[12], 6);
+ in[13] = _mm_srai_epi16(in[13], 6);
+ in[14] = _mm_srai_epi16(in[14], 6);
+ in[15] = _mm_srai_epi16(in[15], 6);
+ in[16] = _mm_srai_epi16(in[16], 6);
+ in[17] = _mm_srai_epi16(in[17], 6);
+ in[18] = _mm_srai_epi16(in[18], 6);
+ in[19] = _mm_srai_epi16(in[19], 6);
+ in[20] = _mm_srai_epi16(in[20], 6);
+ in[21] = _mm_srai_epi16(in[21], 6);
+ in[22] = _mm_srai_epi16(in[22], 6);
+ in[23] = _mm_srai_epi16(in[23], 6);
+ in[24] = _mm_srai_epi16(in[24], 6);
+ in[25] = _mm_srai_epi16(in[25], 6);
+ in[26] = _mm_srai_epi16(in[26], 6);
+ in[27] = _mm_srai_epi16(in[27], 6);
+ in[28] = _mm_srai_epi16(in[28], 6);
+ in[29] = _mm_srai_epi16(in[29], 6);
+ in[30] = _mm_srai_epi16(in[30], 6);
+ in[31] = _mm_srai_epi16(in[31], 6);
+
+ RECON_AND_STORE(dest, in[0]);
+ RECON_AND_STORE(dest, in[1]);
+ RECON_AND_STORE(dest, in[2]);
+ RECON_AND_STORE(dest, in[3]);
+ RECON_AND_STORE(dest, in[4]);
+ RECON_AND_STORE(dest, in[5]);
+ RECON_AND_STORE(dest, in[6]);
+ RECON_AND_STORE(dest, in[7]);
+ RECON_AND_STORE(dest, in[8]);
+ RECON_AND_STORE(dest, in[9]);
+ RECON_AND_STORE(dest, in[10]);
+ RECON_AND_STORE(dest, in[11]);
+ RECON_AND_STORE(dest, in[12]);
+ RECON_AND_STORE(dest, in[13]);
+ RECON_AND_STORE(dest, in[14]);
+ RECON_AND_STORE(dest, in[15]);
+ RECON_AND_STORE(dest, in[16]);
+ RECON_AND_STORE(dest, in[17]);
+ RECON_AND_STORE(dest, in[18]);
+ RECON_AND_STORE(dest, in[19]);
+ RECON_AND_STORE(dest, in[20]);
+ RECON_AND_STORE(dest, in[21]);
+ RECON_AND_STORE(dest, in[22]);
+ RECON_AND_STORE(dest, in[23]);
+ RECON_AND_STORE(dest, in[24]);
+ RECON_AND_STORE(dest, in[25]);
+ RECON_AND_STORE(dest, in[26]);
+ RECON_AND_STORE(dest, in[27]);
+ RECON_AND_STORE(dest, in[28]);
+ RECON_AND_STORE(dest, in[29]);
+ RECON_AND_STORE(dest, in[30]);
+ RECON_AND_STORE(dest, in[31]);
dest += 8 - (stride * 32);
}
- }
} //NOLINT
void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
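Note on the hunk above: the second 1-D pass now writes its 32 output rows into the in[] array, adds the final rounding term, arithmetically shifts right by 6, and reconstructs through RECON_AND_STORE. The scalar sketch below shows what that tail does for one 8-pixel row; recon_row_c is an illustrative name, and it assumes final_rounding holds 1 << 5 (the usual rounding term for a >> 6 shift) and that RECON_AND_STORE adds the residual to the prediction already in dest and clamps to 8 bits.

    #include <stdint.h>

    /* Scalar sketch (illustrative, not the libvpx routine) of the rounding,
     * shift and reconstruction performed per 8-pixel row by the
     * _mm_adds_epi16 / _mm_srai_epi16 / RECON_AND_STORE sequence above. */
    static void recon_row_c(const int16_t *residual, uint8_t *dest) {
      int i;
      for (i = 0; i < 8; ++i) {
        const int v = (residual[i] + 32) >> 6;                  /* round, then >> 6 */
        const int p = dest[i] + v;                              /* add prediction */
        dest[i] = (uint8_t)(p < 0 ? 0 : (p > 255 ? 255 : p));   /* clamp to 8 bits */
      }
    }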
diff --git a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
index 925f74d19..3ca55cfc3 100644
--- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
+++ b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include <emmintrin.h> /* SSE2 */
+#include <emmintrin.h> // SSE2
#include "vp9/common/vp9_loopfilter.h"
#include "vpx_ports/emmintrin_compat.h"
@@ -99,7 +99,7 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
filt = _mm_adds_epi8(filt, work_a);
filt = _mm_adds_epi8(filt, work_a);
filt = _mm_adds_epi8(filt, work_a);
- /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+ // (vp9_filter + 3 * (qs0 - ps0)) & mask
filt = _mm_and_si128(filt, mask);
filter1 = _mm_adds_epi8(filt, t4);
@@ -110,11 +110,11 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
filter2 = _mm_unpacklo_epi8(zero, filter2);
filter2 = _mm_srai_epi16(filter2, 0xB);
- /* Filter1 >> 3 */
+ // Filter1 >> 3
filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
- /* filt >> 1 */
+ // filt >> 1
filt = _mm_adds_epi16(filter1, t1);
filt = _mm_srai_epi16(filt, 1);
filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
@@ -473,13 +473,13 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
filt = _mm_adds_epi8(filt, work_a);
filt = _mm_adds_epi8(filt, work_a);
filt = _mm_adds_epi8(filt, work_a);
- /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+ // (vp9_filter + 3 * (qs0 - ps0)) & mask
filt = _mm_and_si128(filt, mask);
filter1 = _mm_adds_epi8(filt, t4);
filter2 = _mm_adds_epi8(filt, t3);
- /* Filter1 >> 3 */
+ // Filter1 >> 3
work_a = _mm_cmpgt_epi8(zero, filter1);
filter1 = _mm_srli_epi16(filter1, 3);
work_a = _mm_and_si128(work_a, te0);
@@ -487,7 +487,7 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
filter1 = _mm_or_si128(filter1, work_a);
qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
- /* Filter2 >> 3 */
+ // Filter2 >> 3
work_a = _mm_cmpgt_epi8(zero, filter2);
filter2 = _mm_srli_epi16(filter2, 3);
work_a = _mm_and_si128(work_a, te0);
@@ -495,7 +495,7 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
filter2 = _mm_or_si128(filter2, work_a);
ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
- /* filt >> 1 */
+ // filt >> 1
filt = _mm_adds_epi8(filter1, t1);
work_a = _mm_cmpgt_epi8(zero, filt);
filt = _mm_srli_epi16(filt, 1);
@@ -1014,23 +1014,23 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
filt = _mm_adds_epi8(filt, work_a);
filt = _mm_adds_epi8(filt, work_a);
filt = _mm_adds_epi8(filt, work_a);
- /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+ // (vp9_filter + 3 * (qs0 - ps0)) & mask
filt = _mm_and_si128(filt, mask);
filter1 = _mm_adds_epi8(filt, t4);
filter2 = _mm_adds_epi8(filt, t3);
- /* Filter1 >> 3 */
+ // Filter1 >> 3
filter1 = _mm_unpacklo_epi8(zero, filter1);
filter1 = _mm_srai_epi16(filter1, 11);
filter1 = _mm_packs_epi16(filter1, filter1);
- /* Filter2 >> 3 */
+ // Filter2 >> 3
filter2 = _mm_unpacklo_epi8(zero, filter2);
filter2 = _mm_srai_epi16(filter2, 11);
filter2 = _mm_packs_epi16(filter2, zero);
- /* filt >> 1 */
+ // filt >> 1
filt = _mm_adds_epi8(filter1, t1);
filt = _mm_unpacklo_epi8(zero, filt);
filt = _mm_srai_epi16(filt, 9);
@@ -1083,7 +1083,7 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
}
}
-void vp9_mbloop_filter_horizontal_edge_16_sse2(uint8_t *s, int p /* pitch */,
+void vp9_mbloop_filter_horizontal_edge_16_sse2(uint8_t *s, int p,
const uint8_t *_blimit0,
const uint8_t *_limit0,
const uint8_t *_thresh0,
@@ -1255,27 +1255,27 @@ void vp9_mbloop_filter_horizontal_edge_16_sse2(uint8_t *s, int p /* pitch */,
filt = _mm_adds_epi8(filt, work_a);
filt = _mm_adds_epi8(filt, work_a);
filt = _mm_adds_epi8(filt, work_a);
- /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+ // (vp9_filter + 3 * (qs0 - ps0)) & mask
filt = _mm_and_si128(filt, mask);
filter1 = _mm_adds_epi8(filt, t4);
filter2 = _mm_adds_epi8(filt, t3);
- /* Filter1 >> 3 */
+ // Filter1 >> 3
work_a = _mm_cmpgt_epi8(zero, filter1);
filter1 = _mm_srli_epi16(filter1, 3);
work_a = _mm_and_si128(work_a, te0);
filter1 = _mm_and_si128(filter1, t1f);
filter1 = _mm_or_si128(filter1, work_a);
- /* Filter2 >> 3 */
+ // Filter2 >> 3
work_a = _mm_cmpgt_epi8(zero, filter2);
filter2 = _mm_srli_epi16(filter2, 3);
work_a = _mm_and_si128(work_a, te0);
filter2 = _mm_and_si128(filter2, t1f);
filter2 = _mm_or_si128(filter2, work_a);
- /* filt >> 1 */
+ // filt >> 1
filt = _mm_adds_epi8(filter1, t1);
work_a = _mm_cmpgt_epi8(zero, filt);
filt = _mm_srli_epi16(filt, 1);
@@ -1427,27 +1427,27 @@ void vp9_loop_filter_horizontal_edge_16_sse2(unsigned char *s,
filt = _mm_adds_epi8(filt, work_a);
filt = _mm_adds_epi8(filt, work_a);
filt = _mm_adds_epi8(filt, work_a);
- /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+ // (vp9_filter + 3 * (qs0 - ps0)) & mask
filt = _mm_and_si128(filt, mask);
filter1 = _mm_adds_epi8(filt, t4);
filter2 = _mm_adds_epi8(filt, t3);
- /* Filter1 >> 3 */
+ // Filter1 >> 3
work_a = _mm_cmpgt_epi8(zero, filter1);
filter1 = _mm_srli_epi16(filter1, 3);
work_a = _mm_and_si128(work_a, te0);
filter1 = _mm_and_si128(filter1, t1f);
filter1 = _mm_or_si128(filter1, work_a);
- /* Filter2 >> 3 */
+ // Filter2 >> 3
work_a = _mm_cmpgt_epi8(zero, filter2);
filter2 = _mm_srli_epi16(filter2, 3);
work_a = _mm_and_si128(work_a, te0);
filter2 = _mm_and_si128(filter2, t1f);
filter2 = _mm_or_si128(filter2, work_a);
- /* filt >> 1 */
+ // filt >> 1
filt = _mm_adds_epi8(filter1, t1);
work_a = _mm_cmpgt_epi8(zero, filt);
filt = _mm_srli_epi16(filt, 1);
@@ -1474,7 +1474,7 @@ static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
__m128i x0, x1, x2, x3, x4, x5, x6, x7;
__m128i x8, x9, x10, x11, x12, x13, x14, x15;
- /* Read in 16 lines */
+ // Read in 16 lines
x0 = _mm_loadl_epi64((__m128i *)in0);
x8 = _mm_loadl_epi64((__m128i *)in1);
x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));
@@ -1512,7 +1512,7 @@ static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
x14 = _mm_unpacklo_epi32(x12, x13);
x15 = _mm_unpackhi_epi32(x12, x13);
- /* Store first 4-line result */
+ // Store first 4-line result
_mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
_mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
_mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
@@ -1528,7 +1528,7 @@ static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
x14 = _mm_unpacklo_epi32(x12, x13);
x15 = _mm_unpackhi_epi32(x12, x13);
- /* Store second 4-line result */
+ // Store second 4-line result
_mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
_mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
_mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
@@ -1598,61 +1598,129 @@ static INLINE void transpose(unsigned char *src[], int in_p,
} while (++idx8x8 < num_8x8_to_transpose);
}
-void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s,
- int p,
+void vp9_loop_filter_vertical_edge_16_sse2(uint8_t *s, int p,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8);
+ unsigned char *src[2];
+ unsigned char *dst[2];
+
+ // Transpose 8x16
+ transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+
+ // Loop filtering
+ vp9_loop_filter_horizontal_edge_16_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
+ thresh0, blimit1, limit1, thresh1);
+ src[0] = t_dst;
+ src[1] = t_dst + 8;
+ dst[0] = s - 4;
+ dst[1] = s - 4 + p * 8;
+
+ // Transpose back
+ transpose(src, 16, dst, p, 2);
+}
+
+void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s, int p,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh,
int count) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
+ DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 8);
+ unsigned char *src[1];
+ unsigned char *dst[1];
+ (void)count;
+
+ // Transpose 8x8
+ src[0] = s - 4;
+ dst[0] = t_dst;
+
+ transpose(src, p, dst, 8, 1);
+
+ // Loop filtering
+ vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 4 * 8, 8, blimit, limit,
+ thresh, 1);
+
+ src[0] = t_dst;
+ dst[0] = s - 4;
+
+ // Transpose back
+ transpose(src, 8, dst, p, 1);
+}
+
+void vp9_mbloop_filter_vertical_edge_16_sse2(uint8_t *s, int p,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8);
unsigned char *src[2];
unsigned char *dst[2];
- (void)count;
- /* Transpose 16x16 */
- transpose8x16(s - 8, s - 8 + p * 8, p, t_dst, 16);
- transpose8x16(s, s + p * 8, p, t_dst + 16 * 8, 16);
+ // Transpose 8x16
+ transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
- /* Loop filtering */
- vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit,
- thresh, 1);
- src[0] = t_dst + 3 * 16;
- src[1] = t_dst + 3 * 16 + 8;
+ // Loop filtering
+ vp9_mbloop_filter_horizontal_edge_16_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
+ thresh0, blimit1, limit1, thresh1);
+ src[0] = t_dst;
+ src[1] = t_dst + 8;
- dst[0] = s - 5;
- dst[1] = s - 5 + p * 8;
+ dst[0] = s - 4;
+ dst[1] = s - 4 + p * 8;
- /* Transpose 16x8 */
+ // Transpose back
transpose(src, 16, dst, p, 2);
}
-void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s,
- int p,
+void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s, int p,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
- unsigned char *src[4];
- unsigned char *dst[4];
-
- dst[0] = t_dst;
- dst[1] = t_dst + 8 * 16;
+ DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 16);
+ unsigned char *src[2];
+ unsigned char *dst[2];
src[0] = s - 8;
- src[1] = s - 8 + 8;
+ src[1] = s;
+ dst[0] = t_dst;
+ dst[1] = t_dst + 8 * 8;
- /* Transpose 16x16 */
- transpose(src, p, dst, 16, 2);
+ // Transpose 16x8
+ transpose(src, p, dst, 8, 2);
- /* Loop filtering */
- vp9_mb_lpf_horizontal_edge_w_sse2(t_dst + 8 * 16, 16, blimit, limit,
- thresh, 1);
+ // Loop filtering
+ mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit, thresh);
src[0] = t_dst;
- src[1] = t_dst + 8 * 16;
-
+ src[1] = t_dst + 8 * 8;
dst[0] = s - 8;
- dst[1] = s - 8 + 8;
+ dst[1] = s;
- transpose(src, 16, dst, p, 2);
+ // Transpose back
+ transpose(src, 8, dst, p, 2);
+}
+
+void vp9_mb_lpf_vertical_edge_w_16_sse2(unsigned char *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
+
+ // Transpose 16x16
+ transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
+ transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
+
+ // Loop filtering
+ mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit,
+ thresh);
+
+ // Transpose back
+ transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
+ transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
}
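Note on the vertical-edge functions above: every new vertical entry point reuses the horizontal filters by transposing the pixels around the edge into a small aligned scratch buffer, filtering there, and transposing back. A minimal scalar sketch of that pattern follows; transpose8x8_c and the filter callback are hypothetical stand-ins for the SSE2 transpose helpers and the horizontal edge filters.

    #include <stdint.h>

    static void transpose8x8_c(const uint8_t *src, int src_stride,
                               uint8_t *dst, int dst_stride) {
      int r, c;
      for (r = 0; r < 8; ++r)
        for (c = 0; c < 8; ++c)
          dst[c * dst_stride + r] = src[r * src_stride + c];
    }

    typedef void (*horizontal_filter_fn)(uint8_t *s, int pitch);

    static void filter_vertical_edge_model(uint8_t *s, int pitch,
                                           horizontal_filter_fn filter) {
      uint8_t tmp[8 * 8];
      transpose8x8_c(s - 4, pitch, tmp, 8);   /* 4 columns each side -> rows */
      filter(tmp + 4 * 8, 8);                 /* the edge is now horizontal  */
      transpose8x8_c(tmp, 8, s - 4, pitch);   /* write the columns back      */
    }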
diff --git a/vp9/decoder/vp9_dboolhuff.c b/vp9/decoder/vp9_dboolhuff.c
index 06acec4db..4f16e95b0 100644
--- a/vp9/decoder/vp9_dboolhuff.c
+++ b/vp9/decoder/vp9_dboolhuff.c
@@ -18,32 +18,28 @@
// Even relatively modest values like 100 would work fine.
#define LOTS_OF_BITS 0x40000000
-
int vp9_reader_init(vp9_reader *r, const uint8_t *buffer, size_t size) {
- int marker_bit;
-
- r->buffer_end = buffer + size;
- r->buffer = buffer;
- r->value = 0;
- r->count = -8;
- r->range = 255;
-
- if (size && !buffer)
+ if (size && !buffer) {
return 1;
-
- vp9_reader_fill(r);
- marker_bit = vp9_read_bit(r);
- return marker_bit != 0;
+ } else {
+ r->buffer_end = buffer + size;
+ r->buffer = buffer;
+ r->value = 0;
+ r->count = -8;
+ r->range = 255;
+ vp9_reader_fill(r);
+ return vp9_read_bit(r) != 0; // marker bit
+ }
}
void vp9_reader_fill(vp9_reader *r) {
const uint8_t *const buffer_end = r->buffer_end;
const uint8_t *buffer = r->buffer;
- VP9_BD_VALUE value = r->value;
+ BD_VALUE value = r->value;
int count = r->count;
- int shift = BD_VALUE_SIZE - 8 - (count + 8);
+ int shift = BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT);
int loop_end = 0;
- const int bits_left = (int)((buffer_end - buffer)*CHAR_BIT);
+ const int bits_left = (int)((buffer_end - buffer) * CHAR_BIT);
const int x = shift + CHAR_BIT - bits_left;
if (x >= 0) {
@@ -54,7 +50,7 @@ void vp9_reader_fill(vp9_reader *r) {
if (x < 0 || bits_left) {
while (shift >= loop_end) {
count += CHAR_BIT;
- value |= (VP9_BD_VALUE)*buffer++ << shift;
+ value |= (BD_VALUE)*buffer++ << shift;
shift -= CHAR_BIT;
}
}
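Note on vp9_reader_fill above: it tops up the boolean decoder's value register one byte at a time, packing each new byte just below the bits that are still unread and crediting count with CHAR_BIT per byte. A simplified standalone model is shown below; it assumes a 64-bit BD_VALUE and omits the end-of-buffer handling the real function covers with LOTS_OF_BITS.

    #include <limits.h>
    #include <stdint.h>

    typedef uint64_t bd_value_t;                         /* stand-in for BD_VALUE */
    #define BD_BITS ((int)sizeof(bd_value_t) * CHAR_BIT)

    /* Pack whole bytes, MSB-first, beneath the 'count' bits already buffered. */
    static void reader_fill_model(bd_value_t *value, int *count,
                                  const uint8_t **buf, const uint8_t *buf_end) {
      int shift = BD_BITS - CHAR_BIT - (*count + CHAR_BIT);
      while (shift >= 0 && *buf < buf_end) {
        *count += CHAR_BIT;
        *value |= (bd_value_t)*(*buf)++ << shift;
        shift -= CHAR_BIT;
      }
    }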
diff --git a/vp9/decoder/vp9_dboolhuff.h b/vp9/decoder/vp9_dboolhuff.h
index fd8e74ca4..8339c2701 100644
--- a/vp9/decoder/vp9_dboolhuff.h
+++ b/vp9/decoder/vp9_dboolhuff.h
@@ -18,46 +18,50 @@
#include "vpx_ports/mem.h"
#include "vpx/vpx_integer.h"
-typedef size_t VP9_BD_VALUE;
+#include "vp9/common/vp9_treecoder.h"
-#define BD_VALUE_SIZE ((int)sizeof(VP9_BD_VALUE)*CHAR_BIT)
+typedef size_t BD_VALUE;
+
+#define BD_VALUE_SIZE ((int)sizeof(BD_VALUE) * CHAR_BIT)
+
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]);
typedef struct {
const uint8_t *buffer_end;
const uint8_t *buffer;
- VP9_BD_VALUE value;
+ BD_VALUE value;
int count;
unsigned int range;
} vp9_reader;
-DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]);
-
int vp9_reader_init(vp9_reader *r, const uint8_t *buffer, size_t size);
void vp9_reader_fill(vp9_reader *r);
+int vp9_reader_has_error(vp9_reader *r);
+
const uint8_t *vp9_reader_find_end(vp9_reader *r);
-static int vp9_read(vp9_reader *br, int probability) {
+static int vp9_read(vp9_reader *r, int prob) {
unsigned int bit = 0;
- VP9_BD_VALUE value;
- VP9_BD_VALUE bigsplit;
+ BD_VALUE value;
+ BD_VALUE bigsplit;
int count;
unsigned int range;
- unsigned int split = ((br->range * probability) + (256 - probability)) >> 8;
+ unsigned int split = (r->range * prob + (256 - prob)) >> CHAR_BIT;
- if (br->count < 0)
- vp9_reader_fill(br);
+ if (r->count < 0)
+ vp9_reader_fill(r);
- value = br->value;
- count = br->count;
+ value = r->value;
+ count = r->count;
- bigsplit = (VP9_BD_VALUE)split << (BD_VALUE_SIZE - 8);
+ bigsplit = (BD_VALUE)split << (BD_VALUE_SIZE - CHAR_BIT);
range = split;
if (value >= bigsplit) {
- range = br->range - split;
+ range = r->range - split;
value = value - bigsplit;
bit = 1;
}
@@ -68,9 +72,9 @@ static int vp9_read(vp9_reader *br, int probability) {
value <<= shift;
count -= shift;
}
- br->value = value;
- br->count = count;
- br->range = range;
+ r->value = value;
+ r->count = count;
+ r->range = range;
return bit;
}
@@ -79,15 +83,23 @@ static int vp9_read_bit(vp9_reader *r) {
return vp9_read(r, 128); // vp9_prob_half
}
-static int vp9_read_literal(vp9_reader *br, int bits) {
- int z = 0, bit;
+static int vp9_read_literal(vp9_reader *r, int bits) {
+ int literal = 0, bit;
for (bit = bits - 1; bit >= 0; bit--)
- z |= vp9_read_bit(br) << bit;
+ literal |= vp9_read_bit(r) << bit;
- return z;
+ return literal;
}
-int vp9_reader_has_error(vp9_reader *r);
+static int vp9_read_tree(vp9_reader *r, const vp9_tree_index *tree,
+ const vp9_prob *probs) {
+ vp9_tree_index i = 0;
+
+ while ((i = tree[i + vp9_read(r, probs[i >> 1])]) > 0)
+ continue;
+
+ return -i;
+}
#endif // VP9_DECODER_VP9_DBOOLHUFF_H_
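Note on the new vp9_read_tree above: it walks a vp9_tree_index array in which each internal node occupies two consecutive entries (positive values point at a child pair, values <= 0 are leaves storing the negated symbol), and probs[] holds one probability per internal node, indexed by node >> 1. The standalone model below shows the same walk against a hypothetical 3-symbol tree; the function-pointer bit reader is only there to keep the sketch self-contained.

    #include <stdint.h>

    typedef int8_t tree_index_t;                  /* stand-in for vp9_tree_index */
    typedef int (*read_with_prob_fn)(void *ctx, int prob);

    /* Example tree: bit 0 -> symbol 0; otherwise a second bit picks 1 or 2. */
    static const tree_index_t example_tree[4] = { 0, 2, -1, -2 };

    static int read_tree_model(read_with_prob_fn read_bool, void *ctx,
                               const tree_index_t *tree, const uint8_t *probs) {
      tree_index_t i = 0;
      while ((i = tree[i + read_bool(ctx, probs[i >> 1])]) > 0)
        continue;
      return -i;                                  /* leaves store -symbol */
    }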
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 916cb424e..9b6740eea 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -11,6 +11,8 @@
#include <assert.h>
#include "./vp9_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+
#include "vpx_mem/vpx_mem.h"
#include "vpx_scale/vpx_scale.h"
@@ -34,14 +36,11 @@
#include "vp9/decoder/vp9_onyxd_int.h"
#include "vp9/decoder/vp9_read_bit_buffer.h"
#include "vp9/decoder/vp9_thread.h"
-#include "vp9/decoder/vp9_treereader.h"
typedef struct TileWorkerData {
VP9_COMMON *cm;
vp9_reader bit_reader;
DECLARE_ALIGNED(16, MACROBLOCKD, xd);
- DECLARE_ALIGNED(16, unsigned char, token_cache[1024]);
- DECLARE_ALIGNED(16, int16_t, qcoeff[MAX_MB_PLANE][64 * 64]);
DECLARE_ALIGNED(16, int16_t, dqcoeff[MAX_MB_PLANE][64 * 64]);
DECLARE_ALIGNED(16, uint16_t, eobs[MAX_MB_PLANE][256]);
} TileWorkerData;
@@ -50,7 +49,7 @@ static int read_be32(const uint8_t *p) {
return (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
}
-static int is_compound_prediction_allowed(const VP9_COMMON *cm) {
+static int is_compound_reference_allowed(const VP9_COMMON *cm) {
int i;
for (i = 1; i < ALLOWED_REFS_PER_FRAME; ++i)
if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1])
@@ -59,7 +58,7 @@ static int is_compound_prediction_allowed(const VP9_COMMON *cm) {
return 0;
}
-static void setup_compound_prediction(VP9_COMMON *cm) {
+static void setup_compound_reference(VP9_COMMON *cm) {
if (cm->ref_frame_sign_bias[LAST_FRAME] ==
cm->ref_frame_sign_bias[GOLDEN_FRAME]) {
cm->comp_fixed_ref = ALTREF_FRAME;
@@ -94,7 +93,7 @@ static TX_MODE read_tx_mode(vp9_reader *r) {
return tx_mode;
}
-static void read_tx_probs(struct tx_probs *tx_probs, vp9_reader *r) {
+static void read_tx_mode_probs(struct tx_probs *tx_probs, vp9_reader *r) {
int i, j;
for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
@@ -124,33 +123,31 @@ static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
vp9_diff_update_prob(r, &fc->inter_mode_probs[i][j]);
}
-static INLINE COMPPREDMODE_TYPE read_comp_pred_mode(vp9_reader *r) {
- COMPPREDMODE_TYPE mode = vp9_read_bit(r);
- if (mode)
- mode += vp9_read_bit(r);
- return mode;
+static REFERENCE_MODE read_reference_mode(VP9_COMMON *cm, vp9_reader *r) {
+ if (is_compound_reference_allowed(cm)) {
+ REFERENCE_MODE mode = vp9_read_bit(r);
+ if (mode)
+ mode += vp9_read_bit(r);
+ setup_compound_reference(cm);
+ return mode;
+ } else {
+ return SINGLE_REFERENCE;
+ }
}
-static void read_comp_pred(VP9_COMMON *cm, vp9_reader *r) {
+static void read_reference_mode_probs(VP9_COMMON *cm, vp9_reader *r) {
int i;
-
- const int compound_allowed = is_compound_prediction_allowed(cm);
- cm->comp_pred_mode = compound_allowed ? read_comp_pred_mode(r)
- : SINGLE_PREDICTION_ONLY;
- if (compound_allowed)
- setup_compound_prediction(cm);
-
- if (cm->comp_pred_mode == HYBRID_PREDICTION)
+ if (cm->comp_pred_mode == REFERENCE_MODE_SELECT)
for (i = 0; i < COMP_INTER_CONTEXTS; i++)
vp9_diff_update_prob(r, &cm->fc.comp_inter_prob[i]);
- if (cm->comp_pred_mode != COMP_PREDICTION_ONLY)
+ if (cm->comp_pred_mode != COMPOUND_REFERENCE)
for (i = 0; i < REF_CONTEXTS; i++) {
vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][0]);
vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][1]);
}
- if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
+ if (cm->comp_pred_mode != SINGLE_REFERENCE)
for (i = 0; i < REF_CONTEXTS; i++)
vp9_diff_update_prob(r, &cm->fc.comp_ref_prob[i]);
}
@@ -241,8 +238,7 @@ static void alloc_tile_storage(VP9D_COMP *pbi, int tile_rows, int tile_cols) {
}
static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
- TX_SIZE tx_size, uint8_t *dst, int stride,
- uint8_t *token_cache) {
+ TX_SIZE tx_size, uint8_t *dst, int stride) {
struct macroblockd_plane *const pd = &xd->plane[plane];
const int eob = pd->eobs[block];
if (eob > 0) {
@@ -275,20 +271,13 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
if (eob == 1) {
vpx_memset(dqcoeff, 0, 2 * sizeof(dqcoeff[0]));
- vpx_memset(token_cache, 0, 2 * sizeof(token_cache[0]));
} else {
- if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10) {
+ if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
vpx_memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
- vpx_memset(token_cache, 0,
- 4 * (4 << tx_size) * sizeof(token_cache[0]));
- } else if (tx_size == TX_32X32 && eob <= 34) {
+ else if (tx_size == TX_32X32 && eob <= 34)
vpx_memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
- vpx_memset(token_cache, 0, 256 * sizeof(token_cache[0]));
- } else {
+ else
vpx_memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
- vpx_memset(token_cache, 0,
- (16 << (tx_size << 1)) * sizeof(token_cache[0]));
- }
}
}
}
@@ -297,7 +286,6 @@ struct intra_args {
VP9_COMMON *cm;
MACROBLOCKD *xd;
vp9_reader *r;
- uint8_t *token_cache;
};
static void predict_and_reconstruct_intra_block(int plane, int block,
@@ -326,9 +314,8 @@ static void predict_and_reconstruct_intra_block(int plane, int block,
if (!mi->mbmi.skip_coeff) {
vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, x, y, tx_size,
- args->r, args->token_cache);
- inverse_transform_block(xd, plane, block, tx_size, dst, pd->dst.stride,
- args->token_cache);
+ args->r);
+ inverse_transform_block(xd, plane, block, tx_size, dst, pd->dst.stride);
}
}
@@ -337,7 +324,6 @@ struct inter_args {
MACROBLOCKD *xd;
vp9_reader *r;
int *eobtotal;
- uint8_t *token_cache;
};
static void reconstruct_inter_block(int plane, int block,
@@ -351,10 +337,10 @@ static void reconstruct_inter_block(int plane, int block,
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
*args->eobtotal += vp9_decode_block_tokens(cm, xd, plane, block,
plane_bsize, x, y, tx_size,
- args->r, args->token_cache);
+ args->r);
inverse_transform_block(xd, plane, block, tx_size,
&pd->dst.buf[4 * y * pd->dst.stride + 4 * x],
- pd->dst.stride, args->token_cache);
+ pd->dst.stride);
}
static void set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
@@ -404,8 +390,7 @@ static void set_ref(VP9_COMMON *const cm, MACROBLOCKD *const xd,
static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const TileInfo *const tile,
int mi_row, int mi_col,
- vp9_reader *r, BLOCK_SIZE bsize,
- uint8_t *token_cache) {
+ vp9_reader *r, BLOCK_SIZE bsize) {
const int less8x8 = bsize < BLOCK_8X8;
MB_MODE_INFO *mbmi;
@@ -427,9 +412,7 @@ static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd,
}
if (!is_inter_block(mbmi)) {
- struct intra_args arg = {
- cm, xd, r, token_cache
- };
+ struct intra_args arg = { cm, xd, r };
foreach_transformed_block(xd, bsize, predict_and_reconstruct_intra_block,
&arg);
} else {
@@ -442,14 +425,12 @@ static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd,
vp9_get_filter_kernel(mbmi->interp_filter);
// Prediction
- vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+ vp9_dec_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
// Reconstruction
if (!mbmi->skip_coeff) {
int eobtotal = 0;
- struct inter_args arg = {
- cm, xd, r, &eobtotal, token_cache
- };
+ struct inter_args arg = { cm, xd, r, &eobtotal };
foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg);
if (!less8x8 && eobtotal == 0)
mbmi->skip_coeff = 1; // skip loopfilter
@@ -471,7 +452,7 @@ static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd, int hbs,
PARTITION_TYPE p;
if (has_rows && has_cols)
- p = treed_read(r, vp9_partition_tree, probs);
+ p = vp9_read_tree(r, vp9_partition_tree, probs);
else if (!has_rows && has_cols)
p = vp9_read(r, probs[1]) ? PARTITION_SPLIT : PARTITION_HORZ;
else if (has_rows && !has_cols)
@@ -488,8 +469,7 @@ static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd, int hbs,
static void decode_modes_sb(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const TileInfo *const tile,
int mi_row, int mi_col,
- vp9_reader* r, BLOCK_SIZE bsize,
- uint8_t *token_cache) {
+ vp9_reader* r, BLOCK_SIZE bsize) {
const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
PARTITION_TYPE partition;
BLOCK_SIZE subsize;
@@ -500,33 +480,27 @@ static void decode_modes_sb(VP9_COMMON *const cm, MACROBLOCKD *const xd,
partition = read_partition(cm, xd, hbs, mi_row, mi_col, bsize, r);
subsize = get_subsize(bsize, partition);
if (subsize < BLOCK_8X8) {
- decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
+ decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize);
} else {
switch (partition) {
case PARTITION_NONE:
- decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
+ decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize);
break;
case PARTITION_HORZ:
- decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
+ decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize);
if (mi_row + hbs < cm->mi_rows)
- decode_modes_b(cm, xd, tile, mi_row + hbs, mi_col, r, subsize,
- token_cache);
+ decode_modes_b(cm, xd, tile, mi_row + hbs, mi_col, r, subsize);
break;
case PARTITION_VERT:
- decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
+ decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize);
if (mi_col + hbs < cm->mi_cols)
- decode_modes_b(cm, xd, tile, mi_row, mi_col + hbs, r, subsize,
- token_cache);
+ decode_modes_b(cm, xd, tile, mi_row, mi_col + hbs, r, subsize);
break;
case PARTITION_SPLIT:
- decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, subsize,
- token_cache);
- decode_modes_sb(cm, xd, tile, mi_row, mi_col + hbs, r, subsize,
- token_cache);
- decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col, r, subsize,
- token_cache);
- decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize,
- token_cache);
+ decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, subsize);
+ decode_modes_sb(cm, xd, tile, mi_row, mi_col + hbs, r, subsize);
+ decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col, r, subsize);
+ decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize);
break;
default:
assert(!"Invalid partition type");
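Note on decode_modes_sb above: it is the superblock quadtree walk; a partition type is read per node, PARTITION_SPLIT recurses into four half-size quadrants, and the second HORZ/VERT half is skipped when it falls outside the frame. The sketch below shows only that control flow with hypothetical read_part/decode_block stubs, and it deliberately ignores the rectangular subsizes that PARTITION_HORZ/VERT really produce.

    #include <stdio.h>

    typedef enum { PART_NONE, PART_HORZ, PART_VERT, PART_SPLIT } part_type_t;

    /* Hypothetical stand-ins for read_partition() / decode_modes_b(). */
    static part_type_t read_part_stub(int bsize_mi) {
      return bsize_mi > 2 ? PART_SPLIT : PART_NONE;   /* split down to 16x16 */
    }
    static void decode_block_stub(int mi_row, int mi_col, int bsize_mi) {
      printf("block %2dx%-2d at mi (%d,%d)\n",
             bsize_mi * 8, bsize_mi * 8, mi_row, mi_col);
    }

    static void decode_sb_model(int mi_row, int mi_col, int bsize_mi,
                                int mi_rows, int mi_cols) {
      const int hbs = bsize_mi / 2;
      part_type_t p;
      if (mi_row >= mi_rows || mi_col >= mi_cols)
        return;                                       /* block lies outside the frame */
      p = read_part_stub(bsize_mi);
      switch (p) {
        case PART_NONE:
          decode_block_stub(mi_row, mi_col, bsize_mi);
          break;
        case PART_HORZ:
          decode_block_stub(mi_row, mi_col, bsize_mi);
          if (mi_row + hbs < mi_rows)
            decode_block_stub(mi_row + hbs, mi_col, bsize_mi);
          break;
        case PART_VERT:
          decode_block_stub(mi_row, mi_col, bsize_mi);
          if (mi_col + hbs < mi_cols)
            decode_block_stub(mi_row, mi_col + hbs, bsize_mi);
          break;
        case PART_SPLIT:
          decode_sb_model(mi_row, mi_col, hbs, mi_rows, mi_cols);
          decode_sb_model(mi_row, mi_col + hbs, hbs, mi_rows, mi_cols);
          decode_sb_model(mi_row + hbs, mi_col, hbs, mi_rows, mi_cols);
          decode_sb_model(mi_row + hbs, mi_col + hbs, hbs, mi_rows, mi_cols);
          break;
      }
    }

    int main(void) {
      decode_sb_model(0, 0, 8, 6, 6);   /* a 64x64 SB partly outside a small frame */
      return 0;
    }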
@@ -809,8 +783,7 @@ static void decode_tile(VP9D_COMP *pbi, const TileInfo *const tile,
vp9_zero(xd->left_seg_context);
for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
mi_col += MI_BLOCK_SIZE) {
- decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64,
- pbi->token_cache);
+ decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64);
}
if (pbi->do_loopfilter_inline) {
@@ -951,11 +924,9 @@ static void setup_tile_macroblockd(TileWorkerData *const tile_data) {
int i;
for (i = 0; i < MAX_MB_PLANE; ++i) {
- pd[i].qcoeff = tile_data->qcoeff[i];
pd[i].dqcoeff = tile_data->dqcoeff[i];
pd[i].eobs = tile_data->eobs[i];
vpx_memset(xd->plane[i].dqcoeff, 0, 64 * 64 * sizeof(int16_t));
- vpx_memset(tile_data->token_cache, 0, sizeof(tile_data->token_cache));
}
}
@@ -971,8 +942,7 @@ static int tile_worker_hook(void *arg1, void *arg2) {
for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
mi_col += MI_BLOCK_SIZE) {
decode_modes_sb(tile_data->cm, &tile_data->xd, tile,
- mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64,
- tile_data->token_cache);
+ mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64);
}
}
return !tile_data->xd.corrupted;
@@ -1164,8 +1134,12 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi,
cm->allow_high_precision_mv = vp9_rb_read_bit(rb);
cm->mcomp_filter_type = read_interp_filter_type(rb);
- for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i)
+ for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
vp9_setup_scale_factors(cm, i);
+ if (vp9_is_scaled(&cm->active_ref_scale_comm[i]))
+ vp9_extend_frame_borders(&cm->yv12_fb[cm->active_ref_idx[i]],
+ cm->subsampling_x, cm->subsampling_y);
+ }
}
}
@@ -1212,7 +1186,7 @@ static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data,
cm->tx_mode = xd->lossless ? ONLY_4X4 : read_tx_mode(&r);
if (cm->tx_mode == TX_MODE_SELECT)
- read_tx_probs(&fc->tx_probs, &r);
+ read_tx_mode_probs(&fc->tx_probs, &r);
read_coef_probs(fc, cm->tx_mode, &r);
for (k = 0; k < MBSKIP_CONTEXTS; ++k)
@@ -1230,7 +1204,8 @@ static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data,
for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
vp9_diff_update_prob(&r, &fc->intra_inter_prob[i]);
- read_comp_pred(cm, &r);
+ cm->comp_pred_mode = read_reference_mode(cm, &r);
+ read_reference_mode_probs(cm, &r);
for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
for (i = 0; i < INTRA_MODES - 1; ++i)
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 75f0ae865..327a9166c 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -20,13 +20,13 @@
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_seg_common.h"
+#include "vp9/decoder/vp9_dboolhuff.h"
#include "vp9/decoder/vp9_decodemv.h"
#include "vp9/decoder/vp9_decodeframe.h"
#include "vp9/decoder/vp9_onyxd_int.h"
-#include "vp9/decoder/vp9_treereader.h"
static MB_PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) {
- return (MB_PREDICTION_MODE)treed_read(r, vp9_intra_mode_tree, p);
+ return (MB_PREDICTION_MODE)vp9_read_tree(r, vp9_intra_mode_tree, p);
}
static MB_PREDICTION_MODE read_intra_mode_y(VP9_COMMON *cm, vp9_reader *r,
@@ -49,8 +49,8 @@ static MB_PREDICTION_MODE read_intra_mode_uv(VP9_COMMON *cm, vp9_reader *r,
static MB_PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, vp9_reader *r,
int ctx) {
- const int mode = treed_read(r, vp9_inter_mode_tree,
- cm->fc.inter_mode_probs[ctx]);
+ const int mode = vp9_read_tree(r, vp9_inter_mode_tree,
+ cm->fc.inter_mode_probs[ctx]);
if (!cm->frame_parallel_decoding_mode)
++cm->counts.inter_mode[ctx][mode];
@@ -58,7 +58,7 @@ static MB_PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, vp9_reader *r,
}
static int read_segment_id(vp9_reader *r, const struct segmentation *seg) {
- return treed_read(r, vp9_segment_tree, seg->tree_probs);
+ return vp9_read_tree(r, vp9_segment_tree, seg->tree_probs);
}
static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
@@ -210,12 +210,12 @@ static int read_mv_component(vp9_reader *r,
const nmv_component *mvcomp, int usehp) {
int mag, d, fr, hp;
const int sign = vp9_read(r, mvcomp->sign);
- const int mv_class = treed_read(r, vp9_mv_class_tree, mvcomp->classes);
+ const int mv_class = vp9_read_tree(r, vp9_mv_class_tree, mvcomp->classes);
const int class0 = mv_class == MV_CLASS_0;
// Integer part
if (class0) {
- d = treed_read(r, vp9_mv_class0_tree, mvcomp->class0);
+ d = vp9_read_tree(r, vp9_mv_class0_tree, mvcomp->class0);
} else {
int i;
const int n = mv_class + CLASS0_BITS - 1; // number of bits
@@ -226,8 +226,8 @@ static int read_mv_component(vp9_reader *r,
}
// Fractional part
- fr = treed_read(r, vp9_mv_fp_tree,
- class0 ? mvcomp->class0_fp[d] : mvcomp->fp);
+ fr = vp9_read_tree(r, vp9_mv_fp_tree, class0 ? mvcomp->class0_fp[d]
+ : mvcomp->fp);
// High precision part (if hp is not used, it defaults to 1)
@@ -242,7 +242,7 @@ static int read_mv_component(vp9_reader *r,
static INLINE void read_mv(vp9_reader *r, MV *mv, const MV *ref,
const nmv_context *ctx,
nmv_context_counts *counts, int allow_hp) {
- const MV_JOINT_TYPE j = treed_read(r, vp9_mv_joint_tree, ctx->joints);
+ const MV_JOINT_TYPE j = vp9_read_tree(r, vp9_mv_joint_tree, ctx->joints);
const int use_hp = allow_hp && vp9_use_mv_hp(ref);
MV diff = {0, 0};
@@ -258,14 +258,14 @@ static INLINE void read_mv(vp9_reader *r, MV *mv, const MV *ref,
mv->col = ref->col + diff.col;
}
-static COMPPREDMODE_TYPE read_reference_mode(VP9_COMMON *cm,
+static REFERENCE_MODE read_reference_mode(VP9_COMMON *cm,
const MACROBLOCKD *xd,
vp9_reader *r) {
const int ctx = vp9_get_pred_context_comp_inter_inter(cm, xd);
const int mode = vp9_read(r, cm->fc.comp_inter_prob[ctx]);
if (!cm->frame_parallel_decoding_mode)
++cm->counts.comp_inter[ctx][mode];
- return mode; // SINGLE_PREDICTION_ONLY or COMP_PREDICTION_ONLY
+ return mode; // SINGLE_REFERENCE or COMPOUND_REFERENCE
}
// Read the reference frame
@@ -279,12 +279,12 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
ref_frame[0] = vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
ref_frame[1] = NONE;
} else {
- const COMPPREDMODE_TYPE mode = (cm->comp_pred_mode == HYBRID_PREDICTION)
+ const REFERENCE_MODE mode = (cm->comp_pred_mode == REFERENCE_MODE_SELECT)
? read_reference_mode(cm, xd, r)
: cm->comp_pred_mode;
// FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding
- if (mode == COMP_PREDICTION_ONLY) {
+ if (mode == COMPOUND_REFERENCE) {
const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
const int ctx = vp9_get_pred_context_comp_ref_p(cm, xd);
const int bit = vp9_read(r, fc->comp_ref_prob[ctx]);
@@ -292,7 +292,7 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
++counts->comp_ref[ctx][bit];
ref_frame[idx] = cm->comp_fixed_ref;
ref_frame[!idx] = cm->comp_var_ref[bit];
- } else if (mode == SINGLE_PREDICTION_ONLY) {
+ } else if (mode == SINGLE_REFERENCE) {
const int ctx0 = vp9_get_pred_context_single_ref_p1(xd);
const int bit0 = vp9_read(r, fc->single_ref_prob[ctx0][0]);
if (!cm->frame_parallel_decoding_mode)
@@ -318,8 +318,8 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
static INLINE INTERPOLATION_TYPE read_switchable_filter_type(
VP9_COMMON *const cm, MACROBLOCKD *const xd, vp9_reader *r) {
const int ctx = vp9_get_pred_context_switchable_interp(xd);
- const int type = treed_read(r, vp9_switchable_interp_tree,
- cm->fc.switchable_interp_prob[ctx]);
+ const int type = vp9_read_tree(r, vp9_switchable_interp_tree,
+ cm->fc.switchable_interp_prob[ctx]);
if (!cm->frame_parallel_decoding_mode)
++cm->counts.switchable_interp[ctx][type];
return type;
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 214c1c198..bdbe67dbc 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -18,7 +18,6 @@
#include "vp9/decoder/vp9_dboolhuff.h"
#include "vp9/decoder/vp9_detokenize.h"
#include "vp9/decoder/vp9_onyxd_int.h"
-#include "vp9/decoder/vp9_treereader.h"
#define EOB_CONTEXT_NODE 0
#define ZERO_CONTEXT_NODE 1
@@ -61,16 +60,10 @@ static const vp9_prob cat6_prob[15] = {
254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0
};
-static const int token_to_counttoken[MAX_ENTROPY_TOKENS] = {
- ZERO_TOKEN, ONE_TOKEN, TWO_TOKEN, TWO_TOKEN,
- TWO_TOKEN, TWO_TOKEN, TWO_TOKEN, TWO_TOKEN,
- TWO_TOKEN, TWO_TOKEN, TWO_TOKEN, DCT_EOB_MODEL_TOKEN
-};
-
#define INCREMENT_COUNT(token) \
do { \
if (!cm->frame_parallel_decoding_mode) \
- ++coef_counts[band][pt][token_to_counttoken[token]]; \
+ ++coef_counts[band][pt][token]; \
} while (0)
@@ -78,7 +71,6 @@ static const int token_to_counttoken[MAX_ENTROPY_TOKENS] = {
{ \
v = (val * dqv) >> dq_shift; \
dqcoeff_ptr[scan[c]] = (vp9_read_bit(r) ? -v : v); \
- INCREMENT_COUNT(token); \
token_cache[scan[c]] = vp9_pt_energy_class[token]; \
++c; \
pt = get_coef_context(nb, token_cache, c); \
@@ -94,9 +86,8 @@ static const int token_to_counttoken[MAX_ENTROPY_TOKENS] = {
static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
vp9_reader *r, int block_idx,
- PLANE_TYPE type, int seg_eob, int16_t *dqcoeff_ptr,
- TX_SIZE tx_size, const int16_t *dq, int pt,
- uint8_t *token_cache) {
+ PLANE_TYPE type, int max_eob, int16_t *dqcoeff_ptr,
+ TX_SIZE tx_size, const int16_t *dq, int pt) {
const FRAME_CONTEXT *const fc = &cm->fc;
FRAME_COUNTS *const counts = &cm->counts;
const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi);
@@ -108,6 +99,7 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
counts->coef[tx_size][type][ref];
unsigned int (*eob_branch_count)[PREV_COEF_CONTEXTS] =
counts->eob_branch[tx_size][type][ref];
+ uint8_t token_cache[32 * 32];
const uint8_t *cat6;
const uint8_t *band_translate = get_band_translate(tx_size);
const int dq_shift = (tx_size == TX_32X32);
@@ -117,38 +109,39 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
int v;
int16_t dqv = dq[0];
-
-
- while (c < seg_eob) {
+ while (c < max_eob) {
int val;
band = *band_translate++;
prob = coef_probs[band][pt];
if (!cm->frame_parallel_decoding_mode)
++eob_branch_count[band][pt];
- if (!vp9_read(r, prob[EOB_CONTEXT_NODE]))
+ if (!vp9_read(r, prob[EOB_CONTEXT_NODE])) {
+ INCREMENT_COUNT(DCT_EOB_MODEL_TOKEN);
break;
+ }
- DECODE_ZERO:
- if (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) {
+ while (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) {
INCREMENT_COUNT(ZERO_TOKEN);
dqv = dq[1];
+ token_cache[scan[c]] = 0;
++c;
- if (c >= seg_eob)
- break;
+ if (c >= max_eob)
+ return c; // zero tokens at the end (no eob token)
pt = get_coef_context(nb, token_cache, c);
band = *band_translate++;
prob = coef_probs[band][pt];
- goto DECODE_ZERO;
}
// ONE_CONTEXT_NODE_0_
if (!vp9_read(r, prob[ONE_CONTEXT_NODE])) {
+ INCREMENT_COUNT(ONE_TOKEN);
WRITE_COEF_CONTINUE(1, ONE_TOKEN);
}
- prob = vp9_pareto8_full[coef_probs[band][pt][PIVOT_NODE]-1];
+ INCREMENT_COUNT(TWO_TOKEN);
+
+ prob = vp9_pareto8_full[prob[PIVOT_NODE] - 1];
- // LOW_VAL_CONTEXT_NODE_0_
if (!vp9_read(r, prob[LOW_VAL_CONTEXT_NODE])) {
if (!vp9_read(r, prob[TWO_CONTEXT_NODE])) {
WRITE_COEF_CONTINUE(2, TWO_TOKEN);
@@ -158,7 +151,7 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
}
WRITE_COEF_CONTINUE(4, FOUR_TOKEN);
}
- // HIGH_LOW_CONTEXT_NODE_0_
+
if (!vp9_read(r, prob[HIGH_LOW_CONTEXT_NODE])) {
if (!vp9_read(r, prob[CAT_ONE_CONTEXT_NODE])) {
val = CAT1_MIN_VAL;
@@ -170,7 +163,7 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
ADJUST_COEF(CAT2_PROB0, 0);
WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY2);
}
- // CAT_THREEFOUR_CONTEXT_NODE_0_
+
if (!vp9_read(r, prob[CAT_THREEFOUR_CONTEXT_NODE])) {
if (!vp9_read(r, prob[CAT_THREE_CONTEXT_NODE])) {
val = CAT3_MIN_VAL;
@@ -186,7 +179,7 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
ADJUST_COEF(CAT4_PROB0, 0);
WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY4);
}
- // CAT_FIVE_CONTEXT_NODE_0_:
+
if (!vp9_read(r, prob[CAT_FIVE_CONTEXT_NODE])) {
val = CAT5_MIN_VAL;
ADJUST_COEF(CAT5_PROB4, 4);
@@ -205,18 +198,12 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY6);
}
- if (c < seg_eob) {
- if (!cm->frame_parallel_decoding_mode)
- ++coef_counts[band][pt][DCT_EOB_MODEL_TOKEN];
- }
-
return c;
}
int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
int plane, int block, BLOCK_SIZE plane_bsize,
- int x, int y, TX_SIZE tx_size, vp9_reader *r,
- uint8_t *token_cache) {
+ int x, int y, TX_SIZE tx_size, vp9_reader *r) {
struct macroblockd_plane *const pd = &xd->plane[plane];
const int seg_eob = get_tx_eob(&cm->seg, xd->mi_8x8[0]->mbmi.segment_id,
tx_size);
@@ -224,7 +211,7 @@ int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
pd->left_context + y);
const int eob = decode_coefs(cm, xd, r, block, pd->plane_type, seg_eob,
BLOCK_OFFSET(pd->dqcoeff, block), tx_size,
- pd->dequant, pt, token_cache);
+ pd->dequant, pt);
set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, x, y);
pd->eobs[block] = eob;
return eob;
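Note on the detokenizer changes above: token_cache becomes a local of decode_coefs; each decoded token writes its energy class into the cache at its scan position, and get_coef_context derives the context for the next position from the two previously-coded neighbours listed in the scan's neighbour table. A small model of that context computation, consistent with the common definition and shown here only as an illustration:

    #include <stdint.h>

    /* nb[] lists two already-coded neighbour positions per scan index. */
    static int coef_context_model(const int16_t *nb,
                                  const uint8_t *token_cache, int c) {
      return (1 + token_cache[nb[2 * c + 0]] + token_cache[nb[2 * c + 1]]) >> 1;
    }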
diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h
index e858a19f7..2a8807379 100644
--- a/vp9/decoder/vp9_detokenize.h
+++ b/vp9/decoder/vp9_detokenize.h
@@ -17,7 +17,6 @@
int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
int plane, int block, BLOCK_SIZE plane_bsize,
- int x, int y, TX_SIZE tx_size, vp9_reader *r,
- uint8_t *token_cache);
+ int x, int y, TX_SIZE tx_size, vp9_reader *r);
#endif // VP9_DECODER_VP9_DETOKENIZE_H_
diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c
index 7c0f91d88..740ad72cb 100644
--- a/vp9/decoder/vp9_onyxd_if.c
+++ b/vp9/decoder/vp9_onyxd_if.c
@@ -113,7 +113,6 @@ static void init_macroblockd(VP9D_COMP *const pbi) {
int i;
for (i = 0; i < MAX_MB_PLANE; ++i) {
- pd[i].qcoeff = pbi->qcoeff[i];
pd[i].dqcoeff = pbi->dqcoeff[i];
pd[i].eobs = pbi->eobs[i];
}
diff --git a/vp9/decoder/vp9_onyxd_int.h b/vp9/decoder/vp9_onyxd_int.h
index d3d29e98d..038cd96a5 100644
--- a/vp9/decoder/vp9_onyxd_int.h
+++ b/vp9/decoder/vp9_onyxd_int.h
@@ -22,7 +22,6 @@ typedef struct VP9Decompressor {
DECLARE_ALIGNED(16, VP9_COMMON, common);
- DECLARE_ALIGNED(16, int16_t, qcoeff[MAX_MB_PLANE][64 * 64]);
DECLARE_ALIGNED(16, int16_t, dqcoeff[MAX_MB_PLANE][64 * 64]);
DECLARE_ALIGNED(16, uint16_t, eobs[MAX_MB_PLANE][256]);
@@ -53,8 +52,6 @@ typedef struct VP9Decompressor {
ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
PARTITION_CONTEXT *above_seg_context;
-
- DECLARE_ALIGNED(16, uint8_t, token_cache[1024]);
} VP9D_COMP;
#endif // VP9_DECODER_VP9_ONYXD_INT_H_
diff --git a/vp9/decoder/vp9_treereader.h b/vp9/decoder/vp9_treereader.h
deleted file mode 100644
index 41680d245..000000000
--- a/vp9/decoder/vp9_treereader.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP9_DECODER_VP9_TREEREADER_H_
-#define VP9_DECODER_VP9_TREEREADER_H_
-
-#include "vp9/common/vp9_treecoder.h"
-#include "vp9/decoder/vp9_dboolhuff.h"
-
-// Intent of tree data structure is to make decoding trivial.
-static int treed_read(vp9_reader *const r, /* !!! must return a 0 or 1 !!! */
- vp9_tree t,
- const vp9_prob *const p) {
- register vp9_tree_index i = 0;
-
- while ((i = t[ i + vp9_read(r, p[i >> 1])]) > 0)
- continue;
-
- return -i;
-}
-
-#endif // VP9_DECODER_VP9_TREEREADER_H_
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index a0fced576..9f79f8cdc 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -32,6 +32,7 @@
#include "vp9/encoder/vp9_bitstream.h"
#include "vp9/encoder/vp9_segmentation.h"
#include "vp9/encoder/vp9_subexp.h"
+#include "vp9/encoder/vp9_tokenize.h"
#include "vp9/encoder/vp9_write_bit_buffer.h"
@@ -151,6 +152,30 @@ void write_switchable_interp_stats() {
}
#endif
+static struct vp9_token intra_mode_encodings[INTRA_MODES];
+static struct vp9_token switchable_interp_encodings[SWITCHABLE_FILTERS];
+static struct vp9_token partition_encodings[PARTITION_TYPES];
+static struct vp9_token inter_mode_encodings[INTER_MODES];
+
+void vp9_entropy_mode_init() {
+ vp9_tokens_from_tree(intra_mode_encodings, vp9_intra_mode_tree);
+ vp9_tokens_from_tree(switchable_interp_encodings, vp9_switchable_interp_tree);
+ vp9_tokens_from_tree(partition_encodings, vp9_partition_tree);
+ vp9_tokens_from_tree(inter_mode_encodings, vp9_inter_mode_tree);
+}
+
+static void write_intra_mode(vp9_writer *w, MB_PREDICTION_MODE mode,
+ const vp9_prob *probs) {
+ write_token(w, vp9_intra_mode_tree, probs, &intra_mode_encodings[mode]);
+}
+
+static void write_inter_mode(vp9_writer *w, MB_PREDICTION_MODE mode,
+ const vp9_prob *probs) {
+ assert(is_inter_mode(mode));
+ write_token(w, vp9_inter_mode_tree, probs,
+ &inter_mode_encodings[INTER_OFFSET(mode)]);
+}
+
static INLINE void write_be32(uint8_t *p, int value) {
p[0] = value >> 24;
p[1] = value >> 16;
@@ -169,6 +194,8 @@ static void prob_diff_update(const vp9_tree_index *tree,
int n, vp9_writer *w) {
int i;
unsigned int branch_ct[32][2];
+
+ // Assuming max number of probabilities <= 32
assert(n <= 32);
vp9_tree_probs_from_distribution(tree, branch_ct, counts);
@@ -211,10 +238,6 @@ void vp9_update_skip_probs(VP9_COMP *cpi, vp9_writer *w) {
vp9_cond_prob_diff_update(w, &cm->fc.mbskip_probs[k], cm->counts.mbskip[k]);
}
-static void write_intra_mode(vp9_writer *bc, int m, const vp9_prob *p) {
- write_token(bc, vp9_intra_mode_tree, p, vp9_intra_mode_encodings + m);
-}
-
static void update_switchable_interp_probs(VP9_COMP *cpi, vp9_writer *w) {
VP9_COMMON *const cm = &cpi->common;
int j;
@@ -291,14 +314,6 @@ static void pack_mb_tokens(vp9_writer* const w,
*tp = p + (p->token == EOSB_TOKEN);
}
-static void write_sb_mv_ref(vp9_writer *w, MB_PREDICTION_MODE mode,
- const vp9_prob *p) {
- assert(is_inter_mode(mode));
- write_token(w, vp9_inter_mode_tree, p,
- &vp9_inter_mode_encodings[INTER_OFFSET(mode)]);
-}
-
-
static void write_segment_id(vp9_writer *w, const struct segmentation *seg,
int segment_id) {
if (seg->enabled && seg->update_map)
@@ -319,12 +334,12 @@ static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) {
if (!seg_ref_active) {
// does the feature use compound prediction or not
// (if not specified at the frame/segment level)
- if (cm->comp_pred_mode == HYBRID_PREDICTION) {
+ if (cm->comp_pred_mode == REFERENCE_MODE_SELECT) {
vp9_write(bc, mi->ref_frame[1] > INTRA_FRAME,
vp9_get_pred_prob_comp_inter_inter(cm, xd));
} else {
assert((mi->ref_frame[1] <= INTRA_FRAME) ==
- (cm->comp_pred_mode == SINGLE_PREDICTION_ONLY));
+ (cm->comp_pred_mode == SINGLE_REFERENCE));
}
if (mi->ref_frame[1] > INTRA_FRAME) {
@@ -420,7 +435,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
// If segment skip is not enabled code the mode.
if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
if (bsize >= BLOCK_8X8) {
- write_sb_mv_ref(bc, mode, mv_ref_p);
+ write_inter_mode(bc, mode, mv_ref_p);
++cm->counts.inter_mode[mi->mode_context[rf]]
[INTER_OFFSET(mode)];
}
@@ -430,7 +445,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
const int ctx = vp9_get_pred_context_switchable_interp(xd);
write_token(bc, vp9_switchable_interp_tree,
cm->fc.switchable_interp_prob[ctx],
- &vp9_switchable_interp_encodings[mi->interp_filter]);
+ &switchable_interp_encodings[mi->interp_filter]);
} else {
assert(mi->interp_filter == cm->mcomp_filter_type);
}
@@ -443,7 +458,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
const int j = idy * 2 + idx;
const MB_PREDICTION_MODE blockmode = m->bmi[j].as_mode;
- write_sb_mv_ref(bc, blockmode, mv_ref_p);
+ write_inter_mode(bc, blockmode, mv_ref_p);
++cm->counts.inter_mode[mi->mode_context[rf]]
[INTER_OFFSET(blockmode)];
@@ -559,7 +574,7 @@ static void write_partition(VP9_COMP *cpi, int hbs, int mi_row, int mi_col,
const int has_cols = (mi_col + hbs) < cm->mi_cols;
if (has_rows && has_cols) {
- write_token(w, vp9_partition_tree, probs, &vp9_partition_encodings[p]);
+ write_token(w, vp9_partition_tree, probs, &partition_encodings[p]);
} else if (!has_rows && has_cols) {
assert(p == PARTITION_SPLIT || p == PARTITION_HORZ);
vp9_write(w, p == PARTITION_SPLIT, probs[1]);
@@ -1357,8 +1372,8 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
if (cm->allow_comp_inter_inter) {
const int comp_pred_mode = cpi->common.comp_pred_mode;
- const int use_compound_pred = comp_pred_mode != SINGLE_PREDICTION_ONLY;
- const int use_hybrid_pred = comp_pred_mode == HYBRID_PREDICTION;
+ const int use_compound_pred = comp_pred_mode != SINGLE_REFERENCE;
+ const int use_hybrid_pred = comp_pred_mode == REFERENCE_MODE_SELECT;
vp9_write_bit(&header_bc, use_compound_pred);
if (use_compound_pred) {
@@ -1370,7 +1385,7 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
}
}
- if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) {
+ if (cm->comp_pred_mode != COMPOUND_REFERENCE) {
for (i = 0; i < REF_CONTEXTS; i++) {
vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][0],
cpi->single_ref_count[i][0]);
@@ -1379,7 +1394,7 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
}
}
- if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
+ if (cm->comp_pred_mode != SINGLE_REFERENCE)
for (i = 0; i < REF_CONTEXTS; i++)
vp9_cond_prob_diff_update(&header_bc, &fc->comp_ref_prob[i],
cpi->comp_ref_count[i]);
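Note on the bitstream changes above: vp9_entropy_mode_init now builds the mode/filter/partition token tables locally with vp9_tokens_from_tree, and write_intra_mode/write_inter_mode emit them through write_token. The sketch below models how such a tree-coded symbol is written once it has been flattened to a (bit pattern, length) pair; the types and the write_bool callback are illustrative stand-ins, not the libvpx writer API.

    #include <stdint.h>

    typedef int8_t tree_index_t;
    typedef void (*write_bool_fn)(void *w, int bit, int prob);

    /* Walk the tree MSB-first, coding each bit with the probability of the
     * internal node it leaves -- the same walk a tree-token writer performs. */
    static void write_tree_symbol_model(write_bool_fn write_bool, void *w,
                                        const tree_index_t *tree,
                                        const uint8_t *probs,
                                        int bits, int len) {
      tree_index_t i = 0;
      while (len--) {
        const int bit = (bits >> len) & 1;
        write_bool(w, bit, probs[i >> 1]);
        i = tree[i + bit];
      }
    }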
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 444597067..71f7e7a52 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -69,6 +69,7 @@ typedef struct {
struct macroblock_plane {
DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]);
+ int16_t *qcoeff;
int16_t *coeff;
struct buf_2d src;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 33839370a..89da78190 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -360,6 +360,52 @@ void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x) {
adjust_act_zbin(cpi, x);
}
+// Select a segment for the current SB64
+static void select_in_frame_q_segment(VP9_COMP *cpi,
+ int mi_row, int mi_col,
+ int output_enabled, int projected_rate) {
+ VP9_COMMON * const cm = &cpi->common;
+ int target_rate = cpi->rc.sb64_target_rate << 8; // convert to bits << 8
+
+ const int mi_offset = mi_row * cm->mi_cols + mi_col;
+ const int bw = 1 << mi_width_log2(BLOCK_64X64);
+ const int bh = 1 << mi_height_log2(BLOCK_64X64);
+ const int xmis = MIN(cm->mi_cols - mi_col, bw);
+ const int ymis = MIN(cm->mi_rows - mi_row, bh);
+ int complexity_metric = 64;
+ int x, y;
+
+ unsigned char segment;
+
+ if (!output_enabled) {
+ segment = 0;
+ } else {
+ // Rate depends on the fraction of the SB64 inside the frame ((xmis * ymis) / (bw * bh)).
+ // It is converted to bits * 256 units.
+ target_rate = (cpi->rc.sb64_target_rate * xmis * ymis * 256) / (bw * bh);
+
+ if (projected_rate < (target_rate / 4)) {
+ segment = 2;
+ } else if (projected_rate < (target_rate / 2)) {
+ segment = 1;
+ } else {
+ segment = 0;
+ }
+
+ complexity_metric =
+ clamp((int)((projected_rate * 64) / target_rate), 16, 255);
+ }
+
+ // Fill in the entries in the segment map corresponding to this SB64
+ for (y = 0; y < ymis; y++) {
+ for (x = 0; x < xmis; x++) {
+ cpi->segmentation_map[mi_offset + y * cm->mi_cols + x] = segment;
+ cpi->complexity_map[mi_offset + y * cm->mi_cols + x] =
+ (unsigned char)complexity_metric;
+ }
+ }
+}
+
static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
BLOCK_SIZE bsize, int output_enabled) {
int i, x_idx, y;
@@ -383,19 +429,24 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
assert(mi->mbmi.ref_frame[1] < MAX_REF_FRAMES);
assert(mi->mbmi.sb_type == bsize);
+ // For in-frame adaptive Q, copy the chosen segment id into the
+ // mode info context for the chosen mode / partition.
+ if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && output_enabled)
+ mi->mbmi.segment_id = xd->mi_8x8[0]->mbmi.segment_id;
+
*mi_addr = *mi;
max_plane = is_inter_block(mbmi) ? MAX_MB_PLANE : 1;
for (i = 0; i < max_plane; ++i) {
p[i].coeff = ctx->coeff_pbuf[i][1];
- pd[i].qcoeff = ctx->qcoeff_pbuf[i][1];
+ p[i].qcoeff = ctx->qcoeff_pbuf[i][1];
pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
pd[i].eobs = ctx->eobs_pbuf[i][1];
}
for (i = max_plane; i < MAX_MB_PLANE; ++i) {
p[i].coeff = ctx->coeff_pbuf[i][2];
- pd[i].qcoeff = ctx->qcoeff_pbuf[i][2];
+ p[i].qcoeff = ctx->qcoeff_pbuf[i][2];
pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2];
pd[i].eobs = ctx->eobs_pbuf[i][2];
}
@@ -405,10 +456,12 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
for (y = 0; y < mi_height; y++)
for (x_idx = 0; x_idx < mi_width; x_idx++)
if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx
- && (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y)
+ && (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) {
xd->mi_8x8[x_idx + y * mis] = mi_addr;
+ }
- if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+ if ((cpi->oxcf.aq_mode == VARIANCE_AQ) ||
+ (cpi->oxcf.aq_mode == COMPLEXITY_AQ)) {
vp9_mb_init_quantizer(cpi, x);
}
@@ -478,9 +531,9 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
++cm->counts.switchable_interp[ctx][mbmi->interp_filter];
}
- cpi->rd_comp_pred_diff[SINGLE_PREDICTION_ONLY] += ctx->single_pred_diff;
- cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff;
- cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff;
+ cpi->rd_comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
+ cpi->rd_comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
+ cpi->rd_comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
cpi->rd_filter_diff[i] += ctx->best_filter_diff[i];
@@ -557,7 +610,7 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
/* segment ID */
if (seg->enabled) {
- if (!cpi->oxcf.aq_mode == VARIANCE_AQ) {
+ if (cpi->oxcf.aq_mode != VARIANCE_AQ) {
uint8_t *map = seg->update_map ? cpi->segmentation_map
: cm->last_frame_seg_map;
mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
@@ -622,7 +675,7 @@ static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
for (i = 0; i < MAX_MB_PLANE; ++i) {
p[i].coeff = ctx->coeff_pbuf[i][0];
- pd[i].qcoeff = ctx->qcoeff_pbuf[i][0];
+ p[i].qcoeff = ctx->qcoeff_pbuf[i][0];
pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][0];
pd[i].eobs = ctx->eobs_pbuf[i][0];
}
@@ -653,6 +706,14 @@ static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
vp9_clear_system_state(); // __asm emms;
x->rdmult = round(x->rdmult * rdmult_ratio);
+ } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+ const int mi_offset = mi_row * cm->mi_cols + mi_col;
+ unsigned char complexity = cpi->complexity_map[mi_offset];
+ const int is_edge = (mi_row == 0) || (mi_row == (cm->mi_rows - 1)) ||
+ (mi_col == 0) || (mi_col == (cm->mi_cols - 1));
+
+ if (!is_edge && (complexity > 128))
+ x->rdmult = x->rdmult + ((x->rdmult * (complexity - 128)) / 256);
}
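In the COMPLEXITY_AQ branch above, the rd multiplier is scaled up only when the stored complexity exceeds 128 and the block is not on a frame edge; the scale factor works out to 1 + (complexity - 128)/256. A small self-contained check of that arithmetic (plain ints, illustrative names):

#include <assert.h>

// rdmult grows by (complexity - 128)/256 when complexity > 128 and the
// block is not on a frame edge, as in the branch above.
static int adjust_rdmult(int rdmult, int complexity, int is_edge) {
  if (!is_edge && complexity > 128)
    rdmult = rdmult + ((rdmult * (complexity - 128)) / 256);
  return rdmult;
}

int main(void) {
  // complexity 192 -> +25% (64/256); complexity 128 or an edge block -> unchanged.
  assert(adjust_rdmult(1000, 192, 0) == 1250);
  assert(adjust_rdmult(1000, 128, 0) == 1000);
  assert(adjust_rdmult(1000, 192, 1) == 1000);
  return 0;
}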
// Find best coding mode & reconstruct the MB so it is available
@@ -697,7 +758,7 @@ static void update_stats(VP9_COMP *cpi) {
// reference frame allowed for the segment so exclude it from
// the reference frame counts used to work out probabilities.
if (is_inter_block(mbmi) && !seg_ref_active) {
- if (cm->comp_pred_mode == HYBRID_PREDICTION)
+ if (cm->comp_pred_mode == REFERENCE_MODE_SELECT)
cpi->comp_inter_count[vp9_get_pred_context_comp_inter_inter(cm, xd)]
[has_second_ref(mbmi)]++;
@@ -1261,8 +1322,19 @@ static void rd_use_partition(VP9_COMP *cpi,
if ( bsize == BLOCK_64X64)
assert(chosen_rate < INT_MAX && chosen_dist < INT_MAX);
- if (do_recon)
- encode_sb(cpi, tile, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize);
+ if (do_recon) {
+ int output_enabled = (bsize == BLOCK_64X64);
+
+      // Check the projected output rate for this SB against its target
+      // and if necessary apply a Q delta using segmentation to get
+      // closer to the target.
+ if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) {
+ select_in_frame_q_segment(cpi, mi_row, mi_col,
+ output_enabled, chosen_rate);
+ }
+
+ encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize);
+ }
*rate = chosen_rate;
*dist = chosen_dist;
@@ -1495,10 +1567,14 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
// Override skipping rectangular partition operations for edge blocks
const int force_horz_split = (mi_row + ms >= cm->mi_rows);
const int force_vert_split = (mi_col + ms >= cm->mi_cols);
+ const int xss = x->e_mbd.plane[1].subsampling_x;
+ const int yss = x->e_mbd.plane[1].subsampling_y;
int partition_none_allowed = !force_horz_split && !force_vert_split;
- int partition_horz_allowed = !force_vert_split && bsize >= BLOCK_8X8;
- int partition_vert_allowed = !force_horz_split && bsize >= BLOCK_8X8;
+ int partition_horz_allowed = !force_vert_split && yss <= xss &&
+ bsize >= BLOCK_8X8;
+ int partition_vert_allowed = !force_horz_split && xss <= yss &&
+ bsize >= BLOCK_8X8;
int partition_split_done = 0;
(void) *tp_orig;
@@ -1740,8 +1816,17 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
*rate = best_rate;
*dist = best_dist;
- if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon)
- encode_sb(cpi, tile, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize);
+ if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon) {
+ int output_enabled = (bsize == BLOCK_64X64);
+
+    // Check the projected output rate for this SB against its target
+    // and if necessary apply a Q delta using segmentation to get
+    // closer to the target.
+ if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) {
+ select_in_frame_q_segment(cpi, mi_row, mi_col, output_enabled, best_rate);
+ }
+ encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize);
+ }
if (bsize == BLOCK_64X64) {
assert(tp_orig < *tp);
assert(best_rate < INT_MAX);
@@ -1868,10 +1953,6 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
xd->mode_info_stride = cm->mode_info_stride;
- // reset intra mode contexts
- if (frame_is_intra_only(cm))
- vp9_init_mbmode_probs(cm);
-
// Copy data over into macro block data structures.
vp9_setup_src_planes(x, cpi->Source, 0, 0);
@@ -2234,18 +2315,18 @@ void vp9_encode_frame(VP9_COMP *cpi) {
/* prediction (compound, single or hybrid) mode selection */
if (frame_type == 3 || !cm->allow_comp_inter_inter)
- pred_type = SINGLE_PREDICTION_ONLY;
+ pred_type = SINGLE_REFERENCE;
else if (cpi->rd_prediction_type_threshes[frame_type][1]
> cpi->rd_prediction_type_threshes[frame_type][0]
&& cpi->rd_prediction_type_threshes[frame_type][1]
> cpi->rd_prediction_type_threshes[frame_type][2]
&& check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100)
- pred_type = COMP_PREDICTION_ONLY;
+ pred_type = COMPOUND_REFERENCE;
else if (cpi->rd_prediction_type_threshes[frame_type][0]
> cpi->rd_prediction_type_threshes[frame_type][2])
- pred_type = SINGLE_PREDICTION_ONLY;
+ pred_type = SINGLE_REFERENCE;
else
- pred_type = HYBRID_PREDICTION;
+ pred_type = REFERENCE_MODE_SELECT;
/* filter type selection */
// FIXME(rbultje) for some odd reason, we often select smooth_filter
@@ -2282,7 +2363,7 @@ void vp9_encode_frame(VP9_COMP *cpi) {
cpi->common.mcomp_filter_type = filter_type;
encode_frame_internal(cpi);
- for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
+ for (i = 0; i < REFERENCE_MODES; ++i) {
const int diff = (int) (cpi->rd_comp_pred_diff[i] / cpi->common.MBs);
cpi->rd_prediction_type_threshes[frame_type][i] += diff;
cpi->rd_prediction_type_threshes[frame_type][i] >>= 1;
@@ -2305,7 +2386,7 @@ void vp9_encode_frame(VP9_COMP *cpi) {
cpi->rd_tx_select_threshes[frame_type][i] /= 2;
}
- if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+ if (cpi->common.comp_pred_mode == REFERENCE_MODE_SELECT) {
int single_count_zero = 0;
int comp_count_zero = 0;
@@ -2315,10 +2396,10 @@ void vp9_encode_frame(VP9_COMP *cpi) {
}
if (comp_count_zero == 0) {
- cpi->common.comp_pred_mode = SINGLE_PREDICTION_ONLY;
+ cpi->common.comp_pred_mode = SINGLE_REFERENCE;
vp9_zero(cpi->comp_inter_count);
} else if (single_count_zero == 0) {
- cpi->common.comp_pred_mode = COMP_PREDICTION_ONLY;
+ cpi->common.comp_pred_mode = COMPOUND_REFERENCE;
vp9_zero(cpi->comp_inter_count);
}
}
@@ -2415,7 +2496,8 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
const int mis = cm->mode_info_stride;
const int mi_width = num_8x8_blocks_wide_lookup[bsize];
const int mi_height = num_8x8_blocks_high_lookup[bsize];
- x->skip_recode = !x->select_txfm_size && mbmi->sb_type >= BLOCK_8X8;
+ x->skip_recode = !x->select_txfm_size && mbmi->sb_type >= BLOCK_8X8 &&
+ (cpi->oxcf.aq_mode != COMPLEXITY_AQ);
x->skip_optimize = ctx->is_coded;
ctx->is_coded = 1;
x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct;
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 88cf11214..3691e7a7b 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -136,6 +136,7 @@ static void optimize_b(MACROBLOCK *mb,
ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
TX_SIZE tx_size) {
MACROBLOCKD *const xd = &mb->e_mbd;
+ struct macroblock_plane *p = &mb->plane[plane];
struct macroblockd_plane *pd = &xd->plane[plane];
const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi);
vp9_token_state tokens[1025][2];
@@ -163,7 +164,7 @@ static void optimize_b(MACROBLOCK *mb,
assert((!type && !plane) || (type && plane));
dqcoeff_ptr = BLOCK_OFFSET(pd->dqcoeff, block);
- qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
+ qcoeff_ptr = BLOCK_OFFSET(p->qcoeff, block);
assert(eob <= default_eob);
/* Now set up a Viterbi trellis to evaluate alternative roundings. */
@@ -368,26 +369,23 @@ void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize,
struct macroblock_plane *const p = &x->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
int16_t *coeff = BLOCK_OFFSET(p->coeff, block);
- int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block);
+ int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
const scan_order *so;
uint16_t *eob = &pd->eobs[block];
- const int bwl = b_width_log2(plane_bsize), bw = 1 << bwl;
- const int twl = bwl - tx_size, twmask = (1 << twl) - 1;
- int xoff, yoff;
+ const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+ int i, j;
int16_t *src_diff;
+ txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+ src_diff = &p->src_diff[4 * (j * diff_stride + i)];
switch (tx_size) {
case TX_32X32:
so = &vp9_default_scan_orders[TX_32X32];
- block >>= 6;
- xoff = 32 * (block & twmask);
- yoff = 32 * (block >> twl);
- src_diff = p->src_diff + 4 * bw * yoff + xoff;
if (x->use_lp32x32fdct)
- vp9_fdct32x32_rd(src_diff, coeff, bw * 4);
+ vp9_fdct32x32_rd(src_diff, coeff, diff_stride);
else
- vp9_fdct32x32(src_diff, coeff, bw * 4);
+ vp9_fdct32x32(src_diff, coeff, diff_stride);
vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, so->scan,
@@ -395,32 +393,21 @@ void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize,
break;
case TX_16X16:
so = &vp9_default_scan_orders[TX_16X16];
- block >>= 4;
- xoff = 16 * (block & twmask);
- yoff = 16 * (block >> twl);
- src_diff = p->src_diff + 4 * bw * yoff + xoff;
- vp9_fdct16x16(src_diff, coeff, bw * 4);
+ vp9_fdct16x16(src_diff, coeff, diff_stride);
vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, so->scan, so->iscan);
break;
case TX_8X8:
so = &vp9_default_scan_orders[TX_8X8];
- block >>= 2;
- xoff = 8 * (block & twmask);
- yoff = 8 * (block >> twl);
- src_diff = p->src_diff + 4 * bw * yoff + xoff;
- vp9_fdct8x8(src_diff, coeff, bw * 4);
+ vp9_fdct8x8(src_diff, coeff, diff_stride);
vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, so->scan, so->iscan);
break;
case TX_4X4:
so = &vp9_default_scan_orders[TX_4X4];
- xoff = 4 * (block & twmask);
- yoff = 4 * (block >> twl);
- src_diff = p->src_diff + 4 * bw * yoff + xoff;
- x->fwd_txm4x4(src_diff, coeff, bw * 4);
+ x->fwd_txm4x4(src_diff, coeff, diff_stride);
vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, so->scan, so->iscan);
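The vp9_xform_quant() change above replaces the per-tx-size offset math with a single diff_stride plus txfrm_block_to_raster_xy(). The following is an illustrative re-derivation of that mapping from the removed code; toy_block_to_raster_xy and the assumption that block counts 4x4 units are assumptions of this sketch, not libvpx's API. (i, j) come out in 4x4-block units, so the residual pointer is src_diff + 4 * (j * diff_stride + i), as in the hunk.

#include <stdio.h>

static void toy_block_to_raster_xy(int bwl_4x4, int tx_size_log2, int block,
                                   int *i, int *j) {
  const int tx_cols_log2 = bwl_4x4 - tx_size_log2;   // tx blocks per row, log2
  const int raster = block >> (2 * tx_size_log2);    // transform-block index
  *i = (raster & ((1 << tx_cols_log2) - 1)) << tx_size_log2;
  *j = (raster >> tx_cols_log2) << tx_size_log2;
}

int main(void) {
  // 32x32 plane: 8 4x4 units wide (bwl_4x4 = 3), diff_stride = 4 * 8 = 32.
  // 8x8 transforms (tx_size_log2 = 1): block 20 is the 6th 8x8 block in
  // raster order, i.e. row 1, column 1 of the 8x8 grid.
  int i, j;
  toy_block_to_raster_xy(3, 1, 20, &i, &j);
  printf("i=%d j=%d offset=%d\n", i, j, 4 * (j * 32 + i));  // i=2 j=2 offset=264
  return 0;
}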
@@ -544,7 +531,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
struct macroblock_plane *const p = &x->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
int16_t *coeff = BLOCK_OFFSET(p->coeff, block);
- int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block);
+ int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
const scan_order *so;
TX_TYPE tx_type;
@@ -572,8 +559,9 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
block >>= 6;
vp9_predict_intra_block(xd, block, bwl, TX_32X32, mode,
- dst, pd->dst.stride, dst, pd->dst.stride);
-
+ x->skip_encode ? src : dst,
+ x->skip_encode ? p->src.stride : pd->dst.stride,
+ dst, pd->dst.stride);
if (!x->skip_recode) {
vp9_subtract_block(32, 32, src_diff, diff_stride,
src, p->src.stride, dst, pd->dst.stride);
@@ -595,7 +583,9 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
block >>= 4;
vp9_predict_intra_block(xd, block, bwl, TX_16X16, mode,
- dst, pd->dst.stride, dst, pd->dst.stride);
+ x->skip_encode ? src : dst,
+ x->skip_encode ? p->src.stride : pd->dst.stride,
+ dst, pd->dst.stride);
if (!x->skip_recode) {
vp9_subtract_block(16, 16, src_diff, diff_stride,
src, p->src.stride, dst, pd->dst.stride);
@@ -613,7 +603,9 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
block >>= 2;
vp9_predict_intra_block(xd, block, bwl, TX_8X8, mode,
- dst, pd->dst.stride, dst, pd->dst.stride);
+ x->skip_encode ? src : dst,
+ x->skip_encode ? p->src.stride : pd->dst.stride,
+ dst, pd->dst.stride);
if (!x->skip_recode) {
vp9_subtract_block(8, 8, src_diff, diff_stride,
src, p->src.stride, dst, pd->dst.stride);
@@ -634,7 +626,9 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode,
- dst, pd->dst.stride, dst, pd->dst.stride);
+ x->skip_encode ? src : dst,
+ x->skip_encode ? p->src.stride : pd->dst.stride,
+ dst, pd->dst.stride);
if (!x->skip_recode) {
vp9_subtract_block(4, 4, src_diff, diff_stride,
diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index cc4e347a3..3f01c778f 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -15,11 +15,22 @@
#include "vp9/common/vp9_systemdependent.h"
#include "vp9/encoder/vp9_encodemv.h"
-
#ifdef ENTROPY_STATS
extern unsigned int active_section;
#endif
+static struct vp9_token mv_joint_encodings[MV_JOINTS];
+static struct vp9_token mv_class_encodings[MV_CLASSES];
+static struct vp9_token mv_fp_encodings[MV_FP_SIZE];
+static struct vp9_token mv_class0_encodings[CLASS0_SIZE];
+
+void vp9_entropy_mv_init() {
+ vp9_tokens_from_tree(mv_joint_encodings, vp9_mv_joint_tree);
+ vp9_tokens_from_tree(mv_class_encodings, vp9_mv_class_tree);
+ vp9_tokens_from_tree(mv_class0_encodings, vp9_mv_class0_tree);
+ vp9_tokens_from_tree(mv_fp_encodings, vp9_mv_fp_tree);
+}
+
static void encode_mv_component(vp9_writer* w, int comp,
const nmv_component* mvcomp, int usehp) {
int offset;
@@ -37,12 +48,12 @@ static void encode_mv_component(vp9_writer* w, int comp,
// Class
write_token(w, vp9_mv_class_tree, mvcomp->classes,
- &vp9_mv_class_encodings[mv_class]);
+ &mv_class_encodings[mv_class]);
// Integer bits
if (mv_class == MV_CLASS_0) {
write_token(w, vp9_mv_class0_tree, mvcomp->class0,
- &vp9_mv_class0_encodings[d]);
+ &mv_class0_encodings[d]);
} else {
int i;
const int n = mv_class + CLASS0_BITS - 1; // number of bits
@@ -53,7 +64,7 @@ static void encode_mv_component(vp9_writer* w, int comp,
// Fractional bits
write_token(w, vp9_mv_fp_tree,
mv_class == MV_CLASS_0 ? mvcomp->class0_fp[d] : mvcomp->fp,
- &vp9_mv_fp_encodings[fr]);
+ &mv_fp_encodings[fr]);
// High precision bit
if (usehp)
@@ -137,111 +148,55 @@ static int update_mv(vp9_writer *w, const unsigned int ct[2], vp9_prob *cur_p,
return update;
}
-static void counts_to_nmv_context(
- nmv_context_counts *nmv_count,
- int usehp,
- unsigned int (*branch_ct_joint)[2],
- unsigned int (*branch_ct_sign)[2],
- unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2],
- unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2],
- unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2],
- unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][MV_FP_SIZE - 1][2],
- unsigned int (*branch_ct_fp)[MV_FP_SIZE - 1][2],
- unsigned int (*branch_ct_class0_hp)[2],
- unsigned int (*branch_ct_hp)[2]) {
- int i, j, k;
- vp9_tree_probs_from_distribution(vp9_mv_joint_tree, branch_ct_joint,
- nmv_count->joints);
- for (i = 0; i < 2; ++i) {
- branch_ct_sign[i][0] = nmv_count->comps[i].sign[0];
- branch_ct_sign[i][1] = nmv_count->comps[i].sign[1];
- vp9_tree_probs_from_distribution(vp9_mv_class_tree,
- branch_ct_classes[i],
- nmv_count->comps[i].classes);
- vp9_tree_probs_from_distribution(vp9_mv_class0_tree,
- branch_ct_class0[i],
- nmv_count->comps[i].class0);
- for (j = 0; j < MV_OFFSET_BITS; ++j) {
- branch_ct_bits[i][j][0] = nmv_count->comps[i].bits[j][0];
- branch_ct_bits[i][j][1] = nmv_count->comps[i].bits[j][1];
- }
- }
- for (i = 0; i < 2; ++i) {
- for (k = 0; k < CLASS0_SIZE; ++k) {
- vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
- branch_ct_class0_fp[i][k],
- nmv_count->comps[i].class0_fp[k]);
- }
- vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
- branch_ct_fp[i],
- nmv_count->comps[i].fp);
- }
- if (usehp) {
- for (i = 0; i < 2; ++i) {
- branch_ct_class0_hp[i][0] = nmv_count->comps[i].class0_hp[0];
- branch_ct_class0_hp[i][1] = nmv_count->comps[i].class0_hp[1];
+static void write_mv_update(const vp9_tree_index *tree,
+ vp9_prob probs[/*n - 1*/],
+ const unsigned int counts[/*n - 1*/],
+ int n, vp9_writer *w) {
+ int i;
+ unsigned int branch_ct[32][2];
- branch_ct_hp[i][0] = nmv_count->comps[i].hp[0];
- branch_ct_hp[i][1] = nmv_count->comps[i].hp[1];
- }
- }
+ // Assuming max number of probabilities <= 32
+ assert(n <= 32);
+
+ vp9_tree_probs_from_distribution(tree, branch_ct, counts);
+ for (i = 0; i < n - 1; ++i)
+ update_mv(w, branch_ct[i], &probs[i], NMV_UPDATE_PROB);
}
-void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) {
+void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer *w) {
int i, j;
- unsigned int branch_ct_joint[MV_JOINTS - 1][2];
- unsigned int branch_ct_sign[2][2];
- unsigned int branch_ct_classes[2][MV_CLASSES - 1][2];
- unsigned int branch_ct_class0[2][CLASS0_SIZE - 1][2];
- unsigned int branch_ct_bits[2][MV_OFFSET_BITS][2];
- unsigned int branch_ct_class0_fp[2][CLASS0_SIZE][MV_FP_SIZE - 1][2];
- unsigned int branch_ct_fp[2][MV_FP_SIZE - 1][2];
- unsigned int branch_ct_class0_hp[2][2];
- unsigned int branch_ct_hp[2][2];
nmv_context *mvc = &cpi->common.fc.nmvc;
+ nmv_context_counts *counts = &cpi->NMVcount;
- counts_to_nmv_context(&cpi->NMVcount, usehp,
- branch_ct_joint, branch_ct_sign, branch_ct_classes,
- branch_ct_class0, branch_ct_bits,
- branch_ct_class0_fp, branch_ct_fp,
- branch_ct_class0_hp, branch_ct_hp);
-
- for (j = 0; j < MV_JOINTS - 1; ++j)
- update_mv(bc, branch_ct_joint[j], &mvc->joints[j], NMV_UPDATE_PROB);
+ write_mv_update(vp9_mv_joint_tree, mvc->joints, counts->joints, MV_JOINTS, w);
for (i = 0; i < 2; ++i) {
- update_mv(bc, branch_ct_sign[i], &mvc->comps[i].sign, NMV_UPDATE_PROB);
- for (j = 0; j < MV_CLASSES - 1; ++j)
- update_mv(bc, branch_ct_classes[i][j], &mvc->comps[i].classes[j],
- NMV_UPDATE_PROB);
-
- for (j = 0; j < CLASS0_SIZE - 1; ++j)
- update_mv(bc, branch_ct_class0[i][j], &mvc->comps[i].class0[j],
- NMV_UPDATE_PROB);
-
+ nmv_component *comp = &mvc->comps[i];
+ nmv_component_counts *comp_counts = &counts->comps[i];
+
+ update_mv(w, comp_counts->sign, &comp->sign, NMV_UPDATE_PROB);
+ write_mv_update(vp9_mv_class_tree, comp->classes, comp_counts->classes,
+ MV_CLASSES, w);
+ write_mv_update(vp9_mv_class0_tree, comp->class0, comp_counts->class0,
+ CLASS0_SIZE, w);
for (j = 0; j < MV_OFFSET_BITS; ++j)
- update_mv(bc, branch_ct_bits[i][j], &mvc->comps[i].bits[j],
- NMV_UPDATE_PROB);
+ update_mv(w, comp_counts->bits[j], &comp->bits[j], NMV_UPDATE_PROB);
}
for (i = 0; i < 2; ++i) {
- for (j = 0; j < CLASS0_SIZE; ++j) {
- int k;
- for (k = 0; k < MV_FP_SIZE - 1; ++k)
- update_mv(bc, branch_ct_class0_fp[i][j][k],
- &mvc->comps[i].class0_fp[j][k], NMV_UPDATE_PROB);
- }
+ for (j = 0; j < CLASS0_SIZE; ++j)
+ write_mv_update(vp9_mv_fp_tree, mvc->comps[i].class0_fp[j],
+ counts->comps[i].class0_fp[j], MV_FP_SIZE, w);
- for (j = 0; j < MV_FP_SIZE - 1; ++j)
- update_mv(bc, branch_ct_fp[i][j], &mvc->comps[i].fp[j], NMV_UPDATE_PROB);
+ write_mv_update(vp9_mv_fp_tree, mvc->comps[i].fp, counts->comps[i].fp,
+ MV_FP_SIZE, w);
}
if (usehp) {
for (i = 0; i < 2; ++i) {
- update_mv(bc, branch_ct_class0_hp[i], &mvc->comps[i].class0_hp,
- NMV_UPDATE_PROB);
- update_mv(bc, branch_ct_hp[i], &mvc->comps[i].hp,
+ update_mv(w, counts->comps[i].class0_hp, &mvc->comps[i].class0_hp,
NMV_UPDATE_PROB);
+ update_mv(w, counts->comps[i].hp, &mvc->comps[i].hp, NMV_UPDATE_PROB);
}
}
}
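The new write_mv_update() folds the old counts_to_nmv_context() bookkeeping into one generic step: derive per-branch 0/1 counts from the symbol counts by walking the token tree, then conditionally update each of the n - 1 node probabilities. A toy with a hard-coded 4-symbol left-leaning tree (the same shape as vp9_mv_joint_tree, as read here) shows what those branch counts look like; the names and counts are illustrative, and this is not the library's tree walker:

#include <stdio.h>

// For a 4-symbol tree of shape (s0, (s1, (s2, s3))) there are n - 1 = 3
// internal nodes; each node's [0]/[1] counts are the counts of symbols
// reached through its left/right branch.
static void toy_branch_counts(const unsigned counts[4], unsigned ct[3][2]) {
  ct[0][0] = counts[0];
  ct[0][1] = counts[1] + counts[2] + counts[3];
  ct[1][0] = counts[1];
  ct[1][1] = counts[2] + counts[3];
  ct[2][0] = counts[2];
  ct[2][1] = counts[3];
}

int main(void) {
  const unsigned counts[4] = { 80, 10, 6, 4 };  // hypothetical symbol counts
  unsigned ct[3][2];
  int i;
  toy_branch_counts(counts, ct);
  for (i = 0; i < 3; ++i)
    printf("node %d: %u / %u\n", i, ct[i][0], ct[i][1]);
  // Each (ct[i][0], ct[i][1]) pair would then feed a conditional probability
  // update for node i, which is what write_mv_update() loops over.
  return 0;
}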
@@ -254,7 +209,7 @@ void vp9_encode_mv(VP9_COMP* cpi, vp9_writer* w,
const MV_JOINT_TYPE j = vp9_get_mv_joint(&diff);
usehp = usehp && vp9_use_mv_hp(ref);
- write_token(w, vp9_mv_joint_tree, mvctx->joints, &vp9_mv_joint_encodings[j]);
+ write_token(w, vp9_mv_joint_tree, mvctx->joints, &mv_joint_encodings[j]);
if (mv_joint_vertical(j))
encode_mv_component(w, diff.row, &mvctx->comps[0], usehp);
@@ -314,3 +269,4 @@ void vp9_update_mv_count(VP9_COMP *cpi, MACROBLOCK *x, int_mv best_ref_mv[2]) {
inc_mvs(mbmi->mv, best_ref_mv, is_compound, &cpi->NMVcount);
}
}
+
diff --git a/vp9/encoder/vp9_encodemv.h b/vp9/encoder/vp9_encodemv.h
index 633177885..4cc10da73 100644
--- a/vp9/encoder/vp9_encodemv.h
+++ b/vp9/encoder/vp9_encodemv.h
@@ -14,6 +14,8 @@
#include "vp9/encoder/vp9_onyx_int.h"
+void vp9_entropy_mv_init();
+
void vp9_write_nmv_probs(VP9_COMP* const, int usehp, vp9_writer* const);
void vp9_encode_mv(VP9_COMP *cpi, vp9_writer* w, const MV* mv, const MV* ref,
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index df2841020..50d803680 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -76,6 +76,19 @@ static int select_cq_level(int qindex) {
return ret_val;
}
+static int gfboost_qadjust(int qindex) {
+ const double q = vp9_convert_qindex_to_q(qindex);
+ return (int)((0.00000828 * q * q * q) +
+ (-0.0055 * q * q) +
+ (1.32 * q) + 79.3);
+}
+
+static int kfboost_qadjust(int qindex) {
+ const double q = vp9_convert_qindex_to_q(qindex);
+ return (int)((0.00000973 * q * q * q) +
+ (-0.00613 * q * q) +
+ (1.316 * q) + 121.2);
+}
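The two boost adjustment curves moved in from the rate control code are plain cubics in the converted quantizer. A quick standalone evaluation, assuming a converted q of 40 purely for illustration (the real value comes from vp9_convert_qindex_to_q()):

#include <stdio.h>

// Same polynomial as gfboost_qadjust() above, but taking the already
// converted q directly so the example stays self-contained.
static int gfboost_from_q(double q) {
  return (int)((0.00000828 * q * q * q) +
               (-0.0055 * q * q) +
               (1.32 * q) + 79.3);
}

int main(void) {
  // q = 40: 0.53 - 8.8 + 52.8 + 79.3 ~= 123.8, truncated to 123.
  printf("%d\n", gfboost_from_q(40.0));
  return 0;
}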
// Resets the first pass file to the given position using a relative seek from
// the current position.
@@ -336,9 +349,11 @@ static int frame_max_bits(VP9_COMP *cpi) {
const double max_bits = (1.0 * cpi->twopass.bits_left /
(cpi->twopass.total_stats.count - cpi->common.current_video_frame)) *
(cpi->oxcf.two_pass_vbrmax_section / 100.0);
-
- // Trap case where we are out of bits.
- return MAX((int)max_bits, 0);
+ if (max_bits < 0)
+ return 0;
+ if (max_bits >= INT_MAX)
+ return INT_MAX;
+ return (int)max_bits;
}
void vp9_init_first_pass(VP9_COMP *cpi) {
@@ -528,7 +543,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
for (i = 0; i < MAX_MB_PLANE; ++i) {
p[i].coeff = ctx->coeff_pbuf[i][1];
- pd[i].qcoeff = ctx->qcoeff_pbuf[i][1];
+ p[i].qcoeff = ctx->qcoeff_pbuf[i][1];
pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
pd[i].eobs = ctx->eobs_pbuf[i][1];
}
@@ -926,11 +941,11 @@ static int64_t estimate_modemvcost(VP9_COMP *cpi,
intra_cost = bitcost(av_intra);
// Estimate of extra bits per mv overhead for mbs
- // << 9 is the normalization to the (bits * 512) used in vp9_bits_per_mb
+ // << 9 is the normalization to the (bits * 512) used in vp9_rc_bits_per_mb
mv_cost = ((int)(fpstats->new_mv_count / fpstats->count) * 8) << 9;
// Crude estimate of overhead cost from modes
- // << 9 is the normalization to (bits * 512) used in vp9_bits_per_mb
+ // << 9 is the normalization to (bits * 512) used in vp9_rc_bits_per_mb
mode_cost =
(int)((((av_pct_inter - av_pct_motion) * zz_cost) +
(av_pct_motion * motion_cost) +
@@ -1050,8 +1065,8 @@ static int estimate_max_q(VP9_COMP *cpi,
sr_correction * speed_correction *
cpi->twopass.est_max_qcorrection_factor;
- bits_per_mb_at_this_q = vp9_bits_per_mb(INTER_FRAME, q,
- err_correction_factor);
+ bits_per_mb_at_this_q = vp9_rc_bits_per_mb(INTER_FRAME, q,
+ err_correction_factor);
if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
break;
@@ -1138,7 +1153,7 @@ static int estimate_cq(VP9_COMP *cpi,
sr_correction * speed_correction * clip_iifactor;
bits_per_mb_at_this_q =
- vp9_bits_per_mb(INTER_FRAME, q, err_correction_factor);
+ vp9_rc_bits_per_mb(INTER_FRAME, q, err_correction_factor);
if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
break;
@@ -1934,7 +1949,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
int q = cpi->rc.last_q[INTER_FRAME];
int gf_bits;
- int boost = (cpi->rc.gfu_boost * vp9_gfboost_qadjust(q)) / 100;
+ int boost = (cpi->rc.gfu_boost * gfboost_qadjust(q)) / 100;
// Set max and minimum boost and hence minimum allocation
boost = clamp(boost, 125, (cpi->rc.baseline_gf_interval + 1) * 200);
@@ -2726,3 +2741,21 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// sizes.
cpi->twopass.modified_error_left -= kf_group_err;
}
+
+void vp9_twopass_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
+#ifdef DISABLE_RC_LONG_TERM_MEM
+ cpi->twopass.bits_left -= cpi->rc.this_frame_target;
+#else
+ cpi->twopass.bits_left -= 8 * bytes_used;
+#endif
+ if (!cpi->refresh_alt_ref_frame) {
+ double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.framerate;
+ double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth *
+ cpi->oxcf.two_pass_vbrmin_section
+ / 100);
+ if (two_pass_min_rate < lower_bounds_min_rate)
+ two_pass_min_rate = lower_bounds_min_rate;
+ cpi->twopass.bits_left += (int64_t)(two_pass_min_rate /
+ cpi->oxcf.framerate);
+ }
+}
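vp9_twopass_postencode_update() moves the post-encode bookkeeping out of Pass2Encode(): deduct the bits actually spent (or the frame target when DISABLE_RC_LONG_TERM_MEM is set) and, for non-alt-ref frames, hand back one frame's worth of the guaranteed two-pass minimum rate. A simplified sketch with plain doubles; the parameter names are illustrative, not the encoder's structs:

// Simplified sketch of the bits_left bookkeeping above.
static double update_bits_left(double bits_left, double bytes_used,
                               int is_altref, double framerate,
                               double target_bandwidth,
                               double vbrmin_section_pct,
                               double frame_overhead_bits) {
  bits_left -= 8.0 * bytes_used;
  if (!is_altref) {
    double min_rate = target_bandwidth * vbrmin_section_pct / 100.0;
    double lower_bound = frame_overhead_bits * framerate;
    if (min_rate < lower_bound)
      min_rate = lower_bound;
    bits_left += min_rate / framerate;   // one frame's worth of the floor
  }
  return bits_left;
}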
diff --git a/vp9/encoder/vp9_modecosts.c b/vp9/encoder/vp9_modecosts.c
deleted file mode 100644
index 7eb659232..000000000
--- a/vp9/encoder/vp9_modecosts.c
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp9/common/vp9_blockd.h"
-#include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/encoder/vp9_treewriter.h"
-#include "vp9/common/vp9_entropymode.h"
-
-
-void vp9_init_mode_costs(VP9_COMP *c) {
- VP9_COMMON *const cm = &c->common;
- const vp9_tree_index *KT = vp9_intra_mode_tree;
- int i, j;
-
- for (i = 0; i < INTRA_MODES; i++) {
- for (j = 0; j < INTRA_MODES; j++) {
- vp9_cost_tokens((int *)c->mb.y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j],
- KT);
- }
- }
-
- // TODO(rbultje) separate tables for superblock costing?
- vp9_cost_tokens(c->mb.mbmode_cost, cm->fc.y_mode_prob[1],
- vp9_intra_mode_tree);
- vp9_cost_tokens(c->mb.intra_uv_mode_cost[1],
- cm->fc.uv_mode_prob[INTRA_MODES - 1], vp9_intra_mode_tree);
- vp9_cost_tokens(c->mb.intra_uv_mode_cost[0],
- vp9_kf_uv_mode_prob[INTRA_MODES - 1],
- vp9_intra_mode_tree);
-
- for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
- vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
- cm->fc.switchable_interp_prob[i],
- vp9_switchable_interp_tree);
-}
diff --git a/vp9/encoder/vp9_modecosts.h b/vp9/encoder/vp9_modecosts.h
deleted file mode 100644
index f43033e5f..000000000
--- a/vp9/encoder/vp9_modecosts.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP9_ENCODER_VP9_MODECOSTS_H_
-#define VP9_ENCODER_VP9_MODECOSTS_H_
-
-void vp9_init_mode_costs(VP9_COMP *x);
-
-#endif // VP9_ENCODER_VP9_MODECOSTS_H_
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index d7b179689..8ae70c9bb 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -24,6 +24,8 @@
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_systemdependent.h"
#include "vp9/common/vp9_tile_common.h"
+
+#include "vp9/encoder/vp9_encodemv.h"
#include "vp9/encoder/vp9_firstpass.h"
#include "vp9/encoder/vp9_mbgraph.h"
#include "vp9/encoder/vp9_onyx_int.h"
@@ -37,6 +39,9 @@
#include "vpx_ports/vpx_timer.h"
+void vp9_entropy_mode_init();
+void vp9_coef_tree_initialize();
+
static void set_default_lf_deltas(struct loopfilter *lf);
#define DEFAULT_INTERP_FILTER SWITCHABLE
@@ -109,6 +114,9 @@ extern unsigned __int64 Sectionbits[500];
extern void vp9_init_quantizer(VP9_COMP *cpi);
+static const double in_frame_q_adj_ratio[MAX_SEGMENTS] =
+ {1.0, 1.5, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+
static INLINE void Scale2Ratio(int mode, int *hr, int *hs) {
switch (mode) {
case NORMAL:
@@ -151,11 +159,14 @@ void vp9_initialize_enc() {
if (!init_done) {
vp9_initialize_common();
+ vp9_coef_tree_initialize();
vp9_tokenize_initialize();
vp9_init_quant_tables();
vp9_init_me_luts();
- vp9_init_minq_luts();
+ vp9_rc_init_minq_luts();
// init_base_skip_probs();
+ vp9_entropy_mv_init();
+ vp9_entropy_mode_init();
init_done = 1;
}
}
@@ -192,6 +203,8 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
vpx_free(cpi->coding_context.last_frame_seg_map_copy);
cpi->coding_context.last_frame_seg_map_copy = 0;
+ vpx_free(cpi->complexity_map);
+ cpi->complexity_map = 0;
vpx_free(cpi->active_map);
cpi->active_map = 0;
@@ -243,6 +256,79 @@ int vp9_compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget) {
return target_index - start_index;
}
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a value that should equate to the given rate ratio.
+
+int vp9_compute_qdelta_by_rate(VP9_COMP *cpi,
+ double base_q_index, double rate_target_ratio) {
+ int i;
+ int base_bits_per_mb;
+ int target_bits_per_mb;
+ int target_index = cpi->rc.worst_quality;
+
+ // Make SURE use of floating point in this function is safe.
+ vp9_clear_system_state();
+
+ // Look up the current projected bits per block for the base index
+ base_bits_per_mb = vp9_rc_bits_per_mb(cpi->common.frame_type,
+ base_q_index, 1.0);
+
+ // Find the target bits per mb based on the base value and given ratio.
+ target_bits_per_mb = rate_target_ratio * base_bits_per_mb;
+
+ // Convert the q target to an index
+ for (i = cpi->rc.best_quality; i < cpi->rc.worst_quality; i++) {
+ target_index = i;
+ if (vp9_rc_bits_per_mb(cpi->common.frame_type,
+ i, 1.0) <= target_bits_per_mb )
+ break;
+ }
+
+ return target_index - base_q_index;
+}
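vp9_compute_qdelta_by_rate() is a linear search: take the projected bits per MB at the base q index, scale it by the requested ratio, walk q upward from best_quality until the projected bits per MB drops to or below that target, and return the index difference. A toy version with a made-up, monotonically decreasing bits-per-MB model; only the search shape matches the function above, the model and names are assumptions:

#include <stdio.h>

// Made-up monotone stand-in for vp9_rc_bits_per_mb(): higher q, fewer bits.
static int toy_bits_per_mb(int qindex) {
  return 20000 / (qindex + 10);
}

static int toy_qdelta_by_rate(int base_q, double ratio,
                              int best_q, int worst_q) {
  const int target = (int)(ratio * toy_bits_per_mb(base_q));
  int i, target_index = worst_q;
  for (i = best_q; i < worst_q; i++) {
    target_index = i;
    if (toy_bits_per_mb(i) <= target)
      break;
  }
  return target_index - base_q;
}

int main(void) {
  // Asking for twice the rate of base_q = 30 gives a negative delta (lower q),
  // which is how segments 1 and 2 with ratios 1.5 and 2.0 get a quality boost.
  printf("%d\n", toy_qdelta_by_rate(30, 2.0, 0, 255));  // prints -20
  return 0;
}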
+
+// This function sets up a set of segments with delta Q values around
+// the baseline frame quantizer.
+static void setup_in_frame_q_adj(VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+ struct segmentation *seg = &cm->seg;
+ // double q_ratio;
+ int segment;
+ int qindex_delta;
+
+ // Make SURE use of floating point in this function is safe.
+ vp9_clear_system_state();
+
+ if (cm->frame_type == KEY_FRAME ||
+ cpi->refresh_alt_ref_frame ||
+ (cpi->refresh_golden_frame && !cpi->is_src_frame_alt_ref)) {
+ // Clear down the segment map
+ vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+
+ // Clear down the complexity map used for rd
+ vpx_memset(cpi->complexity_map, 0, cm->mi_rows * cm->mi_cols);
+
+ // Enable segmentation
+ vp9_enable_segmentation((VP9_PTR)cpi);
+ vp9_clearall_segfeatures(seg);
+
+ // Select delta coding method
+ seg->abs_delta = SEGMENT_DELTADATA;
+
+ // Segment 0 "Q" feature is disabled so it defaults to the baseline Q
+ vp9_disable_segfeature(seg, 0, SEG_LVL_ALT_Q);
+
+ // Use some of the segments for in frame Q adjustment
+ for (segment = 1; segment < 3; segment++) {
+ qindex_delta =
+ vp9_compute_qdelta_by_rate(cpi, cm->base_qindex,
+ in_frame_q_adj_ratio[segment]);
+ vp9_enable_segfeature(seg, segment, SEG_LVL_ALT_Q);
+ vp9_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta);
+ }
+ }
+}
+
static void configure_static_seg_features(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
struct segmentation *seg = &cm->seg;
@@ -1446,6 +1532,11 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
CHECK_MEM_ERROR(cm, cpi->segmentation_map,
vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
+ // Create a complexity map used for rd adjustment
+ CHECK_MEM_ERROR(cm, cpi->complexity_map,
+ vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
+
+
// And a place holder structure is the coding context
// for use if we want to save and restore it
CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy,
@@ -2597,7 +2688,7 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) {
static void encode_with_recode_loop(VP9_COMP *cpi,
unsigned long *size,
uint8_t *dest,
- int q,
+ int *q,
int bottom_index,
int top_index,
int frame_over_shoot_limit,
@@ -2607,12 +2698,11 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
int loop = 0;
int overshoot_seen = 0;
int undershoot_seen = 0;
- int active_worst_qchanged = 0;
int q_low = bottom_index, q_high = top_index;
do {
vp9_clear_system_state(); // __asm emms;
- vp9_set_quantizer(cpi, q);
+ vp9_set_quantizer(cpi, *q);
if (loop_count == 0) {
// Set up entropy context depending on frame type. The decoder mandates
@@ -2630,8 +2720,12 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
}
}
+ // Variance adaptive and in frame q adjustment experiments are mutually
+ // exclusive.
if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
- vp9_vaq_frame_setup(cpi);
+ vp9_vaq_frame_setup(cpi);
+ } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+ setup_in_frame_q_adj(cpi);
}
// transform / motion compensation build reconstruction frame
@@ -2655,14 +2749,13 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
if (frame_over_shoot_limit == 0)
frame_over_shoot_limit = 1;
- active_worst_qchanged = 0;
if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
loop = 0;
} else {
// Special case handling for forced key frames
if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
- int last_q = q;
+ int last_q = *q;
int kf_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm));
int high_err_target = cpi->ambient_err;
@@ -2678,32 +2771,32 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
(kf_err > low_err_target &&
cpi->rc.projected_frame_size <= frame_under_shoot_limit)) {
// Lower q_high
- q_high = q > q_low ? q - 1 : q_low;
+ q_high = *q > q_low ? *q - 1 : q_low;
// Adjust Q
- q = (q * high_err_target) / kf_err;
- q = MIN(q, (q_high + q_low) >> 1);
+ *q = ((*q) * high_err_target) / kf_err;
+ *q = MIN((*q), (q_high + q_low) >> 1);
} else if (kf_err < low_err_target &&
cpi->rc.projected_frame_size >= frame_under_shoot_limit) {
// The key frame is much better than the previous frame
// Raise q_low
- q_low = q < q_high ? q + 1 : q_high;
+ q_low = *q < q_high ? *q + 1 : q_high;
// Adjust Q
- q = (q * low_err_target) / kf_err;
- q = MIN(q, (q_high + q_low + 1) >> 1);
+ *q = ((*q) * low_err_target) / kf_err;
+ *q = MIN((*q), (q_high + q_low + 1) >> 1);
}
// Clamp Q to upper and lower limits:
- q = clamp(q, q_low, q_high);
+ *q = clamp(*q, q_low, q_high);
- loop = q != last_q;
+ loop = *q != last_q;
} else if (recode_loop_test(
cpi, frame_over_shoot_limit, frame_under_shoot_limit,
- q, top_index, bottom_index)) {
+ *q, top_index, bottom_index)) {
// Is the projected frame size out of range and are we allowed
// to attempt to recode.
- int last_q = q;
+ int last_q = *q;
int retries = 0;
// Frame size out of permitted range:
@@ -2712,26 +2805,22 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
// Frame is too large
if (cpi->rc.projected_frame_size > cpi->rc.this_frame_target) {
// Raise Qlow as to at least the current value
- q_low = q < q_high ? q + 1 : q_high;
+ q_low = *q < q_high ? *q + 1 : q_high;
if (undershoot_seen || loop_count > 1) {
// Update rate_correction_factor unless
- // cpi->rc.active_worst_quality has changed.
- if (!active_worst_qchanged)
- vp9_update_rate_correction_factors(cpi, 1);
+ vp9_rc_update_rate_correction_factors(cpi, 1);
- q = (q_high + q_low + 1) / 2;
+ *q = (q_high + q_low + 1) / 2;
} else {
// Update rate_correction_factor unless
- // cpi->rc.active_worst_quality has changed.
- if (!active_worst_qchanged)
- vp9_update_rate_correction_factors(cpi, 0);
+ vp9_rc_update_rate_correction_factors(cpi, 0);
- q = vp9_regulate_q(cpi, cpi->rc.this_frame_target);
+ *q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target);
- while (q < q_low && retries < 10) {
- vp9_update_rate_correction_factors(cpi, 0);
- q = vp9_regulate_q(cpi, cpi->rc.this_frame_target);
+ while (*q < q_low && retries < 10) {
+ vp9_rc_update_rate_correction_factors(cpi, 0);
+ *q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target);
retries++;
}
}
@@ -2739,34 +2828,33 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
overshoot_seen = 1;
} else {
// Frame is too small
- q_high = q > q_low ? q - 1 : q_low;
+ q_high = *q > q_low ? *q - 1 : q_low;
if (overshoot_seen || loop_count > 1) {
// Update rate_correction_factor unless
// cpi->rc.active_worst_quality has changed.
- if (!active_worst_qchanged)
- vp9_update_rate_correction_factors(cpi, 1);
+ vp9_rc_update_rate_correction_factors(cpi, 1);
- q = (q_high + q_low) / 2;
+ *q = (q_high + q_low) / 2;
} else {
// Update rate_correction_factor unless
// cpi->rc.active_worst_quality has changed.
- if (!active_worst_qchanged)
- vp9_update_rate_correction_factors(cpi, 0);
+ vp9_rc_update_rate_correction_factors(cpi, 0);
- q = vp9_regulate_q(cpi, cpi->rc.this_frame_target);
+ *q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target);
// Special case reset for qlow for constrained quality.
// This should only trigger where there is very substantial
// undershoot on a frame and the auto cq level is above
          // the user passed in value.
- if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY && q < q_low) {
- q_low = q;
+ if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY &&
+ *q < q_low) {
+ q_low = *q;
}
- while (q > q_high && retries < 10) {
- vp9_update_rate_correction_factors(cpi, 0);
- q = vp9_regulate_q(cpi, cpi->rc.this_frame_target);
+ while (*q > q_high && retries < 10) {
+ vp9_rc_update_rate_correction_factors(cpi, 0);
+ *q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target);
retries++;
}
}
@@ -2775,9 +2863,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
}
// Clamp Q to upper and lower limits:
- q = clamp(q, q_low, q_high);
+ *q = clamp(*q, q_low, q_high);
- loop = q != last_q;
+ loop = *q != last_q;
} else {
loop = 0;
}
@@ -2794,7 +2882,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
#endif
}
} while (loop);
- cpi->rc.active_worst_qchanged = active_worst_qchanged;
}
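The recode loop now takes q by pointer so the caller sees the final value, but the control flow is unchanged: on overshoot raise q_low and move q toward the upper half of [q_low, q_high], on undershoot lower q_high and move toward the lower half, then clamp and stop when q stops changing. A stripped-down sketch of the bisection branch only, with illustrative names and without the rate-correction-factor path:

// One overshoot/undershoot adjustment of the recode loop above,
// stripped of the rate-correction bookkeeping. Returns the next q to try.
static int next_recode_q(int q, int *q_low, int *q_high,
                         int projected_size, int target_size) {
  if (projected_size > target_size) {
    // Frame too large: the answer lies at or above the current q.
    *q_low = q < *q_high ? q + 1 : *q_high;
    q = (*q_high + *q_low + 1) / 2;
  } else {
    // Frame too small: the answer lies at or below the current q.
    *q_high = q > *q_low ? q - 1 : *q_low;
    q = (*q_high + *q_low) / 2;
  }
  // Clamp, as the loop above does after every adjustment.
  if (q < *q_low) q = *q_low;
  if (q > *q_high) q = *q_high;
  return q;
}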
static void encode_frame_to_data_rate(VP9_COMP *cpi,
@@ -2919,15 +3006,16 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
configure_static_seg_features(cpi);
}
- // Decide how big to make the frame.
- vp9_pick_frame_size(cpi);
-
vp9_clear_system_state();
- q = vp9_pick_q_and_adjust_q_bounds(cpi, &bottom_index, &top_index);
+ // Decide how big to make the frame.
+ vp9_rc_pick_frame_size_and_bounds(cpi,
+ &frame_under_shoot_limit,
+ &frame_over_shoot_limit);
- vp9_compute_frame_size_bounds(cpi, &frame_under_shoot_limit,
- &frame_over_shoot_limit);
+ q = vp9_rc_pick_q_and_adjust_q_bounds(cpi,
+ &bottom_index,
+ &top_index);
#if CONFIG_MULTIPLE_ARF
// Force the quantizer determined by the coding order pattern.
@@ -2991,7 +3079,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
encode_with_recode_loop(cpi,
size,
dest,
- q,
+ &q,
bottom_index,
top_index,
frame_over_shoot_limit,
@@ -3073,106 +3161,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
* needed in motion search besides loopfilter */
cm->last_frame_type = cm->frame_type;
- // Update rate control heuristics
- cpi->rc.projected_frame_size = (*size) << 3;
-
- // Post encode loop adjustment of Q prediction.
- if (!cpi->rc.active_worst_qchanged)
- vp9_update_rate_correction_factors(cpi, (cpi->sf.recode_loop ||
- cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) ? 2 : 0);
-
-
- cpi->rc.last_q[cm->frame_type] = cm->base_qindex;
-
- // Keep record of last boosted (KF/KF/ARF) Q value.
- // If the current frame is coded at a lower Q then we also update it.
- // If all mbs in this group are skipped only update if the Q value is
- // better than that already stored.
- // This is used to help set quality in forced key frames to reduce popping
- if ((cm->base_qindex < cpi->rc.last_boosted_qindex) ||
- ((cpi->static_mb_pct < 100) &&
- ((cm->frame_type == KEY_FRAME) ||
- cpi->refresh_alt_ref_frame ||
- (cpi->refresh_golden_frame && !cpi->is_src_frame_alt_ref)))) {
- cpi->rc.last_boosted_qindex = cm->base_qindex;
- }
-
- if (cm->frame_type == KEY_FRAME) {
- vp9_adjust_key_frame_context(cpi);
- }
-
- // Keep a record of ambient average Q.
- if (cm->frame_type != KEY_FRAME)
- cpi->rc.avg_frame_qindex = (2 + 3 * cpi->rc.avg_frame_qindex +
- cm->base_qindex) >> 2;
-
- // Keep a record from which we can calculate the average Q excluding GF
- // updates and key frames.
- if (cm->frame_type != KEY_FRAME &&
- !cpi->refresh_golden_frame &&
- !cpi->refresh_alt_ref_frame) {
- cpi->rc.ni_frames++;
- cpi->rc.tot_q += vp9_convert_qindex_to_q(q);
- cpi->rc.avg_q = cpi->rc.tot_q / (double)cpi->rc.ni_frames;
-
- // Calculate the average Q for normal inter frames (not key or GFU frames).
- cpi->rc.ni_tot_qi += q;
- cpi->rc.ni_av_qi = cpi->rc.ni_tot_qi / cpi->rc.ni_frames;
- }
-
- // Update the buffer level variable.
- // Non-viewable frames are a special case and are treated as pure overhead.
- if (!cm->show_frame)
- cpi->rc.bits_off_target -= cpi->rc.projected_frame_size;
- else
- cpi->rc.bits_off_target += cpi->rc.av_per_frame_bandwidth -
- cpi->rc.projected_frame_size;
-
- // Clip the buffer level at the maximum buffer size
- if (cpi->rc.bits_off_target > cpi->oxcf.maximum_buffer_size)
- cpi->rc.bits_off_target = cpi->oxcf.maximum_buffer_size;
-
- // Rolling monitors of whether we are over or underspending used to help
- // regulate min and Max Q in two pass.
- if (cm->frame_type != KEY_FRAME) {
- cpi->rc.rolling_target_bits =
- ((cpi->rc.rolling_target_bits * 3) +
- cpi->rc.this_frame_target + 2) / 4;
- cpi->rc.rolling_actual_bits =
- ((cpi->rc.rolling_actual_bits * 3) +
- cpi->rc.projected_frame_size + 2) / 4;
- cpi->rc.long_rolling_target_bits =
- ((cpi->rc.long_rolling_target_bits * 31) +
- cpi->rc.this_frame_target + 16) / 32;
- cpi->rc.long_rolling_actual_bits =
- ((cpi->rc.long_rolling_actual_bits * 31) +
- cpi->rc.projected_frame_size + 16) / 32;
- }
-
- // Actual bits spent
- cpi->rc.total_actual_bits += cpi->rc.projected_frame_size;
-
- // Debug stats
- cpi->rc.total_target_vs_actual += (cpi->rc.this_frame_target -
- cpi->rc.projected_frame_size);
-
- cpi->rc.buffer_level = cpi->rc.bits_off_target;
-
-#ifndef DISABLE_RC_LONG_TERM_MEM
- // Update bits left to the kf and gf groups to account for overshoot or
- // undershoot on these frames
- if (cm->frame_type == KEY_FRAME) {
- cpi->twopass.kf_group_bits += cpi->rc.this_frame_target -
- cpi->rc.projected_frame_size;
-
- cpi->twopass.kf_group_bits = MAX(cpi->twopass.kf_group_bits, 0);
- } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) {
- cpi->twopass.gf_group_bits += cpi->rc.this_frame_target -
- cpi->rc.projected_frame_size;
-
- cpi->twopass.gf_group_bits = MAX(cpi->twopass.gf_group_bits, 0);
- }
-#endif
+ vp9_rc_postencode_update(cpi, *size, q);
#if 0
output_frame_level_debug_stats(cpi);
@@ -3302,6 +3291,10 @@ static void Pass2Encode(VP9_COMP *cpi, unsigned long *size,
encode_frame_to_data_rate(cpi, size, dest, frame_flags);
// vp9_print_modes_and_motion_vectors(&cpi->common, "encode.stt");
+
+ vp9_twopass_postencode_update(cpi, *size);
+
+ /*
#ifdef DISABLE_RC_LONG_TERM_MEM
cpi->twopass.bits_left -= cpi->rc.this_frame_target;
#else
@@ -3320,6 +3313,7 @@ static void Pass2Encode(VP9_COMP *cpi, unsigned long *size,
cpi->twopass.bits_left += (int64_t)(two_pass_min_rate
/ cpi->oxcf.framerate);
}
+ */
}
static void check_initial_width(VP9_COMP *cpi, YV12_BUFFER_CONFIG *sd) {
@@ -3614,8 +3608,12 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
VP9BORDERINPIXELS);
// Calculate scaling factors for each of the 3 available references
- for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i)
+ for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
vp9_setup_scale_factors(cm, i);
+ if (vp9_is_scaled(&cm->active_ref_scale_comm[i]))
+ vp9_extend_frame_borders(&cm->yv12_fb[cm->active_ref_idx[i]],
+ cm->subsampling_x, cm->subsampling_y);
+ }
vp9_setup_interp_filters(&cpi->mb.e_mbd, DEFAULT_INTERP_FILTER, cm);
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 54af75633..b8602e094 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -30,7 +30,6 @@
#include "vp9/encoder/vp9_lookahead.h"
#define DISABLE_RC_LONG_TERM_MEM 0
-
// #define MODE_TEST_HIT_STATS
// #define SPEEDSTATS 1
@@ -293,6 +292,7 @@ typedef struct {
// Rate targetting variables
int this_frame_target;
int projected_frame_size;
+ int sb64_target_rate;
int last_q[2]; // Separate values for Intra/Inter
int last_boosted_qindex; // Last boosted GF/KF/ARF q
@@ -339,7 +339,6 @@ typedef struct {
int active_worst_quality;
int best_quality;
int active_best_quality;
- int active_worst_qchanged;
} RATE_CONTROL;
typedef struct VP9_COMP {
@@ -431,8 +430,8 @@ typedef struct VP9_COMP {
int rd_thresh_sub8x8[MAX_SEGMENTS][BLOCK_SIZES][MAX_REFS];
int rd_thresh_freq_sub8x8[BLOCK_SIZES][MAX_REFS];
- int64_t rd_comp_pred_diff[NB_PREDICTION_TYPES];
- int64_t rd_prediction_type_threshes[4][NB_PREDICTION_TYPES];
+ int64_t rd_comp_pred_diff[REFERENCE_MODES];
+ int64_t rd_prediction_type_threshes[4][REFERENCE_MODES];
unsigned int intra_inter_count[INTRA_INTER_CONTEXTS][2];
unsigned int comp_inter_count[COMP_INTER_CONTEXTS][2];
unsigned int single_ref_count[REF_CONTEXTS][2][2];
@@ -516,6 +515,8 @@ typedef struct VP9_COMP {
// segment threashold for encode breakout
int segment_encode_breakout[MAX_SEGMENTS];
+ unsigned char *complexity_map;
+
unsigned char *active_map;
unsigned int active_map_enabled;
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index d24be96f6..2591a5783 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -137,45 +137,18 @@ void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
*eob_ptr = eob + 1;
}
-struct plane_block_idx {
- int plane;
- int block;
-};
-
-// TODO(jkoleszar): returning a struct so it can be used in a const context,
-// expect to refactor this further later.
-static INLINE struct plane_block_idx plane_block_idx(int y_blocks,
- int b_idx) {
- const int v_offset = y_blocks * 5 / 4;
- struct plane_block_idx res;
-
- if (b_idx < y_blocks) {
- res.plane = 0;
- res.block = b_idx;
- } else if (b_idx < v_offset) {
- res.plane = 1;
- res.block = b_idx - y_blocks;
- } else {
- assert(b_idx < y_blocks * 3 / 2);
- res.plane = 2;
- res.block = b_idx - v_offset;
- }
- return res;
-}
-
-void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int y_blocks, int b_idx,
+void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
const int16_t *scan, const int16_t *iscan) {
MACROBLOCKD *const xd = &x->e_mbd;
- const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
- struct macroblock_plane* p = &x->plane[pb_idx.plane];
- struct macroblockd_plane* pd = &xd->plane[pb_idx.plane];
+ struct macroblock_plane* p = &x->plane[plane];
+ struct macroblockd_plane* pd = &xd->plane[plane];
- vp9_quantize_b(BLOCK_OFFSET(p->coeff, pb_idx.block),
+ vp9_quantize_b(BLOCK_OFFSET(p->coeff, block),
16, x->skip_block,
p->zbin, p->round, p->quant, p->quant_shift,
- BLOCK_OFFSET(pd->qcoeff, pb_idx.block),
- BLOCK_OFFSET(pd->dqcoeff, pb_idx.block),
- pd->dequant, p->zbin_extra, &pd->eobs[pb_idx.block], scan, iscan);
+ BLOCK_OFFSET(p->qcoeff, block),
+ BLOCK_OFFSET(pd->dqcoeff, block),
+ pd->dequant, p->zbin_extra, &pd->eobs[block], scan, iscan);
}
static void invert_quant(int16_t *quant, int16_t *shift, int d) {
diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h
index c078e1d41..41cfa5283 100644
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -13,7 +13,7 @@
#include "vp9/encoder/vp9_block.h"
-void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int y_blocks, int b_idx,
+void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
const int16_t *scan, const int16_t *iscan);
struct VP9_COMP;
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 1293e860f..bf1fc4f31 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -26,6 +26,8 @@
#include "vp9/common/vp9_quant_common.h"
#include "vp9/common/vp9_seg_common.h"
+#define LIMIT_QRANGE_FOR_ALTREF_AND_KEY 1
+
#define MIN_BPB_FACTOR 0.005
#define MAX_BPB_FACTOR 50
@@ -67,7 +69,7 @@ static int calculate_minq_index(double maxq,
return QINDEX_RANGE - 1;
}
-void vp9_init_minq_luts(void) {
+void vp9_rc_init_minq_luts(void) {
int i;
for (i = 0; i < QINDEX_RANGE; i++) {
@@ -121,22 +123,8 @@ double vp9_convert_qindex_to_q(int qindex) {
return vp9_ac_quant(qindex, 0) / 4.0;
}
-int vp9_gfboost_qadjust(int qindex) {
- const double q = vp9_convert_qindex_to_q(qindex);
- return (int)((0.00000828 * q * q * q) +
- (-0.0055 * q * q) +
- (1.32 * q) + 79.3);
-}
-
-static int kfboost_qadjust(int qindex) {
- const double q = vp9_convert_qindex_to_q(qindex);
- return (int)((0.00000973 * q * q * q) +
- (-0.00613 * q * q) +
- (1.316 * q) + 121.2);
-}
-
-int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex,
- double correction_factor) {
+int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
+ double correction_factor) {
const double q = vp9_convert_qindex_to_q(qindex);
int enumerator = frame_type == KEY_FRAME ? 3300000 : 2250000;
@@ -213,7 +201,7 @@ void vp9_setup_inter_frame(VP9_COMP *cpi) {
static int estimate_bits_at_q(int frame_kind, int q, int mbs,
double correction_factor) {
- const int bpm = (int)(vp9_bits_per_mb(frame_kind, q, correction_factor));
+ const int bpm = (int)(vp9_rc_bits_per_mb(frame_kind, q, correction_factor));
// Attempt to retain reasonable accuracy without overflow. The cutoff is
// chosen such that the maximum product of Bpm and MBs fits 31 bits. The
@@ -240,11 +228,9 @@ static void calc_iframe_target_size(VP9_COMP *cpi) {
if (target > max_rate)
target = max_rate;
}
-
cpi->rc.this_frame_target = target;
}
-
// Do the best we can to define the parameters for the next GF based
// on what information we have available.
//
@@ -300,7 +286,7 @@ static void calc_pframe_target_size(VP9_COMP *cpi) {
}
-void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
+void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
const int q = cpi->common.base_qindex;
int correction_factor = 100;
double rate_correction_factor;
@@ -381,7 +367,7 @@ void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
}
-int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) {
+int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame) {
int q = cpi->rc.active_worst_quality;
int i;
@@ -413,8 +399,8 @@ int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) {
i = cpi->rc.active_best_quality;
do {
- bits_per_mb_at_this_q = (int)vp9_bits_per_mb(cpi->common.frame_type, i,
- correction_factor);
+ bits_per_mb_at_this_q = (int)vp9_rc_bits_per_mb(cpi->common.frame_type, i,
+ correction_factor);
if (bits_per_mb_at_this_q <= target_bits_per_mb) {
if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)
@@ -452,8 +438,9 @@ static int get_active_quality(int q,
return active_best_quality;
}
-int vp9_pick_q_and_adjust_q_bounds(VP9_COMP *cpi,
- int * bottom_index, int * top_index) {
+int vp9_rc_pick_q_and_adjust_q_bounds(VP9_COMP *cpi,
+ int *bottom_index,
+ int *top_index) {
// Set an active best quality and if necessary active worst quality
int q = cpi->rc.active_worst_quality;
VP9_COMMON *const cm = &cpi->common;
@@ -472,7 +459,12 @@ int vp9_pick_q_and_adjust_q_bounds(VP9_COMP *cpi,
(last_boosted_q * 0.75));
cpi->rc.active_best_quality = MAX(qindex + delta_qindex,
- cpi->rc.best_quality);
+ cpi->rc.best_quality);
+ } else if (cpi->pass == 0 && cpi->common.current_video_frame == 0) {
+ // If this is the first (key) frame in 1-pass, active best/worst is
+ // the user best/worst-allowed, and leave the top_index to active_worst.
+ cpi->rc.active_best_quality = cpi->oxcf.best_allowed_q;
+ cpi->rc.active_worst_quality = cpi->oxcf.worst_allowed_q;
} else {
int high = 5000;
int low = 400;
@@ -481,9 +473,9 @@ int vp9_pick_q_and_adjust_q_bounds(VP9_COMP *cpi,
// Baseline value derived from cpi->active_worst_quality and kf boost
cpi->rc.active_best_quality = get_active_quality(q, cpi->rc.kf_boost,
- low, high,
- kf_low_motion_minq,
- kf_high_motion_minq);
+ low, high,
+ kf_low_motion_minq,
+ kf_high_motion_minq);
// Allow somewhat lower kf minq with small image formats.
if ((cm->width * cm->height) <= (352 * 288)) {
@@ -524,14 +516,14 @@ int vp9_pick_q_and_adjust_q_bounds(VP9_COMP *cpi,
q = cpi->cq_target_quality;
if (cpi->frames_since_key > 1) {
cpi->rc.active_best_quality = get_active_quality(q, cpi->rc.gfu_boost,
- low, high,
- afq_low_motion_minq,
- afq_high_motion_minq);
+ low, high,
+ afq_low_motion_minq,
+ afq_high_motion_minq);
} else {
cpi->rc.active_best_quality = get_active_quality(q, cpi->rc.gfu_boost,
- low, high,
- gf_low_motion_minq,
- gf_high_motion_minq);
+ low, high,
+ gf_low_motion_minq,
+ gf_high_motion_minq);
}
// Constrained quality use slightly lower active best.
cpi->rc.active_best_quality = cpi->rc.active_best_quality * 15 / 16;
@@ -541,22 +533,19 @@ int vp9_pick_q_and_adjust_q_bounds(VP9_COMP *cpi,
cpi->rc.active_best_quality = cpi->cq_target_quality;
} else {
if (cpi->frames_since_key > 1) {
- cpi->rc.active_best_quality = get_active_quality(q, cpi->rc.gfu_boost,
- low, high,
- afq_low_motion_minq,
- afq_high_motion_minq);
+ cpi->rc.active_best_quality = get_active_quality(
+ q, cpi->rc.gfu_boost, low, high,
+ afq_low_motion_minq, afq_high_motion_minq);
} else {
- cpi->rc.active_best_quality = get_active_quality(q, cpi->rc.gfu_boost,
- low, high,
- gf_low_motion_minq,
- gf_high_motion_minq);
+ cpi->rc.active_best_quality = get_active_quality(
+ q, cpi->rc.gfu_boost, low, high,
+ gf_low_motion_minq, gf_high_motion_minq);
}
}
} else {
- cpi->rc.active_best_quality = get_active_quality(q, cpi->rc.gfu_boost,
- low, high,
- gf_low_motion_minq,
- gf_high_motion_minq);
+ cpi->rc.active_best_quality = get_active_quality(
+ q, cpi->rc.gfu_boost, low, high,
+ gf_low_motion_minq, gf_high_motion_minq);
}
} else {
if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
@@ -596,25 +585,23 @@ int vp9_pick_q_and_adjust_q_bounds(VP9_COMP *cpi,
if (cpi->rc.active_worst_quality < cpi->rc.active_best_quality)
cpi->rc.active_worst_quality = cpi->rc.active_best_quality;
+ *top_index = cpi->rc.active_worst_quality;
+ *bottom_index = cpi->rc.active_best_quality;
+
+#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
// Limit Q range for the adaptive loop.
if (cm->frame_type == KEY_FRAME && !cpi->this_key_frame_forced) {
- *top_index =
- (cpi->rc.active_worst_quality + cpi->rc.active_best_quality * 3) / 4;
- // If this is the first (key) frame in 1-pass, active best is the user
- // best-allowed, and leave the top_index to active_worst.
- if (cpi->pass == 0 && cpi->common.current_video_frame == 0) {
- cpi->rc.active_best_quality = cpi->oxcf.best_allowed_q;
- *top_index = cpi->oxcf.worst_allowed_q;
+ if (!(cpi->pass == 0 && cpi->common.current_video_frame == 0)) {
+ *top_index =
+ (cpi->rc.active_worst_quality + cpi->rc.active_best_quality * 3) / 4;
}
} else if (!cpi->is_src_frame_alt_ref &&
(cpi->oxcf.end_usage != USAGE_STREAM_FROM_SERVER) &&
(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
*top_index =
(cpi->rc.active_worst_quality + cpi->rc.active_best_quality) / 2;
- } else {
- *top_index = cpi->rc.active_worst_quality;
}
- *bottom_index = cpi->rc.active_best_quality;
+#endif
if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
q = cpi->rc.active_best_quality;
@@ -627,14 +614,13 @@ int vp9_pick_q_and_adjust_q_bounds(VP9_COMP *cpi,
// 1-pass: for now, use per-frame-bw for target size of frame, scaled
// by |x| for key frame.
int scale = (cm->frame_type == KEY_FRAME) ? 5 : 1;
- q = vp9_regulate_q(cpi, scale * cpi->rc.av_per_frame_bandwidth);
+ q = vp9_rc_regulate_q(cpi, scale * cpi->rc.av_per_frame_bandwidth);
} else {
- q = vp9_regulate_q(cpi, cpi->rc.this_frame_target);
+ q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target);
}
if (q > *top_index)
q = *top_index;
}
-
return q;
}
@@ -686,7 +672,7 @@ static int estimate_keyframe_frequency(VP9_COMP *cpi) {
}
-void vp9_adjust_key_frame_context(VP9_COMP *cpi) {
+static void adjust_key_frame_context(VP9_COMP *cpi) {
// Clear down mmx registers to allow floating point in what follows
vp9_clear_system_state();
@@ -695,28 +681,30 @@ void vp9_adjust_key_frame_context(VP9_COMP *cpi) {
}
-void vp9_compute_frame_size_bounds(VP9_COMP *cpi, int *frame_under_shoot_limit,
- int *frame_over_shoot_limit) {
+static void compute_frame_size_bounds(const VP9_COMP *cpi,
+ int this_frame_target,
+ int *frame_under_shoot_limit,
+ int *frame_over_shoot_limit) {
// Set-up bounds on acceptable frame size:
if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
*frame_under_shoot_limit = 0;
*frame_over_shoot_limit = INT_MAX;
} else {
if (cpi->common.frame_type == KEY_FRAME) {
- *frame_over_shoot_limit = cpi->rc.this_frame_target * 9 / 8;
- *frame_under_shoot_limit = cpi->rc.this_frame_target * 7 / 8;
+ *frame_over_shoot_limit = this_frame_target * 9 / 8;
+ *frame_under_shoot_limit = this_frame_target * 7 / 8;
} else {
if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) {
- *frame_over_shoot_limit = cpi->rc.this_frame_target * 9 / 8;
- *frame_under_shoot_limit = cpi->rc.this_frame_target * 7 / 8;
+ *frame_over_shoot_limit = this_frame_target * 9 / 8;
+ *frame_under_shoot_limit = this_frame_target * 7 / 8;
} else {
        // Strong overshoot limit for constrained quality
if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
- *frame_over_shoot_limit = cpi->rc.this_frame_target * 11 / 8;
- *frame_under_shoot_limit = cpi->rc.this_frame_target * 2 / 8;
+ *frame_over_shoot_limit = this_frame_target * 11 / 8;
+ *frame_under_shoot_limit = this_frame_target * 2 / 8;
} else {
- *frame_over_shoot_limit = cpi->rc.this_frame_target * 11 / 8;
- *frame_under_shoot_limit = cpi->rc.this_frame_target * 5 / 8;
+ *frame_over_shoot_limit = this_frame_target * 11 / 8;
+ *frame_under_shoot_limit = this_frame_target * 5 / 8;
}
}
}
@@ -731,9 +719,10 @@ void vp9_compute_frame_size_bounds(VP9_COMP *cpi, int *frame_under_shoot_limit,
}
}
-
// return of 0 means drop frame
-int vp9_pick_frame_size(VP9_COMP *cpi) {
+int vp9_rc_pick_frame_size_and_bounds(VP9_COMP *cpi,
+ int *frame_under_shoot_limit,
+ int *frame_over_shoot_limit) {
VP9_COMMON *cm = &cpi->common;
if (cm->frame_type == KEY_FRAME)
@@ -741,5 +730,112 @@ int vp9_pick_frame_size(VP9_COMP *cpi) {
else
calc_pframe_target_size(cpi);
+  // Target rate per SB64 (including partial SB64s).
+ cpi->rc.sb64_target_rate = ((int64_t)cpi->rc.this_frame_target * 64 * 64) /
+ (cpi->common.width * cpi->common.height);
+ compute_frame_size_bounds(cpi, cpi->rc.this_frame_target,
+ frame_under_shoot_limit, frame_over_shoot_limit);
+
return 1;
}
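
A note on the per-SB64 target added in this hunk: the rate is simply the frame
target prorated by 64x64 pixel area, with partial superblocks at the frame edge
receiving the same prorated share. A minimal standalone sketch of that
arithmetic (the function name is only a stand-in):

    #include <stdint.h>

    /* Prorate the frame's bit target to one 64x64 superblock, mirroring the
       integer math in the hunk above. */
    static int sb64_target_rate_sketch(int frame_target_bits, int width,
                                       int height) {
      return (int)(((int64_t)frame_target_bits * 64 * 64) / (width * height));
    }
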
+
+void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used, int q) {
+ VP9_COMMON *const cm = &cpi->common;
+ // Update rate control heuristics
+ cpi->rc.projected_frame_size = (bytes_used << 3);
+
+ // Post encode loop adjustment of Q prediction.
+ vp9_rc_update_rate_correction_factors(
+ cpi, (cpi->sf.recode_loop ||
+ cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) ? 2 : 0);
+
+ cpi->rc.last_q[cm->frame_type] = cm->base_qindex;
+
+  // Keep a record of the last boosted (KF/GF/ARF) Q value.
+  // If the current frame is coded at a lower Q, then we also update it.
+  // If all mbs in this group are skipped, only update if the Q value is
+  // better than that already stored.
+  // This is used to help set quality in forced key frames to reduce popping.
+ if ((cm->base_qindex < cpi->rc.last_boosted_qindex) ||
+ ((cpi->static_mb_pct < 100) &&
+ ((cm->frame_type == KEY_FRAME) || cpi->refresh_alt_ref_frame ||
+ (cpi->refresh_golden_frame && !cpi->is_src_frame_alt_ref)))) {
+ cpi->rc.last_boosted_qindex = cm->base_qindex;
+ }
+
+ if (cm->frame_type == KEY_FRAME) {
+ adjust_key_frame_context(cpi);
+ }
+
+ // Keep a record of ambient average Q.
+ if (cm->frame_type != KEY_FRAME)
+ cpi->rc.avg_frame_qindex = (2 + 3 * cpi->rc.avg_frame_qindex +
+ cm->base_qindex) >> 2;
+
+ // Keep a record from which we can calculate the average Q excluding GF
+ // updates and key frames.
+ if (cm->frame_type != KEY_FRAME &&
+ !cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) {
+ cpi->rc.ni_frames++;
+ cpi->rc.tot_q += vp9_convert_qindex_to_q(q);
+ cpi->rc.avg_q = cpi->rc.tot_q / (double)cpi->rc.ni_frames;
+
+ // Calculate the average Q for normal inter frames (not key or GFU frames).
+ cpi->rc.ni_tot_qi += q;
+ cpi->rc.ni_av_qi = cpi->rc.ni_tot_qi / cpi->rc.ni_frames;
+ }
+
+ // Update the buffer level variable.
+ // Non-viewable frames are a special case and are treated as pure overhead.
+ if (!cm->show_frame)
+ cpi->rc.bits_off_target -= cpi->rc.projected_frame_size;
+ else
+ cpi->rc.bits_off_target += cpi->rc.av_per_frame_bandwidth -
+ cpi->rc.projected_frame_size;
+
+ // Clip the buffer level at the maximum buffer size
+ if (cpi->rc.bits_off_target > cpi->oxcf.maximum_buffer_size)
+ cpi->rc.bits_off_target = cpi->oxcf.maximum_buffer_size;
+
+  // Rolling monitors of whether we are over- or under-spending, used to help
+  // regulate min and max Q in two-pass encoding.
+ if (cm->frame_type != KEY_FRAME) {
+ cpi->rc.rolling_target_bits =
+ ((cpi->rc.rolling_target_bits * 3) +
+ cpi->rc.this_frame_target + 2) / 4;
+ cpi->rc.rolling_actual_bits =
+ ((cpi->rc.rolling_actual_bits * 3) +
+ cpi->rc.projected_frame_size + 2) / 4;
+ cpi->rc.long_rolling_target_bits =
+ ((cpi->rc.long_rolling_target_bits * 31) +
+ cpi->rc.this_frame_target + 16) / 32;
+ cpi->rc.long_rolling_actual_bits =
+ ((cpi->rc.long_rolling_actual_bits * 31) +
+ cpi->rc.projected_frame_size + 16) / 32;
+ }
+
+ // Actual bits spent
+ cpi->rc.total_actual_bits += cpi->rc.projected_frame_size;
+
+ // Debug stats
+ cpi->rc.total_target_vs_actual += (cpi->rc.this_frame_target -
+ cpi->rc.projected_frame_size);
+
+ cpi->rc.buffer_level = cpi->rc.bits_off_target;
+
+#ifndef DISABLE_RC_LONG_TERM_MEM
+  // Update bits left to the kf and gf groups to account for overshoot or
+  // undershoot on these frames.
+ if (cm->frame_type == KEY_FRAME) {
+ cpi->twopass.kf_group_bits += cpi->rc.this_frame_target -
+ cpi->rc.projected_frame_size;
+
+ cpi->twopass.kf_group_bits = MAX(cpi->twopass.kf_group_bits, 0);
+ } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) {
+ cpi->twopass.gf_group_bits += cpi->rc.this_frame_target -
+ cpi->rc.projected_frame_size;
+
+ cpi->twopass.gf_group_bits = MAX(cpi->twopass.gf_group_bits, 0);
+ }
+#endif
+}
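
The rolling over/under-spend monitors updated above are integer exponential
moving averages with round-to-nearest: the short window keeps 3/4 of the
previous value and the long window keeps 31/32. A minimal sketch of the two
update rules, with stand-in names:

    /* Integer EMA updates with round-to-nearest, matching the weights used for
       the rolling target/actual bit monitors above. */
    static int ema_quarter(int avg, int sample) {
      return (3 * avg + sample + 2) / 4;      /* new sample weighted 1/4 */
    }
    static int ema_thirtysecond(int avg, int sample) {
      return (31 * avg + sample + 16) / 32;   /* new sample weighted 1/32 */
    }
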
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index 57dcd3f15..f01d18672 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -20,24 +20,41 @@ void vp9_save_coding_context(VP9_COMP *cpi);
void vp9_restore_coding_context(VP9_COMP *cpi);
void vp9_setup_key_frame(VP9_COMP *cpi);
-void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var);
-int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame);
-void vp9_adjust_key_frame_context(VP9_COMP *cpi);
-void vp9_compute_frame_size_bounds(VP9_COMP *cpi,
- int *frame_under_shoot_limit,
- int *frame_over_shoot_limit);
+void vp9_setup_inter_frame(VP9_COMP *cpi);
-void vp9_init_minq_luts(void);
+double vp9_convert_qindex_to_q(int qindex);
-// return of 0 means drop frame
-int vp9_pick_frame_size(VP9_COMP *cpi);
+// Updates rate correction factors
+void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var);
-double vp9_convert_qindex_to_q(int qindex);
-int vp9_gfboost_qadjust(int qindex);
-int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex,
- double correction_factor);
-void vp9_setup_inter_frame(VP9_COMP *cpi);
-int vp9_pick_q_and_adjust_q_bounds(VP9_COMP *cpi,
- int * bottom_index, int * top_index);
+// Initializes the minq LUTs
+void vp9_rc_init_minq_luts(void);
+
+// return of 0 means drop frame
+// Changes rc.this_frame_target and rc.sb64_target_rate
+int vp9_rc_pick_frame_size_and_bounds(VP9_COMP *cpi,
+ int *frame_under_shoot_limit,
+ int *frame_over_shoot_limit);
+// Picks q and q bounds given the target number of bits for the frame
+int vp9_rc_pick_q_and_adjust_q_bounds(VP9_COMP *cpi,
+                                      int *bottom_index,
+                                      int *top_index);
+
+// Estimates q to achieve a target bits per frame
+int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame);
+
+// Post encode update of the rate control parameters based
+// on bytes used and q used for the frame
+void vp9_rc_postencode_update(VP9_COMP *cpi,
+ uint64_t bytes_used,
+ int q_used);
+
+// Estimates bits per mb for a given qindex and correction factor
+int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
+ double correction_factor);
+
+// Post encode update of the rate control parameters for 2-pass
+void vp9_twopass_postencode_update(VP9_COMP *cpi,
+ uint64_t bytes_used);
#endif // VP9_ENCODER_VP9_RATECTRL_H_
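
For orientation only, a hedged sketch of how a caller might sequence the
renamed entry points. Only the three vp9_rc_* prototypes come from the header
above; the wrapper function, the variable names and the encode_frame_to_buffer()
helper are hypothetical and will not compile outside the encoder:

    /* Hypothetical calling order; encode_frame_to_buffer() is a placeholder. */
    static void encode_one_frame_sketch(VP9_COMP *cpi) {
      int under_limit, over_limit, bottom_index, top_index, q;
      uint64_t bytes_used;

      if (!vp9_rc_pick_frame_size_and_bounds(cpi, &under_limit, &over_limit))
        return;  /* a return of 0 means the frame is dropped */
      q = vp9_rc_pick_q_and_adjust_q_bounds(cpi, &bottom_index, &top_index);
      bytes_used = encode_frame_to_buffer(cpi, q);  /* hypothetical helper */
      vp9_rc_postencode_update(cpi, bytes_used, q);
    }
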
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index fde84298f..65cf5c797 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -17,7 +17,6 @@
#include "vp9/encoder/vp9_tokenize.h"
#include "vp9/encoder/vp9_treewriter.h"
#include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/encoder/vp9_modecosts.h"
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_reconintra.h"
@@ -114,6 +113,43 @@ static int rd_thresh_block_size_factor[BLOCK_SIZES] =
#define MV_COST_WEIGHT 108
#define MV_COST_WEIGHT_SUB 120
+static int raster_block_offset(BLOCK_SIZE plane_bsize,
+ int raster_block, int stride) {
+ const int bw = b_width_log2(plane_bsize);
+ const int y = 4 * (raster_block >> bw);
+ const int x = 4 * (raster_block & ((1 << bw) - 1));
+ return y * stride + x;
+}
+static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize,
+ int raster_block, int16_t *base) {
+ const int stride = 4 << b_width_log2(plane_bsize);
+ return base + raster_block_offset(plane_bsize, raster_block, stride);
+}
+
+static void fill_mode_costs(VP9_COMP *c) {
+ VP9_COMMON *const cm = &c->common;
+ int i, j;
+
+ for (i = 0; i < INTRA_MODES; i++)
+ for (j = 0; j < INTRA_MODES; j++)
+ vp9_cost_tokens((int *)c->mb.y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j],
+ vp9_intra_mode_tree);
+
+ // TODO(rbultje) separate tables for superblock costing?
+ vp9_cost_tokens(c->mb.mbmode_cost, cm->fc.y_mode_prob[1],
+ vp9_intra_mode_tree);
+ vp9_cost_tokens(c->mb.intra_uv_mode_cost[1],
+ cm->fc.uv_mode_prob[INTRA_MODES - 1], vp9_intra_mode_tree);
+ vp9_cost_tokens(c->mb.intra_uv_mode_cost[0],
+ vp9_kf_uv_mode_prob[INTRA_MODES - 1],
+ vp9_intra_mode_tree);
+
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+ vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
+ cm->fc.switchable_interp_prob[i],
+ vp9_switchable_interp_tree);
+}
+
static void fill_token_costs(vp9_coeff_cost *c,
vp9_coeff_probs_model (*p)[BLOCK_TYPES]) {
int i, j, k, l;
@@ -247,7 +283,7 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) {
cpi->mb.select_txfm_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
cm->frame_type != KEY_FRAME) ?
- 0 : 1;
+ 0 : 1;
set_block_thresholds(cpi);
@@ -258,7 +294,7 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) {
vp9_partition_tree);
/*rough estimate for costing*/
- vp9_init_mode_costs(cpi);
+ fill_mode_costs(cpi);
if (!frame_is_intra_only(cm)) {
vp9_build_nmv_cost_table(
@@ -267,15 +303,9 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) {
&cm->fc.nmvc,
cm->allow_high_precision_mv, 1, 1);
- for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
- MB_PREDICTION_MODE m;
-
- for (m = NEARESTMV; m < MB_MODE_COUNT; m++)
- cpi->mb.inter_mode_cost[i][INTER_OFFSET(m)] =
- cost_token(vp9_inter_mode_tree,
- cm->fc.inter_mode_probs[i],
- &vp9_inter_mode_encodings[INTER_OFFSET(m)]);
- }
+ for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+ vp9_cost_tokens((int *)cpi->mb.inter_mode_cost[i],
+ cm->fc.inter_mode_probs[i], vp9_inter_mode_tree);
}
}
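
The switch from per-mode cost_token() calls to vp9_cost_tokens() over the whole
inter-mode tree fills each cost table in one traversal. The following is an
illustrative standalone model of tree costing, not the libvpx implementation;
the bit_cost() formula and its 1/256-bit scaling are assumptions (libvpx uses a
precomputed table), and probabilities are assumed to lie in 1..255:

    #include <math.h>
    #include <stdint.h>

    typedef int8_t tree_index_sketch;  /* <= 0: token leaf, > 0: next node pair */

    /* Approximate cost of one binary decision in 1/256-bit units. */
    static int bit_cost(uint8_t prob_of_zero, int bit) {
      const double p = bit ? (256 - prob_of_zero) / 256.0 : prob_of_zero / 256.0;
      return (int)(-256.0 * log(p) / log(2.0) + 0.5);
    }

    /* Walk the tree, accumulating the cost of every branch decision and storing
       the total at each leaf (leaves are encoded as non-positive entries). */
    static void cost_tokens_sketch(int *costs, const uint8_t *probs,
                                   const tree_index_sketch *tree,
                                   int node, int cost) {
      int bit;
      for (bit = 0; bit < 2; ++bit) {
        const tree_index_sketch child = tree[node + bit];
        const int c = cost + bit_cost(probs[node >> 1], bit);
        if (child <= 0)
          costs[-child] = c;
        else
          cost_tokens_sketch(costs, probs, tree, child, c);
      }
    }
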
@@ -491,11 +521,12 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
const int16_t *scan, const int16_t *nb) {
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+ struct macroblock_plane *p = &x->plane[plane];
struct macroblockd_plane *pd = &xd->plane[plane];
const PLANE_TYPE type = pd->plane_type;
const int16_t *band_count = &band_counts[tx_size][1];
const int eob = pd->eobs[block];
- const int16_t *const qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
+ const int16_t *const qcoeff_ptr = BLOCK_OFFSET(p->qcoeff, block);
const int ref = mbmi->ref_frame[0] != INTRA_FRAME;
unsigned int (*token_costs)[2][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
x->token_costs[tx_size][type][ref];
@@ -588,8 +619,8 @@ static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize,
args->scan, args->nb);
}
-static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
- TX_SIZE tx_size, void *arg) {
+static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
struct rdcost_block_args *args = arg;
MACROBLOCK *const x = args->x;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -698,7 +729,7 @@ static void txfm_rd_in_plane(MACROBLOCK *x,
rd_stack->nb = so->neighbors;
foreach_transformed_block_in_plane(xd, bsize, plane,
- block_yrd_txfm, rd_stack);
+ block_rd_txfm, rd_stack);
if (rd_stack->skip) {
*rate = INT_MAX;
*distortion = INT64_MAX;
@@ -745,59 +776,42 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
int64_t rd[TX_SIZES][2];
int n, m;
int s0, s1;
+ const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
+ int64_t best_rd = INT64_MAX;
+ TX_SIZE best_tx = TX_4X4;
const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs);
-
- for (n = TX_4X4; n <= max_tx_size; n++) {
- r[n][1] = r[n][0];
- if (r[n][0] == INT_MAX)
- continue;
- for (m = 0; m <= n - (n == max_tx_size); m++) {
- if (m == n)
- r[n][1] += vp9_cost_zero(tx_probs[m]);
- else
- r[n][1] += vp9_cost_one(tx_probs[m]);
- }
- }
-
assert(skip_prob > 0);
s0 = vp9_cost_bit(skip_prob, 0);
s1 = vp9_cost_bit(skip_prob, 1);
for (n = TX_4X4; n <= max_tx_size; n++) {
+ r[n][1] = r[n][0];
+ if (r[n][0] < INT_MAX) {
+ for (m = 0; m <= n - (n == max_tx_size); m++) {
+ if (m == n)
+ r[n][1] += vp9_cost_zero(tx_probs[m]);
+ else
+ r[n][1] += vp9_cost_one(tx_probs[m]);
+ }
+ }
if (d[n] == INT64_MAX) {
rd[n][0] = rd[n][1] = INT64_MAX;
- continue;
- }
- if (s[n]) {
+ } else if (s[n]) {
rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
} else {
rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
}
- }
- if (max_tx_size == TX_32X32 &&
- (cm->tx_mode == ALLOW_32X32 ||
- (cm->tx_mode == TX_MODE_SELECT &&
- rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
- rd[TX_32X32][1] < rd[TX_4X4][1]))) {
- mbmi->tx_size = TX_32X32;
- } else if (max_tx_size >= TX_16X16 &&
- (cm->tx_mode == ALLOW_16X16 ||
- cm->tx_mode == ALLOW_32X32 ||
- (cm->tx_mode == TX_MODE_SELECT &&
- rd[TX_16X16][1] < rd[TX_8X8][1] &&
- rd[TX_16X16][1] < rd[TX_4X4][1]))) {
- mbmi->tx_size = TX_16X16;
- } else if (cm->tx_mode == ALLOW_8X8 ||
- cm->tx_mode == ALLOW_16X16 ||
- cm->tx_mode == ALLOW_32X32 ||
- (cm->tx_mode == TX_MODE_SELECT && rd[TX_8X8][1] < rd[TX_4X4][1])) {
- mbmi->tx_size = TX_8X8;
- } else {
- mbmi->tx_size = TX_4X4;
+ if (rd[n][1] < best_rd) {
+ best_tx = n;
+ best_rd = rd[n][1];
+ }
}
+ mbmi->tx_size = cm->tx_mode == TX_MODE_SELECT ?
+ best_tx : MIN(max_tx_size, max_mode_tx_size);
+
*distortion = d[mbmi->tx_size];
*rate = r[mbmi->tx_size][cm->tx_mode == TX_MODE_SELECT];
@@ -807,29 +821,18 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
tx_cache[ALLOW_8X8] = rd[TX_8X8][0];
tx_cache[ALLOW_16X16] = rd[MIN(max_tx_size, TX_16X16)][0];
tx_cache[ALLOW_32X32] = rd[MIN(max_tx_size, TX_32X32)][0];
- if (max_tx_size == TX_32X32 &&
- rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
- rd[TX_32X32][1] < rd[TX_4X4][1])
- tx_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
- else if (max_tx_size >= TX_16X16 &&
- rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1])
- tx_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
- else
- tx_cache[TX_MODE_SELECT] = rd[TX_4X4][1] < rd[TX_8X8][1] ?
- rd[TX_4X4][1] : rd[TX_8X8][1];
- if (max_tx_size == TX_32X32 &&
- rd[TX_32X32][1] < rd[TX_16X16][1] &&
- rd[TX_32X32][1] < rd[TX_8X8][1] &&
- rd[TX_32X32][1] < rd[TX_4X4][1]) {
+ if (max_tx_size == TX_32X32 && best_tx == TX_32X32) {
+ tx_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
cpi->tx_stepdown_count[0]++;
- } else if (max_tx_size >= TX_16X16 &&
- rd[TX_16X16][1] < rd[TX_8X8][1] &&
- rd[TX_16X16][1] < rd[TX_4X4][1]) {
+ } else if (max_tx_size >= TX_16X16 && best_tx == TX_16X16) {
+ tx_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
cpi->tx_stepdown_count[max_tx_size - TX_16X16]++;
} else if (rd[TX_8X8][1] < rd[TX_4X4][1]) {
+ tx_cache[TX_MODE_SELECT] = rd[TX_8X8][1];
cpi->tx_stepdown_count[max_tx_size - TX_8X8]++;
} else {
+ tx_cache[TX_MODE_SELECT] = rd[TX_4X4][1];
cpi->tx_stepdown_count[max_tx_size - TX_4X4]++;
}
}
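
The rewrite above collapses the cascaded tx-size comparisons into a single
argmin over rd[n][1], followed by a clamp when the transform mode is fixed. A
minimal self-contained sketch of that selection rule (the enum and function are
placeholders, not the vp9 definitions):

    #include <stdint.h>

    typedef enum { SK_TX_4X4, SK_TX_8X8, SK_TX_16X16, SK_TX_32X32 } SkTxSize;

    static SkTxSize pick_tx_size_sketch(const int64_t rd1[4],
                                        SkTxSize max_tx_size,
                                        SkTxSize max_mode_tx_size,
                                        int tx_mode_is_select) {
      SkTxSize n, best_tx = SK_TX_4X4;
      int64_t best_rd = INT64_MAX;
      for (n = SK_TX_4X4; n <= max_tx_size; ++n) {
        if (rd1[n] < best_rd) {
          best_rd = rd1[n];
          best_tx = n;
        }
      }
      /* With TX_MODE_SELECT the RD winner is used directly; otherwise the size
         is capped by both the block's and the mode's largest transform. */
      if (tx_mode_is_select)
        return best_tx;
      return max_tx_size < max_mode_tx_size ? max_tx_size : max_mode_tx_size;
    }
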
@@ -849,14 +852,17 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
int n, m;
int s0, s1;
double scale_rd[TX_SIZES] = {1.73, 1.44, 1.20, 1.00};
- // double scale_r[TX_SIZES] = {2.82, 2.00, 1.41, 1.00};
+ const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
+ int64_t best_rd = INT64_MAX;
+ TX_SIZE best_tx = TX_4X4;
const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs);
-
- // for (n = TX_4X4; n <= max_txfm_size; n++)
- // r[n][0] = (r[n][0] * scale_r[n]);
+ assert(skip_prob > 0);
+ s0 = vp9_cost_bit(skip_prob, 0);
+ s1 = vp9_cost_bit(skip_prob, 1);
for (n = TX_4X4; n <= max_tx_size; n++) {
+ double scale = scale_rd[n];
r[n][1] = r[n][0];
for (m = 0; m <= n - (n == max_tx_size); m++) {
if (m == n)
@@ -864,62 +870,29 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
else
r[n][1] += vp9_cost_one(tx_probs[m]);
}
- }
-
- assert(skip_prob > 0);
- s0 = vp9_cost_bit(skip_prob, 0);
- s1 = vp9_cost_bit(skip_prob, 1);
-
- for (n = TX_4X4; n <= max_tx_size; n++) {
if (s[n]) {
- rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
+ rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]) * scale;
} else {
- rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
- rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
+ rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]) * scale;
+ rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]) * scale;
+ }
+ if (rd[n][1] < best_rd) {
+ best_rd = rd[n][1];
+ best_tx = n;
}
- }
- for (n = TX_4X4; n <= max_tx_size; n++) {
- rd[n][0] = (int64_t)(scale_rd[n] * rd[n][0]);
- rd[n][1] = (int64_t)(scale_rd[n] * rd[n][1]);
}
- if (max_tx_size == TX_32X32 &&
- (cm->tx_mode == ALLOW_32X32 ||
- (cm->tx_mode == TX_MODE_SELECT &&
- rd[TX_32X32][1] <= rd[TX_16X16][1] &&
- rd[TX_32X32][1] <= rd[TX_8X8][1] &&
- rd[TX_32X32][1] <= rd[TX_4X4][1]))) {
- mbmi->tx_size = TX_32X32;
- } else if (max_tx_size >= TX_16X16 &&
- (cm->tx_mode == ALLOW_16X16 ||
- cm->tx_mode == ALLOW_32X32 ||
- (cm->tx_mode == TX_MODE_SELECT &&
- rd[TX_16X16][1] <= rd[TX_8X8][1] &&
- rd[TX_16X16][1] <= rd[TX_4X4][1]))) {
- mbmi->tx_size = TX_16X16;
- } else if (cm->tx_mode == ALLOW_8X8 ||
- cm->tx_mode == ALLOW_16X16 ||
- cm->tx_mode == ALLOW_32X32 ||
- (cm->tx_mode == TX_MODE_SELECT &&
- rd[TX_8X8][1] <= rd[TX_4X4][1])) {
- mbmi->tx_size = TX_8X8;
- } else {
- mbmi->tx_size = TX_4X4;
- }
+ mbmi->tx_size = cm->tx_mode == TX_MODE_SELECT ?
+ best_tx : MIN(max_tx_size, max_mode_tx_size);
// Actually encode using the chosen mode if a model was used, but do not
// update the r, d costs
txfm_rd_in_plane(x, &cpi->rdcost_stack, rate, distortion, skip,
&sse[mbmi->tx_size], ref_best_rd, 0, bs, mbmi->tx_size);
- if (max_tx_size == TX_32X32 &&
- rd[TX_32X32][1] <= rd[TX_16X16][1] &&
- rd[TX_32X32][1] <= rd[TX_8X8][1] &&
- rd[TX_32X32][1] <= rd[TX_4X4][1]) {
+ if (max_tx_size == TX_32X32 && best_tx == TX_32X32) {
cpi->tx_stepdown_count[0]++;
- } else if (max_tx_size >= TX_16X16 &&
- rd[TX_16X16][1] <= rd[TX_8X8][1] &&
- rd[TX_16X16][1] <= rd[TX_4X4][1]) {
+ } else if (max_tx_size >= TX_16X16 && best_tx == TX_16X16) {
cpi->tx_stepdown_count[max_tx_size - TX_16X16]++;
} else if (rd[TX_8X8][1] <= rd[TX_4X4][1]) {
cpi->tx_stepdown_count[max_tx_size - TX_8X8]++;
@@ -939,6 +912,9 @@ static void super_block_yrd(VP9_COMP *cpi,
MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
struct rdcost_block_args *rdcost_stack = &cpi->rdcost_stack;
const int b_inter_mode = is_inter_block(mbmi);
+ const TX_SIZE max_tx_size = max_txsize_lookup[bs];
+ TX_SIZE tx_size;
+
assert(bs == mbmi->sb_type);
if (b_inter_mode)
@@ -957,34 +933,16 @@ static void super_block_yrd(VP9_COMP *cpi,
if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER &&
b_inter_mode) {
- if (bs >= BLOCK_32X32)
- model_rd_for_sb_y_tx(cpi, bs, TX_32X32, x, xd,
- &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);
- if (bs >= BLOCK_16X16)
- model_rd_for_sb_y_tx(cpi, bs, TX_16X16, x, xd,
- &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]);
-
- model_rd_for_sb_y_tx(cpi, bs, TX_8X8, x, xd,
- &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]);
-
- model_rd_for_sb_y_tx(cpi, bs, TX_4X4, x, xd,
- &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]);
-
+ for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
+ model_rd_for_sb_y_tx(cpi, bs, tx_size, x, xd,
+ &r[tx_size][0], &d[tx_size], &s[tx_size]);
choose_txfm_size_from_modelrd(cpi, x, r, rate, d, distortion, s,
skip, sse, ref_best_rd, bs);
} else {
- if (bs >= BLOCK_32X32)
- txfm_rd_in_plane(x, rdcost_stack, &r[TX_32X32][0], &d[TX_32X32],
- &s[TX_32X32], &sse[TX_32X32],
- ref_best_rd, 0, bs, TX_32X32);
- if (bs >= BLOCK_16X16)
- txfm_rd_in_plane(x, rdcost_stack, &r[TX_16X16][0], &d[TX_16X16],
- &s[TX_16X16], &sse[TX_16X16],
- ref_best_rd, 0, bs, TX_16X16);
- txfm_rd_in_plane(x, rdcost_stack, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],
- &sse[TX_8X8], ref_best_rd, 0, bs, TX_8X8);
- txfm_rd_in_plane(x, rdcost_stack, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],
- &sse[TX_4X4], ref_best_rd, 0, bs, TX_4X4);
+ for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
+ txfm_rd_in_plane(x, rdcost_stack, &r[tx_size][0], &d[tx_size],
+ &s[tx_size], &sse[tx_size],
+ ref_best_rd, 0, bs, tx_size);
choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
skip, txfm_cache, bs);
}
@@ -1097,7 +1055,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
else
x->fwd_txm4x4(src_diff, coeff, 8);
- vp9_regular_quantize_b_4x4(x, 4, block, so->scan, so->iscan);
+ vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
so->scan, so->neighbors);
@@ -1341,11 +1299,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
int this_rate_tokenonly, this_rate, s;
int64_t this_distortion, this_sse;
- // int mode_mask = (bsize <= BLOCK_8X8)
- // ? ALL_INTRA_MODES : cpi->sf.intra_uv_mode_mask;
-
- for (mode = DC_PRED; mode <= TM_PRED; mode ++) {
- // if (!(mode_mask & (1 << mode)))
+ for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
if (!(cpi->sf.intra_uv_mode_mask[max_uv_txsize_lookup[bsize]]
& (1 << mode)))
continue;
@@ -1373,7 +1327,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
struct macroblockd_plane *const pd = x->e_mbd.plane;
for (i = 1; i < MAX_MB_PLANE; ++i) {
p[i].coeff = ctx->coeff_pbuf[i][2];
- pd[i].qcoeff = ctx->qcoeff_pbuf[i][2];
+ p[i].qcoeff = ctx->qcoeff_pbuf[i][2];
pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2];
pd[i].eobs = ctx->eobs_pbuf[i][2];
@@ -1383,7 +1337,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
ctx->eobs_pbuf[i][2] = ctx->eobs_pbuf[i][0];
ctx->coeff_pbuf[i][0] = p[i].coeff;
- ctx->qcoeff_pbuf[i][0] = pd[i].qcoeff;
+ ctx->qcoeff_pbuf[i][0] = p[i].qcoeff;
ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff;
ctx->eobs_pbuf[i][0] = pd[i].eobs;
}
@@ -1392,7 +1346,6 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
}
x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode_selected;
-
return best_rd;
}
@@ -1588,7 +1541,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
coeff = BLOCK_OFFSET(p->coeff, k);
x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
coeff, 8);
- vp9_regular_quantize_b_4x4(x, 4, k, so->scan, so->iscan);
+ vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan);
thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
16, &ssz);
thissse += ssz;
@@ -2135,7 +2088,7 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
int best_index = 0;
int best_sad = INT_MAX;
int this_sad = INT_MAX;
- unsigned int max_mv = 0;
+ int max_mv = 0;
uint8_t *src_y_ptr = x->plane[0].src.buf;
uint8_t *ref_y_ptr;
@@ -2194,7 +2147,7 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
vp9_prob intra_inter_p = vp9_get_pred_prob_intra_inter(cm, xd);
vp9_prob comp_inter_p = 128;
- if (cm->comp_pred_mode == HYBRID_PREDICTION) {
+ if (cm->comp_pred_mode == REFERENCE_MODE_SELECT) {
comp_inter_p = vp9_get_pred_prob_comp_inter_inter(cm, xd);
*comp_mode_p = comp_inter_p;
} else {
@@ -2203,12 +2156,12 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
ref_costs_single[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
- if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) {
+ if (cm->comp_pred_mode != COMPOUND_REFERENCE) {
vp9_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
vp9_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
- if (cm->comp_pred_mode == HYBRID_PREDICTION)
+ if (cm->comp_pred_mode == REFERENCE_MODE_SELECT)
base_cost += vp9_cost_bit(comp_inter_p, 0);
ref_costs_single[LAST_FRAME] = ref_costs_single[GOLDEN_FRAME] =
@@ -2223,11 +2176,11 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
ref_costs_single[GOLDEN_FRAME] = 512;
ref_costs_single[ALTREF_FRAME] = 512;
}
- if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY) {
+ if (cm->comp_pred_mode != SINGLE_REFERENCE) {
vp9_prob ref_comp_p = vp9_get_pred_prob_comp_ref_p(cm, xd);
unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
- if (cm->comp_pred_mode == HYBRID_PREDICTION)
+ if (cm->comp_pred_mode == REFERENCE_MODE_SELECT)
base_cost += vp9_cost_bit(comp_inter_p, 1);
ref_costs_comp[LAST_FRAME] = base_cost + vp9_cost_bit(ref_comp_p, 0);
@@ -2243,7 +2196,7 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
int mode_index,
int_mv *ref_mv,
int_mv *second_ref_mv,
- int64_t comp_pred_diff[NB_PREDICTION_TYPES],
+ int64_t comp_pred_diff[REFERENCE_MODES],
int64_t tx_size_diff[TX_MODES],
int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]) {
MACROBLOCKD *const xd = &x->e_mbd;
@@ -2257,9 +2210,9 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
ctx->best_ref_mv.as_int = ref_mv->as_int;
ctx->second_best_ref_mv.as_int = second_ref_mv->as_int;
- ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_PREDICTION_ONLY];
- ctx->comp_pred_diff = (int)comp_pred_diff[COMP_PREDICTION_ONLY];
- ctx->hybrid_pred_diff = (int)comp_pred_diff[HYBRID_PREDICTION];
+ ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
+ ctx->comp_pred_diff = (int)comp_pred_diff[COMPOUND_REFERENCE];
+ ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT];
vpx_memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff));
vpx_memcpy(ctx->best_filter_diff, best_filter_diff,
@@ -2782,9 +2735,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
if (!(*mode_excluded)) {
if (is_comp_pred) {
- *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY);
+ *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_REFERENCE);
} else {
- *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY);
+ *mode_excluded = (cpi->common.comp_pred_mode == COMPOUND_REFERENCE);
}
}
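
The comp_pred_mode renames applied throughout this file follow one consistent
mapping, recorded below for reference. The numeric values are an assumption
inferred from how best_pred_rd[] and comp_pred_diff[] are indexed in this patch;
the authoritative enum lives in a vp9 header that is not part of this diff:

    /* Assumed mapping between the old and new reference-mode names. */
    typedef enum {
      SK_SINGLE_REFERENCE      = 0,  /* was SINGLE_PREDICTION_ONLY */
      SK_COMPOUND_REFERENCE    = 1,  /* was COMP_PREDICTION_ONLY   */
      SK_REFERENCE_MODE_SELECT = 2,  /* was HYBRID_PREDICTION      */
      SK_REFERENCE_MODES       = 3   /* was NB_PREDICTION_TYPES    */
    } SkReferenceMode;
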
@@ -3050,7 +3003,7 @@ static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
for (i = 0; i < max_plane; ++i) {
p[i].coeff = ctx->coeff_pbuf[i][1];
- pd[i].qcoeff = ctx->qcoeff_pbuf[i][1];
+ p[i].qcoeff = ctx->qcoeff_pbuf[i][1];
pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
pd[i].eobs = ctx->eobs_pbuf[i][1];
@@ -3060,7 +3013,7 @@ static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
ctx->eobs_pbuf[i][1] = ctx->eobs_pbuf[i][0];
ctx->coeff_pbuf[i][0] = p[i].coeff;
- ctx->qcoeff_pbuf[i][0] = pd[i].qcoeff;
+ ctx->qcoeff_pbuf[i][0] = p[i].qcoeff;
ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff;
ctx->eobs_pbuf[i][0] = pd[i].eobs;
}
@@ -3149,8 +3102,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int64_t best_rd = best_rd_so_far;
int64_t best_tx_rd[TX_MODES];
int64_t best_tx_diff[TX_MODES];
- int64_t best_pred_diff[NB_PREDICTION_TYPES];
- int64_t best_pred_rd[NB_PREDICTION_TYPES];
+ int64_t best_pred_diff[REFERENCE_MODES];
+ int64_t best_pred_rd[REFERENCE_MODES];
int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
MB_MODE_INFO best_mbmode = { 0 };
@@ -3186,7 +3139,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp,
&comp_mode_p);
- for (i = 0; i < NB_PREDICTION_TYPES; ++i)
+ for (i = 0; i < REFERENCE_MODES; ++i)
best_pred_rd[i] = INT64_MAX;
for (i = 0; i < TX_MODES; i++)
best_tx_rd[i] = INT64_MAX;
@@ -3363,12 +3316,12 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
mode_excluded = mode_excluded
? mode_excluded
- : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;
+ : cm->comp_pred_mode == SINGLE_REFERENCE;
} else {
if (ref_frame != INTRA_FRAME && second_ref_frame != INTRA_FRAME) {
mode_excluded =
mode_excluded ?
- mode_excluded : cm->comp_pred_mode == COMP_PREDICTION_ONLY;
+ mode_excluded : cm->comp_pred_mode == COMPOUND_REFERENCE;
}
}
@@ -3491,7 +3444,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
continue;
}
- if (cm->comp_pred_mode == HYBRID_PREDICTION) {
+ if (cm->comp_pred_mode == REFERENCE_MODE_SELECT) {
rate2 += compmode_cost;
}
@@ -3576,7 +3529,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
}
if (!disable_skip && ref_frame == INTRA_FRAME) {
- for (i = 0; i < NB_PREDICTION_TYPES; ++i)
+ for (i = 0; i < REFERENCE_MODES; ++i)
best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
@@ -3636,9 +3589,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
/* keep record of best compound/single-only prediction */
if (!disable_skip && ref_frame != INTRA_FRAME) {
- int single_rd, hybrid_rd, single_rate, hybrid_rate;
+ int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
- if (cm->comp_pred_mode == HYBRID_PREDICTION) {
+ if (cm->comp_pred_mode == REFERENCE_MODE_SELECT) {
single_rate = rate2 - compmode_cost;
hybrid_rate = rate2;
} else {
@@ -3650,14 +3603,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
if (second_ref_frame <= INTRA_FRAME &&
- single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) {
- best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd;
+ single_rd < best_pred_rd[SINGLE_REFERENCE]) {
+ best_pred_rd[SINGLE_REFERENCE] = single_rd;
} else if (second_ref_frame > INTRA_FRAME &&
- single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) {
- best_pred_rd[COMP_PREDICTION_ONLY] = single_rd;
+ single_rd < best_pred_rd[COMPOUND_REFERENCE]) {
+ best_pred_rd[COMPOUND_REFERENCE] = single_rd;
}
- if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION])
- best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
+ if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
+ best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
}
/* keep record of best filter type */
@@ -3717,7 +3670,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
if (cpi->sf.use_uv_intra_rd_estimate) {
// Do Intra UV best rd mode selection if best mode choice above was intra.
if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME) {
- TX_SIZE uv_tx_size = get_uv_tx_size(mbmi);
+ TX_SIZE uv_tx_size;
+ *mbmi = best_mbmode;
+ uv_tx_size = get_uv_tx_size(mbmi);
rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
&rate_uv_tokenonly[uv_tx_size],
&dist_uv[uv_tx_size],
@@ -3779,7 +3734,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
*mbmi = best_mbmode;
x->skip |= best_skip2;
- for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
+ for (i = 0; i < REFERENCE_MODES; ++i) {
if (best_pred_rd[i] == INT64_MAX)
best_pred_diff[i] = INT_MIN;
else
@@ -3850,8 +3805,8 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
int64_t best_yrd = best_rd_so_far; // FIXME(rbultje) more precise
int64_t best_tx_rd[TX_MODES];
int64_t best_tx_diff[TX_MODES];
- int64_t best_pred_diff[NB_PREDICTION_TYPES];
- int64_t best_pred_rd[NB_PREDICTION_TYPES];
+ int64_t best_pred_diff[REFERENCE_MODES];
+ int64_t best_pred_rd[REFERENCE_MODES];
int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
MB_MODE_INFO best_mbmode = { 0 };
@@ -3886,7 +3841,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp,
&comp_mode_p);
- for (i = 0; i < NB_PREDICTION_TYPES; ++i)
+ for (i = 0; i < REFERENCE_MODES; ++i)
best_pred_rd[i] = INT64_MAX;
for (i = 0; i < TX_MODES; i++)
best_tx_rd[i] = INT64_MAX;
@@ -4030,12 +3985,12 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
mode_excluded = mode_excluded
? mode_excluded
- : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;
+ : cm->comp_pred_mode == SINGLE_REFERENCE;
} else {
if (ref_frame != INTRA_FRAME && second_ref_frame != INTRA_FRAME) {
mode_excluded =
mode_excluded ?
- mode_excluded : cm->comp_pred_mode == COMP_PREDICTION_ONLY;
+ mode_excluded : cm->comp_pred_mode == COMPOUND_REFERENCE;
}
}
@@ -4241,9 +4196,9 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
if (!mode_excluded) {
if (comp_pred)
- mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
+ mode_excluded = cpi->common.comp_pred_mode == SINGLE_REFERENCE;
else
- mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
+ mode_excluded = cpi->common.comp_pred_mode == COMPOUND_REFERENCE;
}
compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);
@@ -4271,7 +4226,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
}
}
- if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+ if (cpi->common.comp_pred_mode == REFERENCE_MODE_SELECT) {
rate2 += compmode_cost;
}
@@ -4332,7 +4287,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
}
if (!disable_skip && ref_frame == INTRA_FRAME) {
- for (i = 0; i < NB_PREDICTION_TYPES; ++i)
+ for (i = 0; i < REFERENCE_MODES; ++i)
best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
@@ -4387,9 +4342,9 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
/* keep record of best compound/single-only prediction */
if (!disable_skip && ref_frame != INTRA_FRAME) {
- int single_rd, hybrid_rd, single_rate, hybrid_rate;
+ int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
- if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+ if (cpi->common.comp_pred_mode == REFERENCE_MODE_SELECT) {
single_rate = rate2 - compmode_cost;
hybrid_rate = rate2;
} else {
@@ -4401,14 +4356,14 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
if (second_ref_frame <= INTRA_FRAME &&
- single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) {
- best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd;
+ single_rd < best_pred_rd[SINGLE_REFERENCE]) {
+ best_pred_rd[SINGLE_REFERENCE] = single_rd;
} else if (second_ref_frame > INTRA_FRAME &&
- single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) {
- best_pred_rd[COMP_PREDICTION_ONLY] = single_rd;
+ single_rd < best_pred_rd[COMPOUND_REFERENCE]) {
+ best_pred_rd[COMPOUND_REFERENCE] = single_rd;
}
- if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION])
- best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
+ if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
+ best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
}
/* keep record of best filter type */
@@ -4465,7 +4420,9 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
if (cpi->sf.use_uv_intra_rd_estimate) {
// Do Intra UV best rd mode selection if best mode choice above was intra.
if (vp9_ref_order[best_mode_index].ref_frame == INTRA_FRAME) {
- TX_SIZE uv_tx_size = get_uv_tx_size(mbmi);
+ TX_SIZE uv_tx_size;
+ *mbmi = best_mbmode;
+ uv_tx_size = get_uv_tx_size(mbmi);
rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
&rate_uv_tokenonly[uv_tx_size],
&dist_uv[uv_tx_size],
@@ -4524,7 +4481,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
mbmi->mv[1].as_int = xd->mi_8x8[0]->bmi[3].as_mv[1].as_int;
}
- for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
+ for (i = 0; i < REFERENCE_MODES; ++i) {
if (best_pred_rd[i] == INT64_MAX)
best_pred_diff[i] = INT_MIN;
else
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 3f1cc6fe8..389ec152a 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -26,6 +26,86 @@ const TOKENVALUE *vp9_dct_value_tokens_ptr;
static int dct_value_cost[DCT_MAX_VALUE * 2];
const int *vp9_dct_value_cost_ptr;
+// Array indices are identical to previously-existing CONTEXT_NODE indices
+const vp9_tree_index vp9_coef_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)] = {
+ -DCT_EOB_TOKEN, 2, /* 0 = EOB */
+ -ZERO_TOKEN, 4, /* 1 = ZERO */
+ -ONE_TOKEN, 6, /* 2 = ONE */
+ 8, 12, /* 3 = LOW_VAL */
+ -TWO_TOKEN, 10, /* 4 = TWO */
+ -THREE_TOKEN, -FOUR_TOKEN, /* 5 = THREE */
+ 14, 16, /* 6 = HIGH_LOW */
+ -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2, /* 7 = CAT_ONE */
+ 18, 20, /* 8 = CAT_THREEFOUR */
+ -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4, /* 9 = CAT_THREE */
+ -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6 /* 10 = CAT_FIVE */
+};
+
+// Unconstrained Node Tree
+const vp9_tree_index vp9_coef_con_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)] = {
+ 2, 6, /* 0 = LOW_VAL */
+ -TWO_TOKEN, 4, /* 1 = TWO */
+ -THREE_TOKEN, -FOUR_TOKEN, /* 2 = THREE */
+ 8, 10, /* 3 = HIGH_LOW */
+ -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2, /* 4 = CAT_ONE */
+ 12, 14, /* 5 = CAT_THREEFOUR */
+ -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4, /* 6 = CAT_THREE */
+ -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6 /* 7 = CAT_FIVE */
+};
+
+static const vp9_prob Pcat1[] = { 159};
+static const vp9_prob Pcat2[] = { 165, 145};
+static const vp9_prob Pcat3[] = { 173, 148, 140};
+static const vp9_prob Pcat4[] = { 176, 155, 140, 135};
+static const vp9_prob Pcat5[] = { 180, 157, 141, 134, 130};
+static const vp9_prob Pcat6[] = {
+ 254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129
+};
+
+static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28];
+
+static void init_bit_tree(vp9_tree_index *p, int n) {
+ int i = 0;
+
+ while (++i < n) {
+ p[0] = p[1] = i << 1;
+ p += 2;
+ }
+
+ p[0] = p[1] = 0;
+}
+
+static void init_bit_trees() {
+ init_bit_tree(cat1, 1);
+ init_bit_tree(cat2, 2);
+ init_bit_tree(cat3, 3);
+ init_bit_tree(cat4, 4);
+ init_bit_tree(cat5, 5);
+ init_bit_tree(cat6, 14);
+}
+
+const vp9_extra_bit vp9_extra_bits[MAX_ENTROPY_TOKENS] = {
+ {0, 0, 0, 0}, // ZERO_TOKEN
+ {0, 0, 0, 1}, // ONE_TOKEN
+ {0, 0, 0, 2}, // TWO_TOKEN
+ {0, 0, 0, 3}, // THREE_TOKEN
+ {0, 0, 0, 4}, // FOUR_TOKEN
+ {cat1, Pcat1, 1, 5}, // DCT_VAL_CATEGORY1
+ {cat2, Pcat2, 2, 7}, // DCT_VAL_CATEGORY2
+ {cat3, Pcat3, 3, 11}, // DCT_VAL_CATEGORY3
+ {cat4, Pcat4, 4, 19}, // DCT_VAL_CATEGORY4
+ {cat5, Pcat5, 5, 35}, // DCT_VAL_CATEGORY5
+ {cat6, Pcat6, 14, 67}, // DCT_VAL_CATEGORY6
+ {0, 0, 0, 0} // DCT_EOB_TOKEN
+};
+
+struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS];
+
+void vp9_coef_tree_initialize() {
+ init_bit_trees();
+ vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);
+}
+
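
The category trees built by init_bit_tree() are degenerate chains: both branches
of every node lead to the same next pair, so each extra bit of a high-magnitude
coefficient is coded independently with its own probability from the Pcat
tables, and leaf value 0 simply terminates the walk. Combined with the base
values in vp9_extra_bits, category 3 (base 11, 3 extra bits) covers absolute
values 11..18, category 4 covers 19..34, and so on. A small record of the layout
the loop produces for the 3-bit category, assuming the tree entries are signed
8-bit values:

    #include <stdint.h>

    /* Expected contents of cat3 after init_bit_tree(cat3, 3). */
    static const int8_t cat3_expected[6] = {
      2, 2,   /* depth 0: either bit value continues at index 2 */
      4, 4,   /* depth 1: continues at index 4 */
      0, 0    /* depth 2: both branches terminate */
    };
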
static void fill_value_tokens() {
TOKENVALUE *const t = dct_value_tokens + DCT_MAX_VALUE;
const vp9_extra_bit *const e = vp9_extra_bits;
@@ -108,7 +188,8 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
TOKENEXTRA *t = *tp; /* store tokens starting here */
const int eob = pd->eobs[block];
const PLANE_TYPE type = pd->plane_type;
- const int16_t *qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
+ struct macroblock_plane *p = &cpi->mb.plane[plane];
+ const int16_t *qcoeff_ptr = BLOCK_OFFSET(p->qcoeff, block);
const int segment_id = mbmi->segment_id;
const int16_t *scan, *nb;
diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h
index e24e31b80..2e3bf5203 100644
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h
@@ -28,6 +28,10 @@ typedef struct {
uint8_t skip_eob_node;
} TOKENEXTRA;
+extern const vp9_tree_index vp9_coef_tree[];
+extern const vp9_tree_index vp9_coef_con_tree[];
+extern struct vp9_token vp9_coef_encodings[];
+
int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE bsize);
int vp9_is_skippable_in_plane(MACROBLOCKD *xd, BLOCK_SIZE bsize,
int plane);
diff --git a/vp9/encoder/vp9_treewriter.h b/vp9/encoder/vp9_treewriter.h
index c9bf4dabe..3245960ac 100644
--- a/vp9/encoder/vp9_treewriter.h
+++ b/vp9/encoder/vp9_treewriter.h
@@ -64,11 +64,6 @@ static INLINE int treed_cost(vp9_tree tree, const vp9_prob *probs,
return cost;
}
-static INLINE int cost_token(vp9_tree tree, const vp9_prob *probs,
- const struct vp9_token *token) {
- return treed_cost(tree, probs, token->value, token->len);
-}
-
void vp9_cost_tokens(int *costs, const vp9_prob *probs, vp9_tree tree);
void vp9_cost_tokens_skip(int *costs, const vp9_prob *probs, vp9_tree tree);
diff --git a/vp9/encoder/x86/vp9_dct32x32_avx2.c b/vp9/encoder/x86/vp9_dct32x32_avx2.c
new file mode 100644
index 000000000..9ea22fed2
--- /dev/null
+++ b/vp9/encoder/x86/vp9_dct32x32_avx2.c
@@ -0,0 +1,2710 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h> // AVX2
+#include "vp9/common/vp9_idct.h" // for cospi constants
+#include "vpx_ports/mem.h"
+
+#define pair256_set_epi16(a, b) \
+ _mm256_set_epi16(b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a)
+
+#define pair256_set_epi32(a, b) \
+ _mm256_set_epi32(b, a, b, a, b, a, b, a)
+
+#if FDCT32x32_HIGH_PRECISION
+static INLINE __m256i k_madd_epi32_avx2(__m256i a, __m256i b) {
+ __m256i buf0, buf1;
+ buf0 = _mm256_mul_epu32(a, b);
+ a = _mm256_srli_epi64(a, 32);
+ b = _mm256_srli_epi64(b, 32);
+ buf1 = _mm256_mul_epu32(a, b);
+ return _mm256_add_epi64(buf0, buf1);
+}
+
+static INLINE __m256i k_packs_epi64_avx2(__m256i a, __m256i b) {
+ __m256i buf0 = _mm256_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
+ __m256i buf1 = _mm256_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
+ return _mm256_unpacklo_epi64(buf0, buf1);
+}
+#endif
+
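
As a scalar model of the main SIMD idiom in this file: unpacklo/unpackhi
interleave two vectors so that a[i] and b[i] occupy adjacent 16-bit lanes, and
_mm256_madd_epi16 against a constant built by pair256_set_epi16(c0, c1) then
yields a[i]*c0 + b[i]*c1 in each 32-bit lane. A minimal sketch of that per-lane
arithmetic (the function name is a stand-in):

    #include <stdint.h>

    /* One 32-bit lane of _mm256_madd_epi16 on interleaved (a, b) inputs with a
       pair256_set_epi16(c0, c1) constant. */
    static int32_t madd_pair_sketch(int16_t a, int16_t b, int16_t c0, int16_t c1) {
      return (int32_t)a * c0 + (int32_t)b * c1;
    }

With c0 = +cospi_16_64 and c1 = -cospi_16_64, as in k__cospi_p16_m16, each lane
holds (a - b) * cos(pi/4) in fixed point, the standard forward-DCT butterfly
rotation.
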
+void FDCT32x32_2D_AVX2(const int16_t *input,
+ int16_t *output_org, int stride) {
+ // Calculate pre-multiplied strides
+ const int str1 = stride;
+ const int str2 = 2 * stride;
+ const int str3 = 2 * stride + str1;
+ // We need an intermediate buffer between passes.
+ DECLARE_ALIGNED(32, int16_t, intermediate[32 * 32]);
+ // Constants
+  // When we use them, in one case, they are all the same. In all others
+  // it's a pair of them that we need to repeat eight times across the
+  // 256-bit register. This is done by constructing the 32-bit constant
+  // corresponding to that pair.
+ const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(+cospi_16_64);
+ const __m256i k__cospi_p16_m16 = pair256_set_epi16(+cospi_16_64, -cospi_16_64);
+ const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m256i k__cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m256i k__cospi_p24_p08 = pair256_set_epi16(+cospi_24_64, cospi_8_64);
+ const __m256i k__cospi_p12_p20 = pair256_set_epi16(+cospi_12_64, cospi_20_64);
+ const __m256i k__cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m256i k__cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m256i k__cospi_p28_p04 = pair256_set_epi16(+cospi_28_64, cospi_4_64);
+ const __m256i k__cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m256i k__cospi_m12_m20 = pair256_set_epi16(-cospi_12_64, -cospi_20_64);
+ const __m256i k__cospi_p30_p02 = pair256_set_epi16(+cospi_30_64, cospi_2_64);
+ const __m256i k__cospi_p14_p18 = pair256_set_epi16(+cospi_14_64, cospi_18_64);
+ const __m256i k__cospi_p22_p10 = pair256_set_epi16(+cospi_22_64, cospi_10_64);
+ const __m256i k__cospi_p06_p26 = pair256_set_epi16(+cospi_6_64, cospi_26_64);
+ const __m256i k__cospi_m26_p06 = pair256_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m256i k__cospi_m10_p22 = pair256_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m256i k__cospi_m18_p14 = pair256_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m256i k__cospi_m02_p30 = pair256_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m256i k__cospi_p31_p01 = pair256_set_epi16(+cospi_31_64, cospi_1_64);
+ const __m256i k__cospi_p15_p17 = pair256_set_epi16(+cospi_15_64, cospi_17_64);
+ const __m256i k__cospi_p23_p09 = pair256_set_epi16(+cospi_23_64, cospi_9_64);
+ const __m256i k__cospi_p07_p25 = pair256_set_epi16(+cospi_7_64, cospi_25_64);
+ const __m256i k__cospi_m25_p07 = pair256_set_epi16(-cospi_25_64, cospi_7_64);
+ const __m256i k__cospi_m09_p23 = pair256_set_epi16(-cospi_9_64, cospi_23_64);
+ const __m256i k__cospi_m17_p15 = pair256_set_epi16(-cospi_17_64, cospi_15_64);
+ const __m256i k__cospi_m01_p31 = pair256_set_epi16(-cospi_1_64, cospi_31_64);
+ const __m256i k__cospi_p27_p05 = pair256_set_epi16(+cospi_27_64, cospi_5_64);
+ const __m256i k__cospi_p11_p21 = pair256_set_epi16(+cospi_11_64, cospi_21_64);
+ const __m256i k__cospi_p19_p13 = pair256_set_epi16(+cospi_19_64, cospi_13_64);
+ const __m256i k__cospi_p03_p29 = pair256_set_epi16(+cospi_3_64, cospi_29_64);
+ const __m256i k__cospi_m29_p03 = pair256_set_epi16(-cospi_29_64, cospi_3_64);
+ const __m256i k__cospi_m13_p19 = pair256_set_epi16(-cospi_13_64, cospi_19_64);
+ const __m256i k__cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64);
+ const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64);
+ const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+ const __m256i kZero = _mm256_set1_epi16(0);
+ const __m256i kOne = _mm256_set1_epi16(1);
+ // Do the two transform/transpose passes
+ int pass;
+ for (pass = 0; pass < 2; ++pass) {
+ // We process sixteen columns (transposed rows in second pass) at a time.
+ int column_start;
+ for (column_start = 0; column_start < 32; column_start += 16) {
+ __m256i step1[32];
+ __m256i step2[32];
+ __m256i step3[32];
+ __m256i out[32];
+ // Stage 1
+      // Note: even though all the loads below are aligned, using the aligned
+      // intrinsic makes the code slightly slower.
+ if (0 == pass) {
+ const int16_t *in = &input[column_start];
+ // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2;
+ // Note: the next four blocks could be in a loop. That would help the
+ // instruction cache but is actually slower.
+ {
+ const int16_t *ina = in + 0 * str1;
+ const int16_t *inb = in + 31 * str1;
+ __m256i *step1a = &step1[ 0];
+ __m256i *step1b = &step1[31];
+ const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
+ const __m256i ina1 = _mm256_loadu_si256((const __m256i *)(ina + str1));
+ const __m256i ina2 = _mm256_loadu_si256((const __m256i *)(ina + str2));
+ const __m256i ina3 = _mm256_loadu_si256((const __m256i *)(ina + str3));
+ const __m256i inb3 = _mm256_loadu_si256((const __m256i *)(inb - str3));
+ const __m256i inb2 = _mm256_loadu_si256((const __m256i *)(inb - str2));
+ const __m256i inb1 = _mm256_loadu_si256((const __m256i *)(inb - str1));
+ const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
+ step1a[ 0] = _mm256_add_epi16(ina0, inb0);
+ step1a[ 1] = _mm256_add_epi16(ina1, inb1);
+ step1a[ 2] = _mm256_add_epi16(ina2, inb2);
+ step1a[ 3] = _mm256_add_epi16(ina3, inb3);
+ step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+ step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2);
+ step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2);
+ step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2);
+ step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2);
+ step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+ }
+ {
+ const int16_t *ina = in + 4 * str1;
+ const int16_t *inb = in + 27 * str1;
+ __m256i *step1a = &step1[ 4];
+ __m256i *step1b = &step1[27];
+ const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
+ const __m256i ina1 = _mm256_loadu_si256((const __m256i *)(ina + str1));
+ const __m256i ina2 = _mm256_loadu_si256((const __m256i *)(ina + str2));
+ const __m256i ina3 = _mm256_loadu_si256((const __m256i *)(ina + str3));
+ const __m256i inb3 = _mm256_loadu_si256((const __m256i *)(inb - str3));
+ const __m256i inb2 = _mm256_loadu_si256((const __m256i *)(inb - str2));
+ const __m256i inb1 = _mm256_loadu_si256((const __m256i *)(inb - str1));
+ const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
+ step1a[ 0] = _mm256_add_epi16(ina0, inb0);
+ step1a[ 1] = _mm256_add_epi16(ina1, inb1);
+ step1a[ 2] = _mm256_add_epi16(ina2, inb2);
+ step1a[ 3] = _mm256_add_epi16(ina3, inb3);
+ step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+ step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2);
+ step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2);
+ step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2);
+ step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2);
+ step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+ }
+ {
+ const int16_t *ina = in + 8 * str1;
+ const int16_t *inb = in + 23 * str1;
+ __m256i *step1a = &step1[ 8];
+ __m256i *step1b = &step1[23];
+ const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
+ const __m256i ina1 = _mm256_loadu_si256((const __m256i *)(ina + str1));
+ const __m256i ina2 = _mm256_loadu_si256((const __m256i *)(ina + str2));
+ const __m256i ina3 = _mm256_loadu_si256((const __m256i *)(ina + str3));
+ const __m256i inb3 = _mm256_loadu_si256((const __m256i *)(inb - str3));
+ const __m256i inb2 = _mm256_loadu_si256((const __m256i *)(inb - str2));
+ const __m256i inb1 = _mm256_loadu_si256((const __m256i *)(inb - str1));
+ const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
+ step1a[ 0] = _mm256_add_epi16(ina0, inb0);
+ step1a[ 1] = _mm256_add_epi16(ina1, inb1);
+ step1a[ 2] = _mm256_add_epi16(ina2, inb2);
+ step1a[ 3] = _mm256_add_epi16(ina3, inb3);
+ step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+ step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2);
+ step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2);
+ step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2);
+ step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2);
+ step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+ }
+ {
+ const int16_t *ina = in + 12 * str1;
+ const int16_t *inb = in + 19 * str1;
+ __m256i *step1a = &step1[12];
+ __m256i *step1b = &step1[19];
+ const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
+ const __m256i ina1 = _mm256_loadu_si256((const __m256i *)(ina + str1));
+ const __m256i ina2 = _mm256_loadu_si256((const __m256i *)(ina + str2));
+ const __m256i ina3 = _mm256_loadu_si256((const __m256i *)(ina + str3));
+ const __m256i inb3 = _mm256_loadu_si256((const __m256i *)(inb - str3));
+ const __m256i inb2 = _mm256_loadu_si256((const __m256i *)(inb - str2));
+ const __m256i inb1 = _mm256_loadu_si256((const __m256i *)(inb - str1));
+ const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
+ step1a[ 0] = _mm256_add_epi16(ina0, inb0);
+ step1a[ 1] = _mm256_add_epi16(ina1, inb1);
+ step1a[ 2] = _mm256_add_epi16(ina2, inb2);
+ step1a[ 3] = _mm256_add_epi16(ina3, inb3);
+ step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+ step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2);
+ step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2);
+ step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2);
+ step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2);
+ step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+ }
+ } else {
+ int16_t *in = &intermediate[column_start];
+ // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32];
+ // Note: using the same approach as above to have common offset is
+ // counter-productive as all offsets can be calculated at compile
+ // time.
+ // Note: the next four blocks could be in a loop. That would help the
+ // instruction cache but is actually slower.
+ {
+ __m256i in00 = _mm256_loadu_si256((const __m256i *)(in + 0 * 32));
+ __m256i in01 = _mm256_loadu_si256((const __m256i *)(in + 1 * 32));
+ __m256i in02 = _mm256_loadu_si256((const __m256i *)(in + 2 * 32));
+ __m256i in03 = _mm256_loadu_si256((const __m256i *)(in + 3 * 32));
+ __m256i in28 = _mm256_loadu_si256((const __m256i *)(in + 28 * 32));
+ __m256i in29 = _mm256_loadu_si256((const __m256i *)(in + 29 * 32));
+ __m256i in30 = _mm256_loadu_si256((const __m256i *)(in + 30 * 32));
+ __m256i in31 = _mm256_loadu_si256((const __m256i *)(in + 31 * 32));
+ step1[ 0] = _mm256_add_epi16(in00, in31);
+ step1[ 1] = _mm256_add_epi16(in01, in30);
+ step1[ 2] = _mm256_add_epi16(in02, in29);
+ step1[ 3] = _mm256_add_epi16(in03, in28);
+ step1[28] = _mm256_sub_epi16(in03, in28);
+ step1[29] = _mm256_sub_epi16(in02, in29);
+ step1[30] = _mm256_sub_epi16(in01, in30);
+ step1[31] = _mm256_sub_epi16(in00, in31);
+ }
+ {
+ __m256i in04 = _mm256_loadu_si256((const __m256i *)(in + 4 * 32));
+ __m256i in05 = _mm256_loadu_si256((const __m256i *)(in + 5 * 32));
+ __m256i in06 = _mm256_loadu_si256((const __m256i *)(in + 6 * 32));
+ __m256i in07 = _mm256_loadu_si256((const __m256i *)(in + 7 * 32));
+ __m256i in24 = _mm256_loadu_si256((const __m256i *)(in + 24 * 32));
+ __m256i in25 = _mm256_loadu_si256((const __m256i *)(in + 25 * 32));
+ __m256i in26 = _mm256_loadu_si256((const __m256i *)(in + 26 * 32));
+ __m256i in27 = _mm256_loadu_si256((const __m256i *)(in + 27 * 32));
+ step1[ 4] = _mm256_add_epi16(in04, in27);
+ step1[ 5] = _mm256_add_epi16(in05, in26);
+ step1[ 6] = _mm256_add_epi16(in06, in25);
+ step1[ 7] = _mm256_add_epi16(in07, in24);
+ step1[24] = _mm256_sub_epi16(in07, in24);
+ step1[25] = _mm256_sub_epi16(in06, in25);
+ step1[26] = _mm256_sub_epi16(in05, in26);
+ step1[27] = _mm256_sub_epi16(in04, in27);
+ }
+ {
+ __m256i in08 = _mm256_loadu_si256((const __m256i *)(in + 8 * 32));
+ __m256i in09 = _mm256_loadu_si256((const __m256i *)(in + 9 * 32));
+ __m256i in10 = _mm256_loadu_si256((const __m256i *)(in + 10 * 32));
+ __m256i in11 = _mm256_loadu_si256((const __m256i *)(in + 11 * 32));
+ __m256i in20 = _mm256_loadu_si256((const __m256i *)(in + 20 * 32));
+ __m256i in21 = _mm256_loadu_si256((const __m256i *)(in + 21 * 32));
+ __m256i in22 = _mm256_loadu_si256((const __m256i *)(in + 22 * 32));
+ __m256i in23 = _mm256_loadu_si256((const __m256i *)(in + 23 * 32));
+ step1[ 8] = _mm256_add_epi16(in08, in23);
+ step1[ 9] = _mm256_add_epi16(in09, in22);
+ step1[10] = _mm256_add_epi16(in10, in21);
+ step1[11] = _mm256_add_epi16(in11, in20);
+ step1[20] = _mm256_sub_epi16(in11, in20);
+ step1[21] = _mm256_sub_epi16(in10, in21);
+ step1[22] = _mm256_sub_epi16(in09, in22);
+ step1[23] = _mm256_sub_epi16(in08, in23);
+ }
+ {
+ __m256i in12 = _mm256_loadu_si256((const __m256i *)(in + 12 * 32));
+ __m256i in13 = _mm256_loadu_si256((const __m256i *)(in + 13 * 32));
+ __m256i in14 = _mm256_loadu_si256((const __m256i *)(in + 14 * 32));
+ __m256i in15 = _mm256_loadu_si256((const __m256i *)(in + 15 * 32));
+ __m256i in16 = _mm256_loadu_si256((const __m256i *)(in + 16 * 32));
+ __m256i in17 = _mm256_loadu_si256((const __m256i *)(in + 17 * 32));
+ __m256i in18 = _mm256_loadu_si256((const __m256i *)(in + 18 * 32));
+ __m256i in19 = _mm256_loadu_si256((const __m256i *)(in + 19 * 32));
+ step1[12] = _mm256_add_epi16(in12, in19);
+ step1[13] = _mm256_add_epi16(in13, in18);
+ step1[14] = _mm256_add_epi16(in14, in17);
+ step1[15] = _mm256_add_epi16(in15, in16);
+ step1[16] = _mm256_sub_epi16(in15, in16);
+ step1[17] = _mm256_sub_epi16(in14, in17);
+ step1[18] = _mm256_sub_epi16(in13, in18);
+ step1[19] = _mm256_sub_epi16(in12, in19);
+ }
+ }
+ // Stage 2
+ {
+ step2[ 0] = _mm256_add_epi16(step1[0], step1[15]);
+ step2[ 1] = _mm256_add_epi16(step1[1], step1[14]);
+ step2[ 2] = _mm256_add_epi16(step1[2], step1[13]);
+ step2[ 3] = _mm256_add_epi16(step1[3], step1[12]);
+ step2[ 4] = _mm256_add_epi16(step1[4], step1[11]);
+ step2[ 5] = _mm256_add_epi16(step1[5], step1[10]);
+ step2[ 6] = _mm256_add_epi16(step1[6], step1[ 9]);
+ step2[ 7] = _mm256_add_epi16(step1[7], step1[ 8]);
+ step2[ 8] = _mm256_sub_epi16(step1[7], step1[ 8]);
+ step2[ 9] = _mm256_sub_epi16(step1[6], step1[ 9]);
+ step2[10] = _mm256_sub_epi16(step1[5], step1[10]);
+ step2[11] = _mm256_sub_epi16(step1[4], step1[11]);
+ step2[12] = _mm256_sub_epi16(step1[3], step1[12]);
+ step2[13] = _mm256_sub_epi16(step1[2], step1[13]);
+ step2[14] = _mm256_sub_epi16(step1[1], step1[14]);
+ step2[15] = _mm256_sub_epi16(step1[0], step1[15]);
+ }
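+    // step2[20..27]: each output is cospi_16_64 times the sum or difference
+    // of a symmetric pair taken from step1[20..27], followed by
+    // dct_const_round_shift.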
+ {
+ const __m256i s2_20_0 = _mm256_unpacklo_epi16(step1[27], step1[20]);
+ const __m256i s2_20_1 = _mm256_unpackhi_epi16(step1[27], step1[20]);
+ const __m256i s2_21_0 = _mm256_unpacklo_epi16(step1[26], step1[21]);
+ const __m256i s2_21_1 = _mm256_unpackhi_epi16(step1[26], step1[21]);
+ const __m256i s2_22_0 = _mm256_unpacklo_epi16(step1[25], step1[22]);
+ const __m256i s2_22_1 = _mm256_unpackhi_epi16(step1[25], step1[22]);
+ const __m256i s2_23_0 = _mm256_unpacklo_epi16(step1[24], step1[23]);
+ const __m256i s2_23_1 = _mm256_unpackhi_epi16(step1[24], step1[23]);
+ const __m256i s2_20_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_m16);
+ const __m256i s2_20_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_m16);
+ const __m256i s2_21_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_m16);
+ const __m256i s2_21_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_m16);
+ const __m256i s2_22_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_m16);
+ const __m256i s2_22_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_m16);
+ const __m256i s2_23_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_m16);
+ const __m256i s2_23_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_m16);
+ const __m256i s2_24_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_p16);
+ const __m256i s2_24_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_p16);
+ const __m256i s2_25_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_p16);
+ const __m256i s2_25_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_p16);
+ const __m256i s2_26_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_p16);
+ const __m256i s2_26_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_p16);
+ const __m256i s2_27_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_p16);
+ const __m256i s2_27_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_p16);
+ // dct_const_round_shift
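+      // (add k__DCT_CONST_ROUNDING = 1 << (DCT_CONST_BITS - 1), then
+      // arithmetic-shift right by DCT_CONST_BITS to bring the 32-bit madd
+      // products back to coefficient scale with rounding).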
+ const __m256i s2_20_4 = _mm256_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_20_5 = _mm256_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_21_4 = _mm256_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_21_5 = _mm256_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_22_4 = _mm256_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_22_5 = _mm256_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_23_4 = _mm256_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_23_5 = _mm256_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_24_4 = _mm256_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_24_5 = _mm256_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_25_4 = _mm256_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_25_5 = _mm256_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_26_4 = _mm256_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_26_5 = _mm256_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_27_4 = _mm256_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_27_5 = _mm256_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_20_6 = _mm256_srai_epi32(s2_20_4, DCT_CONST_BITS);
+ const __m256i s2_20_7 = _mm256_srai_epi32(s2_20_5, DCT_CONST_BITS);
+ const __m256i s2_21_6 = _mm256_srai_epi32(s2_21_4, DCT_CONST_BITS);
+ const __m256i s2_21_7 = _mm256_srai_epi32(s2_21_5, DCT_CONST_BITS);
+ const __m256i s2_22_6 = _mm256_srai_epi32(s2_22_4, DCT_CONST_BITS);
+ const __m256i s2_22_7 = _mm256_srai_epi32(s2_22_5, DCT_CONST_BITS);
+ const __m256i s2_23_6 = _mm256_srai_epi32(s2_23_4, DCT_CONST_BITS);
+ const __m256i s2_23_7 = _mm256_srai_epi32(s2_23_5, DCT_CONST_BITS);
+ const __m256i s2_24_6 = _mm256_srai_epi32(s2_24_4, DCT_CONST_BITS);
+ const __m256i s2_24_7 = _mm256_srai_epi32(s2_24_5, DCT_CONST_BITS);
+ const __m256i s2_25_6 = _mm256_srai_epi32(s2_25_4, DCT_CONST_BITS);
+ const __m256i s2_25_7 = _mm256_srai_epi32(s2_25_5, DCT_CONST_BITS);
+ const __m256i s2_26_6 = _mm256_srai_epi32(s2_26_4, DCT_CONST_BITS);
+ const __m256i s2_26_7 = _mm256_srai_epi32(s2_26_5, DCT_CONST_BITS);
+ const __m256i s2_27_6 = _mm256_srai_epi32(s2_27_4, DCT_CONST_BITS);
+ const __m256i s2_27_7 = _mm256_srai_epi32(s2_27_5, DCT_CONST_BITS);
+ // Combine
+ step2[20] = _mm256_packs_epi32(s2_20_6, s2_20_7);
+ step2[21] = _mm256_packs_epi32(s2_21_6, s2_21_7);
+ step2[22] = _mm256_packs_epi32(s2_22_6, s2_22_7);
+ step2[23] = _mm256_packs_epi32(s2_23_6, s2_23_7);
+ step2[24] = _mm256_packs_epi32(s2_24_6, s2_24_7);
+ step2[25] = _mm256_packs_epi32(s2_25_6, s2_25_7);
+ step2[26] = _mm256_packs_epi32(s2_26_6, s2_26_7);
+ step2[27] = _mm256_packs_epi32(s2_27_6, s2_27_7);
+ }
+
+#if !FDCT32x32_HIGH_PRECISION
+    // Scale the intermediate values down by 4 (with rounding) so that they
+    // stay within the range of 16 bits.
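+    // The sequence below computes (x + 1 + (x < 0)) >> 2 per 16-bit lane:
+    // the compare yields an all-ones mask for negative lanes, subtracting
+    // that mask adds 1 to them, kOne adds the unconditional 1, and the
+    // arithmetic shift divides by 4.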
+ if (1 == pass) {
+      __m256i s3_00_0 = _mm256_cmpgt_epi16(kZero, step2[ 0]);
+      __m256i s3_01_0 = _mm256_cmpgt_epi16(kZero, step2[ 1]);
+      __m256i s3_02_0 = _mm256_cmpgt_epi16(kZero, step2[ 2]);
+      __m256i s3_03_0 = _mm256_cmpgt_epi16(kZero, step2[ 3]);
+      __m256i s3_04_0 = _mm256_cmpgt_epi16(kZero, step2[ 4]);
+      __m256i s3_05_0 = _mm256_cmpgt_epi16(kZero, step2[ 5]);
+      __m256i s3_06_0 = _mm256_cmpgt_epi16(kZero, step2[ 6]);
+      __m256i s3_07_0 = _mm256_cmpgt_epi16(kZero, step2[ 7]);
+      __m256i s2_08_0 = _mm256_cmpgt_epi16(kZero, step2[ 8]);
+      __m256i s2_09_0 = _mm256_cmpgt_epi16(kZero, step2[ 9]);
+      __m256i s3_10_0 = _mm256_cmpgt_epi16(kZero, step2[10]);
+      __m256i s3_11_0 = _mm256_cmpgt_epi16(kZero, step2[11]);
+      __m256i s3_12_0 = _mm256_cmpgt_epi16(kZero, step2[12]);
+      __m256i s3_13_0 = _mm256_cmpgt_epi16(kZero, step2[13]);
+      __m256i s2_14_0 = _mm256_cmpgt_epi16(kZero, step2[14]);
+      __m256i s2_15_0 = _mm256_cmpgt_epi16(kZero, step2[15]);
+      __m256i s3_16_0 = _mm256_cmpgt_epi16(kZero, step1[16]);
+      __m256i s3_17_0 = _mm256_cmpgt_epi16(kZero, step1[17]);
+      __m256i s3_18_0 = _mm256_cmpgt_epi16(kZero, step1[18]);
+      __m256i s3_19_0 = _mm256_cmpgt_epi16(kZero, step1[19]);
+      __m256i s3_20_0 = _mm256_cmpgt_epi16(kZero, step2[20]);
+      __m256i s3_21_0 = _mm256_cmpgt_epi16(kZero, step2[21]);
+      __m256i s3_22_0 = _mm256_cmpgt_epi16(kZero, step2[22]);
+      __m256i s3_23_0 = _mm256_cmpgt_epi16(kZero, step2[23]);
+      __m256i s3_24_0 = _mm256_cmpgt_epi16(kZero, step2[24]);
+      __m256i s3_25_0 = _mm256_cmpgt_epi16(kZero, step2[25]);
+      __m256i s3_26_0 = _mm256_cmpgt_epi16(kZero, step2[26]);
+      __m256i s3_27_0 = _mm256_cmpgt_epi16(kZero, step2[27]);
+      __m256i s3_28_0 = _mm256_cmpgt_epi16(kZero, step1[28]);
+      __m256i s3_29_0 = _mm256_cmpgt_epi16(kZero, step1[29]);
+      __m256i s3_30_0 = _mm256_cmpgt_epi16(kZero, step1[30]);
+      __m256i s3_31_0 = _mm256_cmpgt_epi16(kZero, step1[31]);
+
+ step2[ 0] = _mm256_sub_epi16(step2[ 0], s3_00_0);
+ step2[ 1] = _mm256_sub_epi16(step2[ 1], s3_01_0);
+ step2[ 2] = _mm256_sub_epi16(step2[ 2], s3_02_0);
+ step2[ 3] = _mm256_sub_epi16(step2[ 3], s3_03_0);
+ step2[ 4] = _mm256_sub_epi16(step2[ 4], s3_04_0);
+ step2[ 5] = _mm256_sub_epi16(step2[ 5], s3_05_0);
+ step2[ 6] = _mm256_sub_epi16(step2[ 6], s3_06_0);
+ step2[ 7] = _mm256_sub_epi16(step2[ 7], s3_07_0);
+ step2[ 8] = _mm256_sub_epi16(step2[ 8], s2_08_0);
+ step2[ 9] = _mm256_sub_epi16(step2[ 9], s2_09_0);
+ step2[10] = _mm256_sub_epi16(step2[10], s3_10_0);
+ step2[11] = _mm256_sub_epi16(step2[11], s3_11_0);
+ step2[12] = _mm256_sub_epi16(step2[12], s3_12_0);
+ step2[13] = _mm256_sub_epi16(step2[13], s3_13_0);
+ step2[14] = _mm256_sub_epi16(step2[14], s2_14_0);
+ step2[15] = _mm256_sub_epi16(step2[15], s2_15_0);
+ step1[16] = _mm256_sub_epi16(step1[16], s3_16_0);
+ step1[17] = _mm256_sub_epi16(step1[17], s3_17_0);
+ step1[18] = _mm256_sub_epi16(step1[18], s3_18_0);
+ step1[19] = _mm256_sub_epi16(step1[19], s3_19_0);
+ step2[20] = _mm256_sub_epi16(step2[20], s3_20_0);
+ step2[21] = _mm256_sub_epi16(step2[21], s3_21_0);
+ step2[22] = _mm256_sub_epi16(step2[22], s3_22_0);
+ step2[23] = _mm256_sub_epi16(step2[23], s3_23_0);
+ step2[24] = _mm256_sub_epi16(step2[24], s3_24_0);
+ step2[25] = _mm256_sub_epi16(step2[25], s3_25_0);
+ step2[26] = _mm256_sub_epi16(step2[26], s3_26_0);
+ step2[27] = _mm256_sub_epi16(step2[27], s3_27_0);
+ step1[28] = _mm256_sub_epi16(step1[28], s3_28_0);
+ step1[29] = _mm256_sub_epi16(step1[29], s3_29_0);
+ step1[30] = _mm256_sub_epi16(step1[30], s3_30_0);
+ step1[31] = _mm256_sub_epi16(step1[31], s3_31_0);
+
+ step2[ 0] = _mm256_add_epi16(step2[ 0], kOne);
+ step2[ 1] = _mm256_add_epi16(step2[ 1], kOne);
+ step2[ 2] = _mm256_add_epi16(step2[ 2], kOne);
+ step2[ 3] = _mm256_add_epi16(step2[ 3], kOne);
+ step2[ 4] = _mm256_add_epi16(step2[ 4], kOne);
+ step2[ 5] = _mm256_add_epi16(step2[ 5], kOne);
+ step2[ 6] = _mm256_add_epi16(step2[ 6], kOne);
+ step2[ 7] = _mm256_add_epi16(step2[ 7], kOne);
+ step2[ 8] = _mm256_add_epi16(step2[ 8], kOne);
+ step2[ 9] = _mm256_add_epi16(step2[ 9], kOne);
+ step2[10] = _mm256_add_epi16(step2[10], kOne);
+ step2[11] = _mm256_add_epi16(step2[11], kOne);
+ step2[12] = _mm256_add_epi16(step2[12], kOne);
+ step2[13] = _mm256_add_epi16(step2[13], kOne);
+ step2[14] = _mm256_add_epi16(step2[14], kOne);
+ step2[15] = _mm256_add_epi16(step2[15], kOne);
+ step1[16] = _mm256_add_epi16(step1[16], kOne);
+ step1[17] = _mm256_add_epi16(step1[17], kOne);
+ step1[18] = _mm256_add_epi16(step1[18], kOne);
+ step1[19] = _mm256_add_epi16(step1[19], kOne);
+ step2[20] = _mm256_add_epi16(step2[20], kOne);
+ step2[21] = _mm256_add_epi16(step2[21], kOne);
+ step2[22] = _mm256_add_epi16(step2[22], kOne);
+ step2[23] = _mm256_add_epi16(step2[23], kOne);
+ step2[24] = _mm256_add_epi16(step2[24], kOne);
+ step2[25] = _mm256_add_epi16(step2[25], kOne);
+ step2[26] = _mm256_add_epi16(step2[26], kOne);
+ step2[27] = _mm256_add_epi16(step2[27], kOne);
+ step1[28] = _mm256_add_epi16(step1[28], kOne);
+ step1[29] = _mm256_add_epi16(step1[29], kOne);
+ step1[30] = _mm256_add_epi16(step1[30], kOne);
+ step1[31] = _mm256_add_epi16(step1[31], kOne);
+
+ step2[ 0] = _mm256_srai_epi16(step2[ 0], 2);
+ step2[ 1] = _mm256_srai_epi16(step2[ 1], 2);
+ step2[ 2] = _mm256_srai_epi16(step2[ 2], 2);
+ step2[ 3] = _mm256_srai_epi16(step2[ 3], 2);
+ step2[ 4] = _mm256_srai_epi16(step2[ 4], 2);
+ step2[ 5] = _mm256_srai_epi16(step2[ 5], 2);
+ step2[ 6] = _mm256_srai_epi16(step2[ 6], 2);
+ step2[ 7] = _mm256_srai_epi16(step2[ 7], 2);
+ step2[ 8] = _mm256_srai_epi16(step2[ 8], 2);
+ step2[ 9] = _mm256_srai_epi16(step2[ 9], 2);
+ step2[10] = _mm256_srai_epi16(step2[10], 2);
+ step2[11] = _mm256_srai_epi16(step2[11], 2);
+ step2[12] = _mm256_srai_epi16(step2[12], 2);
+ step2[13] = _mm256_srai_epi16(step2[13], 2);
+ step2[14] = _mm256_srai_epi16(step2[14], 2);
+ step2[15] = _mm256_srai_epi16(step2[15], 2);
+ step1[16] = _mm256_srai_epi16(step1[16], 2);
+ step1[17] = _mm256_srai_epi16(step1[17], 2);
+ step1[18] = _mm256_srai_epi16(step1[18], 2);
+ step1[19] = _mm256_srai_epi16(step1[19], 2);
+ step2[20] = _mm256_srai_epi16(step2[20], 2);
+ step2[21] = _mm256_srai_epi16(step2[21], 2);
+ step2[22] = _mm256_srai_epi16(step2[22], 2);
+ step2[23] = _mm256_srai_epi16(step2[23], 2);
+ step2[24] = _mm256_srai_epi16(step2[24], 2);
+ step2[25] = _mm256_srai_epi16(step2[25], 2);
+ step2[26] = _mm256_srai_epi16(step2[26], 2);
+ step2[27] = _mm256_srai_epi16(step2[27], 2);
+ step1[28] = _mm256_srai_epi16(step1[28], 2);
+ step1[29] = _mm256_srai_epi16(step1[29], 2);
+ step1[30] = _mm256_srai_epi16(step1[30], 2);
+ step1[31] = _mm256_srai_epi16(step1[31], 2);
+ }
+#endif
+
+#if FDCT32x32_HIGH_PRECISION
+ if (pass == 0) {
+#endif
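+    // With FDCT32x32_HIGH_PRECISION, only the first pass (pass == 0)
+    // continues in this 16-bit path; the second pass takes the 32-bit
+    // branch further below.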
+ // Stage 3
+ {
+ step3[0] = _mm256_add_epi16(step2[(8 - 1)], step2[0]);
+ step3[1] = _mm256_add_epi16(step2[(8 - 2)], step2[1]);
+ step3[2] = _mm256_add_epi16(step2[(8 - 3)], step2[2]);
+ step3[3] = _mm256_add_epi16(step2[(8 - 4)], step2[3]);
+ step3[4] = _mm256_sub_epi16(step2[(8 - 5)], step2[4]);
+ step3[5] = _mm256_sub_epi16(step2[(8 - 6)], step2[5]);
+ step3[6] = _mm256_sub_epi16(step2[(8 - 7)], step2[6]);
+ step3[7] = _mm256_sub_epi16(step2[(8 - 8)], step2[7]);
+ }
+ {
+ const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]);
+ const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]);
+ const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]);
+ const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]);
+ const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16);
+ const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16);
+ const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16);
+ const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16);
+ const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16);
+ const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16);
+ const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16);
+ const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m256i s3_10_4 = _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_10_5 = _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_11_4 = _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_11_5 = _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_12_4 = _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_12_5 = _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_13_4 = _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_13_5 = _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_10_6 = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS);
+ const __m256i s3_10_7 = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS);
+ const __m256i s3_11_6 = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS);
+ const __m256i s3_11_7 = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS);
+ const __m256i s3_12_6 = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS);
+ const __m256i s3_12_7 = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS);
+ const __m256i s3_13_6 = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS);
+ const __m256i s3_13_7 = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS);
+ // Combine
+ step3[10] = _mm256_packs_epi32(s3_10_6, s3_10_7);
+ step3[11] = _mm256_packs_epi32(s3_11_6, s3_11_7);
+ step3[12] = _mm256_packs_epi32(s3_12_6, s3_12_7);
+ step3[13] = _mm256_packs_epi32(s3_13_6, s3_13_7);
+ }
+ {
+ step3[16] = _mm256_add_epi16(step2[23], step1[16]);
+ step3[17] = _mm256_add_epi16(step2[22], step1[17]);
+ step3[18] = _mm256_add_epi16(step2[21], step1[18]);
+ step3[19] = _mm256_add_epi16(step2[20], step1[19]);
+ step3[20] = _mm256_sub_epi16(step1[19], step2[20]);
+ step3[21] = _mm256_sub_epi16(step1[18], step2[21]);
+ step3[22] = _mm256_sub_epi16(step1[17], step2[22]);
+ step3[23] = _mm256_sub_epi16(step1[16], step2[23]);
+ step3[24] = _mm256_sub_epi16(step1[31], step2[24]);
+ step3[25] = _mm256_sub_epi16(step1[30], step2[25]);
+ step3[26] = _mm256_sub_epi16(step1[29], step2[26]);
+ step3[27] = _mm256_sub_epi16(step1[28], step2[27]);
+ step3[28] = _mm256_add_epi16(step2[27], step1[28]);
+ step3[29] = _mm256_add_epi16(step2[26], step1[29]);
+ step3[30] = _mm256_add_epi16(step2[25], step1[30]);
+ step3[31] = _mm256_add_epi16(step2[24], step1[31]);
+ }
+
+ // Stage 4
+ {
+ step1[ 0] = _mm256_add_epi16(step3[ 3], step3[ 0]);
+ step1[ 1] = _mm256_add_epi16(step3[ 2], step3[ 1]);
+ step1[ 2] = _mm256_sub_epi16(step3[ 1], step3[ 2]);
+ step1[ 3] = _mm256_sub_epi16(step3[ 0], step3[ 3]);
+ step1[ 8] = _mm256_add_epi16(step3[11], step2[ 8]);
+ step1[ 9] = _mm256_add_epi16(step3[10], step2[ 9]);
+ step1[10] = _mm256_sub_epi16(step2[ 9], step3[10]);
+ step1[11] = _mm256_sub_epi16(step2[ 8], step3[11]);
+ step1[12] = _mm256_sub_epi16(step2[15], step3[12]);
+ step1[13] = _mm256_sub_epi16(step2[14], step3[13]);
+ step1[14] = _mm256_add_epi16(step3[13], step2[14]);
+ step1[15] = _mm256_add_epi16(step3[12], step2[15]);
+ }
+ {
+ const __m256i s1_05_0 = _mm256_unpacklo_epi16(step3[6], step3[5]);
+ const __m256i s1_05_1 = _mm256_unpackhi_epi16(step3[6], step3[5]);
+ const __m256i s1_05_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_m16);
+ const __m256i s1_05_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_m16);
+ const __m256i s1_06_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_p16);
+ const __m256i s1_06_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m256i s1_05_4 = _mm256_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_05_5 = _mm256_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_06_4 = _mm256_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_06_5 = _mm256_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_05_6 = _mm256_srai_epi32(s1_05_4, DCT_CONST_BITS);
+ const __m256i s1_05_7 = _mm256_srai_epi32(s1_05_5, DCT_CONST_BITS);
+ const __m256i s1_06_6 = _mm256_srai_epi32(s1_06_4, DCT_CONST_BITS);
+ const __m256i s1_06_7 = _mm256_srai_epi32(s1_06_5, DCT_CONST_BITS);
+ // Combine
+ step1[5] = _mm256_packs_epi32(s1_05_6, s1_05_7);
+ step1[6] = _mm256_packs_epi32(s1_06_6, s1_06_7);
+ }
+ {
+ const __m256i s1_18_0 = _mm256_unpacklo_epi16(step3[18], step3[29]);
+ const __m256i s1_18_1 = _mm256_unpackhi_epi16(step3[18], step3[29]);
+ const __m256i s1_19_0 = _mm256_unpacklo_epi16(step3[19], step3[28]);
+ const __m256i s1_19_1 = _mm256_unpackhi_epi16(step3[19], step3[28]);
+ const __m256i s1_20_0 = _mm256_unpacklo_epi16(step3[20], step3[27]);
+ const __m256i s1_20_1 = _mm256_unpackhi_epi16(step3[20], step3[27]);
+ const __m256i s1_21_0 = _mm256_unpacklo_epi16(step3[21], step3[26]);
+ const __m256i s1_21_1 = _mm256_unpackhi_epi16(step3[21], step3[26]);
+ const __m256i s1_18_2 = _mm256_madd_epi16(s1_18_0, k__cospi_m08_p24);
+ const __m256i s1_18_3 = _mm256_madd_epi16(s1_18_1, k__cospi_m08_p24);
+ const __m256i s1_19_2 = _mm256_madd_epi16(s1_19_0, k__cospi_m08_p24);
+ const __m256i s1_19_3 = _mm256_madd_epi16(s1_19_1, k__cospi_m08_p24);
+ const __m256i s1_20_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m24_m08);
+ const __m256i s1_20_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m24_m08);
+ const __m256i s1_21_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m24_m08);
+ const __m256i s1_21_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m24_m08);
+ const __m256i s1_26_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m08_p24);
+ const __m256i s1_26_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m08_p24);
+ const __m256i s1_27_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m08_p24);
+ const __m256i s1_27_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m08_p24);
+ const __m256i s1_28_2 = _mm256_madd_epi16(s1_19_0, k__cospi_p24_p08);
+ const __m256i s1_28_3 = _mm256_madd_epi16(s1_19_1, k__cospi_p24_p08);
+ const __m256i s1_29_2 = _mm256_madd_epi16(s1_18_0, k__cospi_p24_p08);
+ const __m256i s1_29_3 = _mm256_madd_epi16(s1_18_1, k__cospi_p24_p08);
+ // dct_const_round_shift
+ const __m256i s1_18_4 = _mm256_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_18_5 = _mm256_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_19_4 = _mm256_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_19_5 = _mm256_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_20_4 = _mm256_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_20_5 = _mm256_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_21_4 = _mm256_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_21_5 = _mm256_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_26_4 = _mm256_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_26_5 = _mm256_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_27_4 = _mm256_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_27_5 = _mm256_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_28_4 = _mm256_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_28_5 = _mm256_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_29_4 = _mm256_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_29_5 = _mm256_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_18_6 = _mm256_srai_epi32(s1_18_4, DCT_CONST_BITS);
+ const __m256i s1_18_7 = _mm256_srai_epi32(s1_18_5, DCT_CONST_BITS);
+ const __m256i s1_19_6 = _mm256_srai_epi32(s1_19_4, DCT_CONST_BITS);
+ const __m256i s1_19_7 = _mm256_srai_epi32(s1_19_5, DCT_CONST_BITS);
+ const __m256i s1_20_6 = _mm256_srai_epi32(s1_20_4, DCT_CONST_BITS);
+ const __m256i s1_20_7 = _mm256_srai_epi32(s1_20_5, DCT_CONST_BITS);
+ const __m256i s1_21_6 = _mm256_srai_epi32(s1_21_4, DCT_CONST_BITS);
+ const __m256i s1_21_7 = _mm256_srai_epi32(s1_21_5, DCT_CONST_BITS);
+ const __m256i s1_26_6 = _mm256_srai_epi32(s1_26_4, DCT_CONST_BITS);
+ const __m256i s1_26_7 = _mm256_srai_epi32(s1_26_5, DCT_CONST_BITS);
+ const __m256i s1_27_6 = _mm256_srai_epi32(s1_27_4, DCT_CONST_BITS);
+ const __m256i s1_27_7 = _mm256_srai_epi32(s1_27_5, DCT_CONST_BITS);
+ const __m256i s1_28_6 = _mm256_srai_epi32(s1_28_4, DCT_CONST_BITS);
+ const __m256i s1_28_7 = _mm256_srai_epi32(s1_28_5, DCT_CONST_BITS);
+ const __m256i s1_29_6 = _mm256_srai_epi32(s1_29_4, DCT_CONST_BITS);
+ const __m256i s1_29_7 = _mm256_srai_epi32(s1_29_5, DCT_CONST_BITS);
+ // Combine
+ step1[18] = _mm256_packs_epi32(s1_18_6, s1_18_7);
+ step1[19] = _mm256_packs_epi32(s1_19_6, s1_19_7);
+ step1[20] = _mm256_packs_epi32(s1_20_6, s1_20_7);
+ step1[21] = _mm256_packs_epi32(s1_21_6, s1_21_7);
+ step1[26] = _mm256_packs_epi32(s1_26_6, s1_26_7);
+ step1[27] = _mm256_packs_epi32(s1_27_6, s1_27_7);
+ step1[28] = _mm256_packs_epi32(s1_28_6, s1_28_7);
+ step1[29] = _mm256_packs_epi32(s1_29_6, s1_29_7);
+ }
+ // Stage 5
+ {
+ step2[4] = _mm256_add_epi16(step1[5], step3[4]);
+ step2[5] = _mm256_sub_epi16(step3[4], step1[5]);
+ step2[6] = _mm256_sub_epi16(step3[7], step1[6]);
+ step2[7] = _mm256_add_epi16(step1[6], step3[7]);
+ }
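+    // out[0], out[8], out[16] and out[24] are the first final outputs,
+    // produced here directly from step1[0..3].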
+ {
+ const __m256i out_00_0 = _mm256_unpacklo_epi16(step1[0], step1[1]);
+ const __m256i out_00_1 = _mm256_unpackhi_epi16(step1[0], step1[1]);
+ const __m256i out_08_0 = _mm256_unpacklo_epi16(step1[2], step1[3]);
+ const __m256i out_08_1 = _mm256_unpackhi_epi16(step1[2], step1[3]);
+ const __m256i out_00_2 = _mm256_madd_epi16(out_00_0, k__cospi_p16_p16);
+ const __m256i out_00_3 = _mm256_madd_epi16(out_00_1, k__cospi_p16_p16);
+ const __m256i out_16_2 = _mm256_madd_epi16(out_00_0, k__cospi_p16_m16);
+ const __m256i out_16_3 = _mm256_madd_epi16(out_00_1, k__cospi_p16_m16);
+ const __m256i out_08_2 = _mm256_madd_epi16(out_08_0, k__cospi_p24_p08);
+ const __m256i out_08_3 = _mm256_madd_epi16(out_08_1, k__cospi_p24_p08);
+ const __m256i out_24_2 = _mm256_madd_epi16(out_08_0, k__cospi_m08_p24);
+ const __m256i out_24_3 = _mm256_madd_epi16(out_08_1, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m256i out_00_4 = _mm256_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_00_5 = _mm256_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_16_4 = _mm256_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_16_5 = _mm256_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_08_4 = _mm256_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_08_5 = _mm256_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_24_4 = _mm256_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_24_5 = _mm256_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_00_6 = _mm256_srai_epi32(out_00_4, DCT_CONST_BITS);
+ const __m256i out_00_7 = _mm256_srai_epi32(out_00_5, DCT_CONST_BITS);
+ const __m256i out_16_6 = _mm256_srai_epi32(out_16_4, DCT_CONST_BITS);
+ const __m256i out_16_7 = _mm256_srai_epi32(out_16_5, DCT_CONST_BITS);
+ const __m256i out_08_6 = _mm256_srai_epi32(out_08_4, DCT_CONST_BITS);
+ const __m256i out_08_7 = _mm256_srai_epi32(out_08_5, DCT_CONST_BITS);
+ const __m256i out_24_6 = _mm256_srai_epi32(out_24_4, DCT_CONST_BITS);
+ const __m256i out_24_7 = _mm256_srai_epi32(out_24_5, DCT_CONST_BITS);
+ // Combine
+ out[ 0] = _mm256_packs_epi32(out_00_6, out_00_7);
+ out[16] = _mm256_packs_epi32(out_16_6, out_16_7);
+ out[ 8] = _mm256_packs_epi32(out_08_6, out_08_7);
+ out[24] = _mm256_packs_epi32(out_24_6, out_24_7);
+ }
+ {
+ const __m256i s2_09_0 = _mm256_unpacklo_epi16(step1[ 9], step1[14]);
+ const __m256i s2_09_1 = _mm256_unpackhi_epi16(step1[ 9], step1[14]);
+ const __m256i s2_10_0 = _mm256_unpacklo_epi16(step1[10], step1[13]);
+ const __m256i s2_10_1 = _mm256_unpackhi_epi16(step1[10], step1[13]);
+ const __m256i s2_09_2 = _mm256_madd_epi16(s2_09_0, k__cospi_m08_p24);
+ const __m256i s2_09_3 = _mm256_madd_epi16(s2_09_1, k__cospi_m08_p24);
+ const __m256i s2_10_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m24_m08);
+ const __m256i s2_10_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m24_m08);
+ const __m256i s2_13_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m08_p24);
+ const __m256i s2_13_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m08_p24);
+ const __m256i s2_14_2 = _mm256_madd_epi16(s2_09_0, k__cospi_p24_p08);
+ const __m256i s2_14_3 = _mm256_madd_epi16(s2_09_1, k__cospi_p24_p08);
+ // dct_const_round_shift
+ const __m256i s2_09_4 = _mm256_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_09_5 = _mm256_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_10_4 = _mm256_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_10_5 = _mm256_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_13_4 = _mm256_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_13_5 = _mm256_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_14_4 = _mm256_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_14_5 = _mm256_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_09_6 = _mm256_srai_epi32(s2_09_4, DCT_CONST_BITS);
+ const __m256i s2_09_7 = _mm256_srai_epi32(s2_09_5, DCT_CONST_BITS);
+ const __m256i s2_10_6 = _mm256_srai_epi32(s2_10_4, DCT_CONST_BITS);
+ const __m256i s2_10_7 = _mm256_srai_epi32(s2_10_5, DCT_CONST_BITS);
+ const __m256i s2_13_6 = _mm256_srai_epi32(s2_13_4, DCT_CONST_BITS);
+ const __m256i s2_13_7 = _mm256_srai_epi32(s2_13_5, DCT_CONST_BITS);
+ const __m256i s2_14_6 = _mm256_srai_epi32(s2_14_4, DCT_CONST_BITS);
+ const __m256i s2_14_7 = _mm256_srai_epi32(s2_14_5, DCT_CONST_BITS);
+ // Combine
+ step2[ 9] = _mm256_packs_epi32(s2_09_6, s2_09_7);
+ step2[10] = _mm256_packs_epi32(s2_10_6, s2_10_7);
+ step2[13] = _mm256_packs_epi32(s2_13_6, s2_13_7);
+ step2[14] = _mm256_packs_epi32(s2_14_6, s2_14_7);
+ }
+ {
+ step2[16] = _mm256_add_epi16(step1[19], step3[16]);
+ step2[17] = _mm256_add_epi16(step1[18], step3[17]);
+ step2[18] = _mm256_sub_epi16(step3[17], step1[18]);
+ step2[19] = _mm256_sub_epi16(step3[16], step1[19]);
+ step2[20] = _mm256_sub_epi16(step3[23], step1[20]);
+ step2[21] = _mm256_sub_epi16(step3[22], step1[21]);
+ step2[22] = _mm256_add_epi16(step1[21], step3[22]);
+ step2[23] = _mm256_add_epi16(step1[20], step3[23]);
+ step2[24] = _mm256_add_epi16(step1[27], step3[24]);
+ step2[25] = _mm256_add_epi16(step1[26], step3[25]);
+ step2[26] = _mm256_sub_epi16(step3[25], step1[26]);
+ step2[27] = _mm256_sub_epi16(step3[24], step1[27]);
+ step2[28] = _mm256_sub_epi16(step3[31], step1[28]);
+ step2[29] = _mm256_sub_epi16(step3[30], step1[29]);
+ step2[30] = _mm256_add_epi16(step1[29], step3[30]);
+ step2[31] = _mm256_add_epi16(step1[28], step3[31]);
+ }
+ // Stage 6
+ {
+ const __m256i out_04_0 = _mm256_unpacklo_epi16(step2[4], step2[7]);
+ const __m256i out_04_1 = _mm256_unpackhi_epi16(step2[4], step2[7]);
+ const __m256i out_20_0 = _mm256_unpacklo_epi16(step2[5], step2[6]);
+ const __m256i out_20_1 = _mm256_unpackhi_epi16(step2[5], step2[6]);
+ const __m256i out_12_0 = _mm256_unpacklo_epi16(step2[5], step2[6]);
+ const __m256i out_12_1 = _mm256_unpackhi_epi16(step2[5], step2[6]);
+ const __m256i out_28_0 = _mm256_unpacklo_epi16(step2[4], step2[7]);
+ const __m256i out_28_1 = _mm256_unpackhi_epi16(step2[4], step2[7]);
+ const __m256i out_04_2 = _mm256_madd_epi16(out_04_0, k__cospi_p28_p04);
+ const __m256i out_04_3 = _mm256_madd_epi16(out_04_1, k__cospi_p28_p04);
+ const __m256i out_20_2 = _mm256_madd_epi16(out_20_0, k__cospi_p12_p20);
+ const __m256i out_20_3 = _mm256_madd_epi16(out_20_1, k__cospi_p12_p20);
+ const __m256i out_12_2 = _mm256_madd_epi16(out_12_0, k__cospi_m20_p12);
+ const __m256i out_12_3 = _mm256_madd_epi16(out_12_1, k__cospi_m20_p12);
+ const __m256i out_28_2 = _mm256_madd_epi16(out_28_0, k__cospi_m04_p28);
+ const __m256i out_28_3 = _mm256_madd_epi16(out_28_1, k__cospi_m04_p28);
+ // dct_const_round_shift
+ const __m256i out_04_4 = _mm256_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_04_5 = _mm256_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_20_4 = _mm256_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_20_5 = _mm256_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_12_4 = _mm256_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_12_5 = _mm256_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_28_4 = _mm256_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_28_5 = _mm256_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_04_6 = _mm256_srai_epi32(out_04_4, DCT_CONST_BITS);
+ const __m256i out_04_7 = _mm256_srai_epi32(out_04_5, DCT_CONST_BITS);
+ const __m256i out_20_6 = _mm256_srai_epi32(out_20_4, DCT_CONST_BITS);
+ const __m256i out_20_7 = _mm256_srai_epi32(out_20_5, DCT_CONST_BITS);
+ const __m256i out_12_6 = _mm256_srai_epi32(out_12_4, DCT_CONST_BITS);
+ const __m256i out_12_7 = _mm256_srai_epi32(out_12_5, DCT_CONST_BITS);
+ const __m256i out_28_6 = _mm256_srai_epi32(out_28_4, DCT_CONST_BITS);
+ const __m256i out_28_7 = _mm256_srai_epi32(out_28_5, DCT_CONST_BITS);
+ // Combine
+ out[ 4] = _mm256_packs_epi32(out_04_6, out_04_7);
+ out[20] = _mm256_packs_epi32(out_20_6, out_20_7);
+ out[12] = _mm256_packs_epi32(out_12_6, out_12_7);
+ out[28] = _mm256_packs_epi32(out_28_6, out_28_7);
+ }
+ {
+ step3[ 8] = _mm256_add_epi16(step2[ 9], step1[ 8]);
+ step3[ 9] = _mm256_sub_epi16(step1[ 8], step2[ 9]);
+ step3[10] = _mm256_sub_epi16(step1[11], step2[10]);
+ step3[11] = _mm256_add_epi16(step2[10], step1[11]);
+ step3[12] = _mm256_add_epi16(step2[13], step1[12]);
+ step3[13] = _mm256_sub_epi16(step1[12], step2[13]);
+ step3[14] = _mm256_sub_epi16(step1[15], step2[14]);
+ step3[15] = _mm256_add_epi16(step2[14], step1[15]);
+ }
+ {
+ const __m256i s3_17_0 = _mm256_unpacklo_epi16(step2[17], step2[30]);
+ const __m256i s3_17_1 = _mm256_unpackhi_epi16(step2[17], step2[30]);
+ const __m256i s3_18_0 = _mm256_unpacklo_epi16(step2[18], step2[29]);
+ const __m256i s3_18_1 = _mm256_unpackhi_epi16(step2[18], step2[29]);
+ const __m256i s3_21_0 = _mm256_unpacklo_epi16(step2[21], step2[26]);
+ const __m256i s3_21_1 = _mm256_unpackhi_epi16(step2[21], step2[26]);
+ const __m256i s3_22_0 = _mm256_unpacklo_epi16(step2[22], step2[25]);
+ const __m256i s3_22_1 = _mm256_unpackhi_epi16(step2[22], step2[25]);
+ const __m256i s3_17_2 = _mm256_madd_epi16(s3_17_0, k__cospi_m04_p28);
+ const __m256i s3_17_3 = _mm256_madd_epi16(s3_17_1, k__cospi_m04_p28);
+ const __m256i s3_18_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m28_m04);
+ const __m256i s3_18_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m28_m04);
+ const __m256i s3_21_2 = _mm256_madd_epi16(s3_21_0, k__cospi_m20_p12);
+ const __m256i s3_21_3 = _mm256_madd_epi16(s3_21_1, k__cospi_m20_p12);
+ const __m256i s3_22_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m12_m20);
+ const __m256i s3_22_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m12_m20);
+ const __m256i s3_25_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m20_p12);
+ const __m256i s3_25_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m20_p12);
+ const __m256i s3_26_2 = _mm256_madd_epi16(s3_21_0, k__cospi_p12_p20);
+ const __m256i s3_26_3 = _mm256_madd_epi16(s3_21_1, k__cospi_p12_p20);
+ const __m256i s3_29_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m04_p28);
+ const __m256i s3_29_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m04_p28);
+ const __m256i s3_30_2 = _mm256_madd_epi16(s3_17_0, k__cospi_p28_p04);
+ const __m256i s3_30_3 = _mm256_madd_epi16(s3_17_1, k__cospi_p28_p04);
+ // dct_const_round_shift
+ const __m256i s3_17_4 = _mm256_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_17_5 = _mm256_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_18_4 = _mm256_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_18_5 = _mm256_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_21_4 = _mm256_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_21_5 = _mm256_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_22_4 = _mm256_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_22_5 = _mm256_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_17_6 = _mm256_srai_epi32(s3_17_4, DCT_CONST_BITS);
+ const __m256i s3_17_7 = _mm256_srai_epi32(s3_17_5, DCT_CONST_BITS);
+ const __m256i s3_18_6 = _mm256_srai_epi32(s3_18_4, DCT_CONST_BITS);
+ const __m256i s3_18_7 = _mm256_srai_epi32(s3_18_5, DCT_CONST_BITS);
+ const __m256i s3_21_6 = _mm256_srai_epi32(s3_21_4, DCT_CONST_BITS);
+ const __m256i s3_21_7 = _mm256_srai_epi32(s3_21_5, DCT_CONST_BITS);
+ const __m256i s3_22_6 = _mm256_srai_epi32(s3_22_4, DCT_CONST_BITS);
+ const __m256i s3_22_7 = _mm256_srai_epi32(s3_22_5, DCT_CONST_BITS);
+ const __m256i s3_25_4 = _mm256_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_25_5 = _mm256_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_26_4 = _mm256_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_26_5 = _mm256_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_29_4 = _mm256_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_29_5 = _mm256_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_30_4 = _mm256_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_30_5 = _mm256_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_25_6 = _mm256_srai_epi32(s3_25_4, DCT_CONST_BITS);
+ const __m256i s3_25_7 = _mm256_srai_epi32(s3_25_5, DCT_CONST_BITS);
+ const __m256i s3_26_6 = _mm256_srai_epi32(s3_26_4, DCT_CONST_BITS);
+ const __m256i s3_26_7 = _mm256_srai_epi32(s3_26_5, DCT_CONST_BITS);
+ const __m256i s3_29_6 = _mm256_srai_epi32(s3_29_4, DCT_CONST_BITS);
+ const __m256i s3_29_7 = _mm256_srai_epi32(s3_29_5, DCT_CONST_BITS);
+ const __m256i s3_30_6 = _mm256_srai_epi32(s3_30_4, DCT_CONST_BITS);
+ const __m256i s3_30_7 = _mm256_srai_epi32(s3_30_5, DCT_CONST_BITS);
+ // Combine
+ step3[17] = _mm256_packs_epi32(s3_17_6, s3_17_7);
+ step3[18] = _mm256_packs_epi32(s3_18_6, s3_18_7);
+ step3[21] = _mm256_packs_epi32(s3_21_6, s3_21_7);
+ step3[22] = _mm256_packs_epi32(s3_22_6, s3_22_7);
+ // Combine
+ step3[25] = _mm256_packs_epi32(s3_25_6, s3_25_7);
+ step3[26] = _mm256_packs_epi32(s3_26_6, s3_26_7);
+ step3[29] = _mm256_packs_epi32(s3_29_6, s3_29_7);
+ step3[30] = _mm256_packs_epi32(s3_30_6, s3_30_7);
+ }
+ // Stage 7
+ {
+ const __m256i out_02_0 = _mm256_unpacklo_epi16(step3[ 8], step3[15]);
+ const __m256i out_02_1 = _mm256_unpackhi_epi16(step3[ 8], step3[15]);
+ const __m256i out_18_0 = _mm256_unpacklo_epi16(step3[ 9], step3[14]);
+ const __m256i out_18_1 = _mm256_unpackhi_epi16(step3[ 9], step3[14]);
+ const __m256i out_10_0 = _mm256_unpacklo_epi16(step3[10], step3[13]);
+ const __m256i out_10_1 = _mm256_unpackhi_epi16(step3[10], step3[13]);
+ const __m256i out_26_0 = _mm256_unpacklo_epi16(step3[11], step3[12]);
+ const __m256i out_26_1 = _mm256_unpackhi_epi16(step3[11], step3[12]);
+ const __m256i out_02_2 = _mm256_madd_epi16(out_02_0, k__cospi_p30_p02);
+ const __m256i out_02_3 = _mm256_madd_epi16(out_02_1, k__cospi_p30_p02);
+ const __m256i out_18_2 = _mm256_madd_epi16(out_18_0, k__cospi_p14_p18);
+ const __m256i out_18_3 = _mm256_madd_epi16(out_18_1, k__cospi_p14_p18);
+ const __m256i out_10_2 = _mm256_madd_epi16(out_10_0, k__cospi_p22_p10);
+ const __m256i out_10_3 = _mm256_madd_epi16(out_10_1, k__cospi_p22_p10);
+ const __m256i out_26_2 = _mm256_madd_epi16(out_26_0, k__cospi_p06_p26);
+ const __m256i out_26_3 = _mm256_madd_epi16(out_26_1, k__cospi_p06_p26);
+ const __m256i out_06_2 = _mm256_madd_epi16(out_26_0, k__cospi_m26_p06);
+ const __m256i out_06_3 = _mm256_madd_epi16(out_26_1, k__cospi_m26_p06);
+ const __m256i out_22_2 = _mm256_madd_epi16(out_10_0, k__cospi_m10_p22);
+ const __m256i out_22_3 = _mm256_madd_epi16(out_10_1, k__cospi_m10_p22);
+ const __m256i out_14_2 = _mm256_madd_epi16(out_18_0, k__cospi_m18_p14);
+ const __m256i out_14_3 = _mm256_madd_epi16(out_18_1, k__cospi_m18_p14);
+ const __m256i out_30_2 = _mm256_madd_epi16(out_02_0, k__cospi_m02_p30);
+ const __m256i out_30_3 = _mm256_madd_epi16(out_02_1, k__cospi_m02_p30);
+ // dct_const_round_shift
+ const __m256i out_02_4 = _mm256_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_02_5 = _mm256_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_18_4 = _mm256_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_18_5 = _mm256_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_10_4 = _mm256_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_10_5 = _mm256_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_26_4 = _mm256_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_26_5 = _mm256_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_06_4 = _mm256_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_06_5 = _mm256_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_22_4 = _mm256_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_22_5 = _mm256_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_14_4 = _mm256_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_14_5 = _mm256_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_30_4 = _mm256_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_30_5 = _mm256_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_02_6 = _mm256_srai_epi32(out_02_4, DCT_CONST_BITS);
+ const __m256i out_02_7 = _mm256_srai_epi32(out_02_5, DCT_CONST_BITS);
+ const __m256i out_18_6 = _mm256_srai_epi32(out_18_4, DCT_CONST_BITS);
+ const __m256i out_18_7 = _mm256_srai_epi32(out_18_5, DCT_CONST_BITS);
+ const __m256i out_10_6 = _mm256_srai_epi32(out_10_4, DCT_CONST_BITS);
+ const __m256i out_10_7 = _mm256_srai_epi32(out_10_5, DCT_CONST_BITS);
+ const __m256i out_26_6 = _mm256_srai_epi32(out_26_4, DCT_CONST_BITS);
+ const __m256i out_26_7 = _mm256_srai_epi32(out_26_5, DCT_CONST_BITS);
+ const __m256i out_06_6 = _mm256_srai_epi32(out_06_4, DCT_CONST_BITS);
+ const __m256i out_06_7 = _mm256_srai_epi32(out_06_5, DCT_CONST_BITS);
+ const __m256i out_22_6 = _mm256_srai_epi32(out_22_4, DCT_CONST_BITS);
+ const __m256i out_22_7 = _mm256_srai_epi32(out_22_5, DCT_CONST_BITS);
+ const __m256i out_14_6 = _mm256_srai_epi32(out_14_4, DCT_CONST_BITS);
+ const __m256i out_14_7 = _mm256_srai_epi32(out_14_5, DCT_CONST_BITS);
+ const __m256i out_30_6 = _mm256_srai_epi32(out_30_4, DCT_CONST_BITS);
+ const __m256i out_30_7 = _mm256_srai_epi32(out_30_5, DCT_CONST_BITS);
+ // Combine
+ out[ 2] = _mm256_packs_epi32(out_02_6, out_02_7);
+ out[18] = _mm256_packs_epi32(out_18_6, out_18_7);
+ out[10] = _mm256_packs_epi32(out_10_6, out_10_7);
+ out[26] = _mm256_packs_epi32(out_26_6, out_26_7);
+ out[ 6] = _mm256_packs_epi32(out_06_6, out_06_7);
+ out[22] = _mm256_packs_epi32(out_22_6, out_22_7);
+ out[14] = _mm256_packs_epi32(out_14_6, out_14_7);
+ out[30] = _mm256_packs_epi32(out_30_6, out_30_7);
+ }
+ {
+ step1[16] = _mm256_add_epi16(step3[17], step2[16]);
+ step1[17] = _mm256_sub_epi16(step2[16], step3[17]);
+ step1[18] = _mm256_sub_epi16(step2[19], step3[18]);
+ step1[19] = _mm256_add_epi16(step3[18], step2[19]);
+ step1[20] = _mm256_add_epi16(step3[21], step2[20]);
+ step1[21] = _mm256_sub_epi16(step2[20], step3[21]);
+ step1[22] = _mm256_sub_epi16(step2[23], step3[22]);
+ step1[23] = _mm256_add_epi16(step3[22], step2[23]);
+ step1[24] = _mm256_add_epi16(step3[25], step2[24]);
+ step1[25] = _mm256_sub_epi16(step2[24], step3[25]);
+ step1[26] = _mm256_sub_epi16(step2[27], step3[26]);
+ step1[27] = _mm256_add_epi16(step3[26], step2[27]);
+ step1[28] = _mm256_add_epi16(step3[29], step2[28]);
+ step1[29] = _mm256_sub_epi16(step2[28], step3[29]);
+ step1[30] = _mm256_sub_epi16(step2[31], step3[30]);
+ step1[31] = _mm256_add_epi16(step3[30], step2[31]);
+ }
+    // Final stage --- output indices are bit-reversed.
+ {
+ const __m256i out_01_0 = _mm256_unpacklo_epi16(step1[16], step1[31]);
+ const __m256i out_01_1 = _mm256_unpackhi_epi16(step1[16], step1[31]);
+ const __m256i out_17_0 = _mm256_unpacklo_epi16(step1[17], step1[30]);
+ const __m256i out_17_1 = _mm256_unpackhi_epi16(step1[17], step1[30]);
+ const __m256i out_09_0 = _mm256_unpacklo_epi16(step1[18], step1[29]);
+ const __m256i out_09_1 = _mm256_unpackhi_epi16(step1[18], step1[29]);
+ const __m256i out_25_0 = _mm256_unpacklo_epi16(step1[19], step1[28]);
+ const __m256i out_25_1 = _mm256_unpackhi_epi16(step1[19], step1[28]);
+ const __m256i out_01_2 = _mm256_madd_epi16(out_01_0, k__cospi_p31_p01);
+ const __m256i out_01_3 = _mm256_madd_epi16(out_01_1, k__cospi_p31_p01);
+ const __m256i out_17_2 = _mm256_madd_epi16(out_17_0, k__cospi_p15_p17);
+ const __m256i out_17_3 = _mm256_madd_epi16(out_17_1, k__cospi_p15_p17);
+ const __m256i out_09_2 = _mm256_madd_epi16(out_09_0, k__cospi_p23_p09);
+ const __m256i out_09_3 = _mm256_madd_epi16(out_09_1, k__cospi_p23_p09);
+ const __m256i out_25_2 = _mm256_madd_epi16(out_25_0, k__cospi_p07_p25);
+ const __m256i out_25_3 = _mm256_madd_epi16(out_25_1, k__cospi_p07_p25);
+ const __m256i out_07_2 = _mm256_madd_epi16(out_25_0, k__cospi_m25_p07);
+ const __m256i out_07_3 = _mm256_madd_epi16(out_25_1, k__cospi_m25_p07);
+ const __m256i out_23_2 = _mm256_madd_epi16(out_09_0, k__cospi_m09_p23);
+ const __m256i out_23_3 = _mm256_madd_epi16(out_09_1, k__cospi_m09_p23);
+ const __m256i out_15_2 = _mm256_madd_epi16(out_17_0, k__cospi_m17_p15);
+ const __m256i out_15_3 = _mm256_madd_epi16(out_17_1, k__cospi_m17_p15);
+ const __m256i out_31_2 = _mm256_madd_epi16(out_01_0, k__cospi_m01_p31);
+ const __m256i out_31_3 = _mm256_madd_epi16(out_01_1, k__cospi_m01_p31);
+ // dct_const_round_shift
+ const __m256i out_01_4 = _mm256_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_01_5 = _mm256_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_17_4 = _mm256_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_17_5 = _mm256_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_09_4 = _mm256_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_09_5 = _mm256_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_25_4 = _mm256_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_25_5 = _mm256_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_07_4 = _mm256_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_07_5 = _mm256_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_23_4 = _mm256_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_23_5 = _mm256_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_15_4 = _mm256_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_15_5 = _mm256_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_31_4 = _mm256_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_31_5 = _mm256_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_01_6 = _mm256_srai_epi32(out_01_4, DCT_CONST_BITS);
+ const __m256i out_01_7 = _mm256_srai_epi32(out_01_5, DCT_CONST_BITS);
+ const __m256i out_17_6 = _mm256_srai_epi32(out_17_4, DCT_CONST_BITS);
+ const __m256i out_17_7 = _mm256_srai_epi32(out_17_5, DCT_CONST_BITS);
+ const __m256i out_09_6 = _mm256_srai_epi32(out_09_4, DCT_CONST_BITS);
+ const __m256i out_09_7 = _mm256_srai_epi32(out_09_5, DCT_CONST_BITS);
+ const __m256i out_25_6 = _mm256_srai_epi32(out_25_4, DCT_CONST_BITS);
+ const __m256i out_25_7 = _mm256_srai_epi32(out_25_5, DCT_CONST_BITS);
+ const __m256i out_07_6 = _mm256_srai_epi32(out_07_4, DCT_CONST_BITS);
+ const __m256i out_07_7 = _mm256_srai_epi32(out_07_5, DCT_CONST_BITS);
+ const __m256i out_23_6 = _mm256_srai_epi32(out_23_4, DCT_CONST_BITS);
+ const __m256i out_23_7 = _mm256_srai_epi32(out_23_5, DCT_CONST_BITS);
+ const __m256i out_15_6 = _mm256_srai_epi32(out_15_4, DCT_CONST_BITS);
+ const __m256i out_15_7 = _mm256_srai_epi32(out_15_5, DCT_CONST_BITS);
+ const __m256i out_31_6 = _mm256_srai_epi32(out_31_4, DCT_CONST_BITS);
+ const __m256i out_31_7 = _mm256_srai_epi32(out_31_5, DCT_CONST_BITS);
+ // Combine
+ out[ 1] = _mm256_packs_epi32(out_01_6, out_01_7);
+ out[17] = _mm256_packs_epi32(out_17_6, out_17_7);
+ out[ 9] = _mm256_packs_epi32(out_09_6, out_09_7);
+ out[25] = _mm256_packs_epi32(out_25_6, out_25_7);
+ out[ 7] = _mm256_packs_epi32(out_07_6, out_07_7);
+ out[23] = _mm256_packs_epi32(out_23_6, out_23_7);
+ out[15] = _mm256_packs_epi32(out_15_6, out_15_7);
+ out[31] = _mm256_packs_epi32(out_31_6, out_31_7);
+ }
+ {
+ const __m256i out_05_0 = _mm256_unpacklo_epi16(step1[20], step1[27]);
+ const __m256i out_05_1 = _mm256_unpackhi_epi16(step1[20], step1[27]);
+ const __m256i out_21_0 = _mm256_unpacklo_epi16(step1[21], step1[26]);
+ const __m256i out_21_1 = _mm256_unpackhi_epi16(step1[21], step1[26]);
+ const __m256i out_13_0 = _mm256_unpacklo_epi16(step1[22], step1[25]);
+ const __m256i out_13_1 = _mm256_unpackhi_epi16(step1[22], step1[25]);
+ const __m256i out_29_0 = _mm256_unpacklo_epi16(step1[23], step1[24]);
+ const __m256i out_29_1 = _mm256_unpackhi_epi16(step1[23], step1[24]);
+ const __m256i out_05_2 = _mm256_madd_epi16(out_05_0, k__cospi_p27_p05);
+ const __m256i out_05_3 = _mm256_madd_epi16(out_05_1, k__cospi_p27_p05);
+ const __m256i out_21_2 = _mm256_madd_epi16(out_21_0, k__cospi_p11_p21);
+ const __m256i out_21_3 = _mm256_madd_epi16(out_21_1, k__cospi_p11_p21);
+ const __m256i out_13_2 = _mm256_madd_epi16(out_13_0, k__cospi_p19_p13);
+ const __m256i out_13_3 = _mm256_madd_epi16(out_13_1, k__cospi_p19_p13);
+ const __m256i out_29_2 = _mm256_madd_epi16(out_29_0, k__cospi_p03_p29);
+ const __m256i out_29_3 = _mm256_madd_epi16(out_29_1, k__cospi_p03_p29);
+ const __m256i out_03_2 = _mm256_madd_epi16(out_29_0, k__cospi_m29_p03);
+ const __m256i out_03_3 = _mm256_madd_epi16(out_29_1, k__cospi_m29_p03);
+ const __m256i out_19_2 = _mm256_madd_epi16(out_13_0, k__cospi_m13_p19);
+ const __m256i out_19_3 = _mm256_madd_epi16(out_13_1, k__cospi_m13_p19);
+ const __m256i out_11_2 = _mm256_madd_epi16(out_21_0, k__cospi_m21_p11);
+ const __m256i out_11_3 = _mm256_madd_epi16(out_21_1, k__cospi_m21_p11);
+ const __m256i out_27_2 = _mm256_madd_epi16(out_05_0, k__cospi_m05_p27);
+ const __m256i out_27_3 = _mm256_madd_epi16(out_05_1, k__cospi_m05_p27);
+ // dct_const_round_shift
+ const __m256i out_05_4 = _mm256_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_05_5 = _mm256_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_21_4 = _mm256_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_21_5 = _mm256_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_13_4 = _mm256_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_13_5 = _mm256_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_29_4 = _mm256_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_29_5 = _mm256_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_03_4 = _mm256_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_03_5 = _mm256_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_19_4 = _mm256_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_19_5 = _mm256_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_11_4 = _mm256_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_11_5 = _mm256_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_27_4 = _mm256_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_27_5 = _mm256_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_05_6 = _mm256_srai_epi32(out_05_4, DCT_CONST_BITS);
+ const __m256i out_05_7 = _mm256_srai_epi32(out_05_5, DCT_CONST_BITS);
+ const __m256i out_21_6 = _mm256_srai_epi32(out_21_4, DCT_CONST_BITS);
+ const __m256i out_21_7 = _mm256_srai_epi32(out_21_5, DCT_CONST_BITS);
+ const __m256i out_13_6 = _mm256_srai_epi32(out_13_4, DCT_CONST_BITS);
+ const __m256i out_13_7 = _mm256_srai_epi32(out_13_5, DCT_CONST_BITS);
+ const __m256i out_29_6 = _mm256_srai_epi32(out_29_4, DCT_CONST_BITS);
+ const __m256i out_29_7 = _mm256_srai_epi32(out_29_5, DCT_CONST_BITS);
+ const __m256i out_03_6 = _mm256_srai_epi32(out_03_4, DCT_CONST_BITS);
+ const __m256i out_03_7 = _mm256_srai_epi32(out_03_5, DCT_CONST_BITS);
+ const __m256i out_19_6 = _mm256_srai_epi32(out_19_4, DCT_CONST_BITS);
+ const __m256i out_19_7 = _mm256_srai_epi32(out_19_5, DCT_CONST_BITS);
+ const __m256i out_11_6 = _mm256_srai_epi32(out_11_4, DCT_CONST_BITS);
+ const __m256i out_11_7 = _mm256_srai_epi32(out_11_5, DCT_CONST_BITS);
+ const __m256i out_27_6 = _mm256_srai_epi32(out_27_4, DCT_CONST_BITS);
+ const __m256i out_27_7 = _mm256_srai_epi32(out_27_5, DCT_CONST_BITS);
+ // Combine
+ out[ 5] = _mm256_packs_epi32(out_05_6, out_05_7);
+ out[21] = _mm256_packs_epi32(out_21_6, out_21_7);
+ out[13] = _mm256_packs_epi32(out_13_6, out_13_7);
+ out[29] = _mm256_packs_epi32(out_29_6, out_29_7);
+ out[ 3] = _mm256_packs_epi32(out_03_6, out_03_7);
+ out[19] = _mm256_packs_epi32(out_19_6, out_19_7);
+ out[11] = _mm256_packs_epi32(out_11_6, out_11_7);
+ out[27] = _mm256_packs_epi32(out_27_6, out_27_7);
+ }
+#if FDCT32x32_HIGH_PRECISION
+ } else {
+ __m256i lstep1[64], lstep2[64], lstep3[64];
+ __m256i u[32], v[32], sign[16];
+ const __m256i K32One = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
+ // start using 32-bit operations
+    // Stage 3
+ {
+      // expanding to 32-bit values prior to the addition operations
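+      // Unpacking against kZero and then _mm256_madd_epi16 with kOne
+      // sign-extends each 16-bit lane into a 32-bit lane (madd is a signed
+      // multiply, so x * 1 + 0 * 1 reproduces x with its sign).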
+ lstep2[ 0] = _mm256_unpacklo_epi16(step2[ 0], kZero);
+ lstep2[ 1] = _mm256_unpackhi_epi16(step2[ 0], kZero);
+ lstep2[ 2] = _mm256_unpacklo_epi16(step2[ 1], kZero);
+ lstep2[ 3] = _mm256_unpackhi_epi16(step2[ 1], kZero);
+ lstep2[ 4] = _mm256_unpacklo_epi16(step2[ 2], kZero);
+ lstep2[ 5] = _mm256_unpackhi_epi16(step2[ 2], kZero);
+ lstep2[ 6] = _mm256_unpacklo_epi16(step2[ 3], kZero);
+ lstep2[ 7] = _mm256_unpackhi_epi16(step2[ 3], kZero);
+ lstep2[ 8] = _mm256_unpacklo_epi16(step2[ 4], kZero);
+ lstep2[ 9] = _mm256_unpackhi_epi16(step2[ 4], kZero);
+ lstep2[10] = _mm256_unpacklo_epi16(step2[ 5], kZero);
+ lstep2[11] = _mm256_unpackhi_epi16(step2[ 5], kZero);
+ lstep2[12] = _mm256_unpacklo_epi16(step2[ 6], kZero);
+ lstep2[13] = _mm256_unpackhi_epi16(step2[ 6], kZero);
+ lstep2[14] = _mm256_unpacklo_epi16(step2[ 7], kZero);
+ lstep2[15] = _mm256_unpackhi_epi16(step2[ 7], kZero);
+ lstep2[ 0] = _mm256_madd_epi16(lstep2[ 0], kOne);
+ lstep2[ 1] = _mm256_madd_epi16(lstep2[ 1], kOne);
+ lstep2[ 2] = _mm256_madd_epi16(lstep2[ 2], kOne);
+ lstep2[ 3] = _mm256_madd_epi16(lstep2[ 3], kOne);
+ lstep2[ 4] = _mm256_madd_epi16(lstep2[ 4], kOne);
+ lstep2[ 5] = _mm256_madd_epi16(lstep2[ 5], kOne);
+ lstep2[ 6] = _mm256_madd_epi16(lstep2[ 6], kOne);
+ lstep2[ 7] = _mm256_madd_epi16(lstep2[ 7], kOne);
+ lstep2[ 8] = _mm256_madd_epi16(lstep2[ 8], kOne);
+ lstep2[ 9] = _mm256_madd_epi16(lstep2[ 9], kOne);
+ lstep2[10] = _mm256_madd_epi16(lstep2[10], kOne);
+ lstep2[11] = _mm256_madd_epi16(lstep2[11], kOne);
+ lstep2[12] = _mm256_madd_epi16(lstep2[12], kOne);
+ lstep2[13] = _mm256_madd_epi16(lstep2[13], kOne);
+ lstep2[14] = _mm256_madd_epi16(lstep2[14], kOne);
+ lstep2[15] = _mm256_madd_epi16(lstep2[15], kOne);
+
+ lstep3[ 0] = _mm256_add_epi32(lstep2[14], lstep2[ 0]);
+ lstep3[ 1] = _mm256_add_epi32(lstep2[15], lstep2[ 1]);
+ lstep3[ 2] = _mm256_add_epi32(lstep2[12], lstep2[ 2]);
+ lstep3[ 3] = _mm256_add_epi32(lstep2[13], lstep2[ 3]);
+ lstep3[ 4] = _mm256_add_epi32(lstep2[10], lstep2[ 4]);
+ lstep3[ 5] = _mm256_add_epi32(lstep2[11], lstep2[ 5]);
+ lstep3[ 6] = _mm256_add_epi32(lstep2[ 8], lstep2[ 6]);
+ lstep3[ 7] = _mm256_add_epi32(lstep2[ 9], lstep2[ 7]);
+ lstep3[ 8] = _mm256_sub_epi32(lstep2[ 6], lstep2[ 8]);
+ lstep3[ 9] = _mm256_sub_epi32(lstep2[ 7], lstep2[ 9]);
+ lstep3[10] = _mm256_sub_epi32(lstep2[ 4], lstep2[10]);
+ lstep3[11] = _mm256_sub_epi32(lstep2[ 5], lstep2[11]);
+ lstep3[12] = _mm256_sub_epi32(lstep2[ 2], lstep2[12]);
+ lstep3[13] = _mm256_sub_epi32(lstep2[ 3], lstep2[13]);
+ lstep3[14] = _mm256_sub_epi32(lstep2[ 0], lstep2[14]);
+ lstep3[15] = _mm256_sub_epi32(lstep2[ 1], lstep2[15]);
+ }
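+    // Same cospi_16_64 rotation as the 16-bit Stage 3 above, but the 32-bit
+    // results are stored directly into lstep3[20..27] without re-packing.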
+ {
+ const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]);
+ const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]);
+ const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]);
+ const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]);
+ const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16);
+ const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16);
+ const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16);
+ const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16);
+ const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16);
+ const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16);
+ const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16);
+ const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m256i s3_10_4 = _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_10_5 = _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_11_4 = _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_11_5 = _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_12_4 = _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_12_5 = _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_13_4 = _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_13_5 = _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+ lstep3[20] = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS);
+ lstep3[21] = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS);
+ lstep3[22] = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS);
+ lstep3[23] = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS);
+ lstep3[24] = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS);
+ lstep3[25] = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS);
+ lstep3[26] = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS);
+ lstep3[27] = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS);
+ }
+ {
+ lstep2[40] = _mm256_unpacklo_epi16(step2[20], kZero);
+ lstep2[41] = _mm256_unpackhi_epi16(step2[20], kZero);
+ lstep2[42] = _mm256_unpacklo_epi16(step2[21], kZero);
+ lstep2[43] = _mm256_unpackhi_epi16(step2[21], kZero);
+ lstep2[44] = _mm256_unpacklo_epi16(step2[22], kZero);
+ lstep2[45] = _mm256_unpackhi_epi16(step2[22], kZero);
+ lstep2[46] = _mm256_unpacklo_epi16(step2[23], kZero);
+ lstep2[47] = _mm256_unpackhi_epi16(step2[23], kZero);
+ lstep2[48] = _mm256_unpacklo_epi16(step2[24], kZero);
+ lstep2[49] = _mm256_unpackhi_epi16(step2[24], kZero);
+ lstep2[50] = _mm256_unpacklo_epi16(step2[25], kZero);
+ lstep2[51] = _mm256_unpackhi_epi16(step2[25], kZero);
+ lstep2[52] = _mm256_unpacklo_epi16(step2[26], kZero);
+ lstep2[53] = _mm256_unpackhi_epi16(step2[26], kZero);
+ lstep2[54] = _mm256_unpacklo_epi16(step2[27], kZero);
+ lstep2[55] = _mm256_unpackhi_epi16(step2[27], kZero);
+ lstep2[40] = _mm256_madd_epi16(lstep2[40], kOne);
+ lstep2[41] = _mm256_madd_epi16(lstep2[41], kOne);
+ lstep2[42] = _mm256_madd_epi16(lstep2[42], kOne);
+ lstep2[43] = _mm256_madd_epi16(lstep2[43], kOne);
+ lstep2[44] = _mm256_madd_epi16(lstep2[44], kOne);
+ lstep2[45] = _mm256_madd_epi16(lstep2[45], kOne);
+ lstep2[46] = _mm256_madd_epi16(lstep2[46], kOne);
+ lstep2[47] = _mm256_madd_epi16(lstep2[47], kOne);
+ lstep2[48] = _mm256_madd_epi16(lstep2[48], kOne);
+ lstep2[49] = _mm256_madd_epi16(lstep2[49], kOne);
+ lstep2[50] = _mm256_madd_epi16(lstep2[50], kOne);
+ lstep2[51] = _mm256_madd_epi16(lstep2[51], kOne);
+ lstep2[52] = _mm256_madd_epi16(lstep2[52], kOne);
+ lstep2[53] = _mm256_madd_epi16(lstep2[53], kOne);
+ lstep2[54] = _mm256_madd_epi16(lstep2[54], kOne);
+ lstep2[55] = _mm256_madd_epi16(lstep2[55], kOne);
+
+ lstep1[32] = _mm256_unpacklo_epi16(step1[16], kZero);
+ lstep1[33] = _mm256_unpackhi_epi16(step1[16], kZero);
+ lstep1[34] = _mm256_unpacklo_epi16(step1[17], kZero);
+ lstep1[35] = _mm256_unpackhi_epi16(step1[17], kZero);
+ lstep1[36] = _mm256_unpacklo_epi16(step1[18], kZero);
+ lstep1[37] = _mm256_unpackhi_epi16(step1[18], kZero);
+ lstep1[38] = _mm256_unpacklo_epi16(step1[19], kZero);
+ lstep1[39] = _mm256_unpackhi_epi16(step1[19], kZero);
+ lstep1[56] = _mm256_unpacklo_epi16(step1[28], kZero);
+ lstep1[57] = _mm256_unpackhi_epi16(step1[28], kZero);
+ lstep1[58] = _mm256_unpacklo_epi16(step1[29], kZero);
+ lstep1[59] = _mm256_unpackhi_epi16(step1[29], kZero);
+ lstep1[60] = _mm256_unpacklo_epi16(step1[30], kZero);
+ lstep1[61] = _mm256_unpackhi_epi16(step1[30], kZero);
+ lstep1[62] = _mm256_unpacklo_epi16(step1[31], kZero);
+ lstep1[63] = _mm256_unpackhi_epi16(step1[31], kZero);
+ lstep1[32] = _mm256_madd_epi16(lstep1[32], kOne);
+ lstep1[33] = _mm256_madd_epi16(lstep1[33], kOne);
+ lstep1[34] = _mm256_madd_epi16(lstep1[34], kOne);
+ lstep1[35] = _mm256_madd_epi16(lstep1[35], kOne);
+ lstep1[36] = _mm256_madd_epi16(lstep1[36], kOne);
+ lstep1[37] = _mm256_madd_epi16(lstep1[37], kOne);
+ lstep1[38] = _mm256_madd_epi16(lstep1[38], kOne);
+ lstep1[39] = _mm256_madd_epi16(lstep1[39], kOne);
+ lstep1[56] = _mm256_madd_epi16(lstep1[56], kOne);
+ lstep1[57] = _mm256_madd_epi16(lstep1[57], kOne);
+ lstep1[58] = _mm256_madd_epi16(lstep1[58], kOne);
+ lstep1[59] = _mm256_madd_epi16(lstep1[59], kOne);
+ lstep1[60] = _mm256_madd_epi16(lstep1[60], kOne);
+ lstep1[61] = _mm256_madd_epi16(lstep1[61], kOne);
+ lstep1[62] = _mm256_madd_epi16(lstep1[62], kOne);
+ lstep1[63] = _mm256_madd_epi16(lstep1[63], kOne);
+
+ lstep3[32] = _mm256_add_epi32(lstep2[46], lstep1[32]);
+ lstep3[33] = _mm256_add_epi32(lstep2[47], lstep1[33]);
+ lstep3[34] = _mm256_add_epi32(lstep2[44], lstep1[34]);
+ lstep3[35] = _mm256_add_epi32(lstep2[45], lstep1[35]);
+ lstep3[36] = _mm256_add_epi32(lstep2[42], lstep1[36]);
+ lstep3[37] = _mm256_add_epi32(lstep2[43], lstep1[37]);
+ lstep3[38] = _mm256_add_epi32(lstep2[40], lstep1[38]);
+ lstep3[39] = _mm256_add_epi32(lstep2[41], lstep1[39]);
+ lstep3[40] = _mm256_sub_epi32(lstep1[38], lstep2[40]);
+ lstep3[41] = _mm256_sub_epi32(lstep1[39], lstep2[41]);
+ lstep3[42] = _mm256_sub_epi32(lstep1[36], lstep2[42]);
+ lstep3[43] = _mm256_sub_epi32(lstep1[37], lstep2[43]);
+ lstep3[44] = _mm256_sub_epi32(lstep1[34], lstep2[44]);
+ lstep3[45] = _mm256_sub_epi32(lstep1[35], lstep2[45]);
+ lstep3[46] = _mm256_sub_epi32(lstep1[32], lstep2[46]);
+ lstep3[47] = _mm256_sub_epi32(lstep1[33], lstep2[47]);
+ lstep3[48] = _mm256_sub_epi32(lstep1[62], lstep2[48]);
+ lstep3[49] = _mm256_sub_epi32(lstep1[63], lstep2[49]);
+ lstep3[50] = _mm256_sub_epi32(lstep1[60], lstep2[50]);
+ lstep3[51] = _mm256_sub_epi32(lstep1[61], lstep2[51]);
+ lstep3[52] = _mm256_sub_epi32(lstep1[58], lstep2[52]);
+ lstep3[53] = _mm256_sub_epi32(lstep1[59], lstep2[53]);
+ lstep3[54] = _mm256_sub_epi32(lstep1[56], lstep2[54]);
+ lstep3[55] = _mm256_sub_epi32(lstep1[57], lstep2[55]);
+ lstep3[56] = _mm256_add_epi32(lstep2[54], lstep1[56]);
+ lstep3[57] = _mm256_add_epi32(lstep2[55], lstep1[57]);
+ lstep3[58] = _mm256_add_epi32(lstep2[52], lstep1[58]);
+ lstep3[59] = _mm256_add_epi32(lstep2[53], lstep1[59]);
+ lstep3[60] = _mm256_add_epi32(lstep2[50], lstep1[60]);
+ lstep3[61] = _mm256_add_epi32(lstep2[51], lstep1[61]);
+ lstep3[62] = _mm256_add_epi32(lstep2[48], lstep1[62]);
+ lstep3[63] = _mm256_add_epi32(lstep2[49], lstep1[63]);
+ }
+
+ // stage 4
+ {
+      // expanding to 32-bit length prior to addition operations
+ lstep2[16] = _mm256_unpacklo_epi16(step2[ 8], kZero);
+ lstep2[17] = _mm256_unpackhi_epi16(step2[ 8], kZero);
+ lstep2[18] = _mm256_unpacklo_epi16(step2[ 9], kZero);
+ lstep2[19] = _mm256_unpackhi_epi16(step2[ 9], kZero);
+ lstep2[28] = _mm256_unpacklo_epi16(step2[14], kZero);
+ lstep2[29] = _mm256_unpackhi_epi16(step2[14], kZero);
+ lstep2[30] = _mm256_unpacklo_epi16(step2[15], kZero);
+ lstep2[31] = _mm256_unpackhi_epi16(step2[15], kZero);
+ lstep2[16] = _mm256_madd_epi16(lstep2[16], kOne);
+ lstep2[17] = _mm256_madd_epi16(lstep2[17], kOne);
+ lstep2[18] = _mm256_madd_epi16(lstep2[18], kOne);
+ lstep2[19] = _mm256_madd_epi16(lstep2[19], kOne);
+ lstep2[28] = _mm256_madd_epi16(lstep2[28], kOne);
+ lstep2[29] = _mm256_madd_epi16(lstep2[29], kOne);
+ lstep2[30] = _mm256_madd_epi16(lstep2[30], kOne);
+ lstep2[31] = _mm256_madd_epi16(lstep2[31], kOne);
+
+ lstep1[ 0] = _mm256_add_epi32(lstep3[ 6], lstep3[ 0]);
+ lstep1[ 1] = _mm256_add_epi32(lstep3[ 7], lstep3[ 1]);
+ lstep1[ 2] = _mm256_add_epi32(lstep3[ 4], lstep3[ 2]);
+ lstep1[ 3] = _mm256_add_epi32(lstep3[ 5], lstep3[ 3]);
+ lstep1[ 4] = _mm256_sub_epi32(lstep3[ 2], lstep3[ 4]);
+ lstep1[ 5] = _mm256_sub_epi32(lstep3[ 3], lstep3[ 5]);
+ lstep1[ 6] = _mm256_sub_epi32(lstep3[ 0], lstep3[ 6]);
+ lstep1[ 7] = _mm256_sub_epi32(lstep3[ 1], lstep3[ 7]);
+ lstep1[16] = _mm256_add_epi32(lstep3[22], lstep2[16]);
+ lstep1[17] = _mm256_add_epi32(lstep3[23], lstep2[17]);
+ lstep1[18] = _mm256_add_epi32(lstep3[20], lstep2[18]);
+ lstep1[19] = _mm256_add_epi32(lstep3[21], lstep2[19]);
+ lstep1[20] = _mm256_sub_epi32(lstep2[18], lstep3[20]);
+ lstep1[21] = _mm256_sub_epi32(lstep2[19], lstep3[21]);
+ lstep1[22] = _mm256_sub_epi32(lstep2[16], lstep3[22]);
+ lstep1[23] = _mm256_sub_epi32(lstep2[17], lstep3[23]);
+ lstep1[24] = _mm256_sub_epi32(lstep2[30], lstep3[24]);
+ lstep1[25] = _mm256_sub_epi32(lstep2[31], lstep3[25]);
+ lstep1[26] = _mm256_sub_epi32(lstep2[28], lstep3[26]);
+ lstep1[27] = _mm256_sub_epi32(lstep2[29], lstep3[27]);
+ lstep1[28] = _mm256_add_epi32(lstep3[26], lstep2[28]);
+ lstep1[29] = _mm256_add_epi32(lstep3[27], lstep2[29]);
+ lstep1[30] = _mm256_add_epi32(lstep3[24], lstep2[30]);
+ lstep1[31] = _mm256_add_epi32(lstep3[25], lstep2[31]);
+ }
+ {
+      // Butterfly lstep3[10..13] with +/-cospi_16_64 to form lstep1[10..13].
+ const __m256i k32_p16_p16 = pair256_set_epi32(cospi_16_64, cospi_16_64);
+ const __m256i k32_p16_m16 = pair256_set_epi32(cospi_16_64, -cospi_16_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep3[12], lstep3[10]);
+ u[1] = _mm256_unpackhi_epi32(lstep3[12], lstep3[10]);
+ u[2] = _mm256_unpacklo_epi32(lstep3[13], lstep3[11]);
+ u[3] = _mm256_unpackhi_epi32(lstep3[13], lstep3[11]);
+
+      // TODO(jingning): manually inline k_madd_epi32_avx2 to further hide
+ // instruction latency.
+ v[ 0] = k_madd_epi32_avx2(u[0], k32_p16_m16);
+ v[ 1] = k_madd_epi32_avx2(u[1], k32_p16_m16);
+ v[ 2] = k_madd_epi32_avx2(u[2], k32_p16_m16);
+ v[ 3] = k_madd_epi32_avx2(u[3], k32_p16_m16);
+ v[ 4] = k_madd_epi32_avx2(u[0], k32_p16_p16);
+ v[ 5] = k_madd_epi32_avx2(u[1], k32_p16_p16);
+ v[ 6] = k_madd_epi32_avx2(u[2], k32_p16_p16);
+ v[ 7] = k_madd_epi32_avx2(u[3], k32_p16_p16);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+
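+      // dct_const_round_shift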
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+
+ lstep1[10] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep1[11] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep1[12] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep1[13] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ }
+ {
+ const __m256i k32_m08_p24 = pair256_set_epi32(-cospi_8_64, cospi_24_64);
+ const __m256i k32_m24_m08 = pair256_set_epi32(-cospi_24_64, -cospi_8_64);
+ const __m256i k32_p24_p08 = pair256_set_epi32(cospi_24_64, cospi_8_64);
+
+ u[ 0] = _mm256_unpacklo_epi32(lstep3[36], lstep3[58]);
+ u[ 1] = _mm256_unpackhi_epi32(lstep3[36], lstep3[58]);
+ u[ 2] = _mm256_unpacklo_epi32(lstep3[37], lstep3[59]);
+ u[ 3] = _mm256_unpackhi_epi32(lstep3[37], lstep3[59]);
+ u[ 4] = _mm256_unpacklo_epi32(lstep3[38], lstep3[56]);
+ u[ 5] = _mm256_unpackhi_epi32(lstep3[38], lstep3[56]);
+ u[ 6] = _mm256_unpacklo_epi32(lstep3[39], lstep3[57]);
+ u[ 7] = _mm256_unpackhi_epi32(lstep3[39], lstep3[57]);
+ u[ 8] = _mm256_unpacklo_epi32(lstep3[40], lstep3[54]);
+ u[ 9] = _mm256_unpackhi_epi32(lstep3[40], lstep3[54]);
+ u[10] = _mm256_unpacklo_epi32(lstep3[41], lstep3[55]);
+ u[11] = _mm256_unpackhi_epi32(lstep3[41], lstep3[55]);
+ u[12] = _mm256_unpacklo_epi32(lstep3[42], lstep3[52]);
+ u[13] = _mm256_unpackhi_epi32(lstep3[42], lstep3[52]);
+ u[14] = _mm256_unpacklo_epi32(lstep3[43], lstep3[53]);
+ u[15] = _mm256_unpackhi_epi32(lstep3[43], lstep3[53]);
+
+ v[ 0] = k_madd_epi32_avx2(u[ 0], k32_m08_p24);
+ v[ 1] = k_madd_epi32_avx2(u[ 1], k32_m08_p24);
+ v[ 2] = k_madd_epi32_avx2(u[ 2], k32_m08_p24);
+ v[ 3] = k_madd_epi32_avx2(u[ 3], k32_m08_p24);
+ v[ 4] = k_madd_epi32_avx2(u[ 4], k32_m08_p24);
+ v[ 5] = k_madd_epi32_avx2(u[ 5], k32_m08_p24);
+ v[ 6] = k_madd_epi32_avx2(u[ 6], k32_m08_p24);
+ v[ 7] = k_madd_epi32_avx2(u[ 7], k32_m08_p24);
+ v[ 8] = k_madd_epi32_avx2(u[ 8], k32_m24_m08);
+ v[ 9] = k_madd_epi32_avx2(u[ 9], k32_m24_m08);
+ v[10] = k_madd_epi32_avx2(u[10], k32_m24_m08);
+ v[11] = k_madd_epi32_avx2(u[11], k32_m24_m08);
+ v[12] = k_madd_epi32_avx2(u[12], k32_m24_m08);
+ v[13] = k_madd_epi32_avx2(u[13], k32_m24_m08);
+ v[14] = k_madd_epi32_avx2(u[14], k32_m24_m08);
+ v[15] = k_madd_epi32_avx2(u[15], k32_m24_m08);
+ v[16] = k_madd_epi32_avx2(u[12], k32_m08_p24);
+ v[17] = k_madd_epi32_avx2(u[13], k32_m08_p24);
+ v[18] = k_madd_epi32_avx2(u[14], k32_m08_p24);
+ v[19] = k_madd_epi32_avx2(u[15], k32_m08_p24);
+ v[20] = k_madd_epi32_avx2(u[ 8], k32_m08_p24);
+ v[21] = k_madd_epi32_avx2(u[ 9], k32_m08_p24);
+ v[22] = k_madd_epi32_avx2(u[10], k32_m08_p24);
+ v[23] = k_madd_epi32_avx2(u[11], k32_m08_p24);
+ v[24] = k_madd_epi32_avx2(u[ 4], k32_p24_p08);
+ v[25] = k_madd_epi32_avx2(u[ 5], k32_p24_p08);
+ v[26] = k_madd_epi32_avx2(u[ 6], k32_p24_p08);
+ v[27] = k_madd_epi32_avx2(u[ 7], k32_p24_p08);
+ v[28] = k_madd_epi32_avx2(u[ 0], k32_p24_p08);
+ v[29] = k_madd_epi32_avx2(u[ 1], k32_p24_p08);
+ v[30] = k_madd_epi32_avx2(u[ 2], k32_p24_p08);
+ v[31] = k_madd_epi32_avx2(u[ 3], k32_p24_p08);
+
+ u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
+ u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
+ u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
+ u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
+ u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
+ u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
+ u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
+ u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+ u[10] = k_packs_epi64_avx2(v[20], v[21]);
+ u[11] = k_packs_epi64_avx2(v[22], v[23]);
+ u[12] = k_packs_epi64_avx2(v[24], v[25]);
+ u[13] = k_packs_epi64_avx2(v[26], v[27]);
+ u[14] = k_packs_epi64_avx2(v[28], v[29]);
+ u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
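+      // dct_const_round_shift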
+ v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+ v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+ v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+ v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+ v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+ v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+ v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+ v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+ v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+ v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ lstep1[36] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
+ lstep1[37] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
+ lstep1[38] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
+ lstep1[39] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
+ lstep1[40] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
+ lstep1[41] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
+ lstep1[42] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
+ lstep1[43] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
+ lstep1[52] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
+ lstep1[53] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+ lstep1[54] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+ lstep1[55] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+ lstep1[56] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+ lstep1[57] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+ lstep1[58] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+ lstep1[59] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+ }
+ // stage 5
+ {
+ lstep2[ 8] = _mm256_add_epi32(lstep1[10], lstep3[ 8]);
+ lstep2[ 9] = _mm256_add_epi32(lstep1[11], lstep3[ 9]);
+ lstep2[10] = _mm256_sub_epi32(lstep3[ 8], lstep1[10]);
+ lstep2[11] = _mm256_sub_epi32(lstep3[ 9], lstep1[11]);
+ lstep2[12] = _mm256_sub_epi32(lstep3[14], lstep1[12]);
+ lstep2[13] = _mm256_sub_epi32(lstep3[15], lstep1[13]);
+ lstep2[14] = _mm256_add_epi32(lstep1[12], lstep3[14]);
+ lstep2[15] = _mm256_add_epi32(lstep1[13], lstep3[15]);
+ }
+ {
+ const __m256i k32_p16_p16 = pair256_set_epi32(cospi_16_64, cospi_16_64);
+ const __m256i k32_p16_m16 = pair256_set_epi32(cospi_16_64, -cospi_16_64);
+ const __m256i k32_p24_p08 = pair256_set_epi32(cospi_24_64, cospi_8_64);
+ const __m256i k32_m08_p24 = pair256_set_epi32(-cospi_8_64, cospi_24_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep1[0], lstep1[2]);
+ u[1] = _mm256_unpackhi_epi32(lstep1[0], lstep1[2]);
+ u[2] = _mm256_unpacklo_epi32(lstep1[1], lstep1[3]);
+ u[3] = _mm256_unpackhi_epi32(lstep1[1], lstep1[3]);
+ u[4] = _mm256_unpacklo_epi32(lstep1[4], lstep1[6]);
+ u[5] = _mm256_unpackhi_epi32(lstep1[4], lstep1[6]);
+ u[6] = _mm256_unpacklo_epi32(lstep1[5], lstep1[7]);
+ u[7] = _mm256_unpackhi_epi32(lstep1[5], lstep1[7]);
+
+      // TODO(jingning): manually inline k_madd_epi32_avx2 to further hide
+ // instruction latency.
+ v[ 0] = k_madd_epi32_avx2(u[0], k32_p16_p16);
+ v[ 1] = k_madd_epi32_avx2(u[1], k32_p16_p16);
+ v[ 2] = k_madd_epi32_avx2(u[2], k32_p16_p16);
+ v[ 3] = k_madd_epi32_avx2(u[3], k32_p16_p16);
+ v[ 4] = k_madd_epi32_avx2(u[0], k32_p16_m16);
+ v[ 5] = k_madd_epi32_avx2(u[1], k32_p16_m16);
+ v[ 6] = k_madd_epi32_avx2(u[2], k32_p16_m16);
+ v[ 7] = k_madd_epi32_avx2(u[3], k32_p16_m16);
+ v[ 8] = k_madd_epi32_avx2(u[4], k32_p24_p08);
+ v[ 9] = k_madd_epi32_avx2(u[5], k32_p24_p08);
+ v[10] = k_madd_epi32_avx2(u[6], k32_p24_p08);
+ v[11] = k_madd_epi32_avx2(u[7], k32_p24_p08);
+ v[12] = k_madd_epi32_avx2(u[4], k32_m08_p24);
+ v[13] = k_madd_epi32_avx2(u[5], k32_m08_p24);
+ v[14] = k_madd_epi32_avx2(u[6], k32_m08_p24);
+ v[15] = k_madd_epi32_avx2(u[7], k32_m08_p24);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+
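+      // dct_const_round_shift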
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+
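+      // Round to nearest (ties toward zero) and divide by 4:
+      // (x + 1 + (x < 0)) >> 2.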
+      sign[0] = _mm256_cmpgt_epi32(kZero, u[0]);
+      sign[1] = _mm256_cmpgt_epi32(kZero, u[1]);
+      sign[2] = _mm256_cmpgt_epi32(kZero, u[2]);
+      sign[3] = _mm256_cmpgt_epi32(kZero, u[3]);
+      sign[4] = _mm256_cmpgt_epi32(kZero, u[4]);
+      sign[5] = _mm256_cmpgt_epi32(kZero, u[5]);
+      sign[6] = _mm256_cmpgt_epi32(kZero, u[6]);
+      sign[7] = _mm256_cmpgt_epi32(kZero, u[7]);
+
+ u[0] = _mm256_sub_epi32(u[0], sign[0]);
+ u[1] = _mm256_sub_epi32(u[1], sign[1]);
+ u[2] = _mm256_sub_epi32(u[2], sign[2]);
+ u[3] = _mm256_sub_epi32(u[3], sign[3]);
+ u[4] = _mm256_sub_epi32(u[4], sign[4]);
+ u[5] = _mm256_sub_epi32(u[5], sign[5]);
+ u[6] = _mm256_sub_epi32(u[6], sign[6]);
+ u[7] = _mm256_sub_epi32(u[7], sign[7]);
+
+ u[0] = _mm256_add_epi32(u[0], K32One);
+ u[1] = _mm256_add_epi32(u[1], K32One);
+ u[2] = _mm256_add_epi32(u[2], K32One);
+ u[3] = _mm256_add_epi32(u[3], K32One);
+ u[4] = _mm256_add_epi32(u[4], K32One);
+ u[5] = _mm256_add_epi32(u[5], K32One);
+ u[6] = _mm256_add_epi32(u[6], K32One);
+ u[7] = _mm256_add_epi32(u[7], K32One);
+
+ u[0] = _mm256_srai_epi32(u[0], 2);
+ u[1] = _mm256_srai_epi32(u[1], 2);
+ u[2] = _mm256_srai_epi32(u[2], 2);
+ u[3] = _mm256_srai_epi32(u[3], 2);
+ u[4] = _mm256_srai_epi32(u[4], 2);
+ u[5] = _mm256_srai_epi32(u[5], 2);
+ u[6] = _mm256_srai_epi32(u[6], 2);
+ u[7] = _mm256_srai_epi32(u[7], 2);
+
+ // Combine
+ out[ 0] = _mm256_packs_epi32(u[0], u[1]);
+ out[16] = _mm256_packs_epi32(u[2], u[3]);
+ out[ 8] = _mm256_packs_epi32(u[4], u[5]);
+ out[24] = _mm256_packs_epi32(u[6], u[7]);
+ }
+ {
+ const __m256i k32_m08_p24 = pair256_set_epi32(-cospi_8_64, cospi_24_64);
+ const __m256i k32_m24_m08 = pair256_set_epi32(-cospi_24_64, -cospi_8_64);
+ const __m256i k32_p24_p08 = pair256_set_epi32(cospi_24_64, cospi_8_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep1[18], lstep1[28]);
+ u[1] = _mm256_unpackhi_epi32(lstep1[18], lstep1[28]);
+ u[2] = _mm256_unpacklo_epi32(lstep1[19], lstep1[29]);
+ u[3] = _mm256_unpackhi_epi32(lstep1[19], lstep1[29]);
+ u[4] = _mm256_unpacklo_epi32(lstep1[20], lstep1[26]);
+ u[5] = _mm256_unpackhi_epi32(lstep1[20], lstep1[26]);
+ u[6] = _mm256_unpacklo_epi32(lstep1[21], lstep1[27]);
+ u[7] = _mm256_unpackhi_epi32(lstep1[21], lstep1[27]);
+
+ v[0] = k_madd_epi32_avx2(u[0], k32_m08_p24);
+ v[1] = k_madd_epi32_avx2(u[1], k32_m08_p24);
+ v[2] = k_madd_epi32_avx2(u[2], k32_m08_p24);
+ v[3] = k_madd_epi32_avx2(u[3], k32_m08_p24);
+ v[4] = k_madd_epi32_avx2(u[4], k32_m24_m08);
+ v[5] = k_madd_epi32_avx2(u[5], k32_m24_m08);
+ v[6] = k_madd_epi32_avx2(u[6], k32_m24_m08);
+ v[7] = k_madd_epi32_avx2(u[7], k32_m24_m08);
+ v[ 8] = k_madd_epi32_avx2(u[4], k32_m08_p24);
+ v[ 9] = k_madd_epi32_avx2(u[5], k32_m08_p24);
+ v[10] = k_madd_epi32_avx2(u[6], k32_m08_p24);
+ v[11] = k_madd_epi32_avx2(u[7], k32_m08_p24);
+ v[12] = k_madd_epi32_avx2(u[0], k32_p24_p08);
+ v[13] = k_madd_epi32_avx2(u[1], k32_p24_p08);
+ v[14] = k_madd_epi32_avx2(u[2], k32_p24_p08);
+ v[15] = k_madd_epi32_avx2(u[3], k32_p24_p08);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+
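+      // dct_const_round_shift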
+ u[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ lstep2[18] = _mm256_srai_epi32(u[0], DCT_CONST_BITS);
+ lstep2[19] = _mm256_srai_epi32(u[1], DCT_CONST_BITS);
+ lstep2[20] = _mm256_srai_epi32(u[2], DCT_CONST_BITS);
+ lstep2[21] = _mm256_srai_epi32(u[3], DCT_CONST_BITS);
+ lstep2[26] = _mm256_srai_epi32(u[4], DCT_CONST_BITS);
+ lstep2[27] = _mm256_srai_epi32(u[5], DCT_CONST_BITS);
+ lstep2[28] = _mm256_srai_epi32(u[6], DCT_CONST_BITS);
+ lstep2[29] = _mm256_srai_epi32(u[7], DCT_CONST_BITS);
+ }
+ {
+ lstep2[32] = _mm256_add_epi32(lstep1[38], lstep3[32]);
+ lstep2[33] = _mm256_add_epi32(lstep1[39], lstep3[33]);
+ lstep2[34] = _mm256_add_epi32(lstep1[36], lstep3[34]);
+ lstep2[35] = _mm256_add_epi32(lstep1[37], lstep3[35]);
+ lstep2[36] = _mm256_sub_epi32(lstep3[34], lstep1[36]);
+ lstep2[37] = _mm256_sub_epi32(lstep3[35], lstep1[37]);
+ lstep2[38] = _mm256_sub_epi32(lstep3[32], lstep1[38]);
+ lstep2[39] = _mm256_sub_epi32(lstep3[33], lstep1[39]);
+ lstep2[40] = _mm256_sub_epi32(lstep3[46], lstep1[40]);
+ lstep2[41] = _mm256_sub_epi32(lstep3[47], lstep1[41]);
+ lstep2[42] = _mm256_sub_epi32(lstep3[44], lstep1[42]);
+ lstep2[43] = _mm256_sub_epi32(lstep3[45], lstep1[43]);
+ lstep2[44] = _mm256_add_epi32(lstep1[42], lstep3[44]);
+ lstep2[45] = _mm256_add_epi32(lstep1[43], lstep3[45]);
+ lstep2[46] = _mm256_add_epi32(lstep1[40], lstep3[46]);
+ lstep2[47] = _mm256_add_epi32(lstep1[41], lstep3[47]);
+ lstep2[48] = _mm256_add_epi32(lstep1[54], lstep3[48]);
+ lstep2[49] = _mm256_add_epi32(lstep1[55], lstep3[49]);
+ lstep2[50] = _mm256_add_epi32(lstep1[52], lstep3[50]);
+ lstep2[51] = _mm256_add_epi32(lstep1[53], lstep3[51]);
+ lstep2[52] = _mm256_sub_epi32(lstep3[50], lstep1[52]);
+ lstep2[53] = _mm256_sub_epi32(lstep3[51], lstep1[53]);
+ lstep2[54] = _mm256_sub_epi32(lstep3[48], lstep1[54]);
+ lstep2[55] = _mm256_sub_epi32(lstep3[49], lstep1[55]);
+ lstep2[56] = _mm256_sub_epi32(lstep3[62], lstep1[56]);
+ lstep2[57] = _mm256_sub_epi32(lstep3[63], lstep1[57]);
+ lstep2[58] = _mm256_sub_epi32(lstep3[60], lstep1[58]);
+ lstep2[59] = _mm256_sub_epi32(lstep3[61], lstep1[59]);
+ lstep2[60] = _mm256_add_epi32(lstep1[58], lstep3[60]);
+ lstep2[61] = _mm256_add_epi32(lstep1[59], lstep3[61]);
+ lstep2[62] = _mm256_add_epi32(lstep1[56], lstep3[62]);
+ lstep2[63] = _mm256_add_epi32(lstep1[57], lstep3[63]);
+ }
+ // stage 6
+ {
+ const __m256i k32_p28_p04 = pair256_set_epi32(cospi_28_64, cospi_4_64);
+ const __m256i k32_p12_p20 = pair256_set_epi32(cospi_12_64, cospi_20_64);
+ const __m256i k32_m20_p12 = pair256_set_epi32(-cospi_20_64, cospi_12_64);
+ const __m256i k32_m04_p28 = pair256_set_epi32(-cospi_4_64, cospi_28_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep2[ 8], lstep2[14]);
+ u[1] = _mm256_unpackhi_epi32(lstep2[ 8], lstep2[14]);
+ u[2] = _mm256_unpacklo_epi32(lstep2[ 9], lstep2[15]);
+ u[3] = _mm256_unpackhi_epi32(lstep2[ 9], lstep2[15]);
+ u[4] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]);
+ u[5] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]);
+ u[6] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]);
+ u[7] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]);
+ u[8] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]);
+ u[9] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]);
+ u[10] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]);
+ u[11] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]);
+ u[12] = _mm256_unpacklo_epi32(lstep2[ 8], lstep2[14]);
+ u[13] = _mm256_unpackhi_epi32(lstep2[ 8], lstep2[14]);
+ u[14] = _mm256_unpacklo_epi32(lstep2[ 9], lstep2[15]);
+ u[15] = _mm256_unpackhi_epi32(lstep2[ 9], lstep2[15]);
+
+ v[0] = k_madd_epi32_avx2(u[0], k32_p28_p04);
+ v[1] = k_madd_epi32_avx2(u[1], k32_p28_p04);
+ v[2] = k_madd_epi32_avx2(u[2], k32_p28_p04);
+ v[3] = k_madd_epi32_avx2(u[3], k32_p28_p04);
+ v[4] = k_madd_epi32_avx2(u[4], k32_p12_p20);
+ v[5] = k_madd_epi32_avx2(u[5], k32_p12_p20);
+ v[6] = k_madd_epi32_avx2(u[6], k32_p12_p20);
+ v[7] = k_madd_epi32_avx2(u[7], k32_p12_p20);
+ v[ 8] = k_madd_epi32_avx2(u[ 8], k32_m20_p12);
+ v[ 9] = k_madd_epi32_avx2(u[ 9], k32_m20_p12);
+ v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12);
+ v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12);
+ v[12] = k_madd_epi32_avx2(u[12], k32_m04_p28);
+ v[13] = k_madd_epi32_avx2(u[13], k32_m04_p28);
+ v[14] = k_madd_epi32_avx2(u[14], k32_m04_p28);
+ v[15] = k_madd_epi32_avx2(u[15], k32_m04_p28);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+
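+      // dct_const_round_shift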
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+
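+      // Same rounding as above: (x + 1 + (x < 0)) >> 2.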
+      sign[0] = _mm256_cmpgt_epi32(kZero, u[0]);
+      sign[1] = _mm256_cmpgt_epi32(kZero, u[1]);
+      sign[2] = _mm256_cmpgt_epi32(kZero, u[2]);
+      sign[3] = _mm256_cmpgt_epi32(kZero, u[3]);
+      sign[4] = _mm256_cmpgt_epi32(kZero, u[4]);
+      sign[5] = _mm256_cmpgt_epi32(kZero, u[5]);
+      sign[6] = _mm256_cmpgt_epi32(kZero, u[6]);
+      sign[7] = _mm256_cmpgt_epi32(kZero, u[7]);
+
+ u[0] = _mm256_sub_epi32(u[0], sign[0]);
+ u[1] = _mm256_sub_epi32(u[1], sign[1]);
+ u[2] = _mm256_sub_epi32(u[2], sign[2]);
+ u[3] = _mm256_sub_epi32(u[3], sign[3]);
+ u[4] = _mm256_sub_epi32(u[4], sign[4]);
+ u[5] = _mm256_sub_epi32(u[5], sign[5]);
+ u[6] = _mm256_sub_epi32(u[6], sign[6]);
+ u[7] = _mm256_sub_epi32(u[7], sign[7]);
+
+ u[0] = _mm256_add_epi32(u[0], K32One);
+ u[1] = _mm256_add_epi32(u[1], K32One);
+ u[2] = _mm256_add_epi32(u[2], K32One);
+ u[3] = _mm256_add_epi32(u[3], K32One);
+ u[4] = _mm256_add_epi32(u[4], K32One);
+ u[5] = _mm256_add_epi32(u[5], K32One);
+ u[6] = _mm256_add_epi32(u[6], K32One);
+ u[7] = _mm256_add_epi32(u[7], K32One);
+
+ u[0] = _mm256_srai_epi32(u[0], 2);
+ u[1] = _mm256_srai_epi32(u[1], 2);
+ u[2] = _mm256_srai_epi32(u[2], 2);
+ u[3] = _mm256_srai_epi32(u[3], 2);
+ u[4] = _mm256_srai_epi32(u[4], 2);
+ u[5] = _mm256_srai_epi32(u[5], 2);
+ u[6] = _mm256_srai_epi32(u[6], 2);
+ u[7] = _mm256_srai_epi32(u[7], 2);
+
+ out[ 4] = _mm256_packs_epi32(u[0], u[1]);
+ out[20] = _mm256_packs_epi32(u[2], u[3]);
+ out[12] = _mm256_packs_epi32(u[4], u[5]);
+ out[28] = _mm256_packs_epi32(u[6], u[7]);
+ }
+ {
+ lstep3[16] = _mm256_add_epi32(lstep2[18], lstep1[16]);
+ lstep3[17] = _mm256_add_epi32(lstep2[19], lstep1[17]);
+ lstep3[18] = _mm256_sub_epi32(lstep1[16], lstep2[18]);
+ lstep3[19] = _mm256_sub_epi32(lstep1[17], lstep2[19]);
+ lstep3[20] = _mm256_sub_epi32(lstep1[22], lstep2[20]);
+ lstep3[21] = _mm256_sub_epi32(lstep1[23], lstep2[21]);
+ lstep3[22] = _mm256_add_epi32(lstep2[20], lstep1[22]);
+ lstep3[23] = _mm256_add_epi32(lstep2[21], lstep1[23]);
+ lstep3[24] = _mm256_add_epi32(lstep2[26], lstep1[24]);
+ lstep3[25] = _mm256_add_epi32(lstep2[27], lstep1[25]);
+ lstep3[26] = _mm256_sub_epi32(lstep1[24], lstep2[26]);
+ lstep3[27] = _mm256_sub_epi32(lstep1[25], lstep2[27]);
+ lstep3[28] = _mm256_sub_epi32(lstep1[30], lstep2[28]);
+ lstep3[29] = _mm256_sub_epi32(lstep1[31], lstep2[29]);
+ lstep3[30] = _mm256_add_epi32(lstep2[28], lstep1[30]);
+ lstep3[31] = _mm256_add_epi32(lstep2[29], lstep1[31]);
+ }
+ {
+ const __m256i k32_m04_p28 = pair256_set_epi32(-cospi_4_64, cospi_28_64);
+ const __m256i k32_m28_m04 = pair256_set_epi32(-cospi_28_64, -cospi_4_64);
+ const __m256i k32_m20_p12 = pair256_set_epi32(-cospi_20_64, cospi_12_64);
+ const __m256i k32_m12_m20 = pair256_set_epi32(-cospi_12_64,
+ -cospi_20_64);
+ const __m256i k32_p12_p20 = pair256_set_epi32(cospi_12_64, cospi_20_64);
+ const __m256i k32_p28_p04 = pair256_set_epi32(cospi_28_64, cospi_4_64);
+
+ u[ 0] = _mm256_unpacklo_epi32(lstep2[34], lstep2[60]);
+ u[ 1] = _mm256_unpackhi_epi32(lstep2[34], lstep2[60]);
+ u[ 2] = _mm256_unpacklo_epi32(lstep2[35], lstep2[61]);
+ u[ 3] = _mm256_unpackhi_epi32(lstep2[35], lstep2[61]);
+ u[ 4] = _mm256_unpacklo_epi32(lstep2[36], lstep2[58]);
+ u[ 5] = _mm256_unpackhi_epi32(lstep2[36], lstep2[58]);
+ u[ 6] = _mm256_unpacklo_epi32(lstep2[37], lstep2[59]);
+ u[ 7] = _mm256_unpackhi_epi32(lstep2[37], lstep2[59]);
+ u[ 8] = _mm256_unpacklo_epi32(lstep2[42], lstep2[52]);
+ u[ 9] = _mm256_unpackhi_epi32(lstep2[42], lstep2[52]);
+ u[10] = _mm256_unpacklo_epi32(lstep2[43], lstep2[53]);
+ u[11] = _mm256_unpackhi_epi32(lstep2[43], lstep2[53]);
+ u[12] = _mm256_unpacklo_epi32(lstep2[44], lstep2[50]);
+ u[13] = _mm256_unpackhi_epi32(lstep2[44], lstep2[50]);
+ u[14] = _mm256_unpacklo_epi32(lstep2[45], lstep2[51]);
+ u[15] = _mm256_unpackhi_epi32(lstep2[45], lstep2[51]);
+
+ v[ 0] = k_madd_epi32_avx2(u[ 0], k32_m04_p28);
+ v[ 1] = k_madd_epi32_avx2(u[ 1], k32_m04_p28);
+ v[ 2] = k_madd_epi32_avx2(u[ 2], k32_m04_p28);
+ v[ 3] = k_madd_epi32_avx2(u[ 3], k32_m04_p28);
+ v[ 4] = k_madd_epi32_avx2(u[ 4], k32_m28_m04);
+ v[ 5] = k_madd_epi32_avx2(u[ 5], k32_m28_m04);
+ v[ 6] = k_madd_epi32_avx2(u[ 6], k32_m28_m04);
+ v[ 7] = k_madd_epi32_avx2(u[ 7], k32_m28_m04);
+ v[ 8] = k_madd_epi32_avx2(u[ 8], k32_m20_p12);
+ v[ 9] = k_madd_epi32_avx2(u[ 9], k32_m20_p12);
+ v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12);
+ v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12);
+ v[12] = k_madd_epi32_avx2(u[12], k32_m12_m20);
+ v[13] = k_madd_epi32_avx2(u[13], k32_m12_m20);
+ v[14] = k_madd_epi32_avx2(u[14], k32_m12_m20);
+ v[15] = k_madd_epi32_avx2(u[15], k32_m12_m20);
+ v[16] = k_madd_epi32_avx2(u[12], k32_m20_p12);
+ v[17] = k_madd_epi32_avx2(u[13], k32_m20_p12);
+ v[18] = k_madd_epi32_avx2(u[14], k32_m20_p12);
+ v[19] = k_madd_epi32_avx2(u[15], k32_m20_p12);
+ v[20] = k_madd_epi32_avx2(u[ 8], k32_p12_p20);
+ v[21] = k_madd_epi32_avx2(u[ 9], k32_p12_p20);
+ v[22] = k_madd_epi32_avx2(u[10], k32_p12_p20);
+ v[23] = k_madd_epi32_avx2(u[11], k32_p12_p20);
+ v[24] = k_madd_epi32_avx2(u[ 4], k32_m04_p28);
+ v[25] = k_madd_epi32_avx2(u[ 5], k32_m04_p28);
+ v[26] = k_madd_epi32_avx2(u[ 6], k32_m04_p28);
+ v[27] = k_madd_epi32_avx2(u[ 7], k32_m04_p28);
+ v[28] = k_madd_epi32_avx2(u[ 0], k32_p28_p04);
+ v[29] = k_madd_epi32_avx2(u[ 1], k32_p28_p04);
+ v[30] = k_madd_epi32_avx2(u[ 2], k32_p28_p04);
+ v[31] = k_madd_epi32_avx2(u[ 3], k32_p28_p04);
+
+ u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
+ u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
+ u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
+ u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
+ u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
+ u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
+ u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
+ u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+ u[10] = k_packs_epi64_avx2(v[20], v[21]);
+ u[11] = k_packs_epi64_avx2(v[22], v[23]);
+ u[12] = k_packs_epi64_avx2(v[24], v[25]);
+ u[13] = k_packs_epi64_avx2(v[26], v[27]);
+ u[14] = k_packs_epi64_avx2(v[28], v[29]);
+ u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
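+      // dct_const_round_shift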
+ v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+ v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+ v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+ v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+ v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+ v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+ v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+ v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+ v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+ v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ lstep3[34] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
+ lstep3[35] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
+ lstep3[36] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
+ lstep3[37] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
+ lstep3[42] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
+ lstep3[43] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
+ lstep3[44] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
+ lstep3[45] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
+ lstep3[50] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
+ lstep3[51] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+ lstep3[52] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+ lstep3[53] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+ lstep3[58] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+ lstep3[59] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+ lstep3[60] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+ lstep3[61] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+ }
+ // stage 7
+ {
+ const __m256i k32_p30_p02 = pair256_set_epi32(cospi_30_64, cospi_2_64);
+ const __m256i k32_p14_p18 = pair256_set_epi32(cospi_14_64, cospi_18_64);
+ const __m256i k32_p22_p10 = pair256_set_epi32(cospi_22_64, cospi_10_64);
+ const __m256i k32_p06_p26 = pair256_set_epi32(cospi_6_64, cospi_26_64);
+ const __m256i k32_m26_p06 = pair256_set_epi32(-cospi_26_64, cospi_6_64);
+ const __m256i k32_m10_p22 = pair256_set_epi32(-cospi_10_64, cospi_22_64);
+ const __m256i k32_m18_p14 = pair256_set_epi32(-cospi_18_64, cospi_14_64);
+ const __m256i k32_m02_p30 = pair256_set_epi32(-cospi_2_64, cospi_30_64);
+
+ u[ 0] = _mm256_unpacklo_epi32(lstep3[16], lstep3[30]);
+ u[ 1] = _mm256_unpackhi_epi32(lstep3[16], lstep3[30]);
+ u[ 2] = _mm256_unpacklo_epi32(lstep3[17], lstep3[31]);
+ u[ 3] = _mm256_unpackhi_epi32(lstep3[17], lstep3[31]);
+ u[ 4] = _mm256_unpacklo_epi32(lstep3[18], lstep3[28]);
+ u[ 5] = _mm256_unpackhi_epi32(lstep3[18], lstep3[28]);
+ u[ 6] = _mm256_unpacklo_epi32(lstep3[19], lstep3[29]);
+ u[ 7] = _mm256_unpackhi_epi32(lstep3[19], lstep3[29]);
+ u[ 8] = _mm256_unpacklo_epi32(lstep3[20], lstep3[26]);
+ u[ 9] = _mm256_unpackhi_epi32(lstep3[20], lstep3[26]);
+ u[10] = _mm256_unpacklo_epi32(lstep3[21], lstep3[27]);
+ u[11] = _mm256_unpackhi_epi32(lstep3[21], lstep3[27]);
+ u[12] = _mm256_unpacklo_epi32(lstep3[22], lstep3[24]);
+ u[13] = _mm256_unpackhi_epi32(lstep3[22], lstep3[24]);
+ u[14] = _mm256_unpacklo_epi32(lstep3[23], lstep3[25]);
+ u[15] = _mm256_unpackhi_epi32(lstep3[23], lstep3[25]);
+
+ v[ 0] = k_madd_epi32_avx2(u[ 0], k32_p30_p02);
+ v[ 1] = k_madd_epi32_avx2(u[ 1], k32_p30_p02);
+ v[ 2] = k_madd_epi32_avx2(u[ 2], k32_p30_p02);
+ v[ 3] = k_madd_epi32_avx2(u[ 3], k32_p30_p02);
+ v[ 4] = k_madd_epi32_avx2(u[ 4], k32_p14_p18);
+ v[ 5] = k_madd_epi32_avx2(u[ 5], k32_p14_p18);
+ v[ 6] = k_madd_epi32_avx2(u[ 6], k32_p14_p18);
+ v[ 7] = k_madd_epi32_avx2(u[ 7], k32_p14_p18);
+ v[ 8] = k_madd_epi32_avx2(u[ 8], k32_p22_p10);
+ v[ 9] = k_madd_epi32_avx2(u[ 9], k32_p22_p10);
+ v[10] = k_madd_epi32_avx2(u[10], k32_p22_p10);
+ v[11] = k_madd_epi32_avx2(u[11], k32_p22_p10);
+ v[12] = k_madd_epi32_avx2(u[12], k32_p06_p26);
+ v[13] = k_madd_epi32_avx2(u[13], k32_p06_p26);
+ v[14] = k_madd_epi32_avx2(u[14], k32_p06_p26);
+ v[15] = k_madd_epi32_avx2(u[15], k32_p06_p26);
+ v[16] = k_madd_epi32_avx2(u[12], k32_m26_p06);
+ v[17] = k_madd_epi32_avx2(u[13], k32_m26_p06);
+ v[18] = k_madd_epi32_avx2(u[14], k32_m26_p06);
+ v[19] = k_madd_epi32_avx2(u[15], k32_m26_p06);
+ v[20] = k_madd_epi32_avx2(u[ 8], k32_m10_p22);
+ v[21] = k_madd_epi32_avx2(u[ 9], k32_m10_p22);
+ v[22] = k_madd_epi32_avx2(u[10], k32_m10_p22);
+ v[23] = k_madd_epi32_avx2(u[11], k32_m10_p22);
+ v[24] = k_madd_epi32_avx2(u[ 4], k32_m18_p14);
+ v[25] = k_madd_epi32_avx2(u[ 5], k32_m18_p14);
+ v[26] = k_madd_epi32_avx2(u[ 6], k32_m18_p14);
+ v[27] = k_madd_epi32_avx2(u[ 7], k32_m18_p14);
+ v[28] = k_madd_epi32_avx2(u[ 0], k32_m02_p30);
+ v[29] = k_madd_epi32_avx2(u[ 1], k32_m02_p30);
+ v[30] = k_madd_epi32_avx2(u[ 2], k32_m02_p30);
+ v[31] = k_madd_epi32_avx2(u[ 3], k32_m02_p30);
+
+ u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
+ u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
+ u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
+ u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
+ u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
+ u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
+ u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
+ u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+ u[10] = k_packs_epi64_avx2(v[20], v[21]);
+ u[11] = k_packs_epi64_avx2(v[22], v[23]);
+ u[12] = k_packs_epi64_avx2(v[24], v[25]);
+ u[13] = k_packs_epi64_avx2(v[26], v[27]);
+ u[14] = k_packs_epi64_avx2(v[28], v[29]);
+ u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
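+      // dct_const_round_shift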
+ v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+ v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+ v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+ v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+ v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+ v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+ v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+ v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+ v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+ v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[ 0] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
+ u[ 1] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
+ u[ 2] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
+ u[ 3] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
+ u[ 4] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
+ u[ 5] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
+ u[ 6] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
+ u[ 7] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
+ u[ 8] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
+ u[ 9] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+ u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+
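+      // Round to nearest (ties toward zero), divide by 4, then pack to 16 bits.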
+      v[ 0] = _mm256_cmpgt_epi32(kZero, u[ 0]);
+      v[ 1] = _mm256_cmpgt_epi32(kZero, u[ 1]);
+      v[ 2] = _mm256_cmpgt_epi32(kZero, u[ 2]);
+      v[ 3] = _mm256_cmpgt_epi32(kZero, u[ 3]);
+      v[ 4] = _mm256_cmpgt_epi32(kZero, u[ 4]);
+      v[ 5] = _mm256_cmpgt_epi32(kZero, u[ 5]);
+      v[ 6] = _mm256_cmpgt_epi32(kZero, u[ 6]);
+      v[ 7] = _mm256_cmpgt_epi32(kZero, u[ 7]);
+      v[ 8] = _mm256_cmpgt_epi32(kZero, u[ 8]);
+      v[ 9] = _mm256_cmpgt_epi32(kZero, u[ 9]);
+      v[10] = _mm256_cmpgt_epi32(kZero, u[10]);
+      v[11] = _mm256_cmpgt_epi32(kZero, u[11]);
+      v[12] = _mm256_cmpgt_epi32(kZero, u[12]);
+      v[13] = _mm256_cmpgt_epi32(kZero, u[13]);
+      v[14] = _mm256_cmpgt_epi32(kZero, u[14]);
+      v[15] = _mm256_cmpgt_epi32(kZero, u[15]);
+
+ u[ 0] = _mm256_sub_epi32(u[ 0], v[ 0]);
+ u[ 1] = _mm256_sub_epi32(u[ 1], v[ 1]);
+ u[ 2] = _mm256_sub_epi32(u[ 2], v[ 2]);
+ u[ 3] = _mm256_sub_epi32(u[ 3], v[ 3]);
+ u[ 4] = _mm256_sub_epi32(u[ 4], v[ 4]);
+ u[ 5] = _mm256_sub_epi32(u[ 5], v[ 5]);
+ u[ 6] = _mm256_sub_epi32(u[ 6], v[ 6]);
+ u[ 7] = _mm256_sub_epi32(u[ 7], v[ 7]);
+ u[ 8] = _mm256_sub_epi32(u[ 8], v[ 8]);
+ u[ 9] = _mm256_sub_epi32(u[ 9], v[ 9]);
+ u[10] = _mm256_sub_epi32(u[10], v[10]);
+ u[11] = _mm256_sub_epi32(u[11], v[11]);
+ u[12] = _mm256_sub_epi32(u[12], v[12]);
+ u[13] = _mm256_sub_epi32(u[13], v[13]);
+ u[14] = _mm256_sub_epi32(u[14], v[14]);
+ u[15] = _mm256_sub_epi32(u[15], v[15]);
+
+ v[ 0] = _mm256_add_epi32(u[ 0], K32One);
+ v[ 1] = _mm256_add_epi32(u[ 1], K32One);
+ v[ 2] = _mm256_add_epi32(u[ 2], K32One);
+ v[ 3] = _mm256_add_epi32(u[ 3], K32One);
+ v[ 4] = _mm256_add_epi32(u[ 4], K32One);
+ v[ 5] = _mm256_add_epi32(u[ 5], K32One);
+ v[ 6] = _mm256_add_epi32(u[ 6], K32One);
+ v[ 7] = _mm256_add_epi32(u[ 7], K32One);
+ v[ 8] = _mm256_add_epi32(u[ 8], K32One);
+ v[ 9] = _mm256_add_epi32(u[ 9], K32One);
+ v[10] = _mm256_add_epi32(u[10], K32One);
+ v[11] = _mm256_add_epi32(u[11], K32One);
+ v[12] = _mm256_add_epi32(u[12], K32One);
+ v[13] = _mm256_add_epi32(u[13], K32One);
+ v[14] = _mm256_add_epi32(u[14], K32One);
+ v[15] = _mm256_add_epi32(u[15], K32One);
+
+ u[ 0] = _mm256_srai_epi32(v[ 0], 2);
+ u[ 1] = _mm256_srai_epi32(v[ 1], 2);
+ u[ 2] = _mm256_srai_epi32(v[ 2], 2);
+ u[ 3] = _mm256_srai_epi32(v[ 3], 2);
+ u[ 4] = _mm256_srai_epi32(v[ 4], 2);
+ u[ 5] = _mm256_srai_epi32(v[ 5], 2);
+ u[ 6] = _mm256_srai_epi32(v[ 6], 2);
+ u[ 7] = _mm256_srai_epi32(v[ 7], 2);
+ u[ 8] = _mm256_srai_epi32(v[ 8], 2);
+ u[ 9] = _mm256_srai_epi32(v[ 9], 2);
+ u[10] = _mm256_srai_epi32(v[10], 2);
+ u[11] = _mm256_srai_epi32(v[11], 2);
+ u[12] = _mm256_srai_epi32(v[12], 2);
+ u[13] = _mm256_srai_epi32(v[13], 2);
+ u[14] = _mm256_srai_epi32(v[14], 2);
+ u[15] = _mm256_srai_epi32(v[15], 2);
+
+ out[ 2] = _mm256_packs_epi32(u[0], u[1]);
+ out[18] = _mm256_packs_epi32(u[2], u[3]);
+ out[10] = _mm256_packs_epi32(u[4], u[5]);
+ out[26] = _mm256_packs_epi32(u[6], u[7]);
+ out[ 6] = _mm256_packs_epi32(u[8], u[9]);
+ out[22] = _mm256_packs_epi32(u[10], u[11]);
+ out[14] = _mm256_packs_epi32(u[12], u[13]);
+ out[30] = _mm256_packs_epi32(u[14], u[15]);
+ }
+ {
+ lstep1[32] = _mm256_add_epi32(lstep3[34], lstep2[32]);
+ lstep1[33] = _mm256_add_epi32(lstep3[35], lstep2[33]);
+ lstep1[34] = _mm256_sub_epi32(lstep2[32], lstep3[34]);
+ lstep1[35] = _mm256_sub_epi32(lstep2[33], lstep3[35]);
+ lstep1[36] = _mm256_sub_epi32(lstep2[38], lstep3[36]);
+ lstep1[37] = _mm256_sub_epi32(lstep2[39], lstep3[37]);
+ lstep1[38] = _mm256_add_epi32(lstep3[36], lstep2[38]);
+ lstep1[39] = _mm256_add_epi32(lstep3[37], lstep2[39]);
+ lstep1[40] = _mm256_add_epi32(lstep3[42], lstep2[40]);
+ lstep1[41] = _mm256_add_epi32(lstep3[43], lstep2[41]);
+ lstep1[42] = _mm256_sub_epi32(lstep2[40], lstep3[42]);
+ lstep1[43] = _mm256_sub_epi32(lstep2[41], lstep3[43]);
+ lstep1[44] = _mm256_sub_epi32(lstep2[46], lstep3[44]);
+ lstep1[45] = _mm256_sub_epi32(lstep2[47], lstep3[45]);
+ lstep1[46] = _mm256_add_epi32(lstep3[44], lstep2[46]);
+ lstep1[47] = _mm256_add_epi32(lstep3[45], lstep2[47]);
+ lstep1[48] = _mm256_add_epi32(lstep3[50], lstep2[48]);
+ lstep1[49] = _mm256_add_epi32(lstep3[51], lstep2[49]);
+ lstep1[50] = _mm256_sub_epi32(lstep2[48], lstep3[50]);
+ lstep1[51] = _mm256_sub_epi32(lstep2[49], lstep3[51]);
+ lstep1[52] = _mm256_sub_epi32(lstep2[54], lstep3[52]);
+ lstep1[53] = _mm256_sub_epi32(lstep2[55], lstep3[53]);
+ lstep1[54] = _mm256_add_epi32(lstep3[52], lstep2[54]);
+ lstep1[55] = _mm256_add_epi32(lstep3[53], lstep2[55]);
+ lstep1[56] = _mm256_add_epi32(lstep3[58], lstep2[56]);
+ lstep1[57] = _mm256_add_epi32(lstep3[59], lstep2[57]);
+ lstep1[58] = _mm256_sub_epi32(lstep2[56], lstep3[58]);
+ lstep1[59] = _mm256_sub_epi32(lstep2[57], lstep3[59]);
+ lstep1[60] = _mm256_sub_epi32(lstep2[62], lstep3[60]);
+ lstep1[61] = _mm256_sub_epi32(lstep2[63], lstep3[61]);
+ lstep1[62] = _mm256_add_epi32(lstep3[60], lstep2[62]);
+ lstep1[63] = _mm256_add_epi32(lstep3[61], lstep2[63]);
+ }
+ // stage 8
+ {
+ const __m256i k32_p31_p01 = pair256_set_epi32(cospi_31_64, cospi_1_64);
+ const __m256i k32_p15_p17 = pair256_set_epi32(cospi_15_64, cospi_17_64);
+ const __m256i k32_p23_p09 = pair256_set_epi32(cospi_23_64, cospi_9_64);
+ const __m256i k32_p07_p25 = pair256_set_epi32(cospi_7_64, cospi_25_64);
+ const __m256i k32_m25_p07 = pair256_set_epi32(-cospi_25_64, cospi_7_64);
+ const __m256i k32_m09_p23 = pair256_set_epi32(-cospi_9_64, cospi_23_64);
+ const __m256i k32_m17_p15 = pair256_set_epi32(-cospi_17_64, cospi_15_64);
+ const __m256i k32_m01_p31 = pair256_set_epi32(-cospi_1_64, cospi_31_64);
+
+ u[ 0] = _mm256_unpacklo_epi32(lstep1[32], lstep1[62]);
+ u[ 1] = _mm256_unpackhi_epi32(lstep1[32], lstep1[62]);
+ u[ 2] = _mm256_unpacklo_epi32(lstep1[33], lstep1[63]);
+ u[ 3] = _mm256_unpackhi_epi32(lstep1[33], lstep1[63]);
+ u[ 4] = _mm256_unpacklo_epi32(lstep1[34], lstep1[60]);
+ u[ 5] = _mm256_unpackhi_epi32(lstep1[34], lstep1[60]);
+ u[ 6] = _mm256_unpacklo_epi32(lstep1[35], lstep1[61]);
+ u[ 7] = _mm256_unpackhi_epi32(lstep1[35], lstep1[61]);
+ u[ 8] = _mm256_unpacklo_epi32(lstep1[36], lstep1[58]);
+ u[ 9] = _mm256_unpackhi_epi32(lstep1[36], lstep1[58]);
+ u[10] = _mm256_unpacklo_epi32(lstep1[37], lstep1[59]);
+ u[11] = _mm256_unpackhi_epi32(lstep1[37], lstep1[59]);
+ u[12] = _mm256_unpacklo_epi32(lstep1[38], lstep1[56]);
+ u[13] = _mm256_unpackhi_epi32(lstep1[38], lstep1[56]);
+ u[14] = _mm256_unpacklo_epi32(lstep1[39], lstep1[57]);
+ u[15] = _mm256_unpackhi_epi32(lstep1[39], lstep1[57]);
+
+ v[ 0] = k_madd_epi32_avx2(u[ 0], k32_p31_p01);
+ v[ 1] = k_madd_epi32_avx2(u[ 1], k32_p31_p01);
+ v[ 2] = k_madd_epi32_avx2(u[ 2], k32_p31_p01);
+ v[ 3] = k_madd_epi32_avx2(u[ 3], k32_p31_p01);
+ v[ 4] = k_madd_epi32_avx2(u[ 4], k32_p15_p17);
+ v[ 5] = k_madd_epi32_avx2(u[ 5], k32_p15_p17);
+ v[ 6] = k_madd_epi32_avx2(u[ 6], k32_p15_p17);
+ v[ 7] = k_madd_epi32_avx2(u[ 7], k32_p15_p17);
+ v[ 8] = k_madd_epi32_avx2(u[ 8], k32_p23_p09);
+ v[ 9] = k_madd_epi32_avx2(u[ 9], k32_p23_p09);
+ v[10] = k_madd_epi32_avx2(u[10], k32_p23_p09);
+ v[11] = k_madd_epi32_avx2(u[11], k32_p23_p09);
+ v[12] = k_madd_epi32_avx2(u[12], k32_p07_p25);
+ v[13] = k_madd_epi32_avx2(u[13], k32_p07_p25);
+ v[14] = k_madd_epi32_avx2(u[14], k32_p07_p25);
+ v[15] = k_madd_epi32_avx2(u[15], k32_p07_p25);
+ v[16] = k_madd_epi32_avx2(u[12], k32_m25_p07);
+ v[17] = k_madd_epi32_avx2(u[13], k32_m25_p07);
+ v[18] = k_madd_epi32_avx2(u[14], k32_m25_p07);
+ v[19] = k_madd_epi32_avx2(u[15], k32_m25_p07);
+ v[20] = k_madd_epi32_avx2(u[ 8], k32_m09_p23);
+ v[21] = k_madd_epi32_avx2(u[ 9], k32_m09_p23);
+ v[22] = k_madd_epi32_avx2(u[10], k32_m09_p23);
+ v[23] = k_madd_epi32_avx2(u[11], k32_m09_p23);
+ v[24] = k_madd_epi32_avx2(u[ 4], k32_m17_p15);
+ v[25] = k_madd_epi32_avx2(u[ 5], k32_m17_p15);
+ v[26] = k_madd_epi32_avx2(u[ 6], k32_m17_p15);
+ v[27] = k_madd_epi32_avx2(u[ 7], k32_m17_p15);
+ v[28] = k_madd_epi32_avx2(u[ 0], k32_m01_p31);
+ v[29] = k_madd_epi32_avx2(u[ 1], k32_m01_p31);
+ v[30] = k_madd_epi32_avx2(u[ 2], k32_m01_p31);
+ v[31] = k_madd_epi32_avx2(u[ 3], k32_m01_p31);
+
+ u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
+ u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
+ u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
+ u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
+ u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
+ u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
+ u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
+ u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+ u[10] = k_packs_epi64_avx2(v[20], v[21]);
+ u[11] = k_packs_epi64_avx2(v[22], v[23]);
+ u[12] = k_packs_epi64_avx2(v[24], v[25]);
+ u[13] = k_packs_epi64_avx2(v[26], v[27]);
+ u[14] = k_packs_epi64_avx2(v[28], v[29]);
+ u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
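+      // dct_const_round_shift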
+ v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+ v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+ v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+ v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+ v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+ v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+ v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+ v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+ v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+ v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[ 0] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
+ u[ 1] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
+ u[ 2] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
+ u[ 3] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
+ u[ 4] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
+ u[ 5] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
+ u[ 6] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
+ u[ 7] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
+ u[ 8] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
+ u[ 9] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+ u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+
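+      // Round to nearest (ties toward zero) and divide by 4 before packing.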
+      v[ 0] = _mm256_cmpgt_epi32(kZero, u[ 0]);
+      v[ 1] = _mm256_cmpgt_epi32(kZero, u[ 1]);
+      v[ 2] = _mm256_cmpgt_epi32(kZero, u[ 2]);
+      v[ 3] = _mm256_cmpgt_epi32(kZero, u[ 3]);
+      v[ 4] = _mm256_cmpgt_epi32(kZero, u[ 4]);
+      v[ 5] = _mm256_cmpgt_epi32(kZero, u[ 5]);
+      v[ 6] = _mm256_cmpgt_epi32(kZero, u[ 6]);
+      v[ 7] = _mm256_cmpgt_epi32(kZero, u[ 7]);
+      v[ 8] = _mm256_cmpgt_epi32(kZero, u[ 8]);
+      v[ 9] = _mm256_cmpgt_epi32(kZero, u[ 9]);
+      v[10] = _mm256_cmpgt_epi32(kZero, u[10]);
+      v[11] = _mm256_cmpgt_epi32(kZero, u[11]);
+      v[12] = _mm256_cmpgt_epi32(kZero, u[12]);
+      v[13] = _mm256_cmpgt_epi32(kZero, u[13]);
+      v[14] = _mm256_cmpgt_epi32(kZero, u[14]);
+      v[15] = _mm256_cmpgt_epi32(kZero, u[15]);
+
+ u[ 0] = _mm256_sub_epi32(u[ 0], v[ 0]);
+ u[ 1] = _mm256_sub_epi32(u[ 1], v[ 1]);
+ u[ 2] = _mm256_sub_epi32(u[ 2], v[ 2]);
+ u[ 3] = _mm256_sub_epi32(u[ 3], v[ 3]);
+ u[ 4] = _mm256_sub_epi32(u[ 4], v[ 4]);
+ u[ 5] = _mm256_sub_epi32(u[ 5], v[ 5]);
+ u[ 6] = _mm256_sub_epi32(u[ 6], v[ 6]);
+ u[ 7] = _mm256_sub_epi32(u[ 7], v[ 7]);
+ u[ 8] = _mm256_sub_epi32(u[ 8], v[ 8]);
+ u[ 9] = _mm256_sub_epi32(u[ 9], v[ 9]);
+ u[10] = _mm256_sub_epi32(u[10], v[10]);
+ u[11] = _mm256_sub_epi32(u[11], v[11]);
+ u[12] = _mm256_sub_epi32(u[12], v[12]);
+ u[13] = _mm256_sub_epi32(u[13], v[13]);
+ u[14] = _mm256_sub_epi32(u[14], v[14]);
+ u[15] = _mm256_sub_epi32(u[15], v[15]);
+
+ v[0] = _mm256_add_epi32(u[0], K32One);
+ v[1] = _mm256_add_epi32(u[1], K32One);
+ v[2] = _mm256_add_epi32(u[2], K32One);
+ v[3] = _mm256_add_epi32(u[3], K32One);
+ v[4] = _mm256_add_epi32(u[4], K32One);
+ v[5] = _mm256_add_epi32(u[5], K32One);
+ v[6] = _mm256_add_epi32(u[6], K32One);
+ v[7] = _mm256_add_epi32(u[7], K32One);
+ v[8] = _mm256_add_epi32(u[8], K32One);
+ v[9] = _mm256_add_epi32(u[9], K32One);
+ v[10] = _mm256_add_epi32(u[10], K32One);
+ v[11] = _mm256_add_epi32(u[11], K32One);
+ v[12] = _mm256_add_epi32(u[12], K32One);
+ v[13] = _mm256_add_epi32(u[13], K32One);
+ v[14] = _mm256_add_epi32(u[14], K32One);
+ v[15] = _mm256_add_epi32(u[15], K32One);
+
+ u[0] = _mm256_srai_epi32(v[0], 2);
+ u[1] = _mm256_srai_epi32(v[1], 2);
+ u[2] = _mm256_srai_epi32(v[2], 2);
+ u[3] = _mm256_srai_epi32(v[3], 2);
+ u[4] = _mm256_srai_epi32(v[4], 2);
+ u[5] = _mm256_srai_epi32(v[5], 2);
+ u[6] = _mm256_srai_epi32(v[6], 2);
+ u[7] = _mm256_srai_epi32(v[7], 2);
+ u[8] = _mm256_srai_epi32(v[8], 2);
+ u[9] = _mm256_srai_epi32(v[9], 2);
+ u[10] = _mm256_srai_epi32(v[10], 2);
+ u[11] = _mm256_srai_epi32(v[11], 2);
+ u[12] = _mm256_srai_epi32(v[12], 2);
+ u[13] = _mm256_srai_epi32(v[13], 2);
+ u[14] = _mm256_srai_epi32(v[14], 2);
+ u[15] = _mm256_srai_epi32(v[15], 2);
+
+ out[ 1] = _mm256_packs_epi32(u[0], u[1]);
+ out[17] = _mm256_packs_epi32(u[2], u[3]);
+ out[ 9] = _mm256_packs_epi32(u[4], u[5]);
+ out[25] = _mm256_packs_epi32(u[6], u[7]);
+ out[ 7] = _mm256_packs_epi32(u[8], u[9]);
+ out[23] = _mm256_packs_epi32(u[10], u[11]);
+ out[15] = _mm256_packs_epi32(u[12], u[13]);
+ out[31] = _mm256_packs_epi32(u[14], u[15]);
+ }
+ {
+ const __m256i k32_p27_p05 = pair256_set_epi32(cospi_27_64, cospi_5_64);
+ const __m256i k32_p11_p21 = pair256_set_epi32(cospi_11_64, cospi_21_64);
+ const __m256i k32_p19_p13 = pair256_set_epi32(cospi_19_64, cospi_13_64);
+ const __m256i k32_p03_p29 = pair256_set_epi32(cospi_3_64, cospi_29_64);
+ const __m256i k32_m29_p03 = pair256_set_epi32(-cospi_29_64, cospi_3_64);
+ const __m256i k32_m13_p19 = pair256_set_epi32(-cospi_13_64, cospi_19_64);
+ const __m256i k32_m21_p11 = pair256_set_epi32(-cospi_21_64, cospi_11_64);
+ const __m256i k32_m05_p27 = pair256_set_epi32(-cospi_5_64, cospi_27_64);
+
+ u[ 0] = _mm256_unpacklo_epi32(lstep1[40], lstep1[54]);
+ u[ 1] = _mm256_unpackhi_epi32(lstep1[40], lstep1[54]);
+ u[ 2] = _mm256_unpacklo_epi32(lstep1[41], lstep1[55]);
+ u[ 3] = _mm256_unpackhi_epi32(lstep1[41], lstep1[55]);
+ u[ 4] = _mm256_unpacklo_epi32(lstep1[42], lstep1[52]);
+ u[ 5] = _mm256_unpackhi_epi32(lstep1[42], lstep1[52]);
+ u[ 6] = _mm256_unpacklo_epi32(lstep1[43], lstep1[53]);
+ u[ 7] = _mm256_unpackhi_epi32(lstep1[43], lstep1[53]);
+ u[ 8] = _mm256_unpacklo_epi32(lstep1[44], lstep1[50]);
+ u[ 9] = _mm256_unpackhi_epi32(lstep1[44], lstep1[50]);
+ u[10] = _mm256_unpacklo_epi32(lstep1[45], lstep1[51]);
+ u[11] = _mm256_unpackhi_epi32(lstep1[45], lstep1[51]);
+ u[12] = _mm256_unpacklo_epi32(lstep1[46], lstep1[48]);
+ u[13] = _mm256_unpackhi_epi32(lstep1[46], lstep1[48]);
+ u[14] = _mm256_unpacklo_epi32(lstep1[47], lstep1[49]);
+ u[15] = _mm256_unpackhi_epi32(lstep1[47], lstep1[49]);
+
+ v[ 0] = k_madd_epi32_avx2(u[ 0], k32_p27_p05);
+ v[ 1] = k_madd_epi32_avx2(u[ 1], k32_p27_p05);
+ v[ 2] = k_madd_epi32_avx2(u[ 2], k32_p27_p05);
+ v[ 3] = k_madd_epi32_avx2(u[ 3], k32_p27_p05);
+ v[ 4] = k_madd_epi32_avx2(u[ 4], k32_p11_p21);
+ v[ 5] = k_madd_epi32_avx2(u[ 5], k32_p11_p21);
+ v[ 6] = k_madd_epi32_avx2(u[ 6], k32_p11_p21);
+ v[ 7] = k_madd_epi32_avx2(u[ 7], k32_p11_p21);
+ v[ 8] = k_madd_epi32_avx2(u[ 8], k32_p19_p13);
+ v[ 9] = k_madd_epi32_avx2(u[ 9], k32_p19_p13);
+ v[10] = k_madd_epi32_avx2(u[10], k32_p19_p13);
+ v[11] = k_madd_epi32_avx2(u[11], k32_p19_p13);
+ v[12] = k_madd_epi32_avx2(u[12], k32_p03_p29);
+ v[13] = k_madd_epi32_avx2(u[13], k32_p03_p29);
+ v[14] = k_madd_epi32_avx2(u[14], k32_p03_p29);
+ v[15] = k_madd_epi32_avx2(u[15], k32_p03_p29);
+ v[16] = k_madd_epi32_avx2(u[12], k32_m29_p03);
+ v[17] = k_madd_epi32_avx2(u[13], k32_m29_p03);
+ v[18] = k_madd_epi32_avx2(u[14], k32_m29_p03);
+ v[19] = k_madd_epi32_avx2(u[15], k32_m29_p03);
+ v[20] = k_madd_epi32_avx2(u[ 8], k32_m13_p19);
+ v[21] = k_madd_epi32_avx2(u[ 9], k32_m13_p19);
+ v[22] = k_madd_epi32_avx2(u[10], k32_m13_p19);
+ v[23] = k_madd_epi32_avx2(u[11], k32_m13_p19);
+ v[24] = k_madd_epi32_avx2(u[ 4], k32_m21_p11);
+ v[25] = k_madd_epi32_avx2(u[ 5], k32_m21_p11);
+ v[26] = k_madd_epi32_avx2(u[ 6], k32_m21_p11);
+ v[27] = k_madd_epi32_avx2(u[ 7], k32_m21_p11);
+ v[28] = k_madd_epi32_avx2(u[ 0], k32_m05_p27);
+ v[29] = k_madd_epi32_avx2(u[ 1], k32_m05_p27);
+ v[30] = k_madd_epi32_avx2(u[ 2], k32_m05_p27);
+ v[31] = k_madd_epi32_avx2(u[ 3], k32_m05_p27);
+
+ u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
+ u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
+ u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
+ u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
+ u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
+ u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
+ u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
+ u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+ u[10] = k_packs_epi64_avx2(v[20], v[21]);
+ u[11] = k_packs_epi64_avx2(v[22], v[23]);
+ u[12] = k_packs_epi64_avx2(v[24], v[25]);
+ u[13] = k_packs_epi64_avx2(v[26], v[27]);
+ u[14] = k_packs_epi64_avx2(v[28], v[29]);
+ u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+ v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+ v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+ v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+ v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+ v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+ v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+ v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+ v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+ v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+ v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[ 0] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
+ u[ 1] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
+ u[ 2] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
+ u[ 3] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
+ u[ 4] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
+ u[ 5] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
+ u[ 6] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
+ u[ 7] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
+ u[ 8] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
+ u[ 9] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+ u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+
+ v[ 0] = _mm256_cmpgt_epi32(kZero,u[ 0]);
+ v[ 1] = _mm256_cmpgt_epi32(kZero,u[ 1]);
+ v[ 2] = _mm256_cmpgt_epi32(kZero,u[ 2]);
+ v[ 3] = _mm256_cmpgt_epi32(kZero,u[ 3]);
+ v[ 4] = _mm256_cmpgt_epi32(kZero,u[ 4]);
+ v[ 5] = _mm256_cmpgt_epi32(kZero,u[ 5]);
+ v[ 6] = _mm256_cmpgt_epi32(kZero,u[ 6]);
+ v[ 7] = _mm256_cmpgt_epi32(kZero,u[ 7]);
+ v[ 8] = _mm256_cmpgt_epi32(kZero,u[ 8]);
+ v[ 9] = _mm256_cmpgt_epi32(kZero,u[ 9]);
+ v[10] = _mm256_cmpgt_epi32(kZero,u[10]);
+ v[11] = _mm256_cmpgt_epi32(kZero,u[11]);
+ v[12] = _mm256_cmpgt_epi32(kZero,u[12]);
+ v[13] = _mm256_cmpgt_epi32(kZero,u[13]);
+ v[14] = _mm256_cmpgt_epi32(kZero,u[14]);
+ v[15] = _mm256_cmpgt_epi32(kZero,u[15]);
+
+ u[ 0] = _mm256_sub_epi32(u[ 0], v[ 0]);
+ u[ 1] = _mm256_sub_epi32(u[ 1], v[ 1]);
+ u[ 2] = _mm256_sub_epi32(u[ 2], v[ 2]);
+ u[ 3] = _mm256_sub_epi32(u[ 3], v[ 3]);
+ u[ 4] = _mm256_sub_epi32(u[ 4], v[ 4]);
+ u[ 5] = _mm256_sub_epi32(u[ 5], v[ 5]);
+ u[ 6] = _mm256_sub_epi32(u[ 6], v[ 6]);
+ u[ 7] = _mm256_sub_epi32(u[ 7], v[ 7]);
+ u[ 8] = _mm256_sub_epi32(u[ 8], v[ 8]);
+ u[ 9] = _mm256_sub_epi32(u[ 9], v[ 9]);
+ u[10] = _mm256_sub_epi32(u[10], v[10]);
+ u[11] = _mm256_sub_epi32(u[11], v[11]);
+ u[12] = _mm256_sub_epi32(u[12], v[12]);
+ u[13] = _mm256_sub_epi32(u[13], v[13]);
+ u[14] = _mm256_sub_epi32(u[14], v[14]);
+ u[15] = _mm256_sub_epi32(u[15], v[15]);
+
+ v[0] = _mm256_add_epi32(u[0], K32One);
+ v[1] = _mm256_add_epi32(u[1], K32One);
+ v[2] = _mm256_add_epi32(u[2], K32One);
+ v[3] = _mm256_add_epi32(u[3], K32One);
+ v[4] = _mm256_add_epi32(u[4], K32One);
+ v[5] = _mm256_add_epi32(u[5], K32One);
+ v[6] = _mm256_add_epi32(u[6], K32One);
+ v[7] = _mm256_add_epi32(u[7], K32One);
+ v[8] = _mm256_add_epi32(u[8], K32One);
+ v[9] = _mm256_add_epi32(u[9], K32One);
+ v[10] = _mm256_add_epi32(u[10], K32One);
+ v[11] = _mm256_add_epi32(u[11], K32One);
+ v[12] = _mm256_add_epi32(u[12], K32One);
+ v[13] = _mm256_add_epi32(u[13], K32One);
+ v[14] = _mm256_add_epi32(u[14], K32One);
+ v[15] = _mm256_add_epi32(u[15], K32One);
+
+ u[0] = _mm256_srai_epi32(v[0], 2);
+ u[1] = _mm256_srai_epi32(v[1], 2);
+ u[2] = _mm256_srai_epi32(v[2], 2);
+ u[3] = _mm256_srai_epi32(v[3], 2);
+ u[4] = _mm256_srai_epi32(v[4], 2);
+ u[5] = _mm256_srai_epi32(v[5], 2);
+ u[6] = _mm256_srai_epi32(v[6], 2);
+ u[7] = _mm256_srai_epi32(v[7], 2);
+ u[8] = _mm256_srai_epi32(v[8], 2);
+ u[9] = _mm256_srai_epi32(v[9], 2);
+ u[10] = _mm256_srai_epi32(v[10], 2);
+ u[11] = _mm256_srai_epi32(v[11], 2);
+ u[12] = _mm256_srai_epi32(v[12], 2);
+ u[13] = _mm256_srai_epi32(v[13], 2);
+ u[14] = _mm256_srai_epi32(v[14], 2);
+ u[15] = _mm256_srai_epi32(v[15], 2);
+
+ out[ 5] = _mm256_packs_epi32(u[0], u[1]);
+ out[21] = _mm256_packs_epi32(u[2], u[3]);
+ out[13] = _mm256_packs_epi32(u[4], u[5]);
+ out[29] = _mm256_packs_epi32(u[6], u[7]);
+ out[ 3] = _mm256_packs_epi32(u[8], u[9]);
+ out[19] = _mm256_packs_epi32(u[10], u[11]);
+ out[11] = _mm256_packs_epi32(u[12], u[13]);
+ out[27] = _mm256_packs_epi32(u[14], u[15]);
+ }
+ }
+#endif
+ // Transpose the results, do it as four 8x8 transposes.
+ {
+ int transpose_block;
+        int16_t *output_currStep, *output_nextStep;
+        if (0 == pass) {
+ output_currStep = &intermediate[column_start * 32];
+ output_nextStep = &intermediate[(column_start + 8) * 32];
+        } else {
+ output_currStep = &output_org[column_start * 32];
+ output_nextStep = &output_org[(column_start + 8) * 32];
+ }
+ for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
+ __m256i *this_out = &out[8 * transpose_block];
+ // 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+ // 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
+ // 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
+ // 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
+ // 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
+ // 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
+ // 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
+ // 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
+ const __m256i tr0_0 = _mm256_unpacklo_epi16(this_out[0], this_out[1]);
+ const __m256i tr0_1 = _mm256_unpacklo_epi16(this_out[2], this_out[3]);
+ const __m256i tr0_2 = _mm256_unpackhi_epi16(this_out[0], this_out[1]);
+ const __m256i tr0_3 = _mm256_unpackhi_epi16(this_out[2], this_out[3]);
+ const __m256i tr0_4 = _mm256_unpacklo_epi16(this_out[4], this_out[5]);
+ const __m256i tr0_5 = _mm256_unpacklo_epi16(this_out[6], this_out[7]);
+ const __m256i tr0_6 = _mm256_unpackhi_epi16(this_out[4], this_out[5]);
+ const __m256i tr0_7 = _mm256_unpackhi_epi16(this_out[6], this_out[7]);
+ // 00 20 01 21 02 22 03 23 08 28 09 29 10 30 11 31
+ // 40 60 41 61 42 62 43 63 48 68 49 69 50 70 51 71
+ // 04 24 05 25 06 26 07 27 12 32 13 33 14 34 15 35
+ // 44 64 45 65 46 66 47 67 52 72 53 73 54 74 55 75
+          // 80 100 81 101 82 102 83 103 88 108 89 109 90 110 91 111
+ // 120 140 121 141 122 142 123 143 128 148 129 149 130 150 131 151
+ // 84 104 85 105 86 106 87 107 92 112 93 113 94 114 95 115
+ // 124 144 125 145 126 146 127 147 132 152 133 153 134 154 135 155
+
+ const __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1);
+ const __m256i tr1_1 = _mm256_unpacklo_epi32(tr0_2, tr0_3);
+ const __m256i tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1);
+ const __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_2, tr0_3);
+ const __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_5);
+ const __m256i tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7);
+ const __m256i tr1_6 = _mm256_unpackhi_epi32(tr0_4, tr0_5);
+ const __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 20 40 60 01 21 41 61 08 28 48 68 09 29 49 69
+ // 04 24 44 64 05 25 45 65 12 32 52 72 13 33 53 73
+ // 02 22 42 62 03 23 43 63 10 30 50 70 11 31 51 71
+ // 06 26 46 66 07 27 47 67 14 34 54 74 15 35 55 75
+ // 80 100 120 140 81 101 121 141 88 108 128 148 89 109 129 149
+ // 84 104 124 144 85 105 125 145 92 112 132 152 93 113 133 153
+          // 82 102 122 142 83 103 123 143 90 110 130 150 91 111 131 151
+ // 86 106 126 146 87 107 127 147 94 114 134 154 95 115 135 155
+ __m256i tr2_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
+ __m256i tr2_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
+ __m256i tr2_2 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
+ __m256i tr2_3 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
+ __m256i tr2_4 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
+ __m256i tr2_5 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
+ __m256i tr2_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
+ __m256i tr2_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 20 40 60 80 100 120 140 08 28 48 68 88 108 128 148
+ // 01 21 41 61 81 101 121 141 09 29 49 69 89 109 129 149
+ // 02 22 42 62 82 102 122 142 10 30 50 70 90 110 130 150
+          // 03 23 43 63 83 103 123 143 11 31 51 71 91 111 131 151
+ // 04 24 44 64 84 104 124 144 12 32 52 72 92 112 132 152
+ // 05 25 45 65 85 105 125 145 13 33 53 73 93 113 133 153
+ // 06 26 46 66 86 106 126 146 14 34 54 74 94 114 134 154
+ // 07 27 47 67 87 107 127 147 15 35 55 75 95 115 135 155
+ if (0 == pass) {
+ // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
+ // TODO(cd): see quality impact of only doing
+ // output[j] = (output[j] + 1) >> 2;
+ // which would remove the code between here ...
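+            // Worked example of the rounding below (illustration only):
+            // x = 7 gives (7 + 1 + 1) >> 2 = 2 and x = -7 gives
+            // (-7 + 1 + 0) >> 2 = -2, so the result is symmetric in sign.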
+ __m256i tr2_0_0 = _mm256_cmpgt_epi16(tr2_0, kZero);
+ __m256i tr2_1_0 = _mm256_cmpgt_epi16(tr2_1, kZero);
+ __m256i tr2_2_0 = _mm256_cmpgt_epi16(tr2_2, kZero);
+ __m256i tr2_3_0 = _mm256_cmpgt_epi16(tr2_3, kZero);
+ __m256i tr2_4_0 = _mm256_cmpgt_epi16(tr2_4, kZero);
+ __m256i tr2_5_0 = _mm256_cmpgt_epi16(tr2_5, kZero);
+ __m256i tr2_6_0 = _mm256_cmpgt_epi16(tr2_6, kZero);
+ __m256i tr2_7_0 = _mm256_cmpgt_epi16(tr2_7, kZero);
+ tr2_0 = _mm256_sub_epi16(tr2_0, tr2_0_0);
+ tr2_1 = _mm256_sub_epi16(tr2_1, tr2_1_0);
+ tr2_2 = _mm256_sub_epi16(tr2_2, tr2_2_0);
+ tr2_3 = _mm256_sub_epi16(tr2_3, tr2_3_0);
+ tr2_4 = _mm256_sub_epi16(tr2_4, tr2_4_0);
+ tr2_5 = _mm256_sub_epi16(tr2_5, tr2_5_0);
+ tr2_6 = _mm256_sub_epi16(tr2_6, tr2_6_0);
+ tr2_7 = _mm256_sub_epi16(tr2_7, tr2_7_0);
+ // ... and here.
+ // PS: also change code in vp9/encoder/vp9_dct.c
+ tr2_0 = _mm256_add_epi16(tr2_0, kOne);
+ tr2_1 = _mm256_add_epi16(tr2_1, kOne);
+ tr2_2 = _mm256_add_epi16(tr2_2, kOne);
+ tr2_3 = _mm256_add_epi16(tr2_3, kOne);
+ tr2_4 = _mm256_add_epi16(tr2_4, kOne);
+ tr2_5 = _mm256_add_epi16(tr2_5, kOne);
+ tr2_6 = _mm256_add_epi16(tr2_6, kOne);
+ tr2_7 = _mm256_add_epi16(tr2_7, kOne);
+ tr2_0 = _mm256_srai_epi16(tr2_0, 2);
+ tr2_1 = _mm256_srai_epi16(tr2_1, 2);
+ tr2_2 = _mm256_srai_epi16(tr2_2, 2);
+ tr2_3 = _mm256_srai_epi16(tr2_3, 2);
+ tr2_4 = _mm256_srai_epi16(tr2_4, 2);
+ tr2_5 = _mm256_srai_epi16(tr2_5, 2);
+ tr2_6 = _mm256_srai_epi16(tr2_6, 2);
+ tr2_7 = _mm256_srai_epi16(tr2_7, 2);
+ }
+ // Note: even though all these stores are aligned, using the aligned
+          // intrinsic makes the code slightly slower.
+ _mm_storeu_si128((__m128i *)(output_currStep + 0 * 32), _mm256_castsi256_si128(tr2_0));
+ _mm_storeu_si128((__m128i *)(output_currStep + 1 * 32), _mm256_castsi256_si128(tr2_1));
+ _mm_storeu_si128((__m128i *)(output_currStep + 2 * 32), _mm256_castsi256_si128(tr2_2));
+ _mm_storeu_si128((__m128i *)(output_currStep + 3 * 32), _mm256_castsi256_si128(tr2_3));
+ _mm_storeu_si128((__m128i *)(output_currStep + 4 * 32), _mm256_castsi256_si128(tr2_4));
+ _mm_storeu_si128((__m128i *)(output_currStep + 5 * 32), _mm256_castsi256_si128(tr2_5));
+ _mm_storeu_si128((__m128i *)(output_currStep + 6 * 32), _mm256_castsi256_si128(tr2_6));
+ _mm_storeu_si128((__m128i *)(output_currStep + 7 * 32), _mm256_castsi256_si128(tr2_7));
+
+ _mm_storeu_si128((__m128i *)(output_nextStep + 0 * 32), _mm256_extractf128_si256(tr2_0,1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 1 * 32), _mm256_extractf128_si256(tr2_1,1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 2 * 32), _mm256_extractf128_si256(tr2_2,1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 3 * 32), _mm256_extractf128_si256(tr2_3,1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 4 * 32), _mm256_extractf128_si256(tr2_4,1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 5 * 32), _mm256_extractf128_si256(tr2_5,1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32), _mm256_extractf128_si256(tr2_6,1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32), _mm256_extractf128_si256(tr2_7,1));
+ // Process next 8x8
+ output_currStep += 8;
+ output_nextStep += 8;
+ }
+ }
+ }
+ }
+} // NOLINT
diff --git a/vp9/encoder/x86/vp9_dct_avx2.c b/vp9/encoder/x86/vp9_dct_avx2.c
new file mode 100644
index 000000000..d81b72bba
--- /dev/null
+++ b/vp9/encoder/x86/vp9_dct_avx2.c
@@ -0,0 +1,2579 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h> // AVX2
+#include "vp9/common/vp9_idct.h" // for cospi constants
+#include "vpx_ports/mem.h"
+
+void vp9_fdct4x4_avx2(const int16_t *input, int16_t *output, int stride) {
+ // The 2D transform is done with two passes which are actually pretty
+ // similar. In the first one, we transform the columns and transpose
+ // the results. In the second one, we transform the rows. To achieve that,
+  // as the first pass results are transposed, we transpose the columns (that
+  // is, the transposed rows) and transpose the results (so that they go back
+  // into normal/row positions).
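+  // Roughly, as a structural sketch (not the exact arithmetic):
+  //   for (pass = 0; pass < 2; ++pass) {
+  //     fdct4(columns);   // 1-D transform of the four columns
+  //     transpose();      // rows become columns for the next pass
+  //   }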
+ int pass;
+ // Constants
+ // When we use them, in one case, they are all the same. In all others
+ // it's a pair of them that we need to repeat four times. This is done
+ // by constructing the 32 bit constant corresponding to that pair.
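+  // For reference: pair_set_epi16(a, b) is assumed to replicate the (a, b)
+  // pair across all eight 16-bit lanes, so _mm_madd_epi16() applied to data
+  // interleaved as (x, y) pairs yields a * x + b * y in each 32-bit lane.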
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
+ const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+ const __m128i kOne = _mm_set1_epi16(1);
+ __m128i in0, in1, in2, in3;
+ // Load inputs.
+ {
+ in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+ in3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+ // x = x << 4
+ in0 = _mm_slli_epi16(in0, 4);
+ in1 = _mm_slli_epi16(in1, 4);
+ in2 = _mm_slli_epi16(in2, 4);
+ in3 = _mm_slli_epi16(in3, 4);
+ // if (i == 0 && input[0]) input[0] += 1;
+ {
+      // The mask will only record whether the first value is zero; all
+      // other comparisons fail because a value shifted left by 4 (the << 4
+      // above) can never equal one. To increment in the non-zero case, we
+ // add the mask and one for the first element:
+ // - if zero, mask = -1, v = v - 1 + 1 = v
+ // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
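+      // For example (illustration only): input[0] = 3 becomes 48 after the
+      // shift, the compare fails and 48 + 0 + 1 = 49; input[0] = 0 stays 0,
+      // the compare succeeds and 0 - 1 + 1 = 0.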
+ __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
+ in0 = _mm_add_epi16(in0, mask);
+ in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
+ }
+ }
+ // Do the two transform/transpose passes
+ for (pass = 0; pass < 2; ++pass) {
+    // Transform 1/2: Add/subtract
+ const __m128i r0 = _mm_add_epi16(in0, in3);
+ const __m128i r1 = _mm_add_epi16(in1, in2);
+ const __m128i r2 = _mm_sub_epi16(in1, in2);
+ const __m128i r3 = _mm_sub_epi16(in0, in3);
+ // Transform 1/2: Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ // Combine and transpose
+ const __m128i res0 = _mm_packs_epi32(w0, w2);
+ const __m128i res1 = _mm_packs_epi32(w4, w6);
+ // 00 01 02 03 20 21 22 23
+ // 10 11 12 13 30 31 32 33
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+ const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ in2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ // 00 10 20 30 01 11 21 31 in0 contains 0 followed by 1
+ // 02 12 22 32 03 13 23 33 in2 contains 2 followed by 3
+ if (0 == pass) {
+ // Extract values in the high part for second pass as transform code
+ // only uses the first four values.
+ in1 = _mm_unpackhi_epi64(in0, in0);
+ in3 = _mm_unpackhi_epi64(in2, in2);
+ } else {
+      // Post-condition the output as (v + 1) >> 2 and store it, taking
+      // advantage of the fact that rows 1/3 are stored just after rows 0/2.
+ __m128i out01 = _mm_add_epi16(in0, kOne);
+ __m128i out23 = _mm_add_epi16(in2, kOne);
+ out01 = _mm_srai_epi16(out01, 2);
+ out23 = _mm_srai_epi16(out23, 2);
+ _mm_storeu_si128((__m128i *)(output + 0 * 4), out01);
+ _mm_storeu_si128((__m128i *)(output + 2 * 4), out23);
+ }
+ }
+}
+
+static INLINE void load_buffer_4x4_avx2(const int16_t *input, __m128i *in,
+ int stride) {
+ const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
+ const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+ __m128i mask;
+
+ in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+
+ in[0] = _mm_slli_epi16(in[0], 4);
+ in[1] = _mm_slli_epi16(in[1], 4);
+ in[2] = _mm_slli_epi16(in[2], 4);
+ in[3] = _mm_slli_epi16(in[3], 4);
+
+ mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);
+ in[0] = _mm_add_epi16(in[0], mask);
+ in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
+}
+
+static INLINE void write_buffer_4x4_avx2(int16_t *output, __m128i *res) {
+ const __m128i kOne = _mm_set1_epi16(1);
+ __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
+ __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
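+  // Post-condition each coefficient as (x + 1) >> 2 before storing, matching
+  // the second pass of vp9_fdct4x4_avx2 above.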
+ __m128i out01 = _mm_add_epi16(in01, kOne);
+ __m128i out23 = _mm_add_epi16(in23, kOne);
+ out01 = _mm_srai_epi16(out01, 2);
+ out23 = _mm_srai_epi16(out23, 2);
+ _mm_store_si128((__m128i *)(output + 0 * 8), out01);
+ _mm_store_si128((__m128i *)(output + 1 * 8), out23);
+}
+
+static INLINE void transpose_4x4_avx2(__m128i *res) {
+ // Combine and transpose
+ // 00 01 02 03 20 21 22 23
+ // 10 11 12 13 30 31 32 33
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
+ const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
+
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);
+
+ // 00 10 20 30 01 11 21 31
+ // 02 12 22 32 03 13 23 33
+ // only use the first 4 16-bit integers
+ res[1] = _mm_unpackhi_epi64(res[0], res[0]);
+ res[3] = _mm_unpackhi_epi64(res[2], res[2]);
+}
+
+void fdct4_1d_avx2(__m128i *in) {
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ __m128i u[4], v[4];
+  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+  u[1] = _mm_unpacklo_epi16(in[3], in[2]);
+
+ v[0] = _mm_add_epi16(u[0], u[1]);
+ v[1] = _mm_sub_epi16(u[0], u[1]);
+
+ u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0
+ u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2
+ u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24); // 1
+ u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08); // 3
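+  // For reference (assuming the usual pair_set_epi16 lane order), the four
+  // products above are the scalar butterflies
+  //   out0 = ((in[0] + in[3]) + (in[1] + in[2])) * cospi_16_64
+  //   out2 = ((in[0] + in[3]) - (in[1] + in[2])) * cospi_16_64
+  //   out1 = (in[0] - in[3]) * cospi_8_64 + (in[1] - in[2]) * cospi_24_64
+  //   out3 = (in[0] - in[3]) * cospi_24_64 - (in[1] - in[2]) * cospi_8_64
+  // evaluated on all four columns at once.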
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(u[0], u[1]);
+ in[1] = _mm_packs_epi32(u[2], u[3]);
+ transpose_4x4_avx2(in);
+}
+
+void fadst4_1d_avx2(__m128i *in) {
+ const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
+ const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
+ const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
+ const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
+ const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
+ const __m128i kZero = _mm_set1_epi16(0);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ __m128i u[8], v[8];
+ __m128i in7 = _mm_add_epi16(in[0], in[1]);
+
+ u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+ u[1] = _mm_unpacklo_epi16(in[2], in[3]);
+ u[2] = _mm_unpacklo_epi16(in7, kZero);
+ u[3] = _mm_unpacklo_epi16(in[2], kZero);
+ u[4] = _mm_unpacklo_epi16(in[3], kZero);
+
+ v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2
+ v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5
+ v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x1
+ v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3
+ v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6
+ v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4
+ v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);
+
+ u[0] = _mm_add_epi32(v[0], v[1]);
+ u[1] = _mm_sub_epi32(v[2], v[6]);
+ u[2] = _mm_add_epi32(v[3], v[4]);
+ u[3] = _mm_sub_epi32(u[2], u[0]);
+ u[4] = _mm_slli_epi32(v[5], 2);
+ u[5] = _mm_sub_epi32(u[4], v[5]);
+ u[6] = _mm_add_epi32(u[3], u[5]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(u[0], u[2]);
+ in[1] = _mm_packs_epi32(u[1], u[3]);
+ transpose_4x4_avx2(in);
+}
+
+void vp9_short_fht4x4_avx2(const int16_t *input, int16_t *output,
+ int stride, int tx_type) {
+ __m128i in[4];
+ load_buffer_4x4_avx2(input, in, stride);
+ switch (tx_type) {
+ case 0: // DCT_DCT
+ fdct4_1d_avx2(in);
+ fdct4_1d_avx2(in);
+ break;
+ case 1: // ADST_DCT
+ fadst4_1d_avx2(in);
+ fdct4_1d_avx2(in);
+ break;
+ case 2: // DCT_ADST
+ fdct4_1d_avx2(in);
+ fadst4_1d_avx2(in);
+ break;
+ case 3: // ADST_ADST
+ fadst4_1d_avx2(in);
+ fadst4_1d_avx2(in);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+ write_buffer_4x4_avx2(output, in);
+}
+
+void vp9_fdct8x8_avx2(const int16_t *input, int16_t *output, int stride) {
+ int pass;
+ // Constants
+ // When we use them, in one case, they are all the same. In all others
+ // it's a pair of them that we need to repeat four times. This is done
+ // by constructing the 32 bit constant corresponding to that pair.
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ // Load input
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ // Pre-condition input (shift by two)
+ in0 = _mm_slli_epi16(in0, 2);
+ in1 = _mm_slli_epi16(in1, 2);
+ in2 = _mm_slli_epi16(in2, 2);
+ in3 = _mm_slli_epi16(in3, 2);
+ in4 = _mm_slli_epi16(in4, 2);
+ in5 = _mm_slli_epi16(in5, 2);
+ in6 = _mm_slli_epi16(in6, 2);
+ in7 = _mm_slli_epi16(in7, 2);
+
+ // We do two passes, first the columns, then the rows. The results of the
+ // first pass are transposed so that the same column code can be reused. The
+ // results of the second pass are also transposed so that the rows (processed
+ // as columns) are put back in row positions.
+ for (pass = 0; pass < 2; pass++) {
+ // To store results of each pass before the transpose.
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+    // Add/subtract
+ const __m128i q0 = _mm_add_epi16(in0, in7);
+ const __m128i q1 = _mm_add_epi16(in1, in6);
+ const __m128i q2 = _mm_add_epi16(in2, in5);
+ const __m128i q3 = _mm_add_epi16(in3, in4);
+ const __m128i q4 = _mm_sub_epi16(in3, in4);
+ const __m128i q5 = _mm_sub_epi16(in2, in5);
+ const __m128i q6 = _mm_sub_epi16(in1, in6);
+ const __m128i q7 = _mm_sub_epi16(in0, in7);
+ // Work on first four results
+ {
+      // Add/subtract
+ const __m128i r0 = _mm_add_epi16(q0, q3);
+ const __m128i r1 = _mm_add_epi16(q1, q2);
+ const __m128i r2 = _mm_sub_epi16(q1, q2);
+ const __m128i r3 = _mm_sub_epi16(q0, q3);
+ // Interleave to do the multiply by constants which gets us into 32bits
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+ const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res0 = _mm_packs_epi32(w0, w1);
+ res4 = _mm_packs_epi32(w2, w3);
+ res2 = _mm_packs_epi32(w4, w5);
+ res6 = _mm_packs_epi32(w6, w7);
+ }
+ // Work on next four results
+ {
+ // Interleave to do the multiply by constants which gets us into 32bits
+ const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+ const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+ const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
+ const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
+ const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
+ const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
+ const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
+ const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
+ const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
+ const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
+ const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
+ const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
+ const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
+ // Combine
+ const __m128i r0 = _mm_packs_epi32(s0, s1);
+ const __m128i r1 = _mm_packs_epi32(s2, s3);
+      // Add/subtract
+ const __m128i x0 = _mm_add_epi16(q4, r0);
+ const __m128i x1 = _mm_sub_epi16(q4, r0);
+ const __m128i x2 = _mm_sub_epi16(q7, r1);
+ const __m128i x3 = _mm_add_epi16(q7, r1);
+ // Interleave to do the multiply by constants which gets us into 32bits
+ const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+ const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+ const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+ const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res1 = _mm_packs_epi32(w0, w1);
+ res7 = _mm_packs_epi32(w2, w3);
+ res5 = _mm_packs_epi32(w4, w5);
+ res3 = _mm_packs_epi32(w6, w7);
+ }
+ // Transpose the 8x8.
+ {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+      // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+      // 04 14 24 34 05 15 25 35
+      // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ }
+ }
+ // Post-condition output and store it
+ {
+ // Post-condition (division by two)
+ // division of two 16 bits signed numbers using shifts
+ // n / 2 = (n - (n >> 15)) >> 1
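+    // e.g. n = -5: (-5 - (-1)) >> 1 = -4 >> 1 = -2, whereas a plain
+    // arithmetic shift would give -5 >> 1 = -3 (rounding toward -infinity).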
+ const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
+ const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
+ const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
+ const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
+ const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
+ const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
+ const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
+ const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
+ in0 = _mm_sub_epi16(in0, sign_in0);
+ in1 = _mm_sub_epi16(in1, sign_in1);
+ in2 = _mm_sub_epi16(in2, sign_in2);
+ in3 = _mm_sub_epi16(in3, sign_in3);
+ in4 = _mm_sub_epi16(in4, sign_in4);
+ in5 = _mm_sub_epi16(in5, sign_in5);
+ in6 = _mm_sub_epi16(in6, sign_in6);
+ in7 = _mm_sub_epi16(in7, sign_in7);
+ in0 = _mm_srai_epi16(in0, 1);
+ in1 = _mm_srai_epi16(in1, 1);
+ in2 = _mm_srai_epi16(in2, 1);
+ in3 = _mm_srai_epi16(in3, 1);
+ in4 = _mm_srai_epi16(in4, 1);
+ in5 = _mm_srai_epi16(in5, 1);
+ in6 = _mm_srai_epi16(in6, 1);
+ in7 = _mm_srai_epi16(in7, 1);
+ // store results
+ _mm_store_si128((__m128i *)(output + 0 * 8), in0);
+ _mm_store_si128((__m128i *)(output + 1 * 8), in1);
+ _mm_store_si128((__m128i *)(output + 2 * 8), in2);
+ _mm_store_si128((__m128i *)(output + 3 * 8), in3);
+ _mm_store_si128((__m128i *)(output + 4 * 8), in4);
+ _mm_store_si128((__m128i *)(output + 5 * 8), in5);
+ _mm_store_si128((__m128i *)(output + 6 * 8), in6);
+ _mm_store_si128((__m128i *)(output + 7 * 8), in7);
+ }
+}
+
+// load 8x8 array
+static INLINE void load_buffer_8x8_avx2(const int16_t *input, __m128i *in,
+ int stride) {
+ in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+
+ in[0] = _mm_slli_epi16(in[0], 2);
+ in[1] = _mm_slli_epi16(in[1], 2);
+ in[2] = _mm_slli_epi16(in[2], 2);
+ in[3] = _mm_slli_epi16(in[3], 2);
+ in[4] = _mm_slli_epi16(in[4], 2);
+ in[5] = _mm_slli_epi16(in[5], 2);
+ in[6] = _mm_slli_epi16(in[6], 2);
+ in[7] = _mm_slli_epi16(in[7], 2);
+}
+
+// right shift and rounding
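+// In scalar terms each 16-bit lane becomes
+//   res[i] = (res[i] + (bit >= 2 ? 1 << (bit - 2) : 0)
+//             + (res[i] < 0)) >> bit,
+// i.e. a rounded right shift that biases negative values toward zero.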
+static INLINE void right_shift_8x8_avx2(__m128i *res, int const bit) {
+ const __m128i kOne = _mm_set1_epi16(1);
+ const int bit_m02 = bit - 2;
+ __m128i sign0 = _mm_srai_epi16(res[0], 15);
+ __m128i sign1 = _mm_srai_epi16(res[1], 15);
+ __m128i sign2 = _mm_srai_epi16(res[2], 15);
+ __m128i sign3 = _mm_srai_epi16(res[3], 15);
+ __m128i sign4 = _mm_srai_epi16(res[4], 15);
+ __m128i sign5 = _mm_srai_epi16(res[5], 15);
+ __m128i sign6 = _mm_srai_epi16(res[6], 15);
+ __m128i sign7 = _mm_srai_epi16(res[7], 15);
+
+ if (bit_m02 >= 0) {
+ __m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02);
+ res[0] = _mm_add_epi16(res[0], k_const_rounding);
+ res[1] = _mm_add_epi16(res[1], k_const_rounding);
+ res[2] = _mm_add_epi16(res[2], k_const_rounding);
+ res[3] = _mm_add_epi16(res[3], k_const_rounding);
+ res[4] = _mm_add_epi16(res[4], k_const_rounding);
+ res[5] = _mm_add_epi16(res[5], k_const_rounding);
+ res[6] = _mm_add_epi16(res[6], k_const_rounding);
+ res[7] = _mm_add_epi16(res[7], k_const_rounding);
+ }
+
+ res[0] = _mm_sub_epi16(res[0], sign0);
+ res[1] = _mm_sub_epi16(res[1], sign1);
+ res[2] = _mm_sub_epi16(res[2], sign2);
+ res[3] = _mm_sub_epi16(res[3], sign3);
+ res[4] = _mm_sub_epi16(res[4], sign4);
+ res[5] = _mm_sub_epi16(res[5], sign5);
+ res[6] = _mm_sub_epi16(res[6], sign6);
+ res[7] = _mm_sub_epi16(res[7], sign7);
+
+ res[0] = _mm_srai_epi16(res[0], bit);
+ res[1] = _mm_srai_epi16(res[1], bit);
+ res[2] = _mm_srai_epi16(res[2], bit);
+ res[3] = _mm_srai_epi16(res[3], bit);
+ res[4] = _mm_srai_epi16(res[4], bit);
+ res[5] = _mm_srai_epi16(res[5], bit);
+ res[6] = _mm_srai_epi16(res[6], bit);
+ res[7] = _mm_srai_epi16(res[7], bit);
+}
+
+// write 8x8 array
+static INLINE void write_buffer_8x8_avx2(int16_t *output, __m128i *res,
+                                         int stride) {
+ _mm_store_si128((__m128i *)(output + 0 * stride), res[0]);
+ _mm_store_si128((__m128i *)(output + 1 * stride), res[1]);
+ _mm_store_si128((__m128i *)(output + 2 * stride), res[2]);
+ _mm_store_si128((__m128i *)(output + 3 * stride), res[3]);
+ _mm_store_si128((__m128i *)(output + 4 * stride), res[4]);
+ _mm_store_si128((__m128i *)(output + 5 * stride), res[5]);
+ _mm_store_si128((__m128i *)(output + 6 * stride), res[6]);
+ _mm_store_si128((__m128i *)(output + 7 * stride), res[7]);
+}
+
+// perform in-place transpose
+// perform an 8x8 transpose (in and res may point to the same array)
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+ // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+ // 04 14 24 34 05 15 25 35
+ // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
+ res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
+ res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
+ res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
+ res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
+ res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
+ res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
+ res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+}
+
+void fdct8_1d_avx2(__m128i *in) {
+ // constants
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+
+ // stage 1
+ s0 = _mm_add_epi16(in[0], in[7]);
+ s1 = _mm_add_epi16(in[1], in[6]);
+ s2 = _mm_add_epi16(in[2], in[5]);
+ s3 = _mm_add_epi16(in[3], in[4]);
+ s4 = _mm_sub_epi16(in[3], in[4]);
+ s5 = _mm_sub_epi16(in[2], in[5]);
+ s6 = _mm_sub_epi16(in[1], in[6]);
+ s7 = _mm_sub_epi16(in[0], in[7]);
+
+ u0 = _mm_add_epi16(s0, s3);
+ u1 = _mm_add_epi16(s1, s2);
+ u2 = _mm_sub_epi16(s1, s2);
+ u3 = _mm_sub_epi16(s0, s3);
+ // interleave and perform butterfly multiplication/addition
+ v0 = _mm_unpacklo_epi16(u0, u1);
+ v1 = _mm_unpackhi_epi16(u0, u1);
+ v2 = _mm_unpacklo_epi16(u2, u3);
+ v3 = _mm_unpackhi_epi16(u2, u3);
+
+ u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
+ u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
+ u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
+ u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
+ u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
+ u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
+ u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
+ u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);
+
+ // shift and rounding
+ v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(u0, u1);
+ in[2] = _mm_packs_epi32(u4, u5);
+ in[4] = _mm_packs_epi32(u2, u3);
+ in[6] = _mm_packs_epi32(u6, u7);
+
+ // stage 2
+ // interleave and perform butterfly multiplication/addition
+ u0 = _mm_unpacklo_epi16(s6, s5);
+ u1 = _mm_unpackhi_epi16(s6, s5);
+ v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+ v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+ v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+ v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+
+ // shift and rounding
+ u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+ u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+ u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+ u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+
+ v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+ v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+ v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+ v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+
+ u0 = _mm_packs_epi32(v0, v1);
+ u1 = _mm_packs_epi32(v2, v3);
+
+ // stage 3
+ s0 = _mm_add_epi16(s4, u0);
+ s1 = _mm_sub_epi16(s4, u0);
+ s2 = _mm_sub_epi16(s7, u1);
+ s3 = _mm_add_epi16(s7, u1);
+
+ // stage 4
+ u0 = _mm_unpacklo_epi16(s0, s3);
+ u1 = _mm_unpackhi_epi16(s0, s3);
+ u2 = _mm_unpacklo_epi16(s1, s2);
+ u3 = _mm_unpackhi_epi16(s1, s2);
+
+ v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
+ v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
+ v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
+ v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
+ v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
+ v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
+ v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
+ v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);
+
+ // shift and rounding
+ u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+ u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+ u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+ u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+ u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+ u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+ u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+ u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+ v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+ v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+ v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+ v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+ v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+ v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+ v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+ v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+ in[1] = _mm_packs_epi32(v0, v1);
+ in[3] = _mm_packs_epi32(v4, v5);
+ in[5] = _mm_packs_epi32(v2, v3);
+ in[7] = _mm_packs_epi32(v6, v7);
+
+ // transpose
+ array_transpose_8x8_avx2(in, in);
+}
+
+void fadst8_1d_avx2(__m128i *in) {
+ // Constants
+ const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+ const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__const_0 = _mm_set1_epi16(0);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
+ __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
+ __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+  // reorder the inputs so they pair up correctly for the butterfly stages
+ in0 = in[7];
+ in1 = in[0];
+ in2 = in[5];
+ in3 = in[2];
+ in4 = in[3];
+ in5 = in[4];
+ in6 = in[1];
+ in7 = in[6];
+
+ // column transformation
+ // stage 1
+ // interleave and multiply/add into 32-bit integer
+ s0 = _mm_unpacklo_epi16(in0, in1);
+ s1 = _mm_unpackhi_epi16(in0, in1);
+ s2 = _mm_unpacklo_epi16(in2, in3);
+ s3 = _mm_unpackhi_epi16(in2, in3);
+ s4 = _mm_unpacklo_epi16(in4, in5);
+ s5 = _mm_unpackhi_epi16(in4, in5);
+ s6 = _mm_unpacklo_epi16(in6, in7);
+ s7 = _mm_unpackhi_epi16(in6, in7);
+
+ u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
+ u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
+ u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
+ u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
+ u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
+ u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
+ u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
+ u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
+ u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
+ u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
+ u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
+ u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
+ u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
+ u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
+ u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
+ u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
+
+ // addition
+ w0 = _mm_add_epi32(u0, u8);
+ w1 = _mm_add_epi32(u1, u9);
+ w2 = _mm_add_epi32(u2, u10);
+ w3 = _mm_add_epi32(u3, u11);
+ w4 = _mm_add_epi32(u4, u12);
+ w5 = _mm_add_epi32(u5, u13);
+ w6 = _mm_add_epi32(u6, u14);
+ w7 = _mm_add_epi32(u7, u15);
+ w8 = _mm_sub_epi32(u0, u8);
+ w9 = _mm_sub_epi32(u1, u9);
+ w10 = _mm_sub_epi32(u2, u10);
+ w11 = _mm_sub_epi32(u3, u11);
+ w12 = _mm_sub_epi32(u4, u12);
+ w13 = _mm_sub_epi32(u5, u13);
+ w14 = _mm_sub_epi32(u6, u14);
+ w15 = _mm_sub_epi32(u7, u15);
+
+ // shift and rounding
+ v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+ v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+ v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+ v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+ v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+ v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
+ v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
+ v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
+ v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
+ v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
+ v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
+ v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
+ v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
+ u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
+ u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
+ u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
+ u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
+ u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
+ u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
+ u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
+
+ // back to 16-bit and pack 8 integers into __m128i
+ in[0] = _mm_packs_epi32(u0, u1);
+ in[1] = _mm_packs_epi32(u2, u3);
+ in[2] = _mm_packs_epi32(u4, u5);
+ in[3] = _mm_packs_epi32(u6, u7);
+ in[4] = _mm_packs_epi32(u8, u9);
+ in[5] = _mm_packs_epi32(u10, u11);
+ in[6] = _mm_packs_epi32(u12, u13);
+ in[7] = _mm_packs_epi32(u14, u15);
+
+ // stage 2
+ s0 = _mm_add_epi16(in[0], in[2]);
+ s1 = _mm_add_epi16(in[1], in[3]);
+ s2 = _mm_sub_epi16(in[0], in[2]);
+ s3 = _mm_sub_epi16(in[1], in[3]);
+ u0 = _mm_unpacklo_epi16(in[4], in[5]);
+ u1 = _mm_unpackhi_epi16(in[4], in[5]);
+ u2 = _mm_unpacklo_epi16(in[6], in[7]);
+ u3 = _mm_unpackhi_epi16(in[6], in[7]);
+
+ v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
+ v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
+ v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
+ v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
+ v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
+ v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
+ v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
+ v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
+
+ w0 = _mm_add_epi32(v0, v4);
+ w1 = _mm_add_epi32(v1, v5);
+ w2 = _mm_add_epi32(v2, v6);
+ w3 = _mm_add_epi32(v3, v7);
+ w4 = _mm_sub_epi32(v0, v4);
+ w5 = _mm_sub_epi32(v1, v5);
+ w6 = _mm_sub_epi32(v2, v6);
+ w7 = _mm_sub_epi32(v3, v7);
+
+ v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+ v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+ v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+ v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+ v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+  // back to 16-bit integers
+ s4 = _mm_packs_epi32(u0, u1);
+ s5 = _mm_packs_epi32(u2, u3);
+ s6 = _mm_packs_epi32(u4, u5);
+ s7 = _mm_packs_epi32(u6, u7);
+
+ // stage 3
+ u0 = _mm_unpacklo_epi16(s2, s3);
+ u1 = _mm_unpackhi_epi16(s2, s3);
+ u2 = _mm_unpacklo_epi16(s6, s7);
+ u3 = _mm_unpackhi_epi16(s6, s7);
+
+ v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+ v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+ v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+ v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+ v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
+ v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
+ v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
+ v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
+
+ u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+ u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+ u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+ u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+ u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+ u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+ u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+ u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+ v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+ v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+ v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+ v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+ v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+ v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+ v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+ v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+ s2 = _mm_packs_epi32(v0, v1);
+ s3 = _mm_packs_epi32(v2, v3);
+ s6 = _mm_packs_epi32(v4, v5);
+ s7 = _mm_packs_epi32(v6, v7);
+
+ // FIXME(jingning): do subtract using bit inversion?
+ in[0] = s0;
+ in[1] = _mm_sub_epi16(k__const_0, s4);
+ in[2] = s6;
+ in[3] = _mm_sub_epi16(k__const_0, s2);
+ in[4] = s3;
+ in[5] = _mm_sub_epi16(k__const_0, s7);
+ in[6] = s5;
+ in[7] = _mm_sub_epi16(k__const_0, s1);
+
+ // transpose
+ array_transpose_8x8_avx2(in, in);
+}
+
+void vp9_short_fht8x8_avx2(const int16_t *input, int16_t *output,
+ int stride, int tx_type) {
+ __m128i in[8];
+ load_buffer_8x8_avx2(input, in, stride);
+ switch (tx_type) {
+ case 0: // DCT_DCT
+ fdct8_1d_avx2(in);
+ fdct8_1d_avx2(in);
+ break;
+ case 1: // ADST_DCT
+ fadst8_1d_avx2(in);
+ fdct8_1d_avx2(in);
+ break;
+ case 2: // DCT_ADST
+ fdct8_1d_avx2(in);
+ fadst8_1d_avx2(in);
+ break;
+ case 3: // ADST_ADST
+ fadst8_1d_avx2(in);
+ fadst8_1d_avx2(in);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+ right_shift_8x8_avx2(in, 1);
+ write_buffer_8x8_avx2(output, in, 8);
+}
+
+void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) {
+ // The 2D transform is done with two passes which are actually pretty
+ // similar. In the first one, we transform the columns and transpose
+ // the results. In the second one, we transform the rows. To achieve that,
+  // as the first pass results are transposed, we transpose the columns (that
+  // is, the transposed rows) and transpose the results (so that they go back
+  // into normal/row positions).
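+  // Rough structure (a sketch only): each pass runs the 1-D 16-point
+  // transform on eight columns at a time (column_start = 0, then 8); the
+  // transposed results of pass 0 are kept in 'intermediate' and re-read as
+  // the input of pass 1.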
+ int pass;
+ // We need an intermediate buffer between passes.
+ DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
+ const int16_t *in = input;
+ int16_t *out = intermediate;
+ // Constants
+ // When we use them, in one case, they are all the same. In all others
+ // it's a pair of them that we need to repeat four times. This is done
+ // by constructing the 32 bit constant corresponding to that pair.
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
+ const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
+ const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
+ const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
+ const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i kOne = _mm_set1_epi16(1);
+ // Do the two transform/transpose passes
+ for (pass = 0; pass < 2; ++pass) {
+ // We process eight columns (transposed rows in second pass) at a time.
+ int column_start;
+ for (column_start = 0; column_start < 16; column_start += 8) {
+ __m128i in00, in01, in02, in03, in04, in05, in06, in07;
+ __m128i in08, in09, in10, in11, in12, in13, in14, in15;
+ __m128i input0, input1, input2, input3, input4, input5, input6, input7;
+ __m128i step1_0, step1_1, step1_2, step1_3;
+ __m128i step1_4, step1_5, step1_6, step1_7;
+ __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+ __m128i step3_0, step3_1, step3_2, step3_3;
+ __m128i step3_4, step3_5, step3_6, step3_7;
+ __m128i res00, res01, res02, res03, res04, res05, res06, res07;
+ __m128i res08, res09, res10, res11, res12, res13, res14, res15;
+ // Load and pre-condition input.
+ if (0 == pass) {
+ in00 = _mm_load_si128((const __m128i *)(in + 0 * stride));
+ in01 = _mm_load_si128((const __m128i *)(in + 1 * stride));
+ in02 = _mm_load_si128((const __m128i *)(in + 2 * stride));
+ in03 = _mm_load_si128((const __m128i *)(in + 3 * stride));
+ in04 = _mm_load_si128((const __m128i *)(in + 4 * stride));
+ in05 = _mm_load_si128((const __m128i *)(in + 5 * stride));
+ in06 = _mm_load_si128((const __m128i *)(in + 6 * stride));
+ in07 = _mm_load_si128((const __m128i *)(in + 7 * stride));
+ in08 = _mm_load_si128((const __m128i *)(in + 8 * stride));
+ in09 = _mm_load_si128((const __m128i *)(in + 9 * stride));
+ in10 = _mm_load_si128((const __m128i *)(in + 10 * stride));
+ in11 = _mm_load_si128((const __m128i *)(in + 11 * stride));
+ in12 = _mm_load_si128((const __m128i *)(in + 12 * stride));
+ in13 = _mm_load_si128((const __m128i *)(in + 13 * stride));
+ in14 = _mm_load_si128((const __m128i *)(in + 14 * stride));
+ in15 = _mm_load_si128((const __m128i *)(in + 15 * stride));
+ // x = x << 2
+ in00 = _mm_slli_epi16(in00, 2);
+ in01 = _mm_slli_epi16(in01, 2);
+ in02 = _mm_slli_epi16(in02, 2);
+ in03 = _mm_slli_epi16(in03, 2);
+ in04 = _mm_slli_epi16(in04, 2);
+ in05 = _mm_slli_epi16(in05, 2);
+ in06 = _mm_slli_epi16(in06, 2);
+ in07 = _mm_slli_epi16(in07, 2);
+ in08 = _mm_slli_epi16(in08, 2);
+ in09 = _mm_slli_epi16(in09, 2);
+ in10 = _mm_slli_epi16(in10, 2);
+ in11 = _mm_slli_epi16(in11, 2);
+ in12 = _mm_slli_epi16(in12, 2);
+ in13 = _mm_slli_epi16(in13, 2);
+ in14 = _mm_slli_epi16(in14, 2);
+ in15 = _mm_slli_epi16(in15, 2);
+ } else {
+ in00 = _mm_load_si128((const __m128i *)(in + 0 * 16));
+ in01 = _mm_load_si128((const __m128i *)(in + 1 * 16));
+ in02 = _mm_load_si128((const __m128i *)(in + 2 * 16));
+ in03 = _mm_load_si128((const __m128i *)(in + 3 * 16));
+ in04 = _mm_load_si128((const __m128i *)(in + 4 * 16));
+ in05 = _mm_load_si128((const __m128i *)(in + 5 * 16));
+ in06 = _mm_load_si128((const __m128i *)(in + 6 * 16));
+ in07 = _mm_load_si128((const __m128i *)(in + 7 * 16));
+ in08 = _mm_load_si128((const __m128i *)(in + 8 * 16));
+ in09 = _mm_load_si128((const __m128i *)(in + 9 * 16));
+ in10 = _mm_load_si128((const __m128i *)(in + 10 * 16));
+ in11 = _mm_load_si128((const __m128i *)(in + 11 * 16));
+ in12 = _mm_load_si128((const __m128i *)(in + 12 * 16));
+ in13 = _mm_load_si128((const __m128i *)(in + 13 * 16));
+ in14 = _mm_load_si128((const __m128i *)(in + 14 * 16));
+ in15 = _mm_load_si128((const __m128i *)(in + 15 * 16));
+ // x = (x + 1) >> 2
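+        // Together with the "x = x << 2" pre-scaling in the first pass this
+        // is a biased divide by 4 between passes: add 1, then
+        // arithmetic-shift right by 2.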
+ in00 = _mm_add_epi16(in00, kOne);
+ in01 = _mm_add_epi16(in01, kOne);
+ in02 = _mm_add_epi16(in02, kOne);
+ in03 = _mm_add_epi16(in03, kOne);
+ in04 = _mm_add_epi16(in04, kOne);
+ in05 = _mm_add_epi16(in05, kOne);
+ in06 = _mm_add_epi16(in06, kOne);
+ in07 = _mm_add_epi16(in07, kOne);
+ in08 = _mm_add_epi16(in08, kOne);
+ in09 = _mm_add_epi16(in09, kOne);
+ in10 = _mm_add_epi16(in10, kOne);
+ in11 = _mm_add_epi16(in11, kOne);
+ in12 = _mm_add_epi16(in12, kOne);
+ in13 = _mm_add_epi16(in13, kOne);
+ in14 = _mm_add_epi16(in14, kOne);
+ in15 = _mm_add_epi16(in15, kOne);
+ in00 = _mm_srai_epi16(in00, 2);
+ in01 = _mm_srai_epi16(in01, 2);
+ in02 = _mm_srai_epi16(in02, 2);
+ in03 = _mm_srai_epi16(in03, 2);
+ in04 = _mm_srai_epi16(in04, 2);
+ in05 = _mm_srai_epi16(in05, 2);
+ in06 = _mm_srai_epi16(in06, 2);
+ in07 = _mm_srai_epi16(in07, 2);
+ in08 = _mm_srai_epi16(in08, 2);
+ in09 = _mm_srai_epi16(in09, 2);
+ in10 = _mm_srai_epi16(in10, 2);
+ in11 = _mm_srai_epi16(in11, 2);
+ in12 = _mm_srai_epi16(in12, 2);
+ in13 = _mm_srai_epi16(in13, 2);
+ in14 = _mm_srai_epi16(in14, 2);
+ in15 = _mm_srai_epi16(in15, 2);
+ }
+ in += 8;
+ // Calculate input for the first 8 results.
+ {
+ input0 = _mm_add_epi16(in00, in15);
+ input1 = _mm_add_epi16(in01, in14);
+ input2 = _mm_add_epi16(in02, in13);
+ input3 = _mm_add_epi16(in03, in12);
+ input4 = _mm_add_epi16(in04, in11);
+ input5 = _mm_add_epi16(in05, in10);
+ input6 = _mm_add_epi16(in06, in09);
+ input7 = _mm_add_epi16(in07, in08);
+ }
+ // Calculate input for the next 8 results.
+ {
+ step1_0 = _mm_sub_epi16(in07, in08);
+ step1_1 = _mm_sub_epi16(in06, in09);
+ step1_2 = _mm_sub_epi16(in05, in10);
+ step1_3 = _mm_sub_epi16(in04, in11);
+ step1_4 = _mm_sub_epi16(in03, in12);
+ step1_5 = _mm_sub_epi16(in02, in13);
+ step1_6 = _mm_sub_epi16(in01, in14);
+ step1_7 = _mm_sub_epi16(in00, in15);
+ }
+ // Work on the first eight values; fdct8_1d(input, even_results);
+ {
+        // Add/subtract
+ const __m128i q0 = _mm_add_epi16(input0, input7);
+ const __m128i q1 = _mm_add_epi16(input1, input6);
+ const __m128i q2 = _mm_add_epi16(input2, input5);
+ const __m128i q3 = _mm_add_epi16(input3, input4);
+ const __m128i q4 = _mm_sub_epi16(input3, input4);
+ const __m128i q5 = _mm_sub_epi16(input2, input5);
+ const __m128i q6 = _mm_sub_epi16(input1, input6);
+ const __m128i q7 = _mm_sub_epi16(input0, input7);
+ // Work on first four results
+ {
+          // Add/subtract
+ const __m128i r0 = _mm_add_epi16(q0, q3);
+ const __m128i r1 = _mm_add_epi16(q1, q2);
+ const __m128i r2 = _mm_sub_epi16(q1, q2);
+ const __m128i r3 = _mm_sub_epi16(q0, q3);
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
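+          // Per 32-bit lane, after the dct_const_round_shift step below,
+          // this computes (scalar sketch of the intrinsics that follow):
+          //   out = (r0 * c0 + r1 * c1 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS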
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+ const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res00 = _mm_packs_epi32(w0, w1);
+ res08 = _mm_packs_epi32(w2, w3);
+ res04 = _mm_packs_epi32(w4, w5);
+ res12 = _mm_packs_epi32(w6, w7);
+ }
+ // Work on next four results
+ {
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+ const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+ const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
+ const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
+ const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
+ const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
+ const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
+ const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
+ const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
+ const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
+ const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
+ const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
+ const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
+ // Combine
+ const __m128i r0 = _mm_packs_epi32(s0, s1);
+ const __m128i r1 = _mm_packs_epi32(s2, s3);
+          // Add/subtract
+ const __m128i x0 = _mm_add_epi16(q4, r0);
+ const __m128i x1 = _mm_sub_epi16(q4, r0);
+ const __m128i x2 = _mm_sub_epi16(q7, r1);
+ const __m128i x3 = _mm_add_epi16(q7, r1);
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+ const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+ const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+ const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res02 = _mm_packs_epi32(w0, w1);
+ res14 = _mm_packs_epi32(w2, w3);
+ res10 = _mm_packs_epi32(w4, w5);
+ res06 = _mm_packs_epi32(w6, w7);
+ }
+ }
+ // Work on the next eight values; step1 -> odd_results
+ {
+ // step 2
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // Combine
+ step2_2 = _mm_packs_epi32(w0, w1);
+ step2_3 = _mm_packs_epi32(w2, w3);
+ }
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // Combine
+ step2_5 = _mm_packs_epi32(w0, w1);
+ step2_4 = _mm_packs_epi32(w2, w3);
+ }
+ // step 3
+ {
+ step3_0 = _mm_add_epi16(step1_0, step2_3);
+ step3_1 = _mm_add_epi16(step1_1, step2_2);
+ step3_2 = _mm_sub_epi16(step1_1, step2_2);
+ step3_3 = _mm_sub_epi16(step1_0, step2_3);
+ step3_4 = _mm_sub_epi16(step1_7, step2_4);
+ step3_5 = _mm_sub_epi16(step1_6, step2_5);
+ step3_6 = _mm_add_epi16(step1_6, step2_5);
+ step3_7 = _mm_add_epi16(step1_7, step2_4);
+ }
+ // step 4
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
+ const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
+ const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
+ const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // Combine
+ step2_1 = _mm_packs_epi32(w0, w1);
+ step2_2 = _mm_packs_epi32(w2, w3);
+ }
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
+ const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
+ const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
+ const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // Combine
+ step2_6 = _mm_packs_epi32(w0, w1);
+ step2_5 = _mm_packs_epi32(w2, w3);
+ }
+ // step 5
+ {
+ step1_0 = _mm_add_epi16(step3_0, step2_1);
+ step1_1 = _mm_sub_epi16(step3_0, step2_1);
+ step1_2 = _mm_sub_epi16(step3_3, step2_2);
+ step1_3 = _mm_add_epi16(step3_3, step2_2);
+ step1_4 = _mm_add_epi16(step3_4, step2_5);
+ step1_5 = _mm_sub_epi16(step3_4, step2_5);
+ step1_6 = _mm_sub_epi16(step3_7, step2_6);
+ step1_7 = _mm_add_epi16(step3_7, step2_6);
+ }
+ // step 6
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // Combine
+ res01 = _mm_packs_epi32(w0, w1);
+ res09 = _mm_packs_epi32(w2, w3);
+ }
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // Combine
+ res05 = _mm_packs_epi32(w0, w1);
+ res13 = _mm_packs_epi32(w2, w3);
+ }
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // Combine
+ res11 = _mm_packs_epi32(w0, w1);
+ res03 = _mm_packs_epi32(w2, w3);
+ }
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // Combine
+ res15 = _mm_packs_epi32(w0, w1);
+ res07 = _mm_packs_epi32(w2, w3);
+ }
+ }
+ // Transpose the results, do it as two 8x8 transposes.
+ {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+        // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+        // 04 14 24 34 05 15 25 35
+        // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ _mm_storeu_si128((__m128i *)(out + 0 * 16), tr2_0);
+ _mm_storeu_si128((__m128i *)(out + 1 * 16), tr2_1);
+ _mm_storeu_si128((__m128i *)(out + 2 * 16), tr2_2);
+ _mm_storeu_si128((__m128i *)(out + 3 * 16), tr2_3);
+ _mm_storeu_si128((__m128i *)(out + 4 * 16), tr2_4);
+ _mm_storeu_si128((__m128i *)(out + 5 * 16), tr2_5);
+ _mm_storeu_si128((__m128i *)(out + 6 * 16), tr2_6);
+ _mm_storeu_si128((__m128i *)(out + 7 * 16), tr2_7);
+ }
+ {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+        // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+        // 04 14 24 34 05 15 25 35
+        // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ // Store results
+ _mm_store_si128((__m128i *)(out + 8 + 0 * 16), tr2_0);
+ _mm_store_si128((__m128i *)(out + 8 + 1 * 16), tr2_1);
+ _mm_store_si128((__m128i *)(out + 8 + 2 * 16), tr2_2);
+ _mm_store_si128((__m128i *)(out + 8 + 3 * 16), tr2_3);
+ _mm_store_si128((__m128i *)(out + 8 + 4 * 16), tr2_4);
+ _mm_store_si128((__m128i *)(out + 8 + 5 * 16), tr2_5);
+ _mm_store_si128((__m128i *)(out + 8 + 6 * 16), tr2_6);
+ _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7);
+ }
+ out += 8*16;
+ }
+ // Setup in/out for next pass.
+ in = intermediate;
+ out = output;
+ }
+}
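+
+// Minimal usage sketch (hypothetical buffer names, for illustration only):
+//   DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, 256);
+//   vp9_fdct16x16_avx2(src_diff, coeff, diff_stride);
+// The aligned loads above assume the input rows are 16-byte aligned.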
+
+static INLINE void load_buffer_16x16_avx2(const int16_t* input, __m128i *in0,
+ __m128i *in1, int stride) {
+ // load first 8 columns
+ load_buffer_8x8_avx2(input, in0, stride);
+ load_buffer_8x8_avx2(input + 8 * stride, in0 + 8, stride);
+
+ input += 8;
+ // load second 8 columns
+ load_buffer_8x8_avx2(input, in1, stride);
+ load_buffer_8x8_avx2(input + 8 * stride, in1 + 8, stride);
+}
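+
+// After this call, in0[0..15] holds the left eight columns (16 rows of 8
+// coefficients) and in1[0..15] the right eight columns, assuming
+// load_buffer_8x8_avx2 loads eight rows of eight coefficients as in its
+// 8x8 use above.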
+
+static INLINE void write_buffer_16x16_avx2(int16_t *output, __m128i *in0,
+ __m128i *in1, int stride) {
+ // write first 8 columns
+ write_buffer_8x8_avx2(output, in0, stride);
+ write_buffer_8x8_avx2(output + 8 * stride, in0 + 8, stride);
+ // write second 8 columns
+ output += 8;
+ write_buffer_8x8_avx2(output, in1, stride);
+ write_buffer_8x8_avx2(output + 8 * stride, in1 + 8, stride);
+}
+
+static INLINE void array_transpose_16x16_avx2(__m128i *res0, __m128i *res1) {
+ __m128i tbuf[8];
+ array_transpose_8x8_avx2(res0, res0);
+ array_transpose_8x8_avx2(res1, tbuf);
+ array_transpose_8x8_avx2(res0 + 8, res1);
+ array_transpose_8x8_avx2(res1 + 8, res1 + 8);
+
+ res0[8] = tbuf[0];
+ res0[9] = tbuf[1];
+ res0[10] = tbuf[2];
+ res0[11] = tbuf[3];
+ res0[12] = tbuf[4];
+ res0[13] = tbuf[5];
+ res0[14] = tbuf[6];
+ res0[15] = tbuf[7];
+}
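+
+// The transpose above treats the 16x16 block as four 8x8 quadrants: the two
+// diagonal quadrants are transposed in place, the two off-diagonal quadrants
+// are transposed and swapped (res0[8..15] <-> res1[0..7]), and tbuf is
+// scratch so that res0[8..15] is still intact when it is read.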
+
+static INLINE void right_shift_16x16_avx2(__m128i *res0, __m128i *res1) {
+ // perform rounding operations
+ right_shift_8x8_avx2(res0, 2);
+ right_shift_8x8_avx2(res0 + 8, 2);
+ right_shift_8x8_avx2(res1, 2);
+ right_shift_8x8_avx2(res1 + 8, 2);
+}
+
+void fdct16_1d_8col_avx2(__m128i *in) {
+ // perform 16x16 1-D DCT for 8 columns
+ __m128i i[8], s[8], p[8], t[8], u[16], v[16];
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
+ const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
+ const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
+ const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
+ const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ // stage 1
+ i[0] = _mm_add_epi16(in[0], in[15]);
+ i[1] = _mm_add_epi16(in[1], in[14]);
+ i[2] = _mm_add_epi16(in[2], in[13]);
+ i[3] = _mm_add_epi16(in[3], in[12]);
+ i[4] = _mm_add_epi16(in[4], in[11]);
+ i[5] = _mm_add_epi16(in[5], in[10]);
+ i[6] = _mm_add_epi16(in[6], in[9]);
+ i[7] = _mm_add_epi16(in[7], in[8]);
+
+ s[0] = _mm_sub_epi16(in[7], in[8]);
+ s[1] = _mm_sub_epi16(in[6], in[9]);
+ s[2] = _mm_sub_epi16(in[5], in[10]);
+ s[3] = _mm_sub_epi16(in[4], in[11]);
+ s[4] = _mm_sub_epi16(in[3], in[12]);
+ s[5] = _mm_sub_epi16(in[2], in[13]);
+ s[6] = _mm_sub_epi16(in[1], in[14]);
+ s[7] = _mm_sub_epi16(in[0], in[15]);
+
+ p[0] = _mm_add_epi16(i[0], i[7]);
+ p[1] = _mm_add_epi16(i[1], i[6]);
+ p[2] = _mm_add_epi16(i[2], i[5]);
+ p[3] = _mm_add_epi16(i[3], i[4]);
+ p[4] = _mm_sub_epi16(i[3], i[4]);
+ p[5] = _mm_sub_epi16(i[2], i[5]);
+ p[6] = _mm_sub_epi16(i[1], i[6]);
+ p[7] = _mm_sub_epi16(i[0], i[7]);
+
+ u[0] = _mm_add_epi16(p[0], p[3]);
+ u[1] = _mm_add_epi16(p[1], p[2]);
+ u[2] = _mm_sub_epi16(p[1], p[2]);
+ u[3] = _mm_sub_epi16(p[0], p[3]);
+
+ v[0] = _mm_unpacklo_epi16(u[0], u[1]);
+ v[1] = _mm_unpackhi_epi16(u[0], u[1]);
+ v[2] = _mm_unpacklo_epi16(u[2], u[3]);
+ v[3] = _mm_unpackhi_epi16(u[2], u[3]);
+
+ u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
+ u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
+ u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
+ u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
+ u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
+ u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
+ u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
+ u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(u[0], u[1]);
+ in[4] = _mm_packs_epi32(u[4], u[5]);
+ in[8] = _mm_packs_epi32(u[2], u[3]);
+ in[12] = _mm_packs_epi32(u[6], u[7]);
+
+ u[0] = _mm_unpacklo_epi16(p[5], p[6]);
+ u[1] = _mm_unpackhi_epi16(p[5], p[6]);
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+
+ u[0] = _mm_packs_epi32(v[0], v[1]);
+ u[1] = _mm_packs_epi32(v[2], v[3]);
+
+ t[0] = _mm_add_epi16(p[4], u[0]);
+ t[1] = _mm_sub_epi16(p[4], u[0]);
+ t[2] = _mm_sub_epi16(p[7], u[1]);
+ t[3] = _mm_add_epi16(p[7], u[1]);
+
+ u[0] = _mm_unpacklo_epi16(t[0], t[3]);
+ u[1] = _mm_unpackhi_epi16(t[0], t[3]);
+ u[2] = _mm_unpacklo_epi16(t[1], t[2]);
+ u[3] = _mm_unpackhi_epi16(t[1], t[2]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
+ v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
+ v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
+ v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
+ v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+ in[2] = _mm_packs_epi32(v[0], v[1]);
+ in[6] = _mm_packs_epi32(v[4], v[5]);
+ in[10] = _mm_packs_epi32(v[2], v[3]);
+ in[14] = _mm_packs_epi32(v[6], v[7]);
+
+ // stage 2
+ u[0] = _mm_unpacklo_epi16(s[2], s[5]);
+ u[1] = _mm_unpackhi_epi16(s[2], s[5]);
+ u[2] = _mm_unpacklo_epi16(s[3], s[4]);
+ u[3] = _mm_unpackhi_epi16(s[3], s[4]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+ v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+ v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+ v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+ v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+ t[2] = _mm_packs_epi32(v[0], v[1]);
+ t[3] = _mm_packs_epi32(v[2], v[3]);
+ t[4] = _mm_packs_epi32(v[4], v[5]);
+ t[5] = _mm_packs_epi32(v[6], v[7]);
+
+ // stage 3
+ p[0] = _mm_add_epi16(s[0], t[3]);
+ p[1] = _mm_add_epi16(s[1], t[2]);
+ p[2] = _mm_sub_epi16(s[1], t[2]);
+ p[3] = _mm_sub_epi16(s[0], t[3]);
+ p[4] = _mm_sub_epi16(s[7], t[4]);
+ p[5] = _mm_sub_epi16(s[6], t[5]);
+ p[6] = _mm_add_epi16(s[6], t[5]);
+ p[7] = _mm_add_epi16(s[7], t[4]);
+
+ // stage 4
+ u[0] = _mm_unpacklo_epi16(p[1], p[6]);
+ u[1] = _mm_unpackhi_epi16(p[1], p[6]);
+ u[2] = _mm_unpacklo_epi16(p[2], p[5]);
+ u[3] = _mm_unpackhi_epi16(p[2], p[5]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
+ v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08);
+ v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24);
+ v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
+ v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+ t[1] = _mm_packs_epi32(v[0], v[1]);
+ t[2] = _mm_packs_epi32(v[2], v[3]);
+ t[5] = _mm_packs_epi32(v[4], v[5]);
+ t[6] = _mm_packs_epi32(v[6], v[7]);
+
+ // stage 5
+ s[0] = _mm_add_epi16(p[0], t[1]);
+ s[1] = _mm_sub_epi16(p[0], t[1]);
+ s[2] = _mm_sub_epi16(p[3], t[2]);
+ s[3] = _mm_add_epi16(p[3], t[2]);
+ s[4] = _mm_add_epi16(p[4], t[5]);
+ s[5] = _mm_sub_epi16(p[4], t[5]);
+ s[6] = _mm_sub_epi16(p[7], t[6]);
+ s[7] = _mm_add_epi16(p[7], t[6]);
+
+ // stage 6
+ u[0] = _mm_unpacklo_epi16(s[0], s[7]);
+ u[1] = _mm_unpackhi_epi16(s[0], s[7]);
+ u[2] = _mm_unpacklo_epi16(s[1], s[6]);
+ u[3] = _mm_unpackhi_epi16(s[1], s[6]);
+ u[4] = _mm_unpacklo_epi16(s[2], s[5]);
+ u[5] = _mm_unpackhi_epi16(s[2], s[5]);
+ u[6] = _mm_unpacklo_epi16(s[3], s[4]);
+ u[7] = _mm_unpackhi_epi16(s[3], s[4]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
+ v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
+ v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
+ v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
+ v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
+ v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
+ v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
+ v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
+ v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
+ v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
+ v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
+ v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
+ v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+ in[1] = _mm_packs_epi32(v[0], v[1]);
+ in[9] = _mm_packs_epi32(v[2], v[3]);
+ in[5] = _mm_packs_epi32(v[4], v[5]);
+ in[13] = _mm_packs_epi32(v[6], v[7]);
+ in[3] = _mm_packs_epi32(v[8], v[9]);
+ in[11] = _mm_packs_epi32(v[10], v[11]);
+ in[7] = _mm_packs_epi32(v[12], v[13]);
+ in[15] = _mm_packs_epi32(v[14], v[15]);
+}
+
+void fadst16_1d_8col_avx2(__m128i *in) {
+ // perform 16x16 1-D ADST for 8 columns
+ __m128i s[16], x[16], u[32], v[32];
+ const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
+ const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
+ const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
+ const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
+ const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
+ const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+ const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
+ const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
+ const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i kZero = _mm_set1_epi16(0);
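+  // The rotation steps below all follow the same SIMD pattern: interleave a
+  // pair of 16-bit rows, multiply-accumulate against a packed constant pair
+  // with _mm_madd_epi16, combine the butterfly partners, round with
+  // k__DCT_CONST_ROUNDING and shift by DCT_CONST_BITS, then repack to 16 bits.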
+
+ u[0] = _mm_unpacklo_epi16(in[15], in[0]);
+ u[1] = _mm_unpackhi_epi16(in[15], in[0]);
+ u[2] = _mm_unpacklo_epi16(in[13], in[2]);
+ u[3] = _mm_unpackhi_epi16(in[13], in[2]);
+ u[4] = _mm_unpacklo_epi16(in[11], in[4]);
+ u[5] = _mm_unpackhi_epi16(in[11], in[4]);
+ u[6] = _mm_unpacklo_epi16(in[9], in[6]);
+ u[7] = _mm_unpackhi_epi16(in[9], in[6]);
+ u[8] = _mm_unpacklo_epi16(in[7], in[8]);
+ u[9] = _mm_unpackhi_epi16(in[7], in[8]);
+ u[10] = _mm_unpacklo_epi16(in[5], in[10]);
+ u[11] = _mm_unpackhi_epi16(in[5], in[10]);
+ u[12] = _mm_unpacklo_epi16(in[3], in[12]);
+ u[13] = _mm_unpackhi_epi16(in[3], in[12]);
+ u[14] = _mm_unpacklo_epi16(in[1], in[14]);
+ u[15] = _mm_unpackhi_epi16(in[1], in[14]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
+ v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
+ v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
+ v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
+ v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
+ v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
+ v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
+ v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
+ v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
+ v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
+ v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
+ v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
+ v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
+ v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
+ v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
+ v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
+ v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
+
+ u[0] = _mm_add_epi32(v[0], v[16]);
+ u[1] = _mm_add_epi32(v[1], v[17]);
+ u[2] = _mm_add_epi32(v[2], v[18]);
+ u[3] = _mm_add_epi32(v[3], v[19]);
+ u[4] = _mm_add_epi32(v[4], v[20]);
+ u[5] = _mm_add_epi32(v[5], v[21]);
+ u[6] = _mm_add_epi32(v[6], v[22]);
+ u[7] = _mm_add_epi32(v[7], v[23]);
+ u[8] = _mm_add_epi32(v[8], v[24]);
+ u[9] = _mm_add_epi32(v[9], v[25]);
+ u[10] = _mm_add_epi32(v[10], v[26]);
+ u[11] = _mm_add_epi32(v[11], v[27]);
+ u[12] = _mm_add_epi32(v[12], v[28]);
+ u[13] = _mm_add_epi32(v[13], v[29]);
+ u[14] = _mm_add_epi32(v[14], v[30]);
+ u[15] = _mm_add_epi32(v[15], v[31]);
+ u[16] = _mm_sub_epi32(v[0], v[16]);
+ u[17] = _mm_sub_epi32(v[1], v[17]);
+ u[18] = _mm_sub_epi32(v[2], v[18]);
+ u[19] = _mm_sub_epi32(v[3], v[19]);
+ u[20] = _mm_sub_epi32(v[4], v[20]);
+ u[21] = _mm_sub_epi32(v[5], v[21]);
+ u[22] = _mm_sub_epi32(v[6], v[22]);
+ u[23] = _mm_sub_epi32(v[7], v[23]);
+ u[24] = _mm_sub_epi32(v[8], v[24]);
+ u[25] = _mm_sub_epi32(v[9], v[25]);
+ u[26] = _mm_sub_epi32(v[10], v[26]);
+ u[27] = _mm_sub_epi32(v[11], v[27]);
+ u[28] = _mm_sub_epi32(v[12], v[28]);
+ u[29] = _mm_sub_epi32(v[13], v[29]);
+ u[30] = _mm_sub_epi32(v[14], v[30]);
+ u[31] = _mm_sub_epi32(v[15], v[31]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+ v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
+ v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
+ v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
+ v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
+ v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
+ v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
+ v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
+ v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
+ v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
+ v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
+ v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
+ v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
+ v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
+ v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
+ v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
+ v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+ u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
+ u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
+ u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
+ u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
+ u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
+ u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
+ u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
+ u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
+ u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
+ u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
+ u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
+ u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
+ u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
+ u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
+ u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
+ u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
+
+ s[0] = _mm_packs_epi32(u[0], u[1]);
+ s[1] = _mm_packs_epi32(u[2], u[3]);
+ s[2] = _mm_packs_epi32(u[4], u[5]);
+ s[3] = _mm_packs_epi32(u[6], u[7]);
+ s[4] = _mm_packs_epi32(u[8], u[9]);
+ s[5] = _mm_packs_epi32(u[10], u[11]);
+ s[6] = _mm_packs_epi32(u[12], u[13]);
+ s[7] = _mm_packs_epi32(u[14], u[15]);
+ s[8] = _mm_packs_epi32(u[16], u[17]);
+ s[9] = _mm_packs_epi32(u[18], u[19]);
+ s[10] = _mm_packs_epi32(u[20], u[21]);
+ s[11] = _mm_packs_epi32(u[22], u[23]);
+ s[12] = _mm_packs_epi32(u[24], u[25]);
+ s[13] = _mm_packs_epi32(u[26], u[27]);
+ s[14] = _mm_packs_epi32(u[28], u[29]);
+ s[15] = _mm_packs_epi32(u[30], u[31]);
+
+ // stage 2
+ u[0] = _mm_unpacklo_epi16(s[8], s[9]);
+ u[1] = _mm_unpackhi_epi16(s[8], s[9]);
+ u[2] = _mm_unpacklo_epi16(s[10], s[11]);
+ u[3] = _mm_unpackhi_epi16(s[10], s[11]);
+ u[4] = _mm_unpacklo_epi16(s[12], s[13]);
+ u[5] = _mm_unpackhi_epi16(s[12], s[13]);
+ u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+ u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
+
+ u[0] = _mm_add_epi32(v[0], v[8]);
+ u[1] = _mm_add_epi32(v[1], v[9]);
+ u[2] = _mm_add_epi32(v[2], v[10]);
+ u[3] = _mm_add_epi32(v[3], v[11]);
+ u[4] = _mm_add_epi32(v[4], v[12]);
+ u[5] = _mm_add_epi32(v[5], v[13]);
+ u[6] = _mm_add_epi32(v[6], v[14]);
+ u[7] = _mm_add_epi32(v[7], v[15]);
+ u[8] = _mm_sub_epi32(v[0], v[8]);
+ u[9] = _mm_sub_epi32(v[1], v[9]);
+ u[10] = _mm_sub_epi32(v[2], v[10]);
+ u[11] = _mm_sub_epi32(v[3], v[11]);
+ u[12] = _mm_sub_epi32(v[4], v[12]);
+ u[13] = _mm_sub_epi32(v[5], v[13]);
+ u[14] = _mm_sub_epi32(v[6], v[14]);
+ u[15] = _mm_sub_epi32(v[7], v[15]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+ x[0] = _mm_add_epi16(s[0], s[4]);
+ x[1] = _mm_add_epi16(s[1], s[5]);
+ x[2] = _mm_add_epi16(s[2], s[6]);
+ x[3] = _mm_add_epi16(s[3], s[7]);
+ x[4] = _mm_sub_epi16(s[0], s[4]);
+ x[5] = _mm_sub_epi16(s[1], s[5]);
+ x[6] = _mm_sub_epi16(s[2], s[6]);
+ x[7] = _mm_sub_epi16(s[3], s[7]);
+ x[8] = _mm_packs_epi32(u[0], u[1]);
+ x[9] = _mm_packs_epi32(u[2], u[3]);
+ x[10] = _mm_packs_epi32(u[4], u[5]);
+ x[11] = _mm_packs_epi32(u[6], u[7]);
+ x[12] = _mm_packs_epi32(u[8], u[9]);
+ x[13] = _mm_packs_epi32(u[10], u[11]);
+ x[14] = _mm_packs_epi32(u[12], u[13]);
+ x[15] = _mm_packs_epi32(u[14], u[15]);
+
+ // stage 3
+ u[0] = _mm_unpacklo_epi16(x[4], x[5]);
+ u[1] = _mm_unpackhi_epi16(x[4], x[5]);
+ u[2] = _mm_unpacklo_epi16(x[6], x[7]);
+ u[3] = _mm_unpackhi_epi16(x[6], x[7]);
+ u[4] = _mm_unpacklo_epi16(x[12], x[13]);
+ u[5] = _mm_unpackhi_epi16(x[12], x[13]);
+ u[6] = _mm_unpacklo_epi16(x[14], x[15]);
+ u[7] = _mm_unpackhi_epi16(x[14], x[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
+
+ u[0] = _mm_add_epi32(v[0], v[4]);
+ u[1] = _mm_add_epi32(v[1], v[5]);
+ u[2] = _mm_add_epi32(v[2], v[6]);
+ u[3] = _mm_add_epi32(v[3], v[7]);
+ u[4] = _mm_sub_epi32(v[0], v[4]);
+ u[5] = _mm_sub_epi32(v[1], v[5]);
+ u[6] = _mm_sub_epi32(v[2], v[6]);
+ u[7] = _mm_sub_epi32(v[3], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[12]);
+ u[9] = _mm_add_epi32(v[9], v[13]);
+ u[10] = _mm_add_epi32(v[10], v[14]);
+ u[11] = _mm_add_epi32(v[11], v[15]);
+ u[12] = _mm_sub_epi32(v[8], v[12]);
+ u[13] = _mm_sub_epi32(v[9], v[13]);
+ u[14] = _mm_sub_epi32(v[10], v[14]);
+ u[15] = _mm_sub_epi32(v[11], v[15]);
+
+ u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+ s[0] = _mm_add_epi16(x[0], x[2]);
+ s[1] = _mm_add_epi16(x[1], x[3]);
+ s[2] = _mm_sub_epi16(x[0], x[2]);
+ s[3] = _mm_sub_epi16(x[1], x[3]);
+ s[4] = _mm_packs_epi32(v[0], v[1]);
+ s[5] = _mm_packs_epi32(v[2], v[3]);
+ s[6] = _mm_packs_epi32(v[4], v[5]);
+ s[7] = _mm_packs_epi32(v[6], v[7]);
+ s[8] = _mm_add_epi16(x[8], x[10]);
+ s[9] = _mm_add_epi16(x[9], x[11]);
+ s[10] = _mm_sub_epi16(x[8], x[10]);
+ s[11] = _mm_sub_epi16(x[9], x[11]);
+ s[12] = _mm_packs_epi32(v[8], v[9]);
+ s[13] = _mm_packs_epi32(v[10], v[11]);
+ s[14] = _mm_packs_epi32(v[12], v[13]);
+ s[15] = _mm_packs_epi32(v[14], v[15]);
+
+ // stage 4
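+ // Final +/-cospi_16 rotations, needed only for s[2..3], s[6..7], s[10..11]
+ // and s[14..15]; the remaining s values map directly to the output below.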
+ u[0] = _mm_unpacklo_epi16(s[2], s[3]);
+ u[1] = _mm_unpackhi_epi16(s[2], s[3]);
+ u[2] = _mm_unpacklo_epi16(s[6], s[7]);
+ u[3] = _mm_unpackhi_epi16(s[6], s[7]);
+ u[4] = _mm_unpacklo_epi16(s[10], s[11]);
+ u[5] = _mm_unpackhi_epi16(s[10], s[11]);
+ u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+ u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
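+ // Reorder into the fadst16 output order; in[1], in[3], in[13] and in[15]
+ // are negated (kZero minus the value).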
+ in[0] = s[0];
+ in[1] = _mm_sub_epi16(kZero, s[8]);
+ in[2] = s[12];
+ in[3] = _mm_sub_epi16(kZero, s[4]);
+ in[4] = _mm_packs_epi32(v[4], v[5]);
+ in[5] = _mm_packs_epi32(v[12], v[13]);
+ in[6] = _mm_packs_epi32(v[8], v[9]);
+ in[7] = _mm_packs_epi32(v[0], v[1]);
+ in[8] = _mm_packs_epi32(v[2], v[3]);
+ in[9] = _mm_packs_epi32(v[10], v[11]);
+ in[10] = _mm_packs_epi32(v[14], v[15]);
+ in[11] = _mm_packs_epi32(v[6], v[7]);
+ in[12] = s[5];
+ in[13] = _mm_sub_epi16(kZero, s[13]);
+ in[14] = s[9];
+ in[15] = _mm_sub_epi16(kZero, s[1]);
+}
+
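+// The 1-D wrappers below transform the two 8-column halves independently,
+// then transpose the 16x16 block so the next call works on the other
+// dimension.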
+void fdct16_1d_avx2(__m128i *in0, __m128i *in1) {
+ fdct16_1d_8col_avx2(in0);
+ fdct16_1d_8col_avx2(in1);
+ array_transpose_16x16_avx2(in0, in1);
+}
+
+void fadst16_1d_avx2(__m128i *in0, __m128i *in1) {
+ fadst16_1d_8col_avx2(in0);
+ fadst16_1d_8col_avx2(in1);
+ array_transpose_16x16_avx2(in0, in1);
+}
+
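+// 2-D forward hybrid transform: first pass, an intermediate scaling shift,
+// then the second pass; the DCT/ADST combination is selected by tx_type.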
+void vp9_short_fht16x16_avx2(const int16_t *input, int16_t *output,
+ int stride, int tx_type) {
+ __m128i in0[16], in1[16];
+ load_buffer_16x16_avx2(input, in0, in1, stride);
+ switch (tx_type) {
+ case 0: // DCT_DCT
+ fdct16_1d_avx2(in0, in1);
+ right_shift_16x16_avx2(in0, in1);
+ fdct16_1d_avx2(in0, in1);
+ break;
+ case 1: // ADST_DCT
+ fadst16_1d_avx2(in0, in1);
+ right_shift_16x16_avx2(in0, in1);
+ fdct16_1d_avx2(in0, in1);
+ break;
+ case 2: // DCT_ADST
+ fdct16_1d_avx2(in0, in1);
+ right_shift_16x16_avx2(in0, in1);
+ fadst16_1d_avx2(in0, in1);
+ break;
+ case 3: // ADST_ADST
+ fadst16_1d_avx2(in0, in1);
+ right_shift_16x16_avx2(in0, in1);
+ fadst16_1d_avx2(in0, in1);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+ write_buffer_16x16_avx2(output, in0, in1, 16);
+}
+
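+// The 32x32 forward DCT body is shared between the two variants: the same
+// source file is included twice with different FDCT32x32_2D_AVX2 names and
+// FDCT32x32_HIGH_PRECISION settings.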
+#define FDCT32x32_2D_AVX2 vp9_fdct32x32_rd_avx2
+#define FDCT32x32_HIGH_PRECISION 0
+#include "vp9/encoder/x86/vp9_dct32x32_avx2.c"
+#undef FDCT32x32_2D_AVX2
+#undef FDCT32x32_HIGH_PRECISION
+
+#define FDCT32x32_2D_AVX2 vp9_fdct32x32_avx2
+#define FDCT32x32_HIGH_PRECISION 1
+#include "vp9/encoder/x86/vp9_dct32x32_avx2.c" // NOLINT
+#undef FDCT32x32_2D_AVX2
+#undef FDCT32x32_HIGH_PRECISION
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 6e4a498cb..eefbd1ac9 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -124,7 +124,7 @@ VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_16_neon.c
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_avg_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM)
-#VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_16_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_16_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct4x4_1_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct4x4_add_neon$(ASM)
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index bd13518f5..ce83a6703 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -38,7 +38,6 @@ VP9_CX_SRCS-yes += encoder/vp9_firstpass.h
VP9_CX_SRCS-yes += encoder/vp9_lookahead.c
VP9_CX_SRCS-yes += encoder/vp9_lookahead.h
VP9_CX_SRCS-yes += encoder/vp9_mcomp.h
-VP9_CX_SRCS-yes += encoder/vp9_modecosts.h
VP9_CX_SRCS-yes += encoder/vp9_onyx_int.h
VP9_CX_SRCS-yes += encoder/vp9_psnr.h
VP9_CX_SRCS-yes += encoder/vp9_quantize.h
@@ -49,7 +48,6 @@ VP9_CX_SRCS-yes += encoder/vp9_tokenize.h
VP9_CX_SRCS-yes += encoder/vp9_treewriter.h
VP9_CX_SRCS-yes += encoder/vp9_variance.h
VP9_CX_SRCS-yes += encoder/vp9_mcomp.c
-VP9_CX_SRCS-yes += encoder/vp9_modecosts.c
VP9_CX_SRCS-yes += encoder/vp9_onyx_if.c
VP9_CX_SRCS-yes += encoder/vp9_picklpf.c
VP9_CX_SRCS-yes += encoder/vp9_picklpf.h
@@ -106,4 +104,7 @@ VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct32x32_sse2.c
+VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c
+VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct32x32_avx2.c
+
VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
diff --git a/vp9/vp9dx.mk b/vp9/vp9dx.mk
index 7e76682d4..f43172170 100644
--- a/vp9/vp9dx.mk
+++ b/vp9/vp9dx.mk
@@ -30,7 +30,6 @@ VP9_DX_SRCS-yes += decoder/vp9_onyxd.h
VP9_DX_SRCS-yes += decoder/vp9_onyxd_int.h
VP9_DX_SRCS-yes += decoder/vp9_thread.c
VP9_DX_SRCS-yes += decoder/vp9_thread.h
-VP9_DX_SRCS-yes += decoder/vp9_treereader.h
VP9_DX_SRCS-yes += decoder/vp9_onyxd_if.c
VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c
VP9_DX_SRCS-yes += decoder/vp9_dsubexp.h